diff --git a/results/MicroBenchmarks/ImageProcessing/Dither/CMakeFiles/Dither.dir/orderedDitherKernel.s b/results/MicroBenchmarks/ImageProcessing/Dither/CMakeFiles/Dither.dir/orderedDitherKernel.s index fa9e6929..f617e79c 100644 --- a/results/MicroBenchmarks/ImageProcessing/Dither/CMakeFiles/Dither.dir/orderedDitherKernel.s +++ b/results/MicroBenchmarks/ImageProcessing/Dither/CMakeFiles/Dither.dir/orderedDitherKernel.s @@ -351,9 +351,8 @@ orderedDitherKernel: # @orderedDitherKernel vreplgr2vr.w $vr1, $a5 lu32i.d $a5, 0 vrepli.w $vr2, 3 - vrepli.b $vr3, 0 ori $t3, $zero, 12 - vrepli.w $vr4, 255 + vrepli.w $vr3, 255 b .LBB0_41 .p2align 4, , 16 .LBB0_40: # %._crit_edge.us151 @@ -379,38 +378,39 @@ orderedDitherKernel: # @orderedDitherKernel # in Loop: Header=BB0_41 Depth=1 move $t5, $a3 move $t6, $a2 - vori.b $vr5, $vr0, 0 + vori.b $vr4, $vr0, 0 .p2align 4, , 16 .LBB0_44: # %vector.body269 # Parent Loop BB0_41 Depth=1 # => This Inner Loop Header: Depth=2 - vmuh.wu $vr6, $vr5, $vr1 - vsrli.w $vr6, $vr6, 1 - vori.b $vr7, $vr5, 0 - vmsub.w $vr7, $vr6, $vr2 - vld $vr6, $t5, 0 - vilvh.w $vr8, $vr3, $vr7 - vilvl.w $vr7, $vr3, $vr7 - vpickve2gr.d $t7, $vr7, 0 + vmuh.wu $vr5, $vr4, $vr1 + vsrli.w $vr5, $vr5, 1 + vori.b $vr6, $vr4, 0 + vmsub.w $vr6, $vr5, $vr2 + vld $vr5, $t5, 0 + vshuf4i.w $vr7, $vr6, 14 + vsllwil.du.wu $vr7, $vr7, 0 + vsllwil.du.wu $vr6, $vr6, 0 + vpickve2gr.d $t7, $vr6, 0 mul.d $t7, $t7, $t3 - vpickve2gr.d $t8, $vr7, 1 + vpickve2gr.d $t8, $vr6, 1 mul.d $t8, $t8, $t3 - vpickve2gr.d $fp, $vr8, 0 + vpickve2gr.d $fp, $vr7, 0 mul.d $fp, $fp, $t3 - vpickve2gr.d $s0, $vr8, 1 + vpickve2gr.d $s0, $vr7, 1 mul.d $s0, $s0, $t3 ldx.w $t7, $t4, $t7 ldx.w $t8, $t4, $t8 ldx.w $fp, $t4, $fp ldx.w $s0, $t4, $s0 - vinsgr2vr.w $vr7, $t7, 0 - vinsgr2vr.w $vr7, $t8, 1 - vinsgr2vr.w $vr7, $fp, 2 - vinsgr2vr.w $vr7, $s0, 3 - vslt.w $vr6, $vr7, $vr6 - vand.v $vr6, $vr6, $vr4 - vst $vr6, $t5, 0 - vaddi.wu $vr5, $vr5, 4 + vinsgr2vr.w $vr6, $t7, 0 + vinsgr2vr.w $vr6, $t8, 1 + vinsgr2vr.w $vr6, $fp, 2 + vinsgr2vr.w $vr6, $s0, 3 + vslt.w $vr5, $vr6, $vr5 + vand.v $vr5, $vr5, $vr3 + vst $vr5, $t5, 0 + vaddi.wu $vr4, $vr4, 4 addi.d $t6, $t6, -4 addi.d $t5, $t5, 16 bnez $t6, .LBB0_44 diff --git a/results/MicroBenchmarks/LoopVectorization/CMakeFiles/LoopEpilogueVectorizationBenchmarks.dir/EpilogueVectorization.s b/results/MicroBenchmarks/LoopVectorization/CMakeFiles/LoopEpilogueVectorizationBenchmarks.dir/EpilogueVectorization.s index c2185f80..56d54d20 100644 --- a/results/MicroBenchmarks/LoopVectorization/CMakeFiles/LoopEpilogueVectorizationBenchmarks.dir/EpilogueVectorization.s +++ b/results/MicroBenchmarks/LoopVectorization/CMakeFiles/LoopEpilogueVectorizationBenchmarks.dir/EpilogueVectorization.s @@ -2340,27 +2340,26 @@ _ZL24loopWithReductionAutoVecIhEmPT_S1_S1_i: # @_ZL24loopWithReductionAutoVecIhE addi.d $a1, $a0, 2 move $a4, $a2 vori.b $vr1, $vr0, 0 - vori.b $vr2, $vr0, 0 .p2align 4, , 16 .LBB10_5: # %vector.body # =>This Inner Loop Header: Depth=1 ld.h $a5, $a1, -2 ld.h $a6, $a1, 0 - vinsgr2vr.h $vr3, $a5, 0 - vinsgr2vr.h $vr4, $a6, 0 - vilvl.b $vr3, $vr0, $vr3 - vilvl.h $vr3, $vr0, $vr3 - vilvl.w $vr3, $vr0, $vr3 - vilvl.b $vr4, $vr0, $vr4 - vilvl.h $vr4, $vr0, $vr4 - vilvl.w $vr4, $vr0, $vr4 + vinsgr2vr.h $vr2, $a5, 0 + vinsgr2vr.h $vr3, $a6, 0 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.du.wu $vr2, $vr2, 0 + vsllwil.hu.bu $vr3, $vr3, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vsllwil.du.wu $vr3, $vr3, 0 + vadd.d $vr0, $vr0, $vr2 vadd.d $vr1, $vr1, $vr3 - vadd.d $vr2, $vr2, $vr4 addi.d 
$a4, $a4, -4 addi.d $a1, $a1, 4 bnez $a4, .LBB10_5 # %bb.6: # %middle.block - vadd.d $vr0, $vr2, $vr1 + vadd.d $vr0, $vr1, $vr0 vhaddw.q.d $vr0, $vr0, $vr0 vpickve2gr.d $a1, $vr0, 0 beq $a2, $a3, .LBB10_9 @@ -2628,25 +2627,24 @@ _ZL24loopWithReductionAutoVecItEmPT_S1_S1_i: # @_ZL24loopWithReductionAutoVecItE addi.d $a1, $a0, 4 move $a4, $a2 vori.b $vr1, $vr0, 0 - vori.b $vr2, $vr0, 0 .p2align 4, , 16 .LBB14_5: # %vector.body # =>This Inner Loop Header: Depth=1 ld.w $a5, $a1, -4 ld.w $a6, $a1, 0 - vinsgr2vr.w $vr3, $a5, 0 - vinsgr2vr.w $vr4, $a6, 0 - vilvl.h $vr3, $vr0, $vr3 - vilvl.w $vr3, $vr0, $vr3 - vilvl.h $vr4, $vr0, $vr4 - vilvl.w $vr4, $vr0, $vr4 + vinsgr2vr.w $vr2, $a5, 0 + vinsgr2vr.w $vr3, $a6, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.du.wu $vr2, $vr2, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vsllwil.du.wu $vr3, $vr3, 0 + vadd.d $vr0, $vr0, $vr2 vadd.d $vr1, $vr1, $vr3 - vadd.d $vr2, $vr2, $vr4 addi.d $a4, $a4, -4 addi.d $a1, $a1, 8 bnez $a4, .LBB14_5 # %bb.6: # %middle.block - vadd.d $vr0, $vr2, $vr1 + vadd.d $vr0, $vr1, $vr0 vhaddw.q.d $vr0, $vr0, $vr0 vpickve2gr.d $a1, $vr0, 0 beq $a2, $a3, .LBB14_9 @@ -2894,23 +2892,22 @@ _ZL24loopWithReductionAutoVecIjEmPT_S1_S1_i: # @_ZL24loopWithReductionAutoVecIjE addi.d $a1, $a0, 8 move $a4, $a2 vori.b $vr1, $vr0, 0 - vori.b $vr2, $vr0, 0 .p2align 4, , 16 .LBB18_5: # %vector.body # =>This Inner Loop Header: Depth=1 ld.d $a5, $a1, -8 ld.d $a6, $a1, 0 - vinsgr2vr.d $vr3, $a5, 0 - vinsgr2vr.d $vr4, $a6, 0 - vilvl.w $vr3, $vr0, $vr3 - vilvl.w $vr4, $vr0, $vr4 + vinsgr2vr.d $vr2, $a5, 0 + vinsgr2vr.d $vr3, $a6, 0 + vsllwil.du.wu $vr2, $vr2, 0 + vsllwil.du.wu $vr3, $vr3, 0 + vadd.d $vr0, $vr0, $vr2 vadd.d $vr1, $vr1, $vr3 - vadd.d $vr2, $vr2, $vr4 addi.d $a4, $a4, -4 addi.d $a1, $a1, 16 bnez $a4, .LBB18_5 # %bb.6: # %middle.block - vadd.d $vr0, $vr2, $vr1 + vadd.d $vr0, $vr1, $vr0 vhaddw.q.d $vr0, $vr0, $vr0 vpickve2gr.d $a1, $vr0, 0 beq $a2, $a3, .LBB18_9 diff --git a/results/MicroBenchmarks/LoopVectorization/CMakeFiles/LoopVectorizationBenchmarks.dir/VectorOperations.s b/results/MicroBenchmarks/LoopVectorization/CMakeFiles/LoopVectorizationBenchmarks.dir/VectorOperations.s index 8fb31bf1..65f21d1c 100644 --- a/results/MicroBenchmarks/LoopVectorization/CMakeFiles/LoopVectorizationBenchmarks.dir/VectorOperations.s +++ b/results/MicroBenchmarks/LoopVectorization/CMakeFiles/LoopVectorizationBenchmarks.dir/VectorOperations.s @@ -14679,19 +14679,19 @@ _Z60benchForTruncOrZextVecInLoopWithVW8From_uint8_t_To_uint16_t_RN9benchmark5Sta .cfi_personality 155, DW.ref.__gxx_personality_v0 .cfi_lsda 27, .Lexception36 # %bb.0: - addi.d $sp, $sp, -144 - .cfi_def_cfa_offset 144 - st.d $ra, $sp, 136 # 8-byte Folded Spill - st.d $fp, $sp, 128 # 8-byte Folded Spill - st.d $s0, $sp, 120 # 8-byte Folded Spill - st.d $s1, $sp, 112 # 8-byte Folded Spill - st.d $s2, $sp, 104 # 8-byte Folded Spill - st.d $s3, $sp, 96 # 8-byte Folded Spill - st.d $s4, $sp, 88 # 8-byte Folded Spill - st.d $s5, $sp, 80 # 8-byte Folded Spill - st.d $s6, $sp, 72 # 8-byte Folded Spill - st.d $s7, $sp, 64 # 8-byte Folded Spill - st.d $s8, $sp, 56 # 8-byte Folded Spill + addi.d $sp, $sp, -128 + .cfi_def_cfa_offset 128 + st.d $ra, $sp, 120 # 8-byte Folded Spill + st.d $fp, $sp, 112 # 8-byte Folded Spill + st.d $s0, $sp, 104 # 8-byte Folded Spill + st.d $s1, $sp, 96 # 8-byte Folded Spill + st.d $s2, $sp, 88 # 8-byte Folded Spill + st.d $s3, $sp, 80 # 8-byte Folded Spill + st.d $s4, $sp, 72 # 8-byte Folded Spill + st.d $s5, $sp, 64 # 8-byte Folded Spill + st.d $s6, $sp, 56 # 8-byte Folded Spill + st.d 
$s7, $sp, 48 # 8-byte Folded Spill + st.d $s8, $sp, 40 # 8-byte Folded Spill .cfi_offset 1, -8 .cfi_offset 22, -16 .cfi_offset 23, -24 @@ -14719,8 +14719,8 @@ _Z60benchForTruncOrZextVecInLoopWithVW8From_uint8_t_To_uint16_t_RN9benchmark5Sta .Ltmp775: # EH_LABEL # %bb.1: # %.split move $s3, $a0 - st.d $s0, $sp, 32 # 8-byte Folded Spill - st.d $a0, $sp, 40 + st.d $s0, $sp, 16 # 8-byte Folded Spill + st.d $a0, $sp, 24 .Ltmp777: # EH_LABEL move $a0, $s2 pcaddu18i $ra, %call36(_Znam) @@ -14734,7 +14734,7 @@ _Z60benchForTruncOrZextVecInLoopWithVW8From_uint8_t_To_uint16_t_RN9benchmark5Sta add.d $s1, $s3, $a0 lu12i.w $a0, 15 ori $a0, $a0, 3840 - st.h $a0, $sp, 54 + st.h $a0, $sp, 38 lu12i.w $a0, -3 ori $s8, $a0, 2288 pcalau12i $a0, %pc_hi20(_ZL3rng) @@ -14742,8 +14742,8 @@ _Z60benchForTruncOrZextVecInLoopWithVW8From_uint8_t_To_uint16_t_RN9benchmark5Sta .p2align 4, , 16 .LBB36_3: # =>This Inner Loop Header: Depth=1 .Ltmp780: # EH_LABEL - addi.d $a0, $sp, 54 - addi.d $a2, $sp, 54 + addi.d $a0, $sp, 38 + addi.d $a2, $sp, 38 move $a1, $s5 pcaddu18i $ra, %call36(_ZNSt24uniform_int_distributionIhEclISt23mersenne_twister_engineImLm32ELm624ELm397ELm31ELm2567483615ELm11ELm4294967295ELm7ELm2636928640ELm15ELm4022730752ELm18ELm1812433253EEEEhRT_RKNS0_10param_typeE) jirl $ra, $ra, 0 @@ -14775,7 +14775,6 @@ _Z60benchForTruncOrZextVecInLoopWithVW8From_uint8_t_To_uint16_t_RN9benchmark5Sta addi.d $a0, $s3, 32 lu12i.w $a1, -3 ori $a1, $a1, 2304 - vrepli.b $vr4, 0 .p2align 4, , 16 .LBB36_8: # %vector.body84 # =>This Inner Loop Header: Depth=1 @@ -14788,10 +14787,10 @@ _Z60benchForTruncOrZextVecInLoopWithVW8From_uint8_t_To_uint16_t_RN9benchmark5Sta vinsgr2vr.d $vr1, $a4, 0 vinsgr2vr.d $vr2, $a5, 0 vinsgr2vr.d $vr3, $a2, 0 - vilvl.b $vr0, $vr4, $vr0 - vilvl.b $vr1, $vr4, $vr1 - vilvl.b $vr2, $vr4, $vr2 - vilvl.b $vr3, $vr4, $vr3 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.hu.bu $vr3, $vr3, 0 vst $vr0, $a0, -32 vst $vr1, $a0, -16 vst $vr2, $a0, 0 @@ -14859,8 +14858,7 @@ _Z60benchForTruncOrZextVecInLoopWithVW8From_uint8_t_To_uint16_t_RN9benchmark5Sta addi.d $s8, $s8, 2 bnez $s1, .LBB36_10 # %bb.12: - vst $vr4, $sp, 16 # 16-byte Folded Spill - ld.d $s6, $sp, 32 # 8-byte Folded Reload + ld.d $s6, $sp, 16 # 8-byte Folded Reload ld.w $s3, $s6, 28 ld.d $s1, $s6, 16 .Ltmp798: # EH_LABEL @@ -14869,13 +14867,12 @@ _Z60benchForTruncOrZextVecInLoopWithVW8From_uint8_t_To_uint16_t_RN9benchmark5Sta jirl $ra, $ra, 0 .Ltmp799: # EH_LABEL # %bb.13: # %_ZN9benchmark5State3endEv.exit.preheader - vld $vr4, $sp, 16 # 16-byte Folded Reload bnez $s3, .LBB36_25 # %bb.14: # %_ZN9benchmark5State3endEv.exit.preheader beqz $s1, .LBB36_25 # %bb.15: # %.lr.ph.preheader add.d $a0, $fp, $s4 - addi.d $a1, $sp, 40 + addi.d $a1, $sp, 24 lu12i.w $a3, -3 ori $a2, $a3, 2288 ori $a3, $a3, 2304 @@ -14887,7 +14884,7 @@ _Z60benchForTruncOrZextVecInLoopWithVW8From_uint8_t_To_uint16_t_RN9benchmark5Sta #APP #NO_APP #MEMBARRIER - ld.d $a4, $sp, 40 + ld.d $a4, $sp, 24 bgeu $a4, $a0, .LBB36_19 # %bb.17: # %.lr.ph # in Loop: Header=BB36_16 Depth=1 @@ -14914,10 +14911,10 @@ _Z60benchForTruncOrZextVecInLoopWithVW8From_uint8_t_To_uint16_t_RN9benchmark5Sta vinsgr2vr.d $vr1, $t1, 0 vinsgr2vr.d $vr2, $t2, 0 vinsgr2vr.d $vr3, $a7, 0 - vilvl.b $vr0, $vr4, $vr0 - vilvl.b $vr1, $vr4, $vr1 - vilvl.b $vr2, $vr4, $vr2 - vilvl.b $vr3, $vr4, $vr3 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.hu.bu $vr3, $vr3, 0 vst $vr0, $a5, -32 vst $vr1, $a5, -16 vst $vr2, $a5, 0 @@ 
-14956,7 +14953,7 @@ _Z60benchForTruncOrZextVecInLoopWithVW8From_uint8_t_To_uint16_t_RN9benchmark5Sta move $a0, $s0 pcaddu18i $ra, %call36(_ZdaPv) jirl $ra, $ra, 0 - ld.d $a0, $sp, 40 + ld.d $a0, $sp, 24 beqz $a0, .LBB36_28 # %bb.27: # %_ZNKSt14default_deleteIA_tEclItEENSt9enable_ifIXsr14is_convertibleIPA_T_PS0_EE5valueEvE4typeEPS4_.exit.i37 pcaddu18i $ra, %call36(_ZdaPv) @@ -14965,18 +14962,18 @@ _Z60benchForTruncOrZextVecInLoopWithVW8From_uint8_t_To_uint16_t_RN9benchmark5Sta move $a0, $fp pcaddu18i $ra, %call36(_ZdaPv) jirl $ra, $ra, 0 - ld.d $s8, $sp, 56 # 8-byte Folded Reload - ld.d $s7, $sp, 64 # 8-byte Folded Reload - ld.d $s6, $sp, 72 # 8-byte Folded Reload - ld.d $s5, $sp, 80 # 8-byte Folded Reload - ld.d $s4, $sp, 88 # 8-byte Folded Reload - ld.d $s3, $sp, 96 # 8-byte Folded Reload - ld.d $s2, $sp, 104 # 8-byte Folded Reload - ld.d $s1, $sp, 112 # 8-byte Folded Reload - ld.d $s0, $sp, 120 # 8-byte Folded Reload - ld.d $fp, $sp, 128 # 8-byte Folded Reload - ld.d $ra, $sp, 136 # 8-byte Folded Reload - addi.d $sp, $sp, 144 + ld.d $s8, $sp, 40 # 8-byte Folded Reload + ld.d $s7, $sp, 48 # 8-byte Folded Reload + ld.d $s6, $sp, 56 # 8-byte Folded Reload + ld.d $s5, $sp, 64 # 8-byte Folded Reload + ld.d $s4, $sp, 72 # 8-byte Folded Reload + ld.d $s3, $sp, 80 # 8-byte Folded Reload + ld.d $s2, $sp, 88 # 8-byte Folded Reload + ld.d $s1, $sp, 96 # 8-byte Folded Reload + ld.d $s0, $sp, 104 # 8-byte Folded Reload + ld.d $fp, $sp, 112 # 8-byte Folded Reload + ld.d $ra, $sp, 120 # 8-byte Folded Reload + addi.d $sp, $sp, 128 ret .LBB36_29: .Ltmp783: # EH_LABEL @@ -15068,7 +15065,7 @@ _Z60benchForTruncOrZextVecInLoopWithVW8From_uint8_t_To_uint16_t_RN9benchmark5Sta move $a0, $s0 pcaddu18i $ra, %call36(_ZdaPv) jirl $ra, $ra, 0 - ld.d $s3, $sp, 40 + ld.d $s3, $sp, 24 beqz $s3, .LBB36_44 .LBB36_43: # %_ZNKSt14default_deleteIA_tEclItEENSt9enable_ifIXsr14is_convertibleIPA_T_PS0_EE5valueEvE4typeEPS4_.exit.i30 move $a0, $s3 @@ -15138,19 +15135,19 @@ _Z61benchForTruncOrZextVecInLoopWithVW16From_uint8_t_To_uint16_t_RN9benchmark5St .cfi_personality 155, DW.ref.__gxx_personality_v0 .cfi_lsda 27, .Lexception37 # %bb.0: - addi.d $sp, $sp, -144 - .cfi_def_cfa_offset 144 - st.d $ra, $sp, 136 # 8-byte Folded Spill - st.d $fp, $sp, 128 # 8-byte Folded Spill - st.d $s0, $sp, 120 # 8-byte Folded Spill - st.d $s1, $sp, 112 # 8-byte Folded Spill - st.d $s2, $sp, 104 # 8-byte Folded Spill - st.d $s3, $sp, 96 # 8-byte Folded Spill - st.d $s4, $sp, 88 # 8-byte Folded Spill - st.d $s5, $sp, 80 # 8-byte Folded Spill - st.d $s6, $sp, 72 # 8-byte Folded Spill - st.d $s7, $sp, 64 # 8-byte Folded Spill - st.d $s8, $sp, 56 # 8-byte Folded Spill + addi.d $sp, $sp, -128 + .cfi_def_cfa_offset 128 + st.d $ra, $sp, 120 # 8-byte Folded Spill + st.d $fp, $sp, 112 # 8-byte Folded Spill + st.d $s0, $sp, 104 # 8-byte Folded Spill + st.d $s1, $sp, 96 # 8-byte Folded Spill + st.d $s2, $sp, 88 # 8-byte Folded Spill + st.d $s3, $sp, 80 # 8-byte Folded Spill + st.d $s4, $sp, 72 # 8-byte Folded Spill + st.d $s5, $sp, 64 # 8-byte Folded Spill + st.d $s6, $sp, 56 # 8-byte Folded Spill + st.d $s7, $sp, 48 # 8-byte Folded Spill + st.d $s8, $sp, 40 # 8-byte Folded Spill .cfi_offset 1, -8 .cfi_offset 22, -16 .cfi_offset 23, -24 @@ -15178,7 +15175,7 @@ _Z61benchForTruncOrZextVecInLoopWithVW16From_uint8_t_To_uint16_t_RN9benchmark5St .Ltmp804: # EH_LABEL # %bb.1: # %.split move $s4, $a0 - st.d $a0, $sp, 40 + st.d $a0, $sp, 24 .Ltmp806: # EH_LABEL move $a0, $s3 pcaddu18i $ra, %call36(_Znam) @@ -15193,7 +15190,7 @@ 
_Z61benchForTruncOrZextVecInLoopWithVW16From_uint8_t_To_uint16_t_RN9benchmark5St st.d $a0, $sp, 16 # 8-byte Folded Spill lu12i.w $a0, 15 ori $a0, $a0, 3840 - st.h $a0, $sp, 54 + st.h $a0, $sp, 38 lu12i.w $s8, -3 ori $s6, $s8, 2288 pcalau12i $a0, %pc_hi20(_ZL3rng) @@ -15201,8 +15198,8 @@ _Z61benchForTruncOrZextVecInLoopWithVW16From_uint8_t_To_uint16_t_RN9benchmark5St .p2align 4, , 16 .LBB37_3: # =>This Inner Loop Header: Depth=1 .Ltmp809: # EH_LABEL - addi.d $a0, $sp, 54 - addi.d $a2, $sp, 54 + addi.d $a0, $sp, 38 + addi.d $a2, $sp, 38 move $a1, $s5 pcaddu18i $ra, %call36(_ZNSt24uniform_int_distributionIhEclISt23mersenne_twister_engineImLm32ELm624ELm397ELm31ELm2567483615ELm11ELm4294967295ELm7ELm2636928640ELm15ELm4022730752ELm18ELm1812433253EEEEhRT_RKNS0_10param_typeE) jirl $ra, $ra, 0 @@ -15236,7 +15233,6 @@ _Z61benchForTruncOrZextVecInLoopWithVW16From_uint8_t_To_uint16_t_RN9benchmark5St ori $s5, $a5, 1792 ori $s6, $a5, 1824 ori $s7, $a5, 1840 - vrepli.b $vr8, 0 .p2align 4, , 16 .LBB37_8: # %vector.body84 # =>This Inner Loop Header: Depth=1 @@ -15245,22 +15241,26 @@ _Z61benchForTruncOrZextVecInLoopWithVW16From_uint8_t_To_uint16_t_RN9benchmark5St vldx $vr1, $a2, $s1 vldx $vr2, $a2, $s6 vldx $vr3, $a2, $s7 - vilvl.b $vr4, $vr8, $vr0 - vilvh.b $vr0, $vr8, $vr0 - vilvl.b $vr5, $vr8, $vr1 - vilvh.b $vr1, $vr8, $vr1 - vilvl.b $vr6, $vr8, $vr2 - vilvh.b $vr2, $vr8, $vr2 - vilvl.b $vr7, $vr8, $vr3 - vilvh.b $vr3, $vr8, $vr3 - vst $vr0, $a0, -48 - vst $vr4, $a0, -64 - vst $vr1, $a0, -16 - vst $vr5, $a0, -32 - vst $vr2, $a0, 16 - vst $vr6, $a0, 0 - vst $vr3, $a0, 48 - vst $vr7, $a0, 32 + vbsrl.v $vr4, $vr0, 8 + vsllwil.hu.bu $vr4, $vr4, 0 + vsllwil.hu.bu $vr0, $vr0, 0 + vbsrl.v $vr5, $vr1, 8 + vsllwil.hu.bu $vr5, $vr5, 0 + vsllwil.hu.bu $vr1, $vr1, 0 + vbsrl.v $vr6, $vr2, 8 + vsllwil.hu.bu $vr6, $vr6, 0 + vsllwil.hu.bu $vr2, $vr2, 0 + vbsrl.v $vr7, $vr3, 8 + vsllwil.hu.bu $vr7, $vr7, 0 + vsllwil.hu.bu $vr3, $vr3, 0 + vst $vr0, $a0, -64 + vst $vr4, $a0, -48 + vst $vr1, $a0, -32 + vst $vr5, $a0, -16 + vst $vr2, $a0, 0 + vst $vr6, $a0, 16 + vst $vr3, $a0, 32 + vst $vr7, $a0, 48 addi.d $a1, $a1, 64 addi.d $a0, $a0, 128 bnez $a1, .LBB37_8 @@ -15326,7 +15326,6 @@ _Z61benchForTruncOrZextVecInLoopWithVW16From_uint8_t_To_uint16_t_RN9benchmark5St addi.d $s8, $s8, 2 bnez $s2, .LBB37_10 # %bb.12: - vst $vr8, $sp, 16 # 16-byte Folded Spill move $s8, $a3 ld.w $s4, $a3, 28 ld.d $s2, $a3, 16 @@ -15336,13 +15335,12 @@ _Z61benchForTruncOrZextVecInLoopWithVW16From_uint8_t_To_uint16_t_RN9benchmark5St jirl $ra, $ra, 0 .Ltmp828: # EH_LABEL # %bb.13: # %_ZN9benchmark5State3endEv.exit.preheader - vld $vr8, $sp, 16 # 16-byte Folded Reload bnez $s4, .LBB37_25 # %bb.14: # %_ZN9benchmark5State3endEv.exit.preheader beqz $s2, .LBB37_25 # %bb.15: # %.lr.ph.preheader add.d $a0, $fp, $s1 - addi.d $a1, $sp, 40 + addi.d $a1, $sp, 24 lu12i.w $a3, -3 ori $a2, $a3, 2288 ori $a3, $a3, 2304 @@ -15354,7 +15352,7 @@ _Z61benchForTruncOrZextVecInLoopWithVW16From_uint8_t_To_uint16_t_RN9benchmark5St #APP #NO_APP #MEMBARRIER - ld.d $a4, $sp, 40 + ld.d $a4, $sp, 24 bgeu $a4, $a0, .LBB37_19 # %bb.17: # %.lr.ph # in Loop: Header=BB37_16 Depth=1 @@ -15377,22 +15375,26 @@ _Z61benchForTruncOrZextVecInLoopWithVW16From_uint8_t_To_uint16_t_RN9benchmark5St vldx $vr1, $a7, $s1 vldx $vr2, $a7, $s6 vldx $vr3, $a7, $s7 - vilvl.b $vr4, $vr8, $vr0 - vilvh.b $vr0, $vr8, $vr0 - vilvl.b $vr5, $vr8, $vr1 - vilvh.b $vr1, $vr8, $vr1 - vilvl.b $vr6, $vr8, $vr2 - vilvh.b $vr2, $vr8, $vr2 - vilvl.b $vr7, $vr8, $vr3 - vilvh.b $vr3, $vr8, $vr3 - vst $vr0, $a5, -48 - vst 
$vr4, $a5, -64 - vst $vr1, $a5, -16 - vst $vr5, $a5, -32 - vst $vr2, $a5, 16 - vst $vr6, $a5, 0 - vst $vr3, $a5, 48 - vst $vr7, $a5, 32 + vbsrl.v $vr4, $vr0, 8 + vsllwil.hu.bu $vr4, $vr4, 0 + vsllwil.hu.bu $vr0, $vr0, 0 + vbsrl.v $vr5, $vr1, 8 + vsllwil.hu.bu $vr5, $vr5, 0 + vsllwil.hu.bu $vr1, $vr1, 0 + vbsrl.v $vr6, $vr2, 8 + vsllwil.hu.bu $vr6, $vr6, 0 + vsllwil.hu.bu $vr2, $vr2, 0 + vbsrl.v $vr7, $vr3, 8 + vsllwil.hu.bu $vr7, $vr7, 0 + vsllwil.hu.bu $vr3, $vr3, 0 + vst $vr0, $a5, -64 + vst $vr4, $a5, -48 + vst $vr1, $a5, -32 + vst $vr5, $a5, -16 + vst $vr2, $a5, 0 + vst $vr6, $a5, 16 + vst $vr3, $a5, 32 + vst $vr7, $a5, 48 addi.d $a6, $a6, 64 addi.d $a5, $a5, 128 bnez $a6, .LBB37_20 @@ -15427,7 +15429,7 @@ _Z61benchForTruncOrZextVecInLoopWithVW16From_uint8_t_To_uint16_t_RN9benchmark5St move $a0, $s0 pcaddu18i $ra, %call36(_ZdaPv) jirl $ra, $ra, 0 - ld.d $a0, $sp, 40 + ld.d $a0, $sp, 24 beqz $a0, .LBB37_28 # %bb.27: # %_ZNKSt14default_deleteIA_tEclItEENSt9enable_ifIXsr14is_convertibleIPA_T_PS0_EE5valueEvE4typeEPS4_.exit.i37 pcaddu18i $ra, %call36(_ZdaPv) @@ -15436,18 +15438,18 @@ _Z61benchForTruncOrZextVecInLoopWithVW16From_uint8_t_To_uint16_t_RN9benchmark5St move $a0, $fp pcaddu18i $ra, %call36(_ZdaPv) jirl $ra, $ra, 0 - ld.d $s8, $sp, 56 # 8-byte Folded Reload - ld.d $s7, $sp, 64 # 8-byte Folded Reload - ld.d $s6, $sp, 72 # 8-byte Folded Reload - ld.d $s5, $sp, 80 # 8-byte Folded Reload - ld.d $s4, $sp, 88 # 8-byte Folded Reload - ld.d $s3, $sp, 96 # 8-byte Folded Reload - ld.d $s2, $sp, 104 # 8-byte Folded Reload - ld.d $s1, $sp, 112 # 8-byte Folded Reload - ld.d $s0, $sp, 120 # 8-byte Folded Reload - ld.d $fp, $sp, 128 # 8-byte Folded Reload - ld.d $ra, $sp, 136 # 8-byte Folded Reload - addi.d $sp, $sp, 144 + ld.d $s8, $sp, 40 # 8-byte Folded Reload + ld.d $s7, $sp, 48 # 8-byte Folded Reload + ld.d $s6, $sp, 56 # 8-byte Folded Reload + ld.d $s5, $sp, 64 # 8-byte Folded Reload + ld.d $s4, $sp, 72 # 8-byte Folded Reload + ld.d $s3, $sp, 80 # 8-byte Folded Reload + ld.d $s2, $sp, 88 # 8-byte Folded Reload + ld.d $s1, $sp, 96 # 8-byte Folded Reload + ld.d $s0, $sp, 104 # 8-byte Folded Reload + ld.d $fp, $sp, 112 # 8-byte Folded Reload + ld.d $ra, $sp, 120 # 8-byte Folded Reload + addi.d $sp, $sp, 128 ret .LBB37_29: .Ltmp812: # EH_LABEL @@ -15539,7 +15541,7 @@ _Z61benchForTruncOrZextVecInLoopWithVW16From_uint8_t_To_uint16_t_RN9benchmark5St move $a0, $s0 pcaddu18i $ra, %call36(_ZdaPv) jirl $ra, $ra, 0 - ld.d $s4, $sp, 40 + ld.d $s4, $sp, 24 beqz $s4, .LBB37_44 .LBB37_43: # %_ZNKSt14default_deleteIA_tEclItEENSt9enable_ifIXsr14is_convertibleIPA_T_PS0_EE5valueEvE4typeEPS4_.exit.i30 move $a0, $s4 @@ -15609,19 +15611,19 @@ _Z53benchForTruncOrZextVecInLoopFrom_uint8_t_To_uint16_t_RN9benchmark5StateE: # .cfi_personality 155, DW.ref.__gxx_personality_v0 .cfi_lsda 27, .Lexception38 # %bb.0: - addi.d $sp, $sp, -144 - .cfi_def_cfa_offset 144 - st.d $ra, $sp, 136 # 8-byte Folded Spill - st.d $fp, $sp, 128 # 8-byte Folded Spill - st.d $s0, $sp, 120 # 8-byte Folded Spill - st.d $s1, $sp, 112 # 8-byte Folded Spill - st.d $s2, $sp, 104 # 8-byte Folded Spill - st.d $s3, $sp, 96 # 8-byte Folded Spill - st.d $s4, $sp, 88 # 8-byte Folded Spill - st.d $s5, $sp, 80 # 8-byte Folded Spill - st.d $s6, $sp, 72 # 8-byte Folded Spill - st.d $s7, $sp, 64 # 8-byte Folded Spill - st.d $s8, $sp, 56 # 8-byte Folded Spill + addi.d $sp, $sp, -112 + .cfi_def_cfa_offset 112 + st.d $ra, $sp, 104 # 8-byte Folded Spill + st.d $fp, $sp, 96 # 8-byte Folded Spill + st.d $s0, $sp, 88 # 8-byte Folded Spill + st.d $s1, $sp, 80 # 
8-byte Folded Spill + st.d $s2, $sp, 72 # 8-byte Folded Spill + st.d $s3, $sp, 64 # 8-byte Folded Spill + st.d $s4, $sp, 56 # 8-byte Folded Spill + st.d $s5, $sp, 48 # 8-byte Folded Spill + st.d $s6, $sp, 40 # 8-byte Folded Spill + st.d $s7, $sp, 32 # 8-byte Folded Spill + st.d $s8, $sp, 24 # 8-byte Folded Spill .cfi_offset 1, -8 .cfi_offset 22, -16 .cfi_offset 23, -24 @@ -15649,7 +15651,7 @@ _Z53benchForTruncOrZextVecInLoopFrom_uint8_t_To_uint16_t_RN9benchmark5StateE: # .Ltmp833: # EH_LABEL # %bb.1: move $s4, $a0 - st.d $a0, $sp, 40 + st.d $a0, $sp, 8 .Ltmp835: # EH_LABEL move $a0, $s3 pcaddu18i $ra, %call36(_Znam) @@ -15659,7 +15661,7 @@ _Z53benchForTruncOrZextVecInLoopFrom_uint8_t_To_uint16_t_RN9benchmark5StateE: # move $s0, $a0 lu12i.w $a0, 15 ori $a0, $a0, 3840 - st.h $a0, $sp, 54 + st.h $a0, $sp, 22 lu12i.w $s6, -3 ori $s8, $s6, 2288 pcalau12i $a0, %pc_hi20(_ZL3rng) @@ -15667,8 +15669,8 @@ _Z53benchForTruncOrZextVecInLoopFrom_uint8_t_To_uint16_t_RN9benchmark5StateE: # .p2align 4, , 16 .LBB38_3: # =>This Inner Loop Header: Depth=1 .Ltmp838: # EH_LABEL - addi.d $a0, $sp, 54 - addi.d $a2, $sp, 54 + addi.d $a0, $sp, 22 + addi.d $a2, $sp, 22 move $a1, $s5 pcaddu18i $ra, %call36(_ZNSt24uniform_int_distributionIhEclISt23mersenne_twister_engineImLm32ELm624ELm397ELm31ELm2567483615ELm11ELm4294967295ELm7ELm2636928640ELm15ELm4022730752ELm18ELm1812433253EEEEhRT_RKNS0_10param_typeE) jirl $ra, $ra, 0 @@ -15697,7 +15699,6 @@ _Z53benchForTruncOrZextVecInLoopFrom_uint8_t_To_uint16_t_RN9benchmark5StateE: # # %bb.7: # %vector.body84.preheader addi.d $a0, $s4, 16 ori $a1, $s6, 2288 - vrepli.b $vr2, 0 .p2align 4, , 16 .LBB38_8: # %vector.body84 # =>This Inner Loop Header: Depth=1 @@ -15706,8 +15707,8 @@ _Z53benchForTruncOrZextVecInLoopFrom_uint8_t_To_uint16_t_RN9benchmark5StateE: # ldptr.d $a2, $a2, 10008 vinsgr2vr.d $vr0, $a3, 0 vinsgr2vr.d $vr1, $a2, 0 - vilvl.b $vr0, $vr2, $vr0 - vilvl.b $vr1, $vr2, $vr1 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.hu.bu $vr1, $vr1, 0 vst $vr0, $a0, -16 vst $vr1, $a0, 0 addi.d $a1, $a1, 16 @@ -15728,7 +15729,6 @@ _Z53benchForTruncOrZextVecInLoopFrom_uint8_t_To_uint16_t_RN9benchmark5StateE: # addi.d $s5, $s5, 2 bnez $s7, .LBB38_10 # %bb.12: - vst $vr2, $sp, 16 # 16-byte Folded Spill ld.w $s5, $s2, 28 ld.d $s4, $s2, 16 .Ltmp856: # EH_LABEL @@ -15737,13 +15737,12 @@ _Z53benchForTruncOrZextVecInLoopFrom_uint8_t_To_uint16_t_RN9benchmark5StateE: # jirl $ra, $ra, 0 .Ltmp857: # EH_LABEL # %bb.13: # %_ZN9benchmark5State3endEv.exit.preheader - vld $vr2, $sp, 16 # 16-byte Folded Reload bnez $s5, .LBB38_23 # %bb.14: # %_ZN9benchmark5State3endEv.exit.preheader beqz $s4, .LBB38_23 # %bb.15: # %.lr.ph.preheader add.d $a0, $fp, $s1 - addi.d $a1, $sp, 40 + addi.d $a1, $sp, 8 ori $a2, $s6, 2288 b .LBB38_17 .p2align 4, , 16 @@ -15758,7 +15757,7 @@ _Z53benchForTruncOrZextVecInLoopFrom_uint8_t_To_uint16_t_RN9benchmark5StateE: # #APP #NO_APP #MEMBARRIER - ld.d $a3, $sp, 40 + ld.d $a3, $sp, 8 bgeu $a3, $a0, .LBB38_21 # %bb.18: # %.lr.ph # in Loop: Header=BB38_17 Depth=1 @@ -15792,8 +15791,8 @@ _Z53benchForTruncOrZextVecInLoopFrom_uint8_t_To_uint16_t_RN9benchmark5StateE: # ldptr.d $a5, $a5, 10008 vinsgr2vr.d $vr0, $a6, 0 vinsgr2vr.d $vr1, $a5, 0 - vilvl.b $vr0, $vr2, $vr0 - vilvl.b $vr1, $vr2, $vr1 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.hu.bu $vr1, $vr1, 0 vst $vr0, $a3, -16 vst $vr1, $a3, 0 addi.d $a4, $a4, 16 @@ -15810,7 +15809,7 @@ _Z53benchForTruncOrZextVecInLoopFrom_uint8_t_To_uint16_t_RN9benchmark5StateE: # move $a0, $s0 pcaddu18i $ra, %call36(_ZdaPv) jirl $ra, $ra, 0 - ld.d $a0, $sp, 40 + 
ld.d $a0, $sp, 8 beqz $a0, .LBB38_26 # %bb.25: # %_ZNKSt14default_deleteIA_tEclItEENSt9enable_ifIXsr14is_convertibleIPA_T_PS0_EE5valueEvE4typeEPS4_.exit.i37 pcaddu18i $ra, %call36(_ZdaPv) @@ -15819,18 +15818,18 @@ _Z53benchForTruncOrZextVecInLoopFrom_uint8_t_To_uint16_t_RN9benchmark5StateE: # move $a0, $fp pcaddu18i $ra, %call36(_ZdaPv) jirl $ra, $ra, 0 - ld.d $s8, $sp, 56 # 8-byte Folded Reload - ld.d $s7, $sp, 64 # 8-byte Folded Reload - ld.d $s6, $sp, 72 # 8-byte Folded Reload - ld.d $s5, $sp, 80 # 8-byte Folded Reload - ld.d $s4, $sp, 88 # 8-byte Folded Reload - ld.d $s3, $sp, 96 # 8-byte Folded Reload - ld.d $s2, $sp, 104 # 8-byte Folded Reload - ld.d $s1, $sp, 112 # 8-byte Folded Reload - ld.d $s0, $sp, 120 # 8-byte Folded Reload - ld.d $fp, $sp, 128 # 8-byte Folded Reload - ld.d $ra, $sp, 136 # 8-byte Folded Reload - addi.d $sp, $sp, 144 + ld.d $s8, $sp, 24 # 8-byte Folded Reload + ld.d $s7, $sp, 32 # 8-byte Folded Reload + ld.d $s6, $sp, 40 # 8-byte Folded Reload + ld.d $s5, $sp, 48 # 8-byte Folded Reload + ld.d $s4, $sp, 56 # 8-byte Folded Reload + ld.d $s3, $sp, 64 # 8-byte Folded Reload + ld.d $s2, $sp, 72 # 8-byte Folded Reload + ld.d $s1, $sp, 80 # 8-byte Folded Reload + ld.d $s0, $sp, 88 # 8-byte Folded Reload + ld.d $fp, $sp, 96 # 8-byte Folded Reload + ld.d $ra, $sp, 104 # 8-byte Folded Reload + addi.d $sp, $sp, 112 ret .LBB38_27: .Ltmp841: # EH_LABEL @@ -15922,7 +15921,7 @@ _Z53benchForTruncOrZextVecInLoopFrom_uint8_t_To_uint16_t_RN9benchmark5StateE: # move $a0, $s0 pcaddu18i $ra, %call36(_ZdaPv) jirl $ra, $ra, 0 - ld.d $s4, $sp, 40 + ld.d $s4, $sp, 8 beqz $s4, .LBB38_42 .LBB38_41: # %_ZNKSt14default_deleteIA_tEclItEENSt9enable_ifIXsr14is_convertibleIPA_T_PS0_EE5valueEvE4typeEPS4_.exit.i30 move $a0, $s4 @@ -16093,7 +16092,6 @@ _Z67benchForTruncOrZextVecWithAddInLoopWithVW8From_uint8_t_To_uint16_t_RN9benchm addi.d $a1, $sp, 16 ori $a2, $s6, 2288 ori $a3, $s6, 2304 - vrepli.b $vr0, 0 ori $a4, $s5, 1792 .p2align 4, , 16 .LBB39_11: # %.lr.ph @@ -16126,26 +16124,26 @@ _Z67benchForTruncOrZextVecWithAddInLoopWithVW8From_uint8_t_To_uint16_t_RN9benchm ldptr.d $t2, $t0, 9992 ldptr.d $t3, $t0, 10000 ldptr.d $t0, $t0, 10008 - vinsgr2vr.d $vr1, $t1, 0 - vinsgr2vr.d $vr2, $t2, 0 - vinsgr2vr.d $vr3, $t3, 0 - vinsgr2vr.d $vr4, $t0, 0 - vilvl.b $vr1, $vr0, $vr1 - vilvl.b $vr2, $vr0, $vr2 - vilvl.b $vr3, $vr0, $vr3 - vilvl.b $vr4, $vr0, $vr4 - vld $vr5, $a6, -32 - vld $vr6, $a6, -16 - vld $vr7, $a6, 0 - vld $vr8, $a6, 16 + vinsgr2vr.d $vr0, $t1, 0 + vinsgr2vr.d $vr1, $t2, 0 + vinsgr2vr.d $vr2, $t3, 0 + vinsgr2vr.d $vr3, $t0, 0 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.hu.bu $vr3, $vr3, 0 + vld $vr4, $a6, -32 + vld $vr5, $a6, -16 + vld $vr6, $a6, 0 + vld $vr7, $a6, 16 + vadd.h $vr0, $vr4, $vr0 vadd.h $vr1, $vr5, $vr1 vadd.h $vr2, $vr6, $vr2 vadd.h $vr3, $vr7, $vr3 - vadd.h $vr4, $vr8, $vr4 - vst $vr1, $a6, -32 - vst $vr2, $a6, -16 - vst $vr3, $a6, 0 - vst $vr4, $a6, 16 + vst $vr0, $a6, -32 + vst $vr1, $a6, -16 + vst $vr2, $a6, 0 + vst $vr3, $a6, 16 addi.d $a7, $a7, 32 addi.d $a6, $a6, 64 bnez $a7, .LBB39_15 @@ -16386,7 +16384,6 @@ _Z68benchForTruncOrZextVecWithAddInLoopWithVW16From_uint8_t_To_uint16_t_RN9bench ori $a4, $s5, 1792 ori $a5, $s5, 1824 ori $a6, $s5, 1840 - vrepli.b $vr0, 0 .p2align 4, , 16 .LBB40_11: # %.lr.ph # =>This Loop Header: Depth=1 @@ -16414,42 +16411,46 @@ _Z68benchForTruncOrZextVecWithAddInLoopWithVW16From_uint8_t_To_uint16_t_RN9bench # Parent Loop BB40_11 Depth=1 # => This Inner Loop Header: Depth=2 add.d $t2, 
$fp, $t1 - vldx $vr1, $t2, $a4 - vldx $vr2, $t2, $s1 - vldx $vr3, $t2, $a5 - vldx $vr4, $t2, $a6 - vilvl.b $vr5, $vr0, $vr1 - vilvh.b $vr1, $vr0, $vr1 - vilvl.b $vr6, $vr0, $vr2 - vilvh.b $vr2, $vr0, $vr2 - vilvl.b $vr7, $vr0, $vr3 - vilvh.b $vr3, $vr0, $vr3 - vilvl.b $vr8, $vr0, $vr4 - vilvh.b $vr4, $vr0, $vr4 - vld $vr9, $t0, -64 - vld $vr10, $t0, -48 - vld $vr11, $t0, -32 - vld $vr12, $t0, -16 - vld $vr13, $t0, 0 - vld $vr14, $t0, 16 - vld $vr15, $t0, 32 - vld $vr16, $t0, 48 - vadd.h $vr1, $vr10, $vr1 - vadd.h $vr5, $vr9, $vr5 - vadd.h $vr2, $vr12, $vr2 - vadd.h $vr6, $vr11, $vr6 - vadd.h $vr3, $vr14, $vr3 - vadd.h $vr7, $vr13, $vr7 - vadd.h $vr4, $vr16, $vr4 - vadd.h $vr8, $vr15, $vr8 - vst $vr5, $t0, -64 - vst $vr1, $t0, -48 - vst $vr6, $t0, -32 - vst $vr2, $t0, -16 - vst $vr7, $t0, 0 - vst $vr3, $t0, 16 - vst $vr8, $t0, 32 - vst $vr4, $t0, 48 + vldx $vr0, $t2, $a4 + vldx $vr1, $t2, $s1 + vldx $vr2, $t2, $a5 + vldx $vr3, $t2, $a6 + vsllwil.hu.bu $vr4, $vr0, 0 + vbsrl.v $vr0, $vr0, 8 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.hu.bu $vr5, $vr1, 0 + vbsrl.v $vr1, $vr1, 8 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.hu.bu $vr6, $vr2, 0 + vbsrl.v $vr2, $vr2, 8 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.hu.bu $vr7, $vr3, 0 + vbsrl.v $vr3, $vr3, 8 + vsllwil.hu.bu $vr3, $vr3, 0 + vld $vr8, $t0, -64 + vld $vr9, $t0, -48 + vld $vr10, $t0, -32 + vld $vr11, $t0, -16 + vld $vr12, $t0, 0 + vld $vr13, $t0, 16 + vld $vr14, $t0, 32 + vld $vr15, $t0, 48 + vadd.h $vr0, $vr9, $vr0 + vadd.h $vr4, $vr8, $vr4 + vadd.h $vr1, $vr11, $vr1 + vadd.h $vr5, $vr10, $vr5 + vadd.h $vr2, $vr13, $vr2 + vadd.h $vr6, $vr12, $vr6 + vadd.h $vr3, $vr15, $vr3 + vadd.h $vr7, $vr14, $vr7 + vst $vr4, $t0, -64 + vst $vr0, $t0, -48 + vst $vr5, $t0, -32 + vst $vr1, $t0, -16 + vst $vr6, $t0, 0 + vst $vr2, $t0, 16 + vst $vr7, $t0, 32 + vst $vr3, $t0, 48 addi.d $t1, $t1, 64 addi.d $t0, $t0, 128 bnez $t1, .LBB40_15 @@ -16684,7 +16685,6 @@ _Z60benchForTruncOrZextVecWithAddInLoopFrom_uint8_t_To_uint16_t_RN9benchmark5Sta add.d $a0, $fp, $s1 addi.d $a1, $sp, 8 ori $a2, $s5, 2288 - vrepli.b $vr0, 0 b .LBB41_12 .p2align 4, , 16 .LBB41_11: # %_ZL27truncOrZextVecWithAddInLoopIhtEvPKT_PT0_i.exit @@ -16732,16 +16732,16 @@ _Z60benchForTruncOrZextVecWithAddInLoopFrom_uint8_t_To_uint16_t_RN9benchmark5Sta add.d $a5, $fp, $a4 ldptr.d $a6, $a5, 10000 ldptr.d $a5, $a5, 10008 - vinsgr2vr.d $vr1, $a6, 0 - vinsgr2vr.d $vr2, $a5, 0 - vld $vr3, $a3, -16 - vld $vr4, $a3, 0 - vilvl.b $vr1, $vr0, $vr1 - vilvl.b $vr2, $vr0, $vr2 + vinsgr2vr.d $vr0, $a6, 0 + vinsgr2vr.d $vr1, $a5, 0 + vld $vr2, $a3, -16 + vld $vr3, $a3, 0 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.hu.bu $vr1, $vr1, 0 + vadd.h $vr0, $vr2, $vr0 vadd.h $vr1, $vr3, $vr1 - vadd.h $vr2, $vr4, $vr2 - vst $vr1, $a3, -16 - vst $vr2, $a3, 0 + vst $vr0, $a3, -16 + vst $vr1, $a3, 0 addi.d $a4, $a4, 16 addi.d $a3, $a3, 32 bnez $a4, .LBB41_17 @@ -16855,19 +16855,19 @@ _Z60benchForTruncOrZextVecInLoopWithVW8From_uint8_t_To_uint32_t_RN9benchmark5Sta .cfi_personality 155, DW.ref.__gxx_personality_v0 .cfi_lsda 27, .Lexception42 # %bb.0: - addi.d $sp, $sp, -144 - .cfi_def_cfa_offset 144 - st.d $ra, $sp, 136 # 8-byte Folded Spill - st.d $fp, $sp, 128 # 8-byte Folded Spill - st.d $s0, $sp, 120 # 8-byte Folded Spill - st.d $s1, $sp, 112 # 8-byte Folded Spill - st.d $s2, $sp, 104 # 8-byte Folded Spill - st.d $s3, $sp, 96 # 8-byte Folded Spill - st.d $s4, $sp, 88 # 8-byte Folded Spill - st.d $s5, $sp, 80 # 8-byte Folded Spill - st.d $s6, $sp, 72 # 8-byte Folded Spill - st.d $s7, $sp, 64 # 8-byte Folded Spill - st.d $s8, $sp, 56 # 
8-byte Folded Spill + addi.d $sp, $sp, -128 + .cfi_def_cfa_offset 128 + st.d $ra, $sp, 120 # 8-byte Folded Spill + st.d $fp, $sp, 112 # 8-byte Folded Spill + st.d $s0, $sp, 104 # 8-byte Folded Spill + st.d $s1, $sp, 96 # 8-byte Folded Spill + st.d $s2, $sp, 88 # 8-byte Folded Spill + st.d $s3, $sp, 80 # 8-byte Folded Spill + st.d $s4, $sp, 72 # 8-byte Folded Spill + st.d $s5, $sp, 64 # 8-byte Folded Spill + st.d $s6, $sp, 56 # 8-byte Folded Spill + st.d $s7, $sp, 48 # 8-byte Folded Spill + st.d $s8, $sp, 40 # 8-byte Folded Spill .cfi_offset 1, -8 .cfi_offset 22, -16 .cfi_offset 23, -24 @@ -16895,8 +16895,8 @@ _Z60benchForTruncOrZextVecInLoopWithVW8From_uint8_t_To_uint32_t_RN9benchmark5Sta .Ltmp904: # EH_LABEL # %bb.1: # %.split move $s3, $a0 - st.d $s0, $sp, 32 # 8-byte Folded Spill - st.d $a0, $sp, 40 + st.d $s0, $sp, 16 # 8-byte Folded Spill + st.d $a0, $sp, 24 .Ltmp906: # EH_LABEL move $a0, $s2 pcaddu18i $ra, %call36(_Znam) @@ -16910,7 +16910,7 @@ _Z60benchForTruncOrZextVecInLoopWithVW8From_uint8_t_To_uint32_t_RN9benchmark5Sta add.d $s8, $s3, $a0 lu12i.w $a0, 15 ori $a0, $a0, 3840 - st.h $a0, $sp, 54 + st.h $a0, $sp, 38 lu12i.w $a0, -3 ori $s6, $a0, 2288 pcalau12i $a0, %pc_hi20(_ZL3rng) @@ -16918,8 +16918,8 @@ _Z60benchForTruncOrZextVecInLoopWithVW8From_uint8_t_To_uint32_t_RN9benchmark5Sta .p2align 4, , 16 .LBB42_3: # =>This Inner Loop Header: Depth=1 .Ltmp909: # EH_LABEL - addi.d $a0, $sp, 54 - addi.d $a2, $sp, 54 + addi.d $a0, $sp, 38 + addi.d $a2, $sp, 38 move $a1, $s5 pcaddu18i $ra, %call36(_ZNSt24uniform_int_distributionIhEclISt23mersenne_twister_engineImLm32ELm624ELm397ELm31ELm2567483615ELm11ELm4294967295ELm7ELm2636928640ELm15ELm4022730752ELm18ELm1812433253EEEEhRT_RKNS0_10param_typeE) jirl $ra, $ra, 0 @@ -16950,7 +16950,6 @@ _Z60benchForTruncOrZextVecInLoopWithVW8From_uint8_t_To_uint32_t_RN9benchmark5Sta addi.d $a0, $s3, 64 lu12i.w $a1, -3 ori $a1, $a1, 2304 - vrepli.b $vr8, 0 .p2align 4, , 16 .LBB42_8: # %vector.body84 # =>This Inner Loop Header: Depth=1 @@ -16963,26 +16962,34 @@ _Z60benchForTruncOrZextVecInLoopWithVW8From_uint8_t_To_uint32_t_RN9benchmark5Sta vinsgr2vr.d $vr1, $a4, 0 vinsgr2vr.d $vr2, $a5, 0 vinsgr2vr.d $vr3, $a2, 0 - vilvl.b $vr0, $vr8, $vr0 - vilvl.h $vr4, $vr8, $vr0 - vilvh.h $vr0, $vr8, $vr0 - vilvl.b $vr1, $vr8, $vr1 - vilvl.h $vr5, $vr8, $vr1 - vilvh.h $vr1, $vr8, $vr1 - vilvl.b $vr2, $vr8, $vr2 - vilvl.h $vr6, $vr8, $vr2 - vilvh.h $vr2, $vr8, $vr2 - vilvl.b $vr3, $vr8, $vr3 - vilvl.h $vr7, $vr8, $vr3 - vilvh.h $vr3, $vr8, $vr3 - vst $vr0, $a0, -48 - vst $vr4, $a0, -64 - vst $vr1, $a0, -16 - vst $vr5, $a0, -32 - vst $vr2, $a0, 16 - vst $vr6, $a0, 0 - vst $vr3, $a0, 48 - vst $vr7, $a0, 32 + vsrli.d $vr4, $vr0, 32 + vsllwil.hu.bu $vr4, $vr4, 0 + vsllwil.wu.hu $vr4, $vr4, 0 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsrli.d $vr5, $vr1, 32 + vsllwil.hu.bu $vr5, $vr5, 0 + vsllwil.wu.hu $vr5, $vr5, 0 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vsrli.d $vr6, $vr2, 32 + vsllwil.hu.bu $vr6, $vr6, 0 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vsrli.d $vr7, $vr3, 32 + vsllwil.hu.bu $vr7, $vr7, 0 + vsllwil.wu.hu $vr7, $vr7, 0 + vsllwil.hu.bu $vr3, $vr3, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vst $vr0, $a0, -64 + vst $vr4, $a0, -48 + vst $vr1, $a0, -32 + vst $vr5, $a0, -16 + vst $vr2, $a0, 0 + vst $vr6, $a0, 16 + vst $vr3, $a0, 32 + vst $vr7, $a0, 48 addi.d $a1, $a1, 32 addi.d $a0, $a0, 128 bnez $a1, .LBB42_8 @@ -17038,8 +17045,7 @@ 
_Z60benchForTruncOrZextVecInLoopWithVW8From_uint8_t_To_uint32_t_RN9benchmark5Sta addi.d $s6, $s6, 4 bnez $s1, .LBB42_10 # %bb.12: - vst $vr8, $sp, 16 # 16-byte Folded Spill - ld.d $s6, $sp, 32 # 8-byte Folded Reload + ld.d $s6, $sp, 16 # 8-byte Folded Reload ld.w $s3, $s6, 28 ld.d $s1, $s6, 16 .Ltmp927: # EH_LABEL @@ -17048,13 +17054,12 @@ _Z60benchForTruncOrZextVecInLoopWithVW8From_uint8_t_To_uint32_t_RN9benchmark5Sta jirl $ra, $ra, 0 .Ltmp928: # EH_LABEL # %bb.13: # %_ZN9benchmark5State3endEv.exit.preheader - vld $vr8, $sp, 16 # 16-byte Folded Reload bnez $s3, .LBB42_25 # %bb.14: # %_ZN9benchmark5State3endEv.exit.preheader beqz $s1, .LBB42_25 # %bb.15: # %.lr.ph.preheader add.d $a0, $fp, $s4 - addi.d $a1, $sp, 40 + addi.d $a1, $sp, 24 lu12i.w $a3, -3 ori $a2, $a3, 2288 ori $a3, $a3, 2304 @@ -17066,7 +17071,7 @@ _Z60benchForTruncOrZextVecInLoopWithVW8From_uint8_t_To_uint32_t_RN9benchmark5Sta #APP #NO_APP #MEMBARRIER - ld.d $a4, $sp, 40 + ld.d $a4, $sp, 24 bgeu $a4, $a0, .LBB42_19 # %bb.17: # %.lr.ph # in Loop: Header=BB42_16 Depth=1 @@ -17093,28 +17098,36 @@ _Z60benchForTruncOrZextVecInLoopWithVW8From_uint8_t_To_uint32_t_RN9benchmark5Sta vinsgr2vr.d $vr1, $t1, 0 vinsgr2vr.d $vr2, $t2, 0 vinsgr2vr.d $vr3, $a7, 0 - vilvl.b $vr0, $vr8, $vr0 - vilvl.h $vr4, $vr8, $vr0 - vilvh.h $vr0, $vr8, $vr0 - vilvl.b $vr1, $vr8, $vr1 - vilvl.h $vr5, $vr8, $vr1 - vilvh.h $vr1, $vr8, $vr1 - vilvl.b $vr2, $vr8, $vr2 - vilvl.h $vr6, $vr8, $vr2 - vilvh.h $vr2, $vr8, $vr2 - vilvl.b $vr3, $vr8, $vr3 - vilvl.h $vr7, $vr8, $vr3 - vilvh.h $vr3, $vr8, $vr3 - vst $vr0, $a5, -48 - vst $vr4, $a5, -64 - vst $vr1, $a5, -16 - vst $vr5, $a5, -32 - vst $vr2, $a5, 16 - vst $vr6, $a5, 0 - vst $vr3, $a5, 48 - vst $vr7, $a5, 32 - addi.d $a6, $a6, 32 - addi.d $a5, $a5, 128 + vsrli.d $vr4, $vr0, 32 + vsllwil.hu.bu $vr4, $vr4, 0 + vsllwil.wu.hu $vr4, $vr4, 0 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsrli.d $vr5, $vr1, 32 + vsllwil.hu.bu $vr5, $vr5, 0 + vsllwil.wu.hu $vr5, $vr5, 0 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vsrli.d $vr6, $vr2, 32 + vsllwil.hu.bu $vr6, $vr6, 0 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vsrli.d $vr7, $vr3, 32 + vsllwil.hu.bu $vr7, $vr7, 0 + vsllwil.wu.hu $vr7, $vr7, 0 + vsllwil.hu.bu $vr3, $vr3, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vst $vr0, $a5, -64 + vst $vr4, $a5, -48 + vst $vr1, $a5, -32 + vst $vr5, $a5, -16 + vst $vr2, $a5, 0 + vst $vr6, $a5, 16 + vst $vr3, $a5, 32 + vst $vr7, $a5, 48 + addi.d $a6, $a6, 32 + addi.d $a5, $a5, 128 bnez $a6, .LBB42_20 # %bb.21: # in Loop: Header=BB42_16 Depth=1 move $a6, $s5 @@ -17147,7 +17160,7 @@ _Z60benchForTruncOrZextVecInLoopWithVW8From_uint8_t_To_uint32_t_RN9benchmark5Sta move $a0, $s0 pcaddu18i $ra, %call36(_ZdaPv) jirl $ra, $ra, 0 - ld.d $a0, $sp, 40 + ld.d $a0, $sp, 24 beqz $a0, .LBB42_28 # %bb.27: # %_ZNKSt14default_deleteIA_jEclIjEENSt9enable_ifIXsr14is_convertibleIPA_T_PS0_EE5valueEvE4typeEPS4_.exit.i37 pcaddu18i $ra, %call36(_ZdaPv) @@ -17156,18 +17169,18 @@ _Z60benchForTruncOrZextVecInLoopWithVW8From_uint8_t_To_uint32_t_RN9benchmark5Sta move $a0, $fp pcaddu18i $ra, %call36(_ZdaPv) jirl $ra, $ra, 0 - ld.d $s8, $sp, 56 # 8-byte Folded Reload - ld.d $s7, $sp, 64 # 8-byte Folded Reload - ld.d $s6, $sp, 72 # 8-byte Folded Reload - ld.d $s5, $sp, 80 # 8-byte Folded Reload - ld.d $s4, $sp, 88 # 8-byte Folded Reload - ld.d $s3, $sp, 96 # 8-byte Folded Reload - ld.d $s2, $sp, 104 # 8-byte Folded Reload - ld.d $s1, $sp, 112 # 8-byte Folded Reload - ld.d $s0, $sp, 120 # 8-byte 
Folded Reload - ld.d $fp, $sp, 128 # 8-byte Folded Reload - ld.d $ra, $sp, 136 # 8-byte Folded Reload - addi.d $sp, $sp, 144 + ld.d $s8, $sp, 40 # 8-byte Folded Reload + ld.d $s7, $sp, 48 # 8-byte Folded Reload + ld.d $s6, $sp, 56 # 8-byte Folded Reload + ld.d $s5, $sp, 64 # 8-byte Folded Reload + ld.d $s4, $sp, 72 # 8-byte Folded Reload + ld.d $s3, $sp, 80 # 8-byte Folded Reload + ld.d $s2, $sp, 88 # 8-byte Folded Reload + ld.d $s1, $sp, 96 # 8-byte Folded Reload + ld.d $s0, $sp, 104 # 8-byte Folded Reload + ld.d $fp, $sp, 112 # 8-byte Folded Reload + ld.d $ra, $sp, 120 # 8-byte Folded Reload + addi.d $sp, $sp, 128 ret .LBB42_29: .Ltmp912: # EH_LABEL @@ -17259,7 +17272,7 @@ _Z60benchForTruncOrZextVecInLoopWithVW8From_uint8_t_To_uint32_t_RN9benchmark5Sta move $a0, $s0 pcaddu18i $ra, %call36(_ZdaPv) jirl $ra, $ra, 0 - ld.d $s3, $sp, 40 + ld.d $s3, $sp, 24 beqz $s3, .LBB42_44 .LBB42_43: # %_ZNKSt14default_deleteIA_jEclIjEENSt9enable_ifIXsr14is_convertibleIPA_T_PS0_EE5valueEvE4typeEPS4_.exit.i30 move $a0, $s3 @@ -17329,19 +17342,19 @@ _Z61benchForTruncOrZextVecInLoopWithVW16From_uint8_t_To_uint32_t_RN9benchmark5St .cfi_personality 155, DW.ref.__gxx_personality_v0 .cfi_lsda 27, .Lexception43 # %bb.0: - addi.d $sp, $sp, -144 - .cfi_def_cfa_offset 144 - st.d $ra, $sp, 136 # 8-byte Folded Spill - st.d $fp, $sp, 128 # 8-byte Folded Spill - st.d $s0, $sp, 120 # 8-byte Folded Spill - st.d $s1, $sp, 112 # 8-byte Folded Spill - st.d $s2, $sp, 104 # 8-byte Folded Spill - st.d $s3, $sp, 96 # 8-byte Folded Spill - st.d $s4, $sp, 88 # 8-byte Folded Spill - st.d $s5, $sp, 80 # 8-byte Folded Spill - st.d $s6, $sp, 72 # 8-byte Folded Spill - st.d $s7, $sp, 64 # 8-byte Folded Spill - st.d $s8, $sp, 56 # 8-byte Folded Spill + addi.d $sp, $sp, -128 + .cfi_def_cfa_offset 128 + st.d $ra, $sp, 120 # 8-byte Folded Spill + st.d $fp, $sp, 112 # 8-byte Folded Spill + st.d $s0, $sp, 104 # 8-byte Folded Spill + st.d $s1, $sp, 96 # 8-byte Folded Spill + st.d $s2, $sp, 88 # 8-byte Folded Spill + st.d $s3, $sp, 80 # 8-byte Folded Spill + st.d $s4, $sp, 72 # 8-byte Folded Spill + st.d $s5, $sp, 64 # 8-byte Folded Spill + st.d $s6, $sp, 56 # 8-byte Folded Spill + st.d $s7, $sp, 48 # 8-byte Folded Spill + st.d $s8, $sp, 40 # 8-byte Folded Spill .cfi_offset 1, -8 .cfi_offset 22, -16 .cfi_offset 23, -24 @@ -17370,7 +17383,7 @@ _Z61benchForTruncOrZextVecInLoopWithVW16From_uint8_t_To_uint32_t_RN9benchmark5St # %bb.1: # %.split move $s4, $a0 st.d $s0, $sp, 8 # 8-byte Folded Spill - st.d $a0, $sp, 40 + st.d $a0, $sp, 24 .Ltmp935: # EH_LABEL move $a0, $s3 pcaddu18i $ra, %call36(_Znam) @@ -17385,7 +17398,7 @@ _Z61benchForTruncOrZextVecInLoopWithVW16From_uint8_t_To_uint32_t_RN9benchmark5St add.d $s8, $s4, $a0 lu12i.w $a0, 15 ori $a0, $a0, 3840 - st.h $a0, $sp, 54 + st.h $a0, $sp, 38 lu12i.w $s6, -3 ori $s2, $s6, 2288 pcalau12i $a0, %pc_hi20(_ZL3rng) @@ -17393,8 +17406,8 @@ _Z61benchForTruncOrZextVecInLoopWithVW16From_uint8_t_To_uint32_t_RN9benchmark5St .p2align 4, , 16 .LBB43_3: # =>This Inner Loop Header: Depth=1 .Ltmp938: # EH_LABEL - addi.d $a0, $sp, 54 - addi.d $a2, $sp, 54 + addi.d $a0, $sp, 38 + addi.d $a2, $sp, 38 move $a1, $s5 pcaddu18i $ra, %call36(_ZNSt24uniform_int_distributionIhEclISt23mersenne_twister_engineImLm32ELm624ELm397ELm31ELm2567483615ELm11ELm4294967295ELm7ELm2636928640ELm15ELm4022730752ELm18ELm1812433253EEEEhRT_RKNS0_10param_typeE) jirl $ra, $ra, 0 @@ -17427,7 +17440,6 @@ _Z61benchForTruncOrZextVecInLoopWithVW16From_uint8_t_To_uint32_t_RN9benchmark5St ori $s6, $s7, 1824 move $a4, $s7 ori $s7, $s7, 1840 - vrepli.b 
$vr16, 0 .p2align 4, , 16 .LBB43_8: # %vector.body84 # =>This Inner Loop Header: Depth=1 @@ -17436,46 +17448,66 @@ _Z61benchForTruncOrZextVecInLoopWithVW16From_uint8_t_To_uint32_t_RN9benchmark5St vldx $vr1, $a2, $s1 vldx $vr2, $a2, $s6 vldx $vr3, $a2, $s7 - vilvl.b $vr4, $vr16, $vr0 - vilvl.h $vr5, $vr16, $vr4 - vilvh.h $vr4, $vr16, $vr4 - vilvh.b $vr0, $vr16, $vr0 - vilvl.h $vr6, $vr16, $vr0 - vilvh.h $vr0, $vr16, $vr0 - vilvl.b $vr7, $vr16, $vr1 - vilvl.h $vr8, $vr16, $vr7 - vilvh.h $vr7, $vr16, $vr7 - vilvh.b $vr1, $vr16, $vr1 - vilvl.h $vr9, $vr16, $vr1 - vilvh.h $vr1, $vr16, $vr1 - vilvl.b $vr10, $vr16, $vr2 - vilvl.h $vr11, $vr16, $vr10 - vilvh.h $vr10, $vr16, $vr10 - vilvh.b $vr2, $vr16, $vr2 - vilvl.h $vr12, $vr16, $vr2 - vilvh.h $vr2, $vr16, $vr2 - vilvl.b $vr13, $vr16, $vr3 - vilvl.h $vr14, $vr16, $vr13 - vilvh.h $vr13, $vr16, $vr13 - vilvh.b $vr3, $vr16, $vr3 - vilvl.h $vr15, $vr16, $vr3 - vilvh.h $vr3, $vr16, $vr3 - vst $vr0, $a0, -80 - vst $vr6, $a0, -96 + vsrli.d $vr4, $vr0, 32 + vsllwil.hu.bu $vr4, $vr4, 0 + vsllwil.wu.hu $vr4, $vr4, 0 + vbsrl.v $vr5, $vr0, 8 + vsllwil.hu.bu $vr5, $vr5, 0 + vsllwil.wu.hu $vr5, $vr5, 0 + vbsrl.v $vr6, $vr0, 12 + vsllwil.hu.bu $vr6, $vr6, 0 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsrli.d $vr7, $vr1, 32 + vsllwil.hu.bu $vr7, $vr7, 0 + vsllwil.wu.hu $vr7, $vr7, 0 + vbsrl.v $vr8, $vr1, 8 + vsllwil.hu.bu $vr8, $vr8, 0 + vsllwil.wu.hu $vr8, $vr8, 0 + vbsrl.v $vr9, $vr1, 12 + vsllwil.hu.bu $vr9, $vr9, 0 + vsllwil.wu.hu $vr9, $vr9, 0 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vsrli.d $vr10, $vr2, 32 + vsllwil.hu.bu $vr10, $vr10, 0 + vsllwil.wu.hu $vr10, $vr10, 0 + vbsrl.v $vr11, $vr2, 8 + vsllwil.hu.bu $vr11, $vr11, 0 + vsllwil.wu.hu $vr11, $vr11, 0 + vbsrl.v $vr12, $vr2, 12 + vsllwil.hu.bu $vr12, $vr12, 0 + vsllwil.wu.hu $vr12, $vr12, 0 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vsrli.d $vr13, $vr3, 32 + vsllwil.hu.bu $vr13, $vr13, 0 + vsllwil.wu.hu $vr13, $vr13, 0 + vbsrl.v $vr14, $vr3, 8 + vsllwil.hu.bu $vr14, $vr14, 0 + vsllwil.wu.hu $vr14, $vr14, 0 + vbsrl.v $vr15, $vr3, 12 + vsllwil.hu.bu $vr15, $vr15, 0 + vsllwil.wu.hu $vr15, $vr15, 0 + vsllwil.hu.bu $vr3, $vr3, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vst $vr0, $a0, -128 + vst $vr6, $a0, -80 + vst $vr5, $a0, -96 vst $vr4, $a0, -112 - vst $vr5, $a0, -128 - vst $vr1, $a0, -16 - vst $vr9, $a0, -32 + vst $vr1, $a0, -64 + vst $vr9, $a0, -16 + vst $vr8, $a0, -32 vst $vr7, $a0, -48 - vst $vr8, $a0, -64 - vst $vr2, $a0, 48 - vst $vr12, $a0, 32 + vst $vr2, $a0, 0 + vst $vr12, $a0, 48 + vst $vr11, $a0, 32 vst $vr10, $a0, 16 - vst $vr11, $a0, 0 - vst $vr3, $a0, 112 - vst $vr15, $a0, 96 + vst $vr3, $a0, 64 + vst $vr15, $a0, 112 + vst $vr14, $a0, 96 vst $vr13, $a0, 80 - vst $vr14, $a0, 64 addi.d $a1, $a1, 64 addi.d $a0, $a0, 256 bnez $a1, .LBB43_8 @@ -17531,7 +17563,6 @@ _Z61benchForTruncOrZextVecInLoopWithVW16From_uint8_t_To_uint32_t_RN9benchmark5St addi.d $s2, $s2, 4 bnez $s8, .LBB43_10 # %bb.12: - vst $vr16, $sp, 16 # 16-byte Folded Spill ld.d $s8, $sp, 8 # 8-byte Folded Reload ld.w $s4, $s8, 28 ld.d $s2, $s8, 16 @@ -17541,13 +17572,12 @@ _Z61benchForTruncOrZextVecInLoopWithVW16From_uint8_t_To_uint32_t_RN9benchmark5St jirl $ra, $ra, 0 .Ltmp957: # EH_LABEL # %bb.13: # %_ZN9benchmark5State3endEv.exit.preheader - vld $vr16, $sp, 16 # 16-byte Folded Reload bnez $s4, .LBB43_25 # %bb.14: # %_ZN9benchmark5State3endEv.exit.preheader beqz $s2, .LBB43_25 # %bb.15: # %.lr.ph.preheader add.d $a0, $fp, $s1 - addi.d $a1, $sp, 40 + 
addi.d $a1, $sp, 24 lu12i.w $a3, -3 ori $a2, $a3, 2288 ori $a3, $a3, 2304 @@ -17559,7 +17589,7 @@ _Z61benchForTruncOrZextVecInLoopWithVW16From_uint8_t_To_uint32_t_RN9benchmark5St #APP #NO_APP #MEMBARRIER - ld.d $a4, $sp, 40 + ld.d $a4, $sp, 24 bgeu $a4, $a0, .LBB43_19 # %bb.17: # %.lr.ph # in Loop: Header=BB43_16 Depth=1 @@ -17582,46 +17612,66 @@ _Z61benchForTruncOrZextVecInLoopWithVW16From_uint8_t_To_uint32_t_RN9benchmark5St vldx $vr1, $a7, $s1 vldx $vr2, $a7, $s6 vldx $vr3, $a7, $s7 - vilvl.b $vr4, $vr16, $vr0 - vilvl.h $vr5, $vr16, $vr4 - vilvh.h $vr4, $vr16, $vr4 - vilvh.b $vr0, $vr16, $vr0 - vilvl.h $vr6, $vr16, $vr0 - vilvh.h $vr0, $vr16, $vr0 - vilvl.b $vr7, $vr16, $vr1 - vilvl.h $vr8, $vr16, $vr7 - vilvh.h $vr7, $vr16, $vr7 - vilvh.b $vr1, $vr16, $vr1 - vilvl.h $vr9, $vr16, $vr1 - vilvh.h $vr1, $vr16, $vr1 - vilvl.b $vr10, $vr16, $vr2 - vilvl.h $vr11, $vr16, $vr10 - vilvh.h $vr10, $vr16, $vr10 - vilvh.b $vr2, $vr16, $vr2 - vilvl.h $vr12, $vr16, $vr2 - vilvh.h $vr2, $vr16, $vr2 - vilvl.b $vr13, $vr16, $vr3 - vilvl.h $vr14, $vr16, $vr13 - vilvh.h $vr13, $vr16, $vr13 - vilvh.b $vr3, $vr16, $vr3 - vilvl.h $vr15, $vr16, $vr3 - vilvh.h $vr3, $vr16, $vr3 - vst $vr0, $a5, -80 - vst $vr6, $a5, -96 + vsrli.d $vr4, $vr0, 32 + vsllwil.hu.bu $vr4, $vr4, 0 + vsllwil.wu.hu $vr4, $vr4, 0 + vbsrl.v $vr5, $vr0, 8 + vsllwil.hu.bu $vr5, $vr5, 0 + vsllwil.wu.hu $vr5, $vr5, 0 + vbsrl.v $vr6, $vr0, 12 + vsllwil.hu.bu $vr6, $vr6, 0 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsrli.d $vr7, $vr1, 32 + vsllwil.hu.bu $vr7, $vr7, 0 + vsllwil.wu.hu $vr7, $vr7, 0 + vbsrl.v $vr8, $vr1, 8 + vsllwil.hu.bu $vr8, $vr8, 0 + vsllwil.wu.hu $vr8, $vr8, 0 + vbsrl.v $vr9, $vr1, 12 + vsllwil.hu.bu $vr9, $vr9, 0 + vsllwil.wu.hu $vr9, $vr9, 0 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vsrli.d $vr10, $vr2, 32 + vsllwil.hu.bu $vr10, $vr10, 0 + vsllwil.wu.hu $vr10, $vr10, 0 + vbsrl.v $vr11, $vr2, 8 + vsllwil.hu.bu $vr11, $vr11, 0 + vsllwil.wu.hu $vr11, $vr11, 0 + vbsrl.v $vr12, $vr2, 12 + vsllwil.hu.bu $vr12, $vr12, 0 + vsllwil.wu.hu $vr12, $vr12, 0 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vsrli.d $vr13, $vr3, 32 + vsllwil.hu.bu $vr13, $vr13, 0 + vsllwil.wu.hu $vr13, $vr13, 0 + vbsrl.v $vr14, $vr3, 8 + vsllwil.hu.bu $vr14, $vr14, 0 + vsllwil.wu.hu $vr14, $vr14, 0 + vbsrl.v $vr15, $vr3, 12 + vsllwil.hu.bu $vr15, $vr15, 0 + vsllwil.wu.hu $vr15, $vr15, 0 + vsllwil.hu.bu $vr3, $vr3, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vst $vr0, $a5, -128 + vst $vr6, $a5, -80 + vst $vr5, $a5, -96 vst $vr4, $a5, -112 - vst $vr5, $a5, -128 - vst $vr1, $a5, -16 - vst $vr9, $a5, -32 + vst $vr1, $a5, -64 + vst $vr9, $a5, -16 + vst $vr8, $a5, -32 vst $vr7, $a5, -48 - vst $vr8, $a5, -64 - vst $vr2, $a5, 48 - vst $vr12, $a5, 32 + vst $vr2, $a5, 0 + vst $vr12, $a5, 48 + vst $vr11, $a5, 32 vst $vr10, $a5, 16 - vst $vr11, $a5, 0 - vst $vr3, $a5, 112 - vst $vr15, $a5, 96 + vst $vr3, $a5, 64 + vst $vr15, $a5, 112 + vst $vr14, $a5, 96 vst $vr13, $a5, 80 - vst $vr14, $a5, 64 addi.d $a6, $a6, 64 addi.d $a5, $a5, 256 bnez $a6, .LBB43_20 @@ -17656,7 +17706,7 @@ _Z61benchForTruncOrZextVecInLoopWithVW16From_uint8_t_To_uint32_t_RN9benchmark5St move $a0, $s0 pcaddu18i $ra, %call36(_ZdaPv) jirl $ra, $ra, 0 - ld.d $a0, $sp, 40 + ld.d $a0, $sp, 24 beqz $a0, .LBB43_28 # %bb.27: # %_ZNKSt14default_deleteIA_jEclIjEENSt9enable_ifIXsr14is_convertibleIPA_T_PS0_EE5valueEvE4typeEPS4_.exit.i37 pcaddu18i $ra, %call36(_ZdaPv) @@ -17665,18 +17715,18 @@ 
_Z61benchForTruncOrZextVecInLoopWithVW16From_uint8_t_To_uint32_t_RN9benchmark5St move $a0, $fp pcaddu18i $ra, %call36(_ZdaPv) jirl $ra, $ra, 0 - ld.d $s8, $sp, 56 # 8-byte Folded Reload - ld.d $s7, $sp, 64 # 8-byte Folded Reload - ld.d $s6, $sp, 72 # 8-byte Folded Reload - ld.d $s5, $sp, 80 # 8-byte Folded Reload - ld.d $s4, $sp, 88 # 8-byte Folded Reload - ld.d $s3, $sp, 96 # 8-byte Folded Reload - ld.d $s2, $sp, 104 # 8-byte Folded Reload - ld.d $s1, $sp, 112 # 8-byte Folded Reload - ld.d $s0, $sp, 120 # 8-byte Folded Reload - ld.d $fp, $sp, 128 # 8-byte Folded Reload - ld.d $ra, $sp, 136 # 8-byte Folded Reload - addi.d $sp, $sp, 144 + ld.d $s8, $sp, 40 # 8-byte Folded Reload + ld.d $s7, $sp, 48 # 8-byte Folded Reload + ld.d $s6, $sp, 56 # 8-byte Folded Reload + ld.d $s5, $sp, 64 # 8-byte Folded Reload + ld.d $s4, $sp, 72 # 8-byte Folded Reload + ld.d $s3, $sp, 80 # 8-byte Folded Reload + ld.d $s2, $sp, 88 # 8-byte Folded Reload + ld.d $s1, $sp, 96 # 8-byte Folded Reload + ld.d $s0, $sp, 104 # 8-byte Folded Reload + ld.d $fp, $sp, 112 # 8-byte Folded Reload + ld.d $ra, $sp, 120 # 8-byte Folded Reload + addi.d $sp, $sp, 128 ret .LBB43_29: .Ltmp941: # EH_LABEL @@ -17768,7 +17818,7 @@ _Z61benchForTruncOrZextVecInLoopWithVW16From_uint8_t_To_uint32_t_RN9benchmark5St move $a0, $s0 pcaddu18i $ra, %call36(_ZdaPv) jirl $ra, $ra, 0 - ld.d $s4, $sp, 40 + ld.d $s4, $sp, 24 beqz $s4, .LBB43_44 .LBB43_43: # %_ZNKSt14default_deleteIA_jEclIjEENSt9enable_ifIXsr14is_convertibleIPA_T_PS0_EE5valueEvE4typeEPS4_.exit.i30 move $a0, $s4 @@ -17838,19 +17888,19 @@ _Z53benchForTruncOrZextVecInLoopFrom_uint8_t_To_uint32_t_RN9benchmark5StateE: # .cfi_personality 155, DW.ref.__gxx_personality_v0 .cfi_lsda 27, .Lexception44 # %bb.0: - addi.d $sp, $sp, -144 - .cfi_def_cfa_offset 144 - st.d $ra, $sp, 136 # 8-byte Folded Spill - st.d $fp, $sp, 128 # 8-byte Folded Spill - st.d $s0, $sp, 120 # 8-byte Folded Spill - st.d $s1, $sp, 112 # 8-byte Folded Spill - st.d $s2, $sp, 104 # 8-byte Folded Spill - st.d $s3, $sp, 96 # 8-byte Folded Spill - st.d $s4, $sp, 88 # 8-byte Folded Spill - st.d $s5, $sp, 80 # 8-byte Folded Spill - st.d $s6, $sp, 72 # 8-byte Folded Spill - st.d $s7, $sp, 64 # 8-byte Folded Spill - st.d $s8, $sp, 56 # 8-byte Folded Spill + addi.d $sp, $sp, -112 + .cfi_def_cfa_offset 112 + st.d $ra, $sp, 104 # 8-byte Folded Spill + st.d $fp, $sp, 96 # 8-byte Folded Spill + st.d $s0, $sp, 88 # 8-byte Folded Spill + st.d $s1, $sp, 80 # 8-byte Folded Spill + st.d $s2, $sp, 72 # 8-byte Folded Spill + st.d $s3, $sp, 64 # 8-byte Folded Spill + st.d $s4, $sp, 56 # 8-byte Folded Spill + st.d $s5, $sp, 48 # 8-byte Folded Spill + st.d $s6, $sp, 40 # 8-byte Folded Spill + st.d $s7, $sp, 32 # 8-byte Folded Spill + st.d $s8, $sp, 24 # 8-byte Folded Spill .cfi_offset 1, -8 .cfi_offset 22, -16 .cfi_offset 23, -24 @@ -17878,7 +17928,7 @@ _Z53benchForTruncOrZextVecInLoopFrom_uint8_t_To_uint32_t_RN9benchmark5StateE: # .Ltmp962: # EH_LABEL # %bb.1: move $s4, $a0 - st.d $a0, $sp, 40 + st.d $a0, $sp, 8 .Ltmp964: # EH_LABEL move $a0, $s3 pcaddu18i $ra, %call36(_Znam) @@ -17888,7 +17938,7 @@ _Z53benchForTruncOrZextVecInLoopFrom_uint8_t_To_uint32_t_RN9benchmark5StateE: # move $s0, $a0 lu12i.w $a0, 15 ori $a0, $a0, 3840 - st.h $a0, $sp, 54 + st.h $a0, $sp, 22 lu12i.w $s6, -3 ori $s8, $s6, 2288 pcalau12i $a0, %pc_hi20(_ZL3rng) @@ -17896,8 +17946,8 @@ _Z53benchForTruncOrZextVecInLoopFrom_uint8_t_To_uint32_t_RN9benchmark5StateE: # .p2align 4, , 16 .LBB44_3: # =>This Inner Loop Header: Depth=1 .Ltmp967: # EH_LABEL - addi.d $a0, $sp, 54 - addi.d 
$a2, $sp, 54 + addi.d $a0, $sp, 22 + addi.d $a2, $sp, 22 move $a1, $s5 pcaddu18i $ra, %call36(_ZNSt24uniform_int_distributionIhEclISt23mersenne_twister_engineImLm32ELm624ELm397ELm31ELm2567483615ELm11ELm4294967295ELm7ELm2636928640ELm15ELm4022730752ELm18ELm1812433253EEEEhRT_RKNS0_10param_typeE) jirl $ra, $ra, 0 @@ -17927,7 +17977,6 @@ _Z53benchForTruncOrZextVecInLoopFrom_uint8_t_To_uint32_t_RN9benchmark5StateE: # addi.d $a0, $s4, 16 ori $a1, $s6, 2288 ori $s5, $s7, 1812 - vrepli.b $vr2, 0 .p2align 4, , 16 .LBB44_8: # %vector.body84 # =>This Inner Loop Header: Depth=1 @@ -17936,10 +17985,10 @@ _Z53benchForTruncOrZextVecInLoopFrom_uint8_t_To_uint32_t_RN9benchmark5StateE: # ldx.w $a2, $a2, $s5 vinsgr2vr.w $vr0, $a3, 0 vinsgr2vr.w $vr1, $a2, 0 - vilvl.b $vr0, $vr2, $vr0 - vilvl.h $vr0, $vr2, $vr0 - vilvl.b $vr1, $vr2, $vr1 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 vst $vr0, $a0, -16 vst $vr1, $a0, 0 addi.d $a1, $a1, 8 @@ -17960,7 +18009,6 @@ _Z53benchForTruncOrZextVecInLoopFrom_uint8_t_To_uint32_t_RN9benchmark5StateE: # addi.d $s7, $s7, 4 bnez $s8, .LBB44_10 # %bb.12: - vst $vr2, $sp, 16 # 16-byte Folded Spill ld.w $s7, $s2, 28 ld.d $s4, $s2, 16 .Ltmp985: # EH_LABEL @@ -17969,13 +18017,12 @@ _Z53benchForTruncOrZextVecInLoopFrom_uint8_t_To_uint32_t_RN9benchmark5StateE: # jirl $ra, $ra, 0 .Ltmp986: # EH_LABEL # %bb.13: # %_ZN9benchmark5State3endEv.exit.preheader - vld $vr2, $sp, 16 # 16-byte Folded Reload bnez $s7, .LBB44_23 # %bb.14: # %_ZN9benchmark5State3endEv.exit.preheader beqz $s4, .LBB44_23 # %bb.15: # %.lr.ph.preheader add.d $a0, $fp, $s1 - addi.d $a1, $sp, 40 + addi.d $a1, $sp, 8 ori $a2, $s6, 2288 b .LBB44_17 .p2align 4, , 16 @@ -17990,7 +18037,7 @@ _Z53benchForTruncOrZextVecInLoopFrom_uint8_t_To_uint32_t_RN9benchmark5StateE: # #APP #NO_APP #MEMBARRIER - ld.d $a3, $sp, 40 + ld.d $a3, $sp, 8 bgeu $a3, $a0, .LBB44_21 # %bb.18: # %.lr.ph # in Loop: Header=BB44_17 Depth=1 @@ -18024,10 +18071,10 @@ _Z53benchForTruncOrZextVecInLoopFrom_uint8_t_To_uint32_t_RN9benchmark5StateE: # ldx.w $a5, $a5, $s5 vinsgr2vr.w $vr0, $a6, 0 vinsgr2vr.w $vr1, $a5, 0 - vilvl.b $vr0, $vr2, $vr0 - vilvl.h $vr0, $vr2, $vr0 - vilvl.b $vr1, $vr2, $vr1 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 vst $vr0, $a3, -16 vst $vr1, $a3, 0 addi.d $a4, $a4, 8 @@ -18044,7 +18091,7 @@ _Z53benchForTruncOrZextVecInLoopFrom_uint8_t_To_uint32_t_RN9benchmark5StateE: # move $a0, $s0 pcaddu18i $ra, %call36(_ZdaPv) jirl $ra, $ra, 0 - ld.d $a0, $sp, 40 + ld.d $a0, $sp, 8 beqz $a0, .LBB44_26 # %bb.25: # %_ZNKSt14default_deleteIA_jEclIjEENSt9enable_ifIXsr14is_convertibleIPA_T_PS0_EE5valueEvE4typeEPS4_.exit.i37 pcaddu18i $ra, %call36(_ZdaPv) @@ -18053,18 +18100,18 @@ _Z53benchForTruncOrZextVecInLoopFrom_uint8_t_To_uint32_t_RN9benchmark5StateE: # move $a0, $fp pcaddu18i $ra, %call36(_ZdaPv) jirl $ra, $ra, 0 - ld.d $s8, $sp, 56 # 8-byte Folded Reload - ld.d $s7, $sp, 64 # 8-byte Folded Reload - ld.d $s6, $sp, 72 # 8-byte Folded Reload - ld.d $s5, $sp, 80 # 8-byte Folded Reload - ld.d $s4, $sp, 88 # 8-byte Folded Reload - ld.d $s3, $sp, 96 # 8-byte Folded Reload - ld.d $s2, $sp, 104 # 8-byte Folded Reload - ld.d $s1, $sp, 112 # 8-byte Folded Reload - ld.d $s0, $sp, 120 # 8-byte Folded Reload - ld.d $fp, $sp, 128 # 8-byte Folded Reload - ld.d $ra, $sp, 136 # 8-byte Folded Reload - addi.d $sp, $sp, 144 + ld.d $s8, $sp, 24 # 8-byte Folded Reload 
+ ld.d $s7, $sp, 32 # 8-byte Folded Reload + ld.d $s6, $sp, 40 # 8-byte Folded Reload + ld.d $s5, $sp, 48 # 8-byte Folded Reload + ld.d $s4, $sp, 56 # 8-byte Folded Reload + ld.d $s3, $sp, 64 # 8-byte Folded Reload + ld.d $s2, $sp, 72 # 8-byte Folded Reload + ld.d $s1, $sp, 80 # 8-byte Folded Reload + ld.d $s0, $sp, 88 # 8-byte Folded Reload + ld.d $fp, $sp, 96 # 8-byte Folded Reload + ld.d $ra, $sp, 104 # 8-byte Folded Reload + addi.d $sp, $sp, 112 ret .LBB44_27: .Ltmp970: # EH_LABEL @@ -18156,7 +18203,7 @@ _Z53benchForTruncOrZextVecInLoopFrom_uint8_t_To_uint32_t_RN9benchmark5StateE: # move $a0, $s0 pcaddu18i $ra, %call36(_ZdaPv) jirl $ra, $ra, 0 - ld.d $s4, $sp, 40 + ld.d $s4, $sp, 8 beqz $s4, .LBB44_42 .LBB44_41: # %_ZNKSt14default_deleteIA_jEclIjEENSt9enable_ifIXsr14is_convertibleIPA_T_PS0_EE5valueEvE4typeEPS4_.exit.i30 move $a0, $s4 @@ -18325,7 +18372,6 @@ _Z67benchForTruncOrZextVecWithAddInLoopWithVW8From_uint8_t_To_uint32_t_RN9benchm addi.d $a1, $sp, 16 ori $a2, $s6, 2288 ori $a3, $s6, 2304 - vrepli.b $vr0, 0 ori $a4, $s5, 1792 .p2align 4, , 16 .LBB45_11: # %.lr.ph @@ -18358,46 +18404,54 @@ _Z67benchForTruncOrZextVecWithAddInLoopWithVW8From_uint8_t_To_uint32_t_RN9benchm ldptr.d $t2, $t0, 9992 ldptr.d $t3, $t0, 10000 ldptr.d $t0, $t0, 10008 - vinsgr2vr.d $vr1, $t1, 0 - vinsgr2vr.d $vr2, $t2, 0 - vinsgr2vr.d $vr3, $t3, 0 - vinsgr2vr.d $vr4, $t0, 0 - vilvl.b $vr1, $vr0, $vr1 - vilvl.h $vr5, $vr0, $vr1 - vilvh.h $vr1, $vr0, $vr1 - vilvl.b $vr2, $vr0, $vr2 - vilvl.h $vr6, $vr0, $vr2 - vilvh.h $vr2, $vr0, $vr2 - vilvl.b $vr3, $vr0, $vr3 - vilvl.h $vr7, $vr0, $vr3 - vilvh.h $vr3, $vr0, $vr3 - vilvl.b $vr4, $vr0, $vr4 - vilvl.h $vr8, $vr0, $vr4 - vilvh.h $vr4, $vr0, $vr4 - vld $vr9, $a6, -64 - vld $vr10, $a6, -48 - vld $vr11, $a6, -32 - vld $vr12, $a6, -16 - vld $vr13, $a6, 0 - vld $vr14, $a6, 16 - vld $vr15, $a6, 32 - vld $vr16, $a6, 48 - vadd.w $vr1, $vr10, $vr1 - vadd.w $vr5, $vr9, $vr5 - vadd.w $vr2, $vr12, $vr2 - vadd.w $vr6, $vr11, $vr6 - vadd.w $vr3, $vr14, $vr3 - vadd.w $vr7, $vr13, $vr7 - vadd.w $vr4, $vr16, $vr4 - vadd.w $vr8, $vr15, $vr8 - vst $vr5, $a6, -64 - vst $vr1, $a6, -48 - vst $vr6, $a6, -32 - vst $vr2, $a6, -16 - vst $vr7, $a6, 0 - vst $vr3, $a6, 16 - vst $vr8, $a6, 32 - vst $vr4, $a6, 48 + vinsgr2vr.d $vr0, $t1, 0 + vinsgr2vr.d $vr1, $t2, 0 + vinsgr2vr.d $vr2, $t3, 0 + vinsgr2vr.d $vr3, $t0, 0 + vsllwil.hu.bu $vr4, $vr0, 0 + vsllwil.wu.hu $vr4, $vr4, 0 + vsrli.d $vr0, $vr0, 32 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.hu.bu $vr5, $vr1, 0 + vsllwil.wu.hu $vr5, $vr5, 0 + vsrli.d $vr1, $vr1, 32 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vsllwil.hu.bu $vr6, $vr2, 0 + vsllwil.wu.hu $vr6, $vr6, 0 + vsrli.d $vr2, $vr2, 32 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.hu.bu $vr7, $vr3, 0 + vsllwil.wu.hu $vr7, $vr7, 0 + vsrli.d $vr3, $vr3, 32 + vsllwil.hu.bu $vr3, $vr3, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vld $vr8, $a6, -64 + vld $vr9, $a6, -48 + vld $vr10, $a6, -32 + vld $vr11, $a6, -16 + vld $vr12, $a6, 0 + vld $vr13, $a6, 16 + vld $vr14, $a6, 32 + vld $vr15, $a6, 48 + vadd.w $vr0, $vr9, $vr0 + vadd.w $vr4, $vr8, $vr4 + vadd.w $vr1, $vr11, $vr1 + vadd.w $vr5, $vr10, $vr5 + vadd.w $vr2, $vr13, $vr2 + vadd.w $vr6, $vr12, $vr6 + vadd.w $vr3, $vr15, $vr3 + vadd.w $vr7, $vr14, $vr7 + vst $vr4, $a6, -64 + vst $vr0, $a6, -48 + vst $vr5, $a6, -32 + vst $vr1, $a6, -16 + vst $vr6, $a6, 0 + vst $vr2, $a6, 16 + vst $vr7, $a6, 32 + vst $vr3, $a6, 48 addi.d $a7, $a7, 32 addi.d $a6, $a6, 128 bnez $a7, .LBB45_15 @@ -18534,18 
+18588,26 @@ _Z68benchForTruncOrZextVecWithAddInLoopWithVW16From_uint8_t_To_uint32_t_RN9bench .cfi_personality 155, DW.ref.__gxx_personality_v0 .cfi_lsda 27, .Lexception46 # %bb.0: - addi.d $sp, $sp, -112 - .cfi_def_cfa_offset 112 - st.d $ra, $sp, 104 # 8-byte Folded Spill - st.d $fp, $sp, 96 # 8-byte Folded Spill - st.d $s0, $sp, 88 # 8-byte Folded Spill - st.d $s1, $sp, 80 # 8-byte Folded Spill - st.d $s2, $sp, 72 # 8-byte Folded Spill - st.d $s3, $sp, 64 # 8-byte Folded Spill - st.d $s4, $sp, 56 # 8-byte Folded Spill - st.d $s5, $sp, 48 # 8-byte Folded Spill - st.d $s6, $sp, 40 # 8-byte Folded Spill - st.d $s7, $sp, 32 # 8-byte Folded Spill + addi.d $sp, $sp, -176 + .cfi_def_cfa_offset 176 + st.d $ra, $sp, 168 # 8-byte Folded Spill + st.d $fp, $sp, 160 # 8-byte Folded Spill + st.d $s0, $sp, 152 # 8-byte Folded Spill + st.d $s1, $sp, 144 # 8-byte Folded Spill + st.d $s2, $sp, 136 # 8-byte Folded Spill + st.d $s3, $sp, 128 # 8-byte Folded Spill + st.d $s4, $sp, 120 # 8-byte Folded Spill + st.d $s5, $sp, 112 # 8-byte Folded Spill + st.d $s6, $sp, 104 # 8-byte Folded Spill + st.d $s7, $sp, 96 # 8-byte Folded Spill + fst.d $fs0, $sp, 88 # 8-byte Folded Spill + fst.d $fs1, $sp, 80 # 8-byte Folded Spill + fst.d $fs2, $sp, 72 # 8-byte Folded Spill + fst.d $fs3, $sp, 64 # 8-byte Folded Spill + fst.d $fs4, $sp, 56 # 8-byte Folded Spill + fst.d $fs5, $sp, 48 # 8-byte Folded Spill + fst.d $fs6, $sp, 40 # 8-byte Folded Spill + fst.d $fs7, $sp, 32 # 8-byte Folded Spill .cfi_offset 1, -8 .cfi_offset 22, -16 .cfi_offset 23, -24 @@ -18556,6 +18618,14 @@ _Z68benchForTruncOrZextVecWithAddInLoopWithVW16From_uint8_t_To_uint32_t_RN9bench .cfi_offset 28, -64 .cfi_offset 29, -72 .cfi_offset 30, -80 + .cfi_offset 56, -88 + .cfi_offset 57, -96 + .cfi_offset 58, -104 + .cfi_offset 59, -112 + .cfi_offset 60, -120 + .cfi_offset 61, -128 + .cfi_offset 62, -136 + .cfi_offset 63, -144 move $s0, $a0 lu12i.w $s5, 2 ori $s1, $s5, 1808 @@ -18636,7 +18706,6 @@ _Z68benchForTruncOrZextVecWithAddInLoopWithVW16From_uint8_t_To_uint32_t_RN9bench ori $a4, $s5, 1792 ori $a5, $s5, 1824 ori $a6, $s5, 1840 - vrepli.b $vr0, 0 .p2align 4, , 16 .LBB46_11: # %.lr.ph # =>This Loop Header: Depth=1 @@ -18664,82 +18733,102 @@ _Z68benchForTruncOrZextVecWithAddInLoopWithVW16From_uint8_t_To_uint32_t_RN9bench # Parent Loop BB46_11 Depth=1 # => This Inner Loop Header: Depth=2 add.d $t2, $fp, $t1 - vldx $vr1, $t2, $a4 - vldx $vr2, $t2, $s1 - vldx $vr3, $t2, $a5 - vldx $vr4, $t2, $a6 - vilvh.b $vr5, $vr0, $vr1 - vilvl.h $vr6, $vr0, $vr5 - vilvh.h $vr5, $vr0, $vr5 - vilvl.b $vr1, $vr0, $vr1 - vilvl.h $vr7, $vr0, $vr1 - vilvh.h $vr1, $vr0, $vr1 - vilvh.b $vr8, $vr0, $vr2 - vilvl.h $vr9, $vr0, $vr8 - vilvh.h $vr8, $vr0, $vr8 - vilvl.b $vr2, $vr0, $vr2 - vilvl.h $vr10, $vr0, $vr2 - vilvh.h $vr2, $vr0, $vr2 - vilvh.b $vr11, $vr0, $vr3 - vilvl.h $vr12, $vr0, $vr11 - vilvh.h $vr11, $vr0, $vr11 - vilvl.b $vr3, $vr0, $vr3 - vilvl.h $vr13, $vr0, $vr3 - vilvh.h $vr3, $vr0, $vr3 - vilvh.b $vr14, $vr0, $vr4 - vilvl.b $vr4, $vr0, $vr4 - vilvl.h $vr15, $vr0, $vr4 - vld $vr16, $t0, -112 - vilvh.h $vr4, $vr0, $vr4 - vld $vr17, $t0, -128 + vldx $vr0, $t2, $a4 + vldx $vr1, $t2, $s1 + vldx $vr2, $t2, $a5 + vldx $vr3, $t2, $a6 + vsllwil.hu.bu $vr4, $vr0, 0 + vsllwil.wu.hu $vr4, $vr4, 0 + vbsrl.v $vr5, $vr0, 8 + vsllwil.hu.bu $vr5, $vr5, 0 + vsllwil.wu.hu $vr5, $vr5, 0 + vbsrl.v $vr6, $vr0, 12 + vsllwil.hu.bu $vr6, $vr6, 0 + vsllwil.wu.hu $vr6, $vr6, 0 + vsrli.d $vr0, $vr0, 32 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.hu.bu $vr7, $vr1, 0 + 
vsllwil.wu.hu $vr7, $vr7, 0 + vbsrl.v $vr8, $vr1, 8 + vsllwil.hu.bu $vr8, $vr8, 0 + vsllwil.wu.hu $vr8, $vr8, 0 + vbsrl.v $vr9, $vr1, 12 + vsllwil.hu.bu $vr9, $vr9, 0 + vsllwil.wu.hu $vr9, $vr9, 0 + vsrli.d $vr1, $vr1, 32 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vsllwil.hu.bu $vr10, $vr2, 0 + vsllwil.wu.hu $vr10, $vr10, 0 + vbsrl.v $vr11, $vr2, 8 + vsllwil.hu.bu $vr11, $vr11, 0 + vsllwil.wu.hu $vr11, $vr11, 0 + vbsrl.v $vr12, $vr2, 12 + vsllwil.hu.bu $vr12, $vr12, 0 + vsllwil.wu.hu $vr12, $vr12, 0 + vsrli.d $vr2, $vr2, 32 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.hu.bu $vr13, $vr3, 0 + vsllwil.wu.hu $vr13, $vr13, 0 + vbsrl.v $vr14, $vr3, 8 + vsllwil.hu.bu $vr14, $vr14, 0 + vsllwil.wu.hu $vr14, $vr14, 0 + vbsrl.v $vr15, $vr3, 12 + vsllwil.hu.bu $vr15, $vr15, 0 + vsllwil.wu.hu $vr15, $vr15, 0 + vsrli.d $vr3, $vr3, 32 + vsllwil.hu.bu $vr3, $vr3, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vld $vr16, $t0, -128 + vld $vr17, $t0, -96 vld $vr18, $t0, -80 - vadd.w $vr1, $vr16, $vr1 - vld $vr16, $t0, -96 - vadd.w $vr7, $vr17, $vr7 - vld $vr17, $t0, -48 - vadd.w $vr5, $vr18, $vr5 - vld $vr18, $t0, -64 - vadd.w $vr6, $vr16, $vr6 - vld $vr16, $t0, -16 - vadd.w $vr2, $vr17, $vr2 - vld $vr17, $t0, -32 - vadd.w $vr10, $vr18, $vr10 - vld $vr18, $t0, 16 - vadd.w $vr8, $vr16, $vr8 - vld $vr16, $t0, 0 - vadd.w $vr9, $vr17, $vr9 - vld $vr17, $t0, 48 - vadd.w $vr3, $vr18, $vr3 - vld $vr18, $t0, 32 - vadd.w $vr13, $vr16, $vr13 - vld $vr16, $t0, 80 - vadd.w $vr11, $vr17, $vr11 - vld $vr17, $t0, 64 - vadd.w $vr12, $vr18, $vr12 - vld $vr18, $t0, 112 + vld $vr19, $t0, -112 + vld $vr20, $t0, -64 + vld $vr21, $t0, -32 + vld $vr22, $t0, -16 + vld $vr23, $t0, -48 + vld $vr24, $t0, 0 + vld $vr25, $t0, 32 + vld $vr26, $t0, 48 + vld $vr27, $t0, 16 + vld $vr28, $t0, 64 + vld $vr29, $t0, 96 + vld $vr30, $t0, 112 + vld $vr31, $t0, 80 + vadd.w $vr0, $vr19, $vr0 + vadd.w $vr6, $vr18, $vr6 + vadd.w $vr5, $vr17, $vr5 vadd.w $vr4, $vr16, $vr4 - vld $vr16, $t0, 96 - vadd.w $vr15, $vr17, $vr15 - vilvh.h $vr17, $vr0, $vr14 - vadd.w $vr17, $vr18, $vr17 - vilvl.h $vr14, $vr0, $vr14 - vadd.w $vr14, $vr16, $vr14 - vst $vr6, $t0, -96 - vst $vr5, $t0, -80 - vst $vr7, $t0, -128 - vst $vr1, $t0, -112 - vst $vr9, $t0, -32 - vst $vr8, $t0, -16 - vst $vr10, $t0, -64 - vst $vr2, $t0, -48 - vst $vr12, $t0, 32 - vst $vr11, $t0, 48 - vst $vr13, $t0, 0 - vst $vr3, $t0, 16 + vadd.w $vr1, $vr23, $vr1 + vadd.w $vr9, $vr22, $vr9 + vadd.w $vr8, $vr21, $vr8 + vadd.w $vr7, $vr20, $vr7 + vadd.w $vr2, $vr27, $vr2 + vadd.w $vr12, $vr26, $vr12 + vadd.w $vr11, $vr25, $vr11 + vadd.w $vr10, $vr24, $vr10 + vadd.w $vr3, $vr31, $vr3 + vadd.w $vr15, $vr30, $vr15 + vadd.w $vr14, $vr29, $vr14 + vadd.w $vr13, $vr28, $vr13 + vst $vr4, $t0, -128 + vst $vr5, $t0, -96 + vst $vr6, $t0, -80 + vst $vr0, $t0, -112 + vst $vr7, $t0, -64 + vst $vr8, $t0, -32 + vst $vr9, $t0, -16 + vst $vr1, $t0, -48 + vst $vr10, $t0, 0 + vst $vr11, $t0, 32 + vst $vr12, $t0, 48 + vst $vr2, $t0, 16 + vst $vr13, $t0, 64 vst $vr14, $t0, 96 - vst $vr17, $t0, 112 - vst $vr15, $t0, 64 - vst $vr4, $t0, 80 + vst $vr15, $t0, 112 + vst $vr3, $t0, 80 addi.d $t1, $t1, 64 addi.d $t0, $t0, 256 bnez $t1, .LBB46_15 @@ -18782,17 +18871,25 @@ _Z68benchForTruncOrZextVecWithAddInLoopWithVW16From_uint8_t_To_uint32_t_RN9bench move $a0, $fp pcaddu18i $ra, %call36(_ZdaPv) jirl $ra, $ra, 0 - ld.d $s7, $sp, 32 # 8-byte Folded Reload - ld.d $s6, $sp, 40 # 8-byte Folded Reload - ld.d $s5, $sp, 48 # 8-byte Folded Reload - ld.d $s4, $sp, 56 # 8-byte Folded Reload - ld.d $s3, $sp, 64 # 
8-byte Folded Reload - ld.d $s2, $sp, 72 # 8-byte Folded Reload - ld.d $s1, $sp, 80 # 8-byte Folded Reload - ld.d $s0, $sp, 88 # 8-byte Folded Reload - ld.d $fp, $sp, 96 # 8-byte Folded Reload - ld.d $ra, $sp, 104 # 8-byte Folded Reload - addi.d $sp, $sp, 112 + fld.d $fs7, $sp, 32 # 8-byte Folded Reload + fld.d $fs6, $sp, 40 # 8-byte Folded Reload + fld.d $fs5, $sp, 48 # 8-byte Folded Reload + fld.d $fs4, $sp, 56 # 8-byte Folded Reload + fld.d $fs3, $sp, 64 # 8-byte Folded Reload + fld.d $fs2, $sp, 72 # 8-byte Folded Reload + fld.d $fs1, $sp, 80 # 8-byte Folded Reload + fld.d $fs0, $sp, 88 # 8-byte Folded Reload + ld.d $s7, $sp, 96 # 8-byte Folded Reload + ld.d $s6, $sp, 104 # 8-byte Folded Reload + ld.d $s5, $sp, 112 # 8-byte Folded Reload + ld.d $s4, $sp, 120 # 8-byte Folded Reload + ld.d $s3, $sp, 128 # 8-byte Folded Reload + ld.d $s2, $sp, 136 # 8-byte Folded Reload + ld.d $s1, $sp, 144 # 8-byte Folded Reload + ld.d $s0, $sp, 152 # 8-byte Folded Reload + ld.d $fp, $sp, 160 # 8-byte Folded Reload + ld.d $ra, $sp, 168 # 8-byte Folded Reload + addi.d $sp, $sp, 176 ret .LBB46_24: .Ltmp1006: # EH_LABEL @@ -18975,7 +19072,6 @@ _Z60benchForTruncOrZextVecWithAddInLoopFrom_uint8_t_To_uint32_t_RN9benchmark5Sta addi.d $a1, $sp, 16 ori $a2, $s6, 2288 ori $a3, $s5, 1812 - vrepli.b $vr0, 0 b .LBB47_12 .p2align 4, , 16 .LBB47_11: # %_ZL27truncOrZextVecWithAddInLoopIhjEvPKT_PT0_i.exit @@ -19023,18 +19119,18 @@ _Z60benchForTruncOrZextVecWithAddInLoopFrom_uint8_t_To_uint32_t_RN9benchmark5Sta add.d $a6, $fp, $a5 ldx.w $a7, $a6, $s1 ldx.w $a6, $a6, $a3 - vinsgr2vr.w $vr1, $a7, 0 - vinsgr2vr.w $vr2, $a6, 0 - vilvl.b $vr1, $vr0, $vr1 - vilvl.h $vr1, $vr0, $vr1 - vld $vr3, $a4, -16 - vld $vr4, $a4, 0 - vilvl.b $vr2, $vr0, $vr2 - vilvl.h $vr2, $vr0, $vr2 + vinsgr2vr.w $vr0, $a7, 0 + vinsgr2vr.w $vr1, $a6, 0 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vld $vr2, $a4, -16 + vld $vr3, $a4, 0 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vadd.w $vr0, $vr2, $vr0 vadd.w $vr1, $vr3, $vr1 - vadd.w $vr2, $vr4, $vr2 - vst $vr1, $a4, -16 - vst $vr2, $a4, 0 + vst $vr0, $a4, -16 + vst $vr1, $a4, 0 addi.d $a5, $a5, 8 addi.d $a4, $a4, 32 bnez $a5, .LBB47_17 @@ -19149,21 +19245,21 @@ _Z60benchForTruncOrZextVecInLoopWithVW8From_uint8_t_To_uint64_t_RN9benchmark5Sta .cfi_personality 155, DW.ref.__gxx_personality_v0 .cfi_lsda 27, .Lexception48 # %bb.0: - addi.d $sp, $sp, -144 - .cfi_def_cfa_offset 144 - st.d $ra, $sp, 136 # 8-byte Folded Spill - st.d $fp, $sp, 128 # 8-byte Folded Spill - st.d $s0, $sp, 120 # 8-byte Folded Spill - st.d $s1, $sp, 112 # 8-byte Folded Spill - st.d $s2, $sp, 104 # 8-byte Folded Spill - st.d $s3, $sp, 96 # 8-byte Folded Spill - st.d $s4, $sp, 88 # 8-byte Folded Spill - st.d $s5, $sp, 80 # 8-byte Folded Spill - st.d $s6, $sp, 72 # 8-byte Folded Spill - st.d $s7, $sp, 64 # 8-byte Folded Spill - st.d $s8, $sp, 56 # 8-byte Folded Spill - .cfi_offset 1, -8 - .cfi_offset 22, -16 + addi.d $sp, $sp, -128 + .cfi_def_cfa_offset 128 + st.d $ra, $sp, 120 # 8-byte Folded Spill + st.d $fp, $sp, 112 # 8-byte Folded Spill + st.d $s0, $sp, 104 # 8-byte Folded Spill + st.d $s1, $sp, 96 # 8-byte Folded Spill + st.d $s2, $sp, 88 # 8-byte Folded Spill + st.d $s3, $sp, 80 # 8-byte Folded Spill + st.d $s4, $sp, 72 # 8-byte Folded Spill + st.d $s5, $sp, 64 # 8-byte Folded Spill + st.d $s6, $sp, 56 # 8-byte Folded Spill + st.d $s7, $sp, 48 # 8-byte Folded Spill + st.d $s8, $sp, 40 # 8-byte Folded Spill + .cfi_offset 1, -8 + .cfi_offset 22, -16 .cfi_offset 23, -24 .cfi_offset 24, -32 
.cfi_offset 25, -40 @@ -19189,8 +19285,8 @@ _Z60benchForTruncOrZextVecInLoopWithVW8From_uint8_t_To_uint64_t_RN9benchmark5Sta .Ltmp1033: # EH_LABEL # %bb.1: # %.split move $s3, $a0 - st.d $s0, $sp, 32 # 8-byte Folded Spill - st.d $a0, $sp, 40 + st.d $s0, $sp, 16 # 8-byte Folded Spill + st.d $a0, $sp, 24 .Ltmp1035: # EH_LABEL move $a0, $s2 pcaddu18i $ra, %call36(_Znam) @@ -19204,7 +19300,7 @@ _Z60benchForTruncOrZextVecInLoopWithVW8From_uint8_t_To_uint64_t_RN9benchmark5Sta add.d $s8, $s3, $a0 lu12i.w $a0, 15 ori $a0, $a0, 3840 - st.h $a0, $sp, 54 + st.h $a0, $sp, 38 lu12i.w $a0, -3 ori $s6, $a0, 2288 pcalau12i $a0, %pc_hi20(_ZL3rng) @@ -19212,8 +19308,8 @@ _Z60benchForTruncOrZextVecInLoopWithVW8From_uint8_t_To_uint64_t_RN9benchmark5Sta .p2align 4, , 16 .LBB48_3: # =>This Inner Loop Header: Depth=1 .Ltmp1038: # EH_LABEL - addi.d $a0, $sp, 54 - addi.d $a2, $sp, 54 + addi.d $a0, $sp, 38 + addi.d $a2, $sp, 38 move $a1, $s5 pcaddu18i $ra, %call36(_ZNSt24uniform_int_distributionIhEclISt23mersenne_twister_engineImLm32ELm624ELm397ELm31ELm2567483615ELm11ELm4294967295ELm7ELm2636928640ELm15ELm4022730752ELm18ELm1812433253EEEEhRT_RKNS0_10param_typeE) jirl $ra, $ra, 0 @@ -19227,7 +19323,6 @@ _Z60benchForTruncOrZextVecInLoopWithVW8From_uint8_t_To_uint64_t_RN9benchmark5Sta # %bb.5: # %vector.ph lu12i.w $a0, -3 ori $a0, $a0, 2288 - vrepli.b $vr16, 0 move $a1, $s0 .p2align 4, , 16 .LBB48_6: # %vector.body @@ -19235,9 +19330,9 @@ _Z60benchForTruncOrZextVecInLoopWithVW8From_uint8_t_To_uint64_t_RN9benchmark5Sta add.d $a2, $fp, $a0 ldx.h $a2, $a2, $s4 vinsgr2vr.h $vr0, $a2, 0 - vilvl.b $vr0, $vr16, $vr0 - vilvl.h $vr0, $vr16, $vr0 - vilvl.w $vr0, $vr16, $vr0 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.du.wu $vr0, $vr0, 0 vst $vr0, $a1, 0 addi.d $a0, $a0, 2 addi.d $a1, $a1, 16 @@ -19258,50 +19353,82 @@ _Z60benchForTruncOrZextVecInLoopWithVW8From_uint8_t_To_uint64_t_RN9benchmark5Sta vinsgr2vr.d $vr1, $a4, 0 vinsgr2vr.d $vr2, $a5, 0 vinsgr2vr.d $vr3, $a2, 0 - vilvl.b $vr0, $vr16, $vr0 - vilvl.h $vr4, $vr16, $vr0 - vilvl.w $vr5, $vr16, $vr4 - vilvh.w $vr4, $vr16, $vr4 - vilvh.h $vr0, $vr16, $vr0 - vilvl.w $vr6, $vr16, $vr0 - vilvh.w $vr0, $vr16, $vr0 - vilvl.b $vr1, $vr16, $vr1 - vilvl.h $vr7, $vr16, $vr1 - vilvl.w $vr8, $vr16, $vr7 - vilvh.w $vr7, $vr16, $vr7 - vilvh.h $vr1, $vr16, $vr1 - vilvl.w $vr9, $vr16, $vr1 - vilvh.w $vr1, $vr16, $vr1 - vilvl.b $vr2, $vr16, $vr2 - vilvl.h $vr10, $vr16, $vr2 - vilvl.w $vr11, $vr16, $vr10 - vilvh.w $vr10, $vr16, $vr10 - vilvh.h $vr2, $vr16, $vr2 - vilvl.w $vr12, $vr16, $vr2 - vilvh.w $vr2, $vr16, $vr2 - vilvl.b $vr3, $vr16, $vr3 - vilvl.h $vr13, $vr16, $vr3 - vilvl.w $vr14, $vr16, $vr13 - vilvh.w $vr13, $vr16, $vr13 - vilvh.h $vr3, $vr16, $vr3 - vilvl.w $vr15, $vr16, $vr3 - vilvh.w $vr3, $vr16, $vr3 - vst $vr0, $a0, -80 - vst $vr6, $a0, -96 + vshuf4i.b $vr4, $vr0, 14 + vsllwil.hu.bu $vr4, $vr4, 0 + vsllwil.wu.hu $vr4, $vr4, 0 + vsllwil.du.wu $vr4, $vr4, 0 + vsrli.d $vr5, $vr0, 32 + vsllwil.hu.bu $vr5, $vr5, 0 + vsllwil.wu.hu $vr5, $vr5, 0 + vsllwil.du.wu $vr5, $vr5, 0 + vsrli.d $vr6, $vr0, 48 + vsllwil.hu.bu $vr6, $vr6, 0 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.du.wu $vr6, $vr6, 0 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.du.wu $vr0, $vr0, 0 + vshuf4i.b $vr7, $vr1, 14 + vsllwil.hu.bu $vr7, $vr7, 0 + vsllwil.wu.hu $vr7, $vr7, 0 + vsllwil.du.wu $vr7, $vr7, 0 + vsrli.d $vr8, $vr1, 32 + vsllwil.hu.bu $vr8, $vr8, 0 + vsllwil.wu.hu $vr8, $vr8, 0 + vsllwil.du.wu $vr8, $vr8, 0 + vsrli.d $vr9, $vr1, 48 + vsllwil.hu.bu $vr9, $vr9, 0 + 
vsllwil.wu.hu $vr9, $vr9, 0 + vsllwil.du.wu $vr9, $vr9, 0 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vsllwil.du.wu $vr1, $vr1, 0 + vshuf4i.b $vr10, $vr2, 14 + vsllwil.hu.bu $vr10, $vr10, 0 + vsllwil.wu.hu $vr10, $vr10, 0 + vsllwil.du.wu $vr10, $vr10, 0 + vsrli.d $vr11, $vr2, 32 + vsllwil.hu.bu $vr11, $vr11, 0 + vsllwil.wu.hu $vr11, $vr11, 0 + vsllwil.du.wu $vr11, $vr11, 0 + vsrli.d $vr12, $vr2, 48 + vsllwil.hu.bu $vr12, $vr12, 0 + vsllwil.wu.hu $vr12, $vr12, 0 + vsllwil.du.wu $vr12, $vr12, 0 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.du.wu $vr2, $vr2, 0 + vshuf4i.b $vr13, $vr3, 14 + vsllwil.hu.bu $vr13, $vr13, 0 + vsllwil.wu.hu $vr13, $vr13, 0 + vsllwil.du.wu $vr13, $vr13, 0 + vsrli.d $vr14, $vr3, 32 + vsllwil.hu.bu $vr14, $vr14, 0 + vsllwil.wu.hu $vr14, $vr14, 0 + vsllwil.du.wu $vr14, $vr14, 0 + vsrli.d $vr15, $vr3, 48 + vsllwil.hu.bu $vr15, $vr15, 0 + vsllwil.wu.hu $vr15, $vr15, 0 + vsllwil.du.wu $vr15, $vr15, 0 + vsllwil.hu.bu $vr3, $vr3, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vsllwil.du.wu $vr3, $vr3, 0 + vst $vr0, $a0, -128 + vst $vr6, $a0, -80 + vst $vr5, $a0, -96 vst $vr4, $a0, -112 - vst $vr5, $a0, -128 - vst $vr1, $a0, -16 - vst $vr9, $a0, -32 + vst $vr1, $a0, -64 + vst $vr9, $a0, -16 + vst $vr8, $a0, -32 vst $vr7, $a0, -48 - vst $vr8, $a0, -64 - vst $vr2, $a0, 48 - vst $vr12, $a0, 32 + vst $vr2, $a0, 0 + vst $vr12, $a0, 48 + vst $vr11, $a0, 32 vst $vr10, $a0, 16 - vst $vr11, $a0, 0 - vst $vr3, $a0, 112 - vst $vr15, $a0, 96 + vst $vr3, $a0, 64 + vst $vr15, $a0, 112 + vst $vr14, $a0, 96 vst $vr13, $a0, 80 - vst $vr14, $a0, 64 addi.d $a1, $a1, 32 addi.d $a0, $a0, 256 bnez $a1, .LBB48_8 @@ -19357,8 +19484,7 @@ _Z60benchForTruncOrZextVecInLoopWithVW8From_uint8_t_To_uint64_t_RN9benchmark5Sta addi.d $s6, $s6, 8 bnez $s1, .LBB48_10 # %bb.12: - vst $vr16, $sp, 16 # 16-byte Folded Spill - ld.d $s6, $sp, 32 # 8-byte Folded Reload + ld.d $s6, $sp, 16 # 8-byte Folded Reload ld.w $s3, $s6, 28 ld.d $s1, $s6, 16 .Ltmp1056: # EH_LABEL @@ -19367,13 +19493,12 @@ _Z60benchForTruncOrZextVecInLoopWithVW8From_uint8_t_To_uint64_t_RN9benchmark5Sta jirl $ra, $ra, 0 .Ltmp1057: # EH_LABEL # %bb.13: # %_ZN9benchmark5State3endEv.exit.preheader - vld $vr16, $sp, 16 # 16-byte Folded Reload bnez $s3, .LBB48_25 # %bb.14: # %_ZN9benchmark5State3endEv.exit.preheader beqz $s1, .LBB48_25 # %bb.15: # %.lr.ph.preheader add.d $a0, $fp, $s4 - addi.d $a1, $sp, 40 + addi.d $a1, $sp, 24 lu12i.w $a3, -3 ori $a2, $a3, 2288 ori $a3, $a3, 2304 @@ -19385,7 +19510,7 @@ _Z60benchForTruncOrZextVecInLoopWithVW8From_uint8_t_To_uint64_t_RN9benchmark5Sta #APP #NO_APP #MEMBARRIER - ld.d $a4, $sp, 40 + ld.d $a4, $sp, 24 bgeu $a4, $a0, .LBB48_19 # %bb.17: # %.lr.ph # in Loop: Header=BB48_16 Depth=1 @@ -19412,50 +19537,82 @@ _Z60benchForTruncOrZextVecInLoopWithVW8From_uint8_t_To_uint64_t_RN9benchmark5Sta vinsgr2vr.d $vr1, $t1, 0 vinsgr2vr.d $vr2, $t2, 0 vinsgr2vr.d $vr3, $a7, 0 - vilvl.b $vr0, $vr16, $vr0 - vilvl.h $vr4, $vr16, $vr0 - vilvl.w $vr5, $vr16, $vr4 - vilvh.w $vr4, $vr16, $vr4 - vilvh.h $vr0, $vr16, $vr0 - vilvl.w $vr6, $vr16, $vr0 - vilvh.w $vr0, $vr16, $vr0 - vilvl.b $vr1, $vr16, $vr1 - vilvl.h $vr7, $vr16, $vr1 - vilvl.w $vr8, $vr16, $vr7 - vilvh.w $vr7, $vr16, $vr7 - vilvh.h $vr1, $vr16, $vr1 - vilvl.w $vr9, $vr16, $vr1 - vilvh.w $vr1, $vr16, $vr1 - vilvl.b $vr2, $vr16, $vr2 - vilvl.h $vr10, $vr16, $vr2 - vilvl.w $vr11, $vr16, $vr10 - vilvh.w $vr10, $vr16, $vr10 - vilvh.h $vr2, $vr16, $vr2 - vilvl.w $vr12, $vr16, $vr2 - vilvh.w $vr2, $vr16, $vr2 - vilvl.b $vr3, $vr16, $vr3 - vilvl.h 
$vr13, $vr16, $vr3 - vilvl.w $vr14, $vr16, $vr13 - vilvh.w $vr13, $vr16, $vr13 - vilvh.h $vr3, $vr16, $vr3 - vilvl.w $vr15, $vr16, $vr3 - vilvh.w $vr3, $vr16, $vr3 - vst $vr0, $a5, -80 - vst $vr6, $a5, -96 + vshuf4i.b $vr4, $vr0, 14 + vsllwil.hu.bu $vr4, $vr4, 0 + vsllwil.wu.hu $vr4, $vr4, 0 + vsllwil.du.wu $vr4, $vr4, 0 + vsrli.d $vr5, $vr0, 32 + vsllwil.hu.bu $vr5, $vr5, 0 + vsllwil.wu.hu $vr5, $vr5, 0 + vsllwil.du.wu $vr5, $vr5, 0 + vsrli.d $vr6, $vr0, 48 + vsllwil.hu.bu $vr6, $vr6, 0 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.du.wu $vr6, $vr6, 0 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.du.wu $vr0, $vr0, 0 + vshuf4i.b $vr7, $vr1, 14 + vsllwil.hu.bu $vr7, $vr7, 0 + vsllwil.wu.hu $vr7, $vr7, 0 + vsllwil.du.wu $vr7, $vr7, 0 + vsrli.d $vr8, $vr1, 32 + vsllwil.hu.bu $vr8, $vr8, 0 + vsllwil.wu.hu $vr8, $vr8, 0 + vsllwil.du.wu $vr8, $vr8, 0 + vsrli.d $vr9, $vr1, 48 + vsllwil.hu.bu $vr9, $vr9, 0 + vsllwil.wu.hu $vr9, $vr9, 0 + vsllwil.du.wu $vr9, $vr9, 0 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vsllwil.du.wu $vr1, $vr1, 0 + vshuf4i.b $vr10, $vr2, 14 + vsllwil.hu.bu $vr10, $vr10, 0 + vsllwil.wu.hu $vr10, $vr10, 0 + vsllwil.du.wu $vr10, $vr10, 0 + vsrli.d $vr11, $vr2, 32 + vsllwil.hu.bu $vr11, $vr11, 0 + vsllwil.wu.hu $vr11, $vr11, 0 + vsllwil.du.wu $vr11, $vr11, 0 + vsrli.d $vr12, $vr2, 48 + vsllwil.hu.bu $vr12, $vr12, 0 + vsllwil.wu.hu $vr12, $vr12, 0 + vsllwil.du.wu $vr12, $vr12, 0 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.du.wu $vr2, $vr2, 0 + vshuf4i.b $vr13, $vr3, 14 + vsllwil.hu.bu $vr13, $vr13, 0 + vsllwil.wu.hu $vr13, $vr13, 0 + vsllwil.du.wu $vr13, $vr13, 0 + vsrli.d $vr14, $vr3, 32 + vsllwil.hu.bu $vr14, $vr14, 0 + vsllwil.wu.hu $vr14, $vr14, 0 + vsllwil.du.wu $vr14, $vr14, 0 + vsrli.d $vr15, $vr3, 48 + vsllwil.hu.bu $vr15, $vr15, 0 + vsllwil.wu.hu $vr15, $vr15, 0 + vsllwil.du.wu $vr15, $vr15, 0 + vsllwil.hu.bu $vr3, $vr3, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vsllwil.du.wu $vr3, $vr3, 0 + vst $vr0, $a5, -128 + vst $vr6, $a5, -80 + vst $vr5, $a5, -96 vst $vr4, $a5, -112 - vst $vr5, $a5, -128 - vst $vr1, $a5, -16 - vst $vr9, $a5, -32 + vst $vr1, $a5, -64 + vst $vr9, $a5, -16 + vst $vr8, $a5, -32 vst $vr7, $a5, -48 - vst $vr8, $a5, -64 - vst $vr2, $a5, 48 - vst $vr12, $a5, 32 + vst $vr2, $a5, 0 + vst $vr12, $a5, 48 + vst $vr11, $a5, 32 vst $vr10, $a5, 16 - vst $vr11, $a5, 0 - vst $vr3, $a5, 112 - vst $vr15, $a5, 96 + vst $vr3, $a5, 64 + vst $vr15, $a5, 112 + vst $vr14, $a5, 96 vst $vr13, $a5, 80 - vst $vr14, $a5, 64 addi.d $a6, $a6, 32 addi.d $a5, $a5, 256 bnez $a6, .LBB48_20 @@ -19490,7 +19647,7 @@ _Z60benchForTruncOrZextVecInLoopWithVW8From_uint8_t_To_uint64_t_RN9benchmark5Sta move $a0, $s0 pcaddu18i $ra, %call36(_ZdaPv) jirl $ra, $ra, 0 - ld.d $a0, $sp, 40 + ld.d $a0, $sp, 24 beqz $a0, .LBB48_28 # %bb.27: # %_ZNKSt14default_deleteIA_mEclImEENSt9enable_ifIXsr14is_convertibleIPA_T_PS0_EE5valueEvE4typeEPS4_.exit.i37 pcaddu18i $ra, %call36(_ZdaPv) @@ -19499,18 +19656,18 @@ _Z60benchForTruncOrZextVecInLoopWithVW8From_uint8_t_To_uint64_t_RN9benchmark5Sta move $a0, $fp pcaddu18i $ra, %call36(_ZdaPv) jirl $ra, $ra, 0 - ld.d $s8, $sp, 56 # 8-byte Folded Reload - ld.d $s7, $sp, 64 # 8-byte Folded Reload - ld.d $s6, $sp, 72 # 8-byte Folded Reload - ld.d $s5, $sp, 80 # 8-byte Folded Reload - ld.d $s4, $sp, 88 # 8-byte Folded Reload - ld.d $s3, $sp, 96 # 8-byte Folded Reload - ld.d $s2, $sp, 104 # 8-byte Folded Reload - ld.d $s1, $sp, 112 # 8-byte Folded Reload - ld.d $s0, $sp, 120 # 8-byte Folded Reload - ld.d $fp, $sp, 128 # 
8-byte Folded Reload - ld.d $ra, $sp, 136 # 8-byte Folded Reload - addi.d $sp, $sp, 144 + ld.d $s8, $sp, 40 # 8-byte Folded Reload + ld.d $s7, $sp, 48 # 8-byte Folded Reload + ld.d $s6, $sp, 56 # 8-byte Folded Reload + ld.d $s5, $sp, 64 # 8-byte Folded Reload + ld.d $s4, $sp, 72 # 8-byte Folded Reload + ld.d $s3, $sp, 80 # 8-byte Folded Reload + ld.d $s2, $sp, 88 # 8-byte Folded Reload + ld.d $s1, $sp, 96 # 8-byte Folded Reload + ld.d $s0, $sp, 104 # 8-byte Folded Reload + ld.d $fp, $sp, 112 # 8-byte Folded Reload + ld.d $ra, $sp, 120 # 8-byte Folded Reload + addi.d $sp, $sp, 128 ret .LBB48_29: .Ltmp1041: # EH_LABEL @@ -19602,7 +19759,7 @@ _Z60benchForTruncOrZextVecInLoopWithVW8From_uint8_t_To_uint64_t_RN9benchmark5Sta move $a0, $s0 pcaddu18i $ra, %call36(_ZdaPv) jirl $ra, $ra, 0 - ld.d $s3, $sp, 40 + ld.d $s3, $sp, 24 beqz $s3, .LBB48_44 .LBB48_43: # %_ZNKSt14default_deleteIA_mEclImEENSt9enable_ifIXsr14is_convertibleIPA_T_PS0_EE5valueEvE4typeEPS4_.exit.i30 move $a0, $s3 @@ -19672,19 +19829,27 @@ _Z61benchForTruncOrZextVecInLoopWithVW16From_uint8_t_To_uint64_t_RN9benchmark5St .cfi_personality 155, DW.ref.__gxx_personality_v0 .cfi_lsda 27, .Lexception49 # %bb.0: - addi.d $sp, $sp, -144 - .cfi_def_cfa_offset 144 - st.d $ra, $sp, 136 # 8-byte Folded Spill - st.d $fp, $sp, 128 # 8-byte Folded Spill - st.d $s0, $sp, 120 # 8-byte Folded Spill - st.d $s1, $sp, 112 # 8-byte Folded Spill - st.d $s2, $sp, 104 # 8-byte Folded Spill - st.d $s3, $sp, 96 # 8-byte Folded Spill - st.d $s4, $sp, 88 # 8-byte Folded Spill - st.d $s5, $sp, 80 # 8-byte Folded Spill - st.d $s6, $sp, 72 # 8-byte Folded Spill - st.d $s7, $sp, 64 # 8-byte Folded Spill - st.d $s8, $sp, 56 # 8-byte Folded Spill + addi.d $sp, $sp, -192 + .cfi_def_cfa_offset 192 + st.d $ra, $sp, 184 # 8-byte Folded Spill + st.d $fp, $sp, 176 # 8-byte Folded Spill + st.d $s0, $sp, 168 # 8-byte Folded Spill + st.d $s1, $sp, 160 # 8-byte Folded Spill + st.d $s2, $sp, 152 # 8-byte Folded Spill + st.d $s3, $sp, 144 # 8-byte Folded Spill + st.d $s4, $sp, 136 # 8-byte Folded Spill + st.d $s5, $sp, 128 # 8-byte Folded Spill + st.d $s6, $sp, 120 # 8-byte Folded Spill + st.d $s7, $sp, 112 # 8-byte Folded Spill + st.d $s8, $sp, 104 # 8-byte Folded Spill + fst.d $fs0, $sp, 96 # 8-byte Folded Spill + fst.d $fs1, $sp, 88 # 8-byte Folded Spill + fst.d $fs2, $sp, 80 # 8-byte Folded Spill + fst.d $fs3, $sp, 72 # 8-byte Folded Spill + fst.d $fs4, $sp, 64 # 8-byte Folded Spill + fst.d $fs5, $sp, 56 # 8-byte Folded Spill + fst.d $fs6, $sp, 48 # 8-byte Folded Spill + fst.d $fs7, $sp, 40 # 8-byte Folded Spill .cfi_offset 1, -8 .cfi_offset 22, -16 .cfi_offset 23, -24 @@ -19696,6 +19861,14 @@ _Z61benchForTruncOrZextVecInLoopWithVW16From_uint8_t_To_uint64_t_RN9benchmark5St .cfi_offset 29, -72 .cfi_offset 30, -80 .cfi_offset 31, -88 + .cfi_offset 56, -96 + .cfi_offset 57, -104 + .cfi_offset 58, -112 + .cfi_offset 59, -120 + .cfi_offset 60, -128 + .cfi_offset 61, -136 + .cfi_offset 62, -144 + .cfi_offset 63, -152 move $s0, $a0 lu12i.w $s7, 2 ori $s1, $s7, 1808 @@ -19713,7 +19886,7 @@ _Z61benchForTruncOrZextVecInLoopWithVW16From_uint8_t_To_uint64_t_RN9benchmark5St # %bb.1: # %.split move $s4, $a0 st.d $s0, $sp, 8 # 8-byte Folded Spill - st.d $a0, $sp, 40 + st.d $a0, $sp, 24 .Ltmp1064: # EH_LABEL move $a0, $s3 pcaddu18i $ra, %call36(_Znam) @@ -19728,7 +19901,7 @@ _Z61benchForTruncOrZextVecInLoopWithVW16From_uint8_t_To_uint64_t_RN9benchmark5St add.d $s8, $s4, $a0 lu12i.w $a0, 15 ori $a0, $a0, 3840 - st.h $a0, $sp, 54 + st.h $a0, $sp, 38 lu12i.w $s6, -3 ori $s2, $s6, 2288 pcalau12i 
$a0, %pc_hi20(_ZL3rng) @@ -19736,8 +19909,8 @@ _Z61benchForTruncOrZextVecInLoopWithVW16From_uint8_t_To_uint64_t_RN9benchmark5St .p2align 4, , 16 .LBB49_3: # =>This Inner Loop Header: Depth=1 .Ltmp1067: # EH_LABEL - addi.d $a0, $sp, 54 - addi.d $a2, $sp, 54 + addi.d $a0, $sp, 38 + addi.d $a2, $sp, 38 move $a1, $s5 pcaddu18i $ra, %call36(_ZNSt24uniform_int_distributionIhEclISt23mersenne_twister_engineImLm32ELm624ELm397ELm31ELm2567483615ELm11ELm4294967295ELm7ELm2636928640ELm15ELm4022730752ELm18ELm1812433253EEEEhRT_RKNS0_10param_typeE) jirl $ra, $ra, 0 @@ -19750,7 +19923,6 @@ _Z61benchForTruncOrZextVecInLoopWithVW16From_uint8_t_To_uint64_t_RN9benchmark5St bnez $s2, .LBB49_3 # %bb.5: # %vector.ph ori $a0, $s6, 2288 - vrepli.b $vr4, 0 move $a1, $s0 .p2align 4, , 16 .LBB49_6: # %vector.body @@ -19758,9 +19930,9 @@ _Z61benchForTruncOrZextVecInLoopWithVW16From_uint8_t_To_uint64_t_RN9benchmark5St add.d $a2, $fp, $a0 ldx.h $a2, $a2, $s1 vinsgr2vr.h $vr0, $a2, 0 - vilvl.b $vr0, $vr4, $vr0 - vilvl.h $vr0, $vr4, $vr0 - vilvl.w $vr0, $vr4, $vr0 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.du.wu $vr0, $vr0, 0 vst $vr0, $a1, 0 addi.d $a0, $a0, 2 addi.d $a1, $a1, 16 @@ -19776,98 +19948,166 @@ _Z61benchForTruncOrZextVecInLoopWithVW16From_uint8_t_To_uint64_t_RN9benchmark5St .LBB49_8: # %vector.body84 # =>This Inner Loop Header: Depth=1 add.d $a2, $fp, $a1 - vldx $vr0, $a2, $s5 - vilvh.b $vr1, $vr4, $vr0 - vilvh.h $vr2, $vr4, $vr1 - vilvh.w $vr3, $vr4, $vr2 - vst $vr3, $a0, -144 - vldx $vr3, $a2, $s1 - vilvl.b $vr0, $vr4, $vr0 - vilvl.h $vr1, $vr4, $vr1 - vilvl.w $vr2, $vr4, $vr2 - vst $vr2, $a0, -160 - vilvh.w $vr2, $vr4, $vr1 - vst $vr2, $a0, -176 - vilvl.h $vr2, $vr4, $vr0 - vilvh.h $vr0, $vr4, $vr0 - vilvl.w $vr1, $vr4, $vr1 - vst $vr1, $a0, -192 - vilvh.w $vr1, $vr4, $vr0 - vst $vr1, $a0, -208 - vilvl.w $vr1, $vr4, $vr2 - vilvh.w $vr2, $vr4, $vr2 - vilvl.w $vr0, $vr4, $vr0 - vst $vr0, $a0, -224 - vilvh.b $vr0, $vr4, $vr3 - vst $vr2, $a0, -240 - vilvh.h $vr2, $vr4, $vr0 - vst $vr1, $a0, -256 - vilvh.w $vr1, $vr4, $vr2 - vst $vr1, $a0, -16 - vldx $vr1, $a2, $s6 - vilvl.b $vr3, $vr4, $vr3 - vilvl.h $vr0, $vr4, $vr0 - vilvl.w $vr2, $vr4, $vr2 - vst $vr2, $a0, -32 - vilvh.w $vr2, $vr4, $vr0 - vst $vr2, $a0, -48 - vilvl.h $vr2, $vr4, $vr3 - vilvh.h $vr3, $vr4, $vr3 - vilvl.w $vr0, $vr4, $vr0 - vst $vr0, $a0, -64 - vilvh.w $vr0, $vr4, $vr3 - vst $vr0, $a0, -80 - vilvl.w $vr0, $vr4, $vr2 - vilvh.w $vr2, $vr4, $vr2 - vilvl.w $vr3, $vr4, $vr3 - vst $vr3, $a0, -96 - vilvh.b $vr3, $vr4, $vr1 - vst $vr2, $a0, -112 - vilvh.h $vr2, $vr4, $vr3 - vst $vr0, $a0, -128 - vilvh.w $vr0, $vr4, $vr2 - vst $vr0, $a0, 112 - vldx $vr0, $a2, $s7 - vilvl.b $vr1, $vr4, $vr1 - vilvl.h $vr3, $vr4, $vr3 - vilvl.w $vr2, $vr4, $vr2 - vst $vr2, $a0, 96 - vilvh.w $vr2, $vr4, $vr3 - vst $vr2, $a0, 80 - vilvl.h $vr2, $vr4, $vr1 - vilvh.h $vr1, $vr4, $vr1 - vilvl.w $vr3, $vr4, $vr3 - vst $vr3, $a0, 64 - vilvh.w $vr3, $vr4, $vr1 - vst $vr3, $a0, 48 - vilvl.w $vr3, $vr4, $vr2 - vilvh.w $vr2, $vr4, $vr2 - vilvl.w $vr1, $vr4, $vr1 - vst $vr1, $a0, 32 - vilvh.b $vr1, $vr4, $vr0 - vst $vr2, $a0, 16 - vilvh.h $vr2, $vr4, $vr1 - vst $vr3, $a0, 0 - vilvh.w $vr3, $vr4, $vr2 - vst $vr3, $a0, 240 - vilvl.b $vr0, $vr4, $vr0 - vilvl.h $vr1, $vr4, $vr1 - vilvl.w $vr2, $vr4, $vr2 - vst $vr2, $a0, 224 - vilvh.w $vr2, $vr4, $vr1 - vst $vr2, $a0, 208 - vilvh.h $vr2, $vr4, $vr0 - vilvl.w $vr1, $vr4, $vr1 - vst $vr1, $a0, 192 - vilvh.w $vr1, $vr4, $vr2 - vst $vr1, $a0, 176 - vilvl.h $vr0, $vr4, $vr0 - vilvl.w $vr1, $vr4, $vr2 - vst $vr1, 
$a0, 160 - vilvh.w $vr1, $vr4, $vr0 - vst $vr1, $a0, 144 - vilvl.w $vr0, $vr4, $vr0 - vst $vr0, $a0, 128 + vldx $vr6, $a2, $s5 + vldx $vr15, $a2, $s1 + vldx $vr12, $a2, $s6 + vldx $vr2, $a2, $s7 + vshuf4i.b $vr0, $vr6, 14 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.du.wu $vr0, $vr0, 0 + vsrli.d $vr1, $vr6, 32 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vsllwil.du.wu $vr1, $vr1, 0 + vsrli.d $vr3, $vr6, 48 + vsllwil.hu.bu $vr3, $vr3, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vsllwil.du.wu $vr3, $vr3, 0 + vbsrl.v $vr4, $vr6, 8 + vsllwil.hu.bu $vr4, $vr4, 0 + vsllwil.wu.hu $vr4, $vr4, 0 + vsllwil.du.wu $vr4, $vr4, 0 + vbsrl.v $vr5, $vr6, 10 + vsllwil.hu.bu $vr5, $vr5, 0 + vsllwil.wu.hu $vr5, $vr5, 0 + vsllwil.du.wu $vr5, $vr5, 0 + vbsrl.v $vr7, $vr6, 12 + vsllwil.hu.bu $vr7, $vr7, 0 + vsllwil.wu.hu $vr7, $vr7, 0 + vsllwil.du.wu $vr7, $vr7, 0 + vbsrl.v $vr8, $vr6, 14 + vsllwil.hu.bu $vr8, $vr8, 0 + vsllwil.wu.hu $vr8, $vr8, 0 + vsllwil.du.wu $vr9, $vr8, 0 + vsllwil.hu.bu $vr6, $vr6, 0 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.du.wu $vr11, $vr6, 0 + vshuf4i.b $vr6, $vr15, 14 + vsllwil.hu.bu $vr6, $vr6, 0 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.du.wu $vr6, $vr6, 0 + vsrli.d $vr8, $vr15, 32 + vsllwil.hu.bu $vr8, $vr8, 0 + vsllwil.wu.hu $vr8, $vr8, 0 + vsllwil.du.wu $vr8, $vr8, 0 + vsrli.d $vr10, $vr15, 48 + vsllwil.hu.bu $vr10, $vr10, 0 + vsllwil.wu.hu $vr10, $vr10, 0 + vsllwil.du.wu $vr10, $vr10, 0 + vbsrl.v $vr13, $vr15, 8 + vsllwil.hu.bu $vr13, $vr13, 0 + vsllwil.wu.hu $vr13, $vr13, 0 + vsllwil.du.wu $vr13, $vr13, 0 + vbsrl.v $vr14, $vr15, 10 + vsllwil.hu.bu $vr14, $vr14, 0 + vsllwil.wu.hu $vr14, $vr14, 0 + vsllwil.du.wu $vr14, $vr14, 0 + vbsrl.v $vr16, $vr15, 12 + vsllwil.hu.bu $vr16, $vr16, 0 + vsllwil.wu.hu $vr16, $vr16, 0 + vsllwil.du.wu $vr16, $vr16, 0 + vbsrl.v $vr17, $vr15, 14 + vsllwil.hu.bu $vr17, $vr17, 0 + vsllwil.wu.hu $vr17, $vr17, 0 + vsllwil.du.wu $vr18, $vr17, 0 + vsllwil.hu.bu $vr15, $vr15, 0 + vsllwil.wu.hu $vr15, $vr15, 0 + vsllwil.du.wu $vr20, $vr15, 0 + vshuf4i.b $vr15, $vr12, 14 + vsllwil.hu.bu $vr15, $vr15, 0 + vsllwil.wu.hu $vr15, $vr15, 0 + vsllwil.du.wu $vr15, $vr15, 0 + vsrli.d $vr17, $vr12, 32 + vsllwil.hu.bu $vr17, $vr17, 0 + vsllwil.wu.hu $vr17, $vr17, 0 + vsllwil.du.wu $vr17, $vr17, 0 + vsrli.d $vr19, $vr12, 48 + vsllwil.hu.bu $vr19, $vr19, 0 + vsllwil.wu.hu $vr19, $vr19, 0 + vsllwil.du.wu $vr19, $vr19, 0 + vbsrl.v $vr21, $vr12, 8 + vsllwil.hu.bu $vr21, $vr21, 0 + vsllwil.wu.hu $vr21, $vr21, 0 + vsllwil.du.wu $vr21, $vr21, 0 + vbsrl.v $vr22, $vr12, 10 + vsllwil.hu.bu $vr22, $vr22, 0 + vsllwil.wu.hu $vr22, $vr22, 0 + vsllwil.du.wu $vr22, $vr22, 0 + vbsrl.v $vr23, $vr12, 12 + vsllwil.hu.bu $vr23, $vr23, 0 + vsllwil.wu.hu $vr23, $vr23, 0 + vsllwil.du.wu $vr23, $vr23, 0 + vbsrl.v $vr24, $vr12, 14 + vsllwil.hu.bu $vr24, $vr24, 0 + vsllwil.wu.hu $vr24, $vr24, 0 + vsllwil.du.wu $vr24, $vr24, 0 + vsllwil.hu.bu $vr12, $vr12, 0 + vsllwil.wu.hu $vr12, $vr12, 0 + vsllwil.du.wu $vr12, $vr12, 0 + vshuf4i.b $vr25, $vr2, 14 + vsllwil.hu.bu $vr25, $vr25, 0 + vsllwil.wu.hu $vr25, $vr25, 0 + vsllwil.du.wu $vr25, $vr25, 0 + vsrli.d $vr26, $vr2, 32 + vsllwil.hu.bu $vr26, $vr26, 0 + vsllwil.wu.hu $vr26, $vr26, 0 + vsllwil.du.wu $vr26, $vr26, 0 + vsrli.d $vr27, $vr2, 48 + vsllwil.hu.bu $vr27, $vr27, 0 + vsllwil.wu.hu $vr27, $vr27, 0 + vsllwil.du.wu $vr27, $vr27, 0 + vbsrl.v $vr28, $vr2, 8 + vsllwil.hu.bu $vr28, $vr28, 0 + vsllwil.wu.hu $vr28, $vr28, 0 + vsllwil.du.wu $vr28, $vr28, 0 + vbsrl.v $vr29, $vr2, 10 + vsllwil.hu.bu $vr29, $vr29, 0 + 
vsllwil.wu.hu $vr29, $vr29, 0 + vsllwil.du.wu $vr29, $vr29, 0 + vbsrl.v $vr30, $vr2, 12 + vsllwil.hu.bu $vr30, $vr30, 0 + vsllwil.wu.hu $vr30, $vr30, 0 + vsllwil.du.wu $vr30, $vr30, 0 + vbsrl.v $vr31, $vr2, 14 + vsllwil.hu.bu $vr31, $vr31, 0 + vsllwil.wu.hu $vr31, $vr31, 0 + vsllwil.du.wu $vr31, $vr31, 0 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.du.wu $vr2, $vr2, 0 + vst $vr11, $a0, -256 + vst $vr9, $a0, -144 + vst $vr7, $a0, -160 + vst $vr5, $a0, -176 + vst $vr4, $a0, -192 + vst $vr3, $a0, -208 + vst $vr1, $a0, -224 + vst $vr0, $a0, -240 + vst $vr20, $a0, -128 + vst $vr18, $a0, -16 + vst $vr16, $a0, -32 + vst $vr14, $a0, -48 + vst $vr13, $a0, -64 + vst $vr10, $a0, -80 + vst $vr8, $a0, -96 + vst $vr6, $a0, -112 + vst $vr12, $a0, 0 + vst $vr24, $a0, 112 + vst $vr23, $a0, 96 + vst $vr22, $a0, 80 + vst $vr21, $a0, 64 + vst $vr19, $a0, 48 + vst $vr17, $a0, 32 + vst $vr15, $a0, 16 + vst $vr2, $a0, 128 + vst $vr31, $a0, 240 + vst $vr30, $a0, 224 + vst $vr29, $a0, 208 + vst $vr28, $a0, 192 + vst $vr27, $a0, 176 + vst $vr26, $a0, 160 + vst $vr25, $a0, 144 addi.d $a1, $a1, 64 addi.d $a0, $a0, 512 bnez $a1, .LBB49_8 @@ -19923,7 +20163,6 @@ _Z61benchForTruncOrZextVecInLoopWithVW16From_uint8_t_To_uint64_t_RN9benchmark5St addi.d $s2, $s2, 8 bnez $s8, .LBB49_10 # %bb.12: - vst $vr4, $sp, 16 # 16-byte Folded Spill ld.d $s8, $sp, 8 # 8-byte Folded Reload ld.w $s4, $s8, 28 ld.d $s2, $s8, 16 @@ -19933,13 +20172,12 @@ _Z61benchForTruncOrZextVecInLoopWithVW16From_uint8_t_To_uint64_t_RN9benchmark5St jirl $ra, $ra, 0 .Ltmp1086: # EH_LABEL # %bb.13: # %_ZN9benchmark5State3endEv.exit.preheader - vld $vr4, $sp, 16 # 16-byte Folded Reload bnez $s4, .LBB49_25 # %bb.14: # %_ZN9benchmark5State3endEv.exit.preheader beqz $s2, .LBB49_25 # %bb.15: # %.lr.ph.preheader add.d $a0, $fp, $s1 - addi.d $a1, $sp, 40 + addi.d $a1, $sp, 24 lu12i.w $a3, -3 ori $a2, $a3, 2288 ori $a3, $a3, 2304 @@ -19951,7 +20189,7 @@ _Z61benchForTruncOrZextVecInLoopWithVW16From_uint8_t_To_uint64_t_RN9benchmark5St #APP #NO_APP #MEMBARRIER - ld.d $a4, $sp, 40 + ld.d $a4, $sp, 24 bgeu $a4, $a0, .LBB49_19 # %bb.17: # %.lr.ph # in Loop: Header=BB49_16 Depth=1 @@ -19970,98 +20208,166 @@ _Z61benchForTruncOrZextVecInLoopWithVW16From_uint8_t_To_uint64_t_RN9benchmark5St # Parent Loop BB49_16 Depth=1 # => This Inner Loop Header: Depth=2 add.d $a7, $fp, $a6 - vldx $vr0, $a7, $s5 - vilvh.b $vr1, $vr4, $vr0 - vilvh.h $vr2, $vr4, $vr1 - vilvh.w $vr3, $vr4, $vr2 - vst $vr3, $a5, -144 - vldx $vr3, $a7, $s1 - vilvl.b $vr0, $vr4, $vr0 - vilvl.h $vr1, $vr4, $vr1 - vilvl.w $vr2, $vr4, $vr2 - vst $vr2, $a5, -160 - vilvh.w $vr2, $vr4, $vr1 - vst $vr2, $a5, -176 - vilvl.h $vr2, $vr4, $vr0 - vilvh.h $vr0, $vr4, $vr0 - vilvl.w $vr1, $vr4, $vr1 - vst $vr1, $a5, -192 - vilvh.w $vr1, $vr4, $vr0 - vst $vr1, $a5, -208 - vilvl.w $vr1, $vr4, $vr2 - vilvh.w $vr2, $vr4, $vr2 - vilvl.w $vr0, $vr4, $vr0 - vst $vr0, $a5, -224 - vilvh.b $vr0, $vr4, $vr3 - vst $vr2, $a5, -240 - vilvh.h $vr2, $vr4, $vr0 - vst $vr1, $a5, -256 - vilvh.w $vr1, $vr4, $vr2 - vst $vr1, $a5, -16 - vldx $vr1, $a7, $s6 - vilvl.b $vr3, $vr4, $vr3 - vilvl.h $vr0, $vr4, $vr0 - vilvl.w $vr2, $vr4, $vr2 - vst $vr2, $a5, -32 - vilvh.w $vr2, $vr4, $vr0 - vst $vr2, $a5, -48 - vilvl.h $vr2, $vr4, $vr3 - vilvh.h $vr3, $vr4, $vr3 - vilvl.w $vr0, $vr4, $vr0 - vst $vr0, $a5, -64 - vilvh.w $vr0, $vr4, $vr3 - vst $vr0, $a5, -80 - vilvl.w $vr0, $vr4, $vr2 - vilvh.w $vr2, $vr4, $vr2 - vilvl.w $vr3, $vr4, $vr3 - vst $vr3, $a5, -96 - vilvh.b $vr3, $vr4, $vr1 - vst $vr2, $a5, -112 - vilvh.h $vr2, $vr4, $vr3 
- vst $vr0, $a5, -128 - vilvh.w $vr0, $vr4, $vr2 - vst $vr0, $a5, 112 - vldx $vr0, $a7, $s7 - vilvl.b $vr1, $vr4, $vr1 - vilvl.h $vr3, $vr4, $vr3 - vilvl.w $vr2, $vr4, $vr2 - vst $vr2, $a5, 96 - vilvh.w $vr2, $vr4, $vr3 - vst $vr2, $a5, 80 - vilvl.h $vr2, $vr4, $vr1 - vilvh.h $vr1, $vr4, $vr1 - vilvl.w $vr3, $vr4, $vr3 - vst $vr3, $a5, 64 - vilvh.w $vr3, $vr4, $vr1 - vst $vr3, $a5, 48 - vilvl.w $vr3, $vr4, $vr2 - vilvh.w $vr2, $vr4, $vr2 - vilvl.w $vr1, $vr4, $vr1 - vst $vr1, $a5, 32 - vilvh.b $vr1, $vr4, $vr0 - vst $vr2, $a5, 16 - vilvh.h $vr2, $vr4, $vr1 - vst $vr3, $a5, 0 - vilvh.w $vr3, $vr4, $vr2 - vst $vr3, $a5, 240 - vilvl.b $vr0, $vr4, $vr0 - vilvl.h $vr1, $vr4, $vr1 - vilvl.w $vr2, $vr4, $vr2 - vst $vr2, $a5, 224 - vilvh.w $vr2, $vr4, $vr1 - vst $vr2, $a5, 208 - vilvh.h $vr2, $vr4, $vr0 - vilvl.w $vr1, $vr4, $vr1 - vst $vr1, $a5, 192 - vilvh.w $vr1, $vr4, $vr2 - vst $vr1, $a5, 176 - vilvl.h $vr0, $vr4, $vr0 - vilvl.w $vr1, $vr4, $vr2 - vst $vr1, $a5, 160 - vilvh.w $vr1, $vr4, $vr0 - vst $vr1, $a5, 144 - vilvl.w $vr0, $vr4, $vr0 - vst $vr0, $a5, 128 + vldx $vr6, $a7, $s5 + vldx $vr15, $a7, $s1 + vldx $vr12, $a7, $s6 + vldx $vr2, $a7, $s7 + vshuf4i.b $vr0, $vr6, 14 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.du.wu $vr0, $vr0, 0 + vsrli.d $vr1, $vr6, 32 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vsllwil.du.wu $vr1, $vr1, 0 + vsrli.d $vr3, $vr6, 48 + vsllwil.hu.bu $vr3, $vr3, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vsllwil.du.wu $vr3, $vr3, 0 + vbsrl.v $vr4, $vr6, 8 + vsllwil.hu.bu $vr4, $vr4, 0 + vsllwil.wu.hu $vr4, $vr4, 0 + vsllwil.du.wu $vr4, $vr4, 0 + vbsrl.v $vr5, $vr6, 10 + vsllwil.hu.bu $vr5, $vr5, 0 + vsllwil.wu.hu $vr5, $vr5, 0 + vsllwil.du.wu $vr5, $vr5, 0 + vbsrl.v $vr7, $vr6, 12 + vsllwil.hu.bu $vr7, $vr7, 0 + vsllwil.wu.hu $vr7, $vr7, 0 + vsllwil.du.wu $vr7, $vr7, 0 + vbsrl.v $vr8, $vr6, 14 + vsllwil.hu.bu $vr8, $vr8, 0 + vsllwil.wu.hu $vr8, $vr8, 0 + vsllwil.du.wu $vr9, $vr8, 0 + vsllwil.hu.bu $vr6, $vr6, 0 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.du.wu $vr11, $vr6, 0 + vshuf4i.b $vr6, $vr15, 14 + vsllwil.hu.bu $vr6, $vr6, 0 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.du.wu $vr6, $vr6, 0 + vsrli.d $vr8, $vr15, 32 + vsllwil.hu.bu $vr8, $vr8, 0 + vsllwil.wu.hu $vr8, $vr8, 0 + vsllwil.du.wu $vr8, $vr8, 0 + vsrli.d $vr10, $vr15, 48 + vsllwil.hu.bu $vr10, $vr10, 0 + vsllwil.wu.hu $vr10, $vr10, 0 + vsllwil.du.wu $vr10, $vr10, 0 + vbsrl.v $vr13, $vr15, 8 + vsllwil.hu.bu $vr13, $vr13, 0 + vsllwil.wu.hu $vr13, $vr13, 0 + vsllwil.du.wu $vr13, $vr13, 0 + vbsrl.v $vr14, $vr15, 10 + vsllwil.hu.bu $vr14, $vr14, 0 + vsllwil.wu.hu $vr14, $vr14, 0 + vsllwil.du.wu $vr14, $vr14, 0 + vbsrl.v $vr16, $vr15, 12 + vsllwil.hu.bu $vr16, $vr16, 0 + vsllwil.wu.hu $vr16, $vr16, 0 + vsllwil.du.wu $vr16, $vr16, 0 + vbsrl.v $vr17, $vr15, 14 + vsllwil.hu.bu $vr17, $vr17, 0 + vsllwil.wu.hu $vr17, $vr17, 0 + vsllwil.du.wu $vr18, $vr17, 0 + vsllwil.hu.bu $vr15, $vr15, 0 + vsllwil.wu.hu $vr15, $vr15, 0 + vsllwil.du.wu $vr20, $vr15, 0 + vshuf4i.b $vr15, $vr12, 14 + vsllwil.hu.bu $vr15, $vr15, 0 + vsllwil.wu.hu $vr15, $vr15, 0 + vsllwil.du.wu $vr15, $vr15, 0 + vsrli.d $vr17, $vr12, 32 + vsllwil.hu.bu $vr17, $vr17, 0 + vsllwil.wu.hu $vr17, $vr17, 0 + vsllwil.du.wu $vr17, $vr17, 0 + vsrli.d $vr19, $vr12, 48 + vsllwil.hu.bu $vr19, $vr19, 0 + vsllwil.wu.hu $vr19, $vr19, 0 + vsllwil.du.wu $vr19, $vr19, 0 + vbsrl.v $vr21, $vr12, 8 + vsllwil.hu.bu $vr21, $vr21, 0 + vsllwil.wu.hu $vr21, $vr21, 0 + vsllwil.du.wu $vr21, $vr21, 0 + vbsrl.v $vr22, $vr12, 10 + vsllwil.hu.bu $vr22, $vr22, 
0 + vsllwil.wu.hu $vr22, $vr22, 0 + vsllwil.du.wu $vr22, $vr22, 0 + vbsrl.v $vr23, $vr12, 12 + vsllwil.hu.bu $vr23, $vr23, 0 + vsllwil.wu.hu $vr23, $vr23, 0 + vsllwil.du.wu $vr23, $vr23, 0 + vbsrl.v $vr24, $vr12, 14 + vsllwil.hu.bu $vr24, $vr24, 0 + vsllwil.wu.hu $vr24, $vr24, 0 + vsllwil.du.wu $vr24, $vr24, 0 + vsllwil.hu.bu $vr12, $vr12, 0 + vsllwil.wu.hu $vr12, $vr12, 0 + vsllwil.du.wu $vr12, $vr12, 0 + vshuf4i.b $vr25, $vr2, 14 + vsllwil.hu.bu $vr25, $vr25, 0 + vsllwil.wu.hu $vr25, $vr25, 0 + vsllwil.du.wu $vr25, $vr25, 0 + vsrli.d $vr26, $vr2, 32 + vsllwil.hu.bu $vr26, $vr26, 0 + vsllwil.wu.hu $vr26, $vr26, 0 + vsllwil.du.wu $vr26, $vr26, 0 + vsrli.d $vr27, $vr2, 48 + vsllwil.hu.bu $vr27, $vr27, 0 + vsllwil.wu.hu $vr27, $vr27, 0 + vsllwil.du.wu $vr27, $vr27, 0 + vbsrl.v $vr28, $vr2, 8 + vsllwil.hu.bu $vr28, $vr28, 0 + vsllwil.wu.hu $vr28, $vr28, 0 + vsllwil.du.wu $vr28, $vr28, 0 + vbsrl.v $vr29, $vr2, 10 + vsllwil.hu.bu $vr29, $vr29, 0 + vsllwil.wu.hu $vr29, $vr29, 0 + vsllwil.du.wu $vr29, $vr29, 0 + vbsrl.v $vr30, $vr2, 12 + vsllwil.hu.bu $vr30, $vr30, 0 + vsllwil.wu.hu $vr30, $vr30, 0 + vsllwil.du.wu $vr30, $vr30, 0 + vbsrl.v $vr31, $vr2, 14 + vsllwil.hu.bu $vr31, $vr31, 0 + vsllwil.wu.hu $vr31, $vr31, 0 + vsllwil.du.wu $vr31, $vr31, 0 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.du.wu $vr2, $vr2, 0 + vst $vr11, $a5, -256 + vst $vr9, $a5, -144 + vst $vr7, $a5, -160 + vst $vr5, $a5, -176 + vst $vr4, $a5, -192 + vst $vr3, $a5, -208 + vst $vr1, $a5, -224 + vst $vr0, $a5, -240 + vst $vr20, $a5, -128 + vst $vr18, $a5, -16 + vst $vr16, $a5, -32 + vst $vr14, $a5, -48 + vst $vr13, $a5, -64 + vst $vr10, $a5, -80 + vst $vr8, $a5, -96 + vst $vr6, $a5, -112 + vst $vr12, $a5, 0 + vst $vr24, $a5, 112 + vst $vr23, $a5, 96 + vst $vr22, $a5, 80 + vst $vr21, $a5, 64 + vst $vr19, $a5, 48 + vst $vr17, $a5, 32 + vst $vr15, $a5, 16 + vst $vr2, $a5, 128 + vst $vr31, $a5, 240 + vst $vr30, $a5, 224 + vst $vr29, $a5, 208 + vst $vr28, $a5, 192 + vst $vr27, $a5, 176 + vst $vr26, $a5, 160 + vst $vr25, $a5, 144 addi.d $a6, $a6, 64 addi.d $a5, $a5, 512 bnez $a6, .LBB49_20 @@ -20096,7 +20402,7 @@ _Z61benchForTruncOrZextVecInLoopWithVW16From_uint8_t_To_uint64_t_RN9benchmark5St move $a0, $s0 pcaddu18i $ra, %call36(_ZdaPv) jirl $ra, $ra, 0 - ld.d $a0, $sp, 40 + ld.d $a0, $sp, 24 beqz $a0, .LBB49_28 # %bb.27: # %_ZNKSt14default_deleteIA_mEclImEENSt9enable_ifIXsr14is_convertibleIPA_T_PS0_EE5valueEvE4typeEPS4_.exit.i37 pcaddu18i $ra, %call36(_ZdaPv) @@ -20105,18 +20411,26 @@ _Z61benchForTruncOrZextVecInLoopWithVW16From_uint8_t_To_uint64_t_RN9benchmark5St move $a0, $fp pcaddu18i $ra, %call36(_ZdaPv) jirl $ra, $ra, 0 - ld.d $s8, $sp, 56 # 8-byte Folded Reload - ld.d $s7, $sp, 64 # 8-byte Folded Reload - ld.d $s6, $sp, 72 # 8-byte Folded Reload - ld.d $s5, $sp, 80 # 8-byte Folded Reload - ld.d $s4, $sp, 88 # 8-byte Folded Reload - ld.d $s3, $sp, 96 # 8-byte Folded Reload - ld.d $s2, $sp, 104 # 8-byte Folded Reload - ld.d $s1, $sp, 112 # 8-byte Folded Reload - ld.d $s0, $sp, 120 # 8-byte Folded Reload - ld.d $fp, $sp, 128 # 8-byte Folded Reload - ld.d $ra, $sp, 136 # 8-byte Folded Reload - addi.d $sp, $sp, 144 + fld.d $fs7, $sp, 40 # 8-byte Folded Reload + fld.d $fs6, $sp, 48 # 8-byte Folded Reload + fld.d $fs5, $sp, 56 # 8-byte Folded Reload + fld.d $fs4, $sp, 64 # 8-byte Folded Reload + fld.d $fs3, $sp, 72 # 8-byte Folded Reload + fld.d $fs2, $sp, 80 # 8-byte Folded Reload + fld.d $fs1, $sp, 88 # 8-byte Folded Reload + fld.d $fs0, $sp, 96 # 8-byte Folded Reload + ld.d $s8, $sp, 104 # 8-byte Folded 
Reload + ld.d $s7, $sp, 112 # 8-byte Folded Reload + ld.d $s6, $sp, 120 # 8-byte Folded Reload + ld.d $s5, $sp, 128 # 8-byte Folded Reload + ld.d $s4, $sp, 136 # 8-byte Folded Reload + ld.d $s3, $sp, 144 # 8-byte Folded Reload + ld.d $s2, $sp, 152 # 8-byte Folded Reload + ld.d $s1, $sp, 160 # 8-byte Folded Reload + ld.d $s0, $sp, 168 # 8-byte Folded Reload + ld.d $fp, $sp, 176 # 8-byte Folded Reload + ld.d $ra, $sp, 184 # 8-byte Folded Reload + addi.d $sp, $sp, 192 ret .LBB49_29: .Ltmp1070: # EH_LABEL @@ -20208,7 +20522,7 @@ _Z61benchForTruncOrZextVecInLoopWithVW16From_uint8_t_To_uint64_t_RN9benchmark5St move $a0, $s0 pcaddu18i $ra, %call36(_ZdaPv) jirl $ra, $ra, 0 - ld.d $s4, $sp, 40 + ld.d $s4, $sp, 24 beqz $s4, .LBB49_44 .LBB49_43: # %_ZNKSt14default_deleteIA_mEclImEENSt9enable_ifIXsr14is_convertibleIPA_T_PS0_EE5valueEvE4typeEPS4_.exit.i30 move $a0, $s4 @@ -20278,19 +20592,19 @@ _Z53benchForTruncOrZextVecInLoopFrom_uint8_t_To_uint64_t_RN9benchmark5StateE: # .cfi_personality 155, DW.ref.__gxx_personality_v0 .cfi_lsda 27, .Lexception50 # %bb.0: - addi.d $sp, $sp, -144 - .cfi_def_cfa_offset 144 - st.d $ra, $sp, 136 # 8-byte Folded Spill - st.d $fp, $sp, 128 # 8-byte Folded Spill - st.d $s0, $sp, 120 # 8-byte Folded Spill - st.d $s1, $sp, 112 # 8-byte Folded Spill - st.d $s2, $sp, 104 # 8-byte Folded Spill - st.d $s3, $sp, 96 # 8-byte Folded Spill - st.d $s4, $sp, 88 # 8-byte Folded Spill - st.d $s5, $sp, 80 # 8-byte Folded Spill - st.d $s6, $sp, 72 # 8-byte Folded Spill - st.d $s7, $sp, 64 # 8-byte Folded Spill - st.d $s8, $sp, 56 # 8-byte Folded Spill + addi.d $sp, $sp, -112 + .cfi_def_cfa_offset 112 + st.d $ra, $sp, 104 # 8-byte Folded Spill + st.d $fp, $sp, 96 # 8-byte Folded Spill + st.d $s0, $sp, 88 # 8-byte Folded Spill + st.d $s1, $sp, 80 # 8-byte Folded Spill + st.d $s2, $sp, 72 # 8-byte Folded Spill + st.d $s3, $sp, 64 # 8-byte Folded Spill + st.d $s4, $sp, 56 # 8-byte Folded Spill + st.d $s5, $sp, 48 # 8-byte Folded Spill + st.d $s6, $sp, 40 # 8-byte Folded Spill + st.d $s7, $sp, 32 # 8-byte Folded Spill + st.d $s8, $sp, 24 # 8-byte Folded Spill .cfi_offset 1, -8 .cfi_offset 22, -16 .cfi_offset 23, -24 @@ -20318,7 +20632,7 @@ _Z53benchForTruncOrZextVecInLoopFrom_uint8_t_To_uint64_t_RN9benchmark5StateE: # .Ltmp1091: # EH_LABEL # %bb.1: move $s4, $a0 - st.d $a0, $sp, 40 + st.d $a0, $sp, 8 .Ltmp1093: # EH_LABEL move $a0, $s3 pcaddu18i $ra, %call36(_Znam) @@ -20328,7 +20642,7 @@ _Z53benchForTruncOrZextVecInLoopFrom_uint8_t_To_uint64_t_RN9benchmark5StateE: # move $s0, $a0 lu12i.w $a0, 15 ori $a0, $a0, 3840 - st.h $a0, $sp, 54 + st.h $a0, $sp, 22 lu12i.w $s6, -3 ori $s8, $s6, 2288 pcalau12i $a0, %pc_hi20(_ZL3rng) @@ -20336,8 +20650,8 @@ _Z53benchForTruncOrZextVecInLoopFrom_uint8_t_To_uint64_t_RN9benchmark5StateE: # .p2align 4, , 16 .LBB50_3: # =>This Inner Loop Header: Depth=1 .Ltmp1096: # EH_LABEL - addi.d $a0, $sp, 54 - addi.d $a2, $sp, 54 + addi.d $a0, $sp, 22 + addi.d $a2, $sp, 22 move $a1, $s5 pcaddu18i $ra, %call36(_ZNSt24uniform_int_distributionIhEclISt23mersenne_twister_engineImLm32ELm624ELm397ELm31ELm2567483615ELm11ELm4294967295ELm7ELm2636928640ELm15ELm4022730752ELm18ELm1812433253EEEEhRT_RKNS0_10param_typeE) jirl $ra, $ra, 0 @@ -20350,7 +20664,6 @@ _Z53benchForTruncOrZextVecInLoopFrom_uint8_t_To_uint64_t_RN9benchmark5StateE: # bnez $s8, .LBB50_3 # %bb.5: # %vector.ph ori $a0, $s6, 2288 - vrepli.b $vr2, 0 move $a1, $s0 .p2align 4, , 16 .LBB50_6: # %vector.body @@ -20358,9 +20671,9 @@ _Z53benchForTruncOrZextVecInLoopFrom_uint8_t_To_uint64_t_RN9benchmark5StateE: # add.d $a2, $fp, 
$a0 ldx.h $a2, $a2, $s1 vinsgr2vr.h $vr0, $a2, 0 - vilvl.b $vr0, $vr2, $vr0 - vilvl.h $vr0, $vr2, $vr0 - vilvl.w $vr0, $vr2, $vr0 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.du.wu $vr0, $vr0, 0 vst $vr0, $a1, 0 addi.d $a0, $a0, 2 addi.d $a1, $a1, 16 @@ -20377,12 +20690,12 @@ _Z53benchForTruncOrZextVecInLoopFrom_uint8_t_To_uint64_t_RN9benchmark5StateE: # ldx.h $a2, $a2, $s5 vinsgr2vr.h $vr0, $a3, 0 vinsgr2vr.h $vr1, $a2, 0 - vilvl.b $vr0, $vr2, $vr0 - vilvl.h $vr0, $vr2, $vr0 - vilvl.w $vr0, $vr2, $vr0 - vilvl.b $vr1, $vr2, $vr1 - vilvl.h $vr1, $vr2, $vr1 - vilvl.w $vr1, $vr2, $vr1 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.du.wu $vr0, $vr0, 0 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vsllwil.du.wu $vr1, $vr1, 0 vst $vr0, $a0, -16 vst $vr1, $a0, 0 addi.d $a1, $a1, 4 @@ -20403,7 +20716,6 @@ _Z53benchForTruncOrZextVecInLoopFrom_uint8_t_To_uint64_t_RN9benchmark5StateE: # addi.d $s7, $s7, 8 bnez $s8, .LBB50_10 # %bb.12: - vst $vr2, $sp, 16 # 16-byte Folded Spill ld.w $s7, $s2, 28 ld.d $s4, $s2, 16 .Ltmp1114: # EH_LABEL @@ -20412,13 +20724,12 @@ _Z53benchForTruncOrZextVecInLoopFrom_uint8_t_To_uint64_t_RN9benchmark5StateE: # jirl $ra, $ra, 0 .Ltmp1115: # EH_LABEL # %bb.13: # %_ZN9benchmark5State3endEv.exit.preheader - vld $vr2, $sp, 16 # 16-byte Folded Reload bnez $s7, .LBB50_23 # %bb.14: # %_ZN9benchmark5State3endEv.exit.preheader beqz $s4, .LBB50_23 # %bb.15: # %.lr.ph.preheader add.d $a0, $fp, $s1 - addi.d $a1, $sp, 40 + addi.d $a1, $sp, 8 ori $a2, $s6, 2288 b .LBB50_17 .p2align 4, , 16 @@ -20433,7 +20744,7 @@ _Z53benchForTruncOrZextVecInLoopFrom_uint8_t_To_uint64_t_RN9benchmark5StateE: # #APP #NO_APP #MEMBARRIER - ld.d $a3, $sp, 40 + ld.d $a3, $sp, 8 bgeu $a3, $a0, .LBB50_21 # %bb.18: # %.lr.ph # in Loop: Header=BB50_17 Depth=1 @@ -20467,12 +20778,12 @@ _Z53benchForTruncOrZextVecInLoopFrom_uint8_t_To_uint64_t_RN9benchmark5StateE: # ldx.h $a5, $a5, $s5 vinsgr2vr.h $vr0, $a6, 0 vinsgr2vr.h $vr1, $a5, 0 - vilvl.b $vr0, $vr2, $vr0 - vilvl.h $vr0, $vr2, $vr0 - vilvl.w $vr0, $vr2, $vr0 - vilvl.b $vr1, $vr2, $vr1 - vilvl.h $vr1, $vr2, $vr1 - vilvl.w $vr1, $vr2, $vr1 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.du.wu $vr0, $vr0, 0 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vsllwil.du.wu $vr1, $vr1, 0 vst $vr0, $a3, -16 vst $vr1, $a3, 0 addi.d $a4, $a4, 4 @@ -20489,7 +20800,7 @@ _Z53benchForTruncOrZextVecInLoopFrom_uint8_t_To_uint64_t_RN9benchmark5StateE: # move $a0, $s0 pcaddu18i $ra, %call36(_ZdaPv) jirl $ra, $ra, 0 - ld.d $a0, $sp, 40 + ld.d $a0, $sp, 8 beqz $a0, .LBB50_26 # %bb.25: # %_ZNKSt14default_deleteIA_mEclImEENSt9enable_ifIXsr14is_convertibleIPA_T_PS0_EE5valueEvE4typeEPS4_.exit.i37 pcaddu18i $ra, %call36(_ZdaPv) @@ -20498,18 +20809,18 @@ _Z53benchForTruncOrZextVecInLoopFrom_uint8_t_To_uint64_t_RN9benchmark5StateE: # move $a0, $fp pcaddu18i $ra, %call36(_ZdaPv) jirl $ra, $ra, 0 - ld.d $s8, $sp, 56 # 8-byte Folded Reload - ld.d $s7, $sp, 64 # 8-byte Folded Reload - ld.d $s6, $sp, 72 # 8-byte Folded Reload - ld.d $s5, $sp, 80 # 8-byte Folded Reload - ld.d $s4, $sp, 88 # 8-byte Folded Reload - ld.d $s3, $sp, 96 # 8-byte Folded Reload - ld.d $s2, $sp, 104 # 8-byte Folded Reload - ld.d $s1, $sp, 112 # 8-byte Folded Reload - ld.d $s0, $sp, 120 # 8-byte Folded Reload - ld.d $fp, $sp, 128 # 8-byte Folded Reload - ld.d $ra, $sp, 136 # 8-byte Folded Reload - addi.d $sp, $sp, 144 + ld.d $s8, $sp, 24 # 8-byte Folded Reload + ld.d $s7, $sp, 32 # 8-byte Folded Reload + ld.d $s6, $sp, 40 # 
8-byte Folded Reload + ld.d $s5, $sp, 48 # 8-byte Folded Reload + ld.d $s4, $sp, 56 # 8-byte Folded Reload + ld.d $s3, $sp, 64 # 8-byte Folded Reload + ld.d $s2, $sp, 72 # 8-byte Folded Reload + ld.d $s1, $sp, 80 # 8-byte Folded Reload + ld.d $s0, $sp, 88 # 8-byte Folded Reload + ld.d $fp, $sp, 96 # 8-byte Folded Reload + ld.d $ra, $sp, 104 # 8-byte Folded Reload + addi.d $sp, $sp, 112 ret .LBB50_27: .Ltmp1099: # EH_LABEL @@ -20601,7 +20912,7 @@ _Z53benchForTruncOrZextVecInLoopFrom_uint8_t_To_uint64_t_RN9benchmark5StateE: # move $a0, $s0 pcaddu18i $ra, %call36(_ZdaPv) jirl $ra, $ra, 0 - ld.d $s4, $sp, 40 + ld.d $s4, $sp, 8 beqz $s4, .LBB50_42 .LBB50_41: # %_ZNKSt14default_deleteIA_mEclImEENSt9enable_ifIXsr14is_convertibleIPA_T_PS0_EE5valueEvE4typeEPS4_.exit.i30 move $a0, $s4 @@ -20676,18 +20987,26 @@ _Z67benchForTruncOrZextVecWithAddInLoopWithVW8From_uint8_t_To_uint64_t_RN9benchm .cfi_personality 155, DW.ref.__gxx_personality_v0 .cfi_lsda 27, .Lexception51 # %bb.0: - addi.d $sp, $sp, -112 - .cfi_def_cfa_offset 112 - st.d $ra, $sp, 104 # 8-byte Folded Spill - st.d $fp, $sp, 96 # 8-byte Folded Spill - st.d $s0, $sp, 88 # 8-byte Folded Spill - st.d $s1, $sp, 80 # 8-byte Folded Spill - st.d $s2, $sp, 72 # 8-byte Folded Spill - st.d $s3, $sp, 64 # 8-byte Folded Spill - st.d $s4, $sp, 56 # 8-byte Folded Spill - st.d $s5, $sp, 48 # 8-byte Folded Spill - st.d $s6, $sp, 40 # 8-byte Folded Spill - st.d $s7, $sp, 32 # 8-byte Folded Spill + addi.d $sp, $sp, -176 + .cfi_def_cfa_offset 176 + st.d $ra, $sp, 168 # 8-byte Folded Spill + st.d $fp, $sp, 160 # 8-byte Folded Spill + st.d $s0, $sp, 152 # 8-byte Folded Spill + st.d $s1, $sp, 144 # 8-byte Folded Spill + st.d $s2, $sp, 136 # 8-byte Folded Spill + st.d $s3, $sp, 128 # 8-byte Folded Spill + st.d $s4, $sp, 120 # 8-byte Folded Spill + st.d $s5, $sp, 112 # 8-byte Folded Spill + st.d $s6, $sp, 104 # 8-byte Folded Spill + st.d $s7, $sp, 96 # 8-byte Folded Spill + fst.d $fs0, $sp, 88 # 8-byte Folded Spill + fst.d $fs1, $sp, 80 # 8-byte Folded Spill + fst.d $fs2, $sp, 72 # 8-byte Folded Spill + fst.d $fs3, $sp, 64 # 8-byte Folded Spill + fst.d $fs4, $sp, 56 # 8-byte Folded Spill + fst.d $fs5, $sp, 48 # 8-byte Folded Spill + fst.d $fs6, $sp, 40 # 8-byte Folded Spill + fst.d $fs7, $sp, 32 # 8-byte Folded Spill .cfi_offset 1, -8 .cfi_offset 22, -16 .cfi_offset 23, -24 @@ -20698,6 +21017,14 @@ _Z67benchForTruncOrZextVecWithAddInLoopWithVW8From_uint8_t_To_uint64_t_RN9benchm .cfi_offset 28, -64 .cfi_offset 29, -72 .cfi_offset 30, -80 + .cfi_offset 56, -88 + .cfi_offset 57, -96 + .cfi_offset 58, -104 + .cfi_offset 59, -112 + .cfi_offset 60, -120 + .cfi_offset 61, -128 + .cfi_offset 62, -136 + .cfi_offset 63, -144 move $s0, $a0 lu12i.w $s5, 2 ori $s2, $s5, 1808 @@ -20775,7 +21102,6 @@ _Z67benchForTruncOrZextVecWithAddInLoopWithVW8From_uint8_t_To_uint64_t_RN9benchm addi.d $a1, $sp, 8 ori $a2, $s6, 2288 ori $a3, $s6, 2304 - vrepli.b $vr0, 0 ori $a4, $s5, 1792 .p2align 4, , 16 .LBB51_11: # %.lr.ph @@ -20808,86 +21134,118 @@ _Z67benchForTruncOrZextVecWithAddInLoopWithVW8From_uint8_t_To_uint64_t_RN9benchm ldptr.d $t2, $t0, 9992 ldptr.d $t3, $t0, 10000 ldptr.d $t0, $t0, 10008 - vinsgr2vr.d $vr1, $t1, 0 - vinsgr2vr.d $vr2, $t2, 0 - vinsgr2vr.d $vr3, $t3, 0 - vinsgr2vr.d $vr4, $t0, 0 - vilvl.b $vr1, $vr0, $vr1 - vilvh.h $vr5, $vr0, $vr1 - vilvl.w $vr6, $vr0, $vr5 - vilvh.w $vr5, $vr0, $vr5 - vilvl.h $vr1, $vr0, $vr1 - vilvl.w $vr7, $vr0, $vr1 - vilvh.w $vr1, $vr0, $vr1 - vilvl.b $vr2, $vr0, $vr2 - vilvh.h $vr8, $vr0, $vr2 - vilvl.w $vr9, $vr0, $vr8 - vilvh.w $vr8, $vr0, 
$vr8 - vilvl.h $vr2, $vr0, $vr2 - vilvl.w $vr10, $vr0, $vr2 - vilvh.w $vr2, $vr0, $vr2 - vilvl.b $vr3, $vr0, $vr3 - vilvh.h $vr11, $vr0, $vr3 - vilvl.w $vr12, $vr0, $vr11 - vilvh.w $vr11, $vr0, $vr11 - vilvl.h $vr3, $vr0, $vr3 - vilvl.w $vr13, $vr0, $vr3 - vilvh.w $vr3, $vr0, $vr3 - vilvl.b $vr4, $vr0, $vr4 - vilvh.h $vr14, $vr0, $vr4 - vilvl.h $vr4, $vr0, $vr4 - vilvl.w $vr15, $vr0, $vr4 - vld $vr16, $a6, -112 - vilvh.w $vr4, $vr0, $vr4 - vld $vr17, $a6, -128 + vinsgr2vr.d $vr3, $t1, 0 + vinsgr2vr.d $vr4, $t2, 0 + vinsgr2vr.d $vr5, $t3, 0 + vinsgr2vr.d $vr6, $t0, 0 + vsllwil.hu.bu $vr0, $vr3, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.du.wu $vr0, $vr0, 0 + vsrli.d $vr1, $vr3, 32 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vsllwil.du.wu $vr1, $vr1, 0 + vsrli.d $vr2, $vr3, 48 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.du.wu $vr2, $vr2, 0 + vshuf4i.b $vr3, $vr3, 14 + vsllwil.hu.bu $vr3, $vr3, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vsllwil.du.wu $vr7, $vr3, 0 + vsllwil.hu.bu $vr3, $vr4, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vsllwil.du.wu $vr3, $vr3, 0 + vsrli.d $vr8, $vr4, 32 + vsllwil.hu.bu $vr8, $vr8, 0 + vsllwil.wu.hu $vr8, $vr8, 0 + vsllwil.du.wu $vr8, $vr8, 0 + vsrli.d $vr9, $vr4, 48 + vsllwil.hu.bu $vr9, $vr9, 0 + vsllwil.wu.hu $vr9, $vr9, 0 + vsllwil.du.wu $vr9, $vr9, 0 + vshuf4i.b $vr4, $vr4, 14 + vsllwil.hu.bu $vr4, $vr4, 0 + vsllwil.wu.hu $vr4, $vr4, 0 + vsllwil.du.wu $vr4, $vr4, 0 + vsllwil.hu.bu $vr10, $vr5, 0 + vsllwil.wu.hu $vr10, $vr10, 0 + vsllwil.du.wu $vr10, $vr10, 0 + vsrli.d $vr11, $vr5, 32 + vsllwil.hu.bu $vr11, $vr11, 0 + vsllwil.wu.hu $vr11, $vr11, 0 + vsllwil.du.wu $vr11, $vr11, 0 + vsrli.d $vr12, $vr5, 48 + vsllwil.hu.bu $vr12, $vr12, 0 + vsllwil.wu.hu $vr12, $vr12, 0 + vsllwil.du.wu $vr12, $vr12, 0 + vshuf4i.b $vr5, $vr5, 14 + vsllwil.hu.bu $vr5, $vr5, 0 + vsllwil.wu.hu $vr5, $vr5, 0 + vsllwil.du.wu $vr5, $vr5, 0 + vsllwil.hu.bu $vr13, $vr6, 0 + vsllwil.wu.hu $vr13, $vr13, 0 + vsllwil.du.wu $vr13, $vr13, 0 + vsrli.d $vr14, $vr6, 32 + vsllwil.hu.bu $vr14, $vr14, 0 + vsllwil.wu.hu $vr14, $vr14, 0 + vsllwil.du.wu $vr14, $vr14, 0 + vsrli.d $vr15, $vr6, 48 + vsllwil.hu.bu $vr15, $vr15, 0 + vsllwil.wu.hu $vr15, $vr15, 0 + vsllwil.du.wu $vr15, $vr15, 0 + vshuf4i.b $vr6, $vr6, 14 + vsllwil.hu.bu $vr6, $vr6, 0 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.du.wu $vr6, $vr6, 0 + vld $vr16, $a6, -128 + vld $vr17, $a6, -96 vld $vr18, $a6, -80 - vadd.d $vr1, $vr16, $vr1 - vld $vr16, $a6, -96 - vadd.d $vr7, $vr17, $vr7 - vld $vr17, $a6, -48 - vadd.d $vr5, $vr18, $vr5 - vld $vr18, $a6, -64 - vadd.d $vr6, $vr16, $vr6 - vld $vr16, $a6, -16 - vadd.d $vr2, $vr17, $vr2 - vld $vr17, $a6, -32 - vadd.d $vr10, $vr18, $vr10 - vld $vr18, $a6, 16 - vadd.d $vr8, $vr16, $vr8 - vld $vr16, $a6, 0 - vadd.d $vr9, $vr17, $vr9 - vld $vr17, $a6, 48 - vadd.d $vr3, $vr18, $vr3 - vld $vr18, $a6, 32 - vadd.d $vr13, $vr16, $vr13 - vld $vr16, $a6, 80 - vadd.d $vr11, $vr17, $vr11 - vld $vr17, $a6, 64 - vadd.d $vr12, $vr18, $vr12 - vld $vr18, $a6, 112 - vadd.d $vr4, $vr16, $vr4 - vld $vr16, $a6, 96 - vadd.d $vr15, $vr17, $vr15 - vilvh.w $vr17, $vr0, $vr14 - vadd.d $vr17, $vr18, $vr17 - vilvl.w $vr14, $vr0, $vr14 - vadd.d $vr14, $vr16, $vr14 - vst $vr6, $a6, -96 - vst $vr5, $a6, -80 - vst $vr7, $a6, -128 - vst $vr1, $a6, -112 - vst $vr9, $a6, -32 - vst $vr8, $a6, -16 - vst $vr10, $a6, -64 - vst $vr2, $a6, -48 - vst $vr12, $a6, 32 - vst $vr11, $a6, 48 - vst $vr13, $a6, 0 - vst $vr3, $a6, 16 + vld $vr19, $a6, -112 + vld $vr20, $a6, -64 + vld $vr21, $a6, -32 + vld $vr22, $a6, -16 + vld 
$vr23, $a6, -48 + vld $vr24, $a6, 0 + vld $vr25, $a6, 32 + vld $vr26, $a6, 48 + vld $vr27, $a6, 16 + vld $vr28, $a6, 64 + vld $vr29, $a6, 96 + vld $vr30, $a6, 112 + vld $vr31, $a6, 80 + vadd.d $vr7, $vr19, $vr7 + vadd.d $vr2, $vr18, $vr2 + vadd.d $vr1, $vr17, $vr1 + vadd.d $vr0, $vr16, $vr0 + vadd.d $vr4, $vr23, $vr4 + vadd.d $vr9, $vr22, $vr9 + vadd.d $vr8, $vr21, $vr8 + vadd.d $vr3, $vr20, $vr3 + vadd.d $vr5, $vr27, $vr5 + vadd.d $vr12, $vr26, $vr12 + vadd.d $vr11, $vr25, $vr11 + vadd.d $vr10, $vr24, $vr10 + vadd.d $vr6, $vr31, $vr6 + vadd.d $vr15, $vr30, $vr15 + vadd.d $vr14, $vr29, $vr14 + vadd.d $vr13, $vr28, $vr13 + vst $vr0, $a6, -128 + vst $vr1, $a6, -96 + vst $vr2, $a6, -80 + vst $vr7, $a6, -112 + vst $vr3, $a6, -64 + vst $vr8, $a6, -32 + vst $vr9, $a6, -16 + vst $vr4, $a6, -48 + vst $vr10, $a6, 0 + vst $vr11, $a6, 32 + vst $vr12, $a6, 48 + vst $vr5, $a6, 16 + vst $vr13, $a6, 64 vst $vr14, $a6, 96 - vst $vr17, $a6, 112 - vst $vr15, $a6, 64 - vst $vr4, $a6, 80 + vst $vr15, $a6, 112 + vst $vr6, $a6, 80 addi.d $a7, $a7, 32 addi.d $a6, $a6, 256 bnez $a7, .LBB51_15 @@ -20930,17 +21288,25 @@ _Z67benchForTruncOrZextVecWithAddInLoopWithVW8From_uint8_t_To_uint64_t_RN9benchm move $a0, $fp pcaddu18i $ra, %call36(_ZdaPv) jirl $ra, $ra, 0 - ld.d $s7, $sp, 32 # 8-byte Folded Reload - ld.d $s6, $sp, 40 # 8-byte Folded Reload - ld.d $s5, $sp, 48 # 8-byte Folded Reload - ld.d $s4, $sp, 56 # 8-byte Folded Reload - ld.d $s3, $sp, 64 # 8-byte Folded Reload - ld.d $s2, $sp, 72 # 8-byte Folded Reload - ld.d $s1, $sp, 80 # 8-byte Folded Reload - ld.d $s0, $sp, 88 # 8-byte Folded Reload - ld.d $fp, $sp, 96 # 8-byte Folded Reload - ld.d $ra, $sp, 104 # 8-byte Folded Reload - addi.d $sp, $sp, 112 + fld.d $fs7, $sp, 32 # 8-byte Folded Reload + fld.d $fs6, $sp, 40 # 8-byte Folded Reload + fld.d $fs5, $sp, 48 # 8-byte Folded Reload + fld.d $fs4, $sp, 56 # 8-byte Folded Reload + fld.d $fs3, $sp, 64 # 8-byte Folded Reload + fld.d $fs2, $sp, 72 # 8-byte Folded Reload + fld.d $fs1, $sp, 80 # 8-byte Folded Reload + fld.d $fs0, $sp, 88 # 8-byte Folded Reload + ld.d $s7, $sp, 96 # 8-byte Folded Reload + ld.d $s6, $sp, 104 # 8-byte Folded Reload + ld.d $s5, $sp, 112 # 8-byte Folded Reload + ld.d $s4, $sp, 120 # 8-byte Folded Reload + ld.d $s3, $sp, 128 # 8-byte Folded Reload + ld.d $s2, $sp, 136 # 8-byte Folded Reload + ld.d $s1, $sp, 144 # 8-byte Folded Reload + ld.d $s0, $sp, 152 # 8-byte Folded Reload + ld.d $fp, $sp, 160 # 8-byte Folded Reload + ld.d $ra, $sp, 168 # 8-byte Folded Reload + addi.d $sp, $sp, 176 ret .LBB51_24: .Ltmp1121: # EH_LABEL @@ -21029,36 +21395,36 @@ _Z68benchForTruncOrZextVecWithAddInLoopWithVW16From_uint8_t_To_uint64_t_RN9bench .cfi_personality 155, DW.ref.__gxx_personality_v0 .cfi_lsda 27, .Lexception52 # %bb.0: - addi.d $sp, $sp, -288 - .cfi_def_cfa_offset 288 - st.d $ra, $sp, 280 # 8-byte Folded Spill - st.d $fp, $sp, 272 # 8-byte Folded Spill - st.d $s0, $sp, 264 # 8-byte Folded Spill - st.d $s1, $sp, 256 # 8-byte Folded Spill - st.d $s2, $sp, 248 # 8-byte Folded Spill - st.d $s3, $sp, 240 # 8-byte Folded Spill - st.d $s4, $sp, 232 # 8-byte Folded Spill - st.d $s5, $sp, 224 # 8-byte Folded Spill - st.d $s6, $sp, 216 # 8-byte Folded Spill - st.d $s7, $sp, 208 # 8-byte Folded Spill - fst.d $fs0, $sp, 200 # 8-byte Folded Spill - fst.d $fs1, $sp, 192 # 8-byte Folded Spill - fst.d $fs2, $sp, 184 # 8-byte Folded Spill - fst.d $fs3, $sp, 176 # 8-byte Folded Spill - fst.d $fs4, $sp, 168 # 8-byte Folded Spill - fst.d $fs5, $sp, 160 # 8-byte Folded Spill - fst.d $fs6, $sp, 152 # 8-byte Folded 
Spill - fst.d $fs7, $sp, 144 # 8-byte Folded Spill - .cfi_offset 1, -8 - .cfi_offset 22, -16 - .cfi_offset 23, -24 - .cfi_offset 24, -32 - .cfi_offset 25, -40 - .cfi_offset 26, -48 - .cfi_offset 27, -56 - .cfi_offset 28, -64 - .cfi_offset 29, -72 - .cfi_offset 30, -80 + addi.d $sp, $sp, -224 + .cfi_def_cfa_offset 224 + st.d $ra, $sp, 216 # 8-byte Folded Spill + st.d $fp, $sp, 208 # 8-byte Folded Spill + st.d $s0, $sp, 200 # 8-byte Folded Spill + st.d $s1, $sp, 192 # 8-byte Folded Spill + st.d $s2, $sp, 184 # 8-byte Folded Spill + st.d $s3, $sp, 176 # 8-byte Folded Spill + st.d $s4, $sp, 168 # 8-byte Folded Spill + st.d $s5, $sp, 160 # 8-byte Folded Spill + st.d $s6, $sp, 152 # 8-byte Folded Spill + st.d $s7, $sp, 144 # 8-byte Folded Spill + fst.d $fs0, $sp, 136 # 8-byte Folded Spill + fst.d $fs1, $sp, 128 # 8-byte Folded Spill + fst.d $fs2, $sp, 120 # 8-byte Folded Spill + fst.d $fs3, $sp, 112 # 8-byte Folded Spill + fst.d $fs4, $sp, 104 # 8-byte Folded Spill + fst.d $fs5, $sp, 96 # 8-byte Folded Spill + fst.d $fs6, $sp, 88 # 8-byte Folded Spill + fst.d $fs7, $sp, 80 # 8-byte Folded Spill + .cfi_offset 1, -8 + .cfi_offset 22, -16 + .cfi_offset 23, -24 + .cfi_offset 24, -32 + .cfi_offset 25, -40 + .cfi_offset 26, -48 + .cfi_offset 27, -56 + .cfi_offset 28, -64 + .cfi_offset 29, -72 + .cfi_offset 30, -80 .cfi_offset 56, -88 .cfi_offset 57, -96 .cfi_offset 58, -104 @@ -21083,10 +21449,10 @@ _Z68benchForTruncOrZextVecWithAddInLoopWithVW16From_uint8_t_To_uint64_t_RN9bench .Ltmp1134: # EH_LABEL # %bb.1: move $s3, $a0 - st.d $a0, $sp, 120 + st.d $a0, $sp, 56 lu12i.w $a0, 15 ori $a0, $a0, 3840 - st.h $a0, $sp, 128 + st.h $a0, $sp, 64 lu12i.w $s6, -3 ori $s7, $s6, 2288 pcalau12i $a0, %pc_hi20(_ZL3rng) @@ -21094,8 +21460,8 @@ _Z68benchForTruncOrZextVecWithAddInLoopWithVW16From_uint8_t_To_uint64_t_RN9bench .p2align 4, , 16 .LBB52_2: # =>This Inner Loop Header: Depth=1 .Ltmp1136: # EH_LABEL - addi.d $a0, $sp, 128 - addi.d $a2, $sp, 128 + addi.d $a0, $sp, 64 + addi.d $a2, $sp, 64 move $a1, $s4 pcaddu18i $ra, %call36(_ZNSt24uniform_int_distributionIhEclISt23mersenne_twister_engineImLm32ELm624ELm397ELm31ELm2567483615ELm11ELm4294967295ELm7ELm2636928640ELm15ELm4022730752ELm18ELm1812433253EEEEhRT_RKNS0_10param_typeE) jirl $ra, $ra, 0 @@ -21109,15 +21475,15 @@ _Z68benchForTruncOrZextVecWithAddInLoopWithVW16From_uint8_t_To_uint64_t_RN9bench # %bb.4: pcalau12i $a0, %pc_hi20(.LCPI52_0) vld $vr0, $a0, %pc_lo12(.LCPI52_0) - vst $vr0, $sp, 128 + vst $vr0, $sp, 64 pcalau12i $a0, %pc_hi20(_ZL3rng) addi.d $s4, $a0, %pc_lo12(_ZL3rng) move $s7, $zero .p2align 4, , 16 .LBB52_5: # =>This Inner Loop Header: Depth=1 .Ltmp1139: # EH_LABEL - addi.d $a0, $sp, 128 - addi.d $a2, $sp, 128 + addi.d $a0, $sp, 64 + addi.d $a2, $sp, 64 move $a1, $s4 pcaddu18i $ra, %call36(_ZNSt24uniform_int_distributionImEclISt23mersenne_twister_engineImLm32ELm624ELm397ELm31ELm2567483615ELm11ELm4294967295ELm7ELm2636928640ELm15ELm4022730752ELm18ELm1812433253EEEEmRT_RKNS0_10param_typeE) jirl $ra, $ra, 0 @@ -21141,13 +21507,12 @@ _Z68benchForTruncOrZextVecWithAddInLoopWithVW16From_uint8_t_To_uint64_t_RN9bench beqz $s3, .LBB52_20 # %bb.10: # %.lr.ph.preheader add.d $a0, $fp, $s1 - addi.d $a1, $sp, 120 + addi.d $a1, $sp, 56 ori $a2, $s6, 2288 ori $a3, $s6, 2304 ori $a4, $s5, 1792 ori $a5, $s5, 1824 ori $a6, $s5, 1840 - vrepli.b $vr0, 0 .p2align 4, , 16 .LBB52_11: # %.lr.ph # =>This Loop Header: Depth=1 @@ -21156,7 +21521,7 @@ _Z68benchForTruncOrZextVecWithAddInLoopWithVW16From_uint8_t_To_uint64_t_RN9bench #APP #NO_APP #MEMBARRIER - ld.d $a7, $sp, 120 + 
ld.d $a7, $sp, 56 bgeu $a7, $a0, .LBB52_14 # %bb.12: # %.lr.ph # in Loop: Header=BB52_11 Depth=1 @@ -21175,174 +21540,234 @@ _Z68benchForTruncOrZextVecWithAddInLoopWithVW16From_uint8_t_To_uint64_t_RN9bench # Parent Loop BB52_11 Depth=1 # => This Inner Loop Header: Depth=2 add.d $t2, $fp, $t1 - vldx $vr1, $t2, $a4 - vldx $vr2, $t2, $s1 - vldx $vr4, $t2, $a5 - vldx $vr13, $t2, $a6 - vilvh.b $vr3, $vr0, $vr1 - vilvh.h $vr5, $vr0, $vr3 - vilvl.w $vr6, $vr0, $vr5 - vst $vr6, $sp, 80 # 16-byte Folded Spill - vilvh.w $vr8, $vr0, $vr5 - vilvl.h $vr3, $vr0, $vr3 - vilvl.w $vr12, $vr0, $vr3 - vilvh.w $vr7, $vr0, $vr3 - vilvl.b $vr1, $vr0, $vr1 - vilvh.h $vr3, $vr0, $vr1 - vilvl.w $vr6, $vr0, $vr3 - vilvh.w $vr22, $vr0, $vr3 - vilvl.h $vr1, $vr0, $vr1 - vilvl.w $vr26, $vr0, $vr1 - vilvh.w $vr28, $vr0, $vr1 - vilvh.b $vr1, $vr0, $vr2 - vilvh.h $vr5, $vr0, $vr1 - vilvl.w $vr3, $vr0, $vr5 - vst $vr3, $sp, 96 # 16-byte Folded Spill - vilvh.w $vr3, $vr0, $vr5 - vilvl.h $vr1, $vr0, $vr1 - vilvl.w $vr10, $vr0, $vr1 - vilvh.w $vr14, $vr0, $vr1 - vilvl.b $vr1, $vr0, $vr2 - vilvh.h $vr2, $vr0, $vr1 - vilvl.w $vr18, $vr0, $vr2 - vilvh.w $vr21, $vr0, $vr2 - vilvl.h $vr1, $vr0, $vr1 - vilvl.w $vr24, $vr0, $vr1 - vilvh.w $vr27, $vr0, $vr1 - vilvh.b $vr1, $vr0, $vr4 - vilvh.h $vr5, $vr0, $vr1 - vilvl.w $vr2, $vr0, $vr5 - vilvh.w $vr5, $vr0, $vr5 - vilvl.h $vr1, $vr0, $vr1 - vilvl.w $vr9, $vr0, $vr1 - vilvh.w $vr11, $vr0, $vr1 - vilvl.b $vr1, $vr0, $vr4 - vilvh.h $vr4, $vr0, $vr1 - vilvl.w $vr17, $vr0, $vr4 - vilvh.w $vr19, $vr0, $vr4 - vilvl.h $vr1, $vr0, $vr1 - vilvl.w $vr23, $vr0, $vr1 - vilvh.w $vr25, $vr0, $vr1 - vilvh.b $vr20, $vr0, $vr13 - vilvl.h $vr16, $vr0, $vr20 - vilvl.b $vr29, $vr0, $vr13 - vilvh.h $vr15, $vr0, $vr29 - vilvl.w $vr1, $vr0, $vr15 - vst $vr1, $sp, 64 # 16-byte Folded Spill - vld $vr30, $t0, -240 - vilvl.h $vr29, $vr0, $vr29 - vld $vr31, $t0, -256 - vld $vr1, $t0, -208 - vadd.d $vr4, $vr30, $vr28 - vst $vr4, $sp, 48 # 16-byte Folded Spill + vldx $vr4, $t2, $a4 + vldx $vr10, $t2, $s1 + vldx $vr15, $t2, $a5 + vldx $vr7, $t2, $a6 + vsllwil.hu.bu $vr1, $vr4, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vsllwil.du.wu $vr1, $vr1, 0 + vbsrl.v $vr2, $vr4, 12 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.du.wu $vr2, $vr2, 0 + vbsrl.v $vr3, $vr4, 14 + vsllwil.hu.bu $vr3, $vr3, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vsllwil.du.wu $vr3, $vr3, 0 + vbsrl.v $vr5, $vr4, 8 + vsllwil.hu.bu $vr5, $vr5, 0 + vsllwil.wu.hu $vr5, $vr5, 0 + vsllwil.du.wu $vr5, $vr5, 0 + vbsrl.v $vr6, $vr4, 10 + vsllwil.hu.bu $vr6, $vr6, 0 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.du.wu $vr0, $vr6, 0 + vsrli.d $vr6, $vr4, 32 + vsllwil.hu.bu $vr6, $vr6, 0 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.du.wu $vr9, $vr6, 0 + vsrli.d $vr6, $vr4, 48 + vsllwil.hu.bu $vr6, $vr6, 0 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.du.wu $vr12, $vr6, 0 + vshuf4i.b $vr4, $vr4, 14 + vsllwil.hu.bu $vr4, $vr4, 0 + vsllwil.wu.hu $vr4, $vr4, 0 + vsllwil.du.wu $vr16, $vr4, 0 + vsllwil.hu.bu $vr4, $vr10, 0 + vsllwil.wu.hu $vr4, $vr4, 0 + vsllwil.du.wu $vr4, $vr4, 0 + vbsrl.v $vr6, $vr10, 12 + vsllwil.hu.bu $vr6, $vr6, 0 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.du.wu $vr6, $vr6, 0 + vbsrl.v $vr8, $vr10, 14 + vsllwil.hu.bu $vr8, $vr8, 0 + vsllwil.wu.hu $vr8, $vr8, 0 + vsllwil.du.wu $vr8, $vr8, 0 + vbsrl.v $vr11, $vr10, 8 + vsllwil.hu.bu $vr11, $vr11, 0 + vsllwil.wu.hu $vr11, $vr11, 0 + vsllwil.du.wu $vr11, $vr11, 0 + vbsrl.v $vr13, $vr10, 10 + vsllwil.hu.bu $vr13, $vr13, 0 + vsllwil.wu.hu $vr13, $vr13, 0 + vsllwil.du.wu $vr14, $vr13, 0 + vsrli.d $vr13, $vr10, 32 
+ vsllwil.hu.bu $vr13, $vr13, 0 + vsllwil.wu.hu $vr13, $vr13, 0 + vsllwil.du.wu $vr18, $vr13, 0 + vsrli.d $vr13, $vr10, 48 + vsllwil.hu.bu $vr13, $vr13, 0 + vsllwil.wu.hu $vr13, $vr13, 0 + vsllwil.du.wu $vr21, $vr13, 0 + vshuf4i.b $vr10, $vr10, 14 + vsllwil.hu.bu $vr10, $vr10, 0 + vsllwil.wu.hu $vr10, $vr10, 0 + vsllwil.du.wu $vr23, $vr10, 0 + vsllwil.hu.bu $vr10, $vr15, 0 + vsllwil.wu.hu $vr10, $vr10, 0 + vsllwil.du.wu $vr10, $vr10, 0 + vst $vr10, $sp, 32 # 16-byte Folded Spill + vbsrl.v $vr10, $vr15, 12 + vsllwil.hu.bu $vr10, $vr10, 0 + vsllwil.wu.hu $vr10, $vr10, 0 + vsllwil.du.wu $vr10, $vr10, 0 + vbsrl.v $vr13, $vr15, 14 + vsllwil.hu.bu $vr13, $vr13, 0 + vsllwil.wu.hu $vr13, $vr13, 0 + vsllwil.du.wu $vr17, $vr13, 0 + vbsrl.v $vr13, $vr15, 8 + vsllwil.hu.bu $vr13, $vr13, 0 + vsllwil.wu.hu $vr13, $vr13, 0 + vsllwil.du.wu $vr19, $vr13, 0 + vbsrl.v $vr13, $vr15, 10 + vsllwil.hu.bu $vr13, $vr13, 0 + vsllwil.wu.hu $vr13, $vr13, 0 + vsllwil.du.wu $vr22, $vr13, 0 + vsrli.d $vr13, $vr15, 32 + vsllwil.hu.bu $vr13, $vr13, 0 + vsllwil.wu.hu $vr13, $vr13, 0 + vsllwil.du.wu $vr25, $vr13, 0 + vsrli.d $vr13, $vr15, 48 + vsllwil.hu.bu $vr13, $vr13, 0 + vsllwil.wu.hu $vr13, $vr13, 0 + vsllwil.du.wu $vr26, $vr13, 0 + vshuf4i.b $vr13, $vr15, 14 + vsllwil.hu.bu $vr13, $vr13, 0 + vsllwil.wu.hu $vr13, $vr13, 0 + vsllwil.du.wu $vr27, $vr13, 0 + vbsrl.v $vr15, $vr7, 14 + vbsrl.v $vr20, $vr7, 8 + vbsrl.v $vr24, $vr7, 10 + vld $vr13, $t0, -240 + vsrli.d $vr28, $vr7, 32 + vld $vr29, $t0, -208 vld $vr30, $t0, -224 - vadd.d $vr4, $vr31, $vr26 - vst $vr4, $sp, 32 # 16-byte Folded Spill + vadd.d $vr13, $vr13, $vr16 + vst $vr13, $sp, 16 # 16-byte Folded Spill vld $vr31, $t0, -176 - vadd.d $vr1, $vr1, $vr22 - vst $vr1, $sp, 16 # 16-byte Folded Spill - vld $vr1, $t0, -192 - vadd.d $vr22, $vr30, $vr6 - vld $vr30, $t0, -144 - vadd.d $vr13, $vr31, $vr7 - vld $vr31, $t0, -160 - vadd.d $vr4, $vr1, $vr12 - vld $vr12, $t0, -112 - vadd.d $vr30, $vr30, $vr8 - vld $vr8, $t0, -128 - vld $vr1, $sp, 80 # 16-byte Folded Reload + vadd.d $vr16, $vr29, $vr12 + vld $vr29, $t0, -192 + vadd.d $vr13, $vr30, $vr9 + vld $vr9, $t0, -144 + vadd.d $vr12, $vr31, $vr0 + vld $vr30, $t0, -160 + vadd.d $vr5, $vr29, $vr5 + vld $vr31, $t0, -256 + vadd.d $vr29, $vr9, $vr3 + vld $vr3, $t0, -112 + vadd.d $vr30, $vr30, $vr2 + vld $vr2, $t0, -80 vadd.d $vr31, $vr31, $vr1 - vld $vr6, $t0, -80 - vadd.d $vr12, $vr12, $vr27 - vld $vr27, $t0, -96 - vadd.d $vr8, $vr8, $vr24 - vld $vr24, $t0, -48 - vadd.d $vr6, $vr6, $vr21 + vld $vr1, $t0, -96 + vadd.d $vr9, $vr3, $vr23 + vld $vr23, $t0, -48 + vadd.d $vr2, $vr2, $vr21 vld $vr21, $t0, -64 - vadd.d $vr18, $vr27, $vr18 - vld $vr27, $t0, -16 - vadd.d $vr14, $vr24, $vr14 - vld $vr24, $t0, -32 - vadd.d $vr10, $vr21, $vr10 - vld $vr21, $t0, 16 - vadd.d $vr7, $vr27, $vr3 - vld $vr27, $t0, 0 - vld $vr1, $sp, 96 # 16-byte Folded Reload - vadd.d $vr3, $vr24, $vr1 - vld $vr24, $t0, 48 - vadd.d $vr21, $vr21, $vr25 - vld $vr25, $t0, 32 - vadd.d $vr23, $vr27, $vr23 + vadd.d $vr3, $vr1, $vr18 + vld $vr1, $t0, -16 + vadd.d $vr14, $vr23, $vr14 + vld $vr23, $t0, -32 + vadd.d $vr11, $vr21, $vr11 + vld $vr21, $t0, -128 + vadd.d $vr18, $vr1, $vr8 + vld $vr8, $t0, 16 + vadd.d $vr23, $vr23, $vr6 + vld $vr6, $t0, 48 + vadd.d $vr1, $vr21, $vr4 + vld $vr21, $t0, 32 + vadd.d $vr4, $vr8, $vr27 vld $vr27, $t0, 80 - vadd.d $vr19, $vr24, $vr19 - vld $vr24, $t0, 64 - vadd.d $vr17, $vr25, $vr17 + vadd.d $vr6, $vr6, $vr26 + vld $vr26, $t0, 64 + vadd.d $vr8, $vr21, $vr25 vld $vr25, $t0, 112 - vadd.d $vr11, $vr27, $vr11 - vld $vr27, $t0, 96 - vadd.d 
$vr9, $vr24, $vr9 - vilvl.w $vr24, $vr0, $vr29 - vadd.d $vr5, $vr25, $vr5 - vld $vr25, $t0, 144 - vadd.d $vr27, $vr27, $vr2 - vld $vr1, $t0, 128 - vilvh.w $vr29, $vr0, $vr29 - vadd.d $vr25, $vr25, $vr29 - vld $vr29, $t0, 176 - vadd.d $vr2, $vr1, $vr24 - vld $vr24, $t0, 160 - vilvh.w $vr15, $vr0, $vr15 - vadd.d $vr15, $vr29, $vr15 - vld $vr29, $t0, 208 - vld $vr1, $sp, 64 # 16-byte Folded Reload - vadd.d $vr24, $vr24, $vr1 - vld $vr1, $t0, 192 - vilvh.w $vr28, $vr0, $vr16 - vadd.d $vr28, $vr29, $vr28 - vilvl.w $vr16, $vr0, $vr16 - vadd.d $vr1, $vr1, $vr16 - vld $vr16, $t0, 240 - vilvh.h $vr20, $vr0, $vr20 - vld $vr29, $t0, 224 - vilvh.w $vr26, $vr0, $vr20 - vadd.d $vr16, $vr16, $vr26 - vilvl.w $vr20, $vr0, $vr20 - vadd.d $vr20, $vr29, $vr20 - vst $vr31, $t0, -160 - vst $vr30, $t0, -144 - vst $vr4, $t0, -192 - vst $vr13, $t0, -176 - vst $vr22, $t0, -224 - vld $vr4, $sp, 16 # 16-byte Folded Reload - vst $vr4, $t0, -208 - vld $vr4, $sp, 32 # 16-byte Folded Reload - vst $vr4, $t0, -256 - vld $vr4, $sp, 48 # 16-byte Folded Reload - vst $vr4, $t0, -240 - vst $vr3, $t0, -32 - vst $vr7, $t0, -16 - vst $vr10, $t0, -64 + vadd.d $vr21, $vr27, $vr22 + vld $vr22, $t0, 96 + vadd.d $vr19, $vr26, $vr19 + vld $vr26, $t0, 0 + vadd.d $vr17, $vr25, $vr17 + vsrli.d $vr25, $vr7, 48 + vadd.d $vr22, $vr22, $vr10 + vshuf4i.b $vr10, $vr7, 14 + vld $vr0, $sp, 32 # 16-byte Folded Reload + vadd.d $vr26, $vr26, $vr0 + vld $vr27, $t0, 144 + vsllwil.hu.bu $vr10, $vr10, 0 + vsllwil.wu.hu $vr10, $vr10, 0 + vsllwil.du.wu $vr10, $vr10, 0 + vadd.d $vr10, $vr27, $vr10 + vld $vr27, $t0, 176 + vsllwil.hu.bu $vr25, $vr25, 0 + vsllwil.wu.hu $vr25, $vr25, 0 + vsllwil.du.wu $vr25, $vr25, 0 + vadd.d $vr25, $vr27, $vr25 + vld $vr27, $t0, 160 + vsllwil.hu.bu $vr28, $vr28, 0 + vsllwil.wu.hu $vr28, $vr28, 0 + vsllwil.du.wu $vr28, $vr28, 0 + vadd.d $vr27, $vr27, $vr28 + vld $vr28, $t0, 208 + vsllwil.hu.bu $vr24, $vr24, 0 + vsllwil.wu.hu $vr24, $vr24, 0 + vsllwil.du.wu $vr24, $vr24, 0 + vadd.d $vr24, $vr28, $vr24 + vld $vr28, $t0, 192 + vsllwil.hu.bu $vr20, $vr20, 0 + vsllwil.wu.hu $vr20, $vr20, 0 + vsllwil.du.wu $vr20, $vr20, 0 + vadd.d $vr20, $vr28, $vr20 + vld $vr28, $t0, 240 + vsllwil.hu.bu $vr15, $vr15, 0 + vsllwil.wu.hu $vr15, $vr15, 0 + vsllwil.du.wu $vr15, $vr15, 0 + vadd.d $vr15, $vr28, $vr15 + vbsrl.v $vr28, $vr7, 12 + vld $vr0, $t0, 224 + vsllwil.hu.bu $vr28, $vr28, 0 + vsllwil.wu.hu $vr28, $vr28, 0 + vsllwil.du.wu $vr28, $vr28, 0 + vadd.d $vr0, $vr0, $vr28 + vld $vr28, $t0, 128 + vsllwil.hu.bu $vr7, $vr7, 0 + vsllwil.wu.hu $vr7, $vr7, 0 + vsllwil.du.wu $vr7, $vr7, 0 + vadd.d $vr7, $vr28, $vr7 + vst $vr31, $t0, -256 + vst $vr30, $t0, -160 + vst $vr29, $t0, -144 + vst $vr5, $t0, -192 + vst $vr12, $t0, -176 + vst $vr13, $t0, -224 + vst $vr16, $t0, -208 + vld $vr5, $sp, 16 # 16-byte Folded Reload + vst $vr5, $t0, -240 + vst $vr1, $t0, -128 + vst $vr23, $t0, -32 + vst $vr18, $t0, -16 + vst $vr11, $t0, -64 vst $vr14, $t0, -48 - vst $vr18, $t0, -96 - vst $vr6, $t0, -80 - vst $vr8, $t0, -128 - vst $vr12, $t0, -112 - vst $vr27, $t0, 96 - vst $vr5, $t0, 112 - vst $vr9, $t0, 64 - vst $vr11, $t0, 80 - vst $vr17, $t0, 32 - vst $vr19, $t0, 48 - vst $vr23, $t0, 0 - vst $vr21, $t0, 16 - vst $vr20, $t0, 224 - vst $vr16, $t0, 240 - vst $vr1, $t0, 192 - vst $vr28, $t0, 208 - vst $vr24, $t0, 160 - vst $vr15, $t0, 176 - vst $vr2, $t0, 128 - vst $vr25, $t0, 144 + vst $vr3, $t0, -96 + vst $vr2, $t0, -80 + vst $vr9, $t0, -112 + vst $vr26, $t0, 0 + vst $vr22, $t0, 96 + vst $vr17, $t0, 112 + vst $vr19, $t0, 64 + vst $vr21, $t0, 80 + vst $vr8, $t0, 32 + 
vst $vr6, $t0, 48
+ vst $vr4, $t0, 16
+ vst $vr7, $t0, 128
+ vst $vr0, $t0, 224
+ vst $vr15, $t0, 240
+ vst $vr20, $t0, 192
+ vst $vr24, $t0, 208
+ vst $vr27, $t0, 160
+ vst $vr25, $t0, 176
+ vst $vr10, $t0, 144
addi.d $t1, $t1, 64
addi.d $t0, $t0, 512
bnez $t1, .LBB52_15
@@ -21376,7 +21801,7 @@ _Z68benchForTruncOrZextVecWithAddInLoopWithVW16From_uint8_t_To_uint64_t_RN9bench
jirl $ra, $ra, 0
.Ltmp1145: # EH_LABEL
# %bb.21: # %_ZNSt10unique_ptrIA_mSt14default_deleteIS0_EED2Ev.exit22
- ld.d $a0, $sp, 120
+ ld.d $a0, $sp, 56
beqz $a0, .LBB52_23
# %bb.22: # %_ZNKSt14default_deleteIA_mEclImEENSt9enable_ifIXsr14is_convertibleIPA_T_PS0_EE5valueEvE4typeEPS4_.exit.i24
pcaddu18i $ra, %call36(_ZdaPv)
@@ -21385,25 +21810,25 @@ _Z68benchForTruncOrZextVecWithAddInLoopWithVW16From_uint8_t_To_uint64_t_RN9bench
move $a0, $fp
pcaddu18i $ra, %call36(_ZdaPv)
jirl $ra, $ra, 0
- fld.d $fs7, $sp, 144 # 8-byte Folded Reload
- fld.d $fs6, $sp, 152 # 8-byte Folded Reload
- fld.d $fs5, $sp, 160 # 8-byte Folded Reload
- fld.d $fs4, $sp, 168 # 8-byte Folded Reload
- fld.d $fs3, $sp, 176 # 8-byte Folded Reload
- fld.d $fs2, $sp, 184 # 8-byte Folded Reload
- fld.d $fs1, $sp, 192 # 8-byte Folded Reload
- fld.d $fs0, $sp, 200 # 8-byte Folded Reload
- ld.d $s7, $sp, 208 # 8-byte Folded Reload
- ld.d $s6, $sp, 216 # 8-byte Folded Reload
- ld.d $s5, $sp, 224 # 8-byte Folded Reload
- ld.d $s4, $sp, 232 # 8-byte Folded Reload
- ld.d $s3, $sp, 240 # 8-byte Folded Reload
- ld.d $s2, $sp, 248 # 8-byte Folded Reload
- ld.d $s1, $sp, 256 # 8-byte Folded Reload
- ld.d $s0, $sp, 264 # 8-byte Folded Reload
- ld.d $fp, $sp, 272 # 8-byte Folded Reload
- ld.d $ra, $sp, 280 # 8-byte Folded Reload
- addi.d $sp, $sp, 288
+ fld.d $fs7, $sp, 80 # 8-byte Folded Reload
+ fld.d $fs6, $sp, 88 # 8-byte Folded Reload
+ fld.d $fs5, $sp, 96 # 8-byte Folded Reload
+ fld.d $fs4, $sp, 104 # 8-byte Folded Reload
+ fld.d $fs3, $sp, 112 # 8-byte Folded Reload
+ fld.d $fs2, $sp, 120 # 8-byte Folded Reload
+ fld.d $fs1, $sp, 128 # 8-byte Folded Reload
+ fld.d $fs0, $sp, 136 # 8-byte Folded Reload
+ ld.d $s7, $sp, 144 # 8-byte Folded Reload
+ ld.d $s6, $sp, 152 # 8-byte Folded Reload
+ ld.d $s5, $sp, 160 # 8-byte Folded Reload
+ ld.d $s4, $sp, 168 # 8-byte Folded Reload
+ ld.d $s3, $sp, 176 # 8-byte Folded Reload
+ ld.d $s2, $sp, 184 # 8-byte Folded Reload
+ ld.d $s1, $sp, 192 # 8-byte Folded Reload
+ ld.d $s0, $sp, 200 # 8-byte Folded Reload
+ ld.d $fp, $sp, 208 # 8-byte Folded Reload
+ ld.d $ra, $sp, 216 # 8-byte Folded Reload
+ addi.d $sp, $sp, 224
ret
.LBB52_24:
.Ltmp1135: # EH_LABEL
@@ -21416,7 +21841,7 @@ _Z68benchForTruncOrZextVecWithAddInLoopWithVW16From_uint8_t_To_uint64_t_RN9bench
jirl $ra, $ra, 0
.LBB52_25:
.Ltmp1146: # EH_LABEL
- ld.d $s3, $sp, 120
+ ld.d $s3, $sp, 56
move $s0, $a0
bnez $s3, .LBB52_29
b .LBB52_30
@@ -21591,7 +22016,6 @@ _Z60benchForTruncOrZextVecWithAddInLoopFrom_uint8_t_To_uint64_t_RN9benchmark5Sta
addi.d $a1, $sp, 8
ori $a2, $s6, 2288
ori $a3, $s5, 1810
- vrepli.b $vr0, 0
b .LBB53_12
.p2align 4, , 16
.LBB53_11: # %_ZL27truncOrZextVecWithAddInLoopIhmEvPKT_PT0_i.exit
@@ -21639,20 +22063,20 @@ _Z60benchForTruncOrZextVecWithAddInLoopFrom_uint8_t_To_uint64_t_RN9benchmark5Sta
add.d $a6, $fp, $a5
ldx.h $a7, $a6, $s1
ldx.h $a6, $a6, $a3
- vinsgr2vr.h $vr1, $a7, 0
- vinsgr2vr.h $vr2, $a6, 0
- vilvl.b $vr1, $vr0, $vr1
- vilvl.h $vr1, $vr0, $vr1
- vilvl.w $vr1, $vr0, $vr1
- vilvl.b $vr2, $vr0, $vr2
- vld $vr3, $a4, -16
- vld $vr4, $a4, 0
- vilvl.h $vr2, $vr0, $vr2
- vilvl.w $vr2, $vr0, $vr2
+ vinsgr2vr.h $vr0, $a7, 0
+ vinsgr2vr.h $vr1,
$a6, 0 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.du.wu $vr0, $vr0, 0 + vsllwil.hu.bu $vr1, $vr1, 0 + vld $vr2, $a4, -16 + vld $vr3, $a4, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vsllwil.du.wu $vr1, $vr1, 0 + vadd.d $vr0, $vr2, $vr0 vadd.d $vr1, $vr3, $vr1 - vadd.d $vr2, $vr4, $vr2 - vst $vr1, $a4, -16 - vst $vr2, $a4, 0 + vst $vr0, $a4, -16 + vst $vr1, $a4, 0 addi.d $a5, $a5, 4 addi.d $a4, $a4, 32 bnez $a5, .LBB53_17 @@ -21767,19 +22191,19 @@ _Z61benchForTruncOrZextVecInLoopWithVW8From_uint16_t_To_uint32_t_RN9benchmark5St .cfi_personality 155, DW.ref.__gxx_personality_v0 .cfi_lsda 27, .Lexception54 # %bb.0: - addi.d $sp, $sp, -176 - .cfi_def_cfa_offset 176 - st.d $ra, $sp, 168 # 8-byte Folded Spill - st.d $fp, $sp, 160 # 8-byte Folded Spill - st.d $s0, $sp, 152 # 8-byte Folded Spill - st.d $s1, $sp, 144 # 8-byte Folded Spill - st.d $s2, $sp, 136 # 8-byte Folded Spill - st.d $s3, $sp, 128 # 8-byte Folded Spill - st.d $s4, $sp, 120 # 8-byte Folded Spill - st.d $s5, $sp, 112 # 8-byte Folded Spill - st.d $s6, $sp, 104 # 8-byte Folded Spill - st.d $s7, $sp, 96 # 8-byte Folded Spill - st.d $s8, $sp, 88 # 8-byte Folded Spill + addi.d $sp, $sp, -160 + .cfi_def_cfa_offset 160 + st.d $ra, $sp, 152 # 8-byte Folded Spill + st.d $fp, $sp, 144 # 8-byte Folded Spill + st.d $s0, $sp, 136 # 8-byte Folded Spill + st.d $s1, $sp, 128 # 8-byte Folded Spill + st.d $s2, $sp, 120 # 8-byte Folded Spill + st.d $s3, $sp, 112 # 8-byte Folded Spill + st.d $s4, $sp, 104 # 8-byte Folded Spill + st.d $s5, $sp, 96 # 8-byte Folded Spill + st.d $s6, $sp, 88 # 8-byte Folded Spill + st.d $s7, $sp, 80 # 8-byte Folded Spill + st.d $s8, $sp, 72 # 8-byte Folded Spill .cfi_offset 1, -8 .cfi_offset 22, -16 .cfi_offset 23, -24 @@ -21791,7 +22215,7 @@ _Z61benchForTruncOrZextVecInLoopWithVW8From_uint16_t_To_uint32_t_RN9benchmark5St .cfi_offset 29, -72 .cfi_offset 30, -80 .cfi_offset 31, -88 - st.d $a0, $sp, 40 # 8-byte Folded Spill + st.d $a0, $sp, 24 # 8-byte Folded Spill lu12i.w $s6, 4 ori $s3, $s6, 3616 move $a0, $s3 @@ -21807,22 +22231,22 @@ _Z61benchForTruncOrZextVecInLoopWithVW8From_uint16_t_To_uint32_t_RN9benchmark5St .Ltmp1162: # EH_LABEL # %bb.1: # %.split move $s2, $a0 - st.d $a0, $sp, 72 + st.d $a0, $sp, 56 .Ltmp1164: # EH_LABEL move $a0, $s0 pcaddu18i $ra, %call36(_Znam) jirl $ra, $ra, 0 - st.d $a0, $sp, 56 # 8-byte Folded Spill + st.d $a0, $sp, 40 # 8-byte Folded Spill .Ltmp1165: # EH_LABEL # %bb.2: ori $a0, $s6, 3586 add.d $s5, $fp, $a0 ori $a0, $s1, 3072 - st.d $a0, $sp, 64 # 8-byte Folded Spill + st.d $a0, $sp, 48 # 8-byte Folded Spill add.d $s1, $s2, $a0 lu12i.w $a0, -16 lu32i.d $a0, 0 - st.w $a0, $sp, 84 + st.w $a0, $sp, 68 lu12i.w $s7, -5 ori $s0, $s7, 480 pcalau12i $a0, %pc_hi20(_ZL3rng) @@ -21830,8 +22254,8 @@ _Z61benchForTruncOrZextVecInLoopWithVW8From_uint16_t_To_uint32_t_RN9benchmark5St .p2align 4, , 16 .LBB54_3: # =>This Inner Loop Header: Depth=1 .Ltmp1167: # EH_LABEL - addi.d $a0, $sp, 84 - addi.d $a2, $sp, 84 + addi.d $a0, $sp, 68 + addi.d $a2, $sp, 68 move $a1, $s4 pcaddu18i $ra, %call36(_ZNSt24uniform_int_distributionItEclISt23mersenne_twister_engineImLm32ELm624ELm397ELm31ELm2567483615ELm11ELm4294967295ELm7ELm2636928640ELm15ELm4022730752ELm18ELm1812433253EEEEtRT_RKNS0_10param_typeE) jirl $ra, $ra, 0 @@ -21843,7 +22267,7 @@ _Z61benchForTruncOrZextVecInLoopWithVW8From_uint16_t_To_uint32_t_RN9benchmark5St stx.h $a0, $a1, $s3 bnez $s0, .LBB54_3 # %bb.5: # %vector.ph - ld.d $a0, $sp, 56 # 8-byte Folded Reload + ld.d $a0, $sp, 40 # 8-byte Folded Reload addi.d $a0, $a0, 4 ori $a1, $s7, 480 
ori $a2, $s6, 3618 @@ -21859,13 +22283,12 @@ _Z61benchForTruncOrZextVecInLoopWithVW8From_uint16_t_To_uint32_t_RN9benchmark5St addi.d $a0, $a0, 8 bnez $a1, .LBB54_6 # %bb.7: # %vector.body85.preheader - st.d $s2, $sp, 48 # 8-byte Folded Spill + st.d $s2, $sp, 32 # 8-byte Folded Spill addi.d $a0, $s2, 64 ori $a1, $s7, 512 ori $s4, $s6, 3584 ori $a4, $s6, 3600 ori $a2, $s6, 3632 - vrepli.b $vr8, 0 .p2align 4, , 16 .LBB54_8: # %vector.body85 # =>This Inner Loop Header: Depth=1 @@ -21874,22 +22297,26 @@ _Z61benchForTruncOrZextVecInLoopWithVW8From_uint16_t_To_uint32_t_RN9benchmark5St vldx $vr1, $a3, $a4 vldx $vr2, $a3, $s3 vldx $vr3, $a3, $a2 - vilvl.h $vr4, $vr8, $vr0 - vilvh.h $vr0, $vr8, $vr0 - vilvl.h $vr5, $vr8, $vr1 - vilvh.h $vr1, $vr8, $vr1 - vilvl.h $vr6, $vr8, $vr2 - vilvh.h $vr2, $vr8, $vr2 - vilvl.h $vr7, $vr8, $vr3 - vilvh.h $vr3, $vr8, $vr3 - vst $vr0, $a0, -48 - vst $vr4, $a0, -64 - vst $vr1, $a0, -16 - vst $vr5, $a0, -32 - vst $vr2, $a0, 16 - vst $vr6, $a0, 0 - vst $vr3, $a0, 48 - vst $vr7, $a0, 32 + vbsrl.v $vr4, $vr0, 8 + vsllwil.wu.hu $vr4, $vr4, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vbsrl.v $vr5, $vr1, 8 + vsllwil.wu.hu $vr5, $vr5, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vbsrl.v $vr6, $vr2, 8 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vbsrl.v $vr7, $vr3, 8 + vsllwil.wu.hu $vr7, $vr7, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vst $vr0, $a0, -64 + vst $vr4, $a0, -48 + vst $vr1, $a0, -32 + vst $vr5, $a0, -16 + vst $vr2, $a0, 0 + vst $vr6, $a0, 16 + vst $vr3, $a0, 32 + vst $vr7, $a0, 48 addi.d $a1, $a1, 64 addi.d $a0, $a0, 128 bnez $a1, .LBB54_8 @@ -21939,10 +22366,10 @@ _Z61benchForTruncOrZextVecInLoopWithVW8From_uint16_t_To_uint32_t_RN9benchmark5St .p2align 4, , 16 .LBB54_10: # %_ZL27truncOrZextVecInLoopWithVW8ItjEvPKT_PT0_i.exit.preheader # =>This Inner Loop Header: Depth=1 - ld.d $a0, $sp, 48 # 8-byte Folded Reload + ld.d $a0, $sp, 32 # 8-byte Folded Reload ldx.w $a0, $a0, $a1 move $a2, $a1 - ld.d $a1, $sp, 56 # 8-byte Folded Reload + ld.d $a1, $sp, 40 # 8-byte Folded Reload ldx.w $a1, $a1, $a2 bne $a0, $a1, .LBB54_23 # %bb.11: # %_ZL27truncOrZextVecInLoopWithVW8ItjEvPKT_PT0_i.exit @@ -21951,26 +22378,24 @@ _Z61benchForTruncOrZextVecInLoopWithVW8From_uint16_t_To_uint32_t_RN9benchmark5St addi.d $a1, $a2, 4 bnez $s7, .LBB54_10 # %bb.12: - vst $vr8, $sp, 16 # 16-byte Folded Spill - st.d $a4, $sp, 48 # 8-byte Folded Spill - ld.d $a0, $sp, 40 # 8-byte Folded Reload + st.d $a4, $sp, 32 # 8-byte Folded Spill + ld.d $a0, $sp, 24 # 8-byte Folded Reload ld.w $a1, $a0, 28 - st.d $a1, $sp, 8 # 8-byte Folded Spill + st.d $a1, $sp, 16 # 8-byte Folded Spill ld.d $s7, $a0, 16 .Ltmp1185: # EH_LABEL pcaddu18i $ra, %call36(_ZN9benchmark5State16StartKeepRunningEv) jirl $ra, $ra, 0 .Ltmp1186: # EH_LABEL # %bb.13: # %_ZN9benchmark5State3endEv.exit.preheader - ld.d $t2, $sp, 48 # 8-byte Folded Reload - vld $vr8, $sp, 16 # 16-byte Folded Reload - ld.d $a0, $sp, 8 # 8-byte Folded Reload + ld.d $t2, $sp, 32 # 8-byte Folded Reload + ld.d $a0, $sp, 16 # 8-byte Folded Reload bnez $a0, .LBB54_19 # %bb.14: # %_ZN9benchmark5State3endEv.exit.preheader beqz $s7, .LBB54_19 # %bb.15: # %.lr.ph.preheader addi.d $a0, $fp, 48 - addi.d $a1, $sp, 72 + addi.d $a1, $sp, 56 lu12i.w $a2, -5 ori $a2, $a2, 512 lu12i.w $a5, 4 @@ -21984,8 +22409,8 @@ _Z61benchForTruncOrZextVecInLoopWithVW8From_uint16_t_To_uint32_t_RN9benchmark5St #APP #NO_APP #MEMBARRIER - ld.d $a7, $sp, 72 - ld.d $a6, $sp, 64 # 8-byte Folded Reload + ld.d $a7, $sp, 56 + ld.d $a6, $sp, 48 # 8-byte Folded Reload add.d $a6, $a7, $a6 addi.d $a7, $a7, 64 move 
$t0, $a2 @@ -21998,22 +22423,26 @@ _Z61benchForTruncOrZextVecInLoopWithVW8From_uint16_t_To_uint32_t_RN9benchmark5St vldx $vr1, $t1, $a4 vldx $vr2, $t1, $a5 vldx $vr3, $t1, $s4 - vilvl.h $vr4, $vr8, $vr0 - vilvh.h $vr0, $vr8, $vr0 - vilvl.h $vr5, $vr8, $vr1 - vilvh.h $vr1, $vr8, $vr1 - vilvl.h $vr6, $vr8, $vr2 - vilvh.h $vr2, $vr8, $vr2 - vilvl.h $vr7, $vr8, $vr3 - vilvh.h $vr3, $vr8, $vr3 - vst $vr0, $a7, -48 - vst $vr4, $a7, -64 - vst $vr1, $a7, -16 - vst $vr5, $a7, -32 - vst $vr2, $a7, 16 - vst $vr6, $a7, 0 - vst $vr3, $a7, 48 - vst $vr7, $a7, 32 + vbsrl.v $vr4, $vr0, 8 + vsllwil.wu.hu $vr4, $vr4, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vbsrl.v $vr5, $vr1, 8 + vsllwil.wu.hu $vr5, $vr5, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vbsrl.v $vr6, $vr2, 8 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vbsrl.v $vr7, $vr3, 8 + vsllwil.wu.hu $vr7, $vr7, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vst $vr0, $a7, -64 + vst $vr4, $a7, -48 + vst $vr1, $a7, -32 + vst $vr5, $a7, -16 + vst $vr2, $a7, 0 + vst $vr6, $a7, 16 + vst $vr3, $a7, 32 + vst $vr7, $a7, 48 addi.d $t0, $t0, 64 addi.d $a7, $a7, 128 bnez $t0, .LBB54_17 @@ -22055,15 +22484,15 @@ _Z61benchForTruncOrZextVecInLoopWithVW8From_uint16_t_To_uint32_t_RN9benchmark5St bnez $s7, .LBB54_16 .LBB54_19: # %_ZN9benchmark5State3endEv.exit._crit_edge .Ltmp1187: # EH_LABEL - ld.d $a0, $sp, 40 # 8-byte Folded Reload + ld.d $a0, $sp, 24 # 8-byte Folded Reload pcaddu18i $ra, %call36(_ZN9benchmark5State17FinishKeepRunningEv) jirl $ra, $ra, 0 .Ltmp1188: # EH_LABEL # %bb.20: # %_ZNSt10unique_ptrIA_jSt14default_deleteIS0_EED2Ev.exit36 - ld.d $a0, $sp, 56 # 8-byte Folded Reload + ld.d $a0, $sp, 40 # 8-byte Folded Reload pcaddu18i $ra, %call36(_ZdaPv) jirl $ra, $ra, 0 - ld.d $a0, $sp, 72 + ld.d $a0, $sp, 56 beqz $a0, .LBB54_22 # %bb.21: # %_ZNKSt14default_deleteIA_jEclIjEENSt9enable_ifIXsr14is_convertibleIPA_T_PS0_EE5valueEvE4typeEPS4_.exit.i38 pcaddu18i $ra, %call36(_ZdaPv) @@ -22072,18 +22501,18 @@ _Z61benchForTruncOrZextVecInLoopWithVW8From_uint16_t_To_uint32_t_RN9benchmark5St move $a0, $fp pcaddu18i $ra, %call36(_ZdaPv) jirl $ra, $ra, 0 - ld.d $s8, $sp, 88 # 8-byte Folded Reload - ld.d $s7, $sp, 96 # 8-byte Folded Reload - ld.d $s6, $sp, 104 # 8-byte Folded Reload - ld.d $s5, $sp, 112 # 8-byte Folded Reload - ld.d $s4, $sp, 120 # 8-byte Folded Reload - ld.d $s3, $sp, 128 # 8-byte Folded Reload - ld.d $s2, $sp, 136 # 8-byte Folded Reload - ld.d $s1, $sp, 144 # 8-byte Folded Reload - ld.d $s0, $sp, 152 # 8-byte Folded Reload - ld.d $fp, $sp, 160 # 8-byte Folded Reload - ld.d $ra, $sp, 168 # 8-byte Folded Reload - addi.d $sp, $sp, 176 + ld.d $s8, $sp, 72 # 8-byte Folded Reload + ld.d $s7, $sp, 80 # 8-byte Folded Reload + ld.d $s6, $sp, 88 # 8-byte Folded Reload + ld.d $s5, $sp, 96 # 8-byte Folded Reload + ld.d $s4, $sp, 104 # 8-byte Folded Reload + ld.d $s3, $sp, 112 # 8-byte Folded Reload + ld.d $s2, $sp, 120 # 8-byte Folded Reload + ld.d $s1, $sp, 128 # 8-byte Folded Reload + ld.d $s0, $sp, 136 # 8-byte Folded Reload + ld.d $fp, $sp, 144 # 8-byte Folded Reload + ld.d $ra, $sp, 152 # 8-byte Folded Reload + addi.d $sp, $sp, 160 ret .LBB54_23: .Ltmp1170: # EH_LABEL @@ -22117,7 +22546,7 @@ _Z61benchForTruncOrZextVecInLoopWithVW8From_uint16_t_To_uint32_t_RN9benchmark5St jirl $ra, $ra, 0 .Ltmp1175: # EH_LABEL # %bb.26: # %_ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc.exit18 - ld.d $a0, $sp, 48 # 8-byte Folded Reload + ld.d $a0, $sp, 32 # 8-byte Folded Reload ldx.wu $a1, $a0, $s0 .Ltmp1176: # EH_LABEL move $a0, $s1 @@ -22135,7 +22564,7 @@ 
_Z61benchForTruncOrZextVecInLoopWithVW8From_uint16_t_To_uint32_t_RN9benchmark5St jirl $ra, $ra, 0 .Ltmp1179: # EH_LABEL # %bb.28: # %_ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc.exit21 - ld.d $a0, $sp, 56 # 8-byte Folded Reload + ld.d $a0, $sp, 40 # 8-byte Folded Reload ldx.wu $a1, $a0, $s0 .Ltmp1180: # EH_LABEL move $a0, $s1 @@ -22176,10 +22605,10 @@ _Z61benchForTruncOrZextVecInLoopWithVW8From_uint16_t_To_uint32_t_RN9benchmark5St .Ltmp1169: # EH_LABEL .LBB54_36: move $s1, $a0 - ld.d $a0, $sp, 56 # 8-byte Folded Reload + ld.d $a0, $sp, 40 # 8-byte Folded Reload pcaddu18i $ra, %call36(_ZdaPv) jirl $ra, $ra, 0 - ld.d $a0, $sp, 72 + ld.d $a0, $sp, 56 move $s2, $a0 beqz $a0, .LBB54_38 .LBB54_37: # %_ZNKSt14default_deleteIA_jEclIjEENSt9enable_ifIXsr14is_convertibleIPA_T_PS0_EE5valueEvE4typeEPS4_.exit.i31 @@ -22250,19 +22679,19 @@ _Z62benchForTruncOrZextVecInLoopWithVW16From_uint16_t_To_uint32_t_RN9benchmark5S .cfi_personality 155, DW.ref.__gxx_personality_v0 .cfi_lsda 27, .Lexception55 # %bb.0: - addi.d $sp, $sp, -176 - .cfi_def_cfa_offset 176 - st.d $ra, $sp, 168 # 8-byte Folded Spill - st.d $fp, $sp, 160 # 8-byte Folded Spill - st.d $s0, $sp, 152 # 8-byte Folded Spill - st.d $s1, $sp, 144 # 8-byte Folded Spill - st.d $s2, $sp, 136 # 8-byte Folded Spill - st.d $s3, $sp, 128 # 8-byte Folded Spill - st.d $s4, $sp, 120 # 8-byte Folded Spill - st.d $s5, $sp, 112 # 8-byte Folded Spill - st.d $s6, $sp, 104 # 8-byte Folded Spill - st.d $s7, $sp, 96 # 8-byte Folded Spill - st.d $s8, $sp, 88 # 8-byte Folded Spill + addi.d $sp, $sp, -160 + .cfi_def_cfa_offset 160 + st.d $ra, $sp, 152 # 8-byte Folded Spill + st.d $fp, $sp, 144 # 8-byte Folded Spill + st.d $s0, $sp, 136 # 8-byte Folded Spill + st.d $s1, $sp, 128 # 8-byte Folded Spill + st.d $s2, $sp, 120 # 8-byte Folded Spill + st.d $s3, $sp, 112 # 8-byte Folded Spill + st.d $s4, $sp, 104 # 8-byte Folded Spill + st.d $s5, $sp, 96 # 8-byte Folded Spill + st.d $s6, $sp, 88 # 8-byte Folded Spill + st.d $s7, $sp, 80 # 8-byte Folded Spill + st.d $s8, $sp, 72 # 8-byte Folded Spill .cfi_offset 1, -8 .cfi_offset 22, -16 .cfi_offset 23, -24 @@ -22290,22 +22719,22 @@ _Z62benchForTruncOrZextVecInLoopWithVW16From_uint16_t_To_uint32_t_RN9benchmark5S .Ltmp1191: # EH_LABEL # %bb.1: # %.split move $s2, $a0 - st.d $a0, $sp, 72 + st.d $a0, $sp, 56 .Ltmp1193: # EH_LABEL move $a0, $s0 pcaddu18i $ra, %call36(_Znam) jirl $ra, $ra, 0 - st.d $a0, $sp, 56 # 8-byte Folded Spill + st.d $a0, $sp, 40 # 8-byte Folded Spill .Ltmp1194: # EH_LABEL # %bb.2: ori $a0, $s6, 3586 add.d $s5, $fp, $a0 ori $a0, $s1, 3072 - st.d $a0, $sp, 64 # 8-byte Folded Spill + st.d $a0, $sp, 48 # 8-byte Folded Spill add.d $s1, $s2, $a0 lu12i.w $a0, -16 lu32i.d $a0, 0 - st.w $a0, $sp, 84 + st.w $a0, $sp, 68 lu12i.w $s8, -5 ori $s0, $s8, 480 pcalau12i $a0, %pc_hi20(_ZL3rng) @@ -22313,8 +22742,8 @@ _Z62benchForTruncOrZextVecInLoopWithVW16From_uint16_t_To_uint32_t_RN9benchmark5S .p2align 4, , 16 .LBB55_3: # =>This Inner Loop Header: Depth=1 .Ltmp1196: # EH_LABEL - addi.d $a0, $sp, 84 - addi.d $a2, $sp, 84 + addi.d $a0, $sp, 68 + addi.d $a2, $sp, 68 move $a1, $s4 pcaddu18i $ra, %call36(_ZNSt24uniform_int_distributionItEclISt23mersenne_twister_engineImLm32ELm624ELm397ELm31ELm2567483615ELm11ELm4294967295ELm7ELm2636928640ELm15ELm4022730752ELm18ELm1812433253EEEEtRT_RKNS0_10param_typeE) jirl $ra, $ra, 0 @@ -22326,7 +22755,7 @@ _Z62benchForTruncOrZextVecInLoopWithVW16From_uint16_t_To_uint32_t_RN9benchmark5S stx.h $a0, $a1, $s3 bnez $s0, .LBB55_3 # %bb.5: # %vector.ph - ld.d $a0, $sp, 56 # 8-byte Folded 
Reload + ld.d $a0, $sp, 40 # 8-byte Folded Reload addi.d $a0, $a0, 4 ori $a1, $s8, 480 ori $a2, $s6, 3618 @@ -22352,7 +22781,6 @@ _Z62benchForTruncOrZextVecInLoopWithVW16From_uint16_t_To_uint32_t_RN9benchmark5S ori $a4, $s6, 3648 ori $a5, $s6, 3696 ori $a6, $s6, 3680 - vrepli.b $vr16, 0 .p2align 4, , 16 .LBB55_8: # %vector.body85 # =>This Inner Loop Header: Depth=1 @@ -22365,38 +22793,46 @@ _Z62benchForTruncOrZextVecInLoopWithVW16From_uint16_t_To_uint32_t_RN9benchmark5S vldx $vr5, $a7, $a4 vldx $vr6, $a7, $a5 vldx $vr7, $a7, $a6 - vilvl.h $vr8, $vr16, $vr1 - vilvh.h $vr1, $vr16, $vr1 - vilvl.h $vr9, $vr16, $vr0 - vilvh.h $vr0, $vr16, $vr0 - vilvl.h $vr10, $vr16, $vr3 - vilvh.h $vr3, $vr16, $vr3 - vilvl.h $vr11, $vr16, $vr2 - vilvh.h $vr2, $vr16, $vr2 - vilvl.h $vr12, $vr16, $vr5 - vilvh.h $vr5, $vr16, $vr5 - vilvl.h $vr13, $vr16, $vr4 - vilvh.h $vr4, $vr16, $vr4 - vilvl.h $vr14, $vr16, $vr7 - vilvh.h $vr7, $vr16, $vr7 - vilvl.h $vr15, $vr16, $vr6 - vilvh.h $vr6, $vr16, $vr6 - vst $vr0, $a0, -80 - vst $vr9, $a0, -96 - vst $vr1, $a0, -112 - vst $vr8, $a0, -128 - vst $vr2, $a0, -16 - vst $vr11, $a0, -32 - vst $vr3, $a0, -48 - vst $vr10, $a0, -64 - vst $vr4, $a0, 48 - vst $vr13, $a0, 32 - vst $vr5, $a0, 16 - vst $vr12, $a0, 0 - vst $vr6, $a0, 112 - vst $vr15, $a0, 96 - vst $vr7, $a0, 80 - vst $vr14, $a0, 64 + vbsrl.v $vr8, $vr1, 8 + vsllwil.wu.hu $vr8, $vr8, 0 + vbsrl.v $vr9, $vr0, 8 + vsllwil.wu.hu $vr9, $vr9, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vbsrl.v $vr10, $vr3, 8 + vsllwil.wu.hu $vr10, $vr10, 0 + vbsrl.v $vr11, $vr2, 8 + vsllwil.wu.hu $vr11, $vr11, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vbsrl.v $vr12, $vr5, 8 + vsllwil.wu.hu $vr12, $vr12, 0 + vbsrl.v $vr13, $vr4, 8 + vsllwil.wu.hu $vr13, $vr13, 0 + vsllwil.wu.hu $vr5, $vr5, 0 + vsllwil.wu.hu $vr4, $vr4, 0 + vbsrl.v $vr14, $vr7, 8 + vsllwil.wu.hu $vr14, $vr14, 0 + vbsrl.v $vr15, $vr6, 8 + vsllwil.wu.hu $vr15, $vr15, 0 + vsllwil.wu.hu $vr7, $vr7, 0 + vsllwil.wu.hu $vr6, $vr6, 0 + vst $vr0, $a0, -96 + vst $vr1, $a0, -128 + vst $vr9, $a0, -80 + vst $vr8, $a0, -112 + vst $vr2, $a0, -32 + vst $vr3, $a0, -64 + vst $vr11, $a0, -16 + vst $vr10, $a0, -48 + vst $vr4, $a0, 32 + vst $vr5, $a0, 0 + vst $vr13, $a0, 48 + vst $vr12, $a0, 16 + vst $vr6, $a0, 96 + vst $vr7, $a0, 64 + vst $vr15, $a0, 112 + vst $vr14, $a0, 80 addi.d $a1, $a1, 128 addi.d $a0, $a0, 256 bnez $a1, .LBB55_8 @@ -22449,7 +22885,7 @@ _Z62benchForTruncOrZextVecInLoopWithVW16From_uint16_t_To_uint32_t_RN9benchmark5S ld.d $a0, $sp, 32 # 8-byte Folded Reload ldx.w $a0, $a0, $a1 move $a2, $a1 - ld.d $a1, $sp, 56 # 8-byte Folded Reload + ld.d $a1, $sp, 40 # 8-byte Folded Reload ldx.w $a1, $a1, $a2 bne $a0, $a1, .LBB55_23 # %bb.11: # %_ZL28truncOrZextVecInLoopWithVW16ItjEvPKT_PT0_i.exit @@ -22458,26 +22894,24 @@ _Z62benchForTruncOrZextVecInLoopWithVW16From_uint16_t_To_uint32_t_RN9benchmark5S addi.d $a1, $a2, 4 bnez $s8, .LBB55_10 # %bb.12: - st.d $a5, $sp, 16 # 8-byte Folded Spill - vst $vr16, $sp, 32 # 16-byte Folded Spill + st.d $a5, $sp, 32 # 8-byte Folded Spill ld.d $a0, $sp, 24 # 8-byte Folded Reload ld.w $a1, $a0, 28 - st.d $a1, $sp, 8 # 8-byte Folded Spill + st.d $a1, $sp, 16 # 8-byte Folded Spill ld.d $s8, $a0, 16 .Ltmp1214: # EH_LABEL pcaddu18i $ra, %call36(_ZN9benchmark5State16StartKeepRunningEv) jirl $ra, $ra, 0 .Ltmp1215: # EH_LABEL # %bb.13: # %_ZN9benchmark5State3endEv.exit.preheader - vld $vr16, $sp, 32 # 16-byte Folded Reload - ld.d $t5, $sp, 16 # 8-byte Folded Reload - ld.d $a0, $sp, 8 # 8-byte Folded Reload + ld.d $t5, 
$sp, 32 # 8-byte Folded Reload + ld.d $a0, $sp, 16 # 8-byte Folded Reload bnez $a0, .LBB55_19 # %bb.14: # %_ZN9benchmark5State3endEv.exit.preheader beqz $s8, .LBB55_19 # %bb.15: # %.lr.ph.preheader addi.d $a0, $fp, 96 - addi.d $a1, $sp, 72 + addi.d $a1, $sp, 56 lu12i.w $a2, -5 ori $a2, $a2, 512 lu12i.w $t0, 4 @@ -22494,8 +22928,8 @@ _Z62benchForTruncOrZextVecInLoopWithVW16From_uint16_t_To_uint32_t_RN9benchmark5S #APP #NO_APP #MEMBARRIER - ld.d $t2, $sp, 72 - ld.d $t1, $sp, 64 # 8-byte Folded Reload + ld.d $t2, $sp, 56 + ld.d $t1, $sp, 48 # 8-byte Folded Reload add.d $t1, $t2, $t1 addi.d $t2, $t2, 128 move $t3, $a2 @@ -22512,38 +22946,46 @@ _Z62benchForTruncOrZextVecInLoopWithVW16From_uint16_t_To_uint32_t_RN9benchmark5S vldx $vr5, $t4, $t0 vldx $vr6, $t4, $s4 vldx $vr7, $t4, $s7 - vilvl.h $vr8, $vr16, $vr1 - vilvh.h $vr1, $vr16, $vr1 - vilvl.h $vr9, $vr16, $vr0 - vilvh.h $vr0, $vr16, $vr0 - vilvl.h $vr10, $vr16, $vr3 - vilvh.h $vr3, $vr16, $vr3 - vilvl.h $vr11, $vr16, $vr2 - vilvh.h $vr2, $vr16, $vr2 - vilvl.h $vr12, $vr16, $vr5 - vilvh.h $vr5, $vr16, $vr5 - vilvl.h $vr13, $vr16, $vr4 - vilvh.h $vr4, $vr16, $vr4 - vilvl.h $vr14, $vr16, $vr7 - vilvh.h $vr7, $vr16, $vr7 - vilvl.h $vr15, $vr16, $vr6 - vilvh.h $vr6, $vr16, $vr6 - vst $vr0, $t2, -80 - vst $vr9, $t2, -96 - vst $vr1, $t2, -112 - vst $vr8, $t2, -128 - vst $vr2, $t2, -16 - vst $vr11, $t2, -32 - vst $vr3, $t2, -48 - vst $vr10, $t2, -64 - vst $vr4, $t2, 48 - vst $vr13, $t2, 32 - vst $vr5, $t2, 16 - vst $vr12, $t2, 0 - vst $vr6, $t2, 112 - vst $vr15, $t2, 96 - vst $vr7, $t2, 80 - vst $vr14, $t2, 64 + vbsrl.v $vr8, $vr1, 8 + vsllwil.wu.hu $vr8, $vr8, 0 + vbsrl.v $vr9, $vr0, 8 + vsllwil.wu.hu $vr9, $vr9, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vbsrl.v $vr10, $vr3, 8 + vsllwil.wu.hu $vr10, $vr10, 0 + vbsrl.v $vr11, $vr2, 8 + vsllwil.wu.hu $vr11, $vr11, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vbsrl.v $vr12, $vr5, 8 + vsllwil.wu.hu $vr12, $vr12, 0 + vbsrl.v $vr13, $vr4, 8 + vsllwil.wu.hu $vr13, $vr13, 0 + vsllwil.wu.hu $vr5, $vr5, 0 + vsllwil.wu.hu $vr4, $vr4, 0 + vbsrl.v $vr14, $vr7, 8 + vsllwil.wu.hu $vr14, $vr14, 0 + vbsrl.v $vr15, $vr6, 8 + vsllwil.wu.hu $vr15, $vr15, 0 + vsllwil.wu.hu $vr7, $vr7, 0 + vsllwil.wu.hu $vr6, $vr6, 0 + vst $vr0, $t2, -96 + vst $vr1, $t2, -128 + vst $vr9, $t2, -80 + vst $vr8, $t2, -112 + vst $vr2, $t2, -32 + vst $vr3, $t2, -64 + vst $vr11, $t2, -16 + vst $vr10, $t2, -48 + vst $vr4, $t2, 32 + vst $vr5, $t2, 0 + vst $vr13, $t2, 48 + vst $vr12, $t2, 16 + vst $vr6, $t2, 96 + vst $vr7, $t2, 64 + vst $vr15, $t2, 112 + vst $vr14, $t2, 80 addi.d $t3, $t3, 128 addi.d $t2, $t2, 256 bnez $t3, .LBB55_17 @@ -22590,10 +23032,10 @@ _Z62benchForTruncOrZextVecInLoopWithVW16From_uint16_t_To_uint32_t_RN9benchmark5S jirl $ra, $ra, 0 .Ltmp1217: # EH_LABEL # %bb.20: # %_ZNSt10unique_ptrIA_jSt14default_deleteIS0_EED2Ev.exit36 - ld.d $a0, $sp, 56 # 8-byte Folded Reload + ld.d $a0, $sp, 40 # 8-byte Folded Reload pcaddu18i $ra, %call36(_ZdaPv) jirl $ra, $ra, 0 - ld.d $a0, $sp, 72 + ld.d $a0, $sp, 56 beqz $a0, .LBB55_22 # %bb.21: # %_ZNKSt14default_deleteIA_jEclIjEENSt9enable_ifIXsr14is_convertibleIPA_T_PS0_EE5valueEvE4typeEPS4_.exit.i38 pcaddu18i $ra, %call36(_ZdaPv) @@ -22602,18 +23044,18 @@ _Z62benchForTruncOrZextVecInLoopWithVW16From_uint16_t_To_uint32_t_RN9benchmark5S move $a0, $fp pcaddu18i $ra, %call36(_ZdaPv) jirl $ra, $ra, 0 - ld.d $s8, $sp, 88 # 8-byte Folded Reload - ld.d $s7, $sp, 96 # 8-byte Folded Reload - ld.d $s6, $sp, 104 # 8-byte Folded Reload - ld.d $s5, $sp, 112 # 
8-byte Folded Reload - ld.d $s4, $sp, 120 # 8-byte Folded Reload - ld.d $s3, $sp, 128 # 8-byte Folded Reload - ld.d $s2, $sp, 136 # 8-byte Folded Reload - ld.d $s1, $sp, 144 # 8-byte Folded Reload - ld.d $s0, $sp, 152 # 8-byte Folded Reload - ld.d $fp, $sp, 160 # 8-byte Folded Reload - ld.d $ra, $sp, 168 # 8-byte Folded Reload - addi.d $sp, $sp, 176 + ld.d $s8, $sp, 72 # 8-byte Folded Reload + ld.d $s7, $sp, 80 # 8-byte Folded Reload + ld.d $s6, $sp, 88 # 8-byte Folded Reload + ld.d $s5, $sp, 96 # 8-byte Folded Reload + ld.d $s4, $sp, 104 # 8-byte Folded Reload + ld.d $s3, $sp, 112 # 8-byte Folded Reload + ld.d $s2, $sp, 120 # 8-byte Folded Reload + ld.d $s1, $sp, 128 # 8-byte Folded Reload + ld.d $s0, $sp, 136 # 8-byte Folded Reload + ld.d $fp, $sp, 144 # 8-byte Folded Reload + ld.d $ra, $sp, 152 # 8-byte Folded Reload + addi.d $sp, $sp, 160 ret .LBB55_23: .Ltmp1199: # EH_LABEL @@ -22665,7 +23107,7 @@ _Z62benchForTruncOrZextVecInLoopWithVW16From_uint16_t_To_uint32_t_RN9benchmark5S jirl $ra, $ra, 0 .Ltmp1208: # EH_LABEL # %bb.28: # %_ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc.exit21 - ld.d $a0, $sp, 56 # 8-byte Folded Reload + ld.d $a0, $sp, 40 # 8-byte Folded Reload ldx.wu $a1, $a0, $s0 .Ltmp1209: # EH_LABEL move $a0, $s1 @@ -22706,10 +23148,10 @@ _Z62benchForTruncOrZextVecInLoopWithVW16From_uint16_t_To_uint32_t_RN9benchmark5S .Ltmp1198: # EH_LABEL .LBB55_36: move $s1, $a0 - ld.d $a0, $sp, 56 # 8-byte Folded Reload + ld.d $a0, $sp, 40 # 8-byte Folded Reload pcaddu18i $ra, %call36(_ZdaPv) jirl $ra, $ra, 0 - ld.d $a0, $sp, 72 + ld.d $a0, $sp, 56 move $s2, $a0 beqz $a0, .LBB55_38 .LBB55_37: # %_ZNKSt14default_deleteIA_jEclIjEENSt9enable_ifIXsr14is_convertibleIPA_T_PS0_EE5valueEvE4typeEPS4_.exit.i31 @@ -22780,18 +23222,18 @@ _Z54benchForTruncOrZextVecInLoopFrom_uint16_t_To_uint32_t_RN9benchmark5StateE: # .cfi_personality 155, DW.ref.__gxx_personality_v0 .cfi_lsda 27, .Lexception56 # %bb.0: - addi.d $sp, $sp, -128 - .cfi_def_cfa_offset 128 - st.d $ra, $sp, 120 # 8-byte Folded Spill - st.d $fp, $sp, 112 # 8-byte Folded Spill - st.d $s0, $sp, 104 # 8-byte Folded Spill - st.d $s1, $sp, 96 # 8-byte Folded Spill - st.d $s2, $sp, 88 # 8-byte Folded Spill - st.d $s3, $sp, 80 # 8-byte Folded Spill - st.d $s4, $sp, 72 # 8-byte Folded Spill - st.d $s5, $sp, 64 # 8-byte Folded Spill - st.d $s6, $sp, 56 # 8-byte Folded Spill - st.d $s7, $sp, 48 # 8-byte Folded Spill + addi.d $sp, $sp, -112 + .cfi_def_cfa_offset 112 + st.d $ra, $sp, 104 # 8-byte Folded Spill + st.d $fp, $sp, 96 # 8-byte Folded Spill + st.d $s0, $sp, 88 # 8-byte Folded Spill + st.d $s1, $sp, 80 # 8-byte Folded Spill + st.d $s2, $sp, 72 # 8-byte Folded Spill + st.d $s3, $sp, 64 # 8-byte Folded Spill + st.d $s4, $sp, 56 # 8-byte Folded Spill + st.d $s5, $sp, 48 # 8-byte Folded Spill + st.d $s6, $sp, 40 # 8-byte Folded Spill + st.d $s7, $sp, 32 # 8-byte Folded Spill .cfi_offset 1, -8 .cfi_offset 22, -16 .cfi_offset 23, -24 @@ -22818,7 +23260,7 @@ _Z54benchForTruncOrZextVecInLoopFrom_uint16_t_To_uint32_t_RN9benchmark5StateE: # .Ltmp1220: # EH_LABEL # %bb.1: move $s2, $a0 - st.d $a0, $sp, 32 + st.d $a0, $sp, 16 .Ltmp1222: # EH_LABEL move $a0, $s0 pcaddu18i $ra, %call36(_Znam) @@ -22828,7 +23270,7 @@ _Z54benchForTruncOrZextVecInLoopFrom_uint16_t_To_uint32_t_RN9benchmark5StateE: # move $s0, $a0 lu12i.w $a0, -16 lu32i.d $a0, 0 - st.w $a0, $sp, 44 + st.w $a0, $sp, 28 lu12i.w $s5, -5 ori $s7, $s5, 480 pcalau12i $a0, %pc_hi20(_ZL3rng) @@ -22836,8 +23278,8 @@ _Z54benchForTruncOrZextVecInLoopFrom_uint16_t_To_uint32_t_RN9benchmark5StateE: # 
.p2align 4, , 16 .LBB56_3: # =>This Inner Loop Header: Depth=1 .Ltmp1225: # EH_LABEL - addi.d $a0, $sp, 44 - addi.d $a2, $sp, 44 + addi.d $a0, $sp, 28 + addi.d $a2, $sp, 28 move $a1, $s4 pcaddu18i $ra, %call36(_ZNSt24uniform_int_distributionItEclISt23mersenne_twister_engineImLm32ELm624ELm397ELm31ELm2567483615ELm11ELm4294967295ELm7ELm2636928640ELm15ELm4022730752ELm18ELm1812433253EEEEtRT_RKNS0_10param_typeE) jirl $ra, $ra, 0 @@ -22866,7 +23308,6 @@ _Z54benchForTruncOrZextVecInLoopFrom_uint16_t_To_uint32_t_RN9benchmark5StateE: # # %bb.7: # %vector.body85.preheader addi.d $a0, $s2, 16 ori $a1, $s5, 480 - vrepli.b $vr2, 0 .p2align 4, , 16 .LBB56_8: # %vector.body85 # =>This Inner Loop Header: Depth=1 @@ -22875,8 +23316,8 @@ _Z54benchForTruncOrZextVecInLoopFrom_uint16_t_To_uint32_t_RN9benchmark5StateE: # ldptr.d $a2, $a2, 20008 vinsgr2vr.d $vr0, $a3, 0 vinsgr2vr.d $vr1, $a2, 0 - vilvl.h $vr0, $vr2, $vr0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.wu.hu $vr1, $vr1, 0 vst $vr0, $a0, -16 vst $vr1, $a0, 0 addi.d $a1, $a1, 16 @@ -22897,7 +23338,6 @@ _Z54benchForTruncOrZextVecInLoopFrom_uint16_t_To_uint32_t_RN9benchmark5StateE: # addi.d $s4, $s4, 4 bnez $s6, .LBB56_10 # %bb.12: - vst $vr2, $sp, 16 # 16-byte Folded Spill ld.w $s3, $s1, 28 ld.d $s2, $s1, 16 .Ltmp1243: # EH_LABEL @@ -22906,12 +23346,11 @@ _Z54benchForTruncOrZextVecInLoopFrom_uint16_t_To_uint32_t_RN9benchmark5StateE: # jirl $ra, $ra, 0 .Ltmp1244: # EH_LABEL # %bb.13: # %_ZN9benchmark5State3endEv.exit.preheader - vld $vr2, $sp, 16 # 16-byte Folded Reload bnez $s3, .LBB56_19 # %bb.14: # %_ZN9benchmark5State3endEv.exit.preheader beqz $s2, .LBB56_19 # %bb.15: - addi.d $a0, $sp, 32 + addi.d $a0, $sp, 16 ori $a1, $s5, 480 .p2align 4, , 16 .LBB56_16: # %.lr.ph @@ -22920,7 +23359,7 @@ _Z54benchForTruncOrZextVecInLoopFrom_uint16_t_To_uint32_t_RN9benchmark5StateE: # #APP #NO_APP #MEMBARRIER - ld.d $a2, $sp, 32 + ld.d $a2, $sp, 16 addi.d $a2, $a2, 16 move $a3, $a1 .p2align 4, , 16 @@ -22932,8 +23371,8 @@ _Z54benchForTruncOrZextVecInLoopFrom_uint16_t_To_uint32_t_RN9benchmark5StateE: # ldptr.d $a4, $a4, 20008 vinsgr2vr.d $vr0, $a5, 0 vinsgr2vr.d $vr1, $a4, 0 - vilvl.h $vr0, $vr2, $vr0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.wu.hu $vr1, $vr1, 0 vst $vr0, $a2, -16 vst $vr1, $a2, 0 addi.d $a3, $a3, 16 @@ -22953,7 +23392,7 @@ _Z54benchForTruncOrZextVecInLoopFrom_uint16_t_To_uint32_t_RN9benchmark5StateE: # move $a0, $s0 pcaddu18i $ra, %call36(_ZdaPv) jirl $ra, $ra, 0 - ld.d $a0, $sp, 32 + ld.d $a0, $sp, 16 beqz $a0, .LBB56_22 # %bb.21: # %_ZNKSt14default_deleteIA_jEclIjEENSt9enable_ifIXsr14is_convertibleIPA_T_PS0_EE5valueEvE4typeEPS4_.exit.i38 pcaddu18i $ra, %call36(_ZdaPv) @@ -22962,17 +23401,17 @@ _Z54benchForTruncOrZextVecInLoopFrom_uint16_t_To_uint32_t_RN9benchmark5StateE: # move $a0, $fp pcaddu18i $ra, %call36(_ZdaPv) jirl $ra, $ra, 0 - ld.d $s7, $sp, 48 # 8-byte Folded Reload - ld.d $s6, $sp, 56 # 8-byte Folded Reload - ld.d $s5, $sp, 64 # 8-byte Folded Reload - ld.d $s4, $sp, 72 # 8-byte Folded Reload - ld.d $s3, $sp, 80 # 8-byte Folded Reload - ld.d $s2, $sp, 88 # 8-byte Folded Reload - ld.d $s1, $sp, 96 # 8-byte Folded Reload - ld.d $s0, $sp, 104 # 8-byte Folded Reload - ld.d $fp, $sp, 112 # 8-byte Folded Reload - ld.d $ra, $sp, 120 # 8-byte Folded Reload - addi.d $sp, $sp, 128 + ld.d $s7, $sp, 32 # 8-byte Folded Reload + ld.d $s6, $sp, 40 # 8-byte Folded Reload + ld.d $s5, $sp, 48 # 8-byte Folded Reload + ld.d $s4, $sp, 56 # 8-byte Folded Reload + ld.d $s3, $sp, 64 # 8-byte Folded Reload + ld.d 
$s2, $sp, 72 # 8-byte Folded Reload + ld.d $s1, $sp, 80 # 8-byte Folded Reload + ld.d $s0, $sp, 88 # 8-byte Folded Reload + ld.d $fp, $sp, 96 # 8-byte Folded Reload + ld.d $ra, $sp, 104 # 8-byte Folded Reload + addi.d $sp, $sp, 112 ret .LBB56_23: .Ltmp1228: # EH_LABEL @@ -23064,7 +23503,7 @@ _Z54benchForTruncOrZextVecInLoopFrom_uint16_t_To_uint32_t_RN9benchmark5StateE: # move $a0, $s0 pcaddu18i $ra, %call36(_ZdaPv) jirl $ra, $ra, 0 - ld.d $s2, $sp, 32 + ld.d $s2, $sp, 16 beqz $s2, .LBB56_38 .LBB56_37: # %_ZNKSt14default_deleteIA_jEclIjEENSt9enable_ifIXsr14is_convertibleIPA_T_PS0_EE5valueEvE4typeEPS4_.exit.i31 move $a0, $s2 @@ -23242,7 +23681,6 @@ _Z68benchForTruncOrZextVecWithAddInLoopWithVW8From_uint16_t_To_uint32_t_RN9bench ori $a5, $s6, 3552 ori $a6, $s6, 3568 ori $a7, $s6, 3584 - vrepli.b $vr0, 0 ori $t0, $s6, 3588 ori $t1, $s6, 3592 ori $t2, $s6, 3596 @@ -23266,42 +23704,46 @@ _Z68benchForTruncOrZextVecWithAddInLoopWithVW8From_uint16_t_To_uint32_t_RN9bench # Parent Loop BB57_11 Depth=1 # => This Inner Loop Header: Depth=2 add.d $s3, $a0, $s2 - vldx $vr1, $s3, $a4 - vldx $vr2, $s3, $a5 - vldx $vr3, $s3, $a6 - vldx $vr4, $s3, $a7 - vilvl.h $vr5, $vr0, $vr1 - vilvh.h $vr1, $vr0, $vr1 - vilvl.h $vr6, $vr0, $vr2 - vilvh.h $vr2, $vr0, $vr2 - vilvl.h $vr7, $vr0, $vr3 - vilvh.h $vr3, $vr0, $vr3 - vilvl.h $vr8, $vr0, $vr4 - vilvh.h $vr4, $vr0, $vr4 - vld $vr9, $t8, -64 - vld $vr10, $t8, -48 - vld $vr11, $t8, -32 - vld $vr12, $t8, -16 - vld $vr13, $t8, 0 - vld $vr14, $t8, 16 - vld $vr15, $t8, 32 - vld $vr16, $t8, 48 - vadd.w $vr1, $vr10, $vr1 - vadd.w $vr5, $vr9, $vr5 - vadd.w $vr2, $vr12, $vr2 - vadd.w $vr6, $vr11, $vr6 - vadd.w $vr3, $vr14, $vr3 - vadd.w $vr7, $vr13, $vr7 - vadd.w $vr4, $vr16, $vr4 - vadd.w $vr8, $vr15, $vr8 - vst $vr5, $t8, -64 - vst $vr1, $t8, -48 - vst $vr6, $t8, -32 - vst $vr2, $t8, -16 - vst $vr7, $t8, 0 - vst $vr3, $t8, 16 - vst $vr8, $t8, 32 - vst $vr4, $t8, 48 + vldx $vr0, $s3, $a4 + vldx $vr1, $s3, $a5 + vldx $vr2, $s3, $a6 + vldx $vr3, $s3, $a7 + vsllwil.wu.hu $vr4, $vr0, 0 + vbsrl.v $vr0, $vr0, 8 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.wu.hu $vr5, $vr1, 0 + vbsrl.v $vr1, $vr1, 8 + vsllwil.wu.hu $vr1, $vr1, 0 + vsllwil.wu.hu $vr6, $vr2, 0 + vbsrl.v $vr2, $vr2, 8 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.wu.hu $vr7, $vr3, 0 + vbsrl.v $vr3, $vr3, 8 + vsllwil.wu.hu $vr3, $vr3, 0 + vld $vr8, $t8, -64 + vld $vr9, $t8, -48 + vld $vr10, $t8, -32 + vld $vr11, $t8, -16 + vld $vr12, $t8, 0 + vld $vr13, $t8, 16 + vld $vr14, $t8, 32 + vld $vr15, $t8, 48 + vadd.w $vr0, $vr9, $vr0 + vadd.w $vr4, $vr8, $vr4 + vadd.w $vr1, $vr11, $vr1 + vadd.w $vr5, $vr10, $vr5 + vadd.w $vr2, $vr13, $vr2 + vadd.w $vr6, $vr12, $vr6 + vadd.w $vr3, $vr15, $vr3 + vadd.w $vr7, $vr14, $vr7 + vst $vr4, $t8, -64 + vst $vr0, $t8, -48 + vst $vr5, $t8, -32 + vst $vr1, $t8, -16 + vst $vr6, $t8, 0 + vst $vr2, $t8, 16 + vst $vr7, $t8, 32 + vst $vr3, $t8, 48 addi.d $s2, $s2, 64 addi.d $t8, $t8, 128 bnez $s2, .LBB57_12 @@ -23484,19 +23926,27 @@ _Z69benchForTruncOrZextVecWithAddInLoopWithVW16From_uint16_t_To_uint32_t_RN9benc .cfi_personality 155, DW.ref.__gxx_personality_v0 .cfi_lsda 27, .Lexception58 # %bb.0: - addi.d $sp, $sp, -112 - .cfi_def_cfa_offset 112 - st.d $ra, $sp, 104 # 8-byte Folded Spill - st.d $fp, $sp, 96 # 8-byte Folded Spill - st.d $s0, $sp, 88 # 8-byte Folded Spill - st.d $s1, $sp, 80 # 8-byte Folded Spill - st.d $s2, $sp, 72 # 8-byte Folded Spill - st.d $s3, $sp, 64 # 8-byte Folded Spill - st.d $s4, $sp, 56 # 8-byte Folded Spill - st.d $s5, $sp, 48 # 8-byte Folded Spill - st.d $s6, $sp, 40 # 
8-byte Folded Spill - st.d $s7, $sp, 32 # 8-byte Folded Spill - st.d $s8, $sp, 24 # 8-byte Folded Spill + addi.d $sp, $sp, -176 + .cfi_def_cfa_offset 176 + st.d $ra, $sp, 168 # 8-byte Folded Spill + st.d $fp, $sp, 160 # 8-byte Folded Spill + st.d $s0, $sp, 152 # 8-byte Folded Spill + st.d $s1, $sp, 144 # 8-byte Folded Spill + st.d $s2, $sp, 136 # 8-byte Folded Spill + st.d $s3, $sp, 128 # 8-byte Folded Spill + st.d $s4, $sp, 120 # 8-byte Folded Spill + st.d $s5, $sp, 112 # 8-byte Folded Spill + st.d $s6, $sp, 104 # 8-byte Folded Spill + st.d $s7, $sp, 96 # 8-byte Folded Spill + st.d $s8, $sp, 88 # 8-byte Folded Spill + fst.d $fs0, $sp, 80 # 8-byte Folded Spill + fst.d $fs1, $sp, 72 # 8-byte Folded Spill + fst.d $fs2, $sp, 64 # 8-byte Folded Spill + fst.d $fs3, $sp, 56 # 8-byte Folded Spill + fst.d $fs4, $sp, 48 # 8-byte Folded Spill + fst.d $fs5, $sp, 40 # 8-byte Folded Spill + fst.d $fs6, $sp, 32 # 8-byte Folded Spill + fst.d $fs7, $sp, 24 # 8-byte Folded Spill .cfi_offset 1, -8 .cfi_offset 22, -16 .cfi_offset 23, -24 @@ -23508,6 +23958,14 @@ _Z69benchForTruncOrZextVecWithAddInLoopWithVW16From_uint16_t_To_uint32_t_RN9benc .cfi_offset 29, -72 .cfi_offset 30, -80 .cfi_offset 31, -88 + .cfi_offset 56, -96 + .cfi_offset 57, -104 + .cfi_offset 58, -112 + .cfi_offset 59, -120 + .cfi_offset 60, -128 + .cfi_offset 61, -136 + .cfi_offset 62, -144 + .cfi_offset 63, -152 move $s0, $a0 lu12i.w $s6, 4 ori $s3, $s6, 3616 @@ -23596,7 +24054,6 @@ _Z69benchForTruncOrZextVecWithAddInLoopWithVW16From_uint16_t_To_uint32_t_RN9benc ori $t1, $s6, 3568 ori $t2, $s6, 3584 ori $t3, $s6, 3600 - vrepli.b $vr0, 0 ori $t4, $s6, 3588 ori $t5, $s6, 3592 ori $t6, $s6, 3596 @@ -23619,78 +24076,86 @@ _Z69benchForTruncOrZextVecWithAddInLoopWithVW16From_uint16_t_To_uint32_t_RN9benc # Parent Loop BB58_11 Depth=1 # => This Inner Loop Header: Depth=2 add.d $s7, $a0, $s6 - vldx $vr2, $s7, $a4 - vldx $vr3, $s7, $a5 - vldx $vr4, $s7, $a6 - vldx $vr5, $s7, $a7 - vldx $vr6, $s7, $t0 - vldx $vr7, $s7, $t1 - vldx $vr8, $s7, $t2 - vldx $vr1, $s7, $t3 - vilvl.h $vr9, $vr0, $vr3 - vilvh.h $vr3, $vr0, $vr3 - vilvl.h $vr10, $vr0, $vr2 - vilvh.h $vr2, $vr0, $vr2 - vilvl.h $vr11, $vr0, $vr5 - vilvh.h $vr5, $vr0, $vr5 - vilvl.h $vr12, $vr0, $vr4 - vilvh.h $vr4, $vr0, $vr4 - vilvl.h $vr13, $vr0, $vr7 - vilvh.h $vr7, $vr0, $vr7 - vilvl.h $vr14, $vr0, $vr6 - vilvh.h $vr6, $vr0, $vr6 - vilvl.h $vr15, $vr0, $vr8 - vld $vr16, $s4, -112 - vilvh.h $vr8, $vr0, $vr8 + vldx $vr0, $s7, $a4 + vldx $vr1, $s7, $a5 + vldx $vr2, $s7, $a6 + vldx $vr3, $s7, $a7 + vldx $vr4, $s7, $t0 + vldx $vr5, $s7, $t1 + vldx $vr6, $s7, $t2 + vldx $vr7, $s7, $t3 + vsllwil.wu.hu $vr8, $vr1, 0 + vsllwil.wu.hu $vr9, $vr0, 0 + vbsrl.v $vr1, $vr1, 8 + vsllwil.wu.hu $vr1, $vr1, 0 + vbsrl.v $vr0, $vr0, 8 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.wu.hu $vr10, $vr3, 0 + vsllwil.wu.hu $vr11, $vr2, 0 + vbsrl.v $vr3, $vr3, 8 + vsllwil.wu.hu $vr3, $vr3, 0 + vbsrl.v $vr2, $vr2, 8 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.wu.hu $vr12, $vr5, 0 + vsllwil.wu.hu $vr13, $vr4, 0 + vbsrl.v $vr5, $vr5, 8 + vsllwil.wu.hu $vr5, $vr5, 0 + vbsrl.v $vr4, $vr4, 8 + vsllwil.wu.hu $vr4, $vr4, 0 + vsllwil.wu.hu $vr14, $vr7, 0 + vsllwil.wu.hu $vr15, $vr6, 0 + vbsrl.v $vr7, $vr7, 8 + vsllwil.wu.hu $vr7, $vr7, 0 + vbsrl.v $vr6, $vr6, 8 + vsllwil.wu.hu $vr6, $vr6, 0 + vld $vr16, $s4, -96 vld $vr17, $s4, -128 vld $vr18, $s4, -80 - vadd.w $vr2, $vr16, $vr2 - vld $vr16, $s4, -96 - vadd.w $vr10, $vr17, $vr10 - vld $vr17, $s4, -48 - vadd.w $vr3, $vr18, $vr3 - vld $vr18, $s4, -64 - vadd.w $vr9, $vr16, $vr9 - vld $vr16, 
$s4, -16 - vadd.w $vr4, $vr17, $vr4 - vld $vr17, $s4, -32 - vadd.w $vr12, $vr18, $vr12 - vld $vr18, $s4, 16 - vadd.w $vr5, $vr16, $vr5 - vld $vr16, $s4, 0 - vadd.w $vr11, $vr17, $vr11 - vld $vr17, $s4, 48 - vadd.w $vr6, $vr18, $vr6 - vld $vr18, $s4, 32 - vadd.w $vr14, $vr16, $vr14 - vld $vr16, $s4, 80 - vadd.w $vr7, $vr17, $vr7 - vld $vr17, $s4, 64 - vadd.w $vr13, $vr18, $vr13 - vld $vr18, $s4, 112 + vld $vr19, $s4, -112 + vld $vr20, $s4, -32 + vld $vr21, $s4, -64 + vld $vr22, $s4, -16 + vld $vr23, $s4, -48 + vld $vr24, $s4, 32 + vld $vr25, $s4, 0 + vld $vr26, $s4, 48 + vld $vr27, $s4, 16 + vld $vr28, $s4, 96 + vld $vr29, $s4, 64 + vld $vr30, $s4, 112 + vld $vr31, $s4, 80 + vadd.w $vr0, $vr19, $vr0 + vadd.w $vr1, $vr18, $vr1 + vadd.w $vr9, $vr17, $vr9 vadd.w $vr8, $vr16, $vr8 - vld $vr16, $s4, 96 - vadd.w $vr15, $vr17, $vr15 - vilvh.h $vr17, $vr0, $vr1 - vadd.w $vr17, $vr18, $vr17 - vilvl.h $vr1, $vr0, $vr1 - vadd.w $vr1, $vr16, $vr1 - vst $vr9, $s4, -96 - vst $vr3, $s4, -80 - vst $vr10, $s4, -128 - vst $vr2, $s4, -112 - vst $vr11, $s4, -32 - vst $vr5, $s4, -16 - vst $vr12, $s4, -64 - vst $vr4, $s4, -48 - vst $vr13, $s4, 32 - vst $vr7, $s4, 48 - vst $vr14, $s4, 0 - vst $vr6, $s4, 16 - vst $vr1, $s4, 96 - vst $vr17, $s4, 112 + vadd.w $vr2, $vr23, $vr2 + vadd.w $vr3, $vr22, $vr3 + vadd.w $vr11, $vr21, $vr11 + vadd.w $vr10, $vr20, $vr10 + vadd.w $vr4, $vr27, $vr4 + vadd.w $vr5, $vr26, $vr5 + vadd.w $vr13, $vr25, $vr13 + vadd.w $vr12, $vr24, $vr12 + vadd.w $vr6, $vr31, $vr6 + vadd.w $vr7, $vr30, $vr7 + vadd.w $vr15, $vr29, $vr15 + vadd.w $vr14, $vr28, $vr14 + vst $vr8, $s4, -96 + vst $vr9, $s4, -128 + vst $vr1, $s4, -80 + vst $vr0, $s4, -112 + vst $vr10, $s4, -32 + vst $vr11, $s4, -64 + vst $vr3, $s4, -16 + vst $vr2, $s4, -48 + vst $vr12, $s4, 32 + vst $vr13, $s4, 0 + vst $vr5, $s4, 48 + vst $vr4, $s4, 16 + vst $vr14, $s4, 96 vst $vr15, $s4, 64 - vst $vr8, $s4, 80 + vst $vr7, $s4, 112 + vst $vr6, $s4, 80 addi.d $s6, $s6, 128 addi.d $s4, $s4, 256 bnez $s6, .LBB58_12 @@ -23778,18 +24243,26 @@ _Z69benchForTruncOrZextVecWithAddInLoopWithVW16From_uint16_t_To_uint32_t_RN9benc move $a0, $fp pcaddu18i $ra, %call36(_ZdaPv) jirl $ra, $ra, 0 - ld.d $s8, $sp, 24 # 8-byte Folded Reload - ld.d $s7, $sp, 32 # 8-byte Folded Reload - ld.d $s6, $sp, 40 # 8-byte Folded Reload - ld.d $s5, $sp, 48 # 8-byte Folded Reload - ld.d $s4, $sp, 56 # 8-byte Folded Reload - ld.d $s3, $sp, 64 # 8-byte Folded Reload - ld.d $s2, $sp, 72 # 8-byte Folded Reload - ld.d $s1, $sp, 80 # 8-byte Folded Reload - ld.d $s0, $sp, 88 # 8-byte Folded Reload - ld.d $fp, $sp, 96 # 8-byte Folded Reload - ld.d $ra, $sp, 104 # 8-byte Folded Reload - addi.d $sp, $sp, 112 + fld.d $fs7, $sp, 24 # 8-byte Folded Reload + fld.d $fs6, $sp, 32 # 8-byte Folded Reload + fld.d $fs5, $sp, 40 # 8-byte Folded Reload + fld.d $fs4, $sp, 48 # 8-byte Folded Reload + fld.d $fs3, $sp, 56 # 8-byte Folded Reload + fld.d $fs2, $sp, 64 # 8-byte Folded Reload + fld.d $fs1, $sp, 72 # 8-byte Folded Reload + fld.d $fs0, $sp, 80 # 8-byte Folded Reload + ld.d $s8, $sp, 88 # 8-byte Folded Reload + ld.d $s7, $sp, 96 # 8-byte Folded Reload + ld.d $s6, $sp, 104 # 8-byte Folded Reload + ld.d $s5, $sp, 112 # 8-byte Folded Reload + ld.d $s4, $sp, 120 # 8-byte Folded Reload + ld.d $s3, $sp, 128 # 8-byte Folded Reload + ld.d $s2, $sp, 136 # 8-byte Folded Reload + ld.d $s1, $sp, 144 # 8-byte Folded Reload + ld.d $s0, $sp, 152 # 8-byte Folded Reload + ld.d $fp, $sp, 160 # 8-byte Folded Reload + ld.d $ra, $sp, 168 # 8-byte Folded Reload + addi.d $sp, $sp, 176 ret .LBB58_18: .Ltmp1264: # 
EH_LABEL @@ -23968,7 +24441,6 @@ _Z61benchForTruncOrZextVecWithAddInLoopFrom_uint16_t_To_uint32_t_RN9benchmark5St # %bb.10: addi.d $a0, $sp, 8 ori $a1, $s5, 480 - vrepli.b $vr0, 0 .p2align 4, , 16 .LBB59_11: # %.lr.ph # =>This Loop Header: Depth=1 @@ -23986,16 +24458,16 @@ _Z61benchForTruncOrZextVecWithAddInLoopFrom_uint16_t_To_uint32_t_RN9benchmark5St add.d $a4, $fp, $a3 ldptr.d $a5, $a4, 20000 ldptr.d $a4, $a4, 20008 - vinsgr2vr.d $vr1, $a5, 0 - vinsgr2vr.d $vr2, $a4, 0 - vld $vr3, $a2, -16 - vld $vr4, $a2, 0 - vilvl.h $vr1, $vr0, $vr1 - vilvl.h $vr2, $vr0, $vr2 + vinsgr2vr.d $vr0, $a5, 0 + vinsgr2vr.d $vr1, $a4, 0 + vld $vr2, $a2, -16 + vld $vr3, $a2, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vadd.w $vr0, $vr2, $vr0 vadd.w $vr1, $vr3, $vr1 - vadd.w $vr2, $vr4, $vr2 - vst $vr1, $a2, -16 - vst $vr2, $a2, 0 + vst $vr0, $a2, -16 + vst $vr1, $a2, 0 addi.d $a3, $a3, 16 addi.d $a2, $a2, 32 bnez $a3, .LBB59_12 @@ -24112,19 +24584,19 @@ _Z61benchForTruncOrZextVecInLoopWithVW8From_uint16_t_To_uint64_t_RN9benchmark5St .cfi_personality 155, DW.ref.__gxx_personality_v0 .cfi_lsda 27, .Lexception60 # %bb.0: - addi.d $sp, $sp, -176 - .cfi_def_cfa_offset 176 - st.d $ra, $sp, 168 # 8-byte Folded Spill - st.d $fp, $sp, 160 # 8-byte Folded Spill - st.d $s0, $sp, 152 # 8-byte Folded Spill - st.d $s1, $sp, 144 # 8-byte Folded Spill - st.d $s2, $sp, 136 # 8-byte Folded Spill - st.d $s3, $sp, 128 # 8-byte Folded Spill - st.d $s4, $sp, 120 # 8-byte Folded Spill - st.d $s5, $sp, 112 # 8-byte Folded Spill - st.d $s6, $sp, 104 # 8-byte Folded Spill - st.d $s7, $sp, 96 # 8-byte Folded Spill - st.d $s8, $sp, 88 # 8-byte Folded Spill + addi.d $sp, $sp, -160 + .cfi_def_cfa_offset 160 + st.d $ra, $sp, 152 # 8-byte Folded Spill + st.d $fp, $sp, 144 # 8-byte Folded Spill + st.d $s0, $sp, 136 # 8-byte Folded Spill + st.d $s1, $sp, 128 # 8-byte Folded Spill + st.d $s2, $sp, 120 # 8-byte Folded Spill + st.d $s3, $sp, 112 # 8-byte Folded Spill + st.d $s4, $sp, 104 # 8-byte Folded Spill + st.d $s5, $sp, 96 # 8-byte Folded Spill + st.d $s6, $sp, 88 # 8-byte Folded Spill + st.d $s7, $sp, 80 # 8-byte Folded Spill + st.d $s8, $sp, 72 # 8-byte Folded Spill .cfi_offset 1, -8 .cfi_offset 22, -16 .cfi_offset 23, -24 @@ -24152,22 +24624,22 @@ _Z61benchForTruncOrZextVecInLoopWithVW8From_uint16_t_To_uint64_t_RN9benchmark5St .Ltmp1291: # EH_LABEL # %bb.1: # %.split move $s2, $a0 - st.d $a0, $sp, 72 + st.d $a0, $sp, 56 .Ltmp1293: # EH_LABEL move $a0, $s0 pcaddu18i $ra, %call36(_Znam) jirl $ra, $ra, 0 - st.d $a0, $sp, 56 # 8-byte Folded Spill + st.d $a0, $sp, 40 # 8-byte Folded Spill .Ltmp1294: # EH_LABEL # %bb.2: ori $a0, $s6, 3586 add.d $s5, $fp, $a0 ori $a0, $s1, 2048 - st.d $a0, $sp, 64 # 8-byte Folded Spill + st.d $a0, $sp, 48 # 8-byte Folded Spill add.d $s1, $s2, $a0 lu12i.w $a0, -16 lu32i.d $a0, 0 - st.w $a0, $sp, 84 + st.w $a0, $sp, 68 lu12i.w $s7, -5 ori $s0, $s7, 480 pcalau12i $a0, %pc_hi20(_ZL3rng) @@ -24175,8 +24647,8 @@ _Z61benchForTruncOrZextVecInLoopWithVW8From_uint16_t_To_uint64_t_RN9benchmark5St .p2align 4, , 16 .LBB60_3: # =>This Inner Loop Header: Depth=1 .Ltmp1296: # EH_LABEL - addi.d $a0, $sp, 84 - addi.d $a2, $sp, 84 + addi.d $a0, $sp, 68 + addi.d $a2, $sp, 68 move $a1, $s4 pcaddu18i $ra, %call36(_ZNSt24uniform_int_distributionItEclISt23mersenne_twister_engineImLm32ELm624ELm397ELm31ELm2567483615ELm11ELm4294967295ELm7ELm2636928640ELm15ELm4022730752ELm18ELm1812433253EEEEtRT_RKNS0_10param_typeE) jirl $ra, $ra, 0 @@ -24189,16 +24661,15 @@ 
_Z61benchForTruncOrZextVecInLoopWithVW8From_uint16_t_To_uint64_t_RN9benchmark5St bnez $s0, .LBB60_3 # %bb.5: # %vector.ph ori $a0, $s7, 480 - vrepli.b $vr16, 0 - ld.d $a1, $sp, 56 # 8-byte Folded Reload + ld.d $a1, $sp, 40 # 8-byte Folded Reload .p2align 4, , 16 .LBB60_6: # %vector.body # =>This Inner Loop Header: Depth=1 add.d $a2, $fp, $a0 ldx.w $a2, $a2, $s3 vinsgr2vr.w $vr0, $a2, 0 - vilvl.h $vr0, $vr16, $vr0 - vilvl.w $vr0, $vr16, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.du.wu $vr0, $vr0, 0 vst $vr0, $a1, 0 addi.d $a0, $a0, 4 addi.d $a1, $a1, 16 @@ -24218,46 +24689,66 @@ _Z61benchForTruncOrZextVecInLoopWithVW8From_uint16_t_To_uint64_t_RN9benchmark5St vldx $vr1, $a3, $a4 vldx $vr2, $a3, $s3 vldx $vr3, $a3, $a2 - vilvl.h $vr4, $vr16, $vr0 - vilvl.w $vr5, $vr16, $vr4 - vilvh.w $vr4, $vr16, $vr4 - vilvh.h $vr0, $vr16, $vr0 - vilvl.w $vr6, $vr16, $vr0 - vilvh.w $vr0, $vr16, $vr0 - vilvl.h $vr7, $vr16, $vr1 - vilvl.w $vr8, $vr16, $vr7 - vilvh.w $vr7, $vr16, $vr7 - vilvh.h $vr1, $vr16, $vr1 - vilvl.w $vr9, $vr16, $vr1 - vilvh.w $vr1, $vr16, $vr1 - vilvl.h $vr10, $vr16, $vr2 - vilvl.w $vr11, $vr16, $vr10 - vilvh.w $vr10, $vr16, $vr10 - vilvh.h $vr2, $vr16, $vr2 - vilvl.w $vr12, $vr16, $vr2 - vilvh.w $vr2, $vr16, $vr2 - vilvl.h $vr13, $vr16, $vr3 - vilvl.w $vr14, $vr16, $vr13 - vilvh.w $vr13, $vr16, $vr13 - vilvh.h $vr3, $vr16, $vr3 - vilvl.w $vr15, $vr16, $vr3 - vilvh.w $vr3, $vr16, $vr3 - vst $vr0, $a0, -80 - vst $vr6, $a0, -96 + vshuf4i.h $vr4, $vr0, 14 + vsllwil.wu.hu $vr4, $vr4, 0 + vsllwil.du.wu $vr4, $vr4, 0 + vbsrl.v $vr5, $vr0, 8 + vsllwil.wu.hu $vr5, $vr5, 0 + vsllwil.du.wu $vr5, $vr5, 0 + vbsrl.v $vr6, $vr0, 12 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.du.wu $vr6, $vr6, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.du.wu $vr0, $vr0, 0 + vshuf4i.h $vr7, $vr1, 14 + vsllwil.wu.hu $vr7, $vr7, 0 + vsllwil.du.wu $vr7, $vr7, 0 + vbsrl.v $vr8, $vr1, 8 + vsllwil.wu.hu $vr8, $vr8, 0 + vsllwil.du.wu $vr8, $vr8, 0 + vbsrl.v $vr9, $vr1, 12 + vsllwil.wu.hu $vr9, $vr9, 0 + vsllwil.du.wu $vr9, $vr9, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vsllwil.du.wu $vr1, $vr1, 0 + vshuf4i.h $vr10, $vr2, 14 + vsllwil.wu.hu $vr10, $vr10, 0 + vsllwil.du.wu $vr10, $vr10, 0 + vbsrl.v $vr11, $vr2, 8 + vsllwil.wu.hu $vr11, $vr11, 0 + vsllwil.du.wu $vr11, $vr11, 0 + vbsrl.v $vr12, $vr2, 12 + vsllwil.wu.hu $vr12, $vr12, 0 + vsllwil.du.wu $vr12, $vr12, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.du.wu $vr2, $vr2, 0 + vshuf4i.h $vr13, $vr3, 14 + vsllwil.wu.hu $vr13, $vr13, 0 + vsllwil.du.wu $vr13, $vr13, 0 + vbsrl.v $vr14, $vr3, 8 + vsllwil.wu.hu $vr14, $vr14, 0 + vsllwil.du.wu $vr14, $vr14, 0 + vbsrl.v $vr15, $vr3, 12 + vsllwil.wu.hu $vr15, $vr15, 0 + vsllwil.du.wu $vr15, $vr15, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vsllwil.du.wu $vr3, $vr3, 0 + vst $vr0, $a0, -128 + vst $vr6, $a0, -80 + vst $vr5, $a0, -96 vst $vr4, $a0, -112 - vst $vr5, $a0, -128 - vst $vr1, $a0, -16 - vst $vr9, $a0, -32 + vst $vr1, $a0, -64 + vst $vr9, $a0, -16 + vst $vr8, $a0, -32 vst $vr7, $a0, -48 - vst $vr8, $a0, -64 - vst $vr2, $a0, 48 - vst $vr12, $a0, 32 + vst $vr2, $a0, 0 + vst $vr12, $a0, 48 + vst $vr11, $a0, 32 vst $vr10, $a0, 16 - vst $vr11, $a0, 0 - vst $vr3, $a0, 112 - vst $vr15, $a0, 96 + vst $vr3, $a0, 64 + vst $vr15, $a0, 112 + vst $vr14, $a0, 96 vst $vr13, $a0, 80 - vst $vr14, $a0, 64 addi.d $a1, $a1, 64 addi.d $a0, $a0, 256 bnez $a1, .LBB60_8 @@ -24310,7 +24801,7 @@ _Z61benchForTruncOrZextVecInLoopWithVW8From_uint16_t_To_uint64_t_RN9benchmark5St ld.d $a0, $sp, 32 # 8-byte Folded Reload ldx.d $a0, $a0, $a1 move $a2, $a1 - ld.d $a1, $sp, 56 # 8-byte 
Folded Reload + ld.d $a1, $sp, 40 # 8-byte Folded Reload ldx.d $a1, $a1, $a2 bne $a0, $a1, .LBB60_23 # %bb.11: # %_ZL27truncOrZextVecInLoopWithVW8ItmEvPKT_PT0_i.exit @@ -24319,26 +24810,24 @@ _Z61benchForTruncOrZextVecInLoopWithVW8From_uint16_t_To_uint64_t_RN9benchmark5St addi.d $a1, $a2, 8 bnez $s7, .LBB60_10 # %bb.12: - st.d $a4, $sp, 16 # 8-byte Folded Spill - vst $vr16, $sp, 32 # 16-byte Folded Spill + st.d $a4, $sp, 32 # 8-byte Folded Spill ld.d $a0, $sp, 24 # 8-byte Folded Reload ld.w $a1, $a0, 28 - st.d $a1, $sp, 8 # 8-byte Folded Spill + st.d $a1, $sp, 16 # 8-byte Folded Spill ld.d $s7, $a0, 16 .Ltmp1314: # EH_LABEL pcaddu18i $ra, %call36(_ZN9benchmark5State16StartKeepRunningEv) jirl $ra, $ra, 0 .Ltmp1315: # EH_LABEL # %bb.13: # %_ZN9benchmark5State3endEv.exit.preheader - vld $vr16, $sp, 32 # 16-byte Folded Reload - ld.d $t2, $sp, 16 # 8-byte Folded Reload - ld.d $a0, $sp, 8 # 8-byte Folded Reload + ld.d $t2, $sp, 32 # 8-byte Folded Reload + ld.d $a0, $sp, 16 # 8-byte Folded Reload bnez $a0, .LBB60_19 # %bb.14: # %_ZN9benchmark5State3endEv.exit.preheader beqz $s7, .LBB60_19 # %bb.15: # %.lr.ph.preheader addi.d $a0, $fp, 48 - addi.d $a1, $sp, 72 + addi.d $a1, $sp, 56 lu12i.w $a2, -5 ori $a2, $a2, 512 lu12i.w $a5, 4 @@ -24352,8 +24841,8 @@ _Z61benchForTruncOrZextVecInLoopWithVW8From_uint16_t_To_uint64_t_RN9benchmark5St #APP #NO_APP #MEMBARRIER - ld.d $a7, $sp, 72 - ld.d $a6, $sp, 64 # 8-byte Folded Reload + ld.d $a7, $sp, 56 + ld.d $a6, $sp, 48 # 8-byte Folded Reload add.d $a6, $a7, $a6 addi.d $a7, $a7, 128 move $t0, $a2 @@ -24366,46 +24855,66 @@ _Z61benchForTruncOrZextVecInLoopWithVW8From_uint16_t_To_uint64_t_RN9benchmark5St vldx $vr1, $t1, $a4 vldx $vr2, $t1, $a5 vldx $vr3, $t1, $s4 - vilvl.h $vr4, $vr16, $vr0 - vilvl.w $vr5, $vr16, $vr4 - vilvh.w $vr4, $vr16, $vr4 - vilvh.h $vr0, $vr16, $vr0 - vilvl.w $vr6, $vr16, $vr0 - vilvh.w $vr0, $vr16, $vr0 - vilvl.h $vr7, $vr16, $vr1 - vilvl.w $vr8, $vr16, $vr7 - vilvh.w $vr7, $vr16, $vr7 - vilvh.h $vr1, $vr16, $vr1 - vilvl.w $vr9, $vr16, $vr1 - vilvh.w $vr1, $vr16, $vr1 - vilvl.h $vr10, $vr16, $vr2 - vilvl.w $vr11, $vr16, $vr10 - vilvh.w $vr10, $vr16, $vr10 - vilvh.h $vr2, $vr16, $vr2 - vilvl.w $vr12, $vr16, $vr2 - vilvh.w $vr2, $vr16, $vr2 - vilvl.h $vr13, $vr16, $vr3 - vilvl.w $vr14, $vr16, $vr13 - vilvh.w $vr13, $vr16, $vr13 - vilvh.h $vr3, $vr16, $vr3 - vilvl.w $vr15, $vr16, $vr3 - vilvh.w $vr3, $vr16, $vr3 - vst $vr0, $a7, -80 - vst $vr6, $a7, -96 + vshuf4i.h $vr4, $vr0, 14 + vsllwil.wu.hu $vr4, $vr4, 0 + vsllwil.du.wu $vr4, $vr4, 0 + vbsrl.v $vr5, $vr0, 8 + vsllwil.wu.hu $vr5, $vr5, 0 + vsllwil.du.wu $vr5, $vr5, 0 + vbsrl.v $vr6, $vr0, 12 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.du.wu $vr6, $vr6, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.du.wu $vr0, $vr0, 0 + vshuf4i.h $vr7, $vr1, 14 + vsllwil.wu.hu $vr7, $vr7, 0 + vsllwil.du.wu $vr7, $vr7, 0 + vbsrl.v $vr8, $vr1, 8 + vsllwil.wu.hu $vr8, $vr8, 0 + vsllwil.du.wu $vr8, $vr8, 0 + vbsrl.v $vr9, $vr1, 12 + vsllwil.wu.hu $vr9, $vr9, 0 + vsllwil.du.wu $vr9, $vr9, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vsllwil.du.wu $vr1, $vr1, 0 + vshuf4i.h $vr10, $vr2, 14 + vsllwil.wu.hu $vr10, $vr10, 0 + vsllwil.du.wu $vr10, $vr10, 0 + vbsrl.v $vr11, $vr2, 8 + vsllwil.wu.hu $vr11, $vr11, 0 + vsllwil.du.wu $vr11, $vr11, 0 + vbsrl.v $vr12, $vr2, 12 + vsllwil.wu.hu $vr12, $vr12, 0 + vsllwil.du.wu $vr12, $vr12, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.du.wu $vr2, $vr2, 0 + vshuf4i.h $vr13, $vr3, 14 + vsllwil.wu.hu $vr13, $vr13, 0 + vsllwil.du.wu $vr13, $vr13, 0 + vbsrl.v $vr14, $vr3, 8 + vsllwil.wu.hu $vr14, 
$vr14, 0 + vsllwil.du.wu $vr14, $vr14, 0 + vbsrl.v $vr15, $vr3, 12 + vsllwil.wu.hu $vr15, $vr15, 0 + vsllwil.du.wu $vr15, $vr15, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vsllwil.du.wu $vr3, $vr3, 0 + vst $vr0, $a7, -128 + vst $vr6, $a7, -80 + vst $vr5, $a7, -96 vst $vr4, $a7, -112 - vst $vr5, $a7, -128 - vst $vr1, $a7, -16 - vst $vr9, $a7, -32 + vst $vr1, $a7, -64 + vst $vr9, $a7, -16 + vst $vr8, $a7, -32 vst $vr7, $a7, -48 - vst $vr8, $a7, -64 - vst $vr2, $a7, 48 - vst $vr12, $a7, 32 + vst $vr2, $a7, 0 + vst $vr12, $a7, 48 + vst $vr11, $a7, 32 vst $vr10, $a7, 16 - vst $vr11, $a7, 0 - vst $vr3, $a7, 112 - vst $vr15, $a7, 96 + vst $vr3, $a7, 64 + vst $vr15, $a7, 112 + vst $vr14, $a7, 96 vst $vr13, $a7, 80 - vst $vr14, $a7, 64 addi.d $t0, $t0, 64 addi.d $a7, $a7, 256 bnez $t0, .LBB60_17 @@ -24452,10 +24961,10 @@ _Z61benchForTruncOrZextVecInLoopWithVW8From_uint16_t_To_uint64_t_RN9benchmark5St jirl $ra, $ra, 0 .Ltmp1317: # EH_LABEL # %bb.20: # %_ZNSt10unique_ptrIA_mSt14default_deleteIS0_EED2Ev.exit36 - ld.d $a0, $sp, 56 # 8-byte Folded Reload + ld.d $a0, $sp, 40 # 8-byte Folded Reload pcaddu18i $ra, %call36(_ZdaPv) jirl $ra, $ra, 0 - ld.d $a0, $sp, 72 + ld.d $a0, $sp, 56 beqz $a0, .LBB60_22 # %bb.21: # %_ZNKSt14default_deleteIA_mEclImEENSt9enable_ifIXsr14is_convertibleIPA_T_PS0_EE5valueEvE4typeEPS4_.exit.i38 pcaddu18i $ra, %call36(_ZdaPv) @@ -24464,18 +24973,18 @@ _Z61benchForTruncOrZextVecInLoopWithVW8From_uint16_t_To_uint64_t_RN9benchmark5St move $a0, $fp pcaddu18i $ra, %call36(_ZdaPv) jirl $ra, $ra, 0 - ld.d $s8, $sp, 88 # 8-byte Folded Reload - ld.d $s7, $sp, 96 # 8-byte Folded Reload - ld.d $s6, $sp, 104 # 8-byte Folded Reload - ld.d $s5, $sp, 112 # 8-byte Folded Reload - ld.d $s4, $sp, 120 # 8-byte Folded Reload - ld.d $s3, $sp, 128 # 8-byte Folded Reload - ld.d $s2, $sp, 136 # 8-byte Folded Reload - ld.d $s1, $sp, 144 # 8-byte Folded Reload - ld.d $s0, $sp, 152 # 8-byte Folded Reload - ld.d $fp, $sp, 160 # 8-byte Folded Reload - ld.d $ra, $sp, 168 # 8-byte Folded Reload - addi.d $sp, $sp, 176 + ld.d $s8, $sp, 72 # 8-byte Folded Reload + ld.d $s7, $sp, 80 # 8-byte Folded Reload + ld.d $s6, $sp, 88 # 8-byte Folded Reload + ld.d $s5, $sp, 96 # 8-byte Folded Reload + ld.d $s4, $sp, 104 # 8-byte Folded Reload + ld.d $s3, $sp, 112 # 8-byte Folded Reload + ld.d $s2, $sp, 120 # 8-byte Folded Reload + ld.d $s1, $sp, 128 # 8-byte Folded Reload + ld.d $s0, $sp, 136 # 8-byte Folded Reload + ld.d $fp, $sp, 144 # 8-byte Folded Reload + ld.d $ra, $sp, 152 # 8-byte Folded Reload + addi.d $sp, $sp, 160 ret .LBB60_23: .Ltmp1299: # EH_LABEL @@ -24527,7 +25036,7 @@ _Z61benchForTruncOrZextVecInLoopWithVW8From_uint16_t_To_uint64_t_RN9benchmark5St jirl $ra, $ra, 0 .Ltmp1308: # EH_LABEL # %bb.28: # %_ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc.exit21 - ld.d $a0, $sp, 56 # 8-byte Folded Reload + ld.d $a0, $sp, 40 # 8-byte Folded Reload ldx.d $a1, $a0, $s0 .Ltmp1309: # EH_LABEL move $a0, $s1 @@ -24568,10 +25077,10 @@ _Z61benchForTruncOrZextVecInLoopWithVW8From_uint16_t_To_uint64_t_RN9benchmark5St .Ltmp1298: # EH_LABEL .LBB60_36: move $s1, $a0 - ld.d $a0, $sp, 56 # 8-byte Folded Reload + ld.d $a0, $sp, 40 # 8-byte Folded Reload pcaddu18i $ra, %call36(_ZdaPv) jirl $ra, $ra, 0 - ld.d $a0, $sp, 72 + ld.d $a0, $sp, 56 move $s2, $a0 beqz $a0, .LBB60_38 .LBB60_37: # %_ZNKSt14default_deleteIA_mEclImEENSt9enable_ifIXsr14is_convertibleIPA_T_PS0_EE5valueEvE4typeEPS4_.exit.i31 @@ -24642,19 +25151,27 @@ _Z62benchForTruncOrZextVecInLoopWithVW16From_uint16_t_To_uint64_t_RN9benchmark5S .cfi_personality 155, 
DW.ref.__gxx_personality_v0 .cfi_lsda 27, .Lexception61 # %bb.0: - addi.d $sp, $sp, -176 - .cfi_def_cfa_offset 176 - st.d $ra, $sp, 168 # 8-byte Folded Spill - st.d $fp, $sp, 160 # 8-byte Folded Spill - st.d $s0, $sp, 152 # 8-byte Folded Spill - st.d $s1, $sp, 144 # 8-byte Folded Spill - st.d $s2, $sp, 136 # 8-byte Folded Spill - st.d $s3, $sp, 128 # 8-byte Folded Spill - st.d $s4, $sp, 120 # 8-byte Folded Spill - st.d $s5, $sp, 112 # 8-byte Folded Spill - st.d $s6, $sp, 104 # 8-byte Folded Spill - st.d $s7, $sp, 96 # 8-byte Folded Spill - st.d $s8, $sp, 88 # 8-byte Folded Spill + addi.d $sp, $sp, -224 + .cfi_def_cfa_offset 224 + st.d $ra, $sp, 216 # 8-byte Folded Spill + st.d $fp, $sp, 208 # 8-byte Folded Spill + st.d $s0, $sp, 200 # 8-byte Folded Spill + st.d $s1, $sp, 192 # 8-byte Folded Spill + st.d $s2, $sp, 184 # 8-byte Folded Spill + st.d $s3, $sp, 176 # 8-byte Folded Spill + st.d $s4, $sp, 168 # 8-byte Folded Spill + st.d $s5, $sp, 160 # 8-byte Folded Spill + st.d $s6, $sp, 152 # 8-byte Folded Spill + st.d $s7, $sp, 144 # 8-byte Folded Spill + st.d $s8, $sp, 136 # 8-byte Folded Spill + fst.d $fs0, $sp, 128 # 8-byte Folded Spill + fst.d $fs1, $sp, 120 # 8-byte Folded Spill + fst.d $fs2, $sp, 112 # 8-byte Folded Spill + fst.d $fs3, $sp, 104 # 8-byte Folded Spill + fst.d $fs4, $sp, 96 # 8-byte Folded Spill + fst.d $fs5, $sp, 88 # 8-byte Folded Spill + fst.d $fs6, $sp, 80 # 8-byte Folded Spill + fst.d $fs7, $sp, 72 # 8-byte Folded Spill .cfi_offset 1, -8 .cfi_offset 22, -16 .cfi_offset 23, -24 @@ -24666,6 +25183,14 @@ _Z62benchForTruncOrZextVecInLoopWithVW16From_uint16_t_To_uint64_t_RN9benchmark5S .cfi_offset 29, -72 .cfi_offset 30, -80 .cfi_offset 31, -88 + .cfi_offset 56, -96 + .cfi_offset 57, -104 + .cfi_offset 58, -112 + .cfi_offset 59, -120 + .cfi_offset 60, -128 + .cfi_offset 61, -136 + .cfi_offset 62, -144 + .cfi_offset 63, -152 st.d $a0, $sp, 24 # 8-byte Folded Spill lu12i.w $s6, 4 ori $s3, $s6, 3616 @@ -24682,22 +25207,22 @@ _Z62benchForTruncOrZextVecInLoopWithVW16From_uint16_t_To_uint64_t_RN9benchmark5S .Ltmp1320: # EH_LABEL # %bb.1: # %.split move $s2, $a0 - st.d $a0, $sp, 72 + st.d $a0, $sp, 56 .Ltmp1322: # EH_LABEL move $a0, $s0 pcaddu18i $ra, %call36(_Znam) jirl $ra, $ra, 0 - st.d $a0, $sp, 56 # 8-byte Folded Spill + st.d $a0, $sp, 40 # 8-byte Folded Spill .Ltmp1323: # EH_LABEL # %bb.2: ori $a0, $s6, 3586 add.d $s5, $fp, $a0 ori $a0, $s1, 2048 - st.d $a0, $sp, 64 # 8-byte Folded Spill + st.d $a0, $sp, 48 # 8-byte Folded Spill add.d $s1, $s2, $a0 lu12i.w $a0, -16 lu32i.d $a0, 0 - st.w $a0, $sp, 84 + st.w $a0, $sp, 68 lu12i.w $s8, -5 ori $s0, $s8, 480 pcalau12i $a0, %pc_hi20(_ZL3rng) @@ -24705,8 +25230,8 @@ _Z62benchForTruncOrZextVecInLoopWithVW16From_uint16_t_To_uint64_t_RN9benchmark5S .p2align 4, , 16 .LBB61_3: # =>This Inner Loop Header: Depth=1 .Ltmp1325: # EH_LABEL - addi.d $a0, $sp, 84 - addi.d $a2, $sp, 84 + addi.d $a0, $sp, 68 + addi.d $a2, $sp, 68 move $a1, $s4 pcaddu18i $ra, %call36(_ZNSt24uniform_int_distributionItEclISt23mersenne_twister_engineImLm32ELm624ELm397ELm31ELm2567483615ELm11ELm4294967295ELm7ELm2636928640ELm15ELm4022730752ELm18ELm1812433253EEEEtRT_RKNS0_10param_typeE) jirl $ra, $ra, 0 @@ -24719,16 +25244,15 @@ _Z62benchForTruncOrZextVecInLoopWithVW16From_uint16_t_To_uint64_t_RN9benchmark5S bnez $s0, .LBB61_3 # %bb.5: # %vector.ph ori $a0, $s8, 480 - vrepli.b $vr3, 0 - ld.d $a1, $sp, 56 # 8-byte Folded Reload + ld.d $a1, $sp, 40 # 8-byte Folded Reload .p2align 4, , 16 .LBB61_6: # %vector.body # =>This Inner Loop Header: Depth=1 add.d $a2, $fp, $a0 ldx.w 
$a2, $a2, $s3 vinsgr2vr.w $vr0, $a2, 0 - vilvl.h $vr0, $vr3, $vr0 - vilvl.w $vr0, $vr3, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.du.wu $vr0, $vr0, 0 vst $vr0, $a1, 0 addi.d $a0, $a0, 4 addi.d $a1, $a1, 16 @@ -24748,94 +25272,134 @@ _Z62benchForTruncOrZextVecInLoopWithVW16From_uint16_t_To_uint64_t_RN9benchmark5S .LBB61_8: # %vector.body85 # =>This Inner Loop Header: Depth=1 add.d $a7, $fp, $a1 - vldx $vr0, $a7, $s4 - vilvh.h $vr1, $vr3, $vr0 - vilvh.w $vr2, $vr3, $vr1 - vst $vr2, $a0, -144 - vldx $vr2, $a7, $s7 - vilvl.h $vr0, $vr3, $vr0 - vilvl.w $vr1, $vr3, $vr1 - vst $vr1, $a0, -160 - vilvh.w $vr1, $vr3, $vr0 - vst $vr1, $a0, -176 - vilvh.h $vr1, $vr3, $vr2 - vilvl.w $vr0, $vr3, $vr0 - vst $vr0, $a0, -192 - vilvh.w $vr0, $vr3, $vr1 - vst $vr0, $a0, -208 - vldx $vr0, $a7, $a2 - vilvl.h $vr2, $vr3, $vr2 - vilvl.w $vr1, $vr3, $vr1 + vldx $vr6, $a7, $s4 + vldx $vr8, $a7, $s7 + vldx $vr16, $a7, $a2 + vldx $vr17, $a7, $s3 + vldx $vr12, $a7, $a3 + vldx $vr15, $a7, $a4 + vldx $vr2, $a7, $a5 + vldx $vr3, $a7, $a6 + vshuf4i.h $vr0, $vr8, 14 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.du.wu $vr0, $vr0, 0 + vbsrl.v $vr1, $vr8, 8 + vsllwil.wu.hu $vr1, $vr1, 0 + vsllwil.du.wu $vr1, $vr1, 0 + vbsrl.v $vr4, $vr8, 12 + vsllwil.wu.hu $vr4, $vr4, 0 + vsllwil.du.wu $vr4, $vr4, 0 + vshuf4i.h $vr5, $vr6, 14 + vsllwil.wu.hu $vr5, $vr5, 0 + vsllwil.du.wu $vr5, $vr5, 0 + vbsrl.v $vr7, $vr6, 8 + vsllwil.wu.hu $vr7, $vr7, 0 + vsllwil.du.wu $vr7, $vr7, 0 + vbsrl.v $vr9, $vr6, 12 + vsllwil.wu.hu $vr9, $vr9, 0 + vsllwil.du.wu $vr9, $vr9, 0 + vsllwil.wu.hu $vr8, $vr8, 0 + vsllwil.du.wu $vr11, $vr8, 0 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.du.wu $vr13, $vr6, 0 + vshuf4i.h $vr6, $vr17, 14 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.du.wu $vr6, $vr6, 0 + vbsrl.v $vr8, $vr17, 8 + vsllwil.wu.hu $vr8, $vr8, 0 + vsllwil.du.wu $vr8, $vr8, 0 + vbsrl.v $vr10, $vr17, 12 + vsllwil.wu.hu $vr10, $vr10, 0 + vsllwil.du.wu $vr10, $vr10, 0 + vshuf4i.h $vr14, $vr16, 14 + vsllwil.wu.hu $vr14, $vr14, 0 + vsllwil.du.wu $vr14, $vr14, 0 + vbsrl.v $vr18, $vr16, 8 + vsllwil.wu.hu $vr18, $vr18, 0 + vsllwil.du.wu $vr18, $vr18, 0 + vbsrl.v $vr19, $vr16, 12 + vsllwil.wu.hu $vr19, $vr19, 0 + vsllwil.du.wu $vr19, $vr19, 0 + vsllwil.wu.hu $vr17, $vr17, 0 + vsllwil.du.wu $vr17, $vr17, 0 + vsllwil.wu.hu $vr16, $vr16, 0 + vsllwil.du.wu $vr20, $vr16, 0 + vshuf4i.h $vr16, $vr15, 14 + vsllwil.wu.hu $vr16, $vr16, 0 + vsllwil.du.wu $vr16, $vr16, 0 + vbsrl.v $vr21, $vr15, 8 + vsllwil.wu.hu $vr21, $vr21, 0 + vsllwil.du.wu $vr21, $vr21, 0 + vbsrl.v $vr22, $vr15, 12 + vsllwil.wu.hu $vr22, $vr22, 0 + vsllwil.du.wu $vr22, $vr22, 0 + vshuf4i.h $vr23, $vr12, 14 + vsllwil.wu.hu $vr23, $vr23, 0 + vsllwil.du.wu $vr23, $vr23, 0 + vbsrl.v $vr24, $vr12, 8 + vsllwil.wu.hu $vr24, $vr24, 0 + vsllwil.du.wu $vr24, $vr24, 0 + vbsrl.v $vr25, $vr12, 12 + vsllwil.wu.hu $vr25, $vr25, 0 + vsllwil.du.wu $vr25, $vr25, 0 + vsllwil.wu.hu $vr15, $vr15, 0 + vsllwil.du.wu $vr15, $vr15, 0 + vsllwil.wu.hu $vr12, $vr12, 0 + vsllwil.du.wu $vr12, $vr12, 0 + vshuf4i.h $vr26, $vr3, 14 + vsllwil.wu.hu $vr26, $vr26, 0 + vsllwil.du.wu $vr26, $vr26, 0 + vbsrl.v $vr27, $vr3, 8 + vsllwil.wu.hu $vr27, $vr27, 0 + vsllwil.du.wu $vr27, $vr27, 0 + vbsrl.v $vr28, $vr3, 12 + vsllwil.wu.hu $vr28, $vr28, 0 + vsllwil.du.wu $vr28, $vr28, 0 + vshuf4i.h $vr29, $vr2, 14 + vsllwil.wu.hu $vr29, $vr29, 0 + vsllwil.du.wu $vr29, $vr29, 0 + vbsrl.v $vr30, $vr2, 8 + vsllwil.wu.hu $vr30, $vr30, 0 + vsllwil.du.wu $vr30, $vr30, 0 + vbsrl.v $vr31, $vr2, 12 + vsllwil.wu.hu $vr31, $vr31, 0 + vsllwil.du.wu $vr31, $vr31, 0 + 
vsllwil.wu.hu $vr3, $vr3, 0 + vsllwil.du.wu $vr3, $vr3, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.du.wu $vr2, $vr2, 0 + vst $vr13, $a0, -192 + vst $vr11, $a0, -256 + vst $vr9, $a0, -144 + vst $vr7, $a0, -160 + vst $vr5, $a0, -176 + vst $vr4, $a0, -208 vst $vr1, $a0, -224 - vilvl.w $vr1, $vr3, $vr2 - vilvh.w $vr2, $vr3, $vr2 - vst $vr2, $a0, -240 - vilvh.h $vr2, $vr3, $vr0 - vst $vr1, $a0, -256 - vilvh.w $vr1, $vr3, $vr2 - vst $vr1, $a0, -16 - vldx $vr1, $a7, $s3 - vilvl.h $vr0, $vr3, $vr0 - vilvl.w $vr2, $vr3, $vr2 - vst $vr2, $a0, -32 - vilvh.w $vr2, $vr3, $vr0 - vst $vr2, $a0, -48 - vilvh.h $vr2, $vr3, $vr1 - vilvl.w $vr0, $vr3, $vr0 - vst $vr0, $a0, -64 - vilvh.w $vr0, $vr3, $vr2 - vst $vr0, $a0, -80 - vldx $vr0, $a7, $a3 - vilvl.h $vr1, $vr3, $vr1 - vilvl.w $vr2, $vr3, $vr2 - vst $vr2, $a0, -96 - vilvl.w $vr2, $vr3, $vr1 - vilvh.w $vr1, $vr3, $vr1 - vst $vr1, $a0, -112 - vilvh.h $vr1, $vr3, $vr0 - vst $vr2, $a0, -128 - vilvh.w $vr2, $vr3, $vr1 - vst $vr2, $a0, 112 - vldx $vr2, $a7, $a4 - vilvl.h $vr0, $vr3, $vr0 - vilvl.w $vr1, $vr3, $vr1 - vst $vr1, $a0, 96 - vilvh.w $vr1, $vr3, $vr0 - vst $vr1, $a0, 80 - vilvh.h $vr1, $vr3, $vr2 - vilvl.w $vr0, $vr3, $vr0 - vst $vr0, $a0, 64 - vilvh.w $vr0, $vr3, $vr1 - vst $vr0, $a0, 48 - vldx $vr0, $a7, $a5 - vilvl.h $vr2, $vr3, $vr2 - vilvl.w $vr1, $vr3, $vr1 - vst $vr1, $a0, 32 - vilvl.w $vr1, $vr3, $vr2 - vilvh.w $vr2, $vr3, $vr2 - vst $vr2, $a0, 16 - vilvh.h $vr2, $vr3, $vr0 - vst $vr1, $a0, 0 - vilvh.w $vr1, $vr3, $vr2 - vst $vr1, $a0, 240 - vldx $vr1, $a7, $a6 - vilvl.h $vr0, $vr3, $vr0 - vilvl.w $vr2, $vr3, $vr2 - vst $vr2, $a0, 224 - vilvh.w $vr2, $vr3, $vr0 - vst $vr2, $a0, 208 - vilvh.h $vr2, $vr3, $vr1 - vilvl.w $vr0, $vr3, $vr0 - vst $vr0, $a0, 192 - vilvh.w $vr0, $vr3, $vr2 - vst $vr0, $a0, 176 - vilvl.h $vr0, $vr3, $vr1 - vilvl.w $vr1, $vr3, $vr2 - vst $vr1, $a0, 160 - vilvh.w $vr1, $vr3, $vr0 - vst $vr1, $a0, 144 - vilvl.w $vr0, $vr3, $vr0 - vst $vr0, $a0, 128 + vst $vr0, $a0, -240 + vst $vr20, $a0, -64 + vst $vr17, $a0, -128 + vst $vr19, $a0, -16 + vst $vr18, $a0, -32 + vst $vr14, $a0, -48 + vst $vr10, $a0, -80 + vst $vr8, $a0, -96 + vst $vr6, $a0, -112 + vst $vr12, $a0, 64 + vst $vr15, $a0, 0 + vst $vr25, $a0, 112 + vst $vr24, $a0, 96 + vst $vr23, $a0, 80 + vst $vr22, $a0, 48 + vst $vr21, $a0, 32 + vst $vr16, $a0, 16 + vst $vr2, $a0, 192 + vst $vr3, $a0, 128 + vst $vr31, $a0, 240 + vst $vr30, $a0, 224 + vst $vr29, $a0, 208 + vst $vr28, $a0, 176 + vst $vr27, $a0, 160 + vst $vr26, $a0, 144 addi.d $a1, $a1, 128 addi.d $a0, $a0, 512 bnez $a1, .LBB61_8 @@ -24888,7 +25452,7 @@ _Z62benchForTruncOrZextVecInLoopWithVW16From_uint16_t_To_uint64_t_RN9benchmark5S ld.d $a0, $sp, 32 # 8-byte Folded Reload ldx.d $a0, $a0, $a1 move $a2, $a1 - ld.d $a1, $sp, 56 # 8-byte Folded Reload + ld.d $a1, $sp, 40 # 8-byte Folded Reload ldx.d $a1, $a1, $a2 bne $a0, $a1, .LBB61_23 # %bb.11: # %_ZL28truncOrZextVecInLoopWithVW16ItmEvPKT_PT0_i.exit @@ -24897,26 +25461,24 @@ _Z62benchForTruncOrZextVecInLoopWithVW16From_uint16_t_To_uint64_t_RN9benchmark5S addi.d $a1, $a2, 8 bnez $s8, .LBB61_10 # %bb.12: - st.d $a5, $sp, 16 # 8-byte Folded Spill - vst $vr3, $sp, 32 # 16-byte Folded Spill + st.d $a5, $sp, 32 # 8-byte Folded Spill ld.d $a0, $sp, 24 # 8-byte Folded Reload ld.w $a1, $a0, 28 - st.d $a1, $sp, 8 # 8-byte Folded Spill + st.d $a1, $sp, 16 # 8-byte Folded Spill ld.d $s8, $a0, 16 .Ltmp1343: # EH_LABEL pcaddu18i $ra, %call36(_ZN9benchmark5State16StartKeepRunningEv) jirl $ra, $ra, 0 .Ltmp1344: # EH_LABEL # %bb.13: # %_ZN9benchmark5State3endEv.exit.preheader - vld 
$vr3, $sp, 32 # 16-byte Folded Reload - ld.d $t5, $sp, 16 # 8-byte Folded Reload - ld.d $a0, $sp, 8 # 8-byte Folded Reload + ld.d $t5, $sp, 32 # 8-byte Folded Reload + ld.d $a0, $sp, 16 # 8-byte Folded Reload bnez $a0, .LBB61_19 # %bb.14: # %_ZN9benchmark5State3endEv.exit.preheader beqz $s8, .LBB61_19 # %bb.15: # %.lr.ph.preheader addi.d $a0, $fp, 96 - addi.d $a1, $sp, 72 + addi.d $a1, $sp, 56 lu12i.w $a2, -5 ori $a2, $a2, 512 lu12i.w $t0, 4 @@ -24933,8 +25495,8 @@ _Z62benchForTruncOrZextVecInLoopWithVW16From_uint16_t_To_uint64_t_RN9benchmark5S #APP #NO_APP #MEMBARRIER - ld.d $t2, $sp, 72 - ld.d $t1, $sp, 64 # 8-byte Folded Reload + ld.d $t2, $sp, 56 + ld.d $t1, $sp, 48 # 8-byte Folded Reload add.d $t1, $t2, $t1 addi.d $t2, $t2, 256 move $t3, $a2 @@ -24943,94 +25505,134 @@ _Z62benchForTruncOrZextVecInLoopWithVW16From_uint16_t_To_uint64_t_RN9benchmark5S # Parent Loop BB61_16 Depth=1 # => This Inner Loop Header: Depth=2 add.d $t4, $a0, $t3 - vldx $vr0, $t4, $a3 - vilvh.h $vr1, $vr3, $vr0 - vilvh.w $vr2, $vr3, $vr1 - vst $vr2, $t2, -144 - vldx $vr2, $t4, $a4 - vilvl.h $vr0, $vr3, $vr0 - vilvl.w $vr1, $vr3, $vr1 - vst $vr1, $t2, -160 - vilvh.w $vr1, $vr3, $vr0 - vst $vr1, $t2, -176 - vilvh.h $vr1, $vr3, $vr2 - vilvl.w $vr0, $vr3, $vr0 - vst $vr0, $t2, -192 - vilvh.w $vr0, $vr3, $vr1 - vst $vr0, $t2, -208 - vldx $vr0, $t4, $a5 - vilvl.h $vr2, $vr3, $vr2 - vilvl.w $vr1, $vr3, $vr1 + vldx $vr6, $t4, $a3 + vldx $vr8, $t4, $a4 + vldx $vr16, $t4, $a5 + vldx $vr17, $t4, $a6 + vldx $vr12, $t4, $a7 + vldx $vr15, $t4, $t0 + vldx $vr2, $t4, $s4 + vldx $vr3, $t4, $s7 + vshuf4i.h $vr0, $vr8, 14 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.du.wu $vr0, $vr0, 0 + vbsrl.v $vr1, $vr8, 8 + vsllwil.wu.hu $vr1, $vr1, 0 + vsllwil.du.wu $vr1, $vr1, 0 + vbsrl.v $vr4, $vr8, 12 + vsllwil.wu.hu $vr4, $vr4, 0 + vsllwil.du.wu $vr4, $vr4, 0 + vshuf4i.h $vr5, $vr6, 14 + vsllwil.wu.hu $vr5, $vr5, 0 + vsllwil.du.wu $vr5, $vr5, 0 + vbsrl.v $vr7, $vr6, 8 + vsllwil.wu.hu $vr7, $vr7, 0 + vsllwil.du.wu $vr7, $vr7, 0 + vbsrl.v $vr9, $vr6, 12 + vsllwil.wu.hu $vr9, $vr9, 0 + vsllwil.du.wu $vr9, $vr9, 0 + vsllwil.wu.hu $vr8, $vr8, 0 + vsllwil.du.wu $vr11, $vr8, 0 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.du.wu $vr13, $vr6, 0 + vshuf4i.h $vr6, $vr17, 14 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.du.wu $vr6, $vr6, 0 + vbsrl.v $vr8, $vr17, 8 + vsllwil.wu.hu $vr8, $vr8, 0 + vsllwil.du.wu $vr8, $vr8, 0 + vbsrl.v $vr10, $vr17, 12 + vsllwil.wu.hu $vr10, $vr10, 0 + vsllwil.du.wu $vr10, $vr10, 0 + vshuf4i.h $vr14, $vr16, 14 + vsllwil.wu.hu $vr14, $vr14, 0 + vsllwil.du.wu $vr14, $vr14, 0 + vbsrl.v $vr18, $vr16, 8 + vsllwil.wu.hu $vr18, $vr18, 0 + vsllwil.du.wu $vr18, $vr18, 0 + vbsrl.v $vr19, $vr16, 12 + vsllwil.wu.hu $vr19, $vr19, 0 + vsllwil.du.wu $vr19, $vr19, 0 + vsllwil.wu.hu $vr17, $vr17, 0 + vsllwil.du.wu $vr17, $vr17, 0 + vsllwil.wu.hu $vr16, $vr16, 0 + vsllwil.du.wu $vr20, $vr16, 0 + vshuf4i.h $vr16, $vr15, 14 + vsllwil.wu.hu $vr16, $vr16, 0 + vsllwil.du.wu $vr16, $vr16, 0 + vbsrl.v $vr21, $vr15, 8 + vsllwil.wu.hu $vr21, $vr21, 0 + vsllwil.du.wu $vr21, $vr21, 0 + vbsrl.v $vr22, $vr15, 12 + vsllwil.wu.hu $vr22, $vr22, 0 + vsllwil.du.wu $vr22, $vr22, 0 + vshuf4i.h $vr23, $vr12, 14 + vsllwil.wu.hu $vr23, $vr23, 0 + vsllwil.du.wu $vr23, $vr23, 0 + vbsrl.v $vr24, $vr12, 8 + vsllwil.wu.hu $vr24, $vr24, 0 + vsllwil.du.wu $vr24, $vr24, 0 + vbsrl.v $vr25, $vr12, 12 + vsllwil.wu.hu $vr25, $vr25, 0 + vsllwil.du.wu $vr25, $vr25, 0 + vsllwil.wu.hu $vr15, $vr15, 0 + vsllwil.du.wu $vr15, $vr15, 0 + vsllwil.wu.hu $vr12, $vr12, 0 + vsllwil.du.wu $vr12, $vr12, 0 
+ vshuf4i.h $vr26, $vr3, 14 + vsllwil.wu.hu $vr26, $vr26, 0 + vsllwil.du.wu $vr26, $vr26, 0 + vbsrl.v $vr27, $vr3, 8 + vsllwil.wu.hu $vr27, $vr27, 0 + vsllwil.du.wu $vr27, $vr27, 0 + vbsrl.v $vr28, $vr3, 12 + vsllwil.wu.hu $vr28, $vr28, 0 + vsllwil.du.wu $vr28, $vr28, 0 + vshuf4i.h $vr29, $vr2, 14 + vsllwil.wu.hu $vr29, $vr29, 0 + vsllwil.du.wu $vr29, $vr29, 0 + vbsrl.v $vr30, $vr2, 8 + vsllwil.wu.hu $vr30, $vr30, 0 + vsllwil.du.wu $vr30, $vr30, 0 + vbsrl.v $vr31, $vr2, 12 + vsllwil.wu.hu $vr31, $vr31, 0 + vsllwil.du.wu $vr31, $vr31, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vsllwil.du.wu $vr3, $vr3, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.du.wu $vr2, $vr2, 0 + vst $vr13, $t2, -192 + vst $vr11, $t2, -256 + vst $vr9, $t2, -144 + vst $vr7, $t2, -160 + vst $vr5, $t2, -176 + vst $vr4, $t2, -208 vst $vr1, $t2, -224 - vilvl.w $vr1, $vr3, $vr2 - vilvh.w $vr2, $vr3, $vr2 - vst $vr2, $t2, -240 - vilvh.h $vr2, $vr3, $vr0 - vst $vr1, $t2, -256 - vilvh.w $vr1, $vr3, $vr2 - vst $vr1, $t2, -16 - vldx $vr1, $t4, $a6 - vilvl.h $vr0, $vr3, $vr0 - vilvl.w $vr2, $vr3, $vr2 - vst $vr2, $t2, -32 - vilvh.w $vr2, $vr3, $vr0 - vst $vr2, $t2, -48 - vilvh.h $vr2, $vr3, $vr1 - vilvl.w $vr0, $vr3, $vr0 - vst $vr0, $t2, -64 - vilvh.w $vr0, $vr3, $vr2 - vst $vr0, $t2, -80 - vldx $vr0, $t4, $a7 - vilvl.h $vr1, $vr3, $vr1 - vilvl.w $vr2, $vr3, $vr2 - vst $vr2, $t2, -96 - vilvl.w $vr2, $vr3, $vr1 - vilvh.w $vr1, $vr3, $vr1 - vst $vr1, $t2, -112 - vilvh.h $vr1, $vr3, $vr0 - vst $vr2, $t2, -128 - vilvh.w $vr2, $vr3, $vr1 - vst $vr2, $t2, 112 - vldx $vr2, $t4, $t0 - vilvl.h $vr0, $vr3, $vr0 - vilvl.w $vr1, $vr3, $vr1 - vst $vr1, $t2, 96 - vilvh.w $vr1, $vr3, $vr0 - vst $vr1, $t2, 80 - vilvh.h $vr1, $vr3, $vr2 - vilvl.w $vr0, $vr3, $vr0 - vst $vr0, $t2, 64 - vilvh.w $vr0, $vr3, $vr1 - vst $vr0, $t2, 48 - vldx $vr0, $t4, $s4 - vilvl.h $vr2, $vr3, $vr2 - vilvl.w $vr1, $vr3, $vr1 - vst $vr1, $t2, 32 - vilvl.w $vr1, $vr3, $vr2 - vilvh.w $vr2, $vr3, $vr2 - vst $vr2, $t2, 16 - vilvh.h $vr2, $vr3, $vr0 - vst $vr1, $t2, 0 - vilvh.w $vr1, $vr3, $vr2 - vst $vr1, $t2, 240 - vldx $vr1, $t4, $s7 - vilvl.h $vr0, $vr3, $vr0 - vilvl.w $vr2, $vr3, $vr2 - vst $vr2, $t2, 224 - vilvh.w $vr2, $vr3, $vr0 - vst $vr2, $t2, 208 - vilvh.h $vr2, $vr3, $vr1 - vilvl.w $vr0, $vr3, $vr0 - vst $vr0, $t2, 192 - vilvh.w $vr0, $vr3, $vr2 - vst $vr0, $t2, 176 - vilvl.h $vr0, $vr3, $vr1 - vilvl.w $vr1, $vr3, $vr2 - vst $vr1, $t2, 160 - vilvh.w $vr1, $vr3, $vr0 - vst $vr1, $t2, 144 - vilvl.w $vr0, $vr3, $vr0 - vst $vr0, $t2, 128 + vst $vr0, $t2, -240 + vst $vr20, $t2, -64 + vst $vr17, $t2, -128 + vst $vr19, $t2, -16 + vst $vr18, $t2, -32 + vst $vr14, $t2, -48 + vst $vr10, $t2, -80 + vst $vr8, $t2, -96 + vst $vr6, $t2, -112 + vst $vr12, $t2, 64 + vst $vr15, $t2, 0 + vst $vr25, $t2, 112 + vst $vr24, $t2, 96 + vst $vr23, $t2, 80 + vst $vr22, $t2, 48 + vst $vr21, $t2, 32 + vst $vr16, $t2, 16 + vst $vr2, $t2, 192 + vst $vr3, $t2, 128 + vst $vr31, $t2, 240 + vst $vr30, $t2, 224 + vst $vr29, $t2, 208 + vst $vr28, $t2, 176 + vst $vr27, $t2, 160 + vst $vr26, $t2, 144 addi.d $t3, $t3, 128 addi.d $t2, $t2, 512 bnez $t3, .LBB61_17 @@ -25077,10 +25679,10 @@ _Z62benchForTruncOrZextVecInLoopWithVW16From_uint16_t_To_uint64_t_RN9benchmark5S jirl $ra, $ra, 0 .Ltmp1346: # EH_LABEL # %bb.20: # %_ZNSt10unique_ptrIA_mSt14default_deleteIS0_EED2Ev.exit36 - ld.d $a0, $sp, 56 # 8-byte Folded Reload + ld.d $a0, $sp, 40 # 8-byte Folded Reload pcaddu18i $ra, %call36(_ZdaPv) jirl $ra, $ra, 0 - ld.d $a0, $sp, 72 + ld.d $a0, $sp, 56 beqz $a0, .LBB61_22 # %bb.21: # 
%_ZNKSt14default_deleteIA_mEclImEENSt9enable_ifIXsr14is_convertibleIPA_T_PS0_EE5valueEvE4typeEPS4_.exit.i38 pcaddu18i $ra, %call36(_ZdaPv) @@ -25089,18 +25691,26 @@ _Z62benchForTruncOrZextVecInLoopWithVW16From_uint16_t_To_uint64_t_RN9benchmark5S move $a0, $fp pcaddu18i $ra, %call36(_ZdaPv) jirl $ra, $ra, 0 - ld.d $s8, $sp, 88 # 8-byte Folded Reload - ld.d $s7, $sp, 96 # 8-byte Folded Reload - ld.d $s6, $sp, 104 # 8-byte Folded Reload - ld.d $s5, $sp, 112 # 8-byte Folded Reload - ld.d $s4, $sp, 120 # 8-byte Folded Reload - ld.d $s3, $sp, 128 # 8-byte Folded Reload - ld.d $s2, $sp, 136 # 8-byte Folded Reload - ld.d $s1, $sp, 144 # 8-byte Folded Reload - ld.d $s0, $sp, 152 # 8-byte Folded Reload - ld.d $fp, $sp, 160 # 8-byte Folded Reload - ld.d $ra, $sp, 168 # 8-byte Folded Reload - addi.d $sp, $sp, 176 + fld.d $fs7, $sp, 72 # 8-byte Folded Reload + fld.d $fs6, $sp, 80 # 8-byte Folded Reload + fld.d $fs5, $sp, 88 # 8-byte Folded Reload + fld.d $fs4, $sp, 96 # 8-byte Folded Reload + fld.d $fs3, $sp, 104 # 8-byte Folded Reload + fld.d $fs2, $sp, 112 # 8-byte Folded Reload + fld.d $fs1, $sp, 120 # 8-byte Folded Reload + fld.d $fs0, $sp, 128 # 8-byte Folded Reload + ld.d $s8, $sp, 136 # 8-byte Folded Reload + ld.d $s7, $sp, 144 # 8-byte Folded Reload + ld.d $s6, $sp, 152 # 8-byte Folded Reload + ld.d $s5, $sp, 160 # 8-byte Folded Reload + ld.d $s4, $sp, 168 # 8-byte Folded Reload + ld.d $s3, $sp, 176 # 8-byte Folded Reload + ld.d $s2, $sp, 184 # 8-byte Folded Reload + ld.d $s1, $sp, 192 # 8-byte Folded Reload + ld.d $s0, $sp, 200 # 8-byte Folded Reload + ld.d $fp, $sp, 208 # 8-byte Folded Reload + ld.d $ra, $sp, 216 # 8-byte Folded Reload + addi.d $sp, $sp, 224 ret .LBB61_23: .Ltmp1328: # EH_LABEL @@ -25152,7 +25762,7 @@ _Z62benchForTruncOrZextVecInLoopWithVW16From_uint16_t_To_uint64_t_RN9benchmark5S jirl $ra, $ra, 0 .Ltmp1337: # EH_LABEL # %bb.28: # %_ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc.exit21 - ld.d $a0, $sp, 56 # 8-byte Folded Reload + ld.d $a0, $sp, 40 # 8-byte Folded Reload ldx.d $a1, $a0, $s0 .Ltmp1338: # EH_LABEL move $a0, $s1 @@ -25193,10 +25803,10 @@ _Z62benchForTruncOrZextVecInLoopWithVW16From_uint16_t_To_uint64_t_RN9benchmark5S .Ltmp1327: # EH_LABEL .LBB61_36: move $s1, $a0 - ld.d $a0, $sp, 56 # 8-byte Folded Reload + ld.d $a0, $sp, 40 # 8-byte Folded Reload pcaddu18i $ra, %call36(_ZdaPv) jirl $ra, $ra, 0 - ld.d $a0, $sp, 72 + ld.d $a0, $sp, 56 move $s2, $a0 beqz $a0, .LBB61_38 .LBB61_37: # %_ZNKSt14default_deleteIA_mEclImEENSt9enable_ifIXsr14is_convertibleIPA_T_PS0_EE5valueEvE4typeEPS4_.exit.i31 @@ -25267,20 +25877,20 @@ _Z54benchForTruncOrZextVecInLoopFrom_uint16_t_To_uint64_t_RN9benchmark5StateE: # .cfi_personality 155, DW.ref.__gxx_personality_v0 .cfi_lsda 27, .Lexception62 # %bb.0: - addi.d $sp, $sp, -128 - .cfi_def_cfa_offset 128 - st.d $ra, $sp, 120 # 8-byte Folded Spill - st.d $fp, $sp, 112 # 8-byte Folded Spill - st.d $s0, $sp, 104 # 8-byte Folded Spill - st.d $s1, $sp, 96 # 8-byte Folded Spill - st.d $s2, $sp, 88 # 8-byte Folded Spill - st.d $s3, $sp, 80 # 8-byte Folded Spill - st.d $s4, $sp, 72 # 8-byte Folded Spill - st.d $s5, $sp, 64 # 8-byte Folded Spill - st.d $s6, $sp, 56 # 8-byte Folded Spill - st.d $s7, $sp, 48 # 8-byte Folded Spill - .cfi_offset 1, -8 - .cfi_offset 22, -16 + addi.d $sp, $sp, -112 + .cfi_def_cfa_offset 112 + st.d $ra, $sp, 104 # 8-byte Folded Spill + st.d $fp, $sp, 96 # 8-byte Folded Spill + st.d $s0, $sp, 88 # 8-byte Folded Spill + st.d $s1, $sp, 80 # 8-byte Folded Spill + st.d $s2, $sp, 72 # 8-byte Folded Spill + st.d $s3, $sp, 
64 # 8-byte Folded Spill + st.d $s4, $sp, 56 # 8-byte Folded Spill + st.d $s5, $sp, 48 # 8-byte Folded Spill + st.d $s6, $sp, 40 # 8-byte Folded Spill + st.d $s7, $sp, 32 # 8-byte Folded Spill + .cfi_offset 1, -8 + .cfi_offset 22, -16 .cfi_offset 23, -24 .cfi_offset 24, -32 .cfi_offset 25, -40 @@ -25305,7 +25915,7 @@ _Z54benchForTruncOrZextVecInLoopFrom_uint16_t_To_uint64_t_RN9benchmark5StateE: # .Ltmp1349: # EH_LABEL # %bb.1: move $s3, $a0 - st.d $a0, $sp, 32 + st.d $a0, $sp, 16 .Ltmp1351: # EH_LABEL move $a0, $s0 pcaddu18i $ra, %call36(_Znam) @@ -25315,7 +25925,7 @@ _Z54benchForTruncOrZextVecInLoopFrom_uint16_t_To_uint64_t_RN9benchmark5StateE: # move $s0, $a0 lu12i.w $a0, -16 lu32i.d $a0, 0 - st.w $a0, $sp, 44 + st.w $a0, $sp, 28 lu12i.w $s5, -5 ori $s7, $s5, 480 pcalau12i $a0, %pc_hi20(_ZL3rng) @@ -25323,8 +25933,8 @@ _Z54benchForTruncOrZextVecInLoopFrom_uint16_t_To_uint64_t_RN9benchmark5StateE: # .p2align 4, , 16 .LBB62_3: # =>This Inner Loop Header: Depth=1 .Ltmp1354: # EH_LABEL - addi.d $a0, $sp, 44 - addi.d $a2, $sp, 44 + addi.d $a0, $sp, 28 + addi.d $a2, $sp, 28 move $a1, $s4 pcaddu18i $ra, %call36(_ZNSt24uniform_int_distributionItEclISt23mersenne_twister_engineImLm32ELm624ELm397ELm31ELm2567483615ELm11ELm4294967295ELm7ELm2636928640ELm15ELm4022730752ELm18ELm1812433253EEEEtRT_RKNS0_10param_typeE) jirl $ra, $ra, 0 @@ -25337,7 +25947,6 @@ _Z54benchForTruncOrZextVecInLoopFrom_uint16_t_To_uint64_t_RN9benchmark5StateE: # bnez $s7, .LBB62_3 # %bb.5: # %vector.ph ori $a0, $s5, 480 - vrepli.b $vr2, 0 move $a1, $s0 .p2align 4, , 16 .LBB62_6: # %vector.body @@ -25345,8 +25954,8 @@ _Z54benchForTruncOrZextVecInLoopFrom_uint16_t_To_uint64_t_RN9benchmark5StateE: # add.d $a2, $fp, $a0 ldx.w $a2, $a2, $s1 vinsgr2vr.w $vr0, $a2, 0 - vilvl.h $vr0, $vr2, $vr0 - vilvl.w $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.du.wu $vr0, $vr0, 0 vst $vr0, $a1, 0 addi.d $a0, $a0, 4 addi.d $a1, $a1, 16 @@ -25363,10 +25972,10 @@ _Z54benchForTruncOrZextVecInLoopFrom_uint16_t_To_uint64_t_RN9benchmark5StateE: # ldx.w $a2, $a2, $s4 vinsgr2vr.w $vr0, $a3, 0 vinsgr2vr.w $vr1, $a2, 0 - vilvl.h $vr0, $vr2, $vr0 - vilvl.w $vr0, $vr2, $vr0 - vilvl.h $vr1, $vr2, $vr1 - vilvl.w $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.du.wu $vr0, $vr0, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vsllwil.du.wu $vr1, $vr1, 0 vst $vr0, $a0, -16 vst $vr1, $a0, 0 addi.d $a1, $a1, 8 @@ -25387,7 +25996,6 @@ _Z54benchForTruncOrZextVecInLoopFrom_uint16_t_To_uint64_t_RN9benchmark5StateE: # addi.d $s6, $s6, 8 bnez $s7, .LBB62_10 # %bb.12: - vst $vr2, $sp, 16 # 16-byte Folded Spill ld.w $s6, $s2, 28 ld.d $s3, $s2, 16 .Ltmp1372: # EH_LABEL @@ -25396,12 +26004,11 @@ _Z54benchForTruncOrZextVecInLoopFrom_uint16_t_To_uint64_t_RN9benchmark5StateE: # jirl $ra, $ra, 0 .Ltmp1373: # EH_LABEL # %bb.13: # %_ZN9benchmark5State3endEv.exit.preheader - vld $vr2, $sp, 16 # 16-byte Folded Reload bnez $s6, .LBB62_19 # %bb.14: # %_ZN9benchmark5State3endEv.exit.preheader beqz $s3, .LBB62_19 # %bb.15: - addi.d $a0, $sp, 32 + addi.d $a0, $sp, 16 ori $a1, $s5, 480 .p2align 4, , 16 .LBB62_16: # %.lr.ph @@ -25410,7 +26017,7 @@ _Z54benchForTruncOrZextVecInLoopFrom_uint16_t_To_uint64_t_RN9benchmark5StateE: # #APP #NO_APP #MEMBARRIER - ld.d $a2, $sp, 32 + ld.d $a2, $sp, 16 addi.d $a2, $a2, 16 move $a3, $a1 .p2align 4, , 16 @@ -25422,10 +26029,10 @@ _Z54benchForTruncOrZextVecInLoopFrom_uint16_t_To_uint64_t_RN9benchmark5StateE: # ldx.w $a4, $a4, $s4 vinsgr2vr.w $vr0, $a5, 0 vinsgr2vr.w $vr1, $a4, 0 - vilvl.h $vr0, $vr2, $vr0 - vilvl.w $vr0, $vr2, $vr0 - vilvl.h $vr1, 
$vr2, $vr1 - vilvl.w $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.du.wu $vr0, $vr0, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vsllwil.du.wu $vr1, $vr1, 0 vst $vr0, $a2, -16 vst $vr1, $a2, 0 addi.d $a3, $a3, 8 @@ -25445,7 +26052,7 @@ _Z54benchForTruncOrZextVecInLoopFrom_uint16_t_To_uint64_t_RN9benchmark5StateE: # move $a0, $s0 pcaddu18i $ra, %call36(_ZdaPv) jirl $ra, $ra, 0 - ld.d $a0, $sp, 32 + ld.d $a0, $sp, 16 beqz $a0, .LBB62_22 # %bb.21: # %_ZNKSt14default_deleteIA_mEclImEENSt9enable_ifIXsr14is_convertibleIPA_T_PS0_EE5valueEvE4typeEPS4_.exit.i38 pcaddu18i $ra, %call36(_ZdaPv) @@ -25454,17 +26061,17 @@ _Z54benchForTruncOrZextVecInLoopFrom_uint16_t_To_uint64_t_RN9benchmark5StateE: # move $a0, $fp pcaddu18i $ra, %call36(_ZdaPv) jirl $ra, $ra, 0 - ld.d $s7, $sp, 48 # 8-byte Folded Reload - ld.d $s6, $sp, 56 # 8-byte Folded Reload - ld.d $s5, $sp, 64 # 8-byte Folded Reload - ld.d $s4, $sp, 72 # 8-byte Folded Reload - ld.d $s3, $sp, 80 # 8-byte Folded Reload - ld.d $s2, $sp, 88 # 8-byte Folded Reload - ld.d $s1, $sp, 96 # 8-byte Folded Reload - ld.d $s0, $sp, 104 # 8-byte Folded Reload - ld.d $fp, $sp, 112 # 8-byte Folded Reload - ld.d $ra, $sp, 120 # 8-byte Folded Reload - addi.d $sp, $sp, 128 + ld.d $s7, $sp, 32 # 8-byte Folded Reload + ld.d $s6, $sp, 40 # 8-byte Folded Reload + ld.d $s5, $sp, 48 # 8-byte Folded Reload + ld.d $s4, $sp, 56 # 8-byte Folded Reload + ld.d $s3, $sp, 64 # 8-byte Folded Reload + ld.d $s2, $sp, 72 # 8-byte Folded Reload + ld.d $s1, $sp, 80 # 8-byte Folded Reload + ld.d $s0, $sp, 88 # 8-byte Folded Reload + ld.d $fp, $sp, 96 # 8-byte Folded Reload + ld.d $ra, $sp, 104 # 8-byte Folded Reload + addi.d $sp, $sp, 112 ret .LBB62_23: .Ltmp1357: # EH_LABEL @@ -25556,7 +26163,7 @@ _Z54benchForTruncOrZextVecInLoopFrom_uint16_t_To_uint64_t_RN9benchmark5StateE: # move $a0, $s0 pcaddu18i $ra, %call36(_ZdaPv) jirl $ra, $ra, 0 - ld.d $s3, $sp, 32 + ld.d $s3, $sp, 16 beqz $s3, .LBB62_38 .LBB62_37: # %_ZNKSt14default_deleteIA_mEclImEENSt9enable_ifIXsr14is_convertibleIPA_T_PS0_EE5valueEvE4typeEPS4_.exit.i31 move $a0, $s3 @@ -25631,19 +26238,27 @@ _Z68benchForTruncOrZextVecWithAddInLoopWithVW8From_uint16_t_To_uint64_t_RN9bench .cfi_personality 155, DW.ref.__gxx_personality_v0 .cfi_lsda 27, .Lexception63 # %bb.0: - addi.d $sp, $sp, -128 - .cfi_def_cfa_offset 128 - st.d $ra, $sp, 120 # 8-byte Folded Spill - st.d $fp, $sp, 112 # 8-byte Folded Spill - st.d $s0, $sp, 104 # 8-byte Folded Spill - st.d $s1, $sp, 96 # 8-byte Folded Spill - st.d $s2, $sp, 88 # 8-byte Folded Spill - st.d $s3, $sp, 80 # 8-byte Folded Spill - st.d $s4, $sp, 72 # 8-byte Folded Spill - st.d $s5, $sp, 64 # 8-byte Folded Spill - st.d $s6, $sp, 56 # 8-byte Folded Spill - st.d $s7, $sp, 48 # 8-byte Folded Spill - st.d $s8, $sp, 40 # 8-byte Folded Spill + addi.d $sp, $sp, -192 + .cfi_def_cfa_offset 192 + st.d $ra, $sp, 184 # 8-byte Folded Spill + st.d $fp, $sp, 176 # 8-byte Folded Spill + st.d $s0, $sp, 168 # 8-byte Folded Spill + st.d $s1, $sp, 160 # 8-byte Folded Spill + st.d $s2, $sp, 152 # 8-byte Folded Spill + st.d $s3, $sp, 144 # 8-byte Folded Spill + st.d $s4, $sp, 136 # 8-byte Folded Spill + st.d $s5, $sp, 128 # 8-byte Folded Spill + st.d $s6, $sp, 120 # 8-byte Folded Spill + st.d $s7, $sp, 112 # 8-byte Folded Spill + st.d $s8, $sp, 104 # 8-byte Folded Spill + fst.d $fs0, $sp, 96 # 8-byte Folded Spill + fst.d $fs1, $sp, 88 # 8-byte Folded Spill + fst.d $fs2, $sp, 80 # 8-byte Folded Spill + fst.d $fs3, $sp, 72 # 8-byte Folded Spill + fst.d $fs4, $sp, 64 # 8-byte Folded Spill + fst.d $fs5, $sp, 56 # 8-byte 
Folded Spill + fst.d $fs6, $sp, 48 # 8-byte Folded Spill + fst.d $fs7, $sp, 40 # 8-byte Folded Spill .cfi_offset 1, -8 .cfi_offset 22, -16 .cfi_offset 23, -24 @@ -25655,6 +26270,14 @@ _Z68benchForTruncOrZextVecWithAddInLoopWithVW8From_uint16_t_To_uint64_t_RN9bench .cfi_offset 29, -72 .cfi_offset 30, -80 .cfi_offset 31, -88 + .cfi_offset 56, -96 + .cfi_offset 57, -104 + .cfi_offset 58, -112 + .cfi_offset 59, -120 + .cfi_offset 60, -128 + .cfi_offset 61, -136 + .cfi_offset 62, -144 + .cfi_offset 63, -152 move $s0, $a0 lu12i.w $s6, 4 ori $s3, $s6, 3616 @@ -25739,7 +26362,6 @@ _Z68benchForTruncOrZextVecWithAddInLoopWithVW8From_uint16_t_To_uint64_t_RN9bench ori $a5, $s6, 3552 ori $a6, $s6, 3568 ori $a7, $s6, 3584 - vrepli.b $vr0, 0 ori $t0, $s6, 3588 ori $t1, $s6, 3592 ori $t2, $s6, 3596 @@ -25763,82 +26385,102 @@ _Z68benchForTruncOrZextVecWithAddInLoopWithVW8From_uint16_t_To_uint64_t_RN9bench # Parent Loop BB63_11 Depth=1 # => This Inner Loop Header: Depth=2 add.d $s3, $a0, $s2 - vldx $vr1, $s3, $a4 - vldx $vr2, $s3, $a5 - vldx $vr3, $s3, $a6 - vldx $vr4, $s3, $a7 - vilvh.h $vr5, $vr0, $vr1 - vilvl.w $vr6, $vr0, $vr5 - vilvh.w $vr5, $vr0, $vr5 - vilvl.h $vr1, $vr0, $vr1 - vilvl.w $vr7, $vr0, $vr1 - vilvh.w $vr1, $vr0, $vr1 - vilvh.h $vr8, $vr0, $vr2 - vilvl.w $vr9, $vr0, $vr8 - vilvh.w $vr8, $vr0, $vr8 - vilvl.h $vr2, $vr0, $vr2 - vilvl.w $vr10, $vr0, $vr2 - vilvh.w $vr2, $vr0, $vr2 - vilvh.h $vr11, $vr0, $vr3 - vilvl.w $vr12, $vr0, $vr11 - vilvh.w $vr11, $vr0, $vr11 - vilvl.h $vr3, $vr0, $vr3 - vilvl.w $vr13, $vr0, $vr3 - vilvh.w $vr3, $vr0, $vr3 - vilvh.h $vr14, $vr0, $vr4 - vilvl.h $vr4, $vr0, $vr4 - vilvl.w $vr15, $vr0, $vr4 - vld $vr16, $t8, -112 - vilvh.w $vr4, $vr0, $vr4 - vld $vr17, $t8, -128 + vldx $vr0, $s3, $a4 + vldx $vr1, $s3, $a5 + vldx $vr2, $s3, $a6 + vldx $vr3, $s3, $a7 + vsllwil.wu.hu $vr4, $vr0, 0 + vsllwil.du.wu $vr4, $vr4, 0 + vbsrl.v $vr5, $vr0, 8 + vsllwil.wu.hu $vr5, $vr5, 0 + vsllwil.du.wu $vr5, $vr5, 0 + vbsrl.v $vr6, $vr0, 12 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.du.wu $vr6, $vr6, 0 + vshuf4i.h $vr0, $vr0, 14 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.du.wu $vr0, $vr0, 0 + vsllwil.wu.hu $vr7, $vr1, 0 + vsllwil.du.wu $vr7, $vr7, 0 + vbsrl.v $vr8, $vr1, 8 + vsllwil.wu.hu $vr8, $vr8, 0 + vsllwil.du.wu $vr8, $vr8, 0 + vbsrl.v $vr9, $vr1, 12 + vsllwil.wu.hu $vr9, $vr9, 0 + vsllwil.du.wu $vr9, $vr9, 0 + vshuf4i.h $vr1, $vr1, 14 + vsllwil.wu.hu $vr1, $vr1, 0 + vsllwil.du.wu $vr1, $vr1, 0 + vsllwil.wu.hu $vr10, $vr2, 0 + vsllwil.du.wu $vr10, $vr10, 0 + vbsrl.v $vr11, $vr2, 8 + vsllwil.wu.hu $vr11, $vr11, 0 + vsllwil.du.wu $vr11, $vr11, 0 + vbsrl.v $vr12, $vr2, 12 + vsllwil.wu.hu $vr12, $vr12, 0 + vsllwil.du.wu $vr12, $vr12, 0 + vshuf4i.h $vr2, $vr2, 14 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.du.wu $vr2, $vr2, 0 + vsllwil.wu.hu $vr13, $vr3, 0 + vsllwil.du.wu $vr13, $vr13, 0 + vbsrl.v $vr14, $vr3, 8 + vsllwil.wu.hu $vr14, $vr14, 0 + vsllwil.du.wu $vr14, $vr14, 0 + vbsrl.v $vr15, $vr3, 12 + vsllwil.wu.hu $vr15, $vr15, 0 + vsllwil.du.wu $vr15, $vr15, 0 + vshuf4i.h $vr3, $vr3, 14 + vsllwil.wu.hu $vr3, $vr3, 0 + vsllwil.du.wu $vr3, $vr3, 0 + vld $vr16, $t8, -128 + vld $vr17, $t8, -96 vld $vr18, $t8, -80 - vadd.d $vr1, $vr16, $vr1 - vld $vr16, $t8, -96 - vadd.d $vr7, $vr17, $vr7 - vld $vr17, $t8, -48 - vadd.d $vr5, $vr18, $vr5 - vld $vr18, $t8, -64 - vadd.d $vr6, $vr16, $vr6 - vld $vr16, $t8, -16 - vadd.d $vr2, $vr17, $vr2 - vld $vr17, $t8, -32 - vadd.d $vr10, $vr18, $vr10 - vld $vr18, $t8, 16 - vadd.d $vr8, $vr16, $vr8 - vld $vr16, $t8, 0 - vadd.d $vr9, $vr17, $vr9 - vld $vr17, 
$t8, 48 - vadd.d $vr3, $vr18, $vr3 - vld $vr18, $t8, 32 - vadd.d $vr13, $vr16, $vr13 - vld $vr16, $t8, 80 - vadd.d $vr11, $vr17, $vr11 - vld $vr17, $t8, 64 - vadd.d $vr12, $vr18, $vr12 - vld $vr18, $t8, 112 + vld $vr19, $t8, -112 + vld $vr20, $t8, -64 + vld $vr21, $t8, -32 + vld $vr22, $t8, -16 + vld $vr23, $t8, -48 + vld $vr24, $t8, 0 + vld $vr25, $t8, 32 + vld $vr26, $t8, 48 + vld $vr27, $t8, 16 + vld $vr28, $t8, 64 + vld $vr29, $t8, 96 + vld $vr30, $t8, 112 + vld $vr31, $t8, 80 + vadd.d $vr0, $vr19, $vr0 + vadd.d $vr6, $vr18, $vr6 + vadd.d $vr5, $vr17, $vr5 vadd.d $vr4, $vr16, $vr4 - vld $vr16, $t8, 96 - vadd.d $vr15, $vr17, $vr15 - vilvh.w $vr17, $vr0, $vr14 - vadd.d $vr17, $vr18, $vr17 - vilvl.w $vr14, $vr0, $vr14 - vadd.d $vr14, $vr16, $vr14 - vst $vr6, $t8, -96 - vst $vr5, $t8, -80 - vst $vr7, $t8, -128 - vst $vr1, $t8, -112 - vst $vr9, $t8, -32 - vst $vr8, $t8, -16 - vst $vr10, $t8, -64 - vst $vr2, $t8, -48 - vst $vr12, $t8, 32 - vst $vr11, $t8, 48 - vst $vr13, $t8, 0 - vst $vr3, $t8, 16 + vadd.d $vr1, $vr23, $vr1 + vadd.d $vr9, $vr22, $vr9 + vadd.d $vr8, $vr21, $vr8 + vadd.d $vr7, $vr20, $vr7 + vadd.d $vr2, $vr27, $vr2 + vadd.d $vr12, $vr26, $vr12 + vadd.d $vr11, $vr25, $vr11 + vadd.d $vr10, $vr24, $vr10 + vadd.d $vr3, $vr31, $vr3 + vadd.d $vr15, $vr30, $vr15 + vadd.d $vr14, $vr29, $vr14 + vadd.d $vr13, $vr28, $vr13 + vst $vr4, $t8, -128 + vst $vr5, $t8, -96 + vst $vr6, $t8, -80 + vst $vr0, $t8, -112 + vst $vr7, $t8, -64 + vst $vr8, $t8, -32 + vst $vr9, $t8, -16 + vst $vr1, $t8, -48 + vst $vr10, $t8, 0 + vst $vr11, $t8, 32 + vst $vr12, $t8, 48 + vst $vr2, $t8, 16 + vst $vr13, $t8, 64 vst $vr14, $t8, 96 - vst $vr17, $t8, 112 - vst $vr15, $t8, 64 - vst $vr4, $t8, 80 + vst $vr15, $t8, 112 + vst $vr3, $t8, 80 addi.d $s2, $s2, 64 addi.d $t8, $t8, 256 bnez $s2, .LBB63_12 @@ -25926,18 +26568,26 @@ _Z68benchForTruncOrZextVecWithAddInLoopWithVW8From_uint16_t_To_uint64_t_RN9bench move $a0, $fp pcaddu18i $ra, %call36(_ZdaPv) jirl $ra, $ra, 0 - ld.d $s8, $sp, 40 # 8-byte Folded Reload - ld.d $s7, $sp, 48 # 8-byte Folded Reload - ld.d $s6, $sp, 56 # 8-byte Folded Reload - ld.d $s5, $sp, 64 # 8-byte Folded Reload - ld.d $s4, $sp, 72 # 8-byte Folded Reload - ld.d $s3, $sp, 80 # 8-byte Folded Reload - ld.d $s2, $sp, 88 # 8-byte Folded Reload - ld.d $s1, $sp, 96 # 8-byte Folded Reload - ld.d $s0, $sp, 104 # 8-byte Folded Reload - ld.d $fp, $sp, 112 # 8-byte Folded Reload - ld.d $ra, $sp, 120 # 8-byte Folded Reload - addi.d $sp, $sp, 128 + fld.d $fs7, $sp, 40 # 8-byte Folded Reload + fld.d $fs6, $sp, 48 # 8-byte Folded Reload + fld.d $fs5, $sp, 56 # 8-byte Folded Reload + fld.d $fs4, $sp, 64 # 8-byte Folded Reload + fld.d $fs3, $sp, 72 # 8-byte Folded Reload + fld.d $fs2, $sp, 80 # 8-byte Folded Reload + fld.d $fs1, $sp, 88 # 8-byte Folded Reload + fld.d $fs0, $sp, 96 # 8-byte Folded Reload + ld.d $s8, $sp, 104 # 8-byte Folded Reload + ld.d $s7, $sp, 112 # 8-byte Folded Reload + ld.d $s6, $sp, 120 # 8-byte Folded Reload + ld.d $s5, $sp, 128 # 8-byte Folded Reload + ld.d $s4, $sp, 136 # 8-byte Folded Reload + ld.d $s3, $sp, 144 # 8-byte Folded Reload + ld.d $s2, $sp, 152 # 8-byte Folded Reload + ld.d $s1, $sp, 160 # 8-byte Folded Reload + ld.d $s0, $sp, 168 # 8-byte Folded Reload + ld.d $fp, $sp, 176 # 8-byte Folded Reload + ld.d $ra, $sp, 184 # 8-byte Folded Reload + addi.d $sp, $sp, 192 ret .LBB63_18: .Ltmp1379: # EH_LABEL @@ -26026,27 +26676,27 @@ _Z69benchForTruncOrZextVecWithAddInLoopWithVW16From_uint16_t_To_uint64_t_RN9benc .cfi_personality 155, DW.ref.__gxx_personality_v0 .cfi_lsda 27, 
.Lexception64 # %bb.0: - addi.d $sp, $sp, -272 - .cfi_def_cfa_offset 272 - st.d $ra, $sp, 264 # 8-byte Folded Spill - st.d $fp, $sp, 256 # 8-byte Folded Spill - st.d $s0, $sp, 248 # 8-byte Folded Spill - st.d $s1, $sp, 240 # 8-byte Folded Spill - st.d $s2, $sp, 232 # 8-byte Folded Spill - st.d $s3, $sp, 224 # 8-byte Folded Spill - st.d $s4, $sp, 216 # 8-byte Folded Spill - st.d $s5, $sp, 208 # 8-byte Folded Spill - st.d $s6, $sp, 200 # 8-byte Folded Spill - st.d $s7, $sp, 192 # 8-byte Folded Spill - st.d $s8, $sp, 184 # 8-byte Folded Spill - fst.d $fs0, $sp, 176 # 8-byte Folded Spill - fst.d $fs1, $sp, 168 # 8-byte Folded Spill - fst.d $fs2, $sp, 160 # 8-byte Folded Spill - fst.d $fs3, $sp, 152 # 8-byte Folded Spill - fst.d $fs4, $sp, 144 # 8-byte Folded Spill - fst.d $fs5, $sp, 136 # 8-byte Folded Spill - fst.d $fs6, $sp, 128 # 8-byte Folded Spill - fst.d $fs7, $sp, 120 # 8-byte Folded Spill + addi.d $sp, $sp, -240 + .cfi_def_cfa_offset 240 + st.d $ra, $sp, 232 # 8-byte Folded Spill + st.d $fp, $sp, 224 # 8-byte Folded Spill + st.d $s0, $sp, 216 # 8-byte Folded Spill + st.d $s1, $sp, 208 # 8-byte Folded Spill + st.d $s2, $sp, 200 # 8-byte Folded Spill + st.d $s3, $sp, 192 # 8-byte Folded Spill + st.d $s4, $sp, 184 # 8-byte Folded Spill + st.d $s5, $sp, 176 # 8-byte Folded Spill + st.d $s6, $sp, 168 # 8-byte Folded Spill + st.d $s7, $sp, 160 # 8-byte Folded Spill + st.d $s8, $sp, 152 # 8-byte Folded Spill + fst.d $fs0, $sp, 144 # 8-byte Folded Spill + fst.d $fs1, $sp, 136 # 8-byte Folded Spill + fst.d $fs2, $sp, 128 # 8-byte Folded Spill + fst.d $fs3, $sp, 120 # 8-byte Folded Spill + fst.d $fs4, $sp, 112 # 8-byte Folded Spill + fst.d $fs5, $sp, 104 # 8-byte Folded Spill + fst.d $fs6, $sp, 96 # 8-byte Folded Spill + fst.d $fs7, $sp, 88 # 8-byte Folded Spill .cfi_offset 1, -8 .cfi_offset 22, -16 .cfi_offset 23, -24 @@ -26084,10 +26734,10 @@ _Z69benchForTruncOrZextVecWithAddInLoopWithVW16From_uint16_t_To_uint64_t_RN9benc move $s1, $a0 ori $a0, $s6, 3586 add.d $s5, $fp, $a0 - st.d $s1, $sp, 88 + st.d $s1, $sp, 56 lu12i.w $a0, -16 lu32i.d $a0, 0 - st.w $a0, $sp, 96 + st.w $a0, $sp, 64 lu12i.w $s8, -5 ori $s0, $s8, 480 pcalau12i $a0, %pc_hi20(_ZL3rng) @@ -26095,8 +26745,8 @@ _Z69benchForTruncOrZextVecWithAddInLoopWithVW16From_uint16_t_To_uint64_t_RN9benc .p2align 4, , 16 .LBB64_2: # =>This Inner Loop Header: Depth=1 .Ltmp1394: # EH_LABEL - addi.d $a0, $sp, 96 - addi.d $a2, $sp, 96 + addi.d $a0, $sp, 64 + addi.d $a2, $sp, 64 move $a1, $s4 pcaddu18i $ra, %call36(_ZNSt24uniform_int_distributionItEclISt23mersenne_twister_engineImLm32ELm624ELm397ELm31ELm2567483615ELm11ELm4294967295ELm7ELm2636928640ELm15ELm4022730752ELm18ELm1812433253EEEEtRT_RKNS0_10param_typeE) jirl $ra, $ra, 0 @@ -26110,15 +26760,15 @@ _Z69benchForTruncOrZextVecWithAddInLoopWithVW16From_uint16_t_To_uint64_t_RN9benc # %bb.4: pcalau12i $a0, %pc_hi20(.LCPI64_0) vld $vr0, $a0, %pc_lo12(.LCPI64_0) - vst $vr0, $sp, 96 + vst $vr0, $sp, 64 pcalau12i $a0, %pc_hi20(_ZL3rng) addi.d $s3, $a0, %pc_lo12(_ZL3rng) move $s0, $zero .p2align 4, , 16 .LBB64_5: # =>This Inner Loop Header: Depth=1 .Ltmp1397: # EH_LABEL - addi.d $a0, $sp, 96 - addi.d $a2, $sp, 96 + addi.d $a0, $sp, 64 + addi.d $a2, $sp, 64 move $a1, $s3 pcaddu18i $ra, %call36(_ZNSt24uniform_int_distributionImEclISt23mersenne_twister_engineImLm32ELm624ELm397ELm31ELm2567483615ELm11ELm4294967295ELm7ELm2636928640ELm15ELm4022730752ELm18ELm1812433253EEEEmRT_RKNS0_10param_typeE) jirl $ra, $ra, 0 @@ -26142,7 +26792,7 @@ 
_Z69benchForTruncOrZextVecWithAddInLoopWithVW16From_uint16_t_To_uint64_t_RN9benc beqz $s1, .LBB64_14 # %bb.10: # %.lr.ph.preheader addi.d $a0, $fp, 96 - addi.d $a1, $sp, 88 + addi.d $a1, $sp, 56 ori $a2, $s7, 2048 ori $a3, $s8, 512 ori $a4, $s6, 3488 @@ -26153,7 +26803,6 @@ _Z69benchForTruncOrZextVecWithAddInLoopWithVW16From_uint16_t_To_uint64_t_RN9benc ori $t1, $s6, 3568 ori $t2, $s6, 3584 ori $t3, $s6, 3600 - vrepli.b $vr0, 0 ori $t4, $s6, 3588 ori $t5, $s6, 3592 ori $t6, $s6, 3596 @@ -26167,7 +26816,7 @@ _Z69benchForTruncOrZextVecWithAddInLoopWithVW16From_uint16_t_To_uint64_t_RN9benc #APP #NO_APP #MEMBARRIER - ld.d $s0, $sp, 88 + ld.d $s0, $sp, 56 add.d $s3, $s0, $a2 addi.d $s4, $s0, 256 move $s6, $a3 @@ -26176,166 +26825,202 @@ _Z69benchForTruncOrZextVecWithAddInLoopWithVW16From_uint16_t_To_uint64_t_RN9benc # Parent Loop BB64_11 Depth=1 # => This Inner Loop Header: Depth=2 add.d $s0, $a0, $s6 - vldx $vr2, $s0, $a4 - vldx $vr3, $s0, $a5 - vldx $vr4, $s0, $a6 - vldx $vr7, $s0, $a7 - vldx $vr9, $s0, $t0 - vldx $vr10, $s0, $t1 - vldx $vr29, $s0, $t2 - vldx $vr1, $s0, $t3 - vst $vr1, $sp, 64 # 16-byte Folded Spill - vilvh.h $vr5, $vr0, $vr3 - vilvl.w $vr8, $vr0, $vr5 - vilvh.w $vr12, $vr0, $vr5 - vilvl.h $vr3, $vr0, $vr3 - vilvl.w $vr16, $vr0, $vr3 - vilvh.w $vr18, $vr0, $vr3 - vilvh.h $vr3, $vr0, $vr2 - vilvl.w $vr22, $vr0, $vr3 - vilvh.w $vr24, $vr0, $vr3 - vilvl.h $vr2, $vr0, $vr2 - vilvl.w $vr28, $vr0, $vr2 - vilvh.w $vr27, $vr0, $vr2 - vilvh.h $vr2, $vr0, $vr7 - vilvl.w $vr6, $vr0, $vr2 - vilvh.w $vr5, $vr0, $vr2 - vilvl.h $vr2, $vr0, $vr7 - vilvl.w $vr13, $vr0, $vr2 - vilvh.w $vr14, $vr0, $vr2 - vilvh.h $vr2, $vr0, $vr4 - vilvl.w $vr19, $vr0, $vr2 - vilvh.w $vr20, $vr0, $vr2 - vilvl.h $vr2, $vr0, $vr4 - vilvl.w $vr25, $vr0, $vr2 - vilvh.w $vr26, $vr0, $vr2 - vilvh.h $vr3, $vr0, $vr10 - vilvl.w $vr2, $vr0, $vr3 - vilvh.w $vr4, $vr0, $vr3 - vilvl.h $vr3, $vr0, $vr10 - vilvl.w $vr7, $vr0, $vr3 - vilvh.w $vr10, $vr0, $vr3 - vilvh.h $vr3, $vr0, $vr9 - vilvl.w $vr15, $vr0, $vr3 - vilvh.w $vr17, $vr0, $vr3 - vilvl.h $vr3, $vr0, $vr9 - vilvl.w $vr21, $vr0, $vr3 - vilvh.w $vr23, $vr0, $vr3 - vilvl.h $vr3, $vr0, $vr1 - vilvh.h $vr11, $vr0, $vr29 - vilvl.w $vr9, $vr0, $vr11 - vld $vr30, $s4, -240 - vilvl.h $vr29, $vr0, $vr29 - vld $vr31, $s4, -256 - vld $vr1, $s4, -208 - vadd.d $vr27, $vr30, $vr27 - vst $vr27, $sp, 48 # 16-byte Folded Spill - vld $vr30, $s4, -224 - vadd.d $vr27, $vr31, $vr28 - vst $vr27, $sp, 32 # 16-byte Folded Spill - vld $vr31, $s4, -176 - vadd.d $vr1, $vr1, $vr24 - vst $vr1, $sp, 16 # 16-byte Folded Spill - vld $vr1, $s4, -192 - vadd.d $vr24, $vr30, $vr22 + vldx $vr4, $s0, $a4 + vldx $vr6, $s0, $a5 + vldx $vr7, $s0, $a6 + vldx $vr10, $s0, $a7 + vldx $vr21, $s0, $t0 + vldx $vr19, $s0, $t1 + vldx $vr1, $s0, $t2 + vldx $vr11, $s0, $t3 + vsllwil.wu.hu $vr2, $vr6, 0 + vsllwil.du.wu $vr2, $vr2, 0 + vsllwil.wu.hu $vr3, $vr4, 0 + vsllwil.du.wu $vr3, $vr3, 0 + vbsrl.v $vr5, $vr6, 8 + vsllwil.wu.hu $vr5, $vr5, 0 + vsllwil.du.wu $vr5, $vr5, 0 + vbsrl.v $vr8, $vr6, 12 + vsllwil.wu.hu $vr8, $vr8, 0 + vsllwil.du.wu $vr8, $vr8, 0 + vshuf4i.h $vr6, $vr6, 14 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.du.wu $vr0, $vr6, 0 + vbsrl.v $vr6, $vr4, 8 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.du.wu $vr14, $vr6, 0 + vbsrl.v $vr6, $vr4, 12 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.du.wu $vr17, $vr6, 0 + vshuf4i.h $vr4, $vr4, 14 + vsllwil.wu.hu $vr4, $vr4, 0 + vsllwil.du.wu $vr20, $vr4, 0 + vsllwil.wu.hu $vr4, $vr10, 0 + vsllwil.du.wu $vr4, $vr4, 0 + vsllwil.wu.hu $vr6, $vr7, 0 + vsllwil.du.wu $vr6, $vr6, 0 + 
vbsrl.v $vr9, $vr10, 8 + vsllwil.wu.hu $vr9, $vr9, 0 + vsllwil.du.wu $vr9, $vr9, 0 + vbsrl.v $vr12, $vr10, 12 + vsllwil.wu.hu $vr12, $vr12, 0 + vsllwil.du.wu $vr12, $vr12, 0 + vshuf4i.h $vr10, $vr10, 14 + vsllwil.wu.hu $vr10, $vr10, 0 + vsllwil.du.wu $vr15, $vr10, 0 + vbsrl.v $vr10, $vr7, 8 + vsllwil.wu.hu $vr10, $vr10, 0 + vsllwil.du.wu $vr18, $vr10, 0 + vbsrl.v $vr10, $vr7, 12 + vsllwil.wu.hu $vr10, $vr10, 0 + vsllwil.du.wu $vr22, $vr10, 0 + vshuf4i.h $vr7, $vr7, 14 + vsllwil.wu.hu $vr7, $vr7, 0 + vsllwil.du.wu $vr25, $vr7, 0 + vsllwil.wu.hu $vr7, $vr19, 0 + vsllwil.du.wu $vr7, $vr7, 0 + vsllwil.wu.hu $vr10, $vr21, 0 + vsllwil.du.wu $vr10, $vr10, 0 + vbsrl.v $vr13, $vr19, 8 + vsllwil.wu.hu $vr13, $vr13, 0 + vsllwil.du.wu $vr13, $vr13, 0 + vbsrl.v $vr16, $vr19, 12 + vsllwil.wu.hu $vr16, $vr16, 0 + vsllwil.du.wu $vr16, $vr16, 0 + vshuf4i.h $vr19, $vr19, 14 + vsllwil.wu.hu $vr19, $vr19, 0 + vsllwil.du.wu $vr19, $vr19, 0 + vbsrl.v $vr23, $vr21, 8 + vsllwil.wu.hu $vr23, $vr23, 0 + vsllwil.du.wu $vr23, $vr23, 0 + vbsrl.v $vr24, $vr21, 12 + vsllwil.wu.hu $vr24, $vr24, 0 + vsllwil.du.wu $vr26, $vr24, 0 + vshuf4i.h $vr21, $vr21, 14 + vsllwil.wu.hu $vr21, $vr21, 0 + vsllwil.du.wu $vr27, $vr21, 0 + vshuf4i.h $vr21, $vr11, 14 + vsllwil.wu.hu $vr21, $vr21, 0 + vsllwil.du.wu $vr21, $vr21, 0 + vbsrl.v $vr24, $vr1, 8 + vsllwil.wu.hu $vr24, $vr24, 0 + vbsrl.v $vr28, $vr1, 12 + vsllwil.wu.hu $vr28, $vr28, 0 + vld $vr29, $s4, -240 + vsllwil.du.wu $vr28, $vr28, 0 + vld $vr30, $s4, -208 + vld $vr31, $s4, -224 + vadd.d $vr20, $vr29, $vr20 + vst $vr20, $sp, 32 # 16-byte Folded Spill + vld $vr29, $s4, -176 + vadd.d $vr17, $vr30, $vr17 + vst $vr17, $sp, 16 # 16-byte Folded Spill vld $vr30, $s4, -144 - vadd.d $vr22, $vr31, $vr18 + vadd.d $vr17, $vr31, $vr14 vld $vr31, $s4, -160 - vadd.d $vr18, $vr1, $vr16 - vld $vr16, $s4, -112 - vadd.d $vr30, $vr30, $vr12 - vld $vr12, $s4, -128 - vadd.d $vr31, $vr31, $vr8 - vld $vr8, $s4, -80 - vadd.d $vr16, $vr16, $vr26 - vld $vr26, $s4, -96 - vadd.d $vr12, $vr12, $vr25 - vld $vr25, $s4, -48 - vadd.d $vr8, $vr8, $vr20 - vld $vr20, $s4, -64 - vadd.d $vr19, $vr26, $vr19 - vld $vr26, $s4, -16 - vadd.d $vr14, $vr25, $vr14 - vld $vr25, $s4, -32 - vadd.d $vr13, $vr20, $vr13 - vld $vr20, $s4, 16 - vadd.d $vr5, $vr26, $vr5 + vadd.d $vr14, $vr29, $vr0 + vld $vr29, $s4, -256 + vadd.d $vr8, $vr30, $vr8 + vld $vr30, $s4, -192 + vadd.d $vr31, $vr31, $vr5 + vld $vr5, $s4, -112 + vadd.d $vr29, $vr29, $vr3 + vld $vr3, $s4, -80 + vadd.d $vr30, $vr30, $vr2 + vld $vr2, $s4, -96 + vadd.d $vr25, $vr5, $vr25 + vld $vr5, $s4, -48 + vadd.d $vr3, $vr3, $vr22 + vld $vr22, $s4, -16 + vadd.d $vr2, $vr2, $vr18 + vld $vr18, $s4, -32 + vadd.d $vr15, $vr5, $vr15 + vld $vr5, $s4, -128 + vadd.d $vr12, $vr22, $vr12 + vld $vr22, $s4, -64 + vadd.d $vr9, $vr18, $vr9 + vld $vr18, $s4, 16 + vadd.d $vr5, $vr5, $vr6 + vld $vr6, $s4, 48 + vadd.d $vr22, $vr22, $vr4 + vld $vr4, $s4, 32 + vadd.d $vr18, $vr18, $vr27 + vld $vr27, $s4, 80 + vadd.d $vr6, $vr6, $vr26 + vld $vr26, $s4, 112 + vadd.d $vr4, $vr4, $vr23 + vld $vr23, $s4, 96 + vadd.d $vr19, $vr27, $vr19 + vshuf4i.h $vr27, $vr1, 14 + vadd.d $vr16, $vr26, $vr16 vld $vr26, $s4, 0 - vadd.d $vr25, $vr25, $vr6 - vld $vr1, $s4, 48 - vadd.d $vr20, $vr20, $vr23 - vld $vr23, $s4, 32 - vadd.d $vr21, $vr26, $vr21 - vld $vr26, $s4, 80 - vadd.d $vr6, $vr1, $vr17 - vld $vr17, $s4, 64 - vadd.d $vr15, $vr23, $vr15 - vld $vr23, $s4, 112 + vadd.d $vr13, $vr23, $vr13 + vld $vr23, $s4, 64 + vsllwil.wu.hu $vr27, $vr27, 0 vadd.d $vr10, $vr26, $vr10 - vld $vr26, $s4, 96 - vadd.d $vr7, 
$vr17, $vr7 - vilvl.w $vr17, $vr0, $vr29 - vadd.d $vr4, $vr23, $vr4 - vld $vr23, $s4, 144 - vadd.d $vr26, $vr26, $vr2 - vld $vr1, $s4, 128 - vilvh.w $vr29, $vr0, $vr29 - vadd.d $vr23, $vr23, $vr29 - vld $vr29, $s4, 176 - vadd.d $vr2, $vr1, $vr17 - vld $vr17, $s4, 160 - vilvh.w $vr11, $vr0, $vr11 - vadd.d $vr11, $vr29, $vr11 - vld $vr29, $s4, 208 - vadd.d $vr9, $vr17, $vr9 - vld $vr17, $s4, 192 - vilvh.w $vr27, $vr0, $vr3 - vadd.d $vr27, $vr29, $vr27 - vilvl.w $vr3, $vr0, $vr3 - vadd.d $vr3, $vr17, $vr3 - vld $vr17, $s4, 240 - vld $vr1, $sp, 64 # 16-byte Folded Reload - vilvh.h $vr29, $vr0, $vr1 - vld $vr1, $s4, 224 - vilvh.w $vr28, $vr0, $vr29 - vadd.d $vr17, $vr17, $vr28 - vilvl.w $vr28, $vr0, $vr29 - vadd.d $vr1, $vr1, $vr28 + vld $vr26, $s4, 144 + vadd.d $vr7, $vr23, $vr7 + vld $vr23, $s4, 176 + vsllwil.du.wu $vr27, $vr27, 0 + vadd.d $vr26, $vr26, $vr27 + vld $vr27, $s4, 160 + vadd.d $vr23, $vr23, $vr28 + vld $vr28, $s4, 208 + vsllwil.du.wu $vr24, $vr24, 0 + vadd.d $vr24, $vr27, $vr24 + vld $vr27, $s4, 240 + vadd.d $vr21, $vr28, $vr21 + vbsrl.v $vr28, $vr11, 12 + vsllwil.wu.hu $vr28, $vr28, 0 + vsllwil.du.wu $vr28, $vr28, 0 + vadd.d $vr27, $vr27, $vr28 + vld $vr28, $s4, 224 + vbsrl.v $vr20, $vr11, 8 + vsllwil.wu.hu $vr20, $vr20, 0 + vsllwil.du.wu $vr20, $vr20, 0 + vadd.d $vr20, $vr28, $vr20 + vld $vr28, $s4, 128 + vsllwil.wu.hu $vr1, $vr1, 0 + vsllwil.du.wu $vr1, $vr1, 0 + vld $vr0, $s4, 192 + vadd.d $vr1, $vr28, $vr1 + vsllwil.wu.hu $vr11, $vr11, 0 + vsllwil.du.wu $vr11, $vr11, 0 + vadd.d $vr0, $vr0, $vr11 + vst $vr30, $s4, -192 + vst $vr29, $s4, -256 vst $vr31, $s4, -160 - vst $vr30, $s4, -144 - vst $vr18, $s4, -192 - vst $vr22, $s4, -176 - vst $vr24, $s4, -224 - vld $vr18, $sp, 16 # 16-byte Folded Reload - vst $vr18, $s4, -208 - vld $vr18, $sp, 32 # 16-byte Folded Reload - vst $vr18, $s4, -256 - vld $vr18, $sp, 48 # 16-byte Folded Reload - vst $vr18, $s4, -240 - vst $vr25, $s4, -32 - vst $vr5, $s4, -16 - vst $vr13, $s4, -64 - vst $vr14, $s4, -48 - vst $vr19, $s4, -96 - vst $vr8, $s4, -80 - vst $vr12, $s4, -128 - vst $vr16, $s4, -112 - vst $vr26, $s4, 96 - vst $vr4, $s4, 112 + vst $vr8, $s4, -144 + vst $vr14, $s4, -176 + vst $vr17, $s4, -224 + vld $vr8, $sp, 16 # 16-byte Folded Reload + vst $vr8, $s4, -208 + vld $vr8, $sp, 32 # 16-byte Folded Reload + vst $vr8, $s4, -240 + vst $vr22, $s4, -64 + vst $vr5, $s4, -128 + vst $vr9, $s4, -32 + vst $vr12, $s4, -16 + vst $vr15, $s4, -48 + vst $vr2, $s4, -96 + vst $vr3, $s4, -80 + vst $vr25, $s4, -112 vst $vr7, $s4, 64 - vst $vr10, $s4, 80 - vst $vr15, $s4, 32 + vst $vr10, $s4, 0 + vst $vr13, $s4, 96 + vst $vr16, $s4, 112 + vst $vr19, $s4, 80 + vst $vr4, $s4, 32 vst $vr6, $s4, 48 - vst $vr21, $s4, 0 - vst $vr20, $s4, 16 - vst $vr1, $s4, 224 - vst $vr17, $s4, 240 - vst $vr3, $s4, 192 - vst $vr27, $s4, 208 - vst $vr9, $s4, 160 - vst $vr11, $s4, 176 - vst $vr2, $s4, 128 - vst $vr23, $s4, 144 + vst $vr18, $s4, 16 + vst $vr0, $s4, 192 + vst $vr1, $s4, 128 + vst $vr20, $s4, 224 + vst $vr27, $s4, 240 + vst $vr21, $s4, 208 + vst $vr24, $s4, 160 + vst $vr23, $s4, 176 + vst $vr26, $s4, 144 addi.d $s6, $s6, 128 addi.d $s4, $s4, 512 bnez $s6, .LBB64_12 @@ -26414,7 +27099,7 @@ _Z69benchForTruncOrZextVecWithAddInLoopWithVW16From_uint16_t_To_uint64_t_RN9benc jirl $ra, $ra, 0 .Ltmp1403: # EH_LABEL # %bb.15: # %_ZNSt10unique_ptrIA_mSt14default_deleteIS0_EED2Ev.exit22 - ld.d $a0, $sp, 88 + ld.d $a0, $sp, 56 beqz $a0, .LBB64_17 # %bb.16: # %_ZNKSt14default_deleteIA_mEclImEENSt9enable_ifIXsr14is_convertibleIPA_T_PS0_EE5valueEvE4typeEPS4_.exit.i24 pcaddu18i $ra, 
%call36(_ZdaPv) @@ -26423,26 +27108,26 @@ _Z69benchForTruncOrZextVecWithAddInLoopWithVW16From_uint16_t_To_uint64_t_RN9benc move $a0, $fp pcaddu18i $ra, %call36(_ZdaPv) jirl $ra, $ra, 0 - fld.d $fs7, $sp, 120 # 8-byte Folded Reload - fld.d $fs6, $sp, 128 # 8-byte Folded Reload - fld.d $fs5, $sp, 136 # 8-byte Folded Reload - fld.d $fs4, $sp, 144 # 8-byte Folded Reload - fld.d $fs3, $sp, 152 # 8-byte Folded Reload - fld.d $fs2, $sp, 160 # 8-byte Folded Reload - fld.d $fs1, $sp, 168 # 8-byte Folded Reload - fld.d $fs0, $sp, 176 # 8-byte Folded Reload - ld.d $s8, $sp, 184 # 8-byte Folded Reload - ld.d $s7, $sp, 192 # 8-byte Folded Reload - ld.d $s6, $sp, 200 # 8-byte Folded Reload - ld.d $s5, $sp, 208 # 8-byte Folded Reload - ld.d $s4, $sp, 216 # 8-byte Folded Reload - ld.d $s3, $sp, 224 # 8-byte Folded Reload - ld.d $s2, $sp, 232 # 8-byte Folded Reload - ld.d $s1, $sp, 240 # 8-byte Folded Reload - ld.d $s0, $sp, 248 # 8-byte Folded Reload - ld.d $fp, $sp, 256 # 8-byte Folded Reload - ld.d $ra, $sp, 264 # 8-byte Folded Reload - addi.d $sp, $sp, 272 + fld.d $fs7, $sp, 88 # 8-byte Folded Reload + fld.d $fs6, $sp, 96 # 8-byte Folded Reload + fld.d $fs5, $sp, 104 # 8-byte Folded Reload + fld.d $fs4, $sp, 112 # 8-byte Folded Reload + fld.d $fs3, $sp, 120 # 8-byte Folded Reload + fld.d $fs2, $sp, 128 # 8-byte Folded Reload + fld.d $fs1, $sp, 136 # 8-byte Folded Reload + fld.d $fs0, $sp, 144 # 8-byte Folded Reload + ld.d $s8, $sp, 152 # 8-byte Folded Reload + ld.d $s7, $sp, 160 # 8-byte Folded Reload + ld.d $s6, $sp, 168 # 8-byte Folded Reload + ld.d $s5, $sp, 176 # 8-byte Folded Reload + ld.d $s4, $sp, 184 # 8-byte Folded Reload + ld.d $s3, $sp, 192 # 8-byte Folded Reload + ld.d $s2, $sp, 200 # 8-byte Folded Reload + ld.d $s1, $sp, 208 # 8-byte Folded Reload + ld.d $s0, $sp, 216 # 8-byte Folded Reload + ld.d $fp, $sp, 224 # 8-byte Folded Reload + ld.d $ra, $sp, 232 # 8-byte Folded Reload + addi.d $sp, $sp, 240 ret .LBB64_18: .Ltmp1393: # EH_LABEL @@ -26455,7 +27140,7 @@ _Z69benchForTruncOrZextVecWithAddInLoopWithVW16From_uint16_t_To_uint64_t_RN9benc jirl $ra, $ra, 0 .LBB64_19: .Ltmp1404: # EH_LABEL - ld.d $s1, $sp, 88 + ld.d $s1, $sp, 56 move $s0, $a0 bnez $s1, .LBB64_23 b .LBB64_24 @@ -26629,7 +27314,6 @@ _Z61benchForTruncOrZextVecWithAddInLoopFrom_uint16_t_To_uint64_t_RN9benchmark5St addi.d $a0, $sp, 8 ori $a1, $s6, 480 ori $a2, $s5, 3620 - vrepli.b $vr0, 0 .p2align 4, , 16 .LBB65_11: # %.lr.ph # =>This Loop Header: Depth=1 @@ -26647,18 +27331,18 @@ _Z61benchForTruncOrZextVecWithAddInLoopFrom_uint16_t_To_uint64_t_RN9benchmark5St add.d $a5, $fp, $a4 ldx.w $a6, $a5, $s1 ldx.w $a5, $a5, $a2 - vinsgr2vr.w $vr1, $a6, 0 - vinsgr2vr.w $vr2, $a5, 0 - vilvl.h $vr1, $vr0, $vr1 - vilvl.w $vr1, $vr0, $vr1 - vld $vr3, $a3, -16 - vld $vr4, $a3, 0 - vilvl.h $vr2, $vr0, $vr2 - vilvl.w $vr2, $vr0, $vr2 + vinsgr2vr.w $vr0, $a6, 0 + vinsgr2vr.w $vr1, $a5, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.du.wu $vr0, $vr0, 0 + vld $vr2, $a3, -16 + vld $vr3, $a3, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vsllwil.du.wu $vr1, $vr1, 0 + vadd.d $vr0, $vr2, $vr0 vadd.d $vr1, $vr3, $vr1 - vadd.d $vr2, $vr4, $vr2 - vst $vr1, $a3, -16 - vst $vr2, $a3, 0 + vst $vr0, $a3, -16 + vst $vr1, $a3, 0 addi.d $a4, $a4, 8 addi.d $a3, $a3, 32 bnez $a4, .LBB65_12 @@ -26776,19 +27460,19 @@ _Z61benchForTruncOrZextVecInLoopWithVW8From_uint32_t_To_uint64_t_RN9benchmark5St .cfi_personality 155, DW.ref.__gxx_personality_v0 .cfi_lsda 27, .Lexception66 # %bb.0: - addi.d $sp, $sp, -160 - .cfi_def_cfa_offset 160 - st.d $ra, $sp, 152 # 8-byte Folded Spill - st.d 
$fp, $sp, 144 # 8-byte Folded Spill - st.d $s0, $sp, 136 # 8-byte Folded Spill - st.d $s1, $sp, 128 # 8-byte Folded Spill - st.d $s2, $sp, 120 # 8-byte Folded Spill - st.d $s3, $sp, 112 # 8-byte Folded Spill - st.d $s4, $sp, 104 # 8-byte Folded Spill - st.d $s5, $sp, 96 # 8-byte Folded Spill - st.d $s6, $sp, 88 # 8-byte Folded Spill - st.d $s7, $sp, 80 # 8-byte Folded Spill - st.d $s8, $sp, 72 # 8-byte Folded Spill + addi.d $sp, $sp, -128 + .cfi_def_cfa_offset 128 + st.d $ra, $sp, 120 # 8-byte Folded Spill + st.d $fp, $sp, 112 # 8-byte Folded Spill + st.d $s0, $sp, 104 # 8-byte Folded Spill + st.d $s1, $sp, 96 # 8-byte Folded Spill + st.d $s2, $sp, 88 # 8-byte Folded Spill + st.d $s3, $sp, 80 # 8-byte Folded Spill + st.d $s4, $sp, 72 # 8-byte Folded Spill + st.d $s5, $sp, 64 # 8-byte Folded Spill + st.d $s6, $sp, 56 # 8-byte Folded Spill + st.d $s7, $sp, 48 # 8-byte Folded Spill + st.d $s8, $sp, 40 # 8-byte Folded Spill .cfi_offset 1, -8 .cfi_offset 22, -16 .cfi_offset 23, -24 @@ -26800,7 +27484,7 @@ _Z61benchForTruncOrZextVecInLoopWithVW8From_uint32_t_To_uint64_t_RN9benchmark5St .cfi_offset 29, -72 .cfi_offset 30, -80 .cfi_offset 31, -88 - st.d $a0, $sp, 40 # 8-byte Folded Spill + st.d $a0, $sp, 8 # 8-byte Folded Spill lu12i.w $s1, 9 ori $s4, $s1, 3136 move $a0, $s4 @@ -26816,12 +27500,12 @@ _Z61benchForTruncOrZextVecInLoopWithVW8From_uint32_t_To_uint64_t_RN9benchmark5St .Ltmp1420: # EH_LABEL # %bb.1: # %.split move $s2, $a0 - st.d $a0, $sp, 56 + st.d $a0, $sp, 24 .Ltmp1422: # EH_LABEL move $a0, $s3 pcaddu18i $ra, %call36(_Znam) jirl $ra, $ra, 0 - st.d $a0, $sp, 48 # 8-byte Folded Spill + st.d $a0, $sp, 16 # 8-byte Folded Spill .Ltmp1423: # EH_LABEL # %bb.2: ori $s1, $s1, 3072 @@ -26830,15 +27514,15 @@ _Z61benchForTruncOrZextVecInLoopWithVW8From_uint32_t_To_uint64_t_RN9benchmark5St add.d $s8, $s2, $s7 ori $a0, $zero, 0 lu32i.d $a0, -1 - st.d $a0, $sp, 64 + st.d $a0, $sp, 32 pcalau12i $a0, %pc_hi20(_ZL3rng) addi.d $s5, $a0, %pc_lo12(_ZL3rng) move $s0, $zero .p2align 4, , 16 .LBB66_3: # =>This Inner Loop Header: Depth=1 .Ltmp1425: # EH_LABEL - addi.d $a0, $sp, 64 - addi.d $a2, $sp, 64 + addi.d $a0, $sp, 32 + addi.d $a2, $sp, 32 move $a1, $s5 pcaddu18i $ra, %call36(_ZNSt24uniform_int_distributionIjEclISt23mersenne_twister_engineImLm32ELm624ELm397ELm31ELm2567483615ELm11ELm4294967295ELm7ELm2636928640ELm15ELm4022730752ELm18ELm1812433253EEEEjRT_RKNS0_10param_typeE) jirl $ra, $ra, 0 @@ -26850,14 +27534,13 @@ _Z61benchForTruncOrZextVecInLoopWithVW8From_uint32_t_To_uint64_t_RN9benchmark5St bne $s0, $s4, .LBB66_3 # %bb.5: # %vector.ph move $a0, $zero - vrepli.b $vr16, 0 - ld.d $a1, $sp, 48 # 8-byte Folded Reload + ld.d $a1, $sp, 16 # 8-byte Folded Reload .p2align 4, , 16 .LBB66_6: # %vector.body # =>This Inner Loop Header: Depth=1 ldx.d $a2, $fp, $a0 vinsgr2vr.d $vr0, $a2, 0 - vilvl.w $vr0, $vr16, $vr0 + vsllwil.du.wu $vr0, $vr0, 0 vst $vr0, $a1, 0 addi.d $a0, $a0, 8 addi.d $a1, $a1, 16 @@ -26877,38 +27560,46 @@ _Z61benchForTruncOrZextVecInLoopWithVW8From_uint32_t_To_uint64_t_RN9benchmark5St vld $vr5, $a2, 64 vld $vr6, $a2, 112 vld $vr7, $a2, 96 - vilvl.w $vr8, $vr16, $vr1 - vilvh.w $vr1, $vr16, $vr1 - vilvl.w $vr9, $vr16, $vr0 - vilvh.w $vr0, $vr16, $vr0 - vilvl.w $vr10, $vr16, $vr3 - vilvh.w $vr3, $vr16, $vr3 - vilvl.w $vr11, $vr16, $vr2 - vilvh.w $vr2, $vr16, $vr2 - vilvl.w $vr12, $vr16, $vr5 - vilvh.w $vr5, $vr16, $vr5 - vilvl.w $vr13, $vr16, $vr4 - vilvh.w $vr4, $vr16, $vr4 - vilvl.w $vr14, $vr16, $vr7 - vilvh.w $vr7, $vr16, $vr7 - vilvl.w $vr15, $vr16, $vr6 - vilvh.w $vr6, $vr16, $vr6 - vst 
$vr0, $a1, -112 - vst $vr9, $a1, -128 - vst $vr1, $a1, -80 - vst $vr8, $a1, -96 - vst $vr2, $a1, -16 - vst $vr11, $a1, -32 - vst $vr3, $a1, -48 - vst $vr10, $a1, -64 - vst $vr4, $a1, 48 - vst $vr13, $a1, 32 - vst $vr5, $a1, 16 - vst $vr12, $a1, 0 - vst $vr6, $a1, 112 - vst $vr15, $a1, 96 - vst $vr7, $a1, 80 - vst $vr14, $a1, 64 + vshuf4i.w $vr8, $vr1, 14 + vsllwil.du.wu $vr8, $vr8, 0 + vshuf4i.w $vr9, $vr0, 14 + vsllwil.du.wu $vr9, $vr9, 0 + vsllwil.du.wu $vr1, $vr1, 0 + vsllwil.du.wu $vr0, $vr0, 0 + vshuf4i.w $vr10, $vr3, 14 + vsllwil.du.wu $vr10, $vr10, 0 + vshuf4i.w $vr11, $vr2, 14 + vsllwil.du.wu $vr11, $vr11, 0 + vsllwil.du.wu $vr3, $vr3, 0 + vsllwil.du.wu $vr2, $vr2, 0 + vshuf4i.w $vr12, $vr5, 14 + vsllwil.du.wu $vr12, $vr12, 0 + vshuf4i.w $vr13, $vr4, 14 + vsllwil.du.wu $vr13, $vr13, 0 + vsllwil.du.wu $vr5, $vr5, 0 + vsllwil.du.wu $vr4, $vr4, 0 + vshuf4i.w $vr14, $vr7, 14 + vsllwil.du.wu $vr14, $vr14, 0 + vshuf4i.w $vr15, $vr6, 14 + vsllwil.du.wu $vr15, $vr15, 0 + vsllwil.du.wu $vr7, $vr7, 0 + vsllwil.du.wu $vr6, $vr6, 0 + vst $vr0, $a1, -128 + vst $vr1, $a1, -96 + vst $vr9, $a1, -112 + vst $vr8, $a1, -80 + vst $vr2, $a1, -32 + vst $vr3, $a1, -64 + vst $vr11, $a1, -16 + vst $vr10, $a1, -48 + vst $vr4, $a1, 32 + vst $vr5, $a1, 0 + vst $vr13, $a1, 48 + vst $vr12, $a1, 16 + vst $vr6, $a1, 96 + vst $vr7, $a1, 64 + vst $vr15, $a1, 112 + vst $vr14, $a1, 80 addi.d $a0, $a0, 128 addi.d $a1, $a1, 256 bne $a0, $s1, .LBB66_8 @@ -26947,7 +27638,7 @@ _Z61benchForTruncOrZextVecInLoopWithVW8From_uint32_t_To_uint64_t_RN9benchmark5St st.d $a2, $s8, 112 st.d $a3, $s8, 120 move $s1, $fp - ld.d $s5, $sp, 48 # 8-byte Folded Reload + ld.d $s5, $sp, 16 # 8-byte Folded Reload .p2align 4, , 16 .LBB66_10: # %_ZL27truncOrZextVecInLoopWithVW8IjmEvPKT_PT0_i.exit.preheader # =>This Inner Loop Header: Depth=1 @@ -26960,8 +27651,7 @@ _Z61benchForTruncOrZextVecInLoopWithVW8From_uint32_t_To_uint64_t_RN9benchmark5St addi.d $s1, $s1, 4 bne $s4, $s3, .LBB66_10 # %bb.12: - vst $vr16, $sp, 16 # 16-byte Folded Spill - ld.d $s1, $sp, 40 # 8-byte Folded Reload + ld.d $s1, $sp, 8 # 8-byte Folded Reload ld.w $s0, $s1, 28 ld.d $s2, $s1, 16 .Ltmp1443: # EH_LABEL @@ -26970,13 +27660,12 @@ _Z61benchForTruncOrZextVecInLoopWithVW8From_uint32_t_To_uint64_t_RN9benchmark5St jirl $ra, $ra, 0 .Ltmp1444: # EH_LABEL # %bb.13: # %_ZN9benchmark5State3endEv.exit.preheader - vld $vr16, $sp, 16 # 16-byte Folded Reload bnez $s0, .LBB66_19 # %bb.14: # %_ZN9benchmark5State3endEv.exit.preheader beqz $s2, .LBB66_19 # %bb.15: # %.lr.ph.preheader addi.d $a0, $fp, 64 - addi.d $a1, $sp, 56 + addi.d $a1, $sp, 24 lu12i.w $a2, 2 ori $a2, $a2, 1792 .p2align 4, , 16 @@ -26986,7 +27675,7 @@ _Z61benchForTruncOrZextVecInLoopWithVW8From_uint32_t_To_uint64_t_RN9benchmark5St #APP #NO_APP #MEMBARRIER - ld.d $a4, $sp, 56 + ld.d $a4, $sp, 24 add.d $a3, $a4, $s7 addi.d $a4, $a4, 128 move $a5, $a0 @@ -27003,38 +27692,46 @@ _Z61benchForTruncOrZextVecInLoopWithVW8From_uint32_t_To_uint64_t_RN9benchmark5St vld $vr5, $a5, 0 vld $vr6, $a5, 48 vld $vr7, $a5, 32 - vilvl.w $vr8, $vr16, $vr1 - vilvh.w $vr1, $vr16, $vr1 - vilvl.w $vr9, $vr16, $vr0 - vilvh.w $vr0, $vr16, $vr0 - vilvl.w $vr10, $vr16, $vr3 - vilvh.w $vr3, $vr16, $vr3 - vilvl.w $vr11, $vr16, $vr2 - vilvh.w $vr2, $vr16, $vr2 - vilvl.w $vr12, $vr16, $vr5 - vilvh.w $vr5, $vr16, $vr5 - vilvl.w $vr13, $vr16, $vr4 - vilvh.w $vr4, $vr16, $vr4 - vilvl.w $vr14, $vr16, $vr7 - vilvh.w $vr7, $vr16, $vr7 - vilvl.w $vr15, $vr16, $vr6 - vilvh.w $vr6, $vr16, $vr6 - vst $vr0, $a4, -80 - vst $vr9, $a4, -96 - vst $vr1, $a4, -112 - vst 
$vr8, $a4, -128 - vst $vr2, $a4, -16 - vst $vr11, $a4, -32 - vst $vr3, $a4, -48 - vst $vr10, $a4, -64 - vst $vr4, $a4, 48 - vst $vr13, $a4, 32 - vst $vr5, $a4, 16 - vst $vr12, $a4, 0 - vst $vr6, $a4, 112 - vst $vr15, $a4, 96 - vst $vr7, $a4, 80 - vst $vr14, $a4, 64 + vshuf4i.w $vr8, $vr1, 14 + vsllwil.du.wu $vr8, $vr8, 0 + vshuf4i.w $vr9, $vr0, 14 + vsllwil.du.wu $vr9, $vr9, 0 + vsllwil.du.wu $vr1, $vr1, 0 + vsllwil.du.wu $vr0, $vr0, 0 + vshuf4i.w $vr10, $vr3, 14 + vsllwil.du.wu $vr10, $vr10, 0 + vshuf4i.w $vr11, $vr2, 14 + vsllwil.du.wu $vr11, $vr11, 0 + vsllwil.du.wu $vr3, $vr3, 0 + vsllwil.du.wu $vr2, $vr2, 0 + vshuf4i.w $vr12, $vr5, 14 + vsllwil.du.wu $vr12, $vr12, 0 + vshuf4i.w $vr13, $vr4, 14 + vsllwil.du.wu $vr13, $vr13, 0 + vsllwil.du.wu $vr5, $vr5, 0 + vsllwil.du.wu $vr4, $vr4, 0 + vshuf4i.w $vr14, $vr7, 14 + vsllwil.du.wu $vr14, $vr14, 0 + vshuf4i.w $vr15, $vr6, 14 + vsllwil.du.wu $vr15, $vr15, 0 + vsllwil.du.wu $vr7, $vr7, 0 + vsllwil.du.wu $vr6, $vr6, 0 + vst $vr0, $a4, -96 + vst $vr1, $a4, -128 + vst $vr9, $a4, -80 + vst $vr8, $a4, -112 + vst $vr2, $a4, -32 + vst $vr3, $a4, -64 + vst $vr11, $a4, -16 + vst $vr10, $a4, -48 + vst $vr4, $a4, 32 + vst $vr5, $a4, 0 + vst $vr13, $a4, 48 + vst $vr12, $a4, 16 + vst $vr6, $a4, 96 + vst $vr7, $a4, 64 + vst $vr15, $a4, 112 + vst $vr14, $a4, 80 addi.d $a6, $a6, -32 addi.d $a5, $a5, 128 addi.d $a4, $a4, 256 @@ -27085,7 +27782,7 @@ _Z61benchForTruncOrZextVecInLoopWithVW8From_uint32_t_To_uint64_t_RN9benchmark5St move $a0, $s5 pcaddu18i $ra, %call36(_ZdaPv) jirl $ra, $ra, 0 - ld.d $a0, $sp, 56 + ld.d $a0, $sp, 24 beqz $a0, .LBB66_22 # %bb.21: # %_ZNKSt14default_deleteIA_mEclImEENSt9enable_ifIXsr14is_convertibleIPA_T_PS0_EE5valueEvE4typeEPS4_.exit.i38 pcaddu18i $ra, %call36(_ZdaPv) @@ -27094,18 +27791,18 @@ _Z61benchForTruncOrZextVecInLoopWithVW8From_uint32_t_To_uint64_t_RN9benchmark5St move $a0, $fp pcaddu18i $ra, %call36(_ZdaPv) jirl $ra, $ra, 0 - ld.d $s8, $sp, 72 # 8-byte Folded Reload - ld.d $s7, $sp, 80 # 8-byte Folded Reload - ld.d $s6, $sp, 88 # 8-byte Folded Reload - ld.d $s5, $sp, 96 # 8-byte Folded Reload - ld.d $s4, $sp, 104 # 8-byte Folded Reload - ld.d $s3, $sp, 112 # 8-byte Folded Reload - ld.d $s2, $sp, 120 # 8-byte Folded Reload - ld.d $s1, $sp, 128 # 8-byte Folded Reload - ld.d $s0, $sp, 136 # 8-byte Folded Reload - ld.d $fp, $sp, 144 # 8-byte Folded Reload - ld.d $ra, $sp, 152 # 8-byte Folded Reload - addi.d $sp, $sp, 160 + ld.d $s8, $sp, 40 # 8-byte Folded Reload + ld.d $s7, $sp, 48 # 8-byte Folded Reload + ld.d $s6, $sp, 56 # 8-byte Folded Reload + ld.d $s5, $sp, 64 # 8-byte Folded Reload + ld.d $s4, $sp, 72 # 8-byte Folded Reload + ld.d $s3, $sp, 80 # 8-byte Folded Reload + ld.d $s2, $sp, 88 # 8-byte Folded Reload + ld.d $s1, $sp, 96 # 8-byte Folded Reload + ld.d $s0, $sp, 104 # 8-byte Folded Reload + ld.d $fp, $sp, 112 # 8-byte Folded Reload + ld.d $ra, $sp, 120 # 8-byte Folded Reload + addi.d $sp, $sp, 128 ret .LBB66_23: .Ltmp1428: # EH_LABEL @@ -27153,7 +27850,7 @@ _Z61benchForTruncOrZextVecInLoopWithVW8From_uint32_t_To_uint64_t_RN9benchmark5St jirl $ra, $ra, 0 .Ltmp1437: # EH_LABEL # %bb.28: # %_ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc.exit21 - ld.d $a0, $sp, 48 # 8-byte Folded Reload + ld.d $a0, $sp, 16 # 8-byte Folded Reload ldx.d $a1, $a0, $s4 .Ltmp1438: # EH_LABEL move $a0, $s1 @@ -27194,10 +27891,10 @@ _Z61benchForTruncOrZextVecInLoopWithVW8From_uint32_t_To_uint64_t_RN9benchmark5St .Ltmp1427: # EH_LABEL .LBB66_36: move $s1, $a0 - ld.d $a0, $sp, 48 # 8-byte Folded Reload + ld.d $a0, $sp, 16 # 8-byte 
Folded Reload pcaddu18i $ra, %call36(_ZdaPv) jirl $ra, $ra, 0 - ld.d $s2, $sp, 56 + ld.d $s2, $sp, 24 beqz $s2, .LBB66_38 .LBB66_37: # %_ZNKSt14default_deleteIA_mEclImEENSt9enable_ifIXsr14is_convertibleIPA_T_PS0_EE5valueEvE4typeEPS4_.exit.i31 move $a0, $s2 @@ -27267,19 +27964,27 @@ _Z62benchForTruncOrZextVecInLoopWithVW16From_uint32_t_To_uint64_t_RN9benchmark5S .cfi_personality 155, DW.ref.__gxx_personality_v0 .cfi_lsda 27, .Lexception67 # %bb.0: - addi.d $sp, $sp, -160 - .cfi_def_cfa_offset 160 - st.d $ra, $sp, 152 # 8-byte Folded Spill - st.d $fp, $sp, 144 # 8-byte Folded Spill - st.d $s0, $sp, 136 # 8-byte Folded Spill - st.d $s1, $sp, 128 # 8-byte Folded Spill - st.d $s2, $sp, 120 # 8-byte Folded Spill - st.d $s3, $sp, 112 # 8-byte Folded Spill - st.d $s4, $sp, 104 # 8-byte Folded Spill - st.d $s5, $sp, 96 # 8-byte Folded Spill - st.d $s6, $sp, 88 # 8-byte Folded Spill - st.d $s7, $sp, 80 # 8-byte Folded Spill - st.d $s8, $sp, 72 # 8-byte Folded Spill + addi.d $sp, $sp, -192 + .cfi_def_cfa_offset 192 + st.d $ra, $sp, 184 # 8-byte Folded Spill + st.d $fp, $sp, 176 # 8-byte Folded Spill + st.d $s0, $sp, 168 # 8-byte Folded Spill + st.d $s1, $sp, 160 # 8-byte Folded Spill + st.d $s2, $sp, 152 # 8-byte Folded Spill + st.d $s3, $sp, 144 # 8-byte Folded Spill + st.d $s4, $sp, 136 # 8-byte Folded Spill + st.d $s5, $sp, 128 # 8-byte Folded Spill + st.d $s6, $sp, 120 # 8-byte Folded Spill + st.d $s7, $sp, 112 # 8-byte Folded Spill + st.d $s8, $sp, 104 # 8-byte Folded Spill + fst.d $fs0, $sp, 96 # 8-byte Folded Spill + fst.d $fs1, $sp, 88 # 8-byte Folded Spill + fst.d $fs2, $sp, 80 # 8-byte Folded Spill + fst.d $fs3, $sp, 72 # 8-byte Folded Spill + fst.d $fs4, $sp, 64 # 8-byte Folded Spill + fst.d $fs5, $sp, 56 # 8-byte Folded Spill + fst.d $fs6, $sp, 48 # 8-byte Folded Spill + fst.d $fs7, $sp, 40 # 8-byte Folded Spill .cfi_offset 1, -8 .cfi_offset 22, -16 .cfi_offset 23, -24 @@ -27291,7 +27996,15 @@ _Z62benchForTruncOrZextVecInLoopWithVW16From_uint32_t_To_uint64_t_RN9benchmark5S .cfi_offset 29, -72 .cfi_offset 30, -80 .cfi_offset 31, -88 - st.d $a0, $sp, 40 # 8-byte Folded Spill + .cfi_offset 56, -96 + .cfi_offset 57, -104 + .cfi_offset 58, -112 + .cfi_offset 59, -120 + .cfi_offset 60, -128 + .cfi_offset 61, -136 + .cfi_offset 62, -144 + .cfi_offset 63, -152 + st.d $a0, $sp, 8 # 8-byte Folded Spill lu12i.w $s1, 9 ori $s4, $s1, 3136 move $a0, $s4 @@ -27307,12 +28020,12 @@ _Z62benchForTruncOrZextVecInLoopWithVW16From_uint32_t_To_uint64_t_RN9benchmark5S .Ltmp1449: # EH_LABEL # %bb.1: # %.split move $s2, $a0 - st.d $a0, $sp, 56 + st.d $a0, $sp, 24 .Ltmp1451: # EH_LABEL move $a0, $s3 pcaddu18i $ra, %call36(_Znam) jirl $ra, $ra, 0 - st.d $a0, $sp, 48 # 8-byte Folded Spill + st.d $a0, $sp, 16 # 8-byte Folded Spill .Ltmp1452: # EH_LABEL # %bb.2: ori $s1, $s1, 3072 @@ -27321,15 +28034,15 @@ _Z62benchForTruncOrZextVecInLoopWithVW16From_uint32_t_To_uint64_t_RN9benchmark5S add.d $s8, $s2, $s7 ori $a0, $zero, 0 lu32i.d $a0, -1 - st.d $a0, $sp, 64 + st.d $a0, $sp, 32 pcalau12i $a0, %pc_hi20(_ZL3rng) addi.d $s5, $a0, %pc_lo12(_ZL3rng) move $s0, $zero .p2align 4, , 16 .LBB67_3: # =>This Inner Loop Header: Depth=1 .Ltmp1454: # EH_LABEL - addi.d $a0, $sp, 64 - addi.d $a2, $sp, 64 + addi.d $a0, $sp, 32 + addi.d $a2, $sp, 32 move $a1, $s5 pcaddu18i $ra, %call36(_ZNSt24uniform_int_distributionIjEclISt23mersenne_twister_engineImLm32ELm624ELm397ELm31ELm2567483615ELm11ELm4294967295ELm7ELm2636928640ELm15ELm4022730752ELm18ELm1812433253EEEEjRT_RKNS0_10param_typeE) jirl $ra, $ra, 0 @@ -27341,14 +28054,13 @@ 
_Z62benchForTruncOrZextVecInLoopWithVW16From_uint32_t_To_uint64_t_RN9benchmark5S bne $s0, $s4, .LBB67_3 # %bb.5: # %vector.ph move $a0, $zero - vrepli.b $vr3, 0 - ld.d $a1, $sp, 48 # 8-byte Folded Reload + ld.d $a1, $sp, 16 # 8-byte Folded Reload .p2align 4, , 16 .LBB67_6: # %vector.body # =>This Inner Loop Header: Depth=1 ldx.d $a2, $fp, $a0 vinsgr2vr.d $vr0, $a2, 0 - vilvl.w $vr0, $vr3, $vr0 + vsllwil.du.wu $vr0, $vr0, 0 vst $vr0, $a1, 0 addi.d $a0, $a0, 8 addi.d $a1, $a1, 16 @@ -27359,87 +28071,103 @@ _Z62benchForTruncOrZextVecInLoopWithVW16From_uint32_t_To_uint64_t_RN9benchmark5S .p2align 4, , 16 .LBB67_8: # %vector.body85 # =>This Inner Loop Header: Depth=1 - vldx $vr0, $fp, $a0 add.d $a2, $fp, $a0 + vldx $vr0, $fp, $a0 vld $vr1, $a2, 48 - vilvh.w $vr2, $vr3, $vr0 - vst $vr2, $a1, -240 vld $vr2, $a2, 32 - vilvl.w $vr0, $vr3, $vr0 + vld $vr3, $a2, 16 + vld $vr4, $a2, 112 + vld $vr5, $a2, 96 + vld $vr6, $a2, 80 + vld $vr7, $a2, 64 + vld $vr8, $a2, 176 + vld $vr9, $a2, 160 + vld $vr10, $a2, 144 + vld $vr11, $a2, 128 + vld $vr12, $a2, 240 + vld $vr13, $a2, 224 + vld $vr14, $a2, 208 + vld $vr15, $a2, 192 + vshuf4i.w $vr16, $vr3, 14 + vsllwil.du.wu $vr16, $vr16, 0 + vshuf4i.w $vr17, $vr2, 14 + vsllwil.du.wu $vr17, $vr17, 0 + vshuf4i.w $vr18, $vr1, 14 + vsllwil.du.wu $vr18, $vr18, 0 + vshuf4i.w $vr19, $vr0, 14 + vsllwil.du.wu $vr19, $vr19, 0 + vsllwil.du.wu $vr3, $vr3, 0 + vsllwil.du.wu $vr2, $vr2, 0 + vsllwil.du.wu $vr1, $vr1, 0 + vsllwil.du.wu $vr0, $vr0, 0 + vshuf4i.w $vr20, $vr7, 14 + vsllwil.du.wu $vr20, $vr20, 0 + vshuf4i.w $vr21, $vr6, 14 + vsllwil.du.wu $vr21, $vr21, 0 + vshuf4i.w $vr22, $vr5, 14 + vsllwil.du.wu $vr22, $vr22, 0 + vshuf4i.w $vr23, $vr4, 14 + vsllwil.du.wu $vr23, $vr23, 0 + vsllwil.du.wu $vr7, $vr7, 0 + vsllwil.du.wu $vr6, $vr6, 0 + vsllwil.du.wu $vr5, $vr5, 0 + vsllwil.du.wu $vr4, $vr4, 0 + vshuf4i.w $vr24, $vr11, 14 + vsllwil.du.wu $vr24, $vr24, 0 + vshuf4i.w $vr25, $vr10, 14 + vsllwil.du.wu $vr25, $vr25, 0 + vshuf4i.w $vr26, $vr9, 14 + vsllwil.du.wu $vr26, $vr26, 0 + vshuf4i.w $vr27, $vr8, 14 + vsllwil.du.wu $vr27, $vr27, 0 + vsllwil.du.wu $vr11, $vr11, 0 + vsllwil.du.wu $vr10, $vr10, 0 + vsllwil.du.wu $vr9, $vr9, 0 + vsllwil.du.wu $vr8, $vr8, 0 + vshuf4i.w $vr28, $vr15, 14 + vsllwil.du.wu $vr28, $vr28, 0 + vshuf4i.w $vr29, $vr14, 14 + vsllwil.du.wu $vr29, $vr29, 0 + vshuf4i.w $vr30, $vr13, 14 + vsllwil.du.wu $vr30, $vr30, 0 + vshuf4i.w $vr31, $vr12, 14 + vsllwil.du.wu $vr31, $vr31, 0 + vsllwil.du.wu $vr15, $vr15, 0 + vsllwil.du.wu $vr14, $vr14, 0 + vsllwil.du.wu $vr13, $vr13, 0 + vsllwil.du.wu $vr12, $vr12, 0 vst $vr0, $a1, -256 - vilvh.w $vr0, $vr3, $vr1 - vst $vr0, $a1, -144 - vld $vr0, $a2, 16 - vilvl.w $vr1, $vr3, $vr1 vst $vr1, $a1, -160 - vilvh.w $vr1, $vr3, $vr2 - vst $vr1, $a1, -176 - vld $vr1, $a2, 112 - vilvl.w $vr2, $vr3, $vr2 vst $vr2, $a1, -192 - vilvh.w $vr2, $vr3, $vr0 - vst $vr2, $a1, -208 - vld $vr2, $a2, 96 - vilvl.w $vr0, $vr3, $vr0 - vst $vr0, $a1, -224 - vilvh.w $vr0, $vr3, $vr1 - vst $vr0, $a1, -16 - vld $vr0, $a2, 80 - vilvl.w $vr1, $vr3, $vr1 - vst $vr1, $a1, -32 - vilvh.w $vr1, $vr3, $vr2 - vst $vr1, $a1, -48 - vld $vr1, $a2, 64 - vilvl.w $vr2, $vr3, $vr2 - vst $vr2, $a1, -64 - vilvh.w $vr2, $vr3, $vr0 - vst $vr2, $a1, -80 - vld $vr2, $a2, 176 - vilvl.w $vr0, $vr3, $vr0 - vst $vr0, $a1, -96 - vilvh.w $vr0, $vr3, $vr1 - vst $vr0, $a1, -112 - vld $vr0, $a2, 160 - vilvl.w $vr1, $vr3, $vr1 - vst $vr1, $a1, -128 - vilvh.w $vr1, $vr3, $vr2 - vst $vr1, $a1, 112 - vld $vr1, $a2, 144 - vilvl.w $vr2, $vr3, $vr2 - vst $vr2, $a1, 96 - vilvh.w $vr2, 
$vr3, $vr0 - vst $vr2, $a1, 80 - vld $vr2, $a2, 128 - vilvl.w $vr0, $vr3, $vr0 - vst $vr0, $a1, 64 - vilvh.w $vr0, $vr3, $vr1 - vst $vr0, $a1, 48 - vld $vr0, $a2, 240 - vilvl.w $vr1, $vr3, $vr1 - vst $vr1, $a1, 32 - vilvh.w $vr1, $vr3, $vr2 - vst $vr1, $a1, 16 - vld $vr1, $a2, 224 - vilvl.w $vr2, $vr3, $vr2 - vst $vr2, $a1, 0 - vilvh.w $vr2, $vr3, $vr0 - vst $vr2, $a1, 240 - vld $vr2, $a2, 208 - vilvl.w $vr0, $vr3, $vr0 - vst $vr0, $a1, 224 - vilvh.w $vr0, $vr3, $vr1 - vst $vr0, $a1, 208 - vld $vr0, $a2, 192 - vilvl.w $vr1, $vr3, $vr1 - vst $vr1, $a1, 192 - vilvh.w $vr1, $vr3, $vr2 - vst $vr1, $a1, 176 - vilvl.w $vr1, $vr3, $vr2 - vst $vr1, $a1, 160 - vilvh.w $vr1, $vr3, $vr0 - vst $vr1, $a1, 144 - vilvl.w $vr0, $vr3, $vr0 - vst $vr0, $a1, 128 + vst $vr3, $a1, -224 + vst $vr19, $a1, -240 + vst $vr18, $a1, -144 + vst $vr17, $a1, -176 + vst $vr16, $a1, -208 + vst $vr4, $a1, -32 + vst $vr5, $a1, -64 + vst $vr6, $a1, -96 + vst $vr7, $a1, -128 + vst $vr23, $a1, -16 + vst $vr22, $a1, -48 + vst $vr21, $a1, -80 + vst $vr20, $a1, -112 + vst $vr8, $a1, 96 + vst $vr9, $a1, 64 + vst $vr10, $a1, 32 + vst $vr11, $a1, 0 + vst $vr27, $a1, 112 + vst $vr26, $a1, 80 + vst $vr25, $a1, 48 + vst $vr24, $a1, 16 + vst $vr12, $a1, 224 + vst $vr13, $a1, 192 + vst $vr14, $a1, 160 + vst $vr15, $a1, 128 + vst $vr31, $a1, 240 + vst $vr30, $a1, 208 + vst $vr29, $a1, 176 + vst $vr28, $a1, 144 addi.d $a0, $a0, 256 addi.d $a1, $a1, 512 bne $a0, $s1, .LBB67_8 @@ -27478,7 +28206,7 @@ _Z62benchForTruncOrZextVecInLoopWithVW16From_uint32_t_To_uint64_t_RN9benchmark5S st.d $a2, $s8, 112 st.d $a3, $s8, 120 move $s1, $fp - ld.d $s5, $sp, 48 # 8-byte Folded Reload + ld.d $s5, $sp, 16 # 8-byte Folded Reload .p2align 4, , 16 .LBB67_10: # %_ZL28truncOrZextVecInLoopWithVW16IjmEvPKT_PT0_i.exit.preheader # =>This Inner Loop Header: Depth=1 @@ -27491,8 +28219,7 @@ _Z62benchForTruncOrZextVecInLoopWithVW16From_uint32_t_To_uint64_t_RN9benchmark5S addi.d $s1, $s1, 4 bne $s4, $s3, .LBB67_10 # %bb.12: - vst $vr3, $sp, 16 # 16-byte Folded Spill - ld.d $s1, $sp, 40 # 8-byte Folded Reload + ld.d $s1, $sp, 8 # 8-byte Folded Reload ld.w $s0, $s1, 28 ld.d $s2, $s1, 16 .Ltmp1472: # EH_LABEL @@ -27501,13 +28228,12 @@ _Z62benchForTruncOrZextVecInLoopWithVW16From_uint32_t_To_uint64_t_RN9benchmark5S jirl $ra, $ra, 0 .Ltmp1473: # EH_LABEL # %bb.13: # %_ZN9benchmark5State3endEv.exit.preheader - vld $vr3, $sp, 16 # 16-byte Folded Reload bnez $s0, .LBB67_19 # %bb.14: # %_ZN9benchmark5State3endEv.exit.preheader beqz $s2, .LBB67_19 # %bb.15: # %.lr.ph.preheader addi.d $a0, $fp, 128 - addi.d $a1, $sp, 56 + addi.d $a1, $sp, 24 lu12i.w $a2, 2 ori $a2, $a2, 1792 .p2align 4, , 16 @@ -27517,7 +28243,7 @@ _Z62benchForTruncOrZextVecInLoopWithVW16From_uint32_t_To_uint64_t_RN9benchmark5S #APP #NO_APP #MEMBARRIER - ld.d $a4, $sp, 56 + ld.d $a4, $sp, 24 add.d $a3, $a4, $s7 addi.d $a4, $a4, 256 move $a5, $a0 @@ -27528,84 +28254,100 @@ _Z62benchForTruncOrZextVecInLoopWithVW16From_uint32_t_To_uint64_t_RN9benchmark5S # => This Inner Loop Header: Depth=2 vld $vr0, $a5, -80 vld $vr1, $a5, -96 - vilvh.w $vr2, $vr3, $vr0 - vst $vr2, $a4, -144 vld $vr2, $a5, -112 - vilvl.w $vr0, $vr3, $vr0 + vld $vr3, $a5, -128 + vld $vr4, $a5, -16 + vld $vr5, $a5, -32 + vld $vr6, $a5, -48 + vld $vr7, $a5, -64 + vld $vr8, $a5, 48 + vld $vr9, $a5, 32 + vld $vr10, $a5, 16 + vld $vr11, $a5, 0 + vld $vr12, $a5, 112 + vld $vr13, $a5, 96 + vld $vr14, $a5, 80 + vld $vr15, $a5, 64 + vshuf4i.w $vr16, $vr3, 14 + vsllwil.du.wu $vr16, $vr16, 0 + vshuf4i.w $vr17, $vr2, 14 + vsllwil.du.wu $vr17, $vr17, 0 + 
vshuf4i.w $vr18, $vr1, 14 + vsllwil.du.wu $vr18, $vr18, 0 + vshuf4i.w $vr19, $vr0, 14 + vsllwil.du.wu $vr19, $vr19, 0 + vsllwil.du.wu $vr3, $vr3, 0 + vsllwil.du.wu $vr2, $vr2, 0 + vsllwil.du.wu $vr1, $vr1, 0 + vsllwil.du.wu $vr0, $vr0, 0 + vshuf4i.w $vr20, $vr7, 14 + vsllwil.du.wu $vr20, $vr20, 0 + vshuf4i.w $vr21, $vr6, 14 + vsllwil.du.wu $vr21, $vr21, 0 + vshuf4i.w $vr22, $vr5, 14 + vsllwil.du.wu $vr22, $vr22, 0 + vshuf4i.w $vr23, $vr4, 14 + vsllwil.du.wu $vr23, $vr23, 0 + vsllwil.du.wu $vr7, $vr7, 0 + vsllwil.du.wu $vr6, $vr6, 0 + vsllwil.du.wu $vr5, $vr5, 0 + vsllwil.du.wu $vr4, $vr4, 0 + vshuf4i.w $vr24, $vr11, 14 + vsllwil.du.wu $vr24, $vr24, 0 + vshuf4i.w $vr25, $vr10, 14 + vsllwil.du.wu $vr25, $vr25, 0 + vshuf4i.w $vr26, $vr9, 14 + vsllwil.du.wu $vr26, $vr26, 0 + vshuf4i.w $vr27, $vr8, 14 + vsllwil.du.wu $vr27, $vr27, 0 + vsllwil.du.wu $vr11, $vr11, 0 + vsllwil.du.wu $vr10, $vr10, 0 + vsllwil.du.wu $vr9, $vr9, 0 + vsllwil.du.wu $vr8, $vr8, 0 + vshuf4i.w $vr28, $vr15, 14 + vsllwil.du.wu $vr28, $vr28, 0 + vshuf4i.w $vr29, $vr14, 14 + vsllwil.du.wu $vr29, $vr29, 0 + vshuf4i.w $vr30, $vr13, 14 + vsllwil.du.wu $vr30, $vr30, 0 + vshuf4i.w $vr31, $vr12, 14 + vsllwil.du.wu $vr31, $vr31, 0 + vsllwil.du.wu $vr15, $vr15, 0 + vsllwil.du.wu $vr14, $vr14, 0 + vsllwil.du.wu $vr13, $vr13, 0 + vsllwil.du.wu $vr12, $vr12, 0 vst $vr0, $a4, -160 - vilvh.w $vr0, $vr3, $vr1 - vst $vr0, $a4, -176 - vld $vr0, $a5, -128 - vilvl.w $vr1, $vr3, $vr1 vst $vr1, $a4, -192 - vilvh.w $vr1, $vr3, $vr2 - vst $vr1, $a4, -208 - vld $vr1, $a5, -16 - vilvl.w $vr2, $vr3, $vr2 vst $vr2, $a4, -224 - vilvh.w $vr2, $vr3, $vr0 - vst $vr2, $a4, -240 - vld $vr2, $a5, -32 - vilvl.w $vr0, $vr3, $vr0 - vst $vr0, $a4, -256 - vilvh.w $vr0, $vr3, $vr1 - vst $vr0, $a4, -16 - vld $vr0, $a5, -48 - vilvl.w $vr1, $vr3, $vr1 - vst $vr1, $a4, -32 - vilvh.w $vr1, $vr3, $vr2 - vst $vr1, $a4, -48 - vld $vr1, $a5, -64 - vilvl.w $vr2, $vr3, $vr2 - vst $vr2, $a4, -64 - vilvh.w $vr2, $vr3, $vr0 - vst $vr2, $a4, -80 - vld $vr2, $a5, 48 - vilvl.w $vr0, $vr3, $vr0 - vst $vr0, $a4, -96 - vilvh.w $vr0, $vr3, $vr1 - vst $vr0, $a4, -112 - vld $vr0, $a5, 32 - vilvl.w $vr1, $vr3, $vr1 - vst $vr1, $a4, -128 - vilvh.w $vr1, $vr3, $vr2 - vst $vr1, $a4, 112 - vld $vr1, $a5, 16 - vilvl.w $vr2, $vr3, $vr2 - vst $vr2, $a4, 96 - vilvh.w $vr2, $vr3, $vr0 - vst $vr2, $a4, 80 - vld $vr2, $a5, 0 - vilvl.w $vr0, $vr3, $vr0 - vst $vr0, $a4, 64 - vilvh.w $vr0, $vr3, $vr1 - vst $vr0, $a4, 48 - vld $vr0, $a5, 112 - vilvl.w $vr1, $vr3, $vr1 - vst $vr1, $a4, 32 - vilvh.w $vr1, $vr3, $vr2 - vst $vr1, $a4, 16 - vld $vr1, $a5, 96 - vilvl.w $vr2, $vr3, $vr2 - vst $vr2, $a4, 0 - vilvh.w $vr2, $vr3, $vr0 - vst $vr2, $a4, 240 - vld $vr2, $a5, 80 - vilvl.w $vr0, $vr3, $vr0 - vst $vr0, $a4, 224 - vilvh.w $vr0, $vr3, $vr1 - vst $vr0, $a4, 208 - vld $vr0, $a5, 64 - vilvl.w $vr1, $vr3, $vr1 - vst $vr1, $a4, 192 - vilvh.w $vr1, $vr3, $vr2 - vst $vr1, $a4, 176 - vilvl.w $vr1, $vr3, $vr2 - vst $vr1, $a4, 160 - vilvh.w $vr1, $vr3, $vr0 - vst $vr1, $a4, 144 - vilvl.w $vr0, $vr3, $vr0 - vst $vr0, $a4, 128 + vst $vr3, $a4, -256 + vst $vr19, $a4, -144 + vst $vr18, $a4, -176 + vst $vr17, $a4, -208 + vst $vr16, $a4, -240 + vst $vr4, $a4, -32 + vst $vr5, $a4, -64 + vst $vr6, $a4, -96 + vst $vr7, $a4, -128 + vst $vr23, $a4, -16 + vst $vr22, $a4, -48 + vst $vr21, $a4, -80 + vst $vr20, $a4, -112 + vst $vr8, $a4, 96 + vst $vr9, $a4, 64 + vst $vr10, $a4, 32 + vst $vr11, $a4, 0 + vst $vr27, $a4, 112 + vst $vr26, $a4, 80 + vst $vr25, $a4, 48 + vst $vr24, $a4, 16 + vst $vr12, $a4, 224 + vst $vr13, $a4, 
192 + vst $vr14, $a4, 160 + vst $vr15, $a4, 128 + vst $vr31, $a4, 240 + vst $vr30, $a4, 208 + vst $vr29, $a4, 176 + vst $vr28, $a4, 144 addi.d $a6, $a6, -64 addi.d $a5, $a5, 256 addi.d $a4, $a4, 512 @@ -27656,7 +28398,7 @@ _Z62benchForTruncOrZextVecInLoopWithVW16From_uint32_t_To_uint64_t_RN9benchmark5S move $a0, $s5 pcaddu18i $ra, %call36(_ZdaPv) jirl $ra, $ra, 0 - ld.d $a0, $sp, 56 + ld.d $a0, $sp, 24 beqz $a0, .LBB67_22 # %bb.21: # %_ZNKSt14default_deleteIA_mEclImEENSt9enable_ifIXsr14is_convertibleIPA_T_PS0_EE5valueEvE4typeEPS4_.exit.i38 pcaddu18i $ra, %call36(_ZdaPv) @@ -27665,18 +28407,26 @@ _Z62benchForTruncOrZextVecInLoopWithVW16From_uint32_t_To_uint64_t_RN9benchmark5S move $a0, $fp pcaddu18i $ra, %call36(_ZdaPv) jirl $ra, $ra, 0 - ld.d $s8, $sp, 72 # 8-byte Folded Reload - ld.d $s7, $sp, 80 # 8-byte Folded Reload - ld.d $s6, $sp, 88 # 8-byte Folded Reload - ld.d $s5, $sp, 96 # 8-byte Folded Reload - ld.d $s4, $sp, 104 # 8-byte Folded Reload - ld.d $s3, $sp, 112 # 8-byte Folded Reload - ld.d $s2, $sp, 120 # 8-byte Folded Reload - ld.d $s1, $sp, 128 # 8-byte Folded Reload - ld.d $s0, $sp, 136 # 8-byte Folded Reload - ld.d $fp, $sp, 144 # 8-byte Folded Reload - ld.d $ra, $sp, 152 # 8-byte Folded Reload - addi.d $sp, $sp, 160 + fld.d $fs7, $sp, 40 # 8-byte Folded Reload + fld.d $fs6, $sp, 48 # 8-byte Folded Reload + fld.d $fs5, $sp, 56 # 8-byte Folded Reload + fld.d $fs4, $sp, 64 # 8-byte Folded Reload + fld.d $fs3, $sp, 72 # 8-byte Folded Reload + fld.d $fs2, $sp, 80 # 8-byte Folded Reload + fld.d $fs1, $sp, 88 # 8-byte Folded Reload + fld.d $fs0, $sp, 96 # 8-byte Folded Reload + ld.d $s8, $sp, 104 # 8-byte Folded Reload + ld.d $s7, $sp, 112 # 8-byte Folded Reload + ld.d $s6, $sp, 120 # 8-byte Folded Reload + ld.d $s5, $sp, 128 # 8-byte Folded Reload + ld.d $s4, $sp, 136 # 8-byte Folded Reload + ld.d $s3, $sp, 144 # 8-byte Folded Reload + ld.d $s2, $sp, 152 # 8-byte Folded Reload + ld.d $s1, $sp, 160 # 8-byte Folded Reload + ld.d $s0, $sp, 168 # 8-byte Folded Reload + ld.d $fp, $sp, 176 # 8-byte Folded Reload + ld.d $ra, $sp, 184 # 8-byte Folded Reload + addi.d $sp, $sp, 192 ret .LBB67_23: .Ltmp1457: # EH_LABEL @@ -27724,7 +28474,7 @@ _Z62benchForTruncOrZextVecInLoopWithVW16From_uint32_t_To_uint64_t_RN9benchmark5S jirl $ra, $ra, 0 .Ltmp1466: # EH_LABEL # %bb.28: # %_ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc.exit21 - ld.d $a0, $sp, 48 # 8-byte Folded Reload + ld.d $a0, $sp, 16 # 8-byte Folded Reload ldx.d $a1, $a0, $s4 .Ltmp1467: # EH_LABEL move $a0, $s1 @@ -27765,10 +28515,10 @@ _Z62benchForTruncOrZextVecInLoopWithVW16From_uint32_t_To_uint64_t_RN9benchmark5S .Ltmp1456: # EH_LABEL .LBB67_36: move $s1, $a0 - ld.d $a0, $sp, 48 # 8-byte Folded Reload + ld.d $a0, $sp, 16 # 8-byte Folded Reload pcaddu18i $ra, %call36(_ZdaPv) jirl $ra, $ra, 0 - ld.d $s2, $sp, 56 + ld.d $s2, $sp, 24 beqz $s2, .LBB67_38 .LBB67_37: # %_ZNKSt14default_deleteIA_mEclImEENSt9enable_ifIXsr14is_convertibleIPA_T_PS0_EE5valueEvE4typeEPS4_.exit.i31 move $a0, $s2 @@ -27838,17 +28588,17 @@ _Z54benchForTruncOrZextVecInLoopFrom_uint32_t_To_uint64_t_RN9benchmark5StateE: # .cfi_personality 155, DW.ref.__gxx_personality_v0 .cfi_lsda 27, .Lexception68 # %bb.0: - addi.d $sp, $sp, -128 - .cfi_def_cfa_offset 128 - st.d $ra, $sp, 120 # 8-byte Folded Spill - st.d $fp, $sp, 112 # 8-byte Folded Spill - st.d $s0, $sp, 104 # 8-byte Folded Spill - st.d $s1, $sp, 96 # 8-byte Folded Spill - st.d $s2, $sp, 88 # 8-byte Folded Spill - st.d $s3, $sp, 80 # 8-byte Folded Spill - st.d $s4, $sp, 72 # 8-byte Folded Spill - st.d $s5, 
$sp, 64 # 8-byte Folded Spill - st.d $s6, $sp, 56 # 8-byte Folded Spill + addi.d $sp, $sp, -96 + .cfi_def_cfa_offset 96 + st.d $ra, $sp, 88 # 8-byte Folded Spill + st.d $fp, $sp, 80 # 8-byte Folded Spill + st.d $s0, $sp, 72 # 8-byte Folded Spill + st.d $s1, $sp, 64 # 8-byte Folded Spill + st.d $s2, $sp, 56 # 8-byte Folded Spill + st.d $s3, $sp, 48 # 8-byte Folded Spill + st.d $s4, $sp, 40 # 8-byte Folded Spill + st.d $s5, $sp, 32 # 8-byte Folded Spill + st.d $s6, $sp, 24 # 8-byte Folded Spill .cfi_offset 1, -8 .cfi_offset 22, -16 .cfi_offset 23, -24 @@ -27874,7 +28624,7 @@ _Z54benchForTruncOrZextVecInLoopFrom_uint32_t_To_uint64_t_RN9benchmark5StateE: # .Ltmp1478: # EH_LABEL # %bb.1: move $s2, $a0 - st.d $a0, $sp, 40 + st.d $a0, $sp, 8 .Ltmp1480: # EH_LABEL move $a0, $s3 pcaddu18i $ra, %call36(_Znam) @@ -27884,15 +28634,15 @@ _Z54benchForTruncOrZextVecInLoopFrom_uint32_t_To_uint64_t_RN9benchmark5StateE: # move $s0, $a0 ori $a0, $zero, 0 lu32i.d $a0, -1 - st.d $a0, $sp, 48 + st.d $a0, $sp, 16 pcalau12i $a0, %pc_hi20(_ZL3rng) addi.d $s5, $a0, %pc_lo12(_ZL3rng) move $s6, $zero .p2align 4, , 16 .LBB68_3: # =>This Inner Loop Header: Depth=1 .Ltmp1483: # EH_LABEL - addi.d $a0, $sp, 48 - addi.d $a2, $sp, 48 + addi.d $a0, $sp, 16 + addi.d $a2, $sp, 16 move $a1, $s5 pcaddu18i $ra, %call36(_ZNSt24uniform_int_distributionIjEclISt23mersenne_twister_engineImLm32ELm624ELm397ELm31ELm2567483615ELm11ELm4294967295ELm7ELm2636928640ELm15ELm4022730752ELm18ELm1812433253EEEEjRT_RKNS0_10param_typeE) jirl $ra, $ra, 0 @@ -27904,14 +28654,13 @@ _Z54benchForTruncOrZextVecInLoopFrom_uint32_t_To_uint64_t_RN9benchmark5StateE: # bne $s6, $s4, .LBB68_3 # %bb.5: # %vector.ph move $a0, $zero - vrepli.b $vr2, 0 move $a1, $s0 .p2align 4, , 16 .LBB68_6: # %vector.body # =>This Inner Loop Header: Depth=1 ldx.d $a2, $fp, $a0 vinsgr2vr.d $vr0, $a2, 0 - vilvl.w $vr0, $vr2, $vr0 + vsllwil.du.wu $vr0, $vr0, 0 vst $vr0, $a1, 0 addi.d $a0, $a0, 8 addi.d $a1, $a1, 16 @@ -27927,8 +28676,8 @@ _Z54benchForTruncOrZextVecInLoopFrom_uint32_t_To_uint64_t_RN9benchmark5StateE: # ld.d $a2, $a2, 8 vinsgr2vr.d $vr0, $a3, 0 vinsgr2vr.d $vr1, $a2, 0 - vilvl.w $vr0, $vr2, $vr0 - vilvl.w $vr1, $vr2, $vr1 + vsllwil.du.wu $vr0, $vr0, 0 + vsllwil.du.wu $vr1, $vr1, 0 vst $vr0, $a1, -16 vst $vr1, $a1, 0 addi.d $a0, $a0, 16 @@ -27949,7 +28698,6 @@ _Z54benchForTruncOrZextVecInLoopFrom_uint32_t_To_uint64_t_RN9benchmark5StateE: # addi.d $s5, $s5, 4 bne $s4, $s3, .LBB68_10 # %bb.12: - vst $vr2, $sp, 16 # 16-byte Folded Spill ld.w $s3, $s1, 28 ld.d $s2, $s1, 16 .Ltmp1501: # EH_LABEL @@ -27958,13 +28706,12 @@ _Z54benchForTruncOrZextVecInLoopFrom_uint32_t_To_uint64_t_RN9benchmark5StateE: # jirl $ra, $ra, 0 .Ltmp1502: # EH_LABEL # %bb.13: # %_ZN9benchmark5State3endEv.exit.preheader - vld $vr2, $sp, 16 # 16-byte Folded Reload bnez $s3, .LBB68_19 # %bb.14: # %_ZN9benchmark5State3endEv.exit.preheader beqz $s2, .LBB68_19 # %bb.15: # %.lr.ph.preheader addi.d $a0, $fp, 8 - addi.d $a1, $sp, 40 + addi.d $a1, $sp, 8 lu12i.w $a2, 2 ori $a2, $a2, 1808 .p2align 4, , 16 @@ -27974,7 +28721,7 @@ _Z54benchForTruncOrZextVecInLoopFrom_uint32_t_To_uint64_t_RN9benchmark5StateE: # #APP #NO_APP #MEMBARRIER - ld.d $a3, $sp, 40 + ld.d $a3, $sp, 8 addi.d $a3, $a3, 16 move $a4, $a0 move $a5, $a2 @@ -27986,8 +28733,8 @@ _Z54benchForTruncOrZextVecInLoopFrom_uint32_t_To_uint64_t_RN9benchmark5StateE: # ld.d $a7, $a4, 0 vinsgr2vr.d $vr0, $a6, 0 vinsgr2vr.d $vr1, $a7, 0 - vilvl.w $vr0, $vr2, $vr0 - vilvl.w $vr1, $vr2, $vr1 + vsllwil.du.wu $vr0, $vr0, 0 + vsllwil.du.wu $vr1, $vr1, 0 vst $vr0, $a3, 
-16 vst $vr1, $a3, 0 addi.d $a5, $a5, -4 @@ -28008,7 +28755,7 @@ _Z54benchForTruncOrZextVecInLoopFrom_uint32_t_To_uint64_t_RN9benchmark5StateE: # move $a0, $s0 pcaddu18i $ra, %call36(_ZdaPv) jirl $ra, $ra, 0 - ld.d $a0, $sp, 40 + ld.d $a0, $sp, 8 beqz $a0, .LBB68_22 # %bb.21: # %_ZNKSt14default_deleteIA_mEclImEENSt9enable_ifIXsr14is_convertibleIPA_T_PS0_EE5valueEvE4typeEPS4_.exit.i38 pcaddu18i $ra, %call36(_ZdaPv) @@ -28017,16 +28764,16 @@ _Z54benchForTruncOrZextVecInLoopFrom_uint32_t_To_uint64_t_RN9benchmark5StateE: # move $a0, $fp pcaddu18i $ra, %call36(_ZdaPv) jirl $ra, $ra, 0 - ld.d $s6, $sp, 56 # 8-byte Folded Reload - ld.d $s5, $sp, 64 # 8-byte Folded Reload - ld.d $s4, $sp, 72 # 8-byte Folded Reload - ld.d $s3, $sp, 80 # 8-byte Folded Reload - ld.d $s2, $sp, 88 # 8-byte Folded Reload - ld.d $s1, $sp, 96 # 8-byte Folded Reload - ld.d $s0, $sp, 104 # 8-byte Folded Reload - ld.d $fp, $sp, 112 # 8-byte Folded Reload - ld.d $ra, $sp, 120 # 8-byte Folded Reload - addi.d $sp, $sp, 128 + ld.d $s6, $sp, 24 # 8-byte Folded Reload + ld.d $s5, $sp, 32 # 8-byte Folded Reload + ld.d $s4, $sp, 40 # 8-byte Folded Reload + ld.d $s3, $sp, 48 # 8-byte Folded Reload + ld.d $s2, $sp, 56 # 8-byte Folded Reload + ld.d $s1, $sp, 64 # 8-byte Folded Reload + ld.d $s0, $sp, 72 # 8-byte Folded Reload + ld.d $fp, $sp, 80 # 8-byte Folded Reload + ld.d $ra, $sp, 88 # 8-byte Folded Reload + addi.d $sp, $sp, 96 ret .LBB68_23: .Ltmp1486: # EH_LABEL @@ -28117,7 +28864,7 @@ _Z54benchForTruncOrZextVecInLoopFrom_uint32_t_To_uint64_t_RN9benchmark5StateE: # move $a0, $s0 pcaddu18i $ra, %call36(_ZdaPv) jirl $ra, $ra, 0 - ld.d $s2, $sp, 40 + ld.d $s2, $sp, 8 beqz $s2, .LBB68_38 .LBB68_37: # %_ZNKSt14default_deleteIA_mEclImEENSt9enable_ifIXsr14is_convertibleIPA_T_PS0_EE5valueEvE4typeEPS4_.exit.i31 move $a0, $s2 @@ -28192,18 +28939,26 @@ _Z68benchForTruncOrZextVecWithAddInLoopWithVW8From_uint32_t_To_uint64_t_RN9bench .cfi_personality 155, DW.ref.__gxx_personality_v0 .cfi_lsda 27, .Lexception69 # %bb.0: - addi.d $sp, $sp, -112 - .cfi_def_cfa_offset 112 - st.d $ra, $sp, 104 # 8-byte Folded Spill - st.d $fp, $sp, 96 # 8-byte Folded Spill - st.d $s0, $sp, 88 # 8-byte Folded Spill - st.d $s1, $sp, 80 # 8-byte Folded Spill - st.d $s2, $sp, 72 # 8-byte Folded Spill - st.d $s3, $sp, 64 # 8-byte Folded Spill - st.d $s4, $sp, 56 # 8-byte Folded Spill - st.d $s5, $sp, 48 # 8-byte Folded Spill - st.d $s6, $sp, 40 # 8-byte Folded Spill - st.d $s7, $sp, 32 # 8-byte Folded Spill + addi.d $sp, $sp, -176 + .cfi_def_cfa_offset 176 + st.d $ra, $sp, 168 # 8-byte Folded Spill + st.d $fp, $sp, 160 # 8-byte Folded Spill + st.d $s0, $sp, 152 # 8-byte Folded Spill + st.d $s1, $sp, 144 # 8-byte Folded Spill + st.d $s2, $sp, 136 # 8-byte Folded Spill + st.d $s3, $sp, 128 # 8-byte Folded Spill + st.d $s4, $sp, 120 # 8-byte Folded Spill + st.d $s5, $sp, 112 # 8-byte Folded Spill + st.d $s6, $sp, 104 # 8-byte Folded Spill + st.d $s7, $sp, 96 # 8-byte Folded Spill + fst.d $fs0, $sp, 88 # 8-byte Folded Spill + fst.d $fs1, $sp, 80 # 8-byte Folded Spill + fst.d $fs2, $sp, 72 # 8-byte Folded Spill + fst.d $fs3, $sp, 64 # 8-byte Folded Spill + fst.d $fs4, $sp, 56 # 8-byte Folded Spill + fst.d $fs5, $sp, 48 # 8-byte Folded Spill + fst.d $fs6, $sp, 40 # 8-byte Folded Spill + fst.d $fs7, $sp, 32 # 8-byte Folded Spill .cfi_offset 1, -8 .cfi_offset 22, -16 .cfi_offset 23, -24 @@ -28214,6 +28969,14 @@ _Z68benchForTruncOrZextVecWithAddInLoopWithVW8From_uint32_t_To_uint64_t_RN9bench .cfi_offset 28, -64 .cfi_offset 29, -72 .cfi_offset 30, -80 + .cfi_offset 56, -88 + 
.cfi_offset 57, -96 + .cfi_offset 58, -104 + .cfi_offset 59, -112 + .cfi_offset 60, -120 + .cfi_offset 61, -128 + .cfi_offset 62, -136 + .cfi_offset 63, -144 move $s0, $a0 lu12i.w $s4, 9 ori $s3, $s4, 3136 @@ -28292,7 +29055,6 @@ _Z68benchForTruncOrZextVecWithAddInLoopWithVW8From_uint32_t_To_uint64_t_RN9bench ori $a2, $s6, 2048 lu12i.w $a3, 2 ori $a3, $a3, 1792 - vrepli.b $vr0, 0 .p2align 4, , 16 .LBB69_11: # %.lr.ph # =>This Loop Header: Depth=1 @@ -28309,78 +29071,86 @@ _Z68benchForTruncOrZextVecWithAddInLoopWithVW8From_uint32_t_To_uint64_t_RN9bench .LBB69_12: # %vector.body # Parent Loop BB69_11 Depth=1 # => This Inner Loop Header: Depth=2 - vld $vr2, $a6, -64 - vld $vr3, $a6, -48 - vld $vr4, $a6, -32 - vld $vr5, $a6, -16 - vld $vr6, $a6, 0 - vld $vr7, $a6, 16 - vld $vr8, $a6, 32 - vld $vr1, $a6, 48 - vilvl.w $vr9, $vr0, $vr3 - vilvh.w $vr3, $vr0, $vr3 - vilvl.w $vr10, $vr0, $vr2 - vilvh.w $vr2, $vr0, $vr2 - vilvl.w $vr11, $vr0, $vr5 - vilvh.w $vr5, $vr0, $vr5 - vilvl.w $vr12, $vr0, $vr4 - vilvh.w $vr4, $vr0, $vr4 - vilvl.w $vr13, $vr0, $vr7 - vilvh.w $vr7, $vr0, $vr7 - vilvl.w $vr14, $vr0, $vr6 - vilvh.w $vr6, $vr0, $vr6 - vilvl.w $vr15, $vr0, $vr8 - vld $vr16, $a5, -112 - vilvh.w $vr8, $vr0, $vr8 + vld $vr0, $a6, -64 + vld $vr1, $a6, -48 + vld $vr2, $a6, -32 + vld $vr3, $a6, -16 + vld $vr4, $a6, 0 + vld $vr5, $a6, 16 + vld $vr6, $a6, 32 + vld $vr7, $a6, 48 + vsllwil.du.wu $vr8, $vr1, 0 + vsllwil.du.wu $vr9, $vr0, 0 + vshuf4i.w $vr1, $vr1, 14 + vsllwil.du.wu $vr1, $vr1, 0 + vshuf4i.w $vr0, $vr0, 14 + vsllwil.du.wu $vr0, $vr0, 0 + vsllwil.du.wu $vr10, $vr3, 0 + vsllwil.du.wu $vr11, $vr2, 0 + vshuf4i.w $vr3, $vr3, 14 + vsllwil.du.wu $vr3, $vr3, 0 + vshuf4i.w $vr2, $vr2, 14 + vsllwil.du.wu $vr2, $vr2, 0 + vsllwil.du.wu $vr12, $vr5, 0 + vsllwil.du.wu $vr13, $vr4, 0 + vshuf4i.w $vr5, $vr5, 14 + vsllwil.du.wu $vr5, $vr5, 0 + vshuf4i.w $vr4, $vr4, 14 + vsllwil.du.wu $vr4, $vr4, 0 + vsllwil.du.wu $vr14, $vr7, 0 + vsllwil.du.wu $vr15, $vr6, 0 + vshuf4i.w $vr7, $vr7, 14 + vsllwil.du.wu $vr7, $vr7, 0 + vshuf4i.w $vr6, $vr6, 14 + vsllwil.du.wu $vr6, $vr6, 0 + vld $vr16, $a5, -96 vld $vr17, $a5, -128 vld $vr18, $a5, -80 - vadd.d $vr2, $vr16, $vr2 - vld $vr16, $a5, -96 - vadd.d $vr10, $vr17, $vr10 - vld $vr17, $a5, -48 - vadd.d $vr3, $vr18, $vr3 - vld $vr18, $a5, -64 - vadd.d $vr9, $vr16, $vr9 - vld $vr16, $a5, -16 - vadd.d $vr4, $vr17, $vr4 - vld $vr17, $a5, -32 - vadd.d $vr12, $vr18, $vr12 - vld $vr18, $a5, 16 - vadd.d $vr5, $vr16, $vr5 - vld $vr16, $a5, 0 - vadd.d $vr11, $vr17, $vr11 - vld $vr17, $a5, 48 - vadd.d $vr6, $vr18, $vr6 - vld $vr18, $a5, 32 - vadd.d $vr14, $vr16, $vr14 - vld $vr16, $a5, 80 - vadd.d $vr7, $vr17, $vr7 - vld $vr17, $a5, 64 - vadd.d $vr13, $vr18, $vr13 - vld $vr18, $a5, 112 + vld $vr19, $a5, -112 + vld $vr20, $a5, -32 + vld $vr21, $a5, -64 + vld $vr22, $a5, -16 + vld $vr23, $a5, -48 + vld $vr24, $a5, 32 + vld $vr25, $a5, 0 + vld $vr26, $a5, 48 + vld $vr27, $a5, 16 + vld $vr28, $a5, 96 + vld $vr29, $a5, 64 + vld $vr30, $a5, 112 + vld $vr31, $a5, 80 + vadd.d $vr0, $vr19, $vr0 + vadd.d $vr1, $vr18, $vr1 + vadd.d $vr9, $vr17, $vr9 vadd.d $vr8, $vr16, $vr8 - vld $vr16, $a5, 96 - vadd.d $vr15, $vr17, $vr15 - vilvh.w $vr17, $vr0, $vr1 - vadd.d $vr17, $vr18, $vr17 - vilvl.w $vr1, $vr0, $vr1 - vadd.d $vr1, $vr16, $vr1 - vst $vr9, $a5, -96 - vst $vr3, $a5, -80 - vst $vr10, $a5, -128 - vst $vr2, $a5, -112 - vst $vr11, $a5, -32 - vst $vr5, $a5, -16 - vst $vr12, $a5, -64 - vst $vr4, $a5, -48 - vst $vr13, $a5, 32 - vst $vr7, $a5, 48 - vst $vr14, $a5, 0 - vst $vr6, $a5, 16 - vst $vr1, 
$a5, 96 - vst $vr17, $a5, 112 + vadd.d $vr2, $vr23, $vr2 + vadd.d $vr3, $vr22, $vr3 + vadd.d $vr11, $vr21, $vr11 + vadd.d $vr10, $vr20, $vr10 + vadd.d $vr4, $vr27, $vr4 + vadd.d $vr5, $vr26, $vr5 + vadd.d $vr13, $vr25, $vr13 + vadd.d $vr12, $vr24, $vr12 + vadd.d $vr6, $vr31, $vr6 + vadd.d $vr7, $vr30, $vr7 + vadd.d $vr15, $vr29, $vr15 + vadd.d $vr14, $vr28, $vr14 + vst $vr8, $a5, -96 + vst $vr9, $a5, -128 + vst $vr1, $a5, -80 + vst $vr0, $a5, -112 + vst $vr10, $a5, -32 + vst $vr11, $a5, -64 + vst $vr3, $a5, -16 + vst $vr2, $a5, -48 + vst $vr12, $a5, 32 + vst $vr13, $a5, 0 + vst $vr5, $a5, 48 + vst $vr4, $a5, 16 + vst $vr14, $a5, 96 vst $vr15, $a5, 64 - vst $vr8, $a5, 80 + vst $vr7, $a5, 112 + vst $vr6, $a5, 80 addi.d $a7, $a7, -32 addi.d $a6, $a6, 128 addi.d $a5, $a5, 256 @@ -28469,17 +29239,25 @@ _Z68benchForTruncOrZextVecWithAddInLoopWithVW8From_uint32_t_To_uint64_t_RN9bench move $a0, $fp pcaddu18i $ra, %call36(_ZdaPv) jirl $ra, $ra, 0 - ld.d $s7, $sp, 32 # 8-byte Folded Reload - ld.d $s6, $sp, 40 # 8-byte Folded Reload - ld.d $s5, $sp, 48 # 8-byte Folded Reload - ld.d $s4, $sp, 56 # 8-byte Folded Reload - ld.d $s3, $sp, 64 # 8-byte Folded Reload - ld.d $s2, $sp, 72 # 8-byte Folded Reload - ld.d $s1, $sp, 80 # 8-byte Folded Reload - ld.d $s0, $sp, 88 # 8-byte Folded Reload - ld.d $fp, $sp, 96 # 8-byte Folded Reload - ld.d $ra, $sp, 104 # 8-byte Folded Reload - addi.d $sp, $sp, 112 + fld.d $fs7, $sp, 32 # 8-byte Folded Reload + fld.d $fs6, $sp, 40 # 8-byte Folded Reload + fld.d $fs5, $sp, 48 # 8-byte Folded Reload + fld.d $fs4, $sp, 56 # 8-byte Folded Reload + fld.d $fs3, $sp, 64 # 8-byte Folded Reload + fld.d $fs2, $sp, 72 # 8-byte Folded Reload + fld.d $fs1, $sp, 80 # 8-byte Folded Reload + fld.d $fs0, $sp, 88 # 8-byte Folded Reload + ld.d $s7, $sp, 96 # 8-byte Folded Reload + ld.d $s6, $sp, 104 # 8-byte Folded Reload + ld.d $s5, $sp, 112 # 8-byte Folded Reload + ld.d $s4, $sp, 120 # 8-byte Folded Reload + ld.d $s3, $sp, 128 # 8-byte Folded Reload + ld.d $s2, $sp, 136 # 8-byte Folded Reload + ld.d $s1, $sp, 144 # 8-byte Folded Reload + ld.d $s0, $sp, 152 # 8-byte Folded Reload + ld.d $fp, $sp, 160 # 8-byte Folded Reload + ld.d $ra, $sp, 168 # 8-byte Folded Reload + addi.d $sp, $sp, 176 ret .LBB69_18: .Ltmp1508: # EH_LABEL @@ -28568,26 +29346,26 @@ _Z69benchForTruncOrZextVecWithAddInLoopWithVW16From_uint32_t_To_uint64_t_RN9benc .cfi_personality 155, DW.ref.__gxx_personality_v0 .cfi_lsda 27, .Lexception70 # %bb.0: - addi.d $sp, $sp, -288 - .cfi_def_cfa_offset 288 - st.d $ra, $sp, 280 # 8-byte Folded Spill - st.d $fp, $sp, 272 # 8-byte Folded Spill - st.d $s0, $sp, 264 # 8-byte Folded Spill - st.d $s1, $sp, 256 # 8-byte Folded Spill - st.d $s2, $sp, 248 # 8-byte Folded Spill - st.d $s3, $sp, 240 # 8-byte Folded Spill - st.d $s4, $sp, 232 # 8-byte Folded Spill - st.d $s5, $sp, 224 # 8-byte Folded Spill - st.d $s6, $sp, 216 # 8-byte Folded Spill - st.d $s7, $sp, 208 # 8-byte Folded Spill - fst.d $fs0, $sp, 200 # 8-byte Folded Spill - fst.d $fs1, $sp, 192 # 8-byte Folded Spill - fst.d $fs2, $sp, 184 # 8-byte Folded Spill - fst.d $fs3, $sp, 176 # 8-byte Folded Spill - fst.d $fs4, $sp, 168 # 8-byte Folded Spill - fst.d $fs5, $sp, 160 # 8-byte Folded Spill - fst.d $fs6, $sp, 152 # 8-byte Folded Spill - fst.d $fs7, $sp, 144 # 8-byte Folded Spill + addi.d $sp, $sp, -256 + .cfi_def_cfa_offset 256 + st.d $ra, $sp, 248 # 8-byte Folded Spill + st.d $fp, $sp, 240 # 8-byte Folded Spill + st.d $s0, $sp, 232 # 8-byte Folded Spill + st.d $s1, $sp, 224 # 8-byte Folded Spill + st.d $s2, $sp, 216 # 8-byte 
Folded Spill + st.d $s3, $sp, 208 # 8-byte Folded Spill + st.d $s4, $sp, 200 # 8-byte Folded Spill + st.d $s5, $sp, 192 # 8-byte Folded Spill + st.d $s6, $sp, 184 # 8-byte Folded Spill + st.d $s7, $sp, 176 # 8-byte Folded Spill + fst.d $fs0, $sp, 168 # 8-byte Folded Spill + fst.d $fs1, $sp, 160 # 8-byte Folded Spill + fst.d $fs2, $sp, 152 # 8-byte Folded Spill + fst.d $fs3, $sp, 144 # 8-byte Folded Spill + fst.d $fs4, $sp, 136 # 8-byte Folded Spill + fst.d $fs5, $sp, 128 # 8-byte Folded Spill + fst.d $fs6, $sp, 120 # 8-byte Folded Spill + fst.d $fs7, $sp, 112 # 8-byte Folded Spill .cfi_offset 1, -8 .cfi_offset 22, -16 .cfi_offset 23, -24 @@ -28624,18 +29402,18 @@ _Z69benchForTruncOrZextVecWithAddInLoopWithVW16From_uint32_t_To_uint64_t_RN9benc move $s1, $a0 ori $a0, $s4, 3072 add.d $s5, $fp, $a0 - st.d $s1, $sp, 120 + st.d $s1, $sp, 88 ori $a0, $zero, 0 lu32i.d $a0, -1 - st.d $a0, $sp, 128 + st.d $a0, $sp, 96 pcalau12i $a0, %pc_hi20(_ZL3rng) addi.d $s4, $a0, %pc_lo12(_ZL3rng) move $s7, $zero .p2align 4, , 16 .LBB70_2: # =>This Inner Loop Header: Depth=1 .Ltmp1523: # EH_LABEL - addi.d $a0, $sp, 128 - addi.d $a2, $sp, 128 + addi.d $a0, $sp, 96 + addi.d $a2, $sp, 96 move $a1, $s4 pcaddu18i $ra, %call36(_ZNSt24uniform_int_distributionIjEclISt23mersenne_twister_engineImLm32ELm624ELm397ELm31ELm2567483615ELm11ELm4294967295ELm7ELm2636928640ELm15ELm4022730752ELm18ELm1812433253EEEEjRT_RKNS0_10param_typeE) jirl $ra, $ra, 0 @@ -28648,15 +29426,15 @@ _Z69benchForTruncOrZextVecWithAddInLoopWithVW16From_uint32_t_To_uint64_t_RN9benc # %bb.4: pcalau12i $a0, %pc_hi20(.LCPI70_0) vld $vr0, $a0, %pc_lo12(.LCPI70_0) - vst $vr0, $sp, 128 + vst $vr0, $sp, 96 pcalau12i $a0, %pc_hi20(_ZL3rng) addi.d $s3, $a0, %pc_lo12(_ZL3rng) move $s4, $zero .p2align 4, , 16 .LBB70_5: # =>This Inner Loop Header: Depth=1 .Ltmp1526: # EH_LABEL - addi.d $a0, $sp, 128 - addi.d $a2, $sp, 128 + addi.d $a0, $sp, 96 + addi.d $a2, $sp, 96 move $a1, $s3 pcaddu18i $ra, %call36(_ZNSt24uniform_int_distributionImEclISt23mersenne_twister_engineImLm32ELm624ELm397ELm31ELm2567483615ELm11ELm4294967295ELm7ELm2636928640ELm15ELm4022730752ELm18ELm1812433253EEEEmRT_RKNS0_10param_typeE) jirl $ra, $ra, 0 @@ -28680,11 +29458,10 @@ _Z69benchForTruncOrZextVecWithAddInLoopWithVW16From_uint32_t_To_uint64_t_RN9benc beqz $s1, .LBB70_14 # %bb.10: # %.lr.ph.preheader addi.d $a0, $fp, 128 - addi.d $a1, $sp, 120 + addi.d $a1, $sp, 88 ori $a2, $s6, 2048 lu12i.w $a3, 2 ori $a3, $a3, 1792 - vrepli.b $vr0, 0 .p2align 4, , 16 .LBB70_11: # %.lr.ph # =>This Loop Header: Depth=1 @@ -28692,7 +29469,7 @@ _Z69benchForTruncOrZextVecWithAddInLoopWithVW16From_uint32_t_To_uint64_t_RN9benc #APP #NO_APP #MEMBARRIER - ld.d $a5, $sp, 120 + ld.d $a5, $sp, 88 add.d $a4, $a5, $a2 addi.d $a5, $a5, 256 move $a6, $a0 @@ -28705,157 +29482,170 @@ _Z69benchForTruncOrZextVecWithAddInLoopWithVW16From_uint32_t_To_uint64_t_RN9benc vld $vr5, $a6, -112 vld $vr6, $a6, -96 vld $vr7, $a6, -80 - vld $vr9, $a6, -64 - vld $vr11, $a6, -48 - vld $vr13, $a6, -32 - vld $vr10, $a6, -16 - vld $vr16, $a6, 0 - vld $vr17, $a6, 16 - vld $vr20, $a6, 32 - vld $vr22, $a6, 48 + vld $vr10, $a6, -64 + vld $vr14, $a6, -48 + vld $vr18, $a6, -32 + vld $vr17, $a6, -16 + vld $vr24, $a6, 0 + vld $vr21, $a6, 16 + vld $vr25, $a6, 32 + vld $vr29, $a6, 48 vld $vr3, $a6, 64 vld $vr2, $a6, 80 vld $vr1, $a6, 96 - vst $vr1, $sp, 96 # 16-byte Folded Spill - vilvl.w $vr14, $vr0, $vr7 - vilvh.w $vr18, $vr0, $vr7 - vilvl.w $vr21, $vr0, $vr6 - vilvh.w $vr24, $vr0, $vr6 - vilvl.w $vr27, $vr0, $vr5 - vilvh.w $vr28, $vr0, $vr5 - vilvl.w $vr25, 
$vr0, $vr4 - vilvh.w $vr30, $vr0, $vr4 - vilvl.w $vr8, $vr0, $vr10 - vilvh.w $vr10, $vr0, $vr10 - vilvl.w $vr12, $vr0, $vr13 - vilvh.w $vr15, $vr0, $vr13 - vilvl.w $vr19, $vr0, $vr11 - vilvh.w $vr23, $vr0, $vr11 - vilvl.w $vr26, $vr0, $vr9 - vilvh.w $vr29, $vr0, $vr9 - vilvl.w $vr4, $vr0, $vr22 - vst $vr4, $sp, 64 # 16-byte Folded Spill - vilvh.w $vr6, $vr0, $vr22 - vilvl.w $vr7, $vr0, $vr20 - vilvh.w $vr9, $vr0, $vr20 - vilvl.w $vr4, $vr0, $vr17 - vilvh.w $vr5, $vr0, $vr17 - vilvl.w $vr17, $vr0, $vr16 - vilvh.w $vr20, $vr0, $vr16 - vilvh.w $vr1, $vr0, $vr1 - vst $vr1, $sp, 80 # 16-byte Folded Spill - vld $vr22, $a5, -240 - vilvh.w $vr16, $vr0, $vr2 - vld $vr31, $a5, -256 - vld $vr1, $a5, -208 - vadd.d $vr11, $vr22, $vr30 - vst $vr11, $sp, 48 # 16-byte Folded Spill - vld $vr30, $a5, -224 - vadd.d $vr11, $vr31, $vr25 - vst $vr11, $sp, 32 # 16-byte Folded Spill + vld $vr8, $a6, 112 + vsllwil.du.wu $vr9, $vr7, 0 + vsllwil.du.wu $vr12, $vr6, 0 + vsllwil.du.wu $vr15, $vr5, 0 + vsllwil.du.wu $vr16, $vr4, 0 + vshuf4i.w $vr7, $vr7, 14 + vsllwil.du.wu $vr19, $vr7, 0 + vshuf4i.w $vr6, $vr6, 14 + vsllwil.du.wu $vr22, $vr6, 0 + vshuf4i.w $vr5, $vr5, 14 + vsllwil.du.wu $vr27, $vr5, 0 + vshuf4i.w $vr4, $vr4, 14 + vsllwil.du.wu $vr26, $vr4, 0 + vsllwil.du.wu $vr6, $vr17, 0 + vsllwil.du.wu $vr5, $vr18, 0 + vsllwil.du.wu $vr11, $vr14, 0 + vsllwil.du.wu $vr13, $vr10, 0 + vshuf4i.w $vr4, $vr17, 14 + vsllwil.du.wu $vr17, $vr4, 0 + vshuf4i.w $vr4, $vr18, 14 + vsllwil.du.wu $vr20, $vr4, 0 + vshuf4i.w $vr4, $vr14, 14 + vsllwil.du.wu $vr23, $vr4, 0 + vshuf4i.w $vr4, $vr10, 14 + vsllwil.du.wu $vr28, $vr4, 0 + vsllwil.du.wu $vr0, $vr29, 0 + vst $vr0, $sp, 64 # 16-byte Folded Spill + vsllwil.du.wu $vr4, $vr25, 0 + vsllwil.du.wu $vr7, $vr21, 0 + vsllwil.du.wu $vr10, $vr24, 0 + vshuf4i.w $vr14, $vr29, 14 + vsllwil.du.wu $vr14, $vr14, 0 + vshuf4i.w $vr18, $vr25, 14 + vsllwil.du.wu $vr18, $vr18, 0 + vshuf4i.w $vr21, $vr21, 14 + vsllwil.du.wu $vr21, $vr21, 0 + vshuf4i.w $vr24, $vr24, 14 + vsllwil.du.wu $vr24, $vr24, 0 + vshuf4i.w $vr25, $vr2, 14 + vld $vr29, $a5, -240 + vsllwil.du.wu $vr25, $vr25, 0 + vld $vr30, $a5, -208 vld $vr31, $a5, -176 - vadd.d $vr1, $vr1, $vr28 - vst $vr1, $sp, 16 # 16-byte Folded Spill - vld $vr1, $a5, -192 - vadd.d $vr27, $vr30, $vr27 - vld $vr30, $a5, -144 - vadd.d $vr24, $vr31, $vr24 - vld $vr31, $a5, -160 - vadd.d $vr22, $vr1, $vr21 - vld $vr21, $a5, -112 - vadd.d $vr18, $vr30, $vr18 - vld $vr30, $a5, -128 - vadd.d $vr31, $vr31, $vr14 - vld $vr14, $a5, -80 - vadd.d $vr25, $vr21, $vr29 - vld $vr29, $a5, -96 - vadd.d $vr26, $vr30, $vr26 - vld $vr30, $a5, -48 - vadd.d $vr21, $vr14, $vr23 - vld $vr23, $a5, -64 - vadd.d $vr19, $vr29, $vr19 - vld $vr29, $a5, -16 - vadd.d $vr15, $vr30, $vr15 - vld $vr30, $a5, -32 - vadd.d $vr13, $vr23, $vr12 - vld $vr23, $a5, 16 - vadd.d $vr11, $vr29, $vr10 - vld $vr29, $a5, 0 - vadd.d $vr8, $vr30, $vr8 - vld $vr30, $a5, 48 - vadd.d $vr20, $vr23, $vr20 - vld $vr23, $a5, 32 - vadd.d $vr17, $vr29, $vr17 - vld $vr29, $a5, 80 - vadd.d $vr14, $vr30, $vr5 - vld $vr30, $a5, 64 - vadd.d $vr12, $vr23, $vr4 - vld $vr23, $a5, 112 - vadd.d $vr9, $vr29, $vr9 - vld $vr29, $a5, 96 - vadd.d $vr7, $vr30, $vr7 - vld $vr30, $a5, 144 - vadd.d $vr6, $vr23, $vr6 - vilvh.w $vr23, $vr0, $vr3 - vld $vr1, $sp, 64 # 16-byte Folded Reload - vadd.d $vr5, $vr29, $vr1 - vld $vr29, $a5, 128 - vadd.d $vr23, $vr30, $vr23 - vld $vr30, $a5, 176 - vilvl.w $vr3, $vr0, $vr3 - vadd.d $vr10, $vr29, $vr3 - vld $vr29, $a5, 160 + vadd.d $vr0, $vr29, $vr26 + vst $vr0, $sp, 48 # 16-byte Folded Spill + vld 
$vr29, $a5, -144 + vadd.d $vr0, $vr30, $vr27 + vst $vr0, $sp, 32 # 16-byte Folded Spill + vld $vr30, $a5, -256 + vadd.d $vr0, $vr31, $vr22 + vst $vr0, $sp, 16 # 16-byte Folded Spill + vld $vr31, $a5, -224 + vadd.d $vr22, $vr29, $vr19 + vld $vr29, $a5, -192 vadd.d $vr16, $vr30, $vr16 - vld $vr30, $a5, 208 - vilvl.w $vr2, $vr0, $vr2 - vadd.d $vr4, $vr29, $vr2 - vld $vr29, $a5, 192 - vld $vr1, $sp, 80 # 16-byte Folded Reload - vadd.d $vr30, $vr30, $vr1 - vld $vr1, $a6, 112 - vld $vr2, $sp, 96 # 16-byte Folded Reload - vilvl.w $vr2, $vr0, $vr2 - vld $vr3, $a5, 240 - vadd.d $vr2, $vr29, $vr2 - vld $vr29, $a5, 224 - vilvh.w $vr28, $vr0, $vr1 - vadd.d $vr3, $vr3, $vr28 - vilvl.w $vr1, $vr0, $vr1 - vadd.d $vr1, $vr29, $vr1 - vst $vr31, $a5, -160 - vst $vr18, $a5, -144 - vst $vr22, $a5, -192 - vst $vr24, $a5, -176 - vst $vr27, $a5, -224 - vld $vr18, $sp, 16 # 16-byte Folded Reload - vst $vr18, $a5, -208 - vld $vr18, $sp, 32 # 16-byte Folded Reload - vst $vr18, $a5, -256 - vld $vr18, $sp, 48 # 16-byte Folded Reload - vst $vr18, $a5, -240 - vst $vr8, $a5, -32 - vst $vr11, $a5, -16 - vst $vr13, $a5, -64 - vst $vr15, $a5, -48 - vst $vr19, $a5, -96 - vst $vr21, $a5, -80 - vst $vr26, $a5, -128 - vst $vr25, $a5, -112 - vst $vr5, $a5, 96 - vst $vr6, $a5, 112 - vst $vr7, $a5, 64 - vst $vr9, $a5, 80 - vst $vr12, $a5, 32 - vst $vr14, $a5, 48 - vst $vr17, $a5, 0 + vld $vr30, $a5, -160 + vadd.d $vr15, $vr31, $vr15 + vld $vr31, $a5, -112 + vadd.d $vr29, $vr29, $vr12 + vld $vr12, $a5, -80 + vadd.d $vr30, $vr30, $vr9 + vld $vr0, $a5, -48 + vadd.d $vr27, $vr31, $vr28 + vld $vr28, $a5, -16 + vadd.d $vr19, $vr12, $vr23 + vld $vr23, $a5, -128 + vadd.d $vr12, $vr0, $vr20 + vld $vr20, $a5, -96 + vadd.d $vr17, $vr28, $vr17 + vld $vr28, $a5, -64 + vadd.d $vr13, $vr23, $vr13 + vld $vr23, $a5, -32 + vadd.d $vr11, $vr20, $vr11 + vld $vr20, $a5, 16 + vadd.d $vr9, $vr28, $vr5 + vld $vr28, $a5, 48 + vadd.d $vr6, $vr23, $vr6 + vld $vr23, $a5, 80 + vadd.d $vr20, $vr20, $vr24 + vld $vr24, $a5, 112 + vadd.d $vr21, $vr28, $vr21 + vld $vr28, $a5, 0 + vadd.d $vr18, $vr23, $vr18 + vld $vr23, $a5, 32 + vadd.d $vr14, $vr24, $vr14 + vld $vr24, $a5, 64 + vadd.d $vr10, $vr28, $vr10 + vld $vr28, $a5, 96 + vadd.d $vr7, $vr23, $vr7 + vshuf4i.w $vr23, $vr3, 14 + vadd.d $vr5, $vr24, $vr4 + vld $vr24, $a5, 144 + vld $vr0, $sp, 64 # 16-byte Folded Reload + vadd.d $vr28, $vr28, $vr0 + vld $vr31, $a5, 176 + vsllwil.du.wu $vr23, $vr23, 0 + vadd.d $vr23, $vr24, $vr23 + vld $vr24, $a5, 208 + vadd.d $vr25, $vr31, $vr25 + vshuf4i.w $vr31, $vr1, 14 + vsllwil.du.wu $vr31, $vr31, 0 + vadd.d $vr24, $vr24, $vr31 + vld $vr31, $a5, 240 + vori.b $vr0, $vr8, 0 + vshuf4i.w $vr8, $vr8, 14 + vsllwil.du.wu $vr8, $vr8, 0 + vld $vr26, $a5, 128 + vadd.d $vr8, $vr31, $vr8 + vsllwil.du.wu $vr3, $vr3, 0 + vld $vr31, $a5, 160 + vadd.d $vr4, $vr26, $vr3 + vsllwil.du.wu $vr2, $vr2, 0 + vld $vr3, $a5, 192 + vadd.d $vr2, $vr31, $vr2 + vld $vr31, $a5, 224 + vsllwil.du.wu $vr26, $vr1, 0 + vadd.d $vr3, $vr3, $vr26 + vsllwil.du.wu $vr26, $vr0, 0 + vadd.d $vr26, $vr31, $vr26 + vst $vr30, $a5, -160 + vst $vr29, $a5, -192 + vst $vr15, $a5, -224 + vst $vr16, $a5, -256 + vst $vr22, $a5, -144 + vld $vr0, $sp, 16 # 16-byte Folded Reload + vst $vr0, $a5, -176 + vld $vr0, $sp, 32 # 16-byte Folded Reload + vst $vr0, $a5, -208 + vld $vr0, $sp, 48 # 16-byte Folded Reload + vst $vr0, $a5, -240 + vst $vr6, $a5, -32 + vst $vr9, $a5, -64 + vst $vr11, $a5, -96 + vst $vr13, $a5, -128 + vst $vr17, $a5, -16 + vst $vr12, $a5, -48 + vst $vr19, $a5, -80 + vst $vr27, $a5, -112 + vst $vr28, $a5, 96 + vst 
$vr5, $a5, 64 + vst $vr7, $a5, 32 + vst $vr10, $a5, 0 + vst $vr14, $a5, 112 + vst $vr18, $a5, 80 + vst $vr21, $a5, 48 vst $vr20, $a5, 16 - vst $vr1, $a5, 224 - vst $vr3, $a5, 240 - vst $vr2, $a5, 192 - vst $vr30, $a5, 208 - vst $vr4, $a5, 160 - vst $vr16, $a5, 176 - vst $vr10, $a5, 128 + vst $vr26, $a5, 224 + vst $vr3, $a5, 192 + vst $vr2, $a5, 160 + vst $vr4, $a5, 128 + vst $vr8, $a5, 240 + vst $vr24, $a5, 208 + vst $vr25, $a5, 176 vst $vr23, $a5, 144 addi.d $a7, $a7, -64 addi.d $a6, $a6, 256 @@ -28936,7 +29726,7 @@ _Z69benchForTruncOrZextVecWithAddInLoopWithVW16From_uint32_t_To_uint64_t_RN9benc jirl $ra, $ra, 0 .Ltmp1532: # EH_LABEL # %bb.15: # %_ZNSt10unique_ptrIA_mSt14default_deleteIS0_EED2Ev.exit22 - ld.d $a0, $sp, 120 + ld.d $a0, $sp, 88 beqz $a0, .LBB70_17 # %bb.16: # %_ZNKSt14default_deleteIA_mEclImEENSt9enable_ifIXsr14is_convertibleIPA_T_PS0_EE5valueEvE4typeEPS4_.exit.i24 pcaddu18i $ra, %call36(_ZdaPv) @@ -28945,25 +29735,25 @@ _Z69benchForTruncOrZextVecWithAddInLoopWithVW16From_uint32_t_To_uint64_t_RN9benc move $a0, $fp pcaddu18i $ra, %call36(_ZdaPv) jirl $ra, $ra, 0 - fld.d $fs7, $sp, 144 # 8-byte Folded Reload - fld.d $fs6, $sp, 152 # 8-byte Folded Reload - fld.d $fs5, $sp, 160 # 8-byte Folded Reload - fld.d $fs4, $sp, 168 # 8-byte Folded Reload - fld.d $fs3, $sp, 176 # 8-byte Folded Reload - fld.d $fs2, $sp, 184 # 8-byte Folded Reload - fld.d $fs1, $sp, 192 # 8-byte Folded Reload - fld.d $fs0, $sp, 200 # 8-byte Folded Reload - ld.d $s7, $sp, 208 # 8-byte Folded Reload - ld.d $s6, $sp, 216 # 8-byte Folded Reload - ld.d $s5, $sp, 224 # 8-byte Folded Reload - ld.d $s4, $sp, 232 # 8-byte Folded Reload - ld.d $s3, $sp, 240 # 8-byte Folded Reload - ld.d $s2, $sp, 248 # 8-byte Folded Reload - ld.d $s1, $sp, 256 # 8-byte Folded Reload - ld.d $s0, $sp, 264 # 8-byte Folded Reload - ld.d $fp, $sp, 272 # 8-byte Folded Reload - ld.d $ra, $sp, 280 # 8-byte Folded Reload - addi.d $sp, $sp, 288 + fld.d $fs7, $sp, 112 # 8-byte Folded Reload + fld.d $fs6, $sp, 120 # 8-byte Folded Reload + fld.d $fs5, $sp, 128 # 8-byte Folded Reload + fld.d $fs4, $sp, 136 # 8-byte Folded Reload + fld.d $fs3, $sp, 144 # 8-byte Folded Reload + fld.d $fs2, $sp, 152 # 8-byte Folded Reload + fld.d $fs1, $sp, 160 # 8-byte Folded Reload + fld.d $fs0, $sp, 168 # 8-byte Folded Reload + ld.d $s7, $sp, 176 # 8-byte Folded Reload + ld.d $s6, $sp, 184 # 8-byte Folded Reload + ld.d $s5, $sp, 192 # 8-byte Folded Reload + ld.d $s4, $sp, 200 # 8-byte Folded Reload + ld.d $s3, $sp, 208 # 8-byte Folded Reload + ld.d $s2, $sp, 216 # 8-byte Folded Reload + ld.d $s1, $sp, 224 # 8-byte Folded Reload + ld.d $s0, $sp, 232 # 8-byte Folded Reload + ld.d $fp, $sp, 240 # 8-byte Folded Reload + ld.d $ra, $sp, 248 # 8-byte Folded Reload + addi.d $sp, $sp, 256 ret .LBB70_18: .Ltmp1522: # EH_LABEL @@ -28976,7 +29766,7 @@ _Z69benchForTruncOrZextVecWithAddInLoopWithVW16From_uint32_t_To_uint64_t_RN9benc jirl $ra, $ra, 0 .LBB70_19: .Ltmp1533: # EH_LABEL - ld.d $s1, $sp, 120 + ld.d $s1, $sp, 88 move $s0, $a0 bnez $s1, .LBB70_23 b .LBB70_24 @@ -29145,7 +29935,6 @@ _Z61benchForTruncOrZextVecWithAddInLoopFrom_uint32_t_To_uint64_t_RN9benchmark5St addi.d $a1, $sp, 8 lu12i.w $a2, 2 ori $a2, $a2, 1808 - vrepli.b $vr0, 0 .p2align 4, , 16 .LBB71_11: # %.lr.ph # =>This Loop Header: Depth=1 @@ -29163,16 +29952,16 @@ _Z61benchForTruncOrZextVecWithAddInLoopFrom_uint32_t_To_uint64_t_RN9benchmark5St # => This Inner Loop Header: Depth=2 ld.d $a6, $a4, -8 ld.d $a7, $a4, 0 - vinsgr2vr.d $vr1, $a6, 0 - vinsgr2vr.d $vr2, $a7, 0 - vld $vr3, $a3, -16 - vld $vr4, $a3, 0 - 
vilvl.w $vr1, $vr0, $vr1 - vilvl.w $vr2, $vr0, $vr2 + vinsgr2vr.d $vr0, $a6, 0 + vinsgr2vr.d $vr1, $a7, 0 + vld $vr2, $a3, -16 + vld $vr3, $a3, 0 + vsllwil.du.wu $vr0, $vr0, 0 + vsllwil.du.wu $vr1, $vr1, 0 + vadd.d $vr0, $vr2, $vr0 vadd.d $vr1, $vr3, $vr1 - vadd.d $vr2, $vr4, $vr2 - vst $vr1, $a3, -16 - vst $vr2, $a3, 0 + vst $vr0, $a3, -16 + vst $vr1, $a3, 0 addi.d $a5, $a5, -4 addi.d $a3, $a3, 32 addi.d $a4, $a4, 16 diff --git a/results/MultiSource/Applications/ALAC/decode/CMakeFiles/alacconvert-decode.dir/ALACEncoder.s b/results/MultiSource/Applications/ALAC/decode/CMakeFiles/alacconvert-decode.dir/ALACEncoder.s index 4dc7f9c0..abcbe365 100644 --- a/results/MultiSource/Applications/ALAC/decode/CMakeFiles/alacconvert-decode.dir/ALACEncoder.s +++ b/results/MultiSource/Applications/ALAC/decode/CMakeFiles/alacconvert-decode.dir/ALACEncoder.s @@ -1665,12 +1665,8 @@ _ZN11ALACEncoder10EncodeMonoEP9BitBufferPvjjj: # @_ZN11ALACEncoder10EncodeMonoEP ld.d $a7, $a3, 0 vinsgr2vr.d $vr0, $a6, 0 vinsgr2vr.d $vr1, $a7, 0 - vilvl.h $vr0, $vr0, $vr0 - vslli.w $vr0, $vr0, 16 - vsrai.w $vr0, $vr0, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vsllwil.w.h $vr0, $vr0, 0 + vsllwil.w.h $vr1, $vr1, 0 vst $vr0, $a4, -16 vst $vr1, $a4, 0 addi.d $a3, $a3, 16 diff --git a/results/MultiSource/Applications/ALAC/decode/CMakeFiles/alacconvert-decode.dir/dp_dec.s b/results/MultiSource/Applications/ALAC/decode/CMakeFiles/alacconvert-decode.dir/dp_dec.s index 754a4f96..a2b2b5cd 100644 --- a/results/MultiSource/Applications/ALAC/decode/CMakeFiles/alacconvert-decode.dir/dp_dec.s +++ b/results/MultiSource/Applications/ALAC/decode/CMakeFiles/alacconvert-decode.dir/dp_dec.s @@ -556,14 +556,10 @@ unpc_block: # @unpc_block ld.d $s5, $s2, 0 vinsgr2vr.d $vr4, $s4, 0 vinsgr2vr.d $vr5, $s5, 0 - vilvl.h $vr4, $vr4, $vr4 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr5, $vr5, $vr5 vld $vr6, $s1, 0 vld $vr7, $s1, -16 - vslli.w $vr5, $vr5, 16 - vsrai.w $vr5, $vr5, 16 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr5, $vr5, 0 vshuf4i.w $vr6, $vr6, 27 vshuf4i.w $vr7, $vr7, 27 vsub.w $vr6, $vr6, $vr1 diff --git a/results/MultiSource/Applications/ALAC/decode/CMakeFiles/alacconvert-decode.dir/dp_enc.s b/results/MultiSource/Applications/ALAC/decode/CMakeFiles/alacconvert-decode.dir/dp_enc.s index 35ebc587..e62ba05b 100644 --- a/results/MultiSource/Applications/ALAC/decode/CMakeFiles/alacconvert-decode.dir/dp_enc.s +++ b/results/MultiSource/Applications/ALAC/decode/CMakeFiles/alacconvert-decode.dir/dp_enc.s @@ -750,14 +750,10 @@ pc_block: # @pc_block ld.d $s5, $s2, 0 vinsgr2vr.d $vr4, $s4, 0 vinsgr2vr.d $vr5, $s5, 0 - vilvl.h $vr4, $vr4, $vr4 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr5, $vr5, $vr5 vld $vr6, $s1, 0 vld $vr7, $s1, -16 - vslli.w $vr5, $vr5, 16 - vsrai.w $vr5, $vr5, 16 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr5, $vr5, 0 vshuf4i.w $vr6, $vr6, 27 vshuf4i.w $vr7, $vr7, 27 vsub.w $vr6, $vr6, $vr1 diff --git a/results/MultiSource/Applications/ALAC/encode/CMakeFiles/alacconvert-encode.dir/ALACEncoder.s b/results/MultiSource/Applications/ALAC/encode/CMakeFiles/alacconvert-encode.dir/ALACEncoder.s index 4dc7f9c0..abcbe365 100644 --- a/results/MultiSource/Applications/ALAC/encode/CMakeFiles/alacconvert-encode.dir/ALACEncoder.s +++ b/results/MultiSource/Applications/ALAC/encode/CMakeFiles/alacconvert-encode.dir/ALACEncoder.s @@ -1665,12 +1665,8 @@ _ZN11ALACEncoder10EncodeMonoEP9BitBufferPvjjj: # @_ZN11ALACEncoder10EncodeMonoEP ld.d $a7, $a3, 0 vinsgr2vr.d $vr0, $a6, 0 
vinsgr2vr.d $vr1, $a7, 0 - vilvl.h $vr0, $vr0, $vr0 - vslli.w $vr0, $vr0, 16 - vsrai.w $vr0, $vr0, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vsllwil.w.h $vr0, $vr0, 0 + vsllwil.w.h $vr1, $vr1, 0 vst $vr0, $a4, -16 vst $vr1, $a4, 0 addi.d $a3, $a3, 16 diff --git a/results/MultiSource/Applications/ALAC/encode/CMakeFiles/alacconvert-encode.dir/dp_dec.s b/results/MultiSource/Applications/ALAC/encode/CMakeFiles/alacconvert-encode.dir/dp_dec.s index 754a4f96..a2b2b5cd 100644 --- a/results/MultiSource/Applications/ALAC/encode/CMakeFiles/alacconvert-encode.dir/dp_dec.s +++ b/results/MultiSource/Applications/ALAC/encode/CMakeFiles/alacconvert-encode.dir/dp_dec.s @@ -556,14 +556,10 @@ unpc_block: # @unpc_block ld.d $s5, $s2, 0 vinsgr2vr.d $vr4, $s4, 0 vinsgr2vr.d $vr5, $s5, 0 - vilvl.h $vr4, $vr4, $vr4 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr5, $vr5, $vr5 vld $vr6, $s1, 0 vld $vr7, $s1, -16 - vslli.w $vr5, $vr5, 16 - vsrai.w $vr5, $vr5, 16 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr5, $vr5, 0 vshuf4i.w $vr6, $vr6, 27 vshuf4i.w $vr7, $vr7, 27 vsub.w $vr6, $vr6, $vr1 diff --git a/results/MultiSource/Applications/ALAC/encode/CMakeFiles/alacconvert-encode.dir/dp_enc.s b/results/MultiSource/Applications/ALAC/encode/CMakeFiles/alacconvert-encode.dir/dp_enc.s index 35ebc587..e62ba05b 100644 --- a/results/MultiSource/Applications/ALAC/encode/CMakeFiles/alacconvert-encode.dir/dp_enc.s +++ b/results/MultiSource/Applications/ALAC/encode/CMakeFiles/alacconvert-encode.dir/dp_enc.s @@ -750,14 +750,10 @@ pc_block: # @pc_block ld.d $s5, $s2, 0 vinsgr2vr.d $vr4, $s4, 0 vinsgr2vr.d $vr5, $s5, 0 - vilvl.h $vr4, $vr4, $vr4 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr5, $vr5, $vr5 vld $vr6, $s1, 0 vld $vr7, $s1, -16 - vslli.w $vr5, $vr5, 16 - vsrai.w $vr5, $vr5, 16 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr5, $vr5, 0 vshuf4i.w $vr6, $vr6, 27 vshuf4i.w $vr7, $vr7, 27 vsub.w $vr6, $vr6, $vr1 diff --git a/results/MultiSource/Applications/ClamAV/CMakeFiles/clamscan.dir/libclamav_is_tar.s b/results/MultiSource/Applications/ClamAV/CMakeFiles/clamscan.dir/libclamav_is_tar.s index be8238bf..8066c996 100644 --- a/results/MultiSource/Applications/ClamAV/CMakeFiles/clamscan.dir/libclamav_is_tar.s +++ b/results/MultiSource/Applications/ClamAV/CMakeFiles/clamscan.dir/libclamav_is_tar.s @@ -130,25 +130,24 @@ is_tar: # @is_tar or $a2, $a0, $a2 .LBB0_23: # %vector.ph move $a0, $zero - vrepli.b $vr2, 0 + vrepli.b $vr0, 0 ori $a3, $zero, 512 - vori.b $vr0, $vr2, 0 - vori.b $vr1, $vr2, 0 + vori.b $vr1, $vr0, 0 .p2align 4, , 16 .LBB0_24: # %vector.body # =>This Inner Loop Header: Depth=1 add.d $a4, $fp, $a0 ldx.w $a5, $fp, $a0 ld.w $a4, $a4, 4 - vinsgr2vr.w $vr3, $a5, 0 - vinsgr2vr.w $vr4, $a4, 0 - vilvl.b $vr3, $vr2, $vr3 - vilvl.h $vr3, $vr2, $vr3 - vilvl.b $vr4, $vr2, $vr4 - vilvl.h $vr4, $vr2, $vr4 - vadd.w $vr0, $vr0, $vr3 + vinsgr2vr.w $vr2, $a5, 0 + vinsgr2vr.w $vr3, $a4, 0 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.hu.bu $vr3, $vr3, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vadd.w $vr0, $vr0, $vr2 addi.d $a0, $a0, 8 - vadd.w $vr1, $vr1, $vr4 + vadd.w $vr1, $vr1, $vr3 bne $a0, $a3, .LBB0_24 # %bb.25: # %middle.block ld.bu $a0, $fp, 155 diff --git a/results/MultiSource/Applications/ClamAV/CMakeFiles/clamscan.dir/libclamav_mspack.s b/results/MultiSource/Applications/ClamAV/CMakeFiles/clamscan.dir/libclamav_mspack.s index 6d0b081b..9d6188e5 100644 --- 
a/results/MultiSource/Applications/ClamAV/CMakeFiles/clamscan.dir/libclamav_mspack.s +++ b/results/MultiSource/Applications/ClamAV/CMakeFiles/clamscan.dir/libclamav_mspack.s @@ -11680,13 +11680,12 @@ mszip_make_decode_table: # @mszip_make_decode_table st.d $a4, $sp, 24 # 8-byte Folded Spill st.d $a0, $sp, 16 # 8-byte Folded Spill bstrpick.d $a4, $a0, 15, 0 + pcalau12i $t2, %pc_hi20(.LCPI16_0) + vld $vr0, $t2, %pc_lo12(.LCPI16_0) + pcalau12i $t2, %pc_hi20(.LCPI16_1) + vld $vr1, $t2, %pc_lo12(.LCPI16_1) addi.w $t2, $zero, -1 - pcalau12i $t3, %pc_hi20(.LCPI16_0) - vld $vr0, $t3, %pc_lo12(.LCPI16_0) - pcalau12i $t3, %pc_hi20(.LCPI16_1) - vld $vr1, $t3, %pc_lo12(.LCPI16_1) ori $t3, $zero, 16 - vrepli.b $vr2, 0 ori $t7, $zero, 1 st.d $t4, $sp, 8 # 8-byte Folded Spill st.d $a1, $sp, 32 # 8-byte Folded Spill @@ -11720,11 +11719,11 @@ mszip_make_decode_table: # @mszip_make_decode_table slli.w $s1, $s1, 3 mul.d $a1, $s1, $t7 slli.d $s3, $t7, 3 - vreplgr2vr.w $vr3, $s3 + vreplgr2vr.w $vr2, $s3 andi $s3, $t4, 7 - vreplgr2vr.w $vr5, $t7 - vmul.w $vr4, $vr5, $vr0 - vmul.w $vr5, $vr5, $vr1 + vreplgr2vr.w $vr4, $t7 + vmul.w $vr3, $vr4, $vr0 + vmul.w $vr4, $vr4, $vr1 b .LBB16_4 .p2align 4, , 16 .LBB16_3: # %.loopexit140 @@ -11789,34 +11788,36 @@ mszip_make_decode_table: # @mszip_make_decode_table .LBB16_12: # %vector.ph # in Loop: Header=BB16_4 Depth=2 add.w $s4, $s6, $a1 - vreplgr2vr.w $vr7, $s6 - vadd.w $vr6, $vr7, $vr4 - vadd.w $vr7, $vr7, $vr5 + vreplgr2vr.w $vr6, $s6 + vadd.w $vr5, $vr6, $vr3 + vadd.w $vr6, $vr6, $vr4 move $s6, $s1 .p2align 4, , 16 .LBB16_13: # %vector.body # Parent Loop BB16_2 Depth=1 # Parent Loop BB16_4 Depth=2 # => This Inner Loop Header: Depth=3 - vilvh.w $vr8, $vr2, $vr6 - vilvl.w $vr9, $vr2, $vr6 - vilvh.w $vr10, $vr2, $vr7 - vilvl.w $vr11, $vr2, $vr7 - vpickve2gr.d $s7, $vr11, 0 + vshuf4i.w $vr7, $vr5, 14 + vsllwil.du.wu $vr7, $vr7, 0 + vsllwil.du.wu $vr8, $vr5, 0 + vshuf4i.w $vr9, $vr6, 14 + vsllwil.du.wu $vr9, $vr9, 0 + vsllwil.du.wu $vr10, $vr6, 0 + vpickve2gr.d $s7, $vr10, 0 slli.d $s7, $s7, 1 - vpickve2gr.d $s8, $vr11, 1 + vpickve2gr.d $s8, $vr10, 1 slli.d $s8, $s8, 1 - vpickve2gr.d $ra, $vr10, 0 + vpickve2gr.d $ra, $vr9, 0 slli.d $ra, $ra, 1 - vpickve2gr.d $a6, $vr10, 1 + vpickve2gr.d $a6, $vr9, 1 slli.d $a6, $a6, 1 - vpickve2gr.d $a0, $vr9, 0 + vpickve2gr.d $a0, $vr8, 0 slli.d $a0, $a0, 1 - vpickve2gr.d $t0, $vr9, 1 + vpickve2gr.d $t0, $vr8, 1 slli.d $t0, $t0, 1 - vpickve2gr.d $t1, $vr8, 0 + vpickve2gr.d $t1, $vr7, 0 slli.d $t1, $t1, 1 - vpickve2gr.d $s2, $vr8, 1 + vpickve2gr.d $s2, $vr7, 1 slli.d $s2, $s2, 1 stx.h $t6, $a3, $s7 stx.h $t6, $a3, $s8 @@ -11826,9 +11827,9 @@ mszip_make_decode_table: # @mszip_make_decode_table stx.h $t6, $a3, $t0 stx.h $t6, $a3, $t1 stx.h $t6, $a3, $s2 - vadd.w $vr7, $vr7, $vr3 + vadd.w $vr6, $vr6, $vr2 addi.w $s6, $s6, -8 - vadd.w $vr6, $vr6, $vr3 + vadd.w $vr5, $vr5, $vr2 bnez $s6, .LBB16_13 # %bb.14: # %middle.block # in Loop: Header=BB16_4 Depth=2 diff --git a/results/MultiSource/Applications/ClamAV/CMakeFiles/clamscan.dir/libclamav_nsis_bzlib.s b/results/MultiSource/Applications/ClamAV/CMakeFiles/clamscan.dir/libclamav_nsis_bzlib.s index 53621485..c8cea39b 100644 --- a/results/MultiSource/Applications/ClamAV/CMakeFiles/clamscan.dir/libclamav_nsis_bzlib.s +++ b/results/MultiSource/Applications/ClamAV/CMakeFiles/clamscan.dir/libclamav_nsis_bzlib.s @@ -124,18 +124,18 @@ default_bzfree: # @default_bzfree .type nsis_BZ2_bzDecompress,@function nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress # %bb.0: - addi.d $sp, $sp, -608 - st.d $ra, $sp, 
600 # 8-byte Folded Spill - st.d $fp, $sp, 592 # 8-byte Folded Spill - st.d $s0, $sp, 584 # 8-byte Folded Spill - st.d $s1, $sp, 576 # 8-byte Folded Spill - st.d $s2, $sp, 568 # 8-byte Folded Spill - st.d $s3, $sp, 560 # 8-byte Folded Spill - st.d $s4, $sp, 552 # 8-byte Folded Spill - st.d $s5, $sp, 544 # 8-byte Folded Spill - st.d $s6, $sp, 536 # 8-byte Folded Spill - st.d $s7, $sp, 528 # 8-byte Folded Spill - st.d $s8, $sp, 520 # 8-byte Folded Spill + addi.d $sp, $sp, -592 + st.d $ra, $sp, 584 # 8-byte Folded Spill + st.d $fp, $sp, 576 # 8-byte Folded Spill + st.d $s0, $sp, 568 # 8-byte Folded Spill + st.d $s1, $sp, 560 # 8-byte Folded Spill + st.d $s2, $sp, 552 # 8-byte Folded Spill + st.d $s3, $sp, 544 # 8-byte Folded Spill + st.d $s4, $sp, 536 # 8-byte Folded Spill + st.d $s5, $sp, 528 # 8-byte Folded Spill + st.d $s6, $sp, 520 # 8-byte Folded Spill + st.d $s7, $sp, 512 # 8-byte Folded Spill + st.d $s8, $sp, 504 # 8-byte Folded Spill move $a1, $a0 addi.w $a0, $zero, -2 beqz $a1, .LBB3_483 @@ -160,34 +160,34 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress addi.d $a3, $s1, 2047 addi.d $s8, $a3, 1149 addi.d $a4, $a3, 1421 - st.d $a4, $sp, 424 # 8-byte Folded Spill + st.d $a4, $sp, 408 # 8-byte Folded Spill lu12i.w $a4, 6 ori $a4, $a4, 1310 add.d $a4, $s1, $a4 - st.d $a4, $sp, 336 # 8-byte Folded Spill + st.d $a4, $sp, 344 # 8-byte Folded Spill ori $a4, $a0, 3788 add.d $a4, $s1, $a4 - st.d $a4, $sp, 344 # 8-byte Folded Spill + st.d $a4, $sp, 352 # 8-byte Folded Spill ori $a1, $a1, 2928 add.d $a1, $s1, $a1 st.d $a1, $sp, 200 # 8-byte Folded Spill lu12i.w $a1, 11 ori $a1, $a1, 380 add.d $a1, $s1, $a1 - st.d $a1, $sp, 384 # 8-byte Folded Spill + st.d $a1, $sp, 392 # 8-byte Folded Spill lu12i.w $a4, 12 ori $a1, $a4, 2476 add.d $ra, $s1, $a1 lu12i.w $a1, 14 ori $a1, $a1, 476 add.d $a1, $s1, $a1 - st.d $a1, $sp, 352 # 8-byte Folded Spill + st.d $a1, $sp, 360 # 8-byte Folded Spill addi.d $a1, $a2, -1524 - st.d $a1, $sp, 376 # 8-byte Folded Spill + st.d $a1, $sp, 384 # 8-byte Folded Spill addi.d $a1, $s1, 68 st.d $a1, $sp, 208 # 8-byte Folded Spill addi.d $a1, $a3, 1677 - st.d $a1, $sp, 328 # 8-byte Folded Spill + st.d $a1, $sp, 336 # 8-byte Folded Spill ori $a1, $a0, 3724 add.d $a1, $s1, $a1 st.d $a1, $sp, 152 # 8-byte Folded Spill @@ -206,7 +206,7 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress st.d $a2, $sp, 144 # 8-byte Folded Spill lu12i.w $a2, 24 ori $a2, $a2, 1696 - st.d $a2, $sp, 392 # 8-byte Folded Spill + st.d $a2, $sp, 400 # 8-byte Folded Spill lu12i.w $a2, 12320 ori $a2, $a2, 256 lu32i.d $a2, 394500 @@ -215,8 +215,9 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress vst $vr0, $sp, 128 # 16-byte Folded Spill vrepli.b $vr3, 32 ori $a0, $a0, 2100 - st.d $a0, $sp, 360 # 8-byte Folded Spill - vrepli.b $vr6, 0 + st.d $a0, $sp, 368 # 8-byte Folded Spill + vrepli.b $vr0, 0 + vst $vr0, $sp, 304 # 16-byte Folded Spill vrepli.w $vr0, 32 vst $vr0, $sp, 288 # 16-byte Folded Spill lu12i.w $a0, 878 @@ -230,9 +231,8 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress ori $s4, $zero, 1 ori $t7, $zero, 2 ori $t8, $zero, 10 - st.d $ra, $sp, 496 # 8-byte Folded Spill - vst $vr3, $sp, 304 # 16-byte Folded Spill - vst $vr6, $sp, 400 # 16-byte Folded Spill + st.d $ra, $sp, 480 # 8-byte Folded Spill + vst $vr3, $sp, 320 # 16-byte Folded Spill .p2align 4, , 16 .LBB3_4: # =>This Loop Header: Depth=1 # Child Loop BB3_14 Depth 2 @@ -326,7 +326,7 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress ld.d $a4, $fp, 56 ld.d $a3, $fp, 72 stptr.d $a0, $s1, 3160 - ld.d $a0, $sp, 392 # 8-byte Folded Reload + ld.d 
$a0, $sp, 400 # 8-byte Folded Reload mul.w $a0, $a1, $a0 srai.d $a1, $a0, 1 ori $a2, $zero, 1 @@ -339,22 +339,22 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress addi.w $a0, $zero, -3 beqz $a3, .LBB3_99 # %bb.10: # in Loop: Header=BB3_4 Depth=1 - st.d $a2, $sp, 488 # 8-byte Folded Spill - st.d $a2, $sp, 480 # 8-byte Folded Spill + st.d $a2, $sp, 472 # 8-byte Folded Spill + st.d $a2, $sp, 464 # 8-byte Folded Spill move $t4, $a2 move $t6, $a2 move $fp, $a2 + st.d $a2, $sp, 432 # 8-byte Folded Spill st.d $a2, $sp, 448 # 8-byte Folded Spill - st.d $a2, $sp, 464 # 8-byte Folded Spill - st.d $a2, $sp, 472 # 8-byte Folded Spill + st.d $a2, $sp, 456 # 8-byte Folded Spill move $t5, $a2 - st.d $a2, $sp, 440 # 8-byte Folded Spill + st.d $a2, $sp, 424 # 8-byte Folded Spill move $s2, $a2 - st.d $a2, $sp, 432 # 8-byte Folded Spill + st.d $a2, $sp, 416 # 8-byte Folded Spill move $t1, $a2 move $a7, $a2 - st.d $a2, $sp, 456 # 8-byte Folded Spill - st.d $a2, $sp, 504 # 8-byte Folded Spill + st.d $a2, $sp, 440 # 8-byte Folded Spill + st.d $a2, $sp, 488 # 8-byte Folded Spill move $a5, $a2 move $t7, $a2 move $s4, $a2 @@ -363,9 +363,8 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress move $t2, $a2 move $t0, $a2 move $a3, $a2 - ld.d $ra, $sp, 496 # 8-byte Folded Reload - vld $vr3, $sp, 304 # 16-byte Folded Reload - vld $vr6, $sp, 400 # 16-byte Folded Reload + ld.d $ra, $sp, 480 # 8-byte Folded Reload + vld $vr3, $sp, 320 # 16-byte Folded Reload ori $t8, $zero, 10 bnez $a1, .LBB3_85 b .LBB3_352 @@ -758,7 +757,7 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress beq $t5, $a7, .LBB3_160 # %bb.59: # in Loop: Header=BB3_44 Depth=2 ld.w $t1, $s1, 40 - ld.d $t2, $sp, 392 # 8-byte Folded Reload + ld.d $t2, $sp, 400 # 8-byte Folded Reload mul.w $t7, $t1, $t2 addi.w $t1, $a5, 0 bgeu $t1, $t7, .LBB3_483 @@ -875,29 +874,29 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress .LBB3_80: # in Loop: Header=BB3_4 Depth=1 ld.w $a2, $s6, 0 ld.w $a0, $s6, 4 - st.d $a0, $sp, 488 # 8-byte Folded Spill + st.d $a0, $sp, 472 # 8-byte Folded Spill ld.w $a0, $s6, 8 - st.d $a0, $sp, 480 # 8-byte Folded Spill + st.d $a0, $sp, 464 # 8-byte Folded Spill ld.w $t4, $s6, 12 ld.w $t6, $s6, 16 ld.w $fp, $s6, 20 ld.w $a0, $s6, 24 - st.d $a0, $sp, 448 # 8-byte Folded Spill + st.d $a0, $sp, 432 # 8-byte Folded Spill ld.w $a0, $s6, 28 - st.d $a0, $sp, 464 # 8-byte Folded Spill + st.d $a0, $sp, 448 # 8-byte Folded Spill ld.w $a0, $s6, 32 - st.d $a0, $sp, 472 # 8-byte Folded Spill + st.d $a0, $sp, 456 # 8-byte Folded Spill ld.w $t5, $s6, 36 ld.w $a0, $s6, 40 - st.d $a0, $sp, 440 # 8-byte Folded Spill + st.d $a0, $sp, 424 # 8-byte Folded Spill ld.w $a0, $s6, 48 - st.d $a0, $sp, 432 # 8-byte Folded Spill + st.d $a0, $sp, 416 # 8-byte Folded Spill ld.w $t1, $s6, 52 ld.w $a7, $s6, 56 ld.w $a0, $s6, 60 - st.d $a0, $sp, 456 # 8-byte Folded Spill + st.d $a0, $sp, 440 # 8-byte Folded Spill ld.w $a0, $s6, 64 - st.d $a0, $sp, 504 # 8-byte Folded Spill + st.d $a0, $sp, 488 # 8-byte Folded Spill ld.w $a5, $s6, 68 ld.w $t7, $s6, 72 ld.w $s4, $s6, 76 @@ -919,7 +918,7 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress jr $a1 .LBB3_82: # %._crit_edge1847.i # in Loop: Header=BB3_4 Depth=1 - st.d $s2, $sp, 368 # 8-byte Folded Spill + st.d $s2, $sp, 376 # 8-byte Folded Spill ld.w $a0, $s1, 36 b .LBB3_90 .LBB3_83: # in Loop: Header=BB3_4 Depth=1 @@ -929,22 +928,22 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress beqz $a0, .LBB3_105 # %bb.84: # in Loop: Header=BB3_4 Depth=1 move $a2, $zero - st.d $zero, $sp, 488 # 8-byte Folded Spill - st.d $zero, $sp, 480 # 8-byte 
Folded Spill + st.d $zero, $sp, 472 # 8-byte Folded Spill + st.d $zero, $sp, 464 # 8-byte Folded Spill move $t4, $zero move $t6, $zero move $fp, $zero + st.d $zero, $sp, 432 # 8-byte Folded Spill st.d $zero, $sp, 448 # 8-byte Folded Spill - st.d $zero, $sp, 464 # 8-byte Folded Spill - st.d $zero, $sp, 472 # 8-byte Folded Spill + st.d $zero, $sp, 456 # 8-byte Folded Spill move $t5, $zero - st.d $zero, $sp, 440 # 8-byte Folded Spill + st.d $zero, $sp, 424 # 8-byte Folded Spill move $s2, $zero - st.d $zero, $sp, 432 # 8-byte Folded Spill + st.d $zero, $sp, 416 # 8-byte Folded Spill move $t1, $zero move $a7, $zero - st.d $zero, $sp, 456 # 8-byte Folded Spill - st.d $zero, $sp, 504 # 8-byte Folded Spill + st.d $zero, $sp, 440 # 8-byte Folded Spill + st.d $zero, $sp, 488 # 8-byte Folded Spill move $a5, $zero move $t7, $zero move $s4, $zero @@ -952,12 +951,11 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress move $s5, $zero move $t2, $zero move $t0, $zero - ld.d $ra, $sp, 496 # 8-byte Folded Reload - vld $vr3, $sp, 304 # 16-byte Folded Reload - vld $vr6, $sp, 400 # 16-byte Folded Reload + ld.d $ra, $sp, 480 # 8-byte Folded Reload + vld $vr3, $sp, 320 # 16-byte Folded Reload ori $t8, $zero, 10 .LBB3_85: # in Loop: Header=BB3_4 Depth=1 - st.d $s2, $sp, 368 # 8-byte Folded Spill + st.d $s2, $sp, 376 # 8-byte Folded Spill ld.w $a1, $s1, 36 ori $a0, $zero, 14 st.w $a0, $s1, 8 @@ -979,7 +977,7 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress # %bb.88: # %._crit_edge.i # in Loop: Header=BB3_4 Depth=1 ori $a3, $zero, 49 - ld.d $s2, $sp, 368 # 8-byte Folded Reload + ld.d $s2, $sp, 376 # 8-byte Folded Reload bne $a1, $a3, .LBB3_212 # %bb.89: # in Loop: Header=BB3_4 Depth=1 st.w $zero, $s1, 56 @@ -1045,7 +1043,7 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress ori $a0, $zero, 4 b .LBB3_215 .LBB3_99: # in Loop: Header=BB3_4 Depth=1 - st.d $a2, $sp, 456 # 8-byte Folded Spill + st.d $a2, $sp, 440 # 8-byte Folded Spill move $t7, $a2 move $s4, $a2 move $s3, $a2 @@ -1053,21 +1051,21 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress move $t2, $a2 move $t0, $a2 move $a5, $a2 - st.d $a2, $sp, 504 # 8-byte Folded Spill + st.d $a2, $sp, 488 # 8-byte Folded Spill move $a7, $a2 move $t1, $a2 - st.d $a2, $sp, 432 # 8-byte Folded Spill + st.d $a2, $sp, 416 # 8-byte Folded Spill move $s2, $a2 - st.d $a2, $sp, 440 # 8-byte Folded Spill + st.d $a2, $sp, 424 # 8-byte Folded Spill move $t5, $a2 - st.d $a2, $sp, 472 # 8-byte Folded Spill - st.d $a2, $sp, 464 # 8-byte Folded Spill + st.d $a2, $sp, 456 # 8-byte Folded Spill st.d $a2, $sp, 448 # 8-byte Folded Spill + st.d $a2, $sp, 432 # 8-byte Folded Spill move $fp, $a2 move $t6, $a2 move $t4, $a2 - st.d $a2, $sp, 480 # 8-byte Folded Spill - st.d $a2, $sp, 488 # 8-byte Folded Spill + st.d $a2, $sp, 464 # 8-byte Folded Spill + st.d $a2, $sp, 472 # 8-byte Folded Spill move $a3, $a2 b .LBB3_106 .LBB3_100: # %.lr.ph1485.i @@ -1108,7 +1106,7 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress st.w $a0, $a3, 16 b .LBB3_101 .LBB3_105: # in Loop: Header=BB3_4 Depth=1 - st.d $zero, $sp, 456 # 8-byte Folded Spill + st.d $zero, $sp, 440 # 8-byte Folded Spill move $t7, $zero move $s4, $zero move $s3, $zero @@ -1116,29 +1114,28 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress move $t2, $zero move $t0, $zero move $a5, $zero - st.d $zero, $sp, 504 # 8-byte Folded Spill + st.d $zero, $sp, 488 # 8-byte Folded Spill move $a7, $zero move $t1, $zero - st.d $zero, $sp, 432 # 8-byte Folded Spill + st.d $zero, $sp, 416 # 8-byte Folded Spill move $s2, $zero - st.d $zero, $sp, 440 # 8-byte Folded 
Spill + st.d $zero, $sp, 424 # 8-byte Folded Spill move $t5, $zero - st.d $zero, $sp, 472 # 8-byte Folded Spill - st.d $zero, $sp, 464 # 8-byte Folded Spill + st.d $zero, $sp, 456 # 8-byte Folded Spill st.d $zero, $sp, 448 # 8-byte Folded Spill + st.d $zero, $sp, 432 # 8-byte Folded Spill move $fp, $zero move $t6, $zero move $t4, $zero - st.d $zero, $sp, 480 # 8-byte Folded Spill - st.d $zero, $sp, 488 # 8-byte Folded Spill + st.d $zero, $sp, 464 # 8-byte Folded Spill + st.d $zero, $sp, 472 # 8-byte Folded Spill move $a2, $zero move $a3, $zero addi.w $a0, $zero, -3 .LBB3_106: # %BZ2_decompress.exit # in Loop: Header=BB3_4 Depth=1 - ld.d $ra, $sp, 496 # 8-byte Folded Reload - vld $vr3, $sp, 304 # 16-byte Folded Reload - vld $vr6, $sp, 400 # 16-byte Folded Reload + ld.d $ra, $sp, 480 # 8-byte Folded Reload + vld $vr3, $sp, 320 # 16-byte Folded Reload ori $t8, $zero, 10 b .LBB3_352 .LBB3_107: # %._crit_edge1864.i @@ -1160,7 +1157,7 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress srl.w $a1, $a4, $a0 bstrpick.d $fp, $a1, 14, 0 st.w $a0, $s1, 36 - ld.d $s0, $sp, 488 # 8-byte Folded Reload + ld.d $s0, $sp, 472 # 8-byte Folded Reload beqz $fp, .LBB3_185 # %bb.111: # in Loop: Header=BB3_4 Depth=1 move $a2, $zero @@ -1168,15 +1165,15 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress move $a5, $s2 move $s2, $t3 st.d $t2, $sp, 48 # 8-byte Folded Spill - st.d $s0, $sp, 488 # 8-byte Folded Spill + st.d $s0, $sp, 472 # 8-byte Folded Spill bge $a2, $fp, .LBB3_161 # %bb.113: # in Loop: Header=BB3_4 Depth=1 - st.d $zero, $sp, 488 # 8-byte Folded Spill + st.d $zero, $sp, 472 # 8-byte Folded Spill .LBB3_114: # %.preheader671 # in Loop: Header=BB3_4 Depth=1 move $t3, $s2 ld.w $a1, $s1, 36 - ld.d $s0, $sp, 488 # 8-byte Folded Reload + ld.d $s0, $sp, 472 # 8-byte Folded Reload .LBB3_115: # Parent Loop BB3_4 Depth=1 # => This Loop Header: Depth=2 # Child Loop BB3_121 Depth 3 @@ -1240,12 +1237,12 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress b .LBB3_120 .LBB3_124: # %._crit_edge1857.i # in Loop: Header=BB3_4 Depth=1 - st.d $s2, $sp, 368 # 8-byte Folded Spill + st.d $s2, $sp, 376 # 8-byte Folded Spill ld.w $a0, $s1, 36 b .LBB3_129 .LBB3_125: # %._crit_edge1852.i # in Loop: Header=BB3_4 Depth=1 - st.d $s2, $sp, 368 # 8-byte Folded Spill + st.d $s2, $sp, 376 # 8-byte Folded Spill ld.w $a1, $s1, 36 .LBB3_126: # in Loop: Header=BB3_4 Depth=1 ori $a0, $zero, 26 @@ -1286,7 +1283,7 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress # %bb.132: # in Loop: Header=BB3_4 Depth=1 ld.w $a3, $s1, 40 addi.w $a1, $a1, 0 - ld.d $a4, $sp, 392 # 8-byte Folded Reload + ld.d $a4, $sp, 400 # 8-byte Folded Reload mul.d $a3, $a3, $a4 addi.w $a3, $a3, 10 move $a5, $s2 @@ -1390,7 +1387,7 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress ld.w $a1, $s1, 36 b .LBB3_447 .LBB3_148: # in Loop: Header=BB3_4 Depth=1 - st.d $s0, $sp, 488 # 8-byte Folded Spill + st.d $s0, $sp, 472 # 8-byte Folded Spill .LBB3_149: # in Loop: Header=BB3_4 Depth=1 move $a3, $zero move $a0, $zero @@ -1458,7 +1455,7 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress move $t1, $t6 b .LBB3_74 .LBB3_161: # in Loop: Header=BB3_4 Depth=1 - addi.d $t2, $sp, 514 + addi.d $t2, $sp, 498 blez $t6, .LBB3_165 # %bb.162: # %iter.check614 # in Loop: Header=BB3_4 Depth=1 @@ -1489,17 +1486,17 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress b .LBB3_168 .LBB3_167: # %._crit_edge1526.i # in Loop: Header=BB3_168 Depth=2 - ld.d $a2, $sp, 344 # 8-byte Folded Reload + ld.d $a2, $sp, 352 # 8-byte Folded Reload stx.b $a1, $a2, $a0 addi.d $a0, $a0, 1 - st.b $a1, $sp, 514 + st.b $a1, 
$sp, 498 beq $a0, $fp, .LBB3_186 .LBB3_168: # Parent Loop BB3_4 Depth=1 # => This Loop Header: Depth=2 # Child Loop BB3_181 Depth 3 # Child Loop BB3_178 Depth 3 # Child Loop BB3_174 Depth 3 - ld.d $a1, $sp, 336 # 8-byte Folded Reload + ld.d $a1, $sp, 344 # 8-byte Folded Reload ldx.bu $a2, $a1, $a0 ldx.bu $a1, $a2, $t2 beqz $a2, .LBB3_167 @@ -1556,7 +1553,7 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress sub.d $a3, $a2, $a7 sub.d $t0, $a6, $a7 sub.d $t1, $a2, $a6 - addi.d $a6, $sp, 507 + addi.d $a6, $sp, 491 add.d $a6, $a6, $t1 add.d $a4, $t1, $a4 .LBB3_178: # %vec.epilog.vector.body596 @@ -1581,7 +1578,7 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress # in Loop: Header=BB3_168 Depth=2 andi $a5, $a2, 8 andi $a6, $a2, 240 - addi.d $a7, $sp, 499 + addi.d $a7, $sp, 483 add.d $a7, $a7, $a2 move $t0, $a6 .LBB3_181: # %vector.body580 @@ -1651,12 +1648,12 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress move $a0, $zero b .LBB3_200 .LBB3_194: # in Loop: Header=BB3_4 Depth=1 - ld.d $a0, $sp, 336 # 8-byte Folded Reload + ld.d $a0, $sp, 344 # 8-byte Folded Reload stx.b $s0, $a0, $a2 addi.w $a2, $a2, 1 b .LBB3_112 .LBB3_195: # in Loop: Header=BB3_4 Depth=1 - st.d $s0, $sp, 488 # 8-byte Folded Spill + st.d $s0, $sp, 472 # 8-byte Folded Spill move $a3, $zero move $s2, $t3 b .LBB3_352 @@ -1667,7 +1664,7 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress andi $a0, $t6, 480 andi $a1, $t6, 24 move $a2, $a0 - addi.d $a3, $sp, 530 + addi.d $a3, $sp, 514 .LBB3_197: # %vector.body621 # Parent Loop BB3_4 Depth=1 # => This Inner Loop Header: Depth=2 @@ -1731,7 +1728,7 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress st.w $a1, $s1, 36 move $a5, $a0 ori $a3, $zero, 20 - ld.d $s2, $sp, 504 # 8-byte Folded Reload + ld.d $s2, $sp, 488 # 8-byte Folded Reload b .LBB3_439 .LBB3_207: # %.lr.ph1585.i # in Loop: Header=BB3_4 Depth=1 @@ -1782,14 +1779,14 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress move $a5, $s2 .LBB3_215: # %BZ2_decompress.exit # in Loop: Header=BB3_4 Depth=1 - ld.d $s2, $sp, 368 # 8-byte Folded Reload + ld.d $s2, $sp, 376 # 8-byte Folded Reload b .LBB3_352 .LBB3_216: # in Loop: Header=BB3_4 Depth=1 bge $a2, $a0, .LBB3_221 # %bb.217: # in Loop: Header=BB3_4 Depth=1 - ld.d $s2, $sp, 368 # 8-byte Folded Reload + ld.d $s2, $sp, 376 # 8-byte Folded Reload .LBB3_218: # in Loop: Header=BB3_4 Depth=1 - st.d $s2, $sp, 368 # 8-byte Folded Spill + st.d $s2, $sp, 376 # 8-byte Folded Spill ld.w $a0, $s1, 36 ori $a1, $zero, 28 st.w $a1, $s1, 8 @@ -1846,11 +1843,10 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress ld.d $s8, $sp, 64 # 8-byte Folded Reload ld.d $s4, $sp, 256 # 8-byte Folded Reload ori $t8, $zero, 10 - vld $vr6, $sp, 400 # 16-byte Folded Reload - vld $vr3, $sp, 304 # 16-byte Folded Reload - ld.d $ra, $sp, 496 # 8-byte Folded Reload + vld $vr3, $sp, 320 # 16-byte Folded Reload + ld.d $ra, $sp, 480 # 8-byte Folded Reload move $a2, $zero - ld.d $s2, $sp, 368 # 8-byte Folded Reload + ld.d $s2, $sp, 376 # 8-byte Folded Reload b .LBB3_227 .LBB3_222: # %.lr.ph1670.i # in Loop: Header=BB3_4 Depth=1 @@ -1915,7 +1911,7 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress ldx.bu $a3, $s8, $a1 beqz $a3, .LBB3_231 # %bb.233: # in Loop: Header=BB3_232 Depth=2 - ld.d $a3, $sp, 424 # 8-byte Folded Reload + ld.d $a3, $sp, 408 # 8-byte Folded Reload stx.b $a1, $a3, $a0 ori $a0, $zero, 3192 ldx.w $a0, $s1, $a0 @@ -2000,14 +1996,14 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress b .LBB3_108 .LBB3_247: # in Loop: Header=BB3_4 Depth=1 ori $a0, $zero, 15 - st.d $s0, $sp, 488 # 8-byte Folded Spill + st.d $s0, $sp, 
472 # 8-byte Folded Spill blt $a0, $s0, .LBB3_253 .LBB3_248: # in Loop: Header=BB3_4 Depth=1 move $t3, $s2 ld.w $a0, $s1, 36 ori $a1, $zero, 29 st.w $a1, $s1, 8 - ld.d $s0, $sp, 488 # 8-byte Folded Reload + ld.d $s0, $sp, 472 # 8-byte Folded Reload move $s2, $a5 blez $a0, .LBB3_254 # %bb.249: # %.._crit_edge1664_crit_edge.i @@ -2072,7 +2068,7 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress .LBB3_259: # in Loop: Header=BB3_4 Depth=1 bge $a0, $t6, .LBB3_269 # %bb.260: # in Loop: Header=BB3_4 Depth=1 - st.d $a0, $sp, 480 # 8-byte Folded Spill + st.d $a0, $sp, 464 # 8-byte Folded Spill .LBB3_261: # in Loop: Header=BB3_4 Depth=1 move $t3, $s2 ld.w $a0, $s1, 36 @@ -2132,8 +2128,8 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress .LBB3_269: # %.preheader1392.i # in Loop: Header=BB3_4 Depth=1 st.d $a5, $sp, 112 # 8-byte Folded Spill - ld.d $a0, $sp, 456 # 8-byte Folded Reload - st.d $a0, $sp, 456 # 8-byte Folded Spill + ld.d $a0, $sp, 440 # 8-byte Folded Reload + st.d $a0, $sp, 440 # 8-byte Folded Spill st.d $t4, $sp, 232 # 8-byte Folded Spill st.d $t5, $sp, 96 # 8-byte Folded Spill st.d $t6, $sp, 224 # 8-byte Folded Spill @@ -2157,7 +2153,7 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress slli.d $a0, $a0, 3 st.d $a0, $sp, 120 # 8-byte Folded Spill ld.d $s1, $sp, 168 # 8-byte Folded Reload - ld.d $s8, $sp, 384 # 8-byte Folded Reload + ld.d $s8, $sp, 392 # 8-byte Folded Reload ld.d $a1, $sp, 176 # 8-byte Folded Reload ld.d $s3, $sp, 200 # 8-byte Folded Reload st.d $s4, $sp, 256 # 8-byte Folded Spill @@ -2166,10 +2162,10 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress .LBB3_271: # %CreateDecodeTables.exit.i # in Loop: Header=BB3_272 Depth=2 slli.d $a0, $s0, 2 - ld.d $a1, $sp, 376 # 8-byte Folded Reload + ld.d $a1, $sp, 384 # 8-byte Folded Reload stx.w $s2, $a1, $a0 addi.d $s0, $s0, 1 - ld.d $a1, $sp, 480 # 8-byte Folded Reload + ld.d $a1, $sp, 464 # 8-byte Folded Reload addi.d $a1, $a1, 258 addi.d $s3, $s3, 258 addi.d $s8, $s8, 1032 @@ -2187,7 +2183,7 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress # Child Loop BB3_297 Depth 3 # Child Loop BB3_300 Depth 3 slli.d $s5, $s0, 10 - st.d $a1, $sp, 480 # 8-byte Folded Spill + st.d $a1, $sp, 464 # 8-byte Folded Spill blez $t4, .LBB3_275 # %bb.273: # %.lr.ph1534.i # in Loop: Header=BB3_272 Depth=2 @@ -2208,11 +2204,11 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress pcaddu18i $ra, %call36(memset) jirl $ra, $ra, 0 move $fp, $zero - st.d $zero, $sp, 472 # 8-byte Folded Spill - st.d $zero, $sp, 464 # 8-byte Folded Spill + st.d $zero, $sp, 456 # 8-byte Folded Spill st.d $zero, $sp, 448 # 8-byte Folded Spill - st.d $zero, $sp, 440 # 8-byte Folded Spill - st.d $zero, $sp, 368 # 8-byte Folded Spill + st.d $zero, $sp, 432 # 8-byte Folded Spill + st.d $zero, $sp, 424 # 8-byte Folded Spill + st.d $zero, $sp, 376 # 8-byte Folded Spill move $a6, $zero move $a7, $zero move $t0, $zero @@ -2238,8 +2234,8 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress ld.d $a1, $sp, 120 # 8-byte Folded Reload vld $vr3, $sp, 288 # 16-byte Folded Reload vori.b $vr2, $vr3, 0 - vori.b $vr0, $vr6, 0 - vori.b $vr1, $vr6, 0 + vld $vr1, $sp, 304 # 16-byte Folded Reload + vori.b $vr0, $vr1, 0 .p2align 4, , 16 .LBB3_277: # %vector.body555 # Parent Loop BB3_4 Depth=1 @@ -2249,10 +2245,10 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress ld.w $a3, $a0, 0 vinsgr2vr.w $vr4, $a2, 0 vinsgr2vr.w $vr5, $a3, 0 - vilvl.b $vr4, $vr6, $vr4 - vilvl.h $vr4, $vr6, $vr4 - vilvl.b $vr5, $vr6, $vr5 - vilvl.h $vr5, $vr6, $vr5 + vsllwil.hu.bu $vr4, $vr4, 0 + vsllwil.wu.hu $vr4, $vr4, 0 + vsllwil.hu.bu 
$vr5, $vr5, 0 + vsllwil.wu.hu $vr5, $vr5, 0 vmax.wu $vr0, $vr0, $vr4 vmax.wu $vr1, $vr1, $vr5 vmin.wu $vr2, $vr2, $vr4 @@ -2303,7 +2299,7 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress move $a1, $zero st.d $s5, $sp, 280 # 8-byte Folded Spill alsl.d $a0, $s0, $s5, 3 - ld.d $a2, $sp, 352 # 8-byte Folded Reload + ld.d $a2, $sp, 360 # 8-byte Folded Reload add.d $a2, $a2, $a0 move $a6, $s2 b .LBB3_283 @@ -2378,22 +2374,22 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress ld.w $a7, $s4, 64 ld.w $a6, $s4, 68 ld.w $a0, $s4, 72 - st.d $a0, $sp, 368 # 8-byte Folded Spill + st.d $a0, $sp, 376 # 8-byte Folded Spill ld.w $a0, $s4, 76 - st.d $a0, $sp, 440 # 8-byte Folded Spill + st.d $a0, $sp, 424 # 8-byte Folded Spill ld.w $a0, $s4, 80 - st.d $a0, $sp, 448 # 8-byte Folded Spill + st.d $a0, $sp, 432 # 8-byte Folded Spill ld.w $a0, $s4, 84 - st.d $a0, $sp, 464 # 8-byte Folded Spill + st.d $a0, $sp, 448 # 8-byte Folded Spill ld.w $a0, $s4, 88 - st.d $a0, $sp, 472 # 8-byte Folded Spill - ld.d $s4, $sp, 496 # 8-byte Folded Reload + st.d $a0, $sp, 456 # 8-byte Folded Spill + ld.d $s4, $sp, 480 # 8-byte Folded Reload ld.d $s5, $sp, 280 # 8-byte Folded Reload .LBB3_290: # %.preheader66.i.i # in Loop: Header=BB3_272 Depth=2 alsl.d $a0, $s0, $s5, 3 add.d $s4, $s4, $a0 - ld.d $a4, $sp, 384 # 8-byte Folded Reload + ld.d $a4, $sp, 392 # 8-byte Folded Reload add.d $a0, $a4, $a0 st.w $a3, $s4, 4 add.d $a2, $a3, $a2 @@ -2428,26 +2424,26 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress st.w $a1, $s4, 64 add.d $a1, $a1, $a6 st.w $a1, $s4, 68 - ld.d $a2, $sp, 368 # 8-byte Folded Reload + ld.d $a2, $sp, 376 # 8-byte Folded Reload add.d $a1, $a1, $a2 st.w $a1, $s4, 72 - ld.d $a2, $sp, 440 # 8-byte Folded Reload + ld.d $a2, $sp, 424 # 8-byte Folded Reload add.d $a1, $a1, $a2 st.w $a1, $s4, 76 - ld.d $a2, $sp, 448 # 8-byte Folded Reload + ld.d $a2, $sp, 432 # 8-byte Folded Reload add.d $a1, $a1, $a2 st.w $a1, $s4, 80 - ld.d $a2, $sp, 464 # 8-byte Folded Reload + ld.d $a2, $sp, 448 # 8-byte Folded Reload add.d $a1, $a1, $a2 st.w $a1, $s4, 84 - ld.d $a2, $sp, 472 # 8-byte Folded Reload + ld.d $a2, $sp, 456 # 8-byte Folded Reload add.d $a1, $a1, $a2 st.w $a1, $s4, 88 ori $a2, $zero, 92 move $a1, $zero pcaddu18i $ra, %call36(memset) jirl $ra, $ra, 0 - ld.d $a6, $sp, 360 # 8-byte Folded Reload + ld.d $a6, $sp, 368 # 8-byte Folded Reload ld.d $t4, $sp, 232 # 8-byte Folded Reload blez $t4, .LBB3_293 # %bb.291: # %.lr.ph82.preheader.i.i @@ -2476,8 +2472,7 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress bnez $a1, .LBB3_292 .LBB3_293: # %.preheader.i.i # in Loop: Header=BB3_272 Depth=2 - ld.d $ra, $sp, 496 # 8-byte Folded Reload - vld $vr6, $sp, 400 # 16-byte Folded Reload + ld.d $ra, $sp, 480 # 8-byte Folded Reload ld.d $s4, $sp, 256 # 8-byte Folded Reload ld.d $a5, $sp, 224 # 8-byte Folded Reload ld.d $s5, $sp, 264 # 8-byte Folded Reload @@ -2535,7 +2530,7 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress bnez $a0, .LBB3_300 b .LBB3_271 .LBB3_301: # in Loop: Header=BB3_4 Depth=1 - st.d $a5, $sp, 480 # 8-byte Folded Spill + st.d $a5, $sp, 464 # 8-byte Folded Spill ld.d $s1, $sp, 32 # 8-byte Folded Reload ld.d $s6, $sp, 24 # 8-byte Folded Reload ld.d $s7, $sp, 16 # 8-byte Folded Reload @@ -2543,7 +2538,7 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress ld.d $s3, $sp, 104 # 8-byte Folded Reload b .LBB3_303 .LBB3_302: # in Loop: Header=BB3_4 Depth=1 - st.d $zero, $sp, 480 # 8-byte Folded Spill + st.d $zero, $sp, 464 # 8-byte Folded Spill .LBB3_303: # %._crit_edge1539.i # in Loop: Header=BB3_4 Depth=1 ori $a0, $zero, 3192 @@ 
-2603,13 +2598,12 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress bne $a0, $a4, .LBB3_304 # %bb.305: # in Loop: Header=BB3_4 Depth=1 addi.w $a0, $fp, 1 - st.d $a0, $sp, 448 # 8-byte Folded Spill - ld.d $a0, $sp, 392 # 8-byte Folded Reload + st.d $a0, $sp, 432 # 8-byte Folded Spill + ld.d $a0, $sp, 400 # 8-byte Folded Reload mul.w $a0, $s0, $a0 - st.d $a0, $sp, 440 # 8-byte Folded Spill - ld.d $ra, $sp, 496 # 8-byte Folded Reload - vld $vr3, $sp, 304 # 16-byte Folded Reload - vld $vr6, $sp, 400 # 16-byte Folded Reload + st.d $a0, $sp, 424 # 8-byte Folded Spill + ld.d $ra, $sp, 480 # 8-byte Folded Reload + vld $vr3, $sp, 320 # 16-byte Folded Reload ori $t8, $zero, 10 ld.d $t4, $sp, 232 # 8-byte Folded Reload ld.d $fp, $sp, 80 # 8-byte Folded Reload @@ -2619,24 +2613,24 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress ld.d $t1, $sp, 72 # 8-byte Folded Reload blez $fp, .LBB3_315 # %bb.306: # in Loop: Header=BB3_4 Depth=1 - ld.d $a0, $sp, 344 # 8-byte Folded Reload + ld.d $a0, $sp, 352 # 8-byte Folded Reload ld.bu $s4, $a0, 0 move $s2, $zero - st.d $zero, $sp, 464 # 8-byte Folded Spill + st.d $zero, $sp, 448 # 8-byte Folded Spill slli.d $a0, $s4, 10 alsl.d $a0, $s4, $a0, 3 add.d $t2, $ra, $a0 slli.d $a1, $s4, 2 - ld.d $a2, $sp, 376 # 8-byte Folded Reload + ld.d $a2, $sp, 384 # 8-byte Folded Reload ldx.w $s3, $a2, $a1 - ld.d $a1, $sp, 352 # 8-byte Folded Reload + ld.d $a1, $sp, 360 # 8-byte Folded Reload add.d $t0, $a1, $a0 - ld.d $a1, $sp, 384 # 8-byte Folded Reload + ld.d $a1, $sp, 392 # 8-byte Folded Reload add.d $s5, $a1, $a0 ori $a2, $zero, 256 ori $a0, $zero, 49 - st.d $a0, $sp, 472 # 8-byte Folded Spill - st.d $s3, $sp, 504 # 8-byte Folded Spill + st.d $a0, $sp, 456 # 8-byte Folded Spill + st.d $s3, $sp, 488 # 8-byte Folded Spill ori $t3, $zero, 2 ld.d $t5, $sp, 96 # 8-byte Folded Reload ld.d $t6, $sp, 224 # 8-byte Folded Reload @@ -2647,7 +2641,7 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress ld.w $a0, $s1, 36 ori $a1, $zero, 36 st.w $a1, $s1, 8 - ld.d $a1, $sp, 504 # 8-byte Folded Reload + ld.d $a1, $sp, 488 # 8-byte Folded Reload bge $a0, $a1, .LBB3_313 # %bb.308: # %.lr.ph1549.i # in Loop: Header=BB3_4 Depth=1 @@ -2658,7 +2652,7 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress b .LBB3_310 .LBB3_309: # in Loop: Header=BB3_310 Depth=2 addi.w $a3, $a3, -1 - ld.d $a6, $sp, 504 # 8-byte Folded Reload + ld.d $a6, $sp, 488 # 8-byte Folded Reload bge $a0, $a6, .LBB3_314 .LBB3_310: # Parent Loop BB3_4 Depth=1 # => This Inner Loop Header: Depth=2 @@ -2689,7 +2683,7 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress .LBB3_313: # %.._crit_edge1550_crit_edge.i # in Loop: Header=BB3_4 Depth=1 ld.w $a4, $s1, 32 - ld.d $a6, $sp, 504 # 8-byte Folded Reload + ld.d $a6, $sp, 488 # 8-byte Folded Reload .LBB3_314: # %._crit_edge1550.i # in Loop: Header=BB3_4 Depth=1 sub.w $a1, $a0, $a6 @@ -2702,8 +2696,8 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress b .LBB3_336 .LBB3_315: # in Loop: Header=BB3_4 Depth=1 move $s2, $zero - st.d $zero, $sp, 472 # 8-byte Folded Spill - st.d $zero, $sp, 464 # 8-byte Folded Spill + st.d $zero, $sp, 456 # 8-byte Folded Spill + st.d $zero, $sp, 448 # 8-byte Folded Spill move $a3, $zero addi.w $a0, $zero, -4 ori $a2, $zero, 256 @@ -2717,7 +2711,7 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress move $s2, $t3 blt $a2, $t4, .LBB3_326 # %bb.317: # in Loop: Header=BB3_4 Depth=1 - ld.d $a0, $sp, 480 # 8-byte Folded Reload + ld.d $a0, $sp, 464 # 8-byte Folded Reload addi.w $a0, $a0, 1 b .LBB3_259 .LBB3_318: # %.lr.ph1642.i @@ -2797,7 +2791,7 @@ nsis_BZ2_bzDecompress: # 
@nsis_BZ2_bzDecompress st.w $a0, $s1, 36 bnez $a1, .LBB3_323 # %bb.330: # in Loop: Header=BB3_4 Depth=1 - ld.d $a1, $sp, 480 # 8-byte Folded Reload + ld.d $a1, $sp, 464 # 8-byte Folded Reload slli.d $a0, $a1, 8 alsl.d $a0, $a1, $a0, 1 ld.d $a1, $sp, 200 # 8-byte Folded Reload @@ -2846,7 +2840,7 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress addi.w $a0, $zero, -4 bge $a3, $a6, .LBB3_338 # %bb.337: # in Loop: Header=BB3_4 Depth=1 - st.d $a6, $sp, 504 # 8-byte Folded Spill + st.d $a6, $sp, 488 # 8-byte Folded Spill b .LBB3_350 .LBB3_338: # in Loop: Header=BB3_4 Depth=1 slli.d $a3, $a6, 2 @@ -2856,7 +2850,7 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress bge $a4, $a5, .LBB3_348 # %bb.339: # in Loop: Header=BB3_4 Depth=1 addi.w $a6, $a6, 1 - st.d $a6, $sp, 504 # 8-byte Folded Spill + st.d $a6, $sp, 488 # 8-byte Folded Spill .LBB3_340: # in Loop: Header=BB3_4 Depth=1 ori $a0, $zero, 37 st.w $a0, $s1, 8 @@ -2873,7 +2867,7 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress st.w $a1, $s1, 36 move $a5, $a0 ori $a3, $zero, 20 - ld.d $a6, $sp, 504 # 8-byte Folded Reload + ld.d $a6, $sp, 488 # 8-byte Folded Reload b .LBB3_336 .LBB3_343: # %.lr.ph1628.i # in Loop: Header=BB3_4 Depth=1 @@ -2913,7 +2907,7 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress st.w $a5, $a0, 16 b .LBB3_344 .LBB3_348: # in Loop: Header=BB3_4 Depth=1 - st.d $a6, $sp, 504 # 8-byte Folded Spill + st.d $a6, $sp, 488 # 8-byte Folded Spill ldx.w $a1, $t2, $a3 sub.w $a1, $s2, $a1 ori $a3, $zero, 257 @@ -2933,30 +2927,30 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress .LBB3_352: # %BZ2_decompress.exit # in Loop: Header=BB3_4 Depth=1 st.w $a2, $s6, 0 - ld.d $a1, $sp, 488 # 8-byte Folded Reload + ld.d $a1, $sp, 472 # 8-byte Folded Reload st.w $a1, $s6, 4 - ld.d $a1, $sp, 480 # 8-byte Folded Reload + ld.d $a1, $sp, 464 # 8-byte Folded Reload st.w $a1, $s6, 8 st.w $t4, $s6, 12 st.w $t6, $s6, 16 st.w $fp, $s6, 20 - ld.d $a1, $sp, 448 # 8-byte Folded Reload + ld.d $a1, $sp, 432 # 8-byte Folded Reload st.w $a1, $s6, 24 - ld.d $a1, $sp, 464 # 8-byte Folded Reload + ld.d $a1, $sp, 448 # 8-byte Folded Reload st.w $a1, $s6, 28 - ld.d $a1, $sp, 472 # 8-byte Folded Reload + ld.d $a1, $sp, 456 # 8-byte Folded Reload st.w $a1, $s6, 32 st.w $t5, $s6, 36 - ld.d $a1, $sp, 440 # 8-byte Folded Reload + ld.d $a1, $sp, 424 # 8-byte Folded Reload st.w $a1, $s6, 40 st.w $s2, $s6, 44 - ld.d $a1, $sp, 432 # 8-byte Folded Reload + ld.d $a1, $sp, 416 # 8-byte Folded Reload st.w $a1, $s6, 48 st.w $t1, $s6, 52 st.w $a7, $s6, 56 - ld.d $a1, $sp, 456 # 8-byte Folded Reload + ld.d $a1, $sp, 440 # 8-byte Folded Reload st.w $a1, $s6, 60 - ld.d $a1, $sp, 504 # 8-byte Folded Reload + ld.d $a1, $sp, 488 # 8-byte Folded Reload st.w $a1, $s6, 64 st.w $a5, $s6, 68 st.w $t7, $s6, 72 @@ -2975,7 +2969,7 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress beq $a2, $a1, .LBB3_4 b .LBB3_483 .LBB3_354: # in Loop: Header=BB3_4 Depth=1 - st.d $s2, $sp, 504 # 8-byte Folded Spill + st.d $s2, $sp, 488 # 8-byte Folded Spill ldx.w $a1, $t2, $a3 sub.w $a1, $a6, $a1 ori $a3, $zero, 257 @@ -2992,7 +2986,7 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress .LBB3_356: # %.loopexit1376.i # in Loop: Header=BB3_4 Depth=1 move $s2, $s0 - ld.d $a0, $sp, 448 # 8-byte Folded Reload + ld.d $a0, $sp, 432 # 8-byte Folded Reload bne $t5, $a0, .LBB3_371 # %bb.357: # in Loop: Header=BB3_4 Depth=1 ld.w $a1, $s1, 56 @@ -3413,43 +3407,43 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress addi.d $a4, $zero, -1 ld.d $a5, $sp, 112 # 8-byte Folded Reload .LBB3_373: # in Loop: Header=BB3_4 Depth=1 - ld.d 
$a6, $sp, 472 # 8-byte Folded Reload + ld.d $a6, $sp, 456 # 8-byte Folded Reload sltui $a1, $t5, 1 slli.d $t1, $a0, 1 masknez $a3, $t1, $a1 maskeqz $a0, $a0, $a1 or $a0, $a0, $a3 add.w $a4, $a0, $a4 - st.d $a4, $sp, 432 # 8-byte Folded Spill + st.d $a4, $sp, 416 # 8-byte Folded Spill bnez $a6, .LBB3_376 # %bb.374: # in Loop: Header=BB3_4 Depth=1 - ld.d $a1, $sp, 464 # 8-byte Folded Reload + ld.d $a1, $sp, 448 # 8-byte Folded Reload addi.w $a1, $a1, 1 - st.d $a1, $sp, 464 # 8-byte Folded Spill + st.d $a1, $sp, 448 # 8-byte Folded Spill bge $a1, $fp, .LBB3_398 # %bb.375: # in Loop: Header=BB3_4 Depth=1 - ld.d $a0, $sp, 344 # 8-byte Folded Reload + ld.d $a0, $sp, 352 # 8-byte Folded Reload ldx.bu $s4, $a0, $a1 slli.d $a0, $s4, 2 - ld.d $a1, $sp, 376 # 8-byte Folded Reload + ld.d $a1, $sp, 384 # 8-byte Folded Reload ldx.w $s3, $a1, $a0 slli.d $a0, $s4, 10 alsl.d $a0, $s4, $a0, 3 - ld.d $a1, $sp, 384 # 8-byte Folded Reload + ld.d $a1, $sp, 392 # 8-byte Folded Reload add.d $s5, $a1, $a0 - ld.d $a1, $sp, 352 # 8-byte Folded Reload + ld.d $a1, $sp, 360 # 8-byte Folded Reload add.d $t0, $a1, $a0 add.d $t2, $ra, $a0 ori $a6, $zero, 50 .LBB3_376: # in Loop: Header=BB3_4 Depth=1 addi.w $a6, $a6, -1 - st.d $a6, $sp, 472 # 8-byte Folded Spill - st.d $s3, $sp, 504 # 8-byte Folded Spill + st.d $a6, $sp, 456 # 8-byte Folded Spill + st.d $s3, $sp, 488 # 8-byte Folded Spill .LBB3_377: # in Loop: Header=BB3_4 Depth=1 ld.w $a0, $s1, 36 ori $a1, $zero, 38 st.w $a1, $s1, 8 - ld.d $a1, $sp, 504 # 8-byte Folded Reload + ld.d $a1, $sp, 488 # 8-byte Folded Reload bge $a0, $a1, .LBB3_383 # %bb.378: # %.lr.ph1593.i # in Loop: Header=BB3_4 Depth=1 @@ -3461,7 +3455,7 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress b .LBB3_380 .LBB3_379: # in Loop: Header=BB3_380 Depth=2 addi.w $a3, $a3, -1 - ld.d $a5, $sp, 504 # 8-byte Folded Reload + ld.d $a5, $sp, 488 # 8-byte Folded Reload bge $a0, $a5, .LBB3_384 .LBB3_380: # Parent Loop BB3_4 Depth=1 # => This Inner Loop Header: Depth=2 @@ -3493,7 +3487,7 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress # in Loop: Header=BB3_4 Depth=1 move $s0, $s2 ld.w $a4, $s1, 32 - ld.d $a5, $sp, 504 # 8-byte Folded Reload + ld.d $a5, $sp, 488 # 8-byte Folded Reload .LBB3_384: # %._crit_edge1594.i # in Loop: Header=BB3_4 Depth=1 sub.w $a1, $a0, $a5 @@ -3508,7 +3502,7 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress .LBB3_385: # in Loop: Header=BB3_4 Depth=1 addi.w $a0, $zero, -4 ld.d $a5, $sp, 112 # 8-byte Folded Reload - ld.d $a1, $sp, 440 # 8-byte Folded Reload + ld.d $a1, $sp, 424 # 8-byte Folded Reload bge $s2, $a1, .LBB3_400 # %bb.386: # in Loop: Header=BB3_4 Depth=1 move $t1, $t0 @@ -3522,16 +3516,16 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress ld.w $a4, $a1, 0 add.d $a1, $a4, $a3 bstrpick.d $a1, $a1, 31, 0 - ld.d $a5, $sp, 328 # 8-byte Folded Reload + ld.d $a5, $sp, 336 # 8-byte Folded Reload ldx.bu $a1, $a5, $a1 ori $a5, $zero, 4 - ld.d $s0, $sp, 472 # 8-byte Folded Reload + ld.d $s0, $sp, 456 # 8-byte Folded Reload bltu $a3, $a5, .LBB3_390 .LBB3_388: # %.lr.ph1569.i # Parent Loop BB3_4 Depth=1 # => This Inner Loop Header: Depth=2 add.w $a5, $a4, $a3 - ld.d $a6, $sp, 328 # 8-byte Folded Reload + ld.d $a6, $sp, 336 # 8-byte Folded Reload add.d $a5, $a6, $a5 ld.w $a6, $a5, -4 addi.w $a3, $a3, -4 @@ -3571,7 +3565,7 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress # => This Inner Loop Header: Depth=2 addi.w $a6, $a5, -1 bstrpick.d $a7, $a6, 31, 0 - ld.d $t0, $sp, 328 # 8-byte Folded Reload + ld.d $t0, $sp, 336 # 8-byte Folded Reload ldx.b $a7, $t0, $a7 bstrpick.d $a5, $a5, 31, 
0 addi.w $a3, $a3, -1 @@ -3580,20 +3574,20 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress bnez $a3, .LBB3_395 .LBB3_396: # %._crit_edge1574.i # in Loop: Header=BB3_4 Depth=1 - ld.d $a3, $sp, 328 # 8-byte Folded Reload + ld.d $a3, $sp, 336 # 8-byte Folded Reload stx.b $a1, $a3, $a4 move $t0, $t1 ld.d $t1, $sp, 72 # 8-byte Folded Reload b .LBB3_424 .LBB3_397: # in Loop: Header=BB3_4 Depth=1 - st.d $t5, $sp, 448 # 8-byte Folded Spill + st.d $t5, $sp, 432 # 8-byte Folded Spill b .LBB3_481 .LBB3_398: # in Loop: Header=BB3_4 Depth=1 - st.d $zero, $sp, 472 # 8-byte Folded Spill + st.d $zero, $sp, 456 # 8-byte Folded Spill b .LBB3_212 .LBB3_399: # in Loop: Header=BB3_4 Depth=1 move $a3, $zero - st.d $t5, $sp, 448 # 8-byte Folded Spill + st.d $t5, $sp, 432 # 8-byte Folded Spill ld.d $t1, $sp, 72 # 8-byte Folded Reload b .LBB3_352 .LBB3_400: # in Loop: Header=BB3_4 Depth=1 @@ -3646,7 +3640,7 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress or $a7, $a7, $a5 b .LBB3_403 .LBB3_406: # in Loop: Header=BB3_4 Depth=1 - st.d $t1, $sp, 488 # 8-byte Folded Spill + st.d $t1, $sp, 472 # 8-byte Folded Spill st.d $t2, $sp, 48 # 8-byte Folded Spill move $a5, $zero st.w $a2, $s1, 60 @@ -3681,7 +3675,7 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress bstrins.d $a5, $a1, 19, 16 st.w $a5, $s1, 60 st.w $t2, $s1, 1092 - st.d $t5, $sp, 448 # 8-byte Folded Spill + st.d $t5, $sp, 432 # 8-byte Folded Spill ld.d $a7, $sp, 56 # 8-byte Folded Reload b .LBB3_480 .LBB3_409: # in Loop: Header=BB3_4 Depth=1 @@ -3691,7 +3685,7 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress ldx.w $a6, $a4, $a1 andi $a7, $a3, 15 add.w $a1, $a6, $a7 - ld.d $s2, $sp, 328 # 8-byte Folded Reload + ld.d $s2, $sp, 336 # 8-byte Folded Reload ldx.bu $a1, $s2, $a1 alsl.d $a4, $a5, $a4, 2 beqz $a7, .LBB3_412 @@ -3703,7 +3697,7 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress # Parent Loop BB3_4 Depth=1 # => This Inner Loop Header: Depth=2 ldx.b $a6, $t0, $a3 - ld.d $a7, $sp, 328 # 8-byte Folded Reload + ld.d $a7, $sp, 336 # 8-byte Folded Reload stx.b $a6, $a7, $a3 ld.w $a6, $a4, 0 addi.d $a3, $a3, -1 @@ -3722,7 +3716,7 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress ld.w $a6, $a4, -4 addi.w $a5, $a5, -1 st.w $a5, $a4, 0 - ld.d $t0, $sp, 328 # 8-byte Folded Reload + ld.d $t0, $sp, 336 # 8-byte Folded Reload add.d $a6, $t0, $a6 ld.b $a6, $a6, 15 addi.d $a4, $a4, -4 @@ -3735,7 +3729,7 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress ld.w $a3, $a4, 0 addi.w $a3, $a3, -1 st.w $a3, $a4, 0 - ld.d $a5, $sp, 328 # 8-byte Folded Reload + ld.d $a5, $sp, 336 # 8-byte Folded Reload stx.b $a1, $a5, $a3 ld.w $a3, $a4, 0 move $t0, $t1 @@ -3774,7 +3768,7 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress srli.d $a2, $a0, 8 st.w $a2, $s1, 60 st.w $zero, $s1, 1092 - ld.d $a0, $sp, 392 # 8-byte Folded Reload + ld.d $a0, $sp, 400 # 8-byte Folded Reload mul.w $a3, $a3, $a0 ori $a0, $zero, 1 ld.d $t1, $sp, 72 # 8-byte Folded Reload @@ -3790,7 +3784,7 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress st.w $a1, $s1, 60 ori $a1, $zero, 1 st.w $a1, $s1, 1092 - st.d $t5, $sp, 448 # 8-byte Folded Spill + st.d $t5, $sp, 432 # 8-byte Folded Spill move $a2, $s2 ld.d $a5, $sp, 112 # 8-byte Folded Reload b .LBB3_352 @@ -3804,7 +3798,7 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress # Parent Loop BB3_4 Depth=1 # => This Inner Loop Header: Depth=2 ld.w $a6, $a5, 0 - ld.d $s2, $sp, 328 # 8-byte Folded Reload + ld.d $s2, $sp, 336 # 8-byte Folded Reload add.d $a6, $s2, $a6 ld.b $a6, $a6, 15 ld.d $a7, $sp, 216 # 8-byte Folded Reload @@ -3879,10 +3873,10 @@ 
nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress move $s2, $s0 .LBB3_423: # %.loopexit1387.i # in Loop: Header=BB3_4 Depth=1 - ld.d $s0, $sp, 472 # 8-byte Folded Reload + ld.d $s0, $sp, 456 # 8-byte Folded Reload .LBB3_424: # %.loopexit1387.i # in Loop: Header=BB3_4 Depth=1 - ld.d $a3, $sp, 424 # 8-byte Folded Reload + ld.d $a3, $sp, 408 # 8-byte Folded Reload ldx.bu $a1, $a3, $a1 slli.d $a3, $a1, 2 ld.d $a6, $sp, 208 # 8-byte Folded Reload @@ -3906,34 +3900,34 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress ld.d $a5, $sp, 112 # 8-byte Folded Reload bnez $s0, .LBB3_430 # %bb.428: # in Loop: Header=BB3_4 Depth=1 - ld.d $a1, $sp, 464 # 8-byte Folded Reload + ld.d $a1, $sp, 448 # 8-byte Folded Reload addi.w $a1, $a1, 1 - st.d $a1, $sp, 464 # 8-byte Folded Spill + st.d $a1, $sp, 448 # 8-byte Folded Spill bge $a1, $fp, .LBB3_442 # %bb.429: # in Loop: Header=BB3_4 Depth=1 - ld.d $a0, $sp, 344 # 8-byte Folded Reload + ld.d $a0, $sp, 352 # 8-byte Folded Reload ldx.bu $s4, $a0, $a1 slli.d $a0, $s4, 2 - ld.d $a1, $sp, 376 # 8-byte Folded Reload + ld.d $a1, $sp, 384 # 8-byte Folded Reload ldx.w $s3, $a1, $a0 slli.d $a0, $s4, 10 alsl.d $a0, $s4, $a0, 3 - ld.d $a1, $sp, 384 # 8-byte Folded Reload + ld.d $a1, $sp, 392 # 8-byte Folded Reload add.d $s5, $a1, $a0 - ld.d $a1, $sp, 352 # 8-byte Folded Reload + ld.d $a1, $sp, 360 # 8-byte Folded Reload add.d $t0, $a1, $a0 add.d $t2, $ra, $a0 ori $s0, $zero, 50 .LBB3_430: # in Loop: Header=BB3_4 Depth=1 addi.w $s0, $s0, -1 - st.d $s0, $sp, 472 # 8-byte Folded Spill - st.d $s3, $sp, 504 # 8-byte Folded Spill + st.d $s0, $sp, 456 # 8-byte Folded Spill + st.d $s3, $sp, 488 # 8-byte Folded Spill .LBB3_431: # in Loop: Header=BB3_4 Depth=1 move $s0, $s2 ld.w $a0, $s1, 36 ori $a1, $zero, 40 st.w $a1, $s1, 8 - ld.d $a1, $sp, 504 # 8-byte Folded Reload + ld.d $a1, $sp, 488 # 8-byte Folded Reload bge $a0, $a1, .LBB3_437 # %bb.432: # %.lr.ph1578.i # in Loop: Header=BB3_4 Depth=1 @@ -3944,7 +3938,7 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress b .LBB3_434 .LBB3_433: # in Loop: Header=BB3_434 Depth=2 addi.w $a3, $a3, -1 - ld.d $a5, $sp, 504 # 8-byte Folded Reload + ld.d $a5, $sp, 488 # 8-byte Folded Reload bge $a0, $a5, .LBB3_438 .LBB3_434: # Parent Loop BB3_4 Depth=1 # => This Inner Loop Header: Depth=2 @@ -3975,7 +3969,7 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress .LBB3_437: # %.._crit_edge1579_crit_edge.i # in Loop: Header=BB3_4 Depth=1 ld.w $a4, $s1, 32 - ld.d $a5, $sp, 504 # 8-byte Folded Reload + ld.d $a5, $sp, 488 # 8-byte Folded Reload .LBB3_438: # %._crit_edge1579.i # in Loop: Header=BB3_4 Depth=1 sub.w $a1, $a0, $a5 @@ -3997,19 +3991,19 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress bge $a4, $a5, .LBB3_354 # %bb.441: # in Loop: Header=BB3_4 Depth=1 addi.w $s2, $s2, 1 - st.d $s2, $sp, 504 # 8-byte Folded Spill + st.d $s2, $sp, 488 # 8-byte Folded Spill move $a5, $a6 move $s2, $s0 b .LBB3_204 .LBB3_442: # in Loop: Header=BB3_4 Depth=1 - st.d $zero, $sp, 472 # 8-byte Folded Spill + st.d $zero, $sp, 456 # 8-byte Folded Spill move $a3, $zero b .LBB3_352 .LBB3_443: # in Loop: Header=BB3_4 Depth=1 addi.w $a0, $zero, -4 bge $a3, $s2, .LBB3_445 .LBB3_444: # in Loop: Header=BB3_4 Depth=1 - st.d $s2, $sp, 504 # 8-byte Folded Spill + st.d $s2, $sp, 488 # 8-byte Folded Spill b .LBB3_350 .LBB3_445: # in Loop: Header=BB3_4 Depth=1 slli.d $a3, $s2, 2 @@ -4019,7 +4013,7 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress bge $a4, $a5, .LBB3_455 # %bb.446: # in Loop: Header=BB3_4 Depth=1 addi.w $s2, $s2, 1 - st.d $s2, $sp, 504 # 8-byte Folded Spill + st.d $s2, 
$sp, 488 # 8-byte Folded Spill move $a5, $a6 move $s2, $s0 .LBB3_447: # in Loop: Header=BB3_4 Depth=1 @@ -4040,7 +4034,7 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress st.w $a1, $s1, 36 move $a5, $a0 ori $a3, $zero, 20 - ld.d $s2, $sp, 504 # 8-byte Folded Reload + ld.d $s2, $sp, 488 # 8-byte Folded Reload b .LBB3_443 .LBB3_450: # %.lr.ph1610.i # in Loop: Header=BB3_4 Depth=1 @@ -4080,7 +4074,7 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress st.w $a5, $a0, 16 b .LBB3_451 .LBB3_455: # in Loop: Header=BB3_4 Depth=1 - st.d $s2, $sp, 504 # 8-byte Folded Spill + st.d $s2, $sp, 488 # 8-byte Folded Spill ldx.w $a1, $t2, $a3 sub.w $a1, $a6, $a1 ori $a3, $zero, 257 @@ -4094,7 +4088,7 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress bgeu $t5, $t3, .LBB3_458 # %bb.457: # in Loop: Header=BB3_4 Depth=1 ld.d $a0, $sp, 72 # 8-byte Folded Reload - ld.d $a4, $sp, 432 # 8-byte Folded Reload + ld.d $a4, $sp, 416 # 8-byte Folded Reload move $s2, $s0 b .LBB3_373 .LBB3_458: # in Loop: Header=BB3_4 Depth=1 @@ -4102,15 +4096,15 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress st.d $t2, $sp, 48 # 8-byte Folded Spill st.d $a7, $sp, 56 # 8-byte Folded Spill ldptr.w $a1, $s1, 7820 - ld.d $a3, $sp, 328 # 8-byte Folded Reload + ld.d $a3, $sp, 336 # 8-byte Folded Reload ldx.bu $a1, $a3, $a1 - ld.d $a3, $sp, 424 # 8-byte Folded Reload + ld.d $a3, $sp, 408 # 8-byte Folded Reload ldx.bu $a3, $a3, $a1 slli.d $a4, $a3, 2 ld.d $a7, $sp, 208 # 8-byte Folded Reload ldx.w $a5, $a7, $a4 ld.bu $a6, $s1, 44 - ld.d $s2, $sp, 432 # 8-byte Folded Reload + ld.d $s2, $sp, 416 # 8-byte Folded Reload addi.w $a1, $s2, 1 add.d $a5, $a5, $a1 stx.w $a5, $a7, $a4 @@ -4123,7 +4117,7 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress st.d $t0, $sp, 40 # 8-byte Folded Spill st.d $s5, $sp, 264 # 8-byte Folded Spill st.d $s3, $sp, 104 # 8-byte Folded Spill - ld.d $a5, $sp, 440 # 8-byte Folded Reload + ld.d $a5, $sp, 424 # 8-byte Folded Reload slt $a4, $a5, $s0 masknez $a5, $a5, $a4 maskeqz $a4, $s0, $a4 @@ -4143,7 +4137,7 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress or $a5, $a5, $a6 bltu $t0, $a5, .LBB3_466 # %bb.461: # in Loop: Header=BB3_4 Depth=1 - ld.d $t2, $sp, 456 # 8-byte Folded Reload + ld.d $t2, $sp, 440 # 8-byte Folded Reload b .LBB3_469 .LBB3_462: # %.preheader1375.i # in Loop: Header=BB3_4 Depth=1 @@ -4153,7 +4147,7 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress st.d $t0, $sp, 40 # 8-byte Folded Spill st.d $s5, $sp, 264 # 8-byte Folded Spill st.d $s3, $sp, 104 # 8-byte Folded Spill - ld.d $a5, $sp, 440 # 8-byte Folded Reload + ld.d $a5, $sp, 424 # 8-byte Folded Reload slt $a4, $a5, $s0 masknez $a5, $a5, $a4 maskeqz $a4, $s0, $a4 @@ -4173,10 +4167,10 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress or $a5, $a5, $a6 bltu $t0, $a5, .LBB3_472 # %bb.464: # in Loop: Header=BB3_4 Depth=1 - ld.d $t2, $sp, 456 # 8-byte Folded Reload + ld.d $t2, $sp, 440 # 8-byte Folded Reload b .LBB3_475 .LBB3_465: # in Loop: Header=BB3_4 Depth=1 - st.d $a1, $sp, 432 # 8-byte Folded Spill + st.d $a1, $sp, 416 # 8-byte Folded Spill ld.d $a4, $sp, 160 # 8-byte Folded Reload ori $a5, $zero, 1024 ld.d $a7, $sp, 56 # 8-byte Folded Reload @@ -4184,7 +4178,7 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress b .LBB3_356 .LBB3_466: # %vector.ph501 # in Loop: Header=BB3_4 Depth=1 - ld.d $t2, $sp, 456 # 8-byte Folded Reload + ld.d $t2, $sp, 440 # 8-byte Folded Reload bstrpick.d $a5, $a5, 31, 1 ldptr.d $a7, $s1, 3160 slli.d $a6, $a5, 1 @@ -4206,7 +4200,7 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress # in Loop: Header=BB3_4 Depth=1 slli.d $a5, 
$s0, 1 sub.d $a6, $a4, $s0 - st.d $t2, $sp, 456 # 8-byte Folded Spill + st.d $t2, $sp, 440 # 8-byte Folded Spill .LBB3_470: # %scalar.ph499 # Parent Loop BB3_4 Depth=1 # => This Inner Loop Header: Depth=2 @@ -4224,7 +4218,7 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress b .LBB3_478 .LBB3_472: # %vector.ph490 # in Loop: Header=BB3_4 Depth=1 - ld.d $t2, $sp, 456 # 8-byte Folded Reload + ld.d $t2, $sp, 440 # 8-byte Folded Reload bstrpick.d $a5, $a5, 31, 1 ldptr.d $a7, $s1, 3152 slli.d $a6, $a5, 1 @@ -4246,7 +4240,7 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress # in Loop: Header=BB3_4 Depth=1 slli.d $a5, $s0, 2 sub.d $a6, $a4, $s0 - st.d $t2, $sp, 456 # 8-byte Folded Spill + st.d $t2, $sp, 440 # 8-byte Folded Spill .LBB3_476: # %scalar.ph489 # Parent Loop BB3_4 Depth=1 # => This Inner Loop Header: Depth=2 @@ -4262,11 +4256,10 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress ori $a7, $zero, 1 blt $a7, $t0, .LBB3_476 .LBB3_478: # in Loop: Header=BB3_4 Depth=1 - st.d $a1, $sp, 432 # 8-byte Folded Spill - ld.d $ra, $sp, 496 # 8-byte Folded Reload + st.d $a1, $sp, 416 # 8-byte Folded Spill + ld.d $ra, $sp, 480 # 8-byte Folded Reload ld.d $t1, $sp, 184 # 8-byte Folded Reload - vld $vr3, $sp, 304 # 16-byte Folded Reload - vld $vr6, $sp, 400 # 16-byte Folded Reload + vld $vr3, $sp, 320 # 16-byte Folded Reload ld.d $a4, $sp, 160 # 8-byte Folded Reload ori $t3, $zero, 2 ori $a5, $zero, 1024 @@ -4278,12 +4271,11 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress ld.d $t0, $sp, 40 # 8-byte Folded Reload b .LBB3_356 .LBB3_479: # in Loop: Header=BB3_4 Depth=1 - st.d $t0, $sp, 432 # 8-byte Folded Spill + st.d $t0, $sp, 416 # 8-byte Folded Spill move $a3, $zero move $s2, $a4 - ld.d $ra, $sp, 496 # 8-byte Folded Reload - vld $vr3, $sp, 304 # 16-byte Folded Reload - vld $vr6, $sp, 400 # 16-byte Folded Reload + ld.d $ra, $sp, 480 # 8-byte Folded Reload + vld $vr3, $sp, 320 # 16-byte Folded Reload ori $t8, $zero, 10 ld.d $a7, $sp, 56 # 8-byte Folded Reload ld.d $s3, $sp, 104 # 8-byte Folded Reload @@ -4300,18 +4292,18 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress .LBB3_482: # %unRLE_obuf_to_output_SMALL.exit.thread.loopexit886 addi.w $a0, $zero, -1 .LBB3_483: # %unRLE_obuf_to_output_SMALL.exit.thread - ld.d $s8, $sp, 520 # 8-byte Folded Reload - ld.d $s7, $sp, 528 # 8-byte Folded Reload - ld.d $s6, $sp, 536 # 8-byte Folded Reload - ld.d $s5, $sp, 544 # 8-byte Folded Reload - ld.d $s4, $sp, 552 # 8-byte Folded Reload - ld.d $s3, $sp, 560 # 8-byte Folded Reload - ld.d $s2, $sp, 568 # 8-byte Folded Reload - ld.d $s1, $sp, 576 # 8-byte Folded Reload - ld.d $s0, $sp, 584 # 8-byte Folded Reload - ld.d $fp, $sp, 592 # 8-byte Folded Reload - ld.d $ra, $sp, 600 # 8-byte Folded Reload - addi.d $sp, $sp, 608 + ld.d $s8, $sp, 504 # 8-byte Folded Reload + ld.d $s7, $sp, 512 # 8-byte Folded Reload + ld.d $s6, $sp, 520 # 8-byte Folded Reload + ld.d $s5, $sp, 528 # 8-byte Folded Reload + ld.d $s4, $sp, 536 # 8-byte Folded Reload + ld.d $s3, $sp, 544 # 8-byte Folded Reload + ld.d $s2, $sp, 552 # 8-byte Folded Reload + ld.d $s1, $sp, 560 # 8-byte Folded Reload + ld.d $s0, $sp, 568 # 8-byte Folded Reload + ld.d $fp, $sp, 576 # 8-byte Folded Reload + ld.d $ra, $sp, 584 # 8-byte Folded Reload + addi.d $sp, $sp, 592 ret .LBB3_484: addi.w $a0, $zero, -4 @@ -4323,7 +4315,7 @@ nsis_BZ2_bzDecompress: # @nsis_BZ2_bzDecompress move $a0, $zero b .LBB3_483 .LBB3_487: # %vector.main.loop.iter.check - ld.d $s2, $sp, 328 # 8-byte Folded Reload + ld.d $s2, $sp, 336 # 8-byte Folded Reload ori $a0, $zero, 16 bgeu $a3, $a0, .LBB3_490 
# %bb.488: # %vec.epilog.vector.body.preheader diff --git a/results/MultiSource/Applications/ClamAV/CMakeFiles/clamscan.dir/libclamav_pe.s b/results/MultiSource/Applications/ClamAV/CMakeFiles/clamscan.dir/libclamav_pe.s index 0ecf94b2..23c43c2f 100644 --- a/results/MultiSource/Applications/ClamAV/CMakeFiles/clamscan.dir/libclamav_pe.s +++ b/results/MultiSource/Applications/ClamAV/CMakeFiles/clamscan.dir/libclamav_pe.s @@ -5977,76 +5977,77 @@ cli_scanpe: # @cli_scanpe vld $vr2, $a1, %pc_lo12(.LCPI0_2) bstrpick.d $a1, $a0, 31, 2 slli.d $a1, $a1, 2 - vrepli.b $vr3, 0 lu12i.w $a2, -2 - vreplgr2vr.w $vr4, $a2 + vreplgr2vr.w $vr3, $a2 ori $a2, $zero, 36 move $a3, $a1 .LBB0_885: # %vector.body3955 # =>This Inner Loop Header: Depth=1 - vslli.w $vr5, $vr2, 1 - vilvh.w $vr6, $vr3, $vr5 - vilvl.w $vr7, $vr3, $vr5 - vpickve2gr.d $a4, $vr7, 0 - vpickve2gr.d $a5, $vr7, 1 - vpickve2gr.d $a6, $vr6, 0 - vpickve2gr.d $a7, $vr6, 1 + vslli.w $vr4, $vr2, 1 + vshuf4i.w $vr5, $vr4, 14 + vsllwil.du.wu $vr5, $vr5, 0 + vsllwil.du.wu $vr6, $vr4, 0 + vpickve2gr.d $a4, $vr6, 0 + vpickve2gr.d $a5, $vr6, 1 + vpickve2gr.d $a6, $vr5, 0 + vpickve2gr.d $a7, $vr5, 1 ld.d $t0, $sp, 136 # 8-byte Folded Reload ldx.b $a4, $t0, $a4 ldx.b $a5, $t0, $a5 ldx.b $a6, $t0, $a6 ldx.b $a7, $t0, $a7 - vinsgr2vr.b $vr6, $a4, 0 - vinsgr2vr.b $vr6, $a5, 4 - vinsgr2vr.b $vr6, $a6, 8 - vinsgr2vr.b $vr6, $a7, 12 - vslli.w $vr6, $vr6, 24 - vsrai.w $vr6, $vr6, 24 - vbitseti.w $vr5, $vr5, 0 - vilvh.w $vr7, $vr3, $vr5 - vilvl.w $vr5, $vr3, $vr5 - vpickve2gr.d $a4, $vr5, 0 - vpickve2gr.d $a5, $vr5, 1 - vpickve2gr.d $a6, $vr7, 0 - vpickve2gr.d $a7, $vr7, 1 + vinsgr2vr.b $vr5, $a4, 0 + vinsgr2vr.b $vr5, $a5, 1 + vinsgr2vr.b $vr5, $a6, 2 + vinsgr2vr.b $vr5, $a7, 3 + vsllwil.h.b $vr5, $vr5, 0 + vsllwil.w.h $vr5, $vr5, 0 + vbitseti.w $vr4, $vr4, 0 + vshuf4i.w $vr6, $vr4, 14 + vsllwil.du.wu $vr6, $vr6, 0 + vsllwil.du.wu $vr4, $vr4, 0 + vpickve2gr.d $a4, $vr4, 0 + vpickve2gr.d $a5, $vr4, 1 + vpickve2gr.d $a6, $vr6, 0 + vpickve2gr.d $a7, $vr6, 1 ldx.b $a4, $t0, $a4 ldx.b $a5, $t0, $a5 ldx.b $a6, $t0, $a6 ldx.b $a7, $t0, $a7 - vinsgr2vr.b $vr5, $a4, 0 - vinsgr2vr.b $vr5, $a5, 4 - vinsgr2vr.b $vr5, $a6, 8 - vinsgr2vr.b $vr5, $a7, 12 - vslli.w $vr5, $vr5, 24 - vsrai.w $vr5, $vr5, 24 + vinsgr2vr.b $vr4, $a4, 0 + vinsgr2vr.b $vr4, $a5, 1 + vinsgr2vr.b $vr4, $a6, 2 + vinsgr2vr.b $vr4, $a7, 3 + vsllwil.h.b $vr4, $vr4, 0 + vsllwil.w.h $vr4, $vr4, 0 lu12i.w $a4, 2 ori $a4, $a4, 396 add.d $a4, $sp, $a4 - vld $vr7, $a4, 0 - vslli.w $vr5, $vr5, 20 - vslli.w $vr6, $vr6, 12 - vor.v $vr5, $vr5, $vr6 - vreplvei.w $vr6, $vr7, 0 - vsub.w $vr5, $vr5, $vr6 - vadd.w $vr5, $vr5, $vr4 - vaddi.du $vr6, $vr0, 1 - vaddi.du $vr7, $vr1, 1 - vpickve2gr.d $a4, $vr7, 0 + vld $vr6, $a4, 0 + vslli.w $vr4, $vr4, 20 + vslli.w $vr5, $vr5, 12 + vor.v $vr4, $vr4, $vr5 + vreplvei.w $vr5, $vr6, 0 + vsub.w $vr4, $vr4, $vr5 + vadd.w $vr4, $vr4, $vr3 + vaddi.du $vr5, $vr0, 1 + vaddi.du $vr6, $vr1, 1 + vpickve2gr.d $a4, $vr6, 0 mul.d $a4, $a4, $a2 add.d $a4, $s5, $a4 - vpickve2gr.d $a5, $vr7, 1 + vpickve2gr.d $a5, $vr6, 1 mul.d $a5, $a5, $a2 add.d $a5, $s5, $a5 - vpickve2gr.d $a6, $vr6, 0 + vpickve2gr.d $a6, $vr5, 0 mul.d $a6, $a6, $a2 add.d $a6, $s5, $a6 - vpickve2gr.d $a7, $vr6, 1 + vpickve2gr.d $a7, $vr5, 1 mul.d $a7, $a7, $a2 add.d $a7, $s5, $a7 - vstelm.w $vr5, $a4, 0, 0 - vstelm.w $vr5, $a5, 0, 1 - vstelm.w $vr5, $a6, 0, 2 - vstelm.w $vr5, $a7, 0, 3 + vstelm.w $vr4, $a4, 0, 0 + vstelm.w $vr4, $a5, 0, 1 + vstelm.w $vr4, $a6, 0, 2 + vstelm.w $vr4, $a7, 0, 3 vaddi.du $vr1, $vr1, 4 vaddi.du $vr0, 
$vr0, 4 addi.d $a3, $a3, -4 diff --git a/results/MultiSource/Applications/ClamAV/CMakeFiles/clamscan.dir/libclamav_phishcheck.s b/results/MultiSource/Applications/ClamAV/CMakeFiles/clamscan.dir/libclamav_phishcheck.s index 9fb83e3f..ea4fb635 100644 --- a/results/MultiSource/Applications/ClamAV/CMakeFiles/clamscan.dir/libclamav_phishcheck.s +++ b/results/MultiSource/Applications/ClamAV/CMakeFiles/clamscan.dir/libclamav_phishcheck.s @@ -2474,8 +2474,8 @@ cleanupURL: # @cleanupURL st.b $a4, $a7, -1 .LBB10_137: # %pred.store.continue65 # in Loop: Header=BB10_135 Depth=1 - vilvl.b $vr1, $vr1, $vr1 - vilvl.h $vr1, $vr1, $vr1 + vsllwil.h.b $vr1, $vr1, 0 + vsllwil.w.h $vr1, $vr1, 0 vpickve2gr.w $t0, $vr1, 1 andi $t0, $t0, 1 bnez $t0, .LBB10_140 @@ -2753,8 +2753,8 @@ cleanupURL: # @cleanupURL st.b $a6, $a7, -1 .LBB10_187: # %pred.store.continue151 # in Loop: Header=BB10_185 Depth=1 - vilvl.b $vr1, $vr1, $vr1 - vilvl.h $vr1, $vr1, $vr1 + vsllwil.h.b $vr1, $vr1, 0 + vsllwil.w.h $vr1, $vr1, 0 vpickve2gr.w $t0, $vr1, 1 andi $t0, $t0, 1 bnez $t0, .LBB10_190 @@ -3032,8 +3032,8 @@ cleanupURL: # @cleanupURL st.b $a6, $a7, -1 .LBB10_237: # %pred.store.continue240 # in Loop: Header=BB10_235 Depth=1 - vilvl.b $vr1, $vr1, $vr1 - vilvl.h $vr1, $vr1, $vr1 + vsllwil.h.b $vr1, $vr1, 0 + vsllwil.w.h $vr1, $vr1, 0 vpickve2gr.w $t0, $vr1, 1 andi $t0, $t0, 1 bnez $t0, .LBB10_240 @@ -3311,8 +3311,8 @@ cleanupURL: # @cleanupURL st.b $a6, $a7, -1 .LBB10_287: # %pred.store.continue329 # in Loop: Header=BB10_285 Depth=1 - vilvl.b $vr1, $vr1, $vr1 - vilvl.h $vr1, $vr1, $vr1 + vsllwil.h.b $vr1, $vr1, 0 + vsllwil.w.h $vr1, $vr1, 0 vpickve2gr.w $t0, $vr1, 1 andi $t0, $t0, 1 bnez $t0, .LBB10_290 @@ -3589,8 +3589,8 @@ cleanupURL: # @cleanupURL st.b $a4, $a5, -1 .LBB10_337: # %pred.store.continue418 # in Loop: Header=BB10_335 Depth=1 - vilvl.b $vr1, $vr1, $vr1 - vilvl.h $vr1, $vr1, $vr1 + vsllwil.h.b $vr1, $vr1, 0 + vsllwil.w.h $vr1, $vr1, 0 vpickve2gr.w $a6, $vr1, 1 andi $a6, $a6, 1 bnez $a6, .LBB10_340 diff --git a/results/MultiSource/Applications/ClamAV/CMakeFiles/clamscan.dir/libclamav_readdb.s b/results/MultiSource/Applications/ClamAV/CMakeFiles/clamscan.dir/libclamav_readdb.s index 68e732eb..1e48ad8d 100644 --- a/results/MultiSource/Applications/ClamAV/CMakeFiles/clamscan.dir/libclamav_readdb.s +++ b/results/MultiSource/Applications/ClamAV/CMakeFiles/clamscan.dir/libclamav_readdb.s @@ -126,15 +126,11 @@ cli_parse_add: # @cli_parse_add vseq.b $vr4, $vr4, $vr2 vseq.b $vr5, $vr5, $vr2 vor.v $vr4, $vr6, $vr4 - vilvl.b $vr4, $vr4, $vr4 - vilvl.h $vr4, $vr4, $vr4 - vslli.w $vr4, $vr4, 24 - vsrai.w $vr4, $vr4, 24 + vsllwil.h.b $vr4, $vr4, 0 + vsllwil.w.h $vr4, $vr4, 0 vor.v $vr5, $vr7, $vr5 - vilvl.b $vr5, $vr5, $vr5 - vilvl.h $vr5, $vr5, $vr5 - vslli.w $vr5, $vr5, 24 - vsrai.w $vr5, $vr5, 24 + vsllwil.h.b $vr5, $vr5, 0 + vsllwil.w.h $vr5, $vr5, 0 vsub.w $vr0, $vr0, $vr4 vsub.w $vr3, $vr3, $vr5 addi.d $a3, $a3, -8 diff --git a/results/MultiSource/Applications/ClamAV/CMakeFiles/clamscan.dir/libclamav_regex_list.s b/results/MultiSource/Applications/ClamAV/CMakeFiles/clamscan.dir/libclamav_regex_list.s index 5ed8dfba..0113043e 100644 --- a/results/MultiSource/Applications/ClamAV/CMakeFiles/clamscan.dir/libclamav_regex_list.s +++ b/results/MultiSource/Applications/ClamAV/CMakeFiles/clamscan.dir/libclamav_regex_list.s @@ -1950,9 +1950,7 @@ load_regex_matcher: # @load_regex_matcher # => This Inner Loop Header: Depth=2 ld.d $a5, $a4, 0 vinsgr2vr.d $vr0, $a5, 0 - vilvl.b $vr0, $vr0, $vr0 - vslli.h $vr0, $vr0, 8 - vsrai.h $vr0, $vr0, 8 + 
vsllwil.h.b $vr0, $vr0, 0 vst $vr0, $a3, 0 addi.d $a2, $a2, 8 addi.d $a3, $a3, 16 @@ -1981,12 +1979,8 @@ load_regex_matcher: # @load_regex_matcher ld.d $a7, $a4, 0 vinsgr2vr.d $vr0, $a6, 0 vinsgr2vr.d $vr1, $a7, 0 - vilvl.b $vr0, $vr0, $vr0 - vslli.h $vr0, $vr0, 8 - vsrai.h $vr0, $vr0, 8 - vilvl.b $vr1, $vr1, $vr1 - vslli.h $vr1, $vr1, 8 - vsrai.h $vr1, $vr1, 8 + vsllwil.h.b $vr0, $vr0, 0 + vsllwil.h.b $vr1, $vr1, 0 vst $vr0, $a3, -16 vst $vr1, $a3, 0 addi.d $a5, $a5, -16 diff --git a/results/MultiSource/Applications/ClamAV/CMakeFiles/clamscan.dir/libclamav_str.s b/results/MultiSource/Applications/ClamAV/CMakeFiles/clamscan.dir/libclamav_str.s index f30f8dd3..f62a33b8 100644 --- a/results/MultiSource/Applications/ClamAV/CMakeFiles/clamscan.dir/libclamav_str.s +++ b/results/MultiSource/Applications/ClamAV/CMakeFiles/clamscan.dir/libclamav_str.s @@ -587,7 +587,6 @@ cli_str2hex: # @cli_str2hex ori $a3, $zero, 16 pcalau12i $a5, %pc_hi20(.LCPI3_2) pcalau12i $a4, %pc_hi20(.LCPI3_3) - vrepli.b $vr0, 0 bgeu $s0, $a3, .LBB3_14 # %bb.10: move $s5, $zero @@ -596,21 +595,21 @@ cli_str2hex: # @cli_str2hex move $a2, $s5 bstrpick.d $a3, $s4, 31, 3 slli.d $s5, $a3, 3 - vld $vr1, $a5, %pc_lo12(.LCPI3_2) - vld $vr2, $a4, %pc_lo12(.LCPI3_3) + vld $vr0, $a5, %pc_lo12(.LCPI3_2) + vld $vr1, $a4, %pc_lo12(.LCPI3_3) slli.d $a4, $a3, 4 - vreplgr2vr.w $vr3, $a6 - vadd.w $vr1, $vr3, $vr1 - vadd.w $vr2, $vr3, $vr2 + vreplgr2vr.w $vr2, $a6 + vadd.w $vr0, $vr2, $vr0 + vadd.w $vr1, $vr2, $vr1 sub.d $a5, $a2, $s5 add.d $a6, $s3, $a2 .p2align 4, , 16 .LBB3_12: # %vec.epilog.vector.body # =>This Inner Loop Header: Depth=1 ld.d $a2, $a6, 0 - vinsgr2vr.d $vr3, $a2, 0 - vsrli.b $vr4, $vr3, 4 - vpickve2gr.d $a2, $vr4, 0 + vinsgr2vr.d $vr2, $a2, 0 + vsrli.b $vr3, $vr2, 4 + vpickve2gr.d $a2, $vr3, 0 srli.d $a3, $a2, 56 bstrpick.d $a7, $a2, 51, 48 bstrpick.d $t0, $a2, 43, 40 @@ -627,18 +626,20 @@ cli_str2hex: # @cli_str2hex ldx.b $t0, $a1, $t0 ldx.b $a7, $a1, $a7 ldx.b $a3, $a1, $a3 - vilvh.w $vr4, $vr0, $vr1 - vilvl.w $vr5, $vr0, $vr1 - vilvh.w $vr6, $vr0, $vr2 - vilvl.w $vr7, $vr0, $vr2 - vpickve2gr.d $t5, $vr7, 0 - vpickve2gr.d $t6, $vr7, 1 - vpickve2gr.d $t7, $vr6, 0 - vpickve2gr.d $t8, $vr6, 1 - vpickve2gr.d $fp, $vr5, 0 - vpickve2gr.d $s0, $vr5, 1 - vpickve2gr.d $s1, $vr4, 0 - vpickve2gr.d $s2, $vr4, 1 + vshuf4i.w $vr3, $vr0, 14 + vsllwil.du.wu $vr3, $vr3, 0 + vsllwil.du.wu $vr4, $vr0, 0 + vshuf4i.w $vr5, $vr1, 14 + vsllwil.du.wu $vr5, $vr5, 0 + vsllwil.du.wu $vr6, $vr1, 0 + vpickve2gr.d $t5, $vr6, 0 + vpickve2gr.d $t6, $vr6, 1 + vpickve2gr.d $t7, $vr5, 0 + vpickve2gr.d $t8, $vr5, 1 + vpickve2gr.d $fp, $vr4, 0 + vpickve2gr.d $s0, $vr4, 1 + vpickve2gr.d $s1, $vr3, 0 + vpickve2gr.d $s2, $vr3, 1 stx.b $a2, $a0, $t5 stx.b $t4, $a0, $t6 stx.b $t3, $a0, $t7 @@ -647,9 +648,9 @@ cli_str2hex: # @cli_str2hex stx.b $t0, $a0, $s0 stx.b $a7, $a0, $s1 stx.b $a3, $a0, $s2 - vpickve2gr.d $a2, $vr3, 0 - vandi.b $vr3, $vr3, 15 - vpickve2gr.d $a3, $vr3, 0 + vpickve2gr.d $a2, $vr2, 0 + vandi.b $vr2, $vr2, 15 + vpickve2gr.d $a3, $vr2, 0 srli.d $a7, $a3, 56 bstrpick.d $t0, $a2, 51, 48 bstrpick.d $t1, $a2, 43, 40 @@ -666,20 +667,22 @@ cli_str2hex: # @cli_str2hex ldx.b $t1, $a1, $t1 ldx.b $t0, $a1, $t0 ldx.b $a7, $a1, $a7 - vbitseti.w $vr3, $vr2, 0 - vbitseti.w $vr4, $vr1, 0 - vilvh.w $vr5, $vr0, $vr4 - vilvl.w $vr4, $vr0, $vr4 - vilvh.w $vr6, $vr0, $vr3 - vilvl.w $vr3, $vr0, $vr3 - vpickve2gr.d $t5, $vr3, 0 - vpickve2gr.d $t6, $vr3, 1 - vpickve2gr.d $t7, $vr6, 0 - vpickve2gr.d $t8, $vr6, 1 - vpickve2gr.d $fp, $vr4, 0 - vpickve2gr.d $s0, $vr4, 1 - 
vpickve2gr.d $s1, $vr5, 0 - vpickve2gr.d $s2, $vr5, 1 + vbitseti.w $vr2, $vr1, 0 + vbitseti.w $vr3, $vr0, 0 + vshuf4i.w $vr4, $vr3, 14 + vsllwil.du.wu $vr4, $vr4, 0 + vsllwil.du.wu $vr3, $vr3, 0 + vshuf4i.w $vr5, $vr2, 14 + vsllwil.du.wu $vr5, $vr5, 0 + vsllwil.du.wu $vr2, $vr2, 0 + vpickve2gr.d $t5, $vr2, 0 + vpickve2gr.d $t6, $vr2, 1 + vpickve2gr.d $t7, $vr5, 0 + vpickve2gr.d $t8, $vr5, 1 + vpickve2gr.d $fp, $vr3, 0 + vpickve2gr.d $s0, $vr3, 1 + vpickve2gr.d $s1, $vr4, 0 + vpickve2gr.d $s2, $vr4, 1 stx.b $a3, $a0, $t5 stx.b $a2, $a0, $t6 stx.b $t4, $a0, $t7 @@ -688,8 +691,8 @@ cli_str2hex: # @cli_str2hex stx.b $t1, $a0, $s0 stx.b $t0, $a0, $s1 stx.b $a7, $a0, $s2 - vaddi.wu $vr2, $vr2, 16 vaddi.wu $vr1, $vr1, 16 + vaddi.wu $vr0, $vr0, 16 addi.d $a5, $a5, 8 addi.d $a6, $a6, 8 bnez $a5, .LBB3_12 @@ -702,206 +705,248 @@ cli_str2hex: # @cli_str2hex st.d $s4, $sp, 56 # 8-byte Folded Spill bstrpick.d $a6, $s4, 31, 4 pcalau12i $a3, %pc_hi20(.LCPI3_0) - vld $vr1, $a3, %pc_lo12(.LCPI3_0) + vld $vr0, $a3, %pc_lo12(.LCPI3_0) pcalau12i $a3, %pc_hi20(.LCPI3_1) - vld $vr2, $a3, %pc_lo12(.LCPI3_1) + vld $vr1, $a3, %pc_lo12(.LCPI3_1) st.d $a5, $sp, 24 # 8-byte Folded Spill - vld $vr3, $a5, %pc_lo12(.LCPI3_2) + vld $vr2, $a5, %pc_lo12(.LCPI3_2) st.d $a4, $sp, 32 # 8-byte Folded Spill - vld $vr4, $a4, %pc_lo12(.LCPI3_3) + vld $vr3, $a4, %pc_lo12(.LCPI3_3) slli.d $t1, $a6, 4 slli.d $a2, $a6, 5 st.d $a2, $sp, 16 # 8-byte Folded Spill - vrepli.w $vr5, 32 + vrepli.w $vr4, 32 st.d $s3, $sp, 64 # 8-byte Folded Spill move $t0, $s3 st.d $t1, $sp, 48 # 8-byte Folded Spill .p2align 4, , 16 .LBB3_15: # %vector.body # =>This Inner Loop Header: Depth=1 - vld $vr6, $t0, 0 - vsrli.b $vr7, $vr6, 4 - vilvh.b $vr8, $vr0, $vr7 - vilvh.h $vr9, $vr0, $vr8 - vilvh.w $vr10, $vr0, $vr9 - vilvl.w $vr9, $vr0, $vr9 - vilvl.h $vr8, $vr0, $vr8 - vilvh.w $vr11, $vr0, $vr8 - vilvl.w $vr8, $vr0, $vr8 - vilvl.b $vr7, $vr0, $vr7 - vilvh.h $vr12, $vr0, $vr7 - vilvh.w $vr13, $vr0, $vr12 - vilvl.w $vr12, $vr0, $vr12 - vilvl.h $vr7, $vr0, $vr7 - vilvh.w $vr14, $vr0, $vr7 - vilvl.w $vr7, $vr0, $vr7 - vpickve2gr.d $t2, $vr7, 0 - vpickve2gr.d $t3, $vr7, 1 - vpickve2gr.d $t4, $vr14, 0 - vpickve2gr.d $t5, $vr14, 1 - vpickve2gr.d $t6, $vr12, 0 - vpickve2gr.d $t7, $vr12, 1 - vpickve2gr.d $t8, $vr13, 0 - vpickve2gr.d $s0, $vr13, 1 - vpickve2gr.d $s1, $vr8, 0 - vpickve2gr.d $s2, $vr8, 1 - vpickve2gr.d $s3, $vr11, 0 - vpickve2gr.d $s4, $vr11, 1 - vpickve2gr.d $s5, $vr9, 0 - vpickve2gr.d $s6, $vr9, 1 - vpickve2gr.d $s7, $vr10, 0 - vpickve2gr.d $s8, $vr10, 1 - ldx.b $t2, $a1, $t2 - ldx.b $t3, $a1, $t3 - ldx.b $t4, $a1, $t4 - ldx.b $t5, $a1, $t5 - ldx.b $t6, $a1, $t6 - ldx.b $t7, $a1, $t7 - ldx.b $t8, $a1, $t8 - ldx.b $s0, $a1, $s0 - ldx.b $s1, $a1, $s1 - ldx.b $s2, $a1, $s2 - ldx.b $s3, $a1, $s3 - ldx.b $s4, $a1, $s4 - ldx.b $s5, $a1, $s5 - ldx.b $s6, $a1, $s6 - ldx.b $s7, $a1, $s7 - ldx.b $s8, $a1, $s8 - vilvh.w $vr7, $vr0, $vr1 - vilvl.w $vr8, $vr0, $vr1 - vilvh.w $vr9, $vr0, $vr2 - vilvl.w $vr10, $vr0, $vr2 - vilvh.w $vr11, $vr0, $vr3 - vilvl.w $vr12, $vr0, $vr3 - vilvh.w $vr13, $vr0, $vr4 - vilvl.w $vr14, $vr0, $vr4 - vpickve2gr.d $ra, $vr14, 0 - stx.b $t2, $a0, $ra - vpickve2gr.d $t2, $vr14, 1 - vpickve2gr.d $ra, $vr13, 0 - stx.b $t3, $a0, $t2 - vpickve2gr.d $t2, $vr13, 1 - vpickve2gr.d $t3, $vr12, 0 - stx.b $t4, $a0, $ra - vpickve2gr.d $t4, $vr12, 1 - vpickve2gr.d $ra, $vr11, 0 - stx.b $t5, $a0, $t2 - vpickve2gr.d $t2, $vr11, 1 - vpickve2gr.d $t5, $vr10, 0 - stx.b $t6, $a0, $t3 - vpickve2gr.d $t3, $vr10, 1 - vpickve2gr.d $t6, $vr9, 0 - stx.b $t7, 
$a0, $t4 - vpickve2gr.d $t4, $vr9, 1 - vpickve2gr.d $t7, $vr8, 0 - stx.b $t8, $a0, $ra - vpickve2gr.d $t8, $vr8, 1 - vpickve2gr.d $ra, $vr7, 0 - stx.b $s0, $a0, $t2 - vpickve2gr.d $t2, $vr7, 1 - stx.b $s1, $a0, $t5 - stx.b $s2, $a0, $t3 - stx.b $s3, $a0, $t6 - stx.b $s4, $a0, $t4 - stx.b $s5, $a0, $t7 - stx.b $s6, $a0, $t8 - stx.b $s7, $a0, $ra - stx.b $s8, $a0, $t2 - vandi.b $vr6, $vr6, 15 - vilvh.b $vr7, $vr0, $vr6 - vilvh.h $vr8, $vr0, $vr7 - vilvh.w $vr9, $vr0, $vr8 - vilvl.w $vr8, $vr0, $vr8 - vilvl.h $vr7, $vr0, $vr7 - vilvh.w $vr10, $vr0, $vr7 - vilvl.w $vr7, $vr0, $vr7 - vilvl.b $vr6, $vr0, $vr6 - vilvh.h $vr11, $vr0, $vr6 - vilvh.w $vr12, $vr0, $vr11 - vilvl.w $vr11, $vr0, $vr11 - vilvl.h $vr6, $vr0, $vr6 - vilvh.w $vr13, $vr0, $vr6 - vilvl.w $vr6, $vr0, $vr6 + vld $vr5, $t0, 0 + vsrli.b $vr6, $vr5, 4 + vbsrl.v $vr7, $vr6, 14 + vsllwil.hu.bu $vr7, $vr7, 0 + vsllwil.wu.hu $vr7, $vr7, 0 + vsllwil.du.wu $vr7, $vr7, 0 + vbsrl.v $vr8, $vr6, 12 + vsllwil.hu.bu $vr8, $vr8, 0 + vsllwil.wu.hu $vr8, $vr8, 0 + vsllwil.du.wu $vr8, $vr8, 0 + vbsrl.v $vr9, $vr6, 10 + vsllwil.hu.bu $vr9, $vr9, 0 + vsllwil.wu.hu $vr9, $vr9, 0 + vsllwil.du.wu $vr9, $vr9, 0 + vbsrl.v $vr10, $vr6, 8 + vsllwil.hu.bu $vr10, $vr10, 0 + vsllwil.wu.hu $vr10, $vr10, 0 + vsllwil.du.wu $vr10, $vr10, 0 + vsrli.d $vr11, $vr6, 48 + vsllwil.hu.bu $vr11, $vr11, 0 + vsllwil.wu.hu $vr11, $vr11, 0 + vsllwil.du.wu $vr11, $vr11, 0 + vsrli.d $vr12, $vr6, 32 + vsllwil.hu.bu $vr12, $vr12, 0 + vsllwil.wu.hu $vr12, $vr12, 0 + vsllwil.du.wu $vr12, $vr12, 0 + vshuf4i.b $vr13, $vr6, 14 + vsllwil.hu.bu $vr13, $vr13, 0 + vsllwil.wu.hu $vr13, $vr13, 0 + vsllwil.du.wu $vr13, $vr13, 0 + vsllwil.hu.bu $vr6, $vr6, 0 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.du.wu $vr6, $vr6, 0 vpickve2gr.d $t2, $vr6, 0 vpickve2gr.d $t3, $vr6, 1 vpickve2gr.d $t4, $vr13, 0 vpickve2gr.d $t5, $vr13, 1 - vpickve2gr.d $t6, $vr11, 0 - vpickve2gr.d $t7, $vr11, 1 - vpickve2gr.d $t8, $vr12, 0 - vpickve2gr.d $s0, $vr12, 1 - vpickve2gr.d $s1, $vr7, 0 - vpickve2gr.d $s2, $vr7, 1 - vpickve2gr.d $s3, $vr10, 0 - vpickve2gr.d $s4, $vr10, 1 - vpickve2gr.d $s5, $vr8, 0 - vpickve2gr.d $s6, $vr8, 1 - vpickve2gr.d $s7, $vr9, 0 - vpickve2gr.d $s8, $vr9, 1 - ldx.b $ra, $a1, $t2 - ldx.b $a7, $a1, $t3 - ldx.b $a6, $a1, $t4 - ldx.b $a4, $a1, $t5 - ldx.b $a5, $a1, $t6 - ldx.b $a2, $a1, $t7 - ldx.b $fp, $a1, $t8 - ldx.b $a3, $a1, $s0 - ldx.b $s0, $a1, $s1 - ldx.b $t8, $a1, $s2 - ldx.b $t7, $a1, $s3 - ldx.b $t6, $a1, $s4 + vpickve2gr.d $t6, $vr12, 0 + vpickve2gr.d $t7, $vr12, 1 + vpickve2gr.d $t8, $vr11, 0 + vpickve2gr.d $s0, $vr11, 1 + vpickve2gr.d $s2, $vr10, 0 + vpickve2gr.d $s3, $vr10, 1 + vpickve2gr.d $s4, $vr9, 0 + vpickve2gr.d $s5, $vr9, 1 + vpickve2gr.d $s6, $vr8, 0 + vpickve2gr.d $s7, $vr8, 1 + vpickve2gr.d $s8, $vr7, 0 + vpickve2gr.d $ra, $vr7, 1 + ldx.b $a7, $a1, $t2 + ldx.b $a6, $a1, $t3 + ldx.b $a4, $a1, $t4 + ldx.b $a5, $a1, $t5 + ldx.b $a2, $a1, $t6 + ldx.b $fp, $a1, $t7 + ldx.b $a3, $a1, $t8 + ldx.b $s1, $a1, $s0 + ldx.b $t2, $a1, $s2 + ldx.b $t3, $a1, $s3 + ldx.b $t4, $a1, $s4 ldx.b $t5, $a1, $s5 - ldx.b $t4, $a1, $s6 - ldx.b $t3, $a1, $s7 - ldx.b $t2, $a1, $s8 - vbitseti.w $vr6, $vr4, 0 - vilvl.w $vr7, $vr0, $vr6 - vpickve2gr.d $s1, $vr7, 0 - stx.b $ra, $a0, $s1 - vpickve2gr.d $s1, $vr7, 1 - stx.b $a7, $a0, $s1 - vilvh.w $vr6, $vr0, $vr6 - vpickve2gr.d $a7, $vr6, 0 + ldx.b $t6, $a1, $s6 + ldx.b $t7, $a1, $s7 + ldx.b $t8, $a1, $s8 + ldx.b $s0, $a1, $ra + vshuf4i.w $vr6, $vr0, 14 + vsllwil.du.wu $vr6, $vr6, 0 + vsllwil.du.wu $vr7, $vr0, 0 + vshuf4i.w $vr8, $vr1, 14 + vsllwil.du.wu 
$vr8, $vr8, 0 + vsllwil.du.wu $vr9, $vr1, 0 + vshuf4i.w $vr10, $vr2, 14 + vsllwil.du.wu $vr10, $vr10, 0 + vsllwil.du.wu $vr11, $vr2, 0 + vshuf4i.w $vr12, $vr3, 14 + vsllwil.du.wu $vr12, $vr12, 0 + vsllwil.du.wu $vr13, $vr3, 0 + vpickve2gr.d $s2, $vr13, 0 + stx.b $a7, $a0, $s2 + vpickve2gr.d $a7, $vr13, 1 + vpickve2gr.d $s2, $vr12, 0 stx.b $a6, $a0, $a7 - vpickve2gr.d $a6, $vr6, 1 - stx.b $a4, $a0, $a6 - vbitseti.w $vr6, $vr3, 0 - vilvl.w $vr7, $vr0, $vr6 - vpickve2gr.d $a4, $vr7, 0 - stx.b $a5, $a0, $a4 - vpickve2gr.d $a4, $vr7, 1 - stx.b $a2, $a0, $a4 - vilvh.w $vr6, $vr0, $vr6 - vpickve2gr.d $a2, $vr6, 0 - stx.b $fp, $a0, $a2 + vpickve2gr.d $a6, $vr12, 1 + vpickve2gr.d $a7, $vr11, 0 + stx.b $a4, $a0, $s2 + vpickve2gr.d $a4, $vr11, 1 + vpickve2gr.d $s2, $vr10, 0 + stx.b $a5, $a0, $a6 + vpickve2gr.d $a5, $vr10, 1 + vpickve2gr.d $a6, $vr9, 0 + stx.b $a2, $a0, $a7 + vpickve2gr.d $a2, $vr9, 1 + vpickve2gr.d $a7, $vr8, 0 + stx.b $fp, $a0, $a4 + vpickve2gr.d $a4, $vr8, 1 + vpickve2gr.d $fp, $vr7, 0 + stx.b $a3, $a0, $s2 + vpickve2gr.d $a3, $vr7, 1 + vpickve2gr.d $s2, $vr6, 0 + stx.b $s1, $a0, $a5 + vpickve2gr.d $a5, $vr6, 1 + stx.b $t2, $a0, $a6 + stx.b $t3, $a0, $a2 + stx.b $t4, $a0, $a7 + stx.b $t5, $a0, $a4 + stx.b $t6, $a0, $fp + stx.b $t7, $a0, $a3 + stx.b $t8, $a0, $s2 + stx.b $s0, $a0, $a5 + vandi.b $vr5, $vr5, 15 + vbsrl.v $vr6, $vr5, 14 + vsllwil.hu.bu $vr6, $vr6, 0 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.du.wu $vr6, $vr6, 0 + vbsrl.v $vr7, $vr5, 12 + vsllwil.hu.bu $vr7, $vr7, 0 + vsllwil.wu.hu $vr7, $vr7, 0 + vsllwil.du.wu $vr7, $vr7, 0 + vbsrl.v $vr8, $vr5, 10 + vsllwil.hu.bu $vr8, $vr8, 0 + vsllwil.wu.hu $vr8, $vr8, 0 + vsllwil.du.wu $vr8, $vr8, 0 + vbsrl.v $vr9, $vr5, 8 + vsllwil.hu.bu $vr9, $vr9, 0 + vsllwil.wu.hu $vr9, $vr9, 0 + vsllwil.du.wu $vr9, $vr9, 0 + vsrli.d $vr10, $vr5, 48 + vsllwil.hu.bu $vr10, $vr10, 0 + vsllwil.wu.hu $vr10, $vr10, 0 + vsllwil.du.wu $vr10, $vr10, 0 + vsrli.d $vr11, $vr5, 32 + vsllwil.hu.bu $vr11, $vr11, 0 + vsllwil.wu.hu $vr11, $vr11, 0 + vsllwil.du.wu $vr11, $vr11, 0 + vshuf4i.b $vr12, $vr5, 14 + vsllwil.hu.bu $vr12, $vr12, 0 + vsllwil.wu.hu $vr12, $vr12, 0 + vsllwil.du.wu $vr12, $vr12, 0 + vsllwil.hu.bu $vr5, $vr5, 0 + vsllwil.wu.hu $vr5, $vr5, 0 + vsllwil.du.wu $vr5, $vr5, 0 + vpickve2gr.d $a2, $vr5, 0 + vpickve2gr.d $a3, $vr5, 1 + vpickve2gr.d $a4, $vr12, 0 + vpickve2gr.d $a5, $vr12, 1 + vpickve2gr.d $a6, $vr11, 0 + vpickve2gr.d $a7, $vr11, 1 + vpickve2gr.d $t2, $vr10, 0 + vpickve2gr.d $t3, $vr10, 1 + vpickve2gr.d $t4, $vr9, 0 + vpickve2gr.d $t5, $vr9, 1 + vpickve2gr.d $t6, $vr8, 0 + vpickve2gr.d $fp, $vr8, 1 + vpickve2gr.d $s4, $vr7, 0 + vpickve2gr.d $s5, $vr7, 1 + vpickve2gr.d $s6, $vr6, 0 + vpickve2gr.d $s7, $vr6, 1 + ldx.b $a2, $a1, $a2 + ldx.b $a3, $a1, $a3 + ldx.b $a4, $a1, $a4 + ldx.b $a5, $a1, $a5 + ldx.b $a6, $a1, $a6 + ldx.b $s3, $a1, $a7 + ldx.b $s2, $a1, $t2 + ldx.b $s1, $a1, $t3 + ldx.b $s0, $a1, $t4 + ldx.b $t8, $a1, $t5 + ldx.b $t7, $a1, $t6 + ldx.b $t6, $a1, $fp + ldx.b $t5, $a1, $s4 + ldx.b $t4, $a1, $s5 + ldx.b $t3, $a1, $s6 + ldx.b $t2, $a1, $s7 + vbitseti.w $vr5, $vr3, 0 + vsllwil.du.wu $vr6, $vr5, 0 + vpickve2gr.d $a7, $vr6, 0 + stx.b $a2, $a0, $a7 vpickve2gr.d $a2, $vr6, 1 stx.b $a3, $a0, $a2 - vbitseti.w $vr6, $vr2, 0 - vilvl.w $vr7, $vr0, $vr6 - vpickve2gr.d $a2, $vr7, 0 + vshuf4i.w $vr5, $vr5, 14 + vsllwil.du.wu $vr5, $vr5, 0 + vpickve2gr.d $a2, $vr5, 0 + stx.b $a4, $a0, $a2 + vpickve2gr.d $a2, $vr5, 1 + stx.b $a5, $a0, $a2 + vbitseti.w $vr5, $vr2, 0 + vsllwil.du.wu $vr6, $vr5, 0 + vpickve2gr.d $a2, $vr6, 0 + stx.b $a6, 
$a0, $a2 + vpickve2gr.d $a2, $vr6, 1 + stx.b $s3, $a0, $a2 + vshuf4i.w $vr5, $vr5, 14 + vsllwil.du.wu $vr5, $vr5, 0 + vpickve2gr.d $a2, $vr5, 0 + stx.b $s2, $a0, $a2 + vpickve2gr.d $a2, $vr5, 1 + stx.b $s1, $a0, $a2 + vbitseti.w $vr5, $vr1, 0 + vsllwil.du.wu $vr6, $vr5, 0 + vpickve2gr.d $a2, $vr6, 0 stx.b $s0, $a0, $a2 - vpickve2gr.d $a2, $vr7, 1 + vpickve2gr.d $a2, $vr6, 1 stx.b $t8, $a0, $a2 - vilvh.w $vr6, $vr0, $vr6 - vpickve2gr.d $a2, $vr6, 0 + vshuf4i.w $vr5, $vr5, 14 + vsllwil.du.wu $vr5, $vr5, 0 + vpickve2gr.d $a2, $vr5, 0 stx.b $t7, $a0, $a2 - vpickve2gr.d $a2, $vr6, 1 + vpickve2gr.d $a2, $vr5, 1 stx.b $t6, $a0, $a2 - vbitseti.w $vr6, $vr1, 0 - vilvl.w $vr7, $vr0, $vr6 - vpickve2gr.d $a2, $vr7, 0 + vbitseti.w $vr5, $vr0, 0 + vsllwil.du.wu $vr6, $vr5, 0 + vpickve2gr.d $a2, $vr6, 0 stx.b $t5, $a0, $a2 - vpickve2gr.d $a2, $vr7, 1 + vpickve2gr.d $a2, $vr6, 1 stx.b $t4, $a0, $a2 - vilvh.w $vr6, $vr0, $vr6 - vpickve2gr.d $a2, $vr6, 0 + vshuf4i.w $vr5, $vr5, 14 + vsllwil.du.wu $vr5, $vr5, 0 + vpickve2gr.d $a2, $vr5, 0 stx.b $t3, $a0, $a2 - vpickve2gr.d $a2, $vr6, 1 + vpickve2gr.d $a2, $vr5, 1 stx.b $t2, $a0, $a2 - vadd.w $vr4, $vr4, $vr5 - vadd.w $vr3, $vr3, $vr5 - vadd.w $vr2, $vr2, $vr5 - vadd.w $vr1, $vr1, $vr5 + vadd.w $vr3, $vr3, $vr4 + vadd.w $vr2, $vr2, $vr4 + vadd.w $vr1, $vr1, $vr4 + vadd.w $vr0, $vr0, $vr4 addi.d $t1, $t1, -16 addi.d $t0, $t0, 16 bnez $t1, .LBB3_15 diff --git a/results/MultiSource/Applications/ClamAV/CMakeFiles/clamscan.dir/shared_sha256.s b/results/MultiSource/Applications/ClamAV/CMakeFiles/clamscan.dir/shared_sha256.s index a7e3f88c..7aa59913 100644 --- a/results/MultiSource/Applications/ClamAV/CMakeFiles/clamscan.dir/shared_sha256.s +++ b/results/MultiSource/Applications/ClamAV/CMakeFiles/clamscan.dir/shared_sha256.s @@ -101,55 +101,72 @@ sha256_update: # @sha256_update .p2align 4, 0x0 # -- Begin function sha256_block .LCPI2_0: .byte 1 # 0x1 - .byte 17 # 0x11 - .byte 18 # 0x12 - .byte 19 # 0x13 .byte 4 # 0x4 - .byte 21 # 0x15 - .byte 22 # 0x16 - .byte 23 # 0x17 .byte 9 # 0x9 - .byte 25 # 0x19 - .byte 26 # 0x1a - .byte 27 # 0x1b .byte 12 # 0xc - .byte 29 # 0x1d - .byte 30 # 0x1e - .byte 31 # 0x1f + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff .LCPI2_1: .byte 0 # 0x0 - .byte 17 # 0x11 - .byte 18 # 0x12 - .byte 19 # 0x13 .byte 5 # 0x5 - .byte 21 # 0x15 - .byte 22 # 0x16 - .byte 23 # 0x17 .byte 8 # 0x8 - .byte 25 # 0x19 - .byte 26 # 0x1a - .byte 27 # 0x1b .byte 13 # 0xd - .byte 29 # 0x1d - .byte 30 # 0x1e - .byte 31 # 0x1f + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff .LCPI2_2: .byte 2 # 0x2 - .byte 17 # 0x11 - .byte 18 # 0x12 - .byte 19 # 0x13 .byte 6 # 0x6 - .byte 21 # 0x15 - .byte 22 # 0x16 - .byte 23 # 0x17 .byte 10 # 0xa - .byte 25 # 0x19 - .byte 26 # 0x1a - .byte 27 # 0x1b .byte 14 # 0xe - .byte 29 # 0x1d - .byte 30 # 0x1e - .byte 31 # 0x1f + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff +.LCPI2_3: + .byte 3 # 0x3 + .byte 7 # 0x7 + .byte 11 # 0xb + .byte 15 # 0xf + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + 
.byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff .text .p2align 5 .type sha256_block,@function @@ -167,65 +184,98 @@ sha256_block: # @sha256_block .LBB2_2: addi.d $sp, $sp, -96 st.d $ra, $sp, 88 # 8-byte Folded Spill - vld $vr2, $a1, 0 + vld $vr6, $a1, 0 pcalau12i $a2, %pc_hi20(.LCPI2_0) - vld $vr1, $a2, %pc_lo12(.LCPI2_0) + vld $vr0, $a2, %pc_lo12(.LCPI2_0) pcalau12i $a2, %pc_hi20(.LCPI2_1) - vld $vr3, $a2, %pc_lo12(.LCPI2_1) - vrepli.b $vr0, 0 - vshuf.b $vr4, $vr0, $vr2, $vr1 - vshuf.b $vr5, $vr0, $vr2, $vr3 + vld $vr1, $a2, %pc_lo12(.LCPI2_1) + vshuf.b $vr2, $vr0, $vr6, $vr0 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr3, $vr2, 0 + vshuf.b $vr2, $vr0, $vr6, $vr1 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr5, $vr2, 0 ori $a2, $zero, 8 lu32i.d $a2, 16 - vreplgr2vr.d $vr6, $a2 - vsll.w $vr4, $vr4, $vr6 + vreplgr2vr.d $vr2, $a2 + vsll.w $vr7, $vr3, $vr2 pcalau12i $a2, %pc_hi20(.LCPI2_2) - vld $vr7, $a2, %pc_lo12(.LCPI2_2) + vld $vr3, $a2, %pc_lo12(.LCPI2_2) ori $a2, $zero, 16 lu32i.d $a2, 8 - vreplgr2vr.d $vr8, $a2 - vsll.w $vr5, $vr5, $vr8 - vor.v $vr4, $vr5, $vr4 - vshuf.b $vr5, $vr0, $vr2, $vr7 - vor.v $vr4, $vr4, $vr5 - vslli.w $vr4, $vr4, 8 - vld $vr5, $a1, 16 - vsrli.w $vr2, $vr2, 24 - vor.v $vr2, $vr4, $vr2 - vst $vr2, $sp, 16 - vshuf.b $vr2, $vr0, $vr5, $vr1 - vshuf.b $vr4, $vr0, $vr5, $vr3 - vsll.w $vr2, $vr2, $vr6 - vsll.w $vr4, $vr4, $vr8 - vor.v $vr2, $vr4, $vr2 - vshuf.b $vr4, $vr0, $vr5, $vr7 - vor.v $vr2, $vr2, $vr4 - vslli.w $vr2, $vr2, 8 - vld $vr4, $a1, 32 - vsrli.w $vr5, $vr5, 24 - vor.v $vr2, $vr2, $vr5 - vst $vr2, $sp, 32 - vshuf.b $vr2, $vr0, $vr4, $vr1 - vshuf.b $vr5, $vr0, $vr4, $vr3 - vsll.w $vr2, $vr2, $vr6 - vsll.w $vr5, $vr5, $vr8 - vor.v $vr2, $vr5, $vr2 - vshuf.b $vr5, $vr0, $vr4, $vr7 - vor.v $vr2, $vr2, $vr5 - vslli.w $vr2, $vr2, 8 - vld $vr5, $a1, 48 - vsrli.w $vr4, $vr4, 24 - vor.v $vr2, $vr2, $vr4 - vst $vr2, $sp, 48 - vshuf.b $vr1, $vr0, $vr5, $vr1 - vshuf.b $vr2, $vr0, $vr5, $vr3 - vsll.w $vr1, $vr1, $vr6 - vsll.w $vr2, $vr2, $vr8 - vor.v $vr1, $vr2, $vr1 - vshuf.b $vr0, $vr0, $vr5, $vr7 + vreplgr2vr.d $vr4, $a2 + vsll.w $vr5, $vr5, $vr4 + vor.v $vr7, $vr5, $vr7 + vshuf.b $vr5, $vr0, $vr6, $vr3 + vsllwil.hu.bu $vr8, $vr5, 0 + pcalau12i $a2, %pc_hi20(.LCPI2_3) + vld $vr5, $a2, %pc_lo12(.LCPI2_3) + vsllwil.wu.hu $vr8, $vr8, 0 + vor.v $vr7, $vr7, $vr8 + vslli.w $vr7, $vr7, 8 + vshuf.b $vr6, $vr0, $vr6, $vr5 + vsllwil.hu.bu $vr6, $vr6, 0 + vld $vr8, $a1, 16 + vsllwil.wu.hu $vr6, $vr6, 0 + vor.v $vr6, $vr7, $vr6 + vst $vr6, $sp, 16 + vshuf.b $vr6, $vr0, $vr8, $vr0 + vsllwil.hu.bu $vr6, $vr6, 0 + vsllwil.wu.hu $vr6, $vr6, 0 + vshuf.b $vr7, $vr0, $vr8, $vr1 + vsllwil.hu.bu $vr7, $vr7, 0 + vsllwil.wu.hu $vr7, $vr7, 0 + vsll.w $vr6, $vr6, $vr2 + vsll.w $vr7, $vr7, $vr4 + vor.v $vr6, $vr7, $vr6 + vshuf.b $vr7, $vr0, $vr8, $vr3 + vsllwil.hu.bu $vr7, $vr7, 0 + vsllwil.wu.hu $vr7, $vr7, 0 + vor.v $vr6, $vr6, $vr7 + vslli.w $vr6, $vr6, 8 + vshuf.b $vr7, $vr0, $vr8, $vr5 + vsllwil.hu.bu $vr7, $vr7, 0 + vld $vr8, $a1, 32 + vsllwil.wu.hu $vr7, $vr7, 0 + vor.v $vr6, $vr6, $vr7 + vst $vr6, $sp, 32 + vshuf.b $vr6, $vr0, $vr8, $vr0 + vsllwil.hu.bu $vr6, $vr6, 0 + vsllwil.wu.hu $vr6, $vr6, 0 + vshuf.b $vr7, $vr0, $vr8, $vr1 + vsllwil.hu.bu $vr7, $vr7, 0 + vsllwil.wu.hu $vr7, $vr7, 0 + vsll.w $vr6, $vr6, $vr2 + vsll.w $vr7, $vr7, $vr4 + vor.v $vr6, $vr7, $vr6 + vshuf.b $vr7, $vr0, $vr8, $vr3 + vsllwil.hu.bu $vr7, $vr7, 0 + vsllwil.wu.hu $vr7, $vr7, 0 
+ vor.v $vr6, $vr6, $vr7 + vslli.w $vr6, $vr6, 8 + vshuf.b $vr7, $vr0, $vr8, $vr5 + vsllwil.hu.bu $vr7, $vr7, 0 + vld $vr8, $a1, 48 + vsllwil.wu.hu $vr7, $vr7, 0 + vor.v $vr6, $vr6, $vr7 + vst $vr6, $sp, 48 + vshuf.b $vr0, $vr0, $vr8, $vr0 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vshuf.b $vr1, $vr0, $vr8, $vr1 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vsll.w $vr0, $vr0, $vr2 + vsll.w $vr1, $vr1, $vr4 vor.v $vr0, $vr1, $vr0 + vshuf.b $vr1, $vr0, $vr8, $vr3 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vor.v $vr0, $vr0, $vr1 vslli.w $vr0, $vr0, 8 - vsrli.w $vr1, $vr5, 24 + vshuf.b $vr1, $vr0, $vr8, $vr5 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 vor.v $vr0, $vr0, $vr1 vst $vr0, $sp, 64 addi.d $a1, $sp, 16 diff --git a/results/MultiSource/Applications/ClamAV/CMakeFiles/clamscan.dir/zlib_infback.s b/results/MultiSource/Applications/ClamAV/CMakeFiles/clamscan.dir/zlib_infback.s index 8345623e..ef5e1d69 100644 --- a/results/MultiSource/Applications/ClamAV/CMakeFiles/clamscan.dir/zlib_infback.s +++ b/results/MultiSource/Applications/ClamAV/CMakeFiles/clamscan.dir/zlib_infback.s @@ -87,26 +87,26 @@ inflateBackInit_: # @inflateBackInit_ .type inflateBack,@function inflateBack: # @inflateBack # %bb.0: - addi.d $sp, $sp, -272 - st.d $ra, $sp, 264 # 8-byte Folded Spill - st.d $fp, $sp, 256 # 8-byte Folded Spill - st.d $s0, $sp, 248 # 8-byte Folded Spill - st.d $s1, $sp, 240 # 8-byte Folded Spill - st.d $s2, $sp, 232 # 8-byte Folded Spill - st.d $s3, $sp, 224 # 8-byte Folded Spill - st.d $s4, $sp, 216 # 8-byte Folded Spill - st.d $s5, $sp, 208 # 8-byte Folded Spill - st.d $s6, $sp, 200 # 8-byte Folded Spill - st.d $s7, $sp, 192 # 8-byte Folded Spill - st.d $s8, $sp, 184 # 8-byte Folded Spill - st.d $a2, $sp, 168 # 8-byte Folded Spill - st.d $a1, $sp, 160 # 8-byte Folded Spill + addi.d $sp, $sp, -256 + st.d $ra, $sp, 248 # 8-byte Folded Spill + st.d $fp, $sp, 240 # 8-byte Folded Spill + st.d $s0, $sp, 232 # 8-byte Folded Spill + st.d $s1, $sp, 224 # 8-byte Folded Spill + st.d $s2, $sp, 216 # 8-byte Folded Spill + st.d $s3, $sp, 208 # 8-byte Folded Spill + st.d $s4, $sp, 200 # 8-byte Folded Spill + st.d $s5, $sp, 192 # 8-byte Folded Spill + st.d $s6, $sp, 184 # 8-byte Folded Spill + st.d $s7, $sp, 176 # 8-byte Folded Spill + st.d $s8, $sp, 168 # 8-byte Folded Spill + st.d $a2, $sp, 152 # 8-byte Folded Spill + st.d $a1, $sp, 144 # 8-byte Folded Spill addi.w $s4, $zero, -2 - beqz $a0, .LBB1_201 + beqz $a0, .LBB1_200 # %bb.1: move $s6, $a0 ld.d $s0, $a0, 56 - beqz $s0, .LBB1_201 + beqz $s0, .LBB1_200 # %bb.2: move $s7, $a4 st.d $zero, $s6, 48 @@ -115,7 +115,7 @@ inflateBack: # @inflateBack st.d $a0, $s0, 8 ld.d $a1, $s6, 0 st.w $zero, $s0, 64 - st.d $a1, $sp, 176 + st.d $a1, $sp, 160 st.d $a3, $sp, 96 # 8-byte Folded Spill beqz $a1, .LBB1_4 # %bb.3: @@ -159,37 +159,35 @@ inflateBack: # @inflateBack lu12i.w $a1, 15 ori $a1, $a1, 4095 st.d $a1, $sp, 112 # 8-byte Folded Spill - vrepli.b $vr4, 0 st.d $s7, $sp, 120 # 8-byte Folded Spill - vst $vr4, $sp, 144 # 16-byte Folded Spill .p2align 4, , 16 .LBB1_6: # =>This Loop Header: Depth=1 # Child Loop BB1_15 Depth 2 # Child Loop BB1_20 Depth 2 - # Child Loop BB1_69 Depth 2 - # Child Loop BB1_52 Depth 2 - # Child Loop BB1_73 Depth 2 - # Child Loop BB1_77 Depth 3 - # Child Loop BB1_96 Depth 3 - # Child Loop BB1_86 Depth 3 - # Child Loop BB1_91 Depth 3 - # Child Loop BB1_107 Depth 3 - # Child Loop BB1_111 Depth 3 - # Child Loop BB1_130 Depth 2 - # Child Loop BB1_134 Depth 2 - # Child Loop BB1_143 
Depth 2 - # Child Loop BB1_157 Depth 2 - # Child Loop BB1_166 Depth 2 - # Child Loop BB1_173 Depth 2 - # Child Loop BB1_184 Depth 2 - # Child Loop BB1_191 Depth 3 - # Child Loop BB1_195 Depth 3 + # Child Loop BB1_68 Depth 2 + # Child Loop BB1_49 Depth 2 + # Child Loop BB1_72 Depth 2 + # Child Loop BB1_76 Depth 3 + # Child Loop BB1_95 Depth 3 + # Child Loop BB1_85 Depth 3 + # Child Loop BB1_90 Depth 3 + # Child Loop BB1_106 Depth 3 + # Child Loop BB1_110 Depth 3 + # Child Loop BB1_129 Depth 2 + # Child Loop BB1_133 Depth 2 + # Child Loop BB1_142 Depth 2 + # Child Loop BB1_156 Depth 2 + # Child Loop BB1_165 Depth 2 + # Child Loop BB1_172 Depth 2 + # Child Loop BB1_183 Depth 2 + # Child Loop BB1_190 Depth 3 + # Child Loop BB1_194 Depth 3 # Child Loop BB1_26 Depth 2 # Child Loop BB1_33 Depth 2 ld.d $a1, $sp, 136 # 8-byte Folded Reload add.d $a0, $a0, $a1 ori $a1, $zero, 18 - bltu $a1, $a0, .LBB1_200 + bltu $a1, $a0, .LBB1_199 # %bb.7: # in Loop: Header=BB1_6 Depth=1 slli.d $a0, $a0, 2 ld.d $a1, $sp, 128 # 8-byte Folded Reload @@ -198,14 +196,14 @@ inflateBack: # @inflateBack jr $a0 .LBB1_8: # in Loop: Header=BB1_6 Depth=1 ld.w $a0, $s0, 12 - beqz $a0, .LBB1_42 + beqz $a0, .LBB1_39 # %bb.9: # in Loop: Header=BB1_6 Depth=1 andi $a0, $s3, 7 srl.d $fp, $fp, $a0 bstrins.d $s3, $zero, 2, 0 lu12i.w $a0, 3 ori $a0, $a0, 3920 - b .LBB1_40 + b .LBB1_55 .p2align 4, , 16 .LBB1_10: # %.preheader693 # in Loop: Header=BB1_6 Depth=1 @@ -233,15 +231,15 @@ inflateBack: # @inflateBack bgeu $a0, $a1, .LBB1_37 # %bb.13: # %.preheader682.preheader # in Loop: Header=BB1_6 Depth=1 - move $a0, $zero + move $a1, $zero st.w $zero, $s0, 140 ld.d $s1, $sp, 104 # 8-byte Folded Reload b .LBB1_20 .p2align 4, , 16 .LBB1_14: # in Loop: Header=BB1_15 Depth=2 - ld.d $a0, $sp, 176 + ld.d $a0, $sp, 160 addi.d $a1, $a0, 1 - st.d $a1, $sp, 176 + st.d $a1, $sp, 160 ld.bu $a0, $a0, 0 addi.w $s5, $s5, -1 sll.d $a0, $a0, $s1 @@ -256,60 +254,58 @@ inflateBack: # @inflateBack move $s1, $a0 bnez $s5, .LBB1_14 # %bb.16: # in Loop: Header=BB1_15 Depth=2 - addi.d $a1, $sp, 176 - ld.d $a0, $sp, 168 # 8-byte Folded Reload - ld.d $a2, $sp, 160 # 8-byte Folded Reload + addi.d $a1, $sp, 160 + ld.d $a0, $sp, 152 # 8-byte Folded Reload + ld.d $a2, $sp, 144 # 8-byte Folded Reload jirl $ra, $a2, 0 - vld $vr4, $sp, 144 # 16-byte Folded Reload move $s5, $a0 bnez $a0, .LBB1_14 - b .LBB1_198 + b .LBB1_197 .LBB1_17: # %.._crit_edge849_crit_edge # in Loop: Header=BB1_20 Depth=2 move $s5, $a0 - ld.w $a0, $s0, 140 + ld.w $a1, $s0, 140 ld.w $a2, $s0, 128 - vld $vr4, $sp, 144 # 16-byte Folded Reload .LBB1_18: # %._crit_edge849 # in Loop: Header=BB1_20 Depth=2 - ld.d $a1, $sp, 176 - addi.d $a3, $a1, 1 - st.d $a3, $sp, 176 - ld.bu $a1, $a1, 0 - sll.d $a1, $a1, $s3 + ld.d $a0, $sp, 160 + addi.d $a3, $a0, 1 + st.d $a3, $sp, 160 + ld.bu $a0, $a0, 0 + sll.d $a0, $a0, $s3 ori $s3, $s3, 8 addi.w $s5, $s5, -1 - add.d $fp, $a1, $fp + add.d $fp, $a0, $fp .LBB1_19: # in Loop: Header=BB1_20 Depth=2 - move $a3, $a0 + move $a3, $a1 andi $a4, $fp, 7 - addi.w $a0, $a0, 1 - st.w $a0, $s0, 140 - bstrpick.d $a1, $a3, 31, 0 - slli.d $a5, $a1, 1 - pcalau12i $a1, %pc_hi20(inflateBack.order) - addi.d $a1, $a1, %pc_lo12(inflateBack.order) - ldx.hu $a5, $a1, $a5 + addi.w $a1, $a1, 1 + st.w $a1, $s0, 140 + bstrpick.d $a0, $a3, 31, 0 + slli.d $a5, $a0, 1 + pcalau12i $a0, %pc_hi20(inflateBack.order) + addi.d $a0, $a0, %pc_lo12(inflateBack.order) + ldx.hu $a5, $a0, $a5 slli.d $a5, $a5, 1 stx.h $a4, $s1, $a5 srli.d $fp, $fp, 3 addi.w $s3, $s3, -3 - bgeu $a0, $a2, .LBB1_48 + bgeu $a1, $a2, 
.LBB1_45 .LBB1_20: # %.preheader682 # Parent Loop BB1_6 Depth=1 # => This Inner Loop Header: Depth=2 - ori $a1, $zero, 2 - bltu $a1, $s3, .LBB1_19 + ori $a0, $zero, 2 + bltu $a0, $s3, .LBB1_19 # %bb.21: # %.lr.ph848 # in Loop: Header=BB1_20 Depth=2 bnez $s5, .LBB1_18 # %bb.22: # in Loop: Header=BB1_20 Depth=2 - addi.d $a1, $sp, 176 - ld.d $a0, $sp, 168 # 8-byte Folded Reload - ld.d $a2, $sp, 160 # 8-byte Folded Reload + addi.d $a1, $sp, 160 + ld.d $a0, $sp, 152 # 8-byte Folded Reload + ld.d $a2, $sp, 144 # 8-byte Folded Reload jirl $ra, $a2, 0 bnez $a0, .LBB1_17 - b .LBB1_198 + b .LBB1_197 .p2align 4, , 16 .LBB1_23: # in Loop: Header=BB1_6 Depth=1 addi.w $a0, $s3, 0 @@ -324,9 +320,9 @@ inflateBack: # @inflateBack b .LBB1_26 .p2align 4, , 16 .LBB1_25: # in Loop: Header=BB1_26 Depth=2 - ld.d $a0, $sp, 176 + ld.d $a0, $sp, 160 addi.d $a1, $a0, 1 - st.d $a1, $sp, 176 + st.d $a1, $sp, 160 ld.bu $a0, $a0, 0 addi.w $s5, $s5, -1 sll.d $a0, $a0, $s1 @@ -341,14 +337,13 @@ inflateBack: # @inflateBack move $s1, $a0 bnez $s5, .LBB1_25 # %bb.27: # in Loop: Header=BB1_26 Depth=2 - addi.d $a1, $sp, 176 - ld.d $a0, $sp, 168 # 8-byte Folded Reload - ld.d $a2, $sp, 160 # 8-byte Folded Reload + addi.d $a1, $sp, 160 + ld.d $a0, $sp, 152 # 8-byte Folded Reload + ld.d $a2, $sp, 144 # 8-byte Folded Reload jirl $ra, $a2, 0 - vld $vr4, $sp, 144 # 16-byte Folded Reload move $s5, $a0 bnez $a0, .LBB1_25 - b .LBB1_198 + b .LBB1_197 .LBB1_28: # in Loop: Header=BB1_6 Depth=1 bstrins.d $s3, $zero, 2, 0 .LBB1_29: # %._crit_edge1013 @@ -360,7 +355,7 @@ inflateBack: # @inflateBack bne $a0, $a1, .LBB1_38 # %bb.30: # in Loop: Header=BB1_6 Depth=1 st.w $s1, $s0, 92 - beqz $s1, .LBB1_56 + beqz $s1, .LBB1_54 # %bb.31: # %.lr.ph1023.preheader # in Loop: Header=BB1_6 Depth=1 ld.d $s3, $sp, 96 # 8-byte Folded Reload @@ -374,7 +369,7 @@ inflateBack: # @inflateBack or $a0, $a1, $a0 sltu $a2, $a0, $s8 maskeqz $a0, $a0, $a2 - ld.d $a1, $sp, 176 + ld.d $a1, $sp, 160 masknez $a2, $s8, $a2 or $fp, $a0, $a2 bstrpick.d $s1, $fp, 31, 0 @@ -382,27 +377,27 @@ inflateBack: # @inflateBack move $a2, $s1 pcaddu18i $ra, %call36(memcpy) jirl $ra, $ra, 0 - ld.d $a0, $sp, 176 + ld.d $a0, $sp, 160 sub.w $s5, $s5, $fp add.d $a0, $a0, $s1 ld.w $a1, $s0, 92 - st.d $a0, $sp, 176 + st.d $a0, $sp, 160 sub.w $s8, $s8, $fp add.d $s2, $s2, $s1 sub.w $s1, $a1, $fp st.w $s1, $s0, 92 - beqz $s1, .LBB1_56 + beqz $s1, .LBB1_54 .LBB1_33: # %.lr.ph1023 # Parent Loop BB1_6 Depth=1 # => This Inner Loop Header: Depth=2 bnez $s5, .LBB1_35 # %bb.34: # in Loop: Header=BB1_33 Depth=2 - addi.d $a1, $sp, 176 - ld.d $a0, $sp, 168 # 8-byte Folded Reload - ld.d $a2, $sp, 160 # 8-byte Folded Reload + addi.d $a1, $sp, 160 + ld.d $a0, $sp, 152 # 8-byte Folded Reload + ld.d $a2, $sp, 144 # 8-byte Folded Reload jirl $ra, $a2, 0 move $s5, $a0 - beqz $a0, .LBB1_198 + beqz $a0, .LBB1_197 .LBB1_35: # in Loop: Header=BB1_33 Depth=2 bnez $s8, .LBB1_32 # %bb.36: # in Loop: Header=BB1_33 Depth=2 @@ -414,53 +409,41 @@ inflateBack: # @inflateBack move $a2, $s8 jirl $ra, $s3, 0 beqz $a0, .LBB1_32 - b .LBB1_199 + b .LBB1_198 .LBB1_37: # in Loop: Header=BB1_6 Depth=1 pcalau12i $a0, %pc_hi20(.L.str.3) addi.d $a0, $a0, %pc_lo12(.L.str.3) - b .LBB1_39 + b .LBB1_53 .LBB1_38: # in Loop: Header=BB1_6 Depth=1 pcalau12i $a0, %pc_hi20(.L.str.2) addi.d $a0, $a0, %pc_lo12(.L.str.2) -.LBB1_39: # %.thread - # in Loop: Header=BB1_6 Depth=1 - st.d $a0, $s6, 48 - lu12i.w $a0, 3 - ori $a0, $a0, 3921 -.LBB1_40: # %.thread - # in Loop: Header=BB1_6 Depth=1 - st.w $a0, $s0, 8 -.LBB1_41: # %.thread - # in Loop: 
Header=BB1_6 Depth=1 - ld.w $a0, $s0, 8 - b .LBB1_6 -.LBB1_42: # %.preheader683 + b .LBB1_53 +.LBB1_39: # %.preheader683 # in Loop: Header=BB1_6 Depth=1 addi.w $s1, $s3, 0 ori $a0, $zero, 2 - bltu $a0, $s1, .LBB1_46 -# %bb.43: # %.lr.ph1031 + bltu $a0, $s1, .LBB1_43 +# %bb.40: # %.lr.ph1031 # in Loop: Header=BB1_6 Depth=1 - bnez $s5, .LBB1_45 -# %bb.44: # in Loop: Header=BB1_6 Depth=1 - addi.d $a1, $sp, 176 - ld.d $a0, $sp, 168 # 8-byte Folded Reload - ld.d $a2, $sp, 160 # 8-byte Folded Reload + bnez $s5, .LBB1_42 +# %bb.41: # in Loop: Header=BB1_6 Depth=1 + addi.d $a1, $sp, 160 + ld.d $a0, $sp, 152 # 8-byte Folded Reload + ld.d $a2, $sp, 144 # 8-byte Folded Reload jirl $ra, $a2, 0 - vld $vr4, $sp, 144 # 16-byte Folded Reload move $s5, $a0 - beqz $a0, .LBB1_198 -.LBB1_45: # %._crit_edge1032 + beqz $a0, .LBB1_197 +.LBB1_42: # %._crit_edge1032 # in Loop: Header=BB1_6 Depth=1 - ld.d $a0, $sp, 176 + ld.d $a0, $sp, 160 addi.d $a1, $a0, 1 - st.d $a1, $sp, 176 + st.d $a1, $sp, 160 ld.bu $a0, $a0, 0 ori $s3, $s3, 8 addi.w $s5, $s5, -1 sll.d $a0, $a0, $s1 add.d $fp, $a0, $fp -.LBB1_46: # in Loop: Header=BB1_6 Depth=1 +.LBB1_43: # in Loop: Header=BB1_6 Depth=1 bstrpick.d $a0, $fp, 2, 1 slli.d $a0, $a0, 2 ld.d $a2, $sp, 48 # 8-byte Folded Reload @@ -470,7 +453,7 @@ inflateBack: # @inflateBack st.w $a1, $s0, 12 ld.d $a0, $sp, 56 # 8-byte Folded Reload jr $a2 -.LBB1_47: # in Loop: Header=BB1_6 Depth=1 +.LBB1_44: # in Loop: Header=BB1_6 Depth=1 pcalau12i $a0, %pc_hi20(fixedtables.lenfix) addi.d $a0, $a0, %pc_lo12(fixedtables.lenfix) st.d $a0, $s0, 104 @@ -481,40 +464,40 @@ inflateBack: # @inflateBack st.d $a0, $s0, 112 lu12i.w $a0, 3 ori $a0, $a0, 3912 - b .LBB1_65 -.LBB1_48: # %.preheader692 + b .LBB1_64 +.LBB1_45: # %.preheader692 # in Loop: Header=BB1_6 Depth=1 ori $a2, $zero, 18 - bltu $a2, $a0, .LBB1_54 -# %bb.49: # %.lr.ph872.preheader + bltu $a2, $a1, .LBB1_51 +# %bb.46: # %.lr.ph872.preheader # in Loop: Header=BB1_6 Depth=1 addi.d $a2, $a3, 2 bstrpick.d $a4, $a2, 31, 0 ori $a2, $zero, 20 sub.d $a2, $a2, $a4 ori $a5, $zero, 16 - bgeu $a2, $a5, .LBB1_66 -.LBB1_50: # in Loop: Header=BB1_6 Depth=1 + bgeu $a2, $a5, .LBB1_65 +.LBB1_47: # in Loop: Header=BB1_6 Depth=1 ori $a4, $zero, 19 -.LBB1_51: # %.lr.ph872.preheader1546 +.LBB1_48: # %.lr.ph872.preheader1546 # in Loop: Header=BB1_6 Depth=1 - alsl.d $a1, $a0, $a1, 1 - addi.d $a0, $a0, 1 + alsl.d $a0, $a1, $a0, 1 + addi.d $a1, $a1, 1 .p2align 4, , 16 -.LBB1_52: # %.lr.ph872 +.LBB1_49: # %.lr.ph872 # Parent Loop BB1_6 Depth=1 # => This Inner Loop Header: Depth=2 - ld.hu $a2, $a1, 0 + ld.hu $a2, $a0, 0 slli.d $a2, $a2, 1 stx.h $zero, $s1, $a2 - bstrpick.d $a2, $a0, 31, 0 - addi.d $a1, $a1, 2 - addi.d $a0, $a0, 1 - bne $a2, $a4, .LBB1_52 -.LBB1_53: # %._crit_edge873 + bstrpick.d $a2, $a1, 31, 0 + addi.d $a0, $a0, 2 + addi.d $a1, $a1, 1 + bne $a2, $a4, .LBB1_49 +.LBB1_50: # %._crit_edge873 # in Loop: Header=BB1_6 Depth=1 st.w $a4, $s0, 140 -.LBB1_54: # in Loop: Header=BB1_6 Depth=1 +.LBB1_51: # in Loop: Header=BB1_6 Depth=1 ld.d $a0, $sp, 72 # 8-byte Folded Reload st.d $a0, $s0, 144 st.d $a0, $s0, 104 @@ -528,43 +511,46 @@ inflateBack: # @inflateBack ld.d $a5, $sp, 80 # 8-byte Folded Reload pcaddu18i $ra, %call36(inflate_table) jirl $ra, $ra, 0 - beqz $a0, .LBB1_58 -# %bb.55: # in Loop: Header=BB1_6 Depth=1 + beqz $a0, .LBB1_57 +# %bb.52: # in Loop: Header=BB1_6 Depth=1 pcalau12i $a0, %pc_hi20(.L.str.4) addi.d $a0, $a0, %pc_lo12(.L.str.4) +.LBB1_53: # %.thread + # in Loop: Header=BB1_6 Depth=1 st.d $a0, $s6, 48 lu12i.w $a0, 3 ori $a0, $a0, 3921 - b 
.LBB1_57 -.LBB1_56: # %._crit_edge1024 + b .LBB1_55 +.LBB1_54: # %._crit_edge1024 # in Loop: Header=BB1_6 Depth=1 move $fp, $zero move $s3, $zero lu12i.w $a0, 3 ori $a0, $a0, 3903 -.LBB1_57: # %.thread + .p2align 4, , 16 +.LBB1_55: # %.thread # in Loop: Header=BB1_6 Depth=1 st.w $a0, $s0, 8 - vld $vr4, $sp, 144 # 16-byte Folded Reload +.LBB1_56: # %.thread + # in Loop: Header=BB1_6 Depth=1 ld.w $a0, $s0, 8 b .LBB1_6 -.LBB1_58: # in Loop: Header=BB1_6 Depth=1 +.LBB1_57: # in Loop: Header=BB1_6 Depth=1 ld.w $a2, $s0, 132 ld.w $a0, $s0, 136 sub.w $a1, $zero, $a2 st.w $zero, $s0, 140 - vld $vr4, $sp, 144 # 16-byte Folded Reload - bne $a0, $a1, .LBB1_71 -.LBB1_59: # %._crit_edge923 + bne $a0, $a1, .LBB1_70 +.LBB1_58: # %._crit_edge923 # in Loop: Header=BB1_6 Depth=1 ld.w $a0, $s0, 8 lu12i.w $a1, 3 ori $s1, $a1, 3921 - beq $a0, $s1, .LBB1_41 -# %bb.60: # in Loop: Header=BB1_6 Depth=1 + beq $a0, $s1, .LBB1_56 +# %bb.59: # in Loop: Header=BB1_6 Depth=1 ld.hu $a0, $s0, 664 - beqz $a0, .LBB1_113 -# %bb.61: # in Loop: Header=BB1_6 Depth=1 + beqz $a0, .LBB1_112 +# %bb.60: # in Loop: Header=BB1_6 Depth=1 ld.d $a0, $sp, 72 # 8-byte Folded Reload st.d $a0, $s0, 144 st.d $a0, $s0, 104 @@ -577,58 +563,63 @@ inflateBack: # @inflateBack ld.d $a5, $sp, 80 # 8-byte Folded Reload pcaddu18i $ra, %call36(inflate_table) jirl $ra, $ra, 0 - beqz $a0, .LBB1_114 -# %bb.62: # in Loop: Header=BB1_6 Depth=1 + beqz $a0, .LBB1_113 +# %bb.61: # in Loop: Header=BB1_6 Depth=1 pcalau12i $a0, %pc_hi20(.L.str.7) addi.d $a0, $a0, %pc_lo12(.L.str.7) - b .LBB1_116 -.LBB1_63: # in Loop: Header=BB1_6 Depth=1 + b .LBB1_115 +.LBB1_62: # in Loop: Header=BB1_6 Depth=1 lu12i.w $a0, 3 ori $a0, $a0, 3908 - b .LBB1_65 -.LBB1_64: # in Loop: Header=BB1_6 Depth=1 + b .LBB1_64 +.LBB1_63: # in Loop: Header=BB1_6 Depth=1 pcalau12i $a0, %pc_hi20(.L.str.1) addi.d $a0, $a0, %pc_lo12(.L.str.1) st.d $a0, $s6, 48 lu12i.w $a0, 3 ori $a0, $a0, 3921 -.LBB1_65: # in Loop: Header=BB1_6 Depth=1 +.LBB1_64: # in Loop: Header=BB1_6 Depth=1 st.w $a0, $s0, 8 srli.d $fp, $fp, 3 addi.w $s3, $s3, -3 ld.w $a0, $s0, 8 b .LBB1_6 -.LBB1_66: # %vector.scevcheck1524 +.LBB1_65: # %vector.scevcheck1524 # in Loop: Header=BB1_6 Depth=1 ori $a5, $zero, 19 sub.d $a4, $a5, $a4 addi.w $a5, $a4, 0 addi.d $a6, $zero, -3 sub.w $a3, $a6, $a3 - bltu $a3, $a5, .LBB1_50 -# %bb.67: # %vector.scevcheck1524 + bltu $a3, $a5, .LBB1_47 +# %bb.66: # %vector.scevcheck1524 # in Loop: Header=BB1_6 Depth=1 srli.d $a3, $a4, 32 ori $a4, $zero, 19 - bnez $a3, .LBB1_51 -# %bb.68: # %vector.ph1529 + bnez $a3, .LBB1_48 +# %bb.67: # %vector.ph1529 # in Loop: Header=BB1_6 Depth=1 move $a3, $a2 bstrins.d $a3, $zero, 2, 0 - add.d $a4, $a3, $a0 - alsl.d $a0, $a0, $a1, 1 + add.d $a4, $a3, $a1 + alsl.d $a1, $a1, $a0, 1 move $a5, $a3 .p2align 4, , 16 -.LBB1_69: # %vector.body1532 +.LBB1_68: # %vector.body1532 # Parent Loop BB1_6 Depth=1 # => This Inner Loop Header: Depth=2 - vld $vr0, $a0, 0 - vilvh.h $vr1, $vr4, $vr0 - vilvh.w $vr2, $vr4, $vr1 - vilvl.w $vr1, $vr4, $vr1 - vilvl.h $vr0, $vr4, $vr0 - vilvh.w $vr3, $vr4, $vr0 - vilvl.w $vr0, $vr4, $vr0 + vld $vr0, $a1, 0 + vbsrl.v $vr1, $vr0, 12 + vsllwil.wu.hu $vr1, $vr1, 0 + vsllwil.du.wu $vr1, $vr1, 0 + vbsrl.v $vr2, $vr0, 8 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.du.wu $vr2, $vr2, 0 + vshuf4i.h $vr3, $vr0, 14 + vsllwil.wu.hu $vr3, $vr3, 0 + vsllwil.du.wu $vr3, $vr3, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.du.wu $vr0, $vr0, 0 vpickve2gr.d $a6, $vr0, 0 slli.d $a6, $a6, 1 vpickve2gr.d $a7, $vr0, 1 @@ -637,13 +628,13 @@ inflateBack: # @inflateBack slli.d $t0, 
$t0, 1 vpickve2gr.d $t1, $vr3, 1 slli.d $t1, $t1, 1 - vpickve2gr.d $t2, $vr1, 0 + vpickve2gr.d $t2, $vr2, 0 slli.d $t2, $t2, 1 - vpickve2gr.d $t3, $vr1, 1 + vpickve2gr.d $t3, $vr2, 1 slli.d $t3, $t3, 1 - vpickve2gr.d $t4, $vr2, 0 + vpickve2gr.d $t4, $vr1, 0 slli.d $t4, $t4, 1 - vpickve2gr.d $t5, $vr2, 1 + vpickve2gr.d $t5, $vr1, 1 slli.d $t5, $t5, 1 stx.h $zero, $s1, $a6 stx.h $zero, $s1, $a7 @@ -654,20 +645,20 @@ inflateBack: # @inflateBack stx.h $zero, $s1, $t4 stx.h $zero, $s1, $t5 addi.d $a5, $a5, -8 - addi.d $a0, $a0, 16 - bnez $a5, .LBB1_69 -# %bb.70: # %middle.block1537 + addi.d $a1, $a1, 16 + bnez $a5, .LBB1_68 +# %bb.69: # %middle.block1537 # in Loop: Header=BB1_6 Depth=1 - move $a0, $a4 + move $a1, $a4 ori $a4, $zero, 19 - bne $a2, $a3, .LBB1_51 - b .LBB1_53 -.LBB1_71: # %.preheader681.preheader + bne $a2, $a3, .LBB1_48 + b .LBB1_50 +.LBB1_70: # %.preheader681.preheader # in Loop: Header=BB1_6 Depth=1 st.d $s6, $sp, 16 # 8-byte Folded Spill st.d $s4, $sp, 24 # 8-byte Folded Spill - b .LBB1_73 -.LBB1_72: # in Loop: Header=BB1_73 Depth=2 + b .LBB1_72 +.LBB1_71: # in Loop: Header=BB1_72 Depth=2 ld.wu $a1, $s0, 140 srl.d $fp, $fp, $s1 sub.w $s3, $s3, $s1 @@ -679,16 +670,16 @@ inflateBack: # @inflateBack ld.d $a5, $sp, 104 # 8-byte Folded Reload stx.h $a0, $a5, $a1 add.w $a1, $a4, $a2 - bgeu $a3, $a1, .LBB1_59 -.LBB1_73: # %.preheader681 + bgeu $a3, $a1, .LBB1_58 +.LBB1_72: # %.preheader681 # Parent Loop BB1_6 Depth=1 # => This Loop Header: Depth=2 - # Child Loop BB1_77 Depth 3 - # Child Loop BB1_96 Depth 3 - # Child Loop BB1_86 Depth 3 - # Child Loop BB1_91 Depth 3 - # Child Loop BB1_107 Depth 3 - # Child Loop BB1_111 Depth 3 + # Child Loop BB1_76 Depth 3 + # Child Loop BB1_95 Depth 3 + # Child Loop BB1_85 Depth 3 + # Child Loop BB1_90 Depth 3 + # Child Loop BB1_106 Depth 3 + # Child Loop BB1_110 Depth 3 ld.wu $a0, $s0, 120 ld.d $a1, $s0, 104 addi.d $s7, $zero, -1 @@ -697,22 +688,21 @@ inflateBack: # @inflateBack addi.w $a2, $a2, 0 alsl.d $a2, $a2, $a1, 2 ld.bu $s1, $a2, 1 - bgeu $s3, $s1, .LBB1_80 -# %bb.74: # %.lr.ph885.preheader - # in Loop: Header=BB1_73 Depth=2 + bgeu $s3, $s1, .LBB1_79 +# %bb.73: # %.lr.ph885.preheader + # in Loop: Header=BB1_72 Depth=2 move $s4, $s3 - b .LBB1_77 + b .LBB1_76 .p2align 4, , 16 -.LBB1_75: # %._crit_edge1242 - # in Loop: Header=BB1_77 Depth=3 +.LBB1_74: # %._crit_edge1242 + # in Loop: Header=BB1_76 Depth=3 move $s5, $a0 ld.d $a1, $s0, 104 ld.w $a0, $s0, 120 - vld $vr4, $sp, 144 # 16-byte Folded Reload -.LBB1_76: # in Loop: Header=BB1_77 Depth=3 - ld.d $a2, $sp, 176 +.LBB1_75: # in Loop: Header=BB1_76 Depth=3 + ld.d $a2, $sp, 160 addi.d $a3, $a2, 1 - st.d $a3, $sp, 176 + st.d $a3, $sp, 160 ld.bu $a2, $a2, 0 sll.d $a2, $a2, $s4 add.d $fp, $a2, $fp @@ -724,140 +714,137 @@ inflateBack: # @inflateBack addi.w $s5, $s5, -1 addi.d $s4, $s4, 8 addi.w $s3, $s3, 8 - bgeu $s4, $s1, .LBB1_79 -.LBB1_77: # %.lr.ph885 + bgeu $s4, $s1, .LBB1_78 +.LBB1_76: # %.lr.ph885 # Parent Loop BB1_6 Depth=1 - # Parent Loop BB1_73 Depth=2 + # Parent Loop BB1_72 Depth=2 # => This Inner Loop Header: Depth=3 - bnez $s5, .LBB1_76 -# %bb.78: # in Loop: Header=BB1_77 Depth=3 - addi.d $a1, $sp, 176 - ld.d $a0, $sp, 168 # 8-byte Folded Reload - ld.d $a2, $sp, 160 # 8-byte Folded Reload + bnez $s5, .LBB1_75 +# %bb.77: # in Loop: Header=BB1_76 Depth=3 + addi.d $a1, $sp, 160 + ld.d $a0, $sp, 152 # 8-byte Folded Reload + ld.d $a2, $sp, 144 # 8-byte Folded Reload jirl $ra, $a2, 0 - bnez $a0, .LBB1_75 - b .LBB1_198 -.LBB1_79: # %._crit_edge886.loopexit - # in Loop: Header=BB1_73 Depth=2 + 
bnez $a0, .LBB1_74 + b .LBB1_197 +.LBB1_78: # %._crit_edge886.loopexit + # in Loop: Header=BB1_72 Depth=2 ld.d $s4, $sp, 24 # 8-byte Folded Reload -.LBB1_80: # %._crit_edge886 - # in Loop: Header=BB1_73 Depth=2 +.LBB1_79: # %._crit_edge886 + # in Loop: Header=BB1_72 Depth=2 ld.hu $a0, $a2, 2 ld.d $s7, $sp, 120 # 8-byte Folded Reload ori $a1, $zero, 15 - bgeu $a1, $a0, .LBB1_72 -# %bb.81: # in Loop: Header=BB1_73 Depth=2 + bgeu $a1, $a0, .LBB1_71 +# %bb.80: # in Loop: Header=BB1_72 Depth=2 ori $a1, $zero, 16 - beq $a0, $a1, .LBB1_88 -# %bb.82: # in Loop: Header=BB1_73 Depth=2 + beq $a0, $a1, .LBB1_87 +# %bb.81: # in Loop: Header=BB1_72 Depth=2 move $s7, $s4 ori $a1, $zero, 17 - bne $a0, $a1, .LBB1_93 -# %bb.83: # %.preheader680 - # in Loop: Header=BB1_73 Depth=2 + bne $a0, $a1, .LBB1_92 +# %bb.82: # %.preheader680 + # in Loop: Header=BB1_72 Depth=2 addi.d $s4, $s1, 3 - bgeu $s3, $s4, .LBB1_98 -# %bb.84: # %.lr.ph897.preheader - # in Loop: Header=BB1_73 Depth=2 + bgeu $s3, $s4, .LBB1_97 +# %bb.83: # %.lr.ph897.preheader + # in Loop: Header=BB1_72 Depth=2 move $s6, $s3 - b .LBB1_86 + b .LBB1_85 .p2align 4, , 16 -.LBB1_85: # in Loop: Header=BB1_86 Depth=3 - ld.d $a0, $sp, 176 +.LBB1_84: # in Loop: Header=BB1_85 Depth=3 + ld.d $a0, $sp, 160 addi.d $a1, $a0, 1 - st.d $a1, $sp, 176 + st.d $a1, $sp, 160 ld.bu $a0, $a0, 0 addi.w $s5, $s5, -1 sll.d $a0, $a0, $s6 add.d $fp, $a0, $fp addi.d $s6, $s6, 8 addi.d $s3, $s3, 8 - bgeu $s6, $s4, .LBB1_98 -.LBB1_86: # %.lr.ph897 + bgeu $s6, $s4, .LBB1_97 +.LBB1_85: # %.lr.ph897 # Parent Loop BB1_6 Depth=1 - # Parent Loop BB1_73 Depth=2 + # Parent Loop BB1_72 Depth=2 # => This Inner Loop Header: Depth=3 - bnez $s5, .LBB1_85 -# %bb.87: # in Loop: Header=BB1_86 Depth=3 - addi.d $a1, $sp, 176 - ld.d $a0, $sp, 168 # 8-byte Folded Reload - ld.d $a2, $sp, 160 # 8-byte Folded Reload + bnez $s5, .LBB1_84 +# %bb.86: # in Loop: Header=BB1_85 Depth=3 + addi.d $a1, $sp, 160 + ld.d $a0, $sp, 152 # 8-byte Folded Reload + ld.d $a2, $sp, 144 # 8-byte Folded Reload jirl $ra, $a2, 0 - vld $vr4, $sp, 144 # 16-byte Folded Reload move $s5, $a0 - bnez $a0, .LBB1_85 - b .LBB1_202 -.LBB1_88: # %.preheader679 - # in Loop: Header=BB1_73 Depth=2 + bnez $a0, .LBB1_84 + b .LBB1_201 +.LBB1_87: # %.preheader679 + # in Loop: Header=BB1_72 Depth=2 move $s7, $s6 addi.d $s4, $s1, 2 - bgeu $s3, $s4, .LBB1_99 -# %bb.89: # %.lr.ph905.preheader - # in Loop: Header=BB1_73 Depth=2 + bgeu $s3, $s4, .LBB1_98 +# %bb.88: # %.lr.ph905.preheader + # in Loop: Header=BB1_72 Depth=2 move $s6, $s3 - b .LBB1_91 + b .LBB1_90 .p2align 4, , 16 -.LBB1_90: # in Loop: Header=BB1_91 Depth=3 - ld.d $a0, $sp, 176 +.LBB1_89: # in Loop: Header=BB1_90 Depth=3 + ld.d $a0, $sp, 160 addi.d $a1, $a0, 1 - st.d $a1, $sp, 176 + st.d $a1, $sp, 160 ld.bu $a0, $a0, 0 addi.w $s5, $s5, -1 sll.d $a0, $a0, $s6 add.d $fp, $a0, $fp addi.d $s6, $s6, 8 addi.d $s3, $s3, 8 - bgeu $s6, $s4, .LBB1_99 -.LBB1_91: # %.lr.ph905 + bgeu $s6, $s4, .LBB1_98 +.LBB1_90: # %.lr.ph905 # Parent Loop BB1_6 Depth=1 - # Parent Loop BB1_73 Depth=2 + # Parent Loop BB1_72 Depth=2 # => This Inner Loop Header: Depth=3 - bnez $s5, .LBB1_90 -# %bb.92: # in Loop: Header=BB1_91 Depth=3 - addi.d $a1, $sp, 176 - ld.d $a0, $sp, 168 # 8-byte Folded Reload - ld.d $a2, $sp, 160 # 8-byte Folded Reload + bnez $s5, .LBB1_89 +# %bb.91: # in Loop: Header=BB1_90 Depth=3 + addi.d $a1, $sp, 160 + ld.d $a0, $sp, 152 # 8-byte Folded Reload + ld.d $a2, $sp, 144 # 8-byte Folded Reload jirl $ra, $a2, 0 - vld $vr4, $sp, 144 # 16-byte Folded Reload move $s5, $a0 - bnez $a0, .LBB1_90 - b 
.LBB1_206 -.LBB1_93: # %.preheader678 - # in Loop: Header=BB1_73 Depth=2 + bnez $a0, .LBB1_89 + b .LBB1_205 +.LBB1_92: # %.preheader678 + # in Loop: Header=BB1_72 Depth=2 addi.d $s4, $s1, 7 - bgeu $s3, $s4, .LBB1_101 -# %bb.94: # %.lr.ph913.preheader - # in Loop: Header=BB1_73 Depth=2 + bgeu $s3, $s4, .LBB1_100 +# %bb.93: # %.lr.ph913.preheader + # in Loop: Header=BB1_72 Depth=2 move $s6, $s3 - b .LBB1_96 + b .LBB1_95 .p2align 4, , 16 -.LBB1_95: # in Loop: Header=BB1_96 Depth=3 - ld.d $a0, $sp, 176 +.LBB1_94: # in Loop: Header=BB1_95 Depth=3 + ld.d $a0, $sp, 160 addi.d $a1, $a0, 1 - st.d $a1, $sp, 176 + st.d $a1, $sp, 160 ld.bu $a0, $a0, 0 addi.w $s5, $s5, -1 sll.d $a0, $a0, $s6 add.d $fp, $a0, $fp addi.d $s6, $s6, 8 addi.d $s3, $s3, 8 - bgeu $s6, $s4, .LBB1_101 -.LBB1_96: # %.lr.ph913 + bgeu $s6, $s4, .LBB1_100 +.LBB1_95: # %.lr.ph913 # Parent Loop BB1_6 Depth=1 - # Parent Loop BB1_73 Depth=2 + # Parent Loop BB1_72 Depth=2 # => This Inner Loop Header: Depth=3 - bnez $s5, .LBB1_95 -# %bb.97: # in Loop: Header=BB1_96 Depth=3 - addi.d $a1, $sp, 176 - ld.d $a0, $sp, 168 # 8-byte Folded Reload - ld.d $a2, $sp, 160 # 8-byte Folded Reload + bnez $s5, .LBB1_94 +# %bb.96: # in Loop: Header=BB1_95 Depth=3 + addi.d $a1, $sp, 160 + ld.d $a0, $sp, 152 # 8-byte Folded Reload + ld.d $a2, $sp, 144 # 8-byte Folded Reload jirl $ra, $a2, 0 - vld $vr4, $sp, 144 # 16-byte Folded Reload move $s5, $a0 - bnez $a0, .LBB1_95 - b .LBB1_202 -.LBB1_98: # %._crit_edge898 - # in Loop: Header=BB1_73 Depth=2 + bnez $a0, .LBB1_94 + b .LBB1_201 +.LBB1_97: # %._crit_edge898 + # in Loop: Header=BB1_72 Depth=2 move $a0, $zero srl.d $a1, $fp, $s1 andi $a2, $a1, 7 @@ -865,14 +852,14 @@ inflateBack: # @inflateBack srli.d $fp, $a1, 3 sub.d $a1, $s3, $s1 addi.w $s3, $a1, -3 - b .LBB1_102 -.LBB1_99: # %._crit_edge906 - # in Loop: Header=BB1_73 Depth=2 + b .LBB1_101 +.LBB1_98: # %._crit_edge906 + # in Loop: Header=BB1_72 Depth=2 ld.w $a0, $s0, 140 srl.d $fp, $fp, $s1 sub.w $s3, $s3, $s1 - beqz $a0, .LBB1_197 -# %bb.100: # in Loop: Header=BB1_73 Depth=2 + beqz $a0, .LBB1_196 +# %bb.99: # in Loop: Header=BB1_72 Depth=2 addi.d $a0, $a0, -1 bstrpick.d $a0, $a0, 31, 0 slli.d $a0, $a0, 1 @@ -884,9 +871,9 @@ inflateBack: # @inflateBack addi.w $s3, $s3, -2 move $s6, $s7 ld.d $s4, $sp, 24 # 8-byte Folded Reload - b .LBB1_103 -.LBB1_101: # %._crit_edge914 - # in Loop: Header=BB1_73 Depth=2 + b .LBB1_102 +.LBB1_100: # %._crit_edge914 + # in Loop: Header=BB1_72 Depth=2 move $a0, $zero srl.d $a1, $fp, $s1 andi $a2, $a1, 127 @@ -894,37 +881,37 @@ inflateBack: # @inflateBack srli.d $fp, $a1, 7 sub.d $a1, $s3, $s1 addi.w $s3, $a1, -7 -.LBB1_102: # in Loop: Header=BB1_73 Depth=2 +.LBB1_101: # in Loop: Header=BB1_72 Depth=2 ld.d $s6, $sp, 16 # 8-byte Folded Reload move $s4, $s7 ld.d $t3, $sp, 104 # 8-byte Folded Reload -.LBB1_103: # in Loop: Header=BB1_73 Depth=2 +.LBB1_102: # in Loop: Header=BB1_72 Depth=2 ld.w $a6, $s0, 140 ld.w $a2, $s0, 132 ld.w $a1, $s0, 136 add.w $a3, $a6, $a4 add.w $a1, $a1, $a2 ld.d $s7, $sp, 120 # 8-byte Folded Reload - bltu $a1, $a3, .LBB1_117 -# %bb.104: # %.preheader.preheader - # in Loop: Header=BB1_73 Depth=2 + bltu $a1, $a3, .LBB1_116 +# %bb.103: # %.preheader.preheader + # in Loop: Header=BB1_72 Depth=2 ori $a3, $zero, 16 - bltu $a4, $a3, .LBB1_109 -# %bb.105: # %.preheader.preheader - # in Loop: Header=BB1_73 Depth=2 + bltu $a4, $a3, .LBB1_108 +# %bb.104: # %.preheader.preheader + # in Loop: Header=BB1_72 Depth=2 sub.d $a3, $zero, $a4 - bltu $a3, $a6, .LBB1_109 -# %bb.106: # %vector.ph1514 - # in Loop: Header=BB1_73 
Depth=2 + bltu $a3, $a6, .LBB1_108 +# %bb.105: # %vector.ph1514 + # in Loop: Header=BB1_72 Depth=2 andi $a7, $a4, 240 andi $a5, $a4, 15 add.w $a3, $a6, $a7 vreplgr2vr.h $vr0, $a0 move $t0, $a7 .p2align 4, , 16 -.LBB1_107: # %vector.body1517 +.LBB1_106: # %vector.body1517 # Parent Loop BB1_6 Depth=1 - # Parent Loop BB1_73 Depth=2 + # Parent Loop BB1_72 Depth=2 # => This Inner Loop Header: Depth=3 bstrpick.d $t1, $a6, 31, 0 alsl.d $t2, $t1, $t3, 1 @@ -933,21 +920,21 @@ inflateBack: # @inflateBack vst $vr0, $t2, 16 addi.w $t0, $t0, -16 addi.w $a6, $a6, 16 - bnez $t0, .LBB1_107 -# %bb.108: # %middle.block1520 - # in Loop: Header=BB1_73 Depth=2 - bne $a4, $a7, .LBB1_110 - b .LBB1_112 -.LBB1_109: # in Loop: Header=BB1_73 Depth=2 + bnez $t0, .LBB1_106 +# %bb.107: # %middle.block1520 + # in Loop: Header=BB1_72 Depth=2 + bne $a4, $a7, .LBB1_109 + b .LBB1_111 +.LBB1_108: # in Loop: Header=BB1_72 Depth=2 move $a5, $a4 move $a3, $a6 -.LBB1_110: # %.preheader.preheader1544 - # in Loop: Header=BB1_73 Depth=2 +.LBB1_109: # %.preheader.preheader1544 + # in Loop: Header=BB1_72 Depth=2 move $a4, $a3 .p2align 4, , 16 -.LBB1_111: # %.preheader +.LBB1_110: # %.preheader # Parent Loop BB1_6 Depth=1 - # Parent Loop BB1_73 Depth=2 + # Parent Loop BB1_72 Depth=2 # => This Inner Loop Header: Depth=3 addi.w $a5, $a5, -1 addi.w $a3, $a4, 1 @@ -955,20 +942,17 @@ inflateBack: # @inflateBack slli.d $a4, $a4, 1 stx.h $a0, $t3, $a4 move $a4, $a3 - bnez $a5, .LBB1_111 -.LBB1_112: # %.loopexit - # in Loop: Header=BB1_73 Depth=2 + bnez $a5, .LBB1_110 +.LBB1_111: # %.loopexit + # in Loop: Header=BB1_72 Depth=2 st.w $a3, $s0, 140 - bltu $a3, $a1, .LBB1_73 - b .LBB1_59 -.LBB1_113: # in Loop: Header=BB1_6 Depth=1 + bltu $a3, $a1, .LBB1_72 + b .LBB1_58 +.LBB1_112: # in Loop: Header=BB1_6 Depth=1 pcalau12i $a0, %pc_hi20(.L.str.6) addi.d $a0, $a0, %pc_lo12(.L.str.6) - st.d $a0, $s6, 48 - st.w $s1, $s0, 8 - ld.w $a0, $s0, 8 - b .LBB1_6 -.LBB1_114: # in Loop: Header=BB1_6 Depth=1 + b .LBB1_115 +.LBB1_113: # in Loop: Header=BB1_6 Depth=1 ld.d $a0, $s0, 144 ld.wu $a1, $s0, 132 ld.w $a2, $s0, 136 @@ -983,36 +967,34 @@ inflateBack: # @inflateBack ld.d $a5, $sp, 80 # 8-byte Folded Reload pcaddu18i $ra, %call36(inflate_table) jirl $ra, $ra, 0 - beqz $a0, .LBB1_118 -# %bb.115: # in Loop: Header=BB1_6 Depth=1 + beqz $a0, .LBB1_117 +# %bb.114: # in Loop: Header=BB1_6 Depth=1 pcalau12i $a0, %pc_hi20(.L.str.8) addi.d $a0, $a0, %pc_lo12(.L.str.8) -.LBB1_116: # %.thread +.LBB1_115: # %.thread # in Loop: Header=BB1_6 Depth=1 st.d $a0, $s6, 48 st.w $s1, $s0, 8 - vld $vr4, $sp, 144 # 16-byte Folded Reload ld.w $a0, $s0, 8 b .LBB1_6 -.LBB1_117: # in Loop: Header=BB1_6 Depth=1 +.LBB1_116: # in Loop: Header=BB1_6 Depth=1 pcalau12i $a0, %pc_hi20(.L.str.5) addi.d $a0, $a0, %pc_lo12(.L.str.5) - b .LBB1_39 -.LBB1_118: # in Loop: Header=BB1_6 Depth=1 + b .LBB1_53 +.LBB1_117: # in Loop: Header=BB1_6 Depth=1 lu12i.w $a0, 3 ori $a0, $a0, 3912 st.w $a0, $s0, 8 - vld $vr4, $sp, 144 # 16-byte Folded Reload .p2align 4, , 16 -.LBB1_119: # in Loop: Header=BB1_6 Depth=1 +.LBB1_118: # in Loop: Header=BB1_6 Depth=1 ori $a0, $zero, 6 - bltu $s5, $a0, .LBB1_124 -# %bb.120: # in Loop: Header=BB1_6 Depth=1 + bltu $s5, $a0, .LBB1_123 +# %bb.119: # in Loop: Header=BB1_6 Depth=1 ori $a0, $zero, 257 - bgeu $a0, $s8, .LBB1_124 -# %bb.121: # in Loop: Header=BB1_6 Depth=1 + bgeu $a0, $s8, .LBB1_123 +# %bb.120: # in Loop: Header=BB1_6 Depth=1 st.d $s2, $s6, 24 - ld.d $a0, $sp, 176 + ld.d $a0, $sp, 160 st.w $s8, $s6, 32 ld.w $a2, $s0, 64 ld.w $a1, $s0, 60 @@ -1020,11 +1002,11 @@ 
inflateBack: # @inflateBack st.w $s5, $s6, 8 st.d $fp, $s0, 80 st.w $s3, $s0, 88 - bgeu $a2, $a1, .LBB1_123 -# %bb.122: # in Loop: Header=BB1_6 Depth=1 + bgeu $a2, $a1, .LBB1_122 +# %bb.121: # in Loop: Header=BB1_6 Depth=1 sub.d $a0, $a1, $s8 st.w $a0, $s0, 64 -.LBB1_123: # in Loop: Header=BB1_6 Depth=1 +.LBB1_122: # in Loop: Header=BB1_6 Depth=1 move $a0, $s6 pcaddu18i $ra, %call36(inflate_fast) jirl $ra, $ra, 0 @@ -1034,11 +1016,10 @@ inflateBack: # @inflateBack ld.w $s5, $s6, 8 ld.d $fp, $s0, 80 ld.w $s3, $s0, 88 - st.d $a0, $sp, 176 - vld $vr4, $sp, 144 # 16-byte Folded Reload + st.d $a0, $sp, 160 ld.w $a0, $s0, 8 b .LBB1_6 -.LBB1_124: # %.preheader691 +.LBB1_123: # %.preheader691 # in Loop: Header=BB1_6 Depth=1 ld.wu $a1, $s0, 120 ld.d $a0, $s0, 104 @@ -1049,15 +1030,15 @@ inflateBack: # @inflateBack alsl.d $a2, $a2, $a0, 2 ld.bu $s1, $a2, 1 addi.w $s7, $s3, 0 - bltu $s7, $s1, .LBB1_130 -.LBB1_125: # %._crit_edge937 + bltu $s7, $s1, .LBB1_129 +.LBB1_124: # %._crit_edge937 # in Loop: Header=BB1_6 Depth=1 ld.bu $a1, $a2, 0 ld.hu $s7, $a2, 2 addi.d $a2, $a1, -1 ori $a3, $zero, 14 - bltu $a3, $a2, .LBB1_137 -# %bb.126: # %.preheader690 + bltu $a3, $a2, .LBB1_136 +# %bb.125: # %.preheader690 # in Loop: Header=BB1_6 Depth=1 st.d $s4, $sp, 24 # 8-byte Folded Spill st.d $s6, $sp, 16 # 8-byte Folded Spill @@ -1071,23 +1052,22 @@ inflateBack: # @inflateBack ld.bu $a2, $a1, 1 add.d $a4, $s1, $a2 addi.w $s4, $s3, 0 - bgeu $s4, $a4, .LBB1_136 -# %bb.127: # %.lr.ph951.preheader + bgeu $s4, $a4, .LBB1_135 +# %bb.126: # %.lr.ph951.preheader # in Loop: Header=BB1_6 Depth=1 nor $s6, $a3, $zero - b .LBB1_134 + b .LBB1_133 .p2align 4, , 16 -.LBB1_128: # %._crit_edge1247 - # in Loop: Header=BB1_130 Depth=2 +.LBB1_127: # %._crit_edge1247 + # in Loop: Header=BB1_129 Depth=2 move $s5, $a0 ld.d $a0, $s0, 104 ld.w $a1, $s0, 120 - vld $vr4, $sp, 144 # 16-byte Folded Reload addi.d $a5, $zero, -1 -.LBB1_129: # in Loop: Header=BB1_130 Depth=2 - ld.d $a2, $sp, 176 +.LBB1_128: # in Loop: Header=BB1_129 Depth=2 + ld.d $a2, $sp, 160 addi.d $a3, $a2, 1 - st.d $a3, $sp, 176 + st.d $a3, $sp, 160 ld.bu $a2, $a2, 0 sll.d $a2, $a2, $s7 add.d $fp, $a2, $fp @@ -1099,29 +1079,28 @@ inflateBack: # @inflateBack addi.w $s5, $s5, -1 addi.d $s7, $s7, 8 addi.d $s3, $s3, 8 - bgeu $s7, $s1, .LBB1_125 -.LBB1_130: # %.lr.ph936 + bgeu $s7, $s1, .LBB1_124 +.LBB1_129: # %.lr.ph936 # Parent Loop BB1_6 Depth=1 # => This Inner Loop Header: Depth=2 - bnez $s5, .LBB1_129 -# %bb.131: # in Loop: Header=BB1_130 Depth=2 - addi.d $a1, $sp, 176 - ld.d $a0, $sp, 168 # 8-byte Folded Reload - ld.d $a2, $sp, 160 # 8-byte Folded Reload + bnez $s5, .LBB1_128 +# %bb.130: # in Loop: Header=BB1_129 Depth=2 + addi.d $a1, $sp, 160 + ld.d $a0, $sp, 152 # 8-byte Folded Reload + ld.d $a2, $sp, 144 # 8-byte Folded Reload jirl $ra, $a2, 0 - bnez $a0, .LBB1_128 - b .LBB1_198 + bnez $a0, .LBB1_127 + b .LBB1_197 .p2align 4, , 16 -.LBB1_132: # %._crit_edge1250 - # in Loop: Header=BB1_134 Depth=2 +.LBB1_131: # %._crit_edge1250 + # in Loop: Header=BB1_133 Depth=2 move $s5, $a0 ld.d $a0, $s0, 104 - vld $vr4, $sp, 144 # 16-byte Folded Reload addi.d $a5, $zero, -1 -.LBB1_133: # in Loop: Header=BB1_134 Depth=2 - ld.d $a1, $sp, 176 +.LBB1_132: # in Loop: Header=BB1_133 Depth=2 + ld.d $a1, $sp, 160 addi.d $a2, $a1, 1 - st.d $a2, $sp, 176 + st.d $a2, $sp, 160 ld.bu $a1, $a1, 0 sll.d $a1, $a1, $s4 add.d $fp, $a1, $fp @@ -1135,19 +1114,19 @@ inflateBack: # @inflateBack addi.d $s4, $s4, 8 add.d $a3, $s1, $a2 addi.d $s3, $s3, 8 - bgeu $s4, $a3, .LBB1_136 -.LBB1_134: # %.lr.ph951 + 
bgeu $s4, $a3, .LBB1_135 +.LBB1_133: # %.lr.ph951 # Parent Loop BB1_6 Depth=1 # => This Inner Loop Header: Depth=2 - bnez $s5, .LBB1_133 -# %bb.135: # in Loop: Header=BB1_134 Depth=2 - addi.d $a1, $sp, 176 - ld.d $a0, $sp, 168 # 8-byte Folded Reload - ld.d $a2, $sp, 160 # 8-byte Folded Reload + bnez $s5, .LBB1_132 +# %bb.134: # in Loop: Header=BB1_133 Depth=2 + addi.d $a1, $sp, 160 + ld.d $a0, $sp, 152 # 8-byte Folded Reload + ld.d $a2, $sp, 144 # 8-byte Folded Reload jirl $ra, $a2, 0 - bnez $a0, .LBB1_132 - b .LBB1_202 -.LBB1_136: # %._crit_edge952 + bnez $a0, .LBB1_131 + b .LBB1_201 +.LBB1_135: # %._crit_edge952 # in Loop: Header=BB1_6 Depth=1 ld.hu $s7, $a1, 2 ld.bu $a1, $a1, 0 @@ -1156,56 +1135,55 @@ inflateBack: # @inflateBack move $s1, $a2 ld.d $s6, $sp, 16 # 8-byte Folded Reload ld.d $s4, $sp, 24 # 8-byte Folded Reload -.LBB1_137: # in Loop: Header=BB1_6 Depth=1 +.LBB1_136: # in Loop: Header=BB1_6 Depth=1 srl.d $fp, $fp, $s1 sub.w $s3, $s3, $s1 st.w $s7, $s0, 92 - beqz $a1, .LBB1_146 -# %bb.138: # in Loop: Header=BB1_6 Depth=1 + beqz $a1, .LBB1_145 +# %bb.137: # in Loop: Header=BB1_6 Depth=1 andi $a0, $a1, 32 + bnez $a0, .LBB1_149 +# %bb.138: # in Loop: Header=BB1_6 Depth=1 + andi $a0, $a1, 64 bnez $a0, .LBB1_150 # %bb.139: # in Loop: Header=BB1_6 Depth=1 - andi $a0, $a1, 64 - bnez $a0, .LBB1_151 -# %bb.140: # in Loop: Header=BB1_6 Depth=1 andi $a0, $a1, 15 st.w $a0, $s0, 100 - beqz $a0, .LBB1_154 -# %bb.141: # %.preheader689 + beqz $a0, .LBB1_153 +# %bb.140: # %.preheader689 # in Loop: Header=BB1_6 Depth=1 - bltu $s3, $a0, .LBB1_143 - b .LBB1_153 + bltu $s3, $a0, .LBB1_142 + b .LBB1_152 .p2align 4, , 16 -.LBB1_142: # in Loop: Header=BB1_143 Depth=2 - ld.d $a1, $sp, 176 +.LBB1_141: # in Loop: Header=BB1_142 Depth=2 + ld.d $a1, $sp, 160 addi.d $a2, $a1, 1 - st.d $a2, $sp, 176 + st.d $a2, $sp, 160 ld.bu $a1, $a1, 0 addi.w $s5, $s5, -1 sll.d $a1, $a1, $s3 addi.w $s3, $s3, 8 add.d $fp, $a1, $fp - bgeu $s3, $a0, .LBB1_152 -.LBB1_143: # %.lr.ph961 + bgeu $s3, $a0, .LBB1_151 +.LBB1_142: # %.lr.ph961 # Parent Loop BB1_6 Depth=1 # => This Inner Loop Header: Depth=2 - bnez $s5, .LBB1_142 -# %bb.144: # in Loop: Header=BB1_143 Depth=2 - addi.d $a1, $sp, 176 - ld.d $a0, $sp, 168 # 8-byte Folded Reload - ld.d $a2, $sp, 160 # 8-byte Folded Reload + bnez $s5, .LBB1_141 +# %bb.143: # in Loop: Header=BB1_142 Depth=2 + addi.d $a1, $sp, 160 + ld.d $a0, $sp, 152 # 8-byte Folded Reload + ld.d $a2, $sp, 144 # 8-byte Folded Reload jirl $ra, $a2, 0 - beqz $a0, .LBB1_198 -# %bb.145: # %._crit_edge1252 - # in Loop: Header=BB1_143 Depth=2 + beqz $a0, .LBB1_197 +# %bb.144: # %._crit_edge1252 + # in Loop: Header=BB1_142 Depth=2 move $s5, $a0 ld.w $a0, $s0, 100 - vld $vr4, $sp, 144 # 16-byte Folded Reload addi.d $a5, $zero, -1 - b .LBB1_142 -.LBB1_146: # in Loop: Header=BB1_6 Depth=1 - bnez $s8, .LBB1_149 -# %bb.147: # in Loop: Header=BB1_6 Depth=1 + b .LBB1_141 +.LBB1_145: # in Loop: Header=BB1_6 Depth=1 + bnez $s8, .LBB1_148 +# %bb.146: # in Loop: Header=BB1_6 Depth=1 ld.w $s8, $s0, 60 ld.d $s2, $s0, 72 st.w $s8, $s0, 64 @@ -1214,30 +1192,29 @@ inflateBack: # @inflateBack move $a2, $s8 ld.d $a3, $sp, 96 # 8-byte Folded Reload jirl $ra, $a3, 0 - bnez $a0, .LBB1_199 -# %bb.148: # %._crit_edge1266 + bnez $a0, .LBB1_198 +# %bb.147: # %._crit_edge1266 # in Loop: Header=BB1_6 Depth=1 ld.w $s7, $s0, 92 - vld $vr4, $sp, 144 # 16-byte Folded Reload -.LBB1_149: # in Loop: Header=BB1_6 Depth=1 +.LBB1_148: # in Loop: Header=BB1_6 Depth=1 st.b $s7, $s2, 0 addi.d $s2, $s2, 1 addi.w $s8, $s8, -1 lu12i.w $a0, 3 ori $a0, $a0, 
3912 - b .LBB1_182 -.LBB1_150: # in Loop: Header=BB1_6 Depth=1 + b .LBB1_181 +.LBB1_149: # in Loop: Header=BB1_6 Depth=1 lu12i.w $a0, 3 ori $a0, $a0, 3903 - b .LBB1_182 -.LBB1_151: # in Loop: Header=BB1_6 Depth=1 + b .LBB1_181 +.LBB1_150: # in Loop: Header=BB1_6 Depth=1 pcalau12i $a0, %pc_hi20(.L.str.9) addi.d $a0, $a0, %pc_lo12(.L.str.9) - b .LBB1_181 -.LBB1_152: # %._crit_edge962.loopexit + b .LBB1_180 +.LBB1_151: # %._crit_edge962.loopexit # in Loop: Header=BB1_6 Depth=1 ld.w $s7, $s0, 92 -.LBB1_153: # %._crit_edge962 +.LBB1_152: # %._crit_edge962 # in Loop: Header=BB1_6 Depth=1 sll.w $a1, $a5, $a0 andn $a1, $fp, $a1 @@ -1245,7 +1222,7 @@ inflateBack: # @inflateBack st.w $a1, $s0, 92 srl.d $fp, $fp, $a0 sub.w $s3, $s3, $a0 -.LBB1_154: # in Loop: Header=BB1_6 Depth=1 +.LBB1_153: # in Loop: Header=BB1_6 Depth=1 ld.wu $a1, $s0, 124 ld.d $a0, $s0, 112 sll.w $a2, $a5, $a1 @@ -1253,17 +1230,17 @@ inflateBack: # @inflateBack addi.w $a2, $a2, 0 alsl.d $a2, $a2, $a0, 2 ld.bu $s1, $a2, 1 - bgeu $s3, $s1, .LBB1_161 -# %bb.155: # %.lr.ph977.preheader + bgeu $s3, $s1, .LBB1_160 +# %bb.154: # %.lr.ph977.preheader # in Loop: Header=BB1_6 Depth=1 move $s7, $s4 move $s4, $s3 - b .LBB1_157 + b .LBB1_156 .p2align 4, , 16 -.LBB1_156: # in Loop: Header=BB1_157 Depth=2 - ld.d $a2, $sp, 176 +.LBB1_155: # in Loop: Header=BB1_156 Depth=2 + ld.d $a2, $sp, 160 addi.d $a3, $a2, 1 - st.d $a3, $sp, 176 + st.d $a3, $sp, 160 ld.bu $a2, $a2, 0 sll.d $a2, $a2, $s4 add.d $fp, $a2, $fp @@ -1275,35 +1252,34 @@ inflateBack: # @inflateBack addi.w $s5, $s5, -1 addi.d $s4, $s4, 8 addi.w $s3, $s3, 8 - bgeu $s4, $s1, .LBB1_160 -.LBB1_157: # %.lr.ph977 + bgeu $s4, $s1, .LBB1_159 +.LBB1_156: # %.lr.ph977 # Parent Loop BB1_6 Depth=1 # => This Inner Loop Header: Depth=2 - bnez $s5, .LBB1_156 -# %bb.158: # in Loop: Header=BB1_157 Depth=2 - addi.d $a1, $sp, 176 - ld.d $a0, $sp, 168 # 8-byte Folded Reload - ld.d $a2, $sp, 160 # 8-byte Folded Reload + bnez $s5, .LBB1_155 +# %bb.157: # in Loop: Header=BB1_156 Depth=2 + addi.d $a1, $sp, 160 + ld.d $a0, $sp, 152 # 8-byte Folded Reload + ld.d $a2, $sp, 144 # 8-byte Folded Reload jirl $ra, $a2, 0 - beqz $a0, .LBB1_198 -# %bb.159: # %._crit_edge1255 - # in Loop: Header=BB1_157 Depth=2 + beqz $a0, .LBB1_197 +# %bb.158: # %._crit_edge1255 + # in Loop: Header=BB1_156 Depth=2 move $s5, $a0 ld.d $a0, $s0, 112 ld.w $a1, $s0, 124 - vld $vr4, $sp, 144 # 16-byte Folded Reload addi.d $a5, $zero, -1 - b .LBB1_156 -.LBB1_160: # %._crit_edge978.loopexit + b .LBB1_155 +.LBB1_159: # %._crit_edge978.loopexit # in Loop: Header=BB1_6 Depth=1 move $s4, $s7 -.LBB1_161: # %._crit_edge978 +.LBB1_160: # %._crit_edge978 # in Loop: Header=BB1_6 Depth=1 ld.bu $a3, $a2, 0 ld.hu $s7, $a2, 2 ori $a1, $zero, 15 - bltu $a1, $a3, .LBB1_169 -# %bb.162: # %.preheader688 + bltu $a1, $a3, .LBB1_168 +# %bb.161: # %.preheader688 # in Loop: Header=BB1_6 Depth=1 st.d $s4, $sp, 24 # 8-byte Folded Spill st.d $s6, $sp, 16 # 8-byte Folded Spill @@ -1316,22 +1292,21 @@ inflateBack: # @inflateBack alsl.d $a2, $a1, $a0, 2 ld.bu $a1, $a2, 1 add.d $a4, $s1, $a1 - bgeu $s3, $a4, .LBB1_168 -# %bb.163: # %.lr.ph992.preheader + bgeu $s3, $a4, .LBB1_167 +# %bb.162: # %.lr.ph992.preheader # in Loop: Header=BB1_6 Depth=1 nor $s4, $a3, $zero move $s6, $s3 - b .LBB1_166 -.LBB1_164: # %._crit_edge1258 - # in Loop: Header=BB1_166 Depth=2 + b .LBB1_165 +.LBB1_163: # %._crit_edge1258 + # in Loop: Header=BB1_165 Depth=2 move $s5, $a0 ld.d $a0, $s0, 112 - vld $vr4, $sp, 144 # 16-byte Folded Reload addi.d $a5, $zero, -1 -.LBB1_165: # in Loop: 
Header=BB1_166 Depth=2 - ld.d $a1, $sp, 176 +.LBB1_164: # in Loop: Header=BB1_165 Depth=2 + ld.d $a1, $sp, 160 addi.d $a2, $a1, 1 - st.d $a2, $sp, 176 + st.d $a2, $sp, 160 ld.bu $a1, $a1, 0 sll.d $a1, $a1, $s6 add.d $fp, $a1, $fp @@ -1345,19 +1320,19 @@ inflateBack: # @inflateBack addi.d $s6, $s6, 8 add.d $a3, $s1, $a1 addi.d $s3, $s3, 8 - bgeu $s6, $a3, .LBB1_168 -.LBB1_166: # %.lr.ph992 + bgeu $s6, $a3, .LBB1_167 +.LBB1_165: # %.lr.ph992 # Parent Loop BB1_6 Depth=1 # => This Inner Loop Header: Depth=2 - bnez $s5, .LBB1_165 -# %bb.167: # in Loop: Header=BB1_166 Depth=2 - addi.d $a1, $sp, 176 - ld.d $a0, $sp, 168 # 8-byte Folded Reload - ld.d $a2, $sp, 160 # 8-byte Folded Reload + bnez $s5, .LBB1_164 +# %bb.166: # in Loop: Header=BB1_165 Depth=2 + addi.d $a1, $sp, 160 + ld.d $a0, $sp, 152 # 8-byte Folded Reload + ld.d $a2, $sp, 144 # 8-byte Folded Reload jirl $ra, $a2, 0 - bnez $a0, .LBB1_164 - b .LBB1_202 -.LBB1_168: # %._crit_edge993 + bnez $a0, .LBB1_163 + b .LBB1_201 +.LBB1_167: # %._crit_edge993 # in Loop: Header=BB1_6 Depth=1 ld.hu $s7, $a2, 2 ld.bu $a3, $a2, 0 @@ -1366,55 +1341,54 @@ inflateBack: # @inflateBack move $s1, $a1 ld.d $s6, $sp, 16 # 8-byte Folded Reload ld.d $s4, $sp, 24 # 8-byte Folded Reload -.LBB1_169: # in Loop: Header=BB1_6 Depth=1 +.LBB1_168: # in Loop: Header=BB1_6 Depth=1 srl.d $fp, $fp, $s1 andi $a0, $a3, 64 sub.w $s3, $s3, $s1 - bnez $a0, .LBB1_176 -# %bb.170: # in Loop: Header=BB1_6 Depth=1 + bnez $a0, .LBB1_175 +# %bb.169: # in Loop: Header=BB1_6 Depth=1 st.w $s7, $s0, 96 andi $a0, $a3, 15 st.w $a0, $s0, 100 - beqz $a0, .LBB1_179 -# %bb.171: # %.preheader687 + beqz $a0, .LBB1_178 +# %bb.170: # %.preheader687 # in Loop: Header=BB1_6 Depth=1 - bltu $s3, $a0, .LBB1_173 - b .LBB1_178 -.LBB1_172: # in Loop: Header=BB1_173 Depth=2 - ld.d $a1, $sp, 176 + bltu $s3, $a0, .LBB1_172 + b .LBB1_177 +.LBB1_171: # in Loop: Header=BB1_172 Depth=2 + ld.d $a1, $sp, 160 addi.d $a2, $a1, 1 - st.d $a2, $sp, 176 + st.d $a2, $sp, 160 ld.bu $a1, $a1, 0 addi.w $s5, $s5, -1 sll.d $a1, $a1, $s3 addi.w $s3, $s3, 8 add.d $fp, $a1, $fp - bgeu $s3, $a0, .LBB1_177 -.LBB1_173: # %.lr.ph1002 + bgeu $s3, $a0, .LBB1_176 +.LBB1_172: # %.lr.ph1002 # Parent Loop BB1_6 Depth=1 # => This Inner Loop Header: Depth=2 - bnez $s5, .LBB1_172 -# %bb.174: # in Loop: Header=BB1_173 Depth=2 - addi.d $a1, $sp, 176 - ld.d $a0, $sp, 168 # 8-byte Folded Reload - ld.d $a2, $sp, 160 # 8-byte Folded Reload + bnez $s5, .LBB1_171 +# %bb.173: # in Loop: Header=BB1_172 Depth=2 + addi.d $a1, $sp, 160 + ld.d $a0, $sp, 152 # 8-byte Folded Reload + ld.d $a2, $sp, 144 # 8-byte Folded Reload jirl $ra, $a2, 0 - beqz $a0, .LBB1_198 -# %bb.175: # %._crit_edge1260 - # in Loop: Header=BB1_173 Depth=2 + beqz $a0, .LBB1_197 +# %bb.174: # %._crit_edge1260 + # in Loop: Header=BB1_172 Depth=2 move $s5, $a0 ld.w $a0, $s0, 100 - vld $vr4, $sp, 144 # 16-byte Folded Reload addi.d $a5, $zero, -1 - b .LBB1_172 -.LBB1_176: # in Loop: Header=BB1_6 Depth=1 + b .LBB1_171 +.LBB1_175: # in Loop: Header=BB1_6 Depth=1 pcalau12i $a0, %pc_hi20(.L.str.10) addi.d $a0, $a0, %pc_lo12(.L.str.10) - b .LBB1_181 -.LBB1_177: # %._crit_edge1003.loopexit + b .LBB1_180 +.LBB1_176: # %._crit_edge1003.loopexit # in Loop: Header=BB1_6 Depth=1 ld.w $s7, $s0, 96 -.LBB1_178: # %._crit_edge1003 +.LBB1_177: # %._crit_edge1003 # in Loop: Header=BB1_6 Depth=1 sll.w $a1, $a5, $a0 andn $a1, $fp, $a1 @@ -1422,45 +1396,45 @@ inflateBack: # @inflateBack st.w $s7, $s0, 96 srl.d $fp, $fp, $a0 sub.w $s3, $s3, $a0 -.LBB1_179: # in Loop: Header=BB1_6 Depth=1 +.LBB1_178: # in Loop: 
Header=BB1_6 Depth=1 ld.w $a0, $s0, 60 ld.w $a1, $s0, 64 sltu $a1, $a1, $a0 maskeqz $a1, $s8, $a1 sub.w $a0, $a0, $a1 - bgeu $a0, $s7, .LBB1_184 -# %bb.180: # in Loop: Header=BB1_6 Depth=1 + bgeu $a0, $s7, .LBB1_183 +# %bb.179: # in Loop: Header=BB1_6 Depth=1 pcalau12i $a0, %pc_hi20(.L.str.11) addi.d $a0, $a0, %pc_lo12(.L.str.11) -.LBB1_181: # %.thread +.LBB1_180: # %.thread # in Loop: Header=BB1_6 Depth=1 st.d $a0, $s6, 48 lu12i.w $a0, 3 ori $a0, $a0, 3921 -.LBB1_182: # %.thread +.LBB1_181: # %.thread # in Loop: Header=BB1_6 Depth=1 st.w $a0, $s0, 8 ld.d $s7, $sp, 120 # 8-byte Folded Reload ld.w $a0, $s0, 8 b .LBB1_6 -.LBB1_183: # %.loopexit1540 - # in Loop: Header=BB1_184 Depth=2 +.LBB1_182: # %.loopexit1540 + # in Loop: Header=BB1_183 Depth=2 ld.w $a1, $s0, 92 sub.w $s8, $s1, $a0 - beqz $a1, .LBB1_196 -.LBB1_184: # %.preheader685 + beqz $a1, .LBB1_195 +.LBB1_183: # %.preheader685 # Parent Loop BB1_6 Depth=1 # => This Loop Header: Depth=2 - # Child Loop BB1_191 Depth 3 - # Child Loop BB1_195 Depth 3 + # Child Loop BB1_190 Depth 3 + # Child Loop BB1_194 Depth 3 ld.w $s1, $s0, 60 - beqz $s8, .LBB1_186 -# %bb.185: # in Loop: Header=BB1_184 Depth=2 + beqz $s8, .LBB1_185 +# %bb.184: # in Loop: Header=BB1_183 Depth=2 move $a0, $s1 move $s7, $s2 move $s1, $s8 - b .LBB1_188 -.LBB1_186: # in Loop: Header=BB1_184 Depth=2 + b .LBB1_187 +.LBB1_185: # in Loop: Header=BB1_183 Depth=2 ld.d $s7, $s0, 72 st.w $s1, $s0, 64 ld.d $a0, $sp, 120 # 8-byte Folded Reload @@ -1468,12 +1442,11 @@ inflateBack: # @inflateBack move $a2, $s1 ld.d $a3, $sp, 96 # 8-byte Folded Reload jirl $ra, $a3, 0 - bnez $a0, .LBB1_199 -# %bb.187: # %._crit_edge1263 - # in Loop: Header=BB1_184 Depth=2 + bnez $a0, .LBB1_198 +# %bb.186: # %._crit_edge1263 + # in Loop: Header=BB1_183 Depth=2 ld.w $a0, $s0, 60 - vld $vr4, $sp, 144 # 16-byte Folded Reload -.LBB1_188: # in Loop: Header=BB1_184 Depth=2 +.LBB1_187: # in Loop: Header=BB1_183 Depth=2 ld.wu $a1, $s0, 96 sub.w $a0, $a0, $a1 sltu $a2, $a0, $s1 @@ -1494,13 +1467,13 @@ inflateBack: # @inflateBack addi.w $a2, $a0, -1 st.w $a4, $s0, 92 ori $a4, $zero, 31 - bltu $a2, $a4, .LBB1_193 -# %bb.189: # in Loop: Header=BB1_184 Depth=2 + bltu $a2, $a4, .LBB1_192 +# %bb.188: # in Loop: Header=BB1_183 Depth=2 addi.d $a4, $a3, 31 ori $a5, $zero, 32 - bltu $a4, $a5, .LBB1_193 -# %bb.190: # %vector.ph - # in Loop: Header=BB1_184 Depth=2 + bltu $a4, $a5, .LBB1_192 +# %bb.189: # %vector.ph + # in Loop: Header=BB1_183 Depth=2 bstrpick.d $a2, $a2, 31, 0 addi.d $a4, $a2, 1 bstrpick.d $a2, $a4, 32, 5 @@ -1510,9 +1483,9 @@ inflateBack: # @inflateBack add.d $a1, $a1, $a5 move $a6, $a5 .p2align 4, , 16 -.LBB1_191: # %vector.body +.LBB1_190: # %vector.body # Parent Loop BB1_6 Depth=1 - # Parent Loop BB1_184 Depth=2 + # Parent Loop BB1_183 Depth=2 # => This Inner Loop Header: Depth=3 add.d $a7, $s7, $a3 vldx $vr0, $s7, $a3 @@ -1521,21 +1494,21 @@ inflateBack: # @inflateBack vst $vr1, $s7, 16 addi.d $a6, $a6, -32 addi.d $s7, $s7, 32 - bnez $a6, .LBB1_191 -# %bb.192: # %middle.block - # in Loop: Header=BB1_184 Depth=2 - beq $a4, $a5, .LBB1_183 - b .LBB1_194 -.LBB1_193: # in Loop: Header=BB1_184 Depth=2 + bnez $a6, .LBB1_190 +# %bb.191: # %middle.block + # in Loop: Header=BB1_183 Depth=2 + beq $a4, $a5, .LBB1_182 + b .LBB1_193 +.LBB1_192: # in Loop: Header=BB1_183 Depth=2 move $s2, $s7 move $a2, $a0 -.LBB1_194: # %scalar.ph.preheader - # in Loop: Header=BB1_184 Depth=2 +.LBB1_193: # %scalar.ph.preheader + # in Loop: Header=BB1_183 Depth=2 move $a3, $s2 .p2align 4, , 16 -.LBB1_195: # %scalar.ph +.LBB1_194: # 
%scalar.ph # Parent Loop BB1_6 Depth=1 - # Parent Loop BB1_184 Depth=2 + # Parent Loop BB1_183 Depth=2 # => This Inner Loop Header: Depth=3 ld.b $a4, $a1, 0 addi.d $a1, $a1, 1 @@ -1543,13 +1516,13 @@ inflateBack: # @inflateBack addi.w $a2, $a2, -1 st.b $a4, $a3, 0 move $a3, $s2 - bnez $a2, .LBB1_195 - b .LBB1_183 -.LBB1_196: # in Loop: Header=BB1_6 Depth=1 + bnez $a2, .LBB1_194 + b .LBB1_182 +.LBB1_195: # in Loop: Header=BB1_6 Depth=1 ld.d $s7, $sp, 120 # 8-byte Folded Reload ld.w $a0, $s0, 8 b .LBB1_6 -.LBB1_197: # in Loop: Header=BB1_6 Depth=1 +.LBB1_196: # in Loop: Header=BB1_6 Depth=1 pcalau12i $a0, %pc_hi20(.L.str.5) addi.d $a0, $a0, %pc_lo12(.L.str.5) move $s6, $s7 @@ -1561,44 +1534,44 @@ inflateBack: # @inflateBack ld.d $s7, $sp, 120 # 8-byte Folded Reload ld.w $a0, $s0, 8 b .LBB1_6 -.LBB1_198: # %.split1037 +.LBB1_197: # %.split1037 move $s5, $zero - st.d $zero, $sp, 176 -.LBB1_199: # %.loopexit684 + st.d $zero, $sp, 160 +.LBB1_198: # %.loopexit684 addi.w $s4, $zero, -5 -.LBB1_200: # %.loopexit684 - ld.d $a0, $sp, 176 +.LBB1_199: # %.loopexit684 + ld.d $a0, $sp, 160 st.d $a0, $s6, 0 st.w $s5, $s6, 8 -.LBB1_201: +.LBB1_200: move $a0, $s4 - ld.d $s8, $sp, 184 # 8-byte Folded Reload - ld.d $s7, $sp, 192 # 8-byte Folded Reload - ld.d $s6, $sp, 200 # 8-byte Folded Reload - ld.d $s5, $sp, 208 # 8-byte Folded Reload - ld.d $s4, $sp, 216 # 8-byte Folded Reload - ld.d $s3, $sp, 224 # 8-byte Folded Reload - ld.d $s2, $sp, 232 # 8-byte Folded Reload - ld.d $s1, $sp, 240 # 8-byte Folded Reload - ld.d $s0, $sp, 248 # 8-byte Folded Reload - ld.d $fp, $sp, 256 # 8-byte Folded Reload - ld.d $ra, $sp, 264 # 8-byte Folded Reload - addi.d $sp, $sp, 272 + ld.d $s8, $sp, 168 # 8-byte Folded Reload + ld.d $s7, $sp, 176 # 8-byte Folded Reload + ld.d $s6, $sp, 184 # 8-byte Folded Reload + ld.d $s5, $sp, 192 # 8-byte Folded Reload + ld.d $s4, $sp, 200 # 8-byte Folded Reload + ld.d $s3, $sp, 208 # 8-byte Folded Reload + ld.d $s2, $sp, 216 # 8-byte Folded Reload + ld.d $s1, $sp, 224 # 8-byte Folded Reload + ld.d $s0, $sp, 232 # 8-byte Folded Reload + ld.d $fp, $sp, 240 # 8-byte Folded Reload + ld.d $ra, $sp, 248 # 8-byte Folded Reload + addi.d $sp, $sp, 256 ret -.LBB1_202: +.LBB1_201: move $s5, $zero - st.d $zero, $sp, 176 + st.d $zero, $sp, 160 addi.w $s4, $zero, -5 ld.d $s6, $sp, 16 # 8-byte Folded Reload - b .LBB1_200 -.LBB1_203: # %.loopexit684.loopexit1547 + b .LBB1_199 +.LBB1_202: # %.loopexit684.loopexit1547 addi.w $s4, $zero, -3 - b .LBB1_200 -.LBB1_204: + b .LBB1_199 +.LBB1_203: ld.w $a0, $s0, 60 ori $s4, $zero, 1 - bgeu $s8, $a0, .LBB1_200 -# %bb.205: + bgeu $s8, $a0, .LBB1_199 +# %bb.204: ld.d $a1, $s0, 72 sub.w $a2, $a0, $s8 move $a0, $s7 @@ -1610,42 +1583,42 @@ inflateBack: # @inflateBack ori $a2, $zero, 1 maskeqz $a0, $a2, $a0 or $s4, $a0, $a1 - b .LBB1_200 -.LBB1_206: + b .LBB1_199 +.LBB1_205: move $s5, $zero - st.d $zero, $sp, 176 + st.d $zero, $sp, 160 addi.w $s4, $zero, -5 move $s6, $s7 - b .LBB1_200 + b .LBB1_199 .Lfunc_end1: .size inflateBack, .Lfunc_end1-inflateBack .section .rodata,"a",@progbits .p2align 2, 0x0 .LJTI1_0: .word .LBB1_8-.LJTI1_0 - .word .LBB1_200-.LJTI1_0 + .word .LBB1_199-.LJTI1_0 .word .LBB1_23-.LJTI1_0 - .word .LBB1_200-.LJTI1_0 - .word .LBB1_200-.LJTI1_0 + .word .LBB1_199-.LJTI1_0 + .word .LBB1_199-.LJTI1_0 .word .LBB1_10-.LJTI1_0 - .word .LBB1_200-.LJTI1_0 - .word .LBB1_200-.LJTI1_0 - .word .LBB1_200-.LJTI1_0 - .word .LBB1_119-.LJTI1_0 - .word .LBB1_200-.LJTI1_0 - .word .LBB1_200-.LJTI1_0 - .word .LBB1_200-.LJTI1_0 - .word .LBB1_200-.LJTI1_0 - .word 
.LBB1_200-.LJTI1_0 - .word .LBB1_200-.LJTI1_0 - .word .LBB1_200-.LJTI1_0 - .word .LBB1_204-.LJTI1_0 + .word .LBB1_199-.LJTI1_0 + .word .LBB1_199-.LJTI1_0 + .word .LBB1_199-.LJTI1_0 + .word .LBB1_118-.LJTI1_0 + .word .LBB1_199-.LJTI1_0 + .word .LBB1_199-.LJTI1_0 + .word .LBB1_199-.LJTI1_0 + .word .LBB1_199-.LJTI1_0 + .word .LBB1_199-.LJTI1_0 + .word .LBB1_199-.LJTI1_0 + .word .LBB1_199-.LJTI1_0 .word .LBB1_203-.LJTI1_0 + .word .LBB1_202-.LJTI1_0 .LJTI1_1: - .word .LBB1_65-.LJTI1_1 - .word .LBB1_47-.LJTI1_1 - .word .LBB1_63-.LJTI1_1 .word .LBB1_64-.LJTI1_1 + .word .LBB1_44-.LJTI1_1 + .word .LBB1_62-.LJTI1_1 + .word .LBB1_63-.LJTI1_1 # -- End function .text .globl inflateBackEnd # -- Begin function inflateBackEnd diff --git a/results/MultiSource/Applications/ClamAV/CMakeFiles/clamscan.dir/zlib_inflate.s b/results/MultiSource/Applications/ClamAV/CMakeFiles/clamscan.dir/zlib_inflate.s index 5ba763c0..e1ff6141 100644 --- a/results/MultiSource/Applications/ClamAV/CMakeFiles/clamscan.dir/zlib_inflate.s +++ b/results/MultiSource/Applications/ClamAV/CMakeFiles/clamscan.dir/zlib_inflate.s @@ -506,6 +506,15 @@ inflatePrime: # @inflatePrime .type inflate,@function inflate: # @inflate # %bb.0: + addi.w $a3, $zero, -2 + beqz $a0, .LBB6_409 +# %bb.1: + ld.d $a2, $a0, 64 + beqz $a2, .LBB6_409 +# %bb.2: + ld.d $a2, $a0, 72 + beqz $a2, .LBB6_409 +# %bb.3: addi.d $sp, $sp, -256 st.d $ra, $sp, 248 # 8-byte Folded Spill st.d $fp, $sp, 240 # 8-byte Folded Spill @@ -518,446 +527,444 @@ inflate: # @inflate st.d $s6, $sp, 184 # 8-byte Folded Spill st.d $s7, $sp, 176 # 8-byte Folded Spill st.d $s8, $sp, 168 # 8-byte Folded Spill - addi.w $s3, $zero, -2 - beqz $a0, .LBB6_407 -# %bb.1: - ld.d $a2, $a0, 64 - beqz $a2, .LBB6_407 -# %bb.2: - ld.d $a2, $a0, 72 - beqz $a2, .LBB6_407 -# %bb.3: ld.d $s5, $a0, 56 - beqz $s5, .LBB6_407 + beqz $s5, .LBB6_408 # %bb.4: ld.d $a2, $s5, 0 - bne $a2, $a0, .LBB6_407 + bne $a2, $a0, .LBB6_408 # %bb.5: # %inflateStateCheck.exit ld.w $a2, $s5, 8 - lu12i.w $a3, -4 - ori $s4, $a3, 204 - add.w $a3, $a2, $s4 - ori $a4, $zero, 31 - bltu $a4, $a3, .LBB6_407 + lu12i.w $a4, -4 + ori $t8, $a4, 204 + add.w $a4, $a2, $t8 + ori $a5, $zero, 31 + bltu $a5, $a4, .LBB6_408 # %bb.6: - ld.d $t8, $a0, 24 - beqz $t8, .LBB6_407 + ld.d $ra, $a0, 24 + beqz $ra, .LBB6_408 # %bb.7: ld.d $s8, $a0, 0 - beqz $s8, .LBB6_11 + beqz $s8, .LBB6_375 .LBB6_8: - lu12i.w $a3, 3 - ori $s7, $a3, 3903 - bne $a2, $s7, .LBB6_10 + lu12i.w $s4, 3 + ori $s3, $s4, 3903 + bne $a2, $s3, .LBB6_10 # %bb.9: - lu12i.w $a2, 3 - ori $a2, $a2, 3904 + ori $a2, $s4, 3904 st.w $a2, $s5, 8 .LBB6_10: # %.split2330 - ld.w $ra, $a0, 32 + ld.w $s7, $a0, 32 ld.w $s6, $a0, 8 - ld.d $s2, $s5, 80 - ld.w $s0, $s5, 88 - addi.d $a3, $s5, 152 - st.d $a3, $sp, 80 # 8-byte Folded Spill - addi.d $a3, $s5, 1368 - st.d $a3, $sp, 48 # 8-byte Folded Spill - addi.d $a3, $s5, 144 - st.d $a3, $sp, 64 # 8-byte Folded Spill - addi.d $a3, $s5, 120 - st.d $a3, $sp, 40 # 8-byte Folded Spill - addi.d $a3, $s5, 792 - st.d $a3, $sp, 56 # 8-byte Folded Spill - addi.d $a3, $s5, 124 - st.d $a3, $sp, 32 # 8-byte Folded Spill - addi.w $a3, $a1, -5 - st.d $a3, $sp, 72 # 8-byte Folded Spill - addi.w $a3, $zero, -3 - st.d $a3, $sp, 112 # 8-byte Folded Spill + ld.d $fp, $s5, 80 + ld.w $s1, $s5, 88 + addi.d $a4, $s5, 152 + st.d $a4, $sp, 72 # 8-byte Folded Spill + addi.d $a4, $s5, 1368 + st.d $a4, $sp, 40 # 8-byte Folded Spill + addi.d $a4, $s5, 144 + st.d $a4, $sp, 56 # 8-byte Folded Spill + addi.d $a4, $s5, 120 + st.d $a4, $sp, 32 # 8-byte Folded Spill + addi.d $a4, $s5, 792 + st.d 
$a4, $sp, 48 # 8-byte Folded Spill + addi.d $a4, $s5, 124 + st.d $a4, $sp, 24 # 8-byte Folded Spill + addi.w $a4, $a1, -5 + st.d $a4, $sp, 64 # 8-byte Folded Spill + addi.w $a4, $zero, -3 + st.d $a4, $sp, 104 # 8-byte Folded Spill ori $t7, $zero, 30 - pcalau12i $a3, %pc_hi20(.LJTI6_0) - addi.d $t6, $a3, %pc_lo12(.LJTI6_0) - st.d $zero, $sp, 96 # 8-byte Folded Spill - move $s1, $s6 - st.d $ra, $sp, 120 # 8-byte Folded Spill + pcalau12i $a4, %pc_hi20(.LJTI6_0) + addi.d $t6, $a4, %pc_lo12(.LJTI6_0) + st.d $zero, $sp, 88 # 8-byte Folded Spill + move $s2, $s6 + st.d $s7, $sp, 112 # 8-byte Folded Spill st.d $a1, $sp, 144 # 8-byte Folded Spill st.d $a0, $sp, 152 # 8-byte Folded Spill - st.d $s4, $sp, 104 # 8-byte Folded Spill + st.d $a3, $sp, 120 # 8-byte Folded Spill + st.d $t8, $sp, 128 # 8-byte Folded Spill st.d $t6, $sp, 136 # 8-byte Folded Spill - st.d $s6, $sp, 128 # 8-byte Folded Spill - b .LBB6_16 -.LBB6_11: - ld.w $a3, $a0, 8 - bnez $a3, .LBB6_407 - b .LBB6_8 -.LBB6_12: # in Loop: Header=BB6_16 Depth=1 - andi $a2, $s0, 7 - srl.d $s2, $s2, $a2 - bstrins.d $s0, $zero, 2, 0 - lu12i.w $a2, 3 - ori $a2, $a2, 3918 + b .LBB6_14 +.LBB6_11: # in Loop: Header=BB6_14 Depth=1 + move $a0, $zero + move $a1, $zero + move $a2, $zero + pcaddu18i $ra, %call36(crc32) + jirl $ra, $ra, 0 + move $a1, $a0 + st.d $a0, $s5, 32 + ld.d $a0, $sp, 152 # 8-byte Folded Reload + st.d $a1, $a0, 96 + st.w $s3, $s5, 8 + ld.d $a1, $sp, 144 # 8-byte Folded Reload + ld.d $a3, $sp, 120 # 8-byte Folded Reload + move $ra, $s6 + move $s6, $s4 + move $s4, $s7 + ld.d $s7, $sp, 96 # 8-byte Folded Reload +.LBB6_12: # %.thread + # in Loop: Header=BB6_14 Depth=1 + ld.d $t8, $sp, 128 # 8-byte Folded Reload + ori $t7, $zero, 30 + ld.d $t6, $sp, 136 # 8-byte Folded Reload .LBB6_13: # %.thread - # in Loop: Header=BB6_16 Depth=1 - st.w $a2, $s5, 8 -.LBB6_14: # %.thread - # in Loop: Header=BB6_16 Depth=1 - move $fp, $s8 -.LBB6_15: # %.thread - # in Loop: Header=BB6_16 Depth=1 + # in Loop: Header=BB6_14 Depth=1 ld.w $a2, $s5, 8 - move $s8, $fp -.LBB6_16: # =>This Loop Header: Depth=1 - # Child Loop BB6_54 Depth 2 - # Child Loop BB6_282 Depth 2 - # Child Loop BB6_40 Depth 2 - # Child Loop BB6_85 Depth 2 + move $s8, $s0 +.LBB6_14: # =>This Loop Header: Depth=1 + # Child Loop BB6_52 Depth 2 + # Child Loop BB6_277 Depth 2 + # Child Loop BB6_38 Depth 2 + # Child Loop BB6_84 Depth 2 # Child Loop BB6_165 Depth 2 - # Child Loop BB6_92 Depth 2 - # Child Loop BB6_99 Depth 2 - # Child Loop BB6_101 Depth 3 + # Child Loop BB6_91 Depth 2 + # Child Loop BB6_98 Depth 2 + # Child Loop BB6_100 Depth 3 # Child Loop BB6_115 Depth 3 # Child Loop BB6_109 Depth 3 # Child Loop BB6_112 Depth 3 # Child Loop BB6_129 Depth 3 # Child Loop BB6_133 Depth 3 - # Child Loop BB6_297 Depth 2 - # Child Loop BB6_303 Depth 2 - # Child Loop BB6_27 Depth 2 - # Child Loop BB6_317 Depth 2 - # Child Loop BB6_323 Depth 2 - # Child Loop BB6_333 Depth 2 - # Child Loop BB6_354 Depth 2 - # Child Loop BB6_358 Depth 2 - # Child Loop BB6_37 Depth 2 - # Child Loop BB6_32 Depth 2 - # Child Loop BB6_44 Depth 2 + # Child Loop BB6_293 Depth 2 + # Child Loop BB6_299 Depth 2 + # Child Loop BB6_25 Depth 2 + # Child Loop BB6_312 Depth 2 + # Child Loop BB6_318 Depth 2 + # Child Loop BB6_328 Depth 2 + # Child Loop BB6_351 Depth 2 + # Child Loop BB6_355 Depth 2 + # Child Loop BB6_35 Depth 2 + # Child Loop BB6_30 Depth 2 + # Child Loop BB6_42 Depth 2 # Child Loop BB6_182 Depth 2 # Child Loop BB6_191 Depth 2 # Child Loop BB6_202 Depth 2 # Child Loop BB6_227 Depth 2 # Child Loop BB6_244 Depth 2 # Child Loop 
BB6_259 Depth 2 - # Child Loop BB6_20 Depth 2 - add.d $a2, $a2, $s4 - bltu $t7, $a2, .LBB6_407 -# %bb.17: # in Loop: Header=BB6_16 Depth=1 + # Child Loop BB6_18 Depth 2 + add.d $a2, $a2, $t8 + bltu $t7, $a2, .LBB6_408 +# %bb.15: # in Loop: Header=BB6_14 Depth=1 slli.d $a2, $a2, 2 ldx.w $a2, $t6, $a2 add.d $a2, $t6, $a2 jr $a2 -.LBB6_18: # in Loop: Header=BB6_16 Depth=1 +.LBB6_16: # in Loop: Header=BB6_14 Depth=1 ld.w $a2, $s5, 16 beqz $a2, .LBB6_139 -# %bb.19: # %.preheader1277 - # in Loop: Header=BB6_16 Depth=1 - addi.w $a3, $s0, 0 - ori $a4, $zero, 15 - bltu $a4, $a3, .LBB6_150 -.LBB6_20: # %.lr.ph2324 - # Parent Loop BB6_16 Depth=1 +# %bb.17: # %.preheader1277 + # in Loop: Header=BB6_14 Depth=1 + addi.w $a4, $s1, 0 + ori $a5, $zero, 15 + bltu $a5, $a4, .LBB6_150 +.LBB6_18: # %.lr.ph2324 + # Parent Loop BB6_14 Depth=1 # => This Inner Loop Header: Depth=2 - beqz $s1, .LBB6_378 -# %bb.21: # in Loop: Header=BB6_20 Depth=2 - move $a4, $a3 - ld.bu $a3, $s8, 0 - addi.w $s1, $s1, -1 - addi.d $fp, $s8, 1 - sll.d $a3, $a3, $a4 - add.d $s2, $a3, $s2 - addi.d $a3, $a4, 8 - ori $a5, $zero, 8 - addi.w $s0, $s0, 8 - move $s8, $fp - bltu $a4, $a5, .LBB6_20 + beqz $s2, .LBB6_376 +# %bb.19: # in Loop: Header=BB6_18 Depth=2 + move $a5, $a4 + ld.bu $a4, $s8, 0 + addi.w $s2, $s2, -1 + addi.d $s0, $s8, 1 + sll.d $a4, $a4, $a5 + add.d $fp, $a4, $fp + addi.d $a4, $a5, 8 + ori $a6, $zero, 8 + addi.w $s1, $s1, 8 + move $s8, $s0 + bltu $a5, $a6, .LBB6_18 b .LBB6_151 -.LBB6_22: # in Loop: Header=BB6_16 Depth=1 - beqz $ra, .LBB6_388 -# %bb.23: # in Loop: Header=BB6_16 Depth=1 +.LBB6_20: # in Loop: Header=BB6_14 Depth=1 + beqz $s7, .LBB6_386 +# %bb.21: # in Loop: Header=BB6_14 Depth=1 ld.b $a2, $s5, 92 - st.b $a2, $t8, 0 - addi.d $t8, $t8, 1 - addi.w $ra, $ra, -1 - lu12i.w $a3, 3 - ori $a3, $a3, 3912 - st.w $a3, $s5, 8 - b .LBB6_14 -.LBB6_24: # %._crit_edge2829 - # in Loop: Header=BB6_16 Depth=1 - ld.w $a3, $s5, 100 - beqz $a3, .LBB6_313 -.LBB6_25: # %.preheader1299 - # in Loop: Header=BB6_16 Depth=1 - addi.w $a2, $s0, 0 - bgeu $a2, $a3, .LBB6_144 -# %bb.26: # %.lr.ph2041.preheader - # in Loop: Header=BB6_16 Depth=1 - move $a4, $s0 - move $a2, $s1 - move $a6, $s8 -.LBB6_27: # %.lr.ph2041 - # Parent Loop BB6_16 Depth=1 + st.b $a2, $ra, 0 + addi.d $ra, $ra, 1 + addi.w $s7, $s7, -1 + ori $a4, $s4, 3912 + st.w $a4, $s5, 8 + b .LBB6_342 +.LBB6_22: # %._crit_edge2829 + # in Loop: Header=BB6_14 Depth=1 + ld.w $a4, $s5, 100 + beqz $a4, .LBB6_308 +.LBB6_23: # %.preheader1299 + # in Loop: Header=BB6_14 Depth=1 + addi.w $a2, $s1, 0 + bgeu $a2, $a4, .LBB6_144 +# %bb.24: # %.lr.ph2041.preheader + # in Loop: Header=BB6_14 Depth=1 + move $a5, $s1 + move $a2, $s2 + move $a7, $s8 +.LBB6_25: # %.lr.ph2041 + # Parent Loop BB6_14 Depth=1 # => This Inner Loop Header: Depth=2 - beqz $a2, .LBB6_383 -# %bb.28: # in Loop: Header=BB6_27 Depth=2 - ld.bu $a7, $a6, 0 + beqz $a2, .LBB6_381 +# %bb.26: # in Loop: Header=BB6_25 Depth=2 + ld.bu $t0, $a7, 0 addi.w $a2, $a2, -1 - addi.d $a5, $a6, 1 - sll.d $a6, $a7, $a4 - addi.w $a4, $a4, 8 - add.d $s2, $a6, $s2 - move $a6, $a5 - bltu $a4, $a3, .LBB6_27 + addi.d $a6, $a7, 1 + sll.d $a7, $t0, $a5 + addi.w $a5, $a5, 8 + add.d $fp, $a7, $fp + move $a7, $a6 + bltu $a5, $a4, .LBB6_25 b .LBB6_145 -.LBB6_29: # %._crit_edge2826 - # in Loop: Header=BB6_16 Depth=1 - ld.w $a6, $s5, 140 - ld.d $fp, $sp, 80 # 8-byte Folded Reload - b .LBB6_97 -.LBB6_30: # %._crit_edge2834 - # in Loop: Header=BB6_16 Depth=1 - ld.w $a3, $s5, 100 - b .LBB6_330 -.LBB6_31: # %.preheader1291 - # in Loop: Header=BB6_16 Depth=1 - addi.w 
$a3, $s0, 0 +.LBB6_27: # %._crit_edge2826 + # in Loop: Header=BB6_14 Depth=1 + ld.w $a7, $s5, 140 + ld.d $s0, $sp, 72 # 8-byte Folded Reload + b .LBB6_96 +.LBB6_28: # %._crit_edge2834 + # in Loop: Header=BB6_14 Depth=1 + ld.w $a4, $s5, 100 + b .LBB6_325 +.LBB6_29: # %.preheader1291 + # in Loop: Header=BB6_14 Depth=1 + addi.w $a3, $s1, 0 ori $a2, $zero, 31 - bltu $a2, $a3, .LBB6_58 -.LBB6_32: # %.lr.ph2107 - # Parent Loop BB6_16 Depth=1 + bltu $a2, $a3, .LBB6_56 +.LBB6_30: # %.lr.ph2107 + # Parent Loop BB6_14 Depth=1 # => This Inner Loop Header: Depth=2 - beqz $s1, .LBB6_378 -# %bb.33: # in Loop: Header=BB6_32 Depth=2 + beqz $s2, .LBB6_376 +# %bb.31: # in Loop: Header=BB6_30 Depth=2 move $a4, $a3 ld.bu $a3, $s8, 0 - addi.w $s1, $s1, -1 + addi.w $s2, $s2, -1 addi.d $a2, $s8, 1 sll.d $a3, $a3, $a4 - add.d $s2, $a3, $s2 + add.d $fp, $a3, $fp addi.d $a3, $a4, 8 ori $a5, $zero, 24 - addi.d $s0, $s0, 8 + addi.d $s1, $s1, 8 move $s8, $a2 - bltu $a4, $a5, .LBB6_32 - b .LBB6_59 -.LBB6_34: # in Loop: Header=BB6_16 Depth=1 - addi.w $a2, $s0, 0 - ori $a3, $zero, 31 - bgeu $a3, $a2, .LBB6_181 - b .LBB6_184 -.LBB6_35: # in Loop: Header=BB6_16 Depth=1 - addi.w $a2, $s0, 0 - andi $a3, $s0, 7 + bltu $a4, $a5, .LBB6_30 + b .LBB6_57 +.LBB6_32: # in Loop: Header=BB6_14 Depth=1 + addi.w $a2, $s1, 0 ori $a4, $zero, 31 - srl.d $s2, $s2, $a3 - bltu $a4, $a2, .LBB6_70 -# %bb.36: # %.lr.ph2097.preheader - # in Loop: Header=BB6_16 Depth=1 - andi $s0, $s0, 24 - move $a4, $s0 + bgeu $a4, $a2, .LBB6_181 + b .LBB6_184 +.LBB6_33: # in Loop: Header=BB6_14 Depth=1 + addi.w $a2, $s1, 0 + andi $a4, $s1, 7 + ori $a5, $zero, 31 + srl.d $fp, $fp, $a4 + bltu $a5, $a2, .LBB6_69 +# %bb.34: # %.lr.ph2097.preheader + # in Loop: Header=BB6_14 Depth=1 + andi $s1, $s1, 24 + move $a5, $s1 move $a2, $s8 -.LBB6_37: # %.lr.ph2097 - # Parent Loop BB6_16 Depth=1 +.LBB6_35: # %.lr.ph2097 + # Parent Loop BB6_14 Depth=1 # => This Inner Loop Header: Depth=2 - beqz $s1, .LBB6_379 -# %bb.38: # in Loop: Header=BB6_37 Depth=2 - move $a3, $a4 - ld.bu $a4, $a2, 0 - addi.w $s1, $s1, -1 + beqz $s2, .LBB6_377 +# %bb.36: # in Loop: Header=BB6_35 Depth=2 + move $a4, $a5 + ld.bu $a5, $a2, 0 + addi.w $s2, $s2, -1 addi.d $s8, $a2, 1 - sll.d $a2, $a4, $a3 - add.d $s2, $a2, $s2 - addi.d $a4, $a3, 8 - ori $a5, $zero, 24 - addi.w $s0, $s0, 8 + sll.d $a2, $a5, $a4 + add.d $fp, $a2, $fp + addi.d $a5, $a4, 8 + ori $a6, $zero, 24 + addi.w $s1, $s1, 8 move $a2, $s8 - bltu $a3, $a5, .LBB6_37 - b .LBB6_71 -.LBB6_39: # %.preheader1305 - # in Loop: Header=BB6_16 Depth=1 - addi.w $a2, $s0, 0 - ori $a3, $zero, 13 - bltu $a3, $a2, .LBB6_78 -.LBB6_40: # %.lr.ph1763 - # Parent Loop BB6_16 Depth=1 - # => This Inner Loop Header: Depth=2 - beqz $s1, .LBB6_378 -# %bb.41: # in Loop: Header=BB6_40 Depth=2 - move $a3, $a2 - ld.bu $a2, $s8, 0 - addi.w $s1, $s1, -1 - addi.d $fp, $s8, 1 - sll.d $a2, $a2, $a3 - add.d $s2, $a2, $s2 - addi.d $a2, $a3, 8 - ori $a4, $zero, 6 - addi.d $s0, $s0, 8 - move $s8, $fp - bltu $a3, $a4, .LBB6_40 - b .LBB6_79 -.LBB6_42: # %.preheader1287 - # in Loop: Header=BB6_16 Depth=1 - addi.w $a2, $s0, 0 - ori $a3, $zero, 15 - bltu $a3, $a2, .LBB6_46 -# %bb.43: # %.lr.ph2275.preheader - # in Loop: Header=BB6_16 Depth=1 - move $a3, $s8 -.LBB6_44: # %.lr.ph2275 - # Parent Loop BB6_16 Depth=1 + bltu $a4, $a6, .LBB6_35 + b .LBB6_70 +.LBB6_37: # %.preheader1305 + # in Loop: Header=BB6_14 Depth=1 + addi.w $a2, $s1, 0 + ori $a4, $zero, 13 + bltu $a4, $a2, .LBB6_77 +.LBB6_38: # %.lr.ph1763 + # Parent Loop BB6_14 Depth=1 # => This Inner Loop Header: Depth=2 - beqz $s1, 
.LBB6_381 -# %bb.45: # in Loop: Header=BB6_44 Depth=2 + beqz $s2, .LBB6_376 +# %bb.39: # in Loop: Header=BB6_38 Depth=2 move $a4, $a2 - ld.bu $a2, $a3, 0 - addi.w $s1, $s1, -1 - addi.d $s8, $a3, 1 + ld.bu $a2, $s8, 0 + addi.w $s2, $s2, -1 + addi.d $s0, $s8, 1 sll.d $a2, $a2, $a4 - add.d $s2, $a2, $s2 + add.d $fp, $a2, $fp addi.d $a2, $a4, 8 - ori $a5, $zero, 8 - addi.w $s0, $s0, 8 - move $a3, $s8 - bltu $a4, $a5, .LBB6_44 -.LBB6_46: # %._crit_edge2276 - # in Loop: Header=BB6_16 Depth=1 - andi $a2, $s2, 255 - ori $a3, $zero, 8 - st.w $s2, $s5, 24 - bne $a2, $a3, .LBB6_143 -# %bb.47: # in Loop: Header=BB6_16 Depth=1 + ori $a5, $zero, 6 + addi.d $s1, $s1, 8 + move $s8, $s0 + bltu $a4, $a5, .LBB6_38 + b .LBB6_78 +.LBB6_40: # %.preheader1287 + # in Loop: Header=BB6_14 Depth=1 + addi.w $a2, $s1, 0 + ori $a4, $zero, 15 + bltu $a4, $a2, .LBB6_44 +# %bb.41: # %.lr.ph2275.preheader + # in Loop: Header=BB6_14 Depth=1 + move $a4, $s8 +.LBB6_42: # %.lr.ph2275 + # Parent Loop BB6_14 Depth=1 + # => This Inner Loop Header: Depth=2 + beqz $s2, .LBB6_379 +# %bb.43: # in Loop: Header=BB6_42 Depth=2 + move $a5, $a2 + ld.bu $a2, $a4, 0 + addi.w $s2, $s2, -1 + addi.d $s8, $a4, 1 + sll.d $a2, $a2, $a5 + add.d $fp, $a2, $fp + addi.d $a2, $a5, 8 + ori $a6, $zero, 8 + addi.w $s1, $s1, 8 + move $a4, $s8 + bltu $a5, $a6, .LBB6_42 +.LBB6_44: # %._crit_edge2276 + # in Loop: Header=BB6_14 Depth=1 + andi $a2, $fp, 255 + ori $a4, $zero, 8 + st.w $fp, $s5, 24 + bne $a2, $a4, .LBB6_143 +# %bb.45: # in Loop: Header=BB6_14 Depth=1 lu12i.w $a2, 14 - and $a2, $s2, $a2 + and $a2, $fp, $a2 beqz $a2, .LBB6_175 -# %bb.48: # in Loop: Header=BB6_16 Depth=1 +# %bb.46: # in Loop: Header=BB6_14 Depth=1 pcalau12i $a2, %pc_hi20(.L.str.4) addi.d $a2, $a2, %pc_lo12(.L.str.4) - b .LBB6_345 -.LBB6_49: # in Loop: Header=BB6_16 Depth=1 - addi.w $a2, $s0, 0 - ori $a3, $zero, 15 - bgeu $a3, $a2, .LBB6_190 + b .LBB6_340 +.LBB6_47: # in Loop: Header=BB6_14 Depth=1 + addi.w $a2, $s1, 0 + ori $a4, $zero, 15 + bgeu $a4, $a2, .LBB6_190 b .LBB6_193 -.LBB6_50: # in Loop: Header=BB6_16 Depth=1 +.LBB6_48: # in Loop: Header=BB6_14 Depth=1 ld.wu $a2, $s5, 24 - andi $a3, $a2, 1024 - beqz $a3, .LBB6_199 -# %bb.51: # %.preheader1283 - # in Loop: Header=BB6_16 Depth=1 - addi.w $a3, $s0, 0 - ori $a4, $zero, 15 - bltu $a4, $a3, .LBB6_204 + andi $a4, $a2, 1024 + beqz $a4, .LBB6_199 +# %bb.49: # %.preheader1283 + # in Loop: Header=BB6_14 Depth=1 + addi.w $a4, $s1, 0 + ori $a5, $zero, 15 + bltu $a5, $a4, .LBB6_204 b .LBB6_201 -.LBB6_52: # in Loop: Header=BB6_16 Depth=1 - ld.w $a3, $s5, 16 - beqz $a3, .LBB6_140 -# %bb.53: # %.preheader1309 - # in Loop: Header=BB6_16 Depth=1 - addi.w $a2, $s0, 0 - ori $a4, $zero, 31 - bltu $a4, $a2, .LBB6_156 -.LBB6_54: # %.lr.ph - # Parent Loop BB6_16 Depth=1 +.LBB6_50: # in Loop: Header=BB6_14 Depth=1 + ld.w $a4, $s5, 16 + beqz $a4, .LBB6_140 +# %bb.51: # %.preheader1309 + # in Loop: Header=BB6_14 Depth=1 + addi.w $a2, $s1, 0 + ori $a5, $zero, 31 + bltu $a5, $a2, .LBB6_156 +.LBB6_52: # %.lr.ph + # Parent Loop BB6_14 Depth=1 # => This Inner Loop Header: Depth=2 - beqz $s1, .LBB6_378 -# %bb.55: # in Loop: Header=BB6_54 Depth=2 - move $a4, $a2 + beqz $s2, .LBB6_376 +# %bb.53: # in Loop: Header=BB6_52 Depth=2 + move $a5, $a2 ld.bu $a2, $s8, 0 - addi.w $s1, $s1, -1 - addi.d $fp, $s8, 1 - sll.d $a2, $a2, $a4 - add.d $s2, $a2, $s2 - addi.d $a2, $a4, 8 - ori $a5, $zero, 24 - addi.w $s0, $s0, 8 - move $s8, $fp - bltu $a4, $a5, .LBB6_54 + addi.w $s2, $s2, -1 + addi.d $s0, $s8, 1 + sll.d $a2, $a2, $a5 + add.d $fp, $a2, $fp + addi.d $a2, $a5, 8 
+ ori $a6, $zero, 24 + addi.w $s1, $s1, 8 + move $s8, $s0 + bltu $a5, $a6, .LBB6_52 b .LBB6_157 -.LBB6_56: # %.split - # in Loop: Header=BB6_16 Depth=1 +.LBB6_54: # %.split + # in Loop: Header=BB6_14 Depth=1 ld.w $a2, $s5, 128 ld.w $a5, $s5, 140 - ld.d $fp, $sp, 80 # 8-byte Folded Reload - bltu $a5, $a2, .LBB6_82 - b .LBB6_88 -.LBB6_57: # %._crit_edge2822 - # in Loop: Header=BB6_16 Depth=1 - ld.w $a3, $s5, 16 - b .LBB6_279 -.LBB6_58: # in Loop: Header=BB6_16 Depth=1 + ld.d $s0, $sp, 72 # 8-byte Folded Reload + bltu $a5, $a2, .LBB6_81 + b .LBB6_87 +.LBB6_55: # %._crit_edge2822 + # in Loop: Header=BB6_14 Depth=1 + ld.w $a4, $s5, 16 + b .LBB6_274 +.LBB6_56: # in Loop: Header=BB6_14 Depth=1 move $a2, $s8 -.LBB6_59: # %._crit_edge2108 - # in Loop: Header=BB6_16 Depth=1 - move $s0, $zero - revb.2w $a1, $s2 +.LBB6_57: # %._crit_edge2108 + # in Loop: Header=BB6_14 Depth=1 + move $s1, $zero + revb.2w $a1, $fp bstrpick.d $a1, $a1, 31, 0 st.d $a1, $s5, 32 st.d $a1, $a0, 96 - lu12i.w $a1, 3 - ori $a1, $a1, 3902 + ori $a1, $s4, 3902 st.w $a1, $s5, 8 move $s8, $a2 - move $s2, $zero -.LBB6_60: # in Loop: Header=BB6_16 Depth=1 + move $fp, $zero +.LBB6_58: # in Loop: Header=BB6_14 Depth=1 ld.w $a1, $s5, 20 beqz $a1, .LBB6_410 -# %bb.61: # in Loop: Header=BB6_16 Depth=1 +# %bb.59: # in Loop: Header=BB6_14 Depth=1 move $a0, $zero move $a1, $zero move $a2, $zero - move $fp, $ra - move $s4, $t8 + move $s0, $ra pcaddu18i $ra, %call36(adler32) jirl $ra, $ra, 0 - move $t8, $s4 - move $ra, $fp + move $ra, $s0 move $a1, $a0 ld.d $a0, $sp, 152 # 8-byte Folded Reload st.d $a1, $s5, 32 st.d $a1, $a0, 96 - st.w $s7, $s5, 8 + st.w $s3, $s5, 8 ld.d $a1, $sp, 144 # 8-byte Folded Reload - ld.d $s4, $sp, 104 # 8-byte Folded Reload + ld.d $a3, $sp, 120 # 8-byte Folded Reload + ld.d $t8, $sp, 128 # 8-byte Folded Reload ori $t7, $zero, 30 ld.d $t6, $sp, 136 # 8-byte Folded Reload -.LBB6_62: # in Loop: Header=BB6_16 Depth=1 +.LBB6_60: # in Loop: Header=BB6_14 Depth=1 ori $a2, $zero, 2 - ld.d $a3, $sp, 72 # 8-byte Folded Reload - bltu $a3, $a2, .LBB6_389 -.LBB6_63: # in Loop: Header=BB6_16 Depth=1 + ld.d $a4, $sp, 64 # 8-byte Folded Reload + bltu $a4, $a2, .LBB6_390 +.LBB6_61: # in Loop: Header=BB6_14 Depth=1 ld.w $a2, $s5, 12 - bnez $a2, .LBB6_12 -# %bb.64: # %.preheader1289 - # in Loop: Header=BB6_16 Depth=1 - addi.w $a2, $s0, 0 - ori $a3, $zero, 2 - bltu $a3, $a2, .LBB6_67 -# %bb.65: # %.lr.ph2116 - # in Loop: Header=BB6_16 Depth=1 - beqz $s1, .LBB6_378 -# %bb.66: # %._crit_edge2117 - # in Loop: Header=BB6_16 Depth=1 - ld.bu $a3, $s8, 0 - ori $s0, $s0, 8 - addi.w $s1, $s1, -1 - addi.d $fp, $s8, 1 - sll.d $a2, $a3, $a2 - add.d $s2, $a2, $s2 - b .LBB6_68 -.LBB6_67: # in Loop: Header=BB6_16 Depth=1 - move $fp, $s8 -.LBB6_68: # in Loop: Header=BB6_16 Depth=1 - andi $a2, $s2, 1 + beqz $a2, .LBB6_63 +# %bb.62: # in Loop: Header=BB6_14 Depth=1 + andi $a2, $s1, 7 + srl.d $fp, $fp, $a2 + bstrins.d $s1, $zero, 2, 0 + ori $a2, $s4, 3918 + b .LBB6_341 +.LBB6_63: # %.preheader1289 + # in Loop: Header=BB6_14 Depth=1 + addi.w $a2, $s1, 0 + ori $a4, $zero, 2 + bltu $a4, $a2, .LBB6_66 +# %bb.64: # %.lr.ph2116 + # in Loop: Header=BB6_14 Depth=1 + beqz $s2, .LBB6_376 +# %bb.65: # %._crit_edge2117 + # in Loop: Header=BB6_14 Depth=1 + ld.bu $a4, $s8, 0 + ori $s1, $s1, 8 + addi.w $s2, $s2, -1 + addi.d $s0, $s8, 1 + sll.d $a2, $a4, $a2 + add.d $fp, $a2, $fp + b .LBB6_67 +.LBB6_66: # in Loop: Header=BB6_14 Depth=1 + move $s0, $s8 +.LBB6_67: # in Loop: Header=BB6_14 Depth=1 + andi $a2, $fp, 1 st.w $a2, $s5, 12 - bstrpick.d $a3, $s2, 2, 1 - lu12i.w 
$a2, 3 - ori $a2, $a2, 3905 - slli.d $a3, $a3, 2 - pcalau12i $a4, %pc_hi20(.LJTI6_1) - addi.d $a4, $a4, %pc_lo12(.LJTI6_1) - ldx.w $a3, $a4, $a3 - add.d $a3, $a4, $a3 - jr $a3 -.LBB6_69: # in Loop: Header=BB6_16 Depth=1 + bstrpick.d $a4, $fp, 2, 1 + ori $a2, $s4, 3905 + slli.d $a4, $a4, 2 + pcalau12i $a5, %pc_hi20(.LJTI6_1) + addi.d $a5, $a5, %pc_lo12(.LJTI6_1) + ldx.w $a4, $a5, $a4 + add.d $a4, $a5, $a4 + jr $a4 +.LBB6_68: # in Loop: Header=BB6_14 Depth=1 pcalau12i $a2, %pc_hi20(fixedtables.lenfix) addi.d $a2, $a2, %pc_lo12(fixedtables.lenfix) st.d $a2, $s5, 104 @@ -967,107 +974,107 @@ inflate: # @inflate pcalau12i $a2, %pc_hi20(fixedtables.distfix) addi.d $a2, $a2, %pc_lo12(fixedtables.distfix) st.d $a2, $s5, 112 - lu12i.w $a2, 3 - ori $a2, $a2, 3911 - ori $a3, $zero, 6 + ori $a2, $s4, 3911 + ori $a4, $zero, 6 st.w $a2, $s5, 8 - bne $a1, $a3, .LBB6_149 + bne $a1, $a4, .LBB6_149 b .LBB6_416 -.LBB6_70: # in Loop: Header=BB6_16 Depth=1 - bstrins.d $s0, $zero, 2, 0 -.LBB6_71: # %._crit_edge2098 - # in Loop: Header=BB6_16 Depth=1 - bstrpick.d $a2, $s2, 15, 0 - srli.d $a3, $s2, 16 - xor $a3, $a3, $a2 - lu12i.w $a4, 15 - ori $a4, $a4, 4095 - bne $a3, $a4, .LBB6_141 -# %bb.72: # in Loop: Header=BB6_16 Depth=1 - move $s2, $zero +.LBB6_69: # in Loop: Header=BB6_14 Depth=1 + bstrins.d $s1, $zero, 2, 0 +.LBB6_70: # %._crit_edge2098 + # in Loop: Header=BB6_14 Depth=1 + bstrpick.d $a2, $fp, 15, 0 + srli.d $a4, $fp, 16 + xor $a4, $a4, $a2 + lu12i.w $a5, 15 + ori $a5, $a5, 4095 + bne $a4, $a5, .LBB6_141 +# %bb.71: # in Loop: Header=BB6_14 Depth=1 + move $fp, $zero st.w $a2, $s5, 92 - lu12i.w $a2, 3 - ori $a2, $a2, 3906 - ori $a3, $zero, 6 + ori $a2, $s4, 3906 + ori $a4, $zero, 6 st.w $a2, $s5, 8 - move $s0, $zero - beq $a1, $a3, .LBB6_415 -.LBB6_73: # in Loop: Header=BB6_16 Depth=1 - lu12i.w $a2, 3 - ori $a2, $a2, 3907 + move $s1, $zero + beq $a1, $a4, .LBB6_415 +.LBB6_72: # in Loop: Header=BB6_14 Depth=1 + ori $a2, $s4, 3907 st.w $a2, $s5, 8 -.LBB6_74: # in Loop: Header=BB6_16 Depth=1 +.LBB6_73: # in Loop: Header=BB6_14 Depth=1 ld.w $a2, $s5, 92 - beqz $a2, .LBB6_77 -# %bb.75: # in Loop: Header=BB6_16 Depth=1 - move $a5, $s7 - sltu $a3, $a2, $s1 - masknez $a4, $s1, $a3 + beqz $a2, .LBB6_76 +# %bb.74: # in Loop: Header=BB6_14 Depth=1 + st.d $s3, $sp, 80 # 8-byte Folded Spill + st.d $s6, $sp, 96 # 8-byte Folded Spill + move $s6, $s4 + sltu $a3, $a2, $s2 + masknez $a4, $s2, $a3 maskeqz $a2, $a2, $a3 or $a2, $a2, $a4 - sltu $a3, $a2, $ra + sltu $a3, $a2, $s7 maskeqz $a2, $a2, $a3 - masknez $a3, $ra, $a3 - or $s7, $a2, $a3 - beqz $s7, .LBB6_411 -# %bb.76: # in Loop: Header=BB6_16 Depth=1 - st.d $a5, $sp, 88 # 8-byte Folded Spill - bstrpick.d $s6, $s7, 31, 0 - move $a0, $t8 + masknez $a3, $s7, $a3 + or $s4, $a2, $a3 + beqz $s4, .LBB6_411 +# %bb.75: # in Loop: Header=BB6_14 Depth=1 + move $s3, $s7 + bstrpick.d $s7, $s4, 31, 0 + move $a0, $ra move $a1, $s8 - move $a2, $s6 - move $fp, $ra - move $s4, $t8 + move $a2, $s7 + move $s0, $ra pcaddu18i $ra, %call36(memcpy) jirl $ra, $ra, 0 - move $ra, $fp + move $ra, $s0 ld.d $a0, $sp, 152 # 8-byte Folded Reload ld.d $a1, $sp, 144 # 8-byte Folded Reload - sub.w $s1, $s1, $s7 + sub.w $s2, $s2, $s4 ld.w $a2, $s5, 92 - add.d $fp, $s8, $s6 - sub.w $ra, $ra, $s7 - add.d $t8, $s4, $s6 - ld.d $a3, $sp, 88 # 8-byte Folded Reload - sub.d $a2, $a2, $s7 - move $s7, $a3 + add.d $s0, $s8, $s7 + sub.w $s3, $s3, $s4 + add.d $ra, $ra, $s7 + move $s7, $s3 + sub.d $a2, $a2, $s4 st.w $a2, $s5, 92 - b .LBB6_269 -.LBB6_77: # in Loop: Header=BB6_16 Depth=1 - st.w $s7, $s5, 8 - b .LBB6_14 
-.LBB6_78: # in Loop: Header=BB6_16 Depth=1 - move $fp, $s8 -.LBB6_79: # %._crit_edge1764 - # in Loop: Header=BB6_16 Depth=1 - andi $a4, $s2, 31 - addi.d $a2, $a4, 257 + ld.d $a3, $sp, 120 # 8-byte Folded Reload + move $s4, $s6 + ld.d $s6, $sp, 96 # 8-byte Folded Reload + ld.d $s3, $sp, 80 # 8-byte Folded Reload + b .LBB6_12 +.LBB6_76: # in Loop: Header=BB6_14 Depth=1 + st.w $s3, $s5, 8 + b .LBB6_342 +.LBB6_77: # in Loop: Header=BB6_14 Depth=1 + move $s0, $s8 +.LBB6_78: # %._crit_edge1764 + # in Loop: Header=BB6_14 Depth=1 + andi $a5, $fp, 31 + addi.d $a2, $a5, 257 st.w $a2, $s5, 132 - bstrpick.d $a3, $s2, 9, 5 - addi.d $a2, $a3, 1 + bstrpick.d $a4, $fp, 9, 5 + addi.d $a2, $a4, 1 st.w $a2, $s5, 136 - bstrpick.d $a2, $s2, 13, 10 + bstrpick.d $a2, $fp, 13, 10 addi.d $a2, $a2, 4 st.w $a2, $s5, 128 - srli.d $s2, $s2, 14 - ori $a5, $zero, 29 - addi.w $s0, $s0, -14 - bltu $a5, $a4, .LBB6_142 -# %bb.80: # %._crit_edge1764 - # in Loop: Header=BB6_16 Depth=1 - ori $a4, $zero, 30 - bgeu $a3, $a4, .LBB6_142 -# %bb.81: # %.split.thread - # in Loop: Header=BB6_16 Depth=1 + srli.d $fp, $fp, 14 + ori $a6, $zero, 29 + addi.w $s1, $s1, -14 + bltu $a6, $a5, .LBB6_142 +# %bb.79: # %._crit_edge1764 + # in Loop: Header=BB6_14 Depth=1 + ori $a5, $zero, 30 + bgeu $a4, $a5, .LBB6_142 +# %bb.80: # %.split.thread + # in Loop: Header=BB6_14 Depth=1 move $a5, $zero st.w $zero, $s5, 140 - lu12i.w $a3, 3 - ori $a3, $a3, 3909 + ori $a3, $s4, 3909 st.w $a3, $s5, 8 - move $s8, $fp - ld.d $fp, $sp, 80 # 8-byte Folded Reload -.LBB6_82: # %.preheader1275.preheader - # in Loop: Header=BB6_16 Depth=1 + move $s8, $s0 + ld.d $s0, $sp, 72 # 8-byte Folded Reload +.LBB6_81: # %.preheader1275.preheader + # in Loop: Header=BB6_14 Depth=1 bstrpick.d $a4, $a5, 31, 0 bstrpick.d $a6, $a2, 31, 0 pcalau12i $a3, %pc_hi20(inflate.order) @@ -1075,48 +1082,48 @@ inflate: # @inflate alsl.d $a3, $a4, $a3, 1 sub.d $a4, $a6, $a4 addi.d $a5, $a5, 1 - b .LBB6_85 -.LBB6_83: # %._crit_edge1774 - # in Loop: Header=BB6_85 Depth=2 + b .LBB6_84 +.LBB6_82: # %._crit_edge1774 + # in Loop: Header=BB6_84 Depth=2 ld.bu $a7, $s8, 0 - ori $s0, $s0, 8 - addi.w $s1, $s1, -1 + ori $s1, $s1, 8 + addi.w $s2, $s2, -1 addi.d $s8, $s8, 1 sll.d $a6, $a7, $a6 - add.d $s2, $a6, $s2 -.LBB6_84: # in Loop: Header=BB6_85 Depth=2 + add.d $fp, $a6, $fp +.LBB6_83: # in Loop: Header=BB6_84 Depth=2 ld.hu $a6, $a3, 0 - andi $a7, $s2, 7 + andi $a7, $fp, 7 st.w $a5, $s5, 140 slli.d $a6, $a6, 1 - stx.h $a7, $fp, $a6 - srli.d $s2, $s2, 3 - addi.w $s0, $s0, -3 + stx.h $a7, $s0, $a6 + srli.d $fp, $fp, 3 + addi.w $s1, $s1, -3 addi.d $a3, $a3, 2 addi.d $a4, $a4, -1 addi.d $a5, $a5, 1 - beqz $a4, .LBB6_87 -.LBB6_85: # %.preheader1275 - # Parent Loop BB6_16 Depth=1 + beqz $a4, .LBB6_86 +.LBB6_84: # %.preheader1275 + # Parent Loop BB6_14 Depth=1 # => This Inner Loop Header: Depth=2 - addi.w $a6, $s0, 0 + addi.w $a6, $s1, 0 ori $a7, $zero, 2 - bltu $a7, $a6, .LBB6_84 -# %bb.86: # %.lr.ph1773 - # in Loop: Header=BB6_85 Depth=2 - bnez $s1, .LBB6_83 - b .LBB6_378 -.LBB6_87: # in Loop: Header=BB6_16 Depth=1 + bltu $a7, $a6, .LBB6_83 +# %bb.85: # %.lr.ph1773 + # in Loop: Header=BB6_84 Depth=2 + bnez $s2, .LBB6_82 + b .LBB6_376 +.LBB6_86: # in Loop: Header=BB6_14 Depth=1 move $a5, $a2 -.LBB6_88: # %.preheader1304 - # in Loop: Header=BB6_16 Depth=1 - move $s6, $t8 - move $s4, $ra +.LBB6_87: # %.preheader1304 + # in Loop: Header=BB6_14 Depth=1 + st.d $s6, $sp, 96 # 8-byte Folded Spill + move $s6, $ra addi.w $a4, $a5, 0 ori $a0, $zero, 18 - bltu $a0, $a4, .LBB6_94 -# %bb.89: # %.lr.ph1933.preheader - # 
in Loop: Header=BB6_16 Depth=1 + bltu $a0, $a4, .LBB6_93 +# %bb.88: # %.lr.ph1933.preheader + # in Loop: Header=BB6_14 Depth=1 addi.w $a2, $a5, 1 ori $a0, $zero, 20 sub.d $a1, $a0, $a2 @@ -1124,1775 +1131,1767 @@ inflate: # @inflate pcalau12i $a0, %pc_hi20(inflate.order) addi.d $a0, $a0, %pc_lo12(inflate.order) bgeu $a1, $a3, .LBB6_162 -.LBB6_90: # in Loop: Header=BB6_16 Depth=1 +.LBB6_89: # in Loop: Header=BB6_14 Depth=1 move $a2, $a4 -.LBB6_91: # %.lr.ph1933.preheader3804 - # in Loop: Header=BB6_16 Depth=1 +.LBB6_90: # %.lr.ph1933.preheader3804 + # in Loop: Header=BB6_14 Depth=1 alsl.d $a0, $a2, $a0, 1 addi.d $a1, $a2, 1 -.LBB6_92: # %.lr.ph1933 - # Parent Loop BB6_16 Depth=1 +.LBB6_91: # %.lr.ph1933 + # Parent Loop BB6_14 Depth=1 # => This Inner Loop Header: Depth=2 ld.hu $a2, $a0, 0 slli.d $a2, $a2, 1 - stx.h $zero, $fp, $a2 + stx.h $zero, $s0, $a2 bstrpick.d $a2, $a1, 31, 0 addi.d $a0, $a0, 2 ori $a3, $zero, 19 addi.d $a1, $a1, 1 - bne $a2, $a3, .LBB6_92 -.LBB6_93: # %._crit_edge1934 - # in Loop: Header=BB6_16 Depth=1 + bne $a2, $a3, .LBB6_91 +.LBB6_92: # %._crit_edge1934 + # in Loop: Header=BB6_14 Depth=1 ori $a0, $zero, 19 st.w $a0, $s5, 140 -.LBB6_94: # in Loop: Header=BB6_16 Depth=1 - ld.d $a0, $sp, 48 # 8-byte Folded Reload +.LBB6_93: # in Loop: Header=BB6_14 Depth=1 + ld.d $a0, $sp, 40 # 8-byte Folded Reload st.d $a0, $s5, 144 st.d $a0, $s5, 104 ori $a0, $zero, 7 st.w $a0, $s5, 120 ori $a2, $zero, 19 move $a0, $zero - move $a1, $fp - ld.d $a3, $sp, 64 # 8-byte Folded Reload - ld.d $a4, $sp, 40 # 8-byte Folded Reload - ld.d $a5, $sp, 56 # 8-byte Folded Reload + move $a1, $s0 + ld.d $a3, $sp, 56 # 8-byte Folded Reload + ld.d $a4, $sp, 32 # 8-byte Folded Reload + ld.d $a5, $sp, 48 # 8-byte Folded Reload pcaddu18i $ra, %call36(inflate_table) jirl $ra, $ra, 0 - beqz $a0, .LBB6_96 -# %bb.95: # in Loop: Header=BB6_16 Depth=1 - st.d $a0, $sp, 96 # 8-byte Folded Spill + beqz $a0, .LBB6_95 +# %bb.94: # in Loop: Header=BB6_14 Depth=1 + st.d $a0, $sp, 88 # 8-byte Folded Spill pcalau12i $a0, %pc_hi20(.L.str.9) addi.d $a1, $a0, %pc_lo12(.L.str.9) b .LBB6_174 -.LBB6_96: # in Loop: Header=BB6_16 Depth=1 - move $a6, $zero - st.d $zero, $sp, 96 # 8-byte Folded Spill +.LBB6_95: # in Loop: Header=BB6_14 Depth=1 + move $a7, $zero + st.d $zero, $sp, 88 # 8-byte Folded Spill st.w $zero, $s5, 140 - lu12i.w $a0, 3 - ori $a0, $a0, 3910 + ori $a0, $s4, 3910 st.w $a0, $s5, 8 ld.d $a1, $sp, 144 # 8-byte Folded Reload ld.d $a0, $sp, 152 # 8-byte Folded Reload - move $ra, $s4 - move $t8, $s6 - ld.d $s6, $sp, 128 # 8-byte Folded Reload - ld.d $s4, $sp, 104 # 8-byte Folded Reload + ld.d $a3, $sp, 120 # 8-byte Folded Reload + move $ra, $s6 + ld.d $s6, $sp, 96 # 8-byte Folded Reload + ld.d $t8, $sp, 128 # 8-byte Folded Reload ori $t7, $zero, 30 ld.d $t6, $sp, 136 # 8-byte Folded Reload -.LBB6_97: # in Loop: Header=BB6_16 Depth=1 +.LBB6_96: # in Loop: Header=BB6_14 Depth=1 ld.w $a2, $s5, 132 - ld.w $a3, $s5, 136 - add.w $a3, $a3, $a2 - bgeu $a6, $a3, .LBB6_136 -# %bb.98: # %.preheader1273.lr.ph - # in Loop: Header=BB6_16 Depth=1 - ld.wu $a5, $s5, 120 - ld.d $a4, $s5, 104 - addi.d $a7, $zero, -1 - sll.w $a5, $a7, $a5 - nor $a5, $a5, $zero - move $t1, $s1 - move $t2, $s8 -.LBB6_99: # %.preheader1273 - # Parent Loop BB6_16 Depth=1 + ld.w $a4, $s5, 136 + add.w $a4, $a4, $a2 + bgeu $a7, $a4, .LBB6_136 +# %bb.97: # %.preheader1273.lr.ph + # in Loop: Header=BB6_14 Depth=1 + ld.wu $a6, $s5, 120 + ld.d $a5, $s5, 104 + addi.d $t0, $zero, -1 + sll.w $a6, $t0, $a6 + nor $a6, $a6, $zero + move $t2, $s2 + move $t3, $s8 
+.LBB6_98: # %.preheader1273 + # Parent Loop BB6_14 Depth=1 # => This Loop Header: Depth=2 - # Child Loop BB6_101 Depth 3 + # Child Loop BB6_100 Depth 3 # Child Loop BB6_115 Depth 3 # Child Loop BB6_109 Depth 3 # Child Loop BB6_112 Depth 3 # Child Loop BB6_129 Depth 3 # Child Loop BB6_133 Depth 3 - and $a7, $a5, $s2 - addi.w $t3, $a7, 0 - alsl.d $a7, $t3, $a4, 2 - ld.bu $t0, $a7, 1 - addi.w $t4, $s0, 0 - bgeu $t4, $t0, .LBB6_103 -# %bb.100: # %.lr.ph1945.preheader - # in Loop: Header=BB6_99 Depth=2 - move $a7, $s0 - move $s1, $t1 - move $t5, $t2 -.LBB6_101: # %.lr.ph1945 - # Parent Loop BB6_16 Depth=1 - # Parent Loop BB6_99 Depth=2 + and $t0, $a6, $fp + addi.w $t4, $t0, 0 + alsl.d $t0, $t4, $a5, 2 + ld.bu $t1, $t0, 1 + addi.w $t5, $s1, 0 + bgeu $t5, $t1, .LBB6_103 +# %bb.99: # %.lr.ph1945.preheader + # in Loop: Header=BB6_98 Depth=2 + move $t0, $s1 + move $s2, $t2 + move $t6, $t3 +.LBB6_100: # %.lr.ph1945 + # Parent Loop BB6_14 Depth=1 + # Parent Loop BB6_98 Depth=2 # => This Inner Loop Header: Depth=3 - beqz $s1, .LBB6_382 -# %bb.102: # in Loop: Header=BB6_101 Depth=3 - ld.bu $t0, $t5, 0 - sll.d $t0, $t0, $t4 - add.d $s2, $t0, $s2 - and $t0, $a5, $s2 - addi.w $t3, $t0, 0 - alsl.d $t0, $t3, $a4, 2 - ld.bu $t0, $t0, 1 - addi.w $s1, $s1, -1 - addi.d $s8, $t5, 1 - addi.d $t4, $t4, 8 - addi.d $a7, $a7, 8 - move $t5, $s8 - bltu $t4, $t0, .LBB6_101 + beqz $s2, .LBB6_380 +# %bb.101: # in Loop: Header=BB6_100 Depth=3 + ld.bu $t1, $t6, 0 + sll.d $t1, $t1, $t5 + add.d $fp, $t1, $fp + and $t1, $a6, $fp + addi.w $t4, $t1, 0 + alsl.d $t1, $t4, $a5, 2 + ld.bu $t1, $t1, 1 + addi.w $s2, $s2, -1 + addi.d $s8, $t6, 1 + addi.d $t5, $t5, 8 + addi.d $t0, $t0, 8 + move $t6, $s8 + bltu $t5, $t1, .LBB6_100 +# %bb.102: # %._crit_edge1946.loopexit + # in Loop: Header=BB6_98 Depth=2 + ld.d $t6, $sp, 136 # 8-byte Folded Reload b .LBB6_104 -.LBB6_103: # in Loop: Header=BB6_99 Depth=2 - move $s8, $t2 - move $s1, $t1 - move $a7, $s0 +.LBB6_103: # in Loop: Header=BB6_98 Depth=2 + move $s8, $t3 + move $s2, $t2 + move $t0, $s1 .LBB6_104: # %._crit_edge1946 - # in Loop: Header=BB6_99 Depth=2 - alsl.d $t1, $t3, $a4, 2 - ld.hu $t1, $t1, 2 - ori $t2, $zero, 15 - bltu $t2, $t1, .LBB6_106 -# %bb.105: # in Loop: Header=BB6_99 Depth=2 - srl.d $s2, $s2, $t0 - sub.w $s0, $a7, $t0 - bstrpick.d $a7, $a6, 31, 0 - addi.w $a6, $a6, 1 - st.w $a6, $s5, 140 - slli.d $a7, $a7, 1 - stx.h $t1, $fp, $a7 + # in Loop: Header=BB6_98 Depth=2 + alsl.d $t2, $t4, $a5, 2 + ld.hu $t2, $t2, 2 + ori $t3, $zero, 15 + bltu $t3, $t2, .LBB6_106 +# %bb.105: # in Loop: Header=BB6_98 Depth=2 + srl.d $fp, $fp, $t1 + sub.w $s1, $t0, $t1 + bstrpick.d $t0, $a7, 31, 0 + addi.w $a7, $a7, 1 + st.w $a7, $s5, 140 + slli.d $t0, $t0, 1 + stx.h $t2, $s0, $t0 b .LBB6_135 -.LBB6_106: # in Loop: Header=BB6_99 Depth=2 - ori $t2, $zero, 16 - beq $t1, $t2, .LBB6_111 -# %bb.107: # in Loop: Header=BB6_99 Depth=2 - ori $t2, $zero, 17 - bne $t1, $t2, .LBB6_114 +.LBB6_106: # in Loop: Header=BB6_98 Depth=2 + ori $t3, $zero, 16 + beq $t2, $t3, .LBB6_111 +# %bb.107: # in Loop: Header=BB6_98 Depth=2 + ori $t3, $zero, 17 + bne $t2, $t3, .LBB6_114 # %bb.108: # %.preheader1271 - # in Loop: Header=BB6_99 Depth=2 - addi.d $t1, $t0, 3 - addi.w $t2, $a7, 0 - bgeu $t2, $t1, .LBB6_117 + # in Loop: Header=BB6_98 Depth=2 + addi.d $t2, $t1, 3 + addi.w $t3, $t0, 0 + bgeu $t3, $t2, .LBB6_117 .LBB6_109: # %.lr.ph1959 - # Parent Loop BB6_16 Depth=1 - # Parent Loop BB6_99 Depth=2 + # Parent Loop BB6_14 Depth=1 + # Parent Loop BB6_98 Depth=2 # => This Inner Loop Header: Depth=3 - beqz $s1, .LBB6_386 + beqz 
$s2, .LBB6_384 # %bb.110: # in Loop: Header=BB6_109 Depth=3 - ld.bu $t4, $s8, 0 - addi.w $s1, $s1, -1 - addi.d $t3, $s8, 1 - sll.d $t4, $t4, $t2 - add.d $s2, $t4, $s2 - addi.d $t2, $t2, 8 - addi.d $a7, $a7, 8 - move $s8, $t3 - bltu $t2, $t1, .LBB6_109 + ld.bu $t5, $s8, 0 + addi.w $s2, $s2, -1 + addi.d $t4, $s8, 1 + sll.d $t5, $t5, $t3 + add.d $fp, $t5, $fp + addi.d $t3, $t3, 8 + addi.d $t0, $t0, 8 + move $s8, $t4 + bltu $t3, $t2, .LBB6_109 b .LBB6_118 .LBB6_111: # %.preheader1269 - # in Loop: Header=BB6_99 Depth=2 - addi.d $t1, $t0, 2 - addi.w $t2, $a7, 0 - bgeu $t2, $t1, .LBB6_119 + # in Loop: Header=BB6_98 Depth=2 + addi.d $t2, $t1, 2 + addi.w $t3, $t0, 0 + bgeu $t3, $t2, .LBB6_119 .LBB6_112: # %.lr.ph1969 - # Parent Loop BB6_16 Depth=1 - # Parent Loop BB6_99 Depth=2 + # Parent Loop BB6_14 Depth=1 + # Parent Loop BB6_98 Depth=2 # => This Inner Loop Header: Depth=3 - beqz $s1, .LBB6_386 + beqz $s2, .LBB6_384 # %bb.113: # in Loop: Header=BB6_112 Depth=3 - ld.bu $t3, $s8, 0 - addi.w $s1, $s1, -1 - addi.d $fp, $s8, 1 - sll.d $t3, $t3, $t2 - add.d $s2, $t3, $s2 - addi.d $t2, $t2, 8 - addi.d $a7, $a7, 8 - move $s8, $fp - bltu $t2, $t1, .LBB6_112 + ld.bu $t4, $s8, 0 + addi.w $s2, $s2, -1 + addi.d $s0, $s8, 1 + sll.d $t4, $t4, $t3 + add.d $fp, $t4, $fp + addi.d $t3, $t3, 8 + addi.d $t0, $t0, 8 + move $s8, $s0 + bltu $t3, $t2, .LBB6_112 b .LBB6_120 .LBB6_114: # %.preheader1267 - # in Loop: Header=BB6_99 Depth=2 - addi.w $t1, $a7, 0 - addi.d $t2, $t0, 7 - bgeu $t1, $t2, .LBB6_122 + # in Loop: Header=BB6_98 Depth=2 + addi.w $t2, $t0, 0 + addi.d $t3, $t1, 7 + bgeu $t2, $t3, .LBB6_122 .LBB6_115: # %.lr.ph1979 - # Parent Loop BB6_16 Depth=1 - # Parent Loop BB6_99 Depth=2 + # Parent Loop BB6_14 Depth=1 + # Parent Loop BB6_98 Depth=2 # => This Inner Loop Header: Depth=3 - beqz $s1, .LBB6_386 + beqz $s2, .LBB6_384 # %bb.116: # in Loop: Header=BB6_115 Depth=3 - ld.bu $t4, $s8, 0 - addi.w $s1, $s1, -1 - addi.d $t3, $s8, 1 - sll.d $t4, $t4, $t1 - add.d $s2, $t4, $s2 - addi.d $t1, $t1, 8 - addi.d $a7, $a7, 8 - move $s8, $t3 - bltu $t1, $t2, .LBB6_115 + ld.bu $t5, $s8, 0 + addi.w $s2, $s2, -1 + addi.d $t4, $s8, 1 + sll.d $t5, $t5, $t2 + add.d $fp, $t5, $fp + addi.d $t2, $t2, 8 + addi.d $t0, $t0, 8 + move $s8, $t4 + bltu $t2, $t3, .LBB6_115 b .LBB6_123 -.LBB6_117: # in Loop: Header=BB6_99 Depth=2 - move $t3, $s8 +.LBB6_117: # in Loop: Header=BB6_98 Depth=2 + move $t4, $s8 .LBB6_118: # %._crit_edge1960 - # in Loop: Header=BB6_99 Depth=2 - move $t1, $zero - srl.d $t4, $s2, $t0 - andi $t2, $t4, 7 - addi.d $t2, $t2, 3 - srli.d $s2, $t4, 3 - sub.d $a7, $a7, $t0 - addi.w $s0, $a7, -3 + # in Loop: Header=BB6_98 Depth=2 + move $t2, $zero + srl.d $t5, $fp, $t1 + andi $t3, $t5, 7 + addi.d $t3, $t3, 3 + srli.d $fp, $t5, 3 + sub.d $t0, $t0, $t1 + addi.w $s1, $t0, -3 b .LBB6_124 -.LBB6_119: # in Loop: Header=BB6_99 Depth=2 - move $fp, $s8 +.LBB6_119: # in Loop: Header=BB6_98 Depth=2 + move $s0, $s8 .LBB6_120: # %._crit_edge1970 - # in Loop: Header=BB6_99 Depth=2 - srl.d $s2, $s2, $t0 - sub.w $s0, $a7, $t0 - beqz $a6, .LBB6_377 -# %bb.121: # in Loop: Header=BB6_99 Depth=2 - addi.d $a7, $a6, -1 - bstrpick.d $a7, $a7, 31, 0 - slli.d $a7, $a7, 1 - ld.d $t0, $sp, 80 # 8-byte Folded Reload - ldx.hu $t1, $t0, $a7 - andi $a7, $s2, 3 - addi.d $t2, $a7, 3 - srli.d $s2, $s2, 2 - addi.w $s0, $s0, -2 - move $s8, $fp - move $fp, $t0 + # in Loop: Header=BB6_98 Depth=2 + srl.d $fp, $fp, $t1 + sub.w $s1, $t0, $t1 + beqz $a7, .LBB6_374 +# %bb.121: # in Loop: Header=BB6_98 Depth=2 + addi.d $t0, $a7, -1 + bstrpick.d $t0, $t0, 31, 0 + slli.d 
$t0, $t0, 1 + ld.d $t1, $sp, 72 # 8-byte Folded Reload + ldx.hu $t2, $t1, $t0 + andi $t0, $fp, 3 + addi.d $t3, $t0, 3 + srli.d $fp, $fp, 2 + addi.w $s1, $s1, -2 + move $s8, $s0 + move $s0, $t1 b .LBB6_125 -.LBB6_122: # in Loop: Header=BB6_99 Depth=2 - move $t3, $s8 +.LBB6_122: # in Loop: Header=BB6_98 Depth=2 + move $t4, $s8 .LBB6_123: # %._crit_edge1980 - # in Loop: Header=BB6_99 Depth=2 - move $t1, $zero - srl.d $t4, $s2, $t0 - andi $t2, $t4, 127 - addi.d $t2, $t2, 11 - srli.d $s2, $t4, 7 - sub.d $a7, $a7, $t0 - addi.w $s0, $a7, -7 -.LBB6_124: # in Loop: Header=BB6_99 Depth=2 - move $s8, $t3 -.LBB6_125: # in Loop: Header=BB6_99 Depth=2 - add.w $a7, $t2, $a6 - bltu $a3, $a7, .LBB6_373 + # in Loop: Header=BB6_98 Depth=2 + move $t2, $zero + srl.d $t5, $fp, $t1 + andi $t3, $t5, 127 + addi.d $t3, $t3, 11 + srli.d $fp, $t5, 7 + sub.d $t0, $t0, $t1 + addi.w $s1, $t0, -7 +.LBB6_124: # in Loop: Header=BB6_98 Depth=2 + move $s8, $t4 +.LBB6_125: # in Loop: Header=BB6_98 Depth=2 + add.w $t0, $t3, $a7 + bltu $a4, $t0, .LBB6_370 # %bb.126: # %.preheader.preheader - # in Loop: Header=BB6_99 Depth=2 - ori $a7, $zero, 16 - bltu $t2, $a7, .LBB6_131 + # in Loop: Header=BB6_98 Depth=2 + ori $t0, $zero, 16 + bltu $t3, $t0, .LBB6_131 # %bb.127: # %.preheader.preheader - # in Loop: Header=BB6_99 Depth=2 - sub.d $a7, $zero, $t2 - bltu $a7, $a6, .LBB6_131 + # in Loop: Header=BB6_98 Depth=2 + sub.d $t0, $zero, $t3 + bltu $t0, $a7, .LBB6_131 # %bb.128: # %vector.ph3775 - # in Loop: Header=BB6_99 Depth=2 - andi $t3, $t2, 240 - andi $a7, $t2, 15 - add.w $t0, $a6, $t3 - vreplgr2vr.h $vr0, $t1 - move $t4, $t3 + # in Loop: Header=BB6_98 Depth=2 + andi $t4, $t3, 240 + andi $t0, $t3, 15 + add.w $t1, $a7, $t4 + vreplgr2vr.h $vr0, $t2 + move $t5, $t4 .LBB6_129: # %vector.body3778 - # Parent Loop BB6_16 Depth=1 - # Parent Loop BB6_99 Depth=2 + # Parent Loop BB6_14 Depth=1 + # Parent Loop BB6_98 Depth=2 # => This Inner Loop Header: Depth=3 - bstrpick.d $t5, $a6, 31, 0 - alsl.d $t6, $t5, $fp, 1 - slli.d $t5, $t5, 1 - vstx $vr0, $fp, $t5 - vst $vr0, $t6, 16 - addi.w $t4, $t4, -16 - addi.w $a6, $a6, 16 - bnez $t4, .LBB6_129 + bstrpick.d $t6, $a7, 31, 0 + alsl.d $t7, $t6, $s0, 1 + slli.d $t6, $t6, 1 + vstx $vr0, $s0, $t6 + vst $vr0, $t7, 16 + addi.w $t5, $t5, -16 + addi.w $a7, $a7, 16 + bnez $t5, .LBB6_129 # %bb.130: # %middle.block3781 - # in Loop: Header=BB6_99 Depth=2 + # in Loop: Header=BB6_98 Depth=2 + ori $t7, $zero, 30 ld.d $t6, $sp, 136 # 8-byte Folded Reload - bne $t2, $t3, .LBB6_132 + bne $t3, $t4, .LBB6_132 b .LBB6_134 -.LBB6_131: # in Loop: Header=BB6_99 Depth=2 - move $a7, $t2 - move $t0, $a6 +.LBB6_131: # in Loop: Header=BB6_98 Depth=2 + move $t0, $t3 + move $t1, $a7 .LBB6_132: # %.preheader.preheader3803 - # in Loop: Header=BB6_99 Depth=2 - move $a6, $t0 + # in Loop: Header=BB6_98 Depth=2 + move $a7, $t1 .LBB6_133: # %.preheader - # Parent Loop BB6_16 Depth=1 - # Parent Loop BB6_99 Depth=2 + # Parent Loop BB6_14 Depth=1 + # Parent Loop BB6_98 Depth=2 # => This Inner Loop Header: Depth=3 - addi.w $a7, $a7, -1 - addi.w $t0, $a6, 1 - bstrpick.d $a6, $a6, 31, 0 - slli.d $a6, $a6, 1 - stx.h $t1, $fp, $a6 - move $a6, $t0 - bnez $a7, .LBB6_133 + addi.w $t0, $t0, -1 + addi.w $t1, $a7, 1 + bstrpick.d $a7, $a7, 31, 0 + slli.d $a7, $a7, 1 + stx.h $t2, $s0, $a7 + move $a7, $t1 + bnez $t0, .LBB6_133 .LBB6_134: # %.loopexit - # in Loop: Header=BB6_99 Depth=2 - st.w $t0, $s5, 140 - move $a6, $t0 -.LBB6_135: # in Loop: Header=BB6_99 Depth=2 - move $t1, $s1 - move $t2, $s8 - bltu $a6, $a3, .LBB6_99 + # in Loop: Header=BB6_98 
Depth=2 + st.w $t1, $s5, 140 + move $a7, $t1 +.LBB6_135: # in Loop: Header=BB6_98 Depth=2 + move $t2, $s2 + move $t3, $s8 + bltu $a7, $a4, .LBB6_98 .LBB6_136: # %._crit_edge1996 - # in Loop: Header=BB6_16 Depth=1 - ld.hu $a3, $s5, 664 - beqz $a3, .LBB6_161 -# %bb.137: # in Loop: Header=BB6_16 Depth=1 - move $s6, $t8 - move $s4, $ra - ld.d $a0, $sp, 48 # 8-byte Folded Reload + # in Loop: Header=BB6_14 Depth=1 + ld.hu $a4, $s5, 664 + beqz $a4, .LBB6_161 +# %bb.137: # in Loop: Header=BB6_14 Depth=1 + st.d $s6, $sp, 96 # 8-byte Folded Spill + move $s6, $ra + ld.d $a0, $sp, 40 # 8-byte Folded Reload st.d $a0, $s5, 144 st.d $a0, $s5, 104 ori $a0, $zero, 9 st.w $a0, $s5, 120 ori $a0, $zero, 1 - move $a1, $fp - ld.d $a3, $sp, 64 # 8-byte Folded Reload - ld.d $a4, $sp, 40 # 8-byte Folded Reload - ld.d $a5, $sp, 56 # 8-byte Folded Reload + move $a1, $s0 + ld.d $a3, $sp, 56 # 8-byte Folded Reload + ld.d $a4, $sp, 32 # 8-byte Folded Reload + ld.d $a5, $sp, 48 # 8-byte Folded Reload pcaddu18i $ra, %call36(inflate_table) jirl $ra, $ra, 0 beqz $a0, .LBB6_172 -# %bb.138: # in Loop: Header=BB6_16 Depth=1 - st.d $a0, $sp, 96 # 8-byte Folded Spill +# %bb.138: # in Loop: Header=BB6_14 Depth=1 + st.d $a0, $sp, 88 # 8-byte Folded Spill pcalau12i $a0, %pc_hi20(.L.str.12) addi.d $a1, $a0, %pc_lo12(.L.str.12) b .LBB6_174 -.LBB6_139: # in Loop: Header=BB6_16 Depth=1 - lu12i.w $a2, 3 - ori $a2, $a2, 3904 - b .LBB6_13 -.LBB6_140: # in Loop: Header=BB6_16 Depth=1 - move $a3, $zero - b .LBB6_278 -.LBB6_141: # in Loop: Header=BB6_16 Depth=1 +.LBB6_139: # in Loop: Header=BB6_14 Depth=1 + ori $a2, $s4, 3904 + b .LBB6_341 +.LBB6_140: # in Loop: Header=BB6_14 Depth=1 + move $a4, $zero + b .LBB6_273 +.LBB6_141: # in Loop: Header=BB6_14 Depth=1 pcalau12i $a2, %pc_hi20(.L.str.7) addi.d $a2, $a2, %pc_lo12(.L.str.7) - b .LBB6_345 -.LBB6_142: # in Loop: Header=BB6_16 Depth=1 + b .LBB6_340 +.LBB6_142: # in Loop: Header=BB6_14 Depth=1 pcalau12i $a2, %pc_hi20(.L.str.8) addi.d $a2, $a2, %pc_lo12(.L.str.8) - b .LBB6_287 -.LBB6_143: # in Loop: Header=BB6_16 Depth=1 + b .LBB6_282 +.LBB6_143: # in Loop: Header=BB6_14 Depth=1 pcalau12i $a2, %pc_hi20(.L.str.2) addi.d $a2, $a2, %pc_lo12(.L.str.2) - b .LBB6_345 -.LBB6_144: # in Loop: Header=BB6_16 Depth=1 - move $a5, $s8 - move $a2, $s1 - move $a4, $s0 + b .LBB6_340 +.LBB6_144: # in Loop: Header=BB6_14 Depth=1 + move $a6, $s8 + move $a2, $s2 + move $a5, $s1 .LBB6_145: # %._crit_edge2042 - # in Loop: Header=BB6_16 Depth=1 - ld.w $a6, $s5, 92 - addi.d $a7, $zero, -1 - sll.w $a7, $a7, $a3 - andn $a7, $s2, $a7 - add.d $a6, $a6, $a7 - lu12i.w $a7, 1 - ori $a7, $a7, 3052 - ldx.w $a7, $s5, $a7 - st.w $a6, $s5, 92 - srl.d $s2, $s2, $a3 - sub.d $s0, $a4, $a3 - add.d $a3, $a7, $a3 - stptr.w $a3, $s5, 7148 - move $s8, $a5 - move $s1, $a2 - b .LBB6_314 -.LBB6_146: # in Loop: Header=BB6_16 Depth=1 - lu12i.w $a2, 3 - ori $a2, $a2, 3908 + # in Loop: Header=BB6_14 Depth=1 + ld.w $a7, $s5, 92 + addi.d $t0, $zero, -1 + sll.w $t0, $t0, $a4 + andn $t0, $fp, $t0 + add.d $a7, $a7, $t0 + lu12i.w $t0, 1 + ori $t0, $t0, 3052 + ldx.w $t0, $s5, $t0 + st.w $a7, $s5, 92 + srl.d $fp, $fp, $a4 + sub.d $s1, $a5, $a4 + add.d $a4, $t0, $a4 + stptr.w $a4, $s5, 7148 + move $s8, $a6 + move $s2, $a2 + b .LBB6_309 +.LBB6_146: # in Loop: Header=BB6_14 Depth=1 + ori $a2, $s4, 3908 b .LBB6_148 -.LBB6_147: # in Loop: Header=BB6_16 Depth=1 +.LBB6_147: # in Loop: Header=BB6_14 Depth=1 pcalau12i $a2, %pc_hi20(.L.str.6) addi.d $a2, $a2, %pc_lo12(.L.str.6) st.d $a2, $a0, 48 - lu12i.w $a2, 3 - ori $a2, $a2, 3921 + ori $a2, $s4, 3921 
.LBB6_148: # %.sink.split - # in Loop: Header=BB6_16 Depth=1 + # in Loop: Header=BB6_14 Depth=1 st.w $a2, $s5, 8 -.LBB6_149: # in Loop: Header=BB6_16 Depth=1 - srli.d $s2, $s2, 3 - addi.w $s0, $s0, -3 - b .LBB6_15 -.LBB6_150: # in Loop: Header=BB6_16 Depth=1 - move $fp, $s8 +.LBB6_149: # in Loop: Header=BB6_14 Depth=1 + srli.d $fp, $fp, 3 + addi.w $s1, $s1, -3 + b .LBB6_13 +.LBB6_150: # in Loop: Header=BB6_14 Depth=1 + move $s0, $s8 .LBB6_151: # %._crit_edge2325 - # in Loop: Header=BB6_16 Depth=1 - move $s8, $s6 - st.d $s7, $sp, 88 # 8-byte Folded Spill - andi $a3, $a2, 2 - beqz $a3, .LBB6_167 + # in Loop: Header=BB6_14 Depth=1 + st.d $s7, $sp, 96 # 8-byte Folded Spill + move $s7, $s4 + andi $a4, $a2, 2 + beqz $a4, .LBB6_167 # %bb.152: # %._crit_edge2325 - # in Loop: Header=BB6_16 Depth=1 - lu12i.w $a3, 8 - ori $s6, $a3, 2847 - bne $s2, $s6, .LBB6_167 -# %bb.153: # in Loop: Header=BB6_16 Depth=1 - st.d $t8, $sp, 24 # 8-byte Folded Spill - move $s4, $ra + # in Loop: Header=BB6_14 Depth=1 + lu12i.w $a4, 8 + ori $s4, $a4, 2847 + bne $fp, $s4, .LBB6_167 +# %bb.153: # in Loop: Header=BB6_14 Depth=1 + move $s8, $s3 + move $s3, $s6 + move $s6, $ra ld.w $a0, $s5, 56 bnez $a0, .LBB6_155 -# %bb.154: # in Loop: Header=BB6_16 Depth=1 +# %bb.154: # in Loop: Header=BB6_14 Depth=1 ori $a0, $zero, 15 st.w $a0, $s5, 56 -.LBB6_155: # in Loop: Header=BB6_16 Depth=1 +.LBB6_155: # in Loop: Header=BB6_14 Depth=1 move $a0, $zero move $a1, $zero move $a2, $zero pcaddu18i $ra, %call36(crc32) jirl $ra, $ra, 0 st.d $a0, $s5, 32 - st.h $s6, $sp, 164 + st.h $s4, $sp, 164 addi.d $a1, $sp, 164 ori $a2, $zero, 2 pcaddu18i $ra, %call36(crc32) jirl $ra, $ra, 0 - move $s2, $zero - move $s0, $zero + move $fp, $zero + move $s1, $zero st.d $a0, $s5, 32 - lu12i.w $a0, 3 - ori $a0, $a0, 3893 + move $s4, $s7 + ori $a0, $s7, 3893 st.w $a0, $s5, 8 ld.d $a1, $sp, 144 # 8-byte Folded Reload ld.d $a0, $sp, 152 # 8-byte Folded Reload - ld.d $s7, $sp, 88 # 8-byte Folded Reload - move $ra, $s4 - ld.d $t8, $sp, 24 # 8-byte Folded Reload - move $s6, $s8 - b .LBB6_270 -.LBB6_156: # in Loop: Header=BB6_16 Depth=1 - move $fp, $s8 + ld.d $a3, $sp, 120 # 8-byte Folded Reload + ld.d $s7, $sp, 96 # 8-byte Folded Reload + move $ra, $s6 + move $s6, $s3 + move $s3, $s8 + b .LBB6_12 +.LBB6_156: # in Loop: Header=BB6_14 Depth=1 + move $s0, $s8 .LBB6_157: # %._crit_edge - # in Loop: Header=BB6_16 Depth=1 - ld.d $a4, $a0, 40 - ld.d $a7, $sp, 120 # 8-byte Folded Reload - sub.w $a2, $a7, $ra - ld.d $a6, $s5, 40 - bstrpick.d $a5, $a2, 31, 0 - add.d $a4, $a4, $a5 - st.d $a4, $a0, 40 - add.d $a6, $a6, $a5 - andi $a4, $a3, 4 - st.d $a6, $s5, 40 - beqz $a4, .LBB6_275 + # in Loop: Header=BB6_14 Depth=1 + ld.d $a5, $a0, 40 + ld.d $t0, $sp, 112 # 8-byte Folded Reload + sub.w $a2, $t0, $s7 + ld.d $a7, $s5, 40 + bstrpick.d $a6, $a2, 31, 0 + add.d $a5, $a5, $a6 + st.d $a5, $a0, 40 + add.d $a7, $a7, $a6 + andi $a5, $a4, 4 + st.d $a7, $s5, 40 + beqz $a5, .LBB6_270 # %bb.158: # %._crit_edge - # in Loop: Header=BB6_16 Depth=1 - beq $a7, $ra, .LBB6_275 -# %bb.159: # in Loop: Header=BB6_16 Depth=1 + # in Loop: Header=BB6_14 Depth=1 + beq $t0, $s7, .LBB6_270 +# %bb.159: # in Loop: Header=BB6_14 Depth=1 move $s8, $s7 - move $s7, $s6 - move $s4, $ra + move $s7, $s4 + move $s4, $s3 + move $s3, $s6 ld.w $a3, $s5, 24 ld.d $a0, $s5, 32 - move $s6, $t8 - sub.d $a1, $t8, $a5 - beqz $a3, .LBB6_273 -# %bb.160: # in Loop: Header=BB6_16 Depth=1 + move $s6, $ra + sub.d $a1, $ra, $a6 + beqz $a3, .LBB6_268 +# %bb.160: # in Loop: Header=BB6_14 Depth=1 pcaddu18i $ra, %call36(crc32) jirl 
$ra, $ra, 0 - b .LBB6_274 -.LBB6_161: # in Loop: Header=BB6_16 Depth=1 + b .LBB6_269 +.LBB6_161: # in Loop: Header=BB6_14 Depth=1 pcalau12i $a2, %pc_hi20(.L.str.11) addi.d $a2, $a2, %pc_lo12(.L.str.11) - b .LBB6_345 + b .LBB6_340 .LBB6_162: # %vector.scevcheck3785 - # in Loop: Header=BB6_16 Depth=1 + # in Loop: Header=BB6_14 Depth=1 ori $a3, $zero, 19 sub.d $a2, $a3, $a2 addi.w $a3, $a2, 0 addi.d $a6, $zero, -2 sub.w $a5, $a6, $a5 - bltu $a5, $a3, .LBB6_90 + bltu $a5, $a3, .LBB6_89 # %bb.163: # %vector.scevcheck3785 - # in Loop: Header=BB6_16 Depth=1 + # in Loop: Header=BB6_14 Depth=1 srli.d $a2, $a2, 32 - bnez $a2, .LBB6_90 + bnez $a2, .LBB6_89 # %bb.164: # %vector.ph3788 - # in Loop: Header=BB6_16 Depth=1 + # in Loop: Header=BB6_14 Depth=1 move $a3, $a1 bstrins.d $a3, $zero, 2, 0 add.d $a2, $a3, $a4 alsl.d $a4, $a4, $a0, 1 move $a5, $a3 .LBB6_165: # %vector.body3791 - # Parent Loop BB6_16 Depth=1 + # Parent Loop BB6_14 Depth=1 # => This Inner Loop Header: Depth=2 vld $vr0, $a4, 0 - vrepli.b $vr1, 0 - vilvh.h $vr2, $vr1, $vr0 - vilvh.w $vr3, $vr1, $vr2 - vilvl.w $vr2, $vr1, $vr2 - vilvl.h $vr0, $vr1, $vr0 - vilvh.w $vr4, $vr1, $vr0 - vilvl.w $vr0, $vr1, $vr0 + vbsrl.v $vr1, $vr0, 12 + vsllwil.wu.hu $vr1, $vr1, 0 + vsllwil.du.wu $vr1, $vr1, 0 + vbsrl.v $vr2, $vr0, 8 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.du.wu $vr2, $vr2, 0 + vshuf4i.h $vr3, $vr0, 14 + vsllwil.wu.hu $vr3, $vr3, 0 + vsllwil.du.wu $vr3, $vr3, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.du.wu $vr0, $vr0, 0 vpickve2gr.d $a6, $vr0, 0 slli.d $a6, $a6, 1 vpickve2gr.d $a7, $vr0, 1 slli.d $a7, $a7, 1 - vpickve2gr.d $t0, $vr4, 0 + vpickve2gr.d $t0, $vr3, 0 slli.d $t0, $t0, 1 - vpickve2gr.d $t1, $vr4, 1 + vpickve2gr.d $t1, $vr3, 1 slli.d $t1, $t1, 1 vpickve2gr.d $t2, $vr2, 0 slli.d $t2, $t2, 1 vpickve2gr.d $t3, $vr2, 1 slli.d $t3, $t3, 1 - vpickve2gr.d $t4, $vr3, 0 + vpickve2gr.d $t4, $vr1, 0 slli.d $t4, $t4, 1 - vpickve2gr.d $t5, $vr3, 1 + vpickve2gr.d $t5, $vr1, 1 slli.d $t5, $t5, 1 - stx.h $zero, $fp, $a6 - stx.h $zero, $fp, $a7 - stx.h $zero, $fp, $t0 - stx.h $zero, $fp, $t1 - stx.h $zero, $fp, $t2 - stx.h $zero, $fp, $t3 - stx.h $zero, $fp, $t4 - stx.h $zero, $fp, $t5 + stx.h $zero, $s0, $a6 + stx.h $zero, $s0, $a7 + stx.h $zero, $s0, $t0 + stx.h $zero, $s0, $t1 + stx.h $zero, $s0, $t2 + stx.h $zero, $s0, $t3 + stx.h $zero, $s0, $t4 + stx.h $zero, $s0, $t5 addi.d $a5, $a5, -8 addi.d $a4, $a4, 16 bnez $a5, .LBB6_165 # %bb.166: # %middle.block3796 - # in Loop: Header=BB6_16 Depth=1 - bne $a1, $a3, .LBB6_91 - b .LBB6_93 -.LBB6_167: # in Loop: Header=BB6_16 Depth=1 - ld.d $a3, $s5, 48 + # in Loop: Header=BB6_14 Depth=1 + bne $a1, $a3, .LBB6_90 + b .LBB6_92 +.LBB6_167: # in Loop: Header=BB6_14 Depth=1 + ld.d $a4, $s5, 48 st.w $zero, $s5, 24 - beqz $a3, .LBB6_169 -# %bb.168: # in Loop: Header=BB6_16 Depth=1 - addi.w $a4, $zero, -1 - lu32i.d $a4, 0 - st.w $a4, $a3, 72 -.LBB6_169: # in Loop: Header=BB6_16 Depth=1 + beqz $a4, .LBB6_169 +# %bb.168: # in Loop: Header=BB6_14 Depth=1 + addi.w $a5, $zero, -1 + lu32i.d $a5, 0 + st.w $a5, $a4, 72 +.LBB6_169: # in Loop: Header=BB6_14 Depth=1 andi $a2, $a2, 1 - ld.d $s7, $sp, 88 # 8-byte Folded Reload - move $s6, $s8 + move $s4, $s7 beqz $a2, .LBB6_171 -# %bb.170: # in Loop: Header=BB6_16 Depth=1 - slli.d $a2, $s2, 8 +# %bb.170: # in Loop: Header=BB6_14 Depth=1 + slli.d $a2, $fp, 8 bstrpick.d $a2, $a2, 15, 8 slli.d $a2, $a2, 8 - srli.d $a3, $s2, 8 - add.d $a2, $a2, $a3 - lu12i.w $a3, -270601 - ori $a3, $a3, 3039 - lu32i.d $a3, -270601 - lu52i.d $a3, $a3, -265 - mul.d $a2, $a2, $a3 - lu12i.w $a3, 
135300 - ori $a3, $a3, 529 - lu32i.d $a3, 135300 - lu52i.d $a3, $a3, 132 - bltu $a2, $a3, .LBB6_366 -.LBB6_171: # in Loop: Header=BB6_16 Depth=1 + srli.d $a4, $fp, 8 + add.d $a2, $a2, $a4 + lu12i.w $a4, -270601 + ori $a4, $a4, 3039 + lu32i.d $a4, -270601 + lu52i.d $a4, $a4, -265 + mul.d $a2, $a2, $a4 + lu12i.w $a4, 135300 + ori $a4, $a4, 529 + lu32i.d $a4, 135300 + lu52i.d $a4, $a4, 132 + bltu $a2, $a4, .LBB6_363 +.LBB6_171: # in Loop: Header=BB6_14 Depth=1 pcalau12i $a2, %pc_hi20(.L.str.1) addi.d $a2, $a2, %pc_lo12(.L.str.1) - b .LBB6_287 -.LBB6_172: # in Loop: Header=BB6_16 Depth=1 + st.d $a2, $a0, 48 + ori $a2, $s4, 3921 + st.w $a2, $s5, 8 + ld.d $s7, $sp, 96 # 8-byte Folded Reload + b .LBB6_13 +.LBB6_172: # in Loop: Header=BB6_14 Depth=1 ld.d $a0, $s5, 144 st.d $a0, $s5, 112 ld.wu $a0, $s5, 132 ld.w $a2, $s5, 136 - ori $fp, $zero, 6 - st.w $fp, $s5, 124 - ld.d $a1, $sp, 80 # 8-byte Folded Reload + ori $s0, $zero, 6 + st.w $s0, $s5, 124 + ld.d $a1, $sp, 72 # 8-byte Folded Reload alsl.d $a1, $a0, $a1, 1 ori $a0, $zero, 2 - ld.d $a3, $sp, 64 # 8-byte Folded Reload - ld.d $a4, $sp, 32 # 8-byte Folded Reload - ld.d $a5, $sp, 56 # 8-byte Folded Reload + ld.d $a3, $sp, 56 # 8-byte Folded Reload + ld.d $a4, $sp, 24 # 8-byte Folded Reload + ld.d $a5, $sp, 48 # 8-byte Folded Reload pcaddu18i $ra, %call36(inflate_table) jirl $ra, $ra, 0 - st.d $a0, $sp, 96 # 8-byte Folded Spill - beqz $a0, .LBB6_289 -# %bb.173: # in Loop: Header=BB6_16 Depth=1 + st.d $a0, $sp, 88 # 8-byte Folded Spill + beqz $a0, .LBB6_284 +# %bb.173: # in Loop: Header=BB6_14 Depth=1 pcalau12i $a0, %pc_hi20(.L.str.13) addi.d $a1, $a0, %pc_lo12(.L.str.13) .LBB6_174: # %.thread - # in Loop: Header=BB6_16 Depth=1 + # in Loop: Header=BB6_14 Depth=1 ld.d $a0, $sp, 152 # 8-byte Folded Reload st.d $a1, $a0, 48 - lu12i.w $a1, 3 - ori $a1, $a1, 3921 + ori $a1, $s4, 3921 st.w $a1, $s5, 8 - move $fp, $s8 - b .LBB6_268 -.LBB6_175: # in Loop: Header=BB6_16 Depth=1 + move $s0, $s8 + ld.d $a1, $sp, 144 # 8-byte Folded Reload + ld.d $a3, $sp, 120 # 8-byte Folded Reload + move $ra, $s6 + ld.d $s6, $sp, 96 # 8-byte Folded Reload + b .LBB6_12 +.LBB6_175: # in Loop: Header=BB6_14 Depth=1 ld.d $a2, $s5, 48 beqz $a2, .LBB6_177 -# %bb.176: # in Loop: Header=BB6_16 Depth=1 - bstrpick.d $a3, $s2, 8, 8 - st.w $a3, $a2, 0 -.LBB6_177: # in Loop: Header=BB6_16 Depth=1 - andi $a2, $s2, 512 +# %bb.176: # in Loop: Header=BB6_14 Depth=1 + bstrpick.d $a4, $fp, 8, 8 + st.w $a4, $a2, 0 +.LBB6_177: # in Loop: Header=BB6_14 Depth=1 + andi $a2, $fp, 512 beqz $a2, .LBB6_180 -# %bb.178: # in Loop: Header=BB6_16 Depth=1 +# %bb.178: # in Loop: Header=BB6_14 Depth=1 ld.bu $a2, $s5, 16 andi $a2, $a2, 4 beqz $a2, .LBB6_180 -# %bb.179: # in Loop: Header=BB6_16 Depth=1 +# %bb.179: # in Loop: Header=BB6_14 Depth=1 ld.d $a0, $s5, 32 - st.h $s2, $sp, 164 + st.h $fp, $sp, 164 addi.d $a1, $sp, 164 ori $a2, $zero, 2 move $fp, $ra - move $s0, $t8 pcaddu18i $ra, %call36(crc32) jirl $ra, $ra, 0 ld.d $t6, $sp, 136 # 8-byte Folded Reload ori $t7, $zero, 30 - move $t8, $s0 + ld.d $t8, $sp, 128 # 8-byte Folded Reload move $ra, $fp ld.d $a1, $sp, 144 # 8-byte Folded Reload move $a2, $a0 + ld.d $a3, $sp, 120 # 8-byte Folded Reload ld.d $a0, $sp, 152 # 8-byte Folded Reload st.d $a2, $s5, 32 .LBB6_180: # %.thread2893 - # in Loop: Header=BB6_16 Depth=1 - move $s0, $zero - move $s2, $zero - lu12i.w $a2, 3 - ori $a2, $a2, 3894 + # in Loop: Header=BB6_14 Depth=1 + move $s1, $zero + move $fp, $zero + ori $a2, $s4, 3894 st.w $a2, $s5, 8 .LBB6_181: # %.lr.ph2286.preheader - # in Loop: Header=BB6_16 
Depth=1 - addi.w $a3, $s0, 0 + # in Loop: Header=BB6_14 Depth=1 + addi.w $a4, $s1, 0 move $a2, $s8 .LBB6_182: # %.lr.ph2286 - # Parent Loop BB6_16 Depth=1 + # Parent Loop BB6_14 Depth=1 # => This Inner Loop Header: Depth=2 - beqz $s1, .LBB6_379 + beqz $s2, .LBB6_377 # %bb.183: # in Loop: Header=BB6_182 Depth=2 - move $a4, $a3 - ld.bu $a3, $a2, 0 - addi.w $s1, $s1, -1 + move $a5, $a4 + ld.bu $a4, $a2, 0 + addi.w $s2, $s2, -1 addi.d $s8, $a2, 1 - sll.d $a2, $a3, $a4 - add.d $s2, $a2, $s2 - addi.d $a3, $a4, 8 - ori $a5, $zero, 24 - addi.d $s0, $s0, 8 + sll.d $a2, $a4, $a5 + add.d $fp, $a2, $fp + addi.d $a4, $a5, 8 + ori $a6, $zero, 24 + addi.d $s1, $s1, 8 move $a2, $s8 - bltu $a4, $a5, .LBB6_182 + bltu $a5, $a6, .LBB6_182 .LBB6_184: # %._crit_edge2287 - # in Loop: Header=BB6_16 Depth=1 + # in Loop: Header=BB6_14 Depth=1 ld.d $a2, $s5, 48 beqz $a2, .LBB6_186 -# %bb.185: # in Loop: Header=BB6_16 Depth=1 - st.d $s2, $a2, 8 -.LBB6_186: # in Loop: Header=BB6_16 Depth=1 +# %bb.185: # in Loop: Header=BB6_14 Depth=1 + st.d $fp, $a2, 8 +.LBB6_186: # in Loop: Header=BB6_14 Depth=1 ld.bu $a2, $s5, 25 andi $a2, $a2, 2 beqz $a2, .LBB6_189 -# %bb.187: # in Loop: Header=BB6_16 Depth=1 +# %bb.187: # in Loop: Header=BB6_14 Depth=1 ld.bu $a2, $s5, 16 andi $a2, $a2, 4 beqz $a2, .LBB6_189 -# %bb.188: # in Loop: Header=BB6_16 Depth=1 +# %bb.188: # in Loop: Header=BB6_14 Depth=1 ld.d $a0, $s5, 32 - st.w $s2, $sp, 164 + st.w $fp, $sp, 164 addi.d $a1, $sp, 164 ori $a2, $zero, 4 move $fp, $ra - move $s0, $t8 pcaddu18i $ra, %call36(crc32) jirl $ra, $ra, 0 ld.d $t6, $sp, 136 # 8-byte Folded Reload ori $t7, $zero, 30 - move $t8, $s0 + ld.d $t8, $sp, 128 # 8-byte Folded Reload move $ra, $fp ld.d $a1, $sp, 144 # 8-byte Folded Reload move $a2, $a0 + ld.d $a3, $sp, 120 # 8-byte Folded Reload ld.d $a0, $sp, 152 # 8-byte Folded Reload st.d $a2, $s5, 32 .LBB6_189: # %.thread2902 - # in Loop: Header=BB6_16 Depth=1 - move $s0, $zero - move $s2, $zero - lu12i.w $a2, 3 - ori $a2, $a2, 3895 + # in Loop: Header=BB6_14 Depth=1 + move $s1, $zero + move $fp, $zero + ori $a2, $s4, 3895 st.w $a2, $s5, 8 .LBB6_190: # %.lr.ph2296.preheader - # in Loop: Header=BB6_16 Depth=1 - addi.w $a3, $s0, 0 + # in Loop: Header=BB6_14 Depth=1 + addi.w $a4, $s1, 0 move $a2, $s8 .LBB6_191: # %.lr.ph2296 - # Parent Loop BB6_16 Depth=1 + # Parent Loop BB6_14 Depth=1 # => This Inner Loop Header: Depth=2 - beqz $s1, .LBB6_379 + beqz $s2, .LBB6_377 # %bb.192: # in Loop: Header=BB6_191 Depth=2 - move $a4, $a3 - ld.bu $a3, $a2, 0 - addi.w $s1, $s1, -1 + move $a5, $a4 + ld.bu $a4, $a2, 0 + addi.w $s2, $s2, -1 addi.d $s8, $a2, 1 - sll.d $a2, $a3, $a4 - add.d $s2, $a2, $s2 - addi.d $a3, $a4, 8 - ori $a5, $zero, 8 - addi.d $s0, $s0, 8 + sll.d $a2, $a4, $a5 + add.d $fp, $a2, $fp + addi.d $a4, $a5, 8 + ori $a6, $zero, 8 + addi.d $s1, $s1, 8 move $a2, $s8 - bltu $a4, $a5, .LBB6_191 + bltu $a5, $a6, .LBB6_191 .LBB6_193: # %._crit_edge2297 - # in Loop: Header=BB6_16 Depth=1 + # in Loop: Header=BB6_14 Depth=1 ld.d $a2, $s5, 48 beqz $a2, .LBB6_195 -# %bb.194: # in Loop: Header=BB6_16 Depth=1 - andi $a3, $s2, 255 - st.w $a3, $a2, 16 - srli.d $a3, $s2, 8 - st.w $a3, $a2, 20 -.LBB6_195: # in Loop: Header=BB6_16 Depth=1 +# %bb.194: # in Loop: Header=BB6_14 Depth=1 + andi $a4, $fp, 255 + st.w $a4, $a2, 16 + srli.d $a4, $fp, 8 + st.w $a4, $a2, 20 +.LBB6_195: # in Loop: Header=BB6_14 Depth=1 ld.bu $a2, $s5, 25 andi $a2, $a2, 2 beqz $a2, .LBB6_198 -# %bb.196: # in Loop: Header=BB6_16 Depth=1 +# %bb.196: # in Loop: Header=BB6_14 Depth=1 ld.bu $a2, $s5, 16 andi $a2, $a2, 4 beqz $a2, 
.LBB6_198 -# %bb.197: # in Loop: Header=BB6_16 Depth=1 +# %bb.197: # in Loop: Header=BB6_14 Depth=1 ld.d $a0, $s5, 32 - st.h $s2, $sp, 164 + st.h $fp, $sp, 164 addi.d $a1, $sp, 164 ori $a2, $zero, 2 move $fp, $ra - move $s0, $t8 pcaddu18i $ra, %call36(crc32) jirl $ra, $ra, 0 ld.d $t6, $sp, 136 # 8-byte Folded Reload ori $t7, $zero, 30 - move $t8, $s0 + ld.d $t8, $sp, 128 # 8-byte Folded Reload move $ra, $fp ld.d $a1, $sp, 144 # 8-byte Folded Reload move $a2, $a0 + ld.d $a3, $sp, 120 # 8-byte Folded Reload ld.d $a0, $sp, 152 # 8-byte Folded Reload st.d $a2, $s5, 32 .LBB6_198: # %.thread2911 - # in Loop: Header=BB6_16 Depth=1 + # in Loop: Header=BB6_14 Depth=1 ld.wu $a2, $s5, 24 - move $s2, $zero - move $s0, $zero - lu12i.w $a3, 3 - ori $a3, $a3, 3896 - andi $a4, $a2, 1024 - st.w $a3, $s5, 8 - bnez $a4, .LBB6_201 -.LBB6_199: # in Loop: Header=BB6_16 Depth=1 + move $fp, $zero + move $s1, $zero + ori $a4, $s4, 3896 + andi $a5, $a2, 1024 + st.w $a4, $s5, 8 + bnez $a5, .LBB6_201 +.LBB6_199: # in Loop: Header=BB6_14 Depth=1 ld.d $a2, $s5, 48 beqz $a2, .LBB6_210 -# %bb.200: # in Loop: Header=BB6_16 Depth=1 +# %bb.200: # in Loop: Header=BB6_14 Depth=1 st.d $zero, $a2, 24 b .LBB6_210 .LBB6_201: # %.lr.ph2305.preheader - # in Loop: Header=BB6_16 Depth=1 - addi.w $a5, $s0, 0 - move $a3, $s8 + # in Loop: Header=BB6_14 Depth=1 + addi.w $a6, $s1, 0 + move $a4, $s8 .LBB6_202: # %.lr.ph2305 - # Parent Loop BB6_16 Depth=1 + # Parent Loop BB6_14 Depth=1 # => This Inner Loop Header: Depth=2 - beqz $s1, .LBB6_381 + beqz $s2, .LBB6_379 # %bb.203: # in Loop: Header=BB6_202 Depth=2 - move $a4, $a5 - ld.bu $a5, $a3, 0 - addi.w $s1, $s1, -1 - addi.d $s8, $a3, 1 - sll.d $a3, $a5, $a4 - add.d $s2, $a3, $s2 - addi.d $a5, $a4, 8 - ori $a6, $zero, 8 - addi.d $s0, $s0, 8 - move $a3, $s8 - bltu $a4, $a6, .LBB6_202 + move $a5, $a6 + ld.bu $a6, $a4, 0 + addi.w $s2, $s2, -1 + addi.d $s8, $a4, 1 + sll.d $a4, $a6, $a5 + add.d $fp, $a4, $fp + addi.d $a6, $a5, 8 + ori $a7, $zero, 8 + addi.d $s1, $s1, 8 + move $a4, $s8 + bltu $a5, $a7, .LBB6_202 .LBB6_204: # %._crit_edge2306 - # in Loop: Header=BB6_16 Depth=1 - ld.d $a3, $s5, 48 - st.w $s2, $s5, 92 - beqz $a3, .LBB6_206 -# %bb.205: # in Loop: Header=BB6_16 Depth=1 - st.w $s2, $a3, 32 -.LBB6_206: # in Loop: Header=BB6_16 Depth=1 + # in Loop: Header=BB6_14 Depth=1 + ld.d $a4, $s5, 48 + st.w $fp, $s5, 92 + beqz $a4, .LBB6_206 +# %bb.205: # in Loop: Header=BB6_14 Depth=1 + st.w $fp, $a4, 32 +.LBB6_206: # in Loop: Header=BB6_14 Depth=1 andi $a2, $a2, 512 beqz $a2, .LBB6_208 -# %bb.207: # in Loop: Header=BB6_16 Depth=1 +# %bb.207: # in Loop: Header=BB6_14 Depth=1 ld.bu $a2, $s5, 16 andi $a2, $a2, 4 bnez $a2, .LBB6_209 -.LBB6_208: # in Loop: Header=BB6_16 Depth=1 - move $s2, $zero - move $s0, $zero +.LBB6_208: # in Loop: Header=BB6_14 Depth=1 + move $fp, $zero + move $s1, $zero b .LBB6_210 -.LBB6_209: # in Loop: Header=BB6_16 Depth=1 +.LBB6_209: # in Loop: Header=BB6_14 Depth=1 ld.d $a0, $s5, 32 - st.h $s2, $sp, 164 + st.h $fp, $sp, 164 addi.d $a1, $sp, 164 ori $a2, $zero, 2 move $fp, $ra - move $s0, $t8 pcaddu18i $ra, %call36(crc32) jirl $ra, $ra, 0 ld.d $t6, $sp, 136 # 8-byte Folded Reload ori $t7, $zero, 30 - move $t8, $s0 + ld.d $t8, $sp, 128 # 8-byte Folded Reload move $ra, $fp ld.d $a1, $sp, 144 # 8-byte Folded Reload move $a2, $a0 + ld.d $a3, $sp, 120 # 8-byte Folded Reload ld.d $a0, $sp, 152 # 8-byte Folded Reload - move $s2, $zero - move $s0, $zero + move $fp, $zero + move $s1, $zero st.d $a2, $s5, 32 -.LBB6_210: # in Loop: Header=BB6_16 Depth=1 - lu12i.w $a2, 3 - ori $a2, $a2, 
3897 +.LBB6_210: # in Loop: Header=BB6_14 Depth=1 + ori $a2, $s4, 3897 st.w $a2, $s5, 8 -.LBB6_211: # in Loop: Header=BB6_16 Depth=1 - ld.wu $a3, $s5, 24 - andi $a2, $a3, 1024 +.LBB6_211: # in Loop: Header=BB6_14 Depth=1 + ld.wu $a4, $s5, 24 + andi $a2, $a4, 1024 beqz $a2, .LBB6_221 -# %bb.212: # in Loop: Header=BB6_16 Depth=1 - move $fp, $s7 +# %bb.212: # in Loop: Header=BB6_14 Depth=1 + move $s0, $s7 ld.w $a2, $s5, 92 - sltu $a4, $a2, $s1 - masknez $a5, $s1, $a4 - maskeqz $a4, $a2, $a4 - or $s6, $a4, $a5 - beqz $s6, .LBB6_220 -# %bb.213: # in Loop: Header=BB6_16 Depth=1 - ld.d $a4, $s5, 48 - beqz $a4, .LBB6_216 -# %bb.214: # in Loop: Header=BB6_16 Depth=1 - ld.d $a5, $a4, 24 + sltu $a5, $a2, $s2 + masknez $a6, $s2, $a5 + maskeqz $a5, $a2, $a5 + or $s7, $a5, $a6 + beqz $s7, .LBB6_220 +# %bb.213: # in Loop: Header=BB6_14 Depth=1 + ld.d $a5, $s5, 48 beqz $a5, .LBB6_216 -# %bb.215: # in Loop: Header=BB6_16 Depth=1 - ld.w $a0, $a4, 32 +# %bb.214: # in Loop: Header=BB6_14 Depth=1 + ld.d $a6, $a5, 24 + beqz $a6, .LBB6_216 +# %bb.215: # in Loop: Header=BB6_14 Depth=1 + ld.w $a0, $a5, 32 sub.d $a1, $a0, $a2 - ld.w $a2, $a4, 36 + ld.w $a2, $a5, 36 bstrpick.d $a0, $a1, 31, 0 - add.d $a0, $a5, $a0 - add.w $a3, $a1, $s6 + add.d $a0, $a6, $a0 + add.w $a3, $a1, $s7 sltu $a3, $a2, $a3 sub.d $a1, $a2, $a1 maskeqz $a1, $a1, $a3 - masknez $a2, $s6, $a3 + masknez $a2, $s7, $a3 or $a1, $a1, $a2 bstrpick.d $a2, $a1, 31, 0 move $a1, $s8 - move $s4, $ra - move $s7, $t8 + move $s4, $s3 + move $s3, $s6 + move $s6, $ra pcaddu18i $ra, %call36(memcpy) jirl $ra, $ra, 0 ld.d $t6, $sp, 136 # 8-byte Folded Reload ori $t7, $zero, 30 - move $t8, $s7 - move $ra, $s4 - ld.d $s4, $sp, 104 # 8-byte Folded Reload + ld.d $t8, $sp, 128 # 8-byte Folded Reload + move $ra, $s6 + move $s6, $s3 + move $s3, $s4 + lu12i.w $s4, 3 + ld.d $a3, $sp, 120 # 8-byte Folded Reload ld.d $a0, $sp, 152 # 8-byte Folded Reload ld.d $a1, $sp, 144 # 8-byte Folded Reload - ld.w $a3, $s5, 24 -.LBB6_216: # in Loop: Header=BB6_16 Depth=1 - andi $a2, $a3, 512 + ld.w $a4, $s5, 24 +.LBB6_216: # in Loop: Header=BB6_14 Depth=1 + andi $a2, $a4, 512 beqz $a2, .LBB6_219 -# %bb.217: # in Loop: Header=BB6_16 Depth=1 +# %bb.217: # in Loop: Header=BB6_14 Depth=1 ld.bu $a2, $s5, 16 andi $a2, $a2, 4 beqz $a2, .LBB6_219 -# %bb.218: # in Loop: Header=BB6_16 Depth=1 +# %bb.218: # in Loop: Header=BB6_14 Depth=1 ld.d $a0, $s5, 32 move $a1, $s8 - move $a2, $s6 - move $s4, $ra - st.d $t8, $sp, 24 # 8-byte Folded Spill + move $a2, $s7 + move $s4, $s3 + move $s3, $s6 + move $s6, $ra pcaddu18i $ra, %call36(crc32) jirl $ra, $ra, 0 ld.d $t6, $sp, 136 # 8-byte Folded Reload ori $t7, $zero, 30 - ld.d $t8, $sp, 24 # 8-byte Folded Reload - move $ra, $s4 - ld.d $s4, $sp, 104 # 8-byte Folded Reload + ld.d $t8, $sp, 128 # 8-byte Folded Reload + move $ra, $s6 + move $s6, $s3 + move $s3, $s4 + lu12i.w $s4, 3 ld.d $a1, $sp, 144 # 8-byte Folded Reload move $a2, $a0 + ld.d $a3, $sp, 120 # 8-byte Folded Reload ld.d $a0, $sp, 152 # 8-byte Folded Reload st.d $a2, $s5, 32 -.LBB6_219: # in Loop: Header=BB6_16 Depth=1 +.LBB6_219: # in Loop: Header=BB6_14 Depth=1 ld.w $a2, $s5, 92 - sub.w $s1, $s1, $s6 - bstrpick.d $a3, $s6, 31, 0 - add.d $s8, $s8, $a3 - sub.w $a2, $a2, $s6 + sub.w $s2, $s2, $s7 + bstrpick.d $a4, $s7, 31, 0 + add.d $s8, $s8, $a4 + sub.w $a2, $a2, $s7 st.w $a2, $s5, 92 -.LBB6_220: # in Loop: Header=BB6_16 Depth=1 - move $s7, $fp - ld.d $s6, $sp, 128 # 8-byte Folded Reload - bnez $a2, .LBB6_389 -.LBB6_221: # in Loop: Header=BB6_16 Depth=1 +.LBB6_220: # in Loop: Header=BB6_14 Depth=1 + 
move $s7, $s0 + bnez $a2, .LBB6_390 +.LBB6_221: # in Loop: Header=BB6_14 Depth=1 st.w $zero, $s5, 92 - lu12i.w $a2, 3 - ori $a2, $a2, 3898 + ori $a2, $s4, 3898 st.w $a2, $s5, 8 -.LBB6_222: # in Loop: Header=BB6_16 Depth=1 +.LBB6_222: # in Loop: Header=BB6_14 Depth=1 ld.bu $a2, $s5, 25 andi $a2, $a2, 8 bnez $a2, .LBB6_225 -# %bb.223: # in Loop: Header=BB6_16 Depth=1 +# %bb.223: # in Loop: Header=BB6_14 Depth=1 ld.d $a2, $s5, 48 beqz $a2, .LBB6_238 -# %bb.224: # in Loop: Header=BB6_16 Depth=1 +# %bb.224: # in Loop: Header=BB6_14 Depth=1 st.d $zero, $a2, 40 b .LBB6_238 -.LBB6_225: # in Loop: Header=BB6_16 Depth=1 - beqz $s1, .LBB6_378 +.LBB6_225: # in Loop: Header=BB6_14 Depth=1 + beqz $s2, .LBB6_376 # %bb.226: # %.preheader1282.preheader - # in Loop: Header=BB6_16 Depth=1 - st.d $s7, $sp, 88 # 8-byte Folded Spill - move $s6, $zero - move $fp, $zero - bstrpick.d $a2, $s1, 31, 0 + # in Loop: Header=BB6_14 Depth=1 + st.d $s7, $sp, 96 # 8-byte Folded Spill + st.d $s3, $sp, 80 # 8-byte Folded Spill + move $s3, $s6 + move $s6, $s4 + move $s7, $zero + move $s0, $zero + bstrpick.d $a2, $s2, 31, 0 .LBB6_227: # %.preheader1282 - # Parent Loop BB6_16 Depth=1 + # Parent Loop BB6_14 Depth=1 # => This Inner Loop Header: Depth=2 - ld.d $a4, $s5, 48 - ldx.bu $s7, $s8, $fp - beqz $a4, .LBB6_231 + ld.d $a5, $s5, 48 + ldx.bu $s4, $s8, $s0 + beqz $a5, .LBB6_231 # %bb.228: # in Loop: Header=BB6_227 Depth=2 - ld.d $a3, $a4, 40 - beqz $a3, .LBB6_231 + ld.d $a4, $a5, 40 + beqz $a4, .LBB6_231 # %bb.229: # in Loop: Header=BB6_227 Depth=2 - ld.w $a5, $s5, 92 - ld.w $a4, $a4, 48 - bgeu $a5, $a4, .LBB6_231 + ld.w $a6, $s5, 92 + ld.w $a5, $a5, 48 + bgeu $a6, $a5, .LBB6_231 # %bb.230: # in Loop: Header=BB6_227 Depth=2 - bstrpick.d $a4, $a5, 31, 0 - addi.d $a5, $a5, 1 - st.w $a5, $s5, 92 - stx.b $s7, $a3, $a4 + bstrpick.d $a5, $a6, 31, 0 + addi.d $a6, $a6, 1 + st.w $a6, $s5, 92 + stx.b $s4, $a4, $a5 .LBB6_231: # in Loop: Header=BB6_227 Depth=2 - addi.d $fp, $fp, 1 - addi.w $s6, $s6, 1 - beqz $s7, .LBB6_233 + addi.d $s0, $s0, 1 + addi.w $s7, $s7, 1 + beqz $s4, .LBB6_233 # %bb.232: # in Loop: Header=BB6_227 Depth=2 - bltu $fp, $a2, .LBB6_227 -.LBB6_233: # in Loop: Header=BB6_16 Depth=1 + bltu $s0, $a2, .LBB6_227 +.LBB6_233: # in Loop: Header=BB6_14 Depth=1 ld.bu $a2, $s5, 25 andi $a2, $a2, 2 beqz $a2, .LBB6_236 -# %bb.234: # in Loop: Header=BB6_16 Depth=1 +# %bb.234: # in Loop: Header=BB6_14 Depth=1 ld.bu $a2, $s5, 16 andi $a2, $a2, 4 beqz $a2, .LBB6_236 -# %bb.235: # in Loop: Header=BB6_16 Depth=1 +# %bb.235: # in Loop: Header=BB6_14 Depth=1 ld.d $a0, $s5, 32 move $a1, $s8 - move $a2, $s6 + move $a2, $s7 st.d $ra, $sp, 16 # 8-byte Folded Spill - st.d $t8, $sp, 24 # 8-byte Folded Spill pcaddu18i $ra, %call36(crc32) jirl $ra, $ra, 0 ld.d $t6, $sp, 136 # 8-byte Folded Reload ori $t7, $zero, 30 - ld.d $t8, $sp, 24 # 8-byte Folded Reload + ld.d $t8, $sp, 128 # 8-byte Folded Reload ld.d $ra, $sp, 16 # 8-byte Folded Reload ld.d $a1, $sp, 144 # 8-byte Folded Reload move $a2, $a0 + ld.d $a3, $sp, 120 # 8-byte Folded Reload ld.d $a0, $sp, 152 # 8-byte Folded Reload st.d $a2, $s5, 32 -.LBB6_236: # in Loop: Header=BB6_16 Depth=1 - sub.w $s1, $s1, $s6 - bnez $s7, .LBB6_387 -# %bb.237: # in Loop: Header=BB6_16 Depth=1 - add.d $s8, $s8, $fp - ld.d $s6, $sp, 128 # 8-byte Folded Reload - ld.d $s7, $sp, 88 # 8-byte Folded Reload -.LBB6_238: # in Loop: Header=BB6_16 Depth=1 +.LBB6_236: # in Loop: Header=BB6_14 Depth=1 + sub.w $s2, $s2, $s7 + bnez $s4, .LBB6_385 +# %bb.237: # in Loop: Header=BB6_14 Depth=1 + add.d $s8, $s8, $s0 + move $s4, $s6 
+ move $s6, $s3 + ld.d $s3, $sp, 80 # 8-byte Folded Reload + ld.d $s7, $sp, 96 # 8-byte Folded Reload +.LBB6_238: # in Loop: Header=BB6_14 Depth=1 st.w $zero, $s5, 92 - lu12i.w $a2, 3 - ori $a2, $a2, 3899 + ori $a2, $s4, 3899 st.w $a2, $s5, 8 -.LBB6_239: # in Loop: Header=BB6_16 Depth=1 +.LBB6_239: # in Loop: Header=BB6_14 Depth=1 ld.bu $a2, $s5, 25 andi $a2, $a2, 16 bnez $a2, .LBB6_242 -# %bb.240: # in Loop: Header=BB6_16 Depth=1 +# %bb.240: # in Loop: Header=BB6_14 Depth=1 ld.d $a2, $s5, 48 beqz $a2, .LBB6_255 -# %bb.241: # in Loop: Header=BB6_16 Depth=1 +# %bb.241: # in Loop: Header=BB6_14 Depth=1 st.d $zero, $a2, 56 b .LBB6_255 -.LBB6_242: # in Loop: Header=BB6_16 Depth=1 - beqz $s1, .LBB6_378 +.LBB6_242: # in Loop: Header=BB6_14 Depth=1 + beqz $s2, .LBB6_376 # %bb.243: # %.preheader1281.preheader - # in Loop: Header=BB6_16 Depth=1 - st.d $s7, $sp, 88 # 8-byte Folded Spill - move $s6, $zero - move $fp, $zero - bstrpick.d $a2, $s1, 31, 0 + # in Loop: Header=BB6_14 Depth=1 + st.d $s7, $sp, 96 # 8-byte Folded Spill + st.d $s3, $sp, 80 # 8-byte Folded Spill + move $s3, $s6 + move $s6, $s4 + move $s7, $zero + move $s0, $zero + bstrpick.d $a2, $s2, 31, 0 .LBB6_244: # %.preheader1281 - # Parent Loop BB6_16 Depth=1 + # Parent Loop BB6_14 Depth=1 # => This Inner Loop Header: Depth=2 - ld.d $a4, $s5, 48 - ldx.bu $s7, $s8, $fp - beqz $a4, .LBB6_248 + ld.d $a5, $s5, 48 + ldx.bu $s4, $s8, $s0 + beqz $a5, .LBB6_248 # %bb.245: # in Loop: Header=BB6_244 Depth=2 - ld.d $a3, $a4, 56 - beqz $a3, .LBB6_248 + ld.d $a4, $a5, 56 + beqz $a4, .LBB6_248 # %bb.246: # in Loop: Header=BB6_244 Depth=2 - ld.w $a5, $s5, 92 - ld.w $a4, $a4, 64 - bgeu $a5, $a4, .LBB6_248 + ld.w $a6, $s5, 92 + ld.w $a5, $a5, 64 + bgeu $a6, $a5, .LBB6_248 # %bb.247: # in Loop: Header=BB6_244 Depth=2 - bstrpick.d $a4, $a5, 31, 0 - addi.d $a5, $a5, 1 - st.w $a5, $s5, 92 - stx.b $s7, $a3, $a4 + bstrpick.d $a5, $a6, 31, 0 + addi.d $a6, $a6, 1 + st.w $a6, $s5, 92 + stx.b $s4, $a4, $a5 .LBB6_248: # in Loop: Header=BB6_244 Depth=2 - addi.d $fp, $fp, 1 - addi.w $s6, $s6, 1 - beqz $s7, .LBB6_250 + addi.d $s0, $s0, 1 + addi.w $s7, $s7, 1 + beqz $s4, .LBB6_250 # %bb.249: # in Loop: Header=BB6_244 Depth=2 - bltu $fp, $a2, .LBB6_244 -.LBB6_250: # in Loop: Header=BB6_16 Depth=1 + bltu $s0, $a2, .LBB6_244 +.LBB6_250: # in Loop: Header=BB6_14 Depth=1 ld.bu $a2, $s5, 25 andi $a2, $a2, 2 beqz $a2, .LBB6_253 -# %bb.251: # in Loop: Header=BB6_16 Depth=1 +# %bb.251: # in Loop: Header=BB6_14 Depth=1 ld.bu $a2, $s5, 16 andi $a2, $a2, 4 beqz $a2, .LBB6_253 -# %bb.252: # in Loop: Header=BB6_16 Depth=1 +# %bb.252: # in Loop: Header=BB6_14 Depth=1 ld.d $a0, $s5, 32 move $a1, $s8 - move $a2, $s6 + move $a2, $s7 st.d $ra, $sp, 16 # 8-byte Folded Spill - st.d $t8, $sp, 24 # 8-byte Folded Spill pcaddu18i $ra, %call36(crc32) jirl $ra, $ra, 0 ld.d $t6, $sp, 136 # 8-byte Folded Reload ori $t7, $zero, 30 - ld.d $t8, $sp, 24 # 8-byte Folded Reload + ld.d $t8, $sp, 128 # 8-byte Folded Reload ld.d $ra, $sp, 16 # 8-byte Folded Reload ld.d $a1, $sp, 144 # 8-byte Folded Reload move $a2, $a0 + ld.d $a3, $sp, 120 # 8-byte Folded Reload ld.d $a0, $sp, 152 # 8-byte Folded Reload st.d $a2, $s5, 32 -.LBB6_253: # in Loop: Header=BB6_16 Depth=1 - sub.w $s1, $s1, $s6 - bnez $s7, .LBB6_387 -# %bb.254: # in Loop: Header=BB6_16 Depth=1 - add.d $s8, $s8, $fp - ld.d $s6, $sp, 128 # 8-byte Folded Reload - ld.d $s7, $sp, 88 # 8-byte Folded Reload -.LBB6_255: # in Loop: Header=BB6_16 Depth=1 - lu12i.w $a2, 3 - ori $a2, $a2, 3900 +.LBB6_253: # in Loop: Header=BB6_14 Depth=1 + sub.w $s2, $s2, $s7 
+ bnez $s4, .LBB6_385 +# %bb.254: # in Loop: Header=BB6_14 Depth=1 + add.d $s8, $s8, $s0 + move $s4, $s6 + move $s6, $s3 + ld.d $s3, $sp, 80 # 8-byte Folded Reload + ld.d $s7, $sp, 96 # 8-byte Folded Reload +.LBB6_255: # in Loop: Header=BB6_14 Depth=1 + ori $a2, $s4, 3900 st.w $a2, $s5, 8 -.LBB6_256: # in Loop: Header=BB6_16 Depth=1 +.LBB6_256: # in Loop: Header=BB6_14 Depth=1 ld.wu $a2, $s5, 24 - andi $a3, $a2, 512 - bnez $a3, .LBB6_258 -# %bb.257: # in Loop: Header=BB6_16 Depth=1 - move $fp, $s8 + andi $a4, $a2, 512 + bnez $a4, .LBB6_258 +# %bb.257: # in Loop: Header=BB6_14 Depth=1 + st.d $s7, $sp, 96 # 8-byte Folded Spill + move $s7, $s4 + move $s4, $s6 + move $s0, $s8 b .LBB6_265 .LBB6_258: # %.preheader1279 - # in Loop: Header=BB6_16 Depth=1 - addi.w $a3, $s0, 0 - ori $a4, $zero, 15 - bltu $a4, $a3, .LBB6_261 + # in Loop: Header=BB6_14 Depth=1 + addi.w $a4, $s1, 0 + ori $a5, $zero, 15 + bltu $a5, $a4, .LBB6_261 .LBB6_259: # %.lr.ph2314 - # Parent Loop BB6_16 Depth=1 + # Parent Loop BB6_14 Depth=1 # => This Inner Loop Header: Depth=2 - beqz $s1, .LBB6_378 + beqz $s2, .LBB6_376 # %bb.260: # in Loop: Header=BB6_259 Depth=2 - move $a4, $a3 - ld.bu $a3, $s8, 0 - addi.w $s1, $s1, -1 - addi.d $fp, $s8, 1 - sll.d $a3, $a3, $a4 - add.d $s2, $a3, $s2 - addi.d $a3, $a4, 8 - ori $a5, $zero, 8 - addi.w $s0, $s0, 8 - move $s8, $fp - bltu $a4, $a5, .LBB6_259 + move $a5, $a4 + ld.bu $a4, $s8, 0 + addi.w $s2, $s2, -1 + addi.d $s0, $s8, 1 + sll.d $a4, $a4, $a5 + add.d $fp, $a4, $fp + addi.d $a4, $a5, 8 + ori $a6, $zero, 8 + addi.w $s1, $s1, 8 + move $s8, $s0 + bltu $a5, $a6, .LBB6_259 b .LBB6_262 -.LBB6_261: # in Loop: Header=BB6_16 Depth=1 - move $fp, $s8 +.LBB6_261: # in Loop: Header=BB6_14 Depth=1 + move $s0, $s8 .LBB6_262: # %._crit_edge2315 - # in Loop: Header=BB6_16 Depth=1 - ld.bu $a3, $s5, 16 - andi $a3, $a3, 4 - beqz $a3, .LBB6_264 -# %bb.263: # in Loop: Header=BB6_16 Depth=1 - ld.hu $a3, $s5, 32 - bne $s2, $a3, .LBB6_272 -.LBB6_264: # in Loop: Header=BB6_16 Depth=1 - move $s2, $zero - move $s0, $zero -.LBB6_265: # in Loop: Header=BB6_16 Depth=1 - move $s6, $t8 - move $s4, $ra + # in Loop: Header=BB6_14 Depth=1 + ld.bu $a4, $s5, 16 + andi $a4, $a4, 4 + beqz $a4, .LBB6_264 +# %bb.263: # in Loop: Header=BB6_14 Depth=1 + ld.hu $a4, $s5, 32 + bne $fp, $a4, .LBB6_267 +.LBB6_264: # in Loop: Header=BB6_14 Depth=1 + st.d $s7, $sp, 96 # 8-byte Folded Spill + move $s7, $s4 + move $s4, $s6 + move $fp, $zero + move $s1, $zero +.LBB6_265: # in Loop: Header=BB6_14 Depth=1 + move $s6, $ra ld.d $a0, $s5, 48 - beqz $a0, .LBB6_267 -# %bb.266: # in Loop: Header=BB6_16 Depth=1 + beqz $a0, .LBB6_11 +# %bb.266: # in Loop: Header=BB6_14 Depth=1 bstrpick.d $a1, $a2, 9, 9 st.w $a1, $a0, 68 ori $a1, $zero, 1 st.w $a1, $a0, 72 -.LBB6_267: # in Loop: Header=BB6_16 Depth=1 - move $a0, $zero - move $a1, $zero - move $a2, $zero - pcaddu18i $ra, %call36(crc32) - jirl $ra, $ra, 0 - move $a1, $a0 - st.d $a0, $s5, 32 - ld.d $a0, $sp, 152 # 8-byte Folded Reload - st.d $a1, $a0, 96 - st.w $s7, $s5, 8 -.LBB6_268: # %.thread - # in Loop: Header=BB6_16 Depth=1 - ld.d $a1, $sp, 144 # 8-byte Folded Reload - move $ra, $s4 - move $t8, $s6 -.LBB6_269: # %.thread - # in Loop: Header=BB6_16 Depth=1 - ld.d $s6, $sp, 128 # 8-byte Folded Reload -.LBB6_270: # %.thread - # in Loop: Header=BB6_16 Depth=1 - ld.d $s4, $sp, 104 # 8-byte Folded Reload -.LBB6_271: # %.thread - # in Loop: Header=BB6_16 Depth=1 - ori $t7, $zero, 30 - ld.d $t6, $sp, 136 # 8-byte Folded Reload - b .LBB6_15 -.LBB6_272: # in Loop: Header=BB6_16 Depth=1 + b .LBB6_11 
+.LBB6_267: # in Loop: Header=BB6_14 Depth=1 pcalau12i $a2, %pc_hi20(.L.str.5) addi.d $a2, $a2, %pc_lo12(.L.str.5) - b .LBB6_287 -.LBB6_273: # in Loop: Header=BB6_16 Depth=1 + b .LBB6_282 +.LBB6_268: # in Loop: Header=BB6_14 Depth=1 pcaddu18i $ra, %call36(adler32) jirl $ra, $ra, 0 -.LBB6_274: # in Loop: Header=BB6_16 Depth=1 +.LBB6_269: # in Loop: Header=BB6_14 Depth=1 move $a1, $a0 - ld.w $a3, $s5, 16 + ld.w $a4, $s5, 16 st.d $a0, $s5, 32 ld.d $a0, $sp, 152 # 8-byte Folded Reload st.d $a1, $a0, 96 - andi $a4, $a3, 4 + andi $a5, $a4, 4 ld.d $a1, $sp, 144 # 8-byte Folded Reload - move $ra, $s4 - move $t8, $s6 - move $s6, $s7 + ld.d $a3, $sp, 120 # 8-byte Folded Reload + move $ra, $s6 + move $s6, $s3 + move $s3, $s4 + move $s4, $s7 move $s7, $s8 - ld.d $s4, $sp, 104 # 8-byte Folded Reload + ld.d $t8, $sp, 128 # 8-byte Folded Reload ori $t7, $zero, 30 ld.d $t6, $sp, 136 # 8-byte Folded Reload -.LBB6_275: # in Loop: Header=BB6_16 Depth=1 - beqz $a4, .LBB6_277 -# %bb.276: # in Loop: Header=BB6_16 Depth=1 +.LBB6_270: # in Loop: Header=BB6_14 Depth=1 + beqz $a5, .LBB6_272 +# %bb.271: # in Loop: Header=BB6_14 Depth=1 ld.w $a2, $s5, 24 sltui $a2, $a2, 1 - revb.2w $a4, $s2 - bstrpick.d $a4, $a4, 31, 0 - ld.d $a5, $s5, 32 - masknez $a6, $s2, $a2 - maskeqz $a2, $a4, $a2 - or $a2, $a2, $a6 - bne $a2, $a5, .LBB6_288 -.LBB6_277: # in Loop: Header=BB6_16 Depth=1 - move $s2, $zero - move $s0, $zero - move $s8, $fp - st.d $ra, $sp, 120 # 8-byte Folded Spill -.LBB6_278: # in Loop: Header=BB6_16 Depth=1 - lu12i.w $a2, 3 - ori $a2, $a2, 3919 + revb.2w $a5, $fp + bstrpick.d $a5, $a5, 31, 0 + ld.d $a6, $s5, 32 + masknez $a7, $fp, $a2 + maskeqz $a2, $a5, $a2 + or $a2, $a2, $a7 + bne $a2, $a6, .LBB6_283 +.LBB6_272: # in Loop: Header=BB6_14 Depth=1 + move $fp, $zero + move $s1, $zero + move $s8, $s0 + st.d $s7, $sp, 112 # 8-byte Folded Spill +.LBB6_273: # in Loop: Header=BB6_14 Depth=1 + ori $a2, $s4, 3919 st.w $a2, $s5, 8 -.LBB6_279: # in Loop: Header=BB6_16 Depth=1 - beqz $a3, .LBB6_408 -# %bb.280: # in Loop: Header=BB6_16 Depth=1 +.LBB6_274: # in Loop: Header=BB6_14 Depth=1 + beqz $a4, .LBB6_387 +# %bb.275: # in Loop: Header=BB6_14 Depth=1 ld.w $a2, $s5, 24 - beqz $a2, .LBB6_408 -# %bb.281: # %.preheader1307 - # in Loop: Header=BB6_16 Depth=1 - addi.w $a2, $s0, 0 - ori $a3, $zero, 31 - bltu $a3, $a2, .LBB6_284 -.LBB6_282: # %.lr.ph1753 - # Parent Loop BB6_16 Depth=1 + beqz $a2, .LBB6_387 +# %bb.276: # %.preheader1307 + # in Loop: Header=BB6_14 Depth=1 + addi.w $a2, $s1, 0 + ori $a4, $zero, 31 + bltu $a4, $a2, .LBB6_279 +.LBB6_277: # %.lr.ph1753 + # Parent Loop BB6_14 Depth=1 # => This Inner Loop Header: Depth=2 - beqz $s1, .LBB6_378 -# %bb.283: # in Loop: Header=BB6_282 Depth=2 - move $a3, $a2 + beqz $s2, .LBB6_376 +# %bb.278: # in Loop: Header=BB6_277 Depth=2 + move $a4, $a2 ld.bu $a2, $s8, 0 - addi.w $s1, $s1, -1 - addi.d $fp, $s8, 1 - sll.d $a2, $a2, $a3 - add.d $s2, $a2, $s2 - addi.d $a2, $a3, 8 - ori $a4, $zero, 24 - addi.w $s0, $s0, 8 - move $s8, $fp - bltu $a3, $a4, .LBB6_282 - b .LBB6_285 -.LBB6_284: # in Loop: Header=BB6_16 Depth=1 - move $fp, $s8 -.LBB6_285: # %._crit_edge1754 - # in Loop: Header=BB6_16 Depth=1 + addi.w $s2, $s2, -1 + addi.d $s0, $s8, 1 + sll.d $a2, $a2, $a4 + add.d $fp, $a2, $fp + addi.d $a2, $a4, 8 + ori $a5, $zero, 24 + addi.w $s1, $s1, 8 + move $s8, $s0 + bltu $a4, $a5, .LBB6_277 + b .LBB6_280 +.LBB6_279: # in Loop: Header=BB6_14 Depth=1 + move $s0, $s8 +.LBB6_280: # %._crit_edge1754 + # in Loop: Header=BB6_14 Depth=1 ld.wu $a2, $s5, 40 - beq $s2, $a2, .LBB6_412 -# %bb.286: # in 
Loop: Header=BB6_16 Depth=1 + beq $fp, $a2, .LBB6_412 +# %bb.281: # in Loop: Header=BB6_14 Depth=1 pcalau12i $a2, %pc_hi20(.L.str.18) addi.d $a2, $a2, %pc_lo12(.L.str.18) -.LBB6_287: # %.thread - # in Loop: Header=BB6_16 Depth=1 +.LBB6_282: # %.thread + # in Loop: Header=BB6_14 Depth=1 st.d $a2, $a0, 48 - lu12i.w $a2, 3 - ori $a2, $a2, 3921 + ori $a2, $s4, 3921 st.w $a2, $s5, 8 - b .LBB6_15 -.LBB6_288: # in Loop: Header=BB6_16 Depth=1 + b .LBB6_13 +.LBB6_283: # in Loop: Header=BB6_14 Depth=1 pcalau12i $a2, %pc_hi20(.L.str.17) addi.d $a2, $a2, %pc_lo12(.L.str.17) st.d $a2, $a0, 48 - lu12i.w $a2, 3 - ori $a2, $a2, 3921 + ori $a2, $s4, 3921 st.w $a2, $s5, 8 - st.d $ra, $sp, 120 # 8-byte Folded Spill - b .LBB6_15 -.LBB6_289: # in Loop: Header=BB6_16 Depth=1 - st.d $zero, $sp, 96 # 8-byte Folded Spill - lu12i.w $a0, 3 - ori $a0, $a0, 3911 + st.d $s7, $sp, 112 # 8-byte Folded Spill + b .LBB6_13 +.LBB6_284: # in Loop: Header=BB6_14 Depth=1 + st.d $zero, $sp, 88 # 8-byte Folded Spill + ori $a0, $s4, 3911 st.w $a0, $s5, 8 ld.d $a1, $sp, 144 # 8-byte Folded Reload ld.d $a0, $sp, 152 # 8-byte Folded Reload - move $ra, $s4 - move $t8, $s6 - ld.d $s6, $sp, 128 # 8-byte Folded Reload - ld.d $s4, $sp, 104 # 8-byte Folded Reload + ld.d $a3, $sp, 120 # 8-byte Folded Reload + move $ra, $s6 + ld.d $s6, $sp, 96 # 8-byte Folded Reload + ld.d $t8, $sp, 128 # 8-byte Folded Reload ori $t7, $zero, 30 ld.d $t6, $sp, 136 # 8-byte Folded Reload - beq $a1, $fp, .LBB6_389 -.LBB6_290: # in Loop: Header=BB6_16 Depth=1 - lu12i.w $a2, 3 - ori $a2, $a2, 3912 + beq $a1, $s0, .LBB6_390 +.LBB6_285: # in Loop: Header=BB6_14 Depth=1 + ori $a2, $s4, 3912 st.w $a2, $s5, 8 -.LBB6_291: # in Loop: Header=BB6_16 Depth=1 +.LBB6_286: # in Loop: Header=BB6_14 Depth=1 ori $a2, $zero, 6 - bltu $s1, $a2, .LBB6_295 -# %bb.292: # in Loop: Header=BB6_16 Depth=1 + bltu $s2, $a2, .LBB6_291 +# %bb.287: # in Loop: Header=BB6_14 Depth=1 ori $a2, $zero, 258 - bltu $ra, $a2, .LBB6_295 -# %bb.293: # in Loop: Header=BB6_16 Depth=1 - st.d $t8, $a0, 24 - st.w $ra, $a0, 32 + bltu $s7, $a2, .LBB6_291 +# %bb.288: # in Loop: Header=BB6_14 Depth=1 + st.d $ra, $a0, 24 + st.w $s7, $a0, 32 st.d $s8, $a0, 0 - st.w $s1, $a0, 8 - st.d $s2, $s5, 80 - st.w $s0, $s5, 88 + st.w $s2, $a0, 8 + st.d $fp, $s5, 80 + st.w $s1, $s5, 88 ld.d $a0, $sp, 152 # 8-byte Folded Reload - ld.d $a1, $sp, 120 # 8-byte Folded Reload + ld.d $a1, $sp, 112 # 8-byte Folded Reload pcaddu18i $ra, %call36(inflate_fast) jirl $ra, $ra, 0 ld.d $a0, $sp, 152 # 8-byte Folded Reload - ld.d $t8, $a0, 24 - ld.w $ra, $a0, 32 - ld.d $fp, $a0, 0 - ld.w $s1, $a0, 8 - ld.d $s2, $s5, 80 + ld.d $ra, $a0, 24 + ld.w $s7, $a0, 32 + ld.d $s0, $a0, 0 + ld.w $s2, $a0, 8 + ld.d $fp, $s5, 80 ld.w $a1, $s5, 8 - ld.w $s0, $s5, 88 - bne $a1, $s7, .LBB6_305 -# %bb.294: # in Loop: Header=BB6_16 Depth=1 + ld.w $s1, $s5, 88 + bne $a1, $s3, .LBB6_290 +# %bb.289: # in Loop: Header=BB6_14 Depth=1 addi.w $a1, $zero, -1 lu32i.d $a1, 0 stptr.w $a1, $s5, 7148 +.LBB6_290: # in Loop: Header=BB6_14 Depth=1 ld.d $a1, $sp, 144 # 8-byte Folded Reload - b .LBB6_271 -.LBB6_295: # in Loop: Header=BB6_16 Depth=1 + ld.d $a3, $sp, 120 # 8-byte Folded Reload + b .LBB6_12 +.LBB6_291: # in Loop: Header=BB6_14 Depth=1 ld.wu $a2, $s5, 120 stptr.w $zero, $s5, 7148 - ld.d $a6, $s5, 104 - addi.w $a3, $zero, -1 - sll.w $a2, $a3, $a2 - andn $a4, $s2, $a2 - addi.w $a4, $a4, 0 - alsl.d $t0, $a4, $a6, 2 - ld.bu $a5, $t0, 1 - addi.w $a7, $s0, 0 - bgeu $a7, $a5, .LBB6_299 -# %bb.296: # %.lr.ph2011.preheader - # in Loop: Header=BB6_16 Depth=1 - nor $t1, $a2, 
$zero - move $a4, $s0 - move $a2, $s1 - move $t2, $s8 -.LBB6_297: # %.lr.ph2011 - # Parent Loop BB6_16 Depth=1 - # => This Inner Loop Header: Depth=2 - beqz $a2, .LBB6_380 -# %bb.298: # in Loop: Header=BB6_297 Depth=2 - ld.bu $a5, $t2, 0 - sll.d $a5, $a5, $a7 - add.d $s2, $a5, $s2 - and $a5, $t1, $s2 + ld.d $a7, $s5, 104 + addi.w $a4, $zero, -1 + sll.w $a2, $a4, $a2 + andn $a5, $fp, $a2 addi.w $a5, $a5, 0 - alsl.d $t0, $a5, $a6, 2 - ld.bu $a5, $t0, 1 + alsl.d $t1, $a5, $a7, 2 + ld.bu $a6, $t1, 1 + addi.w $t0, $s1, 0 + bgeu $t0, $a6, .LBB6_295 +# %bb.292: # %.lr.ph2011.preheader + # in Loop: Header=BB6_14 Depth=1 + nor $t2, $a2, $zero + move $a5, $s1 + move $a2, $s2 + move $t3, $s8 +.LBB6_293: # %.lr.ph2011 + # Parent Loop BB6_14 Depth=1 + # => This Inner Loop Header: Depth=2 + beqz $a2, .LBB6_378 +# %bb.294: # in Loop: Header=BB6_293 Depth=2 + ld.bu $a6, $t3, 0 + sll.d $a6, $a6, $t0 + add.d $fp, $a6, $fp + and $a6, $t2, $fp + addi.w $a6, $a6, 0 + alsl.d $t1, $a6, $a7, 2 + ld.bu $a6, $t1, 1 addi.w $a2, $a2, -1 - addi.d $fp, $t2, 1 - addi.d $a7, $a7, 8 - addi.d $a4, $a4, 8 - move $t2, $fp - bltu $a7, $a5, .LBB6_297 - b .LBB6_300 -.LBB6_299: # in Loop: Header=BB6_16 Depth=1 - move $fp, $s8 - move $a2, $s1 - move $a4, $s0 -.LBB6_300: # %._crit_edge2012.loopexit - # in Loop: Header=BB6_16 Depth=1 - move $a7, $a5 - ld.bu $t3, $t0, 0 - ld.hu $t0, $t0, 2 - addi.d $t1, $t3, -1 - ori $t2, $zero, 14 - bltu $t2, $t1, .LBB6_306 -# %bb.301: # %.preheader1301 - # in Loop: Header=BB6_16 Depth=1 - move $s4, $t8 - add.d $t1, $a7, $t3 - sll.w $t1, $a3, $t1 - andn $t2, $s2, $t1 - srl.w $t2, $t2, $a7 - add.d $t2, $t2, $t0 - bstrpick.d $t2, $t2, 31, 0 - alsl.d $t6, $t2, $a6, 2 - ld.bu $t4, $t6, 1 - add.d $t2, $a7, $t4 - addi.w $t3, $a4, 0 - bgeu $t3, $t2, .LBB6_307 -# %bb.302: # %.lr.ph2029.preheader - # in Loop: Header=BB6_16 Depth=1 - nor $t5, $t1, $zero - move $t2, $a4 - move $t1, $a2 - move $t8, $fp -.LBB6_303: # %.lr.ph2029 - # Parent Loop BB6_16 Depth=1 + addi.d $s0, $t3, 1 + addi.d $t0, $t0, 8 + addi.d $a5, $a5, 8 + move $t3, $s0 + bltu $t0, $a6, .LBB6_293 + b .LBB6_296 +.LBB6_295: # in Loop: Header=BB6_14 Depth=1 + move $s0, $s8 + move $a2, $s2 + move $a5, $s1 +.LBB6_296: # %._crit_edge2012.loopexit + # in Loop: Header=BB6_14 Depth=1 + move $t0, $a6 + ld.bu $t4, $t1, 0 + ld.hu $t1, $t1, 2 + addi.d $t2, $t4, -1 + ori $t3, $zero, 14 + bltu $t3, $t2, .LBB6_301 +# %bb.297: # %.preheader1301 + # in Loop: Header=BB6_14 Depth=1 + add.d $t2, $t0, $t4 + sll.w $t2, $a4, $t2 + andn $t3, $fp, $t2 + srl.w $t3, $t3, $t0 + add.d $t3, $t3, $t1 + bstrpick.d $t3, $t3, 31, 0 + alsl.d $t7, $t3, $a7, 2 + ld.bu $t5, $t7, 1 + add.d $t3, $t0, $t5 + addi.w $t4, $a5, 0 + bgeu $t4, $t3, .LBB6_302 +# %bb.298: # %.lr.ph2029.preheader + # in Loop: Header=BB6_14 Depth=1 + nor $t6, $t2, $zero + move $t3, $a5 + move $t2, $a2 + move $s1, $s0 +.LBB6_299: # %.lr.ph2029 + # Parent Loop BB6_14 Depth=1 # => This Inner Loop Header: Depth=2 - beqz $t1, .LBB6_409 -# %bb.304: # in Loop: Header=BB6_303 Depth=2 - ld.bu $t4, $t8, 0 - sll.d $t4, $t4, $t3 - add.d $s2, $t4, $s2 - and $t4, $s2, $t5 - srl.w $t4, $t4, $a7 - add.d $t4, $t4, $t0 - bstrpick.d $t4, $t4, 31, 0 - alsl.d $t6, $t4, $a6, 2 - ld.bu $t4, $t6, 1 - addi.w $t1, $t1, -1 - addi.d $t7, $t8, 1 + beqz $t2, .LBB6_388 +# %bb.300: # in Loop: Header=BB6_299 Depth=2 + ld.bu $t5, $s1, 0 + sll.d $t5, $t5, $t4 + add.d $fp, $t5, $fp + and $t5, $fp, $t6 + srl.w $t5, $t5, $t0 + add.d $t5, $t5, $t1 + bstrpick.d $t5, $t5, 31, 0 + alsl.d $t7, $t5, $a7, 2 + ld.bu $t5, $t7, 1 + addi.w $t2, $t2, -1 + 
addi.d $t8, $s1, 1 + addi.d $t4, $t4, 8 + add.d $s2, $t0, $t5 addi.d $t3, $t3, 8 - add.d $s0, $a7, $t4 - addi.d $t2, $t2, 8 - move $t8, $t7 - bltu $t3, $s0, .LBB6_303 - b .LBB6_308 -.LBB6_305: # in Loop: Header=BB6_16 Depth=1 - ld.d $a1, $sp, 144 # 8-byte Folded Reload - b .LBB6_271 -.LBB6_306: # in Loop: Header=BB6_16 Depth=1 - move $a7, $zero - b .LBB6_309 -.LBB6_307: # in Loop: Header=BB6_16 Depth=1 - move $t7, $fp - move $t1, $a2 - move $t2, $a4 -.LBB6_308: # %._crit_edge2030 - # in Loop: Header=BB6_16 Depth=1 - ld.hu $t0, $t6, 2 - ld.bu $t3, $t6, 0 - srl.d $s2, $s2, $a5 - sub.d $a4, $t2, $a7 - move $fp, $t7 - move $a2, $t1 - move $a5, $t4 - move $t8, $s4 - ld.d $s4, $sp, 104 # 8-byte Folded Reload + move $s1, $t8 + bltu $t4, $s2, .LBB6_299 + b .LBB6_303 +.LBB6_301: # in Loop: Header=BB6_14 Depth=1 + move $t0, $zero + b .LBB6_304 +.LBB6_302: # in Loop: Header=BB6_14 Depth=1 + move $t8, $s0 + move $t2, $a2 + move $t3, $a5 +.LBB6_303: # %._crit_edge2030 + # in Loop: Header=BB6_14 Depth=1 + ld.hu $t1, $t7, 2 + ld.bu $t4, $t7, 0 + srl.d $fp, $fp, $a6 + sub.d $a5, $t3, $t0 + move $s0, $t8 + move $a2, $t2 + move $a6, $t5 + ld.d $t8, $sp, 128 # 8-byte Folded Reload ori $t7, $zero, 30 ld.d $t6, $sp, 136 # 8-byte Folded Reload -.LBB6_309: # in Loop: Header=BB6_16 Depth=1 - srl.d $s2, $s2, $a5 - sub.w $s0, $a4, $a5 - add.d $a4, $a7, $a5 - stptr.w $a4, $s5, 7148 - st.w $t0, $s5, 92 - beqz $t3, .LBB6_362 -# %bb.310: # in Loop: Header=BB6_16 Depth=1 - andi $a4, $t3, 32 - bnez $a4, .LBB6_364 -# %bb.311: # in Loop: Header=BB6_16 Depth=1 - andi $a3, $t3, 64 - bnez $a3, .LBB6_365 -# %bb.312: # in Loop: Header=BB6_16 Depth=1 - andi $a3, $t3, 15 - st.w $a3, $s5, 100 - lu12i.w $a4, 3 - ori $a4, $a4, 3913 - st.w $a4, $s5, 8 - move $s8, $fp - move $s1, $a2 - bnez $a3, .LBB6_25 -.LBB6_313: # %._crit_edge2831 - # in Loop: Header=BB6_16 Depth=1 - ld.w $a6, $s5, 92 -.LBB6_314: # in Loop: Header=BB6_16 Depth=1 - stptr.w $a6, $s5, 7152 - lu12i.w $a2, 3 - ori $a2, $a2, 3914 +.LBB6_304: # in Loop: Header=BB6_14 Depth=1 + srl.d $fp, $fp, $a6 + sub.w $s1, $a5, $a6 + add.d $a5, $t0, $a6 + stptr.w $a5, $s5, 7148 + st.w $t1, $s5, 92 + beqz $t4, .LBB6_359 +# %bb.305: # in Loop: Header=BB6_14 Depth=1 + andi $a5, $t4, 32 + bnez $a5, .LBB6_361 +# %bb.306: # in Loop: Header=BB6_14 Depth=1 + andi $a4, $t4, 64 + bnez $a4, .LBB6_362 +# %bb.307: # in Loop: Header=BB6_14 Depth=1 + andi $a4, $t4, 15 + st.w $a4, $s5, 100 + ori $a5, $s4, 3913 + st.w $a5, $s5, 8 + move $s8, $s0 + move $s2, $a2 + bnez $a4, .LBB6_23 +.LBB6_308: # %._crit_edge2831 + # in Loop: Header=BB6_14 Depth=1 + ld.w $a7, $s5, 92 +.LBB6_309: # in Loop: Header=BB6_14 Depth=1 + stptr.w $a7, $s5, 7152 + ori $a2, $s4, 3914 st.w $a2, $s5, 8 -.LBB6_315: # in Loop: Header=BB6_16 Depth=1 +.LBB6_310: # in Loop: Header=BB6_14 Depth=1 ld.wu $a2, $s5, 124 - ld.d $a5, $s5, 112 - addi.d $a7, $zero, -1 - sll.w $a2, $a7, $a2 - andn $a3, $s2, $a2 - addi.w $a3, $a3, 0 - alsl.d $a6, $a3, $a5, 2 - ld.bu $a4, $a6, 1 - addi.w $t0, $s0, 0 - bgeu $t0, $a4, .LBB6_319 -# %bb.316: # %.lr.ph2056.preheader - # in Loop: Header=BB6_16 Depth=1 - nor $t1, $a2, $zero - move $a3, $s0 - move $a2, $s1 - move $t2, $s8 -.LBB6_317: # %.lr.ph2056 - # Parent Loop BB6_16 Depth=1 - # => This Inner Loop Header: Depth=2 - beqz $a2, .LBB6_380 -# %bb.318: # in Loop: Header=BB6_317 Depth=2 - ld.bu $a4, $t2, 0 - sll.d $a4, $a4, $t0 - add.d $s2, $a4, $s2 - and $a4, $t1, $s2 + ld.d $a6, $s5, 112 + addi.d $t0, $zero, -1 + sll.w $a2, $t0, $a2 + andn $a4, $fp, $a2 addi.w $a4, $a4, 0 - alsl.d $a6, $a4, $a5, 2 - ld.bu 
$a4, $a6, 1 - addi.w $a2, $a2, -1 - addi.d $fp, $t2, 1 - addi.d $t0, $t0, 8 - addi.d $a3, $a3, 8 - move $t2, $fp - bltu $t0, $a4, .LBB6_317 - b .LBB6_320 -.LBB6_319: # in Loop: Header=BB6_16 Depth=1 - move $fp, $s8 - move $a2, $s1 - move $a3, $s0 -.LBB6_320: # %._crit_edge2057 - # in Loop: Header=BB6_16 Depth=1 - ld.bu $t2, $a6, 0 - ld.hu $a6, $a6, 2 - ori $t0, $zero, 16 - bgeu $t2, $t0, .LBB6_325 -# %bb.321: # %.preheader1296 - # in Loop: Header=BB6_16 Depth=1 - add.d $t0, $a4, $t2 - sll.w $a7, $a7, $t0 - andn $t0, $s2, $a7 - srl.w $t0, $t0, $a4 - add.d $t0, $t0, $a6 - bstrpick.d $t0, $t0, 31, 0 - alsl.d $t3, $t0, $a5, 2 - ld.bu $t1, $t3, 1 - add.d $t0, $a4, $t1 - addi.w $t2, $a3, 0 - bgeu $t2, $t0, .LBB6_326 -# %bb.322: # %.lr.ph2074.preheader - # in Loop: Header=BB6_16 Depth=1 - nor $t4, $a7, $zero - move $t0, $a3 - move $a7, $a2 - move $t6, $fp -.LBB6_323: # %.lr.ph2074 - # Parent Loop BB6_16 Depth=1 + alsl.d $a7, $a4, $a6, 2 + ld.bu $a5, $a7, 1 + addi.w $t1, $s1, 0 + bgeu $t1, $a5, .LBB6_314 +# %bb.311: # %.lr.ph2056.preheader + # in Loop: Header=BB6_14 Depth=1 + nor $t2, $a2, $zero + move $a4, $s1 + move $a2, $s2 + move $t3, $s8 +.LBB6_312: # %.lr.ph2056 + # Parent Loop BB6_14 Depth=1 # => This Inner Loop Header: Depth=2 - beqz $a7, .LBB6_385 -# %bb.324: # in Loop: Header=BB6_323 Depth=2 - ld.bu $t1, $t6, 0 - sll.d $t1, $t1, $t2 - add.d $s2, $t1, $s2 - and $t1, $s2, $t4 - srl.w $t1, $t1, $a4 - add.d $t1, $t1, $a6 + beqz $a2, .LBB6_378 +# %bb.313: # in Loop: Header=BB6_312 Depth=2 + ld.bu $a5, $t3, 0 + sll.d $a5, $a5, $t1 + add.d $fp, $a5, $fp + and $a5, $t2, $fp + addi.w $a5, $a5, 0 + alsl.d $a7, $a5, $a6, 2 + ld.bu $a5, $a7, 1 + addi.w $a2, $a2, -1 + addi.d $s0, $t3, 1 + addi.d $t1, $t1, 8 + addi.d $a4, $a4, 8 + move $t3, $s0 + bltu $t1, $a5, .LBB6_312 + b .LBB6_315 +.LBB6_314: # in Loop: Header=BB6_14 Depth=1 + move $s0, $s8 + move $a2, $s2 + move $a4, $s1 +.LBB6_315: # %._crit_edge2057 + # in Loop: Header=BB6_14 Depth=1 + ld.bu $t3, $a7, 0 + ld.hu $a7, $a7, 2 + ori $t1, $zero, 16 + bgeu $t3, $t1, .LBB6_320 +# %bb.316: # %.preheader1296 + # in Loop: Header=BB6_14 Depth=1 + add.d $t1, $a5, $t3 + sll.w $t0, $t0, $t1 + andn $t1, $fp, $t0 + srl.w $t1, $t1, $a5 + add.d $t1, $t1, $a7 bstrpick.d $t1, $t1, 31, 0 - alsl.d $t3, $t1, $a5, 2 - ld.bu $t1, $t3, 1 - addi.w $a7, $a7, -1 - addi.d $t5, $t6, 1 - addi.d $t2, $t2, 8 - add.d $t7, $a4, $t1 - addi.d $t0, $t0, 8 - move $t6, $t5 - bltu $t2, $t7, .LBB6_323 - b .LBB6_327 -.LBB6_325: # %._crit_edge2057._crit_edge - # in Loop: Header=BB6_16 Depth=1 - lu12i.w $a5, 1 - ori $a5, $a5, 3052 - ldx.w $a5, $s5, $a5 - b .LBB6_328 -.LBB6_326: # in Loop: Header=BB6_16 Depth=1 - move $t5, $fp - move $a7, $a2 - move $t0, $a3 -.LBB6_327: # %._crit_edge2075 - # in Loop: Header=BB6_16 Depth=1 - ld.hu $a6, $t3, 2 + alsl.d $t4, $t1, $a6, 2 + ld.bu $t2, $t4, 1 + add.d $t1, $a5, $t2 + addi.w $t3, $a4, 0 + bgeu $t3, $t1, .LBB6_321 +# %bb.317: # %.lr.ph2074.preheader + # in Loop: Header=BB6_14 Depth=1 + nor $t5, $t0, $zero + move $t1, $a4 + move $t0, $a2 + move $t7, $s0 +.LBB6_318: # %.lr.ph2074 + # Parent Loop BB6_14 Depth=1 + # => This Inner Loop Header: Depth=2 + beqz $t0, .LBB6_383 +# %bb.319: # in Loop: Header=BB6_318 Depth=2 + ld.bu $t2, $t7, 0 + sll.d $t2, $t2, $t3 + add.d $fp, $t2, $fp + and $t2, $fp, $t5 + srl.w $t2, $t2, $a5 + add.d $t2, $t2, $a7 + bstrpick.d $t2, $t2, 31, 0 + alsl.d $t4, $t2, $a6, 2 + ld.bu $t2, $t4, 1 + addi.w $t0, $t0, -1 + addi.d $t6, $t7, 1 + addi.d $t3, $t3, 8 + add.d $t8, $a5, $t2 + addi.d $t1, $t1, 8 + move $t7, $t6 + bltu $t3, 
$t8, .LBB6_318 + b .LBB6_322 +.LBB6_320: # %._crit_edge2057._crit_edge + # in Loop: Header=BB6_14 Depth=1 + lu12i.w $a6, 1 + ori $a6, $a6, 3052 + ldx.w $a6, $s5, $a6 + b .LBB6_323 +.LBB6_321: # in Loop: Header=BB6_14 Depth=1 + move $t6, $s0 + move $t0, $a2 + move $t1, $a4 +.LBB6_322: # %._crit_edge2075 + # in Loop: Header=BB6_14 Depth=1 + ld.hu $a7, $t4, 2 lu12i.w $a2, 1 ori $a2, $a2, 3052 ldx.w $a2, $s5, $a2 - ld.bu $t2, $t3, 0 - srl.d $s2, $s2, $a4 - sub.d $a3, $t0, $a4 - add.d $a5, $a2, $a4 - move $fp, $t5 - move $a2, $a7 - move $a4, $t1 + ld.bu $t3, $t4, 0 + srl.d $fp, $fp, $a5 + sub.d $a4, $t1, $a5 + add.d $a6, $a2, $a5 + move $s0, $t6 + move $a2, $t0 + move $a5, $t2 + ld.d $t8, $sp, 128 # 8-byte Folded Reload ori $t7, $zero, 30 ld.d $t6, $sp, 136 # 8-byte Folded Reload -.LBB6_328: # in Loop: Header=BB6_16 Depth=1 - srl.d $s2, $s2, $a4 - sub.w $s0, $a3, $a4 - add.d $a3, $a5, $a4 - andi $a4, $t2, 64 - stptr.w $a3, $s5, 7148 - bnez $a4, .LBB6_335 -# %bb.329: # in Loop: Header=BB6_16 Depth=1 - st.w $a6, $s5, 96 - andi $a3, $t2, 15 - st.w $a3, $s5, 100 - lu12i.w $a4, 3 - ori $a4, $a4, 3915 - st.w $a4, $s5, 8 - move $s8, $fp - move $s1, $a2 -.LBB6_330: # in Loop: Header=BB6_16 Depth=1 - beqz $a3, .LBB6_339 -# %bb.331: # %.preheader1294 - # in Loop: Header=BB6_16 Depth=1 - addi.w $a2, $s0, 0 - bgeu $a2, $a3, .LBB6_337 -# %bb.332: # %.lr.ph2086.preheader - # in Loop: Header=BB6_16 Depth=1 - move $a4, $s0 - move $a2, $s1 - move $a6, $s8 -.LBB6_333: # %.lr.ph2086 - # Parent Loop BB6_16 Depth=1 +.LBB6_323: # in Loop: Header=BB6_14 Depth=1 + srl.d $fp, $fp, $a5 + sub.w $s1, $a4, $a5 + add.d $a4, $a6, $a5 + andi $a5, $t3, 64 + stptr.w $a4, $s5, 7148 + bnez $a5, .LBB6_330 +# %bb.324: # in Loop: Header=BB6_14 Depth=1 + st.w $a7, $s5, 96 + andi $a4, $t3, 15 + st.w $a4, $s5, 100 + ori $a5, $s4, 3915 + st.w $a5, $s5, 8 + move $s8, $s0 + move $s2, $a2 +.LBB6_325: # in Loop: Header=BB6_14 Depth=1 + beqz $a4, .LBB6_334 +# %bb.326: # %.preheader1294 + # in Loop: Header=BB6_14 Depth=1 + addi.w $a2, $s1, 0 + bgeu $a2, $a4, .LBB6_332 +# %bb.327: # %.lr.ph2086.preheader + # in Loop: Header=BB6_14 Depth=1 + move $a5, $s1 + move $a2, $s2 + move $a7, $s8 +.LBB6_328: # %.lr.ph2086 + # Parent Loop BB6_14 Depth=1 # => This Inner Loop Header: Depth=2 - beqz $a2, .LBB6_383 -# %bb.334: # in Loop: Header=BB6_333 Depth=2 - ld.bu $a7, $a6, 0 + beqz $a2, .LBB6_381 +# %bb.329: # in Loop: Header=BB6_328 Depth=2 + ld.bu $t0, $a7, 0 addi.w $a2, $a2, -1 - addi.d $a5, $a6, 1 - sll.d $a6, $a7, $a4 - addi.w $a4, $a4, 8 - add.d $s2, $a6, $s2 - move $a6, $a5 - bltu $a4, $a3, .LBB6_333 - b .LBB6_338 -.LBB6_335: # in Loop: Header=BB6_16 Depth=1 - pcalau12i $a3, %pc_hi20(.L.str.15) - addi.d $a3, $a3, %pc_lo12(.L.str.15) -.LBB6_336: # %.thread - # in Loop: Header=BB6_16 Depth=1 - st.d $a3, $a0, 48 - lu12i.w $a3, 3 - ori $a3, $a3, 3921 - b .LBB6_363 -.LBB6_337: # in Loop: Header=BB6_16 Depth=1 - move $a5, $s8 - move $a2, $s1 - move $a4, $s0 -.LBB6_338: # %._crit_edge2087 - # in Loop: Header=BB6_16 Depth=1 - ld.w $a6, $s5, 96 - addi.d $a7, $zero, -1 - sll.w $a7, $a7, $a3 - andn $a7, $s2, $a7 - add.d $a6, $a6, $a7 - lu12i.w $a7, 1 - ori $a7, $a7, 3052 - ldx.w $a7, $s5, $a7 - st.w $a6, $s5, 96 - srl.d $s2, $s2, $a3 - sub.w $s0, $a4, $a3 - add.d $a3, $a7, $a3 - stptr.w $a3, $s5, 7148 - move $s8, $a5 - move $s1, $a2 -.LBB6_339: # in Loop: Header=BB6_16 Depth=1 - lu12i.w $a2, 3 - ori $a2, $a2, 3916 + addi.d $a6, $a7, 1 + sll.d $a7, $t0, $a5 + addi.w $a5, $a5, 8 + add.d $fp, $a7, $fp + move $a7, $a6 + bltu $a5, $a4, .LBB6_328 + b .LBB6_333 
+.LBB6_330: # in Loop: Header=BB6_14 Depth=1 + pcalau12i $a4, %pc_hi20(.L.str.15) + addi.d $a4, $a4, %pc_lo12(.L.str.15) +.LBB6_331: # %.thread + # in Loop: Header=BB6_14 Depth=1 + st.d $a4, $a0, 48 + ori $a4, $s4, 3921 + b .LBB6_360 +.LBB6_332: # in Loop: Header=BB6_14 Depth=1 + move $a6, $s8 + move $a2, $s2 + move $a5, $s1 +.LBB6_333: # %._crit_edge2087 + # in Loop: Header=BB6_14 Depth=1 + ld.w $a7, $s5, 96 + addi.d $t0, $zero, -1 + sll.w $t0, $t0, $a4 + andn $t0, $fp, $t0 + add.d $a7, $a7, $t0 + lu12i.w $t0, 1 + ori $t0, $t0, 3052 + ldx.w $t0, $s5, $t0 + st.w $a7, $s5, 96 + srl.d $fp, $fp, $a4 + sub.w $s1, $a5, $a4 + add.d $a4, $t0, $a4 + stptr.w $a4, $s5, 7148 + move $s8, $a6 + move $s2, $a2 +.LBB6_334: # in Loop: Header=BB6_14 Depth=1 + ori $a2, $s4, 3916 st.w $a2, $s5, 8 -.LBB6_340: # in Loop: Header=BB6_16 Depth=1 - beqz $ra, .LBB6_388 -# %bb.341: # in Loop: Header=BB6_16 Depth=1 +.LBB6_335: # in Loop: Header=BB6_14 Depth=1 + beqz $s7, .LBB6_386 +# %bb.336: # in Loop: Header=BB6_14 Depth=1 ld.w $a2, $s5, 96 - ld.d $a3, $sp, 120 # 8-byte Folded Reload - sub.w $a3, $a3, $ra - bgeu $a3, $a2, .LBB6_346 -# %bb.342: # in Loop: Header=BB6_16 Depth=1 - ld.w $a4, $s5, 64 - sub.w $a2, $a2, $a3 - bgeu $a4, $a2, .LBB6_347 -# %bb.343: # in Loop: Header=BB6_16 Depth=1 - ldptr.w $a3, $s5, 7144 - beqz $a3, .LBB6_347 -# %bb.344: # in Loop: Header=BB6_16 Depth=1 + ld.d $a4, $sp, 112 # 8-byte Folded Reload + sub.w $a4, $a4, $s7 + bgeu $a4, $a2, .LBB6_343 +# %bb.337: # in Loop: Header=BB6_14 Depth=1 + ld.w $a5, $s5, 64 + sub.w $a2, $a2, $a4 + bgeu $a5, $a2, .LBB6_344 +# %bb.338: # in Loop: Header=BB6_14 Depth=1 + ldptr.w $a4, $s5, 7144 + beqz $a4, .LBB6_344 +# %bb.339: # in Loop: Header=BB6_14 Depth=1 pcalau12i $a2, %pc_hi20(.L.str.16) addi.d $a2, $a2, %pc_lo12(.L.str.16) -.LBB6_345: # %.thread - # in Loop: Header=BB6_16 Depth=1 +.LBB6_340: # %.thread + # in Loop: Header=BB6_14 Depth=1 st.d $a2, $a0, 48 - lu12i.w $a2, 3 - ori $a2, $a2, 3921 + ori $a2, $s4, 3921 +.LBB6_341: # %.thread + # in Loop: Header=BB6_14 Depth=1 + st.w $a2, $s5, 8 +.LBB6_342: # %.thread + # in Loop: Header=BB6_14 Depth=1 + move $s0, $s8 b .LBB6_13 -.LBB6_346: # in Loop: Header=BB6_16 Depth=1 - ld.w $a3, $s5, 92 +.LBB6_343: # in Loop: Header=BB6_14 Depth=1 + ld.w $a4, $s5, 92 bstrpick.d $a2, $a2, 31, 0 - sub.d $a6, $t8, $a2 - move $a2, $a3 - b .LBB6_351 -.LBB6_347: # in Loop: Header=BB6_16 Depth=1 - ld.w $a3, $s5, 68 - bgeu $a3, $a2, .LBB6_349 -# %bb.348: # in Loop: Header=BB6_16 Depth=1 - ld.w $a4, $s5, 60 - sub.w $a2, $a2, $a3 - sub.w $a4, $a4, $a2 - b .LBB6_350 -.LBB6_349: # in Loop: Header=BB6_16 Depth=1 - sub.w $a4, $a3, $a2 -.LBB6_350: # in Loop: Header=BB6_16 Depth=1 - ld.d $a5, $s5, 72 - ld.w $a3, $s5, 92 + sub.d $a7, $ra, $a2 + move $a2, $a4 + b .LBB6_348 +.LBB6_344: # in Loop: Header=BB6_14 Depth=1 + ld.w $a4, $s5, 68 + bgeu $a4, $a2, .LBB6_346 +# %bb.345: # in Loop: Header=BB6_14 Depth=1 + ld.w $a5, $s5, 60 + sub.w $a2, $a2, $a4 + sub.w $a5, $a5, $a2 + b .LBB6_347 +.LBB6_346: # in Loop: Header=BB6_14 Depth=1 + sub.w $a5, $a4, $a2 +.LBB6_347: # in Loop: Header=BB6_14 Depth=1 + ld.d $a6, $s5, 72 + ld.w $a4, $s5, 92 + bstrpick.d $a5, $a5, 31, 0 + add.d $a7, $a6, $a5 + sltu $a5, $a2, $a4 + masknez $a6, $a4, $a5 + maskeqz $a2, $a2, $a5 + or $a2, $a2, $a6 +.LBB6_348: # in Loop: Header=BB6_14 Depth=1 + sltu $a5, $a2, $s7 + masknez $a6, $s7, $a5 + maskeqz $a2, $a2, $a5 + or $a2, $a2, $a6 + sub.d $a5, $a4, $a2 + addi.w $a4, $a2, -1 + ori $a6, $zero, 31 + st.w $a5, $s5, 92 + bltu $a4, $a6, .LBB6_353 +# %bb.349: # in Loop: 
Header=BB6_14 Depth=1 + sub.d $a5, $ra, $a7 + ori $a6, $zero, 32 + bltu $a5, $a6, .LBB6_353 +# %bb.350: # %vector.ph + # in Loop: Header=BB6_14 Depth=1 bstrpick.d $a4, $a4, 31, 0 - add.d $a6, $a5, $a4 - sltu $a4, $a2, $a3 - masknez $a5, $a3, $a4 - maskeqz $a2, $a2, $a4 - or $a2, $a2, $a5 -.LBB6_351: # in Loop: Header=BB6_16 Depth=1 - sltu $a4, $a2, $ra - masknez $a5, $ra, $a4 - maskeqz $a2, $a2, $a4 - or $a2, $a2, $a5 - sub.d $a4, $a3, $a2 - addi.w $a3, $a2, -1 - ori $a5, $zero, 31 - st.w $a4, $s5, 92 - bltu $a3, $a5, .LBB6_356 -# %bb.352: # in Loop: Header=BB6_16 Depth=1 - sub.d $a4, $t8, $a6 - ori $a5, $zero, 32 - bltu $a4, $a5, .LBB6_356 -# %bb.353: # %vector.ph - # in Loop: Header=BB6_16 Depth=1 - bstrpick.d $a3, $a3, 31, 0 - addi.d $a7, $a3, 1 - bstrpick.d $a3, $a7, 32, 5 - slli.d $t0, $a3, 5 - add.d $a3, $t8, $t0 - sub.d $a4, $a2, $t0 - add.d $a5, $a6, $t0 - addi.d $a6, $a6, 16 - addi.d $t1, $t8, 16 - move $t2, $t0 -.LBB6_354: # %vector.body - # Parent Loop BB6_16 Depth=1 + addi.d $t0, $a4, 1 + bstrpick.d $a4, $t0, 32, 5 + slli.d $t1, $a4, 5 + add.d $a4, $ra, $t1 + sub.d $a5, $a2, $t1 + add.d $a6, $a7, $t1 + addi.d $a7, $a7, 16 + addi.d $t2, $ra, 16 + move $t3, $t1 +.LBB6_351: # %vector.body + # Parent Loop BB6_14 Depth=1 # => This Inner Loop Header: Depth=2 - vld $vr0, $a6, -16 - vld $vr1, $a6, 0 - vst $vr0, $t1, -16 - vst $vr1, $t1, 0 - addi.d $t2, $t2, -32 - addi.d $a6, $a6, 32 - addi.d $t1, $t1, 32 - bnez $t2, .LBB6_354 -# %bb.355: # %middle.block - # in Loop: Header=BB6_16 Depth=1 - bne $a7, $t0, .LBB6_357 - b .LBB6_359 -.LBB6_356: # in Loop: Header=BB6_16 Depth=1 - move $a3, $t8 - move $a4, $a2 - move $a5, $a6 -.LBB6_357: # %scalar.ph.preheader - # in Loop: Header=BB6_16 Depth=1 - move $a6, $a3 -.LBB6_358: # %scalar.ph - # Parent Loop BB6_16 Depth=1 + vld $vr0, $a7, -16 + vld $vr1, $a7, 0 + vst $vr0, $t2, -16 + vst $vr1, $t2, 0 + addi.d $t3, $t3, -32 + addi.d $a7, $a7, 32 + addi.d $t2, $t2, 32 + bnez $t3, .LBB6_351 +# %bb.352: # %middle.block + # in Loop: Header=BB6_14 Depth=1 + bne $t0, $t1, .LBB6_354 + b .LBB6_356 +.LBB6_353: # in Loop: Header=BB6_14 Depth=1 + move $a4, $ra + move $a5, $a2 + move $a6, $a7 +.LBB6_354: # %scalar.ph.preheader + # in Loop: Header=BB6_14 Depth=1 + move $a7, $a4 +.LBB6_355: # %scalar.ph + # Parent Loop BB6_14 Depth=1 # => This Inner Loop Header: Depth=2 - ld.b $a7, $a5, 0 - addi.d $a5, $a5, 1 - addi.d $a3, $a6, 1 - addi.w $a4, $a4, -1 - st.b $a7, $a6, 0 - move $a6, $a3 - bnez $a4, .LBB6_358 -.LBB6_359: # %.loopexit3799 - # in Loop: Header=BB6_16 Depth=1 - ld.w $a4, $s5, 92 - sub.w $ra, $ra, $a2 - bnez $a4, .LBB6_361 -# %bb.360: # in Loop: Header=BB6_16 Depth=1 - lu12i.w $a2, 3 - ori $a2, $a2, 3912 + ld.b $t0, $a6, 0 + addi.d $a6, $a6, 1 + addi.d $a4, $a7, 1 + addi.w $a5, $a5, -1 + st.b $t0, $a7, 0 + move $a7, $a4 + bnez $a5, .LBB6_355 +.LBB6_356: # %.loopexit3799 + # in Loop: Header=BB6_14 Depth=1 + ld.w $a5, $s5, 92 + sub.w $s7, $s7, $a2 + bnez $a5, .LBB6_358 +# %bb.357: # in Loop: Header=BB6_14 Depth=1 + ori $a2, $s4, 3912 st.w $a2, $s5, 8 -.LBB6_361: # in Loop: Header=BB6_16 Depth=1 - move $fp, $s8 - move $t8, $a3 - b .LBB6_15 -.LBB6_362: # in Loop: Header=BB6_16 Depth=1 - lu12i.w $a3, 3 - ori $a3, $a3, 3917 -.LBB6_363: # %.thread - # in Loop: Header=BB6_16 Depth=1 - st.w $a3, $s5, 8 - move $s1, $a2 - b .LBB6_15 -.LBB6_364: # in Loop: Header=BB6_16 Depth=1 - lu32i.d $a3, 0 - stptr.w $a3, $s5, 7148 - st.w $s7, $s5, 8 - move $s1, $a2 - b .LBB6_15 -.LBB6_365: # in Loop: Header=BB6_16 Depth=1 - pcalau12i $a3, %pc_hi20(.L.str.14) - addi.d $a3, $a3, 
%pc_lo12(.L.str.14) - b .LBB6_336 -.LBB6_366: # in Loop: Header=BB6_16 Depth=1 - move $s4, $s6 - move $s6, $t8 +.LBB6_358: # in Loop: Header=BB6_14 Depth=1 + move $s0, $s8 + move $ra, $a4 + b .LBB6_13 +.LBB6_359: # in Loop: Header=BB6_14 Depth=1 + ori $a4, $s4, 3917 +.LBB6_360: # %.thread + # in Loop: Header=BB6_14 Depth=1 + st.w $a4, $s5, 8 + move $s2, $a2 + b .LBB6_13 +.LBB6_361: # in Loop: Header=BB6_14 Depth=1 + lu32i.d $a4, 0 + stptr.w $a4, $s5, 7148 + st.w $s3, $s5, 8 + move $s2, $a2 + b .LBB6_13 +.LBB6_362: # in Loop: Header=BB6_14 Depth=1 + pcalau12i $a4, %pc_hi20(.L.str.14) + addi.d $a4, $a4, %pc_lo12(.L.str.14) + b .LBB6_331 +.LBB6_363: # in Loop: Header=BB6_14 Depth=1 st.d $ra, $sp, 16 # 8-byte Folded Spill - lu12i.w $s7, 3 - andi $a0, $s2, 15 + andi $a0, $fp, 15 ori $a1, $zero, 8 - bne $a0, $a1, .LBB6_372 -# %bb.367: # in Loop: Header=BB6_16 Depth=1 + bne $a0, $a1, .LBB6_369 +# %bb.364: # in Loop: Header=BB6_14 Depth=1 ld.w $a2, $s5, 56 - bstrpick.d $a0, $s2, 7, 4 + bstrpick.d $a0, $fp, 7, 4 addi.w $a1, $a0, 8 - bnez $a2, .LBB6_369 -# %bb.368: # in Loop: Header=BB6_16 Depth=1 + bnez $a2, .LBB6_366 +# %bb.365: # in Loop: Header=BB6_14 Depth=1 st.w $a1, $s5, 56 move $a2, $a1 -.LBB6_369: # in Loop: Header=BB6_16 Depth=1 +.LBB6_366: # in Loop: Header=BB6_14 Depth=1 ori $a3, $zero, 7 - bltu $a3, $a0, .LBB6_374 -# %bb.370: # in Loop: Header=BB6_16 Depth=1 - bltu $a2, $a1, .LBB6_374 -# %bb.371: # in Loop: Header=BB6_16 Depth=1 + bltu $a3, $a0, .LBB6_371 +# %bb.367: # in Loop: Header=BB6_14 Depth=1 + bltu $a2, $a1, .LBB6_371 +# %bb.368: # in Loop: Header=BB6_14 Depth=1 ori $a1, $zero, 256 sll.w $a0, $a1, $a0 st.w $a0, $s5, 28 @@ -2902,154 +2901,165 @@ inflate: # @inflate pcaddu18i $ra, %call36(adler32) jirl $ra, $ra, 0 move $a1, $a0 - move $s0, $zero + move $s1, $zero st.d $a0, $s5, 32 ld.d $a0, $sp, 152 # 8-byte Folded Reload st.d $a1, $a0, 96 lu12i.w $a1, 2 - and $a1, $s2, $a1 + and $a1, $fp, $a1 sltui $a1, $a1, 1 + move $s4, $s7 ori $a2, $s7, 3901 masknez $a2, $a2, $a1 - ld.d $s7, $sp, 88 # 8-byte Folded Reload - maskeqz $a1, $s7, $a1 + maskeqz $a1, $s3, $a1 or $a1, $a1, $a2 st.w $a1, $s5, 8 - move $s2, $zero - ld.d $a1, $sp, 144 # 8-byte Folded Reload - b .LBB6_376 -.LBB6_372: # in Loop: Header=BB6_16 Depth=1 + move $fp, $zero + b .LBB6_373 +.LBB6_369: # in Loop: Header=BB6_14 Depth=1 pcalau12i $a0, %pc_hi20(.L.str.2) addi.d $a1, $a0, %pc_lo12(.L.str.2) - b .LBB6_375 -.LBB6_373: # in Loop: Header=BB6_16 Depth=1 + b .LBB6_372 +.LBB6_370: # in Loop: Header=BB6_14 Depth=1 pcalau12i $a2, %pc_hi20(.L.str.10) addi.d $a2, $a2, %pc_lo12(.L.str.10) - b .LBB6_345 -.LBB6_374: # in Loop: Header=BB6_16 Depth=1 - srli.d $s2, $s2, 4 - addi.w $s0, $s0, -4 + b .LBB6_340 +.LBB6_371: # in Loop: Header=BB6_14 Depth=1 + srli.d $fp, $fp, 4 + addi.w $s1, $s1, -4 pcalau12i $a0, %pc_hi20(.L.str.3) addi.d $a1, $a0, %pc_lo12(.L.str.3) -.LBB6_375: # %.thread - # in Loop: Header=BB6_16 Depth=1 +.LBB6_372: # %.thread + # in Loop: Header=BB6_14 Depth=1 ld.d $a0, $sp, 152 # 8-byte Folded Reload st.d $a1, $a0, 48 + move $s4, $s7 ori $a1, $s7, 3921 st.w $a1, $s5, 8 +.LBB6_373: # %.thread + # in Loop: Header=BB6_14 Depth=1 ld.d $a1, $sp, 144 # 8-byte Folded Reload - ld.d $s7, $sp, 88 # 8-byte Folded Reload -.LBB6_376: # %.thread - # in Loop: Header=BB6_16 Depth=1 + ld.d $a3, $sp, 120 # 8-byte Folded Reload + ld.d $s7, $sp, 96 # 8-byte Folded Reload ld.d $ra, $sp, 16 # 8-byte Folded Reload - move $t8, $s6 - move $s6, $s4 - b .LBB6_270 -.LBB6_377: # in Loop: Header=BB6_16 Depth=1 + b .LBB6_12 +.LBB6_374: # in Loop: 
Header=BB6_14 Depth=1 pcalau12i $a2, %pc_hi20(.L.str.10) addi.d $a2, $a2, %pc_lo12(.L.str.10) - b .LBB6_287 -.LBB6_378: - move $s1, $zero - b .LBB6_389 -.LBB6_379: - move $s1, $zero + b .LBB6_282 +.LBB6_375: + ld.w $a4, $a0, 8 + bnez $a4, .LBB6_408 + b .LBB6_8 +.LBB6_376: + move $s2, $zero + b .LBB6_390 +.LBB6_377: + move $s2, $zero move $s8, $a2 - b .LBB6_389 -.LBB6_380: # %.loopexit1268.loopexit2946 - bstrpick.d $a2, $s1, 31, 0 - alsl.w $s0, $s1, $s0, 3 - b .LBB6_384 -.LBB6_381: - move $s1, $zero - move $s8, $a3 - b .LBB6_389 -.LBB6_382: # %.loopexit1268.loopexit2943 - move $s1, $zero - bstrpick.d $a2, $t1, 31, 0 - alsl.w $s0, $t1, $s0, 3 - add.d $s8, $t2, $a2 - b .LBB6_389 -.LBB6_383: # %.loopexit1268.loopexit2944 - alsl.w $s0, $s1, $s0, 3 - bstrpick.d $a2, $s1, 31, 0 -.LBB6_384: # %.loopexit1268 + b .LBB6_390 +.LBB6_378: # %.loopexit1268.loopexit2946 + bstrpick.d $a2, $s2, 31, 0 + alsl.w $s1, $s2, $s1, 3 + b .LBB6_382 +.LBB6_379: + move $s2, $zero + move $s8, $a4 + b .LBB6_390 +.LBB6_380: # %.loopexit1268.loopexit2943 + move $s2, $zero + bstrpick.d $a2, $t2, 31, 0 + alsl.w $s1, $t2, $s1, 3 + add.d $s8, $t3, $a2 + b .LBB6_390 +.LBB6_381: # %.loopexit1268.loopexit2944 + alsl.w $s1, $s2, $s1, 3 + bstrpick.d $a2, $s2, 31, 0 +.LBB6_382: # %.loopexit1268 add.d $s8, $s8, $a2 - ld.d $a2, $sp, 96 # 8-byte Folded Reload - st.d $a2, $sp, 112 # 8-byte Folded Spill - move $s1, $zero - b .LBB6_391 -.LBB6_385: # %.loopexit1268.loopexit2945 - move $s1, $zero - bstrpick.d $a4, $a2, 31, 0 - alsl.w $s0, $a2, $a3, 3 - add.d $s8, $fp, $a4 + ld.d $a2, $sp, 88 # 8-byte Folded Reload + st.d $a2, $sp, 104 # 8-byte Folded Spill + move $s2, $zero + b .LBB6_392 +.LBB6_383: # %.loopexit1268.loopexit2945 + move $s2, $zero + bstrpick.d $a3, $a2, 31, 0 + alsl.w $s1, $a2, $a4, 3 b .LBB6_389 +.LBB6_384: + move $s2, $zero + move $s1, $t0 + b .LBB6_390 +.LBB6_385: # %..loopexit1268.loopexit4613_crit_edge + add.d $s8, $s8, $s0 + ld.d $a2, $sp, 88 # 8-byte Folded Reload + st.d $a2, $sp, 104 # 8-byte Folded Spill + move $s4, $s6 + move $s6, $s3 + ld.d $s3, $sp, 80 # 8-byte Folded Reload + ld.d $s7, $sp, 96 # 8-byte Folded Reload + b .LBB6_392 .LBB6_386: - move $s1, $zero - move $s0, $a7 - b .LBB6_389 -.LBB6_387: # %..loopexit1268.loopexit4613_crit_edge - add.d $s8, $s8, $fp - ld.d $a2, $sp, 96 # 8-byte Folded Reload - st.d $a2, $sp, 112 # 8-byte Folded Spill - ld.d $s6, $sp, 128 # 8-byte Folded Reload - ld.d $s7, $sp, 88 # 8-byte Folded Reload - b .LBB6_391 -.LBB6_388: - move $ra, $zero + move $s7, $zero + b .LBB6_390 +.LBB6_387: + move $s0, $s8 + b .LBB6_413 +.LBB6_388: # %.loopexit1268.loopexit2948 + move $s2, $zero + bstrpick.d $a3, $a2, 31, 0 + alsl.w $s1, $a2, $a5, 3 .LBB6_389: # %.loopexit1268 - ld.d $a2, $sp, 96 # 8-byte Folded Reload + add.d $s8, $s0, $a3 .LBB6_390: # %.loopexit1268 - st.d $a2, $sp, 112 # 8-byte Folded Spill + ld.d $a2, $sp, 88 # 8-byte Folded Reload .LBB6_391: # %.loopexit1268 - st.d $t8, $a0, 24 - st.w $ra, $a0, 32 + st.d $a2, $sp, 104 # 8-byte Folded Spill +.LBB6_392: # %.loopexit1268 + st.d $ra, $a0, 24 + st.w $s7, $a0, 32 ld.w $a2, $s5, 60 st.d $s8, $a0, 0 - st.w $s1, $a0, 8 - st.d $s2, $s5, 80 - st.w $s0, $s5, 88 - ld.d $s0, $sp, 120 # 8-byte Folded Reload - beqz $a2, .LBB6_395 -.LBB6_392: - sub.w $a2, $s0, $ra - move $a1, $t8 + st.w $s2, $a0, 8 + st.d $fp, $s5, 80 + st.w $s1, $s5, 88 + ld.d $fp, $sp, 112 # 8-byte Folded Reload + beqz $a2, .LBB6_396 +.LBB6_393: + sub.w $a2, $fp, $s7 + move $a1, $ra pcaddu18i $ra, %call36(updatewindow) jirl $ra, $ra, 0 - beqz $a0, .LBB6_399 -# %bb.393: - 
lu12i.w $a0, 3 - ori $a0, $a0, 3922 + beqz $a0, .LBB6_400 +# %bb.394: + ori $a0, $s4, 3922 st.w $a0, $s5, 8 -.LBB6_394: # %inflateStateCheck.exit.thread.loopexit - addi.w $s3, $zero, -4 - b .LBB6_407 .LBB6_395: - beq $s0, $ra, .LBB6_400 -# %bb.396: - ld.w $a2, $s5, 8 - lu12i.w $a3, 3 - ori $a3, $a3, 3920 - bltu $a3, $a2, .LBB6_400 + addi.w $a3, $zero, -4 + b .LBB6_408 +.LBB6_396: + beq $fp, $s7, .LBB6_401 # %bb.397: - ori $a3, $zero, 4 - bne $a1, $a3, .LBB6_392 + ld.w $a2, $s5, 8 + ori $a3, $s4, 3920 + bltu $a3, $a2, .LBB6_401 # %bb.398: - lu12i.w $a3, 3 - ori $a3, $a3, 3917 - bltu $a3, $a2, .LBB6_400 - b .LBB6_392 -.LBB6_399: # %._crit_edge2837 + ori $a3, $zero, 4 + bne $a1, $a3, .LBB6_393 +# %bb.399: + ori $a3, $s4, 3917 + bltu $a3, $a2, .LBB6_401 + b .LBB6_393 +.LBB6_400: # %._crit_edge2837 ld.d $a0, $sp, 152 # 8-byte Folded Reload - ld.w $s1, $a0, 8 - ld.w $ra, $a0, 32 + ld.w $s2, $a0, 8 + ld.w $s7, $a0, 32 ld.d $a1, $sp, 144 # 8-byte Folded Reload -.LBB6_400: - ld.d $s2, $sp, 112 # 8-byte Folded Reload - sub.d $a3, $s6, $s1 +.LBB6_401: + ld.d $s0, $sp, 104 # 8-byte Folded Reload + sub.d $a3, $s6, $s2 ld.d $a4, $a0, 16 - sub.w $a2, $s0, $ra + sub.w $a2, $fp, $s7 bstrpick.d $a3, $a3, 31, 0 ld.d $a5, $a0, 40 add.d $a3, $a4, $a3 @@ -3062,74 +3072,70 @@ inflate: # @inflate add.d $a4, $a4, $a3 andi $a5, $a5, 4 st.d $a4, $s5, 40 - beqz $a5, .LBB6_406 -# %bb.401: - beq $s0, $ra, .LBB6_406 + beqz $a5, .LBB6_407 # %bb.402: - move $fp, $ra + beq $fp, $s7, .LBB6_407 +# %bb.403: ld.w $a4, $s5, 24 ld.d $a1, $a0, 24 ld.d $a0, $s5, 32 sub.d $a1, $a1, $a3 - beqz $a4, .LBB6_404 -# %bb.403: + beqz $a4, .LBB6_405 +# %bb.404: pcaddu18i $ra, %call36(crc32) jirl $ra, $ra, 0 - b .LBB6_405 -.LBB6_404: + b .LBB6_406 +.LBB6_405: pcaddu18i $ra, %call36(adler32) jirl $ra, $ra, 0 -.LBB6_405: +.LBB6_406: move $a1, $a0 st.d $a0, $s5, 32 ld.d $a0, $sp, 152 # 8-byte Folded Reload st.d $a1, $a0, 96 ld.d $a1, $sp, 144 # 8-byte Folded Reload - move $ra, $fp -.LBB6_406: +.LBB6_407: ld.w $a2, $s5, 12 ld.w $a3, $s5, 88 ld.w $a4, $s5, 8 sltu $a2, $zero, $a2 slli.d $a2, $a2, 6 add.d $a2, $a2, $a3 - xor $a3, $a4, $s7 + xor $a3, $a4, $s3 sltui $a3, $a3, 1 slli.d $a3, $a3, 7 add.d $a2, $a2, $a3 - lu12i.w $a5, 3 - ori $a3, $a5, 3911 + ori $a3, $s4, 3911 xor $a3, $a4, $a3 sltui $a3, $a3, 1 - ori $a5, $a5, 3906 + ori $a5, $s4, 3906 xor $a4, $a4, $a5 sltui $a4, $a4, 1 or $a3, $a3, $a4 slli.d $a3, $a3, 8 add.d $a2, $a2, $a3 st.w $a2, $a0, 88 - xor $a0, $s6, $s1 + xor $a0, $s6, $s2 sltui $a0, $a0, 1 - xor $a2, $s0, $ra + xor $a2, $fp, $s7 sltui $a2, $a2, 1 addi.d $a1, $a1, -4 sltui $a1, $a1, 1 - sltui $a3, $s2, 1 - masknez $a4, $s2, $a3 + sltui $a3, $s0, 1 + masknez $a4, $s0, $a3 addi.w $a5, $zero, -5 maskeqz $a3, $a5, $a3 or $a3, $a3, $a4 maskeqz $a4, $a3, $a1 maskeqz $a3, $a3, $a2 - masknez $a2, $s2, $a2 + masknez $a2, $s0, $a2 or $a2, $a3, $a2 maskeqz $a2, $a2, $a0 - masknez $a0, $s2, $a0 + masknez $a0, $s0, $a0 or $a0, $a2, $a0 masknez $a0, $a0, $a1 - or $s3, $a4, $a0 -.LBB6_407: # %inflateStateCheck.exit.thread - move $a0, $s3 + or $a3, $a4, $a0 +.LBB6_408: ld.d $s8, $sp, 168 # 8-byte Folded Reload ld.d $s7, $sp, 176 # 8-byte Folded Reload ld.d $s6, $sp, 184 # 8-byte Folded Reload @@ -3142,95 +3148,85 @@ inflate: # @inflate ld.d $fp, $sp, 240 # 8-byte Folded Reload ld.d $ra, $sp, 248 # 8-byte Folded Reload addi.d $sp, $sp, 256 +.LBB6_409: # %inflateStateCheck.exit.thread + move $a0, $a3 ret -.LBB6_408: - move $fp, $s8 - b .LBB6_413 -.LBB6_409: # %.loopexit1268.loopexit2948 - move $s1, $zero - bstrpick.d $a3, $a2, 31, 0 - alsl.w 
$s0, $a2, $a4, 3 - add.d $s8, $fp, $a3 - ld.d $a2, $sp, 96 # 8-byte Folded Reload - st.d $a2, $sp, 112 # 8-byte Folded Spill - move $t8, $s4 - b .LBB6_391 .LBB6_410: - st.d $t8, $a0, 24 - st.w $ra, $a0, 32 + st.d $ra, $a0, 24 + st.w $s7, $a0, 32 st.d $s8, $a0, 0 - st.w $s1, $a0, 8 - st.d $s2, $s5, 80 - st.w $s0, $s5, 88 - ori $s3, $zero, 2 - b .LBB6_407 + st.w $s2, $a0, 8 + st.d $fp, $s5, 80 + st.w $s1, $s5, 88 + ori $a3, $zero, 2 + b .LBB6_408 .LBB6_411: - ld.d $a2, $sp, 96 # 8-byte Folded Reload - st.d $a2, $sp, 112 # 8-byte Folded Spill - ld.d $s6, $sp, 128 # 8-byte Folded Reload - move $s7, $a5 - b .LBB6_391 + ld.d $a2, $sp, 88 # 8-byte Folded Reload + st.d $a2, $sp, 104 # 8-byte Folded Spill + move $s4, $s6 + ld.d $s6, $sp, 96 # 8-byte Folded Reload + ld.d $s3, $sp, 80 # 8-byte Folded Reload + b .LBB6_392 .LBB6_412: - move $s2, $zero - move $s0, $zero + move $fp, $zero + move $s1, $zero .LBB6_413: - lu12i.w $a2, 3 - ori $a2, $a2, 3920 + ori $a2, $s4, 3920 st.w $a2, $s5, 8 ori $a2, $zero, 1 - st.d $a2, $sp, 112 # 8-byte Folded Spill - move $s8, $fp - b .LBB6_391 + st.d $a2, $sp, 104 # 8-byte Folded Spill + move $s8, $s0 + b .LBB6_392 .LBB6_414: # %.loopexit1268.loopexit3806 ori $a2, $zero, 1 - b .LBB6_390 + b .LBB6_391 .LBB6_415: - move $s0, $s2 - b .LBB6_389 + move $s1, $fp + b .LBB6_390 .LBB6_416: - srli.d $s2, $s2, 3 - addi.d $s0, $s0, -3 - move $s8, $fp - b .LBB6_389 + srli.d $fp, $fp, 3 + addi.d $s1, $s1, -3 + move $s8, $s0 + b .LBB6_390 .Lfunc_end6: .size inflate, .Lfunc_end6-inflate .section .rodata,"a",@progbits .p2align 2, 0x0 .LJTI6_0: - .word .LBB6_18-.LJTI6_0 - .word .LBB6_42-.LJTI6_0 - .word .LBB6_34-.LJTI6_0 - .word .LBB6_49-.LJTI6_0 - .word .LBB6_50-.LJTI6_0 + .word .LBB6_16-.LJTI6_0 + .word .LBB6_40-.LJTI6_0 + .word .LBB6_32-.LJTI6_0 + .word .LBB6_47-.LJTI6_0 + .word .LBB6_48-.LJTI6_0 .word .LBB6_211-.LJTI6_0 .word .LBB6_222-.LJTI6_0 .word .LBB6_239-.LJTI6_0 .word .LBB6_256-.LJTI6_0 - .word .LBB6_31-.LJTI6_0 + .word .LBB6_29-.LJTI6_0 + .word .LBB6_58-.LJTI6_0 .word .LBB6_60-.LJTI6_0 - .word .LBB6_62-.LJTI6_0 - .word .LBB6_63-.LJTI6_0 - .word .LBB6_35-.LJTI6_0 + .word .LBB6_61-.LJTI6_0 + .word .LBB6_33-.LJTI6_0 + .word .LBB6_72-.LJTI6_0 .word .LBB6_73-.LJTI6_0 - .word .LBB6_74-.LJTI6_0 - .word .LBB6_39-.LJTI6_0 - .word .LBB6_56-.LJTI6_0 - .word .LBB6_29-.LJTI6_0 - .word .LBB6_290-.LJTI6_0 - .word .LBB6_291-.LJTI6_0 - .word .LBB6_24-.LJTI6_0 - .word .LBB6_315-.LJTI6_0 - .word .LBB6_30-.LJTI6_0 - .word .LBB6_340-.LJTI6_0 + .word .LBB6_37-.LJTI6_0 + .word .LBB6_54-.LJTI6_0 + .word .LBB6_27-.LJTI6_0 + .word .LBB6_285-.LJTI6_0 + .word .LBB6_286-.LJTI6_0 .word .LBB6_22-.LJTI6_0 - .word .LBB6_52-.LJTI6_0 - .word .LBB6_57-.LJTI6_0 + .word .LBB6_310-.LJTI6_0 + .word .LBB6_28-.LJTI6_0 + .word .LBB6_335-.LJTI6_0 + .word .LBB6_20-.LJTI6_0 + .word .LBB6_50-.LJTI6_0 + .word .LBB6_55-.LJTI6_0 .word .LBB6_414-.LJTI6_0 - .word .LBB6_391-.LJTI6_0 - .word .LBB6_394-.LJTI6_0 + .word .LBB6_392-.LJTI6_0 + .word .LBB6_395-.LJTI6_0 .LJTI6_1: .word .LBB6_148-.LJTI6_1 - .word .LBB6_69-.LJTI6_1 + .word .LBB6_68-.LJTI6_1 .word .LBB6_146-.LJTI6_1 .word .LBB6_147-.LJTI6_1 # -- End function diff --git a/results/MultiSource/Applications/JM/ldecod/CMakeFiles/ldecod.dir/block.s b/results/MultiSource/Applications/JM/ldecod/CMakeFiles/ldecod.dir/block.s index d2d13877..05073ce1 100644 --- a/results/MultiSource/Applications/JM/ldecod/CMakeFiles/ldecod.dir/block.s +++ b/results/MultiSource/Applications/JM/ldecod/CMakeFiles/ldecod.dir/block.s @@ -2472,20 +2472,11 @@ intrapred_luma_16x16: # 
@intrapred_luma_16x16 .dword 0 # 0x0 .dword 1 # 0x1 .LCPI2_2: - .half 3 # 0x3 - .half 9 # 0x9 - .half 2 # 0x2 - .half 11 # 0xb - .half 1 # 0x1 - .half 13 # 0xd - .half 0 # 0x0 - .half 15 # 0xf -.LCPI2_3: .word 4 # 0x4 .word 5 # 0x5 .word 6 # 0x6 .word 7 # 0x7 -.LCPI2_4: +.LCPI2_3: .word 0 # 0x0 .word 1 # 0x1 .word 2 # 0x2 @@ -3609,38 +3600,35 @@ intrapred_chroma: # @intrapred_chroma vld $vr2, $t3, %pc_lo12(.LCPI2_0) pcalau12i $t3, %pc_hi20(.LCPI2_1) vld $vr3, $t3, %pc_lo12(.LCPI2_1) - pcalau12i $t3, %pc_hi20(.LCPI2_2) - vld $vr4, $t3, %pc_lo12(.LCPI2_2) vinsgr2vr.w $vr1, $a4, 0 alsl.d $a4, $a5, $a3, 1 addi.d $a4, $a4, 8 move $t3, $t1 - vori.b $vr5, $vr0, 0 .p2align 4, , 16 .LBB2_129: # %vector.body887 # =>This Inner Loop Header: Depth=1 ld.d $t4, $a4, -8 ld.d $t5, $a4, 0 - vinsgr2vr.d $vr6, $t4, 0 - vinsgr2vr.d $vr7, $t5, 0 + vinsgr2vr.d $vr4, $t4, 0 + vinsgr2vr.d $vr5, $t5, 0 alsl.d $t4, $t2, $a3, 1 ld.d $t5, $t4, -6 + vsllwil.wu.hu $vr4, $vr4, 0 ld.d $t4, $t4, -14 - vilvl.h $vr6, $vr0, $vr6 - vilvl.h $vr7, $vr0, $vr7 - vinsgr2vr.d $vr8, $t5, 0 - vinsgr2vr.d $vr9, $t4, 0 - vori.b $vr10, $vr4, 0 - vshuf.h $vr10, $vr0, $vr8 - vori.b $vr8, $vr4, 0 - vshuf.h $vr8, $vr0, $vr9 - vsub.w $vr6, $vr6, $vr10 - vsub.w $vr7, $vr7, $vr8 - vpickev.w $vr8, $vr2, $vr3 - vaddi.wu $vr9, $vr8, 1 - vaddi.wu $vr8, $vr8, 5 - vmadd.w $vr1, $vr6, $vr9 - vmadd.w $vr5, $vr7, $vr8 + vsllwil.wu.hu $vr5, $vr5, 0 + vinsgr2vr.d $vr6, $t5, 0 + vshuf4i.h $vr6, $vr6, 27 + vinsgr2vr.d $vr7, $t4, 0 + vshuf4i.h $vr7, $vr7, 27 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.wu.hu $vr7, $vr7, 0 + vsub.w $vr4, $vr4, $vr6 + vsub.w $vr5, $vr5, $vr7 + vpickev.w $vr6, $vr2, $vr3 + vaddi.wu $vr7, $vr6, 1 + vaddi.wu $vr6, $vr6, 5 + vmadd.w $vr1, $vr4, $vr7 + vmadd.w $vr0, $vr5, $vr6 vaddi.du $vr3, $vr3, 8 vaddi.du $vr2, $vr2, 8 addi.d $t3, $t3, -8 @@ -3648,7 +3636,7 @@ intrapred_chroma: # @intrapred_chroma addi.w $t2, $t2, -8 bnez $t3, .LBB2_129 # %bb.130: # %middle.block895 - vadd.w $vr0, $vr5, $vr1 + vadd.w $vr0, $vr0, $vr1 vhaddw.d.w $vr0, $vr0, $vr0 vhaddw.q.d $vr0, $vr0, $vr0 vpickve2gr.d $a4, $vr0, 0 @@ -3814,10 +3802,10 @@ intrapred_chroma: # @intrapred_chroma # in Loop: Header=BB2_140 Depth=1 sub.d $t3, $a2, $a3 addi.d $t3, $t3, 1 + pcalau12i $t4, %pc_hi20(.LCPI2_2) + vld $vr3, $t4, %pc_lo12(.LCPI2_2) pcalau12i $t4, %pc_hi20(.LCPI2_3) - vld $vr3, $t4, %pc_lo12(.LCPI2_3) - pcalau12i $t4, %pc_hi20(.LCPI2_4) - vld $vr4, $t4, %pc_lo12(.LCPI2_4) + vld $vr4, $t4, %pc_lo12(.LCPI2_3) mul.d $t3, $t3, $a6 add.d $t3, $a1, $t3 vreplgr2vr.w $vr5, $t3 @@ -4250,45 +4238,44 @@ itrans: # @itrans slli.d $a3, $a3, 8 add.d $a3, $a0, $a3 slli.d $a4, $a4, 6 + add.d $a3, $a3, $a4 slli.d $a2, $a2, 5 add.d $a2, $a0, $a2 alsl.d $a1, $a1, $a2, 1 ld.d $a2, $a1, 104 - add.d $a3, $a3, $a4 ori $a4, $zero, 2408 vldx $vr0, $a3, $a4 vinsgr2vr.d $vr1, $a2, 0 - vrepli.b $vr2, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vadd.w $vr0, $vr0, $vr1 vmaxi.w $vr0, $vr0, 0 vreplgr2vr.w $vr1, $a5 vmin.w $vr0, $vr0, $vr1 ld.d $a2, $a1, 136 ori $a4, $zero, 2424 - vldx $vr3, $a3, $a4 + vldx $vr2, $a3, $a4 vst $vr0, $a0, 1384 vinsgr2vr.d $vr0, $a2, 0 - vilvl.h $vr0, $vr2, $vr0 - vadd.w $vr0, $vr3, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 + vadd.w $vr0, $vr2, $vr0 vmaxi.w $vr0, $vr0, 0 vmin.w $vr0, $vr0, $vr1 ld.d $a2, $a1, 168 ori $a4, $zero, 2440 - vldx $vr3, $a3, $a4 + vldx $vr2, $a3, $a4 vst $vr0, $a0, 1448 vinsgr2vr.d $vr0, $a2, 0 - vilvl.h $vr0, $vr2, $vr0 - vadd.w $vr0, $vr3, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 + vadd.w $vr0, $vr2, $vr0 vmaxi.w $vr0, $vr0, 0 vmin.w 
$vr0, $vr0, $vr1 ld.d $a1, $a1, 200 ori $a2, $zero, 2456 - vldx $vr3, $a3, $a2 + vldx $vr2, $a3, $a2 vst $vr0, $a0, 1512 vinsgr2vr.d $vr0, $a1, 0 - vilvl.h $vr0, $vr2, $vr0 - vadd.w $vr0, $vr3, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 + vadd.w $vr0, $vr2, $vr0 vmaxi.w $vr0, $vr0, 0 vmin.w $vr0, $vr0, $vr1 vst $vr0, $a0, 1576 diff --git a/results/MultiSource/Applications/JM/ldecod/CMakeFiles/ldecod.dir/erc_do_i.s b/results/MultiSource/Applications/JM/ldecod/CMakeFiles/ldecod.dir/erc_do_i.s index 0719706e..687861d7 100644 --- a/results/MultiSource/Applications/JM/ldecod/CMakeFiles/ldecod.dir/erc_do_i.s +++ b/results/MultiSource/Applications/JM/ldecod/CMakeFiles/ldecod.dir/erc_do_i.s @@ -1190,8 +1190,7 @@ ercPixConcealIMB: # @ercPixConcealIMB addi.w $t1, $a6, 0 addi.w $t2, $t3, 0 ori $t3, $zero, 16 - vrepli.b $vr1, 0 - vrepli.h $vr2, 255 + vrepli.h $vr1, 255 pcalau12i $t4, %got_pc_hi20(img) ld.d $t4, $t4, %got_pc_lo12(img) move $t5, $zero @@ -1227,28 +1226,30 @@ ercPixConcealIMB: # @ercPixConcealIMB # %bb.69: # %vector.ph # in Loop: Header=BB2_65 Depth=1 move $t7, $zero - vreplgr2vr.w $vr3, $t6 - vreplgr2vr.w $vr4, $t5 + vreplgr2vr.w $vr2, $t6 + vreplgr2vr.w $vr3, $t5 move $t6, $a2 .p2align 4, , 16 .LBB2_70: # %vector.body # Parent Loop BB2_65 Depth=1 # => This Inner Loop Header: Depth=2 - vldx $vr5, $a3, $t7 - vldx $vr6, $t0, $t7 - vilvl.h $vr7, $vr1, $vr5 - vilvh.h $vr5, $vr1, $vr5 - vilvh.h $vr8, $vr1, $vr6 - vilvl.h $vr6, $vr1, $vr6 - vmul.w $vr6, $vr4, $vr6 - vmul.w $vr8, $vr4, $vr8 - vmadd.w $vr8, $vr3, $vr5 - vmadd.w $vr6, $vr3, $vr7 - vdiv.w $vr5, $vr6, $vr0 - vdiv.w $vr6, $vr8, $vr0 - vpickev.h $vr5, $vr6, $vr5 - vand.v $vr5, $vr5, $vr2 - vstx $vr5, $a1, $t7 + vldx $vr4, $a3, $t7 + vldx $vr5, $t0, $t7 + vsllwil.wu.hu $vr6, $vr4, 0 + vbsrl.v $vr4, $vr4, 8 + vsllwil.wu.hu $vr4, $vr4, 0 + vbsrl.v $vr7, $vr5, 8 + vsllwil.wu.hu $vr7, $vr7, 0 + vsllwil.wu.hu $vr5, $vr5, 0 + vmul.w $vr5, $vr3, $vr5 + vmul.w $vr7, $vr3, $vr7 + vmadd.w $vr7, $vr2, $vr4 + vmadd.w $vr5, $vr2, $vr6 + vdiv.w $vr4, $vr5, $vr0 + vdiv.w $vr5, $vr7, $vr0 + vpickev.h $vr4, $vr5, $vr4 + vand.v $vr4, $vr4, $vr1 + vstx $vr4, $a1, $t7 addi.d $t6, $t6, -8 addi.d $t7, $t7, 16 bnez $t6, .LBB2_70 diff --git a/results/MultiSource/Applications/JM/ldecod/CMakeFiles/ldecod.dir/erc_do_p.s b/results/MultiSource/Applications/JM/ldecod/CMakeFiles/ldecod.dir/erc_do_p.s index 0c7e8171..bd6cc6a1 100644 --- a/results/MultiSource/Applications/JM/ldecod/CMakeFiles/ldecod.dir/erc_do_p.s +++ b/results/MultiSource/Applications/JM/ldecod/CMakeFiles/ldecod.dir/erc_do_p.s @@ -4535,7 +4535,6 @@ edgeDistortion: # @edgeDistortion ori $a7, $zero, 1 ori $s4, $zero, 16 ori $s5, $zero, 3 - vrepli.b $vr0, 0 b .LBB24_2 .p2align 4, , 16 .LBB24_1: # in Loop: Header=BB24_2 Depth=1 @@ -4555,25 +4554,27 @@ edgeDistortion: # @edgeDistortion b .LBB24_8 .LBB24_3: # %.preheader140 # in Loop: Header=BB24_8 Depth=2 - vld $vr1, $a2, 496 + vld $vr0, $a2, 496 ld.d $a5, $sp, 40 # 8-byte Folded Reload - vld $vr2, $a5, 16 - vld $vr3, $a2, 480 + vld $vr1, $a5, 16 + vld $vr2, $a2, 480 .LBB24_4: # %.loopexit # in Loop: Header=BB24_8 Depth=2 - vld $vr4, $a5, 0 - vabsd.hu $vr1, $vr1, $vr2 - vilvl.h $vr2, $vr0, $vr1 - vabsd.hu $vr3, $vr3, $vr4 - vilvl.h $vr4, $vr0, $vr3 - vilvh.h $vr1, $vr0, $vr1 - vilvh.h $vr3, $vr0, $vr3 + vld $vr3, $a5, 0 + vabsd.hu $vr0, $vr0, $vr1 + vbsrl.v $vr1, $vr0, 8 + vsllwil.wu.hu $vr1, $vr1, 0 + vabsd.hu $vr2, $vr2, $vr3 + vbsrl.v $vr3, $vr2, 8 + vsllwil.wu.hu $vr3, $vr3, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vadd.w $vr0, $vr2, 
$vr0 vadd.w $vr1, $vr3, $vr1 - vadd.w $vr2, $vr4, $vr2 - vadd.w $vr1, $vr2, $vr1 - vhaddw.d.w $vr1, $vr1, $vr1 - vhaddw.q.d $vr1, $vr1, $vr1 - vpickve2gr.d $a5, $vr1, 0 + vadd.w $vr0, $vr0, $vr1 + vhaddw.d.w $vr0, $vr0, $vr0 + vhaddw.q.d $vr0, $vr0, $vr0 + vpickve2gr.d $a5, $vr0, 0 .LBB24_5: # %.loopexit # in Loop: Header=BB24_8 Depth=2 add.w $a4, $a5, $a4 @@ -4600,10 +4601,10 @@ edgeDistortion: # @edgeDistortion jr $a5 .LBB24_11: # %.preheader # in Loop: Header=BB24_8 Depth=2 - vld $vr1, $a2, 16 + vld $vr0, $a2, 16 ld.d $a5, $sp, 32 # 8-byte Folded Reload - vld $vr2, $a5, 16 - vld $vr3, $a2, 0 + vld $vr1, $a5, 16 + vld $vr2, $a2, 0 b .LBB24_4 .LBB24_12: # %.preheader142 # in Loop: Header=BB24_8 Depth=2 diff --git a/results/MultiSource/Applications/JM/ldecod/CMakeFiles/ldecod.dir/image.s b/results/MultiSource/Applications/JM/ldecod/CMakeFiles/ldecod.dir/image.s index 263cadcf..681df6ea 100644 --- a/results/MultiSource/Applications/JM/ldecod/CMakeFiles/ldecod.dir/image.s +++ b/results/MultiSource/Applications/JM/ldecod/CMakeFiles/ldecod.dir/image.s @@ -1363,7 +1363,6 @@ buf2img: # @buf2img bstrpick.d $a2, $s1, 30, 4 slli.d $a2, $a2, 4 ori $a3, $zero, 16 - vrepli.b $vr0, 0 b .LBB4_17 .p2align 4, , 16 .LBB4_16: # %._crit_edge.us125 @@ -1419,12 +1418,12 @@ buf2img: # @buf2img # => This Inner Loop Header: Depth=2 ld.d $t1, $a7, -8 ld.d $t2, $a7, 0 - vinsgr2vr.d $vr1, $t1, 0 - vinsgr2vr.d $vr2, $t2, 0 - vilvl.b $vr1, $vr0, $vr1 - vilvl.b $vr2, $vr0, $vr2 - vst $vr1, $a6, -16 - vst $vr2, $a6, 0 + vinsgr2vr.d $vr0, $t1, 0 + vinsgr2vr.d $vr1, $t2, 0 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.hu.bu $vr1, $vr1, 0 + vst $vr0, $a6, -16 + vst $vr1, $a6, 0 addi.d $t0, $t0, -16 addi.d $a6, $a6, 32 addi.d $a7, $a7, 16 @@ -1601,7 +1600,6 @@ buf2img: # @buf2img sub.d $a7, $zero, $a5 ori $t0, $zero, 8 ori $t1, $zero, 16 - vrepli.b $vr0, 0 move $t2, $s2 b .LBB4_53 .p2align 4, , 16 @@ -1657,9 +1655,9 @@ buf2img: # @buf2img # Parent Loop BB4_53 Depth=1 # => This Inner Loop Header: Depth=2 ld.d $t7, $t6, 0 - vinsgr2vr.d $vr1, $t7, 0 - vilvl.b $vr1, $vr0, $vr1 - vst $vr1, $t5, 0 + vinsgr2vr.d $vr0, $t7, 0 + vsllwil.hu.bu $vr0, $vr0, 0 + vst $vr0, $t5, 0 addi.d $t4, $t4, 8 addi.d $t5, $t5, 16 addi.d $t6, $t6, 8 @@ -1680,12 +1678,12 @@ buf2img: # @buf2img # => This Inner Loop Header: Depth=2 ld.d $t7, $t5, -8 ld.d $t8, $t5, 0 - vinsgr2vr.d $vr1, $t7, 0 - vinsgr2vr.d $vr2, $t8, 0 - vilvl.b $vr1, $vr0, $vr1 - vilvl.b $vr2, $vr0, $vr2 - vst $vr1, $t4, -16 - vst $vr2, $t4, 0 + vinsgr2vr.d $vr0, $t7, 0 + vinsgr2vr.d $vr1, $t8, 0 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.hu.bu $vr1, $vr1, 0 + vst $vr0, $t4, -16 + vst $vr1, $t4, 0 addi.d $t6, $t6, -16 addi.d $t4, $t4, 32 addi.d $t5, $t5, 16 @@ -3794,8 +3792,7 @@ get_block: # @get_block vinsgr2vr.h $vr1, $t2, 1 vinsgr2vr.h $vr1, $a5, 2 vinsgr2vr.h $vr1, $t1, 3 - vrepli.b $vr2, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vadd.w $vr0, $vr0, $vr1 ld.d $a5, $sp, 40 # 8-byte Folded Reload ld.d $a5, $a5, 0 @@ -3811,7 +3808,7 @@ get_block: # @get_block vinsgr2vr.h $vr1, $t1, 1 vinsgr2vr.h $vr1, $t2, 2 vinsgr2vr.h $vr1, $a5, 3 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vadd.w $vr0, $vr0, $vr1 vaddi.wu $vr0, $vr0, 1 vsrai.w $vr0, $vr0, 1 @@ -3825,7 +3822,7 @@ get_block: # @get_block vinsgr2vr.h $vr1, $t0, 1 vinsgr2vr.h $vr1, $t1, 2 vinsgr2vr.h $vr1, $a3, 3 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vadd.w $vr0, $vr0, $vr1 ld.d $a3, $sp, 48 # 8-byte Folded Reload ld.d $a3, $a3, 0 @@ -3841,7 +3838,7 @@ get_block: # @get_block vinsgr2vr.h $vr1, 
$a4, 1 vinsgr2vr.h $vr1, $a5, 2 vinsgr2vr.h $vr1, $a6, 3 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vadd.w $vr0, $vr0, $vr1 vaddi.wu $vr0, $vr0, 1 vsrai.w $vr0, $vr0, 1 @@ -4081,14 +4078,13 @@ get_block: # @get_block ldx.h $t2, $a4, $t1 ldx.h $t6, $a4, $t3 ld.d $a4, $sp, 56 # 8-byte Folded Reload - vld $vr1, $a4, 0 - vinsgr2vr.h $vr2, $a5, 0 - vinsgr2vr.h $vr2, $a6, 1 - vinsgr2vr.h $vr2, $t2, 2 - vinsgr2vr.h $vr2, $t6, 3 - vrepli.b $vr0, 0 - vilvl.h $vr2, $vr0, $vr2 - vadd.w $vr1, $vr1, $vr2 + vld $vr0, $a4, 0 + vinsgr2vr.h $vr1, $a5, 0 + vinsgr2vr.h $vr1, $a6, 1 + vinsgr2vr.h $vr1, $t2, 2 + vinsgr2vr.h $vr1, $t6, 3 + vsllwil.wu.hu $vr1, $vr1, 0 + vadd.w $vr0, $vr0, $vr1 slt $a5, $t5, $a3 maskeqz $a6, $a3, $a5 masknez $a5, $t5, $a5 @@ -4100,20 +4096,20 @@ get_block: # @get_block or $a5, $a5, $a6 slli.d $a5, $a5, 3 ldx.d $a5, $a0, $a5 - vaddi.wu $vr1, $vr1, 1 - vsrai.w $vr1, $vr1, 1 - vst $vr1, $a4, 0 + vaddi.wu $vr0, $vr0, 1 + vsrai.w $vr0, $vr0, 1 + vst $vr0, $a4, 0 ldx.h $a6, $a5, $a7 ldx.h $t2, $a5, $t0 ldx.h $t5, $a5, $t1 ldx.h $a5, $a5, $t3 - vld $vr1, $a4, 16 - vinsgr2vr.h $vr2, $a6, 0 - vinsgr2vr.h $vr2, $t2, 1 - vinsgr2vr.h $vr2, $t5, 2 - vinsgr2vr.h $vr2, $a5, 3 - vilvl.h $vr2, $vr0, $vr2 - vadd.w $vr1, $vr1, $vr2 + vld $vr0, $a4, 16 + vinsgr2vr.h $vr1, $a6, 0 + vinsgr2vr.h $vr1, $t2, 1 + vinsgr2vr.h $vr1, $t5, 2 + vinsgr2vr.h $vr1, $a5, 3 + vsllwil.wu.hu $vr1, $vr1, 0 + vadd.w $vr0, $vr0, $vr1 slt $a5, $t4, $a3 maskeqz $a6, $a3, $a5 masknez $a5, $t4, $a5 @@ -4125,20 +4121,20 @@ get_block: # @get_block or $a5, $a5, $a6 slli.d $a5, $a5, 3 ldx.d $a5, $a0, $a5 - vaddi.wu $vr1, $vr1, 1 - vsrai.w $vr1, $vr1, 1 - vst $vr1, $a4, 16 + vaddi.wu $vr0, $vr0, 1 + vsrai.w $vr0, $vr0, 1 + vst $vr0, $a4, 16 ldx.h $a6, $a5, $a7 ldx.h $t2, $a5, $t0 ldx.h $t4, $a5, $t1 ldx.h $a5, $a5, $t3 - vld $vr1, $a4, 32 - vinsgr2vr.h $vr2, $a6, 0 - vinsgr2vr.h $vr2, $t2, 1 - vinsgr2vr.h $vr2, $t4, 2 - vinsgr2vr.h $vr2, $a5, 3 - vilvl.h $vr2, $vr0, $vr2 - vadd.w $vr1, $vr1, $vr2 + vld $vr0, $a4, 32 + vinsgr2vr.h $vr1, $a6, 0 + vinsgr2vr.h $vr1, $t2, 1 + vinsgr2vr.h $vr1, $t4, 2 + vinsgr2vr.h $vr1, $a5, 3 + vsllwil.wu.hu $vr1, $vr1, 0 + vadd.w $vr0, $vr0, $vr1 slt $a5, $a2, $a3 maskeqz $a3, $a3, $a5 masknez $a2, $a2, $a5 @@ -4150,20 +4146,20 @@ get_block: # @get_block or $a1, $a2, $a1 slli.d $a1, $a1, 3 ldx.d $a0, $a0, $a1 - vaddi.wu $vr1, $vr1, 1 - vsrai.w $vr1, $vr1, 1 - vst $vr1, $a4, 32 + vaddi.wu $vr0, $vr0, 1 + vsrai.w $vr0, $vr0, 1 + vst $vr0, $a4, 32 ldx.h $a1, $a0, $a7 ldx.h $a2, $a0, $t0 ldx.h $a3, $a0, $t1 ldx.h $a5, $a0, $t3 - vld $vr1, $a4, 48 - vinsgr2vr.h $vr2, $a1, 0 - vinsgr2vr.h $vr2, $a2, 1 - vinsgr2vr.h $vr2, $a3, 2 - vinsgr2vr.h $vr2, $a5, 3 - vilvl.h $vr0, $vr0, $vr2 - vadd.w $vr0, $vr1, $vr0 + vld $vr0, $a4, 48 + vinsgr2vr.h $vr1, $a1, 0 + vinsgr2vr.h $vr1, $a2, 1 + vinsgr2vr.h $vr1, $a3, 2 + vinsgr2vr.h $vr1, $a5, 3 + vsllwil.wu.hu $vr1, $vr1, 0 + vadd.w $vr0, $vr0, $vr1 vaddi.wu $vr0, $vr0, 1 vsrai.w $vr0, $vr0, 1 vst $vr0, $a4, 48 diff --git a/results/MultiSource/Applications/JM/ldecod/CMakeFiles/ldecod.dir/transform8x8.s b/results/MultiSource/Applications/JM/ldecod/CMakeFiles/ldecod.dir/transform8x8.s index b8890539..0ce8b92a 100644 --- a/results/MultiSource/Applications/JM/ldecod/CMakeFiles/ldecod.dir/transform8x8.s +++ b/results/MultiSource/Applications/JM/ldecod/CMakeFiles/ldecod.dir/transform8x8.s @@ -10,15 +10,6 @@ .half 12 # 0xc .half 13 # 0xd .half 14 # 0xe -.LCPI0_1: - .half 3 # 0x3 - .half 9 # 0x9 - .half 4 # 0x4 - .half 11 # 0xb - .half 5 # 0x5 - .half 13 # 0xd 
- .half 6 # 0x6 - .half 15 # 0xf .text .globl intrapred8x8 .p2align 5 @@ -564,9 +555,10 @@ intrapred8x8: # @intrapred8x8 b .LBB0_81 .LBB0_41: st.d $s7, $sp, 56 # 8-byte Folded Spill - st.d $s0, $sp, 24 # 8-byte Folded Spill - st.d $s8, $sp, 32 # 8-byte Folded Spill - st.d $s6, $sp, 40 # 8-byte Folded Spill + st.d $s0, $sp, 16 # 8-byte Folded Spill + st.d $s8, $sp, 24 # 8-byte Folded Spill + st.d $s6, $sp, 32 # 8-byte Folded Spill + st.d $t8, $sp, 40 # 8-byte Folded Spill st.d $s2, $sp, 48 # 8-byte Folded Spill beqz $s5, .LBB0_44 # %bb.42: @@ -578,10 +570,8 @@ intrapred8x8: # @intrapred8x8 pcalau12i $a0, %pc_hi20(.L.str.4) addi.d $a0, $a0, %pc_lo12(.L.str.4) move $s2, $ra - move $s3, $t8 pcaddu18i $ra, %call36(printf) jirl $ra, $ra, 0 - move $t8, $s3 move $ra, $s2 .LBB0_45: ld.hu $a1, $sp, 444 @@ -592,91 +582,89 @@ intrapred8x8: # @intrapred8x8 add.d $a3, $a4, $a3 alsl.d $a3, $a2, $a3, 1 srli.d $a5, $a3, 2 - addi.d $t4, $s1, 104 + addi.d $t3, $s1, 104 ld.d $a0, $sp, 96 # 8-byte Folded Reload slli.d $a3, $a0, 5 ld.hu $a6, $sp, 442 - add.d $t0, $t4, $a3 + add.d $t0, $t3, $a3 slli.d $t1, $fp, 1 stx.h $a5, $t0, $t1 addi.d $a5, $a6, 2 add.d $a2, $a5, $a2 alsl.d $a1, $a1, $a2, 1 srli.d $a1, $a1, 2 - st.d $ra, $sp, 16 # 8-byte Folded Spill + st.d $ra, $sp, 8 # 8-byte Folded Spill slli.d $s6, $ra, 1 stx.h $a1, $t0, $s6 ld.d $a0, $sp, 72 # 8-byte Folded Reload slli.d $a7, $a0, 5 - ld.hu $t5, $sp, 440 - add.d $t3, $t4, $a7 - stx.h $a1, $t3, $t1 + ld.hu $t4, $sp, 440 + add.d $t5, $t3, $a7 + stx.h $a1, $t5, $t1 alsl.d $a1, $a6, $a4, 1 - add.d $a1, $a1, $t5 + add.d $a1, $a1, $t4 srli.d $a1, $a1, 2 - ld.d $s7, $sp, 24 # 8-byte Folded Reload - slli.d $t7, $s7, 1 - stx.h $a1, $t0, $t7 - stx.h $a1, $t3, $s6 + ld.d $s7, $sp, 16 # 8-byte Folded Reload + slli.d $t8, $s7, 1 + stx.h $a1, $t0, $t8 + stx.h $a1, $t5, $s6 ld.d $a0, $sp, 112 # 8-byte Folded Reload slli.d $a4, $a0, 5 ld.hu $a6, $sp, 438 - add.d $t2, $t4, $a4 + add.d $t2, $t3, $a4 stx.h $a1, $t2, $t1 - alsl.d $a1, $t5, $a5, 1 + alsl.d $a1, $t4, $a5, 1 add.d $a1, $a1, $a6 srli.d $a4, $a1, 2 - ld.d $s8, $sp, 40 # 8-byte Folded Reload - slli.d $s0, $s8, 1 - stx.h $a4, $t0, $s0 - stx.h $a4, $t3, $t7 + ld.d $s8, $sp, 32 # 8-byte Folded Reload + slli.d $s1, $s8, 1 + stx.h $a4, $t0, $s1 + stx.h $a4, $t5, $t8 stx.h $a4, $t2, $s6 ld.d $a0, $sp, 80 # 8-byte Folded Reload slli.d $a1, $a0, 5 ld.hu $a7, $sp, 436 - add.d $s5, $t4, $a1 + add.d $s5, $t3, $a1 stx.h $a4, $s5, $t1 - alsl.d $a4, $a6, $t5, 1 + alsl.d $a4, $a6, $t4, 1 add.d $a4, $a4, $a7 addi.d $a4, $a4, 2 srli.d $a5, $a4, 2 - ld.d $ra, $sp, 32 # 8-byte Folded Reload - slli.d $s2, $ra, 1 - stx.h $a5, $t0, $s2 - stx.h $a5, $t3, $s0 - stx.h $a5, $t2, $t7 + ld.d $ra, $sp, 24 # 8-byte Folded Reload + slli.d $s3, $ra, 1 + stx.h $a5, $t0, $s3 + stx.h $a5, $t5, $s1 + stx.h $a5, $t2, $t8 stx.h $a5, $s5, $s6 ld.d $a0, $sp, 88 # 8-byte Folded Reload slli.d $a4, $a0, 5 - ld.hu $t5, $sp, 434 - add.d $a4, $t4, $a4 + ld.hu $t4, $sp, 434 + add.d $a4, $t3, $a4 stx.h $a5, $a4, $t1 alsl.d $a5, $a7, $a6, 1 - add.d $fp, $a5, $t5 + add.d $t7, $a5, $t4 ld.d $a3, $sp, 56 # 8-byte Folded Reload - slli.d $s1, $a3, 1 + slli.d $s0, $a3, 1 ld.d $a0, $sp, 64 # 8-byte Folded Reload slli.d $a5, $a0, 5 - add.d $a5, $t4, $a5 - alsl.d $t6, $t5, $a7, 1 - slli.d $s3, $t8, 1 + add.d $a5, $t3, $a5 + alsl.d $t6, $t4, $a7, 1 + ld.d $a2, $sp, 40 # 8-byte Folded Reload + slli.d $s2, $a2, 1 ld.d $a0, $sp, 104 # 8-byte Folded Reload slli.d $a6, $a0, 5 - add.d $a6, $t4, $a6 + add.d $a6, $t3, $a6 ld.d $a1, $sp, 48 # 8-byte Folded Reload alsl.d $a0, 
$a1, $t0, 1 st.d $a0, $sp, 112 # 8-byte Folded Spill ld.d $a0, $sp, 120 # 8-byte Folded Reload - move $a2, $t8 - slli.d $t8, $a0, 5 - add.d $t8, $t4, $t8 + slli.d $fp, $a0, 5 + add.d $fp, $t3, $fp ld.hu $s4, $sp, 400 - vinsgr2vr.h $vr0, $t5, 0 - pcalau12i $t4, %pc_hi20(.LCPI0_0) - vld $vr1, $t4, %pc_lo12(.LCPI0_0) - pcalau12i $t4, %pc_hi20(.LCPI0_1) - vld $vr2, $t4, %pc_lo12(.LCPI0_1) - alsl.d $a0, $a2, $t3, 1 + vinsgr2vr.h $vr0, $t4, 0 + pcalau12i $t3, %pc_hi20(.LCPI0_0) + vld $vr1, $t3, %pc_lo12(.LCPI0_0) + alsl.d $a0, $a2, $t5, 1 st.d $a0, $sp, 120 # 8-byte Folded Spill vinsgr2vr.h $vr0, $s4, 1 add.d $t6, $t6, $s4 @@ -684,59 +672,61 @@ intrapred8x8: # @intrapred8x8 st.d $a0, $sp, 104 # 8-byte Folded Spill addi.d $t6, $t6, 2 srli.d $s4, $t6, 2 - stx.h $s4, $t0, $s3 + stx.h $s4, $t0, $s2 alsl.d $t6, $ra, $s5, 1 - addi.d $fp, $fp, 2 - srli.d $s3, $fp, 2 - stx.h $s3, $t0, $s1 - alsl.d $t5, $s8, $a4, 1 - stx.h $s4, $t3, $s1 - alsl.d $fp, $s7, $a5, 1 - ld.d $a0, $sp, 16 # 8-byte Folded Reload - alsl.d $s1, $a0, $a6, 1 - stx.h $s3, $t3, $s2 - alsl.d $t4, $a1, $t3, 1 - stx.h $s3, $t2, $s0 - stx.h $s3, $s5, $t7 - stx.h $s3, $a4, $s6 - stx.h $s3, $a5, $t1 - alsl.d $s3, $a2, $t2, 1 - stx.h $s4, $t2, $s2 - alsl.d $s2, $a3, $s5, 1 - stx.h $s4, $s5, $s0 - alsl.d $t3, $ra, $a4, 1 - stx.h $s4, $a4, $t7 - alsl.d $t7, $s8, $a5, 1 + addi.d $t7, $t7, 2 + srli.d $s2, $t7, 2 + stx.h $s2, $t0, $s0 + alsl.d $t4, $s8, $a4, 1 + stx.h $s4, $t5, $s0 + alsl.d $t7, $s7, $a5, 1 + ld.d $a0, $sp, 8 # 8-byte Folded Reload + alsl.d $t3, $a0, $a6, 1 + stx.h $s2, $t5, $s3 + alsl.d $t5, $a1, $t5, 1 + stx.h $s2, $t2, $s1 + stx.h $s2, $s5, $t8 + stx.h $s2, $a4, $s6 + stx.h $s2, $a5, $t1 + alsl.d $s2, $a2, $t2, 1 + stx.h $s4, $t2, $s3 + alsl.d $s3, $a3, $s5, 1 + stx.h $s4, $s5, $s1 + alsl.d $s1, $ra, $a4, 1 + stx.h $s4, $a4, $t8 + alsl.d $t8, $s8, $a5, 1 alsl.d $s0, $s7, $a6, 1 alsl.d $t2, $a1, $t2, 1 stx.h $s4, $a5, $s6 alsl.d $t0, $a2, $s5, 1 - vld $vr3, $sp, 400 + vld $vr2, $sp, 400 stx.h $s4, $a6, $t1 alsl.d $s4, $a3, $a4, 1 - vld $vr4, $sp, 402 - vshuf.h $vr1, $vr3, $vr0 - vrepli.b $vr0, 0 - vshuf.h $vr2, $vr0, $vr3 - vilvl.h $vr1, $vr0, $vr1 - vilvl.h $vr5, $vr0, $vr3 - vilvh.h $vr3, $vr0, $vr3 - vilvl.h $vr6, $vr0, $vr4 - vilvh.h $vr0, $vr0, $vr4 - vslli.w $vr3, $vr3, 1 - vslli.w $vr4, $vr5, 1 - vadd.w $vr1, $vr1, $vr4 - vadd.w $vr2, $vr2, $vr3 - vadd.w $vr0, $vr2, $vr0 - vadd.w $vr1, $vr1, $vr6 - vaddi.wu $vr1, $vr1, 2 + vld $vr3, $sp, 402 + vshuf.h $vr1, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr1, 0 + vbsrl.v $vr1, $vr2, 6 + vsllwil.wu.hu $vr1, $vr1, 0 + vsllwil.wu.hu $vr4, $vr2, 0 + vbsrl.v $vr2, $vr2, 8 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.wu.hu $vr5, $vr3, 0 + vbsrl.v $vr3, $vr3, 8 + vsllwil.wu.hu $vr3, $vr3, 0 + vslli.w $vr2, $vr2, 1 + vslli.w $vr4, $vr4, 1 + vadd.w $vr0, $vr0, $vr4 + vadd.w $vr1, $vr1, $vr2 + vadd.w $vr1, $vr1, $vr3 + vadd.w $vr0, $vr0, $vr5 vaddi.wu $vr0, $vr0, 2 - vsrli.w $vr0, $vr0, 2 + vaddi.wu $vr1, $vr1, 2 vsrli.w $vr1, $vr1, 2 - vpickev.h $vr2, $vr0, $vr1 - vstx $vr2, $t8, $t1 + vsrli.w $vr0, $vr0, 2 + vpickev.h $vr2, $vr1, $vr0 + vstx $vr2, $fp, $t1 alsl.d $t1, $ra, $a5, 1 - alsl.d $t8, $s8, $a6, 1 + alsl.d $fp, $s8, $a6, 1 alsl.d $a7, $a1, $s5, 1 alsl.d $s5, $a2, $a4, 1 alsl.d $s6, $ra, $a6, 1 @@ -748,37 +738,37 @@ intrapred8x8: # @intrapred8x8 alsl.d $a0, $a2, $a6, 1 alsl.d $a6, $a1, $a6, 1 ld.d $a1, $sp, 112 # 8-byte Folded Reload - vstelm.h $vr1, $a1, 0, 0 + vstelm.h $vr0, $a1, 0, 0 ld.d $a1, $sp, 120 # 8-byte Folded Reload - vstelm.h $vr1, $a1, 0, 0 + vstelm.h $vr0, $a1, 0, 0 ld.d $a1, 
$sp, 104 # 8-byte Folded Reload - vstelm.h $vr1, $a1, 0, 0 - vstelm.h $vr1, $t6, 0, 0 - vstelm.h $vr1, $t5, 0, 0 - vstelm.h $vr1, $fp, 0, 0 - vstelm.h $vr1, $s1, 0, 0 - vstelm.h $vr1, $t4, 0, 2 - vstelm.h $vr1, $s3, 0, 2 - vstelm.h $vr1, $s2, 0, 2 - vstelm.h $vr1, $t3, 0, 2 - vstelm.h $vr1, $t7, 0, 2 - vstelm.h $vr1, $s0, 0, 2 - vstelm.h $vr1, $t2, 0, 4 - vstelm.h $vr1, $t0, 0, 4 - vstelm.h $vr1, $s4, 0, 4 + vstelm.h $vr0, $a1, 0, 0 + vstelm.h $vr0, $t6, 0, 0 + vstelm.h $vr0, $t4, 0, 0 + vstelm.h $vr0, $t7, 0, 0 + vstelm.h $vr0, $t3, 0, 0 + vstelm.h $vr0, $t5, 0, 2 + vstelm.h $vr0, $s2, 0, 2 + vstelm.h $vr0, $s3, 0, 2 + vstelm.h $vr0, $s1, 0, 2 + vstelm.h $vr0, $t8, 0, 2 + vstelm.h $vr0, $s0, 0, 2 + vstelm.h $vr0, $t2, 0, 4 + vstelm.h $vr0, $t0, 0, 4 + vstelm.h $vr0, $s4, 0, 4 ld.d $s4, $sp, 128 # 8-byte Folded Reload - vstelm.h $vr1, $t1, 0, 4 - vstelm.h $vr1, $t8, 0, 4 - vstelm.h $vr1, $a7, 0, 6 - vstelm.h $vr1, $s5, 0, 6 - vstelm.h $vr1, $s8, 0, 6 - vstelm.h $vr1, $s6, 0, 6 - vstelm.h $vr0, $a4, 0, 0 - vstelm.h $vr0, $s7, 0, 0 - vstelm.h $vr0, $ra, 0, 0 - vstelm.h $vr0, $a5, 0, 2 - vstelm.h $vr0, $a0, 0, 2 - vstelm.h $vr0, $a6, 0, 4 + vstelm.h $vr0, $t1, 0, 4 + vstelm.h $vr0, $fp, 0, 4 + vstelm.h $vr0, $a7, 0, 6 + vstelm.h $vr0, $s5, 0, 6 + vstelm.h $vr0, $s8, 0, 6 + vstelm.h $vr0, $s6, 0, 6 + vstelm.h $vr1, $a4, 0, 0 + vstelm.h $vr1, $s7, 0, 0 + vstelm.h $vr1, $ra, 0, 0 + vstelm.h $vr1, $a5, 0, 2 + vstelm.h $vr1, $a0, 0, 2 + vstelm.h $vr1, $a6, 0, 4 b .LBB0_81 .LBB0_46: beqz $s4, .LBB0_72 @@ -839,122 +829,124 @@ intrapred8x8: # @intrapred8x8 ld.d $a0, $sp, 120 # 8-byte Folded Reload slli.d $a1, $a0, 5 add.d $a4, $a7, $a1 - slli.d $t3, $fp, 1 - slli.d $a6, $ra, 1 + slli.d $t4, $fp, 1 + slli.d $a5, $ra, 1 ld.d $a0, $sp, 104 # 8-byte Folded Reload slli.d $a1, $a0, 5 - add.d $a5, $a7, $a1 + add.d $a6, $a7, $a1 ld.d $a0, $sp, 64 # 8-byte Folded Reload slli.d $a1, $a0, 5 - add.d $t4, $a7, $a1 - slli.d $t2, $s6, 1 + add.d $t5, $a7, $a1 vld $vr0, $sp, 406 + slli.d $t2, $s6, 1 ld.d $a0, $sp, 88 # 8-byte Folded Reload slli.d $a1, $a0, 5 add.d $a1, $a7, $a1 - vrepli.b $vr1, 0 - vilvl.h $vr2, $vr1, $vr0 - vaddi.wu $vr2, $vr2, 2 - vpickve2gr.w $t0, $vr2, 0 + vsllwil.wu.hu $vr1, $vr0, 0 + vaddi.wu $vr1, $vr1, 2 + vpickve2gr.w $t0, $vr1, 0 add.d $a2, $t0, $a2 ld.hu $t0, $sp, 406 alsl.d $a2, $a3, $a2, 1 srli.d $a2, $a2, 2 - stx.h $a2, $a4, $t3 - vpickve2gr.w $a2, $vr2, 1 + stx.h $a2, $a4, $t4 + vpickve2gr.w $a2, $vr1, 1 add.d $a3, $a2, $a3 - slli.d $t5, $s8, 1 + slli.d $t6, $s8, 1 ld.d $a0, $sp, 80 # 8-byte Folded Reload slli.d $a2, $a0, 5 add.d $a2, $a7, $a2 alsl.d $t0, $t0, $a3, 1 - slli.d $t6, $s7, 1 + slli.d $t3, $s7, 1 ld.d $a0, $sp, 112 # 8-byte Folded Reload slli.d $a3, $a0, 5 add.d $a3, $a7, $a3 bstrpick.d $t0, $t0, 18, 2 - stx.h $t0, $a4, $a6 + stx.h $t0, $a4, $a5 slli.d $t1, $t8, 1 ld.d $a0, $sp, 72 # 8-byte Folded Reload - slli.d $a6, $a0, 5 - add.d $a6, $a7, $a6 - stx.h $t0, $a5, $t3 + slli.d $a5, $a0, 5 + add.d $a5, $a7, $a5 + stx.h $t0, $a6, $t4 slli.d $t0, $s2, 1 - vld $vr3, $sp, 408 ld.d $a0, $sp, 96 # 8-byte Folded Reload slli.d $t7, $a0, 5 - vld $vr4, $sp, 410 - vilvh.h $vr0, $vr1, $vr0 - vilvl.h $vr5, $vr1, $vr3 - vilvh.h $vr3, $vr1, $vr3 - vilvl.h $vr6, $vr1, $vr4 - vilvh.h $vr4, $vr1, $vr4 - vslli.w $vr1, $vr3, 1 - vadd.w $vr0, $vr0, $vr1 - vslli.w $vr1, $vr5, 1 - vadd.w $vr1, $vr2, $vr1 - vadd.w $vr0, $vr0, $vr4 + vld $vr2, $sp, 408 + vld $vr3, $sp, 410 + vbsrl.v $vr0, $vr0, 8 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.wu.hu $vr4, $vr2, 0 + vbsrl.v $vr2, $vr2, 8 + vsllwil.wu.hu 
$vr2, $vr2, 0 + vsllwil.wu.hu $vr5, $vr3, 0 + vbsrl.v $vr3, $vr3, 8 + vsllwil.wu.hu $vr3, $vr3, 0 + vslli.w $vr2, $vr2, 1 + vadd.w $vr0, $vr0, $vr2 + vslli.w $vr2, $vr4, 1 + vadd.w $vr1, $vr1, $vr2 + vadd.w $vr0, $vr0, $vr3 vaddi.wu $vr0, $vr0, 2 - vadd.w $vr1, $vr1, $vr6 + vadd.w $vr1, $vr1, $vr5 vsrli.w $vr1, $vr1, 2 vsrli.w $vr0, $vr0, 2 vpickev.h $vr2, $vr0, $vr1 - vstx $vr2, $t4, $t3 - ld.hu $t3, $sp, 422 - ld.hu $t4, $sp, 426 + vstx $vr2, $t5, $t4 + ld.hu $t4, $sp, 422 + ld.hu $t5, $sp, 426 add.d $a7, $a7, $t7 - vpickve2gr.w $t7, $vr4, 3 - alsl.d $t3, $t7, $t3, 1 - add.d $t3, $t3, $t4 - addi.d $t3, $t3, 2 - bstrpick.d $t3, $t3, 19, 2 - stx.h $t3, $a1, $t0 - stx.h $t3, $a2, $t1 - stx.h $t3, $a7, $t2 + vpickve2gr.w $t7, $vr3, 3 + alsl.d $t4, $t7, $t4, 1 + add.d $t4, $t4, $t5 + addi.d $t4, $t4, 2 + bstrpick.d $t4, $t4, 19, 2 + stx.h $t4, $a1, $t0 + stx.h $t4, $a2, $t1 + stx.h $t4, $a7, $t2 ld.hu $t2, $sp, 428 - stx.h $t3, $a3, $t6 - stx.h $t3, $a6, $t5 - alsl.d $t3, $t4, $t7, 1 - add.d $t3, $t3, $t2 - addi.d $t3, $t3, 2 - bstrpick.d $t3, $t3, 19, 2 - stx.h $t3, $a2, $t0 - stx.h $t3, $a7, $t5 - ld.hu $t5, $sp, 430 - stx.h $t3, $a3, $t1 - stx.h $t3, $a6, $t6 - alsl.d $t3, $t2, $t4, 1 - add.d $t3, $t3, $t5 - addi.d $t3, $t3, 2 - srli.d $t3, $t3, 2 - stx.h $t3, $a7, $t6 - ld.hu $t4, $sp, 432 - stx.h $t3, $a3, $t0 - stx.h $t3, $a6, $t1 - alsl.d $t2, $t5, $t2, 1 - add.d $t2, $t2, $t4 + stx.h $t4, $a3, $t3 + stx.h $t4, $a5, $t6 + alsl.d $t4, $t5, $t7, 1 + add.d $t4, $t4, $t2 + addi.d $t4, $t4, 2 + bstrpick.d $t4, $t4, 19, 2 + stx.h $t4, $a2, $t0 + stx.h $t4, $a7, $t6 + ld.hu $t6, $sp, 430 + stx.h $t4, $a3, $t1 + stx.h $t4, $a5, $t3 + alsl.d $t4, $t2, $t5, 1 + add.d $t4, $t4, $t6 + addi.d $t4, $t4, 2 + srli.d $t4, $t4, 2 + stx.h $t4, $a7, $t3 + ld.hu $t3, $sp, 432 + stx.h $t4, $a3, $t0 + stx.h $t4, $a5, $t1 + alsl.d $t2, $t6, $t2, 1 + add.d $t2, $t2, $t3 addi.d $t2, $t2, 2 srli.d $t2, $t2, 2 stx.h $t2, $a7, $t1 - stx.h $t2, $a6, $t0 - alsl.d $t1, $t4, $t4, 1 - add.d $t1, $t5, $t1 + stx.h $t2, $a5, $t0 + alsl.d $t1, $t3, $t3, 1 + add.d $t1, $t6, $t1 addi.d $t1, $t1, 2 srli.d $t1, $t1, 2 stx.h $t1, $a7, $t0 alsl.d $t0, $s0, $a4, 1 vstelm.h $vr1, $t0, 0, 0 - alsl.d $t0, $ra, $a5, 1 + alsl.d $t0, $ra, $a6, 1 vstelm.h $vr1, $t0, 0, 0 alsl.d $t0, $s6, $a4, 1 vstelm.h $vr1, $t0, 0, 2 - alsl.d $t0, $s0, $a5, 1 + alsl.d $t0, $s0, $a6, 1 vstelm.h $vr1, $t0, 0, 2 alsl.d $t0, $fp, $a1, 1 vstelm.h $vr1, $t0, 0, 2 alsl.d $t0, $s8, $a4, 1 vstelm.h $vr1, $t0, 0, 4 - alsl.d $t0, $s6, $a5, 1 + alsl.d $t0, $s6, $a6, 1 vstelm.h $vr1, $t0, 0, 4 alsl.d $t0, $ra, $a1, 1 vstelm.h $vr1, $t0, 0, 4 @@ -962,7 +954,7 @@ intrapred8x8: # @intrapred8x8 vstelm.h $vr1, $t0, 0, 4 alsl.d $t0, $s7, $a4, 1 vstelm.h $vr1, $t0, 0, 6 - alsl.d $t0, $s8, $a5, 1 + alsl.d $t0, $s8, $a6, 1 vstelm.h $vr1, $t0, 0, 6 alsl.d $t0, $s0, $a1, 1 vstelm.h $vr1, $t0, 0, 6 @@ -972,7 +964,7 @@ intrapred8x8: # @intrapred8x8 vstelm.h $vr1, $t0, 0, 6 alsl.d $t0, $t8, $a4, 1 vstelm.h $vr0, $t0, 0, 0 - alsl.d $t0, $s7, $a5, 1 + alsl.d $t0, $s7, $a6, 1 vstelm.h $vr0, $t0, 0, 0 alsl.d $t0, $s6, $a1, 1 vstelm.h $vr0, $t0, 0, 0 @@ -980,23 +972,23 @@ intrapred8x8: # @intrapred8x8 vstelm.h $vr0, $t0, 0, 0 alsl.d $t0, $ra, $a3, 1 vstelm.h $vr0, $t0, 0, 0 - alsl.d $t0, $fp, $a6, 1 + alsl.d $t0, $fp, $a5, 1 vstelm.h $vr0, $t0, 0, 0 alsl.d $a4, $s2, $a4, 1 vstelm.h $vr0, $a4, 0, 2 - alsl.d $a4, $s2, $a5, 1 - alsl.d $a5, $t8, $a5, 1 - vstelm.h $vr0, $a5, 0, 2 - alsl.d $a5, $s8, $a1, 1 - vstelm.h $vr0, $a5, 0, 2 - alsl.d $a5, $s6, $a2, 1 - vstelm.h $vr0, $a5, 0, 2 - 
alsl.d $a5, $s0, $a3, 1 - vstelm.h $vr0, $a5, 0, 2 - alsl.d $a5, $ra, $a6, 1 - vstelm.h $vr0, $a5, 0, 2 - alsl.d $a5, $fp, $a7, 1 - vstelm.h $vr0, $a5, 0, 2 + alsl.d $a4, $s2, $a6, 1 + alsl.d $a6, $t8, $a6, 1 + vstelm.h $vr0, $a6, 0, 2 + alsl.d $a6, $s8, $a1, 1 + vstelm.h $vr0, $a6, 0, 2 + alsl.d $a6, $s6, $a2, 1 + vstelm.h $vr0, $a6, 0, 2 + alsl.d $a6, $s0, $a3, 1 + vstelm.h $vr0, $a6, 0, 2 + alsl.d $a6, $ra, $a5, 1 + vstelm.h $vr0, $a6, 0, 2 + alsl.d $a6, $fp, $a7, 1 + vstelm.h $vr0, $a6, 0, 2 vstelm.h $vr0, $a4, 0, 4 alsl.d $a4, $s7, $a1, 1 vstelm.h $vr0, $a4, 0, 4 @@ -1004,7 +996,7 @@ intrapred8x8: # @intrapred8x8 vstelm.h $vr0, $a4, 0, 4 alsl.d $a4, $s6, $a3, 1 vstelm.h $vr0, $a4, 0, 4 - alsl.d $a4, $s0, $a6, 1 + alsl.d $a4, $s0, $a5, 1 vstelm.h $vr0, $a4, 0, 4 alsl.d $a4, $ra, $a7, 1 vstelm.h $vr0, $a4, 0, 4 @@ -1014,7 +1006,7 @@ intrapred8x8: # @intrapred8x8 vstelm.h $vr0, $a1, 0, 6 alsl.d $a1, $s8, $a3, 1 vstelm.h $vr0, $a1, 0, 6 - alsl.d $a1, $s6, $a6, 1 + alsl.d $a1, $s6, $a5, 1 vstelm.h $vr0, $a1, 0, 6 alsl.d $a1, $s0, $a7, 1 vstelm.h $vr0, $a1, 0, 6 @@ -1039,26 +1031,25 @@ intrapred8x8: # @intrapred8x8 slli.d $a1, $a0, 5 add.d $a1, $a7, $a1 slli.d $t0, $fp, 1 + vld $vr2, $sp, 404 ld.d $a0, $sp, 64 # 8-byte Folded Reload slli.d $a3, $a0, 5 - vld $vr3, $sp, 404 add.d $a3, $a7, $a3 vld $vr1, $sp, 406 - vrepli.b $vr2, 0 - vilvl.h $vr4, $vr2, $vr3 - vpickve2gr.w $a4, $vr4, 0 + vsllwil.wu.hu $vr3, $vr2, 0 + vpickve2gr.w $a4, $vr3, 0 bstrpick.d $a5, $a4, 15, 0 or $a6, $a5, $a2 xor $a5, $a5, $a2 srli.d $a5, $a5, 1 sub.d $a5, $a6, $a5 stx.h $a5, $a1, $t0 - vor.v $vr0, $vr3, $vr1 - vxor.v $vr5, $vr3, $vr1 - vsrli.h $vr5, $vr5, 1 - vsub.h $vr0, $vr0, $vr5 + vor.v $vr0, $vr2, $vr1 + vxor.v $vr4, $vr2, $vr1 + vsrli.h $vr4, $vr4, 1 + vsub.h $vr0, $vr0, $vr4 vstx $vr0, $a3, $t0 - vpickve2gr.w $a3, $vr4, 1 + vpickve2gr.w $a3, $vr3, 1 addi.d $a6, $a3, 2 add.d $a2, $a6, $a2 alsl.d $a2, $a4, $a2, 1 @@ -1067,7 +1058,7 @@ intrapred8x8: # @intrapred8x8 slli.d $a5, $a0, 5 add.d $a5, $a7, $a5 stx.h $a2, $a5, $t0 - vpickve2gr.w $a2, $vr4, 2 + vpickve2gr.w $a2, $vr3, 2 addi.d $t1, $a2, 2 add.d $a4, $t1, $a4 alsl.d $a3, $a3, $a4, 1 @@ -1080,7 +1071,7 @@ intrapred8x8: # @intrapred8x8 stx.h $a3, $a5, $t2 slli.d $t3, $s0, 1 alsl.d $a2, $a2, $a6, 1 - vpickve2gr.w $t4, $vr4, 3 + vpickve2gr.w $t4, $vr3, 3 add.d $a2, $a2, $t4 bstrpick.d $a2, $a2, 18, 2 ld.d $a0, $sp, 112 # 8-byte Folded Reload @@ -1095,12 +1086,13 @@ intrapred8x8: # @intrapred8x8 ld.d $a0, $sp, 72 # 8-byte Folded Reload slli.d $a3, $a0, 5 add.d $a3, $a7, $a3 - vilvh.h $vr3, $vr2, $vr3 + vbsrl.v $vr2, $vr2, 8 + vsllwil.wu.hu $vr2, $vr2, 0 alsl.d $t1, $t4, $t1, 1 ld.d $a0, $sp, 96 # 8-byte Folded Reload slli.d $t5, $a0, 5 add.d $a7, $a7, $t5 - vpickve2gr.w $t5, $vr3, 0 + vpickve2gr.w $t5, $vr2, 0 add.d $t1, $t1, $t5 bstrpick.d $t1, $t1, 18, 2 stx.h $t1, $a7, $t0 @@ -1109,7 +1101,7 @@ intrapred8x8: # @intrapred8x8 stx.h $t1, $a4, $t3 stx.h $t1, $a5, $t0 alsl.d $t1, $t5, $t4, 1 - vpickve2gr.w $t4, $vr3, 1 + vpickve2gr.w $t4, $vr2, 1 add.d $t1, $t1, $t4 addi.d $t1, $t1, 2 bstrpick.d $t1, $t1, 19, 2 @@ -1119,7 +1111,7 @@ intrapred8x8: # @intrapred8x8 stx.h $t1, $a4, $t0 stx.h $t1, $a5, $t2 alsl.d $t1, $t4, $t5, 1 - vpickve2gr.w $t5, $vr3, 2 + vpickve2gr.w $t5, $vr2, 2 add.d $t1, $t1, $t5 addi.d $t1, $t1, 2 bstrpick.d $t1, $t1, 19, 2 @@ -1129,14 +1121,14 @@ intrapred8x8: # @intrapred8x8 stx.h $t1, $a4, $t2 stx.h $t1, $a5, $t3 alsl.d $t1, $t5, $t4, 1 - vpickve2gr.w $t4, $vr3, 3 + vpickve2gr.w $t4, $vr2, 3 add.d $t1, $t1, $t4 addi.d $t1, $t1, 2 
bstrpick.d $t1, $t1, 19, 2 stx.h $t1, $a7, $t0 slli.d $t0, $t8, 1 vreplvei.h $vr1, $vr1, 7 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 stx.h $t1, $a6, $t2 stx.h $t1, $a4, $t3 stx.h $t1, $a5, $t0 @@ -1990,35 +1982,35 @@ LowPassForIntra8x8Pred: # @LowPassForIntra8x8Pred add.d $t5, $t3, $t5 vinsgr2vr.w $vr1, $t2, 0 srli.d $t2, $t5, 2 - vrepli.b $vr2, 0 - vilvh.h $vr3, $vr2, $vr0 - vilvl.h $vr0, $vr2, $vr0 - vinsgr2vr.w $vr2, $t4, 0 + vbsrl.v $vr2, $vr0, 8 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vinsgr2vr.w $vr3, $t4, 0 pcalau12i $t4, %pc_hi20(.LCPI1_0) vld $vr4, $t4, %pc_lo12(.LCPI1_0) vbsrl.v $vr5, $vr0, 12 - vbsll.v $vr6, $vr3, 4 + vbsll.v $vr6, $vr2, 4 vor.v $vr5, $vr6, $vr5 - vshuf.w $vr4, $vr0, $vr2 - vslli.w $vr2, $vr4, 1 + vshuf.w $vr4, $vr0, $vr3 + vslli.w $vr3, $vr4, 1 vslli.w $vr4, $vr5, 1 vinsgr2vr.w $vr1, $t3, 1 vpackev.d $vr1, $vr0, $vr1 vbsrl.v $vr5, $vr0, 8 - vbsll.v $vr6, $vr3, 8 + vbsll.v $vr6, $vr2, 8 vor.v $vr5, $vr6, $vr5 vadd.w $vr4, $vr5, $vr4 - vadd.w $vr1, $vr1, $vr2 + vadd.w $vr1, $vr1, $vr3 pcalau12i $t3, %pc_hi20(.LCPI1_1) - vld $vr2, $t3, %pc_lo12(.LCPI1_1) + vld $vr3, $t3, %pc_lo12(.LCPI1_1) vadd.w $vr0, $vr1, $vr0 - vadd.w $vr1, $vr4, $vr3 + vadd.w $vr1, $vr4, $vr2 vaddi.wu $vr1, $vr1, 2 - vadd.w $vr0, $vr0, $vr2 + vadd.w $vr0, $vr0, $vr3 vsrli.w $vr0, $vr0, 2 vsrli.w $vr1, $vr1, 2 vpickev.h $vr0, $vr1, $vr0 - vpickve2gr.w $t3, $vr3, 2 + vpickve2gr.w $t3, $vr2, 2 alsl.d $t3, $t1, $t3, 1 add.d $t3, $t3, $t0 addi.d $t3, $t3, 2 diff --git a/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/block.s b/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/block.s index 23632228..d8c9f3a7 100644 --- a/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/block.s +++ b/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/block.s @@ -1998,7 +1998,6 @@ dct_luma_16x16: # @dct_luma_16x16 addi.d $s7, $a6, %pc_lo12(dct_luma_16x16.M0) move $a6, $zero slli.d $a7, $a1, 1 - vrepli.b $vr0, 0 ori $t0, $zero, 16 .p2align 4, , 16 .LBB2_3: # =>This Inner Loop Header: Depth=1 @@ -2009,32 +2008,36 @@ dct_luma_16x16: # @dct_luma_16x16 slli.d $t5, $t5, 8 add.d $t5, $s7, $t5 alsl.d $t6, $t1, $t5, 4 - vldx $vr1, $t2, $a7 + vldx $vr0, $t2, $a7 slli.d $t1, $t1, 4 - vld $vr2, $a4, -16 alsl.d $t2, $a1, $t2, 1 - vilvl.h $vr3, $vr0, $vr1 - vilvh.h $vr1, $vr0, $vr1 - vilvl.h $vr4, $vr0, $vr2 - vilvh.h $vr2, $vr0, $vr2 - vsub.w $vr1, $vr1, $vr2 - vsub.w $vr2, $vr3, $vr4 - vstx $vr2, $t5, $t1 - vst $vr1, $a5, -16 - vld $vr3, $t2, 16 - vst $vr2, $a5, -32 - vld $vr2, $a4, 0 - vst $vr1, $t6, 64 - vilvl.h $vr1, $vr0, $vr3 - vilvh.h $vr3, $vr0, $vr3 - vilvl.h $vr4, $vr0, $vr2 - vilvh.h $vr2, $vr0, $vr2 - vsub.w $vr2, $vr3, $vr2 - vsub.w $vr1, $vr1, $vr4 + vld $vr1, $a4, -16 + vsllwil.wu.hu $vr2, $vr0, 0 + vbsrl.v $vr0, $vr0, 8 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.wu.hu $vr3, $vr1, 0 + vbsrl.v $vr1, $vr1, 8 + vsllwil.wu.hu $vr1, $vr1, 0 + vsub.w $vr0, $vr0, $vr1 + vsub.w $vr1, $vr2, $vr3 + vstx $vr1, $t5, $t1 + vst $vr1, $a5, -32 + vld $vr1, $t2, 16 + vst $vr0, $a5, -16 + vst $vr0, $t6, 64 + vld $vr0, $a4, 0 + vsllwil.wu.hu $vr2, $vr1, 0 + vbsrl.v $vr1, $vr1, 8 + vsllwil.wu.hu $vr1, $vr1, 0 + vsllwil.wu.hu $vr3, $vr0, 0 + vbsrl.v $vr0, $vr0, 8 + vsllwil.wu.hu $vr0, $vr0, 0 + vsub.w $vr0, $vr1, $vr0 + vsub.w $vr1, $vr2, $vr3 vst $vr1, $t6, 128 - vst $vr2, $a5, 16 vst $vr1, $a5, 0 - vst $vr2, $t6, 192 + vst $vr0, $a5, 16 + vst $vr0, $t6, 192 addi.d $a6, $a6, 1 addi.d $a3, $a3, 8 addi.d $a4, $a4, 32 @@ -8580,45 +8583,44 @@ 
dct_luma_sp: # @dct_luma_sp ld.d $a1, $a1, 0 ld.w $a3, $s1, 20 ldx.w $a0, $a1, $a0 - slli.d $a3, $a3, 2 ld.d $a4, $s2, 0 + slli.d $a3, $a3, 2 ldx.w $s5, $a2, $a3 - ldx.w $a1, $a1, $a3 st.d $s0, $sp, 208 # 8-byte Folded Spill slli.d $a2, $s0, 5 add.d $a2, $a4, $a2 st.d $fp, $sp, 216 # 8-byte Folded Spill - alsl.d $a3, $fp, $a2, 1 - ldptr.d $a5, $a3, 12624 + alsl.d $a5, $fp, $a2, 1 + ldptr.d $a6, $a5, 12624 lu12i.w $a2, 3 ori $t4, $a2, 848 vldx $vr1, $a4, $t4 - vinsgr2vr.d $vr2, $a5, 0 - vrepli.b $vr5, 0 - vilvl.h $vr4, $vr5, $vr2 + ldx.w $a1, $a1, $a3 + vinsgr2vr.d $vr2, $a6, 0 + vsllwil.wu.hu $vr4, $vr2, 0 vadd.w $vr1, $vr1, $vr4 - ldptr.d $a5, $a3, 12656 + ldptr.d $a3, $a5, 12656 ori $t3, $a2, 912 vldx $vr2, $a4, $t3 vstx $vr1, $a4, $t4 - vinsgr2vr.d $vr1, $a5, 0 - vilvl.h $vr3, $vr5, $vr1 + vinsgr2vr.d $vr1, $a3, 0 + vsllwil.wu.hu $vr3, $vr1, 0 vadd.w $vr1, $vr2, $vr3 - ldptr.d $a5, $a3, 12688 + ldptr.d $a3, $a5, 12688 ori $t2, $a2, 976 - vldx $vr6, $a4, $t2 + vldx $vr5, $a4, $t2 vstx $vr1, $a4, $t3 - vinsgr2vr.d $vr1, $a5, 0 - vilvl.h $vr2, $vr5, $vr1 - vadd.w $vr1, $vr6, $vr2 - ldptr.d $a3, $a3, 12720 + vinsgr2vr.d $vr1, $a3, 0 + vsllwil.wu.hu $vr2, $vr1, 0 + vadd.w $vr1, $vr5, $vr2 + ldptr.d $a3, $a5, 12720 vstx $vr1, $a4, $t2 ori $t1, $a2, 1040 - vldx $vr6, $a4, $t1 + vldx $vr5, $a4, $t1 vinsgr2vr.d $vr1, $a3, 0 ld.d $a3, $s2, 0 - vilvl.h $vr1, $vr5, $vr1 - vadd.w $vr5, $vr6, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 + vadd.w $vr5, $vr5, $vr1 vstx $vr5, $a4, $t1 ldx.w $a4, $a3, $t4 ori $a5, $a2, 860 @@ -9709,250 +9711,249 @@ dct_chroma_sp: # @dct_chroma_sp ori $a1, $a6, 848 add.d $a2, $s7, $a1 st.d $a2, $sp, 272 # 8-byte Folded Spill - vinsgr2vr.d $vr1, $a0, 0 - vrepli.b $vr0, 0 - vilvl.h $vr3, $vr0, $vr1 - vldx $vr1, $s7, $a1 + vinsgr2vr.d $vr0, $a0, 0 + vsllwil.wu.hu $vr2, $vr0, 0 + vldx $vr0, $s7, $a1 addi.d $a0, $sp, 544 - vstelm.w $vr3, $a0, 0, 1 + vstelm.w $vr2, $a0, 0, 1 addi.d $a0, $sp, 576 - vstelm.w $vr3, $a0, 0, 2 + vstelm.w $vr2, $a0, 0, 2 ldptr.d $a0, $s7, 12632 - vadd.w $vr1, $vr1, $vr3 + vadd.w $vr0, $vr0, $vr2 st.d $a1, $sp, 408 # 8-byte Folded Spill - vstx $vr1, $s7, $a1 + vstx $vr0, $s7, $a1 addi.d $a1, $sp, 608 - vstelm.w $vr3, $a1, 0, 3 - vinsgr2vr.d $vr1, $a0, 0 - vilvl.h $vr2, $vr0, $vr1 + vstelm.w $vr2, $a1, 0, 3 + vinsgr2vr.d $vr0, $a0, 0 + vsllwil.wu.hu $vr1, $vr0, 0 ori $a1, $a6, 864 - vldx $vr1, $s7, $a1 + vldx $vr0, $s7, $a1 addi.d $a0, $sp, 672 - vstelm.w $vr2, $a0, 0, 1 + vstelm.w $vr1, $a0, 0, 1 addi.d $a0, $sp, 704 - vstelm.w $vr2, $a0, 0, 2 + vstelm.w $vr1, $a0, 0, 2 ldptr.d $a0, $s7, 12656 - vadd.w $vr1, $vr1, $vr2 + vadd.w $vr0, $vr0, $vr1 st.d $a1, $sp, 152 # 8-byte Folded Spill - vstx $vr1, $s7, $a1 + vstx $vr0, $s7, $a1 addi.d $a1, $sp, 736 - vstelm.w $vr2, $a1, 0, 3 - vinsgr2vr.d $vr1, $a0, 0 - vilvl.h $vr4, $vr0, $vr1 + vstelm.w $vr1, $a1, 0, 3 + vinsgr2vr.d $vr0, $a0, 0 + vsllwil.wu.hu $vr3, $vr0, 0 ori $a1, $a6, 912 - vldx $vr1, $s7, $a1 + vldx $vr0, $s7, $a1 addi.d $a0, $sp, 548 - vstelm.w $vr4, $a0, 0, 1 + vstelm.w $vr3, $a0, 0, 1 addi.d $a0, $sp, 580 - vstelm.w $vr4, $a0, 0, 2 + vstelm.w $vr3, $a0, 0, 2 ldptr.d $a0, $s7, 12664 - vadd.w $vr1, $vr1, $vr4 + vadd.w $vr0, $vr0, $vr3 st.d $a1, $sp, 128 # 8-byte Folded Spill - vstx $vr1, $s7, $a1 + vstx $vr0, $s7, $a1 addi.d $a1, $sp, 612 - vstelm.w $vr4, $a1, 0, 3 - vinsgr2vr.d $vr1, $a0, 0 - vilvl.h $vr5, $vr0, $vr1 + vstelm.w $vr3, $a1, 0, 3 + vinsgr2vr.d $vr0, $a0, 0 + vsllwil.wu.hu $vr4, $vr0, 0 ori $a1, $a6, 928 - vldx $vr1, $s7, $a1 + vldx $vr0, $s7, $a1 addi.d $a0, $sp, 676 - vstelm.w $vr5, $a0, 
0, 1 + vstelm.w $vr4, $a0, 0, 1 addi.d $a0, $sp, 708 - vstelm.w $vr5, $a0, 0, 2 + vstelm.w $vr4, $a0, 0, 2 ldptr.d $a0, $s7, 12688 - vadd.w $vr1, $vr1, $vr5 + vadd.w $vr0, $vr0, $vr4 st.d $a1, $sp, 120 # 8-byte Folded Spill - vstx $vr1, $s7, $a1 + vstx $vr0, $s7, $a1 addi.d $a1, $sp, 740 - vstelm.w $vr5, $a1, 0, 3 - vinsgr2vr.d $vr1, $a0, 0 - vilvl.h $vr6, $vr0, $vr1 + vstelm.w $vr4, $a1, 0, 3 + vinsgr2vr.d $vr0, $a0, 0 + vsllwil.wu.hu $vr5, $vr0, 0 ori $a1, $a6, 976 - vldx $vr1, $s7, $a1 + vldx $vr0, $s7, $a1 addi.d $a0, $sp, 552 - vstelm.w $vr6, $a0, 0, 1 + vstelm.w $vr5, $a0, 0, 1 addi.d $a0, $sp, 584 - vstelm.w $vr6, $a0, 0, 2 + vstelm.w $vr5, $a0, 0, 2 ldptr.d $a0, $s7, 12696 - vadd.w $vr1, $vr1, $vr6 + vadd.w $vr0, $vr0, $vr5 st.d $a1, $sp, 112 # 8-byte Folded Spill - vstx $vr1, $s7, $a1 + vstx $vr0, $s7, $a1 addi.d $a1, $sp, 616 - vstelm.w $vr6, $a1, 0, 3 - vinsgr2vr.d $vr1, $a0, 0 + vstelm.w $vr5, $a1, 0, 3 + vinsgr2vr.d $vr0, $a0, 0 ori $a0, $a6, 992 - vldx $vr7, $s7, $a0 - vilvl.h $vr8, $vr0, $vr1 + vldx $vr6, $s7, $a0 + vsllwil.wu.hu $vr7, $vr0, 0 addi.d $a1, $sp, 680 - vstelm.w $vr8, $a1, 0, 1 + vstelm.w $vr7, $a1, 0, 1 addi.d $a1, $sp, 712 - vstelm.w $vr8, $a1, 0, 2 - vadd.w $vr1, $vr7, $vr8 + vstelm.w $vr7, $a1, 0, 2 + vadd.w $vr0, $vr6, $vr7 st.d $a0, $sp, 104 # 8-byte Folded Spill - vstx $vr1, $s7, $a0 + vstx $vr0, $s7, $a0 ldptr.d $a0, $s7, 12720 addi.d $a1, $sp, 744 - vstelm.w $vr8, $a1, 0, 3 + vstelm.w $vr7, $a1, 0, 3 pcalau12i $a1, %pc_hi20(.LCPI6_1) - vld $vr1, $a1, %pc_lo12(.LCPI6_1) - vinsgr2vr.d $vr7, $a0, 0 - vilvl.h $vr7, $vr0, $vr7 - vpackev.w $vr3, $vr4, $vr3 - vpackev.d $vr3, $vr6, $vr3 - vori.b $vr4, $vr1, 0 - vshuf.w $vr4, $vr7, $vr3 - vst $vr4, $sp, 512 + vld $vr0, $a1, %pc_lo12(.LCPI6_1) + vinsgr2vr.d $vr6, $a0, 0 + vsllwil.wu.hu $vr6, $vr6, 0 + vpackev.w $vr2, $vr3, $vr2 + vpackev.d $vr2, $vr5, $vr2 + vori.b $vr3, $vr0, 0 + vshuf.w $vr3, $vr6, $vr2 + vst $vr3, $sp, 512 ori $a1, $a6, 1040 - vldx $vr3, $s7, $a1 + vldx $vr2, $s7, $a1 addi.d $a0, $sp, 556 - vstelm.w $vr7, $a0, 0, 1 + vstelm.w $vr6, $a0, 0, 1 addi.d $a0, $sp, 588 - vstelm.w $vr7, $a0, 0, 2 + vstelm.w $vr6, $a0, 0, 2 ldptr.d $a0, $s7, 12728 - vadd.w $vr3, $vr3, $vr7 + vadd.w $vr2, $vr2, $vr6 st.d $a1, $sp, 168 # 8-byte Folded Spill - vstx $vr3, $s7, $a1 + vstx $vr2, $s7, $a1 addi.d $a1, $sp, 620 - vstelm.w $vr7, $a1, 0, 3 - vinsgr2vr.d $vr3, $a0, 0 - vilvl.h $vr3, $vr0, $vr3 - vpackev.w $vr2, $vr5, $vr2 - vpackev.d $vr2, $vr8, $vr2 - vori.b $vr4, $vr1, 0 - vshuf.w $vr4, $vr3, $vr2 - vst $vr4, $sp, 640 + vstelm.w $vr6, $a1, 0, 3 + vinsgr2vr.d $vr2, $a0, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vpackev.w $vr1, $vr4, $vr1 + vpackev.d $vr1, $vr7, $vr1 + vori.b $vr3, $vr0, 0 + vshuf.w $vr3, $vr2, $vr1 + vst $vr3, $sp, 640 ori $a1, $a6, 1056 - vldx $vr2, $s7, $a1 + vldx $vr1, $s7, $a1 addi.d $a0, $sp, 684 - vstelm.w $vr3, $a0, 0, 1 + vstelm.w $vr2, $a0, 0, 1 addi.d $a0, $sp, 716 - vstelm.w $vr3, $a0, 0, 2 + vstelm.w $vr2, $a0, 0, 2 ldptr.d $a0, $s7, 12752 - vadd.w $vr2, $vr2, $vr3 + vadd.w $vr1, $vr1, $vr2 st.d $a1, $sp, 320 # 8-byte Folded Spill - vstx $vr2, $s7, $a1 + vstx $vr1, $s7, $a1 addi.d $a1, $sp, 748 - vstelm.w $vr3, $a1, 0, 3 - vinsgr2vr.d $vr2, $a0, 0 - vilvl.h $vr3, $vr0, $vr2 + vstelm.w $vr2, $a1, 0, 3 + vinsgr2vr.d $vr1, $a0, 0 + vsllwil.wu.hu $vr2, $vr1, 0 ori $a1, $a6, 1104 - vldx $vr2, $s7, $a1 + vldx $vr1, $s7, $a1 addi.d $a0, $sp, 560 - vstelm.w $vr3, $a0, 0, 1 + vstelm.w $vr2, $a0, 0, 1 addi.d $a0, $sp, 592 - vstelm.w $vr3, $a0, 0, 2 + vstelm.w $vr2, $a0, 0, 2 ldptr.d $a0, $s7, 12760 - 
vadd.w $vr2, $vr2, $vr3 + vadd.w $vr1, $vr1, $vr2 st.d $a1, $sp, 144 # 8-byte Folded Spill - vstx $vr2, $s7, $a1 + vstx $vr1, $s7, $a1 addi.d $a1, $sp, 624 - vstelm.w $vr3, $a1, 0, 3 - vinsgr2vr.d $vr2, $a0, 0 - vilvl.h $vr2, $vr0, $vr2 + vstelm.w $vr2, $a1, 0, 3 + vinsgr2vr.d $vr1, $a0, 0 + vsllwil.wu.hu $vr1, $vr1, 0 ori $a1, $a6, 1120 - vldx $vr4, $s7, $a1 + vldx $vr3, $s7, $a1 addi.d $a0, $sp, 688 - vstelm.w $vr2, $a0, 0, 1 + vstelm.w $vr1, $a0, 0, 1 addi.d $a0, $sp, 720 - vstelm.w $vr2, $a0, 0, 2 + vstelm.w $vr1, $a0, 0, 2 ldptr.d $a0, $s7, 12784 - vadd.w $vr4, $vr4, $vr2 + vadd.w $vr3, $vr3, $vr1 st.d $a1, $sp, 136 # 8-byte Folded Spill - vstx $vr4, $s7, $a1 + vstx $vr3, $s7, $a1 addi.d $a1, $sp, 752 - vstelm.w $vr2, $a1, 0, 3 - vinsgr2vr.d $vr4, $a0, 0 - vilvl.h $vr4, $vr0, $vr4 + vstelm.w $vr1, $a1, 0, 3 + vinsgr2vr.d $vr3, $a0, 0 + vsllwil.wu.hu $vr3, $vr3, 0 ori $a1, $a6, 1168 - vldx $vr5, $s7, $a1 + vldx $vr4, $s7, $a1 addi.d $a0, $sp, 564 - vstelm.w $vr4, $a0, 0, 1 + vstelm.w $vr3, $a0, 0, 1 addi.d $a0, $sp, 596 - vstelm.w $vr4, $a0, 0, 2 + vstelm.w $vr3, $a0, 0, 2 ldptr.d $a0, $s7, 12792 - vadd.w $vr5, $vr5, $vr4 + vadd.w $vr4, $vr4, $vr3 st.d $a1, $sp, 96 # 8-byte Folded Spill - vstx $vr5, $s7, $a1 + vstx $vr4, $s7, $a1 addi.d $a1, $sp, 628 - vstelm.w $vr4, $a1, 0, 3 - vinsgr2vr.d $vr5, $a0, 0 - vilvl.h $vr5, $vr0, $vr5 + vstelm.w $vr3, $a1, 0, 3 + vinsgr2vr.d $vr4, $a0, 0 + vsllwil.wu.hu $vr4, $vr4, 0 ori $a1, $a6, 1184 - vldx $vr6, $s7, $a1 + vldx $vr5, $s7, $a1 addi.d $a0, $sp, 692 - vstelm.w $vr5, $a0, 0, 1 + vstelm.w $vr4, $a0, 0, 1 addi.d $a0, $sp, 724 - vstelm.w $vr5, $a0, 0, 2 + vstelm.w $vr4, $a0, 0, 2 ldptr.d $a0, $s7, 12816 - vadd.w $vr6, $vr6, $vr5 + vadd.w $vr5, $vr5, $vr4 st.d $a1, $sp, 88 # 8-byte Folded Spill - vstx $vr6, $s7, $a1 + vstx $vr5, $s7, $a1 addi.d $a1, $sp, 756 - vstelm.w $vr5, $a1, 0, 3 - vinsgr2vr.d $vr6, $a0, 0 - vilvl.h $vr6, $vr0, $vr6 + vstelm.w $vr4, $a1, 0, 3 + vinsgr2vr.d $vr5, $a0, 0 + vsllwil.wu.hu $vr5, $vr5, 0 ori $a1, $a6, 1232 - vldx $vr7, $s7, $a1 + vldx $vr6, $s7, $a1 addi.d $a0, $sp, 568 - vstelm.w $vr6, $a0, 0, 1 + vstelm.w $vr5, $a0, 0, 1 addi.d $a0, $sp, 600 - vstelm.w $vr6, $a0, 0, 2 + vstelm.w $vr5, $a0, 0, 2 ldptr.d $a0, $s7, 12824 - vadd.w $vr7, $vr7, $vr6 + vadd.w $vr6, $vr6, $vr5 st.d $a1, $sp, 80 # 8-byte Folded Spill - vstx $vr7, $s7, $a1 + vstx $vr6, $s7, $a1 addi.d $a1, $sp, 632 - vstelm.w $vr6, $a1, 0, 3 - vinsgr2vr.d $vr7, $a0, 0 - vilvl.h $vr7, $vr0, $vr7 + vstelm.w $vr5, $a1, 0, 3 + vinsgr2vr.d $vr6, $a0, 0 + vsllwil.wu.hu $vr6, $vr6, 0 ori $a1, $a6, 1248 - vldx $vr8, $s7, $a1 + vldx $vr7, $s7, $a1 addi.d $a0, $sp, 696 - vstelm.w $vr7, $a0, 0, 1 + vstelm.w $vr6, $a0, 0, 1 addi.d $a0, $sp, 728 - vstelm.w $vr7, $a0, 0, 2 + vstelm.w $vr6, $a0, 0, 2 ldptr.d $a0, $s7, 12848 - vadd.w $vr8, $vr8, $vr7 + vadd.w $vr7, $vr7, $vr6 st.d $a1, $sp, 72 # 8-byte Folded Spill - vstx $vr8, $s7, $a1 + vstx $vr7, $s7, $a1 addi.d $a1, $sp, 760 - vstelm.w $vr7, $a1, 0, 3 - vinsgr2vr.d $vr8, $a0, 0 - vilvl.h $vr8, $vr0, $vr8 - vpackev.w $vr3, $vr4, $vr3 - vpackev.d $vr3, $vr6, $vr3 - vori.b $vr4, $vr1, 0 - vshuf.w $vr4, $vr8, $vr3 - vst $vr4, $sp, 528 + vstelm.w $vr6, $a1, 0, 3 + vinsgr2vr.d $vr7, $a0, 0 + vsllwil.wu.hu $vr7, $vr7, 0 + vpackev.w $vr2, $vr3, $vr2 + vpackev.d $vr2, $vr5, $vr2 + vori.b $vr3, $vr0, 0 + vshuf.w $vr3, $vr7, $vr2 + vst $vr3, $sp, 528 ori $a1, $a6, 1296 - vldx $vr3, $s7, $a1 + vldx $vr2, $s7, $a1 addi.d $a0, $sp, 572 - vstelm.w $vr8, $a0, 0, 1 + vstelm.w $vr7, $a0, 0, 1 addi.d $a0, $sp, 604 - vstelm.w $vr8, 
$a0, 0, 2 + vstelm.w $vr7, $a0, 0, 2 ldptr.d $a0, $s7, 12856 - vadd.w $vr3, $vr3, $vr8 + vadd.w $vr2, $vr2, $vr7 st.d $a1, $sp, 64 # 8-byte Folded Spill - vstx $vr3, $s7, $a1 + vstx $vr2, $s7, $a1 addi.d $a1, $sp, 636 - vstelm.w $vr8, $a1, 0, 3 - vinsgr2vr.d $vr3, $a0, 0 - vilvl.h $vr0, $vr0, $vr3 - vpackev.w $vr2, $vr5, $vr2 - vpackev.d $vr2, $vr7, $vr2 - vshuf.w $vr1, $vr0, $vr2 + vstelm.w $vr7, $a1, 0, 3 + vinsgr2vr.d $vr2, $a0, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vpackev.w $vr1, $vr4, $vr1 + vpackev.d $vr1, $vr6, $vr1 + vshuf.w $vr0, $vr2, $vr1 ori $a0, $a6, 1312 - vldx $vr2, $s7, $a0 - vst $vr1, $sp, 656 + vldx $vr1, $s7, $a0 + vst $vr0, $sp, 656 addi.d $a1, $sp, 700 - vstelm.w $vr0, $a1, 0, 1 + vstelm.w $vr2, $a1, 0, 1 addi.d $a1, $sp, 732 - vstelm.w $vr0, $a1, 0, 2 - vadd.w $vr1, $vr2, $vr0 + vstelm.w $vr2, $a1, 0, 2 + vadd.w $vr0, $vr1, $vr2 st.d $a0, $sp, 56 # 8-byte Folded Spill - vstx $vr1, $s7, $a0 + vstx $vr0, $s7, $a0 addi.d $a0, $sp, 764 - vstelm.w $vr0, $a0, 0, 3 + vstelm.w $vr2, $a0, 0, 3 ori $a0, $a6, 852 st.d $a0, $sp, 48 # 8-byte Folded Spill add.d $a0, $s7, $a0 diff --git a/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/fmo.s b/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/fmo.s index 885b389a..be8c8dd4 100644 --- a/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/fmo.s +++ b/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/fmo.s @@ -977,31 +977,30 @@ FmoGetLastCodedMBOfSliceGroup: # @FmoGetLastCodedMBOfSliceGroup lu12i.w $a4, -524288 vreplgr2vr.w $vr2, $a4 addi.d $a5, $a3, 4 - vrepli.b $vr3, 0 move $a6, $a1 - vori.b $vr4, $vr2, 0 + vori.b $vr3, $vr2, 0 .p2align 4, , 16 .LBB8_5: # %vector.body # =>This Inner Loop Header: Depth=1 ld.w $a7, $a5, -4 ld.w $t0, $a5, 0 - vinsgr2vr.w $vr5, $a7, 0 - vaddi.wu $vr6, $vr1, 4 - vinsgr2vr.w $vr7, $t0, 0 - vilvl.b $vr5, $vr3, $vr5 - vilvl.h $vr5, $vr3, $vr5 - vilvl.b $vr7, $vr3, $vr7 - vilvl.h $vr7, $vr3, $vr7 - vseq.w $vr5, $vr0, $vr5 - vseq.w $vr7, $vr0, $vr7 - vbitsel.v $vr2, $vr2, $vr1, $vr5 - vbitsel.v $vr4, $vr4, $vr6, $vr7 + vinsgr2vr.w $vr4, $a7, 0 + vaddi.wu $vr5, $vr1, 4 + vinsgr2vr.w $vr6, $t0, 0 + vsllwil.hu.bu $vr4, $vr4, 0 + vsllwil.wu.hu $vr4, $vr4, 0 + vsllwil.hu.bu $vr6, $vr6, 0 + vsllwil.wu.hu $vr6, $vr6, 0 + vseq.w $vr4, $vr0, $vr4 + vseq.w $vr6, $vr0, $vr6 + vbitsel.v $vr2, $vr2, $vr1, $vr4 + vbitsel.v $vr3, $vr3, $vr5, $vr6 vaddi.wu $vr1, $vr1, 8 addi.d $a6, $a6, -8 addi.d $a5, $a5, 8 bnez $a6, .LBB8_5 # %bb.6: # %middle.block - vmax.w $vr0, $vr2, $vr4 + vmax.w $vr0, $vr2, $vr3 vbsrl.v $vr1, $vr0, 8 vmax.w $vr0, $vr1, $vr0 vbsrl.v $vr1, $vr0, 4 diff --git a/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/image.s b/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/image.s index 1ab8dfd5..ee7c6f3c 100644 --- a/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/image.s +++ b/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/image.s @@ -8546,7 +8546,6 @@ buf2img: # @buf2img sub.d $a7, $zero, $a5 ori $t0, $zero, 8 ori $t1, $zero, 16 - vrepli.b $vr0, 0 move $t2, $s0 b .LBB14_42 .p2align 4, , 16 @@ -8602,9 +8601,9 @@ buf2img: # @buf2img # Parent Loop BB14_42 Depth=1 # => This Inner Loop Header: Depth=2 ld.d $t7, $t6, 0 - vinsgr2vr.d $vr1, $t7, 0 - vilvl.b $vr1, $vr0, $vr1 - vst $vr1, $t5, 0 + vinsgr2vr.d $vr0, $t7, 0 + vsllwil.hu.bu $vr0, $vr0, 0 + vst $vr0, $t5, 0 addi.d $t4, $t4, 8 addi.d $t5, $t5, 16 addi.d $t6, $t6, 8 @@ -8625,12 +8624,12 @@ buf2img: # @buf2img # => This Inner Loop Header: 
Depth=2 ld.d $t7, $t5, -8 ld.d $t8, $t5, 0 - vinsgr2vr.d $vr1, $t7, 0 - vinsgr2vr.d $vr2, $t8, 0 - vilvl.b $vr1, $vr0, $vr1 - vilvl.b $vr2, $vr0, $vr2 - vst $vr1, $t4, -16 - vst $vr2, $t4, 0 + vinsgr2vr.d $vr0, $t7, 0 + vinsgr2vr.d $vr1, $t8, 0 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.hu.bu $vr1, $vr1, 0 + vst $vr0, $t4, -16 + vst $vr1, $t4, 0 addi.d $t6, $t6, -16 addi.d $t4, $t4, 32 addi.d $t5, $t5, 16 diff --git a/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/img_chroma.s b/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/img_chroma.s index 92054b22..90071266 100644 --- a/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/img_chroma.s +++ b/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/img_chroma.s @@ -117,8 +117,7 @@ getSubImagesChroma: # @getSubImagesChroma st.d $a0, $sp, 144 # 8-byte Folded Spill add.d $a0, $a6, $a7 st.d $a0, $sp, 216 # 8-byte Folded Spill - vrepli.b $vr0, 0 - vrepli.w $vr1, 32 + vrepli.w $vr0, 32 st.d $s8, $sp, 208 # 8-byte Folded Spill st.d $a4, $sp, 64 # 8-byte Folded Spill b .LBB0_8 @@ -153,8 +152,8 @@ getSubImagesChroma: # @getSubImagesChroma ldx.d $a0, $a0, $a1 st.d $a0, $sp, 80 # 8-byte Folded Spill slli.d $a5, $t6, 3 - vreplgr2vr.w $vr2, $s5 - vreplgr2vr.w $vr3, $a5 + vreplgr2vr.w $vr1, $s5 + vreplgr2vr.w $vr2, $a5 st.d $t6, $sp, 96 # 8-byte Folded Spill b .LBB0_10 .p2align 4, , 16 @@ -191,10 +190,10 @@ getSubImagesChroma: # @getSubImagesChroma mul.d $a2, $a1, $a0 st.d $a3, $sp, 104 # 8-byte Folded Spill mul.d $s4, $a3, $a0 - vreplgr2vr.w $vr4, $a2 - vreplgr2vr.w $vr5, $s4 - vreplgr2vr.w $vr6, $t8 - vreplgr2vr.w $vr7, $fp + vreplgr2vr.w $vr3, $a2 + vreplgr2vr.w $vr4, $s4 + vreplgr2vr.w $vr5, $t8 + vreplgr2vr.w $vr6, $fp ld.d $a1, $sp, 120 # 8-byte Folded Reload b .LBB0_12 .p2align 4, , 16 @@ -372,30 +371,30 @@ getSubImagesChroma: # @getSubImagesChroma # %bb.33: # %vector.ph229 # in Loop: Header=BB0_12 Depth=3 ld.h $t5, $s6, 0 - vreplgr2vr.h $vr8, $t5 - vilvl.h $vr8, $vr0, $vr8 - vinsgr2vr.h $vr9, $t5, 0 - vinsgr2vr.h $vr9, $t5, 1 - vinsgr2vr.h $vr9, $t5, 2 - vinsgr2vr.h $vr9, $t5, 3 + vreplgr2vr.h $vr7, $t5 + vinsgr2vr.h $vr8, $t5, 0 + vinsgr2vr.h $vr8, $t5, 1 + vinsgr2vr.h $vr8, $t5, 2 + vinsgr2vr.h $vr8, $t5, 3 + vsllwil.wu.hu $vr8, $vr8, 0 ld.h $t5, $a4, 0 - vilvl.h $vr9, $vr0, $vr9 - vmul.w $vr9, $vr2, $vr9 - vmul.w $vr8, $vr2, $vr8 - vreplgr2vr.h $vr10, $t5 - vinsgr2vr.h $vr11, $t5, 0 - vinsgr2vr.h $vr11, $t5, 1 - vinsgr2vr.h $vr11, $t5, 2 - vinsgr2vr.h $vr11, $t5, 3 - vilvl.h $vr11, $vr0, $vr11 - vilvl.h $vr10, $vr0, $vr10 - vmadd.w $vr8, $vr3, $vr10 - vmadd.w $vr9, $vr3, $vr11 - vadd.w $vr9, $vr9, $vr1 - vadd.w $vr8, $vr8, $vr1 + vsllwil.wu.hu $vr7, $vr7, 0 + vmul.w $vr7, $vr1, $vr7 + vmul.w $vr8, $vr1, $vr8 + vreplgr2vr.h $vr9, $t5 + vinsgr2vr.h $vr10, $t5, 0 + vinsgr2vr.h $vr10, $t5, 1 + vinsgr2vr.h $vr10, $t5, 2 + vinsgr2vr.h $vr10, $t5, 3 + vsllwil.wu.hu $vr10, $vr10, 0 + vsllwil.wu.hu $vr9, $vr9, 0 + vmadd.w $vr8, $vr2, $vr10 + vmadd.w $vr7, $vr2, $vr9 + vadd.w $vr8, $vr8, $vr0 + vadd.w $vr7, $vr7, $vr0 vsrli.w $vr8, $vr8, 6 - vsrli.w $vr9, $vr9, 6 - vpickev.h $vr8, $vr9, $vr8 + vsrli.w $vr7, $vr7, 6 + vpickev.h $vr7, $vr8, $vr7 move $t5, $a3 ld.d $t6, $sp, 176 # 8-byte Folded Reload .p2align 4, , 16 @@ -404,7 +403,7 @@ getSubImagesChroma: # @getSubImagesChroma # Parent Loop BB0_10 Depth=2 # Parent Loop BB0_12 Depth=3 # => This Inner Loop Header: Depth=4 - vst $vr8, $t5, 0 + vst $vr7, $t5, 0 addi.d $t6, $t6, -8 addi.d $t5, $t5, 16 bnez $t6, .LBB0_34 @@ -448,34 +447,38 @@ 
getSubImagesChroma: # @getSubImagesChroma # Parent Loop BB0_10 Depth=2 # Parent Loop BB0_12 Depth=3 # => This Inner Loop Header: Depth=4 - vld $vr8, $s1, -2 - vld $vr9, $s1, 0 - vilvl.h $vr10, $vr0, $vr8 - vilvh.h $vr8, $vr0, $vr8 - vilvh.h $vr11, $vr0, $vr9 - vilvl.h $vr9, $vr0, $vr9 - vld $vr12, $s2, -2 - vmul.w $vr9, $vr5, $vr9 - vld $vr13, $s2, 0 - vmul.w $vr11, $vr5, $vr11 - vilvl.h $vr14, $vr0, $vr12 - vilvh.h $vr12, $vr0, $vr12 - vilvh.h $vr15, $vr0, $vr13 - vilvl.h $vr13, $vr0, $vr13 - vmul.w $vr13, $vr7, $vr13 - vmul.w $vr15, $vr7, $vr15 - vmadd.w $vr15, $vr6, $vr12 - vmadd.w $vr13, $vr6, $vr14 - vmadd.w $vr11, $vr4, $vr8 - vmadd.w $vr9, $vr4, $vr10 - vadd.w $vr8, $vr9, $vr13 - vadd.w $vr9, $vr11, $vr15 - vadd.w $vr9, $vr9, $vr1 - vadd.w $vr8, $vr8, $vr1 + vld $vr7, $s1, -2 + vld $vr8, $s1, 0 + vsllwil.wu.hu $vr9, $vr7, 0 + vbsrl.v $vr7, $vr7, 8 + vsllwil.wu.hu $vr7, $vr7, 0 + vbsrl.v $vr10, $vr8, 8 + vsllwil.wu.hu $vr10, $vr10, 0 + vsllwil.wu.hu $vr8, $vr8, 0 + vld $vr11, $s2, -2 + vmul.w $vr8, $vr4, $vr8 + vmul.w $vr10, $vr4, $vr10 + vld $vr12, $s2, 0 + vsllwil.wu.hu $vr13, $vr11, 0 + vbsrl.v $vr11, $vr11, 8 + vsllwil.wu.hu $vr11, $vr11, 0 + vbsrl.v $vr14, $vr12, 8 + vsllwil.wu.hu $vr14, $vr14, 0 + vsllwil.wu.hu $vr12, $vr12, 0 + vmul.w $vr12, $vr6, $vr12 + vmul.w $vr14, $vr6, $vr14 + vmadd.w $vr14, $vr5, $vr11 + vmadd.w $vr12, $vr5, $vr13 + vmadd.w $vr10, $vr3, $vr7 + vmadd.w $vr8, $vr3, $vr9 + vadd.w $vr7, $vr8, $vr12 + vadd.w $vr8, $vr10, $vr14 + vadd.w $vr8, $vr8, $vr0 + vadd.w $vr7, $vr7, $vr0 + vsrli.w $vr7, $vr7, 6 vsrli.w $vr8, $vr8, 6 - vsrli.w $vr9, $vr9, 6 - vpickev.h $vr8, $vr9, $vr8 - vst $vr8, $s0, 0 + vpickev.h $vr7, $vr8, $vr7 + vst $vr7, $s0, 0 addi.d $s0, $s0, 16 addi.d $s1, $s1, 16 addi.d $s3, $s3, -8 @@ -513,37 +516,37 @@ getSubImagesChroma: # @getSubImagesChroma ld.h $a0, $t6, 0 ld.d $a4, $sp, 152 # 8-byte Folded Reload add.d $s1, $a4, $s1 - vreplgr2vr.h $vr8, $a0 - vilvl.h $vr8, $vr0, $vr8 - vinsgr2vr.h $vr9, $a0, 0 - vinsgr2vr.h $vr9, $a0, 1 - vinsgr2vr.h $vr9, $a0, 2 - vinsgr2vr.h $vr9, $a0, 3 + vreplgr2vr.h $vr7, $a0 + vinsgr2vr.h $vr8, $a0, 0 + vinsgr2vr.h $vr8, $a0, 1 + vinsgr2vr.h $vr8, $a0, 2 + vinsgr2vr.h $vr8, $a0, 3 + vsllwil.wu.hu $vr8, $vr8, 0 ld.h $a0, $s0, 0 - vilvl.h $vr9, $vr0, $vr9 - vmul.w $vr9, $vr2, $vr9 - vmul.w $vr8, $vr2, $vr8 - vreplgr2vr.h $vr10, $a0 - vinsgr2vr.h $vr11, $a0, 0 - vinsgr2vr.h $vr11, $a0, 1 - vinsgr2vr.h $vr11, $a0, 2 - vinsgr2vr.h $vr11, $a0, 3 - vilvl.h $vr11, $vr0, $vr11 - vilvl.h $vr10, $vr0, $vr10 - vmadd.w $vr8, $vr3, $vr10 - vmadd.w $vr9, $vr3, $vr11 - vadd.w $vr9, $vr9, $vr1 - vadd.w $vr8, $vr8, $vr1 + vsllwil.wu.hu $vr7, $vr7, 0 + vmul.w $vr7, $vr1, $vr7 + vmul.w $vr8, $vr1, $vr8 + vreplgr2vr.h $vr9, $a0 + vinsgr2vr.h $vr10, $a0, 0 + vinsgr2vr.h $vr10, $a0, 1 + vinsgr2vr.h $vr10, $a0, 2 + vinsgr2vr.h $vr10, $a0, 3 + vsllwil.wu.hu $vr10, $vr10, 0 + vsllwil.wu.hu $vr9, $vr9, 0 + vmadd.w $vr8, $vr2, $vr10 + vmadd.w $vr7, $vr2, $vr9 + vadd.w $vr8, $vr8, $vr0 + vadd.w $vr7, $vr7, $vr0 vsrli.w $vr8, $vr8, 6 - vsrli.w $vr9, $vr9, 6 - vpickev.h $vr8, $vr9, $vr8 + vsrli.w $vr7, $vr7, 6 + vpickev.h $vr7, $vr8, $vr7 .p2align 4, , 16 .LBB0_44: # %vector.body # Parent Loop BB0_8 Depth=1 # Parent Loop BB0_10 Depth=2 # Parent Loop BB0_12 Depth=3 # => This Inner Loop Header: Depth=4 - vst $vr8, $s2, 0 + vst $vr7, $s2, 0 addi.d $a4, $a4, -8 addi.d $s2, $s2, 16 bnez $a4, .LBB0_44 @@ -561,7 +564,7 @@ getSubImagesChroma: # @getSubImagesChroma ld.d $a0, $a0, 8 st.d $a0, $sp, 48 # 8-byte Folded Spill ld.d $a0, $sp, 32 # 8-byte 
Folded Reload - ld.d $s4, $a0, 8 + ld.d $a4, $a0, 8 ld.d $a0, $sp, 24 # 8-byte Folded Reload slli.d $a0, $a0, 3 st.d $a0, $sp, 176 # 8-byte Folded Spill @@ -605,9 +608,9 @@ getSubImagesChroma: # @getSubImagesChroma ld.d $a1, $sp, 48 # 8-byte Folded Reload ldx.d $a0, $a1, $a0 st.d $a0, $sp, 80 # 8-byte Folded Spill - slli.d $a4, $a2, 3 - vreplgr2vr.w $vr2, $a5 - vreplgr2vr.w $vr3, $a4 + slli.d $s3, $a2, 3 + vreplgr2vr.w $vr1, $a5 + vreplgr2vr.w $vr2, $s3 st.d $a2, $sp, 96 # 8-byte Folded Spill b .LBB0_50 .p2align 4, , 16 @@ -641,13 +644,13 @@ getSubImagesChroma: # @getSubImagesChroma mul.d $fp, $a0, $a2 mul.d $s7, $t2, $a2 ld.d $a1, $sp, 88 # 8-byte Folded Reload - mul.d $s3, $a0, $a1 + mul.d $s4, $a0, $a1 st.d $t2, $sp, 104 # 8-byte Folded Spill mul.d $s2, $t2, $a1 - vreplgr2vr.w $vr4, $s3 - vreplgr2vr.w $vr5, $s2 - vreplgr2vr.w $vr6, $fp - vreplgr2vr.w $vr7, $s7 + vreplgr2vr.w $vr3, $s4 + vreplgr2vr.w $vr4, $s2 + vreplgr2vr.w $vr5, $fp + vreplgr2vr.w $vr6, $s7 ld.d $a1, $sp, 120 # 8-byte Folded Reload b .LBB0_52 .p2align 4, , 16 @@ -681,9 +684,9 @@ getSubImagesChroma: # @getSubImagesChroma slli.d $a3, $t7, 3 ldx.d $a3, $t8, $a3 slli.d $a0, $a0, 3 - ldx.d $s6, $s4, $a0 + ldx.d $s6, $a4, $a0 slli.d $a0, $a2, 3 - ldx.d $s8, $s4, $a0 + ldx.d $s8, $a4, $a0 bltz $t3, .LBB0_58 # %bb.53: # %.lr.ph.us.us.us.1.preheader # in Loop: Header=BB0_52 Depth=3 @@ -704,7 +707,7 @@ getSubImagesChroma: # @getSubImagesChroma ld.hu $a0, $s6, 0 ld.hu $t2, $s8, 0 mul.d $a0, $a5, $a0 - mul.d $t2, $a4, $t2 + mul.d $t2, $s3, $t2 add.d $a0, $a0, $t2 addi.d $a0, $a0, 32 srli.d $a0, $a0, 6 @@ -747,7 +750,7 @@ getSubImagesChroma: # @getSubImagesChroma ld.hu $t2, $ra, 0 ld.hu $t5, $s5, -2 ld.hu $s1, $s5, 0 - mul.d $a0, $s3, $a0 + mul.d $a0, $s4, $a0 mul.d $t2, $s2, $t2 mul.d $t5, $fp, $t5 mul.d $s1, $s7, $s1 @@ -798,7 +801,7 @@ getSubImagesChroma: # @getSubImagesChroma ld.hu $a0, $t6, 0 ld.hu $t2, $s0, 0 mul.d $a0, $a5, $a0 - mul.d $t2, $a4, $t2 + mul.d $t2, $s3, $t2 add.d $a0, $a0, $t2 addi.d $a0, $a0, 32 srli.d $a0, $a0, 6 @@ -828,30 +831,30 @@ getSubImagesChroma: # @getSubImagesChroma # %bb.73: # %vector.ph332 # in Loop: Header=BB0_52 Depth=3 ld.h $a0, $s6, 0 - vreplgr2vr.h $vr8, $a0 - vilvl.h $vr8, $vr0, $vr8 - vinsgr2vr.h $vr9, $a0, 0 - vinsgr2vr.h $vr9, $a0, 1 - vinsgr2vr.h $vr9, $a0, 2 - vinsgr2vr.h $vr9, $a0, 3 + vreplgr2vr.h $vr7, $a0 + vinsgr2vr.h $vr8, $a0, 0 + vinsgr2vr.h $vr8, $a0, 1 + vinsgr2vr.h $vr8, $a0, 2 + vinsgr2vr.h $vr8, $a0, 3 + vsllwil.wu.hu $vr8, $vr8, 0 ld.h $a0, $s8, 0 - vilvl.h $vr9, $vr0, $vr9 - vmul.w $vr9, $vr2, $vr9 - vmul.w $vr8, $vr2, $vr8 - vreplgr2vr.h $vr10, $a0 - vinsgr2vr.h $vr11, $a0, 0 - vinsgr2vr.h $vr11, $a0, 1 - vinsgr2vr.h $vr11, $a0, 2 - vinsgr2vr.h $vr11, $a0, 3 - vilvl.h $vr11, $vr0, $vr11 - vilvl.h $vr10, $vr0, $vr10 - vmadd.w $vr8, $vr3, $vr10 - vmadd.w $vr9, $vr3, $vr11 - vadd.w $vr9, $vr9, $vr1 - vadd.w $vr8, $vr8, $vr1 + vsllwil.wu.hu $vr7, $vr7, 0 + vmul.w $vr7, $vr1, $vr7 + vmul.w $vr8, $vr1, $vr8 + vreplgr2vr.h $vr9, $a0 + vinsgr2vr.h $vr10, $a0, 0 + vinsgr2vr.h $vr10, $a0, 1 + vinsgr2vr.h $vr10, $a0, 2 + vinsgr2vr.h $vr10, $a0, 3 + vsllwil.wu.hu $vr10, $vr10, 0 + vsllwil.wu.hu $vr9, $vr9, 0 + vmadd.w $vr8, $vr2, $vr10 + vmadd.w $vr7, $vr2, $vr9 + vadd.w $vr8, $vr8, $vr0 + vadd.w $vr7, $vr7, $vr0 vsrli.w $vr8, $vr8, 6 - vsrli.w $vr9, $vr9, 6 - vpickev.h $vr8, $vr9, $vr8 + vsrli.w $vr7, $vr7, 6 + vpickev.h $vr7, $vr8, $vr7 move $a2, $a3 ld.d $t5, $sp, 176 # 8-byte Folded Reload .p2align 4, , 16 @@ -860,7 +863,7 @@ getSubImagesChroma: # @getSubImagesChroma # Parent 
Loop BB0_50 Depth=2 # Parent Loop BB0_52 Depth=3 # => This Inner Loop Header: Depth=4 - vst $vr8, $a2, 0 + vst $vr7, $a2, 0 addi.d $t5, $t5, -8 addi.d $a2, $a2, 16 bnez $t5, .LBB0_74 @@ -905,34 +908,38 @@ getSubImagesChroma: # @getSubImagesChroma # Parent Loop BB0_50 Depth=2 # Parent Loop BB0_52 Depth=3 # => This Inner Loop Header: Depth=4 - vld $vr8, $s1, -2 - vld $vr9, $s1, 0 - vilvl.h $vr10, $vr0, $vr8 - vilvh.h $vr8, $vr0, $vr8 - vilvh.h $vr11, $vr0, $vr9 - vilvl.h $vr9, $vr0, $vr9 - vld $vr12, $s5, -2 - vmul.w $vr9, $vr5, $vr9 - vld $vr13, $s5, 0 - vmul.w $vr11, $vr5, $vr11 - vilvl.h $vr14, $vr0, $vr12 - vilvh.h $vr12, $vr0, $vr12 - vilvh.h $vr15, $vr0, $vr13 - vilvl.h $vr13, $vr0, $vr13 - vmul.w $vr13, $vr7, $vr13 - vmul.w $vr15, $vr7, $vr15 - vmadd.w $vr15, $vr6, $vr12 - vmadd.w $vr13, $vr6, $vr14 - vmadd.w $vr11, $vr4, $vr8 - vmadd.w $vr9, $vr4, $vr10 - vadd.w $vr8, $vr9, $vr13 - vadd.w $vr9, $vr11, $vr15 - vadd.w $vr9, $vr9, $vr1 - vadd.w $vr8, $vr8, $vr1 + vld $vr7, $s1, -2 + vld $vr8, $s1, 0 + vsllwil.wu.hu $vr9, $vr7, 0 + vbsrl.v $vr7, $vr7, 8 + vsllwil.wu.hu $vr7, $vr7, 0 + vbsrl.v $vr10, $vr8, 8 + vsllwil.wu.hu $vr10, $vr10, 0 + vsllwil.wu.hu $vr8, $vr8, 0 + vld $vr11, $s5, -2 + vmul.w $vr8, $vr4, $vr8 + vmul.w $vr10, $vr4, $vr10 + vld $vr12, $s5, 0 + vsllwil.wu.hu $vr13, $vr11, 0 + vbsrl.v $vr11, $vr11, 8 + vsllwil.wu.hu $vr11, $vr11, 0 + vbsrl.v $vr14, $vr12, 8 + vsllwil.wu.hu $vr14, $vr14, 0 + vsllwil.wu.hu $vr12, $vr12, 0 + vmul.w $vr12, $vr6, $vr12 + vmul.w $vr14, $vr6, $vr14 + vmadd.w $vr14, $vr5, $vr11 + vmadd.w $vr12, $vr5, $vr13 + vmadd.w $vr10, $vr3, $vr7 + vmadd.w $vr8, $vr3, $vr9 + vadd.w $vr7, $vr8, $vr12 + vadd.w $vr8, $vr10, $vr14 + vadd.w $vr8, $vr8, $vr0 + vadd.w $vr7, $vr7, $vr0 + vsrli.w $vr7, $vr7, 6 vsrli.w $vr8, $vr8, 6 - vsrli.w $vr9, $vr9, 6 - vpickev.h $vr8, $vr9, $vr8 - vst $vr8, $s0, 0 + vpickev.h $vr7, $vr8, $vr7 + vst $vr7, $s0, 0 addi.d $s0, $s0, 16 addi.d $s1, $s1, 16 addi.d $a2, $a2, -8 @@ -970,37 +977,37 @@ getSubImagesChroma: # @getSubImagesChroma ld.h $a0, $t6, 0 ld.d $a2, $sp, 152 # 8-byte Folded Reload add.d $s1, $a2, $s1 - vreplgr2vr.h $vr8, $a0 - vilvl.h $vr8, $vr0, $vr8 - vinsgr2vr.h $vr9, $a0, 0 - vinsgr2vr.h $vr9, $a0, 1 - vinsgr2vr.h $vr9, $a0, 2 - vinsgr2vr.h $vr9, $a0, 3 + vreplgr2vr.h $vr7, $a0 + vinsgr2vr.h $vr8, $a0, 0 + vinsgr2vr.h $vr8, $a0, 1 + vinsgr2vr.h $vr8, $a0, 2 + vinsgr2vr.h $vr8, $a0, 3 + vsllwil.wu.hu $vr8, $vr8, 0 ld.h $a0, $s0, 0 - vilvl.h $vr9, $vr0, $vr9 - vmul.w $vr9, $vr2, $vr9 - vmul.w $vr8, $vr2, $vr8 - vreplgr2vr.h $vr10, $a0 - vinsgr2vr.h $vr11, $a0, 0 - vinsgr2vr.h $vr11, $a0, 1 - vinsgr2vr.h $vr11, $a0, 2 - vinsgr2vr.h $vr11, $a0, 3 - vilvl.h $vr11, $vr0, $vr11 - vilvl.h $vr10, $vr0, $vr10 - vmadd.w $vr8, $vr3, $vr10 - vmadd.w $vr9, $vr3, $vr11 - vadd.w $vr9, $vr9, $vr1 - vadd.w $vr8, $vr8, $vr1 + vsllwil.wu.hu $vr7, $vr7, 0 + vmul.w $vr7, $vr1, $vr7 + vmul.w $vr8, $vr1, $vr8 + vreplgr2vr.h $vr9, $a0 + vinsgr2vr.h $vr10, $a0, 0 + vinsgr2vr.h $vr10, $a0, 1 + vinsgr2vr.h $vr10, $a0, 2 + vinsgr2vr.h $vr10, $a0, 3 + vsllwil.wu.hu $vr10, $vr10, 0 + vsllwil.wu.hu $vr9, $vr9, 0 + vmadd.w $vr8, $vr2, $vr10 + vmadd.w $vr7, $vr2, $vr9 + vadd.w $vr8, $vr8, $vr0 + vadd.w $vr7, $vr7, $vr0 vsrli.w $vr8, $vr8, 6 - vsrli.w $vr9, $vr9, 6 - vpickev.h $vr8, $vr9, $vr8 + vsrli.w $vr7, $vr7, 6 + vpickev.h $vr7, $vr8, $vr7 .p2align 4, , 16 .LBB0_84: # %vector.body269 # Parent Loop BB0_48 Depth=1 # Parent Loop BB0_50 Depth=2 # Parent Loop BB0_52 Depth=3 # => This Inner Loop Header: Depth=4 - vst $vr8, $s5, 0 + vst $vr7, $s5, 0 
addi.d $a2, $a2, -8 addi.d $s5, $s5, 16 bnez $a2, .LBB0_84 diff --git a/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/img_luma.s b/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/img_luma.s index 95389f9f..f01ad8a6 100644 --- a/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/img_luma.s +++ b/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/img_luma.s @@ -382,9 +382,8 @@ getHorSubImageSixTap: # @getHorSubImageSixTap bstrins.d $a7, $t4, 1, 0 st.d $a7, $sp, 8 # 8-byte Folded Spill ori $fp, $zero, 20 - vrepli.b $vr0, 0 - vrepli.w $vr1, 20 - vrepli.w $vr2, -5 + vrepli.w $vr0, 20 + vrepli.w $vr1, -5 slli.d $s1, $t7, 1 slli.d $s2, $s0, 1 slli.d $s3, $s3, 1 @@ -603,7 +602,7 @@ getHorSubImageSixTap: # @getHorSubImageSixTap bnez $t6, .LBB1_5 # %bb.9: # %vector.ph # in Loop: Header=BB1_3 Depth=1 - vldrepl.w $vr3, $a5, 0 + vldrepl.w $vr2, $a5, 0 addi.d $t6, $s8, 6 ld.d $t7, $sp, 32 # 8-byte Folded Reload .p2align 4, , 16 @@ -613,33 +612,33 @@ getHorSubImageSixTap: # @getHorSubImageSixTap ld.d $s0, $t6, -2 ld.d $t8, $t6, 0 ld.d $a7, $t6, -4 - vinsgr2vr.d $vr4, $s0, 0 + vinsgr2vr.d $vr3, $s0, 0 ld.d $s0, $t6, 2 - vinsgr2vr.d $vr5, $t8, 0 - vinsgr2vr.d $vr6, $a7, 0 - vilvl.h $vr4, $vr0, $vr4 - vinsgr2vr.d $vr7, $s0, 0 + vinsgr2vr.d $vr4, $t8, 0 + vinsgr2vr.d $vr5, $a7, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vinsgr2vr.d $vr6, $s0, 0 ld.d $a7, $t6, -6 - vilvl.h $vr5, $vr0, $vr5 - vadd.w $vr4, $vr5, $vr4 - vilvl.h $vr5, $vr0, $vr6 - vinsgr2vr.d $vr6, $a7, 0 + vsllwil.wu.hu $vr4, $vr4, 0 + vadd.w $vr3, $vr4, $vr3 + vsllwil.wu.hu $vr4, $vr5, 0 + vinsgr2vr.d $vr5, $a7, 0 ld.d $a7, $t6, 4 - vilvl.h $vr7, $vr0, $vr7 - vadd.w $vr5, $vr7, $vr5 - vilvl.h $vr6, $vr0, $vr6 - vinsgr2vr.d $vr7, $a7, 0 - vilvl.h $vr7, $vr0, $vr7 - vmadd.w $vr6, $vr4, $vr1 - vmadd.w $vr6, $vr5, $vr2 - vadd.w $vr4, $vr6, $vr7 - vaddi.wu $vr5, $vr4, 16 - vsrai.w $vr5, $vr5, 5 - vmaxi.w $vr5, $vr5, 0 - vmin.w $vr5, $vr5, $vr3 - vpickev.h $vr5, $vr5, $vr5 - vstelm.d $vr5, $ra, 0, 0 - vst $vr4, $t4, 0 + vsllwil.wu.hu $vr6, $vr6, 0 + vadd.w $vr4, $vr6, $vr4 + vsllwil.wu.hu $vr5, $vr5, 0 + vinsgr2vr.d $vr6, $a7, 0 + vsllwil.wu.hu $vr6, $vr6, 0 + vmadd.w $vr5, $vr3, $vr0 + vmadd.w $vr5, $vr4, $vr1 + vadd.w $vr3, $vr5, $vr6 + vaddi.wu $vr4, $vr3, 16 + vsrai.w $vr4, $vr4, 5 + vmaxi.w $vr4, $vr4, 0 + vmin.w $vr4, $vr4, $vr2 + vpickev.h $vr4, $vr4, $vr4 + vstelm.d $vr4, $ra, 0, 0 + vst $vr3, $t4, 0 addi.d $t7, $t7, -4 addi.d $t4, $t4, 16 addi.d $t6, $t6, 8 @@ -864,9 +863,8 @@ getVerSubImageSixTap: # @getVerSubImageSixTap ori $t6, $zero, 2 ori $t4, $zero, 8 ori $t5, $zero, 16 - vrepli.b $vr1, 0 - vrepli.w $vr2, 20 - vrepli.w $vr3, -5 + vrepli.w $vr1, 20 + vrepli.w $vr2, -5 b .LBB2_16 .p2align 4, , 16 .LBB2_15: # %..loopexit228_crit_edge.us @@ -977,46 +975,52 @@ getVerSubImageSixTap: # @getVerSubImageSixTap .LBB2_27: # %vector.body499 # Parent Loop BB2_16 Depth=1 # => This Inner Loop Header: Depth=2 - vld $vr4, $s4, 0 - vld $vr5, $s5, 0 - vilvh.h $vr6, $vr1, $vr4 - vilvl.h $vr4, $vr1, $vr4 - vilvh.h $vr7, $vr1, $vr5 - vilvl.h $vr5, $vr1, $vr5 - vadd.w $vr4, $vr5, $vr4 - vadd.w $vr5, $vr7, $vr6 - vld $vr6, $s6, 0 - vmul.w $vr5, $vr5, $vr2 - vld $vr7, $s7, 0 - vmul.w $vr4, $vr4, $vr2 - vilvl.h $vr8, $vr1, $vr6 - vilvh.h $vr6, $vr1, $vr6 - vilvl.h $vr9, $vr1, $vr7 - vilvh.h $vr7, $vr1, $vr7 - vld $vr10, $s8, 0 - vadd.w $vr6, $vr7, $vr6 + vld $vr3, $s4, 0 + vld $vr4, $s5, 0 + vbsrl.v $vr5, $vr3, 8 + vsllwil.wu.hu $vr5, $vr5, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vbsrl.v $vr6, $vr4, 8 + 
vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.wu.hu $vr4, $vr4, 0 + vadd.w $vr3, $vr4, $vr3 + vadd.w $vr4, $vr6, $vr5 + vld $vr5, $s6, 0 + vmul.w $vr4, $vr4, $vr1 + vmul.w $vr3, $vr3, $vr1 + vld $vr6, $s7, 0 + vsllwil.wu.hu $vr7, $vr5, 0 + vbsrl.v $vr5, $vr5, 8 + vsllwil.wu.hu $vr5, $vr5, 0 + vsllwil.wu.hu $vr8, $vr6, 0 + vbsrl.v $vr6, $vr6, 8 + vsllwil.wu.hu $vr6, $vr6, 0 + vld $vr9, $s8, 0 + vadd.w $vr5, $vr6, $vr5 + vadd.w $vr6, $vr8, $vr7 vld $vr7, $ra, 0 - vadd.w $vr8, $vr9, $vr8 - vilvl.h $vr9, $vr1, $vr10 - vilvh.h $vr10, $vr1, $vr10 - vilvh.h $vr11, $vr1, $vr7 - vilvl.h $vr7, $vr1, $vr7 - vmadd.w $vr4, $vr8, $vr3 - vmadd.w $vr5, $vr6, $vr3 - vadd.w $vr5, $vr5, $vr10 + vsllwil.wu.hu $vr8, $vr9, 0 + vbsrl.v $vr9, $vr9, 8 + vsllwil.wu.hu $vr9, $vr9, 0 + vbsrl.v $vr10, $vr7, 8 + vsllwil.wu.hu $vr10, $vr10, 0 + vsllwil.wu.hu $vr7, $vr7, 0 + vmadd.w $vr3, $vr6, $vr2 + vmadd.w $vr4, $vr5, $vr2 vadd.w $vr4, $vr4, $vr9 - vadd.w $vr4, $vr4, $vr7 - vadd.w $vr5, $vr5, $vr11 - vaddi.wu $vr5, $vr5, 16 + vadd.w $vr3, $vr3, $vr8 + vadd.w $vr3, $vr3, $vr7 + vadd.w $vr4, $vr4, $vr10 vaddi.wu $vr4, $vr4, 16 + vaddi.wu $vr3, $vr3, 16 + vsrai.w $vr3, $vr3, 5 vsrai.w $vr4, $vr4, 5 - vsrai.w $vr5, $vr5, 5 - vmaxi.w $vr5, $vr5, 0 vmaxi.w $vr4, $vr4, 0 + vmaxi.w $vr3, $vr3, 0 + vmin.w $vr3, $vr3, $vr0 vmin.w $vr4, $vr4, $vr0 - vmin.w $vr5, $vr5, $vr0 - vpickev.h $vr4, $vr5, $vr4 - vst $vr4, $fp, 0 + vpickev.h $vr3, $vr4, $vr3 + vst $vr3, $fp, 0 addi.d $t3, $t3, -8 addi.d $fp, $fp, 16 addi.d $ra, $ra, 16 @@ -1478,12 +1482,12 @@ getVerSubImageSixTap: # @getVerSubImageSixTap # Child Loop BB2_63 Depth 2 # Child Loop BB2_66 Depth 2 slli.d $a7, $t6, 3 - ldx.d $fp, $a2, $a7 + ldx.d $a7, $a2, $a7 alsl.d $t3, $t6, $t1, 3 addi.d $t6, $t6, 1 - slli.d $a7, $t6, 3 - ldx.d $t7, $t1, $a7 - ld.d $a7, $t3, -8 + slli.d $t7, $t6, 3 + ldx.d $t7, $t1, $t7 + ld.d $fp, $t3, -8 ld.d $s0, $t3, 16 ld.d $s1, $t3, -16 ld.d $s2, $t3, 24 @@ -1496,11 +1500,11 @@ getVerSubImageSixTap: # @getVerSubImageSixTap # in Loop: Header=BB2_60 Depth=1 move $s3, $t8 move $s4, $t7 - move $s5, $a7 + move $s5, $fp move $s6, $s0 move $s7, $s1 move $s8, $s2 - move $ra, $fp + move $ra, $a7 move $t3, $a6 .p2align 4, , 16 .LBB2_63: # %vector.body361 @@ -1541,11 +1545,11 @@ getVerSubImageSixTap: # @getVerSubImageSixTap # in Loop: Header=BB2_60 Depth=1 alsl.d $t8, $t3, $t8, 2 alsl.d $s3, $t3, $t7, 2 - alsl.d $a7, $t3, $a7, 2 + alsl.d $fp, $t3, $fp, 2 alsl.d $s0, $t3, $s0, 2 alsl.d $s1, $t3, $s1, 2 alsl.d $s2, $t3, $s2, 2 - alsl.d $fp, $t3, $fp, 1 + alsl.d $a7, $t3, $a7, 1 sub.d $s4, $a4, $t3 .p2align 4, , 16 .LBB2_66: # %scalar.ph354 @@ -1553,7 +1557,7 @@ getVerSubImageSixTap: # @getVerSubImageSixTap # => This Inner Loop Header: Depth=2 ld.w $t3, $t8, 0 ld.w $s5, $s3, 0 - ld.w $s6, $a7, 0 + ld.w $s6, $fp, 0 ld.w $s7, $s0, 0 add.w $t3, $s5, $t3 slli.d $s5, $t3, 4 @@ -1573,15 +1577,15 @@ getVerSubImageSixTap: # @getVerSubImageSixTap maskeqz $t3, $t3, $s5 masknez $s5, $t2, $s5 or $t3, $t3, $s5 - st.h $t3, $fp, 0 + st.h $t3, $a7, 0 addi.d $t8, $t8, 4 addi.d $s3, $s3, 4 - addi.d $a7, $a7, 4 + addi.d $fp, $fp, 4 addi.d $s0, $s0, 4 addi.d $s1, $s1, 4 addi.d $s2, $s2, 4 addi.d $s4, $s4, -1 - addi.d $fp, $fp, 2 + addi.d $a7, $a7, 2 bnez $s4, .LBB2_66 b .LBB2_59 .LBB2_67: @@ -1623,22 +1627,22 @@ getVerSubImageSixTap: # @getVerSubImageSixTap ld.d $t7, $fp, -8 slli.d $a7, $t3, 3 ldx.d $t8, $a4, $a7 - ld.d $fp, $fp, -16 - slli.d $a7, $t1, 3 - ldx.d $s0, $a4, $a7 - lu12i.w $a7, 3 - ori $t1, $a7, 3232 + ld.d $a7, $fp, -16 + slli.d $t1, $t1, 3 + ldx.d $fp, $a4, $t1 + lu12i.w $t1, 3 
+ ori $t1, $t1, 3232 ldx.w $t3, $t0, $t1 or $a3, $a6, $a3 ori $a6, $zero, 4 bstrpick.d $a3, $a3, 31, 0 bge $a0, $a6, .LBB2_71 # %bb.70: - move $a7, $zero + move $s0, $zero b .LBB2_74 .LBB2_71: # %vector.ph375 bstrpick.d $a6, $a3, 30, 2 - slli.d $a7, $a6, 2 + slli.d $s0, $a6, 2 vreplgr2vr.w $vr0, $t3 vrepli.w $vr1, 20 vrepli.w $vr2, -5 @@ -1648,10 +1652,10 @@ getVerSubImageSixTap: # @getVerSubImageSixTap move $s2, $t6 move $s3, $t7 move $s4, $t8 - move $s5, $fp - move $s6, $s0 + move $s5, $a7 + move $s6, $fp move $s7, $t4 - move $a6, $a7 + move $a6, $s0 .p2align 4, , 16 .LBB2_72: # %vector.body380 # =>This Inner Loop Header: Depth=1 @@ -1684,16 +1688,16 @@ getVerSubImageSixTap: # @getVerSubImageSixTap bnez $a6, .LBB2_72 # %bb.73: # %middle.block389 ld.d $s7, $sp, 32 # 8-byte Folded Reload - beq $a7, $a3, .LBB2_76 + beq $s0, $a3, .LBB2_76 .LBB2_74: # %scalar.ph373.preheader - alsl.d $t4, $a7, $t4, 1 - alsl.d $s0, $a7, $s0, 2 - alsl.d $fp, $a7, $fp, 2 - alsl.d $t8, $a7, $t8, 2 - alsl.d $t7, $a7, $t7, 2 - alsl.d $t6, $a7, $t6, 2 - alsl.d $t5, $a7, $t5, 2 - sub.d $a7, $a3, $a7 + alsl.d $t4, $s0, $t4, 1 + alsl.d $fp, $s0, $fp, 2 + alsl.d $a7, $s0, $a7, 2 + alsl.d $t8, $s0, $t8, 2 + alsl.d $t7, $s0, $t7, 2 + alsl.d $t6, $s0, $t6, 2 + alsl.d $t5, $s0, $t5, 2 + sub.d $s0, $a3, $s0 .p2align 4, , 16 .LBB2_75: # %scalar.ph373 # =>This Inner Loop Header: Depth=1 @@ -1705,8 +1709,8 @@ getVerSubImageSixTap: # @getVerSubImageSixTap slli.d $s1, $a6, 4 alsl.d $a6, $a6, $s1, 2 add.w $s1, $s3, $s2 - ld.w $s2, $fp, 0 - ld.w $s3, $s0, 0 + ld.w $s2, $a7, 0 + ld.w $s3, $fp, 0 alsl.d $s1, $s1, $s1, 2 sub.d $a6, $a6, $s1 add.d $a6, $a6, $s2 @@ -1721,14 +1725,14 @@ getVerSubImageSixTap: # @getVerSubImageSixTap or $a6, $a6, $s1 st.h $a6, $t4, 0 addi.d $t4, $t4, 2 - addi.d $s0, $s0, 4 addi.d $fp, $fp, 4 + addi.d $a7, $a7, 4 addi.d $t8, $t8, 4 addi.d $t7, $t7, 4 addi.d $t6, $t6, 4 - addi.d $a7, $a7, -1 + addi.d $s0, $s0, -1 addi.d $t5, $t5, 4 - bnez $a7, .LBB2_75 + bnez $s0, .LBB2_75 .LBB2_76: # %.loopexit232 bge $t2, $s8, .LBB2_116 # %bb.77: # %.lr.ph.1 @@ -1786,9 +1790,8 @@ getVerSubImageSixTap: # @getVerSubImageSixTap bstrpick.d $a6, $a4, 30, 3 slli.d $t8, $a6, 3 vreplgr2vr.w $vr0, $a5 - vrepli.b $vr1, 0 - vrepli.w $vr2, 20 - vrepli.w $vr3, -5 + vrepli.w $vr1, 20 + vrepli.w $vr2, -5 move $a7, $t3 move $fp, $t4 move $s0, $t5 @@ -1798,40 +1801,44 @@ getVerSubImageSixTap: # @getVerSubImageSixTap .p2align 4, , 16 .LBB2_84: # %vector.body442 # =>This Inner Loop Header: Depth=1 - vld $vr4, $a7, 0 - vld $vr5, $fp, 0 - vilvh.h $vr6, $vr1, $vr4 - vilvl.h $vr4, $vr1, $vr4 - vilvh.h $vr7, $vr1, $vr5 - vilvl.h $vr5, $vr1, $vr5 - vadd.w $vr5, $vr5, $vr4 - vld $vr8, $s0, 0 - vadd.w $vr7, $vr7, $vr6 - vmul.w $vr7, $vr7, $vr2 - vmul.w $vr5, $vr5, $vr2 - vilvl.h $vr9, $vr1, $vr8 - vld $vr10, $s1, 0 - vilvh.h $vr8, $vr1, $vr8 - vadd.w $vr8, $vr8, $vr6 - vadd.w $vr9, $vr9, $vr4 - vilvh.h $vr11, $vr1, $vr10 - vilvl.h $vr10, $vr1, $vr10 - vmadd.w $vr5, $vr9, $vr3 - vmadd.w $vr7, $vr8, $vr3 - vadd.w $vr6, $vr7, $vr6 - vadd.w $vr4, $vr5, $vr4 - vadd.w $vr4, $vr4, $vr10 - vadd.w $vr5, $vr6, $vr11 - vaddi.wu $vr5, $vr5, 16 + vld $vr3, $a7, 0 + vld $vr4, $fp, 0 + vbsrl.v $vr5, $vr3, 8 + vsllwil.wu.hu $vr5, $vr5, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vbsrl.v $vr6, $vr4, 8 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.wu.hu $vr4, $vr4, 0 + vadd.w $vr4, $vr4, $vr3 + vld $vr7, $s0, 0 + vadd.w $vr6, $vr6, $vr5 + vmul.w $vr6, $vr6, $vr1 + vmul.w $vr4, $vr4, $vr1 + vsllwil.wu.hu $vr8, $vr7, 0 + vbsrl.v $vr7, $vr7, 8 + vld $vr9, $s1, 0 + vsllwil.wu.hu $vr7, 
$vr7, 0 + vadd.w $vr7, $vr7, $vr5 + vadd.w $vr8, $vr8, $vr3 + vbsrl.v $vr10, $vr9, 8 + vsllwil.wu.hu $vr10, $vr10, 0 + vsllwil.wu.hu $vr9, $vr9, 0 + vmadd.w $vr4, $vr8, $vr2 + vmadd.w $vr6, $vr7, $vr2 + vadd.w $vr5, $vr6, $vr5 + vadd.w $vr3, $vr4, $vr3 + vadd.w $vr3, $vr3, $vr9 + vadd.w $vr4, $vr5, $vr10 vaddi.wu $vr4, $vr4, 16 + vaddi.wu $vr3, $vr3, 16 + vsrai.w $vr3, $vr3, 5 vsrai.w $vr4, $vr4, 5 - vsrai.w $vr5, $vr5, 5 - vmaxi.w $vr5, $vr5, 0 vmaxi.w $vr4, $vr4, 0 + vmaxi.w $vr3, $vr3, 0 + vmin.w $vr3, $vr3, $vr0 vmin.w $vr4, $vr4, $vr0 - vmin.w $vr5, $vr5, $vr0 - vpickev.h $vr4, $vr5, $vr4 - vst $vr4, $s2, 0 + vpickev.h $vr3, $vr4, $vr3 + vst $vr3, $s2, 0 addi.d $s3, $s3, -8 addi.d $s2, $s2, 16 addi.d $s1, $s1, 16 @@ -1867,9 +1874,8 @@ getVerSubImageSixTap: # @getVerSubImageSixTap bstrpick.d $a6, $a4, 30, 3 slli.d $fp, $a6, 3 vreplgr2vr.w $vr0, $a5 - vrepli.b $vr1, 0 - vrepli.w $vr2, 20 - vrepli.w $vr3, -5 + vrepli.w $vr1, 20 + vrepli.w $vr2, -5 move $a7, $t4 move $s0, $t5 move $s1, $t3 @@ -1880,43 +1886,48 @@ getVerSubImageSixTap: # @getVerSubImageSixTap .p2align 4, , 16 .LBB2_92: # %vector.body469 # =>This Inner Loop Header: Depth=1 - vld $vr4, $a7, 0 - vld $vr5, $s0, 0 - vilvh.h $vr6, $vr1, $vr4 - vilvl.h $vr4, $vr1, $vr4 - vilvh.h $vr7, $vr1, $vr5 - vilvl.h $vr5, $vr1, $vr5 - vadd.w $vr4, $vr5, $vr4 - vadd.w $vr5, $vr7, $vr6 - vld $vr6, $s1, 0 - vmul.w $vr5, $vr5, $vr2 - vld $vr7, $s2, 0 - vmul.w $vr4, $vr4, $vr2 - vilvl.h $vr8, $vr1, $vr6 - vilvh.h $vr6, $vr1, $vr6 - vilvl.h $vr9, $vr1, $vr7 - vld $vr10, $s3, 0 - vilvh.h $vr7, $vr1, $vr7 - vadd.w $vr7, $vr7, $vr6 - vadd.w $vr9, $vr9, $vr8 - vilvh.h $vr11, $vr1, $vr10 - vilvl.h $vr10, $vr1, $vr10 - vmadd.w $vr4, $vr9, $vr3 - vmadd.w $vr5, $vr7, $vr3 - vadd.w $vr5, $vr5, $vr6 - vadd.w $vr4, $vr4, $vr8 + vld $vr3, $a7, 0 + vld $vr4, $s0, 0 + vbsrl.v $vr5, $vr3, 8 + vsllwil.wu.hu $vr5, $vr5, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vbsrl.v $vr6, $vr4, 8 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.wu.hu $vr4, $vr4, 0 + vadd.w $vr3, $vr4, $vr3 + vadd.w $vr4, $vr6, $vr5 + vld $vr5, $s1, 0 + vmul.w $vr4, $vr4, $vr1 + vmul.w $vr3, $vr3, $vr1 + vld $vr6, $s2, 0 + vsllwil.wu.hu $vr7, $vr5, 0 + vbsrl.v $vr5, $vr5, 8 + vsllwil.wu.hu $vr5, $vr5, 0 + vsllwil.wu.hu $vr8, $vr6, 0 + vbsrl.v $vr6, $vr6, 8 + vld $vr9, $s3, 0 + vsllwil.wu.hu $vr6, $vr6, 0 + vadd.w $vr6, $vr6, $vr5 + vadd.w $vr8, $vr8, $vr7 + vbsrl.v $vr10, $vr9, 8 + vsllwil.wu.hu $vr10, $vr10, 0 + vsllwil.wu.hu $vr9, $vr9, 0 + vmadd.w $vr3, $vr8, $vr2 + vmadd.w $vr4, $vr6, $vr2 + vadd.w $vr4, $vr4, $vr5 + vadd.w $vr3, $vr3, $vr7 + vadd.w $vr3, $vr3, $vr9 vadd.w $vr4, $vr4, $vr10 - vadd.w $vr5, $vr5, $vr11 - vaddi.wu $vr5, $vr5, 16 vaddi.wu $vr4, $vr4, 16 + vaddi.wu $vr3, $vr3, 16 + vsrai.w $vr3, $vr3, 5 vsrai.w $vr4, $vr4, 5 - vsrai.w $vr5, $vr5, 5 - vmaxi.w $vr5, $vr5, 0 vmaxi.w $vr4, $vr4, 0 + vmaxi.w $vr3, $vr3, 0 + vmin.w $vr3, $vr3, $vr0 vmin.w $vr4, $vr4, $vr0 - vmin.w $vr5, $vr5, $vr0 - vpickev.h $vr4, $vr5, $vr4 - vst $vr4, $s4, 0 + vpickev.h $vr3, $vr4, $vr3 + vst $vr3, $s4, 0 addi.d $s5, $s5, -8 addi.d $s4, $s4, 16 addi.d $s3, $s3, 16 @@ -1962,9 +1973,8 @@ getVerSubImageSixTap: # @getVerSubImageSixTap bstrpick.d $a5, $a4, 30, 3 slli.d $s1, $a5, 3 vreplgr2vr.w $vr0, $t3 - vrepli.b $vr1, 0 - vrepli.w $vr2, 20 - vrepli.w $vr3, -5 + vrepli.w $vr1, 20 + vrepli.w $vr2, -5 move $s2, $t5 move $s3, $t6 move $s4, $t7 @@ -1976,46 +1986,52 @@ getVerSubImageSixTap: # @getVerSubImageSixTap .p2align 4, , 16 .LBB2_102: # %vector.body530 # =>This Inner Loop Header: Depth=1 - vld $vr4, $s2, 0 - vld 
$vr5, $s3, 0 - vilvh.h $vr6, $vr1, $vr4 - vilvl.h $vr4, $vr1, $vr4 - vilvh.h $vr7, $vr1, $vr5 - vilvl.h $vr5, $vr1, $vr5 - vadd.w $vr4, $vr5, $vr4 - vadd.w $vr5, $vr7, $vr6 - vld $vr6, $s4, 0 - vmul.w $vr5, $vr5, $vr2 - vld $vr7, $s5, 0 - vmul.w $vr4, $vr4, $vr2 - vilvl.h $vr8, $vr1, $vr6 - vilvh.h $vr6, $vr1, $vr6 - vilvl.h $vr9, $vr1, $vr7 - vilvh.h $vr7, $vr1, $vr7 - vld $vr10, $s6, 0 - vadd.w $vr6, $vr7, $vr6 + vld $vr3, $s2, 0 + vld $vr4, $s3, 0 + vbsrl.v $vr5, $vr3, 8 + vsllwil.wu.hu $vr5, $vr5, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vbsrl.v $vr6, $vr4, 8 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.wu.hu $vr4, $vr4, 0 + vadd.w $vr3, $vr4, $vr3 + vadd.w $vr4, $vr6, $vr5 + vld $vr5, $s4, 0 + vmul.w $vr4, $vr4, $vr1 + vmul.w $vr3, $vr3, $vr1 + vld $vr6, $s5, 0 + vsllwil.wu.hu $vr7, $vr5, 0 + vbsrl.v $vr5, $vr5, 8 + vsllwil.wu.hu $vr5, $vr5, 0 + vsllwil.wu.hu $vr8, $vr6, 0 + vbsrl.v $vr6, $vr6, 8 + vsllwil.wu.hu $vr6, $vr6, 0 + vld $vr9, $s6, 0 + vadd.w $vr5, $vr6, $vr5 + vadd.w $vr6, $vr8, $vr7 vld $vr7, $s7, 0 - vadd.w $vr8, $vr9, $vr8 - vilvl.h $vr9, $vr1, $vr10 - vilvh.h $vr10, $vr1, $vr10 - vilvh.h $vr11, $vr1, $vr7 - vilvl.h $vr7, $vr1, $vr7 - vmadd.w $vr4, $vr8, $vr3 - vmadd.w $vr5, $vr6, $vr3 - vadd.w $vr5, $vr5, $vr10 + vsllwil.wu.hu $vr8, $vr9, 0 + vbsrl.v $vr9, $vr9, 8 + vsllwil.wu.hu $vr9, $vr9, 0 + vbsrl.v $vr10, $vr7, 8 + vsllwil.wu.hu $vr10, $vr10, 0 + vsllwil.wu.hu $vr7, $vr7, 0 + vmadd.w $vr3, $vr6, $vr2 + vmadd.w $vr4, $vr5, $vr2 vadd.w $vr4, $vr4, $vr9 - vadd.w $vr4, $vr4, $vr7 - vadd.w $vr5, $vr5, $vr11 - vaddi.wu $vr5, $vr5, 16 + vadd.w $vr3, $vr3, $vr8 + vadd.w $vr3, $vr3, $vr7 + vadd.w $vr4, $vr4, $vr10 vaddi.wu $vr4, $vr4, 16 + vaddi.wu $vr3, $vr3, 16 + vsrai.w $vr3, $vr3, 5 vsrai.w $vr4, $vr4, 5 - vsrai.w $vr5, $vr5, 5 - vmaxi.w $vr5, $vr5, 0 vmaxi.w $vr4, $vr4, 0 + vmaxi.w $vr3, $vr3, 0 + vmin.w $vr3, $vr3, $vr0 vmin.w $vr4, $vr4, $vr0 - vmin.w $vr5, $vr5, $vr0 - vpickev.h $vr4, $vr5, $vr4 - vst $vr4, $a7, 0 + vpickev.h $vr3, $vr4, $vr3 + vst $vr3, $a7, 0 addi.d $a5, $a5, -8 addi.d $a7, $a7, 16 addi.d $s7, $s7, 16 @@ -2277,9 +2293,8 @@ getVerSubImageSixTap: # @getVerSubImageSixTap bstrpick.d $a6, $a4, 30, 3 slli.d $s1, $a6, 3 vreplgr2vr.w $vr0, $t3 - vrepli.b $vr1, 0 - vrepli.w $vr2, 20 - vrepli.w $vr3, -5 + vrepli.w $vr1, 20 + vrepli.w $vr2, -5 move $s2, $t5 move $s3, $t6 move $s4, $t7 @@ -2291,46 +2306,52 @@ getVerSubImageSixTap: # @getVerSubImageSixTap .p2align 4, , 16 .LBB2_124: # %vector.body561 # =>This Inner Loop Header: Depth=1 - vld $vr4, $s2, 0 - vld $vr5, $s3, 0 - vilvh.h $vr6, $vr1, $vr4 - vilvl.h $vr4, $vr1, $vr4 - vilvh.h $vr7, $vr1, $vr5 - vilvl.h $vr5, $vr1, $vr5 - vadd.w $vr4, $vr5, $vr4 - vadd.w $vr5, $vr7, $vr6 - vld $vr6, $s4, 0 - vmul.w $vr5, $vr5, $vr2 - vld $vr7, $s5, 0 - vmul.w $vr4, $vr4, $vr2 - vilvl.h $vr8, $vr1, $vr6 - vilvh.h $vr6, $vr1, $vr6 - vilvl.h $vr9, $vr1, $vr7 - vilvh.h $vr7, $vr1, $vr7 - vld $vr10, $s6, 0 - vadd.w $vr6, $vr7, $vr6 + vld $vr3, $s2, 0 + vld $vr4, $s3, 0 + vbsrl.v $vr5, $vr3, 8 + vsllwil.wu.hu $vr5, $vr5, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vbsrl.v $vr6, $vr4, 8 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.wu.hu $vr4, $vr4, 0 + vadd.w $vr3, $vr4, $vr3 + vadd.w $vr4, $vr6, $vr5 + vld $vr5, $s4, 0 + vmul.w $vr4, $vr4, $vr1 + vmul.w $vr3, $vr3, $vr1 + vld $vr6, $s5, 0 + vsllwil.wu.hu $vr7, $vr5, 0 + vbsrl.v $vr5, $vr5, 8 + vsllwil.wu.hu $vr5, $vr5, 0 + vsllwil.wu.hu $vr8, $vr6, 0 + vbsrl.v $vr6, $vr6, 8 + vsllwil.wu.hu $vr6, $vr6, 0 + vld $vr9, $s6, 0 + vadd.w $vr5, $vr6, $vr5 + vadd.w $vr6, $vr8, $vr7 vld $vr7, $s7, 0 - vadd.w 
$vr8, $vr9, $vr8 - vilvl.h $vr9, $vr1, $vr10 - vilvh.h $vr10, $vr1, $vr10 - vilvh.h $vr11, $vr1, $vr7 - vilvl.h $vr7, $vr1, $vr7 - vmadd.w $vr4, $vr8, $vr3 - vmadd.w $vr5, $vr6, $vr3 - vadd.w $vr5, $vr5, $vr10 + vsllwil.wu.hu $vr8, $vr9, 0 + vbsrl.v $vr9, $vr9, 8 + vsllwil.wu.hu $vr9, $vr9, 0 + vbsrl.v $vr10, $vr7, 8 + vsllwil.wu.hu $vr10, $vr10, 0 + vsllwil.wu.hu $vr7, $vr7, 0 + vmadd.w $vr3, $vr6, $vr2 + vmadd.w $vr4, $vr5, $vr2 vadd.w $vr4, $vr4, $vr9 - vadd.w $vr4, $vr4, $vr7 - vadd.w $vr5, $vr5, $vr11 - vaddi.wu $vr5, $vr5, 16 + vadd.w $vr3, $vr3, $vr8 + vadd.w $vr3, $vr3, $vr7 + vadd.w $vr4, $vr4, $vr10 vaddi.wu $vr4, $vr4, 16 + vaddi.wu $vr3, $vr3, 16 + vsrai.w $vr3, $vr3, 5 vsrai.w $vr4, $vr4, 5 - vsrai.w $vr5, $vr5, 5 - vmaxi.w $vr5, $vr5, 0 vmaxi.w $vr4, $vr4, 0 + vmaxi.w $vr3, $vr3, 0 + vmin.w $vr3, $vr3, $vr0 vmin.w $vr4, $vr4, $vr0 - vmin.w $vr5, $vr5, $vr0 - vpickev.h $vr4, $vr5, $vr4 - vst $vr4, $s8, 0 + vpickev.h $vr3, $vr4, $vr3 + vst $vr3, $s8, 0 addi.d $ra, $ra, -8 addi.d $s8, $s8, 16 addi.d $s7, $s7, 16 @@ -2370,9 +2391,8 @@ getVerSubImageSixTap: # @getVerSubImageSixTap bstrpick.d $a0, $a4, 30, 3 slli.d $a0, $a0, 3 vreplgr2vr.w $vr0, $a1 - vrepli.b $vr1, 0 - vrepli.w $vr2, 20 - vrepli.w $vr3, -5 + vrepli.w $vr1, 20 + vrepli.w $vr2, -5 move $t0, $a6 move $t1, $a5 move $t4, $a7 @@ -2384,46 +2404,52 @@ getVerSubImageSixTap: # @getVerSubImageSixTap .p2align 4, , 16 .LBB2_133: # %vector.body592 # =>This Inner Loop Header: Depth=1 - vld $vr4, $t0, 0 - vld $vr5, $t1, 0 - vilvh.h $vr6, $vr1, $vr4 - vilvl.h $vr4, $vr1, $vr4 - vilvh.h $vr7, $vr1, $vr5 - vilvl.h $vr5, $vr1, $vr5 - vadd.w $vr4, $vr5, $vr4 - vadd.w $vr5, $vr7, $vr6 - vld $vr6, $t4, 0 - vmul.w $vr5, $vr5, $vr2 - vld $vr7, $t5, 0 - vmul.w $vr4, $vr4, $vr2 - vilvl.h $vr8, $vr1, $vr6 - vilvh.h $vr6, $vr1, $vr6 - vilvl.h $vr9, $vr1, $vr7 - vilvh.h $vr7, $vr1, $vr7 - vld $vr10, $t6, 0 - vadd.w $vr6, $vr7, $vr6 + vld $vr3, $t0, 0 + vld $vr4, $t1, 0 + vbsrl.v $vr5, $vr3, 8 + vsllwil.wu.hu $vr5, $vr5, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vbsrl.v $vr6, $vr4, 8 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.wu.hu $vr4, $vr4, 0 + vadd.w $vr3, $vr4, $vr3 + vadd.w $vr4, $vr6, $vr5 + vld $vr5, $t4, 0 + vmul.w $vr4, $vr4, $vr1 + vmul.w $vr3, $vr3, $vr1 + vld $vr6, $t5, 0 + vsllwil.wu.hu $vr7, $vr5, 0 + vbsrl.v $vr5, $vr5, 8 + vsllwil.wu.hu $vr5, $vr5, 0 + vsllwil.wu.hu $vr8, $vr6, 0 + vbsrl.v $vr6, $vr6, 8 + vsllwil.wu.hu $vr6, $vr6, 0 + vld $vr9, $t6, 0 + vadd.w $vr5, $vr6, $vr5 + vadd.w $vr6, $vr8, $vr7 vld $vr7, $t7, 0 - vadd.w $vr8, $vr9, $vr8 - vilvl.h $vr9, $vr1, $vr10 - vilvh.h $vr10, $vr1, $vr10 - vilvh.h $vr11, $vr1, $vr7 - vilvl.h $vr7, $vr1, $vr7 - vmadd.w $vr4, $vr8, $vr3 - vmadd.w $vr5, $vr6, $vr3 - vadd.w $vr5, $vr5, $vr10 + vsllwil.wu.hu $vr8, $vr9, 0 + vbsrl.v $vr9, $vr9, 8 + vsllwil.wu.hu $vr9, $vr9, 0 + vbsrl.v $vr10, $vr7, 8 + vsllwil.wu.hu $vr10, $vr10, 0 + vsllwil.wu.hu $vr7, $vr7, 0 + vmadd.w $vr3, $vr6, $vr2 + vmadd.w $vr4, $vr5, $vr2 vadd.w $vr4, $vr4, $vr9 - vadd.w $vr4, $vr4, $vr7 - vadd.w $vr5, $vr5, $vr11 - vaddi.wu $vr5, $vr5, 16 + vadd.w $vr3, $vr3, $vr8 + vadd.w $vr3, $vr3, $vr7 + vadd.w $vr4, $vr4, $vr10 vaddi.wu $vr4, $vr4, 16 + vaddi.wu $vr3, $vr3, 16 + vsrai.w $vr3, $vr3, 5 vsrai.w $vr4, $vr4, 5 - vsrai.w $vr5, $vr5, 5 - vmaxi.w $vr5, $vr5, 0 vmaxi.w $vr4, $vr4, 0 + vmaxi.w $vr3, $vr3, 0 + vmin.w $vr3, $vr3, $vr0 vmin.w $vr4, $vr4, $vr0 - vmin.w $vr5, $vr5, $vr0 - vpickev.h $vr4, $vr5, $vr4 - vst $vr4, $t8, 0 + vpickev.h $vr3, $vr4, $vr3 + vst $vr3, $t8, 0 addi.d $fp, $fp, -8 addi.d $t8, $t8, 16 addi.d $t7, 
$t7, 16 @@ -2508,7 +2534,6 @@ getHorSubImageBiLinear: # @getHorSubImageBiLinear st.d $t5, $sp, 8 # 8-byte Folded Spill ori $s0, $zero, 8 ori $s1, $zero, 7 - vrepli.b $vr0, 0 ori $s2, $zero, 16 b .LBB3_5 .p2align 4, , 16 @@ -2596,13 +2621,13 @@ getHorSubImageBiLinear: # @getHorSubImageBiLinear .LBB3_15: # %vector.body147 # Parent Loop BB3_5 Depth=1 # => This Inner Loop Header: Depth=2 - vld $vr1, $s7, 0 - vld $vr2, $s6, 0 - vor.v $vr3, $vr1, $vr2 - vxor.v $vr1, $vr1, $vr2 - vsrli.h $vr1, $vr1, 1 - vsub.h $vr1, $vr3, $vr1 - vst $vr1, $s8, 0 + vld $vr0, $s7, 0 + vld $vr1, $s6, 0 + vor.v $vr2, $vr0, $vr1 + vxor.v $vr0, $vr0, $vr1 + vsrli.h $vr0, $vr0, 1 + vsub.h $vr0, $vr2, $vr0 + vst $vr0, $s8, 0 addi.d $ra, $ra, -8 addi.d $s8, $s8, 16 addi.d $s6, $s6, 16 @@ -2637,29 +2662,30 @@ getHorSubImageBiLinear: # @getHorSubImageBiLinear # %bb.19: # %vector.ph125 # in Loop: Header=BB3_5 Depth=1 ld.h $t5, $s6, 0 - vreplgr2vr.h $vr1, $t5 - vinsgr2vr.h $vr2, $t5, 0 - vinsgr2vr.h $vr2, $t5, 1 - vinsgr2vr.h $vr2, $t5, 2 - vinsgr2vr.h $vr2, $t5, 3 - vilvl.h $vr2, $vr0, $vr2 - vilvl.h $vr1, $vr0, $vr1 - vaddi.wu $vr1, $vr1, 1 - vaddi.wu $vr2, $vr2, 1 + vreplgr2vr.h $vr0, $t5 + vinsgr2vr.h $vr1, $t5, 0 + vinsgr2vr.h $vr1, $t5, 1 + vinsgr2vr.h $vr1, $t5, 2 + vinsgr2vr.h $vr1, $t5, 3 + vsllwil.wu.hu $vr1, $vr1, 0 + vsllwil.wu.hu $vr2, $vr0, 0 + vaddi.wu $vr0, $vr1, 1 + vaddi.wu $vr1, $vr2, 1 move $s5, $t8 .p2align 4, , 16 .LBB3_20: # %vector.body128 # Parent Loop BB3_5 Depth=1 # => This Inner Loop Header: Depth=2 - vld $vr3, $s8, 0 - vilvl.h $vr4, $vr0, $vr3 - vilvh.h $vr3, $vr0, $vr3 - vadd.w $vr3, $vr3, $vr2 - vadd.w $vr4, $vr4, $vr1 - vsrli.w $vr4, $vr4, 1 + vld $vr2, $s8, 0 + vbsrl.v $vr3, $vr2, 8 + vsllwil.wu.hu $vr3, $vr3, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vadd.w $vr2, $vr2, $vr1 + vsrli.w $vr2, $vr2, 1 + vadd.w $vr3, $vr3, $vr0 vsrli.w $vr3, $vr3, 1 - vpickev.h $vr3, $vr3, $vr4 - vst $vr3, $s7, 0 + vpickev.h $vr2, $vr3, $vr2 + vst $vr2, $s7, 0 addi.d $s5, $s5, -8 addi.d $s7, $s7, 16 addi.d $s8, $s8, 16 @@ -2690,7 +2716,6 @@ getHorSubImageBiLinear: # @getHorSubImageBiLinear slli.d $t5, $t6, 3 alsl.d $t6, $t6, $a3, 3 ori $t7, $zero, 7 - vrepli.b $vr0, 0 b .LBB3_25 .p2align 4, , 16 .LBB3_24: # %._crit_edge.us52 @@ -2752,29 +2777,30 @@ getHorSubImageBiLinear: # @getHorSubImageBiLinear # %bb.30: # %vector.ph # in Loop: Header=BB3_25 Depth=1 ld.h $s3, $t8, 0 - vreplgr2vr.h $vr1, $s3 - vinsgr2vr.h $vr2, $s3, 0 - vinsgr2vr.h $vr2, $s3, 1 - vinsgr2vr.h $vr2, $s3, 2 - vinsgr2vr.h $vr2, $s3, 3 - vilvl.h $vr2, $vr0, $vr2 - vilvl.h $vr1, $vr0, $vr1 - vaddi.wu $vr1, $vr1, 1 - vaddi.wu $vr2, $vr2, 1 + vreplgr2vr.h $vr0, $s3 + vinsgr2vr.h $vr1, $s3, 0 + vinsgr2vr.h $vr1, $s3, 1 + vinsgr2vr.h $vr1, $s3, 2 + vinsgr2vr.h $vr1, $s3, 3 + vsllwil.wu.hu $vr1, $vr1, 0 + vsllwil.wu.hu $vr2, $vr0, 0 + vaddi.wu $vr0, $vr1, 1 + vaddi.wu $vr1, $vr2, 1 move $s3, $t5 .p2align 4, , 16 .LBB3_31: # %vector.body # Parent Loop BB3_25 Depth=1 # => This Inner Loop Header: Depth=2 - vld $vr3, $s2, 0 - vilvl.h $vr4, $vr0, $vr3 - vilvh.h $vr3, $vr0, $vr3 - vadd.w $vr3, $vr3, $vr2 - vadd.w $vr4, $vr4, $vr1 - vsrli.w $vr4, $vr4, 1 + vld $vr2, $s2, 0 + vbsrl.v $vr3, $vr2, 8 + vsllwil.wu.hu $vr3, $vr3, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vadd.w $vr2, $vr2, $vr1 + vsrli.w $vr2, $vr2, 1 + vadd.w $vr3, $vr3, $vr0 vsrli.w $vr3, $vr3, 1 - vpickev.h $vr3, $vr3, $vr4 - vst $vr3, $s1, 0 + vpickev.h $vr2, $vr3, $vr2 + vst $vr2, $s1, 0 addi.d $s3, $s3, -8 addi.d $s1, $s1, 16 addi.d $s2, $s2, 16 diff --git 
a/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/intrarefresh.s b/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/intrarefresh.s index 49755cbc..aa3d0914 100644 --- a/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/intrarefresh.s +++ b/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/intrarefresh.s @@ -166,12 +166,9 @@ RandomIntraNewPicture: # @RandomIntraNewPicture .LBB2_4: # %vector.body # =>This Inner Loop Header: Depth=1 vmod.w $vr2, $vr1, $vr0 - vshuf4i.w $vr3, $vr2, 50 - vslli.d $vr3, $vr3, 32 - vsrai.d $vr3, $vr3, 32 - vshuf4i.w $vr2, $vr2, 16 - vslli.d $vr2, $vr2, 32 - vsrai.d $vr2, $vr2, 32 + vshuf4i.w $vr3, $vr2, 14 + vsllwil.d.w $vr3, $vr3, 0 + vsllwil.d.w $vr2, $vr2, 0 vpickve2gr.d $t0, $vr2, 0 slli.d $t0, $t0, 2 vpickve2gr.d $t1, $vr2, 1 diff --git a/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/macroblock.s b/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/macroblock.s index 0d1d66d5..ac263459 100644 --- a/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/macroblock.s +++ b/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/macroblock.s @@ -3754,24 +3754,24 @@ LumaPrediction4x4Bi: # @LumaPrediction4x4Bi .type LumaResidualCoding8x8,@function LumaResidualCoding8x8: # @LumaResidualCoding8x8 # %bb.0: - addi.d $sp, $sp, -304 - st.d $ra, $sp, 296 # 8-byte Folded Spill - st.d $fp, $sp, 288 # 8-byte Folded Spill - st.d $s0, $sp, 280 # 8-byte Folded Spill - st.d $s1, $sp, 272 # 8-byte Folded Spill - st.d $s2, $sp, 264 # 8-byte Folded Spill - st.d $s3, $sp, 256 # 8-byte Folded Spill - st.d $s4, $sp, 248 # 8-byte Folded Spill - st.d $s5, $sp, 240 # 8-byte Folded Spill - st.d $s6, $sp, 232 # 8-byte Folded Spill - st.d $s7, $sp, 224 # 8-byte Folded Spill - st.d $s8, $sp, 216 # 8-byte Folded Spill + addi.d $sp, $sp, -288 + st.d $ra, $sp, 280 # 8-byte Folded Spill + st.d $fp, $sp, 272 # 8-byte Folded Spill + st.d $s0, $sp, 264 # 8-byte Folded Spill + st.d $s1, $sp, 256 # 8-byte Folded Spill + st.d $s2, $sp, 248 # 8-byte Folded Spill + st.d $s3, $sp, 240 # 8-byte Folded Spill + st.d $s4, $sp, 232 # 8-byte Folded Spill + st.d $s5, $sp, 224 # 8-byte Folded Spill + st.d $s6, $sp, 216 # 8-byte Folded Spill + st.d $s7, $sp, 208 # 8-byte Folded Spill + st.d $s8, $sp, 200 # 8-byte Folded Spill move $s0, $a7 move $s8, $a5 move $t1, $a2 st.d $a1, $sp, 72 # 8-byte Folded Spill st.d $a0, $sp, 64 # 8-byte Folded Spill - st.w $zero, $sp, 212 + st.w $zero, $sp, 196 slli.d $a5, $a2, 2 bstrpick.d $a0, $a5, 31, 3 slli.w $t0, $a0, 3 @@ -3826,13 +3826,11 @@ LumaResidualCoding8x8: # @LumaResidualCoding8x8 slli.d $a0, $t0, 5 lu12i.w $a2, 3 pcalau12i $a6, %pc_hi20(imgY_org) - vrepli.b $vr2, 0 pcalau12i $a3, %pc_hi20(si_frame_indicator) st.d $a3, $sp, 48 # 8-byte Folded Spill pcalau12i $a3, %pc_hi20(sp2_frame_indicator) st.d $a3, $sp, 40 # 8-byte Folded Spill st.d $a4, $sp, 152 # 8-byte Folded Spill - vst $vr2, $sp, 192 # 16-byte Folded Spill st.d $t0, $sp, 32 # 8-byte Folded Spill st.d $a0, $sp, 8 # 8-byte Folded Spill beqz $a1, .LBB8_19 @@ -3884,12 +3882,10 @@ LumaResidualCoding8x8: # @LumaResidualCoding8x8 ld.d $a5, $a0, 8 ld.d $a6, $a4, -104 vinsgr2vr.d $vr0, $a1, 0 - vld $vr1, $sp, 192 # 16-byte Folded Reload - vilvl.h $vr0, $vr1, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 ldx.d $a1, $a5, $a3 vinsgr2vr.d $vr1, $a6, 0 - vld $vr2, $sp, 192 # 16-byte Folded Reload - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 vinsgr2vr.d $vr1, $a1, 0 ld.d $a1, $a4, -72 @@ 
-3898,31 +3894,25 @@ LumaResidualCoding8x8: # @LumaResidualCoding8x8 vst $vr0, $a2, -208 vinsgr2vr.d $vr0, $a1, 0 ldx.d $a1, $a5, $a3 - vld $vr2, $sp, 192 # 16-byte Folded Reload - vilvl.h $vr1, $vr2, $vr1 - vld $vr2, $sp, 192 # 16-byte Folded Reload - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr1, $vr1, 0 + vsllwil.wu.hu $vr0, $vr0, 0 vsub.w $vr0, $vr1, $vr0 vinsgr2vr.d $vr1, $a1, 0 ld.d $a1, $a4, -40 vst $vr0, $a2, -144 ld.d $a0, $a0, 24 - vld $vr0, $sp, 192 # 16-byte Folded Reload - vilvl.h $vr0, $vr0, $vr1 + vsllwil.wu.hu $vr0, $vr1, 0 vinsgr2vr.d $vr1, $a1, 0 - vld $vr2, $sp, 192 # 16-byte Folded Reload - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 ldx.d $a0, $a0, $a3 vsub.w $vr0, $vr0, $vr1 ld.d $a1, $a4, -8 vst $vr0, $a2, -80 vinsgr2vr.d $vr0, $a0, 0 - vld $vr1, $sp, 192 # 16-byte Folded Reload - vilvl.h $vr0, $vr1, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a1, 0 ld.d $a0, $s2, 0 - vld $vr2, $sp, 192 # 16-byte Folded Reload - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 vst $vr0, $a2, -16 ld.w $a1, $a0, 192 @@ -3936,7 +3926,6 @@ LumaResidualCoding8x8: # @LumaResidualCoding8x8 move $a6, $fp pcaddu18i $ra, %call36(LumaPrediction4x4) jirl $ra, $ra, 0 - vld $vr2, $sp, 192 # 16-byte Folded Reload ld.d $a0, $sp, 160 # 8-byte Folded Reload ld.d $a1, $a0, %pc_lo12(imgY_org) ld.d $a0, $sp, 128 # 8-byte Folded Reload @@ -3951,9 +3940,9 @@ LumaResidualCoding8x8: # @LumaResidualCoding8x8 ld.d $a6, $a4, -96 vinsgr2vr.d $vr0, $a2, 0 ld.d $a2, $a1, 8 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a6, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 ldx.d $a2, $a2, $a3 vsub.w $vr0, $vr0, $vr1 ld.d $a6, $a4, -64 @@ -3961,26 +3950,26 @@ LumaResidualCoding8x8: # @LumaResidualCoding8x8 vinsgr2vr.d $vr0, $a2, 0 ld.d $a2, $a1, 16 vinsgr2vr.d $vr1, $a6, 0 - vilvl.h $vr0, $vr2, $vr0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.wu.hu $vr1, $vr1, 0 ldx.d $a2, $a2, $a3 vsub.w $vr0, $vr0, $vr1 vst $vr0, $a5, -128 ld.d $a4, $a4, -32 vinsgr2vr.d $vr0, $a2, 0 ld.d $a1, $a1, 24 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a4, 0 ld.d $a4, $sp, 152 # 8-byte Folded Reload - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 ldx.d $a1, $a1, $a3 vsub.w $vr0, $vr0, $vr1 ldx.d $a2, $a0, $s3 vst $vr0, $a5, -64 vinsgr2vr.d $vr0, $a1, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a2, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 vstx $vr0, $a0, $s4 ld.d $a1, $sp, 136 # 8-byte Folded Reload @@ -4004,7 +3993,7 @@ LumaResidualCoding8x8: # @LumaResidualCoding8x8 ori $a1, $zero, 3 beq $a0, $a1, .LBB8_9 # %bb.7: - addi.d $a1, $sp, 212 + addi.d $a1, $sp, 196 ld.d $a0, $sp, 80 # 8-byte Folded Reload move $a2, $zero pcaddu18i $ra, %call36(dct_luma8x8) @@ -4037,7 +4026,7 @@ LumaResidualCoding8x8: # @LumaResidualCoding8x8 andi $a1, $a1, 1 bnez $a1, .LBB8_40 # %bb.10: # %.critedge - ld.w $a1, $sp, 212 + ld.w $a1, $sp, 196 ori $a2, $zero, 4 blt $a2, $a1, .LBB8_40 # %bb.11: @@ -4064,7 +4053,7 @@ LumaResidualCoding8x8: # @LumaResidualCoding8x8 sub.d $a1, $a1, $a4 ld.d $a4, $sp, 16 # 8-byte Folded Reload slli.d $fp, $a4, 3 - st.w $zero, $sp, 212 + st.w $zero, $sp, 196 and $a1, $a2, $a1 st.w $a1, $a3, 0 slli.d $a1, $a4, 1 @@ -4293,10 +4282,9 @@ LumaResidualCoding8x8: # @LumaResidualCoding8x8 ld.d $a5, $a4, -104 vinsgr2vr.d $vr0, $a2, 0 ld.d $a2, $a1, 8 - vld $vr2, $sp, 192 # 16-byte Folded Reload - vilvl.h $vr0, $vr2, $vr0 
+ vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a5, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 ldx.d $a2, $a2, $a3 vsub.w $vr0, $vr0, $vr1 ld.d $a5, $a4, -72 @@ -4306,27 +4294,27 @@ LumaResidualCoding8x8: # @LumaResidualCoding8x8 vstx $vr0, $a0, $a2 vinsgr2vr.d $vr0, $a5, 0 ldx.d $a2, $a6, $a3 - vilvl.h $vr1, $vr2, $vr1 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr1, $vr1, 0 + vsllwil.wu.hu $vr0, $vr0, 0 vsub.w $vr0, $vr1, $vr0 vinsgr2vr.d $vr1, $a2, 0 ld.d $a2, $a4, -40 ld.d $a5, $sp, 128 # 8-byte Folded Reload vstx $vr0, $a0, $a5 ld.d $a1, $a1, 24 - vilvl.h $vr0, $vr2, $vr1 + vsllwil.wu.hu $vr0, $vr1, 0 vinsgr2vr.d $vr1, $a2, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 ldx.d $a1, $a1, $a3 vsub.w $vr0, $vr0, $vr1 ld.d $a2, $a4, -8 ld.d $a3, $sp, 104 # 8-byte Folded Reload vstx $vr0, $a0, $a3 vinsgr2vr.d $vr0, $a1, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a2, 0 ld.d $a1, $s2, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 ld.d $a2, $sp, 96 # 8-byte Folded Reload vstx $vr0, $a0, $a2 @@ -4359,7 +4347,7 @@ LumaResidualCoding8x8: # @LumaResidualCoding8x8 ld.w $a1, $a1, %pc_lo12(sp2_frame_indicator) or $a1, $a0, $a1 ld.d $a0, $sp, 152 # 8-byte Folded Reload - addi.d $a2, $sp, 212 + addi.d $a2, $sp, 196 beqz $a1, .LBB8_36 # %bb.27: # in Loop: Header=BB8_22 Depth=1 move $a1, $s1 @@ -4369,7 +4357,7 @@ LumaResidualCoding8x8: # @LumaResidualCoding8x8 b .LBB8_30 .p2align 4, , 16 .LBB8_28: # in Loop: Header=BB8_22 Depth=1 - addi.d $a2, $sp, 212 + addi.d $a2, $sp, 196 ld.d $a0, $sp, 152 # 8-byte Folded Reload move $a1, $s1 move $a3, $zero @@ -4413,10 +4401,9 @@ LumaResidualCoding8x8: # @LumaResidualCoding8x8 ld.d $a5, $a4, -96 vinsgr2vr.d $vr0, $a2, 0 ld.d $a2, $a1, 8 - vld $vr2, $sp, 192 # 16-byte Folded Reload - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a5, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 ldx.d $a2, $a2, $a3 vsub.w $vr0, $vr0, $vr1 ld.d $a5, $a4, -64 @@ -4425,8 +4412,8 @@ LumaResidualCoding8x8: # @LumaResidualCoding8x8 vinsgr2vr.d $vr0, $a2, 0 ld.d $a2, $a1, 16 vinsgr2vr.d $vr1, $a5, 0 - vilvl.h $vr0, $vr2, $vr0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.wu.hu $vr1, $vr1, 0 ldx.d $a2, $a2, $a3 vsub.w $vr0, $vr0, $vr1 ld.d $a5, $sp, 128 # 8-byte Folded Reload @@ -4434,19 +4421,19 @@ LumaResidualCoding8x8: # @LumaResidualCoding8x8 ld.d $a4, $a4, -32 vinsgr2vr.d $vr0, $a2, 0 ld.d $a1, $a1, 24 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a4, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 ldx.d $a1, $a1, $a3 vsub.w $vr0, $vr0, $vr1 ldx.d $a2, $a0, $s4 ld.d $a3, $sp, 104 # 8-byte Folded Reload vstx $vr0, $a0, $a3 vinsgr2vr.d $vr0, $a1, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a2, 0 ld.d $a1, $s2, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 ld.d $a2, $sp, 96 # 8-byte Folded Reload vstx $vr0, $a0, $a2 @@ -4469,7 +4456,7 @@ LumaResidualCoding8x8: # @LumaResidualCoding8x8 ld.w $a1, $a1, %pc_lo12(sp2_frame_indicator) or $a1, $a0, $a1 ld.d $a0, $sp, 144 # 8-byte Folded Reload - addi.d $a2, $sp, 212 + addi.d $a2, $sp, 196 beqz $a1, .LBB8_37 # %bb.33: # in Loop: Header=BB8_22 Depth=1 move $a1, $s1 @@ -4480,7 +4467,7 @@ LumaResidualCoding8x8: # @LumaResidualCoding8x8 b .LBB8_21 .p2align 4, , 16 .LBB8_34: # in Loop: Header=BB8_22 Depth=1 - addi.d $a2, $sp, 212 + addi.d $a2, $sp, 196 ld.d $a0, $sp, 144 # 8-byte 
Folded Reload move $a1, $s1 move $a3, $zero @@ -4525,19 +4512,19 @@ LumaResidualCoding8x8: # @LumaResidualCoding8x8 ldptr.w $a1, $a0, 15540 beqz $a1, .LBB8_12 .LBB8_40: # %.loopexit - ld.w $a0, $sp, 212 - ld.d $s8, $sp, 216 # 8-byte Folded Reload - ld.d $s7, $sp, 224 # 8-byte Folded Reload - ld.d $s6, $sp, 232 # 8-byte Folded Reload - ld.d $s5, $sp, 240 # 8-byte Folded Reload - ld.d $s4, $sp, 248 # 8-byte Folded Reload - ld.d $s3, $sp, 256 # 8-byte Folded Reload - ld.d $s2, $sp, 264 # 8-byte Folded Reload - ld.d $s1, $sp, 272 # 8-byte Folded Reload - ld.d $s0, $sp, 280 # 8-byte Folded Reload - ld.d $fp, $sp, 288 # 8-byte Folded Reload - ld.d $ra, $sp, 296 # 8-byte Folded Reload - addi.d $sp, $sp, 304 + ld.w $a0, $sp, 196 + ld.d $s8, $sp, 200 # 8-byte Folded Reload + ld.d $s7, $sp, 208 # 8-byte Folded Reload + ld.d $s6, $sp, 216 # 8-byte Folded Reload + ld.d $s5, $sp, 224 # 8-byte Folded Reload + ld.d $s4, $sp, 232 # 8-byte Folded Reload + ld.d $s3, $sp, 240 # 8-byte Folded Reload + ld.d $s2, $sp, 248 # 8-byte Folded Reload + ld.d $s1, $sp, 256 # 8-byte Folded Reload + ld.d $s0, $sp, 264 # 8-byte Folded Reload + ld.d $fp, $sp, 272 # 8-byte Folded Reload + ld.d $ra, $sp, 280 # 8-byte Folded Reload + addi.d $sp, $sp, 288 ret .Lfunc_end8: .size LumaResidualCoding8x8, .Lfunc_end8-LumaResidualCoding8x8 @@ -5615,18 +5602,18 @@ LumaResidualCoding: # @LumaResidualCoding .type TransformDecision,@function TransformDecision: # @TransformDecision # %bb.0: - addi.d $sp, $sp, -288 - st.d $ra, $sp, 280 # 8-byte Folded Spill - st.d $fp, $sp, 272 # 8-byte Folded Spill - st.d $s0, $sp, 264 # 8-byte Folded Spill - st.d $s1, $sp, 256 # 8-byte Folded Spill - st.d $s2, $sp, 248 # 8-byte Folded Spill - st.d $s3, $sp, 240 # 8-byte Folded Spill - st.d $s4, $sp, 232 # 8-byte Folded Spill - st.d $s5, $sp, 224 # 8-byte Folded Spill - st.d $s6, $sp, 216 # 8-byte Folded Spill - st.d $s7, $sp, 208 # 8-byte Folded Spill - st.d $s8, $sp, 200 # 8-byte Folded Spill + addi.d $sp, $sp, -272 + st.d $ra, $sp, 264 # 8-byte Folded Spill + st.d $fp, $sp, 256 # 8-byte Folded Spill + st.d $s0, $sp, 248 # 8-byte Folded Spill + st.d $s1, $sp, 240 # 8-byte Folded Spill + st.d $s2, $sp, 232 # 8-byte Folded Spill + st.d $s3, $sp, 224 # 8-byte Folded Spill + st.d $s4, $sp, 216 # 8-byte Folded Spill + st.d $s5, $sp, 208 # 8-byte Folded Spill + st.d $s6, $sp, 200 # 8-byte Folded Spill + st.d $s7, $sp, 192 # 8-byte Folded Spill + st.d $s8, $sp, 184 # 8-byte Folded Spill bstrpick.d $a2, $a0, 31, 0 addi.d $a2, $a2, 1 srli.d $a3, $a2, 32 @@ -5652,8 +5639,6 @@ TransformDecision: # @TransformDecision addi.d $s0, $a0, %pc_lo12(diff64) move $s7, $zero move $a4, $zero - vrepli.b $vr0, 0 - vst $vr0, $sp, 144 # 16-byte Folded Spill st.d $s4, $sp, 136 # 8-byte Folded Spill .p2align 4, , 16 .LBB13_2: # %.lr.ph @@ -5670,27 +5655,27 @@ TransformDecision: # @TransformDecision add.d $s2, $a0, $a1 andi $a0, $s3, 8 st.d $a0, $sp, 96 # 8-byte Folded Spill - addi.d $a1, $sp, 190 - addi.d $a2, $sp, 196 - addi.d $a3, $sp, 192 - addi.d $a4, $sp, 188 - addi.d $a5, $sp, 186 + addi.d $a1, $sp, 174 + addi.d $a2, $sp, 180 + addi.d $a3, $sp, 176 + addi.d $a4, $sp, 172 + addi.d $a5, $sp, 170 move $a0, $fp pcaddu18i $ra, %call36(SetModesAndRefframe) jirl $ra, $ra, 0 move $s5, $zero st.d $fp, $sp, 56 # 8-byte Folded Spill slli.w $a0, $fp, 2 - ld.h $a1, $sp, 190 + ld.h $a1, $sp, 174 st.d $a1, $sp, 88 # 8-byte Folded Spill - ld.w $a1, $sp, 196 + ld.w $a1, $sp, 180 st.d $a1, $sp, 80 # 8-byte Folded Spill - ld.w $a1, $sp, 192 - st.d $a1, $sp, 176 # 8-byte Folded Spill - ld.h 
$a1, $sp, 188 - st.d $a1, $sp, 168 # 8-byte Folded Spill - ld.h $a1, $sp, 186 + ld.w $a1, $sp, 176 st.d $a1, $sp, 160 # 8-byte Folded Spill + ld.h $a1, $sp, 172 + st.d $a1, $sp, 152 # 8-byte Folded Spill + ld.h $a1, $sp, 170 + st.d $a1, $sp, 144 # 8-byte Folded Spill ori $a0, $a0, 4 st.d $a0, $sp, 72 # 8-byte Folded Spill st.d $s3, $sp, 40 # 8-byte Folded Spill @@ -5716,9 +5701,9 @@ TransformDecision: # @TransformDecision move $a2, $s4 ld.d $s7, $sp, 80 # 8-byte Folded Reload move $a3, $s7 - ld.d $a4, $sp, 176 # 8-byte Folded Reload - ld.d $a5, $sp, 168 # 8-byte Folded Reload - ld.d $a6, $sp, 160 # 8-byte Folded Reload + ld.d $a4, $sp, 160 # 8-byte Folded Reload + ld.d $a5, $sp, 152 # 8-byte Folded Reload + ld.d $a6, $sp, 144 # 8-byte Folded Reload pcaddu18i $ra, %call36(LumaPrediction4x4) jirl $ra, $ra, 0 ld.d $a0, $sp, 136 # 8-byte Folded Reload @@ -5738,9 +5723,8 @@ TransformDecision: # @TransformDecision vinsgr2vr.d $vr1, $a4, 0 ld.d $a4, $a3, 8 add.d $a0, $s0, $a5 - vld $vr2, $sp, 144 # 16-byte Folded Reload - vilvl.h $vr0, $vr2, $vr0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.wu.hu $vr1, $vr1, 0 ldx.d $a4, $a4, $a2 vsub.w $vr0, $vr0, $vr1 vstx $vr0, $s0, $a5 @@ -5752,8 +5736,8 @@ TransformDecision: # @TransformDecision srai.d $a5, $a5, 30 vinsgr2vr.d $vr1, $a4, 0 ld.d $a4, $a3, 16 - vilvl.h $vr0, $vr2, $vr0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 ldx.d $a4, $a4, $a2 vstx $vr0, $s0, $a5 @@ -5763,9 +5747,9 @@ TransformDecision: # @TransformDecision ld.d $a4, $a1, -40 add.d $a5, $s5, $a5 srai.d $a5, $a5, 30 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a4, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 ld.d $a3, $a3, 24 vstx $vr0, $s0, $a5 @@ -5776,9 +5760,9 @@ TransformDecision: # @TransformDecision ld.d $a1, $a1, -8 srai.d $a3, $a3, 30 vinsgr2vr.d $vr0, $a2, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a1, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 vstx $vr0, $s0, $a3 pcaddu18i $ra, %call36(distortion4x4) @@ -5793,9 +5777,9 @@ TransformDecision: # @TransformDecision ld.d $s3, $sp, 136 # 8-byte Folded Reload move $a2, $s4 move $a3, $s7 - ld.d $a4, $sp, 176 # 8-byte Folded Reload - ld.d $a5, $sp, 168 # 8-byte Folded Reload - ld.d $a6, $sp, 160 # 8-byte Folded Reload + ld.d $a4, $sp, 160 # 8-byte Folded Reload + ld.d $a5, $sp, 152 # 8-byte Folded Reload + ld.d $a6, $sp, 144 # 8-byte Folded Reload pcaddu18i $ra, %call36(LumaPrediction4x4) jirl $ra, $ra, 0 ld.d $a0, $s3, %pc_lo12(imgY_org) @@ -5815,9 +5799,8 @@ TransformDecision: # @TransformDecision add.d $a0, $s0, $a4 vinsgr2vr.d $vr1, $a6, 0 ld.d $a6, $a3, 8 - vld $vr2, $sp, 144 # 16-byte Folded Reload - vilvl.h $vr0, $vr2, $vr0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 ldx.d $a6, $a6, $a2 vstx $vr0, $s0, $a4 @@ -5827,9 +5810,9 @@ TransformDecision: # @TransformDecision ld.d $a6, $a5, -64 add.d $a4, $s5, $a4 srai.d $a4, $a4, 30 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a6, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 ld.d $a6, $a3, 16 vstx $vr0, $s0, $a4 @@ -5840,9 +5823,9 @@ TransformDecision: # @TransformDecision ld.d $a5, $a5, -32 srai.d $a4, $a4, 30 vinsgr2vr.d $vr0, $a6, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a5, 0 - vilvl.h 
$vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 vstx $vr0, $s0, $a4 ld.d $a3, $a3, 24 @@ -5854,9 +5837,9 @@ TransformDecision: # @TransformDecision ldx.d $a1, $a1, $s2 srai.d $a4, $a4, 30 vinsgr2vr.d $vr0, $a2, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a1, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 vstx $vr0, $s0, $a4 addi.d $s5, $a3, 4 @@ -5908,18 +5891,18 @@ TransformDecision: # @TransformDecision add.d $a1, $a2, $a1 st.w $a1, $a3, 0 .LBB13_9: - ld.d $s8, $sp, 200 # 8-byte Folded Reload - ld.d $s7, $sp, 208 # 8-byte Folded Reload - ld.d $s6, $sp, 216 # 8-byte Folded Reload - ld.d $s5, $sp, 224 # 8-byte Folded Reload - ld.d $s4, $sp, 232 # 8-byte Folded Reload - ld.d $s3, $sp, 240 # 8-byte Folded Reload - ld.d $s2, $sp, 248 # 8-byte Folded Reload - ld.d $s1, $sp, 256 # 8-byte Folded Reload - ld.d $s0, $sp, 264 # 8-byte Folded Reload - ld.d $fp, $sp, 272 # 8-byte Folded Reload - ld.d $ra, $sp, 280 # 8-byte Folded Reload - addi.d $sp, $sp, 288 + ld.d $s8, $sp, 184 # 8-byte Folded Reload + ld.d $s7, $sp, 192 # 8-byte Folded Reload + ld.d $s6, $sp, 200 # 8-byte Folded Reload + ld.d $s5, $sp, 208 # 8-byte Folded Reload + ld.d $s4, $sp, 216 # 8-byte Folded Reload + ld.d $s3, $sp, 224 # 8-byte Folded Reload + ld.d $s2, $sp, 232 # 8-byte Folded Reload + ld.d $s1, $sp, 240 # 8-byte Folded Reload + ld.d $s0, $sp, 248 # 8-byte Folded Reload + ld.d $fp, $sp, 256 # 8-byte Folded Reload + ld.d $ra, $sp, 264 # 8-byte Folded Reload + addi.d $sp, $sp, 272 ret .Lfunc_end13: .size TransformDecision, .Lfunc_end13-TransformDecision @@ -7491,20 +7474,11 @@ ChromaResidualCoding: # @ChromaResidualCoding .dword 0 # 0x0 .dword 1 # 0x1 .LCPI17_2: - .half 3 # 0x3 - .half 9 # 0x9 - .half 2 # 0x2 - .half 11 # 0xb - .half 1 # 0x1 - .half 13 # 0xd - .half 0 # 0x0 - .half 15 # 0xf -.LCPI17_3: .word 4 # 0x4 .word 5 # 0x5 .word 6 # 0x6 .word 7 # 0x7 -.LCPI17_4: +.LCPI17_3: .word 0 # 0x0 .word 1 # 0x1 .word 2 # 0x2 @@ -7515,21 +7489,21 @@ ChromaResidualCoding: # @ChromaResidualCoding .type IntraChromaPrediction,@function IntraChromaPrediction: # @IntraChromaPrediction # %bb.0: - addi.d $sp, $sp, -1040 - st.d $ra, $sp, 1032 # 8-byte Folded Spill - st.d $fp, $sp, 1024 # 8-byte Folded Spill - st.d $s0, $sp, 1016 # 8-byte Folded Spill - st.d $s1, $sp, 1008 # 8-byte Folded Spill - st.d $s2, $sp, 1000 # 8-byte Folded Spill - st.d $s3, $sp, 992 # 8-byte Folded Spill - st.d $s4, $sp, 984 # 8-byte Folded Spill - st.d $s5, $sp, 976 # 8-byte Folded Spill - st.d $s6, $sp, 968 # 8-byte Folded Spill - st.d $s7, $sp, 960 # 8-byte Folded Spill - st.d $s8, $sp, 952 # 8-byte Folded Spill + addi.d $sp, $sp, -1024 + st.d $ra, $sp, 1016 # 8-byte Folded Spill + st.d $fp, $sp, 1008 # 8-byte Folded Spill + st.d $s0, $sp, 1000 # 8-byte Folded Spill + st.d $s1, $sp, 992 # 8-byte Folded Spill + st.d $s2, $sp, 984 # 8-byte Folded Spill + st.d $s3, $sp, 976 # 8-byte Folded Spill + st.d $s4, $sp, 968 # 8-byte Folded Spill + st.d $s5, $sp, 960 # 8-byte Folded Spill + st.d $s6, $sp, 952 # 8-byte Folded Spill + st.d $s7, $sp, 944 # 8-byte Folded Spill + st.d $s8, $sp, 936 # 8-byte Folded Spill move $s8, $a2 move $s2, $a1 - st.d $a0, $sp, 448 # 8-byte Folded Spill + st.d $a0, $sp, 432 # 8-byte Folded Spill pcalau12i $a0, %got_pc_hi20(img) ld.d $s7, $a0, %got_pc_lo12(img) ld.d $a0, $s7, 0 @@ -7554,7 +7528,7 @@ IntraChromaPrediction: # @IntraChromaPrediction addi.d $a0, $a0, 1 bstrpick.d $s6, $a0, 31, 0 addi.w $s3, $zero, -1 - addi.d $s4, $sp, 456 + addi.d $s4, 
$sp, 440 move $s5, $s3 .p2align 4, , 16 .LBB17_2: # %.lr.ph @@ -7574,15 +7548,15 @@ IntraChromaPrediction: # @IntraChromaPrediction ld.d $a5, $s1, %pc_lo12(getNeighbour) addi.w $a2, $zero, -1 ori $a3, $zero, 1 - addi.d $a4, $sp, 864 + addi.d $a4, $sp, 848 move $a0, $fp move $a1, $zero jirl $ra, $a5, 0 - ld.w $s4, $sp, 864 - ld.w $a0, $sp, 456 + ld.w $s4, $sp, 848 + ld.w $a0, $sp, 440 pcalau12i $a1, %got_pc_hi20(input) ld.d $a1, $a1, %got_pc_lo12(input) - st.d $a1, $sp, 184 # 8-byte Folded Spill + st.d $a1, $sp, 200 # 8-byte Folded Spill ld.d $a1, $a1, 0 ld.w $a1, $a1, 272 st.d $fp, $sp, 40 # 8-byte Folded Spill @@ -7594,7 +7568,7 @@ IntraChromaPrediction: # @IntraChromaPrediction beqz $s4, .LBB17_7 # %bb.5: ld.d $a2, $s7, 0 - ld.w $a3, $sp, 868 + ld.w $a3, $sp, 852 ldptr.d $a2, $a2, 14240 slli.d $a3, $a3, 2 ldx.w $s4, $a2, $a3 @@ -7604,11 +7578,11 @@ IntraChromaPrediction: # @IntraChromaPrediction bgtz $a1, .LBB17_8 b .LBB17_12 .LBB17_6: - ld.w $s3, $sp, 480 + ld.w $s3, $sp, 464 move $s5, $s3 ld.d $s6, $sp, 408 # 8-byte Folded Reload ld.d $ra, $sp, 336 # 8-byte Folded Reload - ld.d $a1, $sp, 448 # 8-byte Folded Reload + ld.d $a1, $sp, 432 # 8-byte Folded Reload ld.d $a3, $sp, 416 # 8-byte Folded Reload bnez $a1, .LBB17_19 b .LBB17_20 @@ -7621,7 +7595,7 @@ IntraChromaPrediction: # @IntraChromaPrediction .LBB17_8: # %.lr.ph460 ld.d $a2, $s7, 0 ori $s5, $zero, 1 - addi.d $a3, $sp, 484 + addi.d $a3, $sp, 468 move $a4, $a1 b .LBB17_10 .p2align 4, , 16 @@ -7650,7 +7624,7 @@ IntraChromaPrediction: # @IntraChromaPrediction bstrpick.d $a3, $a1, 31, 0 slli.d $a4, $a3, 4 alsl.d $a3, $a3, $a4, 3 - addi.d $a4, $sp, 456 + addi.d $a4, $sp, 440 add.d $a3, $a3, $a4 addi.d $a3, $a3, 28 ori $s3, $zero, 1 @@ -7678,11 +7652,11 @@ IntraChromaPrediction: # @IntraChromaPrediction beqz $a0, .LBB17_197 # %bb.18: ld.d $a0, $s7, 0 - ld.w $a1, $sp, 460 + ld.w $a1, $sp, 444 ldptr.d $a0, $a0, 14240 slli.d $a1, $a1, 2 ldx.w $a0, $a0, $a1 - ld.d $a1, $sp, 448 # 8-byte Folded Reload + ld.d $a1, $sp, 432 # 8-byte Folded Reload ld.d $a3, $sp, 416 # 8-byte Folded Reload beqz $a1, .LBB17_20 .LBB17_19: @@ -7716,19 +7690,19 @@ IntraChromaPrediction: # @IntraChromaPrediction slti $t2, $s6, 1 slti $a2, $s2, 1 sltu $a0, $zero, $a0 - st.d $a4, $sp, 208 # 8-byte Folded Spill + st.d $a4, $sp, 224 # 8-byte Folded Spill st.d $a0, $sp, 8 # 8-byte Folded Spill and $a0, $a4, $a0 - st.d $a0, $sp, 256 # 8-byte Folded Spill + st.d $a0, $sp, 272 # 8-byte Folded Spill srai.d $t3, $s2, 1 - addi.d $a0, $sp, 920 + addi.d $a0, $sp, 904 alsl.d $a3, $s2, $a0, 1 - st.d $a3, $sp, 224 # 8-byte Folded Spill + st.d $a3, $sp, 240 # 8-byte Folded Spill addi.d $a3, $t3, -1 srai.d $s8, $s6, 1 - addi.d $s1, $sp, 888 + addi.d $s1, $sp, 872 alsl.d $a4, $s6, $s1, 1 - st.d $a4, $sp, 216 # 8-byte Folded Spill + st.d $a4, $sp, 232 # 8-byte Folded Spill addi.d $a4, $s8, -1 addi.d $a5, $s2, -8 sltui $a5, $a5, 1 @@ -7737,43 +7711,43 @@ IntraChromaPrediction: # @IntraChromaPrediction ori $t0, $zero, 17 maskeqz $t1, $t0, $a5 or $a7, $t1, $a7 - st.d $a7, $sp, 160 # 8-byte Folded Spill + st.d $a7, $sp, 176 # 8-byte Folded Spill ori $a7, $zero, 6 sub.d $a5, $a7, $a5 - st.d $a5, $sp, 152 # 8-byte Folded Spill + st.d $a5, $sp, 168 # 8-byte Folded Spill addi.d $a5, $s6, -8 sltui $a5, $a5, 1 masknez $a6, $a6, $a5 maskeqz $t0, $t0, $a5 or $a6, $t0, $a6 - st.d $a6, $sp, 136 # 8-byte Folded Spill + st.d $a6, $sp, 152 # 8-byte Folded Spill slli.d $a6, $s6, 1 - st.d $a6, $sp, 200 # 8-byte Folded Spill + st.d $a6, $sp, 216 # 8-byte Folded Spill sub.d $a5, $a7, $a5 - st.d $a5, $sp, 
128 # 8-byte Folded Spill + st.d $a5, $sp, 144 # 8-byte Folded Spill bstrpick.d $a5, $a3, 31, 0 - st.d $a5, $sp, 176 # 8-byte Folded Spill + st.d $a5, $sp, 192 # 8-byte Folded Spill bstrpick.d $a5, $a4, 31, 0 - st.d $a5, $sp, 168 # 8-byte Folded Spill + st.d $a5, $sp, 184 # 8-byte Folded Spill ld.d $a6, $sp, 344 # 8-byte Folded Reload bstrpick.d $a5, $a6, 31, 1 slli.d $a5, $a5, 1 - st.d $a5, $sp, 240 # 8-byte Folded Spill + st.d $a5, $sp, 256 # 8-byte Folded Spill st.d $t2, $sp, 16 # 8-byte Folded Spill or $a2, $a2, $t2 - st.d $a2, $sp, 192 # 8-byte Folded Spill + st.d $a2, $sp, 208 # 8-byte Folded Spill addi.w $a2, $zero, -8 lu32i.d $a2, 0 and $a5, $a6, $a2 - st.d $a5, $sp, 248 # 8-byte Folded Spill + st.d $a5, $sp, 264 # 8-byte Folded Spill and $a3, $a3, $a2 - st.d $a3, $sp, 120 # 8-byte Folded Spill + st.d $a3, $sp, 136 # 8-byte Folded Spill and $a2, $a4, $a2 - st.d $a2, $sp, 112 # 8-byte Folded Spill + st.d $a2, $sp, 128 # 8-byte Folded Spill bstrpick.d $a2, $ra, 30, 3 slli.d $a2, $a2, 3 st.d $a2, $sp, 416 # 8-byte Folded Spill - vreplgr2vr.w $vr10, $t3 + vreplgr2vr.w $vr8, $t3 slli.d $a2, $a1, 6 pcalau12i $a3, %pc_hi20(IntraChromaPrediction.block_pos) addi.d $a3, $a3, %pc_lo12(IntraChromaPrediction.block_pos) @@ -7812,8 +7786,8 @@ IntraChromaPrediction: # @IntraChromaPrediction st.d $a0, $sp, 80 # 8-byte Folded Spill ori $a1, $zero, 1 sub.d $a0, $a1, $s8 - st.d $a0, $sp, 144 # 8-byte Folded Spill - st.d $t3, $sp, 232 # 8-byte Folded Spill + st.d $a0, $sp, 160 # 8-byte Folded Spill + st.d $t3, $sp, 248 # 8-byte Folded Spill sub.d $a0, $a1, $t3 st.d $a0, $sp, 400 # 8-byte Folded Spill pcalau12i $a0, %got_pc_hi20(enc_picture) @@ -7827,12 +7801,12 @@ IntraChromaPrediction: # @IntraChromaPrediction ori $a0, $a0, 336 st.d $a0, $sp, 376 # 8-byte Folded Spill ori $fp, $zero, 3 - addi.d $t8, $sp, 456 - vrepli.b $vr11, 0 - st.d $s4, $sp, 264 # 8-byte Folded Spill - st.d $s2, $sp, 448 # 8-byte Folded Spill - vst $vr10, $sp, 320 # 16-byte Folded Spill - vst $vr11, $sp, 272 # 16-byte Folded Spill + addi.d $t8, $sp, 440 + vrepli.b $vr0, 0 + vst $vr0, $sp, 112 # 16-byte Folded Spill + st.d $s4, $sp, 280 # 8-byte Folded Spill + st.d $s2, $sp, 432 # 8-byte Folded Spill + vst $vr8, $sp, 320 # 16-byte Folded Spill b .LBB17_26 .p2align 4, , 16 .LBB17_25: # %.thread410 @@ -7958,8 +7932,8 @@ IntraChromaPrediction: # @IntraChromaPrediction .LBB17_34: # %.preheader450 # Parent Loop BB17_26 Depth=1 # => This Inner Loop Header: Depth=2 - ld.w $a7, $sp, 884 - ld.w $a6, $sp, 880 + ld.w $a7, $sp, 868 + ld.w $a6, $sp, 864 ld.bu $t0, $a3, -1 ld.w $t3, $a5, -8 ld.bu $t1, $a4, -3 @@ -8091,7 +8065,7 @@ IntraChromaPrediction: # @IntraChromaPrediction srli.d $t2, $t2, 2 .LBB17_51: # %.thread385 # in Loop: Header=BB17_34 Depth=2 - addi.d $t8, $sp, 456 + addi.d $t8, $sp, 440 b .LBB17_56 .LBB17_52: # in Loop: Header=BB17_34 Depth=2 move $t2, $a1 @@ -8286,7 +8260,7 @@ IntraChromaPrediction: # @IntraChromaPrediction srli.d $t2, $t2, 2 .LBB17_72: # %.thread385.1 # in Loop: Header=BB17_34 Depth=2 - addi.d $t8, $sp, 456 + addi.d $t8, $sp, 440 b .LBB17_77 .LBB17_73: # in Loop: Header=BB17_34 Depth=2 move $t2, $a1 @@ -8481,7 +8455,7 @@ IntraChromaPrediction: # @IntraChromaPrediction srli.d $t2, $t2, 2 .LBB17_93: # %.thread385.2 # in Loop: Header=BB17_34 Depth=2 - addi.d $t8, $sp, 456 + addi.d $t8, $sp, 440 b .LBB17_98 .LBB17_94: # in Loop: Header=BB17_34 Depth=2 move $t2, $a1 @@ -8676,12 +8650,12 @@ IntraChromaPrediction: # @IntraChromaPrediction # in Loop: Header=BB17_26 Depth=1 beqz $s4, .LBB17_118 # %bb.114: # in Loop: 
Header=BB17_26 Depth=1 - ld.w $a0, $sp, 884 + ld.w $a0, $sp, 868 slli.d $a0, $a0, 3 ldx.d $a0, $s0, $a0 - ld.w $a1, $sp, 880 + ld.w $a1, $sp, 864 alsl.d $a1, $a1, $a0, 1 - addi.d $a0, $sp, 920 + addi.d $a0, $sp, 904 ld.d $a2, $sp, 368 # 8-byte Folded Reload pcaddu18i $ra, %call36(memcpy) jirl $ra, $ra, 0 @@ -8704,7 +8678,7 @@ IntraChromaPrediction: # @IntraChromaPrediction # => This Inner Loop Header: Depth=2 ld.d $a0, $s6, 0 add.d $a0, $a0, $s2 - addi.d $a1, $sp, 920 + addi.d $a1, $sp, 904 move $a2, $s7 pcaddu18i $ra, %call36(memcpy) jirl $ra, $ra, 0 @@ -8713,17 +8687,16 @@ IntraChromaPrediction: # @IntraChromaPrediction bnez $s4, .LBB17_116 # %bb.117: # %.loopexit453.thread750 # in Loop: Header=BB17_26 Depth=1 - ld.d $s4, $sp, 264 # 8-byte Folded Reload - ld.d $s2, $sp, 448 # 8-byte Folded Reload + ld.d $s4, $sp, 280 # 8-byte Folded Reload + ld.d $s2, $sp, 432 # 8-byte Folded Reload ld.d $a0, $sp, 360 # 8-byte Folded Reload ld.d $ra, $sp, 336 # 8-byte Folded Reload move $s7, $s6 move $s6, $s8 move $s8, $s1 - addi.d $s1, $sp, 888 - vld $vr10, $sp, 320 # 16-byte Folded Reload - addi.d $t8, $sp, 456 - vld $vr11, $sp, 272 # 16-byte Folded Reload + addi.d $s1, $sp, 872 + vld $vr8, $sp, 320 # 16-byte Folded Reload + addi.d $t8, $sp, 440 ori $t2, $zero, 8 beqz $a0, .LBB17_25 b .LBB17_120 @@ -8743,32 +8716,31 @@ IntraChromaPrediction: # @IntraChromaPrediction # %bb.121: # in Loop: Header=BB17_26 Depth=1 move $a2, $zero ld.d $a7, $sp, 344 # 8-byte Folded Reload - ld.d $t0, $sp, 248 # 8-byte Folded Reload + ld.d $t0, $sp, 264 # 8-byte Folded Reload b .LBB17_127 .p2align 4, , 16 .LBB17_122: # %.loopexit453.thread # in Loop: Header=BB17_26 Depth=1 ld.d $a0, $sp, 360 # 8-byte Folded Reload ld.d $ra, $sp, 336 # 8-byte Folded Reload - vld $vr10, $sp, 320 # 16-byte Folded Reload - addi.d $t8, $sp, 456 - vld $vr11, $sp, 272 # 16-byte Folded Reload + vld $vr8, $sp, 320 # 16-byte Folded Reload + addi.d $t8, $sp, 440 ori $t2, $zero, 8 beqz $a0, .LBB17_25 .LBB17_123: # %.preheader451 # in Loop: Header=BB17_26 Depth=1 ld.d $a7, $sp, 344 # 8-byte Folded Reload - ld.d $a0, $sp, 192 # 8-byte Folded Reload - ld.d $t0, $sp, 248 # 8-byte Folded Reload + ld.d $a0, $sp, 208 # 8-byte Folded Reload + ld.d $t0, $sp, 264 # 8-byte Folded Reload beqz $a0, .LBB17_130 b .LBB17_139 .LBB17_124: # %vector.body832.preheader # in Loop: Header=BB17_26 Depth=1 - addi.d $a0, $sp, 524 - ld.d $a1, $sp, 240 # 8-byte Folded Reload - addi.d $a2, $sp, 890 + addi.d $a0, $sp, 508 + ld.d $a1, $sp, 256 # 8-byte Folded Reload + addi.d $a2, $sp, 874 ld.d $a7, $sp, 344 # 8-byte Folded Reload - ld.d $t0, $sp, 248 # 8-byte Folded Reload + ld.d $t0, $sp, 264 # 8-byte Folded Reload .p2align 4, , 16 .LBB17_125: # %vector.body832 # Parent Loop BB17_26 Depth=1 @@ -8793,7 +8765,7 @@ IntraChromaPrediction: # @IntraChromaPrediction bnez $a1, .LBB17_125 # %bb.126: # %middle.block835 # in Loop: Header=BB17_26 Depth=1 - ld.d $a0, $sp, 240 # 8-byte Folded Reload + ld.d $a0, $sp, 256 # 8-byte Folded Reload move $a2, $a0 beq $a0, $a7, .LBB17_129 .LBB17_127: # %.lr.ph492.preheader841 @@ -8802,7 +8774,7 @@ IntraChromaPrediction: # @IntraChromaPrediction sub.d $a1, $a7, $a2 slli.d $a3, $a2, 4 alsl.d $a2, $a2, $a3, 3 - addi.d $a3, $sp, 500 + addi.d $a3, $sp, 484 add.d $a2, $a3, $a2 .p2align 4, , 16 .LBB17_128: # %.lr.ph492 @@ -8854,7 +8826,7 @@ IntraChromaPrediction: # @IntraChromaPrediction .p2align 4, , 16 .LBB17_134: # %vector.body820.preheader # in Loop: Header=BB17_132 Depth=2 - addi.d $a3, $sp, 888 + addi.d $a3, $sp, 872 move $a4, $a2 move $a5, $t0 .p2align 
4, , 16 @@ -8900,62 +8872,61 @@ IntraChromaPrediction: # @IntraChromaPrediction .p2align 4, , 16 .LBB17_139: # %._crit_edge497 # in Loop: Header=BB17_26 Depth=1 - ld.d $a0, $sp, 256 # 8-byte Folded Reload + ld.d $a0, $sp, 272 # 8-byte Folded Reload beqz $a0, .LBB17_25 # %bb.140: # in Loop: Header=BB17_26 Depth=1 - ld.w $a0, $sp, 476 + ld.w $a0, $sp, 460 slli.d $a0, $a0, 3 - ld.w $a1, $sp, 472 + ld.w $a1, $sp, 456 ldx.d $a2, $s0, $a0 - ld.d $a0, $sp, 224 # 8-byte Folded Reload + ld.d $a0, $sp, 240 # 8-byte Folded Reload ld.hu $a0, $a0, -2 slli.d $a1, $a1, 1 - ldx.hu $a4, $a2, $a1 - sub.d $a1, $a0, $a4 - ld.d $a3, $sp, 232 # 8-byte Folded Reload - mul.d $a1, $a1, $a3 - pcalau12i $a2, %pc_hi20(.LCPI17_2) - ori $a5, $zero, 2 - blt $a3, $a5, .LBB17_148 + ldx.hu $a3, $a2, $a1 + sub.d $a1, $a0, $a3 + ld.d $a2, $sp, 248 # 8-byte Folded Reload + mul.d $a1, $a1, $a2 + ori $a4, $zero, 2 + blt $a2, $a4, .LBB17_148 # %bb.141: # %.lr.ph501.preheader # in Loop: Header=BB17_26 Depth=1 - ori $a5, $zero, 9 - bgeu $a3, $a5, .LBB17_143 + ori $a4, $zero, 9 + bgeu $a2, $a4, .LBB17_143 # %bb.142: # in Loop: Header=BB17_26 Depth=1 - move $a7, $zero + move $a6, $zero b .LBB17_146 .LBB17_143: # %vector.ph793 # in Loop: Header=BB17_26 Depth=1 - pcalau12i $a3, %pc_hi20(.LCPI17_0) - vld $vr0, $a3, %pc_lo12(.LCPI17_0) - pcalau12i $a3, %pc_hi20(.LCPI17_1) - vld $vr2, $a3, %pc_lo12(.LCPI17_1) - vori.b $vr1, $vr11, 0 + pcalau12i $a2, %pc_hi20(.LCPI17_0) + vld $vr0, $a2, %pc_lo12(.LCPI17_0) + pcalau12i $a2, %pc_hi20(.LCPI17_1) + vld $vr2, $a2, %pc_lo12(.LCPI17_1) + vld $vr3, $sp, 112 # 16-byte Folded Reload + vori.b $vr1, $vr3, 0 vinsgr2vr.w $vr1, $a1, 0 ld.d $a1, $sp, 64 # 8-byte Folded Reload - ld.d $a3, $sp, 72 # 8-byte Folded Reload - ld.d $a5, $sp, 120 # 8-byte Folded Reload - vori.b $vr3, $vr11, 0 + ld.d $a2, $sp, 72 # 8-byte Folded Reload + ld.d $a4, $sp, 136 # 8-byte Folded Reload .p2align 4, , 16 .LBB17_144: # %vector.body796 # Parent Loop BB17_26 Depth=1 # => This Inner Loop Header: Depth=2 - ld.d $a6, $a1, -8 - ld.d $a7, $a1, 0 - vinsgr2vr.d $vr4, $a6, 0 - ld.d $a6, $a3, 8 - vinsgr2vr.d $vr5, $a7, 0 - ld.d $a7, $a3, 0 - vld $vr6, $a2, %pc_lo12(.LCPI17_2) - vinsgr2vr.d $vr7, $a6, 0 - vilvl.h $vr4, $vr11, $vr4 - vilvl.h $vr5, $vr11, $vr5 - vinsgr2vr.d $vr8, $a7, 0 - vori.b $vr9, $vr6, 0 - vshuf.h $vr9, $vr11, $vr7 - vshuf.h $vr6, $vr11, $vr8 - vsub.w $vr4, $vr4, $vr9 - vsub.w $vr5, $vr5, $vr6 + ld.d $a5, $a1, -8 + ld.d $a6, $a1, 0 + ld.d $a7, $a2, 8 + vinsgr2vr.d $vr4, $a5, 0 + vinsgr2vr.d $vr5, $a6, 0 + vinsgr2vr.d $vr6, $a7, 0 + ld.d $a5, $a2, 0 + vsllwil.wu.hu $vr4, $vr4, 0 + vsllwil.wu.hu $vr5, $vr5, 0 + vshuf4i.h $vr6, $vr6, 27 + vinsgr2vr.d $vr7, $a5, 0 + vshuf4i.h $vr7, $vr7, 27 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.wu.hu $vr7, $vr7, 0 + vsub.w $vr4, $vr4, $vr6 + vsub.w $vr5, $vr5, $vr7 vpickev.w $vr6, $vr0, $vr2 vaddi.wu $vr7, $vr6, 1 vaddi.wu $vr6, $vr6, 5 @@ -8963,56 +8934,56 @@ IntraChromaPrediction: # @IntraChromaPrediction vmadd.w $vr3, $vr5, $vr6 vaddi.du $vr2, $vr2, 8 vaddi.du $vr0, $vr0, 8 - addi.d $a5, $a5, -8 - addi.d $a3, $a3, -16 + addi.d $a4, $a4, -8 + addi.d $a2, $a2, -16 addi.d $a1, $a1, 16 - bnez $a5, .LBB17_144 + bnez $a4, .LBB17_144 # %bb.145: # %middle.block810 # in Loop: Header=BB17_26 Depth=1 vadd.w $vr0, $vr3, $vr1 vhaddw.d.w $vr0, $vr0, $vr0 vhaddw.q.d $vr0, $vr0, $vr0 vpickve2gr.d $a1, $vr0, 0 - ld.d $a5, $sp, 120 # 8-byte Folded Reload - move $a7, $a5 - ld.d $a3, $sp, 176 # 8-byte Folded Reload - beq $a5, $a3, .LBB17_148 + ld.d $a4, $sp, 136 # 8-byte Folded Reload + move $a6, $a4 
+ ld.d $a2, $sp, 192 # 8-byte Folded Reload + beq $a4, $a2, .LBB17_148 .LBB17_146: # %.lr.ph501.preheader840 # in Loop: Header=BB17_26 Depth=1 - slli.d $a5, $a7, 1 - ld.d $a3, $sp, 104 # 8-byte Folded Reload - alsl.d $a3, $a7, $a3, 1 - ld.d $a6, $sp, 96 # 8-byte Folded Reload - sub.d $a5, $a6, $a5 - ld.d $a6, $sp, 176 # 8-byte Folded Reload - sub.d $a6, $a6, $a7 - addi.d $a7, $a7, 1 + slli.d $a4, $a6, 1 + ld.d $a2, $sp, 104 # 8-byte Folded Reload + alsl.d $a2, $a6, $a2, 1 + ld.d $a5, $sp, 96 # 8-byte Folded Reload + sub.d $a4, $a5, $a4 + ld.d $a5, $sp, 192 # 8-byte Folded Reload + sub.d $a5, $a5, $a6 + addi.d $a6, $a6, 1 .p2align 4, , 16 .LBB17_147: # %.lr.ph501 # Parent Loop BB17_26 Depth=1 # => This Inner Loop Header: Depth=2 - ld.hu $t0, $a3, 0 - ld.hu $t1, $a5, 0 - sub.d $t0, $t0, $t1 - mul.d $t0, $t0, $a7 - add.d $a1, $t0, $a1 - addi.d $a3, $a3, 2 - addi.d $a5, $a5, -2 - addi.d $a6, $a6, -1 - addi.d $a7, $a7, 1 - bnez $a6, .LBB17_147 + ld.hu $a7, $a2, 0 + ld.hu $t0, $a4, 0 + sub.d $a7, $a7, $t0 + mul.d $a7, $a7, $a6 + add.d $a1, $a7, $a1 + addi.d $a2, $a2, 2 + addi.d $a4, $a4, -2 + addi.d $a5, $a5, -1 + addi.d $a6, $a6, 1 + bnez $a5, .LBB17_147 .LBB17_148: # %._crit_edge502 # in Loop: Header=BB17_26 Depth=1 - ld.d $a3, $sp, 216 # 8-byte Folded Reload - ld.hu $a3, $a3, -2 - sub.d $a4, $a3, $a4 - mul.d $a4, $a4, $s8 - ori $a5, $zero, 2 - blt $s8, $a5, .LBB17_151 + ld.d $a2, $sp, 232 # 8-byte Folded Reload + ld.hu $a2, $a2, -2 + sub.d $a3, $a2, $a3 + mul.d $a3, $a3, $s8 + ori $a4, $zero, 2 + blt $s8, $a4, .LBB17_151 # %bb.149: # %.lr.ph507.preheader # in Loop: Header=BB17_26 Depth=1 - ori $a5, $zero, 9 - bgeu $s8, $a5, .LBB17_153 + ori $a4, $zero, 9 + bgeu $s8, $a4, .LBB17_153 # %bb.150: # in Loop: Header=BB17_26 Depth=1 move $a7, $zero b .LBB17_156 @@ -9020,44 +8991,44 @@ IntraChromaPrediction: # @IntraChromaPrediction # in Loop: Header=BB17_26 Depth=1 blez $s6, .LBB17_25 # %bb.152: # in Loop: Header=BB17_26 Depth=1 - alsl.d $a2, $a4, $a4, 2 - ld.d $a4, $sp, 200 # 8-byte Folded Reload - add.w $a2, $a2, $a4 - srai.d $a2, $a2, 6 + alsl.d $a3, $a3, $a3, 2 + ld.d $a4, $sp, 216 # 8-byte Folded Reload + add.w $a3, $a3, $a4 + srai.d $a3, $a3, 6 bgtz $s2, .LBB17_159 b .LBB17_25 .LBB17_153: # %vector.ph775 # in Loop: Header=BB17_26 Depth=1 - pcalau12i $a5, %pc_hi20(.LCPI17_0) - vld $vr0, $a5, %pc_lo12(.LCPI17_0) - pcalau12i $a5, %pc_hi20(.LCPI17_1) - vld $vr2, $a5, %pc_lo12(.LCPI17_1) - vori.b $vr1, $vr11, 0 - vinsgr2vr.w $vr1, $a4, 0 - ld.d $a4, $sp, 48 # 8-byte Folded Reload - ld.d $a5, $sp, 56 # 8-byte Folded Reload - ld.d $a6, $sp, 112 # 8-byte Folded Reload - vori.b $vr3, $vr11, 0 + pcalau12i $a4, %pc_hi20(.LCPI17_0) + vld $vr0, $a4, %pc_lo12(.LCPI17_0) + pcalau12i $a4, %pc_hi20(.LCPI17_1) + vld $vr2, $a4, %pc_lo12(.LCPI17_1) + vld $vr3, $sp, 112 # 16-byte Folded Reload + vori.b $vr1, $vr3, 0 + vinsgr2vr.w $vr1, $a3, 0 + ld.d $a3, $sp, 48 # 8-byte Folded Reload + ld.d $a4, $sp, 56 # 8-byte Folded Reload + ld.d $a5, $sp, 128 # 8-byte Folded Reload .p2align 4, , 16 .LBB17_154: # %vector.body778 # Parent Loop BB17_26 Depth=1 # => This Inner Loop Header: Depth=2 - ld.d $a7, $a4, -8 - ld.d $t0, $a4, 0 - vinsgr2vr.d $vr4, $a7, 0 - ld.d $a7, $a5, 8 - vinsgr2vr.d $vr5, $t0, 0 - ld.d $t0, $a5, 0 - vld $vr6, $a2, %pc_lo12(.LCPI17_2) - vinsgr2vr.d $vr7, $a7, 0 - vilvl.h $vr4, $vr11, $vr4 - vilvl.h $vr5, $vr11, $vr5 - vinsgr2vr.d $vr8, $t0, 0 - vori.b $vr9, $vr6, 0 - vshuf.h $vr9, $vr11, $vr7 - vshuf.h $vr6, $vr11, $vr8 - vsub.w $vr4, $vr4, $vr9 - vsub.w $vr5, $vr5, $vr6 + ld.d $a6, $a3, -8 + ld.d $a7, 
$a3, 0 + ld.d $t0, $a4, 8 + vinsgr2vr.d $vr4, $a6, 0 + vinsgr2vr.d $vr5, $a7, 0 + vinsgr2vr.d $vr6, $t0, 0 + ld.d $a6, $a4, 0 + vsllwil.wu.hu $vr4, $vr4, 0 + vsllwil.wu.hu $vr5, $vr5, 0 + vshuf4i.h $vr6, $vr6, 27 + vinsgr2vr.d $vr7, $a6, 0 + vshuf4i.h $vr7, $vr7, 27 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.wu.hu $vr7, $vr7, 0 + vsub.w $vr4, $vr4, $vr6 + vsub.w $vr5, $vr5, $vr7 vpickev.w $vr6, $vr0, $vr2 vaddi.wu $vr7, $vr6, 1 vaddi.wu $vr6, $vr6, 5 @@ -9065,80 +9036,80 @@ IntraChromaPrediction: # @IntraChromaPrediction vmadd.w $vr3, $vr5, $vr6 vaddi.du $vr2, $vr2, 8 vaddi.du $vr0, $vr0, 8 - addi.d $a6, $a6, -8 - addi.d $a5, $a5, -16 - addi.d $a4, $a4, 16 - bnez $a6, .LBB17_154 + addi.d $a5, $a5, -8 + addi.d $a4, $a4, -16 + addi.d $a3, $a3, 16 + bnez $a5, .LBB17_154 # %bb.155: # %middle.block788 # in Loop: Header=BB17_26 Depth=1 vadd.w $vr0, $vr3, $vr1 vhaddw.d.w $vr0, $vr0, $vr0 vhaddw.q.d $vr0, $vr0, $vr0 - vpickve2gr.d $a4, $vr0, 0 - ld.d $a5, $sp, 112 # 8-byte Folded Reload + vpickve2gr.d $a3, $vr0, 0 + ld.d $a5, $sp, 128 # 8-byte Folded Reload move $a7, $a5 - ld.d $a2, $sp, 168 # 8-byte Folded Reload - beq $a5, $a2, .LBB17_158 + ld.d $a4, $sp, 184 # 8-byte Folded Reload + beq $a5, $a4, .LBB17_158 .LBB17_156: # %.lr.ph507.preheader839 # in Loop: Header=BB17_26 Depth=1 slli.d $a5, $a7, 1 - ld.d $a2, $sp, 88 # 8-byte Folded Reload - alsl.d $a2, $a7, $a2, 1 + ld.d $a4, $sp, 88 # 8-byte Folded Reload + alsl.d $a4, $a7, $a4, 1 ld.d $a6, $sp, 80 # 8-byte Folded Reload sub.d $a5, $a6, $a5 - ld.d $a6, $sp, 168 # 8-byte Folded Reload + ld.d $a6, $sp, 184 # 8-byte Folded Reload sub.d $a6, $a6, $a7 addi.d $a7, $a7, 1 .p2align 4, , 16 .LBB17_157: # %.lr.ph507 # Parent Loop BB17_26 Depth=1 # => This Inner Loop Header: Depth=2 - ld.hu $t0, $a2, 0 + ld.hu $t0, $a4, 0 ld.hu $t1, $a5, 0 sub.d $t0, $t0, $t1 mul.d $t0, $t0, $a7 - add.d $a4, $t0, $a4 - addi.d $a2, $a2, 2 + add.d $a3, $t0, $a3 + addi.d $a4, $a4, 2 addi.d $a5, $a5, -2 addi.d $a6, $a6, -1 addi.d $a7, $a7, 1 bnez $a6, .LBB17_157 .LBB17_158: # %._crit_edge508.thread # in Loop: Header=BB17_26 Depth=1 - ld.d $a2, $sp, 136 # 8-byte Folded Reload - mul.d $a2, $a4, $a2 - ld.d $a4, $sp, 200 # 8-byte Folded Reload - add.d $a2, $a2, $a4 - ld.d $a4, $sp, 128 # 8-byte Folded Reload - sra.w $a2, $a2, $a4 + ld.d $a4, $sp, 152 # 8-byte Folded Reload + mul.d $a3, $a3, $a4 + ld.d $a4, $sp, 216 # 8-byte Folded Reload + add.d $a3, $a3, $a4 + ld.d $a4, $sp, 144 # 8-byte Folded Reload + sra.w $a3, $a3, $a4 blez $s2, .LBB17_25 .LBB17_159: # %.preheader448.lr.ph.split.us # in Loop: Header=BB17_26 Depth=1 move $a4, $zero - ld.d $a5, $sp, 160 # 8-byte Folded Reload + ld.d $a5, $sp, 176 # 8-byte Folded Reload mul.d $a1, $a1, $a5 ld.d $a5, $sp, 368 # 8-byte Folded Reload add.d $a1, $a1, $a5 - ld.d $a5, $sp, 152 # 8-byte Folded Reload + ld.d $a5, $sp, 168 # 8-byte Folded Reload sra.w $a1, $a1, $a5 ld.d $a5, $s7, 0 - add.d $a0, $a3, $a0 + add.d $a0, $a2, $a0 slli.d $a6, $a0, 4 ld.d $a0, $sp, 392 # 8-byte Folded Reload slli.d $a0, $a0, 11 add.d $a0, $a5, $a0 - lu12i.w $a3, 2 - ori $a3, $a3, 1872 - add.d $a0, $a0, $a3 - lu12i.w $a3, 3 - ori $a3, $a3, 3236 - ldx.w $a3, $a5, $a3 - ld.d $a5, $sp, 144 # 8-byte Folded Reload - mul.d $a5, $a5, $a2 + lu12i.w $a2, 2 + ori $a2, $a2, 1872 + add.d $a0, $a0, $a2 + lu12i.w $a2, 3 + ori $a2, $a2, 3236 + ldx.w $a2, $a5, $a2 + ld.d $a5, $sp, 160 # 8-byte Folded Reload + mul.d $a5, $a5, $a3 add.d $a7, $a5, $a6 addi.d $a5, $a6, 16 vreplgr2vr.w $vr0, $a1 - vreplgr2vr.w $vr1, $a3 + vreplgr2vr.w $vr1, $a2 addi.d $a6, $a7, 16 b .LBB17_161 
.p2align 4, , 16 @@ -9146,7 +9117,7 @@ IntraChromaPrediction: # @IntraChromaPrediction # in Loop: Header=BB17_161 Depth=2 addi.d $a4, $a4, 1 addi.d $a0, $a0, 32 - add.d $a6, $a6, $a2 + add.d $a6, $a6, $a3 ori $t2, $zero, 8 beq $a4, $s6, .LBB17_25 .LBB17_161: # %.preheader448.us @@ -9163,11 +9134,11 @@ IntraChromaPrediction: # @IntraChromaPrediction # in Loop: Header=BB17_161 Depth=2 sub.d $a7, $a4, $s8 addi.d $a7, $a7, 1 + pcalau12i $t0, %pc_hi20(.LCPI17_2) + vld $vr2, $t0, %pc_lo12(.LCPI17_2) pcalau12i $t0, %pc_hi20(.LCPI17_3) - vld $vr2, $t0, %pc_lo12(.LCPI17_3) - pcalau12i $t0, %pc_hi20(.LCPI17_4) - vld $vr3, $t0, %pc_lo12(.LCPI17_4) - mul.d $a7, $a7, $a2 + vld $vr3, $t0, %pc_lo12(.LCPI17_3) + mul.d $a7, $a7, $a3 add.d $a7, $a5, $a7 vreplgr2vr.w $vr4, $a7 move $a7, $a0 @@ -9177,8 +9148,8 @@ IntraChromaPrediction: # @IntraChromaPrediction # Parent Loop BB17_26 Depth=1 # Parent Loop BB17_161 Depth=2 # => This Inner Loop Header: Depth=3 - vsub.w $vr5, $vr2, $vr10 - vsub.w $vr6, $vr3, $vr10 + vsub.w $vr5, $vr2, $vr8 + vsub.w $vr6, $vr3, $vr8 vaddi.wu $vr6, $vr6, 1 vaddi.wu $vr5, $vr5, 1 vori.b $vr7, $vr4, 0 @@ -9219,9 +9190,9 @@ IntraChromaPrediction: # @IntraChromaPrediction srai.d $t2, $a7, 5 srai.d $t3, $a7, 63 andn $t2, $t2, $t3 - slt $t3, $t2, $a3 + slt $t3, $t2, $a2 maskeqz $t2, $t2, $t3 - masknez $t3, $a3, $t3 + masknez $t3, $a2, $t3 or $t2, $t2, $t3 stx.h $t2, $a0, $t0 add.w $a7, $a7, $a1 @@ -9230,7 +9201,7 @@ IntraChromaPrediction: # @IntraChromaPrediction bnez $t1, .LBB17_167 b .LBB17_160 .LBB17_168: - ld.d $s2, $sp, 184 # 8-byte Folded Reload + ld.d $s2, $sp, 200 # 8-byte Folded Reload ld.d $a0, $s2, 0 ldptr.w $a0, $a0, 4168 bnez $a0, .LBB17_196 @@ -9240,7 +9211,7 @@ IntraChromaPrediction: # @IntraChromaPrediction blez $s6, .LBB17_172 # %bb.170: # %.lr.ph517.preheader move $s0, $zero - addi.d $s1, $sp, 456 + addi.d $s1, $sp, 440 move $fp, $s6 .p2align 4, , 16 .LBB17_171: # %.lr.ph517 @@ -9259,47 +9230,45 @@ IntraChromaPrediction: # @IntraChromaPrediction .LBB17_172: # %.preheader434 move $a3, $zero move $a4, $zero - ld.d $a0, $sp, 448 # 8-byte Folded Reload + ld.d $a0, $sp, 432 # 8-byte Folded Reload slti $a0, $a0, 1 - ld.d $a1, $sp, 208 # 8-byte Folded Reload + ld.d $a1, $sp, 224 # 8-byte Folded Reload ld.d $a2, $sp, 360 # 8-byte Folded Reload and $a1, $a2, $a1 ld.d $a2, $sp, 8 # 8-byte Folded Reload and $a1, $a1, $a2 - st.d $a1, $sp, 312 # 8-byte Folded Spill + st.d $a1, $sp, 320 # 8-byte Folded Spill ld.d $a1, $sp, 16 # 8-byte Folded Reload or $a0, $a1, $a0 - st.d $a0, $sp, 336 # 8-byte Folded Spill + st.d $a0, $sp, 344 # 8-byte Folded Spill lu12i.w $a0, 524287 ori $a5, $a0, 4095 lu12i.w $a0, 2 ori $a2, $a0, 2384 ori $a6, $zero, 2 pcalau12i $a0, %pc_hi20(imgUV_org) - st.d $a0, $sp, 320 # 8-byte Folded Spill - vrepli.b $vr2, 0 + st.d $a0, $sp, 336 # 8-byte Folded Spill ori $a7, $zero, 4 - st.d $s7, $sp, 440 # 8-byte Folded Spill - vst $vr2, $sp, 416 # 16-byte Folded Spill + st.d $s7, $sp, 424 # 8-byte Folded Spill b .LBB17_176 .LBB17_173: # in Loop: Header=BB17_176 Depth=1 move $s3, $zero .LBB17_174: # %.split537.us # in Loop: Header=BB17_176 Depth=1 - ld.d $a5, $sp, 344 # 8-byte Folded Reload + ld.d $a5, $sp, 352 # 8-byte Folded Reload slt $a0, $s3, $a5 - ld.d $a1, $sp, 352 # 8-byte Folded Reload + ld.d $a1, $sp, 368 # 8-byte Folded Reload masknez $a1, $a1, $a0 - ld.d $a3, $sp, 368 # 8-byte Folded Reload + ld.d $a3, $sp, 384 # 8-byte Folded Reload maskeqz $a2, $a3, $a0 or $a4, $a2, $a1 masknez $a1, $a5, $a0 maskeqz $a0, $s3, $a0 or $a5, $a0, $a1 - ld.d $s7, $sp, 440 # 8-byte 
Folded Reload - ld.d $s2, $sp, 184 # 8-byte Folded Reload - ld.d $s4, $sp, 264 # 8-byte Folded Reload - ld.d $a2, $sp, 384 # 8-byte Folded Reload + ld.d $s7, $sp, 424 # 8-byte Folded Reload + ld.d $s2, $sp, 200 # 8-byte Folded Reload + ld.d $s4, $sp, 280 # 8-byte Folded Reload + ld.d $a2, $sp, 392 # 8-byte Folded Reload ori $a6, $zero, 2 ori $a7, $zero, 4 .LBB17_175: # in Loop: Header=BB17_176 Depth=1 @@ -9337,7 +9306,7 @@ IntraChromaPrediction: # @IntraChromaPrediction # %bb.181: # in Loop: Header=BB17_176 Depth=1 addi.d $a0, $a3, -2 sltu $a0, $zero, $a0 - ld.d $a1, $sp, 208 # 8-byte Folded Reload + ld.d $a1, $sp, 224 # 8-byte Folded Reload or $a0, $a0, $a1 beqz $a0, .LBB17_175 .LBB17_182: # in Loop: Header=BB17_176 Depth=1 @@ -9351,18 +9320,18 @@ IntraChromaPrediction: # @IntraChromaPrediction beqz $a0, .LBB17_175 b .LBB17_186 .LBB17_185: # in Loop: Header=BB17_176 Depth=1 - ld.d $a0, $sp, 312 # 8-byte Folded Reload + ld.d $a0, $sp, 320 # 8-byte Folded Reload beqz $a0, .LBB17_175 .LBB17_186: # in Loop: Header=BB17_176 Depth=1 - st.d $a5, $sp, 344 # 8-byte Folded Spill - st.d $a2, $sp, 384 # 8-byte Folded Spill - st.d $a4, $sp, 352 # 8-byte Folded Spill - st.d $a3, $sp, 368 # 8-byte Folded Spill - ld.d $a0, $sp, 336 # 8-byte Folded Reload + st.d $a5, $sp, 352 # 8-byte Folded Spill + st.d $a2, $sp, 392 # 8-byte Folded Spill + st.d $a4, $sp, 368 # 8-byte Folded Spill + st.d $a3, $sp, 384 # 8-byte Folded Spill + ld.d $a0, $sp, 344 # 8-byte Folded Reload bnez $a0, .LBB17_173 # %bb.187: # %.preheader433.lr.ph.us.us # in Loop: Header=BB17_176 Depth=1 - ld.d $a0, $sp, 320 # 8-byte Folded Reload + ld.d $a0, $sp, 336 # 8-byte Folded Reload ld.d $a0, $a0, %pc_lo12(imgUV_org) ld.d $s8, $a0, 0 move $a3, $zero @@ -9374,12 +9343,12 @@ IntraChromaPrediction: # @IntraChromaPrediction # Parent Loop BB17_176 Depth=1 # => This Loop Header: Depth=2 # Child Loop BB17_189 Depth 3 - st.d $a1, $sp, 392 # 8-byte Folded Spill + st.d $a1, $sp, 400 # 8-byte Folded Spill move $s1, $zero move $s4, $zero ori $a1, $zero, 24 mul.d $a0, $a3, $a1 - addi.d $a2, $sp, 456 + addi.d $a2, $sp, 440 add.d $s6, $a2, $a0 addi.d $a0, $a3, 1 mul.d $a0, $a0, $a1 @@ -9387,7 +9356,7 @@ IntraChromaPrediction: # @IntraChromaPrediction addi.d $a0, $a3, 2 mul.d $a0, $a0, $a1 add.d $s5, $a2, $a0 - st.d $a3, $sp, 400 # 8-byte Folded Spill + st.d $a3, $sp, 416 # 8-byte Folded Spill addi.d $a0, $a3, 3 mul.d $a0, $a0, $a1 add.d $s0, $a2, $a0 @@ -9400,7 +9369,7 @@ IntraChromaPrediction: # @IntraChromaPrediction slli.d $a0, $a0, 3 ldx.d $a0, $s8, $a0 ld.w $a1, $s6, 16 - ld.d $a2, $sp, 440 # 8-byte Folded Reload + ld.d $a2, $sp, 424 # 8-byte Folded Reload ld.d $a2, $a2, 0 alsl.d $a0, $a1, $a0, 1 ldx.d $a1, $a0, $s1 @@ -9408,9 +9377,9 @@ IntraChromaPrediction: # @IntraChromaPrediction ldx.d $a2, $a0, $s1 add.d $a0, $a0, $s1 vinsgr2vr.d $vr0, $a1, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a2, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 pcalau12i $a1, %pc_hi20(diff) addi.d $fp, $a1, %pc_lo12(diff) @@ -9427,9 +9396,9 @@ IntraChromaPrediction: # @IntraChromaPrediction slli.d $a2, $a2, 3 ldx.d $a2, $s8, $a2 ld.w $a3, $s5, 16 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a1, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 alsl.d $a1, $a3, $a2, 1 ldx.d $a1, $a1, $s1 vsub.w $vr0, $vr0, $vr1 @@ -9441,56 +9410,55 @@ IntraChromaPrediction: # @IntraChromaPrediction ldx.d $a2, $s8, $a2 ld.w $a3, $s0, 16 vinsgr2vr.d $vr1, $a1, 0 - vilvl.h $vr0, $vr2, 
$vr0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.wu.hu $vr1, $vr1, 0 alsl.d $a1, $a3, $a2, 1 ldx.d $a1, $a1, $s1 vsub.w $vr0, $vr0, $vr1 ld.d $a0, $a0, 96 vst $vr0, $fp, 32 vinsgr2vr.d $vr0, $a1, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a0, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 vst $vr0, $fp, 48 move $a0, $fp pcaddu18i $ra, %call36(distortion4x4) jirl $ra, $ra, 0 - vld $vr2, $sp, 416 # 16-byte Folded Reload add.d $s3, $a0, $s3 addi.w $s4, $s4, 4 addi.d $s1, $s1, 8 - ld.d $a0, $sp, 448 # 8-byte Folded Reload + ld.d $a0, $sp, 432 # 8-byte Folded Reload blt $s4, $a0, .LBB17_189 # %bb.190: # %._crit_edge524.us.us.us # in Loop: Header=BB17_188 Depth=2 - ld.d $a1, $sp, 392 # 8-byte Folded Reload + ld.d $a1, $sp, 400 # 8-byte Folded Reload addi.w $a1, $a1, 4 - ld.d $a3, $sp, 400 # 8-byte Folded Reload + ld.d $a3, $sp, 416 # 8-byte Folded Reload addi.d $a3, $a3, 4 addi.d $s7, $s7, 128 ld.d $a0, $sp, 408 # 8-byte Folded Reload blt $a1, $a0, .LBB17_188 # %bb.191: # %._crit_edge528.split.us.us.us # in Loop: Header=BB17_176 Depth=1 - ld.d $a0, $sp, 320 # 8-byte Folded Reload + ld.d $a0, $sp, 336 # 8-byte Folded Reload ld.d $a0, $a0, %pc_lo12(imgUV_org) ld.d $s8, $a0, 8 move $a3, $zero move $a1, $zero - ld.d $s5, $sp, 384 # 8-byte Folded Reload + ld.d $s5, $sp, 392 # 8-byte Folded Reload .p2align 4, , 16 .LBB17_192: # %.preheader433.us.us.us.1 # Parent Loop BB17_176 Depth=1 # => This Loop Header: Depth=2 # Child Loop BB17_193 Depth 3 - st.d $a1, $sp, 392 # 8-byte Folded Spill + st.d $a1, $sp, 400 # 8-byte Folded Spill move $s1, $zero move $s4, $zero ori $a1, $zero, 24 mul.d $a0, $a3, $a1 - addi.d $a2, $sp, 456 + addi.d $a2, $sp, 440 add.d $s6, $a2, $a0 addi.d $a0, $a3, 1 mul.d $a0, $a0, $a1 @@ -9498,7 +9466,7 @@ IntraChromaPrediction: # @IntraChromaPrediction addi.d $a0, $a3, 2 mul.d $a0, $a0, $a1 add.d $s0, $a2, $a0 - st.d $a3, $sp, 400 # 8-byte Folded Spill + st.d $a3, $sp, 416 # 8-byte Folded Spill addi.d $a0, $a3, 3 mul.d $a0, $a0, $a1 add.d $s7, $a2, $a0 @@ -9511,7 +9479,7 @@ IntraChromaPrediction: # @IntraChromaPrediction slli.d $a0, $a0, 3 ldx.d $a0, $s8, $a0 ld.w $a1, $s6, 16 - ld.d $a2, $sp, 440 # 8-byte Folded Reload + ld.d $a2, $sp, 424 # 8-byte Folded Reload ld.d $a2, $a2, 0 alsl.d $a0, $a1, $a0, 1 ldx.d $a0, $a0, $s1 @@ -9523,9 +9491,9 @@ IntraChromaPrediction: # @IntraChromaPrediction slli.d $a1, $a2, 3 ldx.d $a1, $s8, $a1 ld.w $a2, $s2, 16 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a0, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 alsl.d $a0, $a2, $a1, 1 ldx.d $a0, $a0, $s1 vsub.w $vr0, $vr0, $vr1 @@ -9536,9 +9504,9 @@ IntraChromaPrediction: # @IntraChromaPrediction slli.d $a1, $a1, 3 ldx.d $a1, $s8, $a1 ld.w $a2, $s0, 16 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a0, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 alsl.d $a0, $a2, $a1, 1 ldx.d $a0, $a0, $s1 vsub.w $vr0, $vr0, $vr1 @@ -9550,33 +9518,32 @@ IntraChromaPrediction: # @IntraChromaPrediction ldx.d $a1, $s8, $a1 ld.w $a2, $s7, 16 vinsgr2vr.d $vr1, $a0, 0 - vilvl.h $vr0, $vr2, $vr0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.wu.hu $vr1, $vr1, 0 alsl.d $a0, $a2, $a1, 1 ldx.d $a0, $a0, $s1 vsub.w $vr0, $vr0, $vr1 ld.d $a1, $a3, 96 vst $vr0, $fp, 32 vinsgr2vr.d $vr0, $a0, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a1, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w 
$vr0, $vr0, $vr1 vst $vr0, $fp, 48 move $a0, $fp pcaddu18i $ra, %call36(distortion4x4) jirl $ra, $ra, 0 - vld $vr2, $sp, 416 # 16-byte Folded Reload add.w $s3, $a0, $s3 addi.w $s4, $s4, 4 addi.d $s1, $s1, 8 - ld.d $a0, $sp, 448 # 8-byte Folded Reload + ld.d $a0, $sp, 432 # 8-byte Folded Reload blt $s4, $a0, .LBB17_193 # %bb.194: # %._crit_edge524.us.us.us.1 # in Loop: Header=BB17_192 Depth=2 - ld.d $a1, $sp, 392 # 8-byte Folded Reload + ld.d $a1, $sp, 400 # 8-byte Folded Reload addi.w $a1, $a1, 4 - ld.d $a3, $sp, 400 # 8-byte Folded Reload + ld.d $a3, $sp, 416 # 8-byte Folded Reload addi.d $a3, $a3, 4 addi.d $s5, $s5, 128 ld.d $a0, $sp, 408 # 8-byte Folded Reload @@ -9586,22 +9553,22 @@ IntraChromaPrediction: # @IntraChromaPrediction ld.d $a0, $sp, 32 # 8-byte Folded Reload st.w $a4, $a0, 416 .LBB17_196: - ld.d $s8, $sp, 952 # 8-byte Folded Reload - ld.d $s7, $sp, 960 # 8-byte Folded Reload - ld.d $s6, $sp, 968 # 8-byte Folded Reload - ld.d $s5, $sp, 976 # 8-byte Folded Reload - ld.d $s4, $sp, 984 # 8-byte Folded Reload - ld.d $s3, $sp, 992 # 8-byte Folded Reload - ld.d $s2, $sp, 1000 # 8-byte Folded Reload - ld.d $s1, $sp, 1008 # 8-byte Folded Reload - ld.d $s0, $sp, 1016 # 8-byte Folded Reload - ld.d $fp, $sp, 1024 # 8-byte Folded Reload - ld.d $ra, $sp, 1032 # 8-byte Folded Reload - addi.d $sp, $sp, 1040 + ld.d $s8, $sp, 936 # 8-byte Folded Reload + ld.d $s7, $sp, 944 # 8-byte Folded Reload + ld.d $s6, $sp, 952 # 8-byte Folded Reload + ld.d $s5, $sp, 960 # 8-byte Folded Reload + ld.d $s4, $sp, 968 # 8-byte Folded Reload + ld.d $s3, $sp, 976 # 8-byte Folded Reload + ld.d $s2, $sp, 984 # 8-byte Folded Reload + ld.d $s1, $sp, 992 # 8-byte Folded Reload + ld.d $s0, $sp, 1000 # 8-byte Folded Reload + ld.d $fp, $sp, 1008 # 8-byte Folded Reload + ld.d $ra, $sp, 1016 # 8-byte Folded Reload + addi.d $sp, $sp, 1024 ret .LBB17_197: move $a0, $zero - ld.d $a1, $sp, 448 # 8-byte Folded Reload + ld.d $a1, $sp, 432 # 8-byte Folded Reload ld.d $a3, $sp, 416 # 8-byte Folded Reload bnez $a1, .LBB17_19 b .LBB17_20 @@ -9636,36 +9603,36 @@ IntraChromaPrediction: # @IntraChromaPrediction .type IntraChromaRDDecision,@function IntraChromaRDDecision: # @IntraChromaRDDecision # %bb.0: - addi.d $sp, $sp, -688 - st.d $ra, $sp, 680 # 8-byte Folded Spill - st.d $fp, $sp, 672 # 8-byte Folded Spill - st.d $s0, $sp, 664 # 8-byte Folded Spill - st.d $s1, $sp, 656 # 8-byte Folded Spill - st.d $s2, $sp, 648 # 8-byte Folded Spill - st.d $s3, $sp, 640 # 8-byte Folded Spill - st.d $s4, $sp, 632 # 8-byte Folded Spill - st.d $s5, $sp, 624 # 8-byte Folded Spill - st.d $s6, $sp, 616 # 8-byte Folded Spill - st.d $s7, $sp, 608 # 8-byte Folded Spill - st.d $s8, $sp, 600 # 8-byte Folded Spill - st.d $a0, $sp, 56 # 8-byte Folded Spill + addi.d $sp, $sp, -672 + st.d $ra, $sp, 664 # 8-byte Folded Spill + st.d $fp, $sp, 656 # 8-byte Folded Spill + st.d $s0, $sp, 648 # 8-byte Folded Spill + st.d $s1, $sp, 640 # 8-byte Folded Spill + st.d $s2, $sp, 632 # 8-byte Folded Spill + st.d $s3, $sp, 624 # 8-byte Folded Spill + st.d $s4, $sp, 616 # 8-byte Folded Spill + st.d $s5, $sp, 608 # 8-byte Folded Spill + st.d $s6, $sp, 600 # 8-byte Folded Spill + st.d $s7, $sp, 592 # 8-byte Folded Spill + st.d $s8, $sp, 584 # 8-byte Folded Spill + st.d $a0, $sp, 64 # 8-byte Folded Spill pcalau12i $a0, %got_pc_hi20(img) ld.d $a0, $a0, %got_pc_lo12(img) - st.d $a0, $sp, 160 # 8-byte Folded Spill + st.d $a0, $sp, 144 # 8-byte Folded Spill ld.d $a0, $a0, 0 ldptr.d $a1, $a0, 14224 - st.d $a1, $sp, 8 # 8-byte Folded Spill + st.d $a1, $sp, 16 # 8-byte Folded Spill 
ld.w $s5, $a0, 12 ldptr.w $s6, $a0, 15548 ldptr.w $a0, $a0, 15544 - st.d $a0, $sp, 152 # 8-byte Folded Spill + st.d $a0, $sp, 136 # 8-byte Folded Spill pcalau12i $s0, %pc_hi20(getNeighbour) bltz $s6, .LBB18_3 # %bb.1: # %.lr.ph.preheader addi.d $a0, $s6, 1 bstrpick.d $fp, $a0, 31, 0 addi.w $s1, $zero, -1 - addi.d $s2, $sp, 168 + addi.d $s2, $sp, 152 move $s3, $s1 .p2align 4, , 16 .LBB18_2: # %.lr.ph @@ -9685,12 +9652,12 @@ IntraChromaRDDecision: # @IntraChromaRDDecision ld.d $a5, $s0, %pc_lo12(getNeighbour) addi.w $a2, $zero, -1 ori $a3, $zero, 1 - addi.d $a4, $sp, 576 + addi.d $a4, $sp, 560 move $a0, $s5 move $a1, $zero jirl $ra, $a5, 0 - ld.w $s7, $sp, 576 - ld.w $s8, $sp, 168 + ld.w $s7, $sp, 560 + ld.w $s8, $sp, 152 pcalau12i $a0, %got_pc_hi20(input) ld.d $a0, $a0, %got_pc_lo12(input) ld.d $a0, $a0, 0 @@ -9699,9 +9666,9 @@ IntraChromaRDDecision: # @IntraChromaRDDecision # %bb.4: beqz $s7, .LBB18_7 # %bb.5: - ld.d $a6, $sp, 160 # 8-byte Folded Reload + ld.d $a6, $sp, 144 # 8-byte Folded Reload ld.d $a0, $a6, 0 - ld.w $a1, $sp, 580 + ld.w $a1, $sp, 564 ldptr.d $a0, $a0, 14240 slli.d $a1, $a1, 2 ldx.w $s7, $a0, $a1 @@ -9711,13 +9678,13 @@ IntraChromaRDDecision: # @IntraChromaRDDecision bgtz $a0, .LBB18_8 b .LBB18_13 .LBB18_6: - ld.w $fp, $sp, 192 + ld.w $fp, $sp, 176 move $s3, $fp bgtz $s6, .LBB18_21 b .LBB18_23 .LBB18_7: move $s7, $zero - ld.d $a6, $sp, 160 # 8-byte Folded Reload + ld.d $a6, $sp, 144 # 8-byte Folded Reload srai.d $a0, $s6, 1 ori $fp, $zero, 1 ori $s3, $zero, 1 @@ -9725,7 +9692,7 @@ IntraChromaRDDecision: # @IntraChromaRDDecision .LBB18_8: # %.lr.ph129 ld.d $a1, $a6, 0 ori $s3, $zero, 1 - addi.d $a2, $sp, 196 + addi.d $a2, $sp, 180 move $a3, $a0 b .LBB18_11 .p2align 4, , 16 @@ -9752,7 +9719,7 @@ IntraChromaRDDecision: # @IntraChromaRDDecision bstrpick.d $a2, $a0, 31, 0 slli.d $a3, $a2, 4 alsl.d $a2, $a2, $a3, 3 - addi.d $a3, $sp, 168 + addi.d $a3, $sp, 152 add.d $a2, $a2, $a3 addi.d $a2, $a2, 28 ori $fp, $zero, 1 @@ -9780,7 +9747,7 @@ IntraChromaRDDecision: # @IntraChromaRDDecision beqz $s8, .LBB18_20 # %bb.19: ld.d $a0, $a6, 0 - ld.w $a1, $sp, 172 + ld.w $a1, $sp, 156 ldptr.d $a0, $a0, 14240 slli.d $a1, $a1, 2 ldx.w $s8, $a0, $a1 @@ -9791,7 +9758,7 @@ IntraChromaRDDecision: # @IntraChromaRDDecision blez $s6, .LBB18_23 .LBB18_21: # %.lr.ph138.preheader move $s1, $zero - addi.d $s2, $sp, 168 + addi.d $s2, $sp, 152 move $s4, $s6 .p2align 4, , 16 .LBB18_22: # %.lr.ph138 @@ -9808,10 +9775,10 @@ IntraChromaRDDecision: # @IntraChromaRDDecision addi.d $s2, $s2, 24 bnez $s4, .LBB18_22 .LBB18_23: # %._crit_edge139 - ld.d $a5, $sp, 160 # 8-byte Folded Reload + ld.d $a5, $sp, 144 # 8-byte Folded Reload ld.d $a0, $a5, 0 ldptr.w $a1, $a0, 15268 - ld.d $a6, $sp, 152 # 8-byte Folded Reload + ld.d $a6, $sp, 136 # 8-byte Folded Reload beqz $a1, .LBB18_33 # %bb.24: ldptr.w $a0, $a0, 14464 @@ -9827,7 +9794,7 @@ IntraChromaRDDecision: # @IntraChromaRDDecision .LBB18_28: # %vector.ph bstrpick.d $a0, $s6, 30, 1 slli.d $a0, $a0, 1 - addi.d $a1, $sp, 212 + addi.d $a1, $sp, 196 move $a2, $a0 .p2align 4, , 16 .LBB18_29: # %vector.body @@ -9846,7 +9813,7 @@ IntraChromaRDDecision: # @IntraChromaRDDecision .LBB18_31: # %.lr.ph141.preheader243 slli.d $a1, $a0, 4 alsl.d $a1, $a0, $a1, 3 - addi.d $a2, $sp, 168 + addi.d $a2, $sp, 152 add.d $a1, $a1, $a2 addi.d $a1, $a1, 20 sub.d $a0, $s6, $a0 @@ -9867,40 +9834,38 @@ IntraChromaRDDecision: # @IntraChromaRDDecision sltu $a2, $zero, $s3 sltu $a3, $zero, $fp and $a2, $a2, $a3 - st.d $a2, $sp, 24 # 8-byte Folded Spill + st.d $a2, $sp, 32 # 8-byte Folded Spill 
and $a1, $a2, $a1 sltu $a2, $zero, $s8 and $a1, $a1, $a2 - st.d $a1, $sp, 16 # 8-byte Folded Spill + st.d $a1, $sp, 24 # 8-byte Folded Spill slt $a1, $zero, $a6 and $a0, $a0, $a1 - st.d $a0, $sp, 32 # 8-byte Folded Spill + st.d $a0, $sp, 40 # 8-byte Folded Spill lu12i.w $a0, 524287 ori $a0, $a0, 4095 - st.d $a0, $sp, 112 # 8-byte Folded Spill + st.d $a0, $sp, 120 # 8-byte Folded Spill lu12i.w $a0, 2 ori $a0, $a0, 336 - st.d $a0, $sp, 88 # 8-byte Folded Spill + st.d $a0, $sp, 96 # 8-byte Folded Spill ori $a1, $zero, 2 pcalau12i $a0, %pc_hi20(imgUV_org) - st.d $a0, $sp, 80 # 8-byte Folded Spill - vrepli.b $vr2, 0 + st.d $a0, $sp, 88 # 8-byte Folded Spill ori $a2, $zero, 4 - st.d $s5, $sp, 48 # 8-byte Folded Spill - st.d $s6, $sp, 104 # 8-byte Folded Spill - st.d $s7, $sp, 40 # 8-byte Folded Spill - vst $vr2, $sp, 128 # 16-byte Folded Spill + st.d $s5, $sp, 56 # 8-byte Folded Spill + st.d $s6, $sp, 112 # 8-byte Folded Spill + st.d $s7, $sp, 48 # 8-byte Folded Spill b .LBB18_37 .LBB18_34: # in Loop: Header=BB18_37 Depth=1 move $s1, $zero .LBB18_35: # %.split165.us # in Loop: Header=BB18_37 Depth=1 - ld.d $a0, $sp, 56 # 8-byte Folded Reload + ld.d $a0, $sp, 64 # 8-byte Folded Reload fld.d $fa0, $a0, 24 pcalau12i $a0, %got_pc_hi20(mvbits) ld.d $a0, $a0, %got_pc_lo12(mvbits) ld.d $a0, $a0, 0 - ld.d $a7, $sp, 64 # 8-byte Folded Reload + ld.d $a7, $sp, 72 # 8-byte Folded Reload slli.d $a1, $a7, 2 ldx.w $a0, $a0, $a1 movgr2fr.w $fa1, $a0 @@ -9909,25 +9874,25 @@ IntraChromaRDDecision: # @IntraChromaRDDecision ftintrz.w.d $fa0, $fa0 movfr2gr.s $a0, $fa0 add.w $a0, $s1, $a0 - ld.d $a6, $sp, 112 # 8-byte Folded Reload + ld.d $a6, $sp, 120 # 8-byte Folded Reload slt $a1, $a0, $a6 - ld.d $a2, $sp, 72 # 8-byte Folded Reload + ld.d $a2, $sp, 80 # 8-byte Folded Reload masknez $a2, $a2, $a1 maskeqz $a3, $a7, $a1 or $a4, $a3, $a2 maskeqz $a0, $a0, $a1 masknez $a1, $a6, $a1 or $a0, $a0, $a1 - st.d $a0, $sp, 112 # 8-byte Folded Spill - ld.d $s5, $sp, 48 # 8-byte Folded Reload - ld.d $s7, $sp, 40 # 8-byte Folded Reload + st.d $a0, $sp, 120 # 8-byte Folded Spill + ld.d $s5, $sp, 56 # 8-byte Folded Reload + ld.d $s7, $sp, 48 # 8-byte Folded Reload ori $a1, $zero, 2 ori $a2, $zero, 4 .LBB18_36: # in Loop: Header=BB18_37 Depth=1 addi.d $a7, $a7, 1 - ld.d $a0, $sp, 88 # 8-byte Folded Reload + ld.d $a0, $sp, 96 # 8-byte Folded Reload addi.d $a0, $a0, 512 - st.d $a0, $sp, 88 # 8-byte Folded Spill + st.d $a0, $sp, 96 # 8-byte Folded Spill beq $a7, $a2, .LBB18_51 .LBB18_37: # =>This Loop Header: Depth=1 # Child Loop BB18_46 Depth 2 @@ -9943,16 +9908,16 @@ IntraChromaRDDecision: # @IntraChromaRDDecision ori $a0, $zero, 1 bne $a7, $a0, .LBB18_43 # %bb.41: # in Loop: Header=BB18_37 Depth=1 - ld.d $a0, $sp, 24 # 8-byte Folded Reload + ld.d $a0, $sp, 32 # 8-byte Folded Reload beqz $a0, .LBB18_36 b .LBB18_43 .LBB18_42: # in Loop: Header=BB18_37 Depth=1 - ld.d $a0, $sp, 16 # 8-byte Folded Reload + ld.d $a0, $sp, 24 # 8-byte Folded Reload beqz $a0, .LBB18_36 .LBB18_43: # in Loop: Header=BB18_37 Depth=1 - st.d $a7, $sp, 64 # 8-byte Folded Spill - st.d $a4, $sp, 72 # 8-byte Folded Spill - ld.d $a0, $sp, 32 # 8-byte Folded Reload + st.d $a7, $sp, 72 # 8-byte Folded Spill + st.d $a4, $sp, 80 # 8-byte Folded Spill + ld.d $a0, $sp, 40 # 8-byte Folded Reload beqz $a0, .LBB18_34 # %bb.44: # %.preheader121.lr.ph.us.us.preheader # in Loop: Header=BB18_37 Depth=1 @@ -9963,10 +9928,10 @@ IntraChromaRDDecision: # @IntraChromaRDDecision .p2align 4, , 16 .LBB18_45: # %._crit_edge152.split.us.us.us # in Loop: Header=BB18_46 Depth=2 - ld.d $a0, 
$sp, 112 # 8-byte Folded Reload + ld.d $a0, $sp, 120 # 8-byte Folded Reload slt $a0, $a0, $s1 xori $a0, $a0, 1 - ld.d $a1, $sp, 96 # 8-byte Folded Reload + ld.d $a1, $sp, 104 # 8-byte Folded Reload and $a1, $a0, $a1 ori $a0, $zero, 1 move $a2, $zero @@ -9976,15 +9941,15 @@ IntraChromaRDDecision: # @IntraChromaRDDecision # => This Loop Header: Depth=2 # Child Loop BB18_47 Depth 3 # Child Loop BB18_48 Depth 4 - st.d $a2, $sp, 96 # 8-byte Folded Spill - ld.d $a1, $sp, 80 # 8-byte Folded Reload + st.d $a2, $sp, 104 # 8-byte Folded Spill + ld.d $a1, $sp, 88 # 8-byte Folded Reload ld.d $a1, $a1, %pc_lo12(imgUV_org) slli.d $a2, $a0, 3 ldx.d $s2, $a1, $a2 move $s4, $zero move $a2, $zero slli.d $a0, $a0, 11 - ld.d $a1, $sp, 88 # 8-byte Folded Reload + ld.d $a1, $sp, 96 # 8-byte Folded Reload add.d $s6, $a1, $a0 .p2align 4, , 16 .LBB18_47: # %.preheader121.us.us.us @@ -9992,12 +9957,12 @@ IntraChromaRDDecision: # @IntraChromaRDDecision # Parent Loop BB18_46 Depth=2 # => This Loop Header: Depth=3 # Child Loop BB18_48 Depth 4 - st.d $a2, $sp, 120 # 8-byte Folded Spill + st.d $a2, $sp, 128 # 8-byte Folded Spill move $s8, $zero move $fp, $zero ori $a1, $zero, 24 mul.d $a0, $s4, $a1 - addi.d $a2, $sp, 168 + addi.d $a2, $sp, 152 add.d $s0, $a2, $a0 addi.d $a0, $s4, 1 mul.d $a0, $a0, $a1 @@ -10025,9 +9990,9 @@ IntraChromaRDDecision: # @IntraChromaRDDecision ldx.d $a2, $a1, $s8 add.d $a1, $a1, $s8 vinsgr2vr.d $vr0, $a0, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a2, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 pcalau12i $a0, %pc_hi20(diff) addi.d $a0, $a0, %pc_lo12(diff) @@ -10044,9 +10009,9 @@ IntraChromaRDDecision: # @IntraChromaRDDecision slli.d $a3, $a3, 3 ldx.d $a3, $s2, $a3 ld.w $a4, $s5, 16 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a2, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 alsl.d $a2, $a4, $a3, 1 ldx.d $a2, $a2, $s8 vsub.w $vr0, $vr0, $vr1 @@ -10058,59 +10023,58 @@ IntraChromaRDDecision: # @IntraChromaRDDecision ldx.d $a3, $s2, $a3 ld.w $a4, $s7, 16 vinsgr2vr.d $vr1, $a2, 0 - vilvl.h $vr0, $vr2, $vr0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.wu.hu $vr1, $vr1, 0 alsl.d $a2, $a4, $a3, 1 ldx.d $a2, $a2, $s8 vsub.w $vr0, $vr0, $vr1 ld.d $a1, $a1, 96 vst $vr0, $a0, 32 vinsgr2vr.d $vr0, $a2, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a1, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 vst $vr0, $a0, 48 pcaddu18i $ra, %call36(distortion4x4) jirl $ra, $ra, 0 - vld $vr2, $sp, 128 # 16-byte Folded Reload - ld.d $a5, $sp, 160 # 8-byte Folded Reload + ld.d $a5, $sp, 144 # 8-byte Folded Reload add.w $s1, $a0, $s1 addi.w $fp, $fp, 4 addi.d $s8, $s8, 8 - ld.d $a0, $sp, 152 # 8-byte Folded Reload + ld.d $a0, $sp, 136 # 8-byte Folded Reload blt $fp, $a0, .LBB18_48 # %bb.49: # %._crit_edge148.us.us.us # in Loop: Header=BB18_47 Depth=3 - ld.d $a0, $sp, 112 # 8-byte Folded Reload - ld.d $a2, $sp, 120 # 8-byte Folded Reload + ld.d $a0, $sp, 120 # 8-byte Folded Reload + ld.d $a2, $sp, 128 # 8-byte Folded Reload blt $a0, $s1, .LBB18_45 # %bb.50: # %._crit_edge148.us.us.us # in Loop: Header=BB18_47 Depth=3 addi.w $a2, $a2, 4 addi.d $s4, $s4, 4 addi.d $s6, $s6, 128 - ld.d $a0, $sp, 104 # 8-byte Folded Reload + ld.d $a0, $sp, 112 # 8-byte Folded Reload blt $a2, $a0, .LBB18_47 b .LBB18_45 .LBB18_51: ori $a0, $zero, 536 mul.d $a0, $s5, $a0 - ld.d $a1, $sp, 8 # 8-byte Folded Reload + ld.d $a1, $sp, 16 # 8-byte 
Folded Reload add.d $a0, $a1, $a0 st.w $a4, $a0, 416 - ld.d $s8, $sp, 600 # 8-byte Folded Reload - ld.d $s7, $sp, 608 # 8-byte Folded Reload - ld.d $s6, $sp, 616 # 8-byte Folded Reload - ld.d $s5, $sp, 624 # 8-byte Folded Reload - ld.d $s4, $sp, 632 # 8-byte Folded Reload - ld.d $s3, $sp, 640 # 8-byte Folded Reload - ld.d $s2, $sp, 648 # 8-byte Folded Reload - ld.d $s1, $sp, 656 # 8-byte Folded Reload - ld.d $s0, $sp, 664 # 8-byte Folded Reload - ld.d $fp, $sp, 672 # 8-byte Folded Reload - ld.d $ra, $sp, 680 # 8-byte Folded Reload - addi.d $sp, $sp, 688 + ld.d $s8, $sp, 584 # 8-byte Folded Reload + ld.d $s7, $sp, 592 # 8-byte Folded Reload + ld.d $s6, $sp, 600 # 8-byte Folded Reload + ld.d $s5, $sp, 608 # 8-byte Folded Reload + ld.d $s4, $sp, 616 # 8-byte Folded Reload + ld.d $s3, $sp, 624 # 8-byte Folded Reload + ld.d $s2, $sp, 632 # 8-byte Folded Reload + ld.d $s1, $sp, 640 # 8-byte Folded Reload + ld.d $s0, $sp, 648 # 8-byte Folded Reload + ld.d $fp, $sp, 656 # 8-byte Folded Reload + ld.d $ra, $sp, 664 # 8-byte Folded Reload + addi.d $sp, $sp, 672 ret .Lfunc_end18: .size IntraChromaRDDecision, .Lfunc_end18-IntraChromaRDDecision diff --git a/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/me_distortion.s b/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/me_distortion.s index cfc44968..0a032161 100644 --- a/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/me_distortion.s +++ b/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/me_distortion.s @@ -279,18 +279,12 @@ distortion8x8: # @distortion8x8 vldx $vr2, $a0, $a2 add.d $a4, $a0, $a2 vld $vr3, $a4, 16 - vshuf4i.w $vr4, $vr2, 50 - vslli.d $vr4, $vr4, 32 - vsrai.d $vr4, $vr4, 32 - vshuf4i.w $vr2, $vr2, 16 - vslli.d $vr2, $vr2, 32 - vsrai.d $vr2, $vr2, 32 - vshuf4i.w $vr5, $vr3, 50 - vslli.d $vr5, $vr5, 32 - vsrai.d $vr5, $vr5, 32 - vshuf4i.w $vr3, $vr3, 16 - vslli.d $vr3, $vr3, 32 - vsrai.d $vr3, $vr3, 32 + vshuf4i.w $vr4, $vr2, 14 + vsllwil.d.w $vr4, $vr4, 0 + vsllwil.d.w $vr2, $vr2, 0 + vshuf4i.w $vr5, $vr3, 14 + vsllwil.d.w $vr5, $vr5, 0 + vsllwil.d.w $vr3, $vr3, 0 vpickve2gr.d $a4, $vr2, 0 slli.d $a4, $a4, 2 vpickve2gr.d $a5, $vr2, 1 @@ -343,18 +337,12 @@ distortion8x8: # @distortion8x8 vldx $vr2, $a0, $a2 add.d $a4, $a0, $a2 vld $vr3, $a4, 16 - vshuf4i.w $vr4, $vr2, 50 - vslli.d $vr4, $vr4, 32 - vsrai.d $vr4, $vr4, 32 - vshuf4i.w $vr2, $vr2, 16 - vslli.d $vr2, $vr2, 32 - vsrai.d $vr2, $vr2, 32 - vshuf4i.w $vr5, $vr3, 50 - vslli.d $vr5, $vr5, 32 - vsrai.d $vr5, $vr5, 32 - vshuf4i.w $vr3, $vr3, 16 - vslli.d $vr3, $vr3, 32 - vsrai.d $vr3, $vr3, 32 + vshuf4i.w $vr4, $vr2, 14 + vsllwil.d.w $vr4, $vr4, 0 + vsllwil.d.w $vr2, $vr2, 0 + vshuf4i.w $vr5, $vr3, 14 + vsllwil.d.w $vr5, $vr5, 0 + vsllwil.d.w $vr3, $vr3, 0 vpickve2gr.d $a4, $vr2, 0 slli.d $a4, $a4, 2 vpickve2gr.d $a5, $vr2, 1 @@ -1388,8 +1376,8 @@ computeSADWP: # @computeSADWP vinsgr2vr.h $vr9, $s2, 1 vinsgr2vr.h $vr9, $s6, 2 vinsgr2vr.h $vr9, $ra, 3 - vilvl.h $vr8, $vr5, $vr8 - vilvl.h $vr9, $vr5, $vr9 + vsllwil.wu.hu $vr8, $vr8, 0 + vsllwil.wu.hu $vr9, $vr9, 0 vori.b $vr10, $vr1, 0 vmadd.w $vr10, $vr0, $vr8 vori.b $vr8, $vr1, 0 @@ -1418,22 +1406,16 @@ computeSADWP: # @computeSADWP vinsgr2vr.h $vr11, $s2, 1 vinsgr2vr.h $vr11, $s6, 2 vinsgr2vr.h $vr11, $ra, 3 - vilvl.h $vr10, $vr5, $vr10 - vilvl.h $vr11, $vr5, $vr11 + vsllwil.wu.hu $vr10, $vr10, 0 + vsllwil.wu.hu $vr11, $vr11, 0 vsub.w $vr9, $vr10, $vr9 vsub.w $vr8, $vr11, $vr8 - vshuf4i.w $vr10, $vr9, 50 - vslli.d $vr10, $vr10, 32 - vsrai.d $vr10, $vr10, 32 - vshuf4i.w 
$vr9, $vr9, 16 - vslli.d $vr9, $vr9, 32 - vsrai.d $vr9, $vr9, 32 - vshuf4i.w $vr11, $vr8, 50 - vslli.d $vr11, $vr11, 32 - vsrai.d $vr11, $vr11, 32 - vshuf4i.w $vr8, $vr8, 16 - vslli.d $vr8, $vr8, 32 - vsrai.d $vr8, $vr8, 32 + vshuf4i.w $vr10, $vr9, 14 + vsllwil.d.w $vr10, $vr10, 0 + vsllwil.d.w $vr9, $vr9, 0 + vshuf4i.w $vr11, $vr8, 14 + vsllwil.d.w $vr11, $vr11, 0 + vsllwil.d.w $vr8, $vr8, 0 vpickve2gr.d $s1, $vr9, 0 slli.d $s1, $s1, 2 vpickve2gr.d $s2, $vr9, 1 @@ -1484,8 +1466,8 @@ computeSADWP: # @computeSADWP vinsgr2vr.h $vr9, $s1, 1 vinsgr2vr.h $vr9, $s2, 2 vinsgr2vr.h $vr9, $s5, 3 - vilvl.h $vr8, $vr5, $vr8 - vilvl.h $vr9, $vr5, $vr9 + vsllwil.wu.hu $vr8, $vr8, 0 + vsllwil.wu.hu $vr9, $vr9, 0 vori.b $vr10, $vr1, 0 vmadd.w $vr10, $vr0, $vr8 vori.b $vr8, $vr1, 0 @@ -1514,22 +1496,16 @@ computeSADWP: # @computeSADWP vinsgr2vr.h $vr11, $s1, 1 vinsgr2vr.h $vr11, $s2, 2 vinsgr2vr.h $vr11, $s5, 3 - vilvl.h $vr10, $vr5, $vr10 - vilvl.h $vr11, $vr5, $vr11 + vsllwil.wu.hu $vr10, $vr10, 0 + vsllwil.wu.hu $vr11, $vr11, 0 vsub.w $vr9, $vr10, $vr9 vsub.w $vr8, $vr11, $vr8 - vshuf4i.w $vr10, $vr9, 50 - vslli.d $vr10, $vr10, 32 - vsrai.d $vr10, $vr10, 32 - vshuf4i.w $vr9, $vr9, 16 - vslli.d $vr9, $vr9, 32 - vsrai.d $vr9, $vr9, 32 - vshuf4i.w $vr11, $vr8, 50 - vslli.d $vr11, $vr11, 32 - vsrai.d $vr11, $vr11, 32 - vshuf4i.w $vr8, $vr8, 16 - vslli.d $vr8, $vr8, 32 - vsrai.d $vr8, $vr8, 32 + vshuf4i.w $vr10, $vr9, 14 + vsllwil.d.w $vr10, $vr10, 0 + vsllwil.d.w $vr9, $vr9, 0 + vshuf4i.w $vr11, $vr8, 14 + vsllwil.d.w $vr11, $vr11, 0 + vsllwil.d.w $vr8, $vr8, 0 vpickve2gr.d $t5, $vr9, 0 slli.d $t5, $t5, 2 vpickve2gr.d $s1, $vr9, 1 @@ -1580,8 +1556,8 @@ computeSADWP: # @computeSADWP vinsgr2vr.h $vr9, $s1, 1 vinsgr2vr.h $vr9, $s2, 2 vinsgr2vr.h $vr9, $s5, 3 - vilvl.h $vr8, $vr5, $vr8 - vilvl.h $vr9, $vr5, $vr9 + vsllwil.wu.hu $vr8, $vr8, 0 + vsllwil.wu.hu $vr9, $vr9, 0 vori.b $vr10, $vr1, 0 vmadd.w $vr10, $vr0, $vr8 vori.b $vr8, $vr1, 0 @@ -1610,22 +1586,16 @@ computeSADWP: # @computeSADWP vinsgr2vr.h $vr11, $s1, 1 vinsgr2vr.h $vr11, $s2, 2 vinsgr2vr.h $vr11, $s5, 3 - vilvl.h $vr10, $vr5, $vr10 - vilvl.h $vr11, $vr5, $vr11 + vsllwil.wu.hu $vr10, $vr10, 0 + vsllwil.wu.hu $vr11, $vr11, 0 vsub.w $vr9, $vr10, $vr9 vsub.w $vr8, $vr11, $vr8 - vshuf4i.w $vr10, $vr9, 50 - vslli.d $vr10, $vr10, 32 - vsrai.d $vr10, $vr10, 32 - vshuf4i.w $vr9, $vr9, 16 - vslli.d $vr9, $vr9, 32 - vsrai.d $vr9, $vr9, 32 - vshuf4i.w $vr11, $vr8, 50 - vslli.d $vr11, $vr11, 32 - vsrai.d $vr11, $vr11, 32 - vshuf4i.w $vr8, $vr8, 16 - vslli.d $vr8, $vr8, 32 - vsrai.d $vr8, $vr8, 32 + vshuf4i.w $vr10, $vr9, 14 + vsllwil.d.w $vr10, $vr10, 0 + vsllwil.d.w $vr9, $vr9, 0 + vshuf4i.w $vr11, $vr8, 14 + vsllwil.d.w $vr11, $vr11, 0 + vsllwil.d.w $vr8, $vr8, 0 vpickve2gr.d $t5, $vr9, 0 slli.d $t5, $t5, 2 vpickve2gr.d $s1, $vr9, 1 @@ -1676,8 +1646,8 @@ computeSADWP: # @computeSADWP vinsgr2vr.h $vr9, $s1, 1 vinsgr2vr.h $vr9, $s2, 2 vinsgr2vr.h $vr9, $s5, 3 - vilvl.h $vr8, $vr5, $vr8 - vilvl.h $vr9, $vr5, $vr9 + vsllwil.wu.hu $vr8, $vr8, 0 + vsllwil.wu.hu $vr9, $vr9, 0 vori.b $vr10, $vr1, 0 vmadd.w $vr10, $vr0, $vr8 vori.b $vr8, $vr1, 0 @@ -1706,22 +1676,16 @@ computeSADWP: # @computeSADWP vinsgr2vr.h $vr11, $s1, 1 vinsgr2vr.h $vr11, $s2, 2 vinsgr2vr.h $vr11, $s5, 3 - vilvl.h $vr10, $vr5, $vr10 - vilvl.h $vr11, $vr5, $vr11 + vsllwil.wu.hu $vr10, $vr10, 0 + vsllwil.wu.hu $vr11, $vr11, 0 vsub.w $vr9, $vr10, $vr9 vsub.w $vr8, $vr11, $vr8 - vshuf4i.w $vr10, $vr9, 50 - vslli.d $vr10, $vr10, 32 - vsrai.d $vr10, $vr10, 32 - vshuf4i.w $vr9, $vr9, 16 - vslli.d 
$vr9, $vr9, 32 - vsrai.d $vr9, $vr9, 32 - vshuf4i.w $vr11, $vr8, 50 - vslli.d $vr11, $vr11, 32 - vsrai.d $vr11, $vr11, 32 - vshuf4i.w $vr8, $vr8, 16 - vslli.d $vr8, $vr8, 32 - vsrai.d $vr8, $vr8, 32 + vshuf4i.w $vr10, $vr9, 14 + vsllwil.d.w $vr10, $vr10, 0 + vsllwil.d.w $vr9, $vr9, 0 + vshuf4i.w $vr11, $vr8, 14 + vsllwil.d.w $vr11, $vr11, 0 + vsllwil.d.w $vr8, $vr8, 0 vpickve2gr.d $t5, $vr9, 0 slli.d $t5, $t5, 2 vpickve2gr.d $s1, $vr9, 1 @@ -2003,8 +1967,8 @@ computeSADWP: # @computeSADWP vinsgr2vr.h $vr9, $t8, 1 vinsgr2vr.h $vr9, $s0, 2 vinsgr2vr.h $vr9, $s1, 3 - vilvl.h $vr8, $vr5, $vr8 - vilvl.h $vr9, $vr5, $vr9 + vsllwil.wu.hu $vr8, $vr8, 0 + vsllwil.wu.hu $vr9, $vr9, 0 vori.b $vr10, $vr1, 0 vmadd.w $vr10, $vr0, $vr8 vori.b $vr8, $vr1, 0 @@ -2033,22 +1997,16 @@ computeSADWP: # @computeSADWP vinsgr2vr.h $vr11, $t8, 1 vinsgr2vr.h $vr11, $s0, 2 vinsgr2vr.h $vr11, $s1, 3 - vilvl.h $vr10, $vr5, $vr10 - vilvl.h $vr11, $vr5, $vr11 + vsllwil.wu.hu $vr10, $vr10, 0 + vsllwil.wu.hu $vr11, $vr11, 0 vsub.w $vr9, $vr10, $vr9 vsub.w $vr8, $vr11, $vr8 - vshuf4i.w $vr10, $vr9, 50 - vslli.d $vr10, $vr10, 32 - vsrai.d $vr10, $vr10, 32 - vshuf4i.w $vr9, $vr9, 16 - vslli.d $vr9, $vr9, 32 - vsrai.d $vr9, $vr9, 32 - vshuf4i.w $vr11, $vr8, 50 - vslli.d $vr11, $vr11, 32 - vsrai.d $vr11, $vr11, 32 - vshuf4i.w $vr8, $vr8, 16 - vslli.d $vr8, $vr8, 32 - vsrai.d $vr8, $vr8, 32 + vshuf4i.w $vr10, $vr9, 14 + vsllwil.d.w $vr10, $vr10, 0 + vsllwil.d.w $vr9, $vr9, 0 + vshuf4i.w $vr11, $vr8, 14 + vsllwil.d.w $vr11, $vr11, 0 + vsllwil.d.w $vr8, $vr8, 0 vpickve2gr.d $t7, $vr9, 0 slli.d $t7, $t7, 2 vpickve2gr.d $t8, $vr9, 1 @@ -2099,8 +2057,8 @@ computeSADWP: # @computeSADWP vinsgr2vr.h $vr9, $t8, 1 vinsgr2vr.h $vr9, $s0, 2 vinsgr2vr.h $vr9, $s1, 3 - vilvl.h $vr8, $vr5, $vr8 - vilvl.h $vr9, $vr5, $vr9 + vsllwil.wu.hu $vr8, $vr8, 0 + vsllwil.wu.hu $vr9, $vr9, 0 vori.b $vr10, $vr1, 0 vmadd.w $vr10, $vr0, $vr8 vori.b $vr8, $vr1, 0 @@ -2129,22 +2087,16 @@ computeSADWP: # @computeSADWP vinsgr2vr.h $vr11, $t8, 1 vinsgr2vr.h $vr11, $s0, 2 vinsgr2vr.h $vr11, $s1, 3 - vilvl.h $vr10, $vr5, $vr10 - vilvl.h $vr11, $vr5, $vr11 + vsllwil.wu.hu $vr10, $vr10, 0 + vsllwil.wu.hu $vr11, $vr11, 0 vsub.w $vr9, $vr10, $vr9 vsub.w $vr8, $vr11, $vr8 - vshuf4i.w $vr10, $vr9, 50 - vslli.d $vr10, $vr10, 32 - vsrai.d $vr10, $vr10, 32 - vshuf4i.w $vr9, $vr9, 16 - vslli.d $vr9, $vr9, 32 - vsrai.d $vr9, $vr9, 32 - vshuf4i.w $vr11, $vr8, 50 - vslli.d $vr11, $vr11, 32 - vsrai.d $vr11, $vr11, 32 - vshuf4i.w $vr8, $vr8, 16 - vslli.d $vr8, $vr8, 32 - vsrai.d $vr8, $vr8, 32 + vshuf4i.w $vr10, $vr9, 14 + vsllwil.d.w $vr10, $vr10, 0 + vsllwil.d.w $vr9, $vr9, 0 + vshuf4i.w $vr11, $vr8, 14 + vsllwil.d.w $vr11, $vr11, 0 + vsllwil.d.w $vr8, $vr8, 0 vpickve2gr.d $t7, $vr9, 0 slli.d $t7, $t7, 2 vpickve2gr.d $t8, $vr9, 1 @@ -2381,8 +2333,8 @@ computeSADWP: # @computeSADWP vinsgr2vr.h $vr9, $t8, 1 vinsgr2vr.h $vr9, $s0, 2 vinsgr2vr.h $vr9, $s1, 3 - vilvl.h $vr8, $vr5, $vr8 - vilvl.h $vr9, $vr5, $vr9 + vsllwil.wu.hu $vr8, $vr8, 0 + vsllwil.wu.hu $vr9, $vr9, 0 vori.b $vr10, $vr1, 0 vmadd.w $vr10, $vr0, $vr8 vori.b $vr8, $vr1, 0 @@ -2411,22 +2363,16 @@ computeSADWP: # @computeSADWP vinsgr2vr.h $vr11, $t8, 1 vinsgr2vr.h $vr11, $s0, 2 vinsgr2vr.h $vr11, $s1, 3 - vilvl.h $vr10, $vr5, $vr10 - vilvl.h $vr11, $vr5, $vr11 + vsllwil.wu.hu $vr10, $vr10, 0 + vsllwil.wu.hu $vr11, $vr11, 0 vsub.w $vr9, $vr10, $vr9 vsub.w $vr8, $vr11, $vr8 - vshuf4i.w $vr10, $vr9, 50 - vslli.d $vr10, $vr10, 32 - vsrai.d $vr10, $vr10, 32 - vshuf4i.w $vr9, $vr9, 16 - vslli.d $vr9, $vr9, 32 - vsrai.d 
$vr9, $vr9, 32 - vshuf4i.w $vr11, $vr8, 50 - vslli.d $vr11, $vr11, 32 - vsrai.d $vr11, $vr11, 32 - vshuf4i.w $vr8, $vr8, 16 - vslli.d $vr8, $vr8, 32 - vsrai.d $vr8, $vr8, 32 + vshuf4i.w $vr10, $vr9, 14 + vsllwil.d.w $vr10, $vr10, 0 + vsllwil.d.w $vr9, $vr9, 0 + vshuf4i.w $vr11, $vr8, 14 + vsllwil.d.w $vr11, $vr11, 0 + vsllwil.d.w $vr8, $vr8, 0 vpickve2gr.d $t7, $vr9, 0 slli.d $t7, $t7, 2 vpickve2gr.d $t8, $vr9, 1 @@ -2477,8 +2423,8 @@ computeSADWP: # @computeSADWP vinsgr2vr.h $vr9, $t8, 1 vinsgr2vr.h $vr9, $s0, 2 vinsgr2vr.h $vr9, $s1, 3 - vilvl.h $vr8, $vr5, $vr8 - vilvl.h $vr9, $vr5, $vr9 + vsllwil.wu.hu $vr8, $vr8, 0 + vsllwil.wu.hu $vr9, $vr9, 0 vori.b $vr10, $vr1, 0 vmadd.w $vr10, $vr0, $vr8 vori.b $vr8, $vr1, 0 @@ -2507,22 +2453,16 @@ computeSADWP: # @computeSADWP vinsgr2vr.h $vr11, $t8, 1 vinsgr2vr.h $vr11, $s0, 2 vinsgr2vr.h $vr11, $s1, 3 - vilvl.h $vr10, $vr5, $vr10 - vilvl.h $vr11, $vr5, $vr11 + vsllwil.wu.hu $vr10, $vr10, 0 + vsllwil.wu.hu $vr11, $vr11, 0 vsub.w $vr9, $vr10, $vr9 vsub.w $vr8, $vr11, $vr8 - vshuf4i.w $vr10, $vr9, 50 - vslli.d $vr10, $vr10, 32 - vsrai.d $vr10, $vr10, 32 - vshuf4i.w $vr9, $vr9, 16 - vslli.d $vr9, $vr9, 32 - vsrai.d $vr9, $vr9, 32 - vshuf4i.w $vr11, $vr8, 50 - vslli.d $vr11, $vr11, 32 - vsrai.d $vr11, $vr11, 32 - vshuf4i.w $vr8, $vr8, 16 - vslli.d $vr8, $vr8, 32 - vsrai.d $vr8, $vr8, 32 + vshuf4i.w $vr10, $vr9, 14 + vsllwil.d.w $vr10, $vr10, 0 + vsllwil.d.w $vr9, $vr9, 0 + vshuf4i.w $vr11, $vr8, 14 + vsllwil.d.w $vr11, $vr11, 0 + vsllwil.d.w $vr8, $vr8, 0 vpickve2gr.d $t7, $vr9, 0 slli.d $t7, $t7, 2 vpickve2gr.d $t8, $vr9, 1 @@ -2696,16 +2636,16 @@ computeBiPredSAD1: # @computeBiPredSAD1 st.d $a6, $sp, 72 # 8-byte Folded Spill move $a2, $a6 jirl $ra, $a3, 0 - pcalau12i $s3, %pc_hi20(bipred1_access_method) - ld.w $a1, $s3, %pc_lo12(bipred1_access_method) + pcalau12i $a1, %pc_hi20(bipred1_access_method) + st.d $a1, $sp, 56 # 8-byte Folded Spill + ld.w $a1, $a1, %pc_lo12(bipred1_access_method) slli.d $a1, $a1, 3 ldx.d $a3, $s0, $a1 pcalau12i $s2, %pc_hi20(ref2_line) st.d $a0, $s2, %pc_lo12(ref2_line) pcalau12i $a0, %pc_hi20(ref_pic1_sub) - addi.d $a0, $a0, %pc_lo12(ref_pic1_sub) - st.d $a0, $sp, 56 # 8-byte Folded Spill - ld.d $a0, $a0, 0 + addi.d $s3, $a0, %pc_lo12(ref_pic1_sub) + ld.d $a0, $s3, 0 st.d $s5, $sp, 80 # 8-byte Folded Spill move $a1, $s5 st.d $s1, $sp, 88 # 8-byte Folded Spill @@ -2715,7 +2655,7 @@ computeBiPredSAD1: # @computeBiPredSAD1 st.d $a0, $s5, %pc_lo12(ref1_line) blez $s7, .LBB7_8 # %bb.1: # %.preheader67.lr.ph - ld.d $a4, $s2, %pc_lo12(ref2_line) + ld.d $a5, $s2, %pc_lo12(ref2_line) blez $s6, .LBB7_9 # %bb.2: # %.preheader67.us.preheader pcalau12i $a1, %got_pc_hi20(byte_abs) @@ -2723,51 +2663,50 @@ computeBiPredSAD1: # @computeBiPredSAD1 move $s0, $zero move $a1, $zero ld.d $a2, $a2, 0 - ld.d $a5, $s8, %pc_lo12(src_line) + ld.d $a4, $s8, %pc_lo12(src_line) slli.d $a3, $s4, 1 slli.d $a6, $s6, 1 sub.d $a3, $a3, $a6 - vrepli.b $vr0, 0 .p2align 4, , 16 .LBB7_3: # %.preheader67.us # =>This Loop Header: Depth=1 # Child Loop BB7_4 Depth 2 move $a6, $zero move $a7, $a0 - move $t0, $a4 + move $t0, $a5 .p2align 4, , 16 .LBB7_4: # Parent Loop BB7_3 Depth=1 # => This Inner Loop Header: Depth=2 - ld.d $a0, $a5, 0 - ld.d $a4, $a7, 0 + ld.d $a0, $a4, 0 + ld.d $a5, $a7, 0 ld.d $t1, $t0, 0 - vinsgr2vr.d $vr1, $a0, 0 - vilvl.h $vr1, $vr0, $vr1 - vinsgr2vr.d $vr2, $a4, 0 - vinsgr2vr.d $vr3, $t1, 0 - vor.v $vr4, $vr2, $vr3 - vxor.v $vr2, $vr2, $vr3 - vsrli.h $vr2, $vr2, 1 - vsub.h $vr2, $vr4, $vr2 - vilvl.h $vr2, $vr0, $vr2 - vsub.w $vr1, $vr1, 
$vr2 - vpickve2gr.w $a0, $vr1, 0 + vinsgr2vr.d $vr0, $a0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vinsgr2vr.d $vr1, $a5, 0 + vinsgr2vr.d $vr2, $t1, 0 + vor.v $vr3, $vr1, $vr2 + vxor.v $vr1, $vr1, $vr2 + vsrli.h $vr1, $vr1, 1 + vsub.h $vr1, $vr3, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 + vsub.w $vr0, $vr0, $vr1 + vpickve2gr.w $a0, $vr0, 0 slli.d $a0, $a0, 2 ldx.w $a0, $a2, $a0 - addi.d $a5, $a5, 8 + addi.d $a4, $a4, 8 addi.d $a7, $a7, 8 addi.d $t0, $t0, 8 add.d $a0, $a0, $s0 - vpickve2gr.w $a4, $vr1, 1 - slli.d $a4, $a4, 2 - ldx.w $a4, $a2, $a4 - vpickve2gr.w $t1, $vr1, 2 + vpickve2gr.w $a5, $vr0, 1 + slli.d $a5, $a5, 2 + ldx.w $a5, $a2, $a5 + vpickve2gr.w $t1, $vr0, 2 slli.d $t1, $t1, 2 ldx.w $t1, $a2, $t1 - vpickve2gr.w $t2, $vr1, 3 + vpickve2gr.w $t2, $vr0, 3 slli.d $t2, $t2, 2 ldx.w $t2, $a2, $t2 - add.d $a0, $a0, $a4 + add.d $a0, $a0, $a5 add.d $a0, $a0, $t1 addi.w $a6, $a6, 4 add.w $s0, $a0, $t2 @@ -2776,7 +2715,7 @@ computeBiPredSAD1: # @computeBiPredSAD1 # in Loop: Header=BB7_3 Depth=1 bge $s0, $fp, .LBB7_19 # %bb.6: # in Loop: Header=BB7_3 Depth=1 - add.d $a4, $t0, $a3 + add.d $a5, $t0, $a3 addi.w $a1, $a1, 1 add.d $a0, $a7, $a3 bne $a1, $s7, .LBB7_3 @@ -2785,7 +2724,7 @@ computeBiPredSAD1: # @computeBiPredSAD1 st.d $a0, $s2, %pc_lo12(ref2_line) add.d $a0, $a7, $a3 st.d $a0, $s5, %pc_lo12(ref1_line) - st.d $a5, $s8, %pc_lo12(src_line) + st.d $a4, $s8, %pc_lo12(src_line) pcalau12i $a0, %pc_hi20(ChromaMEEnable) ld.w $a0, $a0, %pc_lo12(ChromaMEEnable) bnez $a0, .LBB7_11 @@ -2807,7 +2746,7 @@ computeBiPredSAD1: # @computeBiPredSAD1 mul.d $a1, $a1, $a3 alsl.d $a1, $s4, $a1, 1 sub.d $a1, $a1, $a2 - add.d $a2, $a4, $a1 + add.d $a2, $a5, $a1 add.d $a0, $a0, $a1 st.d $a2, $s2, %pc_lo12(ref2_line) st.d $a0, $s5, %pc_lo12(ref1_line) @@ -2847,12 +2786,12 @@ computeBiPredSAD1: # @computeBiPredSAD1 ld.d $a1, $sp, 64 # 8-byte Folded Reload ld.d $a2, $sp, 72 # 8-byte Folded Reload jirl $ra, $a3, 0 - ld.w $a1, $s3, %pc_lo12(bipred1_access_method) + ld.d $a1, $sp, 56 # 8-byte Folded Reload + ld.w $a1, $a1, %pc_lo12(bipred1_access_method) slli.d $a1, $a1, 3 st.d $s1, $sp, 32 # 8-byte Folded Spill ldx.d $a3, $s1, $a1 - ld.d $a1, $sp, 56 # 8-byte Folded Reload - ld.d $a1, $a1, 8 + ld.d $a1, $s3, 8 st.d $a0, $s2, %pc_lo12(ref2_line) move $a0, $a1 ld.d $a1, $sp, 80 # 8-byte Folded Reload @@ -2918,14 +2857,12 @@ computeBiPredSAD1: # @computeBiPredSAD1 .LBB7_19: # %.loopexit68.split.us st.d $t0, $s2, %pc_lo12(ref2_line) st.d $a7, $s5, %pc_lo12(ref1_line) - st.d $a5, $s8, %pc_lo12(src_line) + st.d $a4, $s8, %pc_lo12(src_line) b .LBB7_33 .LBB7_20: # %.split.preheader ld.d $s1, $sp, 48 # 8-byte Folded Reload ld.w $a0, $s1, %pc_lo12(bipred2_access_method) st.d $s5, $sp, 32 # 8-byte Folded Spill - move $s4, $s3 - st.d $s3, $sp, 16 # 8-byte Folded Spill ld.d $a1, $sp, 96 # 8-byte Folded Reload addi.d $a1, $a1, 512 st.d $a1, $s8, %pc_lo12(src_line) @@ -2933,20 +2870,23 @@ computeBiPredSAD1: # @computeBiPredSAD1 pcalau12i $a1, %pc_hi20(get_crline) addi.d $fp, $a1, %pc_lo12(get_crline) ldx.d $a3, $fp, $a0 - ld.d $s3, $sp, 40 # 8-byte Folded Reload - ld.d $a0, $s3, 8 + move $s4, $s2 + st.d $s2, $sp, 16 # 8-byte Folded Spill + ld.d $s2, $sp, 40 # 8-byte Folded Reload + ld.d $a0, $s2, 8 ld.d $s7, $sp, 64 # 8-byte Folded Reload move $a1, $s7 ld.d $s6, $sp, 72 # 8-byte Folded Reload move $a2, $s6 jirl $ra, $a3, 0 - ld.w $a1, $s4, %pc_lo12(bipred1_access_method) + st.d $s8, $sp, 24 # 8-byte Folded Spill + move $s8, $s3 + ld.d $s3, $sp, 56 # 8-byte Folded Reload + ld.w $a1, $s3, %pc_lo12(bipred1_access_method) slli.d $a1, $a1, 3 
ldx.d $a3, $fp, $a1 - st.d $s8, $sp, 24 # 8-byte Folded Spill - ld.d $s8, $sp, 56 # 8-byte Folded Reload ld.d $a1, $s8, 8 - st.d $a0, $s2, %pc_lo12(ref2_line) + st.d $a0, $s4, %pc_lo12(ref2_line) move $a0, $a1 ld.d $s5, $sp, 80 # 8-byte Folded Reload move $a1, $s5 @@ -2956,7 +2896,7 @@ computeBiPredSAD1: # @computeBiPredSAD1 ld.w $a1, $s1, %pc_lo12(bipred2_access_method) slli.d $a1, $a1, 3 ldx.d $a3, $fp, $a1 - ld.d $a1, $s3, 16 + ld.d $a1, $s2, 16 ld.d $s1, $sp, 32 # 8-byte Folded Reload st.d $a0, $s1, %pc_lo12(ref1_line) ld.d $a0, $sp, 96 # 8-byte Folded Reload @@ -2967,12 +2907,12 @@ computeBiPredSAD1: # @computeBiPredSAD1 move $a1, $s7 move $a2, $s6 jirl $ra, $a3, 0 - ld.d $a1, $sp, 16 # 8-byte Folded Reload - ld.w $a1, $a1, %pc_lo12(bipred1_access_method) + ld.w $a1, $s3, %pc_lo12(bipred1_access_method) slli.d $a1, $a1, 3 ldx.d $a3, $fp, $a1 ld.d $a1, $s8, 16 - st.d $a0, $s2, %pc_lo12(ref2_line) + ld.d $a2, $sp, 16 # 8-byte Folded Reload + st.d $a0, $a2, %pc_lo12(ref2_line) move $a0, $a1 move $a1, $s5 move $a2, $s4 @@ -3002,11 +2942,11 @@ computeBiPredSAD1: # @computeBiPredSAD1 ld.d $a1, $sp, 64 # 8-byte Folded Reload ld.d $a2, $sp, 72 # 8-byte Folded Reload jirl $ra, $a3, 0 - ld.w $a1, $s3, %pc_lo12(bipred1_access_method) + ld.d $a1, $sp, 56 # 8-byte Folded Reload + ld.w $a1, $a1, %pc_lo12(bipred1_access_method) slli.d $a1, $a1, 3 ldx.d $a3, $s1, $a1 - ld.d $a1, $sp, 56 # 8-byte Folded Reload - ld.d $a1, $a1, 16 + ld.d $a1, $s3, 16 st.d $a0, $s2, %pc_lo12(ref2_line) move $a0, $a1 ld.d $a1, $sp, 80 # 8-byte Folded Reload @@ -3235,48 +3175,48 @@ computeBiPredSAD2: # @computeBiPredSAD2 addi.d $a0, $a0, 32 addi.d $t8, $s3, 32 addi.d $s2, $s4, 32 - move $s4, $t1 + move $s3, $t1 vori.b $vr8, $vr6, 0 .p2align 4, , 16 .LBB8_6: # %vector.body # Parent Loop BB8_3 Depth=1 # => This Inner Loop Header: Depth=2 - ld.h $s3, $a0, -32 + ld.h $s4, $a0, -32 ld.h $s7, $a0, -24 ld.h $ra, $a0, -16 ld.h $s8, $a0, -8 - vinsgr2vr.h $vr9, $s3, 0 + vinsgr2vr.h $vr9, $s4, 0 vinsgr2vr.h $vr9, $s7, 1 vinsgr2vr.h $vr9, $ra, 2 vinsgr2vr.h $vr9, $s8, 3 - ld.h $s3, $a0, 0 + ld.h $s4, $a0, 0 ld.h $s7, $a0, 8 ld.h $s8, $a0, 16 ld.h $ra, $a0, 24 - vinsgr2vr.h $vr10, $s3, 0 + vinsgr2vr.h $vr10, $s4, 0 vinsgr2vr.h $vr10, $s7, 1 vinsgr2vr.h $vr10, $s8, 2 vinsgr2vr.h $vr10, $ra, 3 - vilvl.h $vr9, $vr6, $vr9 - vilvl.h $vr10, $vr6, $vr10 - ld.h $s3, $t8, -32 + vsllwil.wu.hu $vr9, $vr9, 0 + vsllwil.wu.hu $vr10, $vr10, 0 + ld.h $s4, $t8, -32 ld.h $s7, $t8, -24 ld.h $s8, $t8, -16 ld.h $ra, $t8, -8 - vinsgr2vr.h $vr11, $s3, 0 + vinsgr2vr.h $vr11, $s4, 0 vinsgr2vr.h $vr11, $s7, 1 vinsgr2vr.h $vr11, $s8, 2 vinsgr2vr.h $vr11, $ra, 3 - ld.h $s3, $t8, 0 + ld.h $s4, $t8, 0 ld.h $s7, $t8, 8 ld.h $s8, $t8, 16 ld.h $ra, $t8, 24 - vinsgr2vr.h $vr12, $s3, 0 + vinsgr2vr.h $vr12, $s4, 0 vinsgr2vr.h $vr12, $s7, 1 vinsgr2vr.h $vr12, $s8, 2 vinsgr2vr.h $vr12, $ra, 3 - vilvl.h $vr11, $vr6, $vr11 - vilvl.h $vr12, $vr6, $vr12 + vsllwil.wu.hu $vr11, $vr11, 0 + vsllwil.wu.hu $vr12, $vr12, 0 vori.b $vr13, $vr2, 0 vmadd.w $vr13, $vr0, $vr9 vori.b $vr9, $vr2, 0 @@ -3291,40 +3231,34 @@ computeBiPredSAD2: # @computeBiPredSAD2 vmaxi.w $vr9, $vr9, 0 vmin.w $vr10, $vr10, $vr5 vmin.w $vr9, $vr9, $vr5 - ld.h $s3, $s2, -32 + ld.h $s4, $s2, -32 ld.h $s7, $s2, -24 ld.h $s8, $s2, -16 ld.h $ra, $s2, -8 - vinsgr2vr.h $vr11, $s3, 0 + vinsgr2vr.h $vr11, $s4, 0 vinsgr2vr.h $vr11, $s7, 1 vinsgr2vr.h $vr11, $s8, 2 vinsgr2vr.h $vr11, $ra, 3 - ld.h $s3, $s2, 0 + ld.h $s4, $s2, 0 ld.h $s7, $s2, 8 ld.h $s8, $s2, 16 ld.h $ra, $s2, 24 - vinsgr2vr.h $vr12, $s3, 0 + vinsgr2vr.h 
$vr12, $s4, 0 vinsgr2vr.h $vr12, $s7, 1 vinsgr2vr.h $vr12, $s8, 2 vinsgr2vr.h $vr12, $ra, 3 - vilvl.h $vr11, $vr6, $vr11 - vilvl.h $vr12, $vr6, $vr12 + vsllwil.wu.hu $vr11, $vr11, 0 + vsllwil.wu.hu $vr12, $vr12, 0 vsub.w $vr10, $vr11, $vr10 vsub.w $vr9, $vr12, $vr9 - vshuf4i.w $vr11, $vr10, 50 - vslli.d $vr11, $vr11, 32 - vsrai.d $vr11, $vr11, 32 - vshuf4i.w $vr10, $vr10, 16 - vslli.d $vr10, $vr10, 32 - vsrai.d $vr10, $vr10, 32 - vshuf4i.w $vr12, $vr9, 50 - vslli.d $vr12, $vr12, 32 - vsrai.d $vr12, $vr12, 32 - vshuf4i.w $vr9, $vr9, 16 - vslli.d $vr9, $vr9, 32 - vsrai.d $vr9, $vr9, 32 - vpickve2gr.d $s3, $vr10, 0 - slli.d $s3, $s3, 2 + vshuf4i.w $vr11, $vr10, 14 + vsllwil.d.w $vr11, $vr11, 0 + vsllwil.d.w $vr10, $vr10, 0 + vshuf4i.w $vr12, $vr9, 14 + vsllwil.d.w $vr12, $vr12, 0 + vsllwil.d.w $vr9, $vr9, 0 + vpickve2gr.d $s4, $vr10, 0 + slli.d $s4, $s4, 2 vpickve2gr.d $s7, $vr10, 1 slli.d $s7, $s7, 2 vpickve2gr.d $s8, $vr11, 0 @@ -3339,11 +3273,11 @@ computeBiPredSAD2: # @computeBiPredSAD2 slli.d $t0, $t0, 2 vpickve2gr.d $fp, $vr12, 1 slli.d $fp, $fp, 2 - ldx.w $s3, $a5, $s3 + ldx.w $s4, $a5, $s4 ldx.w $s7, $a5, $s7 ldx.w $s8, $a5, $s8 ldx.w $ra, $a5, $ra - vinsgr2vr.w $vr9, $s3, 0 + vinsgr2vr.w $vr9, $s4, 0 vinsgr2vr.w $vr9, $s7, 1 vinsgr2vr.w $vr9, $s8, 2 vinsgr2vr.w $vr9, $ra, 3 @@ -3373,8 +3307,8 @@ computeBiPredSAD2: # @computeBiPredSAD2 vinsgr2vr.h $vr10, $t2, 1 vinsgr2vr.h $vr10, $t4, 2 vinsgr2vr.h $vr10, $fp, 3 - vilvl.h $vr9, $vr6, $vr9 - vilvl.h $vr10, $vr6, $vr10 + vsllwil.wu.hu $vr9, $vr9, 0 + vsllwil.wu.hu $vr10, $vr10, 0 ld.h $t0, $t8, -30 ld.h $t2, $t8, -22 ld.h $t4, $t8, -14 @@ -3391,8 +3325,8 @@ computeBiPredSAD2: # @computeBiPredSAD2 vinsgr2vr.h $vr12, $t2, 1 vinsgr2vr.h $vr12, $t4, 2 vinsgr2vr.h $vr12, $fp, 3 - vilvl.h $vr11, $vr6, $vr11 - vilvl.h $vr12, $vr6, $vr12 + vsllwil.wu.hu $vr11, $vr11, 0 + vsllwil.wu.hu $vr12, $vr12, 0 vori.b $vr13, $vr2, 0 vmadd.w $vr13, $vr0, $vr9 vori.b $vr9, $vr2, 0 @@ -3423,22 +3357,16 @@ computeBiPredSAD2: # @computeBiPredSAD2 vinsgr2vr.h $vr12, $t2, 1 vinsgr2vr.h $vr12, $t4, 2 vinsgr2vr.h $vr12, $fp, 3 - vilvl.h $vr11, $vr6, $vr11 - vilvl.h $vr12, $vr6, $vr12 + vsllwil.wu.hu $vr11, $vr11, 0 + vsllwil.wu.hu $vr12, $vr12, 0 vsub.w $vr10, $vr11, $vr10 vsub.w $vr9, $vr12, $vr9 - vshuf4i.w $vr11, $vr10, 50 - vslli.d $vr11, $vr11, 32 - vsrai.d $vr11, $vr11, 32 - vshuf4i.w $vr10, $vr10, 16 - vslli.d $vr10, $vr10, 32 - vsrai.d $vr10, $vr10, 32 - vshuf4i.w $vr12, $vr9, 50 - vslli.d $vr12, $vr12, 32 - vsrai.d $vr12, $vr12, 32 - vshuf4i.w $vr9, $vr9, 16 - vslli.d $vr9, $vr9, 32 - vsrai.d $vr9, $vr9, 32 + vshuf4i.w $vr11, $vr10, 14 + vsllwil.d.w $vr11, $vr11, 0 + vsllwil.d.w $vr10, $vr10, 0 + vshuf4i.w $vr12, $vr9, 14 + vsllwil.d.w $vr12, $vr12, 0 + vsllwil.d.w $vr9, $vr9, 0 vpickve2gr.d $t0, $vr10, 0 slli.d $t0, $t0, 2 vpickve2gr.d $t2, $vr10, 1 @@ -3447,8 +3375,8 @@ computeBiPredSAD2: # @computeBiPredSAD2 slli.d $t4, $t4, 2 vpickve2gr.d $fp, $vr11, 1 slli.d $fp, $fp, 2 - vpickve2gr.d $s3, $vr9, 0 - slli.d $s3, $s3, 2 + vpickve2gr.d $s4, $vr9, 0 + slli.d $s4, $s4, 2 vpickve2gr.d $s7, $vr9, 1 slli.d $s7, $s7, 2 vpickve2gr.d $s8, $vr12, 0 @@ -3463,7 +3391,7 @@ computeBiPredSAD2: # @computeBiPredSAD2 vinsgr2vr.w $vr9, $t2, 1 vinsgr2vr.w $vr9, $t4, 2 vinsgr2vr.w $vr9, $fp, 3 - ldx.w $t0, $a5, $s3 + ldx.w $t0, $a5, $s4 ldx.w $t2, $a5, $s7 ldx.w $t4, $a5, $s8 ldx.w $fp, $a5, $ra @@ -3489,8 +3417,8 @@ computeBiPredSAD2: # @computeBiPredSAD2 vinsgr2vr.h $vr10, $t2, 1 vinsgr2vr.h $vr10, $t4, 2 vinsgr2vr.h $vr10, $fp, 3 - vilvl.h $vr9, $vr6, $vr9 - vilvl.h 
$vr10, $vr6, $vr10 + vsllwil.wu.hu $vr9, $vr9, 0 + vsllwil.wu.hu $vr10, $vr10, 0 ld.h $t0, $t8, -28 ld.h $t2, $t8, -20 ld.h $t4, $t8, -12 @@ -3507,8 +3435,8 @@ computeBiPredSAD2: # @computeBiPredSAD2 vinsgr2vr.h $vr12, $t2, 1 vinsgr2vr.h $vr12, $t4, 2 vinsgr2vr.h $vr12, $fp, 3 - vilvl.h $vr11, $vr6, $vr11 - vilvl.h $vr12, $vr6, $vr12 + vsllwil.wu.hu $vr11, $vr11, 0 + vsllwil.wu.hu $vr12, $vr12, 0 vori.b $vr13, $vr2, 0 vmadd.w $vr13, $vr0, $vr9 vori.b $vr9, $vr2, 0 @@ -3539,22 +3467,16 @@ computeBiPredSAD2: # @computeBiPredSAD2 vinsgr2vr.h $vr12, $t2, 1 vinsgr2vr.h $vr12, $t4, 2 vinsgr2vr.h $vr12, $fp, 3 - vilvl.h $vr11, $vr6, $vr11 - vilvl.h $vr12, $vr6, $vr12 + vsllwil.wu.hu $vr11, $vr11, 0 + vsllwil.wu.hu $vr12, $vr12, 0 vsub.w $vr10, $vr11, $vr10 vsub.w $vr9, $vr12, $vr9 - vshuf4i.w $vr11, $vr10, 50 - vslli.d $vr11, $vr11, 32 - vsrai.d $vr11, $vr11, 32 - vshuf4i.w $vr10, $vr10, 16 - vslli.d $vr10, $vr10, 32 - vsrai.d $vr10, $vr10, 32 - vshuf4i.w $vr12, $vr9, 50 - vslli.d $vr12, $vr12, 32 - vsrai.d $vr12, $vr12, 32 - vshuf4i.w $vr9, $vr9, 16 - vslli.d $vr9, $vr9, 32 - vsrai.d $vr9, $vr9, 32 + vshuf4i.w $vr11, $vr10, 14 + vsllwil.d.w $vr11, $vr11, 0 + vsllwil.d.w $vr10, $vr10, 0 + vshuf4i.w $vr12, $vr9, 14 + vsllwil.d.w $vr12, $vr12, 0 + vsllwil.d.w $vr9, $vr9, 0 vpickve2gr.d $t0, $vr10, 0 slli.d $t0, $t0, 2 vpickve2gr.d $t2, $vr10, 1 @@ -3563,8 +3485,8 @@ computeBiPredSAD2: # @computeBiPredSAD2 slli.d $t4, $t4, 2 vpickve2gr.d $fp, $vr11, 1 slli.d $fp, $fp, 2 - vpickve2gr.d $s3, $vr9, 0 - slli.d $s3, $s3, 2 + vpickve2gr.d $s4, $vr9, 0 + slli.d $s4, $s4, 2 vpickve2gr.d $s7, $vr9, 1 slli.d $s7, $s7, 2 vpickve2gr.d $s8, $vr12, 0 @@ -3579,7 +3501,7 @@ computeBiPredSAD2: # @computeBiPredSAD2 vinsgr2vr.w $vr9, $t2, 1 vinsgr2vr.w $vr9, $t4, 2 vinsgr2vr.w $vr9, $fp, 3 - ldx.w $t0, $a5, $s3 + ldx.w $t0, $a5, $s4 ldx.w $t2, $a5, $s7 ldx.w $t4, $a5, $s8 ldx.w $fp, $a5, $ra @@ -3605,8 +3527,8 @@ computeBiPredSAD2: # @computeBiPredSAD2 vinsgr2vr.h $vr10, $t2, 1 vinsgr2vr.h $vr10, $t4, 2 vinsgr2vr.h $vr10, $fp, 3 - vilvl.h $vr9, $vr6, $vr9 - vilvl.h $vr10, $vr6, $vr10 + vsllwil.wu.hu $vr9, $vr9, 0 + vsllwil.wu.hu $vr10, $vr10, 0 ld.h $t0, $t8, -26 ld.h $t2, $t8, -18 ld.h $t4, $t8, -10 @@ -3623,8 +3545,8 @@ computeBiPredSAD2: # @computeBiPredSAD2 vinsgr2vr.h $vr12, $t2, 1 vinsgr2vr.h $vr12, $t4, 2 vinsgr2vr.h $vr12, $fp, 3 - vilvl.h $vr11, $vr6, $vr11 - vilvl.h $vr12, $vr6, $vr12 + vsllwil.wu.hu $vr11, $vr11, 0 + vsllwil.wu.hu $vr12, $vr12, 0 vori.b $vr13, $vr2, 0 vmadd.w $vr13, $vr0, $vr9 vori.b $vr9, $vr2, 0 @@ -3655,22 +3577,16 @@ computeBiPredSAD2: # @computeBiPredSAD2 vinsgr2vr.h $vr12, $t2, 1 vinsgr2vr.h $vr12, $t4, 2 vinsgr2vr.h $vr12, $fp, 3 - vilvl.h $vr11, $vr6, $vr11 - vilvl.h $vr12, $vr6, $vr12 + vsllwil.wu.hu $vr11, $vr11, 0 + vsllwil.wu.hu $vr12, $vr12, 0 vsub.w $vr10, $vr11, $vr10 vsub.w $vr9, $vr12, $vr9 - vshuf4i.w $vr11, $vr10, 50 - vslli.d $vr11, $vr11, 32 - vsrai.d $vr11, $vr11, 32 - vshuf4i.w $vr10, $vr10, 16 - vslli.d $vr10, $vr10, 32 - vsrai.d $vr10, $vr10, 32 - vshuf4i.w $vr12, $vr9, 50 - vslli.d $vr12, $vr12, 32 - vsrai.d $vr12, $vr12, 32 - vshuf4i.w $vr9, $vr9, 16 - vslli.d $vr9, $vr9, 32 - vsrai.d $vr9, $vr9, 32 + vshuf4i.w $vr11, $vr10, 14 + vsllwil.d.w $vr11, $vr11, 0 + vsllwil.d.w $vr10, $vr10, 0 + vshuf4i.w $vr12, $vr9, 14 + vsllwil.d.w $vr12, $vr12, 0 + vsllwil.d.w $vr9, $vr9, 0 vpickve2gr.d $t0, $vr10, 0 slli.d $t0, $t0, 2 vpickve2gr.d $t2, $vr10, 1 @@ -3679,8 +3595,8 @@ computeBiPredSAD2: # @computeBiPredSAD2 slli.d $t4, $t4, 2 vpickve2gr.d $fp, $vr11, 1 slli.d $fp, 
$fp, 2 - vpickve2gr.d $s3, $vr9, 0 - slli.d $s3, $s3, 2 + vpickve2gr.d $s4, $vr9, 0 + slli.d $s4, $s4, 2 vpickve2gr.d $s7, $vr9, 1 slli.d $s7, $s7, 2 vpickve2gr.d $s8, $vr12, 0 @@ -3695,7 +3611,7 @@ computeBiPredSAD2: # @computeBiPredSAD2 vinsgr2vr.w $vr9, $t2, 1 vinsgr2vr.w $vr9, $t4, 2 vinsgr2vr.w $vr9, $fp, 3 - ldx.w $t0, $a5, $s3 + ldx.w $t0, $a5, $s4 ldx.w $t2, $a5, $s7 ldx.w $t4, $a5, $s8 ldx.w $fp, $a5, $ra @@ -3705,11 +3621,11 @@ computeBiPredSAD2: # @computeBiPredSAD2 vinsgr2vr.w $vr10, $fp, 3 vadd.w $vr7, $vr7, $vr9 vadd.w $vr8, $vr8, $vr10 - addi.d $s4, $s4, -8 + addi.d $s3, $s3, -8 addi.d $a0, $a0, 64 addi.d $t8, $t8, 64 addi.d $s2, $s2, 64 - bnez $s4, .LBB8_6 + bnez $s3, .LBB8_6 # %bb.7: # %middle.block # in Loop: Header=BB8_3 Depth=1 vadd.w $vr7, $vr8, $vr7 @@ -3984,8 +3900,8 @@ computeBiPredSAD2: # @computeBiPredSAD2 vinsgr2vr.h $vr10, $fp, 1 vinsgr2vr.h $vr10, $s2, 2 vinsgr2vr.h $vr10, $s7, 3 - vilvl.h $vr9, $vr6, $vr9 - vilvl.h $vr10, $vr6, $vr10 + vsllwil.wu.hu $vr9, $vr9, 0 + vsllwil.wu.hu $vr10, $vr10, 0 ld.h $t8, $t5, -16 ld.h $fp, $t5, -12 ld.h $s2, $t5, -8 @@ -4002,8 +3918,8 @@ computeBiPredSAD2: # @computeBiPredSAD2 vinsgr2vr.h $vr12, $fp, 1 vinsgr2vr.h $vr12, $s2, 2 vinsgr2vr.h $vr12, $s7, 3 - vilvl.h $vr11, $vr6, $vr11 - vilvl.h $vr12, $vr6, $vr12 + vsllwil.wu.hu $vr11, $vr11, 0 + vsllwil.wu.hu $vr12, $vr12, 0 vori.b $vr13, $vr2, 0 vmadd.w $vr13, $vr0, $vr9 vori.b $vr9, $vr2, 0 @@ -4034,22 +3950,16 @@ computeBiPredSAD2: # @computeBiPredSAD2 vinsgr2vr.h $vr12, $fp, 1 vinsgr2vr.h $vr12, $s2, 2 vinsgr2vr.h $vr12, $s7, 3 - vilvl.h $vr11, $vr6, $vr11 - vilvl.h $vr12, $vr6, $vr12 + vsllwil.wu.hu $vr11, $vr11, 0 + vsllwil.wu.hu $vr12, $vr12, 0 vsub.w $vr10, $vr11, $vr10 vsub.w $vr9, $vr12, $vr9 - vshuf4i.w $vr11, $vr10, 50 - vslli.d $vr11, $vr11, 32 - vsrai.d $vr11, $vr11, 32 - vshuf4i.w $vr10, $vr10, 16 - vslli.d $vr10, $vr10, 32 - vsrai.d $vr10, $vr10, 32 - vshuf4i.w $vr12, $vr9, 50 - vslli.d $vr12, $vr12, 32 - vsrai.d $vr12, $vr12, 32 - vshuf4i.w $vr9, $vr9, 16 - vslli.d $vr9, $vr9, 32 - vsrai.d $vr9, $vr9, 32 + vshuf4i.w $vr11, $vr10, 14 + vsllwil.d.w $vr11, $vr11, 0 + vsllwil.d.w $vr10, $vr10, 0 + vshuf4i.w $vr12, $vr9, 14 + vsllwil.d.w $vr12, $vr12, 0 + vsllwil.d.w $vr9, $vr9, 0 vpickve2gr.d $t8, $vr10, 0 slli.d $t8, $t8, 2 vpickve2gr.d $fp, $vr10, 1 @@ -4100,8 +4010,8 @@ computeBiPredSAD2: # @computeBiPredSAD2 vinsgr2vr.h $vr10, $t8, 1 vinsgr2vr.h $vr10, $fp, 2 vinsgr2vr.h $vr10, $s2, 3 - vilvl.h $vr9, $vr6, $vr9 - vilvl.h $vr10, $vr6, $vr10 + vsllwil.wu.hu $vr9, $vr9, 0 + vsllwil.wu.hu $vr10, $vr10, 0 ld.h $t4, $t5, -14 ld.h $t8, $t5, -10 ld.h $fp, $t5, -6 @@ -4118,8 +4028,8 @@ computeBiPredSAD2: # @computeBiPredSAD2 vinsgr2vr.h $vr12, $t8, 1 vinsgr2vr.h $vr12, $fp, 2 vinsgr2vr.h $vr12, $s2, 3 - vilvl.h $vr11, $vr6, $vr11 - vilvl.h $vr12, $vr6, $vr12 + vsllwil.wu.hu $vr11, $vr11, 0 + vsllwil.wu.hu $vr12, $vr12, 0 vori.b $vr13, $vr2, 0 vmadd.w $vr13, $vr0, $vr9 vori.b $vr9, $vr2, 0 @@ -4150,22 +4060,16 @@ computeBiPredSAD2: # @computeBiPredSAD2 vinsgr2vr.h $vr12, $t8, 1 vinsgr2vr.h $vr12, $fp, 2 vinsgr2vr.h $vr12, $s2, 3 - vilvl.h $vr11, $vr6, $vr11 - vilvl.h $vr12, $vr6, $vr12 + vsllwil.wu.hu $vr11, $vr11, 0 + vsllwil.wu.hu $vr12, $vr12, 0 vsub.w $vr10, $vr11, $vr10 vsub.w $vr9, $vr12, $vr9 - vshuf4i.w $vr11, $vr10, 50 - vslli.d $vr11, $vr11, 32 - vsrai.d $vr11, $vr11, 32 - vshuf4i.w $vr10, $vr10, 16 - vslli.d $vr10, $vr10, 32 - vsrai.d $vr10, $vr10, 32 - vshuf4i.w $vr12, $vr9, 50 - vslli.d $vr12, $vr12, 32 - vsrai.d $vr12, $vr12, 32 - vshuf4i.w $vr9, $vr9, 16 - 
vslli.d $vr9, $vr9, 32 - vsrai.d $vr9, $vr9, 32 + vshuf4i.w $vr11, $vr10, 14 + vsllwil.d.w $vr11, $vr11, 0 + vsllwil.d.w $vr10, $vr10, 0 + vshuf4i.w $vr12, $vr9, 14 + vsllwil.d.w $vr12, $vr12, 0 + vsllwil.d.w $vr9, $vr9, 0 vpickve2gr.d $t4, $vr10, 0 slli.d $t4, $t4, 2 vpickve2gr.d $t8, $vr10, 1 @@ -4456,8 +4360,8 @@ computeBiPredSAD2: # @computeBiPredSAD2 vinsgr2vr.h $vr10, $fp, 1 vinsgr2vr.h $vr10, $s2, 2 vinsgr2vr.h $vr10, $s3, 3 - vilvl.h $vr9, $vr6, $vr9 - vilvl.h $vr10, $vr6, $vr10 + vsllwil.wu.hu $vr9, $vr9, 0 + vsllwil.wu.hu $vr10, $vr10, 0 ld.h $t8, $t5, -16 ld.h $fp, $t5, -12 ld.h $s2, $t5, -8 @@ -4474,8 +4378,8 @@ computeBiPredSAD2: # @computeBiPredSAD2 vinsgr2vr.h $vr12, $fp, 1 vinsgr2vr.h $vr12, $s2, 2 vinsgr2vr.h $vr12, $s3, 3 - vilvl.h $vr11, $vr6, $vr11 - vilvl.h $vr12, $vr6, $vr12 + vsllwil.wu.hu $vr11, $vr11, 0 + vsllwil.wu.hu $vr12, $vr12, 0 vori.b $vr13, $vr2, 0 vmadd.w $vr13, $vr0, $vr9 vori.b $vr9, $vr2, 0 @@ -4506,22 +4410,16 @@ computeBiPredSAD2: # @computeBiPredSAD2 vinsgr2vr.h $vr12, $fp, 1 vinsgr2vr.h $vr12, $s2, 2 vinsgr2vr.h $vr12, $s3, 3 - vilvl.h $vr11, $vr6, $vr11 - vilvl.h $vr12, $vr6, $vr12 + vsllwil.wu.hu $vr11, $vr11, 0 + vsllwil.wu.hu $vr12, $vr12, 0 vsub.w $vr10, $vr11, $vr10 vsub.w $vr9, $vr12, $vr9 - vshuf4i.w $vr11, $vr10, 50 - vslli.d $vr11, $vr11, 32 - vsrai.d $vr11, $vr11, 32 - vshuf4i.w $vr10, $vr10, 16 - vslli.d $vr10, $vr10, 32 - vsrai.d $vr10, $vr10, 32 - vshuf4i.w $vr12, $vr9, 50 - vslli.d $vr12, $vr12, 32 - vsrai.d $vr12, $vr12, 32 - vshuf4i.w $vr9, $vr9, 16 - vslli.d $vr9, $vr9, 32 - vsrai.d $vr9, $vr9, 32 + vshuf4i.w $vr11, $vr10, 14 + vsllwil.d.w $vr11, $vr11, 0 + vsllwil.d.w $vr10, $vr10, 0 + vshuf4i.w $vr12, $vr9, 14 + vsllwil.d.w $vr12, $vr12, 0 + vsllwil.d.w $vr9, $vr9, 0 vpickve2gr.d $t8, $vr10, 0 slli.d $t8, $t8, 2 vpickve2gr.d $fp, $vr10, 1 @@ -4572,8 +4470,8 @@ computeBiPredSAD2: # @computeBiPredSAD2 vinsgr2vr.h $vr10, $t8, 1 vinsgr2vr.h $vr10, $fp, 2 vinsgr2vr.h $vr10, $s2, 3 - vilvl.h $vr9, $vr6, $vr9 - vilvl.h $vr10, $vr6, $vr10 + vsllwil.wu.hu $vr9, $vr9, 0 + vsllwil.wu.hu $vr10, $vr10, 0 ld.h $t4, $t5, -14 ld.h $t8, $t5, -10 ld.h $fp, $t5, -6 @@ -4590,8 +4488,8 @@ computeBiPredSAD2: # @computeBiPredSAD2 vinsgr2vr.h $vr12, $t8, 1 vinsgr2vr.h $vr12, $fp, 2 vinsgr2vr.h $vr12, $s2, 3 - vilvl.h $vr11, $vr6, $vr11 - vilvl.h $vr12, $vr6, $vr12 + vsllwil.wu.hu $vr11, $vr11, 0 + vsllwil.wu.hu $vr12, $vr12, 0 vori.b $vr13, $vr2, 0 vmadd.w $vr13, $vr0, $vr9 vori.b $vr9, $vr2, 0 @@ -4622,22 +4520,16 @@ computeBiPredSAD2: # @computeBiPredSAD2 vinsgr2vr.h $vr12, $t8, 1 vinsgr2vr.h $vr12, $fp, 2 vinsgr2vr.h $vr12, $s2, 3 - vilvl.h $vr11, $vr6, $vr11 - vilvl.h $vr12, $vr6, $vr12 + vsllwil.wu.hu $vr11, $vr11, 0 + vsllwil.wu.hu $vr12, $vr12, 0 vsub.w $vr10, $vr11, $vr10 vsub.w $vr9, $vr12, $vr9 - vshuf4i.w $vr11, $vr10, 50 - vslli.d $vr11, $vr11, 32 - vsrai.d $vr11, $vr11, 32 - vshuf4i.w $vr10, $vr10, 16 - vslli.d $vr10, $vr10, 32 - vsrai.d $vr10, $vr10, 32 - vshuf4i.w $vr12, $vr9, 50 - vslli.d $vr12, $vr12, 32 - vsrai.d $vr12, $vr12, 32 - vshuf4i.w $vr9, $vr9, 16 - vslli.d $vr9, $vr9, 32 - vsrai.d $vr9, $vr9, 32 + vshuf4i.w $vr11, $vr10, 14 + vsllwil.d.w $vr11, $vr11, 0 + vsllwil.d.w $vr10, $vr10, 0 + vshuf4i.w $vr12, $vr9, 14 + vsllwil.d.w $vr12, $vr12, 0 + vsllwil.d.w $vr9, $vr9, 0 vpickve2gr.d $t4, $vr10, 0 slli.d $t4, $t4, 2 vpickve2gr.d $t8, $vr10, 1 @@ -4791,25 +4683,25 @@ computeBiPredSAD2: # @computeBiPredSAD2 .type computeSATD,@function computeSATD: # @computeSATD # %bb.0: - addi.d $sp, $sp, -208 - st.d $ra, $sp, 200 # 8-byte Folded 
Spill - st.d $fp, $sp, 192 # 8-byte Folded Spill - st.d $s0, $sp, 184 # 8-byte Folded Spill - st.d $s1, $sp, 176 # 8-byte Folded Spill - st.d $s2, $sp, 168 # 8-byte Folded Spill - st.d $s3, $sp, 160 # 8-byte Folded Spill - st.d $s4, $sp, 152 # 8-byte Folded Spill - st.d $s5, $sp, 144 # 8-byte Folded Spill - st.d $s6, $sp, 136 # 8-byte Folded Spill - st.d $s7, $sp, 128 # 8-byte Folded Spill - st.d $s8, $sp, 120 # 8-byte Folded Spill + addi.d $sp, $sp, -176 + st.d $ra, $sp, 168 # 8-byte Folded Spill + st.d $fp, $sp, 160 # 8-byte Folded Spill + st.d $s0, $sp, 152 # 8-byte Folded Spill + st.d $s1, $sp, 144 # 8-byte Folded Spill + st.d $s2, $sp, 136 # 8-byte Folded Spill + st.d $s3, $sp, 128 # 8-byte Folded Spill + st.d $s4, $sp, 120 # 8-byte Folded Spill + st.d $s5, $sp, 112 # 8-byte Folded Spill + st.d $s6, $sp, 104 # 8-byte Folded Spill + st.d $s7, $sp, 96 # 8-byte Folded Spill + st.d $s8, $sp, 88 # 8-byte Folded Spill pcalau12i $a6, %pc_hi20(test8x8transform) ld.w $a6, $a6, %pc_lo12(test8x8transform) - st.d $a3, $sp, 104 # 8-byte Folded Spill + st.d $a3, $sp, 72 # 8-byte Folded Spill slli.w $a3, $a1, 2 - st.d $a5, $sp, 112 # 8-byte Folded Spill + st.d $a5, $sp, 80 # 8-byte Folded Spill alsl.w $a5, $a1, $a5, 2 - st.d $a2, $sp, 96 # 8-byte Folded Spill + st.d $a2, $sp, 64 # 8-byte Folded Spill beqz $a6, .LBB9_8 # %bb.1: blez $a3, .LBB9_15 @@ -4820,43 +4712,41 @@ computeSATD: # @computeSATD pcalau12i $a1, %pc_hi20(img_padded_size_x) ld.w $s8, $a1, %pc_lo12(img_padded_size_x) slli.d $a1, $a3, 1 - st.d $a1, $sp, 16 # 8-byte Folded Spill + st.d $a1, $sp, 8 # 8-byte Folded Spill slli.d $s7, $a2, 1 pcalau12i $a1, %pc_hi20(ref_access_method) - st.d $a1, $sp, 88 # 8-byte Folded Spill + st.d $a1, $sp, 56 # 8-byte Folded Spill pcalau12i $a1, %pc_hi20(get_line) addi.d $a1, $a1, %pc_lo12(get_line) - st.d $a1, $sp, 80 # 8-byte Folded Spill + st.d $a1, $sp, 48 # 8-byte Folded Spill pcalau12i $a1, %pc_hi20(ref_pic_sub) - st.d $a1, $sp, 72 # 8-byte Folded Spill + st.d $a1, $sp, 40 # 8-byte Folded Spill pcalau12i $s3, %pc_hi20(ref_line) pcalau12i $s0, %pc_hi20(src_line) - vrepli.b $vr0, 0 - vst $vr0, $sp, 48 # 16-byte Folded Spill pcalau12i $a1, %pc_hi20(diff) addi.d $s5, $a1, %pc_lo12(diff) move $s4, $zero slli.d $s1, $s8, 1 - st.d $a4, $sp, 32 # 8-byte Folded Spill - st.d $a5, $sp, 24 # 8-byte Folded Spill + st.d $a4, $sp, 24 # 8-byte Folded Spill + st.d $a5, $sp, 16 # 8-byte Folded Spill .LBB9_4: # %.preheader78.us # =>This Loop Header: Depth=1 # Child Loop BB9_5 Depth 2 move $s2, $zero - st.d $a0, $sp, 40 # 8-byte Folded Spill + st.d $a0, $sp, 32 # 8-byte Folded Spill move $fp, $a0 move $s6, $a4 .p2align 4, , 16 .LBB9_5: # Parent Loop BB9_4 Depth=1 # => This Inner Loop Header: Depth=2 - ld.d $a0, $sp, 88 # 8-byte Folded Reload + ld.d $a0, $sp, 56 # 8-byte Folded Reload ld.w $a0, $a0, %pc_lo12(ref_access_method) slli.d $a0, $a0, 3 - ld.d $a1, $sp, 80 # 8-byte Folded Reload + ld.d $a1, $sp, 48 # 8-byte Folded Reload ldx.d $a3, $a1, $a0 - ld.d $a0, $sp, 72 # 8-byte Folded Reload + ld.d $a0, $sp, 40 # 8-byte Folded Reload ld.d $a0, $a0, %pc_lo12(ref_pic_sub) - ld.d $a1, $sp, 112 # 8-byte Folded Reload + ld.d $a1, $sp, 80 # 8-byte Folded Reload move $a2, $s6 jirl $ra, $a3, 0 ld.d $a1, $fp, 0 @@ -4864,18 +4754,17 @@ computeSATD: # @computeSATD ld.d $a2, $a0, 0 st.d $fp, $s0, %pc_lo12(src_line) vinsgr2vr.d $vr0, $a1, 0 - vld $vr2, $sp, 48 # 16-byte Folded Reload - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a2, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 ld.d $a1, 
$fp, 8 vsub.w $vr0, $vr0, $vr1 ld.d $a2, $a0, 8 vst $vr0, $s5, 0 vinsgr2vr.d $vr0, $a1, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a2, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 vst $vr0, $s5, 16 ldx.d $a1, $fp, $s7 @@ -4883,17 +4772,17 @@ computeSATD: # @computeSATD ldx.d $a0, $a0, $s1 add.d $a3, $fp, $s7 vinsgr2vr.d $vr0, $a1, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a0, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 ld.d $a0, $a3, 8 vsub.w $vr0, $vr0, $vr1 ld.d $a1, $a2, 8 vst $vr0, $s5, 32 vinsgr2vr.d $vr0, $a0, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a1, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 vst $vr0, $s5, 48 ldx.d $a0, $a3, $s7 @@ -4901,17 +4790,17 @@ computeSATD: # @computeSATD ldx.d $a2, $a2, $s1 add.d $a3, $a3, $s7 vinsgr2vr.d $vr0, $a0, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a2, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 ld.d $a0, $a3, 8 vsub.w $vr0, $vr0, $vr1 ld.d $a2, $a1, 8 vst $vr0, $s5, 64 vinsgr2vr.d $vr0, $a0, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a2, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 vst $vr0, $s5, 80 ldx.d $a0, $a3, $s7 @@ -4919,17 +4808,17 @@ computeSATD: # @computeSATD ldx.d $a1, $a1, $s1 add.d $a3, $a3, $s7 vinsgr2vr.d $vr0, $a0, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a1, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 ld.d $a0, $a3, 8 vsub.w $vr0, $vr0, $vr1 ld.d $a1, $a2, 8 vst $vr0, $s5, 96 vinsgr2vr.d $vr0, $a0, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a1, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 vst $vr0, $s5, 112 ldx.d $a0, $a3, $s7 @@ -4937,17 +4826,17 @@ computeSATD: # @computeSATD ldx.d $a2, $a2, $s1 add.d $a3, $a3, $s7 vinsgr2vr.d $vr0, $a0, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a2, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 ld.d $a0, $a3, 8 vsub.w $vr0, $vr0, $vr1 ld.d $a2, $a1, 8 vst $vr0, $s5, 128 vinsgr2vr.d $vr0, $a0, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a2, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 vst $vr0, $s5, 144 ldx.d $a0, $a3, $s7 @@ -4955,17 +4844,17 @@ computeSATD: # @computeSATD ldx.d $a1, $a1, $s1 add.d $a3, $a3, $s7 vinsgr2vr.d $vr0, $a0, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a1, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 ld.d $a0, $a3, 8 vsub.w $vr0, $vr0, $vr1 ld.d $a1, $a2, 8 vst $vr0, $s5, 160 vinsgr2vr.d $vr0, $a0, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a1, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 vst $vr0, $s5, 176 ldx.d $a0, $a3, $s7 @@ -4973,17 +4862,17 @@ computeSATD: # @computeSATD ldx.d $a2, $a2, $s1 add.d $a3, $a3, $s7 vinsgr2vr.d $vr0, $a0, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a2, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 ld.d $a0, $a3, 8 vsub.w $vr0, $vr0, $vr1 ld.d $a2, $a1, 8 vst $vr0, $s5, 192 vinsgr2vr.d $vr0, $a0, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a2, 0 - vilvl.h $vr1, $vr2, $vr1 + 
vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 vst $vr0, $s5, 208 ldx.d $a0, $a3, $s7 @@ -4991,17 +4880,17 @@ computeSATD: # @computeSATD ldx.d $a1, $a1, $s1 add.d $a3, $a3, $s7 vinsgr2vr.d $vr0, $a0, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a1, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 ld.d $a0, $a3, 8 vsub.w $vr0, $vr0, $vr1 ld.d $a1, $a2, 8 vst $vr0, $s5, 224 vinsgr2vr.d $vr0, $a0, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a1, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 vst $vr0, $s5, 240 alsl.d $a0, $s8, $a2, 1 @@ -5012,24 +4901,24 @@ computeSATD: # @computeSATD pcaddu18i $ra, %call36(HadamardSAD8x8) jirl $ra, $ra, 0 add.w $s4, $a0, $s4 - ld.d $a0, $sp, 104 # 8-byte Folded Reload + ld.d $a0, $sp, 72 # 8-byte Folded Reload blt $a0, $s4, .LBB9_16 # %bb.6: # in Loop: Header=BB9_5 Depth=2 addi.w $s6, $s6, 32 addi.w $s2, $s2, 8 addi.d $fp, $fp, 16 - ld.d $a0, $sp, 96 # 8-byte Folded Reload + ld.d $a0, $sp, 64 # 8-byte Folded Reload blt $s2, $a0, .LBB9_5 # %bb.7: # %._crit_edge.us # in Loop: Header=BB9_4 Depth=1 - ld.d $a1, $sp, 112 # 8-byte Folded Reload + ld.d $a1, $sp, 80 # 8-byte Folded Reload addi.w $a1, $a1, 32 - ld.d $a0, $sp, 40 # 8-byte Folded Reload - ld.d $a2, $sp, 16 # 8-byte Folded Reload + ld.d $a0, $sp, 32 # 8-byte Folded Reload + ld.d $a2, $sp, 8 # 8-byte Folded Reload add.d $a0, $a0, $a2 - st.d $a1, $sp, 112 # 8-byte Folded Spill - ld.d $a4, $sp, 32 # 8-byte Folded Reload - ld.d $a5, $sp, 24 # 8-byte Folded Reload + st.d $a1, $sp, 80 # 8-byte Folded Spill + ld.d $a4, $sp, 24 # 8-byte Folded Reload + ld.d $a5, $sp, 16 # 8-byte Folded Reload blt $a1, $a5, .LBB9_4 b .LBB9_16 .LBB9_8: @@ -5041,52 +4930,49 @@ computeSATD: # @computeSATD pcalau12i $a1, %pc_hi20(img_padded_size_x) ld.w $fp, $a1, %pc_lo12(img_padded_size_x) slli.d $a1, $a3, 1 - st.d $a1, $sp, 16 # 8-byte Folded Spill + st.d $a1, $sp, 8 # 8-byte Folded Spill slli.d $s1, $a2, 1 pcalau12i $a1, %pc_hi20(ref_access_method) - st.d $a1, $sp, 88 # 8-byte Folded Spill + st.d $a1, $sp, 56 # 8-byte Folded Spill pcalau12i $a1, %pc_hi20(get_line) addi.d $a1, $a1, %pc_lo12(get_line) - st.d $a1, $sp, 80 # 8-byte Folded Spill + st.d $a1, $sp, 48 # 8-byte Folded Spill pcalau12i $a1, %pc_hi20(ref_pic_sub) - st.d $a1, $sp, 72 # 8-byte Folded Spill - vrepli.b $vr0, 0 - vst $vr0, $sp, 48 # 16-byte Folded Spill + st.d $a1, $sp, 40 # 8-byte Folded Spill pcalau12i $a1, %pc_hi20(diff) addi.d $s5, $a1, %pc_lo12(diff) move $s4, $zero slli.d $s8, $fp, 1 pcalau12i $s0, %pc_hi20(src_line) pcalau12i $s2, %pc_hi20(ref_line) - st.d $a4, $sp, 32 # 8-byte Folded Spill - st.d $a5, $sp, 24 # 8-byte Folded Spill + st.d $a4, $sp, 24 # 8-byte Folded Spill + st.d $a5, $sp, 16 # 8-byte Folded Spill .LBB9_11: # %.preheader.us # =>This Loop Header: Depth=1 # Child Loop BB9_12 Depth 2 move $s3, $zero - st.d $a0, $sp, 40 # 8-byte Folded Spill + st.d $a0, $sp, 32 # 8-byte Folded Spill move $s7, $a0 move $s6, $a4 .p2align 4, , 16 .LBB9_12: # Parent Loop BB9_11 Depth=1 # => This Inner Loop Header: Depth=2 - ld.d $a0, $sp, 88 # 8-byte Folded Reload + ld.d $a0, $sp, 56 # 8-byte Folded Reload ld.w $a0, $a0, %pc_lo12(ref_access_method) slli.d $a0, $a0, 3 - ld.d $a1, $sp, 80 # 8-byte Folded Reload + ld.d $a1, $sp, 48 # 8-byte Folded Reload ldx.d $a3, $a1, $a0 - ld.d $a0, $sp, 72 # 8-byte Folded Reload + ld.d $a0, $sp, 40 # 8-byte Folded Reload ld.d $a0, $a0, %pc_lo12(ref_pic_sub) - ld.d $a1, $sp, 112 # 8-byte Folded Reload + ld.d $a1, 
$sp, 80 # 8-byte Folded Reload move $a2, $s6 jirl $ra, $a3, 0 ld.d $a1, $s7, 0 ld.d $a2, $a0, 0 vinsgr2vr.d $vr0, $a1, 0 - vld $vr2, $sp, 48 # 16-byte Folded Reload - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a2, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 vst $vr0, $s5, 0 ldx.d $a1, $s7, $s1 @@ -5094,9 +4980,9 @@ computeSATD: # @computeSATD ldx.d $a0, $a0, $s8 add.d $a3, $s7, $s1 vinsgr2vr.d $vr0, $a1, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a0, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 vst $vr0, $s5, 16 ldx.d $a0, $a3, $s1 @@ -5104,9 +4990,9 @@ computeSATD: # @computeSATD ldx.d $a2, $a2, $s8 add.d $a3, $a3, $s1 vinsgr2vr.d $vr0, $a0, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a2, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 vst $vr0, $s5, 32 ldx.d $a0, $a3, $s1 @@ -5114,9 +5000,9 @@ computeSATD: # @computeSATD ldx.d $a1, $a1, $s8 add.d $a3, $a3, $s1 vinsgr2vr.d $vr0, $a0, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a1, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 vst $vr0, $s5, 48 alsl.d $a0, $fp, $a2, 1 @@ -5127,42 +5013,42 @@ computeSATD: # @computeSATD pcaddu18i $ra, %call36(HadamardSAD4x4) jirl $ra, $ra, 0 add.w $s4, $a0, $s4 - ld.d $a0, $sp, 104 # 8-byte Folded Reload + ld.d $a0, $sp, 72 # 8-byte Folded Reload blt $a0, $s4, .LBB9_16 # %bb.13: # in Loop: Header=BB9_12 Depth=2 addi.w $s6, $s6, 16 addi.w $s3, $s3, 4 addi.d $s7, $s7, 8 - ld.d $a0, $sp, 96 # 8-byte Folded Reload + ld.d $a0, $sp, 64 # 8-byte Folded Reload blt $s3, $a0, .LBB9_12 # %bb.14: # %._crit_edge.us109 # in Loop: Header=BB9_11 Depth=1 - ld.d $a1, $sp, 112 # 8-byte Folded Reload + ld.d $a1, $sp, 80 # 8-byte Folded Reload addi.w $a1, $a1, 16 - ld.d $a0, $sp, 40 # 8-byte Folded Reload - ld.d $a2, $sp, 16 # 8-byte Folded Reload + ld.d $a0, $sp, 32 # 8-byte Folded Reload + ld.d $a2, $sp, 8 # 8-byte Folded Reload add.d $a0, $a0, $a2 - st.d $a1, $sp, 112 # 8-byte Folded Spill - ld.d $a4, $sp, 32 # 8-byte Folded Reload - ld.d $a5, $sp, 24 # 8-byte Folded Reload + st.d $a1, $sp, 80 # 8-byte Folded Spill + ld.d $a4, $sp, 24 # 8-byte Folded Reload + ld.d $a5, $sp, 16 # 8-byte Folded Reload blt $a1, $a5, .LBB9_11 b .LBB9_16 .LBB9_15: move $s4, $zero .LBB9_16: # %.loopexit move $a0, $s4 - ld.d $s8, $sp, 120 # 8-byte Folded Reload - ld.d $s7, $sp, 128 # 8-byte Folded Reload - ld.d $s6, $sp, 136 # 8-byte Folded Reload - ld.d $s5, $sp, 144 # 8-byte Folded Reload - ld.d $s4, $sp, 152 # 8-byte Folded Reload - ld.d $s3, $sp, 160 # 8-byte Folded Reload - ld.d $s2, $sp, 168 # 8-byte Folded Reload - ld.d $s1, $sp, 176 # 8-byte Folded Reload - ld.d $s0, $sp, 184 # 8-byte Folded Reload - ld.d $fp, $sp, 192 # 8-byte Folded Reload - ld.d $ra, $sp, 200 # 8-byte Folded Reload - addi.d $sp, $sp, 208 + ld.d $s8, $sp, 88 # 8-byte Folded Reload + ld.d $s7, $sp, 96 # 8-byte Folded Reload + ld.d $s6, $sp, 104 # 8-byte Folded Reload + ld.d $s5, $sp, 112 # 8-byte Folded Reload + ld.d $s4, $sp, 120 # 8-byte Folded Reload + ld.d $s3, $sp, 128 # 8-byte Folded Reload + ld.d $s2, $sp, 136 # 8-byte Folded Reload + ld.d $s1, $sp, 144 # 8-byte Folded Reload + ld.d $s0, $sp, 152 # 8-byte Folded Reload + ld.d $fp, $sp, 160 # 8-byte Folded Reload + ld.d $ra, $sp, 168 # 8-byte Folded Reload + addi.d $sp, $sp, 176 ret .Lfunc_end9: .size computeSATD, .Lfunc_end9-computeSATD 
@@ -5766,21 +5652,21 @@ computeSATDWP: # @computeSATDWP .type computeBiPredSATD1,@function computeBiPredSATD1: # @computeBiPredSATD1 # %bb.0: - addi.d $sp, $sp, -288 - st.d $ra, $sp, 280 # 8-byte Folded Spill - st.d $fp, $sp, 272 # 8-byte Folded Spill - st.d $s0, $sp, 264 # 8-byte Folded Spill - st.d $s1, $sp, 256 # 8-byte Folded Spill - st.d $s2, $sp, 248 # 8-byte Folded Spill - st.d $s3, $sp, 240 # 8-byte Folded Spill - st.d $s4, $sp, 232 # 8-byte Folded Spill - st.d $s5, $sp, 224 # 8-byte Folded Spill - st.d $s6, $sp, 216 # 8-byte Folded Spill - st.d $s7, $sp, 208 # 8-byte Folded Spill - st.d $s8, $sp, 200 # 8-byte Folded Spill + addi.d $sp, $sp, -256 + st.d $ra, $sp, 248 # 8-byte Folded Spill + st.d $fp, $sp, 240 # 8-byte Folded Spill + st.d $s0, $sp, 232 # 8-byte Folded Spill + st.d $s1, $sp, 224 # 8-byte Folded Spill + st.d $s2, $sp, 216 # 8-byte Folded Spill + st.d $s3, $sp, 208 # 8-byte Folded Spill + st.d $s4, $sp, 200 # 8-byte Folded Spill + st.d $s5, $sp, 192 # 8-byte Folded Spill + st.d $s6, $sp, 184 # 8-byte Folded Spill + st.d $s7, $sp, 176 # 8-byte Folded Spill + st.d $s8, $sp, 168 # 8-byte Folded Spill pcalau12i $t0, %pc_hi20(test8x8transform) ld.w $t0, $t0, %pc_lo12(test8x8transform) - st.d $a3, $sp, 192 # 8-byte Folded Spill + st.d $a3, $sp, 160 # 8-byte Folded Spill move $s4, $a2 beqz $t0, .LBB11_8 # %bb.1: @@ -5793,75 +5679,73 @@ computeBiPredSATD1: # @computeBiPredSATD1 slli.w $a2, $s4, 3 addi.w $s1, $a3, -8 slli.d $a2, $a2, 1 - st.d $a2, $sp, 16 # 8-byte Folded Spill + st.d $a2, $sp, 8 # 8-byte Folded Spill pcalau12i $a2, %pc_hi20(src_line) - st.d $a2, $sp, 168 # 8-byte Folded Spill + st.d $a2, $sp, 136 # 8-byte Folded Spill pcalau12i $a2, %pc_hi20(bipred2_access_method) - st.d $a2, $sp, 160 # 8-byte Folded Spill + st.d $a2, $sp, 128 # 8-byte Folded Spill pcalau12i $a2, %pc_hi20(get_line) addi.d $a2, $a2, %pc_lo12(get_line) - st.d $a2, $sp, 152 # 8-byte Folded Spill + st.d $a2, $sp, 120 # 8-byte Folded Spill pcalau12i $a2, %pc_hi20(ref_pic2_sub) - st.d $a2, $sp, 144 # 8-byte Folded Spill + st.d $a2, $sp, 112 # 8-byte Folded Spill pcalau12i $a2, %pc_hi20(ref2_line) - st.d $a2, $sp, 136 # 8-byte Folded Spill + st.d $a2, $sp, 104 # 8-byte Folded Spill pcalau12i $a2, %pc_hi20(bipred1_access_method) - st.d $a2, $sp, 128 # 8-byte Folded Spill + st.d $a2, $sp, 96 # 8-byte Folded Spill pcalau12i $a2, %pc_hi20(ref_pic1_sub) - st.d $a2, $sp, 120 # 8-byte Folded Spill - vrepli.b $vr0, 0 - vst $vr0, $sp, 96 # 16-byte Folded Spill + st.d $a2, $sp, 88 # 8-byte Folded Spill pcalau12i $a2, %pc_hi20(diff) addi.d $s7, $a2, %pc_lo12(diff) move $s6, $zero move $a2, $zero slli.d $a3, $s4, 1 - st.d $a3, $sp, 80 # 8-byte Folded Spill + st.d $a3, $sp, 72 # 8-byte Folded Spill pcalau12i $a3, %pc_hi20(ref1_line) - st.d $a3, $sp, 88 # 8-byte Folded Spill - st.d $a1, $sp, 24 # 8-byte Folded Spill - st.d $a7, $sp, 64 # 8-byte Folded Spill - st.d $a6, $sp, 56 # 8-byte Folded Spill - st.d $a5, $sp, 48 # 8-byte Folded Spill - st.d $a4, $sp, 40 # 8-byte Folded Spill + st.d $a3, $sp, 80 # 8-byte Folded Spill + st.d $a1, $sp, 16 # 8-byte Folded Spill + st.d $a7, $sp, 56 # 8-byte Folded Spill + st.d $a6, $sp, 48 # 8-byte Folded Spill + st.d $a5, $sp, 40 # 8-byte Folded Spill + st.d $a4, $sp, 32 # 8-byte Folded Spill .LBB11_4: # %.lr.ph.us # =>This Loop Header: Depth=1 # Child Loop BB11_5 Depth 2 move $s8, $zero alsl.w $a1, $a2, $a7, 2 - st.d $a1, $sp, 184 # 8-byte Folded Spill - st.d $a2, $sp, 32 # 8-byte Folded Spill + st.d $a1, $sp, 152 # 8-byte Folded Spill + st.d $a2, $sp, 24 # 8-byte Folded Spill 
alsl.w $a1, $a2, $a5, 2 - st.d $a1, $sp, 176 # 8-byte Folded Spill - st.d $a0, $sp, 72 # 8-byte Folded Spill + st.d $a1, $sp, 144 # 8-byte Folded Spill + st.d $a0, $sp, 64 # 8-byte Folded Spill move $s5, $a0 move $s0, $a6 move $s2, $a4 .p2align 4, , 16 .LBB11_5: # Parent Loop BB11_4 Depth=1 # => This Inner Loop Header: Depth=2 - ld.d $a0, $sp, 160 # 8-byte Folded Reload + ld.d $a0, $sp, 128 # 8-byte Folded Reload ld.w $a0, $a0, %pc_lo12(bipred2_access_method) slli.d $a0, $a0, 3 - ld.d $fp, $sp, 152 # 8-byte Folded Reload + ld.d $fp, $sp, 120 # 8-byte Folded Reload ldx.d $a3, $fp, $a0 - ld.d $a0, $sp, 144 # 8-byte Folded Reload + ld.d $a0, $sp, 112 # 8-byte Folded Reload ld.d $a0, $a0, %pc_lo12(ref_pic2_sub) - ld.d $s3, $sp, 168 # 8-byte Folded Reload + ld.d $s3, $sp, 136 # 8-byte Folded Reload st.d $s5, $s3, %pc_lo12(src_line) - ld.d $a1, $sp, 184 # 8-byte Folded Reload + ld.d $a1, $sp, 152 # 8-byte Folded Reload move $a2, $s0 jirl $ra, $a3, 0 - ld.d $a1, $sp, 128 # 8-byte Folded Reload + ld.d $a1, $sp, 96 # 8-byte Folded Reload ld.w $a1, $a1, %pc_lo12(bipred1_access_method) slli.d $a1, $a1, 3 ldx.d $a3, $fp, $a1 - ld.d $a1, $sp, 120 # 8-byte Folded Reload + ld.d $a1, $sp, 88 # 8-byte Folded Reload ld.d $a1, $a1, %pc_lo12(ref_pic1_sub) - ld.d $fp, $sp, 136 # 8-byte Folded Reload + ld.d $fp, $sp, 104 # 8-byte Folded Reload st.d $a0, $fp, %pc_lo12(ref2_line) move $a0, $a1 - ld.d $a1, $sp, 176 # 8-byte Folded Reload + ld.d $a1, $sp, 144 # 8-byte Folded Reload move $a2, $s2 jirl $ra, $a3, 0 ld.d $a1, $s3, %pc_lo12(src_line) @@ -5870,15 +5754,14 @@ computeBiPredSATD1: # @computeBiPredSATD1 ld.d $a4, $a0, 0 ld.d $a5, $a2, 0 vinsgr2vr.d $vr0, $a3, 0 - vld $vr4, $sp, 96 # 16-byte Folded Reload - vilvl.h $vr0, $vr4, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a4, 0 vinsgr2vr.d $vr2, $a5, 0 vor.v $vr3, $vr1, $vr2 vxor.v $vr1, $vr1, $vr2 vsrli.h $vr1, $vr1, 1 vsub.h $vr1, $vr3, $vr1 - vilvl.h $vr1, $vr4, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 vst $vr0, $s7, 0 alsl.d $a3, $s1, $a0, 1 @@ -5887,16 +5770,16 @@ computeBiPredSATD1: # @computeBiPredSATD1 ld.d $a0, $a0, 8 ld.d $a2, $a2, 8 vinsgr2vr.d $vr0, $a4, 0 - vilvl.h $vr0, $vr4, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a0, 0 vinsgr2vr.d $vr2, $a2, 0 vor.v $vr3, $vr1, $vr2 vxor.v $vr1, $vr1, $vr2 vsrli.h $vr1, $vr1, 1 vsub.h $vr1, $vr3, $vr1 - vilvl.h $vr1, $vr4, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 - ld.d $a6, $sp, 80 # 8-byte Folded Reload + ld.d $a6, $sp, 72 # 8-byte Folded Reload ldx.d $a0, $a1, $a6 ld.d $a2, $a3, 16 vst $vr0, $s7, 16 @@ -5906,13 +5789,13 @@ computeBiPredSATD1: # @computeBiPredSATD1 ld.d $a0, $a5, 16 addi.d $a2, $a5, 16 alsl.d $a1, $s4, $a1, 1 - vilvl.h $vr0, $vr4, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr2, $a0, 0 vor.v $vr3, $vr1, $vr2 vxor.v $vr1, $vr1, $vr2 vsrli.h $vr1, $vr1, 1 vsub.h $vr1, $vr3, $vr1 - vilvl.h $vr1, $vr4, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 vst $vr0, $s7, 32 alsl.d $a0, $s1, $a4, 1 @@ -5921,14 +5804,14 @@ computeBiPredSATD1: # @computeBiPredSATD1 ld.d $a3, $a3, 24 ld.d $a5, $a5, 24 vinsgr2vr.d $vr0, $a4, 0 - vilvl.h $vr0, $vr4, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a3, 0 vinsgr2vr.d $vr2, $a5, 0 vor.v $vr3, $vr1, $vr2 vxor.v $vr1, $vr1, $vr2 vsrli.h $vr1, $vr1, 1 vsub.h $vr1, $vr3, $vr1 - vilvl.h $vr1, $vr4, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 ldx.d $a3, $a1, $a6 ld.d $a4, $a0, 16 @@ -5939,13 +5822,13 @@ computeBiPredSATD1: # @computeBiPredSATD1 ld.d $a3, $a2, 16 addi.d $a4, 
$a2, 16 alsl.d $a1, $s4, $a1, 1 - vilvl.h $vr0, $vr4, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr2, $a3, 0 vor.v $vr3, $vr1, $vr2 vxor.v $vr1, $vr1, $vr2 vsrli.h $vr1, $vr1, 1 vsub.h $vr1, $vr3, $vr1 - vilvl.h $vr1, $vr4, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 vst $vr0, $s7, 64 alsl.d $a3, $s1, $a5, 1 @@ -5954,14 +5837,14 @@ computeBiPredSATD1: # @computeBiPredSATD1 ld.d $a0, $a0, 24 ld.d $a2, $a2, 24 vinsgr2vr.d $vr0, $a5, 0 - vilvl.h $vr0, $vr4, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a0, 0 vinsgr2vr.d $vr2, $a2, 0 vor.v $vr3, $vr1, $vr2 vxor.v $vr1, $vr1, $vr2 vsrli.h $vr1, $vr1, 1 vsub.h $vr1, $vr3, $vr1 - vilvl.h $vr1, $vr4, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 ldx.d $a0, $a1, $a6 ld.d $a2, $a3, 16 @@ -5972,13 +5855,13 @@ computeBiPredSATD1: # @computeBiPredSATD1 ld.d $a0, $a4, 16 addi.d $a2, $a4, 16 alsl.d $a1, $s4, $a1, 1 - vilvl.h $vr0, $vr4, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr2, $a0, 0 vor.v $vr3, $vr1, $vr2 vxor.v $vr1, $vr1, $vr2 vsrli.h $vr1, $vr1, 1 vsub.h $vr1, $vr3, $vr1 - vilvl.h $vr1, $vr4, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 vst $vr0, $s7, 96 alsl.d $a0, $s1, $a5, 1 @@ -5987,14 +5870,14 @@ computeBiPredSATD1: # @computeBiPredSATD1 ld.d $a3, $a3, 24 ld.d $a4, $a4, 24 vinsgr2vr.d $vr0, $a5, 0 - vilvl.h $vr0, $vr4, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a3, 0 vinsgr2vr.d $vr2, $a4, 0 vor.v $vr3, $vr1, $vr2 vxor.v $vr1, $vr1, $vr2 vsrli.h $vr1, $vr1, 1 vsub.h $vr1, $vr3, $vr1 - vilvl.h $vr1, $vr4, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 ldx.d $a3, $a1, $a6 ld.d $a4, $a0, 16 @@ -6005,13 +5888,13 @@ computeBiPredSATD1: # @computeBiPredSATD1 ld.d $a3, $a2, 16 addi.d $a4, $a2, 16 alsl.d $a1, $s4, $a1, 1 - vilvl.h $vr0, $vr4, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr2, $a3, 0 vor.v $vr3, $vr1, $vr2 vxor.v $vr1, $vr1, $vr2 vsrli.h $vr1, $vr1, 1 vsub.h $vr1, $vr3, $vr1 - vilvl.h $vr1, $vr4, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 vst $vr0, $s7, 128 alsl.d $a3, $s1, $a5, 1 @@ -6020,14 +5903,14 @@ computeBiPredSATD1: # @computeBiPredSATD1 ld.d $a0, $a0, 24 ld.d $a2, $a2, 24 vinsgr2vr.d $vr0, $a5, 0 - vilvl.h $vr0, $vr4, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a0, 0 vinsgr2vr.d $vr2, $a2, 0 vor.v $vr3, $vr1, $vr2 vxor.v $vr1, $vr1, $vr2 vsrli.h $vr1, $vr1, 1 vsub.h $vr1, $vr3, $vr1 - vilvl.h $vr1, $vr4, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 ldx.d $a0, $a1, $a6 ld.d $a2, $a3, 16 @@ -6038,13 +5921,13 @@ computeBiPredSATD1: # @computeBiPredSATD1 ld.d $a0, $a4, 16 addi.d $a2, $a4, 16 alsl.d $a1, $s4, $a1, 1 - vilvl.h $vr0, $vr4, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr2, $a0, 0 vor.v $vr3, $vr1, $vr2 vxor.v $vr1, $vr1, $vr2 vsrli.h $vr1, $vr1, 1 vsub.h $vr1, $vr3, $vr1 - vilvl.h $vr1, $vr4, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 vst $vr0, $s7, 160 alsl.d $a0, $s1, $a5, 1 @@ -6053,14 +5936,14 @@ computeBiPredSATD1: # @computeBiPredSATD1 ld.d $a3, $a3, 24 ld.d $a4, $a4, 24 vinsgr2vr.d $vr0, $a5, 0 - vilvl.h $vr0, $vr4, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a3, 0 vinsgr2vr.d $vr2, $a4, 0 vor.v $vr3, $vr1, $vr2 vxor.v $vr1, $vr1, $vr2 vsrli.h $vr1, $vr1, 1 vsub.h $vr1, $vr3, $vr1 - vilvl.h $vr1, $vr4, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 ldx.d $a3, $a1, $a6 ld.d $a4, $a0, 16 @@ -6071,13 +5954,13 @@ computeBiPredSATD1: # @computeBiPredSATD1 ld.d $a3, $a2, 16 addi.d $a4, $a2, 16 alsl.d $a1, $s4, $a1, 1 - vilvl.h $vr0, $vr4, $vr0 + 
vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr2, $a3, 0 vor.v $vr3, $vr1, $vr2 vxor.v $vr1, $vr1, $vr2 vsrli.h $vr1, $vr1, 1 vsub.h $vr1, $vr3, $vr1 - vilvl.h $vr1, $vr4, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 vst $vr0, $s7, 192 alsl.d $a3, $s1, $a5, 1 @@ -6086,14 +5969,14 @@ computeBiPredSATD1: # @computeBiPredSATD1 ld.d $a0, $a0, 24 ld.d $a2, $a2, 24 vinsgr2vr.d $vr0, $a5, 0 - vilvl.h $vr0, $vr4, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a0, 0 vinsgr2vr.d $vr2, $a2, 0 vor.v $vr3, $vr1, $vr2 vxor.v $vr1, $vr1, $vr2 vsrli.h $vr1, $vr1, 1 vsub.h $vr1, $vr3, $vr1 - vilvl.h $vr1, $vr4, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 ldx.d $a0, $a1, $a6 ld.d $a2, $a3, 16 @@ -6104,13 +5987,13 @@ computeBiPredSATD1: # @computeBiPredSATD1 ld.d $a0, $a4, 16 addi.d $a2, $a4, 16 alsl.d $a1, $s4, $a1, 1 - vilvl.h $vr0, $vr4, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr2, $a0, 0 vor.v $vr3, $vr1, $vr2 vxor.v $vr1, $vr1, $vr2 vsrli.h $vr1, $vr1, 1 vsub.h $vr1, $vr3, $vr1 - vilvl.h $vr1, $vr4, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 vst $vr0, $s7, 224 alsl.d $a0, $s1, $a5, 1 @@ -6119,28 +6002,28 @@ computeBiPredSATD1: # @computeBiPredSATD1 ld.d $a3, $a3, 24 ld.d $a4, $a4, 24 vinsgr2vr.d $vr0, $a5, 0 - vilvl.h $vr0, $vr4, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a3, 0 vinsgr2vr.d $vr2, $a4, 0 vor.v $vr3, $vr1, $vr2 vxor.v $vr1, $vr1, $vr2 vsrli.h $vr1, $vr1, 1 vsub.h $vr1, $vr3, $vr1 - vilvl.h $vr1, $vr4, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 vst $vr0, $s7, 240 addi.d $a0, $a0, 16 addi.d $a2, $a2, 16 alsl.d $a1, $s4, $a1, 1 st.d $a1, $s3, %pc_lo12(src_line) - ld.d $a1, $sp, 88 # 8-byte Folded Reload + ld.d $a1, $sp, 80 # 8-byte Folded Reload st.d $a0, $a1, %pc_lo12(ref1_line) st.d $a2, $fp, %pc_lo12(ref2_line) move $a0, $s7 pcaddu18i $ra, %call36(HadamardSAD8x8) jirl $ra, $ra, 0 add.w $s6, $a0, $s6 - ld.d $a0, $sp, 192 # 8-byte Folded Reload + ld.d $a0, $sp, 160 # 8-byte Folded Reload blt $a0, $s6, .LBB11_16 # %bb.6: # in Loop: Header=BB11_5 Depth=2 addi.w $s2, $s2, 32 @@ -6150,16 +6033,16 @@ computeBiPredSATD1: # @computeBiPredSATD1 blt $s8, $s4, .LBB11_5 # %bb.7: # %..thread_crit_edge.us # in Loop: Header=BB11_4 Depth=1 - ld.d $a2, $sp, 32 # 8-byte Folded Reload + ld.d $a2, $sp, 24 # 8-byte Folded Reload addi.w $a2, $a2, 8 - ld.d $a0, $sp, 72 # 8-byte Folded Reload - ld.d $a1, $sp, 16 # 8-byte Folded Reload + ld.d $a0, $sp, 64 # 8-byte Folded Reload + ld.d $a1, $sp, 8 # 8-byte Folded Reload add.d $a0, $a0, $a1 - ld.d $a1, $sp, 24 # 8-byte Folded Reload - ld.d $a7, $sp, 64 # 8-byte Folded Reload - ld.d $a6, $sp, 56 # 8-byte Folded Reload - ld.d $a5, $sp, 48 # 8-byte Folded Reload - ld.d $a4, $sp, 40 # 8-byte Folded Reload + ld.d $a1, $sp, 16 # 8-byte Folded Reload + ld.d $a7, $sp, 56 # 8-byte Folded Reload + ld.d $a6, $sp, 48 # 8-byte Folded Reload + ld.d $a5, $sp, 40 # 8-byte Folded Reload + ld.d $a4, $sp, 32 # 8-byte Folded Reload blt $a2, $a1, .LBB11_4 b .LBB11_16 .LBB11_8: @@ -6173,75 +6056,73 @@ computeBiPredSATD1: # @computeBiPredSATD1 slli.w $a1, $s4, 2 addi.w $s3, $a2, -4 slli.d $a1, $a1, 1 - st.d $a1, $sp, 16 # 8-byte Folded Spill + st.d $a1, $sp, 8 # 8-byte Folded Spill pcalau12i $a1, %pc_hi20(src_line) - st.d $a1, $sp, 168 # 8-byte Folded Spill + st.d $a1, $sp, 136 # 8-byte Folded Spill pcalau12i $a1, %pc_hi20(bipred2_access_method) - st.d $a1, $sp, 160 # 8-byte Folded Spill + st.d $a1, $sp, 128 # 8-byte Folded Spill pcalau12i $a1, %pc_hi20(get_line) addi.d $a1, $a1, %pc_lo12(get_line) - st.d 
$a1, $sp, 152 # 8-byte Folded Spill + st.d $a1, $sp, 120 # 8-byte Folded Spill pcalau12i $a1, %pc_hi20(ref_pic2_sub) - st.d $a1, $sp, 144 # 8-byte Folded Spill + st.d $a1, $sp, 112 # 8-byte Folded Spill pcalau12i $a1, %pc_hi20(ref2_line) - st.d $a1, $sp, 136 # 8-byte Folded Spill + st.d $a1, $sp, 104 # 8-byte Folded Spill pcalau12i $a1, %pc_hi20(bipred1_access_method) - st.d $a1, $sp, 128 # 8-byte Folded Spill + st.d $a1, $sp, 96 # 8-byte Folded Spill pcalau12i $a1, %pc_hi20(ref_pic1_sub) - st.d $a1, $sp, 120 # 8-byte Folded Spill - vrepli.b $vr0, 0 - vst $vr0, $sp, 96 # 16-byte Folded Spill + st.d $a1, $sp, 88 # 8-byte Folded Spill pcalau12i $a1, %pc_hi20(diff) addi.d $fp, $a1, %pc_lo12(diff) move $s6, $zero move $a2, $zero slli.d $a1, $s4, 1 - st.d $a1, $sp, 88 # 8-byte Folded Spill - pcalau12i $a1, %pc_hi20(ref1_line) st.d $a1, $sp, 80 # 8-byte Folded Spill - st.d $a7, $sp, 64 # 8-byte Folded Spill - st.d $a6, $sp, 56 # 8-byte Folded Spill - st.d $a5, $sp, 48 # 8-byte Folded Spill - st.d $a4, $sp, 40 # 8-byte Folded Spill - st.d $a3, $sp, 24 # 8-byte Folded Spill + pcalau12i $a1, %pc_hi20(ref1_line) + st.d $a1, $sp, 72 # 8-byte Folded Spill + st.d $a7, $sp, 56 # 8-byte Folded Spill + st.d $a6, $sp, 48 # 8-byte Folded Spill + st.d $a5, $sp, 40 # 8-byte Folded Spill + st.d $a4, $sp, 32 # 8-byte Folded Spill + st.d $a3, $sp, 16 # 8-byte Folded Spill .LBB11_11: # %.preheader.us # =>This Loop Header: Depth=1 # Child Loop BB11_12 Depth 2 move $s1, $zero add.w $a1, $a2, $a7 - st.d $a1, $sp, 184 # 8-byte Folded Spill - st.d $a2, $sp, 32 # 8-byte Folded Spill + st.d $a1, $sp, 152 # 8-byte Folded Spill + st.d $a2, $sp, 24 # 8-byte Folded Spill add.w $a1, $a2, $a5 - st.d $a1, $sp, 176 # 8-byte Folded Spill - st.d $a0, $sp, 72 # 8-byte Folded Spill + st.d $a1, $sp, 144 # 8-byte Folded Spill + st.d $a0, $sp, 64 # 8-byte Folded Spill move $s5, $a0 move $s7, $a6 move $s8, $a4 .p2align 4, , 16 .LBB11_12: # Parent Loop BB11_11 Depth=1 # => This Inner Loop Header: Depth=2 - ld.d $a0, $sp, 160 # 8-byte Folded Reload + ld.d $a0, $sp, 128 # 8-byte Folded Reload ld.w $a0, $a0, %pc_lo12(bipred2_access_method) slli.d $a0, $a0, 3 - ld.d $s2, $sp, 152 # 8-byte Folded Reload + ld.d $s2, $sp, 120 # 8-byte Folded Reload ldx.d $a3, $s2, $a0 - ld.d $a0, $sp, 144 # 8-byte Folded Reload + ld.d $a0, $sp, 112 # 8-byte Folded Reload ld.d $a0, $a0, %pc_lo12(ref_pic2_sub) - ld.d $s0, $sp, 168 # 8-byte Folded Reload + ld.d $s0, $sp, 136 # 8-byte Folded Reload st.d $s5, $s0, %pc_lo12(src_line) - ld.d $a1, $sp, 184 # 8-byte Folded Reload + ld.d $a1, $sp, 152 # 8-byte Folded Reload move $a2, $s7 jirl $ra, $a3, 0 - ld.d $a1, $sp, 128 # 8-byte Folded Reload + ld.d $a1, $sp, 96 # 8-byte Folded Reload ld.w $a1, $a1, %pc_lo12(bipred1_access_method) slli.d $a1, $a1, 3 ldx.d $a3, $s2, $a1 - ld.d $a1, $sp, 120 # 8-byte Folded Reload + ld.d $a1, $sp, 88 # 8-byte Folded Reload ld.d $a1, $a1, %pc_lo12(ref_pic1_sub) - ld.d $s2, $sp, 136 # 8-byte Folded Reload + ld.d $s2, $sp, 104 # 8-byte Folded Reload st.d $a0, $s2, %pc_lo12(ref2_line) move $a0, $a1 - ld.d $a1, $sp, 176 # 8-byte Folded Reload + ld.d $a1, $sp, 144 # 8-byte Folded Reload move $a2, $s8 jirl $ra, $a3, 0 ld.d $a1, $s0, %pc_lo12(src_line) @@ -6252,35 +6133,34 @@ computeBiPredSATD1: # @computeBiPredSATD1 ld.d $a0, $a0, 0 ld.d $a2, $a2, 0 vinsgr2vr.d $vr0, $a4, 0 - vld $vr4, $sp, 96 # 16-byte Folded Reload - vilvl.h $vr0, $vr4, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a0, 0 vinsgr2vr.d $vr2, $a2, 0 vor.v $vr3, $vr1, $vr2 vxor.v $vr1, $vr1, $vr2 vsrli.h $vr1, $vr1, 1 
vsub.h $vr1, $vr3, $vr1 - vilvl.h $vr1, $vr4, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 vst $vr0, $fp, 0 addi.d $a0, $a3, 8 addi.d $a2, $a5, 8 alsl.d $a4, $s4, $a1, 1 alsl.d $a0, $s3, $a0, 1 - ld.d $a6, $sp, 88 # 8-byte Folded Reload + ld.d $a6, $sp, 80 # 8-byte Folded Reload ldx.d $a1, $a1, $a6 alsl.d $a2, $s3, $a2, 1 ld.d $a3, $a3, 8 ld.d $a5, $a5, 8 vinsgr2vr.d $vr0, $a1, 0 - vilvl.h $vr0, $vr4, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a3, 0 vinsgr2vr.d $vr2, $a5, 0 vor.v $vr3, $vr1, $vr2 vxor.v $vr1, $vr1, $vr2 vsrli.h $vr1, $vr1, 1 vsub.h $vr1, $vr3, $vr1 - vilvl.h $vr1, $vr4, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 vst $vr0, $fp, 16 addi.d $a1, $a0, 8 @@ -6292,14 +6172,14 @@ computeBiPredSATD1: # @computeBiPredSATD1 ld.d $a0, $a0, 8 ld.d $a2, $a2, 8 vinsgr2vr.d $vr0, $a4, 0 - vilvl.h $vr0, $vr4, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a0, 0 vinsgr2vr.d $vr2, $a2, 0 vor.v $vr3, $vr1, $vr2 vxor.v $vr1, $vr1, $vr2 vsrli.h $vr1, $vr1, 1 vsub.h $vr1, $vr3, $vr1 - vilvl.h $vr1, $vr4, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 vst $vr0, $fp, 32 addi.d $a0, $a1, 8 @@ -6311,28 +6191,28 @@ computeBiPredSATD1: # @computeBiPredSATD1 ld.d $a1, $a1, 8 ld.d $a3, $a3, 8 vinsgr2vr.d $vr0, $a5, 0 - vilvl.h $vr0, $vr4, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a1, 0 vinsgr2vr.d $vr2, $a3, 0 vor.v $vr3, $vr1, $vr2 vxor.v $vr1, $vr1, $vr2 vsrli.h $vr1, $vr1, 1 vsub.h $vr1, $vr3, $vr1 - vilvl.h $vr1, $vr4, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 vst $vr0, $fp, 48 addi.d $a0, $a0, 8 addi.d $a1, $a2, 8 alsl.d $a2, $s4, $a4, 1 st.d $a2, $s0, %pc_lo12(src_line) - ld.d $a2, $sp, 80 # 8-byte Folded Reload + ld.d $a2, $sp, 72 # 8-byte Folded Reload st.d $a0, $a2, %pc_lo12(ref1_line) st.d $a1, $s2, %pc_lo12(ref2_line) move $a0, $fp pcaddu18i $ra, %call36(HadamardSAD4x4) jirl $ra, $ra, 0 add.w $s6, $a0, $s6 - ld.d $a0, $sp, 192 # 8-byte Folded Reload + ld.d $a0, $sp, 160 # 8-byte Folded Reload blt $a0, $s6, .LBB11_16 # %bb.13: # in Loop: Header=BB11_12 Depth=2 addi.w $s8, $s8, 16 @@ -6342,34 +6222,34 @@ computeBiPredSATD1: # @computeBiPredSATD1 blt $s1, $s4, .LBB11_12 # %bb.14: # %._crit_edge.us # in Loop: Header=BB11_11 Depth=1 - ld.d $a2, $sp, 32 # 8-byte Folded Reload + ld.d $a2, $sp, 24 # 8-byte Folded Reload addi.w $a2, $a2, 16 - ld.d $a0, $sp, 72 # 8-byte Folded Reload - ld.d $a1, $sp, 16 # 8-byte Folded Reload + ld.d $a0, $sp, 64 # 8-byte Folded Reload + ld.d $a1, $sp, 8 # 8-byte Folded Reload add.d $a0, $a0, $a1 - ld.d $a7, $sp, 64 # 8-byte Folded Reload - ld.d $a6, $sp, 56 # 8-byte Folded Reload - ld.d $a5, $sp, 48 # 8-byte Folded Reload - ld.d $a4, $sp, 40 # 8-byte Folded Reload - ld.d $a3, $sp, 24 # 8-byte Folded Reload + ld.d $a7, $sp, 56 # 8-byte Folded Reload + ld.d $a6, $sp, 48 # 8-byte Folded Reload + ld.d $a5, $sp, 40 # 8-byte Folded Reload + ld.d $a4, $sp, 32 # 8-byte Folded Reload + ld.d $a3, $sp, 16 # 8-byte Folded Reload blt $a2, $a3, .LBB11_11 b .LBB11_16 .LBB11_15: move $s6, $zero .LBB11_16: # %.loopexit move $a0, $s6 - ld.d $s8, $sp, 200 # 8-byte Folded Reload - ld.d $s7, $sp, 208 # 8-byte Folded Reload - ld.d $s6, $sp, 216 # 8-byte Folded Reload - ld.d $s5, $sp, 224 # 8-byte Folded Reload - ld.d $s4, $sp, 232 # 8-byte Folded Reload - ld.d $s3, $sp, 240 # 8-byte Folded Reload - ld.d $s2, $sp, 248 # 8-byte Folded Reload - ld.d $s1, $sp, 256 # 8-byte Folded Reload - ld.d $s0, $sp, 264 # 8-byte Folded Reload - ld.d $fp, $sp, 272 # 8-byte Folded Reload - ld.d $ra, $sp, 280 # 8-byte 
Folded Reload - addi.d $sp, $sp, 288 + ld.d $s8, $sp, 168 # 8-byte Folded Reload + ld.d $s7, $sp, 176 # 8-byte Folded Reload + ld.d $s6, $sp, 184 # 8-byte Folded Reload + ld.d $s5, $sp, 192 # 8-byte Folded Reload + ld.d $s4, $sp, 200 # 8-byte Folded Reload + ld.d $s3, $sp, 208 # 8-byte Folded Reload + ld.d $s2, $sp, 216 # 8-byte Folded Reload + ld.d $s1, $sp, 224 # 8-byte Folded Reload + ld.d $s0, $sp, 232 # 8-byte Folded Reload + ld.d $fp, $sp, 240 # 8-byte Folded Reload + ld.d $ra, $sp, 248 # 8-byte Folded Reload + addi.d $sp, $sp, 256 ret .Lfunc_end11: .size computeBiPredSATD1, .Lfunc_end11-computeBiPredSATD1 @@ -7721,8 +7601,8 @@ computeSSEWP: # @computeSSEWP vinsgr2vr.h $vr9, $s2, 1 vinsgr2vr.h $vr9, $s6, 2 vinsgr2vr.h $vr9, $ra, 3 - vilvl.h $vr8, $vr5, $vr8 - vilvl.h $vr9, $vr5, $vr9 + vsllwil.wu.hu $vr8, $vr8, 0 + vsllwil.wu.hu $vr9, $vr9, 0 vori.b $vr10, $vr1, 0 vmadd.w $vr10, $vr0, $vr8 vori.b $vr8, $vr1, 0 @@ -7751,22 +7631,16 @@ computeSSEWP: # @computeSSEWP vinsgr2vr.h $vr11, $s2, 1 vinsgr2vr.h $vr11, $s6, 2 vinsgr2vr.h $vr11, $ra, 3 - vilvl.h $vr10, $vr5, $vr10 - vilvl.h $vr11, $vr5, $vr11 + vsllwil.wu.hu $vr10, $vr10, 0 + vsllwil.wu.hu $vr11, $vr11, 0 vsub.w $vr9, $vr10, $vr9 vsub.w $vr8, $vr11, $vr8 - vshuf4i.w $vr10, $vr9, 50 - vslli.d $vr10, $vr10, 32 - vsrai.d $vr10, $vr10, 32 - vshuf4i.w $vr9, $vr9, 16 - vslli.d $vr9, $vr9, 32 - vsrai.d $vr9, $vr9, 32 - vshuf4i.w $vr11, $vr8, 50 - vslli.d $vr11, $vr11, 32 - vsrai.d $vr11, $vr11, 32 - vshuf4i.w $vr8, $vr8, 16 - vslli.d $vr8, $vr8, 32 - vsrai.d $vr8, $vr8, 32 + vshuf4i.w $vr10, $vr9, 14 + vsllwil.d.w $vr10, $vr10, 0 + vsllwil.d.w $vr9, $vr9, 0 + vshuf4i.w $vr11, $vr8, 14 + vsllwil.d.w $vr11, $vr11, 0 + vsllwil.d.w $vr8, $vr8, 0 vpickve2gr.d $s1, $vr9, 0 slli.d $s1, $s1, 2 vpickve2gr.d $s2, $vr9, 1 @@ -7817,8 +7691,8 @@ computeSSEWP: # @computeSSEWP vinsgr2vr.h $vr9, $s0, 1 vinsgr2vr.h $vr9, $s1, 2 vinsgr2vr.h $vr9, $s2, 3 - vilvl.h $vr8, $vr5, $vr8 - vilvl.h $vr9, $vr5, $vr9 + vsllwil.wu.hu $vr8, $vr8, 0 + vsllwil.wu.hu $vr9, $vr9, 0 vori.b $vr10, $vr1, 0 vmadd.w $vr10, $vr0, $vr8 vori.b $vr8, $vr1, 0 @@ -7847,22 +7721,16 @@ computeSSEWP: # @computeSSEWP vinsgr2vr.h $vr11, $s0, 1 vinsgr2vr.h $vr11, $s1, 2 vinsgr2vr.h $vr11, $s2, 3 - vilvl.h $vr10, $vr5, $vr10 - vilvl.h $vr11, $vr5, $vr11 + vsllwil.wu.hu $vr10, $vr10, 0 + vsllwil.wu.hu $vr11, $vr11, 0 vsub.w $vr9, $vr10, $vr9 vsub.w $vr8, $vr11, $vr8 - vshuf4i.w $vr10, $vr9, 50 - vslli.d $vr10, $vr10, 32 - vsrai.d $vr10, $vr10, 32 - vshuf4i.w $vr9, $vr9, 16 - vslli.d $vr9, $vr9, 32 - vsrai.d $vr9, $vr9, 32 - vshuf4i.w $vr11, $vr8, 50 - vslli.d $vr11, $vr11, 32 - vsrai.d $vr11, $vr11, 32 - vshuf4i.w $vr8, $vr8, 16 - vslli.d $vr8, $vr8, 32 - vsrai.d $vr8, $vr8, 32 + vshuf4i.w $vr10, $vr9, 14 + vsllwil.d.w $vr10, $vr10, 0 + vsllwil.d.w $vr9, $vr9, 0 + vshuf4i.w $vr11, $vr8, 14 + vsllwil.d.w $vr11, $vr11, 0 + vsllwil.d.w $vr8, $vr8, 0 vpickve2gr.d $t4, $vr9, 0 slli.d $t4, $t4, 2 vpickve2gr.d $s0, $vr9, 1 @@ -7913,8 +7781,8 @@ computeSSEWP: # @computeSSEWP vinsgr2vr.h $vr9, $s0, 1 vinsgr2vr.h $vr9, $s1, 2 vinsgr2vr.h $vr9, $s2, 3 - vilvl.h $vr8, $vr5, $vr8 - vilvl.h $vr9, $vr5, $vr9 + vsllwil.wu.hu $vr8, $vr8, 0 + vsllwil.wu.hu $vr9, $vr9, 0 vori.b $vr10, $vr1, 0 vmadd.w $vr10, $vr0, $vr8 vori.b $vr8, $vr1, 0 @@ -7943,22 +7811,16 @@ computeSSEWP: # @computeSSEWP vinsgr2vr.h $vr11, $s0, 1 vinsgr2vr.h $vr11, $s1, 2 vinsgr2vr.h $vr11, $s2, 3 - vilvl.h $vr10, $vr5, $vr10 - vilvl.h $vr11, $vr5, $vr11 + vsllwil.wu.hu $vr10, $vr10, 0 + vsllwil.wu.hu $vr11, $vr11, 0 vsub.w $vr9, $vr10, 
$vr9 vsub.w $vr8, $vr11, $vr8 - vshuf4i.w $vr10, $vr9, 50 - vslli.d $vr10, $vr10, 32 - vsrai.d $vr10, $vr10, 32 - vshuf4i.w $vr9, $vr9, 16 - vslli.d $vr9, $vr9, 32 - vsrai.d $vr9, $vr9, 32 - vshuf4i.w $vr11, $vr8, 50 - vslli.d $vr11, $vr11, 32 - vsrai.d $vr11, $vr11, 32 - vshuf4i.w $vr8, $vr8, 16 - vslli.d $vr8, $vr8, 32 - vsrai.d $vr8, $vr8, 32 + vshuf4i.w $vr10, $vr9, 14 + vsllwil.d.w $vr10, $vr10, 0 + vsllwil.d.w $vr9, $vr9, 0 + vshuf4i.w $vr11, $vr8, 14 + vsllwil.d.w $vr11, $vr11, 0 + vsllwil.d.w $vr8, $vr8, 0 vpickve2gr.d $t4, $vr9, 0 slli.d $t4, $t4, 2 vpickve2gr.d $s0, $vr9, 1 @@ -8009,8 +7871,8 @@ computeSSEWP: # @computeSSEWP vinsgr2vr.h $vr9, $s0, 1 vinsgr2vr.h $vr9, $s1, 2 vinsgr2vr.h $vr9, $s2, 3 - vilvl.h $vr8, $vr5, $vr8 - vilvl.h $vr9, $vr5, $vr9 + vsllwil.wu.hu $vr8, $vr8, 0 + vsllwil.wu.hu $vr9, $vr9, 0 vori.b $vr10, $vr1, 0 vmadd.w $vr10, $vr0, $vr8 vori.b $vr8, $vr1, 0 @@ -8039,22 +7901,16 @@ computeSSEWP: # @computeSSEWP vinsgr2vr.h $vr11, $s0, 1 vinsgr2vr.h $vr11, $s1, 2 vinsgr2vr.h $vr11, $s2, 3 - vilvl.h $vr10, $vr5, $vr10 - vilvl.h $vr11, $vr5, $vr11 + vsllwil.wu.hu $vr10, $vr10, 0 + vsllwil.wu.hu $vr11, $vr11, 0 vsub.w $vr9, $vr10, $vr9 vsub.w $vr8, $vr11, $vr8 - vshuf4i.w $vr10, $vr9, 50 - vslli.d $vr10, $vr10, 32 - vsrai.d $vr10, $vr10, 32 - vshuf4i.w $vr9, $vr9, 16 - vslli.d $vr9, $vr9, 32 - vsrai.d $vr9, $vr9, 32 - vshuf4i.w $vr11, $vr8, 50 - vslli.d $vr11, $vr11, 32 - vsrai.d $vr11, $vr11, 32 - vshuf4i.w $vr8, $vr8, 16 - vslli.d $vr8, $vr8, 32 - vsrai.d $vr8, $vr8, 32 + vshuf4i.w $vr10, $vr9, 14 + vsllwil.d.w $vr10, $vr10, 0 + vsllwil.d.w $vr9, $vr9, 0 + vshuf4i.w $vr11, $vr8, 14 + vsllwil.d.w $vr11, $vr11, 0 + vsllwil.d.w $vr8, $vr8, 0 vpickve2gr.d $t4, $vr9, 0 slli.d $t4, $t4, 2 vpickve2gr.d $s0, $vr9, 1 @@ -8325,8 +8181,8 @@ computeSSEWP: # @computeSSEWP vinsgr2vr.h $vr9, $t7, 1 vinsgr2vr.h $vr9, $t8, 2 vinsgr2vr.h $vr9, $s0, 3 - vilvl.h $vr8, $vr5, $vr8 - vilvl.h $vr9, $vr5, $vr9 + vsllwil.wu.hu $vr8, $vr8, 0 + vsllwil.wu.hu $vr9, $vr9, 0 vori.b $vr10, $vr1, 0 vmadd.w $vr10, $vr0, $vr8 vori.b $vr8, $vr1, 0 @@ -8355,22 +8211,16 @@ computeSSEWP: # @computeSSEWP vinsgr2vr.h $vr11, $t7, 1 vinsgr2vr.h $vr11, $t8, 2 vinsgr2vr.h $vr11, $s0, 3 - vilvl.h $vr10, $vr5, $vr10 - vilvl.h $vr11, $vr5, $vr11 + vsllwil.wu.hu $vr10, $vr10, 0 + vsllwil.wu.hu $vr11, $vr11, 0 vsub.w $vr9, $vr10, $vr9 vsub.w $vr8, $vr11, $vr8 - vshuf4i.w $vr10, $vr9, 50 - vslli.d $vr10, $vr10, 32 - vsrai.d $vr10, $vr10, 32 - vshuf4i.w $vr9, $vr9, 16 - vslli.d $vr9, $vr9, 32 - vsrai.d $vr9, $vr9, 32 - vshuf4i.w $vr11, $vr8, 50 - vslli.d $vr11, $vr11, 32 - vsrai.d $vr11, $vr11, 32 - vshuf4i.w $vr8, $vr8, 16 - vslli.d $vr8, $vr8, 32 - vsrai.d $vr8, $vr8, 32 + vshuf4i.w $vr10, $vr9, 14 + vsllwil.d.w $vr10, $vr10, 0 + vsllwil.d.w $vr9, $vr9, 0 + vshuf4i.w $vr11, $vr8, 14 + vsllwil.d.w $vr11, $vr11, 0 + vsllwil.d.w $vr8, $vr8, 0 vpickve2gr.d $t6, $vr9, 0 slli.d $t6, $t6, 2 vpickve2gr.d $t7, $vr9, 1 @@ -8421,8 +8271,8 @@ computeSSEWP: # @computeSSEWP vinsgr2vr.h $vr9, $t7, 1 vinsgr2vr.h $vr9, $t8, 2 vinsgr2vr.h $vr9, $s0, 3 - vilvl.h $vr8, $vr5, $vr8 - vilvl.h $vr9, $vr5, $vr9 + vsllwil.wu.hu $vr8, $vr8, 0 + vsllwil.wu.hu $vr9, $vr9, 0 vori.b $vr10, $vr1, 0 vmadd.w $vr10, $vr0, $vr8 vori.b $vr8, $vr1, 0 @@ -8451,22 +8301,16 @@ computeSSEWP: # @computeSSEWP vinsgr2vr.h $vr11, $t7, 1 vinsgr2vr.h $vr11, $t8, 2 vinsgr2vr.h $vr11, $s0, 3 - vilvl.h $vr10, $vr5, $vr10 - vilvl.h $vr11, $vr5, $vr11 + vsllwil.wu.hu $vr10, $vr10, 0 + vsllwil.wu.hu $vr11, $vr11, 0 vsub.w $vr9, $vr10, $vr9 vsub.w $vr8, $vr11, $vr8 
- vshuf4i.w $vr10, $vr9, 50 - vslli.d $vr10, $vr10, 32 - vsrai.d $vr10, $vr10, 32 - vshuf4i.w $vr9, $vr9, 16 - vslli.d $vr9, $vr9, 32 - vsrai.d $vr9, $vr9, 32 - vshuf4i.w $vr11, $vr8, 50 - vslli.d $vr11, $vr11, 32 - vsrai.d $vr11, $vr11, 32 - vshuf4i.w $vr8, $vr8, 16 - vslli.d $vr8, $vr8, 32 - vsrai.d $vr8, $vr8, 32 + vshuf4i.w $vr10, $vr9, 14 + vsllwil.d.w $vr10, $vr10, 0 + vsllwil.d.w $vr9, $vr9, 0 + vshuf4i.w $vr11, $vr8, 14 + vsllwil.d.w $vr11, $vr11, 0 + vsllwil.d.w $vr8, $vr8, 0 vpickve2gr.d $t6, $vr9, 0 slli.d $t6, $t6, 2 vpickve2gr.d $t7, $vr9, 1 @@ -8702,8 +8546,8 @@ computeSSEWP: # @computeSSEWP vinsgr2vr.h $vr9, $t7, 1 vinsgr2vr.h $vr9, $t8, 2 vinsgr2vr.h $vr9, $s0, 3 - vilvl.h $vr8, $vr5, $vr8 - vilvl.h $vr9, $vr5, $vr9 + vsllwil.wu.hu $vr8, $vr8, 0 + vsllwil.wu.hu $vr9, $vr9, 0 vori.b $vr10, $vr1, 0 vmadd.w $vr10, $vr0, $vr8 vori.b $vr8, $vr1, 0 @@ -8732,22 +8576,16 @@ computeSSEWP: # @computeSSEWP vinsgr2vr.h $vr11, $t7, 1 vinsgr2vr.h $vr11, $t8, 2 vinsgr2vr.h $vr11, $s0, 3 - vilvl.h $vr10, $vr5, $vr10 - vilvl.h $vr11, $vr5, $vr11 + vsllwil.wu.hu $vr10, $vr10, 0 + vsllwil.wu.hu $vr11, $vr11, 0 vsub.w $vr9, $vr10, $vr9 vsub.w $vr8, $vr11, $vr8 - vshuf4i.w $vr10, $vr9, 50 - vslli.d $vr10, $vr10, 32 - vsrai.d $vr10, $vr10, 32 - vshuf4i.w $vr9, $vr9, 16 - vslli.d $vr9, $vr9, 32 - vsrai.d $vr9, $vr9, 32 - vshuf4i.w $vr11, $vr8, 50 - vslli.d $vr11, $vr11, 32 - vsrai.d $vr11, $vr11, 32 - vshuf4i.w $vr8, $vr8, 16 - vslli.d $vr8, $vr8, 32 - vsrai.d $vr8, $vr8, 32 + vshuf4i.w $vr10, $vr9, 14 + vsllwil.d.w $vr10, $vr10, 0 + vsllwil.d.w $vr9, $vr9, 0 + vshuf4i.w $vr11, $vr8, 14 + vsllwil.d.w $vr11, $vr11, 0 + vsllwil.d.w $vr8, $vr8, 0 vpickve2gr.d $t6, $vr9, 0 slli.d $t6, $t6, 2 vpickve2gr.d $t7, $vr9, 1 @@ -8798,8 +8636,8 @@ computeSSEWP: # @computeSSEWP vinsgr2vr.h $vr9, $t7, 1 vinsgr2vr.h $vr9, $t8, 2 vinsgr2vr.h $vr9, $s0, 3 - vilvl.h $vr8, $vr5, $vr8 - vilvl.h $vr9, $vr5, $vr9 + vsllwil.wu.hu $vr8, $vr8, 0 + vsllwil.wu.hu $vr9, $vr9, 0 vori.b $vr10, $vr1, 0 vmadd.w $vr10, $vr0, $vr8 vori.b $vr8, $vr1, 0 @@ -8828,22 +8666,16 @@ computeSSEWP: # @computeSSEWP vinsgr2vr.h $vr11, $t7, 1 vinsgr2vr.h $vr11, $t8, 2 vinsgr2vr.h $vr11, $s0, 3 - vilvl.h $vr10, $vr5, $vr10 - vilvl.h $vr11, $vr5, $vr11 + vsllwil.wu.hu $vr10, $vr10, 0 + vsllwil.wu.hu $vr11, $vr11, 0 vsub.w $vr9, $vr10, $vr9 vsub.w $vr8, $vr11, $vr8 - vshuf4i.w $vr10, $vr9, 50 - vslli.d $vr10, $vr10, 32 - vsrai.d $vr10, $vr10, 32 - vshuf4i.w $vr9, $vr9, 16 - vslli.d $vr9, $vr9, 32 - vsrai.d $vr9, $vr9, 32 - vshuf4i.w $vr11, $vr8, 50 - vslli.d $vr11, $vr11, 32 - vsrai.d $vr11, $vr11, 32 - vshuf4i.w $vr8, $vr8, 16 - vslli.d $vr8, $vr8, 32 - vsrai.d $vr8, $vr8, 32 + vshuf4i.w $vr10, $vr9, 14 + vsllwil.d.w $vr10, $vr10, 0 + vsllwil.d.w $vr9, $vr9, 0 + vshuf4i.w $vr11, $vr8, 14 + vsllwil.d.w $vr11, $vr11, 0 + vsllwil.d.w $vr8, $vr8, 0 vpickve2gr.d $t6, $vr9, 0 slli.d $t6, $t6, 2 vpickve2gr.d $t7, $vr9, 1 @@ -9050,7 +8882,6 @@ computeBiPredSSE1: # @computeBiPredSSE1 slli.d $a2, $s4, 1 slli.d $a5, $s6, 1 sub.d $a2, $a2, $a5 - vrepli.b $vr0, 0 .p2align 4, , 16 .LBB15_3: # %.preheader73.us # =>This Loop Header: Depth=1 @@ -9064,30 +8895,30 @@ computeBiPredSSE1: # @computeBiPredSSE1 ld.d $a0, $a3, 0 ld.d $a4, $a6, 0 ld.d $t0, $a7, 0 - vinsgr2vr.d $vr1, $a0, 0 - vilvl.h $vr1, $vr0, $vr1 - vinsgr2vr.d $vr2, $a4, 0 - vinsgr2vr.d $vr3, $t0, 0 - vor.v $vr4, $vr2, $vr3 - vxor.v $vr2, $vr2, $vr3 - vsrli.h $vr2, $vr2, 1 - vsub.h $vr2, $vr4, $vr2 - vilvl.h $vr2, $vr0, $vr2 - vsub.w $vr1, $vr1, $vr2 - vpickve2gr.w $a0, $vr1, 0 + vinsgr2vr.d $vr0, 
$a0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vinsgr2vr.d $vr1, $a4, 0 + vinsgr2vr.d $vr2, $t0, 0 + vor.v $vr3, $vr1, $vr2 + vxor.v $vr1, $vr1, $vr2 + vsrli.h $vr1, $vr1, 1 + vsub.h $vr1, $vr3, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 + vsub.w $vr0, $vr0, $vr1 + vpickve2gr.w $a0, $vr0, 0 slli.d $a0, $a0, 2 ldx.w $a0, $s3, $a0 addi.d $a3, $a3, 8 addi.d $a6, $a6, 8 addi.d $a7, $a7, 8 add.d $a0, $a0, $s0 - vpickve2gr.w $a4, $vr1, 1 + vpickve2gr.w $a4, $vr0, 1 slli.d $a4, $a4, 2 ldx.w $a4, $s3, $a4 - vpickve2gr.w $t0, $vr1, 2 + vpickve2gr.w $t0, $vr0, 2 slli.d $t0, $t0, 2 ldx.w $t0, $s3, $t0 - vpickve2gr.w $t1, $vr1, 3 + vpickve2gr.w $t1, $vr0, 3 slli.d $t1, $t1, 2 ldx.w $t1, $s3, $t1 add.d $a0, $a0, $a4 @@ -9569,8 +9400,8 @@ computeBiPredSSE2: # @computeBiPredSSE2 vinsgr2vr.h $vr10, $s4, 1 vinsgr2vr.h $vr10, $s7, 2 vinsgr2vr.h $vr10, $ra, 3 - vilvl.h $vr9, $vr6, $vr9 - vilvl.h $vr10, $vr6, $vr10 + vsllwil.wu.hu $vr9, $vr9, 0 + vsllwil.wu.hu $vr10, $vr10, 0 ld.h $s3, $t7, -32 ld.h $s4, $t7, -24 ld.h $s7, $t7, -16 @@ -9587,8 +9418,8 @@ computeBiPredSSE2: # @computeBiPredSSE2 vinsgr2vr.h $vr12, $s4, 1 vinsgr2vr.h $vr12, $s7, 2 vinsgr2vr.h $vr12, $ra, 3 - vilvl.h $vr11, $vr6, $vr11 - vilvl.h $vr12, $vr6, $vr12 + vsllwil.wu.hu $vr11, $vr11, 0 + vsllwil.wu.hu $vr12, $vr12, 0 vori.b $vr13, $vr2, 0 vmadd.w $vr13, $vr0, $vr9 vori.b $vr9, $vr2, 0 @@ -9619,8 +9450,8 @@ computeBiPredSSE2: # @computeBiPredSSE2 vinsgr2vr.h $vr12, $s4, 1 vinsgr2vr.h $vr12, $s7, 2 vinsgr2vr.h $vr12, $ra, 3 - vilvl.h $vr11, $vr6, $vr11 - vilvl.h $vr12, $vr6, $vr12 + vsllwil.wu.hu $vr11, $vr11, 0 + vsllwil.wu.hu $vr12, $vr12, 0 vsub.w $vr10, $vr11, $vr10 vsub.w $vr9, $vr12, $vr9 vmadd.w $vr7, $vr10, $vr10 @@ -9641,8 +9472,8 @@ computeBiPredSSE2: # @computeBiPredSSE2 vinsgr2vr.h $vr10, $s4, 1 vinsgr2vr.h $vr10, $s7, 2 vinsgr2vr.h $vr10, $ra, 3 - vilvl.h $vr9, $vr6, $vr9 - vilvl.h $vr10, $vr6, $vr10 + vsllwil.wu.hu $vr9, $vr9, 0 + vsllwil.wu.hu $vr10, $vr10, 0 ld.h $s3, $t7, -30 ld.h $s4, $t7, -22 ld.h $s7, $t7, -14 @@ -9659,8 +9490,8 @@ computeBiPredSSE2: # @computeBiPredSSE2 vinsgr2vr.h $vr12, $s4, 1 vinsgr2vr.h $vr12, $s7, 2 vinsgr2vr.h $vr12, $ra, 3 - vilvl.h $vr11, $vr6, $vr11 - vilvl.h $vr12, $vr6, $vr12 + vsllwil.wu.hu $vr11, $vr11, 0 + vsllwil.wu.hu $vr12, $vr12, 0 vori.b $vr13, $vr2, 0 vmadd.w $vr13, $vr0, $vr9 vori.b $vr9, $vr2, 0 @@ -9691,8 +9522,8 @@ computeBiPredSSE2: # @computeBiPredSSE2 vinsgr2vr.h $vr12, $s4, 1 vinsgr2vr.h $vr12, $s7, 2 vinsgr2vr.h $vr12, $ra, 3 - vilvl.h $vr11, $vr6, $vr11 - vilvl.h $vr12, $vr6, $vr12 + vsllwil.wu.hu $vr11, $vr11, 0 + vsllwil.wu.hu $vr12, $vr12, 0 vsub.w $vr10, $vr11, $vr10 vsub.w $vr9, $vr12, $vr9 vmadd.w $vr7, $vr10, $vr10 @@ -9713,8 +9544,8 @@ computeBiPredSSE2: # @computeBiPredSSE2 vinsgr2vr.h $vr10, $s4, 1 vinsgr2vr.h $vr10, $s7, 2 vinsgr2vr.h $vr10, $ra, 3 - vilvl.h $vr9, $vr6, $vr9 - vilvl.h $vr10, $vr6, $vr10 + vsllwil.wu.hu $vr9, $vr9, 0 + vsllwil.wu.hu $vr10, $vr10, 0 ld.h $s3, $t7, -28 ld.h $s4, $t7, -20 ld.h $s7, $t7, -12 @@ -9731,8 +9562,8 @@ computeBiPredSSE2: # @computeBiPredSSE2 vinsgr2vr.h $vr12, $s4, 1 vinsgr2vr.h $vr12, $s7, 2 vinsgr2vr.h $vr12, $ra, 3 - vilvl.h $vr11, $vr6, $vr11 - vilvl.h $vr12, $vr6, $vr12 + vsllwil.wu.hu $vr11, $vr11, 0 + vsllwil.wu.hu $vr12, $vr12, 0 vori.b $vr13, $vr2, 0 vmadd.w $vr13, $vr0, $vr9 vori.b $vr9, $vr2, 0 @@ -9763,8 +9594,8 @@ computeBiPredSSE2: # @computeBiPredSSE2 vinsgr2vr.h $vr12, $s4, 1 vinsgr2vr.h $vr12, $s7, 2 vinsgr2vr.h $vr12, $ra, 3 - vilvl.h $vr11, $vr6, $vr11 - vilvl.h $vr12, $vr6, $vr12 + vsllwil.wu.hu $vr11, $vr11, 0 + 
vsllwil.wu.hu $vr12, $vr12, 0 vsub.w $vr10, $vr11, $vr10 vsub.w $vr9, $vr12, $vr9 vmadd.w $vr7, $vr10, $vr10 @@ -9785,8 +9616,8 @@ computeBiPredSSE2: # @computeBiPredSSE2 vinsgr2vr.h $vr10, $s4, 1 vinsgr2vr.h $vr10, $s7, 2 vinsgr2vr.h $vr10, $ra, 3 - vilvl.h $vr9, $vr6, $vr9 - vilvl.h $vr10, $vr6, $vr10 + vsllwil.wu.hu $vr9, $vr9, 0 + vsllwil.wu.hu $vr10, $vr10, 0 ld.h $s3, $t7, -26 ld.h $s4, $t7, -18 ld.h $s7, $t7, -10 @@ -9803,8 +9634,8 @@ computeBiPredSSE2: # @computeBiPredSSE2 vinsgr2vr.h $vr12, $s4, 1 vinsgr2vr.h $vr12, $s7, 2 vinsgr2vr.h $vr12, $ra, 3 - vilvl.h $vr11, $vr6, $vr11 - vilvl.h $vr12, $vr6, $vr12 + vsllwil.wu.hu $vr11, $vr11, 0 + vsllwil.wu.hu $vr12, $vr12, 0 vori.b $vr13, $vr2, 0 vmadd.w $vr13, $vr0, $vr9 vori.b $vr9, $vr2, 0 @@ -9835,8 +9666,8 @@ computeBiPredSSE2: # @computeBiPredSSE2 vinsgr2vr.h $vr12, $s4, 1 vinsgr2vr.h $vr12, $s7, 2 vinsgr2vr.h $vr12, $ra, 3 - vilvl.h $vr11, $vr6, $vr11 - vilvl.h $vr12, $vr6, $vr12 + vsllwil.wu.hu $vr11, $vr11, 0 + vsllwil.wu.hu $vr12, $vr12, 0 vsub.w $vr10, $vr11, $vr10 vsub.w $vr9, $vr12, $vr9 vmadd.w $vr7, $vr10, $vr10 @@ -10110,8 +9941,8 @@ computeBiPredSSE2: # @computeBiPredSSE2 vinsgr2vr.h $vr10, $t8, 1 vinsgr2vr.h $vr10, $s2, 2 vinsgr2vr.h $vr10, $s7, 3 - vilvl.h $vr9, $vr6, $vr9 - vilvl.h $vr10, $vr6, $vr10 + vsllwil.wu.hu $vr9, $vr9, 0 + vsllwil.wu.hu $vr10, $vr10, 0 ld.h $t7, $t4, -16 ld.h $t8, $t4, -12 ld.h $s2, $t4, -8 @@ -10128,8 +9959,8 @@ computeBiPredSSE2: # @computeBiPredSSE2 vinsgr2vr.h $vr12, $t8, 1 vinsgr2vr.h $vr12, $s2, 2 vinsgr2vr.h $vr12, $s7, 3 - vilvl.h $vr11, $vr6, $vr11 - vilvl.h $vr12, $vr6, $vr12 + vsllwil.wu.hu $vr11, $vr11, 0 + vsllwil.wu.hu $vr12, $vr12, 0 vori.b $vr13, $vr2, 0 vmadd.w $vr13, $vr0, $vr9 vori.b $vr9, $vr2, 0 @@ -10160,8 +9991,8 @@ computeBiPredSSE2: # @computeBiPredSSE2 vinsgr2vr.h $vr12, $t8, 1 vinsgr2vr.h $vr12, $s2, 2 vinsgr2vr.h $vr12, $s7, 3 - vilvl.h $vr11, $vr6, $vr11 - vilvl.h $vr12, $vr6, $vr12 + vsllwil.wu.hu $vr11, $vr11, 0 + vsllwil.wu.hu $vr12, $vr12, 0 vsub.w $vr10, $vr11, $vr10 vsub.w $vr9, $vr12, $vr9 vmadd.w $vr7, $vr10, $vr10 @@ -10182,8 +10013,8 @@ computeBiPredSSE2: # @computeBiPredSSE2 vinsgr2vr.h $vr10, $t8, 1 vinsgr2vr.h $vr10, $s2, 2 vinsgr2vr.h $vr10, $s7, 3 - vilvl.h $vr9, $vr6, $vr9 - vilvl.h $vr10, $vr6, $vr10 + vsllwil.wu.hu $vr9, $vr9, 0 + vsllwil.wu.hu $vr10, $vr10, 0 ld.h $t7, $t4, -14 ld.h $t8, $t4, -10 ld.h $s2, $t4, -6 @@ -10200,8 +10031,8 @@ computeBiPredSSE2: # @computeBiPredSSE2 vinsgr2vr.h $vr12, $t8, 1 vinsgr2vr.h $vr12, $s2, 2 vinsgr2vr.h $vr12, $s7, 3 - vilvl.h $vr11, $vr6, $vr11 - vilvl.h $vr12, $vr6, $vr12 + vsllwil.wu.hu $vr11, $vr11, 0 + vsllwil.wu.hu $vr12, $vr12, 0 vori.b $vr13, $vr2, 0 vmadd.w $vr13, $vr0, $vr9 vori.b $vr9, $vr2, 0 @@ -10232,8 +10063,8 @@ computeBiPredSSE2: # @computeBiPredSSE2 vinsgr2vr.h $vr12, $t8, 1 vinsgr2vr.h $vr12, $s2, 2 vinsgr2vr.h $vr12, $s7, 3 - vilvl.h $vr11, $vr6, $vr11 - vilvl.h $vr12, $vr6, $vr12 + vsllwil.wu.hu $vr11, $vr11, 0 + vsllwil.wu.hu $vr12, $vr12, 0 vsub.w $vr10, $vr11, $vr10 vsub.w $vr9, $vr12, $vr9 vmadd.w $vr7, $vr10, $vr10 @@ -10487,8 +10318,8 @@ computeBiPredSSE2: # @computeBiPredSSE2 vinsgr2vr.h $vr10, $t8, 1 vinsgr2vr.h $vr10, $s2, 2 vinsgr2vr.h $vr10, $s3, 3 - vilvl.h $vr9, $vr6, $vr9 - vilvl.h $vr10, $vr6, $vr10 + vsllwil.wu.hu $vr9, $vr9, 0 + vsllwil.wu.hu $vr10, $vr10, 0 ld.h $t7, $t4, -16 ld.h $t8, $t4, -12 ld.h $s2, $t4, -8 @@ -10505,8 +10336,8 @@ computeBiPredSSE2: # @computeBiPredSSE2 vinsgr2vr.h $vr12, $t8, 1 vinsgr2vr.h $vr12, $s2, 2 vinsgr2vr.h $vr12, $s3, 3 - vilvl.h $vr11, 
$vr6, $vr11 - vilvl.h $vr12, $vr6, $vr12 + vsllwil.wu.hu $vr11, $vr11, 0 + vsllwil.wu.hu $vr12, $vr12, 0 vori.b $vr13, $vr2, 0 vmadd.w $vr13, $vr0, $vr9 vori.b $vr9, $vr2, 0 @@ -10537,8 +10368,8 @@ computeBiPredSSE2: # @computeBiPredSSE2 vinsgr2vr.h $vr12, $t8, 1 vinsgr2vr.h $vr12, $s2, 2 vinsgr2vr.h $vr12, $s3, 3 - vilvl.h $vr11, $vr6, $vr11 - vilvl.h $vr12, $vr6, $vr12 + vsllwil.wu.hu $vr11, $vr11, 0 + vsllwil.wu.hu $vr12, $vr12, 0 vsub.w $vr10, $vr11, $vr10 vsub.w $vr9, $vr12, $vr9 vmadd.w $vr7, $vr10, $vr10 @@ -10559,8 +10390,8 @@ computeBiPredSSE2: # @computeBiPredSSE2 vinsgr2vr.h $vr10, $t8, 1 vinsgr2vr.h $vr10, $s2, 2 vinsgr2vr.h $vr10, $s3, 3 - vilvl.h $vr9, $vr6, $vr9 - vilvl.h $vr10, $vr6, $vr10 + vsllwil.wu.hu $vr9, $vr9, 0 + vsllwil.wu.hu $vr10, $vr10, 0 ld.h $t7, $t4, -14 ld.h $t8, $t4, -10 ld.h $s2, $t4, -6 @@ -10577,8 +10408,8 @@ computeBiPredSSE2: # @computeBiPredSSE2 vinsgr2vr.h $vr12, $t8, 1 vinsgr2vr.h $vr12, $s2, 2 vinsgr2vr.h $vr12, $s3, 3 - vilvl.h $vr11, $vr6, $vr11 - vilvl.h $vr12, $vr6, $vr12 + vsllwil.wu.hu $vr11, $vr11, 0 + vsllwil.wu.hu $vr12, $vr12, 0 vori.b $vr13, $vr2, 0 vmadd.w $vr13, $vr0, $vr9 vori.b $vr9, $vr2, 0 @@ -10609,8 +10440,8 @@ computeBiPredSSE2: # @computeBiPredSSE2 vinsgr2vr.h $vr12, $t8, 1 vinsgr2vr.h $vr12, $s2, 2 vinsgr2vr.h $vr12, $s3, 3 - vilvl.h $vr11, $vr6, $vr11 - vilvl.h $vr12, $vr6, $vr12 + vsllwil.wu.hu $vr11, $vr11, 0 + vsllwil.wu.hu $vr12, $vr12, 0 vsub.w $vr10, $vr11, $vr10 vsub.w $vr9, $vr12, $vr9 vmadd.w $vr7, $vr10, $vr10 diff --git a/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/me_fullfast.s b/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/me_fullfast.s index 5ed5ed1d..35620f95 100644 --- a/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/me_fullfast.s +++ b/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/me_fullfast.s @@ -2994,13 +2994,13 @@ SetupFastFullPelSearch: # @SetupFastFullPelSearch move $s1, $zero pcalau12i $a0, %pc_hi20(img_padded_size_x) st.d $a0, $sp, 272 # 8-byte Folded Spill - vrepli.b $vr12, 0 + vrepli.b $vr11, 0 st.d $fp, $sp, 104 # 8-byte Folded Spill st.d $s0, $sp, 96 # 8-byte Folded Spill st.d $s2, $sp, 88 # 8-byte Folded Spill st.d $t1, $sp, 152 # 8-byte Folded Spill st.d $t2, $sp, 144 # 8-byte Folded Spill - vst $vr12, $sp, 192 # 16-byte Folded Spill + vst $vr11, $sp, 192 # 16-byte Folded Spill b .LBB4_28 .p2align 4, , 16 .LBB4_27: # %.loopexit @@ -3112,7 +3112,6 @@ SetupFastFullPelSearch: # @SetupFastFullPelSearch ld.d $a4, $sp, 72 # 8-byte Folded Reload add.d $a3, $a3, $a4 ld.d $s1, $sp, 80 # 8-byte Folded Reload - vld $vr12, $sp, 192 # 16-byte Folded Reload .p2align 4, , 16 .LBB4_38: # %.preheader548 # Parent Loop BB4_28 Depth=1 @@ -3139,8 +3138,9 @@ SetupFastFullPelSearch: # @SetupFastFullPelSearch # Parent Loop BB4_38 Depth=2 # => This Inner Loop Header: Depth=3 vld $vr5, $t1, -16 - vilvh.h $vr6, $vr12, $vr5 - vilvl.h $vr5, $vr12, $vr5 + vbsrl.v $vr6, $vr5, 8 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.wu.hu $vr5, $vr5, 0 vori.b $vr7, $vr1, 0 vmadd.w $vr7, $vr0, $vr5 vori.b $vr5, $vr1, 0 @@ -3149,17 +3149,18 @@ SetupFastFullPelSearch: # @SetupFastFullPelSearch vsra.w $vr6, $vr7, $vr2 vadd.w $vr6, $vr6, $vr3 vadd.w $vr5, $vr5, $vr3 - vld $vr7, $t4, 0 vmaxi.w $vr5, $vr5, 0 + vld $vr7, $t4, 0 vmaxi.w $vr6, $vr6, 0 vmin.w $vr6, $vr6, $vr4 - vilvl.h $vr8, $vr12, $vr7 + vmin.w $vr5, $vr5, $vr4 + vsllwil.wu.hu $vr8, $vr7, 0 vsub.w $vr6, $vr6, $vr8 vpickve2gr.w $t5, $vr6, 0 slli.d $t5, $t5, 2 ldx.w $t5, $s5, $t5 - vmin.w $vr5, $vr5, $vr4 - 
vilvh.h $vr7, $vr12, $vr7 + vbsrl.v $vr7, $vr7, 8 + vsllwil.wu.hu $vr7, $vr7, 0 vsub.w $vr5, $vr5, $vr7 add.d $t0, $t5, $t0 vpickve2gr.w $t5, $vr6, 1 @@ -3191,8 +3192,9 @@ SetupFastFullPelSearch: # @SetupFastFullPelSearch add.d $a7, $a7, $t5 add.d $a7, $a7, $t6 add.d $a7, $a7, $t7 - vilvh.h $vr6, $vr12, $vr5 - vilvl.h $vr5, $vr12, $vr5 + vbsrl.v $vr6, $vr5, 8 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.wu.hu $vr5, $vr5, 0 vori.b $vr7, $vr1, 0 vmadd.w $vr7, $vr0, $vr5 vori.b $vr5, $vr1, 0 @@ -3206,8 +3208,9 @@ SetupFastFullPelSearch: # @SetupFastFullPelSearch vmaxi.w $vr6, $vr6, 0 vmin.w $vr6, $vr6, $vr4 vmin.w $vr5, $vr5, $vr4 - vilvl.h $vr8, $vr12, $vr7 - vilvh.h $vr7, $vr12, $vr7 + vsllwil.wu.hu $vr8, $vr7, 0 + vbsrl.v $vr7, $vr7, 8 + vsllwil.wu.hu $vr7, $vr7, 0 vsub.w $vr5, $vr5, $vr7 vsub.w $vr6, $vr6, $vr8 vpickve2gr.w $t5, $vr6, 0 @@ -3307,7 +3310,7 @@ SetupFastFullPelSearch: # @SetupFastFullPelSearch ld.d $a1, $sp, 232 # 8-byte Folded Reload ld.d $a2, $sp, 224 # 8-byte Folded Reload jirl $ra, $a3, 0 - vld $vr12, $sp, 192 # 16-byte Folded Reload + vld $vr11, $sp, 192 # 16-byte Folded Reload ld.d $a1, $sp, 216 # 8-byte Folded Reload ld.d $a1, $a1, 0 move $a3, $a0 @@ -3452,7 +3455,6 @@ SetupFastFullPelSearch: # @SetupFastFullPelSearch addi.d $a3, $a3, 8 addi.d $a4, $s1, 8 move $s0, $t5 - vori.b $vr7, $vr2, 0 .p2align 4, , 16 .LBB4_54: # %vector.body1056 # Parent Loop BB4_28 Depth=1 @@ -3462,83 +3464,77 @@ SetupFastFullPelSearch: # @SetupFastFullPelSearch # => This Inner Loop Header: Depth=5 ld.d $s1, $a3, -8 ld.d $s6, $a3, 0 - vinsgr2vr.d $vr8, $s1, 0 - vinsgr2vr.d $vr9, $s6, 0 - vilvl.h $vr8, $vr2, $vr8 - vilvl.h $vr9, $vr2, $vr9 - vori.b $vr10, $vr0, 0 - vmadd.w $vr10, $vr5, $vr8 - vori.b $vr8, $vr0, 0 - vmadd.w $vr8, $vr5, $vr9 - vsra.w $vr9, $vr10, $vr1 - vsra.w $vr8, $vr8, $vr1 - vadd.w $vr9, $vr9, $vr6 + vinsgr2vr.d $vr7, $s1, 0 + vinsgr2vr.d $vr8, $s6, 0 + vsllwil.wu.hu $vr7, $vr7, 0 + vsllwil.wu.hu $vr8, $vr8, 0 + vori.b $vr9, $vr0, 0 + vmadd.w $vr9, $vr5, $vr7 + vori.b $vr7, $vr0, 0 + vmadd.w $vr7, $vr5, $vr8 + vsra.w $vr8, $vr9, $vr1 + vsra.w $vr7, $vr7, $vr1 vadd.w $vr8, $vr8, $vr6 - vmaxi.w $vr9, $vr9, 0 + vadd.w $vr7, $vr7, $vr6 vmaxi.w $vr8, $vr8, 0 + vmaxi.w $vr7, $vr7, 0 ld.d $s1, $a4, -8 ld.d $s6, $a4, 0 - vmin.w $vr9, $vr9, $vr4 vmin.w $vr8, $vr8, $vr4 - vinsgr2vr.d $vr10, $s1, 0 - vinsgr2vr.d $vr11, $s6, 0 - vilvl.h $vr10, $vr2, $vr10 - vilvl.h $vr11, $vr2, $vr11 - vsub.w $vr9, $vr9, $vr10 - vsub.w $vr8, $vr8, $vr11 - vshuf4i.w $vr10, $vr9, 50 - vslli.d $vr10, $vr10, 32 - vsrai.d $vr10, $vr10, 32 - vshuf4i.w $vr9, $vr9, 16 - vslli.d $vr9, $vr9, 32 - vsrai.d $vr9, $vr9, 32 - vshuf4i.w $vr11, $vr8, 50 - vslli.d $vr11, $vr11, 32 - vsrai.d $vr11, $vr11, 32 - vshuf4i.w $vr8, $vr8, 16 - vslli.d $vr8, $vr8, 32 - vsrai.d $vr8, $vr8, 32 - vpickve2gr.d $s1, $vr9, 0 + vmin.w $vr7, $vr7, $vr4 + vinsgr2vr.d $vr9, $s1, 0 + vinsgr2vr.d $vr10, $s6, 0 + vsllwil.wu.hu $vr9, $vr9, 0 + vsllwil.wu.hu $vr10, $vr10, 0 + vsub.w $vr8, $vr8, $vr9 + vsub.w $vr7, $vr7, $vr10 + vshuf4i.w $vr9, $vr8, 14 + vsllwil.d.w $vr9, $vr9, 0 + vsllwil.d.w $vr8, $vr8, 0 + vshuf4i.w $vr10, $vr7, 14 + vsllwil.d.w $vr10, $vr10, 0 + vsllwil.d.w $vr7, $vr7, 0 + vpickve2gr.d $s1, $vr8, 0 slli.d $s1, $s1, 2 - vpickve2gr.d $s6, $vr9, 1 + vpickve2gr.d $s6, $vr8, 1 slli.d $s6, $s6, 2 - vpickve2gr.d $t3, $vr10, 0 + vpickve2gr.d $t3, $vr9, 0 slli.d $t3, $t3, 2 - vpickve2gr.d $a6, $vr10, 1 + vpickve2gr.d $a6, $vr9, 1 slli.d $a6, $a6, 2 - vpickve2gr.d $t7, $vr8, 0 + vpickve2gr.d $t7, $vr7, 0 slli.d $t7, $t7, 2 - vpickve2gr.d 
$t4, $vr8, 1 + vpickve2gr.d $t4, $vr7, 1 slli.d $t4, $t4, 2 - vpickve2gr.d $a2, $vr11, 0 + vpickve2gr.d $a2, $vr10, 0 slli.d $a2, $a2, 2 - vpickve2gr.d $a5, $vr11, 1 + vpickve2gr.d $a5, $vr10, 1 slli.d $a5, $a5, 2 ldx.w $s1, $s5, $s1 ldx.w $s6, $s5, $s6 ldx.w $t3, $s5, $t3 ldx.w $a6, $s5, $a6 - vinsgr2vr.w $vr8, $s1, 0 - vinsgr2vr.w $vr8, $s6, 1 - vinsgr2vr.w $vr8, $t3, 2 - vinsgr2vr.w $vr8, $a6, 3 + vinsgr2vr.w $vr7, $s1, 0 + vinsgr2vr.w $vr7, $s6, 1 + vinsgr2vr.w $vr7, $t3, 2 + vinsgr2vr.w $vr7, $a6, 3 ldx.w $a6, $s5, $t7 ldx.w $t3, $s5, $t4 ldx.w $a2, $s5, $a2 ldx.w $a5, $s5, $a5 - vinsgr2vr.w $vr9, $a6, 0 - vinsgr2vr.w $vr9, $t3, 1 - vinsgr2vr.w $vr9, $a2, 2 - vinsgr2vr.w $vr9, $a5, 3 - vadd.w $vr3, $vr8, $vr3 - vadd.w $vr7, $vr9, $vr7 + vinsgr2vr.w $vr8, $a6, 0 + vinsgr2vr.w $vr8, $t3, 1 + vinsgr2vr.w $vr8, $a2, 2 + vinsgr2vr.w $vr8, $a5, 3 + vadd.w $vr3, $vr7, $vr3 + vadd.w $vr2, $vr8, $vr2 addi.d $a3, $a3, 16 addi.d $s0, $s0, -8 addi.d $a4, $a4, 16 bnez $s0, .LBB4_54 # %bb.55: # %middle.block1069 # in Loop: Header=BB4_50 Depth=4 - vadd.w $vr2, $vr7, $vr3 + vadd.w $vr2, $vr2, $vr3 vhaddw.d.w $vr2, $vr2, $vr2 vhaddw.q.d $vr2, $vr2, $vr2 vpickve2gr.d $s0, $vr2, 0 @@ -3606,7 +3602,6 @@ SetupFastFullPelSearch: # @SetupFastFullPelSearch addi.d $a0, $a0, 8 addi.d $a3, $s2, 8 move $s2, $t5 - vori.b $vr7, $vr2, 0 .p2align 4, , 16 .LBB4_61: # %vector.body1020 # Parent Loop BB4_28 Depth=1 @@ -3616,83 +3611,77 @@ SetupFastFullPelSearch: # @SetupFastFullPelSearch # => This Inner Loop Header: Depth=5 ld.d $a2, $a0, -8 ld.d $a5, $a0, 0 - vinsgr2vr.d $vr8, $a2, 0 - vinsgr2vr.d $vr9, $a5, 0 - vilvl.h $vr8, $vr2, $vr8 - vilvl.h $vr9, $vr2, $vr9 - vori.b $vr10, $vr0, 0 - vmadd.w $vr10, $vr5, $vr8 - vori.b $vr8, $vr0, 0 - vmadd.w $vr8, $vr5, $vr9 - vsra.w $vr9, $vr10, $vr1 - vsra.w $vr8, $vr8, $vr1 - vadd.w $vr9, $vr9, $vr6 + vinsgr2vr.d $vr7, $a2, 0 + vinsgr2vr.d $vr8, $a5, 0 + vsllwil.wu.hu $vr7, $vr7, 0 + vsllwil.wu.hu $vr8, $vr8, 0 + vori.b $vr9, $vr0, 0 + vmadd.w $vr9, $vr5, $vr7 + vori.b $vr7, $vr0, 0 + vmadd.w $vr7, $vr5, $vr8 + vsra.w $vr8, $vr9, $vr1 + vsra.w $vr7, $vr7, $vr1 vadd.w $vr8, $vr8, $vr6 - vmaxi.w $vr9, $vr9, 0 + vadd.w $vr7, $vr7, $vr6 vmaxi.w $vr8, $vr8, 0 + vmaxi.w $vr7, $vr7, 0 ld.d $a2, $a3, -8 ld.d $a5, $a3, 0 - vmin.w $vr9, $vr9, $vr4 vmin.w $vr8, $vr8, $vr4 - vinsgr2vr.d $vr10, $a2, 0 - vinsgr2vr.d $vr11, $a5, 0 - vilvl.h $vr10, $vr2, $vr10 - vilvl.h $vr11, $vr2, $vr11 - vsub.w $vr9, $vr9, $vr10 - vsub.w $vr8, $vr8, $vr11 - vshuf4i.w $vr10, $vr9, 50 - vslli.d $vr10, $vr10, 32 - vsrai.d $vr10, $vr10, 32 - vshuf4i.w $vr9, $vr9, 16 - vslli.d $vr9, $vr9, 32 - vsrai.d $vr9, $vr9, 32 - vshuf4i.w $vr11, $vr8, 50 - vslli.d $vr11, $vr11, 32 - vsrai.d $vr11, $vr11, 32 - vshuf4i.w $vr8, $vr8, 16 - vslli.d $vr8, $vr8, 32 - vsrai.d $vr8, $vr8, 32 - vpickve2gr.d $a2, $vr9, 0 + vmin.w $vr7, $vr7, $vr4 + vinsgr2vr.d $vr9, $a2, 0 + vinsgr2vr.d $vr10, $a5, 0 + vsllwil.wu.hu $vr9, $vr9, 0 + vsllwil.wu.hu $vr10, $vr10, 0 + vsub.w $vr8, $vr8, $vr9 + vsub.w $vr7, $vr7, $vr10 + vshuf4i.w $vr9, $vr8, 14 + vsllwil.d.w $vr9, $vr9, 0 + vsllwil.d.w $vr8, $vr8, 0 + vshuf4i.w $vr10, $vr7, 14 + vsllwil.d.w $vr10, $vr10, 0 + vsllwil.d.w $vr7, $vr7, 0 + vpickve2gr.d $a2, $vr8, 0 slli.d $a2, $a2, 2 - vpickve2gr.d $a5, $vr9, 1 + vpickve2gr.d $a5, $vr8, 1 slli.d $a5, $a5, 2 - vpickve2gr.d $a6, $vr10, 0 + vpickve2gr.d $a6, $vr9, 0 slli.d $a6, $a6, 2 - vpickve2gr.d $t3, $vr10, 1 + vpickve2gr.d $t3, $vr9, 1 slli.d $t3, $t3, 2 - vpickve2gr.d $t4, $vr8, 0 + vpickve2gr.d $t4, $vr7, 0 slli.d $t4, $t4, 2 - 
vpickve2gr.d $t7, $vr8, 1 + vpickve2gr.d $t7, $vr7, 1 slli.d $t7, $t7, 2 - vpickve2gr.d $s3, $vr11, 0 + vpickve2gr.d $s3, $vr10, 0 slli.d $s3, $s3, 2 - vpickve2gr.d $s6, $vr11, 1 + vpickve2gr.d $s6, $vr10, 1 slli.d $s6, $s6, 2 ldx.w $a2, $s5, $a2 ldx.w $a5, $s5, $a5 ldx.w $a6, $s5, $a6 ldx.w $t3, $s5, $t3 - vinsgr2vr.w $vr8, $a2, 0 - vinsgr2vr.w $vr8, $a5, 1 - vinsgr2vr.w $vr8, $a6, 2 - vinsgr2vr.w $vr8, $t3, 3 + vinsgr2vr.w $vr7, $a2, 0 + vinsgr2vr.w $vr7, $a5, 1 + vinsgr2vr.w $vr7, $a6, 2 + vinsgr2vr.w $vr7, $t3, 3 ldx.w $a2, $s5, $t4 ldx.w $a5, $s5, $t7 ldx.w $a6, $s5, $s3 ldx.w $t3, $s5, $s6 - vinsgr2vr.w $vr9, $a2, 0 - vinsgr2vr.w $vr9, $a5, 1 - vinsgr2vr.w $vr9, $a6, 2 - vinsgr2vr.w $vr9, $t3, 3 - vadd.w $vr3, $vr8, $vr3 - vadd.w $vr7, $vr9, $vr7 + vinsgr2vr.w $vr8, $a2, 0 + vinsgr2vr.w $vr8, $a5, 1 + vinsgr2vr.w $vr8, $a6, 2 + vinsgr2vr.w $vr8, $t3, 3 + vadd.w $vr3, $vr7, $vr3 + vadd.w $vr2, $vr8, $vr2 addi.d $a0, $a0, 16 addi.d $s2, $s2, -8 addi.d $a3, $a3, 16 bnez $s2, .LBB4_61 # %bb.62: # %middle.block1033 # in Loop: Header=BB4_50 Depth=4 - vadd.w $vr2, $vr7, $vr3 + vadd.w $vr2, $vr2, $vr3 vhaddw.d.w $vr2, $vr2, $vr2 vhaddw.q.d $vr2, $vr2, $vr2 vpickve2gr.d $s3, $vr2, 0 @@ -3760,7 +3749,6 @@ SetupFastFullPelSearch: # @SetupFastFullPelSearch addi.d $a3, $s8, 8 addi.d $s1, $s1, 8 move $s4, $t5 - vori.b $vr7, $vr2, 0 .p2align 4, , 16 .LBB4_68: # %vector.body984 # Parent Loop BB4_28 Depth=1 @@ -3770,83 +3758,77 @@ SetupFastFullPelSearch: # @SetupFastFullPelSearch # => This Inner Loop Header: Depth=5 ld.d $a2, $a3, -8 ld.d $a5, $a3, 0 - vinsgr2vr.d $vr8, $a2, 0 - vinsgr2vr.d $vr9, $a5, 0 - vilvl.h $vr8, $vr2, $vr8 - vilvl.h $vr9, $vr2, $vr9 - vori.b $vr10, $vr0, 0 - vmadd.w $vr10, $vr5, $vr8 - vori.b $vr8, $vr0, 0 - vmadd.w $vr8, $vr5, $vr9 - vsra.w $vr9, $vr10, $vr1 - vsra.w $vr8, $vr8, $vr1 - vadd.w $vr9, $vr9, $vr6 + vinsgr2vr.d $vr7, $a2, 0 + vinsgr2vr.d $vr8, $a5, 0 + vsllwil.wu.hu $vr7, $vr7, 0 + vsllwil.wu.hu $vr8, $vr8, 0 + vori.b $vr9, $vr0, 0 + vmadd.w $vr9, $vr5, $vr7 + vori.b $vr7, $vr0, 0 + vmadd.w $vr7, $vr5, $vr8 + vsra.w $vr8, $vr9, $vr1 + vsra.w $vr7, $vr7, $vr1 vadd.w $vr8, $vr8, $vr6 - vmaxi.w $vr9, $vr9, 0 + vadd.w $vr7, $vr7, $vr6 vmaxi.w $vr8, $vr8, 0 + vmaxi.w $vr7, $vr7, 0 ld.d $a2, $s1, -8 ld.d $a5, $s1, 0 - vmin.w $vr9, $vr9, $vr4 vmin.w $vr8, $vr8, $vr4 - vinsgr2vr.d $vr10, $a2, 0 - vinsgr2vr.d $vr11, $a5, 0 - vilvl.h $vr10, $vr2, $vr10 - vilvl.h $vr11, $vr2, $vr11 - vsub.w $vr9, $vr9, $vr10 - vsub.w $vr8, $vr8, $vr11 - vshuf4i.w $vr10, $vr9, 50 - vslli.d $vr10, $vr10, 32 - vsrai.d $vr10, $vr10, 32 - vshuf4i.w $vr9, $vr9, 16 - vslli.d $vr9, $vr9, 32 - vsrai.d $vr9, $vr9, 32 - vshuf4i.w $vr11, $vr8, 50 - vslli.d $vr11, $vr11, 32 - vsrai.d $vr11, $vr11, 32 - vshuf4i.w $vr8, $vr8, 16 - vslli.d $vr8, $vr8, 32 - vsrai.d $vr8, $vr8, 32 - vpickve2gr.d $a2, $vr9, 0 + vmin.w $vr7, $vr7, $vr4 + vinsgr2vr.d $vr9, $a2, 0 + vinsgr2vr.d $vr10, $a5, 0 + vsllwil.wu.hu $vr9, $vr9, 0 + vsllwil.wu.hu $vr10, $vr10, 0 + vsub.w $vr8, $vr8, $vr9 + vsub.w $vr7, $vr7, $vr10 + vshuf4i.w $vr9, $vr8, 14 + vsllwil.d.w $vr9, $vr9, 0 + vsllwil.d.w $vr8, $vr8, 0 + vshuf4i.w $vr10, $vr7, 14 + vsllwil.d.w $vr10, $vr10, 0 + vsllwil.d.w $vr7, $vr7, 0 + vpickve2gr.d $a2, $vr8, 0 slli.d $a2, $a2, 2 - vpickve2gr.d $a5, $vr9, 1 + vpickve2gr.d $a5, $vr8, 1 slli.d $a5, $a5, 2 - vpickve2gr.d $a6, $vr10, 0 + vpickve2gr.d $a6, $vr9, 0 slli.d $a6, $a6, 2 - vpickve2gr.d $t3, $vr10, 1 + vpickve2gr.d $t3, $vr9, 1 slli.d $t3, $t3, 2 - vpickve2gr.d $t4, $vr8, 0 + vpickve2gr.d $t4, $vr7, 0 slli.d $t4, $t4, 2 
- vpickve2gr.d $t7, $vr8, 1 + vpickve2gr.d $t7, $vr7, 1 slli.d $t7, $t7, 2 - vpickve2gr.d $s8, $vr11, 0 + vpickve2gr.d $s8, $vr10, 0 slli.d $s8, $s8, 2 - vpickve2gr.d $ra, $vr11, 1 + vpickve2gr.d $ra, $vr10, 1 slli.d $ra, $ra, 2 ldx.w $a2, $s5, $a2 ldx.w $a5, $s5, $a5 ldx.w $a6, $s5, $a6 ldx.w $t3, $s5, $t3 - vinsgr2vr.w $vr8, $a2, 0 - vinsgr2vr.w $vr8, $a5, 1 - vinsgr2vr.w $vr8, $a6, 2 - vinsgr2vr.w $vr8, $t3, 3 + vinsgr2vr.w $vr7, $a2, 0 + vinsgr2vr.w $vr7, $a5, 1 + vinsgr2vr.w $vr7, $a6, 2 + vinsgr2vr.w $vr7, $t3, 3 ldx.w $a2, $s5, $t4 ldx.w $a5, $s5, $t7 ldx.w $a6, $s5, $s8 ldx.w $t3, $s5, $ra - vinsgr2vr.w $vr9, $a2, 0 - vinsgr2vr.w $vr9, $a5, 1 - vinsgr2vr.w $vr9, $a6, 2 - vinsgr2vr.w $vr9, $t3, 3 - vadd.w $vr3, $vr8, $vr3 - vadd.w $vr7, $vr9, $vr7 + vinsgr2vr.w $vr8, $a2, 0 + vinsgr2vr.w $vr8, $a5, 1 + vinsgr2vr.w $vr8, $a6, 2 + vinsgr2vr.w $vr8, $t3, 3 + vadd.w $vr3, $vr7, $vr3 + vadd.w $vr2, $vr8, $vr2 addi.d $a3, $a3, 16 addi.d $s4, $s4, -8 addi.d $s1, $s1, 16 bnez $s4, .LBB4_68 # %bb.69: # %middle.block997 # in Loop: Header=BB4_50 Depth=4 - vadd.w $vr2, $vr7, $vr3 + vadd.w $vr2, $vr2, $vr3 vhaddw.d.w $vr2, $vr2, $vr2 vhaddw.q.d $vr2, $vr2, $vr2 vpickve2gr.d $ra, $vr2, 0 @@ -3905,7 +3887,7 @@ SetupFastFullPelSearch: # @SetupFastFullPelSearch # in Loop: Header=BB4_50 Depth=4 add.d $s1, $s6, $t6 add.d $a3, $a4, $t6 - vori.b $vr2, $vr12, 0 + vori.b $vr2, $vr11, 0 vinsgr2vr.w $vr2, $t8, 0 vreplgr2vr.w $vr3, $fp vreplgr2vr.w $vr4, $s4 @@ -3913,7 +3895,7 @@ SetupFastFullPelSearch: # @SetupFastFullPelSearch addi.d $a0, $a4, 8 addi.d $a4, $s6, 8 move $t8, $t5 - vori.b $vr6, $vr12, 0 + vori.b $vr6, $vr11, 0 .p2align 4, , 16 .LBB4_75: # %vector.body # Parent Loop BB4_28 Depth=1 @@ -3925,8 +3907,8 @@ SetupFastFullPelSearch: # @SetupFastFullPelSearch ld.d $a5, $a0, 0 vinsgr2vr.d $vr7, $a2, 0 vinsgr2vr.d $vr8, $a5, 0 - vilvl.h $vr7, $vr12, $vr7 - vilvl.h $vr8, $vr12, $vr8 + vsllwil.wu.hu $vr7, $vr7, 0 + vsllwil.wu.hu $vr8, $vr8, 0 vori.b $vr9, $vr0, 0 vmadd.w $vr9, $vr4, $vr7 vori.b $vr7, $vr0, 0 @@ -3943,22 +3925,16 @@ SetupFastFullPelSearch: # @SetupFastFullPelSearch vmin.w $vr7, $vr7, $vr3 vinsgr2vr.d $vr9, $a2, 0 vinsgr2vr.d $vr10, $a5, 0 - vilvl.h $vr9, $vr12, $vr9 - vilvl.h $vr10, $vr12, $vr10 + vsllwil.wu.hu $vr9, $vr9, 0 + vsllwil.wu.hu $vr10, $vr10, 0 vsub.w $vr8, $vr8, $vr9 vsub.w $vr7, $vr7, $vr10 - vshuf4i.w $vr9, $vr8, 50 - vslli.d $vr9, $vr9, 32 - vsrai.d $vr9, $vr9, 32 - vshuf4i.w $vr8, $vr8, 16 - vslli.d $vr8, $vr8, 32 - vsrai.d $vr8, $vr8, 32 - vshuf4i.w $vr10, $vr7, 50 - vslli.d $vr10, $vr10, 32 - vsrai.d $vr10, $vr10, 32 - vshuf4i.w $vr7, $vr7, 16 - vslli.d $vr7, $vr7, 32 - vsrai.d $vr7, $vr7, 32 + vshuf4i.w $vr9, $vr8, 14 + vsllwil.d.w $vr9, $vr9, 0 + vsllwil.d.w $vr8, $vr8, 0 + vshuf4i.w $vr10, $vr7, 14 + vsllwil.d.w $vr10, $vr10, 0 + vsllwil.d.w $vr7, $vr7, 0 vpickve2gr.d $a2, $vr8, 0 slli.d $a2, $a2, 2 vpickve2gr.d $a5, $vr8, 1 @@ -4309,7 +4285,6 @@ SetupFastFullPelSearch: # @SetupFastFullPelSearch addi.d $a0, $a0, 8 addi.d $t7, $s2, 8 move $t8, $t1 - vori.b $vr2, $vr0, 0 .p2align 4, , 16 .LBB4_93: # %vector.body1264 # Parent Loop BB4_28 Depth=1 @@ -4318,69 +4293,77 @@ SetupFastFullPelSearch: # @SetupFastFullPelSearch # => This Inner Loop Header: Depth=4 ld.d $s0, $a0, -8 ld.d $s2, $a0, 0 - vinsgr2vr.d $vr3, $s0, 0 - vinsgr2vr.d $vr4, $s2, 0 - vilvl.h $vr3, $vr0, $vr3 - vilvl.w $vr5, $vr0, $vr3 - vilvh.w $vr3, $vr0, $vr3 - vilvl.h $vr4, $vr0, $vr4 + vinsgr2vr.d $vr2, $s0, 0 + vinsgr2vr.d $vr3, $s2, 0 + vsllwil.wu.hu $vr4, $vr2, 0 + vsllwil.du.wu $vr4, $vr4, 0 + 
vshuf4i.h $vr2, $vr2, 14 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.du.wu $vr2, $vr2, 0 + vsllwil.wu.hu $vr5, $vr3, 0 + vsllwil.du.wu $vr5, $vr5, 0 + vshuf4i.h $vr3, $vr3, 14 ld.d $s0, $t7, -8 ld.d $s2, $t7, 0 - vilvl.w $vr6, $vr0, $vr4 - vilvh.w $vr4, $vr0, $vr4 - vinsgr2vr.d $vr7, $s0, 0 - vinsgr2vr.d $vr8, $s2, 0 - vilvl.h $vr7, $vr0, $vr7 - vilvl.w $vr9, $vr0, $vr7 - vilvh.w $vr7, $vr0, $vr7 - vilvl.h $vr8, $vr0, $vr8 - vilvl.w $vr10, $vr0, $vr8 - vilvh.w $vr8, $vr0, $vr8 + vsllwil.wu.hu $vr3, $vr3, 0 + vsllwil.du.wu $vr3, $vr3, 0 + vinsgr2vr.d $vr6, $s0, 0 + vinsgr2vr.d $vr7, $s2, 0 + vsllwil.wu.hu $vr8, $vr6, 0 + vsllwil.du.wu $vr8, $vr8, 0 + vshuf4i.h $vr6, $vr6, 14 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.du.wu $vr6, $vr6, 0 + vsllwil.wu.hu $vr9, $vr7, 0 + vsllwil.du.wu $vr9, $vr9, 0 + vshuf4i.h $vr7, $vr7, 14 + vsllwil.wu.hu $vr7, $vr7, 0 + vsllwil.du.wu $vr7, $vr7, 0 + vsub.d $vr2, $vr2, $vr6 + vsub.d $vr4, $vr4, $vr8 vsub.d $vr3, $vr3, $vr7 vsub.d $vr5, $vr5, $vr9 - vsub.d $vr4, $vr4, $vr8 - vsub.d $vr6, $vr6, $vr10 - vpickve2gr.d $s0, $vr5, 0 + vpickve2gr.d $s0, $vr4, 0 slli.d $s0, $s0, 2 - vpickve2gr.d $s2, $vr5, 1 + vpickve2gr.d $s2, $vr4, 1 slli.d $s2, $s2, 2 - vpickve2gr.d $s3, $vr3, 0 + vpickve2gr.d $s3, $vr2, 0 slli.d $s3, $s3, 2 - vpickve2gr.d $s4, $vr3, 1 + vpickve2gr.d $s4, $vr2, 1 slli.d $s4, $s4, 2 - vpickve2gr.d $s6, $vr6, 0 + vpickve2gr.d $s6, $vr5, 0 slli.d $s6, $s6, 2 - vpickve2gr.d $s7, $vr6, 1 + vpickve2gr.d $s7, $vr5, 1 slli.d $s7, $s7, 2 - vpickve2gr.d $s8, $vr4, 0 + vpickve2gr.d $s8, $vr3, 0 slli.d $s8, $s8, 2 - vpickve2gr.d $ra, $vr4, 1 + vpickve2gr.d $ra, $vr3, 1 slli.d $ra, $ra, 2 ldx.w $s0, $s5, $s0 ldx.w $s2, $s5, $s2 ldx.w $s3, $s5, $s3 ldx.w $s4, $s5, $s4 - vinsgr2vr.w $vr3, $s0, 0 - vinsgr2vr.w $vr3, $s2, 1 - vinsgr2vr.w $vr3, $s3, 2 - vinsgr2vr.w $vr3, $s4, 3 + vinsgr2vr.w $vr2, $s0, 0 + vinsgr2vr.w $vr2, $s2, 1 + vinsgr2vr.w $vr2, $s3, 2 + vinsgr2vr.w $vr2, $s4, 3 ldx.w $s0, $s5, $s6 ldx.w $s2, $s5, $s7 ldx.w $s3, $s5, $s8 ldx.w $s4, $s5, $ra - vinsgr2vr.w $vr4, $s0, 0 - vinsgr2vr.w $vr4, $s2, 1 - vinsgr2vr.w $vr4, $s3, 2 - vinsgr2vr.w $vr4, $s4, 3 - vadd.w $vr1, $vr3, $vr1 - vadd.w $vr2, $vr4, $vr2 + vinsgr2vr.w $vr3, $s0, 0 + vinsgr2vr.w $vr3, $s2, 1 + vinsgr2vr.w $vr3, $s3, 2 + vinsgr2vr.w $vr3, $s4, 3 + vadd.w $vr1, $vr2, $vr1 + vadd.w $vr0, $vr3, $vr0 addi.d $a0, $a0, 16 addi.d $t8, $t8, -8 addi.d $t7, $t7, 16 bnez $t8, .LBB4_93 # %bb.94: # %middle.block1277 # in Loop: Header=BB4_90 Depth=3 - vadd.w $vr0, $vr2, $vr1 + vadd.w $vr0, $vr0, $vr1 vhaddw.d.w $vr0, $vr0, $vr0 vhaddw.q.d $vr0, $vr0, $vr0 vpickve2gr.d $t7, $vr0, 0 @@ -4428,7 +4411,6 @@ SetupFastFullPelSearch: # @SetupFastFullPelSearch addi.d $a0, $s1, 8 addi.d $t6, $fp, 8 move $fp, $t1 - vori.b $vr2, $vr0, 0 .p2align 4, , 16 .LBB4_100: # %vector.body1238 # Parent Loop BB4_28 Depth=1 @@ -4437,69 +4419,77 @@ SetupFastFullPelSearch: # @SetupFastFullPelSearch # => This Inner Loop Header: Depth=4 ld.d $s1, $a0, -8 ld.d $s2, $a0, 0 - vinsgr2vr.d $vr3, $s1, 0 - vinsgr2vr.d $vr4, $s2, 0 - vilvl.h $vr3, $vr0, $vr3 - vilvl.w $vr5, $vr0, $vr3 - vilvh.w $vr3, $vr0, $vr3 - vilvl.h $vr4, $vr0, $vr4 + vinsgr2vr.d $vr2, $s1, 0 + vinsgr2vr.d $vr3, $s2, 0 + vsllwil.wu.hu $vr4, $vr2, 0 + vsllwil.du.wu $vr4, $vr4, 0 + vshuf4i.h $vr2, $vr2, 14 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.du.wu $vr2, $vr2, 0 + vsllwil.wu.hu $vr5, $vr3, 0 + vsllwil.du.wu $vr5, $vr5, 0 + vshuf4i.h $vr3, $vr3, 14 ld.d $s1, $t6, -8 ld.d $s2, $t6, 0 - vilvl.w $vr6, $vr0, $vr4 - vilvh.w $vr4, $vr0, $vr4 - vinsgr2vr.d $vr7, $s1, 0 - 
vinsgr2vr.d $vr8, $s2, 0 - vilvl.h $vr7, $vr0, $vr7 - vilvl.w $vr9, $vr0, $vr7 - vilvh.w $vr7, $vr0, $vr7 - vilvl.h $vr8, $vr0, $vr8 - vilvl.w $vr10, $vr0, $vr8 - vilvh.w $vr8, $vr0, $vr8 + vsllwil.wu.hu $vr3, $vr3, 0 + vsllwil.du.wu $vr3, $vr3, 0 + vinsgr2vr.d $vr6, $s1, 0 + vinsgr2vr.d $vr7, $s2, 0 + vsllwil.wu.hu $vr8, $vr6, 0 + vsllwil.du.wu $vr8, $vr8, 0 + vshuf4i.h $vr6, $vr6, 14 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.du.wu $vr6, $vr6, 0 + vsllwil.wu.hu $vr9, $vr7, 0 + vsllwil.du.wu $vr9, $vr9, 0 + vshuf4i.h $vr7, $vr7, 14 + vsllwil.wu.hu $vr7, $vr7, 0 + vsllwil.du.wu $vr7, $vr7, 0 + vsub.d $vr2, $vr2, $vr6 + vsub.d $vr4, $vr4, $vr8 vsub.d $vr3, $vr3, $vr7 vsub.d $vr5, $vr5, $vr9 - vsub.d $vr4, $vr4, $vr8 - vsub.d $vr6, $vr6, $vr10 - vpickve2gr.d $s1, $vr5, 0 + vpickve2gr.d $s1, $vr4, 0 slli.d $s1, $s1, 2 - vpickve2gr.d $s2, $vr5, 1 + vpickve2gr.d $s2, $vr4, 1 slli.d $s2, $s2, 2 - vpickve2gr.d $s3, $vr3, 0 + vpickve2gr.d $s3, $vr2, 0 slli.d $s3, $s3, 2 - vpickve2gr.d $s4, $vr3, 1 + vpickve2gr.d $s4, $vr2, 1 slli.d $s4, $s4, 2 - vpickve2gr.d $s6, $vr6, 0 + vpickve2gr.d $s6, $vr5, 0 slli.d $s6, $s6, 2 - vpickve2gr.d $s7, $vr6, 1 + vpickve2gr.d $s7, $vr5, 1 slli.d $s7, $s7, 2 - vpickve2gr.d $s8, $vr4, 0 + vpickve2gr.d $s8, $vr3, 0 slli.d $s8, $s8, 2 - vpickve2gr.d $ra, $vr4, 1 + vpickve2gr.d $ra, $vr3, 1 slli.d $ra, $ra, 2 ldx.w $s1, $s5, $s1 ldx.w $s2, $s5, $s2 ldx.w $s3, $s5, $s3 ldx.w $s4, $s5, $s4 - vinsgr2vr.w $vr3, $s1, 0 - vinsgr2vr.w $vr3, $s2, 1 - vinsgr2vr.w $vr3, $s3, 2 - vinsgr2vr.w $vr3, $s4, 3 + vinsgr2vr.w $vr2, $s1, 0 + vinsgr2vr.w $vr2, $s2, 1 + vinsgr2vr.w $vr2, $s3, 2 + vinsgr2vr.w $vr2, $s4, 3 ldx.w $s1, $s5, $s6 ldx.w $s2, $s5, $s7 ldx.w $s3, $s5, $s8 ldx.w $s4, $s5, $ra - vinsgr2vr.w $vr4, $s1, 0 - vinsgr2vr.w $vr4, $s2, 1 - vinsgr2vr.w $vr4, $s3, 2 - vinsgr2vr.w $vr4, $s4, 3 - vadd.w $vr1, $vr3, $vr1 - vadd.w $vr2, $vr4, $vr2 + vinsgr2vr.w $vr3, $s1, 0 + vinsgr2vr.w $vr3, $s2, 1 + vinsgr2vr.w $vr3, $s3, 2 + vinsgr2vr.w $vr3, $s4, 3 + vadd.w $vr1, $vr2, $vr1 + vadd.w $vr0, $vr3, $vr0 addi.d $a0, $a0, 16 addi.d $fp, $fp, -8 addi.d $t6, $t6, 16 bnez $fp, .LBB4_100 # %bb.101: # %middle.block1251 # in Loop: Header=BB4_90 Depth=3 - vadd.w $vr0, $vr2, $vr1 + vadd.w $vr0, $vr0, $vr1 vhaddw.d.w $vr0, $vr0, $vr0 vhaddw.q.d $vr0, $vr0, $vr0 vpickve2gr.d $t6, $vr0, 0 @@ -4547,7 +4537,6 @@ SetupFastFullPelSearch: # @SetupFastFullPelSearch addi.d $t5, $s0, 8 addi.d $t8, $t8, 8 move $s0, $t1 - vori.b $vr2, $vr0, 0 .p2align 4, , 16 .LBB4_107: # %vector.body1212 # Parent Loop BB4_28 Depth=1 @@ -4556,69 +4545,77 @@ SetupFastFullPelSearch: # @SetupFastFullPelSearch # => This Inner Loop Header: Depth=4 ld.d $s1, $t5, -8 ld.d $s2, $t5, 0 - vinsgr2vr.d $vr3, $s1, 0 - vinsgr2vr.d $vr4, $s2, 0 - vilvl.h $vr3, $vr0, $vr3 - vilvl.w $vr5, $vr0, $vr3 - vilvh.w $vr3, $vr0, $vr3 - vilvl.h $vr4, $vr0, $vr4 + vinsgr2vr.d $vr2, $s1, 0 + vinsgr2vr.d $vr3, $s2, 0 + vsllwil.wu.hu $vr4, $vr2, 0 + vsllwil.du.wu $vr4, $vr4, 0 + vshuf4i.h $vr2, $vr2, 14 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.du.wu $vr2, $vr2, 0 + vsllwil.wu.hu $vr5, $vr3, 0 + vsllwil.du.wu $vr5, $vr5, 0 + vshuf4i.h $vr3, $vr3, 14 ld.d $s1, $t8, -8 ld.d $s2, $t8, 0 - vilvl.w $vr6, $vr0, $vr4 - vilvh.w $vr4, $vr0, $vr4 - vinsgr2vr.d $vr7, $s1, 0 - vinsgr2vr.d $vr8, $s2, 0 - vilvl.h $vr7, $vr0, $vr7 - vilvl.w $vr9, $vr0, $vr7 - vilvh.w $vr7, $vr0, $vr7 - vilvl.h $vr8, $vr0, $vr8 - vilvl.w $vr10, $vr0, $vr8 - vilvh.w $vr8, $vr0, $vr8 + vsllwil.wu.hu $vr3, $vr3, 0 + vsllwil.du.wu $vr3, $vr3, 0 + vinsgr2vr.d $vr6, $s1, 0 + vinsgr2vr.d $vr7, 
$s2, 0 + vsllwil.wu.hu $vr8, $vr6, 0 + vsllwil.du.wu $vr8, $vr8, 0 + vshuf4i.h $vr6, $vr6, 14 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.du.wu $vr6, $vr6, 0 + vsllwil.wu.hu $vr9, $vr7, 0 + vsllwil.du.wu $vr9, $vr9, 0 + vshuf4i.h $vr7, $vr7, 14 + vsllwil.wu.hu $vr7, $vr7, 0 + vsllwil.du.wu $vr7, $vr7, 0 + vsub.d $vr2, $vr2, $vr6 + vsub.d $vr4, $vr4, $vr8 vsub.d $vr3, $vr3, $vr7 vsub.d $vr5, $vr5, $vr9 - vsub.d $vr4, $vr4, $vr8 - vsub.d $vr6, $vr6, $vr10 - vpickve2gr.d $s1, $vr5, 0 + vpickve2gr.d $s1, $vr4, 0 slli.d $s1, $s1, 2 - vpickve2gr.d $s2, $vr5, 1 + vpickve2gr.d $s2, $vr4, 1 slli.d $s2, $s2, 2 - vpickve2gr.d $s3, $vr3, 0 + vpickve2gr.d $s3, $vr2, 0 slli.d $s3, $s3, 2 - vpickve2gr.d $s4, $vr3, 1 + vpickve2gr.d $s4, $vr2, 1 slli.d $s4, $s4, 2 - vpickve2gr.d $s6, $vr6, 0 + vpickve2gr.d $s6, $vr5, 0 slli.d $s6, $s6, 2 - vpickve2gr.d $s7, $vr6, 1 + vpickve2gr.d $s7, $vr5, 1 slli.d $s7, $s7, 2 - vpickve2gr.d $s8, $vr4, 0 + vpickve2gr.d $s8, $vr3, 0 slli.d $s8, $s8, 2 - vpickve2gr.d $ra, $vr4, 1 + vpickve2gr.d $ra, $vr3, 1 slli.d $ra, $ra, 2 ldx.w $s1, $s5, $s1 ldx.w $s2, $s5, $s2 ldx.w $s3, $s5, $s3 ldx.w $s4, $s5, $s4 - vinsgr2vr.w $vr3, $s1, 0 - vinsgr2vr.w $vr3, $s2, 1 - vinsgr2vr.w $vr3, $s3, 2 - vinsgr2vr.w $vr3, $s4, 3 + vinsgr2vr.w $vr2, $s1, 0 + vinsgr2vr.w $vr2, $s2, 1 + vinsgr2vr.w $vr2, $s3, 2 + vinsgr2vr.w $vr2, $s4, 3 ldx.w $s1, $s5, $s6 ldx.w $s2, $s5, $s7 ldx.w $s3, $s5, $s8 ldx.w $s4, $s5, $ra - vinsgr2vr.w $vr4, $s1, 0 - vinsgr2vr.w $vr4, $s2, 1 - vinsgr2vr.w $vr4, $s3, 2 - vinsgr2vr.w $vr4, $s4, 3 - vadd.w $vr1, $vr3, $vr1 - vadd.w $vr2, $vr4, $vr2 + vinsgr2vr.w $vr3, $s1, 0 + vinsgr2vr.w $vr3, $s2, 1 + vinsgr2vr.w $vr3, $s3, 2 + vinsgr2vr.w $vr3, $s4, 3 + vadd.w $vr1, $vr2, $vr1 + vadd.w $vr0, $vr3, $vr0 addi.d $t5, $t5, 16 addi.d $s0, $s0, -8 addi.d $t8, $t8, 16 bnez $s0, .LBB4_107 # %bb.108: # %middle.block1225 # in Loop: Header=BB4_90 Depth=3 - vadd.w $vr0, $vr2, $vr1 + vadd.w $vr0, $vr0, $vr1 vhaddw.d.w $vr0, $vr0, $vr0 vhaddw.q.d $vr0, $vr0, $vr0 vpickve2gr.d $t5, $vr0, 0 @@ -4666,7 +4663,6 @@ SetupFastFullPelSearch: # @SetupFastFullPelSearch addi.d $t4, $fp, 8 addi.d $a0, $a0, 8 move $fp, $t1 - vori.b $vr2, $vr0, 0 .p2align 4, , 16 .LBB4_114: # %vector.body1186 # Parent Loop BB4_28 Depth=1 @@ -4675,69 +4671,77 @@ SetupFastFullPelSearch: # @SetupFastFullPelSearch # => This Inner Loop Header: Depth=4 ld.d $s0, $t4, -8 ld.d $s1, $t4, 0 - vinsgr2vr.d $vr3, $s0, 0 - vinsgr2vr.d $vr4, $s1, 0 - vilvl.h $vr3, $vr0, $vr3 - vilvl.w $vr5, $vr0, $vr3 - vilvh.w $vr3, $vr0, $vr3 - vilvl.h $vr4, $vr0, $vr4 + vinsgr2vr.d $vr2, $s0, 0 + vinsgr2vr.d $vr3, $s1, 0 + vsllwil.wu.hu $vr4, $vr2, 0 + vsllwil.du.wu $vr4, $vr4, 0 + vshuf4i.h $vr2, $vr2, 14 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.du.wu $vr2, $vr2, 0 + vsllwil.wu.hu $vr5, $vr3, 0 + vsllwil.du.wu $vr5, $vr5, 0 + vshuf4i.h $vr3, $vr3, 14 ld.d $s0, $a0, -8 ld.d $s1, $a0, 0 - vilvl.w $vr6, $vr0, $vr4 - vilvh.w $vr4, $vr0, $vr4 - vinsgr2vr.d $vr7, $s0, 0 - vinsgr2vr.d $vr8, $s1, 0 - vilvl.h $vr7, $vr0, $vr7 - vilvl.w $vr9, $vr0, $vr7 - vilvh.w $vr7, $vr0, $vr7 - vilvl.h $vr8, $vr0, $vr8 - vilvl.w $vr10, $vr0, $vr8 - vilvh.w $vr8, $vr0, $vr8 + vsllwil.wu.hu $vr3, $vr3, 0 + vsllwil.du.wu $vr3, $vr3, 0 + vinsgr2vr.d $vr6, $s0, 0 + vinsgr2vr.d $vr7, $s1, 0 + vsllwil.wu.hu $vr8, $vr6, 0 + vsllwil.du.wu $vr8, $vr8, 0 + vshuf4i.h $vr6, $vr6, 14 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.du.wu $vr6, $vr6, 0 + vsllwil.wu.hu $vr9, $vr7, 0 + vsllwil.du.wu $vr9, $vr9, 0 + vshuf4i.h $vr7, $vr7, 14 + vsllwil.wu.hu $vr7, $vr7, 0 + vsllwil.du.wu $vr7, 
$vr7, 0 + vsub.d $vr2, $vr2, $vr6 + vsub.d $vr4, $vr4, $vr8 vsub.d $vr3, $vr3, $vr7 vsub.d $vr5, $vr5, $vr9 - vsub.d $vr4, $vr4, $vr8 - vsub.d $vr6, $vr6, $vr10 - vpickve2gr.d $s0, $vr5, 0 + vpickve2gr.d $s0, $vr4, 0 slli.d $s0, $s0, 2 - vpickve2gr.d $s1, $vr5, 1 + vpickve2gr.d $s1, $vr4, 1 slli.d $s1, $s1, 2 - vpickve2gr.d $s3, $vr3, 0 + vpickve2gr.d $s3, $vr2, 0 slli.d $s3, $s3, 2 - vpickve2gr.d $s4, $vr3, 1 + vpickve2gr.d $s4, $vr2, 1 slli.d $s4, $s4, 2 - vpickve2gr.d $s6, $vr6, 0 + vpickve2gr.d $s6, $vr5, 0 slli.d $s6, $s6, 2 - vpickve2gr.d $s7, $vr6, 1 + vpickve2gr.d $s7, $vr5, 1 slli.d $s7, $s7, 2 - vpickve2gr.d $s8, $vr4, 0 + vpickve2gr.d $s8, $vr3, 0 slli.d $s8, $s8, 2 - vpickve2gr.d $ra, $vr4, 1 + vpickve2gr.d $ra, $vr3, 1 slli.d $ra, $ra, 2 ldx.w $s0, $s5, $s0 ldx.w $s1, $s5, $s1 ldx.w $s3, $s5, $s3 ldx.w $s4, $s5, $s4 - vinsgr2vr.w $vr3, $s0, 0 - vinsgr2vr.w $vr3, $s1, 1 - vinsgr2vr.w $vr3, $s3, 2 - vinsgr2vr.w $vr3, $s4, 3 + vinsgr2vr.w $vr2, $s0, 0 + vinsgr2vr.w $vr2, $s1, 1 + vinsgr2vr.w $vr2, $s3, 2 + vinsgr2vr.w $vr2, $s4, 3 ldx.w $s0, $s5, $s6 ldx.w $s1, $s5, $s7 ldx.w $s3, $s5, $s8 ldx.w $s4, $s5, $ra - vinsgr2vr.w $vr4, $s0, 0 - vinsgr2vr.w $vr4, $s1, 1 - vinsgr2vr.w $vr4, $s3, 2 - vinsgr2vr.w $vr4, $s4, 3 - vadd.w $vr1, $vr3, $vr1 - vadd.w $vr2, $vr4, $vr2 + vinsgr2vr.w $vr3, $s0, 0 + vinsgr2vr.w $vr3, $s1, 1 + vinsgr2vr.w $vr3, $s3, 2 + vinsgr2vr.w $vr3, $s4, 3 + vadd.w $vr1, $vr2, $vr1 + vadd.w $vr0, $vr3, $vr0 addi.d $t4, $t4, 16 addi.d $fp, $fp, -8 addi.d $a0, $a0, 16 bnez $fp, .LBB4_114 # %bb.115: # %middle.block1199 # in Loop: Header=BB4_90 Depth=3 - vadd.w $vr0, $vr2, $vr1 + vadd.w $vr0, $vr0, $vr1 vhaddw.d.w $vr0, $vr0, $vr0 vhaddw.q.d $vr0, $vr0, $vr0 vpickve2gr.d $t4, $vr0, 0 @@ -4904,7 +4908,6 @@ SetupFastFullPelSearch: # @SetupFastFullPelSearch addi.d $a0, $a0, 8 addi.d $t7, $s2, 8 move $s0, $t1 - vori.b $vr2, $vr0, 0 .p2align 4, , 16 .LBB4_130: # %vector.body1160 # Parent Loop BB4_28 Depth=1 @@ -4913,69 +4916,77 @@ SetupFastFullPelSearch: # @SetupFastFullPelSearch # => This Inner Loop Header: Depth=4 ld.d $s1, $a0, -8 ld.d $s2, $a0, 0 - vinsgr2vr.d $vr3, $s1, 0 - vinsgr2vr.d $vr4, $s2, 0 - vilvl.h $vr3, $vr0, $vr3 - vilvl.w $vr5, $vr0, $vr3 - vilvh.w $vr3, $vr0, $vr3 - vilvl.h $vr4, $vr0, $vr4 + vinsgr2vr.d $vr2, $s1, 0 + vinsgr2vr.d $vr3, $s2, 0 + vsllwil.wu.hu $vr4, $vr2, 0 + vsllwil.du.wu $vr4, $vr4, 0 + vshuf4i.h $vr2, $vr2, 14 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.du.wu $vr2, $vr2, 0 + vsllwil.wu.hu $vr5, $vr3, 0 + vsllwil.du.wu $vr5, $vr5, 0 + vshuf4i.h $vr3, $vr3, 14 ld.d $s1, $t7, -8 ld.d $s2, $t7, 0 - vilvl.w $vr6, $vr0, $vr4 - vilvh.w $vr4, $vr0, $vr4 - vinsgr2vr.d $vr7, $s1, 0 - vinsgr2vr.d $vr8, $s2, 0 - vilvl.h $vr7, $vr0, $vr7 - vilvl.w $vr9, $vr0, $vr7 - vilvh.w $vr7, $vr0, $vr7 - vilvl.h $vr8, $vr0, $vr8 - vilvl.w $vr10, $vr0, $vr8 - vilvh.w $vr8, $vr0, $vr8 + vsllwil.wu.hu $vr3, $vr3, 0 + vsllwil.du.wu $vr3, $vr3, 0 + vinsgr2vr.d $vr6, $s1, 0 + vinsgr2vr.d $vr7, $s2, 0 + vsllwil.wu.hu $vr8, $vr6, 0 + vsllwil.du.wu $vr8, $vr8, 0 + vshuf4i.h $vr6, $vr6, 14 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.du.wu $vr6, $vr6, 0 + vsllwil.wu.hu $vr9, $vr7, 0 + vsllwil.du.wu $vr9, $vr9, 0 + vshuf4i.h $vr7, $vr7, 14 + vsllwil.wu.hu $vr7, $vr7, 0 + vsllwil.du.wu $vr7, $vr7, 0 + vsub.d $vr2, $vr2, $vr6 + vsub.d $vr4, $vr4, $vr8 vsub.d $vr3, $vr3, $vr7 vsub.d $vr5, $vr5, $vr9 - vsub.d $vr4, $vr4, $vr8 - vsub.d $vr6, $vr6, $vr10 - vpickve2gr.d $s1, $vr5, 0 + vpickve2gr.d $s1, $vr4, 0 slli.d $s1, $s1, 2 - vpickve2gr.d $s2, $vr5, 1 + vpickve2gr.d $s2, $vr4, 1 
slli.d $s2, $s2, 2 - vpickve2gr.d $s3, $vr3, 0 + vpickve2gr.d $s3, $vr2, 0 slli.d $s3, $s3, 2 - vpickve2gr.d $s4, $vr3, 1 + vpickve2gr.d $s4, $vr2, 1 slli.d $s4, $s4, 2 - vpickve2gr.d $s6, $vr6, 0 + vpickve2gr.d $s6, $vr5, 0 slli.d $s6, $s6, 2 - vpickve2gr.d $s7, $vr6, 1 + vpickve2gr.d $s7, $vr5, 1 slli.d $s7, $s7, 2 - vpickve2gr.d $s8, $vr4, 0 + vpickve2gr.d $s8, $vr3, 0 slli.d $s8, $s8, 2 - vpickve2gr.d $ra, $vr4, 1 + vpickve2gr.d $ra, $vr3, 1 slli.d $ra, $ra, 2 ldx.w $s1, $s5, $s1 ldx.w $s2, $s5, $s2 ldx.w $s3, $s5, $s3 ldx.w $s4, $s5, $s4 - vinsgr2vr.w $vr3, $s1, 0 - vinsgr2vr.w $vr3, $s2, 1 - vinsgr2vr.w $vr3, $s3, 2 - vinsgr2vr.w $vr3, $s4, 3 + vinsgr2vr.w $vr2, $s1, 0 + vinsgr2vr.w $vr2, $s2, 1 + vinsgr2vr.w $vr2, $s3, 2 + vinsgr2vr.w $vr2, $s4, 3 ldx.w $s1, $s5, $s6 ldx.w $s2, $s5, $s7 ldx.w $s3, $s5, $s8 ldx.w $s4, $s5, $ra - vinsgr2vr.w $vr4, $s1, 0 - vinsgr2vr.w $vr4, $s2, 1 - vinsgr2vr.w $vr4, $s3, 2 - vinsgr2vr.w $vr4, $s4, 3 - vadd.w $vr1, $vr3, $vr1 - vadd.w $vr2, $vr4, $vr2 + vinsgr2vr.w $vr3, $s1, 0 + vinsgr2vr.w $vr3, $s2, 1 + vinsgr2vr.w $vr3, $s3, 2 + vinsgr2vr.w $vr3, $s4, 3 + vadd.w $vr1, $vr2, $vr1 + vadd.w $vr0, $vr3, $vr0 addi.d $a0, $a0, 16 addi.d $s0, $s0, -8 addi.d $t7, $t7, 16 bnez $s0, .LBB4_130 # %bb.131: # %middle.block1173 # in Loop: Header=BB4_127 Depth=3 - vadd.w $vr0, $vr2, $vr1 + vadd.w $vr0, $vr0, $vr1 vhaddw.d.w $vr0, $vr0, $vr0 vhaddw.q.d $vr0, $vr0, $vr0 vpickve2gr.d $t7, $vr0, 0 @@ -5023,7 +5034,6 @@ SetupFastFullPelSearch: # @SetupFastFullPelSearch addi.d $t6, $fp, 8 addi.d $t8, $t8, 8 move $fp, $t1 - vori.b $vr2, $vr0, 0 .p2align 4, , 16 .LBB4_137: # %vector.body1134 # Parent Loop BB4_28 Depth=1 @@ -5032,69 +5042,77 @@ SetupFastFullPelSearch: # @SetupFastFullPelSearch # => This Inner Loop Header: Depth=4 ld.d $s1, $t6, -8 ld.d $s2, $t6, 0 - vinsgr2vr.d $vr3, $s1, 0 - vinsgr2vr.d $vr4, $s2, 0 - vilvl.h $vr3, $vr0, $vr3 - vilvl.w $vr5, $vr0, $vr3 - vilvh.w $vr3, $vr0, $vr3 - vilvl.h $vr4, $vr0, $vr4 + vinsgr2vr.d $vr2, $s1, 0 + vinsgr2vr.d $vr3, $s2, 0 + vsllwil.wu.hu $vr4, $vr2, 0 + vsllwil.du.wu $vr4, $vr4, 0 + vshuf4i.h $vr2, $vr2, 14 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.du.wu $vr2, $vr2, 0 + vsllwil.wu.hu $vr5, $vr3, 0 + vsllwil.du.wu $vr5, $vr5, 0 + vshuf4i.h $vr3, $vr3, 14 ld.d $s1, $t8, -8 ld.d $s2, $t8, 0 - vilvl.w $vr6, $vr0, $vr4 - vilvh.w $vr4, $vr0, $vr4 - vinsgr2vr.d $vr7, $s1, 0 - vinsgr2vr.d $vr8, $s2, 0 - vilvl.h $vr7, $vr0, $vr7 - vilvl.w $vr9, $vr0, $vr7 - vilvh.w $vr7, $vr0, $vr7 - vilvl.h $vr8, $vr0, $vr8 - vilvl.w $vr10, $vr0, $vr8 - vilvh.w $vr8, $vr0, $vr8 + vsllwil.wu.hu $vr3, $vr3, 0 + vsllwil.du.wu $vr3, $vr3, 0 + vinsgr2vr.d $vr6, $s1, 0 + vinsgr2vr.d $vr7, $s2, 0 + vsllwil.wu.hu $vr8, $vr6, 0 + vsllwil.du.wu $vr8, $vr8, 0 + vshuf4i.h $vr6, $vr6, 14 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.du.wu $vr6, $vr6, 0 + vsllwil.wu.hu $vr9, $vr7, 0 + vsllwil.du.wu $vr9, $vr9, 0 + vshuf4i.h $vr7, $vr7, 14 + vsllwil.wu.hu $vr7, $vr7, 0 + vsllwil.du.wu $vr7, $vr7, 0 + vsub.d $vr2, $vr2, $vr6 + vsub.d $vr4, $vr4, $vr8 vsub.d $vr3, $vr3, $vr7 vsub.d $vr5, $vr5, $vr9 - vsub.d $vr4, $vr4, $vr8 - vsub.d $vr6, $vr6, $vr10 - vpickve2gr.d $s1, $vr5, 0 + vpickve2gr.d $s1, $vr4, 0 slli.d $s1, $s1, 2 - vpickve2gr.d $s2, $vr5, 1 + vpickve2gr.d $s2, $vr4, 1 slli.d $s2, $s2, 2 - vpickve2gr.d $s3, $vr3, 0 + vpickve2gr.d $s3, $vr2, 0 slli.d $s3, $s3, 2 - vpickve2gr.d $s4, $vr3, 1 + vpickve2gr.d $s4, $vr2, 1 slli.d $s4, $s4, 2 - vpickve2gr.d $s6, $vr6, 0 + vpickve2gr.d $s6, $vr5, 0 slli.d $s6, $s6, 2 - vpickve2gr.d $s7, $vr6, 1 + vpickve2gr.d $s7, 
$vr5, 1 slli.d $s7, $s7, 2 - vpickve2gr.d $s8, $vr4, 0 + vpickve2gr.d $s8, $vr3, 0 slli.d $s8, $s8, 2 - vpickve2gr.d $ra, $vr4, 1 + vpickve2gr.d $ra, $vr3, 1 slli.d $ra, $ra, 2 ldx.w $s1, $s5, $s1 ldx.w $s2, $s5, $s2 ldx.w $s3, $s5, $s3 ldx.w $s4, $s5, $s4 - vinsgr2vr.w $vr3, $s1, 0 - vinsgr2vr.w $vr3, $s2, 1 - vinsgr2vr.w $vr3, $s3, 2 - vinsgr2vr.w $vr3, $s4, 3 + vinsgr2vr.w $vr2, $s1, 0 + vinsgr2vr.w $vr2, $s2, 1 + vinsgr2vr.w $vr2, $s3, 2 + vinsgr2vr.w $vr2, $s4, 3 ldx.w $s1, $s5, $s6 ldx.w $s2, $s5, $s7 ldx.w $s3, $s5, $s8 ldx.w $s4, $s5, $ra - vinsgr2vr.w $vr4, $s1, 0 - vinsgr2vr.w $vr4, $s2, 1 - vinsgr2vr.w $vr4, $s3, 2 - vinsgr2vr.w $vr4, $s4, 3 - vadd.w $vr1, $vr3, $vr1 - vadd.w $vr2, $vr4, $vr2 + vinsgr2vr.w $vr3, $s1, 0 + vinsgr2vr.w $vr3, $s2, 1 + vinsgr2vr.w $vr3, $s3, 2 + vinsgr2vr.w $vr3, $s4, 3 + vadd.w $vr1, $vr2, $vr1 + vadd.w $vr0, $vr3, $vr0 addi.d $t6, $t6, 16 addi.d $fp, $fp, -8 addi.d $t8, $t8, 16 bnez $fp, .LBB4_137 # %bb.138: # %middle.block1147 # in Loop: Header=BB4_127 Depth=3 - vadd.w $vr0, $vr2, $vr1 + vadd.w $vr0, $vr0, $vr1 vhaddw.d.w $vr0, $vr0, $vr0 vhaddw.q.d $vr0, $vr0, $vr0 vpickve2gr.d $t6, $vr0, 0 @@ -5142,7 +5160,6 @@ SetupFastFullPelSearch: # @SetupFastFullPelSearch addi.d $t5, $s0, 8 addi.d $a0, $a0, 8 move $s0, $t1 - vori.b $vr2, $vr0, 0 .p2align 4, , 16 .LBB4_144: # %vector.body1108 # Parent Loop BB4_28 Depth=1 @@ -5151,69 +5168,77 @@ SetupFastFullPelSearch: # @SetupFastFullPelSearch # => This Inner Loop Header: Depth=4 ld.d $s1, $t5, -8 ld.d $s2, $t5, 0 - vinsgr2vr.d $vr3, $s1, 0 - vinsgr2vr.d $vr4, $s2, 0 - vilvl.h $vr3, $vr0, $vr3 - vilvl.w $vr5, $vr0, $vr3 - vilvh.w $vr3, $vr0, $vr3 - vilvl.h $vr4, $vr0, $vr4 + vinsgr2vr.d $vr2, $s1, 0 + vinsgr2vr.d $vr3, $s2, 0 + vsllwil.wu.hu $vr4, $vr2, 0 + vsllwil.du.wu $vr4, $vr4, 0 + vshuf4i.h $vr2, $vr2, 14 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.du.wu $vr2, $vr2, 0 + vsllwil.wu.hu $vr5, $vr3, 0 + vsllwil.du.wu $vr5, $vr5, 0 + vshuf4i.h $vr3, $vr3, 14 ld.d $s1, $a0, -8 ld.d $s2, $a0, 0 - vilvl.w $vr6, $vr0, $vr4 - vilvh.w $vr4, $vr0, $vr4 - vinsgr2vr.d $vr7, $s1, 0 - vinsgr2vr.d $vr8, $s2, 0 - vilvl.h $vr7, $vr0, $vr7 - vilvl.w $vr9, $vr0, $vr7 - vilvh.w $vr7, $vr0, $vr7 - vilvl.h $vr8, $vr0, $vr8 - vilvl.w $vr10, $vr0, $vr8 - vilvh.w $vr8, $vr0, $vr8 + vsllwil.wu.hu $vr3, $vr3, 0 + vsllwil.du.wu $vr3, $vr3, 0 + vinsgr2vr.d $vr6, $s1, 0 + vinsgr2vr.d $vr7, $s2, 0 + vsllwil.wu.hu $vr8, $vr6, 0 + vsllwil.du.wu $vr8, $vr8, 0 + vshuf4i.h $vr6, $vr6, 14 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.du.wu $vr6, $vr6, 0 + vsllwil.wu.hu $vr9, $vr7, 0 + vsllwil.du.wu $vr9, $vr9, 0 + vshuf4i.h $vr7, $vr7, 14 + vsllwil.wu.hu $vr7, $vr7, 0 + vsllwil.du.wu $vr7, $vr7, 0 + vsub.d $vr2, $vr2, $vr6 + vsub.d $vr4, $vr4, $vr8 vsub.d $vr3, $vr3, $vr7 vsub.d $vr5, $vr5, $vr9 - vsub.d $vr4, $vr4, $vr8 - vsub.d $vr6, $vr6, $vr10 - vpickve2gr.d $s1, $vr5, 0 + vpickve2gr.d $s1, $vr4, 0 slli.d $s1, $s1, 2 - vpickve2gr.d $s2, $vr5, 1 + vpickve2gr.d $s2, $vr4, 1 slli.d $s2, $s2, 2 - vpickve2gr.d $s3, $vr3, 0 + vpickve2gr.d $s3, $vr2, 0 slli.d $s3, $s3, 2 - vpickve2gr.d $s4, $vr3, 1 + vpickve2gr.d $s4, $vr2, 1 slli.d $s4, $s4, 2 - vpickve2gr.d $s6, $vr6, 0 + vpickve2gr.d $s6, $vr5, 0 slli.d $s6, $s6, 2 - vpickve2gr.d $s7, $vr6, 1 + vpickve2gr.d $s7, $vr5, 1 slli.d $s7, $s7, 2 - vpickve2gr.d $s8, $vr4, 0 + vpickve2gr.d $s8, $vr3, 0 slli.d $s8, $s8, 2 - vpickve2gr.d $ra, $vr4, 1 + vpickve2gr.d $ra, $vr3, 1 slli.d $ra, $ra, 2 ldx.w $s1, $s5, $s1 ldx.w $s2, $s5, $s2 ldx.w $s3, $s5, $s3 ldx.w $s4, $s5, $s4 - vinsgr2vr.w $vr3, $s1, 0 - 
vinsgr2vr.w $vr3, $s2, 1 - vinsgr2vr.w $vr3, $s3, 2 - vinsgr2vr.w $vr3, $s4, 3 + vinsgr2vr.w $vr2, $s1, 0 + vinsgr2vr.w $vr2, $s2, 1 + vinsgr2vr.w $vr2, $s3, 2 + vinsgr2vr.w $vr2, $s4, 3 ldx.w $s1, $s5, $s6 ldx.w $s2, $s5, $s7 ldx.w $s3, $s5, $s8 ldx.w $s4, $s5, $ra - vinsgr2vr.w $vr4, $s1, 0 - vinsgr2vr.w $vr4, $s2, 1 - vinsgr2vr.w $vr4, $s3, 2 - vinsgr2vr.w $vr4, $s4, 3 - vadd.w $vr1, $vr3, $vr1 - vadd.w $vr2, $vr4, $vr2 + vinsgr2vr.w $vr3, $s1, 0 + vinsgr2vr.w $vr3, $s2, 1 + vinsgr2vr.w $vr3, $s3, 2 + vinsgr2vr.w $vr3, $s4, 3 + vadd.w $vr1, $vr2, $vr1 + vadd.w $vr0, $vr3, $vr0 addi.d $t5, $t5, 16 addi.d $s0, $s0, -8 addi.d $a0, $a0, 16 bnez $s0, .LBB4_144 # %bb.145: # %middle.block1121 # in Loop: Header=BB4_127 Depth=3 - vadd.w $vr0, $vr2, $vr1 + vadd.w $vr0, $vr0, $vr1 vhaddw.d.w $vr0, $vr0, $vr0 vhaddw.q.d $vr0, $vr0, $vr0 vpickve2gr.d $t5, $vr0, 0 @@ -5261,7 +5286,6 @@ SetupFastFullPelSearch: # @SetupFastFullPelSearch addi.d $t4, $fp, 8 addi.d $t8, $t8, 8 move $fp, $t1 - vori.b $vr2, $vr0, 0 .p2align 4, , 16 .LBB4_151: # %vector.body1082 # Parent Loop BB4_28 Depth=1 @@ -5270,69 +5294,77 @@ SetupFastFullPelSearch: # @SetupFastFullPelSearch # => This Inner Loop Header: Depth=4 ld.d $s0, $t4, -8 ld.d $s1, $t4, 0 - vinsgr2vr.d $vr3, $s0, 0 - vinsgr2vr.d $vr4, $s1, 0 - vilvl.h $vr3, $vr0, $vr3 - vilvl.w $vr5, $vr0, $vr3 - vilvh.w $vr3, $vr0, $vr3 - vilvl.h $vr4, $vr0, $vr4 + vinsgr2vr.d $vr2, $s0, 0 + vinsgr2vr.d $vr3, $s1, 0 + vsllwil.wu.hu $vr4, $vr2, 0 + vsllwil.du.wu $vr4, $vr4, 0 + vshuf4i.h $vr2, $vr2, 14 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.du.wu $vr2, $vr2, 0 + vsllwil.wu.hu $vr5, $vr3, 0 + vsllwil.du.wu $vr5, $vr5, 0 + vshuf4i.h $vr3, $vr3, 14 ld.d $s0, $t8, -8 ld.d $s1, $t8, 0 - vilvl.w $vr6, $vr0, $vr4 - vilvh.w $vr4, $vr0, $vr4 - vinsgr2vr.d $vr7, $s0, 0 - vinsgr2vr.d $vr8, $s1, 0 - vilvl.h $vr7, $vr0, $vr7 - vilvl.w $vr9, $vr0, $vr7 - vilvh.w $vr7, $vr0, $vr7 - vilvl.h $vr8, $vr0, $vr8 - vilvl.w $vr10, $vr0, $vr8 - vilvh.w $vr8, $vr0, $vr8 + vsllwil.wu.hu $vr3, $vr3, 0 + vsllwil.du.wu $vr3, $vr3, 0 + vinsgr2vr.d $vr6, $s0, 0 + vinsgr2vr.d $vr7, $s1, 0 + vsllwil.wu.hu $vr8, $vr6, 0 + vsllwil.du.wu $vr8, $vr8, 0 + vshuf4i.h $vr6, $vr6, 14 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.du.wu $vr6, $vr6, 0 + vsllwil.wu.hu $vr9, $vr7, 0 + vsllwil.du.wu $vr9, $vr9, 0 + vshuf4i.h $vr7, $vr7, 14 + vsllwil.wu.hu $vr7, $vr7, 0 + vsllwil.du.wu $vr7, $vr7, 0 + vsub.d $vr2, $vr2, $vr6 + vsub.d $vr4, $vr4, $vr8 vsub.d $vr3, $vr3, $vr7 vsub.d $vr5, $vr5, $vr9 - vsub.d $vr4, $vr4, $vr8 - vsub.d $vr6, $vr6, $vr10 - vpickve2gr.d $s0, $vr5, 0 + vpickve2gr.d $s0, $vr4, 0 slli.d $s0, $s0, 2 - vpickve2gr.d $s1, $vr5, 1 + vpickve2gr.d $s1, $vr4, 1 slli.d $s1, $s1, 2 - vpickve2gr.d $s3, $vr3, 0 + vpickve2gr.d $s3, $vr2, 0 slli.d $s3, $s3, 2 - vpickve2gr.d $s4, $vr3, 1 + vpickve2gr.d $s4, $vr2, 1 slli.d $s4, $s4, 2 - vpickve2gr.d $s6, $vr6, 0 + vpickve2gr.d $s6, $vr5, 0 slli.d $s6, $s6, 2 - vpickve2gr.d $s7, $vr6, 1 + vpickve2gr.d $s7, $vr5, 1 slli.d $s7, $s7, 2 - vpickve2gr.d $s8, $vr4, 0 + vpickve2gr.d $s8, $vr3, 0 slli.d $s8, $s8, 2 - vpickve2gr.d $ra, $vr4, 1 + vpickve2gr.d $ra, $vr3, 1 slli.d $ra, $ra, 2 ldx.w $s0, $s5, $s0 ldx.w $s1, $s5, $s1 ldx.w $s3, $s5, $s3 ldx.w $s4, $s5, $s4 - vinsgr2vr.w $vr3, $s0, 0 - vinsgr2vr.w $vr3, $s1, 1 - vinsgr2vr.w $vr3, $s3, 2 - vinsgr2vr.w $vr3, $s4, 3 + vinsgr2vr.w $vr2, $s0, 0 + vinsgr2vr.w $vr2, $s1, 1 + vinsgr2vr.w $vr2, $s3, 2 + vinsgr2vr.w $vr2, $s4, 3 ldx.w $s0, $s5, $s6 ldx.w $s1, $s5, $s7 ldx.w $s3, $s5, $s8 ldx.w $s4, $s5, $ra - vinsgr2vr.w $vr4, $s0, 0 
- vinsgr2vr.w $vr4, $s1, 1 - vinsgr2vr.w $vr4, $s3, 2 - vinsgr2vr.w $vr4, $s4, 3 - vadd.w $vr1, $vr3, $vr1 - vadd.w $vr2, $vr4, $vr2 + vinsgr2vr.w $vr3, $s0, 0 + vinsgr2vr.w $vr3, $s1, 1 + vinsgr2vr.w $vr3, $s3, 2 + vinsgr2vr.w $vr3, $s4, 3 + vadd.w $vr1, $vr2, $vr1 + vadd.w $vr0, $vr3, $vr0 addi.d $t4, $t4, 16 addi.d $fp, $fp, -8 addi.d $t8, $t8, 16 bnez $fp, .LBB4_151 # %bb.152: # %middle.block1095 # in Loop: Header=BB4_127 Depth=3 - vadd.w $vr0, $vr2, $vr1 + vadd.w $vr0, $vr0, $vr1 vhaddw.d.w $vr0, $vr0, $vr0 vhaddw.q.d $vr0, $vr0, $vr0 vpickve2gr.d $t4, $vr0, 0 diff --git a/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/mode_decision.s b/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/mode_decision.s index 318307f1..894eefa9 100644 --- a/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/mode_decision.s +++ b/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/mode_decision.s @@ -13,7 +13,6 @@ rc_store_diff: # @rc_store_diff move $a4, $zero addi.d $a2, $a2, 16 slli.d $a5, $a0, 1 - vrepli.b $vr0, 0 ori $a6, $zero, 128 .p2align 4, , 16 .LBB0_1: # =>This Inner Loop Header: Depth=1 @@ -21,36 +20,36 @@ rc_store_diff: # @rc_store_diff ldx.d $t0, $a7, $a5 ld.d $t1, $a2, -16 alsl.d $a7, $a0, $a7, 1 - vinsgr2vr.d $vr1, $t0, 0 - vilvl.h $vr1, $vr0, $vr1 - vinsgr2vr.d $vr2, $t1, 0 - vilvl.h $vr2, $vr0, $vr2 + vinsgr2vr.d $vr0, $t0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vinsgr2vr.d $vr1, $t1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 ld.d $t0, $a7, 8 - vsub.w $vr1, $vr1, $vr2 + vsub.w $vr0, $vr0, $vr1 ld.d $t1, $a2, -8 - vst $vr1, $a3, -32 - vinsgr2vr.d $vr1, $t0, 0 - vilvl.h $vr1, $vr0, $vr1 - vinsgr2vr.d $vr2, $t1, 0 + vst $vr0, $a3, -32 + vinsgr2vr.d $vr0, $t0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vinsgr2vr.d $vr1, $t1, 0 ld.d $t0, $a7, 16 - vilvl.h $vr2, $vr0, $vr2 - vsub.w $vr1, $vr1, $vr2 + vsllwil.wu.hu $vr1, $vr1, 0 + vsub.w $vr0, $vr0, $vr1 ld.d $t1, $a2, 0 - vinsgr2vr.d $vr2, $t0, 0 - vst $vr1, $a3, -16 - vilvl.h $vr1, $vr0, $vr2 - vinsgr2vr.d $vr2, $t1, 0 - vilvl.h $vr2, $vr0, $vr2 + vinsgr2vr.d $vr1, $t0, 0 + vst $vr0, $a3, -16 + vsllwil.wu.hu $vr0, $vr1, 0 + vinsgr2vr.d $vr1, $t1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 ld.d $a7, $a7, 24 - vsub.w $vr1, $vr1, $vr2 + vsub.w $vr0, $vr0, $vr1 ld.d $t0, $a2, 8 - vst $vr1, $a3, 0 - vinsgr2vr.d $vr1, $a7, 0 - vilvl.h $vr1, $vr0, $vr1 - vinsgr2vr.d $vr2, $t0, 0 - vilvl.h $vr2, $vr0, $vr2 - vsub.w $vr1, $vr1, $vr2 - vst $vr1, $a3, 16 + vst $vr0, $a3, 0 + vinsgr2vr.d $vr0, $a7, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vinsgr2vr.d $vr1, $t0, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vsub.w $vr0, $vr0, $vr1 + vst $vr0, $a3, 16 addi.d $a4, $a4, 8 addi.d $a3, $a3, 64 addi.d $a2, $a2, 32 @@ -160,20 +159,19 @@ fast_mode_intra_decision: # @fast_mode_intra_decision slli.d $t2, $t2, 3 vinsgr2vr.w $vr3, $t4, 0 vinsgr2vr.w $vr4, $t6, 0 - vabsd.hu $vr3, $vr1, $vr3 - vrepli.b $vr1, 0 + vabsd.hu $vr1, $vr1, $vr3 ldx.h $t3, $t3, $a5 - vilvl.h $vr3, $vr1, $vr3 + vsllwil.wu.hu $vr1, $vr1, 0 ld.d $t4, $a6, 8 ld.d $t5, $a6, 16 - vinsgr2vr.h $vr5, $t3, 0 + vinsgr2vr.h $vr3, $t3, 0 ld.d $t3, $a6, 24 vabsd.hu $vr2, $vr2, $vr4 ldx.h $t4, $t4, $a5 ldx.h $t5, $t5, $a5 ldx.h $t3, $t3, $a5 - vilvl.h $vr2, $vr1, $vr2 - vinsgr2vr.h $vr5, $t4, 1 + vsllwil.wu.hu $vr2, $vr2, 0 + vinsgr2vr.h $vr3, $t4, 1 vinsgr2vr.h $vr4, $t5, 0 vinsgr2vr.h $vr4, $t3, 1 ldx.d $t1, $t1, $t2 @@ -188,30 +186,30 @@ fast_mode_intra_decision: # @fast_mode_intra_decision ld.h $t2, $t2, -2 ld.h $t3, $t3, -2 ld.h $t4, $t4, -2 - vinsgr2vr.h $vr6, $t1, 0 - vinsgr2vr.h $vr6, $t2, 1 - 
vinsgr2vr.h $vr7, $t3, 0 - vinsgr2vr.h $vr7, $t4, 1 - vabsd.hu $vr5, $vr5, $vr6 - vilvl.h $vr5, $vr1, $vr5 - vabsd.hu $vr4, $vr4, $vr7 - vilvl.h $vr4, $vr1, $vr4 - vadd.w $vr3, $vr3, $vr5 + vinsgr2vr.h $vr5, $t1, 0 + vinsgr2vr.h $vr5, $t2, 1 + vinsgr2vr.h $vr6, $t3, 0 + vinsgr2vr.h $vr6, $t4, 1 + vabsd.hu $vr3, $vr3, $vr5 + vsllwil.wu.hu $vr3, $vr3, 0 + vabsd.hu $vr4, $vr4, $vr6 + vsllwil.wu.hu $vr4, $vr4, 0 + vadd.w $vr1, $vr1, $vr3 vadd.w $vr2, $vr2, $vr4 ld.w $t1, $t0, 8 ld.w $t2, $t0, 12 ld.w $t3, $a7, 8 ld.w $t4, $a7, 12 - vinsgr2vr.w $vr4, $t1, 0 - vinsgr2vr.w $vr5, $t2, 0 - vinsgr2vr.w $vr6, $t3, 0 - vinsgr2vr.w $vr7, $t4, 0 + vinsgr2vr.w $vr3, $t1, 0 + vinsgr2vr.w $vr4, $t2, 0 + vinsgr2vr.w $vr5, $t3, 0 + vinsgr2vr.w $vr6, $t4, 0 + vabsd.hu $vr3, $vr3, $vr5 + vsllwil.wu.hu $vr3, $vr3, 0 vabsd.hu $vr4, $vr4, $vr6 - vilvl.h $vr4, $vr1, $vr4 - vabsd.hu $vr5, $vr5, $vr7 - vilvl.h $vr5, $vr1, $vr5 - vadd.w $vr3, $vr3, $vr4 - vadd.w $vr2, $vr2, $vr5 + vsllwil.wu.hu $vr4, $vr4, 0 + vadd.w $vr1, $vr1, $vr3 + vadd.w $vr2, $vr2, $vr4 ld.d $t1, $a6, 32 ld.d $t2, $a6, 40 ld.d $t3, $a6, 48 @@ -220,10 +218,10 @@ fast_mode_intra_decision: # @fast_mode_intra_decision ldx.h $t2, $t2, $a5 ldx.h $t3, $t3, $a5 ldx.h $t4, $t4, $a5 - vinsgr2vr.h $vr4, $t1, 0 - vinsgr2vr.h $vr4, $t2, 1 - vinsgr2vr.h $vr5, $t3, 0 - vinsgr2vr.h $vr5, $t4, 1 + vinsgr2vr.h $vr3, $t1, 0 + vinsgr2vr.h $vr3, $t2, 1 + vinsgr2vr.h $vr4, $t3, 0 + vinsgr2vr.h $vr4, $t4, 1 ld.d $t1, $a3, 32 ld.d $t2, $a3, 40 ld.d $t3, $a3, 48 @@ -236,30 +234,30 @@ fast_mode_intra_decision: # @fast_mode_intra_decision ld.h $t2, $t2, -2 ld.h $t3, $t3, -2 ld.h $t4, $t4, -2 - vinsgr2vr.h $vr6, $t1, 0 - vinsgr2vr.h $vr6, $t2, 1 - vinsgr2vr.h $vr7, $t3, 0 - vinsgr2vr.h $vr7, $t4, 1 + vinsgr2vr.h $vr5, $t1, 0 + vinsgr2vr.h $vr5, $t2, 1 + vinsgr2vr.h $vr6, $t3, 0 + vinsgr2vr.h $vr6, $t4, 1 + vabsd.hu $vr3, $vr3, $vr5 + vsllwil.wu.hu $vr3, $vr3, 0 vabsd.hu $vr4, $vr4, $vr6 - vilvl.h $vr4, $vr1, $vr4 - vabsd.hu $vr5, $vr5, $vr7 - vilvl.h $vr5, $vr1, $vr5 - vadd.w $vr3, $vr3, $vr4 - vadd.w $vr2, $vr2, $vr5 + vsllwil.wu.hu $vr4, $vr4, 0 + vadd.w $vr1, $vr1, $vr3 + vadd.w $vr2, $vr2, $vr4 ld.w $t1, $t0, 16 ld.w $t2, $t0, 20 ld.w $t3, $a7, 16 ld.w $t4, $a7, 20 - vinsgr2vr.w $vr4, $t1, 0 - vinsgr2vr.w $vr5, $t2, 0 - vinsgr2vr.w $vr6, $t3, 0 - vinsgr2vr.w $vr7, $t4, 0 + vinsgr2vr.w $vr3, $t1, 0 + vinsgr2vr.w $vr4, $t2, 0 + vinsgr2vr.w $vr5, $t3, 0 + vinsgr2vr.w $vr6, $t4, 0 + vabsd.hu $vr3, $vr3, $vr5 + vsllwil.wu.hu $vr3, $vr3, 0 vabsd.hu $vr4, $vr4, $vr6 - vilvl.h $vr4, $vr1, $vr4 - vabsd.hu $vr5, $vr5, $vr7 - vilvl.h $vr5, $vr1, $vr5 - vadd.w $vr3, $vr3, $vr4 - vadd.w $vr2, $vr2, $vr5 + vsllwil.wu.hu $vr4, $vr4, 0 + vadd.w $vr1, $vr1, $vr3 + vadd.w $vr2, $vr2, $vr4 ld.d $t1, $a6, 64 ld.d $t2, $a6, 72 ld.d $t3, $a6, 80 @@ -268,10 +266,10 @@ fast_mode_intra_decision: # @fast_mode_intra_decision ldx.h $t2, $t2, $a5 ldx.h $t3, $t3, $a5 ldx.h $t4, $t4, $a5 - vinsgr2vr.h $vr4, $t1, 0 - vinsgr2vr.h $vr4, $t2, 1 - vinsgr2vr.h $vr5, $t3, 0 - vinsgr2vr.h $vr5, $t4, 1 + vinsgr2vr.h $vr3, $t1, 0 + vinsgr2vr.h $vr3, $t2, 1 + vinsgr2vr.h $vr4, $t3, 0 + vinsgr2vr.h $vr4, $t4, 1 ld.d $t1, $a3, 64 ld.d $t2, $a3, 72 ld.d $t3, $a3, 80 @@ -284,34 +282,34 @@ fast_mode_intra_decision: # @fast_mode_intra_decision ld.h $t2, $t2, -2 ld.h $t3, $t3, -2 ld.h $t4, $t4, -2 - vinsgr2vr.h $vr6, $t1, 0 - vinsgr2vr.h $vr6, $t2, 1 - vinsgr2vr.h $vr7, $t3, 0 - vinsgr2vr.h $vr7, $t4, 1 + vinsgr2vr.h $vr5, $t1, 0 + vinsgr2vr.h $vr5, $t2, 1 + vinsgr2vr.h $vr6, $t3, 0 + vinsgr2vr.h $vr6, $t4, 1 + 
vabsd.hu $vr3, $vr3, $vr5 + vsllwil.wu.hu $vr3, $vr3, 0 vabsd.hu $vr4, $vr4, $vr6 - vilvl.h $vr4, $vr1, $vr4 - vabsd.hu $vr5, $vr5, $vr7 - vilvl.h $vr5, $vr1, $vr5 - vadd.w $vr3, $vr3, $vr4 - vilvl.w $vr3, $vr1, $vr3 - vadd.w $vr2, $vr2, $vr5 - vilvl.w $vr2, $vr1, $vr2 + vsllwil.wu.hu $vr4, $vr4, 0 + vadd.w $vr1, $vr1, $vr3 + vsllwil.du.wu $vr1, $vr1, 0 + vadd.w $vr2, $vr2, $vr4 + vsllwil.du.wu $vr2, $vr2, 0 ld.w $t1, $t0, 24 ld.w $t0, $t0, 28 ld.w $t2, $a7, 24 ld.w $a7, $a7, 28 - vinsgr2vr.w $vr4, $t1, 0 - vinsgr2vr.w $vr5, $t0, 0 - vinsgr2vr.w $vr6, $t2, 0 - vinsgr2vr.w $vr7, $a7, 0 + vinsgr2vr.w $vr3, $t1, 0 + vinsgr2vr.w $vr4, $t0, 0 + vinsgr2vr.w $vr5, $t2, 0 + vinsgr2vr.w $vr6, $a7, 0 + vabsd.hu $vr3, $vr3, $vr5 vabsd.hu $vr4, $vr4, $vr6 - vabsd.hu $vr5, $vr5, $vr7 - vilvl.h $vr4, $vr1, $vr4 - vilvl.w $vr4, $vr1, $vr4 - vilvl.h $vr5, $vr1, $vr5 - vilvl.w $vr5, $vr1, $vr5 - vadd.d $vr3, $vr3, $vr4 - vadd.d $vr2, $vr2, $vr5 + vsllwil.wu.hu $vr3, $vr3, 0 + vsllwil.du.wu $vr3, $vr3, 0 + vsllwil.wu.hu $vr4, $vr4, 0 + vsllwil.du.wu $vr4, $vr4, 0 + vadd.d $vr1, $vr1, $vr3 + vadd.d $vr2, $vr2, $vr4 ld.d $a7, $a6, 96 ld.d $t0, $a6, 104 ld.d $t1, $a6, 112 @@ -320,10 +318,10 @@ fast_mode_intra_decision: # @fast_mode_intra_decision ldx.h $t0, $t0, $a5 ldx.h $t1, $t1, $a5 ldx.h $a5, $a6, $a5 - vinsgr2vr.h $vr4, $a7, 0 - vinsgr2vr.h $vr4, $t0, 1 - vinsgr2vr.h $vr5, $t1, 0 - vinsgr2vr.h $vr5, $a5, 1 + vinsgr2vr.h $vr3, $a7, 0 + vinsgr2vr.h $vr3, $t0, 1 + vinsgr2vr.h $vr4, $t1, 0 + vinsgr2vr.h $vr4, $a5, 1 ld.d $a5, $a3, 96 ld.d $a6, $a3, 104 ld.d $a7, $a3, 112 @@ -336,25 +334,25 @@ fast_mode_intra_decision: # @fast_mode_intra_decision ld.h $a5, $a6, -2 ld.h $a6, $a7, -2 ld.h $a3, $a3, -2 - vinsgr2vr.h $vr6, $a4, 0 - vinsgr2vr.h $vr6, $a5, 1 - vinsgr2vr.h $vr7, $a6, 0 - vinsgr2vr.h $vr7, $a3, 1 + vinsgr2vr.h $vr5, $a4, 0 + vinsgr2vr.h $vr5, $a5, 1 + vinsgr2vr.h $vr6, $a6, 0 + vinsgr2vr.h $vr6, $a3, 1 + vabsd.hu $vr3, $vr3, $vr5 vabsd.hu $vr4, $vr4, $vr6 - vabsd.hu $vr5, $vr5, $vr7 - vilvl.h $vr4, $vr1, $vr4 - vilvl.w $vr4, $vr1, $vr4 - vilvl.h $vr5, $vr1, $vr5 + vsllwil.wu.hu $vr3, $vr3, 0 + vsllwil.du.wu $vr3, $vr3, 0 + vsllwil.wu.hu $vr4, $vr4, 0 pcalau12i $a3, %pc_hi20(imgUV_org) ld.d $a5, $a3, %pc_lo12(imgUV_org) - vilvl.w $vr5, $vr1, $vr5 - vadd.d $vr3, $vr3, $vr4 - vadd.d $vr2, $vr2, $vr5 + vsllwil.du.wu $vr4, $vr4, 0 + vadd.d $vr1, $vr1, $vr3 + vadd.d $vr2, $vr2, $vr4 ld.d $a3, $a5, 0 ld.w $a6, $a0, 204 ldptr.d $a7, $a1, 6472 - vadd.d $vr2, $vr2, $vr3 - vhaddw.q.d $vr3, $vr2, $vr2 + vadd.d $vr1, $vr2, $vr1 + vhaddw.q.d $vr2, $vr1, $vr1 alsl.d $a1, $a6, $a3, 3 ld.d $a4, $a7, 0 ld.w $t0, $a0, 188 @@ -370,8 +368,8 @@ fast_mode_intra_decision: # @fast_mode_intra_decision ldx.d $t1, $t3, $t1 alsl.d $a6, $t0, $a7, 3 ld.d $t3, $a6, -8 - vori.b $vr2, $vr1, 0 - vextrins.d $vr2, $vr3, 0 + vrepli.b $vr1, 0 + vextrins.d $vr1, $vr2, 0 alsl.d $a7, $a3, $t2, 1 alsl.d $t0, $a3, $t1, 1 alsl.d $t1, $a0, $t4, 1 @@ -382,61 +380,61 @@ fast_mode_intra_decision: # @fast_mode_intra_decision # =>This Inner Loop Header: Depth=1 ld.w $t4, $a7, 0 ld.w $t5, $t1, 0 - vinsgr2vr.w $vr3, $t4, 0 + vinsgr2vr.w $vr2, $t4, 0 ldx.d $t4, $a1, $a2 - vinsgr2vr.w $vr4, $t5, 0 - vabsd.hu $vr3, $vr3, $vr4 + vinsgr2vr.w $vr3, $t5, 0 + vabsd.hu $vr2, $vr2, $vr3 slli.d $t5, $a3, 1 ldx.h $t4, $t4, $t5 - vilvl.h $vr3, $vr1, $vr3 + vsllwil.wu.hu $vr2, $vr2, 0 add.d $t6, $a1, $a2 ld.d $t6, $t6, 8 - vinsgr2vr.h $vr4, $t4, 0 + vinsgr2vr.h $vr3, $t4, 0 add.d $t4, $a4, $a2 ldx.d $t7, $a4, $a2 ld.d $t4, $t4, 8 - vilvl.w $vr3, $vr1, $vr3 + 
vsllwil.du.wu $vr2, $vr2, 0 ldx.h $t6, $t6, $t5 alsl.d $t7, $a0, $t7, 1 alsl.d $t4, $a0, $t4, 1 ld.h $t7, $t7, -2 ld.h $t4, $t4, -2 - vadd.d $vr2, $vr2, $vr3 - vinsgr2vr.h $vr4, $t6, 1 - vinsgr2vr.h $vr3, $t7, 0 - vinsgr2vr.h $vr3, $t4, 1 - vabsd.hu $vr3, $vr4, $vr3 + vadd.d $vr1, $vr1, $vr2 + vinsgr2vr.h $vr3, $t6, 1 + vinsgr2vr.h $vr2, $t7, 0 + vinsgr2vr.h $vr2, $t4, 1 + vabsd.hu $vr2, $vr3, $vr2 ld.w $t4, $t0, 0 - vilvl.h $vr3, $vr1, $vr3 - vilvl.w $vr3, $vr1, $vr3 - vadd.d $vr2, $vr2, $vr3 - vinsgr2vr.w $vr3, $t4, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.du.wu $vr2, $vr2, 0 + vadd.d $vr1, $vr1, $vr2 + vinsgr2vr.w $vr2, $t4, 0 ldx.d $t4, $a5, $a2 ld.w $t6, $t2, 0 add.d $t7, $a5, $a2 ld.d $t7, $t7, 8 ldx.h $t4, $t4, $t5 - vinsgr2vr.w $vr4, $t6, 0 - vabsd.hu $vr3, $vr3, $vr4 + vinsgr2vr.w $vr3, $t6, 0 + vabsd.hu $vr2, $vr2, $vr3 ldx.h $t5, $t7, $t5 - vinsgr2vr.h $vr4, $t4, 0 + vinsgr2vr.h $vr3, $t4, 0 add.d $t4, $a6, $a2 ldx.d $t6, $a6, $a2 ld.d $t4, $t4, 8 - vilvl.h $vr3, $vr1, $vr3 - vilvl.w $vr3, $vr1, $vr3 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.du.wu $vr2, $vr2, 0 alsl.d $t6, $a0, $t6, 1 alsl.d $t4, $a0, $t4, 1 ld.h $t6, $t6, -2 ld.h $t4, $t4, -2 - vadd.d $vr2, $vr2, $vr3 - vinsgr2vr.h $vr4, $t5, 1 - vinsgr2vr.h $vr3, $t6, 0 - vinsgr2vr.h $vr3, $t4, 1 - vabsd.hu $vr3, $vr4, $vr3 - vilvl.h $vr3, $vr1, $vr3 - vilvl.w $vr3, $vr1, $vr3 - vadd.d $vr2, $vr2, $vr3 + vadd.d $vr1, $vr1, $vr2 + vinsgr2vr.h $vr3, $t5, 1 + vinsgr2vr.h $vr2, $t6, 0 + vinsgr2vr.h $vr2, $t4, 1 + vabsd.hu $vr2, $vr3, $vr2 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.du.wu $vr2, $vr2, 0 + vadd.d $vr1, $vr1, $vr2 addi.d $a2, $a2, 16 addi.d $a7, $a7, 4 addi.d $t0, $t0, 4 @@ -444,7 +442,7 @@ fast_mode_intra_decision: # @fast_mode_intra_decision addi.d $t2, $t2, 4 bne $a2, $t3, .LBB1_5 # %bb.6: # %middle.block78 - vhaddw.q.d $vr1, $vr2, $vr2 + vhaddw.q.d $vr1, $vr1, $vr1 vpickve2gr.d $a0, $vr1, 0 movgr2fr.d $fa1, $a0 ffint.d.l $fa1, $fa1 diff --git a/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/mv-search.s b/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/mv-search.s index 04a644de..a7ad2c51 100644 --- a/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/mv-search.s +++ b/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/mv-search.s @@ -1370,21 +1370,21 @@ Clear_Motion_Search_Module: # @Clear_Motion_Search_Module .type BPredPartitionCost,@function BPredPartitionCost: # @BPredPartitionCost # %bb.0: - addi.d $sp, $sp, -1296 - st.d $ra, $sp, 1288 # 8-byte Folded Spill - st.d $fp, $sp, 1280 # 8-byte Folded Spill - st.d $s0, $sp, 1272 # 8-byte Folded Spill - st.d $s1, $sp, 1264 # 8-byte Folded Spill - st.d $s2, $sp, 1256 # 8-byte Folded Spill - st.d $s3, $sp, 1248 # 8-byte Folded Spill - st.d $s4, $sp, 1240 # 8-byte Folded Spill - st.d $s5, $sp, 1232 # 8-byte Folded Spill - st.d $s6, $sp, 1224 # 8-byte Folded Spill - st.d $s7, $sp, 1216 # 8-byte Folded Spill - st.d $s8, $sp, 1208 # 8-byte Folded Spill - st.d $a5, $sp, 152 # 8-byte Folded Spill - st.d $a3, $sp, 144 # 8-byte Folded Spill - st.d $a2, $sp, 136 # 8-byte Folded Spill + addi.d $sp, $sp, -1264 + st.d $ra, $sp, 1256 # 8-byte Folded Spill + st.d $fp, $sp, 1248 # 8-byte Folded Spill + st.d $s0, $sp, 1240 # 8-byte Folded Spill + st.d $s1, $sp, 1232 # 8-byte Folded Spill + st.d $s2, $sp, 1224 # 8-byte Folded Spill + st.d $s3, $sp, 1216 # 8-byte Folded Spill + st.d $s4, $sp, 1208 # 8-byte Folded Spill + st.d $s5, $sp, 1200 # 8-byte Folded Spill + st.d $s6, $sp, 1192 # 8-byte Folded Spill + st.d $s7, $sp, 1184 # 
8-byte Folded Spill + st.d $s8, $sp, 1176 # 8-byte Folded Spill + st.d $a5, $sp, 120 # 8-byte Folded Spill + st.d $a3, $sp, 112 # 8-byte Folded Spill + st.d $a2, $sp, 104 # 8-byte Folded Spill move $fp, $a0 pcalau12i $a0, %got_pc_hi20(input) ld.d $s0, $a0, %got_pc_lo12(input) @@ -1401,26 +1401,26 @@ BPredPartitionCost: # @BPredPartitionCost ld.w $t1, $a5, 4 ld.w $a5, $a0, 76 lu12i.w $t0, 1 - st.d $t3, $sp, 72 # 8-byte Folded Spill + st.d $t3, $sp, 64 # 8-byte Folded Spill blez $t1, .LBB3_13 # %bb.1: # %.lr.ph160 slli.d $a0, $t3, 3 ldx.w $a0, $a3, $a0 blez $a0, .LBB3_13 # %bb.2: # %.lr.ph.us.preheader - st.d $a4, $sp, 168 # 8-byte Folded Spill - st.d $a5, $sp, 16 # 8-byte Folded Spill - st.d $a6, $sp, 24 # 8-byte Folded Spill - st.d $s0, $sp, 88 # 8-byte Folded Spill + st.d $a4, $sp, 136 # 8-byte Folded Spill + st.d $a5, $sp, 8 # 8-byte Folded Spill + st.d $a6, $sp, 16 # 8-byte Folded Spill + st.d $s0, $sp, 80 # 8-byte Folded Spill slli.d $a2, $fp, 3 alsl.d $a5, $fp, $a3, 3 ldx.w $a3, $a3, $a2 ld.w $a5, $a5, 4 - ld.d $a4, $sp, 152 # 8-byte Folded Reload + ld.d $a4, $sp, 120 # 8-byte Folded Reload sltui $a6, $a4, 1 pcalau12i $a7, %got_pc_hi20(img) ld.d $a4, $a7, %got_pc_lo12(img) - st.d $a4, $sp, 176 # 8-byte Folded Spill + st.d $a4, $sp, 144 # 8-byte Folded Spill ld.d $a7, $a4, 0 lu12i.w $t0, 3 ori $t2, $t0, 2104 @@ -1438,7 +1438,7 @@ BPredPartitionCost: # @BPredPartitionCost pcalau12i $t0, %pc_hi20(mvbits) ld.d $t0, $t0, %pc_lo12(mvbits) add.w $a4, $s0, $t1 - st.d $a4, $sp, 56 # 8-byte Folded Spill + st.d $a4, $sp, 48 # 8-byte Folded Spill pcalau12i $t1, %pc_hi20(PartitionMotionSearch.bx0) addi.d $t1, $t1, %pc_lo12(PartitionMotionSearch.bx0) alsl.d $a4, $t3, $t1, 4 @@ -1447,11 +1447,11 @@ BPredPartitionCost: # @BPredPartitionCost add.w $t4, $t2, $a0 slli.d $t5, $t2, 3 slli.d $t6, $a3, 3 - ld.d $t1, $sp, 136 # 8-byte Folded Reload + ld.d $t1, $sp, 104 # 8-byte Folded Reload slli.d $t7, $t1, 3 - ld.d $t1, $sp, 144 # 8-byte Folded Reload + ld.d $t1, $sp, 112 # 8-byte Folded Reload slli.d $t8, $t1, 3 - st.d $s0, $sp, 64 # 8-byte Folded Spill + st.d $s0, $sp, 56 # 8-byte Folded Spill .p2align 4, , 16 .LBB3_3: # %.lr.ph.us # =>This Loop Header: Depth=1 @@ -1508,55 +1508,53 @@ BPredPartitionCost: # @BPredPartitionCost # %bb.5: # %._crit_edge.us # in Loop: Header=BB3_3 Depth=1 add.d $s0, $s0, $a5 - ld.d $t1, $sp, 56 # 8-byte Folded Reload + ld.d $t1, $sp, 48 # 8-byte Folded Reload blt $s0, $t1, .LBB3_3 # %bb.6: # %._crit_edge161 ldx.w $a2, $a4, $a1 - ld.d $a1, $sp, 168 # 8-byte Folded Reload + ld.d $a1, $sp, 136 # 8-byte Folded Reload mul.w $a1, $t3, $a1 srai.d $s8, $a1, 16 add.w $a0, $a2, $a0 - st.d $a0, $sp, 128 # 8-byte Folded Spill - st.d $a2, $sp, 40 # 8-byte Folded Spill + st.d $a0, $sp, 96 # 8-byte Folded Spill + st.d $a2, $sp, 32 # 8-byte Folded Spill slli.w $a0, $a2, 2 - st.d $a0, $sp, 32 # 8-byte Folded Spill - addi.d $s0, $sp, 312 + st.d $a0, $sp, 24 # 8-byte Folded Spill + addi.d $s0, $sp, 280 pcalau12i $a0, %pc_hi20(imgY_org) - st.d $a0, $sp, 120 # 8-byte Folded Spill - vrepli.b $vr0, 0 - vst $vr0, $sp, 96 # 16-byte Folded Spill + st.d $a0, $sp, 88 # 8-byte Folded Spill pcalau12i $a0, %pc_hi20(diff64) addi.d $s3, $a0, %pc_lo12(diff64) lu12i.w $a0, 1 ori $a0, $a0, 1004 - st.d $a0, $sp, 80 # 8-byte Folded Spill - ld.d $a1, $sp, 64 # 8-byte Folded Reload + st.d $a0, $sp, 72 # 8-byte Folded Spill + ld.d $a1, $sp, 56 # 8-byte Folded Reload b .LBB3_8 .p2align 4, , 16 .LBB3_7: # %._crit_edge.us180 # in Loop: Header=BB3_8 Depth=1 - ld.d $a1, $sp, 64 # 8-byte Folded Reload + ld.d $a1, $sp, 56 
# 8-byte Folded Reload addi.d $a1, $a1, 1 - ld.d $s0, $sp, 48 # 8-byte Folded Reload + ld.d $s0, $sp, 40 # 8-byte Folded Reload addi.d $s0, $s0, 256 - ld.d $a0, $sp, 56 # 8-byte Folded Reload + ld.d $a0, $sp, 48 # 8-byte Folded Reload bge $a1, $a0, .LBB3_14 .LBB3_8: # %.lr.ph.us179 # =>This Loop Header: Depth=1 # Child Loop BB3_11 Depth 2 - ld.d $a0, $sp, 176 # 8-byte Folded Reload + ld.d $a0, $sp, 144 # 8-byte Folded Reload ld.d $a0, $a0, 0 ld.w $a0, $a0, 196 slli.w $s4, $a1, 2 - st.d $a1, $sp, 64 # 8-byte Folded Spill + st.d $a1, $sp, 56 # 8-byte Folded Spill alsl.w $s2, $a1, $a0, 2 slli.d $a0, $s4, 5 - st.d $a0, $sp, 168 # 8-byte Folded Spill + st.d $a0, $sp, 136 # 8-byte Folded Spill slli.d $a0, $s2, 3 - st.d $a0, $sp, 160 # 8-byte Folded Spill - st.d $s0, $sp, 48 # 8-byte Folded Spill - ld.d $s5, $sp, 32 # 8-byte Folded Reload - ld.d $s7, $sp, 40 # 8-byte Folded Reload + st.d $a0, $sp, 128 # 8-byte Folded Spill + st.d $s0, $sp, 40 # 8-byte Folded Spill + ld.d $s5, $sp, 24 # 8-byte Folded Reload + ld.d $s7, $sp, 32 # 8-byte Folded Reload b .LBB3_11 .p2align 4, , 16 .LBB3_9: # in Loop: Header=BB3_11 Depth=2 @@ -1568,12 +1566,12 @@ BPredPartitionCost: # @BPredPartitionCost addi.d $s7, $s7, 1 addi.w $s5, $s5, 4 addi.d $s0, $s0, 16 - ld.d $a0, $sp, 128 # 8-byte Folded Reload + ld.d $a0, $sp, 96 # 8-byte Folded Reload bge $s7, $a0, .LBB3_7 .LBB3_11: # %.preheader154.us # Parent Loop BB3_8 Depth=1 # => This Inner Loop Header: Depth=2 - ld.d $s6, $sp, 176 # 8-byte Folded Reload + ld.d $s6, $sp, 144 # 8-byte Folded Reload ld.d $a0, $s6, 0 ld.w $a0, $a0, 192 add.w $s1, $s5, $a0 @@ -1581,17 +1579,17 @@ BPredPartitionCost: # @BPredPartitionCost move $a1, $s4 move $a2, $fp move $a3, $fp - ld.d $a4, $sp, 136 # 8-byte Folded Reload - ld.d $a5, $sp, 144 # 8-byte Folded Reload - ld.d $a6, $sp, 152 # 8-byte Folded Reload + ld.d $a4, $sp, 104 # 8-byte Folded Reload + ld.d $a5, $sp, 112 # 8-byte Folded Reload + ld.d $a6, $sp, 120 # 8-byte Folded Reload pcaddu18i $ra, %call36(LumaPrediction4x4Bi) jirl $ra, $ra, 0 - ld.d $a0, $sp, 120 # 8-byte Folded Reload + ld.d $a0, $sp, 88 # 8-byte Folded Reload ld.d $a0, $a0, %pc_lo12(imgY_org) ld.d $a1, $s6, 0 - ld.d $a2, $sp, 160 # 8-byte Folded Reload + ld.d $a2, $sp, 128 # 8-byte Folded Reload ldx.d $a2, $a0, $a2 - ld.d $a3, $sp, 168 # 8-byte Folded Reload + ld.d $a3, $sp, 136 # 8-byte Folded Reload add.d $a1, $a1, $a3 slli.d $a3, $s1, 1 ldx.d $a2, $a2, $a3 @@ -1600,10 +1598,9 @@ BPredPartitionCost: # @BPredPartitionCost ldptr.d $a4, $a1, 12624 vinsgr2vr.d $vr0, $a2, 0 ld.d $a2, $a0, 8 - vld $vr2, $sp, 96 # 16-byte Folded Reload - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a4, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 ldx.d $a2, $a2, $a3 vsub.w $vr0, $vr0, $vr1 vst $vr0, $s0, -128 @@ -1612,27 +1609,27 @@ BPredPartitionCost: # @BPredPartitionCost ld.d $a2, $a0, 16 vst $vr0, $s3, 0 vinsgr2vr.d $vr0, $a4, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 ldx.d $a2, $a2, $a3 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vsub.w $vr0, $vr1, $vr0 vst $vr0, $s0, -64 vinsgr2vr.d $vr1, $a2, 0 ldptr.d $a2, $a1, 12688 vst $vr0, $s3, 16 - vilvl.h $vr0, $vr2, $vr1 + vsllwil.wu.hu $vr0, $vr1, 0 ld.d $a0, $a0, 24 vinsgr2vr.d $vr1, $a2, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 ldx.d $a0, $a0, $a3 vst $vr0, $s0, 0 ldptr.d $a1, $a1, 12720 vst $vr0, $s3, 32 vinsgr2vr.d $vr0, $a0, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a1, 0 - vilvl.h 
$vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 vst $vr0, $s0, 64 vst $vr0, $s3, 48 @@ -1640,9 +1637,9 @@ BPredPartitionCost: # @BPredPartitionCost blt $a0, $fp, .LBB3_9 # %bb.12: # %.preheader154.us # in Loop: Header=BB3_11 Depth=2 - ld.d $a0, $sp, 88 # 8-byte Folded Reload + ld.d $a0, $sp, 80 # 8-byte Folded Reload ld.d $a0, $a0, 0 - ld.d $a1, $sp, 80 # 8-byte Folded Reload + ld.d $a1, $sp, 72 # 8-byte Folded Reload ldx.w $a0, $a0, $a1 bnez $a0, .LBB3_10 b .LBB3_9 @@ -1651,11 +1648,11 @@ BPredPartitionCost: # @BPredPartitionCost bge $a7, $fp, .LBB3_15 b .LBB3_23 .LBB3_14: # %._crit_edge177.loopexit - ld.d $s0, $sp, 88 # 8-byte Folded Reload + ld.d $s0, $sp, 80 # 8-byte Folded Reload ld.d $a2, $s0, 0 - ld.d $t3, $sp, 72 # 8-byte Folded Reload - ld.d $a6, $sp, 24 # 8-byte Folded Reload - ld.d $a5, $sp, 16 # 8-byte Folded Reload + ld.d $t3, $sp, 64 # 8-byte Folded Reload + ld.d $a6, $sp, 16 # 8-byte Folded Reload + ld.d $a5, $sp, 8 # 8-byte Folded Reload ori $a7, $zero, 4 lu12i.w $t0, 1 blt $a7, $fp, .LBB3_23 @@ -1680,10 +1677,10 @@ BPredPartitionCost: # @BPredPartitionCost masknez $a1, $a1, $a0 maskeqz $a0, $a6, $a0 or $s1, $a0, $a1 - addi.d $s2, $sp, 440 + addi.d $s2, $sp, 408 move $s3, $a3 slli.d $a0, $a3, 6 - st.d $a0, $sp, 176 # 8-byte Folded Spill + st.d $a0, $sp, 144 # 8-byte Folded Spill slli.d $s4, $s1, 2 pcalau12i $a0, %pc_hi20(diff64) addi.d $fp, $a0, %pc_lo12(diff64) @@ -1695,7 +1692,7 @@ BPredPartitionCost: # @BPredPartitionCost alsl.d $a0, $t3, $a2, 3 ld.w $a0, $a0, 76 add.d $s5, $s5, $s3 - ld.d $a1, $sp, 176 # 8-byte Folded Reload + ld.d $a1, $sp, 144 # 8-byte Folded Reload add.d $s2, $s2, $a1 bge $s5, $a0, .LBB3_23 .LBB3_20: # %.preheader152 @@ -1747,7 +1744,7 @@ BPredPartitionCost: # @BPredPartitionCost move $a0, $fp pcaddu18i $ra, %call36(distortion8x8) jirl $ra, $ra, 0 - ld.d $t3, $sp, 72 # 8-byte Folded Reload + ld.d $t3, $sp, 64 # 8-byte Folded Reload ld.d $a2, $s0, 0 alsl.d $a1, $t3, $a2, 3 ld.w $a1, $a1, 72 @@ -1758,18 +1755,18 @@ BPredPartitionCost: # @BPredPartitionCost b .LBB3_19 .LBB3_23: # %.loopexit addi.w $a0, $s8, 0 - ld.d $s8, $sp, 1208 # 8-byte Folded Reload - ld.d $s7, $sp, 1216 # 8-byte Folded Reload - ld.d $s6, $sp, 1224 # 8-byte Folded Reload - ld.d $s5, $sp, 1232 # 8-byte Folded Reload - ld.d $s4, $sp, 1240 # 8-byte Folded Reload - ld.d $s3, $sp, 1248 # 8-byte Folded Reload - ld.d $s2, $sp, 1256 # 8-byte Folded Reload - ld.d $s1, $sp, 1264 # 8-byte Folded Reload - ld.d $s0, $sp, 1272 # 8-byte Folded Reload - ld.d $fp, $sp, 1280 # 8-byte Folded Reload - ld.d $ra, $sp, 1288 # 8-byte Folded Reload - addi.d $sp, $sp, 1296 + ld.d $s8, $sp, 1176 # 8-byte Folded Reload + ld.d $s7, $sp, 1184 # 8-byte Folded Reload + ld.d $s6, $sp, 1192 # 8-byte Folded Reload + ld.d $s5, $sp, 1200 # 8-byte Folded Reload + ld.d $s4, $sp, 1208 # 8-byte Folded Reload + ld.d $s3, $sp, 1216 # 8-byte Folded Reload + ld.d $s2, $sp, 1224 # 8-byte Folded Reload + ld.d $s1, $sp, 1232 # 8-byte Folded Reload + ld.d $s0, $sp, 1240 # 8-byte Folded Reload + ld.d $fp, $sp, 1248 # 8-byte Folded Reload + ld.d $ra, $sp, 1256 # 8-byte Folded Reload + addi.d $sp, $sp, 1264 ret .Lfunc_end3: .size BPredPartitionCost, .Lfunc_end3-BPredPartitionCost @@ -3754,32 +3751,30 @@ FindSkipModeMotionVector: # @FindSkipModeMotionVector .type GetSkipCostMB,@function GetSkipCostMB: # @GetSkipCostMB # %bb.0: - addi.d $sp, $sp, -464 - st.d $ra, $sp, 456 # 8-byte Folded Spill - st.d $fp, $sp, 448 # 8-byte Folded Spill - st.d $s0, $sp, 440 # 8-byte Folded Spill - st.d $s1, $sp, 432 # 8-byte 
Folded Spill - st.d $s2, $sp, 424 # 8-byte Folded Spill - st.d $s3, $sp, 416 # 8-byte Folded Spill - st.d $s4, $sp, 408 # 8-byte Folded Spill - st.d $s5, $sp, 400 # 8-byte Folded Spill - st.d $s6, $sp, 392 # 8-byte Folded Spill - st.d $s7, $sp, 384 # 8-byte Folded Spill - st.d $s8, $sp, 376 # 8-byte Folded Spill + addi.d $sp, $sp, -448 + st.d $ra, $sp, 440 # 8-byte Folded Spill + st.d $fp, $sp, 432 # 8-byte Folded Spill + st.d $s0, $sp, 424 # 8-byte Folded Spill + st.d $s1, $sp, 416 # 8-byte Folded Spill + st.d $s2, $sp, 408 # 8-byte Folded Spill + st.d $s3, $sp, 400 # 8-byte Folded Spill + st.d $s4, $sp, 392 # 8-byte Folded Spill + st.d $s5, $sp, 384 # 8-byte Folded Spill + st.d $s6, $sp, 376 # 8-byte Folded Spill + st.d $s7, $sp, 368 # 8-byte Folded Spill + st.d $s8, $sp, 360 # 8-byte Folded Spill pcalau12i $a0, %got_pc_hi20(img) ld.d $s2, $a0, %got_pc_lo12(img) lu12i.w $a0, 3 ori $a0, $a0, 432 st.d $a0, $sp, 48 # 8-byte Folded Spill pcalau12i $a0, %pc_hi20(imgY_org) - st.d $a0, $sp, 96 # 8-byte Folded Spill - vrepli.b $vr0, 0 - vst $vr0, $sp, 80 # 16-byte Folded Spill + st.d $a0, $sp, 80 # 8-byte Folded Spill pcalau12i $a0, %pc_hi20(diff) addi.d $s5, $a0, %pc_lo12(diff) pcalau12i $a0, %got_pc_hi20(input) ld.d $a0, $a0, %got_pc_lo12(input) - st.d $a0, $sp, 112 # 8-byte Folded Spill + st.d $a0, $sp, 96 # 8-byte Folded Spill move $s1, $zero move $a2, $zero move $a3, $zero @@ -3787,7 +3782,7 @@ GetSkipCostMB: # @GetSkipCostMB b .LBB6_4 .p2align 4, , 16 .LBB6_1: # in Loop: Header=BB6_4 Depth=1 - ld.d $a0, $sp, 112 # 8-byte Folded Reload + ld.d $a0, $sp, 96 # 8-byte Folded Reload ld.d $a0, $a0, 0 ldptr.w $a1, $a0, 4168 beqz $a1, .LBB6_14 @@ -3820,7 +3815,7 @@ GetSkipCostMB: # @GetSkipCostMB slli.d $a3, $a3, 3 andi $a3, $a3, 8 addi.d $a3, $a3, 8 - st.d $a3, $sp, 104 # 8-byte Folded Spill + st.d $a3, $sp, 88 # 8-byte Folded Spill alsl.d $a2, $a2, $a1, 3 st.d $a4, $sp, 24 # 8-byte Folded Spill move $a1, $a4 @@ -3832,7 +3827,7 @@ GetSkipCostMB: # @GetSkipCostMB add.d $s1, $a1, $a0 st.d $a2, $sp, 64 # 8-byte Folded Spill slli.d $s4, $a2, 3 - addi.d $s6, $sp, 184 + addi.d $s6, $sp, 168 move $s3, $s0 move $s7, $s0 b .LBB6_7 @@ -3848,7 +3843,7 @@ GetSkipCostMB: # @GetSkipCostMB addi.d $s1, $s1, 8 addi.d $s3, $s3, 4 addi.d $s6, $s6, 16 - ld.d $a0, $sp, 104 # 8-byte Folded Reload + ld.d $a0, $sp, 88 # 8-byte Folded Reload bgeu $s7, $a0, .LBB6_9 .LBB6_7: # %.preheader # Parent Loop BB6_4 Depth=1 @@ -3865,7 +3860,7 @@ GetSkipCostMB: # @GetSkipCostMB move $a6, $zero pcaddu18i $ra, %call36(LumaPrediction4x4) jirl $ra, $ra, 0 - ld.d $a0, $sp, 96 # 8-byte Folded Reload + ld.d $a0, $sp, 80 # 8-byte Folded Reload ld.d $a1, $a0, %pc_lo12(imgY_org) ldx.d $a2, $a1, $s4 ld.d $a0, $s2, 0 @@ -3877,11 +3872,10 @@ GetSkipCostMB: # @GetSkipCostMB ld.d $a5, $a4, -96 ld.d $a6, $a1, 8 vinsgr2vr.d $vr0, $a2, 0 - vld $vr2, $sp, 80 # 16-byte Folded Reload - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a5, 0 ldx.d $a2, $a6, $a3 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 ld.d $a5, $a4, -64 vinsgr2vr.d $vr1, $a2, 0 @@ -3889,30 +3883,30 @@ GetSkipCostMB: # @GetSkipCostMB vst $vr0, $s5, 0 vinsgr2vr.d $vr0, $a5, 0 ld.d $a2, $a1, 16 - vilvl.h $vr1, $vr2, $vr1 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr1, $vr1, 0 + vsllwil.wu.hu $vr0, $vr0, 0 vsub.w $vr0, $vr1, $vr0 ldx.d $a2, $a2, $a3 vst $vr0, $s6, -32 ld.d $a4, $a4, -32 vst $vr0, $s5, 16 vinsgr2vr.d $vr0, $a2, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a4, 0 ld.d $a1, $a1, 24 
- vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 vst $vr0, $s6, 0 ldx.d $a1, $a1, $a3 vst $vr0, $s5, 32 ldx.d $a2, $a0, $s1 - ld.d $a0, $sp, 112 # 8-byte Folded Reload + ld.d $a0, $sp, 96 # 8-byte Folded Reload ld.d $a0, $a0, 0 vinsgr2vr.d $vr0, $a1, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a2, 0 ldptr.w $a1, $a0, 4168 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 vst $vr0, $s6, 32 vst $vr0, $s5, 48 @@ -3934,7 +3928,7 @@ GetSkipCostMB: # @GetSkipCostMB sub.d $a1, $a1, $a2 ld.d $a2, $sp, 24 # 8-byte Folded Reload sub.d $a1, $a1, $a2 - addi.d $a2, $sp, 184 + addi.d $a2, $sp, 168 add.d $s1, $a2, $a1 ld.d $a1, $sp, 8 # 8-byte Folded Reload or $a0, $a0, $a1 @@ -3954,7 +3948,7 @@ GetSkipCostMB: # @GetSkipCostMB addi.d $s6, $s6, 4 addi.d $s1, $s1, 16 addi.d $s4, $s4, 8 - ld.d $a0, $sp, 104 # 8-byte Folded Reload + ld.d $a0, $sp, 88 # 8-byte Folded Reload bgeu $s0, $a0, .LBB6_1 .LBB6_12: # %.preheader.1 # Parent Loop BB6_4 Depth=1 @@ -3971,7 +3965,7 @@ GetSkipCostMB: # @GetSkipCostMB move $a6, $zero pcaddu18i $ra, %call36(LumaPrediction4x4) jirl $ra, $ra, 0 - ld.d $a0, $sp, 96 # 8-byte Folded Reload + ld.d $a0, $sp, 80 # 8-byte Folded Reload ld.d $a1, $a0, %pc_lo12(imgY_org) slli.d $a0, $s8, 3 ldx.d $a2, $a1, $a0 @@ -3983,11 +3977,10 @@ GetSkipCostMB: # @GetSkipCostMB ld.d $a5, $a4, -96 ld.d $a6, $a1, 8 vinsgr2vr.d $vr0, $a2, 0 - vld $vr2, $sp, 80 # 16-byte Folded Reload - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a5, 0 ldx.d $a2, $a6, $a3 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 ld.d $a5, $a4, -64 vinsgr2vr.d $vr1, $a2, 0 @@ -3995,30 +3988,30 @@ GetSkipCostMB: # @GetSkipCostMB vst $vr0, $s5, 0 vinsgr2vr.d $vr0, $a5, 0 ld.d $a2, $a1, 16 - vilvl.h $vr1, $vr2, $vr1 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr1, $vr1, 0 + vsllwil.wu.hu $vr0, $vr0, 0 vsub.w $vr0, $vr1, $vr0 ldx.d $a2, $a2, $a3 vst $vr0, $s1, -32 ld.d $a4, $a4, -32 vst $vr0, $s5, 16 vinsgr2vr.d $vr0, $a2, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a4, 0 ld.d $a1, $a1, 24 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 vst $vr0, $s1, 0 ldx.d $a1, $a1, $a3 vst $vr0, $s5, 32 ldx.d $a2, $a0, $s4 - ld.d $a0, $sp, 112 # 8-byte Folded Reload + ld.d $a0, $sp, 96 # 8-byte Folded Reload ld.d $a0, $a0, 0 vinsgr2vr.d $vr0, $a1, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a2, 0 ldptr.w $a1, $a0, 4168 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 vst $vr0, $s1, 32 vst $vr0, $s5, 48 @@ -4036,7 +4029,7 @@ GetSkipCostMB: # @GetSkipCostMB # in Loop: Header=BB6_4 Depth=1 pcalau12i $a0, %pc_hi20(diff64) addi.d $s0, $a0, %pc_lo12(diff64) - addi.d $a1, $sp, 120 + addi.d $a1, $sp, 104 ori $a2, $zero, 256 move $a0, $s0 pcaddu18i $ra, %call36(memcpy) @@ -4048,18 +4041,18 @@ GetSkipCostMB: # @GetSkipCostMB b .LBB6_3 .LBB6_16: move $a0, $fp - ld.d $s8, $sp, 376 # 8-byte Folded Reload - ld.d $s7, $sp, 384 # 8-byte Folded Reload - ld.d $s6, $sp, 392 # 8-byte Folded Reload - ld.d $s5, $sp, 400 # 8-byte Folded Reload - ld.d $s4, $sp, 408 # 8-byte Folded Reload - ld.d $s3, $sp, 416 # 8-byte Folded Reload - ld.d $s2, $sp, 424 # 8-byte Folded Reload - ld.d $s1, $sp, 432 # 8-byte Folded Reload - ld.d $s0, $sp, 440 # 8-byte Folded Reload - ld.d $fp, $sp, 448 # 8-byte Folded Reload - ld.d $ra, $sp, 456 # 8-byte Folded Reload - addi.d $sp, $sp, 464 + ld.d $s8, 
$sp, 360 # 8-byte Folded Reload + ld.d $s7, $sp, 368 # 8-byte Folded Reload + ld.d $s6, $sp, 376 # 8-byte Folded Reload + ld.d $s5, $sp, 384 # 8-byte Folded Reload + ld.d $s4, $sp, 392 # 8-byte Folded Reload + ld.d $s3, $sp, 400 # 8-byte Folded Reload + ld.d $s2, $sp, 408 # 8-byte Folded Reload + ld.d $s1, $sp, 416 # 8-byte Folded Reload + ld.d $s0, $sp, 424 # 8-byte Folded Reload + ld.d $fp, $sp, 432 # 8-byte Folded Reload + ld.d $ra, $sp, 440 # 8-byte Folded Reload + addi.d $sp, $sp, 448 ret .Lfunc_end6: .size GetSkipCostMB, .Lfunc_end6-GetSkipCostMB @@ -4069,20 +4062,20 @@ GetSkipCostMB: # @GetSkipCostMB .type BIDPartitionCost,@function BIDPartitionCost: # @BIDPartitionCost # %bb.0: - addi.d $sp, $sp, -1280 - st.d $ra, $sp, 1272 # 8-byte Folded Spill - st.d $fp, $sp, 1264 # 8-byte Folded Spill - st.d $s0, $sp, 1256 # 8-byte Folded Spill - st.d $s1, $sp, 1248 # 8-byte Folded Spill - st.d $s2, $sp, 1240 # 8-byte Folded Spill - st.d $s3, $sp, 1232 # 8-byte Folded Spill - st.d $s4, $sp, 1224 # 8-byte Folded Spill - st.d $s5, $sp, 1216 # 8-byte Folded Spill - st.d $s6, $sp, 1208 # 8-byte Folded Spill - st.d $s7, $sp, 1200 # 8-byte Folded Spill - st.d $s8, $sp, 1192 # 8-byte Folded Spill - st.d $a3, $sp, 128 # 8-byte Folded Spill - st.d $a2, $sp, 120 # 8-byte Folded Spill + addi.d $sp, $sp, -1264 + st.d $ra, $sp, 1256 # 8-byte Folded Spill + st.d $fp, $sp, 1248 # 8-byte Folded Spill + st.d $s0, $sp, 1240 # 8-byte Folded Spill + st.d $s1, $sp, 1232 # 8-byte Folded Spill + st.d $s2, $sp, 1224 # 8-byte Folded Spill + st.d $s3, $sp, 1216 # 8-byte Folded Spill + st.d $s4, $sp, 1208 # 8-byte Folded Spill + st.d $s5, $sp, 1200 # 8-byte Folded Spill + st.d $s6, $sp, 1192 # 8-byte Folded Spill + st.d $s7, $sp, 1184 # 8-byte Folded Spill + st.d $s8, $sp, 1176 # 8-byte Folded Spill + st.d $a3, $sp, 112 # 8-byte Folded Spill + st.d $a2, $sp, 104 # 8-byte Folded Spill move $fp, $a0 pcalau12i $a0, %got_pc_hi20(input) ld.d $s0, $a0, %got_pc_lo12(input) @@ -4124,7 +4117,7 @@ BIDPartitionCost: # @BIDPartitionCost ldx.w $t4, $a5, $a1 pcalau12i $a1, %got_pc_hi20(img) ld.d $a1, $a1, %got_pc_lo12(img) - st.d $a1, $sp, 160 # 8-byte Folded Spill + st.d $a1, $sp, 144 # 8-byte Folded Spill ld.d $a6, $a1, 0 move $a5, $zero ldptr.d $a1, $a6, 14384 @@ -4136,9 +4129,9 @@ BIDPartitionCost: # @BIDPartitionCost add.w $s7, $ra, $t0 slli.d $t0, $ra, 3 slli.d $t1, $a2, 3 - ld.d $t2, $sp, 120 # 8-byte Folded Reload + ld.d $t2, $sp, 104 # 8-byte Folded Reload slli.d $t2, $t2, 3 - ld.d $t3, $sp, 128 # 8-byte Folded Reload + ld.d $t3, $sp, 112 # 8-byte Folded Reload slli.d $t3, $t3, 3 st.d $t4, $sp, 64 # 8-byte Folded Spill .p2align 4, , 16 @@ -4204,11 +4197,9 @@ BIDPartitionCost: # @BIDPartitionCost srai.d $s8, $a0, 16 slli.w $a0, $ra, 2 st.d $a0, $sp, 32 # 8-byte Folded Spill - addi.d $s0, $sp, 296 + addi.d $s0, $sp, 280 pcalau12i $a0, %pc_hi20(imgY_org) - st.d $a0, $sp, 112 # 8-byte Folded Spill - vrepli.b $vr0, 0 - vst $vr0, $sp, 96 # 16-byte Folded Spill + st.d $a0, $sp, 96 # 8-byte Folded Spill pcalau12i $a0, %pc_hi20(diff64) addi.d $s2, $a0, %pc_lo12(diff64) ori $a0, $s6, 1004 @@ -4229,20 +4220,20 @@ BIDPartitionCost: # @BIDPartitionCost .LBB7_8: # %.lr.ph.us # =>This Loop Header: Depth=1 # Child Loop BB7_11 Depth 2 - ld.d $a0, $sp, 160 # 8-byte Folded Reload + ld.d $a0, $sp, 144 # 8-byte Folded Reload ld.d $a0, $a0, 0 ld.w $a0, $a0, 196 slli.w $s3, $a1, 2 st.d $a1, $sp, 64 # 8-byte Folded Spill alsl.w $a0, $a1, $a0, 2 slli.d $a1, $s3, 5 - st.d $a1, $sp, 144 # 8-byte Folded Spill - st.d $a0, $sp, 152 # 8-byte Folded Spill - 
slli.d $a0, $a0, 3 + st.d $a1, $sp, 128 # 8-byte Folded Spill st.d $a0, $sp, 136 # 8-byte Folded Spill + slli.d $a0, $a0, 3 + st.d $a0, $sp, 120 # 8-byte Folded Spill st.d $s0, $sp, 48 # 8-byte Folded Spill ld.d $s4, $sp, 32 # 8-byte Folded Reload - move $s6, $ra + move $s5, $ra b .LBB7_11 .p2align 4, , 16 .LBB7_9: # in Loop: Header=BB7_11 Depth=2 @@ -4251,45 +4242,44 @@ BIDPartitionCost: # @BIDPartitionCost jirl $ra, $ra, 0 add.d $s8, $a0, $s8 .LBB7_10: # in Loop: Header=BB7_11 Depth=2 - addi.d $s6, $s6, 1 + addi.d $s5, $s5, 1 addi.w $s4, $s4, 4 addi.d $s0, $s0, 16 - bge $s6, $s7, .LBB7_7 + bge $s5, $s7, .LBB7_7 .LBB7_11: # %.preheader149.us # Parent Loop BB7_8 Depth=1 # => This Inner Loop Header: Depth=2 - ld.d $s1, $sp, 160 # 8-byte Folded Reload + ld.d $s1, $sp, 144 # 8-byte Folded Reload ld.d $a0, $s1, 0 ld.w $a0, $a0, 192 - add.w $s5, $s4, $a0 + add.w $s6, $s4, $a0 ori $a2, $zero, 2 move $a0, $s4 move $a1, $s3 move $a3, $fp move $a4, $fp - ld.d $a5, $sp, 120 # 8-byte Folded Reload - ld.d $a6, $sp, 128 # 8-byte Folded Reload + ld.d $a5, $sp, 104 # 8-byte Folded Reload + ld.d $a6, $sp, 112 # 8-byte Folded Reload pcaddu18i $ra, %call36(LumaPrediction4x4) jirl $ra, $ra, 0 - ld.d $a0, $sp, 112 # 8-byte Folded Reload + ld.d $a0, $sp, 96 # 8-byte Folded Reload ld.d $a0, $a0, %pc_lo12(imgY_org) ld.d $a1, $s1, 0 - ld.d $a2, $sp, 136 # 8-byte Folded Reload + ld.d $a2, $sp, 120 # 8-byte Folded Reload ldx.d $a2, $a0, $a2 - ld.d $a3, $sp, 144 # 8-byte Folded Reload + ld.d $a3, $sp, 128 # 8-byte Folded Reload add.d $a1, $a1, $a3 - slli.d $a3, $s5, 1 + slli.d $a3, $s6, 1 ldx.d $a2, $a2, $a3 - ld.d $a4, $sp, 152 # 8-byte Folded Reload + ld.d $a4, $sp, 136 # 8-byte Folded Reload alsl.d $a0, $a4, $a0, 3 alsl.d $a1, $s4, $a1, 1 ldptr.d $a4, $a1, 12624 vinsgr2vr.d $vr0, $a2, 0 ld.d $a2, $a0, 8 - vld $vr2, $sp, 96 # 16-byte Folded Reload - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a4, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 ldx.d $a2, $a2, $a3 vsub.w $vr0, $vr0, $vr1 vst $vr0, $s0, -128 @@ -4298,27 +4288,27 @@ BIDPartitionCost: # @BIDPartitionCost ld.d $a2, $a0, 16 vst $vr0, $s2, 0 vinsgr2vr.d $vr0, $a4, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 ldx.d $a2, $a2, $a3 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vsub.w $vr0, $vr1, $vr0 vst $vr0, $s0, -64 vinsgr2vr.d $vr1, $a2, 0 ldptr.d $a2, $a1, 12688 vst $vr0, $s2, 16 - vilvl.h $vr0, $vr2, $vr1 + vsllwil.wu.hu $vr0, $vr1, 0 ld.d $a0, $a0, 24 vinsgr2vr.d $vr1, $a2, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 ldx.d $a0, $a0, $a3 vst $vr0, $s0, 0 ldptr.d $a1, $a1, 12720 vst $vr0, $s2, 32 vinsgr2vr.d $vr0, $a0, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a1, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 vst $vr0, $s0, 64 vst $vr0, $s2, 48 @@ -4366,10 +4356,10 @@ BIDPartitionCost: # @BIDPartitionCost masknez $a1, $a1, $a0 maskeqz $a0, $t2, $a0 or $s1, $a0, $a1 - addi.d $s2, $sp, 424 + addi.d $s2, $sp, 408 move $s3, $a3 slli.d $a0, $a3, 6 - st.d $a0, $sp, 160 # 8-byte Folded Spill + st.d $a0, $sp, 144 # 8-byte Folded Spill slli.d $s4, $s1, 2 pcalau12i $a0, %pc_hi20(diff64) addi.d $fp, $a0, %pc_lo12(diff64) @@ -4381,7 +4371,7 @@ BIDPartitionCost: # @BIDPartitionCost alsl.d $a0, $t1, $a2, 3 ld.w $a0, $a0, 76 add.d $s5, $s5, $s3 - ld.d $a1, $sp, 160 # 8-byte Folded Reload + ld.d $a1, $sp, 144 # 8-byte Folded Reload add.d $s2, $s2, $a1 bge $s5, $a0, .LBB7_23 .LBB7_20: # 
%.preheader147 @@ -4444,18 +4434,18 @@ BIDPartitionCost: # @BIDPartitionCost b .LBB7_19 .LBB7_23: # %.loopexit addi.w $a0, $s8, 0 - ld.d $s8, $sp, 1192 # 8-byte Folded Reload - ld.d $s7, $sp, 1200 # 8-byte Folded Reload - ld.d $s6, $sp, 1208 # 8-byte Folded Reload - ld.d $s5, $sp, 1216 # 8-byte Folded Reload - ld.d $s4, $sp, 1224 # 8-byte Folded Reload - ld.d $s3, $sp, 1232 # 8-byte Folded Reload - ld.d $s2, $sp, 1240 # 8-byte Folded Reload - ld.d $s1, $sp, 1248 # 8-byte Folded Reload - ld.d $s0, $sp, 1256 # 8-byte Folded Reload - ld.d $fp, $sp, 1264 # 8-byte Folded Reload - ld.d $ra, $sp, 1272 # 8-byte Folded Reload - addi.d $sp, $sp, 1280 + ld.d $s8, $sp, 1176 # 8-byte Folded Reload + ld.d $s7, $sp, 1184 # 8-byte Folded Reload + ld.d $s6, $sp, 1192 # 8-byte Folded Reload + ld.d $s5, $sp, 1200 # 8-byte Folded Reload + ld.d $s4, $sp, 1208 # 8-byte Folded Reload + ld.d $s3, $sp, 1216 # 8-byte Folded Reload + ld.d $s2, $sp, 1224 # 8-byte Folded Reload + ld.d $s1, $sp, 1232 # 8-byte Folded Reload + ld.d $s0, $sp, 1240 # 8-byte Folded Reload + ld.d $fp, $sp, 1248 # 8-byte Folded Reload + ld.d $ra, $sp, 1256 # 8-byte Folded Reload + addi.d $sp, $sp, 1264 ret .Lfunc_end7: .size BIDPartitionCost, .Lfunc_end7-BIDPartitionCost @@ -4465,20 +4455,20 @@ BIDPartitionCost: # @BIDPartitionCost .type GetDirectCost8x8,@function GetDirectCost8x8: # @GetDirectCost8x8 # %bb.0: - addi.d $sp, $sp, -480 - st.d $ra, $sp, 472 # 8-byte Folded Spill - st.d $fp, $sp, 464 # 8-byte Folded Spill - st.d $s0, $sp, 456 # 8-byte Folded Spill - st.d $s1, $sp, 448 # 8-byte Folded Spill - st.d $s2, $sp, 440 # 8-byte Folded Spill - st.d $s3, $sp, 432 # 8-byte Folded Spill - st.d $s4, $sp, 424 # 8-byte Folded Spill - st.d $s5, $sp, 416 # 8-byte Folded Spill - st.d $s6, $sp, 408 # 8-byte Folded Spill - st.d $s7, $sp, 400 # 8-byte Folded Spill - st.d $s8, $sp, 392 # 8-byte Folded Spill - st.d $a1, $sp, 16 # 8-byte Folded Spill - addi.d $a7, $sp, 248 + addi.d $sp, $sp, -464 + st.d $ra, $sp, 456 # 8-byte Folded Spill + st.d $fp, $sp, 448 # 8-byte Folded Spill + st.d $s0, $sp, 440 # 8-byte Folded Spill + st.d $s1, $sp, 432 # 8-byte Folded Spill + st.d $s2, $sp, 424 # 8-byte Folded Spill + st.d $s3, $sp, 416 # 8-byte Folded Spill + st.d $s4, $sp, 408 # 8-byte Folded Spill + st.d $s5, $sp, 400 # 8-byte Folded Spill + st.d $s6, $sp, 392 # 8-byte Folded Spill + st.d $s7, $sp, 384 # 8-byte Folded Spill + st.d $s8, $sp, 376 # 8-byte Folded Spill + st.d $a1, $sp, 24 # 8-byte Folded Spill + addi.d $a7, $sp, 232 bstrpick.d $a1, $a0, 31, 31 add.d $a1, $a0, $a1 addi.w $a2, $a1, 0 @@ -4490,38 +4480,36 @@ GetDirectCost8x8: # @GetDirectCost8x8 slli.w $t0, $a0, 3 addi.w $a1, $t1, 0 addi.d $a0, $t0, 4 - st.d $a0, $sp, 64 # 8-byte Folded Spill + st.d $a0, $sp, 72 # 8-byte Folded Spill slli.d $a0, $a1, 5 alsl.d $fp, $t0, $a0, 1 pcalau12i $a0, %got_pc_hi20(img) ld.d $a0, $a0, %got_pc_lo12(img) - st.d $a0, $sp, 128 # 8-byte Folded Spill + st.d $a0, $sp, 112 # 8-byte Folded Spill pcalau12i $a0, %pc_hi20(direct_pdir) - st.d $a0, $sp, 120 # 8-byte Folded Spill + st.d $a0, $sp, 104 # 8-byte Folded Spill lu12i.w $a0, 524287 ori $a0, $a0, 4095 - st.d $a0, $sp, 8 # 8-byte Folded Spill + st.d $a0, $sp, 16 # 8-byte Folded Spill pcalau12i $a0, %pc_hi20(direct_ref_idx) - st.d $a0, $sp, 56 # 8-byte Folded Spill + st.d $a0, $sp, 64 # 8-byte Folded Spill pcalau12i $t2, %pc_hi20(imgY_org) - vrepli.b $vr0, 0 - vst $vr0, $sp, 96 # 16-byte Folded Spill pcalau12i $a0, %pc_hi20(diff) addi.d $s2, $a0, %pc_lo12(diff) move $s1, $zero move $s5, $zero - st.d $a1, $sp, 40 # 
8-byte Folded Spill + st.d $a1, $sp, 48 # 8-byte Folded Spill move $a0, $a1 - st.d $t0, $sp, 48 # 8-byte Folded Spill - st.d $fp, $sp, 32 # 8-byte Folded Spill - st.d $t2, $sp, 24 # 8-byte Folded Spill + st.d $t0, $sp, 56 # 8-byte Folded Spill + st.d $fp, $sp, 40 # 8-byte Folded Spill + st.d $t2, $sp, 32 # 8-byte Folded Spill .p2align 4, , 16 .LBB8_1: # =>This Inner Loop Header: Depth=1 - ld.d $a1, $sp, 128 # 8-byte Folded Reload + ld.d $a1, $sp, 112 # 8-byte Folded Reload ld.d $a2, $a1, 0 ld.w $a1, $a2, 196 add.w $a3, $t1, $a1 - ld.d $a4, $sp, 120 # 8-byte Folded Reload + ld.d $a4, $sp, 104 # 8-byte Folded Reload ld.d $a4, $a4, %pc_lo12(direct_pdir) srai.d $a3, $a3, 2 ld.w $a2, $a2, 192 @@ -4534,8 +4522,8 @@ GetDirectCost8x8: # @GetDirectCost8x8 # %bb.2: # %.preheader60 # in Loop: Header=BB8_1 Depth=1 move $t3, $a0 - st.d $s5, $sp, 72 # 8-byte Folded Spill - ld.d $s7, $sp, 56 # 8-byte Folded Reload + st.d $s5, $sp, 80 # 8-byte Folded Spill + ld.d $s7, $sp, 64 # 8-byte Folded Reload ld.d $a0, $s7, %pc_lo12(direct_ref_idx) ld.d $a4, $a0, 0 ld.d $a0, $a0, 8 @@ -4543,10 +4531,10 @@ GetDirectCost8x8: # @GetDirectCost8x8 ldx.d $a0, $a0, $s0 ldx.b $a5, $a4, $a3 ldx.b $a6, $a0, $a3 - st.d $t3, $sp, 80 # 8-byte Folded Spill + st.d $t3, $sp, 88 # 8-byte Folded Spill add.d $s5, $t3, $a1 add.d $s8, $a7, $s1 - st.d $t1, $sp, 88 # 8-byte Folded Spill + st.d $t1, $sp, 96 # 8-byte Folded Spill move $s6, $t2 addi.w $s4, $t1, 0 move $a0, $t0 @@ -4558,7 +4546,7 @@ GetDirectCost8x8: # @GetDirectCost8x8 ld.d $a0, $s6, %pc_lo12(imgY_org) slli.d $s6, $s5, 3 ldx.d $a1, $a0, $s6 - ld.d $a2, $sp, 128 # 8-byte Folded Reload + ld.d $a2, $sp, 112 # 8-byte Folded Reload ld.d $a2, $a2, 0 alsl.d $a0, $s5, $a0, 3 slli.d $a3, $s3, 1 @@ -4568,53 +4556,52 @@ GetDirectCost8x8: # @GetDirectCost8x8 ldptr.d $a4, $a2, 12624 vinsgr2vr.d $vr0, $a1, 0 ld.d $a1, $a0, 8 - vld $vr2, $sp, 96 # 16-byte Folded Reload - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a4, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 ldx.d $a1, $a1, $a3 vsub.w $vr0, $vr0, $vr1 - addi.d $a4, $sp, 136 + addi.d $a4, $sp, 120 vstx $vr0, $s1, $a4 ldptr.d $a4, $a2, 12656 vinsgr2vr.d $vr1, $a1, 0 ld.d $a1, $a0, 16 vst $vr0, $s2, 0 vinsgr2vr.d $vr0, $a4, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 ldx.d $a1, $a1, $a3 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vsub.w $vr0, $vr1, $vr0 vst $vr0, $s8, -80 vinsgr2vr.d $vr1, $a1, 0 ldptr.d $a1, $a2, 12688 vst $vr0, $s2, 16 - vilvl.h $vr0, $vr2, $vr1 + vsllwil.wu.hu $vr0, $vr1, 0 ld.d $a0, $a0, 24 vinsgr2vr.d $vr1, $a1, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 ldx.d $a0, $a0, $a3 vst $vr0, $s8, -48 ldptr.d $a1, $a2, 12720 vst $vr0, $s2, 32 vinsgr2vr.d $vr0, $a0, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a1, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 vst $vr0, $s8, -16 vst $vr0, $s2, 48 move $a0, $s2 pcaddu18i $ra, %call36(distortion4x4) jirl $ra, $ra, 0 - ld.d $a1, $sp, 128 # 8-byte Folded Reload + ld.d $a1, $sp, 112 # 8-byte Folded Reload ld.d $a1, $a1, 0 - ld.d $a2, $sp, 120 # 8-byte Folded Reload + ld.d $a2, $sp, 104 # 8-byte Folded Reload ld.d $a2, $a2, %pc_lo12(direct_pdir) ld.w $a1, $a1, 192 ldx.d $a2, $a2, $s0 - ld.d $a7, $sp, 64 # 8-byte Folded Reload + ld.d $a7, $sp, 72 # 8-byte Folded Reload add.w $s3, $a1, $a7 srai.d $a1, $s3, 2 ldx.b $a2, $a2, $a1 @@ -4628,7 +4615,7 @@ GetDirectCost8x8: # @GetDirectCost8x8 ldx.d $a3, $a3, $s0 
ldx.b $a5, $a4, $a1 ldx.b $a6, $a3, $a1 - ld.d $a1, $sp, 72 # 8-byte Folded Reload + ld.d $a1, $sp, 80 # 8-byte Folded Reload add.d $s0, $a0, $a1 move $a0, $a7 move $a1, $s4 @@ -4636,69 +4623,68 @@ GetDirectCost8x8: # @GetDirectCost8x8 move $a4, $zero pcaddu18i $ra, %call36(LumaPrediction4x4) jirl $ra, $ra, 0 - ld.d $s4, $sp, 24 # 8-byte Folded Reload + ld.d $s4, $sp, 32 # 8-byte Folded Reload ld.d $a0, $s4, %pc_lo12(imgY_org) ldx.d $a1, $a0, $s6 - ld.d $a2, $sp, 128 # 8-byte Folded Reload + ld.d $a2, $sp, 112 # 8-byte Folded Reload ld.d $a2, $a2, 0 alsl.d $a0, $s5, $a0, 3 slli.d $a3, $s3, 1 ldx.d $a1, $a1, $a3 - ld.d $fp, $sp, 32 # 8-byte Folded Reload + ld.d $fp, $sp, 40 # 8-byte Folded Reload add.d $a2, $a2, $fp add.d $a2, $a2, $s1 ldptr.d $a4, $a2, 12632 vinsgr2vr.d $vr0, $a1, 0 - vld $vr2, $sp, 96 # 16-byte Folded Reload - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 ld.d $a1, $a0, 8 vinsgr2vr.d $vr1, $a4, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 ldx.d $a1, $a1, $a3 vst $vr0, $s8, -96 vst $vr0, $s2, 0 ldptr.d $a4, $a2, 12664 vinsgr2vr.d $vr0, $a1, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 ld.d $a1, $a0, 16 vinsgr2vr.d $vr1, $a4, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 ldx.d $a1, $a1, $a3 vst $vr0, $s8, -64 vst $vr0, $s2, 16 ldptr.d $a4, $a2, 12696 vinsgr2vr.d $vr0, $a1, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 ld.d $a0, $a0, 24 vinsgr2vr.d $vr1, $a4, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 ldx.d $a0, $a0, $a3 vst $vr0, $s8, -32 ldptr.d $a1, $a2, 12728 vst $vr0, $s2, 32 vinsgr2vr.d $vr0, $a0, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a1, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 - addi.d $s3, $sp, 248 + addi.d $s3, $sp, 232 vstx $vr0, $s3, $s1 vst $vr0, $s2, 48 move $a0, $s2 pcaddu18i $ra, %call36(distortion4x4) jirl $ra, $ra, 0 add.w $s5, $a0, $s0 - ld.d $a2, $sp, 80 # 8-byte Folded Reload + ld.d $a2, $sp, 88 # 8-byte Folded Reload addi.d $a0, $a2, 4 addi.d $s1, $s1, 128 - ld.d $t1, $sp, 88 # 8-byte Folded Reload + ld.d $t1, $sp, 96 # 8-byte Folded Reload addi.d $t1, $t1, 4 move $a7, $s3 - ld.d $t0, $sp, 48 # 8-byte Folded Reload - ld.d $a1, $sp, 40 # 8-byte Folded Reload + ld.d $t0, $sp, 56 # 8-byte Folded Reload + ld.d $a1, $sp, 48 # 8-byte Folded Reload move $t2, $s4 bge $a1, $a2, .LBB8_1 # %bb.4: @@ -4713,7 +4699,7 @@ GetDirectCost8x8: # @GetDirectCost8x8 # %bb.6: # %.preheader.preheader pcalau12i $a0, %pc_hi20(diff64) addi.d $s0, $a0, %pc_lo12(diff64) - addi.d $a1, $sp, 136 + addi.d $a1, $sp, 120 ori $a2, $zero, 256 move $a0, $s0 pcaddu18i $ra, %call36(memcpy) @@ -4721,30 +4707,30 @@ GetDirectCost8x8: # @GetDirectCost8x8 move $a0, $s0 pcaddu18i $ra, %call36(distortion8x8) jirl $ra, $ra, 0 - ld.d $a2, $sp, 16 # 8-byte Folded Reload + ld.d $a2, $sp, 24 # 8-byte Folded Reload ld.w $a1, $a2, 0 add.d $a0, $a1, $a0 b .LBB8_8 .LBB8_7: - ld.d $a0, $sp, 8 # 8-byte Folded Reload + ld.d $a0, $sp, 16 # 8-byte Folded Reload move $s5, $a0 - ld.d $a2, $sp, 16 # 8-byte Folded Reload + ld.d $a2, $sp, 24 # 8-byte Folded Reload .LBB8_8: # %.sink.split st.w $a0, $a2, 0 .LBB8_9: move $a0, $s5 - ld.d $s8, $sp, 392 # 8-byte Folded Reload - ld.d $s7, $sp, 400 # 8-byte Folded Reload - ld.d $s6, $sp, 408 # 8-byte Folded Reload - ld.d $s5, $sp, 416 # 8-byte Folded Reload - ld.d $s4, $sp, 424 # 8-byte Folded Reload - ld.d $s3, $sp, 432 # 8-byte Folded Reload - 
ld.d $s2, $sp, 440 # 8-byte Folded Reload - ld.d $s1, $sp, 448 # 8-byte Folded Reload - ld.d $s0, $sp, 456 # 8-byte Folded Reload - ld.d $fp, $sp, 464 # 8-byte Folded Reload - ld.d $ra, $sp, 472 # 8-byte Folded Reload - addi.d $sp, $sp, 480 + ld.d $s8, $sp, 376 # 8-byte Folded Reload + ld.d $s7, $sp, 384 # 8-byte Folded Reload + ld.d $s6, $sp, 392 # 8-byte Folded Reload + ld.d $s5, $sp, 400 # 8-byte Folded Reload + ld.d $s4, $sp, 408 # 8-byte Folded Reload + ld.d $s3, $sp, 416 # 8-byte Folded Reload + ld.d $s2, $sp, 424 # 8-byte Folded Reload + ld.d $s1, $sp, 432 # 8-byte Folded Reload + ld.d $s0, $sp, 440 # 8-byte Folded Reload + ld.d $fp, $sp, 448 # 8-byte Folded Reload + ld.d $ra, $sp, 456 # 8-byte Folded Reload + addi.d $sp, $sp, 464 ret .Lfunc_end8: .size GetDirectCost8x8, .Lfunc_end8-GetDirectCost8x8 diff --git a/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/ratectl.s b/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/ratectl.s index 103d2f98..3114990e 100644 --- a/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/ratectl.s +++ b/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/ratectl.s @@ -964,12 +964,8 @@ ComputeFrameMAD: # @ComputeFrameMAD ld.d $a6, $a3, 0 vinsgr2vr.d $vr2, $a5, 0 vinsgr2vr.d $vr3, $a6, 0 - vshuf4i.w $vr2, $vr2, 16 - vslli.d $vr2, $vr2, 32 - vsrai.d $vr2, $vr2, 32 - vshuf4i.w $vr3, $vr3, 16 - vslli.d $vr3, $vr3, 32 - vsrai.d $vr3, $vr3, 32 + vsllwil.d.w $vr2, $vr2, 0 + vsllwil.d.w $vr3, $vr3, 0 vadd.d $vr0, $vr0, $vr2 vadd.d $vr1, $vr1, $vr3 addi.d $a4, $a4, -4 diff --git a/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/rdopt.s b/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/rdopt.s index 0b7f216e..d64bad71 100644 --- a/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/rdopt.s +++ b/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/rdopt.s @@ -2890,14 +2890,14 @@ Mode_Decision_for_4x4IntraBlocks: # @Mode_Decision_for_4x4IntraBlocks fst.d $fs0, $sp, 384 # 8-byte Folded Spill fst.d $fs1, $sp, 376 # 8-byte Folded Spill fst.d $fs2, $sp, 368 # 8-byte Folded Spill - st.d $a2, $sp, 144 # 8-byte Folded Spill + st.d $a2, $sp, 152 # 8-byte Folded Spill fmov.d $fs0, $fa0 slli.d $a2, $a1, 2 andi $s0, $a2, 4 bstrins.d $s0, $a0, 3, 3 st.d $a0, $sp, 208 # 8-byte Folded Spill slli.d $a0, $a0, 2 - st.d $a0, $sp, 24 # 8-byte Folded Spill + st.d $a0, $sp, 32 # 8-byte Folded Spill bstrpick.d $a0, $a0, 31, 3 st.d $a1, $sp, 224 # 8-byte Folded Spill slli.d $a1, $a1, 1 @@ -2920,7 +2920,7 @@ Mode_Decision_for_4x4IntraBlocks: # @Mode_Decision_for_4x4IntraBlocks vst $vr0, $sp, 192 # 16-byte Folded Spill addi.d $a1, $s0, -1 addi.d $a3, $sp, 324 - st.d $a2, $sp, 136 # 8-byte Folded Spill + st.d $a2, $sp, 144 # 8-byte Folded Spill pcaddu18i $ra, %call36(getLuma4x4Neighbour) jirl $ra, $ra, 0 ld.d $a0, $s6, 0 @@ -2970,7 +2970,7 @@ Mode_Decision_for_4x4IntraBlocks: # @Mode_Decision_for_4x4IntraBlocks add.w $a0, $s5, $s0 addi.w $a4, $zero, -1 add.w $a1, $s4, $fp - st.d $a1, $sp, 160 # 8-byte Folded Spill + st.d $a1, $sp, 168 # 8-byte Folded Spill move $a1, $a4 beqz $a2, .LBB5_9 # %bb.8: @@ -2999,12 +2999,12 @@ Mode_Decision_for_4x4IntraBlocks: # @Mode_Decision_for_4x4IntraBlocks ldx.b $a4, $a3, $a2 .LBB5_11: srai.d $a2, $a0, 2 - st.d $a2, $sp, 16 # 8-byte Folded Spill - ld.d $s2, $sp, 160 # 8-byte Folded Reload + st.d $a2, $sp, 24 # 8-byte Folded Spill + ld.d $s2, $sp, 168 # 8-byte Folded Reload srai.d $a2, $s2, 2 - st.d $a2, $sp, 8 # 8-byte Folded Spill + st.d 
$a2, $sp, 16 # 8-byte Folded Spill movfr2gr.s $a2, $fa0 - st.d $a2, $sp, 112 # 8-byte Folded Spill + st.d $a2, $sp, 120 # 8-byte Folded Spill or $a2, $a1, $a4 slti $a2, $a2, 0 slt $a3, $a1, $a4 @@ -3017,46 +3017,44 @@ Mode_Decision_for_4x4IntraBlocks: # @Mode_Decision_for_4x4IntraBlocks or $s3, $a1, $a3 lu12i.w $a1, 524287 ori $a1, $a1, 4095 - ld.d $a2, $sp, 144 # 8-byte Folded Reload + ld.d $a2, $sp, 152 # 8-byte Folded Reload st.w $a1, $a2, 0 addi.d $a2, $sp, 356 addi.d $a3, $sp, 352 addi.d $a4, $sp, 348 - st.d $a0, $sp, 128 # 8-byte Folded Spill + st.d $a0, $sp, 136 # 8-byte Folded Spill move $a1, $s2 pcaddu18i $ra, %call36(intrapred_luma) jirl $ra, $ra, 0 move $s5, $zero move $s7, $zero - st.d $zero, $sp, 152 # 8-byte Folded Spill - st.d $zero, $sp, 120 # 8-byte Folded Spill + st.d $zero, $sp, 160 # 8-byte Folded Spill + st.d $zero, $sp, 128 # 8-byte Folded Spill addi.w $s4, $s0, 0 - st.d $s3, $sp, 168 # 8-byte Folded Spill + st.d $s3, $sp, 176 # 8-byte Folded Spill bstrpick.d $a0, $s3, 31, 0 - ld.d $s3, $sp, 136 # 8-byte Folded Reload + ld.d $s3, $sp, 144 # 8-byte Folded Reload addi.d $a1, $s3, 1 - st.d $a1, $sp, 176 # 8-byte Folded Spill - addi.d $a1, $s3, 2 st.d $a1, $sp, 184 # 8-byte Folded Spill + addi.d $a1, $s3, 2 + st.d $a1, $sp, 192 # 8-byte Folded Spill addi.d $s0, $s3, 3 addi.d $a1, $s2, 1 addi.d $a2, $s2, 2 addi.d $a3, $s2, 3 slli.d $a0, $a0, 9 - st.d $a0, $sp, 104 # 8-byte Folded Spill + st.d $a0, $sp, 112 # 8-byte Folded Spill addi.w $s2, $zero, -3 pcalau12i $a0, %pc_hi20(.LCPI5_0) fld.d $fs2, $a0, %pc_lo12(.LCPI5_0) ori $a4, $zero, 1024 ori $a5, $zero, 9 - vrepli.b $vr0, 0 - vst $vr0, $sp, 192 # 16-byte Folded Spill slli.d $a0, $a1, 3 - st.d $a0, $sp, 64 # 8-byte Folded Spill - slli.d $a0, $a2, 3 st.d $a0, $sp, 72 # 8-byte Folded Spill - slli.d $a0, $a3, 3 + slli.d $a0, $a2, 3 st.d $a0, $sp, 80 # 8-byte Folded Spill + slli.d $a0, $a3, 3 + st.d $a0, $sp, 88 # 8-byte Folded Spill # implicit-def: $r4 # kill: killed $r4 # implicit-def: $r4 @@ -3065,13 +3063,13 @@ Mode_Decision_for_4x4IntraBlocks: # @Mode_Decision_for_4x4IntraBlocks # kill: killed $r4 # implicit-def: $r4 # kill: killed $r4 - st.d $s4, $sp, 96 # 8-byte Folded Spill - st.d $s0, $sp, 88 # 8-byte Folded Spill + st.d $s4, $sp, 104 # 8-byte Folded Spill + st.d $s0, $sp, 96 # 8-byte Folded Spill b .LBB5_16 .LBB5_12: # %.loopexit247 # in Loop: Header=BB5_16 Depth=1 fmov.d $fs2, $fs1 - st.d $s8, $sp, 152 # 8-byte Folded Spill + st.d $s8, $sp, 160 # 8-byte Folded Spill .LBB5_13: # %.loopexit247 # in Loop: Header=BB5_16 Depth=1 pcalau12i $a0, %pc_hi20(cs_cm) @@ -3160,16 +3158,15 @@ Mode_Decision_for_4x4IntraBlocks: # @Mode_Decision_for_4x4IntraBlocks add.d $a4, $a3, $s5 ld.d $a5, $a4, 208 vinsgr2vr.d $vr0, $a2, 0 - vld $vr2, $sp, 192 # 16-byte Folded Reload - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a5, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 lu12i.w $a6, 3 ori $a2, $a6, 848 vstx $vr0, $a3, $a2 ld.d $a2, $a4, 240 - ld.d $a4, $sp, 176 # 8-byte Folded Reload + ld.d $a4, $sp, 184 # 8-byte Folded Reload slli.d $a4, $a4, 5 add.d $a3, $a3, $a4 alsl.d $a3, $s4, $a3, 1 @@ -3182,14 +3179,14 @@ Mode_Decision_for_4x4IntraBlocks: # @Mode_Decision_for_4x4IntraBlocks add.d $a4, $a3, $s5 ld.d $a5, $a4, 240 vinsgr2vr.d $vr0, $a2, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a5, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 ori $a2, $a6, 912 vstx $vr0, $a3, $a2 ld.d $a2, $a4, 272 - ld.d $a4, 
$sp, 184 # 8-byte Folded Reload + ld.d $a4, $sp, 192 # 8-byte Folded Reload slli.d $a4, $a4, 5 add.d $a3, $a3, $a4 alsl.d $a3, $s4, $a3, 1 @@ -3202,9 +3199,9 @@ Mode_Decision_for_4x4IntraBlocks: # @Mode_Decision_for_4x4IntraBlocks add.d $a4, $a3, $s5 ld.d $a5, $a4, 272 vinsgr2vr.d $vr0, $a2, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a5, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 ori $a2, $a6, 976 vstx $vr0, $a3, $a2 @@ -3221,9 +3218,9 @@ Mode_Decision_for_4x4IntraBlocks: # @Mode_Decision_for_4x4IntraBlocks add.d $a1, $a2, $s5 ld.d $a1, $a1, 304 vinsgr2vr.d $vr0, $a0, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a1, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 ori $a0, $a6, 1040 vstx $vr0, $a2, $a0 @@ -3233,7 +3230,7 @@ Mode_Decision_for_4x4IntraBlocks: # @Mode_Decision_for_4x4IntraBlocks ld.d $a2, $sp, 224 # 8-byte Folded Reload move $a3, $s8 fmov.d $fa0, $fs0 - ld.d $a4, $sp, 168 # 8-byte Folded Reload + ld.d $a4, $sp, 176 # 8-byte Folded Reload pcaddu18i $ra, %call36(RDCost_for_4x4IntraBlocks) jirl $ra, $ra, 0 fcmp.clt.d $fcc0, $fa0, $fs2 @@ -3270,9 +3267,9 @@ Mode_Decision_for_4x4IntraBlocks: # @Mode_Decision_for_4x4IntraBlocks ld.d $a1, $a1, 0 ld.w $a4, $a0, 20 ldptr.d $a1, $a1, 6440 - ld.d $a2, $sp, 160 # 8-byte Folded Reload + ld.d $a2, $sp, 168 # 8-byte Folded Reload slli.d $a2, $a2, 3 - ld.d $a6, $sp, 128 # 8-byte Folded Reload + ld.d $a6, $sp, 136 # 8-byte Folded Reload slli.d $a3, $a6, 1 ori $a5, $zero, 3 bne $a4, $a5, .LBB5_38 @@ -3290,16 +3287,16 @@ Mode_Decision_for_4x4IntraBlocks: # @Mode_Decision_for_4x4IntraBlocks ldx.d $a5, $a4, $a2 slli.d $a6, $a6, 2 vldx $vr0, $a5, $a6 - ld.d $t0, $sp, 64 # 8-byte Folded Reload + ld.d $t0, $sp, 72 # 8-byte Folded Reload ldx.d $a5, $a4, $t0 vst $vr0, $sp, 232 - ld.d $t1, $sp, 72 # 8-byte Folded Reload + ld.d $t1, $sp, 80 # 8-byte Folded Reload ldx.d $a7, $a4, $t1 ldx.d $t0, $a1, $t0 vldx $vr0, $a5, $a6 ldx.d $a5, $a1, $t1 vldx $vr1, $a7, $a6 - ld.d $a7, $sp, 80 # 8-byte Folded Reload + ld.d $a7, $sp, 88 # 8-byte Folded Reload ldx.d $a4, $a4, $a7 ldx.d $a7, $a1, $a7 ldx.d $t0, $t0, $a3 @@ -3330,10 +3327,9 @@ Mode_Decision_for_4x4IntraBlocks: # @Mode_Decision_for_4x4IntraBlocks ld.d $a4, $a3, 208 alsl.d $a5, $fp, $a0, 3 vinsgr2vr.d $vr0, $a2, 0 - vld $vr2, $sp, 192 # 16-byte Folded Reload - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a4, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 pcalau12i $a0, %pc_hi20(diff) addi.d $a0, $a0, %pc_lo12(diff) @@ -3345,45 +3341,45 @@ Mode_Decision_for_4x4IntraBlocks: # @Mode_Decision_for_4x4IntraBlocks vst $vr0, $a0, 0 vinsgr2vr.d $vr0, $a4, 0 ldx.d $a2, $a6, $a1 - vilvl.h $vr1, $vr2, $vr1 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr1, $vr1, 0 + vsllwil.wu.hu $vr0, $vr0, 0 vsub.w $vr0, $vr1, $vr0 vinsgr2vr.d $vr1, $a2, 0 ld.d $a2, $a3, 272 vst $vr0, $a0, 16 ld.d $a4, $a5, 24 - vilvl.h $vr0, $vr2, $vr1 + vsllwil.wu.hu $vr0, $vr1, 0 vinsgr2vr.d $vr1, $a2, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 ldx.d $a1, $a4, $a1 vsub.w $vr0, $vr0, $vr1 ld.d $a2, $a3, 304 vst $vr0, $a0, 32 vinsgr2vr.d $vr0, $a1, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a2, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 vst $vr0, $a0, 48 - ld.d $a1, $sp, 104 # 8-byte Folded Reload + ld.d $a1, $sp, 112 # 8-byte Folded Reload xor $a1, $a1, 
$s5 sltui $a1, $a1, 1 - ld.d $a2, $sp, 112 # 8-byte Folded Reload + ld.d $a2, $sp, 120 # 8-byte Folded Reload masknez $s8, $a2, $a1 pcaddu18i $ra, %call36(distortion4x4) jirl $ra, $ra, 0 - ld.d $a2, $sp, 144 # 8-byte Folded Reload + ld.d $a2, $sp, 152 # 8-byte Folded Reload ld.w $a1, $a2, 0 add.w $a0, $a0, $s8 bge $a0, $a1, .LBB5_14 # %bb.37: # in Loop: Header=BB5_16 Depth=1 st.w $a0, $a2, 0 addi.w $a0, $s2, 3 - st.d $a0, $sp, 152 # 8-byte Folded Spill + st.d $a0, $sp, 160 # 8-byte Folded Spill b .LBB5_14 .LBB5_38: # %.thread289 # in Loop: Header=BB5_16 Depth=1 - ld.d $a4, $sp, 160 # 8-byte Folded Reload + ld.d $a4, $sp, 168 # 8-byte Folded Reload alsl.d $a4, $a4, $a1, 3 ld.d $a5, $a4, 8 ld.d $a6, $a4, 16 @@ -3392,18 +3388,18 @@ Mode_Decision_for_4x4IntraBlocks: # @Mode_Decision_for_4x4IntraBlocks ldx.d $a5, $a6, $a3 ldx.d $a4, $a4, $a3 .LBB5_39: # in Loop: Header=BB5_16 Depth=1 - st.d $t0, $sp, 40 # 8-byte Folded Spill - st.d $a5, $sp, 48 # 8-byte Folded Spill - st.d $a4, $sp, 56 # 8-byte Folded Spill + st.d $t0, $sp, 48 # 8-byte Folded Spill + st.d $a5, $sp, 56 # 8-byte Folded Spill + st.d $a4, $sp, 64 # 8-byte Folded Spill ldx.d $a1, $a1, $a2 ldx.d $a1, $a1, $a3 - st.d $a1, $sp, 32 # 8-byte Folded Spill + st.d $a1, $sp, 40 # 8-byte Folded Spill ldptr.w $a1, $a0, 15260 ld.w $a2, $sp, 360 - st.d $a2, $sp, 120 # 8-byte Folded Spill - ld.d $s3, $sp, 136 # 8-byte Folded Reload - ld.d $s4, $sp, 96 # 8-byte Folded Reload - ld.d $s0, $sp, 88 # 8-byte Folded Reload + st.d $a2, $sp, 128 # 8-byte Folded Spill + ld.d $s3, $sp, 144 # 8-byte Folded Reload + ld.d $s4, $sp, 104 # 8-byte Folded Reload + ld.d $s0, $sp, 96 # 8-byte Folded Reload beqz $a1, .LBB5_12 # %bb.40: # %.preheader246 # in Loop: Header=BB5_16 Depth=1 @@ -3424,7 +3420,7 @@ Mode_Decision_for_4x4IntraBlocks: # @Mode_Decision_for_4x4IntraBlocks ld.d $a0, $s6, 0 ldptr.d $a0, $a0, 14176 ld.d $a0, $a0, 8 - ld.d $a2, $sp, 176 # 8-byte Folded Reload + ld.d $a2, $sp, 184 # 8-byte Folded Reload slli.d $a2, $a2, 3 ldx.d $a0, $a0, $a2 ldx.d $a2, $a1, $a2 @@ -3433,7 +3429,7 @@ Mode_Decision_for_4x4IntraBlocks: # @Mode_Decision_for_4x4IntraBlocks ld.d $a0, $s6, 0 ldptr.d $a0, $a0, 14176 ld.d $a0, $a0, 8 - ld.d $a2, $sp, 184 # 8-byte Folded Reload + ld.d $a2, $sp, 192 # 8-byte Folded Reload slli.d $a2, $a2, 3 ldx.d $a0, $a0, $a2 ldx.d $a2, $a1, $a2 @@ -3451,13 +3447,13 @@ Mode_Decision_for_4x4IntraBlocks: # @Mode_Decision_for_4x4IntraBlocks .LBB5_41: ld.d $a0, $s6, 0 ld.d $a0, $a0, 128 - ld.d $a1, $sp, 8 # 8-byte Folded Reload + ld.d $a1, $sp, 16 # 8-byte Folded Reload slli.d $a1, $a1, 3 ldx.d $a0, $a0, $a1 - ld.d $a1, $sp, 16 # 8-byte Folded Reload - ld.d $s5, $sp, 152 # 8-byte Folded Reload + ld.d $a1, $sp, 24 # 8-byte Folded Reload + ld.d $s5, $sp, 160 # 8-byte Folded Reload stx.b $s5, $a0, $a1 - ld.d $a1, $sp, 168 # 8-byte Folded Reload + ld.d $a1, $sp, 176 # 8-byte Folded Reload xor $a0, $a1, $s5 sltui $a0, $a0, 1 slt $a1, $s5, $a1 @@ -3474,7 +3470,7 @@ Mode_Decision_for_4x4IntraBlocks: # @Mode_Decision_for_4x4IntraBlocks mul.d $a1, $a3, $a1 add.d $a1, $a2, $a1 ld.d $a3, $sp, 224 # 8-byte Folded Reload - ld.d $a2, $sp, 24 # 8-byte Folded Reload + ld.d $a2, $sp, 32 # 8-byte Folded Reload add.w $a2, $a2, $a3 add.d $a1, $a1, $a2 st.b $a0, $a1, 332 @@ -3510,12 +3506,12 @@ Mode_Decision_for_4x4IntraBlocks: # @Mode_Decision_for_4x4IntraBlocks ld.d $a0, $a0, %got_pc_lo12(enc_picture) ld.d $a1, $a0, 0 ldptr.d $a1, $a1, 6440 - ld.d $a2, $sp, 160 # 8-byte Folded Reload + ld.d $a2, $sp, 168 # 8-byte Folded Reload slli.d $a6, $a2, 3 ldx.d $a2, $a1, $a6 - 
ld.d $t1, $sp, 128 # 8-byte Folded Reload + ld.d $t1, $sp, 136 # 8-byte Folded Reload slli.d $a1, $t1, 1 - ld.d $a3, $sp, 32 # 8-byte Folded Reload + ld.d $a3, $sp, 40 # 8-byte Folded Reload stx.d $a3, $a2, $a1 ld.d $a2, $s6, 0 slli.d $a4, $s5, 9 @@ -3547,14 +3543,14 @@ Mode_Decision_for_4x4IntraBlocks: # @Mode_Decision_for_4x4IntraBlocks .LBB5_46: ld.d $a6, $a0, 0 ldptr.d $a6, $a6, 6440 - ld.d $a7, $sp, 64 # 8-byte Folded Reload + ld.d $a7, $sp, 72 # 8-byte Folded Reload ldx.d $a6, $a6, $a7 - ld.d $a7, $sp, 40 # 8-byte Folded Reload + ld.d $a7, $sp, 48 # 8-byte Folded Reload stx.d $a7, $a6, $a1 ld.d $a6, $s6, 0 add.d $a7, $a6, $a4 ld.d $a7, $a7, 240 - ld.d $t0, $sp, 176 # 8-byte Folded Reload + ld.d $t0, $sp, 184 # 8-byte Folded Reload slli.d $t0, $t0, 5 add.d $a6, $a6, $t0 alsl.d $a6, $s4, $a6, 1 @@ -3572,7 +3568,7 @@ Mode_Decision_for_4x4IntraBlocks: # @Mode_Decision_for_4x4IntraBlocks # %bb.49: pcalau12i $a5, %pc_hi20(lrec) ld.d $a5, $a5, %pc_lo12(lrec) - ld.d $a6, $sp, 64 # 8-byte Folded Reload + ld.d $a6, $sp, 72 # 8-byte Folded Reload ldx.d $a5, $a5, $a6 vld $vr0, $sp, 248 slli.d $a6, $t1, 2 @@ -3580,14 +3576,14 @@ Mode_Decision_for_4x4IntraBlocks: # @Mode_Decision_for_4x4IntraBlocks .LBB5_50: ld.d $a5, $a0, 0 ldptr.d $a5, $a5, 6440 - ld.d $a6, $sp, 72 # 8-byte Folded Reload + ld.d $a6, $sp, 80 # 8-byte Folded Reload ldx.d $a5, $a5, $a6 - ld.d $a6, $sp, 48 # 8-byte Folded Reload + ld.d $a6, $sp, 56 # 8-byte Folded Reload stx.d $a6, $a5, $a1 ld.d $a5, $s6, 0 add.d $a6, $a5, $a4 ld.d $a6, $a6, 272 - ld.d $a7, $sp, 184 # 8-byte Folded Reload + ld.d $a7, $sp, 192 # 8-byte Folded Reload slli.d $a7, $a7, 5 add.d $a5, $a5, $a7 alsl.d $a5, $s4, $a5, 1 @@ -3605,7 +3601,7 @@ Mode_Decision_for_4x4IntraBlocks: # @Mode_Decision_for_4x4IntraBlocks # %bb.53: pcalau12i $a6, %pc_hi20(lrec) ld.d $a6, $a6, %pc_lo12(lrec) - ld.d $a7, $sp, 72 # 8-byte Folded Reload + ld.d $a7, $sp, 80 # 8-byte Folded Reload ldx.d $a6, $a6, $a7 vld $vr0, $sp, 264 slli.d $a7, $t1, 2 @@ -3613,9 +3609,9 @@ Mode_Decision_for_4x4IntraBlocks: # @Mode_Decision_for_4x4IntraBlocks .LBB5_54: ld.d $a0, $a0, 0 ldptr.d $a0, $a0, 6440 - ld.d $a6, $sp, 80 # 8-byte Folded Reload + ld.d $a6, $sp, 88 # 8-byte Folded Reload ldx.d $a0, $a0, $a6 - ld.d $a6, $sp, 56 # 8-byte Folded Reload + ld.d $a6, $sp, 64 # 8-byte Folded Reload stx.d $a6, $a0, $a1 ld.d $a0, $s6, 0 add.d $a1, $a0, $a4 @@ -3636,7 +3632,7 @@ Mode_Decision_for_4x4IntraBlocks: # @Mode_Decision_for_4x4IntraBlocks # %bb.57: pcalau12i $a0, %pc_hi20(lrec) ld.d $a0, $a0, %pc_lo12(lrec) - ld.d $a1, $sp, 80 # 8-byte Folded Reload + ld.d $a1, $sp, 88 # 8-byte Folded Reload ldx.d $a0, $a0, $a1 vld $vr0, $sp, 280 slli.d $a1, $t1, 2 @@ -3663,7 +3659,7 @@ Mode_Decision_for_4x4IntraBlocks: # @Mode_Decision_for_4x4IntraBlocks ld.d $a0, $s6, 0 ldptr.d $a0, $a0, 14176 ld.d $a0, $a0, 8 - ld.d $a3, $sp, 176 # 8-byte Folded Reload + ld.d $a3, $sp, 184 # 8-byte Folded Reload slli.d $a3, $a3, 3 ldx.d $a4, $a1, $a3 ldx.d $a0, $a0, $a3 @@ -3672,7 +3668,7 @@ Mode_Decision_for_4x4IntraBlocks: # @Mode_Decision_for_4x4IntraBlocks ld.d $a0, $s6, 0 ldptr.d $a0, $a0, 14176 ld.d $a0, $a0, 8 - ld.d $a3, $sp, 184 # 8-byte Folded Reload + ld.d $a3, $sp, 192 # 8-byte Folded Reload slli.d $a3, $a3, 3 ldx.d $a4, $a1, $a3 ldx.d $a0, $a0, $a3 @@ -3687,7 +3683,7 @@ Mode_Decision_for_4x4IntraBlocks: # @Mode_Decision_for_4x4IntraBlocks vldx $vr0, $a1, $a2 vstx $vr0, $a0, $a2 .LBB5_60: # %.loopexit - ld.d $a0, $sp, 120 # 8-byte Folded Reload + ld.d $a0, $sp, 128 # 8-byte Folded Reload b .LBB5_62 .LBB5_61: # %.preheader 
ld.d $a0, $s6, 0 @@ -4169,24 +4165,24 @@ RDCost_for_8x8blocks: # @RDCost_for_8x8blocks st.d $s7, $sp, 192 # 8-byte Folded Spill st.d $s8, $sp, 184 # 8-byte Folded Spill fst.d $fs0, $sp, 176 # 8-byte Folded Spill - st.d $a6, $sp, 72 # 8-byte Folded Spill - st.d $a5, $sp, 80 # 8-byte Folded Spill + st.d $a6, $sp, 88 # 8-byte Folded Spill + st.d $a5, $sp, 104 # 8-byte Folded Spill move $s4, $a4 move $fp, $a3 fmov.d $fs0, $fa0 - st.d $a1, $sp, 96 # 8-byte Folded Spill - st.d $a0, $sp, 88 # 8-byte Folded Spill + st.d $a1, $sp, 40 # 8-byte Folded Spill + st.d $a0, $sp, 96 # 8-byte Folded Spill st.w $zero, $sp, 172 st.w $zero, $sp, 164 slli.d $a0, $a2, 3 move $s3, $a2 slli.d $s2, $a2, 2 andi $s1, $a0, 8 - st.d $s2, $sp, 24 # 8-byte Folded Spill + st.d $s2, $sp, 32 # 8-byte Folded Spill bstrins.d $s2, $zero, 2, 0 addi.w $s6, $s2, 0 - srli.d $s7, $s1, 2 - srai.d $s8, $s6, 2 + srli.d $s8, $s1, 2 + srai.d $s7, $s6, 2 pcalau12i $a0, %got_pc_hi20(img) ld.d $s5, $a0, %got_pc_lo12(img) ld.d $a0, $s5, 0 @@ -4204,7 +4200,7 @@ RDCost_for_8x8blocks: # @RDCost_for_8x8blocks ld.w $a0, $a1, 12 ldptr.d $a2, $a1, 14224 ldptr.d $a3, $a1, 14216 - st.d $a3, $sp, 40 # 8-byte Folded Spill + st.d $a3, $sp, 56 # 8-byte Folded Spill ori $a3, $zero, 536 mul.d $a0, $a0, $a3 add.d $a3, $a2, $a0 @@ -4217,27 +4213,27 @@ RDCost_for_8x8blocks: # @RDCost_for_8x8blocks pcalau12i $a2, %got_pc_hi20(assignSE2partition) ld.d $a2, $a2, %got_pc_lo12(assignSE2partition) ldx.d $a0, $a2, $a0 - st.d $a0, $sp, 64 # 8-byte Folded Spill - st.d $a3, $sp, 56 # 8-byte Folded Spill + st.d $a0, $sp, 80 # 8-byte Folded Spill + st.d $a3, $sp, 72 # 8-byte Folded Spill st.h $zero, $a3, 480 - st.d $s0, $sp, 32 # 8-byte Folded Spill - st.d $s4, $sp, 48 # 8-byte Folded Spill - st.d $s7, $sp, 16 # 8-byte Folded Spill + st.d $s0, $sp, 48 # 8-byte Folded Spill + st.d $s4, $sp, 64 # 8-byte Folded Spill + st.d $s8, $sp, 24 # 8-byte Folded Spill beqz $s0, .LBB8_3 # %bb.1: ld.w $a0, $a1, 172 pcalau12i $a2, %pc_hi20(direct_pdir) ld.d $a2, $a2, %pc_lo12(direct_pdir) - add.w $a0, $a0, $s8 + add.w $a0, $a0, $s7 slli.d $a0, $a0, 3 ld.w $a1, $a1, 168 ldx.d $a2, $a2, $a0 - add.w $a1, $a1, $s7 + add.w $a1, $a1, $s8 ldx.b $a3, $a2, $a1 - bltz $a3, .LBB8_15 + bltz $a3, .LBB8_37 # %bb.2: - move $s4, $s8 - st.d $a4, $sp, 8 # 8-byte Folded Spill + move $s0, $s7 + move $s8, $a4 pcalau12i $a2, %pc_hi20(direct_ref_idx) ld.d $a2, $a2, %pc_lo12(direct_ref_idx) ld.d $a4, $a2, 0 @@ -4249,15 +4245,13 @@ RDCost_for_8x8blocks: # @RDCost_for_8x8blocks srai.d $a0, $a2, 63 andn $a6, $a2, $a0 addi.d $a0, $sp, 164 - ld.d $a1, $sp, 96 # 8-byte Folded Reload - move $s0, $s3 + ld.d $a1, $sp, 40 # 8-byte Folded Reload move $a2, $s3 move $a4, $zero move $a5, $zero - pcaddu18i $ra, %call36(LumaResidualCoding8x8) - jirl $ra, $ra, 0 b .LBB8_7 .LBB8_3: + move $s0, $s7 ori $a0, $zero, 2 bne $s4, $a0, .LBB8_6 # %bb.4: @@ -4271,11 +4265,11 @@ RDCost_for_8x8blocks: # @RDCost_for_8x8blocks ld.d $a0, $a0, %pc_lo12(wbp_weight) ld.d $a1, $a0, 0 ld.d $a0, $a0, 8 - ld.d $a2, $sp, 80 # 8-byte Folded Reload + ld.d $a2, $sp, 104 # 8-byte Folded Reload slli.d $a2, $a2, 3 ldx.d $a1, $a1, $a2 ldx.d $a0, $a0, $a2 - ld.d $a2, $sp, 72 # 8-byte Folded Reload + ld.d $a2, $sp, 88 # 8-byte Folded Reload slli.d $a2, $a2, 3 ldx.d $a1, $a1, $a2 ldx.d $a0, $a0, $a2 @@ -4284,9 +4278,9 @@ RDCost_for_8x8blocks: # @RDCost_for_8x8blocks add.d $a0, $a1, $a0 addi.w $a0, $a0, 128 ori $a1, $zero, 255 - bltu $a1, $a0, .LBB8_15 + bltu $a1, $a0, .LBB8_37 .LBB8_6: - st.d $a4, $sp, 8 # 8-byte Folded Spill + move $s8, $a4 addi.d $a0, $s4, 
-2 sltui $a0, $a0, 1 sltui $a1, $s4, 1 @@ -4299,29 +4293,28 @@ RDCost_for_8x8blocks: # @RDCost_for_8x8blocks sltui $a0, $a0, 2 maskeqz $a5, $fp, $a0 addi.d $a0, $sp, 164 - ld.d $a1, $sp, 96 # 8-byte Folded Reload - move $s0, $s3 + ld.d $a1, $sp, 40 # 8-byte Folded Reload move $a2, $s3 move $a3, $s4 - ld.d $a6, $sp, 80 # 8-byte Folded Reload - ld.d $a7, $sp, 72 # 8-byte Folded Reload + ld.d $a6, $sp, 104 # 8-byte Folded Reload + ld.d $a7, $sp, 88 # 8-byte Folded Reload +.LBB8_7: pcaddu18i $ra, %call36(LumaResidualCoding8x8) jirl $ra, $ra, 0 - move $s4, $s8 -.LBB8_7: + st.d $s0, $sp, 16 # 8-byte Folded Spill ld.d $a1, $sp, 112 # 8-byte Folded Reload ld.d $a1, $a1, 0 - ld.d $s7, $sp, 88 # 8-byte Folded Reload + ld.d $s7, $sp, 96 # 8-byte Folded Reload st.w $a0, $s7, 0 ldptr.w $a0, $a1, 4168 ld.d $a2, $s5, 0 ori $a3, $zero, 3 - bne $a0, $a3, .LBB8_11 + bne $a0, $a3, .LBB8_16 # %bb.8: ld.w $a0, $a2, 20 - ori $s3, $zero, 1 - ld.d $s8, $sp, 80 # 8-byte Folded Reload - beq $a0, $s3, .LBB8_10 + ori $s4, $zero, 1 + move $s0, $s3 + beq $a0, $s4, .LBB8_10 # %bb.9: addi.w $a1, $zero, -1 move $a0, $s0 @@ -4332,136 +4325,29 @@ RDCost_for_8x8blocks: # @RDCost_for_8x8blocks ldptr.w $a0, $a1, 4168 ld.d $a2, $s5, 0 ori $a3, $zero, 3 - bne $a0, $a3, .LBB8_11 + bne $a0, $a3, .LBB8_17 .LBB8_10: # %.thread ld.w $a0, $a2, 20 - bne $a0, $s3, .LBB8_16 -.LBB8_11: # %.thread208 - ld.w $a3, $a2, 176 - ldptr.d $a0, $a2, 14232 - pcalau12i $a4, %pc_hi20(imgY_org) - ld.d $a4, $a4, %pc_lo12(imgY_org) - ld.w $a5, $a2, 196 - add.w $a6, $a3, $s1 - pcalau12i $a3, %got_pc_hi20(enc_picture) - ld.d $a3, $a3, %got_pc_lo12(enc_picture) - ld.d $a3, $a3, 0 - ld.w $a2, $a2, 180 - ldptr.d $a3, $a3, 6440 - move $s1, $zero - slli.d $a2, $a2, 3 - alsl.d $a2, $s6, $a2, 3 - add.d $a2, $a3, $a2 - ld.d $a3, $sp, 24 # 8-byte Folded Reload - ori $a3, $a3, 7 - addi.w $a3, $a3, 0 - slli.d $a5, $a5, 3 - alsl.d $a5, $s6, $a5, 3 - add.d $a4, $a4, $a5 - addi.d $a5, $s6, -1 - slli.d $a6, $a6, 1 - vrepli.b $vr0, 0 - .p2align 4, , 16 -.LBB8_12: # =>This Inner Loop Header: Depth=1 - ld.d $a7, $a4, 0 - ld.d $t0, $a2, 0 - vldx $vr1, $a7, $a6 - vldx $vr2, $t0, $a6 - vilvl.h $vr3, $vr0, $vr1 - vilvh.h $vr1, $vr0, $vr1 - vilvl.h $vr4, $vr0, $vr2 - vilvh.h $vr2, $vr0, $vr2 - vsub.w $vr1, $vr1, $vr2 - vsub.w $vr2, $vr3, $vr4 - vpickve2gr.w $a7, $vr2, 0 - slli.d $a7, $a7, 2 - ldx.w $a7, $a0, $a7 - vpickve2gr.w $t0, $vr2, 1 - slli.d $t0, $t0, 2 - ldx.w $t0, $a0, $t0 - vpickve2gr.w $t1, $vr2, 2 - slli.d $t1, $t1, 2 - ldx.w $t1, $a0, $t1 - vpickve2gr.w $t2, $vr2, 3 - slli.d $t2, $t2, 2 - ldx.w $t2, $a0, $t2 - add.d $a7, $s1, $a7 - add.d $a7, $a7, $t0 - add.d $a7, $a7, $t1 - add.d $a7, $a7, $t2 - vpickve2gr.w $t0, $vr1, 0 - slli.d $t0, $t0, 2 - ldx.w $t0, $a0, $t0 - vpickve2gr.w $t1, $vr1, 1 - slli.d $t1, $t1, 2 - ldx.w $t1, $a0, $t1 - vpickve2gr.w $t2, $vr1, 2 - slli.d $t2, $t2, 2 - ldx.w $t2, $a0, $t2 - vpickve2gr.w $t3, $vr1, 3 - slli.d $t3, $t3, 2 - ldx.w $t3, $a0, $t3 - add.d $a7, $a7, $t0 - add.d $a7, $a7, $t1 - add.d $a7, $a7, $t2 - add.d $s1, $a7, $t3 - addi.d $a2, $a2, 8 - addi.d $a5, $a5, 1 - addi.d $a4, $a4, 8 - blt $a5, $a3, .LBB8_12 -# %bb.13: - ld.d $s6, $sp, 56 # 8-byte Folded Reload - ld.d $s7, $sp, 40 # 8-byte Folded Reload - ld.d $s3, $sp, 32 # 8-byte Folded Reload - ldptr.w $a1, $a1, 4008 - ld.d $a0, $sp, 8 # 8-byte Folded Reload - beqz $a1, .LBB8_23 -.LBB8_14: - st.w $a0, $sp, 124 - ori $a0, $zero, 2 - ld.d $a1, $sp, 64 # 8-byte Folded Reload - ld.w $a1, $a1, 8 - ld.d $a2, $s7, 24 - st.w $a0, $sp, 120 - ori $a0, $zero, 104 - mul.d $a0, $a1, $a0 - 
add.d $a1, $a2, $a0 - pcalau12i $a0, %got_pc_hi20(writeB8_typeInfo) - ld.d $a0, $a0, %got_pc_lo12(writeB8_typeInfo) - ld.d $a2, $a0, 0 - addi.d $a0, $sp, 120 - jirl $ra, $a2, 0 - ld.w $s2, $sp, 132 - ld.d $a3, $sp, 48 # 8-byte Folded Reload - beqz $s3, .LBB8_24 - b .LBB8_36 -.LBB8_15: - pcalau12i $a0, %pc_hi20(.LCPI8_0) - fld.d $fa0, $a0, %pc_lo12(.LCPI8_0) - b .LBB8_41 -.LBB8_16: # %.preheader - st.d $s4, $sp, 24 # 8-byte Folded Spill + beq $a0, $s4, .LBB8_17 +# %bb.11: # %.preheader ldptr.w $a0, $a1, 4728 - blez $a0, .LBB8_21 -# %bb.17: # %.lr.ph + blez $a0, .LBB8_41 +# %bb.12: # %.lr.ph pcalau12i $s6, %pc_hi20(imgY_org) pcalau12i $a0, %got_pc_hi20(decs) ld.d $s7, $a0, %got_pc_lo12(decs) move $s3, $zero move $s4, $zero - vrepli.b $vr4, 0 - vst $vr4, $sp, 96 # 16-byte Folded Spill .p2align 4, , 16 -.LBB8_18: # =>This Loop Header: Depth=1 - # Child Loop BB8_19 Depth 2 +.LBB8_13: # =>This Loop Header: Depth=1 + # Child Loop BB8_14 Depth 2 ori $a1, $zero, 8 move $a0, $s3 move $a2, $s0 move $a3, $fp - move $a4, $s8 + ld.d $a4, $sp, 104 # 8-byte Folded Reload pcaddu18i $ra, %call36(decode_one_b8block) jirl $ra, $ra, 0 - vld $vr4, $sp, 96 # 16-byte Folded Reload ld.d $a0, $s5, 0 ld.w $a1, $a0, 196 ld.w $a2, $a0, 192 @@ -4479,16 +4365,18 @@ RDCost_for_8x8blocks: # @RDCost_for_8x8blocks alsl.d $a4, $a4, $a6, 3 slli.d $a5, $a5, 1 .p2align 4, , 16 -.LBB8_19: # Parent Loop BB8_18 Depth=1 +.LBB8_14: # Parent Loop BB8_13 Depth=1 # => This Inner Loop Header: Depth=2 ld.d $a6, $a3, 0 ld.d $a7, $a4, 0 vldx $vr0, $a6, $a5 vldx $vr1, $a7, $a5 - vilvl.h $vr2, $vr4, $vr0 - vilvh.h $vr0, $vr4, $vr0 - vilvl.h $vr3, $vr4, $vr1 - vilvh.h $vr1, $vr4, $vr1 + vsllwil.wu.hu $vr2, $vr0, 0 + vbsrl.v $vr0, $vr0, 8 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.wu.hu $vr3, $vr1, 0 + vbsrl.v $vr1, $vr1, 8 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 vsub.w $vr1, $vr2, $vr3 vpickve2gr.w $a6, $vr1, 0 @@ -4526,140 +4414,199 @@ RDCost_for_8x8blocks: # @RDCost_for_8x8blocks addi.d $a2, $a2, 1 addi.d $a3, $a3, 8 addi.d $a4, $a4, 8 - blt $a2, $a1, .LBB8_19 -# %bb.20: # in Loop: Header=BB8_18 Depth=1 + blt $a2, $a1, .LBB8_14 +# %bb.15: # in Loop: Header=BB8_13 Depth=1 ld.d $a0, $sp, 112 # 8-byte Folded Reload ld.d $a1, $a0, 0 ldptr.w $a0, $a1, 4728 addi.d $s3, $s3, 1 - blt $s3, $a0, .LBB8_18 - b .LBB8_22 -.LBB8_21: # %.preheader.._crit_edge_crit_edge - move $s4, $zero -.LBB8_22: # %._crit_edge - div.d $s1, $s4, $a0 - ld.d $s6, $sp, 56 # 8-byte Folded Reload - ld.d $s7, $sp, 40 # 8-byte Folded Reload - ld.d $s4, $sp, 24 # 8-byte Folded Reload - ld.d $s3, $sp, 32 # 8-byte Folded Reload + blt $s3, $a0, .LBB8_13 + b .LBB8_42 +.LBB8_16: + move $s0, $s3 +.LBB8_17: # %.thread208 + ld.w $a3, $a2, 176 + ldptr.d $a0, $a2, 14232 + pcalau12i $a4, %pc_hi20(imgY_org) + ld.d $a4, $a4, %pc_lo12(imgY_org) + ld.w $a5, $a2, 196 + add.w $a6, $a3, $s1 + pcalau12i $a3, %got_pc_hi20(enc_picture) + ld.d $a3, $a3, %got_pc_lo12(enc_picture) + ld.d $a3, $a3, 0 + move $s1, $zero + ld.w $a7, $a2, 180 + ldptr.d $a3, $a3, 6440 + ld.d $a2, $sp, 32 # 8-byte Folded Reload + ori $a2, $a2, 7 + addi.w $a2, $a2, 0 + slli.d $a7, $a7, 3 + alsl.d $a7, $s6, $a7, 3 + add.d $a3, $a3, $a7 + slli.d $a5, $a5, 3 + alsl.d $a5, $s6, $a5, 3 + add.d $a4, $a4, $a5 + addi.d $a5, $s6, -1 + slli.d $a6, $a6, 1 + .p2align 4, , 16 +.LBB8_18: # =>This Inner Loop Header: Depth=1 + ld.d $a7, $a4, 0 + ld.d $t0, $a3, 0 + vldx $vr0, $a7, $a6 + vldx $vr1, $t0, $a6 + vsllwil.wu.hu $vr2, $vr0, 0 + vbsrl.v $vr0, $vr0, 8 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.wu.hu $vr3, $vr1, 0 + vbsrl.v $vr1, 
$vr1, 8 + vsllwil.wu.hu $vr1, $vr1, 0 + vsub.w $vr0, $vr0, $vr1 + vsub.w $vr1, $vr2, $vr3 + vpickve2gr.w $a7, $vr1, 0 + slli.d $a7, $a7, 2 + ldx.w $a7, $a0, $a7 + vpickve2gr.w $t0, $vr1, 1 + slli.d $t0, $t0, 2 + ldx.w $t0, $a0, $t0 + vpickve2gr.w $t1, $vr1, 2 + slli.d $t1, $t1, 2 + ldx.w $t1, $a0, $t1 + vpickve2gr.w $t2, $vr1, 3 + slli.d $t2, $t2, 2 + ldx.w $t2, $a0, $t2 + add.d $a7, $s1, $a7 + add.d $a7, $a7, $t0 + add.d $a7, $a7, $t1 + add.d $a7, $a7, $t2 + vpickve2gr.w $t0, $vr0, 0 + slli.d $t0, $t0, 2 + ldx.w $t0, $a0, $t0 + vpickve2gr.w $t1, $vr0, 1 + slli.d $t1, $t1, 2 + ldx.w $t1, $a0, $t1 + vpickve2gr.w $t2, $vr0, 2 + slli.d $t2, $t2, 2 + ldx.w $t2, $a0, $t2 + vpickve2gr.w $t3, $vr0, 3 + slli.d $t3, $t3, 2 + ldx.w $t3, $a0, $t3 + add.d $a7, $a7, $t0 + add.d $a7, $a7, $t1 + add.d $a7, $a7, $t2 + add.d $s1, $a7, $t3 + addi.d $a3, $a3, 8 + addi.d $a5, $a5, 1 + addi.d $a4, $a4, 8 + blt $a5, $a2, .LBB8_18 +.LBB8_19: + ld.d $s6, $sp, 72 # 8-byte Folded Reload + ld.d $s7, $sp, 56 # 8-byte Folded Reload + ld.d $s3, $sp, 48 # 8-byte Folded Reload ldptr.w $a1, $a1, 4008 - ld.d $a0, $sp, 8 # 8-byte Folded Reload - bnez $a1, .LBB8_14 -.LBB8_23: + beqz $a1, .LBB8_21 +# %bb.20: + st.w $s8, $sp, 124 + ori $a0, $zero, 2 + ld.d $a1, $sp, 80 # 8-byte Folded Reload + ld.w $a1, $a1, 8 + ld.d $a2, $s7, 24 + st.w $a0, $sp, 120 + ori $a0, $zero, 104 + mul.d $a0, $a1, $a0 + add.d $a1, $a2, $a0 + pcalau12i $a0, %got_pc_hi20(writeB8_typeInfo) + ld.d $a0, $a0, %got_pc_lo12(writeB8_typeInfo) + ld.d $a2, $a0, 0 + addi.d $a0, $sp, 120 + jirl $ra, $a2, 0 + ld.w $s2, $sp, 132 + ld.d $a3, $sp, 64 # 8-byte Folded Reload + beqz $s3, .LBB8_22 + b .LBB8_32 +.LBB8_21: addi.d $a2, $sp, 168 addi.d $a3, $sp, 172 + move $a0, $s8 move $a1, $zero pcaddu18i $ra, %call36(ue_linfo) jirl $ra, $ra, 0 ld.w $s2, $sp, 168 - ld.d $a3, $sp, 48 # 8-byte Folded Reload - bnez $s3, .LBB8_36 -.LBB8_24: + ld.d $a3, $sp, 64 # 8-byte Folded Reload + bnez $s3, .LBB8_32 +.LBB8_22: ld.d $a1, $s5, 0 lu12i.w $a0, 15 move $a2, $a3 bstrins.d $a2, $zero, 1, 1 ori $s3, $a0, 4093 - bnez $a2, .LBB8_29 -# %bb.25: + bnez $a2, .LBB8_25 +# %bb.23: ldptr.w $a0, $a1, 14456 ori $a2, $zero, 2 - blt $a0, $a2, .LBB8_29 -# %bb.26: - move $a2, $s4 + blt $a0, $a2, .LBB8_25 +# %bb.24: move $s4, $a3 ori $a3, $zero, 1 move $a0, $fp - ld.d $a1, $sp, 16 # 8-byte Folded Reload - move $s8, $a2 - ld.d $a4, $sp, 80 # 8-byte Folded Reload + ld.d $a1, $sp, 24 # 8-byte Folded Reload + ld.d $a2, $sp, 16 # 8-byte Folded Reload + ld.d $a4, $sp, 104 # 8-byte Folded Reload pcaddu18i $ra, %call36(writeReferenceFrame) jirl $ra, $ra, 0 move $a3, $s4 ld.d $a1, $s5, 0 add.w $s2, $a0, $s2 +.LBB8_25: ldptr.w $a0, $a1, 14460 ori $a2, $zero, 2 and $s3, $a3, $s3 - bge $a0, $a2, .LBB8_30 -.LBB8_27: - ld.d $s5, $sp, 16 # 8-byte Folded Reload - move $s4, $s8 - ld.d $a4, $sp, 80 # 8-byte Folded Reload - bnez $s3, .LBB8_34 -.LBB8_28: - addi.d $a2, $s5, 2 - move $s3, $a3 - addi.d $a3, $s4, 2 - move $a0, $s5 - move $a1, $s4 - move $a5, $zero - move $a6, $fp - pcaddu18i $ra, %call36(writeMotionVector8x8) - jirl $ra, $ra, 0 - move $a3, $s3 - add.w $s2, $a0, $s2 - addi.d $a0, $a3, -1 - bstrpick.d $a0, $a0, 15, 0 - ori $a1, $zero, 1 - bgeu $a1, $a0, .LBB8_35 - b .LBB8_36 -.LBB8_29: - move $s8, $s4 - ldptr.w $a0, $a1, 14460 - ori $a2, $zero, 2 - and $s3, $a3, $s3 - blt $a0, $a2, .LBB8_27 -.LBB8_30: + blt $a0, $a2, .LBB8_39 +# %bb.26: addi.d $a0, $a3, -1 bstrpick.d $a2, $a0, 15, 0 ori $a0, $zero, 1 - ld.d $s5, $sp, 16 # 8-byte Folded Reload - move $s4, $s8 - bltu $a0, $a2, .LBB8_33 -# %bb.31: + ld.d $s5, 
$sp, 24 # 8-byte Folded Reload + bltu $a0, $a2, .LBB8_29 +# %bb.27: ld.w $a1, $a1, 20 - bne $a1, $a0, .LBB8_33 -# %bb.32: + bne $a1, $a0, .LBB8_29 +# %bb.28: move $a0, $fp move $a1, $s5 - move $a2, $s4 - move $s8, $s4 + ld.d $a2, $sp, 16 # 8-byte Folded Reload move $s4, $a3 move $a3, $zero - ld.d $a4, $sp, 72 # 8-byte Folded Reload + ld.d $a4, $sp, 88 # 8-byte Folded Reload pcaddu18i $ra, %call36(writeReferenceFrame) jirl $ra, $ra, 0 move $a3, $s4 - move $s4, $s8 add.w $s2, $a0, $s2 -.LBB8_33: - ld.d $a4, $sp, 80 # 8-byte Folded Reload - beqz $s3, .LBB8_28 -.LBB8_34: +.LBB8_29: + beqz $s3, .LBB8_40 +.LBB8_30: addi.d $a0, $a3, -1 bstrpick.d $a0, $a0, 15, 0 ori $a1, $zero, 1 - bltu $a1, $a0, .LBB8_36 -.LBB8_35: + bltu $a1, $a0, .LBB8_32 +.LBB8_31: addi.d $a2, $s5, 2 - addi.d $a3, $s4, 2 + ld.d $a1, $sp, 16 # 8-byte Folded Reload + addi.d $a3, $a1, 2 ori $a5, $zero, 1 move $a0, $s5 - move $a1, $s4 - ld.d $a4, $sp, 72 # 8-byte Folded Reload + ld.d $a4, $sp, 88 # 8-byte Folded Reload move $a6, $fp pcaddu18i $ra, %call36(writeMotionVector8x8) jirl $ra, $ra, 0 add.w $s2, $a0, $s2 -.LBB8_36: +.LBB8_32: ld.d $a0, $sp, 112 # 8-byte Folded Reload ld.d $a0, $a0, 0 ldptr.w $a0, $a0, 4008 ori $a1, $zero, 1 - bne $a0, $a1, .LBB8_38 -# %bb.37: - ld.d $a0, $sp, 64 # 8-byte Folded Reload + bne $a0, $a1, .LBB8_34 +# %bb.33: + ld.d $a0, $sp, 80 # 8-byte Folded Reload ld.w $a0, $a0, 24 ld.d $a1, $s7, 24 ori $a2, $zero, 104 @@ -4669,7 +4616,7 @@ RDCost_for_8x8blocks: # @RDCost_for_8x8blocks move $a0, $s3 pcaddu18i $ra, %call36(arienco_bits_written) jirl $ra, $ra, 0 - ld.d $s4, $sp, 88 # 8-byte Folded Reload + ld.d $s4, $sp, 96 # 8-byte Folded Reload ld.w $a1, $s4, 0 pcalau12i $a2, %pc_hi20(cbp8x8) ld.w $a2, $a2, %pc_lo12(cbp8x8) @@ -4689,26 +4636,30 @@ RDCost_for_8x8blocks: # @RDCost_for_8x8blocks st.w $a0, $sp, 168 add.w $s2, $a0, $s2 ld.w $a0, $s4, 0 - bnez $a0, .LBB8_39 - b .LBB8_40 -.LBB8_38: - ld.d $s4, $sp, 88 # 8-byte Folded Reload + bnez $a0, .LBB8_35 + b .LBB8_36 +.LBB8_34: + ld.d $s4, $sp, 96 # 8-byte Folded Reload ld.w $a0, $s4, 0 - beqz $a0, .LBB8_40 -.LBB8_39: + beqz $a0, .LBB8_36 +.LBB8_35: ld.w $a2, $s6, 472 move $a0, $s0 move $a1, $fp pcaddu18i $ra, %call36(writeLumaCoeff8x8) jirl $ra, $ra, 0 add.w $s2, $a0, $s2 -.LBB8_40: +.LBB8_36: movgr2fr.d $fa0, $s1 ffint.d.l $fa0, $fa0 movgr2fr.w $fa1, $s2 ffint.d.w $fa1, $fa1 fmadd.d $fa0, $fs0, $fa1, $fa0 -.LBB8_41: + b .LBB8_38 +.LBB8_37: + pcalau12i $a0, %pc_hi20(.LCPI8_0) + fld.d $fa0, $a0, %pc_lo12(.LCPI8_0) +.LBB8_38: fld.d $fs0, $sp, 176 # 8-byte Folded Reload ld.d $s8, $sp, 184 # 8-byte Folded Reload ld.d $s7, $sp, 192 # 8-byte Folded Reload @@ -4723,6 +4674,32 @@ RDCost_for_8x8blocks: # @RDCost_for_8x8blocks ld.d $ra, $sp, 264 # 8-byte Folded Reload addi.d $sp, $sp, 272 ret +.LBB8_39: + ld.d $s5, $sp, 24 # 8-byte Folded Reload + bnez $s3, .LBB8_30 +.LBB8_40: + addi.d $a2, $s5, 2 + move $s3, $a3 + ld.d $a1, $sp, 16 # 8-byte Folded Reload + addi.d $a3, $a1, 2 + move $a0, $s5 + ld.d $a4, $sp, 104 # 8-byte Folded Reload + move $a5, $zero + move $a6, $fp + pcaddu18i $ra, %call36(writeMotionVector8x8) + jirl $ra, $ra, 0 + move $a3, $s3 + add.w $s2, $a0, $s2 + addi.d $a0, $a3, -1 + bstrpick.d $a0, $a0, 15, 0 + ori $a1, $zero, 1 + bgeu $a1, $a0, .LBB8_31 + b .LBB8_32 +.LBB8_41: # %.preheader.._crit_edge_crit_edge + move $s4, $zero +.LBB8_42: # %._crit_edge + div.d $s1, $s4, $a0 + b .LBB8_19 .Lfunc_end8: .size RDCost_for_8x8blocks, .Lfunc_end8-RDCost_for_8x8blocks # -- End function @@ -13397,8 +13374,8 @@ update_offset_params: # @update_offset_params ori 
$t4, $zero, 4 or $t3, $t3, $t0 st.d $a3, $sp, 24 # 8-byte Folded Spill - st.d $a1, $sp, 16 # 8-byte Folded Spill - st.d $a4, $sp, 8 # 8-byte Folded Spill + st.d $a4, $sp, 16 # 8-byte Folded Spill + st.d $a1, $sp, 8 # 8-byte Folded Spill st.d $a5, $sp, 32 # 8-byte Folded Spill bltu $t4, $a0, .LBB21_2 # %bb.1: # %switch.lookup @@ -13676,10 +13653,10 @@ update_offset_params: # @update_offset_params pcalau12i $a3, %pc_hi20(AdaptRndCrPos) addi.d $a3, $a3, %pc_lo12(AdaptRndCrPos) add.d $a1, $a3, $a1 - ld.d $a3, $sp, 8 # 8-byte Folded Reload + ld.d $a3, $sp, 16 # 8-byte Folded Reload ldx.w $a1, $a1, $a3 move $a3, $zero - ld.d $a6, $sp, 16 # 8-byte Folded Reload + ld.d $a6, $sp, 8 # 8-byte Folded Reload alsl.d $a4, $a1, $a6, 3 slli.d $a1, $a1, 3 ldx.d $a1, $a6, $a1 @@ -13690,34 +13667,33 @@ update_offset_params: # @update_offset_params alsl.d $t1, $a5, $a4, 1 vreplgr2vr.w $vr0, $a2 ori $t2, $zero, 4 - vrepli.b $vr1, 0 b .LBB21_11 .p2align 4, , 16 .LBB21_9: # %vector.body # in Loop: Header=BB21_11 Depth=1 slli.d $t6, $t3, 1 ldx.d $t7, $a1, $t6 - vld $vr2, $t5, 0 + vld $vr1, $t5, 0 alsl.d $t5, $t3, $a1, 1 - vinsgr2vr.d $vr3, $t7, 0 - vpickev.h $vr2, $vr2, $vr2 - vadd.h $vr2, $vr3, $vr2 - vmaxi.h $vr2, $vr2, 0 - vilvl.h $vr2, $vr1, $vr2 - vmin.w $vr2, $vr2, $vr0 - vpickev.h $vr2, $vr2, $vr2 + vinsgr2vr.d $vr2, $t7, 0 + vpickev.h $vr1, $vr1, $vr1 + vadd.h $vr1, $vr2, $vr1 + vmaxi.h $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vmin.w $vr1, $vr1, $vr0 + vpickev.h $vr1, $vr1, $vr1 ldx.d $t6, $a4, $t6 - vld $vr3, $t4, 0 - vstelm.d $vr2, $t5, 0, 0 + vld $vr2, $t4, 0 + vstelm.d $vr1, $t5, 0, 0 alsl.d $t3, $t3, $a4, 1 - vinsgr2vr.d $vr2, $t6, 0 - vpickev.h $vr3, $vr3, $vr3 - vadd.h $vr2, $vr2, $vr3 - vmaxi.h $vr2, $vr2, 0 - vilvl.h $vr2, $vr1, $vr2 - vmin.w $vr2, $vr2, $vr0 + vinsgr2vr.d $vr1, $t6, 0 vpickev.h $vr2, $vr2, $vr2 - vstelm.d $vr2, $t3, 0, 0 + vadd.h $vr1, $vr1, $vr2 + vmaxi.h $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vmin.w $vr1, $vr1, $vr0 + vpickev.h $vr1, $vr1, $vr1 + vstelm.d $vr1, $t3, 0, 0 .LBB21_10: # %._crit_edge.us # in Loop: Header=BB21_11 Depth=1 addi.d $a3, $a3, 1 @@ -15918,21 +15894,21 @@ GetBestTransformP8x8: # @GetBestTransformP8x8 ori $a0, $zero, 1 beq $a1, $a2, .LBB26_8 # %bb.1: # %.preheader53.preheader - addi.d $sp, $sp, -272 - st.d $ra, $sp, 264 # 8-byte Folded Spill - st.d $fp, $sp, 256 # 8-byte Folded Spill - st.d $s0, $sp, 248 # 8-byte Folded Spill - st.d $s1, $sp, 240 # 8-byte Folded Spill - st.d $s2, $sp, 232 # 8-byte Folded Spill - st.d $s3, $sp, 224 # 8-byte Folded Spill - st.d $s4, $sp, 216 # 8-byte Folded Spill - st.d $s5, $sp, 208 # 8-byte Folded Spill - st.d $s6, $sp, 200 # 8-byte Folded Spill - st.d $s7, $sp, 192 # 8-byte Folded Spill - st.d $s8, $sp, 184 # 8-byte Folded Spill + addi.d $sp, $sp, -256 + st.d $ra, $sp, 248 # 8-byte Folded Spill + st.d $fp, $sp, 240 # 8-byte Folded Spill + st.d $s0, $sp, 232 # 8-byte Folded Spill + st.d $s1, $sp, 224 # 8-byte Folded Spill + st.d $s2, $sp, 216 # 8-byte Folded Spill + st.d $s3, $sp, 208 # 8-byte Folded Spill + st.d $s4, $sp, 200 # 8-byte Folded Spill + st.d $s5, $sp, 192 # 8-byte Folded Spill + st.d $s6, $sp, 184 # 8-byte Folded Spill + st.d $s7, $sp, 176 # 8-byte Folded Spill + st.d $s8, $sp, 168 # 8-byte Folded Spill pcalau12i $a0, %got_pc_hi20(img) ld.d $a0, $a0, %got_pc_lo12(img) - st.d $a0, $sp, 176 # 8-byte Folded Spill + st.d $a0, $sp, 160 # 8-byte Folded Spill pcalau12i $a0, %pc_hi20(tr8x8) addi.d $t3, $a0, %pc_lo12(tr8x8) lu12i.w $a0, 1 @@ -15950,7 +15926,6 @@ GetBestTransformP8x8: # @GetBestTransformP8x8 
move $s8, $zero move $a0, $zero move $a3, $zero - vrepli.b $vr2, 0 ori $t0, $zero, 0 ori $t1, $zero, 0 lu32i.d $t1, 4 @@ -15959,13 +15934,12 @@ GetBestTransformP8x8: # @GetBestTransformP8x8 lu32i.d $t0, 12 st.d $t3, $sp, 64 # 8-byte Folded Spill st.d $t4, $sp, 56 # 8-byte Folded Spill - vst $vr2, $sp, 144 # 16-byte Folded Spill st.d $a5, $sp, 136 # 8-byte Folded Spill st.d $a6, $sp, 120 # 8-byte Folded Spill - st.d $t1, $sp, 112 # 8-byte Folded Spill - st.d $a7, $sp, 104 # 8-byte Folded Spill - st.d $t0, $sp, 96 # 8-byte Folded Spill - st.d $t2, $sp, 88 # 8-byte Folded Spill + st.d $t0, $sp, 112 # 8-byte Folded Spill + st.d $t2, $sp, 104 # 8-byte Folded Spill + st.d $a7, $sp, 96 # 8-byte Folded Spill + st.d $t1, $sp, 88 # 8-byte Folded Spill .p2align 4, , 16 .LBB26_2: # %.preheader53 # =>This Loop Header: Depth=1 @@ -15973,7 +15947,7 @@ GetBestTransformP8x8: # @GetBestTransformP8x8 # Child Loop BB26_5 Depth 2 st.d $a0, $sp, 40 # 8-byte Folded Spill move $fp, $zero - ld.d $a0, $sp, 176 # 8-byte Folded Reload + ld.d $a0, $sp, 160 # 8-byte Folded Reload ld.d $a0, $a0, 0 st.d $a1, $sp, 48 # 8-byte Folded Spill slli.d $a1, $a1, 3 @@ -15995,23 +15969,23 @@ GetBestTransformP8x8: # @GetBestTransformP8x8 add.d $s1, $a0, $a2 add.d $a0, $t4, $a1 add.d $s2, $a0, $a2 - st.d $a4, $sp, 160 # 8-byte Folded Spill + st.d $a4, $sp, 144 # 8-byte Folded Spill slli.d $a0, $a4, 3 st.d $a0, $sp, 80 # 8-byte Folded Spill - st.d $s4, $sp, 168 # 8-byte Folded Spill + st.d $s4, $sp, 152 # 8-byte Folded Spill st.d $a3, $sp, 128 # 8-byte Folded Spill .p2align 4, , 16 .LBB26_3: # %.preheader # Parent Loop BB26_2 Depth=1 # => This Inner Loop Header: Depth=2 - ld.d $a0, $sp, 176 # 8-byte Folded Reload + ld.d $a0, $sp, 160 # 8-byte Folded Reload ld.d $a0, $a0, 0 ld.d $a1, $a5, %pc_lo12(imgY_org) ld.w $a0, $a0, 192 ld.d $a2, $sp, 80 # 8-byte Folded Reload ldx.d $a3, $a1, $a2 add.d $a0, $s4, $a0 - ld.d $a2, $sp, 160 # 8-byte Folded Reload + ld.d $a2, $sp, 144 # 8-byte Folded Reload alsl.d $a2, $a2, $a1, 3 slli.d $a1, $a0, 1 ldx.d $a0, $a3, $a1 @@ -16019,15 +15993,15 @@ GetBestTransformP8x8: # @GetBestTransformP8x8 ld.d $a4, $s2, -96 slli.d $a5, $fp, 2 vinsgr2vr.d $vr0, $a0, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a4, 0 ld.d $a0, $s1, -96 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr1, $vr0, $vr1 vstx $vr1, $a6, $a5 vinsgr2vr.d $vr1, $a0, 0 ld.d $a0, $a2, 8 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 vstx $vr0, $a7, $a5 ldx.d $a0, $a0, $a1 @@ -16035,9 +16009,9 @@ GetBestTransformP8x8: # @GetBestTransformP8x8 ld.d $a5, $s2, -64 srai.d $a4, $a4, 30 vinsgr2vr.d $vr0, $a0, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a5, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 ld.d $a0, $s1, -64 vsub.w $vr1, $vr0, $vr1 ld.d $a5, $a2, 16 @@ -16045,21 +16019,21 @@ GetBestTransformP8x8: # @GetBestTransformP8x8 vinsgr2vr.d $vr1, $a0, 0 alsl.d $a0, $fp, $a6, 2 ldx.d $a5, $a5, $a1 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 vstx $vr0, $a7, $a4 vinsgr2vr.d $vr0, $a5, 0 ld.d $a4, $s2, -32 add.d $a5, $a3, $t2 srai.d $a5, $a5, 30 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a4, 0 ld.d $a4, $s1, -32 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr1, $vr0, $vr1 vstx $vr1, $a6, $a5 vinsgr2vr.d $vr1, $a4, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 ld.d $a2, $a2, 24 vsub.w $vr0, $vr0, $vr1 vstx $vr0, $a7, $a5 @@ 
-16069,14 +16043,14 @@ GetBestTransformP8x8: # @GetBestTransformP8x8 ld.d $a4, $s2, 0 srai.d $a3, $a3, 30 vinsgr2vr.d $vr0, $a1, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a4, 0 ld.d $a1, $s1, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr1, $vr0, $vr1 vstx $vr1, $a6, $a3 vinsgr2vr.d $vr1, $a1, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 vstx $vr0, $a7, $a3 addi.w $fp, $a2, 4 @@ -16091,7 +16065,6 @@ GetBestTransformP8x8: # @GetBestTransformP8x8 move $t2, $s6 move $t1, $s3 move $t0, $s0 - vld $vr2, $sp, 144 # 16-byte Folded Reload move $a7, $s5 move $a6, $s7 ld.d $a5, $sp, 136 # 8-byte Folded Reload @@ -16101,7 +16074,7 @@ GetBestTransformP8x8: # @GetBestTransformP8x8 addi.d $s2, $s2, 8 bltu $s4, $a3, .LBB26_3 # %bb.4: # in Loop: Header=BB26_2 Depth=1 - ld.d $a0, $sp, 176 # 8-byte Folded Reload + ld.d $a0, $sp, 160 # 8-byte Folded Reload ld.d $a0, $a0, 0 ld.w $a0, $a0, 196 ld.d $a2, $sp, 72 # 8-byte Folded Reload @@ -16127,15 +16100,15 @@ GetBestTransformP8x8: # @GetBestTransformP8x8 slli.d $a0, $a1, 5 add.d $s5, $a4, $a0 add.d $s4, $a3, $a0 - ld.d $a4, $sp, 168 # 8-byte Folded Reload - st.d $s8, $sp, 160 # 8-byte Folded Spill + ld.d $a4, $sp, 152 # 8-byte Folded Reload + st.d $s8, $sp, 144 # 8-byte Folded Spill ld.d $s8, $sp, 24 # 8-byte Folded Reload .p2align 4, , 16 .LBB26_5: # %.preheader.1 # Parent Loop BB26_2 Depth=1 # => This Inner Loop Header: Depth=2 - st.d $a4, $sp, 168 # 8-byte Folded Spill - ld.d $a0, $sp, 176 # 8-byte Folded Reload + st.d $a4, $sp, 152 # 8-byte Folded Spill + ld.d $a0, $sp, 160 # 8-byte Folded Reload ld.d $a0, $a0, 0 ld.d $a1, $sp, 136 # 8-byte Folded Reload ld.d $a1, $a1, %pc_lo12(imgY_org) @@ -16151,29 +16124,28 @@ GetBestTransformP8x8: # @GetBestTransformP8x8 ldx.d $a4, $s4, $s8 slli.d $a5, $fp, 2 vinsgr2vr.d $vr0, $a0, 0 - vld $vr2, $sp, 144 # 16-byte Folded Reload - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a4, 0 ldx.d $a0, $s5, $s8 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr1, $vr0, $vr1 ld.d $a6, $sp, 120 # 8-byte Folded Reload vstx $vr1, $a6, $a5 vinsgr2vr.d $vr1, $a0, 0 ld.d $a0, $a2, 8 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 - ld.d $a7, $sp, 104 # 8-byte Folded Reload + ld.d $a7, $sp, 96 # 8-byte Folded Reload vstx $vr0, $a7, $a5 ldx.d $a0, $a0, $a1 - ld.d $a4, $sp, 112 # 8-byte Folded Reload + ld.d $a4, $sp, 88 # 8-byte Folded Reload add.d $a4, $a3, $a4 ldx.d $a5, $s2, $s8 srai.d $a4, $a4, 30 vinsgr2vr.d $vr0, $a0, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a5, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 ldx.d $a0, $s6, $s8 vsub.w $vr1, $vr0, $vr1 ld.d $a5, $a2, 16 @@ -16181,49 +16153,49 @@ GetBestTransformP8x8: # @GetBestTransformP8x8 vinsgr2vr.d $vr1, $a0, 0 alsl.d $a0, $fp, $a6, 2 ldx.d $a5, $a5, $a1 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 vstx $vr0, $a7, $a4 vinsgr2vr.d $vr0, $a5, 0 ldx.d $a4, $s3, $s8 - ld.d $a5, $sp, 88 # 8-byte Folded Reload + ld.d $a5, $sp, 104 # 8-byte Folded Reload add.d $a5, $a3, $a5 srai.d $a5, $a5, 30 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a4, 0 ldx.d $a4, $s7, $s8 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr1, $vr0, $vr1 vstx $vr1, $a6, $a5 vinsgr2vr.d $vr1, $a4, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 ld.d $a2, $a2, 24 vsub.w $vr0, $vr0, $vr1 vstx $vr0, $a7, $a5 
- ld.d $a4, $sp, 96 # 8-byte Folded Reload + ld.d $a4, $sp, 112 # 8-byte Folded Reload add.d $a3, $a3, $a4 ldx.d $a1, $a2, $a1 srai.d $a2, $a3, 32 ldx.d $a4, $s1, $s8 srai.d $a3, $a3, 30 vinsgr2vr.d $vr0, $a1, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a4, 0 ldx.d $a1, $s0, $s8 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr1, $vr0, $vr1 vstx $vr1, $a6, $a3 vinsgr2vr.d $vr1, $a1, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 vstx $vr0, $a7, $a3 addi.w $fp, $a2, 4 pcaddu18i $ra, %call36(distortion4x4) jirl $ra, $ra, 0 - ld.d $a4, $sp, 168 # 8-byte Folded Reload - ld.d $a1, $sp, 160 # 8-byte Folded Reload + ld.d $a4, $sp, 152 # 8-byte Folded Reload + ld.d $a1, $sp, 144 # 8-byte Folded Reload add.w $a1, $a0, $a1 - st.d $a1, $sp, 160 # 8-byte Folded Spill + st.d $a1, $sp, 144 # 8-byte Folded Spill addi.d $a4, $a4, 4 addi.d $s0, $s0, 8 addi.d $s1, $s1, 8 @@ -16252,28 +16224,27 @@ GetBestTransformP8x8: # @GetBestTransformP8x8 ld.d $t4, $sp, 56 # 8-byte Folded Reload ld.d $a5, $sp, 136 # 8-byte Folded Reload ld.d $a6, $sp, 120 # 8-byte Folded Reload - ld.d $a7, $sp, 104 # 8-byte Folded Reload - vld $vr2, $sp, 144 # 16-byte Folded Reload - ld.d $t0, $sp, 96 # 8-byte Folded Reload - ld.d $t1, $sp, 112 # 8-byte Folded Reload - ld.d $t2, $sp, 88 # 8-byte Folded Reload + ld.d $a7, $sp, 96 # 8-byte Folded Reload + ld.d $t0, $sp, 112 # 8-byte Folded Reload + ld.d $t1, $sp, 88 # 8-byte Folded Reload + ld.d $t2, $sp, 104 # 8-byte Folded Reload ori $a4, $zero, 4 - ld.d $s8, $sp, 160 # 8-byte Folded Reload + ld.d $s8, $sp, 144 # 8-byte Folded Reload bne $a3, $a4, .LBB26_2 # %bb.7: slt $a0, $a0, $s8 - ld.d $s8, $sp, 184 # 8-byte Folded Reload - ld.d $s7, $sp, 192 # 8-byte Folded Reload - ld.d $s6, $sp, 200 # 8-byte Folded Reload - ld.d $s5, $sp, 208 # 8-byte Folded Reload - ld.d $s4, $sp, 216 # 8-byte Folded Reload - ld.d $s3, $sp, 224 # 8-byte Folded Reload - ld.d $s2, $sp, 232 # 8-byte Folded Reload - ld.d $s1, $sp, 240 # 8-byte Folded Reload - ld.d $s0, $sp, 248 # 8-byte Folded Reload - ld.d $fp, $sp, 256 # 8-byte Folded Reload - ld.d $ra, $sp, 264 # 8-byte Folded Reload - addi.d $sp, $sp, 272 + ld.d $s8, $sp, 168 # 8-byte Folded Reload + ld.d $s7, $sp, 176 # 8-byte Folded Reload + ld.d $s6, $sp, 184 # 8-byte Folded Reload + ld.d $s5, $sp, 192 # 8-byte Folded Reload + ld.d $s4, $sp, 200 # 8-byte Folded Reload + ld.d $s3, $sp, 208 # 8-byte Folded Reload + ld.d $s2, $sp, 216 # 8-byte Folded Reload + ld.d $s1, $sp, 224 # 8-byte Folded Reload + ld.d $s0, $sp, 232 # 8-byte Folded Reload + ld.d $fp, $sp, 240 # 8-byte Folded Reload + ld.d $ra, $sp, 248 # 8-byte Folded Reload + addi.d $sp, $sp, 256 .LBB26_8: ret .Lfunc_end26: diff --git a/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/transform8x8.s b/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/transform8x8.s index e1e075e4..7cd168ff 100644 --- a/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/transform8x8.s +++ b/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/transform8x8.s @@ -105,11 +105,11 @@ Mode_Decision_for_new_8x8IntraBlocks: # @Mode_Decision_for_new_8x8IntraBlocks fst.d $fs0, $sp, 1936 # 8-byte Folded Spill fst.d $fs1, $sp, 1928 # 8-byte Folded Spill fst.d $fs2, $sp, 1920 # 8-byte Folded Spill - addi.d $sp, $sp, -544 + addi.d $sp, $sp, -528 st.d $a1, $sp, 152 # 8-byte Folded Spill fmov.d $fs0, $fa0 andi $a4, $a0, 1 - st.d $a0, $sp, 208 # 8-byte Folded Spill + st.d $a0, $sp, 192 # 8-byte 
Folded Spill slli.d $s2, $a0, 2 slli.d $fp, $a4, 3 st.d $s2, $sp, 72 # 8-byte Folded Spill @@ -136,7 +136,7 @@ Mode_Decision_for_new_8x8IntraBlocks: # @Mode_Decision_for_new_8x8IntraBlocks st.d $a3, $sp, 144 # 8-byte Folded Spill bstrpick.d $s7, $a3, 62, 61 addi.d $a1, $fp, -1 - addi.d $a3, $sp, 244 + addi.d $a3, $sp, 228 move $a0, $s4 st.d $a2, $sp, 168 # 8-byte Folded Spill pcaddu18i $ra, %call36(getLuma4x4Neighbour) @@ -144,56 +144,56 @@ Mode_Decision_for_new_8x8IntraBlocks: # @Mode_Decision_for_new_8x8IntraBlocks ld.d $a0, $s8, 0 ld.w $a0, $a0, 12 addi.w $a2, $s2, -1 - addi.d $a3, $sp, 220 + addi.d $a3, $sp, 204 move $a1, $fp pcaddu18i $ra, %call36(getLuma4x4Neighbour) jirl $ra, $ra, 0 pcalau12i $a0, %got_pc_hi20(input) ld.d $a0, $a0, %got_pc_lo12(input) - st.d $a0, $sp, 200 # 8-byte Folded Spill + st.d $a0, $sp, 184 # 8-byte Folded Spill ld.d $a0, $a0, 0 ld.w $a0, $a0, 272 beqz $a0, .LBB1_7 # %bb.1: - ld.w $a0, $sp, 220 + ld.w $a0, $sp, 204 beqz $a0, .LBB1_4 # %bb.2: ld.d $a0, $s8, 0 - ld.w $a1, $sp, 224 + ld.w $a1, $sp, 208 ldptr.d $a0, $a0, 14240 slli.d $a1, $a1, 2 ldx.w $a0, $a0, $a1 - ld.w $a1, $sp, 244 - st.w $a0, $sp, 220 + ld.w $a1, $sp, 228 + st.w $a0, $sp, 204 beqz $a1, .LBB1_5 .LBB1_3: ld.d $a0, $s8, 0 - ld.w $a1, $sp, 248 + ld.w $a1, $sp, 232 ldptr.d $a0, $a0, 14240 slli.d $a1, $a1, 2 ldx.w $a0, $a0, $a1 b .LBB1_6 .LBB1_4: move $a0, $zero - ld.w $a1, $sp, 244 - st.w $a0, $sp, 220 + ld.w $a1, $sp, 228 + st.w $a0, $sp, 204 bnez $a1, .LBB1_3 .LBB1_5: move $a0, $zero .LBB1_6: - st.w $a0, $sp, 244 + st.w $a0, $sp, 228 .LBB1_7: add.w $a0, $s5, $s6 ld.d $a1, $sp, 144 # 8-byte Folded Reload add.w $a1, $a1, $s7 - ld.w $a5, $sp, 220 + ld.w $a5, $sp, 204 ori $a4, $zero, 536 add.w $s2, $s3, $s2 ori $a2, $zero, 255 ori $a3, $zero, 255 beqz $a5, .LBB1_9 # %bb.8: # %.sink.split - ld.d $a3, $sp, 208 # 8-byte Folded Reload + ld.d $a3, $sp, 192 # 8-byte Folded Reload sltui $a3, $a3, 2 ori $a5, $zero, 136 masknez $a5, $a5, $a3 @@ -201,14 +201,14 @@ Mode_Decision_for_new_8x8IntraBlocks: # @Mode_Decision_for_new_8x8IntraBlocks ld.d $a7, $s8, 0 maskeqz $a3, $a6, $a3 or $a3, $a3, $a5 - ld.w $a5, $sp, 240 + ld.w $a5, $sp, 224 ldx.d $a3, $a7, $a3 slli.d $a5, $a5, 3 ldx.d $a3, $a3, $a5 - ld.w $a5, $sp, 236 + ld.w $a5, $sp, 220 ldx.bu $a3, $a3, $a5 .LBB1_9: - ld.w $a5, $sp, 244 + ld.w $a5, $sp, 228 mul.d $a4, $s4, $a4 ld.d $t0, $sp, 64 # 8-byte Folded Reload beqz $a5, .LBB1_11 @@ -220,11 +220,11 @@ Mode_Decision_for_new_8x8IntraBlocks: # @Mode_Decision_for_new_8x8IntraBlocks ld.d $a7, $s8, 0 maskeqz $a2, $a6, $a2 or $a2, $a2, $a5 - ld.w $a5, $sp, 264 + ld.w $a5, $sp, 248 ldx.d $a2, $a7, $a2 slli.d $a5, $a5, 3 ldx.d $a2, $a2, $a5 - ld.w $a5, $sp, 260 + ld.w $a5, $sp, 244 ldx.bu $a2, $a2, $a5 .LBB1_11: alsl.w $s5, $t0, $s0, 3 @@ -248,14 +248,14 @@ Mode_Decision_for_new_8x8IntraBlocks: # @Mode_Decision_for_new_8x8IntraBlocks ori $a2, $zero, 2 maskeqz $a0, $a2, $a0 or $a0, $a0, $a1 - st.d $a0, $sp, 192 # 8-byte Folded Spill + st.d $a0, $sp, 176 # 8-byte Folded Spill lu12i.w $a0, 524287 ori $a0, $a0, 4095 ld.d $a1, $sp, 152 # 8-byte Folded Reload st.w $a0, $a1, 0 - addi.d $a2, $sp, 276 - addi.d $a3, $sp, 272 - addi.d $a4, $sp, 268 + addi.d $a2, $sp, 260 + addi.d $a3, $sp, 256 + addi.d $a4, $sp, 252 ld.d $a0, $sp, 128 # 8-byte Folded Reload ld.d $a1, $sp, 144 # 8-byte Folded Reload pcaddu18i $ra, %call36(intrapred_luma8x8) @@ -263,12 +263,12 @@ Mode_Decision_for_new_8x8IntraBlocks: # @Mode_Decision_for_new_8x8IntraBlocks move $s7, $zero move $s6, $zero st.d $zero, $sp, 120 # 8-byte Folded Spill - ld.w $a0, 
$sp, 272 + ld.w $a0, $sp, 256 st.d $a0, $sp, 160 # 8-byte Folded Spill - ld.w $a0, $sp, 276 - ld.w $a1, $sp, 268 + ld.w $a0, $sp, 260 + ld.w $a1, $sp, 252 st.d $a1, $sp, 104 # 8-byte Folded Spill - addi.d $a1, $sp, 1304 + addi.d $a1, $sp, 1288 alsl.d $a2, $fp, $a1, 2 st.d $a2, $sp, 24 # 8-byte Folded Spill vldi $vr0, -1008 @@ -292,14 +292,12 @@ Mode_Decision_for_new_8x8IntraBlocks: # @Mode_Decision_for_new_8x8IntraBlocks st.d $a2, $sp, 96 # 8-byte Folded Spill st.d $a3, $sp, 16 # 8-byte Folded Spill move $a2, $a3 - ld.d $a3, $sp, 208 # 8-byte Folded Reload + ld.d $a3, $sp, 192 # 8-byte Folded Reload bstrins.d $a2, $a3, 5, 5 add.d $a1, $a1, $a2 st.d $a1, $sp, 80 # 8-byte Folded Spill pcalau12i $a1, %pc_hi20(.LCPI1_0) fld.d $fs2, $a1, %pc_lo12(.LCPI1_0) - vrepli.b $vr0, 0 - vst $vr0, $sp, 176 # 16-byte Folded Spill lu12i.w $s4, 3 sltu $a0, $zero, $a0 st.d $a0, $sp, 112 # 8-byte Folded Spill @@ -307,7 +305,7 @@ Mode_Decision_for_new_8x8IntraBlocks: # @Mode_Decision_for_new_8x8IntraBlocks .p2align 4, , 16 .LBB1_12: # %.loopexit233 # in Loop: Header=BB1_15 Depth=1 - ori $a0, $zero, 2456 + ori $a0, $zero, 2440 add.d $a0, $sp, $a0 ld.w $a0, $a0, 0 st.d $a0, $sp, 120 # 8-byte Folded Spill @@ -336,7 +334,7 @@ Mode_Decision_for_new_8x8IntraBlocks: # @Mode_Decision_for_new_8x8IntraBlocks ld.d $a0, $sp, 160 # 8-byte Folded Reload beqz $a0, .LBB1_25 .LBB1_18: # in Loop: Header=BB1_15 Depth=1 - ld.d $a0, $sp, 200 # 8-byte Folded Reload + ld.d $a0, $sp, 184 # 8-byte Folded Reload ld.d $a0, $a0, 0 ldptr.w $a1, $a0, 4168 ld.d $a3, $s8, 0 @@ -362,19 +360,18 @@ Mode_Decision_for_new_8x8IntraBlocks: # @Mode_Decision_for_new_8x8IntraBlocks ldptr.d $a7, $a6, 7376 alsl.d $a3, $s5, $a3, 1 vinsgr2vr.d $vr0, $a5, 0 - vld $vr2, $sp, 176 # 16-byte Folded Reload - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a7, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 ld.d $a3, $a3, 8 ori $a5, $s4, 848 ldptr.d $a7, $a6, 7384 vstx $vr0, $a4, $a5 vinsgr2vr.d $vr0, $a3, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a7, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 ori $a3, $s4, 864 vstx $vr0, $a4, $a3 @@ -391,18 +388,18 @@ Mode_Decision_for_new_8x8IntraBlocks: # @Mode_Decision_for_new_8x8IntraBlocks ldptr.d $a7, $a6, 7392 alsl.d $a3, $s5, $a3, 1 vinsgr2vr.d $vr0, $a5, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a7, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 ld.d $a3, $a3, 8 ori $a5, $s4, 912 ldptr.d $a7, $a6, 7400 vstx $vr0, $a4, $a5 vinsgr2vr.d $vr0, $a3, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a7, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 ori $a3, $s4, 928 vstx $vr0, $a4, $a3 @@ -419,18 +416,18 @@ Mode_Decision_for_new_8x8IntraBlocks: # @Mode_Decision_for_new_8x8IntraBlocks ldptr.d $a7, $a6, 7408 alsl.d $a3, $s5, $a3, 1 vinsgr2vr.d $vr0, $a5, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a7, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 ld.d $a3, $a3, 8 ori $a5, $s4, 976 ldptr.d $a7, $a6, 7416 vstx $vr0, $a4, $a5 vinsgr2vr.d $vr0, $a3, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a7, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 ori $a3, $s4, 992 vstx $vr0, $a4, $a3 @@ -447,18 +444,18 @@ 
Mode_Decision_for_new_8x8IntraBlocks: # @Mode_Decision_for_new_8x8IntraBlocks ldptr.d $a7, $a6, 7424 alsl.d $a3, $s5, $a3, 1 vinsgr2vr.d $vr0, $a5, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a7, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 ld.d $a3, $a3, 8 ori $a5, $s4, 1040 ldptr.d $a7, $a6, 7432 vstx $vr0, $a4, $a5 vinsgr2vr.d $vr0, $a3, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a7, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 ori $a3, $s4, 1056 vstx $vr0, $a4, $a3 @@ -475,18 +472,18 @@ Mode_Decision_for_new_8x8IntraBlocks: # @Mode_Decision_for_new_8x8IntraBlocks ldptr.d $a7, $a6, 7440 alsl.d $a3, $s5, $a3, 1 vinsgr2vr.d $vr0, $a5, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a7, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 ld.d $a3, $a3, 8 ori $a5, $s4, 1104 ldptr.d $a7, $a6, 7448 vstx $vr0, $a4, $a5 vinsgr2vr.d $vr0, $a3, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a7, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 ori $a3, $s4, 1120 vstx $vr0, $a4, $a3 @@ -503,18 +500,18 @@ Mode_Decision_for_new_8x8IntraBlocks: # @Mode_Decision_for_new_8x8IntraBlocks ldptr.d $a7, $a6, 7456 alsl.d $a3, $s5, $a3, 1 vinsgr2vr.d $vr0, $a5, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a7, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 ld.d $a3, $a3, 8 ori $a5, $s4, 1168 ldptr.d $a7, $a6, 7464 vstx $vr0, $a4, $a5 vinsgr2vr.d $vr0, $a3, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a7, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 ori $a3, $s4, 1184 vstx $vr0, $a4, $a3 @@ -531,18 +528,18 @@ Mode_Decision_for_new_8x8IntraBlocks: # @Mode_Decision_for_new_8x8IntraBlocks ldptr.d $a7, $a6, 7472 alsl.d $a3, $s5, $a3, 1 vinsgr2vr.d $vr0, $a5, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a7, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 ld.d $a3, $a3, 8 ori $a5, $s4, 1232 ldptr.d $a7, $a6, 7480 vstx $vr0, $a4, $a5 vinsgr2vr.d $vr0, $a3, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a7, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 ori $a3, $s4, 1248 vstx $vr0, $a4, $a3 @@ -559,27 +556,27 @@ Mode_Decision_for_new_8x8IntraBlocks: # @Mode_Decision_for_new_8x8IntraBlocks ldptr.d $a4, $a0, 7488 alsl.d $a1, $s5, $a1, 1 vinsgr2vr.d $vr0, $a2, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a4, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 ld.d $a1, $a1, 8 ori $a2, $s4, 1296 ldptr.d $a0, $a0, 7496 vstx $vr0, $a3, $a2 vinsgr2vr.d $vr0, $a1, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a0, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 ori $a0, $s4, 1312 vstx $vr0, $a3, $a0 - ori $a0, $zero, 2456 + ori $a0, $zero, 2440 add.d $a0, $sp, $a0 - ld.d $a1, $sp, 208 # 8-byte Folded Reload + ld.d $a1, $sp, 192 # 8-byte Folded Reload move $a2, $s7 fmov.d $fa0, $fs0 - ld.d $a3, $sp, 192 # 8-byte Folded Reload + ld.d $a3, $sp, 176 # 8-byte Folded Reload pcaddu18i $ra, %call36(RDCost_for_8x8IntraBlocks) jirl $ra, $ra, 0 fcmp.cule.d $fcc0, $fs2, $fa0 
@@ -592,7 +589,7 @@ Mode_Decision_for_new_8x8IntraBlocks: # @Mode_Decision_for_new_8x8IntraBlocks ld.d $a0, $s8, 0 ld.d $a1, $s6, 0 ldptr.d $a0, $a0, 14160 - ld.d $a2, $sp, 208 # 8-byte Folded Reload + ld.d $a2, $sp, 192 # 8-byte Folded Reload slli.d $s3, $a2, 3 ldx.d $a1, $a1, $s3 ldx.d $a0, $a0, $s3 @@ -701,18 +698,18 @@ Mode_Decision_for_new_8x8IntraBlocks: # @Mode_Decision_for_new_8x8IntraBlocks vldx $vr0, $a1, $a3 vldx $vr1, $a2, $a3 ld.d $a1, $a0, 16 - ori $a2, $zero, 2328 + ori $a2, $zero, 2312 add.d $a2, $sp, $a2 vst $vr0, $a2, 0 ld.d $a2, $a0, 24 - ori $a4, $zero, 2344 + ori $a4, $zero, 2328 add.d $a4, $sp, $a4 vst $vr1, $a4, 0 vldx $vr0, $a1, $a3 ld.d $a1, $a0, 32 vldx $vr1, $a2, $a3 ld.d $a2, $a0, 40 - ori $a4, $zero, 2360 + ori $a4, $zero, 2344 add.d $a4, $sp, $a4 vst $vr0, $a4, 0 vldx $vr0, $a1, $a3 @@ -721,21 +718,21 @@ Mode_Decision_for_new_8x8IntraBlocks: # @Mode_Decision_for_new_8x8IntraBlocks ld.d $a2, $a0, 56 ld.d $a0, $s8, 0 vldx $vr3, $a1, $a3 - ori $a1, $zero, 2376 + ori $a1, $zero, 2360 add.d $a1, $sp, $a1 vst $vr1, $a1, 0 vldx $vr1, $a2, $a3 ldptr.w $a1, $a0, 15260 - ori $a2, $zero, 2392 + ori $a2, $zero, 2376 add.d $a2, $sp, $a2 vst $vr0, $a2, 0 - ori $a2, $zero, 2408 + ori $a2, $zero, 2392 add.d $a2, $sp, $a2 vst $vr2, $a2, 0 - ori $a2, $zero, 2424 + ori $a2, $zero, 2408 add.d $a2, $sp, $a2 vst $vr3, $a2, 0 - ori $a2, $zero, 2440 + ori $a2, $zero, 2424 add.d $a2, $sp, $a2 vst $vr1, $a2, 0 beqz $a1, .LBB1_12 @@ -771,10 +768,9 @@ Mode_Decision_for_new_8x8IntraBlocks: # @Mode_Decision_for_new_8x8IntraBlocks ldptr.d $a0, $a1, 7376 alsl.d $a3, $s5, $a4, 1 vinsgr2vr.d $vr0, $a5, 0 - vld $vr2, $sp, 176 # 16-byte Folded Reload - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a0, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 pcalau12i $a0, %pc_hi20(diff64) addi.d $a0, $a0, %pc_lo12(diff64) @@ -782,91 +778,91 @@ Mode_Decision_for_new_8x8IntraBlocks: # @Mode_Decision_for_new_8x8IntraBlocks vst $vr0, $a0, 0 ldptr.d $a4, $a1, 7384 vinsgr2vr.d $vr0, $a3, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 ld.d $a3, $s0, 8 vinsgr2vr.d $vr1, $a4, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 ldx.d $a4, $a3, $a2 vst $vr0, $a0, 16 ldptr.d $a5, $a1, 7392 alsl.d $a3, $s5, $a3, 1 vinsgr2vr.d $vr0, $a4, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a5, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 ld.d $a3, $a3, 8 vsub.w $vr0, $vr0, $vr1 vst $vr0, $a0, 32 ldptr.d $a4, $a1, 7400 vinsgr2vr.d $vr0, $a3, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 ld.d $a3, $s0, 16 vinsgr2vr.d $vr1, $a4, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 ldx.d $a4, $a3, $a2 vst $vr0, $a0, 48 ldptr.d $a5, $a1, 7408 alsl.d $a3, $s5, $a3, 1 vinsgr2vr.d $vr0, $a4, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a5, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 ld.d $a3, $a3, 8 vsub.w $vr0, $vr0, $vr1 vst $vr0, $a0, 64 ldptr.d $a4, $a1, 7416 vinsgr2vr.d $vr0, $a3, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 ld.d $a3, $s0, 24 vinsgr2vr.d $vr1, $a4, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 ldx.d $a4, $a3, $a2 vst $vr0, $a0, 80 ldptr.d $a5, $a1, 7424 alsl.d $a3, $s5, $a3, 1 vinsgr2vr.d $vr0, $a4, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a5, 0 - vilvl.h $vr1, $vr2, $vr1 + 
vsllwil.wu.hu $vr1, $vr1, 0 ld.d $a3, $a3, 8 vsub.w $vr0, $vr0, $vr1 vst $vr0, $a0, 96 ldptr.d $a4, $a1, 7432 vinsgr2vr.d $vr0, $a3, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 ld.d $a3, $s0, 32 vinsgr2vr.d $vr1, $a4, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 ldx.d $a4, $a3, $a2 vst $vr0, $a0, 112 ldptr.d $a5, $a1, 7440 alsl.d $a3, $s5, $a3, 1 vinsgr2vr.d $vr0, $a4, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a5, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 ld.d $a3, $a3, 8 vsub.w $vr0, $vr0, $vr1 vst $vr0, $a0, 128 ldptr.d $a4, $a1, 7448 vinsgr2vr.d $vr0, $a3, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 ld.d $a3, $s0, 40 vinsgr2vr.d $vr1, $a4, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 ldx.d $a4, $a3, $a2 vst $vr0, $a0, 144 ldptr.d $a5, $a1, 7456 alsl.d $a3, $s5, $a3, 1 vinsgr2vr.d $vr0, $a4, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a5, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 ld.d $a3, $a3, 8 vsub.w $vr0, $vr0, $vr1 ldptr.d $a4, $a1, 7464 @@ -875,45 +871,45 @@ Mode_Decision_for_new_8x8IntraBlocks: # @Mode_Decision_for_new_8x8IntraBlocks vst $vr0, $a0, 160 vinsgr2vr.d $vr0, $a4, 0 ldx.d $a3, $a5, $a2 - vilvl.h $vr1, $vr2, $vr1 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr1, $vr1, 0 + vsllwil.wu.hu $vr0, $vr0, 0 vsub.w $vr0, $vr1, $vr0 vinsgr2vr.d $vr1, $a3, 0 ldptr.d $a3, $a1, 7472 vst $vr0, $a0, 176 alsl.d $a4, $s5, $a5, 1 - vilvl.h $vr0, $vr2, $vr1 + vsllwil.wu.hu $vr0, $vr1, 0 vinsgr2vr.d $vr1, $a3, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 ld.d $a3, $a4, 8 vsub.w $vr0, $vr0, $vr1 vst $vr0, $a0, 192 ldptr.d $a4, $a1, 7480 vinsgr2vr.d $vr0, $a3, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 ld.d $a3, $s0, 56 vinsgr2vr.d $vr1, $a4, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 ldx.d $a2, $a3, $a2 vst $vr0, $a0, 208 ldptr.d $a4, $a1, 7488 alsl.d $a3, $s5, $a3, 1 vinsgr2vr.d $vr0, $a2, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a4, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 ld.d $a2, $a3, 8 vsub.w $vr0, $vr0, $vr1 ldptr.d $a1, $a1, 7496 vst $vr0, $a0, 224 vinsgr2vr.d $vr0, $a2, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a1, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 vst $vr0, $a0, 240 - ld.d $a1, $sp, 192 # 8-byte Folded Reload + ld.d $a1, $sp, 176 # 8-byte Folded Reload xor $a1, $s7, $a1 sltui $a1, $a1, 1 ld.d $a2, $sp, 136 # 8-byte Folded Reload @@ -951,7 +947,7 @@ Mode_Decision_for_new_8x8IntraBlocks: # @Mode_Decision_for_new_8x8IntraBlocks addi.w $s2, $s6, 0 ld.d $a1, $sp, 40 # 8-byte Folded Reload stx.b $s6, $a0, $a1 - ld.d $a1, $sp, 192 # 8-byte Folded Reload + ld.d $a1, $sp, 176 # 8-byte Folded Reload xor $a0, $a1, $s2 sltui $a0, $a0, 1 slt $a1, $s2, $a1 @@ -968,7 +964,7 @@ Mode_Decision_for_new_8x8IntraBlocks: # @Mode_Decision_for_new_8x8IntraBlocks st.b $a0, $a1, 348 ld.d $a2, $s8, 0 ld.w $a1, $a2, 164 - ld.d $a3, $sp, 208 # 8-byte Folded Reload + ld.d $a3, $sp, 192 # 8-byte Folded Reload bstrins.d $a3, $zero, 0, 0 alsl.w $a4, $a1, $a3, 2 addi.d $a0, $a3, 2 @@ -996,13 +992,13 @@ Mode_Decision_for_new_8x8IntraBlocks: # @Mode_Decision_for_new_8x8IntraBlocks addi.d $a4, $a4, 8 blt $a3, $a5, .LBB1_29 .LBB1_30: # %._crit_edge - ld.d $a0, $sp, 200 # 8-byte Folded Reload + ld.d $a0, $sp, 
184 # 8-byte Folded Reload ld.d $a0, $a0, 0 ldptr.w $a0, $a0, 4168 beqz $a0, .LBB1_33 # %bb.31: # %.preheader228 ldptr.d $a0, $a2, 14160 - ld.d $a1, $sp, 208 # 8-byte Folded Reload + ld.d $a1, $sp, 192 # 8-byte Folded Reload slli.d $s0, $a1, 3 ldx.d $a0, $a0, $s0 ld.d $a0, $a0, 0 @@ -1232,19 +1228,18 @@ Mode_Decision_for_new_8x8IntraBlocks: # @Mode_Decision_for_new_8x8IntraBlocks ldptr.d $a7, $a6, 7376 alsl.d $a3, $s5, $a3, 1 vinsgr2vr.d $vr0, $a5, 0 - vld $vr2, $sp, 176 # 16-byte Folded Reload - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a7, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 ld.d $a3, $a3, 8 ori $a5, $s4, 848 ldptr.d $a7, $a6, 7384 vstx $vr0, $a4, $a5 vinsgr2vr.d $vr0, $a3, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a7, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 ori $a3, $s4, 864 vstx $vr0, $a4, $a3 @@ -1261,18 +1256,18 @@ Mode_Decision_for_new_8x8IntraBlocks: # @Mode_Decision_for_new_8x8IntraBlocks ldptr.d $a7, $a6, 7392 alsl.d $a3, $s5, $a3, 1 vinsgr2vr.d $vr0, $a5, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a7, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 ld.d $a3, $a3, 8 ori $a5, $s4, 912 ldptr.d $a7, $a6, 7400 vstx $vr0, $a4, $a5 vinsgr2vr.d $vr0, $a3, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a7, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 ori $a3, $s4, 928 vstx $vr0, $a4, $a3 @@ -1289,18 +1284,18 @@ Mode_Decision_for_new_8x8IntraBlocks: # @Mode_Decision_for_new_8x8IntraBlocks ldptr.d $a7, $a6, 7408 alsl.d $a3, $s5, $a3, 1 vinsgr2vr.d $vr0, $a5, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a7, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 ld.d $a3, $a3, 8 ori $a5, $s4, 976 ldptr.d $a7, $a6, 7416 vstx $vr0, $a4, $a5 vinsgr2vr.d $vr0, $a3, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a7, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 ori $a3, $s4, 992 vstx $vr0, $a4, $a3 @@ -1317,18 +1312,18 @@ Mode_Decision_for_new_8x8IntraBlocks: # @Mode_Decision_for_new_8x8IntraBlocks ldptr.d $a7, $a6, 7424 alsl.d $a3, $s5, $a3, 1 vinsgr2vr.d $vr0, $a5, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a7, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 ld.d $a3, $a3, 8 ori $a5, $s4, 1040 ldptr.d $a7, $a6, 7432 vstx $vr0, $a4, $a5 vinsgr2vr.d $vr0, $a3, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a7, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 ori $a3, $s4, 1056 vstx $vr0, $a4, $a3 @@ -1345,18 +1340,18 @@ Mode_Decision_for_new_8x8IntraBlocks: # @Mode_Decision_for_new_8x8IntraBlocks ldptr.d $a7, $a6, 7440 alsl.d $a3, $s5, $a3, 1 vinsgr2vr.d $vr0, $a5, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a7, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 ld.d $a3, $a3, 8 ori $a5, $s4, 1104 ldptr.d $a7, $a6, 7448 vstx $vr0, $a4, $a5 vinsgr2vr.d $vr0, $a3, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a7, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 ori $a3, $s4, 1120 vstx $vr0, $a4, $a3 @@ -1373,18 
+1368,18 @@ Mode_Decision_for_new_8x8IntraBlocks: # @Mode_Decision_for_new_8x8IntraBlocks ldptr.d $a7, $a6, 7456 alsl.d $a3, $s5, $a3, 1 vinsgr2vr.d $vr0, $a5, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a7, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 ld.d $a3, $a3, 8 ori $a5, $s4, 1168 ldptr.d $a7, $a6, 7464 vstx $vr0, $a4, $a5 vinsgr2vr.d $vr0, $a3, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a7, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 ori $a3, $s4, 1184 vstx $vr0, $a4, $a3 @@ -1401,18 +1396,18 @@ Mode_Decision_for_new_8x8IntraBlocks: # @Mode_Decision_for_new_8x8IntraBlocks ldptr.d $a7, $a6, 7472 alsl.d $a3, $s5, $a3, 1 vinsgr2vr.d $vr0, $a5, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a7, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 ld.d $a3, $a3, 8 ori $a5, $s4, 1232 ldptr.d $a7, $a6, 7480 vstx $vr0, $a4, $a5 vinsgr2vr.d $vr0, $a3, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a7, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 ori $a3, $s4, 1248 vstx $vr0, $a4, $a3 @@ -1429,25 +1424,25 @@ Mode_Decision_for_new_8x8IntraBlocks: # @Mode_Decision_for_new_8x8IntraBlocks ldptr.d $a4, $a0, 7488 alsl.d $a1, $s5, $a1, 1 vinsgr2vr.d $vr0, $a2, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a4, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 ld.d $a1, $a1, 8 ori $a2, $s4, 1296 ldptr.d $a0, $a0, 7496 vstx $vr0, $a3, $a2 vinsgr2vr.d $vr0, $a1, 0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vinsgr2vr.d $vr1, $a0, 0 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vsub.w $vr0, $vr0, $vr1 ori $a0, $s4, 1312 vstx $vr0, $a3, $a0 - ori $a0, $zero, 2460 + ori $a0, $zero, 2444 add.d $a1, $sp, $a0 ori $a2, $zero, 1 - ld.d $a0, $sp, 208 # 8-byte Folded Reload + ld.d $a0, $sp, 192 # 8-byte Folded Reload pcaddu18i $ra, %call36(dct_luma8x8) jirl $ra, $ra, 0 b .LBB1_36 @@ -1468,7 +1463,7 @@ Mode_Decision_for_new_8x8IntraBlocks: # @Mode_Decision_for_new_8x8IntraBlocks ld.d $t7, $sp, 144 # 8-byte Folded Reload slli.d $t1, $t7, 3 ldx.d $t1, $t0, $t1 - ori $t0, $zero, 2328 + ori $t0, $zero, 2312 add.d $t0, $sp, $t0 vld $vr0, $t0, 0 ld.d $t0, $sp, 128 # 8-byte Folded Reload @@ -1489,7 +1484,7 @@ Mode_Decision_for_new_8x8IntraBlocks: # @Mode_Decision_for_new_8x8IntraBlocks ldptr.d $t3, $t3, 6440 alsl.d $t3, $t7, $t3, 3 ld.d $t3, $t3, 8 - ori $t4, $zero, 2344 + ori $t4, $zero, 2328 add.d $t4, $sp, $t4 vld $vr0, $t4, 0 vstx $vr0, $t3, $t0 @@ -1505,7 +1500,7 @@ Mode_Decision_for_new_8x8IntraBlocks: # @Mode_Decision_for_new_8x8IntraBlocks ldptr.d $a6, $a6, 6440 alsl.d $a6, $t7, $a6, 3 ld.d $a6, $a6, 16 - ori $t3, $zero, 2360 + ori $t3, $zero, 2344 add.d $t3, $sp, $t3 vld $vr0, $t3, 0 vstx $vr0, $a6, $t0 @@ -1521,7 +1516,7 @@ Mode_Decision_for_new_8x8IntraBlocks: # @Mode_Decision_for_new_8x8IntraBlocks ldptr.d $a5, $a5, 6440 alsl.d $a5, $t7, $a5, 3 ld.d $a5, $a5, 24 - ori $a6, $zero, 2376 + ori $a6, $zero, 2360 add.d $a6, $sp, $a6 vld $vr0, $a6, 0 vstx $vr0, $a5, $t0 @@ -1537,7 +1532,7 @@ Mode_Decision_for_new_8x8IntraBlocks: # @Mode_Decision_for_new_8x8IntraBlocks ldptr.d $a4, $a4, 6440 alsl.d $a4, $t7, $a4, 3 ld.d $a4, $a4, 32 - ori $a5, $zero, 2392 + ori $a5, $zero, 2376 add.d $a5, $sp, $a5 vld $vr0, $a5, 0 vstx $vr0, $a4, $t0 @@ -1553,7 +1548,7 @@ 
Mode_Decision_for_new_8x8IntraBlocks: # @Mode_Decision_for_new_8x8IntraBlocks ldptr.d $a3, $a3, 6440 alsl.d $a3, $t7, $a3, 3 ld.d $a3, $a3, 40 - ori $a4, $zero, 2408 + ori $a4, $zero, 2392 add.d $a4, $sp, $a4 vld $vr0, $a4, 0 vstx $vr0, $a3, $t0 @@ -1569,7 +1564,7 @@ Mode_Decision_for_new_8x8IntraBlocks: # @Mode_Decision_for_new_8x8IntraBlocks ldptr.d $a2, $a2, 6440 alsl.d $a2, $t7, $a2, 3 ld.d $a2, $a2, 48 - ori $a3, $zero, 2424 + ori $a3, $zero, 2408 add.d $a3, $sp, $a3 vld $vr0, $a3, 0 vstx $vr0, $a2, $t0 @@ -1585,7 +1580,7 @@ Mode_Decision_for_new_8x8IntraBlocks: # @Mode_Decision_for_new_8x8IntraBlocks ldptr.d $a1, $a1, 6440 alsl.d $a1, $t7, $a1, 3 ld.d $a1, $a1, 56 - ori $a2, $zero, 2440 + ori $a2, $zero, 2424 add.d $a2, $sp, $a2 vld $vr0, $a2, 0 vstx $vr0, $a1, $t0 @@ -1599,7 +1594,7 @@ Mode_Decision_for_new_8x8IntraBlocks: # @Mode_Decision_for_new_8x8IntraBlocks vstx $vr0, $a0, $t2 ld.d $a0, $sp, 120 # 8-byte Folded Reload .LBB1_36: # %.loopexit - addi.d $sp, $sp, 544 + addi.d $sp, $sp, 528 fld.d $fs2, $sp, 1920 # 8-byte Folded Reload fld.d $fs1, $sp, 1928 # 8-byte Folded Reload fld.d $fs0, $sp, 1936 # 8-byte Folded Reload @@ -2406,7 +2401,6 @@ intrapred_luma8x8: # @intrapred_luma8x8 # %bb.47: stx.h $s7, $s4, $a2 .LBB2_48: - vrepli.b $vr0, 0 beqz $s0, .LBB2_50 # %bb.49: ld.hu $t1, $fp, 6 @@ -2434,22 +2428,24 @@ intrapred_luma8x8: # @intrapred_luma8x8 stx.h $a1, $s4, $a3 st.h $a1, $s6, 272 ld.hu $t6, $fp, 12 - vld $vr1, $fp, 12 + vld $vr0, $fp, 12 ld.hu $t5, $fp, 14 - vld $vr2, $fp, 14 + vld $vr1, $fp, 14 ld.hu $t7, $fp, 30 - vld $vr3, $fp, 16 + vld $vr2, $fp, 16 ld.hu $t8, $fp, 28 ld.hu $a3, $fp, 16 - vilvl.h $vr8, $vr0, $vr1 - vbsrl.v $vr7, $vr1, 8 - vilvl.h $vr6, $vr0, $vr7 - vilvl.h $vr1, $vr0, $vr2 - vilvh.h $vr2, $vr0, $vr2 - vilvl.h $vr9, $vr0, $vr3 - vilvh.h $vr10, $vr0, $vr3 - vaddi.wu $vr3, $vr6, 2 - vaddi.wu $vr4, $vr8, 2 + vsllwil.wu.hu $vr7, $vr0, 0 + vbsrl.v $vr6, $vr0, 8 + vsllwil.wu.hu $vr5, $vr6, 0 + vsllwil.wu.hu $vr0, $vr1, 0 + vbsrl.v $vr1, $vr1, 8 + vsllwil.wu.hu $vr1, $vr1, 0 + vsllwil.wu.hu $vr8, $vr2, 0 + vbsrl.v $vr2, $vr2, 8 + vsllwil.wu.hu $vr9, $vr2, 0 + vaddi.wu $vr2, $vr5, 2 + vaddi.wu $vr3, $vr7, 2 addi.d $a4, $t5, 2 alsl.d $a5, $t2, $t6, 1 addi.d $a5, $a5, 2 @@ -2472,37 +2468,37 @@ intrapred_luma8x8: # @intrapred_luma8x8 st.h $a6, $s6, 276 ori $a7, $s5, 3672 stx.h $a6, $s4, $a7 - vslli.w $vr2, $vr2, 1 - vslli.w $vr5, $vr1, 1 - vadd.w $vr1, $vr4, $vr5 - vadd.w $vr2, $vr3, $vr2 - vadd.w $vr10, $vr2, $vr10 - vadd.w $vr1, $vr1, $vr9 - vsrli.w $vr2, $vr1, 2 - vsrli.w $vr1, $vr10, 2 - vpickve2gr.h $a7, $vr1, 6 - vpickve2gr.h $a6, $vr2, 0 + vslli.w $vr1, $vr1, 1 + vslli.w $vr4, $vr0, 1 + vadd.w $vr0, $vr3, $vr4 + vadd.w $vr1, $vr2, $vr1 + vadd.w $vr9, $vr1, $vr9 + vadd.w $vr0, $vr0, $vr8 + vsrli.w $vr1, $vr0, 2 + vsrli.w $vr0, $vr9, 2 + vpickve2gr.h $a7, $vr0, 6 + vpickve2gr.h $a6, $vr1, 0 st.h $a6, $s6, 320 st.h $a6, $s6, 292 st.h $a6, $s6, 264 - vpickve2gr.h $a6, $vr2, 2 + vpickve2gr.h $a6, $vr1, 2 st.h $a6, $s6, 308 st.h $a6, $s6, 280 - vpickve2gr.h $a6, $vr2, 4 + vpickve2gr.h $a6, $vr1, 4 st.h $a6, $s6, 352 st.h $a6, $s6, 324 st.h $a6, $s6, 296 st.h $a6, $s6, 268 - vpickve2gr.h $a6, $vr2, 6 + vpickve2gr.h $a6, $vr1, 6 st.h $a6, $s6, 368 st.h $a6, $s6, 312 st.h $a6, $s6, 284 - vpickve2gr.h $a6, $vr1, 4 + vpickve2gr.h $a6, $vr0, 4 st.h $a6, $s6, 360 st.h $a6, $s6, 332 - vpickve2gr.h $a6, $vr1, 2 + vpickve2gr.h $a6, $vr0, 2 st.h $a7, $s6, 376 - vpickve2gr.h $a7, $vr1, 0 + vpickve2gr.h $a7, $vr0, 0 st.h $a7, $s6, 356 st.h $a7, $s6, 328 st.h $a7, $s6, 300 @@ 
-2578,45 +2574,40 @@ intrapred_luma8x8: # @intrapred_luma8x8 st.h $t0, $s6, 808 ori $t1, $s3, 92 stx.h $t0, $s4, $t1 - vpickve2gr.w $t0, $vr8, 3 + vpickve2gr.w $t0, $vr7, 3 bstrpick.d $t0, $t0, 15, 0 - or $t1, $a3, $t0 - xor $t2, $a3, $t0 - srli.d $t2, $t2, 1 - sub.d $t1, $t1, $t2 + add.d $t1, $a3, $t0 + addi.d $t1, $t1, 1 + srli.d $t1, $t1, 1 ori $t2, $s3, 184 stx.h $t1, $s4, $t2 st.h $t1, $s6, 840 ori $t2, $s3, 124 stx.h $t1, $s4, $t2 st.h $t1, $s6, 780 - vpickve2gr.w $t1, $vr7, 0 + vpickve2gr.w $t1, $vr6, 0 bstrpick.d $t1, $t1, 15, 0 - or $t2, $t0, $t1 - xor $t0, $t0, $t1 + add.d $t0, $t0, $t1 + addi.d $t0, $t0, 1 srli.d $t0, $t0, 1 - sub.d $t0, $t2, $t0 - st.h $t0, $s6, 872 ori $t2, $s3, 156 stx.h $t0, $s4, $t2 + st.h $t0, $s6, 872 st.h $t0, $s6, 812 - vpickve2gr.w $t0, $vr6, 1 - bstrpick.d $t0, $t0, 15, 0 - or $t2, $t1, $t0 - xor $t1, $t1, $t0 - srli.d $t1, $t1, 1 - sub.d $t1, $t2, $t1 - ori $t2, $s3, 188 - stx.h $t1, $s4, $t2 - st.h $t1, $s6, 844 - ori $t1, $s3, 96 - stx.h $a2, $s4, $t1 - vpickve2gr.w $a2, $vr6, 2 - bstrpick.d $a2, $a2, 15, 0 - or $t1, $t0, $a2 - xor $a2, $t0, $a2 + ori $t0, $s3, 96 + stx.h $a2, $s4, $t0 + vpickve2gr.w $a2, $vr5, 1 + bstrpick.d $t0, $a2, 15, 0 + add.d $t0, $t1, $t0 + addi.d $t0, $t0, 1 + srli.d $t0, $t0, 1 + ori $t1, $s3, 188 + stx.h $t0, $s4, $t1 + st.h $t0, $s6, 844 + vpickve2gr.w $t0, $vr5, 2 + add.d $a2, $a2, $t0 + addi.d $a2, $a2, 1 srli.d $a2, $a2, 1 - sub.d $a2, $t1, $a2 st.h $a2, $s6, 876 ori $a2, $s3, 128 stx.h $a0, $s4, $a2 @@ -2647,7 +2638,7 @@ intrapred_luma8x8: # @intrapred_luma8x8 ori $a1, $s3, 104 ld.hu $a4, $fp, 14 stx.h $a0, $s4, $a1 - vpickve2gr.w $a0, $vr4, 2 + vpickve2gr.w $a0, $vr3, 2 add.d $a0, $a0, $a2 alsl.d $a0, $a4, $a0, 1 bstrpick.d $a0, $a0, 18, 2 @@ -2657,7 +2648,7 @@ intrapred_luma8x8: # @intrapred_luma8x8 stx.h $a0, $s4, $a1 st.h $a0, $s6, 852 st.h $a0, $s6, 792 - vpickve2gr.w $a0, $vr4, 3 + vpickve2gr.w $a0, $vr3, 3 alsl.d $a0, $a3, $a0, 1 add.d $a0, $a0, $a4 bstrpick.d $a0, $a0, 18, 2 @@ -2668,8 +2659,8 @@ intrapred_luma8x8: # @intrapred_luma8x8 ori $a1, $s3, 108 stx.h $a0, $s4, $a1 ld.hu $a0, $fp, 16 - vpickve2gr.w $a1, $vr3, 0 - vpickve2gr.w $a2, $vr5, 2 + vpickve2gr.w $a1, $vr2, 0 + vpickve2gr.w $a2, $vr4, 2 add.d $a1, $a1, $a2 add.d $a0, $a1, $a0 bstrpick.d $a0, $a0, 18, 2 @@ -2680,8 +2671,8 @@ intrapred_luma8x8: # @intrapred_luma8x8 stx.h $a0, $s4, $a1 st.h $a0, $s6, 796 ld.hu $a0, $fp, 18 - vpickve2gr.w $a1, $vr3, 1 - vpickve2gr.w $a2, $vr5, 3 + vpickve2gr.w $a1, $vr2, 1 + vpickve2gr.w $a2, $vr4, 3 add.d $a1, $a1, $a2 add.d $a0, $a1, $a0 bstrpick.d $a0, $a0, 18, 2 @@ -2693,64 +2684,64 @@ intrapred_luma8x8: # @intrapred_luma8x8 st.h $a6, $s6, 892 ori $a0, $s5, 3716 add.d $a0, $s4, $a0 - vstelm.h $vr2, $a0, 0, 0 + vstelm.h $vr1, $a0, 0, 0 ori $a0, $s5, 3688 add.d $a0, $s4, $a0 - vstelm.h $vr2, $a0, 0, 0 + vstelm.h $vr1, $a0, 0, 0 ori $a0, $s5, 3760 add.d $a0, $s4, $a0 - vstelm.h $vr2, $a0, 0, 2 + vstelm.h $vr1, $a0, 0, 2 ori $a0, $s5, 3732 add.d $a0, $s4, $a0 - vstelm.h $vr2, $a0, 0, 2 + vstelm.h $vr1, $a0, 0, 2 ori $a0, $s5, 3704 add.d $a0, $s4, $a0 - vstelm.h $vr2, $a0, 0, 2 + vstelm.h $vr1, $a0, 0, 2 ori $a0, $s5, 3676 add.d $a0, $s4, $a0 - vstelm.h $vr2, $a0, 0, 2 + vstelm.h $vr1, $a0, 0, 2 ori $a0, $s5, 3776 add.d $a0, $s4, $a0 - vstelm.h $vr2, $a0, 0, 4 + vstelm.h $vr1, $a0, 0, 4 ori $a0, $s5, 3720 add.d $a0, $s4, $a0 - vstelm.h $vr2, $a0, 0, 4 + vstelm.h $vr1, $a0, 0, 4 ori $a0, $s5, 3692 add.d $a0, $s4, $a0 - vstelm.h $vr2, $a0, 0, 4 + vstelm.h $vr1, $a0, 0, 4 ori $a0, $s5, 3764 add.d $a0, $s4, $a0 - 
vstelm.h $vr2, $a0, 0, 6 + vstelm.h $vr1, $a0, 0, 6 ori $a0, $s5, 3736 add.d $a0, $s4, $a0 - vstelm.h $vr2, $a0, 0, 6 + vstelm.h $vr1, $a0, 0, 6 ori $a0, $s5, 3708 add.d $a0, $s4, $a0 - vstelm.h $vr2, $a0, 0, 6 + vstelm.h $vr1, $a0, 0, 6 ori $a0, $s5, 3780 add.d $a0, $s4, $a0 - vstelm.h $vr1, $a0, 0, 0 + vstelm.h $vr0, $a0, 0, 0 ori $a0, $s5, 3724 add.d $a0, $s4, $a0 - vstelm.h $vr1, $a0, 0, 0 + vstelm.h $vr0, $a0, 0, 0 ori $a0, $s5, 3768 add.d $a0, $s4, $a0 - vstelm.h $vr1, $a0, 0, 2 + vstelm.h $vr0, $a0, 0, 2 ori $a0, $s5, 3740 add.d $a0, $s4, $a0 - vstelm.h $vr1, $a0, 0, 2 + vstelm.h $vr0, $a0, 0, 2 ori $a0, $s5, 3784 add.d $a0, $s4, $a0 - vstelm.h $vr1, $a0, 0, 4 + vstelm.h $vr0, $a0, 0, 4 ori $a0, $s5, 3772 add.d $a0, $s4, $a0 - vpickev.h $vr2, $vr1, $vr2 - vstelm.h $vr1, $a0, 0, 6 + vpickev.h $vr1, $vr0, $vr1 + vstelm.h $vr0, $a0, 0, 6 ori $a0, $s5, 3744 - vstx $vr2, $s4, $a0 + vstx $vr1, $s4, $a0 ori $a0, $s3, 204 add.d $a0, $s4, $a0 - vstelm.h $vr1, $a0, 0, 0 + vstelm.h $vr0, $a0, 0, 0 .LBB2_50: sltui $a0, $s2, 1 ld.d $a1, $sp, 88 # 8-byte Folded Reload @@ -2839,17 +2830,17 @@ intrapred_luma8x8: # @intrapred_luma8x8 ori $a3, $s5, 3820 add.d $a6, $a0, $a3 ld.hu $a5, $fp, 0 - vld $vr1, $fp, 0 - vld $vr2, $fp, 2 + vld $vr0, $fp, 0 + vld $vr1, $fp, 2 ld.hu $a4, $fp, 2 ld.hu $a3, $fp, 4 - vinsgr2vr.h $vr3, $s2, 0 - vinsgr2vr.h $vr3, $a5, 1 - vinsgr2vr.h $vr3, $a4, 2 - vinsgr2vr.h $vr3, $a3, 3 - vbsrl.v $vr6, $vr1, 6 - vilvl.h $vr4, $vr0, $vr6 - vilvl.h $vr7, $vr0, $vr3 + vinsgr2vr.h $vr2, $s2, 0 + vinsgr2vr.h $vr2, $a5, 1 + vinsgr2vr.h $vr2, $a4, 2 + vinsgr2vr.h $vr2, $a3, 3 + vsllwil.wu.hu $vr2, $vr2, 0 + vbsrl.v $vr6, $vr0, 6 + vsllwil.wu.hu $vr3, $vr6, 0 addi.d $a2, $a2, 2 srli.d $a2, $a2, 2 st.h $a2, $a1, 104 @@ -2861,14 +2852,16 @@ intrapred_luma8x8: # @intrapred_luma8x8 st.h $a2, $a1, 32 ori $s4, $s5, 3824 stx.h $a2, $a0, $s4 - vilvh.h $vr5, $vr0, $vr1 - vilvl.h $vr1, $vr0, $vr1 - vilvl.h $vr8, $vr0, $vr2 - vilvh.h $vr3, $vr0, $vr2 - vslli.w $vr0, $vr1, 1 - vadd.w $vr0, $vr7, $vr0 + vbsrl.v $vr4, $vr0, 8 + vsllwil.wu.hu $vr5, $vr4, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.wu.hu $vr7, $vr1, 0 + vbsrl.v $vr1, $vr1, 8 + vsllwil.wu.hu $vr4, $vr1, 0 + vslli.w $vr0, $vr0, 1 + vadd.w $vr0, $vr2, $vr0 vslli.w $vr1, $vr5, 1 - vadd.w $vr1, $vr4, $vr1 + vadd.w $vr1, $vr3, $vr1 addi.d $a2, $a5, 2 add.d $s1, $a2, $s1 alsl.d $s1, $s2, $s1, 1 @@ -2884,8 +2877,8 @@ intrapred_luma8x8: # @intrapred_luma8x8 st.h $s1, $a1, 16 ori $s2, $s5, 3808 stx.h $s1, $a0, $s2 - vadd.w $vr1, $vr1, $vr3 - vadd.w $vr0, $vr0, $vr8 + vadd.w $vr1, $vr1, $vr4 + vadd.w $vr0, $vr0, $vr7 vaddi.wu $vr0, $vr0, 2 vaddi.wu $vr1, $vr1, 2 vsrli.w $vr1, $vr1, 2 @@ -2941,65 +2934,58 @@ intrapred_luma8x8: # @intrapred_luma8x8 st.h $a4, $a1, 112 vpickve2gr.w $a4, $vr6, 0 bstrpick.d $a4, $a4, 15, 0 - or $a5, $a3, $a4 - xor $a3, $a3, $a4 + add.d $a3, $a3, $a4 + addi.d $a3, $a3, 1 srli.d $a3, $a3, 1 - sub.d $a3, $a5, $a3 st.h $a3, $a1, 216 ori $a5, $s5, 3992 stx.h $a3, $a0, $a5 st.h $a3, $a1, 148 ori $a5, $s5, 3924 stx.h $a3, $a0, $a5 - vpickve2gr.w $a3, $vr4, 1 + vpickve2gr.w $a3, $vr3, 1 bstrpick.d $a3, $a3, 15, 0 - or $a5, $a4, $a3 - xor $a4, $a4, $a3 + add.d $a4, $a4, $a3 + addi.d $a4, $a4, 1 srli.d $a4, $a4, 1 - sub.d $a4, $a5, $a4 ori $a5, $s5, 4028 stx.h $a4, $a0, $a5 st.h $a4, $a1, 184 ori $a5, $s5, 3960 stx.h $a4, $a0, $a5 st.h $a4, $a1, 116 - vpickve2gr.w $a4, $vr4, 2 + vpickve2gr.w $a4, $vr3, 2 bstrpick.d $a4, $a4, 15, 0 - or $a5, $a3, $a4 - xor $a3, $a3, $a4 + add.d $a3, $a3, $a4 + addi.d $a3, $a3, 1 srli.d $a3, $a3, 1 - 
sub.d $a3, $a5, $a3 st.h $a3, $a1, 220 ori $a5, $s5, 3996 stx.h $a3, $a0, $a5 st.h $a3, $a1, 152 ori $a5, $s5, 3928 stx.h $a3, $a0, $a5 - vpickve2gr.w $a3, $vr4, 3 + vpickve2gr.w $a3, $vr3, 3 bstrpick.d $a3, $a3, 15, 0 - or $a5, $a4, $a3 - xor $a4, $a4, $a3 + add.d $a4, $a4, $a3 + addi.d $a4, $a4, 1 srli.d $a4, $a4, 1 - sub.d $a4, $a5, $a4 st.h $a4, $a1, 188 ori $a5, $s5, 3964 stx.h $a4, $a0, $a5 st.h $a4, $a1, 120 vpickve2gr.w $a4, $vr5, 3 - bstrpick.d $a4, $a4, 15, 0 - or $a5, $a3, $a4 - xor $a3, $a3, $a4 + bstrpick.d $a5, $a4, 15, 0 + add.d $a3, $a3, $a5 + addi.d $a3, $a3, 1 srli.d $a3, $a3, 1 - sub.d $a3, $a5, $a3 st.h $a3, $a1, 156 ori $a5, $s5, 3932 stx.h $a3, $a0, $a5 - vpickve2gr.w $a3, $vr3, 3 - bstrpick.d $a3, $a3, 15, 0 - or $a5, $a4, $a3 - xor $a3, $a4, $a3 + vpickve2gr.w $a3, $vr4, 3 + add.d $a3, $a4, $a3 + addi.d $a3, $a3, 1 srli.d $a3, $a3, 1 - sub.d $a3, $a5, $a3 st.h $a3, $a1, 124 vstelm.h $vr2, $a1, 228, 0 ori $a3, $s5, 4004 @@ -3495,47 +3481,48 @@ RDCost_for_8x8IntraBlocks: # @RDCost_for_8x8IntraBlocks alsl.d $a3, $s6, $s7, 3 alsl.d $a4, $s5, $s4, 3 slli.d $a5, $s8, 1 - vrepli.b $vr0, 0 ori $a6, $zero, 64 .p2align 4, , 16 .LBB3_1: # =>This Inner Loop Header: Depth=1 ldx.d $a7, $a3, $a2 ldx.d $t0, $a4, $a2 - vldx $vr1, $a7, $a5 - vldx $vr2, $t0, $a5 - vilvl.h $vr3, $vr0, $vr1 - vilvl.h $vr4, $vr0, $vr2 - vsub.w $vr3, $vr3, $vr4 - vpickve2gr.w $a7, $vr3, 0 + vldx $vr0, $a7, $a5 + vldx $vr1, $t0, $a5 + vsllwil.wu.hu $vr2, $vr0, 0 + vbsrl.v $vr0, $vr0, 8 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.wu.hu $vr3, $vr1, 0 + vsub.w $vr2, $vr2, $vr3 + vpickve2gr.w $a7, $vr2, 0 slli.d $a7, $a7, 2 ldx.w $a7, $a0, $a7 - vilvh.h $vr1, $vr0, $vr1 - vilvh.h $vr2, $vr0, $vr2 - vsub.w $vr1, $vr1, $vr2 + vbsrl.v $vr1, $vr1, 8 + vsllwil.wu.hu $vr1, $vr1, 0 + vsub.w $vr0, $vr0, $vr1 add.d $a7, $s3, $a7 - vpickve2gr.w $t0, $vr3, 1 + vpickve2gr.w $t0, $vr2, 1 slli.d $t0, $t0, 2 ldx.w $t0, $a0, $t0 - vpickve2gr.w $t1, $vr3, 2 + vpickve2gr.w $t1, $vr2, 2 slli.d $t1, $t1, 2 ldx.w $t1, $a0, $t1 - vpickve2gr.w $t2, $vr3, 3 + vpickve2gr.w $t2, $vr2, 3 slli.d $t2, $t2, 2 ldx.w $t2, $a0, $t2 - vpickve2gr.w $t3, $vr1, 0 + vpickve2gr.w $t3, $vr0, 0 slli.d $t3, $t3, 2 ldx.w $t3, $a0, $t3 add.d $a7, $a7, $t0 add.d $a7, $a7, $t1 add.d $a7, $a7, $t2 add.d $a7, $a7, $t3 - vpickve2gr.w $t0, $vr1, 1 + vpickve2gr.w $t0, $vr0, 1 slli.d $t0, $t0, 2 ldx.w $t0, $a0, $t0 - vpickve2gr.w $t1, $vr1, 2 + vpickve2gr.w $t1, $vr0, 2 slli.d $t1, $t1, 2 ldx.w $t1, $a0, $t1 - vpickve2gr.w $t2, $vr1, 3 + vpickve2gr.w $t2, $vr0, 3 slli.d $t2, $t2, 2 ldx.w $t2, $a0, $t2 add.d $a7, $a7, $t0 @@ -5065,35 +5052,35 @@ LowPassForIntra8x8Pred: # @LowPassForIntra8x8Pred add.d $t5, $t3, $t5 vinsgr2vr.w $vr1, $t2, 0 srli.d $t2, $t5, 2 - vrepli.b $vr2, 0 - vilvh.h $vr3, $vr2, $vr0 - vilvl.h $vr0, $vr2, $vr0 - vinsgr2vr.w $vr2, $t4, 0 + vbsrl.v $vr2, $vr0, 8 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vinsgr2vr.w $vr3, $t4, 0 pcalau12i $t4, %pc_hi20(.LCPI5_0) vld $vr4, $t4, %pc_lo12(.LCPI5_0) vbsrl.v $vr5, $vr0, 12 - vbsll.v $vr6, $vr3, 4 + vbsll.v $vr6, $vr2, 4 vor.v $vr5, $vr6, $vr5 - vshuf.w $vr4, $vr0, $vr2 - vslli.w $vr2, $vr4, 1 + vshuf.w $vr4, $vr0, $vr3 + vslli.w $vr3, $vr4, 1 vslli.w $vr4, $vr5, 1 vinsgr2vr.w $vr1, $t3, 1 vpackev.d $vr1, $vr0, $vr1 vbsrl.v $vr5, $vr0, 8 - vbsll.v $vr6, $vr3, 8 + vbsll.v $vr6, $vr2, 8 vor.v $vr5, $vr6, $vr5 vadd.w $vr4, $vr5, $vr4 - vadd.w $vr1, $vr1, $vr2 + vadd.w $vr1, $vr1, $vr3 pcalau12i $t3, %pc_hi20(.LCPI5_1) - vld $vr2, $t3, %pc_lo12(.LCPI5_1) + vld $vr3, $t3, %pc_lo12(.LCPI5_1) 
vadd.w $vr0, $vr1, $vr0 - vadd.w $vr1, $vr4, $vr3 + vadd.w $vr1, $vr4, $vr2 vaddi.wu $vr1, $vr1, 2 - vadd.w $vr0, $vr0, $vr2 + vadd.w $vr0, $vr0, $vr3 vsrli.w $vr0, $vr0, 2 vsrli.w $vr1, $vr1, 2 vpickev.h $vr0, $vr1, $vr0 - vpickve2gr.w $t3, $vr3, 2 + vpickve2gr.w $t3, $vr2, 2 alsl.d $t3, $t1, $t3, 1 add.d $t3, $t3, $t0 addi.d $t3, $t3, 2 diff --git a/results/MultiSource/Applications/SIBsim4/CMakeFiles/SIBsim4.dir/sim4b1.s b/results/MultiSource/Applications/SIBsim4/CMakeFiles/SIBsim4.dir/sim4b1.s index 97d7681c..96ad78a7 100644 --- a/results/MultiSource/Applications/SIBsim4/CMakeFiles/SIBsim4.dir/sim4b1.s +++ b/results/MultiSource/Applications/SIBsim4/CMakeFiles/SIBsim4.dir/sim4b1.s @@ -7289,11 +7289,9 @@ merge: # @merge .type is_polyAT_exon_p,@function is_polyAT_exon_p: # @is_polyAT_exon_p # %bb.0: - addi.d $sp, $sp, -32 - fst.d $fs0, $sp, 24 # 8-byte Folded Spill - fst.d $fs1, $sp, 16 # 8-byte Folded Spill - fst.d $fs2, $sp, 8 # 8-byte Folded Spill - fst.d $fs3, $sp, 0 # 8-byte Folded Spill + addi.d $sp, $sp, -16 + fst.d $fs0, $sp, 8 # 8-byte Folded Spill + fst.d $fs1, $sp, 0 # 8-byte Folded Spill addi.w $a4, $a1, 0 addi.w $a3, $a0, -1 bgeu $a3, $a4, .LBB8_3 @@ -7305,7 +7303,7 @@ is_polyAT_exon_p: # @is_polyAT_exon_p # %bb.2: move $t2, $zero move $a3, $zero - move $t0, $zero + move $a6, $zero move $a4, $zero move $t1, $zero move $a5, $t3 @@ -7313,7 +7311,7 @@ is_polyAT_exon_p: # @is_polyAT_exon_p .LBB8_3: move $t1, $zero move $a4, $zero - move $t0, $zero + move $a6, $zero move $a3, $zero move $t2, $zero .LBB8_4: # %._crit_edge @@ -7324,70 +7322,71 @@ is_polyAT_exon_p: # @is_polyAT_exon_p alsl.w $a0, $t1, $a0, 1 ori $a5, $zero, 29 div.wu $a2, $a0, $a1 - bltu $a5, $a1, .LBB8_9 + bltu $a5, $a1, .LBB8_10 # %bb.5: ori $a5, $zero, 6 ori $a0, $zero, 1 - bltu $a5, $a2, .LBB8_14 + bltu $a5, $a2, .LBB8_9 # %bb.6: - add.d $a2, $t0, $t1 + add.d $a2, $a6, $t1 slli.d $a5, $a2, 3 alsl.w $a2, $a2, $a5, 1 div.wu $a2, $a2, $a1 ori $a5, $zero, 7 - bltu $a5, $a2, .LBB8_14 + bltu $a5, $a2, .LBB8_9 # %bb.7: slli.d $a2, $a3, 3 alsl.w $a2, $a3, $a2, 1 div.wu $a2, $a2, $a1 ori $a5, $zero, 6 - bltu $a5, $a2, .LBB8_14 + bltu $a5, $a2, .LBB8_9 # %bb.8: add.d $a2, $a3, $a4 slli.d $a3, $a2, 3 alsl.w $a2, $a2, $a3, 1 div.wu $a1, $a2, $a1 ori $a2, $zero, 7 - bgeu $a2, $a1, .LBB8_13 - b .LBB8_14 + bgeu $a2, $a1, .LBB8_14 .LBB8_9: + fld.d $fs1, $sp, 0 # 8-byte Folded Reload + fld.d $fs0, $sp, 8 # 8-byte Folded Reload + addi.d $sp, $sp, 16 + ret +.LBB8_10: ori $a5, $zero, 7 ori $a0, $zero, 1 - bltu $a5, $a2, .LBB8_14 -# %bb.10: - add.d $a2, $t0, $t1 + bltu $a5, $a2, .LBB8_9 +# %bb.11: + add.d $a2, $a6, $t1 ori $a5, $zero, 100 mul.w $a2, $a2, $a5 div.wu $a2, $a2, $a1 ori $a5, $zero, 94 - bltu $a5, $a2, .LBB8_14 -# %bb.11: + bltu $a5, $a2, .LBB8_9 +# %bb.12: slli.d $a2, $a3, 3 alsl.w $a2, $a3, $a2, 1 div.wu $a2, $a2, $a1 ori $a5, $zero, 7 - bltu $a5, $a2, .LBB8_14 -# %bb.12: + bltu $a5, $a2, .LBB8_9 +# %bb.13: add.d $a2, $a3, $a4 ori $a3, $zero, 100 mul.w $a2, $a2, $a3 div.wu $a1, $a2, $a1 ori $a2, $zero, 94 - bltu $a2, $a1, .LBB8_14 -.LBB8_13: - move $a0, $zero + bltu $a2, $a1, .LBB8_9 .LBB8_14: - fld.d $fs3, $sp, 0 # 8-byte Folded Reload - fld.d $fs2, $sp, 8 # 8-byte Folded Reload - fld.d $fs1, $sp, 16 # 8-byte Folded Reload - fld.d $fs0, $sp, 24 # 8-byte Folded Reload - addi.d $sp, $sp, 32 + move $a0, $zero + fld.d $fs1, $sp, 0 # 8-byte Folded Reload + fld.d $fs0, $sp, 8 # 8-byte Folded Reload + addi.d $sp, $sp, 16 ret .LBB8_15: # %vector.ph bstrpick.d $a3, $a4, 31, 0 - addi.d $a6, $a3, 1 - bstrpick.d $a3, $a6, 32, 3 - 
slli.d $a7, $a3, 3 + addi.d $a7, $a3, 1 + bstrpick.d $a3, $a7, 32, 3 + slli.d $t0, $a3, 3 alsl.d $a5, $a3, $t3, 3 add.d $a3, $t3, $a2 addi.d $a3, $a3, 4 @@ -7397,7 +7396,7 @@ is_polyAT_exon_p: # @is_polyAT_exon_p vrepli.b $vr8, 71 vrepli.b $vr10, 84 vrepli.w $vr12, 1 - move $a4, $a7 + move $a4, $t0 vori.b $vr14, $vr1, 0 vori.b $vr11, $vr1, 0 vori.b $vr13, $vr1, 0 @@ -7410,77 +7409,73 @@ is_polyAT_exon_p: # @is_polyAT_exon_p .p2align 4, , 16 .LBB8_16: # %vector.body # =>This Inner Loop Header: Depth=1 - ld.w $t0, $a3, -4 + ld.w $a6, $a3, -4 ld.w $t1, $a3, 0 - vinsgr2vr.w $vr15, $t0, 0 + vinsgr2vr.w $vr15, $a6, 0 vinsgr2vr.w $vr16, $t1, 0 vseq.b $vr17, $vr15, $vr4 vseq.b $vr18, $vr16, $vr4 - vseq.b $vr19, $vr15, $vr10 - vilvl.b $vr19, $vr19, $vr19 - vilvl.h $vr19, $vr19, $vr19 - vslli.w $vr20, $vr19, 24 - vsrai.w $vr20, $vr20, 24 - vseq.b $vr21, $vr16, $vr10 - vilvl.b $vr21, $vr21, $vr21 - vilvl.h $vr21, $vr21, $vr21 - vslli.w $vr22, $vr21, 24 - vsrai.w $vr22, $vr22, 24 - vseq.b $vr23, $vr15, $vr6 - vor.v $vr17, $vr17, $vr23 - vseq.b $vr24, $vr16, $vr6 - vor.v $vr18, $vr18, $vr24 - vseq.b $vr15, $vr15, $vr8 + vseq.b $vr19, $vr15, $vr6 + vor.v $vr17, $vr17, $vr19 + vseq.b $vr20, $vr16, $vr6 + vor.v $vr18, $vr18, $vr20 + vseq.b $vr21, $vr15, $vr8 + vor.v $vr17, $vr17, $vr21 + vsllwil.h.b $vr22, $vr17, 0 + vsllwil.w.h $vr22, $vr22, 0 + vseq.b $vr23, $vr16, $vr8 + vor.v $vr18, $vr18, $vr23 + vsllwil.h.b $vr24, $vr18, 0 + vsllwil.w.h $vr24, $vr24, 0 + vseq.b $vr15, $vr15, $vr10 vor.v $vr17, $vr17, $vr15 - vilvl.b $vr17, $vr17, $vr17 - vilvl.h $vr17, $vr17, $vr17 - vslli.w $vr17, $vr17, 24 - vsrai.w $vr17, $vr17, 24 - vseq.b $vr16, $vr16, $vr8 + vsllwil.h.b $vr17, $vr17, 0 + vsllwil.w.h $vr17, $vr17, 0 + vseq.b $vr16, $vr16, $vr10 vor.v $vr18, $vr18, $vr16 - vilvl.b $vr18, $vr18, $vr18 - vilvl.h $vr18, $vr18, $vr18 - vslli.w $vr18, $vr18, 24 - vsrai.w $vr18, $vr18, 24 - vor.v $vr25, $vr17, $vr20 - vor.v $vr26, $vr18, $vr22 - vilvl.b $vr23, $vr23, $vr23 - vilvl.h $vr23, $vr23, $vr23 + vsllwil.h.b $vr18, $vr18, 0 + vsllwil.w.h $vr18, $vr18, 0 + vxor.v $vr25, $vr19, $vr21 + vor.v $vr25, $vr15, $vr25 + vsllwil.h.b $vr25, $vr25, 0 + vsllwil.w.h $vr25, $vr25, 0 + vandn.v $vr22, $vr25, $vr22 + vand.v $vr22, $vr22, $vr12 + vadd.w $vr0, $vr0, $vr22 + vxor.v $vr22, $vr20, $vr23 + vor.v $vr22, $vr16, $vr22 + vsllwil.h.b $vr22, $vr22, 0 + vsllwil.w.h $vr22, $vr22, 0 + vandn.v $vr22, $vr22, $vr24 + vand.v $vr22, $vr22, $vr12 + vadd.w $vr2, $vr2, $vr22 + vilvl.b $vr19, $vr19, $vr19 + vilvl.h $vr19, $vr19, $vr19 + vand.v $vr19, $vr19, $vr12 + vadd.w $vr3, $vr3, $vr19 + vilvl.b $vr19, $vr20, $vr20 + vilvl.h $vr19, $vr19, $vr19 + vand.v $vr19, $vr19, $vr12 + vadd.w $vr5, $vr5, $vr19 + vilvl.b $vr19, $vr21, $vr21 + vilvl.h $vr19, $vr19, $vr19 + vand.v $vr19, $vr19, $vr12 + vadd.w $vr7, $vr7, $vr19 + vilvl.b $vr19, $vr23, $vr23 + vilvl.h $vr19, $vr19, $vr19 + vand.v $vr19, $vr19, $vr12 + vadd.w $vr9, $vr9, $vr19 vilvl.b $vr15, $vr15, $vr15 vilvl.h $vr15, $vr15, $vr15 - vxor.v $vr27, $vr23, $vr15 - vslli.w $vr27, $vr27, 24 - vsrai.w $vr27, $vr27, 24 - vor.v $vr20, $vr20, $vr27 - vandn.v $vr17, $vr20, $vr17 - vand.v $vr17, $vr17, $vr12 - vadd.w $vr0, $vr0, $vr17 - vilvl.b $vr17, $vr24, $vr24 - vilvl.h $vr17, $vr17, $vr17 - vilvl.b $vr16, $vr16, $vr16 - vilvl.h $vr16, $vr16, $vr16 - vxor.v $vr20, $vr17, $vr16 - vslli.w $vr20, $vr20, 24 - vsrai.w $vr20, $vr20, 24 - vor.v $vr20, $vr22, $vr20 - vandn.v $vr18, $vr20, $vr18 - vand.v $vr18, $vr18, $vr12 - vadd.w $vr2, $vr2, $vr18 - vand.v $vr18, $vr23, $vr12 - vadd.w $vr3, 
$vr3, $vr18 - vand.v $vr17, $vr17, $vr12 - vadd.w $vr5, $vr5, $vr17 vand.v $vr15, $vr15, $vr12 - vadd.w $vr7, $vr7, $vr15 - vand.v $vr15, $vr16, $vr12 - vadd.w $vr9, $vr9, $vr15 - vand.v $vr15, $vr19, $vr12 vadd.w $vr11, $vr11, $vr15 - vand.v $vr15, $vr21, $vr12 + vilvl.b $vr15, $vr16, $vr16 + vilvl.h $vr15, $vr15, $vr15 + vand.v $vr15, $vr15, $vr12 vadd.w $vr13, $vr13, $vr15 - vadd.w $vr1, $vr1, $vr25 + vadd.w $vr1, $vr1, $vr17 vaddi.wu $vr1, $vr1, 1 - vadd.w $vr14, $vr14, $vr26 + vadd.w $vr14, $vr14, $vr18 vaddi.wu $vr14, $vr14, 1 addi.d $a4, $a4, -8 addi.d $a3, $a3, 8 @@ -7497,7 +7492,7 @@ is_polyAT_exon_p: # @is_polyAT_exon_p vadd.w $vr1, $vr9, $vr7 vhaddw.d.w $vr1, $vr1, $vr1 vhaddw.q.d $vr1, $vr1, $vr1 - vpickve2gr.d $t0, $vr1, 0 + vpickve2gr.d $a6, $vr1, 0 vadd.w $vr1, $vr5, $vr3 vhaddw.d.w $vr1, $vr1, $vr1 vhaddw.q.d $vr1, $vr1, $vr1 @@ -7506,17 +7501,17 @@ is_polyAT_exon_p: # @is_polyAT_exon_p vhaddw.d.w $vr0, $vr0, $vr0 vhaddw.q.d $vr0, $vr0, $vr0 vpickve2gr.d $t1, $vr0, 0 - beq $a6, $a7, .LBB8_4 + beq $a7, $t0, .LBB8_4 .LBB8_18: # %.lr.ph.preheader71 add.d $a2, $a2, $a5 sub.d $a5, $a1, $a5 - ori $a6, $zero, 19 - pcalau12i $a7, %pc_hi20(.LJTI8_0) - addi.d $a7, $a7, %pc_lo12(.LJTI8_0) + ori $a7, $zero, 19 + pcalau12i $t0, %pc_hi20(.LJTI8_0) + addi.d $t0, $t0, %pc_lo12(.LJTI8_0) b .LBB8_21 .p2align 4, , 16 .LBB8_19: # in Loop: Header=BB8_21 Depth=1 - addi.d $t0, $t0, 1 + addi.d $a6, $a6, 1 .LBB8_20: # in Loop: Header=BB8_21 Depth=1 addi.w $a5, $a5, -1 addi.d $a2, $a2, 1 @@ -7525,12 +7520,12 @@ is_polyAT_exon_p: # @is_polyAT_exon_p # =>This Inner Loop Header: Depth=1 ld.bu $t3, $a2, 0 addi.d $t3, $t3, -65 - bltu $a6, $t3, .LBB8_26 + bltu $a7, $t3, .LBB8_26 # %bb.22: # %.lr.ph # in Loop: Header=BB8_21 Depth=1 slli.d $t3, $t3, 2 - ldx.w $t3, $a7, $t3 - add.d $t3, $a7, $t3 + ldx.w $t3, $t0, $t3 + add.d $t3, $t0, $t3 jr $t3 .LBB8_23: # in Loop: Header=BB8_21 Depth=1 addi.d $t1, $t1, 1 diff --git a/results/MultiSource/Applications/SPASS/CMakeFiles/SPASS.dir/cnf.s b/results/MultiSource/Applications/SPASS/CMakeFiles/SPASS.dir/cnf.s index 63836260..89ee5ddd 100644 --- a/results/MultiSource/Applications/SPASS/CMakeFiles/SPASS.dir/cnf.s +++ b/results/MultiSource/Applications/SPASS/CMakeFiles/SPASS.dir/cnf.s @@ -6744,16 +6744,12 @@ cnf_Flotter: # @cnf_Flotter ld.h $a6, $a3, 0 vinsgr2vr.h $vr2, $a5, 0 vinsgr2vr.h $vr3, $a6, 0 - vilvl.b $vr2, $vr2, $vr2 - vilvl.h $vr2, $vr2, $vr2 - vilvl.w $vr2, $vr2, $vr2 - vslli.d $vr2, $vr2, 56 - vsrai.d $vr2, $vr2, 56 - vilvl.b $vr3, $vr3, $vr3 - vilvl.h $vr3, $vr3, $vr3 - vilvl.w $vr3, $vr3, $vr3 - vslli.d $vr3, $vr3, 56 - vsrai.d $vr3, $vr3, 56 + vsllwil.h.b $vr2, $vr2, 0 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.d.w $vr2, $vr2, 0 + vsllwil.h.b $vr3, $vr3, 0 + vsllwil.w.h $vr3, $vr3, 0 + vsllwil.d.w $vr3, $vr3, 0 vadd.d $vr0, $vr0, $vr2 vadd.d $vr1, $vr1, $vr3 addi.d $a4, $a4, -4 diff --git a/results/MultiSource/Applications/SPASS/CMakeFiles/SPASS.dir/top.s b/results/MultiSource/Applications/SPASS/CMakeFiles/SPASS.dir/top.s index 2ddbb9b9..e6c9d335 100644 --- a/results/MultiSource/Applications/SPASS/CMakeFiles/SPASS.dir/top.s +++ b/results/MultiSource/Applications/SPASS/CMakeFiles/SPASS.dir/top.s @@ -1150,16 +1150,12 @@ main: # @main ld.h $a6, $a3, 0 vinsgr2vr.h $vr2, $a5, 0 vinsgr2vr.h $vr3, $a6, 0 - vilvl.b $vr2, $vr2, $vr2 - vilvl.h $vr2, $vr2, $vr2 - vilvl.w $vr2, $vr2, $vr2 - vslli.d $vr2, $vr2, 56 - vsrai.d $vr2, $vr2, 56 - vilvl.b $vr3, $vr3, $vr3 - vilvl.h $vr3, $vr3, $vr3 - vilvl.w $vr3, $vr3, $vr3 - vslli.d $vr3, $vr3, 56 - vsrai.d $vr3, $vr3, 56 + 
vsllwil.h.b $vr2, $vr2, 0 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.d.w $vr2, $vr2, 0 + vsllwil.h.b $vr3, $vr3, 0 + vsllwil.w.h $vr3, $vr3, 0 + vsllwil.d.w $vr3, $vr3, 0 vadd.d $vr0, $vr0, $vr2 vadd.d $vr1, $vr1, $vr3 addi.d $a4, $a4, -4 @@ -1298,16 +1294,12 @@ main: # @main ld.h $a6, $a3, 0 vinsgr2vr.h $vr2, $a5, 0 vinsgr2vr.h $vr3, $a6, 0 - vilvl.b $vr2, $vr2, $vr2 - vilvl.h $vr2, $vr2, $vr2 - vilvl.w $vr2, $vr2, $vr2 - vslli.d $vr2, $vr2, 56 - vsrai.d $vr2, $vr2, 56 - vilvl.b $vr3, $vr3, $vr3 - vilvl.h $vr3, $vr3, $vr3 - vilvl.w $vr3, $vr3, $vr3 - vslli.d $vr3, $vr3, 56 - vsrai.d $vr3, $vr3, 56 + vsllwil.h.b $vr2, $vr2, 0 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.d.w $vr2, $vr2, 0 + vsllwil.h.b $vr3, $vr3, 0 + vsllwil.w.h $vr3, $vr3, 0 + vsllwil.d.w $vr3, $vr3, 0 vadd.d $vr0, $vr0, $vr2 vadd.d $vr1, $vr1, $vr3 addi.d $a4, $a4, -4 diff --git a/results/MultiSource/Applications/d/CMakeFiles/make_dparser.dir/write_ctables.s b/results/MultiSource/Applications/d/CMakeFiles/make_dparser.dir/write_ctables.s index 0d94e115..177d6447 100644 --- a/results/MultiSource/Applications/d/CMakeFiles/make_dparser.dir/write_ctables.s +++ b/results/MultiSource/Applications/d/CMakeFiles/make_dparser.dir/write_ctables.s @@ -4230,13 +4230,9 @@ write_header_as_C: # @write_header_as_C vinsgr2vr.w $vr3, $a7, 0 vinsgr2vr.w $vr3, $t0, 1 vseqi.w $vr2, $vr2, 3 - vshuf4i.w $vr2, $vr2, 16 - vslli.d $vr2, $vr2, 32 - vsrai.d $vr2, $vr2, 32 + vsllwil.d.w $vr2, $vr2, 0 vseqi.w $vr3, $vr3, 3 - vshuf4i.w $vr3, $vr3, 16 - vslli.d $vr3, $vr3, 32 - vsrai.d $vr3, $vr3, 32 + vsllwil.d.w $vr3, $vr3, 0 vor.v $vr0, $vr0, $vr2 vor.v $vr1, $vr1, $vr3 addi.d $a4, $a4, -4 diff --git a/results/MultiSource/Applications/lemon/CMakeFiles/lemon.dir/lemon.s b/results/MultiSource/Applications/lemon/CMakeFiles/lemon.dir/lemon.s index 7947336a..66d0c92b 100644 --- a/results/MultiSource/Applications/lemon/CMakeFiles/lemon.dir/lemon.s +++ b/results/MultiSource/Applications/lemon/CMakeFiles/lemon.dir/lemon.s @@ -6890,8 +6890,8 @@ Parse: # @Parse st.b $a7, $a1, -1 .LBB37_90: # %pred.store.continue506 # in Loop: Header=BB37_88 Depth=2 - vilvl.b $vr0, $vr0, $vr0 - vilvl.h $vr0, $vr0, $vr0 + vsllwil.h.b $vr0, $vr0, 0 + vsllwil.w.h $vr0, $vr0, 0 vpickve2gr.w $a2, $vr0, 1 andi $a2, $a2, 1 beqz $a2, .LBB37_93 diff --git a/results/MultiSource/Applications/lua/CMakeFiles/lua.dir/llex.s b/results/MultiSource/Applications/lua/CMakeFiles/lua.dir/llex.s index 3467c341..4107219e 100644 --- a/results/MultiSource/Applications/lua/CMakeFiles/lua.dir/llex.s +++ b/results/MultiSource/Applications/lua/CMakeFiles/lua.dir/llex.s @@ -2558,7 +2558,7 @@ read_numeral: # @read_numeral bnez $a7, .LBB13_36 # %bb.29: # %pred.store.continue98 # in Loop: Header=BB13_28 Depth=1 - vilvl.b $vr2, $vr2, $vr2 + vsllwil.h.b $vr2, $vr2, 0 vpickve2gr.h $a7, $vr2, 1 andi $a7, $a7, 1 bnez $a7, .LBB13_37 @@ -2597,7 +2597,7 @@ read_numeral: # @read_numeral .LBB13_36: # %pred.store.if97 # in Loop: Header=BB13_28 Depth=1 st.b $a1, $a3, 3 - vilvl.b $vr2, $vr2, $vr2 + vsllwil.h.b $vr2, $vr2, 0 vpickve2gr.h $a7, $vr2, 1 andi $a7, $a7, 1 beqz $a7, .LBB13_30 @@ -2913,7 +2913,7 @@ read_numeral: # @read_numeral bnez $t0, .LBB13_100 # %bb.93: # %pred.store.continue177 # in Loop: Header=BB13_92 Depth=1 - vilvl.b $vr2, $vr2, $vr2 + vsllwil.h.b $vr2, $vr2, 0 vpickve2gr.h $t0, $vr2, 1 andi $t0, $t0, 1 bnez $t0, .LBB13_101 @@ -2952,7 +2952,7 @@ read_numeral: # @read_numeral .LBB13_100: # %pred.store.if176 # in Loop: Header=BB13_92 Depth=1 st.b $a1, $a4, 3 - vilvl.b $vr2, $vr2, $vr2 + vsllwil.h.b $vr2, $vr2, 0 
vpickve2gr.h $t0, $vr2, 1 andi $t0, $t0, 1 beqz $t0, .LBB13_94 @@ -3278,7 +3278,7 @@ read_numeral: # @read_numeral bnez $t0, .LBB13_167 # %bb.160: # %pred.store.continue261 # in Loop: Header=BB13_159 Depth=1 - vilvl.b $vr2, $vr2, $vr2 + vsllwil.h.b $vr2, $vr2, 0 vpickve2gr.h $t0, $vr2, 1 andi $t0, $t0, 1 bnez $t0, .LBB13_168 @@ -3317,7 +3317,7 @@ read_numeral: # @read_numeral .LBB13_167: # %pred.store.if260 # in Loop: Header=BB13_159 Depth=1 st.b $a7, $a3, 3 - vilvl.b $vr2, $vr2, $vr2 + vsllwil.h.b $vr2, $vr2, 0 vpickve2gr.h $t0, $vr2, 1 andi $t0, $t0, 1 beqz $t0, .LBB13_161 diff --git a/results/MultiSource/Applications/minisat/CMakeFiles/minisat.dir/Solver.s b/results/MultiSource/Applications/minisat/CMakeFiles/minisat.dir/Solver.s index 0a09275e..39950ef6 100644 --- a/results/MultiSource/Applications/minisat/CMakeFiles/minisat.dir/Solver.s +++ b/results/MultiSource/Applications/minisat/CMakeFiles/minisat.dir/Solver.s @@ -2383,18 +2383,12 @@ _ZN6Solver7analyzeEP6ClauseR3vecI3LitERi: # @_ZN6Solver7analyzeEP6ClauseR3vecI3L vld $vr3, $a6, 0 vsrai.w $vr2, $vr2, 1 vsrai.w $vr3, $vr3, 1 - vshuf4i.w $vr4, $vr2, 50 - vslli.d $vr4, $vr4, 32 - vsrai.d $vr4, $vr4, 32 - vshuf4i.w $vr2, $vr2, 16 - vslli.d $vr2, $vr2, 32 - vsrai.d $vr2, $vr2, 32 - vshuf4i.w $vr5, $vr3, 50 - vslli.d $vr5, $vr5, 32 - vsrai.d $vr5, $vr5, 32 - vshuf4i.w $vr3, $vr3, 16 - vslli.d $vr3, $vr3, 32 - vsrai.d $vr3, $vr3, 32 + vshuf4i.w $vr4, $vr2, 14 + vsllwil.d.w $vr4, $vr4, 0 + vsllwil.d.w $vr2, $vr2, 0 + vshuf4i.w $vr5, $vr3, 14 + vsllwil.d.w $vr5, $vr5, 0 + vsllwil.d.w $vr3, $vr3, 0 vpickve2gr.d $t0, $vr2, 0 slli.d $t0, $t0, 2 vpickve2gr.d $t1, $vr2, 1 diff --git a/results/MultiSource/Applications/obsequi/CMakeFiles/Obsequi.dir/negamax.s b/results/MultiSource/Applications/obsequi/CMakeFiles/Obsequi.dir/negamax.s index aa798275..cc15ea8e 100644 --- a/results/MultiSource/Applications/obsequi/CMakeFiles/Obsequi.dir/negamax.s +++ b/results/MultiSource/Applications/obsequi/CMakeFiles/Obsequi.dir/negamax.s @@ -39,8 +39,8 @@ search_for_move: # @search_for_move pcalau12i $a0, %got_pc_hi20(g_board_size) ld.d $a0, $a0, %got_pc_lo12(g_board_size) ld.w $a0, $a0, 0 - vrepli.b $vr10, 0 - vst $vr10, $sp, 96 # 16-byte Folded Spill + vrepli.b $vr0, 0 + vst $vr0, $sp, 96 # 16-byte Folded Spill blez $a0, .LBB0_6 # %bb.4: # %.lr.ph.preheader ori $a1, $zero, 8 @@ -65,8 +65,8 @@ search_for_move: # @search_for_move pcalau12i $a3, %got_pc_hi20(countbits16) ld.d $a3, $a3, %got_pc_lo12(countbits16) move $a4, $a1 - vori.b $vr2, $vr10, 0 - vori.b $vr3, $vr10, 0 + vld $vr3, $sp, 96 # 16-byte Folded Reload + vori.b $vr2, $vr3, 0 .p2align 4, , 16 .LBB0_8: # %vector.body # =>This Inner Loop Header: Depth=1 @@ -76,10 +76,12 @@ search_for_move: # @search_for_move vxor.v $vr7, $vr5, $vr0 vandn.v $vr4, $vr4, $vr1 vandn.v $vr5, $vr5, $vr1 - vilvh.w $vr8, $vr10, $vr4 - vilvl.w $vr4, $vr10, $vr4 - vilvh.w $vr9, $vr10, $vr5 - vilvl.w $vr5, $vr10, $vr5 + vshuf4i.w $vr8, $vr4, 14 + vsllwil.du.wu $vr8, $vr8, 0 + vsllwil.du.wu $vr4, $vr4, 0 + vshuf4i.w $vr9, $vr5, 14 + vsllwil.du.wu $vr9, $vr9, 0 + vsllwil.du.wu $vr5, $vr5, 0 vpickve2gr.d $a5, $vr4, 0 slli.d $a5, $a5, 2 vpickve2gr.d $a6, $vr4, 1 @@ -114,10 +116,12 @@ search_for_move: # @search_for_move vinsgr2vr.w $vr5, $t0, 3 vsrli.w $vr6, $vr6, 16 vsrli.w $vr7, $vr7, 16 - vilvh.w $vr8, $vr10, $vr6 - vilvl.w $vr6, $vr10, $vr6 - vilvh.w $vr9, $vr10, $vr7 - vilvl.w $vr7, $vr10, $vr7 + vshuf4i.w $vr8, $vr6, 14 + vsllwil.du.wu $vr8, $vr8, 0 + vsllwil.du.wu $vr6, $vr6, 0 + vshuf4i.w $vr9, $vr7, 14 + vsllwil.du.wu $vr9, 
$vr9, 0 + vsllwil.du.wu $vr7, $vr7, 0 vpickve2gr.d $a5, $vr6, 0 slli.d $a5, $a5, 2 vpickve2gr.d $a6, $vr6, 1 diff --git a/results/MultiSource/Applications/obsequi/CMakeFiles/Obsequi.dir/traits.s b/results/MultiSource/Applications/obsequi/CMakeFiles/Obsequi.dir/traits.s index 1a0c1dc9..217a35eb 100644 --- a/results/MultiSource/Applications/obsequi/CMakeFiles/Obsequi.dir/traits.s +++ b/results/MultiSource/Applications/obsequi/CMakeFiles/Obsequi.dir/traits.s @@ -81,119 +81,122 @@ write_node_info: # @write_node_info ld.d $a4, $a4, %got_pc_lo12(countbits16) move $a5, $a2 vori.b $vr4, $vr1, 0 - vori.b $vr5, $vr1, 0 .p2align 4, , 16 .LBB0_7: # %vector.body # =>This Inner Loop Header: Depth=1 - vld $vr6, $a3, -16 - vld $vr7, $a3, -20 - vld $vr8, $a3, 0 - vbsrl.v $vr9, $vr0, 12 + vld $vr5, $a3, -16 + vld $vr6, $a3, -20 + vld $vr7, $a3, 0 + vbsrl.v $vr8, $vr0, 12 vld $vr0, $a3, -4 - vbsll.v $vr10, $vr7, 4 + vbsll.v $vr9, $vr6, 4 + vor.v $vr8, $vr9, $vr8 + vbsrl.v $vr9, $vr6, 12 + vbsll.v $vr10, $vr0, 4 vor.v $vr9, $vr10, $vr9 - vbsrl.v $vr10, $vr7, 12 - vbsll.v $vr11, $vr0, 4 - vor.v $vr10, $vr11, $vr10 - vand.v $vr6, $vr6, $vr9 - vand.v $vr8, $vr8, $vr10 - vxor.v $vr9, $vr7, $vr2 - vxor.v $vr10, $vr0, $vr2 - vsrli.w $vr11, $vr6, 1 - vsrli.w $vr12, $vr8, 1 - vand.v $vr6, $vr11, $vr6 - vand.v $vr8, $vr12, $vr8 + vand.v $vr5, $vr5, $vr8 + vand.v $vr7, $vr7, $vr9 + vxor.v $vr8, $vr6, $vr2 + vxor.v $vr9, $vr0, $vr2 + vsrli.w $vr10, $vr5, 1 + vsrli.w $vr11, $vr7, 1 + vand.v $vr5, $vr10, $vr5 + vand.v $vr7, $vr11, $vr7 + vsrli.w $vr8, $vr8, 1 vsrli.w $vr9, $vr9, 1 - vsrli.w $vr10, $vr10, 1 - vor.v $vr6, $vr6, $vr7 - vor.v $vr7, $vr8, $vr0 + vor.v $vr5, $vr5, $vr6 + vor.v $vr6, $vr7, $vr0 + vandn.v $vr5, $vr5, $vr8 vandn.v $vr6, $vr6, $vr9 - vandn.v $vr7, $vr7, $vr10 + vand.v $vr7, $vr5, $vr3 vand.v $vr8, $vr6, $vr3 - vand.v $vr9, $vr7, $vr3 - vilvh.w $vr10, $vr1, $vr8 - vilvl.w $vr8, $vr1, $vr8 - vilvh.w $vr11, $vr1, $vr9 - vilvl.w $vr9, $vr1, $vr9 - vpickve2gr.d $a6, $vr8, 0 + vshuf4i.w $vr9, $vr7, 14 + vsllwil.du.wu $vr9, $vr9, 0 + vsllwil.du.wu $vr7, $vr7, 0 + vshuf4i.w $vr10, $vr8, 14 + vsllwil.du.wu $vr10, $vr10, 0 + vsllwil.du.wu $vr8, $vr8, 0 + vpickve2gr.d $a6, $vr7, 0 slli.d $a6, $a6, 2 - vpickve2gr.d $a7, $vr8, 1 + vpickve2gr.d $a7, $vr7, 1 slli.d $a7, $a7, 2 - vpickve2gr.d $t0, $vr10, 0 + vpickve2gr.d $t0, $vr9, 0 slli.d $t0, $t0, 2 - vpickve2gr.d $t1, $vr10, 1 + vpickve2gr.d $t1, $vr9, 1 slli.d $t1, $t1, 2 - vpickve2gr.d $t2, $vr9, 0 + vpickve2gr.d $t2, $vr8, 0 slli.d $t2, $t2, 2 - vpickve2gr.d $t3, $vr9, 1 + vpickve2gr.d $t3, $vr8, 1 slli.d $t3, $t3, 2 - vpickve2gr.d $t4, $vr11, 0 + vpickve2gr.d $t4, $vr10, 0 slli.d $t4, $t4, 2 - vpickve2gr.d $t5, $vr11, 1 + vpickve2gr.d $t5, $vr10, 1 slli.d $t5, $t5, 2 ldx.w $a6, $a4, $a6 ldx.w $a7, $a4, $a7 ldx.w $t0, $a4, $t0 ldx.w $t1, $a4, $t1 - vinsgr2vr.w $vr8, $a6, 0 - vinsgr2vr.w $vr8, $a7, 1 - vinsgr2vr.w $vr8, $t0, 2 - vinsgr2vr.w $vr8, $t1, 3 + vinsgr2vr.w $vr7, $a6, 0 + vinsgr2vr.w $vr7, $a7, 1 + vinsgr2vr.w $vr7, $t0, 2 + vinsgr2vr.w $vr7, $t1, 3 ldx.w $a6, $a4, $t2 ldx.w $a7, $a4, $t3 ldx.w $t0, $a4, $t4 ldx.w $t1, $a4, $t5 - vinsgr2vr.w $vr9, $a6, 0 - vinsgr2vr.w $vr9, $a7, 1 - vinsgr2vr.w $vr9, $t0, 2 - vinsgr2vr.w $vr9, $t1, 3 + vinsgr2vr.w $vr8, $a6, 0 + vinsgr2vr.w $vr8, $a7, 1 + vinsgr2vr.w $vr8, $t0, 2 + vinsgr2vr.w $vr8, $t1, 3 + vsrli.w $vr5, $vr5, 16 vsrli.w $vr6, $vr6, 16 - vsrli.w $vr7, $vr7, 16 - vilvh.w $vr10, $vr1, $vr6 - vilvl.w $vr6, $vr1, $vr6 - vilvh.w $vr11, $vr1, $vr7 - vilvl.w $vr7, $vr1, $vr7 - vpickve2gr.d $a6, $vr6, 0 + 
vshuf4i.w $vr9, $vr5, 14 + vsllwil.du.wu $vr9, $vr9, 0 + vsllwil.du.wu $vr5, $vr5, 0 + vshuf4i.w $vr10, $vr6, 14 + vsllwil.du.wu $vr10, $vr10, 0 + vsllwil.du.wu $vr6, $vr6, 0 + vpickve2gr.d $a6, $vr5, 0 slli.d $a6, $a6, 2 - vpickve2gr.d $a7, $vr6, 1 + vpickve2gr.d $a7, $vr5, 1 slli.d $a7, $a7, 2 - vpickve2gr.d $t0, $vr10, 0 + vpickve2gr.d $t0, $vr9, 0 slli.d $t0, $t0, 2 - vpickve2gr.d $t1, $vr10, 1 + vpickve2gr.d $t1, $vr9, 1 slli.d $t1, $t1, 2 - vpickve2gr.d $t2, $vr7, 0 + vpickve2gr.d $t2, $vr6, 0 slli.d $t2, $t2, 2 - vpickve2gr.d $t3, $vr7, 1 + vpickve2gr.d $t3, $vr6, 1 slli.d $t3, $t3, 2 - vpickve2gr.d $t4, $vr11, 0 + vpickve2gr.d $t4, $vr10, 0 slli.d $t4, $t4, 2 - vpickve2gr.d $t5, $vr11, 1 + vpickve2gr.d $t5, $vr10, 1 slli.d $t5, $t5, 2 ldx.w $a6, $a4, $a6 ldx.w $a7, $a4, $a7 ldx.w $t0, $a4, $t0 ldx.w $t1, $a4, $t1 - vinsgr2vr.w $vr6, $a6, 0 - vinsgr2vr.w $vr6, $a7, 1 - vinsgr2vr.w $vr6, $t0, 2 - vinsgr2vr.w $vr6, $t1, 3 + vinsgr2vr.w $vr5, $a6, 0 + vinsgr2vr.w $vr5, $a7, 1 + vinsgr2vr.w $vr5, $t0, 2 + vinsgr2vr.w $vr5, $t1, 3 ldx.w $a6, $a4, $t2 ldx.w $a7, $a4, $t3 ldx.w $t0, $a4, $t4 ldx.w $t1, $a4, $t5 - vinsgr2vr.w $vr7, $a6, 0 - vinsgr2vr.w $vr7, $a7, 1 - vinsgr2vr.w $vr7, $t0, 2 - vinsgr2vr.w $vr7, $t1, 3 + vinsgr2vr.w $vr6, $a6, 0 + vinsgr2vr.w $vr6, $a7, 1 + vinsgr2vr.w $vr6, $t0, 2 + vinsgr2vr.w $vr6, $t1, 3 + vadd.w $vr1, $vr7, $vr1 vadd.w $vr4, $vr8, $vr4 - vadd.w $vr5, $vr9, $vr5 + vadd.w $vr1, $vr1, $vr5 vadd.w $vr4, $vr4, $vr6 - vadd.w $vr5, $vr5, $vr7 addi.d $a5, $a5, -8 addi.d $a3, $a3, 32 bnez $a5, .LBB0_7 # %bb.8: # %middle.block - vadd.w $vr1, $vr5, $vr4 + vadd.w $vr1, $vr4, $vr1 vhaddw.d.w $vr1, $vr1, $vr1 vhaddw.q.d $vr1, $vr1, $vr1 vpickve2gr.d $a4, $vr1, 0 @@ -272,119 +275,122 @@ write_node_info: # @write_node_info ld.d $a5, $a5, %got_pc_lo12(countbits16) move $a6, $a3 vori.b $vr4, $vr1, 0 - vori.b $vr5, $vr1, 0 .p2align 4, , 16 .LBB0_17: # %vector.body75 # =>This Inner Loop Header: Depth=1 - vld $vr6, $a4, -16 - vld $vr7, $a4, -20 - vld $vr8, $a4, 0 - vbsrl.v $vr9, $vr0, 12 + vld $vr5, $a4, -16 + vld $vr6, $a4, -20 + vld $vr7, $a4, 0 + vbsrl.v $vr8, $vr0, 12 vld $vr0, $a4, -4 - vbsll.v $vr10, $vr7, 4 + vbsll.v $vr9, $vr6, 4 + vor.v $vr8, $vr9, $vr8 + vbsrl.v $vr9, $vr6, 12 + vbsll.v $vr10, $vr0, 4 vor.v $vr9, $vr10, $vr9 - vbsrl.v $vr10, $vr7, 12 - vbsll.v $vr11, $vr0, 4 - vor.v $vr10, $vr11, $vr10 - vand.v $vr6, $vr6, $vr9 - vand.v $vr8, $vr8, $vr10 - vxor.v $vr9, $vr7, $vr2 - vxor.v $vr10, $vr0, $vr2 - vsrli.w $vr11, $vr6, 1 - vsrli.w $vr12, $vr8, 1 - vand.v $vr6, $vr11, $vr6 - vand.v $vr8, $vr12, $vr8 + vand.v $vr5, $vr5, $vr8 + vand.v $vr7, $vr7, $vr9 + vxor.v $vr8, $vr6, $vr2 + vxor.v $vr9, $vr0, $vr2 + vsrli.w $vr10, $vr5, 1 + vsrli.w $vr11, $vr7, 1 + vand.v $vr5, $vr10, $vr5 + vand.v $vr7, $vr11, $vr7 + vsrli.w $vr8, $vr8, 1 vsrli.w $vr9, $vr9, 1 - vsrli.w $vr10, $vr10, 1 - vor.v $vr6, $vr6, $vr7 - vor.v $vr7, $vr8, $vr0 + vor.v $vr5, $vr5, $vr6 + vor.v $vr6, $vr7, $vr0 + vandn.v $vr5, $vr5, $vr8 vandn.v $vr6, $vr6, $vr9 - vandn.v $vr7, $vr7, $vr10 + vand.v $vr7, $vr5, $vr3 vand.v $vr8, $vr6, $vr3 - vand.v $vr9, $vr7, $vr3 - vilvh.w $vr10, $vr1, $vr8 - vilvl.w $vr8, $vr1, $vr8 - vilvh.w $vr11, $vr1, $vr9 - vilvl.w $vr9, $vr1, $vr9 - vpickve2gr.d $a7, $vr8, 0 + vshuf4i.w $vr9, $vr7, 14 + vsllwil.du.wu $vr9, $vr9, 0 + vsllwil.du.wu $vr7, $vr7, 0 + vshuf4i.w $vr10, $vr8, 14 + vsllwil.du.wu $vr10, $vr10, 0 + vsllwil.du.wu $vr8, $vr8, 0 + vpickve2gr.d $a7, $vr7, 0 slli.d $a7, $a7, 2 - vpickve2gr.d $t0, $vr8, 1 + vpickve2gr.d $t0, $vr7, 1 slli.d $t0, $t0, 2 - 
vpickve2gr.d $t1, $vr10, 0 + vpickve2gr.d $t1, $vr9, 0 slli.d $t1, $t1, 2 - vpickve2gr.d $t2, $vr10, 1 + vpickve2gr.d $t2, $vr9, 1 slli.d $t2, $t2, 2 - vpickve2gr.d $t3, $vr9, 0 + vpickve2gr.d $t3, $vr8, 0 slli.d $t3, $t3, 2 - vpickve2gr.d $t4, $vr9, 1 + vpickve2gr.d $t4, $vr8, 1 slli.d $t4, $t4, 2 - vpickve2gr.d $t5, $vr11, 0 + vpickve2gr.d $t5, $vr10, 0 slli.d $t5, $t5, 2 - vpickve2gr.d $t6, $vr11, 1 + vpickve2gr.d $t6, $vr10, 1 slli.d $t6, $t6, 2 ldx.w $a7, $a5, $a7 ldx.w $t0, $a5, $t0 ldx.w $t1, $a5, $t1 ldx.w $t2, $a5, $t2 - vinsgr2vr.w $vr8, $a7, 0 - vinsgr2vr.w $vr8, $t0, 1 - vinsgr2vr.w $vr8, $t1, 2 - vinsgr2vr.w $vr8, $t2, 3 + vinsgr2vr.w $vr7, $a7, 0 + vinsgr2vr.w $vr7, $t0, 1 + vinsgr2vr.w $vr7, $t1, 2 + vinsgr2vr.w $vr7, $t2, 3 ldx.w $a7, $a5, $t3 ldx.w $t0, $a5, $t4 ldx.w $t1, $a5, $t5 ldx.w $t2, $a5, $t6 - vinsgr2vr.w $vr9, $a7, 0 - vinsgr2vr.w $vr9, $t0, 1 - vinsgr2vr.w $vr9, $t1, 2 - vinsgr2vr.w $vr9, $t2, 3 + vinsgr2vr.w $vr8, $a7, 0 + vinsgr2vr.w $vr8, $t0, 1 + vinsgr2vr.w $vr8, $t1, 2 + vinsgr2vr.w $vr8, $t2, 3 + vsrli.w $vr5, $vr5, 16 vsrli.w $vr6, $vr6, 16 - vsrli.w $vr7, $vr7, 16 - vilvh.w $vr10, $vr1, $vr6 - vilvl.w $vr6, $vr1, $vr6 - vilvh.w $vr11, $vr1, $vr7 - vilvl.w $vr7, $vr1, $vr7 - vpickve2gr.d $a7, $vr6, 0 + vshuf4i.w $vr9, $vr5, 14 + vsllwil.du.wu $vr9, $vr9, 0 + vsllwil.du.wu $vr5, $vr5, 0 + vshuf4i.w $vr10, $vr6, 14 + vsllwil.du.wu $vr10, $vr10, 0 + vsllwil.du.wu $vr6, $vr6, 0 + vpickve2gr.d $a7, $vr5, 0 slli.d $a7, $a7, 2 - vpickve2gr.d $t0, $vr6, 1 + vpickve2gr.d $t0, $vr5, 1 slli.d $t0, $t0, 2 - vpickve2gr.d $t1, $vr10, 0 + vpickve2gr.d $t1, $vr9, 0 slli.d $t1, $t1, 2 - vpickve2gr.d $t2, $vr10, 1 + vpickve2gr.d $t2, $vr9, 1 slli.d $t2, $t2, 2 - vpickve2gr.d $t3, $vr7, 0 + vpickve2gr.d $t3, $vr6, 0 slli.d $t3, $t3, 2 - vpickve2gr.d $t4, $vr7, 1 + vpickve2gr.d $t4, $vr6, 1 slli.d $t4, $t4, 2 - vpickve2gr.d $t5, $vr11, 0 + vpickve2gr.d $t5, $vr10, 0 slli.d $t5, $t5, 2 - vpickve2gr.d $t6, $vr11, 1 + vpickve2gr.d $t6, $vr10, 1 slli.d $t6, $t6, 2 ldx.w $a7, $a5, $a7 ldx.w $t0, $a5, $t0 ldx.w $t1, $a5, $t1 ldx.w $t2, $a5, $t2 - vinsgr2vr.w $vr6, $a7, 0 - vinsgr2vr.w $vr6, $t0, 1 - vinsgr2vr.w $vr6, $t1, 2 - vinsgr2vr.w $vr6, $t2, 3 + vinsgr2vr.w $vr5, $a7, 0 + vinsgr2vr.w $vr5, $t0, 1 + vinsgr2vr.w $vr5, $t1, 2 + vinsgr2vr.w $vr5, $t2, 3 ldx.w $a7, $a5, $t3 ldx.w $t0, $a5, $t4 ldx.w $t1, $a5, $t5 ldx.w $t2, $a5, $t6 - vinsgr2vr.w $vr7, $a7, 0 - vinsgr2vr.w $vr7, $t0, 1 - vinsgr2vr.w $vr7, $t1, 2 - vinsgr2vr.w $vr7, $t2, 3 + vinsgr2vr.w $vr6, $a7, 0 + vinsgr2vr.w $vr6, $t0, 1 + vinsgr2vr.w $vr6, $t1, 2 + vinsgr2vr.w $vr6, $t2, 3 + vadd.w $vr1, $vr7, $vr1 vadd.w $vr4, $vr8, $vr4 - vadd.w $vr5, $vr9, $vr5 + vadd.w $vr1, $vr1, $vr5 vadd.w $vr4, $vr4, $vr6 - vadd.w $vr5, $vr5, $vr7 addi.d $a6, $a6, -8 addi.d $a4, $a4, 32 bnez $a6, .LBB0_17 # %bb.18: # %middle.block86 - vadd.w $vr1, $vr5, $vr4 + vadd.w $vr1, $vr4, $vr1 vhaddw.d.w $vr1, $vr1, $vr1 vhaddw.q.d $vr1, $vr1, $vr1 vpickve2gr.d $a4, $vr1, 0 @@ -489,99 +495,102 @@ write_node_info: # @write_node_info ld.d $a3, $a3, %got_pc_lo12(countbits16) move $a4, $a1 vori.b $vr3, $vr0, 0 - vori.b $vr4, $vr0, 0 .p2align 4, , 16 .LBB0_27: # %vector.body98 # =>This Inner Loop Header: Depth=1 - vld $vr5, $a2, -16 - vld $vr6, $a2, 0 + vld $vr4, $a2, -16 + vld $vr5, $a2, 0 + vxor.v $vr6, $vr4, $vr1 vxor.v $vr7, $vr5, $vr1 - vxor.v $vr8, $vr6, $vr1 + vandn.v $vr4, $vr4, $vr2 vandn.v $vr5, $vr5, $vr2 - vandn.v $vr6, $vr6, $vr2 - vilvh.w $vr9, $vr0, $vr5 - vilvl.w $vr5, $vr0, $vr5 - vilvh.w $vr10, $vr0, $vr6 - vilvl.w $vr6, $vr0, 
$vr6 - vpickve2gr.d $a5, $vr5, 0 + vshuf4i.w $vr8, $vr4, 14 + vsllwil.du.wu $vr8, $vr8, 0 + vsllwil.du.wu $vr4, $vr4, 0 + vshuf4i.w $vr9, $vr5, 14 + vsllwil.du.wu $vr9, $vr9, 0 + vsllwil.du.wu $vr5, $vr5, 0 + vpickve2gr.d $a5, $vr4, 0 slli.d $a5, $a5, 2 - vpickve2gr.d $a6, $vr5, 1 + vpickve2gr.d $a6, $vr4, 1 slli.d $a6, $a6, 2 - vpickve2gr.d $a7, $vr9, 0 + vpickve2gr.d $a7, $vr8, 0 slli.d $a7, $a7, 2 - vpickve2gr.d $t0, $vr9, 1 + vpickve2gr.d $t0, $vr8, 1 slli.d $t0, $t0, 2 - vpickve2gr.d $t1, $vr6, 0 + vpickve2gr.d $t1, $vr5, 0 slli.d $t1, $t1, 2 - vpickve2gr.d $t2, $vr6, 1 + vpickve2gr.d $t2, $vr5, 1 slli.d $t2, $t2, 2 - vpickve2gr.d $t3, $vr10, 0 + vpickve2gr.d $t3, $vr9, 0 slli.d $t3, $t3, 2 - vpickve2gr.d $t4, $vr10, 1 + vpickve2gr.d $t4, $vr9, 1 slli.d $t4, $t4, 2 ldx.w $a5, $a3, $a5 ldx.w $a6, $a3, $a6 ldx.w $a7, $a3, $a7 ldx.w $t0, $a3, $t0 - vinsgr2vr.w $vr5, $a5, 0 - vinsgr2vr.w $vr5, $a6, 1 - vinsgr2vr.w $vr5, $a7, 2 - vinsgr2vr.w $vr5, $t0, 3 + vinsgr2vr.w $vr4, $a5, 0 + vinsgr2vr.w $vr4, $a6, 1 + vinsgr2vr.w $vr4, $a7, 2 + vinsgr2vr.w $vr4, $t0, 3 ldx.w $a5, $a3, $t1 ldx.w $a6, $a3, $t2 ldx.w $a7, $a3, $t3 ldx.w $t0, $a3, $t4 - vinsgr2vr.w $vr6, $a5, 0 - vinsgr2vr.w $vr6, $a6, 1 - vinsgr2vr.w $vr6, $a7, 2 - vinsgr2vr.w $vr6, $t0, 3 + vinsgr2vr.w $vr5, $a5, 0 + vinsgr2vr.w $vr5, $a6, 1 + vinsgr2vr.w $vr5, $a7, 2 + vinsgr2vr.w $vr5, $t0, 3 + vsrli.w $vr6, $vr6, 16 vsrli.w $vr7, $vr7, 16 - vsrli.w $vr8, $vr8, 16 - vilvh.w $vr9, $vr0, $vr7 - vilvl.w $vr7, $vr0, $vr7 - vilvh.w $vr10, $vr0, $vr8 - vilvl.w $vr8, $vr0, $vr8 - vpickve2gr.d $a5, $vr7, 0 + vshuf4i.w $vr8, $vr6, 14 + vsllwil.du.wu $vr8, $vr8, 0 + vsllwil.du.wu $vr6, $vr6, 0 + vshuf4i.w $vr9, $vr7, 14 + vsllwil.du.wu $vr9, $vr9, 0 + vsllwil.du.wu $vr7, $vr7, 0 + vpickve2gr.d $a5, $vr6, 0 slli.d $a5, $a5, 2 - vpickve2gr.d $a6, $vr7, 1 + vpickve2gr.d $a6, $vr6, 1 slli.d $a6, $a6, 2 - vpickve2gr.d $a7, $vr9, 0 + vpickve2gr.d $a7, $vr8, 0 slli.d $a7, $a7, 2 - vpickve2gr.d $t0, $vr9, 1 + vpickve2gr.d $t0, $vr8, 1 slli.d $t0, $t0, 2 - vpickve2gr.d $t1, $vr8, 0 + vpickve2gr.d $t1, $vr7, 0 slli.d $t1, $t1, 2 - vpickve2gr.d $t2, $vr8, 1 + vpickve2gr.d $t2, $vr7, 1 slli.d $t2, $t2, 2 - vpickve2gr.d $t3, $vr10, 0 + vpickve2gr.d $t3, $vr9, 0 slli.d $t3, $t3, 2 - vpickve2gr.d $t4, $vr10, 1 + vpickve2gr.d $t4, $vr9, 1 slli.d $t4, $t4, 2 ldx.w $a5, $a3, $a5 ldx.w $a6, $a3, $a6 ldx.w $a7, $a3, $a7 ldx.w $t0, $a3, $t0 - vinsgr2vr.w $vr7, $a5, 0 - vinsgr2vr.w $vr7, $a6, 1 - vinsgr2vr.w $vr7, $a7, 2 - vinsgr2vr.w $vr7, $t0, 3 + vinsgr2vr.w $vr6, $a5, 0 + vinsgr2vr.w $vr6, $a6, 1 + vinsgr2vr.w $vr6, $a7, 2 + vinsgr2vr.w $vr6, $t0, 3 ldx.w $a5, $a3, $t1 ldx.w $a6, $a3, $t2 ldx.w $a7, $a3, $t3 ldx.w $t0, $a3, $t4 - vinsgr2vr.w $vr8, $a5, 0 - vinsgr2vr.w $vr8, $a6, 1 - vinsgr2vr.w $vr8, $a7, 2 - vinsgr2vr.w $vr8, $t0, 3 + vinsgr2vr.w $vr7, $a5, 0 + vinsgr2vr.w $vr7, $a6, 1 + vinsgr2vr.w $vr7, $a7, 2 + vinsgr2vr.w $vr7, $t0, 3 + vadd.w $vr0, $vr4, $vr0 vadd.w $vr3, $vr5, $vr3 - vadd.w $vr4, $vr6, $vr4 + vadd.w $vr0, $vr0, $vr6 vadd.w $vr3, $vr3, $vr7 - vadd.w $vr4, $vr4, $vr8 addi.d $a4, $a4, -8 addi.d $a2, $a2, 32 bnez $a4, .LBB0_27 # %bb.28: # %middle.block105 - vadd.w $vr0, $vr4, $vr3 + vadd.w $vr0, $vr3, $vr0 vhaddw.d.w $vr0, $vr0, $vr0 vhaddw.q.d $vr0, $vr0, $vr0 vpickve2gr.d $a2, $vr0, 0 @@ -650,105 +659,108 @@ write_node_info: # @write_node_info ld.d $a4, $a4, %got_pc_lo12(countbits16) move $a5, $a2 vori.b $vr3, $vr1, 0 - vori.b $vr4, $vr1, 0 .p2align 4, , 16 .LBB0_36: # %vector.body115 # =>This Inner Loop Header: Depth=1 - vld $vr5, $a3, 
-16 - vbsrl.v $vr6, $vr0, 12 + vld $vr4, $a3, -16 + vbsrl.v $vr5, $vr0, 12 vld $vr0, $a3, 0 - vbsll.v $vr7, $vr5, 4 + vbsll.v $vr6, $vr4, 4 + vor.v $vr5, $vr6, $vr5 + vbsrl.v $vr6, $vr4, 12 + vbsll.v $vr7, $vr0, 4 vor.v $vr6, $vr7, $vr6 - vbsrl.v $vr7, $vr5, 12 - vbsll.v $vr8, $vr0, 4 - vor.v $vr7, $vr8, $vr7 - vxor.v $vr5, $vr5, $vr6 - vxor.v $vr6, $vr0, $vr7 + vxor.v $vr4, $vr4, $vr5 + vxor.v $vr5, $vr0, $vr6 + vand.v $vr6, $vr4, $vr2 vand.v $vr7, $vr5, $vr2 - vand.v $vr8, $vr6, $vr2 - vilvh.w $vr9, $vr1, $vr7 - vilvl.w $vr7, $vr1, $vr7 - vilvh.w $vr10, $vr1, $vr8 - vilvl.w $vr8, $vr1, $vr8 - vpickve2gr.d $a6, $vr7, 0 + vshuf4i.w $vr8, $vr6, 14 + vsllwil.du.wu $vr8, $vr8, 0 + vsllwil.du.wu $vr6, $vr6, 0 + vshuf4i.w $vr9, $vr7, 14 + vsllwil.du.wu $vr9, $vr9, 0 + vsllwil.du.wu $vr7, $vr7, 0 + vpickve2gr.d $a6, $vr6, 0 slli.d $a6, $a6, 2 - vpickve2gr.d $a7, $vr7, 1 + vpickve2gr.d $a7, $vr6, 1 slli.d $a7, $a7, 2 - vpickve2gr.d $t0, $vr9, 0 + vpickve2gr.d $t0, $vr8, 0 slli.d $t0, $t0, 2 - vpickve2gr.d $t1, $vr9, 1 + vpickve2gr.d $t1, $vr8, 1 slli.d $t1, $t1, 2 - vpickve2gr.d $t2, $vr8, 0 + vpickve2gr.d $t2, $vr7, 0 slli.d $t2, $t2, 2 - vpickve2gr.d $t3, $vr8, 1 + vpickve2gr.d $t3, $vr7, 1 slli.d $t3, $t3, 2 - vpickve2gr.d $t4, $vr10, 0 + vpickve2gr.d $t4, $vr9, 0 slli.d $t4, $t4, 2 - vpickve2gr.d $t5, $vr10, 1 + vpickve2gr.d $t5, $vr9, 1 slli.d $t5, $t5, 2 ldx.w $a6, $a4, $a6 ldx.w $a7, $a4, $a7 ldx.w $t0, $a4, $t0 ldx.w $t1, $a4, $t1 - vinsgr2vr.w $vr7, $a6, 0 - vinsgr2vr.w $vr7, $a7, 1 - vinsgr2vr.w $vr7, $t0, 2 - vinsgr2vr.w $vr7, $t1, 3 + vinsgr2vr.w $vr6, $a6, 0 + vinsgr2vr.w $vr6, $a7, 1 + vinsgr2vr.w $vr6, $t0, 2 + vinsgr2vr.w $vr6, $t1, 3 ldx.w $a6, $a4, $t2 ldx.w $a7, $a4, $t3 ldx.w $t0, $a4, $t4 ldx.w $t1, $a4, $t5 - vinsgr2vr.w $vr8, $a6, 0 - vinsgr2vr.w $vr8, $a7, 1 - vinsgr2vr.w $vr8, $t0, 2 - vinsgr2vr.w $vr8, $t1, 3 + vinsgr2vr.w $vr7, $a6, 0 + vinsgr2vr.w $vr7, $a7, 1 + vinsgr2vr.w $vr7, $t0, 2 + vinsgr2vr.w $vr7, $t1, 3 + vsrli.w $vr4, $vr4, 16 vsrli.w $vr5, $vr5, 16 - vsrli.w $vr6, $vr6, 16 - vilvh.w $vr9, $vr1, $vr5 - vilvl.w $vr5, $vr1, $vr5 - vilvh.w $vr10, $vr1, $vr6 - vilvl.w $vr6, $vr1, $vr6 - vpickve2gr.d $a6, $vr5, 0 + vshuf4i.w $vr8, $vr4, 14 + vsllwil.du.wu $vr8, $vr8, 0 + vsllwil.du.wu $vr4, $vr4, 0 + vshuf4i.w $vr9, $vr5, 14 + vsllwil.du.wu $vr9, $vr9, 0 + vsllwil.du.wu $vr5, $vr5, 0 + vpickve2gr.d $a6, $vr4, 0 slli.d $a6, $a6, 2 - vpickve2gr.d $a7, $vr5, 1 + vpickve2gr.d $a7, $vr4, 1 slli.d $a7, $a7, 2 - vpickve2gr.d $t0, $vr9, 0 + vpickve2gr.d $t0, $vr8, 0 slli.d $t0, $t0, 2 - vpickve2gr.d $t1, $vr9, 1 + vpickve2gr.d $t1, $vr8, 1 slli.d $t1, $t1, 2 - vpickve2gr.d $t2, $vr6, 0 + vpickve2gr.d $t2, $vr5, 0 slli.d $t2, $t2, 2 - vpickve2gr.d $t3, $vr6, 1 + vpickve2gr.d $t3, $vr5, 1 slli.d $t3, $t3, 2 - vpickve2gr.d $t4, $vr10, 0 + vpickve2gr.d $t4, $vr9, 0 slli.d $t4, $t4, 2 - vpickve2gr.d $t5, $vr10, 1 + vpickve2gr.d $t5, $vr9, 1 slli.d $t5, $t5, 2 ldx.w $a6, $a4, $a6 ldx.w $a7, $a4, $a7 ldx.w $t0, $a4, $t0 ldx.w $t1, $a4, $t1 - vinsgr2vr.w $vr5, $a6, 0 - vinsgr2vr.w $vr5, $a7, 1 - vinsgr2vr.w $vr5, $t0, 2 - vinsgr2vr.w $vr5, $t1, 3 + vinsgr2vr.w $vr4, $a6, 0 + vinsgr2vr.w $vr4, $a7, 1 + vinsgr2vr.w $vr4, $t0, 2 + vinsgr2vr.w $vr4, $t1, 3 ldx.w $a6, $a4, $t2 ldx.w $a7, $a4, $t3 ldx.w $t0, $a4, $t4 ldx.w $t1, $a4, $t5 - vinsgr2vr.w $vr6, $a6, 0 - vinsgr2vr.w $vr6, $a7, 1 - vinsgr2vr.w $vr6, $t0, 2 - vinsgr2vr.w $vr6, $t1, 3 + vinsgr2vr.w $vr5, $a6, 0 + vinsgr2vr.w $vr5, $a7, 1 + vinsgr2vr.w $vr5, $t0, 2 + vinsgr2vr.w $vr5, $t1, 3 + vadd.w $vr1, $vr6, $vr1 vadd.w 
$vr3, $vr7, $vr3 - vadd.w $vr4, $vr8, $vr4 + vadd.w $vr1, $vr1, $vr4 vadd.w $vr3, $vr3, $vr5 - vadd.w $vr4, $vr4, $vr6 addi.d $a5, $a5, -8 addi.d $a3, $a3, 32 bnez $a5, .LBB0_36 # %bb.37: # %middle.block124 - vadd.w $vr1, $vr4, $vr3 + vadd.w $vr1, $vr3, $vr1 vhaddw.d.w $vr1, $vr1, $vr1 vhaddw.q.d $vr1, $vr1, $vr1 vpickve2gr.d $a3, $vr1, 0 @@ -817,103 +829,106 @@ write_node_info: # @write_node_info ld.d $a3, $a3, %got_pc_lo12(countbits16) move $a4, $a1 vori.b $vr2, $vr0, 0 - vori.b $vr3, $vr0, 0 .p2align 4, , 16 .LBB0_46: # %vector.body136 # =>This Inner Loop Header: Depth=1 - vld $vr4, $a2, -16 - vld $vr5, $a2, 0 + vld $vr3, $a2, -16 + vld $vr4, $a2, 0 + vsrli.w $vr5, $vr3, 1 vsrli.w $vr6, $vr4, 1 - vsrli.w $vr7, $vr5, 1 + vbitclri.w $vr3, $vr3, 31 vbitclri.w $vr4, $vr4, 31 - vbitclri.w $vr5, $vr5, 31 + vxor.v $vr3, $vr5, $vr3 vxor.v $vr4, $vr6, $vr4 - vxor.v $vr5, $vr7, $vr5 + vand.v $vr5, $vr3, $vr1 vand.v $vr6, $vr4, $vr1 - vand.v $vr7, $vr5, $vr1 - vilvh.w $vr8, $vr0, $vr6 - vilvl.w $vr6, $vr0, $vr6 - vilvh.w $vr9, $vr0, $vr7 - vilvl.w $vr7, $vr0, $vr7 - vpickve2gr.d $a5, $vr6, 0 + vshuf4i.w $vr7, $vr5, 14 + vsllwil.du.wu $vr7, $vr7, 0 + vsllwil.du.wu $vr5, $vr5, 0 + vshuf4i.w $vr8, $vr6, 14 + vsllwil.du.wu $vr8, $vr8, 0 + vsllwil.du.wu $vr6, $vr6, 0 + vpickve2gr.d $a5, $vr5, 0 slli.d $a5, $a5, 2 - vpickve2gr.d $a6, $vr6, 1 + vpickve2gr.d $a6, $vr5, 1 slli.d $a6, $a6, 2 - vpickve2gr.d $a7, $vr8, 0 + vpickve2gr.d $a7, $vr7, 0 slli.d $a7, $a7, 2 - vpickve2gr.d $t0, $vr8, 1 + vpickve2gr.d $t0, $vr7, 1 slli.d $t0, $t0, 2 - vpickve2gr.d $t1, $vr7, 0 + vpickve2gr.d $t1, $vr6, 0 slli.d $t1, $t1, 2 - vpickve2gr.d $t2, $vr7, 1 + vpickve2gr.d $t2, $vr6, 1 slli.d $t2, $t2, 2 - vpickve2gr.d $t3, $vr9, 0 + vpickve2gr.d $t3, $vr8, 0 slli.d $t3, $t3, 2 - vpickve2gr.d $t4, $vr9, 1 + vpickve2gr.d $t4, $vr8, 1 slli.d $t4, $t4, 2 ldx.w $a5, $a3, $a5 ldx.w $a6, $a3, $a6 ldx.w $a7, $a3, $a7 ldx.w $t0, $a3, $t0 - vinsgr2vr.w $vr6, $a5, 0 - vinsgr2vr.w $vr6, $a6, 1 - vinsgr2vr.w $vr6, $a7, 2 - vinsgr2vr.w $vr6, $t0, 3 + vinsgr2vr.w $vr5, $a5, 0 + vinsgr2vr.w $vr5, $a6, 1 + vinsgr2vr.w $vr5, $a7, 2 + vinsgr2vr.w $vr5, $t0, 3 ldx.w $a5, $a3, $t1 ldx.w $a6, $a3, $t2 ldx.w $a7, $a3, $t3 ldx.w $t0, $a3, $t4 - vinsgr2vr.w $vr7, $a5, 0 - vinsgr2vr.w $vr7, $a6, 1 - vinsgr2vr.w $vr7, $a7, 2 - vinsgr2vr.w $vr7, $t0, 3 + vinsgr2vr.w $vr6, $a5, 0 + vinsgr2vr.w $vr6, $a6, 1 + vinsgr2vr.w $vr6, $a7, 2 + vinsgr2vr.w $vr6, $t0, 3 + vsrli.w $vr3, $vr3, 16 vsrli.w $vr4, $vr4, 16 - vsrli.w $vr5, $vr5, 16 - vilvh.w $vr8, $vr0, $vr4 - vilvl.w $vr4, $vr0, $vr4 - vilvh.w $vr9, $vr0, $vr5 - vilvl.w $vr5, $vr0, $vr5 - vpickve2gr.d $a5, $vr4, 0 + vshuf4i.w $vr7, $vr3, 14 + vsllwil.du.wu $vr7, $vr7, 0 + vsllwil.du.wu $vr3, $vr3, 0 + vshuf4i.w $vr8, $vr4, 14 + vsllwil.du.wu $vr8, $vr8, 0 + vsllwil.du.wu $vr4, $vr4, 0 + vpickve2gr.d $a5, $vr3, 0 slli.d $a5, $a5, 2 - vpickve2gr.d $a6, $vr4, 1 + vpickve2gr.d $a6, $vr3, 1 slli.d $a6, $a6, 2 - vpickve2gr.d $a7, $vr8, 0 + vpickve2gr.d $a7, $vr7, 0 slli.d $a7, $a7, 2 - vpickve2gr.d $t0, $vr8, 1 + vpickve2gr.d $t0, $vr7, 1 slli.d $t0, $t0, 2 - vpickve2gr.d $t1, $vr5, 0 + vpickve2gr.d $t1, $vr4, 0 slli.d $t1, $t1, 2 - vpickve2gr.d $t2, $vr5, 1 + vpickve2gr.d $t2, $vr4, 1 slli.d $t2, $t2, 2 - vpickve2gr.d $t3, $vr9, 0 + vpickve2gr.d $t3, $vr8, 0 slli.d $t3, $t3, 2 - vpickve2gr.d $t4, $vr9, 1 + vpickve2gr.d $t4, $vr8, 1 slli.d $t4, $t4, 2 ldx.w $a5, $a3, $a5 ldx.w $a6, $a3, $a6 ldx.w $a7, $a3, $a7 ldx.w $t0, $a3, $t0 - vinsgr2vr.w $vr4, $a5, 0 - vinsgr2vr.w $vr4, $a6, 1 - vinsgr2vr.w $vr4, $a7, 2 
- vinsgr2vr.w $vr4, $t0, 3 + vinsgr2vr.w $vr3, $a5, 0 + vinsgr2vr.w $vr3, $a6, 1 + vinsgr2vr.w $vr3, $a7, 2 + vinsgr2vr.w $vr3, $t0, 3 ldx.w $a5, $a3, $t1 ldx.w $a6, $a3, $t2 ldx.w $a7, $a3, $t3 ldx.w $t0, $a3, $t4 - vinsgr2vr.w $vr5, $a5, 0 - vinsgr2vr.w $vr5, $a6, 1 - vinsgr2vr.w $vr5, $a7, 2 - vinsgr2vr.w $vr5, $t0, 3 + vinsgr2vr.w $vr4, $a5, 0 + vinsgr2vr.w $vr4, $a6, 1 + vinsgr2vr.w $vr4, $a7, 2 + vinsgr2vr.w $vr4, $t0, 3 + vadd.w $vr0, $vr5, $vr0 vadd.w $vr2, $vr6, $vr2 - vadd.w $vr3, $vr7, $vr3 + vadd.w $vr0, $vr0, $vr3 vadd.w $vr2, $vr2, $vr4 - vadd.w $vr3, $vr3, $vr5 addi.d $a4, $a4, -8 addi.d $a2, $a2, 32 bnez $a4, .LBB0_46 # %bb.47: # %middle.block143 - vadd.w $vr0, $vr3, $vr2 + vadd.w $vr0, $vr2, $vr0 vhaddw.d.w $vr0, $vr0, $vr0 vhaddw.q.d $vr0, $vr0, $vr0 vpickve2gr.d $a2, $vr0, 0 @@ -1023,123 +1038,126 @@ tr_non_safe_moves_a_little_touchy: # @tr_non_safe_moves_a_little_touchy ld.d $a6, $a6, %got_pc_lo12(countbits16) move $a7, $a4 vori.b $vr3, $vr1, 0 - vori.b $vr4, $vr1, 0 .p2align 4, , 16 .LBB1_5: # %vector.body # =>This Inner Loop Header: Depth=1 - vld $vr5, $a5, -16 - vld $vr6, $a5, -20 - vld $vr7, $a5, 0 - vbsrl.v $vr8, $vr0, 12 + vld $vr4, $a5, -16 + vld $vr5, $a5, -20 + vld $vr6, $a5, 0 + vbsrl.v $vr7, $vr0, 12 vld $vr0, $a5, -4 - vbsll.v $vr9, $vr6, 4 + vbsll.v $vr8, $vr5, 4 + vor.v $vr7, $vr8, $vr7 + vbsrl.v $vr8, $vr5, 12 + vbsll.v $vr9, $vr0, 4 vor.v $vr8, $vr9, $vr8 - vbsrl.v $vr9, $vr6, 12 - vbsll.v $vr10, $vr0, 4 - vor.v $vr9, $vr10, $vr9 - vor.v $vr5, $vr5, $vr8 - vor.v $vr7, $vr7, $vr9 - vsrli.w $vr8, $vr6, 1 - vsrli.w $vr9, $vr0, 1 - vor.v $vr5, $vr5, $vr6 - vor.v $vr7, $vr7, $vr0 - vslli.w $vr10, $vr5, 1 - vslli.w $vr11, $vr7, 1 - vslli.w $vr6, $vr6, 2 - vslli.w $vr12, $vr0, 2 - vor.v $vr6, $vr8, $vr6 - vor.v $vr8, $vr9, $vr12 - vor.v $vr6, $vr6, $vr10 - vor.v $vr8, $vr8, $vr11 - vor.v $vr9, $vr5, $vr6 - vor.v $vr10, $vr7, $vr8 - vnor.v $vr5, $vr5, $vr6 - vnor.v $vr6, $vr7, $vr8 + vor.v $vr4, $vr4, $vr7 + vor.v $vr6, $vr6, $vr8 + vsrli.w $vr7, $vr5, 1 + vsrli.w $vr8, $vr0, 1 + vor.v $vr4, $vr4, $vr5 + vor.v $vr6, $vr6, $vr0 + vslli.w $vr9, $vr4, 1 + vslli.w $vr10, $vr6, 1 + vslli.w $vr5, $vr5, 2 + vslli.w $vr11, $vr0, 2 + vor.v $vr5, $vr7, $vr5 + vor.v $vr7, $vr8, $vr11 + vor.v $vr5, $vr5, $vr9 + vor.v $vr7, $vr7, $vr10 + vor.v $vr8, $vr4, $vr5 + vor.v $vr9, $vr6, $vr7 + vnor.v $vr4, $vr4, $vr5 + vnor.v $vr5, $vr6, $vr7 + vandn.v $vr6, $vr8, $vr2 vandn.v $vr7, $vr9, $vr2 - vandn.v $vr8, $vr10, $vr2 - vilvh.w $vr9, $vr1, $vr7 - vilvl.w $vr7, $vr1, $vr7 - vilvh.w $vr10, $vr1, $vr8 - vilvl.w $vr8, $vr1, $vr8 - vpickve2gr.d $t0, $vr7, 0 + vshuf4i.w $vr8, $vr6, 14 + vsllwil.du.wu $vr8, $vr8, 0 + vsllwil.du.wu $vr6, $vr6, 0 + vshuf4i.w $vr9, $vr7, 14 + vsllwil.du.wu $vr9, $vr9, 0 + vsllwil.du.wu $vr7, $vr7, 0 + vpickve2gr.d $t0, $vr6, 0 slli.d $t0, $t0, 2 - vpickve2gr.d $t1, $vr7, 1 + vpickve2gr.d $t1, $vr6, 1 slli.d $t1, $t1, 2 - vpickve2gr.d $t2, $vr9, 0 + vpickve2gr.d $t2, $vr8, 0 slli.d $t2, $t2, 2 - vpickve2gr.d $t3, $vr9, 1 + vpickve2gr.d $t3, $vr8, 1 slli.d $t3, $t3, 2 - vpickve2gr.d $t4, $vr8, 0 + vpickve2gr.d $t4, $vr7, 0 slli.d $t4, $t4, 2 - vpickve2gr.d $t5, $vr8, 1 + vpickve2gr.d $t5, $vr7, 1 slli.d $t5, $t5, 2 - vpickve2gr.d $t6, $vr10, 0 + vpickve2gr.d $t6, $vr9, 0 slli.d $t6, $t6, 2 - vpickve2gr.d $t7, $vr10, 1 + vpickve2gr.d $t7, $vr9, 1 slli.d $t7, $t7, 2 ldx.w $t0, $a6, $t0 ldx.w $t1, $a6, $t1 ldx.w $t2, $a6, $t2 ldx.w $t3, $a6, $t3 - vinsgr2vr.w $vr7, $t0, 0 - vinsgr2vr.w $vr7, $t1, 1 - vinsgr2vr.w $vr7, $t2, 2 - vinsgr2vr.w $vr7, $t3, 3 + 
vinsgr2vr.w $vr6, $t0, 0 + vinsgr2vr.w $vr6, $t1, 1 + vinsgr2vr.w $vr6, $t2, 2 + vinsgr2vr.w $vr6, $t3, 3 ldx.w $t0, $a6, $t4 ldx.w $t1, $a6, $t5 ldx.w $t2, $a6, $t6 ldx.w $t3, $a6, $t7 - vinsgr2vr.w $vr8, $t0, 0 - vinsgr2vr.w $vr8, $t1, 1 - vinsgr2vr.w $vr8, $t2, 2 - vinsgr2vr.w $vr8, $t3, 3 + vinsgr2vr.w $vr7, $t0, 0 + vinsgr2vr.w $vr7, $t1, 1 + vinsgr2vr.w $vr7, $t2, 2 + vinsgr2vr.w $vr7, $t3, 3 + vsrli.w $vr4, $vr4, 16 vsrli.w $vr5, $vr5, 16 - vsrli.w $vr6, $vr6, 16 - vilvh.w $vr9, $vr1, $vr5 - vilvl.w $vr5, $vr1, $vr5 - vilvh.w $vr10, $vr1, $vr6 - vilvl.w $vr6, $vr1, $vr6 - vpickve2gr.d $t0, $vr5, 0 + vshuf4i.w $vr8, $vr4, 14 + vsllwil.du.wu $vr8, $vr8, 0 + vsllwil.du.wu $vr4, $vr4, 0 + vshuf4i.w $vr9, $vr5, 14 + vsllwil.du.wu $vr9, $vr9, 0 + vsllwil.du.wu $vr5, $vr5, 0 + vpickve2gr.d $t0, $vr4, 0 slli.d $t0, $t0, 2 - vpickve2gr.d $t1, $vr5, 1 + vpickve2gr.d $t1, $vr4, 1 slli.d $t1, $t1, 2 - vpickve2gr.d $t2, $vr9, 0 + vpickve2gr.d $t2, $vr8, 0 slli.d $t2, $t2, 2 - vpickve2gr.d $t3, $vr9, 1 + vpickve2gr.d $t3, $vr8, 1 slli.d $t3, $t3, 2 - vpickve2gr.d $t4, $vr6, 0 + vpickve2gr.d $t4, $vr5, 0 slli.d $t4, $t4, 2 - vpickve2gr.d $t5, $vr6, 1 + vpickve2gr.d $t5, $vr5, 1 slli.d $t5, $t5, 2 - vpickve2gr.d $t6, $vr10, 0 + vpickve2gr.d $t6, $vr9, 0 slli.d $t6, $t6, 2 - vpickve2gr.d $t7, $vr10, 1 + vpickve2gr.d $t7, $vr9, 1 slli.d $t7, $t7, 2 ldx.w $t0, $a6, $t0 ldx.w $t1, $a6, $t1 ldx.w $t2, $a6, $t2 ldx.w $t3, $a6, $t3 - vinsgr2vr.w $vr5, $t0, 0 - vinsgr2vr.w $vr5, $t1, 1 - vinsgr2vr.w $vr5, $t2, 2 - vinsgr2vr.w $vr5, $t3, 3 + vinsgr2vr.w $vr4, $t0, 0 + vinsgr2vr.w $vr4, $t1, 1 + vinsgr2vr.w $vr4, $t2, 2 + vinsgr2vr.w $vr4, $t3, 3 ldx.w $t0, $a6, $t4 ldx.w $t1, $a6, $t5 ldx.w $t2, $a6, $t6 ldx.w $t3, $a6, $t7 - vinsgr2vr.w $vr6, $t0, 0 - vinsgr2vr.w $vr6, $t1, 1 - vinsgr2vr.w $vr6, $t2, 2 - vinsgr2vr.w $vr6, $t3, 3 + vinsgr2vr.w $vr5, $t0, 0 + vinsgr2vr.w $vr5, $t1, 1 + vinsgr2vr.w $vr5, $t2, 2 + vinsgr2vr.w $vr5, $t3, 3 + vadd.w $vr1, $vr6, $vr1 vadd.w $vr3, $vr7, $vr3 - vadd.w $vr4, $vr8, $vr4 + vadd.w $vr1, $vr1, $vr4 vadd.w $vr3, $vr3, $vr5 - vadd.w $vr4, $vr4, $vr6 addi.d $a7, $a7, -8 addi.d $a5, $a5, 32 bnez $a7, .LBB1_5 # %bb.6: # %middle.block - vadd.w $vr1, $vr4, $vr3 + vadd.w $vr1, $vr3, $vr1 vhaddw.d.w $vr1, $vr1, $vr1 vhaddw.q.d $vr1, $vr1, $vr1 vpickve2gr.d $a5, $vr1, 0 diff --git a/results/MultiSource/Applications/oggenc/CMakeFiles/oggenc.dir/oggenc.s b/results/MultiSource/Applications/oggenc/CMakeFiles/oggenc.dir/oggenc.s index 4156b131..21649dcb 100644 --- a/results/MultiSource/Applications/oggenc/CMakeFiles/oggenc.dir/oggenc.s +++ b/results/MultiSource/Applications/oggenc/CMakeFiles/oggenc.dir/oggenc.s @@ -21219,25 +21219,24 @@ ogg_stream_pagein: # @ogg_stream_pagein addi.d $a7, $a7, 8 vrepli.w $vr2, 255 move $t1, $t0 - vori.b $vr3, $vr0, 0 .p2align 4, , 16 .LBB110_52: # %vector.body # =>This Inner Loop Header: Depth=1 ld.d $t2, $a7, -8 ld.d $t3, $a7, 0 - vinsgr2vr.d $vr4, $t2, 0 - vinsgr2vr.d $vr5, $t3, 0 + vinsgr2vr.d $vr3, $t2, 0 + vinsgr2vr.d $vr4, $t3, 0 + vand.v $vr3, $vr3, $vr2 vand.v $vr4, $vr4, $vr2 - vand.v $vr5, $vr5, $vr2 - vilvl.w $vr4, $vr0, $vr4 - vilvl.w $vr5, $vr0, $vr5 - vsub.d $vr1, $vr1, $vr4 - vsub.d $vr3, $vr3, $vr5 + vsllwil.du.wu $vr3, $vr3, 0 + vsllwil.du.wu $vr4, $vr4, 0 + vsub.d $vr1, $vr1, $vr3 + vsub.d $vr0, $vr0, $vr4 addi.d $t1, $t1, -4 addi.d $a7, $a7, 16 bnez $t1, .LBB110_52 # %bb.53: # %middle.block - vadd.d $vr0, $vr3, $vr1 + vadd.d $vr0, $vr0, $vr1 vhaddw.q.d $vr0, $vr0, $vr0 vstelm.d $vr0, $a3, 0, 0 beq $a6, $t0, .LBB110_20 @@ -56731,12 
+56730,9 @@ _vp_remove_floor: # @_vp_remove_floor .LBB221_12: # %vector.body # =>This Inner Loop Header: Depth=1 vld $vr0, $t0, 0 - vshuf4i.w $vr1, $vr0, 50 - vslli.d $vr1, $vr1, 32 - vsrai.d $vr1, $vr1, 32 - vshuf4i.w $vr0, $vr0, 16 - vslli.d $vr0, $vr0, 32 - vsrai.d $vr0, $vr0, 32 + vshuf4i.w $vr1, $vr0, 14 + vsllwil.d.w $vr1, $vr1, 0 + vsllwil.d.w $vr0, $vr0, 0 vpickve2gr.d $t3, $vr0, 0 slli.d $t3, $t3, 2 vpickve2gr.d $t4, $vr0, 1 @@ -64343,42 +64339,41 @@ floor1_encode: # @floor1_encode ori $a4, $a4, 1366 lu32i.d $a4, 349525 lu52i.d $a4, $a4, 341 - vrepli.b $vr5, 0 - vreplgr2vr.w $vr6, $a0 + vreplgr2vr.w $vr5, $a0 move $a5, $a2 move $a6, $a3 .p2align 4, , 16 .LBB256_17: # %vector.body # =>This Inner Loop Header: Depth=1 - vld $vr7, $a5, 0 - vand.v $vr8, $vr7, $vr4 - vsrli.w $vr9, $vr8, 4 - vpickve2gr.h $a7, $vr7, 6 + vld $vr6, $a5, 0 + vand.v $vr7, $vr6, $vr4 + vsrli.w $vr8, $vr7, 4 + vpickve2gr.h $a7, $vr6, 2 bstrpick.d $a7, $a7, 14, 0 mulh.du $a7, $a7, $a4 - vpickve2gr.h $t0, $vr7, 4 + vpickve2gr.h $t0, $vr6, 0 bstrpick.d $t0, $t0, 14, 0 mulh.du $t0, $t0, $a4 - vpickve2gr.h $t1, $vr7, 2 - bstrpick.d $t1, $t1, 14, 0 - mulh.du $t1, $t1, $a4 - vpickve2gr.h $t2, $vr7, 0 - bstrpick.d $t2, $t2, 14, 0 - mulh.du $t2, $t2, $a4 - vori.b $vr10, $vr5, 0 - vinsgr2vr.h $vr10, $t2, 0 - vinsgr2vr.h $vr10, $t1, 2 - vinsgr2vr.h $vr10, $t0, 4 - vinsgr2vr.h $vr10, $a7, 6 - vsrli.w $vr11, $vr8, 3 - vsrli.w $vr12, $vr8, 2 - vbitsel.v $vr8, $vr8, $vr9, $vr0 - vbitsel.v $vr8, $vr8, $vr10, $vr1 - vbitsel.v $vr8, $vr8, $vr11, $vr2 - vbitsel.v $vr8, $vr8, $vr12, $vr3 - vand.v $vr7, $vr7, $vr6 - vor.v $vr7, $vr8, $vr7 - vst $vr7, $a5, 0 + vinsgr2vr.h $vr9, $t0, 0 + vinsgr2vr.h $vr9, $a7, 1 + vpickve2gr.h $a7, $vr6, 4 + bstrpick.d $a7, $a7, 14, 0 + mulh.du $a7, $a7, $a4 + vinsgr2vr.h $vr9, $a7, 2 + vpickve2gr.h $a7, $vr6, 6 + bstrpick.d $a7, $a7, 14, 0 + mulh.du $a7, $a7, $a4 + vinsgr2vr.h $vr9, $a7, 3 + vsllwil.wu.hu $vr9, $vr9, 0 + vsrli.w $vr10, $vr7, 3 + vsrli.w $vr11, $vr7, 2 + vbitsel.v $vr7, $vr7, $vr8, $vr0 + vbitsel.v $vr7, $vr7, $vr9, $vr1 + vbitsel.v $vr7, $vr7, $vr10, $vr2 + vbitsel.v $vr7, $vr7, $vr11, $vr3 + vand.v $vr6, $vr6, $vr5 + vor.v $vr6, $vr7, $vr6 + vst $vr6, $a5, 0 addi.d $a6, $a6, -4 addi.d $a5, $a5, 16 bnez $a6, .LBB256_17 @@ -65935,18 +65930,12 @@ floor1_look: # @floor1_look # =>This Inner Loop Header: Depth=1 vld $vr2, $a3, -16 vld $vr3, $a3, 0 - vshuf4i.w $vr4, $vr2, 50 - vslli.d $vr4, $vr4, 32 - vsrai.d $vr4, $vr4, 32 - vshuf4i.w $vr2, $vr2, 16 - vslli.d $vr2, $vr2, 32 - vsrai.d $vr2, $vr2, 32 - vshuf4i.w $vr5, $vr3, 50 - vslli.d $vr5, $vr5, 32 - vsrai.d $vr5, $vr5, 32 - vshuf4i.w $vr3, $vr3, 16 - vslli.d $vr3, $vr3, 32 - vsrai.d $vr3, $vr3, 32 + vshuf4i.w $vr4, $vr2, 14 + vsllwil.d.w $vr4, $vr4, 0 + vsllwil.d.w $vr2, $vr2, 0 + vshuf4i.w $vr5, $vr3, 14 + vsllwil.d.w $vr5, $vr5, 0 + vsllwil.d.w $vr3, $vr3, 0 vpickve2gr.d $a5, $vr2, 0 slli.d $a5, $a5, 2 vpickve2gr.d $a6, $vr2, 1 @@ -66142,12 +66131,9 @@ floor1_look: # @floor1_look .LBB259_29: # %vector.body191 # =>This Inner Loop Header: Depth=1 vld $vr0, $a1, 260 - vshuf4i.w $vr1, $vr0, 50 - vslli.d $vr1, $vr1, 32 - vsrai.d $vr1, $vr1, 32 - vshuf4i.w $vr0, $vr0, 16 - vslli.d $vr0, $vr0, 32 - vsrai.d $vr0, $vr0, 32 + vshuf4i.w $vr1, $vr0, 14 + vsllwil.d.w $vr1, $vr1, 0 + vsllwil.d.w $vr0, $vr0, 0 vpickve2gr.d $a3, $vr0, 0 slli.d $a3, $a3, 2 vpickve2gr.d $a4, $vr0, 1 @@ -71526,12 +71512,12 @@ mapping0_forward: # @mapping0_forward # Child Loop BB282_24 Depth 2 ld.d $a1, $fp, -144 # 8-byte Folded Reload ld.d $a0, $a1, 0 - slli.d $s7, 
$s6, 3 - ldx.d $s1, $a0, $s7 - ld.d $s0, $a1, 120 + slli.d $s0, $s6, 3 + ldx.d $s1, $a0, $s0 + ld.d $s7, $a1, 120 ld.d $a0, $a1, 128 ld.d $s3, $a1, 112 - add.d $a1, $s0, $a2 + add.d $a1, $s7, $a2 bge $a0, $a1, .LBB282_21 # %bb.18: # in Loop: Header=BB282_17 Depth=1 ld.d $s4, $fp, -144 # 8-byte Folded Reload @@ -71542,7 +71528,7 @@ mapping0_forward: # @mapping0_forward jirl $ra, $ra, 0 ld.d $a1, $s4, 136 ld.d $a2, $s4, 144 - add.d $a1, $a1, $s0 + add.d $a1, $a1, $s7 st.d $a1, $s4, 136 st.d $a2, $a0, 8 move $a2, $s5 @@ -71555,7 +71541,7 @@ mapping0_forward: # @mapping0_forward jirl $ra, $ra, 0 move $a2, $s5 move $s3, $a0 - move $s0, $zero + move $s7, $zero st.d $a0, $s4, 112 b .LBB282_22 .p2align 4, , 16 @@ -71563,11 +71549,11 @@ mapping0_forward: # @mapping0_forward ld.d $s4, $fp, -144 # 8-byte Folded Reload .LBB282_22: # %_vorbis_block_alloc.exit375 # in Loop: Header=BB282_17 Depth=1 - add.d $a0, $s3, $s0 - add.d $a1, $s0, $a2 + add.d $a0, $s3, $s7 + add.d $a1, $s7, $a2 st.d $a1, $s4, 120 - ld.d $s0, $fp, -216 # 8-byte Folded Reload - stx.d $a0, $s0, $s7 + ld.d $s3, $fp, -216 # 8-byte Folded Reload + stx.d $a0, $s3, $s0 ld.w $a3, $s4, 48 ld.w $a4, $s4, 56 ld.w $a5, $s4, 64 @@ -71581,7 +71567,7 @@ mapping0_forward: # @mapping0_forward ld.d $a1, $fp, -176 # 8-byte Folded Reload ldx.d $a0, $a1, $a0 ld.d $a0, $a0, 0 - ldx.d $a2, $s0, $s7 + ldx.d $a2, $s3, $s0 move $a1, $s1 pcaddu18i $ra, %call36(mdct_forward) jirl $ra, $ra, 0 @@ -73107,12 +73093,9 @@ mapping0_forward: # @mapping0_forward # Parent Loop BB282_145 Depth=2 # => This Inner Loop Header: Depth=3 vld $vr0, $s3, 0 - vshuf4i.w $vr1, $vr0, 50 - vslli.d $vr1, $vr1, 32 - vsrai.d $vr1, $vr1, 32 - vshuf4i.w $vr0, $vr0, 16 - vslli.d $vr0, $vr0, 32 - vsrai.d $vr0, $vr0, 32 + vshuf4i.w $vr1, $vr0, 14 + vsllwil.d.w $vr1, $vr1, 0 + vsllwil.d.w $vr0, $vr0, 0 vpickve2gr.d $a7, $vr0, 0 slli.d $a7, $a7, 2 vpickve2gr.d $t0, $vr0, 1 @@ -76571,12 +76554,9 @@ dradf4: # @dradf4 vfadd.s $vr6, $vr3, $vr4 vfadd.s $vr7, $vr5, $vr6 vslli.w $vr8, $vr0, 2 - vshuf4i.w $vr9, $vr8, 50 - vslli.d $vr9, $vr9, 32 - vsrai.d $vr9, $vr9, 32 - vshuf4i.w $vr10, $vr8, 16 - vslli.d $vr10, $vr10, 32 - vsrai.d $vr10, $vr10, 32 + vshuf4i.w $vr9, $vr8, 14 + vsllwil.d.w $vr9, $vr9, 0 + vsllwil.d.w $vr10, $vr8, 0 vpickve2gr.d $s2, $vr10, 0 alsl.d $s2, $s2, $a3, 2 vpickve2gr.d $s3, $vr10, 1 @@ -76591,12 +76571,9 @@ dradf4: # @dradf4 vstelm.w $vr7, $s5, 0, 3 vfsub.s $vr5, $vr6, $vr5 vaddi.wu $vr6, $vr8, 4 - vshuf4i.w $vr7, $vr6, 50 - vslli.d $vr7, $vr7, 32 - vsrai.d $vr7, $vr7, 32 - vshuf4i.w $vr6, $vr6, 16 - vslli.d $vr6, $vr6, 32 - vsrai.d $vr6, $vr6, 32 + vshuf4i.w $vr7, $vr6, 14 + vsllwil.d.w $vr7, $vr7, 0 + vsllwil.d.w $vr6, $vr6, 0 vpickve2gr.d $s2, $vr6, 0 alsl.d $s2, $s2, $a3, 2 vpickve2gr.d $s3, $vr6, 1 @@ -76611,12 +76588,9 @@ dradf4: # @dradf4 vstelm.w $vr5, $s5, -4, 3 vfsub.s $vr3, $vr3, $vr4 vbitseti.w $vr4, $vr8, 1 - vshuf4i.w $vr5, $vr4, 50 - vslli.d $vr5, $vr5, 32 - vsrai.d $vr5, $vr5, 32 - vshuf4i.w $vr4, $vr4, 16 - vslli.d $vr4, $vr4, 32 - vsrai.d $vr4, $vr4, 32 + vshuf4i.w $vr5, $vr4, 14 + vsllwil.d.w $vr5, $vr5, 0 + vsllwil.d.w $vr4, $vr4, 0 vpickve2gr.d $s2, $vr4, 0 alsl.d $s2, $s2, $a3, 2 vpickve2gr.d $s3, $vr4, 1 @@ -76732,12 +76706,9 @@ dradf2: # @dradf2 vldx $vr3, $t4, $t2 vfadd.s $vr4, $vr2, $vr3 vslli.w $vr5, $vr1, 1 - vshuf4i.w $vr6, $vr5, 50 - vslli.d $vr6, $vr6, 32 - vsrai.d $vr6, $vr6, 32 - vshuf4i.w $vr7, $vr5, 16 - vslli.d $vr7, $vr7, 32 - vsrai.d $vr7, $vr7, 32 + vshuf4i.w $vr6, $vr5, 14 + vsllwil.d.w $vr6, $vr6, 0 + vsllwil.d.w $vr7, $vr5, 0 
vpickve2gr.d $t6, $vr7, 0 alsl.d $t6, $t6, $a3, 2 vpickve2gr.d $t7, $vr7, 1 @@ -76752,12 +76723,9 @@ dradf2: # @dradf2 vstelm.w $vr4, $fp, 0, 3 vfsub.s $vr2, $vr2, $vr3 vadd.w $vr3, $vr5, $vr0 - vshuf4i.w $vr4, $vr3, 50 - vslli.d $vr4, $vr4, 32 - vsrai.d $vr4, $vr4, 32 - vshuf4i.w $vr3, $vr3, 16 - vslli.d $vr3, $vr3, 32 - vsrai.d $vr3, $vr3, 32 + vshuf4i.w $vr4, $vr3, 14 + vsllwil.d.w $vr4, $vr4, 0 + vsllwil.d.w $vr3, $vr3, 0 vpickve2gr.d $t6, $vr3, 0 alsl.d $t6, $t6, $a3, 2 vpickve2gr.d $t7, $vr3, 1 @@ -79108,12 +79076,9 @@ dradb2: # @dradb2 .p2align 4, , 16 .LBB335_11: # %vector.body # =>This Inner Loop Header: Depth=1 - vshuf4i.w $vr1, $vr0, 50 - vslli.d $vr1, $vr1, 32 - vsrai.d $vr1, $vr1, 32 - vshuf4i.w $vr2, $vr0, 16 - vslli.d $vr2, $vr2, 32 - vsrai.d $vr2, $vr2, 32 + vshuf4i.w $vr1, $vr0, 14 + vsllwil.d.w $vr1, $vr1, 0 + vsllwil.d.w $vr2, $vr0, 0 vpickve2gr.d $t6, $vr2, 0 slli.d $t6, $t6, 2 vpickve2gr.d $t7, $vr2, 1 @@ -79127,12 +79092,9 @@ dradb2: # @dradb2 fldx.s $fa3, $a2, $t8 fldx.s $fa4, $a2, $fp vbitseti.w $vr5, $vr0, 0 - vshuf4i.w $vr6, $vr5, 50 - vslli.d $vr6, $vr6, 32 - vsrai.d $vr6, $vr6, 32 - vshuf4i.w $vr5, $vr5, 16 - vslli.d $vr5, $vr5, 32 - vsrai.d $vr5, $vr5, 32 + vshuf4i.w $vr6, $vr5, 14 + vsllwil.d.w $vr6, $vr6, 0 + vsllwil.d.w $vr5, $vr5, 0 vpickve2gr.d $t6, $vr5, 0 slli.d $t6, $t6, 2 vpickve2gr.d $t7, $vr5, 1 diff --git a/results/MultiSource/Applications/sgefa/CMakeFiles/sgefa.dir/driver.s b/results/MultiSource/Applications/sgefa/CMakeFiles/sgefa.dir/driver.s index 6c11b0d4..ecb66f6b 100644 --- a/results/MultiSource/Applications/sgefa/CMakeFiles/sgefa.dir/driver.s +++ b/results/MultiSource/Applications/sgefa/CMakeFiles/sgefa.dir/driver.s @@ -446,8 +446,9 @@ matgen: # @matgen .LBB1_22: # %vector.body748 # Parent Loop BB1_19 Depth=1 # => This Inner Loop Header: Depth=2 - vilvl.w $vr6, $vr1, $vr2 - vilvh.w $vr7, $vr1, $vr2 + vsllwil.du.wu $vr6, $vr2, 0 + vshuf4i.w $vr7, $vr2, 14 + vsllwil.du.wu $vr7, $vr7, 0 vslt.d $vr8, $vr7, $vr3 vslt.d $vr9, $vr6, $vr3 vslt.du $vr7, $vr4, $vr7 @@ -654,8 +655,9 @@ matgen: # @matgen # Parent Loop BB1_41 Depth=1 # => This Inner Loop Header: Depth=2 vst $vr5, $t1, 0 - vilvh.w $vr11, $vr5, $vr7 - vilvl.w $vr12, $vr5, $vr7 + vshuf4i.w $vr11, $vr7, 14 + vsllwil.du.wu $vr11, $vr11, 0 + vsllwil.du.wu $vr12, $vr7, 0 vseq.d $vr13, $vr8, $vr12 vseq.d $vr14, $vr8, $vr11 vpickev.w $vr15, $vr14, $vr13 @@ -942,8 +944,9 @@ matgen: # @matgen .LBB1_86: # %vector.body652 # Parent Loop BB1_83 Depth=1 # => This Inner Loop Header: Depth=2 - vilvh.w $vr6, $vr2, $vr5 - vilvl.w $vr7, $vr2, $vr5 + vshuf4i.w $vr6, $vr5, 14 + vsllwil.du.wu $vr6, $vr6, 0 + vsllwil.du.wu $vr7, $vr5, 0 vslt.du $vr7, $vr7, $vr3 vslt.du $vr6, $vr6, $vr3 vpickev.w $vr6, $vr6, $vr7 @@ -1072,14 +1075,13 @@ matgen: # @matgen bstrpick.d $a4, $a2, 30, 2 slli.d $a3, $a4, 2 slli.d $a4, $a4, 4 - vreplvei.d $vr1, $vr0, 0 pcalau12i $a5, %pc_hi20(.LCPI1_1) - fld.d $fa2, $a5, %pc_lo12(.LCPI1_1) + fld.d $fa1, $a5, %pc_lo12(.LCPI1_1) pcalau12i $a5, %pc_hi20(.LCPI1_0) - vld $vr3, $a5, %pc_lo12(.LCPI1_0) + vld $vr2, $a5, %pc_lo12(.LCPI1_0) + vreplvei.d $vr3, $vr0, 0 ori $a5, $zero, 4 - vrepli.b $vr4, 0 - vreplgr2vr.d $vr5, $s5 + vreplgr2vr.d $vr4, $s5 b .LBB1_102 .LBB1_101: # %._crit_edge469 # in Loop: Header=BB1_102 Depth=1 @@ -1095,14 +1097,14 @@ matgen: # @matgen srli.d $a7, $a6, 1 andi $t1, $a6, 1 or $a7, $t1, $a7 - movgr2fr.d $fa6, $a7 - ffint.s.l $fa6, $fa6 - fadd.s $fa6, $fa6, $fa6 + movgr2fr.d $fa5, $a7 + ffint.s.l $fa5, $fa5 + fadd.s $fa5, $fa5, $fa5 slti $a7, $a6, 0 - movgr2fr.d $fa7, 
$a6 - ffint.s.l $fa7, $fa7 + movgr2fr.d $fa6, $a6 + ffint.s.l $fa6, $fa6 movgr2cf $fcc0, $a7 - fsel $fa6, $fa7, $fa6, $fcc0 + fsel $fa5, $fa6, $fa5, $fcc0 bge $s3, $a5, .LBB1_104 # %bb.103: # in Loop: Header=BB1_102 Depth=1 move $t1, $zero @@ -1111,51 +1113,52 @@ matgen: # @matgen .LBB1_104: # %vector.ph621 # in Loop: Header=BB1_102 Depth=1 add.d $a7, $t0, $a4 - vreplgr2vr.d $vr7, $a0 - vreplgr2vr.w $vr8, $a6 - vreplvei.w $vr9, $vr6, 0 + vreplgr2vr.d $vr6, $a0 + vreplgr2vr.w $vr7, $a6 + vreplvei.w $vr8, $vr5, 0 move $t1, $a3 - vori.b $vr10, $vr3, 0 + vori.b $vr9, $vr2, 0 .p2align 4, , 16 .LBB1_105: # %vector.body632 # Parent Loop BB1_102 Depth=1 # => This Inner Loop Header: Depth=2 - vilvh.w $vr11, $vr4, $vr10 - vilvl.w $vr12, $vr4, $vr10 - vslt.du $vr12, $vr7, $vr12 - vslt.du $vr11, $vr7, $vr11 - vpickev.w $vr11, $vr11, $vr12 - vaddi.wu $vr12, $vr10, 1 - vbitsel.v $vr11, $vr8, $vr12, $vr11 - vffint.s.wu $vr11, $vr11 - vfdiv.s $vr11, $vr9, $vr11 - vreplvei.w $vr12, $vr11, 3 - fcvt.d.s $ft4, $ft4 - vreplvei.w $vr13, $vr11, 2 - fcvt.d.s $ft5, $ft5 - vextrins.d $vr13, $vr12, 16 - vreplvei.w $vr12, $vr11, 1 + vshuf4i.w $vr10, $vr9, 14 + vsllwil.du.wu $vr10, $vr10, 0 + vsllwil.du.wu $vr11, $vr9, 0 + vslt.du $vr11, $vr6, $vr11 + vslt.du $vr10, $vr6, $vr10 + vpickev.w $vr10, $vr10, $vr11 + vaddi.wu $vr11, $vr9, 1 + vbitsel.v $vr10, $vr7, $vr11, $vr10 + vffint.s.wu $vr10, $vr10 + vfdiv.s $vr10, $vr8, $vr10 + vreplvei.w $vr11, $vr10, 3 + fcvt.d.s $ft3, $ft3 + vreplvei.w $vr12, $vr10, 2 fcvt.d.s $ft4, $ft4 - vreplvei.w $vr11, $vr11, 0 + vextrins.d $vr12, $vr11, 16 + vreplvei.w $vr11, $vr10, 1 fcvt.d.s $ft3, $ft3 - vextrins.d $vr11, $vr12, 16 - vfmul.d $vr11, $vr11, $vr5 - vfmul.d $vr12, $vr13, $vr5 - vfdiv.d $vr12, $vr12, $vr1 - vfdiv.d $vr11, $vr11, $vr1 - vreplvei.d $vr13, $vr11, 1 - fcvt.s.d $ft5, $ft5 - vreplvei.d $vr11, $vr11, 0 - fcvt.s.d $ft3, $ft3 - vextrins.w $vr11, $vr13, 16 - vreplvei.d $vr13, $vr12, 0 - fcvt.s.d $ft5, $ft5 - vextrins.w $vr11, $vr13, 32 - vreplvei.d $vr12, $vr12, 1 + vreplvei.w $vr10, $vr10, 0 + fcvt.d.s $ft2, $ft2 + vextrins.d $vr10, $vr11, 16 + vfmul.d $vr10, $vr10, $vr4 + vfmul.d $vr11, $vr12, $vr4 + vfdiv.d $vr11, $vr11, $vr3 + vfdiv.d $vr10, $vr10, $vr3 + vreplvei.d $vr12, $vr10, 1 fcvt.s.d $ft4, $ft4 - vextrins.w $vr11, $vr12, 48 - vst $vr11, $t0, 0 - vaddi.wu $vr10, $vr10, 4 + vreplvei.d $vr10, $vr10, 0 + fcvt.s.d $ft2, $ft2 + vextrins.w $vr10, $vr12, 16 + vreplvei.d $vr12, $vr11, 0 + fcvt.s.d $ft4, $ft4 + vextrins.w $vr10, $vr12, 32 + vreplvei.d $vr11, $vr11, 1 + fcvt.s.d $ft3, $ft3 + vextrins.w $vr10, $vr11, 48 + vst $vr10, $t0, 0 + vaddi.wu $vr9, $vr9, 4 addi.d $t1, $t1, -4 addi.d $t0, $t0, 16 bnez $t1, .LBB1_105 @@ -1176,14 +1179,14 @@ matgen: # @matgen maskeqz $t2, $t1, $t2 or $t2, $t2, $t3 bstrpick.d $t2, $t2, 31, 0 - movgr2fr.d $fa7, $t2 - ffint.s.l $fa7, $fa7 - fdiv.s $fa7, $fa6, $fa7 - fcvt.d.s $fa7, $fa7 - fmul.d $fa7, $fa7, $fa2 - fdiv.d $fa7, $fa7, $fa0 - fcvt.s.d $fa7, $fa7 - fst.s $fa7, $a7, 0 + movgr2fr.d $fa6, $t2 + ffint.s.l $fa6, $fa6 + fdiv.s $fa6, $fa5, $fa6 + fcvt.d.s $fa6, $fa6 + fmul.d $fa6, $fa6, $fa1 + fdiv.d $fa6, $fa6, $fa0 + fcvt.s.d $fa6, $fa6 + fst.s $fa6, $a7, 0 addi.d $a7, $a7, 4 addi.d $t0, $t0, 1 bne $a2, $t1, .LBB1_108 @@ -1423,7 +1426,6 @@ matgen: # @matgen lu52i.d $a5, $a5, 1149 vreplgr2vr.d $vr4, $a5 ori $a5, $zero, 4 - vrepli.b $vr5, 0 b .LBB1_139 .LBB1_138: # %._crit_edge461 # in Loop: Header=BB1_139 Depth=1 @@ -1439,14 +1441,14 @@ matgen: # @matgen srli.d $a7, $a6, 1 andi $t1, $a6, 1 or $a7, $t1, $a7 - movgr2fr.d $fa6, $a7 - 
ffint.s.l $fa6, $fa6 - fadd.s $fa6, $fa6, $fa6 + movgr2fr.d $fa5, $a7 + ffint.s.l $fa5, $fa5 + fadd.s $fa5, $fa5, $fa5 slti $a7, $a6, 0 - movgr2fr.d $fa7, $a6 - ffint.s.l $fa7, $fa7 + movgr2fr.d $fa6, $a6 + ffint.s.l $fa6, $fa6 movgr2cf $fcc0, $a7 - fsel $fa6, $fa7, $fa6, $fcc0 + fsel $fa5, $fa6, $fa5, $fcc0 bge $s3, $a5, .LBB1_141 # %bb.140: # in Loop: Header=BB1_139 Depth=1 move $t1, $zero @@ -1455,50 +1457,51 @@ matgen: # @matgen .LBB1_141: # %vector.ph # in Loop: Header=BB1_139 Depth=1 add.d $a7, $t0, $a4 - vreplgr2vr.d $vr7, $a0 - vreplgr2vr.w $vr8, $a6 - vreplvei.w $vr9, $vr6, 0 + vreplgr2vr.d $vr6, $a0 + vreplgr2vr.w $vr7, $a6 + vreplvei.w $vr8, $vr5, 0 move $t1, $a3 - vori.b $vr10, $vr3, 0 + vori.b $vr9, $vr3, 0 .p2align 4, , 16 .LBB1_142: # %vector.body # Parent Loop BB1_139 Depth=1 # => This Inner Loop Header: Depth=2 - vilvh.w $vr11, $vr5, $vr10 - vilvl.w $vr12, $vr5, $vr10 - vslt.du $vr12, $vr7, $vr12 - vslt.du $vr11, $vr7, $vr11 - vpickev.w $vr11, $vr11, $vr12 - vaddi.wu $vr12, $vr10, 1 - vbitsel.v $vr11, $vr8, $vr12, $vr11 - vffint.s.wu $vr11, $vr11 - vfdiv.s $vr11, $vr11, $vr9 - vfmul.s $vr11, $vr11, $vr1 - vreplvei.w $vr12, $vr11, 1 - fcvt.d.s $ft4, $ft4 - vreplvei.w $vr13, $vr11, 0 - fcvt.d.s $ft5, $ft5 - vextrins.d $vr13, $vr12, 16 - vreplvei.w $vr12, $vr11, 3 + vshuf4i.w $vr10, $vr9, 14 + vsllwil.du.wu $vr10, $vr10, 0 + vsllwil.du.wu $vr11, $vr9, 0 + vslt.du $vr11, $vr6, $vr11 + vslt.du $vr10, $vr6, $vr10 + vpickev.w $vr10, $vr10, $vr11 + vaddi.wu $vr11, $vr9, 1 + vbitsel.v $vr10, $vr7, $vr11, $vr10 + vffint.s.wu $vr10, $vr10 + vfdiv.s $vr10, $vr10, $vr8 + vfmul.s $vr10, $vr10, $vr1 + vreplvei.w $vr11, $vr10, 1 + fcvt.d.s $ft3, $ft3 + vreplvei.w $vr12, $vr10, 0 fcvt.d.s $ft4, $ft4 - vreplvei.w $vr11, $vr11, 2 + vextrins.d $vr12, $vr11, 16 + vreplvei.w $vr11, $vr10, 3 fcvt.d.s $ft3, $ft3 - vextrins.d $vr11, $vr12, 16 - vfdiv.d $vr11, $vr11, $vr4 - vfdiv.d $vr12, $vr13, $vr4 - vreplvei.d $vr13, $vr12, 1 - fcvt.s.d $ft5, $ft5 - vreplvei.d $vr12, $vr12, 0 + vreplvei.w $vr10, $vr10, 2 + fcvt.d.s $ft2, $ft2 + vextrins.d $vr10, $vr11, 16 + vfdiv.d $vr10, $vr10, $vr4 + vfdiv.d $vr11, $vr12, $vr4 + vreplvei.d $vr12, $vr11, 1 fcvt.s.d $ft4, $ft4 - vextrins.w $vr12, $vr13, 16 - vreplvei.d $vr13, $vr11, 0 - fcvt.s.d $ft5, $ft5 - vextrins.w $vr12, $vr13, 32 - vreplvei.d $vr11, $vr11, 1 + vreplvei.d $vr11, $vr11, 0 fcvt.s.d $ft3, $ft3 - vextrins.w $vr12, $vr11, 48 - vst $vr12, $t0, 0 - vaddi.wu $vr10, $vr10, 4 + vextrins.w $vr11, $vr12, 16 + vreplvei.d $vr12, $vr10, 0 + fcvt.s.d $ft4, $ft4 + vextrins.w $vr11, $vr12, 32 + vreplvei.d $vr10, $vr10, 1 + fcvt.s.d $ft2, $ft2 + vextrins.w $vr11, $vr10, 48 + vst $vr11, $t0, 0 + vaddi.wu $vr9, $vr9, 4 addi.d $t1, $t1, -4 addi.d $t0, $t0, 16 bnez $t1, .LBB1_142 @@ -1519,14 +1522,14 @@ matgen: # @matgen maskeqz $t2, $t1, $t2 or $t2, $t2, $t3 bstrpick.d $t2, $t2, 31, 0 - movgr2fr.d $fa7, $t2 - ffint.s.l $fa7, $fa7 - fdiv.s $fa7, $fa7, $fa6 - fmul.s $fa7, $fa7, $fa0 - fcvt.d.s $fa7, $fa7 - fdiv.d $fa7, $fa7, $fa2 - fcvt.s.d $fa7, $fa7 - fst.s $fa7, $a7, 0 + movgr2fr.d $fa6, $t2 + ffint.s.l $fa6, $fa6 + fdiv.s $fa6, $fa6, $fa5 + fmul.s $fa6, $fa6, $fa0 + fcvt.d.s $fa6, $fa6 + fdiv.d $fa6, $fa6, $fa2 + fcvt.s.d $fa6, $fa6 + fst.s $fa6, $a7, 0 addi.d $a7, $a7, 4 addi.d $t0, $t0, 1 bne $a2, $t1, .LBB1_145 @@ -1645,8 +1648,9 @@ matgen: # @matgen .LBB1_161: # %vector.body672 # Parent Loop BB1_158 Depth=1 # => This Inner Loop Header: Depth=2 - vilvh.w $vr6, $vr2, $vr5 - vilvl.w $vr7, $vr2, $vr5 + vshuf4i.w $vr6, $vr5, 14 + vsllwil.du.wu $vr6, $vr6, 0 + 
vsllwil.du.wu $vr7, $vr5, 0 vslt.du $vr7, $vr3, $vr7 vslt.du $vr6, $vr3, $vr6 vpickev.w $vr6, $vr6, $vr7 diff --git a/results/MultiSource/Applications/siod/CMakeFiles/siod.dir/slibu.s b/results/MultiSource/Applications/siod/CMakeFiles/siod.dir/slibu.s index d811c936..fc8919d0 100644 --- a/results/MultiSource/Applications/siod/CMakeFiles/siod.dir/slibu.s +++ b/results/MultiSource/Applications/siod/CMakeFiles/siod.dir/slibu.s @@ -4287,43 +4287,31 @@ html_encode: # @html_encode vinsgr2vr.h $vr9, $a3, 0 vinsgr2vr.h $vr10, $a4, 0 vseq.b $vr11, $vr9, $vr1 - vilvl.b $vr11, $vr11, $vr11 - vilvl.h $vr11, $vr11, $vr11 - vilvl.w $vr11, $vr11, $vr11 - vslli.d $vr11, $vr11, 56 - vsrai.d $vr11, $vr11, 56 + vsllwil.h.b $vr11, $vr11, 0 + vsllwil.w.h $vr11, $vr11, 0 + vsllwil.d.w $vr11, $vr11, 0 vseq.b $vr12, $vr10, $vr1 - vilvl.b $vr12, $vr12, $vr12 - vilvl.h $vr12, $vr12, $vr12 - vilvl.w $vr12, $vr12, $vr12 - vslli.d $vr12, $vr12, 56 - vsrai.d $vr12, $vr12, 56 + vsllwil.h.b $vr12, $vr12, 0 + vsllwil.w.h $vr12, $vr12, 0 + vsllwil.d.w $vr12, $vr12, 0 vseq.b $vr13, $vr9, $vr2 - vilvl.b $vr13, $vr13, $vr13 - vilvl.h $vr13, $vr13, $vr13 - vilvl.w $vr13, $vr13, $vr13 - vslli.d $vr13, $vr13, 56 - vsrai.d $vr13, $vr13, 56 + vsllwil.h.b $vr13, $vr13, 0 + vsllwil.w.h $vr13, $vr13, 0 + vsllwil.d.w $vr13, $vr13, 0 vseq.b $vr14, $vr10, $vr2 - vilvl.b $vr14, $vr14, $vr14 - vilvl.h $vr14, $vr14, $vr14 - vilvl.w $vr14, $vr14, $vr14 - vslli.d $vr14, $vr14, 56 - vsrai.d $vr14, $vr14, 56 + vsllwil.h.b $vr14, $vr14, 0 + vsllwil.w.h $vr14, $vr14, 0 + vsllwil.d.w $vr14, $vr14, 0 vbitclri.b $vr9, $vr9, 1 vseq.b $vr9, $vr9, $vr3 - vilvl.b $vr9, $vr9, $vr9 - vilvl.h $vr9, $vr9, $vr9 - vilvl.w $vr9, $vr9, $vr9 - vslli.d $vr9, $vr9, 56 - vsrai.d $vr9, $vr9, 56 + vsllwil.h.b $vr9, $vr9, 0 + vsllwil.w.h $vr9, $vr9, 0 + vsllwil.d.w $vr9, $vr9, 0 vbitclri.b $vr10, $vr10, 1 vseq.b $vr10, $vr10, $vr3 - vilvl.b $vr10, $vr10, $vr10 - vilvl.h $vr10, $vr10, $vr10 - vilvl.w $vr10, $vr10, $vr10 - vslli.d $vr10, $vr10, 56 - vsrai.d $vr10, $vr10, 56 + vsllwil.h.b $vr10, $vr10, 0 + vsllwil.w.h $vr10, $vr10, 0 + vsllwil.d.w $vr10, $vr10, 0 vbitsel.v $vr9, $vr5, $vr4, $vr9 vbitsel.v $vr9, $vr9, $vr6, $vr11 vbitsel.v $vr9, $vr9, $vr7, $vr13 diff --git a/results/MultiSource/Applications/spiff/CMakeFiles/spiff.dir/float.s b/results/MultiSource/Applications/spiff/CMakeFiles/spiff.dir/float.s index c6ecf392..43dc23bf 100644 --- a/results/MultiSource/Applications/spiff/CMakeFiles/spiff.dir/float.s +++ b/results/MultiSource/Applications/spiff/CMakeFiles/spiff.dir/float.s @@ -738,27 +738,7 @@ F_floatcmp: # @F_floatcmp .Lfunc_end3: .size F_floatcmp, .Lfunc_end3-F_floatcmp # -- End function - .section .rodata.cst16,"aM",@progbits,16 - .p2align 4, 0x0 # -- Begin function F_floatmul -.LCPI4_0: - .byte 3 # 0x3 - .byte 255 # 0xff - .byte 255 # 0xff - .byte 255 # 0xff - .byte 2 # 0x2 - .byte 255 # 0xff - .byte 255 # 0xff - .byte 255 # 0xff - .byte 1 # 0x1 - .byte 255 # 0xff - .byte 255 # 0xff - .byte 255 # 0xff - .byte 0 # 0x0 - .byte 255 # 0xff - .byte 255 # 0xff - .byte 255 # 0xff - .text - .globl F_floatmul + .globl F_floatmul # -- Begin function F_floatmul .p2align 5 .type F_floatmul,@function F_floatmul: # @F_floatmul @@ -819,68 +799,66 @@ F_floatmul: # @F_floatmul jirl $ra, $ra, 0 ld.d $a1, $s0, 8 pcalau12i $a0, %pc_hi20(F_floatmul.man2) - addi.d $s2, $a0, %pc_lo12(F_floatmul.man2) - move $a0, $s2 + addi.d $s3, $a0, %pc_lo12(F_floatmul.man2) + move $a0, $s3 pcaddu18i $ra, %call36(strcpy) jirl $ra, $ra, 0 move $a0, $s1 pcaddu18i $ra, %call36(strlen) jirl 
$ra, $ra, 0 - move $s3, $a0 + move $s2, $a0 add.d $fp, $s1, $a0 - move $a0, $s2 + move $a0, $s3 pcaddu18i $ra, %call36(strlen) jirl $ra, $ra, 0 addi.d $a3, $fp, -1 - add.d $a2, $s2, $a0 + add.d $a2, $s3, $a0 bgeu $a3, $s1, .LBB4_10 # %bb.9: move $a1, $zero b .LBB4_16 .LBB4_10: # %.lr.ph.preheader ori $a1, $zero, 8 - bgeu $s3, $a1, .LBB4_12 + bgeu $s2, $a1, .LBB4_12 # %bb.11: move $a1, $zero b .LBB4_15 .LBB4_12: # %vector.ph - add.d $a1, $s3, $s1 - move $a4, $s3 + add.d $a1, $s2, $s1 + move $a4, $s2 bstrins.d $a4, $zero, 2, 0 sub.d $a3, $a3, $a4 - pcalau12i $a5, %pc_hi20(.LCPI4_0) - vld $vr0, $a5, %pc_lo12(.LCPI4_0) addi.d $a1, $a1, -4 - vrepli.b $vr1, 0 - vrepli.w $vr2, -48 + vrepli.b $vr0, 0 + vrepli.w $vr1, -48 move $a5, $a4 - vori.b $vr3, $vr1, 0 + vori.b $vr2, $vr0, 0 .p2align 4, , 16 .LBB4_13: # %vector.body # =>This Inner Loop Header: Depth=1 ld.w $a6, $a1, 0 ld.w $a7, $a1, -4 - vinsgr2vr.w $vr4, $a6, 0 - vinsgr2vr.w $vr5, $a7, 0 - vshuf.b $vr4, $vr0, $vr4, $vr0 - vslli.w $vr4, $vr4, 24 - vsrai.w $vr4, $vr4, 24 - vshuf.b $vr5, $vr0, $vr5, $vr0 - vslli.w $vr5, $vr5, 24 - vsrai.w $vr5, $vr5, 24 - vadd.w $vr1, $vr1, $vr4 - vadd.w $vr3, $vr3, $vr5 - vadd.w $vr1, $vr1, $vr2 - vadd.w $vr3, $vr3, $vr2 + vinsgr2vr.w $vr3, $a6, 0 + vshuf4i.b $vr3, $vr3, 27 + vinsgr2vr.w $vr4, $a7, 0 + vshuf4i.b $vr4, $vr4, 27 + vsllwil.h.b $vr3, $vr3, 0 + vsllwil.w.h $vr3, $vr3, 0 + vsllwil.h.b $vr4, $vr4, 0 + vsllwil.w.h $vr4, $vr4, 0 + vadd.w $vr0, $vr0, $vr3 + vadd.w $vr2, $vr2, $vr4 + vadd.w $vr0, $vr0, $vr1 + vadd.w $vr2, $vr2, $vr1 addi.d $a5, $a5, -8 addi.d $a1, $a1, -8 bnez $a5, .LBB4_13 # %bb.14: # %middle.block - vadd.w $vr0, $vr3, $vr1 + vadd.w $vr0, $vr2, $vr0 vhaddw.d.w $vr0, $vr0, $vr0 vhaddw.q.d $vr0, $vr0, $vr0 vpickve2gr.d $a1, $vr0, 0 - beq $s3, $a4, .LBB4_16 + beq $s2, $a4, .LBB4_16 .p2align 4, , 16 .LBB4_15: # %.lr.ph # =>This Inner Loop Header: Depth=1 @@ -893,7 +871,7 @@ F_floatmul: # @F_floatmul addi.d $a2, $a2, -1 st.d $s0, $sp, 40 # 8-byte Folded Spill st.d $s4, $sp, 24 # 8-byte Folded Spill - bgeu $a2, $s2, .LBB4_18 + bgeu $a2, $s3, .LBB4_18 # %bb.17: move $a4, $zero b .LBB4_24 @@ -904,39 +882,37 @@ F_floatmul: # @F_floatmul move $a4, $zero b .LBB4_23 .LBB4_20: # %vector.ph91 - add.d $a4, $a0, $s2 + add.d $a4, $a0, $s3 move $a3, $a0 bstrins.d $a3, $zero, 2, 0 sub.d $a2, $a2, $a3 - pcalau12i $a5, %pc_hi20(.LCPI4_0) - vld $vr0, $a5, %pc_lo12(.LCPI4_0) addi.d $a4, $a4, -4 - vrepli.b $vr1, 0 - vrepli.w $vr2, -48 + vrepli.b $vr0, 0 + vrepli.w $vr1, -48 move $a5, $a3 - vori.b $vr3, $vr1, 0 + vori.b $vr2, $vr0, 0 .p2align 4, , 16 .LBB4_21: # %vector.body94 # =>This Inner Loop Header: Depth=1 ld.w $a6, $a4, 0 ld.w $a7, $a4, -4 - vinsgr2vr.w $vr4, $a6, 0 - vinsgr2vr.w $vr5, $a7, 0 - vshuf.b $vr4, $vr0, $vr4, $vr0 - vslli.w $vr4, $vr4, 24 - vsrai.w $vr4, $vr4, 24 - vshuf.b $vr5, $vr0, $vr5, $vr0 - vslli.w $vr5, $vr5, 24 - vsrai.w $vr5, $vr5, 24 - vadd.w $vr1, $vr1, $vr4 - vadd.w $vr3, $vr3, $vr5 - vadd.w $vr1, $vr1, $vr2 - vadd.w $vr3, $vr3, $vr2 + vinsgr2vr.w $vr3, $a6, 0 + vshuf4i.b $vr3, $vr3, 27 + vinsgr2vr.w $vr4, $a7, 0 + vshuf4i.b $vr4, $vr4, 27 + vsllwil.h.b $vr3, $vr3, 0 + vsllwil.w.h $vr3, $vr3, 0 + vsllwil.h.b $vr4, $vr4, 0 + vsllwil.w.h $vr4, $vr4, 0 + vadd.w $vr0, $vr0, $vr3 + vadd.w $vr2, $vr2, $vr4 + vadd.w $vr0, $vr0, $vr1 + vadd.w $vr2, $vr2, $vr1 addi.d $a5, $a5, -8 addi.d $a4, $a4, -8 bnez $a5, .LBB4_21 # %bb.22: # %middle.block105 - vadd.w $vr0, $vr3, $vr1 + vadd.w $vr0, $vr2, $vr0 vhaddw.d.w $vr0, $vr0, $vr0 vhaddw.q.d $vr0, $vr0, $vr0 vpickve2gr.d $a4, $vr0, 0 @@ -948,38 
+924,38 @@ F_floatmul: # @F_floatmul add.d $a0, $a4, $a0 addi.d $a2, $a2, -1 addi.d $a4, $a0, -48 - bgeu $a2, $s2, .LBB4_23 + bgeu $a2, $s3, .LBB4_23 .LBB4_24: # %._crit_edge addi.w $a0, $a1, 0 addi.w $a1, $a4, 0 slt $a0, $a1, $a0 - masknez $a1, $s2, $a0 + masknez $a1, $s3, $a0 maskeqz $a2, $s1, $a0 - or $s3, $a2, $a1 + or $s2, $a2, $a1 masknez $a1, $s1, $a0 - maskeqz $a0, $s2, $a0 + maskeqz $a0, $s3, $a0 or $fp, $a0, $a1 - move $a0, $s3 + move $a0, $s2 pcaddu18i $ra, %call36(S_trimzeros) jirl $ra, $ra, 0 move $a0, $fp pcaddu18i $ra, %call36(S_trimzeros) jirl $ra, $ra, 0 - move $a0, $s3 + move $a0, $s2 pcaddu18i $ra, %call36(strlen) jirl $ra, $ra, 0 - move $s2, $a0 + move $s3, $a0 move $a0, $fp pcaddu18i $ra, %call36(strlen) jirl $ra, $ra, 0 - add.d $a1, $a0, $s2 + add.d $a1, $a0, $s3 st.d $a1, $sp, 16 # 8-byte Folded Spill add.d $a0, $fp, $a0 pcalau12i $a1, %pc_hi20(F_floatmul.prod) - addi.d $s2, $a1, %pc_lo12(F_floatmul.prod) + addi.d $s3, $a1, %pc_lo12(F_floatmul.prod) ori $a1, $zero, 48 addi.d $s7, $a0, -1 - st.h $a1, $s2, 0 + st.h $a1, $s3, 0 st.d $fp, $sp, 48 # 8-byte Folded Spill bgeu $s7, $fp, .LBB4_27 .LBB4_25: # %._crit_edge72 @@ -1028,12 +1004,12 @@ F_floatmul: # @F_floatmul .p2align 4, , 16 .LBB4_28: # %.lr.ph.i49.preheader # in Loop: Header=BB4_29 Depth=1 - move $a0, $s3 + move $a0, $s2 pcaddu18i $ra, %call36(strlen) jirl $ra, $ra, 0 addi.d $s7, $s7, -1 ori $a1, $zero, 48 - stx.h $a1, $s3, $a0 + stx.h $a1, $s2, $a0 ld.d $a0, $sp, 48 # 8-byte Folded Reload bltu $s7, $a0, .LBB4_25 .LBB4_29: # %.preheader @@ -1064,28 +1040,28 @@ F_floatmul: # @F_floatmul # Parent Loop BB4_29 Depth=1 # => This Loop Header: Depth=2 # Child Loop BB4_36 Depth 3 - move $a0, $s2 + move $a0, $s3 pcaddu18i $ra, %call36(strlen) jirl $ra, $ra, 0 - add.d $a0, $s2, $a0 + add.d $a0, $s3, $a0 addi.d $s1, $a0, -1 - move $a0, $s3 + move $a0, $s2 pcaddu18i $ra, %call36(strlen) jirl $ra, $ra, 0 - add.d $a0, $s3, $a0 + add.d $a0, $s2, $a0 addi.d $a0, $a0, -1 st.b $zero, $s4, 199 - bgeu $s1, $s2, .LBB4_35 + bgeu $s1, $s3, .LBB4_35 # %bb.34: # %.lr.ph68 # in Loop: Header=BB4_33 Depth=2 move $a1, $s5 - bltu $a0, $s3, .LBB4_31 + bltu $a0, $s2, .LBB4_31 .LBB4_35: # %.lr.ph.i.preheader # in Loop: Header=BB4_33 Depth=2 move $a2, $zero - sltu $a1, $s1, $s2 + sltu $a1, $s1, $s3 xori $a5, $a1, 1 - sltu $a1, $a0, $s3 + sltu $a1, $a0, $s2 xori $a4, $a1, 1 move $a3, $s5 .p2align 4, , 16 @@ -1124,15 +1100,15 @@ F_floatmul: # @F_floatmul addi.d $a4, $a1, 48 addi.d $a1, $a3, -1 st.b $a4, $a3, 0 - sltu $a3, $s1, $s2 + sltu $a3, $s1, $s3 xori $a5, $a3, 1 - sltu $a3, $a0, $s3 + sltu $a3, $a0, $s2 xori $a4, $a3, 1 move $a3, $a1 - bgeu $s1, $s2, .LBB4_36 + bgeu $s1, $s3, .LBB4_36 # %bb.42: # in Loop: Header=BB4_36 Depth=3 move $a3, $a1 - bgeu $a0, $s3, .LBB4_36 + bgeu $a0, $s2, .LBB4_36 # %bb.43: # %._crit_edge.i # in Loop: Header=BB4_33 Depth=2 blt $a6, $s0, .LBB4_31 diff --git a/results/MultiSource/Applications/sqlite3/CMakeFiles/sqlite3.dir/sqlite3.s b/results/MultiSource/Applications/sqlite3/CMakeFiles/sqlite3.dir/sqlite3.s index 2207ed2c..c4c3942f 100644 --- a/results/MultiSource/Applications/sqlite3/CMakeFiles/sqlite3.dir/sqlite3.s +++ b/results/MultiSource/Applications/sqlite3/CMakeFiles/sqlite3.dir/sqlite3.s @@ -3621,8 +3621,8 @@ sqlite3VdbeMakeReady: # @sqlite3VdbeMakeReady st.b $a1, $a0, -48 .LBB22_72: # %pred.store.continue144 # in Loop: Header=BB22_70 Depth=1 - vilvl.b $vr1, $vr1, $vr1 - vilvl.h $vr1, $vr1, $vr1 + vsllwil.h.b $vr1, $vr1, 0 + vsllwil.w.h $vr1, $vr1, 0 vpickve2gr.w $a6, $vr1, 1 andi $a6, $a6, 1 bnez $a6, 
.LBB22_75 @@ -23523,14 +23523,10 @@ readMasterJournal: # @readMasterJournal ld.w $a4, $a0, 0 vinsgr2vr.w $vr2, $a3, 0 vinsgr2vr.w $vr3, $a4, 0 - vilvl.b $vr2, $vr2, $vr2 - vilvl.h $vr2, $vr2, $vr2 - vslli.w $vr2, $vr2, 24 - vsrai.w $vr2, $vr2, 24 - vilvl.b $vr3, $vr3, $vr3 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 24 - vsrai.w $vr3, $vr3, 24 + vsllwil.h.b $vr2, $vr2, 0 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.h.b $vr3, $vr3, 0 + vsllwil.w.h $vr3, $vr3, 0 vsub.w $vr1, $vr1, $vr2 vsub.w $vr0, $vr0, $vr3 addi.d $a2, $a2, -8 @@ -23850,36 +23846,35 @@ pager_playback_one_page: # @pager_playback_one_page add.d $a2, $s2, $s0 addi.d $a2, $a2, -800 move $a5, $a4 - vori.b $vr2, $vr0, 0 .LBB228_14: # %vector.body # =>This Inner Loop Header: Depth=1 ld.b $a6, $a2, 600 ld.b $a7, $a2, 400 ld.b $t0, $a2, 200 ld.b $t1, $a2, 0 - vinsgr2vr.b $vr3, $a6, 0 - vinsgr2vr.b $vr3, $a7, 1 - vinsgr2vr.b $vr3, $t0, 2 - vinsgr2vr.b $vr3, $t1, 3 + vinsgr2vr.b $vr2, $a6, 0 + vinsgr2vr.b $vr2, $a7, 1 + vinsgr2vr.b $vr2, $t0, 2 + vinsgr2vr.b $vr2, $t1, 3 ld.b $a6, $a2, -200 ld.b $a7, $a2, -400 ld.b $t0, $a2, -600 ld.b $t1, $a2, -800 - vinsgr2vr.b $vr4, $a6, 0 - vinsgr2vr.b $vr4, $a7, 1 - vinsgr2vr.b $vr4, $t0, 2 - vinsgr2vr.b $vr4, $t1, 3 - vilvl.b $vr3, $vr0, $vr3 - vilvl.h $vr3, $vr0, $vr3 - vilvl.b $vr4, $vr0, $vr4 - vilvl.h $vr4, $vr0, $vr4 - vadd.w $vr1, $vr1, $vr3 - vadd.w $vr2, $vr2, $vr4 + vinsgr2vr.b $vr3, $a6, 0 + vinsgr2vr.b $vr3, $a7, 1 + vinsgr2vr.b $vr3, $t0, 2 + vinsgr2vr.b $vr3, $t1, 3 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.hu.bu $vr3, $vr3, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vadd.w $vr1, $vr1, $vr2 + vadd.w $vr0, $vr0, $vr3 addi.d $a5, $a5, -8 addi.d $a2, $a2, -1600 bnez $a5, .LBB228_14 # %bb.15: # %middle.block - vadd.w $vr0, $vr2, $vr1 + vadd.w $vr0, $vr0, $vr1 vhaddw.d.w $vr0, $vr0, $vr0 vhaddw.q.d $vr0, $vr0, $vr0 vpickve2gr.d $a2, $vr0, 0 @@ -29832,36 +29827,35 @@ pager_write: # @pager_write add.d $a4, $a4, $s0 addi.d $a4, $a4, -800 move $a5, $a3 - vori.b $vr2, $vr0, 0 .LBB260_72: # %vector.body # =>This Inner Loop Header: Depth=1 ld.b $a6, $a4, 600 ld.b $a7, $a4, 400 ld.b $t0, $a4, 200 ld.b $t1, $a4, 0 - vinsgr2vr.b $vr3, $a6, 0 - vinsgr2vr.b $vr3, $a7, 1 - vinsgr2vr.b $vr3, $t0, 2 - vinsgr2vr.b $vr3, $t1, 3 + vinsgr2vr.b $vr2, $a6, 0 + vinsgr2vr.b $vr2, $a7, 1 + vinsgr2vr.b $vr2, $t0, 2 + vinsgr2vr.b $vr2, $t1, 3 ld.b $a6, $a4, -200 ld.b $a7, $a4, -400 ld.b $t0, $a4, -600 ld.b $t1, $a4, -800 - vinsgr2vr.b $vr4, $a6, 0 - vinsgr2vr.b $vr4, $a7, 1 - vinsgr2vr.b $vr4, $t0, 2 - vinsgr2vr.b $vr4, $t1, 3 - vilvl.b $vr3, $vr0, $vr3 - vilvl.h $vr3, $vr0, $vr3 - vilvl.b $vr4, $vr0, $vr4 - vilvl.h $vr4, $vr0, $vr4 - vadd.w $vr1, $vr1, $vr3 - vadd.w $vr2, $vr2, $vr4 + vinsgr2vr.b $vr3, $a6, 0 + vinsgr2vr.b $vr3, $a7, 1 + vinsgr2vr.b $vr3, $t0, 2 + vinsgr2vr.b $vr3, $t1, 3 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.hu.bu $vr3, $vr3, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vadd.w $vr1, $vr1, $vr2 + vadd.w $vr0, $vr0, $vr3 addi.d $a5, $a5, -8 addi.d $a4, $a4, -1600 bnez $a5, .LBB260_72 # %bb.73: # %middle.block - vadd.w $vr0, $vr2, $vr1 + vadd.w $vr0, $vr0, $vr1 vhaddw.d.w $vr0, $vr0, $vr0 vhaddw.q.d $vr0, $vr0, $vr0 vpickve2gr.d $s2, $vr0, 0 @@ -33921,14 +33915,10 @@ writeMasterJournal: # @writeMasterJournal ld.w $a6, $a3, 0 vinsgr2vr.w $vr2, $a5, 0 vinsgr2vr.w $vr3, $a6, 0 - vilvl.b $vr2, $vr2, $vr2 - vilvl.h $vr2, $vr2, $vr2 - vslli.w $vr2, $vr2, 24 - vsrai.w $vr2, $vr2, 24 - vilvl.b $vr3, $vr3, $vr3 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 24 - 
vsrai.w $vr3, $vr3, 24 + vsllwil.h.b $vr2, $vr2, 0 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.h.b $vr3, $vr3, 0 + vsllwil.w.h $vr3, $vr3, 0 vadd.w $vr0, $vr0, $vr2 vadd.w $vr1, $vr1, $vr3 addi.d $a4, $a4, -8 @@ -51393,8 +51383,8 @@ balance_nonroot: # @balance_nonroot move $s4, $a0 move $s1, $zero move $s3, $zero - bnez $s1, .LBB357_198 - b .LBB357_207 + bnez $s1, .LBB357_197 + b .LBB357_206 .LBB357_38: bne $s0, $a1, .LBB357_41 # %bb.39: @@ -51416,9 +51406,9 @@ balance_nonroot: # @balance_nonroot bnez $s2, .LBB357_36 .LBB357_40: ori $s4, $zero, 11 - b .LBB357_213 + b .LBB357_212 .LBB357_41: - st.d $s0, $sp, 112 # 8-byte Folded Spill + st.d $s0, $sp, 120 # 8-byte Folded Spill move $s1, $zero move $fp, $zero ori $s0, $zero, 1 @@ -51439,8 +51429,8 @@ balance_nonroot: # @balance_nonroot jirl $ra, $ra, 0 beqz $a0, .LBB357_82 # %bb.43: - st.d $s2, $sp, 56 # 8-byte Folded Spill - st.d $s3, $sp, 72 # 8-byte Folded Spill + st.d $s2, $sp, 64 # 8-byte Folded Spill + st.d $s3, $sp, 80 # 8-byte Folded Spill st.d $s1, $sp, 160 # 8-byte Folded Spill st.d $s7, $sp, 144 # 8-byte Folded Spill bstrpick.d $a1, $fp, 31, 0 @@ -51456,7 +51446,7 @@ balance_nonroot: # @balance_nonroot ld.bu $s2, $s8, 38 addi.d $a0, $a0, 144 st.d $a0, $sp, 208 - st.d $a2, $sp, 64 # 8-byte Folded Spill + st.d $a2, $sp, 72 # 8-byte Folded Spill add.d $s3, $a0, $a2 bnez $s0, .LBB357_46 # %bb.44: # %.lr.ph626.preheader @@ -51489,48 +51479,48 @@ balance_nonroot: # @balance_nonroot addi.d $s6, $s6, 8 bnez $s7, .LBB357_45 .LBB357_46: # %._crit_edge - ld.d $a3, $sp, 72 # 8-byte Folded Reload + ld.d $a3, $sp, 80 # 8-byte Folded Reload ld.b $a0, $a3, 4 move $fp, $zero - st.d $zero, $sp, 120 # 8-byte Folded Spill + st.d $zero, $sp, 128 # 8-byte Folded Spill ld.bu $a1, $a3, 6 addi.d $a2, $s3, 144 - st.d $a2, $sp, 88 # 8-byte Folded Spill + st.d $a2, $sp, 96 # 8-byte Folded Spill andi $a2, $a0, 255 slli.d $a0, $a2, 2 - st.d $a0, $sp, 96 # 8-byte Folded Spill + st.d $a0, $sp, 104 # 8-byte Folded Spill sltu $a1, $zero, $a1 sltu $a0, $zero, $a2 and $a0, $a1, $a0 - st.d $a0, $sp, 104 # 8-byte Folded Spill + st.d $a0, $sp, 112 # 8-byte Folded Spill st.d $s8, $sp, 176 # 8-byte Folded Spill bnez $s0, .LBB357_106 # %bb.47: # %.lr.ph641 - st.d $a2, $sp, 32 # 8-byte Folded Spill - st.d $a1, $sp, 40 # 8-byte Folded Spill + st.d $a2, $sp, 40 # 8-byte Folded Spill + st.d $a1, $sp, 48 # 8-byte Folded Spill move $s3, $zero move $fp, $zero - st.d $zero, $sp, 120 # 8-byte Folded Spill - ld.d $a0, $sp, 64 # 8-byte Folded Reload + st.d $zero, $sp, 128 # 8-byte Folded Spill + ld.d $a0, $sp, 72 # 8-byte Folded Reload alsl.d $a0, $a0, $a0, 2 sltui $a1, $s2, 1 - ld.d $a2, $sp, 88 # 8-byte Folded Reload + ld.d $a2, $sp, 96 # 8-byte Folded Reload add.d $a0, $a2, $a0 - st.d $a0, $sp, 80 # 8-byte Folded Spill + st.d $a0, $sp, 88 # 8-byte Folded Spill masknez $s0, $a0, $a1 - ld.d $a0, $sp, 104 # 8-byte Folded Reload + ld.d $a0, $sp, 112 # 8-byte Folded Reload sub.d $a0, $zero, $a0 - st.d $a0, $sp, 48 # 8-byte Folded Spill + st.d $a0, $sp, 56 # 8-byte Folded Spill ld.d $a0, $sp, 160 # 8-byte Folded Reload addi.d $a0, $a0, -1 - st.d $a0, $sp, 128 # 8-byte Folded Spill + st.d $a0, $sp, 136 # 8-byte Folded Spill ori $s2, $zero, 1 ld.d $s7, $sp, 144 # 8-byte Folded Reload ld.d $a4, $sp, 168 # 8-byte Folded Reload b .LBB357_50 .LBB357_48: # in Loop: Header=BB357_50 Depth=1 move $a0, $s7 - ld.d $a1, $sp, 112 # 8-byte Folded Reload + ld.d $a1, $sp, 120 # 8-byte Folded Reload move $a2, $s5 pcaddu18i $ra, %call36(dropCell) jirl $ra, $ra, 0 @@ -51655,7 +51645,7 @@ balance_nonroot: # 
@balance_nonroot ld.d $a2, $sp, 152 # 8-byte Folded Reload .LBB357_68: # %._crit_edge635 # in Loop: Header=BB357_50 Depth=1 - ld.d $a0, $sp, 128 # 8-byte Folded Reload + ld.d $a0, $sp, 136 # 8-byte Folded Reload bgeu $s3, $a0, .LBB357_49 # %bb.69: # in Loop: Header=BB357_50 Depth=1 addi.d $a0, $sp, 256 @@ -51666,21 +51656,21 @@ balance_nonroot: # @balance_nonroot pcaddu18i $ra, %call36(sqlite3BtreeParseCellPtr) jirl $ra, $ra, 0 ld.hu $s5, $sp, 334 - ld.d $a0, $sp, 104 # 8-byte Folded Reload + ld.d $a0, $sp, 112 # 8-byte Folded Reload bnez $a0, .LBB357_48 # %bb.70: # in Loop: Header=BB357_50 Depth=1 addi.w $s8, $fp, 0 slli.d $a0, $s8, 1 stx.h $s5, $s1, $a0 - ld.d $a0, $sp, 88 # 8-byte Folded Reload - ld.d $a1, $sp, 120 # 8-byte Folded Reload + ld.d $a0, $sp, 96 # 8-byte Folded Reload + ld.d $a1, $sp, 128 # 8-byte Folded Reload add.d $s7, $a0, $a1 move $a0, $s7 move $a1, $s6 move $a2, $s5 pcaddu18i $ra, %call36(memcpy) jirl $ra, $ra, 0 - ld.d $a0, $sp, 96 # 8-byte Folded Reload + ld.d $a0, $sp, 104 # 8-byte Folded Reload add.d $a0, $s7, $a0 slli.d $a1, $s8, 3 ld.d $a2, $sp, 168 # 8-byte Folded Reload @@ -51689,19 +51679,19 @@ balance_nonroot: # @balance_nonroot ld.bu $a0, $a0, 38 beqz $a0, .LBB357_72 # %bb.71: # in Loop: Header=BB357_50 Depth=1 - ld.d $a0, $sp, 80 # 8-byte Folded Reload + ld.d $a0, $sp, 88 # 8-byte Folded Reload ori $a1, $zero, 255 stx.b $a1, $a0, $s8 .LBB357_72: # in Loop: Header=BB357_50 Depth=1 alsl.d $s6, $s8, $s1, 1 ld.d $s7, $sp, 144 # 8-byte Folded Reload move $a0, $s7 - ld.d $a1, $sp, 112 # 8-byte Folded Reload + ld.d $a1, $sp, 120 # 8-byte Folded Reload move $a2, $s5 pcaddu18i $ra, %call36(dropCell) jirl $ra, $ra, 0 ld.h $a0, $s6, 0 - ld.d $a1, $sp, 96 # 8-byte Folded Reload + ld.d $a1, $sp, 104 # 8-byte Folded Reload sub.d $a0, $a0, $a1 st.h $a0, $s6, 0 ld.bu $a1, $s4, 4 @@ -51727,9 +51717,9 @@ balance_nonroot: # @balance_nonroot st.w $a0, $a2, 0 ld.d $s8, $sp, 176 # 8-byte Folded Reload .LBB357_76: # in Loop: Header=BB357_50 Depth=1 - ld.d $a0, $sp, 120 # 8-byte Folded Reload + ld.d $a0, $sp, 128 # 8-byte Folded Reload add.w $a0, $a0, $s5 - st.d $a0, $sp, 120 # 8-byte Folded Spill + st.d $a0, $sp, 128 # 8-byte Folded Spill addi.d $fp, $fp, 1 b .LBB357_49 .LBB357_77: @@ -51758,7 +51748,7 @@ balance_nonroot: # @balance_nonroot ld.hu $a0, $s7, 20 st.d $zero, $sp, 344 add.d $a1, $a1, $a2 - st.d $s0, $sp, 112 # 8-byte Folded Spill + st.d $s0, $sp, 120 # 8-byte Folded Spill addi.w $s0, $s0, 1 addi.d $fp, $a1, 1 bgeu $s0, $a0, .LBB357_88 @@ -51797,8 +51787,8 @@ balance_nonroot: # @balance_nonroot .LBB357_82: move $s3, $zero ori $s4, $zero, 7 - bnez $s1, .LBB357_198 - b .LBB357_207 + bnez $s1, .LBB357_197 + b .LBB357_206 .LBB357_83: # %._crit_edge642 addi.w $a0, $fp, 0 blez $a0, .LBB357_105 @@ -51807,13 +51797,13 @@ balance_nonroot: # @balance_nonroot move $a1, $zero move $t7, $zero move $a5, $zero - ld.d $a3, $sp, 96 # 8-byte Folded Reload + ld.d $a3, $sp, 104 # 8-byte Folded Reload add.d $a2, $a3, $a2 addi.d $a2, $a2, -12 addi.d $a3, $sp, 216 addi.d $a4, $sp, 236 - ld.d $t6, $sp, 72 # 8-byte Folded Reload - ld.d $t0, $sp, 48 # 8-byte Folded Reload + ld.d $t6, $sp, 80 # 8-byte Folded Reload + ld.d $t0, $sp, 56 # 8-byte Folded Reload b .LBB357_86 .p2align 4, , 16 .LBB357_85: # in Loop: Header=BB357_86 Depth=1 @@ -51835,7 +51825,7 @@ balance_nonroot: # @balance_nonroot addi.w $t7, $t7, 1 b .LBB357_85 .LBB357_88: - bne $s0, $a0, .LBB357_180 + bne $s0, $a0, .LBB357_179 # %bb.89: ld.d $a0, $s7, 112 ld.bu $a1, $s7, 8 @@ -51862,19 +51852,19 @@ balance_nonroot: # 
@balance_nonroot move $a3, $zero pcaddu18i $ra, %call36(sqlite3PagerAcquire) jirl $ra, $ra, 0 - beqz $a0, .LBB357_185 + beqz $a0, .LBB357_184 # %bb.92: ori $s1, $zero, 1 move $s4, $a0 move $s3, $zero - bnez $s1, .LBB357_198 - b .LBB357_207 + bnez $s1, .LBB357_197 + b .LBB357_206 .LBB357_93: # %getAndInitPage.exit move $a1, $s7 pcaddu18i $ra, %call36(sqlite3BtreeInitPage) jirl $ra, $ra, 0 move $s4, $a0 - bnez $a0, .LBB357_213 + bnez $a0, .LBB357_212 b .LBB357_78 .LBB357_94: # %._crit_edge650 slli.d $a2, $t7, 2 @@ -51884,11 +51874,11 @@ balance_nonroot: # @balance_nonroot stx.w $fp, $a2, $a1 blez $t7, .LBB357_102 # %bb.95: # %.lr.ph655.preheader - ld.d $a3, $sp, 32 # 8-byte Folded Reload + ld.d $a3, $sp, 40 # 8-byte Folded Reload andi $a3, $a3, 255 sltu $a3, $zero, $a3 ldx.w $a6, $a2, $a0 - ld.d $a2, $sp, 40 # 8-byte Folded Reload + ld.d $a2, $sp, 48 # 8-byte Folded Reload and $a2, $a3, $a2 sub.d $a2, $zero, $a2 ori $a3, $zero, 1 @@ -51943,7 +51933,7 @@ balance_nonroot: # @balance_nonroot ld.bu $a0, $a0, 0 st.d $a0, $sp, 152 # 8-byte Folded Spill slt $a0, $zero, $t7 - st.d $a0, $sp, 48 # 8-byte Folded Spill + st.d $a0, $sp, 56 # 8-byte Folded Spill bgez $t7, .LBB357_107 # %bb.103: st.d $t7, $sp, 80 # 8-byte Folded Spill @@ -51952,15 +51942,15 @@ balance_nonroot: # @balance_nonroot .LBB357_104: move $s3, $zero ori $s1, $zero, 1 - b .LBB357_198 + b .LBB357_197 .LBB357_105: - ld.d $a3, $sp, 72 # 8-byte Folded Reload + ld.d $a3, $sp, 80 # 8-byte Folded Reload .LBB357_106: # %._crit_edge656.thread ld.d $a0, $a3, 112 ld.bu $a0, $a0, 0 st.d $a0, $sp, 152 # 8-byte Folded Spill move $t7, $zero - st.d $zero, $sp, 48 # 8-byte Folded Spill + st.d $zero, $sp, 56 # 8-byte Folded Spill st.w $zero, $sp, 216 st.w $fp, $sp, 236 .LBB357_107: # %.lr.ph661.preheader @@ -51968,9 +51958,9 @@ balance_nonroot: # @balance_nonroot move $s8, $zero st.d $t7, $sp, 80 # 8-byte Folded Spill addi.w $a0, $t7, 1 - st.d $a0, $sp, 72 # 8-byte Folded Spill + st.d $a0, $sp, 88 # 8-byte Folded Spill bstrpick.d $a0, $a0, 31, 0 - st.d $a0, $sp, 128 # 8-byte Folded Spill + st.d $a0, $sp, 136 # 8-byte Folded Spill addi.d $s5, $sp, 284 addi.d $s3, $sp, 304 addi.d $s0, $sp, 384 @@ -51985,7 +51975,7 @@ balance_nonroot: # @balance_nonroot st.d $zero, $s0, 0 pcaddu18i $ra, %call36(sqlite3PagerWrite) jirl $ra, $ra, 0 - bnez $a0, .LBB357_178 + bnez $a0, .LBB357_177 .LBB357_109: # in Loop: Header=BB357_110 Depth=1 move $a0, $s6 ld.d $a1, $sp, 152 # 8-byte Folded Reload @@ -51997,7 +51987,7 @@ balance_nonroot: # @balance_nonroot addi.d $s0, $s0, 8 addi.d $s7, $s7, 4 addi.w $s2, $s2, 1 - ld.d $a0, $sp, 128 # 8-byte Folded Reload + ld.d $a0, $sp, 136 # 8-byte Folded Reload beq $a0, $s8, .LBB357_113 .LBB357_110: # %.lr.ph661 # =>This Inner Loop Header: Depth=1 @@ -52011,7 +52001,7 @@ balance_nonroot: # @balance_nonroot move $a4, $zero pcaddu18i $ra, %call36(allocateBtreePage) jirl $ra, $ra, 0 - bnez $a0, .LBB357_177 + bnez $a0, .LBB357_176 # %bb.112: # in Loop: Header=BB357_110 Depth=1 ld.d $s6, $sp, 408 st.d $s6, $s3, 0 @@ -52019,7 +52009,7 @@ balance_nonroot: # @balance_nonroot .LBB357_113: ld.d $s7, $sp, 144 # 8-byte Folded Reload ld.d $s8, $sp, 176 # 8-byte Folded Reload - ld.d $s3, $sp, 72 # 8-byte Folded Reload + ld.d $s3, $sp, 88 # 8-byte Folded Reload .LBB357_114: # %.preheader594 ld.d $a0, $sp, 160 # 8-byte Folded Reload bgeu $s3, $a0, .LBB357_120 @@ -52042,7 +52032,7 @@ balance_nonroot: # @balance_nonroot move $a0, $s5 pcaddu18i $ra, %call36(freePage) jirl $ra, $ra, 0 - bnez $a0, .LBB357_176 + bnez $a0, .LBB357_175 # %bb.118: # in Loop: 
Header=BB357_117 Depth=1 beqz $s5, .LBB357_116 # %bb.119: # in Loop: Header=BB357_117 Depth=1 @@ -52051,9 +52041,9 @@ balance_nonroot: # @balance_nonroot jirl $ra, $ra, 0 b .LBB357_116 .LBB357_120: # %.preheader593 - ld.d $s2, $sp, 56 # 8-byte Folded Reload + ld.d $s2, $sp, 64 # 8-byte Folded Reload ld.d $s0, $sp, 80 # 8-byte Folded Reload - ld.d $a0, $sp, 48 # 8-byte Folded Reload + ld.d $a0, $sp, 56 # 8-byte Folded Reload beqz $a0, .LBB357_127 # %bb.121: # %.lr.ph675.preheader move $a5, $zero @@ -52116,48 +52106,48 @@ balance_nonroot: # @balance_nonroot stx.d $t6, $t2, $a4 b .LBB357_122 .LBB357_127: # %.preheader592 - blez $s3, .LBB357_181 + blez $s3, .LBB357_180 # %bb.128: # %.lr.ph683 move $s0, $zero addi.w $a0, $s3, -1 - st.d $a0, $sp, 48 # 8-byte Folded Spill + st.d $a0, $sp, 56 # 8-byte Folded Spill srli.d $a0, $s2, 2 move $s2, $zero slli.d $a1, $a0, 5 ld.d $a2, $sp, 168 # 8-byte Folded Reload add.d $a3, $a2, $a1 - st.d $a3, $sp, 32 # 8-byte Folded Spill + st.d $a3, $sp, 48 # 8-byte Folded Spill addi.d $a3, $a3, 8 - st.d $a3, $sp, 24 # 8-byte Folded Spill + st.d $a3, $sp, 40 # 8-byte Folded Spill alsl.d $a0, $a0, $a1, 3 - ld.d $a1, $sp, 64 # 8-byte Folded Reload + ld.d $a1, $sp, 72 # 8-byte Folded Reload alsl.d $a0, $a1, $a0, 3 add.d $a0, $a0, $a2 addi.d $a0, $a0, 432 - st.d $a0, $sp, 40 # 8-byte Folded Spill + st.d $a0, $sp, 64 # 8-byte Folded Spill addi.d $a0, $s7, 16 - st.d $a0, $sp, 16 # 8-byte Folded Spill - vrepli.b $vr4, 0 + st.d $a0, $sp, 8 # 8-byte Folded Spill + vrepli.b $vr0, 0 + vst $vr0, $sp, 16 # 16-byte Folded Spill addi.w $a0, $fp, 0 - st.d $a0, $sp, 56 # 8-byte Folded Spill - st.d $s3, $sp, 72 # 8-byte Folded Spill - vst $vr4, $sp, 128 # 16-byte Folded Spill + st.d $a0, $sp, 72 # 8-byte Folded Spill + st.d $s3, $sp, 88 # 8-byte Folded Spill b .LBB357_131 .LBB357_129: # in Loop: Header=BB357_131 Depth=1 - ld.d $a5, $sp, 80 # 8-byte Folded Reload + ld.d $a5, $sp, 136 # 8-byte Folded Reload addi.w $a5, $a5, 1 addi.d $s0, $s0, 1 - st.d $s0, $sp, 112 # 8-byte Folded Spill + st.d $s0, $sp, 120 # 8-byte Folded Spill .LBB357_130: # in Loop: Header=BB357_131 Depth=1 addi.d $s2, $s2, 1 move $s0, $a5 - beq $s2, $s3, .LBB357_181 + beq $s2, $s3, .LBB357_180 .LBB357_131: # =>This Loop Header: Depth=1 # Child Loop BB357_136 Depth 2 # Child Loop BB357_139 Depth 2 # Child Loop BB357_144 Depth 2 - # Child Loop BB357_153 Depth 2 - # Child Loop BB357_168 Depth 2 + # Child Loop BB357_152 Depth 2 + # Child Loop BB357_167 Depth 2 slli.d $a0, $s2, 2 addi.d $a1, $sp, 236 ldx.w $a5, $a0, $a1 @@ -52178,11 +52168,11 @@ balance_nonroot: # @balance_nonroot # in Loop: Header=BB357_131 Depth=1 bstrpick.d $a0, $fp, 30, 3 slli.d $a1, $a0, 3 - ld.d $a0, $sp, 24 # 8-byte Folded Reload + ld.d $a0, $sp, 40 # 8-byte Folded Reload alsl.d $a0, $s0, $a0, 1 move $a2, $a1 - vori.b $vr0, $vr4, 0 - vori.b $vr1, $vr4, 0 + vld $vr1, $sp, 16 # 16-byte Folded Reload + vori.b $vr0, $vr1, 0 .p2align 4, , 16 .LBB357_136: # %vector.body # Parent Loop BB357_131 Depth=1 @@ -52191,8 +52181,8 @@ balance_nonroot: # @balance_nonroot ld.d $a4, $a0, 0 vinsgr2vr.d $vr2, $a3, 0 vinsgr2vr.d $vr3, $a4, 0 - vilvl.h $vr2, $vr4, $vr2 - vilvl.h $vr3, $vr4, $vr3 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.wu.hu $vr3, $vr3, 0 vadd.w $vr0, $vr0, $vr2 vadd.w $vr1, $vr1, $vr3 addi.d $a2, $a2, -8 @@ -52210,7 +52200,7 @@ balance_nonroot: # @balance_nonroot sub.d $a2, $fp, $a1 slli.d $a1, $a1, 1 alsl.d $a1, $s0, $a1, 1 - ld.d $a3, $sp, 32 # 8-byte Folded Reload + ld.d $a3, $sp, 48 # 8-byte Folded Reload add.d $a1, $a3, $a1 .p2align 4, , 16 
.LBB357_139: # %.lr.ph.i527 @@ -52223,7 +52213,7 @@ balance_nonroot: # @balance_nonroot bnez $a2, .LBB357_139 .LBB357_140: # %._crit_edge.i525 # in Loop: Header=BB357_131 Depth=1 - st.d $s2, $sp, 64 # 8-byte Folded Spill + st.d $s2, $sp, 80 # 8-byte Folded Spill slli.d $a1, $s2, 3 addi.d $a2, $sp, 304 ldx.d $s5, $a1, $a2 @@ -52233,11 +52223,11 @@ balance_nonroot: # @balance_nonroot add.d $a1, $s3, $a1 revb.2h $a2, $fp st.h $a2, $a1, 3 - st.d $a5, $sp, 80 # 8-byte Folded Spill + st.d $a5, $sp, 136 # 8-byte Folded Spill bne $a5, $s0, .LBB357_142 # %bb.141: # in Loop: Header=BB357_131 Depth=1 move $a0, $zero - b .LBB357_148 + b .LBB357_147 .LBB357_142: # in Loop: Header=BB357_131 Depth=1 addi.w $a1, $a0, 0 move $a0, $s5 @@ -52284,76 +52274,72 @@ balance_nonroot: # @balance_nonroot move $a0, $fp .LBB357_147: # %assemblePage.exit # in Loop: Header=BB357_131 Depth=1 - vld $vr4, $sp, 128 # 16-byte Folded Reload -.LBB357_148: # %assemblePage.exit - # in Loop: Header=BB357_131 Depth=1 st.h $a0, $s5, 20 ld.bu $a0, $s8, 38 - ld.d $s3, $sp, 72 # 8-byte Folded Reload - ld.d $s2, $sp, 64 # 8-byte Folded Reload - beqz $a0, .LBB357_155 -# %bb.149: # %assemblePage.exit + ld.d $s3, $sp, 88 # 8-byte Folded Reload + ld.d $s2, $sp, 80 # 8-byte Folded Reload + beqz $a0, .LBB357_154 +# %bb.148: # %assemblePage.exit # in Loop: Header=BB357_131 Depth=1 - ld.d $a0, $sp, 80 # 8-byte Folded Reload - bge $s0, $a0, .LBB357_155 -# %bb.150: # %.lr.ph677 + ld.d $a0, $sp, 136 # 8-byte Folded Reload + bge $s0, $a0, .LBB357_154 +# %bb.149: # %.lr.ph677 # in Loop: Header=BB357_131 Depth=1 move $s6, $zero - ld.d $a0, $sp, 40 # 8-byte Folded Reload + ld.d $a0, $sp, 64 # 8-byte Folded Reload add.d $s0, $a0, $s0 - b .LBB357_153 -.LBB357_151: # in Loop: Header=BB357_153 Depth=2 + b .LBB357_152 +.LBB357_150: # in Loop: Header=BB357_152 Depth=2 move $a0, $s5 move $a1, $s6 pcaddu18i $ra, %call36(ptrmapPutOvfl) jirl $ra, $ra, 0 - vld $vr4, $sp, 128 # 16-byte Folded Reload - bnez $a0, .LBB357_176 -.LBB357_152: # in Loop: Header=BB357_153 Depth=2 + bnez $a0, .LBB357_175 +.LBB357_151: # in Loop: Header=BB357_152 Depth=2 addi.w $s6, $s6, 1 addi.d $s0, $s0, 1 - beq $fp, $s6, .LBB357_155 -.LBB357_153: # Parent Loop BB357_131 Depth=1 + beq $fp, $s6, .LBB357_154 +.LBB357_152: # Parent Loop BB357_131 Depth=1 # => This Inner Loop Header: Depth=2 ld.bu $a0, $s0, 0 ori $a1, $zero, 255 - beq $a0, $a1, .LBB357_151 -# %bb.154: # in Loop: Header=BB357_153 Depth=2 + beq $a0, $a1, .LBB357_150 +# %bb.153: # in Loop: Header=BB357_152 Depth=2 slli.d $a0, $a0, 3 addi.d $a1, $sp, 344 ldx.d $a0, $a0, $a1 ld.w $a0, $a0, 128 ld.w $a1, $s5, 128 - bne $a0, $a1, .LBB357_151 - b .LBB357_152 -.LBB357_155: # %.loopexit590 + bne $a0, $a1, .LBB357_150 + b .LBB357_151 +.LBB357_154: # %.loopexit590 # in Loop: Header=BB357_131 Depth=1 ld.d $a4, $sp, 168 # 8-byte Folded Reload - ld.d $a0, $sp, 48 # 8-byte Folded Reload - bgeu $s2, $a0, .LBB357_160 -# %bb.156: # %.loopexit590 - # in Loop: Header=BB357_131 Depth=1 ld.d $a0, $sp, 56 # 8-byte Folded Reload + bgeu $s2, $a0, .LBB357_159 +# %bb.155: # %.loopexit590 + # in Loop: Header=BB357_131 Depth=1 + ld.d $a0, $sp, 72 # 8-byte Folded Reload ori $fp, $zero, 1 - ld.d $a5, $sp, 80 # 8-byte Folded Reload + ld.d $a5, $sp, 136 # 8-byte Folded Reload bge $a5, $a0, .LBB357_130 -# %bb.157: # in Loop: Header=BB357_131 Depth=1 +# %bb.156: # in Loop: Header=BB357_131 Depth=1 slli.d $a1, $a5, 3 slli.d $a0, $a5, 1 ldx.hu $a0, $s1, $a0 ld.bu $a2, $s5, 4 ldx.d $s4, $a4, $a1 - ld.d $a1, $sp, 96 # 8-byte Folded Reload + ld.d $a1, $sp, 104 # 
8-byte Folded Reload add.w $a3, $a1, $a0 st.w $a3, $sp, 188 - beqz $a2, .LBB357_161 + beqz $a2, .LBB357_160 +# %bb.157: # in Loop: Header=BB357_131 Depth=1 + ld.d $s0, $sp, 120 # 8-byte Folded Reload + ld.d $a1, $sp, 112 # 8-byte Folded Reload + beqz $a1, .LBB357_161 # %bb.158: # in Loop: Header=BB357_131 Depth=1 - ld.d $s0, $sp, 112 # 8-byte Folded Reload - ld.d $a1, $sp, 104 # 8-byte Folded Reload - beqz $a1, .LBB357_162 -# %bb.159: # in Loop: Header=BB357_131 Depth=1 addi.w $a5, $a5, -1 - st.d $a5, $sp, 80 # 8-byte Folded Spill + st.d $a5, $sp, 136 # 8-byte Folded Spill slli.d $a0, $a5, 3 ldx.d $a1, $a4, $a0 addi.d $a2, $sp, 408 @@ -52361,8 +52347,8 @@ balance_nonroot: # @balance_nonroot pcaddu18i $ra, %call36(sqlite3BtreeParseCellPtr) jirl $ra, $ra, 0 ld.d $a3, $sp, 416 - ld.d $a0, $sp, 88 # 8-byte Folded Reload - ld.d $fp, $sp, 120 # 8-byte Folded Reload + ld.d $a0, $sp, 96 # 8-byte Folded Reload + ld.d $fp, $sp, 128 # 8-byte Folded Reload add.d $s4, $a0, $fp addi.d $a7, $sp, 188 move $a0, $s7 @@ -52376,29 +52362,29 @@ balance_nonroot: # @balance_nonroot ld.w $a3, $sp, 188 move $a4, $zero add.w $fp, $a3, $fp - st.d $fp, $sp, 120 # 8-byte Folded Spill + st.d $fp, $sp, 128 # 8-byte Folded Spill ori $fp, $zero, 1 - b .LBB357_164 -.LBB357_160: # in Loop: Header=BB357_131 Depth=1 - ld.d $a5, $sp, 80 # 8-byte Folded Reload + b .LBB357_163 +.LBB357_159: # in Loop: Header=BB357_131 Depth=1 + ld.d $a5, $sp, 136 # 8-byte Folded Reload b .LBB357_130 -.LBB357_161: # in Loop: Header=BB357_131 Depth=1 +.LBB357_160: # in Loop: Header=BB357_131 Depth=1 ld.d $a0, $s5, 112 ld.w $a1, $s4, 0 move $a4, $zero st.w $a1, $a0, 8 - ld.d $s0, $sp, 112 # 8-byte Folded Reload - b .LBB357_164 -.LBB357_162: # in Loop: Header=BB357_131 Depth=1 + ld.d $s0, $sp, 120 # 8-byte Folded Reload + b .LBB357_163 +.LBB357_161: # in Loop: Header=BB357_131 Depth=1 addi.d $s4, $s4, -4 - ld.d $a1, $sp, 88 # 8-byte Folded Reload - ld.d $a2, $sp, 120 # 8-byte Folded Reload + ld.d $a1, $sp, 96 # 8-byte Folded Reload + ld.d $a2, $sp, 128 # 8-byte Folded Reload add.d $a4, $a1, $a2 add.w $a2, $a3, $a2 - st.d $a2, $sp, 120 # 8-byte Folded Spill + st.d $a2, $sp, 128 # 8-byte Folded Spill ori $a1, $zero, 4 - bne $a0, $a1, .LBB357_164 -# %bb.163: # in Loop: Header=BB357_131 Depth=1 + bne $a0, $a1, .LBB357_163 +# %bb.162: # in Loop: Header=BB357_131 Depth=1 addi.d $a2, $sp, 408 move $a0, $s7 move $a1, $s4 @@ -52408,7 +52394,7 @@ balance_nonroot: # @balance_nonroot move $a4, $fp ori $fp, $zero, 1 ld.hu $a3, $sp, 438 -.LBB357_164: # in Loop: Header=BB357_131 Depth=1 +.LBB357_163: # in Loop: Header=BB357_131 Depth=1 addi.w $s6, $s0, 0 ori $a5, $zero, 4 move $a0, $s7 @@ -52416,34 +52402,33 @@ balance_nonroot: # @balance_nonroot move $a2, $s4 pcaddu18i $ra, %call36(insertCell) jirl $ra, $ra, 0 - bnez $a0, .LBB357_175 -# %bb.165: # in Loop: Header=BB357_131 Depth=1 + bnez $a0, .LBB357_174 +# %bb.164: # in Loop: Header=BB357_131 Depth=1 ld.bu $a1, $s7, 2 move $a0, $s0 - vld $vr4, $sp, 128 # 16-byte Folded Reload - beqz $a1, .LBB357_171 -# %bb.166: # %.lr.ph.i531 + beqz $a1, .LBB357_170 +# %bb.165: # %.lr.ph.i531 # in Loop: Header=BB357_131 Depth=1 addi.d $a2, $a1, 1 - ld.d $a0, $sp, 16 # 8-byte Folded Reload + ld.d $a0, $sp, 8 # 8-byte Folded Reload alsl.d $a1, $a1, $a0, 4 move $a0, $s0 - b .LBB357_168 -.LBB357_167: # in Loop: Header=BB357_168 Depth=2 + b .LBB357_167 +.LBB357_166: # in Loop: Header=BB357_167 Depth=2 addi.d $a2, $a2, -1 addi.d $a1, $a1, -16 - bgeu $fp, $a2, .LBB357_171 -.LBB357_168: # Parent Loop BB357_131 Depth=1 + bgeu $fp, $a2, 
.LBB357_170 +.LBB357_167: # Parent Loop BB357_131 Depth=1 # => This Inner Loop Header: Depth=2 ld.hu $a3, $a1, 0 addi.w $a4, $a0, 0 - blt $a4, $a3, .LBB357_167 -# %bb.169: # in Loop: Header=BB357_168 Depth=2 - beq $a4, $a3, .LBB357_172 -# %bb.170: # in Loop: Header=BB357_168 Depth=2 + blt $a4, $a3, .LBB357_166 +# %bb.168: # in Loop: Header=BB357_167 Depth=2 + beq $a4, $a3, .LBB357_171 +# %bb.169: # in Loop: Header=BB357_167 Depth=2 addi.d $a0, $a0, -1 - b .LBB357_167 -.LBB357_171: # %._crit_edge.i537 + b .LBB357_166 +.LBB357_170: # %._crit_edge.i537 # in Loop: Header=BB357_131 Depth=1 ld.hu $a1, $s7, 14 ld.d $a2, $s7, 112 @@ -52454,58 +52439,57 @@ balance_nonroot: # @balance_nonroot slli.d $a1, $a1, 8 add.d $a1, $a2, $a1 add.d $a0, $a1, $a0 - b .LBB357_173 -.LBB357_172: # %.thread.i540 + b .LBB357_172 +.LBB357_171: # %.thread.i540 # in Loop: Header=BB357_131 Depth=1 ld.d $a0, $a1, -8 -.LBB357_173: # %findOverflowCell.exit541 +.LBB357_172: # %findOverflowCell.exit541 # in Loop: Header=BB357_131 Depth=1 ld.w $a1, $s5, 128 revb.2w $a1, $a1 st.w $a1, $a0, 0 ld.bu $a0, $s8, 38 sltui $a0, $a0, 1 - ld.d $a1, $sp, 104 # 8-byte Folded Reload + ld.d $a1, $sp, 112 # 8-byte Folded Reload or $a0, $a0, $a1 andi $a0, $a0, 1 bnez $a0, .LBB357_129 -# %bb.174: # in Loop: Header=BB357_131 Depth=1 +# %bb.173: # in Loop: Header=BB357_131 Depth=1 move $a0, $s7 move $a1, $s6 pcaddu18i $ra, %call36(ptrmapPutOvfl) jirl $ra, $ra, 0 - vld $vr4, $sp, 128 # 16-byte Folded Reload beqz $a0, .LBB357_129 -.LBB357_175: +.LBB357_174: move $s4, $a0 ld.d $a0, $sp, 168 # 8-byte Folded Reload ld.d $s1, $sp, 160 # 8-byte Folded Reload - b .LBB357_197 -.LBB357_176: + b .LBB357_196 +.LBB357_175: move $s4, $a0 ld.d $s1, $sp, 160 # 8-byte Folded Reload - b .LBB357_196 -.LBB357_177: + b .LBB357_195 +.LBB357_176: move $s4, $a0 - b .LBB357_179 -.LBB357_178: # %.threadsplit + b .LBB357_178 +.LBB357_177: # %.threadsplit move $s4, $a0 addi.w $s2, $s2, 1 -.LBB357_179: # %.thread +.LBB357_178: # %.thread move $s3, $s2 ld.d $s7, $sp, 144 # 8-byte Folded Reload ld.d $s1, $sp, 160 # 8-byte Folded Reload - b .LBB357_196 -.LBB357_180: + b .LBB357_195 +.LBB357_179: move $s0, $zero ori $s1, $zero, 1 b .LBB357_42 -.LBB357_181: # %._crit_edge684 +.LBB357_180: # %._crit_edge684 ld.d $a0, $sp, 152 # 8-byte Folded Reload andi $a0, $a0, 8 ld.d $s1, $sp, 160 # 8-byte Folded Reload - bnez $a0, .LBB357_183 -# %bb.182: + bnez $a0, .LBB357_182 +# %bb.181: addi.d $a0, $sp, 344 alsl.d $a0, $s1, $a0, 3 ld.d $a0, $a0, -8 @@ -52516,14 +52500,14 @@ balance_nonroot: # @balance_nonroot ld.d $a1, $a1, 112 ld.w $a0, $a0, 8 st.w $a0, $a1, 8 -.LBB357_183: +.LBB357_182: ld.hu $a0, $s7, 20 ld.bu $a1, $s7, 2 add.d $a0, $a1, $a0 - ld.d $a1, $sp, 112 # 8-byte Folded Reload + ld.d $a1, $sp, 120 # 8-byte Folded Reload addi.w $a1, $a1, 0 - bne $a1, $a0, .LBB357_188 -# %bb.184: + bne $a1, $a0, .LBB357_187 +# %bb.183: ld.d $a0, $s7, 112 ld.bu $a1, $s7, 8 addi.d $a2, $sp, 284 @@ -52532,8 +52516,8 @@ balance_nonroot: # @balance_nonroot add.d $a0, $a0, $a1 revb.2w $a1, $a2 st.w $a1, $a0, 8 - b .LBB357_189 -.LBB357_185: + b .LBB357_188 +.LBB357_184: ld.d $s1, $sp, 304 ld.d $a0, $s1, 0 ld.d $a1, $s1, 80 @@ -52551,8 +52535,8 @@ balance_nonroot: # @balance_nonroot st.d $s8, $s1, 192 st.b $a1, $s1, 96 st.d $a2, $sp, 392 - beqz $a3, .LBB357_214 -.LBB357_186: # %getAndInitPage.exit.thread550.1 + beqz $a3, .LBB357_213 +.LBB357_185: # %getAndInitPage.exit.thread550.1 st.h $s0, $s1, 104 ld.hu $a1, $s1, 108 ld.bu $a2, $s1, 90 @@ -52560,11 +52544,11 @@ balance_nonroot: # @balance_nonroot ld.hu 
$a0, $s7, 20 add.d $a1, $fp, $a1 add.d $a1, $a1, $a2 - ld.d $a2, $sp, 112 # 8-byte Folded Reload + ld.d $a2, $sp, 120 # 8-byte Folded Reload addi.w $s1, $a2, 2 addi.d $fp, $a1, 1 - bgeu $s1, $a0, .LBB357_202 -# %bb.187: + bgeu $s1, $a0, .LBB357_201 +# %bb.186: ld.hu $a0, $s7, 14 ld.d $a1, $s7, 112 alsl.w $a0, $s1, $a0, 1 @@ -52584,8 +52568,8 @@ balance_nonroot: # @balance_nonroot slli.w $a2, $a3, 8 or $a0, $a0, $a2 addi.d $a1, $a1, 3 - b .LBB357_204 -.LBB357_188: + b .LBB357_203 +.LBB357_187: move $a0, $s7 pcaddu18i $ra, %call36(findOverflowCell) jirl $ra, $ra, 0 @@ -52594,36 +52578,36 @@ balance_nonroot: # @balance_nonroot ld.w $a1, $a1, -4 revb.2w $a1, $a1 st.w $a1, $a0, 0 -.LBB357_189: - blez $s3, .LBB357_193 -# %bb.190: # %.lr.ph688.preheader +.LBB357_188: + blez $s3, .LBB357_192 +# %bb.189: # %.lr.ph688.preheader addi.d $fp, $sp, 304 move $s0, $s3 -.LBB357_191: # %.lr.ph688 +.LBB357_190: # %.lr.ph688 # =>This Inner Loop Header: Depth=1 ld.d $a0, $fp, 0 pcaddu18i $ra, %call36(reparentChildPages) jirl $ra, $ra, 0 - bnez $a0, .LBB357_195 -# %bb.192: # in Loop: Header=BB357_191 Depth=1 + bnez $a0, .LBB357_194 +# %bb.191: # in Loop: Header=BB357_190 Depth=1 addi.d $s0, $s0, -1 addi.d $fp, $fp, 8 - bnez $s0, .LBB357_191 -.LBB357_193: # %._crit_edge689 + bnez $s0, .LBB357_190 +.LBB357_192: # %._crit_edge689 move $a0, $s7 pcaddu18i $ra, %call36(reparentChildPages) jirl $ra, $ra, 0 - bnez $a0, .LBB357_195 -# %bb.194: + bnez $a0, .LBB357_194 +# %bb.193: move $a0, $s7 move $a1, $zero pcaddu18i $ra, %call36(balance) jirl $ra, $ra, 0 -.LBB357_195: # %.loopexit +.LBB357_194: # %.loopexit move $s4, $a0 -.LBB357_196: # %.loopexit +.LBB357_195: # %.loopexit ld.d $a0, $sp, 168 # 8-byte Folded Reload -.LBB357_197: # %.loopexit +.LBB357_196: # %.loopexit ld.w $a1, $a0, -8 pcalau12i $a2, %pc_hi20(mem.5) ld.d $a3, $a2, %pc_lo12(mem.5) @@ -52632,28 +52616,28 @@ balance_nonroot: # @balance_nonroot st.d $a1, $a2, %pc_lo12(mem.5) pcaddu18i $ra, %call36(free) jirl $ra, $ra, 0 - beqz $s1, .LBB357_207 -.LBB357_198: # %.lr.ph691.preheader + beqz $s1, .LBB357_206 +.LBB357_197: # %.lr.ph691.preheader addi.d $fp, $sp, 384 - b .LBB357_200 + b .LBB357_199 .p2align 4, , 16 -.LBB357_199: # %releasePage.exit543 - # in Loop: Header=BB357_200 Depth=1 +.LBB357_198: # %releasePage.exit543 + # in Loop: Header=BB357_199 Depth=1 addi.d $s1, $s1, -1 addi.d $fp, $fp, 8 - beqz $s1, .LBB357_207 -.LBB357_200: # %.lr.ph691 + beqz $s1, .LBB357_206 +.LBB357_199: # %.lr.ph691 # =>This Inner Loop Header: Depth=1 ld.d $a0, $fp, 0 - beqz $a0, .LBB357_199 -# %bb.201: # in Loop: Header=BB357_200 Depth=1 + beqz $a0, .LBB357_198 +# %bb.200: # in Loop: Header=BB357_199 Depth=1 ld.d $a0, $a0, 120 pcaddu18i $ra, %call36(sqlite3PagerUnref) jirl $ra, $ra, 0 - b .LBB357_199 -.LBB357_202: - bne $s1, $a0, .LBB357_216 -# %bb.203: + b .LBB357_198 +.LBB357_201: + bne $s1, $a0, .LBB357_215 +# %bb.202: ld.d $a0, $s7, 112 ld.bu $a1, $s7, 8 add.d $a1, $a0, $a1 @@ -52666,66 +52650,66 @@ balance_nonroot: # @balance_nonroot slli.w $a2, $a3, 8 or $a0, $a0, $a2 addi.d $a1, $a1, 11 -.LBB357_204: +.LBB357_203: ld.bu $a1, $a1, 0 or $s2, $a0, $a1 st.w $s2, $sp, 380 - beqz $s2, .LBB357_221 -# %bb.205: + beqz $s2, .LBB357_220 +# %bb.204: ld.d $a0, $s8, 0 addi.d $a2, $sp, 304 move $a1, $s2 move $a3, $zero pcaddu18i $ra, %call36(sqlite3PagerAcquire) jirl $ra, $ra, 0 - beqz $a0, .LBB357_217 -# %bb.206: + beqz $a0, .LBB357_216 +# %bb.205: move $s4, $a0 ori $s1, $zero, 2 move $s3, $zero - bnez $s1, .LBB357_198 -.LBB357_207: # %.preheader - blez $s3, .LBB357_212 -# %bb.208: # 
%.lr.ph693.preheader + bnez $s1, .LBB357_197 +.LBB357_206: # %.preheader + blez $s3, .LBB357_211 +# %bb.207: # %.lr.ph693.preheader addi.d $fp, $sp, 304 - b .LBB357_210 + b .LBB357_209 .p2align 4, , 16 -.LBB357_209: # %releasePage.exit545 - # in Loop: Header=BB357_210 Depth=1 +.LBB357_208: # %releasePage.exit545 + # in Loop: Header=BB357_209 Depth=1 addi.d $s3, $s3, -1 addi.d $fp, $fp, 8 - beqz $s3, .LBB357_212 -.LBB357_210: # %.lr.ph693 + beqz $s3, .LBB357_211 +.LBB357_209: # %.lr.ph693 # =>This Inner Loop Header: Depth=1 ld.d $a0, $fp, 0 - beqz $a0, .LBB357_209 -# %bb.211: # in Loop: Header=BB357_210 Depth=1 + beqz $a0, .LBB357_208 +# %bb.210: # in Loop: Header=BB357_209 Depth=1 ld.d $a0, $a0, 120 pcaddu18i $ra, %call36(sqlite3PagerUnref) jirl $ra, $ra, 0 - b .LBB357_209 -.LBB357_212: # %._crit_edge694 + b .LBB357_208 +.LBB357_211: # %._crit_edge694 beqz $s7, .LBB357_11 -.LBB357_213: # %._crit_edge694.thread +.LBB357_212: # %._crit_edge694.thread ld.d $a0, $s7, 120 pcaddu18i $ra, %call36(sqlite3PagerUnref) jirl $ra, $ra, 0 b .LBB357_11 -.LBB357_214: # %getAndInitPage.exit.1 +.LBB357_213: # %getAndInitPage.exit.1 move $a1, $s7 pcaddu18i $ra, %call36(sqlite3BtreeInitPage) jirl $ra, $ra, 0 - beqz $a0, .LBB357_186 -# %bb.215: + beqz $a0, .LBB357_185 +# %bb.214: move $s3, $zero ori $s1, $zero, 1 move $s4, $a0 - b .LBB357_198 -.LBB357_216: + b .LBB357_197 +.LBB357_215: move $s0, $zero ori $s1, $zero, 2 b .LBB357_42 -.LBB357_217: +.LBB357_216: move $s5, $s3 ld.d $s3, $sp, 304 ld.d $a0, $s3, 0 @@ -52744,8 +52728,8 @@ balance_nonroot: # @balance_nonroot st.d $s8, $s3, 192 st.b $a1, $s3, 96 st.d $a2, $sp, 400 - beqz $a3, .LBB357_219 -.LBB357_218: # %getAndInitPage.exit.thread550.2 + beqz $a3, .LBB357_218 +.LBB357_217: # %getAndInitPage.exit.thread550.2 move $s0, $zero ld.hu $a0, $s3, 108 ld.bu $a1, $s3, 90 @@ -52757,17 +52741,17 @@ balance_nonroot: # @balance_nonroot ori $s1, $zero, 3 move $s3, $s5 b .LBB357_42 -.LBB357_219: # %getAndInitPage.exit.2 +.LBB357_218: # %getAndInitPage.exit.2 move $a1, $s7 pcaddu18i $ra, %call36(sqlite3BtreeInitPage) jirl $ra, $ra, 0 - beqz $a0, .LBB357_218 -# %bb.220: + beqz $a0, .LBB357_217 +# %bb.219: move $s4, $a0 -.LBB357_221: +.LBB357_220: move $s3, $zero ori $s1, $zero, 2 - b .LBB357_198 + b .LBB357_197 .Lfunc_end357: .size balance_nonroot, .Lfunc_end357-balance_nonroot # -- End function @@ -52928,23 +52912,22 @@ assemblePage: # @assemblePage addi.d $a1, $s1, 8 move $a2, $a0 vori.b $vr1, $vr0, 0 - vori.b $vr2, $vr0, 0 .p2align 4, , 16 .LBB360_5: # %vector.body # =>This Inner Loop Header: Depth=1 ld.d $a3, $a1, -8 ld.d $a4, $a1, 0 - vinsgr2vr.d $vr3, $a3, 0 - vinsgr2vr.d $vr4, $a4, 0 - vilvl.h $vr3, $vr0, $vr3 - vilvl.h $vr4, $vr0, $vr4 + vinsgr2vr.d $vr2, $a3, 0 + vinsgr2vr.d $vr3, $a4, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vadd.w $vr0, $vr0, $vr2 vadd.w $vr1, $vr1, $vr3 - vadd.w $vr2, $vr2, $vr4 addi.d $a2, $a2, -8 addi.d $a1, $a1, 16 bnez $a2, .LBB360_5 # %bb.6: # %middle.block - vadd.w $vr0, $vr2, $vr1 + vadd.w $vr0, $vr1, $vr0 vhaddw.d.w $vr0, $vr0, $vr0 vhaddw.q.d $vr0, $vr0, $vr0 vpickve2gr.d $a1, $vr0, 0 @@ -101444,42 +101427,42 @@ sqlite3ValueFromExpr: # @sqlite3ValueFromExpr ld.b $s7, $a5, 12 ld.b $s8, $a5, 14 ld.b $ra, $a5, 16 - vinsgr2vr.b $vr7, $t2, 0 - vinsgr2vr.b $vr7, $t3, 4 - vinsgr2vr.b $vr7, $t4, 8 - vinsgr2vr.b $vr7, $t5, 12 - vslli.w $vr7, $vr7, 24 - vsrai.w $vr7, $vr7, 24 - vinsgr2vr.b $vr8, $a6, 0 - vinsgr2vr.b $vr8, $a7, 4 - vinsgr2vr.b $vr8, $t0, 8 - vinsgr2vr.b $vr8, $t1, 12 - vslli.w $vr8, $vr8, 24 - vsrai.w 
$vr8, $vr8, 24 + vinsgr2vr.b $vr7, $a6, 0 + vinsgr2vr.b $vr7, $a7, 1 + vinsgr2vr.b $vr7, $t0, 2 + vinsgr2vr.b $vr7, $t1, 3 + vinsgr2vr.b $vr8, $t2, 0 + vinsgr2vr.b $vr8, $t3, 1 + vinsgr2vr.b $vr8, $t4, 2 + vinsgr2vr.b $vr8, $t5, 3 + vsllwil.h.b $vr8, $vr8, 0 + vsllwil.w.h $vr8, $vr8, 0 + vsllwil.h.b $vr7, $vr7, 0 + vsllwil.w.h $vr7, $vr7, 0 vinsgr2vr.b $vr9, $s6, 0 - vinsgr2vr.b $vr9, $s7, 4 - vinsgr2vr.b $vr9, $s8, 8 - vinsgr2vr.b $vr9, $ra, 12 - vslli.w $vr9, $vr9, 24 - vsrai.w $vr9, $vr9, 24 + vinsgr2vr.b $vr9, $s7, 1 + vinsgr2vr.b $vr9, $s8, 2 + vinsgr2vr.b $vr9, $ra, 3 + vsllwil.h.b $vr9, $vr9, 0 + vsllwil.w.h $vr9, $vr9, 0 vinsgr2vr.b $vr10, $t6, 0 - vinsgr2vr.b $vr10, $t7, 4 - vinsgr2vr.b $vr10, $t8, 8 - vinsgr2vr.b $vr10, $s3, 12 - vslli.w $vr10, $vr10, 24 - vsrai.w $vr10, $vr10, 24 + vinsgr2vr.b $vr10, $t7, 1 + vinsgr2vr.b $vr10, $t8, 2 + vinsgr2vr.b $vr10, $s3, 3 + vsllwil.h.b $vr10, $vr10, 0 + vsllwil.w.h $vr10, $vr10, 0 vadd.w $vr11, $vr10, $vr0 vadd.w $vr12, $vr9, $vr0 - vadd.w $vr13, $vr8, $vr0 - vadd.w $vr14, $vr7, $vr0 + vadd.w $vr13, $vr7, $vr0 + vadd.w $vr14, $vr8, $vr0 vslt.wu $vr15, $vr1, $vr14 vslt.wu $vr16, $vr1, $vr13 vslt.wu $vr17, $vr1, $vr12 vslt.wu $vr18, $vr1, $vr11 vadd.w $vr19, $vr10, $vr2 vadd.w $vr20, $vr9, $vr2 - vadd.w $vr21, $vr8, $vr2 - vadd.w $vr22, $vr7, $vr2 + vadd.w $vr21, $vr7, $vr2 + vadd.w $vr22, $vr8, $vr2 vslti.wu $vr22, $vr22, 6 vslti.wu $vr21, $vr21, 6 vslti.wu $vr20, $vr20, 6 @@ -101488,16 +101471,16 @@ sqlite3ValueFromExpr: # @sqlite3ValueFromExpr vbitsel.v $vr20, $vr4, $vr3, $vr20 vbitsel.v $vr21, $vr4, $vr3, $vr21 vbitsel.v $vr22, $vr4, $vr3, $vr22 - vadd.w $vr7, $vr22, $vr7 - vadd.w $vr8, $vr21, $vr8 + vadd.w $vr8, $vr22, $vr8 + vadd.w $vr7, $vr21, $vr7 vadd.w $vr9, $vr20, $vr9 vadd.w $vr10, $vr19, $vr10 vbitsel.v $vr10, $vr11, $vr10, $vr18 vbitsel.v $vr9, $vr12, $vr9, $vr17 - vbitsel.v $vr8, $vr13, $vr8, $vr16 - vbitsel.v $vr7, $vr14, $vr7, $vr15 + vbitsel.v $vr11, $vr13, $vr7, $vr16 + vbitsel.v $vr7, $vr14, $vr8, $vr15 vslli.w $vr7, $vr7, 4 - vslli.w $vr8, $vr8, 4 + vslli.w $vr8, $vr11, 4 vslli.w $vr9, $vr9, 4 vslli.w $vr10, $vr10, 4 ld.b $a6, $a5, -13 @@ -101516,42 +101499,42 @@ sqlite3ValueFromExpr: # @sqlite3ValueFromExpr ld.b $s7, $a5, 13 ld.b $s8, $a5, 15 ld.b $ra, $a5, 17 - vinsgr2vr.b $vr11, $t6, 0 - vinsgr2vr.b $vr11, $t7, 4 - vinsgr2vr.b $vr11, $t8, 8 - vinsgr2vr.b $vr11, $s3, 12 - vslli.w $vr11, $vr11, 24 - vsrai.w $vr11, $vr11, 24 - vinsgr2vr.b $vr12, $s6, 0 - vinsgr2vr.b $vr12, $s7, 4 - vinsgr2vr.b $vr12, $s8, 8 - vinsgr2vr.b $vr12, $ra, 12 - vslli.w $vr12, $vr12, 24 - vsrai.w $vr12, $vr12, 24 - vinsgr2vr.b $vr13, $a6, 0 - vinsgr2vr.b $vr13, $a7, 4 - vinsgr2vr.b $vr13, $t0, 8 - vinsgr2vr.b $vr13, $t1, 12 - vslli.w $vr13, $vr13, 24 - vsrai.w $vr13, $vr13, 24 + vinsgr2vr.b $vr11, $a6, 0 + vinsgr2vr.b $vr11, $a7, 1 + vinsgr2vr.b $vr11, $t0, 2 + vinsgr2vr.b $vr11, $t1, 3 + vinsgr2vr.b $vr12, $t6, 0 + vinsgr2vr.b $vr12, $t7, 1 + vinsgr2vr.b $vr12, $t8, 2 + vinsgr2vr.b $vr12, $s3, 3 + vsllwil.h.b $vr12, $vr12, 0 + vsllwil.w.h $vr12, $vr12, 0 + vinsgr2vr.b $vr13, $s6, 0 + vinsgr2vr.b $vr13, $s7, 1 + vinsgr2vr.b $vr13, $s8, 2 + vinsgr2vr.b $vr13, $ra, 3 + vsllwil.h.b $vr13, $vr13, 0 + vsllwil.w.h $vr13, $vr13, 0 + vsllwil.h.b $vr11, $vr11, 0 + vsllwil.w.h $vr11, $vr11, 0 vinsgr2vr.b $vr14, $t2, 0 - vinsgr2vr.b $vr14, $t3, 4 - vinsgr2vr.b $vr14, $t4, 8 - vinsgr2vr.b $vr14, $t5, 12 - vslli.w $vr14, $vr14, 24 - vsrai.w $vr14, $vr14, 24 + vinsgr2vr.b $vr14, $t3, 1 + vinsgr2vr.b $vr14, $t4, 2 + vinsgr2vr.b $vr14, $t5, 3 + vsllwil.h.b 
$vr14, $vr14, 0 + vsllwil.w.h $vr14, $vr14, 0 vadd.w $vr15, $vr14, $vr0 - vadd.w $vr16, $vr13, $vr0 - vadd.w $vr17, $vr12, $vr0 - vadd.w $vr18, $vr11, $vr0 + vadd.w $vr16, $vr11, $vr0 + vadd.w $vr17, $vr13, $vr0 + vadd.w $vr18, $vr12, $vr0 vslt.wu $vr19, $vr1, $vr18 vslt.wu $vr20, $vr1, $vr17 vslt.wu $vr21, $vr1, $vr16 vslt.wu $vr22, $vr1, $vr15 vadd.w $vr23, $vr14, $vr2 - vadd.w $vr24, $vr13, $vr2 - vadd.w $vr25, $vr12, $vr2 - vadd.w $vr26, $vr11, $vr2 + vadd.w $vr24, $vr11, $vr2 + vadd.w $vr25, $vr13, $vr2 + vadd.w $vr26, $vr12, $vr2 vslti.wu $vr26, $vr26, 6 vslti.wu $vr25, $vr25, 6 vslti.wu $vr24, $vr24, 6 @@ -101560,17 +101543,17 @@ sqlite3ValueFromExpr: # @sqlite3ValueFromExpr vbitsel.v $vr24, $vr6, $vr5, $vr24 vbitsel.v $vr25, $vr6, $vr5, $vr25 vbitsel.v $vr26, $vr6, $vr5, $vr26 - vadd.w $vr11, $vr26, $vr11 - vadd.w $vr12, $vr25, $vr12 - vadd.w $vr13, $vr24, $vr13 + vadd.w $vr12, $vr26, $vr12 + vadd.w $vr13, $vr25, $vr13 + vadd.w $vr11, $vr24, $vr11 vadd.w $vr14, $vr23, $vr14 vbitsel.v $vr14, $vr15, $vr14, $vr22 - vbitsel.v $vr13, $vr16, $vr13, $vr21 - vbitsel.v $vr12, $vr17, $vr12, $vr20 - vbitsel.v $vr11, $vr18, $vr11, $vr19 - vor.v $vr10, $vr11, $vr10 - vor.v $vr9, $vr12, $vr9 - vor.v $vr8, $vr13, $vr8 + vbitsel.v $vr11, $vr16, $vr11, $vr21 + vbitsel.v $vr13, $vr17, $vr13, $vr20 + vbitsel.v $vr12, $vr18, $vr12, $vr19 + vor.v $vr10, $vr12, $vr10 + vor.v $vr9, $vr13, $vr9 + vor.v $vr8, $vr11, $vr8 vor.v $vr7, $vr14, $vr7 vpickev.h $vr7, $vr7, $vr8 vpickev.h $vr8, $vr9, $vr10 @@ -101796,29 +101779,29 @@ sqlite3HexToBlob: # @sqlite3HexToBlob ld.b $s4, $a5, 13 ld.b $s5, $a5, 15 vinsgr2vr.b $vr6, $a6, 0 - vinsgr2vr.b $vr6, $a7, 4 - vinsgr2vr.b $vr6, $t0, 8 - vinsgr2vr.b $vr6, $t1, 12 - vslli.w $vr6, $vr6, 24 - vsrai.w $vr9, $vr6, 24 + vinsgr2vr.b $vr6, $a7, 1 + vinsgr2vr.b $vr6, $t0, 2 + vinsgr2vr.b $vr6, $t1, 3 + vsllwil.h.b $vr6, $vr6, 0 + vsllwil.w.h $vr9, $vr6, 0 vinsgr2vr.b $vr6, $t2, 0 - vinsgr2vr.b $vr6, $t3, 4 - vinsgr2vr.b $vr6, $t4, 8 - vinsgr2vr.b $vr6, $t5, 12 - vslli.w $vr6, $vr6, 24 - vsrai.w $vr14, $vr6, 24 + vinsgr2vr.b $vr6, $t3, 1 + vinsgr2vr.b $vr6, $t4, 2 + vinsgr2vr.b $vr6, $t5, 3 + vsllwil.h.b $vr6, $vr6, 0 + vsllwil.w.h $vr14, $vr6, 0 vinsgr2vr.b $vr6, $t6, 0 - vinsgr2vr.b $vr6, $t7, 4 - vinsgr2vr.b $vr6, $t8, 8 - vinsgr2vr.b $vr6, $s1, 12 - vslli.w $vr6, $vr6, 24 - vsrai.w $vr15, $vr6, 24 + vinsgr2vr.b $vr6, $t7, 1 + vinsgr2vr.b $vr6, $t8, 2 + vinsgr2vr.b $vr6, $s1, 3 + vsllwil.h.b $vr6, $vr6, 0 + vsllwil.w.h $vr15, $vr6, 0 vinsgr2vr.b $vr6, $s2, 0 - vinsgr2vr.b $vr6, $s3, 4 - vinsgr2vr.b $vr6, $s4, 8 - vinsgr2vr.b $vr6, $s5, 12 - vslli.w $vr6, $vr6, 24 - vsrai.w $vr16, $vr6, 24 + vinsgr2vr.b $vr6, $s3, 1 + vinsgr2vr.b $vr6, $s4, 2 + vinsgr2vr.b $vr6, $s5, 3 + vsllwil.h.b $vr6, $vr6, 0 + vsllwil.w.h $vr16, $vr6, 0 vadd.w $vr13, $vr16, $vr0 vadd.w $vr11, $vr15, $vr0 vadd.w $vr17, $vr14, $vr0 @@ -101959,29 +101942,29 @@ sqlite3HexToBlob: # @sqlite3HexToBlob ld.b $s4, $a5, 14 ld.b $s5, $a5, 16 vinsgr2vr.b $vr10, $a6, 0 - vinsgr2vr.b $vr10, $a7, 4 - vinsgr2vr.b $vr10, $t0, 8 - vinsgr2vr.b $vr10, $t1, 12 - vslli.w $vr10, $vr10, 24 - vsrai.w $vr13, $vr10, 24 + vinsgr2vr.b $vr10, $a7, 1 + vinsgr2vr.b $vr10, $t0, 2 + vinsgr2vr.b $vr10, $t1, 3 + vsllwil.h.b $vr10, $vr10, 0 + vsllwil.w.h $vr13, $vr10, 0 vinsgr2vr.b $vr10, $t2, 0 - vinsgr2vr.b $vr10, $t3, 4 - vinsgr2vr.b $vr10, $t4, 8 - vinsgr2vr.b $vr10, $t5, 12 - vslli.w $vr10, $vr10, 24 - vsrai.w $vr17, $vr10, 24 + vinsgr2vr.b $vr10, $t3, 1 + vinsgr2vr.b $vr10, $t4, 2 + vinsgr2vr.b $vr10, $t5, 3 + vsllwil.h.b $vr10, 
$vr10, 0 + vsllwil.w.h $vr17, $vr10, 0 vinsgr2vr.b $vr10, $t6, 0 - vinsgr2vr.b $vr10, $t7, 4 - vinsgr2vr.b $vr10, $t8, 8 - vinsgr2vr.b $vr10, $s1, 12 - vslli.w $vr10, $vr10, 24 - vsrai.w $vr19, $vr10, 24 + vinsgr2vr.b $vr10, $t7, 1 + vinsgr2vr.b $vr10, $t8, 2 + vinsgr2vr.b $vr10, $s1, 3 + vsllwil.h.b $vr10, $vr10, 0 + vsllwil.w.h $vr19, $vr10, 0 vinsgr2vr.b $vr10, $s2, 0 - vinsgr2vr.b $vr10, $s3, 4 - vinsgr2vr.b $vr10, $s4, 8 - vinsgr2vr.b $vr10, $s5, 12 - vslli.w $vr10, $vr10, 24 - vsrai.w $vr20, $vr10, 24 + vinsgr2vr.b $vr10, $s3, 1 + vinsgr2vr.b $vr10, $s4, 2 + vinsgr2vr.b $vr10, $s5, 3 + vsllwil.h.b $vr10, $vr10, 0 + vsllwil.w.h $vr20, $vr10, 0 vadd.w $vr21, $vr20, $vr0 vadd.w $vr22, $vr19, $vr0 vadd.w $vr15, $vr17, $vr0 @@ -105610,7 +105593,7 @@ sqlite3WhereBegin: # @sqlite3WhereBegin fst.d $fs2, $sp, 1264 # 8-byte Folded Spill fst.d $fs3, $sp, 1256 # 8-byte Folded Spill fst.d $fs4, $sp, 1248 # 8-byte Folded Spill - st.d $a1, $sp, 240 # 8-byte Folded Spill + st.d $a1, $sp, 248 # 8-byte Folded Spill ld.hu $a1, $a1, 0 ori $a5, $zero, 65 move $s8, $a0 @@ -105625,7 +105608,7 @@ sqlite3WhereBegin: # @sqlite3WhereBegin b .LBB493_49 .LBB493_2: move $s2, $a2 - st.d $a3, $sp, 144 # 8-byte Folded Spill + st.d $a3, $sp, 152 # 8-byte Folded Spill st.d $a4, $sp, 40 # 8-byte Folded Spill beqz $a3, .LBB493_4 # %bb.3: @@ -105636,7 +105619,7 @@ sqlite3WhereBegin: # @sqlite3WhereBegin st.d $zero, $sp, 32 # 8-byte Folded Spill .LBB493_5: ld.d $a0, $s8, 24 - st.d $a0, $sp, 272 # 8-byte Folded Spill + st.d $a0, $sp, 280 # 8-byte Folded Spill addi.d $a0, $sp, 984 ori $a2, $zero, 260 addi.d $fp, $sp, 984 @@ -105655,7 +105638,7 @@ sqlite3WhereBegin: # @sqlite3WhereBegin move $a1, $s2 pcaddu18i $ra, %call36(whereSplit) jirl $ra, $ra, 0 - ld.d $a0, $sp, 240 # 8-byte Folded Reload + ld.d $a0, $sp, 248 # 8-byte Folded Reload ld.h $a0, $a0, 0 ld.d $s3, $s8, 0 alsl.d $a0, $a0, $a0, 1 @@ -105671,12 +105654,12 @@ sqlite3WhereBegin: # @sqlite3WhereBegin jirl $ra, $ra, 0 bnez $a0, .LBB493_10 # %bb.8: - st.d $zero, $sp, 136 # 8-byte Folded Spill + st.d $zero, $sp, 144 # 8-byte Folded Spill ori $a0, $zero, 1 st.b $a0, $s3, 42 ld.bu $a0, $s3, 42 pcalau12i $a1, %pc_hi20(mem.5) - st.d $a1, $sp, 96 # 8-byte Folded Spill + st.d $a1, $sp, 112 # 8-byte Folded Spill bnez $a0, .LBB493_33 b .LBB493_12 .LBB493_9: # %sqlite3DbMallocRaw.exit.i @@ -105686,27 +105669,27 @@ sqlite3WhereBegin: # @sqlite3WhereBegin beqz $a0, .LBB493_11 .LBB493_10: # %sqlite3DbMallocRaw.exit.thread8.i bstrpick.d $a2, $fp, 31, 0 - st.d $a0, $sp, 136 # 8-byte Folded Spill + st.d $a0, $sp, 144 # 8-byte Folded Spill move $a1, $zero pcaddu18i $ra, %call36(memset) jirl $ra, $ra, 0 ld.bu $a0, $s3, 42 pcalau12i $a1, %pc_hi20(mem.5) - st.d $a1, $sp, 96 # 8-byte Folded Spill + st.d $a1, $sp, 112 # 8-byte Folded Spill bnez $a0, .LBB493_33 b .LBB493_12 .LBB493_11: - st.d $zero, $sp, 136 # 8-byte Folded Spill + st.d $zero, $sp, 144 # 8-byte Folded Spill ld.bu $a0, $s3, 42 pcalau12i $a1, %pc_hi20(mem.5) - st.d $a1, $sp, 96 # 8-byte Folded Spill + st.d $a1, $sp, 112 # 8-byte Folded Spill bnez $a0, .LBB493_33 .LBB493_12: - ld.d $a2, $sp, 240 # 8-byte Folded Reload + ld.d $a2, $sp, 248 # 8-byte Folded Reload ld.h $a0, $a2, 0 - ld.d $a1, $sp, 136 # 8-byte Folded Reload + ld.d $a1, $sp, 144 # 8-byte Folded Reload st.w $a0, $a1, 28 - ld.d $a3, $sp, 272 # 8-byte Folded Reload + ld.d $a3, $sp, 280 # 8-byte Folded Reload ld.w $s0, $a3, 40 ld.w $a0, $a3, 44 st.d $s8, $a1, 0 @@ -105732,7 +105715,7 @@ sqlite3WhereBegin: # @sqlite3WhereBegin beqz $fp, .LBB493_17 .LBB493_16: ld.w 
$a1, $fp, -8 - ld.d $a3, $sp, 96 # 8-byte Folded Reload + ld.d $a3, $sp, 112 # 8-byte Folded Reload ld.d $a2, $a3, %pc_lo12(mem.5) addi.d $a0, $fp, -8 sub.d $a1, $a2, $a1 @@ -105749,9 +105732,9 @@ sqlite3WhereBegin: # @sqlite3WhereBegin move $a0, $fp pcaddu18i $ra, %call36(sqlite3_realloc) jirl $ra, $ra, 0 - beqz $a0, .LBB493_1128 + beqz $a0, .LBB493_1127 .LBB493_19: # %sqlite3DbReallocOrFree.exit.i - ld.d $a1, $sp, 272 # 8-byte Folded Reload + ld.d $a1, $sp, 280 # 8-byte Folded Reload st.d $a0, $a1, 48 beqz $a0, .LBB493_21 .LBB493_20: @@ -105761,11 +105744,11 @@ sqlite3WhereBegin: # @sqlite3WhereBegin stx.w $a2, $a0, $a1 .LBB493_21: # %sqlite3VdbeMakeLabel.exit nor $a0, $s0, $zero - ld.d $a1, $sp, 136 # 8-byte Folded Reload + ld.d $a1, $sp, 144 # 8-byte Folded Reload st.w $a0, $a1, 24 beqz $s2, .LBB493_26 # %bb.22: - ld.d $a1, $sp, 240 # 8-byte Folded Reload + ld.d $a1, $sp, 248 # 8-byte Folded Reload ld.hu $a1, $a1, 0 beqz $a1, .LBB493_25 # %bb.23: @@ -105780,7 +105763,7 @@ sqlite3WhereBegin: # @sqlite3WhereBegin ld.w $a0, $sp, 1244 beqz $a0, .LBB493_26 # %bb.24: # %._crit_edge1883 - ld.d $a0, $sp, 136 # 8-byte Folded Reload + ld.d $a0, $sp, 144 # 8-byte Folded Reload ld.w $a0, $a0, 24 .LBB493_25: addi.w $a2, $a0, 0 @@ -105790,12 +105773,12 @@ sqlite3WhereBegin: # @sqlite3WhereBegin pcaddu18i $ra, %call36(sqlite3ExprIfFalse) jirl $ra, $ra, 0 .LBB493_26: - ld.d $a0, $sp, 240 # 8-byte Folded Reload + ld.d $a0, $sp, 248 # 8-byte Folded Reload ld.h $a0, $a0, 0 blez $a0, .LBB493_29 # %bb.27: # %.lr.ph addi.d $a1, $sp, 988 - ld.d $a2, $sp, 240 # 8-byte Folded Reload + ld.d $a2, $sp, 248 # 8-byte Folded Reload addi.d $a2, $a2, 52 .p2align 4, , 16 .LBB493_28: # =>This Inner Loop Header: Depth=1 @@ -105859,7 +105842,7 @@ sqlite3WhereBegin: # @sqlite3WhereBegin beqz $a0, .LBB493_42 # %bb.41: ld.w $a1, $a0, -8 - ld.d $a3, $sp, 96 # 8-byte Folded Reload + ld.d $a3, $sp, 112 # 8-byte Folded Reload ld.d $a2, $a3, %pc_lo12(mem.5) addi.d $a0, $a0, -8 sub.d $a1, $a2, $a1 @@ -105867,11 +105850,11 @@ sqlite3WhereBegin: # @sqlite3WhereBegin pcaddu18i $ra, %call36(free) jirl $ra, $ra, 0 .LBB493_42: # %whereClauseClear.exit1402 - ld.d $a1, $sp, 136 # 8-byte Folded Reload + ld.d $a1, $sp, 144 # 8-byte Folded Reload beqz $a1, .LBB493_49 # %bb.43: # %.preheader.i1404 ld.w $a0, $a1, 28 - ld.d $a2, $sp, 96 # 8-byte Folded Reload + ld.d $a2, $sp, 112 # 8-byte Folded Reload ld.d $fp, $a2, %pc_lo12(mem.5) blez $a0, .LBB493_48 # %bb.44: # %.lr.ph.i1405.preheader @@ -105894,15 +105877,15 @@ sqlite3WhereBegin: # @sqlite3WhereBegin sub.d $fp, $fp, $a2 pcaddu18i $ra, %call36(free) jirl $ra, $ra, 0 - ld.d $a0, $sp, 136 # 8-byte Folded Reload + ld.d $a0, $sp, 144 # 8-byte Folded Reload ld.w $a0, $a0, 28 b .LBB493_45 .LBB493_48: # %sqlite3_free.exit12.i - ld.d $a0, $sp, 136 # 8-byte Folded Reload + ld.d $a0, $sp, 144 # 8-byte Folded Reload ld.w $a1, $a0, -8 addi.d $a0, $a0, -8 sub.d $a1, $fp, $a1 - ld.d $a2, $sp, 96 # 8-byte Folded Reload + ld.d $a2, $sp, 112 # 8-byte Folded Reload st.d $a1, $a2, %pc_lo12(mem.5) pcaddu18i $ra, %call36(free) jirl $ra, $ra, 0 @@ -105929,23 +105912,23 @@ sqlite3WhereBegin: # @sqlite3WhereBegin ret .LBB493_51: st.d $s3, $sp, 72 # 8-byte Folded Spill - ld.d $a1, $sp, 240 # 8-byte Folded Reload + ld.d $a1, $sp, 248 # 8-byte Folded Reload addi.d $a0, $a1, 8 - st.d $a0, $sp, 128 # 8-byte Folded Spill - ld.d $a0, $sp, 136 # 8-byte Folded Reload + st.d $a0, $sp, 136 # 8-byte Folded Spill + ld.d $a0, $sp, 144 # 8-byte Folded Reload addi.d $s3, $a0, 40 ld.h $a0, $a1, 0 lu12i.w $a1, 15 vrepli.b $vr0, 0 - 
vst $vr0, $sp, 448 # 16-byte Folded Spill - st.d $s8, $sp, 320 # 8-byte Folded Spill - st.d $s3, $sp, 192 # 8-byte Folded Spill + vst $vr0, $sp, 96 # 16-byte Folded Spill + st.d $s8, $sp, 336 # 8-byte Folded Spill + st.d $s3, $sp, 200 # 8-byte Folded Spill blez $a0, .LBB493_459 # %bb.52: # %.lr.ph1744 - st.d $zero, $sp, 152 # 8-byte Folded Spill + st.d $zero, $sp, 160 # 8-byte Folded Spill move $a6, $zero addi.w $s7, $zero, -1 - ld.d $a2, $sp, 144 # 8-byte Folded Reload + ld.d $a2, $sp, 152 # 8-byte Folded Reload sltu $a2, $zero, $a2 st.d $a2, $sp, 48 # 8-byte Folded Spill pcalau12i $a2, %pc_hi20(.LCPI493_0) @@ -105954,31 +105937,31 @@ sqlite3WhereBegin: # @sqlite3WhereBegin ori $s5, $zero, 1 movgr2fr.d $fs1, $zero ori $a1, $a1, 4095 - st.d $a1, $sp, 296 # 8-byte Folded Spill + st.d $a1, $sp, 304 # 8-byte Folded Spill lu12i.w $a2, 407238 lu12i.w $a1, 403142 ori $a1, $a1, 3938 - st.d $a1, $sp, 368 # 8-byte Folded Spill + st.d $a1, $sp, 384 # 8-byte Folded Spill lu12i.w $a1, 407174 ori $a1, $a1, 370 - st.d $a1, $sp, 424 # 8-byte Folded Spill + st.d $a1, $sp, 440 # 8-byte Folded Spill lu12i.w $a1, 468566 ori $a1, $a1, 364 - st.d $a1, $sp, 408 # 8-byte Folded Spill + st.d $a1, $sp, 424 # 8-byte Folded Spill lu12i.w $a1, 419526 ori $a1, $a1, 3937 - st.d $a1, $sp, 400 # 8-byte Folded Spill + st.d $a1, $sp, 416 # 8-byte Folded Spill lu12i.w $a1, 411383 ori $a1, $a1, 1378 - st.d $a1, $sp, 392 # 8-byte Folded Spill + st.d $a1, $sp, 408 # 8-byte Folded Spill lu12i.w $a1, 1686 ori $a1, $a1, 3700 - st.d $a1, $sp, 336 # 8-byte Folded Spill + st.d $a1, $sp, 352 # 8-byte Folded Spill lu12i.w $a1, 476759 ori $a1, $a1, 2164 - st.d $a1, $sp, 416 # 8-byte Folded Spill - vldi $vr8, -988 - vldi $vr9, -912 + st.d $a1, $sp, 432 # 8-byte Folded Spill + vldi $vr7, -988 + vldi $vr8, -912 lu12i.w $a1, 768 ori $a1, $a1, 512 st.d $a1, $sp, 16 # 8-byte Folded Spill @@ -105991,9 +105974,9 @@ sqlite3WhereBegin: # @sqlite3WhereBegin lu52i.d $a1, $a1, 1350 st.d $a1, $sp, 80 # 8-byte Folded Spill ori $a1, $a2, 3937 - st.d $a1, $sp, 352 # 8-byte Folded Spill + st.d $a1, $sp, 368 # 8-byte Folded Spill ori $t3, $zero, 48 - st.d $s3, $sp, 184 # 8-byte Folded Spill + st.d $s3, $sp, 192 # 8-byte Folded Spill move $a7, $s7 b .LBB493_55 .p2align 4, , 16 @@ -106003,18 +105986,18 @@ sqlite3WhereBegin: # @sqlite3WhereBegin .LBB493_54: # %getMask.exit841 # in Loop: Header=BB493_55 Depth=1 and $a7, $t0, $a7 - ld.d $a0, $sp, 240 # 8-byte Folded Reload + ld.d $a0, $sp, 248 # 8-byte Folded Reload ld.hu $a0, $a0, 0 and $s7, $a1, $s7 - ld.d $a3, $sp, 184 # 8-byte Folded Reload - ld.d $a1, $sp, 160 # 8-byte Folded Reload + ld.d $a3, $sp, 192 # 8-byte Folded Reload + ld.d $a1, $sp, 168 # 8-byte Folded Reload st.w $a1, $a3, 0 - ld.d $a2, $sp, 152 # 8-byte Folded Reload + ld.d $a2, $sp, 160 # 8-byte Folded Reload addi.w $a2, $a2, 1 ext.w.h $a1, $a0 addi.d $a3, $a3, 96 - st.d $a3, $sp, 184 # 8-byte Folded Spill - st.d $a2, $sp, 152 # 8-byte Folded Spill + st.d $a3, $sp, 192 # 8-byte Folded Spill + st.d $a2, $sp, 160 # 8-byte Folded Spill bge $a2, $a1, .LBB493_458 .LBB493_55: # =>This Loop Header: Depth=1 # Child Loop BB493_58 Depth 2 @@ -106070,33 +106053,33 @@ sqlite3WhereBegin: # @sqlite3WhereBegin # in Loop: Header=BB493_55 Depth=1 st.d $a7, $sp, 64 # 8-byte Folded Spill move $a5, $zero - st.d $zero, $sp, 120 # 8-byte Folded Spill + st.d $zero, $sp, 128 # 8-byte Folded Spill move $t1, $zero - st.d $zero, $sp, 160 # 8-byte Folded Spill + st.d $zero, $sp, 168 # 8-byte Folded Spill move $a7, $zero slli.d $a1, $a6, 6 alsl.d $a1, $a6, $a1, 3 - ld.d 
$a2, $sp, 128 # 8-byte Folded Reload + ld.d $a2, $sp, 136 # 8-byte Folded Reload add.d $t0, $a2, $a1 - ld.d $a1, $sp, 152 # 8-byte Folded Reload + ld.d $a1, $sp, 160 # 8-byte Folded Reload sltui $a1, $a1, 1 ld.d $a2, $sp, 48 # 8-byte Folded Reload and $a1, $a2, $a1 st.d $a1, $sp, 88 # 8-byte Folded Spill - st.d $a6, $sp, 176 # 8-byte Folded Spill + st.d $a6, $sp, 184 # 8-byte Folded Spill fmov.d $fs2, $fs0 - st.d $s7, $sp, 304 # 8-byte Folded Spill + st.d $s7, $sp, 312 # 8-byte Folded Spill b .LBB493_58 .p2align 4, , 16 .LBB493_57: # %.thread1561 # in Loop: Header=BB493_58 Depth=2 addi.w $a1, $a6, 0 - ld.d $a2, $sp, 176 # 8-byte Folded Reload + ld.d $a2, $sp, 184 # 8-byte Folded Reload xor $a1, $a2, $a1 sltui $a1, $a1, 1 add.w $a2, $a2, $a1 - st.d $a2, $sp, 176 # 8-byte Folded Spill - ld.d $s8, $sp, 320 # 8-byte Folded Reload + st.d $a2, $sp, 184 # 8-byte Folded Spill + ld.d $s8, $sp, 336 # 8-byte Folded Reload addi.d $a6, $a6, 1 ext.w.h $a1, $a0 addi.d $t0, $t0, 72 @@ -106184,15 +106167,15 @@ sqlite3WhereBegin: # @sqlite3WhereBegin .LBB493_66: # in Loop: Header=BB493_58 Depth=2 ld.d $s3, $t0, 24 ld.bu $a0, $s3, 105 - st.d $a6, $sp, 232 # 8-byte Folded Spill - st.d $a5, $sp, 224 # 8-byte Folded Spill - st.d $t1, $sp, 216 # 8-byte Folded Spill - st.d $a7, $sp, 208 # 8-byte Folded Spill - st.d $t0, $sp, 328 # 8-byte Folded Spill - st.d $t2, $sp, 200 # 8-byte Folded Spill + st.d $a6, $sp, 240 # 8-byte Folded Spill + st.d $a5, $sp, 232 # 8-byte Folded Spill + st.d $t1, $sp, 224 # 8-byte Folded Spill + st.d $a7, $sp, 216 # 8-byte Folded Spill + st.d $t0, $sp, 344 # 8-byte Folded Spill + st.d $t2, $sp, 208 # 8-byte Folded Spill beqz $a0, .LBB493_69 # %bb.67: # in Loop: Header=BB493_58 Depth=2 - ld.d $a0, $sp, 144 # 8-byte Folded Reload + ld.d $a0, $sp, 152 # 8-byte Folded Reload beqz $a0, .LBB493_71 # %bb.68: # in Loop: Header=BB493_58 Depth=2 ld.d $s4, $a0, 0 @@ -106202,7 +106185,7 @@ sqlite3WhereBegin: # @sqlite3WhereBegin ld.d $a0, $sp, 88 # 8-byte Folded Reload beqz $a0, .LBB493_83 # %bb.70: # in Loop: Header=BB493_58 Depth=2 - ld.d $a0, $sp, 144 # 8-byte Folded Reload + ld.d $a0, $sp, 152 # 8-byte Folded Reload ld.d $a0, $a0, 0 st.d $a0, $sp, 464 # 8-byte Folded Spill ld.d $s2, $s3, 32 @@ -106212,7 +106195,7 @@ sqlite3WhereBegin: # @sqlite3WhereBegin .LBB493_71: # in Loop: Header=BB493_58 Depth=2 move $s4, $zero .LBB493_72: # in Loop: Header=BB493_58 Depth=2 - ld.d $a1, $sp, 192 # 8-byte Folded Reload + ld.d $a1, $sp, 200 # 8-byte Folded Reload alsl.d $a0, $a6, $a6, 1 slli.d $a0, $a0, 5 add.d $s1, $a1, $a0 @@ -106222,7 +106205,7 @@ sqlite3WhereBegin: # @sqlite3WhereBegin # in Loop: Header=BB493_58 Depth=2 ld.d $s6, $s2, 32 ld.w $s0, $s2, 0 - ld.d $s8, $sp, 320 # 8-byte Folded Reload + ld.d $s8, $sp, 336 # 8-byte Folded Reload blez $s0, .LBB493_126 .LBB493_74: # %.lr.ph192.i # in Loop: Header=BB493_58 Depth=2 @@ -106294,7 +106277,7 @@ sqlite3WhereBegin: # @sqlite3WhereBegin bnez $a2, .LBB493_86 # %bb.89: # in Loop: Header=BB493_87 Depth=3 ld.hu $a2, $a0, 12 - ld.d $a3, $sp, 296 # 8-byte Folded Reload + ld.d $a3, $sp, 304 # 8-byte Folded Reload bne $a2, $a3, .LBB493_86 # %bb.90: # in Loop: Header=BB493_87 Depth=3 ld.hu $a2, $a0, 14 @@ -106324,9 +106307,9 @@ sqlite3WhereBegin: # @sqlite3WhereBegin # Parent Loop BB493_55 Depth=1 # Parent Loop BB493_58 Depth=2 # => This Inner Loop Header: Depth=3 - fmul.d $fa2, $fa2, $ft0 + fmul.d $fa2, $fa2, $fa7 fcmp.clt.d $fcc0, $fa2, $fa0 - fadd.d $fa1, $fa1, $ft1 + fadd.d $fa1, $fa1, $ft0 bcnez $fcc0, .LBB493_95 .LBB493_96: # %estLog.exit.i # in Loop: 
Header=BB493_58 Depth=2 @@ -106371,7 +106354,7 @@ sqlite3WhereBegin: # @sqlite3WhereBegin bnez $a4, .LBB493_101 # %bb.104: # in Loop: Header=BB493_102 Depth=3 ld.hu $a4, $a2, -20 - ld.d $a5, $sp, 296 # 8-byte Folded Reload + ld.d $a5, $sp, 304 # 8-byte Folded Reload bne $a4, $a5, .LBB493_101 # %bb.105: # in Loop: Header=BB493_102 Depth=3 ld.hu $a4, $a2, -18 @@ -106398,7 +106381,7 @@ sqlite3WhereBegin: # @sqlite3WhereBegin bnez $a3, .LBB493_107 # %bb.110: # in Loop: Header=BB493_108 Depth=3 ld.hu $a3, $a1, 12 - ld.d $a4, $sp, 296 # 8-byte Folded Reload + ld.d $a4, $sp, 304 # 8-byte Folded Reload bne $a3, $a4, .LBB493_107 # %bb.111: # in Loop: Header=BB493_108 Depth=3 ld.hu $a3, $a1, 14 @@ -106512,7 +106495,7 @@ sqlite3WhereBegin: # @sqlite3WhereBegin beqz $a0, .LBB493_129 # %bb.128: # in Loop: Header=BB493_58 Depth=2 ld.w $a1, $a0, -8 - ld.d $a3, $sp, 96 # 8-byte Folded Reload + ld.d $a3, $sp, 112 # 8-byte Folded Reload ld.d $a2, $a3, %pc_lo12(mem.5) addi.d $a0, $a0, -8 sub.d $a1, $a2, $a1 @@ -106523,11 +106506,11 @@ sqlite3WhereBegin: # @sqlite3WhereBegin # in Loop: Header=BB493_58 Depth=2 ld.w $fp, $s2, 16 st.w $zero, $s2, 40 - vld $vr0, $sp, 448 # 16-byte Folded Reload + vld $vr0, $sp, 96 # 16-byte Folded Reload vst $vr0, $s2, 48 ld.d $a0, $sp, 80 # 8-byte Folded Reload st.d $a0, $s2, 64 - ld.d $a0, $sp, 152 # 8-byte Folded Reload + ld.d $a0, $sp, 160 # 8-byte Folded Reload beqz $a0, .LBB493_132 # %bb.130: # %sqlite3_free.exit.i # in Loop: Header=BB493_58 Depth=2 @@ -106541,8 +106524,8 @@ sqlite3WhereBegin: # @sqlite3WhereBegin move $a1, $s2 jirl $ra, $a2, 0 ld.w $a1, $s2, 0 - vldi $vr8, -988 - vldi $vr9, -912 + vldi $vr7, -988 + vldi $vr8, -912 ori $t3, $zero, 48 blez $a1, .LBB493_138 # %bb.133: # %.lr.ph197.i @@ -106572,8 +106555,8 @@ sqlite3WhereBegin: # @sqlite3WhereBegin pcaddu18i $ra, %call36(sqlite3ErrorMsg) jirl $ra, $ra, 0 ori $t3, $zero, 48 - vldi $vr9, -912 - vldi $vr8, -988 + vldi $vr8, -912 + vldi $vr7, -988 movgr2fr.d $fa0, $zero b .LBB493_183 .p2align 4, , 16 @@ -106610,7 +106593,7 @@ sqlite3WhereBegin: # @sqlite3WhereBegin bne $s8, $a2, .LBB493_143 # %bb.145: # in Loop: Header=BB493_144 Depth=3 ld.hu $a2, $a0, -2 - ld.d $a3, $sp, 296 # 8-byte Folded Reload + ld.d $a3, $sp, 304 # 8-byte Folded Reload bne $a2, $a3, .LBB493_143 # %bb.146: # in Loop: Header=BB493_144 Depth=3 ld.hu $a2, $a0, 0 @@ -106627,8 +106610,8 @@ sqlite3WhereBegin: # @sqlite3WhereBegin pcaddu18i $ra, %call36(sqlite3ErrorMsg) jirl $ra, $ra, 0 ori $t3, $zero, 48 - vldi $vr9, -912 - vldi $vr8, -988 + vldi $vr8, -912 + vldi $vr7, -988 .LBB493_148: # in Loop: Header=BB493_58 Depth=2 fld.d $fa0, $s2, 64 st.w $fp, $s2, 16 @@ -106670,11 +106653,10 @@ sqlite3WhereBegin: # @sqlite3WhereBegin bne $a1, $a2, .LBB493_155 b .LBB493_442 .LBB493_157: # in Loop: Header=BB493_58 Depth=2 - vld $vr7, $sp, 448 # 16-byte Folded Reload ori $s5, $zero, 1 ori $t4, $zero, 110 - ld.d $t5, $sp, 368 # 8-byte Folded Reload - ld.d $t6, $sp, 352 # 8-byte Folded Reload + ld.d $t5, $sp, 384 # 8-byte Folded Reload + ld.d $t6, $sp, 368 # 8-byte Folded Reload b .LBB493_218 .LBB493_158: # in Loop: Header=BB493_58 Depth=2 move $s0, $zero @@ -106726,7 +106708,7 @@ sqlite3WhereBegin: # @sqlite3WhereBegin sltui $a1, $a1, 1 maskeqz $s5, $a0, $a1 .LBB493_170: # in Loop: Header=BB493_58 Depth=2 - ld.d $a0, $sp, 320 # 8-byte Folded Reload + ld.d $a0, $sp, 336 # 8-byte Folded Reload ld.d $s6, $a0, 0 slli.d $a0, $s0, 4 alsl.d $a0, $s0, $a0, 2 @@ -106773,7 +106755,7 @@ sqlite3WhereBegin: # @sqlite3WhereBegin # %bb.176: # %.lr.ph186.i # in Loop: 
Header=BB493_58 Depth=2 ld.d $a6, $sp, 496 - ld.d $a3, $sp, 328 # 8-byte Folded Reload + ld.d $a3, $sp, 344 # 8-byte Folded Reload ld.w $a3, $a3, 44 move $a4, $zero move $a5, $zero @@ -106809,26 +106791,26 @@ sqlite3WhereBegin: # @sqlite3WhereBegin # in Loop: Header=BB493_58 Depth=2 pcalau12i $a0, %pc_hi20(.L.str.206) addi.d $a1, $a0, %pc_lo12(.L.str.206) - ld.d $s8, $sp, 320 # 8-byte Folded Reload + ld.d $s8, $sp, 336 # 8-byte Folded Reload move $a0, $s8 pcaddu18i $ra, %call36(sqlite3ErrorMsg) jirl $ra, $ra, 0 movgr2fr.d $fa0, $zero ori $s5, $zero, 1 - vldi $vr8, -988 - vldi $vr9, -912 + vldi $vr7, -988 + vldi $vr8, -912 ori $t3, $zero, 48 .LBB493_183: # %bestVirtualIndex.exit # in Loop: Header=BB493_58 Depth=2 ld.d $a1, $s1, 88 lu12i.w $a3, 2048 move $a0, $a3 - ld.d $a6, $sp, 232 # 8-byte Folded Reload - ld.d $a5, $sp, 224 # 8-byte Folded Reload - ld.d $t1, $sp, 216 # 8-byte Folded Reload - ld.d $a7, $sp, 208 # 8-byte Folded Reload - ld.d $t0, $sp, 328 # 8-byte Folded Reload - ld.d $a4, $sp, 200 # 8-byte Folded Reload + ld.d $a6, $sp, 240 # 8-byte Folded Reload + ld.d $a5, $sp, 232 # 8-byte Folded Reload + ld.d $t1, $sp, 224 # 8-byte Folded Reload + ld.d $a7, $sp, 216 # 8-byte Folded Reload + ld.d $t0, $sp, 344 # 8-byte Folded Reload + ld.d $a4, $sp, 208 # 8-byte Folded Reload beqz $a1, .LBB493_185 # %bb.184: # in Loop: Header=BB493_58 Depth=2 ld.w $a0, $a1, 60 @@ -106847,19 +106829,19 @@ sqlite3WhereBegin: # @sqlite3WhereBegin fcmp.cule.d $fcc0, $fs2, $fs3 bcnez $fcc0, .LBB493_187 .LBB493_186: # in Loop: Header=BB493_58 Depth=2 - ld.d $a2, $sp, 184 # 8-byte Folded Reload + ld.d $a2, $sp, 192 # 8-byte Folded Reload st.d $a1, $a2, 80 ori $a7, $zero, 1 - st.d $a6, $sp, 160 # 8-byte Folded Spill + st.d $a6, $sp, 168 # 8-byte Folded Spill fmov.d $fs2, $fs3 move $t1, $a3 - st.d $a0, $sp, 120 # 8-byte Folded Spill + st.d $a0, $sp, 128 # 8-byte Folded Spill move $a5, $t2 .LBB493_187: # in Loop: Header=BB493_58 Depth=2 bnez $a4, .LBB493_449 # %bb.188: # %._crit_edge1887 # in Loop: Header=BB493_58 Depth=2 - ld.d $a0, $sp, 240 # 8-byte Folded Reload + ld.d $a0, $sp, 248 # 8-byte Folded Reload ld.hu $a0, $a0, 0 addi.d $a6, $a6, 1 ext.w.h $a1, $a0 @@ -106875,11 +106857,11 @@ sqlite3WhereBegin: # @sqlite3WhereBegin bne $s5, $s8, .LBB493_193 # %bb.191: # in Loop: Header=BB493_58 Depth=2 move $a2, $zero - ld.d $s8, $sp, 320 # 8-byte Folded Reload + ld.d $s8, $sp, 336 # 8-byte Folded Reload b .LBB493_196 .LBB493_192: # in Loop: Header=BB493_58 Depth=2 ori $s5, $zero, 1 - ld.d $s8, $sp, 320 # 8-byte Folded Reload + ld.d $s8, $sp, 336 # 8-byte Folded Reload bgtz $s0, .LBB493_74 b .LBB493_126 .LBB493_193: # %vector.ph2425 @@ -106911,7 +106893,7 @@ sqlite3WhereBegin: # @sqlite3WhereBegin bnez $a5, .LBB493_194 # %bb.195: # %middle.block2431 # in Loop: Header=BB493_58 Depth=2 - ld.d $s8, $sp, 320 # 8-byte Folded Reload + ld.d $s8, $sp, 336 # 8-byte Folded Reload beq $a2, $s5, .LBB493_198 .LBB493_196: # %scalar.ph2423.preheader # in Loop: Header=BB493_58 Depth=2 @@ -106974,7 +106956,7 @@ sqlite3WhereBegin: # @sqlite3WhereBegin bnez $a1, .LBB493_201 # %bb.204: # in Loop: Header=BB493_202 Depth=3 ld.hu $a1, $a0, -20 - ld.d $a2, $sp, 296 # 8-byte Folded Reload + ld.d $a2, $sp, 304 # 8-byte Folded Reload bne $a1, $a2, .LBB493_201 # %bb.205: # in Loop: Header=BB493_202 Depth=3 ld.hu $a1, $a0, -18 @@ -107023,42 +107005,40 @@ sqlite3WhereBegin: # @sqlite3WhereBegin move $a0, $fp pcaddu18i $ra, %call36(exprTableUsage) jirl $ra, $ra, 0 - vldi $vr9, -912 - vldi $vr8, -988 + vldi $vr8, -912 + vldi $vr7, -988 and $a1, 
$a0, $s5 move $a0, $s7 - ld.d $s7, $sp, 304 # 8-byte Folded Reload + ld.d $s7, $sp, 312 # 8-byte Folded Reload beqz $a1, .LBB493_211 .LBB493_213: # %.loopexit.i820 # in Loop: Header=BB493_58 Depth=2 vldi $vr0, -912 - fcmp.cule.d $fcc0, $fs4, $ft0 + fcmp.cule.d $fcc0, $fs4, $fa7 bcnez $fcc0, .LBB493_216 # %bb.214: # %.lr.ph.i293.i.preheader # in Loop: Header=BB493_58 Depth=2 vldi $vr1, -988 - vld $vr7, $sp, 448 # 16-byte Folded Reload ori $s5, $zero, 1 ori $t4, $zero, 110 - ld.d $t5, $sp, 368 # 8-byte Folded Reload - ld.d $t6, $sp, 352 # 8-byte Folded Reload + ld.d $t5, $sp, 384 # 8-byte Folded Reload + ld.d $t6, $sp, 368 # 8-byte Folded Reload ld.d $t1, $sp, 464 # 8-byte Folded Reload .p2align 4, , 16 .LBB493_215: # %.lr.ph.i293.i # Parent Loop BB493_55 Depth=1 # Parent Loop BB493_58 Depth=2 # => This Inner Loop Header: Depth=3 - fmul.d $fa1, $fa1, $ft0 + fmul.d $fa1, $fa1, $fa7 fcmp.clt.d $fcc0, $fa1, $fs4 - fadd.d $fa0, $fa0, $ft1 + fadd.d $fa0, $fa0, $ft0 bcnez $fcc0, .LBB493_215 b .LBB493_217 .LBB493_216: # in Loop: Header=BB493_58 Depth=2 - vld $vr7, $sp, 448 # 16-byte Folded Reload ori $s5, $zero, 1 ori $t4, $zero, 110 - ld.d $t5, $sp, 368 # 8-byte Folded Reload - ld.d $t6, $sp, 352 # 8-byte Folded Reload + ld.d $t5, $sp, 384 # 8-byte Folded Reload + ld.d $t6, $sp, 368 # 8-byte Folded Reload ld.d $t1, $sp, 464 # 8-byte Folded Reload .LBB493_217: # %estLog.exit296.i # in Loop: Header=BB493_58 Depth=2 @@ -107070,7 +107050,7 @@ sqlite3WhereBegin: # @sqlite3WhereBegin masknez $a1, $s0, $a0 maskeqz $a0, $s4, $a0 or $a0, $a0, $a1 - st.d $a0, $sp, 248 # 8-byte Folded Spill + st.d $a0, $sp, 256 # 8-byte Folded Spill andi $a0, $s1, 8 sltui $a0, $a0, 1 ori $a1, $zero, 3 @@ -107078,14 +107058,13 @@ sqlite3WhereBegin: # @sqlite3WhereBegin ori $a2, $zero, 131 maskeqz $a0, $a2, $a0 or $a0, $a0, $a1 - st.d $a0, $sp, 376 # 8-byte Folded Spill + st.d $a0, $sp, 392 # 8-byte Folded Spill beqz $s2, .LBB493_438 # %bb.219: # %.preheader380.lr.ph.i # in Loop: Header=BB493_58 Depth=2 - st.d $zero, $sp, 256 # 8-byte Folded Spill st.d $zero, $sp, 264 # 8-byte Folded Spill - st.d $s8, $sp, 312 # 8-byte Folded Spill - st.d $t1, $sp, 464 # 8-byte Folded Spill + st.d $zero, $sp, 272 # 8-byte Folded Spill + st.d $s8, $sp, 328 # 8-byte Folded Spill b .LBB493_221 .p2align 4, , 16 .LBB493_220: # %.thread372.i @@ -107151,7 +107130,7 @@ sqlite3WhereBegin: # @sqlite3WhereBegin beqz $s3, .LBB493_351 # %bb.224: # %.lr.ph69.i1412 # in Loop: Header=BB493_223 Depth=4 - st.d $s0, $sp, 384 # 8-byte Folded Spill + st.d $s0, $sp, 400 # 8-byte Folded Spill ld.d $a1, $s2, 16 slli.d $a2, $t8, 2 ldx.w $s4, $a1, $a2 @@ -107181,7 +107160,7 @@ sqlite3WhereBegin: # @sqlite3WhereBegin bne $s4, $a1, .LBB493_225 # %bb.229: # in Loop: Header=BB493_226 Depth=5 ld.hu $a1, $s1, 14 - ld.d $a2, $sp, 376 # 8-byte Folded Reload + ld.d $a2, $sp, 392 # 8-byte Folded Reload and $a2, $a1, $a2 beqz $a2, .LBB493_225 # %bb.230: # %findTerm.exit1429.thread1548 @@ -107192,11 +107171,11 @@ sqlite3WhereBegin: # @sqlite3WhereBegin .LBB493_231: # in Loop: Header=BB493_223 Depth=4 lu12i.w $a1, 1 or $s6, $s6, $a1 - ld.d $s0, $sp, 384 # 8-byte Folded Reload + ld.d $s0, $sp, 400 # 8-byte Folded Reload b .LBB493_350 .LBB493_232: # in Loop: Header=BB493_235 Depth=5 move $a2, $zero - ld.d $t5, $sp, 368 # 8-byte Folded Reload + ld.d $t5, $sp, 384 # 8-byte Folded Reload .LBB493_233: # %sqlite3StrICmp.exit.i # in Loop: Header=BB493_235 Depth=5 ld.bu $a1, $a1, 0 @@ -107236,7 +107215,7 @@ sqlite3WhereBegin: # @sqlite3WhereBegin bne $s4, $a1, .LBB493_234 # %bb.238: # in 
Loop: Header=BB493_235 Depth=5 ld.hu $a1, $s1, 14 - ld.d $a2, $sp, 376 # 8-byte Folded Reload + ld.d $a2, $sp, 392 # 8-byte Folded Reload and $a2, $a1, $a2 beqz $a2, .LBB493_234 # %bb.239: # in Loop: Header=BB493_235 Depth=5 @@ -107290,7 +107269,7 @@ sqlite3WhereBegin: # @sqlite3WhereBegin .LBB493_245: # in Loop: Header=BB493_247 Depth=6 bstrpick.d $t2, $t0, 23, 0 move $a3, $t1 - ld.d $t1, $sp, 336 # 8-byte Folded Reload + ld.d $t1, $sp, 352 # 8-byte Folded Reload beq $t2, $t1, .LBB493_278 .p2align 4, , 16 .LBB493_246: # in Loop: Header=BB493_247 Depth=6 @@ -107317,7 +107296,7 @@ sqlite3WhereBegin: # @sqlite3WhereBegin beq $t2, $t5, .LBB493_252 # %bb.249: # %.lr.ph.i.i.i1473 # in Loop: Header=BB493_247 Depth=6 - ld.d $t3, $sp, 424 # 8-byte Folded Reload + ld.d $t3, $sp, 440 # 8-byte Folded Reload beq $t2, $t3, .LBB493_246 b .LBB493_254 .p2align 4, , 16 @@ -107326,7 +107305,7 @@ sqlite3WhereBegin: # @sqlite3WhereBegin beq $t2, $a4, .LBB493_246 # %bb.251: # %.lr.ph.i.i.i1473 # in Loop: Header=BB493_247 Depth=6 - ld.d $t3, $sp, 416 # 8-byte Folded Reload + ld.d $t3, $sp, 432 # 8-byte Folded Reload beq $t2, $t3, .LBB493_246 b .LBB493_254 .LBB493_252: # in Loop: Header=BB493_247 Depth=6 @@ -107338,14 +107317,14 @@ sqlite3WhereBegin: # @sqlite3WhereBegin ori $t7, $zero, 101 beq $t3, $t7, .LBB493_246 .LBB493_254: # in Loop: Header=BB493_247 Depth=6 - ld.d $a3, $sp, 408 # 8-byte Folded Reload + ld.d $a3, $sp, 424 # 8-byte Folded Reload xor $a3, $t2, $a3 sltu $a3, $zero, $a3 - ld.d $t3, $sp, 400 # 8-byte Folded Reload + ld.d $t3, $sp, 416 # 8-byte Folded Reload xor $t3, $t2, $t3 sltu $t3, $zero, $t3 and $a3, $a3, $t3 - ld.d $t3, $sp, 392 # 8-byte Folded Reload + ld.d $t3, $sp, 408 # 8-byte Folded Reload xor $t2, $t2, $t3 sltu $t2, $zero, $t2 and $a3, $t2, $a3 @@ -107399,7 +107378,7 @@ sqlite3WhereBegin: # @sqlite3WhereBegin .LBB493_263: # in Loop: Header=BB493_265 Depth=6 bstrpick.d $t2, $a7, 23, 0 move $t0, $t1 - ld.d $t1, $sp, 336 # 8-byte Folded Reload + ld.d $t1, $sp, 352 # 8-byte Folded Reload beq $t2, $t1, .LBB493_305 .p2align 4, , 16 .LBB493_264: # in Loop: Header=BB493_265 Depth=6 @@ -107426,7 +107405,7 @@ sqlite3WhereBegin: # @sqlite3WhereBegin beq $t2, $t5, .LBB493_270 # %bb.267: # %.lr.ph.i.i.i1508 # in Loop: Header=BB493_265 Depth=6 - ld.d $t3, $sp, 424 # 8-byte Folded Reload + ld.d $t3, $sp, 440 # 8-byte Folded Reload beq $t2, $t3, .LBB493_264 b .LBB493_272 .p2align 4, , 16 @@ -107435,7 +107414,7 @@ sqlite3WhereBegin: # @sqlite3WhereBegin beq $t2, $a4, .LBB493_264 # %bb.269: # %.lr.ph.i.i.i1508 # in Loop: Header=BB493_265 Depth=6 - ld.d $t3, $sp, 416 # 8-byte Folded Reload + ld.d $t3, $sp, 432 # 8-byte Folded Reload beq $t2, $t3, .LBB493_264 b .LBB493_272 .LBB493_270: # in Loop: Header=BB493_265 Depth=6 @@ -107447,14 +107426,14 @@ sqlite3WhereBegin: # @sqlite3WhereBegin ori $t7, $zero, 101 beq $t3, $t7, .LBB493_264 .LBB493_272: # in Loop: Header=BB493_265 Depth=6 - ld.d $t0, $sp, 408 # 8-byte Folded Reload + ld.d $t0, $sp, 424 # 8-byte Folded Reload xor $t0, $t2, $t0 sltu $t0, $zero, $t0 - ld.d $t3, $sp, 400 # 8-byte Folded Reload + ld.d $t3, $sp, 416 # 8-byte Folded Reload xor $t3, $t2, $t3 sltu $t3, $zero, $t3 and $t0, $t0, $t3 - ld.d $t3, $sp, 392 # 8-byte Folded Reload + ld.d $t3, $sp, 408 # 8-byte Folded Reload xor $t2, $t2, $t3 sltu $t2, $zero, $t2 and $t0, $t2, $t0 @@ -107522,7 +107501,7 @@ sqlite3WhereBegin: # @sqlite3WhereBegin .LBB493_285: # in Loop: Header=BB493_287 Depth=6 bstrpick.d $t2, $a7, 23, 0 move $t0, $t1 - ld.d $t1, $sp, 336 # 8-byte Folded Reload + ld.d $t1, $sp, 
352 # 8-byte Folded Reload beq $t2, $t1, .LBB493_306 .p2align 4, , 16 .LBB493_286: # in Loop: Header=BB493_287 Depth=6 @@ -107549,7 +107528,7 @@ sqlite3WhereBegin: # @sqlite3WhereBegin beq $t2, $t5, .LBB493_292 # %bb.289: # %.lr.ph.i.i.i1491 # in Loop: Header=BB493_287 Depth=6 - ld.d $t3, $sp, 424 # 8-byte Folded Reload + ld.d $t3, $sp, 440 # 8-byte Folded Reload beq $t2, $t3, .LBB493_286 b .LBB493_294 .p2align 4, , 16 @@ -107558,7 +107537,7 @@ sqlite3WhereBegin: # @sqlite3WhereBegin beq $t2, $a4, .LBB493_286 # %bb.291: # %.lr.ph.i.i.i1491 # in Loop: Header=BB493_287 Depth=6 - ld.d $t3, $sp, 416 # 8-byte Folded Reload + ld.d $t3, $sp, 432 # 8-byte Folded Reload beq $t2, $t3, .LBB493_286 b .LBB493_294 .LBB493_292: # in Loop: Header=BB493_287 Depth=6 @@ -107570,14 +107549,14 @@ sqlite3WhereBegin: # @sqlite3WhereBegin ori $t7, $zero, 101 beq $t3, $t7, .LBB493_286 .LBB493_294: # in Loop: Header=BB493_287 Depth=6 - ld.d $t0, $sp, 408 # 8-byte Folded Reload + ld.d $t0, $sp, 424 # 8-byte Folded Reload xor $t0, $t2, $t0 sltu $t0, $zero, $t0 - ld.d $t3, $sp, 400 # 8-byte Folded Reload + ld.d $t3, $sp, 416 # 8-byte Folded Reload xor $t3, $t2, $t3 sltu $t3, $zero, $t3 and $t0, $t0, $t3 - ld.d $t3, $sp, 392 # 8-byte Folded Reload + ld.d $t3, $sp, 408 # 8-byte Folded Reload xor $t2, $t2, $t3 sltu $t2, $zero, $t2 and $t0, $t2, $t0 @@ -107693,23 +107672,22 @@ sqlite3WhereBegin: # @sqlite3WhereBegin ld.d $a2, $fp, 0 ld.d $a0, $ra, 0 move $a1, $fp - st.d $a2, $sp, 288 # 8-byte Folded Spill - vst $vr2, $sp, 432 # 16-byte Folded Spill - st.d $t8, $sp, 360 # 8-byte Folded Spill - st.d $ra, $sp, 344 # 8-byte Folded Spill + st.d $a2, $sp, 320 # 8-byte Folded Spill + vst $vr2, $sp, 448 # 16-byte Folded Spill + st.d $t8, $sp, 376 # 8-byte Folded Spill + st.d $ra, $sp, 360 # 8-byte Folded Spill pcaddu18i $ra, %call36(sqlite3GetCollSeq) jirl $ra, $ra, 0 - ld.d $ra, $sp, 344 # 8-byte Folded Reload - ld.d $t8, $sp, 360 # 8-byte Folded Reload - vld $vr2, $sp, 432 # 16-byte Folded Reload - ld.d $t6, $sp, 352 # 8-byte Folded Reload - vldi $vr9, -912 - vldi $vr8, -988 + ld.d $ra, $sp, 360 # 8-byte Folded Reload + ld.d $t8, $sp, 376 # 8-byte Folded Reload + vld $vr2, $sp, 448 # 16-byte Folded Reload + ld.d $t6, $sp, 368 # 8-byte Folded Reload + vldi $vr8, -912 + vldi $vr7, -988 ori $t4, $zero, 110 - vld $vr7, $sp, 448 # 16-byte Folded Reload bnez $a0, .LBB493_328 # %bb.318: # in Loop: Header=BB493_235 Depth=5 - ld.d $a2, $sp, 288 # 8-byte Folded Reload + ld.d $a2, $sp, 320 # 8-byte Folded Reload ld.w $a0, $ra, 80 bnez $a0, .LBB493_320 # %bb.319: # in Loop: Header=BB493_235 Depth=5 @@ -107718,14 +107696,13 @@ sqlite3WhereBegin: # @sqlite3WhereBegin move $a0, $ra pcaddu18i $ra, %call36(sqlite3ErrorMsg) jirl $ra, $ra, 0 - ld.d $ra, $sp, 344 # 8-byte Folded Reload - ld.d $t8, $sp, 360 # 8-byte Folded Reload - vld $vr2, $sp, 432 # 16-byte Folded Reload - ld.d $t6, $sp, 352 # 8-byte Folded Reload - vldi $vr9, -912 - vldi $vr8, -988 + ld.d $ra, $sp, 360 # 8-byte Folded Reload + ld.d $t8, $sp, 376 # 8-byte Folded Reload + vld $vr2, $sp, 448 # 16-byte Folded Reload + ld.d $t6, $sp, 368 # 8-byte Folded Reload + vldi $vr8, -912 + vldi $vr7, -988 ori $t4, $zero, 110 - vld $vr7, $sp, 448 # 16-byte Folded Reload ld.w $a0, $ra, 80 .LBB493_320: # %sqlite3CheckCollSeq.exit.i.i # in Loop: Header=BB493_235 Depth=5 @@ -107805,7 +107782,7 @@ sqlite3WhereBegin: # @sqlite3WhereBegin # %bb.335: # %.lr.ph.i.i1416.preheader # in Loop: Header=BB493_235 Depth=5 addi.d $a3, $a3, 1 - ld.d $t5, $sp, 368 # 8-byte Folded Reload + ld.d $t5, $sp, 384 # 
8-byte Folded Reload .p2align 4, , 16 .LBB493_336: # %.lr.ph.i.i1416 # Parent Loop BB493_55 Depth=1 @@ -107833,19 +107810,18 @@ sqlite3WhereBegin: # @sqlite3WhereBegin ld.d $a0, $ra, 0 move $a1, $fp move $a2, $s0 - vst $vr2, $sp, 432 # 16-byte Folded Spill - st.d $t8, $sp, 360 # 8-byte Folded Spill - st.d $ra, $sp, 344 # 8-byte Folded Spill + vst $vr2, $sp, 448 # 16-byte Folded Spill + st.d $t8, $sp, 376 # 8-byte Folded Spill + st.d $ra, $sp, 360 # 8-byte Folded Spill pcaddu18i $ra, %call36(sqlite3GetCollSeq) jirl $ra, $ra, 0 - ld.d $ra, $sp, 344 # 8-byte Folded Reload - ld.d $t8, $sp, 360 # 8-byte Folded Reload - vld $vr2, $sp, 432 # 16-byte Folded Reload - ld.d $t6, $sp, 352 # 8-byte Folded Reload - vldi $vr9, -912 - vldi $vr8, -988 + ld.d $ra, $sp, 360 # 8-byte Folded Reload + ld.d $t8, $sp, 376 # 8-byte Folded Reload + vld $vr2, $sp, 448 # 16-byte Folded Reload + ld.d $t6, $sp, 368 # 8-byte Folded Reload + vldi $vr8, -912 + vldi $vr7, -988 ori $t4, $zero, 110 - vld $vr7, $sp, 448 # 16-byte Folded Reload bnez $a0, .LBB493_328 # %bb.341: # in Loop: Header=BB493_235 Depth=5 ld.w $a0, $ra, 80 @@ -107857,14 +107833,13 @@ sqlite3WhereBegin: # @sqlite3WhereBegin move $a2, $s0 pcaddu18i $ra, %call36(sqlite3ErrorMsg) jirl $ra, $ra, 0 - ld.d $ra, $sp, 344 # 8-byte Folded Reload - ld.d $t8, $sp, 360 # 8-byte Folded Reload - vld $vr2, $sp, 432 # 16-byte Folded Reload - ld.d $t6, $sp, 352 # 8-byte Folded Reload - vldi $vr9, -912 - vldi $vr8, -988 + ld.d $ra, $sp, 360 # 8-byte Folded Reload + ld.d $t8, $sp, 376 # 8-byte Folded Reload + vld $vr2, $sp, 448 # 16-byte Folded Reload + ld.d $t6, $sp, 368 # 8-byte Folded Reload + vldi $vr8, -912 + vldi $vr7, -988 ori $t4, $zero, 110 - vld $vr7, $sp, 448 # 16-byte Folded Reload ld.w $a0, $ra, 80 .LBB493_343: # %sqlite3CheckCollSeq.exit.i26.i # in Loop: Header=BB493_235 Depth=5 @@ -107876,7 +107851,7 @@ sqlite3WhereBegin: # @sqlite3WhereBegin lu12i.w $a1, 1 or $s6, $s6, $a1 ld.d $t1, $sp, 464 # 8-byte Folded Reload - ld.d $s0, $sp, 384 # 8-byte Folded Reload + ld.d $s0, $sp, 400 # 8-byte Folded Reload b .LBB493_350 .LBB493_345: # %sqlite3StrICmp.exit.i.findTerm.exit1429.thread1548.loopexit1789_crit_edge # in Loop: Header=BB493_223 Depth=4 @@ -107893,11 +107868,11 @@ sqlite3WhereBegin: # @sqlite3WhereBegin # %bb.347: # in Loop: Header=BB493_223 Depth=4 vldi $vr0, -967 fmul.d $fa2, $fa2, $fa0 - ld.d $s0, $sp, 384 # 8-byte Folded Reload + ld.d $s0, $sp, 400 # 8-byte Folded Reload b .LBB493_350 .LBB493_348: # in Loop: Header=BB493_223 Depth=4 ld.d $a1, $a1, 32 - ld.d $s0, $sp, 384 # 8-byte Folded Reload + ld.d $s0, $sp, 400 # 8-byte Folded Reload beqz $a1, .LBB493_350 # %bb.349: # in Loop: Header=BB493_223 Depth=4 ld.w $a1, $a1, 0 @@ -107928,7 +107903,7 @@ sqlite3WhereBegin: # @sqlite3WhereBegin slli.d $a1, $s4, 2 ldx.wu $a2, $a2, $a1 vldi $vr0, -912 - fcmp.cule.d $fcc0, $fa2, $ft0 + fcmp.cule.d $fcc0, $fa2, $fa7 bcnez $fcc0, .LBB493_356 # %bb.354: # %.lr.ph.i298.i.preheader # in Loop: Header=BB493_221 Depth=3 @@ -107939,9 +107914,9 @@ sqlite3WhereBegin: # @sqlite3WhereBegin # Parent Loop BB493_58 Depth=2 # Parent Loop BB493_221 Depth=3 # => This Inner Loop Header: Depth=4 - fmul.d $fa1, $fa1, $ft0 + fmul.d $fa1, $fa1, $fa7 fcmp.clt.d $fcc0, $fa1, $fa2 - fadd.d $fa0, $fa0, $ft1 + fadd.d $fa0, $fa0, $ft0 bcnez $fcc0, .LBB493_355 .LBB493_356: # %estLog.exit301.i # in Loop: Header=BB493_221 Depth=3 @@ -108016,39 +107991,38 @@ sqlite3WhereBegin: # @sqlite3WhereBegin vldi $vr0, -1016 fdiv.d $fs4, $fs4, $fa0 .LBB493_363: # in Loop: Header=BB493_221 Depth=3 - vld $vr7, 
$sp, 448 # 16-byte Folded Reload ori $t4, $zero, 110 - ld.d $t5, $sp, 368 # 8-byte Folded Reload - vldi $vr8, -988 - vldi $vr9, -912 - ld.d $t6, $sp, 352 # 8-byte Folded Reload + ld.d $t5, $sp, 384 # 8-byte Folded Reload + vldi $vr7, -988 + vldi $vr8, -912 + ld.d $t6, $sp, 368 # 8-byte Folded Reload move $t1, $s3 .LBB493_364: # in Loop: Header=BB493_221 Depth=3 beqz $t1, .LBB493_424 # %bb.365: # in Loop: Header=BB493_221 Depth=3 - st.d $s0, $sp, 384 # 8-byte Folded Spill - st.d $s1, $sp, 344 # 8-byte Folded Spill + st.d $s0, $sp, 400 # 8-byte Folded Spill + st.d $s1, $sp, 360 # 8-byte Folded Spill slli.d $a0, $s1, 49 bltz $a0, .LBB493_419 # %bb.366: # in Loop: Header=BB493_221 Depth=3 ld.w $a0, $t1, 0 ld.d $a7, $sp, 480 - st.d $a7, $sp, 432 # 8-byte Folded Spill - st.d $a0, $sp, 360 # 8-byte Folded Spill + st.d $a7, $sp, 448 # 8-byte Folded Spill + st.d $a0, $sp, 376 # 8-byte Folded Spill blez $a0, .LBB493_405 # %bb.367: # %.lr.ph.i302.i # in Loop: Header=BB493_221 Depth=3 ld.d $t2, $t1, 16 - ld.d $a0, $sp, 320 # 8-byte Folded Reload + ld.d $a0, $sp, 336 # 8-byte Folded Reload ld.d $t3, $a0, 0 ld.w $s1, $s2, 8 move $s7, $zero move $s0, $zero move $s3, $zero - st.d $zero, $sp, 288 # 8-byte Folded Spill + st.d $zero, $sp, 296 # 8-byte Folded Spill addi.d $a0, $a7, 4 - st.d $a0, $sp, 168 # 8-byte Folded Spill - st.d $t3, $sp, 280 # 8-byte Folded Spill + st.d $a0, $sp, 176 # 8-byte Folded Spill + st.d $t3, $sp, 288 # 8-byte Folded Spill .p2align 4, , 16 .LBB493_368: # Parent Loop BB493_55 Depth=1 # Parent Loop BB493_58 Depth=2 @@ -108097,21 +108071,21 @@ sqlite3WhereBegin: # @sqlite3WhereBegin .LBB493_375: # in Loop: Header=BB493_368 Depth=4 beqz $fp, .LBB493_382 # %bb.376: # in Loop: Header=BB493_368 Depth=4 - ld.d $s8, $fp, 0 - ld.d $a0, $sp, 320 # 8-byte Folded Reload + ld.d $s1, $fp, 0 + ld.d $a0, $sp, 336 # 8-byte Folded Reload ld.d $a0, $a0, 0 move $a1, $fp - move $a2, $s8 - move $s1, $t2 + move $a2, $s1 + move $s8, $t1 + st.d $t2, $sp, 320 # 8-byte Folded Spill pcaddu18i $ra, %call36(sqlite3GetCollSeq) jirl $ra, $ra, 0 - ld.d $t3, $sp, 280 # 8-byte Folded Reload - move $t2, $s1 - ld.d $t1, $sp, 464 # 8-byte Folded Reload - ld.d $a7, $sp, 432 # 8-byte Folded Reload - vldi $vr9, -912 - vldi $vr8, -988 - vld $vr7, $sp, 448 # 16-byte Folded Reload + ld.d $t3, $sp, 288 # 8-byte Folded Reload + ld.d $t2, $sp, 320 # 8-byte Folded Reload + move $t1, $s8 + ld.d $a7, $sp, 448 # 8-byte Folded Reload + vldi $vr8, -912 + vldi $vr7, -988 beqz $a0, .LBB493_379 .p2align 4, , 16 # %bb.377: # %sqlite3ExprCollSeq.exit.i.i @@ -108134,28 +108108,27 @@ sqlite3WhereBegin: # @sqlite3WhereBegin maskeqz $a2, $a3, $a2 or $a0, $a2, $a0 alsl.d $a2, $s7, $a4, 3 - ld.d $s8, $sp, 312 # 8-byte Folded Reload + ld.d $s8, $sp, 328 # 8-byte Folded Reload ld.w $a3, $s6, 76 beq $a3, $a0, .LBB493_384 b .LBB493_393 .LBB493_379: # in Loop: Header=BB493_368 Depth=4 - ld.d $fp, $sp, 320 # 8-byte Folded Reload + ld.d $fp, $sp, 336 # 8-byte Folded Reload ld.w $a0, $fp, 80 bnez $a0, .LBB493_381 # %bb.380: # in Loop: Header=BB493_368 Depth=4 pcalau12i $a0, %pc_hi20(.L.str.343) addi.d $a1, $a0, %pc_lo12(.L.str.343) move $a0, $fp - move $a2, $s8 + move $a2, $s1 pcaddu18i $ra, %call36(sqlite3ErrorMsg) jirl $ra, $ra, 0 - ld.d $t3, $sp, 280 # 8-byte Folded Reload - move $t2, $s1 - ld.d $t1, $sp, 464 # 8-byte Folded Reload - ld.d $a7, $sp, 432 # 8-byte Folded Reload - vldi $vr9, -912 - vldi $vr8, -988 - vld $vr7, $sp, 448 # 16-byte Folded Reload + ld.d $t3, $sp, 288 # 8-byte Folded Reload + ld.d $t2, $sp, 320 # 8-byte Folded Reload + move 
$t1, $s8 + ld.d $a7, $sp, 448 # 8-byte Folded Reload + vldi $vr8, -912 + vldi $vr7, -988 ld.w $a0, $fp, 80 .LBB493_381: # %sqlite3CheckCollSeq.exit.i.i.i # in Loop: Header=BB493_368 Depth=4 @@ -108171,7 +108144,7 @@ sqlite3WhereBegin: # @sqlite3WhereBegin move $a1, $zero addi.w $a0, $zero, -1 move $a2, $fp - ld.d $s8, $sp, 312 # 8-byte Folded Reload + ld.d $s8, $sp, 328 # 8-byte Folded Reload ld.w $a3, $s6, 76 bne $a3, $a0, .LBB493_393 .LBB493_384: # in Loop: Header=BB493_368 Depth=4 @@ -108211,7 +108184,7 @@ sqlite3WhereBegin: # @sqlite3WhereBegin xor $a1, $a1, $a2 bgeu $s4, $s7, .LBB493_395 # %bb.391: # in Loop: Header=BB493_368 Depth=4 - ld.d $a2, $sp, 288 # 8-byte Folded Reload + ld.d $a2, $sp, 296 # 8-byte Folded Reload bne $a1, $a2, .LBB493_419 # %bb.392: # in Loop: Header=BB493_368 Depth=4 addi.w $s3, $s3, 1 @@ -108225,23 +108198,23 @@ sqlite3WhereBegin: # @sqlite3WhereBegin # in Loop: Header=BB493_368 Depth=4 addi.d $s7, $s7, 1 addi.d $s0, $s0, 1 - ld.d $a0, $sp, 360 # 8-byte Folded Reload + ld.d $a0, $sp, 376 # 8-byte Folded Reload blt $s3, $a0, .LBB493_368 b .LBB493_407 .LBB493_395: # in Loop: Header=BB493_368 Depth=4 - st.d $a1, $sp, 288 # 8-byte Folded Spill + st.d $a1, $sp, 296 # 8-byte Folded Spill addi.w $s3, $s3, 1 addi.d $t2, $t2, 24 bgez $a0, .LBB493_394 .LBB493_396: # in Loop: Header=BB493_368 Depth=4 ld.w $a0, $a7, 0 addi.w $fp, $zero, -1 - st.d $t2, $sp, 104 # 8-byte Folded Spill + st.d $t2, $sp, 320 # 8-byte Folded Spill blez $a0, .LBB493_401 # %bb.397: # %.lr.ph.i.i.i311.i # in Loop: Header=BB493_368 Depth=4 move $a1, $zero - ld.d $a2, $sp, 168 # 8-byte Folded Reload + ld.d $a2, $sp, 176 # 8-byte Folded Reload .p2align 4, , 16 .LBB493_398: # Parent Loop BB493_55 Depth=1 # Parent Loop BB493_58 Depth=2 @@ -108264,7 +108237,7 @@ sqlite3WhereBegin: # @sqlite3WhereBegin ld.w $a0, $t1, 0 bstrpick.d $a1, $s3, 31, 0 slt $a2, $s3, $a0 - st.d $s3, $sp, 112 # 8-byte Folded Spill + st.d $s3, $sp, 120 # 8-byte Folded Spill masknez $a3, $s3, $a2 maskeqz $a0, $a0, $a2 or $a2, $a0, $a3 @@ -108288,23 +108261,22 @@ sqlite3WhereBegin: # @sqlite3WhereBegin move $a0, $a7 pcaddu18i $ra, %call36(exprTableUsage) jirl $ra, $ra, 0 - ld.d $a7, $sp, 432 # 8-byte Folded Reload - vldi $vr9, -912 - vldi $vr8, -988 - vld $vr7, $sp, 448 # 16-byte Folded Reload + ld.d $a7, $sp, 448 # 8-byte Folded Reload + vldi $vr8, -912 + vldi $vr7, -988 and $a1, $a0, $fp beqz $a1, .LBB493_402 # %bb.404: # in Loop: Header=BB493_368 Depth=4 ld.d $t1, $sp, 464 # 8-byte Folded Reload - ld.d $s3, $sp, 112 # 8-byte Folded Reload - ld.d $t2, $sp, 104 # 8-byte Folded Reload - ld.d $t3, $sp, 280 # 8-byte Folded Reload + ld.d $s3, $sp, 120 # 8-byte Folded Reload + ld.d $t2, $sp, 320 # 8-byte Folded Reload + ld.d $t3, $sp, 288 # 8-byte Folded Reload b .LBB493_394 .LBB493_405: # in Loop: Header=BB493_221 Depth=3 move $s3, $zero move $s0, $zero ori $fp, $zero, 1 - ld.d $a0, $sp, 360 # 8-byte Folded Reload + ld.d $a0, $sp, 376 # 8-byte Folded Reload blt $s3, $a0, .LBB493_409 b .LBB493_429 .LBB493_406: # %.critedge.loopexit.loopexit.split.loop.exit185.i.i @@ -108312,19 +108284,19 @@ sqlite3WhereBegin: # @sqlite3WhereBegin move $s0, $s7 .LBB493_407: # %.critedge.loopexit.i.i # in Loop: Header=BB493_221 Depth=3 - ld.d $a0, $sp, 288 # 8-byte Folded Reload + ld.d $a0, $sp, 296 # 8-byte Folded Reload sltui $fp, $a0, 1 - ld.d $a0, $sp, 360 # 8-byte Folded Reload + ld.d $a0, $sp, 376 # 8-byte Folded Reload bge $s3, $a0, .LBB493_429 b .LBB493_409 .LBB493_408: # %select.unfold.thread.i.i # in Loop: Header=BB493_221 Depth=3 addi.d $s0, 
$s0, 1 - ld.d $s3, $sp, 360 # 8-byte Folded Reload + ld.d $s3, $sp, 376 # 8-byte Folded Reload ld.d $t1, $sp, 464 # 8-byte Folded Reload - ld.d $a0, $sp, 288 # 8-byte Folded Reload + ld.d $a0, $sp, 296 # 8-byte Folded Reload sltui $fp, $a0, 1 - ld.d $a0, $sp, 360 # 8-byte Folded Reload + ld.d $a0, $sp, 376 # 8-byte Folded Reload bge $s3, $a0, .LBB493_429 .LBB493_409: # in Loop: Header=BB493_221 Depth=3 ld.bu $a0, $s2, 44 @@ -108383,10 +108355,9 @@ sqlite3WhereBegin: # @sqlite3WhereBegin move $a0, $a7 pcaddu18i $ra, %call36(exprTableUsage) jirl $ra, $ra, 0 - ld.d $a7, $sp, 432 # 8-byte Folded Reload - vldi $vr9, -912 - vldi $vr8, -988 - vld $vr7, $sp, 448 # 16-byte Folded Reload + ld.d $a7, $sp, 448 # 8-byte Folded Reload + vldi $vr8, -912 + vldi $vr7, -988 and $a1, $a0, $s0 move $a0, $s3 ld.d $t1, $sp, 464 # 8-byte Folded Reload @@ -108395,42 +108366,42 @@ sqlite3WhereBegin: # @sqlite3WhereBegin .LBB493_419: # %isSortingIndex.exit.thread.i # in Loop: Header=BB493_221 Depth=3 vldi $vr0, -912 - fcmp.cule.d $fcc0, $fs4, $ft0 + fcmp.cule.d $fcc0, $fs4, $fa7 bcnez $fcc0, .LBB493_422 # %bb.420: # %.lr.ph.i317.i.preheader # in Loop: Header=BB493_221 Depth=3 vldi $vr1, -988 ori $t4, $zero, 110 - ld.d $t5, $sp, 368 # 8-byte Folded Reload - ld.d $t6, $sp, 352 # 8-byte Folded Reload - ld.d $s0, $sp, 384 # 8-byte Folded Reload + ld.d $t5, $sp, 384 # 8-byte Folded Reload + ld.d $t6, $sp, 368 # 8-byte Folded Reload + ld.d $s0, $sp, 400 # 8-byte Folded Reload .p2align 4, , 16 .LBB493_421: # %.lr.ph.i317.i # Parent Loop BB493_55 Depth=1 # Parent Loop BB493_58 Depth=2 # Parent Loop BB493_221 Depth=3 # => This Inner Loop Header: Depth=4 - fmul.d $fa1, $fa1, $ft0 + fmul.d $fa1, $fa1, $fa7 fcmp.clt.d $fcc0, $fa1, $fs4 - fadd.d $fa0, $fa0, $ft1 + fadd.d $fa0, $fa0, $ft0 bcnez $fcc0, .LBB493_421 b .LBB493_423 .p2align 4, , 16 .LBB493_422: # in Loop: Header=BB493_221 Depth=3 ori $t4, $zero, 110 - ld.d $t5, $sp, 368 # 8-byte Folded Reload - ld.d $t6, $sp, 352 # 8-byte Folded Reload - ld.d $s0, $sp, 384 # 8-byte Folded Reload + ld.d $t5, $sp, 384 # 8-byte Folded Reload + ld.d $t6, $sp, 368 # 8-byte Folded Reload + ld.d $s0, $sp, 400 # 8-byte Folded Reload .LBB493_423: # %estLog.exit320.i # in Loop: Header=BB493_221 Depth=3 fmadd.d $fs4, $fs4, $fa0, $fs4 - ld.d $s7, $sp, 304 # 8-byte Folded Reload - ld.d $s1, $sp, 344 # 8-byte Folded Reload + ld.d $s7, $sp, 312 # 8-byte Folded Reload + ld.d $s1, $sp, 360 # 8-byte Folded Reload .LBB493_424: # in Loop: Header=BB493_221 Depth=3 ori $s5, $zero, 1 beqz $s1, .LBB493_220 # %bb.425: # in Loop: Header=BB493_221 Depth=3 - ld.d $a0, $sp, 328 # 8-byte Folded Reload + ld.d $a0, $sp, 344 # 8-byte Folded Reload ld.d $a2, $a0, 64 bltz $a2, .LBB493_436 .LBB493_426: # %.preheader.i824 @@ -108447,7 +108418,7 @@ sqlite3WhereBegin: # @sqlite3WhereBegin b .LBB493_433 .LBB493_429: # %.thread363.i # in Loop: Header=BB493_221 Depth=3 - ld.d $a2, $sp, 344 # 8-byte Folded Reload + ld.d $a2, $sp, 360 # 8-byte Folded Reload sltui $a0, $a2, 1 lu12i.w $a1, 256 or $a1, $a2, $a1 @@ -108460,13 +108431,13 @@ sqlite3WhereBegin: # @sqlite3WhereBegin maskeqz $a0, $a0, $fp masknez $a1, $a1, $fp or $s1, $a0, $a1 - ld.d $s7, $sp, 304 # 8-byte Folded Reload + ld.d $s7, $sp, 312 # 8-byte Folded Reload ori $s5, $zero, 1 ori $t4, $zero, 110 - ld.d $t5, $sp, 368 # 8-byte Folded Reload - ld.d $t6, $sp, 352 # 8-byte Folded Reload - ld.d $s0, $sp, 384 # 8-byte Folded Reload - ld.d $a0, $sp, 328 # 8-byte Folded Reload + ld.d $t5, $sp, 384 # 8-byte Folded Reload + ld.d $t6, $sp, 368 # 8-byte Folded Reload + ld.d 
$s0, $sp, 400 # 8-byte Folded Reload + ld.d $a0, $sp, 344 # 8-byte Folded Reload ld.d $a2, $a0, 64 bgez $a2, .LBB493_426 b .LBB493_436 @@ -108492,15 +108463,11 @@ sqlite3WhereBegin: # @sqlite3WhereBegin vinsgr2vr.d $vr4, $a6, 0 vrepli.w $vr5, 63 vslt.wu $vr6, $vr3, $vr5 - vshuf4i.w $vr6, $vr6, 16 - vslli.d $vr6, $vr6, 32 - vsrai.d $vr6, $vr6, 32 + vsllwil.d.w $vr6, $vr6, 0 vslt.wu $vr5, $vr4, $vr5 - vshuf4i.w $vr5, $vr5, 16 - vslli.d $vr5, $vr5, 32 - vsrai.d $vr5, $vr5, 32 - vilvl.w $vr3, $vr7, $vr3 - vilvl.w $vr4, $vr7, $vr4 + vsllwil.d.w $vr5, $vr5, 0 + vsllwil.du.wu $vr3, $vr3, 0 + vsllwil.du.wu $vr4, $vr4, 0 vbitrev.d $vr3, $vr0, $vr3 vbitrev.d $vr4, $vr0, $vr4 vbitsel.v $vr3, $vr0, $vr3, $vr6 @@ -108555,30 +108522,30 @@ sqlite3WhereBegin: # @sqlite3WhereBegin fcmp.cule.d $fcc0, $fs3, $fs4 bcnez $fcc0, .LBB493_220 # %bb.437: # in Loop: Header=BB493_221 Depth=3 - st.d $s0, $sp, 264 # 8-byte Folded Spill - st.d $s1, $sp, 248 # 8-byte Folded Spill + st.d $s0, $sp, 272 # 8-byte Folded Spill + st.d $s1, $sp, 256 # 8-byte Folded Spill fmov.d $fs3, $fs4 - st.d $s2, $sp, 256 # 8-byte Folded Spill + st.d $s2, $sp, 264 # 8-byte Folded Spill b .LBB493_220 .LBB493_438: # in Loop: Header=BB493_58 Depth=2 + st.d $zero, $sp, 272 # 8-byte Folded Spill st.d $zero, $sp, 264 # 8-byte Folded Spill - st.d $zero, $sp, 256 # 8-byte Folded Spill .LBB493_439: # %._crit_edge449.i # in Loop: Header=BB493_58 Depth=2 move $a1, $zero - ld.d $a0, $sp, 248 # 8-byte Folded Reload - ld.d $a2, $sp, 376 # 8-byte Folded Reload + ld.d $a0, $sp, 256 # 8-byte Folded Reload + ld.d $a2, $sp, 392 # 8-byte Folded Reload or $a0, $a0, $a2 - ld.d $s8, $sp, 320 # 8-byte Folded Reload - ld.d $a6, $sp, 232 # 8-byte Folded Reload + ld.d $s8, $sp, 336 # 8-byte Folded Reload + ld.d $a6, $sp, 240 # 8-byte Folded Reload ori $t3, $zero, 48 - ld.d $a5, $sp, 224 # 8-byte Folded Reload - ld.d $t1, $sp, 216 # 8-byte Folded Reload - ld.d $a7, $sp, 208 # 8-byte Folded Reload - ld.d $t0, $sp, 328 # 8-byte Folded Reload - ld.d $a4, $sp, 200 # 8-byte Folded Reload - ld.d $a3, $sp, 264 # 8-byte Folded Reload - ld.d $t2, $sp, 256 # 8-byte Folded Reload + ld.d $a5, $sp, 232 # 8-byte Folded Reload + ld.d $t1, $sp, 224 # 8-byte Folded Reload + ld.d $a7, $sp, 216 # 8-byte Folded Reload + ld.d $t0, $sp, 344 # 8-byte Folded Reload + ld.d $a4, $sp, 208 # 8-byte Folded Reload + ld.d $a3, $sp, 272 # 8-byte Folded Reload + ld.d $t2, $sp, 264 # 8-byte Folded Reload fcmp.cule.d $fcc0, $fs2, $fs3 bcnez $fcc0, .LBB493_187 b .LBB493_186 @@ -108592,11 +108559,10 @@ sqlite3WhereBegin: # @sqlite3WhereBegin maskeqz $a0, $a2, $a0 or $a0, $a0, $a1 or $s4, $a0, $s4 - vld $vr7, $sp, 448 # 16-byte Folded Reload ori $s5, $zero, 1 ori $t4, $zero, 110 - ld.d $t5, $sp, 368 # 8-byte Folded Reload - ld.d $t6, $sp, 352 # 8-byte Folded Reload + ld.d $t5, $sp, 384 # 8-byte Folded Reload + ld.d $t6, $sp, 368 # 8-byte Folded Reload ld.d $t1, $sp, 464 # 8-byte Folded Reload b .LBB493_218 .LBB493_441: # in Loop: Header=BB493_58 Depth=2 @@ -108624,8 +108590,8 @@ sqlite3WhereBegin: # @sqlite3WhereBegin pcaddu18i $ra, %call36(exprTableUsage) jirl $ra, $ra, 0 ori $t3, $zero, 48 - vldi $vr9, -912 - vldi $vr8, -988 + vldi $vr8, -912 + vldi $vr7, -988 and $a1, $a0, $s0 move $a0, $s5 ori $s5, $zero, 1 @@ -108638,13 +108604,13 @@ sqlite3WhereBegin: # @sqlite3WhereBegin .LBB493_446: # %bestIndex.exit # in Loop: Header=BB493_58 Depth=2 fmov.d $fs3, $fs1 - ld.d $s8, $sp, 320 # 8-byte Folded Reload - ld.d $a6, $sp, 232 # 8-byte Folded Reload - ld.d $a5, $sp, 224 # 8-byte Folded Reload - ld.d $t1, 
$sp, 216 # 8-byte Folded Reload - ld.d $a7, $sp, 208 # 8-byte Folded Reload - ld.d $t0, $sp, 328 # 8-byte Folded Reload - ld.d $a4, $sp, 200 # 8-byte Folded Reload + ld.d $s8, $sp, 336 # 8-byte Folded Reload + ld.d $a6, $sp, 240 # 8-byte Folded Reload + ld.d $a5, $sp, 232 # 8-byte Folded Reload + ld.d $t1, $sp, 224 # 8-byte Folded Reload + ld.d $a7, $sp, 216 # 8-byte Folded Reload + ld.d $t0, $sp, 344 # 8-byte Folded Reload + ld.d $a4, $sp, 208 # 8-byte Folded Reload fcmp.cule.d $fcc0, $fs2, $fs3 bceqz $fcc0, .LBB493_186 b .LBB493_187 @@ -108658,25 +108624,25 @@ sqlite3WhereBegin: # @sqlite3WhereBegin move $a5, $zero move $a1, $zero move $t1, $zero - st.d $zero, $sp, 160 # 8-byte Folded Spill + st.d $zero, $sp, 168 # 8-byte Folded Spill b .LBB493_452 .p2align 4, , 16 .LBB493_449: # %.thread # in Loop: Header=BB493_55 Depth=1 - ld.d $a1, $sp, 120 # 8-byte Folded Reload + ld.d $a1, $sp, 128 # 8-byte Folded Reload slli.d $a0, $a1, 43 bgez $a0, .LBB493_451 # %bb.450: # in Loop: Header=BB493_55 Depth=1 - ld.d $a0, $sp, 144 # 8-byte Folded Reload + ld.d $a0, $sp, 152 # 8-byte Folded Reload st.d $zero, $a0, 0 .LBB493_451: # %.thread.thread # in Loop: Header=BB493_55 Depth=1 - ld.d $a6, $sp, 176 # 8-byte Folded Reload - ld.d $s3, $sp, 192 # 8-byte Folded Reload + ld.d $a6, $sp, 184 # 8-byte Folded Reload + ld.d $s3, $sp, 200 # 8-byte Folded Reload ld.d $a7, $sp, 64 # 8-byte Folded Reload .LBB493_452: # %.thread.thread # in Loop: Header=BB493_55 Depth=1 - ld.d $a0, $sp, 184 # 8-byte Folded Reload + ld.d $a0, $sp, 192 # 8-byte Folded Reload move $t0, $a1 st.w $a1, $a0, 4 st.d $a5, $a0, 16 @@ -108692,16 +108658,16 @@ sqlite3WhereBegin: # @sqlite3WhereBegin st.w $a0, $s8, 84 .LBB493_454: # in Loop: Header=BB493_55 Depth=1 ld.w $a0, $sp, 984 - ld.d $a3, $sp, 184 # 8-byte Folded Reload + ld.d $a3, $sp, 192 # 8-byte Folded Reload st.w $a2, $a3, 28 blez $a0, .LBB493_54 # %bb.455: # %.lr.ph.i836 # in Loop: Header=BB493_55 Depth=1 - ld.d $a2, $sp, 160 # 8-byte Folded Reload + ld.d $a2, $sp, 168 # 8-byte Folded Reload addi.w $a2, $a2, 0 slli.d $a3, $a2, 6 alsl.d $a2, $a2, $a3, 3 - ld.d $a3, $sp, 128 # 8-byte Folded Reload + ld.d $a3, $sp, 136 # 8-byte Folded Reload add.d $a2, $a3, $a2 ld.w $a2, $a2, 44 move $a3, $zero @@ -108718,18 +108684,18 @@ sqlite3WhereBegin: # @sqlite3WhereBegin b .LBB493_54 .LBB493_458: # %._crit_edge1745.loopexit bstrpick.d $a0, $a7, 22, 22 - ld.d $a1, $sp, 144 # 8-byte Folded Reload + ld.d $a1, $sp, 152 # 8-byte Folded Reload bnez $a1, .LBB493_460 b .LBB493_462 .LBB493_459: ori $a0, $zero, 1 - ld.d $a1, $sp, 144 # 8-byte Folded Reload + ld.d $a1, $sp, 152 # 8-byte Folded Reload beqz $a1, .LBB493_462 .LBB493_460: # %._crit_edge1745 andi $a0, $a0, 1 beqz $a0, .LBB493_462 # %bb.461: - ld.d $a0, $sp, 144 # 8-byte Folded Reload + ld.d $a0, $sp, 152 # 8-byte Folded Reload st.d $zero, $a0, 0 .LBB493_462: addi.w $fp, $zero, -1 @@ -108737,26 +108703,25 @@ sqlite3WhereBegin: # @sqlite3WhereBegin move $a1, $fp pcaddu18i $ra, %call36(sqlite3CodeVerifySchema) jirl $ra, $ra, 0 - ld.d $a0, $sp, 240 # 8-byte Folded Reload + ld.d $a0, $sp, 248 # 8-byte Folded Reload ld.h $a0, $a0, 0 - blez $a0, .LBB493_1116 + blez $a0, .LBB493_1115 # %bb.463: # %.lr.ph1758 - st.d $fp, $sp, 432 # 8-byte Folded Spill + st.d $fp, $sp, 448 # 8-byte Folded Spill ori $s4, $zero, 2 pcalau12i $a0, %pc_hi20(.L.str.395) addi.d $a0, $a0, %pc_lo12(.L.str.395) - st.d $a0, $sp, 416 # 8-byte Folded Spill + st.d $a0, $sp, 432 # 8-byte Folded Spill pcalau12i $a0, %pc_hi20(.L.str.399) addi.d $a0, $a0, %pc_lo12(.L.str.399) - st.d $a0, 
$sp, 400 # 8-byte Folded Spill + st.d $a0, $sp, 416 # 8-byte Folded Spill pcalau12i $a0, %pc_hi20(.L.str.398) addi.d $a0, $a0, %pc_lo12(.L.str.398) - st.d $a0, $sp, 408 # 8-byte Folded Spill + st.d $a0, $sp, 424 # 8-byte Folded Spill move $s1, $zero lu12i.w $a0, -245 ori $a0, $a0, 3520 - st.d $a0, $sp, 424 # 8-byte Folded Spill - vld $vr0, $sp, 448 # 16-byte Folded Reload + st.d $a0, $sp, 440 # 8-byte Folded Spill b .LBB493_467 .p2align 4, , 16 .LBB493_464: # in Loop: Header=BB493_467 Depth=1 @@ -108766,27 +108731,27 @@ sqlite3WhereBegin: # @sqlite3WhereBegin ld.bu $a0, $fp, 101 beqz $a0, .LBB493_472 .LBB493_466: # in Loop: Header=BB493_467 Depth=1 - ld.d $a0, $sp, 240 # 8-byte Folded Reload + ld.d $a0, $sp, 248 # 8-byte Folded Reload ld.h $a0, $a0, 0 addi.w $s1, $s1, 1 addi.d $s3, $s3, 96 - bge $s1, $a0, .LBB493_563 + bge $s1, $a0, .LBB493_562 .LBB493_467: # =>This Loop Header: Depth=1 - # Child Loop BB493_513 Depth 2 + # Child Loop BB493_512 Depth 2 ld.bu $a0, $s8, 208 ld.w $a1, $s3, 28 st.d $a1, $sp, 464 # 8-byte Folded Spill - bne $a0, $s4, .LBB493_510 + bne $a0, $s4, .LBB493_509 # %bb.468: # in Loop: Header=BB493_467 Depth=1 ld.w $a0, $s3, 0 slli.d $a1, $a0, 6 alsl.d $a0, $a0, $a1, 3 - ld.d $a1, $sp, 128 # 8-byte Folded Reload + ld.d $a1, $sp, 136 # 8-byte Folded Reload add.d $fp, $a1, $a0 ld.d $a2, $fp, 8 ld.d $s0, $sp, 72 # 8-byte Folded Reload move $a0, $s0 - ld.d $a1, $sp, 416 # 8-byte Folded Reload + ld.d $a1, $sp, 432 # 8-byte Folded Reload pcaddu18i $ra, %call36(sqlite3MPrintf) jirl $ra, $ra, 0 ld.d $a3, $fp, 16 @@ -108818,7 +108783,7 @@ sqlite3WhereBegin: # @sqlite3WhereBegin bnez $a0, .LBB493_466 # %bb.473: # in Loop: Header=BB493_467 Depth=1 ld.d $a0, $s3, 80 - ld.d $a1, $sp, 272 # 8-byte Folded Reload + ld.d $a1, $sp, 280 # 8-byte Folded Reload beqz $a0, .LBB493_502 # %bb.474: # in Loop: Header=BB493_467 Depth=1 ld.w $s8, $s7, 44 @@ -108826,27 +108791,27 @@ sqlite3WhereBegin: # @sqlite3WhereBegin ld.w $s2, $a1, 28 ld.d $s0, $fp, 120 move $a0, $s5 - blt $s5, $s2, .LBB493_522 + blt $s5, $s2, .LBB493_521 # %bb.475: # in Loop: Header=BB493_467 Depth=1 ld.d $fp, $a1, 0 ld.bu $a0, $fp, 42 - beqz $a0, .LBB493_518 + beqz $a0, .LBB493_517 .LBB493_476: # %resizeOpArray.exit.i.i857 # in Loop: Header=BB493_467 Depth=1 ld.d $a0, $a1, 0 ld.bu $a0, $a0, 42 - beqz $a0, .LBB493_521 + beqz $a0, .LBB493_520 # %bb.477: # in Loop: Header=BB493_467 Depth=1 move $s5, $zero ori $s4, $zero, 2 - b .LBB493_523 + b .LBB493_522 .LBB493_478: # in Loop: Header=BB493_467 Depth=1 ld.bu $a0, $s3, 5 andi $a0, $a0, 3 beqz $a0, .LBB493_480 # %bb.479: # in Loop: Header=BB493_467 Depth=1 ld.d $a0, $sp, 72 # 8-byte Folded Reload - ld.d $a1, $sp, 408 # 8-byte Folded Reload + ld.d $a1, $sp, 424 # 8-byte Folded Reload move $a2, $s6 pcaddu18i $ra, %call36(sqlite3MPrintf) jirl $ra, $ra, 0 @@ -108858,7 +108823,7 @@ sqlite3WhereBegin: # @sqlite3WhereBegin ld.w $a3, $a0, 40 ld.d $a4, $a0, 48 ld.d $a0, $sp, 72 # 8-byte Folded Reload - ld.d $a1, $sp, 400 # 8-byte Folded Reload + ld.d $a1, $sp, 416 # 8-byte Folded Reload move $a2, $s6 pcaddu18i $ra, %call36(sqlite3MPrintf) jirl $ra, $ra, 0 @@ -108878,8 +108843,7 @@ sqlite3WhereBegin: # @sqlite3WhereBegin jirl $ra, $ra, 0 move $s6, $a0 .LBB493_485: # in Loop: Header=BB493_467 Depth=1 - ld.d $a1, $sp, 272 # 8-byte Folded Reload - vld $vr0, $sp, 448 # 16-byte Folded Reload + ld.d $a1, $sp, 280 # 8-byte Folded Reload ld.w $fp, $a1, 24 ld.w $s2, $a1, 28 ld.w $s5, $s3, 0 @@ -108913,13 +108877,12 @@ sqlite3WhereBegin: # @sqlite3WhereBegin alsl.w $a1, $s4, $a1, 3 pcaddu18i $ra, 
%call36(sqlite3_realloc) jirl $ra, $ra, 0 - beqz $a0, .LBB493_516 + beqz $a0, .LBB493_515 # %bb.490: # %sqlite3DbRealloc.exit.i.i.i # in Loop: Header=BB493_467 Depth=1 st.w $s4, $s0, 28 st.d $a0, $s0, 32 move $a1, $s0 - vld $vr0, $sp, 448 # 16-byte Folded Reload bge $s2, $s4, .LBB493_487 # %bb.491: # in Loop: Header=BB493_467 Depth=1 ori $a2, $zero, 24 @@ -108930,25 +108893,24 @@ sqlite3WhereBegin: # @sqlite3WhereBegin move $a1, $zero pcaddu18i $ra, %call36(memset) jirl $ra, $ra, 0 - vld $vr0, $sp, 448 # 16-byte Folded Reload - ld.d $a1, $sp, 272 # 8-byte Folded Reload + ld.d $a1, $sp, 280 # 8-byte Folded Reload b .LBB493_487 .LBB493_492: # %resizeOpArray.exit._crit_edge.i.i # in Loop: Header=BB493_467 Depth=1 ld.w $a0, $a1, 24 ori $s4, $zero, 2 .LBB493_493: # in Loop: Header=BB493_467 Depth=1 - ld.d $a1, $sp, 272 # 8-byte Folded Reload + ld.d $a1, $sp, 280 # 8-byte Folded Reload ld.d $a1, $a1, 32 addi.d $a0, $a0, 1 - ld.d $a2, $sp, 272 # 8-byte Folded Reload + ld.d $a2, $sp, 280 # 8-byte Folded Reload st.w $a0, $a2, 24 ori $a0, $zero, 24 mul.d $a0, $fp, $a0 add.d $a2, $a1, $a0 ori $a3, $zero, 109 stx.h $a3, $a1, $a0 - ld.d $a1, $sp, 272 # 8-byte Folded Reload + ld.d $a1, $sp, 280 # 8-byte Folded Reload st.w $s1, $a2, 4 st.w $s5, $a2, 8 st.w $zero, $a2, 12 @@ -108961,10 +108923,10 @@ sqlite3WhereBegin: # @sqlite3WhereBegin ld.bu $a1, $a1, 42 beqz $a1, .LBB493_497 .LBB493_495: # in Loop: Header=BB493_467 Depth=1 - beqz $s6, .LBB493_510 + beqz $s6, .LBB493_509 # %bb.496: # in Loop: Header=BB493_467 Depth=1 ld.w $a1, $s6, -8 - ld.d $a3, $sp, 96 # 8-byte Folded Reload + ld.d $a3, $sp, 112 # 8-byte Folded Reload ld.d $a2, $a3, %pc_lo12(mem.5) addi.d $a0, $s6, -8 sub.d $a1, $a2, $a1 @@ -108975,9 +108937,9 @@ sqlite3WhereBegin: # @sqlite3WhereBegin .LBB493_497: # in Loop: Header=BB493_467 Depth=1 bgez $fp, .LBB493_500 # %bb.498: # in Loop: Header=BB493_467 Depth=1 - ld.d $a1, $sp, 272 # 8-byte Folded Reload + ld.d $a1, $sp, 280 # 8-byte Folded Reload ld.w $a1, $a1, 24 - blez $a1, .LBB493_510 + blez $a1, .LBB493_509 # %bb.499: # in Loop: Header=BB493_467 Depth=1 addi.w $fp, $a1, -1 .LBB493_500: # in Loop: Header=BB493_467 Depth=1 @@ -108998,7 +108960,7 @@ sqlite3WhereBegin: # @sqlite3WhereBegin .LBB493_502: # in Loop: Header=BB493_467 Depth=1 ld.bu $a0, $s3, 6 andi $a0, $a0, 8 - bnez $a0, .LBB493_517 + bnez $a0, .LBB493_516 # %bb.503: # in Loop: Header=BB493_467 Depth=1 ld.w $a1, $s7, 44 ori $a4, $zero, 12 @@ -109009,20 +108971,20 @@ sqlite3WhereBegin: # @sqlite3WhereBegin jirl $ra, $ra, 0 ld.w $a0, $fp, 8 ori $a1, $zero, 63 - bltu $a1, $a0, .LBB493_531 + bltu $a1, $a0, .LBB493_530 # %bb.504: # %._crit_edge1752 # in Loop: Header=BB493_467 Depth=1 - ld.d $a0, $sp, 272 # 8-byte Folded Reload - beqz $a0, .LBB493_531 + ld.d $a0, $sp, 280 # 8-byte Folded Reload + beqz $a0, .LBB493_530 # %bb.505: # %._crit_edge1752 # in Loop: Header=BB493_467 Depth=1 - ld.d $a0, $sp, 272 # 8-byte Folded Reload + ld.d $a0, $sp, 280 # 8-byte Folded Reload ld.w $a0, $a0, 24 - blez $a0, .LBB493_531 + blez $a0, .LBB493_530 # %bb.506: # in Loop: Header=BB493_467 Depth=1 - ld.d $a1, $sp, 272 # 8-byte Folded Reload + ld.d $a1, $sp, 280 # 8-byte Folded Reload ld.d $a1, $a1, 32 - beqz $a1, .LBB493_531 + beqz $a1, .LBB493_530 # %bb.507: # in Loop: Header=BB493_467 Depth=1 ld.d $a2, $s7, 64 sltui $a3, $a2, 1 @@ -109034,55 +108996,51 @@ sqlite3WhereBegin: # @sqlite3WhereBegin alsl.d $a0, $a0, $a3, 3 add.d $a0, $a1, $a0 st.w $a2, $a0, -16 - b .LBB493_531 + b .LBB493_530 .LBB493_508: # in Loop: Header=BB493_467 Depth=1 st.b $zero, 
$fp, 1 .p2align 4, , 16 .LBB493_509: # %sqlite3VdbeChangeP4.exit # in Loop: Header=BB493_467 Depth=1 - vld $vr0, $sp, 448 # 16-byte Folded Reload -.LBB493_510: # %sqlite3VdbeChangeP4.exit - # in Loop: Header=BB493_467 Depth=1 ld.w $a0, $s3, 0 slli.d $a1, $a0, 6 alsl.d $a0, $a0, $a1, 3 - ld.d $a1, $sp, 128 # 8-byte Folded Reload + ld.d $a1, $sp, 136 # 8-byte Folded Reload add.d $s7, $a1, $a0 ld.d $fp, $s7, 24 ld.d $a0, $fp, 144 - ld.d $s6, $sp, 424 # 8-byte Folded Reload + ld.d $s6, $sp, 440 # 8-byte Folded Reload beqz $a0, .LBB493_465 -# %bb.511: # %.preheader.i846 +# %bb.510: # %.preheader.i846 # in Loop: Header=BB493_467 Depth=1 ld.d $a2, $s8, 0 ld.w $a1, $a2, 8 blez $a1, .LBB493_464 -# %bb.512: # %.lr.ph.i849 +# %bb.511: # %.lr.ph.i849 # in Loop: Header=BB493_467 Depth=1 ld.d $a2, $a2, 16 move $s6, $zero addi.d $a2, $a2, 40 move $a3, $a1 .p2align 4, , 16 -.LBB493_513: # Parent Loop BB493_467 Depth=1 +.LBB493_512: # Parent Loop BB493_467 Depth=1 # => This Inner Loop Header: Depth=2 ld.d $a4, $a2, 0 beq $a4, $a0, .LBB493_465 -# %bb.514: # in Loop: Header=BB493_513 Depth=2 +# %bb.513: # in Loop: Header=BB493_512 Depth=2 addi.w $s6, $s6, 1 addi.d $a3, $a3, -1 addi.d $a2, $a2, 48 - bnez $a3, .LBB493_513 -# %bb.515: # in Loop: Header=BB493_467 Depth=1 + bnez $a3, .LBB493_512 +# %bb.514: # in Loop: Header=BB493_467 Depth=1 move $s6, $a1 b .LBB493_465 -.LBB493_516: # in Loop: Header=BB493_467 Depth=1 +.LBB493_515: # in Loop: Header=BB493_467 Depth=1 ori $a0, $zero, 1 st.b $a0, $s7, 42 move $a1, $s0 - vld $vr0, $sp, 448 # 16-byte Folded Reload b .LBB493_487 -.LBB493_517: # in Loop: Header=BB493_467 Depth=1 +.LBB493_516: # in Loop: Header=BB493_467 Depth=1 ld.w $a2, $fp, 40 ld.d $a4, $fp, 0 move $a0, $s8 @@ -109090,9 +109048,9 @@ sqlite3WhereBegin: # @sqlite3WhereBegin move $a3, $zero pcaddu18i $ra, %call36(sqlite3TableLock) jirl $ra, $ra, 0 - b .LBB493_531 -.LBB493_518: # in Loop: Header=BB493_467 Depth=1 - st.d $s0, $sp, 392 # 8-byte Folded Spill + b .LBB493_530 +.LBB493_517: # in Loop: Header=BB493_467 Depth=1 + st.d $s0, $sp, 408 # 8-byte Folded Spill sltui $a0, $s2, 1 move $s0, $a1 slli.w $a1, $s2, 1 @@ -109105,16 +109063,15 @@ sqlite3WhereBegin: # @sqlite3WhereBegin alsl.w $a1, $s4, $a1, 3 pcaddu18i $ra, %call36(sqlite3_realloc) jirl $ra, $ra, 0 - beqz $a0, .LBB493_537 -# %bb.519: # %sqlite3DbRealloc.exit.i.i.i863 + beqz $a0, .LBB493_536 +# %bb.518: # %sqlite3DbRealloc.exit.i.i.i863 # in Loop: Header=BB493_467 Depth=1 st.w $s4, $s0, 28 st.d $a0, $s0, 32 move $a1, $s0 - vld $vr0, $sp, 448 # 16-byte Folded Reload - ld.d $s0, $sp, 392 # 8-byte Folded Reload + ld.d $s0, $sp, 408 # 8-byte Folded Reload bge $s2, $s4, .LBB493_476 -# %bb.520: # in Loop: Header=BB493_467 Depth=1 +# %bb.519: # in Loop: Header=BB493_467 Depth=1 ori $a2, $zero, 24 mul.d $a1, $s2, $a2 add.d $a0, $a0, $a1 @@ -109123,47 +109080,47 @@ sqlite3WhereBegin: # @sqlite3WhereBegin move $a1, $zero pcaddu18i $ra, %call36(memset) jirl $ra, $ra, 0 - vld $vr0, $sp, 448 # 16-byte Folded Reload - ld.d $a1, $sp, 272 # 8-byte Folded Reload + ld.d $a1, $sp, 280 # 8-byte Folded Reload b .LBB493_476 -.LBB493_521: # %resizeOpArray.exit._crit_edge.i.i860 +.LBB493_520: # %resizeOpArray.exit._crit_edge.i.i860 # in Loop: Header=BB493_467 Depth=1 ld.w $a0, $a1, 24 ori $s4, $zero, 2 -.LBB493_522: # in Loop: Header=BB493_467 Depth=1 - ld.d $a1, $sp, 272 # 8-byte Folded Reload +.LBB493_521: # in Loop: Header=BB493_467 Depth=1 + ld.d $a1, $sp, 280 # 8-byte Folded Reload ld.d $a1, $a1, 32 addi.d $a0, $a0, 1 - ld.d $a2, $sp, 272 # 8-byte Folded Reload + 
ld.d $a2, $sp, 280 # 8-byte Folded Reload st.w $a0, $a2, 24 ori $a0, $zero, 24 mul.d $a0, $s5, $a0 add.d $a2, $a1, $a0 ori $a3, $zero, 30 stx.h $a3, $a1, $a0 - ld.d $a1, $sp, 272 # 8-byte Folded Reload + ld.d $a1, $sp, 280 # 8-byte Folded Reload st.w $s8, $a2, 4 + vld $vr0, $sp, 96 # 16-byte Folded Reload vst $vr0, $a2, 8 st.b $zero, $a1, 339 -.LBB493_523: # %sqlite3VdbeAddOp4.exit864 +.LBB493_522: # %sqlite3VdbeAddOp4.exit864 # in Loop: Header=BB493_467 Depth=1 ld.d $a0, $a1, 32 - ld.d $s8, $sp, 320 # 8-byte Folded Reload - beqz $a0, .LBB493_531 -# %bb.524: # in Loop: Header=BB493_467 Depth=1 - ld.d $a1, $sp, 272 # 8-byte Folded Reload + ld.d $s8, $sp, 336 # 8-byte Folded Reload + beqz $a0, .LBB493_530 +# %bb.523: # in Loop: Header=BB493_467 Depth=1 + ld.d $a1, $sp, 280 # 8-byte Folded Reload ld.d $a1, $a1, 0 ld.bu $a1, $a1, 42 - bnez $a1, .LBB493_531 + bnez $a1, .LBB493_530 +# %bb.524: # in Loop: Header=BB493_467 Depth=1 + bgez $s5, .LBB493_527 # %bb.525: # in Loop: Header=BB493_467 Depth=1 - bgez $s5, .LBB493_528 -# %bb.526: # in Loop: Header=BB493_467 Depth=1 - ld.d $a1, $sp, 272 # 8-byte Folded Reload + ld.d $a1, $sp, 280 # 8-byte Folded Reload ld.w $a1, $a1, 24 - blez $a1, .LBB493_531 -# %bb.527: # in Loop: Header=BB493_467 Depth=1 + blez $a1, .LBB493_530 +# %bb.526: # in Loop: Header=BB493_467 Depth=1 addi.w $s5, $a1, -1 -.LBB493_528: # in Loop: Header=BB493_467 Depth=1 +.LBB493_527: # in Loop: Header=BB493_467 Depth=1 slli.d $a1, $s5, 4 alsl.d $a1, $s5, $a1, 3 add.d $fp, $a0, $a1 @@ -109172,37 +109129,37 @@ sqlite3WhereBegin: # @sqlite3WhereBegin pcaddu18i $ra, %call36(freeP4) jirl $ra, $ra, 0 st.d $zero, $fp, 16 - beqz $s0, .LBB493_530 -# %bb.529: # in Loop: Header=BB493_467 Depth=1 + beqz $s0, .LBB493_529 +# %bb.528: # in Loop: Header=BB493_467 Depth=1 st.d $s0, $fp, 16 ori $a0, $zero, 246 st.b $a0, $fp, 1 - b .LBB493_531 -.LBB493_530: # in Loop: Header=BB493_467 Depth=1 + b .LBB493_530 +.LBB493_529: # in Loop: Header=BB493_467 Depth=1 st.b $zero, $fp, 1 -.LBB493_531: # %sqlite3VdbeChangeP4.exit1434 +.LBB493_530: # %sqlite3VdbeChangeP4.exit1434 # in Loop: Header=BB493_467 Depth=1 ld.w $a0, $s7, 44 ld.d $s7, $s3, 16 st.w $a0, $s3, 24 - beqz $s7, .LBB493_558 -# %bb.532: # in Loop: Header=BB493_467 Depth=1 + beqz $s7, .LBB493_557 +# %bb.531: # in Loop: Header=BB493_467 Depth=1 move $a0, $s8 move $a1, $s7 pcaddu18i $ra, %call36(sqlite3IndexKeyinfo) jirl $ra, $ra, 0 - ld.d $a1, $sp, 272 # 8-byte Folded Reload + ld.d $a1, $sp, 280 # 8-byte Folded Reload ld.w $fp, $a1, 24 ld.w $s2, $a1, 28 ld.w $s5, $s7, 40 move $s8, $a0 move $a0, $fp - blt $fp, $s2, .LBB493_543 -# %bb.533: # in Loop: Header=BB493_467 Depth=1 + blt $fp, $s2, .LBB493_542 +# %bb.532: # in Loop: Header=BB493_467 Depth=1 ld.d $s4, $a1, 0 ld.bu $a0, $s4, 42 - bnez $a0, .LBB493_540 -# %bb.534: # in Loop: Header=BB493_467 Depth=1 + bnez $a0, .LBB493_539 +# %bb.533: # in Loop: Header=BB493_467 Depth=1 sltui $a0, $s2, 1 move $a3, $a1 slli.w $a1, $s2, 1 @@ -109215,14 +109172,14 @@ sqlite3WhereBegin: # @sqlite3WhereBegin alsl.w $a1, $s0, $a1, 3 pcaddu18i $ra, %call36(sqlite3_realloc) jirl $ra, $ra, 0 - beqz $a0, .LBB493_538 -# %bb.535: # %sqlite3DbRealloc.exit.i.i.i876 + beqz $a0, .LBB493_537 +# %bb.534: # %sqlite3DbRealloc.exit.i.i.i876 # in Loop: Header=BB493_467 Depth=1 - ld.d $a1, $sp, 272 # 8-byte Folded Reload + ld.d $a1, $sp, 280 # 8-byte Folded Reload st.w $s0, $a1, 28 st.d $a0, $a1, 32 - bge $s2, $s0, .LBB493_540 -# %bb.536: # in Loop: Header=BB493_467 Depth=1 + bge $s2, $s0, .LBB493_539 +# %bb.535: # in Loop: 
Header=BB493_467 Depth=1 ori $a2, $zero, 24 mul.d $a1, $s2, $a2 add.d $a0, $a0, $a1 @@ -109231,47 +109188,46 @@ sqlite3WhereBegin: # @sqlite3WhereBegin move $a1, $zero pcaddu18i $ra, %call36(memset) jirl $ra, $ra, 0 - b .LBB493_539 -.LBB493_537: # in Loop: Header=BB493_467 Depth=1 + b .LBB493_538 +.LBB493_536: # in Loop: Header=BB493_467 Depth=1 ori $a0, $zero, 1 st.b $a0, $fp, 42 move $a1, $s0 - vld $vr0, $sp, 448 # 16-byte Folded Reload - ld.d $s0, $sp, 392 # 8-byte Folded Reload + ld.d $s0, $sp, 408 # 8-byte Folded Reload b .LBB493_476 -.LBB493_538: # in Loop: Header=BB493_467 Depth=1 +.LBB493_537: # in Loop: Header=BB493_467 Depth=1 ori $a0, $zero, 1 st.b $a0, $s4, 42 -.LBB493_539: # %resizeOpArray.exit.i.i870 +.LBB493_538: # %resizeOpArray.exit.i.i870 # in Loop: Header=BB493_467 Depth=1 - ld.d $a1, $sp, 272 # 8-byte Folded Reload -.LBB493_540: # %resizeOpArray.exit.i.i870 + ld.d $a1, $sp, 280 # 8-byte Folded Reload +.LBB493_539: # %resizeOpArray.exit.i.i870 # in Loop: Header=BB493_467 Depth=1 ld.d $a0, $a1, 0 ld.bu $a0, $a0, 42 - beqz $a0, .LBB493_542 -# %bb.541: # in Loop: Header=BB493_467 Depth=1 + beqz $a0, .LBB493_541 +# %bb.540: # in Loop: Header=BB493_467 Depth=1 move $fp, $zero ori $s4, $zero, 2 ld.d $a0, $a1, 32 - bnez $a0, .LBB493_544 - b .LBB493_545 -.LBB493_542: # %resizeOpArray.exit._crit_edge.i.i873 + bnez $a0, .LBB493_543 + b .LBB493_544 +.LBB493_541: # %resizeOpArray.exit._crit_edge.i.i873 # in Loop: Header=BB493_467 Depth=1 ld.w $a0, $a1, 24 ori $s4, $zero, 2 -.LBB493_543: # in Loop: Header=BB493_467 Depth=1 - ld.d $a1, $sp, 272 # 8-byte Folded Reload +.LBB493_542: # in Loop: Header=BB493_467 Depth=1 + ld.d $a1, $sp, 280 # 8-byte Folded Reload ld.d $a1, $a1, 32 addi.d $a0, $a0, 1 - ld.d $a2, $sp, 272 # 8-byte Folded Reload + ld.d $a2, $sp, 280 # 8-byte Folded Reload st.w $a0, $a2, 24 ori $a0, $zero, 24 mul.d $a0, $fp, $a0 add.d $a2, $a1, $a0 ori $a3, $zero, 12 stx.h $a3, $a1, $a0 - ld.d $a1, $sp, 272 # 8-byte Folded Reload + ld.d $a1, $sp, 280 # 8-byte Folded Reload ld.d $a0, $sp, 464 # 8-byte Folded Reload st.w $a0, $a2, 4 st.w $s5, $a2, 8 @@ -109279,32 +109235,32 @@ sqlite3WhereBegin: # @sqlite3WhereBegin st.d $zero, $a2, 16 st.b $zero, $a1, 339 ld.d $a0, $a1, 32 - beqz $a0, .LBB493_545 -.LBB493_544: # in Loop: Header=BB493_467 Depth=1 + beqz $a0, .LBB493_544 +.LBB493_543: # in Loop: Header=BB493_467 Depth=1 ld.d $a1, $a1, 0 ld.bu $a1, $a1, 42 - beqz $a1, .LBB493_547 -.LBB493_545: # in Loop: Header=BB493_467 Depth=1 - beqz $s8, .LBB493_553 -# %bb.546: # in Loop: Header=BB493_467 Depth=1 + beqz $a1, .LBB493_546 +.LBB493_544: # in Loop: Header=BB493_467 Depth=1 + beqz $s8, .LBB493_552 +# %bb.545: # in Loop: Header=BB493_467 Depth=1 ld.w $a1, $s8, -8 - ld.d $a3, $sp, 96 # 8-byte Folded Reload + ld.d $a3, $sp, 112 # 8-byte Folded Reload ld.d $a2, $a3, %pc_lo12(mem.5) addi.d $a0, $s8, -8 sub.d $a1, $a2, $a1 st.d $a1, $a3, %pc_lo12(mem.5) pcaddu18i $ra, %call36(free) jirl $ra, $ra, 0 - b .LBB493_553 -.LBB493_547: # in Loop: Header=BB493_467 Depth=1 - bgez $fp, .LBB493_550 -# %bb.548: # in Loop: Header=BB493_467 Depth=1 - ld.d $a1, $sp, 272 # 8-byte Folded Reload + b .LBB493_552 +.LBB493_546: # in Loop: Header=BB493_467 Depth=1 + bgez $fp, .LBB493_549 +# %bb.547: # in Loop: Header=BB493_467 Depth=1 + ld.d $a1, $sp, 280 # 8-byte Folded Reload ld.w $a1, $a1, 24 - blez $a1, .LBB493_553 -# %bb.549: # in Loop: Header=BB493_467 Depth=1 + blez $a1, .LBB493_552 +# %bb.548: # in Loop: Header=BB493_467 Depth=1 addi.w $fp, $a1, -1 -.LBB493_550: # in Loop: Header=BB493_467 Depth=1 
+.LBB493_549: # in Loop: Header=BB493_467 Depth=1 slli.d $a1, $fp, 4 alsl.d $a1, $fp, $a1, 3 add.d $fp, $a0, $a1 @@ -109313,38 +109269,38 @@ sqlite3WhereBegin: # @sqlite3WhereBegin pcaddu18i $ra, %call36(freeP4) jirl $ra, $ra, 0 st.d $zero, $fp, 16 - beqz $s8, .LBB493_552 -# %bb.551: # in Loop: Header=BB493_467 Depth=1 + beqz $s8, .LBB493_551 +# %bb.550: # in Loop: Header=BB493_467 Depth=1 st.d $s8, $fp, 16 ori $a0, $zero, 250 st.b $a0, $fp, 1 - b .LBB493_553 -.LBB493_552: # in Loop: Header=BB493_467 Depth=1 + b .LBB493_552 +.LBB493_551: # in Loop: Header=BB493_467 Depth=1 st.b $zero, $fp, 1 -.LBB493_553: # %sqlite3VdbeChangeP4.exit1437 +.LBB493_552: # %sqlite3VdbeChangeP4.exit1437 # in Loop: Header=BB493_467 Depth=1 - ld.d $a1, $sp, 272 # 8-byte Folded Reload + ld.d $a1, $sp, 280 # 8-byte Folded Reload ld.w $fp, $a1, 24 ld.w $s2, $a1, 28 ld.w $s5, $s7, 8 move $a0, $fp - ld.d $s8, $sp, 320 # 8-byte Folded Reload - blt $fp, $s2, .LBB493_557 -# %bb.554: # in Loop: Header=BB493_467 Depth=1 + ld.d $s8, $sp, 336 # 8-byte Folded Reload + blt $fp, $s2, .LBB493_556 +# %bb.553: # in Loop: Header=BB493_467 Depth=1 ld.d $s4, $a1, 0 ld.bu $a0, $s4, 42 - beqz $a0, .LBB493_559 -.LBB493_555: # %resizeOpArray.exit.i.i881 + beqz $a0, .LBB493_558 +.LBB493_554: # %resizeOpArray.exit.i.i881 # in Loop: Header=BB493_467 Depth=1 ld.d $a0, $a1, 0 ld.bu $a0, $a0, 42 ori $s4, $zero, 2 - bnez $a0, .LBB493_558 -# %bb.556: # %resizeOpArray.exit._crit_edge.i.i884 + bnez $a0, .LBB493_557 +# %bb.555: # %resizeOpArray.exit._crit_edge.i.i884 # in Loop: Header=BB493_467 Depth=1 - ld.d $a1, $sp, 272 # 8-byte Folded Reload + ld.d $a1, $sp, 280 # 8-byte Folded Reload ld.w $a0, $a1, 24 -.LBB493_557: # in Loop: Header=BB493_467 Depth=1 +.LBB493_556: # in Loop: Header=BB493_467 Depth=1 move $a4, $a1 addi.d $a1, $s5, 1 ld.d $a2, $a4, 32 @@ -109361,15 +109317,14 @@ sqlite3WhereBegin: # @sqlite3WhereBegin st.w $zero, $a3, 12 st.d $zero, $a3, 16 st.b $zero, $a4, 339 -.LBB493_558: # %sqlite3VdbeAddOp2.exit +.LBB493_557: # %sqlite3VdbeAddOp2.exit # in Loop: Header=BB493_467 Depth=1 move $a0, $s8 move $a1, $s6 pcaddu18i $ra, %call36(sqlite3CodeVerifySchema) jirl $ra, $ra, 0 - vld $vr0, $sp, 448 # 16-byte Folded Reload b .LBB493_466 -.LBB493_559: # in Loop: Header=BB493_467 Depth=1 +.LBB493_558: # in Loop: Header=BB493_467 Depth=1 sltui $a0, $s2, 1 move $s7, $a1 slli.w $a1, $s2, 1 @@ -109382,14 +109337,14 @@ sqlite3WhereBegin: # @sqlite3WhereBegin alsl.w $a1, $s0, $a1, 3 pcaddu18i $ra, %call36(sqlite3_realloc) jirl $ra, $ra, 0 - beqz $a0, .LBB493_562 -# %bb.560: # %sqlite3DbRealloc.exit.i.i.i887 + beqz $a0, .LBB493_561 +# %bb.559: # %sqlite3DbRealloc.exit.i.i.i887 # in Loop: Header=BB493_467 Depth=1 st.w $s0, $s7, 28 st.d $a0, $s7, 32 move $a1, $s7 - bge $s2, $s0, .LBB493_555 -# %bb.561: # in Loop: Header=BB493_467 Depth=1 + bge $s2, $s0, .LBB493_554 +# %bb.560: # in Loop: Header=BB493_467 Depth=1 ori $a2, $zero, 24 mul.d $a1, $s2, $a2 add.d $a0, $a0, $a1 @@ -109398,83 +109353,83 @@ sqlite3WhereBegin: # @sqlite3WhereBegin move $a1, $zero pcaddu18i $ra, %call36(memset) jirl $ra, $ra, 0 - ld.d $a1, $sp, 272 # 8-byte Folded Reload - b .LBB493_555 -.LBB493_562: # in Loop: Header=BB493_467 Depth=1 + ld.d $a1, $sp, 280 # 8-byte Folded Reload + b .LBB493_554 +.LBB493_561: # in Loop: Header=BB493_467 Depth=1 ori $a0, $zero, 1 st.b $a0, $s4, 42 move $a1, $s7 - b .LBB493_555 -.LBB493_563: # %._crit_edge1759 - ld.d $a1, $sp, 272 # 8-byte Folded Reload + b .LBB493_554 +.LBB493_562: # %._crit_edge1759 + ld.d $a1, $sp, 280 # 8-byte Folded Reload ld.w 
$a1, $a1, 24 - ld.d $a2, $sp, 136 # 8-byte Folded Reload + ld.d $a2, $sp, 144 # 8-byte Folded Reload st.w $a1, $a2, 16 - blez $a0, .LBB493_1117 -# %bb.564: # %.lr.ph1786 + blez $a0, .LBB493_1116 +# %bb.563: # %.lr.ph1786 move $a1, $zero addi.d $a0, $s8, 40 - st.d $a0, $sp, 376 # 8-byte Folded Spill + st.d $a0, $sp, 384 # 8-byte Folded Spill ori $s7, $zero, 1 lu12i.w $a0, 15 ori $a0, $a0, 4095 - st.d $a0, $sp, 368 # 8-byte Folded Spill - ld.d $s1, $sp, 192 # 8-byte Folded Reload - ld.d $s6, $sp, 432 # 8-byte Folded Reload - b .LBB493_566 + st.d $a0, $sp, 376 # 8-byte Folded Spill + ld.d $s1, $sp, 200 # 8-byte Folded Reload + ld.d $s6, $sp, 448 # 8-byte Folded Reload + b .LBB493_565 .p2align 4, , 16 -.LBB493_565: # %.loopexit - # in Loop: Header=BB493_566 Depth=1 - ld.d $a0, $sp, 240 # 8-byte Folded Reload +.LBB493_564: # %.loopexit + # in Loop: Header=BB493_565 Depth=1 + ld.d $a0, $sp, 248 # 8-byte Folded Reload ld.h $a0, $a0, 0 - ld.d $a1, $sp, 424 # 8-byte Folded Reload + ld.d $a1, $sp, 432 # 8-byte Folded Reload addi.w $a1, $a1, 1 addi.d $s1, $s1, 96 - bge $a1, $a0, .LBB493_1118 -.LBB493_566: # =>This Loop Header: Depth=1 - # Child Loop BB493_595 Depth 2 - # Child Loop BB493_596 Depth 3 - # Child Loop BB493_806 Depth 2 - # Child Loop BB493_808 Depth 3 - # Child Loop BB493_625 Depth 2 - # Child Loop BB493_635 Depth 2 - # Child Loop BB493_661 Depth 2 - # Child Loop BB493_729 Depth 2 - # Child Loop BB493_1074 Depth 2 - # Child Loop BB493_681 Depth 2 - # Child Loop BB493_902 Depth 2 - # Child Loop BB493_819 Depth 2 - # Child Loop BB493_826 Depth 2 - # Child Loop BB493_839 Depth 2 - st.d $a1, $sp, 424 # 8-byte Folded Spill + bge $a1, $a0, .LBB493_1117 +.LBB493_565: # =>This Loop Header: Depth=1 + # Child Loop BB493_594 Depth 2 + # Child Loop BB493_595 Depth 3 + # Child Loop BB493_805 Depth 2 + # Child Loop BB493_807 Depth 3 + # Child Loop BB493_624 Depth 2 + # Child Loop BB493_634 Depth 2 + # Child Loop BB493_660 Depth 2 + # Child Loop BB493_728 Depth 2 + # Child Loop BB493_1073 Depth 2 + # Child Loop BB493_680 Depth 2 + # Child Loop BB493_901 Depth 2 + # Child Loop BB493_818 Depth 2 + # Child Loop BB493_825 Depth 2 + # Child Loop BB493_838 Depth 2 + st.d $a1, $sp, 432 # 8-byte Folded Spill ld.w $a0, $s1, 0 slli.d $a1, $a0, 6 alsl.d $a0, $a0, $a1, 3 - ld.d $a1, $sp, 128 # 8-byte Folded Reload + ld.d $a1, $sp, 136 # 8-byte Folded Reload add.d $s3, $a1, $a0 ld.w $a0, $s3, 44 st.d $a0, $sp, 464 # 8-byte Folded Spill ld.d $s2, $s1, 16 ld.w $a0, $s1, 28 - st.d $a0, $sp, 408 # 8-byte Folded Spill - ld.d $a2, $sp, 272 # 8-byte Folded Reload + st.d $a0, $sp, 416 # 8-byte Folded Spill + ld.d $a2, $sp, 280 # 8-byte Folded Reload move $a1, $s1 ld.w $s1, $a2, 40 ld.w $a0, $a2, 44 - st.d $a1, $sp, 192 # 8-byte Folded Spill + st.d $a1, $sp, 200 # 8-byte Folded Spill ld.wu $s0, $a1, 4 addi.d $a1, $s1, 1 st.w $a1, $a2, 40 move $a1, $a2 - st.d $s6, $sp, 432 # 8-byte Folded Spill - bge $s1, $a0, .LBB493_568 -# %bb.567: # %._crit_edge.i894 - # in Loop: Header=BB493_566 Depth=1 + st.d $s6, $sp, 448 # 8-byte Folded Spill + bge $s1, $a0, .LBB493_567 +# %bb.566: # %._crit_edge.i894 + # in Loop: Header=BB493_565 Depth=1 ld.d $fp, $a1, 48 - bnez $fp, .LBB493_574 - b .LBB493_575 + bnez $fp, .LBB493_573 + b .LBB493_574 .p2align 4, , 16 -.LBB493_568: # in Loop: Header=BB493_566 Depth=1 +.LBB493_567: # in Loop: Header=BB493_565 Depth=1 ld.d $s4, $a1, 0 move $a3, $a1 ld.bu $a1, $s4, 42 @@ -109482,23 +109437,23 @@ sqlite3WhereBegin: # @sqlite3WhereBegin slli.d $a2, $a0, 1 addi.d $a2, $a2, 10 st.w $a2, $a3, 44 - beqz $a1, 
.LBB493_572 -# %bb.569: # in Loop: Header=BB493_566 Depth=1 - beqz $s6, .LBB493_571 -.LBB493_570: # in Loop: Header=BB493_566 Depth=1 + beqz $a1, .LBB493_571 +# %bb.568: # in Loop: Header=BB493_565 Depth=1 + beqz $s6, .LBB493_570 +.LBB493_569: # in Loop: Header=BB493_565 Depth=1 ld.w $a1, $s6, -8 - ld.d $a3, $sp, 96 # 8-byte Folded Reload + ld.d $a3, $sp, 112 # 8-byte Folded Reload ld.d $a2, $a3, %pc_lo12(mem.5) addi.d $a0, $s6, -8 sub.d $a1, $a2, $a1 st.d $a1, $a3, %pc_lo12(mem.5) pcaddu18i $ra, %call36(free) jirl $ra, $ra, 0 -.LBB493_571: # %sqlite3DbReallocOrFree.exit.i890 - # in Loop: Header=BB493_566 Depth=1 +.LBB493_570: # %sqlite3DbReallocOrFree.exit.i890 + # in Loop: Header=BB493_565 Depth=1 move $fp, $zero - b .LBB493_573 -.LBB493_572: # in Loop: Header=BB493_566 Depth=1 + b .LBB493_572 +.LBB493_571: # in Loop: Header=BB493_565 Depth=1 ori $a1, $zero, 10 alsl.w $a0, $a0, $a1, 1 slli.w $a1, $a0, 2 @@ -109506,182 +109461,182 @@ sqlite3WhereBegin: # @sqlite3WhereBegin pcaddu18i $ra, %call36(sqlite3_realloc) jirl $ra, $ra, 0 move $fp, $a0 - beqz $a0, .LBB493_639 -.LBB493_573: # %sqlite3DbReallocOrFree.exit.i890 - # in Loop: Header=BB493_566 Depth=1 - ld.d $a1, $sp, 272 # 8-byte Folded Reload + beqz $a0, .LBB493_638 +.LBB493_572: # %sqlite3DbReallocOrFree.exit.i890 + # in Loop: Header=BB493_565 Depth=1 + ld.d $a1, $sp, 280 # 8-byte Folded Reload st.d $fp, $a1, 48 - ld.d $s6, $sp, 432 # 8-byte Folded Reload - beqz $fp, .LBB493_575 -.LBB493_574: # in Loop: Header=BB493_566 Depth=1 + ld.d $s6, $sp, 448 # 8-byte Folded Reload + beqz $fp, .LBB493_574 +.LBB493_573: # in Loop: Header=BB493_565 Depth=1 slli.d $a0, $s1, 2 addi.w $a1, $zero, -1 lu32i.d $a1, 0 stx.w $a1, $fp, $a0 - ld.d $a1, $sp, 272 # 8-byte Folded Reload -.LBB493_575: # %sqlite3VdbeMakeLabel.exit897 - # in Loop: Header=BB493_566 Depth=1 + ld.d $a1, $sp, 280 # 8-byte Folded Reload +.LBB493_574: # %sqlite3VdbeMakeLabel.exit897 + # in Loop: Header=BB493_565 Depth=1 nor $a2, $s1, $zero ld.w $s1, $a1, 40 ld.w $a0, $a1, 44 - ld.d $a1, $sp, 192 # 8-byte Folded Reload + ld.d $a1, $sp, 200 # 8-byte Folded Reload st.w $a2, $a1, 36 - st.d $a2, $sp, 416 # 8-byte Folded Spill + st.d $a2, $sp, 424 # 8-byte Folded Spill st.w $a2, $a1, 32 addi.d $a1, $s1, 1 - ld.d $a2, $sp, 272 # 8-byte Folded Reload + ld.d $a2, $sp, 280 # 8-byte Folded Reload st.w $a1, $a2, 40 - ld.d $a1, $sp, 272 # 8-byte Folded Reload - blt $s1, $a0, .LBB493_582 -# %bb.576: # in Loop: Header=BB493_566 Depth=1 + ld.d $a1, $sp, 280 # 8-byte Folded Reload + blt $s1, $a0, .LBB493_581 +# %bb.575: # in Loop: Header=BB493_565 Depth=1 ld.d $s4, $a1, 0 move $a3, $a1 ld.bu $a1, $s4, 42 slli.d $a2, $a0, 1 addi.d $a2, $a2, 10 st.w $a2, $a3, 44 - beqz $a1, .LBB493_580 -# %bb.577: # in Loop: Header=BB493_566 Depth=1 - beqz $fp, .LBB493_579 -.LBB493_578: # in Loop: Header=BB493_566 Depth=1 + beqz $a1, .LBB493_579 +# %bb.576: # in Loop: Header=BB493_565 Depth=1 + beqz $fp, .LBB493_578 +.LBB493_577: # in Loop: Header=BB493_565 Depth=1 ld.w $a1, $fp, -8 - ld.d $a3, $sp, 96 # 8-byte Folded Reload + ld.d $a3, $sp, 112 # 8-byte Folded Reload ld.d $a2, $a3, %pc_lo12(mem.5) addi.d $a0, $fp, -8 sub.d $a1, $a2, $a1 st.d $a1, $a3, %pc_lo12(mem.5) pcaddu18i $ra, %call36(free) jirl $ra, $ra, 0 -.LBB493_579: # %sqlite3DbReallocOrFree.exit.i900 - # in Loop: Header=BB493_566 Depth=1 +.LBB493_578: # %sqlite3DbReallocOrFree.exit.i900 + # in Loop: Header=BB493_565 Depth=1 move $a0, $zero - b .LBB493_581 -.LBB493_580: # in Loop: Header=BB493_566 Depth=1 + b .LBB493_580 +.LBB493_579: # in Loop: 
Header=BB493_565 Depth=1 ori $a1, $zero, 10 alsl.w $a0, $a0, $a1, 1 slli.w $a1, $a0, 2 move $a0, $fp pcaddu18i $ra, %call36(sqlite3_realloc) jirl $ra, $ra, 0 - beqz $a0, .LBB493_640 -.LBB493_581: # %sqlite3DbReallocOrFree.exit.i900 - # in Loop: Header=BB493_566 Depth=1 - ld.d $a1, $sp, 272 # 8-byte Folded Reload + beqz $a0, .LBB493_639 +.LBB493_580: # %sqlite3DbReallocOrFree.exit.i900 + # in Loop: Header=BB493_565 Depth=1 + ld.d $a1, $sp, 280 # 8-byte Folded Reload st.d $a0, $a1, 48 move $fp, $a0 -.LBB493_582: # %._crit_edge.i904 - # in Loop: Header=BB493_566 Depth=1 - beqz $fp, .LBB493_584 -# %bb.583: # in Loop: Header=BB493_566 Depth=1 +.LBB493_581: # %._crit_edge.i904 + # in Loop: Header=BB493_565 Depth=1 + beqz $fp, .LBB493_583 +# %bb.582: # in Loop: Header=BB493_565 Depth=1 slli.d $a0, $s1, 2 addi.w $a1, $zero, -1 lu32i.d $a1, 0 stx.w $a1, $fp, $a0 - ld.d $a1, $sp, 272 # 8-byte Folded Reload -.LBB493_584: # %sqlite3VdbeMakeLabel.exit907 - # in Loop: Header=BB493_566 Depth=1 - ld.d $a2, $sp, 192 # 8-byte Folded Reload + ld.d $a1, $sp, 280 # 8-byte Folded Reload +.LBB493_583: # %sqlite3VdbeMakeLabel.exit907 + # in Loop: Header=BB493_565 Depth=1 + ld.d $a2, $sp, 200 # 8-byte Folded Reload ld.w $a0, $a2, 0 nor $s5, $s1, $zero st.w $s5, $a2, 40 - st.d $s5, $sp, 448 # 8-byte Folded Spill - blez $a0, .LBB493_591 -# %bb.585: # in Loop: Header=BB493_566 Depth=1 + st.d $s5, $sp, 440 # 8-byte Folded Spill + blez $a0, .LBB493_590 +# %bb.584: # in Loop: Header=BB493_565 Depth=1 ld.bu $a0, $s3, 41 andi $a0, $a0, 8 - beqz $a0, .LBB493_591 -# %bb.586: # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_590 +# %bb.585: # in Loop: Header=BB493_565 Depth=1 ld.w $a0, $s8, 88 ld.w $s1, $a1, 24 ld.w $s3, $a1, 28 addi.d $fp, $a0, 1 st.w $fp, $s8, 88 - ld.d $a0, $sp, 192 # 8-byte Folded Reload + ld.d $a0, $sp, 200 # 8-byte Folded Reload st.w $fp, $a0, 12 move $a0, $s1 - blt $s1, $s3, .LBB493_590 -# %bb.587: # in Loop: Header=BB493_566 Depth=1 + blt $s1, $s3, .LBB493_589 +# %bb.586: # in Loop: Header=BB493_565 Depth=1 ld.d $s4, $a1, 0 ld.bu $a0, $s4, 42 - beqz $a0, .LBB493_656 -.LBB493_588: # %resizeOpArray.exit.i.i911 - # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_655 +.LBB493_587: # %resizeOpArray.exit.i.i911 + # in Loop: Header=BB493_565 Depth=1 ld.d $a0, $a1, 0 ld.bu $a0, $a0, 42 - ld.d $s5, $sp, 448 # 8-byte Folded Reload - bnez $a0, .LBB493_591 -# %bb.589: # %resizeOpArray.exit._crit_edge.i.i914 - # in Loop: Header=BB493_566 Depth=1 + ld.d $s5, $sp, 440 # 8-byte Folded Reload + bnez $a0, .LBB493_590 +# %bb.588: # %resizeOpArray.exit._crit_edge.i.i914 + # in Loop: Header=BB493_565 Depth=1 ld.w $a0, $a1, 24 -.LBB493_590: # in Loop: Header=BB493_566 Depth=1 - ld.d $a1, $sp, 272 # 8-byte Folded Reload +.LBB493_589: # in Loop: Header=BB493_565 Depth=1 + ld.d $a1, $sp, 280 # 8-byte Folded Reload ld.d $a1, $a1, 32 addi.d $a0, $a0, 1 - ld.d $a2, $sp, 272 # 8-byte Folded Reload + ld.d $a2, $sp, 280 # 8-byte Folded Reload st.w $a0, $a2, 24 ori $a0, $zero, 24 mul.d $a0, $s1, $a0 add.d $a2, $a1, $a0 ori $a3, $zero, 46 stx.h $a3, $a1, $a0 - ld.d $a1, $sp, 272 # 8-byte Folded Reload + ld.d $a1, $sp, 280 # 8-byte Folded Reload st.w $zero, $a2, 4 st.w $fp, $a2, 8 st.w $zero, $a2, 12 st.d $zero, $a2, 16 st.b $zero, $a1, 339 -.LBB493_591: # %sqlite3VdbeAddOp2.exit918 - # in Loop: Header=BB493_566 Depth=1 - ld.d $a0, $sp, 192 # 8-byte Folded Reload +.LBB493_590: # %sqlite3VdbeAddOp2.exit918 + # in Loop: Header=BB493_565 Depth=1 + ld.d $a0, $sp, 200 # 8-byte Folded Reload ld.d $s4, $a0, 80 - beqz $s4, 
.LBB493_611 -# %bb.592: # in Loop: Header=BB493_566 Depth=1 + beqz $s4, .LBB493_610 +# %bb.591: # in Loop: Header=BB493_565 Depth=1 ld.w $s7, $s4, 0 ld.d $s2, $s4, 32 ld.w $a0, $s8, 72 ld.d $s3, $s4, 8 move $a2, $s8 addi.w $a3, $s7, 2 - bge $a0, $a3, .LBB493_620 -# %bb.593: # in Loop: Header=BB493_566 Depth=1 + bge $a0, $a3, .LBB493_619 +# %bb.592: # in Loop: Header=BB493_565 Depth=1 ld.w $a0, $a2, 88 addi.d $s8, $a0, 1 add.d $a0, $a0, $a3 st.w $a0, $a2, 88 addi.d $a4, $s8, 1 - st.d $a3, $sp, 392 # 8-byte Folded Spill - st.d $a4, $sp, 400 # 8-byte Folded Spill - blez $s7, .LBB493_621 -.LBB493_594: # %.preheader.lr.ph - # in Loop: Header=BB493_566 Depth=1 + st.d $a3, $sp, 400 # 8-byte Folded Spill + st.d $a4, $sp, 408 # 8-byte Folded Spill + blez $s7, .LBB493_620 +.LBB493_593: # %.preheader.lr.ph + # in Loop: Header=BB493_565 Depth=1 addi.d $a0, $s7, 1 - st.d $a0, $sp, 384 # 8-byte Folded Spill + st.d $a0, $sp, 392 # 8-byte Folded Spill addi.d $s0, $s3, 8 ori $a0, $zero, 1 -.LBB493_595: # %.preheader - # Parent Loop BB493_566 Depth=1 +.LBB493_594: # %.preheader + # Parent Loop BB493_565 Depth=1 # => This Loop Header: Depth=2 - # Child Loop BB493_596 Depth 3 + # Child Loop BB493_595 Depth 3 move $s5, $a0 move $a0, $s7 move $a2, $s2 move $a1, $s0 move $s1, $s7 .p2align 4, , 16 -.LBB493_596: # Parent Loop BB493_566 Depth=1 - # Parent Loop BB493_595 Depth=2 +.LBB493_595: # Parent Loop BB493_565 Depth=1 + # Parent Loop BB493_594 Depth=2 # => This Inner Loop Header: Depth=3 ld.w $a3, $a2, 0 - beq $a3, $s5, .LBB493_598 -# %bb.597: # in Loop: Header=BB493_596 Depth=3 + beq $a3, $s5, .LBB493_597 +# %bb.596: # in Loop: Header=BB493_595 Depth=3 addi.w $s1, $s1, -1 addi.d $a1, $a1, 12 addi.d $a0, $a0, -1 addi.d $a2, $a2, 8 - bnez $a0, .LBB493_596 - b .LBB493_777 + bnez $a0, .LBB493_595 + b .LBB493_776 .p2align 4, , 16 -.LBB493_598: # in Loop: Header=BB493_595 Depth=2 +.LBB493_597: # in Loop: Header=BB493_594 Depth=2 ld.w $a0, $a1, 0 ld.d $a1, $sp, 496 slli.d $a2, $a0, 5 @@ -109689,33 +109644,33 @@ sqlite3WhereBegin: # @sqlite3WhereBegin ldx.d $a0, $a1, $a0 ld.d $a1, $a0, 24 add.w $fp, $a4, $s5 - ld.d $a0, $sp, 320 # 8-byte Folded Reload + ld.d $a0, $sp, 336 # 8-byte Folded Reload move $a2, $fp pcaddu18i $ra, %call36(sqlite3ExprCodeTarget) jirl $ra, $ra, 0 - beq $a0, $fp, .LBB493_605 -# %bb.599: # in Loop: Header=BB493_595 Depth=2 - ld.d $a1, $sp, 320 # 8-byte Folded Reload + beq $a0, $fp, .LBB493_604 +# %bb.598: # in Loop: Header=BB493_594 Depth=2 + ld.d $a1, $sp, 336 # 8-byte Folded Reload ld.d $s6, $a1, 24 - beqz $s6, .LBB493_605 -# %bb.600: # in Loop: Header=BB493_595 Depth=2 + beqz $s6, .LBB493_604 +# %bb.599: # in Loop: Header=BB493_594 Depth=2 ld.w $a3, $s6, 24 ld.w $a4, $s6, 28 move $a1, $a3 - blt $a3, $a4, .LBB493_604 -# %bb.601: # in Loop: Header=BB493_595 Depth=2 + blt $a3, $a4, .LBB493_603 +# %bb.600: # in Loop: Header=BB493_594 Depth=2 ld.d $a2, $s6, 0 ld.bu $a1, $a2, 42 - beqz $a1, .LBB493_607 -.LBB493_602: # %resizeOpArray.exit.i.i1441 - # in Loop: Header=BB493_595 Depth=2 + beqz $a1, .LBB493_606 +.LBB493_601: # %resizeOpArray.exit.i.i1441 + # in Loop: Header=BB493_594 Depth=2 ld.d $a1, $s6, 0 ld.bu $a1, $a1, 42 - bnez $a1, .LBB493_605 -# %bb.603: # %resizeOpArray.exit._crit_edge.i.i1444 - # in Loop: Header=BB493_595 Depth=2 + bnez $a1, .LBB493_604 +# %bb.602: # %resizeOpArray.exit._crit_edge.i.i1444 + # in Loop: Header=BB493_594 Depth=2 ld.w $a1, $s6, 24 -.LBB493_604: # in Loop: Header=BB493_595 Depth=2 +.LBB493_603: # in Loop: Header=BB493_594 Depth=2 ld.d $a2, $s6, 32 addi.d $a1, $a1, 
1 st.w $a1, $s6, 24 @@ -109729,18 +109684,18 @@ sqlite3WhereBegin: # @sqlite3WhereBegin st.w $zero, $a3, 12 st.d $zero, $a3, 16 st.b $zero, $s6, 339 -.LBB493_605: # %sqlite3ExprCode.exit - # in Loop: Header=BB493_595 Depth=2 - beqz $s1, .LBB493_775 -# %bb.606: # in Loop: Header=BB493_595 Depth=2 +.LBB493_604: # %sqlite3ExprCode.exit + # in Loop: Header=BB493_594 Depth=2 + beqz $s1, .LBB493_774 +# %bb.605: # in Loop: Header=BB493_594 Depth=2 addi.w $a0, $s5, 1 - ld.d $a4, $sp, 400 # 8-byte Folded Reload - bne $s5, $s7, .LBB493_595 - b .LBB493_776 -.LBB493_607: # in Loop: Header=BB493_595 Depth=2 + ld.d $a4, $sp, 408 # 8-byte Folded Reload + bne $s5, $s7, .LBB493_594 + b .LBB493_775 +.LBB493_606: # in Loop: Header=BB493_594 Depth=2 st.d $a2, $sp, 328 # 8-byte Folded Spill - st.d $a3, $sp, 352 # 8-byte Folded Spill - st.d $a0, $sp, 360 # 8-byte Folded Spill + st.d $a3, $sp, 360 # 8-byte Folded Spill + st.d $a0, $sp, 368 # 8-byte Folded Spill sltui $a0, $a4, 1 slli.w $a1, $a4, 1 masknez $a1, $a1, $a0 @@ -109749,23 +109704,23 @@ sqlite3WhereBegin: # @sqlite3WhereBegin ld.d $a0, $s6, 32 or $a2, $a2, $a1 slli.d $a1, $a2, 4 - st.d $a2, $sp, 344 # 8-byte Folded Spill + st.d $a2, $sp, 352 # 8-byte Folded Spill alsl.w $a1, $a2, $a1, 3 - st.d $a4, $sp, 336 # 8-byte Folded Spill + st.d $a4, $sp, 344 # 8-byte Folded Spill pcaddu18i $ra, %call36(sqlite3_realloc) jirl $ra, $ra, 0 - beqz $a0, .LBB493_610 -# %bb.608: # %sqlite3DbRealloc.exit.i.i.i1447 - # in Loop: Header=BB493_595 Depth=2 + beqz $a0, .LBB493_609 +# %bb.607: # %sqlite3DbRealloc.exit.i.i.i1447 + # in Loop: Header=BB493_594 Depth=2 move $a1, $a0 - ld.d $a5, $sp, 344 # 8-byte Folded Reload + ld.d $a5, $sp, 352 # 8-byte Folded Reload st.w $a5, $s6, 28 st.d $a0, $s6, 32 - ld.d $a0, $sp, 360 # 8-byte Folded Reload - ld.d $a3, $sp, 352 # 8-byte Folded Reload - ld.d $a4, $sp, 336 # 8-byte Folded Reload - bge $a4, $a5, .LBB493_602 -# %bb.609: # in Loop: Header=BB493_595 Depth=2 + ld.d $a0, $sp, 368 # 8-byte Folded Reload + ld.d $a3, $sp, 360 # 8-byte Folded Reload + ld.d $a4, $sp, 344 # 8-byte Folded Reload + bge $a4, $a5, .LBB493_601 +# %bb.608: # in Loop: Header=BB493_594 Depth=2 ori $a2, $zero, 24 mul.d $a0, $a4, $a2 add.d $a0, $a1, $a0 @@ -109774,61 +109729,61 @@ sqlite3WhereBegin: # @sqlite3WhereBegin move $a1, $zero pcaddu18i $ra, %call36(memset) jirl $ra, $ra, 0 - ld.d $a3, $sp, 352 # 8-byte Folded Reload - ld.d $a0, $sp, 360 # 8-byte Folded Reload - b .LBB493_602 -.LBB493_610: # in Loop: Header=BB493_595 Depth=2 + ld.d $a3, $sp, 360 # 8-byte Folded Reload + ld.d $a0, $sp, 368 # 8-byte Folded Reload + b .LBB493_601 +.LBB493_609: # in Loop: Header=BB493_594 Depth=2 ori $a0, $zero, 1 ld.d $a1, $sp, 328 # 8-byte Folded Reload st.b $a0, $a1, 42 - ld.d $a0, $sp, 360 # 8-byte Folded Reload - ld.d $a3, $sp, 352 # 8-byte Folded Reload - b .LBB493_602 + ld.d $a0, $sp, 368 # 8-byte Folded Reload + ld.d $a3, $sp, 360 # 8-byte Folded Reload + b .LBB493_601 .p2align 4, , 16 -.LBB493_611: # in Loop: Header=BB493_566 Depth=1 +.LBB493_610: # in Loop: Header=BB493_565 Depth=1 ld.wu $s1, $a0, 4 andi $a0, $s1, 256 - bnez $a0, .LBB493_622 -# %bb.612: # in Loop: Header=BB493_566 Depth=1 + bnez $a0, .LBB493_621 +# %bb.611: # in Loop: Header=BB493_565 Depth=1 andi $a0, $s1, 512 lu12i.w $a2, 512 and $s3, $s0, $a2 - bnez $a0, .LBB493_632 -# %bb.613: # in Loop: Header=BB493_566 Depth=1 + bnez $a0, .LBB493_631 +# %bb.612: # in Loop: Header=BB493_565 Depth=1 slli.d $a0, $s1, 50 lu12i.w $a2, 128 and $s0, $s0, $a2 - bltz $a0, .LBB493_666 -# %bb.614: # in Loop: 
Header=BB493_566 Depth=1 + bltz $a0, .LBB493_665 +# %bb.613: # in Loop: Header=BB493_565 Depth=1 slli.d $a0, $s1, 51 - bltz $a0, .LBB493_702 -# %bb.615: # in Loop: Header=BB493_566 Depth=1 + bltz $a0, .LBB493_701 +# %bb.614: # in Loop: Header=BB493_565 Depth=1 ld.w $fp, $a1, 24 ld.w $s0, $a1, 28 - ld.d $s1, $sp, 192 # 8-byte Folded Reload + ld.d $s1, $sp, 200 # 8-byte Folded Reload ori $a0, $zero, 102 st.w $a0, $s1, 48 ld.d $a6, $sp, 464 # 8-byte Folded Reload st.w $a6, $s1, 52 move $a0, $fp - blt $fp, $s0, .LBB493_619 -# %bb.616: # in Loop: Header=BB493_566 Depth=1 + blt $fp, $s0, .LBB493_618 +# %bb.615: # in Loop: Header=BB493_565 Depth=1 ld.d $s1, $a1, 0 ld.bu $a0, $s1, 42 - beqz $a0, .LBB493_875 -.LBB493_617: # %resizeOpArray.exit.i.i1355 - # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_874 +.LBB493_616: # %resizeOpArray.exit.i.i1355 + # in Loop: Header=BB493_565 Depth=1 ld.d $a0, $a1, 0 ld.bu $a1, $a0, 42 ori $a0, $zero, 1 - ld.d $s1, $sp, 192 # 8-byte Folded Reload + ld.d $s1, $sp, 200 # 8-byte Folded Reload ld.d $a6, $sp, 464 # 8-byte Folded Reload - bnez $a1, .LBB493_816 -# %bb.618: # %resizeOpArray.exit._crit_edge.i.i1358 - # in Loop: Header=BB493_566 Depth=1 - ld.d $a1, $sp, 272 # 8-byte Folded Reload + bnez $a1, .LBB493_815 +# %bb.617: # %resizeOpArray.exit._crit_edge.i.i1358 + # in Loop: Header=BB493_565 Depth=1 + ld.d $a1, $sp, 280 # 8-byte Folded Reload ld.w $a0, $a1, 24 -.LBB493_619: # in Loop: Header=BB493_566 Depth=1 +.LBB493_618: # in Loop: Header=BB493_565 Depth=1 move $a3, $a1 ld.d $a1, $a1, 32 addi.d $a0, $a0, 1 @@ -109839,167 +109794,167 @@ sqlite3WhereBegin: # @sqlite3WhereBegin ori $a4, $zero, 115 stx.h $a4, $a1, $a0 st.w $a6, $a2, 4 - ld.d $a0, $sp, 416 # 8-byte Folded Reload + ld.d $a0, $sp, 424 # 8-byte Folded Reload st.w $a0, $a2, 8 st.w $zero, $a2, 12 st.d $zero, $a2, 16 st.b $zero, $a3, 339 addi.d $a0, $fp, 1 - b .LBB493_816 + b .LBB493_815 .p2align 4, , 16 -.LBB493_620: # in Loop: Header=BB493_566 Depth=1 +.LBB493_619: # in Loop: Header=BB493_565 Depth=1 ld.w $s8, $a2, 76 add.d $a1, $s8, $a3 st.w $a1, $a2, 76 - ld.d $a1, $sp, 272 # 8-byte Folded Reload + ld.d $a1, $sp, 280 # 8-byte Folded Reload sub.d $a0, $a0, $a3 st.w $a0, $a2, 72 addi.d $a4, $s8, 1 - st.d $a3, $sp, 392 # 8-byte Folded Spill - st.d $a4, $sp, 400 # 8-byte Folded Spill - bgtz $s7, .LBB493_594 -.LBB493_621: # in Loop: Header=BB493_566 Depth=1 + st.d $a3, $sp, 400 # 8-byte Folded Spill + st.d $a4, $sp, 408 # 8-byte Folded Spill + bgtz $s7, .LBB493_593 +.LBB493_620: # in Loop: Header=BB493_565 Depth=1 move $fp, $zero - b .LBB493_778 -.LBB493_622: # in Loop: Header=BB493_566 Depth=1 + b .LBB493_777 +.LBB493_621: # in Loop: Header=BB493_565 Depth=1 ld.w $a0, $sp, 488 - beqz $a0, .LBB493_641 -# %bb.623: # %.lr.ph69.i - # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_640 +# %bb.622: # %.lr.ph69.i + # in Loop: Header=BB493_565 Depth=1 ld.d $a1, $sp, 496 - ld.d $fp, $sp, 192 # 8-byte Folded Reload - ld.d $a3, $sp, 368 # 8-byte Folded Reload + ld.d $fp, $sp, 200 # 8-byte Folded Reload + ld.d $a3, $sp, 376 # 8-byte Folded Reload ld.d $a4, $sp, 464 # 8-byte Folded Reload - b .LBB493_625 + b .LBB493_624 .p2align 4, , 16 -.LBB493_624: # %.thread.us.i - # in Loop: Header=BB493_625 Depth=2 +.LBB493_623: # %.thread.us.i + # in Loop: Header=BB493_624 Depth=2 addi.w $a0, $a0, -1 addi.d $a1, $a1, 48 - beqz $a0, .LBB493_629 -.LBB493_625: # %.lr.ph69.split.us.i - # Parent Loop BB493_566 Depth=1 + beqz $a0, .LBB493_628 +.LBB493_624: # %.lr.ph69.split.us.i + # Parent Loop BB493_565 Depth=1 # => This 
Inner Loop Header: Depth=2 ld.h $a2, $a1, 10 - bne $a4, $a2, .LBB493_624 -# %bb.626: # in Loop: Header=BB493_625 Depth=2 + bne $a4, $a2, .LBB493_623 +# %bb.625: # in Loop: Header=BB493_624 Depth=2 ld.d $a2, $a1, 32 and $a2, $a2, $s6 - bnez $a2, .LBB493_624 -# %bb.627: # in Loop: Header=BB493_625 Depth=2 + bnez $a2, .LBB493_623 +# %bb.626: # in Loop: Header=BB493_624 Depth=2 ld.hu $a2, $a1, 12 - bne $a2, $a3, .LBB493_624 -# %bb.628: # in Loop: Header=BB493_625 Depth=2 + bne $a2, $a3, .LBB493_623 +# %bb.627: # in Loop: Header=BB493_624 Depth=2 ld.hu $a2, $a1, 14 andi $a2, $a2, 3 - beqz $a2, .LBB493_624 - b .LBB493_630 -.LBB493_629: # in Loop: Header=BB493_566 Depth=1 + beqz $a2, .LBB493_623 + b .LBB493_629 +.LBB493_628: # in Loop: Header=BB493_565 Depth=1 move $a1, $zero -.LBB493_630: # %findTerm.exit - # in Loop: Header=BB493_566 Depth=1 +.LBB493_629: # %findTerm.exit + # in Loop: Header=BB493_565 Depth=1 ld.bu $a0, $s8, 37 - beqz $a0, .LBB493_642 -.LBB493_631: # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_641 +.LBB493_630: # in Loop: Header=BB493_565 Depth=1 addi.d $a0, $a0, -1 andi $a2, $a0, 255 slli.d $a2, $a2, 2 - ld.d $a3, $sp, 376 # 8-byte Folded Reload + ld.d $a3, $sp, 384 # 8-byte Folded Reload ldx.w $s2, $a3, $a2 st.b $a0, $s8, 37 - b .LBB493_643 -.LBB493_632: # in Loop: Header=BB493_566 Depth=1 + b .LBB493_642 +.LBB493_631: # in Loop: Header=BB493_565 Depth=1 ld.w $a0, $sp, 488 - beqz $a0, .LBB493_687 -# %bb.633: # %.lr.ph69.i986 - # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_686 +# %bb.632: # %.lr.ph69.i986 + # in Loop: Header=BB493_565 Depth=1 ld.d $s4, $sp, 496 move $fp, $s4 move $a1, $a0 - ld.d $a3, $sp, 368 # 8-byte Folded Reload + ld.d $a3, $sp, 376 # 8-byte Folded Reload ld.d $a6, $sp, 464 # 8-byte Folded Reload - b .LBB493_635 + b .LBB493_634 .p2align 4, , 16 -.LBB493_634: # %.thread.us.i1000 - # in Loop: Header=BB493_635 Depth=2 +.LBB493_633: # %.thread.us.i1000 + # in Loop: Header=BB493_634 Depth=2 addi.w $a1, $a1, -1 addi.d $fp, $fp, 48 - beqz $a1, .LBB493_659 -.LBB493_635: # %.lr.ph69.split.us.i997 - # Parent Loop BB493_566 Depth=1 + beqz $a1, .LBB493_658 +.LBB493_634: # %.lr.ph69.split.us.i997 + # Parent Loop BB493_565 Depth=1 # => This Inner Loop Header: Depth=2 ld.h $a2, $fp, 10 - bne $a6, $a2, .LBB493_634 -# %bb.636: # in Loop: Header=BB493_635 Depth=2 + bne $a6, $a2, .LBB493_633 +# %bb.635: # in Loop: Header=BB493_634 Depth=2 ld.d $a2, $fp, 32 and $a2, $a2, $s6 - bnez $a2, .LBB493_634 -# %bb.637: # in Loop: Header=BB493_635 Depth=2 + bnez $a2, .LBB493_633 +# %bb.636: # in Loop: Header=BB493_634 Depth=2 ld.hu $a2, $fp, 12 - bne $a2, $a3, .LBB493_634 -# %bb.638: # in Loop: Header=BB493_635 Depth=2 + bne $a2, $a3, .LBB493_633 +# %bb.637: # in Loop: Header=BB493_634 Depth=2 ld.hu $a2, $fp, 14 andi $a2, $a2, 36 - beqz $a2, .LBB493_634 - b .LBB493_661 -.LBB493_639: # in Loop: Header=BB493_566 Depth=1 + beqz $a2, .LBB493_633 + b .LBB493_660 +.LBB493_638: # in Loop: Header=BB493_565 Depth=1 st.b $s7, $s4, 42 - bnez $s6, .LBB493_570 - b .LBB493_571 -.LBB493_640: # in Loop: Header=BB493_566 Depth=1 + bnez $s6, .LBB493_569 + b .LBB493_570 +.LBB493_639: # in Loop: Header=BB493_565 Depth=1 st.b $s7, $s4, 42 - bnez $fp, .LBB493_578 - b .LBB493_579 -.LBB493_641: # in Loop: Header=BB493_566 Depth=1 + bnez $fp, .LBB493_577 + b .LBB493_578 +.LBB493_640: # in Loop: Header=BB493_565 Depth=1 move $a1, $zero - ld.d $fp, $sp, 192 # 8-byte Folded Reload + ld.d $fp, $sp, 200 # 8-byte Folded Reload ld.bu $a0, $s8, 37 - bnez $a0, .LBB493_631 -.LBB493_642: # in Loop: 
Header=BB493_566 Depth=1 + bnez $a0, .LBB493_630 +.LBB493_641: # in Loop: Header=BB493_565 Depth=1 ld.w $a0, $s8, 88 addi.w $s2, $a0, 1 st.w $s2, $s8, 88 -.LBB493_643: # %sqlite3GetTempReg.exit - # in Loop: Header=BB493_566 Depth=1 +.LBB493_642: # %sqlite3GetTempReg.exit + # in Loop: Header=BB493_565 Depth=1 move $a0, $s8 move $a2, $fp move $a3, $s2 pcaddu18i $ra, %call36(codeEqualityTerm) jirl $ra, $ra, 0 - ld.d $a1, $sp, 272 # 8-byte Folded Reload + ld.d $a1, $sp, 280 # 8-byte Folded Reload ld.w $s1, $a1, 24 ld.w $s3, $a1, 28 ld.w $fp, $fp, 36 move $s0, $s1 - blt $s1, $s3, .LBB493_647 -# %bb.644: # in Loop: Header=BB493_566 Depth=1 + blt $s1, $s3, .LBB493_646 +# %bb.643: # in Loop: Header=BB493_565 Depth=1 ld.d $s0, $a1, 0 ld.bu $a0, $s0, 42 - beqz $a0, .LBB493_691 -.LBB493_645: # %resizeOpArray.exit.i - # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_690 +.LBB493_644: # %resizeOpArray.exit.i + # in Loop: Header=BB493_565 Depth=1 ld.d $a0, $a1, 0 ld.bu $a0, $a0, 42 ld.w $s0, $a1, 24 - beqz $a0, .LBB493_647 -# %bb.646: # %sqlite3VdbeAddOp3.exit - # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_646 +# %bb.645: # %sqlite3VdbeAddOp3.exit + # in Loop: Header=BB493_565 Depth=1 ld.w $s1, $a1, 28 move $a0, $s0 - bge $s0, $s1, .LBB493_648 - b .LBB493_651 -.LBB493_647: # %resizeOpArray.exit._crit_edge.i - # in Loop: Header=BB493_566 Depth=1 + bge $s0, $s1, .LBB493_647 + b .LBB493_650 +.LBB493_646: # %resizeOpArray.exit._crit_edge.i + # in Loop: Header=BB493_565 Depth=1 ld.d $a0, $a1, 32 addi.d $a1, $s0, 1 - ld.d $a2, $sp, 272 # 8-byte Folded Reload + ld.d $a2, $sp, 280 # 8-byte Folded Reload st.w $a1, $a2, 24 ori $a1, $zero, 24 mul.d $a1, $s1, $a1 add.d $a2, $a0, $a1 ori $a3, $zero, 36 stx.h $a3, $a0, $a1 - ld.d $a1, $sp, 272 # 8-byte Folded Reload + ld.d $a1, $sp, 280 # 8-byte Folded Reload st.w $s2, $a2, 4 st.d $zero, $a2, 16 ld.w $s0, $a1, 24 @@ -110008,24 +109963,24 @@ sqlite3WhereBegin: # @sqlite3WhereBegin st.b $zero, $a1, 339 ld.w $s1, $a1, 28 move $a0, $s0 - blt $s0, $s1, .LBB493_651 -.LBB493_648: # in Loop: Header=BB493_566 Depth=1 + blt $s0, $s1, .LBB493_650 +.LBB493_647: # in Loop: Header=BB493_565 Depth=1 ld.d $s3, $a1, 0 ld.bu $a0, $s3, 42 - beqz $a0, .LBB493_694 -.LBB493_649: # %resizeOpArray.exit.i976 - # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_693 +.LBB493_648: # %resizeOpArray.exit.i976 + # in Loop: Header=BB493_565 Depth=1 ld.d $a0, $a1, 0 ld.bu $a0, $a0, 42 - bnez $a0, .LBB493_652 -# %bb.650: # %resizeOpArray.exit._crit_edge.i979 - # in Loop: Header=BB493_566 Depth=1 + bnez $a0, .LBB493_651 +# %bb.649: # %resizeOpArray.exit._crit_edge.i979 + # in Loop: Header=BB493_565 Depth=1 ld.w $a0, $a1, 24 -.LBB493_651: # in Loop: Header=BB493_566 Depth=1 - ld.d $a1, $sp, 272 # 8-byte Folded Reload +.LBB493_650: # in Loop: Header=BB493_565 Depth=1 + ld.d $a1, $sp, 280 # 8-byte Folded Reload ld.d $a1, $a1, 32 addi.d $a0, $a0, 1 - ld.d $a2, $sp, 272 # 8-byte Folded Reload + ld.d $a2, $sp, 280 # 8-byte Folded Reload st.w $a0, $a2, 24 ori $a0, $zero, 24 mul.d $a0, $s0, $a0 @@ -110037,29 +109992,29 @@ sqlite3WhereBegin: # @sqlite3WhereBegin st.w $fp, $a2, 8 st.w $s2, $a2, 12 st.d $zero, $a2, 16 - ld.d $a0, $sp, 272 # 8-byte Folded Reload + ld.d $a0, $sp, 280 # 8-byte Folded Reload st.b $zero, $a0, 339 -.LBB493_652: # %sqlite3VdbeAddOp3.exit983 - # in Loop: Header=BB493_566 Depth=1 - beqz $s2, .LBB493_655 -# %bb.653: # in Loop: Header=BB493_566 Depth=1 +.LBB493_651: # %sqlite3VdbeAddOp3.exit983 + # in Loop: Header=BB493_565 Depth=1 + beqz $s2, .LBB493_654 +# %bb.652: 
# in Loop: Header=BB493_565 Depth=1 ld.bu $a0, $s8, 37 ori $a1, $zero, 7 - bltu $a1, $a0, .LBB493_655 -# %bb.654: # in Loop: Header=BB493_566 Depth=1 + bltu $a1, $a0, .LBB493_654 +# %bb.653: # in Loop: Header=BB493_565 Depth=1 addi.d $a1, $a0, 1 st.b $a1, $s8, 37 slli.d $a0, $a0, 2 - ld.d $a1, $sp, 376 # 8-byte Folded Reload + ld.d $a1, $sp, 384 # 8-byte Folded Reload stx.w $s2, $a1, $a0 -.LBB493_655: # %sqlite3ReleaseTempReg.exit - # in Loop: Header=BB493_566 Depth=1 - ld.d $s1, $sp, 192 # 8-byte Folded Reload +.LBB493_654: # %sqlite3ReleaseTempReg.exit + # in Loop: Header=BB493_565 Depth=1 + ld.d $s1, $sp, 200 # 8-byte Folded Reload ori $a0, $zero, 22 st.w $a0, $s1, 48 ld.d $a6, $sp, 464 # 8-byte Folded Reload - b .LBB493_817 -.LBB493_656: # in Loop: Header=BB493_566 Depth=1 + b .LBB493_816 +.LBB493_655: # in Loop: Header=BB493_565 Depth=1 sltui $a0, $s3, 1 move $s6, $a1 slli.w $a1, $s3, 1 @@ -110072,15 +110027,15 @@ sqlite3WhereBegin: # @sqlite3WhereBegin alsl.w $a1, $s5, $a1, 3 pcaddu18i $ra, %call36(sqlite3_realloc) jirl $ra, $ra, 0 - beqz $a0, .LBB493_708 -# %bb.657: # %sqlite3DbRealloc.exit.i.i.i917 - # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_707 +# %bb.656: # %sqlite3DbRealloc.exit.i.i.i917 + # in Loop: Header=BB493_565 Depth=1 st.w $s5, $s6, 28 st.d $a0, $s6, 32 move $a1, $s6 - ld.d $s6, $sp, 432 # 8-byte Folded Reload - bge $s3, $s5, .LBB493_588 -# %bb.658: # in Loop: Header=BB493_566 Depth=1 + ld.d $s6, $sp, 448 # 8-byte Folded Reload + bge $s3, $s5, .LBB493_587 +# %bb.657: # in Loop: Header=BB493_565 Depth=1 ori $a2, $zero, 24 mul.d $a1, $s3, $a2 add.d $a0, $a0, $a1 @@ -110089,39 +110044,39 @@ sqlite3WhereBegin: # @sqlite3WhereBegin move $a1, $zero pcaddu18i $ra, %call36(memset) jirl $ra, $ra, 0 - ld.d $a1, $sp, 272 # 8-byte Folded Reload - b .LBB493_588 -.LBB493_659: # in Loop: Header=BB493_566 Depth=1 + ld.d $a1, $sp, 280 # 8-byte Folded Reload + b .LBB493_587 +.LBB493_658: # in Loop: Header=BB493_565 Depth=1 move $fp, $zero - b .LBB493_661 + b .LBB493_660 .p2align 4, , 16 -.LBB493_660: # %.thread.us.i1019 - # in Loop: Header=BB493_661 Depth=2 +.LBB493_659: # %.thread.us.i1019 + # in Loop: Header=BB493_660 Depth=2 addi.w $a0, $a0, -1 addi.d $s4, $s4, 48 - beqz $a0, .LBB493_665 -.LBB493_661: # %.lr.ph69.split.us.i1016 - # Parent Loop BB493_566 Depth=1 + beqz $a0, .LBB493_664 +.LBB493_660: # %.lr.ph69.split.us.i1016 + # Parent Loop BB493_565 Depth=1 # => This Inner Loop Header: Depth=2 ld.h $a1, $s4, 10 - bne $a6, $a1, .LBB493_660 -# %bb.662: # in Loop: Header=BB493_661 Depth=2 + bne $a6, $a1, .LBB493_659 +# %bb.661: # in Loop: Header=BB493_660 Depth=2 ld.d $a1, $s4, 32 and $a1, $a1, $s6 - bnez $a1, .LBB493_660 -# %bb.663: # in Loop: Header=BB493_661 Depth=2 + bnez $a1, .LBB493_659 +# %bb.662: # in Loop: Header=BB493_660 Depth=2 ld.hu $a1, $s4, 12 - bne $a1, $a3, .LBB493_660 -# %bb.664: # in Loop: Header=BB493_661 Depth=2 + bne $a1, $a3, .LBB493_659 +# %bb.663: # in Loop: Header=BB493_660 Depth=2 ld.hu $a1, $s4, 14 andi $a1, $a1, 24 - beqz $a1, .LBB493_660 - b .LBB493_688 -.LBB493_665: # in Loop: Header=BB493_566 Depth=1 + beqz $a1, .LBB493_659 + b .LBB493_687 +.LBB493_664: # in Loop: Header=BB493_565 Depth=1 move $s4, $zero - b .LBB493_688 -.LBB493_666: # in Loop: Header=BB493_566 Depth=1 - ld.d $fp, $sp, 192 # 8-byte Folded Reload + b .LBB493_687 +.LBB493_665: # in Loop: Header=BB493_565 Depth=1 + ld.d $fp, $sp, 200 # 8-byte Folded Reload ld.w $s4, $fp, 60 addi.d $a2, $sp, 472 ori $a4, $zero, 2 @@ -110131,23 +110086,23 @@ sqlite3WhereBegin: # @sqlite3WhereBegin 
pcaddu18i $ra, %call36(codeAllEqualityTerms) jirl $ra, $ra, 0 move $a3, $s4 - st.d $a0, $sp, 392 # 8-byte Folded Spill + st.d $a0, $sp, 400 # 8-byte Folded Spill ori $a2, $zero, 1 slli.d $a0, $s4, 2 st.d $a0, $sp, 328 # 8-byte Folded Spill ld.d $a0, $sp, 40 # 8-byte Folded Reload ori $a1, $zero, 1 - st.d $a1, $sp, 384 # 8-byte Folded Spill - st.d $s0, $sp, 360 # 8-byte Folded Spill - bne $a0, $a2, .LBB493_670 -# %bb.667: # in Loop: Header=BB493_566 Depth=1 + st.d $a1, $sp, 392 # 8-byte Folded Spill + st.d $s0, $sp, 368 # 8-byte Folded Spill + bne $a0, $a2, .LBB493_669 +# %bb.666: # in Loop: Header=BB493_565 Depth=1 ld.bu $a0, $fp, 6 andi $a0, $a0, 16 - beqz $a0, .LBB493_670 -# %bb.668: # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_669 +# %bb.667: # in Loop: Header=BB493_565 Depth=1 ld.w $a0, $s2, 8 - bge $a3, $a0, .LBB493_670 -# %bb.669: # in Loop: Header=BB493_566 Depth=1 + bge $a3, $a0, .LBB493_669 +# %bb.668: # in Loop: Header=BB493_565 Depth=1 ld.d $a0, $sp, 32 # 8-byte Folded Reload ld.d $a0, $a0, 16 ld.d $a0, $a0, 0 @@ -110159,32 +110114,32 @@ sqlite3WhereBegin: # @sqlite3WhereBegin sltu $a2, $zero, $a2 xor $a0, $a0, $a1 sltu $a1, $zero, $a0 - st.d $a1, $sp, 384 # 8-byte Folded Spill + st.d $a1, $sp, 392 # 8-byte Folded Spill sltui $a0, $a0, 1 and $a0, $a0, $a2 - b .LBB493_671 -.LBB493_670: # in Loop: Header=BB493_566 Depth=1 + b .LBB493_670 +.LBB493_669: # in Loop: Header=BB493_565 Depth=1 move $a0, $zero -.LBB493_671: # in Loop: Header=BB493_566 Depth=1 +.LBB493_670: # in Loop: Header=BB493_565 Depth=1 ld.d $a1, $s2, 72 ldx.bu $a1, $a1, $a3 bstrpick.d $a4, $s1, 16, 16 bstrpick.d $a2, $s1, 17, 17 sltui $s4, $a1, 1 - st.d $a2, $sp, 344 # 8-byte Folded Spill + st.d $a2, $sp, 352 # 8-byte Folded Spill masknez $a1, $a2, $s4 - ld.d $a2, $sp, 192 # 8-byte Folded Reload + ld.d $a2, $sp, 200 # 8-byte Folded Reload ld.w $a2, $a2, 36 - st.d $a2, $sp, 336 # 8-byte Folded Spill - st.d $a4, $sp, 352 # 8-byte Folded Spill + st.d $a2, $sp, 344 # 8-byte Folded Spill + st.d $a4, $sp, 360 # 8-byte Folded Spill maskeqz $a2, $a4, $s4 or $s0, $a2, $a1 - ld.d $a1, $sp, 392 # 8-byte Folded Reload + ld.d $a1, $sp, 400 # 8-byte Folded Reload add.w $fp, $a1, $a3 - st.d $a3, $sp, 400 # 8-byte Folded Spill - st.d $fp, $sp, 304 # 8-byte Folded Spill - beqz $s0, .LBB493_754 -# %bb.672: # in Loop: Header=BB493_566 Depth=1 + st.d $a3, $sp, 408 # 8-byte Folded Spill + st.d $fp, $sp, 312 # 8-byte Folded Spill + beqz $s0, .LBB493_753 +# %bb.671: # in Loop: Header=BB493_565 Depth=1 ld.d $a0, $s2, 16 ld.d $a1, $sp, 328 # 8-byte Folded Reload ldx.w $a2, $a0, $a1 @@ -110207,80 +110162,80 @@ sqlite3WhereBegin: # @sqlite3WhereBegin move $a2, $fp pcaddu18i $ra, %call36(sqlite3ExprCodeTarget) jirl $ra, $ra, 0 - beq $a0, $fp, .LBB493_675 -# %bb.673: # in Loop: Header=BB493_566 Depth=1 + beq $a0, $fp, .LBB493_674 +# %bb.672: # in Loop: Header=BB493_565 Depth=1 move $a2, $a0 - ld.d $a0, $sp, 320 # 8-byte Folded Reload + ld.d $a0, $sp, 336 # 8-byte Folded Reload ld.d $a0, $a0, 24 - beqz $a0, .LBB493_675 -# %bb.674: # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_674 +# %bb.673: # in Loop: Header=BB493_565 Depth=1 ori $a1, $zero, 7 move $a3, $fp pcaddu18i $ra, %call36(sqlite3VdbeAddOp2) jirl $ra, $ra, 0 -.LBB493_675: # %sqlite3ExprCode.exit1101 - # in Loop: Header=BB493_566 Depth=1 - ld.d $fp, $sp, 272 # 8-byte Folded Reload +.LBB493_674: # %sqlite3ExprCode.exit1101 + # in Loop: Header=BB493_565 Depth=1 + ld.d $fp, $sp, 280 # 8-byte Folded Reload ld.w $s1, $fp, 24 ld.w $s5, $fp, 28 move $a0, $s1 - blt $s1, $s5, 
.LBB493_679 -# %bb.676: # in Loop: Header=BB493_566 Depth=1 + blt $s1, $s5, .LBB493_678 +# %bb.675: # in Loop: Header=BB493_565 Depth=1 ld.d $s6, $fp, 0 ld.bu $a0, $s6, 42 - beqz $a0, .LBB493_872 -.LBB493_677: # %resizeOpArray.exit.i.i1105 - # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_871 +.LBB493_676: # %resizeOpArray.exit.i.i1105 + # in Loop: Header=BB493_565 Depth=1 ld.d $a0, $fp, 0 ld.bu $a0, $a0, 42 - bnez $a0, .LBB493_680 -# %bb.678: # %resizeOpArray.exit._crit_edge.i.i1108 - # in Loop: Header=BB493_566 Depth=1 + bnez $a0, .LBB493_679 +# %bb.677: # %resizeOpArray.exit._crit_edge.i.i1108 + # in Loop: Header=BB493_565 Depth=1 ld.w $a0, $fp, 24 -.LBB493_679: # in Loop: Header=BB493_566 Depth=1 - ld.d $a1, $sp, 272 # 8-byte Folded Reload +.LBB493_678: # in Loop: Header=BB493_565 Depth=1 + ld.d $a1, $sp, 280 # 8-byte Folded Reload ld.d $a1, $a1, 32 addi.d $a0, $a0, 1 - ld.d $a2, $sp, 272 # 8-byte Folded Reload + ld.d $a2, $sp, 280 # 8-byte Folded Reload st.w $a0, $a2, 24 ori $a0, $zero, 24 mul.d $a0, $s1, $a0 add.d $a2, $a1, $a0 ori $a3, $zero, 65 stx.h $a3, $a1, $a0 - ld.d $a0, $sp, 304 # 8-byte Folded Reload + ld.d $a0, $sp, 312 # 8-byte Folded Reload st.w $a0, $a2, 4 - ld.d $a0, $sp, 336 # 8-byte Folded Reload + ld.d $a0, $sp, 344 # 8-byte Folded Reload st.w $a0, $a2, 8 st.w $zero, $a2, 12 st.d $zero, $a2, 16 - ld.d $a0, $sp, 272 # 8-byte Folded Reload + ld.d $a0, $sp, 280 # 8-byte Folded Reload st.b $zero, $a0, 339 -.LBB493_680: # %sqlite3VdbeAddOp2.exit1112 - # in Loop: Header=BB493_566 Depth=1 +.LBB493_679: # %sqlite3VdbeAddOp2.exit1112 + # in Loop: Header=BB493_565 Depth=1 ld.hu $a0, $s8, 14 andi $a4, $a0, 40 - ld.d $a3, $sp, 192 # 8-byte Folded Reload + ld.d $a3, $sp, 200 # 8-byte Folded Reload .p2align 4, , 16 -.LBB493_681: # Parent Loop BB493_566 Depth=1 +.LBB493_680: # Parent Loop BB493_565 Depth=1 # => This Inner Loop Header: Depth=2 ld.bu $a0, $s8, 16 andi $a1, $a0, 4 - bnez $a1, .LBB493_686 -# %bb.682: # in Loop: Header=BB493_681 Depth=2 + bnez $a1, .LBB493_685 +# %bb.681: # in Loop: Header=BB493_680 Depth=2 ld.w $a1, $a3, 12 - beqz $a1, .LBB493_684 -# %bb.683: # in Loop: Header=BB493_681 Depth=2 + beqz $a1, .LBB493_683 +# %bb.682: # in Loop: Header=BB493_680 Depth=2 ld.d $a1, $s8, 0 ld.hu $a1, $a1, 2 andi $a1, $a1, 1 - beqz $a1, .LBB493_686 -.LBB493_684: # in Loop: Header=BB493_681 Depth=2 + beqz $a1, .LBB493_685 +.LBB493_683: # in Loop: Header=BB493_680 Depth=2 ld.h $a1, $s8, 8 addi.d $a0, $a0, 4 st.b $a0, $s8, 16 - bltz $a1, .LBB493_686 -# %bb.685: # in Loop: Header=BB493_681 Depth=2 + bltz $a1, .LBB493_685 +# %bb.684: # in Loop: Header=BB493_680 Depth=2 ld.d $a0, $s8, 24 ld.d $a0, $a0, 24 slli.d $a2, $a1, 5 @@ -110290,39 +110245,39 @@ sqlite3WhereBegin: # @sqlite3WhereBegin addi.d $a0, $a0, -1 andi $a1, $a0, 255 st.b $a0, $s8, 17 - beqz $a1, .LBB493_681 -.LBB493_686: # %disableTerm.exit1117.thread.loopexit - # in Loop: Header=BB493_566 Depth=1 + beqz $a1, .LBB493_680 +.LBB493_685: # %disableTerm.exit1117.thread.loopexit + # in Loop: Header=BB493_565 Depth=1 ori $a0, $zero, 58 - st.d $a0, $sp, 312 # 8-byte Folded Spill - ld.d $s8, $sp, 320 # 8-byte Folded Reload - ld.d $a3, $sp, 400 # 8-byte Folded Reload - b .LBB493_756 -.LBB493_687: # in Loop: Header=BB493_566 Depth=1 + st.d $a0, $sp, 320 # 8-byte Folded Spill + ld.d $s8, $sp, 336 # 8-byte Folded Reload + ld.d $a3, $sp, 408 # 8-byte Folded Reload + b .LBB493_755 +.LBB493_686: # in Loop: Header=BB493_565 Depth=1 move $fp, $zero move $s4, $zero ld.d $a6, $sp, 464 # 8-byte Folded Reload -.LBB493_688: # 
%findTerm.exit1022 - # in Loop: Header=BB493_566 Depth=1 +.LBB493_687: # %findTerm.exit1022 + # in Loop: Header=BB493_565 Depth=1 sltu $s3, $zero, $s3 masknez $a0, $fp, $s3 maskeqz $a1, $s4, $s3 or $s0, $a1, $a0 - beqz $s0, .LBB493_697 -# %bb.689: # in Loop: Header=BB493_566 Depth=1 + beqz $s0, .LBB493_696 +# %bb.688: # in Loop: Header=BB493_565 Depth=1 ld.d $s1, $s0, 0 ld.bu $a0, $s8, 37 ld.d $a1, $s1, 24 - beqz $a0, .LBB493_709 -# %bb.690: # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_708 +# %bb.689: # in Loop: Header=BB493_565 Depth=1 addi.d $a0, $a0, -1 andi $a2, $a0, 255 slli.d $a2, $a2, 2 - ld.d $a3, $sp, 376 # 8-byte Folded Reload + ld.d $a3, $sp, 384 # 8-byte Folded Reload ldx.w $s2, $a3, $a2 st.b $a0, $s8, 37 - b .LBB493_710 -.LBB493_691: # in Loop: Header=BB493_566 Depth=1 + b .LBB493_709 +.LBB493_690: # in Loop: Header=BB493_565 Depth=1 sltui $a0, $s3, 1 move $s5, $a1 slli.w $a1, $s3, 1 @@ -110335,15 +110290,15 @@ sqlite3WhereBegin: # @sqlite3WhereBegin alsl.w $a1, $s4, $a1, 3 pcaddu18i $ra, %call36(sqlite3_realloc) jirl $ra, $ra, 0 - beqz $a0, .LBB493_773 -# %bb.692: # %sqlite3DbRealloc.exit.i.i - # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_772 +# %bb.691: # %sqlite3DbRealloc.exit.i.i + # in Loop: Header=BB493_565 Depth=1 st.w $s4, $s5, 28 st.d $a0, $s5, 32 move $a1, $s5 - ld.d $s5, $sp, 448 # 8-byte Folded Reload - bge $s3, $s4, .LBB493_645 -# %bb.693: # in Loop: Header=BB493_566 Depth=1 + ld.d $s5, $sp, 440 # 8-byte Folded Reload + bge $s3, $s4, .LBB493_644 +# %bb.692: # in Loop: Header=BB493_565 Depth=1 ori $a2, $zero, 24 mul.d $a1, $s3, $a2 add.d $a0, $a0, $a1 @@ -110352,9 +110307,9 @@ sqlite3WhereBegin: # @sqlite3WhereBegin move $a1, $zero pcaddu18i $ra, %call36(memset) jirl $ra, $ra, 0 - ld.d $a1, $sp, 272 # 8-byte Folded Reload - b .LBB493_645 -.LBB493_694: # in Loop: Header=BB493_566 Depth=1 + ld.d $a1, $sp, 280 # 8-byte Folded Reload + b .LBB493_644 +.LBB493_693: # in Loop: Header=BB493_565 Depth=1 sltui $a0, $s1, 1 move $s5, $a1 slli.w $a1, $s1, 1 @@ -110367,15 +110322,15 @@ sqlite3WhereBegin: # @sqlite3WhereBegin alsl.w $a1, $s4, $a1, 3 pcaddu18i $ra, %call36(sqlite3_realloc) jirl $ra, $ra, 0 - beqz $a0, .LBB493_774 -# %bb.695: # %sqlite3DbRealloc.exit.i.i982 - # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_773 +# %bb.694: # %sqlite3DbRealloc.exit.i.i982 + # in Loop: Header=BB493_565 Depth=1 st.w $s4, $s5, 28 st.d $a0, $s5, 32 move $a1, $s5 - ld.d $s5, $sp, 448 # 8-byte Folded Reload - bge $s1, $s4, .LBB493_649 -# %bb.696: # in Loop: Header=BB493_566 Depth=1 + ld.d $s5, $sp, 440 # 8-byte Folded Reload + bge $s1, $s4, .LBB493_648 +# %bb.695: # in Loop: Header=BB493_565 Depth=1 ori $a2, $zero, 24 mul.d $a1, $s1, $a2 add.d $a0, $a0, $a1 @@ -110384,31 +110339,31 @@ sqlite3WhereBegin: # @sqlite3WhereBegin move $a1, $zero pcaddu18i $ra, %call36(memset) jirl $ra, $ra, 0 - ld.d $a1, $sp, 272 # 8-byte Folded Reload - b .LBB493_649 -.LBB493_697: # in Loop: Header=BB493_566 Depth=1 - ld.d $a1, $sp, 272 # 8-byte Folded Reload + ld.d $a1, $sp, 280 # 8-byte Folded Reload + b .LBB493_648 +.LBB493_696: # in Loop: Header=BB493_565 Depth=1 + ld.d $a1, $sp, 280 # 8-byte Folded Reload ld.w $s0, $a1, 24 ld.w $s1, $a1, 28 move $a0, $s0 - blt $s0, $s1, .LBB493_701 -# %bb.698: # in Loop: Header=BB493_566 Depth=1 + blt $s0, $s1, .LBB493_700 +# %bb.697: # in Loop: Header=BB493_565 Depth=1 ld.d $s2, $a1, 0 ld.bu $a0, $s2, 42 - beqz $a0, .LBB493_869 -.LBB493_699: # %resizeOpArray.exit.i.i1055 - # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_868 
+.LBB493_698: # %resizeOpArray.exit.i.i1055 + # in Loop: Header=BB493_565 Depth=1 ld.d $a0, $a1, 0 ld.bu $a0, $a0, 42 - ld.d $s1, $sp, 192 # 8-byte Folded Reload - ld.d $s5, $sp, 448 # 8-byte Folded Reload + ld.d $s1, $sp, 200 # 8-byte Folded Reload + ld.d $s5, $sp, 440 # 8-byte Folded Reload ld.d $a6, $sp, 464 # 8-byte Folded Reload - bnez $a0, .LBB493_734 -# %bb.700: # %resizeOpArray.exit._crit_edge.i.i1058 - # in Loop: Header=BB493_566 Depth=1 - ld.d $a1, $sp, 272 # 8-byte Folded Reload + bnez $a0, .LBB493_733 +# %bb.699: # %resizeOpArray.exit._crit_edge.i.i1058 + # in Loop: Header=BB493_565 Depth=1 + ld.d $a1, $sp, 280 # 8-byte Folded Reload ld.w $a0, $a1, 24 -.LBB493_701: # in Loop: Header=BB493_566 Depth=1 +.LBB493_700: # in Loop: Header=BB493_565 Depth=1 move $a5, $a1 ld.d $a1, $a1, 32 addi.d $a0, $a0, 1 @@ -110423,16 +110378,16 @@ sqlite3WhereBegin: # @sqlite3WhereBegin or $a3, $a4, $a3 stx.b $a3, $a1, $a0 st.w $a6, $a2, 4 - ld.d $a0, $sp, 416 # 8-byte Folded Reload + ld.d $a0, $sp, 424 # 8-byte Folded Reload st.w $a0, $a2, 8 st.w $zero, $a2, 12 st.d $zero, $a2, 16 st.b $zero, $a2, 1 st.b $zero, $a5, 339 - ld.d $s1, $sp, 192 # 8-byte Folded Reload - b .LBB493_734 -.LBB493_702: # in Loop: Header=BB493_566 Depth=1 - ld.d $s1, $sp, 192 # 8-byte Folded Reload + ld.d $s1, $sp, 200 # 8-byte Folded Reload + b .LBB493_733 +.LBB493_701: # in Loop: Header=BB493_565 Depth=1 + ld.d $s1, $sp, 200 # 8-byte Folded Reload ld.w $s4, $s1, 60 addi.d $a2, $sp, 472 ori $a4, $zero, 1 @@ -110442,19 +110397,19 @@ sqlite3WhereBegin: # @sqlite3WhereBegin pcaddu18i $ra, %call36(codeAllEqualityTerms) jirl $ra, $ra, 0 ld.w $a1, $s1, 36 - st.d $a1, $sp, 432 # 8-byte Folded Spill + st.d $a1, $sp, 448 # 8-byte Folded Spill move $a5, $a0 ld.d $a0, $sp, 40 # 8-byte Folded Reload - st.d $s0, $sp, 360 # 8-byte Folded Spill - bne $a0, $s7, .LBB493_743 -# %bb.703: # in Loop: Header=BB493_566 Depth=1 + st.d $s0, $sp, 368 # 8-byte Folded Spill + bne $a0, $s7, .LBB493_742 +# %bb.702: # in Loop: Header=BB493_565 Depth=1 ld.bu $a0, $s1, 6 andi $a0, $a0, 16 - beqz $a0, .LBB493_743 -# %bb.704: # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_742 +# %bb.703: # in Loop: Header=BB493_565 Depth=1 ld.w $a0, $s2, 8 - bge $s4, $a0, .LBB493_743 -# %bb.705: # in Loop: Header=BB493_566 Depth=1 + bge $s4, $a0, .LBB493_742 +# %bb.704: # in Loop: Header=BB493_565 Depth=1 ld.d $a0, $sp, 32 # 8-byte Folded Reload ld.d $a0, $a0, 16 ld.d $a0, $a0, 0 @@ -110462,18 +110417,18 @@ sqlite3WhereBegin: # @sqlite3WhereBegin ld.w $a0, $a0, 76 slli.d $a2, $s4, 2 ldx.w $a1, $a1, $a2 - bne $a0, $a1, .LBB493_743 -# %bb.706: # %.thread1599 - # in Loop: Header=BB493_566 Depth=1 + bne $a0, $a1, .LBB493_742 +# %bb.705: # %.thread1599 + # in Loop: Header=BB493_565 Depth=1 ld.w $a4, $s1, 8 move $s1, $s4 - ld.d $s4, $sp, 272 # 8-byte Folded Reload + ld.d $s4, $sp, 280 # 8-byte Folded Reload move $a0, $s4 move $a1, $s1 move $a2, $s2 move $a3, $a5 move $fp, $a5 - st.d $a5, $sp, 400 # 8-byte Folded Spill + st.d $a5, $sp, 408 # 8-byte Folded Spill pcaddu18i $ra, %call36(buildIndexProbe) jirl $ra, $ra, 0 add.w $a3, $fp, $s1 @@ -110488,121 +110443,121 @@ sqlite3WhereBegin: # @sqlite3WhereBegin addi.w $a1, $s1, 1 move $a0, $s4 move $a2, $s2 - ld.d $a3, $sp, 400 # 8-byte Folded Reload + ld.d $a3, $sp, 408 # 8-byte Folded Reload move $a4, $fp pcaddu18i $ra, %call36(buildIndexProbe) jirl $ra, $ra, 0 - beqz $s3, .LBB493_1114 -# %bb.707: # %.split723 - # in Loop: Header=BB493_566 Depth=1 + beqz $s3, .LBB493_1113 +# %bb.706: # %.split723 + # in Loop: Header=BB493_565 
Depth=1 ori $a1, $zero, 94 move $a0, $s4 - ld.d $a2, $sp, 408 # 8-byte Folded Reload - ld.d $s2, $sp, 432 # 8-byte Folded Reload + ld.d $a2, $sp, 416 # 8-byte Folded Reload + ld.d $s2, $sp, 448 # 8-byte Folded Reload move $a3, $s2 move $a4, $fp pcaddu18i $ra, %call36(sqlite3VdbeAddOp3) jirl $ra, $ra, 0 - b .LBB493_750 -.LBB493_708: # in Loop: Header=BB493_566 Depth=1 + b .LBB493_749 +.LBB493_707: # in Loop: Header=BB493_565 Depth=1 st.b $s7, $s4, 42 move $a1, $s6 - ld.d $s6, $sp, 432 # 8-byte Folded Reload - b .LBB493_588 -.LBB493_709: # in Loop: Header=BB493_566 Depth=1 + ld.d $s6, $sp, 448 # 8-byte Folded Reload + b .LBB493_587 +.LBB493_708: # in Loop: Header=BB493_565 Depth=1 ld.w $a0, $s8, 88 addi.w $s2, $a0, 1 st.w $s2, $s8, 88 -.LBB493_710: # %sqlite3GetTempReg.exit1453 - # in Loop: Header=BB493_566 Depth=1 +.LBB493_709: # %sqlite3GetTempReg.exit1453 + # in Loop: Header=BB493_565 Depth=1 move $a0, $s8 move $a2, $s2 pcaddu18i $ra, %call36(sqlite3ExprCodeTarget) jirl $ra, $ra, 0 move $a5, $a0 - bne $a0, $s2, .LBB493_712 -# %bb.711: # in Loop: Header=BB493_566 Depth=1 - ld.d $a1, $sp, 272 # 8-byte Folded Reload - b .LBB493_716 -.LBB493_712: # in Loop: Header=BB493_566 Depth=1 - ld.d $a1, $sp, 272 # 8-byte Folded Reload - beqz $s2, .LBB493_715 -# %bb.713: # in Loop: Header=BB493_566 Depth=1 + bne $a0, $s2, .LBB493_711 +# %bb.710: # in Loop: Header=BB493_565 Depth=1 + ld.d $a1, $sp, 280 # 8-byte Folded Reload + b .LBB493_715 +.LBB493_711: # in Loop: Header=BB493_565 Depth=1 + ld.d $a1, $sp, 280 # 8-byte Folded Reload + beqz $s2, .LBB493_714 +# %bb.712: # in Loop: Header=BB493_565 Depth=1 ld.bu $a0, $s8, 37 ori $a2, $zero, 7 - bltu $a2, $a0, .LBB493_715 -# %bb.714: # in Loop: Header=BB493_566 Depth=1 + bltu $a2, $a0, .LBB493_714 +# %bb.713: # in Loop: Header=BB493_565 Depth=1 addi.d $a1, $a0, 1 st.b $a1, $s8, 37 - ld.d $a1, $sp, 272 # 8-byte Folded Reload + ld.d $a1, $sp, 280 # 8-byte Folded Reload slli.d $a0, $a0, 2 - ld.d $a2, $sp, 376 # 8-byte Folded Reload + ld.d $a2, $sp, 384 # 8-byte Folded Reload stx.w $s2, $a2, $a0 -.LBB493_715: # %sqlite3ExprCodeTemp.exit - # in Loop: Header=BB493_566 Depth=1 +.LBB493_714: # %sqlite3ExprCodeTemp.exit + # in Loop: Header=BB493_565 Depth=1 move $s2, $zero -.LBB493_716: # %sqlite3ExprCodeTemp.exit - # in Loop: Header=BB493_566 Depth=1 +.LBB493_715: # %sqlite3ExprCodeTemp.exit + # in Loop: Header=BB493_565 Depth=1 ld.w $s5, $a1, 24 ld.w $s7, $a1, 28 ld.bu $s6, $s1, 0 move $s1, $s5 - blt $s5, $s7, .LBB493_719 -# %bb.717: # in Loop: Header=BB493_566 Depth=1 + blt $s5, $s7, .LBB493_718 +# %bb.716: # in Loop: Header=BB493_565 Depth=1 ld.d $s1, $a1, 0 ld.bu $a0, $s1, 42 - beqz $a0, .LBB493_849 -.LBB493_718: # %resizeOpArray.exit.i1026 - # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_848 +.LBB493_717: # %resizeOpArray.exit.i1026 + # in Loop: Header=BB493_565 Depth=1 ld.d $a0, $a1, 0 ld.bu $a0, $a0, 42 ld.w $s1, $a1, 24 - ld.d $s8, $sp, 320 # 8-byte Folded Reload - bnez $a0, .LBB493_720 -.LBB493_719: # %resizeOpArray.exit._crit_edge.i1029 - # in Loop: Header=BB493_566 Depth=1 + ld.d $s8, $sp, 336 # 8-byte Folded Reload + bnez $a0, .LBB493_719 +.LBB493_718: # %resizeOpArray.exit._crit_edge.i1029 + # in Loop: Header=BB493_565 Depth=1 addi.d $a0, $s6, -69 sltui $a0, $a0, 2 - ld.d $a1, $sp, 272 # 8-byte Folded Reload + ld.d $a1, $sp, 280 # 8-byte Folded Reload ld.d $a1, $a1, 32 addi.d $a2, $s1, 1 - ld.d $a3, $sp, 272 # 8-byte Folded Reload + ld.d $a3, $sp, 280 # 8-byte Folded Reload st.w $a2, $a3, 24 ori $a2, $zero, 24 mul.d $a2, $s5, $a2 add.d $a3, 
$a1, $a2 ori $a4, $zero, 107 stx.h $a4, $a1, $a2 - ld.d $a1, $sp, 272 # 8-byte Folded Reload + ld.d $a1, $sp, 280 # 8-byte Folded Reload st.w $a5, $a3, 4 st.d $zero, $a3, 16 ld.w $s1, $a1, 24 - ld.d $a2, $sp, 416 # 8-byte Folded Reload + ld.d $a2, $sp, 424 # 8-byte Folded Reload st.w $a2, $a3, 8 st.w $a0, $a3, 12 st.b $zero, $a1, 339 -.LBB493_720: # %sqlite3VdbeAddOp3.exit1033 - # in Loop: Header=BB493_566 Depth=1 +.LBB493_719: # %sqlite3VdbeAddOp3.exit1033 + # in Loop: Header=BB493_565 Depth=1 ld.w $s5, $a1, 28 move $a0, $s1 - blt $s1, $s5, .LBB493_724 -# %bb.721: # in Loop: Header=BB493_566 Depth=1 + blt $s1, $s5, .LBB493_723 +# %bb.720: # in Loop: Header=BB493_565 Depth=1 ld.d $s6, $a1, 0 ld.bu $a0, $s6, 42 - beqz $a0, .LBB493_852 -.LBB493_722: # %resizeOpArray.exit.i1037 - # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_851 +.LBB493_721: # %resizeOpArray.exit.i1037 + # in Loop: Header=BB493_565 Depth=1 ld.d $a0, $a1, 0 ld.bu $a0, $a0, 42 - bnez $a0, .LBB493_725 -# %bb.723: # %resizeOpArray.exit._crit_edge.i1040 - # in Loop: Header=BB493_566 Depth=1 + bnez $a0, .LBB493_724 +# %bb.722: # %resizeOpArray.exit._crit_edge.i1040 + # in Loop: Header=BB493_565 Depth=1 ld.w $a0, $a1, 24 -.LBB493_724: # in Loop: Header=BB493_566 Depth=1 - ld.d $a1, $sp, 272 # 8-byte Folded Reload +.LBB493_723: # in Loop: Header=BB493_565 Depth=1 + ld.d $a1, $sp, 280 # 8-byte Folded Reload ld.d $a1, $a1, 32 addi.d $a0, $a0, 1 - ld.d $a2, $sp, 272 # 8-byte Folded Reload + ld.d $a2, $sp, 280 # 8-byte Folded Reload st.w $a0, $a2, 24 ori $a0, $zero, 24 mul.d $a0, $s1, $a0 @@ -110615,53 +110570,53 @@ sqlite3WhereBegin: # @sqlite3WhereBegin stx.b $a3, $a1, $a0 ld.d $a0, $sp, 464 # 8-byte Folded Reload st.w $a0, $a2, 4 - ld.d $a0, $sp, 416 # 8-byte Folded Reload + ld.d $a0, $sp, 424 # 8-byte Folded Reload st.w $a0, $a2, 8 st.w $a5, $a2, 12 st.d $zero, $a2, 16 st.b $zero, $a2, 1 - ld.d $a0, $sp, 272 # 8-byte Folded Reload + ld.d $a0, $sp, 280 # 8-byte Folded Reload st.b $zero, $a0, 339 -.LBB493_725: # %sqlite3VdbeAddOp3.exit1044 - # in Loop: Header=BB493_566 Depth=1 - beqz $s2, .LBB493_728 -# %bb.726: # in Loop: Header=BB493_566 Depth=1 +.LBB493_724: # %sqlite3VdbeAddOp3.exit1044 + # in Loop: Header=BB493_565 Depth=1 + beqz $s2, .LBB493_727 +# %bb.725: # in Loop: Header=BB493_565 Depth=1 ld.bu $a0, $s8, 37 ori $a1, $zero, 7 - bltu $a1, $a0, .LBB493_728 -# %bb.727: # in Loop: Header=BB493_566 Depth=1 + bltu $a1, $a0, .LBB493_727 +# %bb.726: # in Loop: Header=BB493_565 Depth=1 addi.d $a1, $a0, 1 st.b $a1, $s8, 37 slli.d $a0, $a0, 2 - ld.d $a1, $sp, 376 # 8-byte Folded Reload + ld.d $a1, $sp, 384 # 8-byte Folded Reload stx.w $s2, $a1, $a0 -.LBB493_728: # %sqlite3ReleaseTempReg.exit1046 - # in Loop: Header=BB493_566 Depth=1 - ld.d $s1, $sp, 192 # 8-byte Folded Reload - ld.d $s6, $sp, 432 # 8-byte Folded Reload +.LBB493_727: # %sqlite3ReleaseTempReg.exit1046 + # in Loop: Header=BB493_565 Depth=1 + ld.d $s1, $sp, 200 # 8-byte Folded Reload + ld.d $s6, $sp, 448 # 8-byte Folded Reload ori $s7, $zero, 1 - ld.d $s5, $sp, 448 # 8-byte Folded Reload + ld.d $s5, $sp, 440 # 8-byte Folded Reload ld.d $a6, $sp, 464 # 8-byte Folded Reload .p2align 4, , 16 -.LBB493_729: # Parent Loop BB493_566 Depth=1 +.LBB493_728: # Parent Loop BB493_565 Depth=1 # => This Inner Loop Header: Depth=2 ld.bu $a0, $s0, 16 andi $a1, $a0, 4 - bnez $a1, .LBB493_734 -# %bb.730: # in Loop: Header=BB493_729 Depth=2 + bnez $a1, .LBB493_733 +# %bb.729: # in Loop: Header=BB493_728 Depth=2 ld.w $a1, $s1, 12 - beqz $a1, .LBB493_732 -# %bb.731: # in Loop: 
Header=BB493_729 Depth=2 + beqz $a1, .LBB493_731 +# %bb.730: # in Loop: Header=BB493_728 Depth=2 ld.d $a1, $s0, 0 ld.hu $a1, $a1, 2 andi $a1, $a1, 1 - beqz $a1, .LBB493_734 -.LBB493_732: # in Loop: Header=BB493_729 Depth=2 + beqz $a1, .LBB493_733 +.LBB493_731: # in Loop: Header=BB493_728 Depth=2 ld.h $a1, $s0, 8 addi.d $a0, $a0, 4 st.b $a0, $s0, 16 - bltz $a1, .LBB493_734 -# %bb.733: # in Loop: Header=BB493_729 Depth=2 + bltz $a1, .LBB493_733 +# %bb.732: # in Loop: Header=BB493_728 Depth=2 ld.d $a0, $s0, 24 ld.d $a0, $a0, 24 slli.d $a2, $a1, 5 @@ -110671,14 +110626,14 @@ sqlite3WhereBegin: # @sqlite3WhereBegin addi.d $a0, $a0, -1 andi $a1, $a0, 255 st.b $a0, $s0, 17 - beqz $a1, .LBB493_729 -.LBB493_734: # %disableTerm.exit1051 - # in Loop: Header=BB493_566 Depth=1 + beqz $a1, .LBB493_728 +.LBB493_733: # %disableTerm.exit1051 + # in Loop: Header=BB493_565 Depth=1 masknez $a0, $s4, $s3 maskeqz $a1, $fp, $s3 or $s0, $a1, $a0 - beqz $s0, .LBB493_742 -# %bb.735: # in Loop: Header=BB493_566 Depth=1 + beqz $s0, .LBB493_741 +# %bb.734: # in Loop: Header=BB493_565 Depth=1 move $a2, $s1 ld.d $s1, $s0, 0 ld.w $a0, $s8, 88 @@ -110690,20 +110645,20 @@ sqlite3WhereBegin: # @sqlite3WhereBegin move $a2, $fp pcaddu18i $ra, %call36(sqlite3ExprCodeTarget) jirl $ra, $ra, 0 - beq $a0, $fp, .LBB493_1070 -# %bb.736: # in Loop: Header=BB493_566 Depth=1 + beq $a0, $fp, .LBB493_1069 +# %bb.735: # in Loop: Header=BB493_565 Depth=1 ld.d $s2, $s8, 24 - beqz $s2, .LBB493_1070 -# %bb.737: # in Loop: Header=BB493_566 Depth=1 + beqz $s2, .LBB493_1069 +# %bb.736: # in Loop: Header=BB493_565 Depth=1 ld.w $s4, $s2, 24 ld.w $s5, $s2, 28 move $a1, $s4 - blt $s4, $s5, .LBB493_1069 -# %bb.738: # in Loop: Header=BB493_566 Depth=1 + blt $s4, $s5, .LBB493_1068 +# %bb.737: # in Loop: Header=BB493_565 Depth=1 ld.d $s7, $s2, 0 ld.bu $a1, $s7, 42 - bnez $a1, .LBB493_1067 -# %bb.739: # in Loop: Header=BB493_566 Depth=1 + bnez $a1, .LBB493_1066 +# %bb.738: # in Loop: Header=BB493_565 Depth=1 move $s6, $a0 sltui $a0, $s5, 1 slli.w $a1, $s5, 1 @@ -110716,15 +110671,15 @@ sqlite3WhereBegin: # @sqlite3WhereBegin alsl.w $a1, $s8, $a1, 3 pcaddu18i $ra, %call36(sqlite3_realloc) jirl $ra, $ra, 0 - beqz $a0, .LBB493_1065 -# %bb.740: # %sqlite3DbRealloc.exit.i.i.i1463 - # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_1064 +# %bb.739: # %sqlite3DbRealloc.exit.i.i.i1463 + # in Loop: Header=BB493_565 Depth=1 move $a1, $a0 st.w $s8, $s2, 28 st.d $a0, $s2, 32 move $a0, $s6 - bge $s5, $s8, .LBB493_1067 -# %bb.741: # in Loop: Header=BB493_566 Depth=1 + bge $s5, $s8, .LBB493_1066 +# %bb.740: # in Loop: Header=BB493_565 Depth=1 ori $a2, $zero, 24 mul.d $a0, $s5, $a2 add.d $a0, $a1, $a0 @@ -110733,10 +110688,10 @@ sqlite3WhereBegin: # @sqlite3WhereBegin move $a1, $zero pcaddu18i $ra, %call36(memset) jirl $ra, $ra, 0 - b .LBB493_1066 -.LBB493_742: # %disableTerm.exit1070 - # in Loop: Header=BB493_566 Depth=1 - ld.d $a0, $sp, 272 # 8-byte Folded Reload + b .LBB493_1065 +.LBB493_741: # %disableTerm.exit1070 + # in Loop: Header=BB493_565 Depth=1 + ld.d $a0, $sp, 280 # 8-byte Folded Reload ld.w $a0, $a0, 24 ori $a1, $zero, 102 masknez $a1, $a1, $s3 @@ -110744,129 +110699,129 @@ sqlite3WhereBegin: # @sqlite3WhereBegin maskeqz $a2, $a2, $s3 or $a1, $a2, $a1 st.w $a1, $s1, 48 - b .LBB493_815 -.LBB493_743: # in Loop: Header=BB493_566 Depth=1 - ld.d $a1, $sp, 272 # 8-byte Folded Reload + b .LBB493_814 +.LBB493_742: # in Loop: Header=BB493_565 Depth=1 + ld.d $a1, $sp, 280 # 8-byte Folded Reload ld.w $fp, $a1, 24 ld.w $s0, $a1, 28 ld.w $s8, $s1, 8 move $a0, $fp 
- blt $fp, $s0, .LBB493_747 -# %bb.744: # in Loop: Header=BB493_566 Depth=1 - st.d $s4, $sp, 416 # 8-byte Folded Spill + blt $fp, $s0, .LBB493_746 +# %bb.743: # in Loop: Header=BB493_565 Depth=1 + st.d $s4, $sp, 424 # 8-byte Folded Spill ld.d $s1, $a1, 0 ld.bu $a0, $s1, 42 - beqz $a0, .LBB493_878 -.LBB493_745: # %resizeOpArray.exit.i.i1293 - # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_877 +.LBB493_744: # %resizeOpArray.exit.i.i1293 + # in Loop: Header=BB493_565 Depth=1 ld.d $a0, $a1, 0 ld.bu $a0, $a0, 42 - bnez $a0, .LBB493_748 -# %bb.746: # %resizeOpArray.exit._crit_edge.i.i1295 - # in Loop: Header=BB493_566 Depth=1 + bnez $a0, .LBB493_747 +# %bb.745: # %resizeOpArray.exit._crit_edge.i.i1295 + # in Loop: Header=BB493_565 Depth=1 ld.w $a0, $a1, 24 - ld.d $s4, $sp, 416 # 8-byte Folded Reload -.LBB493_747: # in Loop: Header=BB493_566 Depth=1 - ld.d $a1, $sp, 272 # 8-byte Folded Reload + ld.d $s4, $sp, 424 # 8-byte Folded Reload +.LBB493_746: # in Loop: Header=BB493_565 Depth=1 + ld.d $a1, $sp, 280 # 8-byte Folded Reload ld.d $a1, $a1, 32 addi.d $a0, $a0, 1 - ld.d $a2, $sp, 272 # 8-byte Folded Reload + ld.d $a2, $sp, 280 # 8-byte Folded Reload st.w $a0, $a2, 24 ori $a0, $zero, 24 mul.d $a0, $fp, $a0 add.d $a2, $a1, $a0 ori $a3, $zero, 84 stx.h $a3, $a1, $a0 - ld.d $a1, $sp, 272 # 8-byte Folded Reload + ld.d $a1, $sp, 280 # 8-byte Folded Reload st.w $a5, $a2, 4 st.w $s4, $a2, 8 st.w $s8, $a2, 12 st.d $zero, $a2, 16 st.b $zero, $a1, 339 -.LBB493_748: # in Loop: Header=BB493_566 Depth=1 +.LBB493_747: # in Loop: Header=BB493_565 Depth=1 move $a0, $a1 move $s4, $a1 move $a1, $s2 pcaddu18i $ra, %call36(sqlite3IndexAffinityStr) jirl $ra, $ra, 0 - beqz $s3, .LBB493_855 -# %bb.749: # %.split - # in Loop: Header=BB493_566 Depth=1 + beqz $s3, .LBB493_854 +# %bb.748: # %.split + # in Loop: Header=BB493_565 Depth=1 ori $a1, $zero, 63 move $a0, $s4 - ld.d $a2, $sp, 408 # 8-byte Folded Reload - ld.d $s2, $sp, 432 # 8-byte Folded Reload + ld.d $a2, $sp, 416 # 8-byte Folded Reload + ld.d $s2, $sp, 448 # 8-byte Folded Reload move $a3, $s2 move $a4, $s8 pcaddu18i $ra, %call36(sqlite3VdbeAddOp3) jirl $ra, $ra, 0 - ld.d $s8, $sp, 320 # 8-byte Folded Reload -.LBB493_750: # in Loop: Header=BB493_566 Depth=1 + ld.d $s8, $sp, 336 # 8-byte Folded Reload +.LBB493_749: # in Loop: Header=BB493_565 Depth=1 ld.d $a6, $sp, 464 # 8-byte Folded Reload ld.w $fp, $s4, 24 ld.w $s1, $s4, 28 - ld.d $a0, $sp, 192 # 8-byte Folded Reload + ld.d $a0, $sp, 200 # 8-byte Folded Reload ld.w $s0, $a0, 8 move $a0, $fp - blt $fp, $s1, .LBB493_964 -# %bb.751: # in Loop: Header=BB493_566 Depth=1 + blt $fp, $s1, .LBB493_963 +# %bb.750: # in Loop: Header=BB493_565 Depth=1 move $a1, $s4 ld.d $s2, $s4, 0 ld.bu $a0, $s2, 42 - beqz $a0, .LBB493_960 -.LBB493_752: # %resizeOpArray.exit.i1303 - # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_959 +.LBB493_751: # %resizeOpArray.exit.i1303 + # in Loop: Header=BB493_565 Depth=1 ld.d $a0, $a1, 0 ld.bu $a0, $a0, 42 ld.d $a6, $sp, 464 # 8-byte Folded Reload - beqz $a0, .LBB493_963 -# %bb.753: # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_962 +# %bb.752: # in Loop: Header=BB493_565 Depth=1 move $fp, $zero - b .LBB493_965 -.LBB493_754: # %disableTerm.exit1117 - # in Loop: Header=BB493_566 Depth=1 + b .LBB493_964 +.LBB493_753: # %disableTerm.exit1117 + # in Loop: Header=BB493_565 Depth=1 slt $a1, $zero, $a3 or $a0, $a1, $a0 andi $a0, $a0, 1 - beqz $a0, .LBB493_860 -# %bb.755: # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_859 +# %bb.754: # in Loop: Header=BB493_565 Depth=1 
ori $a0, $zero, 22 masknez $a0, $a0, $a1 ori $a2, $zero, 58 maskeqz $a1, $a2, $a1 or $a0, $a1, $a0 - st.d $a0, $sp, 312 # 8-byte Folded Spill + st.d $a0, $sp, 320 # 8-byte Folded Spill ori $a4, $zero, 1 -.LBB493_756: # %disableTerm.exit1117.thread - # in Loop: Header=BB493_566 Depth=1 - ld.d $a1, $sp, 272 # 8-byte Folded Reload +.LBB493_755: # %disableTerm.exit1117.thread + # in Loop: Header=BB493_565 Depth=1 + ld.d $a1, $sp, 280 # 8-byte Folded Reload ld.w $s1, $a1, 24 ld.w $s5, $a1, 28 sltu $a0, $zero, $s0 - ld.d $a2, $sp, 384 # 8-byte Folded Reload + ld.d $a2, $sp, 392 # 8-byte Folded Reload or $a0, $a0, $a2 add.d $s0, $s0, $a3 - beqz $a0, .LBB493_758 -# %bb.757: # in Loop: Header=BB493_566 Depth=1 - st.d $a4, $sp, 296 # 8-byte Folded Spill - b .LBB493_763 -.LBB493_758: # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_757 +# %bb.756: # in Loop: Header=BB493_565 Depth=1 + st.d $a4, $sp, 304 # 8-byte Folded Spill + b .LBB493_762 +.LBB493_757: # in Loop: Header=BB493_565 Depth=1 move $a0, $s1 move $a4, $a1 - blt $s1, $s5, .LBB493_761 -# %bb.759: # in Loop: Header=BB493_566 Depth=1 + blt $s1, $s5, .LBB493_760 +# %bb.758: # in Loop: Header=BB493_565 Depth=1 ld.d $s6, $a4, 0 ld.bu $a0, $s6, 42 - beqz $a0, .LBB493_883 -.LBB493_760: # %resizeOpArray.exit.i.i1121 - # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_882 +.LBB493_759: # %resizeOpArray.exit.i.i1121 + # in Loop: Header=BB493_565 Depth=1 ld.d $a0, $a4, 0 ld.bu $a1, $a0, 42 ld.w $a0, $a4, 24 - bnez $a1, .LBB493_762 -.LBB493_761: # %resizeOpArray.exit._crit_edge.i.i1124 - # in Loop: Header=BB493_566 Depth=1 - ld.d $a1, $sp, 392 # 8-byte Folded Reload + bnez $a1, .LBB493_761 +.LBB493_760: # %resizeOpArray.exit._crit_edge.i.i1124 + # in Loop: Header=BB493_565 Depth=1 + ld.d $a1, $sp, 400 # 8-byte Folded Reload add.d $a1, $s0, $a1 ld.d $a2, $a4, 32 addi.d $a0, $a0, 1 @@ -110882,138 +110837,138 @@ sqlite3WhereBegin: # @sqlite3WhereBegin st.w $a1, $a3, 8 st.w $zero, $a3, 12 st.b $zero, $a4, 339 -.LBB493_762: # %sqlite3VdbeAddOp2.exit1128 - # in Loop: Header=BB493_566 Depth=1 +.LBB493_761: # %sqlite3VdbeAddOp2.exit1128 + # in Loop: Header=BB493_565 Depth=1 ld.w $s5, $a4, 28 - st.d $zero, $sp, 296 # 8-byte Folded Spill + st.d $zero, $sp, 304 # 8-byte Folded Spill addi.d $s0, $s0, 1 move $s1, $a0 move $a1, $a4 -.LBB493_763: # in Loop: Header=BB493_566 Depth=1 - ld.d $a0, $sp, 192 # 8-byte Folded Reload +.LBB493_762: # in Loop: Header=BB493_565 Depth=1 + ld.d $a0, $sp, 200 # 8-byte Folded Reload ld.w $s6, $a0, 8 move $a0, $s1 - blt $s1, $s5, .LBB493_767 -# %bb.764: # in Loop: Header=BB493_566 Depth=1 + blt $s1, $s5, .LBB493_766 +# %bb.763: # in Loop: Header=BB493_565 Depth=1 ld.d $s7, $a1, 0 ld.bu $a0, $s7, 42 - beqz $a0, .LBB493_866 -.LBB493_765: # %resizeOpArray.exit.i.i1132 - # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_865 +.LBB493_764: # %resizeOpArray.exit.i.i1132 + # in Loop: Header=BB493_565 Depth=1 ld.d $a0, $a1, 0 ld.bu $a0, $a0, 42 - ld.d $s8, $sp, 320 # 8-byte Folded Reload - bnez $a0, .LBB493_768 -# %bb.766: # %resizeOpArray.exit._crit_edge.i.i1134 - # in Loop: Header=BB493_566 Depth=1 + ld.d $s8, $sp, 336 # 8-byte Folded Reload + bnez $a0, .LBB493_767 +# %bb.765: # %resizeOpArray.exit._crit_edge.i.i1134 + # in Loop: Header=BB493_565 Depth=1 ld.w $a0, $a1, 24 -.LBB493_767: # in Loop: Header=BB493_566 Depth=1 - ld.d $a1, $sp, 272 # 8-byte Folded Reload +.LBB493_766: # in Loop: Header=BB493_565 Depth=1 + ld.d $a1, $sp, 280 # 8-byte Folded Reload ld.d $a1, $a1, 32 addi.d $a0, $a0, 1 - ld.d $a2, $sp, 272 # 8-byte 
Folded Reload + ld.d $a2, $sp, 280 # 8-byte Folded Reload st.w $a0, $a2, 24 ori $a0, $zero, 24 mul.d $a0, $s1, $a0 add.d $a2, $a1, $a0 ori $a3, $zero, 84 stx.h $a3, $a1, $a0 - ld.d $a1, $sp, 272 # 8-byte Folded Reload - ld.d $a0, $sp, 392 # 8-byte Folded Reload + ld.d $a1, $sp, 280 # 8-byte Folded Reload + ld.d $a0, $sp, 400 # 8-byte Folded Reload st.w $a0, $a2, 4 st.w $s0, $a2, 8 st.w $s6, $a2, 12 st.d $zero, $a2, 16 st.b $zero, $a1, 339 -.LBB493_768: # %buildIndexProbe.exit - # in Loop: Header=BB493_566 Depth=1 +.LBB493_767: # %buildIndexProbe.exit + # in Loop: Header=BB493_565 Depth=1 move $a0, $a1 move $fp, $a1 move $a1, $s2 pcaddu18i $ra, %call36(sqlite3IndexAffinityStr) jirl $ra, $ra, 0 - beqz $s3, .LBB493_772 -# %bb.769: # in Loop: Header=BB493_566 Depth=1 + beqz $s3, .LBB493_771 +# %bb.768: # in Loop: Header=BB493_565 Depth=1 ld.w $s1, $fp, 24 ld.w $s5, $fp, 28 - ld.d $a0, $sp, 192 # 8-byte Folded Reload + ld.d $a0, $sp, 200 # 8-byte Folded Reload ld.w $s0, $a0, 8 move $a0, $s1 move $a5, $fp - ld.d $s6, $sp, 432 # 8-byte Folded Reload - blt $s1, $s5, .LBB493_890 -# %bb.770: # in Loop: Header=BB493_566 Depth=1 + ld.d $s6, $sp, 448 # 8-byte Folded Reload + blt $s1, $s5, .LBB493_889 +# %bb.769: # in Loop: Header=BB493_565 Depth=1 ld.d $s6, $a5, 0 ld.bu $a0, $s6, 42 - beqz $a0, .LBB493_886 -.LBB493_771: # %resizeOpArray.exit.i1141 - # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_885 +.LBB493_770: # %resizeOpArray.exit.i1141 + # in Loop: Header=BB493_565 Depth=1 ld.d $a0, $a5, 0 ld.bu $a0, $a0, 42 - beqz $a0, .LBB493_889 -.LBB493_772: # in Loop: Header=BB493_566 Depth=1 - ld.d $s6, $sp, 432 # 8-byte Folded Reload + beqz $a0, .LBB493_888 +.LBB493_771: # in Loop: Header=BB493_565 Depth=1 + ld.d $s6, $sp, 448 # 8-byte Folded Reload ori $s7, $zero, 1 - ld.d $s5, $sp, 448 # 8-byte Folded Reload - b .LBB493_892 -.LBB493_773: # in Loop: Header=BB493_566 Depth=1 + ld.d $s5, $sp, 440 # 8-byte Folded Reload + b .LBB493_891 +.LBB493_772: # in Loop: Header=BB493_565 Depth=1 st.b $s7, $s0, 42 move $a1, $s5 - ld.d $s5, $sp, 448 # 8-byte Folded Reload - b .LBB493_645 -.LBB493_774: # in Loop: Header=BB493_566 Depth=1 + ld.d $s5, $sp, 440 # 8-byte Folded Reload + b .LBB493_644 +.LBB493_773: # in Loop: Header=BB493_565 Depth=1 st.b $s7, $s3, 42 move $a1, $s5 - ld.d $s5, $sp, 448 # 8-byte Folded Reload - b .LBB493_649 -.LBB493_775: # in Loop: Header=BB493_566 Depth=1 - ld.d $a4, $sp, 400 # 8-byte Folded Reload - b .LBB493_777 -.LBB493_776: # in Loop: Header=BB493_566 Depth=1 - ld.d $s5, $sp, 384 # 8-byte Folded Reload + ld.d $s5, $sp, 440 # 8-byte Folded Reload + b .LBB493_648 +.LBB493_774: # in Loop: Header=BB493_565 Depth=1 + ld.d $a4, $sp, 408 # 8-byte Folded Reload + b .LBB493_776 +.LBB493_775: # in Loop: Header=BB493_565 Depth=1 + ld.d $s5, $sp, 392 # 8-byte Folded Reload .p2align 4, , 16 -.LBB493_777: # %sqlite3ExprCode.exit._crit_edge.loopexit - # in Loop: Header=BB493_566 Depth=1 +.LBB493_776: # %sqlite3ExprCode.exit._crit_edge.loopexit + # in Loop: Header=BB493_565 Depth=1 addi.d $fp, $s5, -1 - ld.d $a1, $sp, 272 # 8-byte Folded Reload -.LBB493_778: # %sqlite3ExprCode.exit._crit_edge - # in Loop: Header=BB493_566 Depth=1 + ld.d $a1, $sp, 280 # 8-byte Folded Reload +.LBB493_777: # %sqlite3ExprCode.exit._crit_edge + # in Loop: Header=BB493_565 Depth=1 ld.w $s5, $a1, 24 ld.w $s6, $a1, 28 ld.w $s1, $s4, 40 move $s0, $s5 - st.d $s8, $sp, 408 # 8-byte Folded Spill - blt $s5, $s6, .LBB493_782 -# %bb.779: # in Loop: Header=BB493_566 Depth=1 + st.d $s8, $sp, 416 # 8-byte Folded Spill + blt $s5, 
$s6, .LBB493_781 +# %bb.778: # in Loop: Header=BB493_565 Depth=1 ld.d $s0, $a1, 0 ld.bu $a0, $s0, 42 - beqz $a0, .LBB493_790 -.LBB493_780: # %resizeOpArray.exit.i.i925 - # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_789 +.LBB493_779: # %resizeOpArray.exit.i.i925 + # in Loop: Header=BB493_565 Depth=1 ld.d $a0, $a1, 0 ld.bu $a0, $a0, 42 ld.w $s0, $a1, 24 - beqz $a0, .LBB493_782 -# %bb.781: # %sqlite3VdbeAddOp2.exit932 - # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_781 +# %bb.780: # %sqlite3VdbeAddOp2.exit932 + # in Loop: Header=BB493_565 Depth=1 ld.w $s1, $a1, 28 move $s8, $a1 move $a1, $s0 - bge $s0, $s1, .LBB493_783 - b .LBB493_785 + bge $s0, $s1, .LBB493_782 + b .LBB493_784 .p2align 4, , 16 -.LBB493_782: # %resizeOpArray.exit._crit_edge.i.i928 - # in Loop: Header=BB493_566 Depth=1 +.LBB493_781: # %resizeOpArray.exit._crit_edge.i.i928 + # in Loop: Header=BB493_565 Depth=1 ld.d $a0, $a1, 32 addi.d $a1, $s0, 1 - ld.d $a2, $sp, 272 # 8-byte Folded Reload + ld.d $a2, $sp, 280 # 8-byte Folded Reload st.w $a1, $a2, 24 ori $a1, $zero, 24 mul.d $a1, $s5, $a1 add.d $a2, $a0, $a1 ori $a3, $zero, 46 stx.h $a3, $a0, $a1 - ld.d $a1, $sp, 272 # 8-byte Folded Reload + ld.d $a1, $sp, 280 # 8-byte Folded Reload st.w $s1, $a2, 4 st.d $zero, $a2, 16 ld.w $s0, $a1, 24 @@ -111023,19 +110978,19 @@ sqlite3WhereBegin: # @sqlite3WhereBegin ld.w $s1, $a1, 28 move $s8, $a1 move $a1, $s0 - blt $s0, $s1, .LBB493_785 -.LBB493_783: # in Loop: Header=BB493_566 Depth=1 + blt $s0, $s1, .LBB493_784 +.LBB493_782: # in Loop: Header=BB493_565 Depth=1 ld.d $s5, $s8, 0 ld.bu $a0, $s5, 42 - beqz $a0, .LBB493_793 -.LBB493_784: # %resizeOpArray.exit.i.i936 - # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_792 +.LBB493_783: # %resizeOpArray.exit.i.i936 + # in Loop: Header=BB493_565 Depth=1 ld.d $a0, $s8, 0 ld.bu $a0, $a0, 42 ld.w $a1, $s8, 24 - bnez $a0, .LBB493_786 -.LBB493_785: # %resizeOpArray.exit._crit_edge.i.i939 - # in Loop: Header=BB493_566 Depth=1 + bnez $a0, .LBB493_785 +.LBB493_784: # %resizeOpArray.exit._crit_edge.i.i939 + # in Loop: Header=BB493_565 Depth=1 ld.d $a0, $s8, 32 addi.d $a1, $a1, 1 st.w $a1, $s8, 24 @@ -111050,26 +111005,26 @@ sqlite3WhereBegin: # @sqlite3WhereBegin st.w $a4, $a2, 8 st.w $zero, $a2, 12 st.b $zero, $s8, 339 -.LBB493_786: # %sqlite3VdbeAddOp2.exit943 - # in Loop: Header=BB493_566 Depth=1 +.LBB493_785: # %sqlite3VdbeAddOp2.exit943 + # in Loop: Header=BB493_565 Depth=1 ld.d $a2, $s4, 48 ld.w $s0, $s8, 28 ld.w $fp, $s4, 56 move $a0, $a1 - blt $a1, $s0, .LBB493_800 -# %bb.787: # in Loop: Header=BB493_566 Depth=1 + blt $a1, $s0, .LBB493_799 +# %bb.786: # in Loop: Header=BB493_565 Depth=1 ld.d $s6, $s8, 0 ld.bu $a0, $s6, 42 - beqz $a0, .LBB493_796 -.LBB493_788: # %resizeOpArray.exit.i.i947 - # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_795 +.LBB493_787: # %resizeOpArray.exit.i.i947 + # in Loop: Header=BB493_565 Depth=1 ld.d $a0, $s8, 0 ld.bu $a0, $a0, 42 - beqz $a0, .LBB493_799 -# %bb.789: # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_798 +# %bb.788: # in Loop: Header=BB493_565 Depth=1 move $a1, $zero - b .LBB493_801 -.LBB493_790: # in Loop: Header=BB493_566 Depth=1 + b .LBB493_800 +.LBB493_789: # in Loop: Header=BB493_565 Depth=1 sltui $a0, $s6, 1 move $s8, $a1 slli.w $a1, $s6, 1 @@ -111082,16 +111037,16 @@ sqlite3WhereBegin: # @sqlite3WhereBegin alsl.w $a1, $s7, $a1, 3 pcaddu18i $ra, %call36(sqlite3_realloc) jirl $ra, $ra, 0 - beqz $a0, .LBB493_845 -# %bb.791: # %sqlite3DbRealloc.exit.i.i.i931 - # in Loop: Header=BB493_566 Depth=1 + beqz $a0, 
.LBB493_844 +# %bb.790: # %sqlite3DbRealloc.exit.i.i.i931 + # in Loop: Header=BB493_565 Depth=1 st.w $s7, $s8, 28 st.d $a0, $s8, 32 move $a1, $s8 - ld.d $s8, $sp, 408 # 8-byte Folded Reload - ld.d $a4, $sp, 400 # 8-byte Folded Reload - bge $s6, $s7, .LBB493_780 -# %bb.792: # in Loop: Header=BB493_566 Depth=1 + ld.d $s8, $sp, 416 # 8-byte Folded Reload + ld.d $a4, $sp, 408 # 8-byte Folded Reload + bge $s6, $s7, .LBB493_779 +# %bb.791: # in Loop: Header=BB493_565 Depth=1 ori $a2, $zero, 24 mul.d $a1, $s6, $a2 add.d $a0, $a0, $a1 @@ -111100,10 +111055,10 @@ sqlite3WhereBegin: # @sqlite3WhereBegin move $a1, $zero pcaddu18i $ra, %call36(memset) jirl $ra, $ra, 0 - ld.d $a4, $sp, 400 # 8-byte Folded Reload - ld.d $a1, $sp, 272 # 8-byte Folded Reload - b .LBB493_780 -.LBB493_793: # in Loop: Header=BB493_566 Depth=1 + ld.d $a4, $sp, 408 # 8-byte Folded Reload + ld.d $a1, $sp, 280 # 8-byte Folded Reload + b .LBB493_779 +.LBB493_792: # in Loop: Header=BB493_565 Depth=1 sltui $a0, $s1, 1 slli.w $a1, $s1, 1 masknez $a1, $a1, $a0 @@ -111115,14 +111070,14 @@ sqlite3WhereBegin: # @sqlite3WhereBegin alsl.w $a1, $s6, $a1, 3 pcaddu18i $ra, %call36(sqlite3_realloc) jirl $ra, $ra, 0 - beqz $a0, .LBB493_846 -# %bb.794: # %sqlite3DbRealloc.exit.i.i.i942 - # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_845 +# %bb.793: # %sqlite3DbRealloc.exit.i.i.i942 + # in Loop: Header=BB493_565 Depth=1 st.w $s6, $s8, 28 st.d $a0, $s8, 32 - ld.d $a4, $sp, 400 # 8-byte Folded Reload - bge $s1, $s6, .LBB493_784 -# %bb.795: # in Loop: Header=BB493_566 Depth=1 + ld.d $a4, $sp, 408 # 8-byte Folded Reload + bge $s1, $s6, .LBB493_783 +# %bb.794: # in Loop: Header=BB493_565 Depth=1 ori $a2, $zero, 24 mul.d $a1, $s1, $a2 add.d $a0, $a0, $a1 @@ -111131,10 +111086,10 @@ sqlite3WhereBegin: # @sqlite3WhereBegin move $a1, $zero pcaddu18i $ra, %call36(memset) jirl $ra, $ra, 0 - ld.d $a4, $sp, 400 # 8-byte Folded Reload - ld.d $s8, $sp, 272 # 8-byte Folded Reload - b .LBB493_784 -.LBB493_796: # in Loop: Header=BB493_566 Depth=1 + ld.d $a4, $sp, 408 # 8-byte Folded Reload + ld.d $s8, $sp, 280 # 8-byte Folded Reload + b .LBB493_783 +.LBB493_795: # in Loop: Header=BB493_565 Depth=1 move $s1, $a2 move $s5, $a1 sltui $a0, $s0, 1 @@ -111148,15 +111103,15 @@ sqlite3WhereBegin: # @sqlite3WhereBegin alsl.w $a1, $s7, $a1, 3 pcaddu18i $ra, %call36(sqlite3_realloc) jirl $ra, $ra, 0 - beqz $a0, .LBB493_847 -# %bb.797: # %sqlite3DbRealloc.exit.i.i.i953 - # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_846 +# %bb.796: # %sqlite3DbRealloc.exit.i.i.i953 + # in Loop: Header=BB493_565 Depth=1 st.w $s7, $s8, 28 st.d $a0, $s8, 32 move $a1, $s5 move $a2, $s1 - bge $s0, $s7, .LBB493_788 -# %bb.798: # in Loop: Header=BB493_566 Depth=1 + bge $s0, $s7, .LBB493_787 +# %bb.797: # in Loop: Header=BB493_565 Depth=1 ori $a2, $zero, 24 mul.d $a1, $s0, $a2 add.d $a0, $a0, $a1 @@ -111167,11 +111122,11 @@ sqlite3WhereBegin: # @sqlite3WhereBegin jirl $ra, $ra, 0 move $a2, $s1 move $a1, $s5 - b .LBB493_788 -.LBB493_799: # %resizeOpArray.exit._crit_edge.i.i950 - # in Loop: Header=BB493_566 Depth=1 + b .LBB493_787 +.LBB493_798: # %resizeOpArray.exit._crit_edge.i.i950 + # in Loop: Header=BB493_565 Depth=1 ld.w $a0, $s8, 24 -.LBB493_800: # in Loop: Header=BB493_566 Depth=1 +.LBB493_799: # in Loop: Header=BB493_565 Depth=1 ld.d $a3, $s8, 32 addi.d $a0, $a0, 1 st.w $a0, $s8, 24 @@ -111182,14 +111137,14 @@ sqlite3WhereBegin: # @sqlite3WhereBegin stx.h $a5, $a3, $a0 ld.d $a0, $sp, 464 # 8-byte Folded Reload st.w $a0, $a4, 4 - ld.d $a0, $sp, 416 # 8-byte Folded Reload + 
ld.d $a0, $sp, 424 # 8-byte Folded Reload st.w $a0, $a4, 8 - ld.d $a0, $sp, 408 # 8-byte Folded Reload + ld.d $a0, $sp, 416 # 8-byte Folded Reload st.w $a0, $a4, 12 st.d $zero, $a4, 16 st.b $zero, $s8, 339 -.LBB493_801: # %sqlite3VdbeAddOp4.exit954 - # in Loop: Header=BB493_566 Depth=1 +.LBB493_800: # %sqlite3VdbeAddOp4.exit954 + # in Loop: Header=BB493_565 Depth=1 sltui $a0, $fp, 1 addi.w $a3, $zero, -11 masknez $a3, $a3, $a0 @@ -111199,42 +111154,42 @@ sqlite3WhereBegin: # @sqlite3WhereBegin move $a0, $s8 pcaddu18i $ra, %call36(sqlite3VdbeChangeP4) jirl $ra, $ra, 0 - ld.d $a0, $sp, 320 # 8-byte Folded Reload + ld.d $a0, $sp, 336 # 8-byte Folded Reload ld.w $a0, $a0, 72 - ld.d $s1, $sp, 192 # 8-byte Folded Reload - ld.d $s6, $sp, 432 # 8-byte Folded Reload + ld.d $s1, $sp, 200 # 8-byte Folded Reload + ld.d $s6, $sp, 448 # 8-byte Folded Reload ori $s7, $zero, 1 - ld.d $s5, $sp, 448 # 8-byte Folded Reload - ld.d $a1, $sp, 392 # 8-byte Folded Reload - bge $a0, $a1, .LBB493_803 -# %bb.802: # in Loop: Header=BB493_566 Depth=1 - ld.d $a0, $sp, 320 # 8-byte Folded Reload + ld.d $s5, $sp, 440 # 8-byte Folded Reload + ld.d $a1, $sp, 400 # 8-byte Folded Reload + bge $a0, $a1, .LBB493_802 +# %bb.801: # in Loop: Header=BB493_565 Depth=1 + ld.d $a0, $sp, 336 # 8-byte Folded Reload st.w $a1, $a0, 72 - ld.d $a1, $sp, 408 # 8-byte Folded Reload + ld.d $a1, $sp, 416 # 8-byte Folded Reload st.w $a1, $a0, 76 -.LBB493_803: # %sqlite3ReleaseTempRange.exit - # in Loop: Header=BB493_566 Depth=1 +.LBB493_802: # %sqlite3ReleaseTempRange.exit + # in Loop: Header=BB493_565 Depth=1 ld.w $a0, $s4, 0 st.w $zero, $s4, 56 - ld.d $s8, $sp, 320 # 8-byte Folded Reload - blez $a0, .LBB493_814 -# %bb.804: # %.lr.ph1769 - # in Loop: Header=BB493_566 Depth=1 + ld.d $s8, $sp, 336 # 8-byte Folded Reload + blez $a0, .LBB493_813 +# %bb.803: # %.lr.ph1769 + # in Loop: Header=BB493_565 Depth=1 ld.d $a1, $sp, 496 move $a2, $zero - b .LBB493_806 + b .LBB493_805 .p2align 4, , 16 -.LBB493_805: # %disableTerm.exit - # in Loop: Header=BB493_806 Depth=2 +.LBB493_804: # %disableTerm.exit + # in Loop: Header=BB493_805 Depth=2 addi.d $a2, $a2, 1 - beq $a2, $a0, .LBB493_814 -.LBB493_806: # Parent Loop BB493_566 Depth=1 + beq $a2, $a0, .LBB493_813 +.LBB493_805: # Parent Loop BB493_565 Depth=1 # => This Loop Header: Depth=2 - # Child Loop BB493_808 Depth 3 + # Child Loop BB493_807 Depth 3 alsl.d $a3, $a2, $s2, 3 ld.bu $a3, $a3, 4 - beqz $a3, .LBB493_805 -# %bb.807: # in Loop: Header=BB493_806 Depth=2 + beqz $a3, .LBB493_804 +# %bb.806: # in Loop: Header=BB493_805 Depth=2 slli.d $a3, $a2, 3 alsl.d $a3, $a2, $a3, 2 add.d $a3, $s3, $a3 @@ -111243,29 +111198,29 @@ sqlite3WhereBegin: # @sqlite3WhereBegin alsl.d $a3, $a3, $a4, 4 add.d $a3, $a1, $a3 .p2align 4, , 16 -.LBB493_808: # %tailrecurse.i - # Parent Loop BB493_566 Depth=1 - # Parent Loop BB493_806 Depth=2 +.LBB493_807: # %tailrecurse.i + # Parent Loop BB493_565 Depth=1 + # Parent Loop BB493_805 Depth=2 # => This Inner Loop Header: Depth=3 - beqz $a3, .LBB493_805 -# %bb.809: # in Loop: Header=BB493_808 Depth=3 + beqz $a3, .LBB493_804 +# %bb.808: # in Loop: Header=BB493_807 Depth=3 ld.bu $a4, $a3, 16 andi $a5, $a4, 4 - bnez $a5, .LBB493_805 -# %bb.810: # in Loop: Header=BB493_808 Depth=3 + bnez $a5, .LBB493_804 +# %bb.809: # in Loop: Header=BB493_807 Depth=3 ld.w $a5, $s1, 12 - beqz $a5, .LBB493_812 -# %bb.811: # in Loop: Header=BB493_808 Depth=3 + beqz $a5, .LBB493_811 +# %bb.810: # in Loop: Header=BB493_807 Depth=3 ld.d $a5, $a3, 0 ld.hu $a5, $a5, 2 andi $a5, $a5, 1 - beqz $a5, .LBB493_805 
-.LBB493_812: # in Loop: Header=BB493_808 Depth=3 + beqz $a5, .LBB493_804 +.LBB493_811: # in Loop: Header=BB493_807 Depth=3 ld.h $a5, $a3, 8 addi.d $a4, $a4, 4 st.b $a4, $a3, 16 - bltz $a5, .LBB493_805 -# %bb.813: # in Loop: Header=BB493_808 Depth=3 + bltz $a5, .LBB493_804 +# %bb.812: # in Loop: Header=BB493_807 Depth=3 ld.d $a3, $a3, 24 ld.d $a3, $a3, 24 slli.d $a4, $a5, 5 @@ -111275,58 +111230,58 @@ sqlite3WhereBegin: # @sqlite3WhereBegin addi.d $a4, $a4, -1 andi $a5, $a4, 255 st.b $a4, $a3, 17 - beqz $a5, .LBB493_808 - b .LBB493_805 + beqz $a5, .LBB493_807 + b .LBB493_804 .p2align 4, , 16 -.LBB493_814: # %._crit_edge1770 - # in Loop: Header=BB493_566 Depth=1 - ld.d $a0, $sp, 272 # 8-byte Folded Reload +.LBB493_813: # %._crit_edge1770 + # in Loop: Header=BB493_565 Depth=1 + ld.d $a0, $sp, 280 # 8-byte Folded Reload ld.w $a0, $a0, 24 st.w $s7, $s1, 48 ld.d $a6, $sp, 464 # 8-byte Folded Reload -.LBB493_815: # %sqlite3VdbeAddOp2.exit1362 - # in Loop: Header=BB493_566 Depth=1 +.LBB493_814: # %sqlite3VdbeAddOp2.exit1362 + # in Loop: Header=BB493_565 Depth=1 st.w $a6, $s1, 52 -.LBB493_816: # %sqlite3VdbeAddOp2.exit1362 - # in Loop: Header=BB493_566 Depth=1 +.LBB493_815: # %sqlite3VdbeAddOp2.exit1362 + # in Loop: Header=BB493_565 Depth=1 st.w $a0, $s1, 56 -.LBB493_817: # %sqlite3ReleaseTempReg.exit1098 - # in Loop: Header=BB493_566 Depth=1 +.LBB493_816: # %sqlite3ReleaseTempReg.exit1098 + # in Loop: Header=BB493_565 Depth=1 ld.w $a1, $sp, 984 addi.w $a0, $zero, -1 - blez $a1, .LBB493_821 -# %bb.818: # %.lr.ph.i1365 - # in Loop: Header=BB493_566 Depth=1 + blez $a1, .LBB493_820 +# %bb.817: # %.lr.ph.i1365 + # in Loop: Header=BB493_565 Depth=1 move $a2, $zero addi.d $a3, $sp, 988 .p2align 4, , 16 -.LBB493_819: # Parent Loop BB493_566 Depth=1 +.LBB493_818: # Parent Loop BB493_565 Depth=1 # => This Inner Loop Header: Depth=2 ld.w $a4, $a3, 0 - beq $a4, $a6, .LBB493_822 -# %bb.820: # in Loop: Header=BB493_819 Depth=2 + beq $a4, $a6, .LBB493_821 +# %bb.819: # in Loop: Header=BB493_818 Depth=2 addi.d $a2, $a2, 1 addi.d $a3, $a3, 4 - bne $a1, $a2, .LBB493_819 -.LBB493_821: # %getMask.exit1370 - # in Loop: Header=BB493_566 Depth=1 + bne $a1, $a2, .LBB493_818 +.LBB493_820: # %getMask.exit1370 + # in Loop: Header=BB493_565 Depth=1 ld.w $a1, $sp, 488 and $s6, $a0, $s6 - bgtz $a1, .LBB493_823 - b .LBB493_830 + bgtz $a1, .LBB493_822 + b .LBB493_829 .p2align 4, , 16 -.LBB493_822: # in Loop: Header=BB493_566 Depth=1 +.LBB493_821: # in Loop: Header=BB493_565 Depth=1 sll.d $a0, $s7, $a2 nor $a0, $a0, $zero ld.w $a1, $sp, 488 and $s6, $a0, $s6 - blez $a1, .LBB493_830 -.LBB493_823: # %.lr.ph1773 - # in Loop: Header=BB493_566 Depth=1 + blez $a1, .LBB493_829 +.LBB493_822: # %.lr.ph1773 + # in Loop: Header=BB493_565 Depth=1 ld.d $fp, $sp, 496 addi.d $s0, $a1, 1 - b .LBB493_826 -.LBB493_824: # in Loop: Header=BB493_826 Depth=2 + b .LBB493_825 +.LBB493_823: # in Loop: Header=BB493_825 Depth=2 ori $a3, $zero, 8 move $a0, $s8 move $a2, $s5 @@ -111336,58 +111291,58 @@ sqlite3WhereBegin: # @sqlite3WhereBegin ori $a0, $a0, 4 st.b $a0, $fp, 16 .p2align 4, , 16 -.LBB493_825: # in Loop: Header=BB493_826 Depth=2 +.LBB493_824: # in Loop: Header=BB493_825 Depth=2 addi.w $s0, $s0, -1 addi.d $fp, $fp, 48 - bge $s7, $s0, .LBB493_830 -.LBB493_826: # Parent Loop BB493_566 Depth=1 + bge $s7, $s0, .LBB493_829 +.LBB493_825: # Parent Loop BB493_565 Depth=1 # => This Inner Loop Header: Depth=2 ld.bu $a0, $fp, 16 andi $a0, $a0, 6 - bnez $a0, .LBB493_825 -# %bb.827: # in Loop: Header=BB493_826 Depth=2 + bnez $a0, .LBB493_824 +# %bb.826: # in 
Loop: Header=BB493_825 Depth=2 ld.d $a0, $fp, 40 and $a0, $a0, $s6 - bnez $a0, .LBB493_825 -# %bb.828: # in Loop: Header=BB493_826 Depth=2 + bnez $a0, .LBB493_824 +# %bb.827: # in Loop: Header=BB493_825 Depth=2 ld.w $a0, $s1, 12 ld.d $a1, $fp, 0 - beqz $a0, .LBB493_824 -# %bb.829: # in Loop: Header=BB493_826 Depth=2 + beqz $a0, .LBB493_823 +# %bb.828: # in Loop: Header=BB493_825 Depth=2 ld.hu $a0, $a1, 2 andi $a0, $a0, 1 - bnez $a0, .LBB493_824 - b .LBB493_825 + bnez $a0, .LBB493_823 + b .LBB493_824 .p2align 4, , 16 -.LBB493_830: # %._crit_edge1774 - # in Loop: Header=BB493_566 Depth=1 +.LBB493_829: # %._crit_edge1774 + # in Loop: Header=BB493_565 Depth=1 ld.w $fp, $s1, 12 - beqz $fp, .LBB493_565 -# %bb.831: # in Loop: Header=BB493_566 Depth=1 - ld.d $a1, $sp, 272 # 8-byte Folded Reload + beqz $fp, .LBB493_564 +# %bb.830: # in Loop: Header=BB493_565 Depth=1 + ld.d $a1, $sp, 280 # 8-byte Folded Reload ld.w $s0, $a1, 24 move $a0, $s1 ld.w $s1, $a1, 28 st.w $s0, $a0, 44 move $a0, $s0 - blt $s0, $s1, .LBB493_835 -# %bb.832: # in Loop: Header=BB493_566 Depth=1 + blt $s0, $s1, .LBB493_834 +# %bb.831: # in Loop: Header=BB493_565 Depth=1 ld.d $s2, $a1, 0 ld.bu $a0, $s2, 42 - beqz $a0, .LBB493_842 -.LBB493_833: # %resizeOpArray.exit.i.i1374 - # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_841 +.LBB493_832: # %resizeOpArray.exit.i.i1374 + # in Loop: Header=BB493_565 Depth=1 ld.d $a0, $a1, 0 ld.bu $a0, $a0, 42 - bnez $a0, .LBB493_836 -# %bb.834: # %resizeOpArray.exit._crit_edge.i.i1377 - # in Loop: Header=BB493_566 Depth=1 + bnez $a0, .LBB493_835 +# %bb.833: # %resizeOpArray.exit._crit_edge.i.i1377 + # in Loop: Header=BB493_565 Depth=1 ld.w $a0, $a1, 24 -.LBB493_835: # in Loop: Header=BB493_566 Depth=1 - ld.d $a1, $sp, 272 # 8-byte Folded Reload +.LBB493_834: # in Loop: Header=BB493_565 Depth=1 + ld.d $a1, $sp, 280 # 8-byte Folded Reload ld.d $a1, $a1, 32 addi.d $a0, $a0, 1 - ld.d $a2, $sp, 272 # 8-byte Folded Reload + ld.d $a2, $sp, 280 # 8-byte Folded Reload st.w $a0, $a2, 24 ori $a0, $zero, 24 mul.d $a0, $s0, $a0 @@ -111398,34 +111353,34 @@ sqlite3WhereBegin: # @sqlite3WhereBegin st.w $fp, $a2, 8 st.w $zero, $a2, 12 st.d $zero, $a2, 16 - ld.d $a0, $sp, 272 # 8-byte Folded Reload + ld.d $a0, $sp, 280 # 8-byte Folded Reload st.b $zero, $a0, 339 -.LBB493_836: # %sqlite3VdbeAddOp2.exit1381 - # in Loop: Header=BB493_566 Depth=1 +.LBB493_835: # %sqlite3VdbeAddOp2.exit1381 + # in Loop: Header=BB493_565 Depth=1 ld.w $a0, $sp, 488 - ld.d $s1, $sp, 192 # 8-byte Folded Reload - blez $a0, .LBB493_565 -# %bb.837: # %.lr.ph1777.preheader - # in Loop: Header=BB493_566 Depth=1 + ld.d $s1, $sp, 200 # 8-byte Folded Reload + blez $a0, .LBB493_564 +# %bb.836: # %.lr.ph1777.preheader + # in Loop: Header=BB493_565 Depth=1 ld.d $fp, $sp, 496 move $s0, $zero - b .LBB493_839 + b .LBB493_838 .p2align 4, , 16 -.LBB493_838: # in Loop: Header=BB493_839 Depth=2 +.LBB493_837: # in Loop: Header=BB493_838 Depth=2 addi.w $s0, $s0, 1 addi.d $fp, $fp, 48 - bge $s0, $a0, .LBB493_565 -.LBB493_839: # %.lr.ph1777 - # Parent Loop BB493_566 Depth=1 + bge $s0, $a0, .LBB493_564 +.LBB493_838: # %.lr.ph1777 + # Parent Loop BB493_565 Depth=1 # => This Inner Loop Header: Depth=2 ld.bu $a1, $fp, 16 andi $a1, $a1, 6 - bnez $a1, .LBB493_838 -# %bb.840: # in Loop: Header=BB493_839 Depth=2 + bnez $a1, .LBB493_837 +# %bb.839: # in Loop: Header=BB493_838 Depth=2 ld.d $a1, $fp, 40 and $a1, $a1, $s6 - bnez $a1, .LBB493_838 -# %bb.841: # in Loop: Header=BB493_839 Depth=2 + bnez $a1, .LBB493_837 +# %bb.840: # in Loop: Header=BB493_838 Depth=2 
ld.d $a1, $fp, 0 ori $a3, $zero, 8 move $a0, $s8 @@ -111436,8 +111391,8 @@ sqlite3WhereBegin: # @sqlite3WhereBegin ld.w $a0, $sp, 488 ori $a1, $a1, 4 st.b $a1, $fp, 16 - b .LBB493_838 -.LBB493_842: # in Loop: Header=BB493_566 Depth=1 + b .LBB493_837 +.LBB493_841: # in Loop: Header=BB493_565 Depth=1 sltui $a0, $s1, 1 move $s4, $a1 slli.w $a1, $s1, 1 @@ -111450,14 +111405,14 @@ sqlite3WhereBegin: # @sqlite3WhereBegin alsl.w $a1, $s3, $a1, 3 pcaddu18i $ra, %call36(sqlite3_realloc) jirl $ra, $ra, 0 - beqz $a0, .LBB493_848 -# %bb.843: # %sqlite3DbRealloc.exit.i.i.i1380 - # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_847 +# %bb.842: # %sqlite3DbRealloc.exit.i.i.i1380 + # in Loop: Header=BB493_565 Depth=1 st.w $s3, $s4, 28 st.d $a0, $s4, 32 move $a1, $s4 - bge $s1, $s3, .LBB493_833 -# %bb.844: # in Loop: Header=BB493_566 Depth=1 + bge $s1, $s3, .LBB493_832 +# %bb.843: # in Loop: Header=BB493_565 Depth=1 ori $a2, $zero, 24 mul.d $a1, $s1, $a2 add.d $a0, $a0, $a1 @@ -111466,32 +111421,32 @@ sqlite3WhereBegin: # @sqlite3WhereBegin move $a1, $zero pcaddu18i $ra, %call36(memset) jirl $ra, $ra, 0 - ld.d $a1, $sp, 272 # 8-byte Folded Reload - b .LBB493_833 -.LBB493_845: # in Loop: Header=BB493_566 Depth=1 + ld.d $a1, $sp, 280 # 8-byte Folded Reload + b .LBB493_832 +.LBB493_844: # in Loop: Header=BB493_565 Depth=1 ori $a0, $zero, 1 st.b $a0, $s0, 42 move $a1, $s8 - ld.d $s8, $sp, 408 # 8-byte Folded Reload - ld.d $a4, $sp, 400 # 8-byte Folded Reload - b .LBB493_780 -.LBB493_846: # in Loop: Header=BB493_566 Depth=1 + ld.d $s8, $sp, 416 # 8-byte Folded Reload + ld.d $a4, $sp, 408 # 8-byte Folded Reload + b .LBB493_779 +.LBB493_845: # in Loop: Header=BB493_565 Depth=1 ori $a0, $zero, 1 st.b $a0, $s5, 42 - ld.d $a4, $sp, 400 # 8-byte Folded Reload - b .LBB493_784 -.LBB493_847: # in Loop: Header=BB493_566 Depth=1 + ld.d $a4, $sp, 408 # 8-byte Folded Reload + b .LBB493_783 +.LBB493_846: # in Loop: Header=BB493_565 Depth=1 ori $a0, $zero, 1 st.b $a0, $s6, 42 move $a1, $s5 move $a2, $s1 - b .LBB493_788 -.LBB493_848: # in Loop: Header=BB493_566 Depth=1 + b .LBB493_787 +.LBB493_847: # in Loop: Header=BB493_565 Depth=1 st.b $s7, $s2, 42 move $a1, $s4 - b .LBB493_833 -.LBB493_849: # in Loop: Header=BB493_566 Depth=1 - st.d $a5, $sp, 408 # 8-byte Folded Spill + b .LBB493_832 +.LBB493_848: # in Loop: Header=BB493_565 Depth=1 + st.d $a5, $sp, 416 # 8-byte Folded Spill sltui $a0, $s7, 1 move $a3, $a1 slli.w $a1, $s7, 1 @@ -111504,15 +111459,15 @@ sqlite3WhereBegin: # @sqlite3WhereBegin alsl.w $a1, $s8, $a1, 3 pcaddu18i $ra, %call36(sqlite3_realloc) jirl $ra, $ra, 0 - beqz $a0, .LBB493_881 -# %bb.850: # %sqlite3DbRealloc.exit.i.i1032 - # in Loop: Header=BB493_566 Depth=1 - ld.d $a1, $sp, 272 # 8-byte Folded Reload + beqz $a0, .LBB493_880 +# %bb.849: # %sqlite3DbRealloc.exit.i.i1032 + # in Loop: Header=BB493_565 Depth=1 + ld.d $a1, $sp, 280 # 8-byte Folded Reload st.w $s8, $a1, 28 st.d $a0, $a1, 32 - ld.d $a5, $sp, 408 # 8-byte Folded Reload - bge $s7, $s8, .LBB493_718 -# %bb.851: # in Loop: Header=BB493_566 Depth=1 + ld.d $a5, $sp, 416 # 8-byte Folded Reload + bge $s7, $s8, .LBB493_717 +# %bb.850: # in Loop: Header=BB493_565 Depth=1 ori $a2, $zero, 24 mul.d $a1, $s7, $a2 add.d $a0, $a0, $a1 @@ -111521,11 +111476,11 @@ sqlite3WhereBegin: # @sqlite3WhereBegin move $a1, $zero pcaddu18i $ra, %call36(memset) jirl $ra, $ra, 0 - ld.d $a5, $sp, 408 # 8-byte Folded Reload - ld.d $a1, $sp, 272 # 8-byte Folded Reload - b .LBB493_718 -.LBB493_852: # in Loop: Header=BB493_566 Depth=1 - st.d $a5, $sp, 408 # 8-byte Folded 
Spill + ld.d $a5, $sp, 416 # 8-byte Folded Reload + ld.d $a1, $sp, 280 # 8-byte Folded Reload + b .LBB493_717 +.LBB493_851: # in Loop: Header=BB493_565 Depth=1 + st.d $a5, $sp, 416 # 8-byte Folded Spill sltui $a0, $s5, 1 move $a3, $a1 slli.w $a1, $s5, 1 @@ -111538,15 +111493,15 @@ sqlite3WhereBegin: # @sqlite3WhereBegin alsl.w $a1, $s7, $a1, 3 pcaddu18i $ra, %call36(sqlite3_realloc) jirl $ra, $ra, 0 - beqz $a0, .LBB493_882 -# %bb.853: # %sqlite3DbRealloc.exit.i.i1043 - # in Loop: Header=BB493_566 Depth=1 - ld.d $a1, $sp, 272 # 8-byte Folded Reload + beqz $a0, .LBB493_881 +# %bb.852: # %sqlite3DbRealloc.exit.i.i1043 + # in Loop: Header=BB493_565 Depth=1 + ld.d $a1, $sp, 280 # 8-byte Folded Reload st.w $s7, $a1, 28 st.d $a0, $a1, 32 - ld.d $a5, $sp, 408 # 8-byte Folded Reload - bge $s5, $s7, .LBB493_722 -# %bb.854: # in Loop: Header=BB493_566 Depth=1 + ld.d $a5, $sp, 416 # 8-byte Folded Reload + bge $s5, $s7, .LBB493_721 +# %bb.853: # in Loop: Header=BB493_565 Depth=1 ori $a2, $zero, 24 mul.d $a1, $s5, $a2 add.d $a0, $a0, $a1 @@ -111555,69 +111510,69 @@ sqlite3WhereBegin: # @sqlite3WhereBegin move $a1, $zero pcaddu18i $ra, %call36(memset) jirl $ra, $ra, 0 - ld.d $a5, $sp, 408 # 8-byte Folded Reload - ld.d $a1, $sp, 272 # 8-byte Folded Reload - b .LBB493_722 -.LBB493_855: # %.split725 - # in Loop: Header=BB493_566 Depth=1 + ld.d $a5, $sp, 416 # 8-byte Folded Reload + ld.d $a1, $sp, 280 # 8-byte Folded Reload + b .LBB493_721 +.LBB493_854: # %.split725 + # in Loop: Header=BB493_565 Depth=1 ori $a1, $zero, 116 move $a0, $s4 - ld.d $a2, $sp, 408 # 8-byte Folded Reload - ld.d $s2, $sp, 432 # 8-byte Folded Reload + ld.d $a2, $sp, 416 # 8-byte Folded Reload + ld.d $s2, $sp, 448 # 8-byte Folded Reload move $a3, $s2 move $a4, $s8 pcaddu18i $ra, %call36(sqlite3VdbeAddOp3) jirl $ra, $ra, 0 - ld.d $s8, $sp, 320 # 8-byte Folded Reload -.LBB493_856: # in Loop: Header=BB493_566 Depth=1 + ld.d $s8, $sp, 336 # 8-byte Folded Reload +.LBB493_855: # in Loop: Header=BB493_565 Depth=1 ld.d $a6, $sp, 464 # 8-byte Folded Reload ld.w $fp, $s4, 24 ld.w $s1, $s4, 28 - ld.d $a0, $sp, 192 # 8-byte Folded Reload + ld.d $a0, $sp, 200 # 8-byte Folded Reload ld.w $s0, $a0, 8 move $a0, $fp - blt $fp, $s1, .LBB493_970 -# %bb.857: # in Loop: Header=BB493_566 Depth=1 + blt $fp, $s1, .LBB493_969 +# %bb.856: # in Loop: Header=BB493_565 Depth=1 ld.d $s2, $s4, 0 ld.bu $a0, $s2, 42 - beqz $a0, .LBB493_966 -.LBB493_858: # %resizeOpArray.exit.i1314 - # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_965 +.LBB493_857: # %resizeOpArray.exit.i1314 + # in Loop: Header=BB493_565 Depth=1 ld.d $a0, $s4, 0 ld.bu $a0, $a0, 42 ld.d $a6, $sp, 464 # 8-byte Folded Reload - beqz $a0, .LBB493_969 -# %bb.859: # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_968 +# %bb.858: # in Loop: Header=BB493_565 Depth=1 move $fp, $zero - b .LBB493_971 -.LBB493_860: # in Loop: Header=BB493_566 Depth=1 + b .LBB493_970 +.LBB493_859: # in Loop: Header=BB493_565 Depth=1 ori $a0, $zero, 1 - st.d $a0, $sp, 296 # 8-byte Folded Spill + st.d $a0, $sp, 304 # 8-byte Folded Spill ori $a0, $zero, 22 - st.d $a0, $sp, 312 # 8-byte Folded Spill - beqz $s3, .LBB493_892 -# %bb.861: # in Loop: Header=BB493_566 Depth=1 - ld.d $a5, $sp, 272 # 8-byte Folded Reload + st.d $a0, $sp, 320 # 8-byte Folded Spill + beqz $s3, .LBB493_891 +# %bb.860: # in Loop: Header=BB493_565 Depth=1 + ld.d $a5, $sp, 280 # 8-byte Folded Reload ld.w $s1, $a5, 24 ld.w $s0, $a5, 28 move $a0, $s1 - blt $s1, $s0, .LBB493_865 -# %bb.862: # in Loop: Header=BB493_566 Depth=1 + blt $s1, $s0, .LBB493_864 
+# %bb.861: # in Loop: Header=BB493_565 Depth=1 ld.d $s5, $a5, 0 ld.bu $a0, $s5, 42 - beqz $a0, .LBB493_1106 -.LBB493_863: # %resizeOpArray.exit.i.i1152 - # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_1105 +.LBB493_862: # %resizeOpArray.exit.i.i1152 + # in Loop: Header=BB493_565 Depth=1 ld.d $a0, $a5, 0 ld.bu $a0, $a0, 42 - ld.d $s6, $sp, 432 # 8-byte Folded Reload - ld.d $s5, $sp, 448 # 8-byte Folded Reload - bnez $a0, .LBB493_892 -# %bb.864: # %resizeOpArray.exit._crit_edge.i.i1155 - # in Loop: Header=BB493_566 Depth=1 - ld.d $a5, $sp, 272 # 8-byte Folded Reload + ld.d $s6, $sp, 448 # 8-byte Folded Reload + ld.d $s5, $sp, 440 # 8-byte Folded Reload + bnez $a0, .LBB493_891 +# %bb.863: # %resizeOpArray.exit._crit_edge.i.i1155 + # in Loop: Header=BB493_565 Depth=1 + ld.d $a5, $sp, 280 # 8-byte Folded Reload ld.w $a0, $a5, 24 -.LBB493_865: # in Loop: Header=BB493_566 Depth=1 +.LBB493_864: # in Loop: Header=BB493_565 Depth=1 move $s0, $zero ld.d $a1, $a5, 32 addi.d $a0, $a0, 1 @@ -111628,12 +111583,12 @@ sqlite3WhereBegin: # @sqlite3WhereBegin ori $a3, $zero, 50 stx.b $a3, $a1, $a2 ori $a1, $zero, 1 - st.d $a1, $sp, 296 # 8-byte Folded Spill + st.d $a1, $sp, 304 # 8-byte Folded Spill ori $a1, $zero, 22 - st.d $a1, $sp, 312 # 8-byte Folded Spill - ld.d $a1, $sp, 416 # 8-byte Folded Reload - b .LBB493_891 -.LBB493_866: # in Loop: Header=BB493_566 Depth=1 + st.d $a1, $sp, 320 # 8-byte Folded Spill + ld.d $a1, $sp, 424 # 8-byte Folded Reload + b .LBB493_890 +.LBB493_865: # in Loop: Header=BB493_565 Depth=1 sltui $a0, $s5, 1 move $fp, $a1 slli.w $a1, $s5, 1 @@ -111646,14 +111601,14 @@ sqlite3WhereBegin: # @sqlite3WhereBegin alsl.w $a1, $s8, $a1, 3 pcaddu18i $ra, %call36(sqlite3_realloc) jirl $ra, $ra, 0 - beqz $a0, .LBB493_952 -# %bb.867: # %sqlite3DbRealloc.exit.i.i.i1137 - # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_951 +# %bb.866: # %sqlite3DbRealloc.exit.i.i.i1137 + # in Loop: Header=BB493_565 Depth=1 st.w $s8, $fp, 28 st.d $a0, $fp, 32 move $a1, $fp - bge $s5, $s8, .LBB493_765 -# %bb.868: # in Loop: Header=BB493_566 Depth=1 + bge $s5, $s8, .LBB493_764 +# %bb.867: # in Loop: Header=BB493_565 Depth=1 ori $a2, $zero, 24 mul.d $a1, $s5, $a2 add.d $a0, $a0, $a1 @@ -111662,9 +111617,9 @@ sqlite3WhereBegin: # @sqlite3WhereBegin move $a1, $zero pcaddu18i $ra, %call36(memset) jirl $ra, $ra, 0 - ld.d $a1, $sp, 272 # 8-byte Folded Reload - b .LBB493_765 -.LBB493_869: # in Loop: Header=BB493_566 Depth=1 + ld.d $a1, $sp, 280 # 8-byte Folded Reload + b .LBB493_764 +.LBB493_868: # in Loop: Header=BB493_565 Depth=1 sltui $a0, $s1, 1 move $s6, $a1 slli.w $a1, $s1, 1 @@ -111677,15 +111632,15 @@ sqlite3WhereBegin: # @sqlite3WhereBegin alsl.w $a1, $s5, $a1, 3 pcaddu18i $ra, %call36(sqlite3_realloc) jirl $ra, $ra, 0 - beqz $a0, .LBB493_953 -# %bb.870: # %sqlite3DbRealloc.exit.i.i.i1061 - # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_952 +# %bb.869: # %sqlite3DbRealloc.exit.i.i.i1061 + # in Loop: Header=BB493_565 Depth=1 st.w $s5, $s6, 28 st.d $a0, $s6, 32 move $a1, $s6 - ld.d $s6, $sp, 432 # 8-byte Folded Reload - bge $s1, $s5, .LBB493_699 -# %bb.871: # in Loop: Header=BB493_566 Depth=1 + ld.d $s6, $sp, 448 # 8-byte Folded Reload + bge $s1, $s5, .LBB493_698 +# %bb.870: # in Loop: Header=BB493_565 Depth=1 ori $a2, $zero, 24 mul.d $a1, $s1, $a2 add.d $a0, $a0, $a1 @@ -111694,9 +111649,9 @@ sqlite3WhereBegin: # @sqlite3WhereBegin move $a1, $zero pcaddu18i $ra, %call36(memset) jirl $ra, $ra, 0 - ld.d $a1, $sp, 272 # 8-byte Folded Reload - b .LBB493_699 -.LBB493_872: # in Loop: 
Header=BB493_566 Depth=1 + ld.d $a1, $sp, 280 # 8-byte Folded Reload + b .LBB493_698 +.LBB493_871: # in Loop: Header=BB493_565 Depth=1 sltui $a0, $s5, 1 slli.w $a1, $s5, 1 masknez $a1, $a1, $a0 @@ -111708,13 +111663,13 @@ sqlite3WhereBegin: # @sqlite3WhereBegin alsl.w $a1, $s7, $a1, 3 pcaddu18i $ra, %call36(sqlite3_realloc) jirl $ra, $ra, 0 - beqz $a0, .LBB493_1054 -# %bb.873: # %sqlite3DbRealloc.exit.i.i.i1111 - # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_1053 +# %bb.872: # %sqlite3DbRealloc.exit.i.i.i1111 + # in Loop: Header=BB493_565 Depth=1 st.w $s7, $fp, 28 st.d $a0, $fp, 32 - bge $s5, $s7, .LBB493_677 -# %bb.874: # in Loop: Header=BB493_566 Depth=1 + bge $s5, $s7, .LBB493_676 +# %bb.873: # in Loop: Header=BB493_565 Depth=1 ori $a2, $zero, 24 mul.d $a1, $s5, $a2 add.d $a0, $a0, $a1 @@ -111723,9 +111678,9 @@ sqlite3WhereBegin: # @sqlite3WhereBegin move $a1, $zero pcaddu18i $ra, %call36(memset) jirl $ra, $ra, 0 - ld.d $fp, $sp, 272 # 8-byte Folded Reload - b .LBB493_677 -.LBB493_875: # in Loop: Header=BB493_566 Depth=1 + ld.d $fp, $sp, 280 # 8-byte Folded Reload + b .LBB493_676 +.LBB493_874: # in Loop: Header=BB493_565 Depth=1 sltui $a0, $s0, 1 move $s3, $a1 slli.w $a1, $s0, 1 @@ -111738,14 +111693,14 @@ sqlite3WhereBegin: # @sqlite3WhereBegin alsl.w $a1, $s2, $a1, 3 pcaddu18i $ra, %call36(sqlite3_realloc) jirl $ra, $ra, 0 - beqz $a0, .LBB493_1059 -# %bb.876: # %sqlite3DbRealloc.exit.i.i.i1361 - # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_1058 +# %bb.875: # %sqlite3DbRealloc.exit.i.i.i1361 + # in Loop: Header=BB493_565 Depth=1 st.w $s2, $s3, 28 st.d $a0, $s3, 32 move $a1, $s3 - bge $s0, $s2, .LBB493_617 -# %bb.877: # in Loop: Header=BB493_566 Depth=1 + bge $s0, $s2, .LBB493_616 +# %bb.876: # in Loop: Header=BB493_565 Depth=1 ori $a2, $zero, 24 mul.d $a1, $s0, $a2 add.d $a0, $a0, $a1 @@ -111754,10 +111709,10 @@ sqlite3WhereBegin: # @sqlite3WhereBegin move $a1, $zero pcaddu18i $ra, %call36(memset) jirl $ra, $ra, 0 - ld.d $a1, $sp, 272 # 8-byte Folded Reload - b .LBB493_617 -.LBB493_878: # in Loop: Header=BB493_566 Depth=1 - st.d $a5, $sp, 400 # 8-byte Folded Spill + ld.d $a1, $sp, 280 # 8-byte Folded Reload + b .LBB493_616 +.LBB493_877: # in Loop: Header=BB493_565 Depth=1 + st.d $a5, $sp, 408 # 8-byte Folded Spill sltui $a0, $s0, 1 move $s5, $a1 slli.w $a1, $s0, 1 @@ -111770,16 +111725,16 @@ sqlite3WhereBegin: # @sqlite3WhereBegin alsl.w $a1, $s4, $a1, 3 pcaddu18i $ra, %call36(sqlite3_realloc) jirl $ra, $ra, 0 - beqz $a0, .LBB493_1062 -# %bb.879: # %sqlite3DbRealloc.exit.i.i.i1298 - # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_1061 +# %bb.878: # %sqlite3DbRealloc.exit.i.i.i1298 + # in Loop: Header=BB493_565 Depth=1 st.w $s4, $s5, 28 st.d $a0, $s5, 32 move $a1, $s5 - ld.d $s5, $sp, 448 # 8-byte Folded Reload - ld.d $a5, $sp, 400 # 8-byte Folded Reload - bge $s0, $s4, .LBB493_745 -# %bb.880: # in Loop: Header=BB493_566 Depth=1 + ld.d $s5, $sp, 440 # 8-byte Folded Reload + ld.d $a5, $sp, 408 # 8-byte Folded Reload + bge $s0, $s4, .LBB493_744 +# %bb.879: # in Loop: Header=BB493_565 Depth=1 ori $a2, $zero, 24 mul.d $a1, $s0, $a2 add.d $a0, $a0, $a1 @@ -111788,22 +111743,22 @@ sqlite3WhereBegin: # @sqlite3WhereBegin move $a1, $zero pcaddu18i $ra, %call36(memset) jirl $ra, $ra, 0 - ld.d $a5, $sp, 400 # 8-byte Folded Reload - ld.d $a1, $sp, 272 # 8-byte Folded Reload - b .LBB493_745 -.LBB493_881: # in Loop: Header=BB493_566 Depth=1 + ld.d $a5, $sp, 408 # 8-byte Folded Reload + ld.d $a1, $sp, 280 # 8-byte Folded Reload + b .LBB493_744 +.LBB493_880: # in Loop: 
Header=BB493_565 Depth=1 ori $a0, $zero, 1 st.b $a0, $s1, 42 - ld.d $a1, $sp, 272 # 8-byte Folded Reload - ld.d $a5, $sp, 408 # 8-byte Folded Reload - b .LBB493_718 -.LBB493_882: # in Loop: Header=BB493_566 Depth=1 + ld.d $a1, $sp, 280 # 8-byte Folded Reload + ld.d $a5, $sp, 416 # 8-byte Folded Reload + b .LBB493_717 +.LBB493_881: # in Loop: Header=BB493_565 Depth=1 ori $a0, $zero, 1 st.b $a0, $s6, 42 - ld.d $a1, $sp, 272 # 8-byte Folded Reload - ld.d $a5, $sp, 408 # 8-byte Folded Reload - b .LBB493_722 -.LBB493_883: # in Loop: Header=BB493_566 Depth=1 + ld.d $a1, $sp, 280 # 8-byte Folded Reload + ld.d $a5, $sp, 416 # 8-byte Folded Reload + b .LBB493_721 +.LBB493_882: # in Loop: Header=BB493_565 Depth=1 sltui $a0, $s5, 1 move $fp, $a4 slli.w $a1, $s5, 1 @@ -111816,14 +111771,14 @@ sqlite3WhereBegin: # @sqlite3WhereBegin alsl.w $a1, $s7, $a1, 3 pcaddu18i $ra, %call36(sqlite3_realloc) jirl $ra, $ra, 0 - beqz $a0, .LBB493_1063 -# %bb.884: # %sqlite3DbRealloc.exit.i.i.i1127 - # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_1062 +# %bb.883: # %sqlite3DbRealloc.exit.i.i.i1127 + # in Loop: Header=BB493_565 Depth=1 st.w $s7, $fp, 28 st.d $a0, $fp, 32 move $a4, $fp - bge $s5, $s7, .LBB493_760 -# %bb.885: # in Loop: Header=BB493_566 Depth=1 + bge $s5, $s7, .LBB493_759 +# %bb.884: # in Loop: Header=BB493_565 Depth=1 ori $a2, $zero, 24 mul.d $a1, $s5, $a2 add.d $a0, $a0, $a1 @@ -111832,9 +111787,9 @@ sqlite3WhereBegin: # @sqlite3WhereBegin move $a1, $zero pcaddu18i $ra, %call36(memset) jirl $ra, $ra, 0 - ld.d $a4, $sp, 272 # 8-byte Folded Reload - b .LBB493_760 -.LBB493_886: # in Loop: Header=BB493_566 Depth=1 + ld.d $a4, $sp, 280 # 8-byte Folded Reload + b .LBB493_759 +.LBB493_885: # in Loop: Header=BB493_565 Depth=1 sltui $a0, $s5, 1 move $fp, $a5 slli.w $a1, $s5, 1 @@ -111847,14 +111802,14 @@ sqlite3WhereBegin: # @sqlite3WhereBegin alsl.w $a1, $s7, $a1, 3 pcaddu18i $ra, %call36(sqlite3_realloc) jirl $ra, $ra, 0 - beqz $a0, .LBB493_1064 -# %bb.887: # %sqlite3DbRealloc.exit.i.i1147 - # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_1063 +# %bb.886: # %sqlite3DbRealloc.exit.i.i1147 + # in Loop: Header=BB493_565 Depth=1 st.w $s7, $fp, 28 st.d $a0, $fp, 32 move $a5, $fp - bge $s5, $s7, .LBB493_771 -# %bb.888: # in Loop: Header=BB493_566 Depth=1 + bge $s5, $s7, .LBB493_770 +# %bb.887: # in Loop: Header=BB493_565 Depth=1 ori $a2, $zero, 24 mul.d $a1, $s5, $a2 add.d $a0, $a0, $a1 @@ -111863,14 +111818,14 @@ sqlite3WhereBegin: # @sqlite3WhereBegin move $a1, $zero pcaddu18i $ra, %call36(memset) jirl $ra, $ra, 0 - ld.d $a5, $sp, 272 # 8-byte Folded Reload - b .LBB493_771 -.LBB493_889: # %resizeOpArray.exit._crit_edge.i1144 - # in Loop: Header=BB493_566 Depth=1 + ld.d $a5, $sp, 280 # 8-byte Folded Reload + b .LBB493_770 +.LBB493_888: # %resizeOpArray.exit._crit_edge.i1144 + # in Loop: Header=BB493_565 Depth=1 ld.w $a0, $a5, 24 - ld.d $s6, $sp, 432 # 8-byte Folded Reload -.LBB493_890: # in Loop: Header=BB493_566 Depth=1 - ld.d $a1, $sp, 296 # 8-byte Folded Reload + ld.d $s6, $sp, 448 # 8-byte Folded Reload +.LBB493_889: # in Loop: Header=BB493_565 Depth=1 + ld.d $a1, $sp, 304 # 8-byte Folded Reload sltui $a1, $a1, 1 ld.d $a2, $a5, 32 addi.d $a0, $a0, 1 @@ -111884,28 +111839,28 @@ sqlite3WhereBegin: # @sqlite3WhereBegin maskeqz $a1, $a6, $a1 or $a1, $a1, $a4 stx.b $a1, $a2, $a3 - ld.d $a1, $sp, 336 # 8-byte Folded Reload + ld.d $a1, $sp, 344 # 8-byte Folded Reload ori $s7, $zero, 1 - ld.d $s5, $sp, 448 # 8-byte Folded Reload -.LBB493_891: # %sqlite3VdbeAddOp3.exit1148.sink.split - # in Loop: 
Header=BB493_566 Depth=1 - ld.d $a2, $sp, 408 # 8-byte Folded Reload + ld.d $s5, $sp, 440 # 8-byte Folded Reload +.LBB493_890: # %sqlite3VdbeAddOp3.exit1148.sink.split + # in Loop: Header=BB493_565 Depth=1 + ld.d $a2, $sp, 416 # 8-byte Folded Reload st.w $a2, $a0, 4 st.w $a1, $a0, 8 st.w $s0, $a0, 12 st.d $zero, $a0, 16 st.b $zero, $a0, 1 st.b $zero, $a5, 339 -.LBB493_892: # %sqlite3VdbeAddOp3.exit1148 - # in Loop: Header=BB493_566 Depth=1 - ld.d $a0, $sp, 352 # 8-byte Folded Reload +.LBB493_891: # %sqlite3VdbeAddOp3.exit1148 + # in Loop: Header=BB493_565 Depth=1 + ld.d $a0, $sp, 360 # 8-byte Folded Reload masknez $a0, $a0, $s4 - ld.d $a1, $sp, 344 # 8-byte Folded Reload + ld.d $a1, $sp, 352 # 8-byte Folded Reload maskeqz $a1, $a1, $s4 or $s0, $a1, $a0 - beqz $s0, .LBB493_908 -# %bb.893: # in Loop: Header=BB493_566 Depth=1 - st.d $s3, $sp, 288 # 8-byte Folded Spill + beqz $s0, .LBB493_907 +# %bb.892: # in Loop: Header=BB493_565 Depth=1 + st.d $s3, $sp, 296 # 8-byte Folded Spill ld.d $a0, $s2, 16 ld.d $a1, $sp, 328 # 8-byte Folded Reload ldx.w $a2, $a0, $a1 @@ -111925,47 +111880,47 @@ sqlite3WhereBegin: # @sqlite3WhereBegin ld.d $a0, $a0, 0 ld.d $a1, $a0, 24 move $a0, $a2 - ld.d $s3, $sp, 304 # 8-byte Folded Reload + ld.d $s3, $sp, 312 # 8-byte Folded Reload move $a2, $s3 pcaddu18i $ra, %call36(sqlite3ExprCodeTarget) jirl $ra, $ra, 0 - beq $a0, $s3, .LBB493_896 -# %bb.894: # in Loop: Header=BB493_566 Depth=1 + beq $a0, $s3, .LBB493_895 +# %bb.893: # in Loop: Header=BB493_565 Depth=1 move $a2, $a0 - ld.d $a0, $sp, 320 # 8-byte Folded Reload + ld.d $a0, $sp, 336 # 8-byte Folded Reload ld.d $a0, $a0, 24 - beqz $a0, .LBB493_896 -# %bb.895: # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_895 +# %bb.894: # in Loop: Header=BB493_565 Depth=1 ori $a1, $zero, 7 move $a3, $s3 pcaddu18i $ra, %call36(sqlite3VdbeAddOp2) jirl $ra, $ra, 0 -.LBB493_896: # %sqlite3ExprCode.exit1162 - # in Loop: Header=BB493_566 Depth=1 - ld.d $fp, $sp, 272 # 8-byte Folded Reload +.LBB493_895: # %sqlite3ExprCode.exit1162 + # in Loop: Header=BB493_565 Depth=1 + ld.d $fp, $sp, 280 # 8-byte Folded Reload ld.w $s1, $fp, 24 ld.w $s4, $fp, 28 move $a0, $s1 - blt $s1, $s4, .LBB493_900 -# %bb.897: # in Loop: Header=BB493_566 Depth=1 + blt $s1, $s4, .LBB493_899 +# %bb.896: # in Loop: Header=BB493_565 Depth=1 ld.d $s5, $fp, 0 ld.bu $a0, $s5, 42 - beqz $a0, .LBB493_928 -.LBB493_898: # %resizeOpArray.exit.i.i1166 - # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_927 +.LBB493_897: # %resizeOpArray.exit.i.i1166 + # in Loop: Header=BB493_565 Depth=1 ld.d $a0, $fp, 0 ld.bu $a0, $a0, 42 - ld.d $s6, $sp, 432 # 8-byte Folded Reload - ld.d $s5, $sp, 448 # 8-byte Folded Reload - bnez $a0, .LBB493_901 -# %bb.899: # %resizeOpArray.exit._crit_edge.i.i1169 - # in Loop: Header=BB493_566 Depth=1 + ld.d $s6, $sp, 448 # 8-byte Folded Reload + ld.d $s5, $sp, 440 # 8-byte Folded Reload + bnez $a0, .LBB493_900 +# %bb.898: # %resizeOpArray.exit._crit_edge.i.i1169 + # in Loop: Header=BB493_565 Depth=1 ld.w $a0, $fp, 24 -.LBB493_900: # in Loop: Header=BB493_566 Depth=1 - ld.d $a1, $sp, 272 # 8-byte Folded Reload +.LBB493_899: # in Loop: Header=BB493_565 Depth=1 + ld.d $a1, $sp, 280 # 8-byte Folded Reload ld.d $a1, $a1, 32 addi.d $a0, $a0, 1 - ld.d $a2, $sp, 272 # 8-byte Folded Reload + ld.d $a2, $sp, 280 # 8-byte Folded Reload st.w $a0, $a2, 24 ori $a0, $zero, 24 mul.d $a0, $s1, $a0 @@ -111973,37 +111928,37 @@ sqlite3WhereBegin: # @sqlite3WhereBegin ori $a3, $zero, 65 stx.h $a3, $a1, $a0 st.w $s3, $a2, 4 - ld.d $a0, $sp, 336 # 8-byte Folded 
Reload + ld.d $a0, $sp, 344 # 8-byte Folded Reload st.w $a0, $a2, 8 st.w $zero, $a2, 12 st.d $zero, $a2, 16 - ld.d $a0, $sp, 272 # 8-byte Folded Reload + ld.d $a0, $sp, 280 # 8-byte Folded Reload st.b $zero, $a0, 339 -.LBB493_901: # %sqlite3VdbeAddOp2.exit1173 - # in Loop: Header=BB493_566 Depth=1 +.LBB493_900: # %sqlite3VdbeAddOp2.exit1173 + # in Loop: Header=BB493_565 Depth=1 ld.hu $a0, $s8, 14 andi $fp, $a0, 40 - ld.d $a3, $sp, 192 # 8-byte Folded Reload + ld.d $a3, $sp, 200 # 8-byte Folded Reload .p2align 4, , 16 -.LBB493_902: # Parent Loop BB493_566 Depth=1 +.LBB493_901: # Parent Loop BB493_565 Depth=1 # => This Inner Loop Header: Depth=2 ld.bu $a0, $s8, 16 andi $a1, $a0, 4 - bnez $a1, .LBB493_907 -# %bb.903: # in Loop: Header=BB493_902 Depth=2 + bnez $a1, .LBB493_906 +# %bb.902: # in Loop: Header=BB493_901 Depth=2 ld.w $a1, $a3, 12 - beqz $a1, .LBB493_905 -# %bb.904: # in Loop: Header=BB493_902 Depth=2 + beqz $a1, .LBB493_904 +# %bb.903: # in Loop: Header=BB493_901 Depth=2 ld.d $a1, $s8, 0 ld.hu $a1, $a1, 2 andi $a1, $a1, 1 - beqz $a1, .LBB493_907 -.LBB493_905: # in Loop: Header=BB493_902 Depth=2 + beqz $a1, .LBB493_906 +.LBB493_904: # in Loop: Header=BB493_901 Depth=2 ld.h $a1, $s8, 8 addi.d $a0, $a0, 4 st.b $a0, $s8, 16 - bltz $a1, .LBB493_907 -# %bb.906: # in Loop: Header=BB493_902 Depth=2 + bltz $a1, .LBB493_906 +# %bb.905: # in Loop: Header=BB493_901 Depth=2 ld.d $a0, $s8, 24 ld.d $a0, $a0, 24 slli.d $a2, $a1, 5 @@ -112013,81 +111968,81 @@ sqlite3WhereBegin: # @sqlite3WhereBegin addi.d $a0, $a0, -1 andi $a1, $a0, 255 st.b $a0, $s8, 17 - beqz $a1, .LBB493_902 -.LBB493_907: # %disableTerm.exit1178.loopexit - # in Loop: Header=BB493_566 Depth=1 - ld.d $s8, $sp, 320 # 8-byte Folded Reload - ld.d $s3, $sp, 288 # 8-byte Folded Reload - b .LBB493_909 -.LBB493_908: # in Loop: Header=BB493_566 Depth=1 + beqz $a1, .LBB493_901 +.LBB493_906: # %disableTerm.exit1178.loopexit + # in Loop: Header=BB493_565 Depth=1 + ld.d $s8, $sp, 336 # 8-byte Folded Reload + ld.d $s3, $sp, 296 # 8-byte Folded Reload + b .LBB493_908 +.LBB493_907: # in Loop: Header=BB493_565 Depth=1 ori $fp, $zero, 1 - ld.d $a3, $sp, 192 # 8-byte Folded Reload -.LBB493_909: # %disableTerm.exit1178 - # in Loop: Header=BB493_566 Depth=1 + ld.d $a3, $sp, 200 # 8-byte Folded Reload +.LBB493_908: # %disableTerm.exit1178 + # in Loop: Header=BB493_565 Depth=1 addi.w $a0, $s3, 0 sltu $s4, $zero, $a0 - ld.d $a1, $sp, 272 # 8-byte Folded Reload - ld.d $a0, $sp, 400 # 8-byte Folded Reload - bgtz $a0, .LBB493_917 + ld.d $a1, $sp, 280 # 8-byte Folded Reload + ld.d $a0, $sp, 408 # 8-byte Folded Reload + bgtz $a0, .LBB493_916 +# %bb.909: # %disableTerm.exit1178 + # in Loop: Header=BB493_565 Depth=1 + bnez $s0, .LBB493_916 # %bb.910: # %disableTerm.exit1178 - # in Loop: Header=BB493_566 Depth=1 - bnez $s0, .LBB493_917 -# %bb.911: # %disableTerm.exit1178 - # in Loop: Header=BB493_566 Depth=1 - ld.d $a0, $sp, 384 # 8-byte Folded Reload + # in Loop: Header=BB493_565 Depth=1 + ld.d $a0, $sp, 392 # 8-byte Folded Reload or $a0, $a0, $s4 - beqz $a0, .LBB493_917 -# %bb.912: # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_916 +# %bb.911: # in Loop: Header=BB493_565 Depth=1 ld.w $s2, $a1, 24 - bnez $s3, .LBB493_1012 -# %bb.913: # in Loop: Header=BB493_566 Depth=1 + bnez $s3, .LBB493_1011 +# %bb.912: # in Loop: Header=BB493_565 Depth=1 ld.w $s0, $a1, 28 move $a0, $s2 - blt $s2, $s0, .LBB493_1001 -# %bb.914: # in Loop: Header=BB493_566 Depth=1 + blt $s2, $s0, .LBB493_1000 +# %bb.913: # in Loop: Header=BB493_565 Depth=1 ld.d $s1, $a1, 0 ld.bu $a0, 
$s1, 42 - beqz $a0, .LBB493_997 -.LBB493_915: # %resizeOpArray.exit.i.i1219 - # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_996 +.LBB493_914: # %resizeOpArray.exit.i.i1219 + # in Loop: Header=BB493_565 Depth=1 ld.d $a0, $a1, 0 ld.bu $a0, $a0, 42 - beqz $a0, .LBB493_1000 -# %bb.916: # in Loop: Header=BB493_566 Depth=1 - ld.d $a3, $sp, 192 # 8-byte Folded Reload - ld.d $s5, $sp, 448 # 8-byte Folded Reload - b .LBB493_1002 -.LBB493_917: # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_999 +# %bb.915: # in Loop: Header=BB493_565 Depth=1 + ld.d $a3, $sp, 200 # 8-byte Folded Reload + ld.d $s5, $sp, 440 # 8-byte Folded Reload + b .LBB493_1001 +.LBB493_916: # in Loop: Header=BB493_565 Depth=1 sltu $a0, $zero, $s0 - ld.d $a2, $sp, 384 # 8-byte Folded Reload + ld.d $a2, $sp, 392 # 8-byte Folded Reload or $a0, $a0, $a2 - ld.d $a2, $sp, 400 # 8-byte Folded Reload + ld.d $a2, $sp, 408 # 8-byte Folded Reload add.d $s0, $s0, $a2 - bnez $a0, .LBB493_924 -# %bb.918: # in Loop: Header=BB493_566 Depth=1 + bnez $a0, .LBB493_923 +# %bb.917: # in Loop: Header=BB493_565 Depth=1 ld.w $s1, $a1, 24 ld.w $s5, $a1, 28 move $a0, $s1 - blt $s1, $s5, .LBB493_922 -# %bb.919: # in Loop: Header=BB493_566 Depth=1 + blt $s1, $s5, .LBB493_921 +# %bb.918: # in Loop: Header=BB493_565 Depth=1 ld.d $s6, $a1, 0 ld.bu $a0, $s6, 42 - beqz $a0, .LBB493_954 -.LBB493_920: # %resizeOpArray.exit.i.i1182 - # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_953 +.LBB493_919: # %resizeOpArray.exit.i.i1182 + # in Loop: Header=BB493_565 Depth=1 ld.d $a0, $a1, 0 ld.bu $a0, $a0, 42 - bnez $a0, .LBB493_923 -# %bb.921: # %resizeOpArray.exit._crit_edge.i.i1185 - # in Loop: Header=BB493_566 Depth=1 + bnez $a0, .LBB493_922 +# %bb.920: # %resizeOpArray.exit._crit_edge.i.i1185 + # in Loop: Header=BB493_565 Depth=1 ld.w $a0, $a1, 24 -.LBB493_922: # in Loop: Header=BB493_566 Depth=1 - ld.d $a1, $sp, 392 # 8-byte Folded Reload +.LBB493_921: # in Loop: Header=BB493_565 Depth=1 + ld.d $a1, $sp, 400 # 8-byte Folded Reload add.d $a1, $s0, $a1 - ld.d $a2, $sp, 272 # 8-byte Folded Reload + ld.d $a2, $sp, 280 # 8-byte Folded Reload ld.d $a2, $a2, 32 addi.d $a0, $a0, 1 - ld.d $a3, $sp, 272 # 8-byte Folded Reload + ld.d $a3, $sp, 280 # 8-byte Folded Reload st.w $a0, $a3, 24 ori $a0, $zero, 24 mul.d $a0, $s1, $a0 @@ -112096,36 +112051,36 @@ sqlite3WhereBegin: # @sqlite3WhereBegin stx.h $a4, $a2, $a0 st.w $zero, $a3, 4 st.w $a1, $a3, 8 - ld.d $a1, $sp, 272 # 8-byte Folded Reload + ld.d $a1, $sp, 280 # 8-byte Folded Reload st.w $zero, $a3, 12 st.d $zero, $a3, 16 st.b $zero, $a1, 339 -.LBB493_923: # %sqlite3VdbeAddOp2.exit1189 - # in Loop: Header=BB493_566 Depth=1 +.LBB493_922: # %sqlite3VdbeAddOp2.exit1189 + # in Loop: Header=BB493_565 Depth=1 move $fp, $zero addi.d $s0, $s0, 1 - ld.d $a3, $sp, 192 # 8-byte Folded Reload -.LBB493_924: # in Loop: Header=BB493_566 Depth=1 + ld.d $a3, $sp, 200 # 8-byte Folded Reload +.LBB493_923: # in Loop: Header=BB493_565 Depth=1 st.d $s4, $sp, 328 # 8-byte Folded Spill - beqz $s3, .LBB493_926 -# %bb.925: # in Loop: Header=BB493_566 Depth=1 + beqz $s3, .LBB493_925 +# %bb.924: # in Loop: Header=BB493_565 Depth=1 ld.w $s6, $a3, 8 ori $a0, $zero, 39 - st.d $a0, $sp, 312 # 8-byte Folded Spill - b .LBB493_932 -.LBB493_926: # in Loop: Header=BB493_566 Depth=1 + st.d $a0, $sp, 320 # 8-byte Folded Spill + b .LBB493_931 +.LBB493_925: # in Loop: Header=BB493_565 Depth=1 ld.bu $a0, $s8, 37 - beqz $a0, .LBB493_931 -# %bb.927: # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_930 +# %bb.926: # in Loop: Header=BB493_565 
Depth=1 addi.d $a0, $a0, -1 andi $a1, $a0, 255 slli.d $a1, $a1, 2 - ld.d $a2, $sp, 376 # 8-byte Folded Reload + ld.d $a2, $sp, 384 # 8-byte Folded Reload ldx.w $s6, $a2, $a1 - ld.d $a1, $sp, 272 # 8-byte Folded Reload + ld.d $a1, $sp, 280 # 8-byte Folded Reload st.b $a0, $s8, 37 - b .LBB493_932 -.LBB493_928: # in Loop: Header=BB493_566 Depth=1 + b .LBB493_931 +.LBB493_927: # in Loop: Header=BB493_565 Depth=1 sltui $a0, $s4, 1 slli.w $a1, $s4, 1 masknez $a1, $a1, $a0 @@ -112137,13 +112092,13 @@ sqlite3WhereBegin: # @sqlite3WhereBegin alsl.w $a1, $s6, $a1, 3 pcaddu18i $ra, %call36(sqlite3_realloc) jirl $ra, $ra, 0 - beqz $a0, .LBB493_1055 -# %bb.929: # %sqlite3DbRealloc.exit.i.i.i1172 - # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_1054 +# %bb.928: # %sqlite3DbRealloc.exit.i.i.i1172 + # in Loop: Header=BB493_565 Depth=1 st.w $s6, $fp, 28 st.d $a0, $fp, 32 - bge $s4, $s6, .LBB493_898 -# %bb.930: # in Loop: Header=BB493_566 Depth=1 + bge $s4, $s6, .LBB493_897 +# %bb.929: # in Loop: Header=BB493_565 Depth=1 ori $a2, $zero, 24 mul.d $a1, $s4, $a2 add.d $a0, $a0, $a1 @@ -112152,95 +112107,95 @@ sqlite3WhereBegin: # @sqlite3WhereBegin move $a1, $zero pcaddu18i $ra, %call36(memset) jirl $ra, $ra, 0 - ld.d $fp, $sp, 272 # 8-byte Folded Reload - b .LBB493_898 -.LBB493_931: # in Loop: Header=BB493_566 Depth=1 + ld.d $fp, $sp, 280 # 8-byte Folded Reload + b .LBB493_897 +.LBB493_930: # in Loop: Header=BB493_565 Depth=1 ld.w $a0, $s8, 88 addi.w $s6, $a0, 1 st.w $s6, $s8, 88 -.LBB493_932: # %sqlite3GetTempReg.exit1192 - # in Loop: Header=BB493_566 Depth=1 +.LBB493_931: # %sqlite3GetTempReg.exit1192 + # in Loop: Header=BB493_565 Depth=1 ld.w $s1, $a1, 24 ld.w $s5, $a1, 28 move $a0, $s1 move $s4, $fp - blt $s1, $s5, .LBB493_936 -# %bb.933: # in Loop: Header=BB493_566 Depth=1 + blt $s1, $s5, .LBB493_935 +# %bb.932: # in Loop: Header=BB493_565 Depth=1 ld.d $s7, $a1, 0 ld.bu $a0, $s7, 42 - beqz $a0, .LBB493_947 -.LBB493_934: # %resizeOpArray.exit.i.i1196 - # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_946 +.LBB493_933: # %resizeOpArray.exit.i.i1196 + # in Loop: Header=BB493_565 Depth=1 ld.d $a0, $a1, 0 ld.bu $a0, $a0, 42 - ld.d $s8, $sp, 320 # 8-byte Folded Reload - bnez $a0, .LBB493_937 -# %bb.935: # %resizeOpArray.exit._crit_edge.i.i1198 - # in Loop: Header=BB493_566 Depth=1 + ld.d $s8, $sp, 336 # 8-byte Folded Reload + bnez $a0, .LBB493_936 +# %bb.934: # %resizeOpArray.exit._crit_edge.i.i1198 + # in Loop: Header=BB493_565 Depth=1 ld.w $a0, $a1, 24 -.LBB493_936: # in Loop: Header=BB493_566 Depth=1 - ld.d $a1, $sp, 272 # 8-byte Folded Reload +.LBB493_935: # in Loop: Header=BB493_565 Depth=1 + ld.d $a1, $sp, 280 # 8-byte Folded Reload ld.d $a1, $a1, 32 addi.d $a0, $a0, 1 - ld.d $a2, $sp, 272 # 8-byte Folded Reload + ld.d $a2, $sp, 280 # 8-byte Folded Reload st.w $a0, $a2, 24 ori $a0, $zero, 24 mul.d $a0, $s1, $a0 add.d $a2, $a1, $a0 ori $a3, $zero, 84 stx.h $a3, $a1, $a0 - ld.d $a1, $sp, 272 # 8-byte Folded Reload - ld.d $a0, $sp, 392 # 8-byte Folded Reload + ld.d $a1, $sp, 280 # 8-byte Folded Reload + ld.d $a0, $sp, 400 # 8-byte Folded Reload st.w $a0, $a2, 4 st.w $s0, $a2, 8 st.w $s6, $a2, 12 st.d $zero, $a2, 16 st.b $zero, $a1, 339 -.LBB493_937: # %buildIndexProbe.exit1202 - # in Loop: Header=BB493_566 Depth=1 +.LBB493_936: # %buildIndexProbe.exit1202 + # in Loop: Header=BB493_565 Depth=1 move $a0, $a1 move $fp, $a1 move $a1, $s2 pcaddu18i $ra, %call36(sqlite3IndexAffinityStr) jirl $ra, $ra, 0 - beqz $s3, .LBB493_939 -# %bb.938: # in Loop: Header=BB493_566 Depth=1 + beqz $s3, .LBB493_938 
+# %bb.937: # in Loop: Header=BB493_565 Depth=1 move $a1, $fp - ld.d $a3, $sp, 192 # 8-byte Folded Reload - ld.d $s6, $sp, 432 # 8-byte Folded Reload + ld.d $a3, $sp, 200 # 8-byte Folded Reload + ld.d $s6, $sp, 448 # 8-byte Folded Reload ori $s7, $zero, 1 - ld.d $s5, $sp, 448 # 8-byte Folded Reload + ld.d $s5, $sp, 440 # 8-byte Folded Reload move $fp, $s4 ld.d $s4, $sp, 328 # 8-byte Folded Reload - b .LBB493_1002 -.LBB493_939: # in Loop: Header=BB493_566 Depth=1 + b .LBB493_1001 +.LBB493_938: # in Loop: Header=BB493_565 Depth=1 ld.w $s0, $fp, 24 ld.w $s1, $fp, 28 move $a0, $s0 move $a1, $fp ori $s7, $zero, 1 - ld.d $s5, $sp, 448 # 8-byte Folded Reload + ld.d $s5, $sp, 440 # 8-byte Folded Reload move $fp, $s4 - blt $s0, $s1, .LBB493_943 -# %bb.940: # in Loop: Header=BB493_566 Depth=1 + blt $s0, $s1, .LBB493_942 +# %bb.939: # in Loop: Header=BB493_565 Depth=1 ld.d $s2, $a1, 0 ld.bu $a0, $s2, 42 - beqz $a0, .LBB493_957 -.LBB493_941: # %resizeOpArray.exit.i1206 - # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_956 +.LBB493_940: # %resizeOpArray.exit.i1206 + # in Loop: Header=BB493_565 Depth=1 ld.d $a0, $a1, 0 ld.bu $a0, $a0, 42 - ld.d $s5, $sp, 448 # 8-byte Folded Reload - bnez $a0, .LBB493_944 -# %bb.942: # %resizeOpArray.exit._crit_edge.i1209 - # in Loop: Header=BB493_566 Depth=1 + ld.d $s5, $sp, 440 # 8-byte Folded Reload + bnez $a0, .LBB493_943 +# %bb.941: # %resizeOpArray.exit._crit_edge.i1209 + # in Loop: Header=BB493_565 Depth=1 ld.w $a0, $a1, 24 -.LBB493_943: # in Loop: Header=BB493_566 Depth=1 +.LBB493_942: # in Loop: Header=BB493_565 Depth=1 sltui $a1, $fp, 1 - ld.d $a2, $sp, 272 # 8-byte Folded Reload + ld.d $a2, $sp, 280 # 8-byte Folded Reload ld.d $a2, $a2, 32 addi.d $a0, $a0, 1 - ld.d $a3, $sp, 272 # 8-byte Folded Reload + ld.d $a3, $sp, 280 # 8-byte Folded Reload st.w $a0, $a3, 24 ori $a0, $zero, 24 mul.d $a0, $s0, $a0 @@ -112251,34 +112206,34 @@ sqlite3WhereBegin: # @sqlite3WhereBegin maskeqz $a1, $a5, $a1 or $a1, $a1, $a4 stx.b $a1, $a2, $a0 - ld.d $a1, $sp, 272 # 8-byte Folded Reload - ld.d $a0, $sp, 408 # 8-byte Folded Reload + ld.d $a1, $sp, 280 # 8-byte Folded Reload + ld.d $a0, $sp, 416 # 8-byte Folded Reload st.w $a0, $a3, 4 - ld.d $a0, $sp, 336 # 8-byte Folded Reload + ld.d $a0, $sp, 344 # 8-byte Folded Reload st.w $a0, $a3, 8 st.w $s6, $a3, 12 st.d $zero, $a3, 16 st.b $zero, $a3, 1 st.b $zero, $a1, 339 -.LBB493_944: # %sqlite3VdbeAddOp3.exit1213 - # in Loop: Header=BB493_566 Depth=1 - beqz $s6, .LBB493_950 -# %bb.945: # in Loop: Header=BB493_566 Depth=1 +.LBB493_943: # %sqlite3VdbeAddOp3.exit1213 + # in Loop: Header=BB493_565 Depth=1 + beqz $s6, .LBB493_949 +# %bb.944: # in Loop: Header=BB493_565 Depth=1 ld.bu $a0, $s8, 37 - ld.d $a3, $sp, 192 # 8-byte Folded Reload + ld.d $a3, $sp, 200 # 8-byte Folded Reload ori $a2, $zero, 7 ld.d $s4, $sp, 328 # 8-byte Folded Reload - bltu $a2, $a0, .LBB493_951 -# %bb.946: # in Loop: Header=BB493_566 Depth=1 + bltu $a2, $a0, .LBB493_950 +# %bb.945: # in Loop: Header=BB493_565 Depth=1 addi.d $a1, $a0, 1 st.b $a1, $s8, 37 - ld.d $a1, $sp, 272 # 8-byte Folded Reload + ld.d $a1, $sp, 280 # 8-byte Folded Reload slli.d $a0, $a0, 2 - ld.d $a2, $sp, 376 # 8-byte Folded Reload + ld.d $a2, $sp, 384 # 8-byte Folded Reload stx.w $s6, $a2, $a0 - ld.d $s6, $sp, 432 # 8-byte Folded Reload - b .LBB493_1002 -.LBB493_947: # in Loop: Header=BB493_566 Depth=1 + ld.d $s6, $sp, 448 # 8-byte Folded Reload + b .LBB493_1001 +.LBB493_946: # in Loop: Header=BB493_565 Depth=1 sltui $a0, $s5, 1 move $fp, $a1 slli.w $a1, $s5, 1 @@ -112291,14 +112246,14 @@ 
sqlite3WhereBegin: # @sqlite3WhereBegin alsl.w $a1, $s8, $a1, 3 pcaddu18i $ra, %call36(sqlite3_realloc) jirl $ra, $ra, 0 - beqz $a0, .LBB493_1058 -# %bb.948: # %sqlite3DbRealloc.exit.i.i.i1201 - # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_1057 +# %bb.947: # %sqlite3DbRealloc.exit.i.i.i1201 + # in Loop: Header=BB493_565 Depth=1 st.w $s8, $fp, 28 st.d $a0, $fp, 32 move $a1, $fp - bge $s5, $s8, .LBB493_934 -# %bb.949: # in Loop: Header=BB493_566 Depth=1 + bge $s5, $s8, .LBB493_933 +# %bb.948: # in Loop: Header=BB493_565 Depth=1 ori $a2, $zero, 24 mul.d $a1, $s5, $a2 add.d $a0, $a0, $a1 @@ -112307,27 +112262,27 @@ sqlite3WhereBegin: # @sqlite3WhereBegin move $a1, $zero pcaddu18i $ra, %call36(memset) jirl $ra, $ra, 0 - ld.d $a1, $sp, 272 # 8-byte Folded Reload - b .LBB493_934 -.LBB493_950: # in Loop: Header=BB493_566 Depth=1 - ld.d $a3, $sp, 192 # 8-byte Folded Reload - ld.d $s6, $sp, 432 # 8-byte Folded Reload + ld.d $a1, $sp, 280 # 8-byte Folded Reload + b .LBB493_933 +.LBB493_949: # in Loop: Header=BB493_565 Depth=1 + ld.d $a3, $sp, 200 # 8-byte Folded Reload + ld.d $s6, $sp, 448 # 8-byte Folded Reload ld.d $s4, $sp, 328 # 8-byte Folded Reload - b .LBB493_1002 -.LBB493_951: # in Loop: Header=BB493_566 Depth=1 - ld.d $s6, $sp, 432 # 8-byte Folded Reload - b .LBB493_1002 -.LBB493_952: # in Loop: Header=BB493_566 Depth=1 + b .LBB493_1001 +.LBB493_950: # in Loop: Header=BB493_565 Depth=1 + ld.d $s6, $sp, 448 # 8-byte Folded Reload + b .LBB493_1001 +.LBB493_951: # in Loop: Header=BB493_565 Depth=1 ori $a0, $zero, 1 st.b $a0, $s7, 42 move $a1, $fp - b .LBB493_765 -.LBB493_953: # in Loop: Header=BB493_566 Depth=1 + b .LBB493_764 +.LBB493_952: # in Loop: Header=BB493_565 Depth=1 st.b $s7, $s2, 42 move $a1, $s6 - ld.d $s6, $sp, 432 # 8-byte Folded Reload - b .LBB493_699 -.LBB493_954: # in Loop: Header=BB493_566 Depth=1 + ld.d $s6, $sp, 448 # 8-byte Folded Reload + b .LBB493_698 +.LBB493_953: # in Loop: Header=BB493_565 Depth=1 sltui $a0, $s5, 1 move $fp, $a1 slli.w $a1, $s5, 1 @@ -112340,14 +112295,14 @@ sqlite3WhereBegin: # @sqlite3WhereBegin alsl.w $a1, $s7, $a1, 3 pcaddu18i $ra, %call36(sqlite3_realloc) jirl $ra, $ra, 0 - beqz $a0, .LBB493_1104 -# %bb.955: # %sqlite3DbRealloc.exit.i.i.i1188 - # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_1103 +# %bb.954: # %sqlite3DbRealloc.exit.i.i.i1188 + # in Loop: Header=BB493_565 Depth=1 st.w $s7, $fp, 28 st.d $a0, $fp, 32 move $a1, $fp - bge $s5, $s7, .LBB493_920 -# %bb.956: # in Loop: Header=BB493_566 Depth=1 + bge $s5, $s7, .LBB493_919 +# %bb.955: # in Loop: Header=BB493_565 Depth=1 ori $a2, $zero, 24 mul.d $a1, $s5, $a2 add.d $a0, $a0, $a1 @@ -112356,9 +112311,9 @@ sqlite3WhereBegin: # @sqlite3WhereBegin move $a1, $zero pcaddu18i $ra, %call36(memset) jirl $ra, $ra, 0 - ld.d $a1, $sp, 272 # 8-byte Folded Reload - b .LBB493_920 -.LBB493_957: # in Loop: Header=BB493_566 Depth=1 + ld.d $a1, $sp, 280 # 8-byte Folded Reload + b .LBB493_919 +.LBB493_956: # in Loop: Header=BB493_565 Depth=1 sltui $a0, $s1, 1 move $fp, $a1 slli.w $a1, $s1, 1 @@ -112371,15 +112326,15 @@ sqlite3WhereBegin: # @sqlite3WhereBegin alsl.w $a1, $s5, $a1, 3 pcaddu18i $ra, %call36(sqlite3_realloc) jirl $ra, $ra, 0 - beqz $a0, .LBB493_1105 -# %bb.958: # %sqlite3DbRealloc.exit.i.i1212 - # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_1104 +# %bb.957: # %sqlite3DbRealloc.exit.i.i1212 + # in Loop: Header=BB493_565 Depth=1 st.w $s5, $fp, 28 st.d $a0, $fp, 32 move $a1, $fp move $fp, $s4 - bge $s1, $s5, .LBB493_941 -# %bb.959: # in Loop: Header=BB493_566 Depth=1 + bge 
$s1, $s5, .LBB493_940 +# %bb.958: # in Loop: Header=BB493_565 Depth=1 ori $a2, $zero, 24 mul.d $a1, $s1, $a2 add.d $a0, $a0, $a1 @@ -112388,9 +112343,9 @@ sqlite3WhereBegin: # @sqlite3WhereBegin move $a1, $zero pcaddu18i $ra, %call36(memset) jirl $ra, $ra, 0 - ld.d $a1, $sp, 272 # 8-byte Folded Reload - b .LBB493_941 -.LBB493_960: # in Loop: Header=BB493_566 Depth=1 + ld.d $a1, $sp, 280 # 8-byte Folded Reload + b .LBB493_940 +.LBB493_959: # in Loop: Header=BB493_565 Depth=1 sltui $a0, $s1, 1 move $s4, $a1 slli.w $a1, $s1, 1 @@ -112403,14 +112358,14 @@ sqlite3WhereBegin: # @sqlite3WhereBegin alsl.w $a1, $s3, $a1, 3 pcaddu18i $ra, %call36(sqlite3_realloc) jirl $ra, $ra, 0 - beqz $a0, .LBB493_1109 -# %bb.961: # %sqlite3DbRealloc.exit.i.i1309 - # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_1108 +# %bb.960: # %sqlite3DbRealloc.exit.i.i1309 + # in Loop: Header=BB493_565 Depth=1 st.w $s3, $s4, 28 st.d $a0, $s4, 32 move $a1, $s4 - bge $s1, $s3, .LBB493_752 -# %bb.962: # in Loop: Header=BB493_566 Depth=1 + bge $s1, $s3, .LBB493_751 +# %bb.961: # in Loop: Header=BB493_565 Depth=1 ori $a2, $zero, 24 mul.d $a1, $s1, $a2 add.d $a0, $a0, $a1 @@ -112419,37 +112374,37 @@ sqlite3WhereBegin: # @sqlite3WhereBegin move $a1, $zero pcaddu18i $ra, %call36(memset) jirl $ra, $ra, 0 - ld.d $a1, $sp, 272 # 8-byte Folded Reload - b .LBB493_752 -.LBB493_963: # %resizeOpArray.exit._crit_edge.i1306 - # in Loop: Header=BB493_566 Depth=1 + ld.d $a1, $sp, 280 # 8-byte Folded Reload + b .LBB493_751 +.LBB493_962: # %resizeOpArray.exit._crit_edge.i1306 + # in Loop: Header=BB493_565 Depth=1 ld.w $a0, $a1, 24 - ld.d $s2, $sp, 432 # 8-byte Folded Reload -.LBB493_964: # in Loop: Header=BB493_566 Depth=1 - ld.d $a1, $sp, 272 # 8-byte Folded Reload + ld.d $s2, $sp, 448 # 8-byte Folded Reload +.LBB493_963: # in Loop: Header=BB493_565 Depth=1 + ld.d $a1, $sp, 280 # 8-byte Folded Reload ld.d $a1, $a1, 32 addi.d $a0, $a0, 1 - ld.d $a2, $sp, 272 # 8-byte Folded Reload + ld.d $a2, $sp, 280 # 8-byte Folded Reload st.w $a0, $a2, 24 ori $a0, $zero, 24 mul.d $a0, $fp, $a0 add.d $a2, $a1, $a0 ori $a3, $zero, 39 stx.h $a3, $a1, $a0 - ld.d $a1, $sp, 272 # 8-byte Folded Reload - ld.d $a0, $sp, 408 # 8-byte Folded Reload + ld.d $a1, $sp, 280 # 8-byte Folded Reload + ld.d $a0, $sp, 416 # 8-byte Folded Reload st.w $a0, $a2, 4 st.w $s2, $a2, 8 st.w $s0, $a2, 12 st.d $zero, $a2, 16 st.b $zero, $a1, 339 -.LBB493_965: # %sqlite3VdbeAddOp3.exit1310 - # in Loop: Header=BB493_566 Depth=1 +.LBB493_964: # %sqlite3VdbeAddOp3.exit1310 + # in Loop: Header=BB493_565 Depth=1 ori $a0, $zero, 47 - ld.d $s1, $sp, 192 # 8-byte Folded Reload - ld.d $a4, $sp, 360 # 8-byte Folded Reload - b .LBB493_974 -.LBB493_966: # in Loop: Header=BB493_566 Depth=1 + ld.d $s1, $sp, 200 # 8-byte Folded Reload + ld.d $a4, $sp, 368 # 8-byte Folded Reload + b .LBB493_973 +.LBB493_965: # in Loop: Header=BB493_565 Depth=1 sltui $a0, $s1, 1 slli.w $a1, $s1, 1 masknez $a1, $a1, $a0 @@ -112461,13 +112416,13 @@ sqlite3WhereBegin: # @sqlite3WhereBegin alsl.w $a1, $s3, $a1, 3 pcaddu18i $ra, %call36(sqlite3_realloc) jirl $ra, $ra, 0 - beqz $a0, .LBB493_1110 -# %bb.967: # %sqlite3DbRealloc.exit.i.i1320 - # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_1109 +# %bb.966: # %sqlite3DbRealloc.exit.i.i1320 + # in Loop: Header=BB493_565 Depth=1 st.w $s3, $s4, 28 st.d $a0, $s4, 32 - bge $s1, $s3, .LBB493_858 -# %bb.968: # in Loop: Header=BB493_566 Depth=1 + bge $s1, $s3, .LBB493_857 +# %bb.967: # in Loop: Header=BB493_565 Depth=1 ori $a2, $zero, 24 mul.d $a1, $s1, $a2 add.d $a0, $a0, 
$a1 @@ -112476,124 +112431,124 @@ sqlite3WhereBegin: # @sqlite3WhereBegin move $a1, $zero pcaddu18i $ra, %call36(memset) jirl $ra, $ra, 0 - ld.d $s4, $sp, 272 # 8-byte Folded Reload - b .LBB493_858 -.LBB493_969: # %resizeOpArray.exit._crit_edge.i1317 - # in Loop: Header=BB493_566 Depth=1 + ld.d $s4, $sp, 280 # 8-byte Folded Reload + b .LBB493_857 +.LBB493_968: # %resizeOpArray.exit._crit_edge.i1317 + # in Loop: Header=BB493_565 Depth=1 ld.w $a0, $s4, 24 - ld.d $s2, $sp, 432 # 8-byte Folded Reload -.LBB493_970: # in Loop: Header=BB493_566 Depth=1 - ld.d $a1, $sp, 272 # 8-byte Folded Reload + ld.d $s2, $sp, 448 # 8-byte Folded Reload +.LBB493_969: # in Loop: Header=BB493_565 Depth=1 + ld.d $a1, $sp, 280 # 8-byte Folded Reload ld.d $a1, $a1, 32 addi.d $a0, $a0, 1 - ld.d $a2, $sp, 272 # 8-byte Folded Reload + ld.d $a2, $sp, 280 # 8-byte Folded Reload st.w $a0, $a2, 24 ori $a0, $zero, 24 mul.d $a0, $fp, $a0 add.d $a2, $a1, $a0 ori $a3, $zero, 58 stx.h $a3, $a1, $a0 - ld.d $s4, $sp, 272 # 8-byte Folded Reload - ld.d $a0, $sp, 408 # 8-byte Folded Reload + ld.d $s4, $sp, 280 # 8-byte Folded Reload + ld.d $a0, $sp, 416 # 8-byte Folded Reload st.w $a0, $a2, 4 st.w $s2, $a2, 8 st.w $s0, $a2, 12 st.d $zero, $a2, 16 st.b $zero, $s4, 339 -.LBB493_971: # %sqlite3VdbeAddOp3.exit1321 - # in Loop: Header=BB493_566 Depth=1 +.LBB493_970: # %sqlite3VdbeAddOp3.exit1321 + # in Loop: Header=BB493_565 Depth=1 ld.d $a1, $s4, 32 ori $a0, $zero, 102 - ld.d $s1, $sp, 192 # 8-byte Folded Reload - ld.d $a4, $sp, 360 # 8-byte Folded Reload - beqz $a1, .LBB493_973 -# %bb.972: # in Loop: Header=BB493_566 Depth=1 + ld.d $s1, $sp, 200 # 8-byte Folded Reload + ld.d $a4, $sp, 368 # 8-byte Folded Reload + beqz $a1, .LBB493_972 +# %bb.971: # in Loop: Header=BB493_565 Depth=1 ld.w $a2, $s4, 24 ori $a3, $zero, 24 mul.d $a2, $a2, $a3 add.d $a1, $a1, $a2 st.b $s7, $a1, -21 -.LBB493_973: # %sqlite3VdbeAddOp3.exit1310 - # in Loop: Header=BB493_566 Depth=1 +.LBB493_972: # %sqlite3VdbeAddOp3.exit1310 + # in Loop: Header=BB493_565 Depth=1 move $a1, $s4 -.LBB493_974: # %sqlite3VdbeAddOp3.exit1310 - # in Loop: Header=BB493_566 Depth=1 +.LBB493_973: # %sqlite3VdbeAddOp3.exit1310 + # in Loop: Header=BB493_565 Depth=1 st.w $a0, $s1, 48 - bnez $a4, .LBB493_990 -# %bb.975: # in Loop: Header=BB493_566 Depth=1 + bnez $a4, .LBB493_989 +# %bb.974: # in Loop: Header=BB493_565 Depth=1 ld.bu $a0, $s8, 37 - beqz $a0, .LBB493_977 -# %bb.976: # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_976 +# %bb.975: # in Loop: Header=BB493_565 Depth=1 addi.d $a0, $a0, -1 andi $a1, $a0, 255 slli.d $a1, $a1, 2 - ld.d $a2, $sp, 376 # 8-byte Folded Reload + ld.d $a2, $sp, 384 # 8-byte Folded Reload ldx.w $s0, $a2, $a1 - ld.d $a1, $sp, 272 # 8-byte Folded Reload + ld.d $a1, $sp, 280 # 8-byte Folded Reload st.b $a0, $s8, 37 - b .LBB493_978 -.LBB493_977: # in Loop: Header=BB493_566 Depth=1 + b .LBB493_977 +.LBB493_976: # in Loop: Header=BB493_565 Depth=1 ld.w $a0, $s8, 88 addi.w $s0, $a0, 1 st.w $s0, $s8, 88 -.LBB493_978: # %sqlite3GetTempReg.exit1327 - # in Loop: Header=BB493_566 Depth=1 +.LBB493_977: # %sqlite3GetTempReg.exit1327 + # in Loop: Header=BB493_565 Depth=1 ld.w $s2, $a1, 24 ld.w $s3, $a1, 28 move $s1, $s2 - blt $s2, $s3, .LBB493_981 -# %bb.979: # in Loop: Header=BB493_566 Depth=1 + blt $s2, $s3, .LBB493_980 +# %bb.978: # in Loop: Header=BB493_565 Depth=1 ld.d $s1, $a1, 0 ld.bu $a0, $s1, 42 - beqz $a0, .LBB493_991 -.LBB493_980: # %resizeOpArray.exit.i.i1331 - # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_990 +.LBB493_979: # 
%resizeOpArray.exit.i.i1331 + # in Loop: Header=BB493_565 Depth=1 ld.d $a0, $a1, 0 ld.bu $a0, $a0, 42 ld.w $s1, $a1, 24 - bnez $a0, .LBB493_982 -.LBB493_981: # %resizeOpArray.exit._crit_edge.i.i1334 - # in Loop: Header=BB493_566 Depth=1 + bnez $a0, .LBB493_981 +.LBB493_980: # %resizeOpArray.exit._crit_edge.i.i1334 + # in Loop: Header=BB493_565 Depth=1 ld.d $a0, $a1, 32 addi.d $a1, $s1, 1 - ld.d $a2, $sp, 272 # 8-byte Folded Reload + ld.d $a2, $sp, 280 # 8-byte Folded Reload st.w $a1, $a2, 24 ori $a1, $zero, 24 mul.d $a1, $s2, $a1 add.d $a2, $a0, $a1 ori $a3, $zero, 52 stx.h $a3, $a0, $a1 - ld.d $a1, $sp, 272 # 8-byte Folded Reload - ld.d $a0, $sp, 408 # 8-byte Folded Reload + ld.d $a1, $sp, 280 # 8-byte Folded Reload + ld.d $a0, $sp, 416 # 8-byte Folded Reload st.w $a0, $a2, 4 st.d $zero, $a2, 16 ld.w $s1, $a1, 24 st.w $s0, $a2, 8 st.w $zero, $a2, 12 st.b $zero, $a1, 339 -.LBB493_982: # %sqlite3VdbeAddOp2.exit1338 - # in Loop: Header=BB493_566 Depth=1 +.LBB493_981: # %sqlite3VdbeAddOp2.exit1338 + # in Loop: Header=BB493_565 Depth=1 ld.w $s2, $a1, 28 move $a0, $s1 - blt $s1, $s2, .LBB493_986 -# %bb.983: # in Loop: Header=BB493_566 Depth=1 + blt $s1, $s2, .LBB493_985 +# %bb.982: # in Loop: Header=BB493_565 Depth=1 ld.d $s3, $a1, 0 ld.bu $a0, $s3, 42 - beqz $a0, .LBB493_994 -.LBB493_984: # %resizeOpArray.exit.i1342 - # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_993 +.LBB493_983: # %resizeOpArray.exit.i1342 + # in Loop: Header=BB493_565 Depth=1 ld.d $a0, $a1, 0 ld.bu $a0, $a0, 42 ld.d $a6, $sp, 464 # 8-byte Folded Reload - bnez $a0, .LBB493_987 -# %bb.985: # %resizeOpArray.exit._crit_edge.i1345 - # in Loop: Header=BB493_566 Depth=1 + bnez $a0, .LBB493_986 +# %bb.984: # %resizeOpArray.exit._crit_edge.i1345 + # in Loop: Header=BB493_565 Depth=1 ld.w $a0, $a1, 24 -.LBB493_986: # in Loop: Header=BB493_566 Depth=1 - ld.d $a1, $sp, 272 # 8-byte Folded Reload +.LBB493_985: # in Loop: Header=BB493_565 Depth=1 + ld.d $a1, $sp, 280 # 8-byte Folded Reload ld.d $a1, $a1, 32 addi.d $a0, $a0, 1 - ld.d $a2, $sp, 272 # 8-byte Folded Reload + ld.d $a2, $sp, 280 # 8-byte Folded Reload st.w $a0, $a2, 24 ori $a0, $zero, 24 mul.d $a0, $s1, $a0 @@ -112605,29 +112560,29 @@ sqlite3WhereBegin: # @sqlite3WhereBegin st.w $zero, $a2, 8 st.w $s0, $a2, 12 st.d $zero, $a2, 16 - ld.d $a0, $sp, 272 # 8-byte Folded Reload + ld.d $a0, $sp, 280 # 8-byte Folded Reload st.b $zero, $a0, 339 -.LBB493_987: # %sqlite3VdbeAddOp3.exit1349 - # in Loop: Header=BB493_566 Depth=1 - ld.d $s1, $sp, 192 # 8-byte Folded Reload - beqz $s0, .LBB493_990 -# %bb.988: # in Loop: Header=BB493_566 Depth=1 +.LBB493_986: # %sqlite3VdbeAddOp3.exit1349 + # in Loop: Header=BB493_565 Depth=1 + ld.d $s1, $sp, 200 # 8-byte Folded Reload + beqz $s0, .LBB493_989 +# %bb.987: # in Loop: Header=BB493_565 Depth=1 ld.bu $a0, $s8, 37 ori $a1, $zero, 7 - bltu $a1, $a0, .LBB493_990 -# %bb.989: # in Loop: Header=BB493_566 Depth=1 + bltu $a1, $a0, .LBB493_989 +# %bb.988: # in Loop: Header=BB493_565 Depth=1 addi.d $a1, $a0, 1 st.b $a1, $s8, 37 slli.d $a0, $a0, 2 - ld.d $a1, $sp, 376 # 8-byte Folded Reload + ld.d $a1, $sp, 384 # 8-byte Folded Reload stx.w $s0, $a1, $a0 -.LBB493_990: # %sqlite3ReleaseTempReg.exit1351 - # in Loop: Header=BB493_566 Depth=1 - ld.d $a0, $sp, 408 # 8-byte Folded Reload +.LBB493_989: # %sqlite3ReleaseTempReg.exit1351 + # in Loop: Header=BB493_565 Depth=1 + ld.d $a0, $sp, 416 # 8-byte Folded Reload st.w $a0, $s1, 52 st.w $fp, $s1, 56 - b .LBB493_817 -.LBB493_991: # in Loop: Header=BB493_566 Depth=1 + b .LBB493_816 +.LBB493_990: # in 
Loop: Header=BB493_565 Depth=1 sltui $a0, $s3, 1 move $s5, $a1 slli.w $a1, $s3, 1 @@ -112640,15 +112595,15 @@ sqlite3WhereBegin: # @sqlite3WhereBegin alsl.w $a1, $s4, $a1, 3 pcaddu18i $ra, %call36(sqlite3_realloc) jirl $ra, $ra, 0 - beqz $a0, .LBB493_1111 -# %bb.992: # %sqlite3DbRealloc.exit.i.i.i1337 - # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_1110 +# %bb.991: # %sqlite3DbRealloc.exit.i.i.i1337 + # in Loop: Header=BB493_565 Depth=1 st.w $s4, $s5, 28 st.d $a0, $s5, 32 move $a1, $s5 - ld.d $s5, $sp, 448 # 8-byte Folded Reload - bge $s3, $s4, .LBB493_980 -# %bb.993: # in Loop: Header=BB493_566 Depth=1 + ld.d $s5, $sp, 440 # 8-byte Folded Reload + bge $s3, $s4, .LBB493_979 +# %bb.992: # in Loop: Header=BB493_565 Depth=1 ori $a2, $zero, 24 mul.d $a1, $s3, $a2 add.d $a0, $a0, $a1 @@ -112657,9 +112612,9 @@ sqlite3WhereBegin: # @sqlite3WhereBegin move $a1, $zero pcaddu18i $ra, %call36(memset) jirl $ra, $ra, 0 - ld.d $a1, $sp, 272 # 8-byte Folded Reload - b .LBB493_980 -.LBB493_994: # in Loop: Header=BB493_566 Depth=1 + ld.d $a1, $sp, 280 # 8-byte Folded Reload + b .LBB493_979 +.LBB493_993: # in Loop: Header=BB493_565 Depth=1 sltui $a0, $s2, 1 move $s5, $a1 slli.w $a1, $s2, 1 @@ -112672,15 +112627,15 @@ sqlite3WhereBegin: # @sqlite3WhereBegin alsl.w $a1, $s4, $a1, 3 pcaddu18i $ra, %call36(sqlite3_realloc) jirl $ra, $ra, 0 - beqz $a0, .LBB493_1112 -# %bb.995: # %sqlite3DbRealloc.exit.i.i1348 - # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_1111 +# %bb.994: # %sqlite3DbRealloc.exit.i.i1348 + # in Loop: Header=BB493_565 Depth=1 st.w $s4, $s5, 28 st.d $a0, $s5, 32 move $a1, $s5 - ld.d $s5, $sp, 448 # 8-byte Folded Reload - bge $s2, $s4, .LBB493_984 -# %bb.996: # in Loop: Header=BB493_566 Depth=1 + ld.d $s5, $sp, 440 # 8-byte Folded Reload + bge $s2, $s4, .LBB493_983 +# %bb.995: # in Loop: Header=BB493_565 Depth=1 ori $a2, $zero, 24 mul.d $a1, $s2, $a2 add.d $a0, $a0, $a1 @@ -112689,9 +112644,9 @@ sqlite3WhereBegin: # @sqlite3WhereBegin move $a1, $zero pcaddu18i $ra, %call36(memset) jirl $ra, $ra, 0 - ld.d $a1, $sp, 272 # 8-byte Folded Reload - b .LBB493_984 -.LBB493_997: # in Loop: Header=BB493_566 Depth=1 + ld.d $a1, $sp, 280 # 8-byte Folded Reload + b .LBB493_983 +.LBB493_996: # in Loop: Header=BB493_565 Depth=1 st.d $s4, $sp, 328 # 8-byte Folded Spill move $s4, $fp sltui $a0, $s0, 1 @@ -112706,16 +112661,16 @@ sqlite3WhereBegin: # @sqlite3WhereBegin alsl.w $a1, $s5, $a1, 3 pcaddu18i $ra, %call36(sqlite3_realloc) jirl $ra, $ra, 0 - beqz $a0, .LBB493_1113 -# %bb.998: # %sqlite3DbRealloc.exit.i.i.i1225 - # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_1112 +# %bb.997: # %sqlite3DbRealloc.exit.i.i.i1225 + # in Loop: Header=BB493_565 Depth=1 st.w $s5, $fp, 28 st.d $a0, $fp, 32 move $a1, $fp move $fp, $s4 ld.d $s4, $sp, 328 # 8-byte Folded Reload - bge $s0, $s5, .LBB493_915 -# %bb.999: # in Loop: Header=BB493_566 Depth=1 + bge $s0, $s5, .LBB493_914 +# %bb.998: # in Loop: Header=BB493_565 Depth=1 ori $a2, $zero, 24 mul.d $a1, $s0, $a2 add.d $a0, $a0, $a1 @@ -112724,256 +112679,256 @@ sqlite3WhereBegin: # @sqlite3WhereBegin move $a1, $zero pcaddu18i $ra, %call36(memset) jirl $ra, $ra, 0 - ld.d $a1, $sp, 272 # 8-byte Folded Reload - b .LBB493_915 -.LBB493_1000: # %resizeOpArray.exit._crit_edge.i.i1222 - # in Loop: Header=BB493_566 Depth=1 + ld.d $a1, $sp, 280 # 8-byte Folded Reload + b .LBB493_914 +.LBB493_999: # %resizeOpArray.exit._crit_edge.i.i1222 + # in Loop: Header=BB493_565 Depth=1 ld.w $a0, $a1, 24 - ld.d $a3, $sp, 192 # 8-byte Folded Reload - ld.d $s5, $sp, 448 # 
8-byte Folded Reload -.LBB493_1001: # in Loop: Header=BB493_566 Depth=1 - ld.d $a1, $sp, 272 # 8-byte Folded Reload + ld.d $a3, $sp, 200 # 8-byte Folded Reload + ld.d $s5, $sp, 440 # 8-byte Folded Reload +.LBB493_1000: # in Loop: Header=BB493_565 Depth=1 + ld.d $a1, $sp, 280 # 8-byte Folded Reload ld.d $a1, $a1, 32 addi.d $a0, $a0, 1 - ld.d $a2, $sp, 272 # 8-byte Folded Reload + ld.d $a2, $sp, 280 # 8-byte Folded Reload st.w $a0, $a2, 24 ori $a0, $zero, 24 mul.d $a0, $s2, $a0 add.d $a2, $a1, $a0 ori $a4, $zero, 115 stx.h $a4, $a1, $a0 - ld.d $a1, $sp, 272 # 8-byte Folded Reload - ld.d $a0, $sp, 408 # 8-byte Folded Reload - st.w $a0, $a2, 4 + ld.d $a1, $sp, 280 # 8-byte Folded Reload ld.d $a0, $sp, 416 # 8-byte Folded Reload + st.w $a0, $a2, 4 + ld.d $a0, $sp, 424 # 8-byte Folded Reload st.w $a0, $a2, 8 st.w $zero, $a2, 12 st.d $zero, $a2, 16 st.b $zero, $a1, 339 -.LBB493_1002: # %sqlite3ReleaseTempReg.exit1215 - # in Loop: Header=BB493_566 Depth=1 +.LBB493_1001: # %sqlite3ReleaseTempReg.exit1215 + # in Loop: Header=BB493_565 Depth=1 ld.w $s2, $a1, 24 ori $a0, $zero, 22 - ld.d $a2, $sp, 312 # 8-byte Folded Reload - beq $a2, $a0, .LBB493_1012 -# %bb.1003: # in Loop: Header=BB493_566 Depth=1 + ld.d $a2, $sp, 320 # 8-byte Folded Reload + beq $a2, $a0, .LBB493_1011 +# %bb.1002: # in Loop: Header=BB493_565 Depth=1 ld.w $s1, $a1, 28 ld.w $s0, $a3, 8 move $a0, $s2 - blt $s2, $s1, .LBB493_1007 -# %bb.1004: # in Loop: Header=BB493_566 Depth=1 + blt $s2, $s1, .LBB493_1006 +# %bb.1003: # in Loop: Header=BB493_565 Depth=1 ld.d $s5, $a1, 0 ld.bu $a0, $s5, 42 - beqz $a0, .LBB493_1051 -.LBB493_1005: # %resizeOpArray.exit.i1230 - # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_1050 +.LBB493_1004: # %resizeOpArray.exit.i1230 + # in Loop: Header=BB493_565 Depth=1 ld.d $a0, $a1, 0 ld.bu $a0, $a0, 42 - ld.d $s6, $sp, 432 # 8-byte Folded Reload - ld.d $s5, $sp, 448 # 8-byte Folded Reload - bnez $a0, .LBB493_1008 -# %bb.1006: # %resizeOpArray.exit._crit_edge.i1233 - # in Loop: Header=BB493_566 Depth=1 + ld.d $s6, $sp, 448 # 8-byte Folded Reload + ld.d $s5, $sp, 440 # 8-byte Folded Reload + bnez $a0, .LBB493_1007 +# %bb.1005: # %resizeOpArray.exit._crit_edge.i1233 + # in Loop: Header=BB493_565 Depth=1 ld.w $a0, $a1, 24 -.LBB493_1007: # in Loop: Header=BB493_566 Depth=1 - ld.d $a1, $sp, 272 # 8-byte Folded Reload +.LBB493_1006: # in Loop: Header=BB493_565 Depth=1 + ld.d $a1, $sp, 280 # 8-byte Folded Reload ld.d $a1, $a1, 32 addi.d $a0, $a0, 1 - ld.d $a2, $sp, 272 # 8-byte Folded Reload + ld.d $a2, $sp, 280 # 8-byte Folded Reload st.w $a0, $a2, 24 ori $a0, $zero, 24 mul.d $a0, $s2, $a0 add.d $a2, $a1, $a0 - ld.d $a3, $sp, 312 # 8-byte Folded Reload + ld.d $a3, $sp, 320 # 8-byte Folded Reload stx.b $a3, $a1, $a0 - ld.d $a1, $sp, 272 # 8-byte Folded Reload - ld.d $a0, $sp, 408 # 8-byte Folded Reload + ld.d $a1, $sp, 280 # 8-byte Folded Reload + ld.d $a0, $sp, 416 # 8-byte Folded Reload st.w $a0, $a2, 4 - ld.d $a0, $sp, 336 # 8-byte Folded Reload + ld.d $a0, $sp, 344 # 8-byte Folded Reload st.w $a0, $a2, 8 st.w $s0, $a2, 12 st.d $zero, $a2, 16 st.b $zero, $a2, 1 st.b $zero, $a1, 339 -.LBB493_1008: # %sqlite3VdbeAddOp3.exit1237 - # in Loop: Header=BB493_566 Depth=1 - ld.d $a0, $sp, 296 # 8-byte Folded Reload +.LBB493_1007: # %sqlite3VdbeAddOp3.exit1237 + # in Loop: Header=BB493_565 Depth=1 + ld.d $a0, $sp, 304 # 8-byte Folded Reload sltui $a0, $a0, 1 or $a0, $a0, $s4 - beqz $a0, .LBB493_1010 -# %bb.1009: # %sqlite3VdbeAddOp3.exit1237 - # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_1009 +# %bb.1008: # 
%sqlite3VdbeAddOp3.exit1237 + # in Loop: Header=BB493_565 Depth=1 sltui $a0, $fp, 1 and $a0, $a0, $s4 - beqz $a0, .LBB493_1012 -.LBB493_1010: # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_1011 +.LBB493_1009: # in Loop: Header=BB493_565 Depth=1 ld.d $a0, $a1, 32 - beqz $a0, .LBB493_1012 -# %bb.1011: # in Loop: Header=BB493_566 Depth=1 - ld.d $a1, $sp, 272 # 8-byte Folded Reload + beqz $a0, .LBB493_1011 +# %bb.1010: # in Loop: Header=BB493_565 Depth=1 + ld.d $a1, $sp, 280 # 8-byte Folded Reload ld.w $a1, $a1, 24 ori $a2, $zero, 24 mul.d $a1, $a1, $a2 add.d $a0, $a0, $a1 - ld.d $a1, $sp, 272 # 8-byte Folded Reload + ld.d $a1, $sp, 280 # 8-byte Folded Reload st.b $s7, $a0, -21 -.LBB493_1012: # %sqlite3VdbeChangeP5.exit1240 - # in Loop: Header=BB493_566 Depth=1 +.LBB493_1011: # %sqlite3VdbeChangeP5.exit1240 + # in Loop: Header=BB493_565 Depth=1 ld.bu $a0, $s8, 37 - beqz $a0, .LBB493_1014 -# %bb.1013: # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_1013 +# %bb.1012: # in Loop: Header=BB493_565 Depth=1 addi.d $a0, $a0, -1 andi $a1, $a0, 255 slli.d $a1, $a1, 2 - ld.d $a2, $sp, 376 # 8-byte Folded Reload + ld.d $a2, $sp, 384 # 8-byte Folded Reload ldx.w $s0, $a2, $a1 - ld.d $a1, $sp, 272 # 8-byte Folded Reload + ld.d $a1, $sp, 280 # 8-byte Folded Reload st.b $a0, $s8, 37 - b .LBB493_1015 -.LBB493_1014: # in Loop: Header=BB493_566 Depth=1 + b .LBB493_1014 +.LBB493_1013: # in Loop: Header=BB493_565 Depth=1 ld.w $a0, $s8, 88 addi.w $s0, $a0, 1 st.w $s0, $s8, 88 -.LBB493_1015: # %sqlite3GetTempReg.exit1243 - # in Loop: Header=BB493_566 Depth=1 - ld.d $a0, $sp, 352 # 8-byte Folded Reload - ld.d $a2, $sp, 344 # 8-byte Folded Reload +.LBB493_1014: # %sqlite3GetTempReg.exit1243 + # in Loop: Header=BB493_565 Depth=1 + ld.d $a0, $sp, 360 # 8-byte Folded Reload + ld.d $a2, $sp, 352 # 8-byte Folded Reload or $a0, $a0, $a2 - beqz $a0, .LBB493_1025 -# %bb.1016: # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_1024 +# %bb.1015: # in Loop: Header=BB493_565 Depth=1 ld.w $s4, $a1, 24 ld.w $s5, $a1, 28 move $s1, $s4 - blt $s4, $s5, .LBB493_1019 -# %bb.1017: # in Loop: Header=BB493_566 Depth=1 + blt $s4, $s5, .LBB493_1018 +# %bb.1016: # in Loop: Header=BB493_565 Depth=1 ld.d $s1, $a1, 0 ld.bu $a0, $s1, 42 - beqz $a0, .LBB493_1039 -.LBB493_1018: # %resizeOpArray.exit.i1247 - # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_1038 +.LBB493_1017: # %resizeOpArray.exit.i1247 + # in Loop: Header=BB493_565 Depth=1 ld.d $a0, $a1, 0 ld.bu $a0, $a0, 42 ld.w $s1, $a1, 24 - ld.d $s6, $sp, 432 # 8-byte Folded Reload - bnez $a0, .LBB493_1020 -.LBB493_1019: # %resizeOpArray.exit._crit_edge.i1250 - # in Loop: Header=BB493_566 Depth=1 + ld.d $s6, $sp, 448 # 8-byte Folded Reload + bnez $a0, .LBB493_1019 +.LBB493_1018: # %resizeOpArray.exit._crit_edge.i1250 + # in Loop: Header=BB493_565 Depth=1 ld.d $a0, $a1, 32 addi.d $a1, $s1, 1 - ld.d $a2, $sp, 272 # 8-byte Folded Reload + ld.d $a2, $sp, 280 # 8-byte Folded Reload st.w $a1, $a2, 24 ori $a1, $zero, 24 mul.d $a1, $s4, $a1 add.d $a2, $a0, $a1 ori $a3, $zero, 2 stx.h $a3, $a0, $a1 - ld.d $a1, $sp, 272 # 8-byte Folded Reload - ld.d $a0, $sp, 408 # 8-byte Folded Reload + ld.d $a1, $sp, 280 # 8-byte Folded Reload + ld.d $a0, $sp, 416 # 8-byte Folded Reload st.w $a0, $a2, 4 st.d $zero, $a2, 16 ld.w $s1, $a1, 24 - ld.d $a0, $sp, 400 # 8-byte Folded Reload + ld.d $a0, $sp, 408 # 8-byte Folded Reload st.w $a0, $a2, 8 st.w $s0, $a2, 12 st.b $zero, $a1, 339 -.LBB493_1020: # %sqlite3VdbeAddOp3.exit1254 - # in Loop: Header=BB493_566 Depth=1 +.LBB493_1019: # 
%sqlite3VdbeAddOp3.exit1254 + # in Loop: Header=BB493_565 Depth=1 ld.w $fp, $a1, 28 move $a0, $s1 - blt $s1, $fp, .LBB493_1024 -# %bb.1021: # in Loop: Header=BB493_566 Depth=1 + blt $s1, $fp, .LBB493_1023 +# %bb.1020: # in Loop: Header=BB493_565 Depth=1 ld.d $s4, $a1, 0 ld.bu $a0, $s4, 42 - beqz $a0, .LBB493_1042 -.LBB493_1022: # %resizeOpArray.exit.i.i1258 - # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_1041 +.LBB493_1021: # %resizeOpArray.exit.i.i1258 + # in Loop: Header=BB493_565 Depth=1 ld.d $a0, $a1, 0 ld.bu $a0, $a0, 42 - ld.d $s5, $sp, 448 # 8-byte Folded Reload - bnez $a0, .LBB493_1025 -# %bb.1023: # %resizeOpArray.exit._crit_edge.i.i1261 - # in Loop: Header=BB493_566 Depth=1 + ld.d $s5, $sp, 440 # 8-byte Folded Reload + bnez $a0, .LBB493_1024 +# %bb.1022: # %resizeOpArray.exit._crit_edge.i.i1261 + # in Loop: Header=BB493_565 Depth=1 ld.w $a0, $a1, 24 -.LBB493_1024: # in Loop: Header=BB493_566 Depth=1 - ld.d $a1, $sp, 272 # 8-byte Folded Reload +.LBB493_1023: # in Loop: Header=BB493_565 Depth=1 + ld.d $a1, $sp, 280 # 8-byte Folded Reload ld.d $a1, $a1, 32 addi.d $a0, $a0, 1 - ld.d $a2, $sp, 272 # 8-byte Folded Reload + ld.d $a2, $sp, 280 # 8-byte Folded Reload st.w $a0, $a2, 24 ori $a0, $zero, 24 mul.d $a0, $s1, $a0 add.d $a2, $a1, $a0 ori $a3, $zero, 65 stx.h $a3, $a1, $a0 - ld.d $a1, $sp, 272 # 8-byte Folded Reload + ld.d $a1, $sp, 280 # 8-byte Folded Reload st.w $s0, $a2, 4 - ld.d $s5, $sp, 448 # 8-byte Folded Reload + ld.d $s5, $sp, 440 # 8-byte Folded Reload st.w $s5, $a2, 8 st.w $zero, $a2, 12 st.d $zero, $a2, 16 st.b $zero, $a1, 339 -.LBB493_1025: # %sqlite3VdbeAddOp2.exit1265 - # in Loop: Header=BB493_566 Depth=1 - ld.d $a0, $sp, 360 # 8-byte Folded Reload - bnez $a0, .LBB493_1035 -# %bb.1026: # in Loop: Header=BB493_566 Depth=1 +.LBB493_1024: # %sqlite3VdbeAddOp2.exit1265 + # in Loop: Header=BB493_565 Depth=1 + ld.d $a0, $sp, 368 # 8-byte Folded Reload + bnez $a0, .LBB493_1034 +# %bb.1025: # in Loop: Header=BB493_565 Depth=1 ld.w $s1, $a1, 24 ld.w $s4, $a1, 28 move $fp, $s1 - blt $s1, $s4, .LBB493_1029 -# %bb.1027: # in Loop: Header=BB493_566 Depth=1 + blt $s1, $s4, .LBB493_1028 +# %bb.1026: # in Loop: Header=BB493_565 Depth=1 ld.d $fp, $a1, 0 ld.bu $a0, $fp, 42 - beqz $a0, .LBB493_1045 -.LBB493_1028: # %resizeOpArray.exit.i.i1269 - # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_1044 +.LBB493_1027: # %resizeOpArray.exit.i.i1269 + # in Loop: Header=BB493_565 Depth=1 ld.d $a0, $a1, 0 ld.bu $a0, $a0, 42 ld.w $fp, $a1, 24 - ld.d $s5, $sp, 448 # 8-byte Folded Reload - bnez $a0, .LBB493_1030 -.LBB493_1029: # %resizeOpArray.exit._crit_edge.i.i1272 - # in Loop: Header=BB493_566 Depth=1 + ld.d $s5, $sp, 440 # 8-byte Folded Reload + bnez $a0, .LBB493_1029 +.LBB493_1028: # %resizeOpArray.exit._crit_edge.i.i1272 + # in Loop: Header=BB493_565 Depth=1 ld.d $a0, $a1, 32 addi.d $a1, $fp, 1 - ld.d $a2, $sp, 272 # 8-byte Folded Reload + ld.d $a2, $sp, 280 # 8-byte Folded Reload st.w $a1, $a2, 24 ori $a1, $zero, 24 mul.d $a1, $s1, $a1 add.d $a2, $a0, $a1 ori $a3, $zero, 52 stx.h $a3, $a0, $a1 - ld.d $a1, $sp, 272 # 8-byte Folded Reload - ld.d $a0, $sp, 408 # 8-byte Folded Reload + ld.d $a1, $sp, 280 # 8-byte Folded Reload + ld.d $a0, $sp, 416 # 8-byte Folded Reload st.w $a0, $a2, 4 st.d $zero, $a2, 16 ld.w $fp, $a1, 24 st.w $s0, $a2, 8 st.w $zero, $a2, 12 st.b $zero, $a1, 339 -.LBB493_1030: # %sqlite3VdbeAddOp2.exit1276 - # in Loop: Header=BB493_566 Depth=1 +.LBB493_1029: # %sqlite3VdbeAddOp2.exit1276 + # in Loop: Header=BB493_565 Depth=1 ld.w $s1, $a1, 28 move $a0, $fp - 
blt $fp, $s1, .LBB493_1034 -# %bb.1031: # in Loop: Header=BB493_566 Depth=1 + blt $fp, $s1, .LBB493_1033 +# %bb.1030: # in Loop: Header=BB493_565 Depth=1 ld.d $s4, $a1, 0 ld.bu $a0, $s4, 42 - beqz $a0, .LBB493_1048 -.LBB493_1032: # %resizeOpArray.exit.i1280 - # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_1047 +.LBB493_1031: # %resizeOpArray.exit.i1280 + # in Loop: Header=BB493_565 Depth=1 ld.d $a0, $a1, 0 ld.bu $a0, $a0, 42 - ld.d $s5, $sp, 448 # 8-byte Folded Reload - bnez $a0, .LBB493_1035 -# %bb.1033: # %resizeOpArray.exit._crit_edge.i1283 - # in Loop: Header=BB493_566 Depth=1 + ld.d $s5, $sp, 440 # 8-byte Folded Reload + bnez $a0, .LBB493_1034 +# %bb.1032: # %resizeOpArray.exit._crit_edge.i1283 + # in Loop: Header=BB493_565 Depth=1 ld.w $a0, $a1, 24 -.LBB493_1034: # in Loop: Header=BB493_566 Depth=1 - ld.d $a1, $sp, 272 # 8-byte Folded Reload +.LBB493_1033: # in Loop: Header=BB493_565 Depth=1 + ld.d $a1, $sp, 280 # 8-byte Folded Reload ld.d $a1, $a1, 32 addi.d $a0, $a0, 1 - ld.d $a2, $sp, 272 # 8-byte Folded Reload + ld.d $a2, $sp, 280 # 8-byte Folded Reload st.w $a0, $a2, 24 ori $a0, $zero, 24 mul.d $a0, $fp, $a0 @@ -112985,37 +112940,37 @@ sqlite3WhereBegin: # @sqlite3WhereBegin st.w $zero, $a2, 8 st.w $s0, $a2, 12 st.d $zero, $a2, 16 - ld.d $a0, $sp, 272 # 8-byte Folded Reload + ld.d $a0, $sp, 280 # 8-byte Folded Reload st.b $zero, $a0, 339 -.LBB493_1035: # %sqlite3VdbeAddOp3.exit1287 - # in Loop: Header=BB493_566 Depth=1 - beqz $s0, .LBB493_1038 -# %bb.1036: # in Loop: Header=BB493_566 Depth=1 +.LBB493_1034: # %sqlite3VdbeAddOp3.exit1287 + # in Loop: Header=BB493_565 Depth=1 + beqz $s0, .LBB493_1037 +# %bb.1035: # in Loop: Header=BB493_565 Depth=1 ld.bu $a0, $s8, 37 ori $a1, $zero, 7 - bltu $a1, $a0, .LBB493_1038 -# %bb.1037: # in Loop: Header=BB493_566 Depth=1 + bltu $a1, $a0, .LBB493_1037 +# %bb.1036: # in Loop: Header=BB493_565 Depth=1 addi.d $a1, $a0, 1 st.b $a1, $s8, 37 slli.d $a0, $a0, 2 - ld.d $a1, $sp, 376 # 8-byte Folded Reload + ld.d $a1, $sp, 384 # 8-byte Folded Reload stx.w $s0, $a1, $a0 -.LBB493_1038: # %sqlite3ReleaseTempReg.exit1289 - # in Loop: Header=BB493_566 Depth=1 +.LBB493_1037: # %sqlite3ReleaseTempReg.exit1289 + # in Loop: Header=BB493_565 Depth=1 sltu $a0, $zero, $s3 ori $a1, $zero, 102 masknez $a1, $a1, $a0 ori $a2, $zero, 47 maskeqz $a0, $a2, $a0 or $a0, $a0, $a1 - ld.d $s1, $sp, 192 # 8-byte Folded Reload + ld.d $s1, $sp, 200 # 8-byte Folded Reload st.w $a0, $s1, 48 - ld.d $a0, $sp, 408 # 8-byte Folded Reload + ld.d $a0, $sp, 416 # 8-byte Folded Reload st.w $a0, $s1, 52 st.w $s2, $s1, 56 ld.d $a6, $sp, 464 # 8-byte Folded Reload - b .LBB493_817 -.LBB493_1039: # in Loop: Header=BB493_566 Depth=1 + b .LBB493_816 +.LBB493_1038: # in Loop: Header=BB493_565 Depth=1 sltui $a0, $s5, 1 move $fp, $a1 slli.w $a1, $s5, 1 @@ -113028,14 +112983,14 @@ sqlite3WhereBegin: # @sqlite3WhereBegin alsl.w $a1, $s6, $a1, 3 pcaddu18i $ra, %call36(sqlite3_realloc) jirl $ra, $ra, 0 - beqz $a0, .LBB493_1056 -# %bb.1040: # %sqlite3DbRealloc.exit.i.i1253 - # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_1055 +# %bb.1039: # %sqlite3DbRealloc.exit.i.i1253 + # in Loop: Header=BB493_565 Depth=1 st.w $s6, $fp, 28 st.d $a0, $fp, 32 move $a1, $fp - bge $s5, $s6, .LBB493_1018 -# %bb.1041: # in Loop: Header=BB493_566 Depth=1 + bge $s5, $s6, .LBB493_1017 +# %bb.1040: # in Loop: Header=BB493_565 Depth=1 ori $a2, $zero, 24 mul.d $a1, $s5, $a2 add.d $a0, $a0, $a1 @@ -113044,9 +112999,9 @@ sqlite3WhereBegin: # @sqlite3WhereBegin move $a1, $zero pcaddu18i $ra, %call36(memset) jirl 
$ra, $ra, 0 - ld.d $a1, $sp, 272 # 8-byte Folded Reload - b .LBB493_1018 -.LBB493_1042: # in Loop: Header=BB493_566 Depth=1 + ld.d $a1, $sp, 280 # 8-byte Folded Reload + b .LBB493_1017 +.LBB493_1041: # in Loop: Header=BB493_565 Depth=1 sltui $a0, $fp, 1 move $s6, $a1 slli.w $a1, $fp, 1 @@ -113059,15 +113014,15 @@ sqlite3WhereBegin: # @sqlite3WhereBegin alsl.w $a1, $s5, $a1, 3 pcaddu18i $ra, %call36(sqlite3_realloc) jirl $ra, $ra, 0 - beqz $a0, .LBB493_1057 -# %bb.1043: # %sqlite3DbRealloc.exit.i.i.i1264 - # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_1056 +# %bb.1042: # %sqlite3DbRealloc.exit.i.i.i1264 + # in Loop: Header=BB493_565 Depth=1 st.w $s5, $s6, 28 st.d $a0, $s6, 32 move $a1, $s6 - ld.d $s6, $sp, 432 # 8-byte Folded Reload - bge $fp, $s5, .LBB493_1022 -# %bb.1044: # in Loop: Header=BB493_566 Depth=1 + ld.d $s6, $sp, 448 # 8-byte Folded Reload + bge $fp, $s5, .LBB493_1021 +# %bb.1043: # in Loop: Header=BB493_565 Depth=1 ori $a2, $zero, 24 mul.d $a1, $fp, $a2 add.d $a0, $a0, $a1 @@ -113076,9 +113031,9 @@ sqlite3WhereBegin: # @sqlite3WhereBegin move $a1, $zero pcaddu18i $ra, %call36(memset) jirl $ra, $ra, 0 - ld.d $a1, $sp, 272 # 8-byte Folded Reload - b .LBB493_1022 -.LBB493_1045: # in Loop: Header=BB493_566 Depth=1 + ld.d $a1, $sp, 280 # 8-byte Folded Reload + b .LBB493_1021 +.LBB493_1044: # in Loop: Header=BB493_565 Depth=1 sltui $a0, $s4, 1 move $s6, $a1 slli.w $a1, $s4, 1 @@ -113091,15 +113046,15 @@ sqlite3WhereBegin: # @sqlite3WhereBegin alsl.w $a1, $s5, $a1, 3 pcaddu18i $ra, %call36(sqlite3_realloc) jirl $ra, $ra, 0 - beqz $a0, .LBB493_1060 -# %bb.1046: # %sqlite3DbRealloc.exit.i.i.i1275 - # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_1059 +# %bb.1045: # %sqlite3DbRealloc.exit.i.i.i1275 + # in Loop: Header=BB493_565 Depth=1 st.w $s5, $s6, 28 st.d $a0, $s6, 32 move $a1, $s6 - ld.d $s6, $sp, 432 # 8-byte Folded Reload - bge $s4, $s5, .LBB493_1028 -# %bb.1047: # in Loop: Header=BB493_566 Depth=1 + ld.d $s6, $sp, 448 # 8-byte Folded Reload + bge $s4, $s5, .LBB493_1027 +# %bb.1046: # in Loop: Header=BB493_565 Depth=1 ori $a2, $zero, 24 mul.d $a1, $s4, $a2 add.d $a0, $a0, $a1 @@ -113108,9 +113063,9 @@ sqlite3WhereBegin: # @sqlite3WhereBegin move $a1, $zero pcaddu18i $ra, %call36(memset) jirl $ra, $ra, 0 - ld.d $a1, $sp, 272 # 8-byte Folded Reload - b .LBB493_1028 -.LBB493_1048: # in Loop: Header=BB493_566 Depth=1 + ld.d $a1, $sp, 280 # 8-byte Folded Reload + b .LBB493_1027 +.LBB493_1047: # in Loop: Header=BB493_565 Depth=1 sltui $a0, $s1, 1 move $s6, $a1 slli.w $a1, $s1, 1 @@ -113123,15 +113078,15 @@ sqlite3WhereBegin: # @sqlite3WhereBegin alsl.w $a1, $s5, $a1, 3 pcaddu18i $ra, %call36(sqlite3_realloc) jirl $ra, $ra, 0 - beqz $a0, .LBB493_1061 -# %bb.1049: # %sqlite3DbRealloc.exit.i.i1286 - # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_1060 +# %bb.1048: # %sqlite3DbRealloc.exit.i.i1286 + # in Loop: Header=BB493_565 Depth=1 st.w $s5, $s6, 28 st.d $a0, $s6, 32 move $a1, $s6 - ld.d $s6, $sp, 432 # 8-byte Folded Reload - bge $s1, $s5, .LBB493_1032 -# %bb.1050: # in Loop: Header=BB493_566 Depth=1 + ld.d $s6, $sp, 448 # 8-byte Folded Reload + bge $s1, $s5, .LBB493_1031 +# %bb.1049: # in Loop: Header=BB493_565 Depth=1 ori $a2, $zero, 24 mul.d $a1, $s1, $a2 add.d $a0, $a0, $a1 @@ -113140,9 +113095,9 @@ sqlite3WhereBegin: # @sqlite3WhereBegin move $a1, $zero pcaddu18i $ra, %call36(memset) jirl $ra, $ra, 0 - ld.d $a1, $sp, 272 # 8-byte Folded Reload - b .LBB493_1032 -.LBB493_1051: # in Loop: Header=BB493_566 Depth=1 + ld.d $a1, $sp, 280 # 8-byte Folded Reload + b 
.LBB493_1031 +.LBB493_1050: # in Loop: Header=BB493_565 Depth=1 st.d $s4, $sp, 328 # 8-byte Folded Spill move $s4, $fp sltui $a0, $s1, 1 @@ -113157,16 +113112,16 @@ sqlite3WhereBegin: # @sqlite3WhereBegin alsl.w $a1, $s6, $a1, 3 pcaddu18i $ra, %call36(sqlite3_realloc) jirl $ra, $ra, 0 - beqz $a0, .LBB493_1103 -# %bb.1052: # %sqlite3DbRealloc.exit.i.i1236 - # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_1102 +# %bb.1051: # %sqlite3DbRealloc.exit.i.i1236 + # in Loop: Header=BB493_565 Depth=1 st.w $s6, $fp, 28 st.d $a0, $fp, 32 move $a1, $fp move $fp, $s4 ld.d $s4, $sp, 328 # 8-byte Folded Reload - bge $s1, $s6, .LBB493_1005 -# %bb.1053: # in Loop: Header=BB493_566 Depth=1 + bge $s1, $s6, .LBB493_1004 +# %bb.1052: # in Loop: Header=BB493_565 Depth=1 ori $a2, $zero, 24 mul.d $a1, $s1, $a2 add.d $a0, $a0, $a1 @@ -113175,77 +113130,77 @@ sqlite3WhereBegin: # @sqlite3WhereBegin move $a1, $zero pcaddu18i $ra, %call36(memset) jirl $ra, $ra, 0 - ld.d $a1, $sp, 272 # 8-byte Folded Reload - b .LBB493_1005 -.LBB493_1054: # in Loop: Header=BB493_566 Depth=1 + ld.d $a1, $sp, 280 # 8-byte Folded Reload + b .LBB493_1004 +.LBB493_1053: # in Loop: Header=BB493_565 Depth=1 ori $a0, $zero, 1 st.b $a0, $s6, 42 - b .LBB493_677 -.LBB493_1055: # in Loop: Header=BB493_566 Depth=1 + b .LBB493_676 +.LBB493_1054: # in Loop: Header=BB493_565 Depth=1 st.b $s7, $s5, 42 - b .LBB493_898 -.LBB493_1056: # in Loop: Header=BB493_566 Depth=1 + b .LBB493_897 +.LBB493_1055: # in Loop: Header=BB493_565 Depth=1 st.b $s7, $s1, 42 move $a1, $fp - b .LBB493_1018 -.LBB493_1057: # in Loop: Header=BB493_566 Depth=1 + b .LBB493_1017 +.LBB493_1056: # in Loop: Header=BB493_565 Depth=1 st.b $s7, $s4, 42 move $a1, $s6 - ld.d $s6, $sp, 432 # 8-byte Folded Reload - b .LBB493_1022 -.LBB493_1058: # in Loop: Header=BB493_566 Depth=1 + ld.d $s6, $sp, 448 # 8-byte Folded Reload + b .LBB493_1021 +.LBB493_1057: # in Loop: Header=BB493_565 Depth=1 ori $a0, $zero, 1 st.b $a0, $s7, 42 move $a1, $fp - b .LBB493_934 -.LBB493_1059: # in Loop: Header=BB493_566 Depth=1 + b .LBB493_933 +.LBB493_1058: # in Loop: Header=BB493_565 Depth=1 st.b $s7, $s1, 42 move $a1, $s3 - b .LBB493_617 -.LBB493_1060: # in Loop: Header=BB493_566 Depth=1 + b .LBB493_616 +.LBB493_1059: # in Loop: Header=BB493_565 Depth=1 st.b $s7, $fp, 42 move $a1, $s6 - ld.d $s6, $sp, 432 # 8-byte Folded Reload - b .LBB493_1028 -.LBB493_1061: # in Loop: Header=BB493_566 Depth=1 + ld.d $s6, $sp, 448 # 8-byte Folded Reload + b .LBB493_1027 +.LBB493_1060: # in Loop: Header=BB493_565 Depth=1 st.b $s7, $s4, 42 move $a1, $s6 - ld.d $s6, $sp, 432 # 8-byte Folded Reload - b .LBB493_1032 -.LBB493_1062: # in Loop: Header=BB493_566 Depth=1 + ld.d $s6, $sp, 448 # 8-byte Folded Reload + b .LBB493_1031 +.LBB493_1061: # in Loop: Header=BB493_565 Depth=1 st.b $s7, $s1, 42 move $a1, $s5 - ld.d $s5, $sp, 448 # 8-byte Folded Reload - ld.d $a5, $sp, 400 # 8-byte Folded Reload - b .LBB493_745 -.LBB493_1063: # in Loop: Header=BB493_566 Depth=1 + ld.d $s5, $sp, 440 # 8-byte Folded Reload + ld.d $a5, $sp, 408 # 8-byte Folded Reload + b .LBB493_744 +.LBB493_1062: # in Loop: Header=BB493_565 Depth=1 ori $a0, $zero, 1 st.b $a0, $s6, 42 move $a4, $fp - b .LBB493_760 -.LBB493_1064: # in Loop: Header=BB493_566 Depth=1 + b .LBB493_759 +.LBB493_1063: # in Loop: Header=BB493_565 Depth=1 ori $a0, $zero, 1 st.b $a0, $s6, 42 move $a5, $fp - b .LBB493_771 -.LBB493_1065: # in Loop: Header=BB493_566 Depth=1 + b .LBB493_770 +.LBB493_1064: # in Loop: Header=BB493_565 Depth=1 ori $a0, $zero, 1 st.b $a0, $s7, 42 -.LBB493_1066: # 
%resizeOpArray.exit.i.i1457 - # in Loop: Header=BB493_566 Depth=1 +.LBB493_1065: # %resizeOpArray.exit.i.i1457 + # in Loop: Header=BB493_565 Depth=1 move $a0, $s6 -.LBB493_1067: # %resizeOpArray.exit.i.i1457 - # in Loop: Header=BB493_566 Depth=1 +.LBB493_1066: # %resizeOpArray.exit.i.i1457 + # in Loop: Header=BB493_565 Depth=1 ld.d $a1, $s2, 0 ld.bu $a1, $a1, 42 - ld.d $s8, $sp, 320 # 8-byte Folded Reload - ld.d $s6, $sp, 432 # 8-byte Folded Reload - ld.d $s5, $sp, 448 # 8-byte Folded Reload - bnez $a1, .LBB493_1070 -# %bb.1068: # %resizeOpArray.exit._crit_edge.i.i1460 - # in Loop: Header=BB493_566 Depth=1 + ld.d $s8, $sp, 336 # 8-byte Folded Reload + ld.d $s6, $sp, 448 # 8-byte Folded Reload + ld.d $s5, $sp, 440 # 8-byte Folded Reload + bnez $a1, .LBB493_1069 +# %bb.1067: # %resizeOpArray.exit._crit_edge.i.i1460 + # in Loop: Header=BB493_565 Depth=1 ld.w $a1, $s2, 24 -.LBB493_1069: # in Loop: Header=BB493_566 Depth=1 +.LBB493_1068: # in Loop: Header=BB493_565 Depth=1 ld.d $a2, $s2, 32 addi.d $a1, $a1, 1 st.w $a1, $s2, 24 @@ -113259,48 +113214,48 @@ sqlite3WhereBegin: # @sqlite3WhereBegin st.w $zero, $a3, 12 st.d $zero, $a3, 16 st.b $zero, $s2, 339 - ld.d $s5, $sp, 448 # 8-byte Folded Reload -.LBB493_1070: # %sqlite3ExprCode.exit1065 - # in Loop: Header=BB493_566 Depth=1 + ld.d $s5, $sp, 440 # 8-byte Folded Reload +.LBB493_1069: # %sqlite3ExprCode.exit1065 + # in Loop: Header=BB493_565 Depth=1 ld.bu $a0, $s1, 0 ori $a0, $a0, 2 ori $a1, $zero, 71 - bne $a0, $a1, .LBB493_1072 -# %bb.1071: # in Loop: Header=BB493_566 Depth=1 + bne $a0, $a1, .LBB493_1071 +# %bb.1070: # in Loop: Header=BB493_565 Depth=1 ori $a0, $zero, 72 masknez $a0, $a0, $s3 ori $a1, $zero, 70 - b .LBB493_1073 -.LBB493_1072: # in Loop: Header=BB493_566 Depth=1 + b .LBB493_1072 +.LBB493_1071: # in Loop: Header=BB493_565 Depth=1 ori $a0, $zero, 69 masknez $a0, $a0, $s3 ori $a1, $zero, 71 -.LBB493_1073: # in Loop: Header=BB493_566 Depth=1 +.LBB493_1072: # in Loop: Header=BB493_565 Depth=1 maskeqz $a1, $a1, $s3 or $fp, $a1, $a0 - ld.d $a3, $sp, 192 # 8-byte Folded Reload + ld.d $a3, $sp, 200 # 8-byte Folded Reload ld.d $a4, $sp, 464 # 8-byte Folded Reload ori $s7, $zero, 1 .p2align 4, , 16 -.LBB493_1074: # Parent Loop BB493_566 Depth=1 +.LBB493_1073: # Parent Loop BB493_565 Depth=1 # => This Inner Loop Header: Depth=2 ld.bu $a0, $s0, 16 andi $a1, $a0, 4 - bnez $a1, .LBB493_1079 -# %bb.1075: # in Loop: Header=BB493_1074 Depth=2 + bnez $a1, .LBB493_1078 +# %bb.1074: # in Loop: Header=BB493_1073 Depth=2 ld.w $a1, $a3, 12 - beqz $a1, .LBB493_1077 -# %bb.1076: # in Loop: Header=BB493_1074 Depth=2 + beqz $a1, .LBB493_1076 +# %bb.1075: # in Loop: Header=BB493_1073 Depth=2 ld.d $a1, $s0, 0 ld.hu $a1, $a1, 2 andi $a1, $a1, 1 - beqz $a1, .LBB493_1079 -.LBB493_1077: # in Loop: Header=BB493_1074 Depth=2 + beqz $a1, .LBB493_1078 +.LBB493_1076: # in Loop: Header=BB493_1073 Depth=2 ld.h $a1, $s0, 8 addi.d $a0, $a0, 4 st.b $a0, $s0, 16 - bltz $a1, .LBB493_1079 -# %bb.1078: # in Loop: Header=BB493_1074 Depth=2 + bltz $a1, .LBB493_1078 +# %bb.1077: # in Loop: Header=BB493_1073 Depth=2 ld.d $a0, $s0, 24 ld.d $a0, $a0, 24 slli.d $a2, $a1, 5 @@ -113310,9 +113265,9 @@ sqlite3WhereBegin: # @sqlite3WhereBegin addi.d $a0, $a0, -1 andi $a1, $a0, 255 st.b $a0, $s0, 17 - beqz $a1, .LBB493_1074 -.LBB493_1079: # in Loop: Header=BB493_566 Depth=1 - ld.d $a0, $sp, 272 # 8-byte Folded Reload + beqz $a1, .LBB493_1073 +.LBB493_1078: # in Loop: Header=BB493_565 Depth=1 + ld.d $a0, $sp, 280 # 8-byte Folded Reload ld.w $s1, $a0, 24 ori $a0, $zero, 102 masknez $a1, 
$a0, $s3 @@ -113323,49 +113278,49 @@ sqlite3WhereBegin: # @sqlite3WhereBegin st.w $a1, $a3, 48 st.w $a4, $a3, 52 st.w $s1, $a3, 56 - beqz $a0, .LBB493_1081 -# %bb.1080: # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_1080 +# %bb.1079: # in Loop: Header=BB493_565 Depth=1 addi.d $a0, $a0, -1 andi $a1, $a0, 255 slli.d $a1, $a1, 2 - ld.d $a2, $sp, 376 # 8-byte Folded Reload + ld.d $a2, $sp, 384 # 8-byte Folded Reload ldx.w $s0, $a2, $a1 st.b $a0, $s8, 37 - ld.d $a1, $sp, 272 # 8-byte Folded Reload + ld.d $a1, $sp, 280 # 8-byte Folded Reload ld.w $s3, $a1, 28 move $s2, $s1 - bge $s1, $s3, .LBB493_1082 - b .LBB493_1084 -.LBB493_1081: # in Loop: Header=BB493_566 Depth=1 + bge $s1, $s3, .LBB493_1081 + b .LBB493_1083 +.LBB493_1080: # in Loop: Header=BB493_565 Depth=1 ld.w $a0, $s8, 88 addi.w $s0, $a0, 1 st.w $s0, $s8, 88 - ld.d $a1, $sp, 272 # 8-byte Folded Reload + ld.d $a1, $sp, 280 # 8-byte Folded Reload ld.w $s3, $a1, 28 move $s2, $s1 - blt $s1, $s3, .LBB493_1084 -.LBB493_1082: # in Loop: Header=BB493_566 Depth=1 + blt $s1, $s3, .LBB493_1083 +.LBB493_1081: # in Loop: Header=BB493_565 Depth=1 ld.d $s2, $a1, 0 ld.bu $a0, $s2, 42 - beqz $a0, .LBB493_1095 -.LBB493_1083: # %resizeOpArray.exit.i.i1077 - # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_1094 +.LBB493_1082: # %resizeOpArray.exit.i.i1077 + # in Loop: Header=BB493_565 Depth=1 ld.d $a0, $a1, 0 ld.bu $a0, $a0, 42 ld.w $s2, $a1, 24 - bnez $a0, .LBB493_1085 -.LBB493_1084: # %resizeOpArray.exit._crit_edge.i.i1080 - # in Loop: Header=BB493_566 Depth=1 + bnez $a0, .LBB493_1084 +.LBB493_1083: # %resizeOpArray.exit._crit_edge.i.i1080 + # in Loop: Header=BB493_565 Depth=1 ld.d $a0, $a1, 32 addi.d $a1, $s2, 1 - ld.d $a2, $sp, 272 # 8-byte Folded Reload + ld.d $a2, $sp, 280 # 8-byte Folded Reload st.w $a1, $a2, 24 ori $a1, $zero, 24 mul.d $a1, $s1, $a1 add.d $a2, $a0, $a1 ori $a3, $zero, 38 stx.h $a3, $a0, $a1 - ld.d $a1, $sp, 272 # 8-byte Folded Reload + ld.d $a1, $sp, 280 # 8-byte Folded Reload ld.d $a0, $sp, 464 # 8-byte Folded Reload st.w $a0, $a2, 4 st.d $zero, $a2, 16 @@ -113373,72 +113328,72 @@ sqlite3WhereBegin: # @sqlite3WhereBegin st.w $s0, $a2, 8 st.w $zero, $a2, 12 st.b $zero, $a1, 339 -.LBB493_1085: # %sqlite3VdbeAddOp2.exit1084 - # in Loop: Header=BB493_566 Depth=1 +.LBB493_1084: # %sqlite3VdbeAddOp2.exit1084 + # in Loop: Header=BB493_565 Depth=1 ld.w $s3, $a1, 28 - ld.d $a0, $sp, 192 # 8-byte Folded Reload + ld.d $a0, $sp, 200 # 8-byte Folded Reload ld.w $s1, $a0, 8 move $a0, $s2 - blt $s2, $s3, .LBB493_1089 -# %bb.1086: # in Loop: Header=BB493_566 Depth=1 + blt $s2, $s3, .LBB493_1088 +# %bb.1085: # in Loop: Header=BB493_565 Depth=1 ld.d $s4, $a1, 0 ld.bu $a0, $s4, 42 - beqz $a0, .LBB493_1098 -.LBB493_1087: # %resizeOpArray.exit.i1088 - # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_1097 +.LBB493_1086: # %resizeOpArray.exit.i1088 + # in Loop: Header=BB493_565 Depth=1 ld.d $a0, $a1, 0 ld.bu $a0, $a0, 42 - ld.d $s5, $sp, 448 # 8-byte Folded Reload - bnez $a0, .LBB493_1090 -# %bb.1088: # %resizeOpArray.exit._crit_edge.i1091 - # in Loop: Header=BB493_566 Depth=1 + ld.d $s5, $sp, 440 # 8-byte Folded Reload + bnez $a0, .LBB493_1089 +# %bb.1087: # %resizeOpArray.exit._crit_edge.i1091 + # in Loop: Header=BB493_565 Depth=1 ld.w $a0, $a1, 24 -.LBB493_1089: # in Loop: Header=BB493_566 Depth=1 - ld.d $a1, $sp, 272 # 8-byte Folded Reload +.LBB493_1088: # in Loop: Header=BB493_565 Depth=1 + ld.d $a1, $sp, 280 # 8-byte Folded Reload ld.d $a1, $a1, 32 addi.d $a0, $a0, 1 - ld.d $a2, $sp, 272 # 8-byte Folded Reload + ld.d $a2, 
$sp, 280 # 8-byte Folded Reload st.w $a0, $a2, 24 ori $a0, $zero, 24 mul.d $a0, $s2, $a0 add.d $a2, $a1, $a0 stx.b $fp, $a1, $a0 - ld.d $a1, $sp, 272 # 8-byte Folded Reload + ld.d $a1, $sp, 280 # 8-byte Folded Reload st.w $s1, $a2, 4 - ld.d $a0, $sp, 416 # 8-byte Folded Reload + ld.d $a0, $sp, 424 # 8-byte Folded Reload st.w $a0, $a2, 8 st.w $s0, $a2, 12 st.d $zero, $a2, 16 st.b $zero, $a2, 1 st.b $zero, $a1, 339 -.LBB493_1090: # %sqlite3VdbeAddOp3.exit1095 - # in Loop: Header=BB493_566 Depth=1 +.LBB493_1089: # %sqlite3VdbeAddOp3.exit1095 + # in Loop: Header=BB493_565 Depth=1 ld.d $a0, $a1, 32 - beqz $a0, .LBB493_1092 -# %bb.1091: # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_1091 +# %bb.1090: # in Loop: Header=BB493_565 Depth=1 ld.w $a1, $a1, 24 ori $a2, $zero, 24 mul.d $a1, $a1, $a2 add.d $a0, $a0, $a1 ori $a1, $zero, 107 st.b $a1, $a0, -21 -.LBB493_1092: # %sqlite3VdbeChangeP5.exit - # in Loop: Header=BB493_566 Depth=1 - ld.d $s1, $sp, 192 # 8-byte Folded Reload +.LBB493_1091: # %sqlite3VdbeChangeP5.exit + # in Loop: Header=BB493_565 Depth=1 + ld.d $s1, $sp, 200 # 8-byte Folded Reload ld.d $a6, $sp, 464 # 8-byte Folded Reload - beqz $s0, .LBB493_817 -# %bb.1093: # in Loop: Header=BB493_566 Depth=1 + beqz $s0, .LBB493_816 +# %bb.1092: # in Loop: Header=BB493_565 Depth=1 ld.bu $a0, $s8, 37 ori $a1, $zero, 7 - bltu $a1, $a0, .LBB493_817 -# %bb.1094: # in Loop: Header=BB493_566 Depth=1 + bltu $a1, $a0, .LBB493_816 +# %bb.1093: # in Loop: Header=BB493_565 Depth=1 addi.d $a1, $a0, 1 st.b $a1, $s8, 37 slli.d $a0, $a0, 2 - ld.d $a1, $sp, 376 # 8-byte Folded Reload + ld.d $a1, $sp, 384 # 8-byte Folded Reload stx.w $s0, $a1, $a0 - b .LBB493_817 -.LBB493_1095: # in Loop: Header=BB493_566 Depth=1 + b .LBB493_816 +.LBB493_1094: # in Loop: Header=BB493_565 Depth=1 sltui $a0, $s3, 1 move $s5, $a1 slli.w $a1, $s3, 1 @@ -113451,15 +113406,15 @@ sqlite3WhereBegin: # @sqlite3WhereBegin alsl.w $a1, $s4, $a1, 3 pcaddu18i $ra, %call36(sqlite3_realloc) jirl $ra, $ra, 0 - beqz $a0, .LBB493_1101 -# %bb.1096: # %sqlite3DbRealloc.exit.i.i.i1083 - # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_1100 +# %bb.1095: # %sqlite3DbRealloc.exit.i.i.i1083 + # in Loop: Header=BB493_565 Depth=1 st.w $s4, $s5, 28 st.d $a0, $s5, 32 move $a1, $s5 - ld.d $s5, $sp, 448 # 8-byte Folded Reload - bge $s3, $s4, .LBB493_1083 -# %bb.1097: # in Loop: Header=BB493_566 Depth=1 + ld.d $s5, $sp, 440 # 8-byte Folded Reload + bge $s3, $s4, .LBB493_1082 +# %bb.1096: # in Loop: Header=BB493_565 Depth=1 ori $a2, $zero, 24 mul.d $a1, $s3, $a2 add.d $a0, $a0, $a1 @@ -113468,9 +113423,9 @@ sqlite3WhereBegin: # @sqlite3WhereBegin move $a1, $zero pcaddu18i $ra, %call36(memset) jirl $ra, $ra, 0 - ld.d $a1, $sp, 272 # 8-byte Folded Reload - b .LBB493_1083 -.LBB493_1098: # in Loop: Header=BB493_566 Depth=1 + ld.d $a1, $sp, 280 # 8-byte Folded Reload + b .LBB493_1082 +.LBB493_1097: # in Loop: Header=BB493_565 Depth=1 sltui $a0, $s3, 1 move $s6, $a1 slli.w $a1, $s3, 1 @@ -113483,15 +113438,15 @@ sqlite3WhereBegin: # @sqlite3WhereBegin alsl.w $a1, $s5, $a1, 3 pcaddu18i $ra, %call36(sqlite3_realloc) jirl $ra, $ra, 0 - beqz $a0, .LBB493_1102 -# %bb.1099: # %sqlite3DbRealloc.exit.i.i1094 - # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_1101 +# %bb.1098: # %sqlite3DbRealloc.exit.i.i1094 + # in Loop: Header=BB493_565 Depth=1 st.w $s5, $s6, 28 st.d $a0, $s6, 32 move $a1, $s6 - ld.d $s6, $sp, 432 # 8-byte Folded Reload - bge $s3, $s5, .LBB493_1087 -# %bb.1100: # in Loop: Header=BB493_566 Depth=1 + ld.d $s6, $sp, 448 # 8-byte Folded 
Reload + bge $s3, $s5, .LBB493_1086 +# %bb.1099: # in Loop: Header=BB493_565 Depth=1 ori $a2, $zero, 24 mul.d $a1, $s3, $a2 add.d $a0, $a0, $a1 @@ -113500,35 +113455,35 @@ sqlite3WhereBegin: # @sqlite3WhereBegin move $a1, $zero pcaddu18i $ra, %call36(memset) jirl $ra, $ra, 0 - ld.d $a1, $sp, 272 # 8-byte Folded Reload - b .LBB493_1087 -.LBB493_1101: # in Loop: Header=BB493_566 Depth=1 + ld.d $a1, $sp, 280 # 8-byte Folded Reload + b .LBB493_1086 +.LBB493_1100: # in Loop: Header=BB493_565 Depth=1 st.b $s7, $s2, 42 move $a1, $s5 - ld.d $s5, $sp, 448 # 8-byte Folded Reload - b .LBB493_1083 -.LBB493_1102: # in Loop: Header=BB493_566 Depth=1 + ld.d $s5, $sp, 440 # 8-byte Folded Reload + b .LBB493_1082 +.LBB493_1101: # in Loop: Header=BB493_565 Depth=1 st.b $s7, $s4, 42 move $a1, $s6 - ld.d $s6, $sp, 432 # 8-byte Folded Reload - b .LBB493_1087 -.LBB493_1103: # in Loop: Header=BB493_566 Depth=1 + ld.d $s6, $sp, 448 # 8-byte Folded Reload + b .LBB493_1086 +.LBB493_1102: # in Loop: Header=BB493_565 Depth=1 st.b $s7, $s5, 42 move $a1, $fp move $fp, $s4 ld.d $s4, $sp, 328 # 8-byte Folded Reload - b .LBB493_1005 -.LBB493_1104: # in Loop: Header=BB493_566 Depth=1 + b .LBB493_1004 +.LBB493_1103: # in Loop: Header=BB493_565 Depth=1 ori $a0, $zero, 1 st.b $a0, $s6, 42 move $a1, $fp - b .LBB493_920 -.LBB493_1105: # in Loop: Header=BB493_566 Depth=1 + b .LBB493_919 +.LBB493_1104: # in Loop: Header=BB493_565 Depth=1 st.b $s7, $s2, 42 move $a1, $fp move $fp, $s4 - b .LBB493_941 -.LBB493_1106: # in Loop: Header=BB493_566 Depth=1 + b .LBB493_940 +.LBB493_1105: # in Loop: Header=BB493_565 Depth=1 sltui $a0, $s0, 1 move $fp, $a5 slli.w $a1, $s0, 1 @@ -113541,14 +113496,14 @@ sqlite3WhereBegin: # @sqlite3WhereBegin alsl.w $a1, $s6, $a1, 3 pcaddu18i $ra, %call36(sqlite3_realloc) jirl $ra, $ra, 0 - beqz $a0, .LBB493_1115 -# %bb.1107: # %sqlite3DbRealloc.exit.i.i.i1158 - # in Loop: Header=BB493_566 Depth=1 + beqz $a0, .LBB493_1114 +# %bb.1106: # %sqlite3DbRealloc.exit.i.i.i1158 + # in Loop: Header=BB493_565 Depth=1 st.w $s6, $fp, 28 st.d $a0, $fp, 32 move $a5, $fp - bge $s0, $s6, .LBB493_863 -# %bb.1108: # in Loop: Header=BB493_566 Depth=1 + bge $s0, $s6, .LBB493_862 +# %bb.1107: # in Loop: Header=BB493_565 Depth=1 ori $a2, $zero, 24 mul.d $a1, $s0, $a2 add.d $a0, $a0, $a1 @@ -113557,101 +113512,101 @@ sqlite3WhereBegin: # @sqlite3WhereBegin move $a1, $zero pcaddu18i $ra, %call36(memset) jirl $ra, $ra, 0 - ld.d $a5, $sp, 272 # 8-byte Folded Reload - b .LBB493_863 -.LBB493_1109: # in Loop: Header=BB493_566 Depth=1 + ld.d $a5, $sp, 280 # 8-byte Folded Reload + b .LBB493_862 +.LBB493_1108: # in Loop: Header=BB493_565 Depth=1 st.b $s7, $s2, 42 move $a1, $s4 - b .LBB493_752 -.LBB493_1110: # in Loop: Header=BB493_566 Depth=1 + b .LBB493_751 +.LBB493_1109: # in Loop: Header=BB493_565 Depth=1 st.b $s7, $s2, 42 - b .LBB493_858 -.LBB493_1111: # in Loop: Header=BB493_566 Depth=1 + b .LBB493_857 +.LBB493_1110: # in Loop: Header=BB493_565 Depth=1 st.b $s7, $s1, 42 move $a1, $s5 - ld.d $s5, $sp, 448 # 8-byte Folded Reload - b .LBB493_980 -.LBB493_1112: # in Loop: Header=BB493_566 Depth=1 + ld.d $s5, $sp, 440 # 8-byte Folded Reload + b .LBB493_979 +.LBB493_1111: # in Loop: Header=BB493_565 Depth=1 st.b $s7, $s3, 42 move $a1, $s5 - ld.d $s5, $sp, 448 # 8-byte Folded Reload - b .LBB493_984 -.LBB493_1113: # in Loop: Header=BB493_566 Depth=1 + ld.d $s5, $sp, 440 # 8-byte Folded Reload + b .LBB493_983 +.LBB493_1112: # in Loop: Header=BB493_565 Depth=1 st.b $s7, $s1, 42 move $a1, $fp move $fp, $s4 ld.d $s4, $sp, 328 # 8-byte Folded Reload 
- b .LBB493_915 -.LBB493_1114: # %.split727 - # in Loop: Header=BB493_566 Depth=1 + b .LBB493_914 +.LBB493_1113: # %.split727 + # in Loop: Header=BB493_565 Depth=1 ori $a1, $zero, 5 move $a0, $s4 - ld.d $a2, $sp, 408 # 8-byte Folded Reload - ld.d $s2, $sp, 432 # 8-byte Folded Reload + ld.d $a2, $sp, 416 # 8-byte Folded Reload + ld.d $s2, $sp, 448 # 8-byte Folded Reload move $a3, $s2 move $a4, $fp pcaddu18i $ra, %call36(sqlite3VdbeAddOp3) jirl $ra, $ra, 0 - b .LBB493_856 -.LBB493_1115: # in Loop: Header=BB493_566 Depth=1 + b .LBB493_855 +.LBB493_1114: # in Loop: Header=BB493_565 Depth=1 st.b $s7, $s5, 42 move $a5, $fp - b .LBB493_863 -.LBB493_1116: # %._crit_edge1759.thread - ld.d $a0, $sp, 272 # 8-byte Folded Reload + b .LBB493_862 +.LBB493_1115: # %._crit_edge1759.thread + ld.d $a0, $sp, 280 # 8-byte Folded Reload ld.w $a0, $a0, 24 move $s5, $zero - ld.d $a1, $sp, 136 # 8-byte Folded Reload + ld.d $a1, $sp, 144 # 8-byte Folded Reload st.w $a0, $a1, 16 - b .LBB493_1118 -.LBB493_1117: + b .LBB493_1117 +.LBB493_1116: move $s5, $zero -.LBB493_1118: # %._crit_edge1787 +.LBB493_1117: # %._crit_edge1787 ld.w $a1, $sp, 488 ld.d $a0, $sp, 496 - ld.d $a2, $sp, 136 # 8-byte Folded Reload + ld.d $a2, $sp, 144 # 8-byte Folded Reload st.w $s5, $a2, 20 - blez $a1, .LBB493_1124 -# %bb.1119: # %.lr.ph.i1387.preheader + blez $a1, .LBB493_1123 +# %bb.1118: # %.lr.ph.i1387.preheader addi.d $fp, $a1, 1 addi.d $s0, $a0, 16 ori $s1, $zero, 1 - b .LBB493_1121 + b .LBB493_1120 .p2align 4, , 16 -.LBB493_1120: # in Loop: Header=BB493_1121 Depth=1 +.LBB493_1119: # in Loop: Header=BB493_1120 Depth=1 addi.w $fp, $fp, -1 addi.d $s0, $s0, 48 - bgeu $s1, $fp, .LBB493_1123 -.LBB493_1121: # %.lr.ph.i1387 + bgeu $s1, $fp, .LBB493_1122 +.LBB493_1120: # %.lr.ph.i1387 # =>This Inner Loop Header: Depth=1 ld.bu $a0, $s0, 0 andi $a0, $a0, 1 - beqz $a0, .LBB493_1120 -# %bb.1122: # in Loop: Header=BB493_1121 Depth=1 + beqz $a0, .LBB493_1119 +# %bb.1121: # in Loop: Header=BB493_1120 Depth=1 ld.d $a0, $s0, -16 pcaddu18i $ra, %call36(sqlite3ExprDelete) jirl $ra, $ra, 0 - b .LBB493_1120 -.LBB493_1123: # %._crit_edge.loopexit.i + b .LBB493_1119 +.LBB493_1122: # %._crit_edge.loopexit.i ld.d $a0, $sp, 496 -.LBB493_1124: # %._crit_edge.i1382 +.LBB493_1123: # %._crit_edge.i1382 addi.d $a1, $sp, 504 - beq $a0, $a1, .LBB493_1127 -# %bb.1125: # %._crit_edge.i1382 - beqz $a0, .LBB493_1127 -# %bb.1126: + beq $a0, $a1, .LBB493_1126 +# %bb.1124: # %._crit_edge.i1382 + beqz $a0, .LBB493_1126 +# %bb.1125: ld.w $a1, $a0, -8 - ld.d $a3, $sp, 96 # 8-byte Folded Reload + ld.d $a3, $sp, 112 # 8-byte Folded Reload ld.d $a2, $a3, %pc_lo12(mem.5) addi.d $a0, $a0, -8 sub.d $a1, $a2, $a1 st.d $a1, $a3, %pc_lo12(mem.5) pcaddu18i $ra, %call36(free) jirl $ra, $ra, 0 -.LBB493_1127: - ld.d $a0, $sp, 136 # 8-byte Folded Reload +.LBB493_1126: + ld.d $a0, $sp, 144 # 8-byte Folded Reload b .LBB493_50 -.LBB493_1128: +.LBB493_1127: ori $a0, $zero, 1 st.b $a0, $s1, 42 bnez $fp, .LBB493_16 @@ -149299,47 +149254,55 @@ hexFunc: # @hexFunc add.d $a1, $s0, $a1 alsl.d $a2, $a2, $s1, 4 addi.d $a4, $s0, 15 - vrepli.b $vr0, 0 - vrepli.w $vr1, 15 + vrepli.w $vr0, 15 move $a5, $a3 .p2align 4, , 16 .LBB588_34: # %vector.body # =>This Inner Loop Header: Depth=1 vld $vr4, $s1, 0 - vilvh.b $vr3, $vr0, $vr4 - vilvh.h $vr2, $vr0, $vr3 - vilvl.h $vr3, $vr0, $vr3 - vilvl.b $vr4, $vr0, $vr4 - vilvh.h $vr5, $vr0, $vr4 - vilvl.h $vr4, $vr0, $vr4 - vsrli.w $vr6, $vr4, 4 - vsrli.w $vr7, $vr5, 4 - vsrli.w $vr8, $vr3, 4 - vsrli.w $vr9, $vr2, 4 - vilvh.w $vr10, $vr0, $vr9 - vilvl.w $vr9, $vr0, 
$vr9 - vilvh.w $vr11, $vr0, $vr8 - vilvl.w $vr8, $vr0, $vr8 - vilvh.w $vr12, $vr0, $vr7 - vilvl.w $vr7, $vr0, $vr7 - vilvh.w $vr13, $vr0, $vr6 - vilvl.w $vr6, $vr0, $vr6 - vpickve2gr.d $a6, $vr6, 0 - vpickve2gr.d $a7, $vr6, 1 - vpickve2gr.d $t0, $vr13, 0 - vpickve2gr.d $t1, $vr13, 1 - vpickve2gr.d $t2, $vr7, 0 - vpickve2gr.d $t3, $vr7, 1 - vpickve2gr.d $t4, $vr12, 0 - vpickve2gr.d $t5, $vr12, 1 - vpickve2gr.d $t6, $vr8, 0 - vpickve2gr.d $t7, $vr8, 1 - vpickve2gr.d $t8, $vr11, 0 - vpickve2gr.d $s5, $vr11, 1 - vpickve2gr.d $s6, $vr9, 0 - vpickve2gr.d $s7, $vr9, 1 - vpickve2gr.d $s8, $vr10, 0 - vpickve2gr.d $ra, $vr10, 1 + vbsrl.v $vr1, $vr4, 12 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vbsrl.v $vr2, $vr4, 8 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vsrli.d $vr3, $vr4, 32 + vsllwil.hu.bu $vr3, $vr3, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vsllwil.hu.bu $vr4, $vr4, 0 + vsllwil.wu.hu $vr4, $vr4, 0 + vsrli.w $vr5, $vr4, 4 + vsrli.w $vr6, $vr3, 4 + vsrli.w $vr7, $vr2, 4 + vsrli.w $vr8, $vr1, 4 + vshuf4i.w $vr9, $vr8, 14 + vsllwil.du.wu $vr9, $vr9, 0 + vsllwil.du.wu $vr8, $vr8, 0 + vshuf4i.w $vr10, $vr7, 14 + vsllwil.du.wu $vr10, $vr10, 0 + vsllwil.du.wu $vr7, $vr7, 0 + vshuf4i.w $vr11, $vr6, 14 + vsllwil.du.wu $vr11, $vr11, 0 + vsllwil.du.wu $vr6, $vr6, 0 + vshuf4i.w $vr12, $vr5, 14 + vsllwil.du.wu $vr12, $vr12, 0 + vsllwil.du.wu $vr5, $vr5, 0 + vpickve2gr.d $a6, $vr5, 0 + vpickve2gr.d $a7, $vr5, 1 + vpickve2gr.d $t0, $vr12, 0 + vpickve2gr.d $t1, $vr12, 1 + vpickve2gr.d $t2, $vr6, 0 + vpickve2gr.d $t3, $vr6, 1 + vpickve2gr.d $t4, $vr11, 0 + vpickve2gr.d $t5, $vr11, 1 + vpickve2gr.d $t6, $vr7, 0 + vpickve2gr.d $t7, $vr7, 1 + vpickve2gr.d $t8, $vr10, 0 + vpickve2gr.d $s5, $vr10, 1 + vpickve2gr.d $s6, $vr8, 0 + vpickve2gr.d $s7, $vr8, 1 + vpickve2gr.d $s8, $vr9, 0 + vpickve2gr.d $ra, $vr9, 1 ldx.b $a6, $a0, $a6 ldx.b $a7, $a0, $a7 ldx.b $t0, $a0, $t0 @@ -149372,34 +149335,38 @@ hexFunc: # @hexFunc st.b $s7, $a4, 11 st.b $s8, $a4, 13 st.b $ra, $a4, 15 - vand.v $vr4, $vr4, $vr1 - vand.v $vr5, $vr5, $vr1 - vand.v $vr3, $vr3, $vr1 - vand.v $vr2, $vr2, $vr1 - vilvh.w $vr6, $vr0, $vr2 - vilvl.w $vr2, $vr0, $vr2 - vilvh.w $vr7, $vr0, $vr3 - vilvl.w $vr3, $vr0, $vr3 - vilvh.w $vr8, $vr0, $vr5 - vilvl.w $vr5, $vr0, $vr5 - vilvh.w $vr9, $vr0, $vr4 - vilvl.w $vr4, $vr0, $vr4 + vand.v $vr4, $vr4, $vr0 + vand.v $vr3, $vr3, $vr0 + vand.v $vr2, $vr2, $vr0 + vand.v $vr1, $vr1, $vr0 + vshuf4i.w $vr5, $vr1, 14 + vsllwil.du.wu $vr5, $vr5, 0 + vsllwil.du.wu $vr1, $vr1, 0 + vshuf4i.w $vr6, $vr2, 14 + vsllwil.du.wu $vr6, $vr6, 0 + vsllwil.du.wu $vr2, $vr2, 0 + vshuf4i.w $vr7, $vr3, 14 + vsllwil.du.wu $vr7, $vr7, 0 + vsllwil.du.wu $vr3, $vr3, 0 + vshuf4i.w $vr8, $vr4, 14 + vsllwil.du.wu $vr8, $vr8, 0 + vsllwil.du.wu $vr4, $vr4, 0 vpickve2gr.d $a6, $vr4, 0 vpickve2gr.d $a7, $vr4, 1 - vpickve2gr.d $t0, $vr9, 0 - vpickve2gr.d $t1, $vr9, 1 - vpickve2gr.d $t2, $vr5, 0 - vpickve2gr.d $t3, $vr5, 1 - vpickve2gr.d $t4, $vr8, 0 - vpickve2gr.d $t5, $vr8, 1 - vpickve2gr.d $t6, $vr3, 0 - vpickve2gr.d $t7, $vr3, 1 - vpickve2gr.d $t8, $vr7, 0 - vpickve2gr.d $s5, $vr7, 1 - vpickve2gr.d $s6, $vr2, 0 - vpickve2gr.d $s7, $vr2, 1 - vpickve2gr.d $s8, $vr6, 0 - vpickve2gr.d $ra, $vr6, 1 + vpickve2gr.d $t0, $vr8, 0 + vpickve2gr.d $t1, $vr8, 1 + vpickve2gr.d $t2, $vr3, 0 + vpickve2gr.d $t3, $vr3, 1 + vpickve2gr.d $t4, $vr7, 0 + vpickve2gr.d $t5, $vr7, 1 + vpickve2gr.d $t6, $vr2, 0 + vpickve2gr.d $t7, $vr2, 1 + vpickve2gr.d $t8, $vr6, 0 + vpickve2gr.d $s5, $vr6, 1 + vpickve2gr.d $s6, $vr1, 0 + vpickve2gr.d $s7, 
$vr1, 1 + vpickve2gr.d $s8, $vr5, 0 + vpickve2gr.d $ra, $vr5, 1 ldx.b $a6, $a0, $a6 ldx.b $a7, $a0, $a7 ldx.b $t0, $a0, $t0 @@ -149993,179 +149960,212 @@ quoteFunc: # @quoteFunc st.d $s3, $sp, 16 # 8-byte Folded Spill bstrpick.d $a1, $s3, 30, 4 slli.d $a3, $a1, 4 - vrepli.b $vr8, 0 st.d $s1, $sp, 32 # 8-byte Folded Spill move $a2, $s1 st.d $a3, $sp, 8 # 8-byte Folded Spill .LBB593_48: # %vector.body # =>This Inner Loop Header: Depth=1 - vld $vr9, $a2, 0 - vsrli.b $vr10, $vr9, 4 - vilvh.b $vr11, $vr8, $vr10 - vilvh.h $vr12, $vr8, $vr11 - vilvh.w $vr13, $vr8, $vr12 - vilvl.w $vr12, $vr8, $vr12 - vilvl.h $vr11, $vr8, $vr11 - vilvh.w $vr14, $vr8, $vr11 - vilvl.w $vr11, $vr8, $vr11 - vilvl.b $vr10, $vr8, $vr10 - vilvh.h $vr15, $vr8, $vr10 - vilvh.w $vr16, $vr8, $vr15 - vilvl.w $vr15, $vr8, $vr15 - vilvl.h $vr10, $vr8, $vr10 - vilvh.w $vr17, $vr8, $vr10 - vilvl.w $vr10, $vr8, $vr10 - vpickve2gr.d $a4, $vr10, 0 - vpickve2gr.d $a5, $vr10, 1 - vpickve2gr.d $a6, $vr17, 0 - vpickve2gr.d $a7, $vr17, 1 + vld $vr8, $a2, 0 + vsrli.b $vr9, $vr8, 4 + vbsrl.v $vr10, $vr9, 14 + vsllwil.hu.bu $vr10, $vr10, 0 + vsllwil.wu.hu $vr10, $vr10, 0 + vsllwil.du.wu $vr10, $vr10, 0 + vbsrl.v $vr11, $vr9, 12 + vsllwil.hu.bu $vr11, $vr11, 0 + vsllwil.wu.hu $vr11, $vr11, 0 + vsllwil.du.wu $vr11, $vr11, 0 + vbsrl.v $vr12, $vr9, 10 + vsllwil.hu.bu $vr12, $vr12, 0 + vsllwil.wu.hu $vr12, $vr12, 0 + vsllwil.du.wu $vr12, $vr12, 0 + vbsrl.v $vr13, $vr9, 8 + vsllwil.hu.bu $vr13, $vr13, 0 + vsllwil.wu.hu $vr13, $vr13, 0 + vsllwil.du.wu $vr13, $vr13, 0 + vsrli.d $vr14, $vr9, 48 + vsllwil.hu.bu $vr14, $vr14, 0 + vsllwil.wu.hu $vr14, $vr14, 0 + vsllwil.du.wu $vr14, $vr14, 0 + vsrli.d $vr15, $vr9, 32 + vsllwil.hu.bu $vr15, $vr15, 0 + vsllwil.wu.hu $vr15, $vr15, 0 + vsllwil.du.wu $vr15, $vr15, 0 + vshuf4i.b $vr16, $vr9, 14 + vsllwil.hu.bu $vr16, $vr16, 0 + vsllwil.wu.hu $vr16, $vr16, 0 + vsllwil.du.wu $vr16, $vr16, 0 + vsllwil.hu.bu $vr9, $vr9, 0 + vsllwil.wu.hu $vr9, $vr9, 0 + vsllwil.du.wu $vr9, $vr9, 0 + vpickve2gr.d $a4, $vr9, 0 + vpickve2gr.d $a5, $vr9, 1 + vpickve2gr.d $a6, $vr16, 0 + vpickve2gr.d $a7, $vr16, 1 vpickve2gr.d $t0, $vr15, 0 vpickve2gr.d $t1, $vr15, 1 - vpickve2gr.d $t3, $vr16, 0 - vpickve2gr.d $t4, $vr16, 1 - vpickve2gr.d $t5, $vr11, 0 - vpickve2gr.d $t8, $vr11, 1 - vpickve2gr.d $s7, $vr14, 0 - vpickve2gr.d $s8, $vr14, 1 - vpickve2gr.d $ra, $vr12, 0 - vpickve2gr.d $a1, $vr12, 1 - vpickve2gr.d $s1, $vr13, 0 - vpickve2gr.d $s2, $vr13, 1 + vpickve2gr.d $t2, $vr14, 0 + vpickve2gr.d $t3, $vr14, 1 + vpickve2gr.d $t4, $vr13, 0 + vpickve2gr.d $t6, $vr13, 1 + vpickve2gr.d $t8, $vr12, 0 + vpickve2gr.d $s7, $vr12, 1 + vpickve2gr.d $s8, $vr11, 0 + vpickve2gr.d $a1, $vr11, 1 + vpickve2gr.d $s1, $vr10, 0 + vpickve2gr.d $s2, $vr10, 1 ldx.b $s3, $a0, $a4 ldx.b $s4, $a0, $a5 - ldx.b $t2, $a0, $a6 - ldx.b $t6, $a0, $a7 + ldx.b $t5, $a0, $a6 + ldx.b $t7, $a0, $a7 ldx.b $s5, $a0, $t0 ldx.b $s6, $a0, $t1 - ldx.b $t7, $a0, $t3 - ldx.b $t3, $a0, $t4 - ldx.b $t4, $a0, $t5 - ldx.b $t0, $a0, $t8 + ldx.b $t2, $a0, $t2 + ldx.b $a7, $a0, $t3 + ldx.b $t0, $a0, $t4 + ldx.b $a5, $a0, $t6 + ldx.b $a6, $a0, $t8 ldx.b $t1, $a0, $s7 - ldx.b $t5, $a0, $s8 - ldx.b $ra, $a0, $ra + ldx.b $ra, $a0, $s8 ldx.b $s7, $a0, $a1 ldx.b $s8, $a0, $s1 - ldx.b $a7, $a0, $s2 - vslli.d $vr10, $vr0, 1 - vslli.d $vr11, $vr1, 1 - vslli.d $vr12, $vr2, 1 - vslli.d $vr13, $vr3, 1 - vslli.d $vr14, $vr4, 1 - vslli.d $vr15, $vr5, 1 - vslli.d $vr16, $vr6, 1 - vslli.d $vr17, $vr7, 1 - vpickve2gr.d $a1, $vr17, 0 - add.d $a5, $s0, $a1 - vpickve2gr.d $a1, $vr17, 1 - add.d $a6, $s0, 
$a1 + ldx.b $t6, $a0, $s2 + vslli.d $vr9, $vr0, 1 + vslli.d $vr10, $vr1, 1 + vslli.d $vr11, $vr2, 1 + vslli.d $vr12, $vr3, 1 + vslli.d $vr13, $vr4, 1 + vslli.d $vr14, $vr5, 1 + vslli.d $vr15, $vr6, 1 + vslli.d $vr16, $vr7, 1 vpickve2gr.d $a1, $vr16, 0 - add.d $a4, $s0, $a1 - st.b $s3, $a5, 2 + add.d $t3, $s0, $a1 vpickve2gr.d $a1, $vr16, 1 - add.d $t8, $s0, $a1 - st.b $s4, $a6, 2 + add.d $t4, $s0, $a1 vpickve2gr.d $a1, $vr15, 0 - add.d $s4, $s0, $a1 - st.b $t2, $a4, 2 + add.d $a4, $s0, $a1 + st.b $s3, $t3, 2 vpickve2gr.d $a1, $vr15, 1 - add.d $t2, $s0, $a1 - st.b $t6, $t8, 2 + add.d $t8, $s0, $a1 + st.b $s4, $t4, 2 vpickve2gr.d $a1, $vr14, 0 - add.d $t6, $s0, $a1 - st.b $s5, $s4, 2 + add.d $s4, $s0, $a1 + st.b $t5, $a4, 2 vpickve2gr.d $a1, $vr14, 1 - add.d $s5, $s0, $a1 - st.b $s6, $t2, 2 + add.d $t5, $s0, $a1 + st.b $t7, $t8, 2 vpickve2gr.d $a1, $vr13, 0 - add.d $s6, $s0, $a1 - st.b $t7, $t6, 2 - vpickve2gr.d $a1, $vr13, 1 add.d $t7, $s0, $a1 - st.b $t3, $s5, 2 + st.b $s5, $s4, 2 + vpickve2gr.d $a1, $vr13, 1 + add.d $s5, $s0, $a1 + st.b $s6, $t5, 2 vpickve2gr.d $a1, $vr12, 0 - add.d $t3, $s0, $a1 - st.b $t4, $s6, 2 + add.d $s6, $s0, $a1 + st.b $t2, $t7, 2 vpickve2gr.d $a1, $vr12, 1 - add.d $t4, $s0, $a1 - st.b $t0, $t7, 2 + add.d $t2, $s0, $a1 + st.b $a7, $s5, 2 vpickve2gr.d $a1, $vr11, 0 - add.d $t0, $s0, $a1 - st.b $t1, $t3, 2 + add.d $a7, $s0, $a1 + st.b $t0, $s6, 2 vpickve2gr.d $a1, $vr11, 1 - add.d $t1, $s0, $a1 - st.b $t5, $t4, 2 + add.d $t0, $s0, $a1 + st.b $a5, $t2, 2 vpickve2gr.d $a1, $vr10, 0 - add.d $t5, $s0, $a1 - st.b $ra, $t0, 2 + add.d $a5, $s0, $a1 + st.b $a6, $a7, 2 vpickve2gr.d $a1, $vr10, 1 - st.b $s7, $t1, 2 - st.b $s8, $t5, 2 - vandi.b $vr9, $vr9, 15 - vilvl.b $vr10, $vr8, $vr9 - vilvl.h $vr11, $vr8, $vr10 - vilvl.w $vr12, $vr8, $vr11 - vpickve2gr.d $s1, $vr12, 0 - vpickve2gr.d $s2, $vr12, 1 + add.d $a6, $s0, $a1 + st.b $t1, $t0, 2 + vpickve2gr.d $a1, $vr9, 0 + add.d $t1, $s0, $a1 + st.b $ra, $a5, 2 + vpickve2gr.d $a1, $vr9, 1 + st.b $s7, $a6, 2 + st.b $s8, $t1, 2 + vandi.b $vr8, $vr8, 15 + vsllwil.hu.bu $vr9, $vr8, 0 + vsllwil.wu.hu $vr9, $vr9, 0 + vsllwil.du.wu $vr9, $vr9, 0 + vpickve2gr.d $s1, $vr9, 0 + vpickve2gr.d $s2, $vr9, 1 ldx.b $s1, $a0, $s1 ldx.b $s2, $a0, $s2 add.d $s7, $s0, $a1 - st.b $a7, $s7, 2 - st.b $s1, $a5, 3 - st.b $s2, $a6, 3 - vilvh.b $vr9, $vr8, $vr9 - vilvl.h $vr12, $vr8, $vr9 - vilvh.w $vr11, $vr8, $vr11 + st.b $t6, $s7, 2 + st.b $s1, $t3, 3 + st.b $s2, $t4, 3 + vbsrl.v $vr9, $vr8, 8 + vsllwil.hu.bu $vr9, $vr9, 0 + vsllwil.wu.hu $vr9, $vr9, 0 + vsllwil.du.wu $vr9, $vr9, 0 + vsrli.d $vr10, $vr8, 48 + vsllwil.hu.bu $vr10, $vr10, 0 + vsllwil.wu.hu $vr10, $vr10, 0 + vsllwil.du.wu $vr10, $vr10, 0 + vshuf4i.b $vr11, $vr8, 14 + vsllwil.hu.bu $vr11, $vr11, 0 + vsllwil.wu.hu $vr11, $vr11, 0 + vsllwil.du.wu $vr11, $vr11, 0 vpickve2gr.d $a1, $vr11, 0 ldx.b $a1, $a0, $a1 - vilvl.w $vr13, $vr8, $vr12 - vilvh.h $vr10, $vr8, $vr10 - vilvh.w $vr14, $vr8, $vr10 + vsrli.d $vr12, $vr8, 32 + vsllwil.hu.bu $vr12, $vr12, 0 + vsllwil.wu.hu $vr12, $vr12, 0 st.b $a1, $a4, 3 vpickve2gr.d $a1, $vr11, 1 ldx.b $a1, $a0, $a1 - vilvl.w $vr10, $vr8, $vr10 - vpickve2gr.d $a4, $vr10, 0 + vsllwil.du.wu $vr11, $vr12, 0 + vpickve2gr.d $a4, $vr11, 0 ldx.b $a4, $a0, $a4 st.b $a1, $t8, 3 - vpickve2gr.d $a1, $vr10, 1 - vpickve2gr.d $a5, $vr14, 0 + vpickve2gr.d $a1, $vr11, 1 + vpickve2gr.d $t3, $vr10, 0 st.b $a4, $s4, 3 - vpickve2gr.d $a4, $vr14, 1 - vpickve2gr.d $a6, $vr13, 0 + vpickve2gr.d $a4, $vr10, 1 + vpickve2gr.d $t4, $vr9, 0 ldx.b $a1, $a0, $a1 - ldx.b $a5, $a0, $a5 + ldx.b 
$t3, $a0, $t3 ldx.b $a4, $a0, $a4 - ldx.b $a6, $a0, $a6 - st.b $a1, $t2, 3 - st.b $a5, $t6, 3 + ldx.b $t4, $a0, $t4 + st.b $a1, $t5, 3 + st.b $t3, $t7, 3 st.b $a4, $s5, 3 - st.b $a6, $s6, 3 - vpickve2gr.d $a1, $vr13, 1 + st.b $t4, $s6, 3 + vbsrl.v $vr10, $vr8, 14 + vsllwil.hu.bu $vr10, $vr10, 0 + vsllwil.wu.hu $vr10, $vr10, 0 + vsllwil.du.wu $vr10, $vr10, 0 + vbsrl.v $vr11, $vr8, 12 + vsllwil.hu.bu $vr11, $vr11, 0 + vsllwil.wu.hu $vr11, $vr11, 0 + vbsrl.v $vr8, $vr8, 10 + vpickve2gr.d $a1, $vr9, 1 ldx.b $a1, $a0, $a1 - vilvh.h $vr9, $vr8, $vr9 - vilvh.w $vr10, $vr8, $vr9 - vilvh.w $vr11, $vr8, $vr12 - st.b $a1, $t7, 3 - vpickve2gr.d $a1, $vr11, 0 + vsllwil.hu.bu $vr8, $vr8, 0 + vsllwil.wu.hu $vr8, $vr8, 0 + vsllwil.du.wu $vr8, $vr8, 0 + st.b $a1, $t2, 3 + vpickve2gr.d $a1, $vr8, 0 ldx.b $a1, $a0, $a1 - vilvl.w $vr9, $vr8, $vr9 - vpickve2gr.d $a4, $vr11, 1 + vsllwil.du.wu $vr9, $vr11, 0 + vpickve2gr.d $a4, $vr8, 1 ldx.b $a4, $a0, $a4 - st.b $a1, $t3, 3 + st.b $a1, $a7, 3 vpickve2gr.d $a1, $vr9, 0 - vpickve2gr.d $a5, $vr9, 1 - st.b $a4, $t4, 3 + vpickve2gr.d $a7, $vr9, 1 + st.b $a4, $t0, 3 vpickve2gr.d $a4, $vr10, 0 - vpickve2gr.d $a6, $vr10, 1 + vpickve2gr.d $t0, $vr10, 1 ldx.b $a1, $a0, $a1 - ldx.b $a5, $a0, $a5 + ldx.b $a7, $a0, $a7 ldx.b $a4, $a0, $a4 - ldx.b $a6, $a0, $a6 - st.b $a1, $t0, 3 - st.b $a5, $t1, 3 - st.b $a4, $t5, 3 - st.b $a6, $s7, 3 + ldx.b $t0, $a0, $t0 + st.b $a1, $a5, 3 + st.b $a7, $a6, 3 + st.b $a4, $t1, 3 + st.b $t0, $s7, 3 vaddi.du $vr7, $vr7, 16 vaddi.du $vr6, $vr6, 16 vaddi.du $vr5, $vr5, 16 diff --git a/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/C/Aes.s b/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/C/Aes.s index 65036e92..183a6c3b 100644 --- a/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/C/Aes.s +++ b/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/C/Aes.s @@ -7,7 +7,6 @@ AesGenTables: # @AesGenTables # %bb.0: # %vector.ph pcalau12i $a0, %pc_hi20(Sbox) addi.d $a0, $a0, %pc_lo12(Sbox) - vrepli.b $vr0, 0 pcalau12i $a1, %pc_hi20(InvS) addi.d $a1, $a1, %pc_lo12(InvS) move $a2, $zero @@ -15,67 +14,84 @@ AesGenTables: # @AesGenTables .p2align 4, , 16 .LBB0_1: # %vector.body # =>This Inner Loop Header: Depth=1 - vldx $vr1, $a0, $a2 + vldx $vr0, $a0, $a2 ori $a4, $a2, 1 - vilvl.b $vr2, $vr0, $vr1 - vilvl.h $vr3, $vr0, $vr2 - vilvl.w $vr4, $vr0, $vr3 - vpickve2gr.d $a5, $vr4, 0 + vsllwil.hu.bu $vr1, $vr0, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vsllwil.du.wu $vr1, $vr1, 0 + vpickve2gr.d $a5, $vr1, 0 stx.b $a2, $a1, $a5 - vpickve2gr.d $a5, $vr4, 1 + vpickve2gr.d $a5, $vr1, 1 stx.b $a4, $a1, $a5 ori $a4, $a2, 2 - vilvh.w $vr3, $vr0, $vr3 - vpickve2gr.d $a5, $vr3, 0 + vshuf4i.b $vr1, $vr0, 14 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vsllwil.du.wu $vr1, $vr1, 0 + vpickve2gr.d $a5, $vr1, 0 stx.b $a4, $a1, $a5 ori $a4, $a2, 3 - vpickve2gr.d $a5, $vr3, 1 + vpickve2gr.d $a5, $vr1, 1 stx.b $a4, $a1, $a5 ori $a4, $a2, 4 - vilvh.h $vr2, $vr0, $vr2 - vilvl.w $vr3, $vr0, $vr2 - vpickve2gr.d $a5, $vr3, 0 + vsrli.d $vr1, $vr0, 32 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vsllwil.du.wu $vr1, $vr1, 0 + vpickve2gr.d $a5, $vr1, 0 stx.b $a4, $a1, $a5 ori $a4, $a2, 5 - vpickve2gr.d $a5, $vr3, 1 + vpickve2gr.d $a5, $vr1, 1 stx.b $a4, $a1, $a5 ori $a4, $a2, 6 - vilvh.w $vr2, $vr0, $vr2 - vpickve2gr.d $a5, $vr2, 0 + vsrli.d $vr1, $vr0, 48 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vsllwil.du.wu $vr1, $vr1, 0 + vpickve2gr.d $a5, $vr1, 0 stx.b $a4, $a1, $a5 ori 
$a4, $a2, 7 - vpickve2gr.d $a5, $vr2, 1 + vpickve2gr.d $a5, $vr1, 1 stx.b $a4, $a1, $a5 ori $a4, $a2, 8 - vilvh.b $vr1, $vr0, $vr1 - vilvl.h $vr2, $vr0, $vr1 - vilvl.w $vr3, $vr0, $vr2 - vpickve2gr.d $a5, $vr3, 0 + vbsrl.v $vr1, $vr0, 8 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vsllwil.du.wu $vr1, $vr1, 0 + vpickve2gr.d $a5, $vr1, 0 stx.b $a4, $a1, $a5 ori $a4, $a2, 9 - vpickve2gr.d $a5, $vr3, 1 + vpickve2gr.d $a5, $vr1, 1 stx.b $a4, $a1, $a5 ori $a4, $a2, 10 - vilvh.w $vr2, $vr0, $vr2 - vpickve2gr.d $a5, $vr2, 0 + vbsrl.v $vr1, $vr0, 10 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vsllwil.du.wu $vr1, $vr1, 0 + vpickve2gr.d $a5, $vr1, 0 stx.b $a4, $a1, $a5 ori $a4, $a2, 11 - vpickve2gr.d $a5, $vr2, 1 + vpickve2gr.d $a5, $vr1, 1 stx.b $a4, $a1, $a5 ori $a4, $a2, 12 - vilvh.h $vr1, $vr0, $vr1 - vilvl.w $vr2, $vr0, $vr1 - vpickve2gr.d $a5, $vr2, 0 + vbsrl.v $vr1, $vr0, 12 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vsllwil.du.wu $vr1, $vr1, 0 + vpickve2gr.d $a5, $vr1, 0 stx.b $a4, $a1, $a5 ori $a4, $a2, 13 - vpickve2gr.d $a5, $vr2, 1 + vpickve2gr.d $a5, $vr1, 1 stx.b $a4, $a1, $a5 ori $a4, $a2, 14 - vilvh.w $vr1, $vr0, $vr1 - vpickve2gr.d $a5, $vr1, 0 + vbsrl.v $vr0, $vr0, 14 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.du.wu $vr0, $vr0, 0 + vpickve2gr.d $a5, $vr0, 0 stx.b $a4, $a1, $a5 ori $a4, $a2, 15 - vpickve2gr.d $a5, $vr1, 1 + vpickve2gr.d $a5, $vr0, 1 addi.d $a2, $a2, 16 stx.b $a4, $a1, $a5 bne $a2, $a3, .LBB0_1 @@ -88,7 +104,8 @@ AesGenTables: # @AesGenTables addi.d $a5, $a3, %pc_lo12(T) move $a3, $zero add.d $a4, $a5, $a4 - vrepli.b $vr1, -1 + vrepli.b $vr0, -1 + vrepli.b $vr1, 0 vrepli.w $vr2, 27 vrepli.w $vr3, 254 vrepli.w $vr4, 64 @@ -99,15 +116,13 @@ AesGenTables: # @AesGenTables # =>This Inner Loop Header: Depth=1 ldx.w $a6, $a0, $a3 vinsgr2vr.w $vr6, $a6, 0 - vilvl.b $vr7, $vr0, $vr6 - vilvl.h $vr7, $vr0, $vr7 + vsllwil.hu.bu $vr7, $vr6, 0 + vsllwil.wu.hu $vr7, $vr7, 0 vslli.w $vr8, $vr7, 1 - vslt.b $vr6, $vr1, $vr6 - vilvl.b $vr6, $vr6, $vr6 - vilvl.h $vr6, $vr6, $vr6 - vslli.w $vr6, $vr6, 24 - vsrai.w $vr6, $vr6, 24 - vbitsel.v $vr6, $vr2, $vr0, $vr6 + vslt.b $vr6, $vr0, $vr6 + vsllwil.h.b $vr6, $vr6, 0 + vsllwil.w.h $vr6, $vr6, 0 + vbitsel.v $vr6, $vr2, $vr1, $vr6 vand.v $vr8, $vr8, $vr3 vxor.v $vr6, $vr8, $vr6 vxor.v $vr8, $vr6, $vr7 @@ -138,15 +153,13 @@ AesGenTables: # @AesGenTables vor.v $vr6, $vr6, $vr9 vst $vr6, $a4, 1024 vinsgr2vr.w $vr6, $a6, 0 - vilvl.b $vr7, $vr0, $vr6 - vilvl.h $vr7, $vr0, $vr7 + vsllwil.hu.bu $vr7, $vr6, 0 + vsllwil.wu.hu $vr7, $vr7, 0 vslli.w $vr8, $vr7, 1 - vslt.b $vr6, $vr1, $vr6 - vilvl.b $vr6, $vr6, $vr6 - vilvl.h $vr6, $vr6, $vr6 - vslli.w $vr6, $vr6, 24 - vsrai.w $vr6, $vr6, 24 - vbitsel.v $vr6, $vr2, $vr0, $vr6 + vslt.b $vr6, $vr0, $vr6 + vsllwil.h.b $vr6, $vr6, 0 + vsllwil.w.h $vr6, $vr6, 0 + vbitsel.v $vr6, $vr2, $vr1, $vr6 vand.v $vr8, $vr8, $vr3 vxor.v $vr6, $vr8, $vr6 vslli.w $vr8, $vr6, 1 @@ -282,55 +295,55 @@ AesCbc_Encode: # @AesCbc_Encode .p2align 4, 0x0 # -- Begin function AesCbc_Decode .LCPI2_0: .byte 0 # 0x0 - .byte 17 # 0x11 - .byte 18 # 0x12 - .byte 19 # 0x13 .byte 4 # 0x4 - .byte 21 # 0x15 - .byte 22 # 0x16 - .byte 23 # 0x17 .byte 8 # 0x8 - .byte 25 # 0x19 - .byte 26 # 0x1a - .byte 27 # 0x1b .byte 12 # 0xc - .byte 29 # 0x1d - .byte 30 # 0x1e - .byte 31 # 0x1f + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff 
+ .byte 255 # 0xff + .byte 255 # 0xff .LCPI2_1: .byte 1 # 0x1 - .byte 17 # 0x11 - .byte 18 # 0x12 - .byte 19 # 0x13 .byte 5 # 0x5 - .byte 21 # 0x15 - .byte 22 # 0x16 - .byte 23 # 0x17 .byte 9 # 0x9 - .byte 25 # 0x19 - .byte 26 # 0x1a - .byte 27 # 0x1b .byte 13 # 0xd - .byte 29 # 0x1d - .byte 30 # 0x1e - .byte 31 # 0x1f + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff .LCPI2_2: .byte 2 # 0x2 - .byte 17 # 0x11 - .byte 18 # 0x12 - .byte 19 # 0x13 .byte 6 # 0x6 - .byte 21 # 0x15 - .byte 22 # 0x16 - .byte 23 # 0x17 .byte 10 # 0xa - .byte 25 # 0x19 - .byte 26 # 0x1a - .byte 27 # 0x1b .byte 14 # 0xe - .byte 29 # 0x1d - .byte 30 # 0x1e - .byte 31 # 0x1f + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff .text .globl AesCbc_Decode .p2align 5 @@ -351,14 +364,13 @@ AesCbc_Decode: # @AesCbc_Decode st.d $s6, $sp, 24 # 8-byte Folded Spill st.d $s7, $sp, 16 # 8-byte Folded Spill st.d $s8, $sp, 8 # 8-byte Folded Spill + pcalau12i $a3, %pc_hi20(.LCPI2_0) + vld $vr0, $a3, %pc_lo12(.LCPI2_0) + pcalau12i $a3, %pc_hi20(.LCPI2_1) + vld $vr1, $a3, %pc_lo12(.LCPI2_1) + pcalau12i $a3, %pc_hi20(.LCPI2_2) + vld $vr2, $a3, %pc_lo12(.LCPI2_2) addi.d $a3, $a0, 16 - pcalau12i $a4, %pc_hi20(.LCPI2_0) - vld $vr0, $a4, %pc_lo12(.LCPI2_0) - pcalau12i $a4, %pc_hi20(.LCPI2_1) - vld $vr1, $a4, %pc_lo12(.LCPI2_1) - pcalau12i $a4, %pc_hi20(.LCPI2_2) - vld $vr2, $a4, %pc_lo12(.LCPI2_2) - vrepli.b $vr3, 0 lu12i.w $a4, -4096 lu32i.d $a4, 0 pcalau12i $a5, %pc_hi20(D) @@ -445,30 +457,36 @@ AesCbc_Decode: # @AesCbc_Decode xor $t2, $t4, $t2 xor $t2, $t2, $t3 st.w $t2, $a1, 12 - vst $vr4, $a0, 0 + vst $vr3, $a0, 0 addi.d $a2, $a2, -1 addi.d $a1, $a1, 16 beqz $a2, .LBB2_6 .LBB2_3: # =>This Loop Header: Depth=1 # Child Loop BB2_4 Depth 2 - vld $vr4, $a1, 0 - vshuf.b $vr5, $vr3, $vr4, $vr0 - vshuf.b $vr6, $vr3, $vr4, $vr1 - vslli.w $vr6, $vr6, 8 - vor.v $vr5, $vr6, $vr5 + vld $vr3, $a1, 0 + vshuf.b $vr4, $vr0, $vr3, $vr0 + vsllwil.hu.bu $vr4, $vr4, 0 + vsllwil.wu.hu $vr4, $vr4, 0 + vshuf.b $vr5, $vr0, $vr3, $vr1 + vsllwil.hu.bu $vr5, $vr5, 0 + vsllwil.wu.hu $vr5, $vr5, 0 + vslli.w $vr5, $vr5, 8 + vor.v $vr4, $vr5, $vr4 + vshuf.b $vr5, $vr0, $vr3, $vr2 + vsllwil.hu.bu $vr5, $vr5, 0 ld.w $t2, $a3, 0 - vshuf.b $vr6, $vr3, $vr4, $vr2 - vslli.w $vr6, $vr6, 16 - vor.v $vr5, $vr5, $vr6 + vsllwil.wu.hu $vr5, $vr5, 0 + vslli.w $vr5, $vr5, 16 + vor.v $vr4, $vr4, $vr5 slli.d $t3, $t2, 3 addi.d $t3, $t3, 4 bstrpick.d $t3, $t3, 31, 2 slli.d $t3, $t3, 4 - vldx $vr6, $a3, $t3 - vshuf4i.b $vr4, $vr4, 3 - vslli.w $vr4, $vr4, 24 - vor.v $vr4, $vr5, $vr4 - vxor.v $vr5, $vr6, $vr4 + vldx $vr5, $a3, $t3 + vshuf4i.b $vr3, $vr3, 3 + vslli.w $vr3, $vr3, 24 + vor.v $vr3, $vr4, $vr3 + vxor.v $vr4, $vr5, $vr3 addi.w $t3, $t2, -1 slli.d $t2, $t2, 5 bstrpick.d $t2, $t2, 33, 5 @@ -477,22 +495,22 @@ AesCbc_Decode: # @AesCbc_Decode .p2align 4, , 16 .LBB2_4: # Parent Loop BB2_3 Depth=1 # => This Inner Loop Header: Depth=2 - vpickve2gr.w $t4, $vr5, 0 + vpickve2gr.w $t4, $vr4, 0 and $t7, $t4, $a4 andi $t4, $t4, 255 slli.d $t4, $t4, 2 ldx.w $t8, $a5, $t4 - vpickve2gr.w $fp, $vr5, 1 + vpickve2gr.w $fp, $vr4, 1 and $t4, $fp, $a4 srli.d $t4, $t4, 22 ldx.w $t4, $a6, $t4 ld.w $t5, $t2, 0 - vpickve2gr.w $s0, $vr5, 2 + vpickve2gr.w 
$s0, $vr4, 2 and $t6, $s0, $a4 andi $s0, $s0, 255 slli.d $s0, $s0, 2 ldx.w $s0, $a5, $s0 - vpickve2gr.w $s1, $vr5, 3 + vpickve2gr.w $s1, $vr4, 3 and $s2, $s1, $a4 srli.d $s2, $s2, 22 ldx.w $s2, $a6, $s2 @@ -502,8 +520,8 @@ AesCbc_Decode: # @AesCbc_Decode ldx.w $s1, $a5, $s1 srli.d $t7, $t7, 22 ldx.w $t7, $a6, $t7 - vsrli.w $vr6, $vr5, 8 - vpickve2gr.b $s4, $vr6, 12 + vsrli.w $vr5, $vr4, 8 + vpickve2gr.b $s4, $vr5, 12 andi $s4, $s4, 255 slli.d $s4, $s4, 2 ldx.w $s4, $a7, $s4 @@ -511,15 +529,15 @@ AesCbc_Decode: # @AesCbc_Decode slli.d $fp, $fp, 2 ldx.w $fp, $a5, $fp xor $t8, $s4, $t8 - vpickve2gr.b $s4, $vr6, 4 + vpickve2gr.b $s4, $vr5, 4 andi $s4, $s4, 255 slli.d $s4, $s4, 2 ldx.w $s4, $a7, $s4 - vpickve2gr.b $s5, $vr6, 8 + vpickve2gr.b $s5, $vr5, 8 andi $s5, $s5, 255 slli.d $s5, $s5, 2 ldx.w $s5, $a7, $s5 - vpickve2gr.b $s6, $vr6, 0 + vpickve2gr.b $s6, $vr5, 0 andi $s6, $s6, 255 slli.d $s6, $s6, 2 ldx.w $s6, $a7, $s6 @@ -527,16 +545,16 @@ AesCbc_Decode: # @AesCbc_Decode xor $s0, $s4, $s0 xor $s1, $s5, $s1 xor $fp, $s6, $fp - vsrli.w $vr5, $vr5, 16 - vpickve2gr.b $s4, $vr5, 8 + vsrli.w $vr4, $vr4, 16 + vpickve2gr.b $s4, $vr4, 8 andi $s4, $s4, 255 slli.d $s4, $s4, 2 ldx.w $s4, $t0, $s4 - vpickve2gr.b $s5, $vr5, 0 + vpickve2gr.b $s5, $vr4, 0 andi $s5, $s5, 255 slli.d $s5, $s5, 2 ldx.w $s5, $t0, $s5 - vpickve2gr.b $s6, $vr5, 4 + vpickve2gr.b $s6, $vr4, 4 andi $s6, $s6, 255 slli.d $s6, $s6, 2 ldx.w $s6, $t0, $s6 @@ -548,7 +566,7 @@ AesCbc_Decode: # @AesCbc_Decode xor $t5, $t5, $s3 xor $t8, $s1, $s6 xor $t7, $t8, $t7 - vpickve2gr.b $t8, $vr5, 12 + vpickve2gr.b $t8, $vr4, 12 andi $t8, $t8, 255 slli.d $t8, $t8, 2 ldx.w $s0, $t0, $t8 @@ -609,27 +627,27 @@ AesCbc_Decode: # @AesCbc_Decode ldx.w $t8, $t0, $t8 srli.d $s4, $s4, 22 ldx.w $s4, $a6, $s4 - vinsgr2vr.w $vr5, $s3, 0 - vinsgr2vr.w $vr5, $t4, 1 - vinsgr2vr.w $vr5, $s7, 2 - vinsgr2vr.w $vr5, $t5, 3 - vinsgr2vr.w $vr6, $s1, 0 - vinsgr2vr.w $vr6, $fp, 1 - vinsgr2vr.w $vr6, $s0, 2 - vinsgr2vr.w $vr6, $t7, 3 - vxor.v $vr5, $vr5, $vr6 - vinsgr2vr.w $vr6, $s5, 0 - vinsgr2vr.w $vr6, $t8, 1 - vinsgr2vr.w $vr6, $ra, 2 - vinsgr2vr.w $vr6, $t6, 3 - vxor.v $vr5, $vr5, $vr6 - vinsgr2vr.w $vr6, $s6, 0 - vinsgr2vr.w $vr6, $s4, 1 - vld $vr7, $t2, -16 - vinsgr2vr.w $vr6, $s2, 2 - vinsgr2vr.w $vr6, $s8, 3 - vxor.v $vr5, $vr5, $vr6 - vxor.v $vr5, $vr5, $vr7 + vinsgr2vr.w $vr4, $s3, 0 + vinsgr2vr.w $vr4, $t4, 1 + vinsgr2vr.w $vr4, $s7, 2 + vinsgr2vr.w $vr4, $t5, 3 + vinsgr2vr.w $vr5, $s1, 0 + vinsgr2vr.w $vr5, $fp, 1 + vinsgr2vr.w $vr5, $s0, 2 + vinsgr2vr.w $vr5, $t7, 3 + vxor.v $vr4, $vr4, $vr5 + vinsgr2vr.w $vr5, $s5, 0 + vinsgr2vr.w $vr5, $t8, 1 + vinsgr2vr.w $vr5, $ra, 2 + vinsgr2vr.w $vr5, $t6, 3 + vxor.v $vr4, $vr4, $vr5 + vinsgr2vr.w $vr5, $s6, 0 + vinsgr2vr.w $vr5, $s4, 1 + vld $vr6, $t2, -16 + vinsgr2vr.w $vr5, $s2, 2 + vinsgr2vr.w $vr5, $s8, 3 + vxor.v $vr4, $vr4, $vr5 + vxor.v $vr4, $vr4, $vr6 addi.w $t3, $t3, -1 addi.d $t2, $t2, -32 b .LBB2_4 diff --git a/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/C/Ppmd8.s b/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/C/Ppmd8.s index 6901093f..8449f3bc 100644 --- a/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/C/Ppmd8.s +++ b/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/C/Ppmd8.s @@ -2165,12 +2165,12 @@ RestoreModel: # @RestoreModel # %bb.11: ori $s0, $zero, 63 ori $s3, $zero, 8 - vrepli.b $vr10, 0 - vrepli.b $vr11, 63 - vrepli.w $vr12, 1 - vst $vr10, $sp, 48 # 16-byte Folded Spill - vst $vr11, $sp, 32 # 16-byte Folded Spill - vst 
$vr12, $sp, 16 # 16-byte Folded Spill + vrepli.b $vr0, 0 + vst $vr0, $sp, 48 # 16-byte Folded Spill + vrepli.b $vr10, 63 + vrepli.w $vr11, 1 + vst $vr10, $sp, 32 # 16-byte Folded Spill + vst $vr11, $sp, 16 # 16-byte Folded Spill b .LBB14_14 .p2align 4, , 16 .LBB14_12: # in Loop: Header=BB14_14 Depth=1 @@ -2232,20 +2232,19 @@ RestoreModel: # @RestoreModel slli.d $a2, $a3, 2 alsl.d $a2, $a3, $a2, 1 add.d $a2, $a0, $a2 - vld $vr10, $sp, 48 # 16-byte Folded Reload - vori.b $vr2, $vr10, 0 + vld $vr3, $sp, 48 # 16-byte Folded Reload + vori.b $vr2, $vr3, 0 vinsgr2vr.w $vr2, $a6, 0 - vori.b $vr1, $vr10, 0 + vori.b $vr1, $vr3, 0 vinsgr2vr.w $vr1, $a5, 0 - vori.b $vr0, $vr10, 0 + vori.b $vr0, $vr3, 0 vinsgr2vr.w $vr0, $a4, 0 addi.d $a0, $a0, 24 move $a4, $a3 - vori.b $vr5, $vr10, 0 - vori.b $vr4, $vr10, 0 - vori.b $vr3, $vr10, 0 - vld $vr11, $sp, 32 # 16-byte Folded Reload - vld $vr12, $sp, 16 # 16-byte Folded Reload + vori.b $vr5, $vr3, 0 + vori.b $vr4, $vr3, 0 + vld $vr10, $sp, 32 # 16-byte Folded Reload + vld $vr11, $sp, 16 # 16-byte Folded Reload .p2align 4, , 16 .LBB14_19: # %vector.body # Parent Loop BB14_14 Depth=1 @@ -2278,20 +2277,20 @@ RestoreModel: # @RestoreModel vinsgr2vr.b $vr7, $a6, 1 vinsgr2vr.b $vr7, $a7, 2 vinsgr2vr.b $vr7, $t0, 3 - vilvl.b $vr8, $vr10, $vr6 - vilvl.h $vr8, $vr10, $vr8 - vilvl.b $vr9, $vr10, $vr7 - vilvl.h $vr9, $vr10, $vr9 + vsllwil.hu.bu $vr8, $vr6, 0 + vsllwil.wu.hu $vr8, $vr8, 0 + vsllwil.hu.bu $vr9, $vr7, 0 + vsllwil.wu.hu $vr9, $vr9, 0 vsub.w $vr2, $vr2, $vr8 vsub.w $vr5, $vr5, $vr9 vsrli.b $vr8, $vr6, 1 vsub.b $vr6, $vr6, $vr8 - vilvl.b $vr8, $vr10, $vr6 - vilvl.h $vr8, $vr10, $vr8 + vsllwil.hu.bu $vr8, $vr6, 0 + vsllwil.wu.hu $vr8, $vr8, 0 vsrli.b $vr9, $vr7, 1 vsub.b $vr7, $vr7, $vr9 - vilvl.b $vr9, $vr10, $vr7 - vilvl.h $vr9, $vr10, $vr9 + vsllwil.hu.bu $vr9, $vr7, 0 + vsllwil.wu.hu $vr9, $vr9, 0 vstelm.b $vr6, $a0, -17, 0 vstelm.b $vr6, $a0, -11, 1 vstelm.b $vr6, $a0, -5, 2 @@ -2318,15 +2317,15 @@ RestoreModel: # @RestoreModel vinsgr2vr.b $vr7, $a6, 1 vinsgr2vr.b $vr7, $a7, 2 vinsgr2vr.b $vr7, $t0, 3 - vslt.bu $vr6, $vr11, $vr6 + vslt.bu $vr6, $vr10, $vr6 vilvl.b $vr6, $vr6, $vr6 vilvl.h $vr6, $vr6, $vr6 - vand.v $vr6, $vr6, $vr12 + vand.v $vr6, $vr6, $vr11 vslli.w $vr6, $vr6, 3 - vslt.bu $vr7, $vr11, $vr7 + vslt.bu $vr7, $vr10, $vr7 vilvl.b $vr7, $vr7, $vr7 vilvl.h $vr7, $vr7, $vr7 - vand.v $vr7, $vr7, $vr12 + vand.v $vr7, $vr7, $vr11 vslli.w $vr7, $vr7, 3 vor.v $vr0, $vr6, $vr0 vor.v $vr3, $vr7, $vr3 @@ -2845,50 +2844,59 @@ AllocUnitsRare: # @AllocUnitsRare .type GetUsedMemory,@function GetUsedMemory: # @GetUsedMemory # %bb.0: - vld $vr0, $a0, 512 - vld $vr1, $a0, 544 + vld $vr0, $a0, 544 + vld $vr1, $a0, 512 vld $vr2, $a0, 528 vld $vr3, $a0, 560 vld $vr4, $a0, 448 vld $vr5, $a0, 496 - vld $vr6, $a0, 464 - vld $vr7, $a0, 144 + vld $vr6, $a0, 144 + vld $vr7, $a0, 464 vld $vr8, $a0, 480 vld $vr9, $a0, 128 - vrepli.b $vr10, 0 - vilvl.b $vr11, $vr10, $vr7 - vilvl.h $vr12, $vr10, $vr11 - vilvh.b $vr7, $vr10, $vr7 - vilvl.h $vr13, $vr10, $vr7 - vilvh.h $vr11, $vr10, $vr11 - vilvh.h $vr7, $vr10, $vr7 - vilvl.b $vr14, $vr10, $vr9 - vilvl.h $vr15, $vr10, $vr14 - vilvh.b $vr9, $vr10, $vr9 - vilvh.h $vr16, $vr10, $vr9 - vilvh.h $vr14, $vr10, $vr14 - vilvl.h $vr9, $vr10, $vr9 + vbsrl.v $vr10, $vr6, 8 + vsllwil.hu.bu $vr10, $vr10, 0 + vsllwil.wu.hu $vr10, $vr10, 0 + vsllwil.hu.bu $vr11, $vr6, 0 + vsllwil.wu.hu $vr11, $vr11, 0 + vsrli.d $vr12, $vr6, 32 + vsllwil.hu.bu $vr12, $vr12, 0 + vsllwil.wu.hu $vr12, $vr12, 0 + vbsrl.v $vr6, $vr6, 12 + vsllwil.hu.bu $vr6, 
$vr6, 0 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.hu.bu $vr13, $vr9, 0 + vsllwil.wu.hu $vr13, $vr13, 0 + vbsrl.v $vr14, $vr9, 12 + vsllwil.hu.bu $vr14, $vr14, 0 + vsllwil.wu.hu $vr14, $vr14, 0 + vsrli.d $vr15, $vr9, 32 + vsllwil.hu.bu $vr15, $vr15, 0 + vsllwil.wu.hu $vr15, $vr15, 0 + vbsrl.v $vr9, $vr9, 8 + vsllwil.hu.bu $vr9, $vr9, 0 + vsllwil.wu.hu $vr9, $vr9, 0 vmul.w $vr8, $vr8, $vr9 - vmul.w $vr6, $vr6, $vr14 + vmul.w $vr7, $vr7, $vr15 ld.w $a1, $a0, 160 - vmul.w $vr5, $vr5, $vr16 - vmul.w $vr4, $vr4, $vr15 + vmul.w $vr5, $vr5, $vr14 + vmul.w $vr4, $vr4, $vr13 vld $vr9, $a0, 576 - vinsgr2vr.w $vr14, $a1, 0 + vinsgr2vr.w $vr13, $a1, 0 ld.w $a1, $a0, 592 ld.bu $a2, $a0, 164 ld.w $a3, $a0, 596 ld.bu $a4, $a0, 165 - vilvl.b $vr14, $vr10, $vr14 - vilvl.h $vr10, $vr10, $vr14 + vsllwil.hu.bu $vr13, $vr13, 0 + vsllwil.wu.hu $vr13, $vr13, 0 mul.d $a1, $a1, $a2 mul.d $a2, $a3, $a4 - vmadd.w $vr4, $vr9, $vr10 - vmadd.w $vr5, $vr3, $vr7 - vmadd.w $vr6, $vr2, $vr11 - vadd.w $vr2, $vr6, $vr5 - vmadd.w $vr8, $vr1, $vr13 - vmadd.w $vr4, $vr0, $vr12 + vmadd.w $vr4, $vr9, $vr13 + vmadd.w $vr5, $vr3, $vr6 + vmadd.w $vr7, $vr2, $vr12 + vadd.w $vr2, $vr7, $vr5 + vmadd.w $vr4, $vr1, $vr11 + vmadd.w $vr8, $vr0, $vr10 vadd.w $vr0, $vr4, $vr8 vadd.w $vr0, $vr0, $vr2 vhaddw.d.w $vr0, $vr0, $vr0 @@ -3232,13 +3240,13 @@ CutOff: # @CutOff sub.d $s3, $s3, $a5 slli.d $t1, $a2, 5 alsl.d $a2, $a2, $t1, 4 - vrepli.b $vr2, 0 + vrepli.b $vr1, 0 add.d $a2, $a0, $a2 - vori.b $vr3, $vr2, 0 + vori.b $vr3, $vr1, 0 vinsgr2vr.w $vr3, $a6, 0 - vori.b $vr1, $vr2, 0 - vinsgr2vr.w $vr1, $t0, 0 - vori.b $vr0, $vr2, 0 + vori.b $vr2, $vr1, 0 + vinsgr2vr.w $vr2, $t0, 0 + vori.b $vr0, $vr1, 0 vinsgr2vr.w $vr0, $a7, 0 vreplgr2vr.w $vr4, $s1 addi.d $a0, $a0, 24 @@ -3246,9 +3254,8 @@ CutOff: # @CutOff vrepli.b $vr7, 63 vrepli.w $vr8, 1 move $a6, $a5 - vori.b $vr10, $vr2, 0 - vori.b $vr9, $vr2, 0 - vori.b $vr6, $vr2, 0 + vori.b $vr9, $vr1, 0 + vori.b $vr6, $vr1, 0 .p2align 4, , 16 .LBB17_35: # %vector.body # =>This Inner Loop Header: Depth=1 @@ -3256,77 +3263,77 @@ CutOff: # @CutOff ld.b $t0, $a0, -11 ld.b $t1, $a0, -5 ld.b $t2, $a0, 1 - vinsgr2vr.b $vr11, $a7, 0 - vinsgr2vr.b $vr11, $t0, 1 - vinsgr2vr.b $vr11, $t1, 2 - vinsgr2vr.b $vr11, $t2, 3 + vinsgr2vr.b $vr10, $a7, 0 + vinsgr2vr.b $vr10, $t0, 1 + vinsgr2vr.b $vr10, $t1, 2 + vinsgr2vr.b $vr10, $t2, 3 ld.b $a7, $a0, 7 ld.b $t0, $a0, 13 ld.b $t1, $a0, 19 ld.b $t2, $a0, 25 - vinsgr2vr.b $vr12, $a7, 0 - vinsgr2vr.b $vr12, $t0, 1 - vinsgr2vr.b $vr12, $t1, 2 - vinsgr2vr.b $vr12, $t2, 3 - vilvl.b $vr11, $vr2, $vr11 - vilvl.h $vr11, $vr2, $vr11 - vilvl.b $vr12, $vr2, $vr12 - vilvl.h $vr12, $vr2, $vr12 - vsub.w $vr3, $vr3, $vr11 - vsub.w $vr10, $vr10, $vr12 + vinsgr2vr.b $vr11, $a7, 0 + vinsgr2vr.b $vr11, $t0, 1 + vinsgr2vr.b $vr11, $t1, 2 + vinsgr2vr.b $vr11, $t2, 3 + vsllwil.hu.bu $vr10, $vr10, 0 + vsllwil.wu.hu $vr10, $vr10, 0 + vsllwil.hu.bu $vr11, $vr11, 0 + vsllwil.wu.hu $vr11, $vr11, 0 + vsub.w $vr3, $vr3, $vr10 + vsub.w $vr1, $vr1, $vr11 + vadd.w $vr10, $vr4, $vr10 vadd.w $vr11, $vr4, $vr11 - vadd.w $vr12, $vr4, $vr12 + vsrl.w $vr10, $vr10, $vr4 vsrl.w $vr11, $vr11, $vr4 - vsrl.w $vr12, $vr12, $vr4 - vstelm.b $vr11, $a0, -17, 0 - vstelm.b $vr11, $a0, -11, 4 - vstelm.b $vr11, $a0, -5, 8 - vstelm.b $vr11, $a0, 1, 12 - vstelm.b $vr12, $a0, 7, 0 - vstelm.b $vr12, $a0, 13, 4 - vstelm.b $vr12, $a0, 19, 8 - vstelm.b $vr12, $a0, 25, 12 + vstelm.b $vr10, $a0, -17, 0 + vstelm.b $vr10, $a0, -11, 4 + vstelm.b $vr10, $a0, -5, 8 + vstelm.b $vr10, $a0, 1, 12 + vstelm.b $vr11, $a0, 7, 0 + vstelm.b 
$vr11, $a0, 13, 4 + vstelm.b $vr11, $a0, 19, 8 + vstelm.b $vr11, $a0, 25, 12 + vand.v $vr10, $vr10, $vr5 vand.v $vr11, $vr11, $vr5 - vand.v $vr12, $vr12, $vr5 - vadd.w $vr1, $vr11, $vr1 - vadd.w $vr9, $vr12, $vr9 + vadd.w $vr2, $vr10, $vr2 + vadd.w $vr9, $vr11, $vr9 ld.b $a7, $a0, -18 ld.b $t0, $a0, -12 ld.b $t1, $a0, -6 ld.b $t2, $a0, 0 - vinsgr2vr.b $vr11, $a7, 0 - vinsgr2vr.b $vr11, $t0, 1 - vinsgr2vr.b $vr11, $t1, 2 - vinsgr2vr.b $vr11, $t2, 3 + vinsgr2vr.b $vr10, $a7, 0 + vinsgr2vr.b $vr10, $t0, 1 + vinsgr2vr.b $vr10, $t1, 2 + vinsgr2vr.b $vr10, $t2, 3 ld.b $a7, $a0, 6 ld.b $t0, $a0, 12 ld.b $t1, $a0, 18 ld.b $t2, $a0, 24 - vinsgr2vr.b $vr12, $a7, 0 - vinsgr2vr.b $vr12, $t0, 1 - vinsgr2vr.b $vr12, $t1, 2 - vinsgr2vr.b $vr12, $t2, 3 + vinsgr2vr.b $vr11, $a7, 0 + vinsgr2vr.b $vr11, $t0, 1 + vinsgr2vr.b $vr11, $t1, 2 + vinsgr2vr.b $vr11, $t2, 3 + vslt.bu $vr10, $vr7, $vr10 + vilvl.b $vr10, $vr10, $vr10 + vilvl.h $vr10, $vr10, $vr10 + vand.v $vr10, $vr10, $vr8 + vslli.w $vr10, $vr10, 3 vslt.bu $vr11, $vr7, $vr11 vilvl.b $vr11, $vr11, $vr11 vilvl.h $vr11, $vr11, $vr11 vand.v $vr11, $vr11, $vr8 vslli.w $vr11, $vr11, 3 - vslt.bu $vr12, $vr7, $vr12 - vilvl.b $vr12, $vr12, $vr12 - vilvl.h $vr12, $vr12, $vr12 - vand.v $vr12, $vr12, $vr8 - vslli.w $vr12, $vr12, 3 - vor.v $vr0, $vr11, $vr0 - vor.v $vr6, $vr12, $vr6 + vor.v $vr0, $vr10, $vr0 + vor.v $vr6, $vr11, $vr6 addi.d $a6, $a6, -8 addi.d $a0, $a0, 48 bnez $a6, .LBB17_35 # %bb.36: # %middle.block - vadd.w $vr2, $vr10, $vr3 - vhaddw.d.w $vr2, $vr2, $vr2 - vhaddw.q.d $vr2, $vr2, $vr2 - vpickve2gr.d $a6, $vr2, 0 - vadd.w $vr1, $vr9, $vr1 + vadd.w $vr1, $vr1, $vr3 + vhaddw.d.w $vr1, $vr1, $vr1 + vhaddw.q.d $vr1, $vr1, $vr1 + vpickve2gr.d $a6, $vr1, 0 + vadd.w $vr1, $vr9, $vr2 vhaddw.d.w $vr1, $vr1, $vr1 vhaddw.q.d $vr1, $vr1, $vr1 vpickve2gr.d $t0, $vr1, 0 diff --git a/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/C/Sha256.s b/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/C/Sha256.s index cec15728..4ddbcda4 100644 --- a/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/C/Sha256.s +++ b/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/C/Sha256.s @@ -87,55 +87,72 @@ Sha256_Update: # @Sha256_Update .p2align 4, 0x0 # -- Begin function Sha256_WriteByteBlock .LCPI2_0: .byte 1 # 0x1 - .byte 17 # 0x11 - .byte 18 # 0x12 - .byte 19 # 0x13 .byte 4 # 0x4 - .byte 21 # 0x15 - .byte 22 # 0x16 - .byte 23 # 0x17 .byte 9 # 0x9 - .byte 25 # 0x19 - .byte 26 # 0x1a - .byte 27 # 0x1b .byte 12 # 0xc - .byte 29 # 0x1d - .byte 30 # 0x1e - .byte 31 # 0x1f + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff .LCPI2_1: .byte 0 # 0x0 - .byte 17 # 0x11 - .byte 18 # 0x12 - .byte 19 # 0x13 .byte 5 # 0x5 - .byte 21 # 0x15 - .byte 22 # 0x16 - .byte 23 # 0x17 .byte 8 # 0x8 - .byte 25 # 0x19 - .byte 26 # 0x1a - .byte 27 # 0x1b .byte 13 # 0xd - .byte 29 # 0x1d - .byte 30 # 0x1e - .byte 31 # 0x1f + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff .LCPI2_2: .byte 2 # 0x2 - .byte 17 # 0x11 - .byte 18 # 0x12 - .byte 19 # 0x13 .byte 6 # 0x6 - .byte 21 # 0x15 - .byte 22 # 0x16 - .byte 23 # 0x17 .byte 10 # 0xa - .byte 25 # 0x19 - .byte 26 # 0x1a - .byte 27 # 0x1b .byte 14 
# 0xe - .byte 29 # 0x1d - .byte 30 # 0x1e - .byte 31 # 0x1f + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff +.LCPI2_3: + .byte 3 # 0x3 + .byte 7 # 0x7 + .byte 11 # 0xb + .byte 15 # 0xf + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff .text .p2align 5 .type Sha256_WriteByteBlock,@function @@ -144,65 +161,98 @@ Sha256_WriteByteBlock: # @Sha256_WriteByteBlock addi.d $sp, $sp, -192 st.d $fp, $sp, 184 # 8-byte Folded Spill st.d $s0, $sp, 176 # 8-byte Folded Spill - vld $vr2, $a0, 40 + vld $vr6, $a0, 40 pcalau12i $a1, %pc_hi20(.LCPI2_0) - vld $vr1, $a1, %pc_lo12(.LCPI2_0) - vrepli.b $vr0, 0 - vshuf.b $vr3, $vr0, $vr2, $vr1 + vld $vr0, $a1, %pc_lo12(.LCPI2_0) + vshuf.b $vr1, $vr0, $vr6, $vr0 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr3, $vr1, 0 pcalau12i $a1, %pc_hi20(.LCPI2_1) - vld $vr4, $a1, %pc_lo12(.LCPI2_1) + vld $vr1, $a1, %pc_lo12(.LCPI2_1) ori $a1, $zero, 16 lu32i.d $a1, 24 - vreplgr2vr.d $vr5, $a1 - vsll.w $vr3, $vr3, $vr5 - vshuf.b $vr6, $vr0, $vr2, $vr4 + vreplgr2vr.d $vr2, $a1 + vsll.w $vr5, $vr3, $vr2 + vshuf.b $vr3, $vr0, $vr6, $vr1 + vsllwil.hu.bu $vr3, $vr3, 0 + vsllwil.wu.hu $vr7, $vr3, 0 pcalau12i $a1, %pc_hi20(.LCPI2_2) - vld $vr7, $a1, %pc_lo12(.LCPI2_2) + vld $vr3, $a1, %pc_lo12(.LCPI2_2) ori $a1, $zero, 24 lu32i.d $a1, 16 - vreplgr2vr.d $vr8, $a1 - vsll.w $vr6, $vr6, $vr8 - vor.v $vr3, $vr6, $vr3 - vshuf.b $vr6, $vr0, $vr2, $vr7 - vslli.w $vr6, $vr6, 8 - vor.v $vr3, $vr3, $vr6 - vld $vr6, $a0, 56 - vsrli.w $vr2, $vr2, 24 - vor.v $vr2, $vr3, $vr2 - vst $vr2, $sp, 16 - vshuf.b $vr2, $vr0, $vr6, $vr1 - vsll.w $vr2, $vr2, $vr5 - vshuf.b $vr3, $vr0, $vr6, $vr4 - vsll.w $vr3, $vr3, $vr8 - vor.v $vr2, $vr3, $vr2 - vshuf.b $vr3, $vr0, $vr6, $vr7 - vslli.w $vr3, $vr3, 8 - vor.v $vr2, $vr2, $vr3 - vld $vr3, $a0, 72 - vsrli.w $vr6, $vr6, 24 - vor.v $vr2, $vr2, $vr6 - vst $vr2, $sp, 32 - vshuf.b $vr2, $vr0, $vr3, $vr1 - vsll.w $vr2, $vr2, $vr5 - vshuf.b $vr6, $vr0, $vr3, $vr4 - vsll.w $vr6, $vr6, $vr8 - vor.v $vr2, $vr6, $vr2 - vshuf.b $vr6, $vr0, $vr3, $vr7 - vslli.w $vr6, $vr6, 8 - vor.v $vr2, $vr2, $vr6 - vld $vr6, $a0, 88 - vsrli.w $vr3, $vr3, 24 - vor.v $vr2, $vr2, $vr3 - vst $vr2, $sp, 48 - vshuf.b $vr1, $vr0, $vr6, $vr1 - vsll.w $vr1, $vr1, $vr5 - vshuf.b $vr2, $vr0, $vr6, $vr4 - vsll.w $vr2, $vr2, $vr8 - vor.v $vr1, $vr2, $vr1 - vshuf.b $vr0, $vr0, $vr6, $vr7 - vslli.w $vr0, $vr0, 8 + vreplgr2vr.d $vr4, $a1 + vsll.w $vr7, $vr7, $vr4 + vor.v $vr7, $vr7, $vr5 + vshuf.b $vr5, $vr0, $vr6, $vr3 + vsllwil.hu.bu $vr8, $vr5, 0 + pcalau12i $a1, %pc_hi20(.LCPI2_3) + vld $vr5, $a1, %pc_lo12(.LCPI2_3) + vsllwil.wu.hu $vr8, $vr8, 0 + vslli.w $vr8, $vr8, 8 + vor.v $vr7, $vr7, $vr8 + vshuf.b $vr6, $vr0, $vr6, $vr5 + vsllwil.hu.bu $vr6, $vr6, 0 + vld $vr8, $a0, 56 + vsllwil.wu.hu $vr6, $vr6, 0 + vor.v $vr6, $vr7, $vr6 + vst $vr6, $sp, 16 + vshuf.b $vr6, $vr0, $vr8, $vr0 + vsllwil.hu.bu $vr6, $vr6, 0 + vsllwil.wu.hu $vr6, $vr6, 0 + vsll.w $vr6, $vr6, $vr2 + vshuf.b $vr7, $vr0, $vr8, $vr1 + vsllwil.hu.bu $vr7, $vr7, 0 + vsllwil.wu.hu $vr7, $vr7, 0 + vsll.w $vr7, $vr7, $vr4 + vor.v $vr6, $vr7, $vr6 + vshuf.b $vr7, $vr0, $vr8, $vr3 + vsllwil.hu.bu $vr7, $vr7, 0 + vsllwil.wu.hu $vr7, $vr7, 0 + vslli.w $vr7, $vr7, 8 + vor.v $vr6, $vr6, $vr7 
+ vshuf.b $vr7, $vr0, $vr8, $vr5 + vsllwil.hu.bu $vr7, $vr7, 0 + vld $vr8, $a0, 72 + vsllwil.wu.hu $vr7, $vr7, 0 + vor.v $vr6, $vr6, $vr7 + vst $vr6, $sp, 32 + vshuf.b $vr6, $vr0, $vr8, $vr0 + vsllwil.hu.bu $vr6, $vr6, 0 + vsllwil.wu.hu $vr6, $vr6, 0 + vsll.w $vr6, $vr6, $vr2 + vshuf.b $vr7, $vr0, $vr8, $vr1 + vsllwil.hu.bu $vr7, $vr7, 0 + vsllwil.wu.hu $vr7, $vr7, 0 + vsll.w $vr7, $vr7, $vr4 + vor.v $vr6, $vr7, $vr6 + vshuf.b $vr7, $vr0, $vr8, $vr3 + vsllwil.hu.bu $vr7, $vr7, 0 + vsllwil.wu.hu $vr7, $vr7, 0 + vslli.w $vr7, $vr7, 8 + vor.v $vr6, $vr6, $vr7 + vshuf.b $vr7, $vr0, $vr8, $vr5 + vsllwil.hu.bu $vr7, $vr7, 0 + vld $vr8, $a0, 88 + vsllwil.wu.hu $vr7, $vr7, 0 + vor.v $vr6, $vr6, $vr7 + vst $vr6, $sp, 48 + vshuf.b $vr0, $vr0, $vr8, $vr0 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsll.w $vr0, $vr0, $vr2 + vshuf.b $vr1, $vr0, $vr8, $vr1 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vsll.w $vr1, $vr1, $vr4 vor.v $vr0, $vr1, $vr0 - vsrli.w $vr1, $vr6, 24 + vshuf.b $vr1, $vr0, $vr8, $vr3 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vslli.w $vr1, $vr1, 8 + vor.v $vr0, $vr0, $vr1 + vshuf.b $vr1, $vr0, $vr8, $vr5 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 vld $vr2, $a0, 16 vld $vr3, $a0, 0 vor.v $vr0, $vr0, $vr1 diff --git a/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/CPP/7zip/Archive/7z/7zEncode.s b/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/CPP/7zip/Archive/7z/7zEncode.s index ee8c96cd..b4277baf 100644 --- a/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/CPP/7zip/Archive/7z/7zEncode.s +++ b/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/CPP/7zip/Archive/7z/7zEncode.s @@ -2723,18 +2723,12 @@ _ZN8NArchive3N7z8CEncoder13EncoderConstrEv: # @_ZN8NArchive3N7z8CEncoder13Encode # Parent Loop BB3_25 Depth=1 # => This Inner Loop Header: Depth=2 vaddi.wu $vr3, $vr1, 4 - vshuf4i.w $vr4, $vr1, 50 - vslli.d $vr4, $vr4, 32 - vsrai.d $vr4, $vr4, 32 - vshuf4i.w $vr5, $vr1, 16 - vslli.d $vr5, $vr5, 32 - vsrai.d $vr5, $vr5, 32 - vshuf4i.w $vr6, $vr3, 50 - vslli.d $vr6, $vr6, 32 - vsrai.d $vr6, $vr6, 32 - vshuf4i.w $vr3, $vr3, 16 - vslli.d $vr3, $vr3, 32 - vsrai.d $vr3, $vr3, 32 + vshuf4i.w $vr4, $vr1, 14 + vsllwil.d.w $vr4, $vr4, 0 + vsllwil.d.w $vr5, $vr1, 0 + vshuf4i.w $vr6, $vr3, 14 + vsllwil.d.w $vr6, $vr6, 0 + vsllwil.d.w $vr3, $vr3, 0 vpickve2gr.d $a5, $vr5, 0 slli.d $a5, $a5, 3 vpickve2gr.d $a6, $vr5, 1 @@ -2817,18 +2811,12 @@ _ZN8NArchive3N7z8CEncoder13EncoderConstrEv: # @_ZN8NArchive3N7z8CEncoder13Encode # Parent Loop BB3_25 Depth=1 # => This Inner Loop Header: Depth=2 vaddi.wu $vr3, $vr1, 4 - vshuf4i.w $vr4, $vr1, 50 - vslli.d $vr4, $vr4, 32 - vsrai.d $vr4, $vr4, 32 - vshuf4i.w $vr5, $vr1, 16 - vslli.d $vr5, $vr5, 32 - vsrai.d $vr5, $vr5, 32 - vshuf4i.w $vr6, $vr3, 50 - vslli.d $vr6, $vr6, 32 - vsrai.d $vr6, $vr6, 32 - vshuf4i.w $vr3, $vr3, 16 - vslli.d $vr3, $vr3, 32 - vsrai.d $vr3, $vr3, 32 + vshuf4i.w $vr4, $vr1, 14 + vsllwil.d.w $vr4, $vr4, 0 + vsllwil.d.w $vr5, $vr1, 0 + vshuf4i.w $vr6, $vr3, 14 + vsllwil.d.w $vr6, $vr6, 0 + vsllwil.d.w $vr3, $vr3, 0 vpickve2gr.d $a6, $vr5, 0 alsl.d $a6, $a6, $a3, 3 vpickve2gr.d $a7, $vr5, 1 @@ -3030,18 +3018,12 @@ _ZN8NArchive3N7z8CEncoder13EncoderConstrEv: # @_ZN8NArchive3N7z8CEncoder13Encode # Parent Loop BB3_59 Depth=1 # => This Inner Loop Header: Depth=2 vaddi.wu $vr5, $vr3, 4 - vshuf4i.w $vr6, $vr3, 50 - vslli.d $vr6, $vr6, 32 - vsrai.d $vr6, $vr6, 32 - vshuf4i.w $vr7, $vr3, 16 - vslli.d $vr7, $vr7, 32 - 
vsrai.d $vr7, $vr7, 32 - vshuf4i.w $vr8, $vr5, 50 - vslli.d $vr8, $vr8, 32 - vsrai.d $vr8, $vr8, 32 - vshuf4i.w $vr5, $vr5, 16 - vslli.d $vr5, $vr5, 32 - vsrai.d $vr5, $vr5, 32 + vshuf4i.w $vr6, $vr3, 14 + vsllwil.d.w $vr6, $vr6, 0 + vsllwil.d.w $vr7, $vr3, 0 + vshuf4i.w $vr8, $vr5, 14 + vsllwil.d.w $vr8, $vr8, 0 + vsllwil.d.w $vr5, $vr5, 0 vpickve2gr.d $t2, $vr7, 0 alsl.d $t2, $t2, $a2, 3 vpickve2gr.d $t3, $vr7, 1 diff --git a/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/CPP/7zip/Archive/7z/7zIn.s b/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/CPP/7zip/Archive/7z/7zIn.s index 0bd94f5d..49482918 100644 --- a/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/CPP/7zip/Archive/7z/7zIn.s +++ b/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/CPP/7zip/Archive/7z/7zIn.s @@ -1429,7 +1429,6 @@ _ZN8NArchive3N7z8CInByte210ReadStringER11CStringBaseIwE: # @_ZN8NArchive3N7z8CIn alsl.d $s1, $a3, $s1, 3 add.d $a3, $s4, $s5 addi.d $a3, $a3, 3 - vrepli.b $vr0, 0 move $a4, $a2 move $a5, $a1 .p2align 4, , 16 @@ -1439,25 +1438,25 @@ _ZN8NArchive3N7z8CInByte210ReadStringER11CStringBaseIwE: # @_ZN8NArchive3N7z8CIn ld.b $a7, $a3, -1 ld.b $t0, $a3, 1 ld.b $t1, $a3, 3 - vinsgr2vr.b $vr1, $a6, 0 - vinsgr2vr.b $vr1, $a7, 1 - vinsgr2vr.b $vr1, $t0, 2 - vinsgr2vr.b $vr1, $t1, 3 - vilvl.b $vr1, $vr0, $vr1 - vilvl.h $vr1, $vr0, $vr1 + vinsgr2vr.b $vr0, $a6, 0 + vinsgr2vr.b $vr0, $a7, 1 + vinsgr2vr.b $vr0, $t0, 2 + vinsgr2vr.b $vr0, $t1, 3 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 ld.b $a6, $a3, -2 ld.b $a7, $a3, 0 ld.b $t0, $a3, 2 ld.b $t1, $a3, 4 - vinsgr2vr.b $vr2, $a6, 0 - vinsgr2vr.b $vr2, $a7, 1 - vinsgr2vr.b $vr2, $t0, 2 - vinsgr2vr.b $vr2, $t1, 3 - vilvl.b $vr2, $vr0, $vr2 - vilvl.h $vr2, $vr0, $vr2 - vslli.w $vr2, $vr2, 8 - vor.v $vr1, $vr2, $vr1 - vst $vr1, $a5, 0 + vinsgr2vr.b $vr1, $a6, 0 + vinsgr2vr.b $vr1, $a7, 1 + vinsgr2vr.b $vr1, $t0, 2 + vinsgr2vr.b $vr1, $t1, 3 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vslli.w $vr1, $vr1, 8 + vor.v $vr0, $vr1, $vr0 + vst $vr0, $a5, 0 addi.d $a5, $a5, 16 addi.d $a4, $a4, -4 addi.d $a3, $a3, 8 @@ -1615,10 +1614,8 @@ _ZN8NArchive3N7z10CInArchive20FindAndReadSignatureEP9IInStreamPKy: # @_ZN8NArchi vseqi.b $vr0, $vr0, 0 vinsgr2vr.w $vr1, $a0, 0 vseqi.b $vr1, $vr1, 0 - vilvl.b $vr1, $vr1, $vr1 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 24 - vsrai.w $vr1, $vr1, 24 + vsllwil.h.b $vr1, $vr1, 0 + vsllwil.w.h $vr1, $vr1, 0 vpickve2gr.b $a0, $vr0, 0 vinsgr2vr.w $vr2, $a0, 0 vpickve2gr.b $a0, $vr0, 1 @@ -7192,12 +7189,11 @@ _ZN8NArchive3N7z10CInArchive10ReadHeaderERNS0_18CArchiveDatabaseExEP22ICryptoGet # in Loop: Header=BB34_62 Depth=1 move $a2, $a0 bstrins.d $a2, $zero, 2, 0 - vld $vr4, $sp, 128 # 16-byte Folded Reload - vori.b $vr0, $vr4, 0 + vld $vr1, $sp, 128 # 16-byte Folded Reload + vori.b $vr0, $vr1, 0 vinsgr2vr.w $vr0, $s6, 0 addi.d $a3, $a1, 4 move $a4, $a2 - vori.b $vr1, $vr4, 0 .p2align 4, , 16 .LBB34_137: # %vector.body # Parent Loop BB34_62 Depth=1 @@ -7206,10 +7202,10 @@ _ZN8NArchive3N7z10CInArchive10ReadHeaderERNS0_18CArchiveDatabaseExEP22ICryptoGet ld.w $a6, $a3, 0 vinsgr2vr.w $vr2, $a5, 0 vinsgr2vr.w $vr3, $a6, 0 - vilvl.b $vr2, $vr4, $vr2 - vilvl.h $vr2, $vr4, $vr2 - vilvl.b $vr3, $vr4, $vr3 - vilvl.h $vr3, $vr4, $vr3 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.hu.bu $vr3, $vr3, 0 + vsllwil.wu.hu $vr3, $vr3, 0 vadd.w $vr0, $vr0, $vr2 vadd.w $vr1, $vr1, $vr3 addi.w $a4, $a4, -8 @@ -7340,25 +7336,24 @@ 
_ZN8NArchive3N7z10CInArchive10ReadHeaderERNS0_18CArchiveDatabaseExEP22ICryptoGet addi.d $a2, $a1, 4 move $a3, $s6 vori.b $vr1, $vr0, 0 - vori.b $vr2, $vr0, 0 .p2align 4, , 16 .LBB34_157: # %vector.body516 # =>This Inner Loop Header: Depth=1 ld.w $a4, $a2, -4 ld.w $a5, $a2, 0 - vinsgr2vr.w $vr3, $a4, 0 - vinsgr2vr.w $vr4, $a5, 0 - vilvl.b $vr3, $vr0, $vr3 - vilvl.h $vr3, $vr0, $vr3 - vilvl.b $vr4, $vr0, $vr4 - vilvl.h $vr4, $vr0, $vr4 + vinsgr2vr.w $vr2, $a4, 0 + vinsgr2vr.w $vr3, $a5, 0 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.hu.bu $vr3, $vr3, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vadd.w $vr0, $vr0, $vr2 vadd.w $vr1, $vr1, $vr3 - vadd.w $vr2, $vr2, $vr4 addi.w $a3, $a3, -8 addi.d $a2, $a2, 8 bnez $a3, .LBB34_157 # %bb.158: # %middle.block523 - vadd.w $vr0, $vr2, $vr1 + vadd.w $vr0, $vr1, $vr0 vhaddw.d.w $vr0, $vr0, $vr0 vhaddw.q.d $vr0, $vr0, $vr0 addi.w $a3, $s6, 0 @@ -8300,123 +8295,123 @@ _ZN8NArchive3N7z18CArchiveDatabaseEx24FillFolderStartFileIndexEv: # @_ZN8NArchiv .p2align 4, 0x0 # -- Begin function _ZN8NArchive3N7z10CInArchive13ReadDatabase2ERNS0_18CArchiveDatabaseExEP22ICryptoGetTextPasswordRb .LCPI39_0: .byte 8 # 0x8 - .byte 17 # 0x11 - .byte 18 # 0x12 - .byte 19 # 0x13 - .byte 20 # 0x14 - .byte 21 # 0x15 - .byte 22 # 0x16 - .byte 23 # 0x17 .byte 0 # 0x0 - .byte 25 # 0x19 - .byte 26 # 0x1a - .byte 27 # 0x1b - .byte 28 # 0x1c - .byte 29 # 0x1d - .byte 30 # 0x1e - .byte 31 # 0x1f + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff .LCPI39_1: .byte 9 # 0x9 - .byte 17 # 0x11 - .byte 18 # 0x12 - .byte 19 # 0x13 - .byte 20 # 0x14 - .byte 21 # 0x15 - .byte 22 # 0x16 - .byte 23 # 0x17 .byte 1 # 0x1 - .byte 25 # 0x19 - .byte 26 # 0x1a - .byte 27 # 0x1b - .byte 28 # 0x1c - .byte 29 # 0x1d - .byte 30 # 0x1e - .byte 31 # 0x1f + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff .LCPI39_2: .byte 10 # 0xa - .byte 17 # 0x11 - .byte 18 # 0x12 - .byte 19 # 0x13 - .byte 20 # 0x14 - .byte 21 # 0x15 - .byte 22 # 0x16 - .byte 23 # 0x17 .byte 2 # 0x2 - .byte 25 # 0x19 - .byte 26 # 0x1a - .byte 27 # 0x1b - .byte 28 # 0x1c - .byte 29 # 0x1d - .byte 30 # 0x1e - .byte 31 # 0x1f + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff .LCPI39_3: .byte 11 # 0xb - .byte 17 # 0x11 - .byte 18 # 0x12 - .byte 19 # 0x13 - .byte 20 # 0x14 - .byte 21 # 0x15 - .byte 22 # 0x16 - .byte 23 # 0x17 .byte 3 # 0x3 - .byte 25 # 0x19 - .byte 26 # 0x1a - .byte 27 # 0x1b - .byte 28 # 0x1c - .byte 29 # 0x1d - .byte 30 # 0x1e - .byte 31 # 0x1f + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff .LCPI39_4: .byte 12 # 0xc - .byte 17 # 0x11 - .byte 18 # 0x12 - .byte 19 # 0x13 - .byte 20 # 0x14 - .byte 21 # 0x15 - .byte 22 # 0x16 - .byte 23 # 0x17 .byte 4 # 0x4 - .byte 25 # 
0x19 - .byte 26 # 0x1a - .byte 27 # 0x1b - .byte 28 # 0x1c - .byte 29 # 0x1d - .byte 30 # 0x1e - .byte 31 # 0x1f + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff .LCPI39_5: .byte 13 # 0xd - .byte 17 # 0x11 - .byte 18 # 0x12 - .byte 19 # 0x13 - .byte 20 # 0x14 - .byte 21 # 0x15 - .byte 22 # 0x16 - .byte 23 # 0x17 .byte 5 # 0x5 - .byte 25 # 0x19 - .byte 26 # 0x1a - .byte 27 # 0x1b - .byte 28 # 0x1c - .byte 29 # 0x1d - .byte 30 # 0x1e - .byte 31 # 0x1f + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff .LCPI39_6: .byte 14 # 0xe - .byte 17 # 0x11 - .byte 18 # 0x12 - .byte 19 # 0x13 - .byte 20 # 0x14 - .byte 21 # 0x15 - .byte 22 # 0x16 - .byte 23 # 0x17 .byte 6 # 0x6 - .byte 25 # 0x19 - .byte 26 # 0x1a - .byte 27 # 0x1b - .byte 28 # 0x1c - .byte 29 # 0x1d - .byte 30 # 0x1e - .byte 31 # 0x1f + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff .text .globl _ZN8NArchive3N7z10CInArchive13ReadDatabase2ERNS0_18CArchiveDatabaseExEP22ICryptoGetTextPasswordRb .p2align 5 @@ -8473,8 +8468,9 @@ _ZN8NArchive3N7z10CInArchive13ReadDatabase2ERNS0_18CArchiveDatabaseExEP22ICrypto addi.d $a0, $s1, 648 pcaddu18i $ra, %call36(_ZN17CBaseRecordVector5ClearEv) jirl $ra, $ra, 0 - vrepli.b $vr8, 0 - vst $vr8, $s1, 680 + vrepli.b $vr0, 0 + vst $vr0, $sp, 16 # 16-byte Folded Spill + vst $vr0, $s1, 680 ld.d $a0, $s2, 48 st.d $a0, $s1, 488 ld.bu $a0, $s2, 62 @@ -8483,35 +8479,55 @@ _ZN8NArchive3N7z10CInArchive13ReadDatabase2ERNS0_18CArchiveDatabaseExEP22ICrypto st.b $a1, $s1, 481 bnez $a0, .LBB39_52 # %bb.1: - ld.w $s3, $s2, 64 vld $vr0, $s2, 68 pcalau12i $a0, %pc_hi20(.LCPI39_0) vld $vr1, $a0, %pc_lo12(.LCPI39_0) - pcalau12i $a0, %pc_hi20(.LCPI39_1) - vld $vr2, $a0, %pc_lo12(.LCPI39_1) + ld.w $s3, $s2, 64 addi.d $a0, $s2, 68 ld.w $s4, $s2, 84 - vshuf.b $vr1, $vr8, $vr0, $vr1 - vshuf.b $vr2, $vr8, $vr0, $vr2 + vshuf.b $vr1, $vr0, $vr0, $vr1 + pcalau12i $a1, %pc_hi20(.LCPI39_1) + vld $vr2, $a1, %pc_lo12(.LCPI39_1) + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vsllwil.du.wu $vr1, $vr1, 0 + vshuf.b $vr2, $vr0, $vr0, $vr2 + vsllwil.hu.bu $vr2, $vr2, 0 pcalau12i $a1, %pc_hi20(.LCPI39_2) vld $vr3, $a1, %pc_lo12(.LCPI39_2) + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.du.wu $vr2, $vr2, 0 + vslli.d $vr2, $vr2, 8 + vshuf.b $vr3, $vr0, $vr0, $vr3 + vsllwil.hu.bu $vr3, $vr3, 0 pcalau12i $a1, %pc_hi20(.LCPI39_3) vld $vr4, $a1, %pc_lo12(.LCPI39_3) - vslli.d $vr2, $vr2, 8 - vshuf.b $vr3, $vr8, $vr0, $vr3 + vsllwil.wu.hu $vr3, $vr3, 0 + vsllwil.du.wu $vr3, $vr3, 0 vslli.d $vr3, $vr3, 16 - vshuf.b $vr4, $vr8, $vr0, $vr4 + vshuf.b $vr4, $vr0, $vr0, $vr4 + vsllwil.hu.bu $vr4, $vr4, 0 pcalau12i $a1, %pc_hi20(.LCPI39_4) vld $vr5, $a1, %pc_lo12(.LCPI39_4) + vsllwil.wu.hu $vr4, $vr4, 0 + vsllwil.du.wu $vr4, $vr4, 0 + vslli.d $vr4, $vr4, 24 + vshuf.b $vr5, $vr0, $vr0, $vr5 pcalau12i $a1, %pc_hi20(.LCPI39_5) vld $vr6, $a1, %pc_lo12(.LCPI39_5) + vsllwil.hu.bu $vr5, $vr5, 0 + vsllwil.wu.hu $vr5, $vr5, 0 + 
vsllwil.du.wu $vr5, $vr5, 0 + vshuf.b $vr6, $vr0, $vr0, $vr6 pcalau12i $a1, %pc_hi20(.LCPI39_6) vld $vr7, $a1, %pc_lo12(.LCPI39_6) - vslli.d $vr4, $vr4, 24 - vshuf.b $vr5, $vr8, $vr0, $vr5 - vshuf.b $vr6, $vr8, $vr0, $vr6 - vst $vr8, $sp, 16 # 16-byte Folded Spill - vshuf.b $vr7, $vr8, $vr0, $vr7 + vsllwil.hu.bu $vr6, $vr6, 0 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.du.wu $vr6, $vr6, 0 + vshuf.b $vr7, $vr0, $vr0, $vr7 + vsllwil.hu.bu $vr7, $vr7, 0 + vsllwil.wu.hu $vr7, $vr7, 0 + vsllwil.du.wu $vr7, $vr7, 0 vbsrl.v $vr8, $vr0, 15 vbsll.v $vr0, $vr0, 1 vor.v $vr0, $vr0, $vr8 diff --git a/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/CPP/7zip/Archive/7z/7zOut.s b/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/CPP/7zip/Archive/7z/7zOut.s index 9a870b43..89f21ce6 100644 --- a/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/CPP/7zip/Archive/7z/7zOut.s +++ b/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/CPP/7zip/Archive/7z/7zOut.s @@ -1705,25 +1705,24 @@ _ZN8NArchive3N7z11COutArchive16WriteHashDigestsERK13CRecordVectorIbERKS2_IjE: # addi.d $a3, $a0, 4 move $a4, $a2 vori.b $vr1, $vr0, 0 - vori.b $vr2, $vr0, 0 .p2align 4, , 16 .LBB14_4: # %vector.body # =>This Inner Loop Header: Depth=1 ld.w $a5, $a3, -4 ld.w $a6, $a3, 0 - vinsgr2vr.w $vr3, $a5, 0 - vinsgr2vr.w $vr4, $a6, 0 - vilvl.b $vr3, $vr0, $vr3 - vilvl.h $vr3, $vr0, $vr3 - vilvl.b $vr4, $vr0, $vr4 - vilvl.h $vr4, $vr0, $vr4 + vinsgr2vr.w $vr2, $a5, 0 + vinsgr2vr.w $vr3, $a6, 0 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.hu.bu $vr3, $vr3, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vadd.w $vr0, $vr0, $vr2 vadd.w $vr1, $vr1, $vr3 - vadd.w $vr2, $vr2, $vr4 addi.d $a4, $a4, -8 addi.d $a3, $a3, 8 bnez $a4, .LBB14_4 # %bb.5: # %middle.block - vadd.w $vr0, $vr2, $vr1 + vadd.w $vr0, $vr1, $vr0 vhaddw.d.w $vr0, $vr0, $vr0 vhaddw.q.d $vr0, $vr0, $vr0 vpickve2gr.d $a3, $vr0, 0 @@ -3481,25 +3480,24 @@ _ZN8NArchive3N7z11COutArchive20WriteUInt64DefVectorERKNS0_16CUInt64DefVectorEh: addi.d $a4, $a0, 4 move $a5, $a2 vori.b $vr1, $vr0, 0 - vori.b $vr2, $vr0, 0 .p2align 4, , 16 .LBB20_4: # %vector.body # =>This Inner Loop Header: Depth=1 ld.w $a6, $a4, -4 ld.w $a7, $a4, 0 - vinsgr2vr.w $vr3, $a6, 0 - vinsgr2vr.w $vr4, $a7, 0 - vilvl.b $vr3, $vr0, $vr3 - vilvl.h $vr3, $vr0, $vr3 - vilvl.b $vr4, $vr0, $vr4 - vilvl.h $vr4, $vr0, $vr4 + vinsgr2vr.w $vr2, $a6, 0 + vinsgr2vr.w $vr3, $a7, 0 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.hu.bu $vr3, $vr3, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vadd.w $vr0, $vr0, $vr2 vadd.w $vr1, $vr1, $vr3 - vadd.w $vr2, $vr2, $vr4 addi.d $a5, $a5, -8 addi.d $a4, $a4, 8 bnez $a5, .LBB20_4 # %bb.5: # %middle.block - vadd.w $vr0, $vr2, $vr1 + vadd.w $vr0, $vr1, $vr0 vhaddw.d.w $vr0, $vr0, $vr0 vhaddw.q.d $vr0, $vr0, $vr0 vpickve2gr.d $a4, $vr0, 0 @@ -4750,7 +4748,6 @@ _ZN8NArchive3N7z11COutArchive11WriteHeaderERKNS0_16CArchiveDatabaseERKNS0_14CHea pcaddu18i $ra, %call36(_ZN17CBaseRecordVectorD2Ev) jirl $ra, $ra, 0 ld.w $a0, $s0, 172 - vld $vr8, $sp, 32 # 16-byte Folded Reload blez $a0, .LBB23_160 # %bb.115: # %.lr.ph318 ld.d $a1, $s0, 176 @@ -4795,12 +4792,8 @@ _ZN8NArchive3N7z11COutArchive11WriteHeaderERKNS0_16CArchiveDatabaseERKNS0_14CHea vslli.w $vr5, $vr5, 1 vaddi.wu $vr4, $vr4, 2 vaddi.wu $vr5, $vr5, 2 - vshuf4i.w $vr4, $vr4, 16 - vslli.d $vr4, $vr4, 32 - vsrai.d $vr4, $vr4, 32 - vshuf4i.w $vr5, $vr5, 16 - vslli.d $vr5, $vr5, 32 - vsrai.d $vr5, $vr5, 32 + vsllwil.d.w $vr4, $vr4, 0 + vsllwil.d.w $vr5, $vr5, 0 vadd.d $vr0, 
$vr0, $vr4 vadd.d $vr2, $vr2, $vr5 addi.d $a4, $a4, -4 @@ -4891,7 +4884,6 @@ _ZN8NArchive3N7z11COutArchive11WriteHeaderERKNS0_16CArchiveDatabaseERKNS0_14CHea ld.d $a0, $fp, 16 addi.d $a0, $a0, 1 st.d $a0, $fp, 16 - vld $vr8, $sp, 32 # 16-byte Folded Reload ld.w $a0, $s0, 172 bgtz $a0, .LBB23_139 b .LBB23_160 @@ -4921,14 +4913,12 @@ _ZN8NArchive3N7z11COutArchive11WriteHeaderERKNS0_16CArchiveDatabaseERKNS0_14CHea srli.d $a0, $a0, 8 xor $a0, $a1, $a0 st.w $a0, $fp, 24 - vld $vr8, $sp, 32 # 16-byte Folded Reload ld.w $a0, $s0, 172 bgtz $a0, .LBB23_139 b .LBB23_160 .LBB23_137: ld.d $a1, $fp, 96 ld.d $a0, $fp, 104 - vld $vr8, $sp, 32 # 16-byte Folded Reload beq $a1, $a0, .LBB23_242 # %bb.138: # %_ZN8NArchive3N7z15CWriteBufferLoc9WriteByteEh.exit.i210 ld.d $a1, $fp, 88 @@ -5006,7 +4996,6 @@ _ZN8NArchive3N7z11COutArchive11WriteHeaderERKNS0_16CArchiveDatabaseERKNS0_14CHea move $a0, $s2 pcaddu18i $ra, %call36(_ZN10COutBuffer14FlushWithCheckEv) jirl $ra, $ra, 0 - vld $vr8, $sp, 32 # 16-byte Folded Reload .LBB23_151: # %_ZN10COutBuffer9WriteByteEh.exit.i216 # in Loop: Header=BB23_146 Depth=2 ld.wu $a0, $fp, 24 @@ -5050,7 +5039,6 @@ _ZN8NArchive3N7z11COutArchive11WriteHeaderERKNS0_16CArchiveDatabaseERKNS0_14CHea move $a0, $s2 pcaddu18i $ra, %call36(_ZN10COutBuffer14FlushWithCheckEv) jirl $ra, $ra, 0 - vld $vr8, $sp, 32 # 16-byte Folded Reload .LBB23_157: # %_ZN10COutBuffer9WriteByteEh.exit.i221 # in Loop: Header=BB23_146 Depth=2 ld.wu $a0, $fp, 24 @@ -5096,25 +5084,24 @@ _ZN8NArchive3N7z11COutArchive11WriteHeaderERKNS0_16CArchiveDatabaseERKNS0_14CHea addi.d $a3, $a1, 4 move $a4, $a2 vori.b $vr1, $vr0, 0 - vori.b $vr2, $vr0, 0 .p2align 4, , 16 .LBB23_165: # %vector.body425 # =>This Inner Loop Header: Depth=1 ld.w $a5, $a3, -4 ld.w $a6, $a3, 0 - vinsgr2vr.w $vr3, $a5, 0 - vinsgr2vr.w $vr4, $a6, 0 - vilvl.b $vr3, $vr0, $vr3 - vilvl.h $vr3, $vr0, $vr3 - vilvl.b $vr4, $vr0, $vr4 - vilvl.h $vr4, $vr0, $vr4 + vinsgr2vr.w $vr2, $a5, 0 + vinsgr2vr.w $vr3, $a6, 0 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.hu.bu $vr3, $vr3, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vadd.w $vr0, $vr0, $vr2 vadd.w $vr1, $vr1, $vr3 - vadd.w $vr2, $vr2, $vr4 addi.d $a4, $a4, -8 addi.d $a3, $a3, 8 bnez $a4, .LBB23_165 # %bb.166: # %middle.block432 - vadd.w $vr0, $vr2, $vr1 + vadd.w $vr0, $vr1, $vr0 vhaddw.d.w $vr0, $vr0, $vr0 vhaddw.q.d $vr0, $vr0, $vr0 vpickve2gr.d $a3, $vr0, 0 @@ -5140,7 +5127,6 @@ _ZN8NArchive3N7z11COutArchive11WriteHeaderERKNS0_16CArchiveDatabaseERKNS0_14CHea move $a0, $fp pcaddu18i $ra, %call36(_ZN8NArchive3N7z11COutArchive22WriteAlignedBoolHeaderERK13CRecordVectorIbEihj) jirl $ra, $ra, 0 - vld $vr8, $sp, 32 # 16-byte Folded Reload ld.w $a0, $s0, 236 blez $a0, .LBB23_175 # %bb.171: # %.lr.ph24.i @@ -5162,7 +5148,6 @@ _ZN8NArchive3N7z11COutArchive11WriteHeaderERKNS0_16CArchiveDatabaseERKNS0_14CHea move $a0, $fp pcaddu18i $ra, %call36(_ZN8NArchive3N7z11COutArchive11WriteUInt64Ey) jirl $ra, $ra, 0 - vld $vr8, $sp, 32 # 16-byte Folded Reload ld.w $a0, $s0, 236 b .LBB23_172 .LBB23_175: # %_ZN8NArchive3N7z11COutArchive20WriteUInt64DefVectorERKNS0_16CUInt64DefVectorEh.exit @@ -5186,25 +5171,24 @@ _ZN8NArchive3N7z11COutArchive11WriteHeaderERKNS0_16CArchiveDatabaseERKNS0_14CHea addi.d $a3, $a1, 4 move $a4, $a2 vori.b $vr1, $vr0, 0 - vori.b $vr2, $vr0, 0 .p2align 4, , 16 .LBB23_180: # %vector.body442 # =>This Inner Loop Header: Depth=1 ld.w $a5, $a3, -4 ld.w $a6, $a3, 0 - vinsgr2vr.w $vr3, $a5, 0 - vinsgr2vr.w $vr4, $a6, 0 - vilvl.b $vr3, $vr0, $vr3 - vilvl.h $vr3, $vr0, $vr3 - vilvl.b $vr4, $vr0, $vr4 - 
vilvl.h $vr4, $vr0, $vr4 + vinsgr2vr.w $vr2, $a5, 0 + vinsgr2vr.w $vr3, $a6, 0 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.hu.bu $vr3, $vr3, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vadd.w $vr0, $vr0, $vr2 vadd.w $vr1, $vr1, $vr3 - vadd.w $vr2, $vr2, $vr4 addi.d $a4, $a4, -8 addi.d $a3, $a3, 8 bnez $a4, .LBB23_180 # %bb.181: # %middle.block449 - vadd.w $vr0, $vr2, $vr1 + vadd.w $vr0, $vr1, $vr0 vhaddw.d.w $vr0, $vr0, $vr0 vhaddw.q.d $vr0, $vr0, $vr0 vpickve2gr.d $a3, $vr0, 0 @@ -5230,7 +5214,6 @@ _ZN8NArchive3N7z11COutArchive11WriteHeaderERKNS0_16CArchiveDatabaseERKNS0_14CHea move $a0, $fp pcaddu18i $ra, %call36(_ZN8NArchive3N7z11COutArchive22WriteAlignedBoolHeaderERK13CRecordVectorIbEihj) jirl $ra, $ra, 0 - vld $vr8, $sp, 32 # 16-byte Folded Reload ld.w $a0, $s0, 300 blez $a0, .LBB23_190 # %bb.186: # %.lr.ph24.i234 @@ -5252,7 +5235,6 @@ _ZN8NArchive3N7z11COutArchive11WriteHeaderERKNS0_16CArchiveDatabaseERKNS0_14CHea move $a0, $fp pcaddu18i $ra, %call36(_ZN8NArchive3N7z11COutArchive11WriteUInt64Ey) jirl $ra, $ra, 0 - vld $vr8, $sp, 32 # 16-byte Folded Reload ld.w $a0, $s0, 300 b .LBB23_187 .LBB23_190: # %_ZN8NArchive3N7z11COutArchive20WriteUInt64DefVectorERKNS0_16CUInt64DefVectorEh.exit238 @@ -5276,25 +5258,24 @@ _ZN8NArchive3N7z11COutArchive11WriteHeaderERKNS0_16CArchiveDatabaseERKNS0_14CHea addi.d $a3, $a1, 4 move $a4, $a2 vori.b $vr1, $vr0, 0 - vori.b $vr2, $vr0, 0 .p2align 4, , 16 .LBB23_195: # %vector.body459 # =>This Inner Loop Header: Depth=1 ld.w $a5, $a3, -4 ld.w $a6, $a3, 0 - vinsgr2vr.w $vr3, $a5, 0 - vinsgr2vr.w $vr4, $a6, 0 - vilvl.b $vr3, $vr0, $vr3 - vilvl.h $vr3, $vr0, $vr3 - vilvl.b $vr4, $vr0, $vr4 - vilvl.h $vr4, $vr0, $vr4 + vinsgr2vr.w $vr2, $a5, 0 + vinsgr2vr.w $vr3, $a6, 0 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.hu.bu $vr3, $vr3, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vadd.w $vr0, $vr0, $vr2 vadd.w $vr1, $vr1, $vr3 - vadd.w $vr2, $vr2, $vr4 addi.d $a4, $a4, -8 addi.d $a3, $a3, 8 bnez $a4, .LBB23_195 # %bb.196: # %middle.block466 - vadd.w $vr0, $vr2, $vr1 + vadd.w $vr0, $vr1, $vr0 vhaddw.d.w $vr0, $vr0, $vr0 vhaddw.q.d $vr0, $vr0, $vr0 vpickve2gr.d $a3, $vr0, 0 @@ -5320,7 +5301,6 @@ _ZN8NArchive3N7z11COutArchive11WriteHeaderERKNS0_16CArchiveDatabaseERKNS0_14CHea move $a0, $fp pcaddu18i $ra, %call36(_ZN8NArchive3N7z11COutArchive22WriteAlignedBoolHeaderERK13CRecordVectorIbEihj) jirl $ra, $ra, 0 - vld $vr8, $sp, 32 # 16-byte Folded Reload ld.w $a0, $s0, 364 blez $a0, .LBB23_205 # %bb.201: # %.lr.ph24.i247 @@ -5342,7 +5322,6 @@ _ZN8NArchive3N7z11COutArchive11WriteHeaderERKNS0_16CArchiveDatabaseERKNS0_14CHea move $a0, $fp pcaddu18i $ra, %call36(_ZN8NArchive3N7z11COutArchive11WriteUInt64Ey) jirl $ra, $ra, 0 - vld $vr8, $sp, 32 # 16-byte Folded Reload ld.w $a0, $s0, 364 b .LBB23_202 .LBB23_205: # %_ZN8NArchive3N7z11COutArchive20WriteUInt64DefVectorERKNS0_16CUInt64DefVectorEh.exit251 @@ -5361,8 +5340,8 @@ _ZN8NArchive3N7z11COutArchive11WriteHeaderERKNS0_16CArchiveDatabaseERKNS0_14CHea slli.d $a2, $a2, 3 addi.d $a3, $a1, 4 move $a4, $a2 - vori.b $vr0, $vr8, 0 - vori.b $vr1, $vr8, 0 + vld $vr1, $sp, 32 # 16-byte Folded Reload + vori.b $vr0, $vr1, 0 .p2align 4, , 16 .LBB23_209: # %vector.body476 # =>This Inner Loop Header: Depth=1 @@ -5370,10 +5349,10 @@ _ZN8NArchive3N7z11COutArchive11WriteHeaderERKNS0_16CArchiveDatabaseERKNS0_14CHea ld.w $a6, $a3, 0 vinsgr2vr.w $vr2, $a5, 0 vinsgr2vr.w $vr3, $a6, 0 - vilvl.b $vr2, $vr8, $vr2 - vilvl.h $vr2, $vr8, $vr2 - vilvl.b $vr3, $vr8, $vr3 - vilvl.h $vr3, $vr8, $vr3 + vsllwil.hu.bu $vr2, $vr2, 0 
+ vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.hu.bu $vr3, $vr3, 0 + vsllwil.wu.hu $vr3, $vr3, 0 vadd.w $vr0, $vr0, $vr2 vadd.w $vr1, $vr1, $vr3 addi.d $a4, $a4, -8 @@ -5406,7 +5385,6 @@ _ZN8NArchive3N7z11COutArchive11WriteHeaderERKNS0_16CArchiveDatabaseERKNS0_14CHea move $a0, $fp pcaddu18i $ra, %call36(_ZN8NArchive3N7z11COutArchive22WriteAlignedBoolHeaderERK13CRecordVectorIbEihj) jirl $ra, $ra, 0 - vld $vr8, $sp, 32 # 16-byte Folded Reload ld.w $a0, $s0, 428 blez $a0, .LBB23_219 # %bb.215: # %.lr.ph24.i260 @@ -5428,11 +5406,11 @@ _ZN8NArchive3N7z11COutArchive11WriteHeaderERKNS0_16CArchiveDatabaseERKNS0_14CHea move $a0, $fp pcaddu18i $ra, %call36(_ZN8NArchive3N7z11COutArchive11WriteUInt64Ey) jirl $ra, $ra, 0 - vld $vr8, $sp, 32 # 16-byte Folded Reload ld.w $a0, $s0, 428 b .LBB23_216 .LBB23_219: # %_ZN8NArchive3N7z11COutArchive20WriteUInt64DefVectorERKNS0_16CUInt64DefVectorEh.exit264 - vst $vr8, $sp, 128 + vld $vr0, $sp, 32 # 16-byte Folded Reload + vst $vr0, $sp, 128 ld.w $a1, $s0, 172 ori $a0, $zero, 1 st.d $a0, $sp, 144 diff --git a/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/CPP/7zip/Archive/Cab/CabBlockInStream.s b/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/CPP/7zip/Archive/Cab/CabBlockInStream.s index 4a51865d..f87bc72c 100644 --- a/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/CPP/7zip/Archive/Cab/CabBlockInStream.s +++ b/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/CPP/7zip/Archive/Cab/CabBlockInStream.s @@ -289,7 +289,6 @@ _ZN8NArchive4NCab10CCheckSum26UpdateEPKvj: # @_ZN8NArchive4NCab10CCheckSum26Upda vori.b $vr1, $vr0, 0 vinsgr2vr.w $vr1, $t0, 0 move $t0, $a6 - vori.b $vr2, $vr0, 0 .p2align 4, , 16 .LBB4_10: # %vector.body # =>This Inner Loop Header: Depth=1 @@ -297,70 +296,70 @@ _ZN8NArchive4NCab10CCheckSum26UpdateEPKvj: # @_ZN8NArchive4NCab10CCheckSum26Upda ld.b $t2, $a7, -11 ld.b $t3, $a7, -7 ld.b $t4, $a7, -3 - vinsgr2vr.b $vr3, $t1, 0 - vinsgr2vr.b $vr3, $t2, 1 - vinsgr2vr.b $vr3, $t3, 2 - vinsgr2vr.b $vr3, $t4, 3 + vinsgr2vr.b $vr2, $t1, 0 + vinsgr2vr.b $vr2, $t2, 1 + vinsgr2vr.b $vr2, $t3, 2 + vinsgr2vr.b $vr2, $t4, 3 ld.b $t1, $a7, 1 ld.b $t2, $a7, 5 ld.b $t3, $a7, 9 ld.b $t4, $a7, 13 - vinsgr2vr.b $vr4, $t1, 0 - vinsgr2vr.b $vr4, $t2, 1 - vinsgr2vr.b $vr4, $t3, 2 - vinsgr2vr.b $vr4, $t4, 3 - vilvl.b $vr3, $vr0, $vr3 - vilvl.h $vr3, $vr0, $vr3 - vilvl.b $vr4, $vr0, $vr4 - vilvl.h $vr4, $vr0, $vr4 + vinsgr2vr.b $vr3, $t1, 0 + vinsgr2vr.b $vr3, $t2, 1 + vinsgr2vr.b $vr3, $t3, 2 + vinsgr2vr.b $vr3, $t4, 3 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.hu.bu $vr3, $vr3, 0 + vsllwil.wu.hu $vr3, $vr3, 0 ld.b $t1, $a7, -14 ld.b $t2, $a7, -10 ld.b $t3, $a7, -6 ld.b $t4, $a7, -2 - vinsgr2vr.b $vr5, $t1, 0 - vinsgr2vr.b $vr5, $t2, 1 - vinsgr2vr.b $vr5, $t3, 2 - vinsgr2vr.b $vr5, $t4, 3 + vinsgr2vr.b $vr4, $t1, 0 + vinsgr2vr.b $vr4, $t2, 1 + vinsgr2vr.b $vr4, $t3, 2 + vinsgr2vr.b $vr4, $t4, 3 ld.b $t1, $a7, 2 ld.b $t2, $a7, 6 ld.b $t3, $a7, 10 ld.b $t4, $a7, 14 - vinsgr2vr.b $vr6, $t1, 0 - vinsgr2vr.b $vr6, $t2, 1 - vinsgr2vr.b $vr6, $t3, 2 - vinsgr2vr.b $vr6, $t4, 3 - vilvl.b $vr5, $vr0, $vr5 - vilvl.h $vr5, $vr0, $vr5 - vilvl.b $vr6, $vr0, $vr6 - vilvl.h $vr6, $vr0, $vr6 + vinsgr2vr.b $vr5, $t1, 0 + vinsgr2vr.b $vr5, $t2, 1 + vinsgr2vr.b $vr5, $t3, 2 + vinsgr2vr.b $vr5, $t4, 3 + vsllwil.hu.bu $vr4, $vr4, 0 + vsllwil.wu.hu $vr4, $vr4, 0 + vsllwil.hu.bu $vr5, $vr5, 0 + vsllwil.wu.hu $vr5, $vr5, 0 + vslli.w $vr4, $vr4, 8 vslli.w $vr5, $vr5, 8 - vslli.w $vr6, $vr6, 8 + vor.v $vr2, 
$vr4, $vr2 vor.v $vr3, $vr5, $vr3 - vor.v $vr4, $vr6, $vr4 ld.b $t1, $a7, -13 ld.b $t2, $a7, -9 ld.b $t3, $a7, -5 ld.b $t4, $a7, -1 - vinsgr2vr.b $vr5, $t1, 0 - vinsgr2vr.b $vr5, $t2, 1 - vinsgr2vr.b $vr5, $t3, 2 - vinsgr2vr.b $vr5, $t4, 3 + vinsgr2vr.b $vr4, $t1, 0 + vinsgr2vr.b $vr4, $t2, 1 + vinsgr2vr.b $vr4, $t3, 2 + vinsgr2vr.b $vr4, $t4, 3 ld.b $t1, $a7, 3 ld.b $t2, $a7, 7 ld.b $t3, $a7, 11 ld.b $t4, $a7, 15 - vinsgr2vr.b $vr6, $t1, 0 - vinsgr2vr.b $vr6, $t2, 1 - vinsgr2vr.b $vr6, $t3, 2 - vinsgr2vr.b $vr6, $t4, 3 - vilvl.b $vr5, $vr0, $vr5 - vilvl.h $vr5, $vr0, $vr5 - vilvl.b $vr6, $vr0, $vr6 - vilvl.h $vr6, $vr0, $vr6 + vinsgr2vr.b $vr5, $t1, 0 + vinsgr2vr.b $vr5, $t2, 1 + vinsgr2vr.b $vr5, $t3, 2 + vinsgr2vr.b $vr5, $t4, 3 + vsllwil.hu.bu $vr4, $vr4, 0 + vsllwil.wu.hu $vr4, $vr4, 0 + vsllwil.hu.bu $vr5, $vr5, 0 + vsllwil.wu.hu $vr5, $vr5, 0 + vslli.w $vr4, $vr4, 16 vslli.w $vr5, $vr5, 16 - vslli.w $vr6, $vr6, 16 + vor.v $vr2, $vr2, $vr4 vor.v $vr3, $vr3, $vr5 - vor.v $vr4, $vr4, $vr6 ld.b $t1, $a7, -12 ld.b $t2, $a7, -8 ld.b $t3, $a7, -4 @@ -369,25 +368,25 @@ _ZN8NArchive4NCab10CCheckSum26UpdateEPKvj: # @_ZN8NArchive4NCab10CCheckSum26Upda ld.b $t6, $a7, 8 ld.b $t7, $a7, 12 ld.b $t8, $a7, 16 - vinsgr2vr.b $vr5, $t1, 0 - vinsgr2vr.b $vr5, $t2, 4 - vinsgr2vr.b $vr5, $t3, 8 - vinsgr2vr.b $vr5, $t4, 12 - vinsgr2vr.b $vr6, $t5, 0 - vinsgr2vr.b $vr6, $t6, 4 - vinsgr2vr.b $vr6, $t7, 8 - vinsgr2vr.b $vr6, $t8, 12 + vinsgr2vr.b $vr4, $t1, 0 + vinsgr2vr.b $vr4, $t2, 4 + vinsgr2vr.b $vr4, $t3, 8 + vinsgr2vr.b $vr4, $t4, 12 + vinsgr2vr.b $vr5, $t5, 0 + vinsgr2vr.b $vr5, $t6, 4 + vinsgr2vr.b $vr5, $t7, 8 + vinsgr2vr.b $vr5, $t8, 12 + vslli.w $vr4, $vr4, 24 vslli.w $vr5, $vr5, 24 - vslli.w $vr6, $vr6, 24 + vor.v $vr2, $vr2, $vr4 vor.v $vr3, $vr3, $vr5 - vor.v $vr4, $vr4, $vr6 - vxor.v $vr1, $vr3, $vr1 - vxor.v $vr2, $vr4, $vr2 + vxor.v $vr1, $vr2, $vr1 + vxor.v $vr0, $vr3, $vr0 addi.d $t0, $t0, -8 addi.d $a7, $a7, 32 bnez $t0, .LBB4_10 # %bb.11: # %middle.block - vxor.v $vr0, $vr2, $vr1 + vxor.v $vr0, $vr0, $vr1 vbsrl.v $vr1, $vr0, 8 vxor.v $vr0, $vr1, $vr0 vbsrl.v $vr1, $vr0, 4 @@ -632,34 +631,33 @@ _ZN8NArchive4NCab17CCabBlockInStream7PreReadERjS2_: # @_ZN8NArchive4NCab17CCabBl vrepli.b $vr4, -1 vrepli.w $vr5, -40 move $a4, $a2 - vori.b $vr6, $vr0, 0 .p2align 4, , 16 .LBB5_19: # %vector.body # =>This Inner Loop Header: Depth=1 ld.w $a5, $a3, -4 ld.w $a6, $a3, 0 - vinsgr2vr.w $vr7, $a5, 0 - vinsgr2vr.w $vr8, $a6, 0 - vilvl.b $vr7, $vr0, $vr7 - vilvl.h $vr7, $vr0, $vr7 - vilvl.b $vr8, $vr0, $vr8 - vilvl.h $vr8, $vr0, $vr8 - vxor.v $vr9, $vr3, $vr4 - vadd.w $vr9, $vr2, $vr9 - vsub.w $vr10, $vr2, $vr3 + vinsgr2vr.w $vr6, $a5, 0 + vinsgr2vr.w $vr7, $a6, 0 + vsllwil.hu.bu $vr6, $vr6, 0 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.hu.bu $vr7, $vr7, 0 + vsllwil.wu.hu $vr7, $vr7, 0 + vxor.v $vr8, $vr3, $vr4 + vadd.w $vr8, $vr2, $vr8 + vsub.w $vr9, $vr2, $vr3 + vslli.w $vr8, $vr8, 3 vslli.w $vr9, $vr9, 3 - vslli.w $vr10, $vr10, 3 - vadd.w $vr10, $vr10, $vr5 + vadd.w $vr9, $vr9, $vr5 + vsll.w $vr6, $vr6, $vr8 vsll.w $vr7, $vr7, $vr9 - vsll.w $vr8, $vr8, $vr10 - vxor.v $vr1, $vr7, $vr1 - vxor.v $vr6, $vr8, $vr6 + vxor.v $vr1, $vr6, $vr1 + vxor.v $vr0, $vr7, $vr0 vaddi.wu $vr3, $vr3, 8 addi.d $a4, $a4, -8 addi.d $a3, $a3, 8 bnez $a4, .LBB5_19 # %bb.20: # %middle.block - vxor.v $vr0, $vr6, $vr1 + vxor.v $vr0, $vr0, $vr1 vbsrl.v $vr1, $vr0, 8 vxor.v $vr0, $vr1, $vr0 vbsrl.v $vr1, $vr0, 4 diff --git a/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/CPP/7zip/Archive/Tar/TarIn.s 
b/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/CPP/7zip/Archive/Tar/TarIn.s index c8fde4ab..7c331fcf 100644 --- a/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/CPP/7zip/Archive/Tar/TarIn.s +++ b/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/CPP/7zip/Archive/Tar/TarIn.s @@ -1101,9 +1101,8 @@ _ZN8NArchive4NTar8ReadItemEP19ISequentialInStreamRbRNS0_7CItemExER11CStringBaseI .LBB0_156: # %vector.body476.preheader # in Loop: Header=BB0_2 Depth=1 move $a0, $zero - vld $vr4, $sp, 16 # 16-byte Folded Reload - vori.b $vr0, $vr4, 0 - vori.b $vr1, $vr4, 0 + vld $vr1, $sp, 16 # 16-byte Folded Reload + vori.b $vr0, $vr1, 0 .p2align 4, , 16 .LBB0_157: # %vector.body476 # Parent Loop BB0_2 Depth=1 @@ -1113,10 +1112,10 @@ _ZN8NArchive4NTar8ReadItemEP19ISequentialInStreamRbRNS0_7CItemExER11CStringBaseI ld.w $a1, $a1, 4 vinsgr2vr.w $vr2, $a2, 0 vinsgr2vr.w $vr3, $a1, 0 - vilvl.b $vr2, $vr4, $vr2 - vilvl.h $vr2, $vr4, $vr2 - vilvl.b $vr3, $vr4, $vr3 - vilvl.h $vr3, $vr4, $vr3 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.hu.bu $vr3, $vr3, 0 + vsllwil.wu.hu $vr3, $vr3, 0 vadd.w $vr0, $vr0, $vr2 addi.d $a0, $a0, 8 vadd.w $vr1, $vr1, $vr3 diff --git a/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/CPP/7zip/Archive/Tar/TarOut.s b/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/CPP/7zip/Archive/Tar/TarOut.s index 7e0f14a9..e9944e65 100644 --- a/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/CPP/7zip/Archive/Tar/TarOut.s +++ b/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/CPP/7zip/Archive/Tar/TarOut.s @@ -424,24 +424,23 @@ _ZN8NArchive4NTar11COutArchive15WriteHeaderRealERKNS0_5CItemE: # @_ZN8NArchive4N addi.d $a1, $sp, 16 ori $a2, $zero, 512 vori.b $vr1, $vr0, 0 - vori.b $vr2, $vr0, 0 .LBB2_59: # %vector.body # =>This Inner Loop Header: Depth=1 add.d $a3, $a1, $a0 ldx.w $a4, $a0, $a1 ld.w $a3, $a3, 4 - vinsgr2vr.w $vr3, $a4, 0 - vinsgr2vr.w $vr4, $a3, 0 - vilvl.b $vr3, $vr0, $vr3 - vilvl.h $vr3, $vr0, $vr3 - vilvl.b $vr4, $vr0, $vr4 - vilvl.h $vr4, $vr0, $vr4 - vadd.w $vr1, $vr1, $vr3 + vinsgr2vr.w $vr2, $a4, 0 + vinsgr2vr.w $vr3, $a3, 0 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.hu.bu $vr3, $vr3, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vadd.w $vr0, $vr0, $vr2 addi.d $a0, $a0, 8 - vadd.w $vr2, $vr2, $vr4 + vadd.w $vr1, $vr1, $vr3 bne $a0, $a2, .LBB2_59 # %bb.60: # %middle.block - vadd.w $vr0, $vr2, $vr1 + vadd.w $vr0, $vr1, $vr0 vhaddw.d.w $vr0, $vr0, $vr0 vhaddw.q.d $vr0, $vr0, $vr0 vpickve2gr.d $a0, $vr0, 0 diff --git a/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/CPP/7zip/Archive/XzHandler.s b/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/CPP/7zip/Archive/XzHandler.s index 9cb0d66d..bb6c629d 100644 --- a/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/CPP/7zip/Archive/XzHandler.s +++ b/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/CPP/7zip/Archive/XzHandler.s @@ -3055,7 +3055,6 @@ _ZN8NArchive3NXzL14GetCheckStringERK4CXzs: # @_ZN8NArchive3NXzL14GetCheckStringE vrepli.h $vr1, 15 move $a4, $a0 vori.b $vr2, $vr0, 0 - vori.b $vr3, $vr0, 0 .p2align 4, , 16 .LBB13_5: # %vector.body # =>This Inner Loop Header: Depth=1 @@ -3063,29 +3062,29 @@ _ZN8NArchive3NXzL14GetCheckStringERK4CXzs: # @_ZN8NArchive3NXzL14GetCheckStringE ld.h $a6, $a3, -120 ld.h $a7, $a3, -80 ld.h $t0, $a3, -40 - vinsgr2vr.h $vr4, $a5, 0 - vinsgr2vr.h $vr4, $a6, 1 - vinsgr2vr.h $vr4, $a7, 2 - 
vinsgr2vr.h $vr4, $t0, 3 + vinsgr2vr.h $vr3, $a5, 0 + vinsgr2vr.h $vr3, $a6, 1 + vinsgr2vr.h $vr3, $a7, 2 + vinsgr2vr.h $vr3, $t0, 3 ld.h $a5, $a3, 0 ld.h $a6, $a3, 40 ld.h $a7, $a3, 80 ld.h $t0, $a3, 120 - vinsgr2vr.h $vr5, $a5, 0 - vinsgr2vr.h $vr5, $a6, 1 - vinsgr2vr.h $vr5, $a7, 2 - vinsgr2vr.h $vr5, $t0, 3 + vinsgr2vr.h $vr4, $a5, 0 + vinsgr2vr.h $vr4, $a6, 1 + vinsgr2vr.h $vr4, $a7, 2 + vinsgr2vr.h $vr4, $t0, 3 + vand.v $vr3, $vr3, $vr1 vand.v $vr4, $vr4, $vr1 - vand.v $vr5, $vr5, $vr1 - vilvl.h $vr4, $vr0, $vr4 - vilvl.h $vr5, $vr0, $vr5 + vsllwil.wu.hu $vr3, $vr3, 0 + vsllwil.wu.hu $vr4, $vr4, 0 + vbitset.w $vr0, $vr0, $vr3 vbitset.w $vr2, $vr2, $vr4 - vbitset.w $vr3, $vr3, $vr5 addi.d $a4, $a4, -8 addi.d $a3, $a3, 320 bnez $a4, .LBB13_5 # %bb.6: # %middle.block - vor.v $vr0, $vr3, $vr2 + vor.v $vr0, $vr2, $vr0 vbsrl.v $vr1, $vr0, 8 vor.v $vr0, $vr1, $vr0 vbsrl.v $vr1, $vr0, 4 diff --git a/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/CPP/7zip/Archive/Zip/ZipIn.s b/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/CPP/7zip/Archive/Zip/ZipIn.s index 008e5b80..92d5b97b 100644 --- a/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/CPP/7zip/Archive/Zip/ZipIn.s +++ b/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/CPP/7zip/Archive/Zip/ZipIn.s @@ -3068,123 +3068,123 @@ _ZN8NArchive4NZip10CInArchive10ReadCdItemERNS0_7CItemExE: # @_ZN8NArchive4NZip10 .p2align 4, 0x0 # -- Begin function _ZN8NArchive4NZip10CInArchive8TryEcd64EyRNS0_7CCdInfoE .LCPI23_0: .byte 0 # 0x0 - .byte 17 # 0x11 - .byte 18 # 0x12 - .byte 19 # 0x13 - .byte 20 # 0x14 - .byte 21 # 0x15 - .byte 22 # 0x16 - .byte 23 # 0x17 .byte 8 # 0x8 - .byte 25 # 0x19 - .byte 26 # 0x1a - .byte 27 # 0x1b - .byte 28 # 0x1c - .byte 29 # 0x1d - .byte 30 # 0x1e - .byte 31 # 0x1f + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff .LCPI23_1: .byte 1 # 0x1 - .byte 17 # 0x11 - .byte 18 # 0x12 - .byte 19 # 0x13 - .byte 20 # 0x14 - .byte 21 # 0x15 - .byte 22 # 0x16 - .byte 23 # 0x17 .byte 9 # 0x9 - .byte 25 # 0x19 - .byte 26 # 0x1a - .byte 27 # 0x1b - .byte 28 # 0x1c - .byte 29 # 0x1d - .byte 30 # 0x1e - .byte 31 # 0x1f + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff .LCPI23_2: .byte 2 # 0x2 - .byte 17 # 0x11 - .byte 18 # 0x12 - .byte 19 # 0x13 - .byte 20 # 0x14 - .byte 21 # 0x15 - .byte 22 # 0x16 - .byte 23 # 0x17 .byte 10 # 0xa - .byte 25 # 0x19 - .byte 26 # 0x1a - .byte 27 # 0x1b - .byte 28 # 0x1c - .byte 29 # 0x1d - .byte 30 # 0x1e - .byte 31 # 0x1f + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff .LCPI23_3: .byte 3 # 0x3 - .byte 17 # 0x11 - .byte 18 # 0x12 - .byte 19 # 0x13 - .byte 20 # 0x14 - .byte 21 # 0x15 - .byte 22 # 0x16 - .byte 23 # 0x17 .byte 11 # 0xb - .byte 25 # 0x19 - .byte 26 # 0x1a - .byte 27 # 0x1b - .byte 28 # 0x1c - .byte 29 # 0x1d - .byte 30 # 0x1e - .byte 31 # 0x1f + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 
0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff .LCPI23_4: .byte 4 # 0x4 - .byte 17 # 0x11 - .byte 18 # 0x12 - .byte 19 # 0x13 - .byte 20 # 0x14 - .byte 21 # 0x15 - .byte 22 # 0x16 - .byte 23 # 0x17 .byte 12 # 0xc - .byte 25 # 0x19 - .byte 26 # 0x1a - .byte 27 # 0x1b - .byte 28 # 0x1c - .byte 29 # 0x1d - .byte 30 # 0x1e - .byte 31 # 0x1f + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff .LCPI23_5: .byte 5 # 0x5 - .byte 17 # 0x11 - .byte 18 # 0x12 - .byte 19 # 0x13 - .byte 20 # 0x14 - .byte 21 # 0x15 - .byte 22 # 0x16 - .byte 23 # 0x17 .byte 13 # 0xd - .byte 25 # 0x19 - .byte 26 # 0x1a - .byte 27 # 0x1b - .byte 28 # 0x1c - .byte 29 # 0x1d - .byte 30 # 0x1e - .byte 31 # 0x1f + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff .LCPI23_6: .byte 6 # 0x6 - .byte 17 # 0x11 - .byte 18 # 0x12 - .byte 19 # 0x13 - .byte 20 # 0x14 - .byte 21 # 0x15 - .byte 22 # 0x16 - .byte 23 # 0x17 .byte 14 # 0xe - .byte 25 # 0x19 - .byte 26 # 0x1a - .byte 27 # 0x1b - .byte 28 # 0x1c - .byte 29 # 0x1d - .byte 30 # 0x1e - .byte 31 # 0x1f + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff .text .globl _ZN8NArchive4NZip10CInArchive8TryEcd64EyRNS0_7CCdInfoE .p2align 5 @@ -3247,17 +3247,37 @@ _ZN8NArchive4NZip10CInArchive8TryEcd64EyRNS0_7CCdInfoE: # @_ZN8NArchive4NZip10CI vld $vr6, $a1, %pc_lo12(.LCPI23_5) pcalau12i $a1, %pc_hi20(.LCPI23_6) vld $vr7, $a1, %pc_lo12(.LCPI23_6) - vrepli.b $vr8, 0 - vshuf.b $vr1, $vr8, $vr0, $vr1 - vshuf.b $vr2, $vr8, $vr0, $vr2 + vshuf.b $vr1, $vr0, $vr0, $vr1 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vsllwil.du.wu $vr1, $vr1, 0 + vshuf.b $vr2, $vr0, $vr0, $vr2 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.du.wu $vr2, $vr2, 0 vslli.d $vr2, $vr2, 8 - vshuf.b $vr3, $vr8, $vr0, $vr3 + vshuf.b $vr3, $vr0, $vr0, $vr3 + vsllwil.hu.bu $vr3, $vr3, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vsllwil.du.wu $vr3, $vr3, 0 vslli.d $vr3, $vr3, 16 - vshuf.b $vr4, $vr8, $vr0, $vr4 + vshuf.b $vr4, $vr0, $vr0, $vr4 + vsllwil.hu.bu $vr4, $vr4, 0 + vsllwil.wu.hu $vr4, $vr4, 0 + vsllwil.du.wu $vr4, $vr4, 0 vslli.d $vr4, $vr4, 24 - vshuf.b $vr5, $vr8, $vr0, $vr5 - vshuf.b $vr6, $vr8, $vr0, $vr6 - vshuf.b $vr7, $vr8, $vr0, $vr7 + vshuf.b $vr5, $vr0, $vr0, $vr5 + vsllwil.hu.bu $vr5, $vr5, 0 + vsllwil.wu.hu $vr5, $vr5, 0 + vsllwil.du.wu $vr5, $vr5, 0 + vshuf.b $vr6, $vr0, $vr0, $vr6 + vsllwil.hu.bu $vr6, $vr6, 0 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.du.wu $vr6, $vr6, 0 + vshuf.b $vr7, $vr0, $vr0, $vr7 + vsllwil.hu.bu $vr7, $vr7, 0 + vsllwil.wu.hu $vr7, $vr7, 0 + vsllwil.du.wu $vr7, $vr7, 0 vsrli.d $vr0, $vr0, 56 vslli.d $vr0, $vr0, 56 vslli.d $vr7, $vr7, 48 @@ -5135,123 +5155,123 @@ _ZN8NArchive4NZip6CEcd645ParseEPKh: # @_ZN8NArchive4NZip6CEcd645ParseEPKh .p2align 4, 0x0 # -- Begin 
function _ZN8NArchive4NZip10CInArchive11ReadHeadersER13CObjectVectorINS0_7CItemExEEPNS0_13CProgressVirtE .LCPI31_0: .byte 8 # 0x8 - .byte 17 # 0x11 - .byte 18 # 0x12 - .byte 19 # 0x13 - .byte 20 # 0x14 - .byte 21 # 0x15 - .byte 22 # 0x16 - .byte 23 # 0x17 .byte 0 # 0x0 - .byte 25 # 0x19 - .byte 26 # 0x1a - .byte 27 # 0x1b - .byte 28 # 0x1c - .byte 29 # 0x1d - .byte 30 # 0x1e - .byte 31 # 0x1f + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff .LCPI31_1: .byte 9 # 0x9 - .byte 17 # 0x11 - .byte 18 # 0x12 - .byte 19 # 0x13 - .byte 20 # 0x14 - .byte 21 # 0x15 - .byte 22 # 0x16 - .byte 23 # 0x17 .byte 1 # 0x1 - .byte 25 # 0x19 - .byte 26 # 0x1a - .byte 27 # 0x1b - .byte 28 # 0x1c - .byte 29 # 0x1d - .byte 30 # 0x1e - .byte 31 # 0x1f + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff .LCPI31_2: .byte 10 # 0xa - .byte 17 # 0x11 - .byte 18 # 0x12 - .byte 19 # 0x13 - .byte 20 # 0x14 - .byte 21 # 0x15 - .byte 22 # 0x16 - .byte 23 # 0x17 .byte 2 # 0x2 - .byte 25 # 0x19 - .byte 26 # 0x1a - .byte 27 # 0x1b - .byte 28 # 0x1c - .byte 29 # 0x1d - .byte 30 # 0x1e - .byte 31 # 0x1f + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff .LCPI31_3: .byte 11 # 0xb - .byte 17 # 0x11 - .byte 18 # 0x12 - .byte 19 # 0x13 - .byte 20 # 0x14 - .byte 21 # 0x15 - .byte 22 # 0x16 - .byte 23 # 0x17 .byte 3 # 0x3 - .byte 25 # 0x19 - .byte 26 # 0x1a - .byte 27 # 0x1b - .byte 28 # 0x1c - .byte 29 # 0x1d - .byte 30 # 0x1e - .byte 31 # 0x1f + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff .LCPI31_4: .byte 12 # 0xc - .byte 17 # 0x11 - .byte 18 # 0x12 - .byte 19 # 0x13 - .byte 20 # 0x14 - .byte 21 # 0x15 - .byte 22 # 0x16 - .byte 23 # 0x17 .byte 4 # 0x4 - .byte 25 # 0x19 - .byte 26 # 0x1a - .byte 27 # 0x1b - .byte 28 # 0x1c - .byte 29 # 0x1d - .byte 30 # 0x1e - .byte 31 # 0x1f + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff .LCPI31_5: .byte 13 # 0xd - .byte 17 # 0x11 - .byte 18 # 0x12 - .byte 19 # 0x13 - .byte 20 # 0x14 - .byte 21 # 0x15 - .byte 22 # 0x16 - .byte 23 # 0x17 .byte 5 # 0x5 - .byte 25 # 0x19 - .byte 26 # 0x1a - .byte 27 # 0x1b - .byte 28 # 0x1c - .byte 29 # 0x1d - .byte 30 # 0x1e - .byte 31 # 0x1f + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff .LCPI31_6: .byte 14 # 0xe - .byte 17 # 0x11 - .byte 18 # 0x12 - .byte 19 # 0x13 - .byte 20 # 0x14 - 
.byte 21 # 0x15 - .byte 22 # 0x16 - .byte 23 # 0x17 .byte 6 # 0x6 - .byte 25 # 0x19 - .byte 26 # 0x1a - .byte 27 # 0x1b - .byte 28 # 0x1c - .byte 29 # 0x1d - .byte 30 # 0x1e - .byte 31 # 0x1f + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff .text .globl _ZN8NArchive4NZip10CInArchive11ReadHeadersER13CObjectVectorINS0_7CItemExEEPNS0_13CProgressVirtE .p2align 5 @@ -5364,16 +5384,16 @@ _ZN8NArchive4NZip10CInArchive11ReadHeadersER13CObjectVectorINS0_7CItemExEEPNS0_1 ld.d $a2, $fp, 88 addi.d $a3, $a0, -4 .LBB31_10: - ld.w $s4, $fp, 8 + ld.w $s5, $fp, 8 pcalau12i $a0, %got_pc_hi20(_ZN8NArchive4NZip10NSignature21kZip64EndOfCentralDirE) ld.d $a0, $a0, %got_pc_lo12(_ZN8NArchive4NZip10NSignature21kZip64EndOfCentralDirE) ld.w $s6, $a0, 0 - beq $s4, $s6, .LBB31_16 + beq $s5, $s6, .LBB31_16 # %bb.11: move $s2, $zero - move $s5, $zero + move $s4, $zero vrepli.b $vr0, 0 - move $a1, $s4 + move $a1, $s5 pcalau12i $a0, %got_pc_hi20(_ZN8NArchive4NZip10NSignature28kZip64EndOfCentralDirLocatorE) ld.d $a0, $a0, %got_pc_lo12(_ZN8NArchive4NZip10NSignature28kZip64EndOfCentralDirLocatorE) ld.w $a0, $a0, 0 @@ -5428,21 +5448,21 @@ _ZN8NArchive4NZip10CInArchive11ReadHeadersER13CObjectVectorINS0_7CItemExEEPNS0_1 jirl $ra, $ra, 0 bnez $a0, .LBB31_45 # %bb.17: # %_ZN8NArchive4NZip10CInArchive20ReadBytesAndTestSizeEPvj.exit.i96 - st.d $s6, $sp, 128 # 8-byte Folded Spill + st.d $s6, $sp, 120 # 8-byte Folded Spill + st.d $s5, $sp, 128 # 8-byte Folded Spill st.d $s3, $sp, 160 # 8-byte Folded Spill ld.w $a0, $sp, 244 bne $a0, $s1, .LBB31_47 # %bb.18: # %_ZN8NArchive4NZip10CInArchive13SafeReadBytesEPvj.exit ld.bu $a0, $sp, 180 - st.d $a0, $sp, 120 # 8-byte Folded Spill - ld.bu $s3, $sp, 181 - ld.bu $a0, $sp, 182 st.d $a0, $sp, 112 # 8-byte Folded Spill - ld.bu $s8, $sp, 183 + ld.bu $s1, $sp, 181 + ld.bu $s3, $sp, 182 + ld.bu $s2, $sp, 183 ld.bu $s6, $sp, 184 ld.bu $s7, $sp, 185 ld.bu $s5, $sp, 186 - ld.bu $s1, $sp, 187 + ld.bu $s4, $sp, 187 vld $vr0, $sp, 188 vst $vr0, $sp, 96 # 16-byte Folded Spill ld.bu $a0, $sp, 208 @@ -5468,7 +5488,7 @@ _ZN8NArchive4NZip10CInArchive11ReadHeadersER13CObjectVectorINS0_7CItemExEEPNS0_1 addi.d $s0, $s0, -44 beqz $s0, .LBB31_23 # %bb.19: # %.lr.ph.i.preheader - ori $s2, $zero, 1 + ori $s8, $zero, 1 .p2align 4, , 16 .LBB31_20: # %.lr.ph.i # =>This Inner Loop Header: Depth=1 @@ -5482,7 +5502,7 @@ _ZN8NArchive4NZip10CInArchive11ReadHeadersER13CObjectVectorINS0_7CItemExEEPNS0_1 # %bb.21: # %_ZN8NArchive4NZip10CInArchive20ReadBytesAndTestSizeEPvj.exit.i.i.i # in Loop: Header=BB31_20 Depth=1 ld.w $a0, $sp, 244 - bne $a0, $s2, .LBB31_47 + bne $a0, $s8, .LBB31_47 # %bb.22: # %_ZN8NArchive4NZip10CInArchive8ReadByteEv.exit.i # in Loop: Header=BB31_20 Depth=1 addi.d $s0, $s0, -1 @@ -5501,64 +5521,84 @@ _ZN8NArchive4NZip10CInArchive11ReadHeadersER13CObjectVectorINS0_7CItemExEEPNS0_1 bne $a0, $s0, .LBB31_31 # %bb.25: ld.w $a1, $sp, 240 - ld.d $a0, $sp, 120 # 8-byte Folded Reload - or $a0, $s3, $a0 - ld.d $a2, $sp, 112 # 8-byte Folded Reload - or $a0, $a0, $a2 - or $a0, $a0, $s8 + ld.d $a0, $sp, 112 # 8-byte Folded Reload + or $a0, $s1, $a0 + or $a0, $a0, $s3 + or $a0, $a0, $s2 or $a2, $s7, $s6 or $a2, $a2, $s5 - or $a2, $a2, $s1 + or $a2, $a2, $s4 or $a0, $a0, $a2 st.w $a1, $fp, 8 bnez $a0, .LBB31_48 # %bb.26: pcalau12i $a0, %pc_hi20(.LCPI31_0) vld $vr0, $a0, %pc_lo12(.LCPI31_0) + vld $vr8, $sp, 
96 # 16-byte Folded Reload + vshuf.b $vr0, $vr0, $vr8, $vr0 pcalau12i $a0, %pc_hi20(.LCPI31_1) vld $vr1, $a0, %pc_lo12(.LCPI31_1) - vrepli.b $vr2, 0 - vld $vr8, $sp, 96 # 16-byte Folded Reload - vshuf.b $vr0, $vr2, $vr8, $vr0 - vshuf.b $vr1, $vr2, $vr8, $vr1 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.du.wu $vr0, $vr0, 0 + vshuf.b $vr1, $vr0, $vr8, $vr1 + vsllwil.hu.bu $vr1, $vr1, 0 pcalau12i $a0, %pc_hi20(.LCPI31_2) - vld $vr3, $a0, %pc_lo12(.LCPI31_2) - pcalau12i $a0, %pc_hi20(.LCPI31_3) - vld $vr4, $a0, %pc_lo12(.LCPI31_3) + vld $vr2, $a0, %pc_lo12(.LCPI31_2) + vsllwil.wu.hu $vr1, $vr1, 0 + vsllwil.du.wu $vr1, $vr1, 0 vslli.d $vr1, $vr1, 8 - vshuf.b $vr3, $vr2, $vr8, $vr3 - vslli.d $vr3, $vr3, 16 - vshuf.b $vr4, $vr2, $vr8, $vr4 + vshuf.b $vr2, $vr0, $vr8, $vr2 + vsllwil.hu.bu $vr2, $vr2, 0 + pcalau12i $a0, %pc_hi20(.LCPI31_3) + vld $vr3, $a0, %pc_lo12(.LCPI31_3) + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.du.wu $vr2, $vr2, 0 + vslli.d $vr2, $vr2, 16 + vshuf.b $vr3, $vr0, $vr8, $vr3 + vsllwil.hu.bu $vr3, $vr3, 0 pcalau12i $a0, %pc_hi20(.LCPI31_4) - vld $vr5, $a0, %pc_lo12(.LCPI31_4) + vld $vr4, $a0, %pc_lo12(.LCPI31_4) + vsllwil.wu.hu $vr3, $vr3, 0 + vsllwil.du.wu $vr3, $vr3, 0 + vslli.d $vr3, $vr3, 24 + vshuf.b $vr4, $vr0, $vr8, $vr4 pcalau12i $a0, %pc_hi20(.LCPI31_5) - vld $vr6, $a0, %pc_lo12(.LCPI31_5) + vld $vr5, $a0, %pc_lo12(.LCPI31_5) + vsllwil.hu.bu $vr4, $vr4, 0 + vsllwil.wu.hu $vr4, $vr4, 0 + vsllwil.du.wu $vr4, $vr4, 0 + vshuf.b $vr5, $vr0, $vr8, $vr5 pcalau12i $a0, %pc_hi20(.LCPI31_6) - vld $vr7, $a0, %pc_lo12(.LCPI31_6) - vslli.d $vr4, $vr4, 24 - vshuf.b $vr5, $vr2, $vr8, $vr5 - vshuf.b $vr6, $vr2, $vr8, $vr6 - vshuf.b $vr2, $vr2, $vr8, $vr7 + vld $vr6, $a0, %pc_lo12(.LCPI31_6) + vsllwil.hu.bu $vr5, $vr5, 0 + vsllwil.wu.hu $vr5, $vr5, 0 + vsllwil.du.wu $vr5, $vr5, 0 + vshuf.b $vr6, $vr0, $vr8, $vr6 + vsllwil.hu.bu $vr6, $vr6, 0 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.du.wu $vr6, $vr6, 0 vbsrl.v $vr7, $vr8, 15 vbsll.v $vr8, $vr8, 1 vor.v $vr7, $vr8, $vr7 vslli.d $vr7, $vr7, 56 - vslli.d $vr2, $vr2, 48 - vslli.d $vr6, $vr6, 40 - vslli.d $vr5, $vr5, 32 + vslli.d $vr6, $vr6, 48 + vslli.d $vr5, $vr5, 40 + vslli.d $vr4, $vr4, 32 vor.v $vr0, $vr1, $vr0 + vor.v $vr0, $vr0, $vr2 vor.v $vr0, $vr0, $vr3 vor.v $vr0, $vr0, $vr4 vor.v $vr0, $vr0, $vr5 - vor.v $vr0, $vr0, $vr6 ld.w $a0, $sp, 220 - vor.v $vr0, $vr0, $vr2 + vor.v $vr0, $vr0, $vr6 vor.v $vr0, $vr0, $vr7 vpickve2gr.d $a2, $vr0, 1 ld.d $s3, $sp, 160 # 8-byte Folded Reload ld.d $s2, $sp, 152 # 8-byte Folded Reload - ld.d $s5, $sp, 168 # 8-byte Folded Reload - ld.d $s6, $sp, 128 # 8-byte Folded Reload + ld.d $s4, $sp, 168 # 8-byte Folded Reload + ld.d $s5, $sp, 128 # 8-byte Folded Reload + ld.d $s6, $sp, 120 # 8-byte Folded Reload bne $a2, $a0, .LBB31_31 # %bb.27: vpickve2gr.d $a2, $vr0, 0 @@ -5576,7 +5616,7 @@ _ZN8NArchive4NZip10CInArchive11ReadHeadersER13CObjectVectorINS0_7CItemExEEPNS0_1 ld.d $a4, $sp, 232 or $a2, $a3, $a2 or $a0, $a2, $a0 - or $a0, $a0, $s5 + or $a0, $a0, $s4 bne $a0, $a4, .LBB31_31 # %bb.29: ld.d $a0, $sp, 32 # 8-byte Folded Reload @@ -5623,13 +5663,13 @@ _ZN8NArchive4NZip10CInArchive11ReadHeadersER13CObjectVectorINS0_7CItemExEEPNS0_1 jirl $ra, $ra, 0 bnez $a0, .LBB31_45 # %bb.35: # %_ZN8NArchive4NZip10CInArchive20ReadBytesAndTestSizeEPvj.exit.i104 - st.d $s5, $sp, 168 # 8-byte Folded Spill + st.d $s4, $sp, 168 # 8-byte Folded Spill st.d $s2, $sp, 152 # 8-byte Folded Spill st.d $s3, $sp, 160 # 8-byte Folded Spill ld.w $a0, $sp, 244 bne $a0, $s0, .LBB31_47 # %bb.36: # 
%_ZN8NArchive4NZip10CInArchive13SafeReadBytesEPvj.exit105 - xor $a0, $s4, $s6 + xor $a0, $s5, $s6 sltu $s0, $zero, $a0 ld.hu $s5, $sp, 176 ld.hu $s4, $sp, 178 diff --git a/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/CPP/7zip/Compress/BZip2Encoder.s b/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/CPP/7zip/Compress/BZip2Encoder.s index 986d2a31..ba1af7dc 100644 --- a/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/CPP/7zip/Compress/BZip2Encoder.s +++ b/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/CPP/7zip/Compress/BZip2Encoder.s @@ -2538,36 +2538,53 @@ _ZN9NCompress6NBZip211CThreadInfo11EncodeBlockEPKhj: # @_ZN9NCompress6NBZip211CT .LBB29_14: # %vector.body # =>This Inner Loop Header: Depth=1 vld $vr0, $a5, 0 - vilvh.b $vr1, $vr8, $vr0 - vilvh.h $vr2, $vr8, $vr1 - vilvh.w $vr3, $vr8, $vr2 - vilvl.w $vr2, $vr8, $vr2 - vilvl.h $vr1, $vr8, $vr1 - vilvh.w $vr4, $vr8, $vr1 - vilvl.w $vr1, $vr8, $vr1 - vilvl.b $vr0, $vr8, $vr0 - vilvh.h $vr5, $vr8, $vr0 - vilvh.w $vr6, $vr8, $vr5 - vilvl.w $vr5, $vr8, $vr5 - vilvl.h $vr0, $vr8, $vr0 - vilvh.w $vr7, $vr8, $vr0 - vilvl.w $vr0, $vr8, $vr0 + vbsrl.v $vr1, $vr0, 14 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vsllwil.du.wu $vr1, $vr1, 0 + vbsrl.v $vr2, $vr0, 12 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.du.wu $vr2, $vr2, 0 + vbsrl.v $vr3, $vr0, 10 + vsllwil.hu.bu $vr3, $vr3, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vsllwil.du.wu $vr3, $vr3, 0 + vbsrl.v $vr4, $vr0, 8 + vsllwil.hu.bu $vr4, $vr4, 0 + vsllwil.wu.hu $vr4, $vr4, 0 + vsllwil.du.wu $vr4, $vr4, 0 + vsrli.d $vr5, $vr0, 48 + vsllwil.hu.bu $vr5, $vr5, 0 + vsllwil.wu.hu $vr5, $vr5, 0 + vsllwil.du.wu $vr5, $vr5, 0 + vsrli.d $vr6, $vr0, 32 + vsllwil.hu.bu $vr6, $vr6, 0 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.du.wu $vr6, $vr6, 0 + vshuf4i.b $vr7, $vr0, 14 + vsllwil.hu.bu $vr7, $vr7, 0 + vsllwil.wu.hu $vr7, $vr7, 0 + vsllwil.du.wu $vr7, $vr7, 0 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.du.wu $vr0, $vr0, 0 vpickve2gr.d $a7, $vr0, 0 vpickve2gr.d $t0, $vr0, 1 vpickve2gr.d $t1, $vr7, 0 vpickve2gr.d $t2, $vr7, 1 - vpickve2gr.d $t3, $vr5, 0 - vpickve2gr.d $t4, $vr5, 1 - vpickve2gr.d $t5, $vr6, 0 - vpickve2gr.d $t6, $vr6, 1 - vpickve2gr.d $t7, $vr1, 0 - vpickve2gr.d $t8, $vr1, 1 - vpickve2gr.d $s3, $vr4, 0 - vpickve2gr.d $s5, $vr4, 1 + vpickve2gr.d $t3, $vr6, 0 + vpickve2gr.d $t4, $vr6, 1 + vpickve2gr.d $t5, $vr5, 0 + vpickve2gr.d $t6, $vr5, 1 + vpickve2gr.d $t7, $vr4, 0 + vpickve2gr.d $t8, $vr4, 1 + vpickve2gr.d $s3, $vr3, 0 + vpickve2gr.d $s5, $vr3, 1 vpickve2gr.d $s6, $vr2, 0 vpickve2gr.d $s7, $vr2, 1 - vpickve2gr.d $s8, $vr3, 0 - vpickve2gr.d $ra, $vr3, 1 + vpickve2gr.d $s8, $vr1, 0 + vpickve2gr.d $ra, $vr1, 1 stx.b $a4, $a7, $a3 stx.b $a4, $t0, $a3 stx.b $a4, $t1, $a3 @@ -2604,10 +2621,13 @@ _ZN9NCompress6NBZip211CThreadInfo11EncodeBlockEPKhj: # @_ZN9NCompress6NBZip211CT # =>This Inner Loop Header: Depth=1 ld.w $a6, $a3, 0 vinsgr2vr.w $vr0, $a6, 0 - vilvl.b $vr0, $vr8, $vr0 - vilvl.h $vr0, $vr8, $vr0 - vilvh.w $vr1, $vr8, $vr0 - vilvl.w $vr0, $vr8, $vr0 + vshuf4i.b $vr1, $vr0, 14 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vsllwil.du.wu $vr1, $vr1, 0 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.du.wu $vr0, $vr0, 0 vpickve2gr.d $a6, $vr0, 0 vpickve2gr.d $a7, $vr0, 1 vpickve2gr.d $t0, $vr1, 0 @@ -3712,10 +3732,12 @@ _ZN9NCompress6NBZip211CThreadInfo11EncodeBlockEPKhj: # @_ZN9NCompress6NBZip211CT # => This Inner Loop 
Header: Depth=5 vld $vr2, $t3, -16 vld $vr3, $t3, 0 - vilvh.w $vr4, $vr6, $vr2 - vilvl.w $vr2, $vr6, $vr2 - vilvh.w $vr5, $vr6, $vr3 - vilvl.w $vr3, $vr6, $vr3 + vshuf4i.w $vr4, $vr2, 14 + vsllwil.du.wu $vr4, $vr4, 0 + vsllwil.du.wu $vr2, $vr2, 0 + vshuf4i.w $vr5, $vr3, 14 + vsllwil.du.wu $vr5, $vr5, 0 + vsllwil.du.wu $vr3, $vr3, 0 vpickve2gr.d $t5, $vr2, 0 vpickve2gr.d $t6, $vr2, 1 vpickve2gr.d $t7, $vr4, 0 @@ -3740,10 +3762,10 @@ _ZN9NCompress6NBZip211CThreadInfo11EncodeBlockEPKhj: # @_ZN9NCompress6NBZip211CT vinsgr2vr.b $vr3, $t6, 1 vinsgr2vr.b $vr3, $t7, 2 vinsgr2vr.b $vr3, $t8, 3 - vilvl.b $vr2, $vr6, $vr2 - vilvl.h $vr2, $vr6, $vr2 - vilvl.b $vr3, $vr6, $vr3 - vilvl.h $vr3, $vr6, $vr3 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.hu.bu $vr3, $vr3, 0 + vsllwil.wu.hu $vr3, $vr3, 0 vadd.w $vr0, $vr0, $vr2 vadd.w $vr1, $vr1, $vr3 addi.d $t4, $t4, -8 diff --git a/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/CPP/7zip/Compress/DeflateEncoder.s b/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/CPP/7zip/Compress/DeflateEncoder.s index a61fbd25..2be324ef 100644 --- a/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/CPP/7zip/Compress/DeflateEncoder.s +++ b/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/CPP/7zip/Compress/DeflateEncoder.s @@ -2142,28 +2142,27 @@ _ZN9NCompress8NDeflate8NEncoder16Huffman_GetPriceEPKjPKhj: # @_ZN9NCompress8NDef addi.d $a5, $a1, 4 move $a6, $a2 vori.b $vr1, $vr0, 0 - vori.b $vr2, $vr0, 0 .p2align 4, , 16 .LBB17_5: # %vector.body # =>This Inner Loop Header: Depth=1 ld.w $a7, $a5, -4 ld.w $t0, $a5, 0 - vinsgr2vr.w $vr3, $a7, 0 - vinsgr2vr.w $vr4, $t0, 0 - vilvl.b $vr3, $vr0, $vr3 - vilvl.h $vr3, $vr0, $vr3 - vld $vr5, $a4, -16 - vld $vr6, $a4, 0 - vilvl.b $vr4, $vr0, $vr4 - vilvl.h $vr4, $vr0, $vr4 + vinsgr2vr.w $vr2, $a7, 0 + vinsgr2vr.w $vr3, $t0, 0 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vld $vr4, $a4, -16 + vld $vr5, $a4, 0 + vsllwil.hu.bu $vr3, $vr3, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vmadd.w $vr0, $vr4, $vr2 vmadd.w $vr1, $vr5, $vr3 - vmadd.w $vr2, $vr6, $vr4 addi.d $a6, $a6, -8 addi.d $a4, $a4, 32 addi.d $a5, $a5, 8 bnez $a6, .LBB17_5 # %bb.6: # %middle.block - vadd.w $vr0, $vr2, $vr1 + vadd.w $vr0, $vr1, $vr0 vhaddw.d.w $vr0, $vr0, $vr0 vhaddw.q.d $vr0, $vr0, $vr0 vpickve2gr.d $a4, $vr0, 0 @@ -2214,28 +2213,27 @@ _ZN9NCompress8NDeflate8NEncoder21Huffman_GetPrice_SpecEPKjPKhjS5_j: # @_ZN9NComp addi.d $t0, $a1, 4 move $t1, $a7 vori.b $vr1, $vr0, 0 - vori.b $vr2, $vr0, 0 .p2align 4, , 16 .LBB18_5: # %vector.body # =>This Inner Loop Header: Depth=1 ld.w $t2, $t0, -4 ld.w $t3, $t0, 0 - vinsgr2vr.w $vr3, $t2, 0 - vinsgr2vr.w $vr4, $t3, 0 - vilvl.b $vr3, $vr0, $vr3 - vilvl.h $vr3, $vr0, $vr3 - vld $vr5, $a5, -16 - vld $vr6, $a5, 0 - vilvl.b $vr4, $vr0, $vr4 - vilvl.h $vr4, $vr0, $vr4 + vinsgr2vr.w $vr2, $t2, 0 + vinsgr2vr.w $vr3, $t3, 0 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vld $vr4, $a5, -16 + vld $vr5, $a5, 0 + vsllwil.hu.bu $vr3, $vr3, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vmadd.w $vr0, $vr4, $vr2 vmadd.w $vr1, $vr5, $vr3 - vmadd.w $vr2, $vr6, $vr4 addi.d $t1, $t1, -8 addi.d $a5, $a5, 32 addi.d $t0, $t0, 8 bnez $t1, .LBB18_5 # %bb.6: # %middle.block - vadd.w $vr0, $vr2, $vr1 + vadd.w $vr0, $vr1, $vr0 vhaddw.d.w $vr0, $vr0, $vr0 vhaddw.q.d $vr0, $vr0, $vr0 vpickve2gr.d $a5, $vr0, 0 @@ -2279,28 +2277,27 @@ _ZN9NCompress8NDeflate8NEncoder21Huffman_GetPrice_SpecEPKjPKhjS5_j: # @_ZN9NComp addi.d $a7, $a3, 4 move $t0, $a4 vori.b 
$vr1, $vr0, 0 - vori.b $vr2, $vr0, 0 .p2align 4, , 16 .LBB18_14: # %vector.body31 # =>This Inner Loop Header: Depth=1 ld.w $t1, $a7, -4 ld.w $t2, $a7, 0 - vinsgr2vr.w $vr3, $t1, 0 - vinsgr2vr.w $vr4, $t2, 0 - vilvl.b $vr3, $vr0, $vr3 - vilvl.h $vr3, $vr0, $vr3 - vld $vr5, $a6, -16 - vld $vr6, $a6, 0 - vilvl.b $vr4, $vr0, $vr4 - vilvl.h $vr4, $vr0, $vr4 + vinsgr2vr.w $vr2, $t1, 0 + vinsgr2vr.w $vr3, $t2, 0 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vld $vr4, $a6, -16 + vld $vr5, $a6, 0 + vsllwil.hu.bu $vr3, $vr3, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vmadd.w $vr0, $vr4, $vr2 vmadd.w $vr1, $vr5, $vr3 - vmadd.w $vr2, $vr6, $vr4 addi.d $t0, $t0, -8 addi.d $a6, $a6, 32 addi.d $a7, $a7, 8 bnez $t0, .LBB18_14 # %bb.15: # %middle.block40 - vadd.w $vr0, $vr2, $vr1 + vadd.w $vr0, $vr1, $vr0 vhaddw.d.w $vr0, $vr0, $vr0 vhaddw.q.d $vr0, $vr0, $vr0 vpickve2gr.d $a6, $vr0, 0 @@ -2346,15 +2343,15 @@ _ZN9NCompress8NDeflate8NEncoder21Huffman_GetPrice_SpecEPKjPKhjS5_j: # @_ZN9NComp .word 4294967295 # 0xffffffff .word 4 # 0x4 .LCPI19_3: - .word 6 # 0x6 - .word 7 # 0x7 - .word 7 # 0x7 - .word 9 # 0x9 -.LCPI19_4: .word 3 # 0x3 .word 5 # 0x5 .word 5 # 0x5 .word 6 # 0x6 +.LCPI19_4: + .word 6 # 0x6 + .word 7 # 0x7 + .word 7 # 0x7 + .word 9 # 0x9 .LCPI19_5: .word 11 # 0xb .word 12 # 0xc @@ -2381,7 +2378,6 @@ _ZNK9NCompress8NDeflate8NEncoder6CCoder15GetLzBlockPriceEv: # @_ZNK9NCompress8ND fst.d $fs0, $sp, 32 # 8-byte Folded Spill fst.d $fs1, $sp, 24 # 8-byte Folded Spill fst.d $fs2, $sp, 16 # 8-byte Folded Spill - fst.d $fs3, $sp, 8 # 8-byte Folded Spill move $a1, $zero addi.d $a2, $a0, 2047 addi.d $s0, $a2, 179 @@ -2391,28 +2387,27 @@ _ZNK9NCompress8NDeflate8NEncoder6CCoder15GetLzBlockPriceEv: # @_ZNK9NCompress8ND vrepli.b $vr0, 0 ori $a5, $zero, 1152 vori.b $vr1, $vr0, 0 - vori.b $vr2, $vr0, 0 .p2align 4, , 16 .LBB19_1: # %vector.body # =>This Inner Loop Header: Depth=1 ld.d $a6, $a4, -4 ld.w $a7, $a4, 0 - vinsgr2vr.d $vr3, $a6, 0 - vinsgr2vr.w $vr4, $a7, 0 - vilvl.b $vr3, $vr0, $vr3 - vilvl.h $vr3, $vr0, $vr3 + vinsgr2vr.d $vr2, $a6, 0 + vinsgr2vr.w $vr3, $a7, 0 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr2, $vr2, 0 add.d $a6, $a3, $a1 - vldx $vr5, $a3, $a1 - vld $vr6, $a6, 16 - vilvl.b $vr4, $vr0, $vr4 - vilvl.h $vr4, $vr0, $vr4 + vldx $vr4, $a3, $a1 + vld $vr5, $a6, 16 + vsllwil.hu.bu $vr3, $vr3, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vmadd.w $vr0, $vr4, $vr2 vmadd.w $vr1, $vr5, $vr3 - vmadd.w $vr2, $vr6, $vr4 addi.d $a1, $a1, 32 addi.d $a4, $a4, 8 bne $a1, $a5, .LBB19_1 # %bb.2: # %_ZN9NCompress8NDeflate8NEncoder16Huffman_GetPriceEPKjPKhj.exit.i - vadd.w $vr0, $vr2, $vr1 + vadd.w $vr0, $vr1, $vr0 vhaddw.d.w $vr0, $vr0, $vr0 vhaddw.q.d $vr0, $vr0, $vr0 vpickve2gr.d $a1, $vr0, 0 @@ -2431,14 +2426,14 @@ _ZNK9NCompress8NDeflate8NEncoder6CCoder15GetLzBlockPriceEv: # @_ZNK9NCompress8ND ldx.bu $t7, $a0, $a3 ori $a3, $zero, 3488 ldx.w $s5, $a0, $a3 - ld.bu $t8, $s0, 19 - ld.bu $fp, $s0, 20 - ld.bu $s8, $s0, 21 + ld.bu $s8, $s0, 19 + ld.bu $t8, $s0, 20 + ld.bu $fp, $s0, 21 ori $a3, $zero, 2248 - ldx.bu $s1, $a0, $a3 - ld.bu $s2, $s0, 27 - ld.bu $s3, $s0, 28 - ld.bu $s4, $s0, 29 + ldx.bu $s4, $a0, $a3 + ld.bu $s1, $s0, 27 + ld.bu $s2, $s0, 28 + ld.bu $s3, $s0, 29 ori $a3, $zero, 3532 ldx.w $a3, $a0, $a3 ori $a4, $zero, 3424 @@ -2457,132 +2452,143 @@ _ZNK9NCompress8NDeflate8NEncoder6CCoder15GetLzBlockPriceEv: # @_ZNK9NCompress8ND vldx $vr0, $a0, $ra pcalau12i $ra, %pc_hi20(.LCPI19_0) vld $vr1, $ra, %pc_lo12(.LCPI19_0) - vld $vr6, $s0, 1 + vld $vr7, $s0, 1 vldx $vr2, $a0, $t3 ori $t3, $zero, 3420 ori $ra, 
$zero, 3452 vldx $vr3, $a0, $ra ori $ra, $zero, 3468 - vldx $vr12, $a0, $ra + vldx $vr11, $a0, $ra vldx $vr4, $a0, $t3 vpickve2gr.w $t3, $vr3, 0 add.w $t3, $t4, $t3 - vpickve2gr.w $t4, $vr12, 3 + vpickve2gr.w $t4, $vr11, 3 add.w $t4, $t4, $t6 - vrepli.b $vr7, 0 - vilvh.b $vr9, $vr7, $vr6 - vilvl.h $vr5, $vr7, $vr9 - vilvl.b $vr8, $vr7, $vr6 - vilvh.h $vr6, $vr7, $vr8 - vilvl.h $vr8, $vr7, $vr8 - vilvh.h $vr13, $vr7, $vr9 + vbsrl.v $vr5, $vr7, 12 + vsllwil.hu.bu $vr5, $vr5, 0 + vsllwil.wu.hu $vr12, $vr5, 0 + vsrli.d $vr5, $vr7, 32 + vsllwil.hu.bu $vr5, $vr5, 0 + vsllwil.wu.hu $vr5, $vr5, 0 + vbsrl.v $vr6, $vr7, 8 + vsllwil.hu.bu $vr6, $vr6, 0 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.hu.bu $vr7, $vr7, 0 + vsllwil.wu.hu $vr7, $vr7, 0 mul.d $t5, $t6, $t5 mul.d $t6, $s5, $t7 vpickve2gr.w $t7, $vr0, 0 - mul.d $t7, $t7, $t8 - vpickve2gr.w $t8, $vr0, 1 - mul.d $t8, $t8, $fp - vpickve2gr.w $fp, $vr0, 2 - mul.d $fp, $fp, $s8 + mul.d $t7, $t7, $s8 + vpickve2gr.w $s8, $vr0, 1 + mul.d $t8, $s8, $t8 + vpickve2gr.w $s8, $vr0, 2 + mul.d $fp, $s8, $fp ld.w $s8, $s0, 23 vpickve2gr.w $s0, $vr0, 3 - mul.d $s0, $s0, $s1 - ldptr.d $s1, $a0, 3524 - vinsgr2vr.w $vr14, $s8, 0 + mul.d $s0, $s0, $s4 + ldptr.d $s4, $a0, 3524 + vinsgr2vr.w $vr13, $s8, 0 ori $s8, $zero, 3508 - vldx $vr9, $a0, $s8 - vinsgr2vr.d $vr15, $s1, 0 - ori $s1, $zero, 3528 - ldx.w $s1, $a0, $s1 + vldx $vr8, $a0, $s8 + vinsgr2vr.d $vr14, $s4, 0 + ori $s4, $zero, 3528 + ldx.w $s4, $a0, $s4 ldptr.d $ra, $a0, 3440 pcalau12i $s8, %pc_hi20(.LCPI19_1) - vld $vr10, $s8, %pc_lo12(.LCPI19_1) + vld $vr9, $s8, %pc_lo12(.LCPI19_1) pcalau12i $s8, %pc_hi20(.LCPI19_2) - vld $vr11, $s8, %pc_lo12(.LCPI19_2) + vld $vr10, $s8, %pc_lo12(.LCPI19_2) ori $s8, $zero, 3524 ldx.w $s8, $a0, $s8 - vinsgr2vr.d $vr16, $ra, 0 - vshuf.w $vr10, $vr16, $vr3 - vshuf.w $vr11, $vr16, $vr15 - vinsgr2vr.w $vr11, $a3, 2 - vld $vr16, $a2, 0 + vinsgr2vr.d $vr15, $ra, 0 + vshuf.w $vr9, $vr15, $vr3 + vshuf.w $vr10, $vr15, $vr14 + vld $vr15, $a2, 0 + vinsgr2vr.w $vr10, $a3, 2 ori $ra, $zero, 3332 - vldx $vr15, $a0, $ra + vldx $vr14, $a0, $ra + vbsrl.v $vr16, $vr15, 12 ldptr.d $ra, $a0, 2224 - vilvh.b $vr17, $vr7, $vr16 - vilvh.h $vr18, $vr7, $vr17 - vmul.w $vr15, $vr15, $vr18 - vinsgr2vr.d $vr18, $ra, 0 + vsllwil.hu.bu $vr16, $vr16, 0 + vsllwil.wu.hu $vr16, $vr16, 0 + vmul.w $vr14, $vr14, $vr16 + vinsgr2vr.d $vr16, $ra, 0 ori $ra, $zero, 3404 - vldx $vr19, $a0, $ra - vinsgr2vr.b $vr20, $s7, 0 - vextrins.b $vr20, $vr18, 16 - vextrins.b $vr20, $vr18, 33 - vinsgr2vr.b $vr20, $s6, 3 - vilvl.b $vr18, $vr7, $vr20 - vilvl.h $vr18, $vr7, $vr18 - vmadd.w $vr15, $vr19, $vr18 - vmadd.w $vr15, $vr12, $vr13 - vinsgr2vr.w $vr12, $s5, 3 - vilvl.b $vr13, $vr7, $vr14 - vilvl.h $vr13, $vr7, $vr13 + vldx $vr17, $a0, $ra + vinsgr2vr.b $vr18, $s7, 0 + vextrins.b $vr18, $vr16, 16 + vextrins.b $vr18, $vr16, 33 + vinsgr2vr.b $vr18, $s6, 3 + vsllwil.hu.bu $vr16, $vr18, 0 + vsllwil.wu.hu $vr16, $vr16, 0 + vmadd.w $vr14, $vr17, $vr16 + vmadd.w $vr14, $vr11, $vr12 + vinsgr2vr.w $vr11, $s5, 3 + vsllwil.hu.bu $vr12, $vr13, 0 + vsllwil.wu.hu $vr12, $vr12, 0 pcalau12i $s5, %pc_hi20(.LCPI19_3) - vld $vr14, $s5, %pc_lo12(.LCPI19_3) + vld $vr13, $s5, %pc_lo12(.LCPI19_3) pcalau12i $s5, %pc_hi20(.LCPI19_4) - vld $vr18, $s5, %pc_lo12(.LCPI19_4) - vrepli.w $vr19, 3 - vinsgr2vr.w $vr19, $s2, 0 - vinsgr2vr.w $vr19, $s3, 1 - vinsgr2vr.w $vr19, $s4, 2 - pcalau12i $s2, %pc_hi20(.LCPI19_5) - vld $vr20, $s2, %pc_lo12(.LCPI19_5) - alsl.d $s2, $s8, $s8, 1 - alsl.d $s2, $s2, $s8, 2 - ori $s3, $zero, 3316 - vldx $vr21, $a0, $s3 - ori $s3, 
$zero, 3300 - vldx $vr22, $a0, $s3 - ori $s3, $zero, 3284 - vldx $vr23, $a0, $s3 - vilvl.h $vr17, $vr7, $vr17 - vilvl.b $vr16, $vr7, $vr16 - vilvh.h $vr24, $vr7, $vr16 - vilvl.h $vr16, $vr7, $vr16 - vmul.w $vr16, $vr23, $vr16 - vmul.w $vr22, $vr22, $vr24 - vmul.w $vr17, $vr21, $vr17 - ld.d $s3, $a2, 18 + vld $vr16, $s5, %pc_lo12(.LCPI19_4) + vrepli.w $vr17, 3 + vinsgr2vr.w $vr17, $s1, 0 + vinsgr2vr.w $vr17, $s2, 1 + vinsgr2vr.w $vr17, $s3, 2 + pcalau12i $s1, %pc_hi20(.LCPI19_5) + vld $vr18, $s1, %pc_lo12(.LCPI19_5) + alsl.d $s1, $s8, $s8, 1 + alsl.d $s1, $s1, $s8, 2 + ori $s2, $zero, 3300 + vldx $vr19, $a0, $s2 + ori $s2, $zero, 3316 + vldx $vr20, $a0, $s2 + ori $s2, $zero, 3284 + vldx $vr21, $a0, $s2 + vsrli.d $vr22, $vr15, 32 + vsllwil.hu.bu $vr22, $vr22, 0 + vsllwil.wu.hu $vr22, $vr22, 0 + vbsrl.v $vr23, $vr15, 8 + vsllwil.hu.bu $vr23, $vr23, 0 + vsllwil.wu.hu $vr23, $vr23, 0 + vsllwil.hu.bu $vr15, $vr15, 0 + vsllwil.wu.hu $vr15, $vr15, 0 + vmul.w $vr15, $vr21, $vr15 + vmul.w $vr20, $vr20, $vr23 + vmul.w $vr19, $vr19, $vr22 + ld.d $s2, $a2, 18 ld.w $a2, $a2, 26 mul.d $a7, $t0, $a7 mul.d $t0, $t2, $t1 - vinsgr2vr.d $vr21, $s3, 0 - vinsgr2vr.w $vr23, $a2, 0 + vinsgr2vr.d $vr21, $s2, 0 + vinsgr2vr.w $vr22, $a2, 0 ori $a2, $zero, 3356 + vldx $vr23, $a0, $a2 + ori $a2, $zero, 3388 vldx $vr24, $a0, $a2 ori $a2, $zero, 3372 vldx $vr25, $a0, $a2 - ori $a2, $zero, 3388 - vldx $vr26, $a0, $a2 - vilvl.b $vr21, $vr7, $vr21 - vilvl.h $vr27, $vr7, $vr21 - vilvh.h $vr21, $vr7, $vr21 - vilvl.b $vr23, $vr7, $vr23 - vilvl.h $vr7, $vr7, $vr23 - vmadd.w $vr17, $vr26, $vr7 - vmadd.w $vr22, $vr25, $vr21 - vmadd.w $vr16, $vr24, $vr27 - vmadd.w $vr16, $vr4, $vr8 - vmadd.w $vr22, $vr2, $vr6 - vmadd.w $vr17, $vr3, $vr5 - vmadd.w $vr17, $vr10, $vr18 - vmadd.w $vr22, $vr11, $vr19 - vmadd.w $vr15, $vr12, $vr14 - vmadd.w $vr16, $vr9, $vr13 - vmadd.w $vr16, $vr0, $vr1 - vmadd.w $vr16, $vr9, $vr20 - vadd.w $vr0, $vr22, $vr15 - vadd.w $vr1, $vr16, $vr17 - vadd.w $vr0, $vr1, $vr0 + vsllwil.hu.bu $vr26, $vr21, 0 + vsllwil.wu.hu $vr26, $vr26, 0 + vsllwil.hu.bu $vr22, $vr22, 0 + vsllwil.wu.hu $vr22, $vr22, 0 + vsrli.d $vr21, $vr21, 32 + vsllwil.hu.bu $vr21, $vr21, 0 + vsllwil.wu.hu $vr21, $vr21, 0 + vmadd.w $vr19, $vr25, $vr21 + vmadd.w $vr20, $vr24, $vr22 + vmadd.w $vr15, $vr23, $vr26 + vmadd.w $vr15, $vr4, $vr7 + vmadd.w $vr20, $vr3, $vr6 + vmadd.w $vr19, $vr2, $vr5 + vmadd.w $vr19, $vr10, $vr17 + vmadd.w $vr14, $vr11, $vr16 + vmadd.w $vr20, $vr9, $vr13 + vmadd.w $vr15, $vr8, $vr12 + vmadd.w $vr15, $vr0, $vr1 + vmadd.w $vr15, $vr8, $vr18 + vadd.w $vr0, $vr15, $vr20 + vadd.w $vr1, $vr19, $vr14 + vadd.w $vr0, $vr0, $vr1 vhaddw.d.w $vr0, $vr0, $vr0 vhaddw.q.d $vr0, $vr0, $vr0 vpickve2gr.d $a0, $vr0, 0 @@ -2590,8 +2596,8 @@ _ZNK9NCompress8NDeflate8NEncoder6CCoder15GetLzBlockPriceEv: # @_ZNK9NCompress8ND add.d $a2, $t0, $t5 add.d $a7, $t6, $t7 add.d $t0, $t8, $fp - add.d $t1, $s0, $s2 - add.d $a3, $s1, $a3 + add.d $t1, $s0, $s1 + add.d $a3, $s4, $a3 ori $t2, $zero, 14 mul.d $a3, $a3, $t2 alsl.d $a6, $t3, $a6, 2 @@ -2605,7 +2611,6 @@ _ZNK9NCompress8NDeflate8NEncoder6CCoder15GetLzBlockPriceEv: # @_ZNK9NCompress8ND add.d $a2, $a3, $a4 add.d $a0, $a0, $a2 add.w $a0, $a0, $a1 - fld.d $fs3, $sp, 8 # 8-byte Folded Reload fld.d $fs2, $sp, 16 # 8-byte Folded Reload fld.d $fs1, $sp, 24 # 8-byte Folded Reload fld.d $fs0, $sp, 32 # 8-byte Folded Reload @@ -3200,19 +3205,7 @@ _ZN9NCompress8NDeflate8NEncoder6CCoder9SetPricesERKNS0_7CLevelsE: # @_ZN9NCompre .Lfunc_end21: .size 
_ZN9NCompress8NDeflate8NEncoder6CCoder9SetPricesERKNS0_7CLevelsE, .Lfunc_end21-_ZN9NCompress8NDeflate8NEncoder6CCoder9SetPricesERKNS0_7CLevelsE # -- End function - .section .rodata.cst16,"aM",@progbits,16 - .p2align 4, 0x0 # -- Begin function _ZN9NCompress8NDeflate8NEncoder19Huffman_ReverseBitsEPjPKhj -.LCPI22_0: - .half 3 # 0x3 - .half 9 # 0x9 - .half 2 # 0x2 - .half 11 # 0xb - .half 1 # 0x1 - .half 13 # 0xd - .half 0 # 0x0 - .half 15 # 0xf - .text - .globl _ZN9NCompress8NDeflate8NEncoder19Huffman_ReverseBitsEPjPKhj + .globl _ZN9NCompress8NDeflate8NEncoder19Huffman_ReverseBitsEPjPKhj # -- Begin function _ZN9NCompress8NDeflate8NEncoder19Huffman_ReverseBitsEPjPKhj .p2align 5 .type _ZN9NCompress8NDeflate8NEncoder19Huffman_ReverseBitsEPjPKhj,@function _ZN9NCompress8NDeflate8NEncoder19Huffman_ReverseBitsEPjPKhj: # @_ZN9NCompress8NDeflate8NEncoder19Huffman_ReverseBitsEPjPKhj @@ -3255,43 +3248,40 @@ _ZN9NCompress8NDeflate8NEncoder19Huffman_ReverseBitsEPjPKhj: # @_ZN9NCompress8ND bstrpick.d $a2, $a3, 31, 3 slli.d $a2, $a2, 3 addi.d $a4, $a1, 4 - pcalau12i $a5, %pc_hi20(.LCPI22_0) - vld $vr0, $a5, %pc_lo12(.LCPI22_0) addi.d $a5, $a0, 16 - vrepli.b $vr1, 0 - vrepli.w $vr2, 16 + vrepli.w $vr0, 16 move $a6, $a2 .p2align 4, , 16 .LBB22_9: # %vector.body # =>This Inner Loop Header: Depth=1 - vld $vr3, $a5, -16 - vld $vr4, $a5, 0 - vpickev.h $vr3, $vr3, $vr3 - vpickev.h $vr4, $vr4, $vr4 - vpickve2gr.d $a7, $vr3, 0 + vld $vr1, $a5, -16 + vld $vr2, $a5, 0 + vpickev.h $vr1, $vr1, $vr1 + vpickev.h $vr2, $vr2, $vr2 + vpickve2gr.d $a7, $vr1, 0 bitrev.d $a7, $a7 - vinsgr2vr.d $vr3, $a7, 0 - vpickve2gr.d $a7, $vr4, 0 + vinsgr2vr.d $vr1, $a7, 0 + vpickve2gr.d $a7, $vr2, 0 ld.w $t0, $a4, -4 bitrev.d $a7, $a7 - vinsgr2vr.d $vr4, $a7, 0 + vinsgr2vr.d $vr2, $a7, 0 + vshuf4i.h $vr1, $vr1, 27 + vinsgr2vr.w $vr3, $t0, 0 ld.w $a7, $a4, 0 - vinsgr2vr.w $vr5, $t0, 0 - vori.b $vr6, $vr0, 0 - vshuf.h $vr6, $vr1, $vr3 - vori.b $vr3, $vr0, 0 - vshuf.h $vr3, $vr1, $vr4 + vshuf4i.h $vr2, $vr2, 27 + vsllwil.wu.hu $vr1, $vr1, 0 + vsllwil.wu.hu $vr2, $vr2, 0 vinsgr2vr.w $vr4, $a7, 0 - vilvl.b $vr5, $vr1, $vr5 - vilvl.h $vr5, $vr1, $vr5 - vilvl.b $vr4, $vr1, $vr4 - vilvl.h $vr4, $vr1, $vr4 - vsub.w $vr5, $vr2, $vr5 - vsub.w $vr4, $vr2, $vr4 - vsrl.w $vr5, $vr6, $vr5 - vsrl.w $vr3, $vr3, $vr4 - vst $vr5, $a5, -16 - vst $vr3, $a5, 0 + vsllwil.hu.bu $vr3, $vr3, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vsllwil.hu.bu $vr4, $vr4, 0 + vsllwil.wu.hu $vr4, $vr4, 0 + vsub.w $vr3, $vr0, $vr3 + vsub.w $vr4, $vr0, $vr4 + vsrl.w $vr1, $vr1, $vr3 + vsrl.w $vr2, $vr2, $vr4 + vst $vr1, $a5, -16 + vst $vr2, $a5, 0 addi.d $a6, $a6, -8 addi.d $a4, $a4, 8 addi.d $a5, $a5, 32 @@ -3302,19 +3292,7 @@ _ZN9NCompress8NDeflate8NEncoder19Huffman_ReverseBitsEPjPKhj: # @_ZN9NCompress8ND .Lfunc_end22: .size _ZN9NCompress8NDeflate8NEncoder19Huffman_ReverseBitsEPjPKhj, .Lfunc_end22-_ZN9NCompress8NDeflate8NEncoder19Huffman_ReverseBitsEPjPKhj # -- End function - .section .rodata.cst16,"aM",@progbits,16 - .p2align 4, 0x0 # -- Begin function _ZN9NCompress8NDeflate8NEncoder6CCoder10WriteBlockEv -.LCPI23_0: - .half 3 # 0x3 - .half 9 # 0x9 - .half 2 # 0x2 - .half 11 # 0xb - .half 1 # 0x1 - .half 13 # 0xd - .half 0 # 0x0 - .half 15 # 0xf - .text - .globl _ZN9NCompress8NDeflate8NEncoder6CCoder10WriteBlockEv + .globl _ZN9NCompress8NDeflate8NEncoder6CCoder10WriteBlockEv # -- Begin function _ZN9NCompress8NDeflate8NEncoder6CCoder10WriteBlockEv .p2align 5 .type _ZN9NCompress8NDeflate8NEncoder6CCoder10WriteBlockEv,@function _ZN9NCompress8NDeflate8NEncoder6CCoder10WriteBlockEv: # 
@_ZN9NCompress8NDeflate8NEncoder6CCoder10WriteBlockEv @@ -3353,171 +3331,169 @@ _ZN9NCompress8NDeflate8NEncoder6CCoder10WriteBlockEv: # @_ZN9NCompress8NDeflate8 st.d $a2, $sp, 40 # 8-byte Folded Spill addi.d $a2, $a0, 1505 addi.d $a3, $fp, 1940 - pcalau12i $a4, %pc_hi20(.LCPI23_0) - vld $vr0, $a4, %pc_lo12(.LCPI23_0) - vrepli.b $vr2, 0 - vrepli.w $vr1, 16 + vrepli.w $vr0, 16 ori $a4, $zero, 288 .p2align 4, , 16 .LBB23_1: # %vector.body # =>This Inner Loop Header: Depth=1 - vld $vr3, $a2, -16 - vld $vr4, $a2, 0 - vpickev.h $vr3, $vr3, $vr3 - vpickev.h $vr4, $vr4, $vr4 - vpickve2gr.d $a5, $vr3, 0 + vld $vr1, $a2, -16 + vld $vr2, $a2, 0 + vpickev.h $vr1, $vr1, $vr1 + vpickev.h $vr2, $vr2, $vr2 + vpickve2gr.d $a5, $vr1, 0 bitrev.d $a5, $a5 - vinsgr2vr.d $vr3, $a5, 0 - vpickve2gr.d $a5, $vr4, 0 + vinsgr2vr.d $vr1, $a5, 0 + vpickve2gr.d $a5, $vr2, 0 add.d $a6, $a3, $a1 ld.d $a6, $a6, -4 bitrev.d $a5, $a5 - vinsgr2vr.d $vr4, $a5, 0 + vinsgr2vr.d $vr2, $a5, 0 + vshuf4i.h $vr1, $vr1, 27 + vinsgr2vr.d $vr3, $a6, 0 ldx.w $a5, $a3, $a1 - vinsgr2vr.d $vr5, $a6, 0 - vori.b $vr6, $vr0, 0 - vshuf.h $vr6, $vr2, $vr3 - vori.b $vr3, $vr0, 0 - vshuf.h $vr3, $vr2, $vr4 + vshuf4i.h $vr2, $vr2, 27 + vsllwil.wu.hu $vr1, $vr1, 0 + vsllwil.wu.hu $vr2, $vr2, 0 vinsgr2vr.w $vr4, $a5, 0 - vilvl.b $vr5, $vr2, $vr5 - vilvl.h $vr5, $vr2, $vr5 - vilvl.b $vr4, $vr2, $vr4 - vilvl.h $vr4, $vr2, $vr4 - vsub.w $vr5, $vr1, $vr5 - vsub.w $vr4, $vr1, $vr4 - vsrl.w $vr5, $vr6, $vr5 - vsrl.w $vr3, $vr3, $vr4 - vst $vr5, $a2, -16 - vst $vr3, $a2, 0 + vsllwil.hu.bu $vr3, $vr3, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vsllwil.hu.bu $vr4, $vr4, 0 + vsllwil.wu.hu $vr4, $vr4, 0 + vsub.w $vr3, $vr0, $vr3 + vsub.w $vr4, $vr0, $vr4 + vsrl.w $vr1, $vr1, $vr3 + vsrl.w $vr2, $vr2, $vr4 + vst $vr1, $a2, -16 + vst $vr2, $a2, 0 addi.d $a1, $a1, 8 addi.d $a2, $a2, 32 bne $a1, $a4, .LBB23_1 # %bb.2: # %_ZN9NCompress8NDeflate8NEncoder19Huffman_ReverseBitsEPjPKhj.exit lu12i.w $a5, 1 ori $a1, $a5, 592 - vldx $vr3, $fp, $a1 - vpickev.h $vr3, $vr3, $vr3 - vpickve2gr.d $a2, $vr3, 0 - ldptr.d $a3, $fp, 2224 - bitrev.d $a2, $a2 - vinsgr2vr.d $vr3, $a2, 0 - vori.b $vr4, $vr0, 0 - vshuf.h $vr4, $vr2, $vr3 - vinsgr2vr.d $vr3, $a3, 0 - vilvl.b $vr3, $vr2, $vr3 + vldx $vr1, $fp, $a1 + vpickev.h $vr1, $vr1, $vr1 ori $a2, $a5, 608 - vldx $vr5, $fp, $a2 - vilvl.h $vr3, $vr2, $vr3 - vsub.w $vr3, $vr1, $vr3 - vsrl.w $vr3, $vr4, $vr3 - vpickev.h $vr4, $vr5, $vr5 - vpickve2gr.d $a3, $vr4, 0 + vldx $vr2, $fp, $a2 + vpickve2gr.d $a3, $vr1, 0 + bitrev.d $a3, $a3 + vinsgr2vr.d $vr1, $a3, 0 + vpickev.h $vr2, $vr2, $vr2 + vpickve2gr.d $a3, $vr2, 0 + bitrev.d $a3, $a3 ori $a4, $zero, 2228 ldx.w $a4, $fp, $a4 - bitrev.d $a3, $a3 - vinsgr2vr.d $vr4, $a3, 0 - vori.b $vr5, $vr0, 0 - vshuf.h $vr5, $vr2, $vr4 - vinsgr2vr.w $vr4, $a4, 0 - vilvl.b $vr4, $vr2, $vr4 - vilvl.h $vr4, $vr2, $vr4 + vinsgr2vr.d $vr2, $a3, 0 + vshuf4i.h $vr2, $vr2, 27 + vsllwil.wu.hu $vr2, $vr2, 0 + vinsgr2vr.w $vr3, $a4, 0 + vsllwil.hu.bu $vr3, $vr3, 0 + vsllwil.wu.hu $vr3, $vr3, 0 ori $a3, $a5, 624 - vldx $vr6, $fp, $a3 - vsub.w $vr4, $vr1, $vr4 - vsrl.w $vr4, $vr5, $vr4 - vstx $vr4, $fp, $a2 - vpickev.h $vr4, $vr6, $vr6 - vpickve2gr.d $a2, $vr4, 0 - ldptr.d $a4, $fp, 2232 + vldx $vr4, $fp, $a3 + vsub.w $vr3, $vr0, $vr3 + vsrl.w $vr2, $vr2, $vr3 + vstx $vr2, $fp, $a2 + vpickev.h $vr2, $vr4, $vr4 + vpickve2gr.d $a2, $vr2, 0 bitrev.d $a2, $a2 - vinsgr2vr.d $vr4, $a2, 0 - vori.b $vr5, $vr0, 0 - vshuf.h $vr5, $vr2, $vr4 - vinsgr2vr.d $vr4, $a4, 0 - vilvl.b $vr4, $vr2, $vr4 - vilvl.h $vr4, $vr2, $vr4 + ldptr.d $a4, 
$fp, 2232 + vinsgr2vr.d $vr2, $a2, 0 + vshuf4i.h $vr2, $vr2, 27 + vsllwil.wu.hu $vr2, $vr2, 0 + vinsgr2vr.d $vr3, $a4, 0 + vsllwil.hu.bu $vr3, $vr3, 0 + vsllwil.wu.hu $vr3, $vr3, 0 ori $a2, $a5, 640 - vldx $vr6, $fp, $a2 - vsub.w $vr4, $vr1, $vr4 - vsrl.w $vr4, $vr5, $vr4 - vstx $vr4, $fp, $a3 - vpickev.h $vr4, $vr6, $vr6 - vpickve2gr.d $a3, $vr4, 0 - ori $a4, $zero, 2236 - ldx.w $a4, $fp, $a4 + vldx $vr4, $fp, $a2 + vsub.w $vr3, $vr0, $vr3 + vsrl.w $vr2, $vr2, $vr3 + vstx $vr2, $fp, $a3 + vpickev.h $vr2, $vr4, $vr4 + vpickve2gr.d $a3, $vr2, 0 bitrev.d $a3, $a3 - vinsgr2vr.d $vr4, $a3, 0 - vori.b $vr5, $vr0, 0 - vshuf.h $vr5, $vr2, $vr4 - vinsgr2vr.w $vr4, $a4, 0 - vilvl.b $vr4, $vr2, $vr4 - vilvl.h $vr4, $vr2, $vr4 + vinsgr2vr.d $vr2, $a3, 0 + ori $a3, $zero, 2236 + ldx.w $a3, $fp, $a3 + ldptr.d $a4, $fp, 2224 + vshuf4i.h $vr2, $vr2, 27 + vsllwil.wu.hu $vr2, $vr2, 0 + vinsgr2vr.w $vr3, $a3, 0 + vsllwil.hu.bu $vr3, $vr3, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vsub.w $vr3, $vr0, $vr3 ori $a3, $a5, 656 - vldx $vr6, $fp, $a3 - vsub.w $vr4, $vr1, $vr4 - vsrl.w $vr4, $vr5, $vr4 - vstx $vr4, $fp, $a2 - vpickev.h $vr4, $vr6, $vr6 - vpickve2gr.d $a2, $vr4, 0 - ldptr.d $a4, $fp, 2240 + vldx $vr4, $fp, $a3 + vsrl.w $vr2, $vr2, $vr3 + vinsgr2vr.d $vr3, $a4, 0 + vstx $vr2, $fp, $a2 + vpickev.h $vr2, $vr4, $vr4 + vpickve2gr.d $a2, $vr2, 0 bitrev.d $a2, $a2 - vinsgr2vr.d $vr4, $a2, 0 - vori.b $vr5, $vr0, 0 - vshuf.h $vr5, $vr2, $vr4 + ldptr.d $a4, $fp, 2240 + vinsgr2vr.d $vr2, $a2, 0 + vshuf4i.h $vr2, $vr2, 27 + vsllwil.wu.hu $vr2, $vr2, 0 vinsgr2vr.d $vr4, $a4, 0 - vilvl.b $vr4, $vr2, $vr4 - vilvl.h $vr4, $vr2, $vr4 + vsllwil.hu.bu $vr4, $vr4, 0 + vsllwil.wu.hu $vr4, $vr4, 0 + vsub.w $vr4, $vr0, $vr4 + vsrl.w $vr2, $vr2, $vr4 ori $a2, $a5, 672 - vldx $vr6, $fp, $a2 - vsub.w $vr4, $vr1, $vr4 - vsrl.w $vr4, $vr5, $vr4 - vstx $vr4, $fp, $a3 - vpickev.h $vr4, $vr6, $vr6 - vpickve2gr.d $a3, $vr4, 0 - ori $a4, $zero, 2244 - ldx.w $a4, $fp, $a4 + vldx $vr4, $fp, $a2 + vshuf4i.h $vr1, $vr1, 27 + vsllwil.wu.hu $vr1, $vr1, 0 + vstx $vr2, $fp, $a3 + vpickev.h $vr2, $vr4, $vr4 + vpickve2gr.d $a3, $vr2, 0 bitrev.d $a3, $a3 - vinsgr2vr.d $vr4, $a3, 0 - vori.b $vr5, $vr0, 0 - vshuf.h $vr5, $vr2, $vr4 - vinsgr2vr.w $vr4, $a4, 0 - vilvl.b $vr4, $vr2, $vr4 - vilvl.h $vr4, $vr2, $vr4 + vinsgr2vr.d $vr2, $a3, 0 + ori $a3, $zero, 2244 + ldx.w $a3, $fp, $a3 + vsllwil.hu.bu $vr3, $vr3, 0 + vshuf4i.h $vr2, $vr2, 27 + vsllwil.wu.hu $vr2, $vr2, 0 + vinsgr2vr.w $vr4, $a3, 0 + vsllwil.hu.bu $vr4, $vr4, 0 + vsllwil.wu.hu $vr4, $vr4, 0 + vsub.w $vr4, $vr0, $vr4 + vsrl.w $vr2, $vr2, $vr4 ori $a3, $a5, 688 - vldx $vr6, $fp, $a3 - vsub.w $vr4, $vr1, $vr4 - vsrl.w $vr4, $vr5, $vr4 - vstx $vr4, $fp, $a2 - vpickev.h $vr4, $vr6, $vr6 - vpickve2gr.d $a2, $vr4, 0 - ldptr.d $a4, $fp, 2248 + vldx $vr4, $fp, $a3 + vsllwil.wu.hu $vr3, $vr3, 0 + vsub.w $vr3, $vr0, $vr3 + vstx $vr2, $fp, $a2 + vpickev.h $vr2, $vr4, $vr4 + vpickve2gr.d $a2, $vr2, 0 bitrev.d $a2, $a2 - vinsgr2vr.d $vr4, $a2, 0 - vori.b $vr5, $vr0, 0 - vshuf.h $vr5, $vr2, $vr4 - vinsgr2vr.d $vr4, $a4, 0 - vilvl.b $vr4, $vr2, $vr4 - vilvl.h $vr4, $vr2, $vr4 + vinsgr2vr.d $vr2, $a2, 0 + ldptr.d $a2, $fp, 2248 + vsrl.w $vr1, $vr1, $vr3 + vshuf4i.h $vr2, $vr2, 27 + vsllwil.wu.hu $vr2, $vr2, 0 + vinsgr2vr.d $vr3, $a2, 0 + vsllwil.hu.bu $vr3, $vr3, 0 + vsllwil.wu.hu $vr3, $vr3, 0 ori $a2, $a5, 704 - vldx $vr6, $fp, $a2 - vsub.w $vr4, $vr1, $vr4 - vsrl.w $vr4, $vr5, $vr4 - vstx $vr4, $fp, $a3 - vpickev.h $vr4, $vr6, $vr6 - vpickve2gr.d $a3, $vr4, 0 + vldx $vr4, $fp, $a2 + vsub.w $vr3, $vr0, 
$vr3 + vsrl.w $vr2, $vr2, $vr3 + vstx $vr2, $fp, $a3 + vpickev.h $vr2, $vr4, $vr4 + vpickve2gr.d $a3, $vr2, 0 bitrev.d $a3, $a3 - ori $a4, $zero, 2252 - ldx.w $a4, $fp, $a4 - vinsgr2vr.d $vr4, $a3, 0 - vstx $vr3, $fp, $a1 - vshuf.h $vr0, $vr2, $vr4 - vinsgr2vr.w $vr3, $a4, 0 - vilvl.b $vr3, $vr2, $vr3 - vilvl.h $vr2, $vr2, $vr3 + vinsgr2vr.d $vr2, $a3, 0 + ori $a3, $zero, 2252 + ldx.w $a3, $fp, $a3 + vstx $vr1, $fp, $a1 + vshuf4i.h $vr1, $vr2, 27 + vsllwil.wu.hu $vr1, $vr1, 0 + vinsgr2vr.w $vr2, $a3, 0 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr2, $vr2, 0 ld.w $a3, $fp, 1372 - vsub.w $vr1, $vr1, $vr2 - vsrl.w $vr0, $vr0, $vr1 + vsub.w $vr0, $vr0, $vr2 + vsrl.w $vr0, $vr1, $vr0 vstx $vr0, $fp, $a2 addi.d $s0, $fp, 1168 st.d $s0, $sp, 48 # 8-byte Folded Spill @@ -5349,19 +5325,7 @@ _ZN9NCompress8NDeflate8NEncoder6CCoder13GetBlockPriceEii: # @_ZN9NCompress8NDefl .size _ZN9NCompress8NDeflate8NEncoder6CCoder13GetBlockPriceEii, .Lfunc_end27-_ZN9NCompress8NDeflate8NEncoder6CCoder13GetBlockPriceEii .cfi_endproc # -- End function - .section .rodata.cst16,"aM",@progbits,16 - .p2align 4, 0x0 # -- Begin function _ZN9NCompress8NDeflate8NEncoder6CCoder9CodeBlockEib -.LCPI28_0: - .half 3 # 0x3 - .half 9 # 0x9 - .half 2 # 0x2 - .half 11 # 0xb - .half 1 # 0x1 - .half 13 # 0xd - .half 0 # 0x0 - .half 15 # 0xf - .text - .globl _ZN9NCompress8NDeflate8NEncoder6CCoder9CodeBlockEib + .globl _ZN9NCompress8NDeflate8NEncoder6CCoder9CodeBlockEib # -- Begin function _ZN9NCompress8NDeflate8NEncoder6CCoder9CodeBlockEib .p2align 5 .type _ZN9NCompress8NDeflate8NEncoder6CCoder9CodeBlockEib,@function _ZN9NCompress8NDeflate8NEncoder6CCoder9CodeBlockEib: # @_ZN9NCompress8NDeflate8NEncoder6CCoder9CodeBlockEib @@ -5484,28 +5448,27 @@ _ZN9NCompress8NDeflate8NEncoder6CCoder9CodeBlockEib: # @_ZN9NCompress8NDeflate8N move $a1, $zero addi.d $a0, $s5, 209 addi.d $a2, $fp, 1940 - vrepli.b $vr0, 0 - vrepli.w $vr1, 9 - vrepli.w $vr2, 1 + vrepli.w $vr0, 9 + vrepli.w $vr1, 1 ori $a3, $zero, 1152 .p2align 4, , 16 .LBB28_6: # %vector.body # =>This Inner Loop Header: Depth=1 ld.d $a4, $a2, -4 ld.w $a5, $a2, 0 - vinsgr2vr.d $vr3, $a4, 0 - vinsgr2vr.w $vr4, $a5, 0 - vilvl.b $vr3, $vr0, $vr3 - vilvl.h $vr3, $vr0, $vr3 - vilvl.b $vr4, $vr0, $vr4 - vilvl.h $vr4, $vr0, $vr4 - vsub.w $vr3, $vr1, $vr3 - vsub.w $vr4, $vr1, $vr4 - vsll.w $vr3, $vr2, $vr3 - vsll.w $vr4, $vr2, $vr4 + vinsgr2vr.d $vr2, $a4, 0 + vinsgr2vr.w $vr3, $a5, 0 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.hu.bu $vr3, $vr3, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vsub.w $vr2, $vr0, $vr2 + vsub.w $vr3, $vr0, $vr3 + vsll.w $vr2, $vr1, $vr2 + vsll.w $vr3, $vr1, $vr3 add.d $a4, $a0, $a1 - vstx $vr3, $a0, $a1 - vst $vr4, $a4, 16 + vstx $vr2, $a0, $a1 + vst $vr3, $a4, 16 addi.d $a1, $a1, 32 addi.d $a2, $a2, 8 bne $a1, $a3, .LBB28_6 @@ -5513,70 +5476,70 @@ _ZN9NCompress8NDeflate8NEncoder6CCoder9CodeBlockEib: # @_ZN9NCompress8NDeflate8N ldptr.d $a1, $fp, 2224 ori $a2, $zero, 3408 add.d $s2, $fp, $a2 - vinsgr2vr.d $vr3, $a1, 0 - vilvl.b $vr3, $vr0, $vr3 - vilvl.h $vr3, $vr0, $vr3 + vinsgr2vr.d $vr2, $a1, 0 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr2, $vr2, 0 ori $a1, $zero, 2228 ldx.w $a1, $fp, $a1 - vsub.w $vr3, $vr1, $vr3 - vsll.w $vr3, $vr2, $vr3 - vstx $vr3, $fp, $a2 - vinsgr2vr.w $vr3, $a1, 0 - vilvl.b $vr3, $vr0, $vr3 - vilvl.h $vr3, $vr0, $vr3 - vsub.w $vr3, $vr1, $vr3 + vsub.w $vr2, $vr0, $vr2 + vsll.w $vr2, $vr1, $vr2 + vstx $vr2, $fp, $a2 + vinsgr2vr.w $vr2, $a1, 0 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vsub.w $vr2, $vr0, 
$vr2 ldptr.d $a1, $fp, 2232 - vsll.w $vr3, $vr2, $vr3 + vsll.w $vr2, $vr1, $vr2 ori $a2, $zero, 3424 - vstx $vr3, $fp, $a2 - vinsgr2vr.d $vr3, $a1, 0 - vilvl.b $vr3, $vr0, $vr3 - vilvl.h $vr3, $vr0, $vr3 - vsub.w $vr3, $vr1, $vr3 + vstx $vr2, $fp, $a2 + vinsgr2vr.d $vr2, $a1, 0 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vsub.w $vr2, $vr0, $vr2 ori $a1, $zero, 2236 ldx.w $a1, $fp, $a1 - vsll.w $vr3, $vr2, $vr3 + vsll.w $vr2, $vr1, $vr2 ori $a2, $zero, 3440 - vstx $vr3, $fp, $a2 - vinsgr2vr.w $vr3, $a1, 0 - vilvl.b $vr3, $vr0, $vr3 - vilvl.h $vr3, $vr0, $vr3 - vsub.w $vr3, $vr1, $vr3 + vstx $vr2, $fp, $a2 + vinsgr2vr.w $vr2, $a1, 0 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vsub.w $vr2, $vr0, $vr2 ldptr.d $a1, $fp, 2240 - vsll.w $vr3, $vr2, $vr3 + vsll.w $vr2, $vr1, $vr2 ori $a2, $zero, 3456 - vstx $vr3, $fp, $a2 - vinsgr2vr.d $vr3, $a1, 0 - vilvl.b $vr3, $vr0, $vr3 - vilvl.h $vr3, $vr0, $vr3 - vsub.w $vr3, $vr1, $vr3 + vstx $vr2, $fp, $a2 + vinsgr2vr.d $vr2, $a1, 0 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vsub.w $vr2, $vr0, $vr2 ori $a1, $zero, 2244 ldx.w $a1, $fp, $a1 - vsll.w $vr3, $vr2, $vr3 + vsll.w $vr2, $vr1, $vr2 ori $a2, $zero, 3472 - vstx $vr3, $fp, $a2 - vinsgr2vr.w $vr3, $a1, 0 - vilvl.b $vr3, $vr0, $vr3 - vilvl.h $vr3, $vr0, $vr3 - vsub.w $vr3, $vr1, $vr3 + vstx $vr2, $fp, $a2 + vinsgr2vr.w $vr2, $a1, 0 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vsub.w $vr2, $vr0, $vr2 ldptr.d $a1, $fp, 2248 - vsll.w $vr3, $vr2, $vr3 + vsll.w $vr2, $vr1, $vr2 ori $a2, $zero, 3488 - vstx $vr3, $fp, $a2 - vinsgr2vr.d $vr3, $a1, 0 - vilvl.b $vr3, $vr0, $vr3 - vilvl.h $vr3, $vr0, $vr3 - vsub.w $vr3, $vr1, $vr3 + vstx $vr2, $fp, $a2 + vinsgr2vr.d $vr2, $a1, 0 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vsub.w $vr2, $vr0, $vr2 ori $a1, $zero, 2252 ldx.w $a1, $fp, $a1 - vsll.w $vr3, $vr2, $vr3 + vsll.w $vr2, $vr1, $vr2 ori $a2, $zero, 3504 - vstx $vr3, $fp, $a2 - vinsgr2vr.w $vr3, $a1, 0 - vilvl.b $vr3, $vr0, $vr3 - vilvl.h $vr0, $vr0, $vr3 - vsub.w $vr0, $vr1, $vr0 - vsll.w $vr0, $vr2, $vr0 + vstx $vr2, $fp, $a2 + vinsgr2vr.w $vr2, $a1, 0 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vsub.w $vr0, $vr0, $vr2 + vsll.w $vr0, $vr1, $vr0 ori $a1, $zero, 3520 vstx $vr0, $fp, $a1 addi.d $a1, $s5, 1489 @@ -5654,66 +5617,64 @@ _ZN9NCompress8NDeflate8NEncoder6CCoder9CodeBlockEib: # @_ZN9NCompress8NDeflate8N ori $a0, $s4, 796 vpickev.h $vr0, $vr0, $vr0 vpickve2gr.d $a2, $vr0, 0 - pcalau12i $a3, %pc_hi20(.LCPI28_0) - vld $vr0, $a3, %pc_lo12(.LCPI28_0) bitrev.d $a2, $a2 ldx.w $a3, $fp, $a0 - vinsgr2vr.d $vr2, $a2, 0 - vrepli.b $vr1, 0 - vori.b $vr3, $vr0, 0 - vshuf.h $vr3, $vr1, $vr2 - vinsgr2vr.w $vr2, $a3, 0 - vilvl.b $vr2, $vr1, $vr2 - vilvl.h $vr4, $vr1, $vr2 - vrepli.w $vr2, 16 + vinsgr2vr.d $vr0, $a2, 0 + vshuf4i.h $vr0, $vr0, 27 + vsllwil.wu.hu $vr1, $vr0, 0 + vinsgr2vr.w $vr0, $a3, 0 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr2, $vr0, 0 + vrepli.w $vr0, 16 ori $a2, $s4, 736 - vldx $vr5, $fp, $a2 - vsub.w $vr4, $vr2, $vr4 - vsrl.w $vr3, $vr3, $vr4 - vstx $vr3, $fp, $a1 - vpickev.h $vr3, $vr5, $vr5 - vpickve2gr.d $a3, $vr3, 0 - ldptr.d $a4, $fp, 4896 + vldx $vr3, $fp, $a2 + vsub.w $vr2, $vr0, $vr2 + vsrl.w $vr1, $vr1, $vr2 + vstx $vr1, $fp, $a1 + vpickev.h $vr1, $vr3, $vr3 + vpickve2gr.d $a3, $vr1, 0 bitrev.d $a3, $a3 - vinsgr2vr.d $vr3, $a3, 0 - vori.b $vr4, $vr0, 0 - vshuf.h $vr4, $vr1, $vr3 - vinsgr2vr.d $vr3, $a4, 0 - vilvl.b $vr3, $vr1, $vr3 - vilvl.h $vr3, $vr1, $vr3 + ldptr.d $a4, $fp, 
4896 + vinsgr2vr.d $vr1, $a3, 0 + vshuf4i.h $vr1, $vr1, 27 + vsllwil.wu.hu $vr1, $vr1, 0 + vinsgr2vr.d $vr2, $a4, 0 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr2, $vr2, 0 ori $a3, $s4, 752 - vldx $vr5, $fp, $a3 - vsub.w $vr3, $vr2, $vr3 - vsrl.w $vr3, $vr4, $vr3 - vstx $vr3, $fp, $a2 - vpickev.h $vr3, $vr5, $vr5 - vpickve2gr.d $a2, $vr3, 0 + vldx $vr3, $fp, $a3 + vsub.w $vr2, $vr0, $vr2 + vsrl.w $vr1, $vr1, $vr2 + vstx $vr1, $fp, $a2 + vpickev.h $vr1, $vr3, $vr3 + vpickve2gr.d $a2, $vr1, 0 + bitrev.d $a2, $a2 ori $a4, $s4, 804 ldx.w $a4, $fp, $a4 - bitrev.d $a2, $a2 - vinsgr2vr.d $vr3, $a2, 0 - vori.b $vr4, $vr0, 0 - vshuf.h $vr4, $vr1, $vr3 - vinsgr2vr.w $vr3, $a4, 0 - vilvl.b $vr3, $vr1, $vr3 - vilvl.h $vr3, $vr1, $vr3 - vsub.w $vr3, $vr2, $vr3 + vinsgr2vr.d $vr1, $a2, 0 + vshuf4i.h $vr1, $vr1, 27 + vsllwil.wu.hu $vr1, $vr1, 0 + vinsgr2vr.w $vr2, $a4, 0 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vsub.w $vr2, $vr0, $vr2 ori $a2, $s4, 768 - vldx $vr5, $fp, $a2 - vsrl.w $vr3, $vr4, $vr3 + vldx $vr3, $fp, $a2 + vsrl.w $vr1, $vr1, $vr2 add.d $s0, $fp, $a1 - vstx $vr3, $fp, $a3 - vpickev.h $vr3, $vr5, $vr5 - vpickve2gr.d $a1, $vr3, 0 - ldptr.d $a3, $fp, 4904 + vstx $vr1, $fp, $a3 + vpickev.h $vr1, $vr3, $vr3 + vpickve2gr.d $a1, $vr1, 0 bitrev.d $a1, $a1 - vinsgr2vr.d $vr3, $a1, 0 - vshuf.h $vr0, $vr1, $vr3 - vinsgr2vr.d $vr3, $a3, 0 - vilvl.b $vr3, $vr1, $vr3 - vilvl.h $vr1, $vr1, $vr3 - vsub.w $vr1, $vr2, $vr1 - vsrl.w $vr0, $vr0, $vr1 + ldptr.d $a3, $fp, 4904 + vinsgr2vr.d $vr1, $a1, 0 + vshuf4i.h $vr1, $vr1, 27 + vsllwil.wu.hu $vr1, $vr1, 0 + vinsgr2vr.d $vr2, $a3, 0 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vsub.w $vr0, $vr0, $vr2 + vsrl.w $vr0, $vr1, $vr0 ori $a1, $s4, 784 ldx.h $a1, $fp, $a1 vstx $vr0, $fp, $a2 diff --git a/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/CPP/7zip/Compress/QuantumDecoder.s b/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/CPP/7zip/Compress/QuantumDecoder.s index 74f3908d..f93bde82 100644 --- a/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/CPP/7zip/Compress/QuantumDecoder.s +++ b/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/CPP/7zip/Compress/QuantumDecoder.s @@ -1023,19 +1023,8 @@ _ZN9NCompress8NQuantum8CDecoder8CodeSpecEj: # @_ZN9NCompress8NQuantum8CDecoder8C .size _ZN9NCompress8NQuantum8CDecoder8CodeSpecEj, .Lfunc_end1-_ZN9NCompress8NQuantum8CDecoder8CodeSpecEj .cfi_endproc # -- End function - .section .rodata.cst16,"aM",@progbits,16 - .p2align 4, 0x0 # -- Begin function _ZN9NCompress8NQuantum11NRangeCoder13CModelDecoder6DecodeEPNS1_8CDecoderE -.LCPI2_0: - .half 3 # 0x3 - .half 9 # 0x9 - .half 4 # 0x4 - .half 11 # 0xb - .half 5 # 0x5 - .half 13 # 0xd - .half 6 # 0x6 - .half 15 # 0xf .section .text._ZN9NCompress8NQuantum11NRangeCoder13CModelDecoder6DecodeEPNS1_8CDecoderE,"axG",@progbits,_ZN9NCompress8NQuantum11NRangeCoder13CModelDecoder6DecodeEPNS1_8CDecoderE,comdat - .weak _ZN9NCompress8NQuantum11NRangeCoder13CModelDecoder6DecodeEPNS1_8CDecoderE + .weak _ZN9NCompress8NQuantum11NRangeCoder13CModelDecoder6DecodeEPNS1_8CDecoderE # -- Begin function _ZN9NCompress8NQuantum11NRangeCoder13CModelDecoder6DecodeEPNS1_8CDecoderE .p2align 5 .type _ZN9NCompress8NQuantum11NRangeCoder13CModelDecoder6DecodeEPNS1_8CDecoderE,@function _ZN9NCompress8NQuantum11NRangeCoder13CModelDecoder6DecodeEPNS1_8CDecoderE: # @_ZN9NCompress8NQuantum11NRangeCoder13CModelDecoder6DecodeEPNS1_8CDecoderE @@ -1168,40 +1157,38 @@ 
_ZN9NCompress8NQuantum11NRangeCoder13CModelDecoder6DecodeEPNS1_8CDecoderE: # @_Z .LBB2_17: # %vector.ph93 addi.d $a5, $fp, 10 bstrpick.d $a4, $a2, 31, 3 - pcalau12i $a6, %pc_hi20(.LCPI2_0) - vld $vr0, $a6, %pc_lo12(.LCPI2_0) slli.d $a4, $a4, 3 - vinsgr2vr.h $vr1, $a3, 7 - vrepli.b $vr2, 0 + vinsgr2vr.h $vr0, $a3, 7 move $a3, $a4 .p2align 4, , 16 .LBB2_18: # %vector.body96 # =>This Inner Loop Header: Depth=1 - vori.b $vr3, $vr1, 0 - vld $vr1, $a5, 0 - vbsrl.v $vr3, $vr3, 14 - vbsll.v $vr4, $vr1, 2 - vor.v $vr3, $vr4, $vr3 - vori.b $vr4, $vr0, 0 - vshuf.h $vr4, $vr2, $vr1 - vilvl.h $vr3, $vr2, $vr3 - vilvh.h $vr5, $vr2, $vr1 - vilvl.h $vr6, $vr2, $vr1 - vsub.w $vr3, $vr3, $vr6 - vaddi.wu $vr3, $vr3, 1 - vsub.w $vr4, $vr4, $vr5 - vaddi.wu $vr4, $vr4, 1 - vsrli.w $vr4, $vr4, 1 - vsrli.w $vr3, $vr3, 1 - vpickev.h $vr3, $vr4, $vr3 - vst $vr3, $a5, -2 + vori.b $vr1, $vr0, 0 + vld $vr0, $a5, 0 + vbsrl.v $vr1, $vr1, 14 + vbsll.v $vr2, $vr0, 2 + vor.v $vr1, $vr2, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 + vbsrl.v $vr2, $vr0, 6 + vsllwil.wu.hu $vr2, $vr2, 0 + vbsrl.v $vr3, $vr0, 8 + vsllwil.wu.hu $vr3, $vr3, 0 + vsllwil.wu.hu $vr4, $vr0, 0 + vsub.w $vr1, $vr1, $vr4 + vaddi.wu $vr1, $vr1, 1 + vsub.w $vr2, $vr2, $vr3 + vaddi.wu $vr2, $vr2, 1 + vsrli.w $vr2, $vr2, 1 + vsrli.w $vr1, $vr1, 1 + vpickev.h $vr1, $vr2, $vr1 + vst $vr1, $a5, -2 addi.d $a3, $a3, -8 addi.d $a5, $a5, 16 bnez $a3, .LBB2_18 # %bb.19: # %middle.block100 beq $a4, $a2, .LBB2_23 # %bb.20: - vpickve2gr.h $a3, $vr1, 7 + vpickve2gr.h $a3, $vr0, 7 .LBB2_21: # %.lr.ph.preheader123 alsl.d $a5, $a4, $fp, 1 addi.d $a5, $a5, 10 diff --git a/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/CPP/7zip/Crypto/HmacSha1.s b/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/CPP/7zip/Crypto/HmacSha1.s index 5a777f25..75833909 100644 --- a/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/CPP/7zip/Crypto/HmacSha1.s +++ b/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/CPP/7zip/Crypto/HmacSha1.s @@ -155,55 +155,72 @@ _ZN7NCrypto5NSha15CHmac5FinalEPhm: # @_ZN7NCrypto5NSha15CHmac5FinalEPhm .p2align 4, 0x0 # -- Begin function _ZN7NCrypto5NSha17CHmac326SetKeyEPKhm .LCPI2_0: .byte 1 # 0x1 - .byte 17 # 0x11 - .byte 18 # 0x12 - .byte 19 # 0x13 .byte 4 # 0x4 - .byte 21 # 0x15 - .byte 22 # 0x16 - .byte 23 # 0x17 .byte 9 # 0x9 - .byte 25 # 0x19 - .byte 26 # 0x1a - .byte 27 # 0x1b .byte 12 # 0xc - .byte 29 # 0x1d - .byte 30 # 0x1e - .byte 31 # 0x1f + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff .LCPI2_1: .byte 0 # 0x0 - .byte 17 # 0x11 - .byte 18 # 0x12 - .byte 19 # 0x13 .byte 5 # 0x5 - .byte 21 # 0x15 - .byte 22 # 0x16 - .byte 23 # 0x17 .byte 8 # 0x8 - .byte 25 # 0x19 - .byte 26 # 0x1a - .byte 27 # 0x1b .byte 13 # 0xd - .byte 29 # 0x1d - .byte 30 # 0x1e - .byte 31 # 0x1f + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff .LCPI2_2: .byte 2 # 0x2 - .byte 17 # 0x11 - .byte 18 # 0x12 - .byte 19 # 0x13 .byte 6 # 0x6 - .byte 21 # 0x15 - .byte 22 # 0x16 - .byte 23 # 0x17 .byte 10 # 0xa - .byte 25 # 0x19 - .byte 26 # 0x1a - .byte 27 # 0x1b .byte 14 # 0xe - .byte 29 # 0x1d - .byte 30 # 0x1e - .byte 31 # 0x1f + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 
0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff +.LCPI2_3: + .byte 3 # 0x3 + .byte 7 # 0x7 + .byte 11 # 0xb + .byte 15 # 0xf + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff + .byte 255 # 0xff .text .globl _ZN7NCrypto5NSha17CHmac326SetKeyEPKhm .p2align 5 @@ -248,27 +265,36 @@ _ZN7NCrypto5NSha17CHmac326SetKeyEPKhm: # @_ZN7NCrypto5NSha17CHmac326SetKeyEPKhm vld $vr0, $sp, 16 pcalau12i $a0, %pc_hi20(.LCPI2_0) vld $vr1, $a0, %pc_lo12(.LCPI2_0) - vrepli.b $vr2, 0 - vshuf.b $vr1, $vr2, $vr0, $vr1 + vshuf.b $vr1, $vr0, $vr0, $vr1 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 pcalau12i $a0, %pc_hi20(.LCPI2_1) - vld $vr3, $a0, %pc_lo12(.LCPI2_1) + vld $vr2, $a0, %pc_lo12(.LCPI2_1) ori $a0, $zero, 16 lu32i.d $a0, 24 - vreplgr2vr.d $vr4, $a0 - vsll.w $vr1, $vr1, $vr4 - vshuf.b $vr3, $vr2, $vr0, $vr3 + vreplgr2vr.d $vr3, $a0 + vsll.w $vr1, $vr1, $vr3 + vshuf.b $vr2, $vr0, $vr0, $vr2 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr2, $vr2, 0 pcalau12i $a0, %pc_hi20(.LCPI2_2) - vld $vr4, $a0, %pc_lo12(.LCPI2_2) + vld $vr3, $a0, %pc_lo12(.LCPI2_2) ori $a0, $zero, 24 lu32i.d $a0, 16 - vreplgr2vr.d $vr5, $a0 - vsll.w $vr3, $vr3, $vr5 - vor.v $vr1, $vr3, $vr1 - vshuf.b $vr2, $vr2, $vr0, $vr4 + vreplgr2vr.d $vr4, $a0 + vsll.w $vr2, $vr2, $vr4 + vor.v $vr1, $vr2, $vr1 + vshuf.b $vr2, $vr0, $vr0, $vr3 + vsllwil.hu.bu $vr2, $vr2, 0 + pcalau12i $a0, %pc_hi20(.LCPI2_3) + vld $vr3, $a0, %pc_lo12(.LCPI2_3) + vsllwil.wu.hu $vr2, $vr2, 0 vslli.w $vr2, $vr2, 8 - ld.w $a0, $sp, 32 vor.v $vr1, $vr1, $vr2 - vsrli.w $vr0, $vr0, 24 + vshuf.b $vr0, $vr0, $vr0, $vr3 + ld.w $a0, $sp, 32 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 vor.v $vr0, $vr1, $vr0 revb.2w $a0, $a0 b .LBB2_6 diff --git a/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/CPP/7zip/UI/Common/Update.s b/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/CPP/7zip/UI/Common/Update.s index 12b19f57..91baec15 100644 --- a/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/CPP/7zip/UI/Common/Update.s +++ b/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/CPP/7zip/UI/Common/Update.s @@ -4554,9 +4554,8 @@ _Z13UpdateArchiveP7CCodecsRKN9NWildcard7CCensorER14CUpdateOptionsR16CUpdateError slli.d $a2, $a2, 3 addi.d $a3, $a1, 64 move $a4, $a2 - vld $vr4, $sp, 304 # 16-byte Folded Reload - vori.b $vr0, $vr4, 0 - vori.b $vr1, $vr4, 0 + vld $vr1, $sp, 304 # 16-byte Folded Reload + vori.b $vr0, $vr1, 0 .p2align 4, , 16 .LBB12_181: # %vector.body # Parent Loop BB12_151 Depth=1 @@ -4577,10 +4576,10 @@ _Z13UpdateArchiveP7CCodecsRKN9NWildcard7CCensorER14CUpdateOptionsR16CUpdateError vinsgr2vr.b $vr3, $a6, 1 vinsgr2vr.b $vr3, $a7, 2 vinsgr2vr.b $vr3, $t0, 3 - vilvl.b $vr2, $vr4, $vr2 - vilvl.h $vr2, $vr4, $vr2 - vilvl.b $vr3, $vr4, $vr3 - vilvl.h $vr3, $vr4, $vr3 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.hu.bu $vr3, $vr3, 0 + vsllwil.wu.hu $vr3, $vr3, 0 vadd.w $vr0, $vr0, $vr2 vadd.w $vr1, $vr1, $vr3 addi.d $a4, $a4, -8 diff --git a/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/CPP/Windows/Time.s b/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/CPP/Windows/Time.s index a64dcdca..5eed4de5 100644 --- 
a/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/CPP/Windows/Time.s +++ b/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/CPP/Windows/Time.s @@ -225,25 +225,24 @@ _ZN8NWindows5NTime19GetSecondsSince1601EjjjjjjRy: # @_ZN8NWindows5NTime19GetSeco vinsgr2vr.w $vr1, $a2, 0 addi.d $a2, $sp, 4 move $t1, $t0 - vori.b $vr2, $vr0, 0 .p2align 4, , 16 .LBB4_9: # %vector.body # =>This Inner Loop Header: Depth=1 ld.w $t2, $a2, -4 ld.w $t3, $a2, 0 - vinsgr2vr.w $vr3, $t2, 0 - vinsgr2vr.w $vr4, $t3, 0 - vilvl.b $vr3, $vr0, $vr3 - vilvl.h $vr3, $vr0, $vr3 - vilvl.b $vr4, $vr0, $vr4 - vilvl.h $vr4, $vr0, $vr4 - vadd.w $vr1, $vr1, $vr3 - vadd.w $vr2, $vr2, $vr4 + vinsgr2vr.w $vr2, $t2, 0 + vinsgr2vr.w $vr3, $t3, 0 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.hu.bu $vr3, $vr3, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vadd.w $vr1, $vr1, $vr2 + vadd.w $vr0, $vr0, $vr3 addi.d $t1, $t1, -8 addi.d $a2, $a2, 8 bnez $t1, .LBB4_9 # %bb.10: # %middle.block - vadd.w $vr0, $vr2, $vr1 + vadd.w $vr0, $vr0, $vr1 vhaddw.d.w $vr0, $vr0, $vr0 vhaddw.q.d $vr0, $vr0, $vr0 vpickve2gr.d $a2, $vr0, 0 diff --git a/results/MultiSource/Benchmarks/ASC_Sequoia/AMGmk/CMakeFiles/AMGmk.dir/csr_matvec.s b/results/MultiSource/Benchmarks/ASC_Sequoia/AMGmk/CMakeFiles/AMGmk.dir/csr_matvec.s index 3dfa476c..df3a47d7 100644 --- a/results/MultiSource/Benchmarks/ASC_Sequoia/AMGmk/CMakeFiles/AMGmk.dir/csr_matvec.s +++ b/results/MultiSource/Benchmarks/ASC_Sequoia/AMGmk/CMakeFiles/AMGmk.dir/csr_matvec.s @@ -919,7 +919,7 @@ hypre_CSRMatrixMatvec_FF: # @hypre_CSRMatrixMatvec_FF st.d $zero, $t6, -16 .LBB2_27: # %pred.store.continue # in Loop: Header=BB2_25 Depth=1 - vshuf4i.w $vr2, $vr2, 16 + vsllwil.d.w $vr2, $vr2, 0 vpickve2gr.d $s0, $vr2, 1 andi $s0, $s0, 1 beqz $s0, .LBB2_29 @@ -936,7 +936,7 @@ hypre_CSRMatrixMatvec_FF: # @hypre_CSRMatrixMatvec_FF # %bb.30: # %pred.store.if155 # in Loop: Header=BB2_25 Depth=1 st.d $zero, $t6, 0 - vshuf4i.w $vr2, $vr2, 16 + vsllwil.d.w $vr2, $vr2, 0 vpickve2gr.d $fp, $vr2, 1 andi $fp, $fp, 1 beqz $fp, .LBB2_24 @@ -944,7 +944,7 @@ hypre_CSRMatrixMatvec_FF: # @hypre_CSRMatrixMatvec_FF .p2align 4, , 16 .LBB2_31: # %pred.store.continue156 # in Loop: Header=BB2_25 Depth=1 - vshuf4i.w $vr2, $vr2, 16 + vsllwil.d.w $vr2, $vr2, 0 vpickve2gr.d $fp, $vr2, 1 andi $fp, $fp, 1 beqz $fp, .LBB2_24 diff --git a/results/MultiSource/Benchmarks/BitBench/uudecode/CMakeFiles/uudecode.dir/uudecode.s b/results/MultiSource/Benchmarks/BitBench/uudecode/CMakeFiles/uudecode.dir/uudecode.s index ee488d0d..d3bf2ce3 100644 --- a/results/MultiSource/Benchmarks/BitBench/uudecode/CMakeFiles/uudecode.dir/uudecode.s +++ b/results/MultiSource/Benchmarks/BitBench/uudecode/CMakeFiles/uudecode.dir/uudecode.s @@ -232,30 +232,18 @@ decode: # @decode vandi.b $vr6, $vr6, 3 vor.v $vr5, $vr6, $vr5 vxori.b $vr5, $vr5, 130 - vshuf4i.w $vr6, $vr1, 50 - vslli.d $vr6, $vr6, 32 - vsrai.d $vr6, $vr6, 32 - vshuf4i.w $vr7, $vr1, 16 - vslli.d $vr7, $vr7, 32 - vsrai.d $vr7, $vr7, 32 - vshuf4i.w $vr8, $vr2, 50 - vslli.d $vr8, $vr8, 32 - vsrai.d $vr8, $vr8, 32 - vshuf4i.w $vr9, $vr2, 16 - vslli.d $vr9, $vr9, 32 - vsrai.d $vr9, $vr9, 32 - vshuf4i.w $vr10, $vr3, 50 - vslli.d $vr10, $vr10, 32 - vsrai.d $vr10, $vr10, 32 - vshuf4i.w $vr11, $vr3, 16 - vslli.d $vr11, $vr11, 32 - vsrai.d $vr11, $vr11, 32 - vshuf4i.w $vr12, $vr4, 50 - vslli.d $vr12, $vr12, 32 - vsrai.d $vr12, $vr12, 32 - vshuf4i.w $vr13, $vr4, 16 - vslli.d $vr13, $vr13, 32 - vsrai.d $vr13, $vr13, 32 + vshuf4i.w $vr6, $vr1, 14 + vsllwil.d.w $vr6, $vr6, 0 + 
vsllwil.d.w $vr7, $vr1, 0 + vshuf4i.w $vr8, $vr2, 14 + vsllwil.d.w $vr8, $vr8, 0 + vsllwil.d.w $vr9, $vr2, 0 + vshuf4i.w $vr10, $vr3, 14 + vsllwil.d.w $vr10, $vr10, 0 + vsllwil.d.w $vr11, $vr3, 0 + vshuf4i.w $vr12, $vr4, 14 + vsllwil.d.w $vr12, $vr12, 0 + vsllwil.d.w $vr13, $vr4, 0 vpickve2gr.d $a5, $vr13, 0 add.d $s0, $a2, $a5 vpickve2gr.d $a5, $vr13, 1 diff --git a/results/MultiSource/Benchmarks/Bullet/CMakeFiles/bullet.dir/btConvexConvexAlgorithm.s b/results/MultiSource/Benchmarks/Bullet/CMakeFiles/bullet.dir/btConvexConvexAlgorithm.s index a3cfb108..2d3b8292 100644 --- a/results/MultiSource/Benchmarks/Bullet/CMakeFiles/bullet.dir/btConvexConvexAlgorithm.s +++ b/results/MultiSource/Benchmarks/Bullet/CMakeFiles/bullet.dir/btConvexConvexAlgorithm.s @@ -2540,8 +2540,7 @@ _ZN15btTransformUtil32calculateDiffAxisAngleQuaternionERK12btQuaternionS2_R9btVe vshuf4i.w $vr4, $vr4, 13 vshuf4i.w $vr4, $vr4, 16 vslli.d $vr4, $vr4, 32 - vrepli.b $vr6, 0 - vilvl.w $vr5, $vr6, $vr5 + vsllwil.du.wu $vr5, $vr5, 0 vor.v $vr4, $vr4, $vr5 .LBB16_3: # %_ZNK12btQuaternion7nearestERKS_.exit vreplvei.w $vr5, $vr4, 0 diff --git a/results/MultiSource/Benchmarks/Bullet/CMakeFiles/bullet.dir/btDiscreteDynamicsWorld.s b/results/MultiSource/Benchmarks/Bullet/CMakeFiles/bullet.dir/btDiscreteDynamicsWorld.s index eea68f4f..9dc5fd44 100644 --- a/results/MultiSource/Benchmarks/Bullet/CMakeFiles/bullet.dir/btDiscreteDynamicsWorld.s +++ b/results/MultiSource/Benchmarks/Bullet/CMakeFiles/bullet.dir/btDiscreteDynamicsWorld.s @@ -4105,8 +4105,7 @@ _ZN23btDiscreteDynamicsWorld15debugDrawObjectERK11btTransformPK16btCollisionShap vinsgr2vr.w $vr1, $s4, 1 vshuf4i.w $vr1, $vr1, 16 vslli.d $vr1, $vr1, 32 - vrepli.b $vr2, 0 - vilvl.w $vr0, $vr2, $vr0 + vsllwil.du.wu $vr0, $vr0, 0 vor.v $vr0, $vr1, $vr0 vst $vr0, $sp, 80 slli.d $a0, $s3, 2 @@ -4116,7 +4115,8 @@ _ZN23btDiscreteDynamicsWorld15debugDrawObjectERK11btTransformPK16btCollisionShap fld.s $fs3, $s0, 52 fld.s $fa1, $s0, 56 fst.s $fa1, $sp, 64 # 4-byte Folded Spill - vst $vr2, $sp, 80 + vrepli.b $vr1, 0 + vst $vr1, $sp, 80 fstx.s $fa0, $a0, $a1 addi.w $a0, $s3, 1 lu12i.w $a1, 349525 @@ -4129,7 +4129,7 @@ _ZN23btDiscreteDynamicsWorld15debugDrawObjectERK11btTransformPK16btCollisionShap sub.w $a0, $a0, $a1 slli.d $a0, $a0, 2 ld.d $a1, $s1, 0 - vst $vr2, $sp, 240 + vst $vr1, $sp, 240 addi.d $a2, $sp, 240 fstx.s $fs0, $a0, $a2 ld.d $a1, $a1, 32 diff --git a/results/MultiSource/Benchmarks/Bullet/CMakeFiles/bullet.dir/btMinkowskiPenetrationDepthSolver.s b/results/MultiSource/Benchmarks/Bullet/CMakeFiles/bullet.dir/btMinkowskiPenetrationDepthSolver.s index 4a541114..3161e7cd 100644 --- a/results/MultiSource/Benchmarks/Bullet/CMakeFiles/bullet.dir/btMinkowskiPenetrationDepthSolver.s +++ b/results/MultiSource/Benchmarks/Bullet/CMakeFiles/bullet.dir/btMinkowskiPenetrationDepthSolver.s @@ -129,7 +129,6 @@ _ZN33btMinkowskiPenetrationDepthSolver12calcPenDepthER22btVoronoiSimplexSolverPK pcalau12i $a0, %pc_hi20(_ZL22sPenetrationDirections) addi.d $s5, $a0, %pc_lo12(_ZL22sPenetrationDirections) move $a0, $zero - vrepli.b $vr18, 0 addi.d $a1, $sp, 1432 addi.d $a2, $sp, 440 ori $a3, $zero, 672 @@ -137,58 +136,58 @@ _ZN33btMinkowskiPenetrationDepthSolver12calcPenDepthER22btVoronoiSimplexSolverPK .LBB0_4: # %vector.body # =>This Inner Loop Header: Depth=1 add.d $a4, $s5, $a0 - fldx.s $ft11, $s5, $a0 - fld.s $ft12, $a4, 16 - fld.s $ft13, $a4, 4 - fld.s $ft14, $a4, 20 - fld.s $ft15, $a4, 8 - fld.s $fs0, $a4, 24 - vextrins.w $vr19, $vr20, 16 - vextrins.w $vr21, $vr22, 16 - vextrins.w $vr23, $vr24, 16 - 
vbitrevi.w $vr20, $vr19, 31 - vbitrevi.w $vr22, $vr21, 31 - vbitrevi.w $vr24, $vr23, 31 - vfmul.s $vr25, $vr1, $vr22 - vfmadd.s $vr25, $vr0, $vr20, $vr25 - vfmadd.s $vr25, $vr2, $vr24, $vr25 - vfmul.s $vr26, $vr4, $vr22 - vfmadd.s $vr26, $vr3, $vr20, $vr26 - vfmadd.s $vr26, $vr5, $vr24, $vr26 - vfmul.s $vr22, $vr7, $vr22 - vfmadd.s $vr20, $vr6, $vr20, $vr22 - vfmadd.s $vr20, $vr8, $vr24, $vr20 - vshuf4i.w $vr22, $vr26, 16 - vslli.d $vr22, $vr22, 32 - vilvl.w $vr24, $vr18, $vr25 - vor.v $vr22, $vr22, $vr24 - vpickve2gr.d $a4, $vr20, 0 + fldx.s $ft10, $s5, $a0 + fld.s $ft11, $a4, 16 + fld.s $ft12, $a4, 4 + fld.s $ft13, $a4, 20 + fld.s $ft14, $a4, 8 + fld.s $ft15, $a4, 24 + vextrins.w $vr18, $vr19, 16 + vextrins.w $vr20, $vr21, 16 + vextrins.w $vr22, $vr23, 16 + vbitrevi.w $vr19, $vr18, 31 + vbitrevi.w $vr21, $vr20, 31 + vbitrevi.w $vr23, $vr22, 31 + vfmul.s $vr24, $vr1, $vr21 + vfmadd.s $vr24, $vr0, $vr19, $vr24 + vfmadd.s $vr24, $vr2, $vr23, $vr24 + vfmul.s $vr25, $vr4, $vr21 + vfmadd.s $vr25, $vr3, $vr19, $vr25 + vfmadd.s $vr25, $vr5, $vr23, $vr25 + vfmul.s $vr21, $vr7, $vr21 + vfmadd.s $vr19, $vr6, $vr19, $vr21 + vfmadd.s $vr19, $vr8, $vr23, $vr19 + vshuf4i.w $vr21, $vr25, 16 + vslli.d $vr21, $vr21, 32 + vsllwil.du.wu $vr23, $vr24, 0 + vor.v $vr21, $vr21, $vr23 + vpickve2gr.d $a4, $vr19, 0 srli.d $a5, $a4, 32 bstrpick.d $a4, $a4, 31, 0 add.d $a6, $a1, $a0 - vstelm.d $vr22, $a6, 0, 0 - vstelm.d $vr22, $a6, 16, 1 + vstelm.d $vr21, $a6, 0, 0 + vstelm.d $vr21, $a6, 16, 1 st.d $a4, $a6, 8 st.d $a5, $a6, 24 - vfmul.s $vr20, $vr21, $vr10 - vfmadd.s $vr20, $vr9, $vr19, $vr20 - vfmadd.s $vr20, $vr11, $vr23, $vr20 - vfmul.s $vr22, $vr21, $vr13 - vfmadd.s $vr22, $vr12, $vr19, $vr22 - vfmadd.s $vr22, $vr14, $vr23, $vr22 - vfmul.s $vr21, $vr21, $vr16 - vfmadd.s $vr19, $vr15, $vr19, $vr21 - vfmadd.s $vr19, $vr17, $vr23, $vr19 - vshuf4i.w $vr21, $vr22, 16 - vslli.d $vr21, $vr21, 32 - vilvl.w $vr20, $vr18, $vr20 - vor.v $vr20, $vr21, $vr20 - vpickve2gr.d $a4, $vr19, 0 + vfmul.s $vr19, $vr20, $vr10 + vfmadd.s $vr19, $vr9, $vr18, $vr19 + vfmadd.s $vr19, $vr11, $vr22, $vr19 + vfmul.s $vr21, $vr20, $vr13 + vfmadd.s $vr21, $vr12, $vr18, $vr21 + vfmadd.s $vr21, $vr14, $vr22, $vr21 + vfmul.s $vr20, $vr20, $vr16 + vfmadd.s $vr18, $vr15, $vr18, $vr20 + vfmadd.s $vr18, $vr17, $vr22, $vr18 + vshuf4i.w $vr20, $vr21, 16 + vslli.d $vr20, $vr20, 32 + vsllwil.du.wu $vr19, $vr19, 0 + vor.v $vr19, $vr20, $vr19 + vpickve2gr.d $a4, $vr18, 0 srli.d $a5, $a4, 32 bstrpick.d $a4, $a4, 31, 0 add.d $a6, $a2, $a0 - vstelm.d $vr20, $a6, 0, 0 - vstelm.d $vr20, $a6, 16, 1 + vstelm.d $vr19, $a6, 0, 0 + vstelm.d $vr19, $a6, 16, 1 st.d $a4, $a6, 8 addi.d $a0, $a0, 32 st.d $a5, $a6, 24 diff --git a/results/MultiSource/Benchmarks/Bullet/CMakeFiles/bullet.dir/btSoftBodyHelpers.s b/results/MultiSource/Benchmarks/Bullet/CMakeFiles/bullet.dir/btSoftBodyHelpers.s index df41c18e..e87df555 100644 --- a/results/MultiSource/Benchmarks/Bullet/CMakeFiles/bullet.dir/btSoftBodyHelpers.s +++ b/results/MultiSource/Benchmarks/Bullet/CMakeFiles/bullet.dir/btSoftBodyHelpers.s @@ -4256,30 +4256,29 @@ _ZN17btSoftBodyHelpers10CreateRopeER19btSoftBodyWorldInfoRK9btVector3S4_ii: # @_ ori $a4, $zero, 0 lu32i.d $a4, 1 vreplgr2vr.d $vr14, $a4 - vrepli.b $vr15, 0 - vreplgr2vr.w $vr16, $a0 + vreplgr2vr.w $vr15, $a0 move $a4, $a1 move $a5, $s3 .p2align 4, , 16 .LBB12_4: # %vector.body # =>This Inner Loop Header: Depth=1 - vffint.s.wu $vr17, $vr14 - vfdiv.s $vr17, $vr17, $vr7 - vfmadd.s $vr18, $vr9, $vr17, $vr8 - vfmadd.s $vr19, $vr11, $vr17, $vr10 - vfmadd.s $vr17, 
$vr13, $vr17, $vr12 - vshuf4i.w $vr19, $vr19, 16 - vslli.d $vr19, $vr19, 32 - vilvl.w $vr18, $vr15, $vr18 - vor.v $vr18, $vr19, $vr18 - vpickve2gr.d $a6, $vr17, 0 + vffint.s.wu $vr16, $vr14 + vfdiv.s $vr16, $vr16, $vr7 + vfmadd.s $vr17, $vr9, $vr16, $vr8 + vfmadd.s $vr18, $vr11, $vr16, $vr10 + vfmadd.s $vr16, $vr13, $vr16, $vr12 + vshuf4i.w $vr18, $vr18, 16 + vslli.d $vr18, $vr18, 32 + vsllwil.du.wu $vr17, $vr17, 0 + vor.v $vr17, $vr18, $vr17 + vpickve2gr.d $a6, $vr16, 0 srli.d $a7, $a6, 32 bstrpick.d $a6, $a6, 31, 0 - vstelm.d $vr18, $a3, -16, 0 - vstelm.d $vr18, $a3, 0, 1 + vstelm.d $vr17, $a3, -16, 0 + vstelm.d $vr17, $a3, 0, 1 st.d $a6, $a3, -8 st.d $a7, $a3, 8 - vstelm.d $vr16, $a5, 0, 0 + vstelm.d $vr15, $a5, 0, 0 vaddi.wu $vr14, $vr14, 2 addi.d $a3, $a3, 32 addi.d $a4, $a4, -2 @@ -4534,35 +4533,34 @@ _ZN17btSoftBodyHelpers11CreatePatchER19btSoftBodyWorldInfoRK9btVector3S4_S4_S4_i addi.d $a1, $fp, -1 st.d $a1, $sp, 8 # 8-byte Folded Spill bstrpick.d $a1, $a1, 31, 0 - fld.s $fa1, $s5, 0 - fld.s $fa0, $s4, 0 - fld.s $fa4, $s5, 4 - fld.s $fa2, $s4, 4 + fld.s $fa0, $s5, 0 + fld.s $fa2, $s4, 0 + fld.s $fa1, $s5, 4 + fld.s $fa4, $s4, 4 movgr2fr.d $fa3, $a1 ffint.s.l $fa3, $fa3 - fst.s $fa3, $sp, 48 # 4-byte Folded Spill - fst.s $fa1, $sp, 56 # 4-byte Folded Spill - fsub.s $fa0, $fa0, $fa1 + fst.s $fa3, $sp, 52 # 4-byte Folded Spill + fst.s $fa0, $sp, 56 # 4-byte Folded Spill + fsub.s $fa0, $fa2, $fa0 + fst.s $fa0, $sp, 48 # 4-byte Folded Spill + fsub.s $fa0, $fa4, $fa1 fst.s $fa0, $sp, 44 # 4-byte Folded Spill - fst.s $fa4, $sp, 52 # 4-byte Folded Spill - fsub.s $fa0, $fa2, $fa4 - fst.s $fa0, $sp, 40 # 4-byte Folded Spill fld.s $fa5, $s5, 8 - fld.s $fa0, $s4, 8 + fld.s $fa2, $s4, 8 fld.s $fa6, $s8, 0 - fld.s $fa2, $s1, 0 + fld.s $ft2, $s1, 0 fld.s $fa7, $s8, 4 fld.s $ft3, $s1, 4 fld.s $ft0, $s8, 8 fld.s $ft4, $s1, 8 - fsub.s $ft1, $fa0, $fa5 - fsub.s $ft2, $fa2, $fa6 + fsub.s $ft1, $fa2, $fa5 + fsub.s $ft2, $ft2, $fa6 fsub.s $ft3, $ft3, $fa7 fsub.s $ft4, $ft4, $ft0 addi.d $s1, $s0, -1 bstrpick.d $a1, $s1, 31, 0 - movgr2fr.d $fa0, $a1 - ffint.s.l $ft5, $fa0 + movgr2fr.d $fa2, $a1 + ffint.s.l $ft5, $fa2 bstrpick.d $a4, $s0, 30, 1 slli.d $a1, $a4, 1 vori.b $vr14, $vr13, 0 @@ -4576,13 +4574,12 @@ _ZN17btSoftBodyHelpers11CreatePatchER19btSoftBodyWorldInfoRK9btVector3S4_S4_S4_i move $a6, $s0 bstrins.d $a6, $zero, 30, 1 pcalau12i $a7, %pc_hi20(.LCPI13_0) - vld $vr2, $a7, %pc_lo12(.LCPI13_0) + vld $vr15, $a7, %pc_lo12(.LCPI13_0) ori $a7, $zero, 0 lu32i.d $a7, 1 - vreplgr2vr.d $vr0, $a7 - vrepli.b $vr17, 0 + vreplgr2vr.d $vr2, $a7 lu12i.w $a7, 260096 - vreplgr2vr.w $vr18, $a7 + vreplgr2vr.w $vr17, $a7 b .LBB13_4 .p2align 4, , 16 .LBB13_3: # %._crit_edge.us @@ -4596,24 +4593,25 @@ _ZN17btSoftBodyHelpers11CreatePatchER19btSoftBodyWorldInfoRK9btVector3S4_S4_S4_i # Child Loop BB13_5 Depth 2 # Child Loop BB13_8 Depth 2 bstrpick.d $t0, $a0, 31, 0 - movgr2fr.d $ft7, $t0 - ffint.s.l $ft7, $ft7 - fld.s $fa1, $sp, 48 # 4-byte Folded Reload - fdiv.s $ft7, $ft7, $fa1 - fld.s $fa1, $sp, 56 # 4-byte Folded Reload - fld.s $fa3, $sp, 44 # 4-byte Folded Reload - fmadd.s $ft11, $fa3, $ft7, $fa1 - fld.s $fa1, $sp, 52 # 4-byte Folded Reload - fld.s $fa3, $sp, 40 # 4-byte Folded Reload - fmadd.s $ft12, $fa3, $ft7, $fa1 - fmadd.s $ft13, $ft1, $ft7, $fa5 - fmadd.s $ft8, $ft2, $ft7, $fa6 - fmadd.s $ft15, $ft3, $ft7, $fa7 - fmadd.s $ft7, $ft4, $ft7, $ft0 - fsub.s $ft14, $ft8, $ft11 - fsub.s $ft15, $ft15, $ft12 - fsub.s $fs0, $ft7, $ft13 + movgr2fr.d $ft8, $t0 + ffint.s.l $ft8, $ft8 + fld.s $fa0, $sp, 52 # 4-byte Folded 
Reload + fdiv.s $ft8, $ft8, $fa0 + fld.s $fa0, $sp, 56 # 4-byte Folded Reload + fld.s $fa3, $sp, 48 # 4-byte Folded Reload + fmadd.s $ft10, $fa3, $ft8, $fa0 + fld.s $fa0, $sp, 44 # 4-byte Folded Reload + fmadd.s $ft11, $fa0, $ft8, $fa1 + fmadd.s $ft12, $ft1, $ft8, $fa5 + fmadd.s $ft13, $ft2, $ft8, $fa6 + fmadd.s $ft14, $ft3, $ft8, $fa7 + fmadd.s $ft8, $ft4, $ft8, $ft0 + fsub.s $ft13, $ft13, $ft10 + fsub.s $ft14, $ft14, $ft11 + fsub.s $ft15, $ft8, $ft12 mul.d $t0, $a0, $s0 + vori.b $vr24, $vr18, 0 + vextrins.w $vr24, $vr18, 16 vori.b $vr25, $vr19, 0 vextrins.w $vr25, $vr19, 16 vori.b $vr26, $vr20, 0 @@ -4624,40 +4622,38 @@ _ZN17btSoftBodyHelpers11CreatePatchER19btSoftBodyWorldInfoRK9btVector3S4_S4_S4_i vextrins.w $vr28, $vr22, 16 vori.b $vr29, $vr23, 0 vextrins.w $vr29, $vr23, 16 - vori.b $vr30, $vr24, 0 - vextrins.w $vr30, $vr24, 16 - vreplgr2vr.d $vr31, $t0 + vreplgr2vr.d $vr30, $t0 move $t0, $a1 - vori.b $vr15, $vr2, 0 - vori.b $vr16, $vr0, 0 + vori.b $vr31, $vr15, 0 + vori.b $vr16, $vr2, 0 .p2align 4, , 16 .LBB13_5: # %vector.body # Parent Loop BB13_4 Depth=1 # => This Inner Loop Header: Depth=2 - vffint.s.wu $vr3, $vr16 - vfdiv.s $vr3, $vr3, $vr14 - vfmadd.s $vr4, $vr28, $vr3, $vr25 - vfmadd.s $vr1, $vr29, $vr3, $vr26 - vfmadd.s $vr3, $vr30, $vr3, $vr27 - vshuf4i.w $vr1, $vr1, 16 - vslli.d $vr1, $vr1, 32 - vilvl.w $vr4, $vr17, $vr4 - vor.v $vr1, $vr1, $vr4 - vpickve2gr.d $t1, $vr3, 0 + vffint.s.wu $vr0, $vr16 + vfdiv.s $vr0, $vr0, $vr14 + vfmadd.s $vr3, $vr27, $vr0, $vr24 + vfmadd.s $vr4, $vr28, $vr0, $vr25 + vfmadd.s $vr0, $vr29, $vr0, $vr26 + vshuf4i.w $vr4, $vr4, 16 + vslli.d $vr4, $vr4, 32 + vsllwil.du.wu $vr3, $vr3, 0 + vor.v $vr3, $vr4, $vr3 + vpickve2gr.d $t1, $vr0, 0 srli.d $t2, $t1, 32 bstrpick.d $t1, $t1, 31, 0 - vadd.d $vr3, $vr15, $vr31 - vpickve2gr.d $t3, $vr3, 0 + vadd.d $vr0, $vr31, $vr30 + vpickve2gr.d $t3, $vr0, 0 alsl.d $t4, $t3, $s2, 4 - vpickve2gr.d $t5, $vr3, 1 + vpickve2gr.d $t5, $vr0, 1 alsl.d $t5, $t5, $s2, 4 - vstelm.d $vr1, $t4, 0, 0 - vstelm.d $vr1, $t5, 0, 1 + vstelm.d $vr3, $t4, 0, 0 + vstelm.d $vr3, $t5, 0, 1 st.d $t1, $t4, 8 st.d $t2, $t5, 8 alsl.d $t1, $t3, $s3, 2 - vstelm.d $vr18, $t1, 0, 0 - vaddi.du $vr15, $vr15, 2 + vstelm.d $vr17, $t1, 0, 0 + vaddi.du $vr31, $vr31, 2 addi.d $t0, $t0, -2 vaddi.wu $vr16, $vr16, 2 bnez $t0, .LBB13_5 @@ -4675,16 +4671,16 @@ _ZN17btSoftBodyHelpers11CreatePatchER19btSoftBodyWorldInfoRK9btVector3S4_S4_S4_i # Parent Loop BB13_4 Depth=1 # => This Inner Loop Header: Depth=2 bstrpick.d $t4, $t1, 31, 0 - movgr2fr.d $fa1, $t4 - ffint.s.l $fa1, $fa1 - fdiv.s $fa1, $fa1, $ft5 - fmadd.s $fa3, $ft14, $fa1, $ft11 - fmadd.s $fa4, $ft15, $fa1, $ft12 - fmadd.s $fa1, $fs0, $fa1, $ft13 + movgr2fr.d $fa0, $t4 + ffint.s.l $fa0, $fa0 + fdiv.s $fa0, $fa0, $ft5 + fmadd.s $fa3, $ft13, $fa0, $ft10 + fmadd.s $fa4, $ft14, $fa0, $ft11 + fmadd.s $fa0, $ft15, $fa0, $ft12 movfr2gr.s $t4, $fa3 movfr2gr.s $t5, $fa4 bstrins.d $t4, $t5, 63, 32 - movfr2gr.s $t5, $fa1 + movfr2gr.s $t5, $fa0 bstrpick.d $t5, $t5, 31, 0 st.d $t4, $t3, -8 st.d $t5, $t3, 0 @@ -5069,33 +5065,33 @@ _ZN17btSoftBodyHelpers13CreatePatchUVER19btSoftBodyWorldInfoRK9btVector3S4_S4_S4 addi.d $s8, $s0, -1 bstrpick.d $a1, $s8, 31, 0 fld.s $fa0, $s6, 0 - fld.s $fa2, $s5, 0 + fld.s $fa3, $s5, 0 fld.s $fa1, $s6, 4 - fld.s $fa4, $s5, 4 - movgr2fr.d $fa3, $a1 - ffint.s.l $fa3, $fa3 - fst.s $fa3, $sp, 32 # 4-byte Folded Spill - fst.s $fa0, $sp, 40 # 4-byte Folded Spill - fsub.s $fa0, $fa2, $fa0 + fld.s $fa5, $s5, 4 + movgr2fr.d $fa2, $a1 + ffint.s.l $fa2, $fa2 + fst.s $fa2, $sp, 40 # 4-byte 
Folded Spill + fmov.s $fa4, $fa0 + fsub.s $fa0, $fa3, $fa0 + fst.s $fa0, $sp, 32 # 4-byte Folded Spill + fsub.s $fa0, $fa5, $fa1 fst.s $fa0, $sp, 28 # 4-byte Folded Spill - fsub.s $fa0, $fa4, $fa1 - fst.s $fa0, $sp, 24 # 4-byte Folded Spill fld.s $fa5, $s6, 8 - fld.s $fa2, $s5, 8 + fld.s $ft1, $s5, 8 fld.s $fa6, $s1, 0 fld.s $ft2, $s4, 0 fld.s $fa7, $s1, 4 fld.s $ft3, $s4, 4 fld.s $ft0, $s1, 8 fld.s $ft4, $s4, 8 - fsub.s $ft1, $fa2, $fa5 + fsub.s $ft1, $ft1, $fa5 fsub.s $ft2, $ft2, $fa6 fsub.s $ft3, $ft3, $fa7 fsub.s $ft4, $ft4, $ft0 addi.d $s1, $fp, -1 bstrpick.d $a1, $s1, 31, 0 - movgr2fr.d $fa2, $a1 - ffint.s.l $fa0, $fa2 + movgr2fr.d $ft5, $a1 + ffint.s.l $fa0, $ft5 bstrpick.d $a4, $fp, 30, 1 slli.d $a1, $a4, 1 vori.b $vr13, $vr0, 0 @@ -5114,10 +5110,9 @@ _ZN17btSoftBodyHelpers13CreatePatchUVER19btSoftBodyWorldInfoRK9btVector3S4_S4_S4 vld $vr14, $a7, %pc_lo12(.LCPI14_0) ori $a7, $zero, 0 lu32i.d $a7, 1 - vreplgr2vr.d $vr2, $a7 - vrepli.b $vr16, 0 + vreplgr2vr.d $vr15, $a7 lu12i.w $a7, 260096 - vreplgr2vr.w $vr17, $a7 + vreplgr2vr.w $vr16, $a7 b .LBB14_4 .p2align 4, , 16 .LBB14_3: # %._crit_edge.us @@ -5131,24 +5126,25 @@ _ZN17btSoftBodyHelpers13CreatePatchUVER19btSoftBodyWorldInfoRK9btVector3S4_S4_S4 # Child Loop BB14_5 Depth 2 # Child Loop BB14_8 Depth 2 bstrpick.d $t0, $a0, 31, 0 - movgr2fr.d $ft7, $t0 - ffint.s.l $ft7, $ft7 - fld.s $fa0, $sp, 32 # 4-byte Folded Reload - fdiv.s $ft7, $ft7, $fa0 + movgr2fr.d $ft9, $t0 + ffint.s.l $ft9, $ft9 fld.s $fa0, $sp, 40 # 4-byte Folded Reload - fld.s $fa3, $sp, 28 # 4-byte Folded Reload - fmadd.s $ft10, $fa3, $ft7, $fa0 - fld.s $fa0, $sp, 24 # 4-byte Folded Reload - fmadd.s $ft11, $fa0, $ft7, $fa1 - fmadd.s $ft12, $ft1, $ft7, $fa5 - fmadd.s $ft13, $ft2, $ft7, $fa6 - fmadd.s $ft14, $ft3, $ft7, $fa7 - fmadd.s $ft7, $ft4, $ft7, $ft0 - fsub.s $ft13, $ft13, $ft10 - fsub.s $ft14, $ft14, $ft11 - fsub.s $ft15, $ft7, $ft12 + fdiv.s $ft12, $ft9, $fa0 + fld.s $fa0, $sp, 32 # 4-byte Folded Reload + fmadd.s $ft9, $fa0, $ft12, $fa4 + fld.s $fa0, $sp, 28 # 4-byte Folded Reload + fmadd.s $ft10, $fa0, $ft12, $fa1 + fmadd.s $ft11, $ft1, $ft12, $fa5 + fmadd.s $ft13, $ft2, $ft12, $fa6 + fmadd.s $ft14, $ft3, $ft12, $fa7 + fmadd.s $ft15, $ft4, $ft12, $ft0 + fsub.s $ft12, $ft13, $ft9 + fsub.s $ft13, $ft14, $ft10 + fsub.s $ft14, $ft15, $ft11 ld.d $t0, $sp, 80 # 8-byte Folded Reload mul.d $t0, $a0, $t0 + vori.b $vr23, $vr17, 0 + vextrins.w $vr23, $vr17, 16 vori.b $vr24, $vr18, 0 vextrins.w $vr24, $vr18, 16 vori.b $vr25, $vr19, 0 @@ -5159,47 +5155,45 @@ _ZN17btSoftBodyHelpers13CreatePatchUVER19btSoftBodyWorldInfoRK9btVector3S4_S4_S4 vextrins.w $vr27, $vr21, 16 vori.b $vr28, $vr22, 0 vextrins.w $vr28, $vr22, 16 - vori.b $vr29, $vr23, 0 - vextrins.w $vr29, $vr23, 16 - vreplgr2vr.d $vr30, $t0 + vreplgr2vr.d $vr29, $t0 move $t0, $a1 - vori.b $vr31, $vr14, 0 - vori.b $vr15, $vr2, 0 + vori.b $vr30, $vr14, 0 + vori.b $vr31, $vr15, 0 .p2align 4, , 16 .LBB14_5: # %vector.body # Parent Loop BB14_4 Depth=1 # => This Inner Loop Header: Depth=2 - vffint.s.wu $vr0, $vr15 - vfdiv.s $vr0, $vr0, $vr13 - vfmadd.s $vr3, $vr27, $vr0, $vr24 - vfmadd.s $vr4, $vr28, $vr0, $vr25 - vfmadd.s $vr0, $vr29, $vr0, $vr26 - vshuf4i.w $vr4, $vr4, 16 - vslli.d $vr4, $vr4, 32 - vilvl.w $vr3, $vr16, $vr3 - vor.v $vr3, $vr4, $vr3 - vpickve2gr.d $t1, $vr0, 0 + vffint.s.wu $vr2, $vr31 + vfdiv.s $vr2, $vr2, $vr13 + vfmadd.s $vr0, $vr26, $vr2, $vr23 + vfmadd.s $vr3, $vr27, $vr2, $vr24 + vfmadd.s $vr2, $vr28, $vr2, $vr25 + vshuf4i.w $vr3, $vr3, 16 + vslli.d $vr3, $vr3, 32 + vsllwil.du.wu $vr0, $vr0, 0 + vor.v 
$vr0, $vr3, $vr0 + vpickve2gr.d $t1, $vr2, 0 srli.d $t2, $t1, 32 bstrpick.d $t1, $t1, 31, 0 - vadd.d $vr0, $vr31, $vr30 - vpickve2gr.d $t3, $vr0, 0 + vadd.d $vr2, $vr30, $vr29 + vpickve2gr.d $t3, $vr2, 0 alsl.d $t4, $t3, $s2, 4 - vpickve2gr.d $t5, $vr0, 1 + vpickve2gr.d $t5, $vr2, 1 alsl.d $t5, $t5, $s2, 4 - vstelm.d $vr3, $t4, 0, 0 - vstelm.d $vr3, $t5, 0, 1 + vstelm.d $vr0, $t4, 0, 0 + vstelm.d $vr0, $t5, 0, 1 st.d $t1, $t4, 8 st.d $t2, $t5, 8 alsl.d $t1, $t3, $s3, 2 - vstelm.d $vr17, $t1, 0, 0 - vaddi.du $vr31, $vr31, 2 + vstelm.d $vr16, $t1, 0, 0 + vaddi.du $vr30, $vr30, 2 addi.d $t0, $t0, -2 - vaddi.wu $vr15, $vr15, 2 + vaddi.wu $vr31, $vr31, 2 bnez $t0, .LBB14_5 # %bb.6: # %middle.block # in Loop: Header=BB14_4 Depth=1 ld.d $t0, $sp, 80 # 8-byte Folded Reload - vld $vr15, $sp, 48 # 16-byte Folded Reload + vld $vr23, $sp, 48 # 16-byte Folded Reload beq $t0, $a1, .LBB14_3 # %bb.7: # %scalar.ph.preheader # in Loop: Header=BB14_4 Depth=1 @@ -5214,12 +5208,12 @@ _ZN17btSoftBodyHelpers13CreatePatchUVER19btSoftBodyWorldInfoRK9btVector3S4_S4_S4 bstrpick.d $t4, $t1, 31, 0 movgr2fr.d $fa0, $t4 ffint.s.l $fa0, $fa0 - fdiv.s $fa0, $fa0, $ft7 + fdiv.s $fa0, $fa0, $ft15 + fmadd.s $fa2, $ft12, $fa0, $ft9 fmadd.s $fa3, $ft13, $fa0, $ft10 - fmadd.s $fa4, $ft14, $fa0, $ft11 - fmadd.s $fa0, $ft15, $fa0, $ft12 - movfr2gr.s $t4, $fa3 - movfr2gr.s $t5, $fa4 + fmadd.s $fa0, $ft14, $fa0, $ft11 + movfr2gr.s $t4, $fa2 + movfr2gr.s $t5, $fa3 bstrins.d $t4, $t5, 63, 32 movfr2gr.s $t5, $fa0 bstrpick.d $t5, $t5, 31, 0 diff --git a/results/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/CMakeFiles/CLAMR.dir/MallocPlus.s b/results/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/CMakeFiles/CLAMR.dir/MallocPlus.s index c2f856f0..a1f11d0d 100644 --- a/results/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/CMakeFiles/CLAMR.dir/MallocPlus.s +++ b/results/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/CMakeFiles/CLAMR.dir/MallocPlus.s @@ -3306,12 +3306,9 @@ _ZN10MallocPlus14memory_reorderEPfPi: # @_ZN10MallocPlus14memory_reorderEPfPi .LBB12_24: # %vector.body # =>This Inner Loop Header: Depth=1 vld $vr0, $a3, 0 - vshuf4i.w $vr1, $vr0, 50 - vslli.d $vr1, $vr1, 32 - vsrai.d $vr1, $vr1, 32 - vshuf4i.w $vr0, $vr0, 16 - vslli.d $vr0, $vr0, 32 - vsrai.d $vr0, $vr0, 32 + vshuf4i.w $vr1, $vr0, 14 + vsllwil.d.w $vr1, $vr1, 0 + vsllwil.d.w $vr0, $vr0, 0 vpickve2gr.d $a6, $vr0, 0 slli.d $a6, $a6, 2 vpickve2gr.d $a7, $vr0, 1 @@ -3577,12 +3574,9 @@ _ZN10MallocPlus14memory_reorderEPiS0_: # @_ZN10MallocPlus14memory_reorderEPiS0_ .LBB13_24: # %vector.body # =>This Inner Loop Header: Depth=1 vld $vr0, $a3, 0 - vshuf4i.w $vr1, $vr0, 50 - vslli.d $vr1, $vr1, 32 - vsrai.d $vr1, $vr1, 32 - vshuf4i.w $vr0, $vr0, 16 - vslli.d $vr0, $vr0, 32 - vsrai.d $vr0, $vr0, 32 + vshuf4i.w $vr1, $vr0, 14 + vsllwil.d.w $vr1, $vr1, 0 + vsllwil.d.w $vr0, $vr0, 0 vpickve2gr.d $a6, $vr0, 0 slli.d $a6, $a6, 2 vpickve2gr.d $a7, $vr0, 1 diff --git a/results/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/CMakeFiles/CLAMR.dir/hsfcsort.s b/results/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/CMakeFiles/CLAMR.dir/hsfcsort.s index 5b1f06a8..62bdb7da 100644 --- a/results/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/CMakeFiles/CLAMR.dir/hsfcsort.s +++ b/results/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/CMakeFiles/CLAMR.dir/hsfcsort.s @@ -208,12 +208,9 @@ hsfc2sort: # @hsfc2sort .p2align 4, , 16 .LBB0_23: # %vector.body112 # =>This Inner Loop Header: Depth=1 - vshuf4i.w $vr2, $vr0, 50 - vslli.d $vr2, $vr2, 32 - vsrai.d $vr2, $vr2, 32 - vshuf4i.w $vr3, $vr0, 16 - 
vslli.d $vr3, $vr3, 32 - vsrai.d $vr3, $vr3, 32 + vshuf4i.w $vr2, $vr0, 14 + vsllwil.d.w $vr2, $vr2, 0 + vsllwil.d.w $vr3, $vr0, 0 vpickve2gr.d $a4, $vr3, 0 slli.d $a4, $a4, 2 vpickve2gr.d $a5, $vr3, 1 @@ -265,12 +262,9 @@ hsfc2sort: # @hsfc2sort .p2align 4, , 16 .LBB0_28: # %vector.body # =>This Inner Loop Header: Depth=1 - vshuf4i.w $vr2, $vr1, 50 - vslli.d $vr2, $vr2, 32 - vsrai.d $vr2, $vr2, 32 - vshuf4i.w $vr3, $vr1, 16 - vslli.d $vr3, $vr3, 32 - vsrai.d $vr3, $vr3, 32 + vshuf4i.w $vr2, $vr1, 14 + vsllwil.d.w $vr2, $vr2, 0 + vsllwil.d.w $vr3, $vr1, 0 vpickve2gr.d $a4, $vr3, 0 slli.d $a4, $a4, 2 vpickve2gr.d $a5, $vr3, 1 @@ -283,12 +277,9 @@ hsfc2sort: # @hsfc2sort ldx.w $a5, $s2, $a5 ldx.w $a6, $s2, $a6 ldx.w $a7, $s2, $a7 - vshuf4i.w $vr2, $vr0, 50 - vslli.d $vr2, $vr2, 32 - vsrai.d $vr2, $vr2, 32 - vshuf4i.w $vr3, $vr0, 16 - vslli.d $vr3, $vr3, 32 - vsrai.d $vr3, $vr3, 32 + vshuf4i.w $vr2, $vr0, 14 + vsllwil.d.w $vr2, $vr2, 0 + vsllwil.d.w $vr3, $vr0, 0 vpickve2gr.d $t0, $vr3, 0 slli.d $t0, $t0, 2 vpickve2gr.d $t1, $vr3, 1 @@ -502,12 +493,9 @@ hsfc3sort: # @hsfc3sort .p2align 4, , 16 .LBB2_14: # %vector.body # =>This Inner Loop Header: Depth=1 - vshuf4i.w $vr2, $vr1, 50 - vslli.d $vr2, $vr2, 32 - vsrai.d $vr2, $vr2, 32 - vshuf4i.w $vr3, $vr1, 16 - vslli.d $vr3, $vr3, 32 - vsrai.d $vr3, $vr3, 32 + vshuf4i.w $vr2, $vr1, 14 + vsllwil.d.w $vr2, $vr2, 0 + vsllwil.d.w $vr3, $vr1, 0 vpickve2gr.d $a4, $vr3, 0 slli.d $a4, $a4, 2 vpickve2gr.d $a5, $vr3, 1 @@ -520,12 +508,9 @@ hsfc3sort: # @hsfc3sort ldx.w $a5, $s2, $a5 ldx.w $a6, $s2, $a6 ldx.w $a7, $s2, $a7 - vshuf4i.w $vr2, $vr0, 50 - vslli.d $vr2, $vr2, 32 - vsrai.d $vr2, $vr2, 32 - vshuf4i.w $vr3, $vr0, 16 - vslli.d $vr3, $vr3, 32 - vsrai.d $vr3, $vr3, 32 + vshuf4i.w $vr2, $vr0, 14 + vsllwil.d.w $vr2, $vr2, 0 + vsllwil.d.w $vr3, $vr0, 0 vpickve2gr.d $t0, $vr3, 0 slli.d $t0, $t0, 2 vpickve2gr.d $t1, $vr3, 1 @@ -643,12 +628,9 @@ hsfc3sort: # @hsfc3sort .p2align 4, , 16 .LBB2_29: # %vector.body150 # =>This Inner Loop Header: Depth=1 - vshuf4i.w $vr2, $vr0, 50 - vslli.d $vr2, $vr2, 32 - vsrai.d $vr2, $vr2, 32 - vshuf4i.w $vr3, $vr0, 16 - vslli.d $vr3, $vr3, 32 - vsrai.d $vr3, $vr3, 32 + vshuf4i.w $vr2, $vr0, 14 + vsllwil.d.w $vr2, $vr2, 0 + vsllwil.d.w $vr3, $vr0, 0 vpickve2gr.d $a4, $vr3, 0 slli.d $a4, $a4, 2 vpickve2gr.d $a5, $vr3, 1 diff --git a/results/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/CMakeFiles/CLAMR.dir/state.s b/results/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/CMakeFiles/CLAMR.dir/state.s index 41952d4d..68f2bfec 100644 --- a/results/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/CMakeFiles/CLAMR.dir/state.s +++ b/results/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/CMakeFiles/CLAMR.dir/state.s @@ -1122,18 +1122,12 @@ _ZN5State18add_boundary_cellsEv: # @_ZN5State18add_boundary_cellsEv vld $vr2, $t0, -16 vld $vr3, $t0, 0 vld $vr5, $a7, 0 - vshuf4i.w $vr6, $vr4, 50 - vslli.d $vr6, $vr6, 32 - vsrai.d $vr6, $vr6, 32 - vshuf4i.w $vr4, $vr4, 16 - vslli.d $vr4, $vr4, 32 - vsrai.d $vr4, $vr4, 32 - vshuf4i.w $vr7, $vr5, 50 - vslli.d $vr7, $vr7, 32 - vsrai.d $vr7, $vr7, 32 - vshuf4i.w $vr5, $vr5, 16 - vslli.d $vr5, $vr5, 32 - vsrai.d $vr5, $vr5, 32 + vshuf4i.w $vr6, $vr4, 14 + vsllwil.d.w $vr6, $vr6, 0 + vsllwil.d.w $vr4, $vr4, 0 + vshuf4i.w $vr7, $vr5, 14 + vsllwil.d.w $vr7, $vr7, 0 + vsllwil.d.w $vr5, $vr5, 0 vpickve2gr.d $t2, $vr4, 0 slli.d $t5, $t2, 2 vpickve2gr.d $t2, $vr4, 1 diff --git a/results/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/CMakeFiles/PENNANT.dir/QCS.s 
b/results/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/CMakeFiles/PENNANT.dir/QCS.s index c15e0ba7..aae7d5dc 100644 --- a/results/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/CMakeFiles/PENNANT.dir/QCS.s +++ b/results/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/CMakeFiles/PENNANT.dir/QCS.s @@ -744,50 +744,51 @@ _ZN3QCS11setQCnForceEPKdS1_S1_P7double2ii: # @_ZN3QCS11setQCnForceEPKdS1_S1_P7do .cfi_offset 31, -88 move $s5, $a0 ld.d $a0, $a0, 0 - move $s0, $a6 - move $s4, $a5 + move $s4, $a6 + move $fp, $a5 st.d $a4, $sp, 32 # 8-byte Folded Spill st.d $a3, $sp, 24 # 8-byte Folded Spill st.d $a2, $sp, 16 # 8-byte Folded Spill - move $s1, $a1 - ld.d $fp, $a0, 0 + move $s0, $a1 + ld.d $s1, $a0, 0 ld.d $s6, $a0, 248 ld.d $s2, $a0, 312 ld.d $s3, $a0, 360 - ld.d $s7, $fp, 376 + ld.d $s7, $s1, 376 sub.d $s8, $a6, $a5 addi.w $a0, $s8, 0 slli.d $a0, $a0, 3 pcaddu18i $ra, %call36(malloc) jirl $ra, $ra, 0 - bge $s4, $s0, .LBB4_9 + bge $fp, $s4, .LBB4_9 # %bb.1: # %.lr.ph + move $t5, $s4 fld.d $fa0, $s5, 8 vldi $vr1, -912 fadd.d $fa1, $fa0, $fa1 fld.d $fa2, $s5, 24 - ld.d $a1, $fp, 104 + ld.d $a1, $s1, 104 vldi $vr3, -944 fld.d $fa0, $s5, 16 fmul.d $fa2, $fa2, $fa3 ori $a3, $zero, 2 fmul.d $fa1, $fa1, $fa2 - move $a2, $s4 - ld.d $t5, $sp, 24 # 8-byte Folded Reload + move $a2, $fp + ld.d $s4, $sp, 24 # 8-byte Folded Reload ld.d $t6, $sp, 16 # 8-byte Folded Reload bltu $s8, $a3, .LBB4_5 # %bb.2: # %vector.ph move $a3, $s8 bstrins.d $a3, $zero, 0, 0 - add.d $a2, $a3, $s4 + add.d $a2, $a3, $fp vreplvei.d $vr2, $vr1, 0 vreplvei.d $vr3, $vr0, 0 - alsl.d $a4, $s4, $a1, 2 + alsl.d $a4, $fp, $a1, 2 vrepli.b $vr4, 0 move $a5, $a3 move $a6, $a0 - move $a7, $s1 - move $t0, $t5 + move $a7, $s0 + move $t0, $s4 move $t1, $t6 .p2align 4, , 16 .LBB4_3: # %vector.body @@ -795,9 +796,7 @@ _ZN3QCS11setQCnForceEPKdS1_S1_P7double2ii: # @_ZN3QCS11setQCnForceEPKdS1_S1_P7do ld.d $t2, $a4, 0 vinsgr2vr.d $vr5, $t2, 0 vld $vr6, $t1, 0 - vshuf4i.w $vr5, $vr5, 16 - vslli.d $vr5, $vr5, 32 - vsrai.d $vr5, $vr5, 32 + vsllwil.d.w $vr5, $vr5, 0 vpickve2gr.d $t2, $vr5, 0 slli.d $t2, $t2, 3 vpickve2gr.d $t3, $vr5, 1 @@ -833,13 +832,13 @@ _ZN3QCS11setQCnForceEPKdS1_S1_P7double2ii: # @_ZN3QCS11setQCnForceEPKdS1_S1_P7do .LBB4_5: # %scalar.ph.preheader alsl.d $a1, $a2, $a1, 2 slli.d $a3, $a2, 3 - slli.d $a4, $s4, 3 + slli.d $a4, $fp, 3 sub.d $a6, $a3, $a4 add.d $a3, $t6, $a6 - add.d $a4, $t5, $a6 - add.d $a5, $s1, $a6 + add.d $a4, $s4, $a6 + add.d $a5, $s0, $a6 add.d $a6, $a0, $a6 - sub.d $a2, $s0, $a2 + sub.d $a2, $t5, $a2 movgr2fr.d $fa2, $zero .p2align 4, , 16 .LBB4_6: # %scalar.ph @@ -870,15 +869,15 @@ _ZN3QCS11setQCnForceEPKdS1_S1_P7double2ii: # @_ZN3QCS11setQCnForceEPKdS1_S1_P7do addi.d $a6, $a6, 8 bnez $a2, .LBB4_6 .LBB4_7: # %.lr.ph97 - ld.d $a5, $fp, 120 - ld.d $a1, $fp, 96 - ld.d $a2, $fp, 88 - ld.d $a3, $fp, 112 + ld.d $a5, $s1, 120 + ld.d $a1, $s1, 96 + ld.d $a2, $s1, 88 + ld.d $a3, $s1, 112 ld.d $a4, $sp, 32 # 8-byte Folded Reload addi.d $a4, $a4, 16 - alsl.d $a5, $s4, $a5, 2 - alsl.d $a6, $s4, $a1, 2 - alsl.d $a7, $s4, $a3, 2 + alsl.d $a5, $fp, $a5, 2 + alsl.d $a6, $fp, $a1, 2 + alsl.d $a7, $fp, $a3, 2 move $t0, $a0 .p2align 4, , 16 .LBB4_8: # =>This Inner Loop Header: Depth=1 diff --git a/results/MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/CMakeFiles/miniFE.dir/main.s b/results/MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/CMakeFiles/miniFE.dir/main.s index e5a7902b..27eedbb6 100644 --- a/results/MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/CMakeFiles/miniFE.dir/main.s +++ 
b/results/MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/CMakeFiles/miniFE.dir/main.s @@ -9750,13 +9750,9 @@ _ZN6miniFE16impose_dirichletINS_9CSRMatrixIdiiEENS_6VectorIdiiEEEEvNT_10ScalarTy vinsgr2vr.d $vr1, $t0, 0 vinsgr2vr.d $vr2, $t1, 0 vseq.w $vr1, $vr1, $vr0 - vshuf4i.w $vr1, $vr1, 16 - vslli.d $vr1, $vr1, 32 - vsrai.d $vr1, $vr1, 32 + vsllwil.d.w $vr1, $vr1, 0 vseq.w $vr2, $vr2, $vr0 - vshuf4i.w $vr2, $vr2, 16 - vslli.d $vr2, $vr2, 32 - vsrai.d $vr2, $vr2, 32 + vsllwil.d.w $vr2, $vr2, 0 vand.v $vr1, $vr1, $vr4 vand.v $vr2, $vr2, $vr4 vst $vr1, $a5, -16 diff --git a/results/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/CMakeFiles/SimpleMOC.dir/tracks.s b/results/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/CMakeFiles/SimpleMOC.dir/tracks.s index 9a40d220..58636ab7 100644 --- a/results/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/CMakeFiles/SimpleMOC.dir/tracks.s +++ b/results/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/CMakeFiles/SimpleMOC.dir/tracks.s @@ -767,7 +767,7 @@ load_OpenMOC_tracks: # @load_OpenMOC_tracks st.d $s7, $sp, 192 # 8-byte Folded Spill st.d $s8, $sp, 184 # 8-byte Folded Spill addi.d $fp, $sp, 272 - move $s6, $a3 + move $s7, $a3 move $s2, $a2 st.d $a1, $fp, -224 # 8-byte Folded Spill move $s3, $a0 @@ -830,8 +830,8 @@ load_OpenMOC_tracks: # @load_OpenMOC_tracks move $sp, $a0 sub.d $s5, $sp, $a1 move $sp, $s5 - sub.d $s7, $sp, $a1 - move $sp, $s7 + sub.d $s6, $sp, $a1 + move $sp, $s6 slli.d $a1, $a2, 3 addi.d $a1, $a1, 15 bstrpick.d $a1, $a1, 35, 4 @@ -852,7 +852,7 @@ load_OpenMOC_tracks: # @load_OpenMOC_tracks jirl $ra, $ra, 0 ld.w $a2, $s2, 28 ori $a1, $zero, 4 - move $a0, $s7 + move $a0, $s6 move $a3, $s0 pcaddu18i $ra, %call36(fread) jirl $ra, $ra, 0 @@ -873,7 +873,7 @@ load_OpenMOC_tracks: # @load_OpenMOC_tracks move $a2, $zero b .LBB7_7 .LBB7_3: - move $s7, $zero + move $s6, $zero ld.d $s8, $fp, -256 # 8-byte Folded Reload b .LBB7_10 .LBB7_4: # %vector.ph @@ -891,12 +891,8 @@ load_OpenMOC_tracks: # @load_OpenMOC_tracks ld.d $a5, $a2, 0 vinsgr2vr.d $vr2, $a4, 0 vinsgr2vr.d $vr3, $a5, 0 - vshuf4i.w $vr2, $vr2, 16 - vslli.d $vr2, $vr2, 32 - vsrai.d $vr2, $vr2, 32 - vshuf4i.w $vr3, $vr3, 16 - vslli.d $vr3, $vr3, 32 - vsrai.d $vr3, $vr3, 32 + vsllwil.d.w $vr2, $vr2, 0 + vsllwil.d.w $vr3, $vr3, 0 vadd.d $vr0, $vr0, $vr2 vadd.d $vr1, $vr1, $vr3 addi.d $a3, $a3, -4 @@ -921,15 +917,15 @@ load_OpenMOC_tracks: # @load_OpenMOC_tracks bnez $a0, .LBB7_8 .LBB7_9: # %._crit_edge st.d $a2, $s2, 80 - slli.d $s7, $a2, 5 + slli.d $s6, $a2, 5 .LBB7_10: - move $a0, $s7 + move $a0, $s6 pcaddu18i $ra, %call36(malloc) jirl $ra, $ra, 0 - ld.d $a1, $s6, 0 + ld.d $a1, $s7, 0 st.d $a0, $fp, -208 # 8-byte Folded Spill - add.d $a0, $a1, $s7 - st.d $a0, $s6, 0 + add.d $a0, $a1, $s6 + st.d $a0, $s7, 0 addi.d $a1, $fp, -200 move $a0, $s0 pcaddu18i $ra, %call36(fgetpos) @@ -943,7 +939,7 @@ load_OpenMOC_tracks: # @load_OpenMOC_tracks beqz $a1, .LBB7_35 # %bb.12: # %.preheader110.us.preheader move $s1, $zero - move $s7, $zero + move $s6, $zero b .LBB7_15 .p2align 4, , 16 .LBB7_13: # %._crit_edge119.split.us.us.loopexit @@ -969,7 +965,7 @@ load_OpenMOC_tracks: # @load_OpenMOC_tracks .LBB7_17: # %._crit_edge115.split.us.us.us # in Loop: Header=BB7_18 Depth=2 addi.w $s3, $s3, 1 - add.d $s7, $s7, $s4 + add.d $s6, $s6, $s4 beq $s3, $s2, .LBB7_13 .LBB7_18: # %.lr.ph118.us # Parent Loop BB7_15 Depth=1 @@ -1008,15 +1004,15 @@ load_OpenMOC_tracks: # @load_OpenMOC_tracks bnez $s5, .LBB7_19 b .LBB7_17 .LBB7_20: - move $s7, $zero + move $s6, $zero .LBB7_21: # %._crit_edge122 - slli.d $a0, $s7, 
4 + slli.d $a0, $s6, 4 pcaddu18i $ra, %call36(malloc) jirl $ra, $ra, 0 - ld.d $a1, $s6, 0 + ld.d $a1, $s7, 0 st.d $a0, $fp, -216 # 8-byte Folded Spill - alsl.d $a0, $s7, $a1, 4 - st.d $a0, $s6, 0 + alsl.d $a0, $s6, $a1, 4 + st.d $a0, $s7, 0 addi.d $a1, $fp, -200 move $a0, $s0 pcaddu18i $ra, %call36(fsetpos) @@ -1053,16 +1049,16 @@ load_OpenMOC_tracks: # @load_OpenMOC_tracks # %bb.26: # %.lr.ph132.preheader # in Loop: Header=BB7_25 Depth=1 st.d $a3, $fp, -248 # 8-byte Folded Spill - move $s6, $zero + move $s7, $zero addi.w $s3, $s3, 0 b .LBB7_28 .p2align 4, , 16 .LBB7_27: # %._crit_edge128 # in Loop: Header=BB7_28 Depth=2 add.w $s1, $s5, $s1 - addi.w $s6, $s6, 1 + addi.w $s7, $s7, 1 addi.d $s3, $s3, 1 - beq $s6, $s2, .LBB7_23 + beq $s7, $s2, .LBB7_23 .LBB7_28: # %.lr.ph132 # Parent Loop BB7_25 Depth=1 # => This Loop Header: Depth=2 @@ -1216,7 +1212,7 @@ load_OpenMOC_tracks: # @load_OpenMOC_tracks .LBB7_34: # %._crit_edge139 ld.d $s1, $fp, -264 # 8-byte Folded Reload ld.d $a1, $s1, 80 - div.d $a0, $s7, $a1 + div.d $a0, $s6, $a1 st.d $a0, $s1, 48 pcalau12i $a0, %pc_hi20(.L.str.3) addi.d $a0, $a0, %pc_lo12(.L.str.3) @@ -1234,7 +1230,7 @@ load_OpenMOC_tracks: # @load_OpenMOC_tracks jirl $ra, $ra, 0 pcalau12i $a0, %pc_hi20(.L.str.5) addi.d $a0, $a0, %pc_lo12(.L.str.5) - move $a1, $s7 + move $a1, $s6 pcaddu18i $ra, %call36(printf) jirl $ra, $ra, 0 move $a0, $s0 @@ -1258,7 +1254,7 @@ load_OpenMOC_tracks: # @load_OpenMOC_tracks ret .LBB7_35: # %.preheader110.preheader move $s1, $zero - move $s7, $zero + move $s6, $zero b .LBB7_38 .p2align 4, , 16 .LBB7_36: # %._crit_edge119.split.loopexit @@ -1284,7 +1280,7 @@ load_OpenMOC_tracks: # @load_OpenMOC_tracks .LBB7_40: # %._crit_edge115.split # in Loop: Header=BB7_41 Depth=2 addi.w $s3, $s3, 1 - add.d $s7, $s7, $s4 + add.d $s6, $s6, $s4 beq $s3, $s2, .LBB7_36 .LBB7_41: # %.lr.ph118 # Parent Loop BB7_38 Depth=1 diff --git a/results/MultiSource/Benchmarks/DOE-ProxyApps-C/miniAMR/CMakeFiles/miniAMR.dir/refine.s b/results/MultiSource/Benchmarks/DOE-ProxyApps-C/miniAMR/CMakeFiles/miniAMR.dir/refine.s index d574d953..5cd3690a 100644 --- a/results/MultiSource/Benchmarks/DOE-ProxyApps-C/miniAMR/CMakeFiles/miniAMR.dir/refine.s +++ b/results/MultiSource/Benchmarks/DOE-ProxyApps-C/miniAMR/CMakeFiles/miniAMR.dir/refine.s @@ -1652,48 +1652,44 @@ refine_level: # @refine_level ld.w $t0, $a4, -24 ld.w $t1, $a4, -16 ld.w $t2, $a4, -8 - ld.w $t3, $a4, 0 - ld.w $t4, $a4, 8 - ld.w $t5, $a4, 16 - ld.w $t6, $a4, 24 - vinsgr2vr.w $vr2, $t1, 0 - vinsgr2vr.w $vr2, $t2, 2 - vslli.d $vr2, $vr2, 32 - vsrai.d $vr2, $vr2, 32 + vinsgr2vr.w $vr2, $a7, 0 + vinsgr2vr.w $vr2, $t0, 1 + ld.w $a7, $a4, 0 + ld.w $t0, $a4, 8 + ld.w $t3, $a4, 16 + ld.w $t4, $a4, 24 vinsgr2vr.w $vr3, $a7, 0 - vinsgr2vr.w $vr3, $t0, 2 - vslli.d $vr3, $vr3, 32 - vsrai.d $vr3, $vr3, 32 - vinsgr2vr.w $vr4, $t5, 0 - vinsgr2vr.w $vr4, $t6, 2 - vslli.d $vr4, $vr4, 32 - vsrai.d $vr4, $vr4, 32 + vinsgr2vr.w $vr3, $t0, 1 + vinsgr2vr.w $vr4, $t1, 0 + vinsgr2vr.w $vr4, $t2, 1 + vsllwil.d.w $vr4, $vr4, 0 + vsllwil.d.w $vr2, $vr2, 0 vinsgr2vr.w $vr5, $t3, 0 - vinsgr2vr.w $vr5, $t4, 2 - vslli.d $vr5, $vr5, 32 - vsrai.d $vr5, $vr5, 32 - vpickve2gr.d $a7, $vr3, 0 + vinsgr2vr.w $vr5, $t4, 1 + vsllwil.d.w $vr5, $vr5, 0 + vsllwil.d.w $vr3, $vr3, 0 + vpickve2gr.d $a7, $vr2, 0 mul.d $a7, $a7, $a5 add.d $a7, $a1, $a7 - vpickve2gr.d $t0, $vr3, 1 + vpickve2gr.d $t0, $vr2, 1 mul.d $t0, $t0, $a5 add.d $t0, $a1, $t0 - vpickve2gr.d $t1, $vr2, 0 + vpickve2gr.d $t1, $vr4, 0 mul.d $t1, $t1, $a5 add.d $t1, $a1, $t1 - vpickve2gr.d $t2, $vr2, 1 + 
vpickve2gr.d $t2, $vr4, 1 mul.d $t2, $t2, $a5 add.d $t2, $a1, $t2 - vpickve2gr.d $t3, $vr5, 0 + vpickve2gr.d $t3, $vr3, 0 mul.d $t3, $t3, $a5 add.d $t3, $a1, $t3 - vpickve2gr.d $t4, $vr5, 1 + vpickve2gr.d $t4, $vr3, 1 mul.d $t4, $t4, $a5 add.d $t4, $a1, $t4 - vpickve2gr.d $t5, $vr4, 0 + vpickve2gr.d $t5, $vr5, 0 mul.d $t5, $t5, $a5 add.d $t5, $a1, $t5 - vpickve2gr.d $t6, $vr4, 1 + vpickve2gr.d $t6, $vr5, 1 mul.d $t6, $t6, $a5 add.d $t6, $a1, $t6 ld.w $a7, $a7, 8 diff --git a/results/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/CMakeFiles/miniGMG.dir/operators.ompif.s b/results/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/CMakeFiles/miniGMG.dir/operators.ompif.s index 17cabf24..edabb1bf 100644 --- a/results/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/CMakeFiles/miniGMG.dir/operators.ompif.s +++ b/results/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/CMakeFiles/miniGMG.dir/operators.ompif.s @@ -3927,9 +3927,7 @@ restriction_betas: # @restriction_betas # => This Inner Loop Header: Depth=4 vadd.w $vr8, $vr6, $vr7 vslli.w $vr8, $vr8, 1 - vshuf4i.w $vr9, $vr8, 16 - vslli.d $vr9, $vr9, 32 - vsrai.d $vr9, $vr9, 32 + vsllwil.d.w $vr9, $vr8, 0 vpickve2gr.d $a1, $vr9, 0 slli.d $a1, $a1, 3 vpickve2gr.d $t0, $vr9, 1 @@ -3937,9 +3935,7 @@ restriction_betas: # @restriction_betas fldx.d $ft1, $s1, $a1 fldx.d $ft2, $s1, $t0 vadd.w $vr11, $vr8, $vr3 - vshuf4i.w $vr11, $vr11, 16 - vslli.d $vr11, $vr11, 32 - vsrai.d $vr11, $vr11, 32 + vsllwil.d.w $vr11, $vr11, 0 vpickve2gr.d $a1, $vr11, 0 slli.d $a1, $a1, 3 vpickve2gr.d $t0, $vr11, 1 @@ -3949,9 +3945,7 @@ restriction_betas: # @restriction_betas vextrins.d $vr9, $vr10, 16 vextrins.d $vr11, $vr12, 16 vadd.w $vr10, $vr8, $vr4 - vshuf4i.w $vr10, $vr10, 16 - vslli.d $vr10, $vr10, 32 - vsrai.d $vr10, $vr10, 32 + vsllwil.d.w $vr10, $vr10, 0 vpickve2gr.d $a1, $vr10, 0 slli.d $a1, $a1, 3 vpickve2gr.d $t0, $vr10, 1 @@ -3959,9 +3953,7 @@ restriction_betas: # @restriction_betas fldx.d $ft2, $s1, $a1 fldx.d $ft4, $s1, $t0 vadd.w $vr8, $vr8, $vr5 - vshuf4i.w $vr8, $vr8, 16 - vslli.d $vr8, $vr8, 32 - vsrai.d $vr8, $vr8, 32 + vsllwil.d.w $vr8, $vr8, 0 vpickve2gr.d $a1, $vr8, 0 slli.d $a1, $a1, 3 vpickve2gr.d $t0, $vr8, 1 @@ -4219,9 +4211,7 @@ restriction_betas: # @restriction_betas # => This Inner Loop Header: Depth=4 vadd.w $vr6, $vr4, $vr5 vslli.w $vr6, $vr6, 1 - vshuf4i.w $vr7, $vr6, 16 - vslli.d $vr7, $vr7, 32 - vsrai.d $vr7, $vr7, 32 + vsllwil.d.w $vr7, $vr6, 0 vpickve2gr.d $a0, $vr7, 0 slli.d $a0, $a0, 3 vpickve2gr.d $a1, $vr7, 1 @@ -4229,9 +4219,7 @@ restriction_betas: # @restriction_betas fldx.d $fa7, $s1, $a0 fldx.d $ft0, $s1, $a1 vbitseti.w $vr9, $vr6, 0 - vshuf4i.w $vr10, $vr9, 16 - vslli.d $vr10, $vr10, 32 - vsrai.d $vr10, $vr10, 32 + vsllwil.d.w $vr10, $vr9, 0 vpickve2gr.d $a0, $vr10, 0 slli.d $a0, $a0, 3 vpickve2gr.d $a1, $vr10, 1 @@ -4241,9 +4229,7 @@ restriction_betas: # @restriction_betas vextrins.d $vr7, $vr8, 16 vextrins.d $vr10, $vr11, 16 vadd.w $vr6, $vr6, $vr3 - vshuf4i.w $vr6, $vr6, 16 - vslli.d $vr6, $vr6, 32 - vsrai.d $vr6, $vr6, 32 + vsllwil.d.w $vr6, $vr6, 0 vpickve2gr.d $a0, $vr6, 0 slli.d $a0, $a0, 3 vpickve2gr.d $a1, $vr6, 1 @@ -4251,9 +4237,7 @@ restriction_betas: # @restriction_betas fldx.d $fa6, $s1, $a0 fldx.d $ft0, $s1, $a1 vadd.w $vr9, $vr9, $vr3 - vshuf4i.w $vr9, $vr9, 16 - vslli.d $vr9, $vr9, 32 - vsrai.d $vr9, $vr9, 32 + vsllwil.d.w $vr9, $vr9, 0 vpickve2gr.d $a0, $vr9, 0 slli.d $a0, $a0, 3 vpickve2gr.d $a1, $vr9, 1 @@ -4511,9 +4495,7 @@ restriction_betas: # @restriction_betas # => This Inner Loop Header: Depth=4 vadd.w $vr6, 
$vr4, $vr5 vslli.w $vr6, $vr6, 1 - vshuf4i.w $vr7, $vr6, 16 - vslli.d $vr7, $vr7, 32 - vsrai.d $vr7, $vr7, 32 + vsllwil.d.w $vr7, $vr6, 0 vpickve2gr.d $a0, $vr7, 0 slli.d $a0, $a0, 3 vpickve2gr.d $a1, $vr7, 1 @@ -4521,9 +4503,7 @@ restriction_betas: # @restriction_betas fldx.d $fa7, $s1, $a0 fldx.d $ft0, $s1, $a1 vbitseti.w $vr9, $vr6, 0 - vshuf4i.w $vr10, $vr9, 16 - vslli.d $vr10, $vr10, 32 - vsrai.d $vr10, $vr10, 32 + vsllwil.d.w $vr10, $vr9, 0 vpickve2gr.d $a0, $vr10, 0 slli.d $a0, $a0, 3 vpickve2gr.d $a1, $vr10, 1 @@ -4533,9 +4513,7 @@ restriction_betas: # @restriction_betas vextrins.d $vr7, $vr8, 16 vextrins.d $vr10, $vr11, 16 vadd.w $vr6, $vr6, $vr3 - vshuf4i.w $vr6, $vr6, 16 - vslli.d $vr6, $vr6, 32 - vsrai.d $vr6, $vr6, 32 + vsllwil.d.w $vr6, $vr6, 0 vpickve2gr.d $a0, $vr6, 0 slli.d $a0, $a0, 3 vpickve2gr.d $a1, $vr6, 1 @@ -4543,9 +4521,7 @@ restriction_betas: # @restriction_betas fldx.d $fa6, $s1, $a0 fldx.d $ft0, $s1, $a1 vadd.w $vr9, $vr9, $vr3 - vshuf4i.w $vr9, $vr9, 16 - vslli.d $vr9, $vr9, 32 - vsrai.d $vr9, $vr9, 32 + vsllwil.d.w $vr9, $vr9, 0 vpickve2gr.d $a0, $vr9, 0 slli.d $a0, $a0, 3 vpickve2gr.d $a1, $vr9, 1 @@ -5411,21 +5387,21 @@ initialize_grid_to_scalar: # @initialize_grid_to_scalar add.d $a1, $a0, $a1 ld.w $a0, $a1, 44 ld.w $a2, $a1, 28 - sub.w $s1, $zero, $a0 + sub.w $a3, $zero, $a0 st.d $a2, $sp, 120 # 8-byte Folded Spill add.w $a2, $a2, $a0 st.d $a2, $sp, 112 # 8-byte Folded Spill - bge $s1, $a2, .LBB12_2 + bge $a3, $a2, .LBB12_2 # %bb.4: # %.preheader78.lr.ph # in Loop: Header=BB12_3 Depth=1 ld.w $t2, $a1, 24 add.w $t3, $t2, $a0 - bge $s1, $t3, .LBB12_2 + bge $a3, $t3, .LBB12_2 # %bb.5: # %.preheader78.lr.ph # in Loop: Header=BB12_3 Depth=1 ld.w $t4, $a1, 20 add.w $t5, $t4, $a0 - bge $s1, $t5, .LBB12_2 + bge $a3, $t5, .LBB12_2 # %bb.6: # %.preheader78.us.us.preheader # in Loop: Header=BB12_3 Depth=1 ld.w $t6, $a1, 48 @@ -5438,40 +5414,39 @@ initialize_grid_to_scalar: # @initialize_grid_to_scalar addi.d $a2, $a2, 1 mul.w $a2, $a0, $a2 alsl.d $s2, $a2, $a1, 3 - nor $a1, $s1, $zero + nor $a1, $a3, $zero add.d $s3, $a1, $t5 nor $a1, $t6, $zero sub.d $a1, $a1, $a4 - sub.d $s4, $t5, $s1 - mul.w $a3, $a0, $a1 - move $a7, $s4 - bstrins.d $a7, $zero, 1, 0 - add.d $s7, $a7, $s1 + sub.d $s4, $t5, $a3 + mul.w $a2, $a0, $a1 + move $s6, $s4 + bstrins.d $s6, $zero, 1, 0 + add.d $s7, $s6, $a3 vreplgr2vr.d $vr6, $t4 - vreplgr2vr.d $vr7, $s1 + vreplgr2vr.d $vr7, $a3 vadd.d $vr7, $vr7, $vr1 - vinsgr2vr.w $vr8, $s1, 0 - vinsgr2vr.w $vr8, $s1, 1 + vinsgr2vr.w $vr8, $a3, 0 + vinsgr2vr.w $vr8, $a3, 1 vadd.w $vr8, $vr8, $vr2 add.d $a1, $a4, $t6 sub.d $a1, $zero, $a1 - mul.d $a2, $a0, $a1 - st.d $a3, $sp, 96 # 8-byte Folded Spill - move $a1, $a3 - move $a3, $s1 + mul.d $a1, $a0, $a1 + st.d $a2, $sp, 96 # 8-byte Folded Spill + move $a7, $a3 st.d $a4, $sp, 104 # 8-byte Folded Spill b .LBB12_8 .p2align 4, , 16 .LBB12_7: # %._crit_edge81.split.us.us.us # in Loop: Header=BB12_8 Depth=2 - addi.w $a3, $a3, 1 + addi.w $a7, $a7, 1 ld.d $a5, $sp, 128 # 8-byte Folded Reload addi.d $a5, $a5, 1 ld.d $a4, $sp, 104 # 8-byte Folded Reload - add.w $a1, $a1, $a4 - add.d $a2, $a2, $a4 + add.w $a2, $a2, $a4 + add.d $a1, $a1, $a4 ld.d $a0, $sp, 112 # 8-byte Folded Reload - beq $a3, $a0, .LBB12_2 + beq $a7, $a0, .LBB12_2 .LBB12_8: # %.preheader78.us.us # Parent Loop BB12_3 Depth=1 # => This Loop Header: Depth=2 @@ -5489,13 +5464,13 @@ initialize_grid_to_scalar: # @initialize_grid_to_scalar add.d $fp, $a4, $a0 srli.d $s0, $s3, 32 ld.d $a0, $sp, 120 # 8-byte Folded Reload - bge $a3, $a0, .LBB12_28 + bge $a7, 
$a0, .LBB12_28 # %bb.9: # %.preheader.us.us92.us.preheader # in Loop: Header=BB12_8 Depth=2 move $a4, $zero - move $s8, $a2 - move $s5, $a1 - move $t0, $s1 + move $s8, $a1 + move $s5, $a2 + move $t0, $a3 b .LBB12_11 .p2align 4, , 16 .LBB12_10: # %._crit_edge.us.us.us @@ -5518,21 +5493,21 @@ initialize_grid_to_scalar: # @initialize_grid_to_scalar bge $t0, $t2, .LBB12_15 # %bb.12: # %.lr.ph.split.us84.us.us.preheader # in Loop: Header=BB12_11 Depth=3 - or $a0, $t0, $a3 - move $s6, $s1 + or $a0, $t0, $a7 + move $s1, $a3 ori $a5, $zero, 4 bgeu $s4, $a5, .LBB12_18 .LBB12_13: # %.lr.ph.split.us84.us.us.preheader176 # in Loop: Header=BB12_11 Depth=3 - move $ra, $s6 + move $ra, $s1 .p2align 4, , 16 .LBB12_14: # %.lr.ph.split.us84.us.us # Parent Loop BB12_3 Depth=1 # Parent Loop BB12_8 Depth=2 # Parent Loop BB12_11 Depth=3 # => This Inner Loop Header: Depth=4 - add.w $t1, $s8, $s6 - or $t7, $a0, $s6 + add.w $t1, $s8, $s1 + or $t7, $a0, $s1 addi.w $t7, $t7, 0 addi.w $a5, $zero, -1 slt $t8, $ra, $t4 @@ -5544,18 +5519,18 @@ initialize_grid_to_scalar: # @initialize_grid_to_scalar slli.d $a5, $t1, 3 fstx.d $ft1, $s2, $a5 addi.d $ra, $ra, 1 - addi.d $s6, $s6, 1 + addi.d $s1, $s1, 1 bne $t5, $ra, .LBB12_14 b .LBB12_10 .p2align 4, , 16 .LBB12_15: # %.lr.ph.split.us.us.us.us.preheader # in Loop: Header=BB12_11 Depth=3 - move $s6, $s1 + move $s1, $a3 bgeu $s4, $a6, .LBB12_23 .LBB12_16: # %.lr.ph.split.us.us.us.us.preheader177 # in Loop: Header=BB12_11 Depth=3 - add.w $a0, $s8, $s6 - sub.d $s6, $t5, $s6 + add.w $a0, $s8, $s1 + sub.d $s1, $t5, $s1 .p2align 4, , 16 .LBB12_17: # %.lr.ph.split.us.us.us.us # Parent Loop BB12_3 Depth=1 @@ -5564,25 +5539,25 @@ initialize_grid_to_scalar: # @initialize_grid_to_scalar # => This Inner Loop Header: Depth=4 slli.d $a5, $a0, 3 stx.d $zero, $s2, $a5 - addi.d $s6, $s6, -1 + addi.d $s1, $s1, -1 addi.w $a0, $a0, 1 - bnez $s6, .LBB12_17 + bnez $s1, .LBB12_17 b .LBB12_10 .p2align 4, , 16 .LBB12_18: # %vector.scevcheck # in Loop: Header=BB12_11 Depth=3 add.w $t1, $ra, $s3 - move $s6, $s1 + move $s1, $a3 blt $t1, $ra, .LBB12_13 # %bb.19: # %vector.scevcheck # in Loop: Header=BB12_11 Depth=3 - move $s6, $s1 + move $s1, $a3 bnez $s0, .LBB12_13 # %bb.20: # %vector.ph # in Loop: Header=BB12_11 Depth=3 vinsgr2vr.w $vr9, $a0, 0 vinsgr2vr.w $vr9, $a0, 1 - move $s6, $a7 + move $s1, $s6 move $ra, $s5 vori.b $vr10, $vr7, 0 vori.b $vr11, $vr8, 0 @@ -5597,13 +5572,9 @@ initialize_grid_to_scalar: # @initialize_grid_to_scalar vor.v $vr14, $vr9, $vr11 vor.v $vr13, $vr9, $vr13 vslt.w $vr14, $vr4, $vr14 - vshuf4i.w $vr14, $vr14, 16 - vslli.d $vr14, $vr14, 32 - vsrai.d $vr14, $vr14, 32 + vsllwil.d.w $vr14, $vr14, 0 vslt.w $vr13, $vr4, $vr13 - vshuf4i.w $vr13, $vr13, 16 - vslli.d $vr13, $vr13, 32 - vsrai.d $vr13, $vr13, 32 + vsllwil.d.w $vr13, $vr13, 0 vslt.d $vr15, $vr10, $vr6 vslt.d $vr12, $vr12, $vr6 vand.v $vr14, $vr14, $vr15 @@ -5616,53 +5587,53 @@ initialize_grid_to_scalar: # @initialize_grid_to_scalar vst $vr12, $t1, 16 vaddi.du $vr10, $vr10, 4 vaddi.wu $vr11, $vr11, 4 - addi.d $s6, $s6, -4 + addi.d $s1, $s1, -4 addi.w $ra, $ra, 4 - bnez $s6, .LBB12_21 + bnez $s1, .LBB12_21 # %bb.22: # %middle.block # in Loop: Header=BB12_11 Depth=3 - move $s6, $s7 - beq $s4, $a7, .LBB12_10 + move $s1, $s7 + beq $s4, $s6, .LBB12_10 b .LBB12_13 .p2align 4, , 16 .LBB12_23: # %vector.scevcheck148 # in Loop: Header=BB12_11 Depth=3 add.w $a0, $ra, $s3 - move $s6, $s1 + move $s1, $a3 blt $a0, $ra, .LBB12_16 # %bb.24: # %vector.scevcheck148 # in Loop: Header=BB12_11 Depth=3 - move $s6, $s1 + move $s1, $a3 bnez $s0, 
.LBB12_16 # %bb.25: # %vector.body154.preheader # in Loop: Header=BB12_11 Depth=3 - move $a0, $a7 - move $s6, $s5 + move $a0, $s6 + move $s1, $s5 .p2align 4, , 16 .LBB12_26: # %vector.body154 # Parent Loop BB12_3 Depth=1 # Parent Loop BB12_8 Depth=2 # Parent Loop BB12_11 Depth=3 # => This Inner Loop Header: Depth=4 - alsl.d $ra, $s6, $s2, 3 - slli.d $t1, $s6, 3 + alsl.d $ra, $s1, $s2, 3 + slli.d $t1, $s1, 3 vstx $vr5, $s2, $t1 vst $vr5, $ra, 16 addi.d $a0, $a0, -4 - addi.w $s6, $s6, 4 + addi.w $s1, $s1, 4 bnez $a0, .LBB12_26 # %bb.27: # %middle.block158 # in Loop: Header=BB12_11 Depth=3 - move $s6, $s7 - beq $s4, $a7, .LBB12_10 + move $s1, $s7 + beq $s4, $s6, .LBB12_10 b .LBB12_16 .p2align 4, , 16 .LBB12_28: # %.preheader.us.us.us.us.preheader # in Loop: Header=BB12_8 Depth=2 move $a0, $zero - move $a4, $a2 - move $t0, $a1 - move $s8, $s1 + move $a4, $a1 + move $t0, $a2 + move $s8, $a3 b .LBB12_30 .p2align 4, , 16 .LBB12_29: # %._crit_edge.split.us.us.us.us.us @@ -5678,23 +5649,23 @@ initialize_grid_to_scalar: # @initialize_grid_to_scalar # => This Loop Header: Depth=3 # Child Loop BB12_36 Depth 4 # Child Loop BB12_32 Depth 4 - move $s6, $s1 + move $s5, $a3 bgeu $s4, $a6, .LBB12_33 .LBB12_31: # %scalar.ph164.preheader # in Loop: Header=BB12_30 Depth=3 - add.w $s5, $a4, $s6 - sub.d $s6, $t5, $s6 + add.w $s1, $a4, $s5 + sub.d $s5, $t5, $s5 .p2align 4, , 16 .LBB12_32: # %scalar.ph164 # Parent Loop BB12_3 Depth=1 # Parent Loop BB12_8 Depth=2 # Parent Loop BB12_30 Depth=3 # => This Inner Loop Header: Depth=4 - slli.d $a5, $s5, 3 + slli.d $a5, $s1, 3 stx.d $zero, $s2, $a5 - addi.d $s6, $s6, -1 - addi.w $s5, $s5, 1 - bnez $s6, .LBB12_32 + addi.d $s5, $s5, -1 + addi.w $s1, $s1, 1 + bnez $s5, .LBB12_32 b .LBB12_29 .p2align 4, , 16 .LBB12_33: # %vector.scevcheck161 @@ -5702,33 +5673,33 @@ initialize_grid_to_scalar: # @initialize_grid_to_scalar mul.d $a5, $t6, $a0 add.w $a5, $fp, $a5 add.w $t1, $a5, $s3 - move $s6, $s1 + move $s5, $a3 blt $t1, $a5, .LBB12_31 # %bb.34: # %vector.scevcheck161 # in Loop: Header=BB12_30 Depth=3 - move $s6, $s1 + move $s5, $a3 bnez $s0, .LBB12_31 # %bb.35: # %vector.body169.preheader # in Loop: Header=BB12_30 Depth=3 - move $s5, $a7 - move $s6, $t0 + move $s1, $s6 + move $s5, $t0 .p2align 4, , 16 .LBB12_36: # %vector.body169 # Parent Loop BB12_3 Depth=1 # Parent Loop BB12_8 Depth=2 # Parent Loop BB12_30 Depth=3 # => This Inner Loop Header: Depth=4 - alsl.d $a5, $s6, $s2, 3 - slli.d $t1, $s6, 3 + alsl.d $a5, $s5, $s2, 3 + slli.d $t1, $s5, 3 vstx $vr5, $s2, $t1 vst $vr5, $a5, 16 - addi.d $s5, $s5, -4 - addi.w $s6, $s6, 4 - bnez $s5, .LBB12_36 + addi.d $s1, $s1, -4 + addi.w $s5, $s5, 4 + bnez $s1, .LBB12_36 # %bb.37: # %middle.block173 # in Loop: Header=BB12_30 Depth=3 - move $s6, $s7 - beq $s4, $a7, .LBB12_29 + move $s5, $s7 + beq $s4, $s6, .LBB12_29 b .LBB12_31 .LBB12_38: # %._crit_edge100 pcaddu18i $ra, %call36(CycleTime) diff --git a/results/MultiSource/Benchmarks/MallocBench/espresso/CMakeFiles/espresso.dir/cofactor.s b/results/MultiSource/Benchmarks/MallocBench/espresso/CMakeFiles/espresso.dir/cofactor.s index a66ee1b6..5b00d5c4 100644 --- a/results/MultiSource/Benchmarks/MallocBench/espresso/CMakeFiles/espresso.dir/cofactor.s +++ b/results/MultiSource/Benchmarks/MallocBench/espresso/CMakeFiles/espresso.dir/cofactor.s @@ -1022,18 +1022,12 @@ binate_split_select: # @binate_split_select vaddi.wu $vr5, $vr1, 4 vsrai.w $vr6, $vr1, 5 vsrai.w $vr7, $vr5, 5 - vshuf4i.w $vr8, $vr6, 50 - vslli.d $vr8, $vr8, 32 - vsrai.d $vr8, $vr8, 32 - vshuf4i.w $vr6, $vr6, 16 - vslli.d $vr6, 
$vr6, 32 - vsrai.d $vr6, $vr6, 32 - vshuf4i.w $vr9, $vr7, 50 - vslli.d $vr9, $vr9, 32 - vsrai.d $vr9, $vr9, 32 - vshuf4i.w $vr7, $vr7, 16 - vslli.d $vr7, $vr7, 32 - vsrai.d $vr7, $vr7, 32 + vshuf4i.w $vr8, $vr6, 14 + vsllwil.d.w $vr8, $vr8, 0 + vsllwil.d.w $vr6, $vr6, 0 + vshuf4i.w $vr9, $vr7, 14 + vsllwil.d.w $vr9, $vr9, 0 + vsllwil.d.w $vr7, $vr7, 0 vpickve2gr.d $a6, $vr6, 0 alsl.d $a6, $a6, $s3, 2 vpickve2gr.d $a7, $vr6, 1 diff --git a/results/MultiSource/Benchmarks/MiBench/automotive-susan/CMakeFiles/automotive-susan.dir/susan.s b/results/MultiSource/Benchmarks/MiBench/automotive-susan/CMakeFiles/automotive-susan.dir/susan.s index 54d9565c..e6394768 100644 --- a/results/MultiSource/Benchmarks/MiBench/automotive-susan/CMakeFiles/automotive-susan.dir/susan.s +++ b/results/MultiSource/Benchmarks/MiBench/automotive-susan/CMakeFiles/automotive-susan.dir/susan.s @@ -1873,31 +1873,31 @@ susan_smoothing: # @susan_smoothing # Child Loop BB10_35 Depth 4 # Child Loop BB10_38 Depth 4 ld.d $a1, $sp, 112 - mul.w $a5, $t7, $s6 + mul.w $t2, $t7, $s6 add.d $t8, $a1, $t6 - ldx.bu $s5, $t8, $a5 + ldx.bu $s5, $t8, $t2 move $a7, $zero blt $s0, $s4, .LBB10_26 # %bb.30: # %.preheader.preheader # in Loop: Header=BB10_29 Depth=2 - move $t2, $zero - add.d $a5, $t8, $a5 - st.d $a5, $sp, 80 # 8-byte Folded Spill - ld.d $a5, $sp, 88 # 8-byte Folded Reload - add.d $s7, $a5, $s5 - ld.d $a5, $sp, 64 # 8-byte Folded Reload - mul.w $a5, $t7, $a5 - add.d $a1, $a1, $a5 + move $a5, $zero + add.d $t0, $t8, $t2 + st.d $t0, $sp, 80 # 8-byte Folded Spill + ld.d $t0, $sp, 88 # 8-byte Folded Reload + add.d $s7, $t0, $s5 + ld.d $t0, $sp, 64 # 8-byte Folded Reload + mul.w $t2, $t7, $t0 + add.d $a1, $a1, $t2 add.d $a1, $a1, $t6 - ld.d $a5, $sp, 40 # 8-byte Folded Reload - add.d $a5, $a1, $a5 + ld.d $t0, $sp, 40 # 8-byte Folded Reload + add.d $t2, $a1, $t0 ld.d $a1, $sp, 48 # 8-byte Folded Reload move $t5, $s4 b .LBB10_32 .p2align 4, , 16 .LBB10_31: # %._crit_edge283 # in Loop: Header=BB10_32 Depth=3 - add.d $a5, $t5, $a0 + add.d $t2, $t5, $a0 addi.w $t5, $s8, 1 move $a1, $ra beq $s8, $s1, .LBB10_39 @@ -1911,19 +1911,19 @@ susan_smoothing: # @susan_smoothing bgeu $a3, $t1, .LBB10_34 # %bb.33: # in Loop: Header=BB10_32 Depth=3 move $ra, $a1 - move $t5, $a5 + move $t5, $t2 move $a1, $s4 b .LBB10_37 .p2align 4, , 16 .LBB10_34: # %vector.ph # in Loop: Header=BB10_32 Depth=3 add.d $ra, $a1, $s2 - add.d $t5, $a5, $s2 + add.d $t5, $t2, $s2 vori.b $vr2, $vr0, 0 vinsgr2vr.w $vr2, $a7, 0 vori.b $vr1, $vr0, 0 - vinsgr2vr.w $vr1, $t2, 0 - addi.d $a7, $a5, 4 + vinsgr2vr.w $vr1, $a5, 0 + addi.d $a7, $t2, 4 addi.d $t2, $a1, 4 move $a5, $s2 vori.b $vr4, $vr0, 0 @@ -1936,62 +1936,68 @@ susan_smoothing: # @susan_smoothing # => This Inner Loop Header: Depth=4 ld.w $a1, $a7, -4 ld.w $fp, $a7, 0 - ld.w $t4, $t2, -4 - ld.w $s3, $t2, 0 - vinsgr2vr.w $vr5, $a1, 0 - vinsgr2vr.w $vr6, $fp, 0 - vinsgr2vr.w $vr7, $t4, 0 - vinsgr2vr.w $vr8, $s3, 0 - vilvl.b $vr7, $vr0, $vr7 - vilvl.h $vr7, $vr0, $vr7 - vilvl.b $vr8, $vr0, $vr8 - vilvl.h $vr8, $vr0, $vr8 - vilvl.b $vr5, $vr0, $vr5 - vilvl.h $vr5, $vr0, $vr5 - vilvl.w $vr9, $vr0, $vr5 - vilvh.w $vr10, $vr0, $vr5 - vilvl.b $vr6, $vr0, $vr6 - vilvl.h $vr6, $vr0, $vr6 - vilvl.w $vr11, $vr0, $vr6 - vilvh.w $vr12, $vr0, $vr6 - vneg.d $vr10, $vr10 - vneg.d $vr9, $vr9 - vneg.d $vr12, $vr12 + vinsgr2vr.w $vr6, $a1, 0 + vinsgr2vr.w $vr7, $fp, 0 + vsllwil.hu.bu $vr5, $vr6, 0 + vsllwil.wu.hu $vr5, $vr5, 0 + ld.w $a1, $t2, -4 + ld.w $fp, $t2, 0 + vsllwil.hu.bu $vr8, $vr7, 0 + vsllwil.wu.hu $vr8, $vr8, 0 + vinsgr2vr.w $vr9, 
$a1, 0 + vinsgr2vr.w $vr10, $fp, 0 + vsllwil.hu.bu $vr9, $vr9, 0 + vsllwil.wu.hu $vr9, $vr9, 0 + vsllwil.hu.bu $vr10, $vr10, 0 + vsllwil.wu.hu $vr10, $vr10, 0 + vsllwil.du.wu $vr11, $vr5, 0 + vshuf4i.b $vr6, $vr6, 14 + vsllwil.hu.bu $vr6, $vr6, 0 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.du.wu $vr6, $vr6, 0 + vsllwil.du.wu $vr12, $vr8, 0 + vshuf4i.b $vr7, $vr7, 14 + vsllwil.hu.bu $vr7, $vr7, 0 + vsllwil.wu.hu $vr7, $vr7, 0 + vsllwil.du.wu $vr7, $vr7, 0 + vneg.d $vr6, $vr6 vneg.d $vr11, $vr11 - vpickve2gr.d $a1, $vr9, 0 - vpickve2gr.d $t4, $vr9, 1 - vpickve2gr.d $fp, $vr10, 0 - vpickve2gr.d $s3, $vr10, 1 - vpickve2gr.d $t0, $vr11, 0 - vpickve2gr.d $t3, $vr11, 1 - vpickve2gr.d $s6, $vr12, 0 - vpickve2gr.d $s0, $vr12, 1 + vneg.d $vr7, $vr7 + vneg.d $vr12, $vr12 + vpickve2gr.d $a1, $vr11, 0 + vpickve2gr.d $fp, $vr11, 1 + vpickve2gr.d $t4, $vr6, 0 + vpickve2gr.d $s3, $vr6, 1 + vpickve2gr.d $t0, $vr12, 0 + vpickve2gr.d $t3, $vr12, 1 + vpickve2gr.d $s6, $vr7, 0 + vpickve2gr.d $s0, $vr7, 1 ldx.b $a1, $s7, $a1 - ldx.b $t4, $s7, $t4 ldx.b $fp, $s7, $fp + ldx.b $t4, $s7, $t4 ldx.b $s3, $s7, $s3 - vinsgr2vr.b $vr9, $a1, 0 - vinsgr2vr.b $vr9, $t4, 1 - vinsgr2vr.b $vr9, $fp, 2 - vinsgr2vr.b $vr9, $s3, 3 + vinsgr2vr.b $vr6, $a1, 0 + vinsgr2vr.b $vr6, $fp, 1 + vinsgr2vr.b $vr6, $t4, 2 + vinsgr2vr.b $vr6, $s3, 3 ldx.b $a1, $s7, $t0 ldx.b $t0, $s7, $t3 ldx.b $t3, $s7, $s6 ldx.b $t4, $s7, $s0 - vinsgr2vr.b $vr10, $a1, 0 - vinsgr2vr.b $vr10, $t0, 1 - vinsgr2vr.b $vr10, $t3, 2 - vinsgr2vr.b $vr10, $t4, 3 - vilvl.b $vr9, $vr0, $vr9 - vilvl.h $vr9, $vr0, $vr9 - vilvl.b $vr10, $vr0, $vr10 - vilvl.h $vr10, $vr0, $vr10 - vmul.w $vr11, $vr9, $vr7 - vmul.w $vr12, $vr10, $vr8 - vmadd.w $vr1, $vr9, $vr7 - vmadd.w $vr3, $vr10, $vr8 + vinsgr2vr.b $vr7, $a1, 0 + vinsgr2vr.b $vr7, $t0, 1 + vinsgr2vr.b $vr7, $t3, 2 + vinsgr2vr.b $vr7, $t4, 3 + vsllwil.hu.bu $vr6, $vr6, 0 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.hu.bu $vr7, $vr7, 0 + vsllwil.wu.hu $vr7, $vr7, 0 + vmul.w $vr11, $vr6, $vr9 + vmul.w $vr12, $vr7, $vr10 + vmadd.w $vr1, $vr6, $vr9 + vmadd.w $vr3, $vr7, $vr10 vmadd.w $vr2, $vr11, $vr5 - vmadd.w $vr4, $vr12, $vr6 + vmadd.w $vr4, $vr12, $vr8 addi.d $a5, $a5, -8 addi.d $a7, $a7, 8 addi.d $t2, $t2, 8 @@ -2005,12 +2011,12 @@ susan_smoothing: # @susan_smoothing vadd.w $vr1, $vr3, $vr1 vhaddw.d.w $vr1, $vr1, $vr1 vhaddw.q.d $vr1, $vr1, $vr1 - vpickve2gr.d $t2, $vr1, 0 + vpickve2gr.d $a5, $vr1, 0 move $a1, $a6 beq $a4, $s2, .LBB10_31 .LBB10_37: # %scalar.ph.preheader # in Loop: Header=BB10_32 Depth=3 - sub.d $a5, $a2, $a1 + sub.d $t2, $a2, $a1 move $a1, $ra move $fp, $t5 .p2align 4, , 16 @@ -2026,20 +2032,20 @@ susan_smoothing: # @susan_smoothing addi.d $t5, $fp, 1 addi.d $ra, $a1, 1 mul.d $a1, $t4, $t3 - add.d $t2, $a1, $t2 + add.d $a5, $a1, $a5 mul.d $a1, $a1, $t0 - addi.w $a5, $a5, -1 + addi.w $t2, $t2, -1 add.d $a7, $a1, $a7 move $a1, $ra move $fp, $t5 - bnez $a5, .LBB10_38 + bnez $t2, .LBB10_38 b .LBB10_31 .p2align 4, , 16 .LBB10_39: # %._crit_edge294.split # in Loop: Header=BB10_29 Depth=2 lu12i.w $a1, -3 ori $a1, $a1, 2288 - add.w $a1, $t2, $a1 + add.w $a1, $a5, $a1 ld.d $s0, $sp, 56 # 8-byte Folded Reload ld.d $s3, $sp, 32 # 8-byte Folded Reload ld.d $s6, $sp, 72 # 8-byte Folded Reload @@ -2646,8 +2652,8 @@ edge_draw: # @edge_draw stx.b $zero, $a0, $a7 .LBB11_52: # %pred.store.continue86 # in Loop: Header=BB11_50 Depth=1 - vilvl.b $vr4, $vr4, $vr4 - vilvl.h $vr4, $vr4, $vr4 + vsllwil.h.b $vr4, $vr4, 0 + vsllwil.w.h $vr4, $vr4, 0 vpickve2gr.w $a7, $vr4, 1 andi $a7, $a7, 1 beqz $a7, .LBB11_54 @@ -7726,8 +7732,8 @@ main: 
# @main stx.b $zero, $s2, $a4 .LBB18_131: # %pred.store.continue313 # in Loop: Header=BB18_129 Depth=1 - vilvl.b $vr4, $vr4, $vr4 - vilvl.h $vr4, $vr4, $vr4 + vsllwil.h.b $vr4, $vr4, 0 + vsllwil.w.h $vr4, $vr4, 0 vpickve2gr.w $a4, $vr4, 1 andi $a4, $a4, 1 beqz $a4, .LBB18_133 diff --git a/results/MultiSource/Benchmarks/MiBench/consumer-jpeg/CMakeFiles/consumer-jpeg.dir/jcdctmgr.s b/results/MultiSource/Benchmarks/MiBench/consumer-jpeg/CMakeFiles/consumer-jpeg.dir/jcdctmgr.s index 141ca95c..472eed67 100644 --- a/results/MultiSource/Benchmarks/MiBench/consumer-jpeg/CMakeFiles/consumer-jpeg.dir/jcdctmgr.s +++ b/results/MultiSource/Benchmarks/MiBench/consumer-jpeg/CMakeFiles/consumer-jpeg.dir/jcdctmgr.s @@ -94,24 +94,24 @@ jinit_forward_dct: # @jinit_forward_dct .type start_pass_fdctmgr,@function start_pass_fdctmgr: # @start_pass_fdctmgr # %bb.0: - addi.d $sp, $sp, -192 - st.d $ra, $sp, 184 # 8-byte Folded Spill - st.d $fp, $sp, 176 # 8-byte Folded Spill - st.d $s0, $sp, 168 # 8-byte Folded Spill - st.d $s1, $sp, 160 # 8-byte Folded Spill - st.d $s2, $sp, 152 # 8-byte Folded Spill - st.d $s3, $sp, 144 # 8-byte Folded Spill - st.d $s4, $sp, 136 # 8-byte Folded Spill - st.d $s5, $sp, 128 # 8-byte Folded Spill - st.d $s6, $sp, 120 # 8-byte Folded Spill - st.d $s7, $sp, 112 # 8-byte Folded Spill - st.d $s8, $sp, 104 # 8-byte Folded Spill - fst.d $fs0, $sp, 96 # 8-byte Folded Spill - fst.d $fs1, $sp, 88 # 8-byte Folded Spill - fst.d $fs2, $sp, 80 # 8-byte Folded Spill - fst.d $fs3, $sp, 72 # 8-byte Folded Spill - fst.d $fs4, $sp, 64 # 8-byte Folded Spill - fst.d $fs5, $sp, 56 # 8-byte Folded Spill + addi.d $sp, $sp, -176 + st.d $ra, $sp, 168 # 8-byte Folded Spill + st.d $fp, $sp, 160 # 8-byte Folded Spill + st.d $s0, $sp, 152 # 8-byte Folded Spill + st.d $s1, $sp, 144 # 8-byte Folded Spill + st.d $s2, $sp, 136 # 8-byte Folded Spill + st.d $s3, $sp, 128 # 8-byte Folded Spill + st.d $s4, $sp, 120 # 8-byte Folded Spill + st.d $s5, $sp, 112 # 8-byte Folded Spill + st.d $s6, $sp, 104 # 8-byte Folded Spill + st.d $s7, $sp, 96 # 8-byte Folded Spill + st.d $s8, $sp, 88 # 8-byte Folded Spill + fst.d $fs0, $sp, 80 # 8-byte Folded Spill + fst.d $fs1, $sp, 72 # 8-byte Folded Spill + fst.d $fs2, $sp, 64 # 8-byte Folded Spill + fst.d $fs3, $sp, 56 # 8-byte Folded Spill + fst.d $fs4, $sp, 48 # 8-byte Folded Spill + fst.d $fs5, $sp, 40 # 8-byte Folded Spill move $fp, $a0 ld.w $a0, $a0, 68 blez $a0, .LBB1_21 @@ -139,12 +139,10 @@ start_pass_fdctmgr: # @start_pass_fdctmgr pcalau12i $a0, %pc_hi20(.LCPI1_5) fld.d $fs5, $a0, %pc_lo12(.LCPI1_5) ori $s2, $zero, 64 - vrepli.b $vr6, 0 ori $a0, $zero, 1024 - vreplgr2vr.d $vr7, $a0 + vreplgr2vr.d $vr6, $a0 ori $s7, $zero, 128 - vst $vr6, $sp, 32 # 16-byte Folded Spill - vst $vr7, $sp, 16 # 16-byte Folded Spill + vst $vr6, $sp, 16 # 16-byte Folded Spill b .LBB1_4 .p2align 4, , 16 .LBB1_2: # in Loop: Header=BB1_4 Depth=1 @@ -154,8 +152,7 @@ start_pass_fdctmgr: # @start_pass_fdctmgr st.w $a2, $a0, 40 move $a0, $fp jirl $ra, $a1, 0 - vld $vr7, $sp, 16 # 16-byte Folded Reload - vld $vr6, $sp, 32 # 16-byte Folded Reload + vld $vr6, $sp, 16 # 16-byte Folded Reload vldi $vr5, -992 .LBB1_3: # %.loopexit # in Loop: Header=BB1_4 Depth=1 @@ -182,8 +179,7 @@ start_pass_fdctmgr: # @start_pass_fdctmgr st.w $a2, $a0, 40 move $a0, $fp jirl $ra, $a1, 0 - vld $vr7, $sp, 16 # 16-byte Folded Reload - vld $vr6, $sp, 32 # 16-byte Folded Reload + vld $vr6, $sp, 16 # 16-byte Folded Reload vldi $vr5, -992 slli.d $a0, $s6, 3 ldx.d $s4, $s1, $a0 @@ -208,8 +204,7 @@ start_pass_fdctmgr: # 
@start_pass_fdctmgr ori $a2, $zero, 256 move $a0, $fp jirl $ra, $a3, 0 - vld $vr7, $sp, 16 # 16-byte Folded Reload - vld $vr6, $sp, 32 # 16-byte Folded Reload + vld $vr6, $sp, 16 # 16-byte Folded Reload vldi $vr5, -992 st.d $a0, $s6, 0 .LBB1_12: # %vector.body @@ -219,67 +214,67 @@ start_pass_fdctmgr: # @start_pass_fdctmgr vinsgr2vr.d $vr0, $a1, 0 vinsgr2vr.d $vr1, $a2, 0 ld.d $a1, $s4, 16 - vilvl.h $vr0, $vr6, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vslli.w $vr0, $vr0, 3 vst $vr0, $a0, 0 vinsgr2vr.d $vr0, $a1, 0 ld.d $a1, $s4, 24 - vilvl.h $vr1, $vr6, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vslli.w $vr1, $vr1, 3 vst $vr1, $a0, 16 vinsgr2vr.d $vr1, $a1, 0 ld.d $a1, $s4, 32 - vilvl.h $vr0, $vr6, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vslli.w $vr0, $vr0, 3 vst $vr0, $a0, 32 vinsgr2vr.d $vr0, $a1, 0 ld.d $a1, $s4, 40 - vilvl.h $vr1, $vr6, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vslli.w $vr1, $vr1, 3 vst $vr1, $a0, 48 vinsgr2vr.d $vr1, $a1, 0 ld.d $a1, $s4, 48 - vilvl.h $vr0, $vr6, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vslli.w $vr0, $vr0, 3 vst $vr0, $a0, 64 vinsgr2vr.d $vr0, $a1, 0 ld.d $a1, $s4, 56 - vilvl.h $vr1, $vr6, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vslli.w $vr1, $vr1, 3 vst $vr1, $a0, 80 vinsgr2vr.d $vr1, $a1, 0 ld.d $a1, $s4, 64 - vilvl.h $vr0, $vr6, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vslli.w $vr0, $vr0, 3 vst $vr0, $a0, 96 vinsgr2vr.d $vr0, $a1, 0 ld.d $a1, $s4, 72 - vilvl.h $vr1, $vr6, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vslli.w $vr1, $vr1, 3 vst $vr1, $a0, 112 vinsgr2vr.d $vr1, $a1, 0 ld.d $a1, $s4, 80 - vilvl.h $vr0, $vr6, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vslli.w $vr0, $vr0, 3 vst $vr0, $a0, 128 vinsgr2vr.d $vr0, $a1, 0 ld.d $a1, $s4, 88 - vilvl.h $vr1, $vr6, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vslli.w $vr1, $vr1, 3 vst $vr1, $a0, 144 vinsgr2vr.d $vr1, $a1, 0 ld.d $a1, $s4, 96 - vilvl.h $vr0, $vr6, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vslli.w $vr0, $vr0, 3 vst $vr0, $a0, 160 vinsgr2vr.d $vr0, $a1, 0 ld.d $a1, $s4, 104 - vilvl.h $vr1, $vr6, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vslli.w $vr1, $vr1, 3 vst $vr1, $a0, 176 vinsgr2vr.d $vr1, $a1, 0 - vilvl.h $vr0, $vr6, $vr0 - vilvl.h $vr1, $vr6, $vr1 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.wu.hu $vr1, $vr1, 0 vslli.w $vr0, $vr0, 3 vslli.w $vr1, $vr1, 3 ld.d $a1, $s4, 112 @@ -288,8 +283,8 @@ start_pass_fdctmgr: # @start_pass_fdctmgr vst $vr1, $a0, 208 vinsgr2vr.d $vr0, $a1, 0 vinsgr2vr.d $vr1, $a2, 0 - vilvl.h $vr0, $vr6, $vr0 - vilvl.h $vr1, $vr6, $vr1 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.wu.hu $vr1, $vr1, 0 vslli.w $vr0, $vr0, 3 vslli.w $vr1, $vr1, 3 vst $vr0, $a0, 224 @@ -308,8 +303,7 @@ start_pass_fdctmgr: # @start_pass_fdctmgr ori $a2, $zero, 256 move $a0, $fp jirl $ra, $a3, 0 - vld $vr7, $sp, 16 # 16-byte Folded Reload - vld $vr6, $sp, 32 # 16-byte Folded Reload + vld $vr6, $sp, 16 # 16-byte Folded Reload vldi $vr5, -992 st.d $a0, $s6, 0 .LBB1_15: # %vector.ph111 @@ -321,23 +315,23 @@ start_pass_fdctmgr: # @start_pass_fdctmgr # => This Inner Loop Header: Depth=2 ldx.d $a2, $s4, $a1 vinsgr2vr.d $vr0, $a2, 0 - vilvl.h $vr0, $vr6, $vr0 - vilvl.w $vr1, $vr6, $vr0 - vilvh.w $vr0, $vr6, $vr0 + vsllwil.wu.hu $vr1, $vr0, 0 + vsllwil.du.wu $vr1, $vr1, 0 + vshuf4i.h $vr0, $vr0, 14 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.du.wu $vr0, $vr0, 0 pcalau12i $a2, %pc_hi20(start_pass_fdctmgr.aanscales) addi.d $a2, $a2, %pc_lo12(start_pass_fdctmgr.aanscales) ldx.d $a2, $a2, $a1 vinsgr2vr.d $vr2, $a2, 0 - vilvl.h $vr2, $vr2, $vr2 - vilvl.w $vr3, $vr2, $vr2 - vslli.d $vr3, $vr3, 48 - vsrai.d $vr3, $vr3, 48 - vilvh.w $vr2, $vr2, $vr2 - vslli.d $vr2, $vr2, 48 - vsrai.d $vr2, 
$vr2, 48 - vori.b $vr4, $vr7, 0 + vsllwil.w.h $vr3, $vr2, 0 + vsllwil.d.w $vr3, $vr3, 0 + vshuf4i.h $vr2, $vr2, 14 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.d.w $vr2, $vr2, 0 + vori.b $vr4, $vr6, 0 vmadd.d $vr4, $vr2, $vr0 - vori.b $vr0, $vr7, 0 + vori.b $vr0, $vr6, 0 vmadd.d $vr0, $vr3, $vr1 vsrli.d $vr0, $vr0, 11 vsrli.d $vr1, $vr4, 11 @@ -362,8 +356,7 @@ start_pass_fdctmgr: # @start_pass_fdctmgr ori $a2, $zero, 256 move $a0, $fp jirl $ra, $a3, 0 - vld $vr7, $sp, 16 # 16-byte Folded Reload - vld $vr6, $sp, 32 # 16-byte Folded Reload + vld $vr6, $sp, 16 # 16-byte Folded Reload vldi $vr5, -992 st.d $a0, $s6, 0 .LBB1_19: # in Loop: Header=BB1_4 Depth=1 @@ -451,24 +444,24 @@ start_pass_fdctmgr: # @start_pass_fdctmgr bne $a1, $s2, .LBB1_20 b .LBB1_3 .LBB1_21: # %._crit_edge - fld.d $fs5, $sp, 56 # 8-byte Folded Reload - fld.d $fs4, $sp, 64 # 8-byte Folded Reload - fld.d $fs3, $sp, 72 # 8-byte Folded Reload - fld.d $fs2, $sp, 80 # 8-byte Folded Reload - fld.d $fs1, $sp, 88 # 8-byte Folded Reload - fld.d $fs0, $sp, 96 # 8-byte Folded Reload - ld.d $s8, $sp, 104 # 8-byte Folded Reload - ld.d $s7, $sp, 112 # 8-byte Folded Reload - ld.d $s6, $sp, 120 # 8-byte Folded Reload - ld.d $s5, $sp, 128 # 8-byte Folded Reload - ld.d $s4, $sp, 136 # 8-byte Folded Reload - ld.d $s3, $sp, 144 # 8-byte Folded Reload - ld.d $s2, $sp, 152 # 8-byte Folded Reload - ld.d $s1, $sp, 160 # 8-byte Folded Reload - ld.d $s0, $sp, 168 # 8-byte Folded Reload - ld.d $fp, $sp, 176 # 8-byte Folded Reload - ld.d $ra, $sp, 184 # 8-byte Folded Reload - addi.d $sp, $sp, 192 + fld.d $fs5, $sp, 40 # 8-byte Folded Reload + fld.d $fs4, $sp, 48 # 8-byte Folded Reload + fld.d $fs3, $sp, 56 # 8-byte Folded Reload + fld.d $fs2, $sp, 64 # 8-byte Folded Reload + fld.d $fs1, $sp, 72 # 8-byte Folded Reload + fld.d $fs0, $sp, 80 # 8-byte Folded Reload + ld.d $s8, $sp, 88 # 8-byte Folded Reload + ld.d $s7, $sp, 96 # 8-byte Folded Reload + ld.d $s6, $sp, 104 # 8-byte Folded Reload + ld.d $s5, $sp, 112 # 8-byte Folded Reload + ld.d $s4, $sp, 120 # 8-byte Folded Reload + ld.d $s3, $sp, 128 # 8-byte Folded Reload + ld.d $s2, $sp, 136 # 8-byte Folded Reload + ld.d $s1, $sp, 144 # 8-byte Folded Reload + ld.d $s0, $sp, 152 # 8-byte Folded Reload + ld.d $fp, $sp, 160 # 8-byte Folded Reload + ld.d $ra, $sp, 168 # 8-byte Folded Reload + addi.d $sp, $sp, 176 ret .Lfunc_end1: .size start_pass_fdctmgr, .Lfunc_end1-start_pass_fdctmgr diff --git a/results/MultiSource/Benchmarks/MiBench/consumer-jpeg/CMakeFiles/consumer-jpeg.dir/jcmarker.s b/results/MultiSource/Benchmarks/MiBench/consumer-jpeg/CMakeFiles/consumer-jpeg.dir/jcmarker.s index 41ae1e10..e896e8bc 100644 --- a/results/MultiSource/Benchmarks/MiBench/consumer-jpeg/CMakeFiles/consumer-jpeg.dir/jcmarker.s +++ b/results/MultiSource/Benchmarks/MiBench/consumer-jpeg/CMakeFiles/consumer-jpeg.dir/jcmarker.s @@ -2041,9 +2041,8 @@ emit_dqt: # @emit_dqt vor.v $vr0, $vr1, $vr0 vrepli.h $vr1, 255 vslt.hu $vr0, $vr1, $vr0 - vilvl.h $vr0, $vr0, $vr0 ld.w $a0, $s1, 128 - vslli.w $vr0, $vr0, 16 + vsllwil.w.h $vr0, $vr0, 0 vmskltz.w $vr0, $vr0 vpickve2gr.hu $a1, $vr0, 0 andi $s2, $a1, 15 diff --git a/results/MultiSource/Benchmarks/MiBench/consumer-jpeg/CMakeFiles/consumer-jpeg.dir/jcmaster.s b/results/MultiSource/Benchmarks/MiBench/consumer-jpeg/CMakeFiles/consumer-jpeg.dir/jcmaster.s index d491f0eb..cde29c42 100644 --- a/results/MultiSource/Benchmarks/MiBench/consumer-jpeg/CMakeFiles/consumer-jpeg.dir/jcmaster.s +++ b/results/MultiSource/Benchmarks/MiBench/consumer-jpeg/CMakeFiles/consumer-jpeg.dir/jcmaster.s @@ -912,12 
+912,8 @@ prepare_for_pass: # @prepare_for_pass ld.d $t2, $a7, 0 vinsgr2vr.d $vr2, $t1, 0 vinsgr2vr.d $vr3, $t2, 0 - vshuf4i.w $vr2, $vr2, 16 - vslli.d $vr2, $vr2, 32 - vsrai.d $vr2, $vr2, 32 - vshuf4i.w $vr3, $vr3, 16 - vslli.d $vr3, $vr3, 32 - vsrai.d $vr3, $vr3, 32 + vsllwil.d.w $vr2, $vr2, 0 + vsllwil.d.w $vr3, $vr3, 0 vori.b $vr4, $vr0, 0 vmadd.d $vr4, $vr2, $vr1 vori.b $vr2, $vr0, 0 @@ -972,12 +968,8 @@ prepare_for_pass: # @prepare_for_pass ld.d $t2, $a7, 0 vinsgr2vr.d $vr2, $t1, 0 vinsgr2vr.d $vr3, $t2, 0 - vshuf4i.w $vr2, $vr2, 16 - vslli.d $vr2, $vr2, 32 - vsrai.d $vr2, $vr2, 32 - vshuf4i.w $vr3, $vr3, 16 - vslli.d $vr3, $vr3, 32 - vsrai.d $vr3, $vr3, 32 + vsllwil.d.w $vr2, $vr2, 0 + vsllwil.d.w $vr3, $vr3, 0 vori.b $vr4, $vr0, 0 vmadd.d $vr4, $vr2, $vr1 vori.b $vr2, $vr0, 0 @@ -1252,12 +1244,8 @@ prepare_for_pass: # @prepare_for_pass ld.d $t2, $a7, 0 vinsgr2vr.d $vr2, $t1, 0 vinsgr2vr.d $vr3, $t2, 0 - vshuf4i.w $vr2, $vr2, 16 - vslli.d $vr2, $vr2, 32 - vsrai.d $vr2, $vr2, 32 - vshuf4i.w $vr3, $vr3, 16 - vslli.d $vr3, $vr3, 32 - vsrai.d $vr3, $vr3, 32 + vsllwil.d.w $vr2, $vr2, 0 + vsllwil.d.w $vr3, $vr3, 0 vori.b $vr4, $vr0, 0 vmadd.d $vr4, $vr2, $vr1 vori.b $vr2, $vr0, 0 diff --git a/results/MultiSource/Benchmarks/MiBench/consumer-jpeg/CMakeFiles/consumer-jpeg.dir/jcparam.s b/results/MultiSource/Benchmarks/MiBench/consumer-jpeg/CMakeFiles/consumer-jpeg.dir/jcparam.s index 9beeff58..a6935d2f 100644 --- a/results/MultiSource/Benchmarks/MiBench/consumer-jpeg/CMakeFiles/consumer-jpeg.dir/jcparam.s +++ b/results/MultiSource/Benchmarks/MiBench/consumer-jpeg/CMakeFiles/consumer-jpeg.dir/jcparam.s @@ -41,70 +41,70 @@ jpeg_add_quant_table: # @jpeg_add_quant_table .LBB0_4: vreplgr2vr.d $vr0, $s1 move $a1, $zero - vrepli.b $vr1, 0 - vrepli.d $vr2, 50 + vrepli.d $vr1, 50 lu12i.w $a2, 461373 ori $a2, $a2, 1803 lu32i.d $a2, 461373 lu52i.d $a2, $a2, -1475 - vreplgr2vr.d $vr3, $a2 + vreplgr2vr.d $vr2, $a2 lu12i.w $a2, 7 ori $a2, $a2, 4095 - vreplgr2vr.d $vr4, $a2 + vreplgr2vr.d $vr3, $a2 beqz $s0, .LBB0_7 # %bb.5: # %vector.body.preheader lu12i.w $a2, 6 ori $a2, $a2, 973 - vreplgr2vr.d $vr5, $a2 - vrepli.h $vr6, 255 + vreplgr2vr.d $vr4, $a2 + vrepli.h $vr5, 255 ori $a2, $zero, 128 .p2align 4, , 16 .LBB0_6: # %vector.body # =>This Inner Loop Header: Depth=1 - vld $vr7, $fp, 0 - vilvl.w $vr8, $vr1, $vr7 - vilvh.w $vr7, $vr1, $vr7 + vld $vr6, $fp, 0 + vsllwil.du.wu $vr7, $vr6, 0 + vshuf4i.w $vr6, $vr6, 14 + vsllwil.du.wu $vr6, $vr6, 0 + vmul.d $vr8, $vr0, $vr6 vmul.d $vr9, $vr0, $vr7 - vmul.d $vr10, $vr0, $vr8 - vori.b $vr11, $vr2, 0 - vmadd.d $vr11, $vr0, $vr8 - vori.b $vr8, $vr2, 0 - vmadd.d $vr8, $vr0, $vr7 - vmuh.d $vr7, $vr8, $vr3 - vadd.d $vr7, $vr7, $vr8 - vsrli.d $vr8, $vr7, 63 + vori.b $vr10, $vr1, 0 + vmadd.d $vr10, $vr0, $vr7 + vori.b $vr7, $vr1, 0 + vmadd.d $vr7, $vr0, $vr6 + vmuh.d $vr6, $vr7, $vr2 + vadd.d $vr6, $vr6, $vr7 + vsrli.d $vr7, $vr6, 63 + vsrai.d $vr6, $vr6, 6 + vadd.d $vr6, $vr6, $vr7 + vmuh.d $vr7, $vr10, $vr2 + vadd.d $vr7, $vr7, $vr10 + vsrli.d $vr10, $vr7, 63 vsrai.d $vr7, $vr7, 6 - vadd.d $vr7, $vr7, $vr8 - vmuh.d $vr8, $vr11, $vr3 - vadd.d $vr8, $vr8, $vr11 - vsrli.d $vr11, $vr8, 63 - vsrai.d $vr8, $vr8, 6 - vadd.d $vr8, $vr8, $vr11 - vmaxi.d $vr8, $vr8, 1 + vadd.d $vr7, $vr7, $vr10 vmaxi.d $vr7, $vr7, 1 - vmin.d $vr7, $vr7, $vr4 - vmin.d $vr8, $vr8, $vr4 - vslt.d $vr10, $vr5, $vr10 - vpickve2gr.d $a3, $vr10, 0 - vinsgr2vr.h $vr11, $a3, 0 - vpickve2gr.d $a3, $vr10, 1 - vinsgr2vr.h $vr11, $a3, 1 - vslt.d $vr9, $vr5, $vr9 + vmaxi.d $vr6, $vr6, 1 + vmin.d $vr6, $vr6, 
$vr3 + vmin.d $vr7, $vr7, $vr3 + vslt.d $vr9, $vr4, $vr9 vpickve2gr.d $a3, $vr9, 0 - vinsgr2vr.h $vr11, $a3, 2 + vinsgr2vr.h $vr10, $a3, 0 vpickve2gr.d $a3, $vr9, 1 - vinsgr2vr.h $vr11, $a3, 3 + vinsgr2vr.h $vr10, $a3, 1 + vslt.d $vr8, $vr4, $vr8 vpickve2gr.d $a3, $vr8, 0 - vinsgr2vr.h $vr9, $a3, 0 + vinsgr2vr.h $vr10, $a3, 2 vpickve2gr.d $a3, $vr8, 1 - vinsgr2vr.h $vr9, $a3, 1 + vinsgr2vr.h $vr10, $a3, 3 vpickve2gr.d $a3, $vr7, 0 - vinsgr2vr.h $vr9, $a3, 2 + vinsgr2vr.h $vr8, $a3, 0 vpickve2gr.d $a3, $vr7, 1 - vinsgr2vr.h $vr9, $a3, 3 - vbitsel.v $vr7, $vr9, $vr6, $vr11 + vinsgr2vr.h $vr8, $a3, 1 + vpickve2gr.d $a3, $vr6, 0 + vinsgr2vr.h $vr8, $a3, 2 + vpickve2gr.d $a3, $vr6, 1 + vinsgr2vr.h $vr8, $a3, 3 + vbitsel.v $vr6, $vr8, $vr5, $vr10 add.d $a3, $a0, $a1 - vstelm.d $vr7, $a3, 0, 0 + vstelm.d $vr6, $a3, 0, 0 addi.d $a1, $a1, 8 addi.d $fp, $fp, 16 bne $a1, $a2, .LBB0_6 @@ -114,37 +114,38 @@ jpeg_add_quant_table: # @jpeg_add_quant_table .p2align 4, , 16 .LBB0_8: # %vector.body41 # =>This Inner Loop Header: Depth=1 - vld $vr5, $fp, 0 - vilvh.w $vr6, $vr1, $vr5 - vilvl.w $vr5, $vr1, $vr5 - vori.b $vr7, $vr2, 0 - vmadd.d $vr7, $vr0, $vr5 - vori.b $vr5, $vr2, 0 - vmadd.d $vr5, $vr0, $vr6 - vmuh.d $vr6, $vr5, $vr3 - vadd.d $vr5, $vr6, $vr5 + vld $vr4, $fp, 0 + vshuf4i.w $vr5, $vr4, 14 + vsllwil.du.wu $vr5, $vr5, 0 + vsllwil.du.wu $vr4, $vr4, 0 + vori.b $vr6, $vr1, 0 + vmadd.d $vr6, $vr0, $vr4 + vori.b $vr4, $vr1, 0 + vmadd.d $vr4, $vr0, $vr5 + vmuh.d $vr5, $vr4, $vr2 + vadd.d $vr4, $vr5, $vr4 + vsrli.d $vr5, $vr4, 63 + vsrai.d $vr4, $vr4, 6 + vadd.d $vr4, $vr4, $vr5 + vmuh.d $vr5, $vr6, $vr2 + vadd.d $vr5, $vr5, $vr6 vsrli.d $vr6, $vr5, 63 vsrai.d $vr5, $vr5, 6 vadd.d $vr5, $vr5, $vr6 - vmuh.d $vr6, $vr7, $vr3 - vadd.d $vr6, $vr6, $vr7 - vsrli.d $vr7, $vr6, 63 - vsrai.d $vr6, $vr6, 6 - vadd.d $vr6, $vr6, $vr7 - vmaxi.d $vr6, $vr6, 1 vmaxi.d $vr5, $vr5, 1 - vmin.d $vr5, $vr5, $vr4 - vmin.d $vr6, $vr6, $vr4 - vpickve2gr.d $a3, $vr6, 0 - vinsgr2vr.h $vr7, $a3, 0 - vpickve2gr.d $a3, $vr6, 1 - vinsgr2vr.h $vr7, $a3, 1 + vmaxi.d $vr4, $vr4, 1 + vmin.d $vr4, $vr4, $vr3 + vmin.d $vr5, $vr5, $vr3 vpickve2gr.d $a3, $vr5, 0 - vinsgr2vr.h $vr7, $a3, 2 + vinsgr2vr.h $vr6, $a3, 0 vpickve2gr.d $a3, $vr5, 1 - vinsgr2vr.h $vr7, $a3, 3 + vinsgr2vr.h $vr6, $a3, 1 + vpickve2gr.d $a3, $vr4, 0 + vinsgr2vr.h $vr6, $a3, 2 + vpickve2gr.d $a3, $vr4, 1 + vinsgr2vr.h $vr6, $a3, 3 add.d $a3, $a0, $a1 - vstelm.d $vr7, $a3, 0, 0 + vstelm.d $vr6, $a3, 0, 0 addi.d $a1, $a1, 8 addi.d $fp, $fp, 16 bne $a1, $a2, .LBB0_8 @@ -166,14 +167,14 @@ jpeg_add_quant_table: # @jpeg_add_quant_table .type jpeg_set_linear_quality,@function jpeg_set_linear_quality: # @jpeg_set_linear_quality # %bb.0: - addi.d $sp, $sp, -128 - st.d $ra, $sp, 120 # 8-byte Folded Spill - st.d $fp, $sp, 112 # 8-byte Folded Spill - st.d $s0, $sp, 104 # 8-byte Folded Spill - st.d $s1, $sp, 96 # 8-byte Folded Spill - st.d $s2, $sp, 88 # 8-byte Folded Spill - st.d $s3, $sp, 80 # 8-byte Folded Spill - st.d $s4, $sp, 72 # 8-byte Folded Spill + addi.d $sp, $sp, -112 + st.d $ra, $sp, 104 # 8-byte Folded Spill + st.d $fp, $sp, 96 # 8-byte Folded Spill + st.d $s0, $sp, 88 # 8-byte Folded Spill + st.d $s1, $sp, 80 # 8-byte Folded Spill + st.d $s2, $sp, 72 # 8-byte Folded Spill + st.d $s3, $sp, 64 # 8-byte Folded Spill + st.d $s4, $sp, 56 # 8-byte Folded Spill move $s0, $a0 ld.w $a0, $a0, 28 ori $a3, $zero, 100 @@ -199,12 +200,11 @@ jpeg_set_linear_quality: # @jpeg_set_linear_quality st.d $a0, $s0, 88 .LBB1_4: vreplgr2vr.d $vr0, $s1 - vrepli.b $vr9, 0 - vrepli.d $vr10, 50 + 
vrepli.d $vr9, 50 lu12i.w $s3, 461373 lu12i.w $s2, 7 lu12i.w $s4, 6 - vrepli.h $vr11, 255 + vrepli.h $vr10, 255 pcalau12i $a1, %pc_hi20(jpeg_set_linear_quality.std_luminance_quant_tbl) addi.d $a1, $a1, %pc_lo12(jpeg_set_linear_quality.std_luminance_quant_tbl) move $a2, $zero @@ -223,13 +223,14 @@ jpeg_set_linear_quality: # @jpeg_set_linear_quality .LBB1_6: # %vector.body # =>This Inner Loop Header: Depth=1 vld $vr4, $a1, 0 - vilvl.w $vr5, $vr9, $vr4 - vilvh.w $vr4, $vr9, $vr4 + vsllwil.du.wu $vr5, $vr4, 0 + vshuf4i.w $vr4, $vr4, 14 + vsllwil.du.wu $vr4, $vr4, 0 vmul.d $vr6, $vr0, $vr4 vmul.d $vr7, $vr0, $vr5 - vori.b $vr8, $vr10, 0 + vori.b $vr8, $vr9, 0 vmadd.d $vr8, $vr0, $vr5 - vori.b $vr5, $vr10, 0 + vori.b $vr5, $vr9, 0 vmadd.d $vr5, $vr0, $vr4 vmuh.d $vr4, $vr5, $vr1 vadd.d $vr4, $vr4, $vr5 @@ -263,7 +264,7 @@ jpeg_set_linear_quality: # @jpeg_set_linear_quality vinsgr2vr.h $vr6, $a4, 2 vpickve2gr.d $a4, $vr4, 1 vinsgr2vr.h $vr6, $a4, 3 - vbitsel.v $vr4, $vr6, $vr11, $vr8 + vbitsel.v $vr4, $vr6, $vr10, $vr8 add.d $a4, $a0, $a2 vstelm.d $vr4, $a4, 0, 0 addi.d $a2, $a2, 8 @@ -276,11 +277,12 @@ jpeg_set_linear_quality: # @jpeg_set_linear_quality .LBB1_8: # %vector.body36 # =>This Inner Loop Header: Depth=1 vld $vr3, $a1, 0 - vilvh.w $vr4, $vr9, $vr3 - vilvl.w $vr3, $vr9, $vr3 - vori.b $vr5, $vr10, 0 + vshuf4i.w $vr4, $vr3, 14 + vsllwil.du.wu $vr4, $vr4, 0 + vsllwil.du.wu $vr3, $vr3, 0 + vori.b $vr5, $vr9, 0 vmadd.d $vr5, $vr0, $vr3 - vori.b $vr3, $vr10, 0 + vori.b $vr3, $vr9, 0 vmadd.d $vr3, $vr0, $vr4 vmuh.d $vr4, $vr3, $vr1 vadd.d $vr3, $vr4, $vr3 @@ -322,26 +324,22 @@ jpeg_set_linear_quality: # @jpeg_set_linear_quality ori $a2, $zero, 18 st.w $a2, $a0, 40 move $a0, $s0 - vst $vr9, $sp, 48 # 16-byte Folded Spill - vst $vr10, $sp, 32 # 16-byte Folded Spill - vst $vr11, $sp, 16 # 16-byte Folded Spill + vst $vr9, $sp, 32 # 16-byte Folded Spill + vst $vr10, $sp, 16 # 16-byte Folded Spill jirl $ra, $a1, 0 - vld $vr11, $sp, 16 # 16-byte Folded Reload - vld $vr10, $sp, 32 # 16-byte Folded Reload - vld $vr9, $sp, 48 # 16-byte Folded Reload + vld $vr10, $sp, 16 # 16-byte Folded Reload + vld $vr9, $sp, 32 # 16-byte Folded Reload .LBB1_11: ld.d $a0, $s0, 96 bnez $a0, .LBB1_13 # %bb.12: move $a0, $s0 - vst $vr9, $sp, 48 # 16-byte Folded Spill - vst $vr10, $sp, 32 # 16-byte Folded Spill - vst $vr11, $sp, 16 # 16-byte Folded Spill + vst $vr9, $sp, 32 # 16-byte Folded Spill + vst $vr10, $sp, 16 # 16-byte Folded Spill pcaddu18i $ra, %call36(jpeg_alloc_quant_table) jirl $ra, $ra, 0 - vld $vr11, $sp, 16 # 16-byte Folded Reload - vld $vr10, $sp, 32 # 16-byte Folded Reload - vld $vr9, $sp, 48 # 16-byte Folded Reload + vld $vr10, $sp, 16 # 16-byte Folded Reload + vld $vr9, $sp, 32 # 16-byte Folded Reload st.d $a0, $s0, 96 .LBB1_13: vreplgr2vr.d $vr0, $s1 @@ -363,13 +361,14 @@ jpeg_set_linear_quality: # @jpeg_set_linear_quality .LBB1_15: # %vector.body45 # =>This Inner Loop Header: Depth=1 vld $vr4, $a1, 0 - vilvl.w $vr5, $vr9, $vr4 - vilvh.w $vr4, $vr9, $vr4 + vsllwil.du.wu $vr5, $vr4, 0 + vshuf4i.w $vr4, $vr4, 14 + vsllwil.du.wu $vr4, $vr4, 0 vmul.d $vr6, $vr0, $vr4 vmul.d $vr7, $vr0, $vr5 - vori.b $vr8, $vr10, 0 + vori.b $vr8, $vr9, 0 vmadd.d $vr8, $vr0, $vr5 - vori.b $vr5, $vr10, 0 + vori.b $vr5, $vr9, 0 vmadd.d $vr5, $vr0, $vr4 vmuh.d $vr4, $vr5, $vr1 vadd.d $vr4, $vr4, $vr5 @@ -403,7 +402,7 @@ jpeg_set_linear_quality: # @jpeg_set_linear_quality vinsgr2vr.h $vr6, $a4, 2 vpickve2gr.d $a4, $vr4, 1 vinsgr2vr.h $vr6, $a4, 3 - vbitsel.v $vr4, $vr6, $vr11, $vr8 + vbitsel.v $vr4, $vr6, $vr10, $vr8 add.d $a4, 
$a0, $a2 vstelm.d $vr4, $a4, 0, 0 addi.d $a2, $a2, 8 @@ -416,11 +415,12 @@ jpeg_set_linear_quality: # @jpeg_set_linear_quality .LBB1_17: # %vector.body54 # =>This Inner Loop Header: Depth=1 vld $vr3, $a1, 0 - vilvh.w $vr4, $vr9, $vr3 - vilvl.w $vr3, $vr9, $vr3 - vori.b $vr5, $vr10, 0 + vshuf4i.w $vr4, $vr3, 14 + vsllwil.du.wu $vr4, $vr4, 0 + vsllwil.du.wu $vr3, $vr3, 0 + vori.b $vr5, $vr9, 0 vmadd.d $vr5, $vr0, $vr3 - vori.b $vr3, $vr10, 0 + vori.b $vr3, $vr9, 0 vmadd.d $vr3, $vr0, $vr4 vmuh.d $vr4, $vr3, $vr1 vadd.d $vr3, $vr4, $vr3 @@ -451,14 +451,14 @@ jpeg_set_linear_quality: # @jpeg_set_linear_quality bne $a2, $a3, .LBB1_17 .LBB1_18: # %jpeg_add_quant_table.exit20 st.w $zero, $a0, 128 - ld.d $s4, $sp, 72 # 8-byte Folded Reload - ld.d $s3, $sp, 80 # 8-byte Folded Reload - ld.d $s2, $sp, 88 # 8-byte Folded Reload - ld.d $s1, $sp, 96 # 8-byte Folded Reload - ld.d $s0, $sp, 104 # 8-byte Folded Reload - ld.d $fp, $sp, 112 # 8-byte Folded Reload - ld.d $ra, $sp, 120 # 8-byte Folded Reload - addi.d $sp, $sp, 128 + ld.d $s4, $sp, 56 # 8-byte Folded Reload + ld.d $s3, $sp, 64 # 8-byte Folded Reload + ld.d $s2, $sp, 72 # 8-byte Folded Reload + ld.d $s1, $sp, 80 # 8-byte Folded Reload + ld.d $s0, $sp, 88 # 8-byte Folded Reload + ld.d $fp, $sp, 96 # 8-byte Folded Reload + ld.d $ra, $sp, 104 # 8-byte Folded Reload + addi.d $sp, $sp, 112 ret .Lfunc_end1: .size jpeg_set_linear_quality, .Lfunc_end1-jpeg_set_linear_quality diff --git a/results/MultiSource/Benchmarks/MiBench/consumer-jpeg/CMakeFiles/consumer-jpeg.dir/jcsample.s b/results/MultiSource/Benchmarks/MiBench/consumer-jpeg/CMakeFiles/consumer-jpeg.dir/jcsample.s index e30baf29..67133dd4 100644 --- a/results/MultiSource/Benchmarks/MiBench/consumer-jpeg/CMakeFiles/consumer-jpeg.dir/jcsample.s +++ b/results/MultiSource/Benchmarks/MiBench/consumer-jpeg/CMakeFiles/consumer-jpeg.dir/jcsample.s @@ -258,11 +258,6 @@ sep_downsample: # @sep_downsample .word 7 # 0x7 .word 4294967295 # 0xffffffff .word 4294967295 # 0xffffffff -.LCPI3_1: - .word 1 # 0x1 - .word 5 # 0x5 - .word 2 # 0x2 - .word 7 # 0x7 .text .p2align 5 .type fullsize_smooth_downsample,@function @@ -330,8 +325,7 @@ fullsize_smooth_downsample: # @fullsize_smooth_downsample vreplgr2vr.d $vr1, $a2 lu12i.w $t1, 8 ori $t2, $zero, 16 - vrepli.b $vr2, 0 - vreplgr2vr.d $vr3, $t1 + vreplgr2vr.d $vr2, $t1 .p2align 4, , 16 .LBB3_6: # =>This Loop Header: Depth=1 # Child Loop BB3_12 Depth 2 @@ -404,169 +398,200 @@ fullsize_smooth_downsample: # @fullsize_smooth_downsample add.d $t6, $t6, $a7 add.d $t5, $t5, $a7 add.d $t3, $t3, $a7 - vinsgr2vr.w $vr5, $s3, 3 - vinsgr2vr.w $vr4, $s4, 3 + vinsgr2vr.w $vr6, $s3, 3 + vinsgr2vr.w $vr3, $s4, 3 addi.d $s3, $s5, 2 move $s4, $a7 .p2align 4, , 16 .LBB3_12: # %vector.body # Parent Loop BB3_6 Depth=1 # => This Inner Loop Header: Depth=2 - vld $vr6, $s3, -1 - vld $vr9, $t8, 0 - vilvl.b $vr7, $vr2, $vr6 - vilvl.h $vr8, $vr2, $vr7 - vilvh.b $vr10, $vr2, $vr9 - vilvh.h $vr11, $vr2, $vr10 - vilvl.h $vr10, $vr2, $vr10 - vld $vr12, $s2, 0 - vilvl.b $vr9, $vr2, $vr9 - vilvh.h $vr13, $vr2, $vr9 - vilvl.h $vr9, $vr2, $vr9 - vilvh.b $vr14, $vr2, $vr12 - vilvh.h $vr15, $vr2, $vr14 - vilvl.h $vr14, $vr2, $vr14 - vilvl.b $vr12, $vr2, $vr12 - vilvh.h $vr16, $vr2, $vr12 - vilvl.h $vr12, $vr2, $vr12 - vadd.w $vr12, $vr12, $vr9 - vld $vr9, $s3, 0 - vadd.w $vr13, $vr16, $vr13 + vld $vr5, $s3, -1 + vld $vr7, $t8, 0 + vsllwil.hu.bu $vr4, $vr5, 0 + vsllwil.wu.hu $vr4, $vr4, 0 + vsllwil.du.wu $vr4, $vr4, 0 + vbsrl.v $vr8, $vr7, 12 + vsllwil.hu.bu $vr8, $vr8, 0 + vsllwil.wu.hu $vr8, $vr8, 0 + 
vbsrl.v $vr9, $vr7, 8 + vsllwil.hu.bu $vr9, $vr9, 0 + vsllwil.wu.hu $vr9, $vr9, 0 + vsrli.d $vr10, $vr7, 32 + vsllwil.hu.bu $vr10, $vr10, 0 + vld $vr11, $s2, 0 + vsllwil.wu.hu $vr10, $vr10, 0 + vsllwil.hu.bu $vr7, $vr7, 0 + vsllwil.wu.hu $vr7, $vr7, 0 + vbsrl.v $vr12, $vr11, 12 + vsllwil.hu.bu $vr12, $vr12, 0 + vsllwil.wu.hu $vr12, $vr12, 0 + vbsrl.v $vr13, $vr11, 8 + vsllwil.hu.bu $vr13, $vr13, 0 + vsllwil.wu.hu $vr13, $vr13, 0 + vsrli.d $vr14, $vr11, 32 + vsllwil.hu.bu $vr14, $vr14, 0 + vsllwil.wu.hu $vr14, $vr14, 0 + vsllwil.hu.bu $vr11, $vr11, 0 + vsllwil.wu.hu $vr11, $vr11, 0 + vadd.w $vr11, $vr11, $vr7 + vld $vr7, $s3, 0 vadd.w $vr10, $vr14, $vr10 - vadd.w $vr11, $vr15, $vr11 - vilvl.b $vr14, $vr2, $vr9 - vilvl.h $vr15, $vr2, $vr14 - vilvh.h $vr14, $vr2, $vr14 - vilvh.b $vr9, $vr2, $vr9 + vadd.w $vr9, $vr13, $vr9 + vadd.w $vr8, $vr12, $vr8 + vsllwil.hu.bu $vr12, $vr7, 0 + vsllwil.wu.hu $vr12, $vr12, 0 + vsrli.d $vr13, $vr7, 32 + vsllwil.hu.bu $vr13, $vr13, 0 + vsllwil.wu.hu $vr13, $vr13, 0 + vbsrl.v $vr14, $vr7, 8 + vsllwil.hu.bu $vr14, $vr14, 0 + vsllwil.wu.hu $vr14, $vr14, 0 + vbsrl.v $vr7, $vr7, 12 pcalau12i $s5, %pc_hi20(.LCPI3_0) - vld $vr16, $s5, %pc_lo12(.LCPI3_0) - vilvl.h $vr17, $vr2, $vr9 - vilvh.h $vr9, $vr2, $vr9 - vbsrl.v $vr18, $vr4, 12 - vshuf.w $vr16, $vr4, $vr5 - vadd.w $vr4, $vr11, $vr9 - vadd.w $vr9, $vr10, $vr17 - vadd.w $vr10, $vr13, $vr14 - vadd.w $vr13, $vr12, $vr15 - vbsrl.v $vr5, $vr13, 12 - vbsll.v $vr11, $vr10, 4 - vor.v $vr14, $vr11, $vr5 - vbsrl.v $vr5, $vr10, 12 - vbsll.v $vr11, $vr9, 4 - vor.v $vr17, $vr11, $vr5 - vbsrl.v $vr5, $vr9, 12 - vbsll.v $vr11, $vr4, 4 - vor.v $vr5, $vr11, $vr5 - vbsll.v $vr11, $vr13, 4 - vor.v $vr18, $vr11, $vr18 - vshuf4i.w $vr15, $vr13, 14 - vshuf4i.w $vr11, $vr10, 14 - pcalau12i $s5, %pc_hi20(.LCPI3_1) - vld $vr19, $s5, %pc_lo12(.LCPI3_1) - vshuf4i.w $vr12, $vr9, 14 - vilvl.w $vr20, $vr2, $vr12 - vilvl.w $vr21, $vr2, $vr13 - vilvl.w $vr16, $vr2, $vr16 - vori.b $vr22, $vr19, 0 - vori.b $vr23, $vr19, 0 - vilvl.w $vr18, $vr2, $vr18 - vadd.d $vr16, $vr16, $vr18 - vori.b $vr18, $vr19, 0 - vshuf.w $vr19, $vr2, $vr13 - vadd.d $vr19, $vr21, $vr19 - vilvl.w $vr21, $vr2, $vr15 - vilvl.w $vr14, $vr2, $vr14 - vadd.d $vr21, $vr21, $vr14 - vilvl.w $vr14, $vr2, $vr10 - vshuf.w $vr18, $vr2, $vr10 - vadd.d $vr18, $vr14, $vr18 - vilvl.w $vr14, $vr2, $vr11 - vilvl.w $vr17, $vr2, $vr17 - vadd.d $vr17, $vr14, $vr17 - vilvl.w $vr14, $vr2, $vr9 - vshuf.w $vr23, $vr2, $vr9 - vadd.d $vr23, $vr14, $vr23 - vilvl.w $vr14, $vr2, $vr5 - vadd.d $vr20, $vr20, $vr14 - vilvl.w $vr14, $vr2, $vr4 - vshuf.w $vr22, $vr2, $vr4 - vadd.d $vr22, $vr14, $vr22 - vilvl.w $vr14, $vr2, $vr8 + vld $vr15, $s5, %pc_lo12(.LCPI3_0) + vsllwil.hu.bu $vr7, $vr7, 0 + vsllwil.wu.hu $vr7, $vr7, 0 + vbsrl.v $vr16, $vr3, 12 + vshuf.w $vr15, $vr3, $vr6 + vadd.w $vr3, $vr8, $vr7 + vadd.w $vr7, $vr9, $vr14 + vadd.w $vr9, $vr10, $vr13 + vadd.w $vr12, $vr11, $vr12 + vbsrl.v $vr6, $vr12, 12 + vbsll.v $vr8, $vr9, 4 + vor.v $vr11, $vr8, $vr6 + vbsrl.v $vr6, $vr9, 12 + vbsll.v $vr8, $vr7, 4 + vor.v $vr14, $vr8, $vr6 + vbsrl.v $vr6, $vr7, 12 + vbsll.v $vr8, $vr3, 4 + vor.v $vr6, $vr8, $vr6 + vbsll.v $vr8, $vr12, 4 + vor.v $vr16, $vr8, $vr16 + vshuf4i.w $vr13, $vr12, 14 + vshuf4i.w $vr10, $vr9, 14 + vshuf4i.w $vr8, $vr7, 14 + vsllwil.du.wu $vr15, $vr15, 0 + vsllwil.du.wu $vr17, $vr3, 0 + vsllwil.du.wu $vr18, $vr8, 0 + vsllwil.du.wu $vr19, $vr7, 0 + vsllwil.du.wu $vr20, $vr9, 0 + vsllwil.du.wu $vr21, $vr12, 0 + vsllwil.du.wu $vr16, $vr16, 0 + vadd.d $vr15, $vr15, $vr16 + vshuf4i.w $vr16, $vr12, 
9 + vsllwil.du.wu $vr16, $vr16, 0 + vadd.d $vr16, $vr21, $vr16 + vsllwil.du.wu $vr21, $vr13, 0 + vsllwil.du.wu $vr11, $vr11, 0 + vadd.d $vr21, $vr21, $vr11 + vshuf4i.w $vr11, $vr9, 9 + vsllwil.du.wu $vr11, $vr11, 0 + vadd.d $vr20, $vr20, $vr11 + vsllwil.du.wu $vr11, $vr10, 0 + vsllwil.du.wu $vr14, $vr14, 0 + vadd.d $vr14, $vr11, $vr14 + vshuf4i.w $vr11, $vr7, 9 + vsllwil.du.wu $vr11, $vr11, 0 + vadd.d $vr19, $vr19, $vr11 + vsllwil.du.wu $vr11, $vr6, 0 + vadd.d $vr18, $vr18, $vr11 + vshuf4i.w $vr11, $vr3, 9 + vsllwil.du.wu $vr11, $vr11, 0 + vadd.d $vr17, $vr17, $vr11 + vshuf4i.b $vr11, $vr5, 14 + vsllwil.hu.bu $vr11, $vr11, 0 + vsllwil.wu.hu $vr11, $vr11, 0 + vsllwil.du.wu $vr11, $vr11, 0 + vshuf4i.w $vr12, $vr12, 16 + vsub.d $vr15, $vr15, $vr4 + vadd.d $vr12, $vr15, $vr12 + vsrli.d $vr15, $vr5, 32 + vsllwil.hu.bu $vr15, $vr15, 0 + vsllwil.wu.hu $vr15, $vr15, 0 + vsllwil.du.wu $vr15, $vr15, 0 vshuf4i.w $vr13, $vr13, 16 - vsub.d $vr16, $vr16, $vr14 + vsub.d $vr16, $vr16, $vr11 vadd.d $vr13, $vr16, $vr13 - vilvh.w $vr8, $vr2, $vr8 - vilvh.h $vr7, $vr2, $vr7 - vshuf4i.w $vr15, $vr15, 16 - vsub.d $vr16, $vr19, $vr8 - vadd.d $vr15, $vr16, $vr15 - vilvl.w $vr16, $vr2, $vr7 - vilvh.w $vr7, $vr2, $vr7 - vilvh.b $vr6, $vr2, $vr6 - vshuf4i.w $vr10, $vr10, 16 - vsub.d $vr19, $vr21, $vr16 - vadd.d $vr10, $vr19, $vr10 - vilvl.h $vr19, $vr2, $vr6 - vshuf4i.w $vr11, $vr11, 16 - vsub.d $vr18, $vr18, $vr7 - vadd.d $vr11, $vr18, $vr11 - vilvl.w $vr18, $vr2, $vr19 - vilvh.w $vr19, $vr2, $vr19 - vilvh.h $vr6, $vr2, $vr6 + vsrli.d $vr16, $vr5, 48 + vsllwil.hu.bu $vr16, $vr16, 0 + vsllwil.wu.hu $vr16, $vr16, 0 + vsllwil.du.wu $vr16, $vr16, 0 vshuf4i.w $vr9, $vr9, 16 - vsub.d $vr17, $vr17, $vr18 - vadd.d $vr9, $vr17, $vr9 - vilvl.w $vr17, $vr2, $vr6 - vilvh.w $vr6, $vr2, $vr6 - vshuf4i.w $vr12, $vr12, 16 - vsub.d $vr21, $vr23, $vr19 - vadd.d $vr12, $vr21, $vr12 - vshuf4i.w $vr21, $vr4, 16 - vsub.d $vr20, $vr20, $vr17 - vadd.d $vr20, $vr20, $vr21 - vshuf4i.w $vr21, $vr4, 50 - vsub.d $vr22, $vr22, $vr6 - vadd.d $vr21, $vr22, $vr21 - vmul.d $vr6, $vr0, $vr6 - vmadd.d $vr6, $vr21, $vr1 - vmul.d $vr17, $vr0, $vr17 - vmadd.d $vr17, $vr20, $vr1 - vmul.d $vr19, $vr0, $vr19 - vmadd.d $vr19, $vr12, $vr1 - vmul.d $vr12, $vr0, $vr18 - vmadd.d $vr12, $vr9, $vr1 - vmul.d $vr7, $vr0, $vr7 - vmadd.d $vr7, $vr11, $vr1 - vmul.d $vr9, $vr0, $vr16 - vmadd.d $vr9, $vr10, $vr1 - vmul.d $vr8, $vr0, $vr8 - vmadd.d $vr8, $vr15, $vr1 - vmul.d $vr10, $vr0, $vr14 - vmadd.d $vr10, $vr13, $vr1 - vadd.d $vr10, $vr10, $vr3 - vadd.d $vr8, $vr8, $vr3 - vsrli.d $vr8, $vr8, 16 - vsrli.d $vr10, $vr10, 16 - vpickev.w $vr8, $vr8, $vr10 - vadd.d $vr9, $vr9, $vr3 - vadd.d $vr7, $vr7, $vr3 + vsub.d $vr21, $vr21, $vr15 + vadd.d $vr9, $vr21, $vr9 + vbsrl.v $vr21, $vr5, 8 + vsllwil.hu.bu $vr21, $vr21, 0 + vsllwil.wu.hu $vr21, $vr21, 0 + vsllwil.du.wu $vr21, $vr21, 0 + vshuf4i.w $vr10, $vr10, 16 + vsub.d $vr20, $vr20, $vr16 + vadd.d $vr10, $vr20, $vr10 + vbsrl.v $vr20, $vr5, 10 + vsllwil.hu.bu $vr20, $vr20, 0 + vsllwil.wu.hu $vr20, $vr20, 0 + vsllwil.du.wu $vr20, $vr20, 0 + vshuf4i.w $vr7, $vr7, 16 + vsub.d $vr14, $vr14, $vr21 + vadd.d $vr7, $vr14, $vr7 + vbsrl.v $vr14, $vr5, 12 + vsllwil.hu.bu $vr14, $vr14, 0 + vsllwil.wu.hu $vr14, $vr14, 0 + vsllwil.du.wu $vr14, $vr14, 0 + vbsrl.v $vr5, $vr5, 14 + vsllwil.hu.bu $vr5, $vr5, 0 + vsllwil.wu.hu $vr5, $vr5, 0 + vsllwil.du.wu $vr5, $vr5, 0 + vshuf4i.w $vr8, $vr8, 16 + vsub.d $vr19, $vr19, $vr20 + vadd.d $vr8, $vr19, $vr8 + vshuf4i.w $vr19, $vr3, 16 + vsub.d $vr18, $vr18, $vr14 + vadd.d $vr18, $vr18, $vr19 + 
vshuf4i.w $vr19, $vr3, 50 + vsub.d $vr17, $vr17, $vr5 + vadd.d $vr17, $vr17, $vr19 + vmul.d $vr5, $vr0, $vr5 + vmadd.d $vr5, $vr17, $vr1 + vmul.d $vr14, $vr0, $vr14 + vmadd.d $vr14, $vr18, $vr1 + vmul.d $vr17, $vr0, $vr20 + vmadd.d $vr17, $vr8, $vr1 + vmul.d $vr8, $vr0, $vr21 + vmadd.d $vr8, $vr7, $vr1 + vmul.d $vr7, $vr0, $vr16 + vmadd.d $vr7, $vr10, $vr1 + vmul.d $vr10, $vr0, $vr15 + vmadd.d $vr10, $vr9, $vr1 + vmul.d $vr9, $vr0, $vr11 + vmadd.d $vr9, $vr13, $vr1 + vmul.d $vr4, $vr0, $vr4 + vmadd.d $vr4, $vr12, $vr1 + vadd.d $vr4, $vr4, $vr2 + vadd.d $vr9, $vr9, $vr2 + vsrli.d $vr9, $vr9, 16 + vsrli.d $vr4, $vr4, 16 + vpickev.w $vr4, $vr9, $vr4 + vadd.d $vr9, $vr10, $vr2 + vadd.d $vr7, $vr7, $vr2 vsrli.d $vr7, $vr7, 16 vsrli.d $vr9, $vr9, 16 vpickev.w $vr7, $vr7, $vr9 - vpickev.h $vr7, $vr7, $vr8 - vadd.d $vr8, $vr12, $vr3 - vadd.d $vr9, $vr19, $vr3 - vsrli.d $vr9, $vr9, 16 + vpickev.h $vr4, $vr7, $vr4 + vadd.d $vr7, $vr8, $vr2 + vadd.d $vr8, $vr17, $vr2 vsrli.d $vr8, $vr8, 16 - vpickev.w $vr8, $vr9, $vr8 - vadd.d $vr9, $vr17, $vr3 - vadd.d $vr6, $vr6, $vr3 - vsrli.d $vr6, $vr6, 16 - vsrli.d $vr9, $vr9, 16 - vpickev.w $vr6, $vr6, $vr9 - vpickev.h $vr6, $vr6, $vr8 - vpickev.b $vr6, $vr6, $vr7 - vst $vr6, $t7, 0 + vsrli.d $vr7, $vr7, 16 + vpickev.w $vr7, $vr8, $vr7 + vadd.d $vr8, $vr14, $vr2 + vadd.d $vr5, $vr5, $vr2 + vsrli.d $vr5, $vr5, 16 + vsrli.d $vr8, $vr8, 16 + vpickev.w $vr5, $vr5, $vr8 + vpickev.h $vr5, $vr5, $vr7 + vpickev.b $vr4, $vr5, $vr4 + vst $vr4, $t7, 0 addi.d $s4, $s4, -16 addi.d $s3, $s3, 16 addi.d $t8, $t8, 16 @@ -575,8 +600,8 @@ fullsize_smooth_downsample: # @fullsize_smooth_downsample bnez $s4, .LBB3_12 # %bb.13: # %middle.block # in Loop: Header=BB3_6 Depth=1 - vpickve2gr.w $s3, $vr4, 2 - vpickve2gr.w $s4, $vr4, 3 + vpickve2gr.w $s3, $vr3, 2 + vpickve2gr.w $s4, $vr3, 3 move $t7, $t0 .LBB3_14: # %scalar.ph.preheader # in Loop: Header=BB3_6 Depth=1 @@ -1245,12 +1270,12 @@ int_downsample: # @int_downsample ld.h $s4, $t7, 0 vinsgr2vr.h $vr3, $s2, 0 vinsgr2vr.h $vr4, $s4, 0 - vilvl.b $vr3, $vr0, $vr3 - vilvl.h $vr3, $vr0, $vr3 - vilvl.w $vr3, $vr0, $vr3 - vilvl.b $vr4, $vr0, $vr4 - vilvl.h $vr4, $vr0, $vr4 - vilvl.w $vr4, $vr0, $vr4 + vsllwil.hu.bu $vr3, $vr3, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vsllwil.du.wu $vr3, $vr3, 0 + vsllwil.hu.bu $vr4, $vr4, 0 + vsllwil.wu.hu $vr4, $vr4, 0 + vsllwil.du.wu $vr4, $vr4, 0 vadd.d $vr1, $vr1, $vr3 vadd.d $vr2, $vr2, $vr4 addi.d $t8, $t8, -4 diff --git a/results/MultiSource/Benchmarks/MiBench/consumer-jpeg/CMakeFiles/consumer-jpeg.dir/jddctmgr.s b/results/MultiSource/Benchmarks/MiBench/consumer-jpeg/CMakeFiles/consumer-jpeg.dir/jddctmgr.s index 6aaa75ce..96a42439 100644 --- a/results/MultiSource/Benchmarks/MiBench/consumer-jpeg/CMakeFiles/consumer-jpeg.dir/jddctmgr.s +++ b/results/MultiSource/Benchmarks/MiBench/consumer-jpeg/CMakeFiles/consumer-jpeg.dir/jddctmgr.s @@ -82,24 +82,24 @@ jinit_inverse_dct: # @jinit_inverse_dct .type start_pass,@function start_pass: # @start_pass # %bb.0: - addi.d $sp, $sp, -192 - st.d $ra, $sp, 184 # 8-byte Folded Spill - st.d $fp, $sp, 176 # 8-byte Folded Spill - st.d $s0, $sp, 168 # 8-byte Folded Spill - st.d $s1, $sp, 160 # 8-byte Folded Spill - st.d $s2, $sp, 152 # 8-byte Folded Spill - st.d $s3, $sp, 144 # 8-byte Folded Spill - st.d $s4, $sp, 136 # 8-byte Folded Spill - st.d $s5, $sp, 128 # 8-byte Folded Spill - st.d $s6, $sp, 120 # 8-byte Folded Spill - st.d $s7, $sp, 112 # 8-byte Folded Spill - st.d $s8, $sp, 104 # 8-byte Folded Spill - fst.d $fs0, $sp, 96 # 8-byte Folded Spill - fst.d $fs1, $sp, 88 # 
8-byte Folded Spill - fst.d $fs2, $sp, 80 # 8-byte Folded Spill - fst.d $fs3, $sp, 72 # 8-byte Folded Spill - fst.d $fs4, $sp, 64 # 8-byte Folded Spill - fst.d $fs5, $sp, 56 # 8-byte Folded Spill + addi.d $sp, $sp, -176 + st.d $ra, $sp, 168 # 8-byte Folded Spill + st.d $fp, $sp, 160 # 8-byte Folded Spill + st.d $s0, $sp, 152 # 8-byte Folded Spill + st.d $s1, $sp, 144 # 8-byte Folded Spill + st.d $s2, $sp, 136 # 8-byte Folded Spill + st.d $s3, $sp, 128 # 8-byte Folded Spill + st.d $s4, $sp, 120 # 8-byte Folded Spill + st.d $s5, $sp, 112 # 8-byte Folded Spill + st.d $s6, $sp, 104 # 8-byte Folded Spill + st.d $s7, $sp, 96 # 8-byte Folded Spill + st.d $s8, $sp, 88 # 8-byte Folded Spill + fst.d $fs0, $sp, 80 # 8-byte Folded Spill + fst.d $fs1, $sp, 72 # 8-byte Folded Spill + fst.d $fs2, $sp, 64 # 8-byte Folded Spill + fst.d $fs3, $sp, 56 # 8-byte Folded Spill + fst.d $fs4, $sp, 48 # 8-byte Folded Spill + fst.d $fs5, $sp, 40 # 8-byte Folded Spill move $fp, $a0 ld.w $a0, $a0, 48 blez $a0, .LBB1_21 @@ -110,7 +110,6 @@ start_pass: # @start_pass addi.d $a6, $a0, 88 pcalau12i $a0, %got_pc_hi20(jpeg_idct_1x1) ld.d $s3, $a0, %got_pc_lo12(jpeg_idct_1x1) - ori $a7, $zero, 7 pcalau12i $a0, %pc_hi20(.LCPI1_0) fld.d $fs0, $a0, %pc_lo12(.LCPI1_0) pcalau12i $a0, %pc_hi20(.LCPI1_1) @@ -123,11 +122,11 @@ start_pass: # @start_pass fld.d $fs4, $a0, %pc_lo12(.LCPI1_4) pcalau12i $a0, %pc_hi20(.LCPI1_5) fld.d $fs5, $a0, %pc_lo12(.LCPI1_5) - vrepli.b $vr5, 0 + ori $a7, $zero, 7 pcalau12i $a0, %pc_hi20(start_pass.aanscales) addi.d $s7, $a0, %pc_lo12(start_pass.aanscales) ori $a0, $zero, 2048 - vreplgr2vr.d $vr6, $a0 + vreplgr2vr.d $vr5, $a0 ori $s8, $zero, 128 pcalau12i $a0, %pc_hi20(.LJTI1_0) addi.d $s6, $a0, %pc_lo12(.LJTI1_0) @@ -137,9 +136,8 @@ start_pass: # @start_pass move $s5, $zero move $a0, $zero move $a1, $zero - st.d $a6, $sp, 48 # 8-byte Folded Spill - vst $vr5, $sp, 32 # 16-byte Folded Spill - vst $vr6, $sp, 16 # 16-byte Folded Spill + st.d $a6, $sp, 32 # 8-byte Folded Spill + vst $vr5, $sp, 16 # 16-byte Folded Spill b .LBB1_4 .LBB1_2: # %vector.body # in Loop: Header=BB1_4 Depth=1 @@ -147,40 +145,40 @@ start_pass: # @start_pass ld.d $a5, $a3, 8 vinsgr2vr.d $vr0, $a4, 0 vinsgr2vr.d $vr1, $a5, 0 - vilvl.h $vr0, $vr5, $vr0 - vilvl.h $vr1, $vr5, $vr1 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.wu.hu $vr1, $vr1, 0 ld.d $a4, $a3, 16 ld.d $a5, $a3, 24 vst $vr0, $a2, 0 vst $vr1, $a2, 16 vinsgr2vr.d $vr0, $a4, 0 vinsgr2vr.d $vr1, $a5, 0 - vilvl.h $vr0, $vr5, $vr0 - vilvl.h $vr1, $vr5, $vr1 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.wu.hu $vr1, $vr1, 0 ld.d $a4, $a3, 32 ld.d $a5, $a3, 40 vst $vr0, $a2, 32 vst $vr1, $a2, 48 vinsgr2vr.d $vr0, $a4, 0 vinsgr2vr.d $vr1, $a5, 0 - vilvl.h $vr0, $vr5, $vr0 - vilvl.h $vr1, $vr5, $vr1 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.wu.hu $vr1, $vr1, 0 ld.d $a4, $a3, 48 ld.d $a5, $a3, 56 vst $vr0, $a2, 64 vst $vr1, $a2, 80 vinsgr2vr.d $vr0, $a4, 0 vinsgr2vr.d $vr1, $a5, 0 - vilvl.h $vr0, $vr5, $vr0 - vilvl.h $vr1, $vr5, $vr1 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.wu.hu $vr1, $vr1, 0 ld.d $a4, $a3, 64 ld.d $a5, $a3, 72 vst $vr0, $a2, 96 vst $vr1, $a2, 112 vinsgr2vr.d $vr0, $a4, 0 vinsgr2vr.d $vr1, $a5, 0 - vilvl.h $vr0, $vr5, $vr0 - vilvl.h $vr1, $vr5, $vr1 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.wu.hu $vr1, $vr1, 0 ld.d $a4, $a3, 80 ld.d $a5, $a3, 88 vst $vr0, $a2, 128 @@ -188,23 +186,23 @@ start_pass: # @start_pass vinsgr2vr.d $vr0, $a4, 0 vinsgr2vr.d $vr1, $a5, 0 ld.d $a4, $a3, 96 - vilvl.h $vr0, $vr5, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vst $vr0, $a2, 160 ld.d $a5, $a3, 104 vinsgr2vr.d 
$vr0, $a4, 0 - vilvl.h $vr1, $vr5, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vst $vr1, $a2, 176 vinsgr2vr.d $vr1, $a5, 0 - vilvl.h $vr0, $vr5, $vr0 - vilvl.h $vr1, $vr5, $vr1 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.wu.hu $vr1, $vr1, 0 ld.d $a4, $a3, 112 ld.d $a3, $a3, 120 vst $vr0, $a2, 192 vst $vr1, $a2, 208 vinsgr2vr.d $vr0, $a4, 0 vinsgr2vr.d $vr1, $a3, 0 - vilvl.h $vr0, $vr5, $vr0 - vilvl.h $vr1, $vr5, $vr1 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.wu.hu $vr1, $vr1, 0 vst $vr0, $a2, 224 vst $vr1, $a2, 240 .p2align 4, , 16 @@ -264,10 +262,9 @@ start_pass: # @start_pass st.w $a2, $a0, 40 move $a0, $fp jirl $ra, $a1, 0 - vld $vr6, $sp, 16 # 16-byte Folded Reload - vld $vr5, $sp, 32 # 16-byte Folded Reload + vld $vr5, $sp, 16 # 16-byte Folded Reload ori $a7, $zero, 7 - ld.d $a6, $sp, 48 # 8-byte Folded Reload + ld.d $a6, $sp, 32 # 8-byte Folded Reload move $a1, $s2 move $a0, $s4 .LBB1_13: # in Loop: Header=BB1_4 Depth=1 @@ -714,21 +711,21 @@ start_pass: # @start_pass # => This Inner Loop Header: Depth=2 ldx.d $a5, $a3, $a4 vinsgr2vr.d $vr0, $a5, 0 + vsllwil.wu.hu $vr1, $vr0, 0 + vsllwil.du.wu $vr1, $vr1, 0 ldx.d $a5, $s7, $a4 - vilvl.h $vr0, $vr5, $vr0 - vilvl.w $vr1, $vr5, $vr0 - vilvh.w $vr0, $vr5, $vr0 + vshuf4i.h $vr0, $vr0, 14 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.du.wu $vr0, $vr0, 0 vinsgr2vr.d $vr2, $a5, 0 - vilvl.h $vr2, $vr2, $vr2 - vilvl.w $vr3, $vr2, $vr2 - vslli.d $vr3, $vr3, 48 - vsrai.d $vr3, $vr3, 48 - vilvh.w $vr2, $vr2, $vr2 - vslli.d $vr2, $vr2, 48 - vsrai.d $vr2, $vr2, 48 - vori.b $vr4, $vr6, 0 + vsllwil.w.h $vr3, $vr2, 0 + vsllwil.d.w $vr3, $vr3, 0 + vshuf4i.h $vr2, $vr2, 14 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.d.w $vr2, $vr2, 0 + vori.b $vr4, $vr5, 0 vmadd.d $vr4, $vr2, $vr0 - vori.b $vr0, $vr6, 0 + vori.b $vr0, $vr5, 0 vmadd.d $vr0, $vr3, $vr1 vsrli.d $vr0, $vr0, 12 vsrli.d $vr1, $vr4, 12 @@ -739,24 +736,24 @@ start_pass: # @start_pass bne $a4, $s8, .LBB1_20 b .LBB1_3 .LBB1_21: # %._crit_edge - fld.d $fs5, $sp, 56 # 8-byte Folded Reload - fld.d $fs4, $sp, 64 # 8-byte Folded Reload - fld.d $fs3, $sp, 72 # 8-byte Folded Reload - fld.d $fs2, $sp, 80 # 8-byte Folded Reload - fld.d $fs1, $sp, 88 # 8-byte Folded Reload - fld.d $fs0, $sp, 96 # 8-byte Folded Reload - ld.d $s8, $sp, 104 # 8-byte Folded Reload - ld.d $s7, $sp, 112 # 8-byte Folded Reload - ld.d $s6, $sp, 120 # 8-byte Folded Reload - ld.d $s5, $sp, 128 # 8-byte Folded Reload - ld.d $s4, $sp, 136 # 8-byte Folded Reload - ld.d $s3, $sp, 144 # 8-byte Folded Reload - ld.d $s2, $sp, 152 # 8-byte Folded Reload - ld.d $s1, $sp, 160 # 8-byte Folded Reload - ld.d $s0, $sp, 168 # 8-byte Folded Reload - ld.d $fp, $sp, 176 # 8-byte Folded Reload - ld.d $ra, $sp, 184 # 8-byte Folded Reload - addi.d $sp, $sp, 192 + fld.d $fs5, $sp, 40 # 8-byte Folded Reload + fld.d $fs4, $sp, 48 # 8-byte Folded Reload + fld.d $fs3, $sp, 56 # 8-byte Folded Reload + fld.d $fs2, $sp, 64 # 8-byte Folded Reload + fld.d $fs1, $sp, 72 # 8-byte Folded Reload + fld.d $fs0, $sp, 80 # 8-byte Folded Reload + ld.d $s8, $sp, 88 # 8-byte Folded Reload + ld.d $s7, $sp, 96 # 8-byte Folded Reload + ld.d $s6, $sp, 104 # 8-byte Folded Reload + ld.d $s5, $sp, 112 # 8-byte Folded Reload + ld.d $s4, $sp, 120 # 8-byte Folded Reload + ld.d $s3, $sp, 128 # 8-byte Folded Reload + ld.d $s2, $sp, 136 # 8-byte Folded Reload + ld.d $s1, $sp, 144 # 8-byte Folded Reload + ld.d $s0, $sp, 152 # 8-byte Folded Reload + ld.d $fp, $sp, 160 # 8-byte Folded Reload + ld.d $ra, $sp, 168 # 8-byte Folded Reload + addi.d $sp, $sp, 176 ret .Lfunc_end1: .size start_pass, .Lfunc_end1-start_pass 
diff --git a/results/MultiSource/Benchmarks/MiBench/consumer-jpeg/CMakeFiles/consumer-jpeg.dir/jdmarker.s b/results/MultiSource/Benchmarks/MiBench/consumer-jpeg/CMakeFiles/consumer-jpeg.dir/jdmarker.s index 7d7bcfd9..45aa1368 100644 --- a/results/MultiSource/Benchmarks/MiBench/consumer-jpeg/CMakeFiles/consumer-jpeg.dir/jdmarker.s +++ b/results/MultiSource/Benchmarks/MiBench/consumer-jpeg/CMakeFiles/consumer-jpeg.dir/jdmarker.s @@ -364,46 +364,44 @@ reset_marker_reader: # @reset_marker_reader .type read_markers,@function read_markers: # @read_markers # %bb.0: - addi.d $sp, $sp, -592 - st.d $ra, $sp, 584 # 8-byte Folded Spill - st.d $fp, $sp, 576 # 8-byte Folded Spill - st.d $s0, $sp, 568 # 8-byte Folded Spill - st.d $s1, $sp, 560 # 8-byte Folded Spill - st.d $s2, $sp, 552 # 8-byte Folded Spill - st.d $s3, $sp, 544 # 8-byte Folded Spill - st.d $s4, $sp, 536 # 8-byte Folded Spill - st.d $s5, $sp, 528 # 8-byte Folded Spill - st.d $s6, $sp, 520 # 8-byte Folded Spill - st.d $s7, $sp, 512 # 8-byte Folded Spill - st.d $s8, $sp, 504 # 8-byte Folded Spill + addi.d $sp, $sp, -576 + st.d $ra, $sp, 568 # 8-byte Folded Spill + st.d $fp, $sp, 560 # 8-byte Folded Spill + st.d $s0, $sp, 552 # 8-byte Folded Spill + st.d $s1, $sp, 544 # 8-byte Folded Spill + st.d $s2, $sp, 536 # 8-byte Folded Spill + st.d $s3, $sp, 528 # 8-byte Folded Spill + st.d $s4, $sp, 520 # 8-byte Folded Spill + st.d $s5, $sp, 512 # 8-byte Folded Spill + st.d $s6, $sp, 504 # 8-byte Folded Spill + st.d $s7, $sp, 496 # 8-byte Folded Spill + st.d $s8, $sp, 488 # 8-byte Folded Spill move $fp, $a0 addi.d $a0, $a0, 360 - st.d $a0, $sp, 192 # 8-byte Folded Spill - addi.d $a0, $fp, 192 st.d $a0, $sp, 200 # 8-byte Folded Spill + addi.d $a0, $fp, 192 + st.d $a0, $sp, 208 # 8-byte Folded Spill addi.d $a0, $fp, 256 - st.d $a0, $sp, 136 # 8-byte Folded Spill + st.d $a0, $sp, 144 # 8-byte Folded Spill addi.d $a0, $fp, 224 - st.d $a0, $sp, 128 # 8-byte Folded Spill + st.d $a0, $sp, 136 # 8-byte Folded Spill addi.d $s8, $fp, 312 addi.d $a0, $fp, 328 - st.d $a0, $sp, 216 # 8-byte Folded Spill + st.d $a0, $sp, 224 # 8-byte Folded Spill addi.d $a0, $fp, 344 - st.d $a0, $sp, 184 # 8-byte Folded Spill + st.d $a0, $sp, 192 # 8-byte Folded Spill ld.w $s0, $fp, 524 ori $s6, $zero, 255 ori $s1, $zero, 253 lu12i.w $a0, 4112 ori $a0, $a0, 257 - st.d $a0, $sp, 176 # 8-byte Folded Spill + st.d $a0, $sp, 184 # 8-byte Folded Spill lu12i.w $a0, 20560 ori $a0, $a0, 1285 - st.d $a0, $sp, 168 # 8-byte Folded Spill + st.d $a0, $sp, 176 # 8-byte Folded Spill lu12i.w $a0, 16 ori $a0, $a0, 1 - st.d $a0, $sp, 160 # 8-byte Folded Spill - vrepli.b $vr0, 0 - vst $vr0, $sp, 224 # 16-byte Folded Spill + st.d $a0, $sp, 168 # 8-byte Folded Spill b .LBB4_3 .LBB4_1: # in Loop: Header=BB4_3 Depth=1 ld.d $a0, $fp, 0 @@ -648,22 +646,22 @@ read_markers: # @read_markers # in Loop: Header=BB4_3 Depth=1 st.d $zero, $s8, 8 st.d $zero, $s8, 0 - ld.d $a1, $sp, 176 # 8-byte Folded Reload + ld.d $a1, $sp, 184 # 8-byte Folded Reload bstrins.d $a1, $a1, 56, 32 - ld.d $a2, $sp, 216 # 8-byte Folded Reload + ld.d $a2, $sp, 224 # 8-byte Folded Reload st.d $a1, $a2, 0 st.d $a1, $a2, 8 - ld.d $a1, $sp, 168 # 8-byte Folded Reload + ld.d $a1, $sp, 176 # 8-byte Folded Reload bstrins.d $a1, $a1, 58, 32 - ld.d $a2, $sp, 184 # 8-byte Folded Reload + ld.d $a2, $sp, 192 # 8-byte Folded Reload st.d $a1, $a2, 0 st.d $a1, $a2, 8 st.w $zero, $fp, 52 st.w $zero, $fp, 384 - ld.d $a1, $sp, 192 # 8-byte Folded Reload + ld.d $a1, $sp, 200 # 8-byte Folded Reload st.d $zero, $a1, 0 st.b $zero, $a1, 8 - ld.d $a1, $sp, 160 
# 8-byte Folded Reload + ld.d $a1, $sp, 168 # 8-byte Folded Reload st.w $a1, $fp, 370 st.w $zero, $fp, 376 st.b $zero, $fp, 380 @@ -765,11 +763,11 @@ read_markers: # @read_markers jirl $ra, $a1, 0 .LBB4_58: # in Loop: Header=BB4_53 Depth=2 slli.d $a0, $s4, 3 - ld.d $a1, $sp, 200 # 8-byte Folded Reload + ld.d $a1, $sp, 208 # 8-byte Folded Reload ldx.d $s0, $a1, $a0 bnez $s0, .LBB4_60 # %bb.59: # in Loop: Header=BB4_53 Depth=2 - ld.d $a0, $sp, 200 # 8-byte Folded Reload + ld.d $a0, $sp, 208 # 8-byte Folded Reload alsl.d $s4, $s4, $a0, 3 move $a0, $fp pcaddu18i $ra, %call36(jpeg_alloc_quant_table) @@ -870,12 +868,11 @@ read_markers: # @read_markers ld.d $a1, $s0, 0 ld.d $a2, $s0, 8 vinsgr2vr.d $vr0, $a1, 0 - vld $vr1, $sp, 224 # 16-byte Folded Reload - vilvl.h $vr0, $vr1, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vst $vr0, $a0, 44 vinsgr2vr.d $vr0, $a2, 0 ld.d $a2, $a0, 8 - vilvl.h $vr0, $vr1, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vst $vr0, $a0, 60 ori $s4, $zero, 92 st.w $s4, $a0, 40 @@ -886,12 +883,11 @@ read_markers: # @read_markers ld.d $a1, $fp, 0 ld.d $a2, $s0, 24 vinsgr2vr.d $vr0, $a0, 0 - vld $vr1, $sp, 224 # 16-byte Folded Reload - vilvl.h $vr0, $vr1, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vst $vr0, $a1, 44 vinsgr2vr.d $vr0, $a2, 0 ld.d $a2, $a1, 8 - vilvl.h $vr0, $vr1, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vst $vr0, $a1, 60 st.w $s4, $a1, 40 ori $a1, $zero, 2 @@ -901,12 +897,11 @@ read_markers: # @read_markers ld.d $a1, $fp, 0 ld.d $a2, $s0, 40 vinsgr2vr.d $vr0, $a0, 0 - vld $vr1, $sp, 224 # 16-byte Folded Reload - vilvl.h $vr0, $vr1, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vst $vr0, $a1, 44 vinsgr2vr.d $vr0, $a2, 0 ld.d $a2, $a1, 8 - vilvl.h $vr0, $vr1, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vst $vr0, $a1, 60 st.w $s4, $a1, 40 ori $a1, $zero, 2 @@ -916,12 +911,11 @@ read_markers: # @read_markers ld.d $a1, $fp, 0 ld.d $a2, $s0, 56 vinsgr2vr.d $vr0, $a0, 0 - vld $vr1, $sp, 224 # 16-byte Folded Reload - vilvl.h $vr0, $vr1, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vst $vr0, $a1, 44 vinsgr2vr.d $vr0, $a2, 0 ld.d $a2, $a1, 8 - vilvl.h $vr0, $vr1, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vst $vr0, $a1, 60 st.w $s4, $a1, 40 ori $a1, $zero, 2 @@ -931,12 +925,11 @@ read_markers: # @read_markers ld.d $a1, $fp, 0 ld.d $a2, $s0, 72 vinsgr2vr.d $vr0, $a0, 0 - vld $vr1, $sp, 224 # 16-byte Folded Reload - vilvl.h $vr0, $vr1, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vst $vr0, $a1, 44 vinsgr2vr.d $vr0, $a2, 0 ld.d $a2, $a1, 8 - vilvl.h $vr0, $vr1, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vst $vr0, $a1, 60 st.w $s4, $a1, 40 ori $a1, $zero, 2 @@ -946,12 +939,11 @@ read_markers: # @read_markers ld.d $a1, $fp, 0 ld.d $a2, $s0, 88 vinsgr2vr.d $vr0, $a0, 0 - vld $vr1, $sp, 224 # 16-byte Folded Reload - vilvl.h $vr0, $vr1, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vst $vr0, $a1, 44 vinsgr2vr.d $vr0, $a2, 0 ld.d $a2, $a1, 8 - vilvl.h $vr0, $vr1, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vst $vr0, $a1, 60 st.w $s4, $a1, 40 ori $a1, $zero, 2 @@ -961,12 +953,11 @@ read_markers: # @read_markers ld.d $a1, $fp, 0 ld.d $a2, $s0, 104 vinsgr2vr.d $vr0, $a0, 0 - vld $vr1, $sp, 224 # 16-byte Folded Reload - vilvl.h $vr0, $vr1, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vst $vr0, $a1, 44 vinsgr2vr.d $vr0, $a2, 0 ld.d $a2, $a1, 8 - vilvl.h $vr0, $vr1, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vst $vr0, $a1, 60 st.w $s4, $a1, 40 ori $a1, $zero, 2 @@ -976,12 +967,11 @@ read_markers: # @read_markers ld.d $a1, $fp, 0 ld.d $a2, $s0, 120 vinsgr2vr.d $vr0, $a0, 0 - vld $vr1, $sp, 224 # 16-byte Folded Reload - vilvl.h $vr0, $vr1, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vst $vr0, $a1, 44 vinsgr2vr.d $vr0, $a2, 
0 ld.d $a2, $a1, 8 - vilvl.h $vr0, $vr1, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vst $vr0, $a1, 60 st.w $s4, $a1, 40 ori $a1, $zero, 2 @@ -1069,63 +1059,63 @@ read_markers: # @read_markers bltu $a1, $a0, .LBB4_158 # %bb.90: # %.lr.ph161.i # in Loop: Header=BB4_3 Depth=1 - addi.d $s3, $a1, -2 - st.d $s8, $sp, 8 # 8-byte Folded Spill + addi.d $s2, $a1, -2 + st.d $s8, $sp, 16 # 8-byte Folded Spill b .LBB4_93 .p2align 4, , 16 .LBB4_91: # in Loop: Header=BB4_93 Depth=2 - ld.d $a0, $sp, 136 # 8-byte Folded Reload + ld.d $a0, $sp, 144 # 8-byte Folded Reload alsl.d $a0, $s6, $a0, 3 - ld.d $a1, $sp, 128 # 8-byte Folded Reload - ld.d $a2, $sp, 56 # 8-byte Folded Reload + ld.d $a1, $sp, 136 # 8-byte Folded Reload + ld.d $a2, $sp, 64 # 8-byte Folded Reload alsl.d $a1, $a2, $a1, 3 - masknez $a0, $a0, $s3 - maskeqz $a1, $a1, $s3 - or $s3, $a1, $a0 - ld.d $a0, $s3, 0 + masknez $a0, $a0, $s2 + maskeqz $a1, $a1, $s2 + or $s2, $a1, $a0 + ld.d $a0, $s2, 0 beqz $a0, .LBB4_157 .LBB4_92: # in Loop: Header=BB4_93 Depth=2 st.b $zero, $a0, 0 - ld.d $a1, $sp, 152 # 8-byte Folded Reload + ld.d $a1, $sp, 160 # 8-byte Folded Reload st.b $a1, $a0, 1 - ld.d $a1, $sp, 144 # 8-byte Folded Reload + ld.d $a1, $sp, 152 # 8-byte Folded Reload st.b $a1, $a0, 2 - ld.d $a1, $sp, 120 # 8-byte Folded Reload + ld.d $a1, $sp, 128 # 8-byte Folded Reload st.b $a1, $a0, 3 - ld.d $a1, $sp, 112 # 8-byte Folded Reload + ld.d $a1, $sp, 120 # 8-byte Folded Reload st.b $a1, $a0, 4 - ld.d $a1, $sp, 104 # 8-byte Folded Reload + ld.d $a1, $sp, 112 # 8-byte Folded Reload st.b $a1, $a0, 5 - ld.d $a1, $sp, 96 # 8-byte Folded Reload + ld.d $a1, $sp, 104 # 8-byte Folded Reload st.b $a1, $a0, 6 - ld.d $a1, $sp, 88 # 8-byte Folded Reload + ld.d $a1, $sp, 96 # 8-byte Folded Reload st.b $a1, $a0, 7 - ld.d $a1, $sp, 80 # 8-byte Folded Reload + ld.d $a1, $sp, 88 # 8-byte Folded Reload st.b $a1, $a0, 8 - ld.d $a1, $sp, 208 # 8-byte Folded Reload + ld.d $a1, $sp, 216 # 8-byte Folded Reload st.b $a1, $a0, 9 - ld.d $a1, $sp, 72 # 8-byte Folded Reload + ld.d $a1, $sp, 80 # 8-byte Folded Reload st.b $a1, $a0, 10 - ld.d $a1, $sp, 64 # 8-byte Folded Reload + ld.d $a1, $sp, 72 # 8-byte Folded Reload st.b $a1, $a0, 11 - st.b $s2, $a0, 12 - st.b $s5, $a0, 13 + st.b $s5, $a0, 12 + st.b $s3, $a0, 13 st.b $s7, $a0, 14 st.b $s8, $a0, 15 - ld.d $a1, $sp, 40 # 8-byte Folded Reload + ld.d $a1, $sp, 48 # 8-byte Folded Reload st.b $a1, $a0, 16 - ld.d $a0, $s3, 0 - ld.d $a1, $sp, 32 # 8-byte Folded Reload - ld.d $a2, $sp, 48 # 8-byte Folded Reload - sub.d $s3, $a2, $a1 + ld.d $a0, $s2, 0 + ld.d $a1, $sp, 40 # 8-byte Folded Reload + ld.d $a2, $sp, 56 # 8-byte Folded Reload + sub.d $s2, $a2, $a1 addi.d $a0, $a0, 17 - addi.d $a1, $sp, 248 + addi.d $a1, $sp, 232 ori $a2, $zero, 256 pcaddu18i $ra, %call36(memcpy) jirl $ra, $ra, 0 - ld.d $s8, $sp, 8 # 8-byte Folded Reload + ld.d $s8, $sp, 16 # 8-byte Folded Reload ori $s6, $zero, 255 - blez $s3, .LBB4_158 + blez $s2, .LBB4_158 .LBB4_93: # Parent Loop BB4_3 Depth=1 # => This Loop Header: Depth=2 # Child Loop BB4_151 Depth 3 @@ -1141,7 +1131,7 @@ read_markers: # @read_markers .LBB4_96: # in Loop: Header=BB4_93 Depth=2 ld.bu $a1, $s4, 0 ld.d $a0, $fp, 0 - st.d $a1, $sp, 56 # 8-byte Folded Spill + st.d $a1, $sp, 64 # 8-byte Folded Spill st.w $a1, $a0, 44 ld.d $a1, $fp, 0 ld.d $a2, $a1, 8 @@ -1155,72 +1145,72 @@ read_markers: # @read_markers # %bb.97: # in Loop: Header=BB4_93 Depth=2 addi.d $a0, $s4, 1 ld.bu $a2, $a0, 0 - st.d $a2, $sp, 152 # 8-byte Folded Spill + st.d $a2, $sp, 160 # 8-byte Folded Spill addi.d $a1, $a1, -1 beqz $a1, 
.LBB4_115 .LBB4_98: # in Loop: Header=BB4_93 Depth=2 addi.d $a0, $a0, 1 ld.bu $a2, $a0, 0 - st.d $a2, $sp, 144 # 8-byte Folded Spill + st.d $a2, $sp, 152 # 8-byte Folded Spill addi.d $a1, $a1, -1 beqz $a1, .LBB4_117 .LBB4_99: # in Loop: Header=BB4_93 Depth=2 addi.d $a0, $a0, 1 ld.bu $a2, $a0, 0 - st.d $a2, $sp, 120 # 8-byte Folded Spill + st.d $a2, $sp, 128 # 8-byte Folded Spill addi.d $a1, $a1, -1 beqz $a1, .LBB4_119 .LBB4_100: # in Loop: Header=BB4_93 Depth=2 addi.d $a0, $a0, 1 ld.bu $a2, $a0, 0 - st.d $a2, $sp, 112 # 8-byte Folded Spill + st.d $a2, $sp, 120 # 8-byte Folded Spill addi.d $a1, $a1, -1 beqz $a1, .LBB4_121 .LBB4_101: # in Loop: Header=BB4_93 Depth=2 addi.d $a0, $a0, 1 ld.bu $a2, $a0, 0 - st.d $a2, $sp, 104 # 8-byte Folded Spill + st.d $a2, $sp, 112 # 8-byte Folded Spill addi.d $a1, $a1, -1 beqz $a1, .LBB4_123 .LBB4_102: # in Loop: Header=BB4_93 Depth=2 addi.d $a0, $a0, 1 ld.bu $a2, $a0, 0 - st.d $a2, $sp, 96 # 8-byte Folded Spill + st.d $a2, $sp, 104 # 8-byte Folded Spill addi.d $a1, $a1, -1 beqz $a1, .LBB4_125 .LBB4_103: # in Loop: Header=BB4_93 Depth=2 addi.d $a0, $a0, 1 ld.bu $a2, $a0, 0 - st.d $a2, $sp, 88 # 8-byte Folded Spill + st.d $a2, $sp, 96 # 8-byte Folded Spill addi.d $a1, $a1, -1 beqz $a1, .LBB4_127 .LBB4_104: # in Loop: Header=BB4_93 Depth=2 addi.d $a0, $a0, 1 ld.bu $a2, $a0, 0 - st.d $a2, $sp, 80 # 8-byte Folded Spill + st.d $a2, $sp, 88 # 8-byte Folded Spill addi.d $a1, $a1, -1 beqz $a1, .LBB4_129 .LBB4_105: # in Loop: Header=BB4_93 Depth=2 addi.d $a0, $a0, 1 ld.bu $a2, $a0, 0 - st.d $a2, $sp, 208 # 8-byte Folded Spill + st.d $a2, $sp, 216 # 8-byte Folded Spill addi.d $a1, $a1, -1 beqz $a1, .LBB4_131 .LBB4_106: # in Loop: Header=BB4_93 Depth=2 addi.d $a0, $a0, 1 ld.bu $a2, $a0, 0 - st.d $a2, $sp, 72 # 8-byte Folded Spill + st.d $a2, $sp, 80 # 8-byte Folded Spill addi.d $a1, $a1, -1 beqz $a1, .LBB4_133 .LBB4_107: # in Loop: Header=BB4_93 Depth=2 addi.d $a0, $a0, 1 ld.bu $a2, $a0, 0 - st.d $a2, $sp, 64 # 8-byte Folded Spill + st.d $a2, $sp, 72 # 8-byte Folded Spill addi.d $a1, $a1, -1 beqz $a1, .LBB4_135 .LBB4_108: # in Loop: Header=BB4_93 Depth=2 addi.d $a0, $a0, 1 - ld.bu $s2, $a0, 0 + ld.bu $s3, $a0, 0 addi.d $a1, $a1, -1 beqz $a1, .LBB4_137 .LBB4_109: # in Loop: Header=BB4_93 Depth=2 @@ -1239,9 +1229,10 @@ read_markers: # @read_markers addi.d $a1, $a1, -1 beqz $a1, .LBB4_143 .LBB4_112: # in Loop: Header=BB4_93 Depth=2 - st.d $a1, $sp, 32 # 8-byte Folded Spill - move $s5, $s4 - move $a5, $s3 + st.d $a1, $sp, 40 # 8-byte Folded Spill + move $t3, $s4 + move $s5, $s3 + move $a5, $s2 addi.d $t6, $a0, 1 b .LBB4_145 .p2align 4, , 16 @@ -1254,7 +1245,7 @@ read_markers: # @read_markers ld.d $a0, $s0, 0 ld.d $a1, $s0, 8 ld.bu $a2, $a0, 0 - st.d $a2, $sp, 152 # 8-byte Folded Spill + st.d $a2, $sp, 160 # 8-byte Folded Spill addi.d $a1, $a1, -1 bnez $a1, .LBB4_98 .LBB4_115: # in Loop: Header=BB4_93 Depth=2 @@ -1266,7 +1257,7 @@ read_markers: # @read_markers ld.d $a0, $s0, 0 ld.d $a1, $s0, 8 ld.bu $a2, $a0, 0 - st.d $a2, $sp, 144 # 8-byte Folded Spill + st.d $a2, $sp, 152 # 8-byte Folded Spill addi.d $a1, $a1, -1 bnez $a1, .LBB4_99 .LBB4_117: # in Loop: Header=BB4_93 Depth=2 @@ -1278,7 +1269,7 @@ read_markers: # @read_markers ld.d $a0, $s0, 0 ld.d $a1, $s0, 8 ld.bu $a2, $a0, 0 - st.d $a2, $sp, 120 # 8-byte Folded Spill + st.d $a2, $sp, 128 # 8-byte Folded Spill addi.d $a1, $a1, -1 bnez $a1, .LBB4_100 .LBB4_119: # in Loop: Header=BB4_93 Depth=2 @@ -1290,7 +1281,7 @@ read_markers: # @read_markers ld.d $a0, $s0, 0 ld.d $a1, $s0, 8 ld.bu $a2, $a0, 0 - st.d $a2, $sp, 112 # 
8-byte Folded Spill + st.d $a2, $sp, 120 # 8-byte Folded Spill addi.d $a1, $a1, -1 bnez $a1, .LBB4_101 .LBB4_121: # in Loop: Header=BB4_93 Depth=2 @@ -1302,7 +1293,7 @@ read_markers: # @read_markers ld.d $a0, $s0, 0 ld.d $a1, $s0, 8 ld.bu $a2, $a0, 0 - st.d $a2, $sp, 104 # 8-byte Folded Spill + st.d $a2, $sp, 112 # 8-byte Folded Spill addi.d $a1, $a1, -1 bnez $a1, .LBB4_102 .LBB4_123: # in Loop: Header=BB4_93 Depth=2 @@ -1314,7 +1305,7 @@ read_markers: # @read_markers ld.d $a0, $s0, 0 ld.d $a1, $s0, 8 ld.bu $a2, $a0, 0 - st.d $a2, $sp, 96 # 8-byte Folded Spill + st.d $a2, $sp, 104 # 8-byte Folded Spill addi.d $a1, $a1, -1 bnez $a1, .LBB4_103 .LBB4_125: # in Loop: Header=BB4_93 Depth=2 @@ -1326,7 +1317,7 @@ read_markers: # @read_markers ld.d $a0, $s0, 0 ld.d $a1, $s0, 8 ld.bu $a2, $a0, 0 - st.d $a2, $sp, 88 # 8-byte Folded Spill + st.d $a2, $sp, 96 # 8-byte Folded Spill addi.d $a1, $a1, -1 bnez $a1, .LBB4_104 .LBB4_127: # in Loop: Header=BB4_93 Depth=2 @@ -1338,7 +1329,7 @@ read_markers: # @read_markers ld.d $a0, $s0, 0 ld.d $a1, $s0, 8 ld.bu $a2, $a0, 0 - st.d $a2, $sp, 80 # 8-byte Folded Spill + st.d $a2, $sp, 88 # 8-byte Folded Spill addi.d $a1, $a1, -1 bnez $a1, .LBB4_105 .LBB4_129: # in Loop: Header=BB4_93 Depth=2 @@ -1350,7 +1341,7 @@ read_markers: # @read_markers ld.d $a0, $s0, 0 ld.d $a1, $s0, 8 ld.bu $a2, $a0, 0 - st.d $a2, $sp, 208 # 8-byte Folded Spill + st.d $a2, $sp, 216 # 8-byte Folded Spill addi.d $a1, $a1, -1 bnez $a1, .LBB4_106 .LBB4_131: # in Loop: Header=BB4_93 Depth=2 @@ -1362,7 +1353,7 @@ read_markers: # @read_markers ld.d $a0, $s0, 0 ld.d $a1, $s0, 8 ld.bu $a2, $a0, 0 - st.d $a2, $sp, 72 # 8-byte Folded Spill + st.d $a2, $sp, 80 # 8-byte Folded Spill addi.d $a1, $a1, -1 bnez $a1, .LBB4_107 .LBB4_133: # in Loop: Header=BB4_93 Depth=2 @@ -1374,7 +1365,7 @@ read_markers: # @read_markers ld.d $a0, $s0, 0 ld.d $a1, $s0, 8 ld.bu $a2, $a0, 0 - st.d $a2, $sp, 64 # 8-byte Folded Spill + st.d $a2, $sp, 72 # 8-byte Folded Spill addi.d $a1, $a1, -1 bnez $a1, .LBB4_108 .LBB4_135: # in Loop: Header=BB4_93 Depth=2 @@ -1385,7 +1376,7 @@ read_markers: # @read_markers # %bb.136: # in Loop: Header=BB4_93 Depth=2 ld.d $a0, $s0, 0 ld.d $a1, $s0, 8 - ld.bu $s2, $a0, 0 + ld.bu $s3, $a0, 0 addi.d $a1, $a1, -1 bnez $a1, .LBB4_109 .LBB4_137: # in Loop: Header=BB4_93 Depth=2 @@ -1427,39 +1418,41 @@ read_markers: # @read_markers jirl $ra, $a1, 0 beqz $a0, .LBB4_232 # %bb.144: # in Loop: Header=BB4_93 Depth=2 - move $s5, $s4 - move $a5, $s3 + move $t3, $s4 + move $s5, $s3 + move $a5, $s2 ld.d $t6, $s0, 0 ld.d $a0, $s0, 8 - st.d $a0, $sp, 32 # 8-byte Folded Spill + st.d $a0, $sp, 40 # 8-byte Folded Spill .LBB4_145: # in Loop: Header=BB4_93 Depth=2 - ld.d $a2, $sp, 152 # 8-byte Folded Reload - ld.d $a3, $sp, 144 # 8-byte Folded Reload + ld.d $a2, $sp, 160 # 8-byte Folded Reload + ld.d $a3, $sp, 152 # 8-byte Folded Reload add.d $a0, $a3, $a2 - ld.d $a4, $sp, 120 # 8-byte Folded Reload + ld.d $a4, $sp, 128 # 8-byte Folded Reload add.d $a0, $a0, $a4 - ld.d $a6, $sp, 112 # 8-byte Folded Reload + ld.d $a6, $sp, 120 # 8-byte Folded Reload add.d $a0, $a0, $a6 - ld.d $a7, $sp, 104 # 8-byte Folded Reload + ld.d $a7, $sp, 112 # 8-byte Folded Reload add.d $a0, $a0, $a7 - ld.d $t0, $sp, 96 # 8-byte Folded Reload + ld.d $t0, $sp, 104 # 8-byte Folded Reload add.d $a0, $a0, $t0 - ld.d $t1, $sp, 88 # 8-byte Folded Reload + ld.d $t1, $sp, 96 # 8-byte Folded Reload add.d $a0, $a0, $t1 - ld.d $t2, $sp, 80 # 8-byte Folded Reload + ld.d $t2, $sp, 88 # 8-byte Folded Reload add.d $a0, $a0, $t2 - ld.d $t7, $sp, 208 # 
8-byte Folded Reload - add.d $a0, $a0, $t7 - ld.d $s4, $sp, 72 # 8-byte Folded Reload + ld.d $a1, $sp, 216 # 8-byte Folded Reload + add.d $a0, $a0, $a1 + ld.d $s4, $sp, 80 # 8-byte Folded Reload add.d $a0, $a0, $s4 - ld.d $s1, $sp, 64 # 8-byte Folded Reload + ld.d $s1, $sp, 72 # 8-byte Folded Reload add.d $a0, $a0, $s1 - st.d $a0, $sp, 16 # 8-byte Folded Spill - add.d $a0, $a0, $s2 - ld.d $a1, $fp, 0 + st.d $a0, $sp, 24 # 8-byte Folded Spill add.d $a0, $a0, $s5 + ld.d $a1, $fp, 0 + move $s3, $t3 + add.d $a0, $a0, $t3 add.d $a0, $a0, $s7 - st.d $t6, $sp, 24 # 8-byte Folded Spill + st.d $t6, $sp, 32 # 8-byte Folded Spill ld.bu $s6, $t6, 0 st.w $a2, $a1, 44 st.w $a3, $a1, 48 @@ -1473,23 +1466,23 @@ read_markers: # @read_markers st.w $a2, $a1, 40 ld.d $a2, $a1, 8 add.d $a0, $a0, $s8 - add.d $s3, $a0, $s6 + add.d $s2, $a0, $s6 addi.d $a0, $a5, -17 - st.d $a0, $sp, 48 # 8-byte Folded Spill + st.d $a0, $sp, 56 # 8-byte Folded Spill ori $a1, $zero, 2 move $a0, $fp jirl $ra, $a2, 0 ld.d $a0, $fp, 0 - ld.d $a1, $sp, 208 # 8-byte Folded Reload + ld.d $a1, $sp, 216 # 8-byte Folded Reload st.w $a1, $a0, 44 st.w $s4, $a0, 48 st.w $s1, $a0, 52 - st.w $s2, $a0, 56 - st.w $s5, $a0, 60 + st.w $s5, $a0, 56 + st.w $s3, $a0, 60 st.w $s7, $a0, 64 ld.d $a2, $a0, 8 st.w $s8, $a0, 68 - st.d $s6, $sp, 40 # 8-byte Folded Spill + st.d $s6, $sp, 48 # 8-byte Folded Spill st.w $s6, $a0, 72 ori $a1, $zero, 85 st.w $a1, $a0, 40 @@ -1497,10 +1490,10 @@ read_markers: # @read_markers move $a0, $fp jirl $ra, $a2, 0 ori $a0, $zero, 256 - bltu $a0, $s3, .LBB4_147 + bltu $a0, $s2, .LBB4_147 # %bb.146: # in Loop: Header=BB4_93 Depth=2 - ld.d $a0, $sp, 48 # 8-byte Folded Reload - bge $a0, $s3, .LBB4_148 + ld.d $a0, $sp, 56 # 8-byte Folded Reload + bge $a0, $s2, .LBB4_148 .LBB4_147: # in Loop: Header=BB4_93 Depth=2 ld.d $a0, $fp, 0 ld.d $a1, $a0, 0 @@ -1509,22 +1502,22 @@ read_markers: # @read_markers move $a0, $fp jirl $ra, $a1, 0 .LBB4_148: # in Loop: Header=BB4_93 Depth=2 - ld.d $a0, $sp, 24 # 8-byte Folded Reload + ld.d $a0, $sp, 32 # 8-byte Folded Reload addi.d $a0, $a0, 1 - ld.d $a1, $sp, 32 # 8-byte Folded Reload + ld.d $a1, $sp, 40 # 8-byte Folded Reload addi.d $s1, $a1, -1 - st.d $s3, $sp, 32 # 8-byte Folded Spill - beqz $s3, .LBB4_154 + st.d $s2, $sp, 40 # 8-byte Folded Spill + beqz $s2, .LBB4_154 # %bb.149: # %.lr.ph.i78.preheader # in Loop: Header=BB4_93 Depth=2 - ld.d $a1, $sp, 16 # 8-byte Folded Reload - add.d $a1, $a1, $s2 + ld.d $a1, $sp, 24 # 8-byte Folded Reload add.d $a1, $a1, $s5 + add.d $a1, $a1, $s3 add.d $a1, $a1, $s7 add.d $a1, $a1, $s8 - ld.d $a2, $sp, 40 # 8-byte Folded Reload - add.w $s3, $a1, $a2 - addi.d $s6, $sp, 248 + ld.d $a2, $sp, 48 # 8-byte Folded Reload + add.w $s2, $a1, $a2 + addi.d $s6, $sp, 232 b .LBB4_151 .p2align 4, , 16 .LBB4_150: # in Loop: Header=BB4_151 Depth=3 @@ -1532,10 +1525,10 @@ read_markers: # @read_markers addi.d $s1, $s1, -1 addi.d $s4, $a0, 1 st.b $a1, $s6, 0 - addi.d $s3, $s3, -1 + addi.d $s2, $s2, -1 addi.d $s6, $s6, 1 move $a0, $s4 - beqz $s3, .LBB4_155 + beqz $s2, .LBB4_155 .LBB4_151: # %.lr.ph.i78 # Parent Loop BB4_3 Depth=1 # Parent Loop BB4_93 Depth=2 @@ -1554,12 +1547,12 @@ read_markers: # @read_markers move $s4, $a0 .LBB4_155: # %._crit_edge.i82 # in Loop: Header=BB4_93 Depth=2 - ld.d $a1, $sp, 56 # 8-byte Folded Reload + ld.d $a1, $sp, 64 # 8-byte Folded Reload andi $a0, $a1, 16 - sltui $s3, $a0, 1 + sltui $s2, $a0, 1 addi.d $s6, $a1, -16 - masknez $a0, $s6, $s3 - maskeqz $a1, $a1, $s3 + masknez $a0, $s6, $s2 + maskeqz $a1, $a1, $s2 or $a0, $a1, $a0 ori $a1, $zero, 4 
blt $a0, $a1, .LBB4_91 @@ -1577,7 +1570,7 @@ read_markers: # @read_markers move $a0, $fp pcaddu18i $ra, %call36(jpeg_alloc_huff_table) jirl $ra, $ra, 0 - st.d $a0, $s3, 0 + st.d $a0, $s2, 0 b .LBB4_92 .LBB4_158: # %get_dht.exit # in Loop: Header=BB4_3 Depth=1 @@ -1736,7 +1729,7 @@ read_markers: # @read_markers andi $a0, $s5, 15 stx.b $a0, $s8, $s4 srli.d $a1, $s5, 4 - ld.d $a2, $sp, 216 # 8-byte Folded Reload + ld.d $a2, $sp, 224 # 8-byte Folded Reload stx.b $a1, $a2, $s4 bgeu $a1, $a0, .LBB4_172 # %bb.183: # in Loop: Header=BB4_173 Depth=2 @@ -2053,18 +2046,18 @@ read_markers: # @read_markers .LBB4_232: move $a0, $zero .LBB4_233: # %first_marker.exit.thread - ld.d $s8, $sp, 504 # 8-byte Folded Reload - ld.d $s7, $sp, 512 # 8-byte Folded Reload - ld.d $s6, $sp, 520 # 8-byte Folded Reload - ld.d $s5, $sp, 528 # 8-byte Folded Reload - ld.d $s4, $sp, 536 # 8-byte Folded Reload - ld.d $s3, $sp, 544 # 8-byte Folded Reload - ld.d $s2, $sp, 552 # 8-byte Folded Reload - ld.d $s1, $sp, 560 # 8-byte Folded Reload - ld.d $s0, $sp, 568 # 8-byte Folded Reload - ld.d $fp, $sp, 576 # 8-byte Folded Reload - ld.d $ra, $sp, 584 # 8-byte Folded Reload - addi.d $sp, $sp, 592 + ld.d $s8, $sp, 488 # 8-byte Folded Reload + ld.d $s7, $sp, 496 # 8-byte Folded Reload + ld.d $s6, $sp, 504 # 8-byte Folded Reload + ld.d $s5, $sp, 512 # 8-byte Folded Reload + ld.d $s4, $sp, 520 # 8-byte Folded Reload + ld.d $s3, $sp, 528 # 8-byte Folded Reload + ld.d $s2, $sp, 536 # 8-byte Folded Reload + ld.d $s1, $sp, 544 # 8-byte Folded Reload + ld.d $s0, $sp, 552 # 8-byte Folded Reload + ld.d $fp, $sp, 560 # 8-byte Folded Reload + ld.d $ra, $sp, 568 # 8-byte Folded Reload + addi.d $sp, $sp, 576 ret .Lfunc_end4: .size read_markers, .Lfunc_end4-read_markers diff --git a/results/MultiSource/Benchmarks/MiBench/consumer-jpeg/CMakeFiles/consumer-jpeg.dir/jdsample.s b/results/MultiSource/Benchmarks/MiBench/consumer-jpeg/CMakeFiles/consumer-jpeg.dir/jdsample.s index d5d0c193..3588829f 100644 --- a/results/MultiSource/Benchmarks/MiBench/consumer-jpeg/CMakeFiles/consumer-jpeg.dir/jdsample.s +++ b/results/MultiSource/Benchmarks/MiBench/consumer-jpeg/CMakeFiles/consumer-jpeg.dir/jdsample.s @@ -407,11 +407,10 @@ h2v1_fancy_upsample: # @h2v1_fancy_upsample st.d $s6, $sp, 40 # 8-byte Folded Spill st.d $s7, $sp, 32 # 8-byte Folded Spill st.d $s8, $sp, 24 # 8-byte Folded Spill - move $a4, $zero ld.d $a3, $a3, 0 + move $a4, $zero ori $t4, $zero, 16 - vrepli.b $vr0, 0 - vrepli.h $vr1, 3 + vrepli.h $vr0, 3 b .LBB5_4 .LBB5_2: # in Loop: Header=BB5_4 Depth=1 addi.d $a6, $a7, -2 @@ -484,123 +483,126 @@ h2v1_fancy_upsample: # @h2v1_fancy_upsample # => This Inner Loop Header: Depth=2 addi.d $a5, $a6, 2 pcalau12i $t4, %pc_hi20(.LCPI5_0) - vld $vr2, $t4, %pc_lo12(.LCPI5_0) - vreplgr2vr.d $vr3, $a6 + vld $vr1, $t4, %pc_lo12(.LCPI5_0) + vreplgr2vr.d $vr2, $a6 pcalau12i $t4, %pc_hi20(.LCPI5_1) - vld $vr4, $t4, %pc_lo12(.LCPI5_1) + vld $vr3, $t4, %pc_lo12(.LCPI5_1) pcalau12i $t4, %pc_hi20(.LCPI5_2) - vld $vr5, $t4, %pc_lo12(.LCPI5_2) + vld $vr4, $t4, %pc_lo12(.LCPI5_2) pcalau12i $t4, %pc_hi20(.LCPI5_3) - vld $vr6, $t4, %pc_lo12(.LCPI5_3) - vadd.d $vr7, $vr3, $vr2 - vadd.d $vr8, $vr3, $vr4 - vadd.d $vr9, $vr3, $vr5 - vadd.d $vr10, $vr3, $vr6 + vld $vr5, $t4, %pc_lo12(.LCPI5_3) + vadd.d $vr6, $vr2, $vr1 + vadd.d $vr7, $vr2, $vr3 + vadd.d $vr8, $vr2, $vr4 + vadd.d $vr9, $vr2, $vr5 pcalau12i $t4, %pc_hi20(.LCPI5_4) - vld $vr11, $t4, %pc_lo12(.LCPI5_4) + vld $vr10, $t4, %pc_lo12(.LCPI5_4) pcalau12i $t4, %pc_hi20(.LCPI5_5) - vld $vr12, $t4, %pc_lo12(.LCPI5_5) + vld 
$vr11, $t4, %pc_lo12(.LCPI5_5) pcalau12i $t4, %pc_hi20(.LCPI5_6) - vld $vr13, $t4, %pc_lo12(.LCPI5_6) + vld $vr12, $t4, %pc_lo12(.LCPI5_6) pcalau12i $t4, %pc_hi20(.LCPI5_7) - vld $vr14, $t4, %pc_lo12(.LCPI5_7) - vadd.d $vr15, $vr3, $vr11 - vadd.d $vr16, $vr3, $vr12 - vadd.d $vr17, $vr3, $vr13 - vadd.d $vr18, $vr3, $vr14 - vreplgr2vr.d $vr3, $a5 - vadd.d $vr2, $vr3, $vr2 - vadd.d $vr4, $vr3, $vr4 - vadd.d $vr5, $vr3, $vr5 - vadd.d $vr6, $vr3, $vr6 - vadd.d $vr11, $vr3, $vr11 - vadd.d $vr12, $vr3, $vr12 - vld $vr19, $t5, -1 - vadd.d $vr13, $vr3, $vr13 - vld $vr20, $t5, -2 - vadd.d $vr14, $vr3, $vr14 - vilvl.b $vr3, $vr0, $vr19 - vilvh.b $vr19, $vr0, $vr19 - vilvl.b $vr21, $vr0, $vr20 - vilvh.b $vr20, $vr0, $vr20 - vmadd.h $vr20, $vr19, $vr1 - vmadd.h $vr21, $vr3, $vr1 - vaddi.hu $vr21, $vr21, 1 + vld $vr13, $t4, %pc_lo12(.LCPI5_7) + vadd.d $vr14, $vr2, $vr10 + vadd.d $vr15, $vr2, $vr11 + vadd.d $vr16, $vr2, $vr12 + vadd.d $vr17, $vr2, $vr13 + vreplgr2vr.d $vr2, $a5 + vadd.d $vr1, $vr2, $vr1 + vadd.d $vr3, $vr2, $vr3 + vadd.d $vr18, $vr2, $vr4 + vadd.d $vr5, $vr2, $vr5 + vadd.d $vr10, $vr2, $vr10 + vadd.d $vr11, $vr2, $vr11 + vld $vr4, $t5, -1 + vadd.d $vr12, $vr2, $vr12 + vadd.d $vr13, $vr2, $vr13 + vld $vr19, $t5, -2 + vsllwil.hu.bu $vr2, $vr4, 0 + vbsrl.v $vr4, $vr4, 8 + vsllwil.hu.bu $vr4, $vr4, 0 + vsllwil.hu.bu $vr20, $vr19, 0 + vbsrl.v $vr19, $vr19, 8 + vsllwil.hu.bu $vr19, $vr19, 0 + vmadd.h $vr19, $vr4, $vr0 + vmadd.h $vr20, $vr2, $vr0 vaddi.hu $vr20, $vr20, 1 + vaddi.hu $vr19, $vr19, 1 + vsrli.h $vr19, $vr19, 2 vsrli.h $vr20, $vr20, 2 - vsrli.h $vr21, $vr21, 2 - vpickve2gr.d $t7, $vr18, 1 - vpickve2gr.d $t8, $vr17, 0 - vpickve2gr.d $fp, $vr17, 1 - vpickve2gr.d $s0, $vr16, 0 - vpickve2gr.d $s1, $vr16, 1 - vpickve2gr.d $s2, $vr15, 0 - vpickve2gr.d $s3, $vr15, 1 - vpickve2gr.d $s4, $vr10, 0 - vpickve2gr.d $s5, $vr10, 1 - vpickve2gr.d $s6, $vr9, 0 - vpickve2gr.d $s7, $vr9, 1 - vpickve2gr.d $s8, $vr8, 0 - vpickve2gr.d $ra, $vr8, 1 - vpickve2gr.d $a5, $vr7, 0 - vpickve2gr.d $t4, $vr7, 1 - vpickve2gr.d $t2, $vr14, 0 - vstelm.b $vr21, $t2, 0, 0 - vpickve2gr.d $t2, $vr14, 1 - vstelm.b $vr21, $t2, 0, 2 + vpickve2gr.d $t7, $vr17, 1 + vpickve2gr.d $t8, $vr16, 0 + vpickve2gr.d $fp, $vr16, 1 + vpickve2gr.d $s0, $vr15, 0 + vpickve2gr.d $s1, $vr15, 1 + vpickve2gr.d $s2, $vr14, 0 + vpickve2gr.d $s3, $vr14, 1 + vpickve2gr.d $s4, $vr9, 0 + vpickve2gr.d $s5, $vr9, 1 + vpickve2gr.d $s6, $vr8, 0 + vpickve2gr.d $s7, $vr8, 1 + vpickve2gr.d $s8, $vr7, 0 + vpickve2gr.d $ra, $vr7, 1 + vpickve2gr.d $a5, $vr6, 0 + vpickve2gr.d $t4, $vr6, 1 vpickve2gr.d $t2, $vr13, 0 - vstelm.b $vr21, $t2, 0, 4 - vpickve2gr.d $t2, $vr13, 1 - vstelm.b $vr21, $t2, 0, 6 - vpickve2gr.d $t2, $vr12, 0 - vstelm.b $vr21, $t2, 0, 8 - vpickve2gr.d $t2, $vr12, 1 - vstelm.b $vr21, $t2, 0, 10 - vpickve2gr.d $t2, $vr11, 0 - vstelm.b $vr21, $t2, 0, 12 - vpickve2gr.d $t2, $vr11, 1 - vstelm.b $vr21, $t2, 0, 14 - vpickve2gr.d $t2, $vr6, 0 vstelm.b $vr20, $t2, 0, 0 - vpickve2gr.d $t2, $vr6, 1 + vpickve2gr.d $t2, $vr13, 1 vstelm.b $vr20, $t2, 0, 2 - vpickve2gr.d $t2, $vr5, 0 + vpickve2gr.d $t2, $vr12, 0 vstelm.b $vr20, $t2, 0, 4 - vpickve2gr.d $t2, $vr5, 1 + vpickve2gr.d $t2, $vr12, 1 vstelm.b $vr20, $t2, 0, 6 - vpickve2gr.d $t2, $vr4, 0 + vpickve2gr.d $t2, $vr11, 0 vstelm.b $vr20, $t2, 0, 8 - vpickve2gr.d $t2, $vr4, 1 + vpickve2gr.d $t2, $vr11, 1 vstelm.b $vr20, $t2, 0, 10 - vpickve2gr.d $t2, $vr2, 0 - vld $vr4, $t5, 0 + vpickve2gr.d $t2, $vr10, 0 vstelm.b $vr20, $t2, 0, 12 - vpickve2gr.d $t2, $vr2, 1 + vpickve2gr.d $t2, $vr10, 1 vstelm.b $vr20, 
$t2, 0, 14 - vilvh.b $vr2, $vr0, $vr4 - vmadd.h $vr2, $vr19, $vr1 - vilvl.b $vr4, $vr0, $vr4 - vmadd.h $vr4, $vr3, $vr1 - vaddi.hu $vr3, $vr4, 2 - vaddi.hu $vr2, $vr2, 2 + vpickve2gr.d $t2, $vr5, 0 + vstelm.b $vr19, $t2, 0, 0 + vpickve2gr.d $t2, $vr5, 1 + vstelm.b $vr19, $t2, 0, 2 + vpickve2gr.d $t2, $vr18, 0 + vstelm.b $vr19, $t2, 0, 4 + vpickve2gr.d $t2, $vr18, 1 + vstelm.b $vr19, $t2, 0, 6 + vpickve2gr.d $t2, $vr3, 0 + vstelm.b $vr19, $t2, 0, 8 + vpickve2gr.d $t2, $vr3, 1 + vstelm.b $vr19, $t2, 0, 10 + vpickve2gr.d $t2, $vr1, 0 + vld $vr3, $t5, 0 + vstelm.b $vr19, $t2, 0, 12 + vpickve2gr.d $t2, $vr1, 1 + vstelm.b $vr19, $t2, 0, 14 + vbsrl.v $vr1, $vr3, 8 + vsllwil.hu.bu $vr1, $vr1, 0 + vmadd.h $vr1, $vr4, $vr0 + vsllwil.hu.bu $vr3, $vr3, 0 + vmadd.h $vr3, $vr2, $vr0 + vaddi.hu $vr2, $vr3, 2 + vaddi.hu $vr1, $vr1, 2 + vsrli.h $vr1, $vr1, 2 vsrli.h $vr2, $vr2, 2 - vsrli.h $vr3, $vr3, 2 - vstelm.b $vr3, $a6, 3, 0 - vstelm.b $vr3, $t7, 3, 2 - vstelm.b $vr3, $t8, 3, 4 - vstelm.b $vr3, $fp, 3, 6 - vstelm.b $vr3, $s0, 3, 8 - vstelm.b $vr3, $s1, 3, 10 - vstelm.b $vr3, $s2, 3, 12 - vstelm.b $vr3, $s3, 3, 14 - vstelm.b $vr2, $s4, 3, 0 - vstelm.b $vr2, $s5, 3, 2 - vstelm.b $vr2, $s6, 3, 4 - vstelm.b $vr2, $s7, 3, 6 - vstelm.b $vr2, $s8, 3, 8 - vstelm.b $vr2, $ra, 3, 10 - vstelm.b $vr2, $a5, 3, 12 - vstelm.b $vr2, $t4, 3, 14 + vstelm.b $vr2, $a6, 3, 0 + vstelm.b $vr2, $t7, 3, 2 + vstelm.b $vr2, $t8, 3, 4 + vstelm.b $vr2, $fp, 3, 6 + vstelm.b $vr2, $s0, 3, 8 + vstelm.b $vr2, $s1, 3, 10 + vstelm.b $vr2, $s2, 3, 12 + vstelm.b $vr2, $s3, 3, 14 + vstelm.b $vr1, $s4, 3, 0 + vstelm.b $vr1, $s5, 3, 2 + vstelm.b $vr1, $s6, 3, 4 + vstelm.b $vr1, $s7, 3, 6 + vstelm.b $vr1, $s8, 3, 8 + vstelm.b $vr1, $ra, 3, 10 + vstelm.b $vr1, $a5, 3, 12 + vstelm.b $vr1, $t4, 3, 14 addi.d $a6, $a6, 32 addi.d $t6, $t6, -16 addi.d $t5, $t5, 16 @@ -791,31 +793,29 @@ h2v2_fancy_upsample: # @h2v2_fancy_upsample ld.w $a4, $a0, 392 blez $a4, .LBB7_30 # %bb.1: # %.preheader.lr.ph - addi.d $sp, $sp, -160 - st.d $ra, $sp, 152 # 8-byte Folded Spill - st.d $fp, $sp, 144 # 8-byte Folded Spill - st.d $s0, $sp, 136 # 8-byte Folded Spill - st.d $s1, $sp, 128 # 8-byte Folded Spill - st.d $s2, $sp, 120 # 8-byte Folded Spill - st.d $s3, $sp, 112 # 8-byte Folded Spill - st.d $s4, $sp, 104 # 8-byte Folded Spill - st.d $s5, $sp, 96 # 8-byte Folded Spill - st.d $s6, $sp, 88 # 8-byte Folded Spill - st.d $s7, $sp, 80 # 8-byte Folded Spill - st.d $s8, $sp, 72 # 8-byte Folded Spill - fst.d $fs0, $sp, 64 # 8-byte Folded Spill - fst.d $fs1, $sp, 56 # 8-byte Folded Spill - fst.d $fs2, $sp, 48 # 8-byte Folded Spill - fst.d $fs3, $sp, 40 # 8-byte Folded Spill - fst.d $fs4, $sp, 32 # 8-byte Folded Spill - fst.d $fs5, $sp, 24 # 8-byte Folded Spill - fst.d $fs6, $sp, 16 # 8-byte Folded Spill + addi.d $sp, $sp, -144 + st.d $ra, $sp, 136 # 8-byte Folded Spill + st.d $fp, $sp, 128 # 8-byte Folded Spill + st.d $s0, $sp, 120 # 8-byte Folded Spill + st.d $s1, $sp, 112 # 8-byte Folded Spill + st.d $s2, $sp, 104 # 8-byte Folded Spill + st.d $s3, $sp, 96 # 8-byte Folded Spill + st.d $s4, $sp, 88 # 8-byte Folded Spill + st.d $s5, $sp, 80 # 8-byte Folded Spill + st.d $s6, $sp, 72 # 8-byte Folded Spill + st.d $s7, $sp, 64 # 8-byte Folded Spill + st.d $s8, $sp, 56 # 8-byte Folded Spill + fst.d $fs0, $sp, 48 # 8-byte Folded Spill + fst.d $fs1, $sp, 40 # 8-byte Folded Spill + fst.d $fs2, $sp, 32 # 8-byte Folded Spill + fst.d $fs3, $sp, 24 # 8-byte Folded Spill + fst.d $fs4, $sp, 16 # 8-byte Folded Spill + fst.d $fs5, $sp, 8 # 8-byte Folded Spill + ld.d $a3, $a3, 0 move $a4, 
$zero move $a5, $zero - ld.d $a3, $a3, 0 ori $s7, $zero, 16 - vrepli.b $vr0, 0 - vrepli.w $vr1, 3 + vrepli.w $vr0, 3 pcalau12i $t2, %pc_hi20(.LCPI7_3) pcalau12i $t3, %pc_hi20(.LCPI7_4) pcalau12i $t4, %pc_hi20(.LCPI7_5) @@ -914,7 +914,7 @@ h2v2_fancy_upsample: # @h2v2_fancy_upsample add.d $s5, $fp, $s5 alsl.d $s6, $s7, $s2, 4 alsl.d $s7, $s7, $s3, 4 - vinsgr2vr.w $vr11, $ra, 3 + vinsgr2vr.w $vr13, $ra, 3 vinsgr2vr.w $vr8, $a6, 3 move $ra, $s8 .p2align 4, , 16 @@ -922,165 +922,175 @@ h2v2_fancy_upsample: # @h2v2_fancy_upsample # Parent Loop BB7_4 Depth=1 # => This Inner Loop Header: Depth=2 addi.d $a6, $fp, 2 - vld $vr12, $a7, %pc_lo12(.LCPI7_0) + vld $vr9, $a7, %pc_lo12(.LCPI7_0) vreplgr2vr.d $vr10, $fp - vld $vr13, $t0, %pc_lo12(.LCPI7_1) + vld $vr12, $t0, %pc_lo12(.LCPI7_1) vld $vr14, $t1, %pc_lo12(.LCPI7_2) vld $vr15, $t2, %pc_lo12(.LCPI7_3) + vadd.d $vr1, $vr10, $vr9 vadd.d $vr2, $vr10, $vr12 - vadd.d $vr3, $vr10, $vr13 - vadd.d $vr4, $vr10, $vr14 - vadd.d $vr5, $vr10, $vr15 - vld $vr17, $t3, %pc_lo12(.LCPI7_4) - vld $vr18, $t4, %pc_lo12(.LCPI7_5) - vld $vr19, $t5, %pc_lo12(.LCPI7_6) - vld $vr21, $t6, %pc_lo12(.LCPI7_7) + vadd.d $vr3, $vr10, $vr14 + vadd.d $vr4, $vr10, $vr15 + vld $vr16, $t3, %pc_lo12(.LCPI7_4) + vld $vr17, $t4, %pc_lo12(.LCPI7_5) + vld $vr18, $t5, %pc_lo12(.LCPI7_6) + vld $vr19, $t6, %pc_lo12(.LCPI7_7) + vadd.d $vr5, $vr10, $vr16 vadd.d $vr6, $vr10, $vr17 vadd.d $vr7, $vr10, $vr18 - vadd.d $vr9, $vr10, $vr19 - vadd.d $vr10, $vr10, $vr21 - vreplgr2vr.d $vr22, $a6 - vadd.d $vr12, $vr22, $vr12 - vadd.d $vr13, $vr22, $vr13 - vadd.d $vr14, $vr22, $vr14 - vadd.d $vr16, $vr22, $vr15 - vadd.d $vr17, $vr22, $vr17 - vld $vr15, $s3, 0 - vadd.d $vr18, $vr22, $vr18 - vadd.d $vr20, $vr22, $vr19 - vadd.d $vr21, $vr22, $vr21 - vilvh.b $vr19, $vr0, $vr15 - vilvl.h $vr24, $vr0, $vr19 - vilvh.h $vr19, $vr0, $vr19 - vilvl.b $vr15, $vr0, $vr15 + vadd.d $vr10, $vr10, $vr19 + vreplgr2vr.d $vr20, $a6 + vadd.d $vr11, $vr20, $vr9 + vadd.d $vr12, $vr20, $vr12 + vadd.d $vr14, $vr20, $vr14 + vadd.d $vr15, $vr20, $vr15 + vadd.d $vr16, $vr20, $vr16 + vld $vr9, $s3, 0 + vadd.d $vr17, $vr20, $vr17 + vadd.d $vr18, $vr20, $vr18 + vadd.d $vr19, $vr20, $vr19 + vbsrl.v $vr20, $vr9, 8 + vsllwil.hu.bu $vr20, $vr20, 0 + vsllwil.wu.hu $vr20, $vr20, 0 + vbsrl.v $vr21, $vr9, 12 + vsllwil.hu.bu $vr21, $vr21, 0 + vsllwil.wu.hu $vr23, $vr21, 0 + vsrli.d $vr21, $vr9, 32 + vsllwil.hu.bu $vr21, $vr21, 0 vld $vr22, $s2, 0 - vilvh.h $vr25, $vr0, $vr15 - vilvl.h $vr26, $vr0, $vr15 - vld $vr27, $t7, %pc_lo12(.LCPI7_8) - vilvh.b $vr23, $vr0, $vr22 - vilvl.h $vr15, $vr0, $vr23 - vbsrl.v $vr28, $vr8, 12 - vshuf.w $vr27, $vr8, $vr11 - vilvh.h $vr8, $vr0, $vr23 - vilvl.b $vr11, $vr0, $vr22 - vilvh.h $vr23, $vr0, $vr11 - vilvl.h $vr22, $vr0, $vr11 - vmadd.w $vr22, $vr26, $vr1 - vmadd.w $vr23, $vr25, $vr1 - vmadd.w $vr8, $vr19, $vr1 - vmadd.w $vr15, $vr24, $vr1 - vbsrl.v $vr11, $vr15, 12 - vbsll.v $vr19, $vr8, 4 - vor.v $vr11, $vr19, $vr11 - vbsrl.v $vr19, $vr23, 12 - vbsll.v $vr24, $vr15, 4 - vor.v $vr19, $vr24, $vr19 - vbsrl.v $vr24, $vr22, 12 - vbsll.v $vr25, $vr23, 4 - vor.v $vr24, $vr25, $vr24 - vbsll.v $vr25, $vr22, 4 - vor.v $vr25, $vr25, $vr28 - vpackev.d $vr26, $vr22, $vr27 - vbsrl.v $vr27, $vr15, 8 - vbsll.v $vr28, $vr8, 8 + vsllwil.wu.hu $vr24, $vr21, 0 + vsllwil.hu.bu $vr9, $vr9, 0 + vsllwil.wu.hu $vr25, $vr9, 0 + vbsrl.v $vr9, $vr22, 8 + vsllwil.hu.bu $vr9, $vr9, 0 + vsllwil.wu.hu $vr9, $vr9, 0 + vld $vr26, $t7, %pc_lo12(.LCPI7_8) + vbsrl.v $vr21, $vr22, 12 + vsllwil.hu.bu $vr21, $vr21, 0 + vbsrl.v $vr27, $vr8, 12 + 
vshuf.w $vr26, $vr8, $vr13 + vsllwil.wu.hu $vr8, $vr21, 0 + vsrli.d $vr13, $vr22, 32 + vsllwil.hu.bu $vr13, $vr13, 0 + vsllwil.wu.hu $vr21, $vr13, 0 + vsllwil.hu.bu $vr13, $vr22, 0 + vsllwil.wu.hu $vr22, $vr13, 0 + vmadd.w $vr22, $vr25, $vr0 + vmadd.w $vr21, $vr24, $vr0 + vmadd.w $vr8, $vr23, $vr0 + vmadd.w $vr9, $vr20, $vr0 + vbsrl.v $vr13, $vr9, 12 + vbsll.v $vr20, $vr8, 4 + vor.v $vr13, $vr20, $vr13 + vbsrl.v $vr20, $vr21, 12 + vbsll.v $vr23, $vr9, 4 + vor.v $vr20, $vr23, $vr20 + vbsrl.v $vr23, $vr22, 12 + vbsll.v $vr24, $vr21, 4 + vor.v $vr23, $vr24, $vr23 + vbsll.v $vr24, $vr22, 4 + vor.v $vr24, $vr24, $vr27 + vpackev.d $vr25, $vr22, $vr26 + vbsrl.v $vr26, $vr9, 8 + vbsll.v $vr27, $vr8, 8 + vor.v $vr26, $vr27, $vr26 + vbsrl.v $vr27, $vr21, 8 + vbsll.v $vr28, $vr9, 8 vor.v $vr27, $vr28, $vr27 - vbsrl.v $vr28, $vr23, 8 - vbsll.v $vr29, $vr15, 8 + vbsrl.v $vr28, $vr22, 8 + vbsll.v $vr29, $vr21, 8 vor.v $vr28, $vr29, $vr28 - vbsrl.v $vr29, $vr22, 8 - vbsll.v $vr30, $vr23, 8 - vor.v $vr29, $vr30, $vr29 - vmadd.w $vr29, $vr24, $vr1 - vmadd.w $vr28, $vr19, $vr1 - vmadd.w $vr27, $vr11, $vr1 - vmadd.w $vr26, $vr25, $vr1 + vmadd.w $vr28, $vr23, $vr0 + vmadd.w $vr27, $vr20, $vr0 + vmadd.w $vr26, $vr13, $vr0 + vmadd.w $vr25, $vr24, $vr0 + vaddi.wu $vr26, $vr26, 8 vaddi.wu $vr27, $vr27, 8 vaddi.wu $vr28, $vr28, 8 - vaddi.wu $vr29, $vr29, 8 - vaddi.wu $vr26, $vr26, 8 - vsrli.w $vr29, $vr29, 4 + vaddi.wu $vr25, $vr25, 8 vsrli.w $vr28, $vr28, 4 vsrli.w $vr27, $vr27, 4 vsrli.w $vr26, $vr26, 4 - vpickev.h $vr27, $vr27, $vr28 - vpickev.h $vr29, $vr29, $vr26 - vpickve2gr.d $a6, $vr21, 0 - vstelm.b $vr26, $a6, 0, 0 - vpickve2gr.d $a6, $vr21, 1 - vstelm.b $vr29, $a6, 0, 2 - vpickve2gr.d $a6, $vr20, 0 - vstelm.b $vr29, $a6, 0, 4 - vpickve2gr.d $a6, $vr20, 1 - vstelm.b $vr29, $a6, 0, 6 + vsrli.w $vr25, $vr25, 4 + vpickev.h $vr26, $vr26, $vr27 + vpickev.h $vr25, $vr28, $vr25 + vpickve2gr.d $a6, $vr19, 0 + vstelm.b $vr25, $a6, 0, 0 + vpickve2gr.d $a6, $vr19, 1 + vstelm.b $vr25, $a6, 0, 2 vpickve2gr.d $a6, $vr18, 0 - vstelm.b $vr29, $a6, 0, 8 + vstelm.b $vr25, $a6, 0, 4 vpickve2gr.d $a6, $vr18, 1 - vstelm.b $vr29, $a6, 0, 10 + vstelm.b $vr25, $a6, 0, 6 vpickve2gr.d $a6, $vr17, 0 - vstelm.b $vr29, $a6, 0, 12 + vstelm.b $vr25, $a6, 0, 8 vpickve2gr.d $a6, $vr17, 1 - vstelm.b $vr29, $a6, 0, 14 + vstelm.b $vr25, $a6, 0, 10 vpickve2gr.d $a6, $vr16, 0 - vstelm.b $vr28, $a6, 0, 0 + vstelm.b $vr25, $a6, 0, 12 vpickve2gr.d $a6, $vr16, 1 - vstelm.b $vr27, $a6, 0, 2 + vstelm.b $vr25, $a6, 0, 14 + vpickve2gr.d $a6, $vr15, 0 + vstelm.b $vr26, $a6, 0, 0 + vpickve2gr.d $a6, $vr15, 1 + vstelm.b $vr26, $a6, 0, 2 vpickve2gr.d $a6, $vr14, 0 - vstelm.b $vr27, $a6, 0, 4 + vstelm.b $vr26, $a6, 0, 4 vpickve2gr.d $a6, $vr14, 1 - vstelm.b $vr27, $a6, 0, 6 - vpickve2gr.d $a6, $vr13, 0 - vstelm.b $vr27, $a6, 0, 8 - vpickve2gr.d $a6, $vr13, 1 - vstelm.b $vr27, $a6, 0, 10 + vstelm.b $vr26, $a6, 0, 6 vpickve2gr.d $a6, $vr12, 0 - vstelm.b $vr27, $a6, 0, 12 + vstelm.b $vr26, $a6, 0, 8 vpickve2gr.d $a6, $vr12, 1 - vstelm.b $vr27, $a6, 0, 14 + vstelm.b $vr26, $a6, 0, 10 + vpickve2gr.d $a6, $vr11, 0 + vstelm.b $vr26, $a6, 0, 12 + vpickve2gr.d $a6, $vr11, 1 + vstelm.b $vr26, $a6, 0, 14 vpickve2gr.d $a6, $vr10, 1 - vmadd.w $vr22, $vr25, $vr1 - vmadd.w $vr23, $vr24, $vr1 - vaddi.wu $vr10, $vr23, 7 - vaddi.wu $vr12, $vr22, 7 - vsrli.w $vr12, $vr12, 4 + vmadd.w $vr22, $vr24, $vr0 + vmadd.w $vr21, $vr23, $vr0 + vaddi.wu $vr10, $vr21, 7 + vaddi.wu $vr11, $vr22, 7 + vsrli.w $vr11, $vr11, 4 vsrli.w $vr10, $vr10, 4 - vpickev.h $vr10, $vr10, $vr12 - 
vstelm.b $vr12, $fp, 3, 0 + vpickev.h $vr10, $vr10, $vr11 + vstelm.b $vr10, $fp, 3, 0 vstelm.b $vr10, $a6, 3, 2 - vpickve2gr.d $a6, $vr9, 0 - vstelm.b $vr10, $a6, 3, 4 - vpickve2gr.d $a6, $vr9, 1 - vstelm.b $vr10, $a6, 3, 6 vpickve2gr.d $a6, $vr7, 0 - vstelm.b $vr10, $a6, 3, 8 + vstelm.b $vr10, $a6, 3, 4 vpickve2gr.d $a6, $vr7, 1 - vstelm.b $vr10, $a6, 3, 10 + vstelm.b $vr10, $a6, 3, 6 vpickve2gr.d $a6, $vr6, 0 - vstelm.b $vr10, $a6, 3, 12 + vstelm.b $vr10, $a6, 3, 8 vpickve2gr.d $a6, $vr6, 1 - vstelm.b $vr10, $a6, 3, 14 + vstelm.b $vr10, $a6, 3, 10 vpickve2gr.d $a6, $vr5, 0 - vmadd.w $vr15, $vr19, $vr1 - vaddi.wu $vr6, $vr15, 7 - vsrli.w $vr6, $vr6, 4 - vstelm.b $vr6, $a6, 3, 0 + vstelm.b $vr10, $a6, 3, 12 vpickve2gr.d $a6, $vr5, 1 + vstelm.b $vr10, $a6, 3, 14 + vpickve2gr.d $a6, $vr4, 0 + vmadd.w $vr9, $vr20, $vr0 vori.b $vr5, $vr8, 0 - vmadd.w $vr5, $vr11, $vr1 + vmadd.w $vr5, $vr13, $vr0 vaddi.wu $vr5, $vr5, 7 + vaddi.wu $vr6, $vr9, 7 + vsrli.w $vr6, $vr6, 4 vsrli.w $vr5, $vr5, 4 vpickev.h $vr5, $vr5, $vr6 + vstelm.b $vr5, $a6, 3, 0 + vpickve2gr.d $a6, $vr4, 1 vstelm.b $vr5, $a6, 3, 2 - vpickve2gr.d $a6, $vr4, 0 + vpickve2gr.d $a6, $vr3, 0 vstelm.b $vr5, $a6, 3, 4 - vpickve2gr.d $a6, $vr4, 1 + vpickve2gr.d $a6, $vr3, 1 vstelm.b $vr5, $a6, 3, 6 - vpickve2gr.d $a6, $vr3, 0 + vpickve2gr.d $a6, $vr2, 0 vstelm.b $vr5, $a6, 3, 8 - vpickve2gr.d $a6, $vr3, 1 + vpickve2gr.d $a6, $vr2, 1 vstelm.b $vr5, $a6, 3, 10 - vpickve2gr.d $a6, $vr2, 0 + vpickve2gr.d $a6, $vr1, 0 vstelm.b $vr5, $a6, 3, 12 - vpickve2gr.d $a6, $vr2, 1 + vpickve2gr.d $a6, $vr1, 1 vstelm.b $vr5, $a6, 3, 14 addi.d $fp, $fp, 32 addi.d $ra, $ra, -16 @@ -1206,7 +1216,7 @@ h2v2_fancy_upsample: # @h2v2_fancy_upsample add.d $s4, $t8, $s4 alsl.d $s5, $s6, $s1, 4 alsl.d $s6, $s6, $s2, 4 - vinsgr2vr.w $vr11, $a6, 3 + vinsgr2vr.w $vr13, $a6, 3 vinsgr2vr.w $vr8, $s8, 3 move $s8, $s7 .p2align 4, , 16 @@ -1214,165 +1224,175 @@ h2v2_fancy_upsample: # @h2v2_fancy_upsample # Parent Loop BB7_4 Depth=1 # => This Inner Loop Header: Depth=2 addi.d $a6, $t8, 2 - vld $vr12, $a7, %pc_lo12(.LCPI7_0) + vld $vr9, $a7, %pc_lo12(.LCPI7_0) vreplgr2vr.d $vr10, $t8 - vld $vr13, $t0, %pc_lo12(.LCPI7_1) + vld $vr12, $t0, %pc_lo12(.LCPI7_1) vld $vr14, $t1, %pc_lo12(.LCPI7_2) vld $vr15, $t2, %pc_lo12(.LCPI7_3) + vadd.d $vr1, $vr10, $vr9 vadd.d $vr2, $vr10, $vr12 - vadd.d $vr3, $vr10, $vr13 - vadd.d $vr4, $vr10, $vr14 - vadd.d $vr5, $vr10, $vr15 - vld $vr17, $t3, %pc_lo12(.LCPI7_4) - vld $vr18, $t4, %pc_lo12(.LCPI7_5) - vld $vr19, $t5, %pc_lo12(.LCPI7_6) - vld $vr21, $t6, %pc_lo12(.LCPI7_7) + vadd.d $vr3, $vr10, $vr14 + vadd.d $vr4, $vr10, $vr15 + vld $vr16, $t3, %pc_lo12(.LCPI7_4) + vld $vr17, $t4, %pc_lo12(.LCPI7_5) + vld $vr18, $t5, %pc_lo12(.LCPI7_6) + vld $vr19, $t6, %pc_lo12(.LCPI7_7) + vadd.d $vr5, $vr10, $vr16 vadd.d $vr6, $vr10, $vr17 vadd.d $vr7, $vr10, $vr18 - vadd.d $vr9, $vr10, $vr19 - vadd.d $vr10, $vr10, $vr21 - vreplgr2vr.d $vr22, $a6 - vadd.d $vr12, $vr22, $vr12 - vadd.d $vr13, $vr22, $vr13 - vadd.d $vr14, $vr22, $vr14 - vadd.d $vr16, $vr22, $vr15 - vadd.d $vr17, $vr22, $vr17 - vld $vr15, $s2, 0 - vadd.d $vr18, $vr22, $vr18 - vadd.d $vr20, $vr22, $vr19 - vadd.d $vr21, $vr22, $vr21 - vilvh.b $vr19, $vr0, $vr15 - vilvl.h $vr24, $vr0, $vr19 - vilvh.h $vr19, $vr0, $vr19 - vilvl.b $vr15, $vr0, $vr15 + vadd.d $vr10, $vr10, $vr19 + vreplgr2vr.d $vr20, $a6 + vadd.d $vr11, $vr20, $vr9 + vadd.d $vr12, $vr20, $vr12 + vadd.d $vr14, $vr20, $vr14 + vadd.d $vr15, $vr20, $vr15 + vadd.d $vr16, $vr20, $vr16 + vld $vr9, $s2, 0 + vadd.d $vr17, $vr20, $vr17 + vadd.d 
$vr18, $vr20, $vr18 + vadd.d $vr19, $vr20, $vr19 + vbsrl.v $vr20, $vr9, 8 + vsllwil.hu.bu $vr20, $vr20, 0 + vsllwil.wu.hu $vr20, $vr20, 0 + vbsrl.v $vr21, $vr9, 12 + vsllwil.hu.bu $vr21, $vr21, 0 + vsllwil.wu.hu $vr23, $vr21, 0 + vsrli.d $vr21, $vr9, 32 + vsllwil.hu.bu $vr21, $vr21, 0 vld $vr22, $s1, 0 - vilvh.h $vr25, $vr0, $vr15 - vilvl.h $vr26, $vr0, $vr15 - vld $vr27, $t7, %pc_lo12(.LCPI7_8) - vilvh.b $vr23, $vr0, $vr22 - vilvl.h $vr15, $vr0, $vr23 - vbsrl.v $vr28, $vr8, 12 - vshuf.w $vr27, $vr8, $vr11 - vilvh.h $vr8, $vr0, $vr23 - vilvl.b $vr11, $vr0, $vr22 - vilvh.h $vr23, $vr0, $vr11 - vilvl.h $vr22, $vr0, $vr11 - vmadd.w $vr22, $vr26, $vr1 - vmadd.w $vr23, $vr25, $vr1 - vmadd.w $vr8, $vr19, $vr1 - vmadd.w $vr15, $vr24, $vr1 - vbsrl.v $vr11, $vr15, 12 - vbsll.v $vr19, $vr8, 4 - vor.v $vr11, $vr19, $vr11 - vbsrl.v $vr19, $vr23, 12 - vbsll.v $vr24, $vr15, 4 - vor.v $vr19, $vr24, $vr19 - vbsrl.v $vr24, $vr22, 12 - vbsll.v $vr25, $vr23, 4 - vor.v $vr24, $vr25, $vr24 - vbsll.v $vr25, $vr22, 4 - vor.v $vr25, $vr25, $vr28 - vpackev.d $vr26, $vr22, $vr27 - vbsrl.v $vr27, $vr15, 8 - vbsll.v $vr28, $vr8, 8 + vsllwil.wu.hu $vr24, $vr21, 0 + vsllwil.hu.bu $vr9, $vr9, 0 + vsllwil.wu.hu $vr25, $vr9, 0 + vbsrl.v $vr9, $vr22, 8 + vsllwil.hu.bu $vr9, $vr9, 0 + vsllwil.wu.hu $vr9, $vr9, 0 + vld $vr26, $t7, %pc_lo12(.LCPI7_8) + vbsrl.v $vr21, $vr22, 12 + vsllwil.hu.bu $vr21, $vr21, 0 + vbsrl.v $vr27, $vr8, 12 + vshuf.w $vr26, $vr8, $vr13 + vsllwil.wu.hu $vr8, $vr21, 0 + vsrli.d $vr13, $vr22, 32 + vsllwil.hu.bu $vr13, $vr13, 0 + vsllwil.wu.hu $vr21, $vr13, 0 + vsllwil.hu.bu $vr13, $vr22, 0 + vsllwil.wu.hu $vr22, $vr13, 0 + vmadd.w $vr22, $vr25, $vr0 + vmadd.w $vr21, $vr24, $vr0 + vmadd.w $vr8, $vr23, $vr0 + vmadd.w $vr9, $vr20, $vr0 + vbsrl.v $vr13, $vr9, 12 + vbsll.v $vr20, $vr8, 4 + vor.v $vr13, $vr20, $vr13 + vbsrl.v $vr20, $vr21, 12 + vbsll.v $vr23, $vr9, 4 + vor.v $vr20, $vr23, $vr20 + vbsrl.v $vr23, $vr22, 12 + vbsll.v $vr24, $vr21, 4 + vor.v $vr23, $vr24, $vr23 + vbsll.v $vr24, $vr22, 4 + vor.v $vr24, $vr24, $vr27 + vpackev.d $vr25, $vr22, $vr26 + vbsrl.v $vr26, $vr9, 8 + vbsll.v $vr27, $vr8, 8 + vor.v $vr26, $vr27, $vr26 + vbsrl.v $vr27, $vr21, 8 + vbsll.v $vr28, $vr9, 8 vor.v $vr27, $vr28, $vr27 - vbsrl.v $vr28, $vr23, 8 - vbsll.v $vr29, $vr15, 8 + vbsrl.v $vr28, $vr22, 8 + vbsll.v $vr29, $vr21, 8 vor.v $vr28, $vr29, $vr28 - vbsrl.v $vr29, $vr22, 8 - vbsll.v $vr30, $vr23, 8 - vor.v $vr29, $vr30, $vr29 - vmadd.w $vr29, $vr24, $vr1 - vmadd.w $vr28, $vr19, $vr1 - vmadd.w $vr27, $vr11, $vr1 - vmadd.w $vr26, $vr25, $vr1 + vmadd.w $vr28, $vr23, $vr0 + vmadd.w $vr27, $vr20, $vr0 + vmadd.w $vr26, $vr13, $vr0 + vmadd.w $vr25, $vr24, $vr0 + vaddi.wu $vr26, $vr26, 8 vaddi.wu $vr27, $vr27, 8 vaddi.wu $vr28, $vr28, 8 - vaddi.wu $vr29, $vr29, 8 - vaddi.wu $vr26, $vr26, 8 - vsrli.w $vr29, $vr29, 4 + vaddi.wu $vr25, $vr25, 8 vsrli.w $vr28, $vr28, 4 vsrli.w $vr27, $vr27, 4 vsrli.w $vr26, $vr26, 4 - vpickev.h $vr27, $vr27, $vr28 - vpickev.h $vr29, $vr29, $vr26 - vpickve2gr.d $a6, $vr21, 0 - vstelm.b $vr26, $a6, 0, 0 - vpickve2gr.d $a6, $vr21, 1 - vstelm.b $vr29, $a6, 0, 2 - vpickve2gr.d $a6, $vr20, 0 - vstelm.b $vr29, $a6, 0, 4 - vpickve2gr.d $a6, $vr20, 1 - vstelm.b $vr29, $a6, 0, 6 + vsrli.w $vr25, $vr25, 4 + vpickev.h $vr26, $vr26, $vr27 + vpickev.h $vr25, $vr28, $vr25 + vpickve2gr.d $a6, $vr19, 0 + vstelm.b $vr25, $a6, 0, 0 + vpickve2gr.d $a6, $vr19, 1 + vstelm.b $vr25, $a6, 0, 2 vpickve2gr.d $a6, $vr18, 0 - vstelm.b $vr29, $a6, 0, 8 + vstelm.b $vr25, $a6, 0, 4 vpickve2gr.d $a6, $vr18, 1 - vstelm.b $vr29, 
$a6, 0, 10 + vstelm.b $vr25, $a6, 0, 6 vpickve2gr.d $a6, $vr17, 0 - vstelm.b $vr29, $a6, 0, 12 + vstelm.b $vr25, $a6, 0, 8 vpickve2gr.d $a6, $vr17, 1 - vstelm.b $vr29, $a6, 0, 14 + vstelm.b $vr25, $a6, 0, 10 vpickve2gr.d $a6, $vr16, 0 - vstelm.b $vr28, $a6, 0, 0 + vstelm.b $vr25, $a6, 0, 12 vpickve2gr.d $a6, $vr16, 1 - vstelm.b $vr27, $a6, 0, 2 + vstelm.b $vr25, $a6, 0, 14 + vpickve2gr.d $a6, $vr15, 0 + vstelm.b $vr26, $a6, 0, 0 + vpickve2gr.d $a6, $vr15, 1 + vstelm.b $vr26, $a6, 0, 2 vpickve2gr.d $a6, $vr14, 0 - vstelm.b $vr27, $a6, 0, 4 + vstelm.b $vr26, $a6, 0, 4 vpickve2gr.d $a6, $vr14, 1 - vstelm.b $vr27, $a6, 0, 6 - vpickve2gr.d $a6, $vr13, 0 - vstelm.b $vr27, $a6, 0, 8 - vpickve2gr.d $a6, $vr13, 1 - vstelm.b $vr27, $a6, 0, 10 + vstelm.b $vr26, $a6, 0, 6 vpickve2gr.d $a6, $vr12, 0 - vstelm.b $vr27, $a6, 0, 12 + vstelm.b $vr26, $a6, 0, 8 vpickve2gr.d $a6, $vr12, 1 - vstelm.b $vr27, $a6, 0, 14 + vstelm.b $vr26, $a6, 0, 10 + vpickve2gr.d $a6, $vr11, 0 + vstelm.b $vr26, $a6, 0, 12 + vpickve2gr.d $a6, $vr11, 1 + vstelm.b $vr26, $a6, 0, 14 vpickve2gr.d $a6, $vr10, 1 - vmadd.w $vr22, $vr25, $vr1 - vmadd.w $vr23, $vr24, $vr1 - vaddi.wu $vr10, $vr23, 7 - vaddi.wu $vr12, $vr22, 7 - vsrli.w $vr12, $vr12, 4 + vmadd.w $vr22, $vr24, $vr0 + vmadd.w $vr21, $vr23, $vr0 + vaddi.wu $vr10, $vr21, 7 + vaddi.wu $vr11, $vr22, 7 + vsrli.w $vr11, $vr11, 4 vsrli.w $vr10, $vr10, 4 - vpickev.h $vr10, $vr10, $vr12 - vstelm.b $vr12, $t8, 3, 0 + vpickev.h $vr10, $vr10, $vr11 + vstelm.b $vr10, $t8, 3, 0 vstelm.b $vr10, $a6, 3, 2 - vpickve2gr.d $a6, $vr9, 0 - vstelm.b $vr10, $a6, 3, 4 - vpickve2gr.d $a6, $vr9, 1 - vstelm.b $vr10, $a6, 3, 6 vpickve2gr.d $a6, $vr7, 0 - vstelm.b $vr10, $a6, 3, 8 + vstelm.b $vr10, $a6, 3, 4 vpickve2gr.d $a6, $vr7, 1 - vstelm.b $vr10, $a6, 3, 10 + vstelm.b $vr10, $a6, 3, 6 vpickve2gr.d $a6, $vr6, 0 - vstelm.b $vr10, $a6, 3, 12 + vstelm.b $vr10, $a6, 3, 8 vpickve2gr.d $a6, $vr6, 1 - vstelm.b $vr10, $a6, 3, 14 + vstelm.b $vr10, $a6, 3, 10 vpickve2gr.d $a6, $vr5, 0 - vmadd.w $vr15, $vr19, $vr1 - vaddi.wu $vr6, $vr15, 7 - vsrli.w $vr6, $vr6, 4 - vstelm.b $vr6, $a6, 3, 0 + vstelm.b $vr10, $a6, 3, 12 vpickve2gr.d $a6, $vr5, 1 + vstelm.b $vr10, $a6, 3, 14 + vpickve2gr.d $a6, $vr4, 0 + vmadd.w $vr9, $vr20, $vr0 vori.b $vr5, $vr8, 0 - vmadd.w $vr5, $vr11, $vr1 + vmadd.w $vr5, $vr13, $vr0 vaddi.wu $vr5, $vr5, 7 + vaddi.wu $vr6, $vr9, 7 + vsrli.w $vr6, $vr6, 4 vsrli.w $vr5, $vr5, 4 vpickev.h $vr5, $vr5, $vr6 + vstelm.b $vr5, $a6, 3, 0 + vpickve2gr.d $a6, $vr4, 1 vstelm.b $vr5, $a6, 3, 2 - vpickve2gr.d $a6, $vr4, 0 + vpickve2gr.d $a6, $vr3, 0 vstelm.b $vr5, $a6, 3, 4 - vpickve2gr.d $a6, $vr4, 1 + vpickve2gr.d $a6, $vr3, 1 vstelm.b $vr5, $a6, 3, 6 - vpickve2gr.d $a6, $vr3, 0 + vpickve2gr.d $a6, $vr2, 0 vstelm.b $vr5, $a6, 3, 8 - vpickve2gr.d $a6, $vr3, 1 + vpickve2gr.d $a6, $vr2, 1 vstelm.b $vr5, $a6, 3, 10 - vpickve2gr.d $a6, $vr2, 0 + vpickve2gr.d $a6, $vr1, 0 vstelm.b $vr5, $a6, 3, 12 - vpickve2gr.d $a6, $vr2, 1 + vpickve2gr.d $a6, $vr1, 1 vstelm.b $vr5, $a6, 3, 14 addi.d $t8, $t8, 32 addi.d $s8, $s8, -16 @@ -1424,25 +1444,24 @@ h2v2_fancy_upsample: # @h2v2_fancy_upsample addi.d $t8, $fp, -2 b .LBB7_3 .LBB7_29: - fld.d $fs6, $sp, 16 # 8-byte Folded Reload - fld.d $fs5, $sp, 24 # 8-byte Folded Reload - fld.d $fs4, $sp, 32 # 8-byte Folded Reload - fld.d $fs3, $sp, 40 # 8-byte Folded Reload - fld.d $fs2, $sp, 48 # 8-byte Folded Reload - fld.d $fs1, $sp, 56 # 8-byte Folded Reload - fld.d $fs0, $sp, 64 # 8-byte Folded Reload - ld.d $s8, $sp, 72 # 8-byte Folded Reload - ld.d $s7, $sp, 80 # 8-byte Folded 
Reload - ld.d $s6, $sp, 88 # 8-byte Folded Reload - ld.d $s5, $sp, 96 # 8-byte Folded Reload - ld.d $s4, $sp, 104 # 8-byte Folded Reload - ld.d $s3, $sp, 112 # 8-byte Folded Reload - ld.d $s2, $sp, 120 # 8-byte Folded Reload - ld.d $s1, $sp, 128 # 8-byte Folded Reload - ld.d $s0, $sp, 136 # 8-byte Folded Reload - ld.d $fp, $sp, 144 # 8-byte Folded Reload - ld.d $ra, $sp, 152 # 8-byte Folded Reload - addi.d $sp, $sp, 160 + fld.d $fs5, $sp, 8 # 8-byte Folded Reload + fld.d $fs4, $sp, 16 # 8-byte Folded Reload + fld.d $fs3, $sp, 24 # 8-byte Folded Reload + fld.d $fs2, $sp, 32 # 8-byte Folded Reload + fld.d $fs1, $sp, 40 # 8-byte Folded Reload + fld.d $fs0, $sp, 48 # 8-byte Folded Reload + ld.d $s8, $sp, 56 # 8-byte Folded Reload + ld.d $s7, $sp, 64 # 8-byte Folded Reload + ld.d $s6, $sp, 72 # 8-byte Folded Reload + ld.d $s5, $sp, 80 # 8-byte Folded Reload + ld.d $s4, $sp, 88 # 8-byte Folded Reload + ld.d $s3, $sp, 96 # 8-byte Folded Reload + ld.d $s2, $sp, 104 # 8-byte Folded Reload + ld.d $s1, $sp, 112 # 8-byte Folded Reload + ld.d $s0, $sp, 120 # 8-byte Folded Reload + ld.d $fp, $sp, 128 # 8-byte Folded Reload + ld.d $ra, $sp, 136 # 8-byte Folded Reload + addi.d $sp, $sp, 144 .LBB7_30: # %._crit_edge68 ret .Lfunc_end7: diff --git a/results/MultiSource/Benchmarks/MiBench/consumer-jpeg/CMakeFiles/consumer-jpeg.dir/jfdctfst.s b/results/MultiSource/Benchmarks/MiBench/consumer-jpeg/CMakeFiles/consumer-jpeg.dir/jfdctfst.s index 06ae4e82..88726e02 100644 --- a/results/MultiSource/Benchmarks/MiBench/consumer-jpeg/CMakeFiles/consumer-jpeg.dir/jfdctfst.s +++ b/results/MultiSource/Benchmarks/MiBench/consumer-jpeg/CMakeFiles/consumer-jpeg.dir/jfdctfst.s @@ -99,12 +99,9 @@ jpeg_fdct_ifast: # @jpeg_fdct_ifast vsub.w $vr3, $vr8, $vr6 vst $vr3, $a0, 128 vadd.w $vr0, $vr0, $vr5 - vshuf4i.w $vr3, $vr0, 16 - vslli.d $vr3, $vr3, 32 - vsrai.d $vr3, $vr3, 32 - vshuf4i.w $vr0, $vr0, 50 - vslli.d $vr0, $vr0, 32 - vsrai.d $vr6, $vr0, 32 + vsllwil.d.w $vr3, $vr0, 0 + vshuf4i.w $vr0, $vr0, 14 + vsllwil.d.w $vr6, $vr0, 0 vrepli.d $vr0, 181 vmul.d $vr6, $vr6, $vr0 vmul.d $vr3, $vr3, $vr0 @@ -119,24 +116,18 @@ jpeg_fdct_ifast: # @jpeg_fdct_ifast vadd.w $vr5, $vr1, $vr2 vadd.w $vr6, $vr2, $vr4 vsub.w $vr1, $vr3, $vr6 - vshuf4i.w $vr2, $vr1, 16 - vslli.d $vr2, $vr2, 32 - vsrai.d $vr2, $vr2, 32 - vshuf4i.w $vr1, $vr1, 50 - vslli.d $vr1, $vr1, 32 - vsrai.d $vr7, $vr1, 32 + vsllwil.d.w $vr2, $vr1, 0 + vshuf4i.w $vr1, $vr1, 14 + vsllwil.d.w $vr7, $vr1, 0 vrepli.d $vr1, 98 vmul.d $vr7, $vr7, $vr1 vmul.d $vr2, $vr2, $vr1 vsrli.d $vr2, $vr2, 8 vsrli.d $vr7, $vr7, 8 vpickev.w $vr7, $vr7, $vr2 - vshuf4i.w $vr2, $vr3, 16 - vslli.d $vr2, $vr2, 32 - vsrai.d $vr8, $vr2, 32 - vshuf4i.w $vr2, $vr3, 50 - vslli.d $vr2, $vr2, 32 - vsrai.d $vr3, $vr2, 32 + vsllwil.d.w $vr8, $vr3, 0 + vshuf4i.w $vr2, $vr3, 14 + vsllwil.d.w $vr3, $vr2, 0 vrepli.d $vr2, 139 vmul.d $vr3, $vr3, $vr2 vmul.d $vr8, $vr8, $vr2 @@ -144,12 +135,9 @@ jpeg_fdct_ifast: # @jpeg_fdct_ifast vsrli.d $vr3, $vr3, 8 vpickev.w $vr3, $vr3, $vr8 vadd.w $vr8, $vr7, $vr3 - vshuf4i.w $vr3, $vr6, 16 - vslli.d $vr3, $vr3, 32 - vsrai.d $vr9, $vr3, 32 - vshuf4i.w $vr3, $vr6, 50 - vslli.d $vr3, $vr3, 32 - vsrai.d $vr6, $vr3, 32 + vsllwil.d.w $vr9, $vr6, 0 + vshuf4i.w $vr3, $vr6, 14 + vsllwil.d.w $vr6, $vr3, 0 vrepli.d $vr3, 334 vmul.d $vr6, $vr6, $vr3 vmul.d $vr9, $vr9, $vr3 @@ -157,12 +145,9 @@ jpeg_fdct_ifast: # @jpeg_fdct_ifast vsrli.d $vr6, $vr6, 8 vpickev.w $vr6, $vr6, $vr9 vadd.w $vr6, $vr7, $vr6 - vshuf4i.w $vr7, $vr5, 16 - vslli.d $vr7, $vr7, 32 - vsrai.d $vr7, $vr7, 32 - 
vshuf4i.w $vr5, $vr5, 50 - vslli.d $vr5, $vr5, 32 - vsrai.d $vr5, $vr5, 32 + vsllwil.d.w $vr7, $vr5, 0 + vshuf4i.w $vr5, $vr5, 14 + vsllwil.d.w $vr5, $vr5, 0 vmul.d $vr5, $vr5, $vr0 vmul.d $vr7, $vr7, $vr0 vsrli.d $vr7, $vr7, 8 @@ -203,12 +188,9 @@ jpeg_fdct_ifast: # @jpeg_fdct_ifast vsub.w $vr8, $vr12, $vr10 vst $vr8, $a0, 144 vadd.w $vr5, $vr5, $vr9 - vshuf4i.w $vr8, $vr5, 16 - vslli.d $vr8, $vr8, 32 - vsrai.d $vr8, $vr8, 32 - vshuf4i.w $vr5, $vr5, 50 - vslli.d $vr5, $vr5, 32 - vsrai.d $vr5, $vr5, 32 + vsllwil.d.w $vr8, $vr5, 0 + vshuf4i.w $vr5, $vr5, 14 + vsllwil.d.w $vr5, $vr5, 0 vmul.d $vr5, $vr5, $vr0 vmul.d $vr8, $vr8, $vr0 vsrli.d $vr8, $vr8, 8 @@ -222,47 +204,35 @@ jpeg_fdct_ifast: # @jpeg_fdct_ifast vadd.w $vr6, $vr6, $vr7 vadd.w $vr7, $vr7, $vr4 vsub.w $vr8, $vr5, $vr7 - vshuf4i.w $vr9, $vr8, 16 - vslli.d $vr9, $vr9, 32 - vsrai.d $vr9, $vr9, 32 - vshuf4i.w $vr8, $vr8, 50 - vslli.d $vr8, $vr8, 32 - vsrai.d $vr8, $vr8, 32 + vsllwil.d.w $vr9, $vr8, 0 + vshuf4i.w $vr8, $vr8, 14 + vsllwil.d.w $vr8, $vr8, 0 vmul.d $vr8, $vr8, $vr1 vmul.d $vr1, $vr9, $vr1 vsrli.d $vr1, $vr1, 8 vsrli.d $vr8, $vr8, 8 vpickev.w $vr1, $vr8, $vr1 - vshuf4i.w $vr8, $vr5, 16 - vslli.d $vr8, $vr8, 32 - vsrai.d $vr8, $vr8, 32 - vshuf4i.w $vr5, $vr5, 50 - vslli.d $vr5, $vr5, 32 - vsrai.d $vr5, $vr5, 32 + vsllwil.d.w $vr8, $vr5, 0 + vshuf4i.w $vr5, $vr5, 14 + vsllwil.d.w $vr5, $vr5, 0 vmul.d $vr5, $vr5, $vr2 vmul.d $vr2, $vr8, $vr2 vsrli.d $vr2, $vr2, 8 vsrli.d $vr5, $vr5, 8 vpickev.w $vr2, $vr5, $vr2 vadd.w $vr2, $vr1, $vr2 - vshuf4i.w $vr5, $vr7, 16 - vslli.d $vr5, $vr5, 32 - vsrai.d $vr5, $vr5, 32 - vshuf4i.w $vr7, $vr7, 50 - vslli.d $vr7, $vr7, 32 - vsrai.d $vr7, $vr7, 32 + vsllwil.d.w $vr5, $vr7, 0 + vshuf4i.w $vr7, $vr7, 14 + vsllwil.d.w $vr7, $vr7, 0 vmul.d $vr7, $vr7, $vr3 vmul.d $vr3, $vr5, $vr3 vsrli.d $vr3, $vr3, 8 vsrli.d $vr5, $vr7, 8 vpickev.w $vr3, $vr5, $vr3 vadd.w $vr1, $vr1, $vr3 - vshuf4i.w $vr3, $vr6, 16 - vslli.d $vr3, $vr3, 32 - vsrai.d $vr3, $vr3, 32 - vshuf4i.w $vr5, $vr6, 50 - vslli.d $vr5, $vr5, 32 - vsrai.d $vr5, $vr5, 32 + vsllwil.d.w $vr3, $vr6, 0 + vshuf4i.w $vr5, $vr6, 14 + vsllwil.d.w $vr5, $vr5, 0 vmul.d $vr5, $vr5, $vr0 vmul.d $vr0, $vr3, $vr0 vsrli.d $vr0, $vr0, 8 diff --git a/results/MultiSource/Benchmarks/MiBench/consumer-jpeg/CMakeFiles/consumer-jpeg.dir/jfdctint.s b/results/MultiSource/Benchmarks/MiBench/consumer-jpeg/CMakeFiles/consumer-jpeg.dir/jfdctint.s index 8146d8a1..5fc4a4c9 100644 --- a/results/MultiSource/Benchmarks/MiBench/consumer-jpeg/CMakeFiles/consumer-jpeg.dir/jfdctint.s +++ b/results/MultiSource/Benchmarks/MiBench/consumer-jpeg/CMakeFiles/consumer-jpeg.dir/jfdctint.s @@ -127,117 +127,93 @@ jpeg_fdct_islow: # @jpeg_fdct_islow addi.d $t8, $t8, 32 bgez $fp, .LBB0_1 # %bb.2: # %vector.body - vld $vr2, $a0, 0 - vld $vr3, $a0, 224 - vadd.w $vr1, $vr3, $vr2 - vshuf4i.w $vr0, $vr1, 16 - vslli.d $vr0, $vr0, 32 - vsrai.d $vr0, $vr0, 32 - vshuf4i.w $vr1, $vr1, 50 - vslli.d $vr1, $vr1, 32 - vsrai.d $vr1, $vr1, 32 - vsub.w $vr2, $vr2, $vr3 - vshuf4i.w $vr3, $vr2, 50 - vslli.d $vr3, $vr3, 32 - vsrai.d $vr13, $vr3, 32 - vld $vr3, $a0, 32 + vld $vr0, $a0, 0 + vld $vr1, $a0, 224 + vadd.w $vr2, $vr1, $vr0 + vsllwil.d.w $vr3, $vr2, 0 + vshuf4i.w $vr2, $vr2, 14 + vsllwil.d.w $vr2, $vr2, 0 + vsub.w $vr0, $vr0, $vr1 + vld $vr1, $a0, 32 vld $vr4, $a0, 192 - vshuf4i.w $vr2, $vr2, 16 - vslli.d $vr2, $vr2, 32 - vsrai.d $vr14, $vr2, 32 - vadd.w $vr2, $vr4, $vr3 - vshuf4i.w $vr5, $vr2, 16 - vslli.d $vr5, $vr5, 32 - vsrai.d $vr5, $vr5, 32 - vshuf4i.w $vr2, $vr2, 50 - vslli.d $vr2, $vr2, 
32 - vsrai.d $vr2, $vr2, 32 - vsub.w $vr3, $vr3, $vr4 - vshuf4i.w $vr4, $vr3, 50 - vslli.d $vr4, $vr4, 32 - vsrai.d $vr15, $vr4, 32 + vshuf4i.w $vr5, $vr0, 14 + vsllwil.d.w $vr13, $vr5, 0 + vsllwil.d.w $vr14, $vr0, 0 + vadd.w $vr0, $vr4, $vr1 + vsllwil.d.w $vr5, $vr0, 0 + vshuf4i.w $vr0, $vr0, 14 + vsllwil.d.w $vr0, $vr0, 0 + vsub.w $vr1, $vr1, $vr4 vld $vr4, $a0, 64 vld $vr6, $a0, 160 - vshuf4i.w $vr3, $vr3, 16 - vslli.d $vr3, $vr3, 32 - vsrai.d $vr16, $vr3, 32 - vadd.w $vr3, $vr6, $vr4 - vshuf4i.w $vr7, $vr3, 16 - vslli.d $vr7, $vr7, 32 - vsrai.d $vr7, $vr7, 32 - vshuf4i.w $vr3, $vr3, 50 - vslli.d $vr3, $vr3, 32 - vsrai.d $vr3, $vr3, 32 + vshuf4i.w $vr7, $vr1, 14 + vsllwil.d.w $vr15, $vr7, 0 + vsllwil.d.w $vr16, $vr1, 0 + vadd.w $vr1, $vr6, $vr4 + vsllwil.d.w $vr7, $vr1, 0 + vshuf4i.w $vr1, $vr1, 14 + vsllwil.d.w $vr1, $vr1, 0 vsub.w $vr4, $vr4, $vr6 - vshuf4i.w $vr6, $vr4, 50 - vslli.d $vr6, $vr6, 32 - vsrai.d $vr17, $vr6, 32 vld $vr6, $a0, 96 vld $vr8, $a0, 128 - vshuf4i.w $vr4, $vr4, 16 - vslli.d $vr4, $vr4, 32 - vsrai.d $vr18, $vr4, 32 + vshuf4i.w $vr9, $vr4, 14 + vsllwil.d.w $vr17, $vr9, 0 + vsllwil.d.w $vr18, $vr4, 0 vadd.w $vr4, $vr8, $vr6 - vshuf4i.w $vr9, $vr4, 16 - vslli.d $vr9, $vr9, 32 - vsrai.d $vr9, $vr9, 32 - vshuf4i.w $vr4, $vr4, 50 - vslli.d $vr4, $vr4, 32 - vsrai.d $vr4, $vr4, 32 + vsllwil.d.w $vr9, $vr4, 0 + vshuf4i.w $vr4, $vr4, 14 + vsllwil.d.w $vr4, $vr4, 0 vsub.w $vr6, $vr6, $vr8 - vshuf4i.w $vr8, $vr6, 50 - vslli.d $vr8, $vr8, 32 - vsrai.d $vr19, $vr8, 32 - vshuf4i.w $vr6, $vr6, 16 - vslli.d $vr6, $vr6, 32 - vsrai.d $vr20, $vr6, 32 - vadd.d $vr6, $vr4, $vr1 - vadd.d $vr8, $vr9, $vr0 - vsub.d $vr9, $vr0, $vr9 - vsub.d $vr1, $vr1, $vr4 - vadd.d $vr0, $vr3, $vr2 - vadd.d $vr4, $vr7, $vr5 + vshuf4i.w $vr8, $vr6, 14 + vsllwil.d.w $vr19, $vr8, 0 + vsllwil.d.w $vr20, $vr6, 0 + vadd.d $vr6, $vr4, $vr2 + vadd.d $vr8, $vr9, $vr3 + vsub.d $vr3, $vr3, $vr9 + vsub.d $vr2, $vr2, $vr4 + vadd.d $vr4, $vr1, $vr0 + vadd.d $vr9, $vr7, $vr5 vsub.d $vr5, $vr5, $vr7 - vsub.d $vr2, $vr2, $vr3 - vadd.d $vr3, $vr4, $vr8 - vadd.d $vr7, $vr0, $vr6 + vsub.d $vr1, $vr0, $vr1 + vadd.d $vr0, $vr9, $vr8 + vadd.d $vr7, $vr4, $vr6 vaddi.du $vr7, $vr7, 2 - vaddi.du $vr3, $vr3, 2 - vsrli.d $vr3, $vr3, 2 + vaddi.du $vr0, $vr0, 2 + vsrli.d $vr0, $vr0, 2 vsrli.d $vr7, $vr7, 2 - vpickev.w $vr3, $vr7, $vr3 - vst $vr3, $a0, 0 - vsub.d $vr3, $vr8, $vr4 - vsub.d $vr0, $vr6, $vr0 + vpickev.w $vr0, $vr7, $vr0 + vst $vr0, $a0, 0 + vsub.d $vr0, $vr8, $vr9 + vsub.d $vr4, $vr6, $vr4 + vaddi.du $vr4, $vr4, 2 vaddi.du $vr0, $vr0, 2 - vaddi.du $vr3, $vr3, 2 - vsrli.d $vr3, $vr3, 2 vsrli.d $vr0, $vr0, 2 - vpickev.w $vr0, $vr0, $vr3 + vsrli.d $vr4, $vr4, 2 + vpickev.w $vr0, $vr4, $vr0 vst $vr0, $a0, 128 - vadd.d $vr3, $vr1, $vr2 - vadd.d $vr4, $vr9, $vr5 + vadd.d $vr4, $vr2, $vr1 + vadd.d $vr8, $vr3, $vr5 vreplgr2vr.d $vr7, $t2 vreplgr2vr.d $vr0, $t4 vreplgr2vr.d $vr6, $t3 - vori.b $vr8, $vr0, 0 vori.b $vr10, $vr0, 0 - vmadd.d $vr10, $vr3, $vr7 - vmadd.d $vr8, $vr4, $vr7 - vori.b $vr3, $vr10, 0 - vmadd.d $vr3, $vr1, $vr6 - vori.b $vr1, $vr8, 0 - vmadd.d $vr1, $vr9, $vr6 - vsrli.d $vr1, $vr1, 15 - vsrli.d $vr3, $vr3, 15 - vpickev.w $vr1, $vr3, $vr1 - vst $vr1, $a0, 64 + vori.b $vr11, $vr0, 0 + vmadd.d $vr11, $vr4, $vr7 + vmadd.d $vr10, $vr8, $vr7 + vori.b $vr4, $vr11, 0 + vmadd.d $vr4, $vr2, $vr6 + vori.b $vr2, $vr10, 0 + vmadd.d $vr2, $vr3, $vr6 + vsrli.d $vr2, $vr2, 15 + vsrli.d $vr3, $vr4, 15 + vpickev.w $vr2, $vr3, $vr2 + vst $vr2, $a0, 64 lu32i.d $t1, 32767 vreplgr2vr.d $vr9, $t1 - vmadd.d $vr10, $vr2, $vr9 - 
vmadd.d $vr8, $vr5, $vr9 - vsrli.d $vr1, $vr8, 15 - vsrli.d $vr2, $vr10, 15 + vmadd.d $vr11, $vr1, $vr9 + vmadd.d $vr10, $vr5, $vr9 + vsrli.d $vr1, $vr10, 15 + vsrli.d $vr2, $vr11, 15 vpickev.w $vr1, $vr2, $vr1 vst $vr1, $a0, 192 vadd.d $vr21, $vr20, $vr14 @@ -315,67 +291,43 @@ jpeg_fdct_islow: # @jpeg_fdct_islow vpickev.w $vr13, $vr13, $vr14 vst $vr13, $a0, 32 vadd.w $vr13, $vr16, $vr15 - vshuf4i.w $vr14, $vr13, 16 - vslli.d $vr14, $vr14, 32 - vsrai.d $vr21, $vr14, 32 - vshuf4i.w $vr13, $vr13, 50 - vslli.d $vr13, $vr13, 32 - vsrai.d $vr22, $vr13, 32 + vsllwil.d.w $vr21, $vr13, 0 + vshuf4i.w $vr13, $vr13, 14 + vsllwil.d.w $vr22, $vr13, 0 vsub.w $vr14, $vr15, $vr16 - vshuf4i.w $vr13, $vr14, 50 - vslli.d $vr13, $vr13, 32 - vsrai.d $vr13, $vr13, 32 vld $vr15, $a0, 48 vld $vr16, $a0, 208 - vshuf4i.w $vr14, $vr14, 16 - vslli.d $vr14, $vr14, 32 - vsrai.d $vr14, $vr14, 32 + vshuf4i.w $vr13, $vr14, 14 + vsllwil.d.w $vr13, $vr13, 0 + vsllwil.d.w $vr14, $vr14, 0 vadd.w $vr17, $vr16, $vr15 - vshuf4i.w $vr18, $vr17, 16 - vslli.d $vr18, $vr18, 32 - vsrai.d $vr23, $vr18, 32 - vshuf4i.w $vr17, $vr17, 50 - vslli.d $vr17, $vr17, 32 - vsrai.d $vr24, $vr17, 32 + vsllwil.d.w $vr23, $vr17, 0 + vshuf4i.w $vr17, $vr17, 14 + vsllwil.d.w $vr24, $vr17, 0 vsub.w $vr16, $vr15, $vr16 - vshuf4i.w $vr15, $vr16, 50 - vslli.d $vr15, $vr15, 32 - vsrai.d $vr15, $vr15, 32 vld $vr17, $a0, 80 vld $vr18, $a0, 176 - vshuf4i.w $vr16, $vr16, 16 - vslli.d $vr16, $vr16, 32 - vsrai.d $vr16, $vr16, 32 + vshuf4i.w $vr15, $vr16, 14 + vsllwil.d.w $vr15, $vr15, 0 + vsllwil.d.w $vr16, $vr16, 0 vadd.w $vr19, $vr18, $vr17 - vshuf4i.w $vr20, $vr19, 16 - vslli.d $vr20, $vr20, 32 - vsrai.d $vr25, $vr20, 32 - vshuf4i.w $vr19, $vr19, 50 - vslli.d $vr19, $vr19, 32 - vsrai.d $vr26, $vr19, 32 + vsllwil.d.w $vr25, $vr19, 0 + vshuf4i.w $vr19, $vr19, 14 + vsllwil.d.w $vr26, $vr19, 0 vsub.w $vr18, $vr17, $vr18 - vshuf4i.w $vr17, $vr18, 50 - vslli.d $vr17, $vr17, 32 - vsrai.d $vr17, $vr17, 32 vld $vr19, $a0, 112 vld $vr20, $a0, 144 - vshuf4i.w $vr18, $vr18, 16 - vslli.d $vr18, $vr18, 32 - vsrai.d $vr18, $vr18, 32 + vshuf4i.w $vr17, $vr18, 14 + vsllwil.d.w $vr17, $vr17, 0 + vsllwil.d.w $vr18, $vr18, 0 vadd.w $vr27, $vr20, $vr19 - vshuf4i.w $vr28, $vr27, 16 - vslli.d $vr28, $vr28, 32 - vsrai.d $vr28, $vr28, 32 - vshuf4i.w $vr27, $vr27, 50 - vslli.d $vr27, $vr27, 32 - vsrai.d $vr27, $vr27, 32 + vsllwil.d.w $vr28, $vr27, 0 + vshuf4i.w $vr27, $vr27, 14 + vsllwil.d.w $vr27, $vr27, 0 vsub.w $vr20, $vr19, $vr20 - vshuf4i.w $vr19, $vr20, 50 - vslli.d $vr19, $vr19, 32 - vsrai.d $vr19, $vr19, 32 - vshuf4i.w $vr20, $vr20, 16 - vslli.d $vr20, $vr20, 32 - vsrai.d $vr20, $vr20, 32 + vshuf4i.w $vr19, $vr20, 14 + vsllwil.d.w $vr19, $vr19, 0 + vsllwil.d.w $vr20, $vr20, 0 vadd.d $vr29, $vr27, $vr22 vadd.d $vr30, $vr28, $vr21 vsub.d $vr21, $vr21, $vr28 diff --git a/results/MultiSource/Benchmarks/MiBench/consumer-jpeg/CMakeFiles/consumer-jpeg.dir/jquant1.s b/results/MultiSource/Benchmarks/MiBench/consumer-jpeg/CMakeFiles/consumer-jpeg.dir/jquant1.s index 2ae9ab1f..1cbca9a9 100644 --- a/results/MultiSource/Benchmarks/MiBench/consumer-jpeg/CMakeFiles/consumer-jpeg.dir/jquant1.s +++ b/results/MultiSource/Benchmarks/MiBench/consumer-jpeg/CMakeFiles/consumer-jpeg.dir/jquant1.s @@ -443,15 +443,15 @@ jinit_1pass_quantizer: # @jinit_1pass_quantizer .type start_pass_1_quant,@function start_pass_1_quant: # @start_pass_1_quant # %bb.0: - addi.d $sp, $sp, -112 - st.d $ra, $sp, 104 # 8-byte Folded Spill - st.d $fp, $sp, 96 # 8-byte Folded Spill - st.d $s0, $sp, 88 # 8-byte Folded Spill - 
st.d $s1, $sp, 80 # 8-byte Folded Spill - st.d $s2, $sp, 72 # 8-byte Folded Spill - st.d $s3, $sp, 64 # 8-byte Folded Spill - st.d $s4, $sp, 56 # 8-byte Folded Spill - st.d $s5, $sp, 48 # 8-byte Folded Spill + addi.d $sp, $sp, -96 + st.d $ra, $sp, 88 # 8-byte Folded Spill + st.d $fp, $sp, 80 # 8-byte Folded Spill + st.d $s0, $sp, 72 # 8-byte Folded Spill + st.d $s1, $sp, 64 # 8-byte Folded Spill + st.d $s2, $sp, 56 # 8-byte Folded Spill + st.d $s3, $sp, 48 # 8-byte Folded Spill + st.d $s4, $sp, 40 # 8-byte Folded Spill + st.d $s5, $sp, 32 # 8-byte Folded Spill move $fp, $a0 ld.d $s0, $a0, 608 ld.d $a1, $s0, 32 @@ -510,15 +510,15 @@ start_pass_1_quant: # @start_pass_1_quant ori $a2, $zero, 47 st.w $a2, $a0, 40 move $a0, $fp - ld.d $s5, $sp, 48 # 8-byte Folded Reload - ld.d $s4, $sp, 56 # 8-byte Folded Reload - ld.d $s3, $sp, 64 # 8-byte Folded Reload - ld.d $s2, $sp, 72 # 8-byte Folded Reload - ld.d $s1, $sp, 80 # 8-byte Folded Reload - ld.d $s0, $sp, 88 # 8-byte Folded Reload - ld.d $fp, $sp, 96 # 8-byte Folded Reload - ld.d $ra, $sp, 104 # 8-byte Folded Reload - addi.d $sp, $sp, 112 + ld.d $s5, $sp, 32 # 8-byte Folded Reload + ld.d $s4, $sp, 40 # 8-byte Folded Reload + ld.d $s3, $sp, 48 # 8-byte Folded Reload + ld.d $s2, $sp, 56 # 8-byte Folded Reload + ld.d $s1, $sp, 64 # 8-byte Folded Reload + ld.d $s0, $sp, 72 # 8-byte Folded Reload + ld.d $fp, $sp, 80 # 8-byte Folded Reload + ld.d $ra, $sp, 88 # 8-byte Folded Reload + addi.d $sp, $sp, 96 jr $a1 .LBB1_10: pcalau12i $a0, %pc_hi20(color_quantize) @@ -543,10 +543,8 @@ start_pass_1_quant: # @start_pass_1_quant pcalau12i $a0, %pc_hi20(base_dither_matrix+8) addi.d $s3, $a0, %pc_lo12(base_dither_matrix+8) move $s4, $zero - vrepli.b $vr3, 0 - vrepli.w $vr4, 255 - vst $vr3, $sp, 32 # 16-byte Folded Spill - vst $vr4, $sp, 16 # 16-byte Folded Spill + vrepli.w $vr3, 255 + vst $vr3, $sp, 16 # 16-byte Folded Spill b .LBB1_17 .p2align 4, , 16 .LBB1_15: # in Loop: Header=BB1_17 Depth=1 @@ -587,8 +585,7 @@ start_pass_1_quant: # @start_pass_1_quant ori $a2, $zero, 1024 move $a0, $fp jirl $ra, $a3, 0 - vld $vr4, $sp, 16 # 16-byte Folded Reload - vld $vr3, $sp, 32 # 16-byte Folded Reload + vld $vr3, $sp, 16 # 16-byte Folded Reload move $a1, $zero slli.d $a2, $s5, 9 addi.d $a2, $a2, -512 @@ -600,17 +597,14 @@ start_pass_1_quant: # @start_pass_1_quant # => This Inner Loop Header: Depth=2 ld.w $a3, $a2, -8 vinsgr2vr.w $vr1, $a3, 0 - vilvl.b $vr1, $vr3, $vr1 - vilvl.h $vr1, $vr3, $vr1 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 vslli.w $vr1, $vr1, 1 - vsub.w $vr1, $vr4, $vr1 - vmul.w $vr1, $vr1, $vr4 - vshuf4i.w $vr2, $vr1, 50 - vslli.d $vr2, $vr2, 32 - vsrai.d $vr2, $vr2, 32 - vshuf4i.w $vr1, $vr1, 16 - vslli.d $vr1, $vr1, 32 - vsrai.d $vr1, $vr1, 32 + vsub.w $vr1, $vr3, $vr1 + vmul.w $vr1, $vr1, $vr3 + vshuf4i.w $vr2, $vr1, 14 + vsllwil.d.w $vr2, $vr2, 0 + vsllwil.d.w $vr1, $vr1, 0 vdiv.d $vr1, $vr1, $vr0 vdiv.d $vr2, $vr2, $vr0 vpickev.w $vr1, $vr2, $vr1 @@ -618,51 +612,42 @@ start_pass_1_quant: # @start_pass_1_quant ld.w $a3, $a2, -4 add.d $a4, $a0, $a1 vinsgr2vr.w $vr1, $a3, 0 - vilvl.b $vr1, $vr3, $vr1 - vilvl.h $vr1, $vr3, $vr1 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 vslli.w $vr1, $vr1, 1 - vsub.w $vr1, $vr4, $vr1 - vmul.w $vr1, $vr1, $vr4 - vshuf4i.w $vr2, $vr1, 50 - vslli.d $vr2, $vr2, 32 - vsrai.d $vr2, $vr2, 32 - vshuf4i.w $vr1, $vr1, 16 - vslli.d $vr1, $vr1, 32 - vsrai.d $vr1, $vr1, 32 + vsub.w $vr1, $vr3, $vr1 + vmul.w $vr1, $vr1, $vr3 + vshuf4i.w $vr2, $vr1, 14 + vsllwil.d.w $vr2, $vr2, 0 + vsllwil.d.w $vr1, 
$vr1, 0 vdiv.d $vr1, $vr1, $vr0 vdiv.d $vr2, $vr2, $vr0 vpickev.w $vr1, $vr2, $vr1 vst $vr1, $a4, 16 ld.w $a3, $a2, 0 vinsgr2vr.w $vr1, $a3, 0 - vilvl.b $vr1, $vr3, $vr1 - vilvl.h $vr1, $vr3, $vr1 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 vslli.w $vr1, $vr1, 1 - vsub.w $vr1, $vr4, $vr1 - vmul.w $vr1, $vr1, $vr4 - vshuf4i.w $vr2, $vr1, 50 - vslli.d $vr2, $vr2, 32 - vsrai.d $vr2, $vr2, 32 - vshuf4i.w $vr1, $vr1, 16 - vslli.d $vr1, $vr1, 32 - vsrai.d $vr1, $vr1, 32 + vsub.w $vr1, $vr3, $vr1 + vmul.w $vr1, $vr1, $vr3 + vshuf4i.w $vr2, $vr1, 14 + vsllwil.d.w $vr2, $vr2, 0 + vsllwil.d.w $vr1, $vr1, 0 vdiv.d $vr1, $vr1, $vr0 vdiv.d $vr2, $vr2, $vr0 vpickev.w $vr1, $vr2, $vr1 vst $vr1, $a4, 32 ld.w $a3, $a2, 4 vinsgr2vr.w $vr1, $a3, 0 - vilvl.b $vr1, $vr3, $vr1 - vilvl.h $vr1, $vr3, $vr1 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 vslli.w $vr1, $vr1, 1 - vsub.w $vr1, $vr4, $vr1 - vmul.w $vr1, $vr1, $vr4 - vshuf4i.w $vr2, $vr1, 50 - vslli.d $vr2, $vr2, 32 - vsrai.d $vr2, $vr2, 32 - vshuf4i.w $vr1, $vr1, 16 - vslli.d $vr1, $vr1, 32 - vsrai.d $vr1, $vr1, 32 + vsub.w $vr1, $vr3, $vr1 + vmul.w $vr1, $vr1, $vr3 + vshuf4i.w $vr2, $vr1, 14 + vsllwil.d.w $vr2, $vr2, 0 + vsllwil.d.w $vr1, $vr1, 0 vdiv.d $vr1, $vr1, $vr0 vdiv.d $vr2, $vr2, $vr0 vpickev.w $vr1, $vr2, $vr1 @@ -717,15 +702,15 @@ start_pass_1_quant: # @start_pass_1_quant addi.d $s1, $s1, 8 blt $s2, $a0, .LBB1_28 .LBB1_29: # %create_odither_tables.exit - ld.d $s5, $sp, 48 # 8-byte Folded Reload - ld.d $s4, $sp, 56 # 8-byte Folded Reload - ld.d $s3, $sp, 64 # 8-byte Folded Reload - ld.d $s2, $sp, 72 # 8-byte Folded Reload - ld.d $s1, $sp, 80 # 8-byte Folded Reload - ld.d $s0, $sp, 88 # 8-byte Folded Reload - ld.d $fp, $sp, 96 # 8-byte Folded Reload - ld.d $ra, $sp, 104 # 8-byte Folded Reload - addi.d $sp, $sp, 112 + ld.d $s5, $sp, 32 # 8-byte Folded Reload + ld.d $s4, $sp, 40 # 8-byte Folded Reload + ld.d $s3, $sp, 48 # 8-byte Folded Reload + ld.d $s2, $sp, 56 # 8-byte Folded Reload + ld.d $s1, $sp, 64 # 8-byte Folded Reload + ld.d $s0, $sp, 72 # 8-byte Folded Reload + ld.d $fp, $sp, 80 # 8-byte Folded Reload + ld.d $ra, $sp, 88 # 8-byte Folded Reload + addi.d $sp, $sp, 96 ret .Lfunc_end1: .size start_pass_1_quant, .Lfunc_end1-start_pass_1_quant @@ -1011,12 +996,12 @@ color_quantize: # @color_quantize ld.h $t5, $t5, 2 vinsgr2vr.h $vr3, $t6, 0 vinsgr2vr.h $vr4, $t5, 0 - vilvl.b $vr3, $vr0, $vr3 - vilvl.h $vr3, $vr0, $vr3 - vilvl.w $vr3, $vr0, $vr3 - vilvl.b $vr4, $vr0, $vr4 - vilvl.h $vr4, $vr0, $vr4 - vilvl.w $vr4, $vr0, $vr4 + vsllwil.hu.bu $vr3, $vr3, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vsllwil.du.wu $vr3, $vr3, 0 + vsllwil.hu.bu $vr4, $vr4, 0 + vsllwil.wu.hu $vr4, $vr4, 0 + vsllwil.du.wu $vr4, $vr4, 0 ld.d $t5, $t4, -16 vpickve2gr.d $t6, $vr3, 0 ld.d $t7, $t4, -8 @@ -1033,10 +1018,10 @@ color_quantize: # @color_quantize vinsgr2vr.b $vr3, $t6, 1 vinsgr2vr.b $vr4, $t7, 0 vinsgr2vr.b $vr4, $t8, 1 - vilvl.b $vr3, $vr0, $vr3 - vilvl.h $vr3, $vr0, $vr3 - vilvl.b $vr4, $vr0, $vr4 - vilvl.h $vr4, $vr0, $vr4 + vsllwil.hu.bu $vr3, $vr3, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vsllwil.hu.bu $vr4, $vr4, 0 + vsllwil.wu.hu $vr4, $vr4, 0 vadd.w $vr1, $vr1, $vr3 vadd.w $vr2, $vr2, $vr4 addi.d $t3, $t3, 4 diff --git a/results/MultiSource/Benchmarks/MiBench/consumer-jpeg/CMakeFiles/consumer-jpeg.dir/jquant2.s b/results/MultiSource/Benchmarks/MiBench/consumer-jpeg/CMakeFiles/consumer-jpeg.dir/jquant2.s index 5c8380d1..251a91c3 100644 --- a/results/MultiSource/Benchmarks/MiBench/consumer-jpeg/CMakeFiles/consumer-jpeg.dir/jquant2.s +++ 
b/results/MultiSource/Benchmarks/MiBench/consumer-jpeg/CMakeFiles/consumer-jpeg.dir/jquant2.s @@ -886,20 +886,20 @@ finish_pass1: # @finish_pass1 ld.d $s0, $sp, 40 # 8-byte Folded Reload ld.d $a0, $s0, 152 ld.d $a0, $a0, 0 - srai.d $a1, $ra, 1 + srai.d $a1, $s7, 1 add.d $a3, $s1, $a1 - div.d $a3, $a3, $ra + div.d $a3, $a3, $s7 ld.d $t0, $sp, 48 # 8-byte Folded Reload stx.b $a3, $a0, $t0 ld.d $a0, $s0, 152 ld.d $a0, $a0, 8 add.d $a3, $s2, $a1 - div.d $a3, $a3, $ra + div.d $a3, $a3, $s7 stx.b $a3, $a0, $t0 ld.d $a0, $s0, 152 ld.d $a0, $a0, 16 - add.d $a1, $s7, $a1 - div.d $a1, $a1, $ra + add.d $a1, $s6, $a1 + div.d $a1, $a1, $s7 stx.b $a1, $a0, $t0 addi.d $t0, $t0, 1 ld.d $fp, $sp, 32 # 8-byte Folded Reload @@ -925,10 +925,10 @@ finish_pass1: # @finish_pass1 # in Loop: Header=BB5_20 Depth=1 st.d $t0, $sp, 48 # 8-byte Folded Spill ld.d $t0, $s0, 608 - move $s7, $zero + move $s6, $zero move $s2, $zero move $s1, $zero - move $ra, $zero + move $s7, $zero ld.w $t1, $a5, 4 ld.d $a5, $t0, 48 ldx.w $a6, $fp, $a6 @@ -983,9 +983,9 @@ finish_pass1: # @finish_pass1 # Child Loop BB5_29 Depth 4 # Child Loop BB5_33 Depth 4 slli.d $a1, $s5, 6 - add.d $s6, $t8, $a1 + add.d $s8, $t8, $a1 slli.d $a1, $s5, 2 - addi.w $s8, $a1, 2 + addi.w $ra, $a1, 2 bgeu $t1, $a2, .LBB5_28 # %bb.27: # in Loop: Header=BB5_26 Depth=3 move $s0, $a4 @@ -993,16 +993,16 @@ finish_pass1: # @finish_pass1 .p2align 4, , 16 .LBB5_28: # %vector.ph # in Loop: Header=BB5_26 Depth=3 - add.d $s6, $s6, $t5 + add.d $s8, $s8, $t5 vori.b $vr7, $vr1, 0 - vinsgr2vr.d $vr7, $s7, 0 + vinsgr2vr.d $vr7, $s6, 0 vori.b $vr6, $vr1, 0 vinsgr2vr.d $vr6, $s2, 0 vori.b $vr5, $vr1, 0 vinsgr2vr.d $vr5, $s1, 0 vori.b $vr4, $vr1, 0 - vinsgr2vr.d $vr4, $ra, 0 - vreplgr2vr.d $vr9, $s8 + vinsgr2vr.d $vr4, $s7, 0 + vreplgr2vr.d $vr9, $ra move $s1, $t3 move $s2, $s4 vori.b $vr12, $vr1, 0 @@ -1021,19 +1021,15 @@ finish_pass1: # @finish_pass1 vinsgr2vr.w $vr14, $a1, 0 vinsgr2vr.w $vr15, $fp, 0 vseqi.h $vr16, $vr14, 0 - vilvl.h $vr16, $vr16, $vr16 - vilvl.w $vr16, $vr16, $vr16 - vslli.d $vr16, $vr16, 48 - vsrai.d $vr16, $vr16, 48 + vsllwil.w.h $vr16, $vr16, 0 + vsllwil.d.w $vr16, $vr16, 0 vseqi.h $vr17, $vr15, 0 - vilvl.h $vr17, $vr17, $vr17 - vilvl.w $vr17, $vr17, $vr17 - vslli.d $vr17, $vr17, 48 - vsrai.d $vr17, $vr17, 48 - vilvl.h $vr14, $vr1, $vr14 - vilvl.w $vr14, $vr1, $vr14 - vilvl.h $vr15, $vr1, $vr15 - vilvl.w $vr15, $vr1, $vr15 + vsllwil.w.h $vr17, $vr17, 0 + vsllwil.d.w $vr17, $vr17, 0 + vsllwil.wu.hu $vr14, $vr14, 0 + vsllwil.du.wu $vr14, $vr14, 0 + vsllwil.wu.hu $vr15, $vr15, 0 + vsllwil.du.wu $vr15, $vr15, 0 vadd.d $vr4, $vr4, $vr14 vadd.d $vr8, $vr8, $vr15 vmul.d $vr18, $vr3, $vr14 @@ -1043,12 +1039,8 @@ finish_pass1: # @finish_pass1 vslli.w $vr22, $vr13, 3 vbitseti.w $vr23, $vr22, 2 vaddi.wu $vr22, $vr22, 20 - vshuf4i.w $vr23, $vr23, 16 - vslli.d $vr23, $vr23, 32 - vsrai.d $vr23, $vr23, 32 - vshuf4i.w $vr22, $vr22, 16 - vslli.d $vr22, $vr22, 32 - vsrai.d $vr22, $vr22, 32 + vsllwil.d.w $vr23, $vr23, 0 + vsllwil.d.w $vr22, $vr22, 0 vmul.d $vr14, $vr14, $vr23 vmul.d $vr15, $vr15, $vr22 vbitsel.v $vr18, $vr18, $vr1, $vr16 @@ -1071,7 +1063,7 @@ finish_pass1: # @finish_pass1 # in Loop: Header=BB5_26 Depth=3 vadd.d $vr7, $vr12, $vr7 vhaddw.q.d $vr7, $vr7, $vr7 - vpickve2gr.d $s7, $vr7, 0 + vpickve2gr.d $s6, $vr7, 0 vadd.d $vr6, $vr11, $vr6 vhaddw.q.d $vr6, $vr6, $vr6 vpickve2gr.d $s2, $vr6, 0 @@ -1080,7 +1072,7 @@ finish_pass1: # @finish_pass1 vpickve2gr.d $s1, $vr5, 0 vadd.d $vr4, $vr8, $vr4 vhaddw.q.d $vr4, $vr4, $vr4 - vpickve2gr.d $ra, $vr4, 0 + vpickve2gr.d 
$s7, $vr4, 0 move $s0, $t4 beq $t2, $t3, .LBB5_25 .LBB5_31: # %scalar.ph.preheader @@ -1091,7 +1083,7 @@ finish_pass1: # @finish_pass1 b .LBB5_33 .p2align 4, , 16 .LBB5_32: # in Loop: Header=BB5_33 Depth=4 - addi.d $s6, $s6, 2 + addi.d $s8, $s8, 2 addi.w $a1, $a1, -1 addi.w $s0, $s0, 8 beqz $a1, .LBB5_25 @@ -1100,16 +1092,16 @@ finish_pass1: # @finish_pass1 # Parent Loop BB5_24 Depth=2 # Parent Loop BB5_26 Depth=3 # => This Inner Loop Header: Depth=4 - ld.hu $fp, $s6, 0 + ld.hu $fp, $s8, 0 beqz $fp, .LBB5_32 # %bb.34: # in Loop: Header=BB5_33 Depth=4 - add.d $ra, $ra, $fp + add.d $s7, $s7, $fp mul.d $a0, $fp, $s3 add.d $s1, $a0, $s1 - mul.d $a0, $fp, $s8 + mul.d $a0, $fp, $ra add.d $s2, $a0, $s2 mul.d $a0, $fp, $s0 - add.d $s7, $a0, $s7 + add.d $s6, $a0, $s6 b .LBB5_32 .p2align 4, , 16 .LBB5_35: # %.preheader.us.i.i @@ -2018,7 +2010,7 @@ fill_inverse_cmap: # @fill_inverse_cmap fst.d $fs5, $sp, 1896 # 8-byte Folded Spill fst.d $fs6, $sp, 1888 # 8-byte Folded Spill fst.d $fs7, $sp, 1880 # 8-byte Folded Spill - addi.d $sp, $sp, -736 + addi.d $sp, $sp, -608 ld.d $a4, $a0, 608 move $a5, $a1 ld.d $a1, $a4, 48 @@ -2079,6 +2071,14 @@ fill_inverse_cmap: # @fill_inverse_cmap .LBB10_4: # %find_nearby_colors.exit ori $a1, $s2, 4095 vreplgr2vr.d $vr0, $a1 + vst $vr0, $sp, 432 + vst $vr0, $sp, 448 + vst $vr0, $sp, 464 + vst $vr0, $sp, 480 + vst $vr0, $sp, 496 + vst $vr0, $sp, 512 + vst $vr0, $sp, 528 + vst $vr0, $sp, 544 vst $vr0, $sp, 560 vst $vr0, $sp, 576 vst $vr0, $sp, 592 @@ -2135,14 +2135,6 @@ fill_inverse_cmap: # @fill_inverse_cmap vst $vr0, $sp, 1408 vst $vr0, $sp, 1424 vst $vr0, $sp, 1440 - vst $vr0, $sp, 1456 - vst $vr0, $sp, 1472 - vst $vr0, $sp, 1488 - vst $vr0, $sp, 1504 - vst $vr0, $sp, 1520 - vst $vr0, $sp, 1536 - vst $vr0, $sp, 1552 - vst $vr0, $sp, 1568 blez $t2, .LBB10_73 # %bb.5: # %.lr.ph.i37 ld.d $a0, $a0, 152 @@ -2157,7 +2149,7 @@ fill_inverse_cmap: # @fill_inverse_cmap beq $t0, $t2, .LBB10_73 .LBB10_7: # =>This Loop Header: Depth=1 # Child Loop BB10_9 Depth 2 - addi.d $a2, $sp, 304 + addi.d $a2, $sp, 176 ldx.bu $t8, $t0, $a2 ld.d $a2, $a0, 0 ldx.bu $a2, $a2, $t8 @@ -2189,8 +2181,8 @@ fill_inverse_cmap: # @fill_inverse_cmap addi.d $s8, $a2, 1296 addi.d $ra, $a2, 1584 addi.d $a4, $a2, 1872 - addi.d $t5, $sp, 191 - addi.d $t4, $sp, 688 + addi.d $t5, $sp, 63 + addi.d $t4, $sp, 560 move $t7, $a1 b .LBB10_9 .p2align 4, , 16 @@ -2541,7 +2533,7 @@ fill_inverse_cmap: # @fill_inverse_cmap andi $a2, $a2, 28 ld.d $a3, $sp, 32 # 8-byte Folded Reload alsl.d $a2, $a2, $a3, 3 - addi.d $a3, $sp, 176 + addi.d $a3, $sp, 48 slli.d $a4, $a4, 6 ori $a5, $zero, 32 .p2align 4, , 16 @@ -2650,7 +2642,7 @@ fill_inverse_cmap: # @fill_inverse_cmap addi.d $a3, $a3, 32 bne $a0, $a5, .LBB10_74 # %bb.75: - addi.d $sp, $sp, 736 + addi.d $sp, $sp, 608 fld.d $fs7, $sp, 1880 # 8-byte Folded Reload fld.d $fs6, $sp, 1888 # 8-byte Folded Reload fld.d $fs5, $sp, 1896 # 8-byte Folded Reload @@ -2675,328 +2667,250 @@ fill_inverse_cmap: # @fill_inverse_cmap .LBB10_76: # %vector.ph bstrpick.d $a1, $t1, 30, 2 slli.d $s1, $a1, 2 - vinsgr2vr.w $vr18, $a5, 0 - vinsgr2vr.w $vr18, $a5, 1 + vinsgr2vr.w $vr0, $a5, 0 + vinsgr2vr.w $vr0, $a5, 1 vinsgr2vr.w $vr1, $t2, 0 vinsgr2vr.w $vr1, $t2, 1 - vinsgr2vr.w $vr0, $t3, 0 - vinsgr2vr.w $vr0, $t3, 1 - vst $vr0, $sp, 80 # 16-byte Folded Spill + vinsgr2vr.w $vr2, $t3, 0 + vinsgr2vr.w $vr2, $t3, 1 vinsgr2vr.w $vr3, $a6, 0 vinsgr2vr.w $vr3, $a6, 1 vinsgr2vr.w $vr4, $t4, 0 vinsgr2vr.w $vr4, $t4, 1 - vinsgr2vr.w $vr0, $t5, 0 - vinsgr2vr.w $vr0, $t5, 1 - vst $vr0, $sp, 64 # 16-byte Folded Spill 
+ vinsgr2vr.w $vr5, $t5, 0 + vinsgr2vr.w $vr5, $t5, 1 vinsgr2vr.w $vr6, $a7, 0 vinsgr2vr.w $vr6, $a7, 1 vinsgr2vr.w $vr7, $t6, 0 vinsgr2vr.w $vr7, $t6, 1 - vinsgr2vr.w $vr0, $t7, 0 - vinsgr2vr.w $vr0, $t7, 1 - vst $vr0, $sp, 48 # 16-byte Folded Spill - move $a1, $s2 - addi.d $s2, $t8, 2 - addi.d $s3, $sp, 576 - addi.d $s4, $fp, 2 - addi.d $s5, $s0, 2 - ori $a1, $a1, 4095 - vreplgr2vr.d $vr9, $a1 - vrepli.b $vr10, 0 - vrepli.w $vr11, 3 - move $s6, $s1 - vori.b $vr12, $vr9, 0 + vinsgr2vr.w $vr8, $t7, 0 + vinsgr2vr.w $vr8, $t7, 1 + addi.d $a1, $t8, 2 + addi.d $a2, $sp, 448 + addi.d $a3, $fp, 2 + addi.d $a4, $s0, 2 + ori $s2, $s2, 4095 + vreplgr2vr.d $vr9, $s2 + vrepli.w $vr10, 3 + move $s2, $s1 + vori.b $vr11, $vr9, 0 .p2align 4, , 16 .LBB10_77: # %vector.body # =>This Inner Loop Header: Depth=1 - ld.h $a1, $s2, -2 - ld.h $a2, $s2, 0 - vinsgr2vr.h $vr0, $a1, 0 - vinsgr2vr.h $vr2, $a2, 0 - vilvl.b $vr0, $vr10, $vr0 - vilvl.h $vr14, $vr10, $vr0 - vilvl.b $vr0, $vr10, $vr2 - vilvl.h $vr13, $vr10, $vr0 - vsle.wu $vr16, $vr18, $vr14 - vshuf4i.w $vr0, $vr16, 16 - vslli.d $vr0, $vr0, 32 - vsrai.d $vr2, $vr0, 32 - vsle.wu $vr15, $vr18, $vr13 - vshuf4i.w $vr0, $vr15, 16 - vslli.d $vr0, $vr0, 32 - vsrai.d $vr0, $vr0, 32 - vslt.wu $vr5, $vr1, $vr14 - vslt.wu $vr8, $vr1, $vr13 - vand.v $vr5, $vr16, $vr5 - vst $vr5, $sp, 144 # 16-byte Folded Spill - vshuf4i.w $vr5, $vr5, 16 - vslli.d $vr5, $vr5, 32 - vsrai.d $vr5, $vr5, 32 - vand.v $vr8, $vr15, $vr8 - vst $vr8, $sp, 160 # 16-byte Folded Spill - vshuf4i.w $vr8, $vr8, 16 - vslli.d $vr8, $vr8, 32 - vsrai.d $vr8, $vr8, 32 - vsub.w $vr19, $vr14, $vr1 - vsub.w $vr20, $vr13, $vr1 - vslli.w $vr19, $vr19, 1 - vslli.w $vr20, $vr20, 1 - vilvl.w $vr19, $vr10, $vr19 - vilvl.w $vr20, $vr10, $vr20 - vmul.d $vr19, $vr19, $vr19 - vmul.d $vr20, $vr20, $vr20 - vsub.w $vr21, $vr14, $vr18 - vsub.w $vr23, $vr13, $vr18 - vslli.w $vr21, $vr21, 1 - vslli.w $vr23, $vr23, 1 - vshuf4i.w $vr21, $vr21, 16 - vslli.d $vr21, $vr21, 32 - vsrai.d $vr21, $vr21, 32 - vshuf4i.w $vr23, $vr23, 16 - vslli.d $vr23, $vr23, 32 - vsrai.d $vr23, $vr23, 32 - vmul.d $vr21, $vr21, $vr21 - vmul.d $vr23, $vr23, $vr23 - vand.v $vr5, $vr5, $vr19 - vbitsel.v $vr2, $vr21, $vr5, $vr2 - vst $vr2, $sp, 128 # 16-byte Folded Spill - ld.h $a1, $s4, -2 - ld.h $a2, $s4, 0 - vand.v $vr2, $vr8, $vr20 - vbitsel.v $vr0, $vr23, $vr2, $vr0 - vst $vr0, $sp, 112 # 16-byte Folded Spill - vinsgr2vr.h $vr0, $a1, 0 - vinsgr2vr.h $vr2, $a2, 0 - vilvl.b $vr0, $vr10, $vr0 - vilvl.h $vr21, $vr10, $vr0 - vilvl.b $vr0, $vr10, $vr2 - vilvl.h $vr20, $vr10, $vr0 - vsle.wu $vr24, $vr3, $vr21 - vshuf4i.w $vr0, $vr24, 16 - vslli.d $vr0, $vr0, 32 - vsrai.d $vr5, $vr0, 32 - vsle.wu $vr23, $vr3, $vr20 - vshuf4i.w $vr0, $vr23, 16 - vslli.d $vr0, $vr0, 32 - vsrai.d $vr0, $vr0, 32 - vslt.wu $vr2, $vr4, $vr21 - vslt.wu $vr8, $vr4, $vr20 - vand.v $vr26, $vr24, $vr2 - vshuf4i.w $vr2, $vr26, 16 - vslli.d $vr2, $vr2, 32 - vsrai.d $vr19, $vr2, 32 - vand.v $vr25, $vr23, $vr8 - vshuf4i.w $vr2, $vr25, 16 - vslli.d $vr2, $vr2, 32 - vsrai.d $vr2, $vr2, 32 - vsub.w $vr8, $vr21, $vr4 - vsub.w $vr27, $vr20, $vr4 - vmul.w $vr8, $vr8, $vr11 - vmul.w $vr27, $vr27, $vr11 - vilvl.w $vr8, $vr10, $vr8 - vilvl.w $vr27, $vr10, $vr27 - vmul.d $vr8, $vr8, $vr8 + ld.h $s3, $a1, -2 + ld.h $s4, $a1, 0 + vinsgr2vr.h $vr12, $s3, 0 + vinsgr2vr.h $vr13, $s4, 0 + vsllwil.hu.bu $vr12, $vr12, 0 + vsllwil.wu.hu $vr12, $vr12, 0 + vsllwil.hu.bu $vr13, $vr13, 0 + vsllwil.wu.hu $vr13, $vr13, 0 + vsle.wu $vr14, $vr0, $vr12 + vsllwil.d.w $vr15, $vr14, 0 + vsle.wu $vr16, $vr0, $vr13 + 
vsllwil.d.w $vr17, $vr16, 0 + vslt.wu $vr18, $vr1, $vr12 + vslt.wu $vr19, $vr1, $vr13 + vslt.wu $vr20, $vr2, $vr12 + vslt.wu $vr21, $vr2, $vr13 + vand.v $vr18, $vr14, $vr18 + vsllwil.d.w $vr22, $vr18, 0 + vand.v $vr19, $vr16, $vr19 + vsllwil.d.w $vr23, $vr19, 0 + vsub.w $vr24, $vr12, $vr1 + vsub.w $vr25, $vr13, $vr1 + vslli.w $vr24, $vr24, 1 + vslli.w $vr25, $vr25, 1 + vsllwil.du.wu $vr24, $vr24, 0 + vsllwil.du.wu $vr25, $vr25, 0 + vmul.d $vr24, $vr24, $vr24 + vmul.d $vr25, $vr25, $vr25 + vsub.w $vr26, $vr12, $vr0 + vsub.w $vr27, $vr13, $vr0 + vslli.w $vr26, $vr26, 1 + vslli.w $vr27, $vr27, 1 + vsllwil.d.w $vr26, $vr26, 0 + vsllwil.d.w $vr27, $vr27, 0 + vmul.d $vr26, $vr26, $vr26 vmul.d $vr27, $vr27, $vr27 - vsub.w $vr28, $vr21, $vr3 - vsub.w $vr29, $vr20, $vr3 - vmul.w $vr28, $vr28, $vr11 - vmul.w $vr29, $vr29, $vr11 - vshuf4i.w $vr28, $vr28, 16 - vslli.d $vr28, $vr28, 32 - vsrai.d $vr28, $vr28, 32 - vshuf4i.w $vr29, $vr29, 16 - vslli.d $vr29, $vr29, 32 - vsrai.d $vr29, $vr29, 32 + vor.v $vr18, $vr18, $vr20 + vand.v $vr14, $vr14, $vr18 + vpickve2gr.d $s3, $vr14, 0 + vinsgr2vr.w $vr18, $s3, 0 + vsllwil.d.w $vr14, $vr14, 0 + vpickve2gr.d $s3, $vr14, 1 + vinsgr2vr.w $vr18, $s3, 1 + vslli.w $vr14, $vr18, 31 + vsrai.w $vr14, $vr14, 31 + vbitsel.v $vr18, $vr1, $vr0, $vr14 + vor.v $vr14, $vr19, $vr21 + vand.v $vr14, $vr16, $vr14 + vpickve2gr.d $s3, $vr14, 0 + vinsgr2vr.w $vr16, $s3, 0 + vsllwil.d.w $vr14, $vr14, 0 + vpickve2gr.d $s3, $vr14, 1 + vinsgr2vr.w $vr16, $s3, 1 + vslli.w $vr14, $vr16, 31 + vsrai.w $vr14, $vr14, 31 + vbitsel.v $vr16, $vr1, $vr0, $vr14 + vand.v $vr14, $vr22, $vr24 + vbitsel.v $vr14, $vr26, $vr14, $vr15 + vand.v $vr15, $vr23, $vr25 + vbitsel.v $vr15, $vr27, $vr15, $vr17 + ld.h $s3, $a3, -2 + vsub.w $vr12, $vr12, $vr18 + vsub.w $vr13, $vr13, $vr16 + vslli.w $vr12, $vr12, 1 + vinsgr2vr.h $vr16, $s3, 0 + ld.h $s3, $a3, 0 + vslli.w $vr13, $vr13, 1 + vsllwil.d.w $vr12, $vr12, 0 + vsllwil.d.w $vr13, $vr13, 0 + vinsgr2vr.h $vr17, $s3, 0 + vsllwil.hu.bu $vr16, $vr16, 0 + vsllwil.wu.hu $vr16, $vr16, 0 + vsllwil.hu.bu $vr17, $vr17, 0 + vsllwil.wu.hu $vr17, $vr17, 0 + vsle.wu $vr18, $vr3, $vr16 + vsllwil.d.w $vr19, $vr18, 0 + vsle.wu $vr20, $vr3, $vr17 + vsllwil.d.w $vr21, $vr20, 0 + vslt.wu $vr22, $vr4, $vr16 + vslt.wu $vr23, $vr4, $vr17 + vslt.wu $vr24, $vr5, $vr16 + vslt.wu $vr25, $vr5, $vr17 + vand.v $vr22, $vr18, $vr22 + vsllwil.d.w $vr26, $vr22, 0 + vand.v $vr23, $vr20, $vr23 + vsllwil.d.w $vr27, $vr23, 0 + vsub.w $vr28, $vr16, $vr4 + vsub.w $vr29, $vr17, $vr4 + vmul.w $vr28, $vr28, $vr10 + vmul.w $vr29, $vr29, $vr10 + vsllwil.du.wu $vr28, $vr28, 0 + vsllwil.du.wu $vr29, $vr29, 0 vmul.d $vr28, $vr28, $vr28 vmul.d $vr29, $vr29, $vr29 - vand.v $vr8, $vr19, $vr8 - vbitsel.v $vr5, $vr28, $vr8, $vr5 - vst $vr5, $sp, 96 # 16-byte Folded Spill - ld.h $a1, $s5, -2 - ld.h $a2, $s5, 0 - vand.v $vr2, $vr2, $vr27 - vbitsel.v $vr8, $vr29, $vr2, $vr0 - vinsgr2vr.h $vr0, $a1, 0 - vinsgr2vr.h $vr2, $a2, 0 - vilvl.b $vr0, $vr10, $vr0 - vilvl.h $vr28, $vr10, $vr0 - vilvl.b $vr0, $vr10, $vr2 - vilvl.h $vr27, $vr10, $vr0 - vsle.wu $vr30, $vr6, $vr28 - vshuf4i.w $vr0, $vr30, 16 - vslli.d $vr0, $vr0, 32 - vsrai.d $vr0, $vr0, 32 - vsle.wu $vr29, $vr6, $vr27 - vshuf4i.w $vr2, $vr29, 16 - vslli.d $vr2, $vr2, 32 - vsrai.d $vr19, $vr2, 32 - vslt.wu $vr2, $vr7, $vr28 - vslt.wu $vr22, $vr7, $vr27 - vand.v $vr5, $vr30, $vr2 - vshuf4i.w $vr2, $vr5, 16 - vslli.d $vr2, $vr2, 32 - vsrai.d $vr17, $vr2, 32 - vand.v $vr2, $vr29, $vr22 - vsub.w $vr22, $vr28, $vr7 - vilvl.w $vr22, $vr10, $vr22 - vmul.d $vr22, $vr22, 
$vr22 - vand.v $vr17, $vr17, $vr22 - vsub.w $vr22, $vr28, $vr6 - vshuf4i.w $vr22, $vr22, 16 - vslli.d $vr22, $vr22, 32 - vsrai.d $vr22, $vr22, 32 - vmul.d $vr22, $vr22, $vr22 - vbitsel.v $vr0, $vr22, $vr17, $vr0 - vshuf4i.w $vr17, $vr2, 16 - vslli.d $vr17, $vr17, 32 - vsrai.d $vr17, $vr17, 32 - vsub.w $vr22, $vr27, $vr7 - vilvl.w $vr22, $vr10, $vr22 - vmul.d $vr22, $vr22, $vr22 - vand.v $vr17, $vr17, $vr22 - vsub.w $vr22, $vr27, $vr6 - vshuf4i.w $vr22, $vr22, 16 - vslli.d $vr22, $vr22, 32 - vsrai.d $vr22, $vr22, 32 - vmul.d $vr22, $vr22, $vr22 - vbitsel.v $vr19, $vr22, $vr17, $vr19 - vld $vr22, $sp, 80 # 16-byte Folded Reload - vslt.wu $vr17, $vr22, $vr14 - vld $vr31, $sp, 144 # 16-byte Folded Reload - vor.v $vr17, $vr31, $vr17 - vand.v $vr16, $vr16, $vr17 - vshuf4i.w $vr16, $vr16, 16 - vslli.d $vr16, $vr16, 32 - vsrai.d $vr16, $vr16, 32 - vpickve2gr.d $a1, $vr16, 0 - vpickve2gr.d $a2, $vr16, 1 - vslt.wu $vr16, $vr22, $vr13 - vld $vr17, $sp, 160 # 16-byte Folded Reload - vor.v $vr16, $vr17, $vr16 - vand.v $vr15, $vr15, $vr16 - vshuf4i.w $vr15, $vr15, 16 - vslli.d $vr15, $vr15, 32 - vsrai.d $vr15, $vr15, 32 - vpickve2gr.d $a3, $vr15, 0 - vpickve2gr.d $a4, $vr15, 1 - vinsgr2vr.w $vr15, $a1, 0 - vinsgr2vr.w $vr15, $a2, 1 - vslli.w $vr15, $vr15, 31 - vsrai.w $vr15, $vr15, 31 - vbitsel.v $vr15, $vr1, $vr18, $vr15 - vsub.w $vr14, $vr14, $vr15 - vinsgr2vr.w $vr15, $a3, 0 - vinsgr2vr.w $vr15, $a4, 1 - vslli.w $vr15, $vr15, 31 - vsrai.w $vr15, $vr15, 31 - vbitsel.v $vr15, $vr1, $vr18, $vr15 - vsub.w $vr13, $vr13, $vr15 - vld $vr16, $sp, 64 # 16-byte Folded Reload - vslt.wu $vr15, $vr16, $vr21 - vor.v $vr15, $vr26, $vr15 - vand.v $vr15, $vr24, $vr15 - vshuf4i.w $vr15, $vr15, 16 - vslli.d $vr15, $vr15, 32 - vsrai.d $vr15, $vr15, 32 - vpickve2gr.d $a1, $vr15, 0 - vpickve2gr.d $a2, $vr15, 1 - vslt.wu $vr15, $vr16, $vr20 - vor.v $vr15, $vr25, $vr15 - vand.v $vr15, $vr23, $vr15 - vshuf4i.w $vr15, $vr15, 16 - vslli.d $vr15, $vr15, 32 - vsrai.d $vr15, $vr15, 32 - vpickve2gr.d $a3, $vr15, 0 - vpickve2gr.d $a4, $vr15, 1 - vld $vr15, $sp, 128 # 16-byte Folded Reload - vld $vr16, $sp, 96 # 16-byte Folded Reload - vadd.d $vr15, $vr15, $vr16 - vld $vr16, $sp, 112 # 16-byte Folded Reload - vadd.d $vr8, $vr16, $vr8 - vinsgr2vr.w $vr16, $a1, 0 - vinsgr2vr.w $vr16, $a2, 1 - vslli.w $vr16, $vr16, 31 - vsrai.w $vr16, $vr16, 31 - vbitsel.v $vr16, $vr4, $vr3, $vr16 - vsub.w $vr16, $vr21, $vr16 - vinsgr2vr.w $vr17, $a3, 0 - vinsgr2vr.w $vr17, $a4, 1 - vslli.w $vr17, $vr17, 31 - vsrai.w $vr17, $vr17, 31 - vbitsel.v $vr17, $vr4, $vr3, $vr17 - vsub.w $vr17, $vr20, $vr17 - vslli.w $vr14, $vr14, 1 - vshuf4i.w $vr14, $vr14, 16 - vslli.d $vr14, $vr14, 32 - vsrai.d $vr14, $vr14, 32 - vmul.w $vr16, $vr16, $vr11 - vshuf4i.w $vr16, $vr16, 16 - vslli.d $vr16, $vr16, 32 - vsrai.d $vr16, $vr16, 32 + vsub.w $vr30, $vr16, $vr3 + vsub.w $vr31, $vr17, $vr3 + vmul.w $vr30, $vr30, $vr10 + vmul.w $vr31, $vr31, $vr10 + vsllwil.d.w $vr30, $vr30, 0 + vsllwil.d.w $vr31, $vr31, 0 + vmul.d $vr30, $vr30, $vr30 + vmul.d $vr31, $vr31, $vr31 + vor.v $vr22, $vr22, $vr24 + vand.v $vr18, $vr18, $vr22 + vpickve2gr.d $s3, $vr18, 0 + vinsgr2vr.w $vr22, $s3, 0 + vsllwil.d.w $vr18, $vr18, 0 + vpickve2gr.d $s3, $vr18, 1 + vinsgr2vr.w $vr22, $s3, 1 + vslli.w $vr18, $vr22, 31 + vsrai.w $vr18, $vr18, 31 + vbitsel.v $vr18, $vr4, $vr3, $vr18 + vor.v $vr22, $vr23, $vr25 + vand.v $vr20, $vr20, $vr22 + vpickve2gr.d $s3, $vr20, 0 + vinsgr2vr.w $vr22, $s3, 0 + vsllwil.d.w $vr20, $vr20, 0 + vpickve2gr.d $s3, $vr20, 1 + vinsgr2vr.w $vr22, $s3, 1 + vslli.w $vr20, $vr22, 31 
+ vsrai.w $vr20, $vr20, 31 + vbitsel.v $vr20, $vr4, $vr3, $vr20 + vand.v $vr22, $vr26, $vr28 + vbitsel.v $vr19, $vr30, $vr22, $vr19 + vadd.d $vr14, $vr14, $vr19 + vand.v $vr19, $vr27, $vr29 + vbitsel.v $vr19, $vr31, $vr19, $vr21 + vadd.d $vr15, $vr15, $vr19 + vsub.w $vr16, $vr16, $vr18 + vsub.w $vr17, $vr17, $vr20 + vmul.w $vr16, $vr16, $vr10 + vmul.w $vr17, $vr17, $vr10 + ld.h $s3, $a4, -2 + vsllwil.d.w $vr16, $vr16, 0 + vsllwil.d.w $vr17, $vr17, 0 vmul.d $vr16, $vr16, $vr16 - vmadd.d $vr16, $vr14, $vr14 - vslli.w $vr13, $vr13, 1 - vshuf4i.w $vr13, $vr13, 16 - vslli.d $vr13, $vr13, 32 - vsrai.d $vr13, $vr13, 32 - vmul.w $vr14, $vr17, $vr11 - vshuf4i.w $vr14, $vr14, 16 - vslli.d $vr14, $vr14, 32 - vsrai.d $vr14, $vr14, 32 - vmul.d $vr14, $vr14, $vr14 - vmadd.d $vr14, $vr13, $vr13 - vld $vr17, $sp, 48 # 16-byte Folded Reload - vslt.wu $vr13, $vr17, $vr28 - vor.v $vr5, $vr5, $vr13 - vand.v $vr5, $vr30, $vr5 - vshuf4i.w $vr5, $vr5, 16 - vslli.d $vr5, $vr5, 32 - vsrai.d $vr5, $vr5, 32 - vpickve2gr.d $a1, $vr5, 0 - vpickve2gr.d $a2, $vr5, 1 - vslt.wu $vr5, $vr17, $vr27 - vor.v $vr2, $vr2, $vr5 - vand.v $vr2, $vr29, $vr2 - vshuf4i.w $vr2, $vr2, 16 - vslli.d $vr2, $vr2, 32 - vsrai.d $vr2, $vr2, 32 - vpickve2gr.d $a3, $vr2, 0 - vpickve2gr.d $a4, $vr2, 1 - vadd.d $vr0, $vr15, $vr0 - vadd.d $vr2, $vr8, $vr19 - vinsgr2vr.w $vr5, $a1, 0 - vinsgr2vr.w $vr5, $a2, 1 - vslli.w $vr5, $vr5, 31 - vsrai.w $vr5, $vr5, 31 - vbitsel.v $vr5, $vr7, $vr6, $vr5 - vsub.w $vr5, $vr28, $vr5 - vinsgr2vr.w $vr8, $a3, 0 - vinsgr2vr.w $vr8, $a4, 1 - vslli.w $vr8, $vr8, 31 - vsrai.w $vr8, $vr8, 31 - vbitsel.v $vr8, $vr7, $vr6, $vr8 - vsub.w $vr8, $vr27, $vr8 - vshuf4i.w $vr5, $vr5, 16 - vslli.d $vr5, $vr5, 32 - vsrai.d $vr5, $vr5, 32 - vmadd.d $vr16, $vr5, $vr5 - vshuf4i.w $vr5, $vr8, 16 - vslli.d $vr5, $vr5, 32 - vsrai.d $vr5, $vr5, 32 - vmadd.d $vr14, $vr5, $vr5 - vst $vr0, $s3, -16 - vst $vr2, $s3, 0 + vinsgr2vr.h $vr18, $s3, 0 + ld.h $s3, $a4, 0 + vmul.d $vr17, $vr17, $vr17 + vmadd.d $vr16, $vr12, $vr12 + vmadd.d $vr17, $vr13, $vr13 + vinsgr2vr.h $vr12, $s3, 0 + vsllwil.hu.bu $vr13, $vr18, 0 + vsllwil.wu.hu $vr13, $vr13, 0 + vsllwil.hu.bu $vr12, $vr12, 0 + vsllwil.wu.hu $vr12, $vr12, 0 + vsle.wu $vr18, $vr6, $vr13 + vsllwil.d.w $vr19, $vr18, 0 + vsle.wu $vr20, $vr6, $vr12 + vsllwil.d.w $vr21, $vr20, 0 + vslt.wu $vr22, $vr7, $vr13 + vslt.wu $vr23, $vr7, $vr12 + vslt.wu $vr24, $vr8, $vr13 + vslt.wu $vr25, $vr8, $vr12 + vand.v $vr22, $vr18, $vr22 + vsllwil.d.w $vr26, $vr22, 0 + vand.v $vr23, $vr20, $vr23 + vsllwil.d.w $vr27, $vr23, 0 + vsub.w $vr28, $vr13, $vr7 + vsub.w $vr29, $vr12, $vr7 + vsllwil.du.wu $vr28, $vr28, 0 + vsllwil.du.wu $vr29, $vr29, 0 + vmul.d $vr28, $vr28, $vr28 + vmul.d $vr29, $vr29, $vr29 + vsub.w $vr30, $vr13, $vr6 + vsub.w $vr31, $vr12, $vr6 + vsllwil.d.w $vr30, $vr30, 0 + vsllwil.d.w $vr31, $vr31, 0 + vmul.d $vr30, $vr30, $vr30 + vmul.d $vr31, $vr31, $vr31 + vor.v $vr22, $vr22, $vr24 + vand.v $vr18, $vr18, $vr22 + vpickve2gr.d $s3, $vr18, 0 + vinsgr2vr.w $vr22, $s3, 0 + vsllwil.d.w $vr18, $vr18, 0 + vpickve2gr.d $s3, $vr18, 1 + vinsgr2vr.w $vr22, $s3, 1 + vslli.w $vr18, $vr22, 31 + vsrai.w $vr18, $vr18, 31 + vbitsel.v $vr18, $vr7, $vr6, $vr18 + vor.v $vr22, $vr23, $vr25 + vand.v $vr20, $vr20, $vr22 + vpickve2gr.d $s3, $vr20, 0 + vinsgr2vr.w $vr22, $s3, 0 + vsllwil.d.w $vr20, $vr20, 0 + vpickve2gr.d $s3, $vr20, 1 + vinsgr2vr.w $vr22, $s3, 1 + vslli.w $vr20, $vr22, 31 + vsrai.w $vr20, $vr20, 31 + vbitsel.v $vr20, $vr7, $vr6, $vr20 + vand.v $vr22, $vr26, $vr28 + vbitsel.v $vr19, $vr30, $vr22, $vr19 + 
vadd.d $vr14, $vr14, $vr19 + vand.v $vr19, $vr27, $vr29 + vbitsel.v $vr19, $vr31, $vr19, $vr21 + vadd.d $vr15, $vr15, $vr19 + vsub.w $vr13, $vr13, $vr18 + vsub.w $vr12, $vr12, $vr20 + vsllwil.d.w $vr13, $vr13, 0 + vsllwil.d.w $vr12, $vr12, 0 + vmadd.d $vr16, $vr13, $vr13 + vmadd.d $vr17, $vr12, $vr12 + vst $vr14, $a2, -16 + vst $vr15, $a2, 0 vmin.d $vr9, $vr16, $vr9 - vmin.d $vr12, $vr14, $vr12 - addi.d $s6, $s6, -4 - addi.d $s2, $s2, 4 - addi.d $s3, $s3, 32 - addi.d $s4, $s4, 4 - addi.d $s5, $s5, 4 - bnez $s6, .LBB10_77 + vmin.d $vr11, $vr17, $vr11 + addi.d $s2, $s2, -4 + addi.d $a1, $a1, 4 + addi.d $a2, $a2, 32 + addi.d $a3, $a3, 4 + addi.d $a4, $a4, 4 + bnez $s2, .LBB10_77 # %bb.78: # %middle.block - vmin.d $vr0, $vr9, $vr12 + vmin.d $vr0, $vr9, $vr11 vbsrl.v $vr1, $vr0, 8 vmin.d $vr0, $vr1, $vr0 vpickve2gr.d $a1, $vr0, 0 @@ -3004,8 +2918,8 @@ fill_inverse_cmap: # @fill_inverse_cmap .LBB10_79: # %.lr.ph170.i.preheader move $a2, $zero move $t2, $zero - addi.d $a3, $sp, 560 - addi.d $a4, $sp, 304 + addi.d $a3, $sp, 432 + addi.d $a4, $sp, 176 lu12i.w $s2, 524287 b .LBB10_81 .p2align 4, , 16 @@ -3024,7 +2938,7 @@ fill_inverse_cmap: # @fill_inverse_cmap b .LBB10_80 .LBB10_83: # %scalar.ph.preheader sub.d $a2, $t1, $s1 - addi.d $a3, $sp, 560 + addi.d $a3, $sp, 432 alsl.d $a3, $s1, $a3, 3 add.d $a4, $s0, $s1 add.d $fp, $fp, $s1 diff --git a/results/MultiSource/Benchmarks/MiBench/consumer-lame/CMakeFiles/consumer-lame.dir/formatBitstream.s b/results/MultiSource/Benchmarks/MiBench/consumer-lame/CMakeFiles/consumer-lame.dir/formatBitstream.s index 995da378..22d26d38 100644 --- a/results/MultiSource/Benchmarks/MiBench/consumer-lame/CMakeFiles/consumer-lame.dir/formatBitstream.s +++ b/results/MultiSource/Benchmarks/MiBench/consumer-lame/CMakeFiles/consumer-lame.dir/formatBitstream.s @@ -36,7 +36,7 @@ BF_BitstreamFrame: # @BF_BitstreamFrame ld.d $s3, $a2, %pc_lo12(side_queue_free) st.d $a1, $sp, 8 # 8-byte Folded Spill move $fp, $a0 - st.d $a0, $sp, 88 # 8-byte Folded Spill + st.d $a0, $sp, 104 # 8-byte Folded Spill beqz $s3, .LBB1_12 # %bb.1: ld.d $a1, $s3, 0 @@ -92,7 +92,7 @@ BF_BitstreamFrame: # @BF_BitstreamFrame ld.d $a2, $s3, 32 move $a1, $a2 ld.d $a2, $a2, 8 - ld.d $a3, $sp, 88 # 8-byte Folded Reload + ld.d $a3, $sp, 104 # 8-byte Folded Reload ld.d $fp, $a3, 24 st.w $zero, $a2, 0 ld.w $a2, $fp, 0 @@ -214,12 +214,12 @@ BF_BitstreamFrame: # @BF_BitstreamFrame bne $s0, $s1, .LBB1_15 # %bb.16: # %.preheader181.i st.d $s6, $sp, 64 # 8-byte Folded Spill - ld.d $s0, $sp, 88 # 8-byte Folded Reload + ld.d $s0, $sp, 104 # 8-byte Folded Reload ld.w $a3, $s0, 4 blez $a3, .LBB1_145 # %bb.17: # %.preheader180.lr.ph.i st.d $fp, $sp, 56 # 8-byte Folded Spill - st.d $s8, $sp, 96 # 8-byte Folded Spill + st.d $s8, $sp, 112 # 8-byte Folded Spill st.d $s7, $sp, 72 # 8-byte Folded Spill move $s2, $zero move $s8, $zero @@ -230,7 +230,7 @@ BF_BitstreamFrame: # @BF_BitstreamFrame # Child Loop BB1_19 Depth 2 move $fp, $a3 move $s7, $s2 - ld.d $s0, $sp, 96 # 8-byte Folded Reload + ld.d $s0, $sp, 112 # 8-byte Folded Reload ld.d $s6, $sp, 64 # 8-byte Folded Reload .p2align 4, , 16 .LBB1_19: # Parent Loop BB1_18 Depth=1 @@ -267,16 +267,16 @@ BF_BitstreamFrame: # @BF_BitstreamFrame bne $s8, $fp, .LBB1_18 # %bb.21: ld.d $a0, $sp, 80 # 8-byte Folded Reload - ld.d $s0, $sp, 88 # 8-byte Folded Reload + ld.d $s0, $sp, 104 # 8-byte Folded Reload ld.d $s3, $sp, 64 # 8-byte Folded Reload ld.d $s7, $sp, 72 # 8-byte Folded Reload - ld.d $s8, $sp, 96 # 8-byte Folded Reload + ld.d $s8, $sp, 112 # 8-byte Folded Reload ld.d $fp, $sp, 56 
# 8-byte Folded Reload b .LBB1_2 .LBB1_22: move $a0, $a1 .LBB1_23: # %BF_LoadHolderFromBitstreamPart.exit108.i - ld.d $t3, $sp, 88 # 8-byte Folded Reload + ld.d $t3, $sp, 104 # 8-byte Folded Reload ld.d $a1, $t3, 16 ld.wu $a2, $a1, 0 addi.w $a3, $a2, 0 @@ -303,7 +303,6 @@ BF_BitstreamFrame: # @BF_BitstreamFrame addi.d $a1, $a4, 36 move $a4, $a0 vori.b $vr1, $vr0, 0 - vori.b $vr2, $vr0, 0 .p2align 4, , 16 .LBB1_28: # %vector.body # =>This Inner Loop Header: Depth=1 @@ -311,27 +310,27 @@ BF_BitstreamFrame: # @BF_BitstreamFrame ld.h $a6, $a1, -24 ld.h $a7, $a1, -16 ld.h $t0, $a1, -8 - vinsgr2vr.h $vr3, $a5, 0 - vinsgr2vr.h $vr3, $a6, 1 - vinsgr2vr.h $vr3, $a7, 2 - vinsgr2vr.h $vr3, $t0, 3 + vinsgr2vr.h $vr2, $a5, 0 + vinsgr2vr.h $vr2, $a6, 1 + vinsgr2vr.h $vr2, $a7, 2 + vinsgr2vr.h $vr2, $t0, 3 ld.h $a5, $a1, 0 ld.h $a6, $a1, 8 ld.h $a7, $a1, 16 ld.h $t0, $a1, 24 - vinsgr2vr.h $vr4, $a5, 0 - vinsgr2vr.h $vr4, $a6, 1 - vinsgr2vr.h $vr4, $a7, 2 - vinsgr2vr.h $vr4, $t0, 3 - vilvl.h $vr3, $vr0, $vr3 - vilvl.h $vr4, $vr0, $vr4 + vinsgr2vr.h $vr3, $a5, 0 + vinsgr2vr.h $vr3, $a6, 1 + vinsgr2vr.h $vr3, $a7, 2 + vinsgr2vr.h $vr3, $t0, 3 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vadd.w $vr0, $vr0, $vr2 vadd.w $vr1, $vr1, $vr3 - vadd.w $vr2, $vr2, $vr4 addi.d $a4, $a4, -8 addi.d $a1, $a1, 64 bnez $a4, .LBB1_28 # %bb.29: # %middle.block - vadd.w $vr0, $vr2, $vr1 + vadd.w $vr0, $vr1, $vr0 vhaddw.d.w $vr0, $vr0, $vr0 vhaddw.q.d $vr0, $vr0, $vr0 vpickve2gr.d $a1, $vr0, 0 @@ -374,7 +373,6 @@ BF_BitstreamFrame: # @BF_BitstreamFrame addi.d $a5, $a6, 36 move $a6, $a2 vori.b $vr1, $vr0, 0 - vori.b $vr2, $vr0, 0 .p2align 4, , 16 .LBB1_37: # %vector.body150 # =>This Inner Loop Header: Depth=1 @@ -382,27 +380,27 @@ BF_BitstreamFrame: # @BF_BitstreamFrame ld.h $t0, $a5, -24 ld.h $t1, $a5, -16 ld.h $t2, $a5, -8 - vinsgr2vr.h $vr3, $a7, 0 - vinsgr2vr.h $vr3, $t0, 1 - vinsgr2vr.h $vr3, $t1, 2 - vinsgr2vr.h $vr3, $t2, 3 + vinsgr2vr.h $vr2, $a7, 0 + vinsgr2vr.h $vr2, $t0, 1 + vinsgr2vr.h $vr2, $t1, 2 + vinsgr2vr.h $vr2, $t2, 3 ld.h $a7, $a5, 0 ld.h $t0, $a5, 8 ld.h $t1, $a5, 16 ld.h $t2, $a5, 24 - vinsgr2vr.h $vr4, $a7, 0 - vinsgr2vr.h $vr4, $t0, 1 - vinsgr2vr.h $vr4, $t1, 2 - vinsgr2vr.h $vr4, $t2, 3 - vilvl.h $vr3, $vr0, $vr3 - vilvl.h $vr4, $vr0, $vr4 + vinsgr2vr.h $vr3, $a7, 0 + vinsgr2vr.h $vr3, $t0, 1 + vinsgr2vr.h $vr3, $t1, 2 + vinsgr2vr.h $vr3, $t2, 3 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vadd.w $vr0, $vr0, $vr2 vadd.w $vr1, $vr1, $vr3 - vadd.w $vr2, $vr2, $vr4 addi.d $a6, $a6, -8 addi.d $a5, $a5, 64 bnez $a6, .LBB1_37 # %bb.38: # %middle.block164 - vadd.w $vr0, $vr2, $vr1 + vadd.w $vr0, $vr1, $vr0 vhaddw.d.w $vr0, $vr0, $vr0 vhaddw.q.d $vr0, $vr0, $vr0 vpickve2gr.d $a5, $vr0, 0 @@ -427,16 +425,16 @@ BF_BitstreamFrame: # @BF_BitstreamFrame ld.d $a0, $sp, 64 # 8-byte Folded Reload addi.d $t1, $a0, 40 addi.d $t2, $t3, 32 - vrepli.b $vr4, 0 - st.d $t1, $sp, 80 # 8-byte Folded Spill - vst $vr4, $sp, 96 # 16-byte Folded Spill + vrepli.b $vr0, 0 + vst $vr0, $sp, 80 # 16-byte Folded Spill + st.d $t1, $sp, 112 # 8-byte Folded Spill b .LBB1_45 .p2align 4, , 16 .LBB1_43: # in Loop: Header=BB1_45 Depth=1 move $a3, $zero .LBB1_44: # %BF_PartLength.exit143.i # in Loop: Header=BB1_45 Depth=1 - ld.d $a0, $sp, 88 # 8-byte Folded Reload + ld.d $a0, $sp, 104 # 8-byte Folded Reload ld.w $a0, $a0, 8 addi.d $fp, $fp, 1 add.d $s6, $a3, $s6 @@ -475,7 +473,6 @@ BF_BitstreamFrame: # @BF_BitstreamFrame ld.w $a0, $a1, 0 addi.d $a2, $a0, 1 move $s2, $s3 - vld $vr4, $sp, 96 # 16-byte Folded Reload 
.LBB1_48: # %BF_addElement.exit.i125.i # in Loop: Header=BB1_49 Depth=2 alsl.d $a3, $s8, $s0, 3 @@ -542,7 +539,7 @@ BF_BitstreamFrame: # @BF_BitstreamFrame .p2align 4, , 16 .LBB1_53: # %BF_LoadHolderFromBitstreamPart.exit134.i # in Loop: Header=BB1_45 Depth=1 - ld.d $t1, $sp, 80 # 8-byte Folded Reload + ld.d $t1, $sp, 112 # 8-byte Folded Reload alsl.d $a0, $fp, $t1, 3 move $t2, $s7 alsl.d $a1, $fp, $s7, 3 @@ -570,8 +567,8 @@ BF_BitstreamFrame: # @BF_BitstreamFrame add.d $a2, $a4, $a2 addi.d $a3, $a4, 36 move $a4, $a1 - vori.b $vr0, $vr4, 0 - vori.b $vr1, $vr4, 0 + vld $vr1, $sp, 80 # 16-byte Folded Reload + vori.b $vr0, $vr1, 0 .p2align 4, , 16 .LBB1_57: # %vector.body175 # Parent Loop BB1_45 Depth=1 @@ -592,8 +589,8 @@ BF_BitstreamFrame: # @BF_BitstreamFrame vinsgr2vr.h $vr3, $a6, 1 vinsgr2vr.h $vr3, $a7, 2 vinsgr2vr.h $vr3, $t0, 3 - vilvl.h $vr2, $vr4, $vr2 - vilvl.h $vr3, $vr4, $vr3 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.wu.hu $vr3, $vr3, 0 vadd.w $vr0, $vr0, $vr2 vadd.w $vr1, $vr1, $vr3 addi.d $a4, $a4, -8 @@ -621,7 +618,7 @@ BF_BitstreamFrame: # @BF_BitstreamFrame bnez $a0, .LBB1_60 b .LBB1_44 .LBB1_61: # %.preheader179.i - ld.d $t1, $sp, 88 # 8-byte Folded Reload + ld.d $t1, $sp, 104 # 8-byte Folded Reload ld.w $a1, $t1, 4 blez $a1, .LBB1_86 # %bb.62: # %.preheader178.lr.ph.i @@ -633,8 +630,8 @@ BF_BitstreamFrame: # @BF_BitstreamFrame st.d $a2, $sp, 56 # 8-byte Folded Spill addi.d $a2, $t1, 48 st.d $a2, $sp, 48 # 8-byte Folded Spill - vrepli.b $vr4, 0 - vst $vr4, $sp, 96 # 16-byte Folded Spill + vrepli.b $vr0, 0 + vst $vr0, $sp, 80 # 16-byte Folded Spill b .LBB1_66 .p2align 4, , 16 .LBB1_64: # %._crit_edge.loopexit.i @@ -665,8 +662,8 @@ BF_BitstreamFrame: # @BF_BitstreamFrame .p2align 4, , 16 .LBB1_68: # in Loop: Header=BB1_70 Depth=2 move $a3, $zero - ld.d $t1, $sp, 88 # 8-byte Folded Reload - ld.d $s6, $sp, 80 # 8-byte Folded Reload + ld.d $t1, $sp, 104 # 8-byte Folded Reload + ld.d $s6, $sp, 112 # 8-byte Folded Reload .LBB1_69: # %BF_PartLength.exit166.i # in Loop: Header=BB1_70 Depth=2 ld.w $a0, $t1, 8 @@ -679,7 +676,7 @@ BF_BitstreamFrame: # @BF_BitstreamFrame # Child Loop BB1_77 Depth 4 # Child Loop BB1_82 Depth 3 # Child Loop BB1_85 Depth 3 - st.d $s6, $sp, 80 # 8-byte Folded Spill + st.d $s6, $sp, 112 # 8-byte Folded Spill slli.d $a0, $s0, 3 move $s1, $t2 ldx.d $s2, $t2, $a0 @@ -710,7 +707,6 @@ BF_BitstreamFrame: # @BF_BitstreamFrame ld.w $a0, $a1, 0 addi.d $a2, $a0, 1 move $s2, $s3 - vld $vr4, $sp, 96 # 16-byte Folded Reload .LBB1_73: # %BF_addElement.exit.i148.i # in Loop: Header=BB1_74 Depth=3 alsl.d $a3, $fp, $s7, 3 @@ -791,9 +787,9 @@ BF_BitstreamFrame: # @BF_BitstreamFrame # %bb.79: # %.lr.ph.preheader.i159.i # in Loop: Header=BB1_70 Depth=2 ld.d $a4, $a1, 8 - ld.d $t1, $sp, 88 # 8-byte Folded Reload + ld.d $t1, $sp, 104 # 8-byte Folded Reload ori $a1, $zero, 8 - ld.d $s6, $sp, 80 # 8-byte Folded Reload + ld.d $s6, $sp, 112 # 8-byte Folded Reload bgeu $a0, $a1, .LBB1_81 # %bb.80: # in Loop: Header=BB1_70 Depth=2 move $a3, $zero @@ -809,8 +805,8 @@ BF_BitstreamFrame: # @BF_BitstreamFrame add.d $a2, $a4, $a2 addi.d $a3, $a4, 36 move $a4, $a1 - vori.b $vr0, $vr4, 0 - vori.b $vr1, $vr4, 0 + vld $vr1, $sp, 80 # 16-byte Folded Reload + vori.b $vr0, $vr1, 0 .p2align 4, , 16 .LBB1_82: # %vector.body200 # Parent Loop BB1_66 Depth=1 @@ -832,8 +828,8 @@ BF_BitstreamFrame: # @BF_BitstreamFrame vinsgr2vr.h $vr3, $a6, 1 vinsgr2vr.h $vr3, $a7, 2 vinsgr2vr.h $vr3, $t0, 3 - vilvl.h $vr2, $vr4, $vr2 - vilvl.h $vr3, $vr4, $vr3 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.wu.hu $vr3, $vr3, 0 
vadd.w $vr0, $vr0, $vr2 vadd.w $vr1, $vr1, $vr3 addi.d $a4, $a4, -8 @@ -931,7 +927,7 @@ BF_BitstreamFrame: # @BF_BitstreamFrame move $s7, $zero .LBB1_96: # %writePartMainData.exit58.i # in Loop: Header=BB1_97 Depth=2 - ld.d $t1, $sp, 88 # 8-byte Folded Reload + ld.d $t1, $sp, 104 # 8-byte Folded Reload ld.w $a0, $t1, 8 ld.d $a1, $sp, 80 # 8-byte Folded Reload add.d $a1, $s8, $a1 @@ -1026,7 +1022,7 @@ BF_BitstreamFrame: # @BF_BitstreamFrame .LBB1_107: # %writePartMainData.exit.i # in Loop: Header=BB1_97 Depth=2 ld.d $a0, $sp, 56 # 8-byte Folded Reload - st.d $s7, $sp, 96 # 8-byte Folded Spill + st.d $s7, $sp, 112 # 8-byte Folded Spill ldx.d $s6, $a0, $s7 ld.w $a0, $s6, 0 beqz $a0, .LBB1_116 @@ -1104,7 +1100,7 @@ BF_BitstreamFrame: # @BF_BitstreamFrame .LBB1_117: # %writePartMainData.exit44.i # in Loop: Header=BB1_97 Depth=2 ld.d $a0, $sp, 48 # 8-byte Folded Reload - ld.d $a1, $sp, 96 # 8-byte Folded Reload + ld.d $a1, $sp, 112 # 8-byte Folded Reload ldx.d $a2, $a0, $a1 ld.w $a0, $a2, 0 beqz $a0, .LBB1_95 @@ -1114,7 +1110,7 @@ BF_BitstreamFrame: # @BF_BitstreamFrame move $s7, $zero move $fp, $zero addi.d $s6, $a1, 4 - st.d $a2, $sp, 96 # 8-byte Folded Spill + st.d $a2, $sp, 112 # 8-byte Folded Spill b .LBB1_120 .p2align 4, , 16 .LBB1_119: # in Loop: Header=BB1_120 Depth=3 @@ -1171,7 +1167,7 @@ BF_BitstreamFrame: # @BF_BitstreamFrame sub.d $a0, $a1, $s1 st.w $a0, $s3, %pc_lo12(BitsRemaining) ld.hu $a1, $s6, 0 - ld.d $a0, $sp, 96 # 8-byte Folded Reload + ld.d $a0, $sp, 112 # 8-byte Folded Reload ld.w $a0, $a0, 0 add.d $s7, $a1, $s7 addi.w $fp, $fp, 1 @@ -1561,7 +1557,6 @@ BF_PartLength: # @BF_PartLength addi.d $a4, $a5, 36 move $a5, $a0 vori.b $vr1, $vr0, 0 - vori.b $vr2, $vr0, 0 .p2align 4, , 16 .LBB3_5: # %vector.body # =>This Inner Loop Header: Depth=1 @@ -1569,27 +1564,27 @@ BF_PartLength: # @BF_PartLength ld.h $a7, $a4, -24 ld.h $t0, $a4, -16 ld.h $t1, $a4, -8 - vinsgr2vr.h $vr3, $a6, 0 - vinsgr2vr.h $vr3, $a7, 1 - vinsgr2vr.h $vr3, $t0, 2 - vinsgr2vr.h $vr3, $t1, 3 + vinsgr2vr.h $vr2, $a6, 0 + vinsgr2vr.h $vr2, $a7, 1 + vinsgr2vr.h $vr2, $t0, 2 + vinsgr2vr.h $vr2, $t1, 3 ld.h $a6, $a4, 0 ld.h $a7, $a4, 8 ld.h $t0, $a4, 16 ld.h $t1, $a4, 24 - vinsgr2vr.h $vr4, $a6, 0 - vinsgr2vr.h $vr4, $a7, 1 - vinsgr2vr.h $vr4, $t0, 2 - vinsgr2vr.h $vr4, $t1, 3 - vilvl.h $vr3, $vr0, $vr3 - vilvl.h $vr4, $vr0, $vr4 + vinsgr2vr.h $vr3, $a6, 0 + vinsgr2vr.h $vr3, $a7, 1 + vinsgr2vr.h $vr3, $t0, 2 + vinsgr2vr.h $vr3, $t1, 3 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vadd.w $vr0, $vr0, $vr2 vadd.w $vr1, $vr1, $vr3 - vadd.w $vr2, $vr2, $vr4 addi.d $a5, $a5, -8 addi.d $a4, $a4, 64 bnez $a5, .LBB3_5 # %bb.6: # %middle.block - vadd.w $vr0, $vr2, $vr1 + vadd.w $vr0, $vr1, $vr0 vhaddw.d.w $vr0, $vr0, $vr0 vhaddw.q.d $vr0, $vr0, $vr0 vpickve2gr.d $a4, $vr0, 0 diff --git a/results/MultiSource/Benchmarks/MiBench/consumer-lame/CMakeFiles/consumer-lame.dir/lame.s b/results/MultiSource/Benchmarks/MiBench/consumer-lame/CMakeFiles/consumer-lame.dir/lame.s index 42c44227..c65d221d 100644 --- a/results/MultiSource/Benchmarks/MiBench/consumer-lame/CMakeFiles/consumer-lame.dir/lame.s +++ b/results/MultiSource/Benchmarks/MiBench/consumer-lame/CMakeFiles/consumer-lame.dir/lame.s @@ -2265,7 +2265,7 @@ lame_encode_buffer: # @lame_encode_buffer ld.d $s5, $sp, 56 # 8-byte Folded Reload .LBB5_17: # %.loopexit # in Loop: Header=BB5_18 Depth=1 - sub.w $s1, $s1, $s8 + sub.w $s1, $s1, $s0 blez $s1, .LBB5_46 .LBB5_18: # =>This Loop Header: Depth=1 # Child Loop BB5_22 Depth 2 @@ -2280,7 +2280,7 @@ lame_encode_buffer: # 
@lame_encode_buffer # in Loop: Header=BB5_18 Depth=1 move $s4, $zero move $s3, $zero - addi.d $s0, $sp, 72 + addi.d $s8, $sp, 72 ld.d $s7, $sp, 24 # 8-byte Folded Reload b .LBB5_22 .p2align 4, , 16 @@ -2288,22 +2288,22 @@ lame_encode_buffer: # @lame_encode_buffer slt $a0, $a2, $s1 masknez $a3, $s1, $a0 maskeqz $a0, $a2, $a0 - or $s8, $a0, $a3 - slli.d $a2, $s8, 1 + or $s0, $a0, $a3 + slli.d $a2, $s0, 1 move $a0, $a1 move $a1, $s5 pcaddu18i $ra, %call36(memcpy) jirl $ra, $ra, 0 - st.w $s8, $sp, 68 - move $a0, $s8 + st.w $s0, $sp, 68 + move $a0, $s0 .LBB5_21: # in Loop: Header=BB5_22 Depth=2 - alsl.d $a1, $s8, $s5, 1 - st.d $a1, $s0, 0 + alsl.d $a1, $s0, $s5, 1 + st.d $a1, $s8, 0 ld.w $a1, $s2, 204 addi.d $s3, $s3, 1 addi.w $s4, $s4, 1 add.d $s7, $s7, $fp - addi.d $s0, $s0, 8 + addi.d $s8, $s8, 8 bge $s3, $a1, .LBB5_25 .LBB5_22: # %.lr.ph81 # Parent Loop BB5_18 Depth=1 @@ -2311,7 +2311,7 @@ lame_encode_buffer: # @lame_encode_buffer fld.s $fa0, $s2, 216 ld.w $a0, $s6, %pc_lo12(mf_size) ld.w $a2, $s2, 188 - ld.d $s5, $s0, 0 + ld.d $s5, $s8, 0 vldi $vr1, -1168 fcmp.ceq.s $fcc0, $fa0, $fa1 alsl.d $a1, $a0, $s7, 1 @@ -2324,11 +2324,11 @@ lame_encode_buffer: # @lame_encode_buffer move $a6, $s4 pcaddu18i $ra, %call36(fill_buffer_resample) jirl $ra, $ra, 0 - ld.w $s8, $sp, 68 + ld.w $s0, $sp, 68 b .LBB5_21 .p2align 4, , 16 .LBB5_24: # in Loop: Header=BB5_18 Depth=1 - move $s8, $zero + move $s0, $zero move $a0, $zero .LBB5_25: # %._crit_edge # in Loop: Header=BB5_18 Depth=1 @@ -2505,21 +2505,15 @@ lame_encode_buffer: # @lame_encode_buffer .LBB5_48: # %vector.body # =>This Inner Loop Header: Depth=1 vld $vr1, $a1, 0 - vilvl.h $vr2, $vr1, $vr1 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vld $vr3, $a2, 0 - vilvh.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 - vilvl.h $vr4, $vr3, $vr3 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvh.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vadd.w $vr1, $vr3, $vr1 - vadd.w $vr2, $vr4, $vr2 + vld $vr2, $a2, 0 + vsllwil.w.h $vr3, $vr1, 0 + vbsrl.v $vr1, $vr1, 8 + vsllwil.w.h $vr1, $vr1, 0 + vsllwil.w.h $vr4, $vr2, 0 + vbsrl.v $vr2, $vr2, 8 + vsllwil.w.h $vr2, $vr2, 0 + vadd.w $vr1, $vr2, $vr1 + vadd.w $vr2, $vr4, $vr3 vsrli.w $vr3, $vr2, 31 vadd.w $vr2, $vr2, $vr3 vsrli.w $vr2, $vr2, 1 diff --git a/results/MultiSource/Benchmarks/MiBench/consumer-lame/CMakeFiles/consumer-lame.dir/layer3.s b/results/MultiSource/Benchmarks/MiBench/consumer-lame/CMakeFiles/consumer-lame.dir/layer3.s index 82f90712..b62b206b 100644 --- a/results/MultiSource/Benchmarks/MiBench/consumer-lame/CMakeFiles/consumer-lame.dir/layer3.s +++ b/results/MultiSource/Benchmarks/MiBench/consumer-lame/CMakeFiles/consumer-lame.dir/layer3.s @@ -2204,12 +2204,8 @@ init_layer3: # @init_layer3 ld.d $t0, $a2, -82 vinsgr2vr.d $vr3, $a7, 0 vinsgr2vr.d $vr4, $t0, 0 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvl.h $vr4, $vr4, $vr4 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 + vsllwil.w.h $vr3, $vr3, 0 + vsllwil.w.h $vr4, $vr4, 0 vaddi.wu $vr3, $vr3, 7 vaddi.wu $vr4, $vr4, 7 vmuh.w $vr3, $vr3, $vr1 @@ -2232,12 +2228,8 @@ init_layer3: # @init_layer3 vst $vr4, $a0, -72 vinsgr2vr.d $vr3, $a7, 0 vinsgr2vr.d $vr4, $t0, 0 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvl.h $vr4, $vr4, $vr4 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 + vsllwil.w.h $vr3, $vr3, 0 + vsllwil.w.h $vr4, $vr4, 0 vaddi.wu $vr3, $vr3, 7 vaddi.wu $vr4, $vr4, 7 vmuh.w $vr3, $vr3, $vr1 @@ -2352,9 +2344,7 
@@ init_layer3: # @init_layer3 ld.d $t1, $a2, 0 add.d $a7, $t0, $a1 vinsgr2vr.d $vr3, $t1, 0 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 + vsllwil.w.h $vr3, $vr3, 0 vadd.w $vr3, $vr3, $vr2 vmuh.w $vr3, $vr3, $vr1 vsrai.w $vr3, $vr3, 2 @@ -2366,9 +2356,7 @@ init_layer3: # @init_layer3 vbitsel.v $vr3, $vr0, $vr4, $vr3 vstx $vr3, $t0, $a1 vinsgr2vr.d $vr3, $t1, 0 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 + vsllwil.w.h $vr3, $vr3, 0 vadd.w $vr3, $vr3, $vr2 vmuh.w $vr3, $vr3, $vr1 vsrai.w $vr3, $vr3, 2 @@ -2380,9 +2368,7 @@ init_layer3: # @init_layer3 vbitsel.v $vr3, $vr0, $vr4, $vr3 vst $vr3, $a7, 16 vinsgr2vr.d $vr3, $t0, 0 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 + vsllwil.w.h $vr3, $vr3, 0 vadd.w $vr3, $vr3, $vr2 vmuh.w $vr3, $vr3, $vr1 vsrai.w $vr3, $vr3, 2 diff --git a/results/MultiSource/Benchmarks/MiBench/consumer-lame/CMakeFiles/consumer-lame.dir/takehiro.s b/results/MultiSource/Benchmarks/MiBench/consumer-lame/CMakeFiles/consumer-lame.dir/takehiro.s index a978f41f..29a463bb 100644 --- a/results/MultiSource/Benchmarks/MiBench/consumer-lame/CMakeFiles/consumer-lame.dir/takehiro.s +++ b/results/MultiSource/Benchmarks/MiBench/consumer-lame/CMakeFiles/consumer-lame.dir/takehiro.s @@ -529,59 +529,52 @@ choose_table_short: # @choose_table_short vinsgr2vr.w $vr1, $a6, 0 addi.d $t5, $t5, 16 move $t6, $t4 - vori.b $vr2, $vr0, 0 .p2align 4, , 16 .LBB1_18: # %vector.body168 # =>This Inner Loop Header: Depth=1 - vld $vr3, $t5, -16 - vld $vr4, $t5, 0 - vshuf4i.w $vr5, $vr3, 50 - vslli.d $vr5, $vr5, 32 - vsrai.d $vr5, $vr5, 32 - vshuf4i.w $vr3, $vr3, 16 - vslli.d $vr3, $vr3, 32 - vsrai.d $vr3, $vr3, 32 - vshuf4i.w $vr6, $vr4, 50 - vslli.d $vr6, $vr6, 32 - vsrai.d $vr6, $vr6, 32 - vshuf4i.w $vr4, $vr4, 16 - vslli.d $vr4, $vr4, 32 - vsrai.d $vr4, $vr4, 32 - vpickve2gr.d $t7, $vr3, 0 - vpickve2gr.d $t8, $vr3, 1 - vpickve2gr.d $fp, $vr5, 0 - vpickve2gr.d $s0, $vr5, 1 - vpickve2gr.d $s1, $vr4, 0 - vpickve2gr.d $s2, $vr4, 1 - vpickve2gr.d $s3, $vr6, 0 - vpickve2gr.d $s4, $vr6, 1 + vld $vr2, $t5, -16 + vld $vr3, $t5, 0 + vshuf4i.w $vr4, $vr2, 14 + vsllwil.d.w $vr4, $vr4, 0 + vsllwil.d.w $vr2, $vr2, 0 + vshuf4i.w $vr5, $vr3, 14 + vsllwil.d.w $vr5, $vr5, 0 + vsllwil.d.w $vr3, $vr3, 0 + vpickve2gr.d $t7, $vr2, 0 + vpickve2gr.d $t8, $vr2, 1 + vpickve2gr.d $fp, $vr4, 0 + vpickve2gr.d $s0, $vr4, 1 + vpickve2gr.d $s1, $vr3, 0 + vpickve2gr.d $s2, $vr3, 1 + vpickve2gr.d $s3, $vr5, 0 + vpickve2gr.d $s4, $vr5, 1 ldx.b $t7, $t1, $t7 ldx.b $t8, $t1, $t8 ldx.b $fp, $t1, $fp ldx.b $s0, $t1, $s0 - vinsgr2vr.b $vr3, $t7, 0 - vinsgr2vr.b $vr3, $t8, 1 - vinsgr2vr.b $vr3, $fp, 2 - vinsgr2vr.b $vr3, $s0, 3 + vinsgr2vr.b $vr2, $t7, 0 + vinsgr2vr.b $vr2, $t8, 1 + vinsgr2vr.b $vr2, $fp, 2 + vinsgr2vr.b $vr2, $s0, 3 ldx.b $t7, $t1, $s1 ldx.b $t8, $t1, $s2 ldx.b $fp, $t1, $s3 ldx.b $s0, $t1, $s4 - vinsgr2vr.b $vr4, $t7, 0 - vinsgr2vr.b $vr4, $t8, 1 - vinsgr2vr.b $vr4, $fp, 2 - vinsgr2vr.b $vr4, $s0, 3 - vilvl.b $vr3, $vr0, $vr3 - vilvl.h $vr3, $vr0, $vr3 - vilvl.b $vr4, $vr0, $vr4 - vilvl.h $vr4, $vr0, $vr4 - vadd.w $vr1, $vr1, $vr3 - vadd.w $vr2, $vr2, $vr4 + vinsgr2vr.b $vr3, $t7, 0 + vinsgr2vr.b $vr3, $t8, 1 + vinsgr2vr.b $vr3, $fp, 2 + vinsgr2vr.b $vr3, $s0, 3 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.hu.bu $vr3, $vr3, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vadd.w $vr1, $vr1, $vr2 + vadd.w $vr0, $vr0, $vr3 addi.d $t6, $t6, -8 addi.d $t5, $t5, 32 bnez $t6, .LBB1_18 # %bb.19: # %middle.block177 - vadd.w 
$vr0, $vr2, $vr1 + vadd.w $vr0, $vr0, $vr1 vhaddw.d.w $vr0, $vr0, $vr0 vhaddw.q.d $vr0, $vr0, $vr0 vpickve2gr.d $t5, $vr0, 0 @@ -657,59 +650,52 @@ choose_table_short: # @choose_table_short vinsgr2vr.w $vr1, $a6, 0 addi.d $a6, $t2, 16 move $t2, $t1 - vori.b $vr2, $vr0, 0 .p2align 4, , 16 .LBB1_24: # %vector.body190 # =>This Inner Loop Header: Depth=1 - vld $vr3, $a6, -16 - vld $vr4, $a6, 0 - vshuf4i.w $vr5, $vr3, 50 - vslli.d $vr5, $vr5, 32 - vsrai.d $vr5, $vr5, 32 - vshuf4i.w $vr3, $vr3, 16 - vslli.d $vr3, $vr3, 32 - vsrai.d $vr3, $vr3, 32 - vshuf4i.w $vr6, $vr4, 50 - vslli.d $vr6, $vr6, 32 - vsrai.d $vr6, $vr6, 32 - vshuf4i.w $vr4, $vr4, 16 - vslli.d $vr4, $vr4, 32 - vsrai.d $vr4, $vr4, 32 - vpickve2gr.d $t3, $vr3, 0 - vpickve2gr.d $t4, $vr3, 1 - vpickve2gr.d $t5, $vr5, 0 - vpickve2gr.d $t6, $vr5, 1 - vpickve2gr.d $t7, $vr4, 0 - vpickve2gr.d $t8, $vr4, 1 - vpickve2gr.d $fp, $vr6, 0 - vpickve2gr.d $s0, $vr6, 1 + vld $vr2, $a6, -16 + vld $vr3, $a6, 0 + vshuf4i.w $vr4, $vr2, 14 + vsllwil.d.w $vr4, $vr4, 0 + vsllwil.d.w $vr2, $vr2, 0 + vshuf4i.w $vr5, $vr3, 14 + vsllwil.d.w $vr5, $vr5, 0 + vsllwil.d.w $vr3, $vr3, 0 + vpickve2gr.d $t3, $vr2, 0 + vpickve2gr.d $t4, $vr2, 1 + vpickve2gr.d $t5, $vr4, 0 + vpickve2gr.d $t6, $vr4, 1 + vpickve2gr.d $t7, $vr3, 0 + vpickve2gr.d $t8, $vr3, 1 + vpickve2gr.d $fp, $vr5, 0 + vpickve2gr.d $s0, $vr5, 1 ldx.b $t3, $a7, $t3 ldx.b $t4, $a7, $t4 ldx.b $t5, $a7, $t5 ldx.b $t6, $a7, $t6 - vinsgr2vr.b $vr3, $t3, 0 - vinsgr2vr.b $vr3, $t4, 1 - vinsgr2vr.b $vr3, $t5, 2 - vinsgr2vr.b $vr3, $t6, 3 + vinsgr2vr.b $vr2, $t3, 0 + vinsgr2vr.b $vr2, $t4, 1 + vinsgr2vr.b $vr2, $t5, 2 + vinsgr2vr.b $vr2, $t6, 3 ldx.b $t3, $a7, $t7 ldx.b $t4, $a7, $t8 ldx.b $t5, $a7, $fp ldx.b $t6, $a7, $s0 - vinsgr2vr.b $vr4, $t3, 0 - vinsgr2vr.b $vr4, $t4, 1 - vinsgr2vr.b $vr4, $t5, 2 - vinsgr2vr.b $vr4, $t6, 3 - vilvl.b $vr3, $vr0, $vr3 - vilvl.h $vr3, $vr0, $vr3 - vilvl.b $vr4, $vr0, $vr4 - vilvl.h $vr4, $vr0, $vr4 - vadd.w $vr1, $vr1, $vr3 - vadd.w $vr2, $vr2, $vr4 + vinsgr2vr.b $vr3, $t3, 0 + vinsgr2vr.b $vr3, $t4, 1 + vinsgr2vr.b $vr3, $t5, 2 + vinsgr2vr.b $vr3, $t6, 3 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.hu.bu $vr3, $vr3, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vadd.w $vr1, $vr1, $vr2 + vadd.w $vr0, $vr0, $vr3 addi.d $t2, $t2, -8 addi.d $a6, $a6, 32 bnez $t2, .LBB1_24 # %bb.25: # %middle.block199 - vadd.w $vr0, $vr2, $vr1 + vadd.w $vr0, $vr0, $vr1 vhaddw.d.w $vr0, $vr0, $vr0 vhaddw.q.d $vr0, $vr0, $vr0 vpickve2gr.d $a6, $vr0, 0 @@ -1005,59 +991,52 @@ choose_table_short: # @choose_table_short vinsgr2vr.w $vr1, $a6, 0 addi.d $a6, $t0, 16 move $t0, $a7 - vori.b $vr2, $vr0, 0 .p2align 4, , 16 .LBB1_65: # %vector.body148 # =>This Inner Loop Header: Depth=1 - vld $vr3, $a6, -16 - vld $vr4, $a6, 0 - vshuf4i.w $vr5, $vr3, 50 - vslli.d $vr5, $vr5, 32 - vsrai.d $vr5, $vr5, 32 - vshuf4i.w $vr3, $vr3, 16 - vslli.d $vr3, $vr3, 32 - vsrai.d $vr3, $vr3, 32 - vshuf4i.w $vr6, $vr4, 50 - vslli.d $vr6, $vr6, 32 - vsrai.d $vr6, $vr6, 32 - vshuf4i.w $vr4, $vr4, 16 - vslli.d $vr4, $vr4, 32 - vsrai.d $vr4, $vr4, 32 - vpickve2gr.d $t1, $vr3, 0 - vpickve2gr.d $t2, $vr3, 1 - vpickve2gr.d $t3, $vr5, 0 - vpickve2gr.d $t4, $vr5, 1 - vpickve2gr.d $t5, $vr4, 0 - vpickve2gr.d $t6, $vr4, 1 - vpickve2gr.d $t7, $vr6, 0 - vpickve2gr.d $t8, $vr6, 1 + vld $vr2, $a6, -16 + vld $vr3, $a6, 0 + vshuf4i.w $vr4, $vr2, 14 + vsllwil.d.w $vr4, $vr4, 0 + vsllwil.d.w $vr2, $vr2, 0 + vshuf4i.w $vr5, $vr3, 14 + vsllwil.d.w $vr5, $vr5, 0 + vsllwil.d.w $vr3, $vr3, 0 + vpickve2gr.d $t1, $vr2, 0 + vpickve2gr.d $t2, 
$vr2, 1 + vpickve2gr.d $t3, $vr4, 0 + vpickve2gr.d $t4, $vr4, 1 + vpickve2gr.d $t5, $vr3, 0 + vpickve2gr.d $t6, $vr3, 1 + vpickve2gr.d $t7, $vr5, 0 + vpickve2gr.d $t8, $vr5, 1 ldx.b $t1, $a5, $t1 ldx.b $t2, $a5, $t2 ldx.b $t3, $a5, $t3 ldx.b $t4, $a5, $t4 - vinsgr2vr.b $vr3, $t1, 0 - vinsgr2vr.b $vr3, $t2, 1 - vinsgr2vr.b $vr3, $t3, 2 - vinsgr2vr.b $vr3, $t4, 3 + vinsgr2vr.b $vr2, $t1, 0 + vinsgr2vr.b $vr2, $t2, 1 + vinsgr2vr.b $vr2, $t3, 2 + vinsgr2vr.b $vr2, $t4, 3 ldx.b $t1, $a5, $t5 ldx.b $t2, $a5, $t6 ldx.b $t3, $a5, $t7 ldx.b $t4, $a5, $t8 - vinsgr2vr.b $vr4, $t1, 0 - vinsgr2vr.b $vr4, $t2, 1 - vinsgr2vr.b $vr4, $t3, 2 - vinsgr2vr.b $vr4, $t4, 3 - vilvl.b $vr3, $vr0, $vr3 - vilvl.h $vr3, $vr0, $vr3 - vilvl.b $vr4, $vr0, $vr4 - vilvl.h $vr4, $vr0, $vr4 - vadd.w $vr1, $vr1, $vr3 - vadd.w $vr2, $vr2, $vr4 + vinsgr2vr.b $vr3, $t1, 0 + vinsgr2vr.b $vr3, $t2, 1 + vinsgr2vr.b $vr3, $t3, 2 + vinsgr2vr.b $vr3, $t4, 3 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.hu.bu $vr3, $vr3, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vadd.w $vr1, $vr1, $vr2 + vadd.w $vr0, $vr0, $vr3 addi.d $t0, $t0, -8 addi.d $a6, $a6, 32 bnez $t0, .LBB1_65 # %bb.66: # %middle.block156 - vadd.w $vr0, $vr2, $vr1 + vadd.w $vr0, $vr0, $vr1 vhaddw.d.w $vr0, $vr0, $vr0 vhaddw.q.d $vr0, $vr0, $vr0 vpickve2gr.d $a6, $vr0, 0 @@ -1951,59 +1930,52 @@ choose_table: # @choose_table vinsgr2vr.w $vr1, $a5, 0 addi.d $t4, $t4, 16 move $t5, $t3 - vori.b $vr2, $vr0, 0 .p2align 4, , 16 .LBB3_18: # %vector.body207 # =>This Inner Loop Header: Depth=1 - vld $vr3, $t4, -16 - vld $vr4, $t4, 0 - vshuf4i.w $vr5, $vr3, 50 - vslli.d $vr5, $vr5, 32 - vsrai.d $vr5, $vr5, 32 - vshuf4i.w $vr3, $vr3, 16 - vslli.d $vr3, $vr3, 32 - vsrai.d $vr3, $vr3, 32 - vshuf4i.w $vr6, $vr4, 50 - vslli.d $vr6, $vr6, 32 - vsrai.d $vr6, $vr6, 32 - vshuf4i.w $vr4, $vr4, 16 - vslli.d $vr4, $vr4, 32 - vsrai.d $vr4, $vr4, 32 - vpickve2gr.d $t6, $vr3, 0 - vpickve2gr.d $t7, $vr3, 1 - vpickve2gr.d $t8, $vr5, 0 - vpickve2gr.d $fp, $vr5, 1 - vpickve2gr.d $s0, $vr4, 0 - vpickve2gr.d $s1, $vr4, 1 - vpickve2gr.d $s2, $vr6, 0 - vpickve2gr.d $s3, $vr6, 1 + vld $vr2, $t4, -16 + vld $vr3, $t4, 0 + vshuf4i.w $vr4, $vr2, 14 + vsllwil.d.w $vr4, $vr4, 0 + vsllwil.d.w $vr2, $vr2, 0 + vshuf4i.w $vr5, $vr3, 14 + vsllwil.d.w $vr5, $vr5, 0 + vsllwil.d.w $vr3, $vr3, 0 + vpickve2gr.d $t6, $vr2, 0 + vpickve2gr.d $t7, $vr2, 1 + vpickve2gr.d $t8, $vr4, 0 + vpickve2gr.d $fp, $vr4, 1 + vpickve2gr.d $s0, $vr3, 0 + vpickve2gr.d $s1, $vr3, 1 + vpickve2gr.d $s2, $vr5, 0 + vpickve2gr.d $s3, $vr5, 1 ldx.b $t6, $t0, $t6 ldx.b $t7, $t0, $t7 ldx.b $t8, $t0, $t8 ldx.b $fp, $t0, $fp - vinsgr2vr.b $vr3, $t6, 0 - vinsgr2vr.b $vr3, $t7, 1 - vinsgr2vr.b $vr3, $t8, 2 - vinsgr2vr.b $vr3, $fp, 3 + vinsgr2vr.b $vr2, $t6, 0 + vinsgr2vr.b $vr2, $t7, 1 + vinsgr2vr.b $vr2, $t8, 2 + vinsgr2vr.b $vr2, $fp, 3 ldx.b $t6, $t0, $s0 ldx.b $t7, $t0, $s1 ldx.b $t8, $t0, $s2 ldx.b $fp, $t0, $s3 - vinsgr2vr.b $vr4, $t6, 0 - vinsgr2vr.b $vr4, $t7, 1 - vinsgr2vr.b $vr4, $t8, 2 - vinsgr2vr.b $vr4, $fp, 3 - vilvl.b $vr3, $vr0, $vr3 - vilvl.h $vr3, $vr0, $vr3 - vilvl.b $vr4, $vr0, $vr4 - vilvl.h $vr4, $vr0, $vr4 - vadd.w $vr1, $vr1, $vr3 - vadd.w $vr2, $vr2, $vr4 + vinsgr2vr.b $vr3, $t6, 0 + vinsgr2vr.b $vr3, $t7, 1 + vinsgr2vr.b $vr3, $t8, 2 + vinsgr2vr.b $vr3, $fp, 3 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.hu.bu $vr3, $vr3, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vadd.w $vr1, $vr1, $vr2 + vadd.w $vr0, $vr0, $vr3 addi.d $t5, $t5, -8 addi.d $t4, $t4, 32 bnez $t5, .LBB3_18 # %bb.19: # %middle.block216 - 
vadd.w $vr0, $vr2, $vr1 + vadd.w $vr0, $vr0, $vr1 vhaddw.d.w $vr0, $vr0, $vr0 vhaddw.q.d $vr0, $vr0, $vr0 vpickve2gr.d $t4, $vr0, 0 @@ -2057,59 +2029,52 @@ choose_table: # @choose_table vinsgr2vr.w $vr1, $a5, 0 addi.d $a5, $t2, 16 move $t2, $t1 - vori.b $vr2, $vr0, 0 .p2align 4, , 16 .LBB3_25: # %vector.body227 # =>This Inner Loop Header: Depth=1 - vld $vr3, $a5, -16 - vld $vr4, $a5, 0 - vshuf4i.w $vr5, $vr3, 50 - vslli.d $vr5, $vr5, 32 - vsrai.d $vr5, $vr5, 32 - vshuf4i.w $vr3, $vr3, 16 - vslli.d $vr3, $vr3, 32 - vsrai.d $vr3, $vr3, 32 - vshuf4i.w $vr6, $vr4, 50 - vslli.d $vr6, $vr6, 32 - vsrai.d $vr6, $vr6, 32 - vshuf4i.w $vr4, $vr4, 16 - vslli.d $vr4, $vr4, 32 - vsrai.d $vr4, $vr4, 32 - vpickve2gr.d $t3, $vr3, 0 - vpickve2gr.d $t4, $vr3, 1 - vpickve2gr.d $t5, $vr5, 0 - vpickve2gr.d $t6, $vr5, 1 - vpickve2gr.d $t7, $vr4, 0 - vpickve2gr.d $t8, $vr4, 1 - vpickve2gr.d $fp, $vr6, 0 - vpickve2gr.d $s0, $vr6, 1 + vld $vr2, $a5, -16 + vld $vr3, $a5, 0 + vshuf4i.w $vr4, $vr2, 14 + vsllwil.d.w $vr4, $vr4, 0 + vsllwil.d.w $vr2, $vr2, 0 + vshuf4i.w $vr5, $vr3, 14 + vsllwil.d.w $vr5, $vr5, 0 + vsllwil.d.w $vr3, $vr3, 0 + vpickve2gr.d $t3, $vr2, 0 + vpickve2gr.d $t4, $vr2, 1 + vpickve2gr.d $t5, $vr4, 0 + vpickve2gr.d $t6, $vr4, 1 + vpickve2gr.d $t7, $vr3, 0 + vpickve2gr.d $t8, $vr3, 1 + vpickve2gr.d $fp, $vr5, 0 + vpickve2gr.d $s0, $vr5, 1 ldx.b $t3, $a6, $t3 ldx.b $t4, $a6, $t4 ldx.b $t5, $a6, $t5 ldx.b $t6, $a6, $t6 - vinsgr2vr.b $vr3, $t3, 0 - vinsgr2vr.b $vr3, $t4, 1 - vinsgr2vr.b $vr3, $t5, 2 - vinsgr2vr.b $vr3, $t6, 3 + vinsgr2vr.b $vr2, $t3, 0 + vinsgr2vr.b $vr2, $t4, 1 + vinsgr2vr.b $vr2, $t5, 2 + vinsgr2vr.b $vr2, $t6, 3 ldx.b $t3, $a6, $t7 ldx.b $t4, $a6, $t8 ldx.b $t5, $a6, $fp ldx.b $t6, $a6, $s0 - vinsgr2vr.b $vr4, $t3, 0 - vinsgr2vr.b $vr4, $t4, 1 - vinsgr2vr.b $vr4, $t5, 2 - vinsgr2vr.b $vr4, $t6, 3 - vilvl.b $vr3, $vr0, $vr3 - vilvl.h $vr3, $vr0, $vr3 - vilvl.b $vr4, $vr0, $vr4 - vilvl.h $vr4, $vr0, $vr4 - vadd.w $vr1, $vr1, $vr3 - vadd.w $vr2, $vr2, $vr4 + vinsgr2vr.b $vr3, $t3, 0 + vinsgr2vr.b $vr3, $t4, 1 + vinsgr2vr.b $vr3, $t5, 2 + vinsgr2vr.b $vr3, $t6, 3 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.hu.bu $vr3, $vr3, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vadd.w $vr1, $vr1, $vr2 + vadd.w $vr0, $vr0, $vr3 addi.d $t2, $t2, -8 addi.d $a5, $a5, 32 bnez $t2, .LBB3_25 # %bb.26: # %middle.block236 - vadd.w $vr0, $vr2, $vr1 + vadd.w $vr0, $vr0, $vr1 vhaddw.d.w $vr0, $vr0, $vr0 vhaddw.q.d $vr0, $vr0, $vr0 vpickve2gr.d $a5, $vr0, 0 @@ -2223,16 +2188,15 @@ choose_table: # @choose_table vreplgr2vr.w $vr0, $a6 vreplgr2vr.w $vr1, $a7 addi.d $a0, $a0, 32 - vrepli.b $vr2, 0 - vrepli.w $vr3, 14 - vrepli.w $vr4, 15 + vrepli.b $vr8, 0 + vrepli.w $vr2, 14 + vrepli.w $vr3, 15 move $t4, $t3 - vori.b $vr9, $vr2, 0 - vori.b $vr10, $vr2, 0 - vori.b $vr7, $vr2, 0 - vori.b $vr8, $vr2, 0 - vori.b $vr6, $vr2, 0 - vori.b $vr5, $vr2, 0 + vori.b $vr9, $vr8, 0 + vori.b $vr6, $vr8, 0 + vori.b $vr7, $vr8, 0 + vori.b $vr5, $vr8, 0 + vori.b $vr4, $vr8, 0 .p2align 4, , 16 .LBB3_49: # %vector.body141 # =>This Inner Loop Header: Depth=1 @@ -2240,149 +2204,143 @@ choose_table: # @choose_table ld.w $t6, $a0, -24 ld.w $t7, $a0, -16 ld.w $t8, $a0, -8 - vinsgr2vr.w $vr11, $t5, 0 - vinsgr2vr.w $vr11, $t6, 1 - vinsgr2vr.w $vr11, $t7, 2 - vinsgr2vr.w $vr11, $t8, 3 + vinsgr2vr.w $vr10, $t5, 0 + vinsgr2vr.w $vr10, $t6, 1 + vinsgr2vr.w $vr10, $t7, 2 + vinsgr2vr.w $vr10, $t8, 3 ld.w $t5, $a0, 0 ld.w $t6, $a0, 8 ld.w $t7, $a0, 16 ld.w $t8, $a0, 24 - vinsgr2vr.w $vr12, $t5, 0 - vinsgr2vr.w $vr12, $t6, 1 - 
vinsgr2vr.w $vr12, $t7, 2 - vinsgr2vr.w $vr12, $t8, 3 + vinsgr2vr.w $vr11, $t5, 0 + vinsgr2vr.w $vr11, $t6, 1 + vinsgr2vr.w $vr11, $t7, 2 + vinsgr2vr.w $vr11, $t8, 3 ld.w $t5, $a0, -28 ld.w $t6, $a0, -20 ld.w $t7, $a0, -12 ld.w $t8, $a0, -4 - vinsgr2vr.w $vr13, $t5, 0 - vinsgr2vr.w $vr13, $t6, 1 - vinsgr2vr.w $vr13, $t7, 2 - vinsgr2vr.w $vr13, $t8, 3 + vinsgr2vr.w $vr12, $t5, 0 + vinsgr2vr.w $vr12, $t6, 1 + vinsgr2vr.w $vr12, $t7, 2 + vinsgr2vr.w $vr12, $t8, 3 ld.w $t5, $a0, 4 ld.w $t6, $a0, 12 ld.w $t7, $a0, 20 ld.w $t8, $a0, 28 - vinsgr2vr.w $vr14, $t5, 0 - vinsgr2vr.w $vr14, $t6, 1 - vinsgr2vr.w $vr14, $t7, 2 - vinsgr2vr.w $vr14, $t8, 3 - vslt.w $vr15, $vr3, $vr11 - vslt.w $vr16, $vr3, $vr12 + vinsgr2vr.w $vr13, $t5, 0 + vinsgr2vr.w $vr13, $t6, 1 + vinsgr2vr.w $vr13, $t7, 2 + vinsgr2vr.w $vr13, $t8, 3 + vslt.w $vr14, $vr2, $vr10 + vslt.w $vr15, $vr2, $vr11 + vmini.w $vr16, $vr10, 15 vmini.w $vr17, $vr11, 15 - vmini.w $vr18, $vr12, 15 + vslli.w $vr16, $vr16, 4 vslli.w $vr17, $vr17, 4 - vslli.w $vr18, $vr18, 4 - vseqi.w $vr11, $vr11, 0 - vadd.w $vr6, $vr6, $vr11 - vseqi.w $vr11, $vr13, 0 - vadd.w $vr6, $vr6, $vr11 - vseqi.w $vr11, $vr12, 0 - vadd.w $vr5, $vr5, $vr11 - vseqi.w $vr11, $vr14, 0 - vadd.w $vr5, $vr5, $vr11 - vand.v $vr11, $vr15, $vr0 - vadd.w $vr7, $vr7, $vr11 - vand.v $vr11, $vr16, $vr0 - vadd.w $vr8, $vr8, $vr11 - vand.v $vr11, $vr15, $vr1 - vadd.w $vr9, $vr9, $vr11 - vand.v $vr11, $vr16, $vr1 - vadd.w $vr10, $vr10, $vr11 - vslt.w $vr11, $vr3, $vr13 - vslt.w $vr12, $vr3, $vr14 - vbitsel.v $vr13, $vr13, $vr4, $vr11 - vbitsel.v $vr14, $vr14, $vr4, $vr12 + vseqi.w $vr10, $vr10, 0 + vadd.w $vr5, $vr5, $vr10 + vseqi.w $vr10, $vr12, 0 + vadd.w $vr5, $vr5, $vr10 + vseqi.w $vr10, $vr11, 0 + vadd.w $vr4, $vr4, $vr10 + vseqi.w $vr10, $vr13, 0 + vadd.w $vr4, $vr4, $vr10 + vand.v $vr10, $vr14, $vr0 + vadd.w $vr6, $vr6, $vr10 + vand.v $vr10, $vr15, $vr0 + vadd.w $vr7, $vr7, $vr10 + vand.v $vr10, $vr14, $vr1 + vadd.w $vr8, $vr8, $vr10 + vand.v $vr10, $vr15, $vr1 + vadd.w $vr9, $vr9, $vr10 + vslt.w $vr10, $vr2, $vr12 + vslt.w $vr11, $vr2, $vr13 + vbitsel.v $vr12, $vr12, $vr3, $vr10 + vbitsel.v $vr13, $vr13, $vr3, $vr11 + vadd.w $vr12, $vr16, $vr12 vadd.w $vr13, $vr17, $vr13 - vadd.w $vr14, $vr18, $vr14 - vaddi.wu $vr6, $vr6, 2 vaddi.wu $vr5, $vr5, 2 - vand.v $vr15, $vr11, $vr0 - vadd.w $vr7, $vr7, $vr15 - vand.v $vr15, $vr12, $vr0 - vadd.w $vr8, $vr8, $vr15 - vand.v $vr11, $vr11, $vr1 - vadd.w $vr9, $vr9, $vr11 - vand.v $vr11, $vr12, $vr1 - vadd.w $vr10, $vr10, $vr11 - vshuf4i.w $vr11, $vr13, 50 - vslli.d $vr11, $vr11, 32 - vsrai.d $vr11, $vr11, 32 - vshuf4i.w $vr12, $vr13, 16 - vslli.d $vr12, $vr12, 32 - vsrai.d $vr12, $vr12, 32 - vshuf4i.w $vr13, $vr14, 50 - vslli.d $vr13, $vr13, 32 - vsrai.d $vr13, $vr13, 32 - vshuf4i.w $vr14, $vr14, 16 - vslli.d $vr14, $vr14, 32 - vsrai.d $vr14, $vr14, 32 - vpickve2gr.d $t5, $vr12, 0 - vpickve2gr.d $t6, $vr12, 1 - vpickve2gr.d $t7, $vr11, 0 - vpickve2gr.d $t8, $vr11, 1 - vpickve2gr.d $fp, $vr14, 0 - vpickve2gr.d $s0, $vr14, 1 - vpickve2gr.d $s1, $vr13, 0 - vpickve2gr.d $s2, $vr13, 1 + vaddi.wu $vr4, $vr4, 2 + vand.v $vr14, $vr10, $vr0 + vadd.w $vr6, $vr6, $vr14 + vand.v $vr14, $vr11, $vr0 + vadd.w $vr7, $vr7, $vr14 + vand.v $vr10, $vr10, $vr1 + vadd.w $vr8, $vr8, $vr10 + vand.v $vr10, $vr11, $vr1 + vadd.w $vr9, $vr9, $vr10 + vshuf4i.w $vr10, $vr12, 14 + vsllwil.d.w $vr10, $vr10, 0 + vsllwil.d.w $vr11, $vr12, 0 + vshuf4i.w $vr12, $vr13, 14 + vsllwil.d.w $vr12, $vr12, 0 + vsllwil.d.w $vr13, $vr13, 0 + vpickve2gr.d $t5, $vr11, 0 + vpickve2gr.d $t6, $vr11, 
1 + vpickve2gr.d $t7, $vr10, 0 + vpickve2gr.d $t8, $vr10, 1 + vpickve2gr.d $fp, $vr13, 0 + vpickve2gr.d $s0, $vr13, 1 + vpickve2gr.d $s1, $vr12, 0 + vpickve2gr.d $s2, $vr12, 1 ldx.b $s3, $t0, $t5 ldx.b $s4, $t0, $t6 ldx.b $s5, $t0, $t7 ldx.b $s6, $t0, $t8 - vinsgr2vr.b $vr11, $s3, 0 - vinsgr2vr.b $vr11, $s4, 1 - vinsgr2vr.b $vr11, $s5, 2 - vinsgr2vr.b $vr11, $s6, 3 + vinsgr2vr.b $vr10, $s3, 0 + vinsgr2vr.b $vr10, $s4, 1 + vinsgr2vr.b $vr10, $s5, 2 + vinsgr2vr.b $vr10, $s6, 3 ldx.b $s3, $t0, $fp ldx.b $s4, $t0, $s0 ldx.b $s5, $t0, $s1 ldx.b $s6, $t0, $s2 - vinsgr2vr.b $vr12, $s3, 0 - vinsgr2vr.b $vr12, $s4, 1 - vinsgr2vr.b $vr12, $s5, 2 - vinsgr2vr.b $vr12, $s6, 3 - vilvl.b $vr11, $vr2, $vr11 - vilvl.h $vr11, $vr2, $vr11 - vilvl.b $vr12, $vr2, $vr12 - vilvl.h $vr12, $vr2, $vr12 + vinsgr2vr.b $vr11, $s3, 0 + vinsgr2vr.b $vr11, $s4, 1 + vinsgr2vr.b $vr11, $s5, 2 + vinsgr2vr.b $vr11, $s6, 3 + vsllwil.hu.bu $vr10, $vr10, 0 + vsllwil.wu.hu $vr10, $vr10, 0 + vsllwil.hu.bu $vr11, $vr11, 0 + vsllwil.wu.hu $vr11, $vr11, 0 + vadd.w $vr6, $vr6, $vr10 vadd.w $vr7, $vr7, $vr11 - vadd.w $vr8, $vr8, $vr12 ldx.b $t5, $t1, $t5 ldx.b $t6, $t1, $t6 ldx.b $t7, $t1, $t7 ldx.b $t8, $t1, $t8 - vinsgr2vr.b $vr11, $t5, 0 - vinsgr2vr.b $vr11, $t6, 1 - vinsgr2vr.b $vr11, $t7, 2 - vinsgr2vr.b $vr11, $t8, 3 + vinsgr2vr.b $vr10, $t5, 0 + vinsgr2vr.b $vr10, $t6, 1 + vinsgr2vr.b $vr10, $t7, 2 + vinsgr2vr.b $vr10, $t8, 3 ldx.b $t5, $t1, $fp ldx.b $t6, $t1, $s0 ldx.b $t7, $t1, $s1 ldx.b $t8, $t1, $s2 - vinsgr2vr.b $vr12, $t5, 0 - vinsgr2vr.b $vr12, $t6, 1 - vinsgr2vr.b $vr12, $t7, 2 - vinsgr2vr.b $vr12, $t8, 3 - vilvl.b $vr11, $vr2, $vr11 - vilvl.h $vr11, $vr2, $vr11 - vilvl.b $vr12, $vr2, $vr12 - vilvl.h $vr12, $vr2, $vr12 + vinsgr2vr.b $vr11, $t5, 0 + vinsgr2vr.b $vr11, $t6, 1 + vinsgr2vr.b $vr11, $t7, 2 + vinsgr2vr.b $vr11, $t8, 3 + vsllwil.hu.bu $vr10, $vr10, 0 + vsllwil.wu.hu $vr10, $vr10, 0 + vsllwil.hu.bu $vr11, $vr11, 0 + vsllwil.wu.hu $vr11, $vr11, 0 + vadd.w $vr8, $vr8, $vr10 vadd.w $vr9, $vr9, $vr11 - vadd.w $vr10, $vr10, $vr12 addi.d $t4, $t4, -8 addi.d $a0, $a0, 64 bnez $t4, .LBB3_49 # %bb.50: # %middle.block174 - vadd.w $vr0, $vr10, $vr9 + vadd.w $vr0, $vr9, $vr8 vhaddw.d.w $vr0, $vr0, $vr0 vhaddw.q.d $vr0, $vr0, $vr0 vpickve2gr.d $t5, $vr0, 0 - vadd.w $vr0, $vr8, $vr7 + vadd.w $vr0, $vr7, $vr6 vhaddw.d.w $vr0, $vr0, $vr0 vhaddw.q.d $vr0, $vr0, $vr0 vpickve2gr.d $t6, $vr0, 0 - vadd.w $vr0, $vr5, $vr6 + vadd.w $vr0, $vr4, $vr5 vhaddw.d.w $vr0, $vr0, $vr0 vhaddw.q.d $vr0, $vr0, $vr0 vpickve2gr.d $t4, $vr0, 0 @@ -2490,59 +2448,52 @@ choose_table: # @choose_table vinsgr2vr.w $vr1, $a5, 0 addi.d $a5, $t0, 16 move $t0, $a7 - vori.b $vr2, $vr0, 0 .p2align 4, , 16 .LBB3_63: # %vector.body188 # =>This Inner Loop Header: Depth=1 - vld $vr3, $a5, -16 - vld $vr4, $a5, 0 - vshuf4i.w $vr5, $vr3, 50 - vslli.d $vr5, $vr5, 32 - vsrai.d $vr5, $vr5, 32 - vshuf4i.w $vr3, $vr3, 16 - vslli.d $vr3, $vr3, 32 - vsrai.d $vr3, $vr3, 32 - vshuf4i.w $vr6, $vr4, 50 - vslli.d $vr6, $vr6, 32 - vsrai.d $vr6, $vr6, 32 - vshuf4i.w $vr4, $vr4, 16 - vslli.d $vr4, $vr4, 32 - vsrai.d $vr4, $vr4, 32 - vpickve2gr.d $t1, $vr3, 0 - vpickve2gr.d $t2, $vr3, 1 - vpickve2gr.d $t3, $vr5, 0 - vpickve2gr.d $t4, $vr5, 1 - vpickve2gr.d $t5, $vr4, 0 - vpickve2gr.d $t6, $vr4, 1 - vpickve2gr.d $t7, $vr6, 0 - vpickve2gr.d $t8, $vr6, 1 + vld $vr2, $a5, -16 + vld $vr3, $a5, 0 + vshuf4i.w $vr4, $vr2, 14 + vsllwil.d.w $vr4, $vr4, 0 + vsllwil.d.w $vr2, $vr2, 0 + vshuf4i.w $vr5, $vr3, 14 + vsllwil.d.w $vr5, $vr5, 0 + vsllwil.d.w $vr3, $vr3, 0 + vpickve2gr.d $t1, $vr2, 0 
+ vpickve2gr.d $t2, $vr2, 1 + vpickve2gr.d $t3, $vr4, 0 + vpickve2gr.d $t4, $vr4, 1 + vpickve2gr.d $t5, $vr3, 0 + vpickve2gr.d $t6, $vr3, 1 + vpickve2gr.d $t7, $vr5, 0 + vpickve2gr.d $t8, $vr5, 1 ldx.b $t1, $a3, $t1 ldx.b $t2, $a3, $t2 ldx.b $t3, $a3, $t3 ldx.b $t4, $a3, $t4 - vinsgr2vr.b $vr3, $t1, 0 - vinsgr2vr.b $vr3, $t2, 1 - vinsgr2vr.b $vr3, $t3, 2 - vinsgr2vr.b $vr3, $t4, 3 + vinsgr2vr.b $vr2, $t1, 0 + vinsgr2vr.b $vr2, $t2, 1 + vinsgr2vr.b $vr2, $t3, 2 + vinsgr2vr.b $vr2, $t4, 3 ldx.b $t1, $a3, $t5 ldx.b $t2, $a3, $t6 ldx.b $t3, $a3, $t7 ldx.b $t4, $a3, $t8 - vinsgr2vr.b $vr4, $t1, 0 - vinsgr2vr.b $vr4, $t2, 1 - vinsgr2vr.b $vr4, $t3, 2 - vinsgr2vr.b $vr4, $t4, 3 - vilvl.b $vr3, $vr0, $vr3 - vilvl.h $vr3, $vr0, $vr3 - vilvl.b $vr4, $vr0, $vr4 - vilvl.h $vr4, $vr0, $vr4 - vadd.w $vr1, $vr1, $vr3 - vadd.w $vr2, $vr2, $vr4 + vinsgr2vr.b $vr3, $t1, 0 + vinsgr2vr.b $vr3, $t2, 1 + vinsgr2vr.b $vr3, $t3, 2 + vinsgr2vr.b $vr3, $t4, 3 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.hu.bu $vr3, $vr3, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vadd.w $vr1, $vr1, $vr2 + vadd.w $vr0, $vr0, $vr3 addi.d $t0, $t0, -8 addi.d $a5, $a5, 32 bnez $t0, .LBB3_63 # %bb.64: # %middle.block196 - vadd.w $vr0, $vr2, $vr1 + vadd.w $vr0, $vr0, $vr1 vhaddw.d.w $vr0, $vr0, $vr0 vhaddw.q.d $vr0, $vr0, $vr0 vpickve2gr.d $a5, $vr0, 0 diff --git a/results/MultiSource/Benchmarks/MiBench/consumer-typeset/CMakeFiles/consumer-typeset.dir/z29.s b/results/MultiSource/Benchmarks/MiBench/consumer-typeset/CMakeFiles/consumer-typeset.dir/z29.s index bd4c2c44..92c2a814 100644 --- a/results/MultiSource/Benchmarks/MiBench/consumer-typeset/CMakeFiles/consumer-typeset.dir/z29.s +++ b/results/MultiSource/Benchmarks/MiBench/consumer-typeset/CMakeFiles/consumer-typeset.dir/z29.s @@ -1562,30 +1562,29 @@ InsertSym: # @InsertSym slli.d $a4, $a2, 3 sub.d $a0, $a0, $a4 alsl.d $a2, $a2, $s0, 3 - vld $vr4, $sp, 16 # 16-byte Folded Reload - vori.b $vr0, $vr4, 0 + vld $vr3, $sp, 16 # 16-byte Folded Reload + vori.b $vr0, $vr3, 0 vinsgr2vr.w $vr0, $a1, 0 addi.d $a1, $s0, 5 move $a5, $a4 - vori.b $vr1, $vr4, 0 .p2align 4, , 16 .LBB15_60: # %vector.body # =>This Inner Loop Header: Depth=1 ld.w $a6, $a1, -4 ld.w $a7, $a1, 0 - vinsgr2vr.w $vr2, $a6, 0 - vinsgr2vr.w $vr3, $a7, 0 - vilvl.b $vr2, $vr4, $vr2 - vilvl.h $vr2, $vr4, $vr2 - vilvl.b $vr3, $vr4, $vr3 - vilvl.h $vr3, $vr4, $vr3 - vadd.w $vr0, $vr0, $vr2 - vadd.w $vr1, $vr1, $vr3 + vinsgr2vr.w $vr1, $a6, 0 + vinsgr2vr.w $vr2, $a7, 0 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vadd.w $vr0, $vr0, $vr1 + vadd.w $vr3, $vr3, $vr2 addi.d $a5, $a5, -8 addi.d $a1, $a1, 8 bnez $a5, .LBB15_60 # %bb.61: # %middle.block - vadd.w $vr0, $vr1, $vr0 + vadd.w $vr0, $vr3, $vr0 vhaddw.d.w $vr0, $vr0, $vr0 vhaddw.q.d $vr0, $vr0, $vr0 vpickve2gr.d $a1, $vr0, 0 @@ -2099,25 +2098,24 @@ InsertAlternativeName: # @InsertAlternativeName vinsgr2vr.w $vr1, $a1, 0 addi.d $a1, $s0, 5 move $a5, $a4 - vori.b $vr2, $vr0, 0 .p2align 4, , 16 .LBB16_4: # %vector.body # =>This Inner Loop Header: Depth=1 ld.w $a6, $a1, -4 ld.w $a7, $a1, 0 - vinsgr2vr.w $vr3, $a6, 0 - vinsgr2vr.w $vr4, $a7, 0 - vilvl.b $vr3, $vr0, $vr3 - vilvl.h $vr3, $vr0, $vr3 - vilvl.b $vr4, $vr0, $vr4 - vilvl.h $vr4, $vr0, $vr4 - vadd.w $vr1, $vr1, $vr3 - vadd.w $vr2, $vr2, $vr4 + vinsgr2vr.w $vr2, $a6, 0 + vinsgr2vr.w $vr3, $a7, 0 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.hu.bu $vr3, $vr3, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vadd.w $vr1, $vr1, 
$vr2 + vadd.w $vr0, $vr0, $vr3 addi.d $a5, $a5, -8 addi.d $a1, $a1, 8 bnez $a5, .LBB16_4 # %bb.5: # %middle.block - vadd.w $vr0, $vr2, $vr1 + vadd.w $vr0, $vr0, $vr1 vhaddw.d.w $vr0, $vr0, $vr0 vhaddw.q.d $vr0, $vr0, $vr0 vpickve2gr.d $a1, $vr0, 0 @@ -2470,25 +2468,24 @@ SearchSym: # @SearchSym vinsgr2vr.w $vr1, $a2, 0 addi.d $a2, $t0, 5 move $a5, $a4 - vori.b $vr2, $vr0, 0 .p2align 4, , 16 .LBB17_4: # %vector.body # =>This Inner Loop Header: Depth=1 ld.w $a6, $a2, -4 ld.w $a7, $a2, 0 - vinsgr2vr.w $vr3, $a6, 0 - vinsgr2vr.w $vr4, $a7, 0 - vilvl.b $vr3, $vr0, $vr3 - vilvl.h $vr3, $vr0, $vr3 - vilvl.b $vr4, $vr0, $vr4 - vilvl.h $vr4, $vr0, $vr4 - vadd.w $vr1, $vr1, $vr3 - vadd.w $vr2, $vr2, $vr4 + vinsgr2vr.w $vr2, $a6, 0 + vinsgr2vr.w $vr3, $a7, 0 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.hu.bu $vr3, $vr3, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vadd.w $vr1, $vr1, $vr2 + vadd.w $vr0, $vr0, $vr3 addi.d $a5, $a5, -8 addi.d $a2, $a2, 8 bnez $a5, .LBB17_4 # %bb.5: # %middle.block - vadd.w $vr0, $vr2, $vr1 + vadd.w $vr0, $vr0, $vr1 vhaddw.d.w $vr0, $vr0, $vr0 vhaddw.q.d $vr0, $vr0, $vr0 vpickve2gr.d $a2, $vr0, 0 diff --git a/results/MultiSource/Benchmarks/MiBench/consumer-typeset/CMakeFiles/consumer-typeset.dir/z37.s b/results/MultiSource/Benchmarks/MiBench/consumer-typeset/CMakeFiles/consumer-typeset.dir/z37.s index 87cfcc66..2aa661f6 100644 --- a/results/MultiSource/Benchmarks/MiBench/consumer-typeset/CMakeFiles/consumer-typeset.dir/z37.s +++ b/results/MultiSource/Benchmarks/MiBench/consumer-typeset/CMakeFiles/consumer-typeset.dir/z37.s @@ -1048,7 +1048,7 @@ FontChange: # @FontChange addi.d $a0, $s7, 32 st.d $a0, $sp, 232 # 8-byte Folded Spill ori $s1, $zero, 10 - pcalau12i $s0, %pc_hi20(fd_tag) + pcalau12i $s6, %pc_hi20(fd_tag) pcalau12i $a0, %pc_hi20(fd_family) st.d $a0, $sp, 256 # 8-byte Folded Spill pcalau12i $a0, %pc_hi20(.L.str.77) @@ -1061,12 +1061,15 @@ FontChange: # @FontChange move $s2, $zero st.d $zero, $sp, 192 # 8-byte Folded Spill st.d $zero, $sp, 184 # 8-byte Folded Spill - move $fp, $zero + move $s0, $zero st.d $zero, $sp, 160 # 8-byte Folded Spill move $s5, $zero b .LBB1_111 .LBB1_109: # in Loop: Header=BB1_111 Depth=1 - move $fp, $s6 + ori $a3, $zero, 1 + pcaddu18i $ra, %call36(Error) + jirl $ra, $ra, 0 + .p2align 4, , 16 .LBB1_110: # in Loop: Header=BB1_111 Depth=1 ld.d $s3, $s3, 8 beq $s3, $s7, .LBB1_160 @@ -1074,13 +1077,12 @@ FontChange: # @FontChange # =>This Loop Header: Depth=1 # Child Loop BB1_112 Depth 2 # Child Loop BB1_154 Depth 2 - # Child Loop BB1_149 Depth 2 - # Child Loop BB1_145 Depth 2 - # Child Loop BB1_141 Depth 2 - # Child Loop BB1_137 Depth 2 + # Child Loop BB1_150 Depth 2 + # Child Loop BB1_146 Depth 2 + # Child Loop BB1_142 Depth 2 + # Child Loop BB1_138 Depth 2 # Child Loop BB1_130 Depth 2 # Child Loop BB1_125 Depth 2 - move $s6, $fp move $fp, $s3 .p2align 4, , 16 .LBB1_112: # Parent Loop BB1_111 Depth=1 @@ -1106,8 +1108,8 @@ FontChange: # @FontChange .LBB1_115: # %.loopexit520.i # in Loop: Header=BB1_111 Depth=1 ld.d $a0, $fp, 80 - ld.d $a1, $s0, %pc_lo12(fd_tag) - beq $a0, $a1, .LBB1_109 + ld.d $a1, $s6, %pc_lo12(fd_tag) + beq $a0, $a1, .LBB1_110 # %bb.116: # in Loop: Header=BB1_111 Depth=1 ld.d $a1, $sp, 256 # 8-byte Folded Reload ld.d $a1, $a1, %pc_lo12(fd_family) @@ -1119,21 +1121,21 @@ FontChange: # @FontChange # %bb.118: # in Loop: Header=BB1_111 Depth=1 pcalau12i $a1, %pc_hi20(fd_name) ld.d $a1, $a1, %pc_lo12(fd_name) - beq $a0, $a1, .LBB1_136 + beq $a0, $a1, .LBB1_137 # %bb.119: # in Loop: Header=BB1_111 Depth=1 
pcalau12i $a1, %pc_hi20(fd_metrics) ld.d $a1, $a1, %pc_lo12(fd_metrics) - beq $a0, $a1, .LBB1_140 + beq $a0, $a1, .LBB1_141 # %bb.120: # in Loop: Header=BB1_111 Depth=1 pcalau12i $a1, %pc_hi20(fd_extra_metrics) ld.d $a1, $a1, %pc_lo12(fd_extra_metrics) - beq $a0, $a1, .LBB1_144 + beq $a0, $a1, .LBB1_145 # %bb.121: # in Loop: Header=BB1_111 Depth=1 pcalau12i $a1, %pc_hi20(fd_mapping) ld.d $a1, $a1, %pc_lo12(fd_mapping) - beq $a0, $a1, .LBB1_148 + beq $a0, $a1, .LBB1_149 # %bb.122: # in Loop: Header=BB1_111 Depth=1 - st.d $s6, $sp, 168 # 8-byte Folded Spill + st.d $s0, $sp, 168 # 8-byte Folded Spill pcalau12i $s1, %pc_hi20(fd_recode) ld.d $a1, $s1, %pc_lo12(fd_recode) beq $a0, $a1, .LBB1_153 @@ -1162,14 +1164,13 @@ FontChange: # @FontChange addi.d $a0, $a0, -11 addi.d $s5, $s4, 64 ori $a1, $zero, 1 - move $fp, $s6 bltu $a1, $a0, .LBB1_128 # %bb.127: # in Loop: Header=BB1_111 Depth=1 move $a0, $s5 ld.d $a1, $sp, 216 # 8-byte Folded Reload pcaddu18i $ra, %call36(strcmp) jirl $ra, $ra, 0 - beqz $a0, .LBB1_134 + beqz $a0, .LBB1_135 .LBB1_128: # %.loopexit517.i # in Loop: Header=BB1_111 Depth=1 ori $a0, $zero, 37 @@ -1180,10 +1181,7 @@ FontChange: # @FontChange ld.d $a4, $sp, 232 # 8-byte Folded Reload move $a5, $s5 ld.d $a6, $sp, 200 # 8-byte Folded Reload - pcaddu18i $ra, %call36(Error) - jirl $ra, $ra, 0 - move $s5, $s1 - b .LBB1_135 + b .LBB1_134 .LBB1_129: # in Loop: Header=BB1_111 Depth=1 move $s1, $s5 ld.d $s2, $fp, 8 @@ -1197,14 +1195,13 @@ FontChange: # @FontChange addi.d $a0, $a0, -11 addi.d $s5, $s2, 64 ori $a1, $zero, 1 - move $fp, $s6 bltu $a1, $a0, .LBB1_133 # %bb.132: # in Loop: Header=BB1_111 Depth=1 move $a0, $s5 move $a1, $s8 pcaddu18i $ra, %call36(strcmp) jirl $ra, $ra, 0 - beqz $a0, .LBB1_134 + beqz $a0, .LBB1_135 .LBB1_133: # %.loopexit518.i # in Loop: Header=BB1_111 Depth=1 pcalau12i $a0, %pc_hi20(.L.str.78) @@ -1217,116 +1214,110 @@ FontChange: # @FontChange addi.d $a7, $sp, 1456 ld.d $a4, $sp, 232 # 8-byte Folded Reload move $a5, $s5 +.LBB1_134: # in Loop: Header=BB1_111 Depth=1 pcaddu18i $ra, %call36(Error) jirl $ra, $ra, 0 -.LBB1_134: # in Loop: Header=BB1_111 Depth=1 - move $s5, $s1 .LBB1_135: # in Loop: Header=BB1_111 Depth=1 + move $s5, $s1 +.LBB1_136: # in Loop: Header=BB1_111 Depth=1 ori $s1, $zero, 10 b .LBB1_110 -.LBB1_136: # in Loop: Header=BB1_111 Depth=1 +.LBB1_137: # in Loop: Header=BB1_111 Depth=1 ld.d $a0, $fp, 8 .p2align 4, , 16 -.LBB1_137: # Parent Loop BB1_111 Depth=1 +.LBB1_138: # Parent Loop BB1_111 Depth=1 # => This Inner Loop Header: Depth=2 ld.d $a0, $a0, 16 ld.bu $a1, $a0, 32 - beqz $a1, .LBB1_137 -# %bb.138: # in Loop: Header=BB1_111 Depth=1 + beqz $a1, .LBB1_138 +# %bb.139: # in Loop: Header=BB1_111 Depth=1 ori $a1, $zero, 1 pcaddu18i $ra, %call36(ReplaceWithTidy) jirl $ra, $ra, 0 ld.bu $a1, $a0, 32 addi.d $a1, $a1, -11 ori $a2, $zero, 2 - move $fp, $s6 st.d $a0, $sp, 192 # 8-byte Folded Spill bltu $a1, $a2, .LBB1_110 -# %bb.139: # in Loop: Header=BB1_111 Depth=1 +# %bb.140: # in Loop: Header=BB1_111 Depth=1 addi.d $a4, $a0, 32 pcalau12i $a0, %pc_hi20(.L.str.79) addi.d $a2, $a0, %pc_lo12(.L.str.79) ori $a0, $zero, 37 ori $a1, $zero, 14 - b .LBB1_152 -.LBB1_140: # in Loop: Header=BB1_111 Depth=1 + b .LBB1_109 +.LBB1_141: # in Loop: Header=BB1_111 Depth=1 ld.d $a0, $fp, 8 .p2align 4, , 16 -.LBB1_141: # Parent Loop BB1_111 Depth=1 +.LBB1_142: # Parent Loop BB1_111 Depth=1 # => This Inner Loop Header: Depth=2 ld.d $a0, $a0, 16 ld.bu $a1, $a0, 32 - beqz $a1, .LBB1_141 -# %bb.142: # in Loop: Header=BB1_111 Depth=1 + beqz $a1, .LBB1_142 +# %bb.143: # in 
Loop: Header=BB1_111 Depth=1 ori $a1, $zero, 1 pcaddu18i $ra, %call36(ReplaceWithTidy) jirl $ra, $ra, 0 ld.bu $a1, $a0, 32 addi.d $a1, $a1, -11 ori $a2, $zero, 2 - move $fp, $s6 st.d $a0, $sp, 184 # 8-byte Folded Spill bltu $a1, $a2, .LBB1_110 -# %bb.143: # in Loop: Header=BB1_111 Depth=1 +# %bb.144: # in Loop: Header=BB1_111 Depth=1 addi.d $a4, $a0, 32 pcalau12i $a0, %pc_hi20(.L.str.80) addi.d $a2, $a0, %pc_lo12(.L.str.80) ori $a0, $zero, 37 ori $a1, $zero, 15 - b .LBB1_152 -.LBB1_144: # in Loop: Header=BB1_111 Depth=1 + b .LBB1_109 +.LBB1_145: # in Loop: Header=BB1_111 Depth=1 ld.d $a0, $fp, 8 .p2align 4, , 16 -.LBB1_145: # Parent Loop BB1_111 Depth=1 +.LBB1_146: # Parent Loop BB1_111 Depth=1 # => This Inner Loop Header: Depth=2 ld.d $a0, $a0, 16 ld.bu $a1, $a0, 32 - beqz $a1, .LBB1_145 -# %bb.146: # in Loop: Header=BB1_111 Depth=1 + beqz $a1, .LBB1_146 +# %bb.147: # in Loop: Header=BB1_111 Depth=1 ori $a1, $zero, 1 pcaddu18i $ra, %call36(ReplaceWithTidy) jirl $ra, $ra, 0 - move $fp, $a0 + move $s0, $a0 ld.bu $a0, $a0, 32 addi.d $a0, $a0, -11 ori $a1, $zero, 2 bltu $a0, $a1, .LBB1_110 -# %bb.147: # in Loop: Header=BB1_111 Depth=1 - addi.d $a4, $fp, 32 +# %bb.148: # in Loop: Header=BB1_111 Depth=1 + addi.d $a4, $s0, 32 pcalau12i $a0, %pc_hi20(.L.str.81) addi.d $a2, $a0, %pc_lo12(.L.str.81) ori $a0, $zero, 37 ori $a1, $zero, 16 - b .LBB1_152 -.LBB1_148: # in Loop: Header=BB1_111 Depth=1 + b .LBB1_109 +.LBB1_149: # in Loop: Header=BB1_111 Depth=1 ld.d $a0, $fp, 8 .p2align 4, , 16 -.LBB1_149: # Parent Loop BB1_111 Depth=1 +.LBB1_150: # Parent Loop BB1_111 Depth=1 # => This Inner Loop Header: Depth=2 ld.d $a0, $a0, 16 ld.bu $a1, $a0, 32 - beqz $a1, .LBB1_149 -# %bb.150: # in Loop: Header=BB1_111 Depth=1 + beqz $a1, .LBB1_150 +# %bb.151: # in Loop: Header=BB1_111 Depth=1 ori $a1, $zero, 1 pcaddu18i $ra, %call36(ReplaceWithTidy) jirl $ra, $ra, 0 ld.bu $a1, $a0, 32 addi.d $a1, $a1, -11 ori $a2, $zero, 2 - move $fp, $s6 st.d $a0, $sp, 160 # 8-byte Folded Spill bltu $a1, $a2, .LBB1_110 -# %bb.151: # in Loop: Header=BB1_111 Depth=1 +# %bb.152: # in Loop: Header=BB1_111 Depth=1 addi.d $a4, $a0, 32 pcalau12i $a0, %pc_hi20(.L.str.82) addi.d $a2, $a0, %pc_lo12(.L.str.82) ori $a0, $zero, 37 ori $a1, $zero, 17 -.LBB1_152: # in Loop: Header=BB1_111 Depth=1 - ori $a3, $zero, 1 - pcaddu18i $ra, %call36(Error) - jirl $ra, $ra, 0 - b .LBB1_110 + b .LBB1_109 .LBB1_153: # in Loop: Header=BB1_111 Depth=1 ld.d $a0, $fp, 8 .LBB1_154: # Parent Loop BB1_111 Depth=1 @@ -1359,12 +1350,12 @@ FontChange: # @FontChange pcaddu18i $ra, %call36(Error) jirl $ra, $ra, 0 .LBB1_158: # in Loop: Header=BB1_111 Depth=1 - ld.d $fp, $sp, 168 # 8-byte Folded Reload - b .LBB1_135 + ld.d $s0, $sp, 168 # 8-byte Folded Reload + b .LBB1_136 .LBB1_159: move $s5, $zero st.d $zero, $sp, 160 # 8-byte Folded Spill - move $fp, $zero + move $s0, $zero st.d $zero, $sp, 184 # 8-byte Folded Spill st.d $zero, $sp, 192 # 8-byte Folded Spill move $s2, $zero @@ -1387,7 +1378,7 @@ FontChange: # @FontChange ld.d $a0, $sp, 160 # 8-byte Folded Reload beqz $a0, .LBB1_170 # %bb.165: - st.d $fp, $sp, 168 # 8-byte Folded Spill + st.d $s0, $sp, 168 # 8-byte Folded Spill ld.d $s0, $s5, %pc_lo12(font_root) addi.d $s3, $s4, 64 move $s1, $s0 @@ -3939,12 +3930,9 @@ FontChange: # @FontChange .LBB1_433: # %vec.epilog.vector.body # =>This Inner Loop Header: Depth=1 vld $vr2, $a6, 0 - vilvl.h $vr3, $vr2, $vr2 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr2, $vr2, $vr2 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 + vsllwil.w.h $vr3, $vr2, 0 + 
vbsrl.v $vr2, $vr2, 8 + vsllwil.w.h $vr2, $vr2, 0 vmul.w $vr2, $vr0, $vr2 vmul.w $vr3, $vr0, $vr3 vdiv.w $vr3, $vr3, $vr1 @@ -4004,18 +3992,12 @@ FontChange: # @FontChange # =>This Inner Loop Header: Depth=1 vld $vr2, $a5, -16 vld $vr3, $a5, 0 - vilvl.h $vr4, $vr2, $vr2 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvh.h $vr2, $vr2, $vr2 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr5, $vr3, $vr3 - vslli.w $vr5, $vr5, 16 - vsrai.w $vr5, $vr5, 16 - vilvh.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 + vsllwil.w.h $vr4, $vr2, 0 + vbsrl.v $vr2, $vr2, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr5, $vr3, 0 + vbsrl.v $vr3, $vr3, 8 + vsllwil.w.h $vr3, $vr3, 0 vmul.w $vr2, $vr0, $vr2 vmul.w $vr4, $vr0, $vr4 vmul.w $vr3, $vr0, $vr3 diff --git a/results/MultiSource/Benchmarks/MiBench/office-ispell/CMakeFiles/office-ispell.dir/correct.s b/results/MultiSource/Benchmarks/MiBench/office-ispell/CMakeFiles/office-ispell.dir/correct.s index 50c65992..f6893c6b 100644 --- a/results/MultiSource/Benchmarks/MiBench/office-ispell/CMakeFiles/office-ispell.dir/correct.s +++ b/results/MultiSource/Benchmarks/MiBench/office-ispell/CMakeFiles/office-ispell.dir/correct.s @@ -3517,19 +3517,19 @@ ins_root_cap: # @ins_root_cap .type save_root_cap,@function save_root_cap: # @save_root_cap # %bb.0: - addi.d $sp, $sp, -480 - st.d $ra, $sp, 472 # 8-byte Folded Spill - st.d $fp, $sp, 464 # 8-byte Folded Spill - st.d $s0, $sp, 456 # 8-byte Folded Spill - st.d $s1, $sp, 448 # 8-byte Folded Spill - st.d $s2, $sp, 440 # 8-byte Folded Spill - st.d $s3, $sp, 432 # 8-byte Folded Spill - st.d $s4, $sp, 424 # 8-byte Folded Spill - st.d $s5, $sp, 416 # 8-byte Folded Spill - st.d $s6, $sp, 408 # 8-byte Folded Spill - st.d $s7, $sp, 400 # 8-byte Folded Spill - st.d $s8, $sp, 392 # 8-byte Folded Spill - ld.d $s6, $sp, 496 + addi.d $sp, $sp, -464 + st.d $ra, $sp, 456 # 8-byte Folded Spill + st.d $fp, $sp, 448 # 8-byte Folded Spill + st.d $s0, $sp, 440 # 8-byte Folded Spill + st.d $s1, $sp, 432 # 8-byte Folded Spill + st.d $s2, $sp, 424 # 8-byte Folded Spill + st.d $s3, $sp, 416 # 8-byte Folded Spill + st.d $s4, $sp, 408 # 8-byte Folded Spill + st.d $s5, $sp, 400 # 8-byte Folded Spill + st.d $s6, $sp, 392 # 8-byte Folded Spill + st.d $s7, $sp, 384 # 8-byte Folded Spill + st.d $s8, $sp, 376 # 8-byte Folded Spill + ld.d $s6, $sp, 480 ld.w $t0, $s6, 0 ori $t1, $zero, 9 blt $t1, $t0, .LBB10_8 @@ -3542,8 +3542,8 @@ save_root_cap: # @save_root_cap move $s3, $a1 move $s4, $a4 move $s5, $a5 - ld.d $s7, $sp, 488 - addi.d $a0, $sp, 208 + ld.d $s7, $sp, 472 + addi.d $a0, $sp, 192 move $a1, $a2 pcaddu18i $ra, %call36(strcpy) jirl $ra, $ra, 0 @@ -3557,12 +3557,12 @@ save_root_cap: # @save_root_cap beqz $a2, .LBB10_6 # %bb.3: # %.lr.ph.preheader move $a0, $s3 - ld.d $s3, $sp, 480 + ld.d $s3, $sp, 464 pcalau12i $a3, %got_pc_hi20(hashheader) ld.d $s8, $a3, %got_pc_lo12(hashheader) add.d $a3, $s8, $a2 ld.bu $a3, $a3, 1210 - st.d $a3, $sp, 200 # 8-byte Folded Spill + st.d $a3, $sp, 184 # 8-byte Folded Spill addi.d $a3, $a0, 1 .p2align 4, , 16 .LBB10_4: # %.lr.ph @@ -3575,7 +3575,7 @@ save_root_cap: # @save_root_cap addi.d $a3, $a3, 1 bnez $a2, .LBB10_4 .LBB10_6: - addi.d $a0, $sp, 208 + addi.d $a0, $sp, 192 pcaddu18i $ra, %call36(upcase) jirl $ra, $ra, 0 .LBB10_7: # %.loopexit151.sink.split @@ -3583,25 +3583,25 @@ save_root_cap: # @save_root_cap ori $a1, $zero, 120 mul.d $a0, $a0, $a1 add.d $a0, $s7, $a0 - addi.d $a1, $sp, 208 + addi.d $a1, $sp, 192 pcaddu18i $ra, %call36(strcpy) jirl $ra, $ra, 0 ld.w $a0, 
$s6, 0 addi.d $a0, $a0, 1 st.w $a0, $s6, 0 .LBB10_8: # %.loopexit151 - ld.d $s8, $sp, 392 # 8-byte Folded Reload - ld.d $s7, $sp, 400 # 8-byte Folded Reload - ld.d $s6, $sp, 408 # 8-byte Folded Reload - ld.d $s5, $sp, 416 # 8-byte Folded Reload - ld.d $s4, $sp, 424 # 8-byte Folded Reload - ld.d $s3, $sp, 432 # 8-byte Folded Reload - ld.d $s2, $sp, 440 # 8-byte Folded Reload - ld.d $s1, $sp, 448 # 8-byte Folded Reload - ld.d $s0, $sp, 456 # 8-byte Folded Reload - ld.d $fp, $sp, 464 # 8-byte Folded Reload - ld.d $ra, $sp, 472 # 8-byte Folded Reload - addi.d $sp, $sp, 480 + ld.d $s8, $sp, 376 # 8-byte Folded Reload + ld.d $s7, $sp, 384 # 8-byte Folded Reload + ld.d $s6, $sp, 392 # 8-byte Folded Reload + ld.d $s5, $sp, 400 # 8-byte Folded Reload + ld.d $s4, $sp, 408 # 8-byte Folded Reload + ld.d $s3, $sp, 416 # 8-byte Folded Reload + ld.d $s2, $sp, 424 # 8-byte Folded Reload + ld.d $s1, $sp, 432 # 8-byte Folded Reload + ld.d $s0, $sp, 440 # 8-byte Folded Reload + ld.d $fp, $sp, 448 # 8-byte Folded Reload + ld.d $ra, $sp, 456 # 8-byte Folded Reload + addi.d $sp, $sp, 464 ret .LBB10_9: # %.preheader153.preheader addi.d $a0, $a0, 1 @@ -3628,21 +3628,21 @@ save_root_cap: # @save_root_cap # %bb.13: ld.d $s0, $s0, 0 .LBB10_14: - addi.d $a1, $sp, 208 + addi.d $a1, $sp, 192 add.d $a1, $a1, $s1 - st.d $a1, $sp, 184 # 8-byte Folded Spill + st.d $a1, $sp, 168 # 8-byte Folded Spill add.d $a1, $s4, $s2 sub.w $a2, $a0, $a1 - st.d $a2, $sp, 176 # 8-byte Folded Spill + st.d $a2, $sp, 160 # 8-byte Folded Spill add.d $a2, $s1, $a0 sub.d $a3, $s5, $a1 add.w $a3, $a3, $a2 - st.d $a3, $sp, 168 # 8-byte Folded Spill + st.d $a3, $sp, 152 # 8-byte Folded Spill sub.w $a4, $a2, $a1 nor $a3, $s4, $zero - addi.d $s4, $sp, 208 + addi.d $s4, $sp, 192 add.w $a0, $a0, $a3 - st.d $a0, $sp, 160 # 8-byte Folded Spill + st.d $a0, $sp, 144 # 8-byte Folded Spill add.d $a3, $s4, $a4 bstrpick.d $a0, $s1, 31, 0 st.d $a0, $sp, 136 # 8-byte Folded Spill @@ -3669,7 +3669,7 @@ save_root_cap: # @save_root_cap st.d $a5, $sp, 88 # 8-byte Folded Spill bstrpick.d $a0, $a5, 32, 3 slli.d $a5, $a0, 3 - st.d $a4, $sp, 192 # 8-byte Folded Spill + st.d $a4, $sp, 176 # 8-byte Folded Spill alsl.w $a4, $a0, $a4, 3 st.d $a4, $sp, 56 # 8-byte Folded Spill st.d $a3, $sp, 128 # 8-byte Folded Spill @@ -3685,8 +3685,6 @@ save_root_cap: # @save_root_cap sub.d $a0, $a0, $a1 st.d $a0, $sp, 104 # 8-byte Folded Spill lu12i.w $s5, 196608 - vrepli.b $vr0, 0 - vst $vr0, $sp, 144 # 16-byte Folded Spill beqz $fp, .LBB10_16 .p2align 4, , 16 .LBB10_15: @@ -3713,8 +3711,8 @@ save_root_cap: # @save_root_cap jirl $ra, $ra, 0 move $s5, $a0 add.d $a1, $a0, $s2 - ld.d $a0, $sp, 184 # 8-byte Folded Reload - ld.d $a2, $sp, 176 # 8-byte Folded Reload + ld.d $a0, $sp, 168 # 8-byte Folded Reload + ld.d $a2, $sp, 160 # 8-byte Folded Reload pcaddu18i $ra, %call36(memmove) jirl $ra, $ra, 0 ldx.bu $a0, $s5, $s2 @@ -3722,7 +3720,6 @@ save_root_cap: # @save_root_cap ld.bu $a0, $a0, 1210 beqz $a0, .LBB10_26 # %bb.20: # %.preheader149 - vld $vr8, $sp, 144 # 16-byte Folded Reload blez $s1, .LBB10_51 # %bb.21: # %iter.check302 ori $a0, $zero, 8 @@ -3732,10 +3729,10 @@ save_root_cap: # @save_root_cap b .LBB10_43 .p2align 4, , 16 .LBB10_23: - addi.d $a0, $sp, 208 + addi.d $a0, $sp, 192 pcaddu18i $ra, %call36(lowcase) jirl $ra, $ra, 0 - ld.d $a0, $sp, 200 # 8-byte Folded Reload + ld.d $a0, $sp, 184 # 8-byte Folded Reload bnez $a0, .LBB10_25 # %bb.24: ld.d $a0, $s0, 16 @@ -3743,13 +3740,12 @@ save_root_cap: # @save_root_cap lu12i.w $a1, 131072 bne $a0, $a1, .LBB10_78 .LBB10_25: - ld.bu $a0, 
$sp, 208 + ld.bu $a0, $sp, 192 add.d $a0, $s8, $a0 ld.b $a0, $a0, 754 - st.b $a0, $sp, 208 + st.b $a0, $sp, 192 b .LBB10_78 .LBB10_26: # %.preheader - vld $vr8, $sp, 144 # 16-byte Folded Reload blez $s1, .LBB10_51 # %bb.27: # %iter.check274 ori $a0, $zero, 8 @@ -3854,43 +3850,60 @@ save_root_cap: # @save_root_cap bne $a2, $a0, .LBB10_49 b .LBB10_51 .LBB10_39: # %vector.body308.preheader - addi.d $a0, $sp, 208 + addi.d $a0, $sp, 192 ld.d $a1, $sp, 112 # 8-byte Folded Reload .p2align 4, , 16 .LBB10_40: # %vector.body308 # =>This Inner Loop Header: Depth=1 vld $vr0, $a0, 0 - vilvh.b $vr1, $vr8, $vr0 - vilvh.h $vr2, $vr8, $vr1 - vilvh.w $vr3, $vr8, $vr2 - vilvl.w $vr2, $vr8, $vr2 - vilvl.h $vr1, $vr8, $vr1 - vilvh.w $vr4, $vr8, $vr1 - vilvl.w $vr1, $vr8, $vr1 - vilvl.b $vr0, $vr8, $vr0 - vilvh.h $vr5, $vr8, $vr0 - vilvh.w $vr6, $vr8, $vr5 - vilvl.w $vr5, $vr8, $vr5 - vilvl.h $vr0, $vr8, $vr0 - vilvh.w $vr7, $vr8, $vr0 - vilvl.w $vr0, $vr8, $vr0 + vbsrl.v $vr1, $vr0, 14 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vsllwil.du.wu $vr1, $vr1, 0 + vbsrl.v $vr2, $vr0, 12 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.du.wu $vr2, $vr2, 0 + vbsrl.v $vr3, $vr0, 10 + vsllwil.hu.bu $vr3, $vr3, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vsllwil.du.wu $vr3, $vr3, 0 + vbsrl.v $vr4, $vr0, 8 + vsllwil.hu.bu $vr4, $vr4, 0 + vsllwil.wu.hu $vr4, $vr4, 0 + vsllwil.du.wu $vr4, $vr4, 0 + vsrli.d $vr5, $vr0, 48 + vsllwil.hu.bu $vr5, $vr5, 0 + vsllwil.wu.hu $vr5, $vr5, 0 + vsllwil.du.wu $vr5, $vr5, 0 + vsrli.d $vr6, $vr0, 32 + vsllwil.hu.bu $vr6, $vr6, 0 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.du.wu $vr6, $vr6, 0 + vshuf4i.b $vr7, $vr0, 14 + vsllwil.hu.bu $vr7, $vr7, 0 + vsllwil.wu.hu $vr7, $vr7, 0 + vsllwil.du.wu $vr7, $vr7, 0 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.du.wu $vr0, $vr0, 0 vpickve2gr.d $a2, $vr0, 0 addi.d $a3, $s8, 754 vpickve2gr.d $a4, $vr0, 1 vpickve2gr.d $a5, $vr7, 0 vpickve2gr.d $a6, $vr7, 1 - vpickve2gr.d $a7, $vr5, 0 - vpickve2gr.d $t0, $vr5, 1 - vpickve2gr.d $t1, $vr6, 0 - vpickve2gr.d $t2, $vr6, 1 - vpickve2gr.d $t3, $vr1, 0 - vpickve2gr.d $t4, $vr1, 1 - vpickve2gr.d $t5, $vr4, 0 - vpickve2gr.d $t6, $vr4, 1 + vpickve2gr.d $a7, $vr6, 0 + vpickve2gr.d $t0, $vr6, 1 + vpickve2gr.d $t1, $vr5, 0 + vpickve2gr.d $t2, $vr5, 1 + vpickve2gr.d $t3, $vr4, 0 + vpickve2gr.d $t4, $vr4, 1 + vpickve2gr.d $t5, $vr3, 0 + vpickve2gr.d $t6, $vr3, 1 vpickve2gr.d $t7, $vr2, 0 vpickve2gr.d $t8, $vr2, 1 - vpickve2gr.d $ra, $vr3, 0 - vpickve2gr.d $s4, $vr3, 1 + vpickve2gr.d $ra, $vr1, 0 + vpickve2gr.d $s4, $vr1, 1 ldx.b $a2, $a3, $a2 ldx.b $a4, $a3, $a4 ldx.b $a5, $a3, $a5 @@ -3928,7 +3941,7 @@ save_root_cap: # @save_root_cap addi.d $a0, $a0, 16 bnez $a1, .LBB10_40 # %bb.41: # %middle.block312 - addi.d $s4, $sp, 208 + addi.d $s4, $sp, 192 ld.d $a0, $sp, 136 # 8-byte Folded Reload ld.d $a1, $sp, 112 # 8-byte Folded Reload beq $a1, $a0, .LBB10_51 @@ -3953,43 +3966,60 @@ save_root_cap: # @save_root_cap bnez $a1, .LBB10_44 b .LBB10_51 .LBB10_45: # %vector.body280.preheader - addi.d $a0, $sp, 208 + addi.d $a0, $sp, 192 ld.d $a1, $sp, 112 # 8-byte Folded Reload .p2align 4, , 16 .LBB10_46: # %vector.body280 # =>This Inner Loop Header: Depth=1 vld $vr0, $a0, 0 - vilvh.b $vr1, $vr8, $vr0 - vilvh.h $vr2, $vr8, $vr1 - vilvh.w $vr3, $vr8, $vr2 - vilvl.w $vr2, $vr8, $vr2 - vilvl.h $vr1, $vr8, $vr1 - vilvh.w $vr4, $vr8, $vr1 - vilvl.w $vr1, $vr8, $vr1 - vilvl.b $vr0, $vr8, $vr0 - vilvh.h $vr5, $vr8, $vr0 - vilvh.w $vr6, $vr8, $vr5 - vilvl.w $vr5, $vr8, $vr5 - vilvl.h $vr0, 
$vr8, $vr0 - vilvh.w $vr7, $vr8, $vr0 - vilvl.w $vr0, $vr8, $vr0 + vbsrl.v $vr1, $vr0, 14 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vsllwil.du.wu $vr1, $vr1, 0 + vbsrl.v $vr2, $vr0, 12 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.du.wu $vr2, $vr2, 0 + vbsrl.v $vr3, $vr0, 10 + vsllwil.hu.bu $vr3, $vr3, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vsllwil.du.wu $vr3, $vr3, 0 + vbsrl.v $vr4, $vr0, 8 + vsllwil.hu.bu $vr4, $vr4, 0 + vsllwil.wu.hu $vr4, $vr4, 0 + vsllwil.du.wu $vr4, $vr4, 0 + vsrli.d $vr5, $vr0, 48 + vsllwil.hu.bu $vr5, $vr5, 0 + vsllwil.wu.hu $vr5, $vr5, 0 + vsllwil.du.wu $vr5, $vr5, 0 + vsrli.d $vr6, $vr0, 32 + vsllwil.hu.bu $vr6, $vr6, 0 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.du.wu $vr6, $vr6, 0 + vshuf4i.b $vr7, $vr0, 14 + vsllwil.hu.bu $vr7, $vr7, 0 + vsllwil.wu.hu $vr7, $vr7, 0 + vsllwil.du.wu $vr7, $vr7, 0 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.du.wu $vr0, $vr0, 0 vpickve2gr.d $a2, $vr0, 0 addi.d $a3, $s8, 526 vpickve2gr.d $a4, $vr0, 1 vpickve2gr.d $a5, $vr7, 0 vpickve2gr.d $a6, $vr7, 1 - vpickve2gr.d $a7, $vr5, 0 - vpickve2gr.d $t0, $vr5, 1 - vpickve2gr.d $t1, $vr6, 0 - vpickve2gr.d $t2, $vr6, 1 - vpickve2gr.d $t3, $vr1, 0 - vpickve2gr.d $t4, $vr1, 1 - vpickve2gr.d $t5, $vr4, 0 - vpickve2gr.d $t6, $vr4, 1 + vpickve2gr.d $a7, $vr6, 0 + vpickve2gr.d $t0, $vr6, 1 + vpickve2gr.d $t1, $vr5, 0 + vpickve2gr.d $t2, $vr5, 1 + vpickve2gr.d $t3, $vr4, 0 + vpickve2gr.d $t4, $vr4, 1 + vpickve2gr.d $t5, $vr3, 0 + vpickve2gr.d $t6, $vr3, 1 vpickve2gr.d $t7, $vr2, 0 vpickve2gr.d $t8, $vr2, 1 - vpickve2gr.d $s4, $vr3, 0 - vpickve2gr.d $ra, $vr3, 1 + vpickve2gr.d $s4, $vr1, 0 + vpickve2gr.d $ra, $vr1, 1 ldx.b $a2, $a3, $a2 ldx.b $a4, $a3, $a4 ldx.b $a5, $a3, $a5 @@ -4027,7 +4057,7 @@ save_root_cap: # @save_root_cap addi.d $a0, $a0, 16 bnez $a1, .LBB10_46 # %bb.47: # %middle.block284 - addi.d $s4, $sp, 208 + addi.d $s4, $sp, 192 ld.d $a0, $sp, 136 # 8-byte Folded Reload ld.d $a1, $sp, 112 # 8-byte Folded Reload beq $a1, $a0, .LBB10_51 @@ -4052,18 +4082,18 @@ save_root_cap: # @save_root_cap bnez $a1, .LBB10_50 .p2align 4, , 16 .LBB10_51: # %.loopexit148 - ld.d $a0, $sp, 160 # 8-byte Folded Reload + ld.d $a0, $sp, 144 # 8-byte Folded Reload ldx.bu $a0, $s5, $a0 add.d $a0, $s8, $a0 ld.bu $a0, $a0, 1210 beqz $a0, .LBB10_59 # %bb.52: - ld.d $a0, $sp, 168 # 8-byte Folded Reload - ld.d $a1, $sp, 192 # 8-byte Folded Reload + ld.d $a0, $sp, 152 # 8-byte Folded Reload + ld.d $a1, $sp, 176 # 8-byte Folded Reload lu12i.w $s5, 196608 bge $a1, $a0, .LBB10_78 # %bb.53: # %iter.check239 - ld.d $a1, $sp, 192 # 8-byte Folded Reload + ld.d $a1, $sp, 176 # 8-byte Folded Reload ld.d $a0, $sp, 128 # 8-byte Folded Reload ld.d $a2, $sp, 120 # 8-byte Folded Reload ori $a3, $zero, 7 @@ -4120,12 +4150,12 @@ save_root_cap: # @save_root_cap bne $a2, $a3, .LBB10_70 b .LBB10_78 .LBB10_59: - ld.d $a0, $sp, 168 # 8-byte Folded Reload - ld.d $a1, $sp, 192 # 8-byte Folded Reload + ld.d $a0, $sp, 152 # 8-byte Folded Reload + ld.d $a1, $sp, 176 # 8-byte Folded Reload lu12i.w $s5, 196608 bge $a1, $a0, .LBB10_78 # %bb.60: # %iter.check - ld.d $a1, $sp, 192 # 8-byte Folded Reload + ld.d $a1, $sp, 176 # 8-byte Folded Reload ld.d $a0, $sp, 128 # 8-byte Folded Reload ld.d $a2, $sp, 120 # 8-byte Folded Reload ori $a3, $zero, 7 @@ -4188,37 +4218,54 @@ save_root_cap: # @save_root_cap .LBB10_67: # %vector.body245 # =>This Inner Loop Header: Depth=1 vld $vr0, $a0, 0 - vilvh.b $vr1, $vr8, $vr0 - vilvh.h $vr2, $vr8, $vr1 - vilvh.w $vr3, $vr8, $vr2 - vilvl.w $vr2, $vr8, $vr2 
- vilvl.h $vr1, $vr8, $vr1 - vilvh.w $vr4, $vr8, $vr1 - vilvl.w $vr1, $vr8, $vr1 - vilvl.b $vr0, $vr8, $vr0 - vilvh.h $vr5, $vr8, $vr0 - vilvh.w $vr6, $vr8, $vr5 - vilvl.w $vr5, $vr8, $vr5 - vilvl.h $vr0, $vr8, $vr0 - vilvh.w $vr7, $vr8, $vr0 - vilvl.w $vr0, $vr8, $vr0 + vbsrl.v $vr1, $vr0, 14 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vsllwil.du.wu $vr1, $vr1, 0 + vbsrl.v $vr2, $vr0, 12 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.du.wu $vr2, $vr2, 0 + vbsrl.v $vr3, $vr0, 10 + vsllwil.hu.bu $vr3, $vr3, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vsllwil.du.wu $vr3, $vr3, 0 + vbsrl.v $vr4, $vr0, 8 + vsllwil.hu.bu $vr4, $vr4, 0 + vsllwil.wu.hu $vr4, $vr4, 0 + vsllwil.du.wu $vr4, $vr4, 0 + vsrli.d $vr5, $vr0, 48 + vsllwil.hu.bu $vr5, $vr5, 0 + vsllwil.wu.hu $vr5, $vr5, 0 + vsllwil.du.wu $vr5, $vr5, 0 + vsrli.d $vr6, $vr0, 32 + vsllwil.hu.bu $vr6, $vr6, 0 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.du.wu $vr6, $vr6, 0 + vshuf4i.b $vr7, $vr0, 14 + vsllwil.hu.bu $vr7, $vr7, 0 + vsllwil.wu.hu $vr7, $vr7, 0 + vsllwil.du.wu $vr7, $vr7, 0 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.du.wu $vr0, $vr0, 0 vpickve2gr.d $a2, $vr0, 0 addi.d $a3, $s8, 754 vpickve2gr.d $a4, $vr0, 1 vpickve2gr.d $a5, $vr7, 0 vpickve2gr.d $a6, $vr7, 1 - vpickve2gr.d $a7, $vr5, 0 - vpickve2gr.d $t0, $vr5, 1 - vpickve2gr.d $t1, $vr6, 0 - vpickve2gr.d $t2, $vr6, 1 - vpickve2gr.d $t3, $vr1, 0 - vpickve2gr.d $t4, $vr1, 1 - vpickve2gr.d $t5, $vr4, 0 - vpickve2gr.d $t6, $vr4, 1 + vpickve2gr.d $a7, $vr6, 0 + vpickve2gr.d $t0, $vr6, 1 + vpickve2gr.d $t1, $vr5, 0 + vpickve2gr.d $t2, $vr5, 1 + vpickve2gr.d $t3, $vr4, 0 + vpickve2gr.d $t4, $vr4, 1 + vpickve2gr.d $t5, $vr3, 0 + vpickve2gr.d $t6, $vr3, 1 vpickve2gr.d $t7, $vr2, 0 vpickve2gr.d $t8, $vr2, 1 - vpickve2gr.d $s4, $vr3, 0 - vpickve2gr.d $s5, $vr3, 1 + vpickve2gr.d $s4, $vr1, 0 + vpickve2gr.d $s5, $vr1, 1 ldx.b $a2, $a3, $a2 ldx.b $a4, $a3, $a4 ldx.b $a5, $a3, $a5 @@ -4256,7 +4303,7 @@ save_root_cap: # @save_root_cap addi.d $a0, $a0, 16 bnez $a1, .LBB10_67 # %bb.68: # %middle.block250 - addi.d $s4, $sp, 208 + addi.d $s4, $sp, 192 ld.d $a0, $sp, 88 # 8-byte Folded Reload ld.d $a1, $sp, 96 # 8-byte Folded Reload lu12i.w $s5, 196608 @@ -4288,37 +4335,54 @@ save_root_cap: # @save_root_cap .LBB10_73: # %vector.body # =>This Inner Loop Header: Depth=1 vld $vr0, $a0, 0 - vilvh.b $vr1, $vr8, $vr0 - vilvh.h $vr2, $vr8, $vr1 - vilvh.w $vr3, $vr8, $vr2 - vilvl.w $vr2, $vr8, $vr2 - vilvl.h $vr1, $vr8, $vr1 - vilvh.w $vr4, $vr8, $vr1 - vilvl.w $vr1, $vr8, $vr1 - vilvl.b $vr0, $vr8, $vr0 - vilvh.h $vr5, $vr8, $vr0 - vilvh.w $vr6, $vr8, $vr5 - vilvl.w $vr5, $vr8, $vr5 - vilvl.h $vr0, $vr8, $vr0 - vilvh.w $vr7, $vr8, $vr0 - vilvl.w $vr0, $vr8, $vr0 + vbsrl.v $vr1, $vr0, 14 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vsllwil.du.wu $vr1, $vr1, 0 + vbsrl.v $vr2, $vr0, 12 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.du.wu $vr2, $vr2, 0 + vbsrl.v $vr3, $vr0, 10 + vsllwil.hu.bu $vr3, $vr3, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vsllwil.du.wu $vr3, $vr3, 0 + vbsrl.v $vr4, $vr0, 8 + vsllwil.hu.bu $vr4, $vr4, 0 + vsllwil.wu.hu $vr4, $vr4, 0 + vsllwil.du.wu $vr4, $vr4, 0 + vsrli.d $vr5, $vr0, 48 + vsllwil.hu.bu $vr5, $vr5, 0 + vsllwil.wu.hu $vr5, $vr5, 0 + vsllwil.du.wu $vr5, $vr5, 0 + vsrli.d $vr6, $vr0, 32 + vsllwil.hu.bu $vr6, $vr6, 0 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.du.wu $vr6, $vr6, 0 + vshuf4i.b $vr7, $vr0, 14 + vsllwil.hu.bu $vr7, $vr7, 0 + vsllwil.wu.hu $vr7, $vr7, 0 + vsllwil.du.wu $vr7, $vr7, 0 + 
vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.du.wu $vr0, $vr0, 0 vpickve2gr.d $a2, $vr0, 0 addi.d $a3, $s8, 526 vpickve2gr.d $a4, $vr0, 1 vpickve2gr.d $a5, $vr7, 0 vpickve2gr.d $a6, $vr7, 1 - vpickve2gr.d $a7, $vr5, 0 - vpickve2gr.d $t0, $vr5, 1 - vpickve2gr.d $t1, $vr6, 0 - vpickve2gr.d $t2, $vr6, 1 - vpickve2gr.d $t3, $vr1, 0 - vpickve2gr.d $t4, $vr1, 1 - vpickve2gr.d $t5, $vr4, 0 - vpickve2gr.d $t6, $vr4, 1 + vpickve2gr.d $a7, $vr6, 0 + vpickve2gr.d $t0, $vr6, 1 + vpickve2gr.d $t1, $vr5, 0 + vpickve2gr.d $t2, $vr5, 1 + vpickve2gr.d $t3, $vr4, 0 + vpickve2gr.d $t4, $vr4, 1 + vpickve2gr.d $t5, $vr3, 0 + vpickve2gr.d $t6, $vr3, 1 vpickve2gr.d $t7, $vr2, 0 vpickve2gr.d $t8, $vr2, 1 - vpickve2gr.d $s4, $vr3, 0 - vpickve2gr.d $s5, $vr3, 1 + vpickve2gr.d $s4, $vr1, 0 + vpickve2gr.d $s5, $vr1, 1 ldx.b $a2, $a3, $a2 ldx.b $a4, $a3, $a4 ldx.b $a5, $a3, $a5 @@ -4356,7 +4420,7 @@ save_root_cap: # @save_root_cap addi.d $a0, $a0, 16 bnez $a1, .LBB10_73 # %bb.74: # %middle.block - addi.d $s4, $sp, 208 + addi.d $s4, $sp, 192 ld.d $a0, $sp, 88 # 8-byte Folded Reload ld.d $a1, $sp, 96 # 8-byte Folded Reload lu12i.w $s5, 196608 @@ -4386,7 +4450,7 @@ save_root_cap: # @save_root_cap ori $a1, $zero, 120 mul.d $a0, $a0, $a1 add.d $a0, $s7, $a0 - addi.d $a1, $sp, 208 + addi.d $a1, $sp, 192 pcaddu18i $ra, %call36(strcpy) jirl $ra, $ra, 0 ld.w $a0, $s6, 0 @@ -4405,19 +4469,19 @@ save_root_cap: # @save_root_cap .LBB10_81: lu12i.w $a0, 196608 and $a2, $a1, $a0 - ld.d $a3, $sp, 200 # 8-byte Folded Reload + ld.d $a3, $sp, 184 # 8-byte Folded Reload beqz $a3, .LBB10_84 # %bb.82: slli.d $a2, $a2, 35 bnez $a2, .LBB10_85 .LBB10_83: - addi.d $a0, $sp, 208 + addi.d $a0, $sp, 192 pcaddu18i $ra, %call36(lowcase) jirl $ra, $ra, 0 - ld.bu $a0, $sp, 208 + ld.bu $a0, $sp, 192 add.d $a0, $s8, $a0 ld.b $a0, $a0, 754 - st.b $a0, $sp, 208 + st.b $a0, $sp, 192 b .LBB10_7 .LBB10_84: beqz $a2, .LBB10_108 @@ -4451,7 +4515,7 @@ save_root_cap: # @save_root_cap andi $a5, $a5, 1 beqz $a5, .LBB10_89 .LBB10_93: # in Loop: Header=BB10_90 Depth=1 - ld.d $a5, $sp, 200 # 8-byte Folded Reload + ld.d $a5, $sp, 184 # 8-byte Folded Reload beqz $a5, .LBB10_88 # %bb.94: # in Loop: Header=BB10_90 Depth=1 bne $a4, $a1, .LBB10_89 @@ -4479,7 +4543,7 @@ save_root_cap: # @save_root_cap andi $a5, $a5, 1 beqz $a5, .LBB10_96 # %bb.100: # in Loop: Header=BB10_97 Depth=1 - ld.d $a5, $sp, 200 # 8-byte Folded Reload + ld.d $a5, $sp, 184 # 8-byte Folded Reload beqz $a5, .LBB10_95 # %bb.101: # in Loop: Header=BB10_97 Depth=1 bne $a4, $a1, .LBB10_96 @@ -4502,13 +4566,13 @@ save_root_cap: # @save_root_cap andi $a5, $a5, 1 beqz $a5, .LBB10_103 # %bb.106: # in Loop: Header=BB10_104 Depth=1 - ld.d $a5, $sp, 200 # 8-byte Folded Reload + ld.d $a5, $sp, 184 # 8-byte Folded Reload beqz $a5, .LBB10_102 # %bb.107: # in Loop: Header=BB10_104 Depth=1 bne $a4, $a1, .LBB10_103 b .LBB10_83 .LBB10_108: - addi.d $a0, $sp, 208 + addi.d $a0, $sp, 192 pcaddu18i $ra, %call36(lowcase) jirl $ra, $ra, 0 b .LBB10_7 diff --git a/results/MultiSource/Benchmarks/MiBench/office-ispell/CMakeFiles/office-ispell.dir/tgood.s b/results/MultiSource/Benchmarks/MiBench/office-ispell/CMakeFiles/office-ispell.dir/tgood.s index cc74e562..926af9fc 100644 --- a/results/MultiSource/Benchmarks/MiBench/office-ispell/CMakeFiles/office-ispell.dir/tgood.s +++ b/results/MultiSource/Benchmarks/MiBench/office-ispell/CMakeFiles/office-ispell.dir/tgood.s @@ -616,21 +616,21 @@ chk_suf: # @chk_suf .type expand_pre,@function expand_pre: # @expand_pre # %bb.0: - addi.d $sp, $sp, -272 - st.d $ra, $sp, 
264 # 8-byte Folded Spill - st.d $fp, $sp, 256 # 8-byte Folded Spill - st.d $s0, $sp, 248 # 8-byte Folded Spill - st.d $s1, $sp, 240 # 8-byte Folded Spill - st.d $s2, $sp, 232 # 8-byte Folded Spill - st.d $s3, $sp, 224 # 8-byte Folded Spill - st.d $s4, $sp, 216 # 8-byte Folded Spill - st.d $s5, $sp, 208 # 8-byte Folded Spill - st.d $s6, $sp, 200 # 8-byte Folded Spill - st.d $s7, $sp, 192 # 8-byte Folded Spill - st.d $s8, $sp, 184 # 8-byte Folded Spill - st.d $a4, $sp, 40 # 8-byte Folded Spill - st.d $a3, $sp, 56 # 8-byte Folded Spill - st.d $a0, $sp, 8 # 8-byte Folded Spill + addi.d $sp, $sp, -256 + st.d $ra, $sp, 248 # 8-byte Folded Spill + st.d $fp, $sp, 240 # 8-byte Folded Spill + st.d $s0, $sp, 232 # 8-byte Folded Spill + st.d $s1, $sp, 224 # 8-byte Folded Spill + st.d $s2, $sp, 216 # 8-byte Folded Spill + st.d $s3, $sp, 208 # 8-byte Folded Spill + st.d $s4, $sp, 200 # 8-byte Folded Spill + st.d $s5, $sp, 192 # 8-byte Folded Spill + st.d $s6, $sp, 184 # 8-byte Folded Spill + st.d $s7, $sp, 176 # 8-byte Folded Spill + st.d $s8, $sp, 168 # 8-byte Folded Spill + st.d $a4, $sp, 24 # 8-byte Folded Spill + st.d $a3, $sp, 40 # 8-byte Folded Spill + st.d $a0, $sp, 16 # 8-byte Folded Spill pcalau12i $a0, %got_pc_hi20(numpflags) ld.d $a0, $a0, %got_pc_lo12(numpflags) ld.w $a0, $a0, 0 @@ -645,9 +645,7 @@ expand_pre: # @expand_pre pcalau12i $a1, %got_pc_hi20(hashheader) ld.d $s7, $a1, %got_pc_lo12(hashheader) move $s4, $zero - vrepli.b $vr8, 0 - st.d $a2, $sp, 48 # 8-byte Folded Spill - vst $vr8, $sp, 16 # 16-byte Folded Spill + st.d $a2, $sp, 32 # 8-byte Folded Spill b .LBB3_5 .p2align 4, , 16 .LBB3_2: # in Loop: Header=BB3_5 Depth=1 @@ -717,10 +715,10 @@ expand_pre: # @expand_pre beqz $a0, .LBB3_14 # %bb.13: # in Loop: Header=BB3_5 Depth=1 ld.d $a1, $s6, 8 - addi.d $a0, $sp, 64 + addi.d $a0, $sp, 48 pcaddu18i $ra, %call36(strcpy) jirl $ra, $ra, 0 - addi.d $a0, $sp, 64 + addi.d $a0, $sp, 48 add.d $s5, $a0, $s8 .LBB3_14: # in Loop: Header=BB3_5 Depth=1 add.w $fp, $s0, $s8 @@ -736,7 +734,6 @@ expand_pre: # @expand_pre # %bb.15: # %.preheader76.i.preheader # in Loop: Header=BB3_5 Depth=1 move $a1, $s2 - vld $vr8, $sp, 16 # 16-byte Folded Reload .p2align 4, , 16 .LBB3_16: # %.preheader76.i # Parent Loop BB3_5 Depth=1 @@ -772,14 +769,14 @@ expand_pre: # @expand_pre addi.w $a3, $a1, -2 bstrpick.d $a1, $a3, 31, 0 addi.d $a1, $a1, 1 - addi.d $a2, $sp, 65 + addi.d $a2, $sp, 49 ori $a4, $zero, 15 bltu $a3, $a4, .LBB3_26 # %bb.23: # %vector.ph49 # in Loop: Header=BB3_5 Depth=1 bstrpick.d $a2, $a1, 32, 4 slli.d $a3, $a2, 4 - addi.d $a4, $sp, 65 + addi.d $a4, $sp, 49 alsl.d $a2, $a2, $a4, 4 move $a5, $a3 .p2align 4, , 16 @@ -787,36 +784,53 @@ expand_pre: # @expand_pre # Parent Loop BB3_5 Depth=1 # => This Inner Loop Header: Depth=2 vld $vr0, $a4, 0 - vilvh.b $vr1, $vr8, $vr0 - vilvh.h $vr2, $vr8, $vr1 - vilvh.w $vr3, $vr8, $vr2 - vilvl.w $vr2, $vr8, $vr2 - vilvl.h $vr1, $vr8, $vr1 - vilvh.w $vr4, $vr8, $vr1 - vilvl.w $vr1, $vr8, $vr1 - vilvl.b $vr0, $vr8, $vr0 - vilvh.h $vr5, $vr8, $vr0 - vilvh.w $vr6, $vr8, $vr5 - vilvl.w $vr5, $vr8, $vr5 - vilvl.h $vr0, $vr8, $vr0 - vilvh.w $vr7, $vr8, $vr0 - vilvl.w $vr0, $vr8, $vr0 + vbsrl.v $vr1, $vr0, 14 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vsllwil.du.wu $vr1, $vr1, 0 + vbsrl.v $vr2, $vr0, 12 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.du.wu $vr2, $vr2, 0 + vbsrl.v $vr3, $vr0, 10 + vsllwil.hu.bu $vr3, $vr3, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vsllwil.du.wu $vr3, $vr3, 0 + vbsrl.v $vr4, $vr0, 8 + vsllwil.hu.bu $vr4, $vr4, 
0 + vsllwil.wu.hu $vr4, $vr4, 0 + vsllwil.du.wu $vr4, $vr4, 0 + vsrli.d $vr5, $vr0, 48 + vsllwil.hu.bu $vr5, $vr5, 0 + vsllwil.wu.hu $vr5, $vr5, 0 + vsllwil.du.wu $vr5, $vr5, 0 + vsrli.d $vr6, $vr0, 32 + vsllwil.hu.bu $vr6, $vr6, 0 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.du.wu $vr6, $vr6, 0 + vshuf4i.b $vr7, $vr0, 14 + vsllwil.hu.bu $vr7, $vr7, 0 + vsllwil.wu.hu $vr7, $vr7, 0 + vsllwil.du.wu $vr7, $vr7, 0 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.du.wu $vr0, $vr0, 0 vpickve2gr.d $a6, $vr0, 0 vpickve2gr.d $a7, $vr0, 1 vpickve2gr.d $t0, $vr7, 0 vpickve2gr.d $t1, $vr7, 1 - vpickve2gr.d $t2, $vr5, 0 - vpickve2gr.d $t3, $vr5, 1 - vpickve2gr.d $t4, $vr6, 0 - vpickve2gr.d $t5, $vr6, 1 - vpickve2gr.d $t6, $vr1, 0 - vpickve2gr.d $t7, $vr1, 1 - vpickve2gr.d $t8, $vr4, 0 - vpickve2gr.d $s0, $vr4, 1 + vpickve2gr.d $t2, $vr6, 0 + vpickve2gr.d $t3, $vr6, 1 + vpickve2gr.d $t4, $vr5, 0 + vpickve2gr.d $t5, $vr5, 1 + vpickve2gr.d $t6, $vr4, 0 + vpickve2gr.d $t7, $vr4, 1 + vpickve2gr.d $t8, $vr3, 0 + vpickve2gr.d $s0, $vr3, 1 vpickve2gr.d $s1, $vr2, 0 vpickve2gr.d $s5, $vr2, 1 - vpickve2gr.d $s8, $vr3, 0 - vpickve2gr.d $ra, $vr3, 1 + vpickve2gr.d $s8, $vr1, 0 + vpickve2gr.d $ra, $vr1, 1 ldx.b $a6, $a0, $a6 ldx.b $a7, $a0, $a7 ldx.b $t0, $a0, $t0 @@ -858,7 +872,7 @@ expand_pre: # @expand_pre beq $a1, $a3, .LBB3_45 .LBB3_26: # %.lr.ph.i64.i.preheader # in Loop: Header=BB3_5 Depth=1 - addi.d $a0, $sp, 64 + addi.d $a0, $sp, 48 add.d $a0, $a1, $a0 addi.d $a1, $a2, -1 .p2align 4, , 16 @@ -875,14 +889,13 @@ expand_pre: # @expand_pre b .LBB3_45 .LBB3_28: # in Loop: Header=BB3_5 Depth=1 move $fp, $zero - ld.d $s5, $sp, 48 # 8-byte Folded Reload + ld.d $s5, $sp, 32 # 8-byte Folded Reload ori $s8, $zero, 1 b .LBB3_3 .LBB3_29: # in Loop: Header=BB3_5 Depth=1 ld.bu $a1, $s5, 0 add.d $a1, $s7, $a1 ld.bu $a1, $a1, 1210 - vld $vr8, $sp, 16 # 16-byte Folded Reload bnez $a1, .LBB3_45 # %bb.30: # in Loop: Header=BB3_5 Depth=1 ld.h $a1, $s6, 20 @@ -890,9 +903,9 @@ expand_pre: # @expand_pre # %bb.31: # %.lr.ph.i68.preheader.i # in Loop: Header=BB3_5 Depth=1 bstrpick.d $a1, $a1, 15, 0 - addi.d $a5, $sp, 64 + addi.d $a5, $sp, 48 bstrpick.d $a4, $a1, 15, 0 - addi.d $a3, $sp, 64 + addi.d $a3, $sp, 48 ori $a2, $zero, 16 bltu $a4, $a2, .LBB3_35 # %bb.32: # %vector.ph @@ -906,36 +919,53 @@ expand_pre: # @expand_pre # Parent Loop BB3_5 Depth=1 # => This Inner Loop Header: Depth=2 vld $vr0, $a5, 0 - vilvh.b $vr1, $vr8, $vr0 - vilvh.h $vr2, $vr8, $vr1 - vilvh.w $vr3, $vr8, $vr2 - vilvl.w $vr2, $vr8, $vr2 - vilvl.h $vr1, $vr8, $vr1 - vilvh.w $vr4, $vr8, $vr1 - vilvl.w $vr1, $vr8, $vr1 - vilvl.b $vr0, $vr8, $vr0 - vilvh.h $vr5, $vr8, $vr0 - vilvh.w $vr6, $vr8, $vr5 - vilvl.w $vr5, $vr8, $vr5 - vilvl.h $vr0, $vr8, $vr0 - vilvh.w $vr7, $vr8, $vr0 - vilvl.w $vr0, $vr8, $vr0 + vbsrl.v $vr1, $vr0, 14 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vsllwil.du.wu $vr1, $vr1, 0 + vbsrl.v $vr2, $vr0, 12 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.du.wu $vr2, $vr2, 0 + vbsrl.v $vr3, $vr0, 10 + vsllwil.hu.bu $vr3, $vr3, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vsllwil.du.wu $vr3, $vr3, 0 + vbsrl.v $vr4, $vr0, 8 + vsllwil.hu.bu $vr4, $vr4, 0 + vsllwil.wu.hu $vr4, $vr4, 0 + vsllwil.du.wu $vr4, $vr4, 0 + vsrli.d $vr5, $vr0, 48 + vsllwil.hu.bu $vr5, $vr5, 0 + vsllwil.wu.hu $vr5, $vr5, 0 + vsllwil.du.wu $vr5, $vr5, 0 + vsrli.d $vr6, $vr0, 32 + vsllwil.hu.bu $vr6, $vr6, 0 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.du.wu $vr6, $vr6, 0 + vshuf4i.b $vr7, $vr0, 14 + vsllwil.hu.bu $vr7, $vr7, 0 + 
vsllwil.wu.hu $vr7, $vr7, 0 + vsllwil.du.wu $vr7, $vr7, 0 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.du.wu $vr0, $vr0, 0 vpickve2gr.d $a7, $vr0, 0 vpickve2gr.d $t0, $vr0, 1 vpickve2gr.d $t1, $vr7, 0 vpickve2gr.d $t2, $vr7, 1 - vpickve2gr.d $t3, $vr5, 0 - vpickve2gr.d $t4, $vr5, 1 - vpickve2gr.d $t5, $vr6, 0 - vpickve2gr.d $t6, $vr6, 1 - vpickve2gr.d $t7, $vr1, 0 - vpickve2gr.d $t8, $vr1, 1 - vpickve2gr.d $s0, $vr4, 0 - vpickve2gr.d $s1, $vr4, 1 + vpickve2gr.d $t3, $vr6, 0 + vpickve2gr.d $t4, $vr6, 1 + vpickve2gr.d $t5, $vr5, 0 + vpickve2gr.d $t6, $vr5, 1 + vpickve2gr.d $t7, $vr4, 0 + vpickve2gr.d $t8, $vr4, 1 + vpickve2gr.d $s0, $vr3, 0 + vpickve2gr.d $s1, $vr3, 1 vpickve2gr.d $s5, $vr2, 0 vpickve2gr.d $s8, $vr2, 1 - vpickve2gr.d $ra, $vr3, 0 - vpickve2gr.d $a2, $vr3, 1 + vpickve2gr.d $ra, $vr1, 0 + vpickve2gr.d $a2, $vr1, 1 ldx.b $a7, $a0, $a7 ldx.b $t0, $a0, $t0 ldx.b $t1, $a0, $t1 @@ -974,7 +1004,7 @@ expand_pre: # @expand_pre bnez $a6, .LBB3_33 # %bb.34: # %middle.block # in Loop: Header=BB3_5 Depth=1 - addi.d $a5, $sp, 64 + addi.d $a5, $sp, 48 beq $a4, $a1, .LBB3_45 .LBB3_35: # %.lr.ph.i68.i.preheader # in Loop: Header=BB3_5 Depth=1 @@ -997,7 +1027,7 @@ expand_pre: # @expand_pre ld.h $a1, $s6, 20 blez $a1, .LBB3_45 # %bb.38: # in Loop: Header=BB3_5 Depth=1 - addi.d $a2, $sp, 64 + addi.d $a2, $sp, 48 ldx.bu $a2, $a1, $a2 add.d $a2, $s7, $a2 ld.bu $a2, $a2, 1210 @@ -1005,9 +1035,9 @@ expand_pre: # @expand_pre # %bb.39: # %.lr.ph.i.preheader.i # in Loop: Header=BB3_5 Depth=1 bstrpick.d $a1, $a1, 15, 0 - addi.d $a5, $sp, 64 + addi.d $a5, $sp, 48 bstrpick.d $a4, $a1, 15, 0 - addi.d $a3, $sp, 64 + addi.d $a3, $sp, 48 ori $a2, $zero, 16 bltu $a4, $a2, .LBB3_43 # %bb.40: # %vector.ph62 @@ -1021,36 +1051,53 @@ expand_pre: # @expand_pre # Parent Loop BB3_5 Depth=1 # => This Inner Loop Header: Depth=2 vld $vr0, $a5, 0 - vilvh.b $vr1, $vr8, $vr0 - vilvh.h $vr2, $vr8, $vr1 - vilvh.w $vr3, $vr8, $vr2 - vilvl.w $vr2, $vr8, $vr2 - vilvl.h $vr1, $vr8, $vr1 - vilvh.w $vr4, $vr8, $vr1 - vilvl.w $vr1, $vr8, $vr1 - vilvl.b $vr0, $vr8, $vr0 - vilvh.h $vr5, $vr8, $vr0 - vilvh.w $vr6, $vr8, $vr5 - vilvl.w $vr5, $vr8, $vr5 - vilvl.h $vr0, $vr8, $vr0 - vilvh.w $vr7, $vr8, $vr0 - vilvl.w $vr0, $vr8, $vr0 + vbsrl.v $vr1, $vr0, 14 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vsllwil.du.wu $vr1, $vr1, 0 + vbsrl.v $vr2, $vr0, 12 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.du.wu $vr2, $vr2, 0 + vbsrl.v $vr3, $vr0, 10 + vsllwil.hu.bu $vr3, $vr3, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vsllwil.du.wu $vr3, $vr3, 0 + vbsrl.v $vr4, $vr0, 8 + vsllwil.hu.bu $vr4, $vr4, 0 + vsllwil.wu.hu $vr4, $vr4, 0 + vsllwil.du.wu $vr4, $vr4, 0 + vsrli.d $vr5, $vr0, 48 + vsllwil.hu.bu $vr5, $vr5, 0 + vsllwil.wu.hu $vr5, $vr5, 0 + vsllwil.du.wu $vr5, $vr5, 0 + vsrli.d $vr6, $vr0, 32 + vsllwil.hu.bu $vr6, $vr6, 0 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.du.wu $vr6, $vr6, 0 + vshuf4i.b $vr7, $vr0, 14 + vsllwil.hu.bu $vr7, $vr7, 0 + vsllwil.wu.hu $vr7, $vr7, 0 + vsllwil.du.wu $vr7, $vr7, 0 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.du.wu $vr0, $vr0, 0 vpickve2gr.d $a7, $vr0, 0 vpickve2gr.d $t0, $vr0, 1 vpickve2gr.d $t1, $vr7, 0 vpickve2gr.d $t2, $vr7, 1 - vpickve2gr.d $t3, $vr5, 0 - vpickve2gr.d $t4, $vr5, 1 - vpickve2gr.d $t5, $vr6, 0 - vpickve2gr.d $t6, $vr6, 1 - vpickve2gr.d $t7, $vr1, 0 - vpickve2gr.d $t8, $vr1, 1 - vpickve2gr.d $s0, $vr4, 0 - vpickve2gr.d $s1, $vr4, 1 + vpickve2gr.d $t3, $vr6, 0 + vpickve2gr.d $t4, $vr6, 1 + vpickve2gr.d $t5, $vr5, 
0 + vpickve2gr.d $t6, $vr5, 1 + vpickve2gr.d $t7, $vr4, 0 + vpickve2gr.d $t8, $vr4, 1 + vpickve2gr.d $s0, $vr3, 0 + vpickve2gr.d $s1, $vr3, 1 vpickve2gr.d $s5, $vr2, 0 vpickve2gr.d $s8, $vr2, 1 - vpickve2gr.d $ra, $vr3, 0 - vpickve2gr.d $a2, $vr3, 1 + vpickve2gr.d $ra, $vr1, 0 + vpickve2gr.d $a2, $vr1, 1 ldx.b $a7, $a0, $a7 ldx.b $t0, $a0, $t0 ldx.b $t1, $a0, $t1 @@ -1089,7 +1136,7 @@ expand_pre: # @expand_pre bnez $a6, .LBB3_41 # %bb.42: # %middle.block70 # in Loop: Header=BB3_5 Depth=1 - addi.d $a5, $sp, 64 + addi.d $a5, $sp, 48 beq $a4, $a1, .LBB3_45 .LBB3_43: # %.lr.ph.i.i.preheader # in Loop: Header=BB3_5 Depth=1 @@ -1109,31 +1156,31 @@ expand_pre: # @expand_pre bne $a3, $a0, .LBB3_44 .LBB3_45: # %forcelc.exit.i # in Loop: Header=BB3_5 Depth=1 - ld.d $a0, $sp, 56 # 8-byte Folded Reload - ld.d $s5, $sp, 48 # 8-byte Folded Reload + ld.d $a0, $sp, 40 # 8-byte Folded Reload + ld.d $s5, $sp, 32 # 8-byte Folded Reload ori $s8, $zero, 1 ori $a1, $zero, 4 beq $a0, $a1, .LBB3_49 # %bb.46: # %forcelc.exit.i # in Loop: Header=BB3_5 Depth=1 - ld.d $a0, $sp, 56 # 8-byte Folded Reload + ld.d $a0, $sp, 40 # 8-byte Folded Reload ori $a1, $zero, 3 bne $a0, $a1, .LBB3_48 # %bb.47: # in Loop: Header=BB3_5 Depth=1 pcalau12i $a0, %pc_hi20(.L.str) addi.d $a0, $a0, %pc_lo12(.L.str) - ld.d $a1, $sp, 8 # 8-byte Folded Reload + ld.d $a1, $sp, 16 # 8-byte Folded Reload pcaddu18i $ra, %call36(printf) jirl $ra, $ra, 0 .LBB3_48: # in Loop: Header=BB3_5 Depth=1 - addi.d $a0, $sp, 64 + addi.d $a0, $sp, 48 ori $a1, $zero, 1 pcaddu18i $ra, %call36(ichartosstr) jirl $ra, $ra, 0 move $a1, $a0 pcalau12i $a0, %pc_hi20(.L.str.1) addi.d $a0, $a0, %pc_lo12(.L.str.1) - ld.d $a2, $sp, 40 # 8-byte Folded Reload + ld.d $a2, $sp, 24 # 8-byte Folded Reload pcaddu18i $ra, %call36(printf) jirl $ra, $ra, 0 .LBB3_49: # in Loop: Header=BB3_5 Depth=1 @@ -1141,12 +1188,12 @@ expand_pre: # @expand_pre andi $a0, $a0, 1 beqz $a0, .LBB3_3 # %bb.50: # in Loop: Header=BB3_5 Depth=1 - addi.d $a1, $sp, 64 + addi.d $a1, $sp, 48 ori $a3, $zero, 1 - ld.d $a0, $sp, 8 # 8-byte Folded Reload + ld.d $a0, $sp, 16 # 8-byte Folded Reload move $a2, $s5 - ld.d $a4, $sp, 56 # 8-byte Folded Reload - ld.d $a5, $sp, 40 # 8-byte Folded Reload + ld.d $a4, $sp, 40 # 8-byte Folded Reload + ld.d $a5, $sp, 24 # 8-byte Folded Reload pcaddu18i $ra, %call36(expand_suf) jirl $ra, $ra, 0 add.d $fp, $a0, $fp @@ -1155,18 +1202,18 @@ expand_pre: # @expand_pre move $s4, $zero .LBB3_52: # %._crit_edge move $a0, $s4 - ld.d $s8, $sp, 184 # 8-byte Folded Reload - ld.d $s7, $sp, 192 # 8-byte Folded Reload - ld.d $s6, $sp, 200 # 8-byte Folded Reload - ld.d $s5, $sp, 208 # 8-byte Folded Reload - ld.d $s4, $sp, 216 # 8-byte Folded Reload - ld.d $s3, $sp, 224 # 8-byte Folded Reload - ld.d $s2, $sp, 232 # 8-byte Folded Reload - ld.d $s1, $sp, 240 # 8-byte Folded Reload - ld.d $s0, $sp, 248 # 8-byte Folded Reload - ld.d $fp, $sp, 256 # 8-byte Folded Reload - ld.d $ra, $sp, 264 # 8-byte Folded Reload - addi.d $sp, $sp, 272 + ld.d $s8, $sp, 168 # 8-byte Folded Reload + ld.d $s7, $sp, 176 # 8-byte Folded Reload + ld.d $s6, $sp, 184 # 8-byte Folded Reload + ld.d $s5, $sp, 192 # 8-byte Folded Reload + ld.d $s4, $sp, 200 # 8-byte Folded Reload + ld.d $s3, $sp, 208 # 8-byte Folded Reload + ld.d $s2, $sp, 216 # 8-byte Folded Reload + ld.d $s1, $sp, 224 # 8-byte Folded Reload + ld.d $s0, $sp, 232 # 8-byte Folded Reload + ld.d $fp, $sp, 240 # 8-byte Folded Reload + ld.d $ra, $sp, 248 # 8-byte Folded Reload + addi.d $sp, $sp, 256 ret .Lfunc_end3: .size expand_pre, .Lfunc_end3-expand_pre @@ 
-1176,19 +1223,19 @@ expand_pre: # @expand_pre .type expand_suf,@function expand_suf: # @expand_suf # %bb.0: - addi.d $sp, $sp, -304 - st.d $ra, $sp, 296 # 8-byte Folded Spill - st.d $fp, $sp, 288 # 8-byte Folded Spill - st.d $s0, $sp, 280 # 8-byte Folded Spill - st.d $s1, $sp, 272 # 8-byte Folded Spill - st.d $s2, $sp, 264 # 8-byte Folded Spill - st.d $s3, $sp, 256 # 8-byte Folded Spill - st.d $s4, $sp, 248 # 8-byte Folded Spill - st.d $s5, $sp, 240 # 8-byte Folded Spill - st.d $s6, $sp, 232 # 8-byte Folded Spill - st.d $s7, $sp, 224 # 8-byte Folded Spill - st.d $s8, $sp, 216 # 8-byte Folded Spill - st.d $a0, $sp, 56 # 8-byte Folded Spill + addi.d $sp, $sp, -288 + st.d $ra, $sp, 280 # 8-byte Folded Spill + st.d $fp, $sp, 272 # 8-byte Folded Spill + st.d $s0, $sp, 264 # 8-byte Folded Spill + st.d $s1, $sp, 256 # 8-byte Folded Spill + st.d $s2, $sp, 248 # 8-byte Folded Spill + st.d $s3, $sp, 240 # 8-byte Folded Spill + st.d $s4, $sp, 232 # 8-byte Folded Spill + st.d $s5, $sp, 224 # 8-byte Folded Spill + st.d $s6, $sp, 216 # 8-byte Folded Spill + st.d $s7, $sp, 208 # 8-byte Folded Spill + st.d $s8, $sp, 200 # 8-byte Folded Spill + st.d $a0, $sp, 40 # 8-byte Folded Spill pcalau12i $a0, %got_pc_hi20(numsflags) ld.d $a0, $a0, %got_pc_lo12(numsflags) ld.w $a0, $a0, 0 @@ -1205,10 +1252,8 @@ expand_suf: # @expand_suf pcalau12i $a1, %got_pc_hi20(hashheader) ld.d $s3, $a1, %got_pc_lo12(hashheader) move $s4, $zero - vrepli.b $vr8, 0 - st.d $a4, $sp, 72 # 8-byte Folded Spill - st.d $s0, $sp, 88 # 8-byte Folded Spill - vst $vr8, $sp, 32 # 16-byte Folded Spill + st.d $a4, $sp, 56 # 8-byte Folded Spill + st.d $s0, $sp, 72 # 8-byte Folded Spill b .LBB4_5 .p2align 4, , 16 .LBB4_2: # in Loop: Header=BB4_5 Depth=1 @@ -1270,27 +1315,26 @@ expand_suf: # @expand_suf move $a0, $zero b .LBB4_37 .LBB4_14: # in Loop: Header=BB4_5 Depth=1 - st.d $a0, $sp, 80 # 8-byte Folded Spill - addi.d $a0, $sp, 96 + st.d $a0, $sp, 64 # 8-byte Folded Spill + addi.d $a0, $sp, 80 move $a1, $s2 pcaddu18i $ra, %call36(strcpy) jirl $ra, $ra, 0 ld.hu $t6, $s6, 20 - addi.d $a0, $sp, 96 + addi.d $a0, $sp, 80 add.d $a0, $a0, $s7 sub.d $s5, $a0, $s0 beqz $t6, .LBB4_19 # %bb.15: # in Loop: Header=BB4_5 Depth=1 ld.d $a1, $s6, 8 move $a0, $s5 - st.d $t6, $sp, 64 # 8-byte Folded Spill + st.d $t6, $sp, 48 # 8-byte Folded Spill pcaddu18i $ra, %call36(strcpy) jirl $ra, $ra, 0 - ld.d $t6, $sp, 64 # 8-byte Folded Reload + ld.d $t6, $sp, 48 # 8-byte Folded Reload ext.w.h $a1, $t6 - ld.d $t7, $sp, 80 # 8-byte Folded Reload + ld.d $t7, $sp, 64 # 8-byte Folded Reload ori $t5, $zero, 1 - vld $vr8, $sp, 32 # 16-byte Folded Reload blez $a1, .LBB4_32 # %bb.16: # in Loop: Header=BB4_5 Depth=1 ld.bu $a1, $s5, -1 @@ -1306,7 +1350,7 @@ expand_suf: # @expand_suf b .LBB4_30 .LBB4_19: # in Loop: Header=BB4_5 Depth=1 st.b $zero, $s5, 0 - ld.d $t7, $sp, 80 # 8-byte Folded Reload + ld.d $t7, $sp, 64 # 8-byte Folded Reload ori $t5, $zero, 1 b .LBB4_32 .LBB4_20: # %vector.main.loop.iter.check @@ -1325,7 +1369,7 @@ expand_suf: # @expand_suf sub.d $a5, $a2, $a4 add.d $a0, $a2, $s7 sub.d $a0, $a0, $s0 - addi.d $a2, $sp, 96 + addi.d $a2, $sp, 80 add.d $a2, $a2, $a0 .p2align 4, , 16 .LBB4_23: # %vec.epilog.vector.body @@ -1366,48 +1410,65 @@ expand_suf: # @expand_suf b .LBB4_32 .LBB4_25: # %vector.ph # in Loop: Header=BB4_5 Depth=1 - st.d $s8, $sp, 24 # 8-byte Folded Spill + st.d $s8, $sp, 32 # 8-byte Folded Spill bstrpick.d $a2, $t6, 14, 4 slli.d $a5, $a2, 4 andi $a0, $t6, 8 - st.d $a0, $sp, 8 # 8-byte Folded Spill + st.d $a0, $sp, 16 # 8-byte Folded Spill move $a4, $s5 - 
st.d $a5, $sp, 16 # 8-byte Folded Spill + st.d $a5, $sp, 24 # 8-byte Folded Spill .p2align 4, , 16 .LBB4_26: # %vector.body # Parent Loop BB4_5 Depth=1 # => This Inner Loop Header: Depth=2 vld $vr0, $a4, 0 - vilvh.b $vr1, $vr8, $vr0 - vilvh.h $vr2, $vr8, $vr1 - vilvh.w $vr3, $vr8, $vr2 - vilvl.w $vr2, $vr8, $vr2 - vilvl.h $vr1, $vr8, $vr1 - vilvh.w $vr4, $vr8, $vr1 - vilvl.w $vr1, $vr8, $vr1 - vilvl.b $vr0, $vr8, $vr0 - vilvh.h $vr5, $vr8, $vr0 - vilvh.w $vr6, $vr8, $vr5 - vilvl.w $vr5, $vr8, $vr5 - vilvl.h $vr0, $vr8, $vr0 - vilvh.w $vr7, $vr8, $vr0 - vilvl.w $vr0, $vr8, $vr0 + vbsrl.v $vr1, $vr0, 14 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vsllwil.du.wu $vr1, $vr1, 0 + vbsrl.v $vr2, $vr0, 12 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vsllwil.du.wu $vr2, $vr2, 0 + vbsrl.v $vr3, $vr0, 10 + vsllwil.hu.bu $vr3, $vr3, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vsllwil.du.wu $vr3, $vr3, 0 + vbsrl.v $vr4, $vr0, 8 + vsllwil.hu.bu $vr4, $vr4, 0 + vsllwil.wu.hu $vr4, $vr4, 0 + vsllwil.du.wu $vr4, $vr4, 0 + vsrli.d $vr5, $vr0, 48 + vsllwil.hu.bu $vr5, $vr5, 0 + vsllwil.wu.hu $vr5, $vr5, 0 + vsllwil.du.wu $vr5, $vr5, 0 + vsrli.d $vr6, $vr0, 32 + vsllwil.hu.bu $vr6, $vr6, 0 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.du.wu $vr6, $vr6, 0 + vshuf4i.b $vr7, $vr0, 14 + vsllwil.hu.bu $vr7, $vr7, 0 + vsllwil.wu.hu $vr7, $vr7, 0 + vsllwil.du.wu $vr7, $vr7, 0 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.du.wu $vr0, $vr0, 0 vpickve2gr.d $a6, $vr0, 0 vpickve2gr.d $a7, $vr0, 1 vpickve2gr.d $t0, $vr7, 0 vpickve2gr.d $t1, $vr7, 1 - vpickve2gr.d $t2, $vr5, 0 - vpickve2gr.d $t3, $vr5, 1 - vpickve2gr.d $t4, $vr6, 0 - vpickve2gr.d $t5, $vr6, 1 - vpickve2gr.d $t6, $vr1, 0 - vpickve2gr.d $t7, $vr1, 1 - vpickve2gr.d $t8, $vr4, 0 - vpickve2gr.d $ra, $vr4, 1 + vpickve2gr.d $t2, $vr6, 0 + vpickve2gr.d $t3, $vr6, 1 + vpickve2gr.d $t4, $vr5, 0 + vpickve2gr.d $t5, $vr5, 1 + vpickve2gr.d $t6, $vr4, 0 + vpickve2gr.d $t7, $vr4, 1 + vpickve2gr.d $t8, $vr3, 0 + vpickve2gr.d $ra, $vr3, 1 vpickve2gr.d $a3, $vr2, 0 vpickve2gr.d $a2, $vr2, 1 - vpickve2gr.d $a0, $vr3, 0 - vpickve2gr.d $s8, $vr3, 1 + vpickve2gr.d $a0, $vr1, 0 + vpickve2gr.d $s8, $vr1, 1 ldx.b $a6, $a1, $a6 ldx.b $a7, $a1, $a7 ldx.b $t0, $a1, $t0 @@ -1446,15 +1507,15 @@ expand_suf: # @expand_suf bnez $a5, .LBB4_26 # %bb.27: # %middle.block # in Loop: Header=BB4_5 Depth=1 - ld.d $s8, $sp, 24 # 8-byte Folded Reload + ld.d $s8, $sp, 32 # 8-byte Folded Reload ori $t5, $zero, 1 - ld.d $t6, $sp, 64 # 8-byte Folded Reload - ld.d $t7, $sp, 80 # 8-byte Folded Reload - ld.d $a2, $sp, 16 # 8-byte Folded Reload + ld.d $t6, $sp, 48 # 8-byte Folded Reload + ld.d $t7, $sp, 64 # 8-byte Folded Reload + ld.d $a2, $sp, 24 # 8-byte Folded Reload beq $a2, $t6, .LBB4_32 # %bb.28: # %vec.epilog.iter.check # in Loop: Header=BB4_5 Depth=1 - ld.d $a0, $sp, 8 # 8-byte Folded Reload + ld.d $a0, $sp, 16 # 8-byte Folded Reload bnez $a0, .LBB4_22 # %bb.29: # in Loop: Header=BB4_5 Depth=1 sub.d $a3, $t6, $a2 @@ -1475,7 +1536,7 @@ expand_suf: # @expand_suf bltu $t5, $a1, .LBB4_31 .LBB4_32: # %forcelc.exit.i # in Loop: Header=BB4_5 Depth=1 - ld.d $s5, $sp, 72 # 8-byte Folded Reload + ld.d $s5, $sp, 56 # 8-byte Folded Reload ori $a0, $zero, 4 beq $s5, $a0, .LBB4_36 # %bb.33: # %forcelc.exit.i @@ -1485,11 +1546,11 @@ expand_suf: # @expand_suf # %bb.34: # in Loop: Header=BB4_5 Depth=1 pcalau12i $a0, %pc_hi20(.L.str) addi.d $a0, $a0, %pc_lo12(.L.str) - ld.d $a1, $sp, 56 # 8-byte Folded Reload + ld.d $a1, $sp, 40 # 8-byte Folded Reload pcaddu18i $ra, 
%call36(printf) jirl $ra, $ra, 0 .LBB4_35: # in Loop: Header=BB4_5 Depth=1 - addi.d $a0, $sp, 96 + addi.d $a0, $sp, 80 ori $a1, $zero, 1 pcaddu18i $ra, %call36(ichartosstr) jirl $ra, $ra, 0 @@ -1501,7 +1562,7 @@ expand_suf: # @expand_suf jirl $ra, $ra, 0 ld.hu $t6, $s6, 20 ld.h $s0, $s6, 18 - ld.d $t7, $sp, 80 # 8-byte Folded Reload + ld.d $t7, $sp, 64 # 8-byte Folded Reload ori $t5, $zero, 1 .LBB4_36: # in Loop: Header=BB4_5 Depth=1 ext.w.h $a0, $t6 @@ -1509,24 +1570,24 @@ expand_suf: # @expand_suf add.d $a0, $a1, $a0 .LBB4_37: # %pr_suf_expansion.exit # in Loop: Header=BB4_5 Depth=1 - ld.d $s0, $sp, 88 # 8-byte Folded Reload + ld.d $s0, $sp, 72 # 8-byte Folded Reload b .LBB4_3 .LBB4_38: move $s4, $zero .LBB4_39: # %._crit_edge move $a0, $s4 - ld.d $s8, $sp, 216 # 8-byte Folded Reload - ld.d $s7, $sp, 224 # 8-byte Folded Reload - ld.d $s6, $sp, 232 # 8-byte Folded Reload - ld.d $s5, $sp, 240 # 8-byte Folded Reload - ld.d $s4, $sp, 248 # 8-byte Folded Reload - ld.d $s3, $sp, 256 # 8-byte Folded Reload - ld.d $s2, $sp, 264 # 8-byte Folded Reload - ld.d $s1, $sp, 272 # 8-byte Folded Reload - ld.d $s0, $sp, 280 # 8-byte Folded Reload - ld.d $fp, $sp, 288 # 8-byte Folded Reload - ld.d $ra, $sp, 296 # 8-byte Folded Reload - addi.d $sp, $sp, 304 + ld.d $s8, $sp, 200 # 8-byte Folded Reload + ld.d $s7, $sp, 208 # 8-byte Folded Reload + ld.d $s6, $sp, 216 # 8-byte Folded Reload + ld.d $s5, $sp, 224 # 8-byte Folded Reload + ld.d $s4, $sp, 232 # 8-byte Folded Reload + ld.d $s3, $sp, 240 # 8-byte Folded Reload + ld.d $s2, $sp, 248 # 8-byte Folded Reload + ld.d $s1, $sp, 256 # 8-byte Folded Reload + ld.d $s0, $sp, 264 # 8-byte Folded Reload + ld.d $fp, $sp, 272 # 8-byte Folded Reload + ld.d $ra, $sp, 280 # 8-byte Folded Reload + addi.d $sp, $sp, 288 ret .Lfunc_end4: .size expand_suf, .Lfunc_end4-expand_suf diff --git a/results/MultiSource/Benchmarks/MiBench/office-stringsearch/CMakeFiles/office-stringsearch.dir/bmhisrch.s b/results/MultiSource/Benchmarks/MiBench/office-stringsearch/CMakeFiles/office-stringsearch.dir/bmhisrch.s index 736fa9ba..f3e6aa9f 100644 --- a/results/MultiSource/Benchmarks/MiBench/office-stringsearch/CMakeFiles/office-stringsearch.dir/bmhisrch.s +++ b/results/MultiSource/Benchmarks/MiBench/office-stringsearch/CMakeFiles/office-stringsearch.dir/bmhisrch.s @@ -190,14 +190,12 @@ bmhi_init: # @bmhi_init ld.w $t1, $a7, 0 vinsgr2vr.w $vr1, $t1, 0 vseq.b $vr1, $vr1, $vr0 - vilvl.b $vr1, $vr1, $vr1 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 24 + vsllwil.h.b $vr1, $vr1, 0 + vsllwil.w.h $vr1, $vr1, 0 vmskltz.w $vr2, $vr1 vpickve2gr.hu $t1, $vr2, 0 beqz $t1, .LBB0_11 # %bb.13: # in Loop: Header=BB0_12 Depth=1 - vsrai.w $vr1, $vr1, 24 vpickve2gr.w $t1, $vr1, 3 andi $t1, $t1, 1 vpickve2gr.w $t2, $vr1, 2 diff --git a/results/MultiSource/Benchmarks/MiBench/office-stringsearch/CMakeFiles/office-stringsearch.dir/bmhsrch.s b/results/MultiSource/Benchmarks/MiBench/office-stringsearch/CMakeFiles/office-stringsearch.dir/bmhsrch.s index 8c8174de..5e21d569 100644 --- a/results/MultiSource/Benchmarks/MiBench/office-stringsearch/CMakeFiles/office-stringsearch.dir/bmhsrch.s +++ b/results/MultiSource/Benchmarks/MiBench/office-stringsearch/CMakeFiles/office-stringsearch.dir/bmhsrch.s @@ -100,37 +100,39 @@ bmh_init: # @bmh_init move $a4, $zero b .LBB0_6 .LBB0_3: # %vector.ph30 + pcalau12i $a4, %pc_hi20(.LCPI0_0) + vld $vr1, $a4, %pc_lo12(.LCPI0_0) bstrpick.d $a4, $a0, 30, 2 - pcalau12i $a5, %pc_hi20(.LCPI0_0) - vld $vr1, $a5, %pc_lo12(.LCPI0_0) slli.d $a4, $a4, 2 vrepli.b $vr2, -1 - vrepli.b $vr3, 0 
move $a5, $fp move $a6, $a4 .p2align 4, , 16 .LBB0_4: # %vector.body33 # =>This Inner Loop Header: Depth=1 ld.w $a7, $a5, 0 - vxor.v $vr4, $vr1, $vr2 - vadd.w $vr4, $vr0, $vr4 - vinsgr2vr.w $vr5, $a7, 0 - vilvl.b $vr5, $vr3, $vr5 - vilvl.h $vr5, $vr3, $vr5 - vilvh.w $vr6, $vr3, $vr5 - vilvl.w $vr5, $vr3, $vr5 - vpickve2gr.d $a7, $vr5, 0 + vxor.v $vr3, $vr1, $vr2 + vadd.w $vr3, $vr0, $vr3 + vinsgr2vr.w $vr4, $a7, 0 + vshuf4i.b $vr5, $vr4, 14 + vsllwil.hu.bu $vr5, $vr5, 0 + vsllwil.wu.hu $vr5, $vr5, 0 + vsllwil.du.wu $vr5, $vr5, 0 + vsllwil.hu.bu $vr4, $vr4, 0 + vsllwil.wu.hu $vr4, $vr4, 0 + vsllwil.du.wu $vr4, $vr4, 0 + vpickve2gr.d $a7, $vr4, 0 alsl.d $a7, $a7, $a2, 2 - vpickve2gr.d $t0, $vr5, 1 + vpickve2gr.d $t0, $vr4, 1 alsl.d $t0, $t0, $a2, 2 - vpickve2gr.d $t1, $vr6, 0 + vpickve2gr.d $t1, $vr5, 0 alsl.d $t1, $t1, $a2, 2 - vpickve2gr.d $t2, $vr6, 1 + vpickve2gr.d $t2, $vr5, 1 alsl.d $t2, $t2, $a2, 2 - vstelm.w $vr4, $a7, 0, 0 - vstelm.w $vr4, $t0, 0, 1 - vstelm.w $vr4, $t1, 0, 2 - vstelm.w $vr4, $t2, 0, 3 + vstelm.w $vr3, $a7, 0, 0 + vstelm.w $vr3, $t0, 0, 1 + vstelm.w $vr3, $t1, 0, 2 + vstelm.w $vr3, $t2, 0, 3 vaddi.wu $vr1, $vr1, 4 addi.d $a6, $a6, -4 addi.d $a5, $a5, 4 @@ -199,14 +201,12 @@ bmh_init: # @bmh_init ld.w $t3, $t1, 0 vinsgr2vr.w $vr1, $t3, 0 vseq.b $vr1, $vr1, $vr0 - vilvl.b $vr1, $vr1, $vr1 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 24 + vsllwil.h.b $vr1, $vr1, 0 + vsllwil.w.h $vr1, $vr1, 0 vmskltz.w $vr2, $vr1 vpickve2gr.hu $t3, $vr2, 0 beqz $t3, .LBB0_12 # %bb.14: # in Loop: Header=BB0_13 Depth=1 - vsrai.w $vr1, $vr1, 24 vpickve2gr.w $t3, $vr1, 3 andi $t3, $t3, 1 vpickve2gr.w $t4, $vr1, 2 diff --git a/results/MultiSource/Benchmarks/MiBench/telecomm-gsm/CMakeFiles/telecomm-gsm.dir/gsm_decode.s b/results/MultiSource/Benchmarks/MiBench/telecomm-gsm/CMakeFiles/telecomm-gsm.dir/gsm_decode.s index d8fffb79..ae640eae 100644 --- a/results/MultiSource/Benchmarks/MiBench/telecomm-gsm/CMakeFiles/telecomm-gsm.dir/gsm_decode.s +++ b/results/MultiSource/Benchmarks/MiBench/telecomm-gsm/CMakeFiles/telecomm-gsm.dir/gsm_decode.s @@ -62,17 +62,16 @@ gsm_decode: # @gsm_decode bstrpick.d $t3, $a2, 4, 2 srli.d $t4, $t1, 7 bstrins.d $t4, $a2, 2, 1 - vinsgr2vr.b $vr1, $a3, 0 - vinsgr2vr.b $vr1, $a5, 1 - vinsgr2vr.b $vr1, $a6, 2 - vinsgr2vr.b $vr1, $t0, 3 - vinsgr2vr.b $vr1, $a4, 4 - vinsgr2vr.b $vr1, $t2, 5 - vinsgr2vr.b $vr1, $t3, 6 - vinsgr2vr.b $vr1, $t4, 7 - vrepli.b $vr0, 0 - vilvl.b $vr1, $vr0, $vr1 - vst $vr1, $sp, 16 + vinsgr2vr.b $vr0, $a3, 0 + vinsgr2vr.b $vr0, $a5, 1 + vinsgr2vr.b $vr0, $a6, 2 + vinsgr2vr.b $vr0, $t0, 3 + vinsgr2vr.b $vr0, $a4, 4 + vinsgr2vr.b $vr0, $t2, 5 + vinsgr2vr.b $vr0, $t3, 6 + vinsgr2vr.b $vr0, $t4, 7 + vsllwil.hu.bu $vr0, $vr0, 0 + vst $vr0, $sp, 16 bstrpick.d $a2, $t1, 6, 4 st.h $a2, $sp, 32 ld.bu $a2, $a1, 12 @@ -94,28 +93,28 @@ gsm_decode: # @gsm_decode srli.d $a4, $a3, 6 bstrins.d $a4, $t1, 2, 2 st.h $a4, $sp, 36 - vinsgr2vr.b $vr1, $a3, 0 - vinsgr2vr.b $vr1, $a2, 1 - vshuf4i.b $vr2, $vr1, 80 + vinsgr2vr.b $vr0, $a3, 0 + vinsgr2vr.b $vr0, $a2, 1 + vshuf4i.b $vr1, $vr0, 80 lu12i.w $a3, 4160 ori $a3, $a3, 3 ld.bu $a4, $a1, 15 - vreplgr2vr.w $vr1, $a3 - vsrl.b $vr2, $vr2, $vr1 - vandi.b $vr2, $vr2, 7 + vreplgr2vr.w $vr0, $a3 + vsrl.b $vr1, $vr1, $vr0 + vandi.b $vr1, $vr1, 7 srli.d $a3, $a4, 6 ld.bu $a5, $a1, 16 bstrins.d $a3, $a2, 2, 2 bstrpick.d $a2, $a4, 5, 3 andi $a4, $a4, 7 srli.d $a6, $a5, 5 - vinsgr2vr.b $vr2, $a3, 4 - vinsgr2vr.b $vr2, $a2, 5 - vinsgr2vr.b $vr2, $a4, 6 - vinsgr2vr.b $vr2, $a6, 7 - vilvl.b $vr2, $vr0, $vr2 + vinsgr2vr.b $vr1, 
$a3, 4 + vinsgr2vr.b $vr1, $a2, 5 + vinsgr2vr.b $vr1, $a4, 6 + vinsgr2vr.b $vr1, $a6, 7 + vsllwil.hu.bu $vr1, $vr1, 0 ld.bu $a2, $a1, 17 - vst $vr2, $sp, 38 + vst $vr1, $sp, 38 bstrpick.d $a3, $a5, 4, 2 st.h $a3, $sp, 54 srli.d $a3, $a2, 7 @@ -142,25 +141,25 @@ gsm_decode: # @gsm_decode srli.d $a5, $a4, 6 bstrins.d $a5, $a2, 2, 2 st.h $a5, $sp, 62 - vinsgr2vr.b $vr2, $a4, 0 - vinsgr2vr.b $vr2, $a3, 1 + vinsgr2vr.b $vr1, $a4, 0 + vinsgr2vr.b $vr1, $a3, 1 ld.bu $a2, $a1, 22 - vshuf4i.b $vr2, $vr2, 80 - vsrl.b $vr2, $vr2, $vr1 - vandi.b $vr2, $vr2, 7 + vshuf4i.b $vr1, $vr1, 80 + vsrl.b $vr1, $vr1, $vr0 + vandi.b $vr1, $vr1, 7 srli.d $a4, $a2, 6 ld.bu $a5, $a1, 23 bstrins.d $a4, $a3, 2, 2 bstrpick.d $a3, $a2, 5, 3 andi $a2, $a2, 7 srli.d $a6, $a5, 5 - vinsgr2vr.b $vr2, $a4, 4 - vinsgr2vr.b $vr2, $a3, 5 - vinsgr2vr.b $vr2, $a2, 6 - vinsgr2vr.b $vr2, $a6, 7 - vilvl.b $vr2, $vr0, $vr2 + vinsgr2vr.b $vr1, $a4, 4 + vinsgr2vr.b $vr1, $a3, 5 + vinsgr2vr.b $vr1, $a2, 6 + vinsgr2vr.b $vr1, $a6, 7 + vsllwil.hu.bu $vr1, $vr1, 0 ld.bu $a2, $a1, 24 - vst $vr2, $sp, 64 + vst $vr1, $sp, 64 bstrpick.d $a3, $a5, 4, 2 st.h $a3, $sp, 80 srli.d $a3, $a2, 7 @@ -187,23 +186,23 @@ gsm_decode: # @gsm_decode srli.d $a5, $a4, 6 bstrins.d $a5, $a2, 2, 2 st.h $a5, $sp, 88 - vinsgr2vr.b $vr2, $a4, 0 - vinsgr2vr.b $vr2, $a3, 1 + vinsgr2vr.b $vr1, $a4, 0 + vinsgr2vr.b $vr1, $a3, 1 ld.bu $a2, $a1, 29 - vshuf4i.b $vr2, $vr2, 80 - vsrl.b $vr1, $vr2, $vr1 - vandi.b $vr1, $vr1, 7 + vshuf4i.b $vr1, $vr1, 80 + vsrl.b $vr0, $vr1, $vr0 + vandi.b $vr0, $vr0, 7 srli.d $a4, $a2, 6 ld.bu $a5, $a1, 30 bstrins.d $a4, $a3, 2, 2 bstrpick.d $a3, $a2, 5, 3 andi $a2, $a2, 7 srli.d $a6, $a5, 5 - vinsgr2vr.b $vr1, $a4, 4 - vinsgr2vr.b $vr1, $a3, 5 - vinsgr2vr.b $vr1, $a2, 6 - vinsgr2vr.b $vr1, $a6, 7 - vilvl.b $vr0, $vr0, $vr1 + vinsgr2vr.b $vr0, $a4, 4 + vinsgr2vr.b $vr0, $a3, 5 + vinsgr2vr.b $vr0, $a2, 6 + vinsgr2vr.b $vr0, $a6, 7 + vsllwil.hu.bu $vr0, $vr0, 0 ld.bu $a2, $a1, 31 vst $vr0, $sp, 90 bstrpick.d $a3, $a5, 4, 2 diff --git a/results/MultiSource/Benchmarks/MiBench/telecomm-gsm/CMakeFiles/telecomm-gsm.dir/long_term.s b/results/MultiSource/Benchmarks/MiBench/telecomm-gsm/CMakeFiles/telecomm-gsm.dir/long_term.s index e2ea151f..604d9999 100644 --- a/results/MultiSource/Benchmarks/MiBench/telecomm-gsm/CMakeFiles/telecomm-gsm.dir/long_term.s +++ b/results/MultiSource/Benchmarks/MiBench/telecomm-gsm/CMakeFiles/telecomm-gsm.dir/long_term.s @@ -5,26 +5,23 @@ .type Gsm_Long_Term_Predictor,@function Gsm_Long_Term_Predictor: # @Gsm_Long_Term_Predictor # %bb.0: # %iter.check - addi.d $sp, $sp, -176 - st.d $ra, $sp, 168 # 8-byte Folded Spill - st.d $fp, $sp, 160 # 8-byte Folded Spill - st.d $s0, $sp, 152 # 8-byte Folded Spill - st.d $s1, $sp, 144 # 8-byte Folded Spill - st.d $s2, $sp, 136 # 8-byte Folded Spill - st.d $s3, $sp, 128 # 8-byte Folded Spill - st.d $s4, $sp, 120 # 8-byte Folded Spill - st.d $s5, $sp, 112 # 8-byte Folded Spill - st.d $s6, $sp, 104 # 8-byte Folded Spill - st.d $s7, $sp, 96 # 8-byte Folded Spill - st.d $s8, $sp, 88 # 8-byte Folded Spill - fst.d $fs0, $sp, 80 # 8-byte Folded Spill - fst.d $fs1, $sp, 72 # 8-byte Folded Spill - fst.d $fs2, $sp, 64 # 8-byte Folded Spill - fst.d $fs3, $sp, 56 # 8-byte Folded Spill - fst.d $fs4, $sp, 48 # 8-byte Folded Spill - fst.d $fs5, $sp, 40 # 8-byte Folded Spill - fst.d $fs6, $sp, 32 # 8-byte Folded Spill - fst.d $fs7, $sp, 24 # 8-byte Folded Spill + addi.d $sp, $sp, -144 + st.d $ra, $sp, 136 # 8-byte Folded Spill + st.d $fp, $sp, 128 # 8-byte Folded Spill + st.d $s0, $sp, 120 # 8-byte 
Folded Spill + st.d $s1, $sp, 112 # 8-byte Folded Spill + st.d $s2, $sp, 104 # 8-byte Folded Spill + st.d $s3, $sp, 96 # 8-byte Folded Spill + st.d $s4, $sp, 88 # 8-byte Folded Spill + st.d $s5, $sp, 80 # 8-byte Folded Spill + st.d $s6, $sp, 72 # 8-byte Folded Spill + st.d $s7, $sp, 64 # 8-byte Folded Spill + st.d $s8, $sp, 56 # 8-byte Folded Spill + fst.d $fs0, $sp, 48 # 8-byte Folded Spill + fst.d $fs1, $sp, 40 # 8-byte Folded Spill + fst.d $fs2, $sp, 32 # 8-byte Folded Spill + fst.d $fs3, $sp, 24 # 8-byte Folded Spill + fst.d $fs4, $sp, 16 # 8-byte Folded Spill move $fp, $a1 vld $vr0, $a1, 16 vslti.h $vr1, $vr0, 0 @@ -82,7 +79,7 @@ Gsm_Long_Term_Predictor: # @Gsm_Long_Term_Predictor vmax.h $vr0, $vr1, $vr0 vpickve2gr.h $a0, $vr0, 0 bstrpick.d $a0, $a0, 15, 0 - st.d $a6, $sp, 16 # 8-byte Folded Spill + st.d $a6, $sp, 8 # 8-byte Folded Spill move $s3, $a5 move $s1, $a4 move $s0, $a3 @@ -106,219 +103,159 @@ Gsm_Long_Term_Predictor: # @Gsm_Long_Term_Predictor .LBB0_3: # %.thread.i move $a1, $zero vld $vr0, $fp, 48 - vld $vr1, $fp, 0 + vld $vr1, $fp, 32 vld $vr2, $fp, 16 - vld $vr3, $fp, 32 - vilvh.h $vr4, $vr0, $vr0 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr0, $vr0, $vr0 - vslli.w $vr0, $vr0, 16 - vsrai.w $vr0, $vr0, 16 - vilvh.h $vr5, $vr3, $vr3 - vslli.w $vr5, $vr5, 16 - vsrai.w $vr5, $vr5, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr6, $vr2, $vr2 - vslli.w $vr6, $vr6, 16 - vsrai.w $vr6, $vr6, 16 - vilvl.h $vr2, $vr2, $vr2 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvh.h $vr7, $vr1, $vr1 - vslli.w $vr7, $vr7, 16 - vsrai.w $vr7, $vr7, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vld $vr3, $fp, 0 + vsllwil.w.h $vr4, $vr0, 0 + vsllwil.w.h $vr5, $vr1, 0 + vsllwil.w.h $vr6, $vr2, 0 + vsllwil.w.h $vr7, $vr3, 0 + vbsrl.v $vr0, $vr0, 8 + vsllwil.w.h $vr0, $vr0, 0 + vbsrl.v $vr1, $vr1, 8 + vsllwil.w.h $vr1, $vr1, 0 + vbsrl.v $vr2, $vr2, 8 + vsllwil.w.h $vr2, $vr2, 0 + vbsrl.v $vr3, $vr3, 8 + vsllwil.w.h $vr3, $vr3, 0 vreplgr2vr.w $vr16, $a0 - vsra.w $vr15, $vr1, $vr16 - vsra.w $vr13, $vr7, $vr16 - vsra.w $vr11, $vr2, $vr16 + vsra.w $vr15, $vr3, $vr16 + vsra.w $vr14, $vr2, $vr16 + vsra.w $vr13, $vr1, $vr16 + vsra.w $vr12, $vr0, $vr16 + vsra.w $vr11, $vr7, $vr16 vsra.w $vr9, $vr6, $vr16 - vsra.w $vr7, $vr3, $vr16 - vsra.w $vr5, $vr5, $vr16 - vsra.w $vr3, $vr0, $vr16 - vsra.w $vr1, $vr4, $vr16 - vshuf4i.w $vr0, $vr1, 50 - vslli.d $vr0, $vr0, 32 - vsrai.d $vr0, $vr0, 32 - vshuf4i.w $vr1, $vr1, 16 - vslli.d $vr1, $vr1, 32 - vsrai.d $vr1, $vr1, 32 - vshuf4i.w $vr2, $vr3, 50 - vslli.d $vr2, $vr2, 32 - vsrai.d $vr2, $vr2, 32 - vshuf4i.w $vr3, $vr3, 16 - vslli.d $vr3, $vr3, 32 - vsrai.d $vr3, $vr3, 32 - vshuf4i.w $vr4, $vr5, 50 - vslli.d $vr4, $vr4, 32 - vsrai.d $vr4, $vr4, 32 - vshuf4i.w $vr5, $vr5, 16 - vslli.d $vr5, $vr5, 32 - vsrai.d $vr5, $vr5, 32 - vshuf4i.w $vr6, $vr7, 50 - vslli.d $vr6, $vr6, 32 - vsrai.d $vr6, $vr6, 32 - vshuf4i.w $vr7, $vr7, 16 - vslli.d $vr7, $vr7, 32 - vsrai.d $vr7, $vr7, 32 - vshuf4i.w $vr8, $vr9, 50 - vslli.d $vr8, $vr8, 32 - vsrai.d $vr8, $vr8, 32 - vshuf4i.w $vr9, $vr9, 16 - vslli.d $vr9, $vr9, 32 - vsrai.d $vr9, $vr9, 32 - vshuf4i.w $vr10, $vr11, 50 - vslli.d $vr10, $vr10, 32 - vsrai.d $vr10, $vr10, 32 - vshuf4i.w $vr11, $vr11, 16 - vslli.d $vr11, $vr11, 32 - vsrai.d $vr11, $vr11, 32 - vshuf4i.w $vr12, $vr13, 50 - vslli.d $vr12, $vr12, 32 - vsrai.d $vr12, $vr12, 32 - vshuf4i.w $vr13, $vr13, 16 - vslli.d $vr13, $vr13, 32 - vsrai.d $vr13, $vr13, 32 - 
vshuf4i.w $vr14, $vr15, 50 - vslli.d $vr14, $vr14, 32 - vsrai.d $vr14, $vr14, 32 + vsra.w $vr7, $vr5, $vr16 + vsra.w $vr5, $vr4, $vr16 + vsllwil.d.w $vr0, $vr5, 0 + vsllwil.d.w $vr1, $vr7, 0 + vsllwil.d.w $vr2, $vr9, 0 + vsllwil.d.w $vr3, $vr11, 0 + vsllwil.d.w $vr4, $vr12, 0 + vshuf4i.w $vr5, $vr5, 14 + vsllwil.d.w $vr5, $vr5, 0 + vsllwil.d.w $vr6, $vr13, 0 + vshuf4i.w $vr7, $vr7, 14 + vsllwil.d.w $vr7, $vr7, 0 + vsllwil.d.w $vr8, $vr14, 0 + vshuf4i.w $vr9, $vr9, 14 + vsllwil.d.w $vr9, $vr9, 0 + vsllwil.d.w $vr10, $vr15, 0 + vshuf4i.w $vr11, $vr11, 14 + vsllwil.d.w $vr11, $vr11, 0 + vshuf4i.w $vr12, $vr12, 14 + vsllwil.d.w $vr12, $vr12, 0 + vshuf4i.w $vr13, $vr13, 14 + vsllwil.d.w $vr13, $vr13, 0 + vshuf4i.w $vr14, $vr14, 14 vld $vr17, $fp, 64 - vshuf4i.w $vr15, $vr15, 16 - vslli.d $vr15, $vr15, 32 - vsrai.d $vr15, $vr15, 32 - vilvh.h $vr18, $vr17, $vr17 - vslli.w $vr18, $vr18, 16 - vsrai.w $vr18, $vr18, 16 - vilvl.h $vr17, $vr17, $vr17 - vslli.w $vr17, $vr17, 16 - vsrai.w $vr17, $vr17, 16 + vsllwil.d.w $vr14, $vr14, 0 + vshuf4i.w $vr15, $vr15, 14 + vsllwil.d.w $vr15, $vr15, 0 + vsllwil.w.h $vr18, $vr17, 0 + vbsrl.v $vr17, $vr17, 8 + vsllwil.w.h $vr17, $vr17, 0 vsra.w $vr19, $vr17, $vr16 - vsra.w $vr17, $vr18, $vr16 - vshuf4i.w $vr16, $vr17, 50 - vslli.d $vr16, $vr16, 32 - vsrai.d $vr16, $vr16, 32 - vshuf4i.w $vr17, $vr17, 16 - vslli.d $vr17, $vr17, 32 - vsrai.d $vr17, $vr17, 32 - vshuf4i.w $vr18, $vr19, 50 - vslli.d $vr18, $vr18, 32 - vsrai.d $vr18, $vr18, 32 - vshuf4i.w $vr19, $vr19, 16 - vslli.d $vr19, $vr19, 32 - vsrai.d $vr19, $vr19, 32 + vsra.w $vr18, $vr18, $vr16 + vsllwil.d.w $vr16, $vr18, 0 + vsllwil.d.w $vr17, $vr19, 0 + vshuf4i.w $vr18, $vr18, 14 + vsllwil.d.w $vr18, $vr18, 0 + vshuf4i.w $vr19, $vr19, 14 + vsllwil.d.w $vr19, $vr19, 0 addi.d $a3, $s2, -16 ori $a4, $zero, 40 ori $a5, $zero, 121 ori $a2, $zero, 40 .p2align 4, , 16 .LBB0_4: # =>This Inner Loop Header: Depth=1 - vld $vr22, $a3, -16 - vld $vr24, $a3, -32 - vld $vr23, $a3, -48 - vld $vr25, $a3, -64 - vilvh.h $vr20, $vr22, $vr22 - vilvh.h $vr21, $vr24, $vr24 - vilvl.h $vr22, $vr22, $vr22 - vilvl.h $vr24, $vr24, $vr24 - vilvh.h $vr28, $vr25, $vr25 - vilvh.w $vr26, $vr28, $vr28 - vslli.d $vr26, $vr26, 48 - vsrai.d $vr29, $vr26, 48 - vilvl.h $vr27, $vr25, $vr25 - vilvh.w $vr25, $vr27, $vr27 - vslli.d $vr25, $vr25, 48 - vld $vr30, $a3, 0 - vsrai.d $vr25, $vr25, 48 - vmul.d $vr26, $vr14, $vr25 - vmul.d $vr25, $vr12, $vr29 - vilvl.h $vr29, $vr30, $vr30 - vilvh.h $vr30, $vr30, $vr30 - vilvh.w $vr31, $vr30, $vr30 - vslli.d $vr31, $vr31, 48 - vsrai.d $vr31, $vr31, 48 - vmadd.d $vr25, $vr16, $vr31 - vilvh.w $vr31, $vr29, $vr29 - vslli.d $vr31, $vr31, 48 - vsrai.d $vr31, $vr31, 48 - vmadd.d $vr26, $vr18, $vr31 - vilvl.w $vr31, $vr22, $vr22 - vslli.d $vr31, $vr31, 48 - vsrai.d $vr31, $vr31, 48 - vilvl.w $vr28, $vr28, $vr28 - vslli.d $vr28, $vr28, 48 - vsrai.d $vr28, $vr28, 48 - vmul.d $vr28, $vr13, $vr28 - vilvl.w $vr30, $vr30, $vr30 - vslli.d $vr30, $vr30, 48 - vsrai.d $vr30, $vr30, 48 - vmadd.d $vr28, $vr17, $vr30 - vilvl.w $vr30, $vr24, $vr24 - vslli.d $vr30, $vr30, 48 - vsrai.d $vr30, $vr30, 48 - vilvl.w $vr27, $vr27, $vr27 - vslli.d $vr27, $vr27, 48 - vsrai.d $vr27, $vr27, 48 - vmul.d $vr27, $vr15, $vr27 - vilvl.w $vr29, $vr29, $vr29 - vslli.d $vr29, $vr29, 48 - vsrai.d $vr29, $vr29, 48 - vmadd.d $vr27, $vr19, $vr29 - vilvl.h $vr29, $vr23, $vr23 - vmadd.d $vr27, $vr7, $vr30 - vilvl.w $vr30, $vr29, $vr29 - vslli.d $vr30, $vr30, 48 - vsrai.d $vr30, $vr30, 48 - vmul.d $vr30, $vr11, $vr30 - vmadd.d $vr30, $vr3, $vr31 - vilvl.w 
$vr31, $vr20, $vr20 - vslli.d $vr31, $vr31, 48 - vsrai.d $vr31, $vr31, 48 - vadd.d $vr27, $vr27, $vr30 - vilvl.w $vr30, $vr21, $vr21 - vslli.d $vr30, $vr30, 48 - vsrai.d $vr30, $vr30, 48 - vilvh.h $vr23, $vr23, $vr23 - vmadd.d $vr28, $vr5, $vr30 - vilvl.w $vr30, $vr23, $vr23 - vslli.d $vr30, $vr30, 48 - vsrai.d $vr30, $vr30, 48 - vmul.d $vr30, $vr9, $vr30 - vmadd.d $vr30, $vr1, $vr31 - vadd.d $vr28, $vr28, $vr30 - vadd.d $vr27, $vr27, $vr28 - vilvh.w $vr24, $vr24, $vr24 - vslli.d $vr24, $vr24, 48 - vsrai.d $vr24, $vr24, 48 - vmadd.d $vr26, $vr6, $vr24 - vilvh.w $vr22, $vr22, $vr22 - vslli.d $vr22, $vr22, 48 - vsrai.d $vr22, $vr22, 48 - vilvh.w $vr24, $vr29, $vr29 - vslli.d $vr24, $vr24, 48 - vsrai.d $vr24, $vr24, 48 - vmul.d $vr24, $vr10, $vr24 - vmadd.d $vr24, $vr2, $vr22 - vadd.d $vr22, $vr26, $vr24 - vilvh.w $vr21, $vr21, $vr21 - vslli.d $vr21, $vr21, 48 - vsrai.d $vr21, $vr21, 48 - vmadd.d $vr25, $vr4, $vr21 - vilvh.w $vr20, $vr20, $vr20 - vslli.d $vr20, $vr20, 48 - vsrai.d $vr20, $vr20, 48 - vilvh.w $vr21, $vr23, $vr23 - vslli.d $vr21, $vr21, 48 - vsrai.d $vr21, $vr21, 48 - vmul.d $vr21, $vr8, $vr21 - vmadd.d $vr21, $vr0, $vr20 - vadd.d $vr20, $vr25, $vr21 - vadd.d $vr20, $vr22, $vr20 - vadd.d $vr20, $vr27, $vr20 + vld $vr24, $a3, -64 + vld $vr20, $a3, -48 + vld $vr22, $a3, -32 + vld $vr21, $a3, -16 + vbsrl.v $vr23, $vr24, 12 + vsllwil.w.h $vr23, $vr23, 0 + vsllwil.d.w $vr23, $vr23, 0 + vshuf4i.h $vr25, $vr24, 14 + vsllwil.w.h $vr25, $vr25, 0 + vsllwil.d.w $vr25, $vr25, 0 + vbsrl.v $vr26, $vr24, 8 + vsllwil.w.h $vr26, $vr26, 0 + vsllwil.d.w $vr26, $vr26, 0 + vld $vr27, $a3, 0 + vmul.d $vr26, $vr10, $vr26 + vmul.d $vr25, $vr11, $vr25 + vmul.d $vr23, $vr15, $vr23 + vbsrl.v $vr28, $vr27, 12 + vsllwil.w.h $vr28, $vr28, 0 + vsllwil.d.w $vr28, $vr28, 0 + vmadd.d $vr23, $vr19, $vr28 + vshuf4i.h $vr28, $vr27, 14 + vsllwil.w.h $vr28, $vr28, 0 + vsllwil.d.w $vr28, $vr28, 0 + vmadd.d $vr25, $vr18, $vr28 + vbsrl.v $vr28, $vr27, 8 + vsllwil.w.h $vr28, $vr28, 0 + vsllwil.d.w $vr28, $vr28, 0 + vmadd.d $vr26, $vr17, $vr28 + vsllwil.w.h $vr28, $vr21, 0 + vsllwil.d.w $vr28, $vr28, 0 + vsllwil.w.h $vr24, $vr24, 0 + vsllwil.d.w $vr24, $vr24, 0 + vmul.d $vr24, $vr3, $vr24 + vsllwil.w.h $vr27, $vr27, 0 + vsllwil.d.w $vr27, $vr27, 0 + vmadd.d $vr24, $vr16, $vr27 + vsllwil.w.h $vr27, $vr22, 0 + vsllwil.d.w $vr27, $vr27, 0 + vmadd.d $vr24, $vr1, $vr27 + vsllwil.w.h $vr27, $vr20, 0 + vsllwil.d.w $vr27, $vr27, 0 + vmul.d $vr27, $vr2, $vr27 + vmadd.d $vr27, $vr0, $vr28 + vbsrl.v $vr28, $vr21, 8 + vsllwil.w.h $vr28, $vr28, 0 + vsllwil.d.w $vr28, $vr28, 0 + vadd.d $vr24, $vr24, $vr27 + vbsrl.v $vr27, $vr22, 8 + vsllwil.w.h $vr27, $vr27, 0 + vsllwil.d.w $vr27, $vr27, 0 + vmadd.d $vr26, $vr6, $vr27 + vbsrl.v $vr27, $vr20, 8 + vsllwil.w.h $vr27, $vr27, 0 + vsllwil.d.w $vr27, $vr27, 0 + vmul.d $vr27, $vr8, $vr27 + vmadd.d $vr27, $vr4, $vr28 + vadd.d $vr26, $vr26, $vr27 + vshuf4i.h $vr27, $vr21, 14 + vsllwil.w.h $vr27, $vr27, 0 + vsllwil.d.w $vr27, $vr27, 0 + vadd.d $vr24, $vr24, $vr26 + vshuf4i.h $vr26, $vr22, 14 + vsllwil.w.h $vr26, $vr26, 0 + vsllwil.d.w $vr26, $vr26, 0 + vmadd.d $vr25, $vr7, $vr26 + vshuf4i.h $vr26, $vr20, 14 + vsllwil.w.h $vr26, $vr26, 0 + vsllwil.d.w $vr26, $vr26, 0 + vmul.d $vr26, $vr9, $vr26 + vmadd.d $vr26, $vr5, $vr27 + vadd.d $vr25, $vr25, $vr26 + vbsrl.v $vr22, $vr22, 12 + vsllwil.w.h $vr22, $vr22, 0 + vsllwil.d.w $vr22, $vr22, 0 + vmadd.d $vr23, $vr13, $vr22 + vbsrl.v $vr21, $vr21, 12 + vsllwil.w.h $vr21, $vr21, 0 + vsllwil.d.w $vr21, $vr21, 0 + vbsrl.v $vr20, $vr20, 12 + vsllwil.w.h 
$vr20, $vr20, 0 + vsllwil.d.w $vr20, $vr20, 0 + vmul.d $vr20, $vr14, $vr20 + vmadd.d $vr20, $vr12, $vr21 + vadd.d $vr20, $vr23, $vr20 + vadd.d $vr20, $vr25, $vr20 + vadd.d $vr20, $vr24, $vr20 vhaddw.q.d $vr20, $vr20, $vr20 vpickve2gr.d $a6, $vr20, 0 slt $a7, $a1, $a6 @@ -349,32 +286,24 @@ Gsm_Long_Term_Predictor: # @Gsm_Long_Term_Predictor vinsgr2vr.w $vr1, $a1, 0 vsrai.h $vr0, $vr0, 3 vsrai.h $vr1, $vr1, 3 - vilvl.h $vr0, $vr0, $vr0 - vilvl.w $vr0, $vr0, $vr0 - vslli.d $vr0, $vr0, 48 - vsrai.d $vr2, $vr0, 48 - vilvl.h $vr0, $vr1, $vr1 - vilvl.w $vr0, $vr0, $vr0 + vsllwil.w.h $vr0, $vr0, 0 + vsllwil.d.w $vr2, $vr0, 0 ori $a1, $zero, 4 sub.d $a1, $a1, $a0 alsl.d $a2, $a1, $s2, 1 slli.d $a1, $a1, 1 ldx.w $a1, $s2, $a1 ld.w $a2, $a2, 4 - vslli.d $vr0, $vr0, 48 - vsrai.d $vr3, $vr0, 48 + vsllwil.w.h $vr0, $vr1, 0 + vsllwil.d.w $vr3, $vr0, 0 vinsgr2vr.w $vr0, $a1, 0 vinsgr2vr.w $vr1, $a2, 0 vsrai.h $vr0, $vr0, 3 vsrai.h $vr1, $vr1, 3 - vilvl.h $vr0, $vr0, $vr0 - vilvl.w $vr0, $vr0, $vr0 - vslli.d $vr0, $vr0, 48 - vsrai.d $vr0, $vr0, 48 - vilvl.h $vr1, $vr1, $vr1 - vilvl.w $vr1, $vr1, $vr1 - vslli.d $vr1, $vr1, 48 - vsrai.d $vr1, $vr1, 48 + vsllwil.w.h $vr0, $vr0, 0 + vsllwil.d.w $vr0, $vr0, 0 + vsllwil.w.h $vr1, $vr1, 0 + vsllwil.d.w $vr1, $vr1, 0 vmul.d $vr0, $vr0, $vr0 vmul.d $vr1, $vr1, $vr1 ori $a1, $zero, 8 @@ -389,14 +318,10 @@ Gsm_Long_Term_Predictor: # @Gsm_Long_Term_Predictor vinsgr2vr.w $vr3, $a2, 0 vsrai.h $vr2, $vr2, 3 vsrai.h $vr3, $vr3, 3 - vilvl.h $vr2, $vr2, $vr2 - vilvl.w $vr2, $vr2, $vr2 - vslli.d $vr2, $vr2, 48 - vsrai.d $vr2, $vr2, 48 - vilvl.h $vr3, $vr3, $vr3 - vilvl.w $vr3, $vr3, $vr3 - vslli.d $vr3, $vr3, 48 - vsrai.d $vr3, $vr3, 48 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.d.w $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vsllwil.d.w $vr3, $vr3, 0 ori $a1, $zero, 12 sub.d $a1, $a1, $a0 alsl.d $a2, $a1, $s2, 1 @@ -409,14 +334,10 @@ Gsm_Long_Term_Predictor: # @Gsm_Long_Term_Predictor vinsgr2vr.w $vr3, $a2, 0 vsrai.h $vr2, $vr2, 3 vsrai.h $vr3, $vr3, 3 - vilvl.h $vr2, $vr2, $vr2 - vilvl.w $vr2, $vr2, $vr2 - vslli.d $vr2, $vr2, 48 - vsrai.d $vr2, $vr2, 48 - vilvl.h $vr3, $vr3, $vr3 - vilvl.w $vr3, $vr3, $vr3 - vslli.d $vr3, $vr3, 48 - vsrai.d $vr3, $vr3, 48 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.d.w $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vsllwil.d.w $vr3, $vr3, 0 sub.d $a1, $s7, $a0 alsl.d $a2, $a1, $s2, 1 slli.d $a1, $a1, 1 @@ -428,14 +349,10 @@ Gsm_Long_Term_Predictor: # @Gsm_Long_Term_Predictor vinsgr2vr.w $vr3, $a2, 0 vsrai.h $vr2, $vr2, 3 vsrai.h $vr3, $vr3, 3 - vilvl.h $vr2, $vr2, $vr2 - vilvl.w $vr2, $vr2, $vr2 - vslli.d $vr2, $vr2, 48 - vsrai.d $vr2, $vr2, 48 - vilvl.h $vr3, $vr3, $vr3 - vilvl.w $vr3, $vr3, $vr3 - vslli.d $vr3, $vr3, 48 - vsrai.d $vr3, $vr3, 48 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.d.w $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vsllwil.d.w $vr3, $vr3, 0 ori $a1, $zero, 20 sub.d $a1, $a1, $a0 alsl.d $a2, $a1, $s2, 1 @@ -448,14 +365,10 @@ Gsm_Long_Term_Predictor: # @Gsm_Long_Term_Predictor vinsgr2vr.w $vr3, $a2, 0 vsrai.h $vr2, $vr2, 3 vsrai.h $vr3, $vr3, 3 - vilvl.h $vr2, $vr2, $vr2 - vilvl.w $vr2, $vr2, $vr2 - vslli.d $vr2, $vr2, 48 - vsrai.d $vr2, $vr2, 48 - vilvl.h $vr3, $vr3, $vr3 - vilvl.w $vr3, $vr3, $vr3 - vslli.d $vr3, $vr3, 48 - vsrai.d $vr3, $vr3, 48 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.d.w $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vsllwil.d.w $vr3, $vr3, 0 ori $a1, $zero, 24 sub.d $a1, $a1, $a0 alsl.d $a2, $a1, $s2, 1 @@ -468,14 +381,10 @@ Gsm_Long_Term_Predictor: # @Gsm_Long_Term_Predictor vinsgr2vr.w $vr3, $a2, 0 vsrai.h $vr2, $vr2, 3 
vsrai.h $vr3, $vr3, 3 - vilvl.h $vr2, $vr2, $vr2 - vilvl.w $vr2, $vr2, $vr2 - vslli.d $vr2, $vr2, 48 - vsrai.d $vr2, $vr2, 48 - vilvl.h $vr3, $vr3, $vr3 - vilvl.w $vr3, $vr3, $vr3 - vslli.d $vr3, $vr3, 48 - vsrai.d $vr3, $vr3, 48 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.d.w $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vsllwil.d.w $vr3, $vr3, 0 ori $a1, $zero, 28 sub.d $a1, $a1, $a0 alsl.d $a2, $a1, $s2, 1 @@ -488,14 +397,10 @@ Gsm_Long_Term_Predictor: # @Gsm_Long_Term_Predictor vinsgr2vr.w $vr3, $a2, 0 vsrai.h $vr2, $vr2, 3 vsrai.h $vr3, $vr3, 3 - vilvl.h $vr2, $vr2, $vr2 - vilvl.w $vr2, $vr2, $vr2 - vslli.d $vr2, $vr2, 48 - vsrai.d $vr2, $vr2, 48 - vilvl.h $vr3, $vr3, $vr3 - vilvl.w $vr3, $vr3, $vr3 - vslli.d $vr3, $vr3, 48 - vsrai.d $vr3, $vr3, 48 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.d.w $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vsllwil.d.w $vr3, $vr3, 0 ori $a1, $zero, 32 sub.d $a1, $a1, $a0 alsl.d $a2, $a1, $s2, 1 @@ -508,14 +413,10 @@ Gsm_Long_Term_Predictor: # @Gsm_Long_Term_Predictor vinsgr2vr.w $vr3, $a2, 0 vsrai.h $vr2, $vr2, 3 vsrai.h $vr3, $vr3, 3 - vilvl.h $vr2, $vr2, $vr2 - vilvl.w $vr2, $vr2, $vr2 - vslli.d $vr2, $vr2, 48 - vsrai.d $vr2, $vr2, 48 - vilvl.h $vr3, $vr3, $vr3 - vilvl.w $vr3, $vr3, $vr3 - vslli.d $vr3, $vr3, 48 - vsrai.d $vr3, $vr3, 48 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.d.w $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vsllwil.d.w $vr3, $vr3, 0 ori $a1, $zero, 36 sub.d $a0, $a1, $a0 alsl.d $a1, $a0, $s2, 1 @@ -528,14 +429,10 @@ Gsm_Long_Term_Predictor: # @Gsm_Long_Term_Predictor vinsgr2vr.w $vr3, $a1, 0 vsrai.h $vr2, $vr2, 3 vsrai.h $vr3, $vr3, 3 - vilvl.h $vr2, $vr2, $vr2 - vilvl.w $vr2, $vr2, $vr2 - vslli.d $vr2, $vr2, 48 - vsrai.d $vr2, $vr2, 48 - vilvl.h $vr3, $vr3, $vr3 - vilvl.w $vr3, $vr3, $vr3 - vslli.d $vr3, $vr3, 48 - vsrai.d $vr3, $vr3, 48 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.d.w $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vsllwil.d.w $vr3, $vr3, 0 vmadd.d $vr0, $vr2, $vr2 vmadd.d $vr1, $vr3, $vr3 vadd.d $vr0, $vr1, $vr0 @@ -574,7 +471,7 @@ Gsm_Long_Term_Predictor: # @Gsm_Long_Term_Predictor bge $a0, $s8, .LBB0_37 .LBB0_10: # %.preheader64.i ori $a0, $zero, 3 - ld.d $a1, $sp, 16 # 8-byte Folded Reload + ld.d $a1, $sp, 8 # 8-byte Folded Reload st.h $a0, $a1, 0 ld.h $a0, $s3, 0 sub.d $a2, $s0, $s1 @@ -600,18 +497,15 @@ Gsm_Long_Term_Predictor: # @Gsm_Long_Term_Predictor slli.d $a1, $a0, 1 sub.d $a1, $s2, $a1 vld $vr0, $a1, 0 - vilvl.h $vr1, $vr0, $vr0 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr2, $vr1, 16 - vilvh.h $vr0, $vr0, $vr0 - vslli.w $vr0, $vr0, 16 - vsrai.w $vr3, $vr0, 16 + vsllwil.w.h $vr2, $vr0, 0 + vbsrl.v $vr0, $vr0, 8 + vsllwil.w.h $vr3, $vr0, 0 lu12i.w $a1, 4 vreplgr2vr.w $vr0, $a1 vreplgr2vr.w $vr1, $s6 b .LBB0_23 .LBB0_16: # %.preheader.i - ld.d $a0, $sp, 16 # 8-byte Folded Reload + ld.d $a0, $sp, 8 # 8-byte Folded Reload st.h $zero, $a0, 0 ld.h $a0, $s3, 0 sub.d $a1, $s0, $s1 @@ -637,12 +531,9 @@ Gsm_Long_Term_Predictor: # @Gsm_Long_Term_Predictor slli.d $a1, $a0, 1 sub.d $a1, $s2, $a1 vld $vr0, $a1, 0 - vilvl.h $vr1, $vr0, $vr0 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr2, $vr1, 16 - vilvh.h $vr0, $vr0, $vr0 - vslli.w $vr0, $vr0, 16 - vsrai.w $vr3, $vr0, 16 + vsllwil.w.h $vr2, $vr0, 0 + vbsrl.v $vr0, $vr0, 8 + vsllwil.w.h $vr3, $vr0, 0 lu12i.w $a1, 4 vreplgr2vr.w $vr0, $a1 ori $a1, $zero, 3277 @@ -664,12 +555,9 @@ Gsm_Long_Term_Predictor: # @Gsm_Long_Term_Predictor sub.d $a1, $a1, $a0 slli.d $a1, $a1, 1 vldx $vr2, $s2, $a1 - vilvl.h $vr3, $vr2, $vr2 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr2, $vr2, $vr2 - vslli.w $vr2, 
$vr2, 16 - vsrai.w $vr2, $vr2, 16 + vsllwil.w.h $vr3, $vr2, 0 + vbsrl.v $vr2, $vr2, 8 + vsllwil.w.h $vr2, $vr2, 0 vori.b $vr4, $vr0, 0 vmadd.w $vr4, $vr2, $vr1 vori.b $vr2, $vr0, 0 @@ -685,12 +573,9 @@ Gsm_Long_Term_Predictor: # @Gsm_Long_Term_Predictor sub.d $a1, $a1, $a0 slli.d $a1, $a1, 1 vldx $vr2, $s2, $a1 - vilvl.h $vr3, $vr2, $vr2 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr2, $vr2, $vr2 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 + vsllwil.w.h $vr3, $vr2, 0 + vbsrl.v $vr2, $vr2, 8 + vsllwil.w.h $vr2, $vr2, 0 vori.b $vr4, $vr0, 0 vmadd.w $vr4, $vr2, $vr1 vori.b $vr2, $vr0, 0 @@ -706,12 +591,9 @@ Gsm_Long_Term_Predictor: # @Gsm_Long_Term_Predictor sub.d $a1, $a1, $a0 slli.d $a1, $a1, 1 vldx $vr2, $s2, $a1 - vilvl.h $vr3, $vr2, $vr2 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr2, $vr2, $vr2 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 + vsllwil.w.h $vr3, $vr2, 0 + vbsrl.v $vr2, $vr2, 8 + vsllwil.w.h $vr2, $vr2, 0 vori.b $vr4, $vr0, 0 vmadd.w $vr4, $vr2, $vr1 vori.b $vr2, $vr0, 0 @@ -727,12 +609,9 @@ Gsm_Long_Term_Predictor: # @Gsm_Long_Term_Predictor sub.d $a0, $a1, $a0 slli.d $a0, $a0, 1 vldx $vr2, $s2, $a0 - vilvl.h $vr3, $vr2, $vr2 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr2, $vr2, $vr2 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 + vsllwil.w.h $vr3, $vr2, 0 + vbsrl.v $vr2, $vr2, 8 + vsllwil.w.h $vr2, $vr2, 0 vori.b $vr4, $vr0, 0 vmadd.w $vr4, $vr2, $vr1 vmadd.w $vr0, $vr3, $vr1 @@ -806,30 +685,27 @@ Gsm_Long_Term_Predictor: # @Gsm_Long_Term_Predictor addi.d $a1, $a1, 2 bne $a1, $a4, .LBB0_27 .LBB0_28: # %Long_term_analysis_filtering.exit - fld.d $fs7, $sp, 24 # 8-byte Folded Reload - fld.d $fs6, $sp, 32 # 8-byte Folded Reload - fld.d $fs5, $sp, 40 # 8-byte Folded Reload - fld.d $fs4, $sp, 48 # 8-byte Folded Reload - fld.d $fs3, $sp, 56 # 8-byte Folded Reload - fld.d $fs2, $sp, 64 # 8-byte Folded Reload - fld.d $fs1, $sp, 72 # 8-byte Folded Reload - fld.d $fs0, $sp, 80 # 8-byte Folded Reload - ld.d $s8, $sp, 88 # 8-byte Folded Reload - ld.d $s7, $sp, 96 # 8-byte Folded Reload - ld.d $s6, $sp, 104 # 8-byte Folded Reload - ld.d $s5, $sp, 112 # 8-byte Folded Reload - ld.d $s4, $sp, 120 # 8-byte Folded Reload - ld.d $s3, $sp, 128 # 8-byte Folded Reload - ld.d $s2, $sp, 136 # 8-byte Folded Reload - ld.d $s1, $sp, 144 # 8-byte Folded Reload - ld.d $s0, $sp, 152 # 8-byte Folded Reload - ld.d $fp, $sp, 160 # 8-byte Folded Reload - ld.d $ra, $sp, 168 # 8-byte Folded Reload - addi.d $sp, $sp, 176 + fld.d $fs4, $sp, 16 # 8-byte Folded Reload + fld.d $fs3, $sp, 24 # 8-byte Folded Reload + fld.d $fs2, $sp, 32 # 8-byte Folded Reload + fld.d $fs1, $sp, 40 # 8-byte Folded Reload + fld.d $fs0, $sp, 48 # 8-byte Folded Reload + ld.d $s8, $sp, 56 # 8-byte Folded Reload + ld.d $s7, $sp, 64 # 8-byte Folded Reload + ld.d $s6, $sp, 72 # 8-byte Folded Reload + ld.d $s5, $sp, 80 # 8-byte Folded Reload + ld.d $s4, $sp, 88 # 8-byte Folded Reload + ld.d $s3, $sp, 96 # 8-byte Folded Reload + ld.d $s2, $sp, 104 # 8-byte Folded Reload + ld.d $s1, $sp, 112 # 8-byte Folded Reload + ld.d $s0, $sp, 120 # 8-byte Folded Reload + ld.d $fp, $sp, 128 # 8-byte Folded Reload + ld.d $ra, $sp, 136 # 8-byte Folded Reload + addi.d $sp, $sp, 144 ret .LBB0_29: # %.preheader60.i ori $a0, $zero, 1 - ld.d $a1, $sp, 16 # 8-byte Folded Reload + ld.d $a1, $sp, 8 # 8-byte Folded Reload st.h $a0, $a1, 0 ld.h $a0, $s3, 0 sub.d $a2, $s0, $s1 @@ -855,12 +731,9 @@ Gsm_Long_Term_Predictor: # @Gsm_Long_Term_Predictor slli.d $a1, $a0, 1 sub.d $a1, $s2, $a1 vld $vr0, $a1, 
0 - vilvl.h $vr1, $vr0, $vr0 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr2, $vr1, 16 - vilvh.h $vr0, $vr0, $vr0 - vslli.w $vr0, $vr0, 16 - vsrai.w $vr3, $vr0, 16 + vsllwil.w.h $vr2, $vr0, 0 + vbsrl.v $vr0, $vr0, 8 + vsllwil.w.h $vr3, $vr0, 0 lu12i.w $a1, 4 vreplgr2vr.w $vr0, $a1 lu12i.w $a1, 2 @@ -900,7 +773,7 @@ Gsm_Long_Term_Predictor: # @Gsm_Long_Term_Predictor b .LBB0_28 .LBB0_37: # %.preheader62.i ori $a0, $zero, 2 - ld.d $a1, $sp, 16 # 8-byte Folded Reload + ld.d $a1, $sp, 8 # 8-byte Folded Reload st.h $a0, $a1, 0 ld.h $a0, $s3, 0 sub.d $a2, $s0, $s1 @@ -926,12 +799,9 @@ Gsm_Long_Term_Predictor: # @Gsm_Long_Term_Predictor slli.d $a1, $a0, 1 sub.d $a1, $s2, $a1 vld $vr0, $a1, 0 - vilvl.h $vr1, $vr0, $vr0 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr2, $vr1, 16 - vilvh.h $vr0, $vr0, $vr0 - vslli.w $vr0, $vr0, 16 - vsrai.w $vr3, $vr0, 16 + vsllwil.w.h $vr2, $vr0, 0 + vbsrl.v $vr0, $vr0, 8 + vsllwil.w.h $vr3, $vr0, 0 lu12i.w $a1, 4 vreplgr2vr.w $vr0, $a1 lu12i.w $a1, 5 @@ -1004,20 +874,17 @@ Gsm_Long_Term_Synthesis_Filtering: # @Gsm_Long_Term_Synthesis_Filtering sub.d $a1, $a4, $a1 vld $vr1, $a1, 0 vreplgr2vr.d $vr0, $a2 - vilvh.h $vr2, $vr1, $vr1 - vilvl.w $vr3, $vr2, $vr2 - vslli.d $vr3, $vr3, 48 - vsrai.d $vr3, $vr3, 48 - vilvh.w $vr2, $vr2, $vr2 - vslli.d $vr2, $vr2, 48 - vsrai.d $vr2, $vr2, 48 - vilvl.h $vr1, $vr1, $vr1 - vilvl.w $vr4, $vr1, $vr1 - vslli.d $vr4, $vr4, 48 - vsrai.d $vr4, $vr4, 48 - vilvh.w $vr1, $vr1, $vr1 - vslli.d $vr1, $vr1, 48 - vsrai.d $vr5, $vr1, 48 + vbsrl.v $vr2, $vr1, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.d.w $vr2, $vr2, 0 + vbsrl.v $vr3, $vr1, 12 + vsllwil.w.h $vr3, $vr3, 0 + vsllwil.d.w $vr3, $vr3, 0 + vsllwil.w.h $vr4, $vr1, 0 + vsllwil.d.w $vr4, $vr4, 0 + vshuf4i.h $vr1, $vr1, 14 + vsllwil.w.h $vr1, $vr1, 0 + vsllwil.d.w $vr5, $vr1, 0 ori $a1, $zero, 0 lu32i.d $a1, 32768 vreplgr2vr.d $vr1, $a1 @@ -1027,10 +894,10 @@ Gsm_Long_Term_Synthesis_Filtering: # @Gsm_Long_Term_Synthesis_Filtering vori.b $vr5, $vr1, 0 vmadd.d $vr5, $vr0, $vr4 vori.b $vr4, $vr1, 0 - vmadd.d $vr4, $vr0, $vr2 - vori.b $vr2, $vr1, 0 - vmadd.d $vr2, $vr0, $vr3 - vsrli.d $vr2, $vr2, 48 + vmadd.d $vr4, $vr0, $vr3 + vori.b $vr3, $vr1, 0 + vmadd.d $vr3, $vr0, $vr2 + vsrli.d $vr2, $vr3, 48 vsrli.d $vr3, $vr4, 48 vsrli.d $vr4, $vr5, 48 vsrli.d $vr5, $vr7, 48 @@ -1043,30 +910,27 @@ Gsm_Long_Term_Synthesis_Filtering: # @Gsm_Long_Term_Synthesis_Filtering sub.d $a1, $a1, $a0 slli.d $a1, $a1, 1 vldx $vr2, $a4, $a1 - vilvh.h $vr3, $vr2, $vr2 - vilvl.w $vr4, $vr3, $vr3 - vslli.d $vr4, $vr4, 48 - vsrai.d $vr4, $vr4, 48 - vilvh.w $vr3, $vr3, $vr3 - vslli.d $vr3, $vr3, 48 - vsrai.d $vr3, $vr3, 48 - vilvl.h $vr2, $vr2, $vr2 - vilvl.w $vr5, $vr2, $vr2 - vslli.d $vr5, $vr5, 48 - vsrai.d $vr5, $vr5, 48 - vilvh.w $vr2, $vr2, $vr2 - vslli.d $vr2, $vr2, 48 - vsrai.d $vr2, $vr2, 48 + vbsrl.v $vr3, $vr2, 8 + vsllwil.w.h $vr3, $vr3, 0 + vsllwil.d.w $vr3, $vr3, 0 + vbsrl.v $vr4, $vr2, 12 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.d.w $vr4, $vr4, 0 + vsllwil.w.h $vr5, $vr2, 0 + vsllwil.d.w $vr5, $vr5, 0 + vshuf4i.h $vr2, $vr2, 14 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.d.w $vr2, $vr2, 0 vld $vr6, $a3, 16 vori.b $vr7, $vr1, 0 vmadd.d $vr7, $vr0, $vr2 vori.b $vr2, $vr1, 0 vmadd.d $vr2, $vr0, $vr5 vori.b $vr5, $vr1, 0 - vmadd.d $vr5, $vr0, $vr3 - vori.b $vr3, $vr1, 0 - vmadd.d $vr3, $vr0, $vr4 - vsrli.d $vr3, $vr3, 48 + vmadd.d $vr5, $vr0, $vr4 + vori.b $vr4, $vr1, 0 + vmadd.d $vr4, $vr0, $vr3 + vsrli.d $vr3, $vr4, 48 vsrli.d $vr4, $vr5, 48 vsrli.d $vr2, $vr2, 48 vsrli.d $vr5, $vr7, 48 @@ -1079,30 +943,27 @@ 
Gsm_Long_Term_Synthesis_Filtering: # @Gsm_Long_Term_Synthesis_Filtering sub.d $a1, $a1, $a0 slli.d $a1, $a1, 1 vldx $vr2, $a4, $a1 - vilvh.h $vr3, $vr2, $vr2 - vilvl.w $vr4, $vr3, $vr3 - vslli.d $vr4, $vr4, 48 - vsrai.d $vr4, $vr4, 48 - vilvh.w $vr3, $vr3, $vr3 - vslli.d $vr3, $vr3, 48 - vsrai.d $vr3, $vr3, 48 - vilvl.h $vr2, $vr2, $vr2 - vilvl.w $vr5, $vr2, $vr2 - vslli.d $vr5, $vr5, 48 - vsrai.d $vr5, $vr5, 48 - vilvh.w $vr2, $vr2, $vr2 - vslli.d $vr2, $vr2, 48 - vsrai.d $vr2, $vr2, 48 + vbsrl.v $vr3, $vr2, 8 + vsllwil.w.h $vr3, $vr3, 0 + vsllwil.d.w $vr3, $vr3, 0 + vbsrl.v $vr4, $vr2, 12 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.d.w $vr4, $vr4, 0 + vsllwil.w.h $vr5, $vr2, 0 + vsllwil.d.w $vr5, $vr5, 0 + vshuf4i.h $vr2, $vr2, 14 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.d.w $vr2, $vr2, 0 vld $vr6, $a3, 32 vori.b $vr7, $vr1, 0 vmadd.d $vr7, $vr0, $vr2 vori.b $vr2, $vr1, 0 vmadd.d $vr2, $vr0, $vr5 vori.b $vr5, $vr1, 0 - vmadd.d $vr5, $vr0, $vr3 - vori.b $vr3, $vr1, 0 - vmadd.d $vr3, $vr0, $vr4 - vsrli.d $vr3, $vr3, 48 + vmadd.d $vr5, $vr0, $vr4 + vori.b $vr4, $vr1, 0 + vmadd.d $vr4, $vr0, $vr3 + vsrli.d $vr3, $vr4, 48 vsrli.d $vr4, $vr5, 48 vsrli.d $vr2, $vr2, 48 vsrli.d $vr5, $vr7, 48 @@ -1115,30 +976,27 @@ Gsm_Long_Term_Synthesis_Filtering: # @Gsm_Long_Term_Synthesis_Filtering sub.d $a1, $a1, $a0 slli.d $a1, $a1, 1 vldx $vr2, $a4, $a1 - vilvh.h $vr3, $vr2, $vr2 - vilvl.w $vr4, $vr3, $vr3 - vslli.d $vr4, $vr4, 48 - vsrai.d $vr4, $vr4, 48 - vilvh.w $vr3, $vr3, $vr3 - vslli.d $vr3, $vr3, 48 - vsrai.d $vr3, $vr3, 48 - vilvl.h $vr2, $vr2, $vr2 - vilvl.w $vr5, $vr2, $vr2 - vslli.d $vr5, $vr5, 48 - vsrai.d $vr5, $vr5, 48 - vilvh.w $vr2, $vr2, $vr2 - vslli.d $vr2, $vr2, 48 - vsrai.d $vr2, $vr2, 48 + vbsrl.v $vr3, $vr2, 8 + vsllwil.w.h $vr3, $vr3, 0 + vsllwil.d.w $vr3, $vr3, 0 + vbsrl.v $vr4, $vr2, 12 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.d.w $vr4, $vr4, 0 + vsllwil.w.h $vr5, $vr2, 0 + vsllwil.d.w $vr5, $vr5, 0 + vshuf4i.h $vr2, $vr2, 14 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.d.w $vr2, $vr2, 0 vld $vr6, $a3, 48 vori.b $vr7, $vr1, 0 vmadd.d $vr7, $vr0, $vr2 vori.b $vr2, $vr1, 0 vmadd.d $vr2, $vr0, $vr5 vori.b $vr5, $vr1, 0 - vmadd.d $vr5, $vr0, $vr3 - vori.b $vr3, $vr1, 0 - vmadd.d $vr3, $vr0, $vr4 - vsrli.d $vr3, $vr3, 48 + vmadd.d $vr5, $vr0, $vr4 + vori.b $vr4, $vr1, 0 + vmadd.d $vr4, $vr0, $vr3 + vsrli.d $vr3, $vr4, 48 vsrli.d $vr4, $vr5, 48 vsrli.d $vr2, $vr2, 48 vsrli.d $vr5, $vr7, 48 @@ -1151,28 +1009,25 @@ Gsm_Long_Term_Synthesis_Filtering: # @Gsm_Long_Term_Synthesis_Filtering sub.d $a0, $a1, $a0 slli.d $a0, $a0, 1 vldx $vr2, $a4, $a0 - vilvh.h $vr3, $vr2, $vr2 - vilvl.w $vr4, $vr3, $vr3 - vslli.d $vr4, $vr4, 48 - vsrai.d $vr4, $vr4, 48 - vilvh.w $vr3, $vr3, $vr3 - vslli.d $vr3, $vr3, 48 - vsrai.d $vr3, $vr3, 48 - vilvl.h $vr2, $vr2, $vr2 - vilvl.w $vr5, $vr2, $vr2 - vslli.d $vr5, $vr5, 48 - vsrai.d $vr5, $vr5, 48 - vilvh.w $vr2, $vr2, $vr2 - vslli.d $vr2, $vr2, 48 - vsrai.d $vr2, $vr2, 48 + vbsrl.v $vr3, $vr2, 8 + vsllwil.w.h $vr3, $vr3, 0 + vsllwil.d.w $vr3, $vr3, 0 + vbsrl.v $vr4, $vr2, 12 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.d.w $vr4, $vr4, 0 + vsllwil.w.h $vr5, $vr2, 0 + vsllwil.d.w $vr5, $vr5, 0 + vshuf4i.h $vr2, $vr2, 14 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.d.w $vr2, $vr2, 0 vld $vr6, $a3, 64 vori.b $vr7, $vr1, 0 vmadd.d $vr7, $vr0, $vr2 vori.b $vr2, $vr1, 0 vmadd.d $vr2, $vr0, $vr5 vori.b $vr5, $vr1, 0 - vmadd.d $vr5, $vr0, $vr3 - vmadd.d $vr1, $vr0, $vr4 + vmadd.d $vr5, $vr0, $vr4 + vmadd.d $vr1, $vr0, $vr3 vsrli.d $vr0, $vr1, 48 vsrli.d $vr1, $vr5, 48 vsrli.d $vr2, $vr2, 48 diff --git 
a/results/MultiSource/Benchmarks/MiBench/telecomm-gsm/CMakeFiles/telecomm-gsm.dir/lpc.s b/results/MultiSource/Benchmarks/MiBench/telecomm-gsm/CMakeFiles/telecomm-gsm.dir/lpc.s index 4a7dfb56..aea624e8 100644 --- a/results/MultiSource/Benchmarks/MiBench/telecomm-gsm/CMakeFiles/telecomm-gsm.dir/lpc.s +++ b/results/MultiSource/Benchmarks/MiBench/telecomm-gsm/CMakeFiles/telecomm-gsm.dir/lpc.s @@ -3,10 +3,10 @@ .p2align 4, 0x0 # -- Begin function Gsm_LPC_Analysis .LCPI0_0: .half 1 # 0x1 + .half 8 # 0x8 .half 65535 # 0xffff .half 65535 # 0xffff .half 65535 # 0xffff - .half 8 # 0x8 .half 65535 # 0xffff .half 65535 # 0xffff .half 65535 # 0xffff @@ -260,18 +260,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis .LBB0_4: # %vector.body108 vld $vr0, $s0, 0 vld $vr1, $s0, 16 - vilvh.h $vr2, $vr0, $vr0 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr0, $vr0, $vr0 - vslli.w $vr0, $vr0, 16 - vsrai.w $vr0, $vr0, 16 - vilvh.h $vr3, $vr1, $vr1 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr0, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr0, $vr0, 0 + vbsrl.v $vr3, $vr1, 8 + vsllwil.w.h $vr3, $vr3, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr4, $vr0, 11 vslli.w $vr2, $vr2, 11 vslli.w $vr1, $vr1, 11 @@ -292,18 +286,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vst $vr2, $s0, 0 vst $vr1, $s0, 16 vld $vr1, $s0, 48 - vilvh.h $vr2, $vr3, $vr3 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr4, $vr1, $vr1 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr3, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vbsrl.v $vr4, $vr1, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr3, $vr3, 11 vslli.w $vr2, $vr2, 11 vslli.w $vr1, $vr1, 11 @@ -322,18 +310,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vst $vr2, $s0, 32 vst $vr1, $s0, 48 vld $vr1, $s0, 80 - vilvh.h $vr2, $vr3, $vr3 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr4, $vr1, $vr1 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr3, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vbsrl.v $vr4, $vr1, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr3, $vr3, 11 vslli.w $vr2, $vr2, 11 vslli.w $vr1, $vr1, 11 @@ -352,18 +334,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vst $vr2, $s0, 64 vst $vr1, $s0, 80 vld $vr1, $s0, 112 - vilvh.h $vr2, $vr3, $vr3 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr4, $vr1, $vr1 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr3, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vbsrl.v $vr4, $vr1, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr3, $vr3, 11 vslli.w $vr2, $vr2, 11 vslli.w $vr1, $vr1, 11 @@ -382,18 +358,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vst $vr2, $s0, 96 vst $vr1, $s0, 112 vld $vr1, $s0, 144 - vilvh.h $vr2, $vr3, $vr3 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr4, $vr1, $vr1 
- vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr3, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vbsrl.v $vr4, $vr1, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr3, $vr3, 11 vslli.w $vr2, $vr2, 11 vslli.w $vr1, $vr1, 11 @@ -412,18 +382,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vst $vr2, $s0, 128 vst $vr1, $s0, 144 vld $vr1, $s0, 176 - vilvh.h $vr2, $vr3, $vr3 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr4, $vr1, $vr1 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr3, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vbsrl.v $vr4, $vr1, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr3, $vr3, 11 vslli.w $vr2, $vr2, 11 vslli.w $vr1, $vr1, 11 @@ -442,18 +406,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vst $vr2, $s0, 160 vst $vr1, $s0, 176 vld $vr1, $s0, 208 - vilvh.h $vr2, $vr3, $vr3 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr4, $vr1, $vr1 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr3, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vbsrl.v $vr4, $vr1, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr3, $vr3, 11 vslli.w $vr2, $vr2, 11 vslli.w $vr1, $vr1, 11 @@ -472,18 +430,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vst $vr2, $s0, 192 vst $vr1, $s0, 208 vld $vr1, $s0, 240 - vilvh.h $vr2, $vr3, $vr3 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr4, $vr1, $vr1 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr3, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vbsrl.v $vr4, $vr1, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr3, $vr3, 11 vslli.w $vr2, $vr2, 11 vslli.w $vr1, $vr1, 11 @@ -502,18 +454,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vst $vr2, $s0, 224 vst $vr1, $s0, 240 vld $vr1, $s0, 272 - vilvh.h $vr2, $vr3, $vr3 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr4, $vr1, $vr1 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr3, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vbsrl.v $vr4, $vr1, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr3, $vr3, 11 vslli.w $vr2, $vr2, 11 vslli.w $vr1, $vr1, 11 @@ -532,18 +478,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vst $vr2, $s0, 256 vst $vr1, $s0, 272 vld $vr1, $s0, 304 - vilvh.h $vr2, $vr3, $vr3 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr4, $vr1, $vr1 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr3, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vbsrl.v $vr4, $vr1, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr1, $vr1, 
0 vslli.w $vr3, $vr3, 11 vslli.w $vr2, $vr2, 11 vslli.w $vr1, $vr1, 11 @@ -559,18 +499,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis .LBB0_7: # %vector.body124 vld $vr0, $s0, 0 vld $vr1, $s0, 16 - vilvh.h $vr2, $vr0, $vr0 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr0, $vr0, $vr0 - vslli.w $vr0, $vr0, 16 - vsrai.w $vr0, $vr0, 16 - vilvh.h $vr3, $vr1, $vr1 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr0, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr0, $vr0, 0 + vbsrl.v $vr3, $vr1, 8 + vsllwil.w.h $vr3, $vr3, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr4, $vr0, 13 vslli.w $vr2, $vr2, 13 vslli.w $vr1, $vr1, 13 @@ -591,18 +525,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vst $vr2, $s0, 0 vst $vr1, $s0, 16 vld $vr1, $s0, 48 - vilvh.h $vr2, $vr3, $vr3 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr4, $vr1, $vr1 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr3, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vbsrl.v $vr4, $vr1, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr3, $vr3, 13 vslli.w $vr2, $vr2, 13 vslli.w $vr1, $vr1, 13 @@ -621,18 +549,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vst $vr2, $s0, 32 vst $vr1, $s0, 48 vld $vr1, $s0, 80 - vilvh.h $vr2, $vr3, $vr3 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr4, $vr1, $vr1 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr3, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vbsrl.v $vr4, $vr1, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr3, $vr3, 13 vslli.w $vr2, $vr2, 13 vslli.w $vr1, $vr1, 13 @@ -651,18 +573,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vst $vr2, $s0, 64 vst $vr1, $s0, 80 vld $vr1, $s0, 112 - vilvh.h $vr2, $vr3, $vr3 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr4, $vr1, $vr1 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr3, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vbsrl.v $vr4, $vr1, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr3, $vr3, 13 vslli.w $vr2, $vr2, 13 vslli.w $vr1, $vr1, 13 @@ -681,18 +597,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vst $vr2, $s0, 96 vst $vr1, $s0, 112 vld $vr1, $s0, 144 - vilvh.h $vr2, $vr3, $vr3 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr4, $vr1, $vr1 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr3, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vbsrl.v $vr4, $vr1, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr3, $vr3, 13 vslli.w $vr2, $vr2, 13 vslli.w $vr1, $vr1, 13 @@ -711,18 +621,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vst $vr2, $s0, 128 vst $vr1, $s0, 144 vld $vr1, $s0, 176 - vilvh.h $vr2, $vr3, $vr3 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr3, $vr3, 
$vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr4, $vr1, $vr1 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr3, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vbsrl.v $vr4, $vr1, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr3, $vr3, 13 vslli.w $vr2, $vr2, 13 vslli.w $vr1, $vr1, 13 @@ -741,18 +645,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vst $vr2, $s0, 160 vst $vr1, $s0, 176 vld $vr1, $s0, 208 - vilvh.h $vr2, $vr3, $vr3 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr4, $vr1, $vr1 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr3, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vbsrl.v $vr4, $vr1, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr3, $vr3, 13 vslli.w $vr2, $vr2, 13 vslli.w $vr1, $vr1, 13 @@ -771,18 +669,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vst $vr2, $s0, 192 vst $vr1, $s0, 208 vld $vr1, $s0, 240 - vilvh.h $vr2, $vr3, $vr3 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr4, $vr1, $vr1 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr3, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vbsrl.v $vr4, $vr1, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr3, $vr3, 13 vslli.w $vr2, $vr2, 13 vslli.w $vr1, $vr1, 13 @@ -801,18 +693,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vst $vr2, $s0, 224 vst $vr1, $s0, 240 vld $vr1, $s0, 272 - vilvh.h $vr2, $vr3, $vr3 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr4, $vr1, $vr1 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr3, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vbsrl.v $vr4, $vr1, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr3, $vr3, 13 vslli.w $vr2, $vr2, 13 vslli.w $vr1, $vr1, 13 @@ -831,18 +717,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vst $vr2, $s0, 256 vst $vr1, $s0, 272 vld $vr1, $s0, 304 - vilvh.h $vr2, $vr3, $vr3 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr4, $vr1, $vr1 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr3, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vbsrl.v $vr4, $vr1, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr3, $vr3, 13 vslli.w $vr2, $vr2, 13 vslli.w $vr1, $vr1, 13 @@ -851,18 +731,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis .LBB0_8: # %vector.body132 vld $vr0, $s0, 0 vld $vr1, $s0, 16 - vilvh.h $vr2, $vr0, $vr0 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr0, $vr0, $vr0 - vslli.w $vr0, $vr0, 16 - vsrai.w $vr0, $vr0, 16 - vilvh.h $vr3, $vr1, $vr1 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr0, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h 
$vr0, $vr0, 0 + vbsrl.v $vr3, $vr1, 8 + vsllwil.w.h $vr3, $vr3, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr4, $vr0, 14 vslli.w $vr2, $vr2, 14 vslli.w $vr1, $vr1, 14 @@ -883,18 +757,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vst $vr2, $s0, 0 vst $vr1, $s0, 16 vld $vr1, $s0, 48 - vilvh.h $vr2, $vr3, $vr3 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr4, $vr1, $vr1 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr3, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vbsrl.v $vr4, $vr1, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr3, $vr3, 14 vslli.w $vr2, $vr2, 14 vslli.w $vr1, $vr1, 14 @@ -913,18 +781,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vst $vr2, $s0, 32 vst $vr1, $s0, 48 vld $vr1, $s0, 80 - vilvh.h $vr2, $vr3, $vr3 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr4, $vr1, $vr1 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr3, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vbsrl.v $vr4, $vr1, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr3, $vr3, 14 vslli.w $vr2, $vr2, 14 vslli.w $vr1, $vr1, 14 @@ -943,18 +805,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vst $vr2, $s0, 64 vst $vr1, $s0, 80 vld $vr1, $s0, 112 - vilvh.h $vr2, $vr3, $vr3 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr4, $vr1, $vr1 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr3, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vbsrl.v $vr4, $vr1, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr3, $vr3, 14 vslli.w $vr2, $vr2, 14 vslli.w $vr1, $vr1, 14 @@ -973,18 +829,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vst $vr2, $s0, 96 vst $vr1, $s0, 112 vld $vr1, $s0, 144 - vilvh.h $vr2, $vr3, $vr3 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr4, $vr1, $vr1 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr3, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vbsrl.v $vr4, $vr1, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr3, $vr3, 14 vslli.w $vr2, $vr2, 14 vslli.w $vr1, $vr1, 14 @@ -1003,18 +853,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vst $vr2, $s0, 128 vst $vr1, $s0, 144 vld $vr1, $s0, 176 - vilvh.h $vr2, $vr3, $vr3 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr4, $vr1, $vr1 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr3, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vbsrl.v $vr4, $vr1, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr3, $vr3, 14 vslli.w $vr2, $vr2, 14 vslli.w $vr1, $vr1, 14 @@ -1033,18 +877,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vst $vr2, $s0, 160 vst $vr1, $s0, 176 vld $vr1, $s0, 208 - vilvh.h 
$vr2, $vr3, $vr3 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr4, $vr1, $vr1 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr3, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vbsrl.v $vr4, $vr1, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr3, $vr3, 14 vslli.w $vr2, $vr2, 14 vslli.w $vr1, $vr1, 14 @@ -1063,18 +901,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vst $vr2, $s0, 192 vst $vr1, $s0, 208 vld $vr1, $s0, 240 - vilvh.h $vr2, $vr3, $vr3 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr4, $vr1, $vr1 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr3, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vbsrl.v $vr4, $vr1, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr3, $vr3, 14 vslli.w $vr2, $vr2, 14 vslli.w $vr1, $vr1, 14 @@ -1093,18 +925,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vst $vr2, $s0, 224 vst $vr1, $s0, 240 vld $vr1, $s0, 272 - vilvh.h $vr2, $vr3, $vr3 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr4, $vr1, $vr1 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr3, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vbsrl.v $vr4, $vr1, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr3, $vr3, 14 vslli.w $vr2, $vr2, 14 vslli.w $vr1, $vr1, 14 @@ -1123,18 +949,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vst $vr2, $s0, 256 vst $vr1, $s0, 272 vld $vr1, $s0, 304 - vilvh.h $vr2, $vr3, $vr3 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr4, $vr1, $vr1 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr3, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vbsrl.v $vr4, $vr1, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr3, $vr3, 14 vslli.w $vr2, $vr2, 14 vslli.w $vr1, $vr1, 14 @@ -1143,18 +963,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis .LBB0_9: # %vector.body116 vld $vr0, $s0, 0 vld $vr1, $s0, 16 - vilvh.h $vr2, $vr0, $vr0 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr0, $vr0, $vr0 - vslli.w $vr0, $vr0, 16 - vsrai.w $vr0, $vr0, 16 - vilvh.h $vr3, $vr1, $vr1 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr0, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr0, $vr0, 0 + vbsrl.v $vr3, $vr1, 8 + vsllwil.w.h $vr3, $vr3, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr4, $vr0, 12 vslli.w $vr2, $vr2, 12 vslli.w $vr1, $vr1, 12 @@ -1175,18 +989,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vst $vr2, $s0, 0 vst $vr1, $s0, 16 vld $vr1, $s0, 48 - vilvh.h $vr2, $vr3, $vr3 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr4, $vr1, $vr1 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - 
vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr3, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vbsrl.v $vr4, $vr1, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr3, $vr3, 12 vslli.w $vr2, $vr2, 12 vslli.w $vr1, $vr1, 12 @@ -1205,18 +1013,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vst $vr2, $s0, 32 vst $vr1, $s0, 48 vld $vr1, $s0, 80 - vilvh.h $vr2, $vr3, $vr3 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr4, $vr1, $vr1 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr3, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vbsrl.v $vr4, $vr1, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr3, $vr3, 12 vslli.w $vr2, $vr2, 12 vslli.w $vr1, $vr1, 12 @@ -1235,18 +1037,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vst $vr2, $s0, 64 vst $vr1, $s0, 80 vld $vr1, $s0, 112 - vilvh.h $vr2, $vr3, $vr3 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr4, $vr1, $vr1 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr3, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vbsrl.v $vr4, $vr1, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr3, $vr3, 12 vslli.w $vr2, $vr2, 12 vslli.w $vr1, $vr1, 12 @@ -1265,18 +1061,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vst $vr2, $s0, 96 vst $vr1, $s0, 112 vld $vr1, $s0, 144 - vilvh.h $vr2, $vr3, $vr3 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr4, $vr1, $vr1 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr3, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vbsrl.v $vr4, $vr1, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr3, $vr3, 12 vslli.w $vr2, $vr2, 12 vslli.w $vr1, $vr1, 12 @@ -1295,18 +1085,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vst $vr2, $s0, 128 vst $vr1, $s0, 144 vld $vr1, $s0, 176 - vilvh.h $vr2, $vr3, $vr3 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr4, $vr1, $vr1 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr3, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vbsrl.v $vr4, $vr1, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr3, $vr3, 12 vslli.w $vr2, $vr2, 12 vslli.w $vr1, $vr1, 12 @@ -1325,18 +1109,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vst $vr2, $s0, 160 vst $vr1, $s0, 176 vld $vr1, $s0, 208 - vilvh.h $vr2, $vr3, $vr3 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr4, $vr1, $vr1 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr3, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vbsrl.v $vr4, $vr1, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr3, $vr3, 12 vslli.w $vr2, $vr2, 12 vslli.w $vr1, $vr1, 12 @@ -1355,18 +1133,12 @@ 
Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vst $vr2, $s0, 192 vst $vr1, $s0, 208 vld $vr1, $s0, 240 - vilvh.h $vr2, $vr3, $vr3 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr4, $vr1, $vr1 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr3, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vbsrl.v $vr4, $vr1, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr3, $vr3, 12 vslli.w $vr2, $vr2, 12 vslli.w $vr1, $vr1, 12 @@ -1385,18 +1157,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vst $vr2, $s0, 224 vst $vr1, $s0, 240 vld $vr1, $s0, 272 - vilvh.h $vr2, $vr3, $vr3 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr4, $vr1, $vr1 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr3, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vbsrl.v $vr4, $vr1, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr3, $vr3, 12 vslli.w $vr2, $vr2, 12 vslli.w $vr1, $vr1, 12 @@ -1415,18 +1181,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vst $vr2, $s0, 256 vst $vr1, $s0, 272 vld $vr1, $s0, 304 - vilvh.h $vr2, $vr3, $vr3 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr4, $vr1, $vr1 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr3, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vbsrl.v $vr4, $vr1, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr3, $vr3, 12 vslli.w $vr2, $vr2, 12 vslli.w $vr1, $vr1, 12 @@ -1547,91 +1307,71 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis add.d $a4, $s0, $a0 ld.w $a5, $a4, 16 ld.w $a6, $a4, 20 - vinsgr2vr.w $vr22, $a5, 0 - vori.b $vr23, $vr18, 0 - vshuf.h $vr23, $vr22, $vr19 + vinsgr2vr.w $vr20, $a5, 0 + vori.b $vr22, $vr18, 0 + vshuf.h $vr22, $vr20, $vr19 vinsgr2vr.w $vr19, $a6, 0 - vilvl.h $vr20, $vr22, $vr22 - vilvl.w $vr20, $vr20, $vr20 - vslli.d $vr20, $vr20, 48 - vsrai.d $vr20, $vr20, 48 - vilvl.h $vr21, $vr19, $vr19 - vilvl.w $vr21, $vr21, $vr21 - vslli.d $vr21, $vr21, 48 - vsrai.d $vr21, $vr21, 48 - vmadd.d $vr17, $vr20, $vr20 - vmadd.d $vr16, $vr21, $vr21 - vslli.d $vr23, $vr23, 48 - vsrai.d $vr23, $vr23, 48 - vori.b $vr24, $vr18, 0 - vshuf.h $vr24, $vr19, $vr22 - vslli.d $vr22, $vr24, 48 + vori.b $vr23, $vr18, 0 + vshuf.h $vr23, $vr19, $vr20 + vsllwil.w.h $vr20, $vr20, 0 + vsllwil.d.w $vr21, $vr20, 0 + vsllwil.w.h $vr20, $vr19, 0 + vsllwil.d.w $vr20, $vr20, 0 + vmadd.d $vr17, $vr21, $vr21 + vmadd.d $vr16, $vr20, $vr20 + vsllwil.w.h $vr22, $vr22, 0 ld.w $a5, $a4, 12 - vsrai.d $vr22, $vr22, 48 - vmadd.d $vr0, $vr20, $vr23 - vmadd.d $vr7, $vr21, $vr22 - vinsgr2vr.w $vr22, $a5, 0 + vsllwil.d.w $vr22, $vr22, 0 + vsllwil.w.h $vr23, $vr23, 0 + vsllwil.d.w $vr23, $vr23, 0 + vinsgr2vr.w $vr24, $a5, 0 ld.w $a5, $a4, 10 - vilvl.h $vr22, $vr22, $vr22 - vilvl.w $vr22, $vr22, $vr22 - vslli.d $vr22, $vr22, 48 + vmadd.d $vr0, $vr21, $vr22 + vmadd.d $vr7, $vr20, $vr23 + vsllwil.w.h $vr22, $vr24, 0 vinsgr2vr.w $vr23, $a5, 0 ld.w $a5, $a4, 14 - vsrai.d $vr22, $vr22, 48 - vmadd.d $vr1, $vr22, $vr20 - vmadd.d $vr8, $vr20, $vr21 + vsllwil.d.w $vr22, $vr22, 0 + vmadd.d 
$vr1, $vr22, $vr21 + vmadd.d $vr8, $vr21, $vr20 vinsgr2vr.w $vr24, $a5, 0 - vilvl.h $vr23, $vr23, $vr23 - vilvl.w $vr23, $vr23, $vr23 - vslli.d $vr23, $vr23, 48 - vsrai.d $vr23, $vr23, 48 - vilvl.h $vr24, $vr24, $vr24 - vilvl.w $vr24, $vr24, $vr24 - vslli.d $vr24, $vr24, 48 + vsllwil.w.h $vr23, $vr23, 0 + vsllwil.d.w $vr23, $vr23, 0 + vsllwil.w.h $vr24, $vr24, 0 ld.w $a5, $a4, 8 - vsrai.d $vr24, $vr24, 48 - vmadd.d $vr2, $vr23, $vr20 - vmadd.d $vr9, $vr24, $vr21 + vsllwil.d.w $vr24, $vr24, 0 + vmadd.d $vr2, $vr23, $vr21 + vmadd.d $vr9, $vr24, $vr20 vinsgr2vr.w $vr24, $a5, 0 - vilvl.h $vr24, $vr24, $vr24 - vilvl.w $vr24, $vr24, $vr24 - vslli.d $vr24, $vr24, 48 + vsllwil.w.h $vr24, $vr24, 0 ld.w $a5, $a4, 6 - vsrai.d $vr24, $vr24, 48 - vmadd.d $vr3, $vr24, $vr20 - vmadd.d $vr10, $vr22, $vr21 + vsllwil.d.w $vr24, $vr24, 0 + vmadd.d $vr3, $vr24, $vr21 + vmadd.d $vr10, $vr22, $vr20 vinsgr2vr.w $vr22, $a5, 0 - vilvl.h $vr22, $vr22, $vr22 - vilvl.w $vr22, $vr22, $vr22 - vslli.d $vr22, $vr22, 48 + vsllwil.w.h $vr22, $vr22, 0 ld.w $a5, $a4, 4 - vsrai.d $vr22, $vr22, 48 - vmadd.d $vr4, $vr22, $vr20 - vmadd.d $vr11, $vr23, $vr21 + vsllwil.d.w $vr22, $vr22, 0 + vmadd.d $vr4, $vr22, $vr21 + vmadd.d $vr11, $vr23, $vr20 vinsgr2vr.w $vr23, $a5, 0 - vilvl.h $vr23, $vr23, $vr23 - vilvl.w $vr23, $vr23, $vr23 - vslli.d $vr23, $vr23, 48 + vsllwil.w.h $vr23, $vr23, 0 ld.w $a4, $a4, 2 - vsrai.d $vr23, $vr23, 48 - vmadd.d $vr5, $vr23, $vr20 - vmadd.d $vr12, $vr24, $vr21 + vsllwil.d.w $vr23, $vr23, 0 + vmadd.d $vr5, $vr23, $vr21 + vmadd.d $vr12, $vr24, $vr20 vinsgr2vr.w $vr24, $a4, 0 - vilvl.h $vr24, $vr24, $vr24 - vilvl.w $vr24, $vr24, $vr24 - vslli.d $vr24, $vr24, 48 + vsllwil.w.h $vr24, $vr24, 0 ldx.w $a4, $s0, $a0 - vsrai.d $vr24, $vr24, 48 - vmadd.d $vr6, $vr24, $vr20 - vmadd.d $vr13, $vr22, $vr21 + vsllwil.d.w $vr24, $vr24, 0 + vmadd.d $vr6, $vr24, $vr21 + vmadd.d $vr13, $vr22, $vr20 vinsgr2vr.w $vr22, $a4, 0 - vilvl.h $vr22, $vr22, $vr22 - vilvl.w $vr22, $vr22, $vr22 - vslli.d $vr22, $vr22, 48 - vsrai.d $vr22, $vr22, 48 - vmadd.d $vr14, $vr22, $vr20 + vsllwil.w.h $vr22, $vr22, 0 + vsllwil.d.w $vr22, $vr22, 0 + vmadd.d $vr14, $vr22, $vr21 addi.d $a0, $a0, 8 - vmadd.d $vr15, $vr23, $vr21 + vmadd.d $vr15, $vr23, $vr20 bne $a0, $a3, .LBB0_12 # %bb.13: # %middle.block177 vadd.d $vr16, $vr16, $vr17 @@ -2281,50 +2021,44 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vslli.h $vr4, $vr4, 15 vsrai.h $vr4, $vr4, 15 vbitsel.v $vr0, $vr0, $vr1, $vr4 - vilvl.h $vr1, $vr3, $vr3 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vsllwil.w.h $vr1, $vr3, 0 lu12i.w $a0, 10 vreplgr2vr.w $vr3, $a0 vmul.w $vr1, $vr1, $vr3 - vilvl.h $vr2, $vr2, $vr2 - vilvl.w $vr2, $vr2, $vr2 - vslli.d $vr2, $vr2, 48 - vsrai.d $vr2, $vr2, 48 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.d.w $vr2, $vr2, 0 pcalau12i $a0, %pc_hi20(.LCPI0_1) vld $vr3, $a0, %pc_lo12(.LCPI0_1) ori $a0, $zero, 0 lu32i.d $a0, 40960 vreplgr2vr.d $vr4, $a0 vmadd.d $vr3, $vr2, $vr4 - vsrli.d $vr2, $vr3, 32 - vilvl.h $vr0, $vr0, $vr0 - vilvh.w $vr3, $vr0, $vr0 - vslli.d $vr3, $vr3, 48 + vshuf4i.h $vr2, $vr0, 14 + vsllwil.w.h $vr2, $vr2, 0 pcalau12i $a0, %pc_hi20(.LCPI0_3) vld $vr4, $a0, %pc_lo12(.LCPI0_3) pcalau12i $a0, %pc_hi20(.LCPI0_4) vld $vr5, $a0, %pc_lo12(.LCPI0_4) - vsrai.d $vr3, $vr3, 48 + vsllwil.d.w $vr2, $vr2, 0 pcalau12i $a0, %pc_hi20(.LCPI0_2) vld $vr6, $a0, %pc_lo12(.LCPI0_2) - vmadd.d $vr5, $vr3, $vr4 + vmadd.d $vr5, $vr2, $vr4 pcalau12i $a0, %pc_hi20(.LCPI0_5) - vld $vr3, $a0, %pc_lo12(.LCPI0_5) - vilvl.w $vr0, $vr0, $vr0 - vslli.d $vr0, $vr0, 48 - vsrai.d $vr0, $vr0, 48 
- vmadd.d $vr3, $vr0, $vr6 + vld $vr2, $a0, %pc_lo12(.LCPI0_5) + vsrli.d $vr3, $vr3, 32 + vsllwil.w.h $vr0, $vr0, 0 + vsllwil.d.w $vr0, $vr0, 0 + vmadd.d $vr2, $vr0, $vr6 pcalau12i $a0, %pc_hi20(.LCPI0_6) vld $vr0, $a0, %pc_lo12(.LCPI0_6) - vsrli.d $vr3, $vr3, 32 + vsrli.d $vr2, $vr2, 32 vsrli.d $vr4, $vr5, 32 - vpickev.w $vr3, $vr4, $vr3 - vshuf.w $vr0, $vr2, $vr1 + vpickev.w $vr2, $vr4, $vr2 + vshuf.w $vr0, $vr3, $vr1 lu12i.w $a0, 4096 vreplgr2vr.w $vr1, $a0 vadd.w $vr0, $vr0, $vr1 - vadd.w $vr1, $vr3, $vr1 + vadd.w $vr1, $vr2, $vr1 vsrai.w $vr1, $vr1, 25 vsrai.w $vr0, $vr0, 25 vshuf4i.w $vr2, $vr0, 14 diff --git a/results/MultiSource/Benchmarks/MiBench/telecomm-gsm/CMakeFiles/telecomm-gsm.dir/rpe.s b/results/MultiSource/Benchmarks/MiBench/telecomm-gsm/CMakeFiles/telecomm-gsm.dir/rpe.s index 7518c9bf..d714f3e4 100644 --- a/results/MultiSource/Benchmarks/MiBench/telecomm-gsm/CMakeFiles/telecomm-gsm.dir/rpe.s +++ b/results/MultiSource/Benchmarks/MiBench/telecomm-gsm/CMakeFiles/telecomm-gsm.dir/rpe.s @@ -1,462 +1,358 @@ .file "rpe.c" - .section .rodata.cst16,"aM",@progbits,16 - .p2align 4, 0x0 # -- Begin function Gsm_RPE_Encoding -.LCPI0_0: - .half 3 # 0x3 - .half 65535 # 0xffff - .half 65535 # 0xffff - .half 65535 # 0xffff - .half 4 # 0x4 - .half 65535 # 0xffff - .half 65535 # 0xffff - .half 65535 # 0xffff -.LCPI0_1: - .half 5 # 0x5 - .half 65535 # 0xffff - .half 65535 # 0xffff - .half 65535 # 0xffff - .half 6 # 0x6 - .half 65535 # 0xffff - .half 65535 # 0xffff - .half 65535 # 0xffff -.LCPI0_2: - .half 7 # 0x7 - .half 65535 # 0xffff - .half 65535 # 0xffff - .half 65535 # 0xffff - .half 8 # 0x8 - .half 65535 # 0xffff - .half 65535 # 0xffff - .half 65535 # 0xffff -.LCPI0_3: - .half 1 # 0x1 - .half 65535 # 0xffff - .half 65535 # 0xffff - .half 65535 # 0xffff - .half 2 # 0x2 - .half 65535 # 0xffff - .half 65535 # 0xffff - .half 65535 # 0xffff .text - .globl Gsm_RPE_Encoding + .globl Gsm_RPE_Encoding # -- Begin function Gsm_RPE_Encoding .p2align 5 .type Gsm_RPE_Encoding,@function Gsm_RPE_Encoding: # @Gsm_RPE_Encoding # %bb.0: # %vector.ph - addi.d $sp, $sp, -320 - st.d $ra, $sp, 312 # 8-byte Folded Spill - st.d $fp, $sp, 304 # 8-byte Folded Spill - st.d $s0, $sp, 296 # 8-byte Folded Spill - st.d $s1, $sp, 288 # 8-byte Folded Spill - st.d $s2, $sp, 280 # 8-byte Folded Spill - st.d $s3, $sp, 272 # 8-byte Folded Spill - st.d $s4, $sp, 264 # 8-byte Folded Spill - st.d $s5, $sp, 256 # 8-byte Folded Spill - st.d $s6, $sp, 248 # 8-byte Folded Spill - st.d $s7, $sp, 240 # 8-byte Folded Spill - st.d $s8, $sp, 232 # 8-byte Folded Spill - fst.d $fs0, $sp, 224 # 8-byte Folded Spill - fst.d $fs1, $sp, 216 # 8-byte Folded Spill - fst.d $fs2, $sp, 208 # 8-byte Folded Spill - fst.d $fs3, $sp, 200 # 8-byte Folded Spill - fst.d $fs4, $sp, 192 # 8-byte Folded Spill - fst.d $fs5, $sp, 184 # 8-byte Folded Spill - fst.d $fs6, $sp, 176 # 8-byte Folded Spill + addi.d $sp, $sp, -288 + st.d $ra, $sp, 280 # 8-byte Folded Spill + st.d $fp, $sp, 272 # 8-byte Folded Spill + st.d $s0, $sp, 264 # 8-byte Folded Spill + st.d $s1, $sp, 256 # 8-byte Folded Spill + st.d $s2, $sp, 248 # 8-byte Folded Spill + st.d $s3, $sp, 240 # 8-byte Folded Spill + st.d $s4, $sp, 232 # 8-byte Folded Spill + st.d $s5, $sp, 224 # 8-byte Folded Spill + st.d $s6, $sp, 216 # 8-byte Folded Spill + st.d $s7, $sp, 208 # 8-byte Folded Spill + st.d $s8, $sp, 200 # 8-byte Folded Spill + fst.d $fs0, $sp, 192 # 8-byte Folded Spill + fst.d $fs1, $sp, 184 # 8-byte Folded Spill move $fp, $a1 ld.h $a0, $a1, -10 - st.d $a4, $sp, 48 # 8-byte Folded Spill + st.d 
$a4, $sp, 56 # 8-byte Folded Spill move $t3, $a3 - st.d $a2, $sp, 40 # 8-byte Folded Spill + st.d $a2, $sp, 48 # 8-byte Folded Spill move $a2, $zero - vinsgr2vr.h $vr14, $a0, 7 - pcalau12i $a0, %pc_hi20(.LCPI0_0) - vld $vr0, $a0, %pc_lo12(.LCPI0_0) - pcalau12i $a0, %pc_hi20(.LCPI0_1) - vld $vr1, $a0, %pc_lo12(.LCPI0_1) - pcalau12i $a0, %pc_hi20(.LCPI0_2) - vld $vr2, $a0, %pc_lo12(.LCPI0_2) - pcalau12i $a0, %pc_hi20(.LCPI0_3) - vld $vr3, $a0, %pc_lo12(.LCPI0_3) + vinsgr2vr.h $vr8, $a0, 7 lu12i.w $a0, 1 ori $a1, $a0, 1645 - vreplgr2vr.d $vr4, $a1 + vreplgr2vr.d $vr0, $a1 ori $a1, $zero, 2054 - vreplgr2vr.d $vr5, $a1 - vrepli.d $vr6, -374 - vrepli.d $vr7, -134 - vreplgr2vr.d $vr8, $a0 + vreplgr2vr.d $vr1, $a1 + vrepli.d $vr2, -374 + vrepli.d $vr3, -134 + vreplgr2vr.d $vr4, $a0 lu12i.w $a1, -8 - vreplgr2vr.d $vr9, $a1 + vreplgr2vr.d $vr5, $a1 lu12i.w $a1, 7 ori $a1, $a1, 4095 - vreplgr2vr.d $vr10, $a1 - addi.d $a3, $sp, 96 + vreplgr2vr.d $vr6, $a1 + addi.d $a3, $sp, 104 ori $a4, $zero, 80 .p2align 4, , 16 .LBB0_1: # %vector.body # =>This Inner Loop Header: Depth=1 add.d $a5, $fp, $a2 - vld $vr11, $a5, -8 - vori.b $vr12, $vr0, 0 - vshuf.h $vr12, $vr0, $vr11 - vslli.d $vr12, $vr12, 48 - vsrai.d $vr12, $vr12, 48 - vori.b $vr13, $vr1, 0 - vshuf.h $vr13, $vr0, $vr11 - vslli.d $vr13, $vr13, 48 - vsrai.d $vr13, $vr13, 48 - vori.b $vr15, $vr2, 0 - vshuf.h $vr15, $vr11, $vr14 - vslli.d $vr14, $vr15, 48 - vsrai.d $vr14, $vr14, 48 - vori.b $vr15, $vr3, 0 - vshuf.h $vr15, $vr0, $vr11 - vslli.d $vr15, $vr15, 48 - vsrai.d $vr15, $vr15, 48 - vilvl.h $vr17, $vr11, $vr11 - vilvh.w $vr16, $vr17, $vr17 - vslli.d $vr16, $vr16, 48 - vsrai.d $vr16, $vr16, 48 - vilvl.w $vr17, $vr17, $vr17 - vslli.d $vr17, $vr17, 48 - vsrai.d $vr17, $vr17, 48 - vilvh.h $vr19, $vr11, $vr11 - vilvh.w $vr18, $vr19, $vr19 - vslli.d $vr18, $vr18, 48 - vsrai.d $vr18, $vr18, 48 - vld $vr22, $a5, -4 - vilvl.w $vr19, $vr19, $vr19 - vslli.d $vr19, $vr19, 48 - vsrai.d $vr19, $vr19, 48 - vilvh.h $vr21, $vr22, $vr22 - vilvl.w $vr20, $vr21, $vr21 - vslli.d $vr20, $vr20, 48 - vsrai.d $vr20, $vr20, 48 - vilvh.w $vr21, $vr21, $vr21 - vslli.d $vr21, $vr21, 48 - vsrai.d $vr21, $vr21, 48 - vilvl.h $vr23, $vr22, $vr22 - vilvl.w $vr22, $vr23, $vr23 - vslli.d $vr22, $vr22, 48 - vsrai.d $vr22, $vr22, 48 - vld $vr26, $a5, -2 - vilvh.w $vr23, $vr23, $vr23 - vslli.d $vr23, $vr23, 48 - vsrai.d $vr23, $vr23, 48 - vilvh.h $vr25, $vr26, $vr26 - vilvl.w $vr24, $vr25, $vr25 - vslli.d $vr24, $vr24, 48 - vsrai.d $vr24, $vr24, 48 - vilvh.w $vr25, $vr25, $vr25 - vslli.d $vr25, $vr25, 48 - vsrai.d $vr25, $vr25, 48 - vilvl.h $vr26, $vr26, $vr26 - vilvl.w $vr27, $vr26, $vr26 - vslli.d $vr27, $vr27, 48 - vsrai.d $vr27, $vr27, 48 - vld $vr28, $a5, 2 - vilvh.w $vr26, $vr26, $vr26 - vslli.d $vr26, $vr26, 48 - vsrai.d $vr26, $vr26, 48 - vilvh.h $vr29, $vr28, $vr28 - vilvl.h $vr28, $vr28, $vr28 - vilvh.w $vr30, $vr28, $vr28 - vslli.d $vr30, $vr30, 48 - vsrai.d $vr30, $vr30, 48 - vadd.d $vr26, $vr30, $vr26 - vilvl.w $vr30, $vr29, $vr29 - vslli.d $vr30, $vr30, 48 - vsrai.d $vr30, $vr30, 48 - vilvh.w $vr29, $vr29, $vr29 - vilvl.w $vr28, $vr28, $vr28 - vslli.d $vr28, $vr28, 48 - vsrai.d $vr28, $vr28, 48 - vadd.d $vr27, $vr28, $vr27 - vld $vr28, $a5, 4 - vslli.d $vr29, $vr29, 48 - vsrai.d $vr29, $vr29, 48 - vadd.d $vr25, $vr29, $vr25 - vilvh.h $vr29, $vr28, $vr28 - vilvl.h $vr28, $vr28, $vr28 - vadd.d $vr24, $vr30, $vr24 - vilvh.w $vr30, $vr28, $vr28 - vslli.d $vr30, $vr30, 48 - vsrai.d $vr30, $vr30, 48 - vadd.d $vr23, $vr30, $vr23 - vilvl.w $vr30, $vr29, $vr29 - vslli.d $vr30, 
$vr30, 48 - vsrai.d $vr30, $vr30, 48 - vilvh.w $vr29, $vr29, $vr29 - vilvl.w $vr28, $vr28, $vr28 - vslli.d $vr28, $vr28, 48 - vsrai.d $vr28, $vr28, 48 - vadd.d $vr22, $vr28, $vr22 - vld $vr28, $a5, 8 - vslli.d $vr29, $vr29, 48 - vsrai.d $vr29, $vr29, 48 - vadd.d $vr21, $vr29, $vr21 - vilvl.h $vr29, $vr28, $vr28 - vilvh.h $vr28, $vr28, $vr28 - vadd.d $vr20, $vr30, $vr20 - vilvl.w $vr30, $vr28, $vr28 - vslli.d $vr30, $vr30, 48 - vsrai.d $vr30, $vr30, 48 - vadd.d $vr19, $vr30, $vr19 - vilvh.w $vr30, $vr29, $vr29 - vslli.d $vr30, $vr30, 48 - vsrai.d $vr30, $vr30, 48 - vilvl.w $vr29, $vr29, $vr29 - vilvh.w $vr28, $vr28, $vr28 - vslli.d $vr28, $vr28, 48 - vsrai.d $vr28, $vr28, 48 - vadd.d $vr18, $vr28, $vr18 - vld $vr28, $a5, 10 - vslli.d $vr29, $vr29, 48 - vsrai.d $vr29, $vr29, 48 - vadd.d $vr17, $vr29, $vr17 - vilvl.h $vr29, $vr28, $vr28 - vadd.d $vr16, $vr30, $vr16 - vilvh.w $vr30, $vr29, $vr29 - vslli.d $vr30, $vr30, 48 - vsrai.d $vr30, $vr30, 48 - vadd.d $vr15, $vr30, $vr15 - vldx $vr30, $fp, $a2 - vilvh.h $vr28, $vr28, $vr28 - vilvl.w $vr29, $vr29, $vr29 - vslli.d $vr29, $vr29, 48 - vsrai.d $vr29, $vr29, 48 - vadd.d $vr14, $vr29, $vr14 - vilvh.w $vr29, $vr28, $vr28 - vslli.d $vr29, $vr29, 48 - vsrai.d $vr29, $vr29, 48 - vadd.d $vr13, $vr29, $vr13 - vilvh.h $vr29, $vr30, $vr30 - vilvl.w $vr28, $vr28, $vr28 - vslli.d $vr28, $vr28, 48 - vsrai.d $vr28, $vr28, 48 - vadd.d $vr12, $vr28, $vr12 - vilvl.w $vr28, $vr29, $vr29 - vslli.d $vr28, $vr28, 48 - vsrai.d $vr28, $vr28, 48 - vslli.d $vr28, $vr28, 13 - vmadd.d $vr28, $vr24, $vr4 - vmadd.d $vr28, $vr20, $vr5 - vilvh.w $vr20, $vr29, $vr29 - vslli.d $vr20, $vr20, 48 - vsrai.d $vr20, $vr20, 48 - vslli.d $vr20, $vr20, 13 - vmadd.d $vr20, $vr25, $vr4 - vilvl.h $vr24, $vr30, $vr30 - vmadd.d $vr20, $vr21, $vr5 - vilvl.w $vr21, $vr24, $vr24 - vslli.d $vr21, $vr21, 48 - vsrai.d $vr21, $vr21, 48 - vslli.d $vr21, $vr21, 13 - vmadd.d $vr21, $vr27, $vr4 - vmadd.d $vr21, $vr22, $vr5 - vilvh.w $vr22, $vr24, $vr24 - vslli.d $vr22, $vr22, 48 - vsrai.d $vr22, $vr22, 48 - vslli.d $vr22, $vr22, 13 - vmadd.d $vr22, $vr26, $vr4 - vmadd.d $vr22, $vr23, $vr5 - vmadd.d $vr22, $vr16, $vr6 - vmadd.d $vr21, $vr17, $vr6 - vmadd.d $vr20, $vr18, $vr6 - vmadd.d $vr28, $vr19, $vr6 - vmadd.d $vr28, $vr12, $vr7 - vmadd.d $vr20, $vr13, $vr7 - vmadd.d $vr21, $vr14, $vr7 - vmadd.d $vr22, $vr15, $vr7 - vadd.d $vr12, $vr22, $vr8 - vadd.d $vr13, $vr21, $vr8 - vsrai.d $vr13, $vr13, 13 - vsrai.d $vr12, $vr12, 13 - vmax.d $vr12, $vr12, $vr9 - vmax.d $vr13, $vr13, $vr9 - vmin.d $vr13, $vr13, $vr10 - vmin.d $vr12, $vr12, $vr10 - vpickev.w $vr12, $vr12, $vr13 - vadd.d $vr13, $vr20, $vr8 - vadd.d $vr14, $vr28, $vr8 - vsrai.d $vr14, $vr14, 13 - vsrai.d $vr13, $vr13, 13 - vmax.d $vr13, $vr13, $vr9 - vmax.d $vr14, $vr14, $vr9 - vmin.d $vr14, $vr14, $vr10 - vmin.d $vr13, $vr13, $vr10 - vpickev.w $vr13, $vr13, $vr14 - vpickev.h $vr12, $vr13, $vr12 - vstx $vr12, $a2, $a3 + vld $vr7, $a5, -8 + vbsrl.v $vr8, $vr8, 14 + vbsll.v $vr9, $vr7, 2 + vor.v $vr10, $vr9, $vr8 + vbsrl.v $vr8, $vr7, 6 + vsllwil.w.h $vr8, $vr8, 0 + vsllwil.d.w $vr8, $vr8, 0 + vbsrl.v $vr9, $vr7, 10 + vsllwil.w.h $vr9, $vr9, 0 + vsllwil.d.w $vr9, $vr9, 0 + vsllwil.w.h $vr10, $vr10, 0 + vsllwil.d.w $vr10, $vr10, 0 + vshuf4i.h $vr11, $vr7, 9 + vsllwil.w.h $vr11, $vr11, 0 + vsllwil.d.w $vr11, $vr11, 0 + vshuf4i.h $vr12, $vr7, 14 + vsllwil.w.h $vr12, $vr12, 0 + vsllwil.d.w $vr12, $vr12, 0 + vsllwil.w.h $vr13, $vr7, 0 + vsllwil.d.w $vr13, $vr13, 0 + vbsrl.v $vr14, $vr7, 12 + vsllwil.w.h $vr14, $vr14, 0 + vsllwil.d.w $vr14, $vr14, 0 + 
vld $vr19, $a5, -4 + vbsrl.v $vr15, $vr7, 8 + vsllwil.w.h $vr15, $vr15, 0 + vsllwil.d.w $vr15, $vr15, 0 + vbsrl.v $vr16, $vr19, 8 + vsllwil.w.h $vr16, $vr16, 0 + vsllwil.d.w $vr16, $vr16, 0 + vbsrl.v $vr17, $vr19, 12 + vsllwil.w.h $vr17, $vr17, 0 + vsllwil.d.w $vr17, $vr17, 0 + vsllwil.w.h $vr18, $vr19, 0 + vsllwil.d.w $vr18, $vr18, 0 + vld $vr20, $a5, -2 + vshuf4i.h $vr19, $vr19, 14 + vsllwil.w.h $vr19, $vr19, 0 + vsllwil.d.w $vr19, $vr19, 0 + vbsrl.v $vr21, $vr20, 8 + vsllwil.w.h $vr21, $vr21, 0 + vsllwil.d.w $vr21, $vr21, 0 + vbsrl.v $vr22, $vr20, 12 + vsllwil.w.h $vr22, $vr22, 0 + vsllwil.d.w $vr22, $vr22, 0 + vsllwil.w.h $vr23, $vr20, 0 + vsllwil.d.w $vr23, $vr23, 0 + vld $vr24, $a5, 2 + vshuf4i.h $vr20, $vr20, 14 + vsllwil.w.h $vr20, $vr20, 0 + vsllwil.d.w $vr20, $vr20, 0 + vshuf4i.h $vr25, $vr24, 14 + vsllwil.w.h $vr25, $vr25, 0 + vsllwil.d.w $vr25, $vr25, 0 + vadd.d $vr20, $vr25, $vr20 + vsllwil.w.h $vr25, $vr24, 0 + vsllwil.d.w $vr25, $vr25, 0 + vadd.d $vr23, $vr25, $vr23 + vbsrl.v $vr25, $vr24, 8 + vbsrl.v $vr24, $vr24, 12 + vsllwil.w.h $vr24, $vr24, 0 + vsllwil.d.w $vr24, $vr24, 0 + vadd.d $vr22, $vr24, $vr22 + vld $vr24, $a5, 4 + vsllwil.w.h $vr25, $vr25, 0 + vsllwil.d.w $vr25, $vr25, 0 + vadd.d $vr21, $vr25, $vr21 + vshuf4i.h $vr25, $vr24, 14 + vsllwil.w.h $vr25, $vr25, 0 + vsllwil.d.w $vr25, $vr25, 0 + vadd.d $vr19, $vr25, $vr19 + vsllwil.w.h $vr25, $vr24, 0 + vsllwil.d.w $vr25, $vr25, 0 + vadd.d $vr18, $vr25, $vr18 + vbsrl.v $vr25, $vr24, 8 + vbsrl.v $vr24, $vr24, 12 + vsllwil.w.h $vr24, $vr24, 0 + vsllwil.d.w $vr24, $vr24, 0 + vadd.d $vr17, $vr24, $vr17 + vld $vr24, $a5, 8 + vsllwil.w.h $vr25, $vr25, 0 + vsllwil.d.w $vr25, $vr25, 0 + vadd.d $vr16, $vr25, $vr16 + vbsrl.v $vr25, $vr24, 8 + vsllwil.w.h $vr25, $vr25, 0 + vsllwil.d.w $vr25, $vr25, 0 + vadd.d $vr15, $vr25, $vr15 + vbsrl.v $vr25, $vr24, 12 + vsllwil.w.h $vr25, $vr25, 0 + vsllwil.d.w $vr25, $vr25, 0 + vadd.d $vr14, $vr25, $vr14 + vshuf4i.h $vr25, $vr24, 14 + vsllwil.w.h $vr24, $vr24, 0 + vsllwil.d.w $vr24, $vr24, 0 + vadd.d $vr13, $vr24, $vr13 + vld $vr24, $a5, 10 + vsllwil.w.h $vr25, $vr25, 0 + vsllwil.d.w $vr25, $vr25, 0 + vadd.d $vr12, $vr25, $vr12 + vshuf4i.h $vr25, $vr24, 14 + vsllwil.w.h $vr25, $vr25, 0 + vsllwil.d.w $vr25, $vr25, 0 + vadd.d $vr11, $vr25, $vr11 + vsllwil.w.h $vr25, $vr24, 0 + vsllwil.d.w $vr25, $vr25, 0 + vadd.d $vr10, $vr25, $vr10 + vbsrl.v $vr25, $vr24, 12 + vsllwil.w.h $vr25, $vr25, 0 + vsllwil.d.w $vr25, $vr25, 0 + vadd.d $vr9, $vr25, $vr9 + vldx $vr25, $fp, $a2 + vbsrl.v $vr24, $vr24, 8 + vsllwil.w.h $vr24, $vr24, 0 + vsllwil.d.w $vr24, $vr24, 0 + vadd.d $vr8, $vr24, $vr8 + vbsrl.v $vr24, $vr25, 8 + vsllwil.w.h $vr24, $vr24, 0 + vsllwil.d.w $vr24, $vr24, 0 + vslli.d $vr24, $vr24, 13 + vmadd.d $vr24, $vr21, $vr0 + vmadd.d $vr24, $vr16, $vr1 + vbsrl.v $vr16, $vr25, 12 + vsllwil.w.h $vr16, $vr16, 0 + vsllwil.d.w $vr16, $vr16, 0 + vslli.d $vr16, $vr16, 13 + vmadd.d $vr16, $vr22, $vr0 + vmadd.d $vr16, $vr17, $vr1 + vsllwil.w.h $vr17, $vr25, 0 + vsllwil.d.w $vr17, $vr17, 0 + vslli.d $vr17, $vr17, 13 + vmadd.d $vr17, $vr23, $vr0 + vmadd.d $vr17, $vr18, $vr1 + vshuf4i.h $vr18, $vr25, 14 + vsllwil.w.h $vr18, $vr18, 0 + vsllwil.d.w $vr18, $vr18, 0 + vslli.d $vr18, $vr18, 13 + vmadd.d $vr18, $vr20, $vr0 + vmadd.d $vr18, $vr19, $vr1 + vmadd.d $vr18, $vr12, $vr2 + vmadd.d $vr17, $vr13, $vr2 + vmadd.d $vr16, $vr14, $vr2 + vmadd.d $vr24, $vr15, $vr2 + vmadd.d $vr24, $vr8, $vr3 + vmadd.d $vr16, $vr9, $vr3 + vmadd.d $vr17, $vr10, $vr3 + vmadd.d $vr18, $vr11, $vr3 + vadd.d $vr8, $vr18, $vr4 + vadd.d $vr9, 
$vr17, $vr4 + vsrai.d $vr9, $vr9, 13 + vsrai.d $vr8, $vr8, 13 + vmax.d $vr8, $vr8, $vr5 + vmax.d $vr9, $vr9, $vr5 + vmin.d $vr9, $vr9, $vr6 + vmin.d $vr8, $vr8, $vr6 + vpickev.w $vr8, $vr8, $vr9 + vadd.d $vr9, $vr16, $vr4 + vadd.d $vr10, $vr24, $vr4 + vsrai.d $vr10, $vr10, 13 + vsrai.d $vr9, $vr9, 13 + vmax.d $vr9, $vr9, $vr5 + vmax.d $vr10, $vr10, $vr5 + vmin.d $vr10, $vr10, $vr6 + vmin.d $vr9, $vr9, $vr6 + vpickev.w $vr9, $vr9, $vr10 + vpickev.h $vr8, $vr9, $vr8 + vstx $vr8, $a2, $a3 addi.d $a2, $a2, 16 - vori.b $vr14, $vr11, 0 + vori.b $vr8, $vr7, 0 bne $a2, $a4, .LBB0_1 # %bb.2: # %Weighting_filter.exit - ld.h $a2, $sp, 96 - ld.h $a3, $sp, 98 + ld.h $a2, $sp, 104 + ld.h $a3, $sp, 106 srai.d $a2, $a2, 2 mul.d $a2, $a2, $a2 - ld.h $a4, $sp, 104 + ld.h $a4, $sp, 112 srai.d $a3, $a3, 2 mul.d $a3, $a3, $a3 - ld.h $a5, $sp, 110 + ld.h $a5, $sp, 118 srai.d $a4, $a4, 2 mul.d $a4, $a4, $a4 add.d $a3, $a4, $a3 srai.d $a4, $a5, 2 - ld.h $a5, $sp, 116 + ld.h $a5, $sp, 124 mul.d $a4, $a4, $a4 add.d $a3, $a3, $a4 - ld.h $a4, $sp, 122 + ld.h $a4, $sp, 130 srai.d $a5, $a5, 2 mul.d $a5, $a5, $a5 add.d $a3, $a3, $a5 srai.d $a4, $a4, 2 - ld.h $a5, $sp, 128 + ld.h $a5, $sp, 136 mul.d $a4, $a4, $a4 add.d $a3, $a3, $a4 - ld.h $a4, $sp, 134 + ld.h $a4, $sp, 142 srai.d $a5, $a5, 2 mul.d $a5, $a5, $a5 add.d $a3, $a3, $a5 srai.d $a4, $a4, 2 - ld.h $a5, $sp, 140 + ld.h $a5, $sp, 148 mul.d $a4, $a4, $a4 add.d $a3, $a3, $a4 - ld.h $a4, $sp, 146 + ld.h $a4, $sp, 154 srai.d $a5, $a5, 2 mul.d $a5, $a5, $a5 add.d $a3, $a3, $a5 srai.d $a4, $a4, 2 - ld.h $a5, $sp, 152 + ld.h $a5, $sp, 160 mul.d $a4, $a4, $a4 add.d $a3, $a3, $a4 - ld.h $a4, $sp, 158 + ld.h $a4, $sp, 166 srai.d $a5, $a5, 2 mul.d $a5, $a5, $a5 add.d $a3, $a3, $a5 srai.d $a4, $a4, 2 - ld.h $a5, $sp, 164 + ld.h $a5, $sp, 172 mul.d $a4, $a4, $a4 add.d $a3, $a3, $a4 - ld.h $a4, $sp, 170 + ld.h $a4, $sp, 178 srai.d $a5, $a5, 2 mul.d $a5, $a5, $a5 add.d $a3, $a3, $a5 srai.d $a4, $a4, 2 - ld.w $a5, $sp, 100 mul.d $a4, $a4, $a4 + ld.w $a5, $sp, 108 add.d $a3, $a3, $a4 slli.d $a3, $a3, 1 + ld.w $a4, $sp, 114 vinsgr2vr.w $vr0, $a5, 0 vsrai.h $vr0, $vr0, 2 - ld.w $a4, $sp, 106 - vilvl.h $vr0, $vr0, $vr0 - vslli.w $vr0, $vr0, 16 - vsrai.w $vr0, $vr0, 16 + vsllwil.w.h $vr0, $vr0, 0 vinsgr2vr.w $vr1, $a4, 0 vsrai.h $vr1, $vr1, 2 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - ld.w $a4, $sp, 112 - vsrai.w $vr1, $vr1, 16 + ld.w $a4, $sp, 120 + vsllwil.w.h $vr1, $vr1, 0 vmul.w $vr1, $vr1, $vr1 vmadd.w $vr1, $vr0, $vr0 vinsgr2vr.w $vr0, $a4, 0 + ld.w $a4, $sp, 126 vsrai.h $vr0, $vr0, 2 - vilvl.h $vr0, $vr0, $vr0 - ld.w $a4, $sp, 118 - vslli.w $vr0, $vr0, 16 - vsrai.w $vr0, $vr0, 16 + vsllwil.w.h $vr0, $vr0, 0 vmadd.w $vr1, $vr0, $vr0 vinsgr2vr.w $vr0, $a4, 0 vsrai.h $vr0, $vr0, 2 - vilvl.h $vr0, $vr0, $vr0 - vilvl.w $vr0, $vr0, $vr0 - ld.w $a4, $sp, 124 - vslli.d $vr0, $vr0, 48 - ld.w $a5, $sp, 130 - vsrai.d $vr2, $vr0, 48 + ld.w $a4, $sp, 132 + vsllwil.w.h $vr0, $vr0, 0 + ld.w $a5, $sp, 138 + vsllwil.d.w $vr2, $vr0, 0 vinsgr2vr.w $vr0, $a4, 0 vsrai.h $vr3, $vr0, 2 vinsgr2vr.w $vr0, $a5, 0 vsrai.h $vr4, $vr0, 2 - ld.h $a4, $sp, 138 - vrepli.b $vr0, 0 - vilvl.w $vr0, $vr0, $vr1 + ld.h $a4, $sp, 146 + vsllwil.du.wu $vr0, $vr1, 0 vmadd.d $vr0, $vr2, $vr2 - vilvl.h $vr1, $vr3, $vr3 - vilvl.w $vr1, $vr1, $vr1 - vslli.d $vr1, $vr1, 48 - vsrai.d $vr1, $vr1, 48 - vmadd.d $vr0, $vr1, $vr1 - vilvl.h $vr1, $vr4, $vr4 - vilvl.w $vr1, $vr1, $vr1 - vld $vr2, $sp, 136 - vslli.d $vr1, $vr1, 48 - vsrai.d $vr1, $vr1, 48 + vsllwil.w.h $vr1, $vr3, 0 + vsllwil.d.w $vr1, $vr1, 0 vmadd.d 
$vr0, $vr1, $vr1 - vinsgr2vr.h $vr2, $a4, 1 - vsrai.h $vr1, $vr2, 2 - vilvl.h $vr1, $vr1, $vr1 - vilvl.w $vr1, $vr1, $vr1 - ld.w $a4, $sp, 142 - vslli.d $vr1, $vr1, 48 - vsrai.d $vr1, $vr1, 48 + vld $vr1, $sp, 144 + vsllwil.w.h $vr2, $vr4, 0 + vsllwil.d.w $vr2, $vr2, 0 + vmadd.d $vr0, $vr2, $vr2 + vinsgr2vr.h $vr1, $a4, 1 + vsrai.h $vr1, $vr1, 2 + ld.w $a4, $sp, 150 + vsllwil.w.h $vr1, $vr1, 0 + vsllwil.d.w $vr1, $vr1, 0 vmadd.d $vr0, $vr1, $vr1 vinsgr2vr.w $vr1, $a4, 0 vsrai.h $vr1, $vr1, 2 - vilvl.h $vr1, $vr1, $vr1 - vilvl.w $vr1, $vr1, $vr1 - ld.w $a4, $sp, 148 - vslli.d $vr1, $vr1, 48 - vsrai.d $vr1, $vr1, 48 + ld.w $a4, $sp, 156 + vsllwil.w.h $vr1, $vr1, 0 + vsllwil.d.w $vr1, $vr1, 0 vmadd.d $vr0, $vr1, $vr1 vinsgr2vr.w $vr1, $a4, 0 vsrai.h $vr1, $vr1, 2 - vilvl.h $vr1, $vr1, $vr1 - vilvl.w $vr1, $vr1, $vr1 - ld.w $a4, $sp, 154 - vslli.d $vr1, $vr1, 48 - vsrai.d $vr1, $vr1, 48 + ld.w $a4, $sp, 162 + vsllwil.w.h $vr1, $vr1, 0 + vsllwil.d.w $vr1, $vr1, 0 vmadd.d $vr0, $vr1, $vr1 vinsgr2vr.w $vr1, $a4, 0 vsrai.h $vr1, $vr1, 2 - vilvl.h $vr1, $vr1, $vr1 - vilvl.w $vr1, $vr1, $vr1 - ld.w $a4, $sp, 160 - vslli.d $vr1, $vr1, 48 - vsrai.d $vr1, $vr1, 48 + ld.w $a4, $sp, 168 + vsllwil.w.h $vr1, $vr1, 0 + vsllwil.d.w $vr1, $vr1, 0 vmadd.d $vr0, $vr1, $vr1 vinsgr2vr.w $vr1, $a4, 0 vsrai.h $vr1, $vr1, 2 - vilvl.h $vr1, $vr1, $vr1 - vilvl.w $vr1, $vr1, $vr1 - ld.w $a4, $sp, 166 - vslli.d $vr1, $vr1, 48 - vsrai.d $vr1, $vr1, 48 + ld.w $a4, $sp, 174 + vsllwil.w.h $vr1, $vr1, 0 + vsllwil.d.w $vr1, $vr1, 0 vmadd.d $vr0, $vr1, $vr1 vinsgr2vr.w $vr1, $a4, 0 vsrai.h $vr1, $vr1, 2 - vilvl.h $vr1, $vr1, $vr1 - vilvl.w $vr1, $vr1, $vr1 - vslli.d $vr1, $vr1, 48 - vsrai.d $vr1, $vr1, 48 + vsllwil.w.h $vr1, $vr1, 0 + vsllwil.d.w $vr1, $vr1, 0 vmadd.d $vr0, $vr1, $vr1 vpickve2gr.d $a4, $vr0, 1 add.d $a2, $a4, $a2 slli.d $a2, $a2, 1 slt $a4, $a2, $a3 - ld.w $a5, $sp, 172 + ld.w $a5, $sp, 180 masknez $a2, $a2, $a4 maskeqz $a3, $a3, $a4 or $a2, $a3, $a2 vinsgr2vr.w $vr1, $a5, 0 vsrai.h $vr1, $vr1, 2 - vilvl.h $vr1, $vr1, $vr1 - vilvl.w $vr1, $vr1, $vr1 - vslli.d $vr1, $vr1, 48 - vsrai.d $vr1, $vr1, 48 + vsllwil.w.h $vr1, $vr1, 0 + vsllwil.d.w $vr1, $vr1, 0 vmadd.d $vr0, $vr1, $vr1 vslli.d $vr0, $vr0, 1 vpickve2gr.d $a3, $vr0, 0 @@ -479,7 +375,7 @@ Gsm_RPE_Encoding: # @Gsm_RPE_Encoding ori $a4, $zero, 3 maskeqz $a3, $a4, $a3 or $a3, $a3, $a2 - addi.d $a4, $sp, 96 + addi.d $a4, $sp, 104 alsl.d $a2, $a3, $a4, 1 slli.d $a5, $a3, 1 ldx.hu $a6, $a5, $a4 @@ -496,7 +392,7 @@ Gsm_RPE_Encoding: # @Gsm_RPE_Encoding ext.w.h $s0, $a7 ext.w.h $s2, $a5 ext.w.h $s8, $a4 - st.d $t3, $sp, 56 # 8-byte Folded Spill + st.d $t3, $sp, 64 # 8-byte Folded Spill st.h $a3, $t3, 0 lu12i.w $a3, 8 xor $t3, $a6, $a3 @@ -505,7 +401,7 @@ Gsm_RPE_Encoding: # @Gsm_RPE_Encoding masknez $t4, $t4, $t3 maskeqz $t3, $a1, $t3 or $t3, $t3, $t4 - st.d $t7, $sp, 16 # 8-byte Folded Spill + st.d $t7, $sp, 24 # 8-byte Folded Spill slti $t4, $t7, 0 masknez $a6, $a6, $t4 maskeqz $t3, $t3, $t4 @@ -516,7 +412,7 @@ Gsm_RPE_Encoding: # @Gsm_RPE_Encoding masknez $t4, $t4, $a6 maskeqz $a6, $a1, $a6 or $a6, $a6, $t4 - st.d $t6, $sp, 24 # 8-byte Folded Spill + st.d $t6, $sp, 32 # 8-byte Folded Spill slti $t4, $t6, 0 masknez $t2, $t2, $t4 maskeqz $t4, $a6, $t4 @@ -534,7 +430,7 @@ Gsm_RPE_Encoding: # @Gsm_RPE_Encoding masknez $t4, $t4, $t3 maskeqz $t3, $a1, $t3 or $t3, $t3, $t4 - st.d $t5, $sp, 32 # 8-byte Folded Spill + st.d $t5, $sp, 40 # 8-byte Folded Spill slti $t4, $t5, 0 masknez $t1, $t1, $t4 maskeqz $t3, $t3, $t4 @@ -551,7 +447,7 @@ Gsm_RPE_Encoding: # 
@Gsm_RPE_Encoding masknez $t3, $t3, $t2 maskeqz $t2, $a1, $t2 or $t2, $t2, $t3 - st.d $t8, $sp, 8 # 8-byte Folded Spill + st.d $t8, $sp, 16 # 8-byte Folded Spill slti $t3, $t8, 0 masknez $t4, $t0, $t3 maskeqz $t2, $t2, $t3 @@ -571,7 +467,7 @@ Gsm_RPE_Encoding: # @Gsm_RPE_Encoding slti $t3, $s0, 0 masknez $a7, $a7, $t3 maskeqz $t2, $t2, $t3 - ext.w.h $s1, $t0 + ext.w.h $s4, $t0 or $a7, $t2, $a7 ext.w.h $a7, $a7 slt $t2, $a7, $t1 @@ -603,7 +499,7 @@ Gsm_RPE_Encoding: # @Gsm_RPE_Encoding slti $t2, $s8, 0 masknez $a4, $a4, $t2 maskeqz $t1, $t1, $t2 - ext.w.h $s4, $a5 + ext.w.h $s5, $a5 or $a4, $t1, $a4 ext.w.h $a4, $a4 slt $t1, $a4, $a7 @@ -632,10 +528,10 @@ Gsm_RPE_Encoding: # @Gsm_RPE_Encoding masknez $a7, $a7, $a6 maskeqz $a6, $a1, $a6 or $a6, $a6, $a7 - slti $a7, $s1, 0 + slti $a7, $s4, 0 masknez $t0, $t0, $a7 maskeqz $a6, $a6, $a7 - ext.w.h $s5, $t1 + ext.w.h $s1, $t1 or $a6, $a6, $t0 ext.w.h $a6, $a6 slt $a7, $a6, $a4 @@ -648,7 +544,7 @@ Gsm_RPE_Encoding: # @Gsm_RPE_Encoding masknez $a7, $a7, $a6 maskeqz $a6, $a1, $a6 or $a6, $a6, $a7 - slti $a7, $s4, 0 + slti $a7, $s5, 0 masknez $a5, $a5, $a7 maskeqz $a6, $a6, $a7 ld.hu $a7, $a2, 66 @@ -664,7 +560,7 @@ Gsm_RPE_Encoding: # @Gsm_RPE_Encoding masknez $a6, $a6, $a5 maskeqz $a5, $a1, $a5 or $a5, $a5, $a6 - slti $a6, $s5, 0 + slti $a6, $s1, 0 masknez $t0, $t1, $a6 maskeqz $a5, $a5, $a6 ext.w.h $s6, $a7 @@ -776,13 +672,13 @@ Gsm_RPE_Encoding: # @Gsm_RPE_Encoding vinsgr2vr.h $vr0, $s2, 2 vinsgr2vr.h $vr0, $s8, 4 vinsgr2vr.h $vr0, $s3, 6 - ld.d $a6, $sp, 16 # 8-byte Folded Reload - vinsgr2vr.h $vr1, $a6, 0 ld.d $a6, $sp, 24 # 8-byte Folded Reload - vinsgr2vr.h $vr1, $a6, 2 + vinsgr2vr.h $vr1, $a6, 0 ld.d $a6, $sp, 32 # 8-byte Folded Reload + vinsgr2vr.h $vr1, $a6, 2 + ld.d $a6, $sp, 40 # 8-byte Folded Reload vinsgr2vr.h $vr1, $a6, 4 - ld.d $a6, $sp, 8 # 8-byte Folded Reload + ld.d $a6, $sp, 16 # 8-byte Folded Reload vinsgr2vr.h $vr1, $a6, 6 vreplgr2vr.w $vr2, $a5 vsll.w $vr1, $vr1, $vr2 @@ -800,23 +696,23 @@ Gsm_RPE_Encoding: # @Gsm_RPE_Encoding vsrai.w $vr0, $vr0, 28 vpickev.h $vr0, $vr0, $vr1 vaddi.hu $vr0, $vr0, 4 - ld.d $a6, $sp, 48 # 8-byte Folded Reload + ld.d $a6, $sp, 56 # 8-byte Folded Reload vst $vr0, $a6, 0 - sll.w $a5, $s1, $a3 + sll.w $a5, $s4, $a3 ext.w.h $a5, $a5 mul.d $a5, $a5, $a4 slli.w $a5, $a5, 1 srli.d $a5, $a5, 28 addi.d $a5, $a5, 4 st.h $a5, $a6, 16 - sll.w $a5, $s4, $a3 + sll.w $a5, $s5, $a3 ext.w.h $a5, $a5 mul.d $a5, $a5, $a4 slli.w $a5, $a5, 1 srli.d $a5, $a5, 28 addi.d $a5, $a5, 4 st.h $a5, $a6, 18 - sll.w $a5, $s5, $a3 + sll.w $a5, $s1, $a3 ext.w.h $a5, $a5 mul.d $a5, $a5, $a4 slli.w $a5, $a5, 1 @@ -837,16 +733,16 @@ Gsm_RPE_Encoding: # @Gsm_RPE_Encoding srli.d $a3, $a3, 28 addi.d $a3, $a3, 4 st.h $a3, $a6, 24 - ld.d $a3, $sp, 40 # 8-byte Folded Reload + ld.d $a3, $sp, 48 # 8-byte Folded Reload st.h $a0, $a3, 0 ext.w.h $a1, $a1 ext.w.h $a2, $a2 - addi.d $a3, $sp, 70 - addi.d $s0, $sp, 70 + addi.d $a3, $sp, 78 + addi.d $s0, $sp, 78 move $a0, $a6 pcaddu18i $ra, %call36(APCM_inverse_quantization) jirl $ra, $ra, 0 - ld.d $a0, $sp, 56 # 8-byte Folded Reload + ld.d $a0, $sp, 64 # 8-byte Folded Reload ld.hu $a0, $a0, 0 ori $a1, $zero, 3 bltu $a1, $a0, .LBB0_15 @@ -863,7 +759,7 @@ Gsm_RPE_Encoding: # @Gsm_RPE_Encoding addi.d $fp, $fp, 2 .LBB0_11: ori $a1, $zero, 13 - addi.d $s0, $sp, 70 + addi.d $s0, $sp, 78 .LBB0_12: st.h $zero, $fp, 0 addi.d $fp, $fp, 2 @@ -892,47 +788,37 @@ Gsm_RPE_Encoding: # @Gsm_RPE_Encoding addi.d $a2, $a0, 2 move $a0, $fp move $a1, $zero - fld.d $fs6, $sp, 176 # 8-byte Folded Reload - fld.d $fs5, 
$sp, 184 # 8-byte Folded Reload - fld.d $fs4, $sp, 192 # 8-byte Folded Reload - fld.d $fs3, $sp, 200 # 8-byte Folded Reload - fld.d $fs2, $sp, 208 # 8-byte Folded Reload - fld.d $fs1, $sp, 216 # 8-byte Folded Reload - fld.d $fs0, $sp, 224 # 8-byte Folded Reload - ld.d $s8, $sp, 232 # 8-byte Folded Reload - ld.d $s7, $sp, 240 # 8-byte Folded Reload - ld.d $s6, $sp, 248 # 8-byte Folded Reload - ld.d $s5, $sp, 256 # 8-byte Folded Reload - ld.d $s4, $sp, 264 # 8-byte Folded Reload - ld.d $s3, $sp, 272 # 8-byte Folded Reload - ld.d $s2, $sp, 280 # 8-byte Folded Reload - ld.d $s1, $sp, 288 # 8-byte Folded Reload - ld.d $s0, $sp, 296 # 8-byte Folded Reload - ld.d $fp, $sp, 304 # 8-byte Folded Reload - ld.d $ra, $sp, 312 # 8-byte Folded Reload - addi.d $sp, $sp, 320 + fld.d $fs1, $sp, 184 # 8-byte Folded Reload + fld.d $fs0, $sp, 192 # 8-byte Folded Reload + ld.d $s8, $sp, 200 # 8-byte Folded Reload + ld.d $s7, $sp, 208 # 8-byte Folded Reload + ld.d $s6, $sp, 216 # 8-byte Folded Reload + ld.d $s5, $sp, 224 # 8-byte Folded Reload + ld.d $s4, $sp, 232 # 8-byte Folded Reload + ld.d $s3, $sp, 240 # 8-byte Folded Reload + ld.d $s2, $sp, 248 # 8-byte Folded Reload + ld.d $s1, $sp, 256 # 8-byte Folded Reload + ld.d $s0, $sp, 264 # 8-byte Folded Reload + ld.d $fp, $sp, 272 # 8-byte Folded Reload + ld.d $ra, $sp, 280 # 8-byte Folded Reload + addi.d $sp, $sp, 288 pcaddu18i $t8, %call36(memset) jr $t8 .LBB0_17: # %RPE_grid_positioning.exit - fld.d $fs6, $sp, 176 # 8-byte Folded Reload - fld.d $fs5, $sp, 184 # 8-byte Folded Reload - fld.d $fs4, $sp, 192 # 8-byte Folded Reload - fld.d $fs3, $sp, 200 # 8-byte Folded Reload - fld.d $fs2, $sp, 208 # 8-byte Folded Reload - fld.d $fs1, $sp, 216 # 8-byte Folded Reload - fld.d $fs0, $sp, 224 # 8-byte Folded Reload - ld.d $s8, $sp, 232 # 8-byte Folded Reload - ld.d $s7, $sp, 240 # 8-byte Folded Reload - ld.d $s6, $sp, 248 # 8-byte Folded Reload - ld.d $s5, $sp, 256 # 8-byte Folded Reload - ld.d $s4, $sp, 264 # 8-byte Folded Reload - ld.d $s3, $sp, 272 # 8-byte Folded Reload - ld.d $s2, $sp, 280 # 8-byte Folded Reload - ld.d $s1, $sp, 288 # 8-byte Folded Reload - ld.d $s0, $sp, 296 # 8-byte Folded Reload - ld.d $fp, $sp, 304 # 8-byte Folded Reload - ld.d $ra, $sp, 312 # 8-byte Folded Reload - addi.d $sp, $sp, 320 + fld.d $fs1, $sp, 184 # 8-byte Folded Reload + fld.d $fs0, $sp, 192 # 8-byte Folded Reload + ld.d $s8, $sp, 200 # 8-byte Folded Reload + ld.d $s7, $sp, 208 # 8-byte Folded Reload + ld.d $s6, $sp, 216 # 8-byte Folded Reload + ld.d $s5, $sp, 224 # 8-byte Folded Reload + ld.d $s4, $sp, 232 # 8-byte Folded Reload + ld.d $s3, $sp, 240 # 8-byte Folded Reload + ld.d $s2, $sp, 248 # 8-byte Folded Reload + ld.d $s1, $sp, 256 # 8-byte Folded Reload + ld.d $s0, $sp, 264 # 8-byte Folded Reload + ld.d $fp, $sp, 272 # 8-byte Folded Reload + ld.d $ra, $sp, 280 # 8-byte Folded Reload + addi.d $sp, $sp, 288 ret .Lfunc_end0: .size Gsm_RPE_Encoding, .Lfunc_end0-Gsm_RPE_Encoding diff --git a/results/MultiSource/Benchmarks/PAQ8p/CMakeFiles/paq8p.dir/paq8p.s b/results/MultiSource/Benchmarks/PAQ8p/CMakeFiles/paq8p.dir/paq8p.s index 5bf9fd7b..9279264a 100644 --- a/results/MultiSource/Benchmarks/PAQ8p/CMakeFiles/paq8p.dir/paq8p.s +++ b/results/MultiSource/Benchmarks/PAQ8p/CMakeFiles/paq8p.dir/paq8p.s @@ -740,29 +740,23 @@ _Z5trainPsS_ii: # @_Z5trainPsS_ii .LBB13_9: # %vector.body # =>This Inner Loop Header: Depth=1 vld $vr3, $a7, 0 - vilvh.h $vr4, $vr3, $vr3 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vld $vr5, $t0, 0 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - 
vsrai.w $vr3, $vr3, 16 - vilvl.h $vr6, $vr5, $vr5 - vslli.w $vr6, $vr6, 16 - vsrai.w $vr6, $vr6, 16 - vilvh.h $vr5, $vr5, $vr5 - vslli.w $vr5, $vr5, 16 - vsrai.w $vr5, $vr5, 16 - vmul.w $vr5, $vr0, $vr5 + vld $vr4, $t0, 0 + vbsrl.v $vr5, $vr3, 8 + vsllwil.w.h $vr5, $vr5, 0 + vsllwil.w.h $vr3, $vr3, 0 + vsllwil.w.h $vr6, $vr4, 0 + vbsrl.v $vr4, $vr4, 8 + vsllwil.w.h $vr4, $vr4, 0 + vmul.w $vr4, $vr0, $vr4 vmul.w $vr6, $vr0, $vr6 vsrai.w $vr6, $vr6, 15 - vsrai.w $vr5, $vr5, 15 - vaddi.wu $vr5, $vr5, 1 - vsrai.w $vr5, $vr5, 1 + vsrai.w $vr4, $vr4, 15 + vaddi.wu $vr4, $vr4, 1 + vsrai.w $vr4, $vr4, 1 vaddi.wu $vr6, $vr6, 1 vsrai.w $vr6, $vr6, 1 vadd.w $vr3, $vr6, $vr3 - vadd.w $vr4, $vr5, $vr4 + vadd.w $vr4, $vr4, $vr5 vmax.w $vr4, $vr4, $vr1 vmax.w $vr3, $vr3, $vr1 vmin.w $vr3, $vr3, $vr2 @@ -1763,12 +1757,9 @@ _ZN3APMC2Ei: # @_ZN3APMC2Ei vadd.w $vr11, $vr10, $vr5 vand.v $vr10, $vr10, $vr6 vsrai.w $vr11, $vr11, 7 - vshuf4i.w $vr12, $vr11, 50 - vslli.d $vr12, $vr12, 32 - vsrai.d $vr12, $vr12, 32 - vshuf4i.w $vr11, $vr11, 16 - vslli.d $vr11, $vr11, 32 - vsrai.d $vr11, $vr11, 32 + vshuf4i.w $vr12, $vr11, 14 + vsllwil.d.w $vr12, $vr12, 0 + vsllwil.d.w $vr11, $vr11, 0 vpickve2gr.d $a3, $vr11, 0 alsl.d $a3, $a3, $a2, 2 vpickve2gr.d $a4, $vr11, 1 @@ -16294,29 +16285,23 @@ _ZN5Mixer6updateEv: # @_ZN5Mixer6updateEv # Parent Loop BB57_4 Depth=1 # => This Inner Loop Header: Depth=2 vld $vr2, $s2, 0 - vilvh.h $vr3, $vr2, $vr2 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vld $vr4, $s3, 0 - vilvl.h $vr2, $vr2, $vr2 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr5, $vr4, $vr4 - vslli.w $vr5, $vr5, 16 - vsrai.w $vr5, $vr5, 16 - vilvh.h $vr4, $vr4, $vr4 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vmul.w $vr4, $vr1, $vr4 + vld $vr3, $s3, 0 + vbsrl.v $vr4, $vr2, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr5, $vr3, 0 + vbsrl.v $vr3, $vr3, 8 + vsllwil.w.h $vr3, $vr3, 0 + vmul.w $vr3, $vr1, $vr3 vmul.w $vr5, $vr1, $vr5 vsrai.w $vr5, $vr5, 15 - vsrai.w $vr4, $vr4, 15 - vaddi.wu $vr4, $vr4, 1 - vsrai.w $vr4, $vr4, 1 + vsrai.w $vr3, $vr3, 15 + vaddi.wu $vr3, $vr3, 1 + vsrai.w $vr3, $vr3, 1 vaddi.wu $vr5, $vr5, 1 vsrai.w $vr5, $vr5, 1 vadd.w $vr2, $vr5, $vr2 - vadd.w $vr3, $vr4, $vr3 + vadd.w $vr3, $vr3, $vr4 vmax.w $vr3, $vr3, $vr0 vmax.w $vr2, $vr2, $vr0 vreplgr2vr.w $vr4, $fp @@ -16726,29 +16711,23 @@ _ZN5Mixer1pEv: # @_ZN5Mixer1pEv # Parent Loop BB59_13 Depth=2 # => This Inner Loop Header: Depth=3 vld $vr1, $s1, 0 - vilvh.h $vr2, $vr1, $vr1 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vld $vr3, $a6, 0 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 - vilvl.h $vr4, $vr3, $vr3 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvh.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vmul.w $vr3, $vr0, $vr3 + vld $vr2, $a6, 0 + vbsrl.v $vr3, $vr1, 8 + vsllwil.w.h $vr3, $vr3, 0 + vsllwil.w.h $vr1, $vr1, 0 + vsllwil.w.h $vr4, $vr2, 0 + vbsrl.v $vr2, $vr2, 8 + vsllwil.w.h $vr2, $vr2, 0 + vmul.w $vr2, $vr0, $vr2 vmul.w $vr4, $vr0, $vr4 vsrai.w $vr4, $vr4, 15 - vsrai.w $vr3, $vr3, 15 - vaddi.wu $vr3, $vr3, 1 - vsrai.w $vr3, $vr3, 1 + vsrai.w $vr2, $vr2, 15 + vaddi.wu $vr2, $vr2, 1 + vsrai.w $vr2, $vr2, 1 vaddi.wu $vr4, $vr4, 1 vsrai.w $vr4, $vr4, 1 vadd.w $vr1, $vr4, $vr1 - vadd.w $vr2, $vr3, $vr2 + vadd.w $vr2, $vr2, $vr3 vmax.w $vr2, $vr2, $vr5 vmax.w $vr1, $vr1, $vr5 vreplgr2vr.w $vr3, $s5 @@ -21734,29 +21713,23 @@ _Z13contextModel2v: # @_Z13contextModel2v # Parent Loop BB69_10 Depth=1 # => This Inner 
Loop Header: Depth=2 vld $vr2, $s2, 0 - vilvh.h $vr3, $vr2, $vr2 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vld $vr4, $s3, 0 - vilvl.h $vr2, $vr2, $vr2 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr5, $vr4, $vr4 - vslli.w $vr5, $vr5, 16 - vsrai.w $vr5, $vr5, 16 - vilvh.h $vr4, $vr4, $vr4 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vmul.w $vr4, $vr1, $vr4 + vld $vr3, $s3, 0 + vbsrl.v $vr4, $vr2, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr5, $vr3, 0 + vbsrl.v $vr3, $vr3, 8 + vsllwil.w.h $vr3, $vr3, 0 + vmul.w $vr3, $vr1, $vr3 vmul.w $vr5, $vr1, $vr5 vsrai.w $vr5, $vr5, 15 - vsrai.w $vr4, $vr4, 15 - vaddi.wu $vr4, $vr4, 1 - vsrai.w $vr4, $vr4, 1 + vsrai.w $vr3, $vr3, 15 + vaddi.wu $vr3, $vr3, 1 + vsrai.w $vr3, $vr3, 1 vaddi.wu $vr5, $vr5, 1 vsrai.w $vr5, $vr5, 1 vadd.w $vr2, $vr5, $vr2 - vadd.w $vr3, $vr4, $vr3 + vadd.w $vr3, $vr3, $vr4 vmax.w $vr3, $vr3, $vr0 vmax.w $vr2, $vr2, $vr0 vreplgr2vr.w $vr4, $t8 @@ -24041,7 +24014,6 @@ _ZN7EncoderC2E4ModeP8_IO_FILE: # @_ZN7EncoderC2E4ModeP8_IO_FILE vreplgr2vr.d $vr0, $a0 lu12i.w $a0, -1 lu12i.w $a1, 4 - vrepli.b $vr1, 0 pcalau12i $a2, %pc_hi20(_ZL2dt) addi.d $a2, $a2, %pc_lo12(_ZL2dt) lu12i.w $a3, 1 @@ -24049,25 +24021,9 @@ _ZN7EncoderC2E4ModeP8_IO_FILE: # @_ZN7EncoderC2E4ModeP8_IO_FILE .p2align 4, , 16 .LBB75_4: # %vector.body # =>This Inner Loop Header: Depth=1 - vslli.h $vr2, $vr0, 1 - vaddi.hu $vr3, $vr2, 3 - vaddi.hu $vr2, $vr2, 11 - vpickve2gr.h $a5, $vr3, 1 - bstrpick.d $a5, $a5, 15, 0 - div.du $a5, $a1, $a5 - vpickve2gr.h $a6, $vr3, 0 - bstrpick.d $a6, $a6, 15, 0 - div.du $a6, $a1, $a6 - vinsgr2vr.h $vr4, $a6, 0 - vinsgr2vr.h $vr4, $a5, 1 - vpickve2gr.h $a5, $vr3, 2 - bstrpick.d $a5, $a5, 15, 0 - div.du $a5, $a1, $a5 - vinsgr2vr.h $vr4, $a5, 2 - vpickve2gr.h $a5, $vr3, 3 - bstrpick.d $a5, $a5, 15, 0 - div.du $a5, $a1, $a5 - vinsgr2vr.h $vr4, $a5, 3 + vslli.h $vr1, $vr0, 1 + vaddi.hu $vr2, $vr1, 3 + vaddi.hu $vr1, $vr1, 11 vpickve2gr.h $a5, $vr2, 1 bstrpick.d $a5, $a5, 15, 0 div.du $a5, $a1, $a5 @@ -24084,11 +24040,27 @@ _ZN7EncoderC2E4ModeP8_IO_FILE: # @_ZN7EncoderC2E4ModeP8_IO_FILE bstrpick.d $a5, $a5, 15, 0 div.du $a5, $a1, $a5 vinsgr2vr.h $vr3, $a5, 3 - vilvl.h $vr2, $vr1, $vr4 - vilvl.h $vr3, $vr1, $vr3 + vpickve2gr.h $a5, $vr1, 1 + bstrpick.d $a5, $a5, 15, 0 + div.du $a5, $a1, $a5 + vpickve2gr.h $a6, $vr1, 0 + bstrpick.d $a6, $a6, 15, 0 + div.du $a6, $a1, $a6 + vinsgr2vr.h $vr2, $a6, 0 + vinsgr2vr.h $vr2, $a5, 1 + vpickve2gr.h $a5, $vr1, 2 + bstrpick.d $a5, $a5, 15, 0 + div.du $a5, $a1, $a5 + vinsgr2vr.h $vr2, $a5, 2 + vpickve2gr.h $a5, $vr1, 3 + bstrpick.d $a5, $a5, 15, 0 + div.du $a5, $a1, $a5 + vinsgr2vr.h $vr2, $a5, 3 + vsllwil.wu.hu $vr1, $vr3, 0 + vsllwil.wu.hu $vr2, $vr2, 0 add.d $a5, $a2, $a0 - vstx $vr2, $a5, $a3 - vstx $vr3, $a5, $a4 + vstx $vr1, $a5, $a3 + vstx $vr2, $a5, $a4 addi.d $a0, $a0, 32 vaddi.hu $vr0, $vr0, 8 bnez $a0, .LBB75_4 @@ -29263,19 +29235,19 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc .cfi_personality 155, DW.ref.__gxx_personality_v0 .cfi_lsda 27, .Lexception21 # %bb.0: - addi.d $sp, $sp, -336 - .cfi_def_cfa_offset 336 - st.d $ra, $sp, 328 # 8-byte Folded Spill - st.d $fp, $sp, 320 # 8-byte Folded Spill - st.d $s0, $sp, 312 # 8-byte Folded Spill - st.d $s1, $sp, 304 # 8-byte Folded Spill - st.d $s2, $sp, 296 # 8-byte Folded Spill - st.d $s3, $sp, 288 # 8-byte Folded Spill - st.d $s4, $sp, 280 # 8-byte Folded Spill - st.d $s5, $sp, 272 # 8-byte Folded Spill - st.d $s6, $sp, 264 # 8-byte Folded Spill - st.d $s7, $sp, 256 # 8-byte Folded Spill - st.d 
$s8, $sp, 248 # 8-byte Folded Spill + addi.d $sp, $sp, -320 + .cfi_def_cfa_offset 320 + st.d $ra, $sp, 312 # 8-byte Folded Spill + st.d $fp, $sp, 304 # 8-byte Folded Spill + st.d $s0, $sp, 296 # 8-byte Folded Spill + st.d $s1, $sp, 288 # 8-byte Folded Spill + st.d $s2, $sp, 280 # 8-byte Folded Spill + st.d $s3, $sp, 272 # 8-byte Folded Spill + st.d $s4, $sp, 264 # 8-byte Folded Spill + st.d $s5, $sp, 256 # 8-byte Folded Spill + st.d $s6, $sp, 248 # 8-byte Folded Spill + st.d $s7, $sp, 240 # 8-byte Folded Spill + st.d $s8, $sp, 232 # 8-byte Folded Spill .cfi_offset 1, -8 .cfi_offset 22, -16 .cfi_offset 23, -24 @@ -29303,7 +29275,7 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc ld.bu $a1, $a0, 0 ori $a2, $zero, 45 pcalau12i $a4, %pc_hi20(level) - st.d $a4, $sp, 72 # 8-byte Folded Spill + st.d $a4, $sp, 56 # 8-byte Folded Spill bne $a1, $a2, .LBB104_4 # %bb.2: ld.bu $a1, $a0, 1 @@ -29312,10 +29284,10 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc ld.bu $a0, $a0, 2 beqz $a0, .LBB104_31 .LBB104_4: - st.d $a3, $sp, 56 # 8-byte Folded Spill + st.d $a3, $sp, 40 # 8-byte Folded Spill move $fp, $zero .LBB104_5: # %.thread - st.d $s0, $sp, 88 # 8-byte Folded Spill + st.d $s0, $sp, 72 # 8-byte Folded Spill pcalau12i $a0, %pc_hi20(programChecker) addi.d $s0, $a0, %pc_lo12(programChecker) ld.w $s1, $s0, 0 @@ -29344,7 +29316,7 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc ori $a1, $zero, 1 pcaddu18i $ra, %call36(calloc) jirl $ra, $ra, 0 - st.d $a0, $sp, 96 # 8-byte Folded Spill + st.d $a0, $sp, 80 # 8-byte Folded Spill beqz $a0, .LBB104_330 # %bb.11: # %_ZN5ArrayIlLi0EEC2Ei.exit ld.d $s5, $s2, 8 @@ -29358,7 +29330,7 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc maskeqz $a1, $s5, $a1 or $a1, $a1, $a0 .Ltmp344: # EH_LABEL - addi.d $a0, $sp, 224 + addi.d $a0, $sp, 208 pcaddu18i $ra, %call36(_ZN6StringC2EPKc) jirl $ra, $ra, 0 .Ltmp345: # EH_LABEL @@ -29427,7 +29399,7 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc .LBB104_20: move $fp, $zero ori $a0, $zero, 1 - st.d $a0, $sp, 64 # 8-byte Folded Spill + st.d $a0, $sp, 48 # 8-byte Folded Spill ori $s1, $zero, 1 b .LBB104_73 .LBB104_21: # %.critedge.i @@ -29436,9 +29408,9 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc or $a0, $fp, $a0 bnez $a0, .LBB104_20 .LBB104_22: # %_ZN5ArrayIcLi0EE8pop_backEv.exit.i - ld.w $a0, $sp, 224 - ld.w $a1, $sp, 228 - ld.d $s5, $sp, 240 + ld.w $a0, $sp, 208 + ld.w $a1, $sp, 212 + ld.d $s5, $sp, 224 slt $a2, $zero, $a0 sub.w $s4, $a0, $a2 bne $s4, $a1, .LBB104_36 @@ -29449,11 +29421,11 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc masknez $a1, $a1, $a2 maskeqz $a0, $a0, $a2 or $a0, $a0, $a1 - ld.d $s6, $sp, 232 + ld.d $s6, $sp, 216 ld.w $fp, $s0, 0 ld.w $s1, $s0, 4 - st.w $a0, $sp, 228 - st.w $a0, $sp, 224 + st.w $a0, $sp, 212 + st.w $a0, $sp, 208 add.w $a1, $fp, $a0 st.w $a1, $s0, 0 bge $s1, $a1, .LBB104_25 @@ -29464,10 +29436,10 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc ori $a1, $zero, 1 pcaddu18i $ra, %call36(calloc) jirl $ra, $ra, 0 - st.d $a0, $sp, 232 + st.d $a0, $sp, 216 beqz $a0, .LBB104_323 # %bb.26: - st.d $a0, $sp, 240 + st.d $a0, $sp, 224 beqz $s6, .LBB104_35 # %bb.27: beqz $s5, .LBB104_30 @@ -29484,7 +29456,7 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc move $a0, $s6 pcaddu18i $ra, %call36(free) jirl $ra, $ra, 0 - ld.d $s5, $sp, 240 + ld.d $s5, $sp, 224 b .LBB104_36 .LBB104_31: addi.d $a0, $a1, -58 @@ -29496,7 +29468,7 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc beq $a1, $a2, .LBB104_227 # %bb.33: .Ltmp339: # EH_LABEL - st.d $a3, $sp, 56 # 8-byte Folded Spill + st.d $a3, $sp, 40 # 8-byte Folded Spill pcalau12i $a0, %pc_hi20(.L.str.39) addi.d $a0, $a0, %pc_lo12(.L.str.39) pcaddu18i $ra, %call36(_Z4quitPKc) 
@@ -29507,12 +29479,12 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc move $s5, $a0 .LBB104_36: # %_ZN5ArrayIcLi0EE9push_backERKc.exit.i addi.d $a0, $s4, 1 - st.w $a0, $sp, 224 + st.w $a0, $sp, 208 ori $a0, $zero, 46 stx.b $a0, $s5, $s4 - ld.w $s4, $sp, 224 - ld.w $a0, $sp, 228 - ld.d $s5, $sp, 240 + ld.w $s4, $sp, 208 + ld.w $a0, $sp, 212 + ld.d $s5, $sp, 224 bne $s4, $a0, .LBB104_46 # %bb.37: slli.w $a0, $s4, 1 @@ -29521,11 +29493,11 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc masknez $a1, $a1, $a2 maskeqz $a0, $a0, $a2 or $a0, $a0, $a1 - ld.d $s6, $sp, 232 + ld.d $s6, $sp, 216 ld.w $fp, $s0, 0 ld.w $s1, $s0, 4 - st.w $a0, $sp, 228 - st.w $a0, $sp, 224 + st.w $a0, $sp, 212 + st.w $a0, $sp, 208 add.w $a1, $fp, $a0 st.w $a1, $s0, 0 bge $s1, $a1, .LBB104_39 @@ -29536,10 +29508,10 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc ori $a1, $zero, 1 pcaddu18i $ra, %call36(calloc) jirl $ra, $ra, 0 - st.d $a0, $sp, 232 + st.d $a0, $sp, 216 beqz $a0, .LBB104_323 # %bb.40: - st.d $a0, $sp, 240 + st.d $a0, $sp, 224 beqz $s6, .LBB104_45 # %bb.41: beqz $s5, .LBB104_44 @@ -29556,19 +29528,19 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc move $a0, $s6 pcaddu18i $ra, %call36(free) jirl $ra, $ra, 0 - ld.d $s5, $sp, 240 + ld.d $s5, $sp, 224 b .LBB104_46 .LBB104_45: move $s5, $a0 .LBB104_46: addi.d $a0, $s4, 1 - st.w $a0, $sp, 224 + st.w $a0, $sp, 208 stx.b $zero, $s5, $s4 - ld.w $fp, $sp, 224 + ld.w $fp, $sp, 208 blez $fp, .LBB104_48 # %bb.47: addi.w $fp, $fp, -1 - st.w $fp, $sp, 224 + st.w $fp, $sp, 208 .LBB104_48: # %_ZN5ArrayIcLi0EE8pop_backEv.exit.i286 ld.bu $a1, $s3, 0 beqz $a1, .LBB104_62 @@ -29582,7 +29554,7 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc move $a0, $s5 pcaddu18i $ra, %call36(free) jirl $ra, $ra, 0 - ld.d $a0, $sp, 240 + ld.d $a0, $sp, 224 .LBB104_51: # %.noexc293 # in Loop: Header=BB104_53 Depth=1 ld.bu $a1, $fp, -1 @@ -29590,16 +29562,16 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc .LBB104_52: # %_ZN5ArrayIcLi0EE9push_backERKc.exit.i288 # in Loop: Header=BB104_53 Depth=1 addi.d $a0, $s3, 1 - st.w $a0, $sp, 224 + st.w $a0, $sp, 208 stx.b $a1, $s4, $s3 ld.bu $a1, $fp, 0 addi.d $fp, $fp, 1 beqz $a1, .LBB104_61 .LBB104_53: # %.lr.ph.i # =>This Inner Loop Header: Depth=1 - ld.w $s3, $sp, 224 - ld.w $a0, $sp, 228 - ld.d $s4, $sp, 240 + ld.w $s3, $sp, 208 + ld.w $a0, $sp, 212 + ld.d $s4, $sp, 224 bne $s3, $a0, .LBB104_52 # %bb.54: # in Loop: Header=BB104_53 Depth=1 slli.w $a0, $s3, 1 @@ -29607,11 +29579,11 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc masknez $a2, $s1, $a1 maskeqz $a0, $a0, $a1 or $a0, $a0, $a2 - ld.d $s5, $sp, 232 + ld.d $s5, $sp, 216 ld.w $s6, $s0, 0 ld.w $s7, $s0, 4 - st.w $a0, $sp, 228 - st.w $a0, $sp, 224 + st.w $a0, $sp, 212 + st.w $a0, $sp, 208 add.w $a1, $s6, $a0 st.w $a1, $s0, 0 bge $s7, $a1, .LBB104_56 @@ -29623,10 +29595,10 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc ori $a1, $zero, 1 pcaddu18i $ra, %call36(calloc) jirl $ra, $ra, 0 - st.d $a0, $sp, 232 + st.d $a0, $sp, 216 beqz $a0, .LBB104_323 # %bb.57: # in Loop: Header=BB104_53 Depth=1 - st.d $a0, $sp, 240 + st.d $a0, $sp, 224 beqz $s5, .LBB104_51 # %bb.58: # in Loop: Header=BB104_53 Depth=1 beqz $s4, .LBB104_50 @@ -29641,10 +29613,10 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc st.w $s6, $s0, 4 b .LBB104_50 .LBB104_61: # %._crit_edge.loopexit.i290 - ld.w $fp, $sp, 224 + ld.w $fp, $sp, 208 .LBB104_62: # %._crit_edge.i - ld.w $a0, $sp, 228 - ld.d $s3, $sp, 240 + ld.w $a0, $sp, 212 + ld.d $s3, $sp, 224 bne $fp, $a0, .LBB104_72 # %bb.63: slli.w $a0, $fp, 1 @@ -29653,11 +29625,11 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc masknez $a1, $a1, $a2 maskeqz $a0, $a0, $a2 or $s5, $a0, $a1 - ld.d $s4, 
$sp, 232 + ld.d $s4, $sp, 216 ld.w $s1, $s0, 0 ld.w $s6, $s0, 4 - st.w $s5, $sp, 228 - st.w $s5, $sp, 224 + st.w $s5, $sp, 212 + st.w $s5, $sp, 208 add.w $a0, $s1, $s5 st.w $a0, $s0, 0 bge $s6, $a0, .LBB104_65 @@ -29669,10 +29641,10 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc move $a0, $s5 pcaddu18i $ra, %call36(calloc) jirl $ra, $ra, 0 - st.d $a0, $sp, 232 + st.d $a0, $sp, 216 beqz $a0, .LBB104_323 # %bb.66: - st.d $a0, $sp, 240 + st.d $a0, $sp, 224 beqz $s4, .LBB104_71 # %bb.67: beqz $s3, .LBB104_70 @@ -29692,44 +29664,44 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc move $a0, $s4 pcaddu18i $ra, %call36(free) jirl $ra, $ra, 0 - ld.d $s3, $sp, 240 + ld.d $s3, $sp, 224 b .LBB104_72 .LBB104_71: move $s3, $a0 .LBB104_72: # %_ZN6StringpLEPKc.exit295 - st.d $zero, $sp, 64 # 8-byte Folded Spill + st.d $zero, $sp, 48 # 8-byte Folded Spill move $s1, $zero addi.d $a0, $fp, 1 - st.w $a0, $sp, 224 + st.w $a0, $sp, 208 stx.b $zero, $s3, $fp ori $fp, $zero, 1 .LBB104_73: .Ltmp350: # EH_LABEL pcalau12i $a0, %pc_hi20(.L.str.36) addi.d $a1, $a0, %pc_lo12(.L.str.36) - addi.d $a0, $sp, 200 + addi.d $a0, $sp, 184 pcaddu18i $ra, %call36(_ZN6StringC2EPKc) jirl $ra, $ra, 0 .Ltmp351: # EH_LABEL # %bb.74: - st.d $fp, $sp, 48 # 8-byte Folded Spill - st.d $s1, $sp, 40 # 8-byte Folded Spill + st.d $fp, $sp, 32 # 8-byte Folded Spill + st.d $s1, $sp, 24 # 8-byte Folded Spill beqz $fp, .LBB104_102 # %bb.75: .Ltmp353: # EH_LABEL pcalau12i $a0, %pc_hi20(.L.str.36) addi.d $a1, $a0, %pc_lo12(.L.str.36) - addi.d $a0, $sp, 136 + addi.d $a0, $sp, 120 pcaddu18i $ra, %call36(_ZN6StringC2EPKc) jirl $ra, $ra, 0 .Ltmp354: # EH_LABEL # %bb.76: # %.preheader570 - st.d $s8, $sp, 104 # 8-byte Folded Spill + st.d $s8, $sp, 88 # 8-byte Folded Spill pcalau12i $a0, %pc_hi20(.L.str.43) addi.d $a0, $a0, %pc_lo12(.L.str.43) - st.d $a0, $sp, 80 # 8-byte Folded Spill + st.d $a0, $sp, 64 # 8-byte Folded Spill ori $fp, $zero, 2 - ld.d $a0, $sp, 88 # 8-byte Folded Reload + ld.d $a0, $sp, 72 # 8-byte Folded Reload blt $a0, $fp, .LBB104_332 # %bb.77: # %.lr.ph592 move $s5, $zero @@ -29743,12 +29715,12 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc .p2align 4, , 16 .LBB104_78: # %_ZN5ArrayIcLi0EED2Ev.exit297 # in Loop: Header=BB104_79 Depth=1 - ld.d $a0, $sp, 184 + ld.d $a0, $sp, 168 add.w $s5, $s4, $s5 pcaddu18i $ra, %call36(free) jirl $ra, $ra, 0 addi.d $s1, $s1, 1 - ld.d $a0, $sp, 88 # 8-byte Folded Reload + ld.d $a0, $sp, 72 # 8-byte Folded Reload beq $s1, $a0, .LBB104_110 .LBB104_79: # =>This Loop Header: Depth=1 # Child Loop BB104_83 Depth 2 @@ -29757,12 +29729,12 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc slli.d $a0, $s1, 3 ldx.d $a1, $s2, $a0 .Ltmp356: # EH_LABEL - addi.d $a0, $sp, 176 + addi.d $a0, $sp, 160 pcaddu18i $ra, %call36(_ZN6StringC2EPKc) jirl $ra, $ra, 0 .Ltmp357: # EH_LABEL # %bb.80: # in Loop: Header=BB104_79 Depth=1 - ld.w $a1, $sp, 176 + ld.w $a1, $sp, 160 addi.w $a0, $a1, -1 blez $a1, .LBB104_90 # %bb.81: # %.lr.ph.preheader @@ -29776,7 +29748,7 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc .LBB104_83: # %.lr.ph # Parent Loop BB104_79 Depth=1 # => This Inner Loop Header: Depth=2 - ld.d $a3, $sp, 192 + ld.d $a3, $sp, 176 ldx.bu $a4, $a3, $a2 bne $a4, $s7, .LBB104_82 # %bb.84: # in Loop: Header=BB104_83 Depth=2 @@ -29794,7 +29766,7 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc .LBB104_87: # %.lr.ph587 # Parent Loop BB104_79 Depth=1 # => This Inner Loop Header: Depth=2 - ld.d $a2, $sp, 192 + ld.d $a2, $sp, 176 ldx.bu $a3, $a2, $a1 bne $a3, $s3, .LBB104_90 # %bb.88: # in Loop: Header=BB104_87 Depth=2 @@ -29808,7 +29780,7 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc .p2align 4, , 16 
.LBB104_90: # %.critedge # in Loop: Header=BB104_79 Depth=1 - ld.d $a2, $sp, 192 + ld.d $a2, $sp, 176 bstrpick.d $a1, $a0, 31, 0 move $a4, $a0 .p2align 4, , 16 @@ -29838,8 +29810,8 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc .LBB104_96: # %.critedge2.thread # in Loop: Header=BB104_79 Depth=1 .Ltmp359: # EH_LABEL - addi.d $a0, $sp, 136 - addi.d $a1, $sp, 200 + addi.d $a0, $sp, 120 + addi.d $a1, $sp, 184 pcaddu18i $ra, %call36(_Z7putsizeR6StringS0_PKci) jirl $ra, $ra, 0 .Ltmp360: # EH_LABEL @@ -29853,13 +29825,13 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc or $a0, $s8, $a0 beqz $a0, .LBB104_100 # %bb.99: # in Loop: Header=BB104_79 Depth=1 - ld.d $a1, $sp, 192 + ld.d $a1, $sp, 176 pcalau12i $a0, %pc_hi20(.L.str.42) addi.d $a0, $a0, %pc_lo12(.L.str.42) pcaddu18i $ra, %call36(printf) jirl $ra, $ra, 0 .LBB104_100: # in Loop: Header=BB104_79 Depth=1 - ld.w $a0, $sp, 176 + ld.w $a0, $sp, 160 ld.w $a1, $s0, 0 ld.w $a2, $s0, 4 sub.w $a0, $a1, $a0 @@ -29869,15 +29841,15 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc st.w $a0, $s0, 4 b .LBB104_78 .LBB104_102: - st.d $zero, $sp, 104 # 8-byte Folded Spill - st.d $zero, $sp, 80 # 8-byte Folded Spill + st.d $zero, $sp, 88 # 8-byte Folded Spill + st.d $zero, $sp, 64 # 8-byte Folded Spill ori $s5, $zero, 1 move $s7, $s8 - ld.d $s6, $sp, 96 # 8-byte Folded Reload - ld.d $a0, $sp, 64 # 8-byte Folded Reload + ld.d $s6, $sp, 80 # 8-byte Folded Reload + ld.d $a0, $sp, 48 # 8-byte Folded Reload beqz $a0, .LBB104_140 .LBB104_103: - ld.d $a0, $sp, 240 + ld.d $a0, $sp, 224 pcalau12i $a1, %pc_hi20(.L.str.47) addi.d $a1, $a1, %pc_lo12(.L.str.47) pcaddu18i $ra, %call36(fopen) @@ -29908,9 +29880,9 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc ori $a2, $zero, 5 maskeqz $a0, $a2, $a0 or $a0, $a0, $a1 - ld.d $a1, $sp, 72 # 8-byte Folded Reload + ld.d $a1, $sp, 56 # 8-byte Folded Reload st.w $a0, $a1, %pc_lo12(level) - ld.d $a0, $sp, 104 # 8-byte Folded Reload + ld.d $a0, $sp, 88 # 8-byte Folded Reload slli.d $a0, $a0, 3 sub.d $fp, $zero, $a0 .p2align 4, , 16 @@ -29923,16 +29895,16 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc # %bb.108: # in Loop: Header=BB104_107 Depth=1 beqz $a0, .LBB104_114 # %bb.109: # in Loop: Header=BB104_107 Depth=1 - ld.d $a0, $sp, 104 # 8-byte Folded Reload + ld.d $a0, $sp, 88 # 8-byte Folded Reload addi.w $a0, $a0, 1 - st.d $a0, $sp, 104 # 8-byte Folded Spill + st.d $a0, $sp, 88 # 8-byte Folded Spill addi.d $fp, $fp, -8 b .LBB104_107 .LBB104_110: # %._crit_edge - ld.d $s8, $sp, 104 # 8-byte Folded Reload + ld.d $s8, $sp, 88 # 8-byte Folded Reload beqz $s5, .LBB104_315 # %bb.111: - ld.d $a0, $sp, 240 + ld.d $a0, $sp, 224 pcalau12i $a1, %pc_hi20(.L.str.44) addi.d $a1, $a1, %pc_lo12(.L.str.44) pcaddu18i $ra, %call36(fopen) @@ -29940,12 +29912,12 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc beqz $a0, .LBB104_314 # %bb.112: move $a4, $a0 - ld.d $a0, $sp, 72 # 8-byte Folded Reload + ld.d $a0, $sp, 56 # 8-byte Folded Reload ld.w $a2, $a0, %pc_lo12(level) - ld.d $a3, $sp, 152 + ld.d $a3, $sp, 136 pcalau12i $a0, %pc_hi20(.L.str.45) addi.d $a1, $a0, %pc_lo12(.L.str.45) - st.d $a4, $sp, 80 # 8-byte Folded Spill + st.d $a4, $sp, 64 # 8-byte Folded Spill move $a0, $a4 pcaddu18i $ra, %call36(fprintf) jirl $ra, $ra, 0 @@ -29958,34 +29930,34 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc bne $s5, $a0, .LBB104_120 # %bb.113: move $s7, $s8 - ld.d $s6, $sp, 96 # 8-byte Folded Reload - ld.d $s4, $sp, 80 # 8-byte Folded Reload + ld.d $s6, $sp, 80 # 8-byte Folded Reload + ld.d $s4, $sp, 64 # 8-byte Folded Reload b .LBB104_131 .LBB104_114: - ld.d $a0, $sp, 72 # 8-byte Folded Reload + ld.d $a0, $sp, 56 # 8-byte Folded Reload 
ld.w $a2, $a0, %pc_lo12(level) pcalau12i $a0, %pc_hi20(.L.str.51) addi.d $a0, $a0, %pc_lo12(.L.str.51) - ld.d $a1, $sp, 104 # 8-byte Folded Reload + ld.d $a1, $sp, 88 # 8-byte Folded Reload pcaddu18i $ra, %call36(printf) jirl $ra, $ra, 0 - st.d $s1, $sp, 80 # 8-byte Folded Spill + st.d $s1, $sp, 64 # 8-byte Folded Spill move $a0, $s1 pcaddu18i $ra, %call36(ftell) jirl $ra, $ra, 0 - ld.w $a1, $sp, 204 + ld.w $a1, $sp, 188 move $s3, $a0 addi.w $s8, $a0, 4 bge $a1, $s8, .LBB104_141 # %bb.115: - ld.d $s4, $sp, 208 - ld.d $s1, $sp, 216 - ld.w $a4, $sp, 200 - st.w $s8, $sp, 204 - st.w $s8, $sp, 200 + ld.d $s4, $sp, 192 + ld.d $s1, $sp, 200 + ld.w $a4, $sp, 184 + st.w $s8, $sp, 188 + st.w $s8, $sp, 184 blez $s8, .LBB104_142 # %bb.116: - st.d $a4, $sp, 96 # 8-byte Folded Spill + st.d $a4, $sp, 80 # 8-byte Folded Spill ld.w $a0, $s0, 0 ld.w $a1, $s0, 4 add.w $a0, $a0, $s8 @@ -29998,11 +29970,11 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc move $a0, $s8 pcaddu18i $ra, %call36(calloc) jirl $ra, $ra, 0 - st.d $a0, $sp, 208 + st.d $a0, $sp, 192 beqz $a0, .LBB104_341 # %bb.119: - st.d $a0, $sp, 216 - ld.d $a4, $sp, 96 # 8-byte Folded Reload + st.d $a0, $sp, 200 + ld.d $a4, $sp, 80 # 8-byte Folded Reload bnez $s4, .LBB104_143 b .LBB104_147 .LBB104_120: @@ -30046,8 +30018,8 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc pcaddu18i $ra, %call36(calloc) jirl $ra, $ra, 0 move $s6, $a0 - ld.d $a0, $sp, 96 # 8-byte Folded Reload - ld.d $s4, $sp, 80 # 8-byte Folded Reload + ld.d $a0, $sp, 80 # 8-byte Folded Reload + ld.d $s4, $sp, 64 # 8-byte Folded Reload beqz $s6, .LBB104_339 # %bb.128: # %_ZN5ArrayIlLi0EE6createEi.exit.i st.d $zero, $s6, 0 @@ -30059,7 +30031,7 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc pcaddu18i $ra, %call36(free) jirl $ra, $ra, 0 .LBB104_131: # %_ZN5ArrayIlLi0EE6resizeEi.exit - ld.d $fp, $sp, 216 + ld.d $fp, $sp, 200 move $a0, $s4 pcaddu18i $ra, %call36(rewind) jirl $ra, $ra, 0 @@ -30104,7 +30076,7 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc move $a1, $zero pcaddu18i $ra, %call36(fseek) jirl $ra, $ra, 0 - ld.w $a0, $sp, 136 + ld.w $a0, $sp, 120 ld.w $a1, $s0, 0 ld.w $a2, $s0, 4 sub.w $a0, $a1, $a0 @@ -30113,27 +30085,27 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc # %bb.138: st.w $a0, $s0, 4 .LBB104_139: # %_ZN5ArrayIcLi0EED2Ev.exit310 - ld.d $a0, $sp, 144 + ld.d $a0, $sp, 128 pcaddu18i $ra, %call36(free) jirl $ra, $ra, 0 - st.d $s5, $sp, 104 # 8-byte Folded Spill - ld.d $a0, $sp, 64 # 8-byte Folded Reload + st.d $s5, $sp, 88 # 8-byte Folded Spill + ld.d $a0, $sp, 48 # 8-byte Folded Reload bnez $a0, .LBB104_103 .LBB104_140: move $s8, $s7 move $fp, $s5 - st.d $s6, $sp, 96 # 8-byte Folded Spill - ld.d $a0, $sp, 72 # 8-byte Folded Reload + st.d $s6, $sp, 80 # 8-byte Folded Spill + ld.d $a0, $sp, 56 # 8-byte Folded Reload ld.w $a0, $a0, %pc_lo12(level) ori $a1, $zero, 12 bgeu $a1, $a0, .LBB104_181 b .LBB104_182 .LBB104_141: - st.w $s8, $sp, 200 + st.w $s8, $sp, 184 b .LBB104_147 .LBB104_142: move $a0, $zero - addi.d $a2, $sp, 208 + addi.d $a2, $sp, 192 vrepli.b $vr0, 0 vst $vr0, $a2, 0 beqz $s4, .LBB104_147 @@ -30159,17 +30131,17 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc pcaddu18i $ra, %call36(free) jirl $ra, $ra, 0 .LBB104_147: # %_ZN5ArrayIcLi0EE6resizeEi.exit - ld.d $s1, $sp, 80 # 8-byte Folded Reload + ld.d $s1, $sp, 64 # 8-byte Folded Reload move $a0, $s1 pcaddu18i $ra, %call36(rewind) jirl $ra, $ra, 0 - ld.d $a0, $sp, 216 + ld.d $a0, $sp, 200 ori $a1, $zero, 1 move $a2, $s3 move $a3, $s1 pcaddu18i $ra, %call36(fread) jirl $ra, $ra, 0 - ld.d $a0, $sp, 104 # 8-byte Folded Reload + ld.d $a0, $sp, 88 # 8-byte Folded Reload bge $s5, $a0, 
.LBB104_154 # %bb.148: ld.w $s1, $s0, 0 @@ -30199,13 +30171,13 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc # %bb.153: # %.noexc327 .LBB104_154: move $s8, $s7 - st.d $s6, $sp, 96 # 8-byte Folded Spill + st.d $s6, $sp, 80 # 8-byte Folded Spill move $s7, $a0 b .LBB104_168 .LBB104_155: move $a0, $zero .LBB104_156: # %_ZN5ArrayIPcLi0EE6createEi.exit.i323 - ld.d $a3, $sp, 104 # 8-byte Folded Reload + ld.d $a3, $sp, 88 # 8-byte Folded Reload slt $a2, $a3, $s5 masknez $a1, $s5, $a2 maskeqz $a2, $a3, $a2 @@ -30227,7 +30199,7 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc jirl $ra, $ra, 0 ld.w $s1, $s0, 0 ld.w $s3, $s0, 4 - ld.d $s7, $sp, 104 # 8-byte Folded Reload + ld.d $s7, $sp, 88 # 8-byte Folded Reload blez $s7, .LBB104_164 # %bb.159: sub.w $a0, $zero, $fp @@ -30255,7 +30227,7 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc move $a0, $zero move $s8, $s4 .LBB104_165: # %_ZN5ArrayIlLi0EE6createEi.exit.i332 - st.d $a0, $sp, 96 # 8-byte Folded Spill + st.d $a0, $sp, 80 # 8-byte Folded Spill move $a1, $s6 move $a2, $s5 pcaddu18i $ra, %call36(memcpy) @@ -30270,7 +30242,7 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc pcaddu18i $ra, %call36(free) jirl $ra, $ra, 0 .LBB104_168: # %_ZN5ArrayIlLi0EE6resizeEi.exit337 - ld.d $fp, $sp, 216 + ld.d $fp, $sp, 200 ori $a0, $zero, 13 ld.bu $a1, $fp, 0 beqz $a1, .LBB104_171 @@ -30304,7 +30276,7 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc jirl $ra, $ra, 0 slli.d $a1, $s1, 3 addi.d $a3, $fp, 1 - ld.d $a2, $sp, 96 # 8-byte Folded Reload + ld.d $a2, $sp, 80 # 8-byte Folded Reload stx.d $a0, $a2, $a1 move $a2, $a3 .p2align 4, , 16 @@ -30335,7 +30307,7 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc b .LBB104_173 .LBB104_180: move $fp, $s7 - ld.d $a0, $sp, 72 # 8-byte Folded Reload + ld.d $a0, $sp, 56 # 8-byte Folded Reload ld.w $a0, $a0, %pc_lo12(level) ori $a1, $zero, 12 bltu $a1, $a0, .LBB104_182 @@ -30347,14 +30319,13 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc addi.d $a0, $a0, %pc_lo12(buf) move $s7, $s8 move $s5, $fp - ld.d $s6, $sp, 96 # 8-byte Folded Reload + ld.d $s6, $sp, 80 # 8-byte Folded Reload pcaddu18i $ra, %call36(_ZN5ArrayIhLi0EE6resizeEi) jirl $ra, $ra, 0 .Ltmp389: # EH_LABEL .LBB104_182: # %_ZN3Buf7setsizeEi.exit - st.d $fp, $sp, 32 # 8-byte Folded Spill - vrepli.b $vr4, 0 - ld.d $s4, $sp, 104 # 8-byte Folded Reload + st.d $fp, $sp, 16 # 8-byte Folded Spill + ld.d $s4, $sp, 88 # 8-byte Folded Reload blez $s4, .LBB104_185 # %bb.183: # %.lr.ph601.preheader ori $a0, $zero, 4 @@ -30369,11 +30340,11 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc .LBB104_186: # %vector.ph bstrpick.d $a0, $s4, 30, 2 slli.d $a0, $a0, 2 - ld.d $a1, $sp, 96 # 8-byte Folded Reload + vrepli.b $vr0, 0 + ld.d $a1, $sp, 80 # 8-byte Folded Reload addi.d $a1, $a1, 16 move $a2, $a0 - vori.b $vr0, $vr4, 0 - vori.b $vr1, $vr4, 0 + vori.b $vr1, $vr0, 0 .p2align 4, , 16 .LBB104_187: # %vector.body # =>This Inner Loop Header: Depth=1 @@ -30390,7 +30361,7 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc vpickve2gr.d $s5, $vr0, 0 beq $a0, $s4, .LBB104_191 .LBB104_189: # %.lr.ph601.preheader863 - ld.d $a1, $sp, 96 # 8-byte Folded Reload + ld.d $a1, $sp, 80 # 8-byte Folded Reload alsl.d $a1, $a0, $a1, 3 sub.d $a0, $s4, $a0 .p2align 4, , 16 @@ -30403,52 +30374,50 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc bnez $a0, .LBB104_190 .LBB104_191: # %._crit_edge602 ori $a0, $zero, 2048 - st.w $a0, $sp, 136 - ld.d $a0, $sp, 40 # 8-byte Folded Reload - st.w $a0, $sp, 140 - ld.d $a0, $sp, 80 # 8-byte Folded Reload - st.d $a0, $sp, 144 + st.w $a0, $sp, 120 + ld.d $a0, $sp, 24 # 8-byte Folded Reload + st.w $a0, $sp, 124 + ld.d $a0, $sp, 64 # 8-byte Folded Reload + st.d $a0, $sp, 128 ori $a0, 
$zero, 0 - ld.d $a1, $sp, 72 # 8-byte Folded Reload + ld.d $a1, $sp, 56 # 8-byte Folded Reload ld.w $a1, $a1, %pc_lo12(level) lu32i.d $a0, -1 - st.d $a0, $sp, 152 - st.w $zero, $sp, 160 + st.d $a0, $sp, 136 + st.w $zero, $sp, 144 slti $a0, $a1, 1 - ld.d $a1, $sp, 64 # 8-byte Folded Reload + ld.d $a1, $sp, 48 # 8-byte Folded Reload xori $a1, $a1, 1 or $a0, $a1, $a0 - st.d $zero, $sp, 168 - vst $vr4, $sp, 16 # 16-byte Folded Spill + st.d $zero, $sp, 152 bnez $a0, .LBB104_193 # %bb.192: # %.preheader.preheader.i - ld.d $a0, $sp, 80 # 8-byte Folded Reload + ld.d $a0, $sp, 64 # 8-byte Folded Reload pcaddu18i $ra, %call36(getc) jirl $ra, $ra, 0 - ld.d $a1, $sp, 144 + ld.d $a1, $sp, 128 andi $fp, $a0, 255 - st.w $fp, $sp, 160 + st.w $fp, $sp, 144 move $a0, $a1 pcaddu18i $ra, %call36(getc) jirl $ra, $ra, 0 - ld.d $a1, $sp, 144 + ld.d $a1, $sp, 128 move $s6, $a0 bstrins.d $s6, $fp, 63, 8 - st.w $s6, $sp, 160 + st.w $s6, $sp, 144 move $a0, $a1 pcaddu18i $ra, %call36(getc) jirl $ra, $ra, 0 - ld.d $a1, $sp, 144 + ld.d $a1, $sp, 128 move $s7, $a0 bstrins.d $s7, $s6, 63, 8 - st.w $s7, $sp, 160 + st.w $s7, $sp, 144 move $a0, $a1 pcaddu18i $ra, %call36(getc) jirl $ra, $ra, 0 - vld $vr4, $sp, 16 # 16-byte Folded Reload bstrins.d $a0, $s7, 63, 8 - ld.d $s4, $sp, 104 # 8-byte Folded Reload - st.w $a0, $sp, 160 + ld.d $s4, $sp, 88 # 8-byte Folded Reload + st.w $a0, $sp, 144 .LBB104_193: # %vector.body856.preheader lu12i.w $a0, 16 lu32i.d $a0, 196610 @@ -30497,8 +30466,8 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc bstrpick.d $a5, $a5, 15, 0 div.du $a5, $a1, $a5 vinsgr2vr.h $vr2, $a5, 3 - vilvl.h $vr1, $vr4, $vr3 - vilvl.h $vr2, $vr4, $vr2 + vsllwil.wu.hu $vr1, $vr3, 0 + vsllwil.wu.hu $vr2, $vr2, 0 add.d $a5, $a2, $a0 vstx $vr1, $a5, $a3 vstx $vr2, $a5, $a4 @@ -30506,20 +30475,20 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc vaddi.hu $vr0, $vr0, 8 bnez $a0, .LBB104_194 # %bb.195: # %_ZN7EncoderC2E4ModeP8_IO_FILE.exit - ld.d $a0, $sp, 48 # 8-byte Folded Reload + ld.d $a0, $sp, 32 # 8-byte Folded Reload beqz $a0, .LBB104_203 # %bb.196: # %.preheader blez $s4, .LBB104_200 # %bb.197: # %.lr.ph616.preheader move $fp, $s8 - ld.d $s1, $sp, 96 # 8-byte Folded Reload + ld.d $s1, $sp, 80 # 8-byte Folded Reload .p2align 4, , 16 .LBB104_198: # %.lr.ph616 # =>This Inner Loop Header: Depth=1 ld.d $a0, $fp, 0 ld.d $a1, $s1, 0 .Ltmp408: # EH_LABEL - addi.d $a2, $sp, 136 + addi.d $a2, $sp, 120 pcaddu18i $ra, %call36(_Z8compressPKclR7Encoder) jirl $ra, $ra, 0 .Ltmp409: # EH_LABEL @@ -30529,23 +30498,23 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc addi.d $fp, $fp, 8 bnez $s4, .LBB104_198 .LBB104_200: # %._crit_edge617 - ld.w $a0, $sp, 140 - ld.d $a1, $sp, 144 + ld.w $a0, $sp, 124 + ld.d $a1, $sp, 128 bnez $a0, .LBB104_223 # %bb.201: # %._crit_edge617 - ld.d $a0, $sp, 72 # 8-byte Folded Reload + ld.d $a0, $sp, 56 # 8-byte Folded Reload ld.w $a0, $a0, %pc_lo12(level) - ld.d $fp, $sp, 32 # 8-byte Folded Reload + ld.d $fp, $sp, 16 # 8-byte Folded Reload blez $a0, .LBB104_224 # %bb.202: - ld.bu $a0, $sp, 155 + ld.bu $a0, $sp, 139 pcaddu18i $ra, %call36(putc) jirl $ra, $ra, 0 - ld.d $a1, $sp, 144 + ld.d $a1, $sp, 128 b .LBB104_224 .LBB104_203: ori $fp, $zero, 2 - ld.d $s1, $sp, 88 # 8-byte Folded Reload + ld.d $s1, $sp, 72 # 8-byte Folded Reload slt $a0, $fp, $s1 ori $a1, $zero, 8 masknez $a1, $a1, $a0 @@ -30554,19 +30523,19 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc or $a0, $a0, $a1 ldx.d $a1, $s2, $a0 .Ltmp390: # EH_LABEL - addi.d $a0, $sp, 176 + addi.d $a0, $sp, 160 pcaddu18i $ra, %call36(_ZN6StringC2EPKc) jirl $ra, $ra, 0 .Ltmp391: # EH_LABEL # %bb.204: bne 
$s1, $fp, .LBB104_231 # %bb.205: - ld.w $fp, $sp, 176 + ld.w $fp, $sp, 160 ori $a0, $zero, 2 addi.w $a6, $fp, -2 blt $fp, $a0, .LBB104_213 # %bb.206: # %.lr.ph608 - ld.d $a0, $sp, 192 + ld.d $a0, $sp, 176 ori $a1, $zero, 92 ori $a2, $zero, 47 ori $a3, $zero, 1 @@ -30594,16 +30563,16 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc addi.w $a0, $zero, -1 bne $a6, $a0, .LBB104_231 .LBB104_214: # %._crit_edge609.thread - ld.w $a1, $sp, 180 + ld.w $a1, $sp, 164 ori $a0, $zero, 2 bge $a1, $a0, .LBB104_229 # %bb.215: - ld.d $s1, $sp, 184 - ld.d $s2, $sp, 192 + ld.d $s1, $sp, 168 + ld.d $s2, $sp, 176 ld.w $s5, $s0, 0 ld.w $s3, $s0, 4 lu32i.d $a0, 2 - st.d $a0, $sp, 176 + st.d $a0, $sp, 160 addi.w $a0, $s5, 2 st.w $a0, $s0, 0 bge $s3, $a0, .LBB104_217 @@ -30615,10 +30584,10 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc ori $a1, $zero, 1 pcaddu18i $ra, %call36(calloc) jirl $ra, $ra, 0 - st.d $a0, $sp, 184 + st.d $a0, $sp, 168 beqz $a0, .LBB104_343 # %bb.218: # %_ZN5ArrayIcLi0EE6createEi.exit.i431 - st.d $a0, $sp, 192 + st.d $a0, $sp, 176 beqz $s1, .LBB104_230 # %bb.219: beqz $s2, .LBB104_222 @@ -30641,7 +30610,7 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc jirl $ra, $ra, 0 b .LBB104_230 .LBB104_223: - ld.d $fp, $sp, 32 # 8-byte Folded Reload + ld.d $fp, $sp, 16 # 8-byte Folded Reload .LBB104_224: move $a0, $a1 pcaddu18i $ra, %call36(ftell) @@ -30659,11 +30628,11 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc .LBB104_226: addi.d $a1, $a1, -48 andi $a1, $a1, 255 - ld.d $a2, $sp, 72 # 8-byte Folded Reload + ld.d $a2, $sp, 56 # 8-byte Folded Reload st.w $a1, $a2, %pc_lo12(level) .LBB104_227: ori $a1, $zero, 2 - st.d $zero, $sp, 56 # 8-byte Folded Spill + st.d $zero, $sp, 40 # 8-byte Folded Spill beq $s0, $a1, .LBB104_326 # %bb.228: addi.d $s2, $s2, 8 @@ -30671,37 +30640,37 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc addi.w $s0, $s0, -1 b .LBB104_5 .LBB104_229: - st.w $a0, $sp, 176 + st.w $a0, $sp, 160 .LBB104_230: # %_ZN6StringaSEPKc.exit - ld.d $a0, $sp, 192 + ld.d $a0, $sp, 176 ori $a1, $zero, 46 st.h $a1, $a0, 0 .LBB104_231: # %.thread555 - ld.d $s1, $sp, 192 + ld.d $s1, $sp, 176 move $a0, $s1 pcaddu18i $ra, %call36(strlen) jirl $ra, $ra, 0 - ld.w $a1, $sp, 180 + ld.w $a1, $sp, 164 addi.w $s5, $a0, 1 bge $a1, $s5, .LBB104_234 # %bb.232: addi.w $a0, $a0, 0 - ld.d $s2, $sp, 184 - ld.w $fp, $sp, 176 - st.w $s5, $sp, 180 + ld.d $s2, $sp, 168 + ld.w $fp, $sp, 160 + st.w $s5, $sp, 164 lu12i.w $a1, 524287 ori $a1, $a1, 4095 - st.w $s5, $sp, 176 + st.w $s5, $sp, 160 bltu $a0, $a1, .LBB104_235 # %bb.233: move $a0, $zero - addi.d $a1, $sp, 184 - vld $vr0, $sp, 16 # 16-byte Folded Reload + addi.d $a1, $sp, 168 + vrepli.b $vr0, 0 vst $vr0, $a1, 0 bnez $s2, .LBB104_239 b .LBB104_242 .LBB104_234: - st.w $s5, $sp, 176 + st.w $s5, $sp, 160 move $a0, $s1 b .LBB104_242 .LBB104_235: @@ -30717,10 +30686,10 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc move $a0, $s5 pcaddu18i $ra, %call36(calloc) jirl $ra, $ra, 0 - st.d $a0, $sp, 184 + st.d $a0, $sp, 168 beqz $a0, .LBB104_335 # %bb.238: - st.d $a0, $sp, 192 + st.d $a0, $sp, 176 beqz $s2, .LBB104_242 .LBB104_239: slt $a1, $s5, $fp @@ -30741,16 +30710,16 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc move $a0, $s2 pcaddu18i $ra, %call36(free) jirl $ra, $ra, 0 - ld.d $a0, $sp, 192 + ld.d $a0, $sp, 176 .LBB104_242: move $a1, $s1 pcaddu18i $ra, %call36(strcpy) jirl $ra, $ra, 0 - ld.d $s2, $sp, 192 + ld.d $s2, $sp, 176 ld.bu $a0, $s2, 0 beqz $a0, .LBB104_267 # %bb.243: - ld.w $s1, $sp, 176 + ld.w $s1, $sp, 160 ori $a0, $zero, 3 bne $s1, $a0, .LBB104_245 # %bb.244: @@ -30763,7 +30732,7 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc .LBB104_246: # 
%.thread556 addi.w $s1, $s1, -1 .LBB104_247: # %_ZN5ArrayIcLi0EE8pop_backEv.exit.i345 - ld.w $a0, $sp, 180 + ld.w $a0, $sp, 164 bne $s1, $a0, .LBB104_256 # %bb.248: slli.w $a0, $s1, 1 @@ -30772,11 +30741,11 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc masknez $a1, $a1, $a2 maskeqz $a0, $a0, $a2 or $a0, $a0, $a1 - ld.d $s5, $sp, 184 + ld.d $s5, $sp, 168 ld.w $fp, $s0, 0 ld.w $s3, $s0, 4 - st.w $a0, $sp, 180 - st.w $a0, $sp, 176 + st.w $a0, $sp, 164 + st.w $a0, $sp, 160 add.w $a1, $fp, $a0 st.w $a1, $s0, 0 bge $s3, $a1, .LBB104_250 @@ -30787,10 +30756,10 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc ori $a1, $zero, 1 pcaddu18i $ra, %call36(calloc) jirl $ra, $ra, 0 - st.d $a0, $sp, 184 + st.d $a0, $sp, 168 beqz $a0, .LBB104_335 # %bb.251: - st.d $a0, $sp, 192 + st.d $a0, $sp, 176 beqz $s5, .LBB104_255 # %bb.252: move $a1, $s2 @@ -30805,18 +30774,18 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc move $a0, $s5 pcaddu18i $ra, %call36(free) jirl $ra, $ra, 0 - ld.d $s2, $sp, 192 + ld.d $s2, $sp, 176 b .LBB104_256 .LBB104_255: move $s2, $a0 .LBB104_256: # %_ZN5ArrayIcLi0EE9push_backERKc.exit.i348 addi.d $a0, $s1, 1 - st.w $a0, $sp, 176 + st.w $a0, $sp, 160 ori $a0, $zero, 47 stx.b $a0, $s2, $s1 - ld.w $s1, $sp, 176 - ld.w $a0, $sp, 180 - ld.d $s2, $sp, 192 + ld.w $s1, $sp, 160 + ld.w $a0, $sp, 164 + ld.d $s2, $sp, 176 bne $s1, $a0, .LBB104_266 # %bb.257: slli.w $a0, $s1, 1 @@ -30825,11 +30794,11 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc masknez $a1, $a1, $a2 maskeqz $a0, $a0, $a2 or $a0, $a0, $a1 - ld.d $s5, $sp, 184 + ld.d $s5, $sp, 168 ld.w $fp, $s0, 0 ld.w $s3, $s0, 4 - st.w $a0, $sp, 180 - st.w $a0, $sp, 176 + st.w $a0, $sp, 164 + st.w $a0, $sp, 160 add.w $a1, $fp, $a0 st.w $a1, $s0, 0 bge $s3, $a1, .LBB104_259 @@ -30840,10 +30809,10 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc ori $a1, $zero, 1 pcaddu18i $ra, %call36(calloc) jirl $ra, $ra, 0 - st.d $a0, $sp, 184 + st.d $a0, $sp, 168 beqz $a0, .LBB104_335 # %bb.260: - st.d $a0, $sp, 192 + st.d $a0, $sp, 176 beqz $s5, .LBB104_265 # %bb.261: beqz $s2, .LBB104_264 @@ -30860,13 +30829,13 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc move $a0, $s5 pcaddu18i $ra, %call36(free) jirl $ra, $ra, 0 - ld.d $s2, $sp, 192 + ld.d $s2, $sp, 176 b .LBB104_266 .LBB104_265: move $s2, $a0 .LBB104_266: # %_ZN6StringpLEPKc.exit356 addi.d $a0, $s1, 1 - st.w $a0, $sp, 176 + st.w $a0, $sp, 160 stx.b $zero, $s2, $s1 .LBB104_267: blez $s4, .LBB104_300 @@ -30877,30 +30846,30 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc .p2align 4, , 16 .LBB104_269: # %_ZN5ArrayIcLi0EED2Ev.exit373 # in Loop: Header=BB104_270 Depth=1 - ld.d $a0, $sp, 120 + ld.d $a0, $sp, 104 pcaddu18i $ra, %call36(free) jirl $ra, $ra, 0 addi.d $fp, $fp, 1 - ld.d $a0, $sp, 104 # 8-byte Folded Reload + ld.d $a0, $sp, 88 # 8-byte Folded Reload beq $fp, $a0, .LBB104_300 .LBB104_270: # =>This Loop Header: Depth=1 # Child Loop BB104_278 Depth 2 move $s7, $s8 - ld.d $a1, $sp, 192 + ld.d $a1, $sp, 176 .Ltmp399: # EH_LABEL - addi.d $a0, $sp, 112 + addi.d $a0, $sp, 96 pcaddu18i $ra, %call36(_ZN6StringC2EPKc) jirl $ra, $ra, 0 .Ltmp400: # EH_LABEL # %bb.271: # in Loop: Header=BB104_270 Depth=1 slli.d $a1, $fp, 3 - ld.w $s8, $sp, 112 + ld.w $s8, $sp, 96 ldx.d $a0, $s7, $a1 - st.d $a1, $sp, 88 # 8-byte Folded Spill + st.d $a1, $sp, 72 # 8-byte Folded Spill blez $s8, .LBB104_273 # %bb.272: # in Loop: Header=BB104_270 Depth=1 addi.w $s8, $s8, -1 - st.w $s8, $sp, 112 + st.w $s8, $sp, 96 .LBB104_273: # %_ZN5ArrayIcLi0EE8pop_backEv.exit.i359 # in Loop: Header=BB104_270 Depth=1 ld.bu $a1, $a0, 0 @@ -30915,16 +30884,16 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc move $a0, $s5 pcaddu18i $ra, 
%call36(free) jirl $ra, $ra, 0 - ld.d $a0, $sp, 128 + ld.d $a0, $sp, 112 .LBB104_276: # %.noexc369 # in Loop: Header=BB104_278 Depth=2 - st.w $s1, $sp, 112 + st.w $s1, $sp, 96 ld.bu $a1, $s8, -1 move $s2, $a0 .LBB104_277: # %_ZN5ArrayIcLi0EE9push_backERKc.exit.i363 # in Loop: Header=BB104_278 Depth=2 addi.d $a0, $s1, 1 - st.w $a0, $sp, 112 + st.w $a0, $sp, 96 stx.b $a1, $s2, $s1 ld.bu $a1, $s8, 0 addi.d $s8, $s8, 1 @@ -30932,9 +30901,9 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc .LBB104_278: # %.lr.ph.i361 # Parent Loop BB104_270 Depth=1 # => This Inner Loop Header: Depth=2 - ld.w $s1, $sp, 112 - ld.w $a0, $sp, 116 - ld.d $s2, $sp, 128 + ld.w $s1, $sp, 96 + ld.w $a0, $sp, 100 + ld.d $s2, $sp, 112 bne $s1, $a0, .LBB104_277 # %bb.279: # in Loop: Header=BB104_278 Depth=2 slli.w $a0, $s1, 1 @@ -30942,11 +30911,11 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc masknez $a2, $s6, $a1 maskeqz $a0, $a0, $a1 or $a0, $a0, $a2 - ld.d $s5, $sp, 120 + ld.d $s5, $sp, 104 ld.w $s3, $s0, 0 ld.w $s4, $s0, 4 - st.w $a0, $sp, 116 - st.w $a0, $sp, 112 + st.w $a0, $sp, 100 + st.w $a0, $sp, 96 add.w $a1, $s3, $a0 st.w $a1, $s0, 0 bge $s4, $a1, .LBB104_281 @@ -30958,10 +30927,10 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc ori $a1, $zero, 1 pcaddu18i $ra, %call36(calloc) jirl $ra, $ra, 0 - st.d $a0, $sp, 120 + st.d $a0, $sp, 104 beqz $a0, .LBB104_321 # %bb.282: # in Loop: Header=BB104_278 Depth=2 - st.d $a0, $sp, 128 + st.d $a0, $sp, 112 beqz $s5, .LBB104_276 # %bb.283: # in Loop: Header=BB104_278 Depth=2 beqz $s2, .LBB104_275 @@ -30978,11 +30947,11 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc .p2align 4, , 16 .LBB104_286: # %._crit_edge.loopexit.i365 # in Loop: Header=BB104_270 Depth=1 - ld.w $s8, $sp, 112 + ld.w $s8, $sp, 96 .LBB104_287: # %._crit_edge.i367 # in Loop: Header=BB104_270 Depth=1 - ld.w $a0, $sp, 116 - ld.d $s1, $sp, 128 + ld.w $a0, $sp, 100 + ld.d $s1, $sp, 112 bne $s8, $a0, .LBB104_297 # %bb.288: # in Loop: Header=BB104_270 Depth=1 slli.w $a0, $s8, 1 @@ -30990,11 +30959,11 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc masknez $a2, $s6, $a1 maskeqz $a0, $a0, $a1 or $s5, $a0, $a2 - ld.d $s2, $sp, 120 + ld.d $s2, $sp, 104 ld.w $s3, $s0, 0 ld.w $s4, $s0, 4 - st.w $s5, $sp, 116 - st.w $s5, $sp, 112 + st.w $s5, $sp, 100 + st.w $s5, $sp, 96 add.w $a0, $s3, $s5 st.w $a0, $s0, 0 bge $s4, $a0, .LBB104_290 @@ -31007,10 +30976,10 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc move $a0, $s5 pcaddu18i $ra, %call36(calloc) jirl $ra, $ra, 0 - st.d $a0, $sp, 120 + st.d $a0, $sp, 104 beqz $a0, .LBB104_321 # %bb.291: # in Loop: Header=BB104_270 Depth=1 - st.d $a0, $sp, 128 + st.d $a0, $sp, 112 beqz $s2, .LBB104_296 # %bb.292: # in Loop: Header=BB104_270 Depth=1 beqz $s1, .LBB104_295 @@ -31031,26 +31000,26 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc move $a0, $s2 pcaddu18i $ra, %call36(free) jirl $ra, $ra, 0 - ld.d $s1, $sp, 128 + ld.d $s1, $sp, 112 b .LBB104_297 .LBB104_296: # in Loop: Header=BB104_270 Depth=1 move $s1, $a0 .p2align 4, , 16 .LBB104_297: # in Loop: Header=BB104_270 Depth=1 addi.d $a0, $s8, 1 - st.w $a0, $sp, 112 + st.w $a0, $sp, 96 stx.b $zero, $s1, $s8 - ld.d $a0, $sp, 128 - ld.d $a1, $sp, 96 # 8-byte Folded Reload - ld.d $a2, $sp, 88 # 8-byte Folded Reload + ld.d $a0, $sp, 112 + ld.d $a1, $sp, 80 # 8-byte Folded Reload + ld.d $a2, $sp, 72 # 8-byte Folded Reload ldx.d $a1, $a1, $a2 .Ltmp402: # EH_LABEL - addi.d $a2, $sp, 136 + addi.d $a2, $sp, 120 pcaddu18i $ra, %call36(_Z10decompressPKclR7Encoder) jirl $ra, $ra, 0 .Ltmp403: # EH_LABEL # %bb.298: # in Loop: Header=BB104_270 Depth=1 - ld.w $a0, $sp, 112 + ld.w $a0, $sp, 96 ld.w $a1, $s0, 0 ld.w $a2, $s0, 4 
sub.w $a0, $a1, $a0 @@ -31061,7 +31030,7 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc st.w $a0, $s0, 4 b .LBB104_269 .LBB104_300: # %._crit_edge614 - ld.w $a0, $sp, 176 + ld.w $a0, $sp, 160 ld.w $a1, $s0, 0 ld.w $a2, $s0, 4 sub.w $a0, $a1, $a0 @@ -31070,15 +31039,15 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc # %bb.301: st.w $a0, $s0, 4 .LBB104_302: # %_ZN5ArrayIcLi0EED2Ev.exit358 - ld.d $a0, $sp, 184 + ld.d $a0, $sp, 168 pcaddu18i $ra, %call36(free) jirl $ra, $ra, 0 - ld.d $fp, $sp, 32 # 8-byte Folded Reload + ld.d $fp, $sp, 16 # 8-byte Folded Reload .LBB104_303: - ld.d $a0, $sp, 80 # 8-byte Folded Reload + ld.d $a0, $sp, 64 # 8-byte Folded Reload pcaddu18i $ra, %call36(fclose) jirl $ra, $ra, 0 - ld.w $a0, $sp, 200 + ld.w $a0, $sp, 184 ld.w $a1, $s0, 0 ld.w $a2, $s0, 4 sub.w $a0, $a1, $a0 @@ -31087,10 +31056,10 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc # %bb.304: st.w $a0, $s0, 4 .LBB104_305: # %_ZN5ArrayIcLi0EED2Ev.exit379 - ld.d $a0, $sp, 208 + ld.d $a0, $sp, 192 pcaddu18i $ra, %call36(free) jirl $ra, $ra, 0 - ld.w $a0, $sp, 224 + ld.w $a0, $sp, 208 ld.w $a1, $s0, 0 ld.w $a2, $s0, 4 sub.w $a0, $a1, $a0 @@ -31099,7 +31068,7 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc # %bb.306: st.w $a0, $s0, 4 .LBB104_307: # %_ZN5ArrayIcLi0EED2Ev.exit381 - ld.d $a0, $sp, 232 + ld.d $a0, $sp, 216 pcaddu18i $ra, %call36(free) jirl $ra, $ra, 0 ld.w $a0, $s0, 0 @@ -31111,7 +31080,7 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc # %bb.308: st.w $a0, $s0, 4 .LBB104_309: # %_ZN5ArrayIlLi0EED2Ev.exit - ld.d $a0, $sp, 96 # 8-byte Folded Reload + ld.d $a0, $sp, 80 # 8-byte Folded Reload pcaddu18i $ra, %call36(free) jirl $ra, $ra, 0 ld.w $a0, $s0, 0 @@ -31125,7 +31094,7 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc move $a0, $s8 pcaddu18i $ra, %call36(free) jirl $ra, $ra, 0 - ld.d $a0, $sp, 56 # 8-byte Folded Reload + ld.d $a0, $sp, 40 # 8-byte Folded Reload beqz $a0, .LBB104_313 .LBB104_312: pcalau12i $a0, %pc_hi20(.Lstr.4) @@ -31139,35 +31108,35 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc jirl $ra, $ra, 0 .LBB104_313: move $a0, $zero - ld.d $s8, $sp, 248 # 8-byte Folded Reload - ld.d $s7, $sp, 256 # 8-byte Folded Reload - ld.d $s6, $sp, 264 # 8-byte Folded Reload - ld.d $s5, $sp, 272 # 8-byte Folded Reload - ld.d $s4, $sp, 280 # 8-byte Folded Reload - ld.d $s3, $sp, 288 # 8-byte Folded Reload - ld.d $s2, $sp, 296 # 8-byte Folded Reload - ld.d $s1, $sp, 304 # 8-byte Folded Reload - ld.d $s0, $sp, 312 # 8-byte Folded Reload - ld.d $fp, $sp, 320 # 8-byte Folded Reload - ld.d $ra, $sp, 328 # 8-byte Folded Reload - addi.d $sp, $sp, 336 + ld.d $s8, $sp, 232 # 8-byte Folded Reload + ld.d $s7, $sp, 240 # 8-byte Folded Reload + ld.d $s6, $sp, 248 # 8-byte Folded Reload + ld.d $s5, $sp, 256 # 8-byte Folded Reload + ld.d $s4, $sp, 264 # 8-byte Folded Reload + ld.d $s3, $sp, 272 # 8-byte Folded Reload + ld.d $s2, $sp, 280 # 8-byte Folded Reload + ld.d $s1, $sp, 288 # 8-byte Folded Reload + ld.d $s0, $sp, 296 # 8-byte Folded Reload + ld.d $fp, $sp, 304 # 8-byte Folded Reload + ld.d $ra, $sp, 312 # 8-byte Folded Reload + addi.d $sp, $sp, 320 ret .LBB104_314: - ld.d $a0, $sp, 240 + ld.d $a0, $sp, 224 pcaddu18i $ra, %call36(perror) jirl $ra, $ra, 0 - st.d $zero, $sp, 80 # 8-byte Folded Spill + st.d $zero, $sp, 64 # 8-byte Folded Spill .LBB104_315: # %.invoke831 ori $s5, $zero, 1 .Ltmp417: # EH_LABEL ori $s6, $zero, 1 - ld.d $a0, $sp, 80 # 8-byte Folded Reload + ld.d $a0, $sp, 64 # 8-byte Folded Reload pcaddu18i $ra, %call36(_Z4quitPKc) jirl $ra, $ra, 0 .Ltmp418: # EH_LABEL # %bb.316: # %.cont832 .LBB104_317: - ld.d $a0, $sp, 240 + ld.d $a0, $sp, 224 pcaddu18i $ra, %call36(perror) 
jirl $ra, $ra, 0 .Ltmp414: # EH_LABEL @@ -31177,8 +31146,8 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc .Ltmp415: # EH_LABEL # %bb.318: # %.unreachable558 .LBB104_319: - ld.d $a2, $sp, 216 - ld.d $a1, $sp, 240 + ld.d $a2, $sp, 200 + ld.d $a1, $sp, 224 sub.d $a2, $a0, $a2 pcalau12i $a0, %pc_hi20(.L.str.52) addi.d $a0, $a0, %pc_lo12(.L.str.52) @@ -31207,7 +31176,7 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc .Ltmp348: # EH_LABEL # %bb.324: # %.cont .LBB104_325: - st.d $a3, $sp, 56 # 8-byte Folded Spill + st.d $a3, $sp, 40 # 8-byte Folded Spill .LBB104_326: # %.thread535 pcalau12i $a0, %pc_hi20(.L.str.40) addi.d $a0, $a0, %pc_lo12(.L.str.40) @@ -31237,10 +31206,10 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc .Ltmp421: # EH_LABEL # %bb.331: # %.noexc279 .LBB104_332: - ld.d $s8, $sp, 104 # 8-byte Folded Reload + ld.d $s8, $sp, 88 # 8-byte Folded Reload b .LBB104_315 .LBB104_333: - ld.d $a1, $sp, 240 + ld.d $a1, $sp, 224 pcalau12i $a0, %pc_hi20(.L.str.49) addi.d $a0, $a0, %pc_lo12(.L.str.49) pcalau12i $a2, %pc_hi20(.L.str.50) @@ -31275,7 +31244,7 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc .Ltmp362: # EH_LABEL pcalau12i $a0, %pc_hi20(.L.str.59) addi.d $a0, $a0, %pc_lo12(.L.str.59) - st.d $zero, $sp, 96 # 8-byte Folded Spill + st.d $zero, $sp, 80 # 8-byte Folded Spill move $s8, $s7 move $s6, $s5 pcaddu18i $ra, %call36(_Z4quitPKc) @@ -31286,7 +31255,7 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc .Ltmp380: # EH_LABEL pcalau12i $a0, %pc_hi20(.L.str.59) addi.d $a0, $a0, %pc_lo12(.L.str.59) - st.d $s5, $sp, 104 # 8-byte Folded Spill + st.d $s5, $sp, 88 # 8-byte Folded Spill pcaddu18i $ra, %call36(_Z4quitPKc) jirl $ra, $ra, 0 .Ltmp381: # EH_LABEL @@ -31307,8 +31276,8 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc .Ltmp387: # EH_LABEL move $s1, $a1 move $s2, $a0 - st.d $zero, $sp, 96 # 8-byte Folded Spill - ld.d $s5, $sp, 104 # 8-byte Folded Reload + st.d $zero, $sp, 80 # 8-byte Folded Spill + ld.d $s5, $sp, 88 # 8-byte Folded Reload move $fp, $s5 move $s8, $s4 b .LBB104_389 @@ -31317,8 +31286,8 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc move $s1, $a1 move $s2, $a0 move $s8, $s7 - ld.d $fp, $sp, 104 # 8-byte Folded Reload - st.d $s6, $sp, 96 # 8-byte Folded Spill + ld.d $fp, $sp, 88 # 8-byte Folded Reload + st.d $s6, $sp, 80 # 8-byte Folded Spill b .LBB104_389 .LBB104_348: .Ltmp398: # EH_LABEL @@ -31397,7 +31366,7 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc move $s2, $a0 move $s8, $s7 move $fp, $s5 - st.d $s6, $sp, 96 # 8-byte Folded Spill + st.d $s6, $sp, 80 # 8-byte Folded Spill b .LBB104_386 .LBB104_369: .Ltmp358: # EH_LABEL @@ -31406,7 +31375,7 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc b .LBB104_373 .LBB104_370: .Ltmp361: # EH_LABEL - ld.w $a2, $sp, 176 + ld.w $a2, $sp, 160 ld.w $a3, $s0, 0 ld.w $a4, $s0, 4 move $s1, $a1 @@ -31417,20 +31386,20 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc # %bb.371: st.w $a0, $s0, 4 .LBB104_372: # %_ZN5ArrayIcLi0EED2Ev.exit - ld.d $a0, $sp, 184 + ld.d $a0, $sp, 168 pcaddu18i $ra, %call36(free) jirl $ra, $ra, 0 .LBB104_373: ori $fp, $zero, 1 ori $s5, $zero, 1 - ld.d $s8, $sp, 104 # 8-byte Folded Reload + ld.d $s8, $sp, 88 # 8-byte Folded Reload b .LBB104_386 .LBB104_374: # %.loopexit.split-lp .Ltmp407: # EH_LABEL .LBB104_375: move $s1, $a1 move $s2, $a0 - ld.w $a0, $sp, 112 + ld.w $a0, $sp, 96 ld.w $a1, $s0, 0 ld.w $a2, $s0, 4 sub.w $a0, $a1, $a0 @@ -31439,11 +31408,11 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc # %bb.376: st.w $a0, $s0, 4 .LBB104_377: # %_ZN5ArrayIcLi0EED2Ev.exit375 - ld.d $a0, $sp, 120 + ld.d $a0, $sp, 104 pcaddu18i $ra, %call36(free) jirl $ra, $ra, 0 .LBB104_378: - ld.w $a0, $sp, 176 + ld.w $a0, $sp, 160 ld.w $a1, $s0, 0 ld.w 
$a2, $s0, 4 sub.w $a0, $a1, $a0 @@ -31452,12 +31421,12 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc # %bb.379: st.w $a0, $s0, 4 .LBB104_380: # %_ZN5ArrayIcLi0EED2Ev.exit377 - ld.d $a0, $sp, 184 + ld.d $a0, $sp, 168 pcaddu18i $ra, %call36(free) jirl $ra, $ra, 0 move $s8, $s7 .LBB104_381: - ld.d $fp, $sp, 32 # 8-byte Folded Reload + ld.d $fp, $sp, 16 # 8-byte Folded Reload move $s5, $fp b .LBB104_389 .LBB104_382: @@ -31474,7 +31443,7 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc move $s2, $a0 move $s8, $s7 move $fp, $s5 - st.d $s6, $sp, 96 # 8-byte Folded Spill + st.d $s6, $sp, 80 # 8-byte Folded Spill b .LBB104_389 .LBB104_385: .Ltmp419: # EH_LABEL @@ -31483,7 +31452,7 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc move $fp, $s5 move $s5, $s6 .LBB104_386: - ld.w $a0, $sp, 136 + ld.w $a0, $sp, 120 ld.w $a1, $s0, 0 ld.w $a2, $s0, 4 sub.w $a0, $a1, $a0 @@ -31492,11 +31461,11 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc # %bb.387: st.w $a0, $s0, 4 .LBB104_388: # %_ZN5ArrayIcLi0EED2Ev.exit312 - ld.d $a0, $sp, 144 + ld.d $a0, $sp, 128 pcaddu18i $ra, %call36(free) jirl $ra, $ra, 0 .LBB104_389: - ld.w $a0, $sp, 200 + ld.w $a0, $sp, 184 ld.w $a1, $s0, 0 ld.w $a2, $s0, 4 sub.w $a0, $a1, $a0 @@ -31505,13 +31474,13 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc # %bb.390: st.w $a0, $s0, 4 .LBB104_391: # %_ZN5ArrayIcLi0EED2Ev.exit385 - ld.d $a0, $sp, 208 + ld.d $a0, $sp, 192 pcaddu18i $ra, %call36(free) jirl $ra, $ra, 0 slli.d $s3, $s5, 3 slli.d $fp, $fp, 3 .LBB104_392: - ld.w $a0, $sp, 224 + ld.w $a0, $sp, 208 ld.w $a1, $s0, 0 ld.w $a2, $s0, 4 sub.w $a0, $a1, $a0 @@ -31520,7 +31489,7 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc # %bb.393: st.w $a0, $s0, 4 .LBB104_394: # %_ZN5ArrayIcLi0EED2Ev.exit387 - ld.d $a0, $sp, 232 + ld.d $a0, $sp, 216 pcaddu18i $ra, %call36(free) jirl $ra, $ra, 0 .LBB104_395: @@ -31532,7 +31501,7 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc # %bb.396: st.w $a0, $s0, 4 .LBB104_397: # %_ZN5ArrayIlLi0EED2Ev.exit389 - ld.d $a0, $sp, 96 # 8-byte Folded Reload + ld.d $a0, $sp, 80 # 8-byte Folded Reload pcaddu18i $ra, %call36(free) jirl $ra, $ra, 0 .LBB104_398: @@ -31548,7 +31517,7 @@ _Z7paqmainiPPc: # @_Z7paqmainiPPc pcaddu18i $ra, %call36(free) jirl $ra, $ra, 0 .LBB104_401: - ld.d $fp, $sp, 56 # 8-byte Folded Reload + ld.d $fp, $sp, 40 # 8-byte Folded Reload addi.w $a0, $s1, 0 ori $a1, $zero, 1 bne $a0, $a1, .LBB104_405 diff --git a/results/MultiSource/Benchmarks/Prolangs-C/TimberWolfMC/CMakeFiles/timberwolfmc.dir/density.s b/results/MultiSource/Benchmarks/Prolangs-C/TimberWolfMC/CMakeFiles/timberwolfmc.dir/density.s index 9cc17d78..5244ea2a 100644 --- a/results/MultiSource/Benchmarks/Prolangs-C/TimberWolfMC/CMakeFiles/timberwolfmc.dir/density.s +++ b/results/MultiSource/Benchmarks/Prolangs-C/TimberWolfMC/CMakeFiles/timberwolfmc.dir/density.s @@ -662,18 +662,12 @@ density: # @density # => This Inner Loop Header: Depth=3 vld $vr3, $s2, -16 vld $vr4, $s2, 0 - vshuf4i.w $vr5, $vr3, 50 - vslli.d $vr5, $vr5, 32 - vsrai.d $vr5, $vr5, 32 - vshuf4i.w $vr3, $vr3, 16 - vslli.d $vr3, $vr3, 32 - vsrai.d $vr3, $vr3, 32 - vshuf4i.w $vr6, $vr4, 50 - vslli.d $vr6, $vr6, 32 - vsrai.d $vr6, $vr6, 32 - vshuf4i.w $vr4, $vr4, 16 - vslli.d $vr4, $vr4, 32 - vsrai.d $vr4, $vr4, 32 + vshuf4i.w $vr5, $vr3, 14 + vsllwil.d.w $vr5, $vr5, 0 + vsllwil.d.w $vr3, $vr3, 0 + vshuf4i.w $vr6, $vr4, 14 + vsllwil.d.w $vr6, $vr6, 0 + vsllwil.d.w $vr4, $vr4, 0 vpickve2gr.d $s4, $vr3, 0 mul.d $s4, $s4, $t0 add.d $s4, $t5, $s4 @@ -902,18 +896,12 @@ density: # @density # => This Inner Loop Header: Depth=3 vld $vr3, $s0, -16 vld $vr4, $s0, 0 - vshuf4i.w $vr5, $vr3, 50 - vslli.d $vr5, 
$vr5, 32 - vsrai.d $vr5, $vr5, 32 - vshuf4i.w $vr3, $vr3, 16 - vslli.d $vr3, $vr3, 32 - vsrai.d $vr3, $vr3, 32 - vshuf4i.w $vr6, $vr4, 50 - vslli.d $vr6, $vr6, 32 - vsrai.d $vr6, $vr6, 32 - vshuf4i.w $vr4, $vr4, 16 - vslli.d $vr4, $vr4, 32 - vsrai.d $vr4, $vr4, 32 + vshuf4i.w $vr5, $vr3, 14 + vsllwil.d.w $vr5, $vr5, 0 + vsllwil.d.w $vr3, $vr3, 0 + vshuf4i.w $vr6, $vr4, 14 + vsllwil.d.w $vr6, $vr6, 0 + vsllwil.d.w $vr4, $vr4, 0 vpickve2gr.d $s2, $vr3, 0 mul.d $s2, $s2, $a6 add.d $s2, $t3, $s2 diff --git a/results/MultiSource/Benchmarks/Prolangs-C/TimberWolfMC/CMakeFiles/timberwolfmc.dir/hash.s b/results/MultiSource/Benchmarks/Prolangs-C/TimberWolfMC/CMakeFiles/timberwolfmc.dir/hash.s index 825fa9e8..9d532411 100644 --- a/results/MultiSource/Benchmarks/Prolangs-C/TimberWolfMC/CMakeFiles/timberwolfmc.dir/hash.s +++ b/results/MultiSource/Benchmarks/Prolangs-C/TimberWolfMC/CMakeFiles/timberwolfmc.dir/hash.s @@ -122,14 +122,10 @@ addhash: # @addhash ld.w $a5, $a2, 0 vinsgr2vr.w $vr2, $a4, 0 vinsgr2vr.w $vr3, $a5, 0 - vilvl.b $vr2, $vr2, $vr2 - vilvl.h $vr2, $vr2, $vr2 - vslli.w $vr2, $vr2, 24 - vsrai.w $vr2, $vr2, 24 - vilvl.b $vr3, $vr3, $vr3 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 24 - vsrai.w $vr3, $vr3, 24 + vsllwil.h.b $vr2, $vr2, 0 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.h.b $vr3, $vr3, 0 + vsllwil.w.h $vr3, $vr3, 0 vadd.w $vr0, $vr0, $vr2 vadd.w $vr1, $vr1, $vr3 addi.d $a3, $a3, -8 @@ -267,14 +263,10 @@ hashfind: # @hashfind ld.w $a5, $a2, 0 vinsgr2vr.w $vr2, $a4, 0 vinsgr2vr.w $vr3, $a5, 0 - vilvl.b $vr2, $vr2, $vr2 - vilvl.h $vr2, $vr2, $vr2 - vslli.w $vr2, $vr2, 24 - vsrai.w $vr2, $vr2, 24 - vilvl.b $vr3, $vr3, $vr3 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 24 - vsrai.w $vr3, $vr3, 24 + vsllwil.h.b $vr2, $vr2, 0 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.h.b $vr3, $vr3, 0 + vsllwil.w.h $vr3, $vr3, 0 vadd.w $vr0, $vr0, $vr2 vadd.w $vr1, $vr1, $vr3 addi.d $a3, $a3, -8 diff --git a/results/MultiSource/Benchmarks/Prolangs-C/TimberWolfMC/CMakeFiles/timberwolfmc.dir/main.s b/results/MultiSource/Benchmarks/Prolangs-C/TimberWolfMC/CMakeFiles/timberwolfmc.dir/main.s index 15c54bf7..aae65bf7 100644 --- a/results/MultiSource/Benchmarks/Prolangs-C/TimberWolfMC/CMakeFiles/timberwolfmc.dir/main.s +++ b/results/MultiSource/Benchmarks/Prolangs-C/TimberWolfMC/CMakeFiles/timberwolfmc.dir/main.s @@ -1578,24 +1578,16 @@ prepSpots: # @prepSpots ld.w $t4, $t1, 0 vinsgr2vr.w $vr9, $t3, 0 vinsgr2vr.w $vr10, $t4, 0 - vilvl.b $vr11, $vr9, $vr9 - vilvl.h $vr11, $vr11, $vr11 - vslli.w $vr11, $vr11, 24 - vsrai.w $vr11, $vr11, 24 - vilvl.b $vr12, $vr10, $vr10 - vilvl.h $vr12, $vr12, $vr12 - vslli.w $vr12, $vr12, 24 - vsrai.w $vr12, $vr12, 24 + vsllwil.h.b $vr11, $vr9, 0 + vsllwil.w.h $vr11, $vr11, 0 + vsllwil.h.b $vr12, $vr10, 0 + vsllwil.w.h $vr12, $vr12, 0 vslt.b $vr9, $vr4, $vr9 - vilvl.b $vr9, $vr9, $vr9 - vilvl.h $vr9, $vr9, $vr9 - vslli.w $vr9, $vr9, 24 - vsrai.w $vr9, $vr9, 24 + vsllwil.h.b $vr9, $vr9, 0 + vsllwil.w.h $vr9, $vr9, 0 vslt.b $vr10, $vr4, $vr10 - vilvl.b $vr10, $vr10, $vr10 - vilvl.h $vr10, $vr10, $vr10 - vslli.w $vr10, $vr10, 24 - vsrai.w $vr10, $vr10, 24 + vsllwil.h.b $vr10, $vr10, 0 + vsllwil.w.h $vr10, $vr10, 0 vadd.w $vr13, $vr11, $vr5 vadd.w $vr14, $vr12, $vr5 vsub.w $vr11, $vr6, $vr11 diff --git a/results/MultiSource/Benchmarks/Prolangs-C/TimberWolfMC/CMakeFiles/timberwolfmc.dir/mshortest.s b/results/MultiSource/Benchmarks/Prolangs-C/TimberWolfMC/CMakeFiles/timberwolfmc.dir/mshortest.s index 19dd3d08..5171c3ba 100644 --- 
a/results/MultiSource/Benchmarks/Prolangs-C/TimberWolfMC/CMakeFiles/timberwolfmc.dir/mshortest.s +++ b/results/MultiSource/Benchmarks/Prolangs-C/TimberWolfMC/CMakeFiles/timberwolfmc.dir/mshortest.s @@ -2029,12 +2029,8 @@ mshortest: # @mshortest ld.d $t2, $a6, 0 vinsgr2vr.d $vr0, $t1, 0 vinsgr2vr.d $vr1, $t2, 0 - vilvl.h $vr0, $vr0, $vr0 - vslli.w $vr0, $vr0, 16 - vsrai.w $vr0, $vr0, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vsllwil.w.h $vr0, $vr0, 0 + vsllwil.w.h $vr1, $vr1, 0 vst $vr0, $a7, -16 vst $vr1, $a7, 0 addi.d $a6, $a6, 16 diff --git a/results/MultiSource/Benchmarks/Prolangs-C/TimberWolfMC/CMakeFiles/timberwolfmc.dir/procesnet.s b/results/MultiSource/Benchmarks/Prolangs-C/TimberWolfMC/CMakeFiles/timberwolfmc.dir/procesnet.s index 3201ab85..ed8fdd2d 100644 --- a/results/MultiSource/Benchmarks/Prolangs-C/TimberWolfMC/CMakeFiles/timberwolfmc.dir/procesnet.s +++ b/results/MultiSource/Benchmarks/Prolangs-C/TimberWolfMC/CMakeFiles/timberwolfmc.dir/procesnet.s @@ -207,12 +207,8 @@ procesnet: # @procesnet ld.d $t1, $a6, 0 vinsgr2vr.d $vr3, $t0, 0 vinsgr2vr.d $vr4, $t1, 0 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvl.h $vr4, $vr4, $vr4 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 + vsllwil.w.h $vr3, $vr3, 0 + vsllwil.w.h $vr4, $vr4, 0 vslt.w $vr3, $vr0, $vr3 vslt.w $vr4, $vr0, $vr4 vsub.w $vr1, $vr1, $vr3 diff --git a/results/MultiSource/Benchmarks/Prolangs-C/TimberWolfMC/CMakeFiles/timberwolfmc.dir/reduceg.s b/results/MultiSource/Benchmarks/Prolangs-C/TimberWolfMC/CMakeFiles/timberwolfmc.dir/reduceg.s index f275329b..177d9038 100644 --- a/results/MultiSource/Benchmarks/Prolangs-C/TimberWolfMC/CMakeFiles/timberwolfmc.dir/reduceg.s +++ b/results/MultiSource/Benchmarks/Prolangs-C/TimberWolfMC/CMakeFiles/timberwolfmc.dir/reduceg.s @@ -210,18 +210,12 @@ reduceg: # @reduceg # => This Inner Loop Header: Depth=3 vld $vr2, $a4, -16 vld $vr3, $a4, 0 - vshuf4i.w $vr4, $vr2, 50 - vslli.d $vr4, $vr4, 32 - vsrai.d $vr4, $vr4, 32 - vshuf4i.w $vr2, $vr2, 16 - vslli.d $vr2, $vr2, 32 - vsrai.d $vr2, $vr2, 32 - vshuf4i.w $vr5, $vr3, 50 - vslli.d $vr5, $vr5, 32 - vsrai.d $vr5, $vr5, 32 - vshuf4i.w $vr3, $vr3, 16 - vslli.d $vr3, $vr3, 32 - vsrai.d $vr3, $vr3, 32 + vshuf4i.w $vr4, $vr2, 14 + vsllwil.d.w $vr4, $vr4, 0 + vsllwil.d.w $vr2, $vr2, 0 + vshuf4i.w $vr5, $vr3, 14 + vsllwil.d.w $vr5, $vr5, 0 + vsllwil.d.w $vr3, $vr3, 0 vpickve2gr.d $a6, $vr2, 0 mul.d $a6, $a6, $s4 add.d $a6, $a0, $a6 @@ -444,18 +438,12 @@ reduceg: # @reduceg # => This Inner Loop Header: Depth=3 vld $vr2, $a4, -16 vld $vr3, $a4, 0 - vshuf4i.w $vr4, $vr2, 50 - vslli.d $vr4, $vr4, 32 - vsrai.d $vr4, $vr4, 32 - vshuf4i.w $vr2, $vr2, 16 - vslli.d $vr2, $vr2, 32 - vsrai.d $vr2, $vr2, 32 - vshuf4i.w $vr5, $vr3, 50 - vslli.d $vr5, $vr5, 32 - vsrai.d $vr5, $vr5, 32 - vshuf4i.w $vr3, $vr3, 16 - vslli.d $vr3, $vr3, 32 - vsrai.d $vr3, $vr3, 32 + vshuf4i.w $vr4, $vr2, 14 + vsllwil.d.w $vr4, $vr4, 0 + vsllwil.d.w $vr2, $vr2, 0 + vshuf4i.w $vr5, $vr3, 14 + vsllwil.d.w $vr5, $vr5, 0 + vsllwil.d.w $vr3, $vr3, 0 vpickve2gr.d $a6, $vr2, 0 mul.d $a6, $a6, $s4 add.d $a6, $a0, $a6 diff --git a/results/MultiSource/Benchmarks/Prolangs-C/agrep/CMakeFiles/agrep.dir/main.s b/results/MultiSource/Benchmarks/Prolangs-C/agrep/CMakeFiles/agrep.dir/main.s index 3291df2f..f5919817 100644 --- a/results/MultiSource/Benchmarks/Prolangs-C/agrep/CMakeFiles/agrep.dir/main.s +++ b/results/MultiSource/Benchmarks/Prolangs-C/agrep/CMakeFiles/agrep.dir/main.s @@ -2041,15 +2041,11 @@ checksg: # 
@checksg vxori.b $vr4, $vr4, 255 vandn.v $vr4, $vr6, $vr4 vandn.v $vr3, $vr7, $vr3 - vilvl.b $vr3, $vr3, $vr3 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 24 - vsrai.w $vr3, $vr3, 24 + vsllwil.h.b $vr3, $vr3, 0 + vsllwil.w.h $vr3, $vr3, 0 vandn.v $vr4, $vr8, $vr4 - vilvl.b $vr4, $vr4, $vr4 - vilvl.h $vr4, $vr4, $vr4 - vslli.w $vr4, $vr4, 24 - vsrai.w $vr4, $vr4, 24 + vsllwil.h.b $vr4, $vr4, 0 + vsllwil.w.h $vr4, $vr4, 0 vpickve2gr.w $a5, $vr4, 0 vinsgr2vr.h $vr5, $a5, 0 vpickve2gr.w $a5, $vr4, 1 diff --git a/results/MultiSource/Benchmarks/Prolangs-C/agrep/CMakeFiles/agrep.dir/sgrep.s b/results/MultiSource/Benchmarks/Prolangs-C/agrep/CMakeFiles/agrep.dir/sgrep.s index 4a599dce..bcc70036 100644 --- a/results/MultiSource/Benchmarks/Prolangs-C/agrep/CMakeFiles/agrep.dir/sgrep.s +++ b/results/MultiSource/Benchmarks/Prolangs-C/agrep/CMakeFiles/agrep.dir/sgrep.s @@ -1666,14 +1666,12 @@ initmask: # @initmask vseq.b $vr3, $vr2, $vr0 vseq.b $vr2, $vr2, $vr1 vor.v $vr2, $vr3, $vr2 - vilvl.b $vr2, $vr2, $vr2 - vslli.h $vr2, $vr2, 8 - vsrai.h $vr2, $vr2, 8 vpickve2gr.h $a4, $vr2, 0 andi $a4, $a4, 1 bnez $a4, .LBB4_92 # %bb.85: # %pred.store.continue154 # in Loop: Header=BB4_84 Depth=1 + vsllwil.h.b $vr2, $vr2, 0 vpickve2gr.h $a4, $vr2, 1 andi $a4, $a4, 1 bnez $a4, .LBB4_93 @@ -1712,6 +1710,7 @@ initmask: # @initmask .LBB4_92: # %pred.store.if153 # in Loop: Header=BB4_84 Depth=1 st.b $a3, $a2, -3 + vsllwil.h.b $vr2, $vr2, 0 vpickve2gr.h $a4, $vr2, 1 andi $a4, $a4, 1 beqz $a4, .LBB4_86 @@ -3245,7 +3244,6 @@ m_preprocess: # @m_preprocess blt $a2, $a3, .LBB11_45 # %bb.1: # %.lr.ph vreplgr2vr.w $vr0, $s1 - vrepli.b $vr1, 0 ori $a2, $zero, 256 ori $a3, $zero, 1 move $a5, $a1 @@ -3272,11 +3270,11 @@ m_preprocess: # @m_preprocess .LBB11_5: # %vector.body # Parent Loop BB11_3 Depth=1 # => This Inner Loop Header: Depth=2 - vldx $vr2, $t0, $a7 - vilvl.b $vr3, $vr1, $vr2 - vilvl.h $vr4, $vr1, $vr3 - vseq.w $vr4, $vr0, $vr4 - vpickve2gr.b $t1, $vr4, 0 + vldx $vr1, $t0, $a7 + vsllwil.hu.bu $vr2, $vr1, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vseq.w $vr2, $vr0, $vr2 + vpickve2gr.b $t1, $vr2, 0 andi $t1, $t1, 1 beqz $t1, .LBB11_7 # %bb.6: # %pred.store.if @@ -3284,21 +3282,21 @@ m_preprocess: # @m_preprocess stx.b $a1, $t0, $a7 .LBB11_7: # %pred.store.continue # in Loop: Header=BB11_5 Depth=2 - vreplvei.h $vr5, $vr4, 2 - vpickve2gr.b $t1, $vr5, 2 + vreplvei.h $vr3, $vr2, 2 + vpickve2gr.b $t1, $vr3, 2 andi $t2, $t1, 1 add.d $t1, $t0, $a7 bnez $t2, .LBB11_25 # %bb.8: # %pred.store.continue52 # in Loop: Header=BB11_5 Depth=2 - vreplvei.h $vr5, $vr4, 4 - vpickve2gr.b $t2, $vr5, 4 + vreplvei.h $vr3, $vr2, 4 + vpickve2gr.b $t2, $vr3, 4 andi $t2, $t2, 1 bnez $t2, .LBB11_26 .LBB11_9: # %pred.store.continue54 # in Loop: Header=BB11_5 Depth=2 - vreplvei.h $vr4, $vr4, 6 - vpickve2gr.b $t2, $vr4, 6 + vreplvei.h $vr2, $vr2, 6 + vpickve2gr.b $t2, $vr2, 6 andi $t2, $t2, 1 beqz $t2, .LBB11_11 .LBB11_10: # %pred.store.if55 @@ -3306,28 +3304,30 @@ m_preprocess: # @m_preprocess st.b $a1, $t1, 3 .LBB11_11: # %pred.store.continue56 # in Loop: Header=BB11_5 Depth=2 - vilvh.h $vr3, $vr1, $vr3 - vseq.w $vr3, $vr0, $vr3 - vreplvei.h $vr4, $vr3, 0 - vpickve2gr.b $t2, $vr4, 8 + vsrli.d $vr2, $vr1, 32 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vseq.w $vr2, $vr0, $vr2 + vreplvei.h $vr3, $vr2, 0 + vpickve2gr.b $t2, $vr3, 8 andi $t2, $t2, 1 bnez $t2, .LBB11_27 # %bb.12: # %pred.store.continue58 # in Loop: Header=BB11_5 Depth=2 - vreplvei.h $vr4, $vr3, 2 - vpickve2gr.b $t2, $vr4, 10 + vreplvei.h $vr3, $vr2, 2 + vpickve2gr.b $t2, 
$vr3, 10 andi $t2, $t2, 1 bnez $t2, .LBB11_28 .LBB11_13: # %pred.store.continue60 # in Loop: Header=BB11_5 Depth=2 - vreplvei.h $vr4, $vr3, 4 - vpickve2gr.b $t2, $vr4, 12 + vreplvei.h $vr3, $vr2, 4 + vpickve2gr.b $t2, $vr3, 12 andi $t2, $t2, 1 bnez $t2, .LBB11_29 .LBB11_14: # %pred.store.continue62 # in Loop: Header=BB11_5 Depth=2 - vreplvei.h $vr3, $vr3, 6 - vpickve2gr.b $t2, $vr3, 14 + vreplvei.h $vr2, $vr2, 6 + vpickve2gr.b $t2, $vr2, 14 andi $t2, $t2, 1 beqz $t2, .LBB11_16 .LBB11_15: # %pred.store.if63 @@ -3335,28 +3335,29 @@ m_preprocess: # @m_preprocess st.b $a1, $t1, 7 .LBB11_16: # %pred.store.continue64 # in Loop: Header=BB11_5 Depth=2 - vilvh.b $vr2, $vr1, $vr2 - vilvl.h $vr3, $vr1, $vr2 - vseq.w $vr3, $vr0, $vr3 - vpickve2gr.b $t2, $vr3, 0 + vbsrl.v $vr2, $vr1, 8 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vseq.w $vr2, $vr0, $vr2 + vpickve2gr.b $t2, $vr2, 0 andi $t2, $t2, 1 bnez $t2, .LBB11_30 # %bb.17: # %pred.store.continue66 # in Loop: Header=BB11_5 Depth=2 - vreplvei.h $vr4, $vr3, 2 - vpickve2gr.b $t2, $vr4, 2 + vreplvei.h $vr3, $vr2, 2 + vpickve2gr.b $t2, $vr3, 2 andi $t2, $t2, 1 bnez $t2, .LBB11_31 .LBB11_18: # %pred.store.continue68 # in Loop: Header=BB11_5 Depth=2 - vreplvei.h $vr4, $vr3, 4 - vpickve2gr.b $t2, $vr4, 4 + vreplvei.h $vr3, $vr2, 4 + vpickve2gr.b $t2, $vr3, 4 andi $t2, $t2, 1 bnez $t2, .LBB11_32 .LBB11_19: # %pred.store.continue70 # in Loop: Header=BB11_5 Depth=2 - vreplvei.h $vr3, $vr3, 6 - vpickve2gr.b $t2, $vr3, 6 + vreplvei.h $vr2, $vr2, 6 + vpickve2gr.b $t2, $vr2, 6 andi $t2, $t2, 1 beqz $t2, .LBB11_21 .LBB11_20: # %pred.store.if71 @@ -3364,28 +3365,30 @@ m_preprocess: # @m_preprocess st.b $a1, $t1, 11 .LBB11_21: # %pred.store.continue72 # in Loop: Header=BB11_5 Depth=2 - vilvh.h $vr2, $vr1, $vr2 - vseq.w $vr2, $vr0, $vr2 - vreplvei.h $vr3, $vr2, 0 - vpickve2gr.b $t2, $vr3, 8 + vbsrl.v $vr1, $vr1, 12 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vseq.w $vr1, $vr0, $vr1 + vreplvei.h $vr2, $vr1, 0 + vpickve2gr.b $t2, $vr2, 8 andi $t2, $t2, 1 bnez $t2, .LBB11_33 # %bb.22: # %pred.store.continue74 # in Loop: Header=BB11_5 Depth=2 - vreplvei.h $vr3, $vr2, 2 - vpickve2gr.b $t2, $vr3, 10 + vreplvei.h $vr2, $vr1, 2 + vpickve2gr.b $t2, $vr2, 10 andi $t2, $t2, 1 bnez $t2, .LBB11_34 .LBB11_23: # %pred.store.continue76 # in Loop: Header=BB11_5 Depth=2 - vreplvei.h $vr3, $vr2, 4 - vpickve2gr.b $t2, $vr3, 12 + vreplvei.h $vr2, $vr1, 4 + vpickve2gr.b $t2, $vr2, 12 andi $t2, $t2, 1 bnez $t2, .LBB11_35 .LBB11_24: # %pred.store.continue78 # in Loop: Header=BB11_5 Depth=2 - vreplvei.h $vr2, $vr2, 6 - vpickve2gr.b $t2, $vr2, 14 + vreplvei.h $vr1, $vr1, 6 + vpickve2gr.b $t2, $vr1, 14 andi $t2, $t2, 1 beqz $t2, .LBB11_4 b .LBB11_36 @@ -3393,15 +3396,15 @@ m_preprocess: # @m_preprocess .LBB11_25: # %pred.store.if51 # in Loop: Header=BB11_5 Depth=2 st.b $a1, $t1, 1 - vreplvei.h $vr5, $vr4, 4 - vpickve2gr.b $t2, $vr5, 4 + vreplvei.h $vr3, $vr2, 4 + vpickve2gr.b $t2, $vr3, 4 andi $t2, $t2, 1 beqz $t2, .LBB11_9 .LBB11_26: # %pred.store.if53 # in Loop: Header=BB11_5 Depth=2 st.b $a1, $t1, 2 - vreplvei.h $vr4, $vr4, 6 - vpickve2gr.b $t2, $vr4, 6 + vreplvei.h $vr2, $vr2, 6 + vpickve2gr.b $t2, $vr2, 6 andi $t2, $t2, 1 bnez $t2, .LBB11_10 b .LBB11_11 @@ -3409,22 +3412,22 @@ m_preprocess: # @m_preprocess .LBB11_27: # %pred.store.if57 # in Loop: Header=BB11_5 Depth=2 st.b $a1, $t1, 4 - vreplvei.h $vr4, $vr3, 2 - vpickve2gr.b $t2, $vr4, 10 + vreplvei.h $vr3, $vr2, 2 + vpickve2gr.b $t2, $vr3, 10 andi $t2, $t2, 1 beqz $t2, .LBB11_13 .LBB11_28: # 
%pred.store.if59 # in Loop: Header=BB11_5 Depth=2 st.b $a1, $t1, 5 - vreplvei.h $vr4, $vr3, 4 - vpickve2gr.b $t2, $vr4, 12 + vreplvei.h $vr3, $vr2, 4 + vpickve2gr.b $t2, $vr3, 12 andi $t2, $t2, 1 beqz $t2, .LBB11_14 .LBB11_29: # %pred.store.if61 # in Loop: Header=BB11_5 Depth=2 st.b $a1, $t1, 6 - vreplvei.h $vr3, $vr3, 6 - vpickve2gr.b $t2, $vr3, 14 + vreplvei.h $vr2, $vr2, 6 + vpickve2gr.b $t2, $vr2, 14 andi $t2, $t2, 1 bnez $t2, .LBB11_15 b .LBB11_16 @@ -3432,22 +3435,22 @@ m_preprocess: # @m_preprocess .LBB11_30: # %pred.store.if65 # in Loop: Header=BB11_5 Depth=2 st.b $a1, $t1, 8 - vreplvei.h $vr4, $vr3, 2 - vpickve2gr.b $t2, $vr4, 2 + vreplvei.h $vr3, $vr2, 2 + vpickve2gr.b $t2, $vr3, 2 andi $t2, $t2, 1 beqz $t2, .LBB11_18 .LBB11_31: # %pred.store.if67 # in Loop: Header=BB11_5 Depth=2 st.b $a1, $t1, 9 - vreplvei.h $vr4, $vr3, 4 - vpickve2gr.b $t2, $vr4, 4 + vreplvei.h $vr3, $vr2, 4 + vpickve2gr.b $t2, $vr3, 4 andi $t2, $t2, 1 beqz $t2, .LBB11_19 .LBB11_32: # %pred.store.if69 # in Loop: Header=BB11_5 Depth=2 st.b $a1, $t1, 10 - vreplvei.h $vr3, $vr3, 6 - vpickve2gr.b $t2, $vr3, 6 + vreplvei.h $vr2, $vr2, 6 + vpickve2gr.b $t2, $vr2, 6 andi $t2, $t2, 1 bnez $t2, .LBB11_20 b .LBB11_21 @@ -3455,22 +3458,22 @@ m_preprocess: # @m_preprocess .LBB11_33: # %pred.store.if73 # in Loop: Header=BB11_5 Depth=2 st.b $a1, $t1, 12 - vreplvei.h $vr3, $vr2, 2 - vpickve2gr.b $t2, $vr3, 10 + vreplvei.h $vr2, $vr1, 2 + vpickve2gr.b $t2, $vr2, 10 andi $t2, $t2, 1 beqz $t2, .LBB11_23 .LBB11_34: # %pred.store.if75 # in Loop: Header=BB11_5 Depth=2 st.b $a1, $t1, 13 - vreplvei.h $vr3, $vr2, 4 - vpickve2gr.b $t2, $vr3, 12 + vreplvei.h $vr2, $vr1, 4 + vpickve2gr.b $t2, $vr2, 12 andi $t2, $t2, 1 beqz $t2, .LBB11_24 .LBB11_35: # %pred.store.if77 # in Loop: Header=BB11_5 Depth=2 st.b $a1, $t1, 14 - vreplvei.h $vr2, $vr2, 6 - vpickve2gr.b $t2, $vr2, 14 + vreplvei.h $vr1, $vr1, 6 + vpickve2gr.b $t2, $vr1, 14 andi $t2, $t2, 1 beqz $t2, .LBB11_4 .LBB11_36: # %pred.store.if79 @@ -4027,7 +4030,6 @@ sgrep: # @sgrep blt $a1, $a2, .LBB15_75 # %bb.12: # %.lr.ph.i vreplgr2vr.w $vr0, $s4 - vrepli.b $vr1, 0 ori $a1, $zero, 256 ori $a2, $zero, 1 move $a4, $a0 @@ -4054,11 +4056,11 @@ sgrep: # @sgrep .LBB15_16: # %vector.body174 # Parent Loop BB15_14 Depth=1 # => This Inner Loop Header: Depth=2 - vldx $vr2, $a7, $a6 - vilvl.b $vr3, $vr1, $vr2 - vilvl.h $vr4, $vr1, $vr3 - vseq.w $vr4, $vr0, $vr4 - vpickve2gr.b $t0, $vr4, 0 + vldx $vr1, $a7, $a6 + vsllwil.hu.bu $vr2, $vr1, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vseq.w $vr2, $vr0, $vr2 + vpickve2gr.b $t0, $vr2, 0 andi $t0, $t0, 1 beqz $t0, .LBB15_18 # %bb.17: # %pred.store.if177 @@ -4066,21 +4068,21 @@ sgrep: # @sgrep stx.b $a0, $a7, $a6 .LBB15_18: # %pred.store.continue178 # in Loop: Header=BB15_16 Depth=2 - vreplvei.h $vr5, $vr4, 2 - vpickve2gr.b $t0, $vr5, 2 + vreplvei.h $vr3, $vr2, 2 + vpickve2gr.b $t0, $vr3, 2 andi $t1, $t0, 1 add.d $t0, $a7, $a6 bnez $t1, .LBB15_36 # %bb.19: # %pred.store.continue180 # in Loop: Header=BB15_16 Depth=2 - vreplvei.h $vr5, $vr4, 4 - vpickve2gr.b $t1, $vr5, 4 + vreplvei.h $vr3, $vr2, 4 + vpickve2gr.b $t1, $vr3, 4 andi $t1, $t1, 1 bnez $t1, .LBB15_37 .LBB15_20: # %pred.store.continue182 # in Loop: Header=BB15_16 Depth=2 - vreplvei.h $vr4, $vr4, 6 - vpickve2gr.b $t1, $vr4, 6 + vreplvei.h $vr2, $vr2, 6 + vpickve2gr.b $t1, $vr2, 6 andi $t1, $t1, 1 beqz $t1, .LBB15_22 .LBB15_21: # %pred.store.if183 @@ -4088,28 +4090,30 @@ sgrep: # @sgrep st.b $a0, $t0, 3 .LBB15_22: # %pred.store.continue184 # in Loop: Header=BB15_16 Depth=2 - vilvh.h $vr3, $vr1, $vr3 - vseq.w 
$vr3, $vr0, $vr3 - vreplvei.h $vr4, $vr3, 0 - vpickve2gr.b $t1, $vr4, 8 + vsrli.d $vr2, $vr1, 32 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vseq.w $vr2, $vr0, $vr2 + vreplvei.h $vr3, $vr2, 0 + vpickve2gr.b $t1, $vr3, 8 andi $t1, $t1, 1 bnez $t1, .LBB15_38 # %bb.23: # %pred.store.continue186 # in Loop: Header=BB15_16 Depth=2 - vreplvei.h $vr4, $vr3, 2 - vpickve2gr.b $t1, $vr4, 10 + vreplvei.h $vr3, $vr2, 2 + vpickve2gr.b $t1, $vr3, 10 andi $t1, $t1, 1 bnez $t1, .LBB15_39 .LBB15_24: # %pred.store.continue188 # in Loop: Header=BB15_16 Depth=2 - vreplvei.h $vr4, $vr3, 4 - vpickve2gr.b $t1, $vr4, 12 + vreplvei.h $vr3, $vr2, 4 + vpickve2gr.b $t1, $vr3, 12 andi $t1, $t1, 1 bnez $t1, .LBB15_40 .LBB15_25: # %pred.store.continue190 # in Loop: Header=BB15_16 Depth=2 - vreplvei.h $vr3, $vr3, 6 - vpickve2gr.b $t1, $vr3, 14 + vreplvei.h $vr2, $vr2, 6 + vpickve2gr.b $t1, $vr2, 14 andi $t1, $t1, 1 beqz $t1, .LBB15_27 .LBB15_26: # %pred.store.if191 @@ -4117,28 +4121,29 @@ sgrep: # @sgrep st.b $a0, $t0, 7 .LBB15_27: # %pred.store.continue192 # in Loop: Header=BB15_16 Depth=2 - vilvh.b $vr2, $vr1, $vr2 - vilvl.h $vr3, $vr1, $vr2 - vseq.w $vr3, $vr0, $vr3 - vpickve2gr.b $t1, $vr3, 0 + vbsrl.v $vr2, $vr1, 8 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vseq.w $vr2, $vr0, $vr2 + vpickve2gr.b $t1, $vr2, 0 andi $t1, $t1, 1 bnez $t1, .LBB15_41 # %bb.28: # %pred.store.continue194 # in Loop: Header=BB15_16 Depth=2 - vreplvei.h $vr4, $vr3, 2 - vpickve2gr.b $t1, $vr4, 2 + vreplvei.h $vr3, $vr2, 2 + vpickve2gr.b $t1, $vr3, 2 andi $t1, $t1, 1 bnez $t1, .LBB15_42 .LBB15_29: # %pred.store.continue196 # in Loop: Header=BB15_16 Depth=2 - vreplvei.h $vr4, $vr3, 4 - vpickve2gr.b $t1, $vr4, 4 + vreplvei.h $vr3, $vr2, 4 + vpickve2gr.b $t1, $vr3, 4 andi $t1, $t1, 1 bnez $t1, .LBB15_43 .LBB15_30: # %pred.store.continue198 # in Loop: Header=BB15_16 Depth=2 - vreplvei.h $vr3, $vr3, 6 - vpickve2gr.b $t1, $vr3, 6 + vreplvei.h $vr2, $vr2, 6 + vpickve2gr.b $t1, $vr2, 6 andi $t1, $t1, 1 beqz $t1, .LBB15_32 .LBB15_31: # %pred.store.if199 @@ -4146,28 +4151,30 @@ sgrep: # @sgrep st.b $a0, $t0, 11 .LBB15_32: # %pred.store.continue200 # in Loop: Header=BB15_16 Depth=2 - vilvh.h $vr2, $vr1, $vr2 - vseq.w $vr2, $vr0, $vr2 - vreplvei.h $vr3, $vr2, 0 - vpickve2gr.b $t1, $vr3, 8 + vbsrl.v $vr1, $vr1, 12 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vseq.w $vr1, $vr0, $vr1 + vreplvei.h $vr2, $vr1, 0 + vpickve2gr.b $t1, $vr2, 8 andi $t1, $t1, 1 bnez $t1, .LBB15_44 # %bb.33: # %pred.store.continue202 # in Loop: Header=BB15_16 Depth=2 - vreplvei.h $vr3, $vr2, 2 - vpickve2gr.b $t1, $vr3, 10 + vreplvei.h $vr2, $vr1, 2 + vpickve2gr.b $t1, $vr2, 10 andi $t1, $t1, 1 bnez $t1, .LBB15_45 .LBB15_34: # %pred.store.continue204 # in Loop: Header=BB15_16 Depth=2 - vreplvei.h $vr3, $vr2, 4 - vpickve2gr.b $t1, $vr3, 12 + vreplvei.h $vr2, $vr1, 4 + vpickve2gr.b $t1, $vr2, 12 andi $t1, $t1, 1 bnez $t1, .LBB15_46 .LBB15_35: # %pred.store.continue206 # in Loop: Header=BB15_16 Depth=2 - vreplvei.h $vr2, $vr2, 6 - vpickve2gr.b $t1, $vr2, 14 + vreplvei.h $vr1, $vr1, 6 + vpickve2gr.b $t1, $vr1, 14 andi $t1, $t1, 1 beqz $t1, .LBB15_15 b .LBB15_47 @@ -4175,15 +4182,15 @@ sgrep: # @sgrep .LBB15_36: # %pred.store.if179 # in Loop: Header=BB15_16 Depth=2 st.b $a0, $t0, 1 - vreplvei.h $vr5, $vr4, 4 - vpickve2gr.b $t1, $vr5, 4 + vreplvei.h $vr3, $vr2, 4 + vpickve2gr.b $t1, $vr3, 4 andi $t1, $t1, 1 beqz $t1, .LBB15_20 .LBB15_37: # %pred.store.if181 # in Loop: Header=BB15_16 Depth=2 st.b $a0, $t0, 2 - vreplvei.h $vr4, $vr4, 6 - 
vpickve2gr.b $t1, $vr4, 6 + vreplvei.h $vr2, $vr2, 6 + vpickve2gr.b $t1, $vr2, 6 andi $t1, $t1, 1 bnez $t1, .LBB15_21 b .LBB15_22 @@ -4191,22 +4198,22 @@ sgrep: # @sgrep .LBB15_38: # %pred.store.if185 # in Loop: Header=BB15_16 Depth=2 st.b $a0, $t0, 4 - vreplvei.h $vr4, $vr3, 2 - vpickve2gr.b $t1, $vr4, 10 + vreplvei.h $vr3, $vr2, 2 + vpickve2gr.b $t1, $vr3, 10 andi $t1, $t1, 1 beqz $t1, .LBB15_24 .LBB15_39: # %pred.store.if187 # in Loop: Header=BB15_16 Depth=2 st.b $a0, $t0, 5 - vreplvei.h $vr4, $vr3, 4 - vpickve2gr.b $t1, $vr4, 12 + vreplvei.h $vr3, $vr2, 4 + vpickve2gr.b $t1, $vr3, 12 andi $t1, $t1, 1 beqz $t1, .LBB15_25 .LBB15_40: # %pred.store.if189 # in Loop: Header=BB15_16 Depth=2 st.b $a0, $t0, 6 - vreplvei.h $vr3, $vr3, 6 - vpickve2gr.b $t1, $vr3, 14 + vreplvei.h $vr2, $vr2, 6 + vpickve2gr.b $t1, $vr2, 14 andi $t1, $t1, 1 bnez $t1, .LBB15_26 b .LBB15_27 @@ -4214,22 +4221,22 @@ sgrep: # @sgrep .LBB15_41: # %pred.store.if193 # in Loop: Header=BB15_16 Depth=2 st.b $a0, $t0, 8 - vreplvei.h $vr4, $vr3, 2 - vpickve2gr.b $t1, $vr4, 2 + vreplvei.h $vr3, $vr2, 2 + vpickve2gr.b $t1, $vr3, 2 andi $t1, $t1, 1 beqz $t1, .LBB15_29 .LBB15_42: # %pred.store.if195 # in Loop: Header=BB15_16 Depth=2 st.b $a0, $t0, 9 - vreplvei.h $vr4, $vr3, 4 - vpickve2gr.b $t1, $vr4, 4 + vreplvei.h $vr3, $vr2, 4 + vpickve2gr.b $t1, $vr3, 4 andi $t1, $t1, 1 beqz $t1, .LBB15_30 .LBB15_43: # %pred.store.if197 # in Loop: Header=BB15_16 Depth=2 st.b $a0, $t0, 10 - vreplvei.h $vr3, $vr3, 6 - vpickve2gr.b $t1, $vr3, 6 + vreplvei.h $vr2, $vr2, 6 + vpickve2gr.b $t1, $vr2, 6 andi $t1, $t1, 1 bnez $t1, .LBB15_31 b .LBB15_32 @@ -4237,22 +4244,22 @@ sgrep: # @sgrep .LBB15_44: # %pred.store.if201 # in Loop: Header=BB15_16 Depth=2 st.b $a0, $t0, 12 - vreplvei.h $vr3, $vr2, 2 - vpickve2gr.b $t1, $vr3, 10 + vreplvei.h $vr2, $vr1, 2 + vpickve2gr.b $t1, $vr2, 10 andi $t1, $t1, 1 beqz $t1, .LBB15_34 .LBB15_45: # %pred.store.if203 # in Loop: Header=BB15_16 Depth=2 st.b $a0, $t0, 13 - vreplvei.h $vr3, $vr2, 4 - vpickve2gr.b $t1, $vr3, 12 + vreplvei.h $vr2, $vr1, 4 + vpickve2gr.b $t1, $vr2, 12 andi $t1, $t1, 1 beqz $t1, .LBB15_35 .LBB15_46: # %pred.store.if205 # in Loop: Header=BB15_16 Depth=2 st.b $a0, $t0, 14 - vreplvei.h $vr2, $vr2, 6 - vpickve2gr.b $t1, $vr2, 14 + vreplvei.h $vr1, $vr1, 6 + vpickve2gr.b $t1, $vr1, 14 andi $t1, $t1, 1 beqz $t1, .LBB15_15 .LBB15_47: # %pred.store.if207 @@ -4822,42 +4829,37 @@ sgrep: # @sgrep vseq.b $vr3, $vr2, $vr0 vseq.b $vr2, $vr2, $vr1 vor.v $vr2, $vr3, $vr2 - vilvl.b $vr2, $vr2, $vr2 - vilvl.h $vr2, $vr2, $vr2 - vslli.w $vr2, $vr2, 24 - vsrai.w $vr2, $vr2, 24 vpickve2gr.w $a4, $vr2, 0 andi $a4, $a4, 1 - bnez $a4, .LBB15_147 -# %bb.144: # %pred.store.continue164 + beqz $a4, .LBB15_145 +# %bb.144: # %pred.store.if163 + # in Loop: Header=BB15_143 Depth=1 + st.b $a3, $a2, -1 +.LBB15_145: # %pred.store.continue164 # in Loop: Header=BB15_143 Depth=1 + vsllwil.h.b $vr2, $vr2, 0 + vsllwil.w.h $vr2, $vr2, 0 vpickve2gr.w $a4, $vr2, 1 andi $a4, $a4, 1 bnez $a4, .LBB15_148 -.LBB15_145: # %pred.store.continue166 +# %bb.146: # %pred.store.continue166 # in Loop: Header=BB15_143 Depth=1 vpickve2gr.w $a4, $vr2, 2 andi $a4, $a4, 1 bnez $a4, .LBB15_149 -.LBB15_146: # %pred.store.continue168 +.LBB15_147: # %pred.store.continue168 # in Loop: Header=BB15_143 Depth=1 vpickve2gr.w $a4, $vr2, 3 andi $a4, $a4, 1 beqz $a4, .LBB15_142 b .LBB15_150 .p2align 4, , 16 -.LBB15_147: # %pred.store.if163 - # in Loop: Header=BB15_143 Depth=1 - st.b $a3, $a2, -1 - vpickve2gr.w $a4, $vr2, 1 - andi $a4, $a4, 1 - beqz $a4, .LBB15_145 
.LBB15_148: # %pred.store.if165 # in Loop: Header=BB15_143 Depth=1 st.b $a3, $a2, 0 vpickve2gr.w $a4, $vr2, 2 andi $a4, $a4, 1 - beqz $a4, .LBB15_146 + beqz $a4, .LBB15_147 .LBB15_149: # %pred.store.if167 # in Loop: Header=BB15_143 Depth=1 st.b $a3, $a2, 1 diff --git a/results/MultiSource/Benchmarks/Prolangs-C/bison/CMakeFiles/mybison.dir/LR0.s b/results/MultiSource/Benchmarks/Prolangs-C/bison/CMakeFiles/mybison.dir/LR0.s index 14188779..8065d131 100644 --- a/results/MultiSource/Benchmarks/Prolangs-C/bison/CMakeFiles/mybison.dir/LR0.s +++ b/results/MultiSource/Benchmarks/Prolangs-C/bison/CMakeFiles/mybison.dir/LR0.s @@ -1673,12 +1673,8 @@ get_state: # @get_state ld.d $a7, $a4, 0 vinsgr2vr.d $vr2, $a6, 0 vinsgr2vr.d $vr3, $a7, 0 - vilvl.h $vr2, $vr2, $vr2 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 vadd.w $vr0, $vr0, $vr2 vadd.w $vr1, $vr1, $vr3 addi.d $a5, $a5, -8 diff --git a/results/MultiSource/Benchmarks/Prolangs-C/bison/CMakeFiles/mybison.dir/output.s b/results/MultiSource/Benchmarks/Prolangs-C/bison/CMakeFiles/mybison.dir/output.s index 18f36e77..0444f270 100644 --- a/results/MultiSource/Benchmarks/Prolangs-C/bison/CMakeFiles/mybison.dir/output.s +++ b/results/MultiSource/Benchmarks/Prolangs-C/bison/CMakeFiles/mybison.dir/output.s @@ -3427,12 +3427,8 @@ action_row: # @action_row ld.d $t7, $t4, 0 vinsgr2vr.d $vr4, $t6, 0 vinsgr2vr.d $vr5, $t7, 0 - vilvl.h $vr4, $vr4, $vr4 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr5, $vr5, $vr5 - vslli.w $vr5, $vr5, 16 - vsrai.w $vr5, $vr5, 16 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr5, $vr5, 0 vseq.w $vr4, $vr1, $vr4 vseq.w $vr5, $vr1, $vr5 vsub.w $vr2, $vr2, $vr4 @@ -3498,9 +3494,7 @@ action_row: # @action_row .LBB21_76: # %vector.body218 # =>This Inner Loop Header: Depth=1 vld $vr1, $a3, -8 - vilvl.h $vr2, $vr1, $vr1 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 + vsllwil.w.h $vr2, $vr1, 0 vseq.w $vr2, $vr0, $vr2 vpickve2gr.h $a5, $vr2, 0 andi $a5, $a5, 1 @@ -3545,9 +3539,8 @@ action_row: # @action_row st.h $zero, $a3, -2 .LBB21_84: # %pred.store.continue226 # in Loop: Header=BB21_76 Depth=1 - vilvh.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr1, $vr1, 8 + vsllwil.w.h $vr1, $vr1, 0 vseq.w $vr1, $vr0, $vr1 vpickve2gr.h $a5, $vr1, 0 andi $a5, $a5, 1 @@ -3905,12 +3898,8 @@ save_column: # @save_column ld.d $t1, $a6, 0 vinsgr2vr.d $vr3, $t0, 0 vinsgr2vr.d $vr4, $t1, 0 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvl.h $vr4, $vr4, $vr4 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 + vsllwil.w.h $vr3, $vr3, 0 + vsllwil.w.h $vr4, $vr4, 0 vseq.w $vr3, $vr0, $vr3 vseq.w $vr4, $vr0, $vr4 vadd.w $vr1, $vr1, $vr3 diff --git a/results/MultiSource/Benchmarks/Prolangs-C/gnugo/CMakeFiles/gnugo.dir/endgame.s b/results/MultiSource/Benchmarks/Prolangs-C/gnugo/CMakeFiles/gnugo.dir/endgame.s index 33a22a7f..664edcbc 100644 --- a/results/MultiSource/Benchmarks/Prolangs-C/gnugo/CMakeFiles/gnugo.dir/endgame.s +++ b/results/MultiSource/Benchmarks/Prolangs-C/gnugo/CMakeFiles/gnugo.dir/endgame.s @@ -276,7 +276,6 @@ endgame: # @endgame vori.b $vr3, $vr2, 0 vori.b $vr4, $vr2, 0 vori.b $vr5, $vr2, 0 - vori.b $vr6, $vr2, 0 .p2align 4, , 16 .LBB0_16: # %vector.body # =>This Inner Loop Header: Depth=1 @@ -285,581 +284,581 @@ endgame: # @endgame ld.b $a6, $a4, 19 ld.b $a7, $a4, 38 ld.b $t0, $a4, 57 - vinsgr2vr.b $vr7, $a5, 0 - vinsgr2vr.b $vr7, 
$a6, 1 - vinsgr2vr.b $vr7, $a7, 2 - vinsgr2vr.b $vr7, $t0, 3 + vinsgr2vr.b $vr6, $a5, 0 + vinsgr2vr.b $vr6, $a6, 1 + vinsgr2vr.b $vr6, $a7, 2 + vinsgr2vr.b $vr6, $t0, 3 ld.b $a5, $a4, 76 ld.b $a6, $a4, 95 ld.b $a7, $a4, 114 ld.b $t0, $a4, 133 - vinsgr2vr.b $vr8, $a5, 0 - vinsgr2vr.b $vr8, $a6, 1 - vinsgr2vr.b $vr8, $a7, 2 - vinsgr2vr.b $vr8, $t0, 3 - vilvl.b $vr7, $vr2, $vr7 - vilvl.h $vr7, $vr2, $vr7 - vilvl.b $vr8, $vr2, $vr8 - vilvl.h $vr8, $vr2, $vr8 - vseq.w $vr9, $vr0, $vr7 - vseq.w $vr10, $vr0, $vr8 - vseq.w $vr7, $vr1, $vr7 - vseq.w $vr8, $vr1, $vr8 - vsub.w $vr5, $vr5, $vr9 - vsub.w $vr6, $vr6, $vr10 - vandn.v $vr7, $vr9, $vr7 - vandn.v $vr8, $vr10, $vr8 + vinsgr2vr.b $vr7, $a5, 0 + vinsgr2vr.b $vr7, $a6, 1 + vinsgr2vr.b $vr7, $a7, 2 + vinsgr2vr.b $vr7, $t0, 3 + vsllwil.hu.bu $vr6, $vr6, 0 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.hu.bu $vr7, $vr7, 0 + vsllwil.wu.hu $vr7, $vr7, 0 + vseq.w $vr8, $vr1, $vr6 + vseq.w $vr9, $vr1, $vr7 + vseq.w $vr6, $vr0, $vr6 + vseq.w $vr7, $vr0, $vr7 + vsub.w $vr4, $vr4, $vr6 + vsub.w $vr5, $vr5, $vr7 + vandn.v $vr6, $vr6, $vr8 + vandn.v $vr7, $vr7, $vr9 + vsub.w $vr2, $vr2, $vr6 vsub.w $vr3, $vr3, $vr7 - vsub.w $vr4, $vr4, $vr8 ld.b $a5, $a4, 1 ld.b $a6, $a4, 20 ld.b $a7, $a4, 39 ld.b $t0, $a4, 58 - vinsgr2vr.b $vr7, $a5, 0 - vinsgr2vr.b $vr7, $a6, 1 - vinsgr2vr.b $vr7, $a7, 2 - vinsgr2vr.b $vr7, $t0, 3 + vinsgr2vr.b $vr6, $a5, 0 + vinsgr2vr.b $vr6, $a6, 1 + vinsgr2vr.b $vr6, $a7, 2 + vinsgr2vr.b $vr6, $t0, 3 ld.b $a5, $a4, 77 ld.b $a6, $a4, 96 ld.b $a7, $a4, 115 ld.b $t0, $a4, 134 - vinsgr2vr.b $vr8, $a5, 0 - vinsgr2vr.b $vr8, $a6, 1 - vinsgr2vr.b $vr8, $a7, 2 - vinsgr2vr.b $vr8, $t0, 3 - vilvl.b $vr7, $vr2, $vr7 - vilvl.h $vr7, $vr2, $vr7 - vilvl.b $vr8, $vr2, $vr8 - vilvl.h $vr8, $vr2, $vr8 - vseq.w $vr9, $vr0, $vr7 - vseq.w $vr10, $vr0, $vr8 - vseq.w $vr7, $vr1, $vr7 - vseq.w $vr8, $vr1, $vr8 - vsub.w $vr5, $vr5, $vr9 - vsub.w $vr6, $vr6, $vr10 - vandn.v $vr7, $vr9, $vr7 - vandn.v $vr8, $vr10, $vr8 + vinsgr2vr.b $vr7, $a5, 0 + vinsgr2vr.b $vr7, $a6, 1 + vinsgr2vr.b $vr7, $a7, 2 + vinsgr2vr.b $vr7, $t0, 3 + vsllwil.hu.bu $vr6, $vr6, 0 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.hu.bu $vr7, $vr7, 0 + vsllwil.wu.hu $vr7, $vr7, 0 + vseq.w $vr8, $vr1, $vr6 + vseq.w $vr9, $vr1, $vr7 + vseq.w $vr6, $vr0, $vr6 + vseq.w $vr7, $vr0, $vr7 + vsub.w $vr4, $vr4, $vr6 + vsub.w $vr5, $vr5, $vr7 + vandn.v $vr6, $vr6, $vr8 + vandn.v $vr7, $vr7, $vr9 + vsub.w $vr2, $vr2, $vr6 vsub.w $vr3, $vr3, $vr7 - vsub.w $vr4, $vr4, $vr8 ld.b $a5, $a4, 2 ld.b $a6, $a4, 21 ld.b $a7, $a4, 40 ld.b $t0, $a4, 59 - vinsgr2vr.b $vr7, $a5, 0 - vinsgr2vr.b $vr7, $a6, 1 - vinsgr2vr.b $vr7, $a7, 2 - vinsgr2vr.b $vr7, $t0, 3 + vinsgr2vr.b $vr6, $a5, 0 + vinsgr2vr.b $vr6, $a6, 1 + vinsgr2vr.b $vr6, $a7, 2 + vinsgr2vr.b $vr6, $t0, 3 ld.b $a5, $a4, 78 ld.b $a6, $a4, 97 ld.b $a7, $a4, 116 ld.b $t0, $a4, 135 - vinsgr2vr.b $vr8, $a5, 0 - vinsgr2vr.b $vr8, $a6, 1 - vinsgr2vr.b $vr8, $a7, 2 - vinsgr2vr.b $vr8, $t0, 3 - vilvl.b $vr7, $vr2, $vr7 - vilvl.h $vr7, $vr2, $vr7 - vilvl.b $vr8, $vr2, $vr8 - vilvl.h $vr8, $vr2, $vr8 - vseq.w $vr9, $vr0, $vr7 - vseq.w $vr10, $vr0, $vr8 - vseq.w $vr7, $vr1, $vr7 - vseq.w $vr8, $vr1, $vr8 - vsub.w $vr5, $vr5, $vr9 - vsub.w $vr6, $vr6, $vr10 - vandn.v $vr7, $vr9, $vr7 - vandn.v $vr8, $vr10, $vr8 + vinsgr2vr.b $vr7, $a5, 0 + vinsgr2vr.b $vr7, $a6, 1 + vinsgr2vr.b $vr7, $a7, 2 + vinsgr2vr.b $vr7, $t0, 3 + vsllwil.hu.bu $vr6, $vr6, 0 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.hu.bu $vr7, $vr7, 0 + vsllwil.wu.hu $vr7, $vr7, 0 + vseq.w $vr8, $vr1, $vr6 + vseq.w $vr9, $vr1, 
$vr7 + vseq.w $vr6, $vr0, $vr6 + vseq.w $vr7, $vr0, $vr7 + vsub.w $vr4, $vr4, $vr6 + vsub.w $vr5, $vr5, $vr7 + vandn.v $vr6, $vr6, $vr8 + vandn.v $vr7, $vr7, $vr9 + vsub.w $vr2, $vr2, $vr6 vsub.w $vr3, $vr3, $vr7 - vsub.w $vr4, $vr4, $vr8 ld.b $a5, $a4, 3 ld.b $a6, $a4, 22 ld.b $a7, $a4, 41 ld.b $t0, $a4, 60 - vinsgr2vr.b $vr7, $a5, 0 - vinsgr2vr.b $vr7, $a6, 1 - vinsgr2vr.b $vr7, $a7, 2 - vinsgr2vr.b $vr7, $t0, 3 + vinsgr2vr.b $vr6, $a5, 0 + vinsgr2vr.b $vr6, $a6, 1 + vinsgr2vr.b $vr6, $a7, 2 + vinsgr2vr.b $vr6, $t0, 3 ld.b $a5, $a4, 79 ld.b $a6, $a4, 98 ld.b $a7, $a4, 117 ld.b $t0, $a4, 136 - vinsgr2vr.b $vr8, $a5, 0 - vinsgr2vr.b $vr8, $a6, 1 - vinsgr2vr.b $vr8, $a7, 2 - vinsgr2vr.b $vr8, $t0, 3 - vilvl.b $vr7, $vr2, $vr7 - vilvl.h $vr7, $vr2, $vr7 - vilvl.b $vr8, $vr2, $vr8 - vilvl.h $vr8, $vr2, $vr8 - vseq.w $vr9, $vr0, $vr7 - vseq.w $vr10, $vr0, $vr8 - vseq.w $vr7, $vr1, $vr7 - vseq.w $vr8, $vr1, $vr8 - vsub.w $vr5, $vr5, $vr9 - vsub.w $vr6, $vr6, $vr10 - vandn.v $vr7, $vr9, $vr7 - vandn.v $vr8, $vr10, $vr8 + vinsgr2vr.b $vr7, $a5, 0 + vinsgr2vr.b $vr7, $a6, 1 + vinsgr2vr.b $vr7, $a7, 2 + vinsgr2vr.b $vr7, $t0, 3 + vsllwil.hu.bu $vr6, $vr6, 0 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.hu.bu $vr7, $vr7, 0 + vsllwil.wu.hu $vr7, $vr7, 0 + vseq.w $vr8, $vr1, $vr6 + vseq.w $vr9, $vr1, $vr7 + vseq.w $vr6, $vr0, $vr6 + vseq.w $vr7, $vr0, $vr7 + vsub.w $vr4, $vr4, $vr6 + vsub.w $vr5, $vr5, $vr7 + vandn.v $vr6, $vr6, $vr8 + vandn.v $vr7, $vr7, $vr9 + vsub.w $vr2, $vr2, $vr6 vsub.w $vr3, $vr3, $vr7 - vsub.w $vr4, $vr4, $vr8 ld.b $a5, $a4, 4 ld.b $a6, $a4, 23 ld.b $a7, $a4, 42 ld.b $t0, $a4, 61 - vinsgr2vr.b $vr7, $a5, 0 - vinsgr2vr.b $vr7, $a6, 1 - vinsgr2vr.b $vr7, $a7, 2 - vinsgr2vr.b $vr7, $t0, 3 + vinsgr2vr.b $vr6, $a5, 0 + vinsgr2vr.b $vr6, $a6, 1 + vinsgr2vr.b $vr6, $a7, 2 + vinsgr2vr.b $vr6, $t0, 3 ld.b $a5, $a4, 80 ld.b $a6, $a4, 99 ld.b $a7, $a4, 118 ld.b $t0, $a4, 137 - vinsgr2vr.b $vr8, $a5, 0 - vinsgr2vr.b $vr8, $a6, 1 - vinsgr2vr.b $vr8, $a7, 2 - vinsgr2vr.b $vr8, $t0, 3 - vilvl.b $vr7, $vr2, $vr7 - vilvl.h $vr7, $vr2, $vr7 - vilvl.b $vr8, $vr2, $vr8 - vilvl.h $vr8, $vr2, $vr8 - vseq.w $vr9, $vr0, $vr7 - vseq.w $vr10, $vr0, $vr8 - vseq.w $vr7, $vr1, $vr7 - vseq.w $vr8, $vr1, $vr8 - vsub.w $vr5, $vr5, $vr9 - vsub.w $vr6, $vr6, $vr10 - vandn.v $vr7, $vr9, $vr7 - vandn.v $vr8, $vr10, $vr8 + vinsgr2vr.b $vr7, $a5, 0 + vinsgr2vr.b $vr7, $a6, 1 + vinsgr2vr.b $vr7, $a7, 2 + vinsgr2vr.b $vr7, $t0, 3 + vsllwil.hu.bu $vr6, $vr6, 0 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.hu.bu $vr7, $vr7, 0 + vsllwil.wu.hu $vr7, $vr7, 0 + vseq.w $vr8, $vr1, $vr6 + vseq.w $vr9, $vr1, $vr7 + vseq.w $vr6, $vr0, $vr6 + vseq.w $vr7, $vr0, $vr7 + vsub.w $vr4, $vr4, $vr6 + vsub.w $vr5, $vr5, $vr7 + vandn.v $vr6, $vr6, $vr8 + vandn.v $vr7, $vr7, $vr9 + vsub.w $vr2, $vr2, $vr6 vsub.w $vr3, $vr3, $vr7 - vsub.w $vr4, $vr4, $vr8 ld.b $a5, $a4, 5 ld.b $a6, $a4, 24 ld.b $a7, $a4, 43 ld.b $t0, $a4, 62 - vinsgr2vr.b $vr7, $a5, 0 - vinsgr2vr.b $vr7, $a6, 1 - vinsgr2vr.b $vr7, $a7, 2 - vinsgr2vr.b $vr7, $t0, 3 + vinsgr2vr.b $vr6, $a5, 0 + vinsgr2vr.b $vr6, $a6, 1 + vinsgr2vr.b $vr6, $a7, 2 + vinsgr2vr.b $vr6, $t0, 3 ld.b $a5, $a4, 81 ld.b $a6, $a4, 100 ld.b $a7, $a4, 119 ld.b $t0, $a4, 138 - vinsgr2vr.b $vr8, $a5, 0 - vinsgr2vr.b $vr8, $a6, 1 - vinsgr2vr.b $vr8, $a7, 2 - vinsgr2vr.b $vr8, $t0, 3 - vilvl.b $vr7, $vr2, $vr7 - vilvl.h $vr7, $vr2, $vr7 - vilvl.b $vr8, $vr2, $vr8 - vilvl.h $vr8, $vr2, $vr8 - vseq.w $vr9, $vr0, $vr7 - vseq.w $vr10, $vr0, $vr8 - vseq.w $vr7, $vr1, $vr7 - vseq.w $vr8, $vr1, $vr8 - vsub.w $vr5, $vr5, 
$vr9 - vsub.w $vr6, $vr6, $vr10 - vandn.v $vr7, $vr9, $vr7 - vandn.v $vr8, $vr10, $vr8 + vinsgr2vr.b $vr7, $a5, 0 + vinsgr2vr.b $vr7, $a6, 1 + vinsgr2vr.b $vr7, $a7, 2 + vinsgr2vr.b $vr7, $t0, 3 + vsllwil.hu.bu $vr6, $vr6, 0 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.hu.bu $vr7, $vr7, 0 + vsllwil.wu.hu $vr7, $vr7, 0 + vseq.w $vr8, $vr1, $vr6 + vseq.w $vr9, $vr1, $vr7 + vseq.w $vr6, $vr0, $vr6 + vseq.w $vr7, $vr0, $vr7 + vsub.w $vr4, $vr4, $vr6 + vsub.w $vr5, $vr5, $vr7 + vandn.v $vr6, $vr6, $vr8 + vandn.v $vr7, $vr7, $vr9 + vsub.w $vr2, $vr2, $vr6 vsub.w $vr3, $vr3, $vr7 - vsub.w $vr4, $vr4, $vr8 ld.b $a5, $a4, 6 ld.b $a6, $a4, 25 ld.b $a7, $a4, 44 ld.b $t0, $a4, 63 - vinsgr2vr.b $vr7, $a5, 0 - vinsgr2vr.b $vr7, $a6, 1 - vinsgr2vr.b $vr7, $a7, 2 - vinsgr2vr.b $vr7, $t0, 3 + vinsgr2vr.b $vr6, $a5, 0 + vinsgr2vr.b $vr6, $a6, 1 + vinsgr2vr.b $vr6, $a7, 2 + vinsgr2vr.b $vr6, $t0, 3 ld.b $a5, $a4, 82 ld.b $a6, $a4, 101 ld.b $a7, $a4, 120 ld.b $t0, $a4, 139 - vinsgr2vr.b $vr8, $a5, 0 - vinsgr2vr.b $vr8, $a6, 1 - vinsgr2vr.b $vr8, $a7, 2 - vinsgr2vr.b $vr8, $t0, 3 - vilvl.b $vr7, $vr2, $vr7 - vilvl.h $vr7, $vr2, $vr7 - vilvl.b $vr8, $vr2, $vr8 - vilvl.h $vr8, $vr2, $vr8 - vseq.w $vr9, $vr0, $vr7 - vseq.w $vr10, $vr0, $vr8 - vseq.w $vr7, $vr1, $vr7 - vseq.w $vr8, $vr1, $vr8 - vsub.w $vr5, $vr5, $vr9 - vsub.w $vr6, $vr6, $vr10 - vandn.v $vr7, $vr9, $vr7 - vandn.v $vr8, $vr10, $vr8 + vinsgr2vr.b $vr7, $a5, 0 + vinsgr2vr.b $vr7, $a6, 1 + vinsgr2vr.b $vr7, $a7, 2 + vinsgr2vr.b $vr7, $t0, 3 + vsllwil.hu.bu $vr6, $vr6, 0 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.hu.bu $vr7, $vr7, 0 + vsllwil.wu.hu $vr7, $vr7, 0 + vseq.w $vr8, $vr1, $vr6 + vseq.w $vr9, $vr1, $vr7 + vseq.w $vr6, $vr0, $vr6 + vseq.w $vr7, $vr0, $vr7 + vsub.w $vr4, $vr4, $vr6 + vsub.w $vr5, $vr5, $vr7 + vandn.v $vr6, $vr6, $vr8 + vandn.v $vr7, $vr7, $vr9 + vsub.w $vr2, $vr2, $vr6 vsub.w $vr3, $vr3, $vr7 - vsub.w $vr4, $vr4, $vr8 ld.b $a5, $a4, 7 ld.b $a6, $a4, 26 ld.b $a7, $a4, 45 ld.b $t0, $a4, 64 - vinsgr2vr.b $vr7, $a5, 0 - vinsgr2vr.b $vr7, $a6, 1 - vinsgr2vr.b $vr7, $a7, 2 - vinsgr2vr.b $vr7, $t0, 3 + vinsgr2vr.b $vr6, $a5, 0 + vinsgr2vr.b $vr6, $a6, 1 + vinsgr2vr.b $vr6, $a7, 2 + vinsgr2vr.b $vr6, $t0, 3 ld.b $a5, $a4, 83 ld.b $a6, $a4, 102 ld.b $a7, $a4, 121 ld.b $t0, $a4, 140 - vinsgr2vr.b $vr8, $a5, 0 - vinsgr2vr.b $vr8, $a6, 1 - vinsgr2vr.b $vr8, $a7, 2 - vinsgr2vr.b $vr8, $t0, 3 - vilvl.b $vr7, $vr2, $vr7 - vilvl.h $vr7, $vr2, $vr7 - vilvl.b $vr8, $vr2, $vr8 - vilvl.h $vr8, $vr2, $vr8 - vseq.w $vr9, $vr0, $vr7 - vseq.w $vr10, $vr0, $vr8 - vseq.w $vr7, $vr1, $vr7 - vseq.w $vr8, $vr1, $vr8 - vsub.w $vr5, $vr5, $vr9 - vsub.w $vr6, $vr6, $vr10 - vandn.v $vr7, $vr9, $vr7 - vandn.v $vr8, $vr10, $vr8 + vinsgr2vr.b $vr7, $a5, 0 + vinsgr2vr.b $vr7, $a6, 1 + vinsgr2vr.b $vr7, $a7, 2 + vinsgr2vr.b $vr7, $t0, 3 + vsllwil.hu.bu $vr6, $vr6, 0 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.hu.bu $vr7, $vr7, 0 + vsllwil.wu.hu $vr7, $vr7, 0 + vseq.w $vr8, $vr1, $vr6 + vseq.w $vr9, $vr1, $vr7 + vseq.w $vr6, $vr0, $vr6 + vseq.w $vr7, $vr0, $vr7 + vsub.w $vr4, $vr4, $vr6 + vsub.w $vr5, $vr5, $vr7 + vandn.v $vr6, $vr6, $vr8 + vandn.v $vr7, $vr7, $vr9 + vsub.w $vr2, $vr2, $vr6 vsub.w $vr3, $vr3, $vr7 - vsub.w $vr4, $vr4, $vr8 ld.b $a5, $a4, 8 ld.b $a6, $a4, 27 ld.b $a7, $a4, 46 ld.b $t0, $a4, 65 - vinsgr2vr.b $vr7, $a5, 0 - vinsgr2vr.b $vr7, $a6, 1 - vinsgr2vr.b $vr7, $a7, 2 - vinsgr2vr.b $vr7, $t0, 3 + vinsgr2vr.b $vr6, $a5, 0 + vinsgr2vr.b $vr6, $a6, 1 + vinsgr2vr.b $vr6, $a7, 2 + vinsgr2vr.b $vr6, $t0, 3 ld.b $a5, $a4, 84 ld.b $a6, $a4, 103 ld.b $a7, $a4, 122 
ld.b $t0, $a4, 141 - vinsgr2vr.b $vr8, $a5, 0 - vinsgr2vr.b $vr8, $a6, 1 - vinsgr2vr.b $vr8, $a7, 2 - vinsgr2vr.b $vr8, $t0, 3 - vilvl.b $vr7, $vr2, $vr7 - vilvl.h $vr7, $vr2, $vr7 - vilvl.b $vr8, $vr2, $vr8 - vilvl.h $vr8, $vr2, $vr8 - vseq.w $vr9, $vr0, $vr7 - vseq.w $vr10, $vr0, $vr8 - vseq.w $vr7, $vr1, $vr7 - vseq.w $vr8, $vr1, $vr8 - vsub.w $vr5, $vr5, $vr9 - vsub.w $vr6, $vr6, $vr10 - vandn.v $vr7, $vr9, $vr7 - vandn.v $vr8, $vr10, $vr8 + vinsgr2vr.b $vr7, $a5, 0 + vinsgr2vr.b $vr7, $a6, 1 + vinsgr2vr.b $vr7, $a7, 2 + vinsgr2vr.b $vr7, $t0, 3 + vsllwil.hu.bu $vr6, $vr6, 0 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.hu.bu $vr7, $vr7, 0 + vsllwil.wu.hu $vr7, $vr7, 0 + vseq.w $vr8, $vr1, $vr6 + vseq.w $vr9, $vr1, $vr7 + vseq.w $vr6, $vr0, $vr6 + vseq.w $vr7, $vr0, $vr7 + vsub.w $vr4, $vr4, $vr6 + vsub.w $vr5, $vr5, $vr7 + vandn.v $vr6, $vr6, $vr8 + vandn.v $vr7, $vr7, $vr9 + vsub.w $vr2, $vr2, $vr6 vsub.w $vr3, $vr3, $vr7 - vsub.w $vr4, $vr4, $vr8 ld.b $a5, $a4, 9 ld.b $a6, $a4, 28 ld.b $a7, $a4, 47 ld.b $t0, $a4, 66 - vinsgr2vr.b $vr7, $a5, 0 - vinsgr2vr.b $vr7, $a6, 1 - vinsgr2vr.b $vr7, $a7, 2 - vinsgr2vr.b $vr7, $t0, 3 + vinsgr2vr.b $vr6, $a5, 0 + vinsgr2vr.b $vr6, $a6, 1 + vinsgr2vr.b $vr6, $a7, 2 + vinsgr2vr.b $vr6, $t0, 3 ld.b $a5, $a4, 85 ld.b $a6, $a4, 104 ld.b $a7, $a4, 123 ld.b $t0, $a4, 142 - vinsgr2vr.b $vr8, $a5, 0 - vinsgr2vr.b $vr8, $a6, 1 - vinsgr2vr.b $vr8, $a7, 2 - vinsgr2vr.b $vr8, $t0, 3 - vilvl.b $vr7, $vr2, $vr7 - vilvl.h $vr7, $vr2, $vr7 - vilvl.b $vr8, $vr2, $vr8 - vilvl.h $vr8, $vr2, $vr8 - vseq.w $vr9, $vr0, $vr7 - vseq.w $vr10, $vr0, $vr8 - vseq.w $vr7, $vr1, $vr7 - vseq.w $vr8, $vr1, $vr8 - vsub.w $vr5, $vr5, $vr9 - vsub.w $vr6, $vr6, $vr10 - vandn.v $vr7, $vr9, $vr7 - vandn.v $vr8, $vr10, $vr8 + vinsgr2vr.b $vr7, $a5, 0 + vinsgr2vr.b $vr7, $a6, 1 + vinsgr2vr.b $vr7, $a7, 2 + vinsgr2vr.b $vr7, $t0, 3 + vsllwil.hu.bu $vr6, $vr6, 0 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.hu.bu $vr7, $vr7, 0 + vsllwil.wu.hu $vr7, $vr7, 0 + vseq.w $vr8, $vr1, $vr6 + vseq.w $vr9, $vr1, $vr7 + vseq.w $vr6, $vr0, $vr6 + vseq.w $vr7, $vr0, $vr7 + vsub.w $vr4, $vr4, $vr6 + vsub.w $vr5, $vr5, $vr7 + vandn.v $vr6, $vr6, $vr8 + vandn.v $vr7, $vr7, $vr9 + vsub.w $vr2, $vr2, $vr6 vsub.w $vr3, $vr3, $vr7 - vsub.w $vr4, $vr4, $vr8 ld.b $a5, $a4, 10 ld.b $a6, $a4, 29 ld.b $a7, $a4, 48 ld.b $t0, $a4, 67 - vinsgr2vr.b $vr7, $a5, 0 - vinsgr2vr.b $vr7, $a6, 1 - vinsgr2vr.b $vr7, $a7, 2 - vinsgr2vr.b $vr7, $t0, 3 + vinsgr2vr.b $vr6, $a5, 0 + vinsgr2vr.b $vr6, $a6, 1 + vinsgr2vr.b $vr6, $a7, 2 + vinsgr2vr.b $vr6, $t0, 3 ld.b $a5, $a4, 86 ld.b $a6, $a4, 105 ld.b $a7, $a4, 124 ld.b $t0, $a4, 143 - vinsgr2vr.b $vr8, $a5, 0 - vinsgr2vr.b $vr8, $a6, 1 - vinsgr2vr.b $vr8, $a7, 2 - vinsgr2vr.b $vr8, $t0, 3 - vilvl.b $vr7, $vr2, $vr7 - vilvl.h $vr7, $vr2, $vr7 - vilvl.b $vr8, $vr2, $vr8 - vilvl.h $vr8, $vr2, $vr8 - vseq.w $vr9, $vr0, $vr7 - vseq.w $vr10, $vr0, $vr8 - vseq.w $vr7, $vr1, $vr7 - vseq.w $vr8, $vr1, $vr8 - vsub.w $vr5, $vr5, $vr9 - vsub.w $vr6, $vr6, $vr10 - vandn.v $vr7, $vr9, $vr7 - vandn.v $vr8, $vr10, $vr8 + vinsgr2vr.b $vr7, $a5, 0 + vinsgr2vr.b $vr7, $a6, 1 + vinsgr2vr.b $vr7, $a7, 2 + vinsgr2vr.b $vr7, $t0, 3 + vsllwil.hu.bu $vr6, $vr6, 0 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.hu.bu $vr7, $vr7, 0 + vsllwil.wu.hu $vr7, $vr7, 0 + vseq.w $vr8, $vr1, $vr6 + vseq.w $vr9, $vr1, $vr7 + vseq.w $vr6, $vr0, $vr6 + vseq.w $vr7, $vr0, $vr7 + vsub.w $vr4, $vr4, $vr6 + vsub.w $vr5, $vr5, $vr7 + vandn.v $vr6, $vr6, $vr8 + vandn.v $vr7, $vr7, $vr9 + vsub.w $vr2, $vr2, $vr6 vsub.w $vr3, $vr3, $vr7 - 
vsub.w $vr4, $vr4, $vr8 ld.b $a5, $a4, 11 ld.b $a6, $a4, 30 ld.b $a7, $a4, 49 ld.b $t0, $a4, 68 - vinsgr2vr.b $vr7, $a5, 0 - vinsgr2vr.b $vr7, $a6, 1 - vinsgr2vr.b $vr7, $a7, 2 - vinsgr2vr.b $vr7, $t0, 3 + vinsgr2vr.b $vr6, $a5, 0 + vinsgr2vr.b $vr6, $a6, 1 + vinsgr2vr.b $vr6, $a7, 2 + vinsgr2vr.b $vr6, $t0, 3 ld.b $a5, $a4, 87 ld.b $a6, $a4, 106 ld.b $a7, $a4, 125 ld.b $t0, $a4, 144 - vinsgr2vr.b $vr8, $a5, 0 - vinsgr2vr.b $vr8, $a6, 1 - vinsgr2vr.b $vr8, $a7, 2 - vinsgr2vr.b $vr8, $t0, 3 - vilvl.b $vr7, $vr2, $vr7 - vilvl.h $vr7, $vr2, $vr7 - vilvl.b $vr8, $vr2, $vr8 - vilvl.h $vr8, $vr2, $vr8 - vseq.w $vr9, $vr0, $vr7 - vseq.w $vr10, $vr0, $vr8 - vseq.w $vr7, $vr1, $vr7 - vseq.w $vr8, $vr1, $vr8 - vsub.w $vr5, $vr5, $vr9 - vsub.w $vr6, $vr6, $vr10 - vandn.v $vr7, $vr9, $vr7 - vandn.v $vr8, $vr10, $vr8 + vinsgr2vr.b $vr7, $a5, 0 + vinsgr2vr.b $vr7, $a6, 1 + vinsgr2vr.b $vr7, $a7, 2 + vinsgr2vr.b $vr7, $t0, 3 + vsllwil.hu.bu $vr6, $vr6, 0 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.hu.bu $vr7, $vr7, 0 + vsllwil.wu.hu $vr7, $vr7, 0 + vseq.w $vr8, $vr1, $vr6 + vseq.w $vr9, $vr1, $vr7 + vseq.w $vr6, $vr0, $vr6 + vseq.w $vr7, $vr0, $vr7 + vsub.w $vr4, $vr4, $vr6 + vsub.w $vr5, $vr5, $vr7 + vandn.v $vr6, $vr6, $vr8 + vandn.v $vr7, $vr7, $vr9 + vsub.w $vr2, $vr2, $vr6 vsub.w $vr3, $vr3, $vr7 - vsub.w $vr4, $vr4, $vr8 ld.b $a5, $a4, 12 ld.b $a6, $a4, 31 ld.b $a7, $a4, 50 ld.b $t0, $a4, 69 - vinsgr2vr.b $vr7, $a5, 0 - vinsgr2vr.b $vr7, $a6, 1 - vinsgr2vr.b $vr7, $a7, 2 - vinsgr2vr.b $vr7, $t0, 3 + vinsgr2vr.b $vr6, $a5, 0 + vinsgr2vr.b $vr6, $a6, 1 + vinsgr2vr.b $vr6, $a7, 2 + vinsgr2vr.b $vr6, $t0, 3 ld.b $a5, $a4, 88 ld.b $a6, $a4, 107 ld.b $a7, $a4, 126 ld.b $t0, $a4, 145 - vinsgr2vr.b $vr8, $a5, 0 - vinsgr2vr.b $vr8, $a6, 1 - vinsgr2vr.b $vr8, $a7, 2 - vinsgr2vr.b $vr8, $t0, 3 - vilvl.b $vr7, $vr2, $vr7 - vilvl.h $vr7, $vr2, $vr7 - vilvl.b $vr8, $vr2, $vr8 - vilvl.h $vr8, $vr2, $vr8 - vseq.w $vr9, $vr0, $vr7 - vseq.w $vr10, $vr0, $vr8 - vseq.w $vr7, $vr1, $vr7 - vseq.w $vr8, $vr1, $vr8 - vsub.w $vr5, $vr5, $vr9 - vsub.w $vr6, $vr6, $vr10 - vandn.v $vr7, $vr9, $vr7 - vandn.v $vr8, $vr10, $vr8 + vinsgr2vr.b $vr7, $a5, 0 + vinsgr2vr.b $vr7, $a6, 1 + vinsgr2vr.b $vr7, $a7, 2 + vinsgr2vr.b $vr7, $t0, 3 + vsllwil.hu.bu $vr6, $vr6, 0 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.hu.bu $vr7, $vr7, 0 + vsllwil.wu.hu $vr7, $vr7, 0 + vseq.w $vr8, $vr1, $vr6 + vseq.w $vr9, $vr1, $vr7 + vseq.w $vr6, $vr0, $vr6 + vseq.w $vr7, $vr0, $vr7 + vsub.w $vr4, $vr4, $vr6 + vsub.w $vr5, $vr5, $vr7 + vandn.v $vr6, $vr6, $vr8 + vandn.v $vr7, $vr7, $vr9 + vsub.w $vr2, $vr2, $vr6 vsub.w $vr3, $vr3, $vr7 - vsub.w $vr4, $vr4, $vr8 ld.b $a5, $a4, 13 ld.b $a6, $a4, 32 ld.b $a7, $a4, 51 ld.b $t0, $a4, 70 - vinsgr2vr.b $vr7, $a5, 0 - vinsgr2vr.b $vr7, $a6, 1 - vinsgr2vr.b $vr7, $a7, 2 - vinsgr2vr.b $vr7, $t0, 3 + vinsgr2vr.b $vr6, $a5, 0 + vinsgr2vr.b $vr6, $a6, 1 + vinsgr2vr.b $vr6, $a7, 2 + vinsgr2vr.b $vr6, $t0, 3 ld.b $a5, $a4, 89 ld.b $a6, $a4, 108 ld.b $a7, $a4, 127 ld.b $t0, $a4, 146 - vinsgr2vr.b $vr8, $a5, 0 - vinsgr2vr.b $vr8, $a6, 1 - vinsgr2vr.b $vr8, $a7, 2 - vinsgr2vr.b $vr8, $t0, 3 - vilvl.b $vr7, $vr2, $vr7 - vilvl.h $vr7, $vr2, $vr7 - vilvl.b $vr8, $vr2, $vr8 - vilvl.h $vr8, $vr2, $vr8 - vseq.w $vr9, $vr0, $vr7 - vseq.w $vr10, $vr0, $vr8 - vseq.w $vr7, $vr1, $vr7 - vseq.w $vr8, $vr1, $vr8 - vsub.w $vr5, $vr5, $vr9 - vsub.w $vr6, $vr6, $vr10 - vandn.v $vr7, $vr9, $vr7 - vandn.v $vr8, $vr10, $vr8 + vinsgr2vr.b $vr7, $a5, 0 + vinsgr2vr.b $vr7, $a6, 1 + vinsgr2vr.b $vr7, $a7, 2 + vinsgr2vr.b $vr7, $t0, 3 + vsllwil.hu.bu 
$vr6, $vr6, 0 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.hu.bu $vr7, $vr7, 0 + vsllwil.wu.hu $vr7, $vr7, 0 + vseq.w $vr8, $vr1, $vr6 + vseq.w $vr9, $vr1, $vr7 + vseq.w $vr6, $vr0, $vr6 + vseq.w $vr7, $vr0, $vr7 + vsub.w $vr4, $vr4, $vr6 + vsub.w $vr5, $vr5, $vr7 + vandn.v $vr6, $vr6, $vr8 + vandn.v $vr7, $vr7, $vr9 + vsub.w $vr2, $vr2, $vr6 vsub.w $vr3, $vr3, $vr7 - vsub.w $vr4, $vr4, $vr8 ld.b $a5, $a4, 14 ld.b $a6, $a4, 33 ld.b $a7, $a4, 52 ld.b $t0, $a4, 71 - vinsgr2vr.b $vr7, $a5, 0 - vinsgr2vr.b $vr7, $a6, 1 - vinsgr2vr.b $vr7, $a7, 2 - vinsgr2vr.b $vr7, $t0, 3 + vinsgr2vr.b $vr6, $a5, 0 + vinsgr2vr.b $vr6, $a6, 1 + vinsgr2vr.b $vr6, $a7, 2 + vinsgr2vr.b $vr6, $t0, 3 ld.b $a5, $a4, 90 ld.b $a6, $a4, 109 ld.b $a7, $a4, 128 ld.b $t0, $a4, 147 - vinsgr2vr.b $vr8, $a5, 0 - vinsgr2vr.b $vr8, $a6, 1 - vinsgr2vr.b $vr8, $a7, 2 - vinsgr2vr.b $vr8, $t0, 3 - vilvl.b $vr7, $vr2, $vr7 - vilvl.h $vr7, $vr2, $vr7 - vilvl.b $vr8, $vr2, $vr8 - vilvl.h $vr8, $vr2, $vr8 - vseq.w $vr9, $vr0, $vr7 - vseq.w $vr10, $vr0, $vr8 - vseq.w $vr7, $vr1, $vr7 - vseq.w $vr8, $vr1, $vr8 - vsub.w $vr5, $vr5, $vr9 - vsub.w $vr6, $vr6, $vr10 - vandn.v $vr7, $vr9, $vr7 - vandn.v $vr8, $vr10, $vr8 + vinsgr2vr.b $vr7, $a5, 0 + vinsgr2vr.b $vr7, $a6, 1 + vinsgr2vr.b $vr7, $a7, 2 + vinsgr2vr.b $vr7, $t0, 3 + vsllwil.hu.bu $vr6, $vr6, 0 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.hu.bu $vr7, $vr7, 0 + vsllwil.wu.hu $vr7, $vr7, 0 + vseq.w $vr8, $vr1, $vr6 + vseq.w $vr9, $vr1, $vr7 + vseq.w $vr6, $vr0, $vr6 + vseq.w $vr7, $vr0, $vr7 + vsub.w $vr4, $vr4, $vr6 + vsub.w $vr5, $vr5, $vr7 + vandn.v $vr6, $vr6, $vr8 + vandn.v $vr7, $vr7, $vr9 + vsub.w $vr2, $vr2, $vr6 vsub.w $vr3, $vr3, $vr7 - vsub.w $vr4, $vr4, $vr8 ld.b $a5, $a4, 15 ld.b $a6, $a4, 34 ld.b $a7, $a4, 53 ld.b $t0, $a4, 72 - vinsgr2vr.b $vr7, $a5, 0 - vinsgr2vr.b $vr7, $a6, 1 - vinsgr2vr.b $vr7, $a7, 2 - vinsgr2vr.b $vr7, $t0, 3 + vinsgr2vr.b $vr6, $a5, 0 + vinsgr2vr.b $vr6, $a6, 1 + vinsgr2vr.b $vr6, $a7, 2 + vinsgr2vr.b $vr6, $t0, 3 ld.b $a5, $a4, 91 ld.b $a6, $a4, 110 ld.b $a7, $a4, 129 ld.b $t0, $a4, 148 - vinsgr2vr.b $vr8, $a5, 0 - vinsgr2vr.b $vr8, $a6, 1 - vinsgr2vr.b $vr8, $a7, 2 - vinsgr2vr.b $vr8, $t0, 3 - vilvl.b $vr7, $vr2, $vr7 - vilvl.h $vr7, $vr2, $vr7 - vilvl.b $vr8, $vr2, $vr8 - vilvl.h $vr8, $vr2, $vr8 - vseq.w $vr9, $vr0, $vr7 - vseq.w $vr10, $vr0, $vr8 - vseq.w $vr7, $vr1, $vr7 - vseq.w $vr8, $vr1, $vr8 - vsub.w $vr5, $vr5, $vr9 - vsub.w $vr6, $vr6, $vr10 - vandn.v $vr7, $vr9, $vr7 - vandn.v $vr8, $vr10, $vr8 + vinsgr2vr.b $vr7, $a5, 0 + vinsgr2vr.b $vr7, $a6, 1 + vinsgr2vr.b $vr7, $a7, 2 + vinsgr2vr.b $vr7, $t0, 3 + vsllwil.hu.bu $vr6, $vr6, 0 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.hu.bu $vr7, $vr7, 0 + vsllwil.wu.hu $vr7, $vr7, 0 + vseq.w $vr8, $vr1, $vr6 + vseq.w $vr9, $vr1, $vr7 + vseq.w $vr6, $vr0, $vr6 + vseq.w $vr7, $vr0, $vr7 + vsub.w $vr4, $vr4, $vr6 + vsub.w $vr5, $vr5, $vr7 + vandn.v $vr6, $vr6, $vr8 + vandn.v $vr7, $vr7, $vr9 + vsub.w $vr2, $vr2, $vr6 vsub.w $vr3, $vr3, $vr7 - vsub.w $vr4, $vr4, $vr8 ld.b $a5, $a4, 16 ld.b $a6, $a4, 35 ld.b $a7, $a4, 54 ld.b $t0, $a4, 73 - vinsgr2vr.b $vr7, $a5, 0 - vinsgr2vr.b $vr7, $a6, 1 - vinsgr2vr.b $vr7, $a7, 2 - vinsgr2vr.b $vr7, $t0, 3 + vinsgr2vr.b $vr6, $a5, 0 + vinsgr2vr.b $vr6, $a6, 1 + vinsgr2vr.b $vr6, $a7, 2 + vinsgr2vr.b $vr6, $t0, 3 ld.b $a5, $a4, 92 ld.b $a6, $a4, 111 ld.b $a7, $a4, 130 ld.b $t0, $a4, 149 - vinsgr2vr.b $vr8, $a5, 0 - vinsgr2vr.b $vr8, $a6, 1 - vinsgr2vr.b $vr8, $a7, 2 - vinsgr2vr.b $vr8, $t0, 3 - vilvl.b $vr7, $vr2, $vr7 - vilvl.h $vr7, $vr2, $vr7 - vilvl.b $vr8, $vr2, $vr8 
- vilvl.h $vr8, $vr2, $vr8 - vseq.w $vr9, $vr0, $vr7 - vseq.w $vr10, $vr0, $vr8 - vseq.w $vr7, $vr1, $vr7 - vseq.w $vr8, $vr1, $vr8 - vsub.w $vr5, $vr5, $vr9 - vsub.w $vr6, $vr6, $vr10 - vandn.v $vr7, $vr9, $vr7 - vandn.v $vr8, $vr10, $vr8 + vinsgr2vr.b $vr7, $a5, 0 + vinsgr2vr.b $vr7, $a6, 1 + vinsgr2vr.b $vr7, $a7, 2 + vinsgr2vr.b $vr7, $t0, 3 + vsllwil.hu.bu $vr6, $vr6, 0 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.hu.bu $vr7, $vr7, 0 + vsllwil.wu.hu $vr7, $vr7, 0 + vseq.w $vr8, $vr1, $vr6 + vseq.w $vr9, $vr1, $vr7 + vseq.w $vr6, $vr0, $vr6 + vseq.w $vr7, $vr0, $vr7 + vsub.w $vr4, $vr4, $vr6 + vsub.w $vr5, $vr5, $vr7 + vandn.v $vr6, $vr6, $vr8 + vandn.v $vr7, $vr7, $vr9 + vsub.w $vr2, $vr2, $vr6 vsub.w $vr3, $vr3, $vr7 - vsub.w $vr4, $vr4, $vr8 ld.b $a5, $a4, 17 ld.b $a6, $a4, 36 ld.b $a7, $a4, 55 ld.b $t0, $a4, 74 - vinsgr2vr.b $vr7, $a5, 0 - vinsgr2vr.b $vr7, $a6, 1 - vinsgr2vr.b $vr7, $a7, 2 - vinsgr2vr.b $vr7, $t0, 3 + vinsgr2vr.b $vr6, $a5, 0 + vinsgr2vr.b $vr6, $a6, 1 + vinsgr2vr.b $vr6, $a7, 2 + vinsgr2vr.b $vr6, $t0, 3 ld.b $a5, $a4, 93 ld.b $a6, $a4, 112 ld.b $a7, $a4, 131 ld.b $t0, $a4, 150 - vinsgr2vr.b $vr8, $a5, 0 - vinsgr2vr.b $vr8, $a6, 1 - vinsgr2vr.b $vr8, $a7, 2 - vinsgr2vr.b $vr8, $t0, 3 - vilvl.b $vr7, $vr2, $vr7 - vilvl.h $vr7, $vr2, $vr7 - vilvl.b $vr8, $vr2, $vr8 - vilvl.h $vr8, $vr2, $vr8 - vseq.w $vr9, $vr0, $vr7 - vseq.w $vr10, $vr0, $vr8 - vseq.w $vr7, $vr1, $vr7 - vseq.w $vr8, $vr1, $vr8 - vsub.w $vr5, $vr5, $vr9 - vsub.w $vr6, $vr6, $vr10 - vandn.v $vr7, $vr9, $vr7 - vandn.v $vr8, $vr10, $vr8 + vinsgr2vr.b $vr7, $a5, 0 + vinsgr2vr.b $vr7, $a6, 1 + vinsgr2vr.b $vr7, $a7, 2 + vinsgr2vr.b $vr7, $t0, 3 + vsllwil.hu.bu $vr6, $vr6, 0 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.hu.bu $vr7, $vr7, 0 + vsllwil.wu.hu $vr7, $vr7, 0 + vseq.w $vr8, $vr1, $vr6 + vseq.w $vr9, $vr1, $vr7 + vseq.w $vr6, $vr0, $vr6 + vseq.w $vr7, $vr0, $vr7 + vsub.w $vr4, $vr4, $vr6 + vsub.w $vr5, $vr5, $vr7 + vandn.v $vr6, $vr6, $vr8 + vandn.v $vr7, $vr7, $vr9 + vsub.w $vr2, $vr2, $vr6 vsub.w $vr3, $vr3, $vr7 - vsub.w $vr4, $vr4, $vr8 ld.b $a5, $a4, 18 ld.b $a6, $a4, 37 ld.b $a7, $a4, 56 ld.b $t0, $a4, 75 - vinsgr2vr.b $vr7, $a5, 0 - vinsgr2vr.b $vr7, $a6, 1 - vinsgr2vr.b $vr7, $a7, 2 - vinsgr2vr.b $vr7, $t0, 3 + vinsgr2vr.b $vr6, $a5, 0 + vinsgr2vr.b $vr6, $a6, 1 + vinsgr2vr.b $vr6, $a7, 2 + vinsgr2vr.b $vr6, $t0, 3 ld.b $a5, $a4, 94 ld.b $a6, $a4, 113 ld.b $a7, $a4, 132 ld.b $a4, $a4, 151 - vinsgr2vr.b $vr8, $a5, 0 - vinsgr2vr.b $vr8, $a6, 1 - vinsgr2vr.b $vr8, $a7, 2 - vinsgr2vr.b $vr8, $a4, 3 - vilvl.b $vr7, $vr2, $vr7 - vilvl.h $vr7, $vr2, $vr7 - vilvl.b $vr8, $vr2, $vr8 - vilvl.h $vr8, $vr2, $vr8 - vseq.w $vr9, $vr0, $vr7 - vseq.w $vr10, $vr0, $vr8 - vseq.w $vr7, $vr1, $vr7 - vseq.w $vr8, $vr1, $vr8 - vsub.w $vr5, $vr5, $vr9 - vsub.w $vr6, $vr6, $vr10 - vandn.v $vr7, $vr9, $vr7 - vandn.v $vr8, $vr10, $vr8 - vsub.w $vr3, $vr3, $vr7 + vinsgr2vr.b $vr7, $a5, 0 + vinsgr2vr.b $vr7, $a6, 1 + vinsgr2vr.b $vr7, $a7, 2 + vinsgr2vr.b $vr7, $a4, 3 + vsllwil.hu.bu $vr6, $vr6, 0 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.hu.bu $vr7, $vr7, 0 + vsllwil.wu.hu $vr7, $vr7, 0 + vseq.w $vr8, $vr1, $vr6 + vseq.w $vr9, $vr1, $vr7 + vseq.w $vr6, $vr0, $vr6 + vseq.w $vr7, $vr0, $vr7 + vsub.w $vr4, $vr4, $vr6 + vsub.w $vr5, $vr5, $vr7 + vandn.v $vr6, $vr6, $vr8 + vandn.v $vr7, $vr7, $vr9 + vsub.w $vr2, $vr2, $vr6 addi.d $a2, $a2, 152 - vsub.w $vr4, $vr4, $vr8 + vsub.w $vr3, $vr3, $vr7 bne $a2, $a3, .LBB0_16 # %bb.17: # %scalar.ph move $a2, $zero - vadd.w $vr0, $vr6, $vr5 + vadd.w $vr0, $vr5, $vr4 vhaddw.d.w $vr0, $vr0, 
$vr0 vhaddw.q.d $vr0, $vr0, $vr0 vpickve2gr.d $fp, $vr0, 0 - vadd.w $vr0, $vr4, $vr3 + vadd.w $vr0, $vr3, $vr2 vhaddw.d.w $vr0, $vr0, $vr0 vhaddw.q.d $vr0, $vr0, $vr0 vpickve2gr.d $s0, $vr0, 0 diff --git a/results/MultiSource/Benchmarks/Ptrdist/anagram/CMakeFiles/anagram.dir/anagram.s b/results/MultiSource/Benchmarks/Ptrdist/anagram/CMakeFiles/anagram.dir/anagram.s index 7ee3d9dc..58a53f24 100644 --- a/results/MultiSource/Benchmarks/Ptrdist/anagram/CMakeFiles/anagram.dir/anagram.s +++ b/results/MultiSource/Benchmarks/Ptrdist/anagram/CMakeFiles/anagram.dir/anagram.s @@ -438,32 +438,31 @@ NextWord: # @NextWord .type BuildWord,@function BuildWord: # @BuildWord # %bb.0: - addi.d $sp, $sp, -128 - st.d $ra, $sp, 120 # 8-byte Folded Spill - st.d $fp, $sp, 112 # 8-byte Folded Spill - st.d $s0, $sp, 104 # 8-byte Folded Spill - st.d $s1, $sp, 96 # 8-byte Folded Spill - st.d $s2, $sp, 88 # 8-byte Folded Spill - st.d $s3, $sp, 80 # 8-byte Folded Spill - st.d $s4, $sp, 72 # 8-byte Folded Spill - st.d $s5, $sp, 64 # 8-byte Folded Spill + addi.d $sp, $sp, -112 + st.d $ra, $sp, 104 # 8-byte Folded Spill + st.d $fp, $sp, 96 # 8-byte Folded Spill + st.d $s0, $sp, 88 # 8-byte Folded Spill + st.d $s1, $sp, 80 # 8-byte Folded Spill + st.d $s2, $sp, 72 # 8-byte Folded Spill + st.d $s3, $sp, 64 # 8-byte Folded Spill + st.d $s4, $sp, 56 # 8-byte Folded Spill + st.d $s5, $sp, 48 # 8-byte Folded Spill move $fp, $a0 - st.h $zero, $sp, 56 + st.h $zero, $sp, 40 ld.bu $s5, $a0, 0 - st.d $zero, $sp, 48 - vrepli.b $vr6, 0 - vst $vr6, $sp, 32 + st.d $zero, $sp, 32 + vrepli.b $vr0, 0 + vst $vr0, $sp, 16 pcalau12i $a0, %pc_hi20(alPhrase) addi.d $s1, $a0, %pc_lo12(alPhrase) beqz $s5, .LBB6_7 # %bb.1: # %.lr.ph - vst $vr6, $sp, 16 # 16-byte Folded Spill pcaddu18i $ra, %call36(__ctype_b_loc) jirl $ra, $ra, 0 ld.d $s0, $a0, 0 move $s2, $zero addi.d $s3, $fp, 1 - addi.d $s4, $sp, 32 + addi.d $s4, $sp, 16 b .LBB6_3 .p2align 4, , 16 .LBB6_2: # %.backedge @@ -495,68 +494,65 @@ BuildWord: # @BuildWord addi.d $s2, $s2, 1 b .LBB6_2 .LBB6_6: # %.preheader.loopexit - ld.d $a0, $sp, 32 - ld.w $a1, $sp, 36 - ld.d $a2, $sp, 40 - ld.w $a3, $sp, 44 + ld.d $a0, $sp, 16 + ld.w $a1, $sp, 20 + ld.d $a2, $sp, 24 + ld.w $a3, $sp, 28 vinsgr2vr.d $vr0, $a0, 0 vinsgr2vr.w $vr1, $a1, 0 - vinsgr2vr.d $vr4, $a2, 0 - vinsgr2vr.w $vr5, $a3, 0 - ld.bu $a1, $sp, 48 - vrepli.b $vr6, 0 - vilvl.b $vr0, $vr6, $vr0 - vilvl.h $vr3, $vr6, $vr0 - vilvl.b $vr0, $vr6, $vr1 - vilvl.h $vr2, $vr6, $vr0 - vilvl.b $vr0, $vr6, $vr4 - vilvl.h $vr0, $vr6, $vr0 - vilvl.b $vr1, $vr6, $vr5 - vilvl.h $vr1, $vr6, $vr1 - vld $vr6, $sp, 16 # 16-byte Folded Reload + vinsgr2vr.d $vr2, $a2, 0 + vinsgr2vr.w $vr4, $a3, 0 + ld.bu $a1, $sp, 32 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr3, $vr1, 0 + vsllwil.hu.bu $vr1, $vr2, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vsllwil.hu.bu $vr2, $vr4, 0 + vsllwil.wu.hu $vr2, $vr2, 0 b .LBB6_8 .LBB6_7: move $a1, $zero move $s2, $zero - vori.b $vr3, $vr6, 0 - vori.b $vr2, $vr6, 0 - vori.b $vr0, $vr6, 0 - vori.b $vr1, $vr6, 0 + vori.b $vr3, $vr0, 0 + vori.b $vr1, $vr0, 0 + vori.b $vr2, $vr0, 0 .LBB6_8: # %.preheader pcalau12i $a0, %pc_hi20(auGlobalFrequency) addi.d $a0, $a0, %pc_lo12(auGlobalFrequency) vld $vr4, $a0, 0 vld $vr5, $a0, 16 - vadd.w $vr3, $vr4, $vr3 - vst $vr3, $a0, 0 - vld $vr3, $a0, 32 - vadd.w $vr2, $vr5, $vr2 + vadd.w $vr0, $vr4, $vr0 + vst $vr0, $a0, 0 + vld $vr0, $a0, 32 + vadd.w $vr3, $vr5, $vr3 vld $vr4, $a0, 48 - vst $vr2, $a0, 16 - vadd.w $vr0, $vr3, $vr0 + vst $vr3, $a0, 
16 + vadd.w $vr0, $vr0, $vr1 vst $vr0, $a0, 32 - vadd.w $vr0, $vr4, $vr1 + vadd.w $vr0, $vr4, $vr2 vst $vr0, $a0, 48 - ld.bu $a2, $sp, 49 - ld.bu $a3, $sp, 50 - ld.bu $a4, $sp, 51 + ld.bu $a2, $sp, 33 + ld.bu $a3, $sp, 34 + ld.bu $a4, $sp, 35 vld $vr0, $a0, 64 vinsgr2vr.w $vr1, $a1, 0 vinsgr2vr.w $vr1, $a2, 1 vinsgr2vr.w $vr1, $a3, 2 - ld.w $a1, $sp, 52 + ld.w $a1, $sp, 36 vinsgr2vr.w $vr1, $a4, 3 vadd.w $vr0, $vr0, $vr1 vst $vr0, $a0, 64 vinsgr2vr.w $vr0, $a1, 0 - ld.bu $a1, $sp, 56 + ld.bu $a1, $sp, 40 ld.w $a2, $a0, 96 - vilvl.b $vr0, $vr6, $vr0 - vilvl.h $vr0, $vr6, $vr0 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 vld $vr1, $a0, 80 add.d $a1, $a2, $a1 st.w $a1, $a0, 96 - ld.bu $a3, $sp, 57 + ld.bu $a3, $sp, 41 ld.w $a4, $a0, 100 pcalau12i $a1, %pc_hi20(cpwCand) ld.wu $a2, $a1, %pc_lo12(cpwCand) @@ -588,7 +584,7 @@ BuildWord: # @BuildWord vrepli.b $vr0, 0 vst $vr0, $a0, 0 ld.wu $a1, $s1, 12 - ld.bu $a2, $sp, 32 + ld.bu $a2, $sp, 16 ld.w $a3, $s1, 4 slli.d $a1, $a1, 3 ldx.d $a4, $a0, $a1 @@ -596,7 +592,7 @@ BuildWord: # @BuildWord or $a2, $a4, $a2 ld.wu $a3, $s1, 28 stx.d $a2, $a0, $a1 - ld.bu $a1, $sp, 33 + ld.bu $a1, $sp, 17 ld.w $a2, $s1, 20 slli.d $a3, $a3, 3 ldx.d $a4, $a0, $a3 @@ -604,7 +600,7 @@ BuildWord: # @BuildWord or $a1, $a4, $a1 ld.wu $a2, $s1, 44 stx.d $a1, $a0, $a3 - ld.bu $a1, $sp, 34 + ld.bu $a1, $sp, 18 ld.w $a3, $s1, 36 slli.d $a2, $a2, 3 ldx.d $a4, $a0, $a2 @@ -612,7 +608,7 @@ BuildWord: # @BuildWord or $a1, $a4, $a1 ld.wu $a3, $s1, 60 stx.d $a1, $a0, $a2 - ld.bu $a1, $sp, 35 + ld.bu $a1, $sp, 19 ld.w $a2, $s1, 52 slli.d $a3, $a3, 3 ldx.d $a4, $a0, $a3 @@ -620,7 +616,7 @@ BuildWord: # @BuildWord or $a1, $a4, $a1 ld.wu $a2, $s1, 76 stx.d $a1, $a0, $a3 - ld.bu $a1, $sp, 36 + ld.bu $a1, $sp, 20 ld.w $a3, $s1, 68 slli.d $a2, $a2, 3 ldx.d $a4, $a0, $a2 @@ -628,7 +624,7 @@ BuildWord: # @BuildWord or $a1, $a4, $a1 ld.wu $a3, $s1, 92 stx.d $a1, $a0, $a2 - ld.bu $a1, $sp, 37 + ld.bu $a1, $sp, 21 ld.w $a2, $s1, 84 slli.d $a3, $a3, 3 ldx.d $a4, $a0, $a3 @@ -636,7 +632,7 @@ BuildWord: # @BuildWord or $a1, $a4, $a1 ld.wu $a2, $s1, 108 stx.d $a1, $a0, $a3 - ld.bu $a1, $sp, 38 + ld.bu $a1, $sp, 22 ld.w $a3, $s1, 100 slli.d $a2, $a2, 3 ldx.d $a4, $a0, $a2 @@ -644,7 +640,7 @@ BuildWord: # @BuildWord or $a1, $a4, $a1 ld.wu $a3, $s1, 124 stx.d $a1, $a0, $a2 - ld.bu $a1, $sp, 39 + ld.bu $a1, $sp, 23 ld.w $a2, $s1, 116 slli.d $a3, $a3, 3 ldx.d $a4, $a0, $a3 @@ -652,7 +648,7 @@ BuildWord: # @BuildWord or $a1, $a4, $a1 ld.wu $a2, $s1, 140 stx.d $a1, $a0, $a3 - ld.bu $a1, $sp, 40 + ld.bu $a1, $sp, 24 ld.w $a3, $s1, 132 slli.d $a2, $a2, 3 ldx.d $a4, $a0, $a2 @@ -660,7 +656,7 @@ BuildWord: # @BuildWord or $a1, $a4, $a1 ld.wu $a3, $s1, 156 stx.d $a1, $a0, $a2 - ld.bu $a1, $sp, 41 + ld.bu $a1, $sp, 25 ld.w $a2, $s1, 148 slli.d $a3, $a3, 3 ldx.d $a4, $a0, $a3 @@ -668,7 +664,7 @@ BuildWord: # @BuildWord or $a1, $a4, $a1 ld.wu $a2, $s1, 172 stx.d $a1, $a0, $a3 - ld.bu $a1, $sp, 42 + ld.bu $a1, $sp, 26 ld.w $a3, $s1, 164 slli.d $a2, $a2, 3 ldx.d $a4, $a0, $a2 @@ -676,7 +672,7 @@ BuildWord: # @BuildWord or $a1, $a4, $a1 ld.wu $a3, $s1, 188 stx.d $a1, $a0, $a2 - ld.bu $a1, $sp, 43 + ld.bu $a1, $sp, 27 ld.w $a2, $s1, 180 slli.d $a3, $a3, 3 ldx.d $a4, $a0, $a3 @@ -684,7 +680,7 @@ BuildWord: # @BuildWord or $a1, $a4, $a1 ld.wu $a2, $s1, 204 stx.d $a1, $a0, $a3 - ld.bu $a1, $sp, 44 + ld.bu $a1, $sp, 28 ld.w $a3, $s1, 196 slli.d $a2, $a2, 3 ldx.d $a4, $a0, $a2 @@ -692,7 +688,7 @@ BuildWord: # @BuildWord or $a1, $a4, $a1 ld.wu $a3, $s1, 220 stx.d $a1, $a0, $a2 - ld.bu $a1, $sp, 45 + ld.bu 
$a1, $sp, 29 ld.w $a2, $s1, 212 slli.d $a3, $a3, 3 ldx.d $a4, $a0, $a3 @@ -700,7 +696,7 @@ BuildWord: # @BuildWord or $a1, $a4, $a1 ld.wu $a2, $s1, 236 stx.d $a1, $a0, $a3 - ld.bu $a1, $sp, 46 + ld.bu $a1, $sp, 30 ld.w $a3, $s1, 228 slli.d $a2, $a2, 3 ldx.d $a4, $a0, $a2 @@ -708,7 +704,7 @@ BuildWord: # @BuildWord or $a1, $a4, $a1 ld.wu $a3, $s1, 252 stx.d $a1, $a0, $a2 - ld.bu $a1, $sp, 47 + ld.bu $a1, $sp, 31 ld.w $a2, $s1, 244 slli.d $a3, $a3, 3 ldx.d $a4, $a0, $a3 @@ -716,7 +712,7 @@ BuildWord: # @BuildWord or $a1, $a4, $a1 ld.wu $a2, $s1, 268 stx.d $a1, $a0, $a3 - ld.bu $a1, $sp, 48 + ld.bu $a1, $sp, 32 ld.w $a3, $s1, 260 slli.d $a2, $a2, 3 ldx.d $a4, $a0, $a2 @@ -724,7 +720,7 @@ BuildWord: # @BuildWord or $a1, $a4, $a1 ld.wu $a3, $s1, 284 stx.d $a1, $a0, $a2 - ld.bu $a1, $sp, 49 + ld.bu $a1, $sp, 33 ld.w $a2, $s1, 276 slli.d $a3, $a3, 3 ldx.d $a4, $a0, $a3 @@ -732,7 +728,7 @@ BuildWord: # @BuildWord or $a1, $a4, $a1 ld.wu $a2, $s1, 300 stx.d $a1, $a0, $a3 - ld.bu $a1, $sp, 50 + ld.bu $a1, $sp, 34 ld.w $a3, $s1, 292 slli.d $a2, $a2, 3 ldx.d $a4, $a0, $a2 @@ -740,7 +736,7 @@ BuildWord: # @BuildWord or $a1, $a4, $a1 ld.wu $a3, $s1, 316 stx.d $a1, $a0, $a2 - ld.bu $a1, $sp, 51 + ld.bu $a1, $sp, 35 ld.w $a2, $s1, 308 slli.d $a3, $a3, 3 ldx.d $a4, $a0, $a3 @@ -748,7 +744,7 @@ BuildWord: # @BuildWord or $a1, $a4, $a1 ld.wu $a2, $s1, 332 stx.d $a1, $a0, $a3 - ld.bu $a1, $sp, 52 + ld.bu $a1, $sp, 36 ld.w $a3, $s1, 324 slli.d $a2, $a2, 3 ldx.d $a4, $a0, $a2 @@ -756,7 +752,7 @@ BuildWord: # @BuildWord or $a1, $a4, $a1 ld.wu $a3, $s1, 348 stx.d $a1, $a0, $a2 - ld.bu $a1, $sp, 53 + ld.bu $a1, $sp, 37 ld.w $a2, $s1, 340 slli.d $a3, $a3, 3 ldx.d $a4, $a0, $a3 @@ -764,7 +760,7 @@ BuildWord: # @BuildWord or $a1, $a4, $a1 ld.wu $a2, $s1, 364 stx.d $a1, $a0, $a3 - ld.bu $a1, $sp, 54 + ld.bu $a1, $sp, 38 ld.w $a3, $s1, 356 slli.d $a2, $a2, 3 ldx.d $a4, $a0, $a2 @@ -772,7 +768,7 @@ BuildWord: # @BuildWord or $a1, $a4, $a1 ld.wu $a3, $s1, 380 stx.d $a1, $a0, $a2 - ld.bu $a1, $sp, 55 + ld.bu $a1, $sp, 39 ld.w $a2, $s1, 372 slli.d $a3, $a3, 3 ldx.d $a4, $a0, $a3 @@ -780,7 +776,7 @@ BuildWord: # @BuildWord or $a1, $a4, $a1 ld.wu $a2, $s1, 396 stx.d $a1, $a0, $a3 - ld.bu $a1, $sp, 56 + ld.bu $a1, $sp, 40 ld.w $a3, $s1, 388 slli.d $a2, $a2, 3 ldx.d $a4, $a0, $a2 @@ -788,7 +784,7 @@ BuildWord: # @BuildWord or $a1, $a4, $a1 ld.wu $a3, $s1, 412 stx.d $a1, $a0, $a2 - ld.bu $a1, $sp, 57 + ld.bu $a1, $sp, 41 ld.w $a2, $s1, 404 slli.d $a3, $a3, 3 ldx.d $a4, $a0, $a3 @@ -798,15 +794,15 @@ BuildWord: # @BuildWord or $a1, $a4, $a1 stx.d $a1, $a0, $a3 .LBB6_13: # %.loopexit - ld.d $s5, $sp, 64 # 8-byte Folded Reload - ld.d $s4, $sp, 72 # 8-byte Folded Reload - ld.d $s3, $sp, 80 # 8-byte Folded Reload - ld.d $s2, $sp, 88 # 8-byte Folded Reload - ld.d $s1, $sp, 96 # 8-byte Folded Reload - ld.d $s0, $sp, 104 # 8-byte Folded Reload - ld.d $fp, $sp, 112 # 8-byte Folded Reload - ld.d $ra, $sp, 120 # 8-byte Folded Reload - addi.d $sp, $sp, 128 + ld.d $s5, $sp, 48 # 8-byte Folded Reload + ld.d $s4, $sp, 56 # 8-byte Folded Reload + ld.d $s3, $sp, 64 # 8-byte Folded Reload + ld.d $s2, $sp, 72 # 8-byte Folded Reload + ld.d $s1, $sp, 80 # 8-byte Folded Reload + ld.d $s0, $sp, 88 # 8-byte Folded Reload + ld.d $fp, $sp, 96 # 8-byte Folded Reload + ld.d $ra, $sp, 104 # 8-byte Folded Reload + addi.d $sp, $sp, 112 ret .LBB6_14: pcalau12i $a0, %pc_hi20(.L.str.10) diff --git a/results/MultiSource/Benchmarks/Ptrdist/bc/CMakeFiles/bc.dir/number.s b/results/MultiSource/Benchmarks/Ptrdist/bc/CMakeFiles/bc.dir/number.s index 9c31da0c..135724f7 
100644 --- a/results/MultiSource/Benchmarks/Ptrdist/bc/CMakeFiles/bc.dir/number.s +++ b/results/MultiSource/Benchmarks/Ptrdist/bc/CMakeFiles/bc.dir/number.s @@ -1489,27 +1489,7 @@ bc_sub: # @bc_sub .Lfunc_end14: .size bc_sub, .Lfunc_end14-bc_sub # -- End function - .section .rodata.cst16,"aM",@progbits,16 - .p2align 4, 0x0 # -- Begin function bc_multiply -.LCPI15_0: - .byte 1 # 0x1 - .byte 255 # 0xff - .byte 255 # 0xff - .byte 255 # 0xff - .byte 255 # 0xff - .byte 255 # 0xff - .byte 255 # 0xff - .byte 255 # 0xff - .byte 0 # 0x0 - .byte 255 # 0xff - .byte 255 # 0xff - .byte 255 # 0xff - .byte 255 # 0xff - .byte 255 # 0xff - .byte 255 # 0xff - .byte 255 # 0xff - .text - .globl bc_multiply + .globl bc_multiply # -- Begin function bc_multiply .p2align 5 .type bc_multiply,@function bc_multiply: # @bc_multiply @@ -1526,18 +1506,18 @@ bc_multiply: # @bc_multiply st.d $s6, $sp, 72 # 8-byte Folded Spill st.d $s7, $sp, 64 # 8-byte Folded Spill st.d $s8, $sp, 56 # 8-byte Folded Spill - move $s1, $a1 + move $s6, $a1 move $s0, $a0 ld.w $a4, $a0, 4 ld.w $s7, $a0, 8 ld.w $a0, $a1, 4 ld.w $a1, $a1, 8 - st.d $a2, $sp, 32 # 8-byte Folded Spill + st.d $a2, $sp, 40 # 8-byte Folded Spill st.d $a4, $sp, 48 # 8-byte Folded Spill add.w $fp, $s7, $a4 add.d $s8, $a1, $a0 addi.w $s3, $s8, 0 - add.d $s6, $s8, $fp + add.d $s1, $s8, $fp add.w $a0, $a1, $s7 slt $a2, $a1, $s7 masknez $a1, $a1, $a2 @@ -1552,7 +1532,7 @@ bc_multiply: # @bc_multiply masknez $a2, $a0, $a2 or $s2, $a1, $a2 sub.w $s4, $a0, $s2 - sub.w $s5, $s6, $a0 + sub.w $s5, $s1, $a0 add.d $a0, $s5, $s2 addi.d $a0, $a0, 1040 pcaddu18i $ra, %call36(malloc) @@ -1561,7 +1541,7 @@ bc_multiply: # @bc_multiply st.w $s2, $a0, 8 ori $a1, $zero, 1 ld.w $a2, $s0, 0 - ld.w $a3, $s1, 0 + ld.w $a3, $s6, 0 st.w $a1, $a0, 12 addi.d $s2, $a0, 16 st.b $zero, $a0, 16 @@ -1569,349 +1549,338 @@ bc_multiply: # @bc_multiply sltu $a1, $zero, $a1 st.w $a1, $a0, 0 addi.d $a1, $fp, 15 - add.d $a2, $s1, $s3 + add.d $a2, $s6, $s3 nor $a3, $s4, $zero - st.d $s6, $sp, 24 # 8-byte Folded Spill - add.w $t0, $s6, $a3 + st.d $s1, $sp, 24 # 8-byte Folded Spill + add.w $a7, $s1, $a3 addi.w $a3, $s3, -1 - lu12i.w $t8, 419430 - pcalau12i $a4, %pc_hi20(.LCPI15_0) - st.d $s8, $sp, 40 # 8-byte Folded Spill + lu12i.w $t7, 419430 + st.d $s8, $sp, 32 # 8-byte Folded Spill blez $s4, .LBB15_14 # %bb.1: # %.lr.ph151 - st.d $t0, $sp, 8 # 8-byte Folded Spill - move $a7, $zero - move $a5, $zero - ld.d $a6, $sp, 48 # 8-byte Folded Reload - add.d $a6, $a6, $s7 - addi.d $t1, $a6, -2 - add.d $a6, $a6, $s0 - addi.d $t2, $a6, 13 - sub.w $t3, $zero, $s3 - st.d $s1, $sp, 16 # 8-byte Folded Spill - add.d $a6, $s8, $s1 - addi.d $t4, $a6, 17 - ori $t5, $zero, 16 - ori $t6, $zero, 3 - ori $t7, $zero, 17 + st.d $a7, $sp, 16 # 8-byte Folded Spill + move $a6, $zero + move $a4, $zero + ld.d $a5, $sp, 48 # 8-byte Folded Reload + add.d $a5, $a5, $s7 + addi.d $t0, $a5, -2 + add.d $a5, $a5, $s0 + addi.d $t1, $a5, 13 + sub.w $t2, $zero, $s3 + add.d $a5, $s8, $s6 + addi.d $t3, $a5, 17 + ori $t4, $zero, 16 + ori $t5, $zero, 3 + ori $t6, $zero, 17 vrepli.b $vr0, 0 - ori $a6, $t8, 1639 - lu32i.d $a6, 419430 - lu52i.d $t8, $a6, 1638 - move $s6, $s7 + ori $a5, $t7, 1639 + lu32i.d $a5, 419430 + lu52i.d $t7, $a5, 1638 + move $s1, $s7 b .LBB15_3 .p2align 4, , 16 .LBB15_2: # %._crit_edge # in Loop: Header=BB15_3 Depth=1 - mulh.d $a5, $a5, $t8 - srli.d $a6, $a5, 63 - srai.d $a5, $a5, 2 - add.d $a5, $a5, $a6 - addi.w $a7, $a7, 1 - addi.w $t3, $t3, 1 - beq $a7, $s4, .LBB15_13 + mulh.d $a4, $a4, $t7 + srli.d $a5, $a4, 63 + srai.d $a4, $a4, 
2 + add.d $a4, $a4, $a5 + addi.w $a6, $a6, 1 + addi.w $t2, $t2, 1 + beq $a6, $s4, .LBB15_13 .LBB15_3: # =>This Loop Header: Depth=1 # Child Loop BB15_8 Depth 2 # Child Loop BB15_11 Depth 2 - addi.w $t0, $zero, -1 - sub.w $a6, $a7, $s3 - slt $fp, $t0, $a6 - maskeqz $a6, $a6, $fp - masknez $fp, $t0, $fp - or $fp, $a6, $fp - nor $a6, $fp, $zero - add.d $ra, $a1, $a6 - blt $ra, $t5, .LBB15_2 + addi.w $a5, $zero, -1 + sub.w $a7, $a6, $s3 + slt $t8, $a5, $a7 + maskeqz $a7, $a7, $t8 + masknez $t8, $a5, $t8 + or $a7, $a7, $t8 + nor $t8, $a7, $zero + add.d $t8, $a1, $t8 + blt $t8, $t4, .LBB15_2 # %bb.4: # in Loop: Header=BB15_3 Depth=1 - slt $a6, $a7, $a3 - masknez $s5, $a3, $a6 - maskeqz $a6, $a7, $a6 - or $a6, $a6, $s5 - bltz $a6, .LBB15_2 + slt $fp, $a6, $a3 + masknez $s5, $a3, $fp + maskeqz $fp, $a6, $fp + or $fp, $fp, $s5 + bltz $fp, .LBB15_2 # %bb.5: # %.lr.ph.preheader # in Loop: Header=BB15_3 Depth=1 - bstrpick.d $a6, $a6, 31, 0 - sub.w $fp, $zero, $fp - add.d $fp, $t1, $fp - sltu $s5, $fp, $a6 - masknez $s7, $a6, $s5 - maskeqz $fp, $fp, $s5 - or $fp, $fp, $s7 - bgeu $fp, $t6, .LBB15_7 + bstrpick.d $ra, $fp, 31, 0 + sub.w $a7, $zero, $a7 + add.d $a7, $t0, $a7 + sltu $fp, $a7, $ra + masknez $s5, $ra, $fp + maskeqz $a7, $a7, $fp + or $s5, $a7, $s5 + bgeu $s5, $t5, .LBB15_7 # %bb.6: # in Loop: Header=BB15_3 Depth=1 - move $fp, $zero + move $a5, $zero b .LBB15_10 .p2align 4, , 16 .LBB15_7: # %vector.ph # in Loop: Header=BB15_3 Depth=1 - slt $s5, $t0, $t3 - masknez $t0, $t0, $s5 - maskeqz $s5, $t3, $s5 - or $s7, $s5, $t0 - sub.w $t0, $zero, $s7 - add.d $t0, $t2, $t0 - sub.d $s5, $t4, $a6 - sub.d $s7, $t1, $s7 - sltu $s8, $s7, $a6 - masknez $s1, $a6, $s8 - maskeqz $s7, $s7, $s8 - or $s1, $s7, $s1 - addi.d $s1, $s1, 1 - bstrpick.d $s1, $s1, 32, 2 - slli.d $s8, $s1, 2 - addi.d $s7, $fp, 1 - bstrpick.d $fp, $s7, 32, 2 - slli.d $fp, $fp, 2 - sub.d $ra, $ra, $fp + slt $a7, $a5, $t2 + masknez $a5, $a5, $a7 + maskeqz $a7, $t2, $a7 + or $a5, $a7, $a5 + sub.w $a7, $zero, $a5 + add.d $fp, $t1, $a7 + sub.d $a7, $t3, $ra + sub.d $a5, $t0, $a5 + sltu $s7, $a5, $ra + masknez $s8, $ra, $s7 + maskeqz $a5, $a5, $s7 + or $a5, $a5, $s8 + addi.d $a5, $a5, 1 + bstrpick.d $a5, $a5, 32, 2 + slli.d $s7, $a5, 2 + addi.d $s5, $s5, 1 + bstrpick.d $a5, $s5, 32, 2 + slli.d $a5, $a5, 2 + sub.d $t8, $t8, $a5 vori.b $vr1, $vr0, 0 - vinsgr2vr.d $vr1, $a5, 0 + vinsgr2vr.d $vr1, $a4, 0 vori.b $vr2, $vr0, 0 .p2align 4, , 16 .LBB15_8: # %vector.body # Parent Loop BB15_3 Depth=1 # => This Inner Loop Header: Depth=2 - ld.h $a5, $t0, 0 - ld.h $s1, $t0, -2 - vld $vr3, $a4, %pc_lo12(.LCPI15_0) - vinsgr2vr.h $vr4, $a5, 0 - vinsgr2vr.h $vr5, $s1, 0 - ld.h $a5, $s5, -2 - vshuf.b $vr4, $vr0, $vr4, $vr3 - vslli.d $vr4, $vr4, 56 - vsrai.d $vr4, $vr4, 56 - vinsgr2vr.h $vr6, $a5, 0 - ld.h $a5, $s5, 0 - vshuf.b $vr3, $vr0, $vr5, $vr3 - vslli.d $vr3, $vr3, 56 - vsrai.d $vr3, $vr3, 56 - vinsgr2vr.h $vr5, $a5, 0 - vilvl.b $vr6, $vr6, $vr6 - vilvl.h $vr6, $vr6, $vr6 - vilvl.w $vr6, $vr6, $vr6 - vslli.d $vr6, $vr6, 56 - vsrai.d $vr6, $vr6, 56 - vilvl.b $vr5, $vr5, $vr5 - vilvl.h $vr5, $vr5, $vr5 - vilvl.w $vr5, $vr5, $vr5 - vslli.d $vr5, $vr5, 56 - vsrai.d $vr5, $vr5, 56 - vmadd.d $vr1, $vr6, $vr4 - vmadd.d $vr2, $vr5, $vr3 - addi.d $t0, $t0, -4 - addi.d $s8, $s8, -4 - addi.d $s5, $s5, 4 - bnez $s8, .LBB15_8 + ld.h $a4, $fp, 0 + ld.h $s8, $fp, -2 + vinsgr2vr.h $vr3, $a4, 0 + vshuf4i.b $vr3, $vr3, 1 + vinsgr2vr.h $vr4, $s8, 0 + vshuf4i.b $vr4, $vr4, 1 + ld.h $a4, $a7, -2 + vsllwil.h.b $vr3, $vr3, 0 + vsllwil.w.h $vr3, $vr3, 0 + vsllwil.d.w $vr3, $vr3, 
0 + vinsgr2vr.h $vr5, $a4, 0 + ld.h $a4, $a7, 0 + vsllwil.h.b $vr4, $vr4, 0 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.d.w $vr4, $vr4, 0 + vinsgr2vr.h $vr6, $a4, 0 + vsllwil.h.b $vr5, $vr5, 0 + vsllwil.w.h $vr5, $vr5, 0 + vsllwil.d.w $vr5, $vr5, 0 + vsllwil.h.b $vr6, $vr6, 0 + vsllwil.w.h $vr6, $vr6, 0 + vsllwil.d.w $vr6, $vr6, 0 + vmadd.d $vr1, $vr5, $vr3 + vmadd.d $vr2, $vr6, $vr4 + addi.d $fp, $fp, -4 + addi.d $s7, $s7, -4 + addi.d $a7, $a7, 4 + bnez $s7, .LBB15_8 # %bb.9: # %middle.block # in Loop: Header=BB15_3 Depth=1 vadd.d $vr1, $vr2, $vr1 vhaddw.q.d $vr1, $vr1, $vr1 - vpickve2gr.d $a5, $vr1, 0 - beq $s7, $fp, .LBB15_2 + vpickve2gr.d $a4, $vr1, 0 + beq $s5, $a5, .LBB15_2 .LBB15_10: # %scalar.ph # in Loop: Header=BB15_3 Depth=1 - sub.d $a6, $fp, $a6 - addi.d $a6, $a6, 15 + sub.d $a5, $a5, $ra + addi.d $a5, $a5, 15 .p2align 4, , 16 .LBB15_11: # %.lr.ph # Parent Loop BB15_3 Depth=1 # => This Inner Loop Header: Depth=2 - ldx.b $t0, $s0, $ra - ldx.b $fp, $a2, $a6 - mul.d $t0, $fp, $t0 - add.d $a5, $t0, $a5 - bltu $ra, $t7, .LBB15_2 + ldx.b $a7, $s0, $t8 + ldx.b $fp, $a2, $a5 + mul.d $a7, $fp, $a7 + add.d $a4, $a7, $a4 + bltu $t8, $t6, .LBB15_2 # %bb.12: # %.lr.ph # in Loop: Header=BB15_11 Depth=2 - addi.d $t0, $a6, -15 - addi.d $ra, $ra, -1 - addi.d $a6, $a6, 1 - bltz $t0, .LBB15_11 + addi.d $a7, $a5, -15 + addi.d $t8, $t8, -1 + addi.d $a5, $a5, 1 + bltz $a7, .LBB15_11 b .LBB15_2 .LBB15_13: - ld.d $s1, $sp, 16 # 8-byte Folded Reload - ld.d $t2, $sp, 48 # 8-byte Folded Reload - lu12i.w $t8, 419430 - ld.d $t0, $sp, 8 # 8-byte Folded Reload + ld.d $t1, $sp, 48 # 8-byte Folded Reload + lu12i.w $t7, 419430 + ld.d $a7, $sp, 16 # 8-byte Folded Reload b .LBB15_15 .LBB15_14: - move $a5, $zero + move $a4, $zero move $s4, $zero - ld.d $t2, $sp, 48 # 8-byte Folded Reload - move $s6, $s7 + ld.d $t1, $sp, 48 # 8-byte Folded Reload + move $s1, $s7 .LBB15_15: # %.preheader - ld.d $a6, $sp, 24 # 8-byte Folded Reload - addi.w $a7, $a6, -1 - add.d $t0, $s2, $t0 - ld.d $t4, $sp, 40 # 8-byte Folded Reload - bge $s4, $a7, .LBB15_28 + ld.d $a5, $sp, 24 # 8-byte Folded Reload + addi.w $a6, $a5, -1 + add.d $a7, $s2, $a7 + ld.d $t3, $sp, 32 # 8-byte Folded Reload + bge $s4, $a6, .LBB15_28 # %bb.16: # %.lr.ph164 - move $t1, $zero + move $t0, $zero vrepli.b $vr0, 0 - add.d $a6, $t2, $s6 - addi.d $t2, $a6, -2 - sub.w $t3, $s4, $s3 - add.d $a6, $a6, $s0 - addi.d $a6, $a6, 13 - st.d $a6, $sp, 48 # 8-byte Folded Spill - add.d $a6, $t4, $s1 - addi.d $a6, $a6, 17 - st.d $a6, $sp, 40 # 8-byte Folded Spill - addi.w $t6, $zero, -1 - ori $t7, $zero, 16 + add.d $a5, $t1, $s1 + addi.d $t1, $a5, -2 + sub.w $t2, $s4, $s3 + add.d $a5, $a5, $s0 + addi.d $a5, $a5, 13 + st.d $a5, $sp, 48 # 8-byte Folded Spill + add.d $a5, $t3, $s6 + addi.d $t4, $a5, 17 + addi.w $t5, $zero, -1 + ori $t6, $zero, 16 ori $s8, $zero, 3 - ori $s1, $zero, 17 - ori $a6, $t8, 1639 - lu32i.d $a6, 419430 - lu52i.d $a6, $a6, 1638 - move $s5, $t3 + ori $t8, $zero, 17 + ori $a5, $t7, 1639 + lu32i.d $a5, 419430 + lu52i.d $a5, $a5, 1638 + move $s1, $t2 b .LBB15_18 .p2align 4, , 16 .LBB15_17: # %._crit_edge159 # in Loop: Header=BB15_18 Depth=1 - mulh.d $t4, $a5, $a6 - srli.d $t5, $t4, 63 - srai.d $t4, $t4, 2 - add.d $t4, $t4, $t5 - slli.d $t5, $t4, 3 - alsl.d $t5, $t4, $t5, 1 - sub.d $a5, $a5, $t5 - addi.d $t8, $t0, -1 - st.b $a5, $t0, 0 + mulh.d $t3, $a4, $a5 + srli.d $t7, $t3, 63 + srai.d $t3, $t3, 2 + add.d $t7, $t3, $t7 + slli.d $t3, $t7, 3 + alsl.d $t3, $t7, $t3, 1 + sub.d $a4, $a4, $t3 + addi.d $fp, $a7, -1 + st.b $a4, $a7, 0 addi.w $s4, $s4, 1 - addi.d $t1, $t1, 
1 - addi.w $s5, $s5, 1 - move $t0, $t8 - move $a5, $t4 - bge $s4, $a7, .LBB15_29 + addi.d $t0, $t0, 1 + addi.w $s1, $s1, 1 + move $a7, $fp + move $a4, $t7 + bge $s4, $a6, .LBB15_29 .LBB15_18: # =>This Loop Header: Depth=1 # Child Loop BB15_23 Depth 2 # Child Loop BB15_26 Depth 2 - sub.w $t4, $s4, $s3 - srai.d $t8, $t4, 31 - nor $t8, $t8, $zero - andn $t4, $t8, $t4 - add.d $s6, $a1, $t4 - blt $s6, $t7, .LBB15_17 + sub.w $t7, $s4, $s3 + srai.d $fp, $t7, 31 + nor $fp, $fp, $zero + andn $t7, $fp, $t7 + add.d $s5, $a1, $t7 + blt $s5, $t6, .LBB15_17 # %bb.19: # in Loop: Header=BB15_18 Depth=1 - slt $t4, $s4, $a3 - masknez $t8, $a3, $t4 - maskeqz $t4, $s4, $t4 - or $t4, $t4, $t8 - bltz $t4, .LBB15_17 + slt $t7, $s4, $a3 + masknez $fp, $a3, $t7 + maskeqz $t7, $s4, $t7 + or $t7, $t7, $fp + bltz $t7, .LBB15_17 # %bb.20: # %.lr.ph158.preheader # in Loop: Header=BB15_18 Depth=1 - bstrpick.d $s7, $t4, 31, 0 - add.w $t4, $t3, $t1 - slt $t8, $t6, $t4 - maskeqz $t4, $t4, $t8 - masknez $t8, $t6, $t8 - or $t4, $t4, $t8 - sub.w $t4, $zero, $t4 - add.d $t4, $t2, $t4 - sltu $t8, $t4, $s7 - masknez $fp, $s7, $t8 - maskeqz $t4, $t4, $t8 - or $t8, $t4, $fp - bgeu $t8, $s8, .LBB15_22 + bstrpick.d $s6, $t7, 31, 0 + add.w $t7, $t2, $t0 + slt $fp, $t5, $t7 + maskeqz $t7, $t7, $fp + masknez $fp, $t5, $fp + or $t7, $t7, $fp + sub.w $t7, $zero, $t7 + add.d $t7, $t1, $t7 + sltu $fp, $t7, $s6 + masknez $s7, $s6, $fp + maskeqz $t7, $t7, $fp + or $s7, $t7, $s7 + bgeu $s7, $s8, .LBB15_22 # %bb.21: # in Loop: Header=BB15_18 Depth=1 - move $fp, $zero + move $s7, $zero b .LBB15_25 .p2align 4, , 16 .LBB15_22: # %vector.ph201 # in Loop: Header=BB15_18 Depth=1 - slt $t4, $t6, $s5 - masknez $fp, $t6, $t4 - maskeqz $t4, $s5, $t4 - or $t4, $t4, $fp - sub.w $fp, $zero, $t4 - ld.d $t5, $sp, 48 # 8-byte Folded Reload - add.d $s8, $t5, $fp - ld.d $t5, $sp, 40 # 8-byte Folded Reload - sub.d $ra, $t5, $s7 - sub.d $t4, $t2, $t4 - sltu $fp, $t4, $s7 - masknez $t5, $s7, $fp - maskeqz $t4, $t4, $fp - or $t4, $t4, $t5 - addi.d $t4, $t4, 1 - bstrpick.d $t4, $t4, 32, 2 - slli.d $t4, $t4, 2 - addi.d $t8, $t8, 1 - bstrpick.d $t5, $t8, 32, 2 - slli.d $fp, $t5, 2 - sub.d $s6, $s6, $fp + slt $t7, $t5, $s1 + masknez $fp, $t5, $t7 + maskeqz $t7, $s1, $t7 + or $t7, $t7, $fp + sub.w $fp, $zero, $t7 + ld.d $t3, $sp, 48 # 8-byte Folded Reload + add.d $fp, $t3, $fp + sub.d $s8, $t4, $s6 + sub.d $t7, $t1, $t7 + sltu $ra, $t7, $s6 + masknez $t3, $s6, $ra + maskeqz $t7, $t7, $ra + or $t3, $t7, $t3 + addi.d $t3, $t3, 1 + bstrpick.d $t3, $t3, 32, 2 + slli.d $t7, $t3, 2 + addi.d $ra, $s7, 1 + bstrpick.d $t3, $ra, 32, 2 + slli.d $s7, $t3, 2 + sub.d $s5, $s5, $s7 vori.b $vr1, $vr0, 0 - vinsgr2vr.d $vr1, $a5, 0 + vinsgr2vr.d $vr1, $a4, 0 vori.b $vr2, $vr0, 0 .p2align 4, , 16 .LBB15_23: # %vector.body204 # Parent Loop BB15_18 Depth=1 # => This Inner Loop Header: Depth=2 - ld.h $a5, $s8, 0 - ld.h $t5, $s8, -2 - vld $vr3, $a4, %pc_lo12(.LCPI15_0) - vinsgr2vr.h $vr4, $a5, 0 - vinsgr2vr.h $vr5, $t5, 0 - ld.h $a5, $ra, -2 - vshuf.b $vr4, $vr0, $vr4, $vr3 - vslli.d $vr4, $vr4, 56 - vsrai.d $vr4, $vr4, 56 - vinsgr2vr.h $vr6, $a5, 0 - ld.h $a5, $ra, 0 - vshuf.b $vr3, $vr0, $vr5, $vr3 - vslli.d $vr3, $vr3, 56 - vsrai.d $vr3, $vr3, 56 - vinsgr2vr.h $vr5, $a5, 0 - vilvl.b $vr6, $vr6, $vr6 - vilvl.h $vr6, $vr6, $vr6 - vilvl.w $vr6, $vr6, $vr6 - vslli.d $vr6, $vr6, 56 - vsrai.d $vr6, $vr6, 56 - vilvl.b $vr5, $vr5, $vr5 - vilvl.h $vr5, $vr5, $vr5 - vilvl.w $vr5, $vr5, $vr5 - vslli.d $vr5, $vr5, 56 - vsrai.d $vr5, $vr5, 56 - vmadd.d $vr1, $vr6, $vr4 - vmadd.d $vr2, $vr5, $vr3 - 
addi.d $s8, $s8, -4 - addi.d $t4, $t4, -4 - addi.d $ra, $ra, 4 - bnez $t4, .LBB15_23 + ld.h $a4, $fp, 0 + ld.h $t3, $fp, -2 + vinsgr2vr.h $vr3, $a4, 0 + vshuf4i.b $vr3, $vr3, 1 + vinsgr2vr.h $vr4, $t3, 0 + vshuf4i.b $vr4, $vr4, 1 + ld.h $a4, $s8, -2 + vsllwil.h.b $vr3, $vr3, 0 + vsllwil.w.h $vr3, $vr3, 0 + vsllwil.d.w $vr3, $vr3, 0 + vinsgr2vr.h $vr5, $a4, 0 + ld.h $a4, $s8, 0 + vsllwil.h.b $vr4, $vr4, 0 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.d.w $vr4, $vr4, 0 + vinsgr2vr.h $vr6, $a4, 0 + vsllwil.h.b $vr5, $vr5, 0 + vsllwil.w.h $vr5, $vr5, 0 + vsllwil.d.w $vr5, $vr5, 0 + vsllwil.h.b $vr6, $vr6, 0 + vsllwil.w.h $vr6, $vr6, 0 + vsllwil.d.w $vr6, $vr6, 0 + vmadd.d $vr1, $vr5, $vr3 + vmadd.d $vr2, $vr6, $vr4 + addi.d $fp, $fp, -4 + addi.d $t7, $t7, -4 + addi.d $s8, $s8, 4 + bnez $t7, .LBB15_23 # %bb.24: # %middle.block217 # in Loop: Header=BB15_18 Depth=1 vadd.d $vr1, $vr2, $vr1 vhaddw.q.d $vr1, $vr1, $vr1 - vpickve2gr.d $a5, $vr1, 0 + vpickve2gr.d $a4, $vr1, 0 ori $s8, $zero, 3 - beq $t8, $fp, .LBB15_17 + beq $ra, $s7, .LBB15_17 .LBB15_25: # %scalar.ph199 # in Loop: Header=BB15_18 Depth=1 - sub.d $t4, $fp, $s7 - addi.d $t4, $t4, 15 + sub.d $t3, $s7, $s6 + addi.d $t7, $t3, 15 .p2align 4, , 16 .LBB15_26: # %.lr.ph158 # Parent Loop BB15_18 Depth=1 # => This Inner Loop Header: Depth=2 - ldx.b $t5, $s0, $s6 - ldx.b $t8, $a2, $t4 - mul.d $t5, $t8, $t5 - add.d $a5, $t5, $a5 - bltu $s6, $s1, .LBB15_17 + ldx.b $t3, $s0, $s5 + ldx.b $fp, $a2, $t7 + mul.d $t3, $fp, $t3 + add.d $a4, $t3, $a4 + bltu $s5, $t8, .LBB15_17 # %bb.27: # %.lr.ph158 # in Loop: Header=BB15_26 Depth=2 - addi.d $t5, $t4, -15 - addi.d $s6, $s6, -1 - addi.d $t4, $t4, 1 - bltz $t5, .LBB15_26 + addi.d $t3, $t7, -15 + addi.d $s5, $s5, -1 + addi.d $t7, $t7, 1 + bltz $t3, .LBB15_26 b .LBB15_17 .LBB15_28: - move $t4, $a5 - move $t8, $t0 + move $t7, $a4 + move $fp, $a7 .LBB15_29: # %._crit_edge165 - ld.d $s0, $sp, 32 # 8-byte Folded Reload + ld.d $s0, $sp, 40 # 8-byte Folded Reload ld.d $a1, $s0, 0 - st.b $t4, $t8, 0 + st.b $t7, $fp, 0 beqz $a1, .LBB15_31 # %bb.30: ld.w $a2, $a1, 12 diff --git a/results/MultiSource/Benchmarks/Trimaran/netbench-url/CMakeFiles/netbench-url.dir/search.s b/results/MultiSource/Benchmarks/Trimaran/netbench-url/CMakeFiles/netbench-url.dir/search.s index 4cba59f6..57193f49 100644 --- a/results/MultiSource/Benchmarks/Trimaran/netbench-url/CMakeFiles/netbench-url.dir/search.s +++ b/results/MultiSource/Benchmarks/Trimaran/netbench-url/CMakeFiles/netbench-url.dir/search.s @@ -96,37 +96,39 @@ calculate_bm_table: # @calculate_bm_table move $a1, $zero b .LBB0_7 .LBB0_4: # %vector.ph24 + pcalau12i $a1, %pc_hi20(.LCPI0_0) + vld $vr1, $a1, %pc_lo12(.LCPI0_0) bstrpick.d $a1, $fp, 30, 2 - pcalau12i $a2, %pc_hi20(.LCPI0_0) - vld $vr1, $a2, %pc_lo12(.LCPI0_0) slli.d $a1, $a1, 2 vrepli.b $vr2, -1 - vrepli.b $vr3, 0 move $a2, $s0 move $a3, $a1 .p2align 4, , 16 .LBB0_5: # %vector.body27 # =>This Inner Loop Header: Depth=1 ld.w $a4, $a2, 0 - vxor.v $vr4, $vr1, $vr2 - vadd.w $vr4, $vr0, $vr4 - vinsgr2vr.w $vr5, $a4, 0 - vilvl.b $vr5, $vr3, $vr5 - vilvl.h $vr5, $vr3, $vr5 - vilvh.w $vr6, $vr3, $vr5 - vilvl.w $vr5, $vr3, $vr5 - vpickve2gr.d $a4, $vr5, 0 + vxor.v $vr3, $vr1, $vr2 + vadd.w $vr3, $vr0, $vr3 + vinsgr2vr.w $vr4, $a4, 0 + vshuf4i.b $vr5, $vr4, 14 + vsllwil.hu.bu $vr5, $vr5, 0 + vsllwil.wu.hu $vr5, $vr5, 0 + vsllwil.du.wu $vr5, $vr5, 0 + vsllwil.hu.bu $vr4, $vr4, 0 + vsllwil.wu.hu $vr4, $vr4, 0 + vsllwil.du.wu $vr4, $vr4, 0 + vpickve2gr.d $a4, $vr4, 0 alsl.d $a4, $a4, $a0, 2 - vpickve2gr.d $a5, $vr5, 1 + vpickve2gr.d $a5, $vr4, 
1 alsl.d $a5, $a5, $a0, 2 - vpickve2gr.d $a6, $vr6, 0 + vpickve2gr.d $a6, $vr5, 0 alsl.d $a6, $a6, $a0, 2 - vpickve2gr.d $a7, $vr6, 1 + vpickve2gr.d $a7, $vr5, 1 alsl.d $a7, $a7, $a0, 2 - vstelm.w $vr4, $a4, 0, 0 - vstelm.w $vr4, $a5, 0, 1 - vstelm.w $vr4, $a6, 0, 2 - vstelm.w $vr4, $a7, 0, 3 + vstelm.w $vr3, $a4, 0, 0 + vstelm.w $vr3, $a5, 0, 1 + vstelm.w $vr3, $a6, 0, 2 + vstelm.w $vr3, $a7, 0, 3 vaddi.wu $vr1, $vr1, 4 addi.d $a3, $a3, -4 addi.d $a2, $a2, 4 @@ -590,37 +592,39 @@ NewPatternNode: # @NewPatternNode move $a1, $zero b .LBB3_9 .LBB3_6: # %vector.ph21 + pcalau12i $a1, %pc_hi20(.LCPI3_0) + vld $vr1, $a1, %pc_lo12(.LCPI3_0) bstrpick.d $a1, $s0, 30, 2 - pcalau12i $a2, %pc_hi20(.LCPI3_0) - vld $vr1, $a2, %pc_lo12(.LCPI3_0) slli.d $a1, $a1, 2 vrepli.b $vr2, -1 - vrepli.b $vr3, 0 move $a2, $s1 move $a3, $a1 .p2align 4, , 16 .LBB3_7: # %vector.body24 # =>This Inner Loop Header: Depth=1 ld.w $a4, $a2, 0 - vxor.v $vr4, $vr1, $vr2 - vadd.w $vr4, $vr0, $vr4 - vinsgr2vr.w $vr5, $a4, 0 - vilvl.b $vr5, $vr3, $vr5 - vilvl.h $vr5, $vr3, $vr5 - vilvh.w $vr6, $vr3, $vr5 - vilvl.w $vr5, $vr3, $vr5 - vpickve2gr.d $a4, $vr5, 0 + vxor.v $vr3, $vr1, $vr2 + vadd.w $vr3, $vr0, $vr3 + vinsgr2vr.w $vr4, $a4, 0 + vshuf4i.b $vr5, $vr4, 14 + vsllwil.hu.bu $vr5, $vr5, 0 + vsllwil.wu.hu $vr5, $vr5, 0 + vsllwil.du.wu $vr5, $vr5, 0 + vsllwil.hu.bu $vr4, $vr4, 0 + vsllwil.wu.hu $vr4, $vr4, 0 + vsllwil.du.wu $vr4, $vr4, 0 + vpickve2gr.d $a4, $vr4, 0 alsl.d $a4, $a4, $a0, 2 - vpickve2gr.d $a5, $vr5, 1 + vpickve2gr.d $a5, $vr4, 1 alsl.d $a5, $a5, $a0, 2 - vpickve2gr.d $a6, $vr6, 0 + vpickve2gr.d $a6, $vr5, 0 alsl.d $a6, $a6, $a0, 2 - vpickve2gr.d $a7, $vr6, 1 + vpickve2gr.d $a7, $vr5, 1 alsl.d $a7, $a7, $a0, 2 - vstelm.w $vr4, $a4, 0, 0 - vstelm.w $vr4, $a5, 0, 1 - vstelm.w $vr4, $a6, 0, 2 - vstelm.w $vr4, $a7, 0, 3 + vstelm.w $vr3, $a4, 0, 0 + vstelm.w $vr3, $a5, 0, 1 + vstelm.w $vr3, $a6, 0, 2 + vstelm.w $vr3, $a7, 0, 3 vaddi.wu $vr1, $vr1, 4 addi.d $a3, $a3, -4 addi.d $a2, $a2, 4 @@ -806,37 +810,39 @@ NewStrTreeNode: # @NewStrTreeNode move $a1, $zero b .LBB4_9 .LBB4_6: # %vector.ph17 + pcalau12i $a1, %pc_hi20(.LCPI4_0) + vld $vr1, $a1, %pc_lo12(.LCPI4_0) bstrpick.d $a1, $s0, 30, 2 - pcalau12i $a2, %pc_hi20(.LCPI4_0) - vld $vr1, $a2, %pc_lo12(.LCPI4_0) slli.d $a1, $a1, 2 vrepli.b $vr2, -1 - vrepli.b $vr3, 0 move $a2, $s1 move $a3, $a1 .p2align 4, , 16 .LBB4_7: # %vector.body20 # =>This Inner Loop Header: Depth=1 ld.w $a4, $a2, 0 - vxor.v $vr4, $vr1, $vr2 - vadd.w $vr4, $vr0, $vr4 - vinsgr2vr.w $vr5, $a4, 0 - vilvl.b $vr5, $vr3, $vr5 - vilvl.h $vr5, $vr3, $vr5 - vilvh.w $vr6, $vr3, $vr5 - vilvl.w $vr5, $vr3, $vr5 - vpickve2gr.d $a4, $vr5, 0 + vxor.v $vr3, $vr1, $vr2 + vadd.w $vr3, $vr0, $vr3 + vinsgr2vr.w $vr4, $a4, 0 + vshuf4i.b $vr5, $vr4, 14 + vsllwil.hu.bu $vr5, $vr5, 0 + vsllwil.wu.hu $vr5, $vr5, 0 + vsllwil.du.wu $vr5, $vr5, 0 + vsllwil.hu.bu $vr4, $vr4, 0 + vsllwil.wu.hu $vr4, $vr4, 0 + vsllwil.du.wu $vr4, $vr4, 0 + vpickve2gr.d $a4, $vr4, 0 alsl.d $a4, $a4, $a0, 2 - vpickve2gr.d $a5, $vr5, 1 + vpickve2gr.d $a5, $vr4, 1 alsl.d $a5, $a5, $a0, 2 - vpickve2gr.d $a6, $vr6, 0 + vpickve2gr.d $a6, $vr5, 0 alsl.d $a6, $a6, $a0, 2 - vpickve2gr.d $a7, $vr6, 1 + vpickve2gr.d $a7, $vr5, 1 alsl.d $a7, $a7, $a0, 2 - vstelm.w $vr4, $a4, 0, 0 - vstelm.w $vr4, $a5, 0, 1 - vstelm.w $vr4, $a6, 0, 2 - vstelm.w $vr4, $a7, 0, 3 + vstelm.w $vr3, $a4, 0, 0 + vstelm.w $vr3, $a5, 0, 1 + vstelm.w $vr3, $a6, 0, 2 + vstelm.w $vr3, $a7, 0, 3 vaddi.wu $vr1, $vr1, 4 addi.d $a3, $a3, -4 addi.d $a2, $a2, 4 @@ -1014,37 +1020,39 @@ find_lcs: 
# @find_lcs move $a1, $zero b .LBB5_10 .LBB5_7: # %vector.ph146 + pcalau12i $a1, %pc_hi20(.LCPI5_0) + vld $vr1, $a1, %pc_lo12(.LCPI5_0) bstrpick.d $a1, $s3, 30, 2 - pcalau12i $a2, %pc_hi20(.LCPI5_0) - vld $vr1, $a2, %pc_lo12(.LCPI5_0) slli.d $a1, $a1, 2 vrepli.b $vr2, -1 - vrepli.b $vr3, 0 move $a2, $s4 move $a3, $a1 .p2align 4, , 16 .LBB5_8: # %vector.body149 # =>This Inner Loop Header: Depth=1 ld.w $a4, $a2, 0 - vxor.v $vr4, $vr1, $vr2 - vadd.w $vr4, $vr0, $vr4 - vinsgr2vr.w $vr5, $a4, 0 - vilvl.b $vr5, $vr3, $vr5 - vilvl.h $vr5, $vr3, $vr5 - vilvh.w $vr6, $vr3, $vr5 - vilvl.w $vr5, $vr3, $vr5 - vpickve2gr.d $a4, $vr5, 0 + vxor.v $vr3, $vr1, $vr2 + vadd.w $vr3, $vr0, $vr3 + vinsgr2vr.w $vr4, $a4, 0 + vshuf4i.b $vr5, $vr4, 14 + vsllwil.hu.bu $vr5, $vr5, 0 + vsllwil.wu.hu $vr5, $vr5, 0 + vsllwil.du.wu $vr5, $vr5, 0 + vsllwil.hu.bu $vr4, $vr4, 0 + vsllwil.wu.hu $vr4, $vr4, 0 + vsllwil.du.wu $vr4, $vr4, 0 + vpickve2gr.d $a4, $vr4, 0 alsl.d $a4, $a4, $a0, 2 - vpickve2gr.d $a5, $vr5, 1 + vpickve2gr.d $a5, $vr4, 1 alsl.d $a5, $a5, $a0, 2 - vpickve2gr.d $a6, $vr6, 0 + vpickve2gr.d $a6, $vr5, 0 alsl.d $a6, $a6, $a0, 2 - vpickve2gr.d $a7, $vr6, 1 + vpickve2gr.d $a7, $vr5, 1 alsl.d $a7, $a7, $a0, 2 - vstelm.w $vr4, $a4, 0, 0 - vstelm.w $vr4, $a5, 0, 1 - vstelm.w $vr4, $a6, 0, 2 - vstelm.w $vr4, $a7, 0, 3 + vstelm.w $vr3, $a4, 0, 0 + vstelm.w $vr3, $a5, 0, 1 + vstelm.w $vr3, $a6, 0, 2 + vstelm.w $vr3, $a7, 0, 3 vaddi.wu $vr1, $vr1, 4 addi.d $a3, $a3, -4 addi.d $a2, $a2, 4 diff --git a/results/MultiSource/Benchmarks/Trimaran/netbench-url/CMakeFiles/netbench-url.dir/url.s b/results/MultiSource/Benchmarks/Trimaran/netbench-url/CMakeFiles/netbench-url.dir/url.s index effaa37a..bc458a66 100644 --- a/results/MultiSource/Benchmarks/Trimaran/netbench-url/CMakeFiles/netbench-url.dir/url.s +++ b/results/MultiSource/Benchmarks/Trimaran/netbench-url/CMakeFiles/netbench-url.dir/url.s @@ -32,27 +32,26 @@ internet_checksum: # @internet_checksum vrepli.b $vr1, -1 move $a3, $a5 vori.b $vr2, $vr0, 0 - vori.b $vr3, $vr0, 0 .p2align 4, , 16 .LBB0_5: # %vector.body # =>This Inner Loop Header: Depth=1 ld.w $a6, $a0, -4 ld.w $a7, $a0, 0 - vinsgr2vr.w $vr4, $a6, 0 - vinsgr2vr.w $vr5, $a7, 0 - vilvl.h $vr4, $vr0, $vr4 - vilvl.w $vr4, $vr0, $vr4 - vilvl.h $vr5, $vr0, $vr5 - vilvl.w $vr5, $vr0, $vr5 + vinsgr2vr.w $vr3, $a6, 0 + vinsgr2vr.w $vr4, $a7, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vsllwil.du.wu $vr3, $vr3, 0 + vsllwil.wu.hu $vr4, $vr4, 0 + vsllwil.du.wu $vr4, $vr4, 0 + vxor.v $vr3, $vr3, $vr1 vxor.v $vr4, $vr4, $vr1 - vxor.v $vr5, $vr5, $vr1 + vadd.d $vr0, $vr0, $vr3 vadd.d $vr2, $vr2, $vr4 - vadd.d $vr3, $vr3, $vr5 addi.d $a3, $a3, -4 addi.d $a0, $a0, 8 bnez $a3, .LBB0_5 # %bb.6: # %middle.block - vadd.d $vr0, $vr3, $vr2 + vadd.d $vr0, $vr2, $vr0 vhaddw.q.d $vr0, $vr0, $vr0 vpickve2gr.d $a3, $vr0, 0 beq $a5, $a4, .LBB0_9 diff --git a/results/MultiSource/Benchmarks/VersaBench/8b10b/CMakeFiles/8b10b.dir/calc.s b/results/MultiSource/Benchmarks/VersaBench/8b10b/CMakeFiles/8b10b.dir/calc.s index 91860540..8436c3fe 100644 --- a/results/MultiSource/Benchmarks/VersaBench/8b10b/CMakeFiles/8b10b.dir/calc.s +++ b/results/MultiSource/Benchmarks/VersaBench/8b10b/CMakeFiles/8b10b.dir/calc.s @@ -132,12 +132,11 @@ bigTableSetup: # @bigTableSetup vrepli.w $vr2, 7 pcalau12i $a1, %pc_hi20(lookupTable5B) addi.d $a1, $a1, %pc_lo12(lookupTable5B) - vrepli.b $vr3, 0 pcalau12i $a2, %pc_hi20(lookupTable3B) addi.d $a2, $a2, %pc_lo12(lookupTable3B) lu12i.w $a3, 16 - vreplgr2vr.w $vr4, $a3 - vrepli.w $vr5, 29 + vreplgr2vr.w $vr3, $a3 + vrepli.w 
$vr4, 29 pcalau12i $a4, %pc_hi20(bigTable) addi.d $a4, $a4, %pc_lo12(bigTable) move $a5, $zero @@ -145,73 +144,74 @@ bigTableSetup: # @bigTableSetup .p2align 4, , 16 .LBB4_1: # %vector.body # =>This Inner Loop Header: Depth=1 - vsrli.w $vr6, $vr0, 9 - vsrli.w $vr7, $vr0, 8 - vand.v $vr8, $vr7, $vr1 + vsrli.w $vr5, $vr0, 9 + vsrli.w $vr6, $vr0, 8 + vand.v $vr7, $vr6, $vr1 srli.d $a7, $a5, 3 - vpickve2gr.w $t0, $vr7, 0 + vpickve2gr.w $t0, $vr6, 0 bstrins.d $a7, $t0, 63, 5 - vand.v $vr7, $vr0, $vr2 - vslli.w $vr8, $vr8, 3 - vor.v $vr7, $vr8, $vr7 + vand.v $vr6, $vr0, $vr2 + vslli.w $vr7, $vr7, 3 + vor.v $vr6, $vr7, $vr6 andi $a7, $a7, 63 slli.d $a7, $a7, 2 ldx.wu $a7, $a1, $a7 - vilvh.w $vr8, $vr3, $vr7 - vilvl.w $vr7, $vr3, $vr7 - vpickve2gr.d $t0, $vr7, 0 + vshuf4i.w $vr7, $vr6, 14 + vsllwil.du.wu $vr7, $vr7, 0 + vsllwil.du.wu $vr6, $vr6, 0 + vpickve2gr.d $t0, $vr6, 0 slli.d $t0, $t0, 2 - vpickve2gr.d $t1, $vr7, 1 + vpickve2gr.d $t1, $vr6, 1 slli.d $t1, $t1, 2 - vpickve2gr.d $t2, $vr8, 0 + vpickve2gr.d $t2, $vr7, 0 slli.d $t2, $t2, 2 - vpickve2gr.d $t3, $vr8, 1 + vpickve2gr.d $t3, $vr7, 1 slli.d $t3, $t3, 2 ldx.w $t0, $a2, $t0 ldx.w $t1, $a2, $t1 ldx.w $t2, $a2, $t2 ldx.w $t3, $a2, $t3 - vinsgr2vr.w $vr7, $t0, 0 - vinsgr2vr.w $vr7, $t1, 1 - vinsgr2vr.w $vr7, $t2, 2 - vinsgr2vr.w $vr7, $t3, 3 + vinsgr2vr.w $vr6, $t0, 0 + vinsgr2vr.w $vr6, $t1, 1 + vinsgr2vr.w $vr6, $t2, 2 + vinsgr2vr.w $vr6, $t3, 3 and $t0, $a7, $a3 sltui $t0, $t0, 1 - vreplgr2vr.w $vr8, $t0 - vslli.w $vr8, $vr8, 31 - vsrai.w $vr8, $vr8, 31 + vreplgr2vr.w $vr7, $t0 + vslli.w $vr7, $vr7, 31 + vsrai.w $vr7, $vr7, 31 andi $t0, $a7, 994 - vreplgr2vr.w $vr9, $t0 + vreplgr2vr.w $vr8, $t0 bstrpick.d $t1, $a7, 18, 18 - vreplgr2vr.w $vr10, $t1 - vseq.w $vr10, $vr6, $vr10 + vreplgr2vr.w $vr9, $t1 + vseq.w $vr9, $vr5, $vr9 xori $t0, $t0, 994 - vreplgr2vr.w $vr11, $t0 - vbitsel.v $vr10, $vr11, $vr9, $vr10 + vreplgr2vr.w $vr10, $t0 + vbitsel.v $vr9, $vr10, $vr8, $vr9 srli.d $a7, $a7, 19 andi $a7, $a7, 1 - vreplgr2vr.w $vr11, $a7 - vand.v $vr11, $vr8, $vr11 - vxor.v $vr6, $vr6, $vr11 + vreplgr2vr.w $vr10, $a7 + vand.v $vr10, $vr7, $vr10 + vxor.v $vr5, $vr5, $vr10 + vbitsel.v $vr7, $vr8, $vr9, $vr7 + vand.v $vr8, $vr6, $vr3 + vseqi.w $vr8, $vr8, 0 + vand.v $vr9, $vr6, $vr4 + vsrli.w $vr10, $vr6, 18 + vand.v $vr10, $vr10, $vr1 + vseq.w $vr10, $vr5, $vr10 + vxor.v $vr11, $vr9, $vr4 + vbitsel.v $vr10, $vr11, $vr9, $vr10 + vsrli.w $vr6, $vr6, 19 + vand.v $vr6, $vr6, $vr8 + vand.v $vr6, $vr6, $vr1 + vxor.v $vr6, $vr5, $vr6 vbitsel.v $vr8, $vr9, $vr10, $vr8 - vand.v $vr9, $vr7, $vr4 - vseqi.w $vr9, $vr9, 0 - vand.v $vr10, $vr7, $vr5 - vsrli.w $vr11, $vr7, 18 - vand.v $vr11, $vr11, $vr1 - vseq.w $vr11, $vr6, $vr11 - vxor.v $vr12, $vr10, $vr5 - vbitsel.v $vr11, $vr12, $vr10, $vr11 - vsrli.w $vr7, $vr7, 19 - vand.v $vr7, $vr7, $vr9 - vand.v $vr7, $vr7, $vr1 - vxor.v $vr7, $vr6, $vr7 - vbitsel.v $vr9, $vr10, $vr11, $vr9 - vor.v $vr8, $vr8, $vr9 - vslli.w $vr7, $vr7, 16 - vor.v $vr7, $vr8, $vr7 + vor.v $vr7, $vr7, $vr8 + vslli.w $vr6, $vr6, 16 + vor.v $vr6, $vr7, $vr6 add.d $a7, $a4, $a0 - vstx $vr7, $a7, $a6 + vstx $vr6, $a7, $a6 vaddi.wu $vr0, $vr0, 4 addi.d $a0, $a0, 16 addi.w $a5, $a5, 4 @@ -219,7 +219,7 @@ bigTableSetup: # @bigTableSetup # %bb.2: # %middle.block pcalau12i $a0, %pc_hi20(disparity1) addi.d $a0, $a0, %pc_lo12(disparity1) - vstelm.w $vr6, $a0, 0, 3 + vstelm.w $vr5, $a0, 0, 3 pcalau12i $a0, %pc_hi20(disparity0) st.w $zero, $a0, %pc_lo12(disparity0) ret diff --git 
a/results/MultiSource/Benchmarks/mafft/CMakeFiles/pairlocalalign.dir/Falign.s b/results/MultiSource/Benchmarks/mafft/CMakeFiles/pairlocalalign.dir/Falign.s index d9694549..1b881e55 100644 --- a/results/MultiSource/Benchmarks/mafft/CMakeFiles/pairlocalalign.dir/Falign.s +++ b/results/MultiSource/Benchmarks/mafft/CMakeFiles/pairlocalalign.dir/Falign.s @@ -1227,12 +1227,8 @@ Fgetlag: # @Fgetlag # =>This Inner Loop Header: Depth=1 vsub.w $vr2, $vr0, $vr1 vsubi.wu $vr3, $vr2, 2 - vshuf4i.w $vr2, $vr2, 16 - vslli.d $vr2, $vr2, 32 - vsrai.d $vr2, $vr2, 32 - vshuf4i.w $vr3, $vr3, 16 - vslli.d $vr3, $vr3, 32 - vsrai.d $vr3, $vr3, 32 + vsllwil.d.w $vr2, $vr2, 0 + vsllwil.d.w $vr3, $vr3, 0 vpickve2gr.d $a7, $vr2, 0 slli.d $a7, $a7, 4 vpickve2gr.d $t0, $vr2, 1 @@ -4253,12 +4249,8 @@ Falign: # @Falign # =>This Inner Loop Header: Depth=1 vsub.w $vr2, $vr0, $vr1 vsubi.wu $vr3, $vr2, 2 - vshuf4i.w $vr2, $vr2, 16 - vslli.d $vr2, $vr2, 32 - vsrai.d $vr2, $vr2, 32 - vshuf4i.w $vr3, $vr3, 16 - vslli.d $vr3, $vr3, 32 - vsrai.d $vr3, $vr3, 32 + vsllwil.d.w $vr2, $vr2, 0 + vsllwil.d.w $vr3, $vr3, 0 vpickve2gr.d $a7, $vr2, 0 slli.d $a7, $a7, 4 vpickve2gr.d $t0, $vr2, 1 @@ -6489,12 +6481,8 @@ Falign_noudp: # @Falign_noudp # =>This Inner Loop Header: Depth=1 vsub.w $vr2, $vr0, $vr1 vsubi.wu $vr3, $vr2, 2 - vshuf4i.w $vr2, $vr2, 16 - vslli.d $vr2, $vr2, 32 - vsrai.d $vr2, $vr2, 32 - vshuf4i.w $vr3, $vr3, 16 - vslli.d $vr3, $vr3, 32 - vsrai.d $vr3, $vr3, 32 + vsllwil.d.w $vr2, $vr2, 0 + vsllwil.d.w $vr3, $vr3, 0 vpickve2gr.d $a7, $vr2, 0 slli.d $a7, $a7, 4 vpickve2gr.d $t0, $vr2, 1 @@ -8813,12 +8801,8 @@ Falign_udpari_long: # @Falign_udpari_long # =>This Inner Loop Header: Depth=1 vsub.w $vr2, $vr0, $vr1 vsubi.wu $vr3, $vr2, 2 - vshuf4i.w $vr2, $vr2, 16 - vslli.d $vr2, $vr2, 32 - vsrai.d $vr2, $vr2, 32 - vshuf4i.w $vr3, $vr3, 16 - vslli.d $vr3, $vr3, 32 - vsrai.d $vr3, $vr3, 32 + vsllwil.d.w $vr2, $vr2, 0 + vsllwil.d.w $vr3, $vr3, 0 vpickve2gr.d $a7, $vr2, 0 slli.d $a7, $a7, 4 vpickve2gr.d $t0, $vr2, 1 diff --git a/results/MultiSource/Benchmarks/mediabench/gsm/toast/CMakeFiles/toast.dir/gsm_decode.s b/results/MultiSource/Benchmarks/mediabench/gsm/toast/CMakeFiles/toast.dir/gsm_decode.s index d8fffb79..ae640eae 100644 --- a/results/MultiSource/Benchmarks/mediabench/gsm/toast/CMakeFiles/toast.dir/gsm_decode.s +++ b/results/MultiSource/Benchmarks/mediabench/gsm/toast/CMakeFiles/toast.dir/gsm_decode.s @@ -62,17 +62,16 @@ gsm_decode: # @gsm_decode bstrpick.d $t3, $a2, 4, 2 srli.d $t4, $t1, 7 bstrins.d $t4, $a2, 2, 1 - vinsgr2vr.b $vr1, $a3, 0 - vinsgr2vr.b $vr1, $a5, 1 - vinsgr2vr.b $vr1, $a6, 2 - vinsgr2vr.b $vr1, $t0, 3 - vinsgr2vr.b $vr1, $a4, 4 - vinsgr2vr.b $vr1, $t2, 5 - vinsgr2vr.b $vr1, $t3, 6 - vinsgr2vr.b $vr1, $t4, 7 - vrepli.b $vr0, 0 - vilvl.b $vr1, $vr0, $vr1 - vst $vr1, $sp, 16 + vinsgr2vr.b $vr0, $a3, 0 + vinsgr2vr.b $vr0, $a5, 1 + vinsgr2vr.b $vr0, $a6, 2 + vinsgr2vr.b $vr0, $t0, 3 + vinsgr2vr.b $vr0, $a4, 4 + vinsgr2vr.b $vr0, $t2, 5 + vinsgr2vr.b $vr0, $t3, 6 + vinsgr2vr.b $vr0, $t4, 7 + vsllwil.hu.bu $vr0, $vr0, 0 + vst $vr0, $sp, 16 bstrpick.d $a2, $t1, 6, 4 st.h $a2, $sp, 32 ld.bu $a2, $a1, 12 @@ -94,28 +93,28 @@ gsm_decode: # @gsm_decode srli.d $a4, $a3, 6 bstrins.d $a4, $t1, 2, 2 st.h $a4, $sp, 36 - vinsgr2vr.b $vr1, $a3, 0 - vinsgr2vr.b $vr1, $a2, 1 - vshuf4i.b $vr2, $vr1, 80 + vinsgr2vr.b $vr0, $a3, 0 + vinsgr2vr.b $vr0, $a2, 1 + vshuf4i.b $vr1, $vr0, 80 lu12i.w $a3, 4160 ori $a3, $a3, 3 ld.bu $a4, $a1, 15 - vreplgr2vr.w $vr1, $a3 - vsrl.b $vr2, $vr2, $vr1 - vandi.b $vr2, $vr2, 7 + vreplgr2vr.w 
$vr0, $a3 + vsrl.b $vr1, $vr1, $vr0 + vandi.b $vr1, $vr1, 7 srli.d $a3, $a4, 6 ld.bu $a5, $a1, 16 bstrins.d $a3, $a2, 2, 2 bstrpick.d $a2, $a4, 5, 3 andi $a4, $a4, 7 srli.d $a6, $a5, 5 - vinsgr2vr.b $vr2, $a3, 4 - vinsgr2vr.b $vr2, $a2, 5 - vinsgr2vr.b $vr2, $a4, 6 - vinsgr2vr.b $vr2, $a6, 7 - vilvl.b $vr2, $vr0, $vr2 + vinsgr2vr.b $vr1, $a3, 4 + vinsgr2vr.b $vr1, $a2, 5 + vinsgr2vr.b $vr1, $a4, 6 + vinsgr2vr.b $vr1, $a6, 7 + vsllwil.hu.bu $vr1, $vr1, 0 ld.bu $a2, $a1, 17 - vst $vr2, $sp, 38 + vst $vr1, $sp, 38 bstrpick.d $a3, $a5, 4, 2 st.h $a3, $sp, 54 srli.d $a3, $a2, 7 @@ -142,25 +141,25 @@ gsm_decode: # @gsm_decode srli.d $a5, $a4, 6 bstrins.d $a5, $a2, 2, 2 st.h $a5, $sp, 62 - vinsgr2vr.b $vr2, $a4, 0 - vinsgr2vr.b $vr2, $a3, 1 + vinsgr2vr.b $vr1, $a4, 0 + vinsgr2vr.b $vr1, $a3, 1 ld.bu $a2, $a1, 22 - vshuf4i.b $vr2, $vr2, 80 - vsrl.b $vr2, $vr2, $vr1 - vandi.b $vr2, $vr2, 7 + vshuf4i.b $vr1, $vr1, 80 + vsrl.b $vr1, $vr1, $vr0 + vandi.b $vr1, $vr1, 7 srli.d $a4, $a2, 6 ld.bu $a5, $a1, 23 bstrins.d $a4, $a3, 2, 2 bstrpick.d $a3, $a2, 5, 3 andi $a2, $a2, 7 srli.d $a6, $a5, 5 - vinsgr2vr.b $vr2, $a4, 4 - vinsgr2vr.b $vr2, $a3, 5 - vinsgr2vr.b $vr2, $a2, 6 - vinsgr2vr.b $vr2, $a6, 7 - vilvl.b $vr2, $vr0, $vr2 + vinsgr2vr.b $vr1, $a4, 4 + vinsgr2vr.b $vr1, $a3, 5 + vinsgr2vr.b $vr1, $a2, 6 + vinsgr2vr.b $vr1, $a6, 7 + vsllwil.hu.bu $vr1, $vr1, 0 ld.bu $a2, $a1, 24 - vst $vr2, $sp, 64 + vst $vr1, $sp, 64 bstrpick.d $a3, $a5, 4, 2 st.h $a3, $sp, 80 srli.d $a3, $a2, 7 @@ -187,23 +186,23 @@ gsm_decode: # @gsm_decode srli.d $a5, $a4, 6 bstrins.d $a5, $a2, 2, 2 st.h $a5, $sp, 88 - vinsgr2vr.b $vr2, $a4, 0 - vinsgr2vr.b $vr2, $a3, 1 + vinsgr2vr.b $vr1, $a4, 0 + vinsgr2vr.b $vr1, $a3, 1 ld.bu $a2, $a1, 29 - vshuf4i.b $vr2, $vr2, 80 - vsrl.b $vr1, $vr2, $vr1 - vandi.b $vr1, $vr1, 7 + vshuf4i.b $vr1, $vr1, 80 + vsrl.b $vr0, $vr1, $vr0 + vandi.b $vr0, $vr0, 7 srli.d $a4, $a2, 6 ld.bu $a5, $a1, 30 bstrins.d $a4, $a3, 2, 2 bstrpick.d $a3, $a2, 5, 3 andi $a2, $a2, 7 srli.d $a6, $a5, 5 - vinsgr2vr.b $vr1, $a4, 4 - vinsgr2vr.b $vr1, $a3, 5 - vinsgr2vr.b $vr1, $a2, 6 - vinsgr2vr.b $vr1, $a6, 7 - vilvl.b $vr0, $vr0, $vr1 + vinsgr2vr.b $vr0, $a4, 4 + vinsgr2vr.b $vr0, $a3, 5 + vinsgr2vr.b $vr0, $a2, 6 + vinsgr2vr.b $vr0, $a6, 7 + vsllwil.hu.bu $vr0, $vr0, 0 ld.bu $a2, $a1, 31 vst $vr0, $sp, 90 bstrpick.d $a3, $a5, 4, 2 diff --git a/results/MultiSource/Benchmarks/mediabench/gsm/toast/CMakeFiles/toast.dir/long_term.s b/results/MultiSource/Benchmarks/mediabench/gsm/toast/CMakeFiles/toast.dir/long_term.s index e2ea151f..604d9999 100644 --- a/results/MultiSource/Benchmarks/mediabench/gsm/toast/CMakeFiles/toast.dir/long_term.s +++ b/results/MultiSource/Benchmarks/mediabench/gsm/toast/CMakeFiles/toast.dir/long_term.s @@ -5,26 +5,23 @@ .type Gsm_Long_Term_Predictor,@function Gsm_Long_Term_Predictor: # @Gsm_Long_Term_Predictor # %bb.0: # %iter.check - addi.d $sp, $sp, -176 - st.d $ra, $sp, 168 # 8-byte Folded Spill - st.d $fp, $sp, 160 # 8-byte Folded Spill - st.d $s0, $sp, 152 # 8-byte Folded Spill - st.d $s1, $sp, 144 # 8-byte Folded Spill - st.d $s2, $sp, 136 # 8-byte Folded Spill - st.d $s3, $sp, 128 # 8-byte Folded Spill - st.d $s4, $sp, 120 # 8-byte Folded Spill - st.d $s5, $sp, 112 # 8-byte Folded Spill - st.d $s6, $sp, 104 # 8-byte Folded Spill - st.d $s7, $sp, 96 # 8-byte Folded Spill - st.d $s8, $sp, 88 # 8-byte Folded Spill - fst.d $fs0, $sp, 80 # 8-byte Folded Spill - fst.d $fs1, $sp, 72 # 8-byte Folded Spill - fst.d $fs2, $sp, 64 # 8-byte Folded Spill - fst.d $fs3, $sp, 56 # 8-byte Folded Spill - fst.d 
$fs4, $sp, 48 # 8-byte Folded Spill - fst.d $fs5, $sp, 40 # 8-byte Folded Spill - fst.d $fs6, $sp, 32 # 8-byte Folded Spill - fst.d $fs7, $sp, 24 # 8-byte Folded Spill + addi.d $sp, $sp, -144 + st.d $ra, $sp, 136 # 8-byte Folded Spill + st.d $fp, $sp, 128 # 8-byte Folded Spill + st.d $s0, $sp, 120 # 8-byte Folded Spill + st.d $s1, $sp, 112 # 8-byte Folded Spill + st.d $s2, $sp, 104 # 8-byte Folded Spill + st.d $s3, $sp, 96 # 8-byte Folded Spill + st.d $s4, $sp, 88 # 8-byte Folded Spill + st.d $s5, $sp, 80 # 8-byte Folded Spill + st.d $s6, $sp, 72 # 8-byte Folded Spill + st.d $s7, $sp, 64 # 8-byte Folded Spill + st.d $s8, $sp, 56 # 8-byte Folded Spill + fst.d $fs0, $sp, 48 # 8-byte Folded Spill + fst.d $fs1, $sp, 40 # 8-byte Folded Spill + fst.d $fs2, $sp, 32 # 8-byte Folded Spill + fst.d $fs3, $sp, 24 # 8-byte Folded Spill + fst.d $fs4, $sp, 16 # 8-byte Folded Spill move $fp, $a1 vld $vr0, $a1, 16 vslti.h $vr1, $vr0, 0 @@ -82,7 +79,7 @@ Gsm_Long_Term_Predictor: # @Gsm_Long_Term_Predictor vmax.h $vr0, $vr1, $vr0 vpickve2gr.h $a0, $vr0, 0 bstrpick.d $a0, $a0, 15, 0 - st.d $a6, $sp, 16 # 8-byte Folded Spill + st.d $a6, $sp, 8 # 8-byte Folded Spill move $s3, $a5 move $s1, $a4 move $s0, $a3 @@ -106,219 +103,159 @@ Gsm_Long_Term_Predictor: # @Gsm_Long_Term_Predictor .LBB0_3: # %.thread.i move $a1, $zero vld $vr0, $fp, 48 - vld $vr1, $fp, 0 + vld $vr1, $fp, 32 vld $vr2, $fp, 16 - vld $vr3, $fp, 32 - vilvh.h $vr4, $vr0, $vr0 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr0, $vr0, $vr0 - vslli.w $vr0, $vr0, 16 - vsrai.w $vr0, $vr0, 16 - vilvh.h $vr5, $vr3, $vr3 - vslli.w $vr5, $vr5, 16 - vsrai.w $vr5, $vr5, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr6, $vr2, $vr2 - vslli.w $vr6, $vr6, 16 - vsrai.w $vr6, $vr6, 16 - vilvl.h $vr2, $vr2, $vr2 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvh.h $vr7, $vr1, $vr1 - vslli.w $vr7, $vr7, 16 - vsrai.w $vr7, $vr7, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vld $vr3, $fp, 0 + vsllwil.w.h $vr4, $vr0, 0 + vsllwil.w.h $vr5, $vr1, 0 + vsllwil.w.h $vr6, $vr2, 0 + vsllwil.w.h $vr7, $vr3, 0 + vbsrl.v $vr0, $vr0, 8 + vsllwil.w.h $vr0, $vr0, 0 + vbsrl.v $vr1, $vr1, 8 + vsllwil.w.h $vr1, $vr1, 0 + vbsrl.v $vr2, $vr2, 8 + vsllwil.w.h $vr2, $vr2, 0 + vbsrl.v $vr3, $vr3, 8 + vsllwil.w.h $vr3, $vr3, 0 vreplgr2vr.w $vr16, $a0 - vsra.w $vr15, $vr1, $vr16 - vsra.w $vr13, $vr7, $vr16 - vsra.w $vr11, $vr2, $vr16 + vsra.w $vr15, $vr3, $vr16 + vsra.w $vr14, $vr2, $vr16 + vsra.w $vr13, $vr1, $vr16 + vsra.w $vr12, $vr0, $vr16 + vsra.w $vr11, $vr7, $vr16 vsra.w $vr9, $vr6, $vr16 - vsra.w $vr7, $vr3, $vr16 - vsra.w $vr5, $vr5, $vr16 - vsra.w $vr3, $vr0, $vr16 - vsra.w $vr1, $vr4, $vr16 - vshuf4i.w $vr0, $vr1, 50 - vslli.d $vr0, $vr0, 32 - vsrai.d $vr0, $vr0, 32 - vshuf4i.w $vr1, $vr1, 16 - vslli.d $vr1, $vr1, 32 - vsrai.d $vr1, $vr1, 32 - vshuf4i.w $vr2, $vr3, 50 - vslli.d $vr2, $vr2, 32 - vsrai.d $vr2, $vr2, 32 - vshuf4i.w $vr3, $vr3, 16 - vslli.d $vr3, $vr3, 32 - vsrai.d $vr3, $vr3, 32 - vshuf4i.w $vr4, $vr5, 50 - vslli.d $vr4, $vr4, 32 - vsrai.d $vr4, $vr4, 32 - vshuf4i.w $vr5, $vr5, 16 - vslli.d $vr5, $vr5, 32 - vsrai.d $vr5, $vr5, 32 - vshuf4i.w $vr6, $vr7, 50 - vslli.d $vr6, $vr6, 32 - vsrai.d $vr6, $vr6, 32 - vshuf4i.w $vr7, $vr7, 16 - vslli.d $vr7, $vr7, 32 - vsrai.d $vr7, $vr7, 32 - vshuf4i.w $vr8, $vr9, 50 - vslli.d $vr8, $vr8, 32 - vsrai.d $vr8, $vr8, 32 - vshuf4i.w $vr9, $vr9, 16 - vslli.d $vr9, $vr9, 32 - vsrai.d $vr9, $vr9, 32 - vshuf4i.w $vr10, $vr11, 50 - 
vslli.d $vr10, $vr10, 32 - vsrai.d $vr10, $vr10, 32 - vshuf4i.w $vr11, $vr11, 16 - vslli.d $vr11, $vr11, 32 - vsrai.d $vr11, $vr11, 32 - vshuf4i.w $vr12, $vr13, 50 - vslli.d $vr12, $vr12, 32 - vsrai.d $vr12, $vr12, 32 - vshuf4i.w $vr13, $vr13, 16 - vslli.d $vr13, $vr13, 32 - vsrai.d $vr13, $vr13, 32 - vshuf4i.w $vr14, $vr15, 50 - vslli.d $vr14, $vr14, 32 - vsrai.d $vr14, $vr14, 32 + vsra.w $vr7, $vr5, $vr16 + vsra.w $vr5, $vr4, $vr16 + vsllwil.d.w $vr0, $vr5, 0 + vsllwil.d.w $vr1, $vr7, 0 + vsllwil.d.w $vr2, $vr9, 0 + vsllwil.d.w $vr3, $vr11, 0 + vsllwil.d.w $vr4, $vr12, 0 + vshuf4i.w $vr5, $vr5, 14 + vsllwil.d.w $vr5, $vr5, 0 + vsllwil.d.w $vr6, $vr13, 0 + vshuf4i.w $vr7, $vr7, 14 + vsllwil.d.w $vr7, $vr7, 0 + vsllwil.d.w $vr8, $vr14, 0 + vshuf4i.w $vr9, $vr9, 14 + vsllwil.d.w $vr9, $vr9, 0 + vsllwil.d.w $vr10, $vr15, 0 + vshuf4i.w $vr11, $vr11, 14 + vsllwil.d.w $vr11, $vr11, 0 + vshuf4i.w $vr12, $vr12, 14 + vsllwil.d.w $vr12, $vr12, 0 + vshuf4i.w $vr13, $vr13, 14 + vsllwil.d.w $vr13, $vr13, 0 + vshuf4i.w $vr14, $vr14, 14 vld $vr17, $fp, 64 - vshuf4i.w $vr15, $vr15, 16 - vslli.d $vr15, $vr15, 32 - vsrai.d $vr15, $vr15, 32 - vilvh.h $vr18, $vr17, $vr17 - vslli.w $vr18, $vr18, 16 - vsrai.w $vr18, $vr18, 16 - vilvl.h $vr17, $vr17, $vr17 - vslli.w $vr17, $vr17, 16 - vsrai.w $vr17, $vr17, 16 + vsllwil.d.w $vr14, $vr14, 0 + vshuf4i.w $vr15, $vr15, 14 + vsllwil.d.w $vr15, $vr15, 0 + vsllwil.w.h $vr18, $vr17, 0 + vbsrl.v $vr17, $vr17, 8 + vsllwil.w.h $vr17, $vr17, 0 vsra.w $vr19, $vr17, $vr16 - vsra.w $vr17, $vr18, $vr16 - vshuf4i.w $vr16, $vr17, 50 - vslli.d $vr16, $vr16, 32 - vsrai.d $vr16, $vr16, 32 - vshuf4i.w $vr17, $vr17, 16 - vslli.d $vr17, $vr17, 32 - vsrai.d $vr17, $vr17, 32 - vshuf4i.w $vr18, $vr19, 50 - vslli.d $vr18, $vr18, 32 - vsrai.d $vr18, $vr18, 32 - vshuf4i.w $vr19, $vr19, 16 - vslli.d $vr19, $vr19, 32 - vsrai.d $vr19, $vr19, 32 + vsra.w $vr18, $vr18, $vr16 + vsllwil.d.w $vr16, $vr18, 0 + vsllwil.d.w $vr17, $vr19, 0 + vshuf4i.w $vr18, $vr18, 14 + vsllwil.d.w $vr18, $vr18, 0 + vshuf4i.w $vr19, $vr19, 14 + vsllwil.d.w $vr19, $vr19, 0 addi.d $a3, $s2, -16 ori $a4, $zero, 40 ori $a5, $zero, 121 ori $a2, $zero, 40 .p2align 4, , 16 .LBB0_4: # =>This Inner Loop Header: Depth=1 - vld $vr22, $a3, -16 - vld $vr24, $a3, -32 - vld $vr23, $a3, -48 - vld $vr25, $a3, -64 - vilvh.h $vr20, $vr22, $vr22 - vilvh.h $vr21, $vr24, $vr24 - vilvl.h $vr22, $vr22, $vr22 - vilvl.h $vr24, $vr24, $vr24 - vilvh.h $vr28, $vr25, $vr25 - vilvh.w $vr26, $vr28, $vr28 - vslli.d $vr26, $vr26, 48 - vsrai.d $vr29, $vr26, 48 - vilvl.h $vr27, $vr25, $vr25 - vilvh.w $vr25, $vr27, $vr27 - vslli.d $vr25, $vr25, 48 - vld $vr30, $a3, 0 - vsrai.d $vr25, $vr25, 48 - vmul.d $vr26, $vr14, $vr25 - vmul.d $vr25, $vr12, $vr29 - vilvl.h $vr29, $vr30, $vr30 - vilvh.h $vr30, $vr30, $vr30 - vilvh.w $vr31, $vr30, $vr30 - vslli.d $vr31, $vr31, 48 - vsrai.d $vr31, $vr31, 48 - vmadd.d $vr25, $vr16, $vr31 - vilvh.w $vr31, $vr29, $vr29 - vslli.d $vr31, $vr31, 48 - vsrai.d $vr31, $vr31, 48 - vmadd.d $vr26, $vr18, $vr31 - vilvl.w $vr31, $vr22, $vr22 - vslli.d $vr31, $vr31, 48 - vsrai.d $vr31, $vr31, 48 - vilvl.w $vr28, $vr28, $vr28 - vslli.d $vr28, $vr28, 48 - vsrai.d $vr28, $vr28, 48 - vmul.d $vr28, $vr13, $vr28 - vilvl.w $vr30, $vr30, $vr30 - vslli.d $vr30, $vr30, 48 - vsrai.d $vr30, $vr30, 48 - vmadd.d $vr28, $vr17, $vr30 - vilvl.w $vr30, $vr24, $vr24 - vslli.d $vr30, $vr30, 48 - vsrai.d $vr30, $vr30, 48 - vilvl.w $vr27, $vr27, $vr27 - vslli.d $vr27, $vr27, 48 - vsrai.d $vr27, $vr27, 48 - vmul.d $vr27, $vr15, $vr27 - vilvl.w $vr29, $vr29, 
$vr29 - vslli.d $vr29, $vr29, 48 - vsrai.d $vr29, $vr29, 48 - vmadd.d $vr27, $vr19, $vr29 - vilvl.h $vr29, $vr23, $vr23 - vmadd.d $vr27, $vr7, $vr30 - vilvl.w $vr30, $vr29, $vr29 - vslli.d $vr30, $vr30, 48 - vsrai.d $vr30, $vr30, 48 - vmul.d $vr30, $vr11, $vr30 - vmadd.d $vr30, $vr3, $vr31 - vilvl.w $vr31, $vr20, $vr20 - vslli.d $vr31, $vr31, 48 - vsrai.d $vr31, $vr31, 48 - vadd.d $vr27, $vr27, $vr30 - vilvl.w $vr30, $vr21, $vr21 - vslli.d $vr30, $vr30, 48 - vsrai.d $vr30, $vr30, 48 - vilvh.h $vr23, $vr23, $vr23 - vmadd.d $vr28, $vr5, $vr30 - vilvl.w $vr30, $vr23, $vr23 - vslli.d $vr30, $vr30, 48 - vsrai.d $vr30, $vr30, 48 - vmul.d $vr30, $vr9, $vr30 - vmadd.d $vr30, $vr1, $vr31 - vadd.d $vr28, $vr28, $vr30 - vadd.d $vr27, $vr27, $vr28 - vilvh.w $vr24, $vr24, $vr24 - vslli.d $vr24, $vr24, 48 - vsrai.d $vr24, $vr24, 48 - vmadd.d $vr26, $vr6, $vr24 - vilvh.w $vr22, $vr22, $vr22 - vslli.d $vr22, $vr22, 48 - vsrai.d $vr22, $vr22, 48 - vilvh.w $vr24, $vr29, $vr29 - vslli.d $vr24, $vr24, 48 - vsrai.d $vr24, $vr24, 48 - vmul.d $vr24, $vr10, $vr24 - vmadd.d $vr24, $vr2, $vr22 - vadd.d $vr22, $vr26, $vr24 - vilvh.w $vr21, $vr21, $vr21 - vslli.d $vr21, $vr21, 48 - vsrai.d $vr21, $vr21, 48 - vmadd.d $vr25, $vr4, $vr21 - vilvh.w $vr20, $vr20, $vr20 - vslli.d $vr20, $vr20, 48 - vsrai.d $vr20, $vr20, 48 - vilvh.w $vr21, $vr23, $vr23 - vslli.d $vr21, $vr21, 48 - vsrai.d $vr21, $vr21, 48 - vmul.d $vr21, $vr8, $vr21 - vmadd.d $vr21, $vr0, $vr20 - vadd.d $vr20, $vr25, $vr21 - vadd.d $vr20, $vr22, $vr20 - vadd.d $vr20, $vr27, $vr20 + vld $vr24, $a3, -64 + vld $vr20, $a3, -48 + vld $vr22, $a3, -32 + vld $vr21, $a3, -16 + vbsrl.v $vr23, $vr24, 12 + vsllwil.w.h $vr23, $vr23, 0 + vsllwil.d.w $vr23, $vr23, 0 + vshuf4i.h $vr25, $vr24, 14 + vsllwil.w.h $vr25, $vr25, 0 + vsllwil.d.w $vr25, $vr25, 0 + vbsrl.v $vr26, $vr24, 8 + vsllwil.w.h $vr26, $vr26, 0 + vsllwil.d.w $vr26, $vr26, 0 + vld $vr27, $a3, 0 + vmul.d $vr26, $vr10, $vr26 + vmul.d $vr25, $vr11, $vr25 + vmul.d $vr23, $vr15, $vr23 + vbsrl.v $vr28, $vr27, 12 + vsllwil.w.h $vr28, $vr28, 0 + vsllwil.d.w $vr28, $vr28, 0 + vmadd.d $vr23, $vr19, $vr28 + vshuf4i.h $vr28, $vr27, 14 + vsllwil.w.h $vr28, $vr28, 0 + vsllwil.d.w $vr28, $vr28, 0 + vmadd.d $vr25, $vr18, $vr28 + vbsrl.v $vr28, $vr27, 8 + vsllwil.w.h $vr28, $vr28, 0 + vsllwil.d.w $vr28, $vr28, 0 + vmadd.d $vr26, $vr17, $vr28 + vsllwil.w.h $vr28, $vr21, 0 + vsllwil.d.w $vr28, $vr28, 0 + vsllwil.w.h $vr24, $vr24, 0 + vsllwil.d.w $vr24, $vr24, 0 + vmul.d $vr24, $vr3, $vr24 + vsllwil.w.h $vr27, $vr27, 0 + vsllwil.d.w $vr27, $vr27, 0 + vmadd.d $vr24, $vr16, $vr27 + vsllwil.w.h $vr27, $vr22, 0 + vsllwil.d.w $vr27, $vr27, 0 + vmadd.d $vr24, $vr1, $vr27 + vsllwil.w.h $vr27, $vr20, 0 + vsllwil.d.w $vr27, $vr27, 0 + vmul.d $vr27, $vr2, $vr27 + vmadd.d $vr27, $vr0, $vr28 + vbsrl.v $vr28, $vr21, 8 + vsllwil.w.h $vr28, $vr28, 0 + vsllwil.d.w $vr28, $vr28, 0 + vadd.d $vr24, $vr24, $vr27 + vbsrl.v $vr27, $vr22, 8 + vsllwil.w.h $vr27, $vr27, 0 + vsllwil.d.w $vr27, $vr27, 0 + vmadd.d $vr26, $vr6, $vr27 + vbsrl.v $vr27, $vr20, 8 + vsllwil.w.h $vr27, $vr27, 0 + vsllwil.d.w $vr27, $vr27, 0 + vmul.d $vr27, $vr8, $vr27 + vmadd.d $vr27, $vr4, $vr28 + vadd.d $vr26, $vr26, $vr27 + vshuf4i.h $vr27, $vr21, 14 + vsllwil.w.h $vr27, $vr27, 0 + vsllwil.d.w $vr27, $vr27, 0 + vadd.d $vr24, $vr24, $vr26 + vshuf4i.h $vr26, $vr22, 14 + vsllwil.w.h $vr26, $vr26, 0 + vsllwil.d.w $vr26, $vr26, 0 + vmadd.d $vr25, $vr7, $vr26 + vshuf4i.h $vr26, $vr20, 14 + vsllwil.w.h $vr26, $vr26, 0 + vsllwil.d.w $vr26, $vr26, 0 + vmul.d $vr26, $vr9, $vr26 + 
vmadd.d $vr26, $vr5, $vr27 + vadd.d $vr25, $vr25, $vr26 + vbsrl.v $vr22, $vr22, 12 + vsllwil.w.h $vr22, $vr22, 0 + vsllwil.d.w $vr22, $vr22, 0 + vmadd.d $vr23, $vr13, $vr22 + vbsrl.v $vr21, $vr21, 12 + vsllwil.w.h $vr21, $vr21, 0 + vsllwil.d.w $vr21, $vr21, 0 + vbsrl.v $vr20, $vr20, 12 + vsllwil.w.h $vr20, $vr20, 0 + vsllwil.d.w $vr20, $vr20, 0 + vmul.d $vr20, $vr14, $vr20 + vmadd.d $vr20, $vr12, $vr21 + vadd.d $vr20, $vr23, $vr20 + vadd.d $vr20, $vr25, $vr20 + vadd.d $vr20, $vr24, $vr20 vhaddw.q.d $vr20, $vr20, $vr20 vpickve2gr.d $a6, $vr20, 0 slt $a7, $a1, $a6 @@ -349,32 +286,24 @@ Gsm_Long_Term_Predictor: # @Gsm_Long_Term_Predictor vinsgr2vr.w $vr1, $a1, 0 vsrai.h $vr0, $vr0, 3 vsrai.h $vr1, $vr1, 3 - vilvl.h $vr0, $vr0, $vr0 - vilvl.w $vr0, $vr0, $vr0 - vslli.d $vr0, $vr0, 48 - vsrai.d $vr2, $vr0, 48 - vilvl.h $vr0, $vr1, $vr1 - vilvl.w $vr0, $vr0, $vr0 + vsllwil.w.h $vr0, $vr0, 0 + vsllwil.d.w $vr2, $vr0, 0 ori $a1, $zero, 4 sub.d $a1, $a1, $a0 alsl.d $a2, $a1, $s2, 1 slli.d $a1, $a1, 1 ldx.w $a1, $s2, $a1 ld.w $a2, $a2, 4 - vslli.d $vr0, $vr0, 48 - vsrai.d $vr3, $vr0, 48 + vsllwil.w.h $vr0, $vr1, 0 + vsllwil.d.w $vr3, $vr0, 0 vinsgr2vr.w $vr0, $a1, 0 vinsgr2vr.w $vr1, $a2, 0 vsrai.h $vr0, $vr0, 3 vsrai.h $vr1, $vr1, 3 - vilvl.h $vr0, $vr0, $vr0 - vilvl.w $vr0, $vr0, $vr0 - vslli.d $vr0, $vr0, 48 - vsrai.d $vr0, $vr0, 48 - vilvl.h $vr1, $vr1, $vr1 - vilvl.w $vr1, $vr1, $vr1 - vslli.d $vr1, $vr1, 48 - vsrai.d $vr1, $vr1, 48 + vsllwil.w.h $vr0, $vr0, 0 + vsllwil.d.w $vr0, $vr0, 0 + vsllwil.w.h $vr1, $vr1, 0 + vsllwil.d.w $vr1, $vr1, 0 vmul.d $vr0, $vr0, $vr0 vmul.d $vr1, $vr1, $vr1 ori $a1, $zero, 8 @@ -389,14 +318,10 @@ Gsm_Long_Term_Predictor: # @Gsm_Long_Term_Predictor vinsgr2vr.w $vr3, $a2, 0 vsrai.h $vr2, $vr2, 3 vsrai.h $vr3, $vr3, 3 - vilvl.h $vr2, $vr2, $vr2 - vilvl.w $vr2, $vr2, $vr2 - vslli.d $vr2, $vr2, 48 - vsrai.d $vr2, $vr2, 48 - vilvl.h $vr3, $vr3, $vr3 - vilvl.w $vr3, $vr3, $vr3 - vslli.d $vr3, $vr3, 48 - vsrai.d $vr3, $vr3, 48 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.d.w $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vsllwil.d.w $vr3, $vr3, 0 ori $a1, $zero, 12 sub.d $a1, $a1, $a0 alsl.d $a2, $a1, $s2, 1 @@ -409,14 +334,10 @@ Gsm_Long_Term_Predictor: # @Gsm_Long_Term_Predictor vinsgr2vr.w $vr3, $a2, 0 vsrai.h $vr2, $vr2, 3 vsrai.h $vr3, $vr3, 3 - vilvl.h $vr2, $vr2, $vr2 - vilvl.w $vr2, $vr2, $vr2 - vslli.d $vr2, $vr2, 48 - vsrai.d $vr2, $vr2, 48 - vilvl.h $vr3, $vr3, $vr3 - vilvl.w $vr3, $vr3, $vr3 - vslli.d $vr3, $vr3, 48 - vsrai.d $vr3, $vr3, 48 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.d.w $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vsllwil.d.w $vr3, $vr3, 0 sub.d $a1, $s7, $a0 alsl.d $a2, $a1, $s2, 1 slli.d $a1, $a1, 1 @@ -428,14 +349,10 @@ Gsm_Long_Term_Predictor: # @Gsm_Long_Term_Predictor vinsgr2vr.w $vr3, $a2, 0 vsrai.h $vr2, $vr2, 3 vsrai.h $vr3, $vr3, 3 - vilvl.h $vr2, $vr2, $vr2 - vilvl.w $vr2, $vr2, $vr2 - vslli.d $vr2, $vr2, 48 - vsrai.d $vr2, $vr2, 48 - vilvl.h $vr3, $vr3, $vr3 - vilvl.w $vr3, $vr3, $vr3 - vslli.d $vr3, $vr3, 48 - vsrai.d $vr3, $vr3, 48 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.d.w $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vsllwil.d.w $vr3, $vr3, 0 ori $a1, $zero, 20 sub.d $a1, $a1, $a0 alsl.d $a2, $a1, $s2, 1 @@ -448,14 +365,10 @@ Gsm_Long_Term_Predictor: # @Gsm_Long_Term_Predictor vinsgr2vr.w $vr3, $a2, 0 vsrai.h $vr2, $vr2, 3 vsrai.h $vr3, $vr3, 3 - vilvl.h $vr2, $vr2, $vr2 - vilvl.w $vr2, $vr2, $vr2 - vslli.d $vr2, $vr2, 48 - vsrai.d $vr2, $vr2, 48 - vilvl.h $vr3, $vr3, $vr3 - vilvl.w $vr3, $vr3, $vr3 - vslli.d $vr3, $vr3, 48 - vsrai.d $vr3, 
$vr3, 48 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.d.w $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vsllwil.d.w $vr3, $vr3, 0 ori $a1, $zero, 24 sub.d $a1, $a1, $a0 alsl.d $a2, $a1, $s2, 1 @@ -468,14 +381,10 @@ Gsm_Long_Term_Predictor: # @Gsm_Long_Term_Predictor vinsgr2vr.w $vr3, $a2, 0 vsrai.h $vr2, $vr2, 3 vsrai.h $vr3, $vr3, 3 - vilvl.h $vr2, $vr2, $vr2 - vilvl.w $vr2, $vr2, $vr2 - vslli.d $vr2, $vr2, 48 - vsrai.d $vr2, $vr2, 48 - vilvl.h $vr3, $vr3, $vr3 - vilvl.w $vr3, $vr3, $vr3 - vslli.d $vr3, $vr3, 48 - vsrai.d $vr3, $vr3, 48 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.d.w $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vsllwil.d.w $vr3, $vr3, 0 ori $a1, $zero, 28 sub.d $a1, $a1, $a0 alsl.d $a2, $a1, $s2, 1 @@ -488,14 +397,10 @@ Gsm_Long_Term_Predictor: # @Gsm_Long_Term_Predictor vinsgr2vr.w $vr3, $a2, 0 vsrai.h $vr2, $vr2, 3 vsrai.h $vr3, $vr3, 3 - vilvl.h $vr2, $vr2, $vr2 - vilvl.w $vr2, $vr2, $vr2 - vslli.d $vr2, $vr2, 48 - vsrai.d $vr2, $vr2, 48 - vilvl.h $vr3, $vr3, $vr3 - vilvl.w $vr3, $vr3, $vr3 - vslli.d $vr3, $vr3, 48 - vsrai.d $vr3, $vr3, 48 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.d.w $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vsllwil.d.w $vr3, $vr3, 0 ori $a1, $zero, 32 sub.d $a1, $a1, $a0 alsl.d $a2, $a1, $s2, 1 @@ -508,14 +413,10 @@ Gsm_Long_Term_Predictor: # @Gsm_Long_Term_Predictor vinsgr2vr.w $vr3, $a2, 0 vsrai.h $vr2, $vr2, 3 vsrai.h $vr3, $vr3, 3 - vilvl.h $vr2, $vr2, $vr2 - vilvl.w $vr2, $vr2, $vr2 - vslli.d $vr2, $vr2, 48 - vsrai.d $vr2, $vr2, 48 - vilvl.h $vr3, $vr3, $vr3 - vilvl.w $vr3, $vr3, $vr3 - vslli.d $vr3, $vr3, 48 - vsrai.d $vr3, $vr3, 48 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.d.w $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vsllwil.d.w $vr3, $vr3, 0 ori $a1, $zero, 36 sub.d $a0, $a1, $a0 alsl.d $a1, $a0, $s2, 1 @@ -528,14 +429,10 @@ Gsm_Long_Term_Predictor: # @Gsm_Long_Term_Predictor vinsgr2vr.w $vr3, $a1, 0 vsrai.h $vr2, $vr2, 3 vsrai.h $vr3, $vr3, 3 - vilvl.h $vr2, $vr2, $vr2 - vilvl.w $vr2, $vr2, $vr2 - vslli.d $vr2, $vr2, 48 - vsrai.d $vr2, $vr2, 48 - vilvl.h $vr3, $vr3, $vr3 - vilvl.w $vr3, $vr3, $vr3 - vslli.d $vr3, $vr3, 48 - vsrai.d $vr3, $vr3, 48 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.d.w $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vsllwil.d.w $vr3, $vr3, 0 vmadd.d $vr0, $vr2, $vr2 vmadd.d $vr1, $vr3, $vr3 vadd.d $vr0, $vr1, $vr0 @@ -574,7 +471,7 @@ Gsm_Long_Term_Predictor: # @Gsm_Long_Term_Predictor bge $a0, $s8, .LBB0_37 .LBB0_10: # %.preheader64.i ori $a0, $zero, 3 - ld.d $a1, $sp, 16 # 8-byte Folded Reload + ld.d $a1, $sp, 8 # 8-byte Folded Reload st.h $a0, $a1, 0 ld.h $a0, $s3, 0 sub.d $a2, $s0, $s1 @@ -600,18 +497,15 @@ Gsm_Long_Term_Predictor: # @Gsm_Long_Term_Predictor slli.d $a1, $a0, 1 sub.d $a1, $s2, $a1 vld $vr0, $a1, 0 - vilvl.h $vr1, $vr0, $vr0 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr2, $vr1, 16 - vilvh.h $vr0, $vr0, $vr0 - vslli.w $vr0, $vr0, 16 - vsrai.w $vr3, $vr0, 16 + vsllwil.w.h $vr2, $vr0, 0 + vbsrl.v $vr0, $vr0, 8 + vsllwil.w.h $vr3, $vr0, 0 lu12i.w $a1, 4 vreplgr2vr.w $vr0, $a1 vreplgr2vr.w $vr1, $s6 b .LBB0_23 .LBB0_16: # %.preheader.i - ld.d $a0, $sp, 16 # 8-byte Folded Reload + ld.d $a0, $sp, 8 # 8-byte Folded Reload st.h $zero, $a0, 0 ld.h $a0, $s3, 0 sub.d $a1, $s0, $s1 @@ -637,12 +531,9 @@ Gsm_Long_Term_Predictor: # @Gsm_Long_Term_Predictor slli.d $a1, $a0, 1 sub.d $a1, $s2, $a1 vld $vr0, $a1, 0 - vilvl.h $vr1, $vr0, $vr0 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr2, $vr1, 16 - vilvh.h $vr0, $vr0, $vr0 - vslli.w $vr0, $vr0, 16 - vsrai.w $vr3, $vr0, 16 + vsllwil.w.h $vr2, $vr0, 0 + vbsrl.v $vr0, $vr0, 8 + vsllwil.w.h $vr3, $vr0, 0 lu12i.w 
$a1, 4 vreplgr2vr.w $vr0, $a1 ori $a1, $zero, 3277 @@ -664,12 +555,9 @@ Gsm_Long_Term_Predictor: # @Gsm_Long_Term_Predictor sub.d $a1, $a1, $a0 slli.d $a1, $a1, 1 vldx $vr2, $s2, $a1 - vilvl.h $vr3, $vr2, $vr2 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr2, $vr2, $vr2 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 + vsllwil.w.h $vr3, $vr2, 0 + vbsrl.v $vr2, $vr2, 8 + vsllwil.w.h $vr2, $vr2, 0 vori.b $vr4, $vr0, 0 vmadd.w $vr4, $vr2, $vr1 vori.b $vr2, $vr0, 0 @@ -685,12 +573,9 @@ Gsm_Long_Term_Predictor: # @Gsm_Long_Term_Predictor sub.d $a1, $a1, $a0 slli.d $a1, $a1, 1 vldx $vr2, $s2, $a1 - vilvl.h $vr3, $vr2, $vr2 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr2, $vr2, $vr2 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 + vsllwil.w.h $vr3, $vr2, 0 + vbsrl.v $vr2, $vr2, 8 + vsllwil.w.h $vr2, $vr2, 0 vori.b $vr4, $vr0, 0 vmadd.w $vr4, $vr2, $vr1 vori.b $vr2, $vr0, 0 @@ -706,12 +591,9 @@ Gsm_Long_Term_Predictor: # @Gsm_Long_Term_Predictor sub.d $a1, $a1, $a0 slli.d $a1, $a1, 1 vldx $vr2, $s2, $a1 - vilvl.h $vr3, $vr2, $vr2 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr2, $vr2, $vr2 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 + vsllwil.w.h $vr3, $vr2, 0 + vbsrl.v $vr2, $vr2, 8 + vsllwil.w.h $vr2, $vr2, 0 vori.b $vr4, $vr0, 0 vmadd.w $vr4, $vr2, $vr1 vori.b $vr2, $vr0, 0 @@ -727,12 +609,9 @@ Gsm_Long_Term_Predictor: # @Gsm_Long_Term_Predictor sub.d $a0, $a1, $a0 slli.d $a0, $a0, 1 vldx $vr2, $s2, $a0 - vilvl.h $vr3, $vr2, $vr2 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr2, $vr2, $vr2 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 + vsllwil.w.h $vr3, $vr2, 0 + vbsrl.v $vr2, $vr2, 8 + vsllwil.w.h $vr2, $vr2, 0 vori.b $vr4, $vr0, 0 vmadd.w $vr4, $vr2, $vr1 vmadd.w $vr0, $vr3, $vr1 @@ -806,30 +685,27 @@ Gsm_Long_Term_Predictor: # @Gsm_Long_Term_Predictor addi.d $a1, $a1, 2 bne $a1, $a4, .LBB0_27 .LBB0_28: # %Long_term_analysis_filtering.exit - fld.d $fs7, $sp, 24 # 8-byte Folded Reload - fld.d $fs6, $sp, 32 # 8-byte Folded Reload - fld.d $fs5, $sp, 40 # 8-byte Folded Reload - fld.d $fs4, $sp, 48 # 8-byte Folded Reload - fld.d $fs3, $sp, 56 # 8-byte Folded Reload - fld.d $fs2, $sp, 64 # 8-byte Folded Reload - fld.d $fs1, $sp, 72 # 8-byte Folded Reload - fld.d $fs0, $sp, 80 # 8-byte Folded Reload - ld.d $s8, $sp, 88 # 8-byte Folded Reload - ld.d $s7, $sp, 96 # 8-byte Folded Reload - ld.d $s6, $sp, 104 # 8-byte Folded Reload - ld.d $s5, $sp, 112 # 8-byte Folded Reload - ld.d $s4, $sp, 120 # 8-byte Folded Reload - ld.d $s3, $sp, 128 # 8-byte Folded Reload - ld.d $s2, $sp, 136 # 8-byte Folded Reload - ld.d $s1, $sp, 144 # 8-byte Folded Reload - ld.d $s0, $sp, 152 # 8-byte Folded Reload - ld.d $fp, $sp, 160 # 8-byte Folded Reload - ld.d $ra, $sp, 168 # 8-byte Folded Reload - addi.d $sp, $sp, 176 + fld.d $fs4, $sp, 16 # 8-byte Folded Reload + fld.d $fs3, $sp, 24 # 8-byte Folded Reload + fld.d $fs2, $sp, 32 # 8-byte Folded Reload + fld.d $fs1, $sp, 40 # 8-byte Folded Reload + fld.d $fs0, $sp, 48 # 8-byte Folded Reload + ld.d $s8, $sp, 56 # 8-byte Folded Reload + ld.d $s7, $sp, 64 # 8-byte Folded Reload + ld.d $s6, $sp, 72 # 8-byte Folded Reload + ld.d $s5, $sp, 80 # 8-byte Folded Reload + ld.d $s4, $sp, 88 # 8-byte Folded Reload + ld.d $s3, $sp, 96 # 8-byte Folded Reload + ld.d $s2, $sp, 104 # 8-byte Folded Reload + ld.d $s1, $sp, 112 # 8-byte Folded Reload + ld.d $s0, $sp, 120 # 8-byte Folded Reload + ld.d $fp, $sp, 128 # 8-byte Folded Reload + ld.d $ra, $sp, 136 # 8-byte Folded Reload + addi.d $sp, $sp, 144 ret .LBB0_29: 
# %.preheader60.i ori $a0, $zero, 1 - ld.d $a1, $sp, 16 # 8-byte Folded Reload + ld.d $a1, $sp, 8 # 8-byte Folded Reload st.h $a0, $a1, 0 ld.h $a0, $s3, 0 sub.d $a2, $s0, $s1 @@ -855,12 +731,9 @@ Gsm_Long_Term_Predictor: # @Gsm_Long_Term_Predictor slli.d $a1, $a0, 1 sub.d $a1, $s2, $a1 vld $vr0, $a1, 0 - vilvl.h $vr1, $vr0, $vr0 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr2, $vr1, 16 - vilvh.h $vr0, $vr0, $vr0 - vslli.w $vr0, $vr0, 16 - vsrai.w $vr3, $vr0, 16 + vsllwil.w.h $vr2, $vr0, 0 + vbsrl.v $vr0, $vr0, 8 + vsllwil.w.h $vr3, $vr0, 0 lu12i.w $a1, 4 vreplgr2vr.w $vr0, $a1 lu12i.w $a1, 2 @@ -900,7 +773,7 @@ Gsm_Long_Term_Predictor: # @Gsm_Long_Term_Predictor b .LBB0_28 .LBB0_37: # %.preheader62.i ori $a0, $zero, 2 - ld.d $a1, $sp, 16 # 8-byte Folded Reload + ld.d $a1, $sp, 8 # 8-byte Folded Reload st.h $a0, $a1, 0 ld.h $a0, $s3, 0 sub.d $a2, $s0, $s1 @@ -926,12 +799,9 @@ Gsm_Long_Term_Predictor: # @Gsm_Long_Term_Predictor slli.d $a1, $a0, 1 sub.d $a1, $s2, $a1 vld $vr0, $a1, 0 - vilvl.h $vr1, $vr0, $vr0 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr2, $vr1, 16 - vilvh.h $vr0, $vr0, $vr0 - vslli.w $vr0, $vr0, 16 - vsrai.w $vr3, $vr0, 16 + vsllwil.w.h $vr2, $vr0, 0 + vbsrl.v $vr0, $vr0, 8 + vsllwil.w.h $vr3, $vr0, 0 lu12i.w $a1, 4 vreplgr2vr.w $vr0, $a1 lu12i.w $a1, 5 @@ -1004,20 +874,17 @@ Gsm_Long_Term_Synthesis_Filtering: # @Gsm_Long_Term_Synthesis_Filtering sub.d $a1, $a4, $a1 vld $vr1, $a1, 0 vreplgr2vr.d $vr0, $a2 - vilvh.h $vr2, $vr1, $vr1 - vilvl.w $vr3, $vr2, $vr2 - vslli.d $vr3, $vr3, 48 - vsrai.d $vr3, $vr3, 48 - vilvh.w $vr2, $vr2, $vr2 - vslli.d $vr2, $vr2, 48 - vsrai.d $vr2, $vr2, 48 - vilvl.h $vr1, $vr1, $vr1 - vilvl.w $vr4, $vr1, $vr1 - vslli.d $vr4, $vr4, 48 - vsrai.d $vr4, $vr4, 48 - vilvh.w $vr1, $vr1, $vr1 - vslli.d $vr1, $vr1, 48 - vsrai.d $vr5, $vr1, 48 + vbsrl.v $vr2, $vr1, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.d.w $vr2, $vr2, 0 + vbsrl.v $vr3, $vr1, 12 + vsllwil.w.h $vr3, $vr3, 0 + vsllwil.d.w $vr3, $vr3, 0 + vsllwil.w.h $vr4, $vr1, 0 + vsllwil.d.w $vr4, $vr4, 0 + vshuf4i.h $vr1, $vr1, 14 + vsllwil.w.h $vr1, $vr1, 0 + vsllwil.d.w $vr5, $vr1, 0 ori $a1, $zero, 0 lu32i.d $a1, 32768 vreplgr2vr.d $vr1, $a1 @@ -1027,10 +894,10 @@ Gsm_Long_Term_Synthesis_Filtering: # @Gsm_Long_Term_Synthesis_Filtering vori.b $vr5, $vr1, 0 vmadd.d $vr5, $vr0, $vr4 vori.b $vr4, $vr1, 0 - vmadd.d $vr4, $vr0, $vr2 - vori.b $vr2, $vr1, 0 - vmadd.d $vr2, $vr0, $vr3 - vsrli.d $vr2, $vr2, 48 + vmadd.d $vr4, $vr0, $vr3 + vori.b $vr3, $vr1, 0 + vmadd.d $vr3, $vr0, $vr2 + vsrli.d $vr2, $vr3, 48 vsrli.d $vr3, $vr4, 48 vsrli.d $vr4, $vr5, 48 vsrli.d $vr5, $vr7, 48 @@ -1043,30 +910,27 @@ Gsm_Long_Term_Synthesis_Filtering: # @Gsm_Long_Term_Synthesis_Filtering sub.d $a1, $a1, $a0 slli.d $a1, $a1, 1 vldx $vr2, $a4, $a1 - vilvh.h $vr3, $vr2, $vr2 - vilvl.w $vr4, $vr3, $vr3 - vslli.d $vr4, $vr4, 48 - vsrai.d $vr4, $vr4, 48 - vilvh.w $vr3, $vr3, $vr3 - vslli.d $vr3, $vr3, 48 - vsrai.d $vr3, $vr3, 48 - vilvl.h $vr2, $vr2, $vr2 - vilvl.w $vr5, $vr2, $vr2 - vslli.d $vr5, $vr5, 48 - vsrai.d $vr5, $vr5, 48 - vilvh.w $vr2, $vr2, $vr2 - vslli.d $vr2, $vr2, 48 - vsrai.d $vr2, $vr2, 48 + vbsrl.v $vr3, $vr2, 8 + vsllwil.w.h $vr3, $vr3, 0 + vsllwil.d.w $vr3, $vr3, 0 + vbsrl.v $vr4, $vr2, 12 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.d.w $vr4, $vr4, 0 + vsllwil.w.h $vr5, $vr2, 0 + vsllwil.d.w $vr5, $vr5, 0 + vshuf4i.h $vr2, $vr2, 14 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.d.w $vr2, $vr2, 0 vld $vr6, $a3, 16 vori.b $vr7, $vr1, 0 vmadd.d $vr7, $vr0, $vr2 vori.b $vr2, $vr1, 0 vmadd.d $vr2, $vr0, $vr5 vori.b $vr5, $vr1, 0 - vmadd.d $vr5, 
$vr0, $vr3 - vori.b $vr3, $vr1, 0 - vmadd.d $vr3, $vr0, $vr4 - vsrli.d $vr3, $vr3, 48 + vmadd.d $vr5, $vr0, $vr4 + vori.b $vr4, $vr1, 0 + vmadd.d $vr4, $vr0, $vr3 + vsrli.d $vr3, $vr4, 48 vsrli.d $vr4, $vr5, 48 vsrli.d $vr2, $vr2, 48 vsrli.d $vr5, $vr7, 48 @@ -1079,30 +943,27 @@ Gsm_Long_Term_Synthesis_Filtering: # @Gsm_Long_Term_Synthesis_Filtering sub.d $a1, $a1, $a0 slli.d $a1, $a1, 1 vldx $vr2, $a4, $a1 - vilvh.h $vr3, $vr2, $vr2 - vilvl.w $vr4, $vr3, $vr3 - vslli.d $vr4, $vr4, 48 - vsrai.d $vr4, $vr4, 48 - vilvh.w $vr3, $vr3, $vr3 - vslli.d $vr3, $vr3, 48 - vsrai.d $vr3, $vr3, 48 - vilvl.h $vr2, $vr2, $vr2 - vilvl.w $vr5, $vr2, $vr2 - vslli.d $vr5, $vr5, 48 - vsrai.d $vr5, $vr5, 48 - vilvh.w $vr2, $vr2, $vr2 - vslli.d $vr2, $vr2, 48 - vsrai.d $vr2, $vr2, 48 + vbsrl.v $vr3, $vr2, 8 + vsllwil.w.h $vr3, $vr3, 0 + vsllwil.d.w $vr3, $vr3, 0 + vbsrl.v $vr4, $vr2, 12 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.d.w $vr4, $vr4, 0 + vsllwil.w.h $vr5, $vr2, 0 + vsllwil.d.w $vr5, $vr5, 0 + vshuf4i.h $vr2, $vr2, 14 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.d.w $vr2, $vr2, 0 vld $vr6, $a3, 32 vori.b $vr7, $vr1, 0 vmadd.d $vr7, $vr0, $vr2 vori.b $vr2, $vr1, 0 vmadd.d $vr2, $vr0, $vr5 vori.b $vr5, $vr1, 0 - vmadd.d $vr5, $vr0, $vr3 - vori.b $vr3, $vr1, 0 - vmadd.d $vr3, $vr0, $vr4 - vsrli.d $vr3, $vr3, 48 + vmadd.d $vr5, $vr0, $vr4 + vori.b $vr4, $vr1, 0 + vmadd.d $vr4, $vr0, $vr3 + vsrli.d $vr3, $vr4, 48 vsrli.d $vr4, $vr5, 48 vsrli.d $vr2, $vr2, 48 vsrli.d $vr5, $vr7, 48 @@ -1115,30 +976,27 @@ Gsm_Long_Term_Synthesis_Filtering: # @Gsm_Long_Term_Synthesis_Filtering sub.d $a1, $a1, $a0 slli.d $a1, $a1, 1 vldx $vr2, $a4, $a1 - vilvh.h $vr3, $vr2, $vr2 - vilvl.w $vr4, $vr3, $vr3 - vslli.d $vr4, $vr4, 48 - vsrai.d $vr4, $vr4, 48 - vilvh.w $vr3, $vr3, $vr3 - vslli.d $vr3, $vr3, 48 - vsrai.d $vr3, $vr3, 48 - vilvl.h $vr2, $vr2, $vr2 - vilvl.w $vr5, $vr2, $vr2 - vslli.d $vr5, $vr5, 48 - vsrai.d $vr5, $vr5, 48 - vilvh.w $vr2, $vr2, $vr2 - vslli.d $vr2, $vr2, 48 - vsrai.d $vr2, $vr2, 48 + vbsrl.v $vr3, $vr2, 8 + vsllwil.w.h $vr3, $vr3, 0 + vsllwil.d.w $vr3, $vr3, 0 + vbsrl.v $vr4, $vr2, 12 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.d.w $vr4, $vr4, 0 + vsllwil.w.h $vr5, $vr2, 0 + vsllwil.d.w $vr5, $vr5, 0 + vshuf4i.h $vr2, $vr2, 14 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.d.w $vr2, $vr2, 0 vld $vr6, $a3, 48 vori.b $vr7, $vr1, 0 vmadd.d $vr7, $vr0, $vr2 vori.b $vr2, $vr1, 0 vmadd.d $vr2, $vr0, $vr5 vori.b $vr5, $vr1, 0 - vmadd.d $vr5, $vr0, $vr3 - vori.b $vr3, $vr1, 0 - vmadd.d $vr3, $vr0, $vr4 - vsrli.d $vr3, $vr3, 48 + vmadd.d $vr5, $vr0, $vr4 + vori.b $vr4, $vr1, 0 + vmadd.d $vr4, $vr0, $vr3 + vsrli.d $vr3, $vr4, 48 vsrli.d $vr4, $vr5, 48 vsrli.d $vr2, $vr2, 48 vsrli.d $vr5, $vr7, 48 @@ -1151,28 +1009,25 @@ Gsm_Long_Term_Synthesis_Filtering: # @Gsm_Long_Term_Synthesis_Filtering sub.d $a0, $a1, $a0 slli.d $a0, $a0, 1 vldx $vr2, $a4, $a0 - vilvh.h $vr3, $vr2, $vr2 - vilvl.w $vr4, $vr3, $vr3 - vslli.d $vr4, $vr4, 48 - vsrai.d $vr4, $vr4, 48 - vilvh.w $vr3, $vr3, $vr3 - vslli.d $vr3, $vr3, 48 - vsrai.d $vr3, $vr3, 48 - vilvl.h $vr2, $vr2, $vr2 - vilvl.w $vr5, $vr2, $vr2 - vslli.d $vr5, $vr5, 48 - vsrai.d $vr5, $vr5, 48 - vilvh.w $vr2, $vr2, $vr2 - vslli.d $vr2, $vr2, 48 - vsrai.d $vr2, $vr2, 48 + vbsrl.v $vr3, $vr2, 8 + vsllwil.w.h $vr3, $vr3, 0 + vsllwil.d.w $vr3, $vr3, 0 + vbsrl.v $vr4, $vr2, 12 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.d.w $vr4, $vr4, 0 + vsllwil.w.h $vr5, $vr2, 0 + vsllwil.d.w $vr5, $vr5, 0 + vshuf4i.h $vr2, $vr2, 14 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.d.w $vr2, $vr2, 0 vld $vr6, $a3, 64 vori.b $vr7, $vr1, 0 
vmadd.d $vr7, $vr0, $vr2 vori.b $vr2, $vr1, 0 vmadd.d $vr2, $vr0, $vr5 vori.b $vr5, $vr1, 0 - vmadd.d $vr5, $vr0, $vr3 - vmadd.d $vr1, $vr0, $vr4 + vmadd.d $vr5, $vr0, $vr4 + vmadd.d $vr1, $vr0, $vr3 vsrli.d $vr0, $vr1, 48 vsrli.d $vr1, $vr5, 48 vsrli.d $vr2, $vr2, 48 diff --git a/results/MultiSource/Benchmarks/mediabench/gsm/toast/CMakeFiles/toast.dir/lpc.s b/results/MultiSource/Benchmarks/mediabench/gsm/toast/CMakeFiles/toast.dir/lpc.s index 4a7dfb56..aea624e8 100644 --- a/results/MultiSource/Benchmarks/mediabench/gsm/toast/CMakeFiles/toast.dir/lpc.s +++ b/results/MultiSource/Benchmarks/mediabench/gsm/toast/CMakeFiles/toast.dir/lpc.s @@ -3,10 +3,10 @@ .p2align 4, 0x0 # -- Begin function Gsm_LPC_Analysis .LCPI0_0: .half 1 # 0x1 + .half 8 # 0x8 .half 65535 # 0xffff .half 65535 # 0xffff .half 65535 # 0xffff - .half 8 # 0x8 .half 65535 # 0xffff .half 65535 # 0xffff .half 65535 # 0xffff @@ -260,18 +260,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis .LBB0_4: # %vector.body108 vld $vr0, $s0, 0 vld $vr1, $s0, 16 - vilvh.h $vr2, $vr0, $vr0 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr0, $vr0, $vr0 - vslli.w $vr0, $vr0, 16 - vsrai.w $vr0, $vr0, 16 - vilvh.h $vr3, $vr1, $vr1 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr0, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr0, $vr0, 0 + vbsrl.v $vr3, $vr1, 8 + vsllwil.w.h $vr3, $vr3, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr4, $vr0, 11 vslli.w $vr2, $vr2, 11 vslli.w $vr1, $vr1, 11 @@ -292,18 +286,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vst $vr2, $s0, 0 vst $vr1, $s0, 16 vld $vr1, $s0, 48 - vilvh.h $vr2, $vr3, $vr3 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr4, $vr1, $vr1 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr3, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vbsrl.v $vr4, $vr1, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr3, $vr3, 11 vslli.w $vr2, $vr2, 11 vslli.w $vr1, $vr1, 11 @@ -322,18 +310,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vst $vr2, $s0, 32 vst $vr1, $s0, 48 vld $vr1, $s0, 80 - vilvh.h $vr2, $vr3, $vr3 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr4, $vr1, $vr1 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr3, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vbsrl.v $vr4, $vr1, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr3, $vr3, 11 vslli.w $vr2, $vr2, 11 vslli.w $vr1, $vr1, 11 @@ -352,18 +334,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vst $vr2, $s0, 64 vst $vr1, $s0, 80 vld $vr1, $s0, 112 - vilvh.h $vr2, $vr3, $vr3 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr4, $vr1, $vr1 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr3, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vbsrl.v $vr4, $vr1, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr3, $vr3, 11 vslli.w $vr2, $vr2, 11 vslli.w $vr1, $vr1, 11 @@ -382,18 +358,12 @@ Gsm_LPC_Analysis: # 
@Gsm_LPC_Analysis vst $vr2, $s0, 96 vst $vr1, $s0, 112 vld $vr1, $s0, 144 - vilvh.h $vr2, $vr3, $vr3 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr4, $vr1, $vr1 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr3, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vbsrl.v $vr4, $vr1, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr3, $vr3, 11 vslli.w $vr2, $vr2, 11 vslli.w $vr1, $vr1, 11 @@ -412,18 +382,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vst $vr2, $s0, 128 vst $vr1, $s0, 144 vld $vr1, $s0, 176 - vilvh.h $vr2, $vr3, $vr3 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr4, $vr1, $vr1 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr3, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vbsrl.v $vr4, $vr1, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr3, $vr3, 11 vslli.w $vr2, $vr2, 11 vslli.w $vr1, $vr1, 11 @@ -442,18 +406,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vst $vr2, $s0, 160 vst $vr1, $s0, 176 vld $vr1, $s0, 208 - vilvh.h $vr2, $vr3, $vr3 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr4, $vr1, $vr1 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr3, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vbsrl.v $vr4, $vr1, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr3, $vr3, 11 vslli.w $vr2, $vr2, 11 vslli.w $vr1, $vr1, 11 @@ -472,18 +430,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vst $vr2, $s0, 192 vst $vr1, $s0, 208 vld $vr1, $s0, 240 - vilvh.h $vr2, $vr3, $vr3 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr4, $vr1, $vr1 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr3, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vbsrl.v $vr4, $vr1, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr3, $vr3, 11 vslli.w $vr2, $vr2, 11 vslli.w $vr1, $vr1, 11 @@ -502,18 +454,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vst $vr2, $s0, 224 vst $vr1, $s0, 240 vld $vr1, $s0, 272 - vilvh.h $vr2, $vr3, $vr3 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr4, $vr1, $vr1 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr3, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vbsrl.v $vr4, $vr1, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr3, $vr3, 11 vslli.w $vr2, $vr2, 11 vslli.w $vr1, $vr1, 11 @@ -532,18 +478,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vst $vr2, $s0, 256 vst $vr1, $s0, 272 vld $vr1, $s0, 304 - vilvh.h $vr2, $vr3, $vr3 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr4, $vr1, $vr1 - vslli.w $vr4, $vr4, 16 - 
vsrai.w $vr4, $vr4, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr3, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vbsrl.v $vr4, $vr1, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr3, $vr3, 11 vslli.w $vr2, $vr2, 11 vslli.w $vr1, $vr1, 11 @@ -559,18 +499,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis .LBB0_7: # %vector.body124 vld $vr0, $s0, 0 vld $vr1, $s0, 16 - vilvh.h $vr2, $vr0, $vr0 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr0, $vr0, $vr0 - vslli.w $vr0, $vr0, 16 - vsrai.w $vr0, $vr0, 16 - vilvh.h $vr3, $vr1, $vr1 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr0, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr0, $vr0, 0 + vbsrl.v $vr3, $vr1, 8 + vsllwil.w.h $vr3, $vr3, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr4, $vr0, 13 vslli.w $vr2, $vr2, 13 vslli.w $vr1, $vr1, 13 @@ -591,18 +525,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vst $vr2, $s0, 0 vst $vr1, $s0, 16 vld $vr1, $s0, 48 - vilvh.h $vr2, $vr3, $vr3 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr4, $vr1, $vr1 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr3, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vbsrl.v $vr4, $vr1, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr3, $vr3, 13 vslli.w $vr2, $vr2, 13 vslli.w $vr1, $vr1, 13 @@ -621,18 +549,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vst $vr2, $s0, 32 vst $vr1, $s0, 48 vld $vr1, $s0, 80 - vilvh.h $vr2, $vr3, $vr3 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr4, $vr1, $vr1 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr3, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vbsrl.v $vr4, $vr1, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr3, $vr3, 13 vslli.w $vr2, $vr2, 13 vslli.w $vr1, $vr1, 13 @@ -651,18 +573,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vst $vr2, $s0, 64 vst $vr1, $s0, 80 vld $vr1, $s0, 112 - vilvh.h $vr2, $vr3, $vr3 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr4, $vr1, $vr1 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr3, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vbsrl.v $vr4, $vr1, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr3, $vr3, 13 vslli.w $vr2, $vr2, 13 vslli.w $vr1, $vr1, 13 @@ -681,18 +597,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vst $vr2, $s0, 96 vst $vr1, $s0, 112 vld $vr1, $s0, 144 - vilvh.h $vr2, $vr3, $vr3 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr4, $vr1, $vr1 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr3, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vbsrl.v $vr4, $vr1, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr3, $vr3, 13 
vslli.w $vr2, $vr2, 13 vslli.w $vr1, $vr1, 13 @@ -711,18 +621,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vst $vr2, $s0, 128 vst $vr1, $s0, 144 vld $vr1, $s0, 176 - vilvh.h $vr2, $vr3, $vr3 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr4, $vr1, $vr1 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr3, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vbsrl.v $vr4, $vr1, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr3, $vr3, 13 vslli.w $vr2, $vr2, 13 vslli.w $vr1, $vr1, 13 @@ -741,18 +645,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vst $vr2, $s0, 160 vst $vr1, $s0, 176 vld $vr1, $s0, 208 - vilvh.h $vr2, $vr3, $vr3 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr4, $vr1, $vr1 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr3, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vbsrl.v $vr4, $vr1, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr3, $vr3, 13 vslli.w $vr2, $vr2, 13 vslli.w $vr1, $vr1, 13 @@ -771,18 +669,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vst $vr2, $s0, 192 vst $vr1, $s0, 208 vld $vr1, $s0, 240 - vilvh.h $vr2, $vr3, $vr3 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr4, $vr1, $vr1 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr3, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vbsrl.v $vr4, $vr1, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr3, $vr3, 13 vslli.w $vr2, $vr2, 13 vslli.w $vr1, $vr1, 13 @@ -801,18 +693,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vst $vr2, $s0, 224 vst $vr1, $s0, 240 vld $vr1, $s0, 272 - vilvh.h $vr2, $vr3, $vr3 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr4, $vr1, $vr1 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr3, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vbsrl.v $vr4, $vr1, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr3, $vr3, 13 vslli.w $vr2, $vr2, 13 vslli.w $vr1, $vr1, 13 @@ -831,18 +717,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vst $vr2, $s0, 256 vst $vr1, $s0, 272 vld $vr1, $s0, 304 - vilvh.h $vr2, $vr3, $vr3 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr4, $vr1, $vr1 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr3, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vbsrl.v $vr4, $vr1, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr3, $vr3, 13 vslli.w $vr2, $vr2, 13 vslli.w $vr1, $vr1, 13 @@ -851,18 +731,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis .LBB0_8: # %vector.body132 vld $vr0, $s0, 0 vld $vr1, $s0, 16 - vilvh.h $vr2, $vr0, $vr0 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr0, $vr0, $vr0 - vslli.w 
$vr0, $vr0, 16 - vsrai.w $vr0, $vr0, 16 - vilvh.h $vr3, $vr1, $vr1 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr0, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr0, $vr0, 0 + vbsrl.v $vr3, $vr1, 8 + vsllwil.w.h $vr3, $vr3, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr4, $vr0, 14 vslli.w $vr2, $vr2, 14 vslli.w $vr1, $vr1, 14 @@ -883,18 +757,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vst $vr2, $s0, 0 vst $vr1, $s0, 16 vld $vr1, $s0, 48 - vilvh.h $vr2, $vr3, $vr3 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr4, $vr1, $vr1 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr3, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vbsrl.v $vr4, $vr1, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr3, $vr3, 14 vslli.w $vr2, $vr2, 14 vslli.w $vr1, $vr1, 14 @@ -913,18 +781,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vst $vr2, $s0, 32 vst $vr1, $s0, 48 vld $vr1, $s0, 80 - vilvh.h $vr2, $vr3, $vr3 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr4, $vr1, $vr1 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr3, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vbsrl.v $vr4, $vr1, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr3, $vr3, 14 vslli.w $vr2, $vr2, 14 vslli.w $vr1, $vr1, 14 @@ -943,18 +805,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vst $vr2, $s0, 64 vst $vr1, $s0, 80 vld $vr1, $s0, 112 - vilvh.h $vr2, $vr3, $vr3 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr4, $vr1, $vr1 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr3, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vbsrl.v $vr4, $vr1, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr3, $vr3, 14 vslli.w $vr2, $vr2, 14 vslli.w $vr1, $vr1, 14 @@ -973,18 +829,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vst $vr2, $s0, 96 vst $vr1, $s0, 112 vld $vr1, $s0, 144 - vilvh.h $vr2, $vr3, $vr3 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr4, $vr1, $vr1 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr3, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vbsrl.v $vr4, $vr1, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr3, $vr3, 14 vslli.w $vr2, $vr2, 14 vslli.w $vr1, $vr1, 14 @@ -1003,18 +853,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vst $vr2, $s0, 128 vst $vr1, $s0, 144 vld $vr1, $s0, 176 - vilvh.h $vr2, $vr3, $vr3 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr4, $vr1, $vr1 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr3, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vbsrl.v $vr4, 
$vr1, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr3, $vr3, 14 vslli.w $vr2, $vr2, 14 vslli.w $vr1, $vr1, 14 @@ -1033,18 +877,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vst $vr2, $s0, 160 vst $vr1, $s0, 176 vld $vr1, $s0, 208 - vilvh.h $vr2, $vr3, $vr3 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr4, $vr1, $vr1 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr3, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vbsrl.v $vr4, $vr1, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr3, $vr3, 14 vslli.w $vr2, $vr2, 14 vslli.w $vr1, $vr1, 14 @@ -1063,18 +901,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vst $vr2, $s0, 192 vst $vr1, $s0, 208 vld $vr1, $s0, 240 - vilvh.h $vr2, $vr3, $vr3 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr4, $vr1, $vr1 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr3, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vbsrl.v $vr4, $vr1, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr3, $vr3, 14 vslli.w $vr2, $vr2, 14 vslli.w $vr1, $vr1, 14 @@ -1093,18 +925,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vst $vr2, $s0, 224 vst $vr1, $s0, 240 vld $vr1, $s0, 272 - vilvh.h $vr2, $vr3, $vr3 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr4, $vr1, $vr1 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr3, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vbsrl.v $vr4, $vr1, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr3, $vr3, 14 vslli.w $vr2, $vr2, 14 vslli.w $vr1, $vr1, 14 @@ -1123,18 +949,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vst $vr2, $s0, 256 vst $vr1, $s0, 272 vld $vr1, $s0, 304 - vilvh.h $vr2, $vr3, $vr3 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr4, $vr1, $vr1 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr3, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vbsrl.v $vr4, $vr1, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr3, $vr3, 14 vslli.w $vr2, $vr2, 14 vslli.w $vr1, $vr1, 14 @@ -1143,18 +963,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis .LBB0_9: # %vector.body116 vld $vr0, $s0, 0 vld $vr1, $s0, 16 - vilvh.h $vr2, $vr0, $vr0 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr0, $vr0, $vr0 - vslli.w $vr0, $vr0, 16 - vsrai.w $vr0, $vr0, 16 - vilvh.h $vr3, $vr1, $vr1 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr0, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr0, $vr0, 0 + vbsrl.v $vr3, $vr1, 8 + vsllwil.w.h $vr3, $vr3, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr4, $vr0, 12 vslli.w $vr2, $vr2, 12 vslli.w $vr1, $vr1, 12 @@ -1175,18 +989,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vst $vr2, $s0, 0 vst $vr1, $s0, 16 vld $vr1, $s0, 48 - vilvh.h $vr2, $vr3, $vr3 
- vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr4, $vr1, $vr1 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr3, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vbsrl.v $vr4, $vr1, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr3, $vr3, 12 vslli.w $vr2, $vr2, 12 vslli.w $vr1, $vr1, 12 @@ -1205,18 +1013,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vst $vr2, $s0, 32 vst $vr1, $s0, 48 vld $vr1, $s0, 80 - vilvh.h $vr2, $vr3, $vr3 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr4, $vr1, $vr1 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr3, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vbsrl.v $vr4, $vr1, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr3, $vr3, 12 vslli.w $vr2, $vr2, 12 vslli.w $vr1, $vr1, 12 @@ -1235,18 +1037,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vst $vr2, $s0, 64 vst $vr1, $s0, 80 vld $vr1, $s0, 112 - vilvh.h $vr2, $vr3, $vr3 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr4, $vr1, $vr1 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr3, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vbsrl.v $vr4, $vr1, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr3, $vr3, 12 vslli.w $vr2, $vr2, 12 vslli.w $vr1, $vr1, 12 @@ -1265,18 +1061,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vst $vr2, $s0, 96 vst $vr1, $s0, 112 vld $vr1, $s0, 144 - vilvh.h $vr2, $vr3, $vr3 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr4, $vr1, $vr1 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr3, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vbsrl.v $vr4, $vr1, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr3, $vr3, 12 vslli.w $vr2, $vr2, 12 vslli.w $vr1, $vr1, 12 @@ -1295,18 +1085,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vst $vr2, $s0, 128 vst $vr1, $s0, 144 vld $vr1, $s0, 176 - vilvh.h $vr2, $vr3, $vr3 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr4, $vr1, $vr1 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr3, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vbsrl.v $vr4, $vr1, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr3, $vr3, 12 vslli.w $vr2, $vr2, 12 vslli.w $vr1, $vr1, 12 @@ -1325,18 +1109,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vst $vr2, $s0, 160 vst $vr1, $s0, 176 vld $vr1, $s0, 208 - vilvh.h $vr2, $vr3, $vr3 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr4, $vr1, $vr1 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 
+ vbsrl.v $vr2, $vr3, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vbsrl.v $vr4, $vr1, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr3, $vr3, 12 vslli.w $vr2, $vr2, 12 vslli.w $vr1, $vr1, 12 @@ -1355,18 +1133,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vst $vr2, $s0, 192 vst $vr1, $s0, 208 vld $vr1, $s0, 240 - vilvh.h $vr2, $vr3, $vr3 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr4, $vr1, $vr1 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr3, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vbsrl.v $vr4, $vr1, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr3, $vr3, 12 vslli.w $vr2, $vr2, 12 vslli.w $vr1, $vr1, 12 @@ -1385,18 +1157,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vst $vr2, $s0, 224 vst $vr1, $s0, 240 vld $vr1, $s0, 272 - vilvh.h $vr2, $vr3, $vr3 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr4, $vr1, $vr1 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr3, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vbsrl.v $vr4, $vr1, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr3, $vr3, 12 vslli.w $vr2, $vr2, 12 vslli.w $vr1, $vr1, 12 @@ -1415,18 +1181,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vst $vr2, $s0, 256 vst $vr1, $s0, 272 vld $vr1, $s0, 304 - vilvh.h $vr2, $vr3, $vr3 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vilvh.h $vr4, $vr1, $vr1 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vbsrl.v $vr2, $vr3, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr3, $vr3, 0 + vbsrl.v $vr4, $vr1, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr1, $vr1, 0 vslli.w $vr3, $vr3, 12 vslli.w $vr2, $vr2, 12 vslli.w $vr1, $vr1, 12 @@ -1547,91 +1307,71 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis add.d $a4, $s0, $a0 ld.w $a5, $a4, 16 ld.w $a6, $a4, 20 - vinsgr2vr.w $vr22, $a5, 0 - vori.b $vr23, $vr18, 0 - vshuf.h $vr23, $vr22, $vr19 + vinsgr2vr.w $vr20, $a5, 0 + vori.b $vr22, $vr18, 0 + vshuf.h $vr22, $vr20, $vr19 vinsgr2vr.w $vr19, $a6, 0 - vilvl.h $vr20, $vr22, $vr22 - vilvl.w $vr20, $vr20, $vr20 - vslli.d $vr20, $vr20, 48 - vsrai.d $vr20, $vr20, 48 - vilvl.h $vr21, $vr19, $vr19 - vilvl.w $vr21, $vr21, $vr21 - vslli.d $vr21, $vr21, 48 - vsrai.d $vr21, $vr21, 48 - vmadd.d $vr17, $vr20, $vr20 - vmadd.d $vr16, $vr21, $vr21 - vslli.d $vr23, $vr23, 48 - vsrai.d $vr23, $vr23, 48 - vori.b $vr24, $vr18, 0 - vshuf.h $vr24, $vr19, $vr22 - vslli.d $vr22, $vr24, 48 + vori.b $vr23, $vr18, 0 + vshuf.h $vr23, $vr19, $vr20 + vsllwil.w.h $vr20, $vr20, 0 + vsllwil.d.w $vr21, $vr20, 0 + vsllwil.w.h $vr20, $vr19, 0 + vsllwil.d.w $vr20, $vr20, 0 + vmadd.d $vr17, $vr21, $vr21 + vmadd.d $vr16, $vr20, $vr20 + vsllwil.w.h $vr22, $vr22, 0 ld.w $a5, $a4, 12 - vsrai.d $vr22, $vr22, 48 - vmadd.d $vr0, $vr20, $vr23 - vmadd.d $vr7, $vr21, $vr22 - vinsgr2vr.w $vr22, $a5, 0 + vsllwil.d.w $vr22, $vr22, 0 + vsllwil.w.h $vr23, $vr23, 0 + vsllwil.d.w $vr23, $vr23, 0 + vinsgr2vr.w $vr24, $a5, 0 ld.w $a5, $a4, 10 - vilvl.h $vr22, $vr22, $vr22 - vilvl.w $vr22, $vr22, $vr22 - vslli.d $vr22, $vr22, 48 + 
vmadd.d $vr0, $vr21, $vr22 + vmadd.d $vr7, $vr20, $vr23 + vsllwil.w.h $vr22, $vr24, 0 vinsgr2vr.w $vr23, $a5, 0 ld.w $a5, $a4, 14 - vsrai.d $vr22, $vr22, 48 - vmadd.d $vr1, $vr22, $vr20 - vmadd.d $vr8, $vr20, $vr21 + vsllwil.d.w $vr22, $vr22, 0 + vmadd.d $vr1, $vr22, $vr21 + vmadd.d $vr8, $vr21, $vr20 vinsgr2vr.w $vr24, $a5, 0 - vilvl.h $vr23, $vr23, $vr23 - vilvl.w $vr23, $vr23, $vr23 - vslli.d $vr23, $vr23, 48 - vsrai.d $vr23, $vr23, 48 - vilvl.h $vr24, $vr24, $vr24 - vilvl.w $vr24, $vr24, $vr24 - vslli.d $vr24, $vr24, 48 + vsllwil.w.h $vr23, $vr23, 0 + vsllwil.d.w $vr23, $vr23, 0 + vsllwil.w.h $vr24, $vr24, 0 ld.w $a5, $a4, 8 - vsrai.d $vr24, $vr24, 48 - vmadd.d $vr2, $vr23, $vr20 - vmadd.d $vr9, $vr24, $vr21 + vsllwil.d.w $vr24, $vr24, 0 + vmadd.d $vr2, $vr23, $vr21 + vmadd.d $vr9, $vr24, $vr20 vinsgr2vr.w $vr24, $a5, 0 - vilvl.h $vr24, $vr24, $vr24 - vilvl.w $vr24, $vr24, $vr24 - vslli.d $vr24, $vr24, 48 + vsllwil.w.h $vr24, $vr24, 0 ld.w $a5, $a4, 6 - vsrai.d $vr24, $vr24, 48 - vmadd.d $vr3, $vr24, $vr20 - vmadd.d $vr10, $vr22, $vr21 + vsllwil.d.w $vr24, $vr24, 0 + vmadd.d $vr3, $vr24, $vr21 + vmadd.d $vr10, $vr22, $vr20 vinsgr2vr.w $vr22, $a5, 0 - vilvl.h $vr22, $vr22, $vr22 - vilvl.w $vr22, $vr22, $vr22 - vslli.d $vr22, $vr22, 48 + vsllwil.w.h $vr22, $vr22, 0 ld.w $a5, $a4, 4 - vsrai.d $vr22, $vr22, 48 - vmadd.d $vr4, $vr22, $vr20 - vmadd.d $vr11, $vr23, $vr21 + vsllwil.d.w $vr22, $vr22, 0 + vmadd.d $vr4, $vr22, $vr21 + vmadd.d $vr11, $vr23, $vr20 vinsgr2vr.w $vr23, $a5, 0 - vilvl.h $vr23, $vr23, $vr23 - vilvl.w $vr23, $vr23, $vr23 - vslli.d $vr23, $vr23, 48 + vsllwil.w.h $vr23, $vr23, 0 ld.w $a4, $a4, 2 - vsrai.d $vr23, $vr23, 48 - vmadd.d $vr5, $vr23, $vr20 - vmadd.d $vr12, $vr24, $vr21 + vsllwil.d.w $vr23, $vr23, 0 + vmadd.d $vr5, $vr23, $vr21 + vmadd.d $vr12, $vr24, $vr20 vinsgr2vr.w $vr24, $a4, 0 - vilvl.h $vr24, $vr24, $vr24 - vilvl.w $vr24, $vr24, $vr24 - vslli.d $vr24, $vr24, 48 + vsllwil.w.h $vr24, $vr24, 0 ldx.w $a4, $s0, $a0 - vsrai.d $vr24, $vr24, 48 - vmadd.d $vr6, $vr24, $vr20 - vmadd.d $vr13, $vr22, $vr21 + vsllwil.d.w $vr24, $vr24, 0 + vmadd.d $vr6, $vr24, $vr21 + vmadd.d $vr13, $vr22, $vr20 vinsgr2vr.w $vr22, $a4, 0 - vilvl.h $vr22, $vr22, $vr22 - vilvl.w $vr22, $vr22, $vr22 - vslli.d $vr22, $vr22, 48 - vsrai.d $vr22, $vr22, 48 - vmadd.d $vr14, $vr22, $vr20 + vsllwil.w.h $vr22, $vr22, 0 + vsllwil.d.w $vr22, $vr22, 0 + vmadd.d $vr14, $vr22, $vr21 addi.d $a0, $a0, 8 - vmadd.d $vr15, $vr23, $vr21 + vmadd.d $vr15, $vr23, $vr20 bne $a0, $a3, .LBB0_12 # %bb.13: # %middle.block177 vadd.d $vr16, $vr16, $vr17 @@ -2281,50 +2021,44 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vslli.h $vr4, $vr4, 15 vsrai.h $vr4, $vr4, 15 vbitsel.v $vr0, $vr0, $vr1, $vr4 - vilvl.h $vr1, $vr3, $vr3 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vsllwil.w.h $vr1, $vr3, 0 lu12i.w $a0, 10 vreplgr2vr.w $vr3, $a0 vmul.w $vr1, $vr1, $vr3 - vilvl.h $vr2, $vr2, $vr2 - vilvl.w $vr2, $vr2, $vr2 - vslli.d $vr2, $vr2, 48 - vsrai.d $vr2, $vr2, 48 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.d.w $vr2, $vr2, 0 pcalau12i $a0, %pc_hi20(.LCPI0_1) vld $vr3, $a0, %pc_lo12(.LCPI0_1) ori $a0, $zero, 0 lu32i.d $a0, 40960 vreplgr2vr.d $vr4, $a0 vmadd.d $vr3, $vr2, $vr4 - vsrli.d $vr2, $vr3, 32 - vilvl.h $vr0, $vr0, $vr0 - vilvh.w $vr3, $vr0, $vr0 - vslli.d $vr3, $vr3, 48 + vshuf4i.h $vr2, $vr0, 14 + vsllwil.w.h $vr2, $vr2, 0 pcalau12i $a0, %pc_hi20(.LCPI0_3) vld $vr4, $a0, %pc_lo12(.LCPI0_3) pcalau12i $a0, %pc_hi20(.LCPI0_4) vld $vr5, $a0, %pc_lo12(.LCPI0_4) - vsrai.d $vr3, $vr3, 48 + vsllwil.d.w $vr2, $vr2, 0 pcalau12i 
$a0, %pc_hi20(.LCPI0_2) vld $vr6, $a0, %pc_lo12(.LCPI0_2) - vmadd.d $vr5, $vr3, $vr4 + vmadd.d $vr5, $vr2, $vr4 pcalau12i $a0, %pc_hi20(.LCPI0_5) - vld $vr3, $a0, %pc_lo12(.LCPI0_5) - vilvl.w $vr0, $vr0, $vr0 - vslli.d $vr0, $vr0, 48 - vsrai.d $vr0, $vr0, 48 - vmadd.d $vr3, $vr0, $vr6 + vld $vr2, $a0, %pc_lo12(.LCPI0_5) + vsrli.d $vr3, $vr3, 32 + vsllwil.w.h $vr0, $vr0, 0 + vsllwil.d.w $vr0, $vr0, 0 + vmadd.d $vr2, $vr0, $vr6 pcalau12i $a0, %pc_hi20(.LCPI0_6) vld $vr0, $a0, %pc_lo12(.LCPI0_6) - vsrli.d $vr3, $vr3, 32 + vsrli.d $vr2, $vr2, 32 vsrli.d $vr4, $vr5, 32 - vpickev.w $vr3, $vr4, $vr3 - vshuf.w $vr0, $vr2, $vr1 + vpickev.w $vr2, $vr4, $vr2 + vshuf.w $vr0, $vr3, $vr1 lu12i.w $a0, 4096 vreplgr2vr.w $vr1, $a0 vadd.w $vr0, $vr0, $vr1 - vadd.w $vr1, $vr3, $vr1 + vadd.w $vr1, $vr2, $vr1 vsrai.w $vr1, $vr1, 25 vsrai.w $vr0, $vr0, 25 vshuf4i.w $vr2, $vr0, 14 diff --git a/results/MultiSource/Benchmarks/mediabench/gsm/toast/CMakeFiles/toast.dir/rpe.s b/results/MultiSource/Benchmarks/mediabench/gsm/toast/CMakeFiles/toast.dir/rpe.s index 7518c9bf..d714f3e4 100644 --- a/results/MultiSource/Benchmarks/mediabench/gsm/toast/CMakeFiles/toast.dir/rpe.s +++ b/results/MultiSource/Benchmarks/mediabench/gsm/toast/CMakeFiles/toast.dir/rpe.s @@ -1,462 +1,358 @@ .file "rpe.c" - .section .rodata.cst16,"aM",@progbits,16 - .p2align 4, 0x0 # -- Begin function Gsm_RPE_Encoding -.LCPI0_0: - .half 3 # 0x3 - .half 65535 # 0xffff - .half 65535 # 0xffff - .half 65535 # 0xffff - .half 4 # 0x4 - .half 65535 # 0xffff - .half 65535 # 0xffff - .half 65535 # 0xffff -.LCPI0_1: - .half 5 # 0x5 - .half 65535 # 0xffff - .half 65535 # 0xffff - .half 65535 # 0xffff - .half 6 # 0x6 - .half 65535 # 0xffff - .half 65535 # 0xffff - .half 65535 # 0xffff -.LCPI0_2: - .half 7 # 0x7 - .half 65535 # 0xffff - .half 65535 # 0xffff - .half 65535 # 0xffff - .half 8 # 0x8 - .half 65535 # 0xffff - .half 65535 # 0xffff - .half 65535 # 0xffff -.LCPI0_3: - .half 1 # 0x1 - .half 65535 # 0xffff - .half 65535 # 0xffff - .half 65535 # 0xffff - .half 2 # 0x2 - .half 65535 # 0xffff - .half 65535 # 0xffff - .half 65535 # 0xffff .text - .globl Gsm_RPE_Encoding + .globl Gsm_RPE_Encoding # -- Begin function Gsm_RPE_Encoding .p2align 5 .type Gsm_RPE_Encoding,@function Gsm_RPE_Encoding: # @Gsm_RPE_Encoding # %bb.0: # %vector.ph - addi.d $sp, $sp, -320 - st.d $ra, $sp, 312 # 8-byte Folded Spill - st.d $fp, $sp, 304 # 8-byte Folded Spill - st.d $s0, $sp, 296 # 8-byte Folded Spill - st.d $s1, $sp, 288 # 8-byte Folded Spill - st.d $s2, $sp, 280 # 8-byte Folded Spill - st.d $s3, $sp, 272 # 8-byte Folded Spill - st.d $s4, $sp, 264 # 8-byte Folded Spill - st.d $s5, $sp, 256 # 8-byte Folded Spill - st.d $s6, $sp, 248 # 8-byte Folded Spill - st.d $s7, $sp, 240 # 8-byte Folded Spill - st.d $s8, $sp, 232 # 8-byte Folded Spill - fst.d $fs0, $sp, 224 # 8-byte Folded Spill - fst.d $fs1, $sp, 216 # 8-byte Folded Spill - fst.d $fs2, $sp, 208 # 8-byte Folded Spill - fst.d $fs3, $sp, 200 # 8-byte Folded Spill - fst.d $fs4, $sp, 192 # 8-byte Folded Spill - fst.d $fs5, $sp, 184 # 8-byte Folded Spill - fst.d $fs6, $sp, 176 # 8-byte Folded Spill + addi.d $sp, $sp, -288 + st.d $ra, $sp, 280 # 8-byte Folded Spill + st.d $fp, $sp, 272 # 8-byte Folded Spill + st.d $s0, $sp, 264 # 8-byte Folded Spill + st.d $s1, $sp, 256 # 8-byte Folded Spill + st.d $s2, $sp, 248 # 8-byte Folded Spill + st.d $s3, $sp, 240 # 8-byte Folded Spill + st.d $s4, $sp, 232 # 8-byte Folded Spill + st.d $s5, $sp, 224 # 8-byte Folded Spill + st.d $s6, $sp, 216 # 8-byte Folded Spill + st.d $s7, $sp, 208 # 8-byte 
Folded Spill + st.d $s8, $sp, 200 # 8-byte Folded Spill + fst.d $fs0, $sp, 192 # 8-byte Folded Spill + fst.d $fs1, $sp, 184 # 8-byte Folded Spill move $fp, $a1 ld.h $a0, $a1, -10 - st.d $a4, $sp, 48 # 8-byte Folded Spill + st.d $a4, $sp, 56 # 8-byte Folded Spill move $t3, $a3 - st.d $a2, $sp, 40 # 8-byte Folded Spill + st.d $a2, $sp, 48 # 8-byte Folded Spill move $a2, $zero - vinsgr2vr.h $vr14, $a0, 7 - pcalau12i $a0, %pc_hi20(.LCPI0_0) - vld $vr0, $a0, %pc_lo12(.LCPI0_0) - pcalau12i $a0, %pc_hi20(.LCPI0_1) - vld $vr1, $a0, %pc_lo12(.LCPI0_1) - pcalau12i $a0, %pc_hi20(.LCPI0_2) - vld $vr2, $a0, %pc_lo12(.LCPI0_2) - pcalau12i $a0, %pc_hi20(.LCPI0_3) - vld $vr3, $a0, %pc_lo12(.LCPI0_3) + vinsgr2vr.h $vr8, $a0, 7 lu12i.w $a0, 1 ori $a1, $a0, 1645 - vreplgr2vr.d $vr4, $a1 + vreplgr2vr.d $vr0, $a1 ori $a1, $zero, 2054 - vreplgr2vr.d $vr5, $a1 - vrepli.d $vr6, -374 - vrepli.d $vr7, -134 - vreplgr2vr.d $vr8, $a0 + vreplgr2vr.d $vr1, $a1 + vrepli.d $vr2, -374 + vrepli.d $vr3, -134 + vreplgr2vr.d $vr4, $a0 lu12i.w $a1, -8 - vreplgr2vr.d $vr9, $a1 + vreplgr2vr.d $vr5, $a1 lu12i.w $a1, 7 ori $a1, $a1, 4095 - vreplgr2vr.d $vr10, $a1 - addi.d $a3, $sp, 96 + vreplgr2vr.d $vr6, $a1 + addi.d $a3, $sp, 104 ori $a4, $zero, 80 .p2align 4, , 16 .LBB0_1: # %vector.body # =>This Inner Loop Header: Depth=1 add.d $a5, $fp, $a2 - vld $vr11, $a5, -8 - vori.b $vr12, $vr0, 0 - vshuf.h $vr12, $vr0, $vr11 - vslli.d $vr12, $vr12, 48 - vsrai.d $vr12, $vr12, 48 - vori.b $vr13, $vr1, 0 - vshuf.h $vr13, $vr0, $vr11 - vslli.d $vr13, $vr13, 48 - vsrai.d $vr13, $vr13, 48 - vori.b $vr15, $vr2, 0 - vshuf.h $vr15, $vr11, $vr14 - vslli.d $vr14, $vr15, 48 - vsrai.d $vr14, $vr14, 48 - vori.b $vr15, $vr3, 0 - vshuf.h $vr15, $vr0, $vr11 - vslli.d $vr15, $vr15, 48 - vsrai.d $vr15, $vr15, 48 - vilvl.h $vr17, $vr11, $vr11 - vilvh.w $vr16, $vr17, $vr17 - vslli.d $vr16, $vr16, 48 - vsrai.d $vr16, $vr16, 48 - vilvl.w $vr17, $vr17, $vr17 - vslli.d $vr17, $vr17, 48 - vsrai.d $vr17, $vr17, 48 - vilvh.h $vr19, $vr11, $vr11 - vilvh.w $vr18, $vr19, $vr19 - vslli.d $vr18, $vr18, 48 - vsrai.d $vr18, $vr18, 48 - vld $vr22, $a5, -4 - vilvl.w $vr19, $vr19, $vr19 - vslli.d $vr19, $vr19, 48 - vsrai.d $vr19, $vr19, 48 - vilvh.h $vr21, $vr22, $vr22 - vilvl.w $vr20, $vr21, $vr21 - vslli.d $vr20, $vr20, 48 - vsrai.d $vr20, $vr20, 48 - vilvh.w $vr21, $vr21, $vr21 - vslli.d $vr21, $vr21, 48 - vsrai.d $vr21, $vr21, 48 - vilvl.h $vr23, $vr22, $vr22 - vilvl.w $vr22, $vr23, $vr23 - vslli.d $vr22, $vr22, 48 - vsrai.d $vr22, $vr22, 48 - vld $vr26, $a5, -2 - vilvh.w $vr23, $vr23, $vr23 - vslli.d $vr23, $vr23, 48 - vsrai.d $vr23, $vr23, 48 - vilvh.h $vr25, $vr26, $vr26 - vilvl.w $vr24, $vr25, $vr25 - vslli.d $vr24, $vr24, 48 - vsrai.d $vr24, $vr24, 48 - vilvh.w $vr25, $vr25, $vr25 - vslli.d $vr25, $vr25, 48 - vsrai.d $vr25, $vr25, 48 - vilvl.h $vr26, $vr26, $vr26 - vilvl.w $vr27, $vr26, $vr26 - vslli.d $vr27, $vr27, 48 - vsrai.d $vr27, $vr27, 48 - vld $vr28, $a5, 2 - vilvh.w $vr26, $vr26, $vr26 - vslli.d $vr26, $vr26, 48 - vsrai.d $vr26, $vr26, 48 - vilvh.h $vr29, $vr28, $vr28 - vilvl.h $vr28, $vr28, $vr28 - vilvh.w $vr30, $vr28, $vr28 - vslli.d $vr30, $vr30, 48 - vsrai.d $vr30, $vr30, 48 - vadd.d $vr26, $vr30, $vr26 - vilvl.w $vr30, $vr29, $vr29 - vslli.d $vr30, $vr30, 48 - vsrai.d $vr30, $vr30, 48 - vilvh.w $vr29, $vr29, $vr29 - vilvl.w $vr28, $vr28, $vr28 - vslli.d $vr28, $vr28, 48 - vsrai.d $vr28, $vr28, 48 - vadd.d $vr27, $vr28, $vr27 - vld $vr28, $a5, 4 - vslli.d $vr29, $vr29, 48 - vsrai.d $vr29, $vr29, 48 - vadd.d $vr25, $vr29, $vr25 - vilvh.h $vr29, $vr28, 
$vr28 - vilvl.h $vr28, $vr28, $vr28 - vadd.d $vr24, $vr30, $vr24 - vilvh.w $vr30, $vr28, $vr28 - vslli.d $vr30, $vr30, 48 - vsrai.d $vr30, $vr30, 48 - vadd.d $vr23, $vr30, $vr23 - vilvl.w $vr30, $vr29, $vr29 - vslli.d $vr30, $vr30, 48 - vsrai.d $vr30, $vr30, 48 - vilvh.w $vr29, $vr29, $vr29 - vilvl.w $vr28, $vr28, $vr28 - vslli.d $vr28, $vr28, 48 - vsrai.d $vr28, $vr28, 48 - vadd.d $vr22, $vr28, $vr22 - vld $vr28, $a5, 8 - vslli.d $vr29, $vr29, 48 - vsrai.d $vr29, $vr29, 48 - vadd.d $vr21, $vr29, $vr21 - vilvl.h $vr29, $vr28, $vr28 - vilvh.h $vr28, $vr28, $vr28 - vadd.d $vr20, $vr30, $vr20 - vilvl.w $vr30, $vr28, $vr28 - vslli.d $vr30, $vr30, 48 - vsrai.d $vr30, $vr30, 48 - vadd.d $vr19, $vr30, $vr19 - vilvh.w $vr30, $vr29, $vr29 - vslli.d $vr30, $vr30, 48 - vsrai.d $vr30, $vr30, 48 - vilvl.w $vr29, $vr29, $vr29 - vilvh.w $vr28, $vr28, $vr28 - vslli.d $vr28, $vr28, 48 - vsrai.d $vr28, $vr28, 48 - vadd.d $vr18, $vr28, $vr18 - vld $vr28, $a5, 10 - vslli.d $vr29, $vr29, 48 - vsrai.d $vr29, $vr29, 48 - vadd.d $vr17, $vr29, $vr17 - vilvl.h $vr29, $vr28, $vr28 - vadd.d $vr16, $vr30, $vr16 - vilvh.w $vr30, $vr29, $vr29 - vslli.d $vr30, $vr30, 48 - vsrai.d $vr30, $vr30, 48 - vadd.d $vr15, $vr30, $vr15 - vldx $vr30, $fp, $a2 - vilvh.h $vr28, $vr28, $vr28 - vilvl.w $vr29, $vr29, $vr29 - vslli.d $vr29, $vr29, 48 - vsrai.d $vr29, $vr29, 48 - vadd.d $vr14, $vr29, $vr14 - vilvh.w $vr29, $vr28, $vr28 - vslli.d $vr29, $vr29, 48 - vsrai.d $vr29, $vr29, 48 - vadd.d $vr13, $vr29, $vr13 - vilvh.h $vr29, $vr30, $vr30 - vilvl.w $vr28, $vr28, $vr28 - vslli.d $vr28, $vr28, 48 - vsrai.d $vr28, $vr28, 48 - vadd.d $vr12, $vr28, $vr12 - vilvl.w $vr28, $vr29, $vr29 - vslli.d $vr28, $vr28, 48 - vsrai.d $vr28, $vr28, 48 - vslli.d $vr28, $vr28, 13 - vmadd.d $vr28, $vr24, $vr4 - vmadd.d $vr28, $vr20, $vr5 - vilvh.w $vr20, $vr29, $vr29 - vslli.d $vr20, $vr20, 48 - vsrai.d $vr20, $vr20, 48 - vslli.d $vr20, $vr20, 13 - vmadd.d $vr20, $vr25, $vr4 - vilvl.h $vr24, $vr30, $vr30 - vmadd.d $vr20, $vr21, $vr5 - vilvl.w $vr21, $vr24, $vr24 - vslli.d $vr21, $vr21, 48 - vsrai.d $vr21, $vr21, 48 - vslli.d $vr21, $vr21, 13 - vmadd.d $vr21, $vr27, $vr4 - vmadd.d $vr21, $vr22, $vr5 - vilvh.w $vr22, $vr24, $vr24 - vslli.d $vr22, $vr22, 48 - vsrai.d $vr22, $vr22, 48 - vslli.d $vr22, $vr22, 13 - vmadd.d $vr22, $vr26, $vr4 - vmadd.d $vr22, $vr23, $vr5 - vmadd.d $vr22, $vr16, $vr6 - vmadd.d $vr21, $vr17, $vr6 - vmadd.d $vr20, $vr18, $vr6 - vmadd.d $vr28, $vr19, $vr6 - vmadd.d $vr28, $vr12, $vr7 - vmadd.d $vr20, $vr13, $vr7 - vmadd.d $vr21, $vr14, $vr7 - vmadd.d $vr22, $vr15, $vr7 - vadd.d $vr12, $vr22, $vr8 - vadd.d $vr13, $vr21, $vr8 - vsrai.d $vr13, $vr13, 13 - vsrai.d $vr12, $vr12, 13 - vmax.d $vr12, $vr12, $vr9 - vmax.d $vr13, $vr13, $vr9 - vmin.d $vr13, $vr13, $vr10 - vmin.d $vr12, $vr12, $vr10 - vpickev.w $vr12, $vr12, $vr13 - vadd.d $vr13, $vr20, $vr8 - vadd.d $vr14, $vr28, $vr8 - vsrai.d $vr14, $vr14, 13 - vsrai.d $vr13, $vr13, 13 - vmax.d $vr13, $vr13, $vr9 - vmax.d $vr14, $vr14, $vr9 - vmin.d $vr14, $vr14, $vr10 - vmin.d $vr13, $vr13, $vr10 - vpickev.w $vr13, $vr13, $vr14 - vpickev.h $vr12, $vr13, $vr12 - vstx $vr12, $a2, $a3 + vld $vr7, $a5, -8 + vbsrl.v $vr8, $vr8, 14 + vbsll.v $vr9, $vr7, 2 + vor.v $vr10, $vr9, $vr8 + vbsrl.v $vr8, $vr7, 6 + vsllwil.w.h $vr8, $vr8, 0 + vsllwil.d.w $vr8, $vr8, 0 + vbsrl.v $vr9, $vr7, 10 + vsllwil.w.h $vr9, $vr9, 0 + vsllwil.d.w $vr9, $vr9, 0 + vsllwil.w.h $vr10, $vr10, 0 + vsllwil.d.w $vr10, $vr10, 0 + vshuf4i.h $vr11, $vr7, 9 + vsllwil.w.h $vr11, $vr11, 0 + vsllwil.d.w $vr11, $vr11, 0 + vshuf4i.h 
$vr12, $vr7, 14 + vsllwil.w.h $vr12, $vr12, 0 + vsllwil.d.w $vr12, $vr12, 0 + vsllwil.w.h $vr13, $vr7, 0 + vsllwil.d.w $vr13, $vr13, 0 + vbsrl.v $vr14, $vr7, 12 + vsllwil.w.h $vr14, $vr14, 0 + vsllwil.d.w $vr14, $vr14, 0 + vld $vr19, $a5, -4 + vbsrl.v $vr15, $vr7, 8 + vsllwil.w.h $vr15, $vr15, 0 + vsllwil.d.w $vr15, $vr15, 0 + vbsrl.v $vr16, $vr19, 8 + vsllwil.w.h $vr16, $vr16, 0 + vsllwil.d.w $vr16, $vr16, 0 + vbsrl.v $vr17, $vr19, 12 + vsllwil.w.h $vr17, $vr17, 0 + vsllwil.d.w $vr17, $vr17, 0 + vsllwil.w.h $vr18, $vr19, 0 + vsllwil.d.w $vr18, $vr18, 0 + vld $vr20, $a5, -2 + vshuf4i.h $vr19, $vr19, 14 + vsllwil.w.h $vr19, $vr19, 0 + vsllwil.d.w $vr19, $vr19, 0 + vbsrl.v $vr21, $vr20, 8 + vsllwil.w.h $vr21, $vr21, 0 + vsllwil.d.w $vr21, $vr21, 0 + vbsrl.v $vr22, $vr20, 12 + vsllwil.w.h $vr22, $vr22, 0 + vsllwil.d.w $vr22, $vr22, 0 + vsllwil.w.h $vr23, $vr20, 0 + vsllwil.d.w $vr23, $vr23, 0 + vld $vr24, $a5, 2 + vshuf4i.h $vr20, $vr20, 14 + vsllwil.w.h $vr20, $vr20, 0 + vsllwil.d.w $vr20, $vr20, 0 + vshuf4i.h $vr25, $vr24, 14 + vsllwil.w.h $vr25, $vr25, 0 + vsllwil.d.w $vr25, $vr25, 0 + vadd.d $vr20, $vr25, $vr20 + vsllwil.w.h $vr25, $vr24, 0 + vsllwil.d.w $vr25, $vr25, 0 + vadd.d $vr23, $vr25, $vr23 + vbsrl.v $vr25, $vr24, 8 + vbsrl.v $vr24, $vr24, 12 + vsllwil.w.h $vr24, $vr24, 0 + vsllwil.d.w $vr24, $vr24, 0 + vadd.d $vr22, $vr24, $vr22 + vld $vr24, $a5, 4 + vsllwil.w.h $vr25, $vr25, 0 + vsllwil.d.w $vr25, $vr25, 0 + vadd.d $vr21, $vr25, $vr21 + vshuf4i.h $vr25, $vr24, 14 + vsllwil.w.h $vr25, $vr25, 0 + vsllwil.d.w $vr25, $vr25, 0 + vadd.d $vr19, $vr25, $vr19 + vsllwil.w.h $vr25, $vr24, 0 + vsllwil.d.w $vr25, $vr25, 0 + vadd.d $vr18, $vr25, $vr18 + vbsrl.v $vr25, $vr24, 8 + vbsrl.v $vr24, $vr24, 12 + vsllwil.w.h $vr24, $vr24, 0 + vsllwil.d.w $vr24, $vr24, 0 + vadd.d $vr17, $vr24, $vr17 + vld $vr24, $a5, 8 + vsllwil.w.h $vr25, $vr25, 0 + vsllwil.d.w $vr25, $vr25, 0 + vadd.d $vr16, $vr25, $vr16 + vbsrl.v $vr25, $vr24, 8 + vsllwil.w.h $vr25, $vr25, 0 + vsllwil.d.w $vr25, $vr25, 0 + vadd.d $vr15, $vr25, $vr15 + vbsrl.v $vr25, $vr24, 12 + vsllwil.w.h $vr25, $vr25, 0 + vsllwil.d.w $vr25, $vr25, 0 + vadd.d $vr14, $vr25, $vr14 + vshuf4i.h $vr25, $vr24, 14 + vsllwil.w.h $vr24, $vr24, 0 + vsllwil.d.w $vr24, $vr24, 0 + vadd.d $vr13, $vr24, $vr13 + vld $vr24, $a5, 10 + vsllwil.w.h $vr25, $vr25, 0 + vsllwil.d.w $vr25, $vr25, 0 + vadd.d $vr12, $vr25, $vr12 + vshuf4i.h $vr25, $vr24, 14 + vsllwil.w.h $vr25, $vr25, 0 + vsllwil.d.w $vr25, $vr25, 0 + vadd.d $vr11, $vr25, $vr11 + vsllwil.w.h $vr25, $vr24, 0 + vsllwil.d.w $vr25, $vr25, 0 + vadd.d $vr10, $vr25, $vr10 + vbsrl.v $vr25, $vr24, 12 + vsllwil.w.h $vr25, $vr25, 0 + vsllwil.d.w $vr25, $vr25, 0 + vadd.d $vr9, $vr25, $vr9 + vldx $vr25, $fp, $a2 + vbsrl.v $vr24, $vr24, 8 + vsllwil.w.h $vr24, $vr24, 0 + vsllwil.d.w $vr24, $vr24, 0 + vadd.d $vr8, $vr24, $vr8 + vbsrl.v $vr24, $vr25, 8 + vsllwil.w.h $vr24, $vr24, 0 + vsllwil.d.w $vr24, $vr24, 0 + vslli.d $vr24, $vr24, 13 + vmadd.d $vr24, $vr21, $vr0 + vmadd.d $vr24, $vr16, $vr1 + vbsrl.v $vr16, $vr25, 12 + vsllwil.w.h $vr16, $vr16, 0 + vsllwil.d.w $vr16, $vr16, 0 + vslli.d $vr16, $vr16, 13 + vmadd.d $vr16, $vr22, $vr0 + vmadd.d $vr16, $vr17, $vr1 + vsllwil.w.h $vr17, $vr25, 0 + vsllwil.d.w $vr17, $vr17, 0 + vslli.d $vr17, $vr17, 13 + vmadd.d $vr17, $vr23, $vr0 + vmadd.d $vr17, $vr18, $vr1 + vshuf4i.h $vr18, $vr25, 14 + vsllwil.w.h $vr18, $vr18, 0 + vsllwil.d.w $vr18, $vr18, 0 + vslli.d $vr18, $vr18, 13 + vmadd.d $vr18, $vr20, $vr0 + vmadd.d $vr18, $vr19, $vr1 + vmadd.d $vr18, $vr12, $vr2 + vmadd.d $vr17, 
$vr13, $vr2 + vmadd.d $vr16, $vr14, $vr2 + vmadd.d $vr24, $vr15, $vr2 + vmadd.d $vr24, $vr8, $vr3 + vmadd.d $vr16, $vr9, $vr3 + vmadd.d $vr17, $vr10, $vr3 + vmadd.d $vr18, $vr11, $vr3 + vadd.d $vr8, $vr18, $vr4 + vadd.d $vr9, $vr17, $vr4 + vsrai.d $vr9, $vr9, 13 + vsrai.d $vr8, $vr8, 13 + vmax.d $vr8, $vr8, $vr5 + vmax.d $vr9, $vr9, $vr5 + vmin.d $vr9, $vr9, $vr6 + vmin.d $vr8, $vr8, $vr6 + vpickev.w $vr8, $vr8, $vr9 + vadd.d $vr9, $vr16, $vr4 + vadd.d $vr10, $vr24, $vr4 + vsrai.d $vr10, $vr10, 13 + vsrai.d $vr9, $vr9, 13 + vmax.d $vr9, $vr9, $vr5 + vmax.d $vr10, $vr10, $vr5 + vmin.d $vr10, $vr10, $vr6 + vmin.d $vr9, $vr9, $vr6 + vpickev.w $vr9, $vr9, $vr10 + vpickev.h $vr8, $vr9, $vr8 + vstx $vr8, $a2, $a3 addi.d $a2, $a2, 16 - vori.b $vr14, $vr11, 0 + vori.b $vr8, $vr7, 0 bne $a2, $a4, .LBB0_1 # %bb.2: # %Weighting_filter.exit - ld.h $a2, $sp, 96 - ld.h $a3, $sp, 98 + ld.h $a2, $sp, 104 + ld.h $a3, $sp, 106 srai.d $a2, $a2, 2 mul.d $a2, $a2, $a2 - ld.h $a4, $sp, 104 + ld.h $a4, $sp, 112 srai.d $a3, $a3, 2 mul.d $a3, $a3, $a3 - ld.h $a5, $sp, 110 + ld.h $a5, $sp, 118 srai.d $a4, $a4, 2 mul.d $a4, $a4, $a4 add.d $a3, $a4, $a3 srai.d $a4, $a5, 2 - ld.h $a5, $sp, 116 + ld.h $a5, $sp, 124 mul.d $a4, $a4, $a4 add.d $a3, $a3, $a4 - ld.h $a4, $sp, 122 + ld.h $a4, $sp, 130 srai.d $a5, $a5, 2 mul.d $a5, $a5, $a5 add.d $a3, $a3, $a5 srai.d $a4, $a4, 2 - ld.h $a5, $sp, 128 + ld.h $a5, $sp, 136 mul.d $a4, $a4, $a4 add.d $a3, $a3, $a4 - ld.h $a4, $sp, 134 + ld.h $a4, $sp, 142 srai.d $a5, $a5, 2 mul.d $a5, $a5, $a5 add.d $a3, $a3, $a5 srai.d $a4, $a4, 2 - ld.h $a5, $sp, 140 + ld.h $a5, $sp, 148 mul.d $a4, $a4, $a4 add.d $a3, $a3, $a4 - ld.h $a4, $sp, 146 + ld.h $a4, $sp, 154 srai.d $a5, $a5, 2 mul.d $a5, $a5, $a5 add.d $a3, $a3, $a5 srai.d $a4, $a4, 2 - ld.h $a5, $sp, 152 + ld.h $a5, $sp, 160 mul.d $a4, $a4, $a4 add.d $a3, $a3, $a4 - ld.h $a4, $sp, 158 + ld.h $a4, $sp, 166 srai.d $a5, $a5, 2 mul.d $a5, $a5, $a5 add.d $a3, $a3, $a5 srai.d $a4, $a4, 2 - ld.h $a5, $sp, 164 + ld.h $a5, $sp, 172 mul.d $a4, $a4, $a4 add.d $a3, $a3, $a4 - ld.h $a4, $sp, 170 + ld.h $a4, $sp, 178 srai.d $a5, $a5, 2 mul.d $a5, $a5, $a5 add.d $a3, $a3, $a5 srai.d $a4, $a4, 2 - ld.w $a5, $sp, 100 mul.d $a4, $a4, $a4 + ld.w $a5, $sp, 108 add.d $a3, $a3, $a4 slli.d $a3, $a3, 1 + ld.w $a4, $sp, 114 vinsgr2vr.w $vr0, $a5, 0 vsrai.h $vr0, $vr0, 2 - ld.w $a4, $sp, 106 - vilvl.h $vr0, $vr0, $vr0 - vslli.w $vr0, $vr0, 16 - vsrai.w $vr0, $vr0, 16 + vsllwil.w.h $vr0, $vr0, 0 vinsgr2vr.w $vr1, $a4, 0 vsrai.h $vr1, $vr1, 2 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - ld.w $a4, $sp, 112 - vsrai.w $vr1, $vr1, 16 + ld.w $a4, $sp, 120 + vsllwil.w.h $vr1, $vr1, 0 vmul.w $vr1, $vr1, $vr1 vmadd.w $vr1, $vr0, $vr0 vinsgr2vr.w $vr0, $a4, 0 + ld.w $a4, $sp, 126 vsrai.h $vr0, $vr0, 2 - vilvl.h $vr0, $vr0, $vr0 - ld.w $a4, $sp, 118 - vslli.w $vr0, $vr0, 16 - vsrai.w $vr0, $vr0, 16 + vsllwil.w.h $vr0, $vr0, 0 vmadd.w $vr1, $vr0, $vr0 vinsgr2vr.w $vr0, $a4, 0 vsrai.h $vr0, $vr0, 2 - vilvl.h $vr0, $vr0, $vr0 - vilvl.w $vr0, $vr0, $vr0 - ld.w $a4, $sp, 124 - vslli.d $vr0, $vr0, 48 - ld.w $a5, $sp, 130 - vsrai.d $vr2, $vr0, 48 + ld.w $a4, $sp, 132 + vsllwil.w.h $vr0, $vr0, 0 + ld.w $a5, $sp, 138 + vsllwil.d.w $vr2, $vr0, 0 vinsgr2vr.w $vr0, $a4, 0 vsrai.h $vr3, $vr0, 2 vinsgr2vr.w $vr0, $a5, 0 vsrai.h $vr4, $vr0, 2 - ld.h $a4, $sp, 138 - vrepli.b $vr0, 0 - vilvl.w $vr0, $vr0, $vr1 + ld.h $a4, $sp, 146 + vsllwil.du.wu $vr0, $vr1, 0 vmadd.d $vr0, $vr2, $vr2 - vilvl.h $vr1, $vr3, $vr3 - vilvl.w $vr1, $vr1, $vr1 - vslli.d $vr1, $vr1, 48 - vsrai.d $vr1, 
$vr1, 48 - vmadd.d $vr0, $vr1, $vr1 - vilvl.h $vr1, $vr4, $vr4 - vilvl.w $vr1, $vr1, $vr1 - vld $vr2, $sp, 136 - vslli.d $vr1, $vr1, 48 - vsrai.d $vr1, $vr1, 48 + vsllwil.w.h $vr1, $vr3, 0 + vsllwil.d.w $vr1, $vr1, 0 vmadd.d $vr0, $vr1, $vr1 - vinsgr2vr.h $vr2, $a4, 1 - vsrai.h $vr1, $vr2, 2 - vilvl.h $vr1, $vr1, $vr1 - vilvl.w $vr1, $vr1, $vr1 - ld.w $a4, $sp, 142 - vslli.d $vr1, $vr1, 48 - vsrai.d $vr1, $vr1, 48 + vld $vr1, $sp, 144 + vsllwil.w.h $vr2, $vr4, 0 + vsllwil.d.w $vr2, $vr2, 0 + vmadd.d $vr0, $vr2, $vr2 + vinsgr2vr.h $vr1, $a4, 1 + vsrai.h $vr1, $vr1, 2 + ld.w $a4, $sp, 150 + vsllwil.w.h $vr1, $vr1, 0 + vsllwil.d.w $vr1, $vr1, 0 vmadd.d $vr0, $vr1, $vr1 vinsgr2vr.w $vr1, $a4, 0 vsrai.h $vr1, $vr1, 2 - vilvl.h $vr1, $vr1, $vr1 - vilvl.w $vr1, $vr1, $vr1 - ld.w $a4, $sp, 148 - vslli.d $vr1, $vr1, 48 - vsrai.d $vr1, $vr1, 48 + ld.w $a4, $sp, 156 + vsllwil.w.h $vr1, $vr1, 0 + vsllwil.d.w $vr1, $vr1, 0 vmadd.d $vr0, $vr1, $vr1 vinsgr2vr.w $vr1, $a4, 0 vsrai.h $vr1, $vr1, 2 - vilvl.h $vr1, $vr1, $vr1 - vilvl.w $vr1, $vr1, $vr1 - ld.w $a4, $sp, 154 - vslli.d $vr1, $vr1, 48 - vsrai.d $vr1, $vr1, 48 + ld.w $a4, $sp, 162 + vsllwil.w.h $vr1, $vr1, 0 + vsllwil.d.w $vr1, $vr1, 0 vmadd.d $vr0, $vr1, $vr1 vinsgr2vr.w $vr1, $a4, 0 vsrai.h $vr1, $vr1, 2 - vilvl.h $vr1, $vr1, $vr1 - vilvl.w $vr1, $vr1, $vr1 - ld.w $a4, $sp, 160 - vslli.d $vr1, $vr1, 48 - vsrai.d $vr1, $vr1, 48 + ld.w $a4, $sp, 168 + vsllwil.w.h $vr1, $vr1, 0 + vsllwil.d.w $vr1, $vr1, 0 vmadd.d $vr0, $vr1, $vr1 vinsgr2vr.w $vr1, $a4, 0 vsrai.h $vr1, $vr1, 2 - vilvl.h $vr1, $vr1, $vr1 - vilvl.w $vr1, $vr1, $vr1 - ld.w $a4, $sp, 166 - vslli.d $vr1, $vr1, 48 - vsrai.d $vr1, $vr1, 48 + ld.w $a4, $sp, 174 + vsllwil.w.h $vr1, $vr1, 0 + vsllwil.d.w $vr1, $vr1, 0 vmadd.d $vr0, $vr1, $vr1 vinsgr2vr.w $vr1, $a4, 0 vsrai.h $vr1, $vr1, 2 - vilvl.h $vr1, $vr1, $vr1 - vilvl.w $vr1, $vr1, $vr1 - vslli.d $vr1, $vr1, 48 - vsrai.d $vr1, $vr1, 48 + vsllwil.w.h $vr1, $vr1, 0 + vsllwil.d.w $vr1, $vr1, 0 vmadd.d $vr0, $vr1, $vr1 vpickve2gr.d $a4, $vr0, 1 add.d $a2, $a4, $a2 slli.d $a2, $a2, 1 slt $a4, $a2, $a3 - ld.w $a5, $sp, 172 + ld.w $a5, $sp, 180 masknez $a2, $a2, $a4 maskeqz $a3, $a3, $a4 or $a2, $a3, $a2 vinsgr2vr.w $vr1, $a5, 0 vsrai.h $vr1, $vr1, 2 - vilvl.h $vr1, $vr1, $vr1 - vilvl.w $vr1, $vr1, $vr1 - vslli.d $vr1, $vr1, 48 - vsrai.d $vr1, $vr1, 48 + vsllwil.w.h $vr1, $vr1, 0 + vsllwil.d.w $vr1, $vr1, 0 vmadd.d $vr0, $vr1, $vr1 vslli.d $vr0, $vr0, 1 vpickve2gr.d $a3, $vr0, 0 @@ -479,7 +375,7 @@ Gsm_RPE_Encoding: # @Gsm_RPE_Encoding ori $a4, $zero, 3 maskeqz $a3, $a4, $a3 or $a3, $a3, $a2 - addi.d $a4, $sp, 96 + addi.d $a4, $sp, 104 alsl.d $a2, $a3, $a4, 1 slli.d $a5, $a3, 1 ldx.hu $a6, $a5, $a4 @@ -496,7 +392,7 @@ Gsm_RPE_Encoding: # @Gsm_RPE_Encoding ext.w.h $s0, $a7 ext.w.h $s2, $a5 ext.w.h $s8, $a4 - st.d $t3, $sp, 56 # 8-byte Folded Spill + st.d $t3, $sp, 64 # 8-byte Folded Spill st.h $a3, $t3, 0 lu12i.w $a3, 8 xor $t3, $a6, $a3 @@ -505,7 +401,7 @@ Gsm_RPE_Encoding: # @Gsm_RPE_Encoding masknez $t4, $t4, $t3 maskeqz $t3, $a1, $t3 or $t3, $t3, $t4 - st.d $t7, $sp, 16 # 8-byte Folded Spill + st.d $t7, $sp, 24 # 8-byte Folded Spill slti $t4, $t7, 0 masknez $a6, $a6, $t4 maskeqz $t3, $t3, $t4 @@ -516,7 +412,7 @@ Gsm_RPE_Encoding: # @Gsm_RPE_Encoding masknez $t4, $t4, $a6 maskeqz $a6, $a1, $a6 or $a6, $a6, $t4 - st.d $t6, $sp, 24 # 8-byte Folded Spill + st.d $t6, $sp, 32 # 8-byte Folded Spill slti $t4, $t6, 0 masknez $t2, $t2, $t4 maskeqz $t4, $a6, $t4 @@ -534,7 +430,7 @@ Gsm_RPE_Encoding: # @Gsm_RPE_Encoding masknez $t4, $t4, $t3 maskeqz 
$t3, $a1, $t3 or $t3, $t3, $t4 - st.d $t5, $sp, 32 # 8-byte Folded Spill + st.d $t5, $sp, 40 # 8-byte Folded Spill slti $t4, $t5, 0 masknez $t1, $t1, $t4 maskeqz $t3, $t3, $t4 @@ -551,7 +447,7 @@ Gsm_RPE_Encoding: # @Gsm_RPE_Encoding masknez $t3, $t3, $t2 maskeqz $t2, $a1, $t2 or $t2, $t2, $t3 - st.d $t8, $sp, 8 # 8-byte Folded Spill + st.d $t8, $sp, 16 # 8-byte Folded Spill slti $t3, $t8, 0 masknez $t4, $t0, $t3 maskeqz $t2, $t2, $t3 @@ -571,7 +467,7 @@ Gsm_RPE_Encoding: # @Gsm_RPE_Encoding slti $t3, $s0, 0 masknez $a7, $a7, $t3 maskeqz $t2, $t2, $t3 - ext.w.h $s1, $t0 + ext.w.h $s4, $t0 or $a7, $t2, $a7 ext.w.h $a7, $a7 slt $t2, $a7, $t1 @@ -603,7 +499,7 @@ Gsm_RPE_Encoding: # @Gsm_RPE_Encoding slti $t2, $s8, 0 masknez $a4, $a4, $t2 maskeqz $t1, $t1, $t2 - ext.w.h $s4, $a5 + ext.w.h $s5, $a5 or $a4, $t1, $a4 ext.w.h $a4, $a4 slt $t1, $a4, $a7 @@ -632,10 +528,10 @@ Gsm_RPE_Encoding: # @Gsm_RPE_Encoding masknez $a7, $a7, $a6 maskeqz $a6, $a1, $a6 or $a6, $a6, $a7 - slti $a7, $s1, 0 + slti $a7, $s4, 0 masknez $t0, $t0, $a7 maskeqz $a6, $a6, $a7 - ext.w.h $s5, $t1 + ext.w.h $s1, $t1 or $a6, $a6, $t0 ext.w.h $a6, $a6 slt $a7, $a6, $a4 @@ -648,7 +544,7 @@ Gsm_RPE_Encoding: # @Gsm_RPE_Encoding masknez $a7, $a7, $a6 maskeqz $a6, $a1, $a6 or $a6, $a6, $a7 - slti $a7, $s4, 0 + slti $a7, $s5, 0 masknez $a5, $a5, $a7 maskeqz $a6, $a6, $a7 ld.hu $a7, $a2, 66 @@ -664,7 +560,7 @@ Gsm_RPE_Encoding: # @Gsm_RPE_Encoding masknez $a6, $a6, $a5 maskeqz $a5, $a1, $a5 or $a5, $a5, $a6 - slti $a6, $s5, 0 + slti $a6, $s1, 0 masknez $t0, $t1, $a6 maskeqz $a5, $a5, $a6 ext.w.h $s6, $a7 @@ -776,13 +672,13 @@ Gsm_RPE_Encoding: # @Gsm_RPE_Encoding vinsgr2vr.h $vr0, $s2, 2 vinsgr2vr.h $vr0, $s8, 4 vinsgr2vr.h $vr0, $s3, 6 - ld.d $a6, $sp, 16 # 8-byte Folded Reload - vinsgr2vr.h $vr1, $a6, 0 ld.d $a6, $sp, 24 # 8-byte Folded Reload - vinsgr2vr.h $vr1, $a6, 2 + vinsgr2vr.h $vr1, $a6, 0 ld.d $a6, $sp, 32 # 8-byte Folded Reload + vinsgr2vr.h $vr1, $a6, 2 + ld.d $a6, $sp, 40 # 8-byte Folded Reload vinsgr2vr.h $vr1, $a6, 4 - ld.d $a6, $sp, 8 # 8-byte Folded Reload + ld.d $a6, $sp, 16 # 8-byte Folded Reload vinsgr2vr.h $vr1, $a6, 6 vreplgr2vr.w $vr2, $a5 vsll.w $vr1, $vr1, $vr2 @@ -800,23 +696,23 @@ Gsm_RPE_Encoding: # @Gsm_RPE_Encoding vsrai.w $vr0, $vr0, 28 vpickev.h $vr0, $vr0, $vr1 vaddi.hu $vr0, $vr0, 4 - ld.d $a6, $sp, 48 # 8-byte Folded Reload + ld.d $a6, $sp, 56 # 8-byte Folded Reload vst $vr0, $a6, 0 - sll.w $a5, $s1, $a3 + sll.w $a5, $s4, $a3 ext.w.h $a5, $a5 mul.d $a5, $a5, $a4 slli.w $a5, $a5, 1 srli.d $a5, $a5, 28 addi.d $a5, $a5, 4 st.h $a5, $a6, 16 - sll.w $a5, $s4, $a3 + sll.w $a5, $s5, $a3 ext.w.h $a5, $a5 mul.d $a5, $a5, $a4 slli.w $a5, $a5, 1 srli.d $a5, $a5, 28 addi.d $a5, $a5, 4 st.h $a5, $a6, 18 - sll.w $a5, $s5, $a3 + sll.w $a5, $s1, $a3 ext.w.h $a5, $a5 mul.d $a5, $a5, $a4 slli.w $a5, $a5, 1 @@ -837,16 +733,16 @@ Gsm_RPE_Encoding: # @Gsm_RPE_Encoding srli.d $a3, $a3, 28 addi.d $a3, $a3, 4 st.h $a3, $a6, 24 - ld.d $a3, $sp, 40 # 8-byte Folded Reload + ld.d $a3, $sp, 48 # 8-byte Folded Reload st.h $a0, $a3, 0 ext.w.h $a1, $a1 ext.w.h $a2, $a2 - addi.d $a3, $sp, 70 - addi.d $s0, $sp, 70 + addi.d $a3, $sp, 78 + addi.d $s0, $sp, 78 move $a0, $a6 pcaddu18i $ra, %call36(APCM_inverse_quantization) jirl $ra, $ra, 0 - ld.d $a0, $sp, 56 # 8-byte Folded Reload + ld.d $a0, $sp, 64 # 8-byte Folded Reload ld.hu $a0, $a0, 0 ori $a1, $zero, 3 bltu $a1, $a0, .LBB0_15 @@ -863,7 +759,7 @@ Gsm_RPE_Encoding: # @Gsm_RPE_Encoding addi.d $fp, $fp, 2 .LBB0_11: ori $a1, $zero, 13 - addi.d $s0, $sp, 70 + addi.d $s0, $sp, 78 
.LBB0_12: st.h $zero, $fp, 0 addi.d $fp, $fp, 2 @@ -892,47 +788,37 @@ Gsm_RPE_Encoding: # @Gsm_RPE_Encoding addi.d $a2, $a0, 2 move $a0, $fp move $a1, $zero - fld.d $fs6, $sp, 176 # 8-byte Folded Reload - fld.d $fs5, $sp, 184 # 8-byte Folded Reload - fld.d $fs4, $sp, 192 # 8-byte Folded Reload - fld.d $fs3, $sp, 200 # 8-byte Folded Reload - fld.d $fs2, $sp, 208 # 8-byte Folded Reload - fld.d $fs1, $sp, 216 # 8-byte Folded Reload - fld.d $fs0, $sp, 224 # 8-byte Folded Reload - ld.d $s8, $sp, 232 # 8-byte Folded Reload - ld.d $s7, $sp, 240 # 8-byte Folded Reload - ld.d $s6, $sp, 248 # 8-byte Folded Reload - ld.d $s5, $sp, 256 # 8-byte Folded Reload - ld.d $s4, $sp, 264 # 8-byte Folded Reload - ld.d $s3, $sp, 272 # 8-byte Folded Reload - ld.d $s2, $sp, 280 # 8-byte Folded Reload - ld.d $s1, $sp, 288 # 8-byte Folded Reload - ld.d $s0, $sp, 296 # 8-byte Folded Reload - ld.d $fp, $sp, 304 # 8-byte Folded Reload - ld.d $ra, $sp, 312 # 8-byte Folded Reload - addi.d $sp, $sp, 320 + fld.d $fs1, $sp, 184 # 8-byte Folded Reload + fld.d $fs0, $sp, 192 # 8-byte Folded Reload + ld.d $s8, $sp, 200 # 8-byte Folded Reload + ld.d $s7, $sp, 208 # 8-byte Folded Reload + ld.d $s6, $sp, 216 # 8-byte Folded Reload + ld.d $s5, $sp, 224 # 8-byte Folded Reload + ld.d $s4, $sp, 232 # 8-byte Folded Reload + ld.d $s3, $sp, 240 # 8-byte Folded Reload + ld.d $s2, $sp, 248 # 8-byte Folded Reload + ld.d $s1, $sp, 256 # 8-byte Folded Reload + ld.d $s0, $sp, 264 # 8-byte Folded Reload + ld.d $fp, $sp, 272 # 8-byte Folded Reload + ld.d $ra, $sp, 280 # 8-byte Folded Reload + addi.d $sp, $sp, 288 pcaddu18i $t8, %call36(memset) jr $t8 .LBB0_17: # %RPE_grid_positioning.exit - fld.d $fs6, $sp, 176 # 8-byte Folded Reload - fld.d $fs5, $sp, 184 # 8-byte Folded Reload - fld.d $fs4, $sp, 192 # 8-byte Folded Reload - fld.d $fs3, $sp, 200 # 8-byte Folded Reload - fld.d $fs2, $sp, 208 # 8-byte Folded Reload - fld.d $fs1, $sp, 216 # 8-byte Folded Reload - fld.d $fs0, $sp, 224 # 8-byte Folded Reload - ld.d $s8, $sp, 232 # 8-byte Folded Reload - ld.d $s7, $sp, 240 # 8-byte Folded Reload - ld.d $s6, $sp, 248 # 8-byte Folded Reload - ld.d $s5, $sp, 256 # 8-byte Folded Reload - ld.d $s4, $sp, 264 # 8-byte Folded Reload - ld.d $s3, $sp, 272 # 8-byte Folded Reload - ld.d $s2, $sp, 280 # 8-byte Folded Reload - ld.d $s1, $sp, 288 # 8-byte Folded Reload - ld.d $s0, $sp, 296 # 8-byte Folded Reload - ld.d $fp, $sp, 304 # 8-byte Folded Reload - ld.d $ra, $sp, 312 # 8-byte Folded Reload - addi.d $sp, $sp, 320 + fld.d $fs1, $sp, 184 # 8-byte Folded Reload + fld.d $fs0, $sp, 192 # 8-byte Folded Reload + ld.d $s8, $sp, 200 # 8-byte Folded Reload + ld.d $s7, $sp, 208 # 8-byte Folded Reload + ld.d $s6, $sp, 216 # 8-byte Folded Reload + ld.d $s5, $sp, 224 # 8-byte Folded Reload + ld.d $s4, $sp, 232 # 8-byte Folded Reload + ld.d $s3, $sp, 240 # 8-byte Folded Reload + ld.d $s2, $sp, 248 # 8-byte Folded Reload + ld.d $s1, $sp, 256 # 8-byte Folded Reload + ld.d $s0, $sp, 264 # 8-byte Folded Reload + ld.d $fp, $sp, 272 # 8-byte Folded Reload + ld.d $ra, $sp, 280 # 8-byte Folded Reload + addi.d $sp, $sp, 288 ret .Lfunc_end0: .size Gsm_RPE_Encoding, .Lfunc_end0-Gsm_RPE_Encoding diff --git a/results/MultiSource/Benchmarks/mediabench/jpeg/jpeg-6a/CMakeFiles/cjpeg.dir/jcdctmgr.s b/results/MultiSource/Benchmarks/mediabench/jpeg/jpeg-6a/CMakeFiles/cjpeg.dir/jcdctmgr.s index 141ca95c..472eed67 100644 --- a/results/MultiSource/Benchmarks/mediabench/jpeg/jpeg-6a/CMakeFiles/cjpeg.dir/jcdctmgr.s +++ 
b/results/MultiSource/Benchmarks/mediabench/jpeg/jpeg-6a/CMakeFiles/cjpeg.dir/jcdctmgr.s @@ -94,24 +94,24 @@ jinit_forward_dct: # @jinit_forward_dct .type start_pass_fdctmgr,@function start_pass_fdctmgr: # @start_pass_fdctmgr # %bb.0: - addi.d $sp, $sp, -192 - st.d $ra, $sp, 184 # 8-byte Folded Spill - st.d $fp, $sp, 176 # 8-byte Folded Spill - st.d $s0, $sp, 168 # 8-byte Folded Spill - st.d $s1, $sp, 160 # 8-byte Folded Spill - st.d $s2, $sp, 152 # 8-byte Folded Spill - st.d $s3, $sp, 144 # 8-byte Folded Spill - st.d $s4, $sp, 136 # 8-byte Folded Spill - st.d $s5, $sp, 128 # 8-byte Folded Spill - st.d $s6, $sp, 120 # 8-byte Folded Spill - st.d $s7, $sp, 112 # 8-byte Folded Spill - st.d $s8, $sp, 104 # 8-byte Folded Spill - fst.d $fs0, $sp, 96 # 8-byte Folded Spill - fst.d $fs1, $sp, 88 # 8-byte Folded Spill - fst.d $fs2, $sp, 80 # 8-byte Folded Spill - fst.d $fs3, $sp, 72 # 8-byte Folded Spill - fst.d $fs4, $sp, 64 # 8-byte Folded Spill - fst.d $fs5, $sp, 56 # 8-byte Folded Spill + addi.d $sp, $sp, -176 + st.d $ra, $sp, 168 # 8-byte Folded Spill + st.d $fp, $sp, 160 # 8-byte Folded Spill + st.d $s0, $sp, 152 # 8-byte Folded Spill + st.d $s1, $sp, 144 # 8-byte Folded Spill + st.d $s2, $sp, 136 # 8-byte Folded Spill + st.d $s3, $sp, 128 # 8-byte Folded Spill + st.d $s4, $sp, 120 # 8-byte Folded Spill + st.d $s5, $sp, 112 # 8-byte Folded Spill + st.d $s6, $sp, 104 # 8-byte Folded Spill + st.d $s7, $sp, 96 # 8-byte Folded Spill + st.d $s8, $sp, 88 # 8-byte Folded Spill + fst.d $fs0, $sp, 80 # 8-byte Folded Spill + fst.d $fs1, $sp, 72 # 8-byte Folded Spill + fst.d $fs2, $sp, 64 # 8-byte Folded Spill + fst.d $fs3, $sp, 56 # 8-byte Folded Spill + fst.d $fs4, $sp, 48 # 8-byte Folded Spill + fst.d $fs5, $sp, 40 # 8-byte Folded Spill move $fp, $a0 ld.w $a0, $a0, 68 blez $a0, .LBB1_21 @@ -139,12 +139,10 @@ start_pass_fdctmgr: # @start_pass_fdctmgr pcalau12i $a0, %pc_hi20(.LCPI1_5) fld.d $fs5, $a0, %pc_lo12(.LCPI1_5) ori $s2, $zero, 64 - vrepli.b $vr6, 0 ori $a0, $zero, 1024 - vreplgr2vr.d $vr7, $a0 + vreplgr2vr.d $vr6, $a0 ori $s7, $zero, 128 - vst $vr6, $sp, 32 # 16-byte Folded Spill - vst $vr7, $sp, 16 # 16-byte Folded Spill + vst $vr6, $sp, 16 # 16-byte Folded Spill b .LBB1_4 .p2align 4, , 16 .LBB1_2: # in Loop: Header=BB1_4 Depth=1 @@ -154,8 +152,7 @@ start_pass_fdctmgr: # @start_pass_fdctmgr st.w $a2, $a0, 40 move $a0, $fp jirl $ra, $a1, 0 - vld $vr7, $sp, 16 # 16-byte Folded Reload - vld $vr6, $sp, 32 # 16-byte Folded Reload + vld $vr6, $sp, 16 # 16-byte Folded Reload vldi $vr5, -992 .LBB1_3: # %.loopexit # in Loop: Header=BB1_4 Depth=1 @@ -182,8 +179,7 @@ start_pass_fdctmgr: # @start_pass_fdctmgr st.w $a2, $a0, 40 move $a0, $fp jirl $ra, $a1, 0 - vld $vr7, $sp, 16 # 16-byte Folded Reload - vld $vr6, $sp, 32 # 16-byte Folded Reload + vld $vr6, $sp, 16 # 16-byte Folded Reload vldi $vr5, -992 slli.d $a0, $s6, 3 ldx.d $s4, $s1, $a0 @@ -208,8 +204,7 @@ start_pass_fdctmgr: # @start_pass_fdctmgr ori $a2, $zero, 256 move $a0, $fp jirl $ra, $a3, 0 - vld $vr7, $sp, 16 # 16-byte Folded Reload - vld $vr6, $sp, 32 # 16-byte Folded Reload + vld $vr6, $sp, 16 # 16-byte Folded Reload vldi $vr5, -992 st.d $a0, $s6, 0 .LBB1_12: # %vector.body @@ -219,67 +214,67 @@ start_pass_fdctmgr: # @start_pass_fdctmgr vinsgr2vr.d $vr0, $a1, 0 vinsgr2vr.d $vr1, $a2, 0 ld.d $a1, $s4, 16 - vilvl.h $vr0, $vr6, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vslli.w $vr0, $vr0, 3 vst $vr0, $a0, 0 vinsgr2vr.d $vr0, $a1, 0 ld.d $a1, $s4, 24 - vilvl.h $vr1, $vr6, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vslli.w $vr1, $vr1, 3 vst $vr1, $a0, 16 
vinsgr2vr.d $vr1, $a1, 0 ld.d $a1, $s4, 32 - vilvl.h $vr0, $vr6, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vslli.w $vr0, $vr0, 3 vst $vr0, $a0, 32 vinsgr2vr.d $vr0, $a1, 0 ld.d $a1, $s4, 40 - vilvl.h $vr1, $vr6, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vslli.w $vr1, $vr1, 3 vst $vr1, $a0, 48 vinsgr2vr.d $vr1, $a1, 0 ld.d $a1, $s4, 48 - vilvl.h $vr0, $vr6, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vslli.w $vr0, $vr0, 3 vst $vr0, $a0, 64 vinsgr2vr.d $vr0, $a1, 0 ld.d $a1, $s4, 56 - vilvl.h $vr1, $vr6, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vslli.w $vr1, $vr1, 3 vst $vr1, $a0, 80 vinsgr2vr.d $vr1, $a1, 0 ld.d $a1, $s4, 64 - vilvl.h $vr0, $vr6, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vslli.w $vr0, $vr0, 3 vst $vr0, $a0, 96 vinsgr2vr.d $vr0, $a1, 0 ld.d $a1, $s4, 72 - vilvl.h $vr1, $vr6, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vslli.w $vr1, $vr1, 3 vst $vr1, $a0, 112 vinsgr2vr.d $vr1, $a1, 0 ld.d $a1, $s4, 80 - vilvl.h $vr0, $vr6, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vslli.w $vr0, $vr0, 3 vst $vr0, $a0, 128 vinsgr2vr.d $vr0, $a1, 0 ld.d $a1, $s4, 88 - vilvl.h $vr1, $vr6, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vslli.w $vr1, $vr1, 3 vst $vr1, $a0, 144 vinsgr2vr.d $vr1, $a1, 0 ld.d $a1, $s4, 96 - vilvl.h $vr0, $vr6, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vslli.w $vr0, $vr0, 3 vst $vr0, $a0, 160 vinsgr2vr.d $vr0, $a1, 0 ld.d $a1, $s4, 104 - vilvl.h $vr1, $vr6, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vslli.w $vr1, $vr1, 3 vst $vr1, $a0, 176 vinsgr2vr.d $vr1, $a1, 0 - vilvl.h $vr0, $vr6, $vr0 - vilvl.h $vr1, $vr6, $vr1 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.wu.hu $vr1, $vr1, 0 vslli.w $vr0, $vr0, 3 vslli.w $vr1, $vr1, 3 ld.d $a1, $s4, 112 @@ -288,8 +283,8 @@ start_pass_fdctmgr: # @start_pass_fdctmgr vst $vr1, $a0, 208 vinsgr2vr.d $vr0, $a1, 0 vinsgr2vr.d $vr1, $a2, 0 - vilvl.h $vr0, $vr6, $vr0 - vilvl.h $vr1, $vr6, $vr1 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.wu.hu $vr1, $vr1, 0 vslli.w $vr0, $vr0, 3 vslli.w $vr1, $vr1, 3 vst $vr0, $a0, 224 @@ -308,8 +303,7 @@ start_pass_fdctmgr: # @start_pass_fdctmgr ori $a2, $zero, 256 move $a0, $fp jirl $ra, $a3, 0 - vld $vr7, $sp, 16 # 16-byte Folded Reload - vld $vr6, $sp, 32 # 16-byte Folded Reload + vld $vr6, $sp, 16 # 16-byte Folded Reload vldi $vr5, -992 st.d $a0, $s6, 0 .LBB1_15: # %vector.ph111 @@ -321,23 +315,23 @@ start_pass_fdctmgr: # @start_pass_fdctmgr # => This Inner Loop Header: Depth=2 ldx.d $a2, $s4, $a1 vinsgr2vr.d $vr0, $a2, 0 - vilvl.h $vr0, $vr6, $vr0 - vilvl.w $vr1, $vr6, $vr0 - vilvh.w $vr0, $vr6, $vr0 + vsllwil.wu.hu $vr1, $vr0, 0 + vsllwil.du.wu $vr1, $vr1, 0 + vshuf4i.h $vr0, $vr0, 14 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.du.wu $vr0, $vr0, 0 pcalau12i $a2, %pc_hi20(start_pass_fdctmgr.aanscales) addi.d $a2, $a2, %pc_lo12(start_pass_fdctmgr.aanscales) ldx.d $a2, $a2, $a1 vinsgr2vr.d $vr2, $a2, 0 - vilvl.h $vr2, $vr2, $vr2 - vilvl.w $vr3, $vr2, $vr2 - vslli.d $vr3, $vr3, 48 - vsrai.d $vr3, $vr3, 48 - vilvh.w $vr2, $vr2, $vr2 - vslli.d $vr2, $vr2, 48 - vsrai.d $vr2, $vr2, 48 - vori.b $vr4, $vr7, 0 + vsllwil.w.h $vr3, $vr2, 0 + vsllwil.d.w $vr3, $vr3, 0 + vshuf4i.h $vr2, $vr2, 14 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.d.w $vr2, $vr2, 0 + vori.b $vr4, $vr6, 0 vmadd.d $vr4, $vr2, $vr0 - vori.b $vr0, $vr7, 0 + vori.b $vr0, $vr6, 0 vmadd.d $vr0, $vr3, $vr1 vsrli.d $vr0, $vr0, 11 vsrli.d $vr1, $vr4, 11 @@ -362,8 +356,7 @@ start_pass_fdctmgr: # @start_pass_fdctmgr ori $a2, $zero, 256 move $a0, $fp jirl $ra, $a3, 0 - vld $vr7, $sp, 16 # 16-byte Folded Reload - vld $vr6, $sp, 32 # 16-byte Folded Reload + vld $vr6, $sp, 16 # 16-byte Folded Reload vldi $vr5, -992 st.d $a0, $s6, 0 .LBB1_19: # in 
Loop: Header=BB1_4 Depth=1 @@ -451,24 +444,24 @@ start_pass_fdctmgr: # @start_pass_fdctmgr bne $a1, $s2, .LBB1_20 b .LBB1_3 .LBB1_21: # %._crit_edge - fld.d $fs5, $sp, 56 # 8-byte Folded Reload - fld.d $fs4, $sp, 64 # 8-byte Folded Reload - fld.d $fs3, $sp, 72 # 8-byte Folded Reload - fld.d $fs2, $sp, 80 # 8-byte Folded Reload - fld.d $fs1, $sp, 88 # 8-byte Folded Reload - fld.d $fs0, $sp, 96 # 8-byte Folded Reload - ld.d $s8, $sp, 104 # 8-byte Folded Reload - ld.d $s7, $sp, 112 # 8-byte Folded Reload - ld.d $s6, $sp, 120 # 8-byte Folded Reload - ld.d $s5, $sp, 128 # 8-byte Folded Reload - ld.d $s4, $sp, 136 # 8-byte Folded Reload - ld.d $s3, $sp, 144 # 8-byte Folded Reload - ld.d $s2, $sp, 152 # 8-byte Folded Reload - ld.d $s1, $sp, 160 # 8-byte Folded Reload - ld.d $s0, $sp, 168 # 8-byte Folded Reload - ld.d $fp, $sp, 176 # 8-byte Folded Reload - ld.d $ra, $sp, 184 # 8-byte Folded Reload - addi.d $sp, $sp, 192 + fld.d $fs5, $sp, 40 # 8-byte Folded Reload + fld.d $fs4, $sp, 48 # 8-byte Folded Reload + fld.d $fs3, $sp, 56 # 8-byte Folded Reload + fld.d $fs2, $sp, 64 # 8-byte Folded Reload + fld.d $fs1, $sp, 72 # 8-byte Folded Reload + fld.d $fs0, $sp, 80 # 8-byte Folded Reload + ld.d $s8, $sp, 88 # 8-byte Folded Reload + ld.d $s7, $sp, 96 # 8-byte Folded Reload + ld.d $s6, $sp, 104 # 8-byte Folded Reload + ld.d $s5, $sp, 112 # 8-byte Folded Reload + ld.d $s4, $sp, 120 # 8-byte Folded Reload + ld.d $s3, $sp, 128 # 8-byte Folded Reload + ld.d $s2, $sp, 136 # 8-byte Folded Reload + ld.d $s1, $sp, 144 # 8-byte Folded Reload + ld.d $s0, $sp, 152 # 8-byte Folded Reload + ld.d $fp, $sp, 160 # 8-byte Folded Reload + ld.d $ra, $sp, 168 # 8-byte Folded Reload + addi.d $sp, $sp, 176 ret .Lfunc_end1: .size start_pass_fdctmgr, .Lfunc_end1-start_pass_fdctmgr diff --git a/results/MultiSource/Benchmarks/mediabench/jpeg/jpeg-6a/CMakeFiles/cjpeg.dir/jcmarker.s b/results/MultiSource/Benchmarks/mediabench/jpeg/jpeg-6a/CMakeFiles/cjpeg.dir/jcmarker.s index 41ae1e10..e896e8bc 100644 --- a/results/MultiSource/Benchmarks/mediabench/jpeg/jpeg-6a/CMakeFiles/cjpeg.dir/jcmarker.s +++ b/results/MultiSource/Benchmarks/mediabench/jpeg/jpeg-6a/CMakeFiles/cjpeg.dir/jcmarker.s @@ -2041,9 +2041,8 @@ emit_dqt: # @emit_dqt vor.v $vr0, $vr1, $vr0 vrepli.h $vr1, 255 vslt.hu $vr0, $vr1, $vr0 - vilvl.h $vr0, $vr0, $vr0 ld.w $a0, $s1, 128 - vslli.w $vr0, $vr0, 16 + vsllwil.w.h $vr0, $vr0, 0 vmskltz.w $vr0, $vr0 vpickve2gr.hu $a1, $vr0, 0 andi $s2, $a1, 15 diff --git a/results/MultiSource/Benchmarks/mediabench/jpeg/jpeg-6a/CMakeFiles/cjpeg.dir/jcmaster.s b/results/MultiSource/Benchmarks/mediabench/jpeg/jpeg-6a/CMakeFiles/cjpeg.dir/jcmaster.s index d491f0eb..cde29c42 100644 --- a/results/MultiSource/Benchmarks/mediabench/jpeg/jpeg-6a/CMakeFiles/cjpeg.dir/jcmaster.s +++ b/results/MultiSource/Benchmarks/mediabench/jpeg/jpeg-6a/CMakeFiles/cjpeg.dir/jcmaster.s @@ -912,12 +912,8 @@ prepare_for_pass: # @prepare_for_pass ld.d $t2, $a7, 0 vinsgr2vr.d $vr2, $t1, 0 vinsgr2vr.d $vr3, $t2, 0 - vshuf4i.w $vr2, $vr2, 16 - vslli.d $vr2, $vr2, 32 - vsrai.d $vr2, $vr2, 32 - vshuf4i.w $vr3, $vr3, 16 - vslli.d $vr3, $vr3, 32 - vsrai.d $vr3, $vr3, 32 + vsllwil.d.w $vr2, $vr2, 0 + vsllwil.d.w $vr3, $vr3, 0 vori.b $vr4, $vr0, 0 vmadd.d $vr4, $vr2, $vr1 vori.b $vr2, $vr0, 0 @@ -972,12 +968,8 @@ prepare_for_pass: # @prepare_for_pass ld.d $t2, $a7, 0 vinsgr2vr.d $vr2, $t1, 0 vinsgr2vr.d $vr3, $t2, 0 - vshuf4i.w $vr2, $vr2, 16 - vslli.d $vr2, $vr2, 32 - vsrai.d $vr2, $vr2, 32 - vshuf4i.w $vr3, $vr3, 16 - vslli.d $vr3, $vr3, 32 - vsrai.d $vr3, $vr3, 32 + 
vsllwil.d.w $vr2, $vr2, 0 + vsllwil.d.w $vr3, $vr3, 0 vori.b $vr4, $vr0, 0 vmadd.d $vr4, $vr2, $vr1 vori.b $vr2, $vr0, 0 @@ -1252,12 +1244,8 @@ prepare_for_pass: # @prepare_for_pass ld.d $t2, $a7, 0 vinsgr2vr.d $vr2, $t1, 0 vinsgr2vr.d $vr3, $t2, 0 - vshuf4i.w $vr2, $vr2, 16 - vslli.d $vr2, $vr2, 32 - vsrai.d $vr2, $vr2, 32 - vshuf4i.w $vr3, $vr3, 16 - vslli.d $vr3, $vr3, 32 - vsrai.d $vr3, $vr3, 32 + vsllwil.d.w $vr2, $vr2, 0 + vsllwil.d.w $vr3, $vr3, 0 vori.b $vr4, $vr0, 0 vmadd.d $vr4, $vr2, $vr1 vori.b $vr2, $vr0, 0 diff --git a/results/MultiSource/Benchmarks/mediabench/jpeg/jpeg-6a/CMakeFiles/cjpeg.dir/jcparam.s b/results/MultiSource/Benchmarks/mediabench/jpeg/jpeg-6a/CMakeFiles/cjpeg.dir/jcparam.s index 9beeff58..a6935d2f 100644 --- a/results/MultiSource/Benchmarks/mediabench/jpeg/jpeg-6a/CMakeFiles/cjpeg.dir/jcparam.s +++ b/results/MultiSource/Benchmarks/mediabench/jpeg/jpeg-6a/CMakeFiles/cjpeg.dir/jcparam.s @@ -41,70 +41,70 @@ jpeg_add_quant_table: # @jpeg_add_quant_table .LBB0_4: vreplgr2vr.d $vr0, $s1 move $a1, $zero - vrepli.b $vr1, 0 - vrepli.d $vr2, 50 + vrepli.d $vr1, 50 lu12i.w $a2, 461373 ori $a2, $a2, 1803 lu32i.d $a2, 461373 lu52i.d $a2, $a2, -1475 - vreplgr2vr.d $vr3, $a2 + vreplgr2vr.d $vr2, $a2 lu12i.w $a2, 7 ori $a2, $a2, 4095 - vreplgr2vr.d $vr4, $a2 + vreplgr2vr.d $vr3, $a2 beqz $s0, .LBB0_7 # %bb.5: # %vector.body.preheader lu12i.w $a2, 6 ori $a2, $a2, 973 - vreplgr2vr.d $vr5, $a2 - vrepli.h $vr6, 255 + vreplgr2vr.d $vr4, $a2 + vrepli.h $vr5, 255 ori $a2, $zero, 128 .p2align 4, , 16 .LBB0_6: # %vector.body # =>This Inner Loop Header: Depth=1 - vld $vr7, $fp, 0 - vilvl.w $vr8, $vr1, $vr7 - vilvh.w $vr7, $vr1, $vr7 + vld $vr6, $fp, 0 + vsllwil.du.wu $vr7, $vr6, 0 + vshuf4i.w $vr6, $vr6, 14 + vsllwil.du.wu $vr6, $vr6, 0 + vmul.d $vr8, $vr0, $vr6 vmul.d $vr9, $vr0, $vr7 - vmul.d $vr10, $vr0, $vr8 - vori.b $vr11, $vr2, 0 - vmadd.d $vr11, $vr0, $vr8 - vori.b $vr8, $vr2, 0 - vmadd.d $vr8, $vr0, $vr7 - vmuh.d $vr7, $vr8, $vr3 - vadd.d $vr7, $vr7, $vr8 - vsrli.d $vr8, $vr7, 63 + vori.b $vr10, $vr1, 0 + vmadd.d $vr10, $vr0, $vr7 + vori.b $vr7, $vr1, 0 + vmadd.d $vr7, $vr0, $vr6 + vmuh.d $vr6, $vr7, $vr2 + vadd.d $vr6, $vr6, $vr7 + vsrli.d $vr7, $vr6, 63 + vsrai.d $vr6, $vr6, 6 + vadd.d $vr6, $vr6, $vr7 + vmuh.d $vr7, $vr10, $vr2 + vadd.d $vr7, $vr7, $vr10 + vsrli.d $vr10, $vr7, 63 vsrai.d $vr7, $vr7, 6 - vadd.d $vr7, $vr7, $vr8 - vmuh.d $vr8, $vr11, $vr3 - vadd.d $vr8, $vr8, $vr11 - vsrli.d $vr11, $vr8, 63 - vsrai.d $vr8, $vr8, 6 - vadd.d $vr8, $vr8, $vr11 - vmaxi.d $vr8, $vr8, 1 + vadd.d $vr7, $vr7, $vr10 vmaxi.d $vr7, $vr7, 1 - vmin.d $vr7, $vr7, $vr4 - vmin.d $vr8, $vr8, $vr4 - vslt.d $vr10, $vr5, $vr10 - vpickve2gr.d $a3, $vr10, 0 - vinsgr2vr.h $vr11, $a3, 0 - vpickve2gr.d $a3, $vr10, 1 - vinsgr2vr.h $vr11, $a3, 1 - vslt.d $vr9, $vr5, $vr9 + vmaxi.d $vr6, $vr6, 1 + vmin.d $vr6, $vr6, $vr3 + vmin.d $vr7, $vr7, $vr3 + vslt.d $vr9, $vr4, $vr9 vpickve2gr.d $a3, $vr9, 0 - vinsgr2vr.h $vr11, $a3, 2 + vinsgr2vr.h $vr10, $a3, 0 vpickve2gr.d $a3, $vr9, 1 - vinsgr2vr.h $vr11, $a3, 3 + vinsgr2vr.h $vr10, $a3, 1 + vslt.d $vr8, $vr4, $vr8 vpickve2gr.d $a3, $vr8, 0 - vinsgr2vr.h $vr9, $a3, 0 + vinsgr2vr.h $vr10, $a3, 2 vpickve2gr.d $a3, $vr8, 1 - vinsgr2vr.h $vr9, $a3, 1 + vinsgr2vr.h $vr10, $a3, 3 vpickve2gr.d $a3, $vr7, 0 - vinsgr2vr.h $vr9, $a3, 2 + vinsgr2vr.h $vr8, $a3, 0 vpickve2gr.d $a3, $vr7, 1 - vinsgr2vr.h $vr9, $a3, 3 - vbitsel.v $vr7, $vr9, $vr6, $vr11 + vinsgr2vr.h $vr8, $a3, 1 + vpickve2gr.d $a3, $vr6, 0 + vinsgr2vr.h $vr8, $a3, 2 + vpickve2gr.d $a3, $vr6, 1 + 
vinsgr2vr.h $vr8, $a3, 3 + vbitsel.v $vr6, $vr8, $vr5, $vr10 add.d $a3, $a0, $a1 - vstelm.d $vr7, $a3, 0, 0 + vstelm.d $vr6, $a3, 0, 0 addi.d $a1, $a1, 8 addi.d $fp, $fp, 16 bne $a1, $a2, .LBB0_6 @@ -114,37 +114,38 @@ jpeg_add_quant_table: # @jpeg_add_quant_table .p2align 4, , 16 .LBB0_8: # %vector.body41 # =>This Inner Loop Header: Depth=1 - vld $vr5, $fp, 0 - vilvh.w $vr6, $vr1, $vr5 - vilvl.w $vr5, $vr1, $vr5 - vori.b $vr7, $vr2, 0 - vmadd.d $vr7, $vr0, $vr5 - vori.b $vr5, $vr2, 0 - vmadd.d $vr5, $vr0, $vr6 - vmuh.d $vr6, $vr5, $vr3 - vadd.d $vr5, $vr6, $vr5 + vld $vr4, $fp, 0 + vshuf4i.w $vr5, $vr4, 14 + vsllwil.du.wu $vr5, $vr5, 0 + vsllwil.du.wu $vr4, $vr4, 0 + vori.b $vr6, $vr1, 0 + vmadd.d $vr6, $vr0, $vr4 + vori.b $vr4, $vr1, 0 + vmadd.d $vr4, $vr0, $vr5 + vmuh.d $vr5, $vr4, $vr2 + vadd.d $vr4, $vr5, $vr4 + vsrli.d $vr5, $vr4, 63 + vsrai.d $vr4, $vr4, 6 + vadd.d $vr4, $vr4, $vr5 + vmuh.d $vr5, $vr6, $vr2 + vadd.d $vr5, $vr5, $vr6 vsrli.d $vr6, $vr5, 63 vsrai.d $vr5, $vr5, 6 vadd.d $vr5, $vr5, $vr6 - vmuh.d $vr6, $vr7, $vr3 - vadd.d $vr6, $vr6, $vr7 - vsrli.d $vr7, $vr6, 63 - vsrai.d $vr6, $vr6, 6 - vadd.d $vr6, $vr6, $vr7 - vmaxi.d $vr6, $vr6, 1 vmaxi.d $vr5, $vr5, 1 - vmin.d $vr5, $vr5, $vr4 - vmin.d $vr6, $vr6, $vr4 - vpickve2gr.d $a3, $vr6, 0 - vinsgr2vr.h $vr7, $a3, 0 - vpickve2gr.d $a3, $vr6, 1 - vinsgr2vr.h $vr7, $a3, 1 + vmaxi.d $vr4, $vr4, 1 + vmin.d $vr4, $vr4, $vr3 + vmin.d $vr5, $vr5, $vr3 vpickve2gr.d $a3, $vr5, 0 - vinsgr2vr.h $vr7, $a3, 2 + vinsgr2vr.h $vr6, $a3, 0 vpickve2gr.d $a3, $vr5, 1 - vinsgr2vr.h $vr7, $a3, 3 + vinsgr2vr.h $vr6, $a3, 1 + vpickve2gr.d $a3, $vr4, 0 + vinsgr2vr.h $vr6, $a3, 2 + vpickve2gr.d $a3, $vr4, 1 + vinsgr2vr.h $vr6, $a3, 3 add.d $a3, $a0, $a1 - vstelm.d $vr7, $a3, 0, 0 + vstelm.d $vr6, $a3, 0, 0 addi.d $a1, $a1, 8 addi.d $fp, $fp, 16 bne $a1, $a2, .LBB0_8 @@ -166,14 +167,14 @@ jpeg_add_quant_table: # @jpeg_add_quant_table .type jpeg_set_linear_quality,@function jpeg_set_linear_quality: # @jpeg_set_linear_quality # %bb.0: - addi.d $sp, $sp, -128 - st.d $ra, $sp, 120 # 8-byte Folded Spill - st.d $fp, $sp, 112 # 8-byte Folded Spill - st.d $s0, $sp, 104 # 8-byte Folded Spill - st.d $s1, $sp, 96 # 8-byte Folded Spill - st.d $s2, $sp, 88 # 8-byte Folded Spill - st.d $s3, $sp, 80 # 8-byte Folded Spill - st.d $s4, $sp, 72 # 8-byte Folded Spill + addi.d $sp, $sp, -112 + st.d $ra, $sp, 104 # 8-byte Folded Spill + st.d $fp, $sp, 96 # 8-byte Folded Spill + st.d $s0, $sp, 88 # 8-byte Folded Spill + st.d $s1, $sp, 80 # 8-byte Folded Spill + st.d $s2, $sp, 72 # 8-byte Folded Spill + st.d $s3, $sp, 64 # 8-byte Folded Spill + st.d $s4, $sp, 56 # 8-byte Folded Spill move $s0, $a0 ld.w $a0, $a0, 28 ori $a3, $zero, 100 @@ -199,12 +200,11 @@ jpeg_set_linear_quality: # @jpeg_set_linear_quality st.d $a0, $s0, 88 .LBB1_4: vreplgr2vr.d $vr0, $s1 - vrepli.b $vr9, 0 - vrepli.d $vr10, 50 + vrepli.d $vr9, 50 lu12i.w $s3, 461373 lu12i.w $s2, 7 lu12i.w $s4, 6 - vrepli.h $vr11, 255 + vrepli.h $vr10, 255 pcalau12i $a1, %pc_hi20(jpeg_set_linear_quality.std_luminance_quant_tbl) addi.d $a1, $a1, %pc_lo12(jpeg_set_linear_quality.std_luminance_quant_tbl) move $a2, $zero @@ -223,13 +223,14 @@ jpeg_set_linear_quality: # @jpeg_set_linear_quality .LBB1_6: # %vector.body # =>This Inner Loop Header: Depth=1 vld $vr4, $a1, 0 - vilvl.w $vr5, $vr9, $vr4 - vilvh.w $vr4, $vr9, $vr4 + vsllwil.du.wu $vr5, $vr4, 0 + vshuf4i.w $vr4, $vr4, 14 + vsllwil.du.wu $vr4, $vr4, 0 vmul.d $vr6, $vr0, $vr4 vmul.d $vr7, $vr0, $vr5 - vori.b $vr8, $vr10, 0 + vori.b $vr8, $vr9, 0 vmadd.d $vr8, $vr0, $vr5 - 
vori.b $vr5, $vr10, 0 + vori.b $vr5, $vr9, 0 vmadd.d $vr5, $vr0, $vr4 vmuh.d $vr4, $vr5, $vr1 vadd.d $vr4, $vr4, $vr5 @@ -263,7 +264,7 @@ jpeg_set_linear_quality: # @jpeg_set_linear_quality vinsgr2vr.h $vr6, $a4, 2 vpickve2gr.d $a4, $vr4, 1 vinsgr2vr.h $vr6, $a4, 3 - vbitsel.v $vr4, $vr6, $vr11, $vr8 + vbitsel.v $vr4, $vr6, $vr10, $vr8 add.d $a4, $a0, $a2 vstelm.d $vr4, $a4, 0, 0 addi.d $a2, $a2, 8 @@ -276,11 +277,12 @@ jpeg_set_linear_quality: # @jpeg_set_linear_quality .LBB1_8: # %vector.body36 # =>This Inner Loop Header: Depth=1 vld $vr3, $a1, 0 - vilvh.w $vr4, $vr9, $vr3 - vilvl.w $vr3, $vr9, $vr3 - vori.b $vr5, $vr10, 0 + vshuf4i.w $vr4, $vr3, 14 + vsllwil.du.wu $vr4, $vr4, 0 + vsllwil.du.wu $vr3, $vr3, 0 + vori.b $vr5, $vr9, 0 vmadd.d $vr5, $vr0, $vr3 - vori.b $vr3, $vr10, 0 + vori.b $vr3, $vr9, 0 vmadd.d $vr3, $vr0, $vr4 vmuh.d $vr4, $vr3, $vr1 vadd.d $vr3, $vr4, $vr3 @@ -322,26 +324,22 @@ jpeg_set_linear_quality: # @jpeg_set_linear_quality ori $a2, $zero, 18 st.w $a2, $a0, 40 move $a0, $s0 - vst $vr9, $sp, 48 # 16-byte Folded Spill - vst $vr10, $sp, 32 # 16-byte Folded Spill - vst $vr11, $sp, 16 # 16-byte Folded Spill + vst $vr9, $sp, 32 # 16-byte Folded Spill + vst $vr10, $sp, 16 # 16-byte Folded Spill jirl $ra, $a1, 0 - vld $vr11, $sp, 16 # 16-byte Folded Reload - vld $vr10, $sp, 32 # 16-byte Folded Reload - vld $vr9, $sp, 48 # 16-byte Folded Reload + vld $vr10, $sp, 16 # 16-byte Folded Reload + vld $vr9, $sp, 32 # 16-byte Folded Reload .LBB1_11: ld.d $a0, $s0, 96 bnez $a0, .LBB1_13 # %bb.12: move $a0, $s0 - vst $vr9, $sp, 48 # 16-byte Folded Spill - vst $vr10, $sp, 32 # 16-byte Folded Spill - vst $vr11, $sp, 16 # 16-byte Folded Spill + vst $vr9, $sp, 32 # 16-byte Folded Spill + vst $vr10, $sp, 16 # 16-byte Folded Spill pcaddu18i $ra, %call36(jpeg_alloc_quant_table) jirl $ra, $ra, 0 - vld $vr11, $sp, 16 # 16-byte Folded Reload - vld $vr10, $sp, 32 # 16-byte Folded Reload - vld $vr9, $sp, 48 # 16-byte Folded Reload + vld $vr10, $sp, 16 # 16-byte Folded Reload + vld $vr9, $sp, 32 # 16-byte Folded Reload st.d $a0, $s0, 96 .LBB1_13: vreplgr2vr.d $vr0, $s1 @@ -363,13 +361,14 @@ jpeg_set_linear_quality: # @jpeg_set_linear_quality .LBB1_15: # %vector.body45 # =>This Inner Loop Header: Depth=1 vld $vr4, $a1, 0 - vilvl.w $vr5, $vr9, $vr4 - vilvh.w $vr4, $vr9, $vr4 + vsllwil.du.wu $vr5, $vr4, 0 + vshuf4i.w $vr4, $vr4, 14 + vsllwil.du.wu $vr4, $vr4, 0 vmul.d $vr6, $vr0, $vr4 vmul.d $vr7, $vr0, $vr5 - vori.b $vr8, $vr10, 0 + vori.b $vr8, $vr9, 0 vmadd.d $vr8, $vr0, $vr5 - vori.b $vr5, $vr10, 0 + vori.b $vr5, $vr9, 0 vmadd.d $vr5, $vr0, $vr4 vmuh.d $vr4, $vr5, $vr1 vadd.d $vr4, $vr4, $vr5 @@ -403,7 +402,7 @@ jpeg_set_linear_quality: # @jpeg_set_linear_quality vinsgr2vr.h $vr6, $a4, 2 vpickve2gr.d $a4, $vr4, 1 vinsgr2vr.h $vr6, $a4, 3 - vbitsel.v $vr4, $vr6, $vr11, $vr8 + vbitsel.v $vr4, $vr6, $vr10, $vr8 add.d $a4, $a0, $a2 vstelm.d $vr4, $a4, 0, 0 addi.d $a2, $a2, 8 @@ -416,11 +415,12 @@ jpeg_set_linear_quality: # @jpeg_set_linear_quality .LBB1_17: # %vector.body54 # =>This Inner Loop Header: Depth=1 vld $vr3, $a1, 0 - vilvh.w $vr4, $vr9, $vr3 - vilvl.w $vr3, $vr9, $vr3 - vori.b $vr5, $vr10, 0 + vshuf4i.w $vr4, $vr3, 14 + vsllwil.du.wu $vr4, $vr4, 0 + vsllwil.du.wu $vr3, $vr3, 0 + vori.b $vr5, $vr9, 0 vmadd.d $vr5, $vr0, $vr3 - vori.b $vr3, $vr10, 0 + vori.b $vr3, $vr9, 0 vmadd.d $vr3, $vr0, $vr4 vmuh.d $vr4, $vr3, $vr1 vadd.d $vr3, $vr4, $vr3 @@ -451,14 +451,14 @@ jpeg_set_linear_quality: # @jpeg_set_linear_quality bne $a2, $a3, .LBB1_17 .LBB1_18: # %jpeg_add_quant_table.exit20 st.w $zero, 
$a0, 128 - ld.d $s4, $sp, 72 # 8-byte Folded Reload - ld.d $s3, $sp, 80 # 8-byte Folded Reload - ld.d $s2, $sp, 88 # 8-byte Folded Reload - ld.d $s1, $sp, 96 # 8-byte Folded Reload - ld.d $s0, $sp, 104 # 8-byte Folded Reload - ld.d $fp, $sp, 112 # 8-byte Folded Reload - ld.d $ra, $sp, 120 # 8-byte Folded Reload - addi.d $sp, $sp, 128 + ld.d $s4, $sp, 56 # 8-byte Folded Reload + ld.d $s3, $sp, 64 # 8-byte Folded Reload + ld.d $s2, $sp, 72 # 8-byte Folded Reload + ld.d $s1, $sp, 80 # 8-byte Folded Reload + ld.d $s0, $sp, 88 # 8-byte Folded Reload + ld.d $fp, $sp, 96 # 8-byte Folded Reload + ld.d $ra, $sp, 104 # 8-byte Folded Reload + addi.d $sp, $sp, 112 ret .Lfunc_end1: .size jpeg_set_linear_quality, .Lfunc_end1-jpeg_set_linear_quality diff --git a/results/MultiSource/Benchmarks/mediabench/jpeg/jpeg-6a/CMakeFiles/cjpeg.dir/jcsample.s b/results/MultiSource/Benchmarks/mediabench/jpeg/jpeg-6a/CMakeFiles/cjpeg.dir/jcsample.s index e30baf29..67133dd4 100644 --- a/results/MultiSource/Benchmarks/mediabench/jpeg/jpeg-6a/CMakeFiles/cjpeg.dir/jcsample.s +++ b/results/MultiSource/Benchmarks/mediabench/jpeg/jpeg-6a/CMakeFiles/cjpeg.dir/jcsample.s @@ -258,11 +258,6 @@ sep_downsample: # @sep_downsample .word 7 # 0x7 .word 4294967295 # 0xffffffff .word 4294967295 # 0xffffffff -.LCPI3_1: - .word 1 # 0x1 - .word 5 # 0x5 - .word 2 # 0x2 - .word 7 # 0x7 .text .p2align 5 .type fullsize_smooth_downsample,@function @@ -330,8 +325,7 @@ fullsize_smooth_downsample: # @fullsize_smooth_downsample vreplgr2vr.d $vr1, $a2 lu12i.w $t1, 8 ori $t2, $zero, 16 - vrepli.b $vr2, 0 - vreplgr2vr.d $vr3, $t1 + vreplgr2vr.d $vr2, $t1 .p2align 4, , 16 .LBB3_6: # =>This Loop Header: Depth=1 # Child Loop BB3_12 Depth 2 @@ -404,169 +398,200 @@ fullsize_smooth_downsample: # @fullsize_smooth_downsample add.d $t6, $t6, $a7 add.d $t5, $t5, $a7 add.d $t3, $t3, $a7 - vinsgr2vr.w $vr5, $s3, 3 - vinsgr2vr.w $vr4, $s4, 3 + vinsgr2vr.w $vr6, $s3, 3 + vinsgr2vr.w $vr3, $s4, 3 addi.d $s3, $s5, 2 move $s4, $a7 .p2align 4, , 16 .LBB3_12: # %vector.body # Parent Loop BB3_6 Depth=1 # => This Inner Loop Header: Depth=2 - vld $vr6, $s3, -1 - vld $vr9, $t8, 0 - vilvl.b $vr7, $vr2, $vr6 - vilvl.h $vr8, $vr2, $vr7 - vilvh.b $vr10, $vr2, $vr9 - vilvh.h $vr11, $vr2, $vr10 - vilvl.h $vr10, $vr2, $vr10 - vld $vr12, $s2, 0 - vilvl.b $vr9, $vr2, $vr9 - vilvh.h $vr13, $vr2, $vr9 - vilvl.h $vr9, $vr2, $vr9 - vilvh.b $vr14, $vr2, $vr12 - vilvh.h $vr15, $vr2, $vr14 - vilvl.h $vr14, $vr2, $vr14 - vilvl.b $vr12, $vr2, $vr12 - vilvh.h $vr16, $vr2, $vr12 - vilvl.h $vr12, $vr2, $vr12 - vadd.w $vr12, $vr12, $vr9 - vld $vr9, $s3, 0 - vadd.w $vr13, $vr16, $vr13 + vld $vr5, $s3, -1 + vld $vr7, $t8, 0 + vsllwil.hu.bu $vr4, $vr5, 0 + vsllwil.wu.hu $vr4, $vr4, 0 + vsllwil.du.wu $vr4, $vr4, 0 + vbsrl.v $vr8, $vr7, 12 + vsllwil.hu.bu $vr8, $vr8, 0 + vsllwil.wu.hu $vr8, $vr8, 0 + vbsrl.v $vr9, $vr7, 8 + vsllwil.hu.bu $vr9, $vr9, 0 + vsllwil.wu.hu $vr9, $vr9, 0 + vsrli.d $vr10, $vr7, 32 + vsllwil.hu.bu $vr10, $vr10, 0 + vld $vr11, $s2, 0 + vsllwil.wu.hu $vr10, $vr10, 0 + vsllwil.hu.bu $vr7, $vr7, 0 + vsllwil.wu.hu $vr7, $vr7, 0 + vbsrl.v $vr12, $vr11, 12 + vsllwil.hu.bu $vr12, $vr12, 0 + vsllwil.wu.hu $vr12, $vr12, 0 + vbsrl.v $vr13, $vr11, 8 + vsllwil.hu.bu $vr13, $vr13, 0 + vsllwil.wu.hu $vr13, $vr13, 0 + vsrli.d $vr14, $vr11, 32 + vsllwil.hu.bu $vr14, $vr14, 0 + vsllwil.wu.hu $vr14, $vr14, 0 + vsllwil.hu.bu $vr11, $vr11, 0 + vsllwil.wu.hu $vr11, $vr11, 0 + vadd.w $vr11, $vr11, $vr7 + vld $vr7, $s3, 0 vadd.w $vr10, $vr14, $vr10 - vadd.w $vr11, $vr15, $vr11 - vilvl.b $vr14, $vr2, 
$vr9 - vilvl.h $vr15, $vr2, $vr14 - vilvh.h $vr14, $vr2, $vr14 - vilvh.b $vr9, $vr2, $vr9 + vadd.w $vr9, $vr13, $vr9 + vadd.w $vr8, $vr12, $vr8 + vsllwil.hu.bu $vr12, $vr7, 0 + vsllwil.wu.hu $vr12, $vr12, 0 + vsrli.d $vr13, $vr7, 32 + vsllwil.hu.bu $vr13, $vr13, 0 + vsllwil.wu.hu $vr13, $vr13, 0 + vbsrl.v $vr14, $vr7, 8 + vsllwil.hu.bu $vr14, $vr14, 0 + vsllwil.wu.hu $vr14, $vr14, 0 + vbsrl.v $vr7, $vr7, 12 pcalau12i $s5, %pc_hi20(.LCPI3_0) - vld $vr16, $s5, %pc_lo12(.LCPI3_0) - vilvl.h $vr17, $vr2, $vr9 - vilvh.h $vr9, $vr2, $vr9 - vbsrl.v $vr18, $vr4, 12 - vshuf.w $vr16, $vr4, $vr5 - vadd.w $vr4, $vr11, $vr9 - vadd.w $vr9, $vr10, $vr17 - vadd.w $vr10, $vr13, $vr14 - vadd.w $vr13, $vr12, $vr15 - vbsrl.v $vr5, $vr13, 12 - vbsll.v $vr11, $vr10, 4 - vor.v $vr14, $vr11, $vr5 - vbsrl.v $vr5, $vr10, 12 - vbsll.v $vr11, $vr9, 4 - vor.v $vr17, $vr11, $vr5 - vbsrl.v $vr5, $vr9, 12 - vbsll.v $vr11, $vr4, 4 - vor.v $vr5, $vr11, $vr5 - vbsll.v $vr11, $vr13, 4 - vor.v $vr18, $vr11, $vr18 - vshuf4i.w $vr15, $vr13, 14 - vshuf4i.w $vr11, $vr10, 14 - pcalau12i $s5, %pc_hi20(.LCPI3_1) - vld $vr19, $s5, %pc_lo12(.LCPI3_1) - vshuf4i.w $vr12, $vr9, 14 - vilvl.w $vr20, $vr2, $vr12 - vilvl.w $vr21, $vr2, $vr13 - vilvl.w $vr16, $vr2, $vr16 - vori.b $vr22, $vr19, 0 - vori.b $vr23, $vr19, 0 - vilvl.w $vr18, $vr2, $vr18 - vadd.d $vr16, $vr16, $vr18 - vori.b $vr18, $vr19, 0 - vshuf.w $vr19, $vr2, $vr13 - vadd.d $vr19, $vr21, $vr19 - vilvl.w $vr21, $vr2, $vr15 - vilvl.w $vr14, $vr2, $vr14 - vadd.d $vr21, $vr21, $vr14 - vilvl.w $vr14, $vr2, $vr10 - vshuf.w $vr18, $vr2, $vr10 - vadd.d $vr18, $vr14, $vr18 - vilvl.w $vr14, $vr2, $vr11 - vilvl.w $vr17, $vr2, $vr17 - vadd.d $vr17, $vr14, $vr17 - vilvl.w $vr14, $vr2, $vr9 - vshuf.w $vr23, $vr2, $vr9 - vadd.d $vr23, $vr14, $vr23 - vilvl.w $vr14, $vr2, $vr5 - vadd.d $vr20, $vr20, $vr14 - vilvl.w $vr14, $vr2, $vr4 - vshuf.w $vr22, $vr2, $vr4 - vadd.d $vr22, $vr14, $vr22 - vilvl.w $vr14, $vr2, $vr8 + vld $vr15, $s5, %pc_lo12(.LCPI3_0) + vsllwil.hu.bu $vr7, $vr7, 0 + vsllwil.wu.hu $vr7, $vr7, 0 + vbsrl.v $vr16, $vr3, 12 + vshuf.w $vr15, $vr3, $vr6 + vadd.w $vr3, $vr8, $vr7 + vadd.w $vr7, $vr9, $vr14 + vadd.w $vr9, $vr10, $vr13 + vadd.w $vr12, $vr11, $vr12 + vbsrl.v $vr6, $vr12, 12 + vbsll.v $vr8, $vr9, 4 + vor.v $vr11, $vr8, $vr6 + vbsrl.v $vr6, $vr9, 12 + vbsll.v $vr8, $vr7, 4 + vor.v $vr14, $vr8, $vr6 + vbsrl.v $vr6, $vr7, 12 + vbsll.v $vr8, $vr3, 4 + vor.v $vr6, $vr8, $vr6 + vbsll.v $vr8, $vr12, 4 + vor.v $vr16, $vr8, $vr16 + vshuf4i.w $vr13, $vr12, 14 + vshuf4i.w $vr10, $vr9, 14 + vshuf4i.w $vr8, $vr7, 14 + vsllwil.du.wu $vr15, $vr15, 0 + vsllwil.du.wu $vr17, $vr3, 0 + vsllwil.du.wu $vr18, $vr8, 0 + vsllwil.du.wu $vr19, $vr7, 0 + vsllwil.du.wu $vr20, $vr9, 0 + vsllwil.du.wu $vr21, $vr12, 0 + vsllwil.du.wu $vr16, $vr16, 0 + vadd.d $vr15, $vr15, $vr16 + vshuf4i.w $vr16, $vr12, 9 + vsllwil.du.wu $vr16, $vr16, 0 + vadd.d $vr16, $vr21, $vr16 + vsllwil.du.wu $vr21, $vr13, 0 + vsllwil.du.wu $vr11, $vr11, 0 + vadd.d $vr21, $vr21, $vr11 + vshuf4i.w $vr11, $vr9, 9 + vsllwil.du.wu $vr11, $vr11, 0 + vadd.d $vr20, $vr20, $vr11 + vsllwil.du.wu $vr11, $vr10, 0 + vsllwil.du.wu $vr14, $vr14, 0 + vadd.d $vr14, $vr11, $vr14 + vshuf4i.w $vr11, $vr7, 9 + vsllwil.du.wu $vr11, $vr11, 0 + vadd.d $vr19, $vr19, $vr11 + vsllwil.du.wu $vr11, $vr6, 0 + vadd.d $vr18, $vr18, $vr11 + vshuf4i.w $vr11, $vr3, 9 + vsllwil.du.wu $vr11, $vr11, 0 + vadd.d $vr17, $vr17, $vr11 + vshuf4i.b $vr11, $vr5, 14 + vsllwil.hu.bu $vr11, $vr11, 0 + vsllwil.wu.hu $vr11, $vr11, 0 + vsllwil.du.wu $vr11, $vr11, 0 + vshuf4i.w 
$vr12, $vr12, 16 + vsub.d $vr15, $vr15, $vr4 + vadd.d $vr12, $vr15, $vr12 + vsrli.d $vr15, $vr5, 32 + vsllwil.hu.bu $vr15, $vr15, 0 + vsllwil.wu.hu $vr15, $vr15, 0 + vsllwil.du.wu $vr15, $vr15, 0 vshuf4i.w $vr13, $vr13, 16 - vsub.d $vr16, $vr16, $vr14 + vsub.d $vr16, $vr16, $vr11 vadd.d $vr13, $vr16, $vr13 - vilvh.w $vr8, $vr2, $vr8 - vilvh.h $vr7, $vr2, $vr7 - vshuf4i.w $vr15, $vr15, 16 - vsub.d $vr16, $vr19, $vr8 - vadd.d $vr15, $vr16, $vr15 - vilvl.w $vr16, $vr2, $vr7 - vilvh.w $vr7, $vr2, $vr7 - vilvh.b $vr6, $vr2, $vr6 - vshuf4i.w $vr10, $vr10, 16 - vsub.d $vr19, $vr21, $vr16 - vadd.d $vr10, $vr19, $vr10 - vilvl.h $vr19, $vr2, $vr6 - vshuf4i.w $vr11, $vr11, 16 - vsub.d $vr18, $vr18, $vr7 - vadd.d $vr11, $vr18, $vr11 - vilvl.w $vr18, $vr2, $vr19 - vilvh.w $vr19, $vr2, $vr19 - vilvh.h $vr6, $vr2, $vr6 + vsrli.d $vr16, $vr5, 48 + vsllwil.hu.bu $vr16, $vr16, 0 + vsllwil.wu.hu $vr16, $vr16, 0 + vsllwil.du.wu $vr16, $vr16, 0 vshuf4i.w $vr9, $vr9, 16 - vsub.d $vr17, $vr17, $vr18 - vadd.d $vr9, $vr17, $vr9 - vilvl.w $vr17, $vr2, $vr6 - vilvh.w $vr6, $vr2, $vr6 - vshuf4i.w $vr12, $vr12, 16 - vsub.d $vr21, $vr23, $vr19 - vadd.d $vr12, $vr21, $vr12 - vshuf4i.w $vr21, $vr4, 16 - vsub.d $vr20, $vr20, $vr17 - vadd.d $vr20, $vr20, $vr21 - vshuf4i.w $vr21, $vr4, 50 - vsub.d $vr22, $vr22, $vr6 - vadd.d $vr21, $vr22, $vr21 - vmul.d $vr6, $vr0, $vr6 - vmadd.d $vr6, $vr21, $vr1 - vmul.d $vr17, $vr0, $vr17 - vmadd.d $vr17, $vr20, $vr1 - vmul.d $vr19, $vr0, $vr19 - vmadd.d $vr19, $vr12, $vr1 - vmul.d $vr12, $vr0, $vr18 - vmadd.d $vr12, $vr9, $vr1 - vmul.d $vr7, $vr0, $vr7 - vmadd.d $vr7, $vr11, $vr1 - vmul.d $vr9, $vr0, $vr16 - vmadd.d $vr9, $vr10, $vr1 - vmul.d $vr8, $vr0, $vr8 - vmadd.d $vr8, $vr15, $vr1 - vmul.d $vr10, $vr0, $vr14 - vmadd.d $vr10, $vr13, $vr1 - vadd.d $vr10, $vr10, $vr3 - vadd.d $vr8, $vr8, $vr3 - vsrli.d $vr8, $vr8, 16 - vsrli.d $vr10, $vr10, 16 - vpickev.w $vr8, $vr8, $vr10 - vadd.d $vr9, $vr9, $vr3 - vadd.d $vr7, $vr7, $vr3 + vsub.d $vr21, $vr21, $vr15 + vadd.d $vr9, $vr21, $vr9 + vbsrl.v $vr21, $vr5, 8 + vsllwil.hu.bu $vr21, $vr21, 0 + vsllwil.wu.hu $vr21, $vr21, 0 + vsllwil.du.wu $vr21, $vr21, 0 + vshuf4i.w $vr10, $vr10, 16 + vsub.d $vr20, $vr20, $vr16 + vadd.d $vr10, $vr20, $vr10 + vbsrl.v $vr20, $vr5, 10 + vsllwil.hu.bu $vr20, $vr20, 0 + vsllwil.wu.hu $vr20, $vr20, 0 + vsllwil.du.wu $vr20, $vr20, 0 + vshuf4i.w $vr7, $vr7, 16 + vsub.d $vr14, $vr14, $vr21 + vadd.d $vr7, $vr14, $vr7 + vbsrl.v $vr14, $vr5, 12 + vsllwil.hu.bu $vr14, $vr14, 0 + vsllwil.wu.hu $vr14, $vr14, 0 + vsllwil.du.wu $vr14, $vr14, 0 + vbsrl.v $vr5, $vr5, 14 + vsllwil.hu.bu $vr5, $vr5, 0 + vsllwil.wu.hu $vr5, $vr5, 0 + vsllwil.du.wu $vr5, $vr5, 0 + vshuf4i.w $vr8, $vr8, 16 + vsub.d $vr19, $vr19, $vr20 + vadd.d $vr8, $vr19, $vr8 + vshuf4i.w $vr19, $vr3, 16 + vsub.d $vr18, $vr18, $vr14 + vadd.d $vr18, $vr18, $vr19 + vshuf4i.w $vr19, $vr3, 50 + vsub.d $vr17, $vr17, $vr5 + vadd.d $vr17, $vr17, $vr19 + vmul.d $vr5, $vr0, $vr5 + vmadd.d $vr5, $vr17, $vr1 + vmul.d $vr14, $vr0, $vr14 + vmadd.d $vr14, $vr18, $vr1 + vmul.d $vr17, $vr0, $vr20 + vmadd.d $vr17, $vr8, $vr1 + vmul.d $vr8, $vr0, $vr21 + vmadd.d $vr8, $vr7, $vr1 + vmul.d $vr7, $vr0, $vr16 + vmadd.d $vr7, $vr10, $vr1 + vmul.d $vr10, $vr0, $vr15 + vmadd.d $vr10, $vr9, $vr1 + vmul.d $vr9, $vr0, $vr11 + vmadd.d $vr9, $vr13, $vr1 + vmul.d $vr4, $vr0, $vr4 + vmadd.d $vr4, $vr12, $vr1 + vadd.d $vr4, $vr4, $vr2 + vadd.d $vr9, $vr9, $vr2 + vsrli.d $vr9, $vr9, 16 + vsrli.d $vr4, $vr4, 16 + vpickev.w $vr4, $vr9, $vr4 + vadd.d $vr9, $vr10, $vr2 + vadd.d $vr7, $vr7, $vr2 
vsrli.d $vr7, $vr7, 16 vsrli.d $vr9, $vr9, 16 vpickev.w $vr7, $vr7, $vr9 - vpickev.h $vr7, $vr7, $vr8 - vadd.d $vr8, $vr12, $vr3 - vadd.d $vr9, $vr19, $vr3 - vsrli.d $vr9, $vr9, 16 + vpickev.h $vr4, $vr7, $vr4 + vadd.d $vr7, $vr8, $vr2 + vadd.d $vr8, $vr17, $vr2 vsrli.d $vr8, $vr8, 16 - vpickev.w $vr8, $vr9, $vr8 - vadd.d $vr9, $vr17, $vr3 - vadd.d $vr6, $vr6, $vr3 - vsrli.d $vr6, $vr6, 16 - vsrli.d $vr9, $vr9, 16 - vpickev.w $vr6, $vr6, $vr9 - vpickev.h $vr6, $vr6, $vr8 - vpickev.b $vr6, $vr6, $vr7 - vst $vr6, $t7, 0 + vsrli.d $vr7, $vr7, 16 + vpickev.w $vr7, $vr8, $vr7 + vadd.d $vr8, $vr14, $vr2 + vadd.d $vr5, $vr5, $vr2 + vsrli.d $vr5, $vr5, 16 + vsrli.d $vr8, $vr8, 16 + vpickev.w $vr5, $vr5, $vr8 + vpickev.h $vr5, $vr5, $vr7 + vpickev.b $vr4, $vr5, $vr4 + vst $vr4, $t7, 0 addi.d $s4, $s4, -16 addi.d $s3, $s3, 16 addi.d $t8, $t8, 16 @@ -575,8 +600,8 @@ fullsize_smooth_downsample: # @fullsize_smooth_downsample bnez $s4, .LBB3_12 # %bb.13: # %middle.block # in Loop: Header=BB3_6 Depth=1 - vpickve2gr.w $s3, $vr4, 2 - vpickve2gr.w $s4, $vr4, 3 + vpickve2gr.w $s3, $vr3, 2 + vpickve2gr.w $s4, $vr3, 3 move $t7, $t0 .LBB3_14: # %scalar.ph.preheader # in Loop: Header=BB3_6 Depth=1 @@ -1245,12 +1270,12 @@ int_downsample: # @int_downsample ld.h $s4, $t7, 0 vinsgr2vr.h $vr3, $s2, 0 vinsgr2vr.h $vr4, $s4, 0 - vilvl.b $vr3, $vr0, $vr3 - vilvl.h $vr3, $vr0, $vr3 - vilvl.w $vr3, $vr0, $vr3 - vilvl.b $vr4, $vr0, $vr4 - vilvl.h $vr4, $vr0, $vr4 - vilvl.w $vr4, $vr0, $vr4 + vsllwil.hu.bu $vr3, $vr3, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vsllwil.du.wu $vr3, $vr3, 0 + vsllwil.hu.bu $vr4, $vr4, 0 + vsllwil.wu.hu $vr4, $vr4, 0 + vsllwil.du.wu $vr4, $vr4, 0 vadd.d $vr1, $vr1, $vr3 vadd.d $vr2, $vr2, $vr4 addi.d $t8, $t8, -4 diff --git a/results/MultiSource/Benchmarks/mediabench/jpeg/jpeg-6a/CMakeFiles/cjpeg.dir/jddctmgr.s b/results/MultiSource/Benchmarks/mediabench/jpeg/jpeg-6a/CMakeFiles/cjpeg.dir/jddctmgr.s index 6aaa75ce..96a42439 100644 --- a/results/MultiSource/Benchmarks/mediabench/jpeg/jpeg-6a/CMakeFiles/cjpeg.dir/jddctmgr.s +++ b/results/MultiSource/Benchmarks/mediabench/jpeg/jpeg-6a/CMakeFiles/cjpeg.dir/jddctmgr.s @@ -82,24 +82,24 @@ jinit_inverse_dct: # @jinit_inverse_dct .type start_pass,@function start_pass: # @start_pass # %bb.0: - addi.d $sp, $sp, -192 - st.d $ra, $sp, 184 # 8-byte Folded Spill - st.d $fp, $sp, 176 # 8-byte Folded Spill - st.d $s0, $sp, 168 # 8-byte Folded Spill - st.d $s1, $sp, 160 # 8-byte Folded Spill - st.d $s2, $sp, 152 # 8-byte Folded Spill - st.d $s3, $sp, 144 # 8-byte Folded Spill - st.d $s4, $sp, 136 # 8-byte Folded Spill - st.d $s5, $sp, 128 # 8-byte Folded Spill - st.d $s6, $sp, 120 # 8-byte Folded Spill - st.d $s7, $sp, 112 # 8-byte Folded Spill - st.d $s8, $sp, 104 # 8-byte Folded Spill - fst.d $fs0, $sp, 96 # 8-byte Folded Spill - fst.d $fs1, $sp, 88 # 8-byte Folded Spill - fst.d $fs2, $sp, 80 # 8-byte Folded Spill - fst.d $fs3, $sp, 72 # 8-byte Folded Spill - fst.d $fs4, $sp, 64 # 8-byte Folded Spill - fst.d $fs5, $sp, 56 # 8-byte Folded Spill + addi.d $sp, $sp, -176 + st.d $ra, $sp, 168 # 8-byte Folded Spill + st.d $fp, $sp, 160 # 8-byte Folded Spill + st.d $s0, $sp, 152 # 8-byte Folded Spill + st.d $s1, $sp, 144 # 8-byte Folded Spill + st.d $s2, $sp, 136 # 8-byte Folded Spill + st.d $s3, $sp, 128 # 8-byte Folded Spill + st.d $s4, $sp, 120 # 8-byte Folded Spill + st.d $s5, $sp, 112 # 8-byte Folded Spill + st.d $s6, $sp, 104 # 8-byte Folded Spill + st.d $s7, $sp, 96 # 8-byte Folded Spill + st.d $s8, $sp, 88 # 8-byte Folded Spill + fst.d $fs0, $sp, 80 # 8-byte Folded 
Spill + fst.d $fs1, $sp, 72 # 8-byte Folded Spill + fst.d $fs2, $sp, 64 # 8-byte Folded Spill + fst.d $fs3, $sp, 56 # 8-byte Folded Spill + fst.d $fs4, $sp, 48 # 8-byte Folded Spill + fst.d $fs5, $sp, 40 # 8-byte Folded Spill move $fp, $a0 ld.w $a0, $a0, 48 blez $a0, .LBB1_21 @@ -110,7 +110,6 @@ start_pass: # @start_pass addi.d $a6, $a0, 88 pcalau12i $a0, %got_pc_hi20(jpeg_idct_1x1) ld.d $s3, $a0, %got_pc_lo12(jpeg_idct_1x1) - ori $a7, $zero, 7 pcalau12i $a0, %pc_hi20(.LCPI1_0) fld.d $fs0, $a0, %pc_lo12(.LCPI1_0) pcalau12i $a0, %pc_hi20(.LCPI1_1) @@ -123,11 +122,11 @@ start_pass: # @start_pass fld.d $fs4, $a0, %pc_lo12(.LCPI1_4) pcalau12i $a0, %pc_hi20(.LCPI1_5) fld.d $fs5, $a0, %pc_lo12(.LCPI1_5) - vrepli.b $vr5, 0 + ori $a7, $zero, 7 pcalau12i $a0, %pc_hi20(start_pass.aanscales) addi.d $s7, $a0, %pc_lo12(start_pass.aanscales) ori $a0, $zero, 2048 - vreplgr2vr.d $vr6, $a0 + vreplgr2vr.d $vr5, $a0 ori $s8, $zero, 128 pcalau12i $a0, %pc_hi20(.LJTI1_0) addi.d $s6, $a0, %pc_lo12(.LJTI1_0) @@ -137,9 +136,8 @@ start_pass: # @start_pass move $s5, $zero move $a0, $zero move $a1, $zero - st.d $a6, $sp, 48 # 8-byte Folded Spill - vst $vr5, $sp, 32 # 16-byte Folded Spill - vst $vr6, $sp, 16 # 16-byte Folded Spill + st.d $a6, $sp, 32 # 8-byte Folded Spill + vst $vr5, $sp, 16 # 16-byte Folded Spill b .LBB1_4 .LBB1_2: # %vector.body # in Loop: Header=BB1_4 Depth=1 @@ -147,40 +145,40 @@ start_pass: # @start_pass ld.d $a5, $a3, 8 vinsgr2vr.d $vr0, $a4, 0 vinsgr2vr.d $vr1, $a5, 0 - vilvl.h $vr0, $vr5, $vr0 - vilvl.h $vr1, $vr5, $vr1 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.wu.hu $vr1, $vr1, 0 ld.d $a4, $a3, 16 ld.d $a5, $a3, 24 vst $vr0, $a2, 0 vst $vr1, $a2, 16 vinsgr2vr.d $vr0, $a4, 0 vinsgr2vr.d $vr1, $a5, 0 - vilvl.h $vr0, $vr5, $vr0 - vilvl.h $vr1, $vr5, $vr1 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.wu.hu $vr1, $vr1, 0 ld.d $a4, $a3, 32 ld.d $a5, $a3, 40 vst $vr0, $a2, 32 vst $vr1, $a2, 48 vinsgr2vr.d $vr0, $a4, 0 vinsgr2vr.d $vr1, $a5, 0 - vilvl.h $vr0, $vr5, $vr0 - vilvl.h $vr1, $vr5, $vr1 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.wu.hu $vr1, $vr1, 0 ld.d $a4, $a3, 48 ld.d $a5, $a3, 56 vst $vr0, $a2, 64 vst $vr1, $a2, 80 vinsgr2vr.d $vr0, $a4, 0 vinsgr2vr.d $vr1, $a5, 0 - vilvl.h $vr0, $vr5, $vr0 - vilvl.h $vr1, $vr5, $vr1 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.wu.hu $vr1, $vr1, 0 ld.d $a4, $a3, 64 ld.d $a5, $a3, 72 vst $vr0, $a2, 96 vst $vr1, $a2, 112 vinsgr2vr.d $vr0, $a4, 0 vinsgr2vr.d $vr1, $a5, 0 - vilvl.h $vr0, $vr5, $vr0 - vilvl.h $vr1, $vr5, $vr1 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.wu.hu $vr1, $vr1, 0 ld.d $a4, $a3, 80 ld.d $a5, $a3, 88 vst $vr0, $a2, 128 @@ -188,23 +186,23 @@ start_pass: # @start_pass vinsgr2vr.d $vr0, $a4, 0 vinsgr2vr.d $vr1, $a5, 0 ld.d $a4, $a3, 96 - vilvl.h $vr0, $vr5, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vst $vr0, $a2, 160 ld.d $a5, $a3, 104 vinsgr2vr.d $vr0, $a4, 0 - vilvl.h $vr1, $vr5, $vr1 + vsllwil.wu.hu $vr1, $vr1, 0 vst $vr1, $a2, 176 vinsgr2vr.d $vr1, $a5, 0 - vilvl.h $vr0, $vr5, $vr0 - vilvl.h $vr1, $vr5, $vr1 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.wu.hu $vr1, $vr1, 0 ld.d $a4, $a3, 112 ld.d $a3, $a3, 120 vst $vr0, $a2, 192 vst $vr1, $a2, 208 vinsgr2vr.d $vr0, $a4, 0 vinsgr2vr.d $vr1, $a3, 0 - vilvl.h $vr0, $vr5, $vr0 - vilvl.h $vr1, $vr5, $vr1 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.wu.hu $vr1, $vr1, 0 vst $vr0, $a2, 224 vst $vr1, $a2, 240 .p2align 4, , 16 @@ -264,10 +262,9 @@ start_pass: # @start_pass st.w $a2, $a0, 40 move $a0, $fp jirl $ra, $a1, 0 - vld $vr6, $sp, 16 # 16-byte Folded Reload - vld $vr5, $sp, 32 # 16-byte Folded Reload + vld $vr5, $sp, 16 # 
16-byte Folded Reload ori $a7, $zero, 7 - ld.d $a6, $sp, 48 # 8-byte Folded Reload + ld.d $a6, $sp, 32 # 8-byte Folded Reload move $a1, $s2 move $a0, $s4 .LBB1_13: # in Loop: Header=BB1_4 Depth=1 @@ -714,21 +711,21 @@ start_pass: # @start_pass # => This Inner Loop Header: Depth=2 ldx.d $a5, $a3, $a4 vinsgr2vr.d $vr0, $a5, 0 + vsllwil.wu.hu $vr1, $vr0, 0 + vsllwil.du.wu $vr1, $vr1, 0 ldx.d $a5, $s7, $a4 - vilvl.h $vr0, $vr5, $vr0 - vilvl.w $vr1, $vr5, $vr0 - vilvh.w $vr0, $vr5, $vr0 + vshuf4i.h $vr0, $vr0, 14 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.du.wu $vr0, $vr0, 0 vinsgr2vr.d $vr2, $a5, 0 - vilvl.h $vr2, $vr2, $vr2 - vilvl.w $vr3, $vr2, $vr2 - vslli.d $vr3, $vr3, 48 - vsrai.d $vr3, $vr3, 48 - vilvh.w $vr2, $vr2, $vr2 - vslli.d $vr2, $vr2, 48 - vsrai.d $vr2, $vr2, 48 - vori.b $vr4, $vr6, 0 + vsllwil.w.h $vr3, $vr2, 0 + vsllwil.d.w $vr3, $vr3, 0 + vshuf4i.h $vr2, $vr2, 14 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.d.w $vr2, $vr2, 0 + vori.b $vr4, $vr5, 0 vmadd.d $vr4, $vr2, $vr0 - vori.b $vr0, $vr6, 0 + vori.b $vr0, $vr5, 0 vmadd.d $vr0, $vr3, $vr1 vsrli.d $vr0, $vr0, 12 vsrli.d $vr1, $vr4, 12 @@ -739,24 +736,24 @@ start_pass: # @start_pass bne $a4, $s8, .LBB1_20 b .LBB1_3 .LBB1_21: # %._crit_edge - fld.d $fs5, $sp, 56 # 8-byte Folded Reload - fld.d $fs4, $sp, 64 # 8-byte Folded Reload - fld.d $fs3, $sp, 72 # 8-byte Folded Reload - fld.d $fs2, $sp, 80 # 8-byte Folded Reload - fld.d $fs1, $sp, 88 # 8-byte Folded Reload - fld.d $fs0, $sp, 96 # 8-byte Folded Reload - ld.d $s8, $sp, 104 # 8-byte Folded Reload - ld.d $s7, $sp, 112 # 8-byte Folded Reload - ld.d $s6, $sp, 120 # 8-byte Folded Reload - ld.d $s5, $sp, 128 # 8-byte Folded Reload - ld.d $s4, $sp, 136 # 8-byte Folded Reload - ld.d $s3, $sp, 144 # 8-byte Folded Reload - ld.d $s2, $sp, 152 # 8-byte Folded Reload - ld.d $s1, $sp, 160 # 8-byte Folded Reload - ld.d $s0, $sp, 168 # 8-byte Folded Reload - ld.d $fp, $sp, 176 # 8-byte Folded Reload - ld.d $ra, $sp, 184 # 8-byte Folded Reload - addi.d $sp, $sp, 192 + fld.d $fs5, $sp, 40 # 8-byte Folded Reload + fld.d $fs4, $sp, 48 # 8-byte Folded Reload + fld.d $fs3, $sp, 56 # 8-byte Folded Reload + fld.d $fs2, $sp, 64 # 8-byte Folded Reload + fld.d $fs1, $sp, 72 # 8-byte Folded Reload + fld.d $fs0, $sp, 80 # 8-byte Folded Reload + ld.d $s8, $sp, 88 # 8-byte Folded Reload + ld.d $s7, $sp, 96 # 8-byte Folded Reload + ld.d $s6, $sp, 104 # 8-byte Folded Reload + ld.d $s5, $sp, 112 # 8-byte Folded Reload + ld.d $s4, $sp, 120 # 8-byte Folded Reload + ld.d $s3, $sp, 128 # 8-byte Folded Reload + ld.d $s2, $sp, 136 # 8-byte Folded Reload + ld.d $s1, $sp, 144 # 8-byte Folded Reload + ld.d $s0, $sp, 152 # 8-byte Folded Reload + ld.d $fp, $sp, 160 # 8-byte Folded Reload + ld.d $ra, $sp, 168 # 8-byte Folded Reload + addi.d $sp, $sp, 176 ret .Lfunc_end1: .size start_pass, .Lfunc_end1-start_pass diff --git a/results/MultiSource/Benchmarks/mediabench/jpeg/jpeg-6a/CMakeFiles/cjpeg.dir/jdmarker.s b/results/MultiSource/Benchmarks/mediabench/jpeg/jpeg-6a/CMakeFiles/cjpeg.dir/jdmarker.s index 7d7bcfd9..45aa1368 100644 --- a/results/MultiSource/Benchmarks/mediabench/jpeg/jpeg-6a/CMakeFiles/cjpeg.dir/jdmarker.s +++ b/results/MultiSource/Benchmarks/mediabench/jpeg/jpeg-6a/CMakeFiles/cjpeg.dir/jdmarker.s @@ -364,46 +364,44 @@ reset_marker_reader: # @reset_marker_reader .type read_markers,@function read_markers: # @read_markers # %bb.0: - addi.d $sp, $sp, -592 - st.d $ra, $sp, 584 # 8-byte Folded Spill - st.d $fp, $sp, 576 # 8-byte Folded Spill - st.d $s0, $sp, 568 # 8-byte Folded Spill - st.d $s1, $sp, 560 # 8-byte Folded 
Spill - st.d $s2, $sp, 552 # 8-byte Folded Spill - st.d $s3, $sp, 544 # 8-byte Folded Spill - st.d $s4, $sp, 536 # 8-byte Folded Spill - st.d $s5, $sp, 528 # 8-byte Folded Spill - st.d $s6, $sp, 520 # 8-byte Folded Spill - st.d $s7, $sp, 512 # 8-byte Folded Spill - st.d $s8, $sp, 504 # 8-byte Folded Spill + addi.d $sp, $sp, -576 + st.d $ra, $sp, 568 # 8-byte Folded Spill + st.d $fp, $sp, 560 # 8-byte Folded Spill + st.d $s0, $sp, 552 # 8-byte Folded Spill + st.d $s1, $sp, 544 # 8-byte Folded Spill + st.d $s2, $sp, 536 # 8-byte Folded Spill + st.d $s3, $sp, 528 # 8-byte Folded Spill + st.d $s4, $sp, 520 # 8-byte Folded Spill + st.d $s5, $sp, 512 # 8-byte Folded Spill + st.d $s6, $sp, 504 # 8-byte Folded Spill + st.d $s7, $sp, 496 # 8-byte Folded Spill + st.d $s8, $sp, 488 # 8-byte Folded Spill move $fp, $a0 addi.d $a0, $a0, 360 - st.d $a0, $sp, 192 # 8-byte Folded Spill - addi.d $a0, $fp, 192 st.d $a0, $sp, 200 # 8-byte Folded Spill + addi.d $a0, $fp, 192 + st.d $a0, $sp, 208 # 8-byte Folded Spill addi.d $a0, $fp, 256 - st.d $a0, $sp, 136 # 8-byte Folded Spill + st.d $a0, $sp, 144 # 8-byte Folded Spill addi.d $a0, $fp, 224 - st.d $a0, $sp, 128 # 8-byte Folded Spill + st.d $a0, $sp, 136 # 8-byte Folded Spill addi.d $s8, $fp, 312 addi.d $a0, $fp, 328 - st.d $a0, $sp, 216 # 8-byte Folded Spill + st.d $a0, $sp, 224 # 8-byte Folded Spill addi.d $a0, $fp, 344 - st.d $a0, $sp, 184 # 8-byte Folded Spill + st.d $a0, $sp, 192 # 8-byte Folded Spill ld.w $s0, $fp, 524 ori $s6, $zero, 255 ori $s1, $zero, 253 lu12i.w $a0, 4112 ori $a0, $a0, 257 - st.d $a0, $sp, 176 # 8-byte Folded Spill + st.d $a0, $sp, 184 # 8-byte Folded Spill lu12i.w $a0, 20560 ori $a0, $a0, 1285 - st.d $a0, $sp, 168 # 8-byte Folded Spill + st.d $a0, $sp, 176 # 8-byte Folded Spill lu12i.w $a0, 16 ori $a0, $a0, 1 - st.d $a0, $sp, 160 # 8-byte Folded Spill - vrepli.b $vr0, 0 - vst $vr0, $sp, 224 # 16-byte Folded Spill + st.d $a0, $sp, 168 # 8-byte Folded Spill b .LBB4_3 .LBB4_1: # in Loop: Header=BB4_3 Depth=1 ld.d $a0, $fp, 0 @@ -648,22 +646,22 @@ read_markers: # @read_markers # in Loop: Header=BB4_3 Depth=1 st.d $zero, $s8, 8 st.d $zero, $s8, 0 - ld.d $a1, $sp, 176 # 8-byte Folded Reload + ld.d $a1, $sp, 184 # 8-byte Folded Reload bstrins.d $a1, $a1, 56, 32 - ld.d $a2, $sp, 216 # 8-byte Folded Reload + ld.d $a2, $sp, 224 # 8-byte Folded Reload st.d $a1, $a2, 0 st.d $a1, $a2, 8 - ld.d $a1, $sp, 168 # 8-byte Folded Reload + ld.d $a1, $sp, 176 # 8-byte Folded Reload bstrins.d $a1, $a1, 58, 32 - ld.d $a2, $sp, 184 # 8-byte Folded Reload + ld.d $a2, $sp, 192 # 8-byte Folded Reload st.d $a1, $a2, 0 st.d $a1, $a2, 8 st.w $zero, $fp, 52 st.w $zero, $fp, 384 - ld.d $a1, $sp, 192 # 8-byte Folded Reload + ld.d $a1, $sp, 200 # 8-byte Folded Reload st.d $zero, $a1, 0 st.b $zero, $a1, 8 - ld.d $a1, $sp, 160 # 8-byte Folded Reload + ld.d $a1, $sp, 168 # 8-byte Folded Reload st.w $a1, $fp, 370 st.w $zero, $fp, 376 st.b $zero, $fp, 380 @@ -765,11 +763,11 @@ read_markers: # @read_markers jirl $ra, $a1, 0 .LBB4_58: # in Loop: Header=BB4_53 Depth=2 slli.d $a0, $s4, 3 - ld.d $a1, $sp, 200 # 8-byte Folded Reload + ld.d $a1, $sp, 208 # 8-byte Folded Reload ldx.d $s0, $a1, $a0 bnez $s0, .LBB4_60 # %bb.59: # in Loop: Header=BB4_53 Depth=2 - ld.d $a0, $sp, 200 # 8-byte Folded Reload + ld.d $a0, $sp, 208 # 8-byte Folded Reload alsl.d $s4, $s4, $a0, 3 move $a0, $fp pcaddu18i $ra, %call36(jpeg_alloc_quant_table) @@ -870,12 +868,11 @@ read_markers: # @read_markers ld.d $a1, $s0, 0 ld.d $a2, $s0, 8 vinsgr2vr.d $vr0, $a1, 0 - vld $vr1, $sp, 224 # 16-byte Folded 
Reload - vilvl.h $vr0, $vr1, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vst $vr0, $a0, 44 vinsgr2vr.d $vr0, $a2, 0 ld.d $a2, $a0, 8 - vilvl.h $vr0, $vr1, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vst $vr0, $a0, 60 ori $s4, $zero, 92 st.w $s4, $a0, 40 @@ -886,12 +883,11 @@ read_markers: # @read_markers ld.d $a1, $fp, 0 ld.d $a2, $s0, 24 vinsgr2vr.d $vr0, $a0, 0 - vld $vr1, $sp, 224 # 16-byte Folded Reload - vilvl.h $vr0, $vr1, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vst $vr0, $a1, 44 vinsgr2vr.d $vr0, $a2, 0 ld.d $a2, $a1, 8 - vilvl.h $vr0, $vr1, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vst $vr0, $a1, 60 st.w $s4, $a1, 40 ori $a1, $zero, 2 @@ -901,12 +897,11 @@ read_markers: # @read_markers ld.d $a1, $fp, 0 ld.d $a2, $s0, 40 vinsgr2vr.d $vr0, $a0, 0 - vld $vr1, $sp, 224 # 16-byte Folded Reload - vilvl.h $vr0, $vr1, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vst $vr0, $a1, 44 vinsgr2vr.d $vr0, $a2, 0 ld.d $a2, $a1, 8 - vilvl.h $vr0, $vr1, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vst $vr0, $a1, 60 st.w $s4, $a1, 40 ori $a1, $zero, 2 @@ -916,12 +911,11 @@ read_markers: # @read_markers ld.d $a1, $fp, 0 ld.d $a2, $s0, 56 vinsgr2vr.d $vr0, $a0, 0 - vld $vr1, $sp, 224 # 16-byte Folded Reload - vilvl.h $vr0, $vr1, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vst $vr0, $a1, 44 vinsgr2vr.d $vr0, $a2, 0 ld.d $a2, $a1, 8 - vilvl.h $vr0, $vr1, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vst $vr0, $a1, 60 st.w $s4, $a1, 40 ori $a1, $zero, 2 @@ -931,12 +925,11 @@ read_markers: # @read_markers ld.d $a1, $fp, 0 ld.d $a2, $s0, 72 vinsgr2vr.d $vr0, $a0, 0 - vld $vr1, $sp, 224 # 16-byte Folded Reload - vilvl.h $vr0, $vr1, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vst $vr0, $a1, 44 vinsgr2vr.d $vr0, $a2, 0 ld.d $a2, $a1, 8 - vilvl.h $vr0, $vr1, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vst $vr0, $a1, 60 st.w $s4, $a1, 40 ori $a1, $zero, 2 @@ -946,12 +939,11 @@ read_markers: # @read_markers ld.d $a1, $fp, 0 ld.d $a2, $s0, 88 vinsgr2vr.d $vr0, $a0, 0 - vld $vr1, $sp, 224 # 16-byte Folded Reload - vilvl.h $vr0, $vr1, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vst $vr0, $a1, 44 vinsgr2vr.d $vr0, $a2, 0 ld.d $a2, $a1, 8 - vilvl.h $vr0, $vr1, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vst $vr0, $a1, 60 st.w $s4, $a1, 40 ori $a1, $zero, 2 @@ -961,12 +953,11 @@ read_markers: # @read_markers ld.d $a1, $fp, 0 ld.d $a2, $s0, 104 vinsgr2vr.d $vr0, $a0, 0 - vld $vr1, $sp, 224 # 16-byte Folded Reload - vilvl.h $vr0, $vr1, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vst $vr0, $a1, 44 vinsgr2vr.d $vr0, $a2, 0 ld.d $a2, $a1, 8 - vilvl.h $vr0, $vr1, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vst $vr0, $a1, 60 st.w $s4, $a1, 40 ori $a1, $zero, 2 @@ -976,12 +967,11 @@ read_markers: # @read_markers ld.d $a1, $fp, 0 ld.d $a2, $s0, 120 vinsgr2vr.d $vr0, $a0, 0 - vld $vr1, $sp, 224 # 16-byte Folded Reload - vilvl.h $vr0, $vr1, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vst $vr0, $a1, 44 vinsgr2vr.d $vr0, $a2, 0 ld.d $a2, $a1, 8 - vilvl.h $vr0, $vr1, $vr0 + vsllwil.wu.hu $vr0, $vr0, 0 vst $vr0, $a1, 60 st.w $s4, $a1, 40 ori $a1, $zero, 2 @@ -1069,63 +1059,63 @@ read_markers: # @read_markers bltu $a1, $a0, .LBB4_158 # %bb.90: # %.lr.ph161.i # in Loop: Header=BB4_3 Depth=1 - addi.d $s3, $a1, -2 - st.d $s8, $sp, 8 # 8-byte Folded Spill + addi.d $s2, $a1, -2 + st.d $s8, $sp, 16 # 8-byte Folded Spill b .LBB4_93 .p2align 4, , 16 .LBB4_91: # in Loop: Header=BB4_93 Depth=2 - ld.d $a0, $sp, 136 # 8-byte Folded Reload + ld.d $a0, $sp, 144 # 8-byte Folded Reload alsl.d $a0, $s6, $a0, 3 - ld.d $a1, $sp, 128 # 8-byte Folded Reload - ld.d $a2, $sp, 56 # 8-byte Folded Reload + ld.d $a1, $sp, 136 # 8-byte Folded Reload + ld.d $a2, $sp, 64 # 8-byte Folded Reload 
alsl.d $a1, $a2, $a1, 3 - masknez $a0, $a0, $s3 - maskeqz $a1, $a1, $s3 - or $s3, $a1, $a0 - ld.d $a0, $s3, 0 + masknez $a0, $a0, $s2 + maskeqz $a1, $a1, $s2 + or $s2, $a1, $a0 + ld.d $a0, $s2, 0 beqz $a0, .LBB4_157 .LBB4_92: # in Loop: Header=BB4_93 Depth=2 st.b $zero, $a0, 0 - ld.d $a1, $sp, 152 # 8-byte Folded Reload + ld.d $a1, $sp, 160 # 8-byte Folded Reload st.b $a1, $a0, 1 - ld.d $a1, $sp, 144 # 8-byte Folded Reload + ld.d $a1, $sp, 152 # 8-byte Folded Reload st.b $a1, $a0, 2 - ld.d $a1, $sp, 120 # 8-byte Folded Reload + ld.d $a1, $sp, 128 # 8-byte Folded Reload st.b $a1, $a0, 3 - ld.d $a1, $sp, 112 # 8-byte Folded Reload + ld.d $a1, $sp, 120 # 8-byte Folded Reload st.b $a1, $a0, 4 - ld.d $a1, $sp, 104 # 8-byte Folded Reload + ld.d $a1, $sp, 112 # 8-byte Folded Reload st.b $a1, $a0, 5 - ld.d $a1, $sp, 96 # 8-byte Folded Reload + ld.d $a1, $sp, 104 # 8-byte Folded Reload st.b $a1, $a0, 6 - ld.d $a1, $sp, 88 # 8-byte Folded Reload + ld.d $a1, $sp, 96 # 8-byte Folded Reload st.b $a1, $a0, 7 - ld.d $a1, $sp, 80 # 8-byte Folded Reload + ld.d $a1, $sp, 88 # 8-byte Folded Reload st.b $a1, $a0, 8 - ld.d $a1, $sp, 208 # 8-byte Folded Reload + ld.d $a1, $sp, 216 # 8-byte Folded Reload st.b $a1, $a0, 9 - ld.d $a1, $sp, 72 # 8-byte Folded Reload + ld.d $a1, $sp, 80 # 8-byte Folded Reload st.b $a1, $a0, 10 - ld.d $a1, $sp, 64 # 8-byte Folded Reload + ld.d $a1, $sp, 72 # 8-byte Folded Reload st.b $a1, $a0, 11 - st.b $s2, $a0, 12 - st.b $s5, $a0, 13 + st.b $s5, $a0, 12 + st.b $s3, $a0, 13 st.b $s7, $a0, 14 st.b $s8, $a0, 15 - ld.d $a1, $sp, 40 # 8-byte Folded Reload + ld.d $a1, $sp, 48 # 8-byte Folded Reload st.b $a1, $a0, 16 - ld.d $a0, $s3, 0 - ld.d $a1, $sp, 32 # 8-byte Folded Reload - ld.d $a2, $sp, 48 # 8-byte Folded Reload - sub.d $s3, $a2, $a1 + ld.d $a0, $s2, 0 + ld.d $a1, $sp, 40 # 8-byte Folded Reload + ld.d $a2, $sp, 56 # 8-byte Folded Reload + sub.d $s2, $a2, $a1 addi.d $a0, $a0, 17 - addi.d $a1, $sp, 248 + addi.d $a1, $sp, 232 ori $a2, $zero, 256 pcaddu18i $ra, %call36(memcpy) jirl $ra, $ra, 0 - ld.d $s8, $sp, 8 # 8-byte Folded Reload + ld.d $s8, $sp, 16 # 8-byte Folded Reload ori $s6, $zero, 255 - blez $s3, .LBB4_158 + blez $s2, .LBB4_158 .LBB4_93: # Parent Loop BB4_3 Depth=1 # => This Loop Header: Depth=2 # Child Loop BB4_151 Depth 3 @@ -1141,7 +1131,7 @@ read_markers: # @read_markers .LBB4_96: # in Loop: Header=BB4_93 Depth=2 ld.bu $a1, $s4, 0 ld.d $a0, $fp, 0 - st.d $a1, $sp, 56 # 8-byte Folded Spill + st.d $a1, $sp, 64 # 8-byte Folded Spill st.w $a1, $a0, 44 ld.d $a1, $fp, 0 ld.d $a2, $a1, 8 @@ -1155,72 +1145,72 @@ read_markers: # @read_markers # %bb.97: # in Loop: Header=BB4_93 Depth=2 addi.d $a0, $s4, 1 ld.bu $a2, $a0, 0 - st.d $a2, $sp, 152 # 8-byte Folded Spill + st.d $a2, $sp, 160 # 8-byte Folded Spill addi.d $a1, $a1, -1 beqz $a1, .LBB4_115 .LBB4_98: # in Loop: Header=BB4_93 Depth=2 addi.d $a0, $a0, 1 ld.bu $a2, $a0, 0 - st.d $a2, $sp, 144 # 8-byte Folded Spill + st.d $a2, $sp, 152 # 8-byte Folded Spill addi.d $a1, $a1, -1 beqz $a1, .LBB4_117 .LBB4_99: # in Loop: Header=BB4_93 Depth=2 addi.d $a0, $a0, 1 ld.bu $a2, $a0, 0 - st.d $a2, $sp, 120 # 8-byte Folded Spill + st.d $a2, $sp, 128 # 8-byte Folded Spill addi.d $a1, $a1, -1 beqz $a1, .LBB4_119 .LBB4_100: # in Loop: Header=BB4_93 Depth=2 addi.d $a0, $a0, 1 ld.bu $a2, $a0, 0 - st.d $a2, $sp, 112 # 8-byte Folded Spill + st.d $a2, $sp, 120 # 8-byte Folded Spill addi.d $a1, $a1, -1 beqz $a1, .LBB4_121 .LBB4_101: # in Loop: Header=BB4_93 Depth=2 addi.d $a0, $a0, 1 ld.bu $a2, $a0, 0 - st.d $a2, $sp, 104 # 8-byte Folded Spill + 
st.d $a2, $sp, 112 # 8-byte Folded Spill addi.d $a1, $a1, -1 beqz $a1, .LBB4_123 .LBB4_102: # in Loop: Header=BB4_93 Depth=2 addi.d $a0, $a0, 1 ld.bu $a2, $a0, 0 - st.d $a2, $sp, 96 # 8-byte Folded Spill + st.d $a2, $sp, 104 # 8-byte Folded Spill addi.d $a1, $a1, -1 beqz $a1, .LBB4_125 .LBB4_103: # in Loop: Header=BB4_93 Depth=2 addi.d $a0, $a0, 1 ld.bu $a2, $a0, 0 - st.d $a2, $sp, 88 # 8-byte Folded Spill + st.d $a2, $sp, 96 # 8-byte Folded Spill addi.d $a1, $a1, -1 beqz $a1, .LBB4_127 .LBB4_104: # in Loop: Header=BB4_93 Depth=2 addi.d $a0, $a0, 1 ld.bu $a2, $a0, 0 - st.d $a2, $sp, 80 # 8-byte Folded Spill + st.d $a2, $sp, 88 # 8-byte Folded Spill addi.d $a1, $a1, -1 beqz $a1, .LBB4_129 .LBB4_105: # in Loop: Header=BB4_93 Depth=2 addi.d $a0, $a0, 1 ld.bu $a2, $a0, 0 - st.d $a2, $sp, 208 # 8-byte Folded Spill + st.d $a2, $sp, 216 # 8-byte Folded Spill addi.d $a1, $a1, -1 beqz $a1, .LBB4_131 .LBB4_106: # in Loop: Header=BB4_93 Depth=2 addi.d $a0, $a0, 1 ld.bu $a2, $a0, 0 - st.d $a2, $sp, 72 # 8-byte Folded Spill + st.d $a2, $sp, 80 # 8-byte Folded Spill addi.d $a1, $a1, -1 beqz $a1, .LBB4_133 .LBB4_107: # in Loop: Header=BB4_93 Depth=2 addi.d $a0, $a0, 1 ld.bu $a2, $a0, 0 - st.d $a2, $sp, 64 # 8-byte Folded Spill + st.d $a2, $sp, 72 # 8-byte Folded Spill addi.d $a1, $a1, -1 beqz $a1, .LBB4_135 .LBB4_108: # in Loop: Header=BB4_93 Depth=2 addi.d $a0, $a0, 1 - ld.bu $s2, $a0, 0 + ld.bu $s3, $a0, 0 addi.d $a1, $a1, -1 beqz $a1, .LBB4_137 .LBB4_109: # in Loop: Header=BB4_93 Depth=2 @@ -1239,9 +1229,10 @@ read_markers: # @read_markers addi.d $a1, $a1, -1 beqz $a1, .LBB4_143 .LBB4_112: # in Loop: Header=BB4_93 Depth=2 - st.d $a1, $sp, 32 # 8-byte Folded Spill - move $s5, $s4 - move $a5, $s3 + st.d $a1, $sp, 40 # 8-byte Folded Spill + move $t3, $s4 + move $s5, $s3 + move $a5, $s2 addi.d $t6, $a0, 1 b .LBB4_145 .p2align 4, , 16 @@ -1254,7 +1245,7 @@ read_markers: # @read_markers ld.d $a0, $s0, 0 ld.d $a1, $s0, 8 ld.bu $a2, $a0, 0 - st.d $a2, $sp, 152 # 8-byte Folded Spill + st.d $a2, $sp, 160 # 8-byte Folded Spill addi.d $a1, $a1, -1 bnez $a1, .LBB4_98 .LBB4_115: # in Loop: Header=BB4_93 Depth=2 @@ -1266,7 +1257,7 @@ read_markers: # @read_markers ld.d $a0, $s0, 0 ld.d $a1, $s0, 8 ld.bu $a2, $a0, 0 - st.d $a2, $sp, 144 # 8-byte Folded Spill + st.d $a2, $sp, 152 # 8-byte Folded Spill addi.d $a1, $a1, -1 bnez $a1, .LBB4_99 .LBB4_117: # in Loop: Header=BB4_93 Depth=2 @@ -1278,7 +1269,7 @@ read_markers: # @read_markers ld.d $a0, $s0, 0 ld.d $a1, $s0, 8 ld.bu $a2, $a0, 0 - st.d $a2, $sp, 120 # 8-byte Folded Spill + st.d $a2, $sp, 128 # 8-byte Folded Spill addi.d $a1, $a1, -1 bnez $a1, .LBB4_100 .LBB4_119: # in Loop: Header=BB4_93 Depth=2 @@ -1290,7 +1281,7 @@ read_markers: # @read_markers ld.d $a0, $s0, 0 ld.d $a1, $s0, 8 ld.bu $a2, $a0, 0 - st.d $a2, $sp, 112 # 8-byte Folded Spill + st.d $a2, $sp, 120 # 8-byte Folded Spill addi.d $a1, $a1, -1 bnez $a1, .LBB4_101 .LBB4_121: # in Loop: Header=BB4_93 Depth=2 @@ -1302,7 +1293,7 @@ read_markers: # @read_markers ld.d $a0, $s0, 0 ld.d $a1, $s0, 8 ld.bu $a2, $a0, 0 - st.d $a2, $sp, 104 # 8-byte Folded Spill + st.d $a2, $sp, 112 # 8-byte Folded Spill addi.d $a1, $a1, -1 bnez $a1, .LBB4_102 .LBB4_123: # in Loop: Header=BB4_93 Depth=2 @@ -1314,7 +1305,7 @@ read_markers: # @read_markers ld.d $a0, $s0, 0 ld.d $a1, $s0, 8 ld.bu $a2, $a0, 0 - st.d $a2, $sp, 96 # 8-byte Folded Spill + st.d $a2, $sp, 104 # 8-byte Folded Spill addi.d $a1, $a1, -1 bnez $a1, .LBB4_103 .LBB4_125: # in Loop: Header=BB4_93 Depth=2 @@ -1326,7 +1317,7 @@ read_markers: # @read_markers ld.d $a0, 
$s0, 0 ld.d $a1, $s0, 8 ld.bu $a2, $a0, 0 - st.d $a2, $sp, 88 # 8-byte Folded Spill + st.d $a2, $sp, 96 # 8-byte Folded Spill addi.d $a1, $a1, -1 bnez $a1, .LBB4_104 .LBB4_127: # in Loop: Header=BB4_93 Depth=2 @@ -1338,7 +1329,7 @@ read_markers: # @read_markers ld.d $a0, $s0, 0 ld.d $a1, $s0, 8 ld.bu $a2, $a0, 0 - st.d $a2, $sp, 80 # 8-byte Folded Spill + st.d $a2, $sp, 88 # 8-byte Folded Spill addi.d $a1, $a1, -1 bnez $a1, .LBB4_105 .LBB4_129: # in Loop: Header=BB4_93 Depth=2 @@ -1350,7 +1341,7 @@ read_markers: # @read_markers ld.d $a0, $s0, 0 ld.d $a1, $s0, 8 ld.bu $a2, $a0, 0 - st.d $a2, $sp, 208 # 8-byte Folded Spill + st.d $a2, $sp, 216 # 8-byte Folded Spill addi.d $a1, $a1, -1 bnez $a1, .LBB4_106 .LBB4_131: # in Loop: Header=BB4_93 Depth=2 @@ -1362,7 +1353,7 @@ read_markers: # @read_markers ld.d $a0, $s0, 0 ld.d $a1, $s0, 8 ld.bu $a2, $a0, 0 - st.d $a2, $sp, 72 # 8-byte Folded Spill + st.d $a2, $sp, 80 # 8-byte Folded Spill addi.d $a1, $a1, -1 bnez $a1, .LBB4_107 .LBB4_133: # in Loop: Header=BB4_93 Depth=2 @@ -1374,7 +1365,7 @@ read_markers: # @read_markers ld.d $a0, $s0, 0 ld.d $a1, $s0, 8 ld.bu $a2, $a0, 0 - st.d $a2, $sp, 64 # 8-byte Folded Spill + st.d $a2, $sp, 72 # 8-byte Folded Spill addi.d $a1, $a1, -1 bnez $a1, .LBB4_108 .LBB4_135: # in Loop: Header=BB4_93 Depth=2 @@ -1385,7 +1376,7 @@ read_markers: # @read_markers # %bb.136: # in Loop: Header=BB4_93 Depth=2 ld.d $a0, $s0, 0 ld.d $a1, $s0, 8 - ld.bu $s2, $a0, 0 + ld.bu $s3, $a0, 0 addi.d $a1, $a1, -1 bnez $a1, .LBB4_109 .LBB4_137: # in Loop: Header=BB4_93 Depth=2 @@ -1427,39 +1418,41 @@ read_markers: # @read_markers jirl $ra, $a1, 0 beqz $a0, .LBB4_232 # %bb.144: # in Loop: Header=BB4_93 Depth=2 - move $s5, $s4 - move $a5, $s3 + move $t3, $s4 + move $s5, $s3 + move $a5, $s2 ld.d $t6, $s0, 0 ld.d $a0, $s0, 8 - st.d $a0, $sp, 32 # 8-byte Folded Spill + st.d $a0, $sp, 40 # 8-byte Folded Spill .LBB4_145: # in Loop: Header=BB4_93 Depth=2 - ld.d $a2, $sp, 152 # 8-byte Folded Reload - ld.d $a3, $sp, 144 # 8-byte Folded Reload + ld.d $a2, $sp, 160 # 8-byte Folded Reload + ld.d $a3, $sp, 152 # 8-byte Folded Reload add.d $a0, $a3, $a2 - ld.d $a4, $sp, 120 # 8-byte Folded Reload + ld.d $a4, $sp, 128 # 8-byte Folded Reload add.d $a0, $a0, $a4 - ld.d $a6, $sp, 112 # 8-byte Folded Reload + ld.d $a6, $sp, 120 # 8-byte Folded Reload add.d $a0, $a0, $a6 - ld.d $a7, $sp, 104 # 8-byte Folded Reload + ld.d $a7, $sp, 112 # 8-byte Folded Reload add.d $a0, $a0, $a7 - ld.d $t0, $sp, 96 # 8-byte Folded Reload + ld.d $t0, $sp, 104 # 8-byte Folded Reload add.d $a0, $a0, $t0 - ld.d $t1, $sp, 88 # 8-byte Folded Reload + ld.d $t1, $sp, 96 # 8-byte Folded Reload add.d $a0, $a0, $t1 - ld.d $t2, $sp, 80 # 8-byte Folded Reload + ld.d $t2, $sp, 88 # 8-byte Folded Reload add.d $a0, $a0, $t2 - ld.d $t7, $sp, 208 # 8-byte Folded Reload - add.d $a0, $a0, $t7 - ld.d $s4, $sp, 72 # 8-byte Folded Reload + ld.d $a1, $sp, 216 # 8-byte Folded Reload + add.d $a0, $a0, $a1 + ld.d $s4, $sp, 80 # 8-byte Folded Reload add.d $a0, $a0, $s4 - ld.d $s1, $sp, 64 # 8-byte Folded Reload + ld.d $s1, $sp, 72 # 8-byte Folded Reload add.d $a0, $a0, $s1 - st.d $a0, $sp, 16 # 8-byte Folded Spill - add.d $a0, $a0, $s2 - ld.d $a1, $fp, 0 + st.d $a0, $sp, 24 # 8-byte Folded Spill add.d $a0, $a0, $s5 + ld.d $a1, $fp, 0 + move $s3, $t3 + add.d $a0, $a0, $t3 add.d $a0, $a0, $s7 - st.d $t6, $sp, 24 # 8-byte Folded Spill + st.d $t6, $sp, 32 # 8-byte Folded Spill ld.bu $s6, $t6, 0 st.w $a2, $a1, 44 st.w $a3, $a1, 48 @@ -1473,23 +1466,23 @@ read_markers: # @read_markers st.w $a2, $a1, 40 ld.d 
$a2, $a1, 8 add.d $a0, $a0, $s8 - add.d $s3, $a0, $s6 + add.d $s2, $a0, $s6 addi.d $a0, $a5, -17 - st.d $a0, $sp, 48 # 8-byte Folded Spill + st.d $a0, $sp, 56 # 8-byte Folded Spill ori $a1, $zero, 2 move $a0, $fp jirl $ra, $a2, 0 ld.d $a0, $fp, 0 - ld.d $a1, $sp, 208 # 8-byte Folded Reload + ld.d $a1, $sp, 216 # 8-byte Folded Reload st.w $a1, $a0, 44 st.w $s4, $a0, 48 st.w $s1, $a0, 52 - st.w $s2, $a0, 56 - st.w $s5, $a0, 60 + st.w $s5, $a0, 56 + st.w $s3, $a0, 60 st.w $s7, $a0, 64 ld.d $a2, $a0, 8 st.w $s8, $a0, 68 - st.d $s6, $sp, 40 # 8-byte Folded Spill + st.d $s6, $sp, 48 # 8-byte Folded Spill st.w $s6, $a0, 72 ori $a1, $zero, 85 st.w $a1, $a0, 40 @@ -1497,10 +1490,10 @@ read_markers: # @read_markers move $a0, $fp jirl $ra, $a2, 0 ori $a0, $zero, 256 - bltu $a0, $s3, .LBB4_147 + bltu $a0, $s2, .LBB4_147 # %bb.146: # in Loop: Header=BB4_93 Depth=2 - ld.d $a0, $sp, 48 # 8-byte Folded Reload - bge $a0, $s3, .LBB4_148 + ld.d $a0, $sp, 56 # 8-byte Folded Reload + bge $a0, $s2, .LBB4_148 .LBB4_147: # in Loop: Header=BB4_93 Depth=2 ld.d $a0, $fp, 0 ld.d $a1, $a0, 0 @@ -1509,22 +1502,22 @@ read_markers: # @read_markers move $a0, $fp jirl $ra, $a1, 0 .LBB4_148: # in Loop: Header=BB4_93 Depth=2 - ld.d $a0, $sp, 24 # 8-byte Folded Reload + ld.d $a0, $sp, 32 # 8-byte Folded Reload addi.d $a0, $a0, 1 - ld.d $a1, $sp, 32 # 8-byte Folded Reload + ld.d $a1, $sp, 40 # 8-byte Folded Reload addi.d $s1, $a1, -1 - st.d $s3, $sp, 32 # 8-byte Folded Spill - beqz $s3, .LBB4_154 + st.d $s2, $sp, 40 # 8-byte Folded Spill + beqz $s2, .LBB4_154 # %bb.149: # %.lr.ph.i78.preheader # in Loop: Header=BB4_93 Depth=2 - ld.d $a1, $sp, 16 # 8-byte Folded Reload - add.d $a1, $a1, $s2 + ld.d $a1, $sp, 24 # 8-byte Folded Reload add.d $a1, $a1, $s5 + add.d $a1, $a1, $s3 add.d $a1, $a1, $s7 add.d $a1, $a1, $s8 - ld.d $a2, $sp, 40 # 8-byte Folded Reload - add.w $s3, $a1, $a2 - addi.d $s6, $sp, 248 + ld.d $a2, $sp, 48 # 8-byte Folded Reload + add.w $s2, $a1, $a2 + addi.d $s6, $sp, 232 b .LBB4_151 .p2align 4, , 16 .LBB4_150: # in Loop: Header=BB4_151 Depth=3 @@ -1532,10 +1525,10 @@ read_markers: # @read_markers addi.d $s1, $s1, -1 addi.d $s4, $a0, 1 st.b $a1, $s6, 0 - addi.d $s3, $s3, -1 + addi.d $s2, $s2, -1 addi.d $s6, $s6, 1 move $a0, $s4 - beqz $s3, .LBB4_155 + beqz $s2, .LBB4_155 .LBB4_151: # %.lr.ph.i78 # Parent Loop BB4_3 Depth=1 # Parent Loop BB4_93 Depth=2 @@ -1554,12 +1547,12 @@ read_markers: # @read_markers move $s4, $a0 .LBB4_155: # %._crit_edge.i82 # in Loop: Header=BB4_93 Depth=2 - ld.d $a1, $sp, 56 # 8-byte Folded Reload + ld.d $a1, $sp, 64 # 8-byte Folded Reload andi $a0, $a1, 16 - sltui $s3, $a0, 1 + sltui $s2, $a0, 1 addi.d $s6, $a1, -16 - masknez $a0, $s6, $s3 - maskeqz $a1, $a1, $s3 + masknez $a0, $s6, $s2 + maskeqz $a1, $a1, $s2 or $a0, $a1, $a0 ori $a1, $zero, 4 blt $a0, $a1, .LBB4_91 @@ -1577,7 +1570,7 @@ read_markers: # @read_markers move $a0, $fp pcaddu18i $ra, %call36(jpeg_alloc_huff_table) jirl $ra, $ra, 0 - st.d $a0, $s3, 0 + st.d $a0, $s2, 0 b .LBB4_92 .LBB4_158: # %get_dht.exit # in Loop: Header=BB4_3 Depth=1 @@ -1736,7 +1729,7 @@ read_markers: # @read_markers andi $a0, $s5, 15 stx.b $a0, $s8, $s4 srli.d $a1, $s5, 4 - ld.d $a2, $sp, 216 # 8-byte Folded Reload + ld.d $a2, $sp, 224 # 8-byte Folded Reload stx.b $a1, $a2, $s4 bgeu $a1, $a0, .LBB4_172 # %bb.183: # in Loop: Header=BB4_173 Depth=2 @@ -2053,18 +2046,18 @@ read_markers: # @read_markers .LBB4_232: move $a0, $zero .LBB4_233: # %first_marker.exit.thread - ld.d $s8, $sp, 504 # 8-byte Folded Reload - ld.d $s7, $sp, 512 # 8-byte Folded Reload - 
ld.d $s6, $sp, 520 # 8-byte Folded Reload - ld.d $s5, $sp, 528 # 8-byte Folded Reload - ld.d $s4, $sp, 536 # 8-byte Folded Reload - ld.d $s3, $sp, 544 # 8-byte Folded Reload - ld.d $s2, $sp, 552 # 8-byte Folded Reload - ld.d $s1, $sp, 560 # 8-byte Folded Reload - ld.d $s0, $sp, 568 # 8-byte Folded Reload - ld.d $fp, $sp, 576 # 8-byte Folded Reload - ld.d $ra, $sp, 584 # 8-byte Folded Reload - addi.d $sp, $sp, 592 + ld.d $s8, $sp, 488 # 8-byte Folded Reload + ld.d $s7, $sp, 496 # 8-byte Folded Reload + ld.d $s6, $sp, 504 # 8-byte Folded Reload + ld.d $s5, $sp, 512 # 8-byte Folded Reload + ld.d $s4, $sp, 520 # 8-byte Folded Reload + ld.d $s3, $sp, 528 # 8-byte Folded Reload + ld.d $s2, $sp, 536 # 8-byte Folded Reload + ld.d $s1, $sp, 544 # 8-byte Folded Reload + ld.d $s0, $sp, 552 # 8-byte Folded Reload + ld.d $fp, $sp, 560 # 8-byte Folded Reload + ld.d $ra, $sp, 568 # 8-byte Folded Reload + addi.d $sp, $sp, 576 ret .Lfunc_end4: .size read_markers, .Lfunc_end4-read_markers diff --git a/results/MultiSource/Benchmarks/mediabench/jpeg/jpeg-6a/CMakeFiles/cjpeg.dir/jdsample.s b/results/MultiSource/Benchmarks/mediabench/jpeg/jpeg-6a/CMakeFiles/cjpeg.dir/jdsample.s index d5d0c193..3588829f 100644 --- a/results/MultiSource/Benchmarks/mediabench/jpeg/jpeg-6a/CMakeFiles/cjpeg.dir/jdsample.s +++ b/results/MultiSource/Benchmarks/mediabench/jpeg/jpeg-6a/CMakeFiles/cjpeg.dir/jdsample.s @@ -407,11 +407,10 @@ h2v1_fancy_upsample: # @h2v1_fancy_upsample st.d $s6, $sp, 40 # 8-byte Folded Spill st.d $s7, $sp, 32 # 8-byte Folded Spill st.d $s8, $sp, 24 # 8-byte Folded Spill - move $a4, $zero ld.d $a3, $a3, 0 + move $a4, $zero ori $t4, $zero, 16 - vrepli.b $vr0, 0 - vrepli.h $vr1, 3 + vrepli.h $vr0, 3 b .LBB5_4 .LBB5_2: # in Loop: Header=BB5_4 Depth=1 addi.d $a6, $a7, -2 @@ -484,123 +483,126 @@ h2v1_fancy_upsample: # @h2v1_fancy_upsample # => This Inner Loop Header: Depth=2 addi.d $a5, $a6, 2 pcalau12i $t4, %pc_hi20(.LCPI5_0) - vld $vr2, $t4, %pc_lo12(.LCPI5_0) - vreplgr2vr.d $vr3, $a6 + vld $vr1, $t4, %pc_lo12(.LCPI5_0) + vreplgr2vr.d $vr2, $a6 pcalau12i $t4, %pc_hi20(.LCPI5_1) - vld $vr4, $t4, %pc_lo12(.LCPI5_1) + vld $vr3, $t4, %pc_lo12(.LCPI5_1) pcalau12i $t4, %pc_hi20(.LCPI5_2) - vld $vr5, $t4, %pc_lo12(.LCPI5_2) + vld $vr4, $t4, %pc_lo12(.LCPI5_2) pcalau12i $t4, %pc_hi20(.LCPI5_3) - vld $vr6, $t4, %pc_lo12(.LCPI5_3) - vadd.d $vr7, $vr3, $vr2 - vadd.d $vr8, $vr3, $vr4 - vadd.d $vr9, $vr3, $vr5 - vadd.d $vr10, $vr3, $vr6 + vld $vr5, $t4, %pc_lo12(.LCPI5_3) + vadd.d $vr6, $vr2, $vr1 + vadd.d $vr7, $vr2, $vr3 + vadd.d $vr8, $vr2, $vr4 + vadd.d $vr9, $vr2, $vr5 pcalau12i $t4, %pc_hi20(.LCPI5_4) - vld $vr11, $t4, %pc_lo12(.LCPI5_4) + vld $vr10, $t4, %pc_lo12(.LCPI5_4) pcalau12i $t4, %pc_hi20(.LCPI5_5) - vld $vr12, $t4, %pc_lo12(.LCPI5_5) + vld $vr11, $t4, %pc_lo12(.LCPI5_5) pcalau12i $t4, %pc_hi20(.LCPI5_6) - vld $vr13, $t4, %pc_lo12(.LCPI5_6) + vld $vr12, $t4, %pc_lo12(.LCPI5_6) pcalau12i $t4, %pc_hi20(.LCPI5_7) - vld $vr14, $t4, %pc_lo12(.LCPI5_7) - vadd.d $vr15, $vr3, $vr11 - vadd.d $vr16, $vr3, $vr12 - vadd.d $vr17, $vr3, $vr13 - vadd.d $vr18, $vr3, $vr14 - vreplgr2vr.d $vr3, $a5 - vadd.d $vr2, $vr3, $vr2 - vadd.d $vr4, $vr3, $vr4 - vadd.d $vr5, $vr3, $vr5 - vadd.d $vr6, $vr3, $vr6 - vadd.d $vr11, $vr3, $vr11 - vadd.d $vr12, $vr3, $vr12 - vld $vr19, $t5, -1 - vadd.d $vr13, $vr3, $vr13 - vld $vr20, $t5, -2 - vadd.d $vr14, $vr3, $vr14 - vilvl.b $vr3, $vr0, $vr19 - vilvh.b $vr19, $vr0, $vr19 - vilvl.b $vr21, $vr0, $vr20 - vilvh.b $vr20, $vr0, $vr20 - vmadd.h $vr20, $vr19, $vr1 - vmadd.h $vr21, $vr3, $vr1 - 
vaddi.hu $vr21, $vr21, 1 + vld $vr13, $t4, %pc_lo12(.LCPI5_7) + vadd.d $vr14, $vr2, $vr10 + vadd.d $vr15, $vr2, $vr11 + vadd.d $vr16, $vr2, $vr12 + vadd.d $vr17, $vr2, $vr13 + vreplgr2vr.d $vr2, $a5 + vadd.d $vr1, $vr2, $vr1 + vadd.d $vr3, $vr2, $vr3 + vadd.d $vr18, $vr2, $vr4 + vadd.d $vr5, $vr2, $vr5 + vadd.d $vr10, $vr2, $vr10 + vadd.d $vr11, $vr2, $vr11 + vld $vr4, $t5, -1 + vadd.d $vr12, $vr2, $vr12 + vadd.d $vr13, $vr2, $vr13 + vld $vr19, $t5, -2 + vsllwil.hu.bu $vr2, $vr4, 0 + vbsrl.v $vr4, $vr4, 8 + vsllwil.hu.bu $vr4, $vr4, 0 + vsllwil.hu.bu $vr20, $vr19, 0 + vbsrl.v $vr19, $vr19, 8 + vsllwil.hu.bu $vr19, $vr19, 0 + vmadd.h $vr19, $vr4, $vr0 + vmadd.h $vr20, $vr2, $vr0 vaddi.hu $vr20, $vr20, 1 + vaddi.hu $vr19, $vr19, 1 + vsrli.h $vr19, $vr19, 2 vsrli.h $vr20, $vr20, 2 - vsrli.h $vr21, $vr21, 2 - vpickve2gr.d $t7, $vr18, 1 - vpickve2gr.d $t8, $vr17, 0 - vpickve2gr.d $fp, $vr17, 1 - vpickve2gr.d $s0, $vr16, 0 - vpickve2gr.d $s1, $vr16, 1 - vpickve2gr.d $s2, $vr15, 0 - vpickve2gr.d $s3, $vr15, 1 - vpickve2gr.d $s4, $vr10, 0 - vpickve2gr.d $s5, $vr10, 1 - vpickve2gr.d $s6, $vr9, 0 - vpickve2gr.d $s7, $vr9, 1 - vpickve2gr.d $s8, $vr8, 0 - vpickve2gr.d $ra, $vr8, 1 - vpickve2gr.d $a5, $vr7, 0 - vpickve2gr.d $t4, $vr7, 1 - vpickve2gr.d $t2, $vr14, 0 - vstelm.b $vr21, $t2, 0, 0 - vpickve2gr.d $t2, $vr14, 1 - vstelm.b $vr21, $t2, 0, 2 + vpickve2gr.d $t7, $vr17, 1 + vpickve2gr.d $t8, $vr16, 0 + vpickve2gr.d $fp, $vr16, 1 + vpickve2gr.d $s0, $vr15, 0 + vpickve2gr.d $s1, $vr15, 1 + vpickve2gr.d $s2, $vr14, 0 + vpickve2gr.d $s3, $vr14, 1 + vpickve2gr.d $s4, $vr9, 0 + vpickve2gr.d $s5, $vr9, 1 + vpickve2gr.d $s6, $vr8, 0 + vpickve2gr.d $s7, $vr8, 1 + vpickve2gr.d $s8, $vr7, 0 + vpickve2gr.d $ra, $vr7, 1 + vpickve2gr.d $a5, $vr6, 0 + vpickve2gr.d $t4, $vr6, 1 vpickve2gr.d $t2, $vr13, 0 - vstelm.b $vr21, $t2, 0, 4 - vpickve2gr.d $t2, $vr13, 1 - vstelm.b $vr21, $t2, 0, 6 - vpickve2gr.d $t2, $vr12, 0 - vstelm.b $vr21, $t2, 0, 8 - vpickve2gr.d $t2, $vr12, 1 - vstelm.b $vr21, $t2, 0, 10 - vpickve2gr.d $t2, $vr11, 0 - vstelm.b $vr21, $t2, 0, 12 - vpickve2gr.d $t2, $vr11, 1 - vstelm.b $vr21, $t2, 0, 14 - vpickve2gr.d $t2, $vr6, 0 vstelm.b $vr20, $t2, 0, 0 - vpickve2gr.d $t2, $vr6, 1 + vpickve2gr.d $t2, $vr13, 1 vstelm.b $vr20, $t2, 0, 2 - vpickve2gr.d $t2, $vr5, 0 + vpickve2gr.d $t2, $vr12, 0 vstelm.b $vr20, $t2, 0, 4 - vpickve2gr.d $t2, $vr5, 1 + vpickve2gr.d $t2, $vr12, 1 vstelm.b $vr20, $t2, 0, 6 - vpickve2gr.d $t2, $vr4, 0 + vpickve2gr.d $t2, $vr11, 0 vstelm.b $vr20, $t2, 0, 8 - vpickve2gr.d $t2, $vr4, 1 + vpickve2gr.d $t2, $vr11, 1 vstelm.b $vr20, $t2, 0, 10 - vpickve2gr.d $t2, $vr2, 0 - vld $vr4, $t5, 0 + vpickve2gr.d $t2, $vr10, 0 vstelm.b $vr20, $t2, 0, 12 - vpickve2gr.d $t2, $vr2, 1 + vpickve2gr.d $t2, $vr10, 1 vstelm.b $vr20, $t2, 0, 14 - vilvh.b $vr2, $vr0, $vr4 - vmadd.h $vr2, $vr19, $vr1 - vilvl.b $vr4, $vr0, $vr4 - vmadd.h $vr4, $vr3, $vr1 - vaddi.hu $vr3, $vr4, 2 - vaddi.hu $vr2, $vr2, 2 + vpickve2gr.d $t2, $vr5, 0 + vstelm.b $vr19, $t2, 0, 0 + vpickve2gr.d $t2, $vr5, 1 + vstelm.b $vr19, $t2, 0, 2 + vpickve2gr.d $t2, $vr18, 0 + vstelm.b $vr19, $t2, 0, 4 + vpickve2gr.d $t2, $vr18, 1 + vstelm.b $vr19, $t2, 0, 6 + vpickve2gr.d $t2, $vr3, 0 + vstelm.b $vr19, $t2, 0, 8 + vpickve2gr.d $t2, $vr3, 1 + vstelm.b $vr19, $t2, 0, 10 + vpickve2gr.d $t2, $vr1, 0 + vld $vr3, $t5, 0 + vstelm.b $vr19, $t2, 0, 12 + vpickve2gr.d $t2, $vr1, 1 + vstelm.b $vr19, $t2, 0, 14 + vbsrl.v $vr1, $vr3, 8 + vsllwil.hu.bu $vr1, $vr1, 0 + vmadd.h $vr1, $vr4, $vr0 + vsllwil.hu.bu $vr3, $vr3, 0 + vmadd.h $vr3, $vr2, $vr0 
+ vaddi.hu $vr2, $vr3, 2 + vaddi.hu $vr1, $vr1, 2 + vsrli.h $vr1, $vr1, 2 vsrli.h $vr2, $vr2, 2 - vsrli.h $vr3, $vr3, 2 - vstelm.b $vr3, $a6, 3, 0 - vstelm.b $vr3, $t7, 3, 2 - vstelm.b $vr3, $t8, 3, 4 - vstelm.b $vr3, $fp, 3, 6 - vstelm.b $vr3, $s0, 3, 8 - vstelm.b $vr3, $s1, 3, 10 - vstelm.b $vr3, $s2, 3, 12 - vstelm.b $vr3, $s3, 3, 14 - vstelm.b $vr2, $s4, 3, 0 - vstelm.b $vr2, $s5, 3, 2 - vstelm.b $vr2, $s6, 3, 4 - vstelm.b $vr2, $s7, 3, 6 - vstelm.b $vr2, $s8, 3, 8 - vstelm.b $vr2, $ra, 3, 10 - vstelm.b $vr2, $a5, 3, 12 - vstelm.b $vr2, $t4, 3, 14 + vstelm.b $vr2, $a6, 3, 0 + vstelm.b $vr2, $t7, 3, 2 + vstelm.b $vr2, $t8, 3, 4 + vstelm.b $vr2, $fp, 3, 6 + vstelm.b $vr2, $s0, 3, 8 + vstelm.b $vr2, $s1, 3, 10 + vstelm.b $vr2, $s2, 3, 12 + vstelm.b $vr2, $s3, 3, 14 + vstelm.b $vr1, $s4, 3, 0 + vstelm.b $vr1, $s5, 3, 2 + vstelm.b $vr1, $s6, 3, 4 + vstelm.b $vr1, $s7, 3, 6 + vstelm.b $vr1, $s8, 3, 8 + vstelm.b $vr1, $ra, 3, 10 + vstelm.b $vr1, $a5, 3, 12 + vstelm.b $vr1, $t4, 3, 14 addi.d $a6, $a6, 32 addi.d $t6, $t6, -16 addi.d $t5, $t5, 16 @@ -791,31 +793,29 @@ h2v2_fancy_upsample: # @h2v2_fancy_upsample ld.w $a4, $a0, 392 blez $a4, .LBB7_30 # %bb.1: # %.preheader.lr.ph - addi.d $sp, $sp, -160 - st.d $ra, $sp, 152 # 8-byte Folded Spill - st.d $fp, $sp, 144 # 8-byte Folded Spill - st.d $s0, $sp, 136 # 8-byte Folded Spill - st.d $s1, $sp, 128 # 8-byte Folded Spill - st.d $s2, $sp, 120 # 8-byte Folded Spill - st.d $s3, $sp, 112 # 8-byte Folded Spill - st.d $s4, $sp, 104 # 8-byte Folded Spill - st.d $s5, $sp, 96 # 8-byte Folded Spill - st.d $s6, $sp, 88 # 8-byte Folded Spill - st.d $s7, $sp, 80 # 8-byte Folded Spill - st.d $s8, $sp, 72 # 8-byte Folded Spill - fst.d $fs0, $sp, 64 # 8-byte Folded Spill - fst.d $fs1, $sp, 56 # 8-byte Folded Spill - fst.d $fs2, $sp, 48 # 8-byte Folded Spill - fst.d $fs3, $sp, 40 # 8-byte Folded Spill - fst.d $fs4, $sp, 32 # 8-byte Folded Spill - fst.d $fs5, $sp, 24 # 8-byte Folded Spill - fst.d $fs6, $sp, 16 # 8-byte Folded Spill + addi.d $sp, $sp, -144 + st.d $ra, $sp, 136 # 8-byte Folded Spill + st.d $fp, $sp, 128 # 8-byte Folded Spill + st.d $s0, $sp, 120 # 8-byte Folded Spill + st.d $s1, $sp, 112 # 8-byte Folded Spill + st.d $s2, $sp, 104 # 8-byte Folded Spill + st.d $s3, $sp, 96 # 8-byte Folded Spill + st.d $s4, $sp, 88 # 8-byte Folded Spill + st.d $s5, $sp, 80 # 8-byte Folded Spill + st.d $s6, $sp, 72 # 8-byte Folded Spill + st.d $s7, $sp, 64 # 8-byte Folded Spill + st.d $s8, $sp, 56 # 8-byte Folded Spill + fst.d $fs0, $sp, 48 # 8-byte Folded Spill + fst.d $fs1, $sp, 40 # 8-byte Folded Spill + fst.d $fs2, $sp, 32 # 8-byte Folded Spill + fst.d $fs3, $sp, 24 # 8-byte Folded Spill + fst.d $fs4, $sp, 16 # 8-byte Folded Spill + fst.d $fs5, $sp, 8 # 8-byte Folded Spill + ld.d $a3, $a3, 0 move $a4, $zero move $a5, $zero - ld.d $a3, $a3, 0 ori $s7, $zero, 16 - vrepli.b $vr0, 0 - vrepli.w $vr1, 3 + vrepli.w $vr0, 3 pcalau12i $t2, %pc_hi20(.LCPI7_3) pcalau12i $t3, %pc_hi20(.LCPI7_4) pcalau12i $t4, %pc_hi20(.LCPI7_5) @@ -914,7 +914,7 @@ h2v2_fancy_upsample: # @h2v2_fancy_upsample add.d $s5, $fp, $s5 alsl.d $s6, $s7, $s2, 4 alsl.d $s7, $s7, $s3, 4 - vinsgr2vr.w $vr11, $ra, 3 + vinsgr2vr.w $vr13, $ra, 3 vinsgr2vr.w $vr8, $a6, 3 move $ra, $s8 .p2align 4, , 16 @@ -922,165 +922,175 @@ h2v2_fancy_upsample: # @h2v2_fancy_upsample # Parent Loop BB7_4 Depth=1 # => This Inner Loop Header: Depth=2 addi.d $a6, $fp, 2 - vld $vr12, $a7, %pc_lo12(.LCPI7_0) + vld $vr9, $a7, %pc_lo12(.LCPI7_0) vreplgr2vr.d $vr10, $fp - vld $vr13, $t0, %pc_lo12(.LCPI7_1) + vld $vr12, $t0, 
%pc_lo12(.LCPI7_1) vld $vr14, $t1, %pc_lo12(.LCPI7_2) vld $vr15, $t2, %pc_lo12(.LCPI7_3) + vadd.d $vr1, $vr10, $vr9 vadd.d $vr2, $vr10, $vr12 - vadd.d $vr3, $vr10, $vr13 - vadd.d $vr4, $vr10, $vr14 - vadd.d $vr5, $vr10, $vr15 - vld $vr17, $t3, %pc_lo12(.LCPI7_4) - vld $vr18, $t4, %pc_lo12(.LCPI7_5) - vld $vr19, $t5, %pc_lo12(.LCPI7_6) - vld $vr21, $t6, %pc_lo12(.LCPI7_7) + vadd.d $vr3, $vr10, $vr14 + vadd.d $vr4, $vr10, $vr15 + vld $vr16, $t3, %pc_lo12(.LCPI7_4) + vld $vr17, $t4, %pc_lo12(.LCPI7_5) + vld $vr18, $t5, %pc_lo12(.LCPI7_6) + vld $vr19, $t6, %pc_lo12(.LCPI7_7) + vadd.d $vr5, $vr10, $vr16 vadd.d $vr6, $vr10, $vr17 vadd.d $vr7, $vr10, $vr18 - vadd.d $vr9, $vr10, $vr19 - vadd.d $vr10, $vr10, $vr21 - vreplgr2vr.d $vr22, $a6 - vadd.d $vr12, $vr22, $vr12 - vadd.d $vr13, $vr22, $vr13 - vadd.d $vr14, $vr22, $vr14 - vadd.d $vr16, $vr22, $vr15 - vadd.d $vr17, $vr22, $vr17 - vld $vr15, $s3, 0 - vadd.d $vr18, $vr22, $vr18 - vadd.d $vr20, $vr22, $vr19 - vadd.d $vr21, $vr22, $vr21 - vilvh.b $vr19, $vr0, $vr15 - vilvl.h $vr24, $vr0, $vr19 - vilvh.h $vr19, $vr0, $vr19 - vilvl.b $vr15, $vr0, $vr15 + vadd.d $vr10, $vr10, $vr19 + vreplgr2vr.d $vr20, $a6 + vadd.d $vr11, $vr20, $vr9 + vadd.d $vr12, $vr20, $vr12 + vadd.d $vr14, $vr20, $vr14 + vadd.d $vr15, $vr20, $vr15 + vadd.d $vr16, $vr20, $vr16 + vld $vr9, $s3, 0 + vadd.d $vr17, $vr20, $vr17 + vadd.d $vr18, $vr20, $vr18 + vadd.d $vr19, $vr20, $vr19 + vbsrl.v $vr20, $vr9, 8 + vsllwil.hu.bu $vr20, $vr20, 0 + vsllwil.wu.hu $vr20, $vr20, 0 + vbsrl.v $vr21, $vr9, 12 + vsllwil.hu.bu $vr21, $vr21, 0 + vsllwil.wu.hu $vr23, $vr21, 0 + vsrli.d $vr21, $vr9, 32 + vsllwil.hu.bu $vr21, $vr21, 0 vld $vr22, $s2, 0 - vilvh.h $vr25, $vr0, $vr15 - vilvl.h $vr26, $vr0, $vr15 - vld $vr27, $t7, %pc_lo12(.LCPI7_8) - vilvh.b $vr23, $vr0, $vr22 - vilvl.h $vr15, $vr0, $vr23 - vbsrl.v $vr28, $vr8, 12 - vshuf.w $vr27, $vr8, $vr11 - vilvh.h $vr8, $vr0, $vr23 - vilvl.b $vr11, $vr0, $vr22 - vilvh.h $vr23, $vr0, $vr11 - vilvl.h $vr22, $vr0, $vr11 - vmadd.w $vr22, $vr26, $vr1 - vmadd.w $vr23, $vr25, $vr1 - vmadd.w $vr8, $vr19, $vr1 - vmadd.w $vr15, $vr24, $vr1 - vbsrl.v $vr11, $vr15, 12 - vbsll.v $vr19, $vr8, 4 - vor.v $vr11, $vr19, $vr11 - vbsrl.v $vr19, $vr23, 12 - vbsll.v $vr24, $vr15, 4 - vor.v $vr19, $vr24, $vr19 - vbsrl.v $vr24, $vr22, 12 - vbsll.v $vr25, $vr23, 4 - vor.v $vr24, $vr25, $vr24 - vbsll.v $vr25, $vr22, 4 - vor.v $vr25, $vr25, $vr28 - vpackev.d $vr26, $vr22, $vr27 - vbsrl.v $vr27, $vr15, 8 - vbsll.v $vr28, $vr8, 8 + vsllwil.wu.hu $vr24, $vr21, 0 + vsllwil.hu.bu $vr9, $vr9, 0 + vsllwil.wu.hu $vr25, $vr9, 0 + vbsrl.v $vr9, $vr22, 8 + vsllwil.hu.bu $vr9, $vr9, 0 + vsllwil.wu.hu $vr9, $vr9, 0 + vld $vr26, $t7, %pc_lo12(.LCPI7_8) + vbsrl.v $vr21, $vr22, 12 + vsllwil.hu.bu $vr21, $vr21, 0 + vbsrl.v $vr27, $vr8, 12 + vshuf.w $vr26, $vr8, $vr13 + vsllwil.wu.hu $vr8, $vr21, 0 + vsrli.d $vr13, $vr22, 32 + vsllwil.hu.bu $vr13, $vr13, 0 + vsllwil.wu.hu $vr21, $vr13, 0 + vsllwil.hu.bu $vr13, $vr22, 0 + vsllwil.wu.hu $vr22, $vr13, 0 + vmadd.w $vr22, $vr25, $vr0 + vmadd.w $vr21, $vr24, $vr0 + vmadd.w $vr8, $vr23, $vr0 + vmadd.w $vr9, $vr20, $vr0 + vbsrl.v $vr13, $vr9, 12 + vbsll.v $vr20, $vr8, 4 + vor.v $vr13, $vr20, $vr13 + vbsrl.v $vr20, $vr21, 12 + vbsll.v $vr23, $vr9, 4 + vor.v $vr20, $vr23, $vr20 + vbsrl.v $vr23, $vr22, 12 + vbsll.v $vr24, $vr21, 4 + vor.v $vr23, $vr24, $vr23 + vbsll.v $vr24, $vr22, 4 + vor.v $vr24, $vr24, $vr27 + vpackev.d $vr25, $vr22, $vr26 + vbsrl.v $vr26, $vr9, 8 + vbsll.v $vr27, $vr8, 8 + vor.v $vr26, $vr27, $vr26 + vbsrl.v $vr27, $vr21, 8 + vbsll.v 
$vr28, $vr9, 8 vor.v $vr27, $vr28, $vr27 - vbsrl.v $vr28, $vr23, 8 - vbsll.v $vr29, $vr15, 8 + vbsrl.v $vr28, $vr22, 8 + vbsll.v $vr29, $vr21, 8 vor.v $vr28, $vr29, $vr28 - vbsrl.v $vr29, $vr22, 8 - vbsll.v $vr30, $vr23, 8 - vor.v $vr29, $vr30, $vr29 - vmadd.w $vr29, $vr24, $vr1 - vmadd.w $vr28, $vr19, $vr1 - vmadd.w $vr27, $vr11, $vr1 - vmadd.w $vr26, $vr25, $vr1 + vmadd.w $vr28, $vr23, $vr0 + vmadd.w $vr27, $vr20, $vr0 + vmadd.w $vr26, $vr13, $vr0 + vmadd.w $vr25, $vr24, $vr0 + vaddi.wu $vr26, $vr26, 8 vaddi.wu $vr27, $vr27, 8 vaddi.wu $vr28, $vr28, 8 - vaddi.wu $vr29, $vr29, 8 - vaddi.wu $vr26, $vr26, 8 - vsrli.w $vr29, $vr29, 4 + vaddi.wu $vr25, $vr25, 8 vsrli.w $vr28, $vr28, 4 vsrli.w $vr27, $vr27, 4 vsrli.w $vr26, $vr26, 4 - vpickev.h $vr27, $vr27, $vr28 - vpickev.h $vr29, $vr29, $vr26 - vpickve2gr.d $a6, $vr21, 0 - vstelm.b $vr26, $a6, 0, 0 - vpickve2gr.d $a6, $vr21, 1 - vstelm.b $vr29, $a6, 0, 2 - vpickve2gr.d $a6, $vr20, 0 - vstelm.b $vr29, $a6, 0, 4 - vpickve2gr.d $a6, $vr20, 1 - vstelm.b $vr29, $a6, 0, 6 + vsrli.w $vr25, $vr25, 4 + vpickev.h $vr26, $vr26, $vr27 + vpickev.h $vr25, $vr28, $vr25 + vpickve2gr.d $a6, $vr19, 0 + vstelm.b $vr25, $a6, 0, 0 + vpickve2gr.d $a6, $vr19, 1 + vstelm.b $vr25, $a6, 0, 2 vpickve2gr.d $a6, $vr18, 0 - vstelm.b $vr29, $a6, 0, 8 + vstelm.b $vr25, $a6, 0, 4 vpickve2gr.d $a6, $vr18, 1 - vstelm.b $vr29, $a6, 0, 10 + vstelm.b $vr25, $a6, 0, 6 vpickve2gr.d $a6, $vr17, 0 - vstelm.b $vr29, $a6, 0, 12 + vstelm.b $vr25, $a6, 0, 8 vpickve2gr.d $a6, $vr17, 1 - vstelm.b $vr29, $a6, 0, 14 + vstelm.b $vr25, $a6, 0, 10 vpickve2gr.d $a6, $vr16, 0 - vstelm.b $vr28, $a6, 0, 0 + vstelm.b $vr25, $a6, 0, 12 vpickve2gr.d $a6, $vr16, 1 - vstelm.b $vr27, $a6, 0, 2 + vstelm.b $vr25, $a6, 0, 14 + vpickve2gr.d $a6, $vr15, 0 + vstelm.b $vr26, $a6, 0, 0 + vpickve2gr.d $a6, $vr15, 1 + vstelm.b $vr26, $a6, 0, 2 vpickve2gr.d $a6, $vr14, 0 - vstelm.b $vr27, $a6, 0, 4 + vstelm.b $vr26, $a6, 0, 4 vpickve2gr.d $a6, $vr14, 1 - vstelm.b $vr27, $a6, 0, 6 - vpickve2gr.d $a6, $vr13, 0 - vstelm.b $vr27, $a6, 0, 8 - vpickve2gr.d $a6, $vr13, 1 - vstelm.b $vr27, $a6, 0, 10 + vstelm.b $vr26, $a6, 0, 6 vpickve2gr.d $a6, $vr12, 0 - vstelm.b $vr27, $a6, 0, 12 + vstelm.b $vr26, $a6, 0, 8 vpickve2gr.d $a6, $vr12, 1 - vstelm.b $vr27, $a6, 0, 14 + vstelm.b $vr26, $a6, 0, 10 + vpickve2gr.d $a6, $vr11, 0 + vstelm.b $vr26, $a6, 0, 12 + vpickve2gr.d $a6, $vr11, 1 + vstelm.b $vr26, $a6, 0, 14 vpickve2gr.d $a6, $vr10, 1 - vmadd.w $vr22, $vr25, $vr1 - vmadd.w $vr23, $vr24, $vr1 - vaddi.wu $vr10, $vr23, 7 - vaddi.wu $vr12, $vr22, 7 - vsrli.w $vr12, $vr12, 4 + vmadd.w $vr22, $vr24, $vr0 + vmadd.w $vr21, $vr23, $vr0 + vaddi.wu $vr10, $vr21, 7 + vaddi.wu $vr11, $vr22, 7 + vsrli.w $vr11, $vr11, 4 vsrli.w $vr10, $vr10, 4 - vpickev.h $vr10, $vr10, $vr12 - vstelm.b $vr12, $fp, 3, 0 + vpickev.h $vr10, $vr10, $vr11 + vstelm.b $vr10, $fp, 3, 0 vstelm.b $vr10, $a6, 3, 2 - vpickve2gr.d $a6, $vr9, 0 - vstelm.b $vr10, $a6, 3, 4 - vpickve2gr.d $a6, $vr9, 1 - vstelm.b $vr10, $a6, 3, 6 vpickve2gr.d $a6, $vr7, 0 - vstelm.b $vr10, $a6, 3, 8 + vstelm.b $vr10, $a6, 3, 4 vpickve2gr.d $a6, $vr7, 1 - vstelm.b $vr10, $a6, 3, 10 + vstelm.b $vr10, $a6, 3, 6 vpickve2gr.d $a6, $vr6, 0 - vstelm.b $vr10, $a6, 3, 12 + vstelm.b $vr10, $a6, 3, 8 vpickve2gr.d $a6, $vr6, 1 - vstelm.b $vr10, $a6, 3, 14 + vstelm.b $vr10, $a6, 3, 10 vpickve2gr.d $a6, $vr5, 0 - vmadd.w $vr15, $vr19, $vr1 - vaddi.wu $vr6, $vr15, 7 - vsrli.w $vr6, $vr6, 4 - vstelm.b $vr6, $a6, 3, 0 + vstelm.b $vr10, $a6, 3, 12 vpickve2gr.d $a6, $vr5, 1 + vstelm.b $vr10, $a6, 3, 14 + 
vpickve2gr.d $a6, $vr4, 0 + vmadd.w $vr9, $vr20, $vr0 vori.b $vr5, $vr8, 0 - vmadd.w $vr5, $vr11, $vr1 + vmadd.w $vr5, $vr13, $vr0 vaddi.wu $vr5, $vr5, 7 + vaddi.wu $vr6, $vr9, 7 + vsrli.w $vr6, $vr6, 4 vsrli.w $vr5, $vr5, 4 vpickev.h $vr5, $vr5, $vr6 + vstelm.b $vr5, $a6, 3, 0 + vpickve2gr.d $a6, $vr4, 1 vstelm.b $vr5, $a6, 3, 2 - vpickve2gr.d $a6, $vr4, 0 + vpickve2gr.d $a6, $vr3, 0 vstelm.b $vr5, $a6, 3, 4 - vpickve2gr.d $a6, $vr4, 1 + vpickve2gr.d $a6, $vr3, 1 vstelm.b $vr5, $a6, 3, 6 - vpickve2gr.d $a6, $vr3, 0 + vpickve2gr.d $a6, $vr2, 0 vstelm.b $vr5, $a6, 3, 8 - vpickve2gr.d $a6, $vr3, 1 + vpickve2gr.d $a6, $vr2, 1 vstelm.b $vr5, $a6, 3, 10 - vpickve2gr.d $a6, $vr2, 0 + vpickve2gr.d $a6, $vr1, 0 vstelm.b $vr5, $a6, 3, 12 - vpickve2gr.d $a6, $vr2, 1 + vpickve2gr.d $a6, $vr1, 1 vstelm.b $vr5, $a6, 3, 14 addi.d $fp, $fp, 32 addi.d $ra, $ra, -16 @@ -1206,7 +1216,7 @@ h2v2_fancy_upsample: # @h2v2_fancy_upsample add.d $s4, $t8, $s4 alsl.d $s5, $s6, $s1, 4 alsl.d $s6, $s6, $s2, 4 - vinsgr2vr.w $vr11, $a6, 3 + vinsgr2vr.w $vr13, $a6, 3 vinsgr2vr.w $vr8, $s8, 3 move $s8, $s7 .p2align 4, , 16 @@ -1214,165 +1224,175 @@ h2v2_fancy_upsample: # @h2v2_fancy_upsample # Parent Loop BB7_4 Depth=1 # => This Inner Loop Header: Depth=2 addi.d $a6, $t8, 2 - vld $vr12, $a7, %pc_lo12(.LCPI7_0) + vld $vr9, $a7, %pc_lo12(.LCPI7_0) vreplgr2vr.d $vr10, $t8 - vld $vr13, $t0, %pc_lo12(.LCPI7_1) + vld $vr12, $t0, %pc_lo12(.LCPI7_1) vld $vr14, $t1, %pc_lo12(.LCPI7_2) vld $vr15, $t2, %pc_lo12(.LCPI7_3) + vadd.d $vr1, $vr10, $vr9 vadd.d $vr2, $vr10, $vr12 - vadd.d $vr3, $vr10, $vr13 - vadd.d $vr4, $vr10, $vr14 - vadd.d $vr5, $vr10, $vr15 - vld $vr17, $t3, %pc_lo12(.LCPI7_4) - vld $vr18, $t4, %pc_lo12(.LCPI7_5) - vld $vr19, $t5, %pc_lo12(.LCPI7_6) - vld $vr21, $t6, %pc_lo12(.LCPI7_7) + vadd.d $vr3, $vr10, $vr14 + vadd.d $vr4, $vr10, $vr15 + vld $vr16, $t3, %pc_lo12(.LCPI7_4) + vld $vr17, $t4, %pc_lo12(.LCPI7_5) + vld $vr18, $t5, %pc_lo12(.LCPI7_6) + vld $vr19, $t6, %pc_lo12(.LCPI7_7) + vadd.d $vr5, $vr10, $vr16 vadd.d $vr6, $vr10, $vr17 vadd.d $vr7, $vr10, $vr18 - vadd.d $vr9, $vr10, $vr19 - vadd.d $vr10, $vr10, $vr21 - vreplgr2vr.d $vr22, $a6 - vadd.d $vr12, $vr22, $vr12 - vadd.d $vr13, $vr22, $vr13 - vadd.d $vr14, $vr22, $vr14 - vadd.d $vr16, $vr22, $vr15 - vadd.d $vr17, $vr22, $vr17 - vld $vr15, $s2, 0 - vadd.d $vr18, $vr22, $vr18 - vadd.d $vr20, $vr22, $vr19 - vadd.d $vr21, $vr22, $vr21 - vilvh.b $vr19, $vr0, $vr15 - vilvl.h $vr24, $vr0, $vr19 - vilvh.h $vr19, $vr0, $vr19 - vilvl.b $vr15, $vr0, $vr15 + vadd.d $vr10, $vr10, $vr19 + vreplgr2vr.d $vr20, $a6 + vadd.d $vr11, $vr20, $vr9 + vadd.d $vr12, $vr20, $vr12 + vadd.d $vr14, $vr20, $vr14 + vadd.d $vr15, $vr20, $vr15 + vadd.d $vr16, $vr20, $vr16 + vld $vr9, $s2, 0 + vadd.d $vr17, $vr20, $vr17 + vadd.d $vr18, $vr20, $vr18 + vadd.d $vr19, $vr20, $vr19 + vbsrl.v $vr20, $vr9, 8 + vsllwil.hu.bu $vr20, $vr20, 0 + vsllwil.wu.hu $vr20, $vr20, 0 + vbsrl.v $vr21, $vr9, 12 + vsllwil.hu.bu $vr21, $vr21, 0 + vsllwil.wu.hu $vr23, $vr21, 0 + vsrli.d $vr21, $vr9, 32 + vsllwil.hu.bu $vr21, $vr21, 0 vld $vr22, $s1, 0 - vilvh.h $vr25, $vr0, $vr15 - vilvl.h $vr26, $vr0, $vr15 - vld $vr27, $t7, %pc_lo12(.LCPI7_8) - vilvh.b $vr23, $vr0, $vr22 - vilvl.h $vr15, $vr0, $vr23 - vbsrl.v $vr28, $vr8, 12 - vshuf.w $vr27, $vr8, $vr11 - vilvh.h $vr8, $vr0, $vr23 - vilvl.b $vr11, $vr0, $vr22 - vilvh.h $vr23, $vr0, $vr11 - vilvl.h $vr22, $vr0, $vr11 - vmadd.w $vr22, $vr26, $vr1 - vmadd.w $vr23, $vr25, $vr1 - vmadd.w $vr8, $vr19, $vr1 - vmadd.w $vr15, $vr24, $vr1 - vbsrl.v $vr11, $vr15, 12 - 
vbsll.v $vr19, $vr8, 4 - vor.v $vr11, $vr19, $vr11 - vbsrl.v $vr19, $vr23, 12 - vbsll.v $vr24, $vr15, 4 - vor.v $vr19, $vr24, $vr19 - vbsrl.v $vr24, $vr22, 12 - vbsll.v $vr25, $vr23, 4 - vor.v $vr24, $vr25, $vr24 - vbsll.v $vr25, $vr22, 4 - vor.v $vr25, $vr25, $vr28 - vpackev.d $vr26, $vr22, $vr27 - vbsrl.v $vr27, $vr15, 8 - vbsll.v $vr28, $vr8, 8 + vsllwil.wu.hu $vr24, $vr21, 0 + vsllwil.hu.bu $vr9, $vr9, 0 + vsllwil.wu.hu $vr25, $vr9, 0 + vbsrl.v $vr9, $vr22, 8 + vsllwil.hu.bu $vr9, $vr9, 0 + vsllwil.wu.hu $vr9, $vr9, 0 + vld $vr26, $t7, %pc_lo12(.LCPI7_8) + vbsrl.v $vr21, $vr22, 12 + vsllwil.hu.bu $vr21, $vr21, 0 + vbsrl.v $vr27, $vr8, 12 + vshuf.w $vr26, $vr8, $vr13 + vsllwil.wu.hu $vr8, $vr21, 0 + vsrli.d $vr13, $vr22, 32 + vsllwil.hu.bu $vr13, $vr13, 0 + vsllwil.wu.hu $vr21, $vr13, 0 + vsllwil.hu.bu $vr13, $vr22, 0 + vsllwil.wu.hu $vr22, $vr13, 0 + vmadd.w $vr22, $vr25, $vr0 + vmadd.w $vr21, $vr24, $vr0 + vmadd.w $vr8, $vr23, $vr0 + vmadd.w $vr9, $vr20, $vr0 + vbsrl.v $vr13, $vr9, 12 + vbsll.v $vr20, $vr8, 4 + vor.v $vr13, $vr20, $vr13 + vbsrl.v $vr20, $vr21, 12 + vbsll.v $vr23, $vr9, 4 + vor.v $vr20, $vr23, $vr20 + vbsrl.v $vr23, $vr22, 12 + vbsll.v $vr24, $vr21, 4 + vor.v $vr23, $vr24, $vr23 + vbsll.v $vr24, $vr22, 4 + vor.v $vr24, $vr24, $vr27 + vpackev.d $vr25, $vr22, $vr26 + vbsrl.v $vr26, $vr9, 8 + vbsll.v $vr27, $vr8, 8 + vor.v $vr26, $vr27, $vr26 + vbsrl.v $vr27, $vr21, 8 + vbsll.v $vr28, $vr9, 8 vor.v $vr27, $vr28, $vr27 - vbsrl.v $vr28, $vr23, 8 - vbsll.v $vr29, $vr15, 8 + vbsrl.v $vr28, $vr22, 8 + vbsll.v $vr29, $vr21, 8 vor.v $vr28, $vr29, $vr28 - vbsrl.v $vr29, $vr22, 8 - vbsll.v $vr30, $vr23, 8 - vor.v $vr29, $vr30, $vr29 - vmadd.w $vr29, $vr24, $vr1 - vmadd.w $vr28, $vr19, $vr1 - vmadd.w $vr27, $vr11, $vr1 - vmadd.w $vr26, $vr25, $vr1 + vmadd.w $vr28, $vr23, $vr0 + vmadd.w $vr27, $vr20, $vr0 + vmadd.w $vr26, $vr13, $vr0 + vmadd.w $vr25, $vr24, $vr0 + vaddi.wu $vr26, $vr26, 8 vaddi.wu $vr27, $vr27, 8 vaddi.wu $vr28, $vr28, 8 - vaddi.wu $vr29, $vr29, 8 - vaddi.wu $vr26, $vr26, 8 - vsrli.w $vr29, $vr29, 4 + vaddi.wu $vr25, $vr25, 8 vsrli.w $vr28, $vr28, 4 vsrli.w $vr27, $vr27, 4 vsrli.w $vr26, $vr26, 4 - vpickev.h $vr27, $vr27, $vr28 - vpickev.h $vr29, $vr29, $vr26 - vpickve2gr.d $a6, $vr21, 0 - vstelm.b $vr26, $a6, 0, 0 - vpickve2gr.d $a6, $vr21, 1 - vstelm.b $vr29, $a6, 0, 2 - vpickve2gr.d $a6, $vr20, 0 - vstelm.b $vr29, $a6, 0, 4 - vpickve2gr.d $a6, $vr20, 1 - vstelm.b $vr29, $a6, 0, 6 + vsrli.w $vr25, $vr25, 4 + vpickev.h $vr26, $vr26, $vr27 + vpickev.h $vr25, $vr28, $vr25 + vpickve2gr.d $a6, $vr19, 0 + vstelm.b $vr25, $a6, 0, 0 + vpickve2gr.d $a6, $vr19, 1 + vstelm.b $vr25, $a6, 0, 2 vpickve2gr.d $a6, $vr18, 0 - vstelm.b $vr29, $a6, 0, 8 + vstelm.b $vr25, $a6, 0, 4 vpickve2gr.d $a6, $vr18, 1 - vstelm.b $vr29, $a6, 0, 10 + vstelm.b $vr25, $a6, 0, 6 vpickve2gr.d $a6, $vr17, 0 - vstelm.b $vr29, $a6, 0, 12 + vstelm.b $vr25, $a6, 0, 8 vpickve2gr.d $a6, $vr17, 1 - vstelm.b $vr29, $a6, 0, 14 + vstelm.b $vr25, $a6, 0, 10 vpickve2gr.d $a6, $vr16, 0 - vstelm.b $vr28, $a6, 0, 0 + vstelm.b $vr25, $a6, 0, 12 vpickve2gr.d $a6, $vr16, 1 - vstelm.b $vr27, $a6, 0, 2 + vstelm.b $vr25, $a6, 0, 14 + vpickve2gr.d $a6, $vr15, 0 + vstelm.b $vr26, $a6, 0, 0 + vpickve2gr.d $a6, $vr15, 1 + vstelm.b $vr26, $a6, 0, 2 vpickve2gr.d $a6, $vr14, 0 - vstelm.b $vr27, $a6, 0, 4 + vstelm.b $vr26, $a6, 0, 4 vpickve2gr.d $a6, $vr14, 1 - vstelm.b $vr27, $a6, 0, 6 - vpickve2gr.d $a6, $vr13, 0 - vstelm.b $vr27, $a6, 0, 8 - vpickve2gr.d $a6, $vr13, 1 - vstelm.b $vr27, $a6, 0, 10 + vstelm.b $vr26, $a6, 0, 6 
vpickve2gr.d $a6, $vr12, 0 - vstelm.b $vr27, $a6, 0, 12 + vstelm.b $vr26, $a6, 0, 8 vpickve2gr.d $a6, $vr12, 1 - vstelm.b $vr27, $a6, 0, 14 + vstelm.b $vr26, $a6, 0, 10 + vpickve2gr.d $a6, $vr11, 0 + vstelm.b $vr26, $a6, 0, 12 + vpickve2gr.d $a6, $vr11, 1 + vstelm.b $vr26, $a6, 0, 14 vpickve2gr.d $a6, $vr10, 1 - vmadd.w $vr22, $vr25, $vr1 - vmadd.w $vr23, $vr24, $vr1 - vaddi.wu $vr10, $vr23, 7 - vaddi.wu $vr12, $vr22, 7 - vsrli.w $vr12, $vr12, 4 + vmadd.w $vr22, $vr24, $vr0 + vmadd.w $vr21, $vr23, $vr0 + vaddi.wu $vr10, $vr21, 7 + vaddi.wu $vr11, $vr22, 7 + vsrli.w $vr11, $vr11, 4 vsrli.w $vr10, $vr10, 4 - vpickev.h $vr10, $vr10, $vr12 - vstelm.b $vr12, $t8, 3, 0 + vpickev.h $vr10, $vr10, $vr11 + vstelm.b $vr10, $t8, 3, 0 vstelm.b $vr10, $a6, 3, 2 - vpickve2gr.d $a6, $vr9, 0 - vstelm.b $vr10, $a6, 3, 4 - vpickve2gr.d $a6, $vr9, 1 - vstelm.b $vr10, $a6, 3, 6 vpickve2gr.d $a6, $vr7, 0 - vstelm.b $vr10, $a6, 3, 8 + vstelm.b $vr10, $a6, 3, 4 vpickve2gr.d $a6, $vr7, 1 - vstelm.b $vr10, $a6, 3, 10 + vstelm.b $vr10, $a6, 3, 6 vpickve2gr.d $a6, $vr6, 0 - vstelm.b $vr10, $a6, 3, 12 + vstelm.b $vr10, $a6, 3, 8 vpickve2gr.d $a6, $vr6, 1 - vstelm.b $vr10, $a6, 3, 14 + vstelm.b $vr10, $a6, 3, 10 vpickve2gr.d $a6, $vr5, 0 - vmadd.w $vr15, $vr19, $vr1 - vaddi.wu $vr6, $vr15, 7 - vsrli.w $vr6, $vr6, 4 - vstelm.b $vr6, $a6, 3, 0 + vstelm.b $vr10, $a6, 3, 12 vpickve2gr.d $a6, $vr5, 1 + vstelm.b $vr10, $a6, 3, 14 + vpickve2gr.d $a6, $vr4, 0 + vmadd.w $vr9, $vr20, $vr0 vori.b $vr5, $vr8, 0 - vmadd.w $vr5, $vr11, $vr1 + vmadd.w $vr5, $vr13, $vr0 vaddi.wu $vr5, $vr5, 7 + vaddi.wu $vr6, $vr9, 7 + vsrli.w $vr6, $vr6, 4 vsrli.w $vr5, $vr5, 4 vpickev.h $vr5, $vr5, $vr6 + vstelm.b $vr5, $a6, 3, 0 + vpickve2gr.d $a6, $vr4, 1 vstelm.b $vr5, $a6, 3, 2 - vpickve2gr.d $a6, $vr4, 0 + vpickve2gr.d $a6, $vr3, 0 vstelm.b $vr5, $a6, 3, 4 - vpickve2gr.d $a6, $vr4, 1 + vpickve2gr.d $a6, $vr3, 1 vstelm.b $vr5, $a6, 3, 6 - vpickve2gr.d $a6, $vr3, 0 + vpickve2gr.d $a6, $vr2, 0 vstelm.b $vr5, $a6, 3, 8 - vpickve2gr.d $a6, $vr3, 1 + vpickve2gr.d $a6, $vr2, 1 vstelm.b $vr5, $a6, 3, 10 - vpickve2gr.d $a6, $vr2, 0 + vpickve2gr.d $a6, $vr1, 0 vstelm.b $vr5, $a6, 3, 12 - vpickve2gr.d $a6, $vr2, 1 + vpickve2gr.d $a6, $vr1, 1 vstelm.b $vr5, $a6, 3, 14 addi.d $t8, $t8, 32 addi.d $s8, $s8, -16 @@ -1424,25 +1444,24 @@ h2v2_fancy_upsample: # @h2v2_fancy_upsample addi.d $t8, $fp, -2 b .LBB7_3 .LBB7_29: - fld.d $fs6, $sp, 16 # 8-byte Folded Reload - fld.d $fs5, $sp, 24 # 8-byte Folded Reload - fld.d $fs4, $sp, 32 # 8-byte Folded Reload - fld.d $fs3, $sp, 40 # 8-byte Folded Reload - fld.d $fs2, $sp, 48 # 8-byte Folded Reload - fld.d $fs1, $sp, 56 # 8-byte Folded Reload - fld.d $fs0, $sp, 64 # 8-byte Folded Reload - ld.d $s8, $sp, 72 # 8-byte Folded Reload - ld.d $s7, $sp, 80 # 8-byte Folded Reload - ld.d $s6, $sp, 88 # 8-byte Folded Reload - ld.d $s5, $sp, 96 # 8-byte Folded Reload - ld.d $s4, $sp, 104 # 8-byte Folded Reload - ld.d $s3, $sp, 112 # 8-byte Folded Reload - ld.d $s2, $sp, 120 # 8-byte Folded Reload - ld.d $s1, $sp, 128 # 8-byte Folded Reload - ld.d $s0, $sp, 136 # 8-byte Folded Reload - ld.d $fp, $sp, 144 # 8-byte Folded Reload - ld.d $ra, $sp, 152 # 8-byte Folded Reload - addi.d $sp, $sp, 160 + fld.d $fs5, $sp, 8 # 8-byte Folded Reload + fld.d $fs4, $sp, 16 # 8-byte Folded Reload + fld.d $fs3, $sp, 24 # 8-byte Folded Reload + fld.d $fs2, $sp, 32 # 8-byte Folded Reload + fld.d $fs1, $sp, 40 # 8-byte Folded Reload + fld.d $fs0, $sp, 48 # 8-byte Folded Reload + ld.d $s8, $sp, 56 # 8-byte Folded Reload + ld.d $s7, $sp, 64 # 8-byte 
Folded Reload + ld.d $s6, $sp, 72 # 8-byte Folded Reload + ld.d $s5, $sp, 80 # 8-byte Folded Reload + ld.d $s4, $sp, 88 # 8-byte Folded Reload + ld.d $s3, $sp, 96 # 8-byte Folded Reload + ld.d $s2, $sp, 104 # 8-byte Folded Reload + ld.d $s1, $sp, 112 # 8-byte Folded Reload + ld.d $s0, $sp, 120 # 8-byte Folded Reload + ld.d $fp, $sp, 128 # 8-byte Folded Reload + ld.d $ra, $sp, 136 # 8-byte Folded Reload + addi.d $sp, $sp, 144 .LBB7_30: # %._crit_edge68 ret .Lfunc_end7: diff --git a/results/MultiSource/Benchmarks/mediabench/jpeg/jpeg-6a/CMakeFiles/cjpeg.dir/jfdctfst.s b/results/MultiSource/Benchmarks/mediabench/jpeg/jpeg-6a/CMakeFiles/cjpeg.dir/jfdctfst.s index 06ae4e82..88726e02 100644 --- a/results/MultiSource/Benchmarks/mediabench/jpeg/jpeg-6a/CMakeFiles/cjpeg.dir/jfdctfst.s +++ b/results/MultiSource/Benchmarks/mediabench/jpeg/jpeg-6a/CMakeFiles/cjpeg.dir/jfdctfst.s @@ -99,12 +99,9 @@ jpeg_fdct_ifast: # @jpeg_fdct_ifast vsub.w $vr3, $vr8, $vr6 vst $vr3, $a0, 128 vadd.w $vr0, $vr0, $vr5 - vshuf4i.w $vr3, $vr0, 16 - vslli.d $vr3, $vr3, 32 - vsrai.d $vr3, $vr3, 32 - vshuf4i.w $vr0, $vr0, 50 - vslli.d $vr0, $vr0, 32 - vsrai.d $vr6, $vr0, 32 + vsllwil.d.w $vr3, $vr0, 0 + vshuf4i.w $vr0, $vr0, 14 + vsllwil.d.w $vr6, $vr0, 0 vrepli.d $vr0, 181 vmul.d $vr6, $vr6, $vr0 vmul.d $vr3, $vr3, $vr0 @@ -119,24 +116,18 @@ jpeg_fdct_ifast: # @jpeg_fdct_ifast vadd.w $vr5, $vr1, $vr2 vadd.w $vr6, $vr2, $vr4 vsub.w $vr1, $vr3, $vr6 - vshuf4i.w $vr2, $vr1, 16 - vslli.d $vr2, $vr2, 32 - vsrai.d $vr2, $vr2, 32 - vshuf4i.w $vr1, $vr1, 50 - vslli.d $vr1, $vr1, 32 - vsrai.d $vr7, $vr1, 32 + vsllwil.d.w $vr2, $vr1, 0 + vshuf4i.w $vr1, $vr1, 14 + vsllwil.d.w $vr7, $vr1, 0 vrepli.d $vr1, 98 vmul.d $vr7, $vr7, $vr1 vmul.d $vr2, $vr2, $vr1 vsrli.d $vr2, $vr2, 8 vsrli.d $vr7, $vr7, 8 vpickev.w $vr7, $vr7, $vr2 - vshuf4i.w $vr2, $vr3, 16 - vslli.d $vr2, $vr2, 32 - vsrai.d $vr8, $vr2, 32 - vshuf4i.w $vr2, $vr3, 50 - vslli.d $vr2, $vr2, 32 - vsrai.d $vr3, $vr2, 32 + vsllwil.d.w $vr8, $vr3, 0 + vshuf4i.w $vr2, $vr3, 14 + vsllwil.d.w $vr3, $vr2, 0 vrepli.d $vr2, 139 vmul.d $vr3, $vr3, $vr2 vmul.d $vr8, $vr8, $vr2 @@ -144,12 +135,9 @@ jpeg_fdct_ifast: # @jpeg_fdct_ifast vsrli.d $vr3, $vr3, 8 vpickev.w $vr3, $vr3, $vr8 vadd.w $vr8, $vr7, $vr3 - vshuf4i.w $vr3, $vr6, 16 - vslli.d $vr3, $vr3, 32 - vsrai.d $vr9, $vr3, 32 - vshuf4i.w $vr3, $vr6, 50 - vslli.d $vr3, $vr3, 32 - vsrai.d $vr6, $vr3, 32 + vsllwil.d.w $vr9, $vr6, 0 + vshuf4i.w $vr3, $vr6, 14 + vsllwil.d.w $vr6, $vr3, 0 vrepli.d $vr3, 334 vmul.d $vr6, $vr6, $vr3 vmul.d $vr9, $vr9, $vr3 @@ -157,12 +145,9 @@ jpeg_fdct_ifast: # @jpeg_fdct_ifast vsrli.d $vr6, $vr6, 8 vpickev.w $vr6, $vr6, $vr9 vadd.w $vr6, $vr7, $vr6 - vshuf4i.w $vr7, $vr5, 16 - vslli.d $vr7, $vr7, 32 - vsrai.d $vr7, $vr7, 32 - vshuf4i.w $vr5, $vr5, 50 - vslli.d $vr5, $vr5, 32 - vsrai.d $vr5, $vr5, 32 + vsllwil.d.w $vr7, $vr5, 0 + vshuf4i.w $vr5, $vr5, 14 + vsllwil.d.w $vr5, $vr5, 0 vmul.d $vr5, $vr5, $vr0 vmul.d $vr7, $vr7, $vr0 vsrli.d $vr7, $vr7, 8 @@ -203,12 +188,9 @@ jpeg_fdct_ifast: # @jpeg_fdct_ifast vsub.w $vr8, $vr12, $vr10 vst $vr8, $a0, 144 vadd.w $vr5, $vr5, $vr9 - vshuf4i.w $vr8, $vr5, 16 - vslli.d $vr8, $vr8, 32 - vsrai.d $vr8, $vr8, 32 - vshuf4i.w $vr5, $vr5, 50 - vslli.d $vr5, $vr5, 32 - vsrai.d $vr5, $vr5, 32 + vsllwil.d.w $vr8, $vr5, 0 + vshuf4i.w $vr5, $vr5, 14 + vsllwil.d.w $vr5, $vr5, 0 vmul.d $vr5, $vr5, $vr0 vmul.d $vr8, $vr8, $vr0 vsrli.d $vr8, $vr8, 8 @@ -222,47 +204,35 @@ jpeg_fdct_ifast: # @jpeg_fdct_ifast vadd.w $vr6, $vr6, $vr7 vadd.w $vr7, $vr7, $vr4 vsub.w $vr8, $vr5, $vr7 - 
vshuf4i.w $vr9, $vr8, 16 - vslli.d $vr9, $vr9, 32 - vsrai.d $vr9, $vr9, 32 - vshuf4i.w $vr8, $vr8, 50 - vslli.d $vr8, $vr8, 32 - vsrai.d $vr8, $vr8, 32 + vsllwil.d.w $vr9, $vr8, 0 + vshuf4i.w $vr8, $vr8, 14 + vsllwil.d.w $vr8, $vr8, 0 vmul.d $vr8, $vr8, $vr1 vmul.d $vr1, $vr9, $vr1 vsrli.d $vr1, $vr1, 8 vsrli.d $vr8, $vr8, 8 vpickev.w $vr1, $vr8, $vr1 - vshuf4i.w $vr8, $vr5, 16 - vslli.d $vr8, $vr8, 32 - vsrai.d $vr8, $vr8, 32 - vshuf4i.w $vr5, $vr5, 50 - vslli.d $vr5, $vr5, 32 - vsrai.d $vr5, $vr5, 32 + vsllwil.d.w $vr8, $vr5, 0 + vshuf4i.w $vr5, $vr5, 14 + vsllwil.d.w $vr5, $vr5, 0 vmul.d $vr5, $vr5, $vr2 vmul.d $vr2, $vr8, $vr2 vsrli.d $vr2, $vr2, 8 vsrli.d $vr5, $vr5, 8 vpickev.w $vr2, $vr5, $vr2 vadd.w $vr2, $vr1, $vr2 - vshuf4i.w $vr5, $vr7, 16 - vslli.d $vr5, $vr5, 32 - vsrai.d $vr5, $vr5, 32 - vshuf4i.w $vr7, $vr7, 50 - vslli.d $vr7, $vr7, 32 - vsrai.d $vr7, $vr7, 32 + vsllwil.d.w $vr5, $vr7, 0 + vshuf4i.w $vr7, $vr7, 14 + vsllwil.d.w $vr7, $vr7, 0 vmul.d $vr7, $vr7, $vr3 vmul.d $vr3, $vr5, $vr3 vsrli.d $vr3, $vr3, 8 vsrli.d $vr5, $vr7, 8 vpickev.w $vr3, $vr5, $vr3 vadd.w $vr1, $vr1, $vr3 - vshuf4i.w $vr3, $vr6, 16 - vslli.d $vr3, $vr3, 32 - vsrai.d $vr3, $vr3, 32 - vshuf4i.w $vr5, $vr6, 50 - vslli.d $vr5, $vr5, 32 - vsrai.d $vr5, $vr5, 32 + vsllwil.d.w $vr3, $vr6, 0 + vshuf4i.w $vr5, $vr6, 14 + vsllwil.d.w $vr5, $vr5, 0 vmul.d $vr5, $vr5, $vr0 vmul.d $vr0, $vr3, $vr0 vsrli.d $vr0, $vr0, 8 diff --git a/results/MultiSource/Benchmarks/mediabench/jpeg/jpeg-6a/CMakeFiles/cjpeg.dir/jfdctint.s b/results/MultiSource/Benchmarks/mediabench/jpeg/jpeg-6a/CMakeFiles/cjpeg.dir/jfdctint.s index 8146d8a1..5fc4a4c9 100644 --- a/results/MultiSource/Benchmarks/mediabench/jpeg/jpeg-6a/CMakeFiles/cjpeg.dir/jfdctint.s +++ b/results/MultiSource/Benchmarks/mediabench/jpeg/jpeg-6a/CMakeFiles/cjpeg.dir/jfdctint.s @@ -127,117 +127,93 @@ jpeg_fdct_islow: # @jpeg_fdct_islow addi.d $t8, $t8, 32 bgez $fp, .LBB0_1 # %bb.2: # %vector.body - vld $vr2, $a0, 0 - vld $vr3, $a0, 224 - vadd.w $vr1, $vr3, $vr2 - vshuf4i.w $vr0, $vr1, 16 - vslli.d $vr0, $vr0, 32 - vsrai.d $vr0, $vr0, 32 - vshuf4i.w $vr1, $vr1, 50 - vslli.d $vr1, $vr1, 32 - vsrai.d $vr1, $vr1, 32 - vsub.w $vr2, $vr2, $vr3 - vshuf4i.w $vr3, $vr2, 50 - vslli.d $vr3, $vr3, 32 - vsrai.d $vr13, $vr3, 32 - vld $vr3, $a0, 32 + vld $vr0, $a0, 0 + vld $vr1, $a0, 224 + vadd.w $vr2, $vr1, $vr0 + vsllwil.d.w $vr3, $vr2, 0 + vshuf4i.w $vr2, $vr2, 14 + vsllwil.d.w $vr2, $vr2, 0 + vsub.w $vr0, $vr0, $vr1 + vld $vr1, $a0, 32 vld $vr4, $a0, 192 - vshuf4i.w $vr2, $vr2, 16 - vslli.d $vr2, $vr2, 32 - vsrai.d $vr14, $vr2, 32 - vadd.w $vr2, $vr4, $vr3 - vshuf4i.w $vr5, $vr2, 16 - vslli.d $vr5, $vr5, 32 - vsrai.d $vr5, $vr5, 32 - vshuf4i.w $vr2, $vr2, 50 - vslli.d $vr2, $vr2, 32 - vsrai.d $vr2, $vr2, 32 - vsub.w $vr3, $vr3, $vr4 - vshuf4i.w $vr4, $vr3, 50 - vslli.d $vr4, $vr4, 32 - vsrai.d $vr15, $vr4, 32 + vshuf4i.w $vr5, $vr0, 14 + vsllwil.d.w $vr13, $vr5, 0 + vsllwil.d.w $vr14, $vr0, 0 + vadd.w $vr0, $vr4, $vr1 + vsllwil.d.w $vr5, $vr0, 0 + vshuf4i.w $vr0, $vr0, 14 + vsllwil.d.w $vr0, $vr0, 0 + vsub.w $vr1, $vr1, $vr4 vld $vr4, $a0, 64 vld $vr6, $a0, 160 - vshuf4i.w $vr3, $vr3, 16 - vslli.d $vr3, $vr3, 32 - vsrai.d $vr16, $vr3, 32 - vadd.w $vr3, $vr6, $vr4 - vshuf4i.w $vr7, $vr3, 16 - vslli.d $vr7, $vr7, 32 - vsrai.d $vr7, $vr7, 32 - vshuf4i.w $vr3, $vr3, 50 - vslli.d $vr3, $vr3, 32 - vsrai.d $vr3, $vr3, 32 + vshuf4i.w $vr7, $vr1, 14 + vsllwil.d.w $vr15, $vr7, 0 + vsllwil.d.w $vr16, $vr1, 0 + vadd.w $vr1, $vr6, $vr4 + vsllwil.d.w $vr7, $vr1, 0 + vshuf4i.w $vr1, $vr1, 14 + 
vsllwil.d.w $vr1, $vr1, 0 vsub.w $vr4, $vr4, $vr6 - vshuf4i.w $vr6, $vr4, 50 - vslli.d $vr6, $vr6, 32 - vsrai.d $vr17, $vr6, 32 vld $vr6, $a0, 96 vld $vr8, $a0, 128 - vshuf4i.w $vr4, $vr4, 16 - vslli.d $vr4, $vr4, 32 - vsrai.d $vr18, $vr4, 32 + vshuf4i.w $vr9, $vr4, 14 + vsllwil.d.w $vr17, $vr9, 0 + vsllwil.d.w $vr18, $vr4, 0 vadd.w $vr4, $vr8, $vr6 - vshuf4i.w $vr9, $vr4, 16 - vslli.d $vr9, $vr9, 32 - vsrai.d $vr9, $vr9, 32 - vshuf4i.w $vr4, $vr4, 50 - vslli.d $vr4, $vr4, 32 - vsrai.d $vr4, $vr4, 32 + vsllwil.d.w $vr9, $vr4, 0 + vshuf4i.w $vr4, $vr4, 14 + vsllwil.d.w $vr4, $vr4, 0 vsub.w $vr6, $vr6, $vr8 - vshuf4i.w $vr8, $vr6, 50 - vslli.d $vr8, $vr8, 32 - vsrai.d $vr19, $vr8, 32 - vshuf4i.w $vr6, $vr6, 16 - vslli.d $vr6, $vr6, 32 - vsrai.d $vr20, $vr6, 32 - vadd.d $vr6, $vr4, $vr1 - vadd.d $vr8, $vr9, $vr0 - vsub.d $vr9, $vr0, $vr9 - vsub.d $vr1, $vr1, $vr4 - vadd.d $vr0, $vr3, $vr2 - vadd.d $vr4, $vr7, $vr5 + vshuf4i.w $vr8, $vr6, 14 + vsllwil.d.w $vr19, $vr8, 0 + vsllwil.d.w $vr20, $vr6, 0 + vadd.d $vr6, $vr4, $vr2 + vadd.d $vr8, $vr9, $vr3 + vsub.d $vr3, $vr3, $vr9 + vsub.d $vr2, $vr2, $vr4 + vadd.d $vr4, $vr1, $vr0 + vadd.d $vr9, $vr7, $vr5 vsub.d $vr5, $vr5, $vr7 - vsub.d $vr2, $vr2, $vr3 - vadd.d $vr3, $vr4, $vr8 - vadd.d $vr7, $vr0, $vr6 + vsub.d $vr1, $vr0, $vr1 + vadd.d $vr0, $vr9, $vr8 + vadd.d $vr7, $vr4, $vr6 vaddi.du $vr7, $vr7, 2 - vaddi.du $vr3, $vr3, 2 - vsrli.d $vr3, $vr3, 2 + vaddi.du $vr0, $vr0, 2 + vsrli.d $vr0, $vr0, 2 vsrli.d $vr7, $vr7, 2 - vpickev.w $vr3, $vr7, $vr3 - vst $vr3, $a0, 0 - vsub.d $vr3, $vr8, $vr4 - vsub.d $vr0, $vr6, $vr0 + vpickev.w $vr0, $vr7, $vr0 + vst $vr0, $a0, 0 + vsub.d $vr0, $vr8, $vr9 + vsub.d $vr4, $vr6, $vr4 + vaddi.du $vr4, $vr4, 2 vaddi.du $vr0, $vr0, 2 - vaddi.du $vr3, $vr3, 2 - vsrli.d $vr3, $vr3, 2 vsrli.d $vr0, $vr0, 2 - vpickev.w $vr0, $vr0, $vr3 + vsrli.d $vr4, $vr4, 2 + vpickev.w $vr0, $vr4, $vr0 vst $vr0, $a0, 128 - vadd.d $vr3, $vr1, $vr2 - vadd.d $vr4, $vr9, $vr5 + vadd.d $vr4, $vr2, $vr1 + vadd.d $vr8, $vr3, $vr5 vreplgr2vr.d $vr7, $t2 vreplgr2vr.d $vr0, $t4 vreplgr2vr.d $vr6, $t3 - vori.b $vr8, $vr0, 0 vori.b $vr10, $vr0, 0 - vmadd.d $vr10, $vr3, $vr7 - vmadd.d $vr8, $vr4, $vr7 - vori.b $vr3, $vr10, 0 - vmadd.d $vr3, $vr1, $vr6 - vori.b $vr1, $vr8, 0 - vmadd.d $vr1, $vr9, $vr6 - vsrli.d $vr1, $vr1, 15 - vsrli.d $vr3, $vr3, 15 - vpickev.w $vr1, $vr3, $vr1 - vst $vr1, $a0, 64 + vori.b $vr11, $vr0, 0 + vmadd.d $vr11, $vr4, $vr7 + vmadd.d $vr10, $vr8, $vr7 + vori.b $vr4, $vr11, 0 + vmadd.d $vr4, $vr2, $vr6 + vori.b $vr2, $vr10, 0 + vmadd.d $vr2, $vr3, $vr6 + vsrli.d $vr2, $vr2, 15 + vsrli.d $vr3, $vr4, 15 + vpickev.w $vr2, $vr3, $vr2 + vst $vr2, $a0, 64 lu32i.d $t1, 32767 vreplgr2vr.d $vr9, $t1 - vmadd.d $vr10, $vr2, $vr9 - vmadd.d $vr8, $vr5, $vr9 - vsrli.d $vr1, $vr8, 15 - vsrli.d $vr2, $vr10, 15 + vmadd.d $vr11, $vr1, $vr9 + vmadd.d $vr10, $vr5, $vr9 + vsrli.d $vr1, $vr10, 15 + vsrli.d $vr2, $vr11, 15 vpickev.w $vr1, $vr2, $vr1 vst $vr1, $a0, 192 vadd.d $vr21, $vr20, $vr14 @@ -315,67 +291,43 @@ jpeg_fdct_islow: # @jpeg_fdct_islow vpickev.w $vr13, $vr13, $vr14 vst $vr13, $a0, 32 vadd.w $vr13, $vr16, $vr15 - vshuf4i.w $vr14, $vr13, 16 - vslli.d $vr14, $vr14, 32 - vsrai.d $vr21, $vr14, 32 - vshuf4i.w $vr13, $vr13, 50 - vslli.d $vr13, $vr13, 32 - vsrai.d $vr22, $vr13, 32 + vsllwil.d.w $vr21, $vr13, 0 + vshuf4i.w $vr13, $vr13, 14 + vsllwil.d.w $vr22, $vr13, 0 vsub.w $vr14, $vr15, $vr16 - vshuf4i.w $vr13, $vr14, 50 - vslli.d $vr13, $vr13, 32 - vsrai.d $vr13, $vr13, 32 vld $vr15, $a0, 48 vld $vr16, $a0, 208 - vshuf4i.w $vr14, 
$vr14, 16 - vslli.d $vr14, $vr14, 32 - vsrai.d $vr14, $vr14, 32 + vshuf4i.w $vr13, $vr14, 14 + vsllwil.d.w $vr13, $vr13, 0 + vsllwil.d.w $vr14, $vr14, 0 vadd.w $vr17, $vr16, $vr15 - vshuf4i.w $vr18, $vr17, 16 - vslli.d $vr18, $vr18, 32 - vsrai.d $vr23, $vr18, 32 - vshuf4i.w $vr17, $vr17, 50 - vslli.d $vr17, $vr17, 32 - vsrai.d $vr24, $vr17, 32 + vsllwil.d.w $vr23, $vr17, 0 + vshuf4i.w $vr17, $vr17, 14 + vsllwil.d.w $vr24, $vr17, 0 vsub.w $vr16, $vr15, $vr16 - vshuf4i.w $vr15, $vr16, 50 - vslli.d $vr15, $vr15, 32 - vsrai.d $vr15, $vr15, 32 vld $vr17, $a0, 80 vld $vr18, $a0, 176 - vshuf4i.w $vr16, $vr16, 16 - vslli.d $vr16, $vr16, 32 - vsrai.d $vr16, $vr16, 32 + vshuf4i.w $vr15, $vr16, 14 + vsllwil.d.w $vr15, $vr15, 0 + vsllwil.d.w $vr16, $vr16, 0 vadd.w $vr19, $vr18, $vr17 - vshuf4i.w $vr20, $vr19, 16 - vslli.d $vr20, $vr20, 32 - vsrai.d $vr25, $vr20, 32 - vshuf4i.w $vr19, $vr19, 50 - vslli.d $vr19, $vr19, 32 - vsrai.d $vr26, $vr19, 32 + vsllwil.d.w $vr25, $vr19, 0 + vshuf4i.w $vr19, $vr19, 14 + vsllwil.d.w $vr26, $vr19, 0 vsub.w $vr18, $vr17, $vr18 - vshuf4i.w $vr17, $vr18, 50 - vslli.d $vr17, $vr17, 32 - vsrai.d $vr17, $vr17, 32 vld $vr19, $a0, 112 vld $vr20, $a0, 144 - vshuf4i.w $vr18, $vr18, 16 - vslli.d $vr18, $vr18, 32 - vsrai.d $vr18, $vr18, 32 + vshuf4i.w $vr17, $vr18, 14 + vsllwil.d.w $vr17, $vr17, 0 + vsllwil.d.w $vr18, $vr18, 0 vadd.w $vr27, $vr20, $vr19 - vshuf4i.w $vr28, $vr27, 16 - vslli.d $vr28, $vr28, 32 - vsrai.d $vr28, $vr28, 32 - vshuf4i.w $vr27, $vr27, 50 - vslli.d $vr27, $vr27, 32 - vsrai.d $vr27, $vr27, 32 + vsllwil.d.w $vr28, $vr27, 0 + vshuf4i.w $vr27, $vr27, 14 + vsllwil.d.w $vr27, $vr27, 0 vsub.w $vr20, $vr19, $vr20 - vshuf4i.w $vr19, $vr20, 50 - vslli.d $vr19, $vr19, 32 - vsrai.d $vr19, $vr19, 32 - vshuf4i.w $vr20, $vr20, 16 - vslli.d $vr20, $vr20, 32 - vsrai.d $vr20, $vr20, 32 + vshuf4i.w $vr19, $vr20, 14 + vsllwil.d.w $vr19, $vr19, 0 + vsllwil.d.w $vr20, $vr20, 0 vadd.d $vr29, $vr27, $vr22 vadd.d $vr30, $vr28, $vr21 vsub.d $vr21, $vr21, $vr28 diff --git a/results/MultiSource/Benchmarks/mediabench/jpeg/jpeg-6a/CMakeFiles/cjpeg.dir/jquant1.s b/results/MultiSource/Benchmarks/mediabench/jpeg/jpeg-6a/CMakeFiles/cjpeg.dir/jquant1.s index 2ae9ab1f..1cbca9a9 100644 --- a/results/MultiSource/Benchmarks/mediabench/jpeg/jpeg-6a/CMakeFiles/cjpeg.dir/jquant1.s +++ b/results/MultiSource/Benchmarks/mediabench/jpeg/jpeg-6a/CMakeFiles/cjpeg.dir/jquant1.s @@ -443,15 +443,15 @@ jinit_1pass_quantizer: # @jinit_1pass_quantizer .type start_pass_1_quant,@function start_pass_1_quant: # @start_pass_1_quant # %bb.0: - addi.d $sp, $sp, -112 - st.d $ra, $sp, 104 # 8-byte Folded Spill - st.d $fp, $sp, 96 # 8-byte Folded Spill - st.d $s0, $sp, 88 # 8-byte Folded Spill - st.d $s1, $sp, 80 # 8-byte Folded Spill - st.d $s2, $sp, 72 # 8-byte Folded Spill - st.d $s3, $sp, 64 # 8-byte Folded Spill - st.d $s4, $sp, 56 # 8-byte Folded Spill - st.d $s5, $sp, 48 # 8-byte Folded Spill + addi.d $sp, $sp, -96 + st.d $ra, $sp, 88 # 8-byte Folded Spill + st.d $fp, $sp, 80 # 8-byte Folded Spill + st.d $s0, $sp, 72 # 8-byte Folded Spill + st.d $s1, $sp, 64 # 8-byte Folded Spill + st.d $s2, $sp, 56 # 8-byte Folded Spill + st.d $s3, $sp, 48 # 8-byte Folded Spill + st.d $s4, $sp, 40 # 8-byte Folded Spill + st.d $s5, $sp, 32 # 8-byte Folded Spill move $fp, $a0 ld.d $s0, $a0, 608 ld.d $a1, $s0, 32 @@ -510,15 +510,15 @@ start_pass_1_quant: # @start_pass_1_quant ori $a2, $zero, 47 st.w $a2, $a0, 40 move $a0, $fp - ld.d $s5, $sp, 48 # 8-byte Folded Reload - ld.d $s4, $sp, 56 # 8-byte Folded Reload - ld.d $s3, $sp, 
64 # 8-byte Folded Reload - ld.d $s2, $sp, 72 # 8-byte Folded Reload - ld.d $s1, $sp, 80 # 8-byte Folded Reload - ld.d $s0, $sp, 88 # 8-byte Folded Reload - ld.d $fp, $sp, 96 # 8-byte Folded Reload - ld.d $ra, $sp, 104 # 8-byte Folded Reload - addi.d $sp, $sp, 112 + ld.d $s5, $sp, 32 # 8-byte Folded Reload + ld.d $s4, $sp, 40 # 8-byte Folded Reload + ld.d $s3, $sp, 48 # 8-byte Folded Reload + ld.d $s2, $sp, 56 # 8-byte Folded Reload + ld.d $s1, $sp, 64 # 8-byte Folded Reload + ld.d $s0, $sp, 72 # 8-byte Folded Reload + ld.d $fp, $sp, 80 # 8-byte Folded Reload + ld.d $ra, $sp, 88 # 8-byte Folded Reload + addi.d $sp, $sp, 96 jr $a1 .LBB1_10: pcalau12i $a0, %pc_hi20(color_quantize) @@ -543,10 +543,8 @@ start_pass_1_quant: # @start_pass_1_quant pcalau12i $a0, %pc_hi20(base_dither_matrix+8) addi.d $s3, $a0, %pc_lo12(base_dither_matrix+8) move $s4, $zero - vrepli.b $vr3, 0 - vrepli.w $vr4, 255 - vst $vr3, $sp, 32 # 16-byte Folded Spill - vst $vr4, $sp, 16 # 16-byte Folded Spill + vrepli.w $vr3, 255 + vst $vr3, $sp, 16 # 16-byte Folded Spill b .LBB1_17 .p2align 4, , 16 .LBB1_15: # in Loop: Header=BB1_17 Depth=1 @@ -587,8 +585,7 @@ start_pass_1_quant: # @start_pass_1_quant ori $a2, $zero, 1024 move $a0, $fp jirl $ra, $a3, 0 - vld $vr4, $sp, 16 # 16-byte Folded Reload - vld $vr3, $sp, 32 # 16-byte Folded Reload + vld $vr3, $sp, 16 # 16-byte Folded Reload move $a1, $zero slli.d $a2, $s5, 9 addi.d $a2, $a2, -512 @@ -600,17 +597,14 @@ start_pass_1_quant: # @start_pass_1_quant # => This Inner Loop Header: Depth=2 ld.w $a3, $a2, -8 vinsgr2vr.w $vr1, $a3, 0 - vilvl.b $vr1, $vr3, $vr1 - vilvl.h $vr1, $vr3, $vr1 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 vslli.w $vr1, $vr1, 1 - vsub.w $vr1, $vr4, $vr1 - vmul.w $vr1, $vr1, $vr4 - vshuf4i.w $vr2, $vr1, 50 - vslli.d $vr2, $vr2, 32 - vsrai.d $vr2, $vr2, 32 - vshuf4i.w $vr1, $vr1, 16 - vslli.d $vr1, $vr1, 32 - vsrai.d $vr1, $vr1, 32 + vsub.w $vr1, $vr3, $vr1 + vmul.w $vr1, $vr1, $vr3 + vshuf4i.w $vr2, $vr1, 14 + vsllwil.d.w $vr2, $vr2, 0 + vsllwil.d.w $vr1, $vr1, 0 vdiv.d $vr1, $vr1, $vr0 vdiv.d $vr2, $vr2, $vr0 vpickev.w $vr1, $vr2, $vr1 @@ -618,51 +612,42 @@ start_pass_1_quant: # @start_pass_1_quant ld.w $a3, $a2, -4 add.d $a4, $a0, $a1 vinsgr2vr.w $vr1, $a3, 0 - vilvl.b $vr1, $vr3, $vr1 - vilvl.h $vr1, $vr3, $vr1 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 vslli.w $vr1, $vr1, 1 - vsub.w $vr1, $vr4, $vr1 - vmul.w $vr1, $vr1, $vr4 - vshuf4i.w $vr2, $vr1, 50 - vslli.d $vr2, $vr2, 32 - vsrai.d $vr2, $vr2, 32 - vshuf4i.w $vr1, $vr1, 16 - vslli.d $vr1, $vr1, 32 - vsrai.d $vr1, $vr1, 32 + vsub.w $vr1, $vr3, $vr1 + vmul.w $vr1, $vr1, $vr3 + vshuf4i.w $vr2, $vr1, 14 + vsllwil.d.w $vr2, $vr2, 0 + vsllwil.d.w $vr1, $vr1, 0 vdiv.d $vr1, $vr1, $vr0 vdiv.d $vr2, $vr2, $vr0 vpickev.w $vr1, $vr2, $vr1 vst $vr1, $a4, 16 ld.w $a3, $a2, 0 vinsgr2vr.w $vr1, $a3, 0 - vilvl.b $vr1, $vr3, $vr1 - vilvl.h $vr1, $vr3, $vr1 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 vslli.w $vr1, $vr1, 1 - vsub.w $vr1, $vr4, $vr1 - vmul.w $vr1, $vr1, $vr4 - vshuf4i.w $vr2, $vr1, 50 - vslli.d $vr2, $vr2, 32 - vsrai.d $vr2, $vr2, 32 - vshuf4i.w $vr1, $vr1, 16 - vslli.d $vr1, $vr1, 32 - vsrai.d $vr1, $vr1, 32 + vsub.w $vr1, $vr3, $vr1 + vmul.w $vr1, $vr1, $vr3 + vshuf4i.w $vr2, $vr1, 14 + vsllwil.d.w $vr2, $vr2, 0 + vsllwil.d.w $vr1, $vr1, 0 vdiv.d $vr1, $vr1, $vr0 vdiv.d $vr2, $vr2, $vr0 vpickev.w $vr1, $vr2, $vr1 vst $vr1, $a4, 32 ld.w $a3, $a2, 4 vinsgr2vr.w $vr1, $a3, 0 - vilvl.b $vr1, $vr3, $vr1 - vilvl.h $vr1, $vr3, $vr1 + vsllwil.hu.bu $vr1, $vr1, 
0 + vsllwil.wu.hu $vr1, $vr1, 0 vslli.w $vr1, $vr1, 1 - vsub.w $vr1, $vr4, $vr1 - vmul.w $vr1, $vr1, $vr4 - vshuf4i.w $vr2, $vr1, 50 - vslli.d $vr2, $vr2, 32 - vsrai.d $vr2, $vr2, 32 - vshuf4i.w $vr1, $vr1, 16 - vslli.d $vr1, $vr1, 32 - vsrai.d $vr1, $vr1, 32 + vsub.w $vr1, $vr3, $vr1 + vmul.w $vr1, $vr1, $vr3 + vshuf4i.w $vr2, $vr1, 14 + vsllwil.d.w $vr2, $vr2, 0 + vsllwil.d.w $vr1, $vr1, 0 vdiv.d $vr1, $vr1, $vr0 vdiv.d $vr2, $vr2, $vr0 vpickev.w $vr1, $vr2, $vr1 @@ -717,15 +702,15 @@ start_pass_1_quant: # @start_pass_1_quant addi.d $s1, $s1, 8 blt $s2, $a0, .LBB1_28 .LBB1_29: # %create_odither_tables.exit - ld.d $s5, $sp, 48 # 8-byte Folded Reload - ld.d $s4, $sp, 56 # 8-byte Folded Reload - ld.d $s3, $sp, 64 # 8-byte Folded Reload - ld.d $s2, $sp, 72 # 8-byte Folded Reload - ld.d $s1, $sp, 80 # 8-byte Folded Reload - ld.d $s0, $sp, 88 # 8-byte Folded Reload - ld.d $fp, $sp, 96 # 8-byte Folded Reload - ld.d $ra, $sp, 104 # 8-byte Folded Reload - addi.d $sp, $sp, 112 + ld.d $s5, $sp, 32 # 8-byte Folded Reload + ld.d $s4, $sp, 40 # 8-byte Folded Reload + ld.d $s3, $sp, 48 # 8-byte Folded Reload + ld.d $s2, $sp, 56 # 8-byte Folded Reload + ld.d $s1, $sp, 64 # 8-byte Folded Reload + ld.d $s0, $sp, 72 # 8-byte Folded Reload + ld.d $fp, $sp, 80 # 8-byte Folded Reload + ld.d $ra, $sp, 88 # 8-byte Folded Reload + addi.d $sp, $sp, 96 ret .Lfunc_end1: .size start_pass_1_quant, .Lfunc_end1-start_pass_1_quant @@ -1011,12 +996,12 @@ color_quantize: # @color_quantize ld.h $t5, $t5, 2 vinsgr2vr.h $vr3, $t6, 0 vinsgr2vr.h $vr4, $t5, 0 - vilvl.b $vr3, $vr0, $vr3 - vilvl.h $vr3, $vr0, $vr3 - vilvl.w $vr3, $vr0, $vr3 - vilvl.b $vr4, $vr0, $vr4 - vilvl.h $vr4, $vr0, $vr4 - vilvl.w $vr4, $vr0, $vr4 + vsllwil.hu.bu $vr3, $vr3, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vsllwil.du.wu $vr3, $vr3, 0 + vsllwil.hu.bu $vr4, $vr4, 0 + vsllwil.wu.hu $vr4, $vr4, 0 + vsllwil.du.wu $vr4, $vr4, 0 ld.d $t5, $t4, -16 vpickve2gr.d $t6, $vr3, 0 ld.d $t7, $t4, -8 @@ -1033,10 +1018,10 @@ color_quantize: # @color_quantize vinsgr2vr.b $vr3, $t6, 1 vinsgr2vr.b $vr4, $t7, 0 vinsgr2vr.b $vr4, $t8, 1 - vilvl.b $vr3, $vr0, $vr3 - vilvl.h $vr3, $vr0, $vr3 - vilvl.b $vr4, $vr0, $vr4 - vilvl.h $vr4, $vr0, $vr4 + vsllwil.hu.bu $vr3, $vr3, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vsllwil.hu.bu $vr4, $vr4, 0 + vsllwil.wu.hu $vr4, $vr4, 0 vadd.w $vr1, $vr1, $vr3 vadd.w $vr2, $vr2, $vr4 addi.d $t3, $t3, 4 diff --git a/results/MultiSource/Benchmarks/mediabench/jpeg/jpeg-6a/CMakeFiles/cjpeg.dir/jquant2.s b/results/MultiSource/Benchmarks/mediabench/jpeg/jpeg-6a/CMakeFiles/cjpeg.dir/jquant2.s index 5c8380d1..251a91c3 100644 --- a/results/MultiSource/Benchmarks/mediabench/jpeg/jpeg-6a/CMakeFiles/cjpeg.dir/jquant2.s +++ b/results/MultiSource/Benchmarks/mediabench/jpeg/jpeg-6a/CMakeFiles/cjpeg.dir/jquant2.s @@ -886,20 +886,20 @@ finish_pass1: # @finish_pass1 ld.d $s0, $sp, 40 # 8-byte Folded Reload ld.d $a0, $s0, 152 ld.d $a0, $a0, 0 - srai.d $a1, $ra, 1 + srai.d $a1, $s7, 1 add.d $a3, $s1, $a1 - div.d $a3, $a3, $ra + div.d $a3, $a3, $s7 ld.d $t0, $sp, 48 # 8-byte Folded Reload stx.b $a3, $a0, $t0 ld.d $a0, $s0, 152 ld.d $a0, $a0, 8 add.d $a3, $s2, $a1 - div.d $a3, $a3, $ra + div.d $a3, $a3, $s7 stx.b $a3, $a0, $t0 ld.d $a0, $s0, 152 ld.d $a0, $a0, 16 - add.d $a1, $s7, $a1 - div.d $a1, $a1, $ra + add.d $a1, $s6, $a1 + div.d $a1, $a1, $s7 stx.b $a1, $a0, $t0 addi.d $t0, $t0, 1 ld.d $fp, $sp, 32 # 8-byte Folded Reload @@ -925,10 +925,10 @@ finish_pass1: # @finish_pass1 # in Loop: Header=BB5_20 Depth=1 st.d $t0, $sp, 48 # 8-byte Folded Spill ld.d $t0, $s0, 608 - 
move $s7, $zero + move $s6, $zero move $s2, $zero move $s1, $zero - move $ra, $zero + move $s7, $zero ld.w $t1, $a5, 4 ld.d $a5, $t0, 48 ldx.w $a6, $fp, $a6 @@ -983,9 +983,9 @@ finish_pass1: # @finish_pass1 # Child Loop BB5_29 Depth 4 # Child Loop BB5_33 Depth 4 slli.d $a1, $s5, 6 - add.d $s6, $t8, $a1 + add.d $s8, $t8, $a1 slli.d $a1, $s5, 2 - addi.w $s8, $a1, 2 + addi.w $ra, $a1, 2 bgeu $t1, $a2, .LBB5_28 # %bb.27: # in Loop: Header=BB5_26 Depth=3 move $s0, $a4 @@ -993,16 +993,16 @@ finish_pass1: # @finish_pass1 .p2align 4, , 16 .LBB5_28: # %vector.ph # in Loop: Header=BB5_26 Depth=3 - add.d $s6, $s6, $t5 + add.d $s8, $s8, $t5 vori.b $vr7, $vr1, 0 - vinsgr2vr.d $vr7, $s7, 0 + vinsgr2vr.d $vr7, $s6, 0 vori.b $vr6, $vr1, 0 vinsgr2vr.d $vr6, $s2, 0 vori.b $vr5, $vr1, 0 vinsgr2vr.d $vr5, $s1, 0 vori.b $vr4, $vr1, 0 - vinsgr2vr.d $vr4, $ra, 0 - vreplgr2vr.d $vr9, $s8 + vinsgr2vr.d $vr4, $s7, 0 + vreplgr2vr.d $vr9, $ra move $s1, $t3 move $s2, $s4 vori.b $vr12, $vr1, 0 @@ -1021,19 +1021,15 @@ finish_pass1: # @finish_pass1 vinsgr2vr.w $vr14, $a1, 0 vinsgr2vr.w $vr15, $fp, 0 vseqi.h $vr16, $vr14, 0 - vilvl.h $vr16, $vr16, $vr16 - vilvl.w $vr16, $vr16, $vr16 - vslli.d $vr16, $vr16, 48 - vsrai.d $vr16, $vr16, 48 + vsllwil.w.h $vr16, $vr16, 0 + vsllwil.d.w $vr16, $vr16, 0 vseqi.h $vr17, $vr15, 0 - vilvl.h $vr17, $vr17, $vr17 - vilvl.w $vr17, $vr17, $vr17 - vslli.d $vr17, $vr17, 48 - vsrai.d $vr17, $vr17, 48 - vilvl.h $vr14, $vr1, $vr14 - vilvl.w $vr14, $vr1, $vr14 - vilvl.h $vr15, $vr1, $vr15 - vilvl.w $vr15, $vr1, $vr15 + vsllwil.w.h $vr17, $vr17, 0 + vsllwil.d.w $vr17, $vr17, 0 + vsllwil.wu.hu $vr14, $vr14, 0 + vsllwil.du.wu $vr14, $vr14, 0 + vsllwil.wu.hu $vr15, $vr15, 0 + vsllwil.du.wu $vr15, $vr15, 0 vadd.d $vr4, $vr4, $vr14 vadd.d $vr8, $vr8, $vr15 vmul.d $vr18, $vr3, $vr14 @@ -1043,12 +1039,8 @@ finish_pass1: # @finish_pass1 vslli.w $vr22, $vr13, 3 vbitseti.w $vr23, $vr22, 2 vaddi.wu $vr22, $vr22, 20 - vshuf4i.w $vr23, $vr23, 16 - vslli.d $vr23, $vr23, 32 - vsrai.d $vr23, $vr23, 32 - vshuf4i.w $vr22, $vr22, 16 - vslli.d $vr22, $vr22, 32 - vsrai.d $vr22, $vr22, 32 + vsllwil.d.w $vr23, $vr23, 0 + vsllwil.d.w $vr22, $vr22, 0 vmul.d $vr14, $vr14, $vr23 vmul.d $vr15, $vr15, $vr22 vbitsel.v $vr18, $vr18, $vr1, $vr16 @@ -1071,7 +1063,7 @@ finish_pass1: # @finish_pass1 # in Loop: Header=BB5_26 Depth=3 vadd.d $vr7, $vr12, $vr7 vhaddw.q.d $vr7, $vr7, $vr7 - vpickve2gr.d $s7, $vr7, 0 + vpickve2gr.d $s6, $vr7, 0 vadd.d $vr6, $vr11, $vr6 vhaddw.q.d $vr6, $vr6, $vr6 vpickve2gr.d $s2, $vr6, 0 @@ -1080,7 +1072,7 @@ finish_pass1: # @finish_pass1 vpickve2gr.d $s1, $vr5, 0 vadd.d $vr4, $vr8, $vr4 vhaddw.q.d $vr4, $vr4, $vr4 - vpickve2gr.d $ra, $vr4, 0 + vpickve2gr.d $s7, $vr4, 0 move $s0, $t4 beq $t2, $t3, .LBB5_25 .LBB5_31: # %scalar.ph.preheader @@ -1091,7 +1083,7 @@ finish_pass1: # @finish_pass1 b .LBB5_33 .p2align 4, , 16 .LBB5_32: # in Loop: Header=BB5_33 Depth=4 - addi.d $s6, $s6, 2 + addi.d $s8, $s8, 2 addi.w $a1, $a1, -1 addi.w $s0, $s0, 8 beqz $a1, .LBB5_25 @@ -1100,16 +1092,16 @@ finish_pass1: # @finish_pass1 # Parent Loop BB5_24 Depth=2 # Parent Loop BB5_26 Depth=3 # => This Inner Loop Header: Depth=4 - ld.hu $fp, $s6, 0 + ld.hu $fp, $s8, 0 beqz $fp, .LBB5_32 # %bb.34: # in Loop: Header=BB5_33 Depth=4 - add.d $ra, $ra, $fp + add.d $s7, $s7, $fp mul.d $a0, $fp, $s3 add.d $s1, $a0, $s1 - mul.d $a0, $fp, $s8 + mul.d $a0, $fp, $ra add.d $s2, $a0, $s2 mul.d $a0, $fp, $s0 - add.d $s7, $a0, $s7 + add.d $s6, $a0, $s6 b .LBB5_32 .p2align 4, , 16 .LBB5_35: # %.preheader.us.i.i @@ -2018,7 +2010,7 @@ 
fill_inverse_cmap: # @fill_inverse_cmap fst.d $fs5, $sp, 1896 # 8-byte Folded Spill fst.d $fs6, $sp, 1888 # 8-byte Folded Spill fst.d $fs7, $sp, 1880 # 8-byte Folded Spill - addi.d $sp, $sp, -736 + addi.d $sp, $sp, -608 ld.d $a4, $a0, 608 move $a5, $a1 ld.d $a1, $a4, 48 @@ -2079,6 +2071,14 @@ fill_inverse_cmap: # @fill_inverse_cmap .LBB10_4: # %find_nearby_colors.exit ori $a1, $s2, 4095 vreplgr2vr.d $vr0, $a1 + vst $vr0, $sp, 432 + vst $vr0, $sp, 448 + vst $vr0, $sp, 464 + vst $vr0, $sp, 480 + vst $vr0, $sp, 496 + vst $vr0, $sp, 512 + vst $vr0, $sp, 528 + vst $vr0, $sp, 544 vst $vr0, $sp, 560 vst $vr0, $sp, 576 vst $vr0, $sp, 592 @@ -2135,14 +2135,6 @@ fill_inverse_cmap: # @fill_inverse_cmap vst $vr0, $sp, 1408 vst $vr0, $sp, 1424 vst $vr0, $sp, 1440 - vst $vr0, $sp, 1456 - vst $vr0, $sp, 1472 - vst $vr0, $sp, 1488 - vst $vr0, $sp, 1504 - vst $vr0, $sp, 1520 - vst $vr0, $sp, 1536 - vst $vr0, $sp, 1552 - vst $vr0, $sp, 1568 blez $t2, .LBB10_73 # %bb.5: # %.lr.ph.i37 ld.d $a0, $a0, 152 @@ -2157,7 +2149,7 @@ fill_inverse_cmap: # @fill_inverse_cmap beq $t0, $t2, .LBB10_73 .LBB10_7: # =>This Loop Header: Depth=1 # Child Loop BB10_9 Depth 2 - addi.d $a2, $sp, 304 + addi.d $a2, $sp, 176 ldx.bu $t8, $t0, $a2 ld.d $a2, $a0, 0 ldx.bu $a2, $a2, $t8 @@ -2189,8 +2181,8 @@ fill_inverse_cmap: # @fill_inverse_cmap addi.d $s8, $a2, 1296 addi.d $ra, $a2, 1584 addi.d $a4, $a2, 1872 - addi.d $t5, $sp, 191 - addi.d $t4, $sp, 688 + addi.d $t5, $sp, 63 + addi.d $t4, $sp, 560 move $t7, $a1 b .LBB10_9 .p2align 4, , 16 @@ -2541,7 +2533,7 @@ fill_inverse_cmap: # @fill_inverse_cmap andi $a2, $a2, 28 ld.d $a3, $sp, 32 # 8-byte Folded Reload alsl.d $a2, $a2, $a3, 3 - addi.d $a3, $sp, 176 + addi.d $a3, $sp, 48 slli.d $a4, $a4, 6 ori $a5, $zero, 32 .p2align 4, , 16 @@ -2650,7 +2642,7 @@ fill_inverse_cmap: # @fill_inverse_cmap addi.d $a3, $a3, 32 bne $a0, $a5, .LBB10_74 # %bb.75: - addi.d $sp, $sp, 736 + addi.d $sp, $sp, 608 fld.d $fs7, $sp, 1880 # 8-byte Folded Reload fld.d $fs6, $sp, 1888 # 8-byte Folded Reload fld.d $fs5, $sp, 1896 # 8-byte Folded Reload @@ -2675,328 +2667,250 @@ fill_inverse_cmap: # @fill_inverse_cmap .LBB10_76: # %vector.ph bstrpick.d $a1, $t1, 30, 2 slli.d $s1, $a1, 2 - vinsgr2vr.w $vr18, $a5, 0 - vinsgr2vr.w $vr18, $a5, 1 + vinsgr2vr.w $vr0, $a5, 0 + vinsgr2vr.w $vr0, $a5, 1 vinsgr2vr.w $vr1, $t2, 0 vinsgr2vr.w $vr1, $t2, 1 - vinsgr2vr.w $vr0, $t3, 0 - vinsgr2vr.w $vr0, $t3, 1 - vst $vr0, $sp, 80 # 16-byte Folded Spill + vinsgr2vr.w $vr2, $t3, 0 + vinsgr2vr.w $vr2, $t3, 1 vinsgr2vr.w $vr3, $a6, 0 vinsgr2vr.w $vr3, $a6, 1 vinsgr2vr.w $vr4, $t4, 0 vinsgr2vr.w $vr4, $t4, 1 - vinsgr2vr.w $vr0, $t5, 0 - vinsgr2vr.w $vr0, $t5, 1 - vst $vr0, $sp, 64 # 16-byte Folded Spill + vinsgr2vr.w $vr5, $t5, 0 + vinsgr2vr.w $vr5, $t5, 1 vinsgr2vr.w $vr6, $a7, 0 vinsgr2vr.w $vr6, $a7, 1 vinsgr2vr.w $vr7, $t6, 0 vinsgr2vr.w $vr7, $t6, 1 - vinsgr2vr.w $vr0, $t7, 0 - vinsgr2vr.w $vr0, $t7, 1 - vst $vr0, $sp, 48 # 16-byte Folded Spill - move $a1, $s2 - addi.d $s2, $t8, 2 - addi.d $s3, $sp, 576 - addi.d $s4, $fp, 2 - addi.d $s5, $s0, 2 - ori $a1, $a1, 4095 - vreplgr2vr.d $vr9, $a1 - vrepli.b $vr10, 0 - vrepli.w $vr11, 3 - move $s6, $s1 - vori.b $vr12, $vr9, 0 + vinsgr2vr.w $vr8, $t7, 0 + vinsgr2vr.w $vr8, $t7, 1 + addi.d $a1, $t8, 2 + addi.d $a2, $sp, 448 + addi.d $a3, $fp, 2 + addi.d $a4, $s0, 2 + ori $s2, $s2, 4095 + vreplgr2vr.d $vr9, $s2 + vrepli.w $vr10, 3 + move $s2, $s1 + vori.b $vr11, $vr9, 0 .p2align 4, , 16 .LBB10_77: # %vector.body # =>This Inner Loop Header: Depth=1 - ld.h $a1, $s2, -2 - ld.h $a2, $s2, 0 - 
vinsgr2vr.h $vr0, $a1, 0 - vinsgr2vr.h $vr2, $a2, 0 - vilvl.b $vr0, $vr10, $vr0 - vilvl.h $vr14, $vr10, $vr0 - vilvl.b $vr0, $vr10, $vr2 - vilvl.h $vr13, $vr10, $vr0 - vsle.wu $vr16, $vr18, $vr14 - vshuf4i.w $vr0, $vr16, 16 - vslli.d $vr0, $vr0, 32 - vsrai.d $vr2, $vr0, 32 - vsle.wu $vr15, $vr18, $vr13 - vshuf4i.w $vr0, $vr15, 16 - vslli.d $vr0, $vr0, 32 - vsrai.d $vr0, $vr0, 32 - vslt.wu $vr5, $vr1, $vr14 - vslt.wu $vr8, $vr1, $vr13 - vand.v $vr5, $vr16, $vr5 - vst $vr5, $sp, 144 # 16-byte Folded Spill - vshuf4i.w $vr5, $vr5, 16 - vslli.d $vr5, $vr5, 32 - vsrai.d $vr5, $vr5, 32 - vand.v $vr8, $vr15, $vr8 - vst $vr8, $sp, 160 # 16-byte Folded Spill - vshuf4i.w $vr8, $vr8, 16 - vslli.d $vr8, $vr8, 32 - vsrai.d $vr8, $vr8, 32 - vsub.w $vr19, $vr14, $vr1 - vsub.w $vr20, $vr13, $vr1 - vslli.w $vr19, $vr19, 1 - vslli.w $vr20, $vr20, 1 - vilvl.w $vr19, $vr10, $vr19 - vilvl.w $vr20, $vr10, $vr20 - vmul.d $vr19, $vr19, $vr19 - vmul.d $vr20, $vr20, $vr20 - vsub.w $vr21, $vr14, $vr18 - vsub.w $vr23, $vr13, $vr18 - vslli.w $vr21, $vr21, 1 - vslli.w $vr23, $vr23, 1 - vshuf4i.w $vr21, $vr21, 16 - vslli.d $vr21, $vr21, 32 - vsrai.d $vr21, $vr21, 32 - vshuf4i.w $vr23, $vr23, 16 - vslli.d $vr23, $vr23, 32 - vsrai.d $vr23, $vr23, 32 - vmul.d $vr21, $vr21, $vr21 - vmul.d $vr23, $vr23, $vr23 - vand.v $vr5, $vr5, $vr19 - vbitsel.v $vr2, $vr21, $vr5, $vr2 - vst $vr2, $sp, 128 # 16-byte Folded Spill - ld.h $a1, $s4, -2 - ld.h $a2, $s4, 0 - vand.v $vr2, $vr8, $vr20 - vbitsel.v $vr0, $vr23, $vr2, $vr0 - vst $vr0, $sp, 112 # 16-byte Folded Spill - vinsgr2vr.h $vr0, $a1, 0 - vinsgr2vr.h $vr2, $a2, 0 - vilvl.b $vr0, $vr10, $vr0 - vilvl.h $vr21, $vr10, $vr0 - vilvl.b $vr0, $vr10, $vr2 - vilvl.h $vr20, $vr10, $vr0 - vsle.wu $vr24, $vr3, $vr21 - vshuf4i.w $vr0, $vr24, 16 - vslli.d $vr0, $vr0, 32 - vsrai.d $vr5, $vr0, 32 - vsle.wu $vr23, $vr3, $vr20 - vshuf4i.w $vr0, $vr23, 16 - vslli.d $vr0, $vr0, 32 - vsrai.d $vr0, $vr0, 32 - vslt.wu $vr2, $vr4, $vr21 - vslt.wu $vr8, $vr4, $vr20 - vand.v $vr26, $vr24, $vr2 - vshuf4i.w $vr2, $vr26, 16 - vslli.d $vr2, $vr2, 32 - vsrai.d $vr19, $vr2, 32 - vand.v $vr25, $vr23, $vr8 - vshuf4i.w $vr2, $vr25, 16 - vslli.d $vr2, $vr2, 32 - vsrai.d $vr2, $vr2, 32 - vsub.w $vr8, $vr21, $vr4 - vsub.w $vr27, $vr20, $vr4 - vmul.w $vr8, $vr8, $vr11 - vmul.w $vr27, $vr27, $vr11 - vilvl.w $vr8, $vr10, $vr8 - vilvl.w $vr27, $vr10, $vr27 - vmul.d $vr8, $vr8, $vr8 + ld.h $s3, $a1, -2 + ld.h $s4, $a1, 0 + vinsgr2vr.h $vr12, $s3, 0 + vinsgr2vr.h $vr13, $s4, 0 + vsllwil.hu.bu $vr12, $vr12, 0 + vsllwil.wu.hu $vr12, $vr12, 0 + vsllwil.hu.bu $vr13, $vr13, 0 + vsllwil.wu.hu $vr13, $vr13, 0 + vsle.wu $vr14, $vr0, $vr12 + vsllwil.d.w $vr15, $vr14, 0 + vsle.wu $vr16, $vr0, $vr13 + vsllwil.d.w $vr17, $vr16, 0 + vslt.wu $vr18, $vr1, $vr12 + vslt.wu $vr19, $vr1, $vr13 + vslt.wu $vr20, $vr2, $vr12 + vslt.wu $vr21, $vr2, $vr13 + vand.v $vr18, $vr14, $vr18 + vsllwil.d.w $vr22, $vr18, 0 + vand.v $vr19, $vr16, $vr19 + vsllwil.d.w $vr23, $vr19, 0 + vsub.w $vr24, $vr12, $vr1 + vsub.w $vr25, $vr13, $vr1 + vslli.w $vr24, $vr24, 1 + vslli.w $vr25, $vr25, 1 + vsllwil.du.wu $vr24, $vr24, 0 + vsllwil.du.wu $vr25, $vr25, 0 + vmul.d $vr24, $vr24, $vr24 + vmul.d $vr25, $vr25, $vr25 + vsub.w $vr26, $vr12, $vr0 + vsub.w $vr27, $vr13, $vr0 + vslli.w $vr26, $vr26, 1 + vslli.w $vr27, $vr27, 1 + vsllwil.d.w $vr26, $vr26, 0 + vsllwil.d.w $vr27, $vr27, 0 + vmul.d $vr26, $vr26, $vr26 vmul.d $vr27, $vr27, $vr27 - vsub.w $vr28, $vr21, $vr3 - vsub.w $vr29, $vr20, $vr3 - vmul.w $vr28, $vr28, $vr11 - vmul.w $vr29, $vr29, $vr11 - vshuf4i.w $vr28, 
$vr28, 16 - vslli.d $vr28, $vr28, 32 - vsrai.d $vr28, $vr28, 32 - vshuf4i.w $vr29, $vr29, 16 - vslli.d $vr29, $vr29, 32 - vsrai.d $vr29, $vr29, 32 + vor.v $vr18, $vr18, $vr20 + vand.v $vr14, $vr14, $vr18 + vpickve2gr.d $s3, $vr14, 0 + vinsgr2vr.w $vr18, $s3, 0 + vsllwil.d.w $vr14, $vr14, 0 + vpickve2gr.d $s3, $vr14, 1 + vinsgr2vr.w $vr18, $s3, 1 + vslli.w $vr14, $vr18, 31 + vsrai.w $vr14, $vr14, 31 + vbitsel.v $vr18, $vr1, $vr0, $vr14 + vor.v $vr14, $vr19, $vr21 + vand.v $vr14, $vr16, $vr14 + vpickve2gr.d $s3, $vr14, 0 + vinsgr2vr.w $vr16, $s3, 0 + vsllwil.d.w $vr14, $vr14, 0 + vpickve2gr.d $s3, $vr14, 1 + vinsgr2vr.w $vr16, $s3, 1 + vslli.w $vr14, $vr16, 31 + vsrai.w $vr14, $vr14, 31 + vbitsel.v $vr16, $vr1, $vr0, $vr14 + vand.v $vr14, $vr22, $vr24 + vbitsel.v $vr14, $vr26, $vr14, $vr15 + vand.v $vr15, $vr23, $vr25 + vbitsel.v $vr15, $vr27, $vr15, $vr17 + ld.h $s3, $a3, -2 + vsub.w $vr12, $vr12, $vr18 + vsub.w $vr13, $vr13, $vr16 + vslli.w $vr12, $vr12, 1 + vinsgr2vr.h $vr16, $s3, 0 + ld.h $s3, $a3, 0 + vslli.w $vr13, $vr13, 1 + vsllwil.d.w $vr12, $vr12, 0 + vsllwil.d.w $vr13, $vr13, 0 + vinsgr2vr.h $vr17, $s3, 0 + vsllwil.hu.bu $vr16, $vr16, 0 + vsllwil.wu.hu $vr16, $vr16, 0 + vsllwil.hu.bu $vr17, $vr17, 0 + vsllwil.wu.hu $vr17, $vr17, 0 + vsle.wu $vr18, $vr3, $vr16 + vsllwil.d.w $vr19, $vr18, 0 + vsle.wu $vr20, $vr3, $vr17 + vsllwil.d.w $vr21, $vr20, 0 + vslt.wu $vr22, $vr4, $vr16 + vslt.wu $vr23, $vr4, $vr17 + vslt.wu $vr24, $vr5, $vr16 + vslt.wu $vr25, $vr5, $vr17 + vand.v $vr22, $vr18, $vr22 + vsllwil.d.w $vr26, $vr22, 0 + vand.v $vr23, $vr20, $vr23 + vsllwil.d.w $vr27, $vr23, 0 + vsub.w $vr28, $vr16, $vr4 + vsub.w $vr29, $vr17, $vr4 + vmul.w $vr28, $vr28, $vr10 + vmul.w $vr29, $vr29, $vr10 + vsllwil.du.wu $vr28, $vr28, 0 + vsllwil.du.wu $vr29, $vr29, 0 vmul.d $vr28, $vr28, $vr28 vmul.d $vr29, $vr29, $vr29 - vand.v $vr8, $vr19, $vr8 - vbitsel.v $vr5, $vr28, $vr8, $vr5 - vst $vr5, $sp, 96 # 16-byte Folded Spill - ld.h $a1, $s5, -2 - ld.h $a2, $s5, 0 - vand.v $vr2, $vr2, $vr27 - vbitsel.v $vr8, $vr29, $vr2, $vr0 - vinsgr2vr.h $vr0, $a1, 0 - vinsgr2vr.h $vr2, $a2, 0 - vilvl.b $vr0, $vr10, $vr0 - vilvl.h $vr28, $vr10, $vr0 - vilvl.b $vr0, $vr10, $vr2 - vilvl.h $vr27, $vr10, $vr0 - vsle.wu $vr30, $vr6, $vr28 - vshuf4i.w $vr0, $vr30, 16 - vslli.d $vr0, $vr0, 32 - vsrai.d $vr0, $vr0, 32 - vsle.wu $vr29, $vr6, $vr27 - vshuf4i.w $vr2, $vr29, 16 - vslli.d $vr2, $vr2, 32 - vsrai.d $vr19, $vr2, 32 - vslt.wu $vr2, $vr7, $vr28 - vslt.wu $vr22, $vr7, $vr27 - vand.v $vr5, $vr30, $vr2 - vshuf4i.w $vr2, $vr5, 16 - vslli.d $vr2, $vr2, 32 - vsrai.d $vr17, $vr2, 32 - vand.v $vr2, $vr29, $vr22 - vsub.w $vr22, $vr28, $vr7 - vilvl.w $vr22, $vr10, $vr22 - vmul.d $vr22, $vr22, $vr22 - vand.v $vr17, $vr17, $vr22 - vsub.w $vr22, $vr28, $vr6 - vshuf4i.w $vr22, $vr22, 16 - vslli.d $vr22, $vr22, 32 - vsrai.d $vr22, $vr22, 32 - vmul.d $vr22, $vr22, $vr22 - vbitsel.v $vr0, $vr22, $vr17, $vr0 - vshuf4i.w $vr17, $vr2, 16 - vslli.d $vr17, $vr17, 32 - vsrai.d $vr17, $vr17, 32 - vsub.w $vr22, $vr27, $vr7 - vilvl.w $vr22, $vr10, $vr22 - vmul.d $vr22, $vr22, $vr22 - vand.v $vr17, $vr17, $vr22 - vsub.w $vr22, $vr27, $vr6 - vshuf4i.w $vr22, $vr22, 16 - vslli.d $vr22, $vr22, 32 - vsrai.d $vr22, $vr22, 32 - vmul.d $vr22, $vr22, $vr22 - vbitsel.v $vr19, $vr22, $vr17, $vr19 - vld $vr22, $sp, 80 # 16-byte Folded Reload - vslt.wu $vr17, $vr22, $vr14 - vld $vr31, $sp, 144 # 16-byte Folded Reload - vor.v $vr17, $vr31, $vr17 - vand.v $vr16, $vr16, $vr17 - vshuf4i.w $vr16, $vr16, 16 - vslli.d $vr16, $vr16, 32 - vsrai.d $vr16, $vr16, 
32 - vpickve2gr.d $a1, $vr16, 0 - vpickve2gr.d $a2, $vr16, 1 - vslt.wu $vr16, $vr22, $vr13 - vld $vr17, $sp, 160 # 16-byte Folded Reload - vor.v $vr16, $vr17, $vr16 - vand.v $vr15, $vr15, $vr16 - vshuf4i.w $vr15, $vr15, 16 - vslli.d $vr15, $vr15, 32 - vsrai.d $vr15, $vr15, 32 - vpickve2gr.d $a3, $vr15, 0 - vpickve2gr.d $a4, $vr15, 1 - vinsgr2vr.w $vr15, $a1, 0 - vinsgr2vr.w $vr15, $a2, 1 - vslli.w $vr15, $vr15, 31 - vsrai.w $vr15, $vr15, 31 - vbitsel.v $vr15, $vr1, $vr18, $vr15 - vsub.w $vr14, $vr14, $vr15 - vinsgr2vr.w $vr15, $a3, 0 - vinsgr2vr.w $vr15, $a4, 1 - vslli.w $vr15, $vr15, 31 - vsrai.w $vr15, $vr15, 31 - vbitsel.v $vr15, $vr1, $vr18, $vr15 - vsub.w $vr13, $vr13, $vr15 - vld $vr16, $sp, 64 # 16-byte Folded Reload - vslt.wu $vr15, $vr16, $vr21 - vor.v $vr15, $vr26, $vr15 - vand.v $vr15, $vr24, $vr15 - vshuf4i.w $vr15, $vr15, 16 - vslli.d $vr15, $vr15, 32 - vsrai.d $vr15, $vr15, 32 - vpickve2gr.d $a1, $vr15, 0 - vpickve2gr.d $a2, $vr15, 1 - vslt.wu $vr15, $vr16, $vr20 - vor.v $vr15, $vr25, $vr15 - vand.v $vr15, $vr23, $vr15 - vshuf4i.w $vr15, $vr15, 16 - vslli.d $vr15, $vr15, 32 - vsrai.d $vr15, $vr15, 32 - vpickve2gr.d $a3, $vr15, 0 - vpickve2gr.d $a4, $vr15, 1 - vld $vr15, $sp, 128 # 16-byte Folded Reload - vld $vr16, $sp, 96 # 16-byte Folded Reload - vadd.d $vr15, $vr15, $vr16 - vld $vr16, $sp, 112 # 16-byte Folded Reload - vadd.d $vr8, $vr16, $vr8 - vinsgr2vr.w $vr16, $a1, 0 - vinsgr2vr.w $vr16, $a2, 1 - vslli.w $vr16, $vr16, 31 - vsrai.w $vr16, $vr16, 31 - vbitsel.v $vr16, $vr4, $vr3, $vr16 - vsub.w $vr16, $vr21, $vr16 - vinsgr2vr.w $vr17, $a3, 0 - vinsgr2vr.w $vr17, $a4, 1 - vslli.w $vr17, $vr17, 31 - vsrai.w $vr17, $vr17, 31 - vbitsel.v $vr17, $vr4, $vr3, $vr17 - vsub.w $vr17, $vr20, $vr17 - vslli.w $vr14, $vr14, 1 - vshuf4i.w $vr14, $vr14, 16 - vslli.d $vr14, $vr14, 32 - vsrai.d $vr14, $vr14, 32 - vmul.w $vr16, $vr16, $vr11 - vshuf4i.w $vr16, $vr16, 16 - vslli.d $vr16, $vr16, 32 - vsrai.d $vr16, $vr16, 32 + vsub.w $vr30, $vr16, $vr3 + vsub.w $vr31, $vr17, $vr3 + vmul.w $vr30, $vr30, $vr10 + vmul.w $vr31, $vr31, $vr10 + vsllwil.d.w $vr30, $vr30, 0 + vsllwil.d.w $vr31, $vr31, 0 + vmul.d $vr30, $vr30, $vr30 + vmul.d $vr31, $vr31, $vr31 + vor.v $vr22, $vr22, $vr24 + vand.v $vr18, $vr18, $vr22 + vpickve2gr.d $s3, $vr18, 0 + vinsgr2vr.w $vr22, $s3, 0 + vsllwil.d.w $vr18, $vr18, 0 + vpickve2gr.d $s3, $vr18, 1 + vinsgr2vr.w $vr22, $s3, 1 + vslli.w $vr18, $vr22, 31 + vsrai.w $vr18, $vr18, 31 + vbitsel.v $vr18, $vr4, $vr3, $vr18 + vor.v $vr22, $vr23, $vr25 + vand.v $vr20, $vr20, $vr22 + vpickve2gr.d $s3, $vr20, 0 + vinsgr2vr.w $vr22, $s3, 0 + vsllwil.d.w $vr20, $vr20, 0 + vpickve2gr.d $s3, $vr20, 1 + vinsgr2vr.w $vr22, $s3, 1 + vslli.w $vr20, $vr22, 31 + vsrai.w $vr20, $vr20, 31 + vbitsel.v $vr20, $vr4, $vr3, $vr20 + vand.v $vr22, $vr26, $vr28 + vbitsel.v $vr19, $vr30, $vr22, $vr19 + vadd.d $vr14, $vr14, $vr19 + vand.v $vr19, $vr27, $vr29 + vbitsel.v $vr19, $vr31, $vr19, $vr21 + vadd.d $vr15, $vr15, $vr19 + vsub.w $vr16, $vr16, $vr18 + vsub.w $vr17, $vr17, $vr20 + vmul.w $vr16, $vr16, $vr10 + vmul.w $vr17, $vr17, $vr10 + ld.h $s3, $a4, -2 + vsllwil.d.w $vr16, $vr16, 0 + vsllwil.d.w $vr17, $vr17, 0 vmul.d $vr16, $vr16, $vr16 - vmadd.d $vr16, $vr14, $vr14 - vslli.w $vr13, $vr13, 1 - vshuf4i.w $vr13, $vr13, 16 - vslli.d $vr13, $vr13, 32 - vsrai.d $vr13, $vr13, 32 - vmul.w $vr14, $vr17, $vr11 - vshuf4i.w $vr14, $vr14, 16 - vslli.d $vr14, $vr14, 32 - vsrai.d $vr14, $vr14, 32 - vmul.d $vr14, $vr14, $vr14 - vmadd.d $vr14, $vr13, $vr13 - vld $vr17, $sp, 48 # 16-byte Folded Reload - vslt.wu 
$vr13, $vr17, $vr28 - vor.v $vr5, $vr5, $vr13 - vand.v $vr5, $vr30, $vr5 - vshuf4i.w $vr5, $vr5, 16 - vslli.d $vr5, $vr5, 32 - vsrai.d $vr5, $vr5, 32 - vpickve2gr.d $a1, $vr5, 0 - vpickve2gr.d $a2, $vr5, 1 - vslt.wu $vr5, $vr17, $vr27 - vor.v $vr2, $vr2, $vr5 - vand.v $vr2, $vr29, $vr2 - vshuf4i.w $vr2, $vr2, 16 - vslli.d $vr2, $vr2, 32 - vsrai.d $vr2, $vr2, 32 - vpickve2gr.d $a3, $vr2, 0 - vpickve2gr.d $a4, $vr2, 1 - vadd.d $vr0, $vr15, $vr0 - vadd.d $vr2, $vr8, $vr19 - vinsgr2vr.w $vr5, $a1, 0 - vinsgr2vr.w $vr5, $a2, 1 - vslli.w $vr5, $vr5, 31 - vsrai.w $vr5, $vr5, 31 - vbitsel.v $vr5, $vr7, $vr6, $vr5 - vsub.w $vr5, $vr28, $vr5 - vinsgr2vr.w $vr8, $a3, 0 - vinsgr2vr.w $vr8, $a4, 1 - vslli.w $vr8, $vr8, 31 - vsrai.w $vr8, $vr8, 31 - vbitsel.v $vr8, $vr7, $vr6, $vr8 - vsub.w $vr8, $vr27, $vr8 - vshuf4i.w $vr5, $vr5, 16 - vslli.d $vr5, $vr5, 32 - vsrai.d $vr5, $vr5, 32 - vmadd.d $vr16, $vr5, $vr5 - vshuf4i.w $vr5, $vr8, 16 - vslli.d $vr5, $vr5, 32 - vsrai.d $vr5, $vr5, 32 - vmadd.d $vr14, $vr5, $vr5 - vst $vr0, $s3, -16 - vst $vr2, $s3, 0 + vinsgr2vr.h $vr18, $s3, 0 + ld.h $s3, $a4, 0 + vmul.d $vr17, $vr17, $vr17 + vmadd.d $vr16, $vr12, $vr12 + vmadd.d $vr17, $vr13, $vr13 + vinsgr2vr.h $vr12, $s3, 0 + vsllwil.hu.bu $vr13, $vr18, 0 + vsllwil.wu.hu $vr13, $vr13, 0 + vsllwil.hu.bu $vr12, $vr12, 0 + vsllwil.wu.hu $vr12, $vr12, 0 + vsle.wu $vr18, $vr6, $vr13 + vsllwil.d.w $vr19, $vr18, 0 + vsle.wu $vr20, $vr6, $vr12 + vsllwil.d.w $vr21, $vr20, 0 + vslt.wu $vr22, $vr7, $vr13 + vslt.wu $vr23, $vr7, $vr12 + vslt.wu $vr24, $vr8, $vr13 + vslt.wu $vr25, $vr8, $vr12 + vand.v $vr22, $vr18, $vr22 + vsllwil.d.w $vr26, $vr22, 0 + vand.v $vr23, $vr20, $vr23 + vsllwil.d.w $vr27, $vr23, 0 + vsub.w $vr28, $vr13, $vr7 + vsub.w $vr29, $vr12, $vr7 + vsllwil.du.wu $vr28, $vr28, 0 + vsllwil.du.wu $vr29, $vr29, 0 + vmul.d $vr28, $vr28, $vr28 + vmul.d $vr29, $vr29, $vr29 + vsub.w $vr30, $vr13, $vr6 + vsub.w $vr31, $vr12, $vr6 + vsllwil.d.w $vr30, $vr30, 0 + vsllwil.d.w $vr31, $vr31, 0 + vmul.d $vr30, $vr30, $vr30 + vmul.d $vr31, $vr31, $vr31 + vor.v $vr22, $vr22, $vr24 + vand.v $vr18, $vr18, $vr22 + vpickve2gr.d $s3, $vr18, 0 + vinsgr2vr.w $vr22, $s3, 0 + vsllwil.d.w $vr18, $vr18, 0 + vpickve2gr.d $s3, $vr18, 1 + vinsgr2vr.w $vr22, $s3, 1 + vslli.w $vr18, $vr22, 31 + vsrai.w $vr18, $vr18, 31 + vbitsel.v $vr18, $vr7, $vr6, $vr18 + vor.v $vr22, $vr23, $vr25 + vand.v $vr20, $vr20, $vr22 + vpickve2gr.d $s3, $vr20, 0 + vinsgr2vr.w $vr22, $s3, 0 + vsllwil.d.w $vr20, $vr20, 0 + vpickve2gr.d $s3, $vr20, 1 + vinsgr2vr.w $vr22, $s3, 1 + vslli.w $vr20, $vr22, 31 + vsrai.w $vr20, $vr20, 31 + vbitsel.v $vr20, $vr7, $vr6, $vr20 + vand.v $vr22, $vr26, $vr28 + vbitsel.v $vr19, $vr30, $vr22, $vr19 + vadd.d $vr14, $vr14, $vr19 + vand.v $vr19, $vr27, $vr29 + vbitsel.v $vr19, $vr31, $vr19, $vr21 + vadd.d $vr15, $vr15, $vr19 + vsub.w $vr13, $vr13, $vr18 + vsub.w $vr12, $vr12, $vr20 + vsllwil.d.w $vr13, $vr13, 0 + vsllwil.d.w $vr12, $vr12, 0 + vmadd.d $vr16, $vr13, $vr13 + vmadd.d $vr17, $vr12, $vr12 + vst $vr14, $a2, -16 + vst $vr15, $a2, 0 vmin.d $vr9, $vr16, $vr9 - vmin.d $vr12, $vr14, $vr12 - addi.d $s6, $s6, -4 - addi.d $s2, $s2, 4 - addi.d $s3, $s3, 32 - addi.d $s4, $s4, 4 - addi.d $s5, $s5, 4 - bnez $s6, .LBB10_77 + vmin.d $vr11, $vr17, $vr11 + addi.d $s2, $s2, -4 + addi.d $a1, $a1, 4 + addi.d $a2, $a2, 32 + addi.d $a3, $a3, 4 + addi.d $a4, $a4, 4 + bnez $s2, .LBB10_77 # %bb.78: # %middle.block - vmin.d $vr0, $vr9, $vr12 + vmin.d $vr0, $vr9, $vr11 vbsrl.v $vr1, $vr0, 8 vmin.d $vr0, $vr1, $vr0 vpickve2gr.d $a1, $vr0, 0 @@ -3004,8 
+2918,8 @@ fill_inverse_cmap: # @fill_inverse_cmap .LBB10_79: # %.lr.ph170.i.preheader move $a2, $zero move $t2, $zero - addi.d $a3, $sp, 560 - addi.d $a4, $sp, 304 + addi.d $a3, $sp, 432 + addi.d $a4, $sp, 176 lu12i.w $s2, 524287 b .LBB10_81 .p2align 4, , 16 @@ -3024,7 +2938,7 @@ fill_inverse_cmap: # @fill_inverse_cmap b .LBB10_80 .LBB10_83: # %scalar.ph.preheader sub.d $a2, $t1, $s1 - addi.d $a3, $sp, 560 + addi.d $a3, $sp, 432 alsl.d $a3, $s1, $a3, 3 add.d $a4, $s0, $s1 add.d $fp, $fp, $s1 diff --git a/results/MultiSource/Benchmarks/mediabench/mpeg2/mpeg2dec/CMakeFiles/mpeg2decode.dir/getbits.s b/results/MultiSource/Benchmarks/mediabench/mpeg2/mpeg2dec/CMakeFiles/mpeg2decode.dir/getbits.s index 2d3eac08..ed09180c 100644 --- a/results/MultiSource/Benchmarks/mediabench/mpeg2/mpeg2dec/CMakeFiles/mpeg2decode.dir/getbits.s +++ b/results/MultiSource/Benchmarks/mediabench/mpeg2/mpeg2dec/CMakeFiles/mpeg2decode.dir/getbits.s @@ -277,50 +277,49 @@ Flush_Buffer: # @Flush_Buffer slli.d $t0, $a0, 2 alsl.d $a6, $a0, $a2, 2 slli.d $a0, $a0, 5 - vrepli.b $vr1, 0 + vrepli.b $vr0, 0 add.w $a0, $a3, $a0 - vori.b $vr0, $vr1, 0 - vinsgr2vr.w $vr0, $a5, 0 + vori.b $vr1, $vr0, 0 + vinsgr2vr.w $vr1, $a5, 0 vinsgr2vr.w $vr2, $a3, 0 vinsgr2vr.w $vr2, $a3, 1 ori $a3, $zero, 0 lu32i.d $a3, 8 vreplgr2vr.d $vr3, $a3 - vadd.w $vr3, $vr2, $vr3 - vrepli.w $vr4, 24 - vrepli.w $vr5, 8 - vrepli.w $vr6, 32 + vadd.w $vr2, $vr2, $vr3 + vrepli.w $vr3, 24 + vrepli.w $vr4, 8 + vrepli.w $vr5, 32 move $a3, $t0 - vori.b $vr2, $vr1, 0 .p2align 4, , 16 .LBB1_30: # %vector.body # =>This Inner Loop Header: Depth=1 move $a5, $a2 ld.h $a2, $a2, 0 ld.h $t1, $a5, 2 - vinsgr2vr.h $vr7, $a2, 0 - vinsgr2vr.h $vr8, $t1, 0 - vilvl.b $vr7, $vr1, $vr7 - vilvl.h $vr7, $vr1, $vr7 - vilvl.b $vr8, $vr1, $vr8 - vilvl.h $vr8, $vr1, $vr8 - vsub.w $vr9, $vr4, $vr3 - vsub.w $vr10, $vr5, $vr3 + vinsgr2vr.h $vr6, $a2, 0 + vinsgr2vr.h $vr7, $t1, 0 + vsllwil.hu.bu $vr6, $vr6, 0 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.hu.bu $vr7, $vr7, 0 + vsllwil.wu.hu $vr7, $vr7, 0 + vsub.w $vr8, $vr3, $vr2 + vsub.w $vr9, $vr4, $vr2 + vsll.w $vr6, $vr6, $vr8 vsll.w $vr7, $vr7, $vr9 - vsll.w $vr8, $vr8, $vr10 + vor.v $vr1, $vr6, $vr1 vor.v $vr0, $vr7, $vr0 - vor.v $vr2, $vr8, $vr2 addi.d $a2, $a5, 4 addi.d $a3, $a3, -4 - vadd.w $vr3, $vr3, $vr6 + vadd.w $vr2, $vr2, $vr5 bnez $a3, .LBB1_30 # %bb.31: # %middle.block - vreplgr2vr.d $vr1, $a5 - vaddi.du $vr1, $vr1, 1 - vpickve2gr.d $a2, $vr1, 1 + vreplgr2vr.d $vr2, $a5 + vaddi.du $vr2, $vr2, 1 + vpickve2gr.d $a2, $vr2, 1 addi.d $a2, $a2, 3 stptr.d $a2, $a1, 2056 - vor.v $vr0, $vr2, $vr0 + vor.v $vr0, $vr0, $vr1 vbsrl.v $vr1, $vr0, 4 vor.v $vr0, $vr1, $vr0 vpickve2gr.w $a5, $vr0, 0 diff --git a/results/MultiSource/Benchmarks/mediabench/mpeg2/mpeg2dec/CMakeFiles/mpeg2decode.dir/gethdr.s b/results/MultiSource/Benchmarks/mediabench/mpeg2/mpeg2dec/CMakeFiles/mpeg2decode.dir/gethdr.s index dcca2976..8f08b38e 100644 --- a/results/MultiSource/Benchmarks/mediabench/mpeg2/mpeg2dec/CMakeFiles/mpeg2decode.dir/gethdr.s +++ b/results/MultiSource/Benchmarks/mediabench/mpeg2/mpeg2dec/CMakeFiles/mpeg2decode.dir/gethdr.s @@ -5,18 +5,18 @@ .type Get_Hdr,@function Get_Hdr: # @Get_Hdr # %bb.0: - addi.d $sp, $sp, -176 - st.d $ra, $sp, 168 # 8-byte Folded Spill - st.d $fp, $sp, 160 # 8-byte Folded Spill - st.d $s0, $sp, 152 # 8-byte Folded Spill - st.d $s1, $sp, 144 # 8-byte Folded Spill - st.d $s2, $sp, 136 # 8-byte Folded Spill - st.d $s3, $sp, 128 # 8-byte Folded Spill - st.d $s4, $sp, 120 # 8-byte Folded Spill - st.d $s5, $sp, 112 # 8-byte Folded 
Spill - st.d $s6, $sp, 104 # 8-byte Folded Spill - st.d $s7, $sp, 96 # 8-byte Folded Spill - st.d $s8, $sp, 88 # 8-byte Folded Spill + addi.d $sp, $sp, -160 + st.d $ra, $sp, 152 # 8-byte Folded Spill + st.d $fp, $sp, 144 # 8-byte Folded Spill + st.d $s0, $sp, 136 # 8-byte Folded Spill + st.d $s1, $sp, 128 # 8-byte Folded Spill + st.d $s2, $sp, 120 # 8-byte Folded Spill + st.d $s3, $sp, 112 # 8-byte Folded Spill + st.d $s4, $sp, 104 # 8-byte Folded Spill + st.d $s5, $sp, 96 # 8-byte Folded Spill + st.d $s6, $sp, 88 # 8-byte Folded Spill + st.d $s7, $sp, 80 # 8-byte Folded Spill + st.d $s8, $sp, 72 # 8-byte Folded Spill pcalau12i $a0, %got_pc_hi20(ld) ld.d $fp, $a0, %got_pc_lo12(ld) ori $s0, $zero, 2096 @@ -24,16 +24,16 @@ Get_Hdr: # @Get_Hdr ori $s2, $zero, 438 pcalau12i $a0, %got_pc_hi20(horizontal_size) ld.d $a0, $a0, %got_pc_lo12(horizontal_size) - st.d $a0, $sp, 80 # 8-byte Folded Spill + st.d $a0, $sp, 64 # 8-byte Folded Spill pcalau12i $a0, %got_pc_hi20(vertical_size) ld.d $a0, $a0, %got_pc_lo12(vertical_size) - st.d $a0, $sp, 72 # 8-byte Folded Spill + st.d $a0, $sp, 56 # 8-byte Folded Spill pcalau12i $a0, %got_pc_hi20(aspect_ratio_information) ld.d $a0, $a0, %got_pc_lo12(aspect_ratio_information) - st.d $a0, $sp, 64 # 8-byte Folded Spill + st.d $a0, $sp, 48 # 8-byte Folded Spill pcalau12i $a0, %got_pc_hi20(frame_rate_code) ld.d $a0, $a0, %got_pc_lo12(frame_rate_code) - st.d $a0, $sp, 56 # 8-byte Folded Spill + st.d $a0, $sp, 40 # 8-byte Folded Spill pcalau12i $a0, %got_pc_hi20(bit_rate_value) ld.d $s8, $a0, %got_pc_lo12(bit_rate_value) pcalau12i $a0, %got_pc_hi20(vbv_buffer_size) @@ -43,12 +43,10 @@ Get_Hdr: # @Get_Hdr pcalau12i $a0, %got_pc_hi20(default_intra_quantizer_matrix) ld.d $s5, $a0, %got_pc_lo12(default_intra_quantizer_matrix) addi.d $a0, $s5, 64 - st.d $a0, $sp, 48 # 8-byte Folded Spill + st.d $a0, $sp, 32 # 8-byte Folded Spill ori $s6, $zero, 64 - vrepli.b $vr0, 0 - vst $vr0, $sp, 16 # 16-byte Folded Spill vrepli.w $vr0, 16 - vst $vr0, $sp, 32 # 16-byte Folded Spill + vst $vr0, $sp, 16 # 16-byte Folded Spill b .LBB0_2 .p2align 4, , 16 .LBB0_1: # %group_of_pictures_header.exit @@ -139,22 +137,22 @@ Get_Hdr: # @Get_Hdr ori $a0, $zero, 12 pcaddu18i $ra, %call36(Get_Bits) jirl $ra, $ra, 0 - ld.d $a1, $sp, 80 # 8-byte Folded Reload + ld.d $a1, $sp, 64 # 8-byte Folded Reload st.w $a0, $a1, 0 ori $a0, $zero, 12 pcaddu18i $ra, %call36(Get_Bits) jirl $ra, $ra, 0 - ld.d $a1, $sp, 72 # 8-byte Folded Reload + ld.d $a1, $sp, 56 # 8-byte Folded Reload st.w $a0, $a1, 0 ori $a0, $zero, 4 pcaddu18i $ra, %call36(Get_Bits) jirl $ra, $ra, 0 - ld.d $a1, $sp, 64 # 8-byte Folded Reload + ld.d $a1, $sp, 48 # 8-byte Folded Reload st.w $a0, $a1, 0 ori $a0, $zero, 4 pcaddu18i $ra, %call36(Get_Bits) jirl $ra, $ra, 0 - ld.d $a1, $sp, 56 # 8-byte Folded Reload + ld.d $a1, $sp, 40 # 8-byte Folded Reload st.w $a0, $a1, 0 ori $a0, $zero, 18 pcaddu18i $ra, %call36(Get_Bits) @@ -242,7 +240,7 @@ Get_Hdr: # @Get_Hdr # in Loop: Header=BB0_2 Depth=1 addi.d $a2, $a1, 2047 addi.d $a0, $a2, 57 - ld.d $a3, $sp, 48 # 8-byte Folded Reload + ld.d $a3, $sp, 32 # 8-byte Folded Reload bgeu $a0, $a3, .LBB0_20 # %bb.17: # %.preheader22.i # in Loop: Header=BB0_2 Depth=1 @@ -268,94 +266,93 @@ Get_Hdr: # @Get_Hdr vinsgr2vr.w $vr0, $a0, 0 vinsgr2vr.w $vr1, $a2, 0 ld.w $a0, $s5, 8 - vld $vr2, $sp, 16 # 16-byte Folded Reload - vilvl.b $vr0, $vr2, $vr0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 ori $a2, $zero, 2104 vstx $vr0, $a1, $a2 vinsgr2vr.w $vr0, $a0, 0 ld.w $a0, $s5, 12 - 
vilvl.b $vr1, $vr2, $vr1 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 ori $a2, $zero, 2120 vstx $vr1, $a1, $a2 vinsgr2vr.w $vr1, $a0, 0 ld.w $a0, $s5, 16 - vilvl.b $vr0, $vr2, $vr0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 ori $a2, $zero, 2136 vstx $vr0, $a1, $a2 vinsgr2vr.w $vr0, $a0, 0 ld.w $a0, $s5, 20 - vilvl.b $vr1, $vr2, $vr1 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 ori $a2, $zero, 2152 vstx $vr1, $a1, $a2 vinsgr2vr.w $vr1, $a0, 0 ld.w $a0, $s5, 24 - vilvl.b $vr0, $vr2, $vr0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 ori $a2, $zero, 2168 vstx $vr0, $a1, $a2 vinsgr2vr.w $vr0, $a0, 0 ld.w $a0, $s5, 28 - vilvl.b $vr1, $vr2, $vr1 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 ori $a2, $zero, 2184 vstx $vr1, $a1, $a2 vinsgr2vr.w $vr1, $a0, 0 ld.w $a0, $s5, 32 - vilvl.b $vr0, $vr2, $vr0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 ori $a2, $zero, 2200 vstx $vr0, $a1, $a2 vinsgr2vr.w $vr0, $a0, 0 ld.w $a0, $s5, 36 - vilvl.b $vr1, $vr2, $vr1 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 ori $a2, $zero, 2216 vstx $vr1, $a1, $a2 vinsgr2vr.w $vr1, $a0, 0 ld.w $a0, $s5, 40 - vilvl.b $vr0, $vr2, $vr0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 ori $a2, $zero, 2232 vstx $vr0, $a1, $a2 vinsgr2vr.w $vr0, $a0, 0 ld.w $a0, $s5, 44 - vilvl.b $vr1, $vr2, $vr1 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 ori $a2, $zero, 2248 vstx $vr1, $a1, $a2 vinsgr2vr.w $vr1, $a0, 0 ld.w $a0, $s5, 48 - vilvl.b $vr0, $vr2, $vr0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 ori $a2, $zero, 2264 vstx $vr0, $a1, $a2 vinsgr2vr.w $vr0, $a0, 0 ld.w $a0, $s5, 52 - vilvl.b $vr1, $vr2, $vr1 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 ori $a2, $zero, 2280 vstx $vr1, $a1, $a2 vinsgr2vr.w $vr1, $a0, 0 ld.w $a0, $s5, 56 - vilvl.b $vr0, $vr2, $vr0 - vilvl.h $vr0, $vr2, $vr0 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 ori $a2, $zero, 2296 vstx $vr0, $a1, $a2 vinsgr2vr.w $vr0, $a0, 0 ld.w $a0, $s5, 60 - vilvl.b $vr1, $vr2, $vr1 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 ori $a2, $zero, 2312 vstx $vr1, $a1, $a2 vinsgr2vr.w $vr1, $a0, 0 - vilvl.b $vr0, $vr2, $vr0 - vilvl.h $vr0, $vr2, $vr0 - vilvl.b $vr1, $vr2, $vr1 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 ori $a0, $zero, 2328 vstx $vr0, $a1, $a0 ori $a0, $zero, 2344 @@ -390,7 +387,7 @@ Get_Hdr: # @Get_Hdr b .LBB0_25 .LBB0_24: # %.preheader.i # in Loop: Header=BB0_2 Depth=1 - vld $vr0, $sp, 32 # 16-byte Folded Reload + vld $vr0, $sp, 16 # 16-byte Folded Reload ori $a0, $zero, 2360 vstx $vr0, $a1, $a0 ori $a0, $zero, 2376 @@ -702,18 +699,18 @@ Get_Hdr: # @Get_Hdr or $a1, $a1, $a4 st.w $a1, $a3, %pc_lo12(True_Framenum_max) .LBB0_45: # %picture_header.exit - ld.d $s8, $sp, 88 # 8-byte Folded Reload - ld.d $s7, $sp, 96 # 8-byte Folded Reload - ld.d $s6, $sp, 104 # 8-byte Folded Reload - ld.d $s5, $sp, 112 # 8-byte Folded Reload - ld.d $s4, $sp, 120 # 8-byte Folded Reload - ld.d $s3, $sp, 128 # 8-byte Folded Reload - ld.d $s2, $sp, 136 # 8-byte Folded Reload - ld.d 
$s1, $sp, 144 # 8-byte Folded Reload - ld.d $s0, $sp, 152 # 8-byte Folded Reload - ld.d $fp, $sp, 160 # 8-byte Folded Reload - ld.d $ra, $sp, 168 # 8-byte Folded Reload - addi.d $sp, $sp, 176 + ld.d $s8, $sp, 72 # 8-byte Folded Reload + ld.d $s7, $sp, 80 # 8-byte Folded Reload + ld.d $s6, $sp, 88 # 8-byte Folded Reload + ld.d $s5, $sp, 96 # 8-byte Folded Reload + ld.d $s4, $sp, 104 # 8-byte Folded Reload + ld.d $s3, $sp, 112 # 8-byte Folded Reload + ld.d $s2, $sp, 120 # 8-byte Folded Reload + ld.d $s1, $sp, 128 # 8-byte Folded Reload + ld.d $s0, $sp, 136 # 8-byte Folded Reload + ld.d $fp, $sp, 144 # 8-byte Folded Reload + ld.d $ra, $sp, 152 # 8-byte Folded Reload + addi.d $sp, $sp, 160 ret .LBB0_46: pcalau12i $a3, %pc_hi20(Temporal_Reference_Base) diff --git a/results/MultiSource/Benchmarks/mediabench/mpeg2/mpeg2dec/CMakeFiles/mpeg2decode.dir/getpic.s b/results/MultiSource/Benchmarks/mediabench/mpeg2/mpeg2dec/CMakeFiles/mpeg2decode.dir/getpic.s index 1ec0ed8e..9736dff7 100644 --- a/results/MultiSource/Benchmarks/mediabench/mpeg2/mpeg2dec/CMakeFiles/mpeg2decode.dir/getpic.s +++ b/results/MultiSource/Benchmarks/mediabench/mpeg2/mpeg2dec/CMakeFiles/mpeg2decode.dir/getpic.s @@ -5,23 +5,23 @@ .type Decode_Picture,@function Decode_Picture: # @Decode_Picture # %bb.0: - addi.d $sp, $sp, -416 - st.d $ra, $sp, 408 # 8-byte Folded Spill - st.d $fp, $sp, 400 # 8-byte Folded Spill - st.d $s0, $sp, 392 # 8-byte Folded Spill - st.d $s1, $sp, 384 # 8-byte Folded Spill - st.d $s2, $sp, 376 # 8-byte Folded Spill - st.d $s3, $sp, 368 # 8-byte Folded Spill - st.d $s4, $sp, 360 # 8-byte Folded Spill - st.d $s5, $sp, 352 # 8-byte Folded Spill - st.d $s6, $sp, 344 # 8-byte Folded Spill - st.d $s7, $sp, 336 # 8-byte Folded Spill - st.d $s8, $sp, 328 # 8-byte Folded Spill - st.d $a1, $sp, 40 # 8-byte Folded Spill + addi.d $sp, $sp, -432 + st.d $ra, $sp, 424 # 8-byte Folded Spill + st.d $fp, $sp, 416 # 8-byte Folded Spill + st.d $s0, $sp, 408 # 8-byte Folded Spill + st.d $s1, $sp, 400 # 8-byte Folded Spill + st.d $s2, $sp, 392 # 8-byte Folded Spill + st.d $s3, $sp, 384 # 8-byte Folded Spill + st.d $s4, $sp, 376 # 8-byte Folded Spill + st.d $s5, $sp, 368 # 8-byte Folded Spill + st.d $s6, $sp, 360 # 8-byte Folded Spill + st.d $s7, $sp, 352 # 8-byte Folded Spill + st.d $s8, $sp, 344 # 8-byte Folded Spill + st.d $a1, $sp, 48 # 8-byte Folded Spill move $s0, $a0 pcalau12i $a0, %got_pc_hi20(picture_structure) ld.d $a0, $a0, %got_pc_lo12(picture_structure) - st.d $a0, $sp, 208 # 8-byte Folded Spill + st.d $a0, $sp, 224 # 8-byte Folded Spill ld.w $a3, $a0, 0 pcalau12i $a0, %got_pc_hi20(Second_Field) ld.d $fp, $a0, %got_pc_lo12(Second_Field) @@ -35,7 +35,7 @@ Decode_Picture: # @Decode_Picture addi.d $a0, $a0, %pc_lo12(.Lstr) pcaddu18i $ra, %call36(puts) jirl $ra, $ra, 0 - ld.d $a0, $sp, 208 # 8-byte Folded Reload + ld.d $a0, $sp, 224 # 8-byte Folded Reload ld.w $a3, $a0, 0 st.w $zero, $fp, 0 pcalau12i $a0, %got_pc_hi20(picture_coding_type) @@ -163,16 +163,16 @@ Decode_Picture: # @Decode_Picture beqz $a1, .LBB0_19 # %bb.18: move $a0, $s0 - ld.d $a1, $sp, 40 # 8-byte Folded Reload + ld.d $a1, $sp, 48 # 8-byte Folded Reload pcaddu18i $ra, %call36(Substitute_Frame_Buffer) jirl $ra, $ra, 0 ld.w $a0, $fp, 0 .LBB0_19: - st.d $s0, $sp, 24 # 8-byte Folded Spill - st.d $fp, $sp, 32 # 8-byte Folded Spill + st.d $s0, $sp, 32 # 8-byte Folded Spill + st.d $fp, $sp, 40 # 8-byte Folded Spill pcalau12i $a1, %got_pc_hi20(base) ld.d $a1, $a1, %got_pc_lo12(base) - st.d $a1, $sp, 200 # 8-byte Folded Spill + st.d $a1, $sp, 216 # 8-byte Folded 
Spill ldptr.w $a1, $a1, 3160 beqz $a1, .LBB0_22 # %bb.20: @@ -183,20 +183,20 @@ Decode_Picture: # @Decode_Picture .LBB0_22: pcalau12i $a0, %got_pc_hi20(mb_width) ld.d $a0, $a0, %got_pc_lo12(mb_width) - st.d $a0, $sp, 72 # 8-byte Folded Spill + st.d $a0, $sp, 104 # 8-byte Folded Spill ld.w $s0, $a0, 0 pcalau12i $a0, %got_pc_hi20(mb_height) ld.d $a0, $a0, %got_pc_lo12(mb_height) ld.w $s1, $a0, 0 - ld.d $a0, $sp, 208 # 8-byte Folded Reload + ld.d $a0, $sp, 224 # 8-byte Folded Reload ld.w $s3, $a0, 0 pcalau12i $a0, %got_pc_hi20(ld) ld.d $fp, $a0, %got_pc_lo12(ld) - ld.d $s4, $sp, 200 # 8-byte Folded Reload + ld.d $s4, $sp, 216 # 8-byte Folded Reload st.d $s4, $fp, 0 pcalau12i $a0, %got_pc_hi20(Fault_Flag) ld.d $a0, $a0, %got_pc_lo12(Fault_Flag) - st.d $a0, $sp, 128 # 8-byte Folded Spill + st.d $a0, $sp, 160 # 8-byte Folded Spill st.w $zero, $a0, 0 pcaddu18i $ra, %call36(next_start_code) jirl $ra, $ra, 0 @@ -206,7 +206,7 @@ Decode_Picture: # @Decode_Picture move $s2, $a0 addi.w $a0, $a0, -432 addi.w $a1, $zero, -175 - st.d $a1, $sp, 56 # 8-byte Folded Spill + st.d $a1, $sp, 88 # 8-byte Folded Spill bgeu $a0, $a1, .LBB0_31 .LBB0_23: # %._crit_edge.i pcalau12i $a0, %got_pc_hi20(Quiet_Flag) @@ -220,13 +220,13 @@ Decode_Picture: # @Decode_Picture pcaddu18i $ra, %call36(puts) jirl $ra, $ra, 0 .LBB0_26: # %picture_data.exit - ld.d $a0, $sp, 40 # 8-byte Folded Reload + ld.d $a0, $sp, 48 # 8-byte Folded Reload beqz $a0, .LBB0_206 # %bb.27: - ld.d $a0, $sp, 208 # 8-byte Folded Reload + ld.d $a0, $sp, 224 # 8-byte Folded Reload ld.w $a1, $a0, 0 ori $a0, $zero, 3 - ld.d $fp, $sp, 32 # 8-byte Folded Reload + ld.d $fp, $sp, 40 # 8-byte Folded Reload beq $a1, $a0, .LBB0_29 # %bb.28: ld.w $a2, $fp, 0 @@ -238,13 +238,13 @@ Decode_Picture: # @Decode_Picture ld.w $a1, $a1, 0 bne $a1, $a0, .LBB0_207 # %bb.30: - ld.d $a0, $sp, 24 # 8-byte Folded Reload + ld.d $a0, $sp, 32 # 8-byte Folded Reload addi.w $a1, $a0, -1 pcalau12i $a0, %got_pc_hi20(auxframe) ld.d $a0, $a0, %got_pc_lo12(auxframe) pcaddu18i $ra, %call36(Write_Frame) jirl $ra, $ra, 0 - ld.d $a0, $sp, 208 # 8-byte Folded Reload + ld.d $a0, $sp, 224 # 8-byte Folded Reload ld.w $a0, $a0, 0 ori $a1, $zero, 3 bne $a0, $a1, .LBB0_208 @@ -254,32 +254,32 @@ Decode_Picture: # @Decode_Picture addi.d $a1, $s3, -3 sltu $a1, $zero, $a1 sra.w $a0, $a0, $a1 - st.d $a0, $sp, 80 # 8-byte Folded Spill + st.d $a0, $sp, 112 # 8-byte Folded Spill ori $s7, $zero, 1 pcalau12i $a0, %got_pc_hi20(enhan) ld.d $a0, $a0, %got_pc_lo12(enhan) - st.d $a0, $sp, 216 # 8-byte Folded Spill + st.d $a0, $sp, 232 # 8-byte Folded Spill addi.w $a0, $zero, -176 - st.d $a0, $sp, 48 # 8-byte Folded Spill + st.d $a0, $sp, 56 # 8-byte Folded Spill vrepli.b $vr0, 0 - vst $vr0, $sp, 176 # 16-byte Folded Spill + vst $vr0, $sp, 64 # 16-byte Folded Spill pcalau12i $a0, %got_pc_hi20(concealment_motion_vectors) ld.d $a0, $a0, %got_pc_lo12(concealment_motion_vectors) - st.d $a0, $sp, 64 # 8-byte Folded Spill + st.d $a0, $sp, 96 # 8-byte Folded Spill lu12i.w $a0, 15 ori $a0, $a0, 2048 vreplgr2vr.h $vr0, $a0 - vst $vr0, $sp, 160 # 16-byte Folded Spill + vst $vr0, $sp, 192 # 16-byte Folded Spill ori $a0, $zero, 2047 vreplgr2vr.h $vr0, $a0 - vst $vr0, $sp, 144 # 16-byte Folded Spill + vst $vr0, $sp, 176 # 16-byte Folded Spill b .LBB0_35 .p2align 4, , 16 .LBB0_32: # in Loop: Header=BB0_35 Depth=1 move $s3, $a0 pcaddu18i $ra, %call36(Get_macroblock_address_increment) jirl $ra, $ra, 0 - ld.d $a1, $sp, 128 # 8-byte Folded Reload + ld.d $a1, $sp, 160 # 8-byte Folded Reload ld.w $a1, $a1, 0 beqz $a1, .LBB0_40 .LBB0_33: 
# in Loop: Header=BB0_35 Depth=1 @@ -290,7 +290,7 @@ Decode_Picture: # @Decode_Picture .LBB0_34: # %slice.exit.i # in Loop: Header=BB0_35 Depth=1 st.d $s0, $fp, 0 - ld.d $a0, $sp, 128 # 8-byte Folded Reload + ld.d $a0, $sp, 160 # 8-byte Folded Reload st.w $zero, $a0, 0 pcaddu18i $ra, %call36(next_start_code) jirl $ra, $ra, 0 @@ -300,7 +300,7 @@ Decode_Picture: # @Decode_Picture move $s2, $a0 addi.w $a0, $a0, -432 move $s4, $s0 - ld.d $a1, $sp, 56 # 8-byte Folded Reload + ld.d $a1, $sp, 88 # 8-byte Folded Reload bltu $a0, $a1, .LBB0_23 .LBB0_35: # =>This Loop Header: Depth=1 # Child Loop BB0_43 Depth 2 @@ -315,7 +315,7 @@ Decode_Picture: # @Decode_Picture move $s0, $s4 bne $a1, $s7, .LBB0_32 # %bb.36: # in Loop: Header=BB0_35 Depth=1 - ld.d $a0, $sp, 216 # 8-byte Folded Reload + ld.d $a0, $sp, 232 # 8-byte Folded Reload st.d $a0, $fp, 0 pcaddu18i $ra, %call36(next_start_code) jirl $ra, $ra, 0 @@ -324,7 +324,7 @@ Decode_Picture: # @Decode_Picture jirl $ra, $ra, 0 move $s2, $a0 addi.w $a0, $a0, -432 - ld.d $a1, $sp, 48 # 8-byte Folded Reload + ld.d $a1, $sp, 56 # 8-byte Folded Reload bgeu $a1, $a0, .LBB0_211 # %bb.37: # in Loop: Header=BB0_35 Depth=1 pcaddu18i $ra, %call36(Flush_Buffer32) @@ -339,12 +339,12 @@ Decode_Picture: # @Decode_Picture .LBB0_39: # in Loop: Header=BB0_35 Depth=1 pcaddu18i $ra, %call36(Get_macroblock_address_increment) jirl $ra, $ra, 0 - ld.d $a1, $sp, 128 # 8-byte Folded Reload + ld.d $a1, $sp, 160 # 8-byte Folded Reload ld.w $a1, $a1, 0 bnez $a1, .LBB0_33 .LBB0_40: # in Loop: Header=BB0_35 Depth=1 slli.d $a1, $s3, 7 - ld.d $a2, $sp, 72 # 8-byte Folded Reload + ld.d $a2, $sp, 104 # 8-byte Folded Reload ld.w $a2, $a2, 0 andi $a3, $s2, 255 add.d $a1, $a3, $a1 @@ -352,17 +352,17 @@ Decode_Picture: # @Decode_Picture mul.d $a1, $a2, $a1 add.d $a0, $a0, $a1 addi.w $s1, $a0, -1 - st.w $zero, $sp, 296 - st.d $zero, $sp, 288 - vld $vr0, $sp, 176 # 16-byte Folded Reload - vst $vr0, $sp, 256 + st.w $zero, $sp, 312 + st.d $zero, $sp, 304 + vld $vr0, $sp, 64 # 16-byte Folded Reload vst $vr0, $sp, 272 - ld.d $a0, $sp, 80 # 8-byte Folded Reload - ld.d $a3, $sp, 216 # 8-byte Folded Reload + vst $vr0, $sp, 288 + ld.d $a0, $sp, 112 # 8-byte Folded Reload + ld.d $a3, $sp, 232 # 8-byte Folded Reload bge $s1, $a0, .LBB0_26 # %bb.41: # %.preheader.i.i.preheader # in Loop: Header=BB0_35 Depth=1 - st.d $zero, $sp, 112 # 8-byte Folded Spill + st.d $zero, $sp, 144 # 8-byte Folded Spill move $s5, $zero ori $a2, $zero, 1 b .LBB0_43 @@ -372,21 +372,21 @@ Decode_Picture: # @Decode_Picture ld.w $a0, $s3, 0 ori $a1, $zero, 3148 ldx.w $a1, $a3, $a1 - ld.d $s1, $sp, 104 # 8-byte Folded Reload + ld.d $s1, $sp, 136 # 8-byte Folded Reload addi.w $s1, $s1, 1 - ld.d $a2, $sp, 88 # 8-byte Folded Reload + ld.d $a2, $sp, 120 # 8-byte Folded Reload addi.w $a2, $a2, -1 sltu $a0, $zero, $a0 addi.d $a1, $a1, -3 sltui $a1, $a1, 1 and $a0, $a0, $a1 - ld.d $a1, $sp, 112 # 8-byte Folded Reload + ld.d $a1, $sp, 144 # 8-byte Folded Reload add.w $a1, $a1, $a0 - st.d $a1, $sp, 112 # 8-byte Folded Spill - ld.d $s5, $sp, 120 # 8-byte Folded Reload + st.d $a1, $sp, 144 # 8-byte Folded Spill + ld.d $s5, $sp, 152 # 8-byte Folded Reload sub.w $s5, $s5, $a0 - ld.d $s0, $sp, 200 # 8-byte Folded Reload - ld.d $a0, $sp, 80 # 8-byte Folded Reload + ld.d $s0, $sp, 216 # 8-byte Folded Reload + ld.d $a0, $sp, 112 # 8-byte Folded Reload ori $s7, $zero, 1 bge $s1, $a0, .LBB0_26 .LBB0_43: # %.preheader.i.i @@ -398,9 +398,9 @@ Decode_Picture: # @Decode_Picture st.d $s0, $fp, 0 beqz $a2, .LBB0_52 # %bb.44: # in Loop: Header=BB0_43 Depth=2 - 
st.d $s1, $sp, 104 # 8-byte Folded Spill - st.d $a2, $sp, 88 # 8-byte Folded Spill - st.d $s5, $sp, 120 # 8-byte Folded Spill + st.d $s1, $sp, 136 # 8-byte Folded Spill + st.d $a2, $sp, 120 # 8-byte Folded Spill + st.d $s5, $sp, 152 # 8-byte Folded Spill bne $a2, $s7, .LBB0_62 .LBB0_45: # in Loop: Header=BB0_43 Depth=2 ldptr.w $a0, $s0, 3148 @@ -415,26 +415,26 @@ Decode_Picture: # @Decode_Picture or $a0, $a0, $a1 st.d $a0, $fp, 0 .LBB0_47: # in Loop: Header=BB0_43 Depth=2 - addi.d $a0, $sp, 308 - addi.d $a1, $sp, 228 - addi.d $a2, $sp, 224 - addi.d $a3, $sp, 304 - addi.d $a4, $sp, 324 - addi.d $a5, $sp, 320 - addi.d $a6, $sp, 316 - addi.d $a7, $sp, 312 - addi.d $t0, $sp, 300 + addi.d $a0, $sp, 324 + addi.d $a1, $sp, 244 + addi.d $a2, $sp, 240 + addi.d $a3, $sp, 320 + addi.d $a4, $sp, 340 + addi.d $a5, $sp, 336 + addi.d $a6, $sp, 332 + addi.d $a7, $sp, 328 + addi.d $t0, $sp, 316 st.d $t0, $sp, 0 pcaddu18i $ra, %call36(macroblock_modes) jirl $ra, $ra, 0 - ld.d $a0, $sp, 128 # 8-byte Folded Reload + ld.d $a0, $sp, 160 # 8-byte Folded Reload ld.w $a0, $a0, 0 ori $t1, $zero, 3 bnez $a0, .LBB0_205 # %bb.48: # in Loop: Header=BB0_43 Depth=2 - ld.wu $s2, $sp, 308 + ld.wu $s2, $sp, 324 andi $a0, $s2, 16 - ld.d $s0, $sp, 200 # 8-byte Folded Reload + ld.d $s0, $sp, 216 # 8-byte Folded Reload beqz $a0, .LBB0_75 # %bb.49: # in Loop: Header=BB0_43 Depth=2 ori $a0, $zero, 5 @@ -468,7 +468,7 @@ Decode_Picture: # @Decode_Picture jirl $ra, $ra, 0 beqz $a0, .LBB0_34 # %bb.56: # in Loop: Header=BB0_43 Depth=2 - ld.d $a0, $sp, 128 # 8-byte Folded Reload + ld.d $a0, $sp, 160 # 8-byte Folded Reload ld.w $a0, $a0, 0 bnez $a0, .LBB0_34 # %bb.57: # in Loop: Header=BB0_43 Depth=2 @@ -479,20 +479,20 @@ Decode_Picture: # @Decode_Picture ldx.w $a0, $s0, $a0 bne $a0, $s7, .LBB0_60 # %bb.59: # in Loop: Header=BB0_43 Depth=2 - ld.d $a0, $sp, 216 # 8-byte Folded Reload + ld.d $a0, $sp, 232 # 8-byte Folded Reload st.d $a0, $fp, 0 .LBB0_60: # in Loop: Header=BB0_43 Depth=2 pcaddu18i $ra, %call36(Get_macroblock_address_increment) jirl $ra, $ra, 0 - ld.d $a1, $sp, 128 # 8-byte Folded Reload + ld.d $a1, $sp, 160 # 8-byte Folded Reload ld.w $a1, $a1, 0 bnez $a1, .LBB0_34 # %bb.61: # in Loop: Header=BB0_43 Depth=2 move $a2, $a0 - ld.d $a3, $sp, 216 # 8-byte Folded Reload - st.d $s1, $sp, 104 # 8-byte Folded Spill - st.d $a2, $sp, 88 # 8-byte Folded Spill - st.d $s5, $sp, 120 # 8-byte Folded Spill + ld.d $a3, $sp, 232 # 8-byte Folded Reload + st.d $s1, $sp, 136 # 8-byte Folded Spill + st.d $a2, $sp, 120 # 8-byte Folded Spill + st.d $s5, $sp, 152 # 8-byte Folded Spill beq $a2, $s7, .LBB0_45 .LBB0_62: # in Loop: Header=BB0_43 Depth=2 ldptr.w $a0, $s0, 3148 @@ -513,21 +513,21 @@ Decode_Picture: # @Decode_Picture move $a1, $zero pcaddu18i $ra, %call36(memset) jirl $ra, $ra, 0 - ld.d $a3, $sp, 216 # 8-byte Folded Reload + ld.d $a3, $sp, 232 # 8-byte Folded Reload .LBB0_66: # %._crit_edge.i33.i.i # in Loop: Header=BB0_43 Depth=2 - st.w $zero, $sp, 296 - st.d $zero, $sp, 288 + st.w $zero, $sp, 312 + st.d $zero, $sp, 304 pcalau12i $a0, %got_pc_hi20(picture_coding_type) ld.d $a0, $a0, %got_pc_lo12(picture_coding_type) ld.w $a0, $a0, 0 ori $a1, $zero, 2 bne $a0, $a1, .LBB0_68 # %bb.67: # in Loop: Header=BB0_43 Depth=2 + st.d $zero, $sp, 288 st.d $zero, $sp, 272 - st.d $zero, $sp, 256 .LBB0_68: # in Loop: Header=BB0_43 Depth=2 - ld.d $a1, $sp, 208 # 8-byte Folded Reload + ld.d $a1, $sp, 224 # 8-byte Folded Reload ld.w $a2, $a1, 0 ori $a1, $zero, 2 ori $t1, $zero, 3 @@ -535,19 +535,19 @@ Decode_Picture: # @Decode_Picture # %bb.69: # in Loop: 
Header=BB0_43 Depth=2 addi.d $a1, $a2, -2 sltui $a1, $a1, 1 - st.w $a1, $sp, 244 - st.w $a1, $sp, 240 + st.w $a1, $sp, 260 + st.w $a1, $sp, 256 ori $a1, $zero, 1 .LBB0_70: # %skipped_macroblock.exit.i.i # in Loop: Header=BB0_43 Depth=2 - st.w $a1, $sp, 304 + st.w $a1, $sp, 320 addi.d $a0, $a0, -1 - ld.w $s2, $sp, 308 + ld.w $s2, $sp, 324 sltui $a0, $a0, 1 slli.d $a0, $a0, 3 - st.w $a0, $sp, 228 + st.w $a0, $sp, 244 bstrins.d $s2, $zero, 0, 0 - st.w $s2, $sp, 308 + st.w $s2, $sp, 324 pcalau12i $a0, %got_pc_hi20(Two_Streams) ld.d $s3, $a0, %got_pc_lo12(Two_Streams) ld.w $a0, $s3, 0 @@ -569,13 +569,13 @@ Decode_Picture: # @Decode_Picture .LBB0_75: # in Loop: Header=BB0_43 Depth=2 andi $a0, $s2, 8 andi $a1, $s2, 1 - st.d $a1, $sp, 136 # 8-byte Folded Spill + st.d $a1, $sp, 168 # 8-byte Folded Spill bnez $a0, .LBB0_78 # %bb.76: # in Loop: Header=BB0_43 Depth=2 - ld.d $a0, $sp, 136 # 8-byte Folded Reload + ld.d $a0, $sp, 168 # 8-byte Folded Reload beqz $a0, .LBB0_82 # %bb.77: # in Loop: Header=BB0_43 Depth=2 - ld.d $a0, $sp, 64 # 8-byte Folded Reload + ld.d $a0, $sp, 96 # 8-byte Folded Reload ld.w $a0, $a0, 0 beqz $a0, .LBB0_82 .LBB0_78: # in Loop: Header=BB0_43 Depth=2 @@ -583,20 +583,20 @@ Decode_Picture: # @Decode_Picture ldptr.w $a0, $a0, 3144 beqz $a0, .LBB0_80 # %bb.79: # in Loop: Header=BB0_43 Depth=2 - ld.w $a4, $sp, 324 - ld.w $a5, $sp, 320 + ld.w $a4, $sp, 340 + ld.w $a5, $sp, 336 pcalau12i $a0, %got_pc_hi20(f_code) ld.d $a0, $a0, %got_pc_lo12(f_code) ld.w $a1, $a0, 0 ld.w $a0, $a0, 4 - ld.w $a3, $sp, 316 - ld.w $a2, $sp, 312 + ld.w $a3, $sp, 332 + ld.w $a2, $sp, 328 addi.w $a6, $a1, -1 addi.w $a7, $a0, -1 st.d $a2, $sp, 8 - addi.d $a0, $sp, 256 - addi.d $a1, $sp, 232 - addi.d $a2, $sp, 240 + addi.d $a0, $sp, 272 + addi.d $a1, $sp, 248 + addi.d $a2, $sp, 256 st.d $a3, $sp, 0 move $a3, $zero pcaddu18i $ra, %call36(motion_vectors) @@ -610,8 +610,8 @@ Decode_Picture: # @Decode_Picture pcalau12i $a0, %got_pc_hi20(full_pel_forward_vector) ld.d $a0, $a0, %got_pc_lo12(full_pel_forward_vector) ld.w $a6, $a0, 0 - addi.d $a0, $sp, 256 - addi.d $a1, $sp, 232 + addi.d $a0, $sp, 272 + addi.d $a1, $sp, 248 move $a3, $a2 move $a4, $zero move $a5, $zero @@ -620,12 +620,12 @@ Decode_Picture: # @Decode_Picture .LBB0_81: # in Loop: Header=BB0_43 Depth=2 ori $t1, $zero, 3 .LBB0_82: # in Loop: Header=BB0_43 Depth=2 - st.d $s2, $sp, 96 # 8-byte Folded Spill - ld.d $a0, $sp, 128 # 8-byte Folded Reload + st.d $s2, $sp, 128 # 8-byte Folded Spill + ld.d $a0, $sp, 160 # 8-byte Folded Reload ld.w $a0, $a0, 0 bnez $a0, .LBB0_205 # %bb.83: # in Loop: Header=BB0_43 Depth=2 - ld.d $a0, $sp, 96 # 8-byte Folded Reload + ld.d $a0, $sp, 128 # 8-byte Folded Reload andi $a0, $a0, 4 beqz $a0, .LBB0_87 # %bb.84: # in Loop: Header=BB0_43 Depth=2 @@ -633,24 +633,24 @@ Decode_Picture: # @Decode_Picture ldptr.w $a0, $a0, 3144 beqz $a0, .LBB0_86 # %bb.85: # in Loop: Header=BB0_43 Depth=2 - ld.w $a4, $sp, 324 - ld.w $a5, $sp, 320 + ld.w $a4, $sp, 340 + ld.w $a5, $sp, 336 pcalau12i $a0, %got_pc_hi20(f_code) ld.d $a0, $a0, %got_pc_lo12(f_code) ld.w $a1, $a0, 8 ld.w $a0, $a0, 12 - ld.w $a2, $sp, 312 + ld.w $a2, $sp, 328 addi.w $a6, $a1, -1 addi.w $a7, $a0, -1 st.d $a2, $sp, 8 - addi.d $a0, $sp, 256 - addi.d $a1, $sp, 232 - addi.d $a2, $sp, 240 + addi.d $a0, $sp, 272 + addi.d $a1, $sp, 248 + addi.d $a2, $sp, 256 ori $a3, $zero, 1 st.d $zero, $sp, 0 pcaddu18i $ra, %call36(motion_vectors) jirl $ra, $ra, 0 - ld.d $a0, $sp, 128 # 8-byte Folded Reload + ld.d $a0, $sp, 160 # 8-byte Folded Reload ld.w $a0, $a0, 0 ori $t1, $zero, 3 beqz $a0, 
.LBB0_87 @@ -663,25 +663,25 @@ Decode_Picture: # @Decode_Picture pcalau12i $a0, %got_pc_hi20(full_pel_backward_vector) ld.d $a0, $a0, %got_pc_lo12(full_pel_backward_vector) ld.w $a6, $a0, 0 - addi.d $a1, $sp, 232 - addi.d $a0, $sp, 264 + addi.d $a1, $sp, 248 + addi.d $a0, $sp, 280 move $a3, $a2 move $a4, $zero move $a5, $zero pcaddu18i $ra, %call36(motion_vector) jirl $ra, $ra, 0 - ld.d $a0, $sp, 128 # 8-byte Folded Reload + ld.d $a0, $sp, 160 # 8-byte Folded Reload ld.w $a0, $a0, 0 ori $t1, $zero, 3 bnez $a0, .LBB0_205 .p2align 4, , 16 .LBB0_87: # %.thread.i.i # in Loop: Header=BB0_43 Depth=2 - ld.d $a0, $sp, 136 # 8-byte Folded Reload + ld.d $a0, $sp, 168 # 8-byte Folded Reload beqz $a0, .LBB0_90 # %bb.88: # %.thread.i.i # in Loop: Header=BB0_43 Depth=2 - ld.d $a0, $sp, 64 # 8-byte Folded Reload + ld.d $a0, $sp, 96 # 8-byte Folded Reload ld.w $a0, $a0, 0 beqz $a0, .LBB0_90 # %bb.89: # in Loop: Header=BB0_43 Depth=2 @@ -690,22 +690,22 @@ Decode_Picture: # @Decode_Picture jirl $ra, $ra, 0 ori $t1, $zero, 3 .LBB0_90: # in Loop: Header=BB0_43 Depth=2 - ld.d $a1, $sp, 200 # 8-byte Folded Reload + ld.d $a1, $sp, 216 # 8-byte Folded Reload ldptr.w $a0, $a1, 3148 - ld.d $a2, $sp, 96 # 8-byte Folded Reload + ld.d $a2, $sp, 128 # 8-byte Folded Reload bne $a0, $s1, .LBB0_93 # %bb.91: # in Loop: Header=BB0_43 Depth=2 ori $a0, $zero, 3164 ldx.w $a0, $a1, $a0 bne $a0, $t1, .LBB0_93 # %bb.92: # in Loop: Header=BB0_43 Depth=2 - ld.d $a0, $sp, 216 # 8-byte Folded Reload + ld.d $a0, $sp, 232 # 8-byte Folded Reload st.d $a0, $fp, 0 .LBB0_93: # in Loop: Header=BB0_43 Depth=2 andi $a0, $a2, 2 bnez $a0, .LBB0_95 # %bb.94: # in Loop: Header=BB0_43 Depth=2 - ld.d $a0, $sp, 136 # 8-byte Folded Reload + ld.d $a0, $sp, 168 # 8-byte Folded Reload sltui $a0, $a0, 1 pcalau12i $a1, %got_pc_hi20(block_count) ld.d $a1, $a1, %got_pc_lo12(block_count) @@ -741,7 +741,7 @@ Decode_Picture: # @Decode_Picture ori $t1, $zero, 3 or $s4, $a0, $s0 .LBB0_100: # in Loop: Header=BB0_43 Depth=2 - ld.d $a0, $sp, 128 # 8-byte Folded Reload + ld.d $a0, $sp, 160 # 8-byte Folded Reload ld.w $a0, $a0, 0 bnez $a0, .LBB0_205 # %bb.101: # %.preheader.i.i.i @@ -759,14 +759,14 @@ Decode_Picture: # @Decode_Picture b .LBB0_107 .p2align 4, , 16 .LBB0_103: # in Loop: Header=BB0_107 Depth=3 - addi.d $a1, $sp, 288 + addi.d $a1, $sp, 304 beqz $a0, .LBB0_114 # %bb.104: # in Loop: Header=BB0_107 Depth=3 move $a0, $s5 pcaddu18i $ra, %call36(Decode_MPEG2_Intra_Block) jirl $ra, $ra, 0 .LBB0_105: # in Loop: Header=BB0_107 Depth=3 - ld.d $a0, $sp, 128 # 8-byte Folded Reload + ld.d $a0, $sp, 160 # 8-byte Folded Reload ld.w $a0, $a0, 0 ori $t1, $zero, 3 bnez $a0, .LBB0_205 @@ -781,7 +781,7 @@ Decode_Picture: # @Decode_Picture # Parent Loop BB0_35 Depth=1 # Parent Loop BB0_43 Depth=2 # => This Inner Loop Header: Depth=3 - ld.d $s0, $sp, 200 # 8-byte Folded Reload + ld.d $s0, $sp, 216 # 8-byte Folded Reload ldptr.w $a0, $s0, 3148 bne $a0, $s1, .LBB0_109 # %bb.108: # in Loop: Header=BB0_107 Depth=3 @@ -805,7 +805,7 @@ Decode_Picture: # @Decode_Picture # %bb.111: # in Loop: Header=BB0_107 Depth=3 ori $a0, $zero, 3144 ldx.w $a0, $s0, $a0 - ld.d $a1, $sp, 136 # 8-byte Folded Reload + ld.d $a1, $sp, 168 # 8-byte Folded Reload bnez $a1, .LBB0_103 # %bb.112: # in Loop: Header=BB0_107 Depth=3 beqz $a0, .LBB0_115 @@ -839,22 +839,22 @@ Decode_Picture: # @Decode_Picture jirl $ra, $ra, 0 ori $t1, $zero, 3 .LBB0_118: # in Loop: Header=BB0_43 Depth=2 - ld.d $a3, $sp, 216 # 8-byte Folded Reload - ld.d $s1, $sp, 104 # 8-byte Folded Reload - ld.d $s5, $sp, 120 # 8-byte Folded 
Reload - ld.d $s2, $sp, 96 # 8-byte Folded Reload - ld.d $a0, $sp, 136 # 8-byte Folded Reload + ld.d $a3, $sp, 232 # 8-byte Folded Reload + ld.d $s1, $sp, 136 # 8-byte Folded Reload + ld.d $s5, $sp, 152 # 8-byte Folded Reload + ld.d $s2, $sp, 128 # 8-byte Folded Reload + ld.d $a0, $sp, 168 # 8-byte Folded Reload bnez $a0, .LBB0_120 # %bb.119: # %.thread87.i.i # in Loop: Header=BB0_43 Depth=2 - st.w $zero, $sp, 296 - st.d $zero, $sp, 288 + st.w $zero, $sp, 312 + st.d $zero, $sp, 304 andi $a0, $s2, 9 ori $s7, $zero, 1 beqz $a0, .LBB0_122 b .LBB0_125 .LBB0_120: # in Loop: Header=BB0_43 Depth=2 - ld.d $a0, $sp, 64 # 8-byte Folded Reload + ld.d $a0, $sp, 96 # 8-byte Folded Reload ld.w $a0, $a0, 0 beqz $a0, .LBB0_142 # %bb.121: # in Loop: Header=BB0_43 Depth=2 @@ -866,25 +866,25 @@ Decode_Picture: # @Decode_Picture ori $a1, $zero, 2 bne $a0, $a1, .LBB0_125 # %bb.123: # in Loop: Header=BB0_43 Depth=2 - ld.d $a0, $sp, 208 # 8-byte Folded Reload + ld.d $a0, $sp, 224 # 8-byte Folded Reload ld.w $a0, $a0, 0 + st.d $zero, $sp, 288 st.d $zero, $sp, 272 - st.d $zero, $sp, 256 bne $a0, $t1, .LBB0_139 # %bb.124: # in Loop: Header=BB0_43 Depth=2 ori $a0, $zero, 2 - st.w $a0, $sp, 304 + st.w $a0, $sp, 320 .LBB0_125: # in Loop: Header=BB0_43 Depth=2 - ld.w $a0, $sp, 224 + ld.w $a0, $sp, 240 ori $a1, $zero, 4 bne $a0, $a1, .LBB0_127 .LBB0_126: # in Loop: Header=BB0_43 Depth=2 - vld $vr0, $sp, 176 # 16-byte Folded Reload + vld $vr0, $sp, 64 # 16-byte Folded Reload + vst $vr0, $sp, 288 vst $vr0, $sp, 272 - vst $vr0, $sp, 256 .LBB0_127: # %decode_macroblock.exit.i.i # in Loop: Header=BB0_43 Depth=2 - ld.d $s0, $sp, 200 # 8-byte Folded Reload + ld.d $s0, $sp, 216 # 8-byte Folded Reload pcalau12i $a0, %got_pc_hi20(Two_Streams) ld.d $s3, $a0, %got_pc_lo12(Two_Streams) ld.w $a0, $s3, 0 @@ -897,33 +897,33 @@ Decode_Picture: # @Decode_Picture st.d $a3, $fp, 0 beqz $s5, .LBB0_136 # %bb.130: # in Loop: Header=BB0_43 Depth=2 - ld.d $a0, $sp, 112 # 8-byte Folded Reload + ld.d $a0, $sp, 144 # 8-byte Folded Reload bne $a0, $s1, .LBB0_154 .LBB0_131: # in Loop: Header=BB0_43 Depth=2 bne $s5, $s7, .LBB0_140 .LBB0_132: # %.thread55.i.i.i # in Loop: Header=BB0_43 Depth=2 - addi.d $a0, $sp, 324 - addi.d $a1, $sp, 316 - addi.d $a2, $sp, 316 - addi.d $a3, $sp, 316 - addi.d $a4, $sp, 316 - addi.d $a5, $sp, 316 - addi.d $a6, $sp, 316 - addi.d $a7, $sp, 316 - addi.d $t0, $sp, 320 + addi.d $a0, $sp, 340 + addi.d $a1, $sp, 332 + addi.d $a2, $sp, 332 + addi.d $a3, $sp, 332 + addi.d $a4, $sp, 332 + addi.d $a5, $sp, 332 + addi.d $a6, $sp, 332 + addi.d $a7, $sp, 332 + addi.d $t0, $sp, 336 st.d $t0, $sp, 0 pcaddu18i $ra, %call36(macroblock_modes) jirl $ra, $ra, 0 - ld.wu $a0, $sp, 324 + ld.wu $a0, $sp, 340 andi $s0, $a0, 2 bnez $s0, .LBB0_150 # %bb.133: # in Loop: Header=BB0_43 Depth=2 andi $a0, $a0, 16 - ld.d $a3, $sp, 216 # 8-byte Folded Reload + ld.d $a3, $sp, 232 # 8-byte Folded Reload bnez $a0, .LBB0_151 .LBB0_134: # in Loop: Header=BB0_43 Depth=2 - st.d $s2, $sp, 96 # 8-byte Folded Spill + st.d $s2, $sp, 128 # 8-byte Folded Spill bnez $s0, .LBB0_157 .LBB0_135: # in Loop: Header=BB0_43 Depth=2 move $s4, $zero @@ -934,24 +934,24 @@ Decode_Picture: # @Decode_Picture jirl $ra, $ra, 0 beqz $a0, .LBB0_143 # %bb.137: # in Loop: Header=BB0_43 Depth=2 - ld.d $a0, $sp, 80 # 8-byte Folded Reload - ld.d $a1, $sp, 112 # 8-byte Folded Reload + ld.d $a0, $sp, 112 # 8-byte Folded Reload + ld.d $a1, $sp, 144 # 8-byte Folded Reload bge $a1, $a0, .LBB0_147 # %bb.138: # in Loop: Header=BB0_43 Depth=2 pcaddu18i $ra, 
%call36(Get_macroblock_address_increment) jirl $ra, $ra, 0 move $s5, $a0 - ld.d $a3, $sp, 216 # 8-byte Folded Reload + ld.d $a3, $sp, 232 # 8-byte Folded Reload ori $t1, $zero, 3 - ld.d $a0, $sp, 112 # 8-byte Folded Reload + ld.d $a0, $sp, 144 # 8-byte Folded Reload beq $a0, $s1, .LBB0_131 b .LBB0_154 .LBB0_139: # in Loop: Header=BB0_43 Depth=2 - st.w $s7, $sp, 304 + st.w $s7, $sp, 320 addi.d $a0, $a0, -2 sltui $a0, $a0, 1 - st.w $a0, $sp, 240 - ld.w $a0, $sp, 224 + st.w $a0, $sp, 256 + ld.w $a0, $sp, 240 ori $a1, $zero, 4 beq $a0, $a1, .LBB0_126 b .LBB0_127 @@ -971,12 +971,12 @@ Decode_Picture: # @Decode_Picture pcaddu18i $ra, %call36(memset) jirl $ra, $ra, 0 ori $t1, $zero, 3 - ld.d $a3, $sp, 216 # 8-byte Folded Reload + ld.d $a3, $sp, 232 # 8-byte Folded Reload b .LBB0_170 .LBB0_142: # in Loop: Header=BB0_43 Depth=2 - vld $vr0, $sp, 176 # 16-byte Folded Reload + vld $vr0, $sp, 64 # 16-byte Folded Reload + vst $vr0, $sp, 288 vst $vr0, $sp, 272 - vst $vr0, $sp, 256 andi $a0, $s2, 9 ori $s7, $zero, 1 beqz $a0, .LBB0_122 @@ -989,7 +989,7 @@ Decode_Picture: # @Decode_Picture jirl $ra, $ra, 0 move $s5, $a0 addi.w $a0, $a0, -432 - ld.d $a1, $sp, 48 # 8-byte Folded Reload + ld.d $a1, $sp, 56 # 8-byte Folded Reload bltu $a1, $a0, .LBB0_153 # %bb.144: # in Loop: Header=BB0_43 Depth=2 pcalau12i $a0, %got_pc_hi20(Quiet_Flag) @@ -1005,7 +1005,7 @@ Decode_Picture: # @Decode_Picture ori $t1, $zero, 3 .LBB0_146: # in Loop: Header=BB0_43 Depth=2 move $s5, $zero - ld.d $a3, $sp, 216 # 8-byte Folded Reload + ld.d $a3, $sp, 232 # 8-byte Folded Reload b .LBB0_171 .LBB0_147: # in Loop: Header=BB0_43 Depth=2 pcalau12i $a0, %got_pc_hi20(Quiet_Flag) @@ -1019,14 +1019,14 @@ Decode_Picture: # @Decode_Picture jirl $ra, $ra, 0 .LBB0_149: # in Loop: Header=BB0_43 Depth=2 move $s5, $zero - ld.d $a3, $sp, 216 # 8-byte Folded Reload + ld.d $a3, $sp, 232 # 8-byte Folded Reload ori $t1, $zero, 3 b .LBB0_171 .LBB0_150: # in Loop: Header=BB0_43 Depth=2 - ld.w $a1, $sp, 320 - st.w $a1, $sp, 300 + ld.w $a1, $sp, 336 + st.w $a1, $sp, 316 andi $a0, $a0, 16 - ld.d $a3, $sp, 216 # 8-byte Folded Reload + ld.d $a3, $sp, 232 # 8-byte Folded Reload beqz $a0, .LBB0_134 .LBB0_151: # in Loop: Header=BB0_43 Depth=2 ori $a0, $zero, 5 @@ -1040,8 +1040,8 @@ Decode_Picture: # @Decode_Picture ld.d $a2, $a2, %got_pc_lo12(Non_Linear_quantizer_scale) ldx.bu $a0, $a2, $a0 stptr.w $a0, $a1, 3168 - ld.d $a3, $sp, 216 # 8-byte Folded Reload - st.d $s2, $sp, 96 # 8-byte Folded Spill + ld.d $a3, $sp, 232 # 8-byte Folded Reload + st.d $s2, $sp, 128 # 8-byte Folded Spill beqz $s0, .LBB0_135 b .LBB0_157 .LBB0_153: # %.thread.i.i.i @@ -1054,7 +1054,7 @@ Decode_Picture: # @Decode_Picture pcaddu18i $ra, %call36(Get_macroblock_address_increment) jirl $ra, $ra, 0 slli.d $a1, $s4, 7 - ld.d $a2, $sp, 72 # 8-byte Folded Reload + ld.d $a2, $sp, 104 # 8-byte Folded Reload ld.w $a2, $a2, 0 andi $a3, $s5, 255 add.d $a1, $a3, $a1 @@ -1063,9 +1063,9 @@ Decode_Picture: # @Decode_Picture add.d $a0, $a0, $a1 addi.w $a0, $a0, -1 ori $s5, $zero, 1 - ld.d $a3, $sp, 216 # 8-byte Folded Reload + ld.d $a3, $sp, 232 # 8-byte Folded Reload ori $t1, $zero, 3 - st.d $a0, $sp, 112 # 8-byte Folded Spill + st.d $a0, $sp, 144 # 8-byte Folded Spill beq $a0, $s1, .LBB0_132 .LBB0_154: # in Loop: Header=BB0_43 Depth=2 pcalau12i $a0, %got_pc_hi20(Quiet_Flag) @@ -1078,13 +1078,13 @@ Decode_Picture: # @Decode_Picture pcaddu18i $ra, %call36(puts) jirl $ra, $ra, 0 ori $t1, $zero, 3 - ld.d $a3, $sp, 216 # 8-byte Folded Reload + ld.d $a3, $sp, 232 # 8-byte Folded Reload b .LBB0_171 .LBB0_156: # 
in Loop: Header=BB0_43 Depth=2 slli.d $a0, $a0, 1 stptr.w $a0, $a1, 3168 - ld.d $a3, $sp, 216 # 8-byte Folded Reload - st.d $s2, $sp, 96 # 8-byte Folded Spill + ld.d $a3, $sp, 232 # 8-byte Folded Reload + st.d $s2, $sp, 128 # 8-byte Folded Spill beqz $s0, .LBB0_135 .LBB0_157: # in Loop: Header=BB0_43 Depth=2 pcaddu18i $ra, %call36(Get_coded_block_pattern) @@ -1096,7 +1096,7 @@ Decode_Picture: # @Decode_Picture ori $a1, $zero, 2 beq $a0, $a1, .LBB0_160 # %bb.158: # in Loop: Header=BB0_43 Depth=2 - ld.d $a3, $sp, 216 # 8-byte Folded Reload + ld.d $a3, $sp, 232 # 8-byte Folded Reload ori $t1, $zero, 3 bne $a0, $t1, .LBB0_162 # %bb.159: # in Loop: Header=BB0_43 Depth=2 @@ -1107,7 +1107,7 @@ Decode_Picture: # @Decode_Picture pcaddu18i $ra, %call36(Get_Bits) jirl $ra, $ra, 0 or $s4, $a0, $s0 - ld.d $a3, $sp, 216 # 8-byte Folded Reload + ld.d $a3, $sp, 232 # 8-byte Folded Reload .LBB0_161: # in Loop: Header=BB0_43 Depth=2 ori $t1, $zero, 3 .LBB0_162: # in Loop: Header=BB0_43 Depth=2 @@ -1152,32 +1152,32 @@ Decode_Picture: # @Decode_Picture ld.w $s2, $s1, 0 b .LBB0_164 .LBB0_167: # in Loop: Header=BB0_43 Depth=2 - ld.d $s0, $sp, 200 # 8-byte Folded Reload + ld.d $s0, $sp, 216 # 8-byte Folded Reload ori $s7, $zero, 1 - ld.d $a3, $sp, 216 # 8-byte Folded Reload + ld.d $a3, $sp, 232 # 8-byte Folded Reload ori $t1, $zero, 3 - ld.d $s1, $sp, 104 # 8-byte Folded Reload + ld.d $s1, $sp, 136 # 8-byte Folded Reload ori $s5, $zero, 1 b .LBB0_169 .LBB0_168: # in Loop: Header=BB0_43 Depth=2 - ld.d $s0, $sp, 200 # 8-byte Folded Reload + ld.d $s0, $sp, 216 # 8-byte Folded Reload ori $s7, $zero, 1 - ld.d $s1, $sp, 104 # 8-byte Folded Reload + ld.d $s1, $sp, 136 # 8-byte Folded Reload .LBB0_169: # %.loopexit.i.i.i # in Loop: Header=BB0_43 Depth=2 - ld.d $s2, $sp, 96 # 8-byte Folded Reload + ld.d $s2, $sp, 128 # 8-byte Folded Reload .LBB0_170: # %.loopexit.i.i.i # in Loop: Header=BB0_43 Depth=2 st.d $s0, $fp, 0 - st.d $s1, $sp, 112 # 8-byte Folded Spill + st.d $s1, $sp, 144 # 8-byte Folded Spill .LBB0_171: # %Decode_SNR_Macroblock.exit.i.i # in Loop: Header=BB0_43 Depth=2 - st.d $s5, $sp, 120 # 8-byte Folded Spill + st.d $s5, $sp, 152 # 8-byte Folded Spill .LBB0_172: # in Loop: Header=BB0_43 Depth=2 - ld.d $a0, $sp, 72 # 8-byte Folded Reload + ld.d $a0, $sp, 104 # 8-byte Folded Reload ld.w $a0, $a0, 0 - ld.w $a1, $sp, 300 - st.d $a1, $sp, 136 # 8-byte Folded Spill + ld.w $a1, $sp, 316 + st.d $a1, $sp, 168 # 8-byte Folded Spill div.w $a1, $s1, $a0 mul.d $a0, $a1, $a0 sub.d $a0, $s1, $a0 @@ -1186,18 +1186,18 @@ Decode_Picture: # @Decode_Picture slli.w $s5, $a1, 4 bnez $s6, .LBB0_175 # %bb.173: # in Loop: Header=BB0_43 Depth=2 - ld.w $a7, $sp, 228 - ld.w $a3, $sp, 304 + ld.w $a7, $sp, 244 + ld.w $a3, $sp, 320 addi.w $a2, $s2, 0 - addi.d $a4, $sp, 256 - addi.d $a5, $sp, 240 - addi.d $a6, $sp, 232 + addi.d $a4, $sp, 272 + addi.d $a5, $sp, 256 + addi.d $a6, $sp, 248 move $a0, $s4 move $a1, $s5 pcaddu18i $ra, %call36(form_predictions) jirl $ra, $ra, 0 ori $t1, $zero, 3 - ld.d $a3, $sp, 216 # 8-byte Folded Reload + ld.d $a3, $sp, 232 # 8-byte Folded Reload ldptr.w $a0, $s0, 3148 beq $a0, $s7, .LBB0_176 .LBB0_174: # in Loop: Header=BB0_43 Depth=2 @@ -1562,7 +1562,7 @@ Decode_Picture: # @Decode_Picture addi.d $s8, $s8, 128 addi.w $s1, $s1, 1 addi.d $s2, $s2, 4 - ld.d $a3, $sp, 216 # 8-byte Folded Reload + ld.d $a3, $sp, 232 # 8-byte Folded Reload bge $s0, $a0, .LBB0_42 .LBB0_180: # Parent Loop BB0_35 Depth=1 # Parent Loop BB0_43 Depth=2 @@ -1576,7 +1576,7 @@ Decode_Picture: # @Decode_Picture # %bb.182: # %vector.body36 # in 
Loop: Header=BB0_180 Depth=3 add.d $a0, $a3, $s8 - ld.d $a1, $sp, 200 # 8-byte Folded Reload + ld.d $a1, $sp, 216 # 8-byte Folded Reload add.d $a1, $a1, $s8 vld $vr0, $a0, -126 vld $vr1, $a0, -110 @@ -1625,28 +1625,27 @@ Decode_Picture: # @Decode_Picture ld.d $a3, $a1, -118 vinsgr2vr.d $vr0, $a2, 0 vinsgr2vr.d $vr1, $a3, 0 - vld $vr7, $sp, 160 # 16-byte Folded Reload - vmax.h $vr0, $vr0, $vr7 - vmax.h $vr1, $vr1, $vr7 + vld $vr6, $sp, 192 # 16-byte Folded Reload + vmax.h $vr0, $vr0, $vr6 + vmax.h $vr1, $vr1, $vr6 ld.d $a2, $a1, -110 - vld $vr8, $sp, 144 # 16-byte Folded Reload - vmin.h $vr0, $vr0, $vr8 - vmin.h $vr1, $vr1, $vr8 - vld $vr6, $sp, 176 # 16-byte Folded Reload - vilvl.h $vr2, $vr6, $vr0 + vld $vr7, $sp, 176 # 16-byte Folded Reload + vmin.h $vr0, $vr0, $vr7 + vmin.h $vr1, $vr1, $vr7 + vsllwil.wu.hu $vr2, $vr0, 0 vinsgr2vr.d $vr3, $a2, 0 ld.d $a2, $a1, -102 - vilvl.h $vr4, $vr6, $vr1 + vsllwil.wu.hu $vr4, $vr1, 0 vpackev.d $vr0, $vr1, $vr0 vst $vr0, $a1, -126 vinsgr2vr.d $vr0, $a2, 0 - vmax.h $vr1, $vr3, $vr7 - vmax.h $vr0, $vr0, $vr7 - vmin.h $vr1, $vr1, $vr8 - vmin.h $vr0, $vr0, $vr8 + vmax.h $vr1, $vr3, $vr6 + vmax.h $vr0, $vr0, $vr6 + vmin.h $vr1, $vr1, $vr7 + vmin.h $vr0, $vr0, $vr7 ld.d $a2, $a1, -94 - vilvl.h $vr3, $vr6, $vr1 - vilvl.h $vr5, $vr6, $vr0 + vsllwil.wu.hu $vr3, $vr1, 0 + vsllwil.wu.hu $vr5, $vr0, 0 vpackev.d $vr0, $vr0, $vr1 vinsgr2vr.d $vr1, $a2, 0 ld.d $a2, $a1, -86 @@ -1654,13 +1653,13 @@ Decode_Picture: # @Decode_Picture vadd.w $vr0, $vr2, $vr3 vadd.w $vr2, $vr4, $vr5 vinsgr2vr.d $vr3, $a2, 0 - vmax.h $vr1, $vr1, $vr7 - vmax.h $vr3, $vr3, $vr7 - vmin.h $vr1, $vr1, $vr8 - vmin.h $vr3, $vr3, $vr8 + vmax.h $vr1, $vr1, $vr6 + vmax.h $vr3, $vr3, $vr6 + vmin.h $vr1, $vr1, $vr7 + vmin.h $vr3, $vr3, $vr7 ld.d $a2, $a1, -78 - vilvl.h $vr4, $vr6, $vr1 - vilvl.h $vr5, $vr6, $vr3 + vsllwil.wu.hu $vr4, $vr1, 0 + vsllwil.wu.hu $vr5, $vr3, 0 vpackev.d $vr1, $vr3, $vr1 vinsgr2vr.d $vr3, $a2, 0 ld.d $a2, $a1, -70 @@ -1668,13 +1667,13 @@ Decode_Picture: # @Decode_Picture vadd.w $vr0, $vr0, $vr4 vadd.w $vr1, $vr2, $vr5 vinsgr2vr.d $vr2, $a2, 0 - vmax.h $vr3, $vr3, $vr7 - vmax.h $vr2, $vr2, $vr7 - vmin.h $vr3, $vr3, $vr8 - vmin.h $vr2, $vr2, $vr8 + vmax.h $vr3, $vr3, $vr6 + vmax.h $vr2, $vr2, $vr6 + vmin.h $vr3, $vr3, $vr7 + vmin.h $vr2, $vr2, $vr7 ld.d $a2, $a1, -62 - vilvl.h $vr4, $vr6, $vr3 - vilvl.h $vr5, $vr6, $vr2 + vsllwil.wu.hu $vr4, $vr3, 0 + vsllwil.wu.hu $vr5, $vr2, 0 vpackev.d $vr2, $vr2, $vr3 vinsgr2vr.d $vr3, $a2, 0 ld.d $a2, $a1, -54 @@ -1682,13 +1681,13 @@ Decode_Picture: # @Decode_Picture vadd.w $vr0, $vr0, $vr4 vadd.w $vr1, $vr1, $vr5 vinsgr2vr.d $vr2, $a2, 0 - vmax.h $vr3, $vr3, $vr7 - vmax.h $vr2, $vr2, $vr7 - vmin.h $vr3, $vr3, $vr8 - vmin.h $vr2, $vr2, $vr8 + vmax.h $vr3, $vr3, $vr6 + vmax.h $vr2, $vr2, $vr6 + vmin.h $vr3, $vr3, $vr7 + vmin.h $vr2, $vr2, $vr7 ld.d $a2, $a1, -46 - vilvl.h $vr4, $vr6, $vr3 - vilvl.h $vr5, $vr6, $vr2 + vsllwil.wu.hu $vr4, $vr3, 0 + vsllwil.wu.hu $vr5, $vr2, 0 vpackev.d $vr2, $vr2, $vr3 vinsgr2vr.d $vr3, $a2, 0 ld.d $a2, $a1, -38 @@ -1696,13 +1695,13 @@ Decode_Picture: # @Decode_Picture vadd.w $vr0, $vr0, $vr4 vadd.w $vr1, $vr1, $vr5 vinsgr2vr.d $vr2, $a2, 0 - vmax.h $vr3, $vr3, $vr7 - vmax.h $vr2, $vr2, $vr7 - vmin.h $vr3, $vr3, $vr8 - vmin.h $vr2, $vr2, $vr8 + vmax.h $vr3, $vr3, $vr6 + vmax.h $vr2, $vr2, $vr6 + vmin.h $vr3, $vr3, $vr7 + vmin.h $vr2, $vr2, $vr7 ld.d $a2, $a1, -30 - vilvl.h $vr4, $vr6, $vr3 - vilvl.h $vr5, $vr6, $vr2 + vsllwil.wu.hu $vr4, $vr3, 0 + vsllwil.wu.hu $vr5, $vr2, 0 vpackev.d $vr2, $vr2, $vr3 
vinsgr2vr.d $vr3, $a2, 0 ld.d $a2, $a1, -22 @@ -1710,13 +1709,13 @@ Decode_Picture: # @Decode_Picture vadd.w $vr0, $vr0, $vr4 vadd.w $vr1, $vr1, $vr5 vinsgr2vr.d $vr2, $a2, 0 - vmax.h $vr3, $vr3, $vr7 - vmax.h $vr2, $vr2, $vr7 - vmin.h $vr3, $vr3, $vr8 - vmin.h $vr2, $vr2, $vr8 + vmax.h $vr3, $vr3, $vr6 + vmax.h $vr2, $vr2, $vr6 + vmin.h $vr3, $vr3, $vr7 + vmin.h $vr2, $vr2, $vr7 ld.d $a2, $a1, -14 - vilvl.h $vr4, $vr6, $vr3 - vilvl.h $vr5, $vr6, $vr2 + vsllwil.wu.hu $vr4, $vr3, 0 + vsllwil.wu.hu $vr5, $vr2, 0 vpackev.d $vr2, $vr2, $vr3 vinsgr2vr.d $vr3, $a2, 0 ld.d $a2, $a1, -6 @@ -1724,12 +1723,12 @@ Decode_Picture: # @Decode_Picture vadd.w $vr0, $vr0, $vr4 vadd.w $vr1, $vr1, $vr5 vinsgr2vr.d $vr2, $a2, 0 - vmax.h $vr3, $vr3, $vr7 - vmax.h $vr2, $vr2, $vr7 - vmin.h $vr3, $vr3, $vr8 - vmin.h $vr2, $vr2, $vr8 - vilvl.h $vr4, $vr6, $vr3 - vilvl.h $vr5, $vr6, $vr2 + vmax.h $vr3, $vr3, $vr6 + vmax.h $vr2, $vr2, $vr6 + vmin.h $vr3, $vr3, $vr7 + vmin.h $vr2, $vr2, $vr7 + vsllwil.wu.hu $vr4, $vr3, 0 + vsllwil.wu.hu $vr5, $vr2, 0 vpackev.d $vr2, $vr2, $vr3 vadd.w $vr0, $vr0, $vr4 vadd.w $vr1, $vr1, $vr5 @@ -1759,7 +1758,7 @@ Decode_Picture: # @Decode_Picture ori $t1, $zero, 3 bltu $t1, $s0, .LBB0_192 .LBB0_188: # in Loop: Header=BB0_180 Depth=3 - ld.d $a0, $sp, 208 # 8-byte Folded Reload + ld.d $a0, $sp, 224 # 8-byte Folded Reload ld.w $a3, $a0, 0 pcalau12i $a0, %got_pc_hi20(current_frame) ld.d $a0, $a0, %got_pc_lo12(current_frame) @@ -1770,7 +1769,7 @@ Decode_Picture: # @Decode_Picture bne $a3, $t1, .LBB0_197 # %bb.189: # in Loop: Header=BB0_180 Depth=3 slli.d $a2, $a2, 3 - ld.d $a3, $sp, 136 # 8-byte Folded Reload + ld.d $a3, $sp, 168 # 8-byte Folded Reload beqz $a3, .LBB0_204 # %bb.190: # in Loop: Header=BB0_180 Depth=3 bstrpick.d $a3, $s1, 31, 1 @@ -1795,7 +1794,7 @@ Decode_Picture: # @Decode_Picture sltu $a0, $zero, $a0 sra.w $a0, $s4, $a0 addi.d $a2, $a4, -1 - ld.d $a3, $sp, 208 # 8-byte Folded Reload + ld.d $a3, $sp, 224 # 8-byte Folded Reload ld.w $a5, $a3, 0 sltui $a2, $a2, 1 sra.w $a3, $s5, $a2 @@ -1809,7 +1808,7 @@ Decode_Picture: # @Decode_Picture pcalau12i $a1, %got_pc_hi20(Chroma_Width) ld.d $a1, $a1, %got_pc_lo12(Chroma_Width) ld.w $a1, $a1, 0 - ld.d $a6, $sp, 136 # 8-byte Folded Reload + ld.d $a6, $sp, 168 # 8-byte Folded Reload beqz $a6, .LBB0_199 # %bb.194: # in Loop: Header=BB0_180 Depth=3 ori $a6, $zero, 1 @@ -2273,7 +2272,7 @@ Decode_Picture: # @Decode_Picture .p2align 4, , 16 .LBB0_205: # %decode_macroblock.exit.thread.i.i # in Loop: Header=BB0_35 Depth=1 - ld.d $s0, $sp, 200 # 8-byte Folded Reload + ld.d $s0, $sp, 216 # 8-byte Folded Reload ori $s7, $zero, 1 b .LBB0_34 .LBB0_206: @@ -2282,8 +2281,8 @@ Decode_Picture: # @Decode_Picture ld.w $a0, $a0, 0 pcalau12i $a1, %pc_hi20(frame_reorder.Oldref_progressive_frame) st.w $a0, $a1, %pc_lo12(frame_reorder.Oldref_progressive_frame) - ld.d $fp, $sp, 32 # 8-byte Folded Reload - ld.d $a0, $sp, 208 # 8-byte Folded Reload + ld.d $fp, $sp, 40 # 8-byte Folded Reload + ld.d $a0, $sp, 224 # 8-byte Folded Reload ld.w $a0, $a0, 0 ori $a1, $zero, 3 bne $a0, $a1, .LBB0_208 @@ -2297,7 +2296,7 @@ Decode_Picture: # @Decode_Picture pcalau12i $s1, %pc_hi20(frame_reorder.Newref_progressive_frame) st.w $a0, $s1, %pc_lo12(frame_reorder.Newref_progressive_frame) st.w $a1, $fp, 0 - ld.d $a0, $sp, 24 # 8-byte Folded Reload + ld.d $a0, $sp, 32 # 8-byte Folded Reload addi.w $a1, $a0, -1 pcalau12i $a0, %got_pc_hi20(forward_reference_frame) ld.d $a0, $a0, %got_pc_lo12(forward_reference_frame) @@ -2305,9 +2304,9 @@ Decode_Picture: # @Decode_Picture jirl 
$ra, $ra, 0 ld.w $a0, $s1, %pc_lo12(frame_reorder.Newref_progressive_frame) st.w $a0, $fp, 0 - ld.d $fp, $sp, 32 # 8-byte Folded Reload + ld.d $fp, $sp, 40 # 8-byte Folded Reload st.w $a0, $s0, %pc_lo12(frame_reorder.Oldref_progressive_frame) - ld.d $a0, $sp, 208 # 8-byte Folded Reload + ld.d $a0, $sp, 224 # 8-byte Folded Reload ld.w $a0, $a0, 0 ori $a1, $zero, 3 beq $a0, $a1, .LBB0_210 @@ -2317,18 +2316,18 @@ Decode_Picture: # @Decode_Picture .LBB0_209: # %frame_reorder.exit.thread st.w $a1, $fp, 0 .LBB0_210: - ld.d $s8, $sp, 328 # 8-byte Folded Reload - ld.d $s7, $sp, 336 # 8-byte Folded Reload - ld.d $s6, $sp, 344 # 8-byte Folded Reload - ld.d $s5, $sp, 352 # 8-byte Folded Reload - ld.d $s4, $sp, 360 # 8-byte Folded Reload - ld.d $s3, $sp, 368 # 8-byte Folded Reload - ld.d $s2, $sp, 376 # 8-byte Folded Reload - ld.d $s1, $sp, 384 # 8-byte Folded Reload - ld.d $s0, $sp, 392 # 8-byte Folded Reload - ld.d $fp, $sp, 400 # 8-byte Folded Reload - ld.d $ra, $sp, 408 # 8-byte Folded Reload - addi.d $sp, $sp, 416 + ld.d $s8, $sp, 344 # 8-byte Folded Reload + ld.d $s7, $sp, 352 # 8-byte Folded Reload + ld.d $s6, $sp, 360 # 8-byte Folded Reload + ld.d $s5, $sp, 368 # 8-byte Folded Reload + ld.d $s4, $sp, 376 # 8-byte Folded Reload + ld.d $s3, $sp, 384 # 8-byte Folded Reload + ld.d $s2, $sp, 392 # 8-byte Folded Reload + ld.d $s1, $sp, 400 # 8-byte Folded Reload + ld.d $s0, $sp, 408 # 8-byte Folded Reload + ld.d $fp, $sp, 416 # 8-byte Folded Reload + ld.d $ra, $sp, 424 # 8-byte Folded Reload + addi.d $sp, $sp, 432 ret .LBB0_211: pcalau12i $a0, %got_pc_hi20(Quiet_Flag) diff --git a/results/MultiSource/Benchmarks/mediabench/mpeg2/mpeg2dec/CMakeFiles/mpeg2decode.dir/recon.s b/results/MultiSource/Benchmarks/mediabench/mpeg2/mpeg2dec/CMakeFiles/mpeg2decode.dir/recon.s index ccebcf29..a18a6d9d 100644 --- a/results/MultiSource/Benchmarks/mediabench/mpeg2/mpeg2dec/CMakeFiles/mpeg2decode.dir/recon.s +++ b/results/MultiSource/Benchmarks/mediabench/mpeg2/mpeg2dec/CMakeFiles/mpeg2decode.dir/recon.s @@ -760,7 +760,7 @@ form_component_prediction: # @form_component_prediction st.d $s2, $sp, 0 # 8-byte Folded Spill ld.d $t1, $sp, 32 ld.d $t7, $sp, 40 - ld.d $t6, $sp, 48 + ld.d $t4, $sp, 48 srai.d $t3, $t1, 1 srli.d $t0, $t7, 1 add.d $t0, $t0, $a7 @@ -769,14 +769,14 @@ form_component_prediction: # @form_component_prediction add.d $t0, $t0, $a6 add.d $t0, $t0, $t3 mul.w $t5, $a7, $a2 - add.d $t4, $a1, $t5 + add.d $t6, $a1, $t5 or $a7, $t7, $t1 andi $t8, $a7, 1 - add.d $a7, $t4, $a6 + add.d $a7, $t6, $a6 bnez $t8, .LBB2_13 # %bb.1: bstrpick.d $a2, $a4, 31, 0 - beqz $t6, .LBB2_50 + beqz $t4, .LBB2_50 # %bb.2: # %.preheader210.preheader move $t1, $zero addi.w $t4, $a5, -1 @@ -879,7 +879,7 @@ form_component_prediction: # @form_component_prediction beqz $t1, .LBB2_27 # %bb.15: bstrpick.d $t1, $a4, 31, 0 - beqz $t6, .LBB2_73 + beqz $t4, .LBB2_73 # %bb.16: # %.preheader216.preheader move $t4, $zero addi.w $t6, $a5, -1 @@ -1026,7 +1026,7 @@ form_component_prediction: # @form_component_prediction andi $t1, $a2, 12 andi $t2, $a2, 28 ori $t3, $zero, 16 - bnez $t6, .LBB2_31 + bnez $t4, .LBB2_31 b .LBB2_41 .p2align 4, , 16 .LBB2_30: # %.loopexit617 @@ -1195,7 +1195,7 @@ form_component_prediction: # @form_component_prediction move $a1, $zero add.d $a0, $a0, $t3 add.d $a0, $a0, $t2 - sub.d $a0, $t4, $a0 + sub.d $a0, $t6, $a0 andi $a6, $a2, 16 andi $t1, $a2, 12 andi $t2, $a2, 28 @@ -1257,33 +1257,33 @@ form_component_prediction: # @form_component_prediction b .LBB2_51 .LBB2_61: bstrpick.d $t1, $a4, 31, 0 - addi.w $t4, $a5, -1 - 
mul.d $t7, $a3, $t4 + addi.w $t6, $a5, -1 + mul.d $t7, $a3, $t6 add.d $a1, $a1, $t7 add.d $a1, $a1, $a6 add.d $a1, $a1, $t5 - add.d $t4, $a1, $t1 + add.d $t5, $a1, $t1 add.d $a1, $a0, $t3 add.d $a1, $a1, $a6 - add.d $t5, $a1, $a2 - add.d $t5, $t5, $t2 + add.d $t6, $a1, $a2 + add.d $t6, $t6, $t2 add.d $a1, $a1, $t7 add.d $a1, $a1, $t2 add.d $a1, $a1, $t1 addi.d $t7, $a1, 1 add.d $t8, $t7, $a2 move $a1, $zero - beqz $t6, .LBB2_84 + beqz $t4, .LBB2_84 # %bb.62: # %iter.check394.preheader - sltu $t6, $a7, $t8 - sltu $t5, $t5, $t4 - and $t5, $t6, $t5 + sltu $t4, $a7, $t8 + sltu $t6, $t6, $t5 + and $t4, $t4, $t6 sltu $t6, $a7, $t7 - sltu $t4, $t0, $t4 - and $t4, $t6, $t4 + sltu $t5, $t0, $t5 + and $t5, $t6, $t5 slti $t6, $a3, 0 - or $t4, $t4, $t6 - or $t4, $t5, $t4 + or $t5, $t5, $t6 + or $t4, $t4, $t5 andi $t5, $t1, 16 andi $t6, $t1, 12 add.d $t3, $t3, $a6 @@ -1292,7 +1292,6 @@ form_component_prediction: # @form_component_prediction add.d $t2, $t3, $t2 add.d $a0, $a0, $t2 ori $t2, $zero, 16 - vrepli.b $vr0, 0 b .LBB2_64 .p2align 4, , 16 .LBB2_63: # %.loopexit615 @@ -1320,40 +1319,45 @@ form_component_prediction: # @form_component_prediction .p2align 4, , 16 .LBB2_68: # %vector.ph397 # in Loop: Header=BB2_64 Depth=1 - vld $vr1, $a7, 0 + vld $vr0, $a7, 0 add.d $t3, $t0, $a2 + vbsrl.v $vr1, $vr0, 8 vld $vr2, $t0, 0 - vilvh.b $vr3, $vr0, $vr1 - vilvl.b $vr1, $vr0, $vr1 - vld $vr4, $t0, 1 - vilvl.b $vr5, $vr0, $vr2 - vilvh.b $vr2, $vr0, $vr2 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.hu.bu $vr0, $vr0, 0 + vld $vr3, $t0, 1 + vsllwil.hu.bu $vr4, $vr2, 0 + vbsrl.v $vr2, $vr2, 8 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.hu.bu $vr5, $vr3, 0 vld $vr6, $t3, 0 - vilvl.b $vr7, $vr0, $vr4 - vld $vr8, $t3, 1 - vilvh.b $vr4, $vr0, $vr4 - vilvh.b $vr9, $vr0, $vr6 - vilvl.b $vr6, $vr0, $vr6 - vilvl.b $vr10, $vr0, $vr8 - vilvh.b $vr8, $vr0, $vr8 - vadd.h $vr2, $vr2, $vr4 - vadd.h $vr4, $vr5, $vr7 - vadd.h $vr4, $vr4, $vr6 - vadd.h $vr2, $vr2, $vr9 + vbsrl.v $vr3, $vr3, 8 + vsllwil.hu.bu $vr3, $vr3, 0 + vld $vr7, $t3, 1 + vbsrl.v $vr8, $vr6, 8 + vsllwil.hu.bu $vr8, $vr8, 0 + vsllwil.hu.bu $vr6, $vr6, 0 + vsllwil.hu.bu $vr9, $vr7, 0 + vbsrl.v $vr7, $vr7, 8 + vsllwil.hu.bu $vr7, $vr7, 0 + vadd.h $vr2, $vr2, $vr3 + vadd.h $vr3, $vr4, $vr5 + vadd.h $vr3, $vr3, $vr6 vadd.h $vr2, $vr2, $vr8 - vadd.h $vr4, $vr4, $vr10 - vaddi.hu $vr4, $vr4, 2 + vadd.h $vr2, $vr2, $vr7 + vadd.h $vr3, $vr3, $vr9 + vaddi.hu $vr3, $vr3, 2 vaddi.hu $vr2, $vr2, 2 vsrli.h $vr2, $vr2, 2 - vsrli.h $vr4, $vr4, 2 - vadd.h $vr1, $vr1, $vr4 + vsrli.h $vr3, $vr3, 2 + vadd.h $vr0, $vr0, $vr3 + vaddi.hu $vr0, $vr0, 1 + vsrli.h $vr0, $vr0, 1 + vadd.h $vr1, $vr1, $vr2 vaddi.hu $vr1, $vr1, 1 vsrli.h $vr1, $vr1, 1 - vadd.h $vr2, $vr3, $vr2 - vaddi.hu $vr2, $vr2, 1 - vsrli.h $vr2, $vr2, 1 - vpickev.b $vr1, $vr2, $vr1 - vst $vr1, $a7, 0 + vpickev.b $vr0, $vr1, $vr0 + vst $vr0, $a7, 0 beq $t5, $t1, .LBB2_63 # %bb.69: # %vec.epilog.iter.check410 # in Loop: Header=BB2_64 Depth=1 @@ -1366,34 +1370,34 @@ form_component_prediction: # @form_component_prediction # => This Inner Loop Header: Depth=2 ldx.w $t7, $a7, $t3 add.d $t8, $a7, $t3 - vinsgr2vr.w $vr1, $t7, 0 + vinsgr2vr.w $vr0, $t7, 0 ldx.w $t7, $t0, $t3 add.d $fp, $t0, $t3 ld.w $fp, $fp, 1 - vilvl.b $vr1, $vr0, $vr1 - vinsgr2vr.w $vr2, $t7, 0 - vilvl.b $vr2, $vr0, $vr2 - vinsgr2vr.w $vr3, $fp, 0 + vsllwil.hu.bu $vr0, $vr0, 0 + vinsgr2vr.w $vr1, $t7, 0 + vsllwil.hu.bu $vr1, $vr1, 0 + vinsgr2vr.w $vr2, $fp, 0 ldx.w $t7, $a0, $t3 add.d $fp, $a0, $t3 ld.w $fp, $fp, 1 - vilvl.b $vr3, $vr0, $vr3 - vinsgr2vr.w $vr4, $t7, 0 
- vilvl.b $vr4, $vr0, $vr4 - vinsgr2vr.w $vr5, $fp, 0 - vilvl.b $vr5, $vr0, $vr5 - vadd.h $vr2, $vr2, $vr3 - vadd.h $vr2, $vr2, $vr4 - vadd.h $vr2, $vr2, $vr5 - vaddi.hu $vr2, $vr2, 2 - vsrli.h $vr2, $vr2, 2 - vor.v $vr3, $vr1, $vr2 - vxor.v $vr1, $vr1, $vr2 - vsrli.h $vr1, $vr1, 1 - vsub.h $vr1, $vr3, $vr1 - vpickev.b $vr1, $vr1, $vr1 + vsllwil.hu.bu $vr2, $vr2, 0 + vinsgr2vr.w $vr3, $t7, 0 + vsllwil.hu.bu $vr3, $vr3, 0 + vinsgr2vr.w $vr4, $fp, 0 + vsllwil.hu.bu $vr4, $vr4, 0 + vadd.h $vr1, $vr1, $vr2 + vadd.h $vr1, $vr1, $vr3 + vadd.h $vr1, $vr1, $vr4 + vaddi.hu $vr1, $vr1, 2 + vsrli.h $vr1, $vr1, 2 + vor.v $vr2, $vr0, $vr1 + vxor.v $vr0, $vr0, $vr1 + vsrli.h $vr0, $vr0, 1 + vsub.h $vr0, $vr2, $vr0 + vpickev.b $vr0, $vr0, $vr0 addi.d $t3, $t3, 4 - vstelm.w $vr1, $t8, 0, 0 + vstelm.w $vr0, $t8, 0, 0 bne $a6, $t3, .LBB2_70 # %bb.71: # %vec.epilog.middle.block423 # in Loop: Header=BB2_64 Depth=1 @@ -1425,12 +1429,12 @@ form_component_prediction: # @form_component_prediction b .LBB2_63 .LBB2_73: # %.preheader213.preheader move $a1, $zero - add.d $t5, $a0, $t3 - add.d $t5, $t5, $t2 - add.d $t6, $t5, $a2 - sub.d $t6, $t4, $t6 - sub.d $t4, $t4, $t5 - sltui $t5, $t6, 16 + add.d $t4, $a0, $t3 + add.d $t4, $t4, $t2 + add.d $t5, $t4, $a2 + sub.d $t5, $t6, $t5 + sub.d $t4, $t6, $t4 + sltui $t5, $t5, 16 sltui $t4, $t4, 16 or $t4, $t5, $t4 andi $t5, $t1, 16 @@ -1515,15 +1519,15 @@ form_component_prediction: # @form_component_prediction bne $t1, $t7, .LBB2_83 b .LBB2_74 .LBB2_84: # %iter.check443.preheader - sltu $t6, $a7, $t8 - sltu $t5, $t5, $t4 - and $t5, $t6, $t5 + sltu $t4, $a7, $t8 + sltu $t6, $t6, $t5 + and $t4, $t4, $t6 sltu $t6, $a7, $t7 - sltu $t4, $t0, $t4 - and $t4, $t6, $t4 + sltu $t5, $t0, $t5 + and $t5, $t6, $t5 slti $t6, $a3, 0 - or $t4, $t4, $t6 - or $t4, $t5, $t4 + or $t5, $t5, $t6 + or $t4, $t4, $t5 andi $t5, $t1, 16 andi $t6, $t1, 12 add.d $t3, $t3, $a6 @@ -1532,7 +1536,6 @@ form_component_prediction: # @form_component_prediction add.d $t2, $t3, $t2 add.d $a0, $a0, $t2 ori $t2, $zero, 16 - vrepli.b $vr0, 0 b .LBB2_86 .p2align 4, , 16 .LBB2_85: # %.loopexit614 @@ -1560,31 +1563,35 @@ form_component_prediction: # @form_component_prediction .p2align 4, , 16 .LBB2_90: # %vector.ph446 # in Loop: Header=BB2_86 Depth=1 - vld $vr1, $t0, 0 + vld $vr0, $t0, 0 add.d $t3, $t0, $a2 - vld $vr2, $t0, 1 - vilvh.b $vr3, $vr0, $vr1 - vilvl.b $vr1, $vr0, $vr1 + vld $vr1, $t0, 1 + vbsrl.v $vr2, $vr0, 8 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.hu.bu $vr0, $vr0, 0 + vbsrl.v $vr3, $vr1, 8 vld $vr4, $t3, 0 - vilvh.b $vr5, $vr0, $vr2 - vld $vr6, $t3, 1 - vilvl.b $vr2, $vr0, $vr2 - vilvl.b $vr7, $vr0, $vr4 - vilvh.b $vr4, $vr0, $vr4 - vilvh.b $vr8, $vr0, $vr6 - vilvl.b $vr6, $vr0, $vr6 - vadd.h $vr1, $vr1, $vr2 - vadd.h $vr2, $vr3, $vr5 - vadd.h $vr2, $vr2, $vr4 + vsllwil.hu.bu $vr3, $vr3, 0 + vsllwil.hu.bu $vr1, $vr1, 0 + vld $vr5, $t3, 1 + vsllwil.hu.bu $vr6, $vr4, 0 + vbsrl.v $vr4, $vr4, 8 + vsllwil.hu.bu $vr4, $vr4, 0 + vbsrl.v $vr7, $vr5, 8 + vsllwil.hu.bu $vr7, $vr7, 0 + vsllwil.hu.bu $vr5, $vr5, 0 + vadd.h $vr0, $vr0, $vr1 + vadd.h $vr1, $vr2, $vr3 + vadd.h $vr1, $vr1, $vr4 + vadd.h $vr0, $vr0, $vr6 + vadd.h $vr0, $vr0, $vr5 vadd.h $vr1, $vr1, $vr7 - vadd.h $vr1, $vr1, $vr6 - vadd.h $vr2, $vr2, $vr8 - vaddi.hu $vr2, $vr2, 2 vaddi.hu $vr1, $vr1, 2 + vaddi.hu $vr0, $vr0, 2 + vsrli.h $vr0, $vr0, 2 vsrli.h $vr1, $vr1, 2 - vsrli.h $vr2, $vr2, 2 - vpickev.b $vr1, $vr2, $vr1 - vst $vr1, $a7, 0 + vpickev.b $vr0, $vr1, $vr0 + vst $vr0, $a7, 0 beq $t5, $t1, .LBB2_85 # %bb.91: # %vec.epilog.iter.check458 # in 
Loop: Header=BB2_86 Depth=1 @@ -1598,26 +1605,26 @@ form_component_prediction: # @form_component_prediction ldx.w $t7, $t0, $t3 add.d $t8, $t0, $t3 ld.w $t8, $t8, 1 - vinsgr2vr.w $vr1, $t7, 0 - vilvl.b $vr1, $vr0, $vr1 - vinsgr2vr.w $vr2, $t8, 0 + vinsgr2vr.w $vr0, $t7, 0 + vsllwil.hu.bu $vr0, $vr0, 0 + vinsgr2vr.w $vr1, $t8, 0 ldx.w $t7, $a0, $t3 add.d $t8, $a0, $t3 ld.w $t8, $t8, 1 - vilvl.b $vr2, $vr0, $vr2 - vinsgr2vr.w $vr3, $t7, 0 - vilvl.b $vr3, $vr0, $vr3 - vinsgr2vr.w $vr4, $t8, 0 - vilvl.b $vr4, $vr0, $vr4 - vadd.h $vr1, $vr1, $vr2 - vadd.h $vr1, $vr1, $vr3 - vadd.h $vr1, $vr1, $vr4 - vaddi.hu $vr1, $vr1, 2 - vsrli.h $vr1, $vr1, 2 - vpickev.b $vr1, $vr1, $vr1 + vsllwil.hu.bu $vr1, $vr1, 0 + vinsgr2vr.w $vr2, $t7, 0 + vsllwil.hu.bu $vr2, $vr2, 0 + vinsgr2vr.w $vr3, $t8, 0 + vsllwil.hu.bu $vr3, $vr3, 0 + vadd.h $vr0, $vr0, $vr1 + vadd.h $vr0, $vr0, $vr2 + vadd.h $vr0, $vr0, $vr3 + vaddi.hu $vr0, $vr0, 2 + vsrli.h $vr0, $vr0, 2 + vpickev.b $vr0, $vr0, $vr0 add.d $t7, $a7, $t3 addi.d $t3, $t3, 4 - vstelm.w $vr1, $t7, 0, 0 + vstelm.w $vr0, $t7, 0, 0 bne $a6, $t3, .LBB2_92 # %bb.93: # %vec.epilog.middle.block470 # in Loop: Header=BB2_86 Depth=1 diff --git a/results/MultiSource/Benchmarks/mediabench/mpeg2/mpeg2dec/CMakeFiles/mpeg2decode.dir/spatscal.s b/results/MultiSource/Benchmarks/mediabench/mpeg2/mpeg2dec/CMakeFiles/mpeg2decode.dir/spatscal.s index 3eebeb96..c514aba8 100644 --- a/results/MultiSource/Benchmarks/mediabench/mpeg2/mpeg2dec/CMakeFiles/mpeg2decode.dir/spatscal.s +++ b/results/MultiSource/Benchmarks/mediabench/mpeg2/mpeg2dec/CMakeFiles/mpeg2decode.dir/spatscal.s @@ -584,7 +584,6 @@ Make_Spatial_Prediction_Frame: # @Make_Spatial_Prediction_Frame slli.d $t2, $fp, 1 ori $t3, $zero, 16 ori $t4, $zero, 8 - vrepli.b $vr0, 0 move $t5, $a4 b .LBB3_5 .p2align 4, , 16 @@ -653,8 +652,8 @@ Make_Spatial_Prediction_Frame: # @Make_Spatial_Prediction_Frame bnez $s3, .LBB3_7 # %bb.11: # %vector.ph # in Loop: Header=BB3_5 Depth=1 - vreplgr2vr.h $vr1, $t8 - vreplgr2vr.h $vr2, $t7 + vreplgr2vr.h $vr0, $t8 + vreplgr2vr.h $vr1, $t7 move $s3, $t5 move $s5, $t1 .p2align 4, , 16 @@ -663,13 +662,13 @@ Make_Spatial_Prediction_Frame: # @Make_Spatial_Prediction_Frame # => This Inner Loop Header: Depth=2 ld.d $ra, $s2, 0 ldx.d $s8, $s2, $t6 - vinsgr2vr.d $vr3, $ra, 0 - vilvl.b $vr3, $vr0, $vr3 - vinsgr2vr.d $vr4, $s8, 0 - vilvl.b $vr4, $vr0, $vr4 - vmul.h $vr4, $vr2, $vr4 - vmadd.h $vr4, $vr1, $vr3 - vst $vr4, $s3, 0 + vinsgr2vr.d $vr2, $ra, 0 + vsllwil.hu.bu $vr2, $vr2, 0 + vinsgr2vr.d $vr3, $s8, 0 + vsllwil.hu.bu $vr3, $vr3, 0 + vmul.h $vr3, $vr1, $vr3 + vmadd.h $vr3, $vr0, $vr2 + vst $vr3, $s3, 0 addi.d $s5, $s5, -8 addi.d $s3, $s3, 16 addi.d $s2, $s2, 8 @@ -721,7 +720,6 @@ Make_Spatial_Prediction_Frame: # @Make_Spatial_Prediction_Frame slli.d $t3, $fp, 1 ori $t4, $zero, 16 ori $t5, $zero, 8 - vrepli.b $vr0, 0 move $t6, $a4 b .LBB3_20 .p2align 4, , 16 @@ -790,8 +788,8 @@ Make_Spatial_Prediction_Frame: # @Make_Spatial_Prediction_Frame bnez $s3, .LBB3_22 # %bb.26: # %vector.ph228 # in Loop: Header=BB3_20 Depth=1 - vreplgr2vr.h $vr1, $s0 - vreplgr2vr.h $vr2, $t8 + vreplgr2vr.h $vr0, $s0 + vreplgr2vr.h $vr1, $t8 move $s3, $t6 move $s5, $t2 .p2align 4, , 16 @@ -800,13 +798,13 @@ Make_Spatial_Prediction_Frame: # @Make_Spatial_Prediction_Frame # => This Inner Loop Header: Depth=2 ld.d $s8, $s2, 0 ldx.d $ra, $s2, $t7 - vinsgr2vr.d $vr3, $s8, 0 - vilvl.b $vr3, $vr0, $vr3 - vinsgr2vr.d $vr4, $ra, 0 - vilvl.b $vr4, $vr0, $vr4 - vmul.h $vr4, $vr2, $vr4 - vmadd.h $vr4, $vr1, $vr3 - vst $vr4, $s3, 0 + vinsgr2vr.d 
$vr2, $s8, 0 + vsllwil.hu.bu $vr2, $vr2, 0 + vinsgr2vr.d $vr3, $ra, 0 + vsllwil.hu.bu $vr3, $vr3, 0 + vmul.h $vr3, $vr1, $vr3 + vmadd.h $vr3, $vr0, $vr2 + vst $vr3, $s3, 0 addi.d $s5, $s5, -8 addi.d $s3, $s3, 16 addi.d $s2, $s2, 8 @@ -860,7 +858,6 @@ Make_Spatial_Prediction_Frame: # @Make_Spatial_Prediction_Frame slli.d $t2, $fp, 2 ori $s3, $zero, 16 ori $t4, $zero, 8 - vrepli.b $vr0, 0 move $t5, $a4 b .LBB3_33 .p2align 4, , 16 @@ -929,8 +926,8 @@ Make_Spatial_Prediction_Frame: # @Make_Spatial_Prediction_Frame bnez $s5, .LBB3_35 # %bb.39: # %vector.ph288 # in Loop: Header=BB3_33 Depth=1 - vreplgr2vr.h $vr1, $t8 - vreplgr2vr.h $vr2, $t7 + vreplgr2vr.h $vr0, $t8 + vreplgr2vr.h $vr1, $t7 move $s5, $t5 move $ra, $t1 .p2align 4, , 16 @@ -939,13 +936,13 @@ Make_Spatial_Prediction_Frame: # @Make_Spatial_Prediction_Frame # => This Inner Loop Header: Depth=2 ld.d $s8, $s2, 0 ldx.d $t3, $s2, $t6 - vinsgr2vr.d $vr3, $s8, 0 - vilvl.b $vr3, $vr0, $vr3 - vinsgr2vr.d $vr4, $t3, 0 - vilvl.b $vr4, $vr0, $vr4 - vmul.h $vr4, $vr2, $vr4 - vmadd.h $vr4, $vr1, $vr3 - vst $vr4, $s5, 0 + vinsgr2vr.d $vr2, $s8, 0 + vsllwil.hu.bu $vr2, $vr2, 0 + vinsgr2vr.d $vr3, $t3, 0 + vsllwil.hu.bu $vr3, $vr3, 0 + vmul.h $vr3, $vr1, $vr3 + vmadd.h $vr3, $vr0, $vr2 + vst $vr3, $s5, 0 addi.d $ra, $ra, -8 addi.d $s5, $s5, 16 addi.d $s2, $s2, 8 @@ -983,7 +980,6 @@ Make_Spatial_Prediction_Frame: # @Make_Spatial_Prediction_Frame slli.d $t2, $fp, 1 ori $t3, $zero, 16 ori $t4, $zero, 8 - vrepli.b $vr0, 0 move $t5, $a4 b .LBB3_46 .p2align 4, , 16 @@ -1052,8 +1048,8 @@ Make_Spatial_Prediction_Frame: # @Make_Spatial_Prediction_Frame bnez $s3, .LBB3_48 # %bb.52: # %vector.ph258 # in Loop: Header=BB3_46 Depth=1 - vreplgr2vr.h $vr1, $t8 - vreplgr2vr.h $vr2, $t7 + vreplgr2vr.h $vr0, $t8 + vreplgr2vr.h $vr1, $t7 move $s3, $t5 move $s5, $t1 .p2align 4, , 16 @@ -1062,13 +1058,13 @@ Make_Spatial_Prediction_Frame: # @Make_Spatial_Prediction_Frame # => This Inner Loop Header: Depth=2 ld.d $s8, $s2, 0 ldx.d $ra, $s2, $t6 - vinsgr2vr.d $vr3, $s8, 0 - vilvl.b $vr3, $vr0, $vr3 - vinsgr2vr.d $vr4, $ra, 0 - vilvl.b $vr4, $vr0, $vr4 - vmul.h $vr4, $vr2, $vr4 - vmadd.h $vr4, $vr1, $vr3 - vst $vr4, $s3, 0 + vinsgr2vr.d $vr2, $s8, 0 + vsllwil.hu.bu $vr2, $vr2, 0 + vinsgr2vr.d $vr3, $ra, 0 + vsllwil.hu.bu $vr3, $vr3, 0 + vmul.h $vr3, $vr1, $vr3 + vmadd.h $vr3, $vr0, $vr2 + vst $vr3, $s3, 0 addi.d $s5, $s5, -8 addi.d $s3, $s3, 16 addi.d $s2, $s2, 8 @@ -1102,7 +1098,6 @@ Make_Spatial_Prediction_Frame: # @Make_Spatial_Prediction_Frame ori $t3, $zero, 1 ori $t4, $zero, 16 ori $t5, $zero, 8 - vrepli.b $vr0, 0 move $t6, $a2 b .LBB3_59 .p2align 4, , 16 @@ -1173,8 +1168,8 @@ Make_Spatial_Prediction_Frame: # @Make_Spatial_Prediction_Frame bnez $s3, .LBB3_61 # %bb.65: # %vector.ph319 # in Loop: Header=BB3_59 Depth=1 - vreplgr2vr.h $vr1, $s0 - vreplgr2vr.h $vr2, $t8 + vreplgr2vr.h $vr0, $s0 + vreplgr2vr.h $vr1, $t8 move $s5, $t6 move $ra, $t1 .p2align 4, , 16 @@ -1183,13 +1178,13 @@ Make_Spatial_Prediction_Frame: # @Make_Spatial_Prediction_Frame # => This Inner Loop Header: Depth=2 ld.d $s3, $s2, 0 ldx.d $s8, $s2, $t7 - vinsgr2vr.d $vr3, $s3, 0 - vilvl.b $vr3, $vr0, $vr3 - vinsgr2vr.d $vr4, $s8, 0 - vilvl.b $vr4, $vr0, $vr4 - vmul.h $vr4, $vr2, $vr4 - vmadd.h $vr4, $vr1, $vr3 - vst $vr4, $s5, 0 + vinsgr2vr.d $vr2, $s3, 0 + vsllwil.hu.bu $vr2, $vr2, 0 + vinsgr2vr.d $vr3, $s8, 0 + vsllwil.hu.bu $vr3, $vr3, 0 + vmul.h $vr3, $vr1, $vr3 + vmadd.h $vr3, $vr0, $vr2 + vst $vr3, $s5, 0 addi.d $ra, $ra, -8 addi.d $s5, $s5, 16 addi.d $s2, $s2, 8 diff --git 
a/results/MultiSource/Benchmarks/nbench/CMakeFiles/nbench.dir/nbench1.s b/results/MultiSource/Benchmarks/nbench/CMakeFiles/nbench.dir/nbench1.s index fc9fc0e3..5f2d2f02 100644 --- a/results/MultiSource/Benchmarks/nbench/CMakeFiles/nbench.dir/nbench1.s +++ b/results/MultiSource/Benchmarks/nbench/CMakeFiles/nbench.dir/nbench1.s @@ -3653,25 +3653,23 @@ DoAssignIteration: # @DoAssignIteration vinsgr2vr.d $vr3, $a3, 0 vseqi.h $vr3, $vr3, 1 vandn.v $vr2, $vr3, $vr2 - vilvl.h $vr2, $vr2, $vr2 - vslli.w $vr3, $vr2, 16 - vsrai.w $vr3, $vr3, 16 - vpickve2gr.w $a3, $vr3, 0 + vpickve2gr.w $a3, $vr2, 0 andi $a3, $a3, 1 bnez $a3, .LBB10_94 # %bb.87: # %pred.store.continue69 # in Loop: Header=BB10_51 Depth=4 - vpickve2gr.w $a3, $vr3, 1 + vsllwil.w.h $vr2, $vr2, 0 + vpickve2gr.w $a3, $vr2, 1 andi $a3, $a3, 1 bnez $a3, .LBB10_95 .LBB10_88: # %pred.store.continue71 # in Loop: Header=BB10_51 Depth=4 - vpickve2gr.w $a3, $vr3, 2 + vpickve2gr.w $a3, $vr2, 2 andi $a3, $a3, 1 bnez $a3, .LBB10_96 .LBB10_89: # %pred.store.continue73 # in Loop: Header=BB10_51 Depth=4 - vpickve2gr.w $a3, $vr3, 3 + vpickve2gr.w $a3, $vr2, 3 andi $a3, $a3, 1 beqz $a3, .LBB10_91 .LBB10_90: # %pred.store.if74 @@ -3726,7 +3724,8 @@ DoAssignIteration: # @DoAssignIteration ori $a3, $a3, 464 add.d $a3, $sp, $a3 st.h $s2, $a3, 0 - vpickve2gr.w $a3, $vr3, 1 + vsllwil.w.h $vr2, $vr2, 0 + vpickve2gr.w $a3, $vr2, 1 andi $a3, $a3, 1 beqz $a3, .LBB10_88 .LBB10_95: # %pred.store.if70 @@ -3735,7 +3734,7 @@ DoAssignIteration: # @DoAssignIteration ori $a3, $a3, 466 add.d $a3, $sp, $a3 st.h $s2, $a3, 0 - vpickve2gr.w $a3, $vr3, 2 + vpickve2gr.w $a3, $vr2, 2 andi $a3, $a3, 1 beqz $a3, .LBB10_89 .LBB10_96: # %pred.store.if72 @@ -3744,7 +3743,7 @@ DoAssignIteration: # @DoAssignIteration ori $a3, $a3, 468 add.d $a3, $sp, $a3 st.h $s2, $a3, 0 - vpickve2gr.w $a3, $vr3, 3 + vpickve2gr.w $a3, $vr2, 3 andi $a3, $a3, 1 bnez $a3, .LBB10_90 b .LBB10_91 diff --git a/results/MultiSource/Benchmarks/tramp3d-v4/CMakeFiles/tramp3d-v4.dir/tramp3d-v4.s b/results/MultiSource/Benchmarks/tramp3d-v4/CMakeFiles/tramp3d-v4.dir/tramp3d-v4.s index 0d85945d..4e0cd1ed 100644 --- a/results/MultiSource/Benchmarks/tramp3d-v4/CMakeFiles/tramp3d-v4.dir/tramp3d-v4.s +++ b/results/MultiSource/Benchmarks/tramp3d-v4/CMakeFiles/tramp3d-v4.dir/tramp3d-v4.s @@ -158806,18 +158806,14 @@ _ZN18LoopApplyEvaluator8evaluateI16ApplyMultiArgLocI9MultiArg3I5FieldI22UniformR vfmadd.d $vr7, $vr8, $vr0, $vr7 vadd.w $vr8, $vr4, $vr6 vadd.w $vr9, $vr6, $vr5 - vshuf4i.w $vr9, $vr9, 16 - vslli.d $vr9, $vr9, 32 - vsrai.d $vr9, $vr9, 32 + vsllwil.d.w $vr9, $vr9, 0 vpickve2gr.d $a0, $vr9, 0 mul.d $a0, $a0, $a6 vpickve2gr.d $t2, $vr9, 1 mul.d $t2, $t2, $a6 fldx.d $ft1, $t4, $a0 fldx.d $ft2, $t4, $t2 - vshuf4i.w $vr8, $vr8, 16 - vslli.d $vr8, $vr8, 32 - vsrai.d $vr8, $vr8, 32 + vsllwil.d.w $vr8, $vr8, 0 vpickve2gr.d $a0, $vr8, 0 mul.d $a0, $a0, $a6 vpickve2gr.d $t2, $vr8, 1 @@ -160324,18 +160320,14 @@ _ZN18LoopApplyEvaluator8evaluateI16ApplyMultiArgLocI9MultiArg3I5FieldI22UniformR vfmul.d $vr8, $vr8, $vr0 vadd.w $vr11, $vr5, $vr7 vadd.w $vr12, $vr7, $vr6 - vshuf4i.w $vr12, $vr12, 16 - vslli.d $vr12, $vr12, 32 - vsrai.d $vr12, $vr12, 32 + vsllwil.d.w $vr12, $vr12, 0 vpickve2gr.d $t3, $vr12, 0 mul.d $t3, $t3, $s5 vpickve2gr.d $t6, $vr12, 1 mul.d $t6, $t6, $s5 fldx.d $ft4, $t5, $t3 fldx.d $ft5, $t5, $t6 - vshuf4i.w $vr11, $vr11, 16 - vslli.d $vr11, $vr11, 32 - vsrai.d $vr11, $vr11, 32 + vsllwil.d.w $vr11, $vr11, 0 vpickve2gr.d $t3, $vr11, 0 mul.d $t3, $t3, $s5 vpickve2gr.d $t6, $vr11, 1 @@ -186646,9 +186638,7 @@ 
_ZN18LoopApplyEvaluator8evaluateI16ApplyMultiArgLocI9MultiArg3I5FieldI22UniformR vfmul.d $vr8, $vr8, $vr0 vfmadd.d $vr8, $vr9, $vr0, $vr8 vadd.w $vr9, $vr7, $vr5 - vshuf4i.w $vr9, $vr9, 16 - vslli.d $vr9, $vr9, 32 - vsrai.d $vr9, $vr9, 32 + vsllwil.d.w $vr9, $vr9, 0 vpickve2gr.d $a5, $vr9, 0 mul.d $a5, $a5, $a6 vpickve2gr.d $t6, $vr9, 1 @@ -186656,9 +186646,7 @@ _ZN18LoopApplyEvaluator8evaluateI16ApplyMultiArgLocI9MultiArg3I5FieldI22UniformR fldx.d $ft1, $t4, $a5 fldx.d $ft2, $t4, $t6 vadd.w $vr11, $vr7, $vr6 - vshuf4i.w $vr11, $vr11, 16 - vslli.d $vr11, $vr11, 32 - vsrai.d $vr11, $vr11, 32 + vsllwil.d.w $vr11, $vr11, 0 vpickve2gr.d $a5, $vr11, 0 mul.d $a5, $a5, $a6 vpickve2gr.d $t6, $vr11, 1 @@ -188919,11 +188907,11 @@ _ZN18LoopApplyEvaluator8evaluateI16ApplyMultiArgLocI9MultiArg3I5FieldI22UniformR # in Loop: Header=BB873_6 Depth=2 mul.d $a3, $t1, $s2 ld.d $a4, $sp, 256 # 8-byte Folded Reload - add.w $t2, $a4, $a3 + add.w $s8, $a4, $a3 ld.d $a4, $sp, 312 # 8-byte Folded Reload - add.w $a6, $t2, $a4 + add.w $a6, $s8, $a4 ld.d $a4, $sp, 320 # 8-byte Folded Reload - blt $a6, $t2, .LBB873_7 + blt $a6, $s8, .LBB873_7 # %bb.12: # %vector.scevcheck # in Loop: Header=BB873_6 Depth=2 ld.d $a4, $sp, 248 # 8-byte Folded Reload @@ -188937,19 +188925,19 @@ _ZN18LoopApplyEvaluator8evaluateI16ApplyMultiArgLocI9MultiArg3I5FieldI22UniformR ld.d $a4, $sp, 304 # 8-byte Folded Reload mul.d $a6, $a4, $s2 ld.d $a4, $sp, 240 # 8-byte Folded Reload - add.w $t3, $a4, $a6 + add.w $t2, $a4, $a6 ld.d $a4, $sp, 312 # 8-byte Folded Reload - add.w $fp, $t3, $a4 + add.w $t3, $t2, $a4 ld.d $a4, $sp, 320 # 8-byte Folded Reload - blt $fp, $t3, .LBB873_7 + blt $t3, $t2, .LBB873_7 # %bb.14: # %vector.scevcheck # in Loop: Header=BB873_6 Depth=2 - mul.d $s4, $t3, $t4 - add.d $s8, $t5, $s4 + mul.d $s4, $t2, $t4 + add.d $t2, $t5, $s4 ld.d $a4, $sp, 272 # 8-byte Folded Reload - add.d $t3, $s8, $a4 + add.d $t3, $t2, $a4 ld.d $a4, $sp, 320 # 8-byte Folded Reload - bltu $t3, $s8, .LBB873_7 + bltu $t3, $t2, .LBB873_7 # %bb.15: # %vector.scevcheck # in Loop: Header=BB873_6 Depth=2 ld.d $a4, $sp, 224 # 8-byte Folded Reload @@ -188997,9 +188985,9 @@ _ZN18LoopApplyEvaluator8evaluateI16ApplyMultiArgLocI9MultiArg3I5FieldI22UniformR bnez $a3, .LBB873_7 # %bb.20: # %vector.memcheck # in Loop: Header=BB873_6 Depth=2 - alsl.d $a3, $t2, $t0, 3 + alsl.d $a3, $s8, $t0, 3 ld.d $a4, $sp, 232 # 8-byte Folded Reload - alsl.d $a4, $t2, $a4, 3 + alsl.d $a4, $s8, $a4, 3 sltu $a4, $a6, $a4 sltu $a3, $a3, $a7 and $a3, $a4, $a3 @@ -189019,7 +189007,7 @@ _ZN18LoopApplyEvaluator8evaluateI16ApplyMultiArgLocI9MultiArg3I5FieldI22UniformR ld.d $a3, $sp, 216 # 8-byte Folded Reload add.d $a3, $a3, $s4 sltu $a3, $a6, $a3 - sltu $a4, $s8, $a7 + sltu $a4, $t2, $a7 and $a3, $a3, $a4 ld.d $a4, $sp, 320 # 8-byte Folded Reload bnez $a3, .LBB873_7 @@ -189054,9 +189042,7 @@ _ZN18LoopApplyEvaluator8evaluateI16ApplyMultiArgLocI9MultiArg3I5FieldI22UniformR vldx $vr11, $t0, $a6 vfmul.d $vr9, $vr9, $vr0 vadd.w $vr12, $vr8, $vr6 - vshuf4i.w $vr12, $vr12, 16 - vslli.d $vr12, $vr12, 32 - vsrai.d $vr12, $vr12, 32 + vsllwil.d.w $vr12, $vr12, 0 vpickve2gr.d $a6, $vr12, 0 mul.d $a6, $a6, $t4 vpickve2gr.d $a7, $vr12, 1 @@ -189064,9 +189050,7 @@ _ZN18LoopApplyEvaluator8evaluateI16ApplyMultiArgLocI9MultiArg3I5FieldI22UniformR fldx.d $ft4, $t5, $a6 fldx.d $ft5, $t5, $a7 vadd.w $vr14, $vr8, $vr7 - vshuf4i.w $vr14, $vr14, 16 - vslli.d $vr14, $vr14, 32 - vsrai.d $vr14, $vr14, 32 + vsllwil.d.w $vr14, $vr14, 0 vpickve2gr.d $a6, $vr14, 0 mul.d $a6, $a6, $t4 vpickve2gr.d $a7, $vr14, 1 @@ 
-197818,9 +197802,7 @@ _ZN18LoopApplyEvaluator8evaluateI16ApplyMultiArgLocI9MultiArg3I5FieldI22UniformR vfmul.d $vr9, $vr9, $vr0 vfmadd.d $vr9, $vr10, $vr0, $vr9 vadd.w $vr10, $vr8, $vr6 - vshuf4i.w $vr10, $vr10, 16 - vslli.d $vr10, $vr10, 32 - vsrai.d $vr10, $vr10, 32 + vsllwil.d.w $vr10, $vr10, 0 vpickve2gr.d $a0, $vr10, 0 mul.d $a0, $a0, $a6 vpickve2gr.d $t6, $vr10, 1 @@ -197828,9 +197810,7 @@ _ZN18LoopApplyEvaluator8evaluateI16ApplyMultiArgLocI9MultiArg3I5FieldI22UniformR fldx.d $ft2, $t4, $a0 fldx.d $ft3, $t4, $t6 vadd.w $vr12, $vr8, $vr7 - vshuf4i.w $vr12, $vr12, 16 - vslli.d $vr12, $vr12, 32 - vsrai.d $vr12, $vr12, 32 + vsllwil.d.w $vr12, $vr12, 0 vpickve2gr.d $a0, $vr12, 0 mul.d $a0, $a0, $a6 vpickve2gr.d $t6, $vr12, 1 @@ -199043,9 +199023,7 @@ _ZN18LoopApplyEvaluator8evaluateI16ApplyMultiArgLocI9MultiArg3I5FieldI22UniformR vfmul.d $vr9, $vr9, $vr0 vfmadd.d $vr9, $vr10, $vr0, $vr9 vadd.w $vr10, $vr8, $vr6 - vshuf4i.w $vr10, $vr10, 16 - vslli.d $vr10, $vr10, 32 - vsrai.d $vr10, $vr10, 32 + vsllwil.d.w $vr10, $vr10, 0 vpickve2gr.d $a0, $vr10, 0 mul.d $a0, $a0, $a3 vpickve2gr.d $a1, $vr10, 1 @@ -199053,9 +199031,7 @@ _ZN18LoopApplyEvaluator8evaluateI16ApplyMultiArgLocI9MultiArg3I5FieldI22UniformR fldx.d $ft2, $t2, $a0 fldx.d $ft3, $t2, $a1 vadd.w $vr12, $vr8, $vr7 - vshuf4i.w $vr12, $vr12, 16 - vslli.d $vr12, $vr12, 32 - vsrai.d $vr12, $vr12, 32 + vsllwil.d.w $vr12, $vr12, 0 vpickve2gr.d $a0, $vr12, 0 mul.d $a0, $a0, $a3 vpickve2gr.d $a1, $vr12, 1 diff --git a/results/SingleSource/Benchmarks/Adobe-C++/CMakeFiles/simple_types_loop_invariant.dir/simple_types_loop_invariant.s b/results/SingleSource/Benchmarks/Adobe-C++/CMakeFiles/simple_types_loop_invariant.dir/simple_types_loop_invariant.s index 0f8c29d3..d1927470 100644 --- a/results/SingleSource/Benchmarks/Adobe-C++/CMakeFiles/simple_types_loop_invariant.dir/simple_types_loop_invariant.s +++ b/results/SingleSource/Benchmarks/Adobe-C++/CMakeFiles/simple_types_loop_invariant.dir/simple_types_loop_invariant.s @@ -4737,18 +4737,12 @@ _Z14test_variable1Ia22custom_divide_variableIaEEvPT_iS2_PKc: # @_Z14test_variabl # => This Inner Loop Header: Depth=2 vld $vr2, $a0, -16 vld $vr3, $a0, 0 - vilvh.b $vr4, $vr2, $vr2 - vslli.h $vr4, $vr4, 8 - vsrai.h $vr4, $vr4, 8 - vilvl.b $vr2, $vr2, $vr2 - vslli.h $vr2, $vr2, 8 - vsrai.h $vr2, $vr2, 8 - vilvh.b $vr5, $vr3, $vr3 - vslli.h $vr5, $vr5, 8 - vsrai.h $vr5, $vr5, 8 - vilvl.b $vr3, $vr3, $vr3 - vslli.h $vr3, $vr3, 8 - vsrai.h $vr3, $vr3, 8 + vbsrl.v $vr4, $vr2, 8 + vsllwil.h.b $vr4, $vr4, 0 + vsllwil.h.b $vr2, $vr2, 0 + vbsrl.v $vr5, $vr3, 8 + vsllwil.h.b $vr5, $vr5, 0 + vsllwil.h.b $vr3, $vr3, 0 vdiv.h $vr2, $vr2, $vr6 vdiv.h $vr4, $vr4, $vr6 vdiv.h $vr3, $vr3, $vr6 @@ -4788,9 +4782,7 @@ _Z14test_variable1Ia22custom_divide_variableIaEEvPT_iS2_PKc: # @_Z14test_variabl # => This Inner Loop Header: Depth=2 ld.d $a3, $a2, 0 vinsgr2vr.d $vr1, $a3, 0 - vilvl.b $vr1, $vr1, $vr1 - vslli.h $vr1, $vr1, 8 - vsrai.h $vr1, $vr1, 8 + vsllwil.h.b $vr1, $vr1, 0 vdiv.h $vr1, $vr1, $vr6 vpickev.b $vr1, $vr1, $vr1 vadd.b $vr0, $vr0, $vr1 @@ -5053,18 +5045,12 @@ _Z14test_variable4Ia31custom_divide_multiple_variableIaEEvPT_iS2_S2_S2_S2_PKc: # # => This Inner Loop Header: Depth=2 vld $vr2, $a0, -16 vld $vr3, $a0, 0 - vilvl.b $vr4, $vr2, $vr2 - vslli.h $vr4, $vr4, 8 - vsrai.h $vr4, $vr4, 8 - vilvh.b $vr2, $vr2, $vr2 - vslli.h $vr2, $vr2, 8 - vsrai.h $vr2, $vr2, 8 - vilvl.b $vr5, $vr3, $vr3 - vslli.h $vr5, $vr5, 8 - vsrai.h $vr5, $vr5, 8 - vilvh.b $vr3, $vr3, $vr3 - vslli.h $vr3, $vr3, 8 - vsrai.h $vr3, $vr3, 8 + 
vsllwil.h.b $vr4, $vr2, 0 + vbsrl.v $vr2, $vr2, 8 + vsllwil.h.b $vr2, $vr2, 0 + vsllwil.h.b $vr5, $vr3, 0 + vbsrl.v $vr3, $vr3, 8 + vsllwil.h.b $vr3, $vr3, 0 vdiv.h $vr2, $vr2, $vr6 vdiv.h $vr4, $vr4, $vr6 vdiv.h $vr3, $vr3, $vr6 @@ -5116,9 +5102,7 @@ _Z14test_variable4Ia31custom_divide_multiple_variableIaEEvPT_iS2_S2_S2_S2_PKc: # # => This Inner Loop Header: Depth=2 ld.d $a3, $a2, 0 vinsgr2vr.d $vr1, $a3, 0 - vilvl.b $vr1, $vr1, $vr1 - vslli.h $vr1, $vr1, 8 - vsrai.h $vr1, $vr1, 8 + vsllwil.h.b $vr1, $vr1, 0 vdiv.h $vr1, $vr1, $vr6 vdiv.h $vr1, $vr1, $vr7 vdiv.h $vr1, $vr1, $vr8 @@ -15814,18 +15798,12 @@ _Z14test_variable1Is22custom_divide_variableIsEEvPT_iS2_PKc: # @_Z14test_variabl # => This Inner Loop Header: Depth=2 vld $vr2, $a0, -16 vld $vr3, $a0, 0 - vilvh.h $vr4, $vr2, $vr2 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvl.h $vr2, $vr2, $vr2 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvh.h $vr5, $vr3, $vr3 - vslli.w $vr5, $vr5, 16 - vsrai.w $vr5, $vr5, 16 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 + vbsrl.v $vr4, $vr2, 8 + vsllwil.w.h $vr4, $vr4, 0 + vsllwil.w.h $vr2, $vr2, 0 + vbsrl.v $vr5, $vr3, 8 + vsllwil.w.h $vr5, $vr5, 0 + vsllwil.w.h $vr3, $vr3, 0 vdiv.w $vr2, $vr2, $vr6 vdiv.w $vr4, $vr4, $vr6 vdiv.w $vr3, $vr3, $vr6 @@ -15864,9 +15842,7 @@ _Z14test_variable1Is22custom_divide_variableIsEEvPT_iS2_PKc: # @_Z14test_variabl # => This Inner Loop Header: Depth=2 ld.d $a3, $a2, 0 vinsgr2vr.d $vr1, $a3, 0 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vsllwil.w.h $vr1, $vr1, 0 vdiv.w $vr1, $vr1, $vr6 vpickev.h $vr1, $vr1, $vr1 vadd.h $vr0, $vr0, $vr1 @@ -16072,7 +16048,7 @@ _Z14test_variable4Is31custom_divide_multiple_variableIsEEvPT_iS2_S2_S2_S2_PKc: # andi $a0, $s4, 12 st.d $a0, $sp, 8 # 8-byte Folded Spill bstrpick.d $a0, $s4, 30, 4 - slli.d $fp, $a0, 4 + slli.d $s8, $a0, 4 vreplgr2vr.w $vr6, $s3 vreplgr2vr.w $vr7, $s2 vreplgr2vr.w $vr8, $s1 @@ -16084,14 +16060,14 @@ _Z14test_variable4Is31custom_divide_multiple_variableIsEEvPT_iS2_S2_S2_S2_PKc: # st.d $a0, $sp, 48 # 8-byte Folded Spill sub.d $a0, $zero, $a0 st.d $a0, $sp, 40 # 8-byte Folded Spill - ori $a6, $zero, 4 + ori $a5, $zero, 4 pcalau12i $s7, %pc_hi20(init_value) lu12i.w $a0, 1 ori $s6, $a0, 3904 pcalau12i $a0, %pc_hi20(.L.str.179) addi.d $a0, $a0, %pc_lo12(.L.str.179) st.d $a0, $sp, 56 # 8-byte Folded Spill - move $s8, $zero + move $fp, $zero vrepli.b $vr0, 0 vst $vr0, $sp, 128 # 16-byte Folded Spill vst $vr6, $sp, 112 # 16-byte Folded Spill @@ -16102,14 +16078,14 @@ _Z14test_variable4Is31custom_divide_multiple_variableIsEEvPT_iS2_S2_S2_S2_PKc: # .p2align 4, , 16 .LBB51_3: # %_Z26check_shifted_variable_sumIs31custom_divide_multiple_variableIsEEvT_S2_S2_S2_S2_.exit.us # in Loop: Header=BB51_4 Depth=1 - addi.w $s8, $s8, 1 - bge $s8, $a1, .LBB51_23 + addi.w $fp, $fp, 1 + bge $fp, $a1, .LBB51_23 .LBB51_4: # %iter.check # =>This Loop Header: Depth=1 # Child Loop BB51_9 Depth 2 # Child Loop BB51_13 Depth 2 # Child Loop BB51_16 Depth 2 - bgeu $s4, $a6, .LBB51_6 + bgeu $s4, $a5, .LBB51_6 # %bb.5: # in Loop: Header=BB51_4 Depth=1 move $a3, $zero move $a0, $zero @@ -16127,7 +16103,7 @@ _Z14test_variable4Is31custom_divide_multiple_variableIsEEvPT_iS2_S2_S2_S2_PKc: # .LBB51_8: # %vector.body.preheader # in Loop: Header=BB51_4 Depth=1 ld.d $a0, $sp, 32 # 8-byte Folded Reload - move $a2, $fp + move $a2, $s8 vld $vr1, $sp, 128 # 16-byte Folded Reload vori.b $vr0, $vr1, 0 .p2align 4, , 16 @@ -16136,18 +16112,12 @@ 
_Z14test_variable4Is31custom_divide_multiple_variableIsEEvPT_iS2_S2_S2_S2_PKc: # # => This Inner Loop Header: Depth=2 vld $vr2, $a0, -16 vld $vr3, $a0, 0 - vilvl.h $vr4, $vr2, $vr2 - vslli.w $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 16 - vilvh.h $vr2, $vr2, $vr2 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 - vilvl.h $vr5, $vr3, $vr3 - vslli.w $vr5, $vr5, 16 - vsrai.w $vr5, $vr5, 16 - vilvh.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 + vsllwil.w.h $vr4, $vr2, 0 + vbsrl.v $vr2, $vr2, 8 + vsllwil.w.h $vr2, $vr2, 0 + vsllwil.w.h $vr5, $vr3, 0 + vbsrl.v $vr3, $vr3, 8 + vsllwil.w.h $vr3, $vr3, 0 vdiv.w $vr2, $vr2, $vr6 vdiv.w $vr4, $vr4, $vr6 vdiv.w $vr3, $vr3, $vr6 @@ -16178,11 +16148,11 @@ _Z14test_variable4Is31custom_divide_multiple_variableIsEEvPT_iS2_S2_S2_S2_PKc: # vhaddw.d.w $vr0, $vr0, $vr0 vhaddw.q.d $vr0, $vr0, $vr0 vpickve2gr.d $a0, $vr0, 0 - beq $fp, $s4, .LBB51_17 + beq $s8, $s4, .LBB51_17 # %bb.11: # %vec.epilog.iter.check # in Loop: Header=BB51_4 Depth=1 - move $a2, $fp - move $a3, $fp + move $a2, $s8 + move $a3, $s8 ld.d $a4, $sp, 8 # 8-byte Folded Reload beqz $a4, .LBB51_15 .LBB51_12: # %vec.epilog.ph @@ -16198,9 +16168,7 @@ _Z14test_variable4Is31custom_divide_multiple_variableIsEEvPT_iS2_S2_S2_S2_PKc: # # => This Inner Loop Header: Depth=2 ld.d $a3, $a2, 0 vinsgr2vr.d $vr1, $a3, 0 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vsllwil.w.h $vr1, $vr1, 0 vdiv.w $vr1, $vr1, $vr6 vdiv.w $vr1, $vr1, $vr7 vdiv.w $vr1, $vr1, $vr8 @@ -16255,7 +16223,7 @@ _Z14test_variable4Is31custom_divide_multiple_variableIsEEvPT_iS2_S2_S2_S2_PKc: # ld.d $a0, $sp, 56 # 8-byte Folded Reload pcaddu18i $ra, %call36(printf) jirl $ra, $ra, 0 - ori $a6, $zero, 4 + ori $a5, $zero, 4 vld $vr9, $sp, 64 # 16-byte Folded Reload vld $vr8, $sp, 80 # 16-byte Folded Reload vld $vr7, $sp, 96 # 16-byte Folded Reload @@ -16264,19 +16232,19 @@ _Z14test_variable4Is31custom_divide_multiple_variableIsEEvPT_iS2_S2_S2_S2_PKc: # ld.w $a1, $a0, %pc_lo12(iterations) b .LBB51_3 .LBB51_19: # %.preheader.preheader - pcalau12i $s5, %pc_hi20(init_value) - fld.d $fa0, $s5, %pc_lo12(init_value) + pcalau12i $fp, %pc_hi20(init_value) + fld.d $fa0, $fp, %pc_lo12(init_value) lu12i.w $a0, 1 - ori $s6, $a0, 3904 + ori $s5, $a0, 3904 pcalau12i $a0, %pc_hi20(.L.str.179) addi.d $s4, $a0, %pc_lo12(.L.str.179) - move $s7, $zero + move $s6, $zero b .LBB51_21 .p2align 4, , 16 .LBB51_20: # %_Z26check_shifted_variable_sumIs31custom_divide_multiple_variableIsEEvT_S2_S2_S2_S2_.exit # in Loop: Header=BB51_21 Depth=1 - addi.w $s7, $s7, 1 - bge $s7, $a1, .LBB51_23 + addi.w $s6, $s6, 1 + bge $s6, $a1, .LBB51_23 .LBB51_21: # %.preheader # =>This Inner Loop Header: Depth=1 ftintrz.l.d $fa1, $fa0 @@ -16285,7 +16253,7 @@ _Z14test_variable4Is31custom_divide_multiple_variableIsEEvPT_iS2_S2_S2_S2_PKc: # div.w $a0, $a0, $s2 div.w $a0, $a0, $s1 div.w $a0, $a0, $s0 - mul.d $a0, $a0, $s6 + mul.d $a0, $a0, $s5 bstrpick.d $a0, $a0, 15, 6 slli.d $a0, $a0, 6 beqz $a0, .LBB51_20 @@ -16295,7 +16263,7 @@ _Z14test_variable4Is31custom_divide_multiple_variableIsEEvPT_iS2_S2_S2_S2_PKc: # move $a0, $s4 pcaddu18i $ra, %call36(printf) jirl $ra, $ra, 0 - fld.d $fa0, $s5, %pc_lo12(init_value) + fld.d $fa0, $fp, %pc_lo12(init_value) ld.d $a0, $sp, 152 # 8-byte Folded Reload ld.w $a1, $a0, %pc_lo12(iterations) b .LBB51_20 diff --git a/results/SingleSource/Benchmarks/Misc/CMakeFiles/ReedSolomon.dir/ReedSolomon.s b/results/SingleSource/Benchmarks/Misc/CMakeFiles/ReedSolomon.dir/ReedSolomon.s index d1b3171a..744def97 100644 --- 
a/results/SingleSource/Benchmarks/Misc/CMakeFiles/ReedSolomon.dir/ReedSolomon.s +++ b/results/SingleSource/Benchmarks/Misc/CMakeFiles/ReedSolomon.dir/ReedSolomon.s @@ -34,22 +34,22 @@ .type rsdec_204,@function rsdec_204: # @rsdec_204 # %bb.0: - addi.d $sp, $sp, -1712 - st.d $ra, $sp, 1704 # 8-byte Folded Spill - st.d $fp, $sp, 1696 # 8-byte Folded Spill - st.d $s0, $sp, 1688 # 8-byte Folded Spill - st.d $s1, $sp, 1680 # 8-byte Folded Spill - st.d $s2, $sp, 1672 # 8-byte Folded Spill - st.d $s3, $sp, 1664 # 8-byte Folded Spill - st.d $s4, $sp, 1656 # 8-byte Folded Spill - st.d $s5, $sp, 1648 # 8-byte Folded Spill - st.d $s6, $sp, 1640 # 8-byte Folded Spill - st.d $s7, $sp, 1632 # 8-byte Folded Spill - st.d $s8, $sp, 1624 # 8-byte Folded Spill + addi.d $sp, $sp, -1696 + st.d $ra, $sp, 1688 # 8-byte Folded Spill + st.d $fp, $sp, 1680 # 8-byte Folded Spill + st.d $s0, $sp, 1672 # 8-byte Folded Spill + st.d $s1, $sp, 1664 # 8-byte Folded Spill + st.d $s2, $sp, 1656 # 8-byte Folded Spill + st.d $s3, $sp, 1648 # 8-byte Folded Spill + st.d $s4, $sp, 1640 # 8-byte Folded Spill + st.d $s5, $sp, 1632 # 8-byte Folded Spill + st.d $s6, $sp, 1624 # 8-byte Folded Spill + st.d $s7, $sp, 1616 # 8-byte Folded Spill + st.d $s8, $sp, 1608 # 8-byte Folded Spill pcalau12i $s1, %pc_hi20(inited) ld.bu $a2, $s1, %pc_lo12(inited) move $s0, $a1 - st.d $a0, $sp, 32 # 8-byte Folded Spill + st.d $a0, $sp, 16 # 8-byte Folded Spill pcalau12i $a0, %pc_hi20(index_of) addi.d $s3, $a0, %pc_lo12(index_of) pcalau12i $a0, %pc_hi20(alpha_to) @@ -114,262 +114,259 @@ rsdec_204: # @rsdec_204 .LBB0_6: # %.preheader32.preheader ld.w $a0, $s0, 188 vinsgr2vr.w $vr0, $a0, 0 - vrepli.b $vr1, 0 - vst $vr1, $sp, 16 # 16-byte Folded Spill - vilvl.b $vr0, $vr1, $vr0 - vilvl.h $vr0, $vr1, $vr0 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 pcalau12i $a0, %pc_hi20(recd) addi.d $s1, $a0, %pc_lo12(recd) ld.w $a0, $s0, 192 vst $vr0, $s1, 0 vinsgr2vr.w $vr0, $a0, 0 ld.w $a0, $s0, 196 - vilvl.b $vr0, $vr1, $vr0 - vilvl.h $vr0, $vr1, $vr0 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 vst $vr0, $s1, 16 vinsgr2vr.w $vr0, $a0, 0 ld.w $a0, $s0, 200 - vilvl.b $vr0, $vr1, $vr0 - vilvl.h $vr0, $vr1, $vr0 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 vst $vr0, $s1, 32 vinsgr2vr.w $vr0, $a0, 0 - vilvl.b $vr0, $vr1, $vr0 - vilvl.h $vr0, $vr1, $vr0 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 vst $vr0, $s1, 48 addi.d $a0, $s1, 64 ori $a2, $zero, 204 move $a1, $zero pcaddu18i $ra, %call36(memset) jirl $ra, $ra, 0 - vld $vr2, $sp, 16 # 16-byte Folded Reload ld.w $a0, $s0, 0 ld.w $a1, $s0, 4 vinsgr2vr.w $vr0, $a0, 0 vinsgr2vr.w $vr1, $a1, 0 - vilvl.b $vr0, $vr2, $vr0 - vilvl.h $vr0, $vr2, $vr0 - vilvl.b $vr1, $vr2, $vr1 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 vst $vr0, $s1, 268 vst $vr1, $s1, 284 ld.w $a0, $s0, 8 ld.w $a1, $s0, 12 vinsgr2vr.w $vr0, $a0, 0 vinsgr2vr.w $vr1, $a1, 0 - vilvl.b $vr0, $vr2, $vr0 - vilvl.h $vr0, $vr2, $vr0 - vilvl.b $vr1, $vr2, $vr1 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 vst $vr0, $s1, 300 vst $vr1, $s1, 316 ld.w $a0, $s0, 16 ld.w $a1, $s0, 20 vinsgr2vr.w $vr0, $a0, 0 vinsgr2vr.w $vr1, $a1, 0 - vilvl.b $vr0, $vr2, $vr0 - vilvl.h $vr0, $vr2, $vr0 - vilvl.b $vr1, $vr2, $vr1 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + 
vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 vst $vr0, $s1, 332 vst $vr1, $s1, 348 ld.w $a0, $s0, 24 ld.w $a1, $s0, 28 vinsgr2vr.w $vr0, $a0, 0 vinsgr2vr.w $vr1, $a1, 0 - vilvl.b $vr0, $vr2, $vr0 - vilvl.h $vr0, $vr2, $vr0 - vilvl.b $vr1, $vr2, $vr1 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 vst $vr0, $s1, 364 vst $vr1, $s1, 380 ld.w $a0, $s0, 32 ld.w $a1, $s0, 36 vinsgr2vr.w $vr0, $a0, 0 vinsgr2vr.w $vr1, $a1, 0 - vilvl.b $vr0, $vr2, $vr0 - vilvl.h $vr0, $vr2, $vr0 - vilvl.b $vr1, $vr2, $vr1 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 vst $vr0, $s1, 396 vst $vr1, $s1, 412 ld.w $a0, $s0, 40 ld.w $a1, $s0, 44 vinsgr2vr.w $vr0, $a0, 0 vinsgr2vr.w $vr1, $a1, 0 - vilvl.b $vr0, $vr2, $vr0 - vilvl.h $vr0, $vr2, $vr0 - vilvl.b $vr1, $vr2, $vr1 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 vst $vr0, $s1, 428 vst $vr1, $s1, 444 ld.w $a0, $s0, 48 ld.w $a1, $s0, 52 vinsgr2vr.w $vr0, $a0, 0 vinsgr2vr.w $vr1, $a1, 0 - vilvl.b $vr0, $vr2, $vr0 - vilvl.h $vr0, $vr2, $vr0 - vilvl.b $vr1, $vr2, $vr1 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 vst $vr0, $s1, 460 vst $vr1, $s1, 476 ld.w $a0, $s0, 56 ld.w $a1, $s0, 60 vinsgr2vr.w $vr0, $a0, 0 vinsgr2vr.w $vr1, $a1, 0 - vilvl.b $vr0, $vr2, $vr0 - vilvl.h $vr0, $vr2, $vr0 - vilvl.b $vr1, $vr2, $vr1 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 vst $vr0, $s1, 492 vst $vr1, $s1, 508 ld.w $a0, $s0, 64 ld.w $a1, $s0, 68 vinsgr2vr.w $vr0, $a0, 0 vinsgr2vr.w $vr1, $a1, 0 - vilvl.b $vr0, $vr2, $vr0 - vilvl.h $vr0, $vr2, $vr0 - vilvl.b $vr1, $vr2, $vr1 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 vst $vr0, $s1, 524 vst $vr1, $s1, 540 ld.w $a0, $s0, 72 ld.w $a1, $s0, 76 vinsgr2vr.w $vr0, $a0, 0 vinsgr2vr.w $vr1, $a1, 0 - vilvl.b $vr0, $vr2, $vr0 - vilvl.h $vr0, $vr2, $vr0 - vilvl.b $vr1, $vr2, $vr1 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 vst $vr0, $s1, 556 vst $vr1, $s1, 572 ld.w $a0, $s0, 80 ld.w $a1, $s0, 84 vinsgr2vr.w $vr0, $a0, 0 vinsgr2vr.w $vr1, $a1, 0 - vilvl.b $vr0, $vr2, $vr0 - vilvl.h $vr0, $vr2, $vr0 - vilvl.b $vr1, $vr2, $vr1 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 vst $vr0, $s1, 588 vst $vr1, $s1, 604 ld.w $a0, $s0, 88 ld.w $a1, $s0, 92 vinsgr2vr.w $vr0, $a0, 0 vinsgr2vr.w $vr1, $a1, 0 - vilvl.b $vr0, $vr2, $vr0 - vilvl.h $vr0, $vr2, $vr0 - vilvl.b $vr1, $vr2, $vr1 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 vst $vr0, $s1, 620 vst $vr1, $s1, 636 ld.w $a0, $s0, 96 ld.w $a1, $s0, 100 vinsgr2vr.w $vr0, $a0, 0 vinsgr2vr.w $vr1, $a1, 0 - vilvl.b $vr0, $vr2, $vr0 - vilvl.h $vr0, $vr2, $vr0 - vilvl.b $vr1, $vr2, $vr1 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.hu.bu $vr1, $vr1, 0 + 
vsllwil.wu.hu $vr1, $vr1, 0 vst $vr0, $s1, 652 vst $vr1, $s1, 668 ld.w $a0, $s0, 104 ld.w $a1, $s0, 108 vinsgr2vr.w $vr0, $a0, 0 vinsgr2vr.w $vr1, $a1, 0 - vilvl.b $vr0, $vr2, $vr0 - vilvl.h $vr0, $vr2, $vr0 - vilvl.b $vr1, $vr2, $vr1 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 vst $vr0, $s1, 684 vst $vr1, $s1, 700 ld.w $a0, $s0, 112 ld.w $a1, $s0, 116 vinsgr2vr.w $vr0, $a0, 0 vinsgr2vr.w $vr1, $a1, 0 - vilvl.b $vr0, $vr2, $vr0 - vilvl.h $vr0, $vr2, $vr0 - vilvl.b $vr1, $vr2, $vr1 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 vst $vr0, $s1, 716 vst $vr1, $s1, 732 ld.w $a0, $s0, 120 ld.w $a1, $s0, 124 vinsgr2vr.w $vr0, $a0, 0 vinsgr2vr.w $vr1, $a1, 0 - vilvl.b $vr0, $vr2, $vr0 - vilvl.h $vr0, $vr2, $vr0 - vilvl.b $vr1, $vr2, $vr1 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 vst $vr0, $s1, 748 vst $vr1, $s1, 764 ld.w $a0, $s0, 128 ld.w $a1, $s0, 132 vinsgr2vr.w $vr0, $a0, 0 vinsgr2vr.w $vr1, $a1, 0 - vilvl.b $vr0, $vr2, $vr0 - vilvl.h $vr0, $vr2, $vr0 - vilvl.b $vr1, $vr2, $vr1 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 vst $vr0, $s1, 780 vst $vr1, $s1, 796 ld.w $a0, $s0, 136 ld.w $a1, $s0, 140 vinsgr2vr.w $vr0, $a0, 0 vinsgr2vr.w $vr1, $a1, 0 - vilvl.b $vr0, $vr2, $vr0 - vilvl.h $vr0, $vr2, $vr0 - vilvl.b $vr1, $vr2, $vr1 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 vst $vr0, $s1, 812 vst $vr1, $s1, 828 ld.w $a0, $s0, 144 ld.w $a1, $s0, 148 vinsgr2vr.w $vr0, $a0, 0 vinsgr2vr.w $vr1, $a1, 0 - vilvl.b $vr0, $vr2, $vr0 - vilvl.h $vr0, $vr2, $vr0 - vilvl.b $vr1, $vr2, $vr1 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 vst $vr0, $s1, 844 vst $vr1, $s1, 860 ld.w $a0, $s0, 152 ld.w $a1, $s0, 156 vinsgr2vr.w $vr0, $a0, 0 vinsgr2vr.w $vr1, $a1, 0 - vilvl.b $vr0, $vr2, $vr0 - vilvl.h $vr0, $vr2, $vr0 - vilvl.b $vr1, $vr2, $vr1 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 vst $vr0, $s1, 876 vst $vr1, $s1, 892 ld.w $a0, $s0, 160 ld.w $a1, $s0, 164 vinsgr2vr.w $vr0, $a0, 0 vinsgr2vr.w $vr1, $a1, 0 - vilvl.b $vr0, $vr2, $vr0 - vilvl.h $vr0, $vr2, $vr0 - vilvl.b $vr1, $vr2, $vr1 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 vst $vr0, $s1, 908 vst $vr1, $s1, 924 ld.w $a0, $s0, 168 ld.w $a1, $s0, 172 vinsgr2vr.w $vr0, $a0, 0 vinsgr2vr.w $vr1, $a1, 0 - vilvl.b $vr0, $vr2, $vr0 - vilvl.h $vr0, $vr2, $vr0 - vilvl.b $vr1, $vr2, $vr1 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 vst $vr0, $s1, 940 vst $vr1, $s1, 956 ld.w $a0, $s0, 176 ld.w $a1, $s0, 180 vinsgr2vr.w $vr0, $a0, 0 vinsgr2vr.w $vr1, $a1, 0 - vilvl.b $vr0, $vr2, $vr0 - vilvl.h $vr0, $vr2, $vr0 - vilvl.b $vr1, $vr2, $vr1 - vilvl.h $vr1, $vr2, $vr1 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu 
$vr1, $vr1, 0 vst $vr0, $s1, 972 vst $vr1, $s1, 988 ld.bu $a0, $s0, 184 @@ -386,12 +383,9 @@ rsdec_204: # @rsdec_204 .LBB0_7: # %vector.body124 # =>This Inner Loop Header: Depth=1 vldx $vr0, $s1, $a0 - vshuf4i.w $vr1, $vr0, 50 - vslli.d $vr1, $vr1, 32 - vsrai.d $vr1, $vr1, 32 - vshuf4i.w $vr0, $vr0, 16 - vslli.d $vr0, $vr0, 32 - vsrai.d $vr0, $vr0, 32 + vshuf4i.w $vr1, $vr0, 14 + vsllwil.d.w $vr1, $vr1, 0 + vsllwil.d.w $vr0, $vr0, 0 vpickve2gr.d $a2, $vr0, 0 slli.d $a2, $a2, 2 vpickve2gr.d $a3, $vr0, 1 @@ -426,7 +420,7 @@ rsdec_204: # @rsdec_204 st.w $a3, $s1, 1012 st.w $a2, $s1, 1016 ori $a2, $zero, 1 - addi.d $a3, $sp, 188 + addi.d $a3, $sp, 172 addi.w $a0, $zero, -1 ori $a4, $zero, 1020 ori $a5, $zero, 17 @@ -482,40 +476,41 @@ rsdec_204: # @rsdec_204 .LBB0_14: beqz $a1, .LBB0_58 # %bb.15: - ld.w $s7, $sp, 192 + ld.w $s7, $sp, 176 move $s6, $zero - st.w $zero, $sp, 400 - addi.d $a1, $sp, 192 - st.w $s7, $sp, 404 - st.w $zero, $sp, 472 - addi.d $a2, $sp, 536 + st.w $zero, $sp, 384 + addi.d $a1, $sp, 176 + st.w $s7, $sp, 388 + st.w $zero, $sp, 456 + addi.d $a2, $sp, 520 + st.d $a0, $sp, 460 + st.d $a0, $sp, 468 st.d $a0, $sp, 476 st.d $a0, $sp, 484 st.d $a0, $sp, 492 st.d $a0, $sp, 500 st.d $a0, $sp, 508 - st.d $a0, $sp, 516 - st.d $a0, $sp, 524 move $a3, $a0 lu32i.d $a3, 1 - st.d $a3, $sp, 532 - st.d $zero, $sp, 540 - vst $vr2, $sp, 548 - vst $vr2, $sp, 564 - vst $vr2, $sp, 580 - st.w $zero, $sp, 596 - st.d $zero, $sp, 328 + st.d $a3, $sp, 516 + st.d $zero, $sp, 524 + vrepli.b $vr0, 0 + vst $vr0, $sp, 532 + vst $vr0, $sp, 548 + vst $vr0, $sp, 564 + st.w $zero, $sp, 580 + st.d $zero, $sp, 312 move $a3, $a0 lu32i.d $a3, 0 - st.d $a3, $sp, 256 - addi.d $a3, $sp, 600 - addi.d $a4, $sp, 604 + st.d $a3, $sp, 240 + addi.d $a3, $sp, 584 + addi.d $a4, $sp, 588 ori $a5, $zero, 1 - addi.d $a6, $sp, 400 - addi.d $s0, $sp, 328 - addi.d $t2, $sp, 256 + addi.d $a6, $sp, 384 + addi.d $s0, $sp, 312 + addi.d $t2, $sp, 240 move $a7, $a0 - addi.d $t7, $sp, 400 + addi.d $t7, $sp, 384 ori $t8, $zero, 1 b .LBB0_17 .p2align 4, , 16 @@ -548,9 +543,9 @@ rsdec_204: # @rsdec_204 alsl.d $t1, $s6, $s0, 2 beq $s7, $a0, .LBB0_26 # %bb.18: # in Loop: Header=BB0_17 Depth=1 - st.d $t6, $sp, 40 # 8-byte Folded Spill + st.d $t6, $sp, 24 # 8-byte Folded Spill slli.d $t3, $s6, 6 - addi.d $t0, $sp, 472 + addi.d $t0, $sp, 456 add.d $t3, $t0, $t3 addi.d $s8, $t3, 128 move $t6, $a7 @@ -579,7 +574,7 @@ rsdec_204: # @rsdec_204 # in Loop: Header=BB0_17 Depth=1 bstrpick.d $t0, $fp, 31, 0 addi.d $t3, $t0, 1 - addi.d $s0, $sp, 328 + addi.d $s0, $sp, 312 b .LBB0_24 .p2align 4, , 16 .LBB0_23: # in Loop: Header=BB0_24 Depth=2 @@ -619,13 +614,13 @@ rsdec_204: # @rsdec_204 .p2align 4, , 16 .LBB0_29: # in Loop: Header=BB0_17 Depth=1 move $ra, $zero - addi.d $s0, $sp, 328 + addi.d $s0, $sp, 312 .LBB0_30: # %.loopexit327.i # in Loop: Header=BB0_17 Depth=1 slli.d $t0, $ra, 2 ldx.w $t3, $t0, $s0 ldx.w $t4, $s5, $s0 - ld.d $t5, $sp, 40 # 8-byte Folded Reload + ld.d $t5, $sp, 24 # 8-byte Folded Reload sub.d $t5, $t5, $ra add.w $t3, $t3, $t5 slt $t5, $t3, $t4 @@ -634,15 +629,15 @@ rsdec_204: # @rsdec_204 or $s5, $t5, $t3 st.w $s5, $t1, 8 ldx.w $t3, $t0, $s0 - vst $vr2, $s8, 0 - vst $vr2, $s8, 16 - vst $vr2, $s8, 32 - vst $vr2, $s8, 48 + vst $vr0, $s8, 0 + vst $vr0, $s8, 16 + vst $vr0, $s8, 32 + vst $vr0, $s8, 48 bltz $t3, .LBB0_35 # %bb.31: # %.lr.ph.i # in Loop: Header=BB0_17 Depth=1 slli.d $t0, $ra, 6 - addi.d $t1, $sp, 472 + addi.d $t1, $sp, 456 add.d $t1, $t1, $t0 addi.d $s7, $s7, 255 alsl.d $t5, $ra, $a6, 2 @@ -696,54 +691,52 @@ rsdec_204: # 
@rsdec_204 # in Loop: Header=BB0_17 Depth=1 bstrpick.d $t0, $t1, 31, 2 slli.d $t4, $t0, 2 - move $t5, $a3 - move $s7, $t4 + move $t3, $a3 + move $t5, $t4 .p2align 4, , 16 .LBB0_39: # %vector.body141 # Parent Loop BB0_17 Depth=1 # => This Inner Loop Header: Depth=2 - vld $vr0, $t5, -64 - vld $vr1, $t5, 0 - vxor.v $vr1, $vr1, $vr0 - vst $vr1, $t5, 0 - vshuf4i.w $vr1, $vr0, 50 - vslli.d $vr1, $vr1, 32 - vsrai.d $vr1, $vr1, 32 - vshuf4i.w $vr0, $vr0, 16 - vslli.d $vr0, $vr0, 32 - vsrai.d $vr0, $vr0, 32 - vpickve2gr.d $t0, $vr0, 0 + vld $vr1, $t3, -64 + vld $vr2, $t3, 0 + vxor.v $vr2, $vr2, $vr1 + vst $vr2, $t3, 0 + vshuf4i.w $vr2, $vr1, 14 + vsllwil.d.w $vr2, $vr2, 0 + vsllwil.d.w $vr1, $vr1, 0 + vpickve2gr.d $t0, $vr1, 0 slli.d $t0, $t0, 2 - vpickve2gr.d $t3, $vr0, 1 - slli.d $t3, $t3, 2 - vpickve2gr.d $t6, $vr1, 0 + vpickve2gr.d $t6, $vr1, 1 slli.d $t6, $t6, 2 - vpickve2gr.d $fp, $vr1, 1 + vpickve2gr.d $fp, $vr2, 0 slli.d $fp, $fp, 2 + vpickve2gr.d $s0, $vr2, 1 + slli.d $s0, $s0, 2 ldx.w $t0, $s3, $t0 - ldx.w $t3, $s3, $t3 ldx.w $t6, $s3, $t6 ldx.w $fp, $s3, $fp - vinsgr2vr.w $vr0, $t0, 0 - vinsgr2vr.w $vr0, $t3, 1 - vinsgr2vr.w $vr0, $t6, 2 - vinsgr2vr.w $vr0, $fp, 3 - vst $vr0, $t5, -64 - addi.d $s7, $s7, -4 - addi.d $t5, $t5, 16 - bnez $s7, .LBB0_39 + ldx.w $s0, $s3, $s0 + vinsgr2vr.w $vr1, $t0, 0 + vinsgr2vr.w $vr1, $t6, 1 + vinsgr2vr.w $vr1, $fp, 2 + vinsgr2vr.w $vr1, $s0, 3 + vst $vr1, $t3, -64 + addi.d $t5, $t5, -4 + addi.d $t3, $t3, 16 + bnez $t5, .LBB0_39 # %bb.40: # %middle.block146 # in Loop: Header=BB0_17 Depth=1 + addi.d $s0, $sp, 312 bne $t4, $t1, .LBB0_42 .LBB0_41: # in Loop: Header=BB0_17 Depth=1 - ld.d $t6, $sp, 40 # 8-byte Folded Reload + ld.d $t6, $sp, 24 # 8-byte Folded Reload b .LBB0_49 .p2align 4, , 16 .LBB0_42: # %scalar.ph136.preheader # in Loop: Header=BB0_17 Depth=1 sub.d $t1, $t1, $t4 slli.d $t3, $t4, 2 - ld.d $t6, $sp, 40 # 8-byte Folded Reload + ld.d $t6, $sp, 24 # 8-byte Folded Reload .p2align 4, , 16 .LBB0_43: # %scalar.ph136 # Parent Loop BB0_17 Depth=1 @@ -771,31 +764,28 @@ rsdec_204: # @rsdec_204 .LBB0_45: # %vector.body131 # Parent Loop BB0_17 Depth=1 # => This Inner Loop Header: Depth=2 - vldx $vr0, $a2, $t3 - vstx $vr0, $a3, $t3 - vshuf4i.w $vr1, $vr0, 50 - vslli.d $vr1, $vr1, 32 - vsrai.d $vr1, $vr1, 32 - vshuf4i.w $vr0, $vr0, 16 - vslli.d $vr0, $vr0, 32 - vsrai.d $vr0, $vr0, 32 - vpickve2gr.d $t6, $vr0, 0 + vldx $vr1, $a2, $t3 + vstx $vr1, $a3, $t3 + vshuf4i.w $vr2, $vr1, 14 + vsllwil.d.w $vr2, $vr2, 0 + vsllwil.d.w $vr1, $vr1, 0 + vpickve2gr.d $t6, $vr1, 0 slli.d $t6, $t6, 2 - vpickve2gr.d $fp, $vr0, 1 + vpickve2gr.d $fp, $vr1, 1 slli.d $fp, $fp, 2 - vpickve2gr.d $s7, $vr1, 0 + vpickve2gr.d $s7, $vr2, 0 slli.d $s7, $s7, 2 - vpickve2gr.d $s8, $vr1, 1 + vpickve2gr.d $s8, $vr2, 1 slli.d $s8, $s8, 2 ldx.w $t6, $s3, $t6 ldx.w $fp, $s3, $fp ldx.w $s7, $s3, $s7 ldx.w $s8, $s3, $s8 - vinsgr2vr.w $vr0, $t6, 0 - vinsgr2vr.w $vr0, $fp, 1 - vinsgr2vr.w $vr0, $s7, 2 - vinsgr2vr.w $vr0, $s8, 3 - vstx $vr0, $a2, $t3 + vinsgr2vr.w $vr1, $t6, 0 + vinsgr2vr.w $vr1, $fp, 1 + vinsgr2vr.w $vr1, $s7, 2 + vinsgr2vr.w $vr1, $s8, 3 + vstx $vr1, $a2, $t3 addi.d $t3, $t3, 16 bne $t5, $t3, .LBB0_45 # %bb.46: # %middle.block135 @@ -828,7 +818,7 @@ rsdec_204: # @rsdec_204 ori $t0, $zero, 15 beq $s6, $t0, .LBB0_62 # %bb.50: # in Loop: Header=BB0_17 Depth=1 - addi.d $t0, $sp, 188 + addi.d $t0, $sp, 172 ldx.w $t4, $t1, $t0 addi.w $t1, $zero, -1 beq $t4, $t1, .LBB0_52 @@ -911,7 +901,7 @@ rsdec_204: # @rsdec_204 addi.w $s6, $s5, 1 bltz $s5, .LBB0_96 # %bb.64: # %.lr.ph350.i - addi.d 
$a0, $sp, 1560 + addi.d $a0, $sp, 1544 move $a1, $s6 .p2align 4, , 16 .LBB0_65: # =>This Inner Loop Header: Depth=1 @@ -925,8 +915,8 @@ rsdec_204: # @rsdec_204 # %bb.66: # %.preheader318.i beqz $s5, .LBB0_95 # %bb.67: # %.preheader316.i.preheader - addi.d $s0, $sp, 56 - addi.d $a1, $sp, 1564 + addi.d $s0, $sp, 40 + addi.d $a1, $sp, 1548 slli.d $a2, $s5, 2 move $a0, $s0 pcaddu18i $ra, %call36(memcpy) @@ -935,9 +925,9 @@ rsdec_204: # @rsdec_204 addi.d $a2, $s6, -1 ori $a3, $zero, 1 addi.w $a0, $zero, -1 - addi.d $a4, $sp, 156 + addi.d $a4, $sp, 140 ori $a5, $zero, 255 - addi.d $a6, $sp, 124 + addi.d $a6, $sp, 108 ori $a7, $zero, 256 b .LBB0_70 .p2align 4, , 16 @@ -996,12 +986,12 @@ rsdec_204: # @rsdec_204 bne $a1, $s5, .LBB0_97 # %bb.76: # %.lr.ph368.i move $a1, $zero - addi.d $a2, $sp, 1560 + addi.d $a2, $sp, 1544 ori $a3, $zero, 1 - addi.d $a4, $sp, 192 - addi.d $a5, $sp, 188 + addi.d $a4, $sp, 176 + addi.d $a5, $sp, 172 ori $a6, $zero, 2 - addi.d $a7, $sp, 88 + addi.d $a7, $sp, 72 b .LBB0_78 .p2align 4, , 16 .LBB0_77: # %._crit_edge365.i @@ -1102,7 +1092,7 @@ rsdec_204: # @rsdec_204 vld $vr2, $s1, 284 vshuf.b $vr1, $vr0, $vr1, $vr0 vshuf.b $vr2, $vr0, $vr2, $vr0 - ld.d $a1, $sp, 32 # 8-byte Folded Reload + ld.d $a1, $sp, 16 # 8-byte Folded Reload vstelm.w $vr1, $a1, 0, 0 vstelm.w $vr2, $a1, 4, 0 vld $vr1, $s1, 300 @@ -1245,18 +1235,18 @@ rsdec_204: # @rsdec_204 st.b $a0, $a1, 186 ld.b $a0, $s1, 1016 st.b $a0, $a1, 187 - ld.d $s8, $sp, 1624 # 8-byte Folded Reload - ld.d $s7, $sp, 1632 # 8-byte Folded Reload - ld.d $s6, $sp, 1640 # 8-byte Folded Reload - ld.d $s5, $sp, 1648 # 8-byte Folded Reload - ld.d $s4, $sp, 1656 # 8-byte Folded Reload - ld.d $s3, $sp, 1664 # 8-byte Folded Reload - ld.d $s2, $sp, 1672 # 8-byte Folded Reload - ld.d $s1, $sp, 1680 # 8-byte Folded Reload - ld.d $s0, $sp, 1688 # 8-byte Folded Reload - ld.d $fp, $sp, 1696 # 8-byte Folded Reload - ld.d $ra, $sp, 1704 # 8-byte Folded Reload - addi.d $sp, $sp, 1712 + ld.d $s8, $sp, 1608 # 8-byte Folded Reload + ld.d $s7, $sp, 1616 # 8-byte Folded Reload + ld.d $s6, $sp, 1624 # 8-byte Folded Reload + ld.d $s5, $sp, 1632 # 8-byte Folded Reload + ld.d $s4, $sp, 1640 # 8-byte Folded Reload + ld.d $s3, $sp, 1648 # 8-byte Folded Reload + ld.d $s2, $sp, 1656 # 8-byte Folded Reload + ld.d $s1, $sp, 1664 # 8-byte Folded Reload + ld.d $s0, $sp, 1672 # 8-byte Folded Reload + ld.d $fp, $sp, 1680 # 8-byte Folded Reload + ld.d $ra, $sp, 1688 # 8-byte Folded Reload + addi.d $sp, $sp, 1696 ret .LBB0_95: ori $s6, $zero, 1 @@ -1309,9 +1299,9 @@ rsdec_204: # @rsdec_204 # %bb.106: # %.lr.ph383.i move $a1, $zero addi.d $a2, $s6, -1 - addi.d $a3, $sp, 92 - addi.d $a4, $sp, 124 - addi.d $a5, $sp, 156 + addi.d $a3, $sp, 76 + addi.d $a4, $sp, 108 + addi.d $a5, $sp, 140 b .LBB0_109 .p2align 4, , 16 .LBB0_107: # %._crit_edge379.i @@ -1394,7 +1384,7 @@ rsdec_204: # @rsdec_204 ldx.w $t0, $s3, $t0 move $t2, $zero move $t1, $zero - addi.d $t3, $sp, 124 + addi.d $t3, $sp, 108 b .LBB0_116 .p2align 4, , 16 .LBB0_115: # in Loop: Header=BB0_116 Depth=2 @@ -1532,235 +1522,234 @@ rsenc_204: # @rsenc_204 jirl $ra, $ra, 0 ld.w $a0, $s0, 0 ld.w $a1, $s0, 4 - vinsgr2vr.w $vr1, $a0, 0 - vinsgr2vr.w $vr2, $a1, 0 - vrepli.b $vr0, 0 - vilvl.b $vr1, $vr0, $vr1 - vilvl.h $vr1, $vr0, $vr1 - vilvl.b $vr2, $vr0, $vr2 - vilvl.h $vr2, $vr0, $vr2 - vst $vr1, $s1, 204 - vst $vr2, $s1, 220 + vinsgr2vr.w $vr0, $a0, 0 + vinsgr2vr.w $vr1, $a1, 0 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vst $vr0, $s1, 204 
+ vst $vr1, $s1, 220 ld.w $a0, $s0, 8 ld.w $a1, $s0, 12 - vinsgr2vr.w $vr1, $a0, 0 - vinsgr2vr.w $vr2, $a1, 0 - vilvl.b $vr1, $vr0, $vr1 - vilvl.h $vr1, $vr0, $vr1 - vilvl.b $vr2, $vr0, $vr2 - vilvl.h $vr2, $vr0, $vr2 - vst $vr1, $s1, 236 - vst $vr2, $s1, 252 + vinsgr2vr.w $vr0, $a0, 0 + vinsgr2vr.w $vr1, $a1, 0 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vst $vr0, $s1, 236 + vst $vr1, $s1, 252 ld.w $a0, $s0, 16 ld.w $a1, $s0, 20 - vinsgr2vr.w $vr1, $a0, 0 - vinsgr2vr.w $vr2, $a1, 0 - vilvl.b $vr1, $vr0, $vr1 - vilvl.h $vr1, $vr0, $vr1 - vilvl.b $vr2, $vr0, $vr2 - vilvl.h $vr2, $vr0, $vr2 - vst $vr1, $s1, 268 - vst $vr2, $s1, 284 + vinsgr2vr.w $vr0, $a0, 0 + vinsgr2vr.w $vr1, $a1, 0 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vst $vr0, $s1, 268 + vst $vr1, $s1, 284 ld.w $a0, $s0, 24 ld.w $a1, $s0, 28 - vinsgr2vr.w $vr1, $a0, 0 - vinsgr2vr.w $vr2, $a1, 0 - vilvl.b $vr1, $vr0, $vr1 - vilvl.h $vr1, $vr0, $vr1 - vilvl.b $vr2, $vr0, $vr2 - vilvl.h $vr2, $vr0, $vr2 - vst $vr1, $s1, 300 - vst $vr2, $s1, 316 + vinsgr2vr.w $vr0, $a0, 0 + vinsgr2vr.w $vr1, $a1, 0 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vst $vr0, $s1, 300 + vst $vr1, $s1, 316 ld.w $a0, $s0, 32 ld.w $a1, $s0, 36 - vinsgr2vr.w $vr1, $a0, 0 - vinsgr2vr.w $vr2, $a1, 0 - vilvl.b $vr1, $vr0, $vr1 - vilvl.h $vr1, $vr0, $vr1 - vilvl.b $vr2, $vr0, $vr2 - vilvl.h $vr2, $vr0, $vr2 - vst $vr1, $s1, 332 - vst $vr2, $s1, 348 + vinsgr2vr.w $vr0, $a0, 0 + vinsgr2vr.w $vr1, $a1, 0 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vst $vr0, $s1, 332 + vst $vr1, $s1, 348 ld.w $a0, $s0, 40 ld.w $a1, $s0, 44 - vinsgr2vr.w $vr1, $a0, 0 - vinsgr2vr.w $vr2, $a1, 0 - vilvl.b $vr1, $vr0, $vr1 - vilvl.h $vr1, $vr0, $vr1 - vilvl.b $vr2, $vr0, $vr2 - vilvl.h $vr2, $vr0, $vr2 - vst $vr1, $s1, 364 - vst $vr2, $s1, 380 + vinsgr2vr.w $vr0, $a0, 0 + vinsgr2vr.w $vr1, $a1, 0 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vst $vr0, $s1, 364 + vst $vr1, $s1, 380 ld.w $a0, $s0, 48 ld.w $a1, $s0, 52 - vinsgr2vr.w $vr1, $a0, 0 - vinsgr2vr.w $vr2, $a1, 0 - vilvl.b $vr1, $vr0, $vr1 - vilvl.h $vr1, $vr0, $vr1 - vilvl.b $vr2, $vr0, $vr2 - vilvl.h $vr2, $vr0, $vr2 - vst $vr1, $s1, 396 - vst $vr2, $s1, 412 + vinsgr2vr.w $vr0, $a0, 0 + vinsgr2vr.w $vr1, $a1, 0 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vst $vr0, $s1, 396 + vst $vr1, $s1, 412 ld.w $a0, $s0, 56 ld.w $a1, $s0, 60 - vinsgr2vr.w $vr1, $a0, 0 - vinsgr2vr.w $vr2, $a1, 0 - vilvl.b $vr1, $vr0, $vr1 - vilvl.h $vr1, $vr0, $vr1 - vilvl.b $vr2, $vr0, $vr2 - vilvl.h $vr2, $vr0, $vr2 - vst $vr1, $s1, 428 - vst $vr2, $s1, 444 + vinsgr2vr.w $vr0, $a0, 0 + vinsgr2vr.w $vr1, $a1, 0 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vst $vr0, $s1, 428 + vst $vr1, $s1, 444 ld.w $a0, $s0, 64 ld.w $a1, $s0, 68 - vinsgr2vr.w $vr1, $a0, 0 - vinsgr2vr.w $vr2, $a1, 0 - vilvl.b $vr1, $vr0, $vr1 - vilvl.h $vr1, $vr0, $vr1 - vilvl.b $vr2, $vr0, $vr2 - vilvl.h $vr2, $vr0, $vr2 - vst $vr1, $s1, 460 - vst $vr2, $s1, 476 + vinsgr2vr.w $vr0, $a0, 0 + vinsgr2vr.w $vr1, $a1, 0 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu 
$vr0, $vr0, 0 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vst $vr0, $s1, 460 + vst $vr1, $s1, 476 ld.w $a0, $s0, 72 ld.w $a1, $s0, 76 - vinsgr2vr.w $vr1, $a0, 0 - vinsgr2vr.w $vr2, $a1, 0 - vilvl.b $vr1, $vr0, $vr1 - vilvl.h $vr1, $vr0, $vr1 - vilvl.b $vr2, $vr0, $vr2 - vilvl.h $vr2, $vr0, $vr2 - vst $vr1, $s1, 492 - vst $vr2, $s1, 508 + vinsgr2vr.w $vr0, $a0, 0 + vinsgr2vr.w $vr1, $a1, 0 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vst $vr0, $s1, 492 + vst $vr1, $s1, 508 ld.w $a0, $s0, 80 ld.w $a1, $s0, 84 - vinsgr2vr.w $vr1, $a0, 0 - vinsgr2vr.w $vr2, $a1, 0 - vilvl.b $vr1, $vr0, $vr1 - vilvl.h $vr1, $vr0, $vr1 - vilvl.b $vr2, $vr0, $vr2 - vilvl.h $vr2, $vr0, $vr2 - vst $vr1, $s1, 524 - vst $vr2, $s1, 540 + vinsgr2vr.w $vr0, $a0, 0 + vinsgr2vr.w $vr1, $a1, 0 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vst $vr0, $s1, 524 + vst $vr1, $s1, 540 ld.w $a0, $s0, 88 ld.w $a1, $s0, 92 - vinsgr2vr.w $vr1, $a0, 0 - vinsgr2vr.w $vr2, $a1, 0 - vilvl.b $vr1, $vr0, $vr1 - vilvl.h $vr1, $vr0, $vr1 - vilvl.b $vr2, $vr0, $vr2 - vilvl.h $vr2, $vr0, $vr2 - vst $vr1, $s1, 556 - vst $vr2, $s1, 572 + vinsgr2vr.w $vr0, $a0, 0 + vinsgr2vr.w $vr1, $a1, 0 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vst $vr0, $s1, 556 + vst $vr1, $s1, 572 ld.w $a0, $s0, 96 ld.w $a1, $s0, 100 - vinsgr2vr.w $vr1, $a0, 0 - vinsgr2vr.w $vr2, $a1, 0 - vilvl.b $vr1, $vr0, $vr1 - vilvl.h $vr1, $vr0, $vr1 - vilvl.b $vr2, $vr0, $vr2 - vilvl.h $vr2, $vr0, $vr2 - vst $vr1, $s1, 588 - vst $vr2, $s1, 604 + vinsgr2vr.w $vr0, $a0, 0 + vinsgr2vr.w $vr1, $a1, 0 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vst $vr0, $s1, 588 + vst $vr1, $s1, 604 ld.w $a0, $s0, 104 ld.w $a1, $s0, 108 - vinsgr2vr.w $vr1, $a0, 0 - vinsgr2vr.w $vr2, $a1, 0 - vilvl.b $vr1, $vr0, $vr1 - vilvl.h $vr1, $vr0, $vr1 - vilvl.b $vr2, $vr0, $vr2 - vilvl.h $vr2, $vr0, $vr2 - vst $vr1, $s1, 620 - vst $vr2, $s1, 636 + vinsgr2vr.w $vr0, $a0, 0 + vinsgr2vr.w $vr1, $a1, 0 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vst $vr0, $s1, 620 + vst $vr1, $s1, 636 ld.w $a0, $s0, 112 ld.w $a1, $s0, 116 - vinsgr2vr.w $vr1, $a0, 0 - vinsgr2vr.w $vr2, $a1, 0 - vilvl.b $vr1, $vr0, $vr1 - vilvl.h $vr1, $vr0, $vr1 - vilvl.b $vr2, $vr0, $vr2 - vilvl.h $vr2, $vr0, $vr2 - vst $vr1, $s1, 652 - vst $vr2, $s1, 668 + vinsgr2vr.w $vr0, $a0, 0 + vinsgr2vr.w $vr1, $a1, 0 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vst $vr0, $s1, 652 + vst $vr1, $s1, 668 ld.w $a0, $s0, 120 ld.w $a1, $s0, 124 - vinsgr2vr.w $vr1, $a0, 0 - vinsgr2vr.w $vr2, $a1, 0 - vilvl.b $vr1, $vr0, $vr1 - vilvl.h $vr1, $vr0, $vr1 - vilvl.b $vr2, $vr0, $vr2 - vilvl.h $vr2, $vr0, $vr2 - vst $vr1, $s1, 684 - vst $vr2, $s1, 700 + vinsgr2vr.w $vr0, $a0, 0 + vinsgr2vr.w $vr1, $a1, 0 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vst $vr0, $s1, 684 + vst $vr1, $s1, 700 ld.w $a0, $s0, 128 ld.w $a1, $s0, 132 - vinsgr2vr.w $vr1, $a0, 0 - vinsgr2vr.w $vr2, $a1, 0 - vilvl.b $vr1, $vr0, $vr1 - vilvl.h $vr1, $vr0, $vr1 - vilvl.b $vr2, $vr0, $vr2 - vilvl.h $vr2, $vr0, $vr2 - vst $vr1, $s1, 716 - vst $vr2, $s1, 
732 + vinsgr2vr.w $vr0, $a0, 0 + vinsgr2vr.w $vr1, $a1, 0 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vst $vr0, $s1, 716 + vst $vr1, $s1, 732 ld.w $a0, $s0, 136 ld.w $a1, $s0, 140 - vinsgr2vr.w $vr1, $a0, 0 - vinsgr2vr.w $vr2, $a1, 0 - vilvl.b $vr1, $vr0, $vr1 - vilvl.h $vr1, $vr0, $vr1 - vilvl.b $vr2, $vr0, $vr2 - vilvl.h $vr2, $vr0, $vr2 - vst $vr1, $s1, 748 - vst $vr2, $s1, 764 + vinsgr2vr.w $vr0, $a0, 0 + vinsgr2vr.w $vr1, $a1, 0 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vst $vr0, $s1, 748 + vst $vr1, $s1, 764 ld.w $a0, $s0, 144 ld.w $a1, $s0, 148 - vinsgr2vr.w $vr1, $a0, 0 - vinsgr2vr.w $vr2, $a1, 0 - vilvl.b $vr1, $vr0, $vr1 - vilvl.h $vr1, $vr0, $vr1 - vilvl.b $vr2, $vr0, $vr2 - vilvl.h $vr2, $vr0, $vr2 - vst $vr1, $s1, 780 - vst $vr2, $s1, 796 + vinsgr2vr.w $vr0, $a0, 0 + vinsgr2vr.w $vr1, $a1, 0 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vst $vr0, $s1, 780 + vst $vr1, $s1, 796 ld.w $a0, $s0, 152 ld.w $a1, $s0, 156 - vinsgr2vr.w $vr1, $a0, 0 - vinsgr2vr.w $vr2, $a1, 0 - vilvl.b $vr1, $vr0, $vr1 - vilvl.h $vr1, $vr0, $vr1 - vilvl.b $vr2, $vr0, $vr2 - vilvl.h $vr2, $vr0, $vr2 - vst $vr1, $s1, 812 - vst $vr2, $s1, 828 + vinsgr2vr.w $vr0, $a0, 0 + vinsgr2vr.w $vr1, $a1, 0 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vst $vr0, $s1, 812 + vst $vr1, $s1, 828 ld.w $a0, $s0, 160 ld.w $a1, $s0, 164 - vinsgr2vr.w $vr1, $a0, 0 - vinsgr2vr.w $vr2, $a1, 0 - vilvl.b $vr1, $vr0, $vr1 - vilvl.h $vr1, $vr0, $vr1 - vilvl.b $vr2, $vr0, $vr2 - vilvl.h $vr2, $vr0, $vr2 - vst $vr1, $s1, 844 - vst $vr2, $s1, 860 + vinsgr2vr.w $vr0, $a0, 0 + vinsgr2vr.w $vr1, $a1, 0 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vst $vr0, $s1, 844 + vst $vr1, $s1, 860 ld.w $a0, $s0, 168 ld.w $a1, $s0, 172 - vinsgr2vr.w $vr1, $a0, 0 - vinsgr2vr.w $vr2, $a1, 0 - vilvl.b $vr1, $vr0, $vr1 - vilvl.h $vr1, $vr0, $vr1 - vilvl.b $vr2, $vr0, $vr2 - vilvl.h $vr2, $vr0, $vr2 - vst $vr1, $s1, 876 - vst $vr2, $s1, 892 + vinsgr2vr.w $vr0, $a0, 0 + vinsgr2vr.w $vr1, $a1, 0 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vst $vr0, $s1, 876 + vst $vr1, $s1, 892 ld.w $a0, $s0, 176 ld.w $a1, $s0, 180 - vinsgr2vr.w $vr1, $a0, 0 - vinsgr2vr.w $vr2, $a1, 0 - vilvl.b $vr1, $vr0, $vr1 - vilvl.h $vr1, $vr0, $vr1 - vilvl.b $vr2, $vr0, $vr2 - vilvl.h $vr2, $vr0, $vr2 - vst $vr1, $s1, 908 - vst $vr2, $s1, 924 + vinsgr2vr.w $vr0, $a0, 0 + vinsgr2vr.w $vr1, $a1, 0 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.hu.bu $vr1, $vr1, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vst $vr0, $s1, 908 + vst $vr1, $s1, 924 ld.bu $a0, $s0, 184 st.w $a0, $s1, 940 ld.bu $a0, $s0, 185 @@ -1771,6 +1760,7 @@ rsenc_204: # @rsenc_204 st.w $a0, $s1, 952 pcalau12i $a0, %pc_hi20(bb) addi.d $a0, $a0, %pc_lo12(bb) + vrepli.b $vr0, 0 vst $vr0, $a0, 48 vst $vr0, $a0, 32 vst $vr0, $a0, 16 diff --git a/results/SingleSource/Benchmarks/Misc/CMakeFiles/evalloop.dir/evalloop.s b/results/SingleSource/Benchmarks/Misc/CMakeFiles/evalloop.dir/evalloop.s index 15219741..4039b11f 100644 --- a/results/SingleSource/Benchmarks/Misc/CMakeFiles/evalloop.dir/evalloop.s +++ 
b/results/SingleSource/Benchmarks/Misc/CMakeFiles/evalloop.dir/evalloop.s @@ -953,7 +953,6 @@ main: # @main ori $a1, $a1, 529 lu32i.d $a1, 135300 lu52i.d $a1, $a1, 132 - vrepli.b $vr1, 0 addi.d $a2, $sp, 16 lu12i.w $a4, 1 ori $a3, $a4, 4064 @@ -961,7 +960,7 @@ main: # @main .p2align 4, , 16 .LBB2_1: # %vector.body # =>This Inner Loop Header: Depth=1 - vaddi.hu $vr2, $vr0, 4 + vaddi.hu $vr1, $vr0, 4 vpickve2gr.h $a5, $vr0, 1 bstrpick.d $a6, $a5, 15, 0 mulh.du $a6, $a6, $a1 @@ -974,57 +973,57 @@ main: # @main slli.d $t0, $a7, 5 sub.d $a7, $a7, $t0 add.d $a6, $a6, $a7 - vinsgr2vr.h $vr3, $a6, 0 - vinsgr2vr.h $vr3, $a5, 1 + vinsgr2vr.h $vr2, $a6, 0 + vinsgr2vr.h $vr2, $a5, 1 vpickve2gr.h $a5, $vr0, 2 bstrpick.d $a6, $a5, 15, 0 mulh.du $a6, $a6, $a1 slli.d $a7, $a6, 5 sub.d $a6, $a6, $a7 add.d $a5, $a5, $a6 - vinsgr2vr.h $vr3, $a5, 2 + vinsgr2vr.h $vr2, $a5, 2 vpickve2gr.h $a5, $vr0, 3 bstrpick.d $a6, $a5, 15, 0 mulh.du $a6, $a6, $a1 slli.d $a7, $a6, 5 sub.d $a6, $a6, $a7 add.d $a5, $a5, $a6 - vinsgr2vr.h $vr3, $a5, 3 - vpickve2gr.h $a5, $vr2, 1 + vinsgr2vr.h $vr2, $a5, 3 + vpickve2gr.h $a5, $vr1, 1 bstrpick.d $a6, $a5, 15, 0 mulh.du $a6, $a6, $a1 slli.d $a7, $a6, 5 sub.d $a6, $a6, $a7 add.d $a5, $a5, $a6 - vpickve2gr.h $a6, $vr2, 0 + vpickve2gr.h $a6, $vr1, 0 bstrpick.d $a6, $a6, 15, 0 mulh.du $a7, $a6, $a1 slli.d $t0, $a7, 5 sub.d $a7, $a7, $t0 add.d $a6, $a6, $a7 - vinsgr2vr.h $vr4, $a6, 0 - vinsgr2vr.h $vr4, $a5, 1 - vpickve2gr.h $a5, $vr2, 2 + vinsgr2vr.h $vr3, $a6, 0 + vinsgr2vr.h $vr3, $a5, 1 + vpickve2gr.h $a5, $vr1, 2 bstrpick.d $a6, $a5, 15, 0 mulh.du $a6, $a6, $a1 slli.d $a7, $a6, 5 sub.d $a6, $a6, $a7 add.d $a5, $a5, $a6 - vinsgr2vr.h $vr4, $a5, 2 - vpickve2gr.h $a5, $vr2, 3 + vinsgr2vr.h $vr3, $a5, 2 + vpickve2gr.h $a5, $vr1, 3 bstrpick.d $a6, $a5, 15, 0 mulh.du $a6, $a6, $a1 slli.d $a7, $a6, 5 sub.d $a6, $a6, $a7 add.d $a5, $a5, $a6 - vinsgr2vr.h $vr4, $a5, 3 + vinsgr2vr.h $vr3, $a5, 3 + vaddi.hu $vr1, $vr2, 1 vaddi.hu $vr2, $vr3, 1 - vaddi.hu $vr3, $vr4, 1 - vilvl.h $vr2, $vr1, $vr2 - vilvl.h $vr3, $vr1, $vr3 + vsllwil.wu.hu $vr1, $vr1, 0 + vsllwil.wu.hu $vr2, $vr2, 0 add.d $a5, $a2, $a0 - vstx $vr2, $a5, $a3 - vstx $vr3, $a5, $a4 + vstx $vr1, $a5, $a3 + vstx $vr2, $a5, $a4 addi.d $a0, $a0, 32 vaddi.hu $vr0, $vr0, 8 bnez $a0, .LBB2_1 diff --git a/results/SingleSource/Benchmarks/Misc/CMakeFiles/perlin.dir/perlin.s b/results/SingleSource/Benchmarks/Misc/CMakeFiles/perlin.dir/perlin.s index 6e5f2174..5818492b 100644 --- a/results/SingleSource/Benchmarks/Misc/CMakeFiles/perlin.dir/perlin.s +++ b/results/SingleSource/Benchmarks/Misc/CMakeFiles/perlin.dir/perlin.s @@ -162,34 +162,24 @@ main: # @main vinsgr2vr.d $vr26, $a7, 0 vand.v $vr27, $vr26, $vr8 vslti.wu $vr28, $vr27, 4 - vshuf4i.w $vr28, $vr28, 16 - vslli.d $vr28, $vr28, 32 - vsrai.d $vr28, $vr28, 32 + vsllwil.d.w $vr28, $vr28, 0 vand.v $vr29, $vr26, $vr16 vseqi.w $vr29, $vr29, 12 - vshuf4i.w $vr29, $vr29, 16 - vslli.d $vr29, $vr29, 32 - vsrai.d $vr29, $vr29, 32 + vsllwil.d.w $vr29, $vr29, 0 vbitsel.v $vr29, $vr9, $vr21, $vr29 vreplvei.d $vr25, $vr25, 0 vbitsel.v $vr28, $vr29, $vr25, $vr28 vslti.wu $vr27, $vr27, 8 - vshuf4i.w $vr27, $vr27, 16 - vslli.d $vr27, $vr27, 32 - vsrai.d $vr27, $vr27, 32 + vsllwil.d.w $vr27, $vr27, 0 vbitsel.v $vr27, $vr25, $vr21, $vr27 vand.v $vr29, $vr26, $vr17 vseqi.w $vr29, $vr29, 0 - vshuf4i.w $vr29, $vr29, 16 - vslli.d $vr29, $vr29, 32 - vsrai.d $vr29, $vr29, 32 + vsllwil.d.w $vr29, $vr29, 0 vbitrevi.d $vr30, $vr27, 63 vbitsel.v $vr27, $vr30, $vr27, $vr29 vand.v $vr26, $vr26, $vr18 vseqi.w $vr26, 
$vr26, 0 - vshuf4i.w $vr26, $vr26, 16 - vslli.d $vr26, $vr26, 32 - vsrai.d $vr26, $vr26, 32 + vsllwil.d.w $vr26, $vr26, 0 ld.d $a5, $a6, 396 vbitrevi.d $vr29, $vr28, 63 vbitsel.v $vr26, $vr29, $vr28, $vr26 @@ -197,33 +187,23 @@ main: # @main vinsgr2vr.d $vr27, $a5, 0 vand.v $vr28, $vr27, $vr8 vslti.wu $vr29, $vr28, 4 - vshuf4i.w $vr29, $vr29, 16 - vslli.d $vr29, $vr29, 32 - vsrai.d $vr29, $vr29, 32 + vsllwil.d.w $vr29, $vr29, 0 vand.v $vr30, $vr27, $vr16 vseqi.w $vr30, $vr30, 12 - vshuf4i.w $vr30, $vr30, 16 - vslli.d $vr30, $vr30, 32 - vsrai.d $vr30, $vr30, 32 + vsllwil.d.w $vr30, $vr30, 0 vbitsel.v $vr30, $vr9, $vr20, $vr30 vbitsel.v $vr29, $vr30, $vr25, $vr29 vslti.wu $vr28, $vr28, 8 - vshuf4i.w $vr28, $vr28, 16 - vslli.d $vr28, $vr28, 32 - vsrai.d $vr28, $vr28, 32 + vsllwil.d.w $vr28, $vr28, 0 vbitsel.v $vr25, $vr25, $vr20, $vr28 vand.v $vr28, $vr27, $vr17 vseqi.w $vr28, $vr28, 0 - vshuf4i.w $vr28, $vr28, 16 - vslli.d $vr28, $vr28, 32 - vsrai.d $vr28, $vr28, 32 + vsllwil.d.w $vr28, $vr28, 0 vbitrevi.d $vr30, $vr25, 63 vbitsel.v $vr25, $vr30, $vr25, $vr28 vand.v $vr27, $vr27, $vr18 vseqi.w $vr27, $vr27, 0 - vshuf4i.w $vr27, $vr27, 16 - vslli.d $vr27, $vr27, 32 - vsrai.d $vr27, $vr27, 32 + vsllwil.d.w $vr27, $vr27, 0 vbitrevi.d $vr28, $vr29, 63 vbitsel.v $vr27, $vr28, $vr29, $vr27 ld.d $a4, $a4, 396 @@ -233,34 +213,24 @@ main: # @main vinsgr2vr.d $vr26, $a4, 0 vand.v $vr27, $vr26, $vr8 vslti.wu $vr28, $vr27, 4 - vshuf4i.w $vr28, $vr28, 16 - vslli.d $vr28, $vr28, 32 - vsrai.d $vr28, $vr28, 32 + vsllwil.d.w $vr28, $vr28, 0 vand.v $vr29, $vr26, $vr16 vseqi.w $vr29, $vr29, 12 - vshuf4i.w $vr29, $vr29, 16 - vslli.d $vr29, $vr29, 32 - vsrai.d $vr29, $vr29, 32 + vsllwil.d.w $vr29, $vr29, 0 vbitsel.v $vr29, $vr9, $vr21, $vr29 vreplvei.d $vr24, $vr24, 0 vbitsel.v $vr28, $vr29, $vr24, $vr28 vslti.wu $vr27, $vr27, 8 - vshuf4i.w $vr27, $vr27, 16 - vslli.d $vr27, $vr27, 32 - vsrai.d $vr27, $vr27, 32 + vsllwil.d.w $vr27, $vr27, 0 vbitsel.v $vr27, $vr24, $vr21, $vr27 vand.v $vr29, $vr26, $vr17 vseqi.w $vr29, $vr29, 0 - vshuf4i.w $vr29, $vr29, 16 - vslli.d $vr29, $vr29, 32 - vsrai.d $vr29, $vr29, 32 + vsllwil.d.w $vr29, $vr29, 0 vbitrevi.d $vr30, $vr27, 63 vbitsel.v $vr27, $vr30, $vr27, $vr29 vand.v $vr26, $vr26, $vr18 vseqi.w $vr26, $vr26, 0 - vshuf4i.w $vr26, $vr26, 16 - vslli.d $vr26, $vr26, 32 - vsrai.d $vr26, $vr26, 32 + vsllwil.d.w $vr26, $vr26, 0 ld.d $a3, $a3, 396 vbitrevi.d $vr29, $vr28, 63 vbitsel.v $vr26, $vr29, $vr28, $vr26 @@ -268,33 +238,23 @@ main: # @main vinsgr2vr.d $vr27, $a3, 0 vand.v $vr28, $vr27, $vr8 vslti.wu $vr29, $vr28, 4 - vshuf4i.w $vr29, $vr29, 16 - vslli.d $vr29, $vr29, 32 - vsrai.d $vr29, $vr29, 32 + vsllwil.d.w $vr29, $vr29, 0 vand.v $vr30, $vr27, $vr16 vseqi.w $vr30, $vr30, 12 - vshuf4i.w $vr30, $vr30, 16 - vslli.d $vr30, $vr30, 32 - vsrai.d $vr30, $vr30, 32 + vsllwil.d.w $vr30, $vr30, 0 vbitsel.v $vr30, $vr9, $vr20, $vr30 vbitsel.v $vr29, $vr30, $vr24, $vr29 vslti.wu $vr28, $vr28, 8 - vshuf4i.w $vr28, $vr28, 16 - vslli.d $vr28, $vr28, 32 - vsrai.d $vr28, $vr28, 32 + vsllwil.d.w $vr28, $vr28, 0 vbitsel.v $vr24, $vr24, $vr20, $vr28 vand.v $vr28, $vr27, $vr17 vseqi.w $vr28, $vr28, 0 - vshuf4i.w $vr28, $vr28, 16 - vslli.d $vr28, $vr28, 32 - vsrai.d $vr28, $vr28, 32 + vsllwil.d.w $vr28, $vr28, 0 vbitrevi.d $vr30, $vr24, 63 vbitsel.v $vr24, $vr30, $vr24, $vr28 vand.v $vr27, $vr27, $vr18 vseqi.w $vr27, $vr27, 0 - vshuf4i.w $vr27, $vr27, 16 - vslli.d $vr27, $vr27, 32 - vsrai.d $vr27, $vr27, 32 + vsllwil.d.w $vr27, $vr27, 0 vbitrevi.d $vr28, $vr29, 63 vbitsel.v $vr27, $vr28, 
$vr29, $vr27 vfadd.d $vr24, $vr24, $vr27 diff --git a/results/SingleSource/Benchmarks/Misc/CMakeFiles/revertBits.dir/revertBits.s b/results/SingleSource/Benchmarks/Misc/CMakeFiles/revertBits.dir/revertBits.s index 55889dfe..37173aee 100644 --- a/results/SingleSource/Benchmarks/Misc/CMakeFiles/revertBits.dir/revertBits.s +++ b/results/SingleSource/Benchmarks/Misc/CMakeFiles/revertBits.dir/revertBits.s @@ -25,11 +25,6 @@ ReverseBits64: # @ReverseBits64 .LCPI2_0: .dword 0 # 0x0 .dword 1 # 0x1 -.LCPI2_1: - .word 1 # 0x1 - .word 5 # 0x5 - .word 0 # 0x0 - .word 7 # 0x7 .text .globl main .p2align 5 @@ -63,60 +58,57 @@ main: # @main addi.w $fp, $fp, 1 bne $s0, $s1, .LBB2_1 # %bb.2: # %vector.ph - vrepli.b $vr1, 0 - vori.b $vr0, $vr1, 0 - vinsgr2vr.d $vr0, $s3, 0 - vori.b $vr2, $vr1, 0 + vrepli.b $vr0, 0 + vori.b $vr1, $vr0, 0 + vinsgr2vr.d $vr1, $s3, 0 + vori.b $vr2, $vr0, 0 vinsgr2vr.d $vr2, $s2, 0 pcalau12i $a0, %pc_hi20(.LCPI2_0) vld $vr3, $a0, %pc_lo12(.LCPI2_0) - pcalau12i $a0, %pc_hi20(.LCPI2_1) - vld $vr4, $a0, %pc_lo12(.LCPI2_1) ori $a0, $zero, 0 lu32i.d $a0, 1 - vreplgr2vr.d $vr6, $a0 - vori.b $vr5, $vr1, 0 - vori.b $vr7, $vr1, 0 + vreplgr2vr.d $vr4, $a0 + vori.b $vr5, $vr0, 0 .p2align 4, , 16 .LBB2_3: # %vector.body # =>This Inner Loop Header: Depth=1 - vaddi.du $vr8, $vr3, 2 - vaddi.wu $vr9, $vr6, 2 - vpickve2gr.d $a0, $vr6, 0 + vaddi.du $vr6, $vr3, 2 + vaddi.wu $vr7, $vr4, 2 + vpickve2gr.d $a0, $vr4, 0 bitrev.d $a0, $a0 - vinsgr2vr.d $vr10, $a0, 0 - vpickve2gr.d $a0, $vr9, 0 + vinsgr2vr.d $vr8, $a0, 0 + vshuf4i.w $vr8, $vr8, 177 + vpickve2gr.d $a0, $vr7, 0 bitrev.d $a0, $a0 - vinsgr2vr.d $vr9, $a0, 0 - vori.b $vr11, $vr4, 0 - vshuf.w $vr11, $vr1, $vr10 - vori.b $vr10, $vr4, 0 - vshuf.w $vr10, $vr1, $vr9 - vsub.d $vr0, $vr0, $vr11 - vsub.d $vr5, $vr5, $vr10 + vinsgr2vr.d $vr7, $a0, 0 + vshuf4i.w $vr7, $vr7, 177 + vsllwil.du.wu $vr8, $vr8, 0 + vsllwil.du.wu $vr7, $vr7, 0 + vsub.d $vr1, $vr1, $vr8 + vsub.d $vr0, $vr0, $vr7 vpickve2gr.d $a0, $vr3, 0 bitrev.d $a0, $a0 - vinsgr2vr.d $vr9, $a0, 0 + vinsgr2vr.d $vr7, $a0, 0 vpickve2gr.d $a0, $vr3, 1 bitrev.d $a0, $a0 - vinsgr2vr.d $vr9, $a0, 1 - vpickve2gr.d $a0, $vr8, 0 + vinsgr2vr.d $vr7, $a0, 1 + vpickve2gr.d $a0, $vr6, 0 bitrev.d $a0, $a0 - vinsgr2vr.d $vr10, $a0, 0 - vpickve2gr.d $a0, $vr8, 1 + vinsgr2vr.d $vr8, $a0, 0 + vpickve2gr.d $a0, $vr6, 1 bitrev.d $a0, $a0 - vinsgr2vr.d $vr10, $a0, 1 - vsub.d $vr2, $vr2, $vr9 - vsub.d $vr7, $vr7, $vr10 + vinsgr2vr.d $vr8, $a0, 1 + vsub.d $vr2, $vr2, $vr7 + vsub.d $vr5, $vr5, $vr8 vaddi.du $vr3, $vr3, 4 addi.d $s1, $s1, -4 - vaddi.wu $vr6, $vr6, 4 + vaddi.wu $vr4, $vr4, 4 bnez $s1, .LBB2_3 # %bb.4: # %middle.block - vadd.d $vr1, $vr7, $vr2 - vhaddw.q.d $vr1, $vr1, $vr1 - vpickve2gr.d $fp, $vr1, 0 - vadd.d $vr0, $vr5, $vr0 + vadd.d $vr2, $vr5, $vr2 + vhaddw.q.d $vr2, $vr2, $vr2 + vpickve2gr.d $fp, $vr2, 0 + vadd.d $vr0, $vr0, $vr1 vhaddw.q.d $vr0, $vr0, $vr0 vpickve2gr.d $s0, $vr0, 0 pcalau12i $a0, %pc_hi20(.L.str.2) diff --git a/results/SingleSource/Regression/C/gcc-c-torture/execute/CMakeFiles/GCC-C-execute-930921-1.dir/930921-1.s b/results/SingleSource/Regression/C/gcc-c-torture/execute/CMakeFiles/GCC-C-execute-930921-1.dir/930921-1.s index b32360b7..0eefefda 100644 --- a/results/SingleSource/Regression/C/gcc-c-torture/execute/CMakeFiles/GCC-C-execute-930921-1.dir/930921-1.s +++ b/results/SingleSource/Regression/C/gcc-c-torture/execute/CMakeFiles/GCC-C-execute-930921-1.dir/930921-1.s @@ -65,14 +65,13 @@ f: # @f .type main,@function main: # @main # %bb.0: # %vector.ph - addi.d $sp, $sp, -80 - st.d 
$ra, $sp, 72 # 8-byte Folded Spill - fst.d $fs0, $sp, 64 # 8-byte Folded Spill - fst.d $fs1, $sp, 56 # 8-byte Folded Spill - fst.d $fs2, $sp, 48 # 8-byte Folded Spill - fst.d $fs3, $sp, 40 # 8-byte Folded Spill - fst.d $fs4, $sp, 32 # 8-byte Folded Spill - fst.d $fs5, $sp, 24 # 8-byte Folded Spill + addi.d $sp, $sp, -64 + st.d $ra, $sp, 56 # 8-byte Folded Spill + fst.d $fs0, $sp, 48 # 8-byte Folded Spill + fst.d $fs1, $sp, 40 # 8-byte Folded Spill + fst.d $fs2, $sp, 32 # 8-byte Folded Spill + fst.d $fs3, $sp, 24 # 8-byte Folded Spill + fst.d $fs4, $sp, 16 # 8-byte Folded Spill pcalau12i $a0, %pc_hi20(.LCPI1_0) vld $vr0, $a0, %pc_lo12(.LCPI1_0) pcalau12i $a0, %pc_hi20(.LCPI1_1) @@ -100,112 +99,121 @@ main: # @main lu32i.d $a1, 0 vreplgr2vr.d $vr10, $a1 vrepli.h $vr11, 3 - vrepli.b $vr12, 0 - vrepli.b $vr13, -1 + vrepli.b $vr12, -1 .p2align 4, , 16 .LBB1_1: # %vector.body # =>This Inner Loop Header: Depth=1 vmul.d $vr16, $vr9, $vr10 - vmul.d $vr17, $vr8, $vr10 - vmul.d $vr18, $vr7, $vr10 - vmul.d $vr19, $vr6, $vr10 - vmul.d $vr20, $vr5, $vr10 - vmul.d $vr21, $vr4, $vr10 - vmul.d $vr15, $vr3, $vr10 - vmul.d $vr14, $vr2, $vr10 + vmul.d $vr18, $vr8, $vr10 + vmul.d $vr19, $vr7, $vr10 + vmul.d $vr20, $vr6, $vr10 + vmul.d $vr17, $vr5, $vr10 + vmul.d $vr15, $vr4, $vr10 + vmul.d $vr14, $vr3, $vr10 + vmul.d $vr13, $vr2, $vr10 + vsrli.d $vr13, $vr13, 33 vsrli.d $vr14, $vr14, 33 vsrli.d $vr15, $vr15, 33 - vsrli.d $vr21, $vr21, 33 + vsrli.d $vr17, $vr17, 33 vsrli.d $vr20, $vr20, 33 vsrli.d $vr19, $vr19, 33 vsrli.d $vr18, $vr18, 33 - vsrli.d $vr17, $vr17, 33 - vsrli.d $vr16, $vr16, 33 + vsrli.d $vr21, $vr16, 33 vdiv.hu $vr22, $vr1, $vr11 vdiv.hu $vr23, $vr0, $vr11 - vilvh.h $vr24, $vr12, $vr23 - vilvh.w $vr25, $vr12, $vr24 - vilvl.w $vr24, $vr12, $vr24 - vilvl.h $vr23, $vr12, $vr23 - vilvh.w $vr26, $vr12, $vr23 - vilvl.w $vr23, $vr12, $vr23 - vilvh.h $vr27, $vr12, $vr22 - vilvh.w $vr28, $vr12, $vr27 - vilvl.w $vr27, $vr12, $vr27 - vilvl.h $vr22, $vr12, $vr22 - vilvh.w $vr29, $vr12, $vr22 - vilvl.w $vr22, $vr12, $vr22 - vseq.d $vr16, $vr16, $vr22 - vxor.v $vr16, $vr16, $vr13 - vseq.d $vr17, $vr17, $vr29 - vxor.v $vr17, $vr17, $vr13 - vpickev.w $vr17, $vr17, $vr16 - vseq.d $vr18, $vr18, $vr27 - vxor.v $vr18, $vr18, $vr13 - vseq.d $vr19, $vr19, $vr28 - vxor.v $vr19, $vr19, $vr13 - vpickev.w $vr18, $vr19, $vr18 - vpickev.h $vr17, $vr18, $vr17 - vpickve2gr.b $a1, $vr17, 2 + vbsrl.v $vr16, $vr23, 12 + vsllwil.wu.hu $vr16, $vr16, 0 + vsllwil.du.wu $vr16, $vr16, 0 + vbsrl.v $vr24, $vr23, 8 + vsllwil.wu.hu $vr24, $vr24, 0 + vsllwil.du.wu $vr24, $vr24, 0 + vshuf4i.h $vr25, $vr23, 14 + vsllwil.wu.hu $vr25, $vr25, 0 + vsllwil.du.wu $vr25, $vr25, 0 + vsllwil.wu.hu $vr23, $vr23, 0 + vsllwil.du.wu $vr23, $vr23, 0 + vbsrl.v $vr26, $vr22, 12 + vsllwil.wu.hu $vr26, $vr26, 0 + vsllwil.du.wu $vr26, $vr26, 0 + vbsrl.v $vr27, $vr22, 8 + vsllwil.wu.hu $vr27, $vr27, 0 + vsllwil.du.wu $vr27, $vr27, 0 + vshuf4i.h $vr28, $vr22, 14 + vsllwil.wu.hu $vr28, $vr28, 0 + vsllwil.du.wu $vr28, $vr28, 0 + vsllwil.wu.hu $vr22, $vr22, 0 + vsllwil.du.wu $vr22, $vr22, 0 + vseq.d $vr21, $vr21, $vr22 + vxor.v $vr21, $vr21, $vr12 + vseq.d $vr18, $vr18, $vr28 + vxor.v $vr18, $vr18, $vr12 + vpickev.w $vr18, $vr18, $vr21 + vseq.d $vr19, $vr19, $vr27 + vxor.v $vr19, $vr19, $vr12 + vseq.d $vr20, $vr20, $vr26 + vxor.v $vr20, $vr20, $vr12 + vpickev.w $vr19, $vr20, $vr19 + vpickev.h $vr18, $vr19, $vr18 + vpickve2gr.b $a1, $vr18, 2 andi $a1, $a1, 1 - vpickve2gr.b $a2, $vr16, 0 + vpickve2gr.b $a2, $vr21, 0 bstrins.d $a2, $a1, 63, 1 - vpickve2gr.b $a1, 
$vr17, 4 + vpickve2gr.b $a1, $vr18, 4 bstrins.d $a2, $a1, 2, 2 - vpickve2gr.b $a1, $vr17, 6 + vpickve2gr.b $a1, $vr18, 6 bstrins.d $a2, $a1, 3, 3 - vpickve2gr.b $a1, $vr17, 8 + vpickve2gr.b $a1, $vr18, 8 bstrins.d $a2, $a1, 4, 4 - vpickve2gr.b $a1, $vr17, 10 + vpickve2gr.b $a1, $vr18, 10 bstrins.d $a2, $a1, 5, 5 - vpickve2gr.b $a1, $vr17, 12 + vpickve2gr.b $a1, $vr18, 12 andi $a1, $a1, 1 slli.d $a1, $a1, 6 or $a1, $a2, $a1 - vpickve2gr.b $a2, $vr17, 14 + vpickve2gr.b $a2, $vr18, 14 andi $a2, $a2, 1 slli.d $a2, $a2, 7 or $a1, $a1, $a2 - vseq.d $vr16, $vr20, $vr23 - vxor.v $vr16, $vr16, $vr13 - vpickve2gr.b $a2, $vr16, 0 + vseq.d $vr17, $vr17, $vr23 + vxor.v $vr17, $vr17, $vr12 + vpickve2gr.b $a2, $vr17, 0 andi $a2, $a2, 1 slli.d $a2, $a2, 8 or $a1, $a1, $a2 - vseq.d $vr17, $vr21, $vr26 - vxor.v $vr17, $vr17, $vr13 - vpickev.w $vr16, $vr17, $vr16 - vseq.d $vr15, $vr15, $vr24 - vxor.v $vr15, $vr15, $vr13 - vseq.d $vr14, $vr14, $vr25 - vxor.v $vr14, $vr14, $vr13 - vpickev.w $vr14, $vr14, $vr15 - vpickev.h $vr14, $vr14, $vr16 - vpickve2gr.b $a2, $vr14, 2 + vseq.d $vr15, $vr15, $vr25 + vxor.v $vr15, $vr15, $vr12 + vpickev.w $vr15, $vr15, $vr17 + vseq.d $vr14, $vr14, $vr24 + vxor.v $vr14, $vr14, $vr12 + vseq.d $vr13, $vr13, $vr16 + vxor.v $vr13, $vr13, $vr12 + vpickev.w $vr13, $vr13, $vr14 + vpickev.h $vr13, $vr13, $vr15 + vpickve2gr.b $a2, $vr13, 2 andi $a2, $a2, 1 slli.d $a2, $a2, 9 or $a1, $a1, $a2 - vpickve2gr.b $a2, $vr14, 4 + vpickve2gr.b $a2, $vr13, 4 andi $a2, $a2, 1 slli.d $a2, $a2, 10 or $a1, $a1, $a2 - vpickve2gr.b $a2, $vr14, 6 + vpickve2gr.b $a2, $vr13, 6 andi $a2, $a2, 1 slli.d $a2, $a2, 11 or $a1, $a1, $a2 - vpickve2gr.b $a2, $vr14, 8 + vpickve2gr.b $a2, $vr13, 8 andi $a2, $a2, 1 slli.d $a2, $a2, 12 or $a1, $a1, $a2 - vpickve2gr.b $a2, $vr14, 10 + vpickve2gr.b $a2, $vr13, 10 andi $a2, $a2, 1 slli.d $a2, $a2, 13 or $a1, $a1, $a2 - vpickve2gr.b $a2, $vr14, 12 + vpickve2gr.b $a2, $vr13, 12 andi $a2, $a2, 1 slli.d $a2, $a2, 14 or $a1, $a1, $a2 - vpickve2gr.b $a2, $vr14, 14 + vpickve2gr.b $a2, $vr13, 14 slli.d $a2, $a2, 15 or $a1, $a1, $a2 bstrpick.d $a1, $a1, 15, 0 diff --git a/results/SingleSource/Regression/C/gcc-c-torture/execute/CMakeFiles/GCC-C-execute-comp-goto-1.dir/comp-goto-1.s b/results/SingleSource/Regression/C/gcc-c-torture/execute/CMakeFiles/GCC-C-execute-comp-goto-1.dir/comp-goto-1.s index ada5e996..5fb964ca 100644 --- a/results/SingleSource/Regression/C/gcc-c-torture/execute/CMakeFiles/GCC-C-execute-comp-goto-1.dir/comp-goto-1.s +++ b/results/SingleSource/Regression/C/gcc-c-torture/execute/CMakeFiles/GCC-C-execute-comp-goto-1.dir/comp-goto-1.s @@ -42,45 +42,44 @@ simulator_kernel: # @simulator_kernel lu12i.w $t1, 63 ori $t1, $t1, 4095 vreplgr2vr.w $vr1, $t1 - vrepli.b $vr2, 0 lu12i.w $t1, -64 - vreplgr2vr.d $vr3, $t1 + vreplgr2vr.d $vr2, $t1 move $t1, $a7 .p2align 4, , 16 .LBB1_5: # %vector.body # =>This Inner Loop Header: Depth=1 - vld $vr4, $t0, -16 - vld $vr5, $t0, 0 + vld $vr3, $t0, -16 + vld $vr4, $t0, 0 + vslli.d $vr5, $vr3, 46 vslli.d $vr6, $vr4, 46 - vslli.d $vr7, $vr5, 46 + vsrai.d $vr5, $vr5, 43 vsrai.d $vr6, $vr6, 43 - vsrai.d $vr7, $vr7, 43 - vpickve2gr.d $t2, $vr6, 0 - vpickve2gr.d $t3, $vr6, 1 - vpickve2gr.d $t4, $vr7, 0 - vpickve2gr.d $t5, $vr7, 1 + vpickve2gr.d $t2, $vr5, 0 + vpickve2gr.d $t3, $vr5, 1 + vpickve2gr.d $t4, $vr6, 0 + vpickve2gr.d $t5, $vr6, 1 ldx.d $t2, $a1, $t2 ldx.d $t3, $a1, $t3 ldx.d $t4, $a1, $t4 ldx.d $t5, $a1, $t5 - vinsgr2vr.d $vr6, $t2, 0 - vinsgr2vr.d $vr6, $t3, 1 - vinsgr2vr.d $vr7, $t4, 0 - vinsgr2vr.d $vr7, $t5, 1 + vinsgr2vr.d 
$vr5, $t2, 0 + vinsgr2vr.d $vr5, $t3, 1 + vinsgr2vr.d $vr6, $t4, 0 + vinsgr2vr.d $vr6, $t5, 1 + vshuf4i.w $vr5, $vr5, 8 vshuf4i.w $vr6, $vr6, 8 - vshuf4i.w $vr7, $vr7, 8 + vsub.w $vr5, $vr5, $vr0 vsub.w $vr6, $vr6, $vr0 - vsub.w $vr7, $vr7, $vr0 + vand.v $vr5, $vr5, $vr1 vand.v $vr6, $vr6, $vr1 - vand.v $vr7, $vr7, $vr1 - vilvl.w $vr6, $vr2, $vr6 - vilvl.w $vr7, $vr2, $vr7 - vand.v $vr4, $vr4, $vr3 - vand.v $vr5, $vr5, $vr3 + vsllwil.du.wu $vr5, $vr5, 0 + vsllwil.du.wu $vr6, $vr6, 0 + vand.v $vr3, $vr3, $vr2 + vand.v $vr4, $vr4, $vr2 + vor.v $vr3, $vr3, $vr5 vor.v $vr4, $vr4, $vr6 - vor.v $vr5, $vr5, $vr7 - vst $vr4, $t0, -16 - vst $vr5, $t0, 0 + vst $vr3, $t0, -16 + vst $vr4, $t0, 0 addi.d $t1, $t1, -4 addi.d $t0, $t0, 32 bnez $t1, .LBB1_5 diff --git a/results/SingleSource/Regression/C/gcc-c-torture/execute/CMakeFiles/GCC-C-execute-loop-11.dir/loop-11.s b/results/SingleSource/Regression/C/gcc-c-torture/execute/CMakeFiles/GCC-C-execute-loop-11.dir/loop-11.s index 0abd0854..19465252 100644 --- a/results/SingleSource/Regression/C/gcc-c-torture/execute/CMakeFiles/GCC-C-execute-loop-11.dir/loop-11.s +++ b/results/SingleSource/Regression/C/gcc-c-torture/execute/CMakeFiles/GCC-C-execute-loop-11.dir/loop-11.s @@ -39,46 +39,46 @@ main: # @main vadd.w $vr0, $vr0, $vr2 bne $a1, $a2, .LBB0_1 # %bb.2: # %scalar.ph - move $a1, $zero - ori $a2, $zero, 6 - st.w $a2, $a0, 24 - ori $a2, $zero, 4 - lu32i.d $a2, 5 - st.d $a2, $a0, 16 - ori $a2, $zero, 2 - lu32i.d $a2, 3 - st.d $a2, $a0, 8 - ori $a2, $zero, 0 - lu32i.d $a2, 1 - st.d $a2, $a0, 0 - pcalau12i $a2, %pc_hi20(.LCPI0_1) - vld $vr0, $a2, %pc_lo12(.LCPI0_1) - pcalau12i $a2, %pc_hi20(.LCPI0_2) - vld $vr1, $a2, %pc_lo12(.LCPI0_2) - vrepli.b $vr2, 0 - vrepli.b $vr3, -1 - ori $a2, $zero, 768 + move $a2, $zero + ori $a1, $zero, 6 + st.w $a1, $a0, 24 + ori $a1, $zero, 4 + lu32i.d $a1, 5 + st.d $a1, $a0, 16 + ori $a1, $zero, 2 + lu32i.d $a1, 3 + st.d $a1, $a0, 8 + pcalau12i $a1, %pc_hi20(.LCPI0_1) + vld $vr0, $a1, %pc_lo12(.LCPI0_1) + pcalau12i $a1, %pc_hi20(.LCPI0_2) + vld $vr1, $a1, %pc_lo12(.LCPI0_2) + ori $a1, $zero, 0 + lu32i.d $a1, 1 + st.d $a1, $a0, 0 + vrepli.b $vr2, -1 + ori $a1, $zero, 768 .p2align 4, , 16 .LBB0_3: # %vector.body8 # =>This Inner Loop Header: Depth=1 - vldx $vr4, $a0, $a1 - vilvh.w $vr5, $vr2, $vr4 - vilvl.w $vr4, $vr2, $vr4 - vseq.d $vr4, $vr1, $vr4 - vxor.v $vr4, $vr4, $vr3 - vseq.d $vr5, $vr0, $vr5 - vxor.v $vr5, $vr5, $vr3 - vpickev.w $vr4, $vr5, $vr4 - vmskltz.w $vr4, $vr4 - vpickve2gr.hu $a4, $vr4, 0 + vldx $vr3, $a0, $a2 + vshuf4i.w $vr4, $vr3, 14 + vsllwil.du.wu $vr4, $vr4, 0 + vsllwil.du.wu $vr3, $vr3, 0 + vseq.d $vr3, $vr1, $vr3 + vxor.v $vr3, $vr3, $vr2 + vseq.d $vr4, $vr0, $vr4 + vxor.v $vr4, $vr4, $vr2 + vpickev.w $vr3, $vr4, $vr3 + vmskltz.w $vr3, $vr3 + vpickve2gr.hu $a4, $vr3, 0 bnez $a4, .LBB0_5 # %bb.4: # %vector.body8 # in Loop: Header=BB0_3 Depth=1 - move $a3, $a1 + move $a3, $a2 vaddi.du $vr1, $vr1, 4 vaddi.du $vr0, $vr0, 4 - addi.d $a1, $a1, 16 - bne $a3, $a2, .LBB0_3 + addi.d $a2, $a2, 16 + bne $a3, $a1, .LBB0_3 .LBB0_5: # %middle.split andi $a1, $a4, 15 bnez $a1, .LBB0_10 diff --git a/results/SingleSource/Regression/C/gcc-c-torture/execute/CMakeFiles/GCC-C-execute-memset-4.dir/memset-4.s b/results/SingleSource/Regression/C/gcc-c-torture/execute/CMakeFiles/GCC-C-execute-memset-4.dir/memset-4.s index cf913332..eb13d558 100644 --- a/results/SingleSource/Regression/C/gcc-c-torture/execute/CMakeFiles/GCC-C-execute-memset-4.dir/memset-4.s +++ 
b/results/SingleSource/Regression/C/gcc-c-torture/execute/CMakeFiles/GCC-C-execute-memset-4.dir/memset-4.s @@ -22,18 +22,14 @@ main: # @main pcaddu18i $ra, %call36(f) jirl $ra, $ra, 0 ld.d $a0, $sp, 8 + ld.d $a1, $sp, 16 vinsgr2vr.d $vr0, $a0, 0 vseqi.b $vr0, $vr0, 0 - ld.d $a0, $sp, 16 - vilvl.b $vr0, $vr0, $vr0 - vslli.h $vr0, $vr0, 8 - vsrai.h $vr0, $vr0, 8 - vinsgr2vr.d $vr1, $a0, 0 + vsllwil.h.b $vr0, $vr0, 0 + vinsgr2vr.d $vr1, $a1, 0 vseqi.b $vr1, $vr1, 0 - vilvl.b $vr1, $vr1, $vr1 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 24 - vsrai.w $vr1, $vr1, 24 + vsllwil.h.b $vr1, $vr1, 0 + vsllwil.w.h $vr1, $vr1, 0 vpickve2gr.h $a0, $vr0, 0 vinsgr2vr.w $vr2, $a0, 0 vpickve2gr.h $a0, $vr0, 1 diff --git a/results/SingleSource/Regression/C/gcc-c-torture/execute/CMakeFiles/GCC-C-execute-mode-dependent-address.dir/mode-dependent-address.s b/results/SingleSource/Regression/C/gcc-c-torture/execute/CMakeFiles/GCC-C-execute-mode-dependent-address.dir/mode-dependent-address.s index d754714a..1f4047cc 100644 --- a/results/SingleSource/Regression/C/gcc-c-torture/execute/CMakeFiles/GCC-C-execute-mode-dependent-address.dir/mode-dependent-address.s +++ b/results/SingleSource/Regression/C/gcc-c-torture/execute/CMakeFiles/GCC-C-execute-mode-dependent-address.dir/mode-dependent-address.s @@ -52,13 +52,11 @@ f883b: # @f883b .LBB0_1: # %vector.body # =>This Inner Loop Header: Depth=1 ld.w $a6, $a1, 0 + ld.d $a7, $a2, 0 vinsgr2vr.w $vr3, $a6, 0 vmini.h $vr3, $vr3, 1 - ld.d $a6, $a2, 0 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vinsgr2vr.d $vr4, $a6, 0 + vsllwil.w.h $vr3, $vr3, 0 + vinsgr2vr.d $vr4, $a7, 0 vld $vr5, $a3, 0 vsra.w $vr3, $vr3, $vr4 vadd.w $vr3, $vr3, $vr2 @@ -183,13 +181,11 @@ main: # @main .LBB1_3: # %vector.body27 # =>This Inner Loop Header: Depth=1 ld.w $a6, $a2, 0 + ld.d $a7, $a1, 0 vinsgr2vr.w $vr3, $a6, 0 vmini.h $vr3, $vr3, 1 - ld.d $a6, $a1, 0 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 - vinsgr2vr.d $vr4, $a6, 0 + vsllwil.w.h $vr3, $vr3, 0 + vinsgr2vr.d $vr4, $a7, 0 vld $vr5, $a0, 0 vsra.w $vr3, $vr3, $vr4 vadd.w $vr3, $vr3, $vr2 @@ -215,13 +211,11 @@ main: # @main .LBB1_5: # %vector.body35 # =>This Inner Loop Header: Depth=1 ldx.w $a4, $a3, $a5 - vinsgr2vr.w $vr1, $a4, 0 - vilvl.b $vr1, $vr1, $vr1 - vld $vr2, $a0, 0 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 24 - vsrai.w $vr1, $vr1, 24 - vseq.w $vr1, $vr2, $vr1 + vld $vr1, $a0, 0 + vinsgr2vr.w $vr2, $a4, 0 + vsllwil.h.b $vr2, $vr2, 0 + vsllwil.w.h $vr2, $vr2, 0 + vseq.w $vr1, $vr1, $vr2 vxor.v $vr1, $vr1, $vr0 vmskltz.w $vr1, $vr1 vpickve2gr.hu $a4, $vr1, 0 diff --git a/results/SingleSource/Regression/C/gcc-c-torture/execute/CMakeFiles/GCC-C-execute-pr23135.dir/pr23135.s b/results/SingleSource/Regression/C/gcc-c-torture/execute/CMakeFiles/GCC-C-execute-pr23135.dir/pr23135.s index 636b97f3..a0cda057 100644 --- a/results/SingleSource/Regression/C/gcc-c-torture/execute/CMakeFiles/GCC-C-execute-pr23135.dir/pr23135.s +++ b/results/SingleSource/Regression/C/gcc-c-torture/execute/CMakeFiles/GCC-C-execute-pr23135.dir/pr23135.s @@ -36,7 +36,7 @@ main: # @main lu32i.d $a1, 113 vreplgr2vr.d $vr1, $a1 vseq.w $vr1, $vr0, $vr1 - vshuf4i.w $vr2, $vr1, 16 + vsllwil.d.w $vr2, $vr1, 0 vpickve2gr.d $a1, $vr1, 0 vpickve2gr.d $a2, $vr2, 1 and $a1, $a1, $a2 @@ -49,7 +49,7 @@ main: # @main lu32i.d $a1, 1300 vreplgr2vr.d $vr2, $a1 vseq.w $vr2, $vr1, $vr2 - vshuf4i.w $vr4, $vr2, 16 + vsllwil.d.w $vr4, $vr2, 0 vpickve2gr.d $a1, $vr2, 0 vpickve2gr.d $a2, $vr4, 1 and $a1, $a1, $a2 @@ 
-69,7 +69,7 @@ main: # @main lu32i.d $a1, 7 vreplgr2vr.d $vr4, $a1 vseq.w $vr4, $vr2, $vr4 - vshuf4i.w $vr5, $vr4, 16 + vsllwil.d.w $vr5, $vr4, 0 vpickve2gr.d $a1, $vr4, 0 vpickve2gr.d $a2, $vr5, 1 and $a1, $a1, $a2 @@ -82,7 +82,7 @@ main: # @main lu32i.d $a1, 4 vreplgr2vr.d $vr5, $a1 vseq.w $vr5, $vr4, $vr5 - vshuf4i.w $vr7, $vr5, 16 + vsllwil.d.w $vr7, $vr5, 0 vpickve2gr.d $a1, $vr5, 0 vpickve2gr.d $a2, $vr7, 1 and $a1, $a1, $a2 @@ -95,7 +95,7 @@ main: # @main lu32i.d $a1, 109 vreplgr2vr.d $vr7, $a1 vseq.w $vr7, $vr5, $vr7 - vshuf4i.w $vr8, $vr7, 16 + vsllwil.d.w $vr8, $vr7, 0 vpickve2gr.d $a1, $vr7, 0 vpickve2gr.d $a2, $vr8, 1 and $a1, $a1, $a2 @@ -108,7 +108,7 @@ main: # @main lu32i.d $a1, 105 vreplgr2vr.d $vr7, $a1 vseq.w $vr7, $vr6, $vr7 - vshuf4i.w $vr8, $vr7, 16 + vsllwil.d.w $vr8, $vr7, 0 vpickve2gr.d $a1, $vr7, 0 vpickve2gr.d $a2, $vr8, 1 and $a1, $a1, $a2 @@ -121,7 +121,7 @@ main: # @main lu32i.d $a1, -100 vreplgr2vr.d $vr8, $a1 vseq.w $vr8, $vr7, $vr8 - vshuf4i.w $vr9, $vr8, 16 + vsllwil.d.w $vr9, $vr8, 0 vpickve2gr.d $a1, $vr8, 0 vpickve2gr.d $a2, $vr9, 1 and $a1, $a1, $a2 @@ -135,7 +135,7 @@ main: # @main lu32i.d $a1, 100 vreplgr2vr.d $vr9, $a1 vseq.w $vr9, $vr3, $vr9 - vshuf4i.w $vr10, $vr9, 16 + vsllwil.d.w $vr10, $vr9, 0 vpickve2gr.d $a1, $vr9, 0 vpickve2gr.d $a2, $vr10, 1 and $a1, $a1, $a2 @@ -156,7 +156,7 @@ main: # @main lu32i.d $a2, 1430 vreplgr2vr.d $vr9, $a2 vseq.w $vr9, $vr3, $vr9 - vshuf4i.w $vr10, $vr9, 16 + vsllwil.d.w $vr10, $vr9, 0 vpickve2gr.d $a2, $vr9, 0 vpickve2gr.d $a3, $vr10, 1 and $a2, $a2, $a3 @@ -177,7 +177,7 @@ main: # @main vreplgr2vr.d $vr9, $a2 vseq.w $vr9, $vr3, $vr9 vpickve2gr.d $a2, $vr9, 0 - vshuf4i.w $vr9, $vr9, 16 + vsllwil.d.w $vr9, $vr9, 0 vpickve2gr.d $a3, $vr9, 1 and $a2, $a2, $a3 andi $a2, $a2, 1 diff --git a/results/SingleSource/Regression/C/gcc-c-torture/execute/CMakeFiles/GCC-C-execute-pr43784.dir/pr43784.s b/results/SingleSource/Regression/C/gcc-c-torture/execute/CMakeFiles/GCC-C-execute-pr43784.dir/pr43784.s index 5ab1e790..4632fe41 100644 --- a/results/SingleSource/Regression/C/gcc-c-torture/execute/CMakeFiles/GCC-C-execute-pr43784.dir/pr43784.s +++ b/results/SingleSource/Regression/C/gcc-c-torture/execute/CMakeFiles/GCC-C-execute-pr43784.dir/pr43784.s @@ -381,99 +381,115 @@ main: # @main pcalau12i $a0, %pc_hi20(.LCPI0_23) vld $vr7, $a0, %pc_lo12(.LCPI0_23) ori $a2, $zero, 4 - vrepli.b $vr8, 0 - vrepli.b $vr9, -1 + vrepli.b $vr8, -1 ori $a0, $zero, 244 .p2align 4, , 16 .LBB0_1: # %vector.body15 # =>This Inner Loop Header: Depth=1 move $a1, $a2 - vldx $vr10, $fp, $a2 - vilvh.b $vr11, $vr8, $vr10 - vilvh.h $vr12, $vr8, $vr11 - vilvh.w $vr13, $vr8, $vr12 - vilvl.w $vr12, $vr8, $vr12 - vilvl.h $vr11, $vr8, $vr11 - vilvh.w $vr14, $vr8, $vr11 - vilvl.w $vr11, $vr8, $vr11 - vilvl.b $vr10, $vr8, $vr10 - vilvh.h $vr15, $vr8, $vr10 - vilvh.w $vr16, $vr8, $vr15 - vilvl.w $vr15, $vr8, $vr15 - vilvl.h $vr10, $vr8, $vr10 - vilvh.w $vr17, $vr8, $vr10 - vilvl.w $vr10, $vr8, $vr10 - vseq.d $vr10, $vr7, $vr10 - vxor.v $vr10, $vr10, $vr9 - vseq.d $vr17, $vr6, $vr17 - vxor.v $vr17, $vr17, $vr9 - vpickev.w $vr17, $vr17, $vr10 + vldx $vr11, $fp, $a2 + vbsrl.v $vr9, $vr11, 14 + vsllwil.hu.bu $vr9, $vr9, 0 + vsllwil.wu.hu $vr9, $vr9, 0 + vsllwil.du.wu $vr9, $vr9, 0 + vbsrl.v $vr10, $vr11, 12 + vsllwil.hu.bu $vr10, $vr10, 0 + vsllwil.wu.hu $vr10, $vr10, 0 + vsllwil.du.wu $vr10, $vr10, 0 + vbsrl.v $vr12, $vr11, 10 + vsllwil.hu.bu $vr12, $vr12, 0 + vsllwil.wu.hu $vr12, $vr12, 0 + vsllwil.du.wu $vr12, $vr12, 0 + vbsrl.v $vr13, $vr11, 8 + vsllwil.hu.bu $vr13, 
$vr13, 0 + vsllwil.wu.hu $vr13, $vr13, 0 + vsllwil.du.wu $vr13, $vr13, 0 + vsrli.d $vr14, $vr11, 48 + vsllwil.hu.bu $vr14, $vr14, 0 + vsllwil.wu.hu $vr14, $vr14, 0 + vsllwil.du.wu $vr14, $vr14, 0 + vsrli.d $vr15, $vr11, 32 + vsllwil.hu.bu $vr15, $vr15, 0 + vsllwil.wu.hu $vr15, $vr15, 0 + vsllwil.du.wu $vr15, $vr15, 0 + vshuf4i.b $vr16, $vr11, 14 + vsllwil.hu.bu $vr16, $vr16, 0 + vsllwil.wu.hu $vr16, $vr16, 0 + vsllwil.du.wu $vr16, $vr16, 0 + vsllwil.hu.bu $vr11, $vr11, 0 + vsllwil.wu.hu $vr11, $vr11, 0 + vsllwil.du.wu $vr11, $vr11, 0 + vseq.d $vr11, $vr7, $vr11 + vxor.v $vr11, $vr11, $vr8 + vseq.d $vr16, $vr6, $vr16 + vxor.v $vr16, $vr16, $vr8 + vpickev.w $vr16, $vr16, $vr11 vseq.d $vr15, $vr5, $vr15 - vxor.v $vr15, $vr15, $vr9 - vseq.d $vr16, $vr4, $vr16 - vxor.v $vr16, $vr16, $vr9 - vpickev.w $vr15, $vr16, $vr15 - vpickev.h $vr15, $vr15, $vr17 - vpickve2gr.b $a2, $vr15, 2 + vxor.v $vr15, $vr15, $vr8 + vseq.d $vr14, $vr4, $vr14 + vxor.v $vr14, $vr14, $vr8 + vpickev.w $vr14, $vr14, $vr15 + vpickev.h $vr14, $vr14, $vr16 + vpickve2gr.b $a2, $vr14, 2 andi $a2, $a2, 1 - vpickve2gr.b $a3, $vr10, 0 + vpickve2gr.b $a3, $vr11, 0 bstrins.d $a3, $a2, 63, 1 - vpickve2gr.b $a2, $vr15, 4 + vpickve2gr.b $a2, $vr14, 4 bstrins.d $a3, $a2, 2, 2 - vpickve2gr.b $a2, $vr15, 6 + vpickve2gr.b $a2, $vr14, 6 bstrins.d $a3, $a2, 3, 3 - vpickve2gr.b $a2, $vr15, 8 + vpickve2gr.b $a2, $vr14, 8 bstrins.d $a3, $a2, 4, 4 - vpickve2gr.b $a2, $vr15, 10 + vpickve2gr.b $a2, $vr14, 10 bstrins.d $a3, $a2, 5, 5 - vpickve2gr.b $a2, $vr15, 12 + vpickve2gr.b $a2, $vr14, 12 andi $a2, $a2, 1 slli.d $a2, $a2, 6 or $a2, $a3, $a2 - vpickve2gr.b $a3, $vr15, 14 + vpickve2gr.b $a3, $vr14, 14 andi $a3, $a3, 1 slli.d $a3, $a3, 7 or $a2, $a2, $a3 - vseq.d $vr10, $vr3, $vr11 - vxor.v $vr10, $vr10, $vr9 - vpickve2gr.b $a3, $vr10, 0 + vseq.d $vr11, $vr3, $vr13 + vxor.v $vr11, $vr11, $vr8 + vpickve2gr.b $a3, $vr11, 0 andi $a3, $a3, 1 slli.d $a3, $a3, 8 or $a2, $a2, $a3 - vseq.d $vr11, $vr2, $vr14 - vxor.v $vr11, $vr11, $vr9 - vpickev.w $vr10, $vr11, $vr10 - vseq.d $vr11, $vr1, $vr12 - vxor.v $vr11, $vr11, $vr9 - vseq.d $vr12, $vr0, $vr13 - vxor.v $vr12, $vr12, $vr9 + vseq.d $vr12, $vr2, $vr12 + vxor.v $vr12, $vr12, $vr8 vpickev.w $vr11, $vr12, $vr11 - vpickev.h $vr10, $vr11, $vr10 - vpickve2gr.b $a3, $vr10, 2 + vseq.d $vr10, $vr1, $vr10 + vxor.v $vr10, $vr10, $vr8 + vseq.d $vr9, $vr0, $vr9 + vxor.v $vr9, $vr9, $vr8 + vpickev.w $vr9, $vr9, $vr10 + vpickev.h $vr9, $vr9, $vr11 + vpickve2gr.b $a3, $vr9, 2 andi $a3, $a3, 1 slli.d $a3, $a3, 9 or $a2, $a2, $a3 - vpickve2gr.b $a3, $vr10, 4 + vpickve2gr.b $a3, $vr9, 4 andi $a3, $a3, 1 slli.d $a3, $a3, 10 or $a2, $a2, $a3 - vpickve2gr.b $a3, $vr10, 6 + vpickve2gr.b $a3, $vr9, 6 andi $a3, $a3, 1 slli.d $a3, $a3, 11 or $a2, $a2, $a3 - vpickve2gr.b $a3, $vr10, 8 + vpickve2gr.b $a3, $vr9, 8 andi $a3, $a3, 1 slli.d $a3, $a3, 12 or $a2, $a2, $a3 - vpickve2gr.b $a3, $vr10, 10 + vpickve2gr.b $a3, $vr9, 10 andi $a3, $a3, 1 slli.d $a3, $a3, 13 or $a2, $a2, $a3 - vpickve2gr.b $a3, $vr10, 12 + vpickve2gr.b $a3, $vr9, 12 andi $a3, $a3, 1 slli.d $a3, $a3, 14 or $a2, $a2, $a3 - vpickve2gr.b $a3, $vr10, 14 + vpickve2gr.b $a3, $vr9, 14 slli.d $a3, $a3, 15 or $a2, $a2, $a3 bstrpick.d $a3, $a2, 15, 0 diff --git a/results/SingleSource/Regression/C/gcc-c-torture/execute/CMakeFiles/GCC-C-execute-pr51581-1.dir/pr51581-1.s b/results/SingleSource/Regression/C/gcc-c-torture/execute/CMakeFiles/GCC-C-execute-pr51581-1.dir/pr51581-1.s index 6deb8c0b..efe0b338 100644 --- 
a/results/SingleSource/Regression/C/gcc-c-torture/execute/CMakeFiles/GCC-C-execute-pr51581-1.dir/pr51581-1.s +++ b/results/SingleSource/Regression/C/gcc-c-torture/execute/CMakeFiles/GCC-C-execute-pr51581-1.dir/pr51581-1.s @@ -273,37 +273,38 @@ f8: # @f8 addi.d $a1, $a1, %pc_lo12(b) lu12i.w $a2, 4 ori $a3, $a2, 16 - vrepli.b $vr0, 0 lu12i.w $a4, -349526 ori $a4, $a4, 2731 lu32i.d $a4, 0 - vreplgr2vr.d $vr1, $a4 + vreplgr2vr.d $vr0, $a4 pcalau12i $a4, %pc_hi20(d) addi.d $a4, $a4, %pc_lo12(d) .p2align 4, , 16 .LBB7_1: # %vector.body # =>This Inner Loop Header: Depth=1 add.d $a5, $a1, $a0 - vldx $vr2, $a5, $a2 - vldx $vr3, $a5, $a3 - vilvl.w $vr4, $vr0, $vr2 - vilvh.w $vr2, $vr0, $vr2 - vilvl.w $vr5, $vr0, $vr3 - vilvh.w $vr3, $vr0, $vr3 - vmul.d $vr2, $vr2, $vr1 - vmul.d $vr4, $vr4, $vr1 - vmul.d $vr3, $vr3, $vr1 - vmul.d $vr5, $vr5, $vr1 + vldx $vr1, $a5, $a2 + vldx $vr2, $a5, $a3 + vsllwil.du.wu $vr3, $vr1, 0 + vshuf4i.w $vr1, $vr1, 14 + vsllwil.du.wu $vr1, $vr1, 0 + vsllwil.du.wu $vr4, $vr2, 0 + vshuf4i.w $vr2, $vr2, 14 + vsllwil.du.wu $vr2, $vr2, 0 + vmul.d $vr1, $vr1, $vr0 + vmul.d $vr3, $vr3, $vr0 + vmul.d $vr2, $vr2, $vr0 + vmul.d $vr4, $vr4, $vr0 + vsrli.d $vr3, $vr3, 33 + vsrli.d $vr1, $vr1, 33 vsrli.d $vr4, $vr4, 33 vsrli.d $vr2, $vr2, 33 - vsrli.d $vr5, $vr5, 33 - vsrli.d $vr3, $vr3, 33 + vpickev.w $vr1, $vr1, $vr3 vpickev.w $vr2, $vr2, $vr4 - vpickev.w $vr3, $vr3, $vr5 add.d $a5, $a4, $a0 - vstx $vr2, $a5, $a2 + vstx $vr1, $a5, $a2 addi.d $a0, $a0, 32 - vstx $vr3, $a5, $a3 + vstx $vr2, $a5, $a3 bnez $a0, .LBB7_1 # %bb.2: # %middle.block ret @@ -359,36 +360,37 @@ f10: # @f10 addi.d $a1, $a1, %pc_lo12(b) lu12i.w $a2, 4 ori $a3, $a2, 16 - vrepli.b $vr0, 0 lu12i.w $a4, 233016 ori $a4, $a4, 3641 - vreplgr2vr.d $vr1, $a4 + vreplgr2vr.d $vr0, $a4 pcalau12i $a4, %pc_hi20(d) addi.d $a4, $a4, %pc_lo12(d) .p2align 4, , 16 .LBB9_1: # %vector.body # =>This Inner Loop Header: Depth=1 add.d $a5, $a1, $a0 - vldx $vr2, $a5, $a2 - vldx $vr3, $a5, $a3 - vilvl.w $vr4, $vr0, $vr2 - vilvh.w $vr2, $vr0, $vr2 - vilvl.w $vr5, $vr0, $vr3 - vilvh.w $vr3, $vr0, $vr3 - vmul.d $vr2, $vr2, $vr1 - vmul.d $vr4, $vr4, $vr1 - vmul.d $vr3, $vr3, $vr1 - vmul.d $vr5, $vr5, $vr1 + vldx $vr1, $a5, $a2 + vldx $vr2, $a5, $a3 + vsllwil.du.wu $vr3, $vr1, 0 + vshuf4i.w $vr1, $vr1, 14 + vsllwil.du.wu $vr1, $vr1, 0 + vsllwil.du.wu $vr4, $vr2, 0 + vshuf4i.w $vr2, $vr2, 14 + vsllwil.du.wu $vr2, $vr2, 0 + vmul.d $vr1, $vr1, $vr0 + vmul.d $vr3, $vr3, $vr0 + vmul.d $vr2, $vr2, $vr0 + vmul.d $vr4, $vr4, $vr0 + vsrli.d $vr3, $vr3, 34 + vsrli.d $vr1, $vr1, 34 vsrli.d $vr4, $vr4, 34 vsrli.d $vr2, $vr2, 34 - vsrli.d $vr5, $vr5, 34 - vsrli.d $vr3, $vr3, 34 + vpickev.w $vr1, $vr1, $vr3 vpickev.w $vr2, $vr2, $vr4 - vpickev.w $vr3, $vr3, $vr5 add.d $a5, $a4, $a0 - vstx $vr2, $a5, $a2 + vstx $vr1, $a5, $a2 addi.d $a0, $a0, 32 - vstx $vr3, $a5, $a3 + vstx $vr2, $a5, $a3 bnez $a0, .LBB9_1 # %bb.2: # %middle.block ret diff --git a/results/SingleSource/Regression/C/gcc-c-torture/execute/CMakeFiles/GCC-C-execute-pr51581-2.dir/pr51581-2.s b/results/SingleSource/Regression/C/gcc-c-torture/execute/CMakeFiles/GCC-C-execute-pr51581-2.dir/pr51581-2.s index e875fa3b..40840558 100644 --- a/results/SingleSource/Regression/C/gcc-c-torture/execute/CMakeFiles/GCC-C-execute-pr51581-2.dir/pr51581-2.s +++ b/results/SingleSource/Regression/C/gcc-c-torture/execute/CMakeFiles/GCC-C-execute-pr51581-2.dir/pr51581-2.s @@ -287,40 +287,41 @@ f8: # @f8 addi.d $a1, $a1, %pc_lo12(b) lu12i.w $a2, 4 ori $a3, $a2, 16 - vrepli.b $vr0, 0 lu12i.w $a4, -349526 ori $a4, $a4, 2731 
lu32i.d $a4, 0 - vreplgr2vr.d $vr1, $a4 - vrepli.w $vr2, -3 + vreplgr2vr.d $vr0, $a4 + vrepli.w $vr1, -3 pcalau12i $a4, %pc_hi20(d) addi.d $a4, $a4, %pc_lo12(d) .p2align 4, , 16 .LBB7_1: # %vector.body # =>This Inner Loop Header: Depth=1 add.d $a5, $a1, $a0 - vldx $vr3, $a5, $a2 - vldx $vr4, $a5, $a3 - vilvl.w $vr5, $vr0, $vr3 - vilvh.w $vr6, $vr0, $vr3 - vilvl.w $vr7, $vr0, $vr4 - vilvh.w $vr8, $vr0, $vr4 - vmul.d $vr6, $vr6, $vr1 - vmul.d $vr5, $vr5, $vr1 - vmul.d $vr8, $vr8, $vr1 - vmul.d $vr7, $vr7, $vr1 + vldx $vr2, $a5, $a2 + vldx $vr3, $a5, $a3 + vsllwil.du.wu $vr4, $vr2, 0 + vshuf4i.w $vr5, $vr2, 14 + vsllwil.du.wu $vr5, $vr5, 0 + vsllwil.du.wu $vr6, $vr3, 0 + vshuf4i.w $vr7, $vr3, 14 + vsllwil.du.wu $vr7, $vr7, 0 + vmul.d $vr5, $vr5, $vr0 + vmul.d $vr4, $vr4, $vr0 + vmul.d $vr7, $vr7, $vr0 + vmul.d $vr6, $vr6, $vr0 + vsrli.d $vr4, $vr4, 33 vsrli.d $vr5, $vr5, 33 vsrli.d $vr6, $vr6, 33 vsrli.d $vr7, $vr7, 33 - vsrli.d $vr8, $vr8, 33 - vpickev.w $vr5, $vr6, $vr5 - vpickev.w $vr6, $vr8, $vr7 - vmadd.w $vr3, $vr5, $vr2 - vmadd.w $vr4, $vr6, $vr2 + vpickev.w $vr4, $vr5, $vr4 + vpickev.w $vr5, $vr7, $vr6 + vmadd.w $vr2, $vr4, $vr1 + vmadd.w $vr3, $vr5, $vr1 add.d $a5, $a4, $a0 - vstx $vr3, $a5, $a2 + vstx $vr2, $a5, $a2 addi.d $a0, $a0, 32 - vstx $vr4, $a5, $a3 + vstx $vr3, $a5, $a3 bnez $a0, .LBB7_1 # %bb.2: # %middle.block ret @@ -347,12 +348,9 @@ f9: # @f9 # =>This Inner Loop Header: Depth=1 add.d $a4, $a1, $a0 vldx $vr2, $a4, $a2 - vshuf4i.w $vr3, $vr2, 16 - vslli.d $vr3, $vr3, 32 - vsrai.d $vr3, $vr3, 32 - vshuf4i.w $vr4, $vr2, 50 - vslli.d $vr4, $vr4, 32 - vsrai.d $vr4, $vr4, 32 + vsllwil.d.w $vr3, $vr2, 0 + vshuf4i.w $vr4, $vr2, 14 + vsllwil.d.w $vr4, $vr4, 0 vmul.d $vr4, $vr4, $vr0 vmul.d $vr3, $vr3, $vr0 vsrai.d $vr3, $vr3, 34 @@ -380,39 +378,40 @@ f10: # @f10 addi.d $a1, $a1, %pc_lo12(b) lu12i.w $a2, 4 ori $a3, $a2, 16 - vrepli.b $vr0, 0 lu12i.w $a4, 233016 ori $a4, $a4, 3641 - vreplgr2vr.d $vr1, $a4 - vrepli.w $vr2, -18 + vreplgr2vr.d $vr0, $a4 + vrepli.w $vr1, -18 pcalau12i $a4, %pc_hi20(d) addi.d $a4, $a4, %pc_lo12(d) .p2align 4, , 16 .LBB9_1: # %vector.body # =>This Inner Loop Header: Depth=1 add.d $a5, $a1, $a0 - vldx $vr3, $a5, $a2 - vldx $vr4, $a5, $a3 - vilvl.w $vr5, $vr0, $vr3 - vilvh.w $vr6, $vr0, $vr3 - vilvl.w $vr7, $vr0, $vr4 - vilvh.w $vr8, $vr0, $vr4 - vmul.d $vr6, $vr6, $vr1 - vmul.d $vr5, $vr5, $vr1 - vmul.d $vr8, $vr8, $vr1 - vmul.d $vr7, $vr7, $vr1 + vldx $vr2, $a5, $a2 + vldx $vr3, $a5, $a3 + vsllwil.du.wu $vr4, $vr2, 0 + vshuf4i.w $vr5, $vr2, 14 + vsllwil.du.wu $vr5, $vr5, 0 + vsllwil.du.wu $vr6, $vr3, 0 + vshuf4i.w $vr7, $vr3, 14 + vsllwil.du.wu $vr7, $vr7, 0 + vmul.d $vr5, $vr5, $vr0 + vmul.d $vr4, $vr4, $vr0 + vmul.d $vr7, $vr7, $vr0 + vmul.d $vr6, $vr6, $vr0 + vsrli.d $vr4, $vr4, 34 vsrli.d $vr5, $vr5, 34 vsrli.d $vr6, $vr6, 34 vsrli.d $vr7, $vr7, 34 - vsrli.d $vr8, $vr8, 34 - vpickev.w $vr5, $vr6, $vr5 - vpickev.w $vr6, $vr8, $vr7 - vmadd.w $vr3, $vr5, $vr2 - vmadd.w $vr4, $vr6, $vr2 + vpickev.w $vr4, $vr5, $vr4 + vpickev.w $vr5, $vr7, $vr6 + vmadd.w $vr2, $vr4, $vr1 + vmadd.w $vr3, $vr5, $vr1 add.d $a5, $a4, $a0 - vstx $vr3, $a5, $a2 + vstx $vr2, $a5, $a2 addi.d $a0, $a0, 32 - vstx $vr4, $a5, $a3 + vstx $vr3, $a5, $a3 bnez $a0, .LBB9_1 # %bb.2: # %middle.block ret @@ -439,12 +438,9 @@ f11: # @f11 # =>This Inner Loop Header: Depth=1 add.d $a4, $a1, $a0 vldx $vr2, $a4, $a2 - vshuf4i.w $vr3, $vr2, 16 - vslli.d $vr3, $vr3, 32 - vsrai.d $vr3, $vr3, 32 - vshuf4i.w $vr4, $vr2, 50 - vslli.d $vr4, $vr4, 32 - vsrai.d $vr4, $vr4, 32 + vsllwil.d.w $vr3, $vr2, 0 
+ vshuf4i.w $vr4, $vr2, 14 + vsllwil.d.w $vr4, $vr4, 0 vmul.d $vr4, $vr4, $vr0 vmul.d $vr3, $vr3, $vr0 vsrai.d $vr3, $vr3, 35 diff --git a/results/SingleSource/Regression/C/gcc-c-torture/execute/CMakeFiles/GCC-C-execute-pr65401.dir/pr65401.s b/results/SingleSource/Regression/C/gcc-c-torture/execute/CMakeFiles/GCC-C-execute-pr65401.dir/pr65401.s index cfbcbd9a..0a4e2c8d 100644 --- a/results/SingleSource/Regression/C/gcc-c-torture/execute/CMakeFiles/GCC-C-execute-pr65401.dir/pr65401.s +++ b/results/SingleSource/Regression/C/gcc-c-torture/execute/CMakeFiles/GCC-C-execute-pr65401.dir/pr65401.s @@ -127,13 +127,13 @@ bar: # @bar .type main,@function main: # @main # %bb.0: # %vector.ph - addi.d $sp, $sp, -368 - st.d $ra, $sp, 360 # 8-byte Folded Spill - st.d $fp, $sp, 352 # 8-byte Folded Spill - st.d $s0, $sp, 344 # 8-byte Folded Spill - st.d $s1, $sp, 336 # 8-byte Folded Spill - st.d $s2, $sp, 328 # 8-byte Folded Spill - st.d $s3, $sp, 320 # 8-byte Folded Spill + addi.d $sp, $sp, -352 + st.d $ra, $sp, 344 # 8-byte Folded Spill + st.d $fp, $sp, 336 # 8-byte Folded Spill + st.d $s0, $sp, 328 # 8-byte Folded Spill + st.d $s1, $sp, 320 # 8-byte Folded Spill + st.d $s2, $sp, 312 # 8-byte Folded Spill + st.d $s3, $sp, 304 # 8-byte Folded Spill pcalau12i $a0, %pc_hi20(.LCPI2_0) vld $vr0, $a0, %pc_lo12(.LCPI2_0) pcalau12i $a0, %pc_hi20(.LCPI2_1) @@ -143,13 +143,13 @@ main: # @main pcalau12i $a0, %pc_hi20(.LCPI2_3) vld $vr3, $a0, %pc_lo12(.LCPI2_3) vst $vr0, $sp, 128 # 16-byte Folded Spill - vst $vr0, $sp, 192 + vst $vr0, $sp, 176 vst $vr1, $sp, 112 # 16-byte Folded Spill - vst $vr1, $sp, 208 + vst $vr1, $sp, 192 vst $vr2, $sp, 96 # 16-byte Folded Spill - vst $vr2, $sp, 224 + vst $vr2, $sp, 208 vst $vr3, $sp, 80 # 16-byte Folded Spill - vst $vr3, $sp, 240 + vst $vr3, $sp, 224 pcalau12i $a0, %pc_hi20(.LCPI2_4) vld $vr0, $a0, %pc_lo12(.LCPI2_4) pcalau12i $a0, %pc_hi20(.LCPI2_5) @@ -159,15 +159,15 @@ main: # @main pcalau12i $a0, %pc_hi20(.LCPI2_7) vld $vr3, $a0, %pc_lo12(.LCPI2_7) vst $vr0, $sp, 64 # 16-byte Folded Spill - vst $vr0, $sp, 256 + vst $vr0, $sp, 240 vst $vr1, $sp, 48 # 16-byte Folded Spill - vst $vr1, $sp, 272 + vst $vr1, $sp, 256 vst $vr2, $sp, 32 # 16-byte Folded Spill - vst $vr2, $sp, 288 + vst $vr2, $sp, 272 vst $vr3, $sp, 16 # 16-byte Folded Spill - vst $vr3, $sp, 304 - addi.d $a0, $sp, 192 - addi.d $s3, $sp, 192 + vst $vr3, $sp, 288 + addi.d $a0, $sp, 176 + addi.d $s3, $sp, 176 pcaddu18i $ra, %call36(foo) jirl $ra, $ra, 0 move $a2, $zero @@ -182,8 +182,7 @@ main: # @main vrepli.d $vr4, -255 lu12i.w $a0, 4 vreplgr2vr.d $vr5, $a0 - vrepli.b $vr14, 0 - vrepli.b $vr15, -1 + vrepli.b $vr14, -1 ori $a0, $zero, 112 .p2align 4, , 16 .LBB2_1: # %vector.body41 @@ -198,16 +197,21 @@ main: # @main vmadd.d $vr9, $vr2, $vr4 vori.b $vr10, $vr5, 0 vmadd.d $vr10, $vr3, $vr4 - vilvh.h $vr11, $vr14, $vr6 - vilvh.w $vr12, $vr14, $vr11 - vilvl.w $vr11, $vr14, $vr11 - vilvl.h $vr6, $vr14, $vr6 - vilvh.w $vr13, $vr14, $vr6 - vilvl.w $vr6, $vr14, $vr6 + vbsrl.v $vr11, $vr6, 12 + vsllwil.wu.hu $vr11, $vr11, 0 + vsllwil.du.wu $vr11, $vr11, 0 + vbsrl.v $vr12, $vr6, 8 + vsllwil.wu.hu $vr12, $vr12, 0 + vsllwil.du.wu $vr12, $vr12, 0 + vshuf4i.h $vr13, $vr6, 14 + vsllwil.wu.hu $vr13, $vr13, 0 + vsllwil.du.wu $vr13, $vr13, 0 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.du.wu $vr6, $vr6, 0 vseq.d $vr6, $vr10, $vr6 - vxor.v $vr6, $vr6, $vr15 + vxor.v $vr6, $vr6, $vr14 vseq.d $vr9, $vr9, $vr13 - vxor.v $vr9, $vr9, $vr15 + vxor.v $vr9, $vr9, $vr14 vpickev.w $vr9, $vr9, $vr6 vpickve2gr.h $a2, $vr9, 2 andi $a2, $a2, 1 @@ -217,10 
+221,10 @@ main: # @main bstrins.d $a3, $a2, 2, 2 vpickve2gr.h $a2, $vr9, 6 bstrins.d $a3, $a2, 3, 3 - vseq.d $vr6, $vr8, $vr11 - vxor.v $vr6, $vr6, $vr15 - vseq.d $vr7, $vr7, $vr12 - vxor.v $vr7, $vr7, $vr15 + vseq.d $vr6, $vr8, $vr12 + vxor.v $vr6, $vr6, $vr14 + vseq.d $vr7, $vr7, $vr11 + vxor.v $vr7, $vr7, $vr14 vpickev.w $vr6, $vr7, $vr6 vpickve2gr.h $a2, $vr6, 0 bstrins.d $a3, $a2, 4, 4 @@ -244,28 +248,27 @@ main: # @main addi.d $a2, $a1, 16 bne $a1, $a0, .LBB2_1 .LBB2_3: # %middle.split - vst $vr15, $sp, 144 # 16-byte Folded Spill - vst $vr14, $sp, 160 # 16-byte Folded Spill + vst $vr14, $sp, 144 # 16-byte Folded Spill bnez $a3, .LBB2_9 # %bb.4: # %vector.body49 vld $vr0, $sp, 128 # 16-byte Folded Reload - vst $vr0, $sp, 192 + vst $vr0, $sp, 176 vld $vr0, $sp, 112 # 16-byte Folded Reload - vst $vr0, $sp, 208 + vst $vr0, $sp, 192 vld $vr0, $sp, 96 # 16-byte Folded Reload - vst $vr0, $sp, 224 + vst $vr0, $sp, 208 vld $vr0, $sp, 80 # 16-byte Folded Reload - vst $vr0, $sp, 240 + vst $vr0, $sp, 224 vld $vr0, $sp, 64 # 16-byte Folded Reload - vst $vr0, $sp, 256 + vst $vr0, $sp, 240 vld $vr0, $sp, 48 # 16-byte Folded Reload - vst $vr0, $sp, 272 + vst $vr0, $sp, 256 vld $vr0, $sp, 32 # 16-byte Folded Reload - vst $vr0, $sp, 288 + vst $vr0, $sp, 272 vld $vr0, $sp, 16 # 16-byte Folded Reload - vst $vr0, $sp, 304 - addi.d $a0, $sp, 192 - addi.d $s3, $sp, 192 + vst $vr0, $sp, 288 + addi.d $a0, $sp, 176 + addi.d $s3, $sp, 176 pcaddu18i $ra, %call36(bar) jirl $ra, $ra, 0 move $a2, $zero @@ -276,8 +279,7 @@ main: # @main vrepli.d $vr4, 255 vrepli.d $vr5, 64 ori $a0, $zero, 112 - vld $vr14, $sp, 160 # 16-byte Folded Reload - vld $vr15, $sp, 144 # 16-byte Folded Reload + vld $vr14, $sp, 144 # 16-byte Folded Reload .p2align 4, , 16 .LBB2_5: # %vector.body58 # =>This Inner Loop Header: Depth=1 @@ -291,16 +293,21 @@ main: # @main vmadd.d $vr9, $vr2, $vr4 vori.b $vr10, $vr5, 0 vmadd.d $vr10, $vr3, $vr4 - vilvh.h $vr11, $vr14, $vr6 - vilvh.w $vr12, $vr14, $vr11 - vilvl.w $vr11, $vr14, $vr11 - vilvl.h $vr6, $vr14, $vr6 - vilvh.w $vr13, $vr14, $vr6 - vilvl.w $vr6, $vr14, $vr6 + vbsrl.v $vr11, $vr6, 12 + vsllwil.wu.hu $vr11, $vr11, 0 + vsllwil.du.wu $vr11, $vr11, 0 + vbsrl.v $vr12, $vr6, 8 + vsllwil.wu.hu $vr12, $vr12, 0 + vsllwil.du.wu $vr12, $vr12, 0 + vshuf4i.h $vr13, $vr6, 14 + vsllwil.wu.hu $vr13, $vr13, 0 + vsllwil.du.wu $vr13, $vr13, 0 + vsllwil.wu.hu $vr6, $vr6, 0 + vsllwil.du.wu $vr6, $vr6, 0 vseq.d $vr6, $vr10, $vr6 - vxor.v $vr6, $vr6, $vr15 + vxor.v $vr6, $vr6, $vr14 vseq.d $vr9, $vr9, $vr13 - vxor.v $vr9, $vr9, $vr15 + vxor.v $vr9, $vr9, $vr14 vpickev.w $vr9, $vr9, $vr6 vpickve2gr.h $a2, $vr9, 2 andi $a2, $a2, 1 @@ -310,10 +317,10 @@ main: # @main bstrins.d $a3, $a2, 2, 2 vpickve2gr.h $a2, $vr9, 6 bstrins.d $a3, $a2, 3, 3 - vseq.d $vr6, $vr8, $vr11 - vxor.v $vr6, $vr6, $vr15 - vseq.d $vr7, $vr7, $vr12 - vxor.v $vr7, $vr7, $vr15 + vseq.d $vr6, $vr8, $vr12 + vxor.v $vr6, $vr6, $vr14 + vseq.d $vr7, $vr7, $vr11 + vxor.v $vr7, $vr7, $vr14 vpickev.w $vr6, $vr7, $vr6 vpickve2gr.h $a2, $vr6, 0 bstrins.d $a3, $a2, 4, 4 @@ -340,13 +347,13 @@ main: # @main bnez $a3, .LBB2_9 # %bb.8: # %middle.block65 move $a0, $zero - ld.d $s3, $sp, 320 # 8-byte Folded Reload - ld.d $s2, $sp, 328 # 8-byte Folded Reload - ld.d $s1, $sp, 336 # 8-byte Folded Reload - ld.d $s0, $sp, 344 # 8-byte Folded Reload - ld.d $fp, $sp, 352 # 8-byte Folded Reload - ld.d $ra, $sp, 360 # 8-byte Folded Reload - addi.d $sp, $sp, 368 + ld.d $s3, $sp, 304 # 8-byte Folded Reload + ld.d $s2, $sp, 312 # 8-byte Folded Reload + ld.d $s1, $sp, 320 # 
8-byte Folded Reload + ld.d $s0, $sp, 328 # 8-byte Folded Reload + ld.d $fp, $sp, 336 # 8-byte Folded Reload + ld.d $ra, $sp, 344 # 8-byte Folded Reload + addi.d $sp, $sp, 352 ret .LBB2_9: # %vector.early.exit pcaddu18i $ra, %call36(abort) diff --git a/results/SingleSource/Regression/C/gcc-c-torture/execute/CMakeFiles/GCC-C-execute-ssad-run.dir/ssad-run.s b/results/SingleSource/Regression/C/gcc-c-torture/execute/CMakeFiles/GCC-C-execute-ssad-run.dir/ssad-run.s index 667cc8e0..961c93b0 100644 --- a/results/SingleSource/Regression/C/gcc-c-torture/execute/CMakeFiles/GCC-C-execute-ssad-run.dir/ssad-run.s +++ b/results/SingleSource/Regression/C/gcc-c-torture/execute/CMakeFiles/GCC-C-execute-ssad-run.dir/ssad-run.s @@ -7,25 +7,29 @@ bar: # @bar # %bb.0: move $a4, $zero ori $a5, $zero, 16 - vrepli.b $vr0, 0 .p2align 4, , 16 .LBB0_1: # %.preheader.i # =>This Inner Loop Header: Depth=1 - vld $vr1, $a0, 0 - vld $vr2, $a1, 0 - vabsd.b $vr1, $vr1, $vr2 - vilvh.b $vr2, $vr0, $vr1 - vilvl.h $vr3, $vr0, $vr2 - vilvl.b $vr1, $vr0, $vr1 - vilvl.h $vr4, $vr0, $vr1 - vilvh.h $vr2, $vr0, $vr2 - vilvh.h $vr1, $vr0, $vr1 + vld $vr0, $a0, 0 + vld $vr1, $a1, 0 + vabsd.b $vr0, $vr0, $vr1 + vsllwil.hu.bu $vr1, $vr0, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vbsrl.v $vr2, $vr0, 8 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vbsrl.v $vr3, $vr0, 12 + vsllwil.hu.bu $vr3, $vr3, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vsrli.d $vr0, $vr0, 32 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vadd.w $vr0, $vr0, $vr3 vadd.w $vr1, $vr1, $vr2 - vadd.w $vr2, $vr4, $vr3 - vadd.w $vr1, $vr2, $vr1 - vhaddw.d.w $vr1, $vr1, $vr1 - vhaddw.q.d $vr1, $vr1, $vr1 - vpickve2gr.d $a6, $vr1, 0 + vadd.w $vr0, $vr1, $vr0 + vhaddw.d.w $vr0, $vr0, $vr0 + vhaddw.q.d $vr0, $vr0, $vr0 + vpickve2gr.d $a6, $vr0, 0 add.d $a4, $a6, $a4 addi.d $a0, $a0, 16 addi.w $a5, $a5, -1 diff --git a/results/SingleSource/Regression/C/gcc-c-torture/execute/CMakeFiles/GCC-C-execute-usad-run.dir/usad-run.s b/results/SingleSource/Regression/C/gcc-c-torture/execute/CMakeFiles/GCC-C-execute-usad-run.dir/usad-run.s index 84edce08..669c97ad 100644 --- a/results/SingleSource/Regression/C/gcc-c-torture/execute/CMakeFiles/GCC-C-execute-usad-run.dir/usad-run.s +++ b/results/SingleSource/Regression/C/gcc-c-torture/execute/CMakeFiles/GCC-C-execute-usad-run.dir/usad-run.s @@ -7,25 +7,29 @@ bar: # @bar # %bb.0: move $a4, $zero ori $a5, $zero, 16 - vrepli.b $vr0, 0 .p2align 4, , 16 .LBB0_1: # %.preheader.i # =>This Inner Loop Header: Depth=1 - vld $vr1, $a0, 0 - vld $vr2, $a1, 0 - vabsd.bu $vr1, $vr1, $vr2 - vilvh.b $vr2, $vr0, $vr1 - vilvl.h $vr3, $vr0, $vr2 - vilvl.b $vr1, $vr0, $vr1 - vilvl.h $vr4, $vr0, $vr1 - vilvh.h $vr2, $vr0, $vr2 - vilvh.h $vr1, $vr0, $vr1 + vld $vr0, $a0, 0 + vld $vr1, $a1, 0 + vabsd.bu $vr0, $vr0, $vr1 + vsllwil.hu.bu $vr1, $vr0, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vbsrl.v $vr2, $vr0, 8 + vsllwil.hu.bu $vr2, $vr2, 0 + vsllwil.wu.hu $vr2, $vr2, 0 + vbsrl.v $vr3, $vr0, 12 + vsllwil.hu.bu $vr3, $vr3, 0 + vsllwil.wu.hu $vr3, $vr3, 0 + vsrli.d $vr0, $vr0, 32 + vsllwil.hu.bu $vr0, $vr0, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vadd.w $vr0, $vr0, $vr3 vadd.w $vr1, $vr1, $vr2 - vadd.w $vr2, $vr4, $vr3 - vadd.w $vr1, $vr2, $vr1 - vhaddw.d.w $vr1, $vr1, $vr1 - vhaddw.q.d $vr1, $vr1, $vr1 - vpickve2gr.d $a6, $vr1, 0 + vadd.w $vr0, $vr1, $vr0 + vhaddw.d.w $vr0, $vr0, $vr0 + vhaddw.q.d $vr0, $vr0, $vr0 + vpickve2gr.d $a6, $vr0, 0 add.d $a4, $a6, $a4 addi.d $a0, $a0, 16 addi.w $a5, $a5, -1 diff --git 
a/results/SingleSource/UnitTests/Vectorizer/CMakeFiles/any-of.dir/any-of.s b/results/SingleSource/UnitTests/Vectorizer/CMakeFiles/any-of.dir/any-of.s index a0515f96..35d5999b 100644 --- a/results/SingleSource/UnitTests/Vectorizer/CMakeFiles/any-of.dir/any-of.s +++ b/results/SingleSource/UnitTests/Vectorizer/CMakeFiles/any-of.dir/any-of.s @@ -8686,13 +8686,9 @@ _ZNSt17_Function_handlerIFtPtS0_jEZ4mainE4$_11E9_M_invokeERKSt9_Any_dataOS0_S7_O vinsgr2vr.d $vr4, $t2, 0 vinsgr2vr.d $vr5, $t3, 0 vslt.hu $vr2, $vr4, $vr2 - vilvl.h $vr2, $vr2, $vr2 - vslli.w $vr2, $vr2, 16 - vsrai.w $vr2, $vr2, 16 + vsllwil.w.h $vr2, $vr2, 0 vslt.hu $vr3, $vr5, $vr3 - vilvl.h $vr3, $vr3, $vr3 - vslli.w $vr3, $vr3, 16 - vsrai.w $vr3, $vr3, 16 + vsllwil.w.h $vr3, $vr3, 0 vor.v $vr0, $vr0, $vr2 vor.v $vr1, $vr1, $vr3 addi.d $a7, $a7, -8 diff --git a/results/SingleSource/UnitTests/Vectorizer/CMakeFiles/gcc-loops.dir/gcc-loops.s b/results/SingleSource/UnitTests/Vectorizer/CMakeFiles/gcc-loops.dir/gcc-loops.s index 104df593..09081126 100644 --- a/results/SingleSource/UnitTests/Vectorizer/CMakeFiles/gcc-loops.dir/gcc-loops.s +++ b/results/SingleSource/UnitTests/Vectorizer/CMakeFiles/gcc-loops.dir/gcc-loops.s @@ -915,12 +915,8 @@ _Z10example10bPsS_S_PiS0_S0_: # @_Z10example10bPsS_S_PiS0_S0_ ld.d $a4, $a4, 8 vinsgr2vr.d $vr0, $a5, 0 vinsgr2vr.d $vr1, $a4, 0 - vilvl.h $vr0, $vr0, $vr0 - vslli.w $vr0, $vr0, 16 - vsrai.w $vr0, $vr0, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vsllwil.w.h $vr0, $vr0, 0 + vsllwil.w.h $vr1, $vr1, 0 vst $vr0, $a2, -16 vst $vr1, $a2, 0 addi.d $a0, $a0, 16 @@ -1476,7 +1472,6 @@ _Z9example23PtPj: # @_Z9example23PtPj # %bb.0: # %vector.ph move $a2, $zero addi.d $a0, $a0, 8 - vrepli.b $vr0, 0 ori $a3, $zero, 1024 .p2align 4, , 16 .LBB18_1: # %vector.body @@ -1484,14 +1479,14 @@ _Z9example23PtPj: # @_Z9example23PtPj ld.d $a4, $a0, -8 ld.d $a5, $a0, 0 add.d $a6, $a1, $a2 - vinsgr2vr.d $vr1, $a4, 0 - vinsgr2vr.d $vr2, $a5, 0 - vilvl.h $vr1, $vr0, $vr1 - vilvl.h $vr2, $vr0, $vr2 + vinsgr2vr.d $vr0, $a4, 0 + vinsgr2vr.d $vr1, $a5, 0 + vsllwil.wu.hu $vr0, $vr0, 0 + vsllwil.wu.hu $vr1, $vr1, 0 + vslli.w $vr0, $vr0, 7 vslli.w $vr1, $vr1, 7 - vslli.w $vr2, $vr2, 7 - vstx $vr1, $a1, $a2 - vst $vr2, $a6, 16 + vstx $vr0, $a1, $a2 + vst $vr1, $a6, 16 addi.d $a2, $a2, 32 addi.d $a0, $a0, 16 bne $a2, $a3, .LBB18_1 @@ -1520,12 +1515,8 @@ _Z9example24ss: # @_Z9example24ss ori $a3, $a2, 16 pcalau12i $a4, %pc_hi20(fb) addi.d $a4, $a4, %pc_lo12(fb) - vilvl.h $vr0, $vr0, $vr0 - vslli.w $vr0, $vr0, 16 - vsrai.w $vr0, $vr0, 16 - vilvl.h $vr1, $vr1, $vr1 - vslli.w $vr1, $vr1, 16 - vsrai.w $vr1, $vr1, 16 + vsllwil.w.h $vr0, $vr0, 0 + vsllwil.w.h $vr1, $vr1, 0 pcalau12i $a5, %pc_hi20(ic) addi.d $a5, $a5, %pc_lo12(ic) .p2align 4, , 16 diff --git a/results/SingleSource/UnitTests/Vectorizer/CMakeFiles/runtime-checks.dir/runtime-checks.s b/results/SingleSource/UnitTests/Vectorizer/CMakeFiles/runtime-checks.dir/runtime-checks.s index 138b762b..5c16e2dd 100644 --- a/results/SingleSource/UnitTests/Vectorizer/CMakeFiles/runtime-checks.dir/runtime-checks.s +++ b/results/SingleSource/UnitTests/Vectorizer/CMakeFiles/runtime-checks.dir/runtime-checks.s @@ -11970,7 +11970,6 @@ _ZNSt17_Function_handlerIFvPhS0_jEZ4mainE4$_11E9_M_invokeERKSt9_Any_dataOS0_S7_O .LBB83_9: # %vector.main.loop.iter.check ori $a5, $zero, 16 pcalau12i $a4, %pc_hi20(.LCPI83_3) - vrepli.b $vr0, 0 bgeu $a3, $a5, .LBB83_11 # %bb.10: move $a3, $zero @@ -11980,44 +11979,48 @@ 
_ZNSt17_Function_handlerIFvPhS0_jEZ4mainE4$_11E9_M_invokeERKSt9_Any_dataOS0_S7_O andi $a6, $a2, 12 bstrpick.d $a5, $a2, 31, 4 pcalau12i $a3, %pc_hi20(.LCPI83_0) - vld $vr1, $a3, %pc_lo12(.LCPI83_0) + vld $vr0, $a3, %pc_lo12(.LCPI83_0) pcalau12i $a3, %pc_hi20(.LCPI83_1) - vld $vr2, $a3, %pc_lo12(.LCPI83_1) + vld $vr1, $a3, %pc_lo12(.LCPI83_1) pcalau12i $a3, %pc_hi20(.LCPI83_2) - vld $vr3, $a3, %pc_lo12(.LCPI83_2) - vld $vr4, $a4, %pc_lo12(.LCPI83_3) + vld $vr2, $a3, %pc_lo12(.LCPI83_2) + vld $vr3, $a4, %pc_lo12(.LCPI83_3) slli.d $a3, $a5, 4 slli.d $a5, $a5, 5 - vrepli.w $vr5, 32 + vrepli.w $vr4, 32 move $a7, $a1 move $t0, $a3 .p2align 4, , 16 .LBB83_12: # %vector.body # =>This Inner Loop Header: Depth=1 - vilvh.w $vr6, $vr0, $vr1 - vilvl.w $vr7, $vr0, $vr1 - vilvh.w $vr8, $vr0, $vr2 - vilvl.w $vr9, $vr0, $vr2 - vilvh.w $vr10, $vr0, $vr3 - vilvl.w $vr11, $vr0, $vr3 - vilvh.w $vr12, $vr0, $vr4 - vilvl.w $vr13, $vr0, $vr4 - vpickve2gr.d $t1, $vr13, 0 - vpickve2gr.d $t2, $vr13, 1 - vpickve2gr.d $t3, $vr12, 0 - vpickve2gr.d $t4, $vr12, 1 - vpickve2gr.d $t5, $vr11, 0 - vpickve2gr.d $t6, $vr11, 1 - vpickve2gr.d $t7, $vr10, 0 - vpickve2gr.d $t8, $vr10, 1 - vpickve2gr.d $fp, $vr9, 0 - vpickve2gr.d $s0, $vr9, 1 - vpickve2gr.d $s1, $vr8, 0 - vpickve2gr.d $s2, $vr8, 1 - vpickve2gr.d $s3, $vr7, 0 - vpickve2gr.d $s4, $vr7, 1 - vpickve2gr.d $s5, $vr6, 0 - vpickve2gr.d $s6, $vr6, 1 + vshuf4i.w $vr5, $vr0, 14 + vsllwil.du.wu $vr5, $vr5, 0 + vsllwil.du.wu $vr6, $vr0, 0 + vshuf4i.w $vr7, $vr1, 14 + vsllwil.du.wu $vr7, $vr7, 0 + vsllwil.du.wu $vr8, $vr1, 0 + vshuf4i.w $vr9, $vr2, 14 + vsllwil.du.wu $vr9, $vr9, 0 + vsllwil.du.wu $vr10, $vr2, 0 + vshuf4i.w $vr11, $vr3, 14 + vsllwil.du.wu $vr11, $vr11, 0 + vsllwil.du.wu $vr12, $vr3, 0 + vpickve2gr.d $t1, $vr12, 0 + vpickve2gr.d $t2, $vr12, 1 + vpickve2gr.d $t3, $vr11, 0 + vpickve2gr.d $t4, $vr11, 1 + vpickve2gr.d $t5, $vr10, 0 + vpickve2gr.d $t6, $vr10, 1 + vpickve2gr.d $t7, $vr9, 0 + vpickve2gr.d $t8, $vr9, 1 + vpickve2gr.d $fp, $vr8, 0 + vpickve2gr.d $s0, $vr8, 1 + vpickve2gr.d $s1, $vr7, 0 + vpickve2gr.d $s2, $vr7, 1 + vpickve2gr.d $s3, $vr6, 0 + vpickve2gr.d $s4, $vr6, 1 + vpickve2gr.d $s5, $vr5, 0 + vpickve2gr.d $s6, $vr5, 1 ldx.b $t1, $a0, $t1 ldx.b $t2, $a0, $t2 ldx.b $t3, $a0, $t3 @@ -12034,28 +12037,28 @@ _ZNSt17_Function_handlerIFvPhS0_jEZ4mainE4$_11E9_M_invokeERKSt9_Any_dataOS0_S7_O ldx.b $s4, $a0, $s4 ldx.b $s5, $a0, $s5 ldx.b $s6, $a0, $s6 - vinsgr2vr.b $vr6, $t1, 0 - vinsgr2vr.b $vr6, $t2, 1 - vinsgr2vr.b $vr6, $t3, 2 - vinsgr2vr.b $vr6, $t4, 3 - vinsgr2vr.b $vr6, $t5, 4 - vinsgr2vr.b $vr6, $t6, 5 - vinsgr2vr.b $vr6, $t7, 6 - vinsgr2vr.b $vr6, $t8, 7 - vinsgr2vr.b $vr6, $fp, 8 - vinsgr2vr.b $vr6, $s0, 9 - vinsgr2vr.b $vr6, $s1, 10 - vinsgr2vr.b $vr6, $s2, 11 - vinsgr2vr.b $vr6, $s3, 12 - vinsgr2vr.b $vr6, $s4, 13 - vinsgr2vr.b $vr6, $s5, 14 - vinsgr2vr.b $vr6, $s6, 15 - vaddi.bu $vr6, $vr6, 10 - vst $vr6, $a7, 0 - vadd.w $vr4, $vr4, $vr5 - vadd.w $vr3, $vr3, $vr5 - vadd.w $vr2, $vr2, $vr5 - vadd.w $vr1, $vr1, $vr5 + vinsgr2vr.b $vr5, $t1, 0 + vinsgr2vr.b $vr5, $t2, 1 + vinsgr2vr.b $vr5, $t3, 2 + vinsgr2vr.b $vr5, $t4, 3 + vinsgr2vr.b $vr5, $t5, 4 + vinsgr2vr.b $vr5, $t6, 5 + vinsgr2vr.b $vr5, $t7, 6 + vinsgr2vr.b $vr5, $t8, 7 + vinsgr2vr.b $vr5, $fp, 8 + vinsgr2vr.b $vr5, $s0, 9 + vinsgr2vr.b $vr5, $s1, 10 + vinsgr2vr.b $vr5, $s2, 11 + vinsgr2vr.b $vr5, $s3, 12 + vinsgr2vr.b $vr5, $s4, 13 + vinsgr2vr.b $vr5, $s5, 14 + vinsgr2vr.b $vr5, $s6, 15 + vaddi.bu $vr5, $vr5, 10 + vst $vr5, $a7, 0 + vadd.w $vr3, $vr3, $vr4 + vadd.w $vr2, $vr2, $vr4 + vadd.w 
$vr1, $vr1, $vr4 + vadd.w $vr0, $vr0, $vr4 addi.d $t0, $t0, -16 addi.d $a7, $a7, 16 bnez $t0, .LBB83_12 @@ -12066,33 +12069,34 @@ _ZNSt17_Function_handlerIFvPhS0_jEZ4mainE4$_11E9_M_invokeERKSt9_Any_dataOS0_S7_O .LBB83_15: # %vec.epilog.ph move $a6, $a3 bstrpick.d $a7, $a2, 31, 2 - vld $vr1, $a4, %pc_lo12(.LCPI83_3) + vld $vr0, $a4, %pc_lo12(.LCPI83_3) slli.d $a3, $a7, 2 slli.d $a4, $a7, 3 - vreplgr2vr.w $vr2, $a5 - vadd.w $vr1, $vr2, $vr1 + vreplgr2vr.w $vr1, $a5 + vadd.w $vr0, $vr1, $vr0 sub.d $a5, $a6, $a3 add.d $a6, $a1, $a6 .p2align 4, , 16 .LBB83_16: # %vec.epilog.vector.body # =>This Inner Loop Header: Depth=1 - vilvh.w $vr2, $vr0, $vr1 - vilvl.w $vr3, $vr0, $vr1 - vpickve2gr.d $a7, $vr3, 0 - vpickve2gr.d $t0, $vr3, 1 - vpickve2gr.d $t1, $vr2, 0 - vpickve2gr.d $t2, $vr2, 1 + vshuf4i.w $vr1, $vr0, 14 + vsllwil.du.wu $vr1, $vr1, 0 + vsllwil.du.wu $vr2, $vr0, 0 + vpickve2gr.d $a7, $vr2, 0 + vpickve2gr.d $t0, $vr2, 1 + vpickve2gr.d $t1, $vr1, 0 + vpickve2gr.d $t2, $vr1, 1 ldx.b $a7, $a0, $a7 ldx.b $t0, $a0, $t0 ldx.b $t1, $a0, $t1 ldx.b $t2, $a0, $t2 - vinsgr2vr.b $vr2, $a7, 0 - vinsgr2vr.b $vr2, $t0, 1 - vinsgr2vr.b $vr2, $t1, 2 - vinsgr2vr.b $vr2, $t2, 3 - vaddi.bu $vr2, $vr2, 10 - vstelm.w $vr2, $a6, 0, 0 - vaddi.wu $vr1, $vr1, 8 + vinsgr2vr.b $vr1, $a7, 0 + vinsgr2vr.b $vr1, $t0, 1 + vinsgr2vr.b $vr1, $t1, 2 + vinsgr2vr.b $vr1, $t2, 3 + vaddi.bu $vr1, $vr1, 10 + vstelm.w $vr1, $a6, 0, 0 + vaddi.wu $vr0, $vr0, 8 addi.d $a5, $a5, 4 addi.d $a6, $a6, 4 bnez $a5, .LBB83_16 @@ -12228,37 +12232,37 @@ _ZNSt17_Function_handlerIFvPjS0_jEZ4mainE4$_11E9_M_invokeERKSt9_Any_dataOS0_S7_O .LBB87_7: # %"_ZSt10__invoke_rIvRZ4mainE4$_11JPjS2_jEENSt9enable_ifIX16is_invocable_r_vIT_T0_DpT1_EES4_E4typeEOS5_DpOS6_.exit" ret .LBB87_8: # %vector.ph + pcalau12i $a3, %pc_hi20(.LCPI87_0) + vld $vr0, $a3, %pc_lo12(.LCPI87_0) bstrpick.d $a3, $a2, 31, 2 - pcalau12i $a4, %pc_hi20(.LCPI87_0) - vld $vr0, $a4, %pc_lo12(.LCPI87_0) slli.d $a4, $a3, 2 slli.d $a3, $a3, 3 - vrepli.b $vr1, 0 move $a5, $a1 move $a6, $a4 .p2align 4, , 16 .LBB87_9: # %vector.body # =>This Inner Loop Header: Depth=1 - vilvh.w $vr2, $vr1, $vr0 - vilvl.w $vr3, $vr1, $vr0 - vpickve2gr.d $a7, $vr3, 0 + vshuf4i.w $vr1, $vr0, 14 + vsllwil.du.wu $vr1, $vr1, 0 + vsllwil.du.wu $vr2, $vr0, 0 + vpickve2gr.d $a7, $vr2, 0 slli.d $a7, $a7, 2 - vpickve2gr.d $t0, $vr3, 1 + vpickve2gr.d $t0, $vr2, 1 slli.d $t0, $t0, 2 - vpickve2gr.d $t1, $vr2, 0 + vpickve2gr.d $t1, $vr1, 0 slli.d $t1, $t1, 2 - vpickve2gr.d $t2, $vr2, 1 + vpickve2gr.d $t2, $vr1, 1 slli.d $t2, $t2, 2 ldx.w $a7, $a0, $a7 ldx.w $t0, $a0, $t0 ldx.w $t1, $a0, $t1 ldx.w $t2, $a0, $t2 - vinsgr2vr.w $vr2, $a7, 0 - vinsgr2vr.w $vr2, $t0, 1 - vinsgr2vr.w $vr2, $t1, 2 - vinsgr2vr.w $vr2, $t2, 3 - vaddi.wu $vr2, $vr2, 10 - vst $vr2, $a5, 0 + vinsgr2vr.w $vr1, $a7, 0 + vinsgr2vr.w $vr1, $t0, 1 + vinsgr2vr.w $vr1, $t1, 2 + vinsgr2vr.w $vr1, $t2, 3 + vaddi.wu $vr1, $vr1, 10 + vst $vr1, $a5, 0 vaddi.wu $vr0, $vr0, 8 addi.d $a6, $a6, -4 addi.d $a5, $a5, 16 diff --git a/results/SingleSource/UnitTests/Vectorizer/VPlanNativePath/CMakeFiles/outer-loop-vect.dir/outer-loop-vect.s b/results/SingleSource/UnitTests/Vectorizer/VPlanNativePath/CMakeFiles/outer-loop-vect.dir/outer-loop-vect.s index 6bc8b38b..325c2947 100644 --- a/results/SingleSource/UnitTests/Vectorizer/VPlanNativePath/CMakeFiles/outer-loop-vect.dir/outer-loop-vect.s +++ b/results/SingleSource/UnitTests/Vectorizer/VPlanNativePath/CMakeFiles/outer-loop-vect.dir/outer-loop-vect.s @@ -753,43 +753,41 @@ main: # @main vadd.d $vr11, $vr3, $vr11 vpickve2gr.d $a4, 
$vr11, 0 ld.w $a4, $a4, 0 - vpickve2gr.d $a5, $vr12, 0 + vpickve2gr.d $a5, $vr11, 1 ld.w $a5, $a5, 0 - vpickve2gr.d $a6, $vr12, 1 + vpickve2gr.d $a6, $vr12, 0 ld.w $a6, $a6, 0 - vpickve2gr.d $a7, $vr11, 1 + vpickve2gr.d $a7, $vr12, 1 ld.w $a7, $a7, 0 - vinsgr2vr.w $vr11, $a5, 0 - vinsgr2vr.w $vr11, $a6, 2 - vslli.d $vr11, $vr11, 32 - vsrai.d $vr11, $vr11, 32 - vinsgr2vr.w $vr12, $a4, 0 - vinsgr2vr.w $vr12, $a7, 2 - vslli.d $vr12, $vr12, 32 - vsrai.d $vr12, $vr12, 32 - vmuh.du $vr13, $vr12, $vr4 - vsub.d $vr14, $vr12, $vr13 - vsrli.d $vr14, $vr14, 1 - vadd.d $vr13, $vr14, $vr13 - vsrli.d $vr13, $vr13, 6 - vmsub.d $vr12, $vr13, $vr5 + vinsgr2vr.w $vr11, $a4, 0 + vinsgr2vr.w $vr11, $a5, 1 + vinsgr2vr.w $vr12, $a6, 0 + vinsgr2vr.w $vr12, $a7, 1 + vsllwil.d.w $vr12, $vr12, 0 + vsllwil.d.w $vr11, $vr11, 0 vmuh.du $vr13, $vr11, $vr4 vsub.d $vr14, $vr11, $vr13 vsrli.d $vr14, $vr14, 1 vadd.d $vr13, $vr14, $vr13 vsrli.d $vr13, $vr13, 6 vmsub.d $vr11, $vr13, $vr5 - vslli.d $vr11, $vr11, 2 - vadd.d $vr11, $vr6, $vr11 + vmuh.du $vr13, $vr12, $vr4 + vsub.d $vr14, $vr12, $vr13 + vsrli.d $vr14, $vr14, 1 + vadd.d $vr13, $vr14, $vr13 + vsrli.d $vr13, $vr13, 6 + vmsub.d $vr12, $vr13, $vr5 vslli.d $vr12, $vr12, 2 vadd.d $vr12, $vr6, $vr12 - vpickve2gr.d $a4, $vr12, 0 + vslli.d $vr11, $vr11, 2 + vadd.d $vr11, $vr6, $vr11 + vpickve2gr.d $a4, $vr11, 0 ld.w $a4, $a4, 0 - vpickve2gr.d $a5, $vr12, 1 + vpickve2gr.d $a5, $vr11, 1 ld.w $a5, $a5, 0 - vpickve2gr.d $a6, $vr11, 0 + vpickve2gr.d $a6, $vr12, 0 ld.w $a6, $a6, 0 - vpickve2gr.d $a7, $vr11, 1 + vpickve2gr.d $a7, $vr12, 1 ld.w $a7, $a7, 0 vinsgr2vr.w $vr11, $a4, 0 vinsgr2vr.w $vr11, $a5, 1