diff --git a/results/MicroBenchmarks/LCALS/SubsetALambdaLoops/CMakeFiles/lcalsALambda.dir/LambdaSubsetAbenchmarks.s b/results/MicroBenchmarks/LCALS/SubsetALambdaLoops/CMakeFiles/lcalsALambda.dir/LambdaSubsetAbenchmarks.s index e9b319b5..c5c4f4c2 100644 --- a/results/MicroBenchmarks/LCALS/SubsetALambdaLoops/CMakeFiles/lcalsALambda.dir/LambdaSubsetAbenchmarks.s +++ b/results/MicroBenchmarks/LCALS/SubsetALambdaLoops/CMakeFiles/lcalsALambda.dir/LambdaSubsetAbenchmarks.s @@ -1192,14 +1192,13 @@ _ZL20BM_VOL3D_CALC_LAMBDARN9benchmark5StateE: # @_ZL20BM_VOL3D_CALC_LAMBDARN9ben ori $a0, $zero, 5 pcaddu18i $ra, %call36(_Z8loopInitj) jirl $ra, $ra, 0 - ld.d $s8, $s0, 8 + ld.d $s6, $s0, 8 ld.d $a0, $s2, 32 - ld.d $a1, $s0, 16 - st.d $a1, $sp, 80 # 8-byte Folded Spill + ld.d $s8, $s0, 16 ld.d $a1, $s0, 24 - st.d $a1, $sp, 128 # 8-byte Folded Spill + st.d $a1, $sp, 136 # 8-byte Folded Spill ld.d $a1, $s0, 32 - st.d $a1, $sp, 152 # 8-byte Folded Spill + st.d $a1, $sp, 80 # 8-byte Folded Spill ld.w $a1, $a0, 0 addi.d $a0, $sp, 160 ori $a2, $zero, 3 @@ -1210,158 +1209,159 @@ _ZL20BM_VOL3D_CALC_LAMBDARN9benchmark5StateE: # @_ZL20BM_VOL3D_CALC_LAMBDARN9ben ld.w $s1, $s2, 28 ld.d $s4, $s2, 16 .Ltmp0: # EH_LABEL - st.d $s2, $sp, 16 # 8-byte Folded Spill + st.d $s2, $sp, 8 # 8-byte Folded Spill move $a0, $s2 pcaddu18i $ra, %call36(_ZN9benchmark5State16StartKeepRunningEv) jirl $ra, $ra, 0 .Ltmp1: # EH_LABEL # %bb.1: # %_ZN9benchmark5State3endEv.exit.preheader - ld.d $t4, $sp, 80 # 8-byte Folded Reload + ld.d $ra, $sp, 80 # 8-byte Folded Reload bnez $s1, .LBB2_12 # %bb.2: # %_ZN9benchmark5State3endEv.exit.preheader beqz $s4, .LBB2_12 # %bb.3: # %.lr.ph92 - addi.d $a0, $s8, 8 - alsl.d $a1, $s0, $s8, 3 + addi.d $a0, $s6, 8 + alsl.d $a1, $s0, $s6, 3 alsl.d $a2, $s0, $a0, 3 - st.d $a2, $sp, 144 # 8-byte Folded Spill + st.d $a2, $sp, 72 # 8-byte Folded Spill + alsl.d $a2, $fp, $s6, 3 + st.d $a2, $sp, 120 # 8-byte Folded Spill + ld.w $a3, $sp, 224 alsl.d $a0, $fp, $a0, 3 - st.d $a0, $sp, 136 # 8-byte Folded Spill - ld.w $a2, $sp, 224 - alsl.d $s7, $fp, $a1, 3 - ld.w $a4, $sp, 228 - addi.d $a6, $t4, 8 - slli.d $a5, $a2, 3 - ld.d $a0, $sp, 152 # 8-byte Folded Reload - alsl.d $a3, $a2, $a0, 3 - sub.d $a0, $a4, $a2 - bstrpick.d $a0, $a0, 31, 0 - alsl.d $a1, $a0, $a3, 3 - addi.d $a1, $a1, 8 - slli.d $a7, $fp, 3 - slli.d $t5, $s0, 3 - alsl.d $a7, $a2, $a7, 3 - alsl.d $t1, $s0, $a7, 3 - add.d $t6, $s8, $t1 - alsl.d $t0, $a0, $t1, 3 - addi.d $t2, $t0, 16 + st.d $a0, $sp, 64 # 8-byte Folded Spill + ld.w $a5, $sp, 228 + st.d $a1, $sp, 128 # 8-byte Folded Spill + alsl.d $a0, $fp, $a1, 3 + st.d $a0, $sp, 152 # 8-byte Folded Spill + slli.d $a7, $a3, 3 + alsl.d $a6, $a3, $ra, 3 + sub.d $a0, $a5, $a3 + bstrpick.d $a1, $a0, 31, 0 + alsl.d $a0, $a1, $a6, 3 + addi.d $a4, $a0, 8 + slli.d $a0, $fp, 3 + slli.d $t3, $s0, 3 + alsl.d $t0, $a3, $a0, 3 + alsl.d $a0, $s0, $t0, 3 + add.d $t4, $s6, $a0 + alsl.d $t1, $a1, $a0, 3 + addi.d $t2, $t1, 16 + add.d $t5, $s6, $t2 + add.d $t6, $s6, $t0 + alsl.d $t1, $a1, $t0, 3 + addi.d $t1, $t1, 16 + add.d $t7, $s6, $t1 + alsl.d $t8, $a3, $s6, 3 + sltu $t5, $a6, $t5 + sltu $t4, $t4, $a4 + and $t5, $t5, $t4 + alsl.d $t4, $a1, $a7, 3 + addi.d $t4, $t4, 16 + sltu $t7, $a6, $t7 + sltu $t6, $t6, $a4 + and $t6, $t7, $t6 + add.d $t7, $s6, $t4 + alsl.d $t3, $a3, $t3, 3 + or $t6, $t5, $t6 + add.d $s1, $s6, $t3 + sltu $t5, $a6, $t7 + sltu $t7, $t8, $a4 + and $t7, $t5, $t7 + alsl.d $t5, $a1, $t3, 3 + addi.d $t5, $t5, 16 + or $t6, $t6, $t7 + add.d $t7, $s6, $t5 + sltu $t7, $a6, $t7 + sltu $t8, $s1, $a4 + and $t7, $t7, $t8 + add.d $t8, $s8, $a0 + or $t6, $t6, $t7 add.d $t7, $s8, $t2 - add.d $t8, $s8, $a7 - alsl.d $t0, $a0, $a7, 3 - addi.d $t0, $t0, 16 - add.d $s1, $s8, $t0 - alsl.d $s2, $a2, $s8, 3 - sltu $t7, $a3, $t7 - sltu $t6, $t6, $a1 - and $t7, $t7, $t6 - alsl.d $t6, $a0, $a5, 3 - addi.d $t6, $t6, 16 - sltu $s1, $a3, $s1 - sltu $t8, $t8, $a1 - and $t8, $s1, $t8 - add.d $s1, $s8, $t6 - alsl.d $t5, $a2, $t5, 3 - or $t8, $t7, $t8 - add.d $s3, $s8, $t5 - sltu $t7, $a3, $s1 - sltu $s1, $s2, $a1 - and $s1, $t7, $s1 - alsl.d $t7, $a0, $t5, 3 - addi.d $t7, $t7, 16 - or $t8, $t8, $s1 - add.d $s1, $s8, $t7 - sltu $s1, $a3, $s1 - sltu $s2, $s3, $a1 - and $s1, $s1, $s2 - add.d $s2, $t4, $t1 - or $t8, $t8, $s1 - add.d $s1, $t4, $t2 - sltu $s1, $a3, $s1 - sltu $s2, $s2, $a1 - and $s1, $s1, $s2 - add.d $s2, $t4, $a7 - or $t8, $t8, $s1 - add.d $s1, $t4, $t0 - sltu $s1, $a3, $s1 - sltu $s2, $s2, $a1 - and $s1, $s1, $s2 - alsl.d $s2, $a2, $t4, 3 - or $t8, $t8, $s1 - add.d $s1, $t4, $t6 - sltu $s1, $a3, $s1 - sltu $s2, $s2, $a1 - and $s1, $s1, $s2 - add.d $s2, $t4, $t5 - or $s1, $t8, $s1 - add.d $t8, $t4, $t7 - sltu $t8, $a3, $t8 - sltu $s2, $s2, $a1 - and $s2, $t8, $s2 - alsl.d $t8, $s0, $a6, 3 - or $s1, $s1, $s2 - alsl.d $s2, $fp, $t4, 3 - alsl.d $a6, $fp, $a6, 3 - st.d $a6, $sp, 120 # 8-byte Folded Spill - ld.d $t3, $sp, 128 # 8-byte Folded Reload - add.d $a6, $t3, $t1 - add.d $t1, $t3, $t2 - sltu $t1, $a3, $t1 - sltu $a6, $a6, $a1 - and $a6, $t1, $a6 - st.d $s2, $sp, 72 # 8-byte Folded Spill - alsl.d $t1, $s0, $s2, 3 - st.d $t1, $sp, 112 # 8-byte Folded Spill - or $a6, $s1, $a6 - alsl.d $t1, $fp, $t8, 3 - st.d $t1, $sp, 64 # 8-byte Folded Spill - add.d $a7, $t3, $a7 - add.d $t0, $t3, $t0 - sltu $t0, $a3, $t0 - sltu $a7, $a7, $a1 - and $a7, $t0, $a7 - addi.d $t1, $t3, 8 - or $a6, $a6, $a7 - alsl.d $a7, $a2, $t3, 3 - add.d $t0, $t3, $t6 - sltu $t0, $a3, $t0 - sltu $a7, $a7, $a1 - and $a7, $t0, $a7 - alsl.d $t0, $s0, $t3, 3 - st.d $t1, $sp, 56 # 8-byte Folded Spill - alsl.d $t1, $s0, $t1, 3 - or $a6, $a6, $a7 - alsl.d $t2, $fp, $t3, 3 - add.d $a7, $t3, $t7 - sltu $a3, $a3, $a7 - st.d $t0, $sp, 48 # 8-byte Folded Spill - alsl.d $a7, $fp, $t0, 3 - st.d $a7, $sp, 96 # 8-byte Folded Spill - st.d $t1, $sp, 104 # 8-byte Folded Spill - alsl.d $a7, $fp, $t1, 3 - st.d $a7, $sp, 40 # 8-byte Folded Spill - add.d $a7, $t3, $t5 - addi.d $t0, $a0, 1 - sltu $a0, $a7, $a1 - and $a0, $a3, $a0 - or $a0, $a6, $a0 - st.d $t0, $sp, 32 # 8-byte Folded Spill - bstrpick.d $a1, $t0, 32, 1 - slli.d $s6, $a1, 1 - alsl.d $a1, $a1, $a2, 1 - st.d $a1, $sp, 24 # 8-byte Folded Spill - xor $a1, $a4, $a2 - sltui $a1, $a1, 1 - or $a0, $a1, $a0 - pcalau12i $a1, %pc_hi20(.LCPI2_0) - fld.d $fa0, $a1, %pc_lo12(.LCPI2_0) - lu12i.w $a1, 349525 - ori $a1, $a1, 1365 - lu32i.d $a1, 349525 - lu52i.d $a1, $a1, 1019 - vreplgr2vr.d $vr1, $a1 - addi.d $a1, $a4, 1 - st.d $a1, $sp, 88 # 8-byte Folded Spill - andi $ra, $a0, 1 + sltu $t7, $a6, $t7 + sltu $t8, $t8, $a4 + and $t7, $t7, $t8 + add.d $t8, $s8, $t0 + or $t6, $t6, $t7 + add.d $t7, $s8, $t1 + sltu $t7, $a6, $t7 + sltu $t8, $t8, $a4 + and $t7, $t7, $t8 + alsl.d $t8, $a3, $s8, 3 + or $t6, $t6, $t7 + add.d $t7, $s8, $t4 + sltu $t7, $a6, $t7 + sltu $t8, $t8, $a4 + and $t7, $t7, $t8 + add.d $t8, $s8, $t3 + or $t6, $t6, $t7 + add.d $t7, $s8, $t5 + sltu $t7, $a6, $t7 + sltu $t8, $t8, $a4 + and $t7, $t7, $t8 + alsl.d $t8, $s0, $s8, 3 + or $t6, $t6, $t7 + addi.d $t7, $t8, 8 + ld.d $a2, $sp, 136 # 8-byte Folded Reload + add.d $a0, $a2, $a0 + add.d $t2, $a2, $t2 + sltu $t2, $a6, $t2 + sltu $a0, $a0, $a4 + and $t2, $t2, $a0 + alsl.d $a0, $fp, $s8, 3 + or $t2, $t6, $t2 + st.d $t8, $sp, 112 # 8-byte Folded Spill + alsl.d $t6, $fp, $t8, 3 + st.d $t6, $sp, 104 # 8-byte Folded Spill + add.d $t0, $a2, $t0 + add.d $t1, $a2, $t1 + sltu $t1, $a6, $t1 + sltu $t0, $t0, $a4 + and $t0, $t1, $t0 + st.d $t7, $sp, 56 # 8-byte Folded Spill + alsl.d $t1, $fp, $t7, 3 + st.d $t1, $sp, 48 # 8-byte Folded Spill + or $t0, $t2, $t0 + alsl.d $t1, $a3, $a2, 3 + add.d $t2, $a2, $t4 + sltu $t2, $a6, $t2 + sltu $t1, $t1, $a4 + and $t1, $t2, $t1 + addi.d $t2, $a2, 8 + or $t0, $t0, $t1 + alsl.d $t8, $s0, $a2, 3 + st.d $t2, $sp, 40 # 8-byte Folded Spill + alsl.d $t2, $s0, $t2, 3 + add.d $t1, $a2, $t5 + sltu $a6, $a6, $t1 + alsl.d $t1, $fp, $a2, 3 + alsl.d $t2, $fp, $t2, 3 + st.d $t2, $sp, 32 # 8-byte Folded Spill + alsl.d $t2, $fp, $t8, 3 + st.d $t2, $sp, 96 # 8-byte Folded Spill + add.d $t2, $a2, $t3 + addi.d $a2, $a1, 1 + sltu $a1, $t2, $a4 + and $a1, $a6, $a1 + or $a1, $t0, $a1 + st.d $a2, $sp, 24 # 8-byte Folded Spill + bstrpick.d $a4, $a2, 32, 1 + slli.d $a2, $a4, 1 + st.d $a2, $sp, 144 # 8-byte Folded Spill + alsl.d $a2, $a4, $a3, 1 + st.d $a2, $sp, 16 # 8-byte Folded Spill + xor $a4, $a5, $a3 + sltui $a4, $a4, 1 + or $a1, $a4, $a1 + pcalau12i $a4, %pc_hi20(.LCPI2_0) + fld.d $fa0, $a4, %pc_lo12(.LCPI2_0) + lu12i.w $a4, 349525 + ori $a4, $a4, 1365 + lu32i.d $a4, 349525 + lu52i.d $a4, $a4, 1019 + vreplgr2vr.d $vr1, $a4 + addi.d $a2, $a5, 1 + st.d $a2, $sp, 88 # 8-byte Folded Spill + andi $fp, $a1, 1 b .LBB2_5 .p2align 4, , 16 .LBB2_4: # %"_Z6forallI9simd_execZL20BM_VOL3D_CALC_LAMBDARN9benchmark5StateEE3$_0EviiT0_.exit" @@ -1371,82 +1371,83 @@ _ZL20BM_VOL3D_CALC_LAMBDARN9benchmark5StateE: # @_ZL20BM_VOL3D_CALC_LAMBDARN9ben .LBB2_5: # =>This Loop Header: Depth=1 # Child Loop BB2_8 Depth 2 # Child Loop BB2_11 Depth 2 - blt $a4, $a2, .LBB2_4 + blt $a5, $a3, .LBB2_4 # %bb.6: # %.lr.ph.preheader # in Loop: Header=BB2_5 Depth=1 - move $a0, $a2 - bnez $ra, .LBB2_10 + move $a4, $a3 + bnez $fp, .LBB2_10 # %bb.7: # %vector.body.preheader # in Loop: Header=BB2_5 Depth=1 - move $a7, $t8 - move $t0, $t4 - ld.d $t5, $sp, 72 # 8-byte Folded Reload - ld.d $a6, $sp, 64 # 8-byte Folded Reload - ld.d $t6, $sp, 40 # 8-byte Folded Reload - ld.d $t7, $sp, 56 # 8-byte Folded Reload - ld.d $t1, $sp, 48 # 8-byte Folded Reload - move $a1, $t2 - move $fp, $s7 - ld.d $a0, $sp, 136 # 8-byte Folded Reload - move $t4, $s8 - move $s0, $s8 - ld.d $s1, $sp, 144 # 8-byte Folded Reload - ld.d $a3, $sp, 152 # 8-byte Folded Reload - move $s2, $s6 + ld.d $a1, $sp, 56 # 8-byte Folded Reload + move $s7, $s8 + move $a4, $s8 + move $a6, $a0 + ld.d $t0, $sp, 48 # 8-byte Folded Reload + ld.d $t2, $sp, 32 # 8-byte Folded Reload + ld.d $t3, $sp, 40 # 8-byte Folded Reload + move $t4, $t8 + move $t5, $t1 + ld.d $t6, $sp, 152 # 8-byte Folded Reload + ld.d $t7, $sp, 64 # 8-byte Folded Reload + move $a2, $s6 + move $s0, $s6 + ld.d $s1, $sp, 72 # 8-byte Folded Reload + move $s2, $ra + ld.d $s3, $sp, 144 # 8-byte Folded Reload .p2align 4, , 16 .LBB2_8: # %vector.body # Parent Loop BB2_5 Depth=1 # => This Inner Loop Header: Depth=2 - add.d $s3, $fp, $a5 - vld $vr3, $s3, 8 - add.d $s3, $s0, $a5 - vld $vr2, $s3, 8 + add.d $s5, $t6, $a7 + vld $vr3, $s5, 8 + add.d $s5, $s0, $a7 + vld $vr2, $s5, 8 vfsub.d $vr2, $vr3, $vr2 - add.d $s3, $s1, $a5 - vld $vr4, $s3, -8 - add.d $s3, $a0, $a5 - vld $vr5, $s3, -8 - vldx $vr6, $s1, $a5 - vldx $vr8, $s0, $a5 - vldx $vr9, $a0, $a5 + add.d $s5, $s1, $a7 + vld $vr4, $s5, -8 + add.d $s5, $t7, $a7 + vld $vr5, $s5, -8 + vldx $vr6, $s1, $a7 + vldx $vr8, $s0, $a7 + vldx $vr9, $t7, $a7 vfsub.d $vr7, $vr3, $vr4 vfsub.d $vr4, $vr3, $vr5 vfsub.d $vr5, $vr6, $vr8 vfsub.d $vr3, $vr9, $vr8 - vldx $vr6, $fp, $a5 - vldx $vr10, $a6, $a5 - add.d $s3, $t0, $a5 - vld $vr11, $s3, 8 - add.d $s3, $a7, $a5 - vld $vr12, $s3, -8 + vldx $vr6, $t6, $a7 + vldx $vr10, $t0, $a7 + add.d $s5, $a4, $a7 + vld $vr11, $s5, 8 + add.d $s5, $a1, $a7 + vld $vr12, $s5, -8 vfsub.d $vr9, $vr6, $vr8 - add.d $s3, $a6, $a5 + add.d $s5, $t0, $a7 vfsub.d $vr6, $vr10, $vr11 vfsub.d $vr11, $vr10, $vr12 - add.d $s8, $t5, $a5 - vldx $vr8, $t5, $a5 - vldx $vr12, $a7, $a5 - vldx $vr13, $t0, $a5 - vld $vr14, $s8, 8 - vld $vr15, $s3, -8 + add.d $ra, $a6, $a7 + vldx $vr8, $a6, $a7 + vldx $vr12, $a1, $a7 + vldx $vr13, $a4, $a7 + vld $vr14, $ra, 8 + vld $vr15, $s5, -8 vfsub.d $vr10, $vr10, $vr8 vfsub.d $vr12, $vr12, $vr13 vfsub.d $vr8, $vr14, $vr13 vfsub.d $vr13, $vr15, $vr13 - vldx $vr14, $t6, $a5 - vldx $vr15, $t7, $a5 - add.d $s3, $t6, $a5 - vldx $vr16, $t1, $a5 - add.d $s8, $t7, $a5 + vldx $vr14, $t2, $a7 + vldx $vr15, $t3, $a7 + add.d $s5, $t2, $a7 + vldx $vr16, $t4, $a7 + add.d $ra, $t3, $a7 vfsub.d $vr15, $vr14, $vr15 - add.d $s5, $t1, $a5 + add.d $s6, $t4, $a7 vfsub.d $vr16, $vr14, $vr16 - add.d $t3, $a1, $a5 - vldx $vr17, $a1, $a5 - vld $vr18, $s5, 8 - vld $vr19, $s8, -8 - vld $vr20, $t3, 8 - vld $vr21, $s3, -8 + add.d $s8, $t5, $a7 + vldx $vr17, $t5, $a7 + vld $vr18, $s6, 8 + vld $vr19, $ra, -8 + vld $vr20, $s8, 8 + vld $vr21, $s5, -8 vfsub.d $vr14, $vr14, $vr17 vfsub.d $vr17, $vr18, $vr19 vfsub.d $vr18, $vr20, $vr19 @@ -1499,173 +1500,165 @@ _ZL20BM_VOL3D_CALC_LAMBDARN9benchmark5StateE: # @_ZL20BM_VOL3D_CALC_LAMBDARN9ben vfmadd.d $vr2, $vr9, $vr2, $vr3 vfadd.d $vr2, $vr2, $vr7 vfmul.d $vr2, $vr2, $vr1 - vstx $vr2, $a3, $a5 - addi.d $s2, $s2, -2 - addi.d $a3, $a3, 16 + vstx $vr2, $s2, $a7 + addi.d $s3, $s3, -2 + addi.d $s2, $s2, 16 addi.d $s1, $s1, 16 addi.d $s0, $s0, 16 - addi.d $a0, $a0, 16 - addi.d $fp, $fp, 16 - addi.d $a1, $a1, 16 - addi.d $t1, $t1, 16 addi.d $t7, $t7, 16 addi.d $t6, $t6, 16 - addi.d $a6, $a6, 16 addi.d $t5, $t5, 16 + addi.d $t4, $t4, 16 + addi.d $t3, $t3, 16 + addi.d $t2, $t2, 16 addi.d $t0, $t0, 16 - addi.d $a7, $a7, 16 - bnez $s2, .LBB2_8 + addi.d $a6, $a6, 16 + addi.d $a4, $a4, 16 + addi.d $a1, $a1, 16 + bnez $s3, .LBB2_8 # %bb.9: # %middle.block # in Loop: Header=BB2_5 Depth=1 - ld.d $a0, $sp, 24 # 8-byte Folded Reload - move $s8, $t4 - ld.d $t4, $sp, 80 # 8-byte Folded Reload - ld.d $a1, $sp, 32 # 8-byte Folded Reload - beq $a1, $s6, .LBB2_4 + ld.d $a4, $sp, 16 # 8-byte Folded Reload + move $s6, $a2 + move $s8, $s7 + ld.d $ra, $sp, 80 # 8-byte Folded Reload + ld.d $a1, $sp, 24 # 8-byte Folded Reload + ld.d $a2, $sp, 144 # 8-byte Folded Reload + beq $a1, $a2, .LBB2_4 .LBB2_10: # %.lr.ph.preheader188 # in Loop: Header=BB2_5 Depth=1 - slli.d $fp, $a0, 3 - ld.d $a1, $sp, 88 # 8-byte Folded Reload - sub.d $a7, $a1, $a0 - move $t6, $t2 - ld.d $t5, $sp, 104 # 8-byte Folded Reload - ld.d $t0, $sp, 120 # 8-byte Folded Reload - move $s2, $t4 - move $a6, $t8 - ld.d $s3, $sp, 128 # 8-byte Folded Reload - ld.d $t7, $sp, 96 # 8-byte Folded Reload - ld.d $t1, $sp, 112 # 8-byte Folded Reload - ld.d $a1, $sp, 136 # 8-byte Folded Reload - move $s1, $s8 - ld.d $a0, $sp, 144 # 8-byte Folded Reload - ld.d $s0, $sp, 152 # 8-byte Folded Reload - move $a3, $s7 + slli.d $a1, $a4, 3 + ld.d $a2, $sp, 88 # 8-byte Folded Reload + sub.d $a4, $a2, $a4 + move $s0, $ra + ld.d $s5, $sp, 96 # 8-byte Folded Reload + ld.d $s3, $sp, 136 # 8-byte Folded Reload + move $t7, $t1 + move $t6, $t8 + ld.d $t3, $sp, 104 # 8-byte Folded Reload + move $s2, $s8 + move $t2, $a0 + ld.d $t0, $sp, 112 # 8-byte Folded Reload + ld.d $a6, $sp, 152 # 8-byte Folded Reload + move $s1, $s6 + ld.d $t5, $sp, 120 # 8-byte Folded Reload + ld.d $t4, $sp, 128 # 8-byte Folded Reload .p2align 4, , 16 .LBB2_11: # %.lr.ph # Parent Loop BB2_5 Depth=1 # => This Inner Loop Header: Depth=2 - add.d $t3, $a3, $fp - add.d $s5, $a0, $fp - fld.d $fa2, $s5, -8 - add.d $s5, $a1, $fp - fld.d $fa3, $s5, -8 - fldx.d $fa4, $a0, $fp - fld.d $fa5, $t3, 8 - fldx.d $fa7, $a1, $fp - vldx $vr8, $a3, $fp - fldx.d $ft1, $s1, $fp - fsub.d $fa6, $fa5, $fa2 - vldx $vr10, $s1, $fp - fsub.d $fa3, $fa5, $fa3 - fsub.d $fa4, $fa4, $ft1 - fsub.d $fa2, $fa7, $ft1 - vfsub.d $vr9, $vr8, $vr10 - add.d $t3, $t1, $fp - add.d $s5, $a6, $fp - fld.d $fa5, $s5, -8 - add.d $s5, $t0, $fp - fld.d $fa7, $s5, -8 - fldx.d $ft0, $a6, $fp - fld.d $ft2, $t3, 8 - fldx.d $ft3, $t0, $fp - vldx $vr12, $t1, $fp - fldx.d $ft5, $s2, $fp - fsub.d $ft6, $ft2, $fa5 - vldx $vr15, $s2, $fp - fsub.d $fa7, $ft2, $fa7 - fsub.d $ft0, $ft0, $ft5 - fsub.d $fa5, $ft3, $ft5 - vfsub.d $vr10, $vr12, $vr15 - add.d $t3, $t7, $fp - add.d $s5, $t5, $fp - fld.d $ft3, $s5, -8 - add.d $s5, $t6, $fp - fldx.d $ft4, $t6, $fp - fldx.d $ft5, $t5, $fp - fld.d $ft7, $t3, 8 - fld.d $ft8, $s5, 8 - vldx $vr17, $t7, $fp - fldx.d $ft10, $s3, $fp - fsub.d $ft3, $ft7, $ft3 - vldx $vr19, $s3, $fp - fsub.d $ft4, $ft7, $ft4 - fsub.d $ft5, $ft5, $ft10 - fsub.d $ft7, $ft8, $ft10 - vfsub.d $vr16, $vr17, $vr19 - vreplvei.d $vr17, $vr9, 0 - vreplvei.d $vr9, $vr9, 1 - fadd.d $ft10, $ft1, $ft9 - vreplvei.d $vr19, $vr10, 0 - vreplvei.d $vr10, $vr10, 1 - fadd.d $ft12, $ft2, $ft11 - vreplvei.d $vr21, $vr16, 0 - vreplvei.d $vr16, $vr16, 1 - fadd.d $ft14, $ft8, $ft13 - fneg.d $ft15, $ft0 - fmul.d $ft15, $ft3, $ft15 - fmadd.d $ft15, $ft6, $ft5, $ft15 - fneg.d $fs0, $ft5 - fmul.d $fs0, $fa6, $fs0 - fmadd.d $fs0, $ft3, $fa4, $fs0 - fneg.d $fs1, $fa4 - fmul.d $fs1, $ft6, $fs1 - fmadd.d $fs1, $fa6, $ft0, $fs1 - fmul.d $ft12, $ft12, $fs0 - fmadd.d $ft10, $ft10, $ft15, $ft12 - fmadd.d $ft10, $ft14, $fs1, $ft10 - fadd.d $fa6, $fa6, $fa2 - fadd.d $ft6, $ft6, $fa5 - fadd.d $ft3, $ft3, $ft7 - fneg.d $ft12, $ft11 - fmul.d $ft12, $ft4, $ft12 - fmadd.d $ft12, $fa7, $ft13, $ft12 - fneg.d $ft13, $ft13 - fmul.d $ft13, $fa3, $ft13 - fmadd.d $ft13, $ft4, $ft9, $ft13 - fneg.d $ft9, $ft9 - fmul.d $ft9, $fa7, $ft9 - fmadd.d $ft9, $fa3, $ft11, $ft9 - fmul.d $ft6, $ft6, $ft13 - fmadd.d $fa6, $fa6, $ft12, $ft6 - fmadd.d $fa6, $ft3, $ft9, $fa6 - fadd.d $fa6, $ft10, $fa6 + vldx $vr5, $a6, $a1 + vldx $vr4, $t4, $a1 + vldx $vr2, $t5, $a1 + vreplvei.d $vr3, $vr5, 1 + vldx $vr6, $s1, $a1 + vpackev.d $vr7, $vr2, $vr4 + vfsub.d $vr3, $vr3, $vr7 + vpackod.d $vr2, $vr2, $vr5 + vreplvei.d $vr7, $vr6, 0 + vpackev.d $vr8, $vr4, $vr6 + vshuf4i.d $vr6, $vr0, 1 + vfsub.d $vr2, $vr2, $vr6 + vshuf4i.d $vr4, $vr5, 9 + vfsub.d $vr4, $vr4, $vr7 + vldx $vr7, $t3, $a1 + vldx $vr9, $t0, $a1 + vldx $vr10, $t2, $a1 + vfsub.d $vr8, $vr5, $vr8 + vreplvei.d $vr5, $vr7, 1 + vldx $vr11, $s2, $a1 + vpackev.d $vr6, $vr10, $vr9 + vfsub.d $vr6, $vr5, $vr6 + vpackod.d $vr5, $vr10, $vr7 + vreplvei.d $vr10, $vr11, 0 + vpackev.d $vr12, $vr9, $vr11 + vshuf4i.d $vr11, $vr0, 1 + vfsub.d $vr5, $vr5, $vr11 + vshuf4i.d $vr9, $vr7, 9 + vfsub.d $vr9, $vr9, $vr10 + vldx $vr10, $s5, $a1 + vldx $vr11, $t6, $a1 + vldx $vr13, $t7, $a1 + vfsub.d $vr7, $vr7, $vr12 + vreplvei.d $vr12, $vr10, 1 + vldx $vr14, $s3, $a1 + vpackev.d $vr15, $vr13, $vr11 + vfsub.d $vr12, $vr12, $vr15 + vpackod.d $vr13, $vr13, $vr10 + vpackev.d $vr15, $vr11, $vr14 + vreplvei.d $vr16, $vr14, 0 + vshuf4i.d $vr14, $vr0, 1 + vfsub.d $vr13, $vr13, $vr14 + vfsub.d $vr14, $vr10, $vr15 + vshuf4i.d $vr11, $vr10, 9 + vfsub.d $vr10, $vr11, $vr16 + vfadd.d $vr8, $vr8, $vr2 + vfadd.d $vr7, $vr7, $vr5 + vfadd.d $vr11, $vr14, $vr13 + vbitrevi.d $vr14, $vr9, 63 + vfmul.d $vr14, $vr12, $vr14 + vfmadd.d $vr14, $vr6, $vr10, $vr14 + vbitrevi.d $vr15, $vr10, 63 + vfmul.d $vr15, $vr3, $vr15 + vfmadd.d $vr15, $vr12, $vr4, $vr15 + vbitrevi.d $vr16, $vr4, 63 + vfmul.d $vr16, $vr6, $vr16 + vfmadd.d $vr16, $vr3, $vr9, $vr16 + vfmul.d $vr7, $vr7, $vr15 + vfmadd.d $vr7, $vr8, $vr14, $vr7 + vfmadd.d $vr7, $vr11, $vr16, $vr7 + vreplvei.d $vr8, $vr7, 0 + vreplvei.d $vr7, $vr7, 1 + fadd.d $fa7, $ft0, $fa7 + vreplvei.d $vr3, $vr3, 1 + vreplvei.d $vr4, $vr4, 0 fadd.d $fa3, $fa3, $fa4 - fadd.d $fa4, $fa7, $ft0 - fadd.d $fa7, $ft4, $ft5 - fneg.d $ft0, $fa5 - fmul.d $ft0, $ft8, $ft0 - fmadd.d $ft0, $ft2, $ft7, $ft0 - fneg.d $ft3, $ft7 - fmul.d $ft3, $ft1, $ft3 - fmadd.d $ft3, $ft8, $fa2, $ft3 + vreplvei.d $vr4, $vr9, 0 + vreplvei.d $vr6, $vr6, 1 + fadd.d $fa4, $fa6, $fa4 + vreplvei.d $vr6, $vr10, 0 + vreplvei.d $vr8, $vr12, 1 + fadd.d $fa6, $ft0, $fa6 + vreplvei.d $vr8, $vr5, 1 + fneg.d $ft1, $ft0 + vreplvei.d $vr10, $vr13, 0 + fmul.d $ft1, $ft2, $ft1 + vreplvei.d $vr11, $vr13, 1 + vreplvei.d $vr5, $vr5, 0 + fmadd.d $ft1, $fa5, $ft3, $ft1 + fneg.d $ft3, $ft3 + vreplvei.d $vr12, $vr2, 0 + fmul.d $ft3, $ft4, $ft3 + vreplvei.d $vr2, $vr2, 1 + fmadd.d $ft2, $ft2, $fa2, $ft3 fneg.d $fa2, $fa2 - fmul.d $fa2, $ft2, $fa2 - fmadd.d $fa2, $ft1, $fa5, $fa2 - fmul.d $fa4, $fa4, $ft3 - fmadd.d $fa3, $fa3, $ft0, $fa4 - fmadd.d $fa2, $fa7, $fa2, $fa3 - fadd.d $fa2, $fa2, $fa6 + fmul.d $fa2, $fa5, $fa2 + fmadd.d $fa2, $ft4, $ft0, $fa2 + fmul.d $fa4, $fa4, $ft2 + fmadd.d $fa3, $fa3, $ft1, $fa4 + fmadd.d $fa2, $fa6, $fa2, $fa3 + fadd.d $fa2, $fa2, $fa7 fmul.d $fa2, $fa2, $fa0 - fstx.d $fa2, $s0, $fp - addi.d $a3, $a3, 8 - addi.w $a7, $a7, -1 - addi.d $s0, $s0, 8 - addi.d $a0, $a0, 8 + fstx.d $fa2, $s0, $a1 + addi.d $t4, $t4, 8 + addi.d $t5, $t5, 8 addi.d $s1, $s1, 8 - addi.d $a1, $a1, 8 - addi.d $t1, $t1, 8 - addi.d $t7, $t7, 8 - addi.d $s3, $s3, 8 addi.d $a6, $a6, 8 - addi.d $s2, $s2, 8 addi.d $t0, $t0, 8 - addi.d $t5, $t5, 8 + addi.d $t2, $t2, 8 + addi.d $s2, $s2, 8 + addi.d $t3, $t3, 8 addi.d $t6, $t6, 8 - bnez $a7, .LBB2_11 + addi.d $t7, $t7, 8 + addi.d $s3, $s3, 8 + addi.d $s5, $s5, 8 + addi.w $a4, $a4, -1 + addi.d $s0, $s0, 8 + bnez $a4, .LBB2_11 b .LBB2_4 .LBB2_12: # %_ZN9benchmark5State3endEv.exit._crit_edge .Ltmp2: # EH_LABEL - ld.d $a0, $sp, 16 # 8-byte Folded Reload + ld.d $a0, $sp, 8 # 8-byte Folded Reload pcaddu18i $ra, %call36(_ZN9benchmark5State17FinishKeepRunningEv) jirl $ra, $ra, 0 .Ltmp3: # EH_LABEL diff --git a/results/MicroBenchmarks/LCALS/SubsetARawLoops/CMakeFiles/lcalsARaw.dir/RawSubsetAbenchmarks.s b/results/MicroBenchmarks/LCALS/SubsetARawLoops/CMakeFiles/lcalsARaw.dir/RawSubsetAbenchmarks.s index 70025da8..46b514d5 100644 --- a/results/MicroBenchmarks/LCALS/SubsetARawLoops/CMakeFiles/lcalsARaw.dir/RawSubsetAbenchmarks.s +++ b/results/MicroBenchmarks/LCALS/SubsetARawLoops/CMakeFiles/lcalsARaw.dir/RawSubsetAbenchmarks.s @@ -1191,14 +1191,13 @@ _ZL17BM_VOL3D_CALC_RAWRN9benchmark5StateE: # @_ZL17BM_VOL3D_CALC_RAWRN9benchmark ori $a0, $zero, 5 pcaddu18i $ra, %call36(_Z8loopInitj) jirl $ra, $ra, 0 - ld.d $s8, $s0, 8 + ld.d $s6, $s0, 8 ld.d $a0, $s2, 32 - ld.d $a1, $s0, 16 - st.d $a1, $sp, 80 # 8-byte Folded Spill + ld.d $s8, $s0, 16 ld.d $a1, $s0, 24 - st.d $a1, $sp, 128 # 8-byte Folded Spill + st.d $a1, $sp, 136 # 8-byte Folded Spill ld.d $a1, $s0, 32 - st.d $a1, $sp, 152 # 8-byte Folded Spill + st.d $a1, $sp, 80 # 8-byte Folded Spill ld.w $a1, $a0, 0 addi.d $a0, $sp, 160 ori $a2, $zero, 3 @@ -1209,158 +1208,159 @@ _ZL17BM_VOL3D_CALC_RAWRN9benchmark5StateE: # @_ZL17BM_VOL3D_CALC_RAWRN9benchmark ld.w $s1, $s2, 28 ld.d $s4, $s2, 16 .Ltmp0: # EH_LABEL - st.d $s2, $sp, 16 # 8-byte Folded Spill + st.d $s2, $sp, 8 # 8-byte Folded Spill move $a0, $s2 pcaddu18i $ra, %call36(_ZN9benchmark5State16StartKeepRunningEv) jirl $ra, $ra, 0 .Ltmp1: # EH_LABEL # %bb.1: # %_ZN9benchmark5State3endEv.exit.preheader - ld.d $t4, $sp, 80 # 8-byte Folded Reload + ld.d $ra, $sp, 80 # 8-byte Folded Reload bnez $s1, .LBB2_12 # %bb.2: # %_ZN9benchmark5State3endEv.exit.preheader beqz $s4, .LBB2_12 # %bb.3: # %.lr.ph213 - addi.d $a0, $s8, 8 - alsl.d $a1, $s0, $s8, 3 + addi.d $a0, $s6, 8 + alsl.d $a1, $s0, $s6, 3 alsl.d $a2, $s0, $a0, 3 - st.d $a2, $sp, 144 # 8-byte Folded Spill + st.d $a2, $sp, 72 # 8-byte Folded Spill + alsl.d $a2, $fp, $s6, 3 + st.d $a2, $sp, 120 # 8-byte Folded Spill + ld.w $a3, $sp, 224 alsl.d $a0, $fp, $a0, 3 - st.d $a0, $sp, 136 # 8-byte Folded Spill - ld.w $a2, $sp, 224 - alsl.d $s7, $fp, $a1, 3 - ld.w $a4, $sp, 228 - addi.d $a6, $t4, 8 - slli.d $a5, $a2, 3 - ld.d $a0, $sp, 152 # 8-byte Folded Reload - alsl.d $a3, $a2, $a0, 3 - sub.d $a0, $a4, $a2 - bstrpick.d $a0, $a0, 31, 0 - alsl.d $a1, $a0, $a3, 3 - addi.d $a1, $a1, 8 - slli.d $a7, $fp, 3 - slli.d $t5, $s0, 3 - alsl.d $a7, $a2, $a7, 3 - alsl.d $t1, $s0, $a7, 3 - add.d $t6, $s8, $t1 - alsl.d $t0, $a0, $t1, 3 - addi.d $t2, $t0, 16 + st.d $a0, $sp, 64 # 8-byte Folded Spill + ld.w $a5, $sp, 228 + st.d $a1, $sp, 128 # 8-byte Folded Spill + alsl.d $a0, $fp, $a1, 3 + st.d $a0, $sp, 152 # 8-byte Folded Spill + slli.d $a7, $a3, 3 + alsl.d $a6, $a3, $ra, 3 + sub.d $a0, $a5, $a3 + bstrpick.d $a1, $a0, 31, 0 + alsl.d $a0, $a1, $a6, 3 + addi.d $a4, $a0, 8 + slli.d $a0, $fp, 3 + slli.d $t3, $s0, 3 + alsl.d $t0, $a3, $a0, 3 + alsl.d $a0, $s0, $t0, 3 + add.d $t4, $s6, $a0 + alsl.d $t1, $a1, $a0, 3 + addi.d $t2, $t1, 16 + add.d $t5, $s6, $t2 + add.d $t6, $s6, $t0 + alsl.d $t1, $a1, $t0, 3 + addi.d $t1, $t1, 16 + add.d $t7, $s6, $t1 + alsl.d $t8, $a3, $s6, 3 + sltu $t5, $a6, $t5 + sltu $t4, $t4, $a4 + and $t5, $t5, $t4 + alsl.d $t4, $a1, $a7, 3 + addi.d $t4, $t4, 16 + sltu $t7, $a6, $t7 + sltu $t6, $t6, $a4 + and $t6, $t7, $t6 + add.d $t7, $s6, $t4 + alsl.d $t3, $a3, $t3, 3 + or $t6, $t5, $t6 + add.d $s1, $s6, $t3 + sltu $t5, $a6, $t7 + sltu $t7, $t8, $a4 + and $t7, $t5, $t7 + alsl.d $t5, $a1, $t3, 3 + addi.d $t5, $t5, 16 + or $t6, $t6, $t7 + add.d $t7, $s6, $t5 + sltu $t7, $a6, $t7 + sltu $t8, $s1, $a4 + and $t7, $t7, $t8 + add.d $t8, $s8, $a0 + or $t6, $t6, $t7 add.d $t7, $s8, $t2 - add.d $t8, $s8, $a7 - alsl.d $t0, $a0, $a7, 3 - addi.d $t0, $t0, 16 - add.d $s1, $s8, $t0 - alsl.d $s2, $a2, $s8, 3 - sltu $t7, $a3, $t7 - sltu $t6, $t6, $a1 - and $t7, $t7, $t6 - alsl.d $t6, $a0, $a5, 3 - addi.d $t6, $t6, 16 - sltu $s1, $a3, $s1 - sltu $t8, $t8, $a1 - and $t8, $s1, $t8 - add.d $s1, $s8, $t6 - alsl.d $t5, $a2, $t5, 3 - or $t8, $t7, $t8 - add.d $s3, $s8, $t5 - sltu $t7, $a3, $s1 - sltu $s1, $s2, $a1 - and $s1, $t7, $s1 - alsl.d $t7, $a0, $t5, 3 - addi.d $t7, $t7, 16 - or $t8, $t8, $s1 - add.d $s1, $s8, $t7 - sltu $s1, $a3, $s1 - sltu $s2, $s3, $a1 - and $s1, $s1, $s2 - add.d $s2, $t4, $t1 - or $t8, $t8, $s1 - add.d $s1, $t4, $t2 - sltu $s1, $a3, $s1 - sltu $s2, $s2, $a1 - and $s1, $s1, $s2 - add.d $s2, $t4, $a7 - or $t8, $t8, $s1 - add.d $s1, $t4, $t0 - sltu $s1, $a3, $s1 - sltu $s2, $s2, $a1 - and $s1, $s1, $s2 - alsl.d $s2, $a2, $t4, 3 - or $t8, $t8, $s1 - add.d $s1, $t4, $t6 - sltu $s1, $a3, $s1 - sltu $s2, $s2, $a1 - and $s1, $s1, $s2 - add.d $s2, $t4, $t5 - or $s1, $t8, $s1 - add.d $t8, $t4, $t7 - sltu $t8, $a3, $t8 - sltu $s2, $s2, $a1 - and $s2, $t8, $s2 - alsl.d $t8, $s0, $a6, 3 - or $s1, $s1, $s2 - alsl.d $s2, $fp, $t4, 3 - alsl.d $a6, $fp, $a6, 3 - st.d $a6, $sp, 120 # 8-byte Folded Spill - ld.d $t3, $sp, 128 # 8-byte Folded Reload - add.d $a6, $t3, $t1 - add.d $t1, $t3, $t2 - sltu $t1, $a3, $t1 - sltu $a6, $a6, $a1 - and $a6, $t1, $a6 - st.d $s2, $sp, 72 # 8-byte Folded Spill - alsl.d $t1, $s0, $s2, 3 - st.d $t1, $sp, 112 # 8-byte Folded Spill - or $a6, $s1, $a6 - alsl.d $t1, $fp, $t8, 3 - st.d $t1, $sp, 64 # 8-byte Folded Spill - add.d $a7, $t3, $a7 - add.d $t0, $t3, $t0 - sltu $t0, $a3, $t0 - sltu $a7, $a7, $a1 - and $a7, $t0, $a7 - addi.d $t1, $t3, 8 - or $a6, $a6, $a7 - alsl.d $a7, $a2, $t3, 3 - add.d $t0, $t3, $t6 - sltu $t0, $a3, $t0 - sltu $a7, $a7, $a1 - and $a7, $t0, $a7 - alsl.d $t0, $s0, $t3, 3 - st.d $t1, $sp, 56 # 8-byte Folded Spill - alsl.d $t1, $s0, $t1, 3 - or $a6, $a6, $a7 - alsl.d $t2, $fp, $t3, 3 - add.d $a7, $t3, $t7 - sltu $a3, $a3, $a7 - st.d $t0, $sp, 48 # 8-byte Folded Spill - alsl.d $a7, $fp, $t0, 3 - st.d $a7, $sp, 96 # 8-byte Folded Spill - st.d $t1, $sp, 104 # 8-byte Folded Spill - alsl.d $a7, $fp, $t1, 3 - st.d $a7, $sp, 40 # 8-byte Folded Spill - add.d $a7, $t3, $t5 - addi.d $t0, $a0, 1 - sltu $a0, $a7, $a1 - and $a0, $a3, $a0 - or $a0, $a6, $a0 - st.d $t0, $sp, 32 # 8-byte Folded Spill - bstrpick.d $a1, $t0, 32, 1 - slli.d $s6, $a1, 1 - alsl.d $a1, $a1, $a2, 1 - st.d $a1, $sp, 24 # 8-byte Folded Spill - xor $a1, $a4, $a2 - sltui $a1, $a1, 1 - or $a0, $a1, $a0 - pcalau12i $a1, %pc_hi20(.LCPI2_0) - fld.d $fa0, $a1, %pc_lo12(.LCPI2_0) - lu12i.w $a1, 349525 - ori $a1, $a1, 1365 - lu32i.d $a1, 349525 - lu52i.d $a1, $a1, 1019 - vreplgr2vr.d $vr1, $a1 - addi.d $a1, $a4, 1 - st.d $a1, $sp, 88 # 8-byte Folded Spill - andi $ra, $a0, 1 + sltu $t7, $a6, $t7 + sltu $t8, $t8, $a4 + and $t7, $t7, $t8 + add.d $t8, $s8, $t0 + or $t6, $t6, $t7 + add.d $t7, $s8, $t1 + sltu $t7, $a6, $t7 + sltu $t8, $t8, $a4 + and $t7, $t7, $t8 + alsl.d $t8, $a3, $s8, 3 + or $t6, $t6, $t7 + add.d $t7, $s8, $t4 + sltu $t7, $a6, $t7 + sltu $t8, $t8, $a4 + and $t7, $t7, $t8 + add.d $t8, $s8, $t3 + or $t6, $t6, $t7 + add.d $t7, $s8, $t5 + sltu $t7, $a6, $t7 + sltu $t8, $t8, $a4 + and $t7, $t7, $t8 + alsl.d $t8, $s0, $s8, 3 + or $t6, $t6, $t7 + addi.d $t7, $t8, 8 + ld.d $a2, $sp, 136 # 8-byte Folded Reload + add.d $a0, $a2, $a0 + add.d $t2, $a2, $t2 + sltu $t2, $a6, $t2 + sltu $a0, $a0, $a4 + and $t2, $t2, $a0 + alsl.d $a0, $fp, $s8, 3 + or $t2, $t6, $t2 + st.d $t8, $sp, 112 # 8-byte Folded Spill + alsl.d $t6, $fp, $t8, 3 + st.d $t6, $sp, 104 # 8-byte Folded Spill + add.d $t0, $a2, $t0 + add.d $t1, $a2, $t1 + sltu $t1, $a6, $t1 + sltu $t0, $t0, $a4 + and $t0, $t1, $t0 + st.d $t7, $sp, 56 # 8-byte Folded Spill + alsl.d $t1, $fp, $t7, 3 + st.d $t1, $sp, 48 # 8-byte Folded Spill + or $t0, $t2, $t0 + alsl.d $t1, $a3, $a2, 3 + add.d $t2, $a2, $t4 + sltu $t2, $a6, $t2 + sltu $t1, $t1, $a4 + and $t1, $t2, $t1 + addi.d $t2, $a2, 8 + or $t0, $t0, $t1 + alsl.d $t8, $s0, $a2, 3 + st.d $t2, $sp, 40 # 8-byte Folded Spill + alsl.d $t2, $s0, $t2, 3 + add.d $t1, $a2, $t5 + sltu $a6, $a6, $t1 + alsl.d $t1, $fp, $a2, 3 + alsl.d $t2, $fp, $t2, 3 + st.d $t2, $sp, 32 # 8-byte Folded Spill + alsl.d $t2, $fp, $t8, 3 + st.d $t2, $sp, 96 # 8-byte Folded Spill + add.d $t2, $a2, $t3 + addi.d $a2, $a1, 1 + sltu $a1, $t2, $a4 + and $a1, $a6, $a1 + or $a1, $t0, $a1 + st.d $a2, $sp, 24 # 8-byte Folded Spill + bstrpick.d $a4, $a2, 32, 1 + slli.d $a2, $a4, 1 + st.d $a2, $sp, 144 # 8-byte Folded Spill + alsl.d $a2, $a4, $a3, 1 + st.d $a2, $sp, 16 # 8-byte Folded Spill + xor $a4, $a5, $a3 + sltui $a4, $a4, 1 + or $a1, $a4, $a1 + pcalau12i $a4, %pc_hi20(.LCPI2_0) + fld.d $fa0, $a4, %pc_lo12(.LCPI2_0) + lu12i.w $a4, 349525 + ori $a4, $a4, 1365 + lu32i.d $a4, 349525 + lu52i.d $a4, $a4, 1019 + vreplgr2vr.d $vr1, $a4 + addi.d $a2, $a5, 1 + st.d $a2, $sp, 88 # 8-byte Folded Spill + andi $fp, $a1, 1 b .LBB2_5 .p2align 4, , 16 .LBB2_4: # %_ZN9benchmark5State3endEv.exit @@ -1370,82 +1370,83 @@ _ZL17BM_VOL3D_CALC_RAWRN9benchmark5StateE: # @_ZL17BM_VOL3D_CALC_RAWRN9benchmark .LBB2_5: # =>This Loop Header: Depth=1 # Child Loop BB2_8 Depth 2 # Child Loop BB2_11 Depth 2 - blt $a4, $a2, .LBB2_4 + blt $a5, $a3, .LBB2_4 # %bb.6: # %.lr.ph.preheader # in Loop: Header=BB2_5 Depth=1 - move $a0, $a2 - bnez $ra, .LBB2_10 + move $a4, $a3 + bnez $fp, .LBB2_10 # %bb.7: # %vector.body.preheader # in Loop: Header=BB2_5 Depth=1 - move $a7, $t8 - move $t0, $t4 - ld.d $a6, $sp, 72 # 8-byte Folded Reload - ld.d $t5, $sp, 64 # 8-byte Folded Reload - ld.d $t6, $sp, 40 # 8-byte Folded Reload - ld.d $t1, $sp, 56 # 8-byte Folded Reload - ld.d $a1, $sp, 48 # 8-byte Folded Reload - move $t7, $t2 - move $a0, $s7 - ld.d $fp, $sp, 136 # 8-byte Folded Reload - move $t4, $s8 - move $s0, $s8 - ld.d $a3, $sp, 144 # 8-byte Folded Reload - ld.d $s1, $sp, 152 # 8-byte Folded Reload - move $s2, $s6 + ld.d $a1, $sp, 56 # 8-byte Folded Reload + move $s7, $s8 + move $a4, $s8 + move $a6, $a0 + ld.d $t0, $sp, 48 # 8-byte Folded Reload + ld.d $t2, $sp, 32 # 8-byte Folded Reload + ld.d $t3, $sp, 40 # 8-byte Folded Reload + move $t4, $t8 + move $t5, $t1 + ld.d $t6, $sp, 152 # 8-byte Folded Reload + ld.d $t7, $sp, 64 # 8-byte Folded Reload + move $a2, $s6 + move $s0, $s6 + ld.d $s1, $sp, 72 # 8-byte Folded Reload + move $s2, $ra + ld.d $s3, $sp, 144 # 8-byte Folded Reload .p2align 4, , 16 .LBB2_8: # %vector.body # Parent Loop BB2_5 Depth=1 # => This Inner Loop Header: Depth=2 - add.d $s3, $a0, $a5 - vld $vr3, $s3, 8 - add.d $s3, $s0, $a5 - vld $vr2, $s3, 8 + add.d $s5, $t6, $a7 + vld $vr3, $s5, 8 + add.d $s5, $s0, $a7 + vld $vr2, $s5, 8 vfsub.d $vr2, $vr3, $vr2 - add.d $s3, $a3, $a5 - vld $vr4, $s3, -8 - add.d $s3, $fp, $a5 - vld $vr5, $s3, -8 - vldx $vr6, $a3, $a5 - vldx $vr8, $s0, $a5 - vldx $vr9, $fp, $a5 + add.d $s5, $s1, $a7 + vld $vr4, $s5, -8 + add.d $s5, $t7, $a7 + vld $vr5, $s5, -8 + vldx $vr6, $s1, $a7 + vldx $vr8, $s0, $a7 + vldx $vr9, $t7, $a7 vfsub.d $vr7, $vr3, $vr4 vfsub.d $vr4, $vr3, $vr5 vfsub.d $vr5, $vr6, $vr8 vfsub.d $vr3, $vr9, $vr8 - vldx $vr6, $a0, $a5 - vldx $vr10, $t5, $a5 - add.d $s3, $t0, $a5 - vld $vr11, $s3, 8 - add.d $s3, $a7, $a5 - vld $vr12, $s3, -8 + vldx $vr6, $t6, $a7 + vldx $vr10, $t0, $a7 + add.d $s5, $a4, $a7 + vld $vr11, $s5, 8 + add.d $s5, $a1, $a7 + vld $vr12, $s5, -8 vfsub.d $vr9, $vr6, $vr8 - add.d $s3, $t5, $a5 + add.d $s5, $t0, $a7 vfsub.d $vr6, $vr10, $vr11 vfsub.d $vr11, $vr10, $vr12 - add.d $s8, $a6, $a5 - vldx $vr8, $a6, $a5 - vldx $vr12, $a7, $a5 - vldx $vr13, $t0, $a5 - vld $vr14, $s8, 8 - vld $vr15, $s3, -8 + add.d $ra, $a6, $a7 + vldx $vr8, $a6, $a7 + vldx $vr12, $a1, $a7 + vldx $vr13, $a4, $a7 + vld $vr14, $ra, 8 + vld $vr15, $s5, -8 vfsub.d $vr10, $vr10, $vr8 vfsub.d $vr12, $vr12, $vr13 vfsub.d $vr8, $vr14, $vr13 vfsub.d $vr13, $vr15, $vr13 - vldx $vr14, $t6, $a5 - vldx $vr15, $t1, $a5 - add.d $s3, $t6, $a5 - vldx $vr16, $a1, $a5 - add.d $s8, $t1, $a5 + vldx $vr14, $t2, $a7 + vldx $vr15, $t3, $a7 + add.d $s5, $t2, $a7 + vldx $vr16, $t4, $a7 + add.d $ra, $t3, $a7 vfsub.d $vr15, $vr14, $vr15 - add.d $s5, $a1, $a5 + add.d $s6, $t4, $a7 vfsub.d $vr16, $vr14, $vr16 - add.d $t3, $t7, $a5 - vldx $vr17, $t7, $a5 - vld $vr18, $s5, 8 - vld $vr19, $s8, -8 - vld $vr20, $t3, 8 - vld $vr21, $s3, -8 + add.d $s8, $t5, $a7 + vldx $vr17, $t5, $a7 + vld $vr18, $s6, 8 + vld $vr19, $ra, -8 + vld $vr20, $s8, 8 + vld $vr21, $s5, -8 vfsub.d $vr14, $vr14, $vr17 vfsub.d $vr17, $vr18, $vr19 vfsub.d $vr18, $vr20, $vr19 @@ -1498,173 +1499,165 @@ _ZL17BM_VOL3D_CALC_RAWRN9benchmark5StateE: # @_ZL17BM_VOL3D_CALC_RAWRN9benchmark vfmadd.d $vr2, $vr9, $vr2, $vr3 vfadd.d $vr2, $vr2, $vr7 vfmul.d $vr2, $vr2, $vr1 - vstx $vr2, $s1, $a5 - addi.d $s2, $s2, -2 + vstx $vr2, $s2, $a7 + addi.d $s3, $s3, -2 + addi.d $s2, $s2, 16 addi.d $s1, $s1, 16 - addi.d $a3, $a3, 16 addi.d $s0, $s0, 16 - addi.d $fp, $fp, 16 - addi.d $a0, $a0, 16 addi.d $t7, $t7, 16 - addi.d $a1, $a1, 16 - addi.d $t1, $t1, 16 addi.d $t6, $t6, 16 addi.d $t5, $t5, 16 - addi.d $a6, $a6, 16 + addi.d $t4, $t4, 16 + addi.d $t3, $t3, 16 + addi.d $t2, $t2, 16 addi.d $t0, $t0, 16 - addi.d $a7, $a7, 16 - bnez $s2, .LBB2_8 + addi.d $a6, $a6, 16 + addi.d $a4, $a4, 16 + addi.d $a1, $a1, 16 + bnez $s3, .LBB2_8 # %bb.9: # %middle.block # in Loop: Header=BB2_5 Depth=1 - ld.d $a0, $sp, 24 # 8-byte Folded Reload - move $s8, $t4 - ld.d $t4, $sp, 80 # 8-byte Folded Reload - ld.d $a1, $sp, 32 # 8-byte Folded Reload - beq $a1, $s6, .LBB2_4 + ld.d $a4, $sp, 16 # 8-byte Folded Reload + move $s6, $a2 + move $s8, $s7 + ld.d $ra, $sp, 80 # 8-byte Folded Reload + ld.d $a1, $sp, 24 # 8-byte Folded Reload + ld.d $a2, $sp, 144 # 8-byte Folded Reload + beq $a1, $a2, .LBB2_4 .LBB2_10: # %.lr.ph.preheader309 # in Loop: Header=BB2_5 Depth=1 - slli.d $fp, $a0, 3 - ld.d $a1, $sp, 88 # 8-byte Folded Reload - sub.d $a7, $a1, $a0 - move $t6, $t2 - ld.d $t5, $sp, 104 # 8-byte Folded Reload - ld.d $t0, $sp, 120 # 8-byte Folded Reload - move $s2, $t4 - move $a6, $t8 - ld.d $s3, $sp, 128 # 8-byte Folded Reload - ld.d $t7, $sp, 96 # 8-byte Folded Reload - ld.d $t1, $sp, 112 # 8-byte Folded Reload - ld.d $a1, $sp, 136 # 8-byte Folded Reload - move $s1, $s8 - ld.d $a0, $sp, 144 # 8-byte Folded Reload - ld.d $s0, $sp, 152 # 8-byte Folded Reload - move $a3, $s7 + slli.d $a1, $a4, 3 + ld.d $a2, $sp, 88 # 8-byte Folded Reload + sub.d $a4, $a2, $a4 + move $s0, $ra + ld.d $s5, $sp, 96 # 8-byte Folded Reload + ld.d $s3, $sp, 136 # 8-byte Folded Reload + move $t7, $t1 + move $t6, $t8 + ld.d $t3, $sp, 104 # 8-byte Folded Reload + move $s2, $s8 + move $t2, $a0 + ld.d $t0, $sp, 112 # 8-byte Folded Reload + ld.d $a6, $sp, 152 # 8-byte Folded Reload + move $s1, $s6 + ld.d $t5, $sp, 120 # 8-byte Folded Reload + ld.d $t4, $sp, 128 # 8-byte Folded Reload .p2align 4, , 16 .LBB2_11: # %.lr.ph # Parent Loop BB2_5 Depth=1 # => This Inner Loop Header: Depth=2 - add.d $t3, $a3, $fp - add.d $s5, $a0, $fp - fld.d $fa2, $s5, -8 - add.d $s5, $a1, $fp - fld.d $fa3, $s5, -8 - fldx.d $fa4, $a0, $fp - fld.d $fa5, $t3, 8 - fldx.d $fa7, $a1, $fp - vldx $vr8, $a3, $fp - fldx.d $ft1, $s1, $fp - fsub.d $fa6, $fa5, $fa2 - vldx $vr10, $s1, $fp - fsub.d $fa3, $fa5, $fa3 - fsub.d $fa4, $fa4, $ft1 - fsub.d $fa2, $fa7, $ft1 - vfsub.d $vr9, $vr8, $vr10 - add.d $t3, $t1, $fp - add.d $s5, $a6, $fp - fld.d $fa5, $s5, -8 - add.d $s5, $t0, $fp - fld.d $fa7, $s5, -8 - fldx.d $ft0, $a6, $fp - fld.d $ft2, $t3, 8 - fldx.d $ft3, $t0, $fp - vldx $vr12, $t1, $fp - fldx.d $ft5, $s2, $fp - fsub.d $ft6, $ft2, $fa5 - vldx $vr15, $s2, $fp - fsub.d $fa7, $ft2, $fa7 - fsub.d $ft0, $ft0, $ft5 - fsub.d $fa5, $ft3, $ft5 - vfsub.d $vr10, $vr12, $vr15 - add.d $t3, $t7, $fp - add.d $s5, $t5, $fp - fld.d $ft3, $s5, -8 - add.d $s5, $t6, $fp - fldx.d $ft4, $t6, $fp - fldx.d $ft5, $t5, $fp - fld.d $ft7, $t3, 8 - fld.d $ft8, $s5, 8 - vldx $vr17, $t7, $fp - fldx.d $ft10, $s3, $fp - fsub.d $ft3, $ft7, $ft3 - vldx $vr19, $s3, $fp - fsub.d $ft4, $ft7, $ft4 - fsub.d $ft5, $ft5, $ft10 - fsub.d $ft7, $ft8, $ft10 - vfsub.d $vr16, $vr17, $vr19 - vreplvei.d $vr17, $vr9, 0 - vreplvei.d $vr9, $vr9, 1 - fadd.d $ft10, $ft1, $ft9 - vreplvei.d $vr19, $vr10, 0 - vreplvei.d $vr10, $vr10, 1 - fadd.d $ft12, $ft2, $ft11 - vreplvei.d $vr21, $vr16, 0 - vreplvei.d $vr16, $vr16, 1 - fadd.d $ft14, $ft8, $ft13 - fneg.d $ft15, $ft0 - fmul.d $ft15, $ft3, $ft15 - fmadd.d $ft15, $ft6, $ft5, $ft15 - fneg.d $fs0, $ft5 - fmul.d $fs0, $fa6, $fs0 - fmadd.d $fs0, $ft3, $fa4, $fs0 - fneg.d $fs1, $fa4 - fmul.d $fs1, $ft6, $fs1 - fmadd.d $fs1, $fa6, $ft0, $fs1 - fmul.d $ft12, $ft12, $fs0 - fmadd.d $ft10, $ft10, $ft15, $ft12 - fmadd.d $ft10, $ft14, $fs1, $ft10 - fadd.d $fa6, $fa6, $fa2 - fadd.d $ft6, $ft6, $fa5 - fadd.d $ft3, $ft3, $ft7 - fneg.d $ft12, $ft11 - fmul.d $ft12, $ft4, $ft12 - fmadd.d $ft12, $fa7, $ft13, $ft12 - fneg.d $ft13, $ft13 - fmul.d $ft13, $fa3, $ft13 - fmadd.d $ft13, $ft4, $ft9, $ft13 - fneg.d $ft9, $ft9 - fmul.d $ft9, $fa7, $ft9 - fmadd.d $ft9, $fa3, $ft11, $ft9 - fmul.d $ft6, $ft6, $ft13 - fmadd.d $fa6, $fa6, $ft12, $ft6 - fmadd.d $fa6, $ft3, $ft9, $fa6 - fadd.d $fa6, $ft10, $fa6 + vldx $vr5, $a6, $a1 + vldx $vr4, $t4, $a1 + vldx $vr2, $t5, $a1 + vreplvei.d $vr3, $vr5, 1 + vldx $vr6, $s1, $a1 + vpackev.d $vr7, $vr2, $vr4 + vfsub.d $vr3, $vr3, $vr7 + vpackod.d $vr2, $vr2, $vr5 + vreplvei.d $vr7, $vr6, 0 + vpackev.d $vr8, $vr4, $vr6 + vshuf4i.d $vr6, $vr0, 1 + vfsub.d $vr2, $vr2, $vr6 + vshuf4i.d $vr4, $vr5, 9 + vfsub.d $vr4, $vr4, $vr7 + vldx $vr7, $t3, $a1 + vldx $vr9, $t0, $a1 + vldx $vr10, $t2, $a1 + vfsub.d $vr8, $vr5, $vr8 + vreplvei.d $vr5, $vr7, 1 + vldx $vr11, $s2, $a1 + vpackev.d $vr6, $vr10, $vr9 + vfsub.d $vr6, $vr5, $vr6 + vpackod.d $vr5, $vr10, $vr7 + vreplvei.d $vr10, $vr11, 0 + vpackev.d $vr12, $vr9, $vr11 + vshuf4i.d $vr11, $vr0, 1 + vfsub.d $vr5, $vr5, $vr11 + vshuf4i.d $vr9, $vr7, 9 + vfsub.d $vr9, $vr9, $vr10 + vldx $vr10, $s5, $a1 + vldx $vr11, $t6, $a1 + vldx $vr13, $t7, $a1 + vfsub.d $vr7, $vr7, $vr12 + vreplvei.d $vr12, $vr10, 1 + vldx $vr14, $s3, $a1 + vpackev.d $vr15, $vr13, $vr11 + vfsub.d $vr12, $vr12, $vr15 + vpackod.d $vr13, $vr13, $vr10 + vpackev.d $vr15, $vr11, $vr14 + vreplvei.d $vr16, $vr14, 0 + vshuf4i.d $vr14, $vr0, 1 + vfsub.d $vr13, $vr13, $vr14 + vfsub.d $vr14, $vr10, $vr15 + vshuf4i.d $vr11, $vr10, 9 + vfsub.d $vr10, $vr11, $vr16 + vfadd.d $vr8, $vr8, $vr2 + vfadd.d $vr7, $vr7, $vr5 + vfadd.d $vr11, $vr14, $vr13 + vbitrevi.d $vr14, $vr9, 63 + vfmul.d $vr14, $vr12, $vr14 + vfmadd.d $vr14, $vr6, $vr10, $vr14 + vbitrevi.d $vr15, $vr10, 63 + vfmul.d $vr15, $vr3, $vr15 + vfmadd.d $vr15, $vr12, $vr4, $vr15 + vbitrevi.d $vr16, $vr4, 63 + vfmul.d $vr16, $vr6, $vr16 + vfmadd.d $vr16, $vr3, $vr9, $vr16 + vfmul.d $vr7, $vr7, $vr15 + vfmadd.d $vr7, $vr8, $vr14, $vr7 + vfmadd.d $vr7, $vr11, $vr16, $vr7 + vreplvei.d $vr8, $vr7, 0 + vreplvei.d $vr7, $vr7, 1 + fadd.d $fa7, $ft0, $fa7 + vreplvei.d $vr3, $vr3, 1 + vreplvei.d $vr4, $vr4, 0 fadd.d $fa3, $fa3, $fa4 - fadd.d $fa4, $fa7, $ft0 - fadd.d $fa7, $ft4, $ft5 - fneg.d $ft0, $fa5 - fmul.d $ft0, $ft8, $ft0 - fmadd.d $ft0, $ft2, $ft7, $ft0 - fneg.d $ft3, $ft7 - fmul.d $ft3, $ft1, $ft3 - fmadd.d $ft3, $ft8, $fa2, $ft3 + vreplvei.d $vr4, $vr9, 0 + vreplvei.d $vr6, $vr6, 1 + fadd.d $fa4, $fa6, $fa4 + vreplvei.d $vr6, $vr10, 0 + vreplvei.d $vr8, $vr12, 1 + fadd.d $fa6, $ft0, $fa6 + vreplvei.d $vr8, $vr5, 1 + fneg.d $ft1, $ft0 + vreplvei.d $vr10, $vr13, 0 + fmul.d $ft1, $ft2, $ft1 + vreplvei.d $vr11, $vr13, 1 + vreplvei.d $vr5, $vr5, 0 + fmadd.d $ft1, $fa5, $ft3, $ft1 + fneg.d $ft3, $ft3 + vreplvei.d $vr12, $vr2, 0 + fmul.d $ft3, $ft4, $ft3 + vreplvei.d $vr2, $vr2, 1 + fmadd.d $ft2, $ft2, $fa2, $ft3 fneg.d $fa2, $fa2 - fmul.d $fa2, $ft2, $fa2 - fmadd.d $fa2, $ft1, $fa5, $fa2 - fmul.d $fa4, $fa4, $ft3 - fmadd.d $fa3, $fa3, $ft0, $fa4 - fmadd.d $fa2, $fa7, $fa2, $fa3 - fadd.d $fa2, $fa2, $fa6 + fmul.d $fa2, $fa5, $fa2 + fmadd.d $fa2, $ft4, $ft0, $fa2 + fmul.d $fa4, $fa4, $ft2 + fmadd.d $fa3, $fa3, $ft1, $fa4 + fmadd.d $fa2, $fa6, $fa2, $fa3 + fadd.d $fa2, $fa2, $fa7 fmul.d $fa2, $fa2, $fa0 - fstx.d $fa2, $s0, $fp - addi.d $a3, $a3, 8 - addi.w $a7, $a7, -1 - addi.d $s0, $s0, 8 - addi.d $a0, $a0, 8 + fstx.d $fa2, $s0, $a1 + addi.d $t4, $t4, 8 + addi.d $t5, $t5, 8 addi.d $s1, $s1, 8 - addi.d $a1, $a1, 8 - addi.d $t1, $t1, 8 - addi.d $t7, $t7, 8 - addi.d $s3, $s3, 8 addi.d $a6, $a6, 8 - addi.d $s2, $s2, 8 addi.d $t0, $t0, 8 - addi.d $t5, $t5, 8 + addi.d $t2, $t2, 8 + addi.d $s2, $s2, 8 + addi.d $t3, $t3, 8 addi.d $t6, $t6, 8 - bnez $a7, .LBB2_11 + addi.d $t7, $t7, 8 + addi.d $s3, $s3, 8 + addi.d $s5, $s5, 8 + addi.w $a4, $a4, -1 + addi.d $s0, $s0, 8 + bnez $a4, .LBB2_11 b .LBB2_4 .LBB2_12: # %_ZN9benchmark5State3endEv.exit._crit_edge .Ltmp2: # EH_LABEL - ld.d $a0, $sp, 16 # 8-byte Folded Reload + ld.d $a0, $sp, 8 # 8-byte Folded Reload pcaddu18i $ra, %call36(_ZN9benchmark5State17FinishKeepRunningEv) jirl $ra, $ra, 0 .Ltmp3: # EH_LABEL diff --git a/results/MicroBenchmarks/LoopVectorization/CMakeFiles/LoopInterleavingBenchmarks.dir/LoopInterleaving.s b/results/MicroBenchmarks/LoopVectorization/CMakeFiles/LoopInterleavingBenchmarks.dir/LoopInterleaving.s index c69eef6b..3ace1727 100644 --- a/results/MicroBenchmarks/LoopVectorization/CMakeFiles/LoopInterleavingBenchmarks.dir/LoopInterleaving.s +++ b/results/MicroBenchmarks/LoopVectorization/CMakeFiles/LoopInterleavingBenchmarks.dir/LoopInterleaving.s @@ -1608,15 +1608,15 @@ _ZL27loopWithReductionWithVW1IC4i: # @_ZL27loopWithReductionWithVW1IC4i addi.d $a1, $a1, 16 bnez $a3, .LBB21_5 # %bb.6: # %middle.block - vpickve2gr.w $a1, $vr0, 0 - vpickve2gr.w $a3, $vr0, 1 - add.d $a1, $a3, $a1 - vpickve2gr.w $a3, $vr0, 2 - add.d $a1, $a3, $a1 + vreplvei.w $vr1, $vr0, 1 + vadd.w $vr1, $vr1, $vr0 + vreplvei.w $vr2, $vr0, 2 + vadd.w $vr1, $vr2, $vr1 + vpickve2gr.w $a1, $vr1, 0 vpickve2gr.w $a3, $vr0, 3 add.w $a1, $a3, $a1 beq $a2, $a0, .LBB21_9 -.LBB21_7: # %.lr.ph.preheader14 +.LBB21_7: # %.lr.ph.preheader18 pcalau12i $a3, %pc_hi20(A) addi.d $a3, $a3, %pc_lo12(A) alsl.d $a3, $a2, $a3, 2 @@ -3709,15 +3709,15 @@ _ZL30bigLoopWithReductionWithVW1IC4i: # @_ZL30bigLoopWithReductionWithVW1IC4i addi.d $a1, $a1, 16 bnez $a7, .LBB42_5 # %bb.6: # %middle.block - vpickve2gr.w $a1, $vr0, 0 - vpickve2gr.w $a4, $vr0, 1 - add.d $a1, $a4, $a1 - vpickve2gr.w $a4, $vr0, 2 - add.d $a1, $a4, $a1 + vreplvei.w $vr1, $vr0, 1 + vadd.w $vr1, $vr1, $vr0 + vreplvei.w $vr2, $vr0, 2 + vadd.w $vr1, $vr2, $vr1 + vpickve2gr.w $a1, $vr1, 0 vpickve2gr.w $a4, $vr0, 3 add.w $a1, $a4, $a1 beq $a3, $a0, .LBB42_9 -.LBB42_7: # %.lr.ph.preheader17 +.LBB42_7: # %.lr.ph.preheader21 pcalau12i $a4, %pc_hi20(A) addi.d $a4, $a4, %pc_lo12(A) alsl.d $a4, $a3, $a4, 2 diff --git a/results/MicroBenchmarks/SLPVectorization/CMakeFiles/SLPVectorizationBenchmarks.dir/Versioning.s b/results/MicroBenchmarks/SLPVectorization/CMakeFiles/SLPVectorizationBenchmarks.dir/Versioning.s index 32a067fa..70043bcc 100644 --- a/results/MicroBenchmarks/SLPVectorization/CMakeFiles/SLPVectorizationBenchmarks.dir/Versioning.s +++ b/results/MicroBenchmarks/SLPVectorization/CMakeFiles/SLPVectorizationBenchmarks.dir/Versioning.s @@ -1042,14 +1042,11 @@ _Z54benchmark_multiply_accumulate_no_runtime_checks_neededILj2EdEvRN9benchmark5S .p2align 4, , 16 .LBB14_3: # %.lr.ph # =>This Inner Loop Header: Depth=1 - fld.d $fa0, $sp, 32 - fld.d $fa1, $sp, 16 - fld.d $fa2, $sp, 24 - fld.d $fa3, $sp, 40 - fmadd.d $fa1, $fa0, $fa1, $fa0 - fst.d $fa1, $sp, 32 - fmadd.d $fa0, $fa0, $fa2, $fa3 - fst.d $fa0, $sp, 40 + vld $vr0, $sp, 32 + vld $vr1, $sp, 16 + vreplvei.d $vr2, $vr0, 0 + vfmadd.d $vr0, $vr2, $vr1, $vr0 + vst $vr0, $sp, 32 #APP #NO_APP #APP @@ -1209,18 +1206,18 @@ _Z49benchmark_multiply_accumulate_runtime_checks_failILj3EdEvRN9benchmark5StateE _Z54benchmark_multiply_accumulate_no_runtime_checks_neededILj3EdEvRN9benchmark5StateE: # @_Z54benchmark_multiply_accumulate_no_runtime_checks_neededILj3EdEvRN9benchmark5StateE .cfi_startproc # %bb.0: # %_ZN9benchmark5State13StateIteratorC2EPS0_.exit - addi.d $sp, $sp, -96 - .cfi_def_cfa_offset 96 - st.d $ra, $sp, 88 # 8-byte Folded Spill - st.d $fp, $sp, 80 # 8-byte Folded Spill - st.d $s0, $sp, 72 # 8-byte Folded Spill - st.d $s1, $sp, 64 # 8-byte Folded Spill + addi.d $sp, $sp, -112 + .cfi_def_cfa_offset 112 + st.d $ra, $sp, 104 # 8-byte Folded Spill + st.d $fp, $sp, 96 # 8-byte Folded Spill + st.d $s0, $sp, 88 # 8-byte Folded Spill + st.d $s1, $sp, 80 # 8-byte Folded Spill .cfi_offset 1, -8 .cfi_offset 22, -16 .cfi_offset 23, -24 .cfi_offset 24, -32 move $fp, $a0 - addi.d $a0, $sp, 40 + addi.d $a0, $sp, 48 ori $a1, $zero, 3 pcaddu18i $ra, %call36(_ZL9init_dataIdEvPT_j) jirl $ra, $ra, 0 @@ -1237,23 +1234,21 @@ _Z54benchmark_multiply_accumulate_no_runtime_checks_neededILj3EdEvRN9benchmark5S # %bb.1: # %_ZN9benchmark5State13StateIteratorC2EPS0_.exit beqz $s0, .LBB17_4 # %bb.2: - addi.d $a0, $sp, 40 + addi.d $a0, $sp, 48 addi.d $a1, $sp, 16 .p2align 4, , 16 .LBB17_3: # %.lr.ph # =>This Inner Loop Header: Depth=1 - fld.d $fa0, $sp, 40 - fld.d $fa1, $sp, 16 - fmadd.d $fa1, $fa0, $fa1, $fa0 - fld.d $fa2, $sp, 24 - fld.d $fa3, $sp, 48 - fld.d $fa4, $sp, 32 - fld.d $fa5, $sp, 56 - fst.d $fa1, $sp, 40 - fmadd.d $fa1, $fa0, $fa2, $fa3 - fst.d $fa1, $sp, 48 - fmadd.d $fa0, $fa0, $fa4, $fa5 - fst.d $fa0, $sp, 56 + vld $vr0, $sp, 16 + vld $vr1, $sp, 48 + fld.d $fa2, $sp, 48 + fld.d $fa3, $sp, 32 + fld.d $fa4, $sp, 64 + vreplvei.d $vr5, $vr1, 0 + vfmadd.d $vr0, $vr5, $vr0, $vr1 + vst $vr0, $sp, 48 + fmadd.d $fa0, $fa2, $fa3, $fa4 + fst.d $fa0, $sp, 64 #APP #NO_APP #APP @@ -1265,11 +1260,11 @@ _Z54benchmark_multiply_accumulate_no_runtime_checks_neededILj3EdEvRN9benchmark5S move $a0, $fp pcaddu18i $ra, %call36(_ZN9benchmark5State17FinishKeepRunningEv) jirl $ra, $ra, 0 - ld.d $s1, $sp, 64 # 8-byte Folded Reload - ld.d $s0, $sp, 72 # 8-byte Folded Reload - ld.d $fp, $sp, 80 # 8-byte Folded Reload - ld.d $ra, $sp, 88 # 8-byte Folded Reload - addi.d $sp, $sp, 96 + ld.d $s1, $sp, 80 # 8-byte Folded Reload + ld.d $s0, $sp, 88 # 8-byte Folded Reload + ld.d $fp, $sp, 96 # 8-byte Folded Reload + ld.d $ra, $sp, 104 # 8-byte Folded Reload + addi.d $sp, $sp, 112 ret .Lfunc_end17: .size _Z54benchmark_multiply_accumulate_no_runtime_checks_neededILj3EdEvRN9benchmark5StateE, .Lfunc_end17-_Z54benchmark_multiply_accumulate_no_runtime_checks_neededILj3EdEvRN9benchmark5StateE @@ -1446,22 +1441,15 @@ _Z54benchmark_multiply_accumulate_no_runtime_checks_neededILj4EdEvRN9benchmark5S .p2align 4, , 16 .LBB20_3: # %.lr.ph # =>This Inner Loop Header: Depth=1 - fld.d $fa0, $sp, 48 - fld.d $fa1, $sp, 16 - fld.d $fa2, $sp, 24 - fld.d $fa3, $sp, 56 - fmadd.d $fa1, $fa0, $fa1, $fa0 - fst.d $fa1, $sp, 48 - fmadd.d $fa1, $fa0, $fa2, $fa3 - fld.d $fa2, $sp, 32 - fld.d $fa3, $sp, 64 - fld.d $fa4, $sp, 40 - fld.d $fa5, $sp, 72 - fst.d $fa1, $sp, 56 - fmadd.d $fa1, $fa0, $fa2, $fa3 - fst.d $fa1, $sp, 64 - fmadd.d $fa0, $fa0, $fa4, $fa5 - fst.d $fa0, $sp, 72 + vld $vr0, $sp, 48 + vld $vr1, $sp, 16 + vld $vr2, $sp, 32 + vld $vr3, $sp, 64 + vreplvei.d $vr4, $vr0, 0 + vfmadd.d $vr0, $vr4, $vr1, $vr0 + vst $vr0, $sp, 48 + vfmadd.d $vr0, $vr4, $vr2, $vr3 + vst $vr0, $sp, 64 #APP #NO_APP #APP diff --git a/results/MultiSource/Applications/ClamAV/CMakeFiles/clamscan.dir/libclamav_special.s b/results/MultiSource/Applications/ClamAV/CMakeFiles/clamscan.dir/libclamav_special.s index 7d478580..5b980b73 100644 --- a/results/MultiSource/Applications/ClamAV/CMakeFiles/clamscan.dir/libclamav_special.s +++ b/results/MultiSource/Applications/ClamAV/CMakeFiles/clamscan.dir/libclamav_special.s @@ -50,19 +50,19 @@ cli_check_mydoom_log: # @cli_check_mydoom_log vshuf4i.b $vr0, $vr0, 27 vreplgr2vr.w $vr1, $a0 vxor.v $vr0, $vr0, $vr1 - vpickve2gr.w $a1, $vr0, 0 - vpickve2gr.w $a2, $vr0, 1 - add.d $a1, $a2, $a1 - vpickve2gr.w $a2, $vr0, 2 - ld.w $a3, $sp, 28 - add.d $a1, $a2, $a1 - vpickve2gr.w $a2, $vr0, 3 - add.d $a1, $a2, $a1 - revb.2w $a2, $a3 + vreplvei.w $vr1, $vr0, 1 + vadd.w $vr1, $vr1, $vr0 + vreplvei.w $vr2, $vr0, 2 + vadd.w $vr1, $vr2, $vr1 + ld.w $a1, $sp, 28 + vpickve2gr.w $a2, $vr1, 0 + vpickve2gr.w $a3, $vr0, 3 + add.d $a2, $a3, $a2 + revb.2w $a1, $a1 ld.w $a3, $sp, 32 - xor $a2, $a2, $a0 - st.w $a2, $sp, 28 - add.d $a1, $a2, $a1 + xor $a1, $a1, $a0 + st.w $a1, $sp, 28 + add.d $a1, $a1, $a2 revb.2w $a2, $a3 xor $a2, $a2, $a0 ld.w $a3, $sp, 36 diff --git a/results/MultiSource/Applications/JM/ldecod/CMakeFiles/ldecod.dir/erc_do_p.s b/results/MultiSource/Applications/JM/ldecod/CMakeFiles/ldecod.dir/erc_do_p.s index a9db8579..cb6bff50 100644 --- a/results/MultiSource/Applications/JM/ldecod/CMakeFiles/ldecod.dir/erc_do_p.s +++ b/results/MultiSource/Applications/JM/ldecod/CMakeFiles/ldecod.dir/erc_do_p.s @@ -1307,24 +1307,22 @@ conceal_lost_frames: # @conceal_lost_frames move $s0, $a0 add.d $s4, $a0, $s5 st.w $s6, $s4, 76 - st.w $s3, $s4, 8 - stx.w $s3, $a0, $s5 - st.w $zero, $s4, 32 st.d $s6, $s4, 24 ld.d $a0, $sp, 80 # 8-byte Folded Reload ldx.w $a0, $fp, $a0 ld.d $a1, $sp, 72 # 8-byte Folded Reload ldx.w $a1, $fp, $a1 - st.w $s6, $s4, 272 - st.w $zero, $s4, 216 + st.w $s3, $s4, 8 + stx.w $s3, $s0, $s5 stptr.w $s3, $fp, 5676 add.d $a1, $a1, $a0 - st.w $a1, $s0, 8 + vreplgr2vr.w $vr0, $a1 + vst $vr0, $s0, 4 ld.d $a2, $sp, 64 # 8-byte Folded Reload ld.w $a0, $a2, 28 - st.w $a1, $s0, 12 - st.w $a1, $s0, 16 - st.w $a1, $s0, 4 + st.w $zero, $s4, 32 + st.w $s6, $s4, 272 + st.w $zero, $s4, 216 addi.w $a0, $a0, -1 stptr.w $a1, $fp, 6056 bltz $a0, .LBB3_12 @@ -2263,11 +2261,9 @@ conceal_non_ref_pics: # @conceal_non_ref_pics # %bb.10: # in Loop: Header=BB18_8 Depth=1 move $s1, $a0 stptr.w $s8, $a2, 6072 + vreplgr2vr.w $vr0, $s8 + vst $vr0, $a0, 4 ldptr.w $a0, $a2, 6068 - st.w $s8, $s1, 8 - st.w $s8, $s1, 12 - st.w $s8, $s1, 16 - st.w $s8, $s1, 4 ori $a4, $zero, 2 beq $a0, $a4, .LBB18_13 # %bb.11: # in Loop: Header=BB18_8 Depth=1 diff --git a/results/MultiSource/Applications/JM/ldecod/CMakeFiles/ldecod.dir/image.s b/results/MultiSource/Applications/JM/ldecod/CMakeFiles/ldecod.dir/image.s index 826a652b..ef9f339e 100644 --- a/results/MultiSource/Applications/JM/ldecod/CMakeFiles/ldecod.dir/image.s +++ b/results/MultiSource/Applications/JM/ldecod/CMakeFiles/ldecod.dir/image.s @@ -2405,18 +2405,18 @@ find_snr: # @find_snr .type get_block,@function get_block: # @get_block # %bb.0: - addi.d $sp, $sp, -480 - st.d $ra, $sp, 472 # 8-byte Folded Spill - st.d $fp, $sp, 464 # 8-byte Folded Spill - st.d $s0, $sp, 456 # 8-byte Folded Spill - st.d $s1, $sp, 448 # 8-byte Folded Spill - st.d $s2, $sp, 440 # 8-byte Folded Spill - st.d $s3, $sp, 432 # 8-byte Folded Spill - st.d $s4, $sp, 424 # 8-byte Folded Spill - st.d $s5, $sp, 416 # 8-byte Folded Spill - st.d $s6, $sp, 408 # 8-byte Folded Spill - st.d $s7, $sp, 400 # 8-byte Folded Spill - st.d $s8, $sp, 392 # 8-byte Folded Spill + addi.d $sp, $sp, -448 + st.d $ra, $sp, 440 # 8-byte Folded Spill + st.d $fp, $sp, 432 # 8-byte Folded Spill + st.d $s0, $sp, 424 # 8-byte Folded Spill + st.d $s1, $sp, 416 # 8-byte Folded Spill + st.d $s2, $sp, 408 # 8-byte Folded Spill + st.d $s3, $sp, 400 # 8-byte Folded Spill + st.d $s4, $sp, 392 # 8-byte Folded Spill + st.d $s5, $sp, 384 # 8-byte Folded Spill + st.d $s6, $sp, 376 # 8-byte Folded Spill + st.d $s7, $sp, 368 # 8-byte Folded Spill + st.d $s8, $sp, 360 # 8-byte Folded Spill slli.d $a0, $a0, 3 ldx.d $a0, $a1, $a0 pcalau12i $a1, %got_pc_hi20(no_reference_picture) @@ -2461,183 +2461,165 @@ get_block: # @get_block .LBB6_5: ld.w $a1, $a6, 16 .LBB6_6: - srai.d $s7, $a2, 2 + srai.d $t0, $a2, 2 ld.w $a7, $a6, 12 or $a6, $a3, $a2 andi $t1, $a6, 3 - srai.d $ra, $a3, 2 + srai.d $a6, $a3, 2 beqz $t1, .LBB6_19 # %bb.7: - andi $a6, $a3, 3 - andi $t0, $a2, 3 - st.d $s7, $sp, 56 # 8-byte Folded Spill - beqz $a6, .LBB6_21 + andi $t2, $a3, 3 + andi $t1, $a2, 3 + beqz $t2, .LBB6_21 # %bb.8: - st.d $ra, $sp, 48 # 8-byte Folded Spill - beqz $t0, .LBB6_25 + beqz $t1, .LBB6_25 # %bb.9: ori $t3, $zero, 2 - bne $t0, $t3, .LBB6_29 + bne $t1, $t3, .LBB6_29 # %bb.10: # %.preheader472.preheader - st.d $a3, $sp, 40 # 8-byte Folded Spill - st.d $a6, $sp, 32 # 8-byte Folded Spill + st.d $t2, $sp, 24 # 8-byte Folded Spill move $a2, $zero - addi.d $t1, $sp, 68 - ori $t4, $zero, 1 - addi.w $t5, $zero, -1 - addi.w $t6, $zero, -2 - addi.w $t7, $zero, -3 - ori $t8, $zero, 324 - ori $fp, $zero, 4 + vinsgr2vr.w $vr0, $a7, 0 + vinsgr2vr.w $vr0, $a7, 1 + addi.d $t1, $sp, 36 + addi.w $t4, $zero, -1 + ori $t5, $zero, 1 + lu32i.d $t5, -2 + vreplgr2vr.d $vr1, $t5 + move $t5, $t4 + lu32i.d $t5, 2 + vreplgr2vr.d $vr2, $t5 + ori $t5, $zero, 2 + lu32i.d $t5, -3 + vreplgr2vr.d $vr3, $t5 + addi.w $t5, $zero, -2 + lu32i.d $t5, 3 + vreplgr2vr.d $vr4, $t5 + ori $t5, $zero, 324 + ori $t6, $zero, 4 .p2align 4, , 16 .LBB6_11: # %.preheader472 # =>This Loop Header: Depth=1 # Child Loop BB6_12 Depth 2 - move $s0, $zero - add.w $a3, $s7, $a2 - slt $a6, $t3, $a3 - maskeqz $t0, $a3, $a6 - masknez $a6, $t3, $a6 - or $a6, $t0, $a6 - addi.w $a6, $a6, -2 - slt $t0, $a6, $a7 - maskeqz $a6, $a6, $t0 - masknez $t0, $a7, $t0 - or $a6, $a6, $t0 - slt $t0, $t4, $a3 - maskeqz $t2, $a3, $t0 - masknez $t0, $t4, $t0 - or $t0, $t2, $t0 - addi.d $t0, $t0, -1 - slt $t2, $t0, $a7 - maskeqz $t0, $t0, $t2 - masknez $t2, $a7, $t2 - or $t0, $t0, $t2 - srai.d $t2, $a3, 63 - andn $t2, $a3, $t2 - slt $s1, $t2, $a7 - maskeqz $t2, $t2, $s1 - masknez $s1, $a7, $s1 - or $t2, $t2, $s1 - slt $s1, $t5, $a3 - maskeqz $s2, $a3, $s1 - masknez $s1, $t5, $s1 - or $s1, $s2, $s1 - addi.w $s1, $s1, 1 - slt $s2, $s1, $a7 - maskeqz $s1, $s1, $s2 - masknez $s2, $a7, $s2 - or $s1, $s1, $s2 - slt $s2, $t6, $a3 - maskeqz $s3, $a3, $s2 - masknez $s2, $t6, $s2 - or $s2, $s3, $s2 - addi.w $s2, $s2, 2 - slt $s3, $s2, $a7 - maskeqz $s2, $s2, $s3 - masknez $s3, $a7, $s3 - or $s5, $s2, $s3 - slt $s2, $t7, $a3 - maskeqz $a3, $a3, $s2 - masknez $s2, $t7, $s2 - or $a3, $a3, $s2 - addi.w $a3, $a3, 3 - slt $s2, $a3, $a7 - maskeqz $a3, $a3, $s2 - masknez $s2, $a7, $s2 - or $a3, $a3, $s2 - slli.d $s2, $a6, 1 - slli.d $s3, $a3, 1 - slli.d $s4, $t0, 1 - slli.d $s5, $s5, 1 - slli.d $s6, $t2, 1 - slli.d $s7, $s1, 1 - move $s8, $ra + move $t7, $zero + add.w $t8, $t0, $a2 + srai.d $fp, $t8, 63 + andn $fp, $t8, $fp + slt $s0, $fp, $a7 + maskeqz $fp, $fp, $s0 + masknez $s0, $a7, $s0 + or $s3, $fp, $s0 + slt $fp, $t4, $t8 + maskeqz $s0, $t8, $fp + masknez $fp, $t4, $fp + or $fp, $s0, $fp + addi.w $fp, $fp, 1 + slt $s0, $fp, $a7 + maskeqz $fp, $fp, $s0 + masknez $s0, $a7, $s0 + or $s4, $fp, $s0 + vinsgr2vr.w $vr5, $t8, 0 + vinsgr2vr.w $vr5, $t8, 1 + vmax.w $vr6, $vr5, $vr1 + vadd.w $vr6, $vr6, $vr2 + vmin.w $vr6, $vr6, $vr0 + vmax.w $vr5, $vr5, $vr3 + vadd.w $vr5, $vr5, $vr4 + vmin.w $vr5, $vr5, $vr0 + vpickve2gr.w $t8, $vr5, 0 + vpickve2gr.w $s0, $vr5, 1 + vpickve2gr.w $s1, $vr6, 0 + vpickve2gr.w $s2, $vr6, 1 + slli.d $fp, $t8, 1 + slli.d $s0, $s0, 1 + slli.d $s1, $s1, 1 + slli.d $s2, $s2, 1 + slli.d $s3, $s3, 1 + slli.d $s4, $s4, 1 + move $s5, $a6 .p2align 4, , 16 .LBB6_12: # Parent Loop BB6_11 Depth=1 # => This Inner Loop Header: Depth=2 - addi.w $a3, $s8, 0 - slt $a6, $t3, $a3 - masknez $t0, $t3, $a6 - maskeqz $a3, $a3, $a6 - or $a3, $a3, $t0 - addi.w $a3, $a3, -2 - slt $a6, $a3, $a1 - maskeqz $a3, $a3, $a6 - masknez $a6, $a1, $a6 - or $a3, $a3, $a6 - slli.d $a3, $a3, 3 - ldx.d $s1, $a0, $a3 - ldx.hu $a3, $s1, $s2 - ldx.hu $a6, $s1, $s3 - ldx.hu $t0, $s1, $s4 - ldx.hu $t2, $s1, $s5 - add.d $a3, $a6, $a3 - ldx.hu $a6, $s1, $s6 - ldx.hu $ra, $s1, $s7 - add.d $t0, $t2, $t0 - alsl.d $t0, $t0, $t0, 2 - sub.d $a3, $a3, $t0 - add.d $a6, $ra, $a6 - slli.d $t0, $a6, 4 - alsl.d $a6, $a6, $t0, 2 - add.d $a3, $a6, $a3 - stx.w $a3, $t1, $s0 - addi.d $s0, $s0, 36 - addi.d $s8, $s8, 1 - bne $s0, $t8, .LBB6_12 + addi.w $t8, $s5, 0 + slt $s6, $t3, $t8 + masknez $s7, $t3, $s6 + maskeqz $t8, $t8, $s6 + or $t8, $t8, $s7 + addi.w $t8, $t8, -2 + slt $s6, $t8, $a1 + maskeqz $t8, $t8, $s6 + masknez $s6, $a1, $s6 + or $t8, $t8, $s6 + slli.d $t8, $t8, 3 + ldx.d $t8, $a0, $t8 + ldx.hu $s6, $t8, $fp + ldx.hu $s7, $t8, $s0 + ldx.hu $s8, $t8, $s1 + ldx.hu $ra, $t8, $s2 + add.d $s6, $s7, $s6 + ldx.hu $s7, $t8, $s3 + ldx.hu $t2, $t8, $s4 + add.d $s8, $ra, $s8 + alsl.d $s8, $s8, $s8, 2 + sub.d $s6, $s6, $s8 + add.d $t2, $t2, $s7 + slli.d $s7, $t2, 4 + alsl.d $t2, $t2, $s7, 2 + add.d $t2, $t2, $s6 + stx.w $t2, $t1, $t7 + addi.d $t7, $t7, 36 + addi.d $s5, $s5, 1 + bne $t7, $t5, .LBB6_12 # %bb.13: # in Loop: Header=BB6_11 Depth=1 addi.d $a2, $a2, 1 addi.d $t1, $t1, 4 - ld.d $ra, $sp, 48 # 8-byte Folded Reload - ld.d $s7, $sp, 56 # 8-byte Folded Reload - bne $a2, $fp, .LBB6_11 + bne $a2, $t6, .LBB6_11 # %bb.14: # %.preheader471 move $a0, $zero pcalau12i $a1, %pc_hi20(get_block.cur_lineY) - st.d $s1, $a1, %pc_lo12(get_block.cur_lineY) - ld.w $s1, $sp, 68 - ld.w $s2, $sp, 104 - ld.w $t7, $sp, 140 - ld.w $a1, $sp, 72 - ld.w $a2, $sp, 108 - ld.w $t8, $sp, 144 - ld.w $a6, $sp, 76 - ld.w $a7, $sp, 112 - ld.w $fp, $sp, 148 - ld.w $t3, $sp, 80 - ld.w $t5, $sp, 116 - ld.w $s0, $sp, 152 + st.d $t8, $a1, %pc_lo12(get_block.cur_lineY) + ld.w $s1, $sp, 36 + ld.w $s2, $sp, 72 + ld.w $t7, $sp, 108 + ld.w $a1, $sp, 40 + ld.w $a2, $sp, 76 + ld.w $t8, $sp, 112 + ld.w $a6, $sp, 44 + ld.w $a7, $sp, 80 + ld.w $fp, $sp, 116 + ld.w $t3, $sp, 48 + ld.w $t5, $sp, 84 + ld.w $s0, $sp, 120 addi.d $t0, $a5, 12 - addi.d $t1, $sp, 260 + addi.d $t1, $sp, 228 ori $t4, $zero, 20 ori $t6, $zero, 64 .p2align 4, , 16 .LBB6_15: # %.loopexit470 # =>This Inner Loop Header: Depth=1 - ld.w $a3, $t1, -12 - ld.w $t2, $t1, -48 + ld.w $t2, $t1, -12 + ld.w $s4, $t1, -48 ld.w $s5, $t1, -8 - add.d $s3, $s1, $a3 + add.d $s3, $s1, $t2 ld.w $s6, $t1, -44 - add.d $s4, $t2, $s2 + add.d $s4, $s4, $s2 move $s1, $s2 move $s2, $t7 - add.d $a3, $a1, $s5 - ld.w $t2, $t1, -4 + add.d $t2, $a1, $s5 + ld.w $t7, $t1, -4 add.d $s6, $s6, $a2 move $a1, $a2 move $a2, $t8 - ld.w $t7, $t1, -40 - ld.w $t8, $t1, 0 - add.d $t2, $a6, $t2 - ld.w $s7, $t1, -36 - add.d $s8, $t7, $a7 + ld.w $t8, $t1, -40 + ld.w $s5, $t1, 0 + add.d $s7, $a6, $t7 + ld.w $s8, $t1, -36 + add.d $ra, $t8, $a7 move $a6, $a7 move $a7, $fp - add.d $s5, $t3, $t8 + add.d $s5, $t3, $s5 ld.w $t7, $t1, -84 - add.d $s7, $s7, $t5 + add.d $s8, $s8, $t5 move $t3, $t5 move $t5, $s0 addi.w $t8, $s4, 0 @@ -2662,66 +2644,65 @@ get_block: # @get_block alsl.d $fp, $fp, $s6, 2 add.d $s0, $t8, $a2 mul.d $s0, $s0, $t4 - sub.d $a3, $a3, $fp + sub.d $t2, $t2, $fp ldptr.w $fp, $a4, 5900 - add.d $a3, $a3, $s0 - addi.w $a3, $a3, 512 - srai.d $s0, $a3, 10 - srai.d $a3, $a3, 63 - andn $a3, $s0, $a3 - slt $s0, $a3, $fp - maskeqz $a3, $a3, $s0 + add.d $t2, $t2, $s0 + addi.w $t2, $t2, 512 + srai.d $s0, $t2, 10 + srai.d $t2, $t2, 63 + andn $t2, $s0, $t2 + slt $s0, $t2, $fp + maskeqz $t2, $t2, $s0 masknez $fp, $fp, $s0 - or $a3, $a3, $fp + or $t2, $t2, $fp ld.w $fp, $t1, -76 - st.w $a3, $s3, -8 - addi.w $a3, $s8, 0 - alsl.d $a3, $a3, $s8, 2 + st.w $t2, $s3, -8 + addi.w $t2, $ra, 0 + alsl.d $t2, $t2, $ra, 2 add.d $s0, $fp, $a7 mul.d $s0, $s0, $t4 ldptr.w $s4, $a4, 5900 - sub.d $a3, $t2, $a3 - add.d $a3, $a3, $s0 - addi.w $a3, $a3, 512 - srai.d $t2, $a3, 10 - srai.d $a3, $a3, 63 - andn $a3, $t2, $a3 - slt $t2, $a3, $s4 - maskeqz $a3, $a3, $t2 - masknez $t2, $s4, $t2 - or $a3, $a3, $t2 + sub.d $t2, $s7, $t2 + add.d $t2, $t2, $s0 + addi.w $t2, $t2, 512 + srai.d $s0, $t2, 10 + srai.d $t2, $t2, 63 + andn $t2, $s0, $t2 + slt $s0, $t2, $s4 + maskeqz $t2, $t2, $s0 + masknez $s0, $s4, $s0 + or $t2, $t2, $s0 ld.w $s0, $t1, -72 - st.w $a3, $s3, -4 - addi.w $a3, $s7, 0 - alsl.d $a3, $a3, $s7, 2 - add.d $t2, $s0, $t5 - mul.d $t2, $t2, $t4 - ldptr.w $s3, $a4, 5900 - sub.d $a3, $s5, $a3 - add.d $a3, $a3, $t2 - addi.w $a3, $a3, 512 - srai.d $t2, $a3, 10 - srai.d $a3, $a3, 63 - andn $a3, $t2, $a3 - slt $t2, $a3, $s3 - maskeqz $a3, $a3, $t2 - masknez $t2, $s3, $t2 - or $a3, $a3, $t2 - stx.w $a3, $t0, $a0 + st.w $t2, $s3, -4 + addi.w $t2, $s8, 0 + alsl.d $t2, $t2, $s8, 2 + add.d $s3, $s0, $t5 + mul.d $s3, $s3, $t4 + ldptr.w $s4, $a4, 5900 + sub.d $t2, $s5, $t2 + add.d $t2, $t2, $s3 + addi.w $t2, $t2, 512 + srai.d $s3, $t2, 10 + srai.d $t2, $t2, 63 + andn $t2, $s3, $t2 + slt $s3, $t2, $s4 + maskeqz $t2, $t2, $s3 + masknez $s3, $s4, $s3 + or $t2, $t2, $s3 + stx.w $t2, $t0, $a0 addi.d $a0, $a0, 16 addi.d $t1, $t1, 36 bne $a0, $t6, .LBB6_15 # %bb.16: - ld.d $a0, $sp, 40 # 8-byte Folded Reload - andi $a0, $a0, 1 + andi $a0, $a3, 1 + ld.d $a1, $sp, 24 # 8-byte Folded Reload beqz $a0, .LBB6_43 # %bb.17: # %.preheader468 - ld.d $a0, $sp, 32 # 8-byte Folded Reload - srli.d $a0, $a0, 1 + srli.d $a0, $a1, 1 slli.d $a1, $a0, 2 slli.d $a0, $a0, 5 or $a0, $a0, $a1 - addi.d $a1, $sp, 68 + addi.d $a1, $sp, 36 add.d $a0, $a1, $a0 addi.d $a1, $a5, 8 ori $a2, $zero, 84 @@ -2794,115 +2775,101 @@ get_block: # @get_block bne $a2, $a3, .LBB6_18 b .LBB6_43 .LBB6_19: # %.preheader452 - srai.d $a2, $s7, 63 - andn $a2, $s7, $a2 + srai.d $a2, $t0, 63 + andn $a2, $t0, $a2 slt $a3, $a2, $a7 maskeqz $a2, $a2, $a3 masknez $a3, $a7, $a3 - or $a3, $a2, $a3 - addi.w $t2, $zero, -1 - slt $a2, $t2, $s7 - masknez $a4, $t2, $a2 - maskeqz $a2, $s7, $a2 - or $a2, $a2, $a4 - addi.d $a2, $a2, 1 - slt $a4, $a2, $a7 - maskeqz $a2, $a2, $a4 + or $a2, $a2, $a3 + addi.w $t1, $zero, -1 + slt $a3, $t1, $t0 + masknez $a4, $t1, $a3 + maskeqz $a3, $t0, $a3 + or $a3, $a3, $a4 + addi.d $a3, $a3, 1 + slt $a4, $a3, $a7 + maskeqz $a3, $a3, $a4 masknez $a4, $a7, $a4 - or $a4, $a2, $a4 - addi.w $t1, $zero, -2 - slt $a2, $t1, $s7 - masknez $t3, $t1, $a2 - maskeqz $a2, $s7, $a2 - or $a2, $a2, $t3 - addi.d $a2, $a2, 2 - slt $t3, $a2, $a7 - maskeqz $a2, $a2, $t3 - masknez $t3, $a7, $t3 - or $t3, $a2, $t3 - addi.w $a2, $zero, -3 - slt $t4, $a2, $s7 - masknez $t5, $a2, $t4 - maskeqz $t0, $s7, $t4 - or $t0, $t0, $t5 - srai.d $t4, $ra, 63 - andn $t4, $ra, $t4 - slt $t5, $t4, $a1 - maskeqz $t4, $t4, $t5 - masknez $t5, $a1, $t5 - or $t4, $t4, $t5 - slli.d $t4, $t4, 3 - ldx.d $t4, $a0, $t4 - addi.d $t0, $t0, 3 - slt $t5, $t0, $a7 + or $a3, $a3, $a4 + vinsgr2vr.w $vr1, $t0, 0 + vinsgr2vr.w $vr1, $t0, 1 + addi.w $a4, $zero, -2 + lu32i.d $a4, -3 + vreplgr2vr.d $vr0, $a4 + vmax.w $vr1, $vr1, $vr0 + ori $a4, $zero, 2 + lu32i.d $a4, 3 + srai.d $t0, $a6, 63 + andn $t0, $a6, $t0 + slt $t2, $t0, $a1 + maskeqz $t0, $t0, $t2 + masknez $t2, $a1, $t2 + or $t0, $t0, $t2 + slli.d $t0, $t0, 3 + ldx.d $t0, $a0, $t0 + vreplgr2vr.d $vr2, $a4 + vadd.w $vr1, $vr1, $vr2 + slli.d $a2, $a2, 1 + ldx.hu $a4, $t0, $a2 + vinsgr2vr.w $vr3, $a7, 0 + vinsgr2vr.w $vr3, $a7, 1 + vmin.w $vr1, $vr1, $vr3 + st.w $a4, $a5, 0 slli.d $a3, $a3, 1 - ldx.hu $t6, $t4, $a3 - maskeqz $t0, $t0, $t5 - masknez $a7, $a7, $t5 - or $t0, $t0, $a7 - st.w $t6, $a5, 0 + ldx.hu $a7, $t0, $a3 + vpickve2gr.w $a4, $vr1, 0 slli.d $a4, $a4, 1 - ldx.hu $t5, $t4, $a4 - slli.d $a7, $t3, 1 - slt $t3, $t2, $ra - masknez $t2, $t2, $t3 - maskeqz $t3, $ra, $t3 - or $t2, $t3, $t2 - addi.d $t2, $t2, 1 - slt $t3, $t2, $a1 - maskeqz $t2, $t2, $t3 - masknez $t3, $a1, $t3 - or $t2, $t2, $t3 - slli.d $t2, $t2, 3 - ldx.d $t2, $a0, $t2 - ldx.hu $t3, $t4, $a7 - slli.d $t0, $t0, 1 - ldx.hu $t4, $t4, $t0 - ldx.hu $t6, $t2, $a3 - st.w $t5, $a5, 4 - st.w $t3, $a5, 8 - st.w $t4, $a5, 12 - st.w $t6, $a5, 16 - slt $t3, $t1, $ra + ldx.hu $t2, $t0, $a4 + slt $t3, $t1, $a6 masknez $t1, $t1, $t3 - maskeqz $t3, $ra, $t3 + maskeqz $t3, $a6, $t3 or $t1, $t3, $t1 - addi.d $t1, $t1, 2 + addi.d $t1, $t1, 1 slt $t3, $t1, $a1 maskeqz $t1, $t1, $t3 masknez $t3, $a1, $t3 or $t1, $t1, $t3 slli.d $t1, $t1, 3 ldx.d $t1, $a0, $t1 - ldx.hu $t3, $t2, $a4 - ldx.hu $t4, $t2, $a7 - ldx.hu $t2, $t2, $t0 - ldx.hu $t5, $t1, $a3 - st.w $t3, $a5, 20 - st.w $t4, $a5, 24 - st.w $t2, $a5, 28 - st.w $t5, $a5, 32 - ldx.hu $t2, $t1, $a4 - ldx.hu $t3, $t1, $a7 - ldx.hu $t1, $t1, $t0 - slt $t4, $a2, $ra - masknez $a2, $a2, $t4 - maskeqz $a6, $ra, $t4 - or $a2, $a6, $a2 - addi.d $a2, $a2, 3 - slt $a6, $a2, $a1 - maskeqz $a2, $a2, $a6 - masknez $a1, $a1, $a6 - or $a1, $a2, $a1 + vpickve2gr.w $t3, $vr1, 1 + slli.d $t3, $t3, 1 + ldx.hu $t0, $t0, $t3 + ldx.hu $t4, $t1, $a2 + st.w $a7, $a5, 4 + st.w $t2, $a5, 8 + st.w $t0, $a5, 12 + st.w $t4, $a5, 16 + vinsgr2vr.w $vr1, $a6, 0 + vinsgr2vr.w $vr1, $a6, 1 + vmax.w $vr0, $vr1, $vr0 + vadd.w $vr0, $vr0, $vr2 + vinsgr2vr.w $vr1, $a1, 0 + vinsgr2vr.w $vr1, $a1, 1 + vmin.w $vr0, $vr0, $vr1 + vpickve2gr.w $a1, $vr0, 0 slli.d $a1, $a1, 3 - ldx.d $a0, $a0, $a1 - st.w $t2, $a5, 36 - st.w $t3, $a5, 40 - st.w $t1, $a5, 44 - ldx.hu $a1, $a0, $a3 - ldx.hu $a2, $a0, $a4 - ldx.hu $a3, $a0, $a7 - ldx.hu $a4, $a0, $t0 + ldx.d $a1, $a0, $a1 + ldx.hu $a6, $t1, $a3 + ldx.hu $a7, $t1, $a4 + ldx.hu $t0, $t1, $t3 + ldx.hu $t1, $a1, $a2 + st.w $a6, $a5, 20 + st.w $a7, $a5, 24 + st.w $t0, $a5, 28 + st.w $t1, $a5, 32 + ldx.hu $a6, $a1, $a3 + ldx.hu $a7, $a1, $a4 + ldx.hu $a1, $a1, $t3 + vpickve2gr.w $t0, $vr0, 1 + slli.d $t0, $t0, 3 + ldx.d $a0, $a0, $t0 + st.w $a6, $a5, 36 + st.w $a7, $a5, 40 + st.w $a1, $a5, 44 + ldx.hu $a1, $a0, $a2 + ldx.hu $a2, $a0, $a3 + ldx.hu $a3, $a0, $a4 + ldx.hu $a4, $a0, $t3 st.w $a1, $a5, 48 st.w $a2, $a5, 52 st.w $a3, $a5, 56 @@ -2912,231 +2879,208 @@ get_block: # @get_block st.d $a0, $a1, %pc_lo12(get_block.cur_lineY) b .LBB6_43 .LBB6_21: # %.preheader456 - st.d $a2, $sp, 48 # 8-byte Folded Spill - st.d $t0, $sp, 40 # 8-byte Folded Spill move $t4, $zero - srai.d $a3, $ra, 63 - andn $a3, $ra, $a3 + srai.d $a3, $a6, 63 + andn $a3, $a6, $a3 slt $t2, $a3, $a1 maskeqz $a3, $a3, $t2 masknez $t2, $a1, $t2 or $a3, $a3, $t2 - alsl.d $a2, $a3, $a0, 3 - st.d $a2, $sp, 16 # 8-byte Folded Spill + alsl.d $t3, $a3, $a0, 3 slli.d $a3, $a3, 3 ldx.d $t5, $a0, $a3 addi.w $t6, $zero, -1 - slt $a3, $t6, $ra + slt $a3, $t6, $a6 masknez $t2, $t6, $a3 - maskeqz $a3, $ra, $a3 + maskeqz $a3, $a6, $a3 or $a3, $a3, $t2 addi.d $a3, $a3, 1 slt $t2, $a3, $a1 maskeqz $a3, $a3, $t2 masknez $t2, $a1, $t2 or $a3, $a3, $t2 - alsl.d $a2, $a3, $a0, 3 - st.d $a2, $sp, 24 # 8-byte Folded Spill + alsl.d $t2, $a3, $a0, 3 + st.d $t2, $sp, 16 # 8-byte Folded Spill slli.d $a3, $a3, 3 ldx.d $t7, $a0, $a3 + vinsgr2vr.w $vr0, $a6, 0 addi.w $t8, $zero, -2 - slt $a3, $t8, $ra - masknez $fp, $t8, $a3 - maskeqz $a3, $ra, $a3 - or $a3, $a3, $fp - addi.d $a3, $a3, 2 - slt $fp, $a3, $a1 - maskeqz $a3, $a3, $fp - masknez $fp, $a1, $fp - or $a3, $a3, $fp - slli.d $a3, $a3, 3 - ldx.d $a3, $a0, $a3 - addi.w $fp, $zero, -3 - slt $s0, $fp, $ra - masknez $s1, $fp, $s0 - maskeqz $a6, $ra, $s0 - or $a6, $a6, $s1 - addi.d $a6, $a6, 3 - slt $s0, $a6, $a1 - maskeqz $a6, $a6, $s0 - masknez $a1, $a1, $s0 - or $a1, $a6, $a1 - alsl.d $a2, $a1, $a0, 3 - st.d $a2, $sp, 32 # 8-byte Folded Spill - slli.d $a1, $a1, 3 - ldx.d $a0, $a0, $a1 + vinsgr2vr.w $vr0, $a6, 1 + move $a3, $t8 + lu32i.d $a3, -3 + vreplgr2vr.d $vr1, $a3 + vmax.w $vr0, $vr0, $vr1 ori $a6, $zero, 2 - ori $s0, $zero, 1 - ori $s1, $zero, 20 - ori $a2, $zero, 16 - move $s3, $s7 + ori $a3, $zero, 2 + lu32i.d $a3, 3 + vreplgr2vr.d $vr1, $a3 + vadd.w $vr0, $vr0, $vr1 + vinsgr2vr.w $vr1, $a1, 0 + vinsgr2vr.w $vr1, $a1, 1 + vmin.w $vr0, $vr0, $vr1 + vpickve2gr.w $a1, $vr0, 0 + slli.d $a1, $a1, 3 + ldx.d $a3, $a0, $a1 + vpickve2gr.w $fp, $vr0, 1 + alsl.d $a1, $fp, $a0, 3 + st.d $a1, $sp, 24 # 8-byte Folded Spill + slli.d $fp, $fp, 3 + ldx.d $a0, $a0, $fp + vinsgr2vr.w $vr0, $a7, 0 + vinsgr2vr.w $vr0, $a7, 1 + lu32i.d $a6, -3 + vreplgr2vr.d $vr1, $a6 + lu32i.d $t8, 3 + vreplgr2vr.d $vr2, $t8 + ori $a6, $zero, 1 + lu32i.d $a6, -2 + vreplgr2vr.d $vr3, $a6 + move $a6, $t6 + lu32i.d $a6, 2 + vreplgr2vr.d $vr4, $a6 + ori $a6, $zero, 20 + ori $t8, $zero, 16 + move $fp, $t0 .p2align 4, , 16 .LBB6_22: # =>This Inner Loop Header: Depth=1 - addi.w $a1, $s3, 0 - slt $s4, $a6, $a1 - masknez $s5, $a6, $s4 - maskeqz $s4, $a1, $s4 - or $s4, $s4, $s5 - addi.w $s4, $s4, -2 - slt $s5, $s4, $a7 - maskeqz $s4, $s4, $s5 - masknez $s5, $a7, $s5 - or $s5, $s4, $s5 - slt $s4, $s0, $a1 - masknez $s6, $s0, $s4 - maskeqz $s4, $a1, $s4 - or $s4, $s4, $s6 - addi.d $s4, $s4, -1 - slt $s6, $s4, $a7 - maskeqz $s4, $s4, $s6 - masknez $s6, $a7, $s6 - or $s7, $s4, $s6 - srai.d $s4, $a1, 63 - andn $s4, $a1, $s4 - slt $s6, $s4, $a7 - maskeqz $s4, $s4, $s6 - masknez $s6, $a7, $s6 - or $ra, $s4, $s6 - slt $s4, $t6, $a1 - masknez $s6, $t6, $s4 - maskeqz $s4, $a1, $s4 - or $s4, $s4, $s6 - addi.w $s4, $s4, 1 - slt $s6, $s4, $a7 - maskeqz $s4, $s4, $s6 - masknez $s6, $a7, $s6 - or $t2, $s4, $s6 - slt $s4, $t8, $a1 - masknez $s6, $t8, $s4 - maskeqz $s4, $a1, $s4 - or $s4, $s4, $s6 - addi.w $s4, $s4, 2 - slt $s6, $s4, $a7 - maskeqz $s4, $s4, $s6 - masknez $s6, $a7, $s6 - or $s8, $s4, $s6 - slt $s4, $fp, $a1 - masknez $s6, $fp, $s4 - maskeqz $a1, $a1, $s4 - or $a1, $a1, $s6 - addi.w $a1, $a1, 3 - slt $s4, $a1, $a7 - maskeqz $a1, $a1, $s4 - masknez $s4, $a7, $s4 - or $a1, $a1, $s4 - add.d $s4, $a5, $t4 - slli.d $s6, $s5, 1 - ldx.hu $t3, $t5, $s6 - slli.d $s5, $a1, 1 - slli.d $s7, $s7, 1 - ldx.hu $t1, $t5, $s7 - slli.d $s8, $s8, 1 - ldx.hu $t0, $t5, $s8 - slli.d $ra, $ra, 1 - ldx.hu $s2, $t5, $ra - slli.d $a1, $t2, 1 - ldx.hu $t2, $t5, $a1 - add.d $t0, $t0, $t1 - ldx.hu $t1, $t5, $s5 - alsl.d $t0, $t0, $t0, 2 - add.d $t2, $t2, $s2 - mul.d $t2, $t2, $s1 - ldptr.w $s2, $a4, 5900 - add.d $t1, $t3, $t1 - sub.d $t0, $t1, $t0 - add.d $t0, $t0, $t2 - addi.w $t0, $t0, 16 - srai.d $t1, $t0, 5 - srai.d $t0, $t0, 63 - andn $t0, $t1, $t0 - slt $t1, $t0, $s2 - maskeqz $t0, $t0, $t1 - masknez $t1, $s2, $t1 - or $t0, $t0, $t1 - stx.w $t0, $a5, $t4 - ldx.hu $t0, $t7, $s7 - ldx.hu $t1, $t7, $s8 - ldx.hu $t2, $t7, $s6 - ldx.hu $t3, $t7, $ra - ldx.hu $s2, $t7, $a1 - add.d $t0, $t1, $t0 - ldx.hu $t1, $t7, $s5 - alsl.d $t0, $t0, $t0, 2 - add.d $t3, $s2, $t3 - mul.d $t3, $t3, $s1 - ldptr.w $s2, $a4, 5900 - add.d $t1, $t2, $t1 - sub.d $t0, $t1, $t0 - add.d $t0, $t0, $t3 - addi.w $t0, $t0, 16 - srai.d $t1, $t0, 5 - srai.d $t0, $t0, 63 - andn $t0, $t1, $t0 - slt $t1, $t0, $s2 - maskeqz $t0, $t0, $t1 - masknez $t1, $s2, $t1 - or $t0, $t0, $t1 - st.w $t0, $s4, 16 - ldx.hu $t0, $a3, $s7 - ldx.hu $t1, $a3, $s8 - ldx.hu $t2, $a3, $s6 - ldx.hu $t3, $a3, $ra - ldx.hu $s2, $a3, $a1 - add.d $t0, $t1, $t0 - ldx.hu $t1, $a3, $s5 - alsl.d $t0, $t0, $t0, 2 - add.d $t3, $s2, $t3 - mul.d $t3, $t3, $s1 - ldptr.w $s2, $a4, 5900 - add.d $t1, $t2, $t1 - sub.d $t0, $t1, $t0 - add.d $t0, $t0, $t3 - addi.w $t0, $t0, 16 - srai.d $t1, $t0, 5 - srai.d $t0, $t0, 63 - andn $t0, $t1, $t0 - slt $t1, $t0, $s2 - maskeqz $t0, $t0, $t1 - masknez $t1, $s2, $t1 - or $t0, $t0, $t1 - st.w $t0, $s4, 32 - ldx.hu $t0, $a0, $s6 - ldx.hu $t1, $a0, $s7 - ldx.hu $t2, $a0, $s8 - ldx.hu $t3, $a0, $ra - ldx.hu $a1, $a0, $a1 - ldx.hu $s2, $a0, $s5 - add.d $t1, $t2, $t1 - alsl.d $t1, $t1, $t1, 2 - add.d $a1, $a1, $t3 - mul.d $a1, $a1, $s1 + addi.w $s0, $fp, 0 + srai.d $s1, $s0, 63 + andn $s1, $s0, $s1 + slt $s2, $s1, $a7 + maskeqz $s1, $s1, $s2 + masknez $s2, $a7, $s2 + or $s5, $s1, $s2 + slt $s1, $t6, $s0 + masknez $s2, $t6, $s1 + maskeqz $s0, $s0, $s1 + or $s0, $s0, $s2 + addi.w $s0, $s0, 1 + slt $s1, $s0, $a7 + maskeqz $s0, $s0, $s1 + masknez $s1, $a7, $s1 + or $s6, $s0, $s1 + vinsgr2vr.w $vr5, $fp, 0 + vinsgr2vr.w $vr5, $fp, 1 + vmax.w $vr6, $vr5, $vr1 + vadd.w $vr6, $vr6, $vr2 + vmin.w $vr6, $vr6, $vr0 + vmax.w $vr5, $vr5, $vr3 + vadd.w $vr5, $vr5, $vr4 + vmin.w $vr5, $vr5, $vr0 + add.d $s0, $a5, $t4 + vpickve2gr.w $s1, $vr6, 0 + slli.d $s2, $s1, 1 + ldx.hu $s7, $t5, $s2 + vpickve2gr.w $s1, $vr6, 1 + slli.d $s1, $s1, 1 + vpickve2gr.w $s3, $vr5, 0 + slli.d $s3, $s3, 1 + ldx.hu $s8, $t5, $s3 + vpickve2gr.w $s4, $vr5, 1 + slli.d $s4, $s4, 1 + ldx.hu $ra, $t5, $s4 + slli.d $s5, $s5, 1 + ldx.hu $a1, $t5, $s5 + slli.d $s6, $s6, 1 + ldx.hu $t2, $t5, $s6 + add.d $s8, $ra, $s8 + ldx.hu $ra, $t5, $s1 + alsl.d $s8, $s8, $s8, 2 + add.d $a1, $t2, $a1 + mul.d $a1, $a1, $a6 ldptr.w $t2, $a4, 5900 - add.d $t0, $t0, $s2 - sub.d $t0, $t0, $t1 - add.d $a1, $t0, $a1 + add.d $s7, $s7, $ra + sub.d $s7, $s7, $s8 + add.d $a1, $s7, $a1 + addi.w $a1, $a1, 16 + srai.d $s7, $a1, 5 + srai.d $a1, $a1, 63 + andn $a1, $s7, $a1 + slt $s7, $a1, $t2 + maskeqz $a1, $a1, $s7 + masknez $t2, $t2, $s7 + or $a1, $a1, $t2 + stx.w $a1, $a5, $t4 + ldx.hu $a1, $t7, $s3 + ldx.hu $t2, $t7, $s4 + ldx.hu $s7, $t7, $s2 + ldx.hu $s8, $t7, $s5 + ldx.hu $ra, $t7, $s6 + add.d $a1, $t2, $a1 + ldx.hu $t2, $t7, $s1 + alsl.d $a1, $a1, $a1, 2 + add.d $s8, $ra, $s8 + mul.d $s8, $s8, $a6 + ldptr.w $ra, $a4, 5900 + add.d $t2, $s7, $t2 + sub.d $a1, $t2, $a1 + add.d $a1, $a1, $s8 + addi.w $a1, $a1, 16 + srai.d $t2, $a1, 5 + srai.d $a1, $a1, 63 + andn $a1, $t2, $a1 + slt $t2, $a1, $ra + maskeqz $a1, $a1, $t2 + masknez $t2, $ra, $t2 + or $a1, $a1, $t2 + st.w $a1, $s0, 16 + ldx.hu $a1, $a3, $s3 + ldx.hu $t2, $a3, $s4 + ldx.hu $s7, $a3, $s2 + ldx.hu $s8, $a3, $s5 + ldx.hu $ra, $a3, $s6 + add.d $a1, $t2, $a1 + ldx.hu $t2, $a3, $s1 + alsl.d $a1, $a1, $a1, 2 + add.d $s8, $ra, $s8 + mul.d $s8, $s8, $a6 + ldptr.w $ra, $a4, 5900 + add.d $t2, $s7, $t2 + sub.d $a1, $t2, $a1 + add.d $a1, $a1, $s8 + addi.w $a1, $a1, 16 + srai.d $t2, $a1, 5 + srai.d $a1, $a1, 63 + andn $a1, $t2, $a1 + slt $t2, $a1, $ra + maskeqz $a1, $a1, $t2 + masknez $t2, $ra, $t2 + or $a1, $a1, $t2 + st.w $a1, $s0, 32 + ldx.hu $a1, $a0, $s2 + ldx.hu $t2, $a0, $s3 + ldx.hu $s2, $a0, $s4 + ldx.hu $s3, $a0, $s5 + ldx.hu $s4, $a0, $s6 + ldx.hu $s1, $a0, $s1 + add.d $t2, $s2, $t2 + alsl.d $t2, $t2, $t2, 2 + add.d $s2, $s4, $s3 + mul.d $s2, $s2, $a6 + ldptr.w $s3, $a4, 5900 + add.d $a1, $a1, $s1 + sub.d $a1, $a1, $t2 + add.d $a1, $a1, $s2 addi.w $a1, $a1, 16 - srai.d $t0, $a1, 5 + srai.d $t2, $a1, 5 srai.d $a1, $a1, 63 - andn $a1, $t0, $a1 - slt $t0, $a1, $t2 - maskeqz $a1, $a1, $t0 - masknez $t0, $t2, $t0 - or $a1, $a1, $t0 - st.w $a1, $s4, 48 + andn $a1, $t2, $a1 + slt $t2, $a1, $s3 + maskeqz $a1, $a1, $t2 + masknez $t2, $s3, $t2 + or $a1, $a1, $t2 + st.w $a1, $s0, 48 addi.d $t4, $t4, 4 - addi.d $s3, $s3, 1 - bne $t4, $a2, .LBB6_22 + addi.d $fp, $fp, 1 + bne $t4, $t8, .LBB6_22 # %bb.23: pcalau12i $a4, %pc_hi20(get_block.cur_lineY) - ld.d $a1, $sp, 48 # 8-byte Folded Reload - andi $a1, $a1, 1 + andi $a1, $a2, 1 st.d $a0, $a4, %pc_lo12(get_block.cur_lineY) beqz $a1, .LBB6_43 # %bb.24: # %.preheader454 - ld.d $a0, $sp, 40 # 8-byte Folded Reload - srli.d $a0, $a0, 1 - ld.d $a1, $sp, 56 # 8-byte Folded Reload - add.d $a0, $a0, $a1 + srli.d $a0, $t1, 1 + add.d $a0, $a0, $t0 srai.d $a1, $a0, 63 andn $a1, $a0, $a1 slt $a2, $a1, $a7 @@ -3162,619 +3106,577 @@ get_block: # @get_block andn $a0, $a0, $t0 slt $t0, $a0, $a7 maskeqz $a0, $a0, $t0 - ld.d $t1, $sp, 16 # 8-byte Folded Reload - ld.d $t1, $t1, 0 + ld.d $t1, $t3, 0 masknez $a7, $a7, $t0 or $a7, $a0, $a7 slli.d $a0, $a1, 1 - ldx.h $t0, $t1, $a0 - slli.d $a1, $a2, 1 - ldx.h $t2, $t1, $a1 - slli.d $a2, $a6, 1 - ldx.h $t3, $t1, $a2 - slli.d $a6, $a7, 1 - ldx.h $a7, $t1, $a6 + ldx.h $a1, $t1, $a0 + slli.d $a2, $a2, 1 + ldx.h $t0, $t1, $a2 + slli.d $a6, $a6, 1 + ldx.h $t2, $t1, $a6 + slli.d $a7, $a7, 1 + ldx.h $t1, $t1, $a7 vld $vr0, $a5, 0 - vinsgr2vr.h $vr1, $t0, 0 - vinsgr2vr.h $vr1, $t2, 1 - vinsgr2vr.h $vr1, $t3, 2 - vinsgr2vr.h $vr1, $a7, 3 + vinsgr2vr.h $vr1, $a1, 0 + vinsgr2vr.h $vr1, $t0, 1 + vinsgr2vr.h $vr1, $t2, 2 + vinsgr2vr.h $vr1, $t1, 3 vrepli.b $vr2, 0 vilvl.h $vr1, $vr2, $vr1 vadd.w $vr0, $vr0, $vr1 - ld.d $a7, $sp, 24 # 8-byte Folded Reload - ld.d $a7, $a7, 0 + ld.d $a1, $sp, 16 # 8-byte Folded Reload + ld.d $a1, $a1, 0 vaddi.wu $vr0, $vr0, 1 vsrai.w $vr0, $vr0, 1 vst $vr0, $a5, 0 - ldx.h $t0, $a7, $a0 - ldx.h $t1, $a7, $a1 - ldx.h $t2, $a7, $a2 - ldx.h $a7, $a7, $a6 + ldx.h $t0, $a1, $a0 + ldx.h $t1, $a1, $a2 + ldx.h $t2, $a1, $a6 + ldx.h $a1, $a1, $a7 vld $vr0, $a5, 16 vinsgr2vr.h $vr1, $t0, 0 vinsgr2vr.h $vr1, $t1, 1 vinsgr2vr.h $vr1, $t2, 2 - vinsgr2vr.h $vr1, $a7, 3 + vinsgr2vr.h $vr1, $a1, 3 vilvl.h $vr1, $vr2, $vr1 vadd.w $vr0, $vr0, $vr1 vaddi.wu $vr0, $vr0, 1 vsrai.w $vr0, $vr0, 1 vst $vr0, $a5, 16 - ldx.h $a7, $a3, $a0 - ldx.h $t0, $a3, $a1 - ldx.h $t1, $a3, $a2 - ldx.h $a3, $a3, $a6 + ldx.h $a1, $a3, $a0 + ldx.h $t0, $a3, $a2 + ldx.h $t1, $a3, $a6 + ldx.h $a3, $a3, $a7 vld $vr0, $a5, 32 - vinsgr2vr.h $vr1, $a7, 0 + vinsgr2vr.h $vr1, $a1, 0 vinsgr2vr.h $vr1, $t0, 1 vinsgr2vr.h $vr1, $t1, 2 vinsgr2vr.h $vr1, $a3, 3 vilvl.h $vr1, $vr2, $vr1 vadd.w $vr0, $vr0, $vr1 - ld.d $a3, $sp, 32 # 8-byte Folded Reload - ld.d $a3, $a3, 0 + ld.d $a1, $sp, 24 # 8-byte Folded Reload + ld.d $a1, $a1, 0 vaddi.wu $vr0, $vr0, 1 vsrai.w $vr0, $vr0, 1 vst $vr0, $a5, 32 - ldx.h $a0, $a3, $a0 - ldx.h $a1, $a3, $a1 - ldx.h $a2, $a3, $a2 - ldx.h $a6, $a3, $a6 + ldx.h $a0, $a1, $a0 + ldx.h $a2, $a1, $a2 + ldx.h $a3, $a1, $a6 + ldx.h $a6, $a1, $a7 vld $vr0, $a5, 48 vinsgr2vr.h $vr1, $a0, 0 - vinsgr2vr.h $vr1, $a1, 1 - vinsgr2vr.h $vr1, $a2, 2 + vinsgr2vr.h $vr1, $a2, 1 + vinsgr2vr.h $vr1, $a3, 2 vinsgr2vr.h $vr1, $a6, 3 vilvl.h $vr1, $vr2, $vr1 vadd.w $vr0, $vr0, $vr1 vaddi.wu $vr0, $vr0, 1 vsrai.w $vr0, $vr0, 1 vst $vr0, $a5, 48 - st.d $a3, $a4, %pc_lo12(get_block.cur_lineY) + st.d $a1, $a4, %pc_lo12(get_block.cur_lineY) b .LBB6_43 .LBB6_25: # %.preheader467 - st.d $a3, $sp, 40 # 8-byte Folded Spill - st.d $a6, $sp, 32 # 8-byte Folded Spill - move $t6, $zero - srai.d $a2, $s7, 63 - andn $a2, $s7, $a2 - slt $t0, $a2, $a7 - maskeqz $a2, $a2, $t0 - masknez $t0, $a7, $t0 - or $t0, $a2, $t0 - addi.w $t5, $zero, -1 - slt $a2, $t5, $s7 - masknez $t1, $t5, $a2 - maskeqz $a2, $s7, $a2 - or $a2, $a2, $t1 - addi.d $a2, $a2, 1 + move $t3, $zero + srai.d $a2, $t0, 63 + andn $a2, $t0, $a2 slt $t1, $a2, $a7 maskeqz $a2, $a2, $t1 masknez $t1, $a7, $t1 - or $t1, $a2, $t1 - addi.w $t4, $zero, -2 - slt $a2, $t4, $s7 - masknez $t3, $t4, $a2 - maskeqz $a2, $s7, $a2 - or $a2, $a2, $t3 - addi.d $a2, $a2, 2 - slt $t3, $a2, $a7 - maskeqz $a2, $a2, $t3 - masknez $t3, $a7, $t3 - or $t3, $a2, $t3 - addi.w $a2, $zero, -3 - slt $t7, $a2, $s7 - masknez $t8, $a2, $t7 - maskeqz $t7, $s7, $t7 - or $t7, $t7, $t8 - addi.d $t7, $t7, 3 - slt $t8, $t7, $a7 - maskeqz $t7, $t7, $t8 - masknez $a7, $a7, $t8 - or $s0, $t7, $a7 - slli.d $a7, $t0, 1 - ori $fp, $zero, 20 - slli.d $t0, $t1, 1 - slli.d $t1, $t3, 1 - slli.d $t3, $s0, 1 - move $s1, $ra + or $a2, $a2, $t1 + addi.w $t1, $zero, -1 + slt $t4, $t1, $t0 + masknez $t5, $t1, $t4 + maskeqz $t4, $t0, $t4 + or $t4, $t4, $t5 + addi.d $t4, $t4, 1 + slt $t5, $t4, $a7 + maskeqz $t4, $t4, $t5 + masknez $t5, $a7, $t5 + or $t4, $t4, $t5 + vinsgr2vr.w $vr1, $t0, 0 + addi.w $t5, $zero, -2 + vinsgr2vr.w $vr1, $t0, 1 + move $t0, $t5 + lu32i.d $t0, -3 + vreplgr2vr.d $vr0, $t0 + vmax.w $vr2, $vr1, $vr0 + ori $t0, $zero, 2 + ori $t6, $zero, 2 + lu32i.d $t6, 3 + vreplgr2vr.d $vr1, $t6 + vadd.w $vr1, $vr2, $vr1 + vinsgr2vr.w $vr3, $a7, 0 + vinsgr2vr.w $vr3, $a7, 1 + vmin.w $vr4, $vr1, $vr3 + vinsgr2vr.w $vr1, $a1, 0 + vinsgr2vr.w $vr1, $a1, 1 + vpickve2gr.w $t6, $vr4, 0 + vpickve2gr.w $t7, $vr4, 1 + lu32i.d $t0, -3 + vreplgr2vr.d $vr4, $t0 + lu32i.d $t5, 3 + vreplgr2vr.d $vr5, $t5 + ori $a7, $zero, 1 + lu32i.d $a7, -2 + vreplgr2vr.d $vr6, $a7 + move $a7, $t1 + lu32i.d $a7, 2 + vreplgr2vr.d $vr7, $a7 + slli.d $a2, $a2, 1 + ori $t0, $zero, 20 + slli.d $a7, $t4, 1 + slli.d $t4, $t6, 1 + slli.d $t5, $t7, 1 + ori $t6, $zero, 64 + move $t7, $a6 .p2align 4, , 16 .LBB6_26: # =>This Inner Loop Header: Depth=1 - addi.w $s2, $s1, 0 - ori $a3, $zero, 2 - slt $s3, $a3, $s2 - masknez $s4, $a3, $s3 - maskeqz $s3, $s2, $s3 - or $s3, $s3, $s4 - addi.w $s3, $s3, -2 - slt $s4, $s3, $a1 - maskeqz $s3, $s3, $s4 - masknez $s4, $a1, $s4 - or $s3, $s3, $s4 - ori $a3, $zero, 1 - slt $s4, $a3, $s2 - masknez $s5, $a3, $s4 - maskeqz $s4, $s2, $s4 - or $s4, $s4, $s5 - addi.d $s4, $s4, -1 - slt $s5, $s4, $a1 - maskeqz $s4, $s4, $s5 - masknez $s5, $a1, $s5 - or $s5, $s4, $s5 - srai.d $s4, $s2, 63 - andn $s4, $s2, $s4 - slt $s6, $s4, $a1 - maskeqz $s4, $s4, $s6 - masknez $s6, $a1, $s6 - or $s6, $s4, $s6 - slt $s4, $t5, $s2 - masknez $s7, $t5, $s4 - maskeqz $s4, $s2, $s4 - or $s4, $s4, $s7 - addi.w $s4, $s4, 1 - slt $s7, $s4, $a1 - maskeqz $s4, $s4, $s7 - masknez $s7, $a1, $s7 - or $s8, $s4, $s7 - slt $s4, $t4, $s2 - masknez $s7, $t4, $s4 - maskeqz $s4, $s2, $s4 - or $s4, $s4, $s7 - addi.w $s4, $s4, 2 - slt $s7, $s4, $a1 - maskeqz $s4, $s4, $s7 - masknez $s7, $a1, $s7 - or $s7, $s4, $s7 - slt $s4, $a2, $s2 - masknez $ra, $a2, $s4 - maskeqz $s2, $s2, $s4 - or $s2, $s2, $ra - addi.w $s2, $s2, 3 - slt $s4, $s2, $a1 - maskeqz $s2, $s2, $s4 - masknez $s4, $a1, $s4 - or $s2, $s2, $s4 + addi.w $t8, $t7, 0 + srai.d $fp, $t8, 63 + andn $fp, $t8, $fp + slt $s0, $fp, $a1 + maskeqz $fp, $fp, $s0 + masknez $s0, $a1, $s0 + or $s2, $fp, $s0 + slt $fp, $t1, $t8 + masknez $s0, $t1, $fp + maskeqz $t8, $t8, $fp + or $t8, $t8, $s0 + addi.w $t8, $t8, 1 + slt $fp, $t8, $a1 + maskeqz $t8, $t8, $fp + masknez $fp, $a1, $fp + or $t8, $t8, $fp + vinsgr2vr.w $vr8, $t7, 0 + vinsgr2vr.w $vr8, $t7, 1 + vmax.w $vr9, $vr8, $vr4 + vadd.w $vr9, $vr9, $vr5 + vmin.w $vr9, $vr9, $vr1 + vpickve2gr.w $fp, $vr9, 0 + slli.d $fp, $fp, 3 + ldx.d $s0, $a0, $fp + vpickve2gr.w $fp, $vr9, 1 + slli.d $fp, $fp, 3 + ldx.d $fp, $a0, $fp + vmax.w $vr8, $vr8, $vr6 + vadd.w $vr8, $vr8, $vr7 + vmin.w $vr8, $vr8, $vr1 + vpickve2gr.w $s1, $vr8, 0 + slli.d $s1, $s1, 3 + ldx.d $s1, $a0, $s1 + vpickve2gr.w $s3, $vr8, 1 slli.d $s3, $s3, 3 - ldx.d $s4, $a0, $s3 + ldx.d $s3, $a0, $s3 slli.d $s2, $s2, 3 - ldx.d $s3, $a0, $s2 - slli.d $s2, $s5, 3 - ldx.d $s5, $a0, $s2 - slli.d $s2, $s7, 3 - ldx.d $s7, $a0, $s2 - slli.d $s2, $s6, 3 - ldx.d $s6, $a0, $s2 - slli.d $s2, $s8, 3 - ldx.d $s8, $a0, $s2 - add.d $s2, $a5, $t6 + ldx.d $s2, $a0, $s2 + slli.d $t8, $t8, 3 + ldx.d $s4, $a0, $t8 + add.d $t8, $a5, $t3 + ldx.hu $s5, $s1, $a2 + ldx.hu $s6, $s3, $a2 + ldx.hu $s7, $s0, $a2 + ldx.hu $s8, $s2, $a2 + ldx.hu $ra, $s4, $a2 + add.d $s5, $s6, $s5 + ldx.hu $s6, $fp, $a2 + alsl.d $s5, $s5, $s5, 2 + add.d $s8, $ra, $s8 + mul.d $s8, $s8, $t0 + ldptr.w $ra, $a4, 5900 + add.d $s6, $s7, $s6 + sub.d $s5, $s6, $s5 + add.d $s5, $s5, $s8 + addi.w $s5, $s5, 16 + srai.d $s6, $s5, 5 + srai.d $s5, $s5, 63 + andn $s5, $s6, $s5 + slt $s6, $s5, $ra + maskeqz $s5, $s5, $s6 + masknez $s6, $ra, $s6 + or $s5, $s5, $s6 + stx.w $s5, $a5, $t3 + ldx.hu $s5, $s1, $a7 + ldx.hu $s6, $s3, $a7 + ldx.hu $s7, $s0, $a7 + ldx.hu $s8, $s2, $a7 ldx.hu $ra, $s4, $a7 - ldx.hu $t2, $s5, $a7 - ldx.hu $s0, $s7, $a7 - ldx.hu $a3, $s6, $a7 - ldx.hu $a6, $s8, $a7 - ldx.hu $t7, $s3, $a7 - add.d $t2, $s0, $t2 - alsl.d $t2, $t2, $t2, 2 - add.d $a3, $a6, $a3 - mul.d $a3, $a3, $fp - ldptr.w $a6, $a4, 5900 - add.d $t7, $ra, $t7 - sub.d $t2, $t7, $t2 - add.d $a3, $t2, $a3 - addi.w $a3, $a3, 16 - srai.d $t2, $a3, 5 - srai.d $a3, $a3, 63 - andn $a3, $t2, $a3 - slt $t2, $a3, $a6 - maskeqz $a3, $a3, $t2 - masknez $a6, $a6, $t2 - or $a3, $a3, $a6 - stx.w $a3, $a5, $t6 - ldx.hu $a3, $s4, $t0 - ldx.hu $a6, $s5, $t0 - ldx.hu $t2, $s7, $t0 - ldx.hu $t7, $s6, $t0 - ldx.hu $s0, $s8, $t0 - ldx.hu $ra, $s3, $t0 - add.d $a6, $t2, $a6 - alsl.d $a6, $a6, $a6, 2 - add.d $t2, $s0, $t7 - mul.d $t2, $t2, $fp - ldptr.w $t7, $a4, 5900 - add.d $a3, $a3, $ra - sub.d $a3, $a3, $a6 - add.d $a3, $a3, $t2 - addi.w $a3, $a3, 16 - srai.d $a6, $a3, 5 - srai.d $a3, $a3, 63 - andn $a3, $a6, $a3 - slt $a6, $a3, $t7 - maskeqz $a3, $a3, $a6 - masknez $a6, $t7, $a6 - or $a3, $a3, $a6 - ldx.hu $a6, $s4, $t1 - ldx.hu $t2, $s5, $t1 - ldx.hu $t7, $s7, $t1 - ldx.hu $s0, $s6, $t1 - ldx.hu $ra, $s8, $t1 - ldx.hu $t8, $s3, $t1 - st.w $a3, $s2, 4 - add.d $a3, $t7, $t2 - add.d $t2, $ra, $s0 - add.d $a6, $a6, $t8 - alsl.d $a3, $a3, $a3, 2 - mul.d $t2, $t2, $fp - sub.d $a3, $a6, $a3 - ldptr.w $a6, $a4, 5900 - add.d $a3, $a3, $t2 - addi.w $a3, $a3, 16 - srai.d $t2, $a3, 5 - srai.d $a3, $a3, 63 - andn $a3, $t2, $a3 - slt $t2, $a3, $a6 - maskeqz $a3, $a3, $t2 - masknez $a6, $a6, $t2 - or $a3, $a3, $a6 - ldx.hu $a6, $s4, $t3 - ldx.hu $t2, $s5, $t3 - ldx.hu $t7, $s7, $t3 - ldx.hu $t8, $s6, $t3 - ldx.hu $s0, $s8, $t3 - ldx.hu $s3, $s3, $t3 - st.w $a3, $s2, 8 - add.d $a3, $t7, $t2 - add.d $t2, $s0, $t8 - add.d $a6, $a6, $s3 - alsl.d $a3, $a3, $a3, 2 - mul.d $t2, $t2, $fp - sub.d $a3, $a6, $a3 - ldptr.w $a6, $a4, 5900 - add.d $a3, $a3, $t2 - addi.w $a3, $a3, 16 - srai.d $t2, $a3, 5 - srai.d $a3, $a3, 63 - andn $a3, $t2, $a3 - slt $t2, $a3, $a6 - maskeqz $a3, $a3, $t2 - masknez $a6, $a6, $t2 - or $a3, $a3, $a6 - st.w $a3, $s2, 12 - addi.d $t6, $t6, 16 - addi.d $s1, $s1, 1 - ori $a3, $zero, 64 - bne $t6, $a3, .LBB6_26 + add.d $s5, $s6, $s5 + ldx.hu $s6, $fp, $a7 + alsl.d $s5, $s5, $s5, 2 + add.d $s8, $ra, $s8 + mul.d $s8, $s8, $t0 + ldptr.w $ra, $a4, 5900 + add.d $s6, $s7, $s6 + sub.d $s5, $s6, $s5 + add.d $s5, $s5, $s8 + addi.w $s5, $s5, 16 + srai.d $s6, $s5, 5 + srai.d $s5, $s5, 63 + andn $s5, $s6, $s5 + slt $s6, $s5, $ra + maskeqz $s5, $s5, $s6 + masknez $s6, $ra, $s6 + or $s5, $s5, $s6 + st.w $s5, $t8, 4 + ldx.hu $s5, $s1, $t4 + ldx.hu $s6, $s3, $t4 + ldx.hu $s7, $s0, $t4 + ldx.hu $s8, $s2, $t4 + ldx.hu $ra, $s4, $t4 + add.d $s5, $s6, $s5 + ldx.hu $s6, $fp, $t4 + alsl.d $s5, $s5, $s5, 2 + add.d $s8, $ra, $s8 + mul.d $s8, $s8, $t0 + ldptr.w $ra, $a4, 5900 + add.d $s6, $s7, $s6 + sub.d $s5, $s6, $s5 + add.d $s5, $s5, $s8 + addi.w $s5, $s5, 16 + srai.d $s6, $s5, 5 + srai.d $s5, $s5, 63 + andn $s5, $s6, $s5 + slt $s6, $s5, $ra + maskeqz $s5, $s5, $s6 + masknez $s6, $ra, $s6 + or $s5, $s5, $s6 + st.w $s5, $t8, 8 + ldx.hu $s0, $s0, $t5 + ldx.hu $s1, $s1, $t5 + ldx.hu $s3, $s3, $t5 + ldx.hu $s2, $s2, $t5 + ldx.hu $s4, $s4, $t5 + ldx.hu $fp, $fp, $t5 + add.d $s1, $s3, $s1 + alsl.d $s1, $s1, $s1, 2 + add.d $s2, $s4, $s2 + mul.d $s2, $s2, $t0 + ldptr.w $s3, $a4, 5900 + add.d $fp, $s0, $fp + sub.d $fp, $fp, $s1 + add.d $fp, $fp, $s2 + addi.w $fp, $fp, 16 + srai.d $s0, $fp, 5 + srai.d $fp, $fp, 63 + andn $fp, $s0, $fp + slt $s0, $fp, $s3 + maskeqz $fp, $fp, $s0 + masknez $s0, $s3, $s0 + or $fp, $fp, $s0 + st.w $fp, $t8, 12 + addi.d $t3, $t3, 16 + addi.d $t7, $t7, 1 + bne $t3, $t6, .LBB6_26 # %bb.27: - ld.d $a3, $sp, 40 # 8-byte Folded Reload andi $a3, $a3, 1 - ld.d $a4, $sp, 48 # 8-byte Folded Reload beqz $a3, .LBB6_43 # %bb.28: # %.preheader465 - ld.d $a3, $sp, 32 # 8-byte Folded Reload - srli.d $a3, $a3, 1 - add.d $a3, $a3, $a4 - srai.d $a4, $a3, 63 - andn $a4, $a3, $a4 - slt $a6, $a4, $a1 - maskeqz $a4, $a4, $a6 - masknez $a6, $a1, $a6 - or $a4, $a4, $a6 - slli.d $a4, $a4, 3 - ldx.d $a4, $a0, $a4 - ldx.h $a6, $a4, $a7 - ldx.h $t2, $a4, $t0 - ldx.h $t6, $a4, $t1 - ldx.h $a4, $a4, $t3 - vld $vr1, $a5, 0 - vinsgr2vr.h $vr2, $a6, 0 - vinsgr2vr.h $vr2, $t2, 1 - vinsgr2vr.h $vr2, $t6, 2 - vinsgr2vr.h $vr2, $a4, 3 - vrepli.b $vr0, 0 - vilvl.h $vr2, $vr0, $vr2 - vadd.w $vr1, $vr1, $vr2 - slt $a4, $t5, $a3 - maskeqz $a6, $a3, $a4 - masknez $a4, $t5, $a4 - or $a4, $a6, $a4 - addi.d $a4, $a4, 1 - slt $a6, $a4, $a1 - maskeqz $a4, $a4, $a6 - masknez $a6, $a1, $a6 - or $a4, $a4, $a6 - slli.d $a4, $a4, 3 - ldx.d $a4, $a0, $a4 - vaddi.wu $vr1, $vr1, 1 - vsrai.w $vr1, $vr1, 1 - vst $vr1, $a5, 0 - ldx.h $a6, $a4, $a7 - ldx.h $t2, $a4, $t0 - ldx.h $t5, $a4, $t1 - ldx.h $a4, $a4, $t3 - vld $vr1, $a5, 16 - vinsgr2vr.h $vr2, $a6, 0 - vinsgr2vr.h $vr2, $t2, 1 - vinsgr2vr.h $vr2, $t5, 2 - vinsgr2vr.h $vr2, $a4, 3 - vilvl.h $vr2, $vr0, $vr2 - vadd.w $vr1, $vr1, $vr2 - slt $a4, $t4, $a3 - maskeqz $a6, $a3, $a4 - masknez $a4, $t4, $a4 - or $a4, $a6, $a4 - addi.d $a4, $a4, 2 - slt $a6, $a4, $a1 - maskeqz $a4, $a4, $a6 - masknez $a6, $a1, $a6 - or $a4, $a4, $a6 + srli.d $a3, $t2, 1 + add.d $a6, $a3, $a6 + ori $a3, $zero, 2 + lu32i.d $a3, 3 + srai.d $a4, $a6, 63 + andn $a4, $a6, $a4 + slt $t0, $a4, $a1 + maskeqz $a4, $a4, $t0 + masknez $t0, $a1, $t0 + or $a4, $a4, $t0 slli.d $a4, $a4, 3 - ldx.d $a4, $a0, $a4 - vaddi.wu $vr1, $vr1, 1 - vsrai.w $vr1, $vr1, 1 + ldx.d $t0, $a0, $a4 + vreplgr2vr.d $vr4, $a3 + vadd.w $vr2, $vr2, $vr4 + vmin.w $vr2, $vr2, $vr3 + ldx.h $t2, $t0, $a2 + ldx.h $t3, $t0, $a7 + vpickve2gr.w $a3, $vr2, 0 + slli.d $a3, $a3, 1 + ldx.h $t4, $t0, $a3 + vpickve2gr.w $a4, $vr2, 1 + slli.d $a4, $a4, 1 + ldx.h $t0, $t0, $a4 + vld $vr3, $a5, 0 + vinsgr2vr.h $vr5, $t2, 0 + vinsgr2vr.h $vr5, $t3, 1 + vinsgr2vr.h $vr5, $t4, 2 + vinsgr2vr.h $vr5, $t0, 3 + vrepli.b $vr2, 0 + vilvl.h $vr5, $vr2, $vr5 + vadd.w $vr3, $vr3, $vr5 + slt $t0, $t1, $a6 + maskeqz $t2, $a6, $t0 + masknez $t0, $t1, $t0 + or $t0, $t2, $t0 + addi.d $t0, $t0, 1 + slt $t1, $t0, $a1 + maskeqz $t0, $t0, $t1 + masknez $a1, $a1, $t1 + or $a1, $t0, $a1 + slli.d $a1, $a1, 3 + ldx.d $a1, $a0, $a1 + vaddi.wu $vr3, $vr3, 1 + vsrai.w $vr3, $vr3, 1 + vst $vr3, $a5, 0 + ldx.h $t0, $a1, $a2 + ldx.h $t1, $a1, $a7 + ldx.h $t2, $a1, $a3 + ldx.h $a1, $a1, $a4 + vld $vr3, $a5, 16 + vinsgr2vr.h $vr5, $t0, 0 + vinsgr2vr.h $vr5, $t1, 1 + vinsgr2vr.h $vr5, $t2, 2 + vinsgr2vr.h $vr5, $a1, 3 + vilvl.h $vr5, $vr2, $vr5 + vadd.w $vr3, $vr3, $vr5 + vaddi.wu $vr3, $vr3, 1 + vinsgr2vr.w $vr5, $a6, 0 + vinsgr2vr.w $vr5, $a6, 1 + vmax.w $vr0, $vr5, $vr0 + vadd.w $vr0, $vr0, $vr4 + vmin.w $vr0, $vr0, $vr1 + vpickve2gr.w $a1, $vr0, 0 + slli.d $a1, $a1, 3 + ldx.d $a1, $a0, $a1 + vsrai.w $vr1, $vr3, 1 vst $vr1, $a5, 16 - ldx.h $a6, $a4, $a7 - ldx.h $t2, $a4, $t0 - ldx.h $t4, $a4, $t1 - ldx.h $a4, $a4, $t3 vld $vr1, $a5, 32 - vinsgr2vr.h $vr2, $a6, 0 - vinsgr2vr.h $vr2, $t2, 1 - vinsgr2vr.h $vr2, $t4, 2 - vinsgr2vr.h $vr2, $a4, 3 - vilvl.h $vr2, $vr0, $vr2 - vadd.w $vr1, $vr1, $vr2 - slt $a4, $a2, $a3 - maskeqz $a3, $a3, $a4 - masknez $a2, $a2, $a4 - or $a2, $a3, $a2 - addi.d $a2, $a2, 3 - slt $a3, $a2, $a1 - maskeqz $a2, $a2, $a3 - masknez $a1, $a1, $a3 - or $a1, $a2, $a1 + ldx.h $a6, $a1, $a2 + ldx.h $t0, $a1, $a7 + ldx.h $t1, $a1, $a3 + ldx.h $a1, $a1, $a4 + vinsgr2vr.h $vr3, $a6, 0 + vinsgr2vr.h $vr3, $t0, 1 + vinsgr2vr.h $vr3, $t1, 2 + vinsgr2vr.h $vr3, $a1, 3 + vilvl.h $vr3, $vr2, $vr3 + vadd.w $vr1, $vr1, $vr3 + vpickve2gr.w $a1, $vr0, 1 slli.d $a1, $a1, 3 ldx.d $a0, $a0, $a1 - vaddi.wu $vr1, $vr1, 1 - vsrai.w $vr1, $vr1, 1 - vst $vr1, $a5, 32 - ldx.h $a1, $a0, $a7 - ldx.h $a2, $a0, $t0 - ldx.h $a3, $a0, $t1 - ldx.h $a4, $a0, $t3 - vld $vr1, $a5, 48 - vinsgr2vr.h $vr2, $a1, 0 - vinsgr2vr.h $vr2, $a2, 1 - vinsgr2vr.h $vr2, $a3, 2 - vinsgr2vr.h $vr2, $a4, 3 - vilvl.h $vr0, $vr0, $vr2 - vadd.w $vr0, $vr1, $vr0 + vaddi.wu $vr0, $vr1, 1 + vsrai.w $vr0, $vr0, 1 + vst $vr0, $a5, 32 + ldx.h $a1, $a0, $a2 + ldx.h $a2, $a0, $a7 + ldx.h $a3, $a0, $a3 + ldx.h $a4, $a0, $a4 + vld $vr0, $a5, 48 + vinsgr2vr.h $vr1, $a1, 0 + vinsgr2vr.h $vr1, $a2, 1 + vinsgr2vr.h $vr1, $a3, 2 + vinsgr2vr.h $vr1, $a4, 3 + vilvl.h $vr1, $vr2, $vr1 + vadd.w $vr0, $vr0, $vr1 vaddi.wu $vr0, $vr0, 1 vsrai.w $vr0, $vr0, 1 vst $vr0, $a5, 48 b .LBB6_20 .LBB6_29: - st.d $t0, $sp, 40 # 8-byte Folded Spill - bne $a6, $t3, .LBB6_39 + bne $t2, $t3, .LBB6_39 # %bb.30: # %.preheader461.preheader - st.d $a2, $sp, 48 # 8-byte Folded Spill move $a3, $zero - addi.d $t2, $sp, 68 - ori $t3, $zero, 2 - ori $t4, $zero, 1 - addi.w $t5, $zero, -1 - addi.w $t6, $zero, -2 - addi.w $t7, $zero, -3 - ori $t8, $zero, 36 - ori $fp, $zero, 4 + vinsgr2vr.w $vr0, $a1, 0 + vinsgr2vr.w $vr0, $a1, 1 + addi.d $t2, $sp, 36 + addi.w $t3, $zero, -1 + ori $t4, $zero, 2 + ori $t5, $zero, 2 + lu32i.d $t5, -3 + vreplgr2vr.d $vr1, $t5 + addi.w $t5, $zero, -2 + lu32i.d $t5, 3 + vreplgr2vr.d $vr2, $t5 + ori $t5, $zero, 1 + lu32i.d $t5, -2 + vreplgr2vr.d $vr3, $t5 + move $t5, $t3 + lu32i.d $t5, 2 + vreplgr2vr.d $vr4, $t5 + ori $t5, $zero, 36 + ori $t6, $zero, 4 .p2align 4, , 16 .LBB6_31: # %.preheader461 # =>This Loop Header: Depth=1 # Child Loop BB6_32 Depth 2 - add.w $a2, $ra, $a3 - slt $a6, $t3, $a2 - maskeqz $t0, $a2, $a6 - masknez $a6, $t3, $a6 - or $a6, $t0, $a6 - addi.w $a6, $a6, -2 - slt $t0, $a6, $a1 - maskeqz $a6, $a6, $t0 - masknez $t0, $a1, $t0 - or $a6, $a6, $t0 - slt $t0, $t4, $a2 - maskeqz $t1, $a2, $t0 - masknez $t0, $t4, $t0 - or $t0, $t1, $t0 - addi.d $t0, $t0, -1 - slt $t1, $t0, $a1 - maskeqz $t0, $t0, $t1 - masknez $t1, $a1, $t1 - or $t0, $t0, $t1 - srai.d $t1, $a2, 63 - andn $t1, $a2, $t1 - slt $s0, $t1, $a1 - maskeqz $t1, $t1, $s0 - masknez $s0, $a1, $s0 - or $t1, $t1, $s0 - slt $s0, $t5, $a2 - maskeqz $s1, $a2, $s0 - masknez $s0, $t5, $s0 - or $s0, $s1, $s0 - addi.w $s0, $s0, 1 - slt $s1, $s0, $a1 - maskeqz $s0, $s0, $s1 - masknez $s1, $a1, $s1 - or $s5, $s0, $s1 - slt $s0, $t6, $a2 - maskeqz $s1, $a2, $s0 - masknez $s0, $t6, $s0 - or $s0, $s1, $s0 - addi.w $s0, $s0, 2 - slt $s1, $s0, $a1 - maskeqz $s0, $s0, $s1 - masknez $s1, $a1, $s1 - or $s3, $s0, $s1 - slt $s0, $t7, $a2 - maskeqz $a2, $a2, $s0 - masknez $s0, $t7, $s0 - or $a2, $a2, $s0 - addi.w $a2, $a2, 3 - slt $s0, $a2, $a1 - maskeqz $a2, $a2, $s0 - masknez $s0, $a1, $s0 - or $a2, $a2, $s0 - slli.d $a6, $a6, 3 - ldx.d $s0, $a0, $a6 - slli.d $a2, $a2, 3 - ldx.d $s1, $a0, $a2 - slli.d $a2, $t0, 3 - ldx.d $s2, $a0, $a2 - slli.d $a2, $s3, 3 - ldx.d $s3, $a0, $a2 - slli.d $a2, $t1, 3 - ldx.d $s4, $a0, $a2 - slli.d $a2, $s5, 3 - ldx.d $s5, $a0, $a2 - move $s6, $zero + add.w $t7, $a6, $a3 + srai.d $t8, $t7, 63 + andn $t8, $t7, $t8 + slt $fp, $t8, $a1 + maskeqz $t8, $t8, $fp + masknez $fp, $a1, $fp + or $s1, $t8, $fp + slt $t8, $t3, $t7 + maskeqz $fp, $t7, $t8 + masknez $t8, $t3, $t8 + or $t8, $fp, $t8 + addi.w $t8, $t8, 1 + slt $fp, $t8, $a1 + maskeqz $t8, $t8, $fp + masknez $fp, $a1, $fp + or $s2, $t8, $fp + vinsgr2vr.w $vr5, $t7, 0 + vinsgr2vr.w $vr5, $t7, 1 + vmax.w $vr6, $vr5, $vr1 + vadd.w $vr6, $vr6, $vr2 + vmin.w $vr6, $vr6, $vr0 + vpickve2gr.w $t7, $vr6, 0 + slli.d $t7, $t7, 3 + ldx.d $t7, $a0, $t7 + vpickve2gr.w $t8, $vr6, 1 + slli.d $t8, $t8, 3 + ldx.d $t8, $a0, $t8 + vmax.w $vr5, $vr5, $vr3 + vadd.w $vr5, $vr5, $vr4 + vmin.w $vr5, $vr5, $vr0 + vpickve2gr.w $fp, $vr5, 0 + slli.d $fp, $fp, 3 + ldx.d $fp, $a0, $fp + vpickve2gr.w $s0, $vr5, 1 + slli.d $s0, $s0, 3 + ldx.d $s0, $a0, $s0 + slli.d $s1, $s1, 3 + ldx.d $s1, $a0, $s1 + slli.d $s2, $s2, 3 + ldx.d $s2, $a0, $s2 + move $s3, $zero + move $s4, $t0 .p2align 4, , 16 .LBB6_32: # Parent Loop BB6_31 Depth=1 # => This Inner Loop Header: Depth=2 - addi.w $a2, $s7, 0 - slt $a6, $t3, $a2 - masknez $t0, $t3, $a6 - maskeqz $a2, $a2, $a6 - or $a2, $a2, $t0 - addi.w $a2, $a2, -2 - slt $a6, $a2, $a7 - maskeqz $a2, $a2, $a6 - masknez $a6, $a7, $a6 - or $a2, $a2, $a6 - slli.d $a2, $a2, 1 - ldx.hu $a6, $s0, $a2 - ldx.hu $t0, $s1, $a2 - ldx.hu $t1, $s2, $a2 - ldx.hu $s8, $s3, $a2 - add.d $a6, $t0, $a6 - ldx.hu $t0, $s4, $a2 - ldx.hu $a2, $s5, $a2 - add.d $t1, $s8, $t1 - alsl.d $t1, $t1, $t1, 2 - sub.d $a6, $a6, $t1 - add.d $a2, $a2, $t0 - slli.d $t0, $a2, 4 - alsl.d $a2, $a2, $t0, 2 - add.d $a2, $a2, $a6 - stx.w $a2, $t2, $s6 - addi.d $s6, $s6, 4 - addi.d $s7, $s7, 1 - bne $s6, $t8, .LBB6_32 + addi.w $s5, $s4, 0 + slt $s6, $t4, $s5 + masknez $s7, $t4, $s6 + maskeqz $s5, $s5, $s6 + or $s5, $s5, $s7 + addi.w $s5, $s5, -2 + slt $s6, $s5, $a7 + maskeqz $s5, $s5, $s6 + masknez $s6, $a7, $s6 + or $s5, $s5, $s6 + slli.d $s5, $s5, 1 + ldx.hu $s6, $t7, $s5 + ldx.hu $s7, $t8, $s5 + ldx.hu $s8, $fp, $s5 + ldx.hu $ra, $s0, $s5 + add.d $s6, $s7, $s6 + ldx.hu $s7, $s1, $s5 + ldx.hu $s5, $s2, $s5 + add.d $s8, $ra, $s8 + alsl.d $s8, $s8, $s8, 2 + sub.d $s6, $s6, $s8 + add.d $s5, $s5, $s7 + slli.d $s7, $s5, 4 + alsl.d $s5, $s5, $s7, 2 + add.d $s5, $s5, $s6 + stx.w $s5, $t2, $s3 + addi.d $s3, $s3, 4 + addi.d $s4, $s4, 1 + bne $s3, $t5, .LBB6_32 # %bb.33: # in Loop: Header=BB6_31 Depth=1 addi.d $a3, $a3, 1 addi.d $t2, $t2, 36 - ld.d $s7, $sp, 56 # 8-byte Folded Reload - bne $a3, $fp, .LBB6_31 + bne $a3, $t6, .LBB6_31 # %bb.34: # %.preheader460 move $a0, $zero - addi.d $a1, $sp, 84 + addi.d $a1, $sp, 52 ori $a3, $zero, 20 ori $a6, $zero, 64 .p2align 4, , 16 .LBB6_35: # %.preheader459 # =>This Inner Loop Header: Depth=1 add.d $a7, $a5, $a0 - ld.w $a2, $a1, -16 - ld.w $t1, $a1, -12 + ld.w $t4, $a1, -16 + ld.w $t5, $a1, -12 ld.w $t2, $a1, 0 ld.w $t0, $a1, 4 - ld.w $t4, $a1, -8 + ld.w $t6, $a1, -8 ld.w $t3, $a1, -4 - add.d $t5, $t2, $t1 - addi.w $t6, $t5, 0 - alsl.d $t5, $t6, $t5, 2 - add.d $t6, $t3, $t4 - mul.d $t6, $t6, $a3 - ldptr.w $t7, $a4, 5900 - add.d $a2, $a2, $t0 - sub.d $a2, $a2, $t5 - add.d $a2, $a2, $t6 - addi.w $a2, $a2, 512 - srai.d $t5, $a2, 10 - srai.d $a2, $a2, 63 - andn $a2, $t5, $a2 - slt $t5, $a2, $t7 - maskeqz $a2, $a2, $t5 - masknez $t5, $t7, $t5 - or $a2, $a2, $t5 - stx.w $a2, $a5, $a0 - ld.w $a2, $a1, 8 - add.d $t5, $t0, $t4 - addi.w $t6, $t5, 0 - alsl.d $t5, $t6, $t5, 2 - add.d $t6, $t2, $t3 - mul.d $t6, $t6, $a3 - ldptr.w $t7, $a4, 5900 - add.d $t1, $t1, $a2 - sub.d $t1, $t1, $t5 - add.d $t1, $t1, $t6 - addi.w $t1, $t1, 512 - srai.d $t5, $t1, 10 - srai.d $t1, $t1, 63 - andn $t1, $t5, $t1 - slt $t5, $t1, $t7 - maskeqz $t1, $t1, $t5 - masknez $t5, $t7, $t5 - or $t1, $t1, $t5 - st.w $t1, $a7, 4 - ld.w $t1, $a1, 12 - add.d $t5, $a2, $t3 - addi.w $t6, $t5, 0 - alsl.d $t5, $t6, $t5, 2 - add.d $t6, $t0, $t2 - mul.d $t6, $t6, $a3 - ldptr.w $t7, $a4, 5900 - add.d $t4, $t4, $t1 - sub.d $t4, $t4, $t5 - add.d $t4, $t4, $t6 + add.d $t7, $t2, $t5 + addi.w $t8, $t7, 0 + alsl.d $t7, $t8, $t7, 2 + add.d $t8, $t3, $t6 + mul.d $t8, $t8, $a3 + ldptr.w $fp, $a4, 5900 + add.d $t4, $t4, $t0 + sub.d $t4, $t4, $t7 + add.d $t4, $t4, $t8 addi.w $t4, $t4, 512 - srai.d $t5, $t4, 10 + srai.d $t7, $t4, 10 srai.d $t4, $t4, 63 - andn $t4, $t5, $t4 - slt $t5, $t4, $t7 - maskeqz $t4, $t4, $t5 - masknez $t5, $t7, $t5 - or $t4, $t4, $t5 - st.w $t4, $a7, 8 - ld.w $t4, $a1, 16 - add.d $t1, $t1, $t2 - addi.w $t2, $t1, 0 - alsl.d $t1, $t2, $t1, 2 - add.d $a2, $a2, $t0 - mul.d $a2, $a2, $a3 - ldptr.w $t0, $a4, 5900 - add.d $t2, $t3, $t4 - sub.d $t1, $t2, $t1 - add.d $a2, $t1, $a2 - addi.w $a2, $a2, 512 - srai.d $t1, $a2, 10 - srai.d $a2, $a2, 63 - andn $a2, $t1, $a2 - slt $t1, $a2, $t0 - maskeqz $a2, $a2, $t1 - masknez $t0, $t0, $t1 - or $a2, $a2, $t0 - st.w $a2, $a7, 12 + andn $t4, $t7, $t4 + slt $t7, $t4, $fp + maskeqz $t4, $t4, $t7 + masknez $t7, $fp, $t7 + or $t4, $t4, $t7 + stx.w $t4, $a5, $a0 + ld.w $t4, $a1, 8 + add.d $t7, $t0, $t6 + addi.w $t8, $t7, 0 + alsl.d $t7, $t8, $t7, 2 + add.d $t8, $t2, $t3 + mul.d $t8, $t8, $a3 + ldptr.w $fp, $a4, 5900 + add.d $t5, $t5, $t4 + sub.d $t5, $t5, $t7 + add.d $t5, $t5, $t8 + addi.w $t5, $t5, 512 + srai.d $t7, $t5, 10 + srai.d $t5, $t5, 63 + andn $t5, $t7, $t5 + slt $t7, $t5, $fp + maskeqz $t5, $t5, $t7 + masknez $t7, $fp, $t7 + or $t5, $t5, $t7 + st.w $t5, $a7, 4 + ld.w $t5, $a1, 12 + add.d $t7, $t4, $t3 + addi.w $t8, $t7, 0 + alsl.d $t7, $t8, $t7, 2 + add.d $t8, $t0, $t2 + mul.d $t8, $t8, $a3 + ldptr.w $fp, $a4, 5900 + add.d $t6, $t6, $t5 + sub.d $t6, $t6, $t7 + add.d $t6, $t6, $t8 + addi.w $t6, $t6, 512 + srai.d $t7, $t6, 10 + srai.d $t6, $t6, 63 + andn $t6, $t7, $t6 + slt $t7, $t6, $fp + maskeqz $t6, $t6, $t7 + masknez $t7, $fp, $t7 + or $t6, $t6, $t7 + st.w $t6, $a7, 8 + ld.w $t6, $a1, 16 + add.d $t2, $t5, $t2 + addi.w $t5, $t2, 0 + alsl.d $t2, $t5, $t2, 2 + add.d $t0, $t4, $t0 + mul.d $t0, $t0, $a3 + ldptr.w $t4, $a4, 5900 + add.d $t3, $t3, $t6 + sub.d $t2, $t3, $t2 + add.d $t0, $t2, $t0 + addi.w $t0, $t0, 512 + srai.d $t2, $t0, 10 + srai.d $t0, $t0, 63 + andn $t0, $t2, $t0 + slt $t2, $t0, $t4 + maskeqz $t0, $t0, $t2 + masknez $t2, $t4, $t2 + or $t0, $t0, $t2 + st.w $t0, $a7, 12 addi.d $a0, $a0, 16 addi.d $a1, $a1, 36 bne $a0, $a6, .LBB6_35 # %bb.36: - ld.d $a0, $sp, 48 # 8-byte Folded Reload - andi $a0, $a0, 1 + andi $a0, $a2, 1 beqz $a0, .LBB6_43 # %bb.37: # %.preheader457 move $a0, $zero - ld.d $a1, $sp, 40 # 8-byte Folded Reload - slli.d $a1, $a1, 1 + slli.d $a1, $t1, 1 andi $a1, $a1, 4 - addi.d $a2, $sp, 68 + addi.d $a2, $sp, 36 add.d $a1, $a1, $a2 addi.d $a1, $a1, 12 addi.d $a2, $a5, 8 @@ -3847,457 +3749,411 @@ get_block: # @get_block bne $a0, $a3, .LBB6_38 b .LBB6_43 .LBB6_39: # %.preheader464 - move $t3, $zero - addi.d $a2, $a6, -1 + st.d $t1, $sp, 24 # 8-byte Folded Spill + move $a3, $zero + addi.d $a2, $t2, -1 sltu $a2, $zero, $a2 - add.d $a6, $ra, $a2 - srai.d $a2, $a6, 63 - andn $a2, $a6, $a2 - slt $a3, $a2, $a1 - maskeqz $a2, $a2, $a3 - masknez $a3, $a1, $a3 - or $a2, $a2, $a3 + add.d $t2, $a6, $a2 + srai.d $a2, $t2, 63 + andn $a2, $t2, $a2 + slt $t3, $a2, $a1 + maskeqz $a2, $a2, $t3 + masknez $t3, $a1, $t3 + or $a2, $a2, $t3 slli.d $a2, $a2, 3 - ldx.d $t5, $a0, $a2 + ldx.d $t3, $a0, $a2 addi.w $a2, $zero, -1 - slt $a3, $a2, $a6 - maskeqz $t0, $a6, $a3 - masknez $a3, $a2, $a3 - or $a3, $t0, $a3 - addi.d $a3, $a3, 1 - slt $t0, $a3, $a1 - maskeqz $a3, $a3, $t0 - masknez $t0, $a1, $t0 - or $a3, $a3, $t0 - slli.d $a3, $a3, 3 - ldx.d $t6, $a0, $a3 - addi.w $a3, $zero, -2 - slt $t0, $a3, $a6 - maskeqz $t1, $a6, $t0 - masknez $t0, $a3, $t0 - or $t0, $t1, $t0 - addi.d $t0, $t0, 2 - slt $t1, $t0, $a1 - maskeqz $t0, $t0, $t1 - masknez $t1, $a1, $t1 - or $t0, $t0, $t1 - slli.d $t0, $t0, 3 - ldx.d $t7, $a0, $t0 - addi.w $t2, $zero, -3 - slt $t0, $t2, $a6 - maskeqz $a6, $a6, $t0 - masknez $t0, $t2, $t0 - or $a6, $a6, $t0 - addi.d $a6, $a6, 3 - slt $t0, $a6, $a1 - maskeqz $a6, $a6, $t0 - masknez $t0, $a1, $t0 - or $a6, $a6, $t0 - slli.d $a6, $a6, 3 - ldx.d $t4, $a0, $a6 - ori $fp, $zero, 1 - ori $s0, $zero, 20 - move $s2, $s7 + slt $t4, $a2, $t2 + maskeqz $t5, $t2, $t4 + masknez $t4, $a2, $t4 + or $t4, $t5, $t4 + addi.d $t4, $t4, 1 + slt $t5, $t4, $a1 + maskeqz $t4, $t4, $t5 + masknez $t5, $a1, $t5 + or $t4, $t4, $t5 + slli.d $t4, $t4, 3 + ldx.d $t4, $a0, $t4 + vinsgr2vr.w $vr0, $t2, 0 + addi.w $t6, $zero, -2 + vinsgr2vr.w $vr0, $t2, 1 + move $t2, $t6 + lu32i.d $t2, -3 + vreplgr2vr.d $vr3, $t2 + vmax.w $vr0, $vr0, $vr3 + ori $t7, $zero, 2 + ori $t2, $zero, 2 + lu32i.d $t2, 3 + vreplgr2vr.d $vr1, $t2 + vadd.w $vr1, $vr0, $vr1 + vinsgr2vr.w $vr0, $a1, 0 + vinsgr2vr.w $vr0, $a1, 1 + vmin.w $vr1, $vr1, $vr0 + vpickve2gr.w $t2, $vr1, 0 + slli.d $t2, $t2, 3 + ldx.d $t5, $a0, $t2 + vpickve2gr.w $t2, $vr1, 1 + slli.d $t2, $t2, 3 + ldx.d $t2, $a0, $t2 + vinsgr2vr.w $vr4, $a7, 0 + vinsgr2vr.w $vr4, $a7, 1 + lu32i.d $t7, -3 + vreplgr2vr.d $vr5, $t7 + lu32i.d $t6, 3 + vreplgr2vr.d $vr1, $t6 + ori $t6, $zero, 1 + lu32i.d $t6, -2 + vreplgr2vr.d $vr6, $t6 + move $t6, $a2 + lu32i.d $t6, 2 + vreplgr2vr.d $vr2, $t6 + ori $t6, $zero, 20 + ori $t7, $zero, 16 + move $t8, $t0 .p2align 4, , 16 .LBB6_40: # =>This Inner Loop Header: Depth=1 - addi.w $a6, $s2, 0 - ori $t1, $zero, 2 - slt $t0, $t1, $a6 - masknez $t1, $t1, $t0 - maskeqz $t0, $a6, $t0 - or $t0, $t0, $t1 - addi.w $t0, $t0, -2 - slt $t1, $t0, $a7 - maskeqz $t0, $t0, $t1 - masknez $t1, $a7, $t1 - or $t0, $t0, $t1 - slt $t1, $fp, $a6 - masknez $s3, $fp, $t1 - maskeqz $t1, $a6, $t1 - or $t1, $t1, $s3 - addi.d $t1, $t1, -1 - slt $s3, $t1, $a7 - maskeqz $t1, $t1, $s3 - masknez $s3, $a7, $s3 - or $t1, $t1, $s3 - srai.d $s3, $a6, 63 - andn $s3, $a6, $s3 - slt $s4, $s3, $a7 - maskeqz $s3, $s3, $s4 - masknez $s4, $a7, $s4 - or $s8, $s3, $s4 - slt $s3, $a2, $a6 - masknez $s4, $a2, $s3 - maskeqz $s3, $a6, $s3 - or $s3, $s3, $s4 - addi.w $s3, $s3, 1 - slt $s4, $s3, $a7 - maskeqz $s3, $s3, $s4 - masknez $s4, $a7, $s4 - or $ra, $s3, $s4 - slt $s3, $a3, $a6 - masknez $s4, $a3, $s3 - maskeqz $s3, $a6, $s3 - or $s3, $s3, $s4 - addi.w $s3, $s3, 2 - slt $s4, $s3, $a7 - maskeqz $s3, $s3, $s4 - masknez $s4, $a7, $s4 - or $s7, $s3, $s4 - slt $s3, $t2, $a6 - masknez $s4, $t2, $s3 - maskeqz $a6, $a6, $s3 - or $a6, $a6, $s4 - addi.w $a6, $a6, 3 - slt $s3, $a6, $a7 - maskeqz $a6, $a6, $s3 - masknez $s3, $a7, $s3 - or $a6, $a6, $s3 - add.d $s3, $a5, $t3 - slli.d $s5, $t0, 1 - ldx.hu $t0, $t5, $s5 - slli.d $s4, $a6, 1 - slli.d $s6, $t1, 1 - ldx.hu $a6, $t5, $s6 - slli.d $s7, $s7, 1 - ldx.hu $t1, $t5, $s7 - slli.d $s8, $s8, 1 - ldx.hu $s1, $t5, $s8 - slli.d $ra, $ra, 1 - ldx.hu $t8, $t5, $ra - add.d $a6, $t1, $a6 - ldx.hu $t1, $t5, $s4 - alsl.d $a6, $a6, $a6, 2 - add.d $t8, $t8, $s1 - mul.d $t8, $t8, $s0 - ldptr.w $s1, $a4, 5900 - add.d $t0, $t0, $t1 - sub.d $a6, $t0, $a6 - add.d $a6, $a6, $t8 - addi.w $a6, $a6, 16 - srai.d $t0, $a6, 5 - srai.d $a6, $a6, 63 - andn $a6, $t0, $a6 - slt $t0, $a6, $s1 - maskeqz $a6, $a6, $t0 - masknez $t0, $s1, $t0 - or $a6, $a6, $t0 - stx.w $a6, $a5, $t3 - ldx.hu $a6, $t6, $s6 - ldx.hu $t0, $t6, $s7 - ldx.hu $t1, $t6, $s5 - ldx.hu $t8, $t6, $s8 - ldx.hu $s1, $t6, $ra - add.d $a6, $t0, $a6 - ldx.hu $t0, $t6, $s4 - alsl.d $a6, $a6, $a6, 2 - add.d $t8, $s1, $t8 - mul.d $t8, $t8, $s0 - ldptr.w $s1, $a4, 5900 - add.d $t0, $t1, $t0 - sub.d $a6, $t0, $a6 - add.d $a6, $a6, $t8 - addi.w $a6, $a6, 16 - srai.d $t0, $a6, 5 - srai.d $a6, $a6, 63 - andn $a6, $t0, $a6 - slt $t0, $a6, $s1 - maskeqz $a6, $a6, $t0 - masknez $t0, $s1, $t0 - or $a6, $a6, $t0 - st.w $a6, $s3, 16 - ldx.hu $a6, $t7, $s6 - ldx.hu $t0, $t7, $s7 - ldx.hu $t1, $t7, $s5 - ldx.hu $t8, $t7, $s8 - ldx.hu $s1, $t7, $ra - add.d $a6, $t0, $a6 - ldx.hu $t0, $t7, $s4 - alsl.d $a6, $a6, $a6, 2 - add.d $t8, $s1, $t8 - mul.d $t8, $t8, $s0 - ldptr.w $s1, $a4, 5900 - add.d $t0, $t1, $t0 - sub.d $a6, $t0, $a6 - add.d $a6, $a6, $t8 - addi.w $a6, $a6, 16 - srai.d $t0, $a6, 5 - srai.d $a6, $a6, 63 - andn $a6, $t0, $a6 - slt $t0, $a6, $s1 - maskeqz $a6, $a6, $t0 - masknez $t0, $s1, $t0 - or $a6, $a6, $t0 - st.w $a6, $s3, 32 - ldx.hu $a6, $t4, $s5 - ldx.hu $t0, $t4, $s6 - ldx.hu $t1, $t4, $s7 - ldx.hu $t8, $t4, $s8 - ldx.hu $s1, $t4, $ra - ldx.hu $s4, $t4, $s4 - add.d $t0, $t1, $t0 - alsl.d $t0, $t0, $t0, 2 - add.d $t1, $s1, $t8 - mul.d $t1, $t1, $s0 - ldptr.w $t8, $a4, 5900 - add.d $a6, $a6, $s4 - sub.d $a6, $a6, $t0 - add.d $a6, $a6, $t1 - addi.w $a6, $a6, 16 - srai.d $t0, $a6, 5 - srai.d $a6, $a6, 63 - andn $a6, $t0, $a6 - slt $t0, $a6, $t8 - maskeqz $a6, $a6, $t0 - masknez $t0, $t8, $t0 - or $a6, $a6, $t0 - st.w $a6, $s3, 48 - addi.d $t3, $t3, 4 - addi.d $s2, $s2, 1 - ori $a6, $zero, 16 - bne $t3, $a6, .LBB6_40 -# %bb.41: # %.preheader462 - move $t3, $zero - pcalau12i $a6, %pc_hi20(get_block.cur_lineY) - st.d $t4, $a6, %pc_lo12(get_block.cur_lineY) - ld.d $a6, $sp, 40 # 8-byte Folded Reload - addi.d $a6, $a6, -1 - sltu $a6, $zero, $a6 - ld.d $t0, $sp, 56 # 8-byte Folded Reload - add.d $a6, $t0, $a6 - srai.d $t0, $a6, 63 - andn $t0, $a6, $t0 - slt $t1, $t0, $a7 - maskeqz $t0, $t0, $t1 - masknez $t1, $a7, $t1 - or $t1, $t0, $t1 - slt $t0, $a2, $a6 - maskeqz $t4, $a6, $t0 - masknez $t0, $a2, $t0 - or $t0, $t4, $t0 - addi.d $t0, $t0, 1 - slt $t4, $t0, $a7 - maskeqz $t0, $t0, $t4 - masknez $t4, $a7, $t4 - or $t5, $t0, $t4 - slt $t0, $a3, $a6 - maskeqz $t4, $a6, $t0 - masknez $t0, $a3, $t0 - or $t0, $t4, $t0 - addi.d $t0, $t0, 2 - slt $t4, $t0, $a7 - maskeqz $t0, $t0, $t4 - masknez $t4, $a7, $t4 - or $t6, $t0, $t4 - slt $t0, $t2, $a6 - maskeqz $a6, $a6, $t0 - masknez $t0, $t2, $t0 - or $a6, $a6, $t0 - addi.d $a6, $a6, 3 - slt $t0, $a6, $a7 - maskeqz $a6, $a6, $t0 - masknez $a7, $a7, $t0 - or $a6, $a6, $a7 - ori $a7, $zero, 2 - ori $t0, $zero, 1 - slli.d $t1, $t1, 1 - ori $t4, $zero, 20 - slli.d $t5, $t5, 1 - slli.d $t6, $t6, 1 - slli.d $t7, $a6, 1 - ld.d $t8, $sp, 48 # 8-byte Folded Reload - .p2align 4, , 16 -.LBB6_42: # =>This Inner Loop Header: Depth=1 - addi.w $a6, $t8, 0 - slt $fp, $a7, $a6 - masknez $s0, $a7, $fp - maskeqz $fp, $a6, $fp - or $fp, $fp, $s0 - addi.w $fp, $fp, -2 - slt $s0, $fp, $a1 - maskeqz $fp, $fp, $s0 - masknez $s0, $a1, $s0 - or $fp, $fp, $s0 - slt $s0, $t0, $a6 - masknez $s1, $t0, $s0 - maskeqz $s0, $a6, $s0 - or $s0, $s0, $s1 - addi.d $s0, $s0, -1 - slt $s1, $s0, $a1 + addi.w $fp, $t8, 0 + srai.d $s0, $fp, 63 + andn $s0, $fp, $s0 + slt $s1, $s0, $a7 maskeqz $s0, $s0, $s1 - masknez $s1, $a1, $s1 - or $s2, $s0, $s1 - srai.d $s0, $a6, 63 - andn $s0, $a6, $s0 - slt $s1, $s0, $a1 - maskeqz $s0, $s0, $s1 - masknez $s1, $a1, $s1 + masknez $s1, $a7, $s1 or $s4, $s0, $s1 - slt $s0, $a2, $a6 + slt $s0, $a2, $fp masknez $s1, $a2, $s0 - maskeqz $s0, $a6, $s0 - or $s0, $s0, $s1 - addi.w $s0, $s0, 1 - slt $s1, $s0, $a1 - maskeqz $s0, $s0, $s1 - masknez $s1, $a1, $s1 - or $s6, $s0, $s1 - slt $s0, $a3, $a6 - masknez $s1, $a3, $s0 - maskeqz $s0, $a6, $s0 - or $s0, $s0, $s1 - addi.w $s0, $s0, 2 - slt $s1, $s0, $a1 - maskeqz $s0, $s0, $s1 - masknez $s1, $a1, $s1 - or $s5, $s0, $s1 - slt $s0, $t2, $a6 - masknez $s1, $t2, $s0 - maskeqz $a6, $a6, $s0 - or $a6, $a6, $s1 - addi.w $a6, $a6, 3 - slt $s0, $a6, $a1 - maskeqz $a6, $a6, $s0 - masknez $s0, $a1, $s0 - or $a6, $a6, $s0 - slli.d $fp, $fp, 3 - ldx.d $s0, $a0, $fp - slli.d $a6, $a6, 3 - ldx.d $s1, $a0, $a6 - slli.d $a6, $s2, 3 - ldx.d $s3, $a0, $a6 - slli.d $a6, $s5, 3 - ldx.d $s5, $a0, $a6 - slli.d $a6, $s4, 3 - ldx.d $s2, $a0, $a6 - slli.d $a6, $s6, 3 - ldx.d $s4, $a0, $a6 - add.d $fp, $a5, $t3 - ldx.hu $a6, $s3, $t1 - ldx.hu $s6, $s5, $t1 - ldx.hu $s7, $s0, $t1 - ldx.hu $s8, $s2, $t1 - ldx.hu $ra, $s4, $t1 - add.d $a6, $s6, $a6 - ldx.hu $s6, $s1, $t1 - alsl.d $a6, $a6, $a6, 2 + maskeqz $fp, $fp, $s0 + or $fp, $fp, $s1 + addi.w $fp, $fp, 1 + slt $s0, $fp, $a7 + maskeqz $fp, $fp, $s0 + masknez $s0, $a7, $s0 + or $s5, $fp, $s0 + vinsgr2vr.w $vr7, $t8, 0 + vinsgr2vr.w $vr7, $t8, 1 + vmax.w $vr8, $vr7, $vr5 + vadd.w $vr8, $vr8, $vr1 + vmin.w $vr8, $vr8, $vr4 + vmax.w $vr7, $vr7, $vr6 + vadd.w $vr7, $vr7, $vr2 + vmin.w $vr7, $vr7, $vr4 + add.d $fp, $a5, $a3 + vpickve2gr.w $s0, $vr8, 0 + slli.d $s1, $s0, 1 + ldx.hu $s6, $t3, $s1 + vpickve2gr.w $s0, $vr8, 1 + slli.d $s0, $s0, 1 + vpickve2gr.w $s2, $vr7, 0 + slli.d $s2, $s2, 1 + ldx.hu $s7, $t3, $s2 + vpickve2gr.w $s3, $vr7, 1 + slli.d $s3, $s3, 1 + ldx.hu $s8, $t3, $s3 + slli.d $s4, $s4, 1 + ldx.hu $ra, $t3, $s4 + slli.d $s5, $s5, 1 + ldx.hu $t1, $t3, $s5 + add.d $s7, $s8, $s7 + ldx.hu $s8, $t3, $s0 + alsl.d $s7, $s7, $s7, 2 + add.d $t1, $t1, $ra + mul.d $t1, $t1, $t6 + ldptr.w $ra, $a4, 5900 + add.d $s6, $s6, $s8 + sub.d $s6, $s6, $s7 + add.d $t1, $s6, $t1 + addi.w $t1, $t1, 16 + srai.d $s6, $t1, 5 + srai.d $t1, $t1, 63 + andn $t1, $s6, $t1 + slt $s6, $t1, $ra + maskeqz $t1, $t1, $s6 + masknez $s6, $ra, $s6 + or $t1, $t1, $s6 + stx.w $t1, $a5, $a3 + ldx.hu $t1, $t4, $s2 + ldx.hu $s6, $t4, $s3 + ldx.hu $s7, $t4, $s1 + ldx.hu $s8, $t4, $s4 + ldx.hu $ra, $t4, $s5 + add.d $t1, $s6, $t1 + ldx.hu $s6, $t4, $s0 + alsl.d $t1, $t1, $t1, 2 add.d $s8, $ra, $s8 - mul.d $s8, $s8, $t4 - ldx.w $ra, $a5, $t3 + mul.d $s8, $s8, $t6 + ldptr.w $ra, $a4, 5900 add.d $s6, $s7, $s6 - ldptr.w $s7, $a4, 5900 - sub.d $a6, $s6, $a6 - add.d $a6, $a6, $s8 - addi.w $a6, $a6, 16 - srai.d $s6, $a6, 5 - srai.d $a6, $a6, 63 - andn $a6, $s6, $a6 - slt $s6, $a6, $s7 - maskeqz $a6, $a6, $s6 - masknez $s6, $s7, $s6 - or $a6, $a6, $s6 - add.d $a6, $ra, $a6 - addi.w $a6, $a6, 1 - srli.d $a6, $a6, 1 - stx.w $a6, $a5, $t3 - ldx.hu $a6, $s3, $t5 - ldx.hu $s6, $s5, $t5 - ldx.hu $s7, $s0, $t5 - ldx.hu $s8, $s2, $t5 - ldx.hu $ra, $s4, $t5 - add.d $a6, $s6, $a6 - ldx.hu $s6, $s1, $t5 - alsl.d $a6, $a6, $a6, 2 + sub.d $t1, $s6, $t1 + add.d $t1, $t1, $s8 + addi.w $t1, $t1, 16 + srai.d $s6, $t1, 5 + srai.d $t1, $t1, 63 + andn $t1, $s6, $t1 + slt $s6, $t1, $ra + maskeqz $t1, $t1, $s6 + masknez $s6, $ra, $s6 + or $t1, $t1, $s6 + st.w $t1, $fp, 16 + ldx.hu $t1, $t5, $s2 + ldx.hu $s6, $t5, $s3 + ldx.hu $s7, $t5, $s1 + ldx.hu $s8, $t5, $s4 + ldx.hu $ra, $t5, $s5 + add.d $t1, $s6, $t1 + ldx.hu $s6, $t5, $s0 + alsl.d $t1, $t1, $t1, 2 add.d $s8, $ra, $s8 - mul.d $s8, $s8, $t4 - ld.w $ra, $fp, 4 + mul.d $s8, $s8, $t6 + ldptr.w $ra, $a4, 5900 add.d $s6, $s7, $s6 - ldptr.w $s7, $a4, 5900 - sub.d $a6, $s6, $a6 - add.d $a6, $a6, $s8 - addi.w $a6, $a6, 16 - srai.d $s6, $a6, 5 - srai.d $a6, $a6, 63 - andn $a6, $s6, $a6 - slt $s6, $a6, $s7 - maskeqz $a6, $a6, $s6 - masknez $s6, $s7, $s6 - or $a6, $a6, $s6 - add.d $a6, $ra, $a6 - addi.w $a6, $a6, 1 - srli.d $a6, $a6, 1 - st.w $a6, $fp, 4 - ldx.hu $a6, $s3, $t6 - ldx.hu $s6, $s5, $t6 - ldx.hu $s7, $s2, $t6 - ldx.hu $s8, $s4, $t6 - ldx.hu $ra, $s0, $t6 - add.d $a6, $s6, $a6 - ldx.hu $s6, $s1, $t6 - add.d $s7, $s8, $s7 - alsl.d $a6, $a6, $a6, 2 - mul.d $s7, $s7, $t4 - add.d $s6, $ra, $s6 - ld.w $s8, $fp, 8 - sub.d $a6, $s6, $a6 - ldptr.w $s6, $a4, 5900 - add.d $a6, $a6, $s7 - addi.w $a6, $a6, 16 - srai.d $s7, $a6, 5 - srai.d $a6, $a6, 63 - andn $a6, $s7, $a6 - slt $s7, $a6, $s6 - maskeqz $a6, $a6, $s7 - masknez $s6, $s6, $s7 - or $a6, $a6, $s6 - add.d $a6, $s8, $a6 - addi.w $a6, $a6, 1 - srli.d $a6, $a6, 1 - ldx.hu $s3, $s3, $t7 - ldx.hu $s5, $s5, $t7 - st.w $a6, $fp, 8 - ldx.hu $a6, $s2, $t7 - ldx.hu $s2, $s4, $t7 - add.d $s3, $s5, $s3 - ldx.hu $s0, $s0, $t7 - ldx.hu $s1, $s1, $t7 - add.d $a6, $s2, $a6 - alsl.d $s2, $s3, $s3, 2 - mul.d $a6, $a6, $t4 - add.d $s0, $s0, $s1 - ld.w $s1, $fp, 12 - sub.d $s0, $s0, $s2 - ldptr.w $s2, $a4, 5900 - add.d $a6, $s0, $a6 - addi.w $a6, $a6, 16 - srai.d $s0, $a6, 5 - srai.d $a6, $a6, 63 - andn $a6, $s0, $a6 - slt $s0, $a6, $s2 - maskeqz $a6, $a6, $s0 - masknez $s0, $s2, $s0 - or $a6, $a6, $s0 - add.d $a6, $s1, $a6 - addi.w $a6, $a6, 1 - srli.d $a6, $a6, 1 - st.w $a6, $fp, 12 - addi.d $t3, $t3, 16 + sub.d $t1, $s6, $t1 + add.d $t1, $t1, $s8 + addi.w $t1, $t1, 16 + srai.d $s6, $t1, 5 + srai.d $t1, $t1, 63 + andn $t1, $s6, $t1 + slt $s6, $t1, $ra + maskeqz $t1, $t1, $s6 + masknez $s6, $ra, $s6 + or $t1, $t1, $s6 + st.w $t1, $fp, 32 + ldx.hu $t1, $t2, $s1 + ldx.hu $s1, $t2, $s2 + ldx.hu $s2, $t2, $s3 + ldx.hu $s3, $t2, $s4 + ldx.hu $s4, $t2, $s5 + ldx.hu $s0, $t2, $s0 + add.d $s1, $s2, $s1 + alsl.d $s1, $s1, $s1, 2 + add.d $s2, $s4, $s3 + mul.d $s2, $s2, $t6 + ldptr.w $s3, $a4, 5900 + add.d $t1, $t1, $s0 + sub.d $t1, $t1, $s1 + add.d $t1, $t1, $s2 + addi.w $t1, $t1, 16 + srai.d $s0, $t1, 5 + srai.d $t1, $t1, 63 + andn $t1, $s0, $t1 + slt $s0, $t1, $s3 + maskeqz $t1, $t1, $s0 + masknez $s0, $s3, $s0 + or $t1, $t1, $s0 + st.w $t1, $fp, 48 + addi.d $a3, $a3, 4 addi.d $t8, $t8, 1 - ori $a6, $zero, 64 - bne $t3, $a6, .LBB6_42 + bne $a3, $t7, .LBB6_40 +# %bb.41: # %.preheader462 + move $a3, $zero + pcalau12i $t3, %pc_hi20(get_block.cur_lineY) + st.d $t2, $t3, %pc_lo12(get_block.cur_lineY) + ld.d $t1, $sp, 24 # 8-byte Folded Reload + addi.d $t1, $t1, -1 + sltu $t1, $zero, $t1 + add.d $t0, $t0, $t1 + srai.d $t1, $t0, 63 + andn $t1, $t0, $t1 + slt $t2, $t1, $a7 + maskeqz $t1, $t1, $t2 + masknez $t2, $a7, $t2 + or $t1, $t1, $t2 + slt $t2, $a2, $t0 + maskeqz $t3, $t0, $t2 + masknez $t2, $a2, $t2 + or $t2, $t3, $t2 + addi.d $t2, $t2, 1 + slt $t3, $t2, $a7 + maskeqz $t2, $t2, $t3 + masknez $a7, $a7, $t3 + or $t2, $t2, $a7 + vinsgr2vr.w $vr5, $t0, 0 + vinsgr2vr.w $vr5, $t0, 1 + vmax.w $vr3, $vr5, $vr3 + ori $a7, $zero, 2 + lu32i.d $a7, 3 + vreplgr2vr.d $vr5, $a7 + vadd.w $vr3, $vr3, $vr5 + vmin.w $vr3, $vr3, $vr4 + vpickve2gr.w $t3, $vr3, 0 + vpickve2gr.w $t4, $vr3, 1 + ori $a7, $zero, 1 + lu32i.d $a7, -2 + vreplgr2vr.d $vr3, $a7 + slli.d $a7, $t1, 1 + ori $t0, $zero, 20 + slli.d $t1, $t2, 1 + slli.d $t2, $t3, 1 + slli.d $t3, $t4, 1 + ori $t4, $zero, 64 + .p2align 4, , 16 +.LBB6_42: # =>This Inner Loop Header: Depth=1 + addi.w $t5, $a6, 0 + srai.d $t6, $t5, 63 + andn $t6, $t5, $t6 + slt $t7, $t6, $a1 + maskeqz $t6, $t6, $t7 + masknez $t7, $a1, $t7 + or $fp, $t6, $t7 + slt $t6, $a2, $t5 + masknez $t7, $a2, $t6 + maskeqz $t5, $t5, $t6 + or $t5, $t5, $t7 + addi.w $t5, $t5, 1 + slt $t6, $t5, $a1 + maskeqz $t5, $t5, $t6 + masknez $t6, $a1, $t6 + or $t5, $t5, $t6 + vinsgr2vr.w $vr4, $a6, 0 + vinsgr2vr.w $vr4, $a6, 1 + ori $t6, $zero, 2 + lu32i.d $t6, -3 + vreplgr2vr.d $vr5, $t6 + vmax.w $vr5, $vr4, $vr5 + vadd.w $vr5, $vr5, $vr1 + vmin.w $vr5, $vr5, $vr0 + vpickve2gr.w $t6, $vr5, 0 + slli.d $t6, $t6, 3 + ldx.d $t7, $a0, $t6 + vpickve2gr.w $t6, $vr5, 1 + slli.d $t6, $t6, 3 + ldx.d $t6, $a0, $t6 + vmax.w $vr4, $vr4, $vr3 + vadd.w $vr4, $vr4, $vr2 + vmin.w $vr4, $vr4, $vr0 + vpickve2gr.w $t8, $vr4, 0 + slli.d $t8, $t8, 3 + ldx.d $t8, $a0, $t8 + vpickve2gr.w $s0, $vr4, 1 + slli.d $s0, $s0, 3 + ldx.d $s0, $a0, $s0 + slli.d $fp, $fp, 3 + ldx.d $fp, $a0, $fp + slli.d $t5, $t5, 3 + ldx.d $s1, $a0, $t5 + add.d $t5, $a5, $a3 + ldx.hu $s2, $t7, $a7 + ldx.hu $s3, $t8, $a7 + ldx.hu $s4, $s0, $a7 + ldx.hu $s5, $fp, $a7 + ldx.hu $s6, $s1, $a7 + ldx.hu $s7, $t6, $a7 + add.d $s3, $s4, $s3 + alsl.d $s3, $s3, $s3, 2 + add.d $s4, $s6, $s5 + mul.d $s4, $s4, $t0 + ldx.w $s5, $a5, $a3 + ldptr.w $s6, $a4, 5900 + add.d $s2, $s2, $s7 + sub.d $s2, $s2, $s3 + add.d $s2, $s2, $s4 + addi.w $s2, $s2, 16 + srai.d $s3, $s2, 5 + srai.d $s2, $s2, 63 + andn $s2, $s3, $s2 + slt $s3, $s2, $s6 + maskeqz $s2, $s2, $s3 + masknez $s3, $s6, $s3 + or $s2, $s2, $s3 + add.d $s2, $s5, $s2 + addi.w $s2, $s2, 1 + srli.d $s2, $s2, 1 + stx.w $s2, $a5, $a3 + ldx.hu $s2, $t7, $t1 + ldx.hu $s3, $t8, $t1 + ldx.hu $s4, $s0, $t1 + ldx.hu $s5, $fp, $t1 + ldx.hu $s6, $s1, $t1 + ldx.hu $s7, $t6, $t1 + add.d $s3, $s4, $s3 + alsl.d $s3, $s3, $s3, 2 + add.d $s4, $s6, $s5 + mul.d $s4, $s4, $t0 + ld.w $s5, $t5, 4 + ldptr.w $s6, $a4, 5900 + add.d $s2, $s2, $s7 + sub.d $s2, $s2, $s3 + add.d $s2, $s2, $s4 + addi.w $s2, $s2, 16 + srai.d $s3, $s2, 5 + srai.d $s2, $s2, 63 + andn $s2, $s3, $s2 + slt $s3, $s2, $s6 + maskeqz $s2, $s2, $s3 + masknez $s3, $s6, $s3 + or $s2, $s2, $s3 + add.d $s2, $s5, $s2 + addi.w $s2, $s2, 1 + srli.d $s2, $s2, 1 + st.w $s2, $t5, 4 + ldx.hu $s2, $t7, $t2 + ldx.hu $s3, $t8, $t2 + ldx.hu $s4, $s0, $t2 + ldx.hu $s5, $fp, $t2 + ldx.hu $s6, $s1, $t2 + ldx.hu $s7, $t6, $t2 + add.d $s3, $s4, $s3 + alsl.d $s3, $s3, $s3, 2 + add.d $s4, $s6, $s5 + mul.d $s4, $s4, $t0 + ld.w $s5, $t5, 8 + ldptr.w $s6, $a4, 5900 + add.d $s2, $s2, $s7 + sub.d $s2, $s2, $s3 + add.d $s2, $s2, $s4 + addi.w $s2, $s2, 16 + srai.d $s3, $s2, 5 + srai.d $s2, $s2, 63 + andn $s2, $s3, $s2 + slt $s3, $s2, $s6 + maskeqz $s2, $s2, $s3 + masknez $s3, $s6, $s3 + or $s2, $s2, $s3 + add.d $s2, $s5, $s2 + addi.w $s2, $s2, 1 + srli.d $s2, $s2, 1 + st.w $s2, $t5, 8 + ldx.hu $t7, $t7, $t3 + ldx.hu $t8, $t8, $t3 + ldx.hu $s0, $s0, $t3 + ldx.hu $fp, $fp, $t3 + ldx.hu $s1, $s1, $t3 + ldx.hu $t6, $t6, $t3 + add.d $t8, $s0, $t8 + alsl.d $t8, $t8, $t8, 2 + add.d $fp, $s1, $fp + mul.d $fp, $fp, $t0 + ld.w $s0, $t5, 12 + ldptr.w $s1, $a4, 5900 + add.d $t6, $t7, $t6 + sub.d $t6, $t6, $t8 + add.d $t6, $t6, $fp + addi.w $t6, $t6, 16 + srai.d $t7, $t6, 5 + srai.d $t6, $t6, 63 + andn $t6, $t7, $t6 + slt $t7, $t6, $s1 + maskeqz $t6, $t6, $t7 + masknez $t7, $s1, $t7 + or $t6, $t6, $t7 + add.d $t6, $s0, $t6 + addi.w $t6, $t6, 1 + srli.d $t6, $t6, 1 + st.w $t6, $t5, 12 + addi.d $a3, $a3, 16 + addi.d $a6, $a6, 1 + bne $a3, $t4, .LBB6_42 .LBB6_43: # %.loopexit - ld.d $s8, $sp, 392 # 8-byte Folded Reload - ld.d $s7, $sp, 400 # 8-byte Folded Reload - ld.d $s6, $sp, 408 # 8-byte Folded Reload - ld.d $s5, $sp, 416 # 8-byte Folded Reload - ld.d $s4, $sp, 424 # 8-byte Folded Reload - ld.d $s3, $sp, 432 # 8-byte Folded Reload - ld.d $s2, $sp, 440 # 8-byte Folded Reload - ld.d $s1, $sp, 448 # 8-byte Folded Reload - ld.d $s0, $sp, 456 # 8-byte Folded Reload - ld.d $fp, $sp, 464 # 8-byte Folded Reload - ld.d $ra, $sp, 472 # 8-byte Folded Reload - addi.d $sp, $sp, 480 + ld.d $s8, $sp, 360 # 8-byte Folded Reload + ld.d $s7, $sp, 368 # 8-byte Folded Reload + ld.d $s6, $sp, 376 # 8-byte Folded Reload + ld.d $s5, $sp, 384 # 8-byte Folded Reload + ld.d $s4, $sp, 392 # 8-byte Folded Reload + ld.d $s3, $sp, 400 # 8-byte Folded Reload + ld.d $s2, $sp, 408 # 8-byte Folded Reload + ld.d $s1, $sp, 416 # 8-byte Folded Reload + ld.d $s0, $sp, 424 # 8-byte Folded Reload + ld.d $fp, $sp, 432 # 8-byte Folded Reload + ld.d $ra, $sp, 440 # 8-byte Folded Reload + addi.d $sp, $sp, 448 ret .Lfunc_end6: .size get_block, .Lfunc_end6-get_block diff --git a/results/MultiSource/Applications/JM/ldecod/CMakeFiles/ldecod.dir/transform8x8.s b/results/MultiSource/Applications/JM/ldecod/CMakeFiles/ldecod.dir/transform8x8.s index 9f0b1f41..136f4c3e 100644 --- a/results/MultiSource/Applications/JM/ldecod/CMakeFiles/ldecod.dir/transform8x8.s +++ b/results/MultiSource/Applications/JM/ldecod/CMakeFiles/ldecod.dir/transform8x8.s @@ -37,7 +37,7 @@ intrapred8x8: # @intrapred8x8 st.d $s6, $sp, 456 # 8-byte Folded Spill st.d $s7, $sp, 448 # 8-byte Folded Spill st.d $s8, $sp, 440 # 8-byte Folded Spill - move $s0, $a1 + move $s3, $a1 move $s1, $a0 pcalau12i $a0, %got_pc_hi20(dec_picture) ld.d $a0, $a0, %got_pc_lo12(dec_picture) @@ -45,22 +45,22 @@ intrapred8x8: # @intrapred8x8 lu12i.w $a1, 77 ld.w $a2, $s1, 72 ori $a1, $a1, 1528 - ldx.d $s6, $a0, $a1 - ld.w $s3, $s1, 4 + ldx.d $s0, $a0, $a1 + ld.w $s4, $s1, 4 slli.d $a0, $a2, 2 - bstrpick.d $a1, $s0, 31, 31 - add.w $a1, $s0, $a1 - slli.d $s5, $a1, 2 + bstrpick.d $a1, $s3, 31, 31 + add.w $a1, $s3, $a1 + slli.d $s6, $a1, 2 ld.w $a2, $s1, 68 bstrins.d $a1, $zero, 0, 0 ldptr.d $a3, $s1, 5544 - sub.w $s7, $s0, $a1 + sub.w $s7, $s3, $a1 alsl.w $a1, $a2, $a1, 2 slli.d $a1, $a1, 3 ldx.d $a1, $a3, $a1 alsl.w $a0, $s7, $a0, 1 slli.w $s2, $s7, 3 - move $fp, $s5 + move $fp, $s6 bstrins.d $fp, $zero, 2, 0 ldx.bu $a0, $a1, $a0 st.d $a0, $sp, 112 # 8-byte Folded Spill @@ -68,111 +68,111 @@ intrapred8x8: # @intrapred8x8 ld.d $a5, $s8, %pc_lo12(getNeighbour) addi.w $a2, $fp, 0 addi.d $a0, $zero, -1 - alsl.w $s4, $s7, $a0, 3 + alsl.w $s5, $s7, $a0, 3 addi.d $a4, $sp, 192 - move $a0, $s3 - move $a1, $s4 + move $a0, $s4 + move $a1, $s5 st.d $a2, $sp, 104 # 8-byte Folded Spill move $a3, $zero jirl $ra, $a5, 0 ld.d $a5, $s8, %pc_lo12(getNeighbour) ori $a0, $zero, 1 - move $a1, $s5 + move $a1, $s6 bstrins.d $a1, $a0, 2, 0 addi.d $a4, $sp, 216 addi.w $a2, $a1, 0 - move $a0, $s3 - move $a1, $s4 + move $a0, $s4 + move $a1, $s5 st.d $a2, $sp, 88 # 8-byte Folded Spill move $a3, $zero jirl $ra, $a5, 0 ld.d $a5, $s8, %pc_lo12(getNeighbour) ori $a0, $zero, 2 - move $a1, $s5 + move $a1, $s6 bstrins.d $a1, $a0, 2, 0 addi.d $a4, $sp, 240 addi.w $a2, $a1, 0 - move $a0, $s3 - move $a1, $s4 + move $a0, $s4 + move $a1, $s5 st.d $a2, $sp, 48 # 8-byte Folded Spill move $a3, $zero jirl $ra, $a5, 0 ld.d $a5, $s8, %pc_lo12(getNeighbour) ori $a0, $zero, 3 - move $a1, $s5 + move $a1, $s6 bstrins.d $a1, $a0, 2, 0 addi.d $a4, $sp, 264 addi.w $a2, $a1, 0 - move $a0, $s3 - move $a1, $s4 + move $a0, $s4 + move $a1, $s5 st.d $a2, $sp, 72 # 8-byte Folded Spill move $a3, $zero jirl $ra, $a5, 0 ld.d $a5, $s8, %pc_lo12(getNeighbour) ori $a0, $zero, 4 - move $a1, $s5 + move $a1, $s6 bstrins.d $a1, $a0, 2, 0 addi.d $a4, $sp, 288 addi.w $a2, $a1, 0 - move $a0, $s3 - move $a1, $s4 + move $a0, $s4 + move $a1, $s5 st.d $a2, $sp, 64 # 8-byte Folded Spill move $a3, $zero jirl $ra, $a5, 0 ld.d $a5, $s8, %pc_lo12(getNeighbour) ori $a0, $zero, 5 - move $a1, $s5 + move $a1, $s6 bstrins.d $a1, $a0, 2, 0 addi.d $a4, $sp, 312 addi.w $a2, $a1, 0 - move $a0, $s3 - move $a1, $s4 + move $a0, $s4 + move $a1, $s5 st.d $a2, $sp, 96 # 8-byte Folded Spill move $a3, $zero jirl $ra, $a5, 0 ld.d $a5, $s8, %pc_lo12(getNeighbour) ori $a0, $zero, 6 - move $a1, $s5 + move $a1, $s6 bstrins.d $a1, $a0, 2, 0 addi.d $a4, $sp, 336 addi.w $a2, $a1, 0 - move $a0, $s3 - move $a1, $s4 + move $a0, $s4 + move $a1, $s5 st.d $a2, $sp, 56 # 8-byte Folded Spill move $a3, $zero jirl $ra, $a5, 0 ld.d $a5, $s8, %pc_lo12(getNeighbour) ori $a0, $zero, 7 - bstrins.d $s5, $a0, 2, 0 + bstrins.d $s6, $a0, 2, 0 addi.d $a4, $sp, 360 - addi.w $a2, $s5, 0 - move $a0, $s3 - move $a1, $s4 + addi.w $a2, $s6, 0 + move $a0, $s4 + move $a1, $s5 st.d $a2, $sp, 80 # 8-byte Folded Spill move $a3, $zero jirl $ra, $a5, 0 ld.d $a5, $s8, %pc_lo12(getNeighbour) - addi.w $s5, $fp, -1 + addi.w $s6, $fp, -1 move $fp, $s2 addi.d $a4, $sp, 168 - move $a0, $s3 + move $a0, $s4 move $a1, $s2 - move $a2, $s5 + move $a2, $s6 move $a3, $zero jirl $ra, $a5, 0 ld.d $a5, $s8, %pc_lo12(getNeighbour) ori $a0, $zero, 8 alsl.w $a1, $s7, $a0, 3 addi.d $a4, $sp, 144 - move $a0, $s3 - move $a2, $s5 + move $a0, $s4 + move $a2, $s6 move $a3, $zero jirl $ra, $a5, 0 ld.d $a5, $s8, %pc_lo12(getNeighbour) addi.d $a4, $sp, 120 - move $a0, $s3 - move $a1, $s4 - move $a2, $s5 + move $a0, $s4 + move $a1, $s5 + move $a2, $s6 move $a3, $zero jirl $ra, $a5, 0 ld.w $a0, $sp, 144 @@ -180,8 +180,8 @@ intrapred8x8: # @intrapred8x8 # %bb.1: addi.d $a0, $s7, -1 sltu $a0, $zero, $a0 - bstrins.d $s0, $zero, 0, 0 - addi.d $a1, $s0, -2 + bstrins.d $s3, $zero, 0, 0 + addi.d $a1, $s3, -2 sltu $a1, $zero, $a1 or $a0, $a0, $a1 b .LBB0_3 @@ -195,7 +195,7 @@ intrapred8x8: # @intrapred8x8 beqz $a1, .LBB0_9 # %bb.4: # %.preheader1380 ld.w $a1, $sp, 192 - beqz $a1, .LBB0_10 + beqz $a1, .LBB0_14 # %bb.5: ld.w $a1, $sp, 196 ld.d $a2, $s1, 16 @@ -203,133 +203,174 @@ intrapred8x8: # @intrapred8x8 ldx.wu $a1, $a2, $a1 andi $a1, $a1, 1 ld.w $a2, $sp, 216 - beqz $a2, .LBB0_11 + beqz $a2, .LBB0_15 .LBB0_6: ld.w $a2, $sp, 220 ld.d $a3, $s1, 16 slli.d $a2, $a2, 2 ldx.w $a4, $a3, $a2 ld.w $a2, $sp, 240 - beqz $a2, .LBB0_12 + beqz $a2, .LBB0_16 .LBB0_7: ld.w $a2, $sp, 244 ld.d $a3, $s1, 16 slli.d $a2, $a2, 2 ldx.w $a3, $a3, $a2 ld.w $a2, $sp, 264 - beqz $a2, .LBB0_13 + beqz $a2, .LBB0_17 .LBB0_8: ld.w $a2, $sp, 268 ld.d $a5, $s1, 16 slli.d $a2, $a2, 2 ldx.w $a2, $a5, $a2 - b .LBB0_14 + b .LBB0_18 .LBB0_9: ld.w $s4, $sp, 192 ld.w $s3, $sp, 168 - ld.w $s0, $sp, 120 - bnez $s3, .LBB0_33 - b .LBB0_38 + ld.w $s5, $sp, 120 + bnez $s3, .LBB0_37 .LBB0_10: + lu12i.w $a1, 1 + ori $a1, $a1, 1796 + ldx.h $a1, $s1, $a1 + st.h $a1, $sp, 398 + st.h $a1, $sp, 396 + st.h $a1, $sp, 394 + st.h $a1, $sp, 392 + st.h $a1, $sp, 390 + st.h $a1, $sp, 388 + st.h $a1, $sp, 386 + st.h $a1, $sp, 400 + bnez $a0, .LBB0_38 +.LBB0_11: + addi.d $a0, $sp, 402 + st.h $a1, $sp, 416 + st.h $a1, $sp, 414 + st.h $a1, $sp, 412 + st.h $a1, $sp, 410 + st.h $a1, $sp, 408 + st.h $a1, $sp, 406 + st.h $a1, $sp, 404 + st.h $a1, $a0, 0 + bnez $s4, .LBB0_39 +.LBB0_12: + lu12i.w $a0, 1 + ori $a0, $a0, 1796 + ldx.h $a0, $s1, $a0 + st.h $a0, $sp, 430 + st.h $a0, $sp, 428 + st.h $a0, $sp, 426 + st.h $a0, $sp, 424 + st.h $a0, $sp, 422 + st.h $a0, $sp, 420 + st.h $a0, $sp, 418 + st.h $a0, $sp, 432 + bnez $s5, .LBB0_40 +.LBB0_13: + lu12i.w $a0, 1 + ori $a0, $a0, 1796 + ldx.h $a0, $s1, $a0 + b .LBB0_41 +.LBB0_14: move $a1, $zero ld.w $a2, $sp, 216 bnez $a2, .LBB0_6 -.LBB0_11: +.LBB0_15: move $a4, $zero ld.w $a2, $sp, 240 bnez $a2, .LBB0_7 -.LBB0_12: +.LBB0_16: move $a3, $zero ld.w $a2, $sp, 264 bnez $a2, .LBB0_8 -.LBB0_13: +.LBB0_17: move $a2, $zero -.LBB0_14: +.LBB0_18: ld.w $a5, $sp, 288 and $a4, $a4, $a1 - beqz $a5, .LBB0_16 -# %bb.15: + beqz $a5, .LBB0_20 +# %bb.19: ld.w $a1, $sp, 292 ld.d $a5, $s1, 16 slli.d $a1, $a1, 2 ldx.w $a1, $a5, $a1 - b .LBB0_17 -.LBB0_16: + b .LBB0_21 +.LBB0_20: move $a1, $zero -.LBB0_17: +.LBB0_21: ld.w $a5, $sp, 312 and $a4, $a3, $a4 - beqz $a5, .LBB0_19 -# %bb.18: + beqz $a5, .LBB0_23 +# %bb.22: ld.w $a3, $sp, 316 ld.d $a5, $s1, 16 slli.d $a3, $a3, 2 ldx.w $a3, $a5, $a3 - b .LBB0_20 -.LBB0_19: + b .LBB0_24 +.LBB0_23: move $a3, $zero -.LBB0_20: +.LBB0_24: ld.w $a5, $sp, 336 and $a4, $a2, $a4 - beqz $a5, .LBB0_22 -# %bb.21: + beqz $a5, .LBB0_26 +# %bb.25: ld.w $a2, $sp, 340 ld.d $a5, $s1, 16 slli.d $a2, $a2, 2 ldx.w $a2, $a5, $a2 - b .LBB0_23 -.LBB0_22: + b .LBB0_27 +.LBB0_26: move $a2, $zero -.LBB0_23: +.LBB0_27: ld.w $a5, $sp, 360 and $a4, $a1, $a4 - beqz $a5, .LBB0_25 -# %bb.24: + beqz $a5, .LBB0_29 +# %bb.28: ld.w $a1, $sp, 364 ld.d $a5, $s1, 16 slli.d $a1, $a1, 2 ldx.w $a1, $a5, $a1 - b .LBB0_26 -.LBB0_25: + b .LBB0_30 +.LBB0_29: move $a1, $zero -.LBB0_26: +.LBB0_30: ld.w $a5, $sp, 168 and $a3, $a3, $a4 - beqz $a5, .LBB0_29 -# %bb.27: + beqz $a5, .LBB0_33 +# %bb.31: ld.w $a4, $sp, 172 ld.d $a5, $s1, 16 slli.d $a4, $a4, 2 ldx.w $s3, $a5, $a4 and $a2, $a2, $a3 - beqz $a0, .LBB0_30 -.LBB0_28: + beqz $a0, .LBB0_34 +.LBB0_32: ld.w $a0, $sp, 148 ld.d $a3, $s1, 16 slli.d $a0, $a0, 2 ldx.w $a0, $a3, $a0 - b .LBB0_31 -.LBB0_29: + b .LBB0_35 +.LBB0_33: move $s3, $zero and $a2, $a2, $a3 - bnez $a0, .LBB0_28 -.LBB0_30: + bnez $a0, .LBB0_32 +.LBB0_34: move $a0, $zero -.LBB0_31: +.LBB0_35: ld.w $a3, $sp, 120 and $s4, $a1, $a2 - beqz $a3, .LBB0_37 -# %bb.32: + beqz $a3, .LBB0_46 +# %bb.36: ld.w $a1, $sp, 124 ld.d $a2, $s1, 16 slli.d $a1, $a1, 2 - ldx.w $s0, $a2, $a1 - beqz $s3, .LBB0_38 -.LBB0_33: + ldx.w $s5, $a2, $a1 + beqz $s3, .LBB0_10 +.LBB0_37: ld.w $a1, $sp, 188 slli.d $a1, $a1, 3 ld.w $a2, $sp, 184 - ldx.d $a1, $s6, $a1 + ldx.d $a1, $s0, $a1 slli.d $a3, $a2, 1 ldx.h $a3, $a1, $a3 alsl.d $a1, $a2, $a1, 1 @@ -348,12 +389,12 @@ intrapred8x8: # @intrapred8x8 st.h $a2, $sp, 398 ld.hu $a1, $a1, 14 st.h $a1, $sp, 400 - beqz $a0, .LBB0_39 -.LBB0_34: + beqz $a0, .LBB0_11 +.LBB0_38: ld.w $a0, $sp, 164 slli.d $a0, $a0, 3 ld.w $a1, $sp, 160 - ldx.d $a0, $s6, $a0 + ldx.d $a0, $s0, $a0 slli.d $a2, $a1, 1 ldx.h $a2, $a0, $a2 alsl.d $a0, $a1, $a0, 1 @@ -373,121 +414,76 @@ intrapred8x8: # @intrapred8x8 ld.hu $a1, $a0, 14 addi.d $a0, $sp, 416 st.h $a1, $a0, 0 - beqz $s4, .LBB0_40 -.LBB0_35: + beqz $s4, .LBB0_12 +.LBB0_39: ld.w $a0, $sp, 212 slli.d $a0, $a0, 3 ld.w $a1, $sp, 208 - ldx.d $a0, $s6, $a0 + ldx.d $a0, $s0, $a0 ld.w $a2, $sp, 236 slli.d $a1, $a1, 1 ldx.h $a0, $a0, $a1 slli.d $a1, $a2, 3 ld.w $a2, $sp, 232 - ldx.d $a1, $s6, $a1 + ldx.d $a1, $s0, $a1 ld.w $a3, $sp, 260 st.h $a0, $sp, 418 slli.d $a0, $a2, 1 ldx.h $a0, $a1, $a0 slli.d $a1, $a3, 3 ld.w $a2, $sp, 256 - ldx.d $a1, $s6, $a1 + ldx.d $a1, $s0, $a1 ld.w $a3, $sp, 284 st.h $a0, $sp, 420 slli.d $a0, $a2, 1 ldx.h $a0, $a1, $a0 slli.d $a1, $a3, 3 ld.w $a2, $sp, 280 - ldx.d $a1, $s6, $a1 + ldx.d $a1, $s0, $a1 ld.w $a3, $sp, 308 st.h $a0, $sp, 422 slli.d $a0, $a2, 1 ldx.h $a0, $a1, $a0 slli.d $a1, $a3, 3 ld.w $a2, $sp, 304 - ldx.d $a1, $s6, $a1 + ldx.d $a1, $s0, $a1 ld.w $a3, $sp, 332 st.h $a0, $sp, 424 slli.d $a0, $a2, 1 ldx.h $a0, $a1, $a0 slli.d $a1, $a3, 3 ld.w $a2, $sp, 328 - ldx.d $a1, $s6, $a1 + ldx.d $a1, $s0, $a1 ld.w $a3, $sp, 356 st.h $a0, $sp, 426 slli.d $a0, $a2, 1 ldx.h $a0, $a1, $a0 slli.d $a1, $a3, 3 ld.w $a2, $sp, 352 - ldx.d $a1, $s6, $a1 + ldx.d $a1, $s0, $a1 ld.w $a3, $sp, 380 st.h $a0, $sp, 428 slli.d $a0, $a2, 1 ldx.h $a0, $a1, $a0 slli.d $a1, $a3, 3 ld.w $a2, $sp, 376 - ldx.d $a1, $s6, $a1 + ldx.d $a1, $s0, $a1 st.h $a0, $sp, 430 slli.d $a0, $a2, 1 ldx.hu $a0, $a1, $a0 st.h $a0, $sp, 432 - beqz $s0, .LBB0_41 -.LBB0_36: + beqz $s5, .LBB0_13 +.LBB0_40: ld.w $a0, $sp, 140 slli.d $a0, $a0, 3 ld.w $a1, $sp, 136 - ldx.d $a0, $s6, $a0 + ldx.d $a0, $s0, $a0 slli.d $a1, $a1, 1 ldx.hu $a0, $a0, $a1 - b .LBB0_42 -.LBB0_37: - move $s0, $zero - bnez $s3, .LBB0_33 -.LBB0_38: - lu12i.w $a1, 1 - ori $a1, $a1, 1796 - ldx.h $a1, $s1, $a1 - st.h $a1, $sp, 398 - st.h $a1, $sp, 396 - st.h $a1, $sp, 394 - st.h $a1, $sp, 392 - st.h $a1, $sp, 390 - st.h $a1, $sp, 388 - st.h $a1, $sp, 386 - st.h $a1, $sp, 400 - bnez $a0, .LBB0_34 -.LBB0_39: - addi.d $a0, $sp, 402 - st.h $a1, $sp, 416 - st.h $a1, $sp, 414 - st.h $a1, $sp, 412 - st.h $a1, $sp, 410 - st.h $a1, $sp, 408 - st.h $a1, $sp, 406 - st.h $a1, $sp, 404 - st.h $a1, $a0, 0 - bnez $s4, .LBB0_35 -.LBB0_40: - lu12i.w $a0, 1 - ori $a0, $a0, 1796 - ldx.h $a0, $s1, $a0 - st.h $a0, $sp, 430 - st.h $a0, $sp, 428 - st.h $a0, $sp, 426 - st.h $a0, $sp, 424 - st.h $a0, $sp, 422 - st.h $a0, $sp, 420 - st.h $a0, $sp, 418 - st.h $a0, $sp, 432 - bnez $s0, .LBB0_36 .LBB0_41: - lu12i.w $a0, 1 - ori $a0, $a0, 1796 - ldx.h $a0, $s1, $a0 -.LBB0_42: st.h $a0, $sp, 384 addi.d $a0, $sp, 384 - move $a1, $s0 + move $a1, $s5 move $a2, $s3 move $a3, $s4 pcaddu18i $ra, %call36(LowPassForIntra8x8Pred) @@ -495,10 +491,10 @@ intrapred8x8: # @intrapred8x8 ori $a0, $zero, 8 ld.d $a1, $sp, 112 # 8-byte Folded Reload bltu $a0, $a1, .LBB0_77 -# %bb.43: +# %bb.42: addi.w $ra, $fp, 1 - addi.w $s5, $fp, 2 - addi.w $s6, $fp, 3 + addi.w $s6, $fp, 2 + addi.w $s0, $fp, 3 addi.w $s7, $fp, 4 addi.w $s8, $fp, 5 addi.w $t8, $fp, 6 @@ -509,15 +505,15 @@ intrapred8x8: # @intrapred8x8 ldx.w $a0, $a1, $a0 add.d $a0, $a1, $a0 jr $a0 -.LBB0_44: - bnez $s3, .LBB0_46 -# %bb.45: +.LBB0_43: + bnez $s3, .LBB0_45 +# %bb.44: ld.w $a1, $s1, 4 pcalau12i $a0, %pc_hi20(.L.str) addi.d $a0, $a0, %pc_lo12(.L.str) pcaddu18i $ra, %call36(printf) jirl $ra, $ra, 0 -.LBB0_46: # %.loopexit.loopexit1388 +.LBB0_45: # %.loopexit.loopexit1388 move $s4, $zero addi.d $a1, $s1, 104 ld.d $a0, $sp, 80 # 8-byte Folded Reload @@ -554,13 +550,17 @@ intrapred8x8: # @intrapred8x8 vstx $vr0, $a7, $t1 vstx $vr0, $t0, $t1 vstx $vr0, $a1, $t1 - b .LBB0_87 + b .LBB0_88 +.LBB0_46: + move $s5, $zero + bnez $s3, .LBB0_37 + b .LBB0_10 .LBB0_47: - st.d $s8, $sp, 40 # 8-byte Folded Spill st.d $s7, $sp, 16 # 8-byte Folded Spill - st.d $s6, $sp, 24 # 8-byte Folded Spill - st.d $s2, $sp, 32 # 8-byte Folded Spill - beqz $s0, .LBB0_50 + st.d $s0, $sp, 24 # 8-byte Folded Spill + st.d $s8, $sp, 32 # 8-byte Folded Spill + st.d $s2, $sp, 40 # 8-byte Folded Spill + beqz $s5, .LBB0_50 # %bb.48: beqz $s4, .LBB0_50 # %bb.49: @@ -569,12 +569,12 @@ intrapred8x8: # @intrapred8x8 ld.w $a1, $s1, 4 pcalau12i $a0, %pc_hi20(.L.str.4) addi.d $a0, $a0, %pc_lo12(.L.str.4) - move $s0, $ra - move $s2, $t8 + move $s2, $ra + move $s3, $t8 pcaddu18i $ra, %call36(printf) jirl $ra, $ra, 0 - move $t8, $s2 - move $ra, $s0 + move $t8, $s3 + move $ra, $s2 .LBB0_51: st.d $ra, $sp, 8 # 8-byte Folded Spill ld.hu $a1, $sp, 428 @@ -590,79 +590,79 @@ intrapred8x8: # @intrapred8x8 slli.d $a3, $a0, 5 ld.hu $a6, $sp, 426 add.d $t0, $t4, $a3 - slli.d $t1, $fp, 1 - stx.h $a5, $t0, $t1 + slli.d $s0, $fp, 1 + stx.h $a5, $t0, $s0 addi.d $a5, $a6, 2 add.d $a2, $a5, $a2 alsl.d $a1, $a1, $a2, 1 srli.d $a1, $a1, 2 - slli.d $s6, $ra, 1 - stx.h $a1, $t0, $s6 + slli.d $s8, $ra, 1 + stx.h $a1, $t0, $s8 ld.d $a0, $sp, 56 # 8-byte Folded Reload slli.d $a7, $a0, 5 ld.hu $t5, $sp, 424 add.d $t3, $t4, $a7 - stx.h $a1, $t3, $t1 + stx.h $a1, $t3, $s0 alsl.d $a1, $a6, $a4, 1 add.d $a1, $a1, $t5 srli.d $a1, $a1, 2 - slli.d $t7, $s5, 1 - stx.h $a1, $t0, $t7 - stx.h $a1, $t3, $s6 + slli.d $t1, $s6, 1 + stx.h $a1, $t0, $t1 + stx.h $a1, $t3, $s8 ld.d $a0, $sp, 96 # 8-byte Folded Reload slli.d $a4, $a0, 5 ld.hu $a6, $sp, 422 add.d $t2, $t4, $a4 - stx.h $a1, $t2, $t1 + stx.h $a1, $t2, $s0 alsl.d $a1, $t5, $a5, 1 add.d $a1, $a1, $a6 srli.d $a4, $a1, 2 - move $s7, $s5 - ld.d $s5, $sp, 24 # 8-byte Folded Reload - slli.d $s0, $s5, 1 - stx.h $a4, $t0, $s0 - stx.h $a4, $t3, $t7 - stx.h $a4, $t2, $s6 + move $s7, $s6 + ld.d $s6, $sp, 24 # 8-byte Folded Reload + slli.d $t7, $s6, 1 + stx.h $a4, $t0, $t7 + stx.h $a4, $t3, $t1 + stx.h $a4, $t2, $s8 ld.d $a0, $sp, 64 # 8-byte Folded Reload slli.d $a1, $a0, 5 ld.hu $a7, $sp, 420 add.d $ra, $t4, $a1 - stx.h $a4, $ra, $t1 + stx.h $a4, $ra, $s0 alsl.d $a4, $a6, $t5, 1 add.d $a4, $a4, $a7 addi.d $a4, $a4, 2 srli.d $a5, $a4, 2 - ld.d $s8, $sp, 16 # 8-byte Folded Reload - slli.d $s4, $s8, 1 - stx.h $a5, $t0, $s4 - stx.h $a5, $t3, $s0 - stx.h $a5, $t2, $t7 - stx.h $a5, $ra, $s6 + ld.d $s5, $sp, 16 # 8-byte Folded Reload + slli.d $s2, $s5, 1 + stx.h $a5, $t0, $s2 + stx.h $a5, $t3, $t7 + stx.h $a5, $t2, $t1 + stx.h $a5, $ra, $s8 ld.d $a0, $sp, 72 # 8-byte Folded Reload slli.d $a4, $a0, 5 ld.hu $t5, $sp, 418 add.d $a4, $t4, $a4 - stx.h $a5, $a4, $t1 + stx.h $a5, $a4, $s0 alsl.d $a5, $a7, $a6, 1 add.d $fp, $a5, $t5 - ld.d $a3, $sp, 40 # 8-byte Folded Reload + ld.d $a3, $sp, 32 # 8-byte Folded Reload slli.d $s1, $a3, 1 ld.d $a0, $sp, 48 # 8-byte Folded Reload slli.d $a5, $a0, 5 add.d $a5, $t4, $a5 alsl.d $t6, $t5, $a7, 1 - slli.d $s2, $t8, 1 + slli.d $s3, $t8, 1 ld.d $a0, $sp, 88 # 8-byte Folded Reload slli.d $a6, $a0, 5 add.d $a6, $t4, $a6 - ld.d $a1, $sp, 32 # 8-byte Folded Reload + ld.d $a1, $sp, 40 # 8-byte Folded Reload alsl.d $a0, $a1, $t0, 1 st.d $a0, $sp, 96 # 8-byte Folded Spill ld.d $a0, $sp, 104 # 8-byte Folded Reload move $a2, $t8 slli.d $t8, $a0, 5 add.d $t8, $t4, $t8 - ld.hu $s3, $sp, 384 + ld.hu $s4, $sp, 384 vinsgr2vr.h $vr0, $t5, 0 pcalau12i $t4, %pc_hi20(.LCPI0_0) vld $vr1, $t4, %pc_lo12(.LCPI0_0) @@ -670,42 +670,43 @@ intrapred8x8: # @intrapred8x8 vld $vr2, $t4, %pc_lo12(.LCPI0_1) alsl.d $a0, $a2, $t3, 1 st.d $a0, $sp, 104 # 8-byte Folded Spill - vinsgr2vr.h $vr0, $s3, 1 - add.d $t6, $t6, $s3 + vinsgr2vr.h $vr0, $s4, 1 + add.d $t6, $t6, $s4 alsl.d $a0, $a3, $t2, 1 st.d $a0, $sp, 88 # 8-byte Folded Spill addi.d $t6, $t6, 2 - srli.d $s3, $t6, 2 - stx.h $s3, $t0, $s2 - alsl.d $t6, $s8, $ra, 1 + srli.d $s4, $t6, 2 + stx.h $s4, $t0, $s3 + alsl.d $a0, $s5, $ra, 1 + st.d $a0, $sp, 80 # 8-byte Folded Spill addi.d $fp, $fp, 2 - srli.d $s2, $fp, 2 - stx.h $s2, $t0, $s1 - alsl.d $t5, $s5, $a4, 1 - stx.h $s3, $t3, $s1 + srli.d $s3, $fp, 2 + stx.h $s3, $t0, $s1 + alsl.d $t6, $s6, $a4, 1 + stx.h $s4, $t3, $s1 alsl.d $fp, $s7, $a5, 1 ld.d $a0, $sp, 8 # 8-byte Folded Reload - alsl.d $t4, $a0, $a6, 1 - stx.h $s2, $t3, $s4 - alsl.d $t3, $a1, $t3, 1 - stx.h $s2, $t2, $s0 - stx.h $s2, $ra, $t7 - stx.h $s2, $a4, $s6 - stx.h $s2, $a5, $t1 - alsl.d $s2, $a2, $t2, 1 - stx.h $s3, $t2, $s4 - alsl.d $s1, $a3, $ra, 1 - stx.h $s3, $ra, $s0 - alsl.d $s0, $s8, $a4, 1 - stx.h $s3, $a4, $t7 - alsl.d $t7, $s5, $a5, 1 - alsl.d $s4, $s7, $a6, 1 + alsl.d $s1, $a0, $a6, 1 + stx.h $s3, $t3, $s2 + alsl.d $t5, $a1, $t3, 1 + stx.h $s3, $t2, $t7 + stx.h $s3, $ra, $t1 + stx.h $s3, $a4, $s8 + stx.h $s3, $a5, $s0 + alsl.d $s3, $a2, $t2, 1 + stx.h $s4, $t2, $s2 + alsl.d $s2, $a3, $ra, 1 + stx.h $s4, $ra, $t7 + alsl.d $t7, $s5, $a4, 1 + stx.h $s4, $a4, $t1 + alsl.d $t3, $s6, $a5, 1 + alsl.d $t4, $s7, $a6, 1 alsl.d $t2, $a1, $t2, 1 - stx.h $s3, $a5, $s6 - alsl.d $t0, $a2, $ra, 1 + stx.h $s4, $a5, $s8 + alsl.d $t1, $a2, $ra, 1 vld $vr3, $sp, 384 - stx.h $s3, $a6, $t1 - alsl.d $s3, $a3, $a4, 1 + stx.h $s4, $a6, $s0 + alsl.d $s4, $a3, $a4, 1 vld $vr4, $sp, 386 vshuf.h $vr1, $vr3, $vr0 vrepli.b $vr0, 0 @@ -726,12 +727,12 @@ intrapred8x8: # @intrapred8x8 vsrli.w $vr0, $vr0, 2 vsrli.w $vr1, $vr1, 2 vpickev.h $vr2, $vr0, $vr1 - vstx $vr2, $t8, $t1 - alsl.d $t1, $s8, $a5, 1 - alsl.d $t8, $s5, $a6, 1 + vstx $vr2, $t8, $s0 + alsl.d $t0, $s5, $a5, 1 + alsl.d $t8, $s6, $a6, 1 alsl.d $a7, $a1, $ra, 1 - alsl.d $s5, $a2, $a4, 1 - alsl.d $s6, $s8, $a6, 1 + alsl.d $s0, $a2, $a4, 1 + alsl.d $s6, $s5, $a6, 1 alsl.d $a4, $a1, $a4, 1 alsl.d $s7, $a3, $a5, 1 alsl.d $ra, $a3, $a6, 1 @@ -745,24 +746,25 @@ intrapred8x8: # @intrapred8x8 vstelm.h $vr1, $a1, 0, 0 ld.d $a1, $sp, 88 # 8-byte Folded Reload vstelm.h $vr1, $a1, 0, 0 + ld.d $a1, $sp, 80 # 8-byte Folded Reload + vstelm.h $vr1, $a1, 0, 0 vstelm.h $vr1, $t6, 0, 0 - vstelm.h $vr1, $t5, 0, 0 vstelm.h $vr1, $fp, 0, 0 - vstelm.h $vr1, $t4, 0, 0 - vstelm.h $vr1, $t3, 0, 2 + vstelm.h $vr1, $s1, 0, 0 + vstelm.h $vr1, $t5, 0, 2 + vstelm.h $vr1, $s3, 0, 2 vstelm.h $vr1, $s2, 0, 2 - vstelm.h $vr1, $s1, 0, 2 - vstelm.h $vr1, $s0, 0, 2 vstelm.h $vr1, $t7, 0, 2 - vstelm.h $vr1, $s4, 0, 2 - ld.d $s4, $sp, 112 # 8-byte Folded Reload + vstelm.h $vr1, $t3, 0, 2 + vstelm.h $vr1, $t4, 0, 2 vstelm.h $vr1, $t2, 0, 4 - vstelm.h $vr1, $t0, 0, 4 - vstelm.h $vr1, $s3, 0, 4 vstelm.h $vr1, $t1, 0, 4 + vstelm.h $vr1, $s4, 0, 4 + ld.d $s4, $sp, 112 # 8-byte Folded Reload + vstelm.h $vr1, $t0, 0, 4 vstelm.h $vr1, $t8, 0, 4 vstelm.h $vr1, $a7, 0, 6 - vstelm.h $vr1, $s5, 0, 6 + vstelm.h $vr1, $s0, 0, 6 vstelm.h $vr1, $s7, 0, 6 vstelm.h $vr1, $s6, 0, 6 vstelm.h $vr0, $a4, 0, 0 @@ -771,7 +773,7 @@ intrapred8x8: # @intrapred8x8 vstelm.h $vr0, $a5, 0, 2 vstelm.h $vr0, $a0, 0, 2 vstelm.h $vr0, $a6, 0, 4 - b .LBB0_87 + b .LBB0_88 .LBB0_52: beqz $s4, .LBB0_78 # %bb.53: @@ -817,12 +819,12 @@ intrapred8x8: # @intrapred8x8 ld.w $a1, $s1, 4 pcalau12i $a0, %pc_hi20(.L.str.2) addi.d $a0, $a0, %pc_lo12(.L.str.2) - move $s0, $ra - move $s3, $t8 + move $s3, $ra + move $s4, $t8 pcaddu18i $ra, %call36(printf) jirl $ra, $ra, 0 - move $t8, $s3 - move $ra, $s0 + move $t8, $s4 + move $ra, $s3 .LBB0_57: move $s4, $zero ld.hu $a2, $sp, 386 @@ -839,7 +841,7 @@ intrapred8x8: # @intrapred8x8 ld.d $a0, $sp, 48 # 8-byte Folded Reload slli.d $a1, $a0, 5 add.d $t4, $a7, $a1 - slli.d $t2, $s6, 1 + slli.d $t2, $s0, 1 vld $vr0, $sp, 390 ld.d $a0, $sp, 72 # 8-byte Folded Reload slli.d $a1, $a0, 5 @@ -934,19 +936,19 @@ intrapred8x8: # @intrapred8x8 addi.d $t1, $t1, 2 srli.d $t1, $t1, 2 stx.h $t1, $a7, $t0 - alsl.d $t0, $s5, $a4, 1 + alsl.d $t0, $s6, $a4, 1 vstelm.h $vr1, $t0, 0, 0 alsl.d $t0, $ra, $a5, 1 vstelm.h $vr1, $t0, 0, 0 - alsl.d $t0, $s6, $a4, 1 + alsl.d $t0, $s0, $a4, 1 vstelm.h $vr1, $t0, 0, 2 - alsl.d $t0, $s5, $a5, 1 + alsl.d $t0, $s6, $a5, 1 vstelm.h $vr1, $t0, 0, 2 alsl.d $t0, $fp, $a1, 1 vstelm.h $vr1, $t0, 0, 2 alsl.d $t0, $s7, $a4, 1 vstelm.h $vr1, $t0, 0, 4 - alsl.d $t0, $s6, $a5, 1 + alsl.d $t0, $s0, $a5, 1 vstelm.h $vr1, $t0, 0, 4 alsl.d $t0, $ra, $a1, 1 vstelm.h $vr1, $t0, 0, 4 @@ -956,7 +958,7 @@ intrapred8x8: # @intrapred8x8 vstelm.h $vr1, $t0, 0, 6 alsl.d $t0, $s7, $a5, 1 vstelm.h $vr1, $t0, 0, 6 - alsl.d $t0, $s5, $a1, 1 + alsl.d $t0, $s6, $a1, 1 vstelm.h $vr1, $t0, 0, 6 alsl.d $t0, $ra, $a2, 1 vstelm.h $vr1, $t0, 0, 6 @@ -966,9 +968,9 @@ intrapred8x8: # @intrapred8x8 vstelm.h $vr0, $t0, 0, 0 alsl.d $t0, $s8, $a5, 1 vstelm.h $vr0, $t0, 0, 0 - alsl.d $t0, $s6, $a1, 1 + alsl.d $t0, $s0, $a1, 1 vstelm.h $vr0, $t0, 0, 0 - alsl.d $t0, $s5, $a2, 1 + alsl.d $t0, $s6, $a2, 1 vstelm.h $vr0, $t0, 0, 0 alsl.d $t0, $ra, $a3, 1 vstelm.h $vr0, $t0, 0, 0 @@ -981,9 +983,9 @@ intrapred8x8: # @intrapred8x8 vstelm.h $vr0, $a5, 0, 2 alsl.d $a5, $s7, $a1, 1 vstelm.h $vr0, $a5, 0, 2 - alsl.d $a5, $s6, $a2, 1 + alsl.d $a5, $s0, $a2, 1 vstelm.h $vr0, $a5, 0, 2 - alsl.d $a5, $s5, $a3, 1 + alsl.d $a5, $s6, $a3, 1 vstelm.h $vr0, $a5, 0, 2 alsl.d $a5, $ra, $a6, 1 vstelm.h $vr0, $a5, 0, 2 @@ -994,9 +996,9 @@ intrapred8x8: # @intrapred8x8 vstelm.h $vr0, $a4, 0, 4 alsl.d $a4, $s7, $a2, 1 vstelm.h $vr0, $a4, 0, 4 - alsl.d $a4, $s6, $a3, 1 + alsl.d $a4, $s0, $a3, 1 vstelm.h $vr0, $a4, 0, 4 - alsl.d $a4, $s5, $a6, 1 + alsl.d $a4, $s6, $a6, 1 vstelm.h $vr0, $a4, 0, 4 alsl.d $a4, $ra, $a7, 1 vstelm.h $vr0, $a4, 0, 4 @@ -1006,23 +1008,23 @@ intrapred8x8: # @intrapred8x8 vstelm.h $vr0, $a1, 0, 6 alsl.d $a1, $s7, $a3, 1 vstelm.h $vr0, $a1, 0, 6 - alsl.d $a1, $s6, $a6, 1 + alsl.d $a1, $s0, $a6, 1 vstelm.h $vr0, $a1, 0, 6 - alsl.d $a1, $s5, $a7, 1 + alsl.d $a1, $s6, $a7, 1 vstelm.h $vr0, $a1, 0, 6 - b .LBB0_87 + b .LBB0_88 .LBB0_58: bnez $s3, .LBB0_60 # %bb.59: ld.w $a1, $s1, 4 pcalau12i $a0, %pc_hi20(.L.str.3) addi.d $a0, $a0, %pc_lo12(.L.str.3) - move $s0, $ra - move $s3, $t8 + move $s3, $ra + move $s4, $t8 pcaddu18i $ra, %call36(printf) jirl $ra, $ra, 0 - move $t8, $s3 - move $ra, $s0 + move $t8, $s4 + move $ra, $s3 .LBB0_60: move $s4, $zero ld.hu $a2, $sp, 386 @@ -1070,7 +1072,7 @@ intrapred8x8: # @intrapred8x8 add.d $a4, $a7, $a4 stx.h $a3, $a4, $t0 stx.h $a3, $a5, $t2 - slli.d $t3, $s5, 1 + slli.d $t3, $s6, 1 alsl.d $a2, $a2, $a6, 1 vpickve2gr.w $t4, $vr4, 3 add.d $a2, $a2, $t4 @@ -1096,7 +1098,7 @@ intrapred8x8: # @intrapred8x8 add.d $t1, $t1, $t5 bstrpick.d $t1, $t1, 18, 2 stx.h $t1, $a7, $t0 - slli.d $t0, $s6, 1 + slli.d $t0, $s0, 1 stx.h $t1, $a6, $t2 stx.h $t1, $a4, $t3 stx.h $t1, $a5, $t0 @@ -1162,7 +1164,7 @@ intrapred8x8: # @intrapred8x8 stx.h $a4, $a7, $t0 alsl.d $a4, $t3, $a5, 1 add.d $a4, $a4, $a6 - alsl.d $a6, $s5, $a1, 1 + alsl.d $a6, $s6, $a1, 1 alsl.d $t6, $fp, $a3, 1 addi.d $a4, $a4, 2 srli.d $a4, $a4, 2 @@ -1173,21 +1175,21 @@ intrapred8x8: # @intrapred8x8 xor $a7, $a7, $a5 srli.d $a7, $a7, 1 sub.d $a7, $t1, $a7 - alsl.d $t1, $s6, $a1, 1 + alsl.d $t1, $s0, $a1, 1 alsl.d $t7, $ra, $a3, 1 stx.h $a7, $a3, $t0 - alsl.d $t0, $s5, $a2, 1 + alsl.d $t0, $s6, $a2, 1 stx.h $a7, $a2, $t2 alsl.d $a7, $s7, $a1, 1 move $a0, $t8 - alsl.d $t8, $s5, $a3, 1 + alsl.d $t8, $s6, $a3, 1 add.d $a5, $a5, $t3 - alsl.d $t3, $s6, $a2, 1 + alsl.d $t3, $s0, $a2, 1 addi.d $a5, $a5, 1 srli.d $a5, $a5, 1 stx.h $a5, $a3, $t2 alsl.d $a5, $s8, $a1, 1 - alsl.d $t2, $s6, $a3, 1 + alsl.d $t2, $s0, $a3, 1 alsl.d $fp, $s7, $a2, 1 alsl.d $s0, $s2, $a1, 1 alsl.d $a1, $a0, $a1, 1 @@ -1215,7 +1217,7 @@ intrapred8x8: # @intrapred8x8 vstelm.h $vr0, $s0, 0, 6 vstelm.h $vr0, $a3, 0, 7 vstelm.h $vr0, $a2, 0, 7 - b .LBB0_87 + b .LBB0_88 .LBB0_61: bnez $s4, .LBB0_63 # %bb.62: @@ -1253,10 +1255,9 @@ intrapred8x8: # @intrapred8x8 vreplgr2vr.h $vr0, $a3 vst $vr0, $a1, 296 vreplgr2vr.h $vr0, $a2 - vst $vr0, $a1, 328 b .LBB0_87 .LBB0_64: - beqz $s0, .LBB0_67 + beqz $s5, .LBB0_67 # %bb.65: beqz $s4, .LBB0_67 # %bb.66: @@ -1265,12 +1266,12 @@ intrapred8x8: # @intrapred8x8 ld.w $a1, $s1, 4 pcalau12i $a0, %pc_hi20(.L.str.5) addi.d $a0, $a0, %pc_lo12(.L.str.5) - move $s0, $ra - move $s3, $t8 + move $s3, $ra + move $s4, $t8 pcaddu18i $ra, %call36(printf) jirl $ra, $ra, 0 - move $t8, $s3 - move $ra, $s0 + move $t8, $s4 + move $ra, $s3 .LBB0_68: ld.hu $a1, $sp, 384 ld.hu $a2, $sp, 386 @@ -1282,12 +1283,12 @@ intrapred8x8: # @intrapred8x8 ld.d $a3, $sp, 56 # 8-byte Folded Reload slli.d $a3, $a3, 5 add.d $a3, $s3, $a3 - slli.d $t1, $s6, 1 + slli.d $t1, $s0, 1 stx.h $a0, $a3, $t1 ld.d $a4, $sp, 64 # 8-byte Folded Reload slli.d $a4, $a4, 5 add.d $a6, $s3, $a4 - slli.d $a7, $s5, 1 + slli.d $a7, $s6, 1 stx.h $a0, $a6, $a7 ld.d $a4, $sp, 48 # 8-byte Folded Reload slli.d $a4, $a4, 5 @@ -1465,12 +1466,11 @@ intrapred8x8: # @intrapred8x8 addi.d $a0, $a0, 2 srli.d $a0, $a0, 2 stx.h $a0, $t2, $a4 - b .LBB0_87 + b .LBB0_88 .LBB0_69: - st.d $s2, $sp, 32 # 8-byte Folded Spill - st.d $s8, $sp, 40 # 8-byte Folded Spill + st.d $s2, $sp, 40 # 8-byte Folded Spill move $s2, $fp - beqz $s0, .LBB0_72 + beqz $s5, .LBB0_72 # %bb.70: beqz $s4, .LBB0_72 # %bb.71: @@ -1480,10 +1480,10 @@ intrapred8x8: # @intrapred8x8 pcalau12i $a0, %pc_hi20(.L.str.6) addi.d $a0, $a0, %pc_lo12(.L.str.6) move $fp, $ra - move $s0, $t8 + move $s3, $t8 pcaddu18i $ra, %call36(printf) jirl $ra, $ra, 0 - move $t8, $s0 + move $t8, $s3 move $ra, $fp .LBB0_73: ld.hu $a1, $sp, 418 @@ -1506,7 +1506,7 @@ intrapred8x8: # @intrapred8x8 ld.d $a4, $sp, 88 # 8-byte Folded Reload slli.d $a4, $a4, 5 add.d $a5, $fp, $a4 - slli.d $t0, $s5, 1 + slli.d $t0, $s6, 1 stx.h $a0, $a5, $t0 ld.d $a4, $sp, 104 # 8-byte Folded Reload slli.d $a4, $a4, 5 @@ -1543,119 +1543,118 @@ intrapred8x8: # @intrapred8x8 slli.d $t6, $t6, 5 add.d $t6, $fp, $t6 stx.h $a0, $t6, $a3 - ld.hu $s1, $sp, 426 + ld.hu $s2, $sp, 426 stx.h $a0, $t4, $a6 stx.h $a0, $t3, $t0 stx.h $a0, $t1, $t2 - add.d $a0, $t8, $s1 + add.d $a0, $t8, $s2 addi.d $a0, $a0, 1 srli.d $a0, $a0, 1 - ld.d $s0, $sp, 80 # 8-byte Folded Reload - slli.d $s0, $s0, 5 - add.d $fp, $fp, $s0 + ld.d $s1, $sp, 80 # 8-byte Folded Reload + slli.d $s1, $s1, 5 + add.d $fp, $fp, $s1 stx.h $a0, $fp, $a3 - ld.hu $s2, $sp, 428 + ld.hu $s3, $sp, 428 stx.h $a0, $t6, $a6 stx.h $a0, $t4, $t0 stx.h $a0, $t3, $t2 - add.d $a0, $s1, $s2 + add.d $a0, $s2, $s3 addi.d $a0, $a0, 1 srli.d $a0, $a0, 1 - ld.hu $s3, $sp, 430 + ld.hu $s4, $sp, 430 stx.h $a0, $fp, $a6 stx.h $a0, $t6, $t0 stx.h $a0, $t4, $t2 - add.d $a0, $s2, $s3 + add.d $a0, $s3, $s4 addi.d $a0, $a0, 1 - ld.hu $s7, $sp, 432 + ld.hu $s6, $sp, 432 srli.d $a0, $a0, 1 stx.h $a0, $fp, $t0 stx.h $a0, $t6, $t2 - add.d $a0, $s3, $s7 + add.d $a0, $s4, $s6 addi.d $a0, $a0, 1 srli.d $a0, $a0, 1 - ld.hu $s4, $sp, 386 + ld.hu $s5, $sp, 386 stx.h $a0, $fp, $t2 addi.d $a0, $a1, 2 alsl.d $t2, $a2, $a0, 1 - add.d $t2, $t2, $s4 - srli.d $s8, $t2, 2 - ld.d $t2, $sp, 32 # 8-byte Folded Reload + add.d $t2, $t2, $s5 + srli.d $s7, $t2, 2 + ld.d $t2, $sp, 40 # 8-byte Folded Reload slli.d $t2, $t2, 1 - stx.h $s8, $t1, $t2 - ld.d $s0, $sp, 40 # 8-byte Folded Reload + stx.h $s7, $t1, $t2 + slli.d $s1, $s8, 1 + stx.h $s7, $a7, $s1 slli.d $s0, $s0, 1 - stx.h $s8, $a7, $s0 - slli.d $s5, $s6, 1 - stx.h $s8, $a5, $s5 - slli.d $s6, $ra, 1 - stx.h $s8, $a4, $s6 + stx.h $s7, $a5, $s0 + slli.d $s8, $ra, 1 + stx.h $s7, $a4, $s8 addi.d $a2, $a2, 2 alsl.d $a1, $a1, $a2, 1 add.d $a1, $a1, $t5 srli.d $a1, $a1, 2 stx.h $a1, $t3, $t2 - stx.h $a1, $t1, $s0 - stx.h $a1, $a7, $s5 - stx.h $a1, $a5, $s6 + stx.h $a1, $t1, $s1 + stx.h $a1, $a7, $s0 + stx.h $a1, $a5, $s8 alsl.d $a0, $t5, $a0, 1 add.d $a0, $a0, $t7 srli.d $a0, $a0, 2 stx.h $a0, $t4, $t2 - stx.h $a0, $t3, $s0 - stx.h $a0, $t1, $s5 - stx.h $a0, $a7, $s6 + stx.h $a0, $t3, $s1 + stx.h $a0, $t1, $s0 + stx.h $a0, $a7, $s8 alsl.d $a0, $t7, $t5, 1 add.d $a0, $a0, $t8 addi.d $a0, $a0, 2 srli.d $a0, $a0, 2 stx.h $a0, $t6, $t2 - stx.h $a0, $t4, $s0 - stx.h $a0, $t3, $s5 - stx.h $a0, $t1, $s6 + stx.h $a0, $t4, $s1 + stx.h $a0, $t3, $s0 + stx.h $a0, $t1, $s8 alsl.d $a0, $t8, $t7, 1 - add.d $a0, $a0, $s1 - addi.d $a0, $a0, 2 - srli.d $a0, $a0, 2 - stx.h $a0, $fp, $t2 - stx.h $a0, $t6, $s0 - stx.h $a0, $t4, $s5 - stx.h $a0, $t3, $s6 - alsl.d $a0, $s1, $t8, 1 add.d $a0, $a0, $s2 addi.d $a0, $a0, 2 srli.d $a0, $a0, 2 - stx.h $a0, $fp, $s0 - stx.h $a0, $t6, $s5 - stx.h $a0, $t4, $s6 - alsl.d $a0, $s2, $s1, 1 + stx.h $a0, $fp, $t2 + stx.h $a0, $t6, $s1 + stx.h $a0, $t4, $s0 + stx.h $a0, $t3, $s8 + alsl.d $a0, $s2, $t8, 1 add.d $a0, $a0, $s3 addi.d $a0, $a0, 2 srli.d $a0, $a0, 2 - stx.h $a0, $fp, $s5 - stx.h $a0, $t6, $s6 + stx.h $a0, $fp, $s1 + stx.h $a0, $t6, $s0 + stx.h $a0, $t4, $s8 alsl.d $a0, $s3, $s2, 1 - add.d $a0, $a0, $s7 + add.d $a0, $a0, $s4 + addi.d $a0, $a0, 2 + srli.d $a0, $a0, 2 + stx.h $a0, $fp, $s0 + stx.h $a0, $t6, $s8 + alsl.d $a0, $s4, $s3, 1 + ld.d $s4, $sp, 112 # 8-byte Folded Reload + add.d $a0, $a0, $s6 addi.d $a0, $a0, 2 ld.hu $a1, $sp, 388 srli.d $a0, $a0, 2 - stx.h $a0, $fp, $s6 - alsl.d $a0, $s4, $a2, 1 + stx.h $a0, $fp, $s8 + alsl.d $a0, $s5, $a2, 1 add.d $a0, $a0, $a1 srli.d $a0, $a0, 2 stx.h $a0, $a7, $a3 ld.hu $a2, $sp, 390 stx.h $a0, $a5, $a6 stx.h $a0, $a4, $t0 - alsl.d $a0, $a1, $s4, 1 - ld.d $s4, $sp, 112 # 8-byte Folded Reload + alsl.d $a0, $a1, $s5, 1 add.d $a0, $a0, $a2 addi.d $a0, $a0, 2 srli.d $a0, $a0, 2 stx.h $a0, $a7, $t2 ld.hu $a7, $sp, 392 - stx.h $a0, $a5, $s0 - stx.h $a0, $a4, $s5 + stx.h $a0, $a5, $s1 + stx.h $a0, $a4, $s0 alsl.d $a0, $a2, $a1, 1 add.d $a0, $a0, $a7 addi.d $a0, $a0, 2 @@ -1669,7 +1668,7 @@ intrapred8x8: # @intrapred8x8 srli.d $a0, $a0, 2 ld.hu $a2, $sp, 396 stx.h $a0, $a5, $t2 - stx.h $a0, $a4, $s0 + stx.h $a0, $a4, $s1 alsl.d $a0, $a1, $a7, 1 add.d $a0, $a0, $a2 addi.d $a0, $a0, 2 @@ -1681,20 +1680,20 @@ intrapred8x8: # @intrapred8x8 addi.d $a0, $a0, 2 srli.d $a0, $a0, 2 stx.h $a0, $a4, $t2 - b .LBB0_87 + b .LBB0_88 .LBB0_74: - move $s3, $s2 + move $s5, $s2 bnez $s4, .LBB0_76 # %bb.75: ld.w $a1, $s1, 4 pcalau12i $a0, %pc_hi20(.L.str.7) addi.d $a0, $a0, %pc_lo12(.L.str.7) - move $s0, $ra - move $s2, $t8 + move $s2, $ra + move $s3, $t8 pcaddu18i $ra, %call36(printf) jirl $ra, $ra, 0 - move $t8, $s2 - move $ra, $s0 + move $t8, $s3 + move $ra, $s2 .LBB0_76: ld.hu $a1, $sp, 418 ld.hu $a2, $sp, 420 @@ -1712,7 +1711,7 @@ intrapred8x8: # @intrapred8x8 add.d $a0, $a2, $a4 addi.d $a0, $a0, 1 srli.d $a0, $a0, 1 - slli.d $t3, $s5, 1 + slli.d $t3, $s6, 1 stx.h $a0, $a3, $t3 ld.hu $a6, $sp, 424 ld.d $a5, $sp, 88 # 8-byte Folded Reload @@ -1759,58 +1758,58 @@ intrapred8x8: # @intrapred8x8 stx.h $a0, $a7, $t8 stx.h $a0, $t1, $t6 stx.h $a0, $t7, $t3 - ld.hu $s0, $sp, 432 - ld.d $s1, $sp, 96 # 8-byte Folded Reload - slli.d $s1, $s1, 5 - add.d $s1, $t4, $s1 - stx.h $a0, $s1, $t5 - add.d $a0, $fp, $s0 + ld.hu $s1, $sp, 432 + ld.d $s2, $sp, 96 # 8-byte Folded Reload + slli.d $s2, $s2, 5 + add.d $s2, $t4, $s2 + stx.h $a0, $s2, $t5 + add.d $a0, $fp, $s1 addi.d $a0, $a0, 1 srli.d $a0, $a0, 1 stx.h $a0, $t1, $t8 stx.h $a0, $t7, $t6 - stx.h $a0, $s1, $t3 - ld.d $s2, $sp, 56 # 8-byte Folded Reload - slli.d $s2, $s2, 5 - add.d $s2, $t4, $s2 - stx.h $a0, $s2, $t5 + stx.h $a0, $s2, $t3 + ld.d $s3, $sp, 56 # 8-byte Folded Reload + slli.d $s3, $s3, 5 + add.d $s3, $t4, $s3 + stx.h $a0, $s3, $t5 ld.d $a0, $sp, 80 # 8-byte Folded Reload slli.d $a0, $a0, 5 add.d $a0, $t4, $a0 - vreplgr2vr.h $vr0, $s0 + vreplgr2vr.h $vr0, $s1 vstx $vr0, $a0, $t5 - slli.d $a0, $s3, 1 - stx.h $s0, $s2, $a0 - stx.h $s0, $s2, $t8 + slli.d $a0, $s5, 1 + stx.h $s1, $s3, $a0 + stx.h $s1, $s3, $t8 slli.d $t4, $s8, 1 - stx.h $s0, $s2, $t4 - stx.h $s0, $s2, $t6 - slli.d $t5, $s6, 1 - stx.h $s0, $s2, $t5 - stx.h $s0, $s2, $t3 - stx.h $s0, $s1, $a0 - stx.h $s0, $s1, $t8 - stx.h $s0, $s1, $t4 - stx.h $s0, $s1, $t6 - stx.h $s0, $t7, $a0 - stx.h $s0, $t7, $t8 - alsl.d $t3, $s0, $s0, 1 + stx.h $s1, $s3, $t4 + stx.h $s1, $s3, $t6 + slli.d $t5, $s0, 1 + stx.h $s1, $s3, $t5 + stx.h $s1, $s3, $t3 + stx.h $s1, $s2, $a0 + stx.h $s1, $s2, $t8 + stx.h $s1, $s2, $t4 + stx.h $s1, $s2, $t6 + stx.h $s1, $t7, $a0 + stx.h $s1, $t7, $t8 + alsl.d $t3, $s1, $s1, 1 addi.d $t6, $fp, 2 add.d $t3, $t6, $t3 srli.d $t3, $t3, 2 stx.h $t3, $t1, $a0 stx.h $t3, $t7, $t4 - stx.h $t3, $s1, $t5 + stx.h $t3, $s2, $t5 slli.d $t8, $ra, 1 - stx.h $t3, $s2, $t8 + stx.h $t3, $s3, $t8 alsl.d $t3, $fp, $t2, 1 - add.d $t3, $t3, $s0 + add.d $t3, $t3, $s1 addi.d $t3, $t3, 2 srli.d $t3, $t3, 2 stx.h $t3, $a7, $a0 stx.h $t3, $t1, $t4 stx.h $t3, $t7, $t5 - stx.h $t3, $s1, $t8 + stx.h $t3, $s2, $t8 alsl.d $t3, $t2, $t0, 1 add.d $t3, $t6, $t3 srli.d $t3, $t3, 2 @@ -1842,14 +1841,14 @@ intrapred8x8: # @intrapred8x8 alsl.d $a0, $a2, $a0, 1 srli.d $a0, $a0, 2 stx.h $a0, $a3, $t8 - b .LBB0_87 + b .LBB0_88 .LBB0_77: pcalau12i $a0, %pc_hi20(.L.str.8) addi.d $a0, $a0, %pc_lo12(.L.str.8) pcaddu18i $ra, %call36(printf) jirl $ra, $ra, 0 ori $s4, $zero, 1 - b .LBB0_87 + b .LBB0_88 .LBB0_78: beqz $s4, .LBB0_81 # %bb.79: @@ -1895,74 +1894,20 @@ intrapred8x8: # @intrapred8x8 .LBB0_86: # %.preheader ld.d $a0, $sp, 104 # 8-byte Folded Reload move $s4, $zero - slli.d $a2, $a0, 5 - add.d $a2, $s1, $a2 - alsl.d $a2, $fp, $a2, 1 - st.h $a1, $a2, 104 - st.h $a1, $a2, 106 - st.h $a1, $a2, 108 - st.h $a1, $a2, 110 - st.h $a1, $a2, 112 - st.h $a1, $a2, 114 - st.h $a1, $a2, 116 - st.h $a1, $a2, 118 - st.h $a1, $a2, 136 - st.h $a1, $a2, 138 - st.h $a1, $a2, 140 - st.h $a1, $a2, 142 - st.h $a1, $a2, 144 - st.h $a1, $a2, 146 - st.h $a1, $a2, 148 - st.h $a1, $a2, 150 - st.h $a1, $a2, 168 - st.h $a1, $a2, 170 - st.h $a1, $a2, 172 - st.h $a1, $a2, 174 - st.h $a1, $a2, 176 - st.h $a1, $a2, 178 - st.h $a1, $a2, 180 - st.h $a1, $a2, 182 - st.h $a1, $a2, 200 - st.h $a1, $a2, 202 - st.h $a1, $a2, 204 - st.h $a1, $a2, 206 - st.h $a1, $a2, 208 - st.h $a1, $a2, 210 - st.h $a1, $a2, 212 - st.h $a1, $a2, 214 - st.h $a1, $a2, 232 - st.h $a1, $a2, 234 - st.h $a1, $a2, 236 - st.h $a1, $a2, 238 - st.h $a1, $a2, 240 - st.h $a1, $a2, 242 - st.h $a1, $a2, 244 - st.h $a1, $a2, 246 - st.h $a1, $a2, 264 - st.h $a1, $a2, 266 - st.h $a1, $a2, 268 - st.h $a1, $a2, 270 - st.h $a1, $a2, 272 - st.h $a1, $a2, 274 - st.h $a1, $a2, 276 - st.h $a1, $a2, 278 - st.h $a1, $a2, 296 - st.h $a1, $a2, 298 - st.h $a1, $a2, 300 - st.h $a1, $a2, 302 - st.h $a1, $a2, 304 - st.h $a1, $a2, 306 - st.h $a1, $a2, 308 - st.h $a1, $a2, 310 - st.h $a1, $a2, 328 - st.h $a1, $a2, 330 - st.h $a1, $a2, 332 - st.h $a1, $a2, 334 - st.h $a1, $a2, 336 - st.h $a1, $a2, 338 - st.h $a1, $a2, 340 - st.h $a1, $a2, 342 + vreplgr2vr.h $vr0, $a1 + slli.d $a1, $a0, 5 + add.d $a1, $s1, $a1 + alsl.d $a1, $fp, $a1, 1 + vst $vr0, $a1, 104 + vst $vr0, $a1, 136 + vst $vr0, $a1, 168 + vst $vr0, $a1, 200 + vst $vr0, $a1, 232 + vst $vr0, $a1, 264 + vst $vr0, $a1, 296 .LBB0_87: # %.loopexit + vst $vr0, $a1, 328 +.LBB0_88: # %.loopexit move $a0, $s4 ld.d $s8, $sp, 440 # 8-byte Folded Reload ld.d $s7, $sp, 448 # 8-byte Folded Reload @@ -1982,7 +1927,7 @@ intrapred8x8: # @intrapred8x8 .section .rodata,"a",@progbits .p2align 2, 0x0 .LJTI0_0: - .word .LBB0_44-.LJTI0_0 + .word .LBB0_43-.LJTI0_0 .word .LBB0_61-.LJTI0_0 .word .LBB0_52-.LJTI0_0 .word .LBB0_55-.LJTI0_0 diff --git a/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/block.s b/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/block.s index 8d5f33de..65649ae6 100644 --- a/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/block.s +++ b/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/block.s @@ -850,49 +850,49 @@ intrapred_luma_16x16: # @intrapred_luma_16x16 beqz $a0, .LBB1_17 .LBB1_14: ld.w $a2, $sp, 444 - ld.w $a5, $sp, 440 + ld.w $a4, $sp, 440 slli.d $a2, $a2, 3 - ldx.d $a4, $s2, $a2 - alsl.d $a6, $a5, $a4, 1 - slli.d $a7, $a5, 1 - ldx.hu $a2, $a4, $a7 - ld.hu $a5, $a6, 2 + ldx.d $a5, $s2, $a2 + alsl.d $a6, $a4, $a5, 1 + slli.d $a7, $a4, 1 + ldx.hu $a2, $a5, $a7 + ld.hu $a4, $a6, 2 ld.hu $t0, $a6, 4 ld.hu $t1, $a6, 6 - add.d $a2, $a2, $a5 + add.d $a2, $a2, $a4 add.d $a2, $a2, $t0 add.d $a2, $a2, $t1 - ld.hu $a5, $a6, 8 + ld.hu $a4, $a6, 8 ld.hu $t0, $a6, 10 ld.hu $t1, $a6, 12 ld.hu $t2, $a6, 14 - add.d $a2, $a2, $a5 + add.d $a2, $a2, $a4 add.d $a2, $a2, $t0 add.d $a2, $a2, $t1 add.d $a2, $a2, $t2 - ld.hu $a5, $a6, 16 + ld.hu $a4, $a6, 16 ld.hu $t0, $a6, 18 ld.hu $t1, $a6, 20 ld.hu $t2, $a6, 22 - add.d $a2, $a2, $a5 + add.d $a2, $a2, $a4 add.d $a2, $a2, $t0 add.d $a2, $a2, $t1 add.d $a2, $a2, $t2 - ld.hu $a5, $a6, 24 + ld.hu $a4, $a6, 24 ld.hu $t0, $a6, 26 ld.hu $t1, $a6, 28 ld.hu $t2, $a6, 30 - add.d $a2, $a2, $a5 + add.d $a2, $a2, $a4 add.d $a2, $a2, $t0 add.d $a2, $a2, $t1 add.w $a2, $a2, $t2 bnez $a3, .LBB1_19 # %bb.15: - move $a5, $zero + move $a4, $zero vld $vr0, $a6, 16 - vldx $vr1, $a4, $a7 + vldx $vr1, $a5, $a7 addi.d $a2, $a2, 8 - bstrpick.d $a4, $a2, 31, 4 + bstrpick.d $a3, $a2, 31, 4 vst $vr0, $sp, 464 vst $vr1, $sp, 448 b .LBB1_23 @@ -1027,112 +1027,112 @@ intrapred_luma_16x16: # @intrapred_luma_16x16 vldx $vr0, $a4, $a5 vld $vr1, $a3, 16 addi.d $a2, $a2, 16 - bstrpick.d $a4, $a2, 31, 5 + bstrpick.d $a3, $a2, 31, 5 vst $vr0, $sp, 448 vst $vr1, $sp, 464 b .LBB1_22 .LBB1_21: addi.d $a2, $a3, 8 - bstrpick.d $a4, $a2, 31, 4 + bstrpick.d $a3, $a2, 31, 4 .LBB1_22: # %vector.body ld.w $a2, $sp, 60 - ld.w $a3, $sp, 84 + ld.w $a4, $sp, 84 slli.d $a2, $a2, 3 - slli.d $a3, $a3, 3 + slli.d $a4, $a4, 3 ld.w $a5, $sp, 56 ld.w $a6, $sp, 80 ldx.d $a2, $s2, $a2 - ldx.d $a3, $s2, $a3 + ldx.d $a4, $s2, $a4 slli.d $a5, $a5, 1 slli.d $a6, $a6, 1 ldx.h $a2, $a2, $a5 - ldx.h $a3, $a3, $a6 + ldx.h $a4, $a4, $a6 ld.w $a5, $sp, 108 ld.w $a6, $sp, 132 st.h $a2, $sp, 480 - st.h $a3, $sp, 482 + st.h $a4, $sp, 482 slli.d $a2, $a5, 3 - slli.d $a3, $a6, 3 + slli.d $a4, $a6, 3 ld.w $a5, $sp, 104 ld.w $a6, $sp, 128 ldx.d $a2, $s2, $a2 - ldx.d $a3, $s2, $a3 + ldx.d $a4, $s2, $a4 slli.d $a5, $a5, 1 slli.d $a6, $a6, 1 ldx.h $a2, $a2, $a5 - ldx.h $a3, $a3, $a6 + ldx.h $a4, $a4, $a6 ld.w $a5, $sp, 156 ld.w $a6, $sp, 180 st.h $a2, $sp, 484 - st.h $a3, $sp, 486 + st.h $a4, $sp, 486 slli.d $a2, $a5, 3 - slli.d $a3, $a6, 3 + slli.d $a4, $a6, 3 ld.w $a5, $sp, 152 ld.w $a6, $sp, 176 ldx.d $a2, $s2, $a2 - ldx.d $a3, $s2, $a3 + ldx.d $a4, $s2, $a4 slli.d $a5, $a5, 1 slli.d $a6, $a6, 1 ldx.h $a2, $a2, $a5 - ldx.h $a3, $a3, $a6 + ldx.h $a4, $a4, $a6 ld.w $a5, $sp, 204 ld.w $a6, $sp, 228 st.h $a2, $sp, 488 - st.h $a3, $sp, 490 + st.h $a4, $sp, 490 slli.d $a2, $a5, 3 - slli.d $a3, $a6, 3 + slli.d $a4, $a6, 3 ld.w $a5, $sp, 200 ld.w $a6, $sp, 224 ldx.d $a2, $s2, $a2 - ldx.d $a3, $s2, $a3 + ldx.d $a4, $s2, $a4 slli.d $a5, $a5, 1 slli.d $a6, $a6, 1 ldx.h $a2, $a2, $a5 - ldx.h $a3, $a3, $a6 + ldx.h $a4, $a4, $a6 ld.w $a5, $sp, 252 ld.w $a6, $sp, 276 st.h $a2, $sp, 492 - st.h $a3, $sp, 494 + st.h $a4, $sp, 494 slli.d $a2, $a5, 3 - slli.d $a3, $a6, 3 + slli.d $a4, $a6, 3 ld.w $a5, $sp, 248 ld.w $a6, $sp, 272 ldx.d $a2, $s2, $a2 - ldx.d $a3, $s2, $a3 + ldx.d $a4, $s2, $a4 slli.d $a5, $a5, 1 slli.d $a6, $a6, 1 ldx.h $a2, $a2, $a5 - ldx.h $a3, $a3, $a6 + ldx.h $a4, $a4, $a6 ld.w $a5, $sp, 300 ld.w $a6, $sp, 324 st.h $a2, $sp, 496 - st.h $a3, $sp, 498 + st.h $a4, $sp, 498 slli.d $a2, $a5, 3 - slli.d $a3, $a6, 3 + slli.d $a4, $a6, 3 ld.w $a5, $sp, 296 ld.w $a6, $sp, 320 ldx.d $a2, $s2, $a2 - ldx.d $a3, $s2, $a3 + ldx.d $a4, $s2, $a4 slli.d $a5, $a5, 1 slli.d $a6, $a6, 1 ldx.h $a2, $a2, $a5 - ldx.h $a3, $a3, $a6 + ldx.h $a4, $a4, $a6 ld.w $a5, $sp, 348 ld.w $a6, $sp, 372 st.h $a2, $sp, 500 - st.h $a3, $sp, 502 + st.h $a4, $sp, 502 slli.d $a2, $a5, 3 - slli.d $a3, $a6, 3 + slli.d $a4, $a6, 3 ld.w $a5, $sp, 344 ld.w $a6, $sp, 368 ldx.d $a2, $s2, $a2 - ldx.d $a3, $s2, $a3 + ldx.d $a4, $s2, $a4 slli.d $a5, $a5, 1 slli.d $a6, $a6, 1 ld.w $a7, $sp, 396 ld.w $t0, $sp, 420 ldx.h $a2, $a2, $a5 - ldx.h $a3, $a3, $a6 + ldx.h $a4, $a4, $a6 slli.d $a5, $a7, 3 slli.d $a6, $t0, 3 ld.w $a7, $sp, 392 @@ -1144,65 +1144,39 @@ intrapred_luma_16x16: # @intrapred_luma_16x16 slli.d $a7, $t0, 1 ldx.h $a6, $a6, $a7 st.h $a2, $sp, 504 - st.h $a3, $sp, 506 + st.h $a4, $sp, 506 st.h $a5, $sp, 508 st.h $a6, $sp, 510 - ori $a5, $zero, 1 + ori $a4, $zero, 1 .LBB1_23: # %.loopexit136 ld.d $a2, $s3, 0 - ori $a6, $zero, 32 + vreplgr2vr.h $vr0, $a3 + ori $a5, $zero, 32 lu12i.w $a3, 1 - ori $a7, $a3, 720 - addi.d $t0, $sp, 448 - ori $t1, $zero, 64 + ori $a6, $a3, 720 + addi.d $a7, $sp, 448 + ori $t0, $zero, 64 .p2align 4, , 16 .LBB1_24: # =>This Inner Loop Header: Depth=1 - vld $vr0, $sp, 448 - vld $vr1, $sp, 464 - vstx $vr0, $a2, $a7 - add.d $a2, $a2, $a7 - vst $vr1, $a2, 16 + vld $vr1, $sp, 448 + vld $vr2, $sp, 464 + vstx $vr1, $a2, $a6 + add.d $a2, $a2, $a6 + vst $vr2, $a2, 16 ld.d $a2, $s3, 0 - ldx.h $t2, $a6, $t0 - add.d $t3, $a2, $a7 - st.h $t2, $t3, 512 - st.h $a4, $t3, 1024 - st.h $t2, $t3, 514 - st.h $a4, $t3, 1026 - st.h $t2, $t3, 516 - st.h $a4, $t3, 1028 - st.h $t2, $t3, 518 - st.h $a4, $t3, 1030 - st.h $t2, $t3, 520 - st.h $a4, $t3, 1032 - st.h $t2, $t3, 522 - st.h $a4, $t3, 1034 - st.h $t2, $t3, 524 - st.h $a4, $t3, 1036 - st.h $t2, $t3, 526 - st.h $a4, $t3, 1038 - st.h $t2, $t3, 528 - st.h $a4, $t3, 1040 - st.h $t2, $t3, 530 - st.h $a4, $t3, 1042 - st.h $t2, $t3, 532 - st.h $a4, $t3, 1044 - st.h $t2, $t3, 534 - st.h $a4, $t3, 1046 - st.h $t2, $t3, 536 - st.h $a4, $t3, 1048 - st.h $t2, $t3, 538 - st.h $a4, $t3, 1050 - st.h $t2, $t3, 540 - st.h $a4, $t3, 1052 - st.h $t2, $t3, 542 - st.h $a4, $t3, 1054 - addi.d $a6, $a6, 2 - addi.d $a7, $a7, 32 - bne $a6, $t1, .LBB1_24 + ldx.h $t1, $a5, $a7 + add.d $t2, $a2, $a6 + vreplgr2vr.h $vr1, $t1 + vst $vr1, $t2, 512 + vst $vr0, $t2, 1024 + vst $vr1, $t2, 528 + vst $vr0, $t2, 1040 + addi.d $a5, $a5, 2 + addi.d $a6, $a6, 32 + bne $a5, $t0, .LBB1_24 # %bb.25: sltui $a0, $a0, 1 - xori $a4, $a5, 1 + xori $a4, $a4, 1 or $a0, $a0, $a4 bnez $a0, .LBB1_29 # %bb.26: @@ -1442,8 +1416,8 @@ intrapred_luma_16x16: # @intrapred_luma_16x16 ld.d $a2, $s3, 0 lu12i.w $a3, 3 ori $a3, $a3, 3224 - ldx.w $a4, $a2, $a3 - move $a5, $zero + ldx.w $a3, $a2, $a3 + move $a4, $zero b .LBB1_23 .Lfunc_end1: .size intrapred_luma_16x16, .Lfunc_end1-intrapred_luma_16x16 diff --git a/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/decoder.s b/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/decoder.s index 94d91759..563c7011 100644 --- a/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/decoder.s +++ b/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/decoder.s @@ -864,13 +864,10 @@ decode_one_mb: # @decode_one_mb .type Get_Reference_Pixel,@function Get_Reference_Pixel: # @Get_Reference_Pixel # %bb.0: - addi.d $sp, $sp, -48 - st.d $fp, $sp, 40 # 8-byte Folded Spill - st.d $s0, $sp, 32 # 8-byte Folded Spill - st.d $s1, $sp, 24 # 8-byte Folded Spill - st.d $s2, $sp, 16 # 8-byte Folded Spill - st.d $s3, $sp, 8 # 8-byte Folded Spill - st.d $s4, $sp, 0 # 8-byte Folded Spill + addi.d $sp, $sp, -32 + st.d $fp, $sp, 24 # 8-byte Folded Spill + st.d $s0, $sp, 16 # 8-byte Folded Spill + st.d $s1, $sp, 8 # 8-byte Folded Spill srai.d $a7, $a2, 2 srai.d $a5, $a1, 2 pcalau12i $a3, %got_pc_hi20(img) @@ -882,258 +879,223 @@ Get_Reference_Pixel: # @Get_Reference_Pixel or $a4, $a2, $a1 andi $t1, $a4, 3 addi.w $a4, $t0, -1 - beqz $t1, .LBB3_6 + beqz $t1, .LBB3_7 # %bb.1: andi $t0, $a1, 3 andi $a1, $a2, 3 - beqz $t0, .LBB3_7 + beqz $t0, .LBB3_8 # %bb.2: - beqz $a1, .LBB3_10 + beqz $a1, .LBB3_11 # %bb.3: ori $a2, $zero, 2 - bne $a1, $a2, .LBB3_13 + bne $a1, $a2, .LBB3_14 # %bb.4: # %.preheader170.preheader - slt $a1, $a2, $a7 - masknez $t1, $a2, $a1 - maskeqz $a1, $a7, $a1 - or $a1, $a1, $t1 - addi.d $a1, $a1, -2 + vinsgr2vr.w $vr2, $a7, 0 + vinsgr2vr.w $vr2, $a7, 1 + ori $a1, $zero, 2 + lu32i.d $a1, 1 + vreplgr2vr.d $vr1, $a1 + vmax.w $vr3, $vr2, $vr1 + vinsgr2vr.w $vr0, $a5, 0 + vinsgr2vr.w $vr0, $a5, 1 + vmax.w $vr4, $vr0, $vr1 + vrepli.d $vr5, -2 + vadd.w $vr1, $vr3, $vr5 + vinsgr2vr.w $vr6, $a6, 0 + vinsgr2vr.w $vr6, $a6, 1 + vmin.w $vr7, $vr1, $vr6 + srai.d $a1, $a7, 63 + andn $a1, $a7, $a1 slt $t1, $a1, $a6 maskeqz $a1, $a1, $t1 masknez $t1, $a6, $t1 or $a1, $a1, $t1 - slt $t1, $a2, $a5 - masknez $a2, $a2, $t1 - maskeqz $t1, $a5, $t1 - or $a2, $t1, $a2 - addi.d $a2, $a2, -2 - slt $t1, $a2, $a4 - maskeqz $a2, $a2, $t1 - masknez $t1, $a4, $t1 - or $a2, $a2, $t1 + addi.w $t5, $zero, -1 + slt $t1, $t5, $a7 + masknez $t2, $t5, $t1 + maskeqz $a7, $a7, $t1 + or $a7, $a7, $t2 + addi.d $a7, $a7, 1 + slt $t1, $a7, $a6 + maskeqz $a7, $a7, $t1 + masknez $a6, $a6, $t1 + or $t1, $a7, $a6 + addi.w $a6, $zero, -2 + lu32i.d $a6, -3 + vreplgr2vr.d $vr1, $a6 + vmax.w $vr8, $vr2, $vr1 + lu32i.d $a2, 3 + vreplgr2vr.d $vr2, $a2 + vadd.w $vr4, $vr4, $vr5 + vinsgr2vr.w $vr3, $a4, 0 + vinsgr2vr.w $vr3, $a4, 1 + vmin.w $vr4, $vr4, $vr3 + vpickve2gr.w $a2, $vr4, 0 slli.d $a2, $a2, 3 - ldx.d $t8, $a0, $a2 - slli.d $a2, $a1, 1 - ori $a1, $zero, 1 - slt $t1, $a1, $a7 - masknez $t2, $a1, $t1 - maskeqz $t1, $a7, $t1 - or $t1, $t1, $t2 - addi.d $t1, $t1, -1 - slt $t2, $t1, $a6 - maskeqz $t1, $t1, $t2 - masknez $t2, $a6, $t2 - or $t1, $t1, $t2 - slli.d $t1, $t1, 1 - ldx.hu $t2, $t8, $t1 - ldx.hu $t3, $t8, $a2 - alsl.d $t2, $t2, $t2, 2 - sub.d $t5, $t3, $t2 - srai.d $t2, $a7, 63 - andn $t2, $a7, $t2 - slt $t3, $t2, $a6 - maskeqz $t2, $t2, $t3 - masknez $t3, $a6, $t3 - or $t2, $t2, $t3 - slli.d $t3, $t2, 1 - ldx.hu $t6, $t8, $t3 - addi.w $fp, $zero, -1 - slt $t2, $fp, $a7 - masknez $t4, $fp, $t2 - maskeqz $t2, $a7, $t2 - or $t2, $t2, $t4 - addi.d $t2, $t2, 1 - slt $t4, $t2, $a6 - maskeqz $t2, $t2, $t4 - masknez $t4, $a6, $t4 - or $t2, $t2, $t4 - slli.d $t4, $t2, 1 - ldx.hu $t7, $t8, $t4 - ori $t2, $zero, 20 - mul.d $t6, $t6, $t2 - add.d $t5, $t6, $t5 - mul.d $t6, $t7, $t2 - add.d $s0, $t6, $t5 - addi.w $t7, $zero, -2 - slt $t5, $t7, $a7 - masknez $t6, $t7, $t5 - maskeqz $t5, $a7, $t5 - or $t5, $t5, $t6 - addi.d $t5, $t5, 2 - slt $t6, $t5, $a6 - maskeqz $t5, $t5, $t6 - masknez $t6, $a6, $t6 - or $t5, $t5, $t6 - slli.d $t5, $t5, 1 - ldx.hu $s1, $t8, $t5 - addi.w $t6, $zero, -3 - slt $s2, $t6, $a7 - masknez $s3, $t6, $s2 - maskeqz $a7, $a7, $s2 - or $a7, $a7, $s3 - addi.d $a7, $a7, 3 - slt $s2, $a7, $a6 - maskeqz $a7, $a7, $s2 - masknez $a6, $a6, $s2 - or $a6, $a7, $a6 + ldx.d $t4, $a0, $a2 + vpickve2gr.w $a2, $vr7, 0 + slli.d $a2, $a2, 1 + vpickve2gr.w $a6, $vr7, 1 slli.d $a6, $a6, 1 - slt $a7, $a1, $a5 - masknez $s2, $a1, $a7 - maskeqz $a7, $a5, $a7 - or $a7, $a7, $s2 - addi.d $a7, $a7, -1 - slt $s2, $a7, $a4 - maskeqz $a7, $a7, $s2 - masknez $s2, $a4, $s2 - or $a7, $a7, $s2 - slli.d $a7, $a7, 3 - ldx.d $s2, $a0, $a7 - ldx.hu $a7, $t8, $a6 - alsl.d $t8, $s1, $s1, 2 - sub.d $t8, $s0, $t8 - ldx.hu $s0, $s2, $t1 - add.d $a7, $t8, $a7 - ldx.hu $t8, $s2, $a2 - ldx.hu $s1, $s2, $t3 - alsl.d $s0, $s0, $s0, 2 - ldx.hu $s3, $s2, $t4 - sub.d $t8, $t8, $s0 - mul.d $s0, $s1, $t2 - add.d $t8, $s0, $t8 - mul.d $s0, $s3, $t2 - add.d $t8, $s0, $t8 - ldx.hu $s0, $s2, $t5 - srai.d $s1, $a5, 63 - andn $s1, $a5, $s1 - slt $s3, $s1, $a4 - maskeqz $s1, $s1, $s3 - masknez $s3, $a4, $s3 - or $s1, $s1, $s3 - slli.d $s1, $s1, 3 - ldx.d $s1, $a0, $s1 - ldx.hu $s2, $s2, $a6 - alsl.d $s0, $s0, $s0, 2 - sub.w $t8, $t8, $s0 - ldx.hu $s0, $s1, $t1 - add.d $t8, $t8, $s2 - ldx.hu $s2, $s1, $a2 - ldx.hu $s3, $s1, $t3 - alsl.d $s0, $s0, $s0, 2 - ldx.hu $s4, $s1, $t4 - sub.d $s0, $s2, $s0 - mul.d $s2, $s3, $t2 - add.d $s0, $s2, $s0 - mul.d $s2, $s4, $t2 - add.d $s0, $s2, $s0 - ldx.hu $s2, $s1, $t5 - slt $s3, $fp, $a5 - masknez $fp, $fp, $s3 - maskeqz $s3, $a5, $s3 - or $fp, $s3, $fp - addi.d $fp, $fp, 1 - slt $s3, $fp, $a4 - maskeqz $fp, $fp, $s3 - masknez $s3, $a4, $s3 - or $fp, $fp, $s3 + ldx.hu $a7, $t4, $a6 + ldx.hu $t2, $t4, $a2 + vadd.w $vr5, $vr8, $vr2 + vmin.w $vr5, $vr5, $vr6 + alsl.d $a7, $a7, $a7, 2 + sub.d $t2, $t2, $a7 + slli.d $a7, $a1, 1 + ldx.hu $t3, $t4, $a7 + slli.d $t1, $t1, 1 + ldx.hu $t6, $t4, $t1 + ori $a1, $zero, 20 + mul.d $t3, $t3, $a1 + add.d $t2, $t3, $t2 + mul.d $t3, $t6, $a1 + add.d $t6, $t3, $t2 + vpickve2gr.w $t2, $vr5, 0 + slli.d $t2, $t2, 1 + ldx.hu $t7, $t4, $t2 + vpickve2gr.w $t3, $vr5, 1 + slli.d $t3, $t3, 1 + vpickve2gr.w $t8, $vr4, 1 + slli.d $t8, $t8, 3 + ldx.d $t8, $a0, $t8 + ldx.hu $t4, $t4, $t3 + alsl.d $t7, $t7, $t7, 2 + sub.d $t6, $t6, $t7 + ldx.hu $t7, $t8, $a6 + add.d $t4, $t6, $t4 + ldx.hu $t6, $t8, $a2 + ldx.hu $fp, $t8, $a7 + alsl.d $t7, $t7, $t7, 2 + ldx.hu $s0, $t8, $t1 + sub.d $t6, $t6, $t7 + mul.d $t7, $fp, $a1 + add.d $t6, $t7, $t6 + mul.d $t7, $s0, $a1 + add.d $t6, $t7, $t6 + ldx.hu $t7, $t8, $t2 + srai.d $fp, $a5, 63 + andn $fp, $a5, $fp + slt $s0, $fp, $a4 + maskeqz $fp, $fp, $s0 + masknez $s0, $a4, $s0 + or $fp, $fp, $s0 slli.d $fp, $fp, 3 - ldx.d $s3, $a0, $fp - ldx.hu $fp, $s1, $a6 - alsl.d $s1, $s2, $s2, 2 - sub.d $s0, $s0, $s1 - ldx.hu $s1, $s3, $t1 - add.d $fp, $s0, $fp - ldx.hu $s0, $s3, $a2 - ldx.hu $s2, $s3, $t3 - alsl.d $s1, $s1, $s1, 2 - ldx.hu $s4, $s3, $t4 - sub.d $s0, $s0, $s1 - mul.d $s1, $s2, $t2 - add.d $s0, $s1, $s0 - mul.d $s1, $s4, $t2 - add.d $s0, $s1, $s0 - ldx.hu $s1, $s3, $t5 - slt $s2, $t7, $a5 - masknez $t7, $t7, $s2 - maskeqz $s2, $a5, $s2 - or $t7, $s2, $t7 - addi.d $t7, $t7, 2 - slt $s2, $t7, $a4 - maskeqz $t7, $t7, $s2 - masknez $s2, $a4, $s2 - or $t7, $t7, $s2 - slli.d $t7, $t7, 3 - ldx.d $s2, $a0, $t7 - ldx.hu $t7, $s3, $a6 - alsl.d $s1, $s1, $s1, 2 - sub.d $s0, $s0, $s1 - ldx.hu $s1, $s2, $t1 - add.d $t7, $s0, $t7 - ldx.hu $s0, $s2, $a2 - ldx.hu $s3, $s2, $t3 - alsl.d $s1, $s1, $s1, 2 - ldx.hu $s4, $s2, $t4 - sub.d $s0, $s0, $s1 - mul.d $s1, $s3, $t2 - add.d $s0, $s1, $s0 - mul.d $s1, $s4, $t2 - add.d $s0, $s1, $s0 - slt $s1, $t6, $a5 - masknez $t6, $t6, $s1 - maskeqz $a5, $a5, $s1 - ldx.hu $s1, $s2, $t5 - or $a5, $a5, $t6 - addi.d $a5, $a5, 3 - slt $t6, $a5, $a4 - maskeqz $a5, $a5, $t6 - masknez $a4, $a4, $t6 + ldx.d $fp, $a0, $fp + ldx.hu $t8, $t8, $t3 + alsl.d $t7, $t7, $t7, 2 + sub.w $t6, $t6, $t7 + ldx.hu $t7, $fp, $a6 + add.d $t6, $t6, $t8 + ldx.hu $t8, $fp, $a2 + ldx.hu $s0, $fp, $a7 + alsl.d $t7, $t7, $t7, 2 + ldx.hu $s1, $fp, $t1 + sub.d $t7, $t8, $t7 + mul.d $t8, $s0, $a1 + add.d $t7, $t8, $t7 + mul.d $t8, $s1, $a1 + add.d $t7, $t8, $t7 + ldx.hu $t8, $fp, $t2 + slt $s0, $t5, $a5 + masknez $t5, $t5, $s0 + maskeqz $a5, $a5, $s0 + or $a5, $a5, $t5 + addi.d $a5, $a5, 1 + slt $t5, $a5, $a4 + maskeqz $a5, $a5, $t5 + masknez $a4, $a4, $t5 or $a4, $a5, $a4 slli.d $a4, $a4, 3 - ldx.d $a0, $a0, $a4 - alsl.d $a4, $s1, $s1, 2 - ldx.hu $a5, $s2, $a6 - sub.d $a4, $s0, $a4 - ldx.hu $t1, $a0, $t1 + ldx.d $a5, $a0, $a4 + ldx.hu $a4, $fp, $t3 + alsl.d $t5, $t8, $t8, 2 + sub.d $t5, $t7, $t5 + ldx.hu $t7, $a5, $a6 + add.d $a4, $t5, $a4 + ldx.hu $t5, $a5, $a2 + ldx.hu $t8, $a5, $a7 + alsl.d $t7, $t7, $t7, 2 + ldx.hu $fp, $a5, $t1 + sub.d $t5, $t5, $t7 + mul.d $t7, $t8, $a1 + add.d $t5, $t7, $t5 + mul.d $t7, $fp, $a1 + add.d $t5, $t7, $t5 + ldx.hu $t7, $a5, $t2 + vmax.w $vr0, $vr0, $vr1 + vadd.w $vr0, $vr0, $vr2 + vmin.w $vr0, $vr0, $vr3 + vpickve2gr.w $t8, $vr0, 0 + slli.d $t8, $t8, 3 + ldx.d $t8, $a0, $t8 + ldx.hu $a5, $a5, $t3 + alsl.d $t7, $t7, $t7, 2 + sub.d $t5, $t5, $t7 + ldx.hu $t7, $t8, $a6 + add.d $a5, $t5, $a5 + ldx.hu $t5, $t8, $a2 + ldx.hu $fp, $t8, $a7 + alsl.d $t7, $t7, $t7, 2 + ldx.hu $s0, $t8, $t1 + sub.d $t5, $t5, $t7 + mul.d $t7, $fp, $a1 + add.d $t5, $t7, $t5 + mul.d $t7, $s0, $a1 + add.d $t5, $t7, $t5 + ldx.hu $t7, $t8, $t2 + vpickve2gr.w $fp, $vr0, 1 + slli.d $fp, $fp, 3 + ldx.d $a0, $a0, $fp + alsl.d $t7, $t7, $t7, 2 + ldx.hu $t8, $t8, $t3 + sub.d $t5, $t5, $t7 + ldx.hu $a6, $a0, $a6 ldx.hu $a2, $a0, $a2 - add.w $a4, $a4, $a5 - ldx.hu $a5, $a0, $t3 - alsl.d $t1, $t1, $t1, 2 - sub.d $a2, $a2, $t1 - ldx.hu $t1, $a0, $t4 - mul.d $a5, $a5, $t2 - add.d $a2, $a5, $a2 - ldx.hu $a5, $a0, $t5 - mul.d $t1, $t1, $t2 - ldx.hu $a0, $a0, $a6 - add.d $a2, $t1, $a2 - alsl.d $a5, $a5, $a5, 2 - sub.d $a2, $a2, $a5 + add.w $t5, $t5, $t8 + ldx.hu $a7, $a0, $a7 + alsl.d $a6, $a6, $a6, 2 + sub.d $a2, $a2, $a6 + ldx.hu $a6, $a0, $t1 + mul.d $a7, $a7, $a1 + add.d $a2, $a7, $a2 + ldx.hu $a7, $a0, $t2 + mul.d $a6, $a6, $a1 + ldx.hu $a0, $a0, $t3 + add.d $a2, $a6, $a2 + alsl.d $a6, $a7, $a7, 2 + sub.d $a2, $a2, $a6 add.d $a0, $a2, $a0 - alsl.d $a2, $t8, $t8, 2 - sub.d $a2, $a7, $a2 - mul.d $a5, $fp, $t2 - add.d $a2, $a5, $a2 - mul.d $a5, $t7, $t2 - add.d $a2, $a5, $a2 - alsl.d $a4, $a4, $a4, 2 - sub.d $a2, $a2, $a4 - add.d $a2, $a0, $a2 + alsl.d $a2, $t6, $t6, 2 + sub.d $a2, $t4, $a2 + mul.d $a6, $a4, $a1 + add.d $a2, $a6, $a2 + mul.d $a1, $a5, $a1 + add.d $a1, $a1, $a2 + alsl.d $a2, $t5, $t5, 2 + sub.d $a1, $a1, $a2 + add.d $a1, $a0, $a1 ldptr.w $a0, $a3, 15520 - addi.w $a2, $a2, 512 - bstrpick.d $a3, $a2, 62, 53 - add.w $a2, $a2, $a3 - srai.d $a3, $a2, 10 - srai.d $a2, $a2, 63 - andn $a2, $a3, $a2 - slt $a3, $a2, $a0 - maskeqz $a2, $a2, $a3 - masknez $a3, $a0, $a3 - or $a3, $a2, $a3 - beq $t0, $a1, .LBB3_19 + addi.w $a1, $a1, 512 + bstrpick.d $a2, $a1, 62, 53 + add.w $a1, $a1, $a2 + srai.d $a2, $a1, 10 + srai.d $a1, $a1, 63 + andn $a1, $a2, $a1 + slt $a2, $a1, $a0 + maskeqz $a1, $a1, $a2 + masknez $a2, $a0, $a2 + ori $a3, $zero, 1 + or $a2, $a1, $a2 + beq $t0, $a3, .LBB3_19 # %bb.5: # %.preheader170.preheader ori $a1, $zero, 3 - beq $t0, $a1, .LBB3_17 - b .LBB3_24 -.LBB3_6: + bne $t0, $a1, .LBB3_25 +# %bb.6: + addi.w $a1, $a5, 16 + b .LBB3_22 +.LBB3_7: srai.d $a1, $a5, 63 andn $a1, $a5, $a1 slt $a2, $a1, $a4 @@ -1149,9 +1111,9 @@ Get_Reference_Pixel: # @Get_Reference_Pixel masknez $a2, $a6, $a2 or $a1, $a1, $a2 slli.d $a1, $a1, 1 - ldx.hu $a3, $a0, $a1 - b .LBB3_24 -.LBB3_7: + ldx.hu $a2, $a0, $a1 + b .LBB3_25 +.LBB3_8: srai.d $a2, $a5, 63 andn $a2, $a5, $a2 slt $a5, $a2, $a4 @@ -1160,32 +1122,26 @@ Get_Reference_Pixel: # @Get_Reference_Pixel or $a2, $a2, $a4 slli.d $a2, $a2, 3 ldx.d $a0, $a0, $a2 + vinsgr2vr.w $vr0, $a7, 0 + vinsgr2vr.w $vr0, $a7, 1 ori $a2, $zero, 2 - slt $a4, $a2, $a7 - masknez $a2, $a2, $a4 - maskeqz $a4, $a7, $a4 - or $a2, $a4, $a2 - addi.d $a2, $a2, -2 - slt $a4, $a2, $a6 - maskeqz $a2, $a2, $a4 - masknez $a4, $a6, $a4 - or $a2, $a2, $a4 - slli.d $a4, $a2, 1 - ori $a2, $zero, 1 - slt $a5, $a2, $a7 - masknez $t0, $a2, $a5 - maskeqz $a5, $a7, $a5 - or $a5, $a5, $t0 - addi.d $a5, $a5, -1 - slt $t0, $a5, $a6 - maskeqz $a5, $a5, $t0 - masknez $t0, $a6, $t0 - or $a5, $a5, $t0 - slli.d $a5, $a5, 1 - ldx.hu $a5, $a0, $a5 + lu32i.d $a2, 1 + vreplgr2vr.d $vr1, $a2 + vmax.w $vr1, $vr0, $vr1 + vrepli.d $vr2, -2 + vadd.w $vr1, $vr1, $vr2 + vinsgr2vr.w $vr2, $a6, 0 + vinsgr2vr.w $vr2, $a6, 1 + vmin.w $vr1, $vr1, $vr2 + vpickve2gr.w $a2, $vr1, 0 + slli.d $a2, $a2, 1 + vpickve2gr.w $a4, $vr1, 1 + slli.d $a4, $a4, 1 ldx.hu $a4, $a0, $a4 - alsl.d $a5, $a5, $a5, 2 - sub.d $t0, $a4, $a5 + ldx.hu $a2, $a0, $a2 + ori $t0, $zero, 2 + alsl.d $a4, $a4, $a4, 2 + sub.d $a2, $a2, $a4 srai.d $a4, $a7, 63 andn $a4, $a7, $a4 slt $a5, $a4, $a6 @@ -1197,424 +1153,367 @@ Get_Reference_Pixel: # @Get_Reference_Pixel addi.w $a5, $zero, -1 slt $t1, $a5, $a7 masknez $a5, $a5, $t1 - maskeqz $t1, $a7, $t1 - or $a5, $t1, $a5 + maskeqz $a7, $a7, $t1 + or $a5, $a7, $a5 addi.d $a5, $a5, 1 - slt $t1, $a5, $a6 - maskeqz $a5, $a5, $t1 - masknez $t1, $a6, $t1 - or $a5, $a5, $t1 + slt $a7, $a5, $a6 + maskeqz $a5, $a5, $a7 + masknez $a6, $a6, $a7 + or $a5, $a5, $a6 slli.d $a5, $a5, 1 - ldx.hu $t1, $a0, $a5 - ori $t2, $zero, 20 - mul.d $t3, $a4, $t2 - add.d $t0, $t3, $t0 - mul.d $t1, $t1, $t2 - addi.w $t2, $zero, -2 - slt $t3, $t2, $a7 - masknez $t2, $t2, $t3 - maskeqz $t3, $a7, $t3 - or $t2, $t3, $t2 - addi.d $t2, $t2, 2 - slt $t3, $t2, $a6 - maskeqz $t2, $t2, $t3 - masknez $t3, $a6, $t3 - or $t2, $t2, $t3 - addi.w $t3, $zero, -3 - slt $t4, $t3, $a7 - masknez $t3, $t3, $t4 - maskeqz $a7, $a7, $t4 - or $a7, $a7, $t3 - addi.d $a7, $a7, 3 - slt $t3, $a7, $a6 - maskeqz $a7, $a7, $t3 - masknez $a6, $a6, $t3 - slli.d $t2, $t2, 1 - ldx.hu $t2, $a0, $t2 - or $a6, $a7, $a6 - slli.d $a6, $a6, 1 - ldx.hu $a6, $a0, $a6 - add.d $a7, $t1, $t0 - alsl.d $t0, $t2, $t2, 2 - sub.d $a7, $a7, $t0 - add.d $a6, $a7, $a6 + ldx.hu $a6, $a0, $a5 + ori $a7, $zero, 20 + mul.d $t1, $a4, $a7 + add.d $a2, $t1, $a2 + mul.d $a6, $a6, $a7 + addi.w $a7, $zero, -2 + lu32i.d $a7, -3 + vreplgr2vr.d $vr1, $a7 + vmax.w $vr0, $vr0, $vr1 + lu32i.d $t0, 3 + vreplgr2vr.d $vr1, $t0 + vadd.w $vr0, $vr0, $vr1 + vmin.w $vr0, $vr0, $vr2 + vpickve2gr.w $a7, $vr0, 0 + slli.d $a7, $a7, 1 + ldx.hu $a7, $a0, $a7 + vpickve2gr.w $t0, $vr0, 1 + slli.d $t0, $t0, 1 + ldx.hu $t0, $a0, $t0 + add.d $a2, $a6, $a2 + alsl.d $a6, $a7, $a7, 2 + sub.d $a2, $a2, $a6 + add.d $a2, $a2, $t0 ldptr.w $a3, $a3, 15520 - addi.w $a6, $a6, 16 - bstrpick.d $a7, $a6, 62, 58 - add.w $a6, $a6, $a7 - srai.d $a7, $a6, 5 - srai.d $a6, $a6, 63 - andn $a6, $a7, $a6 - slt $a7, $a6, $a3 - maskeqz $a6, $a6, $a7 - masknez $a3, $a3, $a7 - ori $a7, $zero, 3 - or $a3, $a6, $a3 - beq $a1, $a7, .LBB3_16 -# %bb.8: - bne $a1, $a2, .LBB3_24 + addi.w $a2, $a2, 16 + bstrpick.d $a6, $a2, 62, 58 + add.w $a2, $a2, $a6 + srai.d $a6, $a2, 5 + srai.d $a2, $a2, 63 + andn $a2, $a6, $a2 + slt $a6, $a2, $a3 + maskeqz $a2, $a2, $a6 + masknez $a3, $a3, $a6 + ori $a6, $zero, 3 + or $a2, $a2, $a3 + beq $a1, $a6, .LBB3_17 # %bb.9: - add.d $a0, $a3, $a4 - b .LBB3_23 -.LBB3_10: + ori $a0, $zero, 1 + bne $a1, $a0, .LBB3_25 +# %bb.10: + add.d $a0, $a2, $a4 + b .LBB3_24 +.LBB3_11: srai.d $a1, $a7, 63 andn $a1, $a7, $a1 slt $a2, $a1, $a6 - maskeqz $a7, $a1, $a2 + maskeqz $a1, $a1, $a2 masknez $a2, $a6, $a2 - ori $a1, $zero, 2 - slt $a6, $a1, $a5 - masknez $a1, $a1, $a6 - maskeqz $a6, $a5, $a6 - or $a1, $a6, $a1 - addi.d $a1, $a1, -2 - slt $a6, $a1, $a4 - maskeqz $a1, $a1, $a6 - masknez $a6, $a4, $a6 - or $a1, $a1, $a6 - slli.d $a1, $a1, 3 - ldx.d $a6, $a0, $a1 - ori $a1, $zero, 1 - slt $t1, $a1, $a5 - masknez $t2, $a1, $t1 - maskeqz $t1, $a5, $t1 - or $t1, $t1, $t2 - addi.d $t1, $t1, -1 - slt $t2, $t1, $a4 - maskeqz $t1, $t1, $t2 - masknez $t2, $a4, $t2 - or $t1, $t1, $t2 - slli.d $t1, $t1, 3 - ldx.d $t1, $a0, $t1 - or $a2, $a7, $a2 - slli.d $a2, $a2, 1 - ldx.hu $a6, $a6, $a2 - ldx.hu $a7, $t1, $a2 - srai.d $t1, $a5, 63 - andn $t1, $a5, $t1 - slt $t2, $t1, $a4 - maskeqz $t1, $t1, $t2 - masknez $t2, $a4, $t2 - or $t1, $t1, $t2 - slli.d $t1, $t1, 3 - ldx.d $t1, $a0, $t1 - alsl.d $a7, $a7, $a7, 2 - sub.d $t2, $a6, $a7 - ldx.hu $a6, $t1, $a2 - addi.w $a7, $zero, -1 - slt $t1, $a7, $a5 - masknez $a7, $a7, $t1 - maskeqz $t1, $a5, $t1 - or $a7, $t1, $a7 - addi.d $a7, $a7, 1 + or $a1, $a1, $a2 + vinsgr2vr.w $vr0, $a5, 0 + vinsgr2vr.w $vr0, $a5, 1 + ori $a2, $zero, 2 + lu32i.d $a2, 1 + vreplgr2vr.d $vr1, $a2 + vmax.w $vr1, $vr0, $vr1 + vrepli.d $vr2, -2 + vadd.w $vr1, $vr1, $vr2 + vinsgr2vr.w $vr2, $a4, 0 + vinsgr2vr.w $vr2, $a4, 1 + vmin.w $vr1, $vr1, $vr2 + vpickve2gr.w $a2, $vr1, 0 + vpickve2gr.w $a6, $vr1, 1 + slli.d $a6, $a6, 3 + ldx.d $a6, $a0, $a6 + slli.d $a2, $a2, 3 + ldx.d $a2, $a0, $a2 + slli.d $a1, $a1, 1 + ldx.hu $a6, $a6, $a1 + srai.d $a7, $a5, 63 + andn $a7, $a5, $a7 slt $t1, $a7, $a4 maskeqz $a7, $a7, $t1 masknez $t1, $a4, $t1 or $a7, $a7, $t1 slli.d $a7, $a7, 3 - addi.w $t1, $zero, -2 - slt $t3, $t1, $a5 - masknez $t1, $t1, $t3 - maskeqz $t3, $a5, $t3 - or $t1, $t3, $t1 - addi.d $t1, $t1, 2 - slt $t3, $t1, $a4 - maskeqz $t1, $t1, $t3 - masknez $t3, $a4, $t3 - or $t1, $t1, $t3 - addi.w $t3, $zero, -3 - slt $t4, $t3, $a5 - masknez $t3, $t3, $t4 - maskeqz $a5, $a5, $t4 - ldx.d $t4, $a0, $a7 - or $a5, $a5, $t3 - addi.d $a5, $a5, 3 - slt $t3, $a5, $a4 + ldx.d $a7, $a0, $a7 + ori $t1, $zero, 2 + ldx.hu $a2, $a2, $a1 + alsl.d $t2, $a6, $a6, 2 + ldx.hu $a6, $a7, $a1 + addi.w $a7, $zero, -1 + slt $t3, $a7, $a5 + masknez $a7, $a7, $t3 maskeqz $a5, $a5, $t3 - masknez $a4, $a4, $t3 - ori $t3, $zero, 20 + or $a5, $a5, $a7 + addi.d $a5, $a5, 1 + slt $a7, $a5, $a4 + maskeqz $a5, $a5, $a7 + masknez $a4, $a4, $a7 or $a4, $a5, $a4 - mul.d $a5, $a6, $t3 - ldx.hu $t4, $t4, $a2 + slli.d $a4, $a4, 3 + ldx.d $a5, $a0, $a4 + sub.d $a2, $a2, $t2 + ori $a7, $zero, 20 + mul.d $t2, $a6, $a7 + ldx.hu $a5, $a5, $a1 + addi.w $t3, $zero, -2 + lu32i.d $t3, -3 + vreplgr2vr.d $vr1, $t3 + vmax.w $vr0, $vr0, $vr1 + lu32i.d $t1, 3 + vreplgr2vr.d $vr1, $t1 + vadd.w $vr0, $vr0, $vr1 + vmin.w $vr0, $vr0, $vr2 + vpickve2gr.w $t1, $vr0, 0 slli.d $t1, $t1, 3 ldx.d $t1, $a0, $t1 - slli.d $a4, $a4, 3 - ldx.d $a4, $a0, $a4 - add.d $a5, $a5, $t2 - ldx.hu $t1, $t1, $a2 - mul.d $t2, $t4, $t3 - ldx.hu $a4, $a4, $a2 - add.d $a5, $t2, $a5 - alsl.d $t1, $t1, $t1, 2 - sub.d $a5, $a5, $t1 - add.d $a4, $a5, $a4 + vpickve2gr.w $t3, $vr0, 1 + slli.d $t3, $t3, 3 + ldx.d $t3, $a0, $t3 + add.d $a2, $t2, $a2 + ldx.hu $t1, $t1, $a1 + mul.d $a5, $a5, $a7 + ldx.hu $a7, $t3, $a1 + add.d $a2, $a5, $a2 + alsl.d $a5, $t1, $t1, 2 + sub.d $a2, $a2, $a5 + add.d $a2, $a2, $a7 ldptr.w $a3, $a3, 15520 - addi.w $a4, $a4, 16 - bstrpick.d $a5, $a4, 62, 58 - add.w $a4, $a4, $a5 - srai.d $a5, $a4, 5 - srai.d $a4, $a4, 63 - andn $a4, $a5, $a4 - slt $a5, $a4, $a3 - maskeqz $a4, $a4, $a5 + addi.w $a2, $a2, 16 + bstrpick.d $a5, $a2, 62, 58 + add.w $a2, $a2, $a5 + srai.d $a5, $a2, 5 + srai.d $a2, $a2, 63 + andn $a2, $a5, $a2 + slt $a5, $a2, $a3 + maskeqz $a2, $a2, $a5 masknez $a3, $a3, $a5 ori $a5, $zero, 3 - or $a3, $a4, $a3 - beq $t0, $a5, .LBB3_21 -# %bb.11: - bne $t0, $a1, .LBB3_24 + or $a2, $a2, $a3 + beq $t0, $a5, .LBB3_20 # %bb.12: - add.d $a0, $a3, $a6 - b .LBB3_23 -.LBB3_13: + ori $a0, $zero, 1 + bne $t0, $a0, .LBB3_25 +# %bb.13: + add.d $a0, $a2, $a6 + b .LBB3_24 +.LBB3_14: bne $t0, $a2, .LBB3_18 -# %bb.14: # %.preheader168.preheader +# %bb.15: # %.preheader168.preheader + vinsgr2vr.w $vr1, $a5, 0 + vinsgr2vr.w $vr1, $a5, 1 + ori $t2, $zero, 2 ori $a2, $zero, 2 - slt $t0, $a2, $a5 - masknez $t1, $a2, $t0 - maskeqz $t0, $a5, $t0 - or $t0, $t0, $t1 - addi.d $t0, $t0, -2 - slt $t1, $t0, $a4 - maskeqz $t0, $t0, $t1 - masknez $t1, $a4, $t1 - or $t0, $t0, $t1 - slli.d $t1, $t0, 3 - slt $t0, $a2, $a7 - masknez $a2, $a2, $t0 - maskeqz $t0, $a7, $t0 - or $a2, $t0, $a2 - addi.d $a2, $a2, -2 - slt $t0, $a2, $a6 - maskeqz $t2, $a2, $t0 - masknez $t3, $a6, $t0 - ori $a2, $zero, 1 - slt $t0, $a2, $a5 - masknez $t4, $a2, $t0 - maskeqz $t0, $a5, $t0 - or $t0, $t0, $t4 - addi.d $t0, $t0, -1 - slt $t4, $t0, $a4 - maskeqz $t0, $t0, $t4 - masknez $t4, $a4, $t4 - or $t0, $t0, $t4 + lu32i.d $a2, 1 + vreplgr2vr.d $vr2, $a2 + vmax.w $vr3, $vr1, $vr2 + vinsgr2vr.w $vr0, $a7, 0 + vinsgr2vr.w $vr0, $a7, 1 + vmax.w $vr2, $vr0, $vr2 + vrepli.d $vr4, -2 + vadd.w $vr3, $vr3, $vr4 + vinsgr2vr.w $vr5, $a4, 0 + vinsgr2vr.w $vr5, $a4, 1 + vmin.w $vr3, $vr3, $vr5 + vpickve2gr.w $a2, $vr3, 0 + slli.d $a2, $a2, 3 + ldx.d $a2, $a0, $a2 + vpickve2gr.w $t0, $vr3, 1 slli.d $t0, $t0, 3 ldx.d $t0, $a0, $t0 - or $t2, $t2, $t3 + srai.d $t1, $a5, 63 + andn $t1, $a5, $t1 + slt $t3, $t1, $a4 + maskeqz $t1, $t1, $t3 + masknez $t3, $a4, $t3 + or $t1, $t1, $t3 + slli.d $t1, $t1, 3 ldx.d $t1, $a0, $t1 - slli.d $fp, $t2, 1 - ldx.hu $t2, $t0, $fp - ldx.hu $t5, $t1, $fp - alsl.d $t2, $t2, $t2, 2 - srai.d $t3, $a5, 63 - andn $t3, $a5, $t3 - slt $t4, $t3, $a4 - maskeqz $t3, $t3, $t4 - masknez $t4, $a4, $t4 - or $t3, $t3, $t4 - slli.d $t3, $t3, 3 - ldx.d $t3, $a0, $t3 - addi.w $t8, $zero, -1 - slt $t4, $t8, $a5 - masknez $t6, $t8, $t4 - maskeqz $t4, $a5, $t4 - or $t4, $t4, $t6 - addi.d $t4, $t4, 1 - slt $t6, $t4, $a4 - maskeqz $t4, $t4, $t6 - masknez $t6, $a4, $t6 - or $t4, $t4, $t6 - slli.d $t4, $t4, 3 - ldx.d $t4, $a0, $t4 - sub.d $t6, $t5, $t2 - ldx.hu $s0, $t3, $fp - ori $t2, $zero, 20 - ldx.hu $s1, $t4, $fp - addi.w $t7, $zero, -2 - slt $t5, $t7, $a5 - masknez $s2, $t7, $t5 - maskeqz $t5, $a5, $t5 - or $t5, $t5, $s2 - addi.d $t5, $t5, 2 - slt $s2, $t5, $a4 - maskeqz $t5, $t5, $s2 - masknez $s2, $a4, $s2 - or $t5, $t5, $s2 - slli.d $t5, $t5, 3 - ldx.d $t5, $a0, $t5 - mul.d $s0, $s0, $t2 - add.d $s0, $s0, $t6 - mul.d $s1, $s1, $t2 - ldx.hu $s2, $t5, $fp - addi.w $t6, $zero, -3 - slt $s3, $t6, $a5 - maskeqz $a5, $a5, $s3 - masknez $s3, $t6, $s3 - or $a5, $a5, $s3 - addi.d $a5, $a5, 3 - slt $s3, $a5, $a4 - maskeqz $a5, $a5, $s3 - masknez $a4, $a4, $s3 + addi.w $t5, $zero, -1 + slt $t3, $t5, $a5 + masknez $t4, $t5, $t3 + maskeqz $a5, $a5, $t3 + or $a5, $a5, $t4 + addi.d $a5, $a5, 1 + slt $t3, $a5, $a4 + maskeqz $a5, $a5, $t3 + masknez $a4, $a4, $t3 or $a4, $a5, $a4 slli.d $a4, $a4, 3 - ldx.d $a0, $a0, $a4 - add.d $a4, $s1, $s0 - alsl.d $a5, $s2, $s2, 2 - sub.d $a4, $a4, $a5 - ldx.hu $a5, $a0, $fp - slt $fp, $a2, $a7 - masknez $s0, $a2, $fp - maskeqz $fp, $a7, $fp - or $fp, $fp, $s0 - addi.d $fp, $fp, -1 - slt $s0, $fp, $a6 - maskeqz $fp, $fp, $s0 - masknez $s0, $a6, $s0 - or $fp, $fp, $s0 - slli.d $fp, $fp, 1 - ldx.hu $s0, $t0, $fp - ldx.hu $s1, $t1, $fp - add.d $a4, $a4, $a5 - ldx.hu $a5, $t3, $fp - alsl.d $s0, $s0, $s0, 2 - sub.d $s0, $s1, $s0 - ldx.hu $s1, $t4, $fp - mul.d $a5, $a5, $t2 - ldx.hu $s2, $t5, $fp - add.d $a5, $a5, $s0 - mul.d $s0, $s1, $t2 - add.d $a5, $s0, $a5 - alsl.d $s0, $s2, $s2, 2 - sub.w $a5, $a5, $s0 - ldx.hu $fp, $a0, $fp - srai.d $s0, $a7, 63 - andn $s0, $a7, $s0 - slt $s1, $s0, $a6 - maskeqz $s0, $s0, $s1 - masknez $s1, $a6, $s1 - or $s0, $s0, $s1 - slli.d $s0, $s0, 1 - ldx.hu $s1, $t0, $s0 - ldx.hu $s2, $t1, $s0 - add.d $a5, $a5, $fp - ldx.hu $fp, $t3, $s0 - alsl.d $s1, $s1, $s1, 2 - sub.d $s1, $s2, $s1 - ldx.hu $s2, $t4, $s0 - mul.d $fp, $fp, $t2 - ldx.hu $s3, $t5, $s0 - add.d $fp, $fp, $s1 - mul.d $s1, $s2, $t2 - add.d $fp, $s1, $fp - alsl.d $s1, $s3, $s3, 2 - sub.d $fp, $fp, $s1 - ldx.hu $s0, $a0, $s0 - slt $s1, $t8, $a7 - masknez $t8, $t8, $s1 - maskeqz $s1, $a7, $s1 - or $t8, $s1, $t8 - addi.d $t8, $t8, 1 - slt $s1, $t8, $a6 - maskeqz $t8, $t8, $s1 - masknez $s1, $a6, $s1 - or $t8, $t8, $s1 - slli.d $s1, $t8, 1 - ldx.hu $s2, $t0, $s1 - ldx.hu $s3, $t1, $s1 - add.d $t8, $fp, $s0 - ldx.hu $fp, $t3, $s1 - alsl.d $s0, $s2, $s2, 2 - sub.d $s0, $s3, $s0 - ldx.hu $s2, $t4, $s1 - mul.d $fp, $fp, $t2 - ldx.hu $s3, $t5, $s1 - add.d $fp, $fp, $s0 - mul.d $s0, $s2, $t2 - add.d $fp, $s0, $fp - alsl.d $s0, $s3, $s3, 2 - sub.d $fp, $fp, $s0 - ldx.hu $s0, $a0, $s1 - slt $s1, $t7, $a7 - masknez $t7, $t7, $s1 - maskeqz $s1, $a7, $s1 - or $t7, $s1, $t7 - addi.d $t7, $t7, 2 - slt $s1, $t7, $a6 - maskeqz $t7, $t7, $s1 - masknez $s1, $a6, $s1 - or $t7, $t7, $s1 - slli.d $s1, $t7, 1 - ldx.hu $s2, $t0, $s1 - ldx.hu $s3, $t1, $s1 - add.d $t7, $fp, $s0 - ldx.hu $fp, $t3, $s1 - alsl.d $s0, $s2, $s2, 2 - sub.d $s0, $s3, $s0 - ldx.hu $s2, $t4, $s1 - mul.d $fp, $fp, $t2 - ldx.hu $s3, $t5, $s1 - add.d $fp, $fp, $s0 - mul.d $s0, $s2, $t2 - add.d $fp, $s0, $fp - alsl.d $s0, $s3, $s3, 2 - sub.d $fp, $fp, $s0 - ldx.hu $s0, $a0, $s1 - slt $s1, $t6, $a7 - masknez $t6, $t6, $s1 - maskeqz $a7, $a7, $s1 - or $a7, $a7, $t6 + ldx.d $a4, $a0, $a4 + addi.w $a5, $zero, -2 + lu32i.d $a5, -3 + vreplgr2vr.d $vr3, $a5 + vmax.w $vr1, $vr1, $vr3 + lu32i.d $t2, 3 + vreplgr2vr.d $vr3, $t2 + vadd.w $vr1, $vr1, $vr3 + vmin.w $vr1, $vr1, $vr5 + vpickve2gr.w $a5, $vr1, 0 + slli.d $a5, $a5, 3 + ldx.d $a5, $a0, $a5 + vpickve2gr.w $t2, $vr1, 1 + vadd.w $vr2, $vr2, $vr4 + vinsgr2vr.w $vr1, $a6, 0 + vinsgr2vr.w $vr1, $a6, 1 + vmin.w $vr2, $vr2, $vr1 + vpickve2gr.w $t3, $vr2, 0 + slli.d $t3, $t3, 1 + ldx.hu $t4, $t0, $t3 + slli.d $t2, $t2, 3 + ldx.hu $t6, $a2, $t3 + ldx.d $t2, $a0, $t2 + alsl.d $a0, $t4, $t4, 2 + ldx.hu $t4, $t1, $t3 + sub.d $t6, $t6, $a0 + ori $a0, $zero, 20 + ldx.hu $t7, $a4, $t3 + mul.d $t4, $t4, $a0 + ldx.hu $t8, $a5, $t3 + add.d $t4, $t4, $t6 + mul.d $t6, $t7, $a0 + add.d $t4, $t6, $t4 + alsl.d $t6, $t8, $t8, 2 + sub.d $t4, $t4, $t6 + ldx.hu $t3, $t2, $t3 + vpickve2gr.w $t6, $vr2, 1 + slli.d $t6, $t6, 1 + ldx.hu $t7, $t0, $t6 + ldx.hu $t8, $a2, $t6 + add.d $t3, $t4, $t3 + ldx.hu $t4, $t1, $t6 + alsl.d $t7, $t7, $t7, 2 + sub.d $t7, $t8, $t7 + ldx.hu $t8, $a4, $t6 + mul.d $t4, $t4, $a0 + ldx.hu $fp, $a5, $t6 + add.d $t4, $t4, $t7 + mul.d $t7, $t8, $a0 + add.d $t4, $t7, $t4 + alsl.d $t7, $fp, $fp, 2 + sub.w $t4, $t4, $t7 + ldx.hu $t6, $t2, $t6 + srai.d $t7, $a7, 63 + andn $t7, $a7, $t7 + slt $t8, $t7, $a6 + maskeqz $t7, $t7, $t8 + masknez $t8, $a6, $t8 + or $t7, $t7, $t8 + slli.d $t7, $t7, 1 + ldx.hu $t8, $t0, $t7 + ldx.hu $fp, $a2, $t7 + add.d $t4, $t4, $t6 + ldx.hu $t6, $t1, $t7 + alsl.d $t8, $t8, $t8, 2 + sub.d $t8, $fp, $t8 + ldx.hu $fp, $a4, $t7 + mul.d $t6, $t6, $a0 + ldx.hu $s0, $a5, $t7 + add.d $t6, $t6, $t8 + mul.d $t8, $fp, $a0 + add.d $t6, $t8, $t6 + alsl.d $t8, $s0, $s0, 2 + sub.d $t6, $t6, $t8 + ldx.hu $t7, $t2, $t7 + lu32i.d $t5, -2 + vreplgr2vr.d $vr2, $t5 + vmax.w $vr0, $vr0, $vr2 + ori $t5, $zero, 1 + lu32i.d $t5, 2 + vreplgr2vr.d $vr2, $t5 + vadd.w $vr0, $vr0, $vr2 + vmin.w $vr0, $vr0, $vr1 + vpickve2gr.w $t5, $vr0, 0 + slli.d $t8, $t5, 1 + ldx.hu $fp, $t0, $t8 + ldx.hu $s0, $a2, $t8 + add.d $t5, $t6, $t7 + ldx.hu $t6, $t1, $t8 + alsl.d $t7, $fp, $fp, 2 + sub.d $t7, $s0, $t7 + ldx.hu $fp, $a4, $t8 + mul.d $t6, $t6, $a0 + ldx.hu $s0, $a5, $t8 + add.d $t6, $t6, $t7 + mul.d $t7, $fp, $a0 + add.d $t6, $t7, $t6 + alsl.d $t7, $s0, $s0, 2 + sub.d $t6, $t6, $t7 + ldx.hu $t7, $t2, $t8 + vpickve2gr.w $t8, $vr0, 1 + slli.d $t8, $t8, 1 + ldx.hu $fp, $t0, $t8 + ldx.hu $s0, $a2, $t8 + add.d $t6, $t6, $t7 + ldx.hu $t7, $t1, $t8 + alsl.d $fp, $fp, $fp, 2 + sub.d $fp, $s0, $fp + ldx.hu $s0, $a4, $t8 + mul.d $t7, $t7, $a0 + ldx.hu $s1, $a5, $t8 + add.d $t7, $t7, $fp + mul.d $fp, $s0, $a0 + add.d $t7, $fp, $t7 + alsl.d $fp, $s1, $s1, 2 + sub.d $t7, $t7, $fp + addi.w $fp, $zero, -3 + slt $s0, $fp, $a7 + masknez $fp, $fp, $s0 + maskeqz $a7, $a7, $s0 + ori $s0, $zero, 1 + ldx.hu $t8, $t2, $t8 + or $a7, $a7, $fp addi.d $a7, $a7, 3 - slt $t6, $a7, $a6 - maskeqz $a7, $a7, $t6 - masknez $a6, $a6, $t6 + slt $fp, $a7, $a6 + maskeqz $a7, $a7, $fp + masknez $a6, $a6, $fp or $a6, $a7, $a6 slli.d $a6, $a6, 1 ldx.hu $a7, $t0, $a6 - ldx.hu $t0, $t1, $a6 - add.w $t1, $fp, $s0 - ldx.hu $t3, $t3, $a6 + ldx.hu $a2, $a2, $a6 + add.w $t0, $t7, $t8 + ldx.hu $t1, $t1, $a6 alsl.d $a7, $a7, $a7, 2 - sub.d $a7, $t0, $a7 - ldx.hu $t0, $t4, $a6 - mul.d $t3, $t3, $t2 - add.d $a7, $t3, $a7 - ldx.hu $t3, $t5, $a6 - mul.d $t0, $t0, $t2 - ldx.hu $a0, $a0, $a6 - add.d $a6, $t0, $a7 - alsl.d $a7, $t3, $t3, 2 - sub.d $a6, $a6, $a7 - add.d $a0, $a6, $a0 - alsl.d $a5, $a5, $a5, 2 - sub.d $a4, $a4, $a5 - mul.d $a5, $t8, $t2 - add.d $a4, $a5, $a4 - mul.d $a5, $t7, $t2 + sub.d $a2, $a2, $a7 + ldx.hu $a4, $a4, $a6 + mul.d $a7, $t1, $a0 + add.d $a2, $a7, $a2 + ldx.hu $a5, $a5, $a6 + mul.d $a4, $a4, $a0 + ldx.hu $a6, $t2, $a6 + add.d $a2, $a4, $a2 + alsl.d $a4, $a5, $a5, 2 + sub.d $a2, $a2, $a4 + add.d $a2, $a2, $a6 + alsl.d $a4, $t4, $t4, 2 + sub.d $a4, $t3, $a4 + mul.d $a5, $t5, $a0 add.d $a4, $a5, $a4 - alsl.d $a5, $t1, $t1, 2 - sub.d $a4, $a4, $a5 - add.d $a4, $a0, $a4 + mul.d $a0, $t6, $a0 + add.d $a0, $a0, $a4 + alsl.d $a4, $t0, $t0, 2 + sub.d $a0, $a0, $a4 + add.d $a2, $a2, $a0 ldptr.w $a0, $a3, 15520 - addi.w $a3, $a4, 512 - bstrpick.d $a4, $a3, 62, 53 - add.w $a3, $a3, $a4 - srai.d $a4, $a3, 10 - srai.d $a3, $a3, 63 - andn $a3, $a4, $a3 - slt $a4, $a3, $a0 - maskeqz $a3, $a3, $a4 - masknez $a4, $a0, $a4 - or $a3, $a3, $a4 - bne $a1, $a2, .LBB3_17 -# %bb.15: - addi.w $a1, $t8, 16 - b .LBB3_20 -.LBB3_16: - ldx.hu $a0, $a0, $a5 + addi.w $a2, $a2, 512 + bstrpick.d $a3, $a2, 62, 53 + add.w $a2, $a2, $a3 + srai.d $a3, $a2, 10 + srai.d $a2, $a2, 63 + andn $a2, $a3, $a2 + slt $a3, $a2, $a0 + maskeqz $a2, $a2, $a3 + masknez $a3, $a0, $a3 + or $a2, $a2, $a3 + bne $a1, $s0, .LBB3_21 +# %bb.16: + addi.w $a1, $t5, 16 b .LBB3_22 .LBB3_17: - addi.w $a1, $t7, 16 - b .LBB3_20 + ldx.hu $a0, $a0, $a5 + add.d $a0, $a2, $a0 + b .LBB3_24 .LBB3_18: addi.d $a2, $t0, -1 sltu $a2, $zero, $a2 @@ -1627,28 +1526,22 @@ Get_Reference_Pixel: # @Get_Reference_Pixel or $a2, $a2, $t0 slli.d $a2, $a2, 3 ldx.d $t4, $a0, $a2 - ori $fp, $zero, 2 - slt $a2, $fp, $a7 - masknez $t0, $fp, $a2 - maskeqz $a2, $a7, $a2 - or $a2, $a2, $t0 - addi.d $a2, $a2, -2 - slt $t0, $a2, $a6 - maskeqz $a2, $a2, $t0 - masknez $t0, $a6, $t0 - or $a2, $a2, $t0 + vinsgr2vr.w $vr0, $a7, 0 + vinsgr2vr.w $vr0, $a7, 1 + ori $t6, $zero, 2 + ori $a2, $zero, 2 + lu32i.d $a2, 1 + vreplgr2vr.d $vr1, $a2 + vmax.w $vr1, $vr0, $vr1 + vrepli.d $vr2, -2 + vadd.w $vr1, $vr1, $vr2 + vinsgr2vr.w $vr2, $a6, 0 + vinsgr2vr.w $vr2, $a6, 1 + vmin.w $vr1, $vr1, $vr2 + vpickve2gr.w $a2, $vr1, 0 slli.d $a2, $a2, 1 ldx.hu $a2, $t4, $a2 - ori $t8, $zero, 1 - slt $t0, $t8, $a7 - masknez $t1, $t8, $t0 - maskeqz $t0, $a7, $t0 - or $t0, $t0, $t1 - addi.d $t0, $t0, -1 - slt $t1, $t0, $a6 - maskeqz $t0, $t0, $t1 - masknez $t1, $a6, $t1 - or $t0, $t0, $t1 + vpickve2gr.w $t0, $vr1, 1 slli.d $t0, $t0, 1 ldx.hu $t0, $t4, $t0 srai.d $t1, $a7, 63 @@ -1659,9 +1552,9 @@ Get_Reference_Pixel: # @Get_Reference_Pixel or $t1, $t1, $t2 slli.d $t1, $t1, 1 ldx.hu $t1, $t4, $t1 - addi.w $t7, $zero, -1 - slt $t2, $t7, $a7 - masknez $t3, $t7, $t2 + addi.w $t5, $zero, -1 + slt $t2, $t5, $a7 + masknez $t3, $t5, $t2 maskeqz $t2, $a7, $t2 or $t2, $t2, $t3 addi.d $t2, $t2, 1 @@ -1671,30 +1564,21 @@ Get_Reference_Pixel: # @Get_Reference_Pixel or $t2, $t2, $t3 slli.d $t2, $t2, 1 ldx.hu $t2, $t4, $t2 - addi.w $t6, $zero, -2 - slt $t3, $t6, $a7 - masknez $t5, $t6, $t3 - maskeqz $t3, $a7, $t3 - or $t3, $t3, $t5 - addi.d $t3, $t3, 2 - slt $t5, $t3, $a6 - maskeqz $t3, $t3, $t5 - masknez $t5, $a6, $t5 - or $t3, $t3, $t5 + addi.w $t3, $zero, -2 + lu32i.d $t3, -3 + vreplgr2vr.d $vr1, $t3 + vmax.w $vr0, $vr0, $vr1 + ori $t3, $zero, 2 + lu32i.d $t3, 3 + vreplgr2vr.d $vr1, $t3 + vadd.w $vr0, $vr0, $vr1 + vmin.w $vr0, $vr0, $vr2 + vpickve2gr.w $t3, $vr0, 0 slli.d $t3, $t3, 1 ldx.hu $t3, $t4, $t3 - addi.w $t5, $zero, -3 - slt $s0, $t5, $a7 - masknez $s1, $t5, $s0 - maskeqz $s0, $a7, $s0 - or $s0, $s0, $s1 - addi.d $s0, $s0, 3 - slt $s1, $s0, $a6 - maskeqz $s0, $s0, $s1 - masknez $s1, $a6, $s1 - or $s0, $s0, $s1 - slli.d $s0, $s0, 1 - ldx.hu $t4, $t4, $s0 + vpickve2gr.w $t7, $vr0, 1 + slli.d $t7, $t7, 1 + ldx.hu $t4, $t4, $t7 addi.d $a1, $a1, -1 sltu $a1, $zero, $a1 add.d $a1, $a7, $a1 @@ -1704,8 +1588,8 @@ Get_Reference_Pixel: # @Get_Reference_Pixel maskeqz $a1, $a1, $a7 masknez $a6, $a6, $a7 or $a1, $a1, $a6 - slt $a6, $fp, $a5 - masknez $a7, $fp, $a6 + slt $a6, $t6, $a5 + masknez $a7, $t6, $a6 maskeqz $a6, $a5, $a6 or $a6, $a6, $a7 addi.d $a6, $a6, -2 @@ -1713,73 +1597,70 @@ Get_Reference_Pixel: # @Get_Reference_Pixel maskeqz $a6, $a6, $a7 masknez $a7, $a4, $a7 or $a6, $a6, $a7 - slt $a7, $t8, $a5 - masknez $t8, $t8, $a7 - maskeqz $a7, $a5, $a7 - or $a7, $a7, $t8 - addi.d $a7, $a7, -1 - slt $t8, $a7, $a4 - maskeqz $a7, $a7, $t8 - masknez $t8, $a4, $t8 - or $a7, $a7, $t8 - slli.d $a7, $a7, 3 - ldx.d $a7, $a0, $a7 slli.d $a6, $a6, 3 - ldx.d $t8, $a0, $a6 + ldx.d $a7, $a0, $a6 + ori $t6, $zero, 1 + slt $a6, $t6, $a5 + masknez $t7, $t6, $a6 + maskeqz $a6, $a5, $a6 + or $a6, $a6, $t7 + addi.d $a6, $a6, -1 + slt $t7, $a6, $a4 + maskeqz $a6, $a6, $t7 + masknez $t7, $a4, $t7 + or $a6, $a6, $t7 + slli.d $a6, $a6, 3 + ldx.d $t7, $a0, $a6 + srai.d $a6, $a5, 63 + andn $a6, $a5, $a6 + slt $t8, $a6, $a4 + maskeqz $a6, $a6, $t8 + masknez $t8, $a4, $t8 + or $t8, $a6, $t8 slli.d $a6, $a1, 1 ldx.hu $a7, $a7, $a6 - srai.d $a1, $a5, 63 - andn $a1, $a5, $a1 - slt $fp, $a1, $a4 - maskeqz $a1, $a1, $fp - masknez $fp, $a4, $fp - or $a1, $a1, $fp - slli.d $a1, $a1, 3 - ldx.d $fp, $a0, $a1 - ldptr.w $a1, $a3, 15520 - ldx.hu $a3, $t8, $a6 - alsl.d $a7, $a7, $a7, 2 - ldx.hu $t8, $fp, $a6 - slt $fp, $t7, $a5 - masknez $t7, $t7, $fp - maskeqz $fp, $a5, $fp - or $t7, $fp, $t7 - addi.d $t7, $t7, 1 - slt $fp, $t7, $a4 - maskeqz $t7, $t7, $fp - masknez $fp, $a4, $fp - or $t7, $t7, $fp - slli.d $t7, $t7, 3 - ldx.d $t7, $a0, $t7 - sub.d $a3, $a3, $a7 - ori $a7, $zero, 20 - mul.d $t8, $t8, $a7 ldx.hu $t7, $t7, $a6 - slt $fp, $t6, $a5 - masknez $t6, $t6, $fp - maskeqz $fp, $a5, $fp - or $t6, $fp, $t6 - addi.d $t6, $t6, 2 - slt $fp, $t6, $a4 - maskeqz $t6, $t6, $fp - masknez $fp, $a4, $fp - or $t6, $t6, $fp + slli.d $a1, $t8, 3 + ldx.d $t8, $a0, $a1 + ldptr.w $a1, $a3, 15520 + alsl.d $a3, $t7, $t7, 2 + sub.d $a3, $a7, $a3 + ldx.hu $a7, $t8, $a6 + vinsgr2vr.w $vr0, $a5, 0 + vinsgr2vr.w $vr0, $a5, 1 + lu32i.d $t5, -2 + vreplgr2vr.d $vr1, $t5 + vmax.w $vr0, $vr0, $vr1 + lu32i.d $t6, 2 + vreplgr2vr.d $vr1, $t6 + vadd.w $vr0, $vr0, $vr1 + vinsgr2vr.w $vr1, $a4, 0 + vinsgr2vr.w $vr1, $a4, 1 + vmin.w $vr0, $vr0, $vr1 + vpickve2gr.w $t5, $vr0, 0 + slli.d $t5, $t5, 3 + ldx.d $t5, $a0, $t5 + addi.w $t6, $zero, -3 + slt $t7, $t6, $a5 + masknez $t6, $t6, $t7 + maskeqz $a5, $a5, $t7 + ori $t7, $zero, 20 + mul.d $a7, $a7, $t7 + ldx.hu $t5, $t5, $a6 + or $a5, $a5, $t6 + addi.d $a5, $a5, 3 + slt $t6, $a5, $a4 + maskeqz $a5, $a5, $t6 + masknez $a4, $a4, $t6 + vpickve2gr.w $t6, $vr0, 1 slli.d $t6, $t6, 3 ldx.d $t6, $a0, $t6 - slt $fp, $t5, $a5 - masknez $t5, $t5, $fp - maskeqz $a5, $a5, $fp - or $a5, $a5, $t5 - addi.d $a5, $a5, 3 - slt $t5, $a5, $a4 - maskeqz $a5, $a5, $t5 - masknez $a4, $a4, $t5 or $a4, $a5, $a4 slli.d $a4, $a4, 3 ldx.d $a0, $a0, $a4 - add.d $a3, $t8, $a3 + add.d $a3, $a7, $a3 ldx.hu $a4, $t6, $a6 - mul.d $a5, $t7, $a7 + mul.d $a5, $t5, $t7 ldx.hu $a0, $a0, $a6 add.d $a3, $a5, $a3 alsl.d $a4, $a4, $a4, 2 @@ -1787,9 +1668,9 @@ Get_Reference_Pixel: # @Get_Reference_Pixel add.d $a0, $a3, $a0 alsl.d $a3, $t0, $t0, 2 sub.d $a2, $a2, $a3 - mul.d $a3, $t1, $a7 + mul.d $a3, $t1, $t7 add.d $a2, $a3, $a2 - mul.d $a3, $t2, $a7 + mul.d $a3, $t2, $t7 add.d $a2, $a3, $a2 alsl.d $a3, $t3, $t3, 2 sub.d $a2, $a2, $a3 @@ -1814,40 +1695,39 @@ Get_Reference_Pixel: # @Get_Reference_Pixel maskeqz $a0, $a0, $a3 masknez $a1, $a1, $a3 or $a0, $a0, $a1 - add.d $a0, $a0, $a2 b .LBB3_23 .LBB3_19: - addi.w $a1, $fp, 16 + addi.w $a1, $a4, 16 + b .LBB3_22 .LBB3_20: - bstrpick.d $a2, $a1, 62, 58 - add.w $a1, $a1, $a2 - srai.d $a2, $a1, 5 - srai.d $a1, $a1, 63 - andn $a1, $a2, $a1 - slt $a2, $a1, $a0 - maskeqz $a1, $a1, $a2 - masknez $a0, $a0, $a2 - or $a0, $a1, $a0 - add.d $a0, $a0, $a3 - b .LBB3_23 + ldx.d $a0, $a0, $a4 + ldx.hu $a0, $a0, $a1 + add.d $a0, $a2, $a0 + b .LBB3_24 .LBB3_21: - ldx.d $a0, $a0, $a7 - ldx.hu $a0, $a0, $a2 + addi.w $a1, $t6, 16 .LBB3_22: - add.d $a0, $a3, $a0 + bstrpick.d $a3, $a1, 62, 58 + add.w $a1, $a1, $a3 + srai.d $a3, $a1, 5 + srai.d $a1, $a1, 63 + andn $a1, $a3, $a1 + slt $a3, $a1, $a0 + maskeqz $a1, $a1, $a3 + masknez $a0, $a0, $a3 + or $a0, $a1, $a0 .LBB3_23: + add.d $a0, $a0, $a2 +.LBB3_24: bstrpick.d $a1, $a0, 31, 31 add.w $a0, $a0, $a1 - srai.d $a3, $a0, 1 -.LBB3_24: - andi $a0, $a3, 255 - ld.d $s4, $sp, 0 # 8-byte Folded Reload - ld.d $s3, $sp, 8 # 8-byte Folded Reload - ld.d $s2, $sp, 16 # 8-byte Folded Reload - ld.d $s1, $sp, 24 # 8-byte Folded Reload - ld.d $s0, $sp, 32 # 8-byte Folded Reload - ld.d $fp, $sp, 40 # 8-byte Folded Reload - addi.d $sp, $sp, 48 + srai.d $a2, $a0, 1 +.LBB3_25: + andi $a0, $a2, 255 + ld.d $s1, $sp, 8 # 8-byte Folded Reload + ld.d $s0, $sp, 16 # 8-byte Folded Reload + ld.d $fp, $sp, 24 # 8-byte Folded Reload + addi.d $sp, $sp, 32 ret .Lfunc_end3: .size Get_Reference_Pixel, .Lfunc_end3-Get_Reference_Pixel diff --git a/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/img_luma.s b/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/img_luma.s index 0c1f66de..06742625 100644 --- a/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/img_luma.s +++ b/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/img_luma.s @@ -1063,18 +1063,18 @@ getHorSubImageSixTap: # @getHorSubImageSixTap .type getVerSubImageSixTap,@function getVerSubImageSixTap: # @getVerSubImageSixTap # %bb.0: - addi.d $sp, $sp, -160 - st.d $ra, $sp, 152 # 8-byte Folded Spill - st.d $fp, $sp, 144 # 8-byte Folded Spill - st.d $s0, $sp, 136 # 8-byte Folded Spill - st.d $s1, $sp, 128 # 8-byte Folded Spill - st.d $s2, $sp, 120 # 8-byte Folded Spill - st.d $s3, $sp, 112 # 8-byte Folded Spill - st.d $s4, $sp, 104 # 8-byte Folded Spill - st.d $s5, $sp, 96 # 8-byte Folded Spill - st.d $s6, $sp, 88 # 8-byte Folded Spill - st.d $s7, $sp, 80 # 8-byte Folded Spill - st.d $s8, $sp, 72 # 8-byte Folded Spill + addi.d $sp, $sp, -144 + st.d $ra, $sp, 136 # 8-byte Folded Spill + st.d $fp, $sp, 128 # 8-byte Folded Spill + st.d $s0, $sp, 120 # 8-byte Folded Spill + st.d $s1, $sp, 112 # 8-byte Folded Spill + st.d $s2, $sp, 104 # 8-byte Folded Spill + st.d $s3, $sp, 96 # 8-byte Folded Spill + st.d $s4, $sp, 88 # 8-byte Folded Spill + st.d $s5, $sp, 80 # 8-byte Folded Spill + st.d $s6, $sp, 72 # 8-byte Folded Spill + st.d $s7, $sp, 64 # 8-byte Folded Spill + st.d $s8, $sp, 56 # 8-byte Folded Spill ldptr.d $a6, $a0, 6448 slli.d $a1, $a1, 3 ldx.d $a1, $a6, $a1 @@ -1083,29 +1083,29 @@ getVerSubImageSixTap: # @getVerSubImageSixTap slli.d $a2, $a2, 3 ldx.d $ra, $a1, $a2 addi.w $a2, $t0, 40 - addi.w $s6, $a0, 40 + addi.w $s8, $a0, 40 addi.w $a7, $t0, 39 - st.d $ra, $sp, 56 # 8-byte Folded Spill - st.d $s6, $sp, 64 # 8-byte Folded Spill + st.d $ra, $sp, 40 # 8-byte Folded Spill + st.d $s8, $sp, 48 # 8-byte Folded Spill beqz $a5, .LBB2_4 # %bb.1: # %.preheader238 pcalau12i $s5, %pc_hi20(imgY_sub_tmp) pcalau12i $a1, %got_pc_hi20(img) - ld.d $s7, $a1, %got_pc_lo12(img) + ld.d $s6, $a1, %got_pc_lo12(img) ori $a1, $zero, 1 - slt $a3, $a1, $s6 + slt $a3, $a1, $s8 masknez $a1, $a1, $a3 - maskeqz $a3, $s6, $a3 + maskeqz $a3, $s8, $a3 or $t5, $a3, $a1 addi.w $a1, $zero, -40 lu12i.w $s4, 3 bstrpick.d $a3, $t5, 31, 0 vrepli.w $vr0, 20 vrepli.w $vr1, -5 - bge $a1, $a0, .LBB2_59 + bge $a1, $a0, .LBB2_57 # %bb.2: # %.lr.ph ld.d $t2, $s5, %pc_lo12(imgY_sub_tmp) - ld.d $t3, $s7, 0 + ld.d $t3, $s6, 0 ld.d $fp, $ra, 0 ld.d $t7, $t2, 0 ld.d $t8, $t2, 8 @@ -1114,10 +1114,10 @@ getVerSubImageSixTap: # @getVerSubImageSixTap ori $t4, $s4, 3232 ldx.w $t6, $t3, $t4 ori $a1, $zero, 4 - bge $s6, $a1, .LBB2_47 + bge $s8, $a1, .LBB2_45 # %bb.3: move $s2, $zero - b .LBB2_50 + b .LBB2_48 .LBB2_4: # %.preheader231 slli.d $a1, $a3, 3 ldx.d $a1, $a6, $a1 @@ -1126,41 +1126,53 @@ getVerSubImageSixTap: # @getVerSubImageSixTap pcalau12i $a1, %got_pc_hi20(img) ld.d $s1, $a1, %got_pc_lo12(img) ori $a1, $zero, 1 - slt $a4, $a1, $s6 + slt $a4, $a1, $s8 masknez $a1, $a1, $a4 - maskeqz $a4, $s6, $a4 + maskeqz $a4, $s8, $a4 or $t3, $a4, $a1 addi.w $a1, $zero, -40 bstrpick.d $s5, $t3, 31, 0 vrepli.b $vr0, 0 - bge $a1, $a0, .LBB2_13 + bge $a1, $a0, .LBB2_14 # %bb.5: # %.lr.ph250 ld.d $t1, $s1, 0 - ld.d $t7, $ra, 0 - ld.d $t5, $a3, 0 - ld.d $t6, $a3, 8 - ld.d $t8, $a3, 16 - ld.d $fp, $a3, 24 + ld.d $t5, $ra, 0 + vld $vr1, $a3, 16 + vld $vr2, $a3, 0 lu12i.w $a1, 3 ori $t2, $a1, 3232 ldx.w $t4, $t1, $t2 ori $a1, $zero, 8 - bge $s6, $a1, .LBB2_82 -# %bb.6: - move $s0, $zero -.LBB2_7: # %scalar.ph371.preheader - alsl.d $a4, $s0, $t7, 1 - alsl.d $a5, $s0, $fp, 1 - alsl.d $t7, $s0, $t8, 1 - alsl.d $t6, $s0, $t6, 1 - alsl.d $t5, $s0, $t5, 1 - sub.d $t8, $s5, $s0 - .p2align 4, , 16 -.LBB2_8: # %scalar.ph371 + blt $s8, $a1, .LBB2_7 +# %bb.6: # %vector.memcheck + vreplgr2vr.d $vr3, $t5 + vsub.d $vr4, $vr3, $vr1 + vsub.d $vr3, $vr3, $vr2 + vslti.du $vr3, $vr3, 16 + vslti.du $vr4, $vr4, 16 + vpickev.w $vr3, $vr4, $vr3 + vmskltz.w $vr3, $vr3 + vpickve2gr.hu $a1, $vr3, 0 + beqz $a1, .LBB2_85 +.LBB2_7: + move $t6, $zero +.LBB2_8: # %scalar.ph371.preheader + vpickve2gr.d $a1, $vr2, 0 + vpickve2gr.d $a6, $vr2, 1 + vpickve2gr.d $t7, $vr1, 0 + vpickve2gr.d $a5, $vr1, 1 + alsl.d $a4, $t6, $t5, 1 + alsl.d $a5, $t6, $a5, 1 + alsl.d $t5, $t6, $t7, 1 + alsl.d $t7, $t6, $a6, 1 + alsl.d $t8, $t6, $a1, 1 + sub.d $t6, $s5, $t6 + .p2align 4, , 16 +.LBB2_9: # %scalar.ph371 # =>This Inner Loop Header: Depth=1 - ld.hu $a1, $t5, 0 - ld.hu $a6, $t6, 0 - ld.hu $fp, $t7, 0 + ld.hu $a1, $t8, 0 + ld.hu $a6, $t7, 0 + ld.hu $fp, $t5, 0 add.d $a6, $a6, $a1 slli.d $s0, $a6, 4 alsl.d $a6, $a6, $s0, 2 @@ -1181,46 +1193,48 @@ getVerSubImageSixTap: # @getVerSubImageSixTap st.h $a1, $a4, 0 addi.d $a4, $a4, 2 addi.d $a5, $a5, 2 - addi.d $t7, $t7, 2 - addi.d $t6, $t6, 2 - addi.d $t8, $t8, -1 addi.d $t5, $t5, 2 - bnez $t8, .LBB2_8 -.LBB2_9: # %.lr.ph250.1 - ld.d $t6, $ra, 8 - ld.d $t4, $a3, 8 - ld.d $t5, $a3, 16 - ld.d $t7, $a3, 0 - ld.d $t8, $a3, 24 - ld.d $fp, $a3, 32 + addi.d $t7, $t7, 2 + addi.d $t6, $t6, -1 + addi.d $t8, $t8, 2 + bnez $t6, .LBB2_9 +.LBB2_10: # %.lr.ph250.1 + ld.d $t4, $ra, 8 + vld $vr1, $a3, 16 + vld $vr2, $a3, 0 + ld.d $t5, $a3, 32 ldx.w $t1, $t1, $t2 ori $a1, $zero, 8 - bge $s6, $a1, .LBB2_89 -# %bb.10: + bge $s8, $a1, .LBB2_80 +# %bb.11: move $t2, $zero -.LBB2_11: # %scalar.ph398.preheader +.LBB2_12: # %scalar.ph398.preheader + vpickve2gr.d $a1, $vr2, 1 + vpickve2gr.d $a6, $vr1, 0 + vpickve2gr.d $t6, $vr2, 0 + vpickve2gr.d $t7, $vr1, 1 sub.d $a4, $s5, $t2 - alsl.d $a5, $t2, $t6, 1 - alsl.d $t6, $t2, $fp, 1 - alsl.d $t8, $t2, $t8, 1 - alsl.d $t7, $t2, $t7, 1 - alsl.d $t5, $t2, $t5, 1 - alsl.d $t2, $t2, $t4, 1 - .p2align 4, , 16 -.LBB2_12: # %scalar.ph398 + alsl.d $a5, $t2, $t4, 1 + alsl.d $t4, $t2, $t5, 1 + alsl.d $t5, $t2, $t7, 1 + alsl.d $t6, $t2, $t6, 1 + alsl.d $t7, $t2, $a6, 1 + alsl.d $t2, $t2, $a1, 1 + .p2align 4, , 16 +.LBB2_13: # %scalar.ph398 # =>This Inner Loop Header: Depth=1 ld.hu $a1, $t2, 0 - ld.hu $a6, $t5, 0 - ld.hu $t4, $t7, 0 - ld.hu $fp, $t8, 0 + ld.hu $a6, $t7, 0 + ld.hu $t8, $t6, 0 + ld.hu $fp, $t5, 0 add.d $a1, $a6, $a1 slli.d $a6, $a1, 4 alsl.d $a1, $a1, $a6, 2 - add.d $a6, $fp, $t4 - ld.hu $fp, $t6, 0 + add.d $a6, $fp, $t8 + ld.hu $fp, $t4, 0 alsl.d $a6, $a6, $a6, 2 sub.d $a1, $a1, $a6 - add.d $a1, $a1, $t4 + add.d $a1, $a1, $t8 add.d $a1, $a1, $fp addi.w $a1, $a1, 16 srai.d $a6, $a1, 5 @@ -1233,93 +1247,90 @@ getVerSubImageSixTap: # @getVerSubImageSixTap st.h $a1, $a5, 0 addi.d $a4, $a4, -1 addi.d $a5, $a5, 2 + addi.d $t4, $t4, 2 + addi.d $t5, $t5, 2 addi.d $t6, $t6, 2 - addi.d $t8, $t8, 2 addi.d $t7, $t7, 2 - addi.d $t5, $t5, 2 addi.d $t2, $t2, 2 - bnez $a4, .LBB2_12 -.LBB2_13: # %.loopexit230.1 + bnez $a4, .LBB2_13 +.LBB2_14: # %.loopexit230.1 ld.d $t1, $s1, 0 addi.w $t2, $t0, 37 addi.w $a1, $zero, -34 bstrpick.d $a5, $t3, 30, 3 - st.d $s5, $sp, 48 # 8-byte Folded Spill - blt $t0, $a1, .LBB2_31 -# %bb.14: # %.lr.ph255 - st.d $s1, $sp, 24 # 8-byte Folded Spill + blt $t0, $a1, .LBB2_29 +# %bb.15: # %.lr.ph255 + st.d $s1, $sp, 32 # 8-byte Folded Spill + move $fp, $s5 ori $a1, $zero, 3 slt $a4, $a1, $t2 masknez $a1, $a1, $a4 maskeqz $a4, $t2, $a4 or $t0, $a4, $a1 - st.d $a5, $sp, 16 # 8-byte Folded Spill - slli.d $a1, $a5, 3 - st.d $a1, $sp, 40 # 8-byte Folded Spill - ori $t6, $zero, 2 - addi.w $t5, $zero, -39 + st.d $a5, $sp, 24 # 8-byte Folded Spill + slli.d $t3, $a5, 3 + ori $t5, $zero, 2 + addi.w $t6, $zero, -39 lu12i.w $a1, 3 - ori $s7, $a1, 3232 - ori $s8, $zero, 8 - ori $fp, $zero, 16 + ori $t7, $a1, 3232 + ori $t8, $zero, 8 vrepli.w $vr1, 20 vrepli.w $vr2, -5 - st.d $s7, $sp, 32 # 8-byte Folded Spill - b .LBB2_16 + b .LBB2_17 .p2align 4, , 16 -.LBB2_15: # %.loopexit228 - # in Loop: Header=BB2_16 Depth=1 - beq $t6, $t0, .LBB2_30 -.LBB2_16: # =>This Loop Header: Depth=1 - # Child Loop BB2_28 Depth 2 - # Child Loop BB2_20 Depth 2 - move $a1, $t6 - addi.d $t6, $t6, 1 - blt $a0, $t5, .LBB2_15 -# %bb.17: # %.lr.ph253 - # in Loop: Header=BB2_16 Depth=1 - slli.d $a5, $a1, 3 - ldx.d $a4, $ra, $a5 - alsl.d $a1, $a1, $a3, 3 - ldx.d $s1, $a3, $a5 - slli.d $a5, $t6, 3 - ldx.d $s2, $a3, $a5 - ld.d $a5, $a1, -8 - ld.d $t3, $a1, 16 - ld.d $t8, $a1, -16 - ld.d $t4, $a1, 24 - ldx.w $s0, $t1, $s7 - bge $s6, $s8, .LBB2_21 -# %bb.18: # in Loop: Header=BB2_16 Depth=1 +.LBB2_16: # %.loopexit228 + # in Loop: Header=BB2_17 Depth=1 + addi.d $t5, $t5, 1 + beq $t5, $t0, .LBB2_28 +.LBB2_17: # =>This Loop Header: Depth=1 + # Child Loop BB2_26 Depth 2 + # Child Loop BB2_21 Depth 2 + blt $a0, $t6, .LBB2_16 +# %bb.18: # %.lr.ph253 + # in Loop: Header=BB2_17 Depth=1 + slli.d $a1, $t5, 3 + ldx.d $a5, $ra, $a1 + alsl.d $a1, $t5, $a3, 3 + vld $vr3, $a1, 8 + vld $vr4, $a1, -8 + ld.d $s2, $a1, -16 + ld.d $s3, $a1, 24 + ldx.w $s0, $t1, $t7 + bge $s8, $t8, .LBB2_22 +# %bb.19: # in Loop: Header=BB2_17 Depth=1 move $a1, $zero -.LBB2_19: # %scalar.ph428.preheader - # in Loop: Header=BB2_16 Depth=1 - alsl.d $a4, $a1, $a4, 1 - alsl.d $t4, $a1, $t4, 1 - alsl.d $t7, $a1, $t8, 1 - alsl.d $t3, $a1, $t3, 1 - alsl.d $a5, $a1, $a5, 1 - alsl.d $t8, $a1, $s2, 1 +.LBB2_20: # %scalar.ph428.preheader + # in Loop: Header=BB2_17 Depth=1 + vpickve2gr.d $a6, $vr4, 1 + vpickve2gr.d $s4, $vr3, 0 + vpickve2gr.d $s5, $vr4, 0 + vpickve2gr.d $s1, $vr3, 1 + alsl.d $a4, $a1, $a5, 1 + alsl.d $a5, $a1, $s3, 1 + alsl.d $t4, $a1, $s2, 1 alsl.d $s1, $a1, $s1, 1 - sub.d $s2, $s5, $a1 + alsl.d $s2, $a1, $s5, 1 + alsl.d $s3, $a1, $s4, 1 + alsl.d $s4, $a1, $a6, 1 + sub.d $s5, $fp, $a1 .p2align 4, , 16 -.LBB2_20: # %scalar.ph428 - # Parent Loop BB2_16 Depth=1 +.LBB2_21: # %scalar.ph428 + # Parent Loop BB2_17 Depth=1 # => This Inner Loop Header: Depth=2 - ld.hu $a1, $s1, 0 - ld.hu $a6, $t8, 0 - ld.hu $s3, $a5, 0 - ld.hu $s4, $t3, 0 + ld.hu $a1, $s4, 0 + ld.hu $a6, $s3, 0 + ld.hu $s6, $s2, 0 + ld.hu $s7, $s1, 0 add.d $a1, $a6, $a1 slli.d $a6, $a1, 4 alsl.d $a1, $a1, $a6, 2 - add.d $a6, $s4, $s3 - ld.hu $s3, $t7, 0 - ld.hu $s4, $t4, 0 + add.d $a6, $s7, $s6 + ld.hu $s6, $t4, 0 + ld.hu $s7, $a5, 0 alsl.d $a6, $a6, $a6, 2 sub.d $a1, $a1, $a6 - add.d $a1, $a1, $s3 - add.d $a1, $a1, $s4 + add.d $a1, $a1, $s6 + add.d $a1, $a1, $s7 addi.w $a1, $a1, 16 srai.d $a6, $a1, 5 srai.d $a1, $a1, 63 @@ -1330,177 +1341,170 @@ getVerSubImageSixTap: # @getVerSubImageSixTap or $a1, $a1, $a6 st.h $a1, $a4, 0 addi.d $a4, $a4, 2 - addi.d $t4, $t4, 2 - addi.d $t7, $t7, 2 - addi.d $t3, $t3, 2 addi.d $a5, $a5, 2 - addi.d $t8, $t8, 2 - addi.d $s2, $s2, -1 + addi.d $t4, $t4, 2 addi.d $s1, $s1, 2 - bnez $s2, .LBB2_20 - b .LBB2_15 + addi.d $s2, $s2, 2 + addi.d $s3, $s3, 2 + addi.d $s5, $s5, -1 + addi.d $s4, $s4, 2 + bnez $s5, .LBB2_21 + b .LBB2_16 .p2align 4, , 16 -.LBB2_21: # %vector.memcheck416 - # in Loop: Header=BB2_16 Depth=1 - sub.d $a6, $a4, $s1 +.LBB2_22: # %vector.memcheck416 + # in Loop: Header=BB2_17 Depth=1 + vreplgr2vr.d $vr5, $a5 + vsub.d $vr6, $vr5, $vr3 + vsub.d $vr5, $vr5, $vr4 + vslti.du $vr5, $vr5, 16 + vslti.du $vr6, $vr6, 16 + vpickev.w $vr5, $vr6, $vr5 + vmskltz.w $vr5, $vr5 + vpickve2gr.hu $a4, $vr5, 0 move $a1, $zero - bltu $a6, $fp, .LBB2_19 -# %bb.22: # %vector.memcheck416 - # in Loop: Header=BB2_16 Depth=1 - sub.d $a6, $a4, $s2 - bltu $a6, $fp, .LBB2_19 + bnez $a4, .LBB2_20 # %bb.23: # %vector.memcheck416 - # in Loop: Header=BB2_16 Depth=1 - sub.d $a6, $a4, $a5 - bltu $a6, $fp, .LBB2_19 + # in Loop: Header=BB2_17 Depth=1 + sub.d $a4, $a5, $s2 + ori $a6, $zero, 16 + bltu $a4, $a6, .LBB2_20 # %bb.24: # %vector.memcheck416 - # in Loop: Header=BB2_16 Depth=1 - sub.d $a6, $a4, $t3 - bltu $a6, $fp, .LBB2_19 -# %bb.25: # %vector.memcheck416 - # in Loop: Header=BB2_16 Depth=1 - sub.d $a6, $a4, $t8 - bltu $a6, $fp, .LBB2_19 -# %bb.26: # %vector.memcheck416 - # in Loop: Header=BB2_16 Depth=1 - sub.d $a6, $a4, $t4 - bltu $a6, $fp, .LBB2_19 -# %bb.27: # %vector.ph430 - # in Loop: Header=BB2_16 Depth=1 - vreplgr2vr.w $vr3, $s0 - move $s8, $s1 - move $ra, $s2 - move $s4, $a5 - move $s5, $t3 - move $s6, $t8 - move $s7, $t4 - move $s3, $a4 - ld.d $t7, $sp, 40 # 8-byte Folded Reload + # in Loop: Header=BB2_17 Depth=1 + sub.d $a4, $a5, $s3 + bltu $a4, $a6, .LBB2_20 +# %bb.25: # %vector.ph430 + # in Loop: Header=BB2_17 Depth=1 + vpickve2gr.d $s4, $vr4, 1 + vpickve2gr.d $s5, $vr3, 0 + vpickve2gr.d $s6, $vr4, 0 + vpickve2gr.d $s7, $vr3, 1 + vreplgr2vr.w $vr5, $s0 + move $s8, $s2 + move $ra, $s3 + move $s1, $a5 + move $t4, $t3 .p2align 4, , 16 -.LBB2_28: # %vector.body435 - # Parent Loop BB2_16 Depth=1 +.LBB2_26: # %vector.body435 + # Parent Loop BB2_17 Depth=1 # => This Inner Loop Header: Depth=2 - vld $vr4, $s8, 0 - vld $vr5, $ra, 0 - vilvh.h $vr6, $vr0, $vr4 - vilvl.h $vr4, $vr0, $vr4 - vilvh.h $vr7, $vr0, $vr5 - vilvl.h $vr5, $vr0, $vr5 - vadd.w $vr4, $vr5, $vr4 - vadd.w $vr5, $vr7, $vr6 vld $vr6, $s4, 0 - vmul.w $vr5, $vr5, $vr1 vld $vr7, $s5, 0 - vmul.w $vr4, $vr4, $vr1 - vilvl.h $vr8, $vr0, $vr6 - vilvh.h $vr6, $vr0, $vr6 - vilvl.h $vr9, $vr0, $vr7 - vilvh.h $vr7, $vr0, $vr7 - vld $vr10, $s6, 0 + vilvh.h $vr8, $vr0, $vr6 + vilvl.h $vr6, $vr0, $vr6 + vilvh.h $vr9, $vr0, $vr7 + vilvl.h $vr7, $vr0, $vr7 vadd.w $vr6, $vr7, $vr6 - vld $vr7, $s7, 0 + vadd.w $vr7, $vr9, $vr8 + vld $vr8, $s6, 0 + vmul.w $vr7, $vr7, $vr1 + vld $vr9, $s7, 0 + vmul.w $vr6, $vr6, $vr1 + vilvl.h $vr10, $vr0, $vr8 + vilvh.h $vr8, $vr0, $vr8 + vilvl.h $vr11, $vr0, $vr9 + vilvh.h $vr9, $vr0, $vr9 + vld $vr12, $s8, 0 vadd.w $vr8, $vr9, $vr8 - vilvl.h $vr9, $vr0, $vr10 - vilvh.h $vr10, $vr0, $vr10 - vilvh.h $vr11, $vr0, $vr7 - vilvl.h $vr7, $vr0, $vr7 - vmadd.w $vr4, $vr8, $vr2 - vmadd.w $vr5, $vr6, $vr2 - vadd.w $vr5, $vr5, $vr10 - vadd.w $vr4, $vr4, $vr9 - vadd.w $vr4, $vr4, $vr7 - vadd.w $vr5, $vr5, $vr11 - vaddi.wu $vr5, $vr5, 16 - vaddi.wu $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 5 - vsrai.w $vr5, $vr5, 5 - vmaxi.w $vr5, $vr5, 0 - vmaxi.w $vr4, $vr4, 0 - vmin.w $vr4, $vr4, $vr3 - vmin.w $vr5, $vr5, $vr3 - vpickev.h $vr4, $vr5, $vr4 - vst $vr4, $s3, 0 - addi.d $t7, $t7, -8 - addi.d $s3, $s3, 16 + vld $vr9, $ra, 0 + vadd.w $vr10, $vr11, $vr10 + vilvl.h $vr11, $vr0, $vr12 + vilvh.h $vr12, $vr0, $vr12 + vilvh.h $vr13, $vr0, $vr9 + vilvl.h $vr9, $vr0, $vr9 + vmadd.w $vr6, $vr10, $vr2 + vmadd.w $vr7, $vr8, $vr2 + vadd.w $vr7, $vr7, $vr12 + vadd.w $vr6, $vr6, $vr11 + vadd.w $vr6, $vr6, $vr9 + vadd.w $vr7, $vr7, $vr13 + vaddi.wu $vr7, $vr7, 16 + vaddi.wu $vr6, $vr6, 16 + vsrai.w $vr6, $vr6, 5 + vsrai.w $vr7, $vr7, 5 + vmaxi.w $vr7, $vr7, 0 + vmaxi.w $vr6, $vr6, 0 + vmin.w $vr6, $vr6, $vr5 + vmin.w $vr7, $vr7, $vr5 + vpickev.h $vr6, $vr7, $vr6 + vst $vr6, $s1, 0 + addi.d $t4, $t4, -8 + addi.d $s1, $s1, 16 + addi.d $ra, $ra, 16 + addi.d $s8, $s8, 16 addi.d $s7, $s7, 16 addi.d $s6, $s6, 16 addi.d $s5, $s5, 16 addi.d $s4, $s4, 16 - addi.d $ra, $ra, 16 - addi.d $s8, $s8, 16 - bnez $t7, .LBB2_28 -# %bb.29: # %middle.block444 - # in Loop: Header=BB2_16 Depth=1 - ld.d $a6, $sp, 40 # 8-byte Folded Reload - move $a1, $a6 - ld.d $ra, $sp, 56 # 8-byte Folded Reload - ld.d $s6, $sp, 64 # 8-byte Folded Reload - ld.d $s5, $sp, 48 # 8-byte Folded Reload - ld.d $s7, $sp, 32 # 8-byte Folded Reload - ori $s8, $zero, 8 - beq $a6, $s5, .LBB2_15 - b .LBB2_19 -.LBB2_30: # %.preheader.loopexit - ld.d $a1, $sp, 24 # 8-byte Folded Reload + bnez $t4, .LBB2_26 +# %bb.27: # %middle.block444 + # in Loop: Header=BB2_17 Depth=1 + move $a1, $t3 + ld.d $ra, $sp, 40 # 8-byte Folded Reload + ld.d $s8, $sp, 48 # 8-byte Folded Reload + beq $t3, $fp, .LBB2_16 + b .LBB2_20 +.LBB2_28: # %.preheader.loopexit + ld.d $a1, $sp, 32 # 8-byte Folded Reload ld.d $t1, $a1, 0 - ld.d $a5, $sp, 16 # 8-byte Folded Reload -.LBB2_31: # %.preheader - slli.d $t4, $a5, 3 + move $s5, $fp + ld.d $a5, $sp, 24 # 8-byte Folded Reload +.LBB2_29: # %.preheader + slli.d $a4, $a5, 3 addi.w $t0, $zero, -39 lu12i.w $a1, 3 ori $t3, $a1, 3232 - ori $s7, $zero, 8 + ori $s6, $zero, 8 ori $t5, $zero, 16 vrepli.w $vr1, 20 vrepli.w $vr2, -5 - b .LBB2_33 - .p2align 4, , 16 -.LBB2_32: # %.loopexit - # in Loop: Header=BB2_33 Depth=1 - bge $t2, $a2, .LBB2_81 -.LBB2_33: # =>This Loop Header: Depth=1 - # Child Loop BB2_45 Depth 2 - # Child Loop BB2_37 Depth 2 + b .LBB2_31 + .p2align 4, , 16 +.LBB2_30: # %.loopexit + # in Loop: Header=BB2_31 Depth=1 + bge $t2, $a2, .LBB2_79 +.LBB2_31: # =>This Loop Header: Depth=1 + # Child Loop BB2_43 Depth 2 + # Child Loop BB2_35 Depth 2 move $a1, $t2 addi.d $t2, $t2, 1 - blt $a0, $t0, .LBB2_32 -# %bb.34: # %.lr.ph258 - # in Loop: Header=BB2_33 Depth=1 - slli.d $a4, $a1, 3 - ldx.d $t7, $ra, $a4 - addi.w $a5, $a1, 3 - slt $a6, $a7, $a5 - masknez $a5, $a5, $a6 - maskeqz $a6, $a7, $a6 - or $a5, $a6, $a5 - addi.w $a6, $a1, 2 + blt $a0, $t0, .LBB2_30 +# %bb.32: # %.lr.ph258 + # in Loop: Header=BB2_31 Depth=1 + slli.d $a5, $a1, 3 + ldx.d $t4, $ra, $a5 + addi.w $a6, $a1, 3 slt $t6, $a7, $a6 masknez $a6, $a6, $t6 maskeqz $t6, $a7, $t6 or $a6, $t6, $a6 - addi.w $t6, $t2, 0 - slt $t8, $a7, $t6 - masknez $t6, $t6, $t8 + addi.w $t6, $a1, 2 + slt $t7, $a7, $t6 + masknez $t6, $t6, $t7 + maskeqz $t7, $a7, $t7 + or $t6, $t7, $t6 + addi.w $t7, $t2, 0 + slt $t8, $a7, $t7 + masknez $t7, $t7, $t8 maskeqz $t8, $a7, $t8 - or $t6, $t8, $t6 + or $t7, $t8, $t7 alsl.d $a1, $a1, $a3, 3 - ldx.d $t8, $a3, $a4 - slli.d $a4, $t6, 3 - ldx.d $fp, $a3, $a4 + ldx.d $t8, $a3, $a5 + slli.d $a5, $t7, 3 + ldx.d $fp, $a3, $a5 ld.d $s0, $a1, -8 - slli.d $a4, $a6, 3 - ldx.d $s1, $a3, $a4 + slli.d $a5, $t6, 3 + ldx.d $s1, $a3, $a5 ld.d $s2, $a1, -16 - slli.d $a1, $a5, 3 + slli.d $a1, $a6, 3 ldx.d $s3, $a3, $a1 ldx.w $t6, $t1, $t3 - bge $s6, $s7, .LBB2_38 -# %bb.35: # in Loop: Header=BB2_33 Depth=1 + bge $s8, $s6, .LBB2_36 +# %bb.33: # in Loop: Header=BB2_31 Depth=1 move $a1, $zero -.LBB2_36: # %scalar.ph459.preheader - # in Loop: Header=BB2_33 Depth=1 - alsl.d $a4, $a1, $t7, 1 - alsl.d $a5, $a1, $s3, 1 +.LBB2_34: # %scalar.ph459.preheader + # in Loop: Header=BB2_31 Depth=1 + alsl.d $a5, $a1, $t4, 1 + alsl.d $t4, $a1, $s3, 1 alsl.d $t7, $a1, $s2, 1 alsl.d $s1, $a1, $s1, 1 alsl.d $s0, $a1, $s0, 1 @@ -1508,8 +1512,8 @@ getVerSubImageSixTap: # @getVerSubImageSixTap alsl.d $t8, $a1, $t8, 1 sub.d $s2, $s5, $a1 .p2align 4, , 16 -.LBB2_37: # %scalar.ph459 - # Parent Loop BB2_33 Depth=1 +.LBB2_35: # %scalar.ph459 + # Parent Loop BB2_31 Depth=1 # => This Inner Loop Header: Depth=2 ld.hu $a1, $t8, 0 ld.hu $a6, $fp, 0 @@ -1520,7 +1524,7 @@ getVerSubImageSixTap: # @getVerSubImageSixTap alsl.d $a1, $a1, $a6, 2 add.d $a6, $s4, $s3 ld.hu $s3, $t7, 0 - ld.hu $s4, $a5, 0 + ld.hu $s4, $t4, 0 alsl.d $a6, $a6, $a6, 2 sub.d $a1, $a1, $a6 add.d $a1, $a1, $s3 @@ -1533,45 +1537,46 @@ getVerSubImageSixTap: # @getVerSubImageSixTap maskeqz $a1, $a1, $a6 masknez $a6, $t6, $a6 or $a1, $a1, $a6 - st.h $a1, $a4, 0 - addi.d $a4, $a4, 2 + st.h $a1, $a5, 0 addi.d $a5, $a5, 2 + addi.d $t4, $t4, 2 addi.d $t7, $t7, 2 addi.d $s1, $s1, 2 addi.d $s0, $s0, 2 addi.d $fp, $fp, 2 addi.d $s2, $s2, -1 addi.d $t8, $t8, 2 - bnez $s2, .LBB2_37 - b .LBB2_32 + bnez $s2, .LBB2_35 + b .LBB2_30 .p2align 4, , 16 -.LBB2_38: # %vector.memcheck447 - # in Loop: Header=BB2_33 Depth=1 - sub.d $a4, $t7, $t8 +.LBB2_36: # %vector.memcheck447 + # in Loop: Header=BB2_31 Depth=1 + sub.d $a5, $t4, $t8 move $a1, $zero - bltu $a4, $t5, .LBB2_36 + bltu $a5, $t5, .LBB2_34 +# %bb.37: # %vector.memcheck447 + # in Loop: Header=BB2_31 Depth=1 + sub.d $a5, $t4, $fp + bltu $a5, $t5, .LBB2_34 +# %bb.38: # %vector.memcheck447 + # in Loop: Header=BB2_31 Depth=1 + sub.d $a5, $t4, $s0 + bltu $a5, $t5, .LBB2_34 # %bb.39: # %vector.memcheck447 - # in Loop: Header=BB2_33 Depth=1 - sub.d $a4, $t7, $fp - bltu $a4, $t5, .LBB2_36 + # in Loop: Header=BB2_31 Depth=1 + sub.d $a5, $t4, $s1 + bltu $a5, $t5, .LBB2_34 # %bb.40: # %vector.memcheck447 - # in Loop: Header=BB2_33 Depth=1 - sub.d $a4, $t7, $s0 - bltu $a4, $t5, .LBB2_36 + # in Loop: Header=BB2_31 Depth=1 + sub.d $a5, $t4, $s2 + bltu $a5, $t5, .LBB2_34 # %bb.41: # %vector.memcheck447 - # in Loop: Header=BB2_33 Depth=1 - sub.d $a4, $t7, $s1 - bltu $a4, $t5, .LBB2_36 -# %bb.42: # %vector.memcheck447 - # in Loop: Header=BB2_33 Depth=1 - sub.d $a4, $t7, $s2 - bltu $a4, $t5, .LBB2_36 -# %bb.43: # %vector.memcheck447 - # in Loop: Header=BB2_33 Depth=1 - sub.d $a4, $t7, $s3 - bltu $a4, $t5, .LBB2_36 -# %bb.44: # %vector.ph461 - # in Loop: Header=BB2_33 Depth=1 + # in Loop: Header=BB2_31 Depth=1 + sub.d $a5, $t4, $s3 + bltu $a5, $t5, .LBB2_34 +# %bb.42: # %vector.ph461 + # in Loop: Header=BB2_31 Depth=1 + move $a6, $s5 vreplgr2vr.w $vr3, $t6 move $s4, $t8 move $s5, $fp @@ -1579,11 +1584,11 @@ getVerSubImageSixTap: # @getVerSubImageSixTap move $s7, $s1 move $s8, $s2 move $ra, $s3 - move $a4, $t7 - move $a5, $t4 + move $t7, $t4 + move $a5, $a4 .p2align 4, , 16 -.LBB2_45: # %vector.body466 - # Parent Loop BB2_33 Depth=1 +.LBB2_43: # %vector.body466 + # Parent Loop BB2_31 Depth=1 # => This Inner Loop Header: Depth=2 vld $vr4, $s4, 0 vld $vr5, $s5, 0 @@ -1624,26 +1629,26 @@ getVerSubImageSixTap: # @getVerSubImageSixTap vmin.w $vr4, $vr4, $vr3 vmin.w $vr5, $vr5, $vr3 vpickev.h $vr4, $vr5, $vr4 - vst $vr4, $a4, 0 + vst $vr4, $t7, 0 addi.d $a5, $a5, -8 - addi.d $a4, $a4, 16 + addi.d $t7, $t7, 16 addi.d $ra, $ra, 16 addi.d $s8, $s8, 16 addi.d $s7, $s7, 16 addi.d $s6, $s6, 16 addi.d $s5, $s5, 16 addi.d $s4, $s4, 16 - bnez $a5, .LBB2_45 -# %bb.46: # %middle.block475 - # in Loop: Header=BB2_33 Depth=1 - move $a1, $t4 - ld.d $ra, $sp, 56 # 8-byte Folded Reload - ld.d $s6, $sp, 64 # 8-byte Folded Reload - ld.d $s5, $sp, 48 # 8-byte Folded Reload - ori $s7, $zero, 8 - beq $t4, $s5, .LBB2_32 - b .LBB2_36 -.LBB2_47: # %vector.ph + bnez $a5, .LBB2_43 +# %bb.44: # %middle.block475 + # in Loop: Header=BB2_31 Depth=1 + move $a1, $a4 + ld.d $ra, $sp, 40 # 8-byte Folded Reload + ld.d $s8, $sp, 48 # 8-byte Folded Reload + move $s5, $a6 + ori $s6, $zero, 8 + beq $a4, $a6, .LBB2_30 + b .LBB2_34 +.LBB2_45: # %vector.ph bstrpick.d $a1, $a3, 30, 2 slli.d $s2, $a1, 2 vreplgr2vr.w $vr2, $t6 @@ -1656,7 +1661,7 @@ getVerSubImageSixTap: # @getVerSubImageSixTap move $t1, $fp move $s3, $s2 .p2align 4, , 16 -.LBB2_48: # %vector.body +.LBB2_46: # %vector.body # =>This Inner Loop Header: Depth=1 vld $vr4, $a1, 0 vld $vr5, $a4, 0 @@ -1680,10 +1685,10 @@ getVerSubImageSixTap: # @getVerSubImageSixTap addi.d $a5, $a5, 16 addi.d $a4, $a4, 16 addi.d $a1, $a1, 16 - bnez $s3, .LBB2_48 -# %bb.49: # %middle.block - beq $s2, $a3, .LBB2_52 -.LBB2_50: # %scalar.ph.preheader + bnez $s3, .LBB2_46 +# %bb.47: # %middle.block + beq $s2, $a3, .LBB2_50 +.LBB2_48: # %scalar.ph.preheader alsl.d $fp, $s2, $fp, 1 alsl.d $s1, $s2, $s1, 2 alsl.d $s0, $s2, $s0, 2 @@ -1691,7 +1696,7 @@ getVerSubImageSixTap: # @getVerSubImageSixTap alsl.d $t7, $s2, $t7, 2 sub.d $a4, $a3, $s2 .p2align 4, , 16 -.LBB2_51: # %scalar.ph +.LBB2_49: # %scalar.ph # =>This Inner Loop Header: Depth=1 ld.w $a1, $t7, 0 ld.w $a5, $t8, 0 @@ -1720,8 +1725,8 @@ getVerSubImageSixTap: # @getVerSubImageSixTap addi.d $t8, $t8, 4 addi.d $a4, $a4, -1 addi.d $t7, $t7, 4 - bnez $a4, .LBB2_51 -.LBB2_52: # %.lr.ph.1 + bnez $a4, .LBB2_49 +.LBB2_50: # %.lr.ph.1 ld.d $t8, $ra, 8 ld.d $t6, $t2, 8 ld.d $t7, $t2, 16 @@ -1730,11 +1735,11 @@ getVerSubImageSixTap: # @getVerSubImageSixTap ld.d $s1, $t2, 32 ldx.w $t2, $t3, $t4 ori $a1, $zero, 4 - bge $s6, $a1, .LBB2_54 -# %bb.53: + bge $s8, $a1, .LBB2_52 +# %bb.51: move $t3, $zero - b .LBB2_57 -.LBB2_54: # %vector.ph312 + b .LBB2_55 +.LBB2_52: # %vector.ph312 bstrpick.d $a1, $a3, 30, 2 slli.d $t3, $a1, 2 vreplgr2vr.w $vr2, $t2 @@ -1748,7 +1753,7 @@ getVerSubImageSixTap: # @getVerSubImageSixTap move $t4, $t8 move $s2, $t3 .p2align 4, , 16 -.LBB2_55: # %vector.body317 +.LBB2_53: # %vector.body317 # =>This Inner Loop Header: Depth=1 vld $vr4, $a4, 0 vld $vr5, $a5, 0 @@ -1774,10 +1779,10 @@ getVerSubImageSixTap: # @getVerSubImageSixTap addi.d $t1, $t1, 16 addi.d $a5, $a5, 16 addi.d $a4, $a4, 16 - bnez $s2, .LBB2_55 -# %bb.56: # %middle.block325 - beq $t3, $a3, .LBB2_59 -.LBB2_57: # %scalar.ph310.preheader + bnez $s2, .LBB2_53 +# %bb.54: # %middle.block325 + beq $t3, $a3, .LBB2_57 +.LBB2_55: # %scalar.ph310.preheader sub.d $t4, $a3, $t3 alsl.d $t8, $t3, $t8, 1 alsl.d $s1, $t3, $s1, 2 @@ -1786,7 +1791,7 @@ getVerSubImageSixTap: # @getVerSubImageSixTap alsl.d $t7, $t3, $t7, 2 alsl.d $t3, $t3, $t6, 2 .p2align 4, , 16 -.LBB2_58: # %scalar.ph310 +.LBB2_56: # %scalar.ph310 # =>This Inner Loop Header: Depth=1 ld.w $a1, $t3, 0 ld.w $a4, $t7, 0 @@ -1817,44 +1822,44 @@ getVerSubImageSixTap: # @getVerSubImageSixTap addi.d $fp, $fp, 4 addi.d $t7, $t7, 4 addi.d $t3, $t3, 4 - bnez $t4, .LBB2_58 -.LBB2_59: # %.loopexit237.1 + bnez $t4, .LBB2_56 +.LBB2_57: # %.loopexit237.1 ld.d $t2, $s5, %pc_lo12(imgY_sub_tmp) - ld.d $t4, $s7, 0 + ld.d $t4, $s6, 0 addi.w $t3, $t0, 37 addi.w $a1, $zero, -34 bstrpick.d $a5, $t5, 30, 2 - blt $t0, $a1, .LBB2_71 -# %bb.60: # %.lr.ph245 - st.d $s7, $sp, 40 # 8-byte Folded Spill - st.d $s5, $sp, 48 # 8-byte Folded Spill + blt $t0, $a1, .LBB2_69 +# %bb.58: # %.lr.ph245 + st.d $s6, $sp, 24 # 8-byte Folded Spill + st.d $s5, $sp, 32 # 8-byte Folded Spill ori $a1, $zero, 3 slt $a4, $a1, $t3 masknez $a1, $a1, $a4 maskeqz $a4, $t3, $a4 or $t0, $a4, $a1 - st.d $a5, $sp, 32 # 8-byte Folded Spill + st.d $a5, $sp, 16 # 8-byte Folded Spill slli.d $a6, $a5, 2 ori $t8, $zero, 2 addi.w $t7, $zero, -39 ori $fp, $s4, 3232 ori $a1, $zero, 512 vreplgr2vr.w $vr2, $a1 - b .LBB2_62 - .p2align 4, , 16 -.LBB2_61: # %.loopexit235 - # in Loop: Header=BB2_62 Depth=1 - beq $t8, $t0, .LBB2_70 -.LBB2_62: # =>This Loop Header: Depth=1 - # Child Loop BB2_66 Depth 2 - # Child Loop BB2_69 Depth 2 + b .LBB2_60 + .p2align 4, , 16 +.LBB2_59: # %.loopexit235 + # in Loop: Header=BB2_60 Depth=1 + beq $t8, $t0, .LBB2_68 +.LBB2_60: # =>This Loop Header: Depth=1 + # Child Loop BB2_64 Depth 2 + # Child Loop BB2_67 Depth 2 move $a4, $t8 addi.d $t8, $t8, 1 - blt $a0, $t7, .LBB2_61 -# %bb.63: # %.lr.ph243 - # in Loop: Header=BB2_62 Depth=1 + blt $a0, $t7, .LBB2_59 +# %bb.61: # %.lr.ph243 + # in Loop: Header=BB2_60 Depth=1 slli.d $a5, $a4, 3 - ld.d $a1, $sp, 56 # 8-byte Folded Reload + ld.d $a1, $sp, 40 # 8-byte Folded Reload ldx.d $a1, $a1, $a5 alsl.d $t6, $a4, $t2, 3 ldx.d $s2, $t2, $a5 @@ -1865,15 +1870,15 @@ getVerSubImageSixTap: # @getVerSubImageSixTap ld.d $t5, $t6, -16 ld.d $s0, $t6, 24 ldx.w $s1, $t4, $fp - ld.d $t6, $sp, 64 # 8-byte Folded Reload + ld.d $t6, $sp, 48 # 8-byte Folded Reload ori $s3, $zero, 4 - bge $t6, $s3, .LBB2_65 -# %bb.64: # in Loop: Header=BB2_62 Depth=1 + bge $t6, $s3, .LBB2_63 +# %bb.62: # in Loop: Header=BB2_60 Depth=1 move $t6, $zero - b .LBB2_68 + b .LBB2_66 .p2align 4, , 16 -.LBB2_65: # %vector.ph330 - # in Loop: Header=BB2_62 Depth=1 +.LBB2_63: # %vector.ph330 + # in Loop: Header=BB2_60 Depth=1 vreplgr2vr.w $vr3, $s1 move $ra, $s2 move $s3, $a4 @@ -1884,8 +1889,8 @@ getVerSubImageSixTap: # @getVerSubImageSixTap move $s4, $a1 move $t6, $a6 .p2align 4, , 16 -.LBB2_66: # %vector.body335 - # Parent Loop BB2_62 Depth=1 +.LBB2_64: # %vector.body335 + # Parent Loop BB2_60 Depth=1 # => This Inner Loop Header: Depth=2 vld $vr4, $ra, 0 vld $vr5, $s3, 0 @@ -1913,13 +1918,13 @@ getVerSubImageSixTap: # @getVerSubImageSixTap addi.d $s5, $s5, 16 addi.d $s3, $s3, 16 addi.d $ra, $ra, 16 - bnez $t6, .LBB2_66 -# %bb.67: # %middle.block344 - # in Loop: Header=BB2_62 Depth=1 + bnez $t6, .LBB2_64 +# %bb.65: # %middle.block344 + # in Loop: Header=BB2_60 Depth=1 move $t6, $a6 - beq $a6, $a3, .LBB2_61 -.LBB2_68: # %scalar.ph328.preheader - # in Loop: Header=BB2_62 Depth=1 + beq $a6, $a3, .LBB2_59 +.LBB2_66: # %scalar.ph328.preheader + # in Loop: Header=BB2_60 Depth=1 alsl.d $s4, $t6, $a1, 1 alsl.d $s8, $t6, $s0, 2 alsl.d $s7, $t6, $t5, 2 @@ -1929,8 +1934,8 @@ getVerSubImageSixTap: # @getVerSubImageSixTap alsl.d $s2, $t6, $s2, 2 sub.d $ra, $a3, $t6 .p2align 4, , 16 -.LBB2_69: # %scalar.ph328 - # Parent Loop BB2_62 Depth=1 +.LBB2_67: # %scalar.ph328 + # Parent Loop BB2_60 Depth=1 # => This Inner Loop Header: Depth=2 ld.w $a1, $s2, 0 ld.w $a4, $s3, 0 @@ -1963,37 +1968,37 @@ getVerSubImageSixTap: # @getVerSubImageSixTap addi.d $s3, $s3, 4 addi.d $ra, $ra, -1 addi.d $s2, $s2, 4 - bnez $ra, .LBB2_69 - b .LBB2_61 -.LBB2_70: # %.preheader233.loopexit - ld.d $a1, $sp, 48 # 8-byte Folded Reload + bnez $ra, .LBB2_67 + b .LBB2_59 +.LBB2_68: # %.preheader233.loopexit + ld.d $a1, $sp, 32 # 8-byte Folded Reload ld.d $t2, $a1, %pc_lo12(imgY_sub_tmp) - ld.d $a1, $sp, 40 # 8-byte Folded Reload + ld.d $a1, $sp, 24 # 8-byte Folded Reload ld.d $t4, $a1, 0 - ld.d $ra, $sp, 56 # 8-byte Folded Reload - ld.d $s6, $sp, 64 # 8-byte Folded Reload + ld.d $ra, $sp, 40 # 8-byte Folded Reload + ld.d $s8, $sp, 48 # 8-byte Folded Reload lu12i.w $s4, 3 - ld.d $a5, $sp, 32 # 8-byte Folded Reload -.LBB2_71: # %.preheader233 + ld.d $a5, $sp, 16 # 8-byte Folded Reload +.LBB2_69: # %.preheader233 slli.d $a5, $a5, 2 addi.w $t0, $zero, -39 ori $a4, $s4, 3232 ori $t1, $zero, 4 ori $a1, $zero, 512 vreplgr2vr.w $vr2, $a1 - b .LBB2_73 - .p2align 4, , 16 -.LBB2_72: # %.loopexit232 - # in Loop: Header=BB2_73 Depth=1 - bge $t3, $a2, .LBB2_81 -.LBB2_73: # =>This Loop Header: Depth=1 - # Child Loop BB2_77 Depth 2 - # Child Loop BB2_80 Depth 2 + b .LBB2_71 + .p2align 4, , 16 +.LBB2_70: # %.loopexit232 + # in Loop: Header=BB2_71 Depth=1 + bge $t3, $a2, .LBB2_79 +.LBB2_71: # =>This Loop Header: Depth=1 + # Child Loop BB2_75 Depth 2 + # Child Loop BB2_78 Depth 2 move $a1, $t3 addi.d $t3, $t3, 1 - blt $a0, $t0, .LBB2_72 -# %bb.74: # %.lr.ph247 - # in Loop: Header=BB2_73 Depth=1 + blt $a0, $t0, .LBB2_70 +# %bb.72: # %.lr.ph247 + # in Loop: Header=BB2_71 Depth=1 slli.d $a6, $a1, 3 ldx.d $t6, $ra, $a6 addi.w $t5, $a1, 3 @@ -2022,13 +2027,13 @@ getVerSubImageSixTap: # @getVerSubImageSixTap slli.d $a1, $t5, 3 ldx.d $s2, $t2, $a1 ldx.w $t5, $t4, $a4 - bge $s6, $t1, .LBB2_76 -# %bb.75: # in Loop: Header=BB2_73 Depth=1 + bge $s8, $t1, .LBB2_74 +# %bb.73: # in Loop: Header=BB2_71 Depth=1 move $a1, $zero - b .LBB2_79 + b .LBB2_77 .p2align 4, , 16 -.LBB2_76: # %vector.ph349 - # in Loop: Header=BB2_73 Depth=1 +.LBB2_74: # %vector.ph349 + # in Loop: Header=BB2_71 Depth=1 vreplgr2vr.w $vr3, $t5 move $s3, $t7 move $s4, $t8 @@ -2039,8 +2044,8 @@ getVerSubImageSixTap: # @getVerSubImageSixTap move $a1, $t6 move $a6, $a5 .p2align 4, , 16 -.LBB2_77: # %vector.body354 - # Parent Loop BB2_73 Depth=1 +.LBB2_75: # %vector.body354 + # Parent Loop BB2_71 Depth=1 # => This Inner Loop Header: Depth=2 vld $vr4, $s3, 0 vld $vr5, $s4, 0 @@ -2068,14 +2073,14 @@ getVerSubImageSixTap: # @getVerSubImageSixTap addi.d $s5, $s5, 16 addi.d $s4, $s4, 16 addi.d $s3, $s3, 16 - bnez $a6, .LBB2_77 -# %bb.78: # %middle.block363 - # in Loop: Header=BB2_73 Depth=1 + bnez $a6, .LBB2_75 +# %bb.76: # %middle.block363 + # in Loop: Header=BB2_71 Depth=1 move $a1, $a5 - ld.d $s6, $sp, 64 # 8-byte Folded Reload - beq $a5, $a3, .LBB2_72 -.LBB2_79: # %scalar.ph347.preheader - # in Loop: Header=BB2_73 Depth=1 + ld.d $s8, $sp, 48 # 8-byte Folded Reload + beq $a5, $a3, .LBB2_70 +.LBB2_77: # %scalar.ph347.preheader + # in Loop: Header=BB2_71 Depth=1 alsl.d $t6, $a1, $t6, 1 alsl.d $s2, $a1, $s2, 2 alsl.d $s1, $a1, $s1, 2 @@ -2085,8 +2090,8 @@ getVerSubImageSixTap: # @getVerSubImageSixTap alsl.d $t7, $a1, $t7, 2 sub.d $s3, $a3, $a1 .p2align 4, , 16 -.LBB2_80: # %scalar.ph347 - # Parent Loop BB2_73 Depth=1 +.LBB2_78: # %scalar.ph347 + # Parent Loop BB2_71 Depth=1 # => This Inner Loop Header: Depth=2 ld.w $a1, $t7, 0 ld.w $a6, $t8, 0 @@ -2119,183 +2124,160 @@ getVerSubImageSixTap: # @getVerSubImageSixTap addi.d $t8, $t8, 4 addi.d $s3, $s3, -1 addi.d $t7, $t7, 4 - bnez $s3, .LBB2_80 - b .LBB2_72 -.LBB2_81: # %.loopexit227 - ld.d $s8, $sp, 72 # 8-byte Folded Reload - ld.d $s7, $sp, 80 # 8-byte Folded Reload - ld.d $s6, $sp, 88 # 8-byte Folded Reload - ld.d $s5, $sp, 96 # 8-byte Folded Reload - ld.d $s4, $sp, 104 # 8-byte Folded Reload - ld.d $s3, $sp, 112 # 8-byte Folded Reload - ld.d $s2, $sp, 120 # 8-byte Folded Reload - ld.d $s1, $sp, 128 # 8-byte Folded Reload - ld.d $s0, $sp, 136 # 8-byte Folded Reload - ld.d $fp, $sp, 144 # 8-byte Folded Reload - ld.d $ra, $sp, 152 # 8-byte Folded Reload - addi.d $sp, $sp, 160 + bnez $s3, .LBB2_78 + b .LBB2_70 +.LBB2_79: # %.loopexit227 + ld.d $s8, $sp, 56 # 8-byte Folded Reload + ld.d $s7, $sp, 64 # 8-byte Folded Reload + ld.d $s6, $sp, 72 # 8-byte Folded Reload + ld.d $s5, $sp, 80 # 8-byte Folded Reload + ld.d $s4, $sp, 88 # 8-byte Folded Reload + ld.d $s3, $sp, 96 # 8-byte Folded Reload + ld.d $s2, $sp, 104 # 8-byte Folded Reload + ld.d $s1, $sp, 112 # 8-byte Folded Reload + ld.d $s0, $sp, 120 # 8-byte Folded Reload + ld.d $fp, $sp, 128 # 8-byte Folded Reload + ld.d $ra, $sp, 136 # 8-byte Folded Reload + addi.d $sp, $sp, 144 ret -.LBB2_82: # %vector.memcheck - sub.d $a4, $t7, $t5 - ori $a1, $zero, 16 - move $s0, $zero - bltu $a4, $a1, .LBB2_7 -# %bb.83: # %vector.memcheck - sub.d $a4, $t7, $t6 - bltu $a4, $a1, .LBB2_7 -# %bb.84: # %vector.memcheck - sub.d $a4, $t7, $t8 - ori $a1, $zero, 16 - bltu $a4, $a1, .LBB2_7 -# %bb.85: # %vector.memcheck - sub.d $a4, $t7, $fp - bltu $a4, $a1, .LBB2_7 -# %bb.86: # %vector.ph373 - move $a6, $s1 +.LBB2_80: # %vector.memcheck388 + vreplgr2vr.d $vr3, $t4 + vsub.d $vr4, $vr3, $vr1 + vsub.d $vr3, $vr3, $vr2 + vslti.du $vr3, $vr3, 16 + vslti.du $vr4, $vr4, 16 + vpickev.w $vr3, $vr4, $vr3 + vmskltz.w $vr3, $vr3 + vpickve2gr.hu $a1, $vr3, 0 + move $t2, $zero + bnez $a1, .LBB2_12 +# %bb.81: # %vector.memcheck388 + sub.d $a1, $t4, $t5 + ori $a4, $zero, 16 + bltu $a1, $a4, .LBB2_12 +# %bb.82: # %vector.ph400 + vpickve2gr.d $t6, $vr2, 1 + vpickve2gr.d $t7, $vr1, 0 + vpickve2gr.d $t8, $vr2, 0 + vpickve2gr.d $fp, $vr1, 1 bstrpick.d $a1, $s5, 30, 3 - slli.d $s0, $a1, 3 - vreplgr2vr.w $vr1, $t4 - vrepli.w $vr2, 20 - vrepli.w $vr3, -5 + slli.d $t2, $a1, 3 + vreplgr2vr.w $vr3, $t1 + vrepli.w $vr4, 20 + vrepli.w $vr5, -5 move $a4, $t5 - move $a5, $t6 - move $s1, $t8 - move $s2, $fp - move $s3, $t7 - move $s4, $s0 + move $a5, $t4 + move $s0, $t2 .p2align 4, , 16 -.LBB2_87: # %vector.body378 +.LBB2_83: # %vector.body405 # =>This Inner Loop Header: Depth=1 - vld $vr4, $a4, 0 - vld $vr5, $a5, 0 - vilvh.h $vr6, $vr0, $vr4 - vilvl.h $vr4, $vr0, $vr4 - vilvh.h $vr7, $vr0, $vr5 - vilvl.h $vr5, $vr0, $vr5 - vadd.w $vr5, $vr5, $vr4 - vld $vr8, $s1, 0 - vadd.w $vr7, $vr7, $vr6 - vmul.w $vr7, $vr7, $vr2 - vmul.w $vr5, $vr5, $vr2 - vilvl.h $vr9, $vr0, $vr8 - vld $vr10, $s2, 0 - vilvh.h $vr8, $vr0, $vr8 - vadd.w $vr8, $vr8, $vr6 - vadd.w $vr9, $vr9, $vr4 - vilvh.h $vr11, $vr0, $vr10 - vilvl.h $vr10, $vr0, $vr10 - vmadd.w $vr5, $vr9, $vr3 - vmadd.w $vr7, $vr8, $vr3 + vld $vr6, $t6, 0 + vld $vr7, $t7, 0 + vilvh.h $vr8, $vr0, $vr6 + vilvl.h $vr6, $vr0, $vr6 + vilvh.h $vr9, $vr0, $vr7 + vilvl.h $vr7, $vr0, $vr7 vadd.w $vr6, $vr7, $vr6 - vadd.w $vr4, $vr5, $vr4 - vadd.w $vr4, $vr4, $vr10 - vadd.w $vr5, $vr6, $vr11 - vaddi.wu $vr5, $vr5, 16 - vaddi.wu $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 5 - vsrai.w $vr5, $vr5, 5 - vmaxi.w $vr5, $vr5, 0 - vmaxi.w $vr4, $vr4, 0 - vmin.w $vr4, $vr4, $vr1 - vmin.w $vr5, $vr5, $vr1 - vpickev.h $vr4, $vr5, $vr4 - vst $vr4, $s3, 0 - addi.d $s4, $s4, -8 - addi.d $s3, $s3, 16 - addi.d $s2, $s2, 16 - addi.d $s1, $s1, 16 + vadd.w $vr7, $vr9, $vr8 + vld $vr8, $t8, 0 + vmul.w $vr7, $vr7, $vr4 + vld $vr9, $fp, 0 + vmul.w $vr6, $vr6, $vr4 + vilvl.h $vr10, $vr0, $vr8 + vilvh.h $vr8, $vr0, $vr8 + vilvl.h $vr11, $vr0, $vr9 + vld $vr12, $a4, 0 + vilvh.h $vr9, $vr0, $vr9 + vadd.w $vr9, $vr9, $vr8 + vadd.w $vr11, $vr11, $vr10 + vilvh.h $vr13, $vr0, $vr12 + vilvl.h $vr12, $vr0, $vr12 + vmadd.w $vr6, $vr11, $vr5 + vmadd.w $vr7, $vr9, $vr5 + vadd.w $vr7, $vr7, $vr8 + vadd.w $vr6, $vr6, $vr10 + vadd.w $vr6, $vr6, $vr12 + vadd.w $vr7, $vr7, $vr13 + vaddi.wu $vr7, $vr7, 16 + vaddi.wu $vr6, $vr6, 16 + vsrai.w $vr6, $vr6, 5 + vsrai.w $vr7, $vr7, 5 + vmaxi.w $vr7, $vr7, 0 + vmaxi.w $vr6, $vr6, 0 + vmin.w $vr6, $vr6, $vr3 + vmin.w $vr7, $vr7, $vr3 + vpickev.h $vr6, $vr7, $vr6 + vst $vr6, $a5, 0 + addi.d $s0, $s0, -8 addi.d $a5, $a5, 16 addi.d $a4, $a4, 16 - bnez $s4, .LBB2_87 -# %bb.88: # %middle.block385 - move $s1, $a6 - bne $s0, $s5, .LBB2_7 - b .LBB2_9 -.LBB2_89: # %vector.memcheck388 - sub.d $a4, $t6, $t4 - ori $a1, $zero, 16 - move $t2, $zero - bltu $a4, $a1, .LBB2_11 -# %bb.90: # %vector.memcheck388 - sub.d $a4, $t6, $t5 - bltu $a4, $a1, .LBB2_11 -# %bb.91: # %vector.memcheck388 - sub.d $a4, $t6, $t7 - ori $a1, $zero, 16 - bltu $a4, $a1, .LBB2_11 -# %bb.92: # %vector.memcheck388 - sub.d $a4, $t6, $t8 - bltu $a4, $a1, .LBB2_11 -# %bb.93: # %vector.memcheck388 - sub.d $a1, $t6, $fp - ori $a4, $zero, 16 - bltu $a1, $a4, .LBB2_11 -# %bb.94: # %vector.ph400 - move $a6, $s1 + addi.d $fp, $fp, 16 + addi.d $t8, $t8, 16 + addi.d $t7, $t7, 16 + addi.d $t6, $t6, 16 + bnez $s0, .LBB2_83 +# %bb.84: # %middle.block413 + bne $t2, $s5, .LBB2_12 + b .LBB2_14 +.LBB2_85: # %vector.ph373 + vpickve2gr.d $a5, $vr2, 0 + vpickve2gr.d $a4, $vr2, 1 + vpickve2gr.d $t7, $vr1, 0 + vpickve2gr.d $t8, $vr1, 1 bstrpick.d $a1, $s5, 30, 3 - slli.d $t2, $a1, 3 - vreplgr2vr.w $vr1, $t1 - vrepli.w $vr2, 20 - vrepli.w $vr3, -5 - move $a4, $t4 - move $a5, $t5 - move $s0, $t7 - move $s1, $t8 - move $s2, $fp - move $s3, $t6 - move $s4, $t2 - .p2align 4, , 16 -.LBB2_95: # %vector.body405 + slli.d $t6, $a1, 3 + vreplgr2vr.w $vr3, $t4 + vrepli.w $vr4, 20 + vrepli.w $vr5, -5 + move $fp, $t5 + move $s0, $t6 + .p2align 4, , 16 +.LBB2_86: # %vector.body378 # =>This Inner Loop Header: Depth=1 - vld $vr4, $a4, 0 - vld $vr5, $a5, 0 - vilvh.h $vr6, $vr0, $vr4 - vilvl.h $vr4, $vr0, $vr4 - vilvh.h $vr7, $vr0, $vr5 - vilvl.h $vr5, $vr0, $vr5 - vadd.w $vr4, $vr5, $vr4 - vadd.w $vr5, $vr7, $vr6 - vld $vr6, $s0, 0 - vmul.w $vr5, $vr5, $vr2 - vld $vr7, $s1, 0 - vmul.w $vr4, $vr4, $vr2 - vilvl.h $vr8, $vr0, $vr6 - vilvh.h $vr6, $vr0, $vr6 - vilvl.h $vr9, $vr0, $vr7 - vld $vr10, $s2, 0 - vilvh.h $vr7, $vr0, $vr7 + vld $vr6, $a5, 0 + vld $vr7, $a4, 0 + vilvh.h $vr8, $vr0, $vr6 + vilvl.h $vr6, $vr0, $vr6 + vilvh.h $vr9, $vr0, $vr7 + vilvl.h $vr7, $vr0, $vr7 vadd.w $vr7, $vr7, $vr6 + vld $vr10, $t7, 0 vadd.w $vr9, $vr9, $vr8 - vilvh.h $vr11, $vr0, $vr10 - vilvl.h $vr10, $vr0, $vr10 - vmadd.w $vr4, $vr9, $vr3 - vmadd.w $vr5, $vr7, $vr3 - vadd.w $vr5, $vr5, $vr6 - vadd.w $vr4, $vr4, $vr8 - vadd.w $vr4, $vr4, $vr10 - vadd.w $vr5, $vr5, $vr11 - vaddi.wu $vr5, $vr5, 16 - vaddi.wu $vr4, $vr4, 16 - vsrai.w $vr4, $vr4, 5 - vsrai.w $vr5, $vr5, 5 - vmaxi.w $vr5, $vr5, 0 - vmaxi.w $vr4, $vr4, 0 - vmin.w $vr4, $vr4, $vr1 - vmin.w $vr5, $vr5, $vr1 - vpickev.h $vr4, $vr5, $vr4 - vst $vr4, $s3, 0 - addi.d $s4, $s4, -8 - addi.d $s3, $s3, 16 - addi.d $s2, $s2, 16 - addi.d $s1, $s1, 16 - addi.d $s0, $s0, 16 - addi.d $a5, $a5, 16 + vmul.w $vr9, $vr9, $vr4 + vmul.w $vr7, $vr7, $vr4 + vilvl.h $vr11, $vr0, $vr10 + vld $vr12, $t8, 0 + vilvh.h $vr10, $vr0, $vr10 + vadd.w $vr10, $vr10, $vr8 + vadd.w $vr11, $vr11, $vr6 + vilvh.h $vr13, $vr0, $vr12 + vilvl.h $vr12, $vr0, $vr12 + vmadd.w $vr7, $vr11, $vr5 + vmadd.w $vr9, $vr10, $vr5 + vadd.w $vr8, $vr9, $vr8 + vadd.w $vr6, $vr7, $vr6 + vadd.w $vr6, $vr6, $vr12 + vadd.w $vr7, $vr8, $vr13 + vaddi.wu $vr7, $vr7, 16 + vaddi.wu $vr6, $vr6, 16 + vsrai.w $vr6, $vr6, 5 + vsrai.w $vr7, $vr7, 5 + vmaxi.w $vr7, $vr7, 0 + vmaxi.w $vr6, $vr6, 0 + vmin.w $vr6, $vr6, $vr3 + vmin.w $vr7, $vr7, $vr3 + vpickev.h $vr6, $vr7, $vr6 + vst $vr6, $fp, 0 + addi.d $s0, $s0, -8 + addi.d $fp, $fp, 16 + addi.d $t8, $t8, 16 + addi.d $t7, $t7, 16 addi.d $a4, $a4, 16 - bnez $s4, .LBB2_95 -# %bb.96: # %middle.block413 - move $s1, $a6 - bne $t2, $s5, .LBB2_11 - b .LBB2_13 + addi.d $a5, $a5, 16 + bnez $s0, .LBB2_86 +# %bb.87: # %middle.block385 + beq $t6, $s5, .LBB2_10 + b .LBB2_8 .Lfunc_end2: .size getVerSubImageSixTap, .Lfunc_end2-getVerSubImageSixTap # -- End function diff --git a/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/me_distortion.s b/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/me_distortion.s index d1bfd99a..b8bf5809 100644 --- a/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/me_distortion.s +++ b/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/me_distortion.s @@ -7512,7 +7512,7 @@ computeBiPredSSE2: # @computeBiPredSSE2 ld.w $t1, $t1, %pc_lo12(wp_luma_round) move $s1, $a5 move $s3, $a4 - st.d $a3, $sp, 240 # 8-byte Folded Spill + st.d $a3, $sp, 216 # 8-byte Folded Spill move $s6, $a2 move $s5, $a1 addi.d $s7, $t0, 1 @@ -7521,14 +7521,14 @@ computeBiPredSSE2: # @computeBiPredSSE2 pcalau12i $a1, %pc_hi20(bipred2_access_method) st.d $a1, $sp, 160 # 8-byte Folded Spill ld.w $a1, $a1, %pc_lo12(bipred2_access_method) - ld.w $s0, $a2, %pc_lo12(img_padded_size_x) + ld.w $fp, $a2, %pc_lo12(img_padded_size_x) pcalau12i $s4, %pc_hi20(src_line) st.d $a0, $sp, 168 # 8-byte Folded Spill st.d $a0, $s4, %pc_lo12(src_line) slli.d $a0, $a1, 3 pcalau12i $a1, %pc_hi20(get_line) - addi.d $fp, $a1, %pc_lo12(get_line) - ldx.d $a3, $fp, $a0 + addi.d $s0, $a1, %pc_lo12(get_line) + ldx.d $a3, $s0, $a0 pcalau12i $a0, %pc_hi20(ref_pic2_sub) addi.d $a0, $a0, %pc_lo12(ref_pic2_sub) st.d $a0, $sp, 152 # 8-byte Folded Spill @@ -7542,7 +7542,7 @@ computeBiPredSSE2: # @computeBiPredSSE2 st.d $a1, $sp, 144 # 8-byte Folded Spill ld.w $a1, $a1, %pc_lo12(bipred1_access_method) slli.d $a1, $a1, 3 - ldx.d $a3, $fp, $a1 + ldx.d $a3, $s0, $a1 pcalau12i $s2, %pc_hi20(ref2_line) st.d $a0, $s2, %pc_lo12(ref2_line) pcalau12i $a0, %pc_hi20(ref_pic1_sub) @@ -7554,111 +7554,111 @@ computeBiPredSSE2: # @computeBiPredSSE2 st.d $s3, $sp, 176 # 8-byte Folded Spill move $a2, $s3 jirl $ra, $a3, 0 - pcalau12i $fp, %pc_hi20(ref1_line) - st.d $a0, $fp, %pc_lo12(ref1_line) + pcalau12i $s1, %pc_hi20(ref1_line) + st.d $a0, $s1, %pc_lo12(ref1_line) vreplgr2vr.w $vr12, $s8 vreplgr2vr.w $vr13, $s7 lu12i.w $a1, 3 - st.d $s4, $sp, 232 # 8-byte Folded Spill - blez $s5, .LBB16_12 + st.d $s4, $sp, 240 # 8-byte Folded Spill + blez $s5, .LBB16_13 # %bb.1: # %.preheader109.lr.ph - ld.d $s4, $s4, %pc_lo12(src_line) - ld.d $ra, $s2, %pc_lo12(ref2_line) + ld.d $t7, $s4, %pc_lo12(src_line) + ld.d $t6, $s2, %pc_lo12(ref2_line) pcalau12i $a2, %pc_hi20(weight1) - ld.h $a2, $a2, %pc_lo12(weight1) - pcalau12i $a3, %pc_hi20(weight2) - ld.h $a3, $a3, %pc_lo12(weight2) - sub.w $a4, $s0, $s6 - pcalau12i $a5, %got_pc_hi20(img) - ld.d $a6, $a5, %got_pc_lo12(img) + ld.h $a7, $a2, %pc_lo12(weight1) + pcalau12i $a2, %pc_hi20(weight2) + ld.h $t1, $a2, %pc_lo12(weight2) + sub.w $a2, $fp, $s6 + pcalau12i $a3, %got_pc_hi20(img) + ld.d $a4, $a3, %got_pc_lo12(img) move $s3, $zero - move $a5, $zero - ld.d $a6, $a6, 0 - pcalau12i $a7, %pc_hi20(offsetBi) - ld.h $a7, $a7, %pc_lo12(offsetBi) - addi.d $t0, $s6, -1 - bstrpick.d $t0, $t0, 31, 2 - addi.d $t0, $t0, 1 - bstrpick.d $t3, $t0, 30, 3 - slli.d $t1, $t3, 3 - slli.d $t2, $t3, 5 - st.d $t2, $sp, 224 # 8-byte Folded Spill - slli.d $t3, $t3, 6 - vreplgr2vr.w $vr0, $a2 - vreplgr2vr.w $vr1, $a3 - vreplgr2vr.w $vr2, $a7 - ori $t4, $a1, 3232 + move $a3, $zero + ld.d $a4, $a4, 0 + pcalau12i $a5, %pc_hi20(offsetBi) + ld.h $t2, $a5, %pc_lo12(offsetBi) + addi.d $a5, $s6, -1 + bstrpick.d $a5, $a5, 31, 2 + addi.d $a5, $a5, 1 + bstrpick.d $t0, $a5, 30, 3 + slli.d $a6, $t0, 3 + vreplgr2vr.w $vr0, $a7 + slli.d $a7, $t0, 5 + slli.d $t0, $t0, 6 + vreplgr2vr.w $vr1, $t1 + vreplgr2vr.w $vr2, $t2 + ori $t1, $a1, 3232 + ori $t2, $zero, 29 vrepli.b $vr3, 0 + ld.d $ra, $sp, 216 # 8-byte Folded Reload + move $fp, $s4 .p2align 4, , 16 .LBB16_2: # %.preheader109 # =>This Loop Header: Depth=1 # Child Loop BB16_6 Depth 2 - # Child Loop BB16_8 Depth 2 - blez $s6, .LBB16_10 + # Child Loop BB16_9 Depth 2 + blez $s6, .LBB16_11 # %bb.3: # %.lr.ph # in Loop: Header=BB16_2 Depth=1 - ldx.w $t6, $a6, $t4 - ori $a1, $zero, 29 - bgeu $s6, $a1, .LBB16_5 + ldx.w $t3, $a4, $t1 + vreplgr2vr.w $vr4, $t3 + bgeu $s6, $t2, .LBB16_5 # %bb.4: # in Loop: Header=BB16_2 Depth=1 - move $s1, $zero - move $t7, $a0 - move $s0, $ra - move $t8, $s4 - ld.d $s4, $sp, 232 # 8-byte Folded Reload + move $t8, $zero + move $t3, $a0 + move $t4, $t6 + move $t5, $t7 b .LBB16_8 .p2align 4, , 16 .LBB16_5: # %vector.ph # in Loop: Header=BB16_2 Depth=1 - add.d $t7, $a0, $t3 - add.d $s0, $ra, $t3 - add.d $t8, $s4, $t3 - vori.b $vr4, $vr3, 0 - vinsgr2vr.w $vr4, $s3, 0 - vreplgr2vr.w $vr5, $t6 + add.d $t3, $a0, $t0 + add.d $t4, $t6, $t0 + add.d $t5, $t7, $t0 + vori.b $vr5, $vr3, 0 + vinsgr2vr.w $vr5, $s3, 0 addi.d $a0, $a0, 32 - addi.d $s1, $ra, 32 - addi.d $s3, $s4, 32 - move $s4, $t1 + addi.d $t6, $t6, 32 + addi.d $t7, $t7, 32 + move $t8, $a6 vori.b $vr6, $vr3, 0 .p2align 4, , 16 .LBB16_6: # %vector.body # Parent Loop BB16_2 Depth=1 # => This Inner Loop Header: Depth=2 - ld.h $ra, $a0, -32 - ld.h $a1, $a0, -24 - ld.h $t5, $a0, -16 - ld.h $t2, $a0, -8 - vinsgr2vr.h $vr7, $ra, 0 - vinsgr2vr.h $vr7, $a1, 1 - vinsgr2vr.h $vr7, $t5, 2 - vinsgr2vr.h $vr7, $t2, 3 - ld.h $a1, $a0, 0 - ld.h $t2, $a0, 8 - ld.h $t5, $a0, 16 - ld.h $ra, $a0, 24 - vinsgr2vr.h $vr8, $a1, 0 - vinsgr2vr.h $vr8, $t2, 1 - vinsgr2vr.h $vr8, $t5, 2 - vinsgr2vr.h $vr8, $ra, 3 + ld.h $fp, $a0, -32 + ld.h $s0, $a0, -24 + ld.h $s3, $a0, -16 + ld.h $s4, $a0, -8 + vinsgr2vr.h $vr7, $fp, 0 + vinsgr2vr.h $vr7, $s0, 1 + vinsgr2vr.h $vr7, $s3, 2 + vinsgr2vr.h $vr7, $s4, 3 + ld.h $fp, $a0, 0 + ld.h $s0, $a0, 8 + ld.h $s3, $a0, 16 + ld.h $s4, $a0, 24 + vinsgr2vr.h $vr8, $fp, 0 + vinsgr2vr.h $vr8, $s0, 1 + vinsgr2vr.h $vr8, $s3, 2 + vinsgr2vr.h $vr8, $s4, 3 vilvl.h $vr7, $vr3, $vr7 vilvl.h $vr8, $vr3, $vr8 - ld.h $a1, $s1, -32 - ld.h $t2, $s1, -24 - ld.h $t5, $s1, -16 - ld.h $ra, $s1, -8 - vinsgr2vr.h $vr9, $a1, 0 - vinsgr2vr.h $vr9, $t2, 1 - vinsgr2vr.h $vr9, $t5, 2 - vinsgr2vr.h $vr9, $ra, 3 - ld.h $a1, $s1, 0 - ld.h $t2, $s1, 8 - ld.h $t5, $s1, 16 - ld.h $ra, $s1, 24 - vinsgr2vr.h $vr10, $a1, 0 - vinsgr2vr.h $vr10, $t2, 1 - vinsgr2vr.h $vr10, $t5, 2 - vinsgr2vr.h $vr10, $ra, 3 + ld.h $fp, $t6, -32 + ld.h $s0, $t6, -24 + ld.h $s3, $t6, -16 + ld.h $s4, $t6, -8 + vinsgr2vr.h $vr9, $fp, 0 + vinsgr2vr.h $vr9, $s0, 1 + vinsgr2vr.h $vr9, $s3, 2 + vinsgr2vr.h $vr9, $s4, 3 + ld.h $fp, $t6, 0 + ld.h $s0, $t6, 8 + ld.h $s3, $t6, 16 + ld.h $s4, $t6, 24 + vinsgr2vr.h $vr10, $fp, 0 + vinsgr2vr.h $vr10, $s0, 1 + vinsgr2vr.h $vr10, $s3, 2 + vinsgr2vr.h $vr10, $s4, 3 vilvl.h $vr9, $vr3, $vr9 vilvl.h $vr10, $vr3, $vr10 vori.b $vr11, $vr12, 0 @@ -7673,64 +7673,64 @@ computeBiPredSSE2: # @computeBiPredSSE2 vadd.w $vr7, $vr7, $vr2 vmaxi.w $vr8, $vr8, 0 vmaxi.w $vr7, $vr7, 0 - vmin.w $vr8, $vr8, $vr5 - vmin.w $vr7, $vr7, $vr5 - ld.h $a1, $s3, -32 - ld.h $t2, $s3, -24 - ld.h $t5, $s3, -16 - ld.h $ra, $s3, -8 - vinsgr2vr.h $vr9, $a1, 0 - vinsgr2vr.h $vr9, $t2, 1 - vinsgr2vr.h $vr9, $t5, 2 - vinsgr2vr.h $vr9, $ra, 3 - ld.h $a1, $s3, 0 - ld.h $t2, $s3, 8 - ld.h $t5, $s3, 16 - ld.h $ra, $s3, 24 - vinsgr2vr.h $vr10, $a1, 0 - vinsgr2vr.h $vr10, $t2, 1 - vinsgr2vr.h $vr10, $t5, 2 - vinsgr2vr.h $vr10, $ra, 3 + vmin.w $vr8, $vr8, $vr4 + vmin.w $vr7, $vr7, $vr4 + ld.h $fp, $t7, -32 + ld.h $s0, $t7, -24 + ld.h $s3, $t7, -16 + ld.h $s4, $t7, -8 + vinsgr2vr.h $vr9, $fp, 0 + vinsgr2vr.h $vr9, $s0, 1 + vinsgr2vr.h $vr9, $s3, 2 + vinsgr2vr.h $vr9, $s4, 3 + ld.h $fp, $t7, 0 + ld.h $s0, $t7, 8 + ld.h $s3, $t7, 16 + ld.h $s4, $t7, 24 + vinsgr2vr.h $vr10, $fp, 0 + vinsgr2vr.h $vr10, $s0, 1 + vinsgr2vr.h $vr10, $s3, 2 + vinsgr2vr.h $vr10, $s4, 3 vilvl.h $vr9, $vr3, $vr9 vilvl.h $vr10, $vr3, $vr10 vsub.w $vr8, $vr9, $vr8 vsub.w $vr7, $vr10, $vr7 - vmadd.w $vr4, $vr8, $vr8 + vmadd.w $vr5, $vr8, $vr8 vmadd.w $vr6, $vr7, $vr7 - ld.h $a1, $a0, -30 - ld.h $t2, $a0, -22 - ld.h $t5, $a0, -14 - ld.h $ra, $a0, -6 - vinsgr2vr.h $vr7, $a1, 0 - vinsgr2vr.h $vr7, $t2, 1 - vinsgr2vr.h $vr7, $t5, 2 - vinsgr2vr.h $vr7, $ra, 3 - ld.h $a1, $a0, 2 - ld.h $t2, $a0, 10 - ld.h $t5, $a0, 18 - ld.h $ra, $a0, 26 - vinsgr2vr.h $vr8, $a1, 0 - vinsgr2vr.h $vr8, $t2, 1 - vinsgr2vr.h $vr8, $t5, 2 - vinsgr2vr.h $vr8, $ra, 3 + ld.h $fp, $a0, -30 + ld.h $s0, $a0, -22 + ld.h $s3, $a0, -14 + ld.h $s4, $a0, -6 + vinsgr2vr.h $vr7, $fp, 0 + vinsgr2vr.h $vr7, $s0, 1 + vinsgr2vr.h $vr7, $s3, 2 + vinsgr2vr.h $vr7, $s4, 3 + ld.h $fp, $a0, 2 + ld.h $s0, $a0, 10 + ld.h $s3, $a0, 18 + ld.h $s4, $a0, 26 + vinsgr2vr.h $vr8, $fp, 0 + vinsgr2vr.h $vr8, $s0, 1 + vinsgr2vr.h $vr8, $s3, 2 + vinsgr2vr.h $vr8, $s4, 3 vilvl.h $vr7, $vr3, $vr7 vilvl.h $vr8, $vr3, $vr8 - ld.h $a1, $s1, -30 - ld.h $t2, $s1, -22 - ld.h $t5, $s1, -14 - ld.h $ra, $s1, -6 - vinsgr2vr.h $vr9, $a1, 0 - vinsgr2vr.h $vr9, $t2, 1 - vinsgr2vr.h $vr9, $t5, 2 - vinsgr2vr.h $vr9, $ra, 3 - ld.h $a1, $s1, 2 - ld.h $t2, $s1, 10 - ld.h $t5, $s1, 18 - ld.h $ra, $s1, 26 - vinsgr2vr.h $vr10, $a1, 0 - vinsgr2vr.h $vr10, $t2, 1 - vinsgr2vr.h $vr10, $t5, 2 - vinsgr2vr.h $vr10, $ra, 3 + ld.h $fp, $t6, -30 + ld.h $s0, $t6, -22 + ld.h $s3, $t6, -14 + ld.h $s4, $t6, -6 + vinsgr2vr.h $vr9, $fp, 0 + vinsgr2vr.h $vr9, $s0, 1 + vinsgr2vr.h $vr9, $s3, 2 + vinsgr2vr.h $vr9, $s4, 3 + ld.h $fp, $t6, 2 + ld.h $s0, $t6, 10 + ld.h $s3, $t6, 18 + ld.h $s4, $t6, 26 + vinsgr2vr.h $vr10, $fp, 0 + vinsgr2vr.h $vr10, $s0, 1 + vinsgr2vr.h $vr10, $s3, 2 + vinsgr2vr.h $vr10, $s4, 3 vilvl.h $vr9, $vr3, $vr9 vilvl.h $vr10, $vr3, $vr10 vori.b $vr11, $vr12, 0 @@ -7745,64 +7745,64 @@ computeBiPredSSE2: # @computeBiPredSSE2 vadd.w $vr7, $vr7, $vr2 vmaxi.w $vr8, $vr8, 0 vmaxi.w $vr7, $vr7, 0 - vmin.w $vr8, $vr8, $vr5 - vmin.w $vr7, $vr7, $vr5 - ld.h $a1, $s3, -30 - ld.h $t2, $s3, -22 - ld.h $t5, $s3, -14 - ld.h $ra, $s3, -6 - vinsgr2vr.h $vr9, $a1, 0 - vinsgr2vr.h $vr9, $t2, 1 - vinsgr2vr.h $vr9, $t5, 2 - vinsgr2vr.h $vr9, $ra, 3 - ld.h $a1, $s3, 2 - ld.h $t2, $s3, 10 - ld.h $t5, $s3, 18 - ld.h $ra, $s3, 26 - vinsgr2vr.h $vr10, $a1, 0 - vinsgr2vr.h $vr10, $t2, 1 - vinsgr2vr.h $vr10, $t5, 2 - vinsgr2vr.h $vr10, $ra, 3 + vmin.w $vr8, $vr8, $vr4 + vmin.w $vr7, $vr7, $vr4 + ld.h $fp, $t7, -30 + ld.h $s0, $t7, -22 + ld.h $s3, $t7, -14 + ld.h $s4, $t7, -6 + vinsgr2vr.h $vr9, $fp, 0 + vinsgr2vr.h $vr9, $s0, 1 + vinsgr2vr.h $vr9, $s3, 2 + vinsgr2vr.h $vr9, $s4, 3 + ld.h $fp, $t7, 2 + ld.h $s0, $t7, 10 + ld.h $s3, $t7, 18 + ld.h $s4, $t7, 26 + vinsgr2vr.h $vr10, $fp, 0 + vinsgr2vr.h $vr10, $s0, 1 + vinsgr2vr.h $vr10, $s3, 2 + vinsgr2vr.h $vr10, $s4, 3 vilvl.h $vr9, $vr3, $vr9 vilvl.h $vr10, $vr3, $vr10 vsub.w $vr8, $vr9, $vr8 vsub.w $vr7, $vr10, $vr7 - vmadd.w $vr4, $vr8, $vr8 + vmadd.w $vr5, $vr8, $vr8 vmadd.w $vr6, $vr7, $vr7 - ld.h $a1, $a0, -28 - ld.h $t2, $a0, -20 - ld.h $t5, $a0, -12 - ld.h $ra, $a0, -4 - vinsgr2vr.h $vr7, $a1, 0 - vinsgr2vr.h $vr7, $t2, 1 - vinsgr2vr.h $vr7, $t5, 2 - vinsgr2vr.h $vr7, $ra, 3 - ld.h $a1, $a0, 4 - ld.h $t2, $a0, 12 - ld.h $t5, $a0, 20 - ld.h $ra, $a0, 28 - vinsgr2vr.h $vr8, $a1, 0 - vinsgr2vr.h $vr8, $t2, 1 - vinsgr2vr.h $vr8, $t5, 2 - vinsgr2vr.h $vr8, $ra, 3 + ld.h $fp, $a0, -28 + ld.h $s0, $a0, -20 + ld.h $s3, $a0, -12 + ld.h $s4, $a0, -4 + vinsgr2vr.h $vr7, $fp, 0 + vinsgr2vr.h $vr7, $s0, 1 + vinsgr2vr.h $vr7, $s3, 2 + vinsgr2vr.h $vr7, $s4, 3 + ld.h $fp, $a0, 4 + ld.h $s0, $a0, 12 + ld.h $s3, $a0, 20 + ld.h $s4, $a0, 28 + vinsgr2vr.h $vr8, $fp, 0 + vinsgr2vr.h $vr8, $s0, 1 + vinsgr2vr.h $vr8, $s3, 2 + vinsgr2vr.h $vr8, $s4, 3 vilvl.h $vr7, $vr3, $vr7 vilvl.h $vr8, $vr3, $vr8 - ld.h $a1, $s1, -28 - ld.h $t2, $s1, -20 - ld.h $t5, $s1, -12 - ld.h $ra, $s1, -4 - vinsgr2vr.h $vr9, $a1, 0 - vinsgr2vr.h $vr9, $t2, 1 - vinsgr2vr.h $vr9, $t5, 2 - vinsgr2vr.h $vr9, $ra, 3 - ld.h $a1, $s1, 4 - ld.h $t2, $s1, 12 - ld.h $t5, $s1, 20 - ld.h $ra, $s1, 28 - vinsgr2vr.h $vr10, $a1, 0 - vinsgr2vr.h $vr10, $t2, 1 - vinsgr2vr.h $vr10, $t5, 2 - vinsgr2vr.h $vr10, $ra, 3 + ld.h $fp, $t6, -28 + ld.h $s0, $t6, -20 + ld.h $s3, $t6, -12 + ld.h $s4, $t6, -4 + vinsgr2vr.h $vr9, $fp, 0 + vinsgr2vr.h $vr9, $s0, 1 + vinsgr2vr.h $vr9, $s3, 2 + vinsgr2vr.h $vr9, $s4, 3 + ld.h $fp, $t6, 4 + ld.h $s0, $t6, 12 + ld.h $s3, $t6, 20 + ld.h $s4, $t6, 28 + vinsgr2vr.h $vr10, $fp, 0 + vinsgr2vr.h $vr10, $s0, 1 + vinsgr2vr.h $vr10, $s3, 2 + vinsgr2vr.h $vr10, $s4, 3 vilvl.h $vr9, $vr3, $vr9 vilvl.h $vr10, $vr3, $vr10 vori.b $vr11, $vr12, 0 @@ -7817,64 +7817,64 @@ computeBiPredSSE2: # @computeBiPredSSE2 vadd.w $vr7, $vr7, $vr2 vmaxi.w $vr8, $vr8, 0 vmaxi.w $vr7, $vr7, 0 - vmin.w $vr8, $vr8, $vr5 - vmin.w $vr7, $vr7, $vr5 - ld.h $a1, $s3, -28 - ld.h $t2, $s3, -20 - ld.h $t5, $s3, -12 - ld.h $ra, $s3, -4 - vinsgr2vr.h $vr9, $a1, 0 - vinsgr2vr.h $vr9, $t2, 1 - vinsgr2vr.h $vr9, $t5, 2 - vinsgr2vr.h $vr9, $ra, 3 - ld.h $a1, $s3, 4 - ld.h $t2, $s3, 12 - ld.h $t5, $s3, 20 - ld.h $ra, $s3, 28 - vinsgr2vr.h $vr10, $a1, 0 - vinsgr2vr.h $vr10, $t2, 1 - vinsgr2vr.h $vr10, $t5, 2 - vinsgr2vr.h $vr10, $ra, 3 + vmin.w $vr8, $vr8, $vr4 + vmin.w $vr7, $vr7, $vr4 + ld.h $fp, $t7, -28 + ld.h $s0, $t7, -20 + ld.h $s3, $t7, -12 + ld.h $s4, $t7, -4 + vinsgr2vr.h $vr9, $fp, 0 + vinsgr2vr.h $vr9, $s0, 1 + vinsgr2vr.h $vr9, $s3, 2 + vinsgr2vr.h $vr9, $s4, 3 + ld.h $fp, $t7, 4 + ld.h $s0, $t7, 12 + ld.h $s3, $t7, 20 + ld.h $s4, $t7, 28 + vinsgr2vr.h $vr10, $fp, 0 + vinsgr2vr.h $vr10, $s0, 1 + vinsgr2vr.h $vr10, $s3, 2 + vinsgr2vr.h $vr10, $s4, 3 vilvl.h $vr9, $vr3, $vr9 vilvl.h $vr10, $vr3, $vr10 vsub.w $vr8, $vr9, $vr8 vsub.w $vr7, $vr10, $vr7 - vmadd.w $vr4, $vr8, $vr8 + vmadd.w $vr5, $vr8, $vr8 vmadd.w $vr6, $vr7, $vr7 - ld.h $a1, $a0, -26 - ld.h $t2, $a0, -18 - ld.h $t5, $a0, -10 - ld.h $ra, $a0, -2 - vinsgr2vr.h $vr7, $a1, 0 - vinsgr2vr.h $vr7, $t2, 1 - vinsgr2vr.h $vr7, $t5, 2 - vinsgr2vr.h $vr7, $ra, 3 - ld.h $a1, $a0, 6 - ld.h $t2, $a0, 14 - ld.h $t5, $a0, 22 - ld.h $ra, $a0, 30 - vinsgr2vr.h $vr8, $a1, 0 - vinsgr2vr.h $vr8, $t2, 1 - vinsgr2vr.h $vr8, $t5, 2 - vinsgr2vr.h $vr8, $ra, 3 + ld.h $fp, $a0, -26 + ld.h $s0, $a0, -18 + ld.h $s3, $a0, -10 + ld.h $s4, $a0, -2 + vinsgr2vr.h $vr7, $fp, 0 + vinsgr2vr.h $vr7, $s0, 1 + vinsgr2vr.h $vr7, $s3, 2 + vinsgr2vr.h $vr7, $s4, 3 + ld.h $fp, $a0, 6 + ld.h $s0, $a0, 14 + ld.h $s3, $a0, 22 + ld.h $s4, $a0, 30 + vinsgr2vr.h $vr8, $fp, 0 + vinsgr2vr.h $vr8, $s0, 1 + vinsgr2vr.h $vr8, $s3, 2 + vinsgr2vr.h $vr8, $s4, 3 vilvl.h $vr7, $vr3, $vr7 vilvl.h $vr8, $vr3, $vr8 - ld.h $a1, $s1, -26 - ld.h $t2, $s1, -18 - ld.h $t5, $s1, -10 - ld.h $ra, $s1, -2 - vinsgr2vr.h $vr9, $a1, 0 - vinsgr2vr.h $vr9, $t2, 1 - vinsgr2vr.h $vr9, $t5, 2 - vinsgr2vr.h $vr9, $ra, 3 - ld.h $a1, $s1, 6 - ld.h $t2, $s1, 14 - ld.h $t5, $s1, 22 - ld.h $ra, $s1, 30 - vinsgr2vr.h $vr10, $a1, 0 - vinsgr2vr.h $vr10, $t2, 1 - vinsgr2vr.h $vr10, $t5, 2 - vinsgr2vr.h $vr10, $ra, 3 + ld.h $fp, $t6, -26 + ld.h $s0, $t6, -18 + ld.h $s3, $t6, -10 + ld.h $s4, $t6, -2 + vinsgr2vr.h $vr9, $fp, 0 + vinsgr2vr.h $vr9, $s0, 1 + vinsgr2vr.h $vr9, $s3, 2 + vinsgr2vr.h $vr9, $s4, 3 + ld.h $fp, $t6, 6 + ld.h $s0, $t6, 14 + ld.h $s3, $t6, 22 + ld.h $s4, $t6, 30 + vinsgr2vr.h $vr10, $fp, 0 + vinsgr2vr.h $vr10, $s0, 1 + vinsgr2vr.h $vr10, $s3, 2 + vinsgr2vr.h $vr10, $s4, 3 vilvl.h $vr9, $vr3, $vr9 vilvl.h $vr10, $vr3, $vr10 vori.b $vr11, $vr12, 0 @@ -7889,169 +7889,126 @@ computeBiPredSSE2: # @computeBiPredSSE2 vadd.w $vr7, $vr7, $vr2 vmaxi.w $vr8, $vr8, 0 vmaxi.w $vr7, $vr7, 0 - vmin.w $vr8, $vr8, $vr5 - vmin.w $vr7, $vr7, $vr5 - ld.h $a1, $s3, -26 - ld.h $t2, $s3, -18 - ld.h $t5, $s3, -10 - ld.h $ra, $s3, -2 - vinsgr2vr.h $vr9, $a1, 0 - vinsgr2vr.h $vr9, $t2, 1 - vinsgr2vr.h $vr9, $t5, 2 - vinsgr2vr.h $vr9, $ra, 3 - ld.h $a1, $s3, 6 - ld.h $t2, $s3, 14 - ld.h $t5, $s3, 22 - ld.h $ra, $s3, 30 - vinsgr2vr.h $vr10, $a1, 0 - vinsgr2vr.h $vr10, $t2, 1 - vinsgr2vr.h $vr10, $t5, 2 - vinsgr2vr.h $vr10, $ra, 3 + vmin.w $vr8, $vr8, $vr4 + vmin.w $vr7, $vr7, $vr4 + ld.h $fp, $t7, -26 + ld.h $s0, $t7, -18 + ld.h $s3, $t7, -10 + ld.h $s4, $t7, -2 + vinsgr2vr.h $vr9, $fp, 0 + vinsgr2vr.h $vr9, $s0, 1 + vinsgr2vr.h $vr9, $s3, 2 + vinsgr2vr.h $vr9, $s4, 3 + ld.h $fp, $t7, 6 + ld.h $s0, $t7, 14 + ld.h $s3, $t7, 22 + ld.h $s4, $t7, 30 + vinsgr2vr.h $vr10, $fp, 0 + vinsgr2vr.h $vr10, $s0, 1 + vinsgr2vr.h $vr10, $s3, 2 + vinsgr2vr.h $vr10, $s4, 3 vilvl.h $vr9, $vr3, $vr9 vilvl.h $vr10, $vr3, $vr10 vsub.w $vr8, $vr9, $vr8 vsub.w $vr7, $vr10, $vr7 - vmadd.w $vr4, $vr8, $vr8 + vmadd.w $vr5, $vr8, $vr8 vmadd.w $vr6, $vr7, $vr7 - addi.d $s4, $s4, -8 + addi.d $t8, $t8, -8 addi.d $a0, $a0, 64 - addi.d $s1, $s1, 64 - addi.d $s3, $s3, 64 - bnez $s4, .LBB16_6 + addi.d $t6, $t6, 64 + addi.d $t7, $t7, 64 + bnez $t8, .LBB16_6 # %bb.7: # %middle.block # in Loop: Header=BB16_2 Depth=1 - vadd.w $vr4, $vr6, $vr4 - vhaddw.d.w $vr4, $vr4, $vr4 - vhaddw.q.d $vr4, $vr4, $vr4 - vpickve2gr.d $s3, $vr4, 0 - ld.d $s1, $sp, 224 # 8-byte Folded Reload - ld.d $s4, $sp, 232 # 8-byte Folded Reload - beq $t1, $t0, .LBB16_9 + vadd.w $vr5, $vr6, $vr5 + vhaddw.d.w $vr5, $vr5, $vr5 + vhaddw.q.d $vr5, $vr5, $vr5 + vpickve2gr.d $s3, $vr5, 0 + move $t8, $a7 + ld.d $fp, $sp, 240 # 8-byte Folded Reload + beq $a6, $a5, .LBB16_10 +.LBB16_8: # %scalar.ph.preheader + # in Loop: Header=BB16_2 Depth=1 + move $t7, $t3 + move $t6, $t4 + move $a0, $t5 .p2align 4, , 16 -.LBB16_8: # %scalar.ph +.LBB16_9: # %scalar.ph # Parent Loop BB16_2 Depth=1 # => This Inner Loop Header: Depth=2 - ld.hu $a0, $t7, 0 - ld.hu $a1, $s0, 0 - mul.d $a0, $a0, $a2 - mul.d $a1, $a1, $a3 - add.d $a0, $a0, $s8 - add.d $a0, $a0, $a1 - sra.w $a0, $a0, $s7 - add.w $a0, $a0, $a7 - srai.d $a1, $a0, 63 - andn $a0, $a0, $a1 - slt $a1, $a0, $t6 - ld.hu $t2, $t8, 0 - maskeqz $a0, $a0, $a1 - masknez $a1, $t6, $a1 - or $a0, $a0, $a1 - sub.d $a0, $t2, $a0 - ld.hu $a1, $t7, 2 - ld.hu $t2, $s0, 2 - mulw.d.w $a0, $a0, $a0 - add.d $a0, $a0, $s3 - mul.d $a1, $a1, $a2 - mul.d $t2, $t2, $a3 - add.d $a1, $a1, $s8 - add.d $a1, $a1, $t2 - sra.w $a1, $a1, $s7 - add.w $a1, $a1, $a7 - srai.d $t2, $a1, 63 - andn $a1, $a1, $t2 - slt $t2, $a1, $t6 - ld.hu $t5, $t8, 2 - maskeqz $a1, $a1, $t2 - masknez $t2, $t6, $t2 - or $a1, $a1, $t2 - sub.d $a1, $t5, $a1 - ld.hu $t2, $t7, 4 - ld.hu $t5, $s0, 4 - mul.d $a1, $a1, $a1 - add.d $a0, $a0, $a1 - mul.d $a1, $t2, $a2 - mul.d $t2, $t5, $a3 - add.d $a1, $a1, $s8 - add.d $a1, $a1, $t2 - sra.w $a1, $a1, $s7 - add.w $a1, $a1, $a7 - srai.d $t2, $a1, 63 - andn $a1, $a1, $t2 - slt $t2, $a1, $t6 - ld.hu $t5, $t8, 4 - maskeqz $a1, $a1, $t2 - masknez $t2, $t6, $t2 - or $a1, $a1, $t2 - sub.d $a1, $t5, $a1 - ld.hu $t2, $t7, 6 - ld.hu $t5, $s0, 6 - mul.d $a1, $a1, $a1 - add.d $a0, $a0, $a1 - mul.d $a1, $t2, $a2 - mul.d $t2, $t5, $a3 - add.d $a1, $a1, $s8 - add.d $a1, $a1, $t2 - sra.w $a1, $a1, $s7 - add.w $a1, $a1, $a7 - srai.d $t2, $a1, 63 - andn $a1, $a1, $t2 - slt $t2, $a1, $t6 - maskeqz $a1, $a1, $t2 - masknez $t2, $t6, $t2 - or $a1, $a1, $t2 - ld.hu $t2, $t8, 6 - addi.d $t7, $t7, 8 - addi.d $s0, $s0, 8 - addi.d $t8, $t8, 8 - sub.d $a1, $t2, $a1 - mul.d $a1, $a1, $a1 - addi.w $s1, $s1, 4 - add.d $s3, $a0, $a1 - blt $s1, $s6, .LBB16_8 -.LBB16_9: # %._crit_edge + addi.d $t3, $t7, 8 + ld.d $t7, $t7, 0 + addi.d $t4, $t6, 8 + ld.d $t6, $t6, 0 + addi.d $t5, $a0, 8 + vinsgr2vr.d $vr5, $t7, 0 + vilvl.h $vr5, $vr3, $vr5 + vinsgr2vr.d $vr6, $t6, 0 + vilvl.h $vr6, $vr3, $vr6 + vori.b $vr7, $vr12, 0 + vmadd.w $vr7, $vr0, $vr5 + vmadd.w $vr7, $vr1, $vr6 + vsra.w $vr5, $vr7, $vr13 + ld.d $a0, $a0, 0 + vadd.w $vr5, $vr5, $vr2 + vmaxi.w $vr5, $vr5, 0 + vmin.w $vr5, $vr5, $vr4 + vinsgr2vr.d $vr6, $a0, 0 + vilvl.h $vr6, $vr3, $vr6 + vsub.w $vr5, $vr6, $vr5 + vmul.w $vr5, $vr5, $vr5 + vhaddw.d.w $vr5, $vr5, $vr5 + vhaddw.q.d $vr5, $vr5, $vr5 + vpickve2gr.d $a0, $vr5, 0 + addi.w $t8, $t8, 4 + add.d $s3, $a0, $s3 + move $t7, $t3 + move $t6, $t4 + move $a0, $t5 + blt $t8, $s6, .LBB16_9 +.LBB16_10: # %._crit_edge # in Loop: Header=BB16_2 Depth=1 - st.d $t7, $fp, %pc_lo12(ref1_line) - st.d $s0, $s2, %pc_lo12(ref2_line) - st.d $t8, $s4, %pc_lo12(src_line) - move $s4, $t8 - move $a0, $t7 - move $ra, $s0 -.LBB16_10: # in Loop: Header=BB16_2 Depth=1 - addi.w $a1, $s3, 0 - ld.d $t2, $sp, 240 # 8-byte Folded Reload - bge $a1, $t2, .LBB16_30 -# %bb.11: # in Loop: Header=BB16_2 Depth=1 - alsl.d $ra, $a4, $ra, 1 - st.d $ra, $s2, %pc_lo12(ref2_line) - alsl.d $a0, $a4, $a0, 1 - addi.w $a5, $a5, 1 - st.d $a0, $fp, %pc_lo12(ref1_line) - bne $a5, $s5, .LBB16_2 - b .LBB16_13 -.LBB16_12: + st.d $t3, $s1, %pc_lo12(ref1_line) + st.d $t4, $s2, %pc_lo12(ref2_line) + st.d $t5, $fp, %pc_lo12(src_line) + move $t7, $t5 + move $a0, $t3 + move $t6, $t4 +.LBB16_11: # in Loop: Header=BB16_2 Depth=1 + addi.w $t3, $s3, 0 + bge $t3, $ra, .LBB16_31 +# %bb.12: # in Loop: Header=BB16_2 Depth=1 + alsl.d $t6, $a2, $t6, 1 + st.d $t6, $s2, %pc_lo12(ref2_line) + alsl.d $a0, $a2, $a0, 1 + addi.w $a3, $a3, 1 + st.d $a0, $s1, %pc_lo12(ref1_line) + bne $a3, $s5, .LBB16_2 + b .LBB16_14 +.LBB16_13: move $s3, $zero -.LBB16_13: # %._crit_edge132 +.LBB16_14: # %._crit_edge132 pcalau12i $a0, %pc_hi20(ChromaMEEnable) ld.w $a0, $a0, %pc_lo12(ChromaMEEnable) - beqz $a0, .LBB16_30 -# %bb.14: + beqz $a0, .LBB16_31 +# %bb.15: pcalau12i $a0, %pc_hi20(shift_cr_x) ld.wu $a0, $a0, %pc_lo12(shift_cr_x) - pcalau12i $a1, %pc_hi20(shift_cr_y) - ld.wu $a1, $a1, %pc_lo12(shift_cr_y) - pcalau12i $a2, %pc_hi20(img_cr_padded_size_x) - ld.w $a2, $a2, %pc_lo12(img_cr_padded_size_x) + pcalau12i $a2, %pc_hi20(shift_cr_y) + ld.wu $a2, $a2, %pc_lo12(shift_cr_y) + pcalau12i $a3, %pc_hi20(img_cr_padded_size_x) + ld.w $a3, $a3, %pc_lo12(img_cr_padded_size_x) vrepli.b $vr11, 0 sra.w $s6, $s6, $a0 - sra.w $s5, $s5, $a1 - sub.w $s1, $a2, $s6 + sra.w $s5, $s5, $a2 + sub.w $fp, $a3, $s6 bstrpick.d $s4, $s6, 31, 0 bstrpick.d $a0, $s6, 30, 3 - slli.d $a1, $a0, 3 - st.d $a1, $sp, 224 # 8-byte Folded Spill + slli.d $a2, $a0, 3 + st.d $a2, $sp, 232 # 8-byte Folded Spill slli.d $a0, $a0, 5 st.d $a0, $sp, 208 # 8-byte Folded Spill - ori $a1, $zero, 1 + ori $a2, $zero, 1 pcalau12i $a0, %pc_hi20(get_crline) addi.d $a0, $a0, %pc_lo12(get_crline) st.d $a0, $sp, 120 # 8-byte Folded Spill @@ -8068,27 +8025,25 @@ computeBiPredSSE2: # @computeBiPredSSE2 addi.d $a0, $a0, %pc_lo12(offsetBi_cr) st.d $a0, $sp, 16 # 8-byte Folded Spill move $s0, $zero - lu12i.w $a0, 3 - ori $a0, $a0, 3236 - st.d $a0, $sp, 216 # 8-byte Folded Spill - ld.d $t8, $sp, 232 # 8-byte Folded Reload + ori $a0, $a1, 3236 + st.d $a0, $sp, 224 # 8-byte Folded Spill vst $vr12, $sp, 96 # 16-byte Folded Spill vst $vr13, $sp, 80 # 16-byte Folded Spill vst $vr11, $sp, 64 # 16-byte Folded Spill st.d $s4, $sp, 56 # 8-byte Folded Spill - st.d $s1, $sp, 48 # 8-byte Folded Spill -.LBB16_15: # =>This Loop Header: Depth=1 - # Child Loop BB16_17 Depth 2 - # Child Loop BB16_21 Depth 3 - # Child Loop BB16_24 Depth 3 - st.d $a1, $sp, 128 # 8-byte Folded Spill + st.d $fp, $sp, 48 # 8-byte Folded Spill +.LBB16_16: # =>This Loop Header: Depth=1 + # Child Loop BB16_18 Depth 2 + # Child Loop BB16_22 Depth 3 + # Child Loop BB16_25 Depth 3 + st.d $a2, $sp, 128 # 8-byte Folded Spill ld.d $a0, $sp, 160 # 8-byte Folded Reload ld.w $a0, $a0, %pc_lo12(bipred2_access_method) slli.d $a0, $a0, 3 - move $s1, $s2 - move $s2, $fp - ld.d $fp, $sp, 120 # 8-byte Folded Reload - ldx.d $a3, $fp, $a0 + move $fp, $s2 + move $s2, $s1 + ld.d $s1, $sp, 120 # 8-byte Folded Reload + ldx.d $a3, $s1, $a0 ld.d $a0, $sp, 152 # 8-byte Folded Reload alsl.d $a0, $s0, $a0, 3 ld.d $a0, $a0, 8 @@ -8096,30 +8051,30 @@ computeBiPredSSE2: # @computeBiPredSSE2 sll.w $a1, $a1, $s0 ld.d $a2, $sp, 168 # 8-byte Folded Reload alsl.d $a1, $a1, $a2, 1 - st.d $a1, $t8, %pc_lo12(src_line) + ld.d $s4, $sp, 240 # 8-byte Folded Reload + st.d $a1, $s4, %pc_lo12(src_line) ld.d $a1, $sp, 200 # 8-byte Folded Reload ld.d $a2, $sp, 192 # 8-byte Folded Reload - move $s4, $t8 jirl $ra, $a3, 0 ld.d $a1, $sp, 144 # 8-byte Folded Reload ld.w $a1, $a1, %pc_lo12(bipred1_access_method) slli.d $a1, $a1, 3 - ldx.d $a3, $fp, $a1 - move $fp, $s2 - move $s2, $s1 + ldx.d $a3, $s1, $a1 + move $s1, $s2 + move $s2, $fp ld.d $a1, $sp, 136 # 8-byte Folded Reload alsl.d $a1, $s0, $a1, 3 ld.d $a1, $a1, 8 - st.d $a0, $s1, %pc_lo12(ref2_line) + st.d $a0, $fp, %pc_lo12(ref2_line) move $a0, $a1 ld.d $a1, $sp, 184 # 8-byte Folded Reload ld.d $a2, $sp, 176 # 8-byte Folded Reload jirl $ra, $a3, 0 - st.d $a0, $fp, %pc_lo12(ref1_line) + st.d $a0, $s1, %pc_lo12(ref1_line) move $a6, $s5 - blez $s5, .LBB16_28 -# %bb.16: # %.preheader.lr.ph - # in Loop: Header=BB16_15 Depth=1 + blez $s5, .LBB16_29 +# %bb.17: # %.preheader.lr.ph + # in Loop: Header=BB16_16 Depth=1 move $a1, $zero ld.d $t6, $s4, %pc_lo12(src_line) ld.d $t5, $s2, %pc_lo12(ref2_line) @@ -8131,38 +8086,39 @@ computeBiPredSSE2: # @computeBiPredSSE2 alsl.d $a4, $s0, $a4, 1 ld.d $a5, $sp, 16 # 8-byte Folded Reload alsl.d $a5, $s0, $a5, 1 - move $t8, $s4 + ld.d $t8, $sp, 216 # 8-byte Folded Reload + move $s0, $s4 vld $vr12, $sp, 96 # 16-byte Folded Reload vld $vr13, $sp, 80 # 16-byte Folded Reload vld $vr11, $sp, 64 # 16-byte Folded Reload ld.d $s4, $sp, 56 # 8-byte Folded Reload move $s5, $a6 - ld.d $s1, $sp, 48 # 8-byte Folded Reload + ld.d $fp, $sp, 48 # 8-byte Folded Reload .p2align 4, , 16 -.LBB16_17: # %.preheader - # Parent Loop BB16_15 Depth=1 +.LBB16_18: # %.preheader + # Parent Loop BB16_16 Depth=1 # => This Loop Header: Depth=2 - # Child Loop BB16_21 Depth 3 - # Child Loop BB16_24 Depth 3 - blez $s6, .LBB16_26 -# %bb.18: # %.lr.ph136 - # in Loop: Header=BB16_17 Depth=2 + # Child Loop BB16_22 Depth 3 + # Child Loop BB16_25 Depth 3 + blez $s6, .LBB16_27 +# %bb.19: # %.lr.ph136 + # in Loop: Header=BB16_18 Depth=2 ld.h $a6, $a3, 0 ld.h $a7, $a4, 0 - ld.d $t0, $sp, 216 # 8-byte Folded Reload + ld.d $t0, $sp, 224 # 8-byte Folded Reload ldx.w $t0, $a2, $t0 ld.h $t1, $a5, 0 ori $t2, $zero, 8 - bgeu $s6, $t2, .LBB16_20 -# %bb.19: # in Loop: Header=BB16_17 Depth=2 + bgeu $s6, $t2, .LBB16_21 +# %bb.20: # in Loop: Header=BB16_18 Depth=2 move $t7, $zero move $t2, $a0 move $t4, $t5 move $t3, $t6 - b .LBB16_23 + b .LBB16_24 .p2align 4, , 16 -.LBB16_20: # %vector.ph242 - # in Loop: Header=BB16_17 Depth=2 +.LBB16_21: # %vector.ph242 + # in Loop: Header=BB16_18 Depth=2 ld.d $t3, $sp, 208 # 8-byte Folded Reload add.d $t2, $a0, $t3 add.d $t4, $t5, $t3 @@ -8176,12 +8132,12 @@ computeBiPredSSE2: # @computeBiPredSSE2 addi.d $a0, $a0, 16 addi.d $t5, $t5, 16 addi.d $t6, $t6, 16 - ld.d $t7, $sp, 224 # 8-byte Folded Reload + ld.d $t7, $sp, 232 # 8-byte Folded Reload vori.b $vr5, $vr11, 0 .p2align 4, , 16 -.LBB16_21: # %vector.body258 - # Parent Loop BB16_15 Depth=1 - # Parent Loop BB16_17 Depth=2 +.LBB16_22: # %vector.body258 + # Parent Loop BB16_16 Depth=1 + # Parent Loop BB16_18 Depth=2 # => This Inner Loop Header: Depth=3 ld.h $t8, $a0, -16 ld.h $s0, $a0, -12 @@ -8331,24 +8287,25 @@ computeBiPredSSE2: # @computeBiPredSSE2 addi.d $a0, $a0, 32 addi.d $t5, $t5, 32 addi.d $t6, $t6, 32 - bnez $t7, .LBB16_21 -# %bb.22: # %middle.block290 - # in Loop: Header=BB16_17 Depth=2 + bnez $t7, .LBB16_22 +# %bb.23: # %middle.block290 + # in Loop: Header=BB16_18 Depth=2 vadd.w $vr0, $vr5, $vr0 vhaddw.d.w $vr0, $vr0, $vr0 vhaddw.q.d $vr0, $vr0, $vr0 vpickve2gr.d $s3, $vr0, 0 - ld.d $a0, $sp, 224 # 8-byte Folded Reload + ld.d $a0, $sp, 232 # 8-byte Folded Reload move $t7, $a0 - ld.d $t8, $sp, 232 # 8-byte Folded Reload - beq $a0, $s4, .LBB16_25 -.LBB16_23: # %scalar.ph240.preheader - # in Loop: Header=BB16_17 Depth=2 + ld.d $t8, $sp, 216 # 8-byte Folded Reload + ld.d $s0, $sp, 240 # 8-byte Folded Reload + beq $a0, $s4, .LBB16_26 +.LBB16_24: # %scalar.ph240.preheader + # in Loop: Header=BB16_18 Depth=2 sub.d $a0, $s6, $t7 .p2align 4, , 16 -.LBB16_24: # %scalar.ph240 - # Parent Loop BB16_15 Depth=1 - # Parent Loop BB16_17 Depth=2 +.LBB16_25: # %scalar.ph240 + # Parent Loop BB16_16 Depth=1 + # Parent Loop BB16_18 Depth=2 # => This Inner Loop Header: Depth=3 ld.hu $t5, $t2, 0 ld.hu $t6, $t4, 0 @@ -8390,39 +8347,37 @@ computeBiPredSSE2: # @computeBiPredSSE2 mul.d $t6, $t6, $t6 addi.w $a0, $a0, -1 add.d $s3, $t5, $t6 - bnez $a0, .LBB16_24 -.LBB16_25: # %._crit_edge137 - # in Loop: Header=BB16_17 Depth=2 - st.d $t2, $fp, %pc_lo12(ref1_line) + bnez $a0, .LBB16_25 +.LBB16_26: # %._crit_edge137 + # in Loop: Header=BB16_18 Depth=2 + st.d $t2, $s1, %pc_lo12(ref1_line) st.d $t4, $s2, %pc_lo12(ref2_line) - st.d $t3, $t8, %pc_lo12(src_line) + st.d $t3, $s0, %pc_lo12(src_line) move $t6, $t3 move $a0, $t2 move $t5, $t4 -.LBB16_26: # in Loop: Header=BB16_17 Depth=2 +.LBB16_27: # in Loop: Header=BB16_18 Depth=2 addi.w $a6, $s3, 0 - ld.d $a7, $sp, 240 # 8-byte Folded Reload - bge $a6, $a7, .LBB16_30 -# %bb.27: # in Loop: Header=BB16_17 Depth=2 - alsl.d $t5, $s1, $t5, 1 + bge $a6, $t8, .LBB16_31 +# %bb.28: # in Loop: Header=BB16_18 Depth=2 + alsl.d $t5, $fp, $t5, 1 st.d $t5, $s2, %pc_lo12(ref2_line) - alsl.d $a0, $s1, $a0, 1 + alsl.d $a0, $fp, $a0, 1 addi.w $a1, $a1, 1 - st.d $a0, $fp, %pc_lo12(ref1_line) - bne $a1, $s5, .LBB16_17 - b .LBB16_29 + st.d $a0, $s1, %pc_lo12(ref1_line) + bne $a1, $s5, .LBB16_18 + b .LBB16_30 .p2align 4, , 16 -.LBB16_28: # in Loop: Header=BB16_15 Depth=1 - move $t8, $s4 +.LBB16_29: # in Loop: Header=BB16_16 Depth=1 move $s5, $a6 -.LBB16_29: # %._crit_edge156 - # in Loop: Header=BB16_15 Depth=1 +.LBB16_30: # %._crit_edge156 + # in Loop: Header=BB16_16 Depth=1 ld.d $a0, $sp, 128 # 8-byte Folded Reload andi $a0, $a0, 1 ori $s0, $zero, 1 - move $a1, $zero - bnez $a0, .LBB16_15 -.LBB16_30: # %.loopexit + move $a2, $zero + bnez $a0, .LBB16_16 +.LBB16_31: # %.loopexit addi.w $a0, $s3, 0 ld.d $s8, $sp, 248 # 8-byte Folded Reload ld.d $s7, $sp, 256 # 8-byte Folded Reload diff --git a/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/me_epzs.s b/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/me_epzs.s index 4e966a7a..7f75e64e 100644 --- a/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/me_epzs.s +++ b/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/me_epzs.s @@ -379,6 +379,21 @@ EPZSWindowPredictorInit: # @EPZSWindowPredictorInit .section .rodata.cst16,"aM",@progbits,16 .p2align 4, 0x0 # -- Begin function EPZSInit .LCPI6_0: + .word 8 # 0x8 + .word 7 # 0x7 + .word 7 # 0x7 + .word 6 # 0x6 +.LCPI6_1: + .word 768 # 0x300 + .word 384 # 0x180 + .word 384 # 0x180 + .word 192 # 0xc0 +.LCPI6_2: + .word 6 # 0x6 + .word 5 # 0x5 + .word 5 # 0x5 + .word 4 # 0x4 +.LCPI6_3: .word 0 # 0x0 .word 0 # 0x0 .word 3 # 0x3 @@ -466,62 +481,45 @@ EPZSInit: # @EPZSInit st.w $zero, $a7, 0 pcalau12i $t0, %pc_hi20(subthres) addi.d $t0, $t0, %pc_lo12(subthres) + pcalau12i $t1, %pc_hi20(.LCPI6_0) + vld $vr0, $t1, %pc_lo12(.LCPI6_0) st.w $zero, $t0, 0 - slli.d $t1, $a4, 8 + vreplgr2vr.w $vr1, $a4 + vsll.w $vr1, $vr1, $vr0 + vreplgr2vr.w $vr2, $a0 + pcalau12i $t1, %pc_hi20(.LCPI6_1) + vld $vr3, $t1, %pc_lo12(.LCPI6_1) + vsll.w $vr1, $vr1, $vr2 + vst $vr1, $a5, 4 + vreplgr2vr.w $vr1, $a3 + vmul.w $vr1, $vr1, $vr3 + pcalau12i $t1, %pc_hi20(.LCPI6_2) + vld $vr3, $t1, %pc_lo12(.LCPI6_2) + vsll.w $vr1, $vr1, $vr2 + vst $vr1, $a6, 4 + vreplgr2vr.w $vr1, $a2 + vsll.w $vr1, $vr1, $vr3 + vsll.w $vr1, $vr1, $vr2 + vst $vr1, $a7, 4 + vreplgr2vr.w $vr1, $a1 + vsll.w $vr0, $vr1, $vr0 + vsll.w $vr0, $vr0, $vr2 + vst $vr0, $t0, 4 + slli.d $t1, $a4, 5 sll.w $t1, $t1, $a0 - st.w $t1, $a5, 4 - alsl.d $t1, $a3, $a3, 1 - slli.d $t2, $t1, 8 - sll.w $t2, $t2, $a0 - st.w $t2, $a6, 4 - slli.d $t2, $a2, 6 - sll.w $t2, $t2, $a0 - st.w $t2, $a7, 4 - slli.d $t2, $a1, 8 + st.w $t1, $a5, 20 + alsl.d $t2, $a3, $a3, 1 + slli.d $t2, $t2, 5 sll.w $t2, $t2, $a0 - st.w $t2, $t0, 4 - slli.d $t2, $a4, 7 - sll.w $t2, $t2, $a0 - st.w $t2, $a5, 8 - slli.d $t3, $t1, 7 - sll.w $t3, $t3, $a0 - st.w $t3, $a6, 8 - slli.d $t4, $a2, 5 - sll.w $t4, $t4, $a0 - st.w $t4, $a7, 8 - slli.d $t5, $a1, 7 - sll.w $t5, $t5, $a0 - st.w $t5, $t0, 8 - st.w $t2, $a5, 12 - st.w $t3, $a6, 12 - st.w $t4, $a7, 12 - st.w $t5, $t0, 12 - slli.d $t2, $a4, 6 - sll.w $t2, $t2, $a0 - st.w $t2, $a5, 16 - slli.d $t2, $t1, 6 - sll.w $t2, $t2, $a0 - st.w $t2, $a6, 16 - slli.d $t2, $a2, 4 - sll.w $t2, $t2, $a0 - st.w $t2, $a7, 16 - slli.d $t2, $a1, 6 - sll.w $t2, $t2, $a0 - st.w $t2, $t0, 16 - slli.d $t2, $a4, 5 - sll.w $t2, $t2, $a0 - st.w $t2, $a5, 20 - slli.d $t1, $t1, 5 - sll.w $t1, $t1, $a0 - st.w $t1, $a6, 20 + st.w $t2, $a6, 20 slli.d $t3, $a2, 3 sll.w $t3, $t3, $a0 st.w $t3, $a7, 20 slli.d $t4, $a1, 5 sll.w $t4, $t4, $a0 st.w $t4, $t0, 20 - st.w $t2, $a5, 24 - st.w $t1, $a6, 24 + st.w $t1, $a5, 24 + st.w $t2, $a6, 24 st.w $t3, $a7, 24 st.w $t4, $t0, 24 slli.d $a4, $a4, 4 @@ -567,11 +565,11 @@ EPZSInit: # @EPZSInit srl.w $s4, $s5, $s0 st.w $s4, $a0, 4 ori $s3, $zero, 3 - ori $s6, $zero, 3 - pcalau12i $a1, %pc_hi20(.LCPI6_0) - vld $vr0, $a1, %pc_lo12(.LCPI6_0) - lu32i.d $s6, 3 - st.d $s6, $a0, 8 + ori $s7, $zero, 3 + pcalau12i $a1, %pc_hi20(.LCPI6_3) + vld $vr0, $a1, %pc_lo12(.LCPI6_3) + lu32i.d $s7, 3 + st.d $s7, $a0, 8 st.w $s4, $a0, 16 vst $vr0, $a0, 20 addi.d $a1, $zero, -4 @@ -581,7 +579,7 @@ EPZSInit: # @EPZSInit lu32i.d $a1, 3 st.d $a1, $a0, 40 st.w $s2, $a0, 48 - ori $s7, $zero, 0 + ori $s6, $zero, 0 ori $a1, $zero, 0 lu32i.d $a1, 2 st.d $a1, $a0, 52 @@ -639,15 +637,15 @@ EPZSInit: # @EPZSInit st.d $a2, $a0, 56 st.w $zero, $a0, 64 st.w $s2, $a0, 68 - st.d $s6, $a0, 72 + st.d $s7, $a0, 72 st.w $s2, $a0, 80 st.w $s2, $a0, 84 ori $a2, $zero, 3 lu32i.d $a2, 5 st.d $a2, $a0, 88 st.w $s2, $a0, 96 - lu32i.d $s7, 5 - st.d $s7, $a0, 100 + lu32i.d $s6, 5 + st.d $s6, $a0, 100 st.w $s3, $a0, 108 st.w $s2, $a0, 112 st.w $s4, $a0, 116 @@ -701,9 +699,9 @@ EPZSInit: # @EPZSInit st.d $a3, $a6, 8 bne $a1, $a2, .LBB6_11 # %bb.12: # %assignEPZSpattern.exit34 - ori $s6, $zero, 1 - lu32i.d $s6, 1 - st.d $s6, $fp, 16 + ori $s7, $zero, 1 + lu32i.d $s7, 1 + st.d $s7, $fp, 16 st.d $fp, $fp, 24 ori $a0, $zero, 1 ori $a1, $zero, 32 @@ -772,7 +770,7 @@ EPZSInit: # @EPZSInit st.w $a4, $a0, 116 lu32i.d $a2, 3 st.d $a2, $a0, 120 - st.d $s6, $fp, 16 + st.d $s7, $fp, 16 st.d $fp, $fp, 24 ori $a0, $zero, 1 ori $a1, $zero, 32 @@ -795,7 +793,7 @@ EPZSInit: # @EPZSInit jirl $ra, $ra, 0 move $a1, $zero st.d $a0, $fp, 8 - ld.d $s6, $s5, 0 + ld.d $s7, $s5, 0 pcalau12i $a2, %pc_hi20(sbdiamond) st.d $fp, $a2, %pc_lo12(sbdiamond) ori $a2, $zero, 192 @@ -814,11 +812,11 @@ EPZSInit: # @EPZSInit st.d $a3, $a6, 8 bne $a1, $a2, .LBB6_17 # %bb.18: # %assignEPZSpattern.exit44 - ori $s7, $zero, 0 + ori $s6, $zero, 0 ori $s4, $zero, 0 lu32i.d $s4, 1 st.d $s4, $fp, 16 - st.d $s6, $fp, 24 + st.d $s7, $fp, 24 ori $a0, $zero, 1 ori $a1, $zero, 32 pcaddu18i $ra, %call36(calloc) @@ -831,7 +829,7 @@ EPZSInit: # @EPZSInit pcaddu18i $ra, %call36(no_mem_exit) jirl $ra, $ra, 0 pcalau12i $a0, %pc_hi20(sdiamond) - ld.d $s6, $a0, %pc_lo12(sdiamond) + ld.d $s7, $a0, %pc_lo12(sdiamond) ld.w $s0, $s2, %pc_lo12(mv_rescale) .LBB6_20: # %.lr.ph.i46 ori $s1, $zero, 8 @@ -879,15 +877,15 @@ EPZSInit: # @EPZSInit lu32i.d $a3, 3 st.d $a3, $a0, 88 st.w $a7, $a0, 96 - lu32i.d $s7, 4 - st.d $s7, $a0, 100 + lu32i.d $s6, 4 + st.d $s6, $a0, 100 st.w $a1, $a0, 108 st.w $a5, $a0, 112 st.w $a4, $a0, 116 lu32i.d $a2, 3 st.d $a2, $a0, 120 st.d $s4, $fp, 16 - st.d $s6, $fp, 24 + st.d $s7, $fp, 24 slli.w $fp, $s8, 3 ori $a0, $zero, 1 ori $a1, $zero, 32 diff --git a/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/rdopt.s b/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/rdopt.s index f7ad59dc..d616c549 100644 --- a/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/rdopt.s +++ b/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/rdopt.s @@ -602,16 +602,15 @@ CheckReliabilityOfRef: # @CheckReliabilityOfRef masknez $a6, $a6, $t0 maskeqz $a0, $a0, $t0 or $a0, $a0, $a6 - ori $t7, $zero, 4 - masknez $t3, $t7, $t1 + ori $s8, $zero, 4 + masknez $t3, $s8, $t1 ori $a6, $zero, 2 maskeqz $t1, $a6, $t1 or $t1, $t1, $t3 masknez $t1, $t1, $t0 maskeqz $t3, $a6, $t0 or $t1, $t3, $t1 - add.w $t1, $a0, $t1 - st.d $t1, $sp, 80 # 8-byte Folded Spill + add.w $ra, $a0, $t1 andi $t1, $t2, 2 addi.d $t3, $a3, -3 sltui $t3, $t3, 1 @@ -620,26 +619,27 @@ CheckReliabilityOfRef: # @CheckReliabilityOfRef maskeqz $t0, $t1, $t0 or $t0, $t0, $t2 slt $t1, $a6, $a3 - masknez $t2, $t7, $t1 + masknez $t2, $s8, $t1 maskeqz $t1, $a6, $t1 or $t1, $t1, $t2 - add.w $t1, $t0, $t1 - st.d $t1, $sp, 96 # 8-byte Folded Spill + add.w $t7, $t0, $t1 ldptr.d $t1, $a7, 14384 ld.w $t2, $a7, 172 - ld.w $s7, $a7, 168 + ld.w $s1, $a7, 168 pcalau12i $a7, %pc_hi20(pixel_map) ld.d $t5, $a7, %pc_lo12(pixel_map) addi.w $t6, $a0, 0 - slli.d $s1, $a1, 3 + vinsgr2vr.w $vr0, $a5, 0 + vinsgr2vr.w $vr0, $a5, 1 + slli.d $a1, $a1, 3 slli.d $s2, $a2, 3 - slli.d $a3, $a3, 3 + slli.d $s0, $a3, 3 addi.w $t8, $zero, -1 addi.w $fp, $zero, -2 - ori $s0, $zero, 1 - st.d $t1, $sp, 72 # 8-byte Folded Spill - st.d $t2, $sp, 64 # 8-byte Folded Spill - st.d $a3, $sp, 88 # 8-byte Folded Spill + ori $a3, $zero, 1 + vrepli.d $vr1, -2 + st.d $t1, $sp, 96 # 8-byte Folded Spill + st.d $t2, $sp, 88 # 8-byte Folded Spill .LBB3_1: # %.preheader230 # =>This Loop Header: Depth=1 # Child Loop BB3_2 Depth 2 @@ -653,12 +653,12 @@ CheckReliabilityOfRef: # @CheckReliabilityOfRef # Child Loop BB3_64 Depth 3 # Child Loop BB3_65 Depth 4 slli.d $a0, $t6, 3 - ldx.d $s8, $t1, $a0 + ldx.d $s7, $t1, $a0 add.d $a0, $t2, $t6 - slli.d $ra, $a0, 4 + slli.d $t3, $a0, 4 move $s3, $t0 - st.d $s8, $sp, 112 # 8-byte Folded Spill - st.d $ra, $sp, 104 # 8-byte Folded Spill + st.d $s7, $sp, 112 # 8-byte Folded Spill + st.d $t3, $sp, 104 # 8-byte Folded Spill .LBB3_2: # Parent Loop BB3_1 Depth=1 # => This Loop Header: Depth=2 # Child Loop BB3_141 Depth 3 @@ -671,424 +671,411 @@ CheckReliabilityOfRef: # @CheckReliabilityOfRef # Child Loop BB3_64 Depth 3 # Child Loop BB3_65 Depth 4 slli.d $a0, $s3, 3 - ldx.d $a0, $s8, $a0 - ldx.d $a0, $a0, $s1 + ldx.d $a0, $s7, $a0 + ldx.d $a0, $a0, $a1 ldx.d $a0, $a0, $s2 - ldx.d $a0, $a0, $a3 - ld.hu $a1, $a0, 2 - ld.h $a7, $a0, 0 - bstrpick.d $a0, $a1, 15, 2 + ldx.d $a0, $a0, $s0 + ld.hu $a7, $a0, 2 + ld.h $t1, $a0, 0 + bstrpick.d $a0, $a7, 15, 2 slli.d $a0, $a0, 2 ext.w.h $a0, $a0 - add.w $a0, $ra, $a0 + add.w $a0, $t3, $a0 srai.d $s4, $a0, 2 - bstrpick.d $a0, $a7, 15, 2 + bstrpick.d $a0, $t1, 15, 2 slli.d $a0, $a0, 2 ext.w.h $a0, $a0 - add.d $t1, $s7, $s3 - alsl.w $t1, $t1, $a0, 4 - andi $a0, $a1, 3 - andi $s6, $a7, 3 - or $a1, $a0, $s6 - srai.d $s5, $t1, 2 - bnez $a1, .LBB3_20 + add.d $t2, $s1, $s3 + alsl.w $t2, $t2, $a0, 4 + andi $a0, $a7, 3 + andi $s6, $t1, 3 + or $a7, $a0, $s6 + srai.d $t4, $t2, 2 + bnez $a7, .LBB3_20 # %bb.3: # %.preheader218 # in Loop: Header=BB3_2 Depth=2 - srai.d $a0, $s5, 63 - andn $a0, $s5, $a0 - srai.d $a1, $s4, 63 - andn $a1, $s4, $a1 - slt $a7, $a1, $a5 - maskeqz $a1, $a1, $a7 - masknez $a7, $a5, $a7 - or $a1, $a1, $a7 - slt $a7, $a0, $a4 - slli.d $a1, $a1, 3 - ldx.d $t3, $t5, $a1 - maskeqz $a0, $a0, $a7 - masknez $a1, $a4, $a7 - or $a0, $a0, $a1 - ldx.bu $a1, $t3, $a0 - bge $a2, $a1, .LBB3_158 + srai.d $a0, $t4, 63 + andn $a0, $t4, $a0 + srai.d $a7, $s4, 63 + andn $a7, $s4, $a7 + slt $t1, $a7, $a5 + maskeqz $a7, $a7, $t1 + masknez $t1, $a5, $t1 + or $a7, $a7, $t1 + slt $t1, $a0, $a4 + slli.d $a7, $a7, 3 + ldx.d $s5, $t5, $a7 + maskeqz $a0, $a0, $t1 + masknez $a7, $a4, $t1 + or $a7, $a0, $a7 + ldx.bu $a0, $s5, $a7 + bge $a2, $a0, .LBB3_158 # %bb.4: # in Loop: Header=BB3_2 Depth=2 - slt $a1, $t8, $s5 - masknez $a7, $t8, $a1 - maskeqz $a1, $s5, $a1 - or $a1, $a1, $a7 - addi.d $a1, $a1, 1 - slt $a7, $a1, $a4 - maskeqz $a1, $a1, $a7 - masknez $a7, $a4, $a7 - or $a7, $a1, $a7 - ldx.bu $a1, $t3, $a7 - bge $a2, $a1, .LBB3_158 -# %bb.5: # in Loop: Header=BB3_2 Depth=2 - slt $a1, $fp, $s5 - masknez $t1, $fp, $a1 - maskeqz $a1, $s5, $a1 - or $a1, $a1, $t1 - addi.d $a1, $a1, 2 - slt $t1, $a1, $a4 - maskeqz $a1, $a1, $t1 + slt $a0, $t8, $t4 + masknez $t1, $t8, $a0 + maskeqz $a0, $t4, $a0 + or $a0, $a0, $t1 + addi.d $a0, $a0, 1 + slt $t1, $a0, $a4 + maskeqz $a0, $a0, $t1 masknez $t1, $a4, $t1 - or $t1, $a1, $t1 - ldx.bu $a1, $t3, $t1 - bge $a2, $a1, .LBB3_158 + or $a0, $a0, $t1 + ldx.bu $t1, $s5, $a0 + bge $a2, $t1, .LBB3_158 +# %bb.5: # in Loop: Header=BB3_2 Depth=2 + slt $t1, $fp, $t4 + masknez $t2, $fp, $t1 + maskeqz $t1, $t4, $t1 + or $t1, $t1, $t2 + addi.d $t1, $t1, 2 + slt $t2, $t1, $a4 + maskeqz $t1, $t1, $t2 + masknez $t2, $a4, $t2 + or $t1, $t1, $t2 + ldx.bu $t2, $s5, $t1 + bge $a2, $t2, .LBB3_158 # %bb.6: # in Loop: Header=BB3_2 Depth=2 - addi.w $a1, $zero, -3 - slt $t2, $a1, $s5 - masknez $t4, $a1, $t2 - maskeqz $t2, $s5, $t2 - or $t2, $t2, $t4 + addi.w $t3, $zero, -3 + slt $t2, $t3, $t4 + masknez $s6, $t3, $t2 + maskeqz $t2, $t4, $t2 + or $t2, $t2, $s6 addi.d $t2, $t2, 3 slt $t4, $t2, $a4 maskeqz $t2, $t2, $t4 masknez $t4, $a4, $t4 or $t2, $t2, $t4 - ldx.bu $t3, $t3, $t2 - bge $a2, $t3, .LBB3_158 + ldx.bu $t4, $s5, $t2 + bge $a2, $t4, .LBB3_158 # %bb.7: # %.preheader212.1 # in Loop: Header=BB3_2 Depth=2 - slt $t3, $t8, $s4 - masknez $t4, $t8, $t3 - maskeqz $t3, $s4, $t3 - or $t3, $t3, $t4 - addi.d $t3, $t3, 1 - slt $t4, $t3, $a5 - maskeqz $t3, $t3, $t4 - masknez $t4, $a5, $t4 - or $t3, $t3, $t4 - slli.d $t3, $t3, 3 - ldx.d $t3, $t5, $t3 - ldx.bu $t4, $t3, $a0 - bge $a2, $t4, .LBB3_158 + slt $t4, $t8, $s4 + masknez $s5, $t8, $t4 + maskeqz $t4, $s4, $t4 + or $t4, $t4, $s5 + addi.d $t4, $t4, 1 + slt $s5, $t4, $a5 + maskeqz $t4, $t4, $s5 + masknez $s5, $a5, $s5 + or $t4, $t4, $s5 + slli.d $t4, $t4, 3 + ldx.d $t4, $t5, $t4 + ldx.bu $s5, $t4, $a7 + bge $a2, $s5, .LBB3_158 # %bb.8: # in Loop: Header=BB3_2 Depth=2 - ldx.bu $t4, $t3, $a7 - bge $a2, $t4, .LBB3_158 + ldx.bu $s5, $t4, $a0 + bge $a2, $s5, .LBB3_158 # %bb.9: # in Loop: Header=BB3_2 Depth=2 - ldx.bu $t4, $t3, $t1 - bge $a2, $t4, .LBB3_158 + ldx.bu $s5, $t4, $t1 + bge $a2, $s5, .LBB3_158 # %bb.10: # in Loop: Header=BB3_2 Depth=2 - ldx.bu $t3, $t3, $t2 - bge $a2, $t3, .LBB3_158 + ldx.bu $t4, $t4, $t2 + bge $a2, $t4, .LBB3_158 # %bb.11: # %.preheader212.2 # in Loop: Header=BB3_2 Depth=2 - slt $t3, $fp, $s4 - masknez $t4, $fp, $t3 - maskeqz $t3, $s4, $t3 - or $t3, $t3, $t4 - addi.d $t3, $t3, 2 + slt $t4, $fp, $s4 + masknez $s5, $fp, $t4 + maskeqz $t4, $s4, $t4 + or $t4, $t4, $s5 + addi.d $t4, $t4, 2 + slt $s5, $t4, $a5 + maskeqz $t4, $t4, $s5 + masknez $s5, $a5, $s5 + or $t4, $t4, $s5 + slli.d $t4, $t4, 3 + ldx.d $t4, $t5, $t4 + ldx.bu $s5, $t4, $a7 + bge $a2, $s5, .LBB3_158 +# %bb.12: # in Loop: Header=BB3_2 Depth=2 + ldx.bu $s5, $t4, $a0 + bge $a2, $s5, .LBB3_158 +# %bb.13: # in Loop: Header=BB3_2 Depth=2 + ldx.bu $s5, $t4, $t1 + bge $a2, $s5, .LBB3_158 +# %bb.14: # in Loop: Header=BB3_2 Depth=2 + ldx.bu $t4, $t4, $t2 + bge $a2, $t4, .LBB3_158 +# %bb.15: # %.preheader212.3 + # in Loop: Header=BB3_2 Depth=2 + slt $t4, $t3, $s4 + masknez $t3, $t3, $t4 + maskeqz $t4, $s4, $t4 + or $t3, $t4, $t3 + addi.d $t3, $t3, 3 slt $t4, $t3, $a5 maskeqz $t3, $t3, $t4 masknez $t4, $a5, $t4 or $t3, $t3, $t4 slli.d $t3, $t3, 3 ldx.d $t3, $t5, $t3 - ldx.bu $t4, $t3, $a0 - bge $a2, $t4, .LBB3_158 -# %bb.12: # in Loop: Header=BB3_2 Depth=2 - ldx.bu $t4, $t3, $a7 - bge $a2, $t4, .LBB3_158 -# %bb.13: # in Loop: Header=BB3_2 Depth=2 - ldx.bu $t4, $t3, $t1 - bge $a2, $t4, .LBB3_158 -# %bb.14: # in Loop: Header=BB3_2 Depth=2 - ldx.bu $t3, $t3, $t2 - bge $a2, $t3, .LBB3_158 -# %bb.15: # %.preheader212.3 - # in Loop: Header=BB3_2 Depth=2 - slt $t3, $a1, $s4 - masknez $a1, $a1, $t3 - maskeqz $t3, $s4, $t3 - or $a1, $t3, $a1 - addi.d $a1, $a1, 3 - slt $t3, $a1, $a5 - maskeqz $a1, $a1, $t3 - masknez $t3, $a5, $t3 - or $a1, $a1, $t3 - slli.d $a1, $a1, 3 - ldx.d $a1, $t5, $a1 - ldx.bu $a0, $a1, $a0 - bge $a2, $a0, .LBB3_158 + ldx.bu $a7, $t3, $a7 + bge $a2, $a7, .LBB3_158 # %bb.16: # in Loop: Header=BB3_2 Depth=2 - ldx.bu $a0, $a1, $a7 + ldx.bu $a0, $t3, $a0 bge $a2, $a0, .LBB3_158 # %bb.17: # in Loop: Header=BB3_2 Depth=2 - ldx.bu $a0, $a1, $t1 + ldx.bu $a0, $t3, $t1 bge $a2, $a0, .LBB3_158 # %bb.18: # in Loop: Header=BB3_2 Depth=2 - move $a3, $s7 - ldx.bu $a0, $a1, $t2 + ldx.bu $a0, $t3, $t2 bge $a2, $a0, .LBB3_158 .LBB3_19: # %.loopexit219 # in Loop: Header=BB3_2 Depth=2 addi.d $s3, $s3, 1 - ld.d $a0, $sp, 96 # 8-byte Folded Reload - move $s7, $a3 - ld.d $a3, $sp, 88 # 8-byte Folded Reload - ld.d $s8, $sp, 112 # 8-byte Folded Reload - ld.d $ra, $sp, 104 # 8-byte Folded Reload - blt $s3, $a0, .LBB3_2 + ld.d $s7, $sp, 112 # 8-byte Folded Reload + ld.d $t3, $sp, 104 # 8-byte Folded Reload + blt $s3, $t7, .LBB3_2 b .LBB3_156 .p2align 4, , 16 .LBB3_20: # in Loop: Header=BB3_2 Depth=2 - move $a3, $s7 beqz $a0, .LBB3_63 # %bb.21: # in Loop: Header=BB3_2 Depth=2 beqz $s6, .LBB3_73 # %bb.22: # in Loop: Header=BB3_2 Depth=2 + st.d $ra, $sp, 72 # 8-byte Folded Spill bne $s6, $a6, .LBB3_99 # %bb.23: # %.preheader224 # in Loop: Header=BB3_2 Depth=2 addi.d $a0, $s4, 3 - st.d $a0, $sp, 16 # 8-byte Folded Spill + st.d $a0, $sp, 32 # 8-byte Folded Spill .LBB3_24: # %.preheader215 # Parent Loop BB3_1 Depth=1 # Parent Loop BB3_2 Depth=2 # => This Loop Header: Depth=3 # Child Loop BB3_25 Depth 4 + move $a3, $a1 move $s8, $zero addi.w $s7, $s4, 0 - slt $a0, $a6, $s7 - masknez $a1, $a6, $a0 - maskeqz $a0, $s7, $a0 - or $a0, $a0, $a1 - addi.w $a0, $a0, -2 + vinsgr2vr.w $vr2, $s4, 0 + vinsgr2vr.w $vr2, $s4, 1 + ori $a0, $zero, 2 + lu32i.d $a0, 1 + vreplgr2vr.d $vr3, $a0 + vmax.w $vr3, $vr2, $vr3 + vadd.w $vr3, $vr3, $vr1 + vmin.w $vr3, $vr3, $vr0 + vpickve2gr.w $a0, $vr3, 0 + slli.d $a0, $a0, 3 + ldx.d $ra, $t5, $a0 + vpickve2gr.w $a0, $vr3, 1 + alsl.d $a0, $a0, $t5, 3 + st.d $a0, $sp, 80 # 8-byte Folded Spill + srai.d $a0, $s7, 63 + andn $a0, $s7, $a0 slt $a1, $a0, $a5 maskeqz $a0, $a0, $a1 masknez $a1, $a5, $a1 or $a0, $a0, $a1 - slt $a1, $s0, $s7 - masknez $a7, $s0, $a1 - maskeqz $a1, $s7, $a1 - or $a1, $a1, $a7 - addi.d $a1, $a1, -1 - slt $a7, $a1, $a5 - maskeqz $a1, $a1, $a7 - masknez $a7, $a5, $a7 - or $a1, $a1, $a7 - srai.d $a7, $s7, 63 - andn $a7, $s7, $a7 - slt $t1, $a7, $a5 - maskeqz $a7, $a7, $t1 - masknez $t1, $a5, $t1 - or $a7, $a7, $t1 + alsl.d $a0, $a0, $t5, 3 + st.d $a0, $sp, 64 # 8-byte Folded Spill addi.w $s4, $s4, 1 - srai.d $t1, $s4, 63 - andn $t1, $s4, $t1 - slt $t2, $t1, $a5 - maskeqz $t1, $t1, $t2 - masknez $t2, $a5, $t2 - or $t1, $t1, $t2 - slt $t2, $fp, $s7 - masknez $t3, $fp, $t2 - maskeqz $t2, $s7, $t2 - or $t2, $t2, $t3 - addi.w $t2, $t2, 2 - slt $t3, $t2, $a5 - maskeqz $t2, $t2, $t3 - masknez $t3, $a5, $t3 - or $t2, $t2, $t3 - addi.w $ra, $zero, -3 - slt $t3, $ra, $s7 - masknez $t4, $ra, $t3 - maskeqz $t3, $s7, $t3 - or $t3, $t3, $t4 - addi.w $t3, $t3, 3 - slt $t4, $t3, $a5 - maskeqz $t3, $t3, $t4 - masknez $t4, $a5, $t4 - or $t3, $t3, $t4 - slli.d $a0, $a0, 3 - ldx.d $s6, $t5, $a0 - alsl.d $a0, $a1, $t5, 3 + srai.d $a0, $s4, 63 + andn $a0, $s4, $a0 + slt $a1, $a0, $a5 + maskeqz $a0, $a0, $a1 + masknez $a1, $a5, $a1 + or $a0, $a0, $a1 + alsl.d $a0, $a0, $t5, 3 st.d $a0, $sp, 56 # 8-byte Folded Spill - alsl.d $a0, $a7, $t5, 3 + move $a0, $fp + lu32i.d $a0, -3 + vreplgr2vr.d $vr3, $a0 + vmax.w $vr2, $vr2, $vr3 + ori $a0, $zero, 2 + lu32i.d $a0, 3 + vreplgr2vr.d $vr3, $a0 + vadd.w $vr2, $vr2, $vr3 + vmin.w $vr2, $vr2, $vr0 + vpickve2gr.w $a0, $vr2, 0 + alsl.d $a0, $a0, $t5, 3 st.d $a0, $sp, 48 # 8-byte Folded Spill - alsl.d $a0, $t1, $t5, 3 + vpickve2gr.w $a0, $vr2, 1 + alsl.d $a0, $a0, $t5, 3 st.d $a0, $sp, 40 # 8-byte Folded Spill - alsl.d $a0, $t2, $t5, 3 - st.d $a0, $sp, 32 # 8-byte Folded Spill - alsl.d $a0, $t3, $t5, 3 - st.d $a0, $sp, 24 # 8-byte Folded Spill .LBB3_25: # %.preheader207 # Parent Loop BB3_1 Depth=1 # Parent Loop BB3_2 Depth=2 # Parent Loop BB3_24 Depth=3 # => This Inner Loop Header: Depth=4 - add.w $a1, $s5, $s8 - slt $a0, $a6, $a1 - maskeqz $a7, $a1, $a0 - masknez $a0, $a6, $a0 - or $a0, $a7, $a0 - addi.w $a0, $a0, -2 - slt $a7, $a0, $a4 - maskeqz $a0, $a0, $a7 + add.w $a0, $t4, $s8 + slt $a1, $a6, $a0 + maskeqz $a7, $a0, $a1 + masknez $a1, $a6, $a1 + or $a1, $a7, $a1 + addi.w $a1, $a1, -2 + slt $a7, $a1, $a4 + maskeqz $a1, $a1, $a7 masknez $a7, $a4, $a7 - or $t1, $a0, $a7 - ldx.bu $a0, $s6, $t1 - bge $a2, $a0, .LBB3_158 + or $t1, $a1, $a7 + ldx.bu $a1, $ra, $t1 + bge $a2, $a1, .LBB3_158 # %bb.26: # in Loop: Header=BB3_25 Depth=4 - slt $a0, $s0, $a1 - masknez $a7, $s0, $a0 - maskeqz $a0, $a1, $a0 - or $a0, $a0, $a7 - addi.d $a0, $a0, -1 - slt $a7, $a0, $a4 - maskeqz $a0, $a0, $a7 + ori $a7, $zero, 1 + slt $a1, $a7, $a0 + masknez $a7, $a7, $a1 + maskeqz $a1, $a0, $a1 + or $a1, $a1, $a7 + addi.d $a1, $a1, -1 + slt $a7, $a1, $a4 + maskeqz $a1, $a1, $a7 masknez $a7, $a4, $a7 - or $t2, $a0, $a7 - ldx.bu $a0, $s6, $t2 - bge $a2, $a0, .LBB3_158 + or $t2, $a1, $a7 + ldx.bu $a1, $ra, $t2 + bge $a2, $a1, .LBB3_158 # %bb.27: # in Loop: Header=BB3_25 Depth=4 - srai.d $a0, $a1, 63 - andn $a0, $a1, $a0 - slt $a7, $a0, $a4 - maskeqz $a0, $a0, $a7 + srai.d $a1, $a0, 63 + andn $a1, $a0, $a1 + slt $a7, $a1, $a4 + maskeqz $a1, $a1, $a7 masknez $a7, $a4, $a7 - or $t3, $a0, $a7 - ldx.bu $a0, $s6, $t3 - bge $a2, $a0, .LBB3_158 + or $t3, $a1, $a7 + ldx.bu $a1, $ra, $t3 + bge $a2, $a1, .LBB3_158 # %bb.28: # in Loop: Header=BB3_25 Depth=4 - addi.w $a0, $a1, 1 - srai.d $a7, $a0, 63 - andn $a0, $a0, $a7 - slt $a7, $a0, $a4 - maskeqz $a0, $a0, $a7 + addi.w $a1, $a0, 1 + srai.d $a7, $a1, 63 + andn $a1, $a1, $a7 + slt $a7, $a1, $a4 + maskeqz $a1, $a1, $a7 masknez $a7, $a4, $a7 - or $a7, $a0, $a7 - ldx.bu $a0, $s6, $a7 - bge $a2, $a0, .LBB3_158 + or $a7, $a1, $a7 + ldx.bu $a1, $ra, $a7 + bge $a2, $a1, .LBB3_158 # %bb.29: # in Loop: Header=BB3_25 Depth=4 - slt $a0, $fp, $a1 - masknez $t4, $fp, $a0 - maskeqz $a0, $a1, $a0 - or $a0, $a0, $t4 - addi.w $a0, $a0, 2 - slt $t4, $a0, $a4 - maskeqz $a0, $a0, $t4 - masknez $t4, $a4, $t4 - or $a0, $a0, $t4 - ldx.bu $t4, $s6, $a0 - bge $a2, $t4, .LBB3_158 + slt $a1, $fp, $a0 + masknez $s5, $fp, $a1 + maskeqz $a1, $a0, $a1 + or $a1, $a1, $s5 + addi.w $a1, $a1, 2 + slt $s5, $a1, $a4 + maskeqz $a1, $a1, $s5 + masknez $s5, $a4, $s5 + or $s6, $a1, $s5 + ldx.bu $a1, $ra, $s6 + bge $a2, $a1, .LBB3_158 # %bb.30: # in Loop: Header=BB3_25 Depth=4 - slt $t4, $ra, $a1 - masknez $t7, $ra, $t4 - maskeqz $a1, $a1, $t4 - or $a1, $a1, $t7 - addi.w $a1, $a1, 3 - slt $t4, $a1, $a4 - maskeqz $a1, $a1, $t4 - masknez $t4, $a4, $t4 - or $t4, $a1, $t4 - ldx.bu $a1, $s6, $t4 + addi.w $a1, $zero, -3 + slt $s5, $a1, $a0 + masknez $a1, $a1, $s5 + maskeqz $a0, $a0, $s5 + or $a0, $a0, $a1 + addi.w $a0, $a0, 3 + slt $a1, $a0, $a4 + maskeqz $a0, $a0, $a1 + masknez $a1, $a4, $a1 + or $a0, $a0, $a1 + ldx.bu $a1, $ra, $a0 bge $a2, $a1, .LBB3_158 # %bb.31: # in Loop: Header=BB3_25 Depth=4 - ld.d $a1, $sp, 56 # 8-byte Folded Reload - ld.d $a1, $a1, 0 - ldx.bu $t7, $a1, $t1 - bge $a2, $t7, .LBB3_158 + ld.d $a1, $sp, 80 # 8-byte Folded Reload + ld.d $s5, $a1, 0 + ldx.bu $a1, $s5, $t1 + bge $a2, $a1, .LBB3_158 # %bb.32: # in Loop: Header=BB3_25 Depth=4 - ldx.bu $t7, $a1, $t2 - bge $a2, $t7, .LBB3_158 + ldx.bu $a1, $s5, $t2 + bge $a2, $a1, .LBB3_158 # %bb.33: # in Loop: Header=BB3_25 Depth=4 - ldx.bu $t7, $a1, $t3 - bge $a2, $t7, .LBB3_158 + ldx.bu $a1, $s5, $t3 + bge $a2, $a1, .LBB3_158 # %bb.34: # in Loop: Header=BB3_25 Depth=4 - ldx.bu $t7, $a1, $a7 - bge $a2, $t7, .LBB3_158 + ldx.bu $a1, $s5, $a7 + bge $a2, $a1, .LBB3_158 # %bb.35: # in Loop: Header=BB3_25 Depth=4 - ldx.bu $t7, $a1, $a0 - bge $a2, $t7, .LBB3_158 + ldx.bu $a1, $s5, $s6 + bge $a2, $a1, .LBB3_158 # %bb.36: # in Loop: Header=BB3_25 Depth=4 - ldx.bu $a1, $a1, $t4 + ldx.bu $a1, $s5, $a0 bge $a2, $a1, .LBB3_158 # %bb.37: # in Loop: Header=BB3_25 Depth=4 - ld.d $a1, $sp, 48 # 8-byte Folded Reload - ld.d $a1, $a1, 0 - ldx.bu $t7, $a1, $t1 - bge $a2, $t7, .LBB3_158 + ld.d $a1, $sp, 64 # 8-byte Folded Reload + ld.d $s5, $a1, 0 + ldx.bu $a1, $s5, $t1 + bge $a2, $a1, .LBB3_158 # %bb.38: # in Loop: Header=BB3_25 Depth=4 - ldx.bu $t7, $a1, $t2 - bge $a2, $t7, .LBB3_158 + ldx.bu $a1, $s5, $t2 + bge $a2, $a1, .LBB3_158 # %bb.39: # in Loop: Header=BB3_25 Depth=4 - ldx.bu $t7, $a1, $t3 - bge $a2, $t7, .LBB3_158 + ldx.bu $a1, $s5, $t3 + bge $a2, $a1, .LBB3_158 # %bb.40: # in Loop: Header=BB3_25 Depth=4 - ldx.bu $t7, $a1, $a7 - bge $a2, $t7, .LBB3_158 + ldx.bu $a1, $s5, $a7 + bge $a2, $a1, .LBB3_158 # %bb.41: # in Loop: Header=BB3_25 Depth=4 - ldx.bu $t7, $a1, $a0 - bge $a2, $t7, .LBB3_158 + ldx.bu $a1, $s5, $s6 + bge $a2, $a1, .LBB3_158 # %bb.42: # in Loop: Header=BB3_25 Depth=4 - ldx.bu $a1, $a1, $t4 + ldx.bu $a1, $s5, $a0 bge $a2, $a1, .LBB3_158 # %bb.43: # in Loop: Header=BB3_25 Depth=4 - ld.d $a1, $sp, 40 # 8-byte Folded Reload - ld.d $a1, $a1, 0 - ldx.bu $t7, $a1, $t1 - bge $a2, $t7, .LBB3_158 + ld.d $a1, $sp, 56 # 8-byte Folded Reload + ld.d $s5, $a1, 0 + ldx.bu $a1, $s5, $t1 + bge $a2, $a1, .LBB3_158 # %bb.44: # in Loop: Header=BB3_25 Depth=4 - ldx.bu $t7, $a1, $t2 - bge $a2, $t7, .LBB3_158 + ldx.bu $a1, $s5, $t2 + bge $a2, $a1, .LBB3_158 # %bb.45: # in Loop: Header=BB3_25 Depth=4 - ldx.bu $t7, $a1, $t3 - bge $a2, $t7, .LBB3_158 + ldx.bu $a1, $s5, $t3 + bge $a2, $a1, .LBB3_158 # %bb.46: # in Loop: Header=BB3_25 Depth=4 - ldx.bu $t7, $a1, $a7 - bge $a2, $t7, .LBB3_158 + ldx.bu $a1, $s5, $a7 + bge $a2, $a1, .LBB3_158 # %bb.47: # in Loop: Header=BB3_25 Depth=4 - ldx.bu $t7, $a1, $a0 - bge $a2, $t7, .LBB3_158 + ldx.bu $a1, $s5, $s6 + bge $a2, $a1, .LBB3_158 # %bb.48: # in Loop: Header=BB3_25 Depth=4 - ldx.bu $a1, $a1, $t4 + ldx.bu $a1, $s5, $a0 bge $a2, $a1, .LBB3_158 # %bb.49: # in Loop: Header=BB3_25 Depth=4 - ld.d $a1, $sp, 32 # 8-byte Folded Reload - ld.d $a1, $a1, 0 - ldx.bu $t7, $a1, $t1 - bge $a2, $t7, .LBB3_158 + ld.d $a1, $sp, 48 # 8-byte Folded Reload + ld.d $s5, $a1, 0 + ldx.bu $a1, $s5, $t1 + bge $a2, $a1, .LBB3_158 # %bb.50: # in Loop: Header=BB3_25 Depth=4 - ldx.bu $t7, $a1, $t2 - bge $a2, $t7, .LBB3_158 + ldx.bu $a1, $s5, $t2 + bge $a2, $a1, .LBB3_158 # %bb.51: # in Loop: Header=BB3_25 Depth=4 - ldx.bu $t7, $a1, $t3 - bge $a2, $t7, .LBB3_158 + ldx.bu $a1, $s5, $t3 + bge $a2, $a1, .LBB3_158 # %bb.52: # in Loop: Header=BB3_25 Depth=4 - ldx.bu $t7, $a1, $a7 - bge $a2, $t7, .LBB3_158 + ldx.bu $a1, $s5, $a7 + bge $a2, $a1, .LBB3_158 # %bb.53: # in Loop: Header=BB3_25 Depth=4 - ldx.bu $t7, $a1, $a0 - bge $a2, $t7, .LBB3_158 + ldx.bu $a1, $s5, $s6 + bge $a2, $a1, .LBB3_158 # %bb.54: # in Loop: Header=BB3_25 Depth=4 - ldx.bu $a1, $a1, $t4 + ldx.bu $a1, $s5, $a0 bge $a2, $a1, .LBB3_158 # %bb.55: # in Loop: Header=BB3_25 Depth=4 - ld.d $a1, $sp, 24 # 8-byte Folded Reload - ld.d $a1, $a1, 0 - ldx.bu $t1, $a1, $t1 - bge $a2, $t1, .LBB3_158 + ld.d $a1, $sp, 40 # 8-byte Folded Reload + ld.d $s5, $a1, 0 + ldx.bu $a1, $s5, $t1 + bge $a2, $a1, .LBB3_158 # %bb.56: # in Loop: Header=BB3_25 Depth=4 - ldx.bu $t1, $a1, $t2 - bge $a2, $t1, .LBB3_158 + ldx.bu $a1, $s5, $t2 + bge $a2, $a1, .LBB3_158 # %bb.57: # in Loop: Header=BB3_25 Depth=4 - ldx.bu $t1, $a1, $t3 - bge $a2, $t1, .LBB3_158 + ldx.bu $a1, $s5, $t3 + bge $a2, $a1, .LBB3_158 # %bb.58: # in Loop: Header=BB3_25 Depth=4 - ldx.bu $a7, $a1, $a7 - bge $a2, $a7, .LBB3_158 + ldx.bu $a1, $s5, $a7 + bge $a2, $a1, .LBB3_158 # %bb.59: # in Loop: Header=BB3_25 Depth=4 - ldx.bu $a0, $a1, $a0 - bge $a2, $a0, .LBB3_158 + ldx.bu $a1, $s5, $s6 + bge $a2, $a1, .LBB3_158 # %bb.60: # in Loop: Header=BB3_25 Depth=4 - ldx.bu $a0, $a1, $t4 + ldx.bu $a0, $s5, $a0 bge $a2, $a0, .LBB3_158 # %bb.61: # in Loop: Header=BB3_25 Depth=4 addi.w $s8, $s8, 1 - ori $t7, $zero, 4 - bne $s8, $t7, .LBB3_25 + ori $a0, $zero, 4 + bne $s8, $a0, .LBB3_25 # %bb.62: # in Loop: Header=BB3_24 Depth=3 - ld.d $a0, $sp, 16 # 8-byte Folded Reload + ori $s8, $zero, 4 + ld.d $ra, $sp, 72 # 8-byte Folded Reload + move $a1, $a3 + ori $a3, $zero, 1 + ld.d $a0, $sp, 32 # 8-byte Folded Reload bne $s7, $a0, .LBB3_24 b .LBB3_19 .LBB3_63: # %.preheader220 @@ -1099,14 +1086,14 @@ CheckReliabilityOfRef: # @CheckReliabilityOfRef # => This Loop Header: Depth=3 # Child Loop BB3_65 Depth 4 addi.w $a7, $s4, 0 - srai.d $a1, $a7, 63 - andn $a1, $a7, $a1 - slt $t1, $a1, $a5 - maskeqz $a1, $a1, $t1 - masknez $t1, $a5, $t1 - or $a1, $a1, $t1 - slli.d $a1, $a1, 3 - ldx.d $t1, $t5, $a1 + srai.d $t1, $a7, 63 + andn $t1, $a7, $t1 + slt $t2, $t1, $a5 + maskeqz $t1, $t1, $t2 + masknez $t2, $a5, $t2 + or $t1, $t1, $t2 + slli.d $t1, $t1, 3 + ldx.d $t1, $t5, $t1 move $t2, $zero .p2align 4, , 16 .LBB3_65: # %.preheader @@ -1114,357 +1101,343 @@ CheckReliabilityOfRef: # @CheckReliabilityOfRef # Parent Loop BB3_2 Depth=2 # Parent Loop BB3_64 Depth=3 # => This Inner Loop Header: Depth=4 - add.w $a1, $s5, $t2 - slt $t3, $a6, $a1 - maskeqz $t4, $a1, $t3 - masknez $t3, $a6, $t3 - or $t3, $t4, $t3 - addi.w $t3, $t3, -2 - slt $t4, $t3, $a4 - maskeqz $t3, $t3, $t4 - masknez $t4, $a4, $t4 - or $t3, $t3, $t4 - ldx.bu $t3, $t1, $t3 - bge $a2, $t3, .LBB3_158 + add.w $t3, $t4, $t2 + slt $s5, $a6, $t3 + maskeqz $s6, $t3, $s5 + masknez $s5, $a6, $s5 + or $s5, $s6, $s5 + addi.w $s5, $s5, -2 + slt $s6, $s5, $a4 + maskeqz $s5, $s5, $s6 + masknez $s6, $a4, $s6 + or $s5, $s5, $s6 + ldx.bu $s5, $t1, $s5 + bge $a2, $s5, .LBB3_158 # %bb.66: # in Loop: Header=BB3_65 Depth=4 - slt $t3, $s0, $a1 - masknez $t4, $s0, $t3 - maskeqz $t3, $a1, $t3 - or $t3, $t3, $t4 - addi.d $t3, $t3, -1 - slt $t4, $t3, $a4 - maskeqz $t3, $t3, $t4 - masknez $t4, $a4, $t4 - or $t3, $t3, $t4 - ldx.bu $t3, $t1, $t3 - bge $a2, $t3, .LBB3_158 + slt $s5, $a3, $t3 + masknez $s6, $a3, $s5 + maskeqz $s5, $t3, $s5 + or $s5, $s5, $s6 + addi.d $s5, $s5, -1 + slt $s6, $s5, $a4 + maskeqz $s5, $s5, $s6 + masknez $s6, $a4, $s6 + or $s5, $s5, $s6 + ldx.bu $s5, $t1, $s5 + bge $a2, $s5, .LBB3_158 # %bb.67: # in Loop: Header=BB3_65 Depth=4 - srai.d $t3, $a1, 63 - andn $t3, $a1, $t3 - slt $t4, $t3, $a4 - maskeqz $t3, $t3, $t4 - masknez $t4, $a4, $t4 - or $t3, $t3, $t4 - ldx.bu $t3, $t1, $t3 - bge $a2, $t3, .LBB3_158 + srai.d $s5, $t3, 63 + andn $s5, $t3, $s5 + slt $s6, $s5, $a4 + maskeqz $s5, $s5, $s6 + masknez $s6, $a4, $s6 + or $s5, $s5, $s6 + ldx.bu $s5, $t1, $s5 + bge $a2, $s5, .LBB3_158 # %bb.68: # in Loop: Header=BB3_65 Depth=4 - addi.w $t3, $a1, 1 - srai.d $t4, $t3, 63 - andn $t3, $t3, $t4 - slt $t4, $t3, $a4 - maskeqz $t3, $t3, $t4 - masknez $t4, $a4, $t4 - or $t3, $t3, $t4 - ldx.bu $t3, $t1, $t3 - bge $a2, $t3, .LBB3_158 + addi.w $s5, $t3, 1 + srai.d $s6, $s5, 63 + andn $s5, $s5, $s6 + slt $s6, $s5, $a4 + maskeqz $s5, $s5, $s6 + masknez $s6, $a4, $s6 + or $s5, $s5, $s6 + ldx.bu $s5, $t1, $s5 + bge $a2, $s5, .LBB3_158 # %bb.69: # in Loop: Header=BB3_65 Depth=4 - slt $t3, $fp, $a1 - masknez $t4, $fp, $t3 - maskeqz $t3, $a1, $t3 - or $t3, $t3, $t4 - addi.w $t3, $t3, 2 - slt $t4, $t3, $a4 - maskeqz $t3, $t3, $t4 - masknez $t4, $a4, $t4 - or $t3, $t3, $t4 + slt $s5, $fp, $t3 + masknez $s6, $fp, $s5 + maskeqz $s5, $t3, $s5 + or $s5, $s5, $s6 + addi.w $s5, $s5, 2 + slt $s6, $s5, $a4 + maskeqz $s5, $s5, $s6 + masknez $s6, $a4, $s6 + or $s5, $s5, $s6 + ldx.bu $s5, $t1, $s5 + bge $a2, $s5, .LBB3_158 +# %bb.70: # in Loop: Header=BB3_65 Depth=4 + addi.w $s5, $zero, -3 + slt $s6, $s5, $t3 + masknez $s5, $s5, $s6 + maskeqz $t3, $t3, $s6 + or $t3, $t3, $s5 + addi.w $t3, $t3, 3 + slt $s5, $t3, $a4 + maskeqz $t3, $t3, $s5 + masknez $s5, $a4, $s5 + or $t3, $t3, $s5 ldx.bu $t3, $t1, $t3 bge $a2, $t3, .LBB3_158 -# %bb.70: # in Loop: Header=BB3_65 Depth=4 - addi.w $t3, $zero, -3 - slt $t4, $t3, $a1 - masknez $t3, $t3, $t4 - maskeqz $a1, $a1, $t4 - or $a1, $a1, $t3 - addi.w $a1, $a1, 3 - slt $t3, $a1, $a4 - maskeqz $a1, $a1, $t3 - masknez $t3, $a4, $t3 - or $a1, $a1, $t3 - ldx.bu $a1, $t1, $a1 - bge $a2, $a1, .LBB3_158 # %bb.71: # in Loop: Header=BB3_65 Depth=4 addi.w $t2, $t2, 1 - bne $t2, $t7, .LBB3_65 + bne $t2, $s8, .LBB3_65 # %bb.72: # in Loop: Header=BB3_64 Depth=3 addi.d $s4, $s4, 1 bne $a7, $a0, .LBB3_64 b .LBB3_19 .LBB3_73: # %.preheader222 # in Loop: Header=BB3_2 Depth=2 - move $s6, $zero - srai.d $a0, $s5, 63 - andn $a0, $s5, $a0 - slt $a1, $a0, $a4 - maskeqz $a0, $a0, $a1 - masknez $a1, $a4, $a1 - or $s7, $a0, $a1 - slt $a0, $t8, $s5 - masknez $a1, $t8, $a0 - maskeqz $a0, $s5, $a0 - or $a0, $a0, $a1 + move $a7, $zero + srai.d $a0, $t4, 63 + andn $a0, $t4, $a0 + slt $t1, $a0, $a4 + maskeqz $a0, $a0, $t1 + masknez $t1, $a4, $t1 + or $t1, $a0, $t1 + slt $a0, $t8, $t4 + masknez $t2, $t8, $a0 + maskeqz $a0, $t4, $a0 + or $a0, $a0, $t2 addi.d $a0, $a0, 1 - slt $a1, $a0, $a4 - maskeqz $a0, $a0, $a1 - masknez $a1, $a4, $a1 - or $s8, $a0, $a1 - slt $a0, $fp, $s5 - masknez $a1, $fp, $a0 - maskeqz $a0, $s5, $a0 - or $a0, $a0, $a1 + slt $t2, $a0, $a4 + maskeqz $a0, $a0, $t2 + masknez $t2, $a4, $t2 + or $s6, $a0, $t2 + slt $a0, $fp, $t4 + masknez $t2, $fp, $a0 + maskeqz $a0, $t4, $a0 + or $a0, $a0, $t2 addi.d $a0, $a0, 2 - slt $a1, $a0, $a4 - maskeqz $a0, $a0, $a1 - masknez $a1, $a4, $a1 - or $a7, $a0, $a1 - addi.w $t1, $zero, -3 - slt $a0, $t1, $s5 - maskeqz $a1, $s5, $a0 - masknez $a0, $t1, $a0 - or $a0, $a1, $a0 - addi.d $a0, $a0, 3 - slt $a1, $a0, $a4 - maskeqz $a0, $a0, $a1 - masknez $a1, $a4, $a1 - or $s5, $a0, $a1 + slt $t2, $a0, $a4 + maskeqz $a0, $a0, $t2 + masknez $t2, $a4, $t2 + or $s7, $a0, $t2 + addi.w $a0, $zero, -3 + slt $t2, $a0, $t4 + masknez $a0, $a0, $t2 + maskeqz $t2, $t4, $t2 + or $a0, $t2, $a0 + addi.d $a0, $a0, 3 + slt $t2, $a0, $a4 + maskeqz $a0, $a0, $t2 + masknez $t2, $a4, $t2 + or $s5, $a0, $t2 .LBB3_74: # %.preheader214 # Parent Loop BB3_1 Depth=1 # Parent Loop BB3_2 Depth=2 # => This Inner Loop Header: Depth=3 - add.w $a1, $s4, $s6 - slt $a0, $a6, $a1 - maskeqz $t2, $a1, $a0 - masknez $a0, $a6, $a0 - or $a0, $t2, $a0 - addi.w $a0, $a0, -2 - slt $t2, $a0, $a5 - maskeqz $a0, $a0, $t2 - masknez $t2, $a5, $t2 - or $a0, $a0, $t2 + add.w $t4, $s4, $a7 + vinsgr2vr.w $vr2, $t4, 0 + vinsgr2vr.w $vr2, $t4, 1 + ori $a0, $zero, 2 + lu32i.d $a0, 1 + vreplgr2vr.d $vr3, $a0 + vmax.w $vr3, $vr2, $vr3 + vadd.w $vr3, $vr3, $vr1 + vmin.w $vr3, $vr3, $vr0 + vpickve2gr.w $a0, $vr3, 0 slli.d $a0, $a0, 3 ldx.d $t2, $t5, $a0 - ldx.bu $a0, $t2, $s7 + ldx.bu $a0, $t2, $t1 bge $a2, $a0, .LBB3_158 # %bb.75: # in Loop: Header=BB3_74 Depth=3 - slt $a0, $s0, $a1 - maskeqz $t3, $a1, $a0 - masknez $a0, $s0, $a0 - or $a0, $t3, $a0 - addi.d $a0, $a0, -1 - slt $t3, $a0, $a5 - maskeqz $a0, $a0, $t3 - masknez $t3, $a5, $t3 - or $a0, $a0, $t3 + vpickve2gr.w $a0, $vr3, 1 alsl.d $t3, $a0, $t5, 3 ld.d $a0, $t3, 0 - ldx.bu $a0, $a0, $s7 + ldx.bu $a0, $a0, $t1 bge $a2, $a0, .LBB3_158 # %bb.76: # in Loop: Header=BB3_74 Depth=3 - srai.d $a0, $a1, 63 - andn $a0, $a1, $a0 - slt $t4, $a0, $a5 - maskeqz $a0, $a0, $t4 - masknez $t4, $a5, $t4 - or $a0, $a0, $t4 + srai.d $a0, $t4, 63 + andn $a0, $t4, $a0 + slt $s8, $a0, $a5 + maskeqz $a0, $a0, $s8 + masknez $s8, $a5, $s8 + or $a0, $a0, $s8 alsl.d $a0, $a0, $t5, 3 - ld.d $t4, $a0, 0 - ldx.bu $t4, $t4, $s7 - bge $a2, $t4, .LBB3_158 + ld.d $s8, $a0, 0 + ldx.bu $s8, $s8, $t1 + bge $a2, $s8, .LBB3_158 # %bb.77: # in Loop: Header=BB3_74 Depth=3 - addi.w $t4, $a1, 1 - srai.d $ra, $t4, 63 - andn $t4, $t4, $ra - slt $ra, $t4, $a5 - maskeqz $t4, $t4, $ra - masknez $ra, $a5, $ra - or $t4, $t4, $ra + addi.w $t4, $t4, 1 + srai.d $s8, $t4, 63 + andn $t4, $t4, $s8 + slt $s8, $t4, $a5 + maskeqz $t4, $t4, $s8 + masknez $s8, $a5, $s8 + or $t4, $t4, $s8 + alsl.d $s8, $t4, $t5, 3 + ld.d $t4, $s8, 0 + ldx.bu $t4, $t4, $t1 + bge $a2, $t4, .LBB3_158 +# %bb.78: # in Loop: Header=BB3_74 Depth=3 + move $a3, $t7 + move $t7, $ra + move $t4, $fp + lu32i.d $t4, -3 + vreplgr2vr.d $vr3, $t4 + vmax.w $vr2, $vr2, $vr3 + ori $t4, $zero, 2 + lu32i.d $t4, 3 + vreplgr2vr.d $vr3, $t4 + vadd.w $vr2, $vr2, $vr3 + vmin.w $vr2, $vr2, $vr0 + vpickve2gr.w $t4, $vr2, 0 alsl.d $t4, $t4, $t5, 3 ld.d $ra, $t4, 0 - ldx.bu $ra, $ra, $s7 + ldx.bu $ra, $ra, $t1 bge $a2, $ra, .LBB3_158 -# %bb.78: # in Loop: Header=BB3_74 Depth=3 - slt $ra, $fp, $a1 - maskeqz $t7, $a1, $ra - masknez $ra, $fp, $ra - or $t7, $t7, $ra - addi.w $t7, $t7, 2 - slt $ra, $t7, $a5 - maskeqz $t7, $t7, $ra - masknez $ra, $a5, $ra - or $t7, $t7, $ra - alsl.d $ra, $t7, $t5, 3 - ld.d $t7, $ra, 0 - ldx.bu $t7, $t7, $s7 - bge $a2, $t7, .LBB3_158 # %bb.79: # in Loop: Header=BB3_74 Depth=3 - slt $t7, $t1, $a1 - maskeqz $a1, $a1, $t7 - masknez $t7, $t1, $t7 - or $a1, $a1, $t7 - addi.w $a1, $a1, 3 - slt $t7, $a1, $a5 - maskeqz $a1, $a1, $t7 - masknez $t7, $a5, $t7 - or $a1, $a1, $t7 - alsl.d $a1, $a1, $t5, 3 - ld.d $t7, $a1, 0 - ldx.bu $t7, $t7, $s7 - bge $a2, $t7, .LBB3_158 + move $a6, $t8 + move $t8, $t6 + move $t6, $t0 + move $t0, $s0 + move $s0, $s2 + move $s2, $a1 + vpickve2gr.w $ra, $vr2, 1 + alsl.d $ra, $ra, $t5, 3 + ld.d $a1, $ra, 0 + ldx.bu $a1, $a1, $t1 + bge $a2, $a1, .LBB3_158 # %bb.80: # in Loop: Header=BB3_74 Depth=3 - ldx.bu $t7, $t2, $s8 - bge $a2, $t7, .LBB3_158 + ldx.bu $a1, $t2, $s6 + bge $a2, $a1, .LBB3_158 # %bb.81: # in Loop: Header=BB3_74 Depth=3 - ld.d $t7, $t3, 0 - ldx.bu $t7, $t7, $s8 - bge $a2, $t7, .LBB3_158 + ld.d $a1, $t3, 0 + ldx.bu $a1, $a1, $s6 + bge $a2, $a1, .LBB3_158 # %bb.82: # in Loop: Header=BB3_74 Depth=3 - ld.d $t7, $a0, 0 - ldx.bu $t7, $t7, $s8 - bge $a2, $t7, .LBB3_158 + ld.d $a1, $a0, 0 + ldx.bu $a1, $a1, $s6 + bge $a2, $a1, .LBB3_158 # %bb.83: # in Loop: Header=BB3_74 Depth=3 - ld.d $t7, $t4, 0 - ldx.bu $t7, $t7, $s8 - bge $a2, $t7, .LBB3_158 + ld.d $a1, $s8, 0 + ldx.bu $a1, $a1, $s6 + bge $a2, $a1, .LBB3_158 # %bb.84: # in Loop: Header=BB3_74 Depth=3 - ld.d $t7, $ra, 0 - ldx.bu $t7, $t7, $s8 - bge $a2, $t7, .LBB3_158 + ld.d $a1, $t4, 0 + ldx.bu $a1, $a1, $s6 + bge $a2, $a1, .LBB3_158 # %bb.85: # in Loop: Header=BB3_74 Depth=3 - ld.d $t7, $a1, 0 - ldx.bu $t7, $t7, $s8 - bge $a2, $t7, .LBB3_158 + ld.d $a1, $ra, 0 + ldx.bu $a1, $a1, $s6 + bge $a2, $a1, .LBB3_158 # %bb.86: # in Loop: Header=BB3_74 Depth=3 - ldx.bu $t7, $t2, $a7 - bge $a2, $t7, .LBB3_158 + ldx.bu $a1, $t2, $s7 + bge $a2, $a1, .LBB3_158 # %bb.87: # in Loop: Header=BB3_74 Depth=3 - ld.d $t7, $t3, 0 - ldx.bu $t7, $t7, $a7 - bge $a2, $t7, .LBB3_158 + ld.d $a1, $t3, 0 + ldx.bu $a1, $a1, $s7 + bge $a2, $a1, .LBB3_158 # %bb.88: # in Loop: Header=BB3_74 Depth=3 - ld.d $t7, $a0, 0 - ldx.bu $t7, $t7, $a7 - bge $a2, $t7, .LBB3_158 + ld.d $a1, $a0, 0 + ldx.bu $a1, $a1, $s7 + bge $a2, $a1, .LBB3_158 # %bb.89: # in Loop: Header=BB3_74 Depth=3 - ld.d $t7, $t4, 0 - ldx.bu $t7, $t7, $a7 - bge $a2, $t7, .LBB3_158 + ld.d $a1, $s8, 0 + ldx.bu $a1, $a1, $s7 + bge $a2, $a1, .LBB3_158 # %bb.90: # in Loop: Header=BB3_74 Depth=3 - ld.d $t7, $ra, 0 - ldx.bu $t7, $t7, $a7 - bge $a2, $t7, .LBB3_158 + ld.d $a1, $t4, 0 + ldx.bu $a1, $a1, $s7 + bge $a2, $a1, .LBB3_158 # %bb.91: # in Loop: Header=BB3_74 Depth=3 - ld.d $t7, $a1, 0 - ldx.bu $t7, $t7, $a7 - bge $a2, $t7, .LBB3_158 + ld.d $a1, $ra, 0 + ldx.bu $a1, $a1, $s7 + bge $a2, $a1, .LBB3_158 # %bb.92: # in Loop: Header=BB3_74 Depth=3 - ldx.bu $t2, $t2, $s5 - bge $a2, $t2, .LBB3_158 + ldx.bu $a1, $t2, $s5 + bge $a2, $a1, .LBB3_158 # %bb.93: # in Loop: Header=BB3_74 Depth=3 - ld.d $t2, $t3, 0 - ldx.bu $t2, $t2, $s5 - bge $a2, $t2, .LBB3_158 + ld.d $a1, $t3, 0 + ldx.bu $a1, $a1, $s5 + bge $a2, $a1, .LBB3_158 # %bb.94: # in Loop: Header=BB3_74 Depth=3 ld.d $a0, $a0, 0 ldx.bu $a0, $a0, $s5 bge $a2, $a0, .LBB3_158 # %bb.95: # in Loop: Header=BB3_74 Depth=3 - ld.d $a0, $t4, 0 + ld.d $a0, $s8, 0 ldx.bu $a0, $a0, $s5 bge $a2, $a0, .LBB3_158 # %bb.96: # in Loop: Header=BB3_74 Depth=3 - ld.d $a0, $ra, 0 + ld.d $a0, $t4, 0 ldx.bu $a0, $a0, $s5 bge $a2, $a0, .LBB3_158 # %bb.97: # in Loop: Header=BB3_74 Depth=3 - ld.d $a0, $a1, 0 + ld.d $a0, $ra, 0 ldx.bu $a0, $a0, $s5 bge $a2, $a0, .LBB3_158 # %bb.98: # in Loop: Header=BB3_74 Depth=3 - addi.w $s6, $s6, 1 - ori $t7, $zero, 4 - bne $s6, $t7, .LBB3_74 + addi.w $a7, $a7, 1 + ori $s8, $zero, 4 + move $ra, $t7 + move $t7, $a3 + move $a1, $s2 + move $s2, $s0 + move $s0, $t0 + move $t0, $t6 + move $t6, $t8 + move $t8, $a6 + ori $a6, $zero, 2 + ori $a3, $zero, 1 + bne $a7, $s8, .LBB3_74 b .LBB3_19 .LBB3_99: # in Loop: Header=BB3_2 Depth=2 bne $a0, $a6, .LBB3_140 # %bb.100: # %.preheader226 # in Loop: Header=BB3_2 Depth=2 addi.d $a0, $s4, 3 - st.d $a0, $sp, 8 # 8-byte Folded Spill + st.d $a0, $sp, 48 # 8-byte Folded Spill + st.d $a1, $sp, 16 # 8-byte Folded Spill + st.d $t7, $sp, 8 # 8-byte Folded Spill .LBB3_101: # %.preheader216 # Parent Loop BB3_1 Depth=1 # Parent Loop BB3_2 Depth=2 # => This Loop Header: Depth=3 # Child Loop BB3_102 Depth 4 - move $s8, $zero - addi.w $s7, $s4, 0 - slt $a0, $a6, $s7 - masknez $a1, $a6, $a0 - maskeqz $a0, $s7, $a0 - or $a0, $a0, $a1 - addi.w $a0, $a0, -2 + move $t7, $zero + addi.w $a1, $s4, 0 + vinsgr2vr.w $vr2, $s4, 0 + vinsgr2vr.w $vr2, $s4, 1 + ori $a0, $zero, 2 + lu32i.d $a0, 1 + vreplgr2vr.d $vr3, $a0 + vmax.w $vr3, $vr2, $vr3 + vadd.w $vr3, $vr3, $vr1 + vmin.w $vr3, $vr3, $vr0 + vpickve2gr.w $a0, $vr3, 0 + slli.d $a0, $a0, 3 + ldx.d $ra, $t5, $a0 + vpickve2gr.w $a0, $vr3, 1 + alsl.d $a3, $a0, $t5, 3 + srai.d $a0, $a1, 63 + st.d $a1, $sp, 56 # 8-byte Folded Spill + andn $a0, $a1, $a0 slt $a1, $a0, $a5 maskeqz $a0, $a0, $a1 masknez $a1, $a5, $a1 or $a0, $a0, $a1 - slt $a1, $s0, $s7 - masknez $a7, $s0, $a1 - maskeqz $a1, $s7, $a1 - or $a1, $a1, $a7 - addi.d $a1, $a1, -1 - slt $a7, $a1, $a5 - maskeqz $a1, $a1, $a7 - masknez $a7, $a5, $a7 - or $a1, $a1, $a7 - srai.d $a7, $s7, 63 - andn $a7, $s7, $a7 - slt $t1, $a7, $a5 - maskeqz $a7, $a7, $t1 - masknez $t1, $a5, $t1 - or $a7, $a7, $t1 + alsl.d $s5, $a0, $t5, 3 addi.w $s4, $s4, 1 - srai.d $t1, $s4, 63 - andn $t1, $s4, $t1 - slt $t2, $t1, $a5 - maskeqz $t1, $t1, $t2 - masknez $t2, $a5, $t2 - or $t1, $t1, $t2 - slt $t2, $fp, $s7 - masknez $t3, $fp, $t2 - maskeqz $t2, $s7, $t2 - or $t2, $t2, $t3 - addi.w $t2, $t2, 2 - slt $t3, $t2, $a5 - maskeqz $t2, $t2, $t3 - masknez $t3, $a5, $t3 - or $t2, $t2, $t3 - addi.w $t4, $zero, -3 - slt $t3, $t4, $s7 - st.d $t4, $sp, 32 # 8-byte Folded Spill - masknez $t4, $t4, $t3 - maskeqz $t3, $s7, $t3 - or $t3, $t3, $t4 - addi.w $t3, $t3, 3 - slt $t4, $t3, $a5 - maskeqz $t3, $t3, $t4 - masknez $t4, $a5, $t4 - or $t3, $t3, $t4 - slli.d $a0, $a0, 3 - ldx.d $s6, $t5, $a0 - alsl.d $t4, $a1, $t5, 3 - alsl.d $ra, $a7, $t5, 3 - alsl.d $a0, $t1, $t5, 3 - st.d $a0, $sp, 56 # 8-byte Folded Spill - alsl.d $a0, $t2, $t5, 3 - st.d $a0, $sp, 48 # 8-byte Folded Spill - alsl.d $a0, $t3, $t5, 3 - st.d $a0, $sp, 40 # 8-byte Folded Spill - st.d $t4, $sp, 24 # 8-byte Folded Spill - st.d $ra, $sp, 16 # 8-byte Folded Spill + srai.d $a0, $s4, 63 + andn $a0, $s4, $a0 + slt $a1, $a0, $a5 + maskeqz $a0, $a0, $a1 + masknez $a1, $a5, $a1 + or $a0, $a0, $a1 + alsl.d $s8, $a0, $t5, 3 + move $a0, $fp + lu32i.d $a0, -3 + vreplgr2vr.d $vr3, $a0 + vmax.w $vr2, $vr2, $vr3 + ori $a0, $zero, 2 + lu32i.d $a0, 3 + vreplgr2vr.d $vr3, $a0 + vadd.w $vr2, $vr2, $vr3 + vmin.w $vr2, $vr2, $vr0 + vpickve2gr.w $a0, $vr2, 0 + alsl.d $t3, $a0, $t5, 3 + vpickve2gr.w $a0, $vr2, 1 + alsl.d $a0, $a0, $t5, 3 + st.d $a0, $sp, 80 # 8-byte Folded Spill + st.d $t3, $sp, 64 # 8-byte Folded Spill .LBB3_102: # %.preheader208 # Parent Loop BB3_1 Depth=1 # Parent Loop BB3_2 Depth=2 # Parent Loop BB3_101 Depth=3 # => This Inner Loop Header: Depth=4 - add.w $t1, $s5, $s8 + add.w $t1, $t4, $t7 slt $a0, $a6, $t1 maskeqz $a1, $t1, $a0 masknez $a0, $a6, $a0 @@ -1474,35 +1447,33 @@ CheckReliabilityOfRef: # @CheckReliabilityOfRef maskeqz $a0, $a0, $a1 masknez $a1, $a4, $a1 or $a0, $a0, $a1 - ldx.bu $a1, $s6, $a0 + ldx.bu $a1, $ra, $a0 bge $a2, $a1, .LBB3_158 # %bb.103: # in Loop: Header=BB3_102 Depth=4 - ld.d $a7, $t4, 0 + ld.d $a7, $a3, 0 ldx.bu $a1, $a7, $a0 bge $a2, $a1, .LBB3_158 # %bb.104: # in Loop: Header=BB3_102 Depth=4 - ld.d $ra, $ra, 0 - ldx.bu $a1, $ra, $a0 + ld.d $s6, $s5, 0 + ldx.bu $a1, $s6, $a0 bge $a2, $a1, .LBB3_158 # %bb.105: # in Loop: Header=BB3_102 Depth=4 - ld.d $a1, $sp, 56 # 8-byte Folded Reload - ld.d $t2, $a1, 0 - ldx.bu $a1, $t2, $a0 + ld.d $s7, $s8, 0 + ldx.bu $a1, $s7, $a0 bge $a2, $a1, .LBB3_158 # %bb.106: # in Loop: Header=BB3_102 Depth=4 - ld.d $a1, $sp, 48 # 8-byte Folded Reload - ld.d $t3, $a1, 0 - ldx.bu $a1, $t3, $a0 + ld.d $t2, $t3, 0 + ldx.bu $a1, $t2, $a0 bge $a2, $a1, .LBB3_158 # %bb.107: # in Loop: Header=BB3_102 Depth=4 - move $t4, $s8 - ld.d $a1, $sp, 40 # 8-byte Folded Reload - ld.d $s8, $a1, 0 - ldx.bu $a0, $s8, $a0 + ld.d $a1, $sp, 80 # 8-byte Folded Reload + ld.d $t3, $a1, 0 + ldx.bu $a0, $t3, $a0 bge $a2, $a0, .LBB3_158 # %bb.108: # in Loop: Header=BB3_102 Depth=4 - slt $a0, $s0, $t1 - masknez $a1, $s0, $a0 + ori $a1, $zero, 1 + slt $a0, $a1, $t1 + masknez $a1, $a1, $a0 maskeqz $a0, $t1, $a0 or $a0, $a0, $a1 addi.d $a0, $a0, -1 @@ -1510,22 +1481,22 @@ CheckReliabilityOfRef: # @CheckReliabilityOfRef maskeqz $a0, $a0, $a1 masknez $a1, $a4, $a1 or $a0, $a0, $a1 - ldx.bu $a1, $s6, $a0 + ldx.bu $a1, $ra, $a0 bge $a2, $a1, .LBB3_158 # %bb.109: # in Loop: Header=BB3_102 Depth=4 ldx.bu $a1, $a7, $a0 bge $a2, $a1, .LBB3_158 # %bb.110: # in Loop: Header=BB3_102 Depth=4 - ldx.bu $a1, $ra, $a0 + ldx.bu $a1, $s6, $a0 bge $a2, $a1, .LBB3_158 # %bb.111: # in Loop: Header=BB3_102 Depth=4 - ldx.bu $a1, $t2, $a0 + ldx.bu $a1, $s7, $a0 bge $a2, $a1, .LBB3_158 # %bb.112: # in Loop: Header=BB3_102 Depth=4 - ldx.bu $a1, $t3, $a0 + ldx.bu $a1, $t2, $a0 bge $a2, $a1, .LBB3_158 # %bb.113: # in Loop: Header=BB3_102 Depth=4 - ldx.bu $a0, $s8, $a0 + ldx.bu $a0, $t3, $a0 bge $a2, $a0, .LBB3_158 # %bb.114: # in Loop: Header=BB3_102 Depth=4 srai.d $a0, $t1, 63 @@ -1534,22 +1505,22 @@ CheckReliabilityOfRef: # @CheckReliabilityOfRef maskeqz $a0, $a0, $a1 masknez $a1, $a4, $a1 or $a0, $a0, $a1 - ldx.bu $a1, $s6, $a0 + ldx.bu $a1, $ra, $a0 bge $a2, $a1, .LBB3_158 # %bb.115: # in Loop: Header=BB3_102 Depth=4 ldx.bu $a1, $a7, $a0 bge $a2, $a1, .LBB3_158 # %bb.116: # in Loop: Header=BB3_102 Depth=4 - ldx.bu $a1, $ra, $a0 + ldx.bu $a1, $s6, $a0 bge $a2, $a1, .LBB3_158 # %bb.117: # in Loop: Header=BB3_102 Depth=4 - ldx.bu $a1, $t2, $a0 + ldx.bu $a1, $s7, $a0 bge $a2, $a1, .LBB3_158 # %bb.118: # in Loop: Header=BB3_102 Depth=4 - ldx.bu $a1, $t3, $a0 + ldx.bu $a1, $t2, $a0 bge $a2, $a1, .LBB3_158 # %bb.119: # in Loop: Header=BB3_102 Depth=4 - ldx.bu $a0, $s8, $a0 + ldx.bu $a0, $t3, $a0 bge $a2, $a0, .LBB3_158 # %bb.120: # in Loop: Header=BB3_102 Depth=4 addi.w $a0, $t1, 1 @@ -1559,22 +1530,22 @@ CheckReliabilityOfRef: # @CheckReliabilityOfRef maskeqz $a0, $a0, $a1 masknez $a1, $a4, $a1 or $a0, $a0, $a1 - ldx.bu $a1, $s6, $a0 + ldx.bu $a1, $ra, $a0 bge $a2, $a1, .LBB3_158 # %bb.121: # in Loop: Header=BB3_102 Depth=4 ldx.bu $a1, $a7, $a0 bge $a2, $a1, .LBB3_158 # %bb.122: # in Loop: Header=BB3_102 Depth=4 - ldx.bu $a1, $ra, $a0 + ldx.bu $a1, $s6, $a0 bge $a2, $a1, .LBB3_158 # %bb.123: # in Loop: Header=BB3_102 Depth=4 - ldx.bu $a1, $t2, $a0 + ldx.bu $a1, $s7, $a0 bge $a2, $a1, .LBB3_158 # %bb.124: # in Loop: Header=BB3_102 Depth=4 - ldx.bu $a1, $t3, $a0 + ldx.bu $a1, $t2, $a0 bge $a2, $a1, .LBB3_158 # %bb.125: # in Loop: Header=BB3_102 Depth=4 - ldx.bu $a0, $s8, $a0 + ldx.bu $a0, $t3, $a0 bge $a2, $a0, .LBB3_158 # %bb.126: # in Loop: Header=BB3_102 Depth=4 slt $a0, $fp, $t1 @@ -1586,76 +1557,84 @@ CheckReliabilityOfRef: # @CheckReliabilityOfRef maskeqz $a0, $a0, $a1 masknez $a1, $a4, $a1 or $a0, $a0, $a1 - ldx.bu $a1, $s6, $a0 + ldx.bu $a1, $ra, $a0 bge $a2, $a1, .LBB3_158 # %bb.127: # in Loop: Header=BB3_102 Depth=4 ldx.bu $a1, $a7, $a0 bge $a2, $a1, .LBB3_158 # %bb.128: # in Loop: Header=BB3_102 Depth=4 - ldx.bu $a1, $ra, $a0 + ldx.bu $a1, $s6, $a0 bge $a2, $a1, .LBB3_158 # %bb.129: # in Loop: Header=BB3_102 Depth=4 - ldx.bu $a1, $t2, $a0 + ldx.bu $a1, $s7, $a0 bge $a2, $a1, .LBB3_158 # %bb.130: # in Loop: Header=BB3_102 Depth=4 - ldx.bu $a1, $t3, $a0 + ldx.bu $a1, $t2, $a0 bge $a2, $a1, .LBB3_158 # %bb.131: # in Loop: Header=BB3_102 Depth=4 - ldx.bu $a0, $s8, $a0 + ldx.bu $a0, $t3, $a0 bge $a2, $a0, .LBB3_158 # %bb.132: # in Loop: Header=BB3_102 Depth=4 - ld.d $a1, $sp, 32 # 8-byte Folded Reload - slt $a0, $a1, $t1 - masknez $a1, $a1, $a0 - maskeqz $a0, $t1, $a0 - or $a0, $a0, $a1 + addi.w $a0, $zero, -3 + slt $a1, $a0, $t1 + masknez $a0, $a0, $a1 + maskeqz $a1, $t1, $a1 + or $a0, $a1, $a0 addi.w $a0, $a0, 3 slt $a1, $a0, $a4 maskeqz $a0, $a0, $a1 masknez $a1, $a4, $a1 or $a0, $a0, $a1 - ldx.bu $a1, $s6, $a0 + ldx.bu $a1, $ra, $a0 bge $a2, $a1, .LBB3_158 # %bb.133: # in Loop: Header=BB3_102 Depth=4 ldx.bu $a1, $a7, $a0 bge $a2, $a1, .LBB3_158 # %bb.134: # in Loop: Header=BB3_102 Depth=4 - ldx.bu $a1, $ra, $a0 + ldx.bu $a1, $s6, $a0 bge $a2, $a1, .LBB3_158 # %bb.135: # in Loop: Header=BB3_102 Depth=4 - ldx.bu $a1, $t2, $a0 + ldx.bu $a1, $s7, $a0 bge $a2, $a1, .LBB3_158 # %bb.136: # in Loop: Header=BB3_102 Depth=4 - ldx.bu $a1, $t3, $a0 + ldx.bu $a1, $t2, $a0 bge $a2, $a1, .LBB3_158 # %bb.137: # in Loop: Header=BB3_102 Depth=4 - ldx.bu $a0, $s8, $a0 + ldx.bu $a0, $t3, $a0 bge $a2, $a0, .LBB3_158 # %bb.138: # in Loop: Header=BB3_102 Depth=4 - addi.w $s8, $t4, 1 - ld.d $t4, $sp, 24 # 8-byte Folded Reload - ld.d $ra, $sp, 16 # 8-byte Folded Reload - bne $s8, $t7, .LBB3_102 + addi.w $t7, $t7, 1 + ori $a0, $zero, 4 + ld.d $t3, $sp, 64 # 8-byte Folded Reload + bne $t7, $a0, .LBB3_102 # %bb.139: # in Loop: Header=BB3_101 Depth=3 - ld.d $a0, $sp, 8 # 8-byte Folded Reload - bne $s7, $a0, .LBB3_101 + ori $s8, $zero, 4 + ld.d $ra, $sp, 72 # 8-byte Folded Reload + ld.d $a1, $sp, 16 # 8-byte Folded Reload + ori $a3, $zero, 1 + ld.d $a0, $sp, 48 # 8-byte Folded Reload + ld.d $a7, $sp, 56 # 8-byte Folded Reload + ld.d $t7, $sp, 8 # 8-byte Folded Reload + bne $a7, $a0, .LBB3_101 b .LBB3_19 .LBB3_140: # %.preheader228 # in Loop: Header=BB3_2 Depth=2 addi.d $a0, $a0, -1 sltu $a0, $zero, $a0 - addi.d $a1, $s5, 3 - st.d $a1, $sp, 32 # 8-byte Folded Spill - addi.d $a1, $s4, 3 - st.d $a1, $sp, 16 # 8-byte Folded Spill - st.d $a0, $sp, 24 # 8-byte Folded Spill + addi.d $a3, $t4, 3 + st.d $a3, $sp, 40 # 8-byte Folded Spill + addi.d $a3, $s4, 3 + st.d $a3, $sp, 24 # 8-byte Folded Spill + st.d $a0, $sp, 32 # 8-byte Folded Spill .LBB3_141: # %.preheader217 # Parent Loop BB3_1 Depth=1 # Parent Loop BB3_2 Depth=2 # => This Loop Header: Depth=3 # Child Loop BB3_142 Depth 4 + ori $a7, $zero, 1 + move $a3, $a1 add.w $a0, $s4, $a0 - addi.w $ra, $s4, 0 + addi.w $t1, $s4, 0 srai.d $a1, $a0, 63 andn $a0, $a0, $a1 slt $a1, $a0, $a5 @@ -1663,10 +1642,10 @@ CheckReliabilityOfRef: # @CheckReliabilityOfRef masknez $a1, $a5, $a1 or $a0, $a0, $a1 slli.d $a0, $a0, 3 - ldx.d $a7, $t5, $a0 - slt $a0, $a6, $ra + ldx.d $ra, $t5, $a0 + slt $a0, $a6, $t1 masknez $a1, $a6, $a0 - maskeqz $a0, $ra, $a0 + maskeqz $a0, $t1, $a0 or $a0, $a0, $a1 addi.w $a0, $a0, -2 slt $a1, $a0, $a5 @@ -1674,10 +1653,10 @@ CheckReliabilityOfRef: # @CheckReliabilityOfRef masknez $a1, $a5, $a1 or $a0, $a0, $a1 alsl.d $a0, $a0, $t5, 3 - st.d $a0, $sp, 56 # 8-byte Folded Spill - slt $a0, $s0, $ra - masknez $a1, $s0, $a0 - maskeqz $a0, $ra, $a0 + st.d $a0, $sp, 80 # 8-byte Folded Spill + slt $a0, $a7, $t1 + masknez $a1, $a7, $a0 + maskeqz $a0, $t1, $a0 or $a0, $a0, $a1 addi.d $a0, $a0, -1 slt $a1, $a0, $a5 @@ -1685,15 +1664,15 @@ CheckReliabilityOfRef: # @CheckReliabilityOfRef masknez $a1, $a5, $a1 or $a0, $a0, $a1 alsl.d $a0, $a0, $t5, 3 - st.d $a0, $sp, 48 # 8-byte Folded Spill - srai.d $a0, $ra, 63 - andn $a0, $ra, $a0 + st.d $a0, $sp, 64 # 8-byte Folded Spill + srai.d $a0, $t1, 63 + andn $a0, $t1, $a0 slt $a1, $a0, $a5 maskeqz $a0, $a0, $a1 masknez $a1, $a5, $a1 or $a0, $a0, $a1 alsl.d $a0, $a0, $t5, 3 - st.d $a0, $sp, 40 # 8-byte Folded Spill + st.d $a0, $sp, 56 # 8-byte Folded Spill addi.w $s4, $s4, 1 srai.d $a0, $s4, 63 andn $a0, $s4, $a0 @@ -1701,10 +1680,11 @@ CheckReliabilityOfRef: # @CheckReliabilityOfRef maskeqz $a0, $a0, $a1 masknez $a1, $a5, $a1 or $a0, $a0, $a1 - alsl.d $t1, $a0, $t5, 3 - slt $a0, $fp, $ra + alsl.d $a0, $a0, $t5, 3 + st.d $a0, $sp, 48 # 8-byte Folded Spill + slt $a0, $fp, $t1 masknez $a1, $fp, $a0 - maskeqz $a0, $ra, $a0 + maskeqz $a0, $t1, $a0 or $a0, $a0, $a1 addi.w $a0, $a0, 2 slt $a1, $a0, $a5 @@ -1713,145 +1693,149 @@ CheckReliabilityOfRef: # @CheckReliabilityOfRef or $a0, $a0, $a1 alsl.d $t2, $a0, $t5, 3 addi.w $t3, $zero, -3 - slt $a0, $t3, $ra + slt $a0, $t3, $t1 masknez $a1, $t3, $a0 - maskeqz $a0, $ra, $a0 + maskeqz $a0, $t1, $a0 or $a0, $a0, $a1 addi.w $a0, $a0, 3 slt $a1, $a0, $a5 maskeqz $a0, $a0, $a1 masknez $a1, $a5, $a1 or $a0, $a0, $a1 - alsl.d $s7, $a0, $t5, 3 - move $a0, $s5 + alsl.d $a7, $a0, $t5, 3 + move $s5, $t4 .LBB3_142: # Parent Loop BB3_1 Depth=1 # Parent Loop BB3_2 Depth=2 # Parent Loop BB3_141 Depth=3 # => This Inner Loop Header: Depth=4 - addi.w $a1, $a0, 0 - slt $t4, $a6, $a1 - masknez $t7, $a6, $t4 - maskeqz $t4, $a1, $t4 - or $t4, $t4, $t7 - addi.w $t4, $t4, -2 - slt $t7, $t4, $a4 - maskeqz $t4, $t4, $t7 - masknez $t7, $a4, $t7 - or $t4, $t4, $t7 - ldx.bu $t4, $a7, $t4 - bge $a2, $t4, .LBB3_158 + addi.w $s7, $s5, 0 + slt $a0, $a6, $s7 + masknez $a1, $a6, $a0 + maskeqz $a0, $s7, $a0 + or $a0, $a0, $a1 + addi.w $a0, $a0, -2 + slt $a1, $a0, $a4 + maskeqz $a0, $a0, $a1 + masknez $a1, $a4, $a1 + or $a0, $a0, $a1 + ldx.bu $a0, $ra, $a0 + bge $a2, $a0, .LBB3_158 # %bb.143: # in Loop: Header=BB3_142 Depth=4 - slt $t4, $s0, $a1 - masknez $t7, $s0, $t4 - maskeqz $t4, $a1, $t4 - or $t4, $t4, $t7 - addi.d $t4, $t4, -1 - slt $t7, $t4, $a4 - maskeqz $t4, $t4, $t7 - masknez $t7, $a4, $t7 - or $t4, $t4, $t7 - ldx.bu $t4, $a7, $t4 - bge $a2, $t4, .LBB3_158 + ori $a1, $zero, 1 + slt $a0, $a1, $s7 + masknez $a1, $a1, $a0 + maskeqz $a0, $s7, $a0 + or $a0, $a0, $a1 + addi.d $a0, $a0, -1 + slt $a1, $a0, $a4 + maskeqz $a0, $a0, $a1 + masknez $a1, $a4, $a1 + or $a0, $a0, $a1 + ldx.bu $a0, $ra, $a0 + bge $a2, $a0, .LBB3_158 # %bb.144: # in Loop: Header=BB3_142 Depth=4 - srai.d $t4, $a1, 63 - andn $t4, $a1, $t4 - slt $t7, $t4, $a4 - maskeqz $t4, $t4, $t7 - masknez $t7, $a4, $t7 - or $t4, $t4, $t7 - ldx.bu $t4, $a7, $t4 - bge $a2, $t4, .LBB3_158 + srai.d $a0, $s7, 63 + andn $a0, $s7, $a0 + slt $a1, $a0, $a4 + maskeqz $a0, $a0, $a1 + masknez $a1, $a4, $a1 + or $a0, $a0, $a1 + ldx.bu $a0, $ra, $a0 + bge $a2, $a0, .LBB3_158 # %bb.145: # in Loop: Header=BB3_142 Depth=4 - addi.w $t4, $a0, 1 - srai.d $t7, $t4, 63 - andn $t7, $t4, $t7 - slt $s8, $t7, $a4 - maskeqz $t7, $t7, $s8 + addi.w $a0, $s5, 1 + srai.d $a1, $a0, 63 + andn $a1, $a0, $a1 + slt $s8, $a1, $a4 + maskeqz $a1, $a1, $s8 masknez $s8, $a4, $s8 - or $t7, $t7, $s8 - ldx.bu $t7, $a7, $t7 - bge $a2, $t7, .LBB3_158 + or $a1, $a1, $s8 + ldx.bu $a1, $ra, $a1 + bge $a2, $a1, .LBB3_158 # %bb.146: # in Loop: Header=BB3_142 Depth=4 - slt $t7, $fp, $a1 - masknez $s8, $fp, $t7 - maskeqz $t7, $a1, $t7 - or $t7, $t7, $s8 - addi.w $t7, $t7, 2 - slt $s8, $t7, $a4 - maskeqz $t7, $t7, $s8 + slt $a1, $fp, $s7 + masknez $s8, $fp, $a1 + maskeqz $a1, $s7, $a1 + or $a1, $a1, $s8 + addi.w $a1, $a1, 2 + slt $s8, $a1, $a4 + maskeqz $a1, $a1, $s8 masknez $s8, $a4, $s8 - or $t7, $t7, $s8 - ldx.bu $t7, $a7, $t7 - bge $a2, $t7, .LBB3_158 + or $a1, $a1, $s8 + ldx.bu $a1, $ra, $a1 + bge $a2, $a1, .LBB3_158 # %bb.147: # in Loop: Header=BB3_142 Depth=4 - slt $t7, $t3, $a1 - masknez $s8, $t3, $t7 - maskeqz $a1, $a1, $t7 + slt $a1, $t3, $s7 + masknez $s8, $t3, $a1 + maskeqz $a1, $s7, $a1 or $a1, $a1, $s8 addi.w $a1, $a1, 3 - slt $t7, $a1, $a4 - maskeqz $a1, $a1, $t7 - masknez $t7, $a4, $t7 - or $a1, $a1, $t7 - ldx.bu $a1, $a7, $a1 + slt $s7, $a1, $a4 + maskeqz $a1, $a1, $s7 + masknez $s7, $a4, $s7 + or $a1, $a1, $s7 + ldx.bu $a1, $ra, $a1 bge $a2, $a1, .LBB3_158 # %bb.148: # in Loop: Header=BB3_142 Depth=4 addi.d $a1, $s6, -1 sltui $a1, $a1, 1 - masknez $t7, $t4, $a1 - maskeqz $a0, $a0, $a1 - or $a0, $a0, $t7 - addi.w $a0, $a0, 0 - srai.d $a1, $a0, 63 - andn $a0, $a0, $a1 - slt $a1, $a0, $a4 - ld.d $t7, $sp, 56 # 8-byte Folded Reload - ld.d $t7, $t7, 0 - maskeqz $a0, $a0, $a1 - masknez $a1, $a4, $a1 - or $a0, $a0, $a1 - ldx.bu $a1, $t7, $a0 + masknez $s7, $a0, $a1 + maskeqz $a1, $s5, $a1 + or $a1, $a1, $s7 + addi.w $a1, $a1, 0 + srai.d $s5, $a1, 63 + andn $a1, $a1, $s5 + slt $s5, $a1, $a4 + ld.d $s7, $sp, 80 # 8-byte Folded Reload + ld.d $s7, $s7, 0 + maskeqz $a1, $a1, $s5 + masknez $s5, $a4, $s5 + or $s5, $a1, $s5 + ldx.bu $a1, $s7, $s5 bge $a2, $a1, .LBB3_158 # %bb.149: # in Loop: Header=BB3_142 Depth=4 - ld.d $a1, $sp, 48 # 8-byte Folded Reload + ld.d $a1, $sp, 64 # 8-byte Folded Reload ld.d $a1, $a1, 0 - ldx.bu $a1, $a1, $a0 + ldx.bu $a1, $a1, $s5 bge $a2, $a1, .LBB3_158 # %bb.150: # in Loop: Header=BB3_142 Depth=4 - ld.d $a1, $sp, 40 # 8-byte Folded Reload + ld.d $a1, $sp, 56 # 8-byte Folded Reload ld.d $a1, $a1, 0 - ldx.bu $a1, $a1, $a0 + ldx.bu $a1, $a1, $s5 bge $a2, $a1, .LBB3_158 # %bb.151: # in Loop: Header=BB3_142 Depth=4 - ld.d $a1, $t1, 0 - ldx.bu $a1, $a1, $a0 + ld.d $a1, $sp, 48 # 8-byte Folded Reload + ld.d $a1, $a1, 0 + ldx.bu $a1, $a1, $s5 bge $a2, $a1, .LBB3_158 # %bb.152: # in Loop: Header=BB3_142 Depth=4 ld.d $a1, $t2, 0 - ldx.bu $a1, $a1, $a0 + ldx.bu $a1, $a1, $s5 bge $a2, $a1, .LBB3_158 # %bb.153: # in Loop: Header=BB3_142 Depth=4 - ld.d $a1, $s7, 0 - ldx.bu $a0, $a1, $a0 - bge $a2, $a0, .LBB3_158 + ld.d $a1, $a7, 0 + ldx.bu $a1, $a1, $s5 + bge $a2, $a1, .LBB3_158 # %bb.154: # %.loopexit209 # in Loop: Header=BB3_142 Depth=4 - addi.w $a1, $t4, -1 - move $a0, $t4 - ld.d $t4, $sp, 32 # 8-byte Folded Reload - bne $a1, $t4, .LBB3_142 + addi.w $a1, $a0, -1 + move $s5, $a0 + ld.d $a0, $sp, 40 # 8-byte Folded Reload + bne $a1, $a0, .LBB3_142 # %bb.155: # in Loop: Header=BB3_141 Depth=3 - ori $t7, $zero, 4 - ld.d $a0, $sp, 24 # 8-byte Folded Reload - ld.d $a1, $sp, 16 # 8-byte Folded Reload - bne $ra, $a1, .LBB3_141 + ori $s8, $zero, 4 + ld.d $ra, $sp, 72 # 8-byte Folded Reload + move $a1, $a3 + ori $a3, $zero, 1 + ld.d $a0, $sp, 32 # 8-byte Folded Reload + ld.d $a7, $sp, 24 # 8-byte Folded Reload + bne $t1, $a7, .LBB3_141 b .LBB3_19 .LBB3_156: # in Loop: Header=BB3_1 Depth=1 addi.d $t6, $t6, 1 - ld.d $a0, $sp, 80 # 8-byte Folded Reload - ld.d $t1, $sp, 72 # 8-byte Folded Reload - ld.d $t2, $sp, 64 # 8-byte Folded Reload - blt $t6, $a0, .LBB3_1 + ld.d $t1, $sp, 96 # 8-byte Folded Reload + ld.d $t2, $sp, 88 # 8-byte Folded Reload + blt $t6, $ra, .LBB3_1 # %bb.157: ori $a0, $zero, 1 b .LBB3_159 diff --git a/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/slice.s b/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/slice.s index 88a93839..f231e468 100644 --- a/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/slice.s +++ b/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/slice.s @@ -1114,49 +1114,37 @@ encode_one_slice: # @encode_one_slice ld.d $a0, $a0, %got_pc_lo12(writeB8_typeInfo) pcalau12i $a1, %got_pc_hi20(writeB8_typeInfo_CABAC) ld.d $a1, $a1, %got_pc_lo12(writeB8_typeInfo_CABAC) - ld.w $a2, $s3, 0 + vld $vr0, $s3, 0 st.d $a1, $a0, 0 - sltui $a0, $a2, 2 + vslti.wu $vr0, $vr0, 2 + vshuf4i.w $vr1, $vr0, 50 + vslli.d $vr1, $vr1, 32 + vsrai.d $vr1, $vr1, 32 + vshuf4i.w $vr0, $vr0, 16 + vslli.d $vr0, $vr0, 32 + vsrai.d $vr0, $vr0, 32 + pcalau12i $a0, %got_pc_hi20(writeSE_Dummy) + ld.d $a0, $a0, %got_pc_lo12(writeSE_Dummy) + vreplgr2vr.d $vr2, $a0 pcalau12i $a1, %got_pc_hi20(writeRefFrame_CABAC) ld.d $a1, $a1, %got_pc_lo12(writeRefFrame_CABAC) - masknez $a2, $a1, $a0 - pcalau12i $a3, %got_pc_hi20(writeSE_Dummy) - ld.d $a3, $a3, %got_pc_lo12(writeSE_Dummy) - maskeqz $a0, $a3, $a0 - or $a0, $a0, $a2 + vreplgr2vr.d $vr3, $a1 + vbitsel.v $vr0, $vr3, $vr2, $vr0 + vbitsel.v $vr1, $vr3, $vr2, $vr1 pcalau12i $a2, %got_pc_hi20(writeRefFrame) ld.d $a2, $a2, %got_pc_lo12(writeRefFrame) - ld.w $a4, $s3, 4 - st.d $a0, $a2, 0 - sltui $a0, $a4, 2 - masknez $a4, $a1, $a0 - vld $vr0, $s3, 8 - maskeqz $a0, $a3, $a0 - or $a0, $a0, $a4 - st.d $a0, $a2, 8 - vslti.wu $vr0, $vr0, 2 - vpickve2gr.w $a0, $vr0, 0 - andi $a0, $a0, 1 - masknez $a4, $a1, $a0 - maskeqz $a0, $a3, $a0 - or $a0, $a0, $a4 - st.d $a0, $a2, 16 - vpickve2gr.w $a0, $vr0, 1 - andi $a0, $a0, 1 - masknez $a4, $a1, $a0 - maskeqz $a0, $a3, $a0 - or $a0, $a0, $a4 - st.d $a0, $a2, 24 - vpickve2gr.w $a0, $vr0, 2 - andi $a0, $a0, 1 - masknez $a4, $a1, $a0 - maskeqz $a0, $a3, $a0 - or $a0, $a0, $a4 - st.d $a0, $a2, 32 - vpickve2gr.w $a0, $vr0, 3 - andi $a0, $a0, 1 - masknez $a1, $a1, $a0 - maskeqz $a0, $a3, $a0 + ld.w $a3, $s3, 16 + vst $vr1, $a2, 16 + vst $vr0, $a2, 0 + sltui $a3, $a3, 2 + masknez $a4, $a1, $a3 + ld.w $a5, $s3, 20 + maskeqz $a3, $a0, $a3 + or $a3, $a3, $a4 + st.d $a3, $a2, 32 + sltui $a2, $a5, 2 + masknez $a1, $a1, $a2 + maskeqz $a0, $a0, $a2 or $a4, $a0, $a1 pcalau12i $a0, %got_pc_hi20(writeMB_transform_size_CABAC) ld.d $a1, $a0, %got_pc_lo12(writeMB_transform_size_CABAC) diff --git a/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/transform8x8.s b/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/transform8x8.s index 0d38b7f8..690b3da8 100644 --- a/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/transform8x8.s +++ b/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/transform8x8.s @@ -1623,19 +1623,19 @@ Mode_Decision_for_new_8x8IntraBlocks: # @Mode_Decision_for_new_8x8IntraBlocks .type intrapred_luma8x8,@function intrapred_luma8x8: # @intrapred_luma8x8 # %bb.0: - addi.d $sp, $sp, -448 - st.d $ra, $sp, 440 # 8-byte Folded Spill - st.d $fp, $sp, 432 # 8-byte Folded Spill - st.d $s0, $sp, 424 # 8-byte Folded Spill - st.d $s1, $sp, 416 # 8-byte Folded Spill - st.d $s2, $sp, 408 # 8-byte Folded Spill - st.d $s3, $sp, 400 # 8-byte Folded Spill - st.d $s4, $sp, 392 # 8-byte Folded Spill - st.d $s5, $sp, 384 # 8-byte Folded Spill - st.d $s6, $sp, 376 # 8-byte Folded Spill - st.d $s7, $sp, 368 # 8-byte Folded Spill - st.d $s8, $sp, 360 # 8-byte Folded Spill - st.d $a4, $sp, 72 # 8-byte Folded Spill + addi.d $sp, $sp, -432 + st.d $ra, $sp, 424 # 8-byte Folded Spill + st.d $fp, $sp, 416 # 8-byte Folded Spill + st.d $s0, $sp, 408 # 8-byte Folded Spill + st.d $s1, $sp, 400 # 8-byte Folded Spill + st.d $s2, $sp, 392 # 8-byte Folded Spill + st.d $s3, $sp, 384 # 8-byte Folded Spill + st.d $s4, $sp, 376 # 8-byte Folded Spill + st.d $s5, $sp, 368 # 8-byte Folded Spill + st.d $s6, $sp, 360 # 8-byte Folded Spill + st.d $s7, $sp, 352 # 8-byte Folded Spill + st.d $s8, $sp, 344 # 8-byte Folded Spill + st.d $a4, $sp, 56 # 8-byte Folded Spill move $s3, $a3 move $s4, $a2 pcalau12i $a2, %got_pc_hi20(enc_picture) @@ -1651,7 +1651,7 @@ intrapred_luma8x8: # @intrapred_luma8x8 pcalau12i $s8, %pc_hi20(getNeighbour) ld.d $a5, $s8, %pc_lo12(getNeighbour) addi.d $s5, $s1, -1 - addi.d $a4, $sp, 168 + addi.d $a4, $sp, 152 move $a0, $s2 move $a1, $s5 move $a2, $s0 @@ -1659,56 +1659,56 @@ intrapred_luma8x8: # @intrapred_luma8x8 jirl $ra, $a5, 0 ld.d $a5, $s8, %pc_lo12(getNeighbour) addi.d $a2, $s0, 1 - addi.d $a4, $sp, 192 + addi.d $a4, $sp, 176 move $a0, $s2 move $a1, $s5 move $a3, $zero jirl $ra, $a5, 0 ld.d $a5, $s8, %pc_lo12(getNeighbour) addi.d $a2, $s0, 2 - addi.d $a4, $sp, 216 + addi.d $a4, $sp, 200 move $a0, $s2 move $a1, $s5 move $a3, $zero jirl $ra, $a5, 0 ld.d $a5, $s8, %pc_lo12(getNeighbour) addi.d $a2, $s0, 3 - addi.d $a4, $sp, 240 + addi.d $a4, $sp, 224 move $a0, $s2 move $a1, $s5 move $a3, $zero jirl $ra, $a5, 0 ld.d $a5, $s8, %pc_lo12(getNeighbour) addi.d $a2, $s0, 4 - addi.d $a4, $sp, 264 + addi.d $a4, $sp, 248 move $a0, $s2 move $a1, $s5 move $a3, $zero jirl $ra, $a5, 0 ld.d $a5, $s8, %pc_lo12(getNeighbour) addi.d $a2, $s0, 5 - addi.d $a4, $sp, 288 + addi.d $a4, $sp, 272 move $a0, $s2 move $a1, $s5 move $a3, $zero jirl $ra, $a5, 0 ld.d $a5, $s8, %pc_lo12(getNeighbour) addi.d $a2, $s0, 6 - addi.d $a4, $sp, 312 + addi.d $a4, $sp, 296 move $a0, $s2 move $a1, $s5 move $a3, $zero jirl $ra, $a5, 0 ld.d $a5, $s8, %pc_lo12(getNeighbour) addi.d $a2, $s0, 7 - addi.d $a4, $sp, 336 + addi.d $a4, $sp, 320 move $a0, $s2 move $a1, $s5 move $a3, $zero jirl $ra, $a5, 0 ld.d $a5, $s8, %pc_lo12(getNeighbour) addi.d $s6, $s0, -1 - addi.d $a4, $sp, 144 + addi.d $a4, $sp, 128 move $a0, $s2 move $a1, $s1 move $a2, $s6 @@ -1716,19 +1716,19 @@ intrapred_luma8x8: # @intrapred_luma8x8 jirl $ra, $a5, 0 ld.d $a5, $s8, %pc_lo12(getNeighbour) addi.d $a1, $s1, 8 - addi.d $a4, $sp, 120 + addi.d $a4, $sp, 104 move $a0, $s2 move $a2, $s6 move $a3, $zero jirl $ra, $a5, 0 ld.d $a5, $s8, %pc_lo12(getNeighbour) - addi.d $a4, $sp, 96 + addi.d $a4, $sp, 80 move $a0, $s2 move $a1, $s5 move $a2, $s6 move $a3, $zero jirl $ra, $a5, 0 - ld.w $a0, $sp, 120 + ld.w $a0, $sp, 104 sltu $a0, $zero, $a0 addi.d $a1, $s1, -8 sltu $a1, $zero, $a1 @@ -1736,71 +1736,71 @@ intrapred_luma8x8: # @intrapred_luma8x8 sltu $a2, $zero, $a2 or $a1, $a1, $a2 and $a0, $a1, $a0 - st.w $a0, $sp, 120 + st.w $a0, $sp, 104 pcalau12i $a1, %got_pc_hi20(input) ld.d $a1, $a1, %got_pc_lo12(input) ld.d $a1, $a1, 0 ld.w $a1, $a1, 272 beqz $a1, .LBB2_6 # %bb.1: # %.preheader544 - ld.w $a2, $sp, 168 + ld.w $a2, $sp, 152 ld.d $a1, $fp, 0 move $a7, $fp beqz $a2, .LBB2_7 # %bb.2: - ld.w $a2, $sp, 172 + ld.w $a2, $sp, 156 ldptr.d $a3, $a1, 14240 slli.d $a2, $a2, 2 ldx.wu $a2, $a3, $a2 andi $a2, $a2, 1 - ld.w $a3, $sp, 192 + ld.w $a3, $sp, 176 beqz $a3, .LBB2_8 .LBB2_3: - ld.w $a3, $sp, 196 + ld.w $a3, $sp, 180 ldptr.d $a4, $a1, 14240 slli.d $a3, $a3, 2 ldx.w $a4, $a4, $a3 - ld.w $a3, $sp, 216 + ld.w $a3, $sp, 200 beqz $a3, .LBB2_9 .LBB2_4: - ld.w $a3, $sp, 220 + ld.w $a3, $sp, 204 ldptr.d $a5, $a1, 14240 slli.d $a3, $a3, 2 ldx.w $a5, $a5, $a3 - ld.w $a3, $sp, 240 + ld.w $a3, $sp, 224 beqz $a3, .LBB2_10 .LBB2_5: - ld.w $a3, $sp, 244 + ld.w $a3, $sp, 228 ldptr.d $a6, $a1, 14240 slli.d $a3, $a3, 2 ldx.w $a3, $a6, $a3 b .LBB2_11 .LBB2_6: - ld.w $a3, $sp, 168 - ld.w $s0, $sp, 144 - ld.w $a5, $sp, 96 + ld.w $a3, $sp, 152 + ld.w $s2, $sp, 128 + ld.w $a5, $sp, 80 move $a7, $fp b .LBB2_31 .LBB2_7: move $a2, $zero - ld.w $a3, $sp, 192 + ld.w $a3, $sp, 176 bnez $a3, .LBB2_3 .LBB2_8: move $a4, $zero - ld.w $a3, $sp, 216 + ld.w $a3, $sp, 200 bnez $a3, .LBB2_4 .LBB2_9: move $a5, $zero - ld.w $a3, $sp, 240 + ld.w $a3, $sp, 224 bnez $a3, .LBB2_5 .LBB2_10: move $a3, $zero .LBB2_11: - ld.w $a6, $sp, 264 + ld.w $a6, $sp, 248 and $a2, $a4, $a2 beqz $a6, .LBB2_13 # %bb.12: - ld.w $a4, $sp, 268 + ld.w $a4, $sp, 252 ldptr.d $a6, $a1, 14240 slli.d $a4, $a4, 2 ldx.w $a4, $a6, $a4 @@ -1808,11 +1808,11 @@ intrapred_luma8x8: # @intrapred_luma8x8 .LBB2_13: move $a4, $zero .LBB2_14: - ld.w $a6, $sp, 288 + ld.w $a6, $sp, 272 and $a5, $a5, $a2 beqz $a6, .LBB2_16 # %bb.15: - ld.w $a2, $sp, 292 + ld.w $a2, $sp, 276 ldptr.d $a6, $a1, 14240 slli.d $a2, $a2, 2 ldx.w $a2, $a6, $a2 @@ -1820,11 +1820,11 @@ intrapred_luma8x8: # @intrapred_luma8x8 .LBB2_16: move $a2, $zero .LBB2_17: - ld.w $a6, $sp, 312 + ld.w $a6, $sp, 296 and $a5, $a3, $a5 beqz $a6, .LBB2_19 # %bb.18: - ld.w $a3, $sp, 316 + ld.w $a3, $sp, 300 ldptr.d $a6, $a1, 14240 slli.d $a3, $a3, 2 ldx.w $a3, $a6, $a3 @@ -1832,11 +1832,11 @@ intrapred_luma8x8: # @intrapred_luma8x8 .LBB2_19: move $a3, $zero .LBB2_20: - ld.w $a6, $sp, 336 + ld.w $a6, $sp, 320 and $a5, $a4, $a5 beqz $a6, .LBB2_22 # %bb.21: - ld.w $a4, $sp, 340 + ld.w $a4, $sp, 324 ldptr.d $a6, $a1, 14240 slli.d $a4, $a4, 2 ldx.w $a4, $a6, $a4 @@ -1844,34 +1844,34 @@ intrapred_luma8x8: # @intrapred_luma8x8 .LBB2_22: move $a4, $zero .LBB2_23: - ld.w $a6, $sp, 144 + ld.w $a6, $sp, 128 and $a2, $a2, $a5 beqz $a6, .LBB2_26 # %bb.24: - ld.w $a5, $sp, 148 + ld.w $a5, $sp, 132 ldptr.d $a6, $a1, 14240 slli.d $a5, $a5, 2 - ldx.w $s0, $a6, $a5 + ldx.w $s2, $a6, $a5 and $a2, $a3, $a2 beqz $a0, .LBB2_27 .LBB2_25: - ld.w $a0, $sp, 124 + ld.w $a0, $sp, 108 ldptr.d $a3, $a1, 14240 slli.d $a0, $a0, 2 ldx.w $a0, $a3, $a0 b .LBB2_28 .LBB2_26: - move $s0, $zero + move $s2, $zero and $a2, $a3, $a2 bnez $a0, .LBB2_25 .LBB2_27: move $a0, $zero .LBB2_28: - ld.w $a5, $sp, 96 + ld.w $a5, $sp, 80 and $a3, $a4, $a2 beqz $a5, .LBB2_30 # %bb.29: - ld.w $a2, $sp, 100 + ld.w $a2, $sp, 84 ldptr.d $a1, $a1, 14240 slli.d $a2, $a2, 2 ldx.w $a5, $a1, $a2 @@ -1880,22 +1880,22 @@ intrapred_luma8x8: # @intrapred_luma8x8 move $a5, $zero .LBB2_31: st.w $a3, $s4, 0 - st.w $s0, $s3, 0 - sltu $a1, $zero, $s0 + st.w $s2, $s3, 0 + sltu $a1, $zero, $s2 sltu $a2, $zero, $a3 and $a2, $a1, $a2 sltu $a1, $zero, $a5 - st.d $a2, $sp, 88 # 8-byte Folded Spill + st.d $a2, $sp, 72 # 8-byte Folded Spill and $a1, $a2, $a1 - ld.d $a2, $sp, 72 # 8-byte Folded Reload + ld.d $a2, $sp, 56 # 8-byte Folded Reload st.w $a1, $a2, 0 pcalau12i $a1, %pc_hi20(intrapred_luma8x8.PredPel) addi.d $fp, $a1, %pc_lo12(intrapred_luma8x8.PredPel) - beqz $s0, .LBB2_35 + beqz $s2, .LBB2_35 # %bb.32: - ld.w $a1, $sp, 164 + ld.w $a1, $sp, 148 slli.d $a1, $a1, 3 - ld.w $a2, $sp, 160 + ld.w $a2, $sp, 144 ldx.d $a1, $s7, $a1 slli.d $a4, $a2, 1 ldx.h $a4, $a1, $a4 @@ -1917,9 +1917,9 @@ intrapred_luma8x8: # @intrapred_luma8x8 st.h $a1, $fp, 16 beqz $a0, .LBB2_36 .LBB2_33: - ld.w $a0, $sp, 140 + ld.w $a0, $sp, 124 slli.d $a0, $a0, 3 - ld.w $a1, $sp, 136 + ld.w $a1, $sp, 120 ldx.d $a0, $s7, $a0 slli.d $a2, $a1, 1 ldx.h $a2, $a0, $a2 @@ -1941,57 +1941,57 @@ intrapred_luma8x8: # @intrapred_luma8x8 st.h $a0, $fp, 32 beqz $a3, .LBB2_37 .LBB2_34: - ld.w $a0, $sp, 188 + ld.w $a0, $sp, 172 slli.d $a0, $a0, 3 - ld.w $a1, $sp, 184 + ld.w $a1, $sp, 168 ldx.d $a0, $s7, $a0 - ld.w $a2, $sp, 212 + ld.w $a2, $sp, 196 slli.d $a1, $a1, 1 ldx.h $a0, $a0, $a1 slli.d $a1, $a2, 3 - ld.w $a2, $sp, 208 + ld.w $a2, $sp, 192 ldx.d $a1, $s7, $a1 - ld.w $a4, $sp, 236 + ld.w $a4, $sp, 220 st.h $a0, $fp, 34 slli.d $a0, $a2, 1 ldx.h $a0, $a1, $a0 slli.d $a1, $a4, 3 - ld.w $a2, $sp, 232 + ld.w $a2, $sp, 216 ldx.d $a1, $s7, $a1 - ld.w $a4, $sp, 260 + ld.w $a4, $sp, 244 st.h $a0, $fp, 36 slli.d $a0, $a2, 1 ldx.h $a0, $a1, $a0 slli.d $a1, $a4, 3 - ld.w $a2, $sp, 256 + ld.w $a2, $sp, 240 ldx.d $a1, $s7, $a1 - ld.w $a4, $sp, 284 + ld.w $a4, $sp, 268 st.h $a0, $fp, 38 slli.d $a0, $a2, 1 ldx.h $a0, $a1, $a0 slli.d $a1, $a4, 3 - ld.w $a2, $sp, 280 + ld.w $a2, $sp, 264 ldx.d $a1, $s7, $a1 - ld.w $a4, $sp, 308 + ld.w $a4, $sp, 292 st.h $a0, $fp, 40 slli.d $a0, $a2, 1 ldx.h $a0, $a1, $a0 slli.d $a1, $a4, 3 - ld.w $a2, $sp, 304 + ld.w $a2, $sp, 288 ldx.d $a1, $s7, $a1 - ld.w $a4, $sp, 332 + ld.w $a4, $sp, 316 st.h $a0, $fp, 42 slli.d $a0, $a2, 1 ldx.h $a0, $a1, $a0 slli.d $a1, $a4, 3 - ld.w $a2, $sp, 328 + ld.w $a2, $sp, 312 ldx.d $a1, $s7, $a1 - ld.w $a4, $sp, 356 + ld.w $a4, $sp, 340 st.h $a0, $fp, 44 slli.d $a0, $a2, 1 ldx.h $a0, $a1, $a0 slli.d $a1, $a4, 3 - ld.w $a2, $sp, 352 + ld.w $a2, $sp, 336 ldx.d $a1, $s7, $a1 st.h $a0, $fp, 46 slli.d $a0, $a2, 1 @@ -2012,14 +2012,8 @@ intrapred_luma8x8: # @intrapred_luma8x8 st.h $a1, $fp, 16 bnez $a0, .LBB2_33 .LBB2_36: - st.h $a1, $fp, 32 - st.h $a1, $fp, 30 - st.h $a1, $fp, 28 - st.h $a1, $fp, 26 - st.h $a1, $fp, 24 - st.h $a1, $fp, 22 - st.h $a1, $fp, 20 - st.h $a1, $fp, 18 + vreplgr2vr.h $vr0, $a1 + vst $vr0, $fp, 18 bnez $a3, .LBB2_34 .LBB2_37: ld.d $a0, $a7, 0 @@ -2035,12 +2029,12 @@ intrapred_luma8x8: # @intrapred_luma8x8 st.h $a0, $fp, 34 .LBB2_38: st.h $a0, $fp, 48 - st.d $a7, $sp, 80 # 8-byte Folded Spill + st.d $a7, $sp, 64 # 8-byte Folded Spill beqz $a5, .LBB2_40 # %bb.39: - ld.w $a0, $sp, 116 + ld.w $a0, $sp, 100 slli.d $a0, $a0, 3 - ld.w $a1, $sp, 112 + ld.w $a1, $sp, 96 ldx.d $a0, $s7, $a0 slli.d $a1, $a1, 1 ldx.hu $a0, $a0, $a1 @@ -2053,46 +2047,43 @@ intrapred_luma8x8: # @intrapred_luma8x8 ldx.h $a0, $s4, $a0 .LBB2_41: lu12i.w $s5, 1 - ori $a1, $s5, 3410 - add.d $s6, $s4, $a1 st.h $a0, $fp, 0 - ori $a1, $s5, 3280 + ori $s7, $s5, 3280 lu12i.w $a0, 15 - ori $s7, $a0, 4095 - st.d $a1, $sp, 64 # 8-byte Folded Spill - stx.h $s7, $s4, $a1 + ori $s8, $a0, 4095 + stx.h $s8, $s4, $s7 ori $a0, $s5, 3408 - st.d $a0, $sp, 72 # 8-byte Folded Spill - stx.h $s7, $s4, $a0 - ori $s8, $s5, 3536 - stx.h $s7, $s4, $s8 + st.d $a0, $sp, 56 # 8-byte Folded Spill + stx.h $s8, $s4, $a0 + ori $s0, $s5, 3536 + stx.h $s8, $s4, $s0 ori $a0, $s5, 3664 - st.d $a0, $sp, 40 # 8-byte Folded Spill - stx.h $s7, $s4, $a0 + st.d $a0, $sp, 32 # 8-byte Folded Spill + stx.h $s8, $s4, $a0 ori $a0, $s5, 3792 - st.d $a0, $sp, 16 # 8-byte Folded Spill - stx.h $s7, $s4, $a0 + st.d $a0, $sp, 8 # 8-byte Folded Spill + stx.h $s8, $s4, $a0 ori $a0, $s5, 3920 - st.d $a0, $sp, 24 # 8-byte Folded Spill - stx.h $s7, $s4, $a0 + st.d $a0, $sp, 16 # 8-byte Folded Spill + stx.h $s8, $s4, $a0 ori $a0, $s5, 4048 - st.d $a0, $sp, 32 # 8-byte Folded Spill - stx.h $s7, $s4, $a0 + st.d $a0, $sp, 24 # 8-byte Folded Spill + stx.h $s8, $s4, $a0 lu12i.w $s3, 2 ori $a0, $s3, 80 - st.d $a0, $sp, 48 # 8-byte Folded Spill - stx.h $s7, $s4, $a0 + st.d $a0, $sp, 40 # 8-byte Folded Spill + stx.h $s8, $s4, $a0 ori $a0, $s3, 208 - st.d $a0, $sp, 56 # 8-byte Folded Spill - stx.h $s7, $s4, $a0 + st.d $a0, $sp, 48 # 8-byte Folded Spill + stx.h $s8, $s4, $a0 move $a0, $fp - move $s2, $a5 + move $s1, $a5 move $a1, $a5 - move $a2, $s0 - move $s1, $a3 + move $a2, $s2 + move $s6, $a3 pcaddu18i $ra, %call36(LowPassForIntra8x8Pred) jirl $ra, $ra, 0 - ld.d $a0, $sp, 88 # 8-byte Folded Reload + ld.d $a0, $sp, 72 # 8-byte Folded Reload beqz $a0, .LBB2_43 # %bb.42: ld.hu $a0, $fp, 2 @@ -2128,11 +2119,11 @@ intrapred_luma8x8: # @intrapred_luma8x8 add.d $a0, $a0, $t7 addi.d $a0, $a0, 8 bstrpick.d $a0, $a0, 31, 4 - move $ra, $s1 + move $ra, $s6 b .LBB2_51 .LBB2_43: - move $ra, $s1 - bnez $s0, .LBB2_46 + move $ra, $s6 + bnez $s2, .LBB2_46 # %bb.44: beqz $ra, .LBB2_46 # %bb.45: @@ -2146,7 +2137,7 @@ intrapred_luma8x8: # @intrapred_luma8x8 ld.hu $a7, $fp, 48 b .LBB2_49 .LBB2_46: - beqz $s0, .LBB2_50 + beqz $s2, .LBB2_50 # %bb.47: bnez $ra, .LBB2_50 # %bb.48: @@ -2174,265 +2165,137 @@ intrapred_luma8x8: # @intrapred_luma8x8 ori $a0, $a0, 3224 ldx.w $a0, $s4, $a0 .LBB2_51: # %.preheader - stx.h $a0, $s4, $s8 - ori $a1, $s5, 3552 - stx.h $a0, $s4, $a1 - ori $a1, $s5, 3568 - stx.h $a0, $s4, $a1 - ori $a1, $s5, 3584 - stx.h $a0, $s4, $a1 - ori $a1, $s5, 3600 - stx.h $a0, $s4, $a1 - ori $a1, $s5, 3616 - stx.h $a0, $s4, $a1 - ori $a1, $s5, 3632 - stx.h $a0, $s4, $a1 - ori $a1, $s5, 3648 - stx.h $a0, $s4, $a1 - st.h $a0, $s6, 128 - st.h $a0, $s6, 144 - st.h $a0, $s6, 160 - st.h $a0, $s6, 176 - st.h $a0, $s6, 192 - st.h $a0, $s6, 208 - st.h $a0, $s6, 224 - st.h $a0, $s6, 240 - ori $a1, $s5, 3540 - stx.h $a0, $s4, $a1 - ori $a1, $s5, 3556 - stx.h $a0, $s4, $a1 - ori $a1, $s5, 3572 - stx.h $a0, $s4, $a1 - ori $a1, $s5, 3588 - stx.h $a0, $s4, $a1 - ori $a1, $s5, 3604 - stx.h $a0, $s4, $a1 - ori $a1, $s5, 3620 - stx.h $a0, $s4, $a1 - ori $a1, $s5, 3636 - stx.h $a0, $s4, $a1 - ori $a1, $s5, 3652 - stx.h $a0, $s4, $a1 - st.h $a0, $s6, 132 - st.h $a0, $s6, 148 - st.h $a0, $s6, 164 - st.h $a0, $s6, 180 - st.h $a0, $s6, 196 - st.h $a0, $s6, 212 - st.h $a0, $s6, 228 - st.h $a0, $s6, 244 - ori $a1, $s5, 3544 - stx.h $a0, $s4, $a1 - ori $a1, $s5, 3560 - stx.h $a0, $s4, $a1 - ori $a1, $s5, 3576 - stx.h $a0, $s4, $a1 - ori $a1, $s5, 3592 - stx.h $a0, $s4, $a1 - ori $a1, $s5, 3608 - stx.h $a0, $s4, $a1 - ori $a1, $s5, 3624 - stx.h $a0, $s4, $a1 - ori $a1, $s5, 3640 - stx.h $a0, $s4, $a1 - ori $a1, $s5, 3656 - stx.h $a0, $s4, $a1 - st.h $a0, $s6, 136 - st.h $a0, $s6, 152 - st.h $a0, $s6, 168 - st.h $a0, $s6, 184 - st.h $a0, $s6, 200 - st.h $a0, $s6, 216 - st.h $a0, $s6, 232 - st.h $a0, $s6, 248 - ori $a1, $s5, 3548 - stx.h $a0, $s4, $a1 - ori $a1, $s5, 3564 - stx.h $a0, $s4, $a1 - ori $a1, $s5, 3580 - stx.h $a0, $s4, $a1 - ori $a1, $s5, 3596 - stx.h $a0, $s4, $a1 - ori $a1, $s5, 3612 - stx.h $a0, $s4, $a1 - ori $a1, $s5, 3628 - stx.h $a0, $s4, $a1 - ori $a1, $s5, 3644 - stx.h $a0, $s4, $a1 - st.h $a0, $s6, 140 - st.h $a0, $s6, 156 - st.h $a0, $s6, 172 - st.h $a0, $s6, 188 - st.h $a0, $s6, 204 - st.h $a0, $s6, 220 - st.h $a0, $s6, 236 - st.h $a0, $s6, 252 - vld $vr0, $fp, 2 - ori $a1, $s5, 3660 - stx.h $a0, $s4, $a1 - ori $a0, $s5, 3392 - vstx $vr0, $s4, $a0 - ori $a0, $s5, 3376 + ld.d $a2, $sp, 56 # 8-byte Folded Reload + vreplgr2vr.h $vr0, $a0 + vstx $vr0, $s4, $s0 + ori $a0, $s5, 3552 vstx $vr0, $s4, $a0 - ori $a0, $s5, 3360 + ori $a0, $s5, 3568 vstx $vr0, $s4, $a0 - ori $a0, $s5, 3344 + ori $a0, $s5, 3584 vstx $vr0, $s4, $a0 - ori $a0, $s5, 3328 + ori $a0, $s5, 3600 vstx $vr0, $s4, $a0 - ori $a0, $s5, 3312 + ori $a0, $s5, 3616 vstx $vr0, $s4, $a0 - ori $a0, $s5, 3296 + ori $a0, $s5, 3632 vstx $vr0, $s4, $a0 - ld.d $a0, $sp, 64 # 8-byte Folded Reload + vld $vr1, $fp, 2 + ori $a0, $s5, 3648 vstx $vr0, $s4, $a0 - bnez $s0, .LBB2_53 + ori $a0, $s5, 3392 + vstx $vr1, $s4, $a0 + ori $a0, $s5, 3376 + vstx $vr1, $s4, $a0 + ori $a0, $s5, 3360 + vstx $vr1, $s4, $a0 + ori $a0, $s5, 3344 + vstx $vr1, $s4, $a0 + ori $a0, $s5, 3328 + vstx $vr1, $s4, $a0 + ori $a0, $s5, 3312 + vstx $vr1, $s4, $a0 + ori $a0, $s5, 3296 + vstx $vr1, $s4, $a0 + vstx $vr1, $s4, $s7 + bnez $s2, .LBB2_53 # %bb.52: - stx.h $s7, $s4, $a0 + stx.h $s8, $s4, $s7 .LBB2_53: - ld.h $a0, $fp, 34 - st.h $a0, $s6, 12 - st.h $a0, $s6, 8 - st.h $a0, $s6, 4 - st.h $a0, $s6, 0 - ld.h $a1, $fp, 36 - st.h $a1, $s6, 28 - ori $a2, $s5, 3436 - stx.h $a1, $s4, $a2 - st.h $a1, $s6, 24 - ori $a2, $s5, 3432 - stx.h $a1, $s4, $a2 - st.h $a1, $s6, 20 - ori $a2, $s5, 3428 - st.h $a1, $s6, 16 - ld.h $a3, $fp, 38 - stx.h $a1, $s4, $a2 - ori $a2, $s5, 3424 - stx.h $a1, $s4, $a2 - st.h $a3, $s6, 44 - ori $a1, $s5, 3452 - stx.h $a3, $s4, $a1 - st.h $a3, $s6, 40 - ori $a1, $s5, 3448 - stx.h $a3, $s4, $a1 - st.h $a3, $s6, 36 - ori $a1, $s5, 3444 - st.h $a3, $s6, 32 - ld.h $a2, $fp, 40 - stx.h $a3, $s4, $a1 + ld.h $a0, $fp, 36 + vld $vr0, $fp, 34 + ld.h $a1, $fp, 38 + vreplgr2vr.h $vr1, $a0 + ori $a0, $s5, 3424 + vstx $vr1, $s4, $a0 + vreplgr2vr.h $vr1, $a1 + ld.h $a0, $fp, 40 ori $a1, $s5, 3440 - stx.h $a3, $s4, $a1 - st.h $a2, $s6, 60 - ori $a1, $s5, 3468 - stx.h $a2, $s4, $a1 - st.h $a2, $s6, 56 - ori $a1, $s5, 3464 - stx.h $a2, $s4, $a1 - st.h $a2, $s6, 52 - ori $a1, $s5, 3460 - st.h $a2, $s6, 48 - ld.h $a3, $fp, 42 - stx.h $a2, $s4, $a1 - ori $a1, $s5, 3456 - stx.h $a2, $s4, $a1 - st.h $a3, $s6, 76 - ori $a1, $s5, 3484 - stx.h $a3, $s4, $a1 - st.h $a3, $s6, 72 - ori $a1, $s5, 3480 - stx.h $a3, $s4, $a1 - st.h $a3, $s6, 68 - ori $a1, $s5, 3476 - stx.h $a3, $s4, $a1 - st.h $a3, $s6, 64 - ld.h $a1, $fp, 44 - ori $a2, $s5, 3472 - stx.h $a3, $s4, $a2 - ori $a2, $s5, 3500 - stx.h $a1, $s4, $a2 - ori $a2, $s5, 3496 - stx.h $a1, $s4, $a2 - ori $a2, $s5, 3492 - stx.h $a1, $s4, $a2 - ori $a2, $s5, 3488 - stx.h $a1, $s4, $a2 - ori $a2, $s5, 3420 - st.h $a1, $s6, 92 - st.h $a1, $s6, 88 - st.h $a1, $s6, 84 - st.h $a1, $s6, 80 + vstx $vr1, $s4, $a1 + ld.h $a1, $fp, 42 + vreplgr2vr.h $vr1, $a0 + ori $a0, $s5, 3456 + vstx $vr1, $s4, $a0 + vreplgr2vr.h $vr1, $a1 + ld.h $a0, $fp, 44 + ori $a1, $s5, 3472 + vstx $vr1, $s4, $a1 ld.h $a1, $fp, 46 - stx.h $a0, $s4, $a2 - ori $a2, $s5, 3416 - stx.h $a0, $s4, $a2 - st.h $a1, $s6, 108 - ori $a2, $s5, 3516 - stx.h $a1, $s4, $a2 - st.h $a1, $s6, 104 - ori $a2, $s5, 3512 - stx.h $a1, $s4, $a2 - st.h $a1, $s6, 100 - ori $a2, $s5, 3508 - stx.h $a1, $s4, $a2 - st.h $a1, $s6, 96 - ori $a2, $s5, 3504 - stx.h $a1, $s4, $a2 + vreplgr2vr.h $vr1, $a0 + ori $a0, $s5, 3488 + vstx $vr1, $s4, $a0 + vreplgr2vr.h $vr1, $a1 + ori $a0, $s5, 3504 ld.h $a1, $fp, 48 - ori $a2, $s5, 3412 - stx.h $a0, $s4, $a2 - ld.d $a2, $sp, 72 # 8-byte Folded Reload - stx.h $a0, $s4, $a2 - st.h $a1, $s6, 124 - ori $a0, $s5, 3532 - stx.h $a1, $s4, $a0 - st.h $a1, $s6, 120 - ori $a0, $s5, 3528 - stx.h $a1, $s4, $a0 - st.h $a1, $s6, 116 - ori $a0, $s5, 3524 - stx.h $a1, $s4, $a0 - st.h $a1, $s6, 112 + vstx $vr1, $s4, $a0 + vreplvei.h $vr0, $vr0, 0 + vstx $vr0, $s4, $a2 + vreplgr2vr.h $vr0, $a1 ori $a0, $s5, 3520 - stx.h $a1, $s4, $a0 + vstx $vr0, $s4, $a0 bnez $ra, .LBB2_55 # %bb.54: - stx.h $s7, $s4, $a2 + stx.h $s8, $s4, $a2 .LBB2_55: vrepli.b $vr0, 0 - beqz $s0, .LBB2_57 + beqz $s2, .LBB2_57 # %bb.56: - ld.hu $t1, $fp, 6 - ld.hu $t4, $fp, 2 - ld.hu $t3, $fp, 4 - addi.d $a1, $t1, 2 - add.d $a0, $a1, $t4 + ori $a0, $s5, 3666 + ld.hu $a7, $fp, 6 + ld.hu $t3, $fp, 2 + add.d $a0, $s4, $a0 + ld.hu $t2, $fp, 4 + addi.d $a2, $a7, 2 + add.d $a1, $a2, $t3 ld.hu $t0, $fp, 8 - alsl.d $a0, $t3, $a0, 1 - srli.d $a2, $a0, 2 - ld.d $a0, $sp, 40 # 8-byte Folded Reload - stx.h $a2, $s4, $a0 - add.d $a0, $t0, $t3 - alsl.d $a0, $t1, $a0, 1 - addi.d $a0, $a0, 2 - srli.d $a0, $a0, 2 - st.h $a0, $s6, 256 - ld.hu $t2, $fp, 10 - ori $a3, $s5, 3680 - stx.h $a0, $s4, $a3 - alsl.d $a1, $t0, $a1, 1 - add.d $a1, $a1, $t2 + alsl.d $a1, $t2, $a1, 1 + srli.d $a3, $a1, 2 + ld.d $a1, $sp, 32 # 8-byte Folded Reload + stx.h $a3, $s4, $a1 + add.d $a1, $t0, $t2 + alsl.d $a1, $a7, $a1, 1 + addi.d $a1, $a1, 2 srli.d $a1, $a1, 2 - ori $a3, $s5, 3696 - stx.h $a1, $s4, $a3 - st.h $a1, $s6, 272 - ld.hu $t6, $fp, 12 + st.h $a1, $a0, 0 + ld.hu $t1, $fp, 10 + ori $a4, $s5, 3680 + stx.h $a1, $s4, $a4 + alsl.d $a2, $t0, $a2, 1 + add.d $a2, $a2, $t1 + srli.d $a2, $a2, 2 + ori $a4, $s5, 3696 + stx.h $a2, $s4, $a4 + st.h $a2, $a0, 16 + ld.hu $t5, $fp, 12 vld $vr1, $fp, 12 - ld.hu $t5, $fp, 14 + ld.hu $t4, $fp, 14 vld $vr2, $fp, 14 - ld.hu $t7, $fp, 30 + ld.hu $t6, $fp, 30 vld $vr3, $fp, 16 - ld.hu $t8, $fp, 28 - ld.hu $a3, $fp, 16 + ld.hu $t7, $fp, 28 + ld.hu $a4, $fp, 16 + addi.d $a5, $t4, 2 + alsl.d $a6, $t1, $t5, 1 + addi.d $a6, $a6, 2 + add.d $t8, $a6, $t0 + srli.d $t8, $t8, 2 + ori $s0, $s5, 3712 + stx.h $t8, $s4, $s0 + st.h $t8, $a0, 32 + ori $s0, $s5, 3684 + stx.h $t8, $s4, $s0 + st.h $t8, $a0, 4 + add.d $t8, $a5, $t1 + alsl.d $t8, $t5, $t8, 1 + srli.d $t8, $t8, 2 + ori $s0, $s5, 3728 + stx.h $t8, $s4, $s0 + ori $s0, $s5, 3700 + stx.h $t8, $s4, $s0 + ori $s0, $s5, 3672 + stx.h $t8, $s4, $s0 + st.h $t8, $a0, 48 + st.h $t8, $a0, 20 vilvl.h $vr8, $vr0, $vr1 vbsrl.v $vr7, $vr1, 8 vilvl.h $vr6, $vr0, $vr7 @@ -2441,248 +2304,220 @@ intrapred_luma8x8: # @intrapred_luma8x8 vilvl.h $vr9, $vr0, $vr3 vilvh.h $vr10, $vr0, $vr3 vaddi.wu $vr3, $vr6, 2 - vaddi.wu $vr4, $vr8, 2 - addi.d $a4, $t5, 2 - alsl.d $a5, $t2, $t6, 1 - addi.d $a5, $a5, 2 - add.d $a6, $a5, $t0 - srli.d $a6, $a6, 2 - ori $a7, $s5, 3712 - stx.h $a6, $s4, $a7 - st.h $a6, $s6, 288 - ori $a7, $s5, 3684 - stx.h $a6, $s4, $a7 - st.h $a6, $s6, 260 - add.d $a6, $a4, $t2 - alsl.d $a6, $t6, $a6, 1 - srli.d $a6, $a6, 2 - ori $a7, $s5, 3728 - stx.h $a6, $s4, $a7 - st.h $a6, $s6, 304 - ori $a7, $s5, 3700 - stx.h $a6, $s4, $a7 - st.h $a6, $s6, 276 - ori $a7, $s5, 3672 - stx.h $a6, $s4, $a7 + vaddi.wu $vr5, $vr8, 2 vslli.w $vr2, $vr2, 1 - vslli.w $vr5, $vr1, 1 - vadd.w $vr1, $vr4, $vr5 + vslli.w $vr4, $vr1, 1 + vadd.w $vr1, $vr5, $vr4 vadd.w $vr2, $vr3, $vr2 vadd.w $vr10, $vr2, $vr10 vadd.w $vr1, $vr1, $vr9 vsrli.w $vr2, $vr1, 2 vsrli.w $vr1, $vr10, 2 - vpickve2gr.h $a7, $vr1, 6 - vpickve2gr.h $a6, $vr2, 0 - st.h $a6, $s6, 320 - st.h $a6, $s6, 292 - st.h $a6, $s6, 264 - vpickve2gr.h $a6, $vr2, 2 - st.h $a6, $s6, 308 - st.h $a6, $s6, 280 - vpickve2gr.h $a6, $vr2, 4 - st.h $a6, $s6, 352 - st.h $a6, $s6, 324 - st.h $a6, $s6, 296 - st.h $a6, $s6, 268 - vpickve2gr.h $a6, $vr2, 6 - st.h $a6, $s6, 368 - st.h $a6, $s6, 312 - st.h $a6, $s6, 284 - vpickve2gr.h $a6, $vr1, 4 - st.h $a6, $s6, 360 - st.h $a6, $s6, 332 - vpickve2gr.h $a6, $vr1, 2 - st.h $a7, $s6, 376 - vpickve2gr.h $a7, $vr1, 0 - st.h $a7, $s6, 356 - st.h $a7, $s6, 328 - st.h $a7, $s6, 300 - st.h $a6, $s6, 372 - st.h $a6, $s6, 316 - add.d $t4, $t3, $t4 - addi.d $t4, $t4, 1 - srli.d $t4, $t4, 1 - ld.d $s0, $sp, 48 # 8-byte Folded Reload - stx.h $t4, $s4, $s0 - ld.hu $t4, $fp, 32 - ori $s0, $s5, 3668 - stx.h $a1, $s4, $s0 - alsl.d $t8, $t7, $t8, 1 - add.d $t8, $t8, $t4 - addi.d $t8, $t8, 2 - srli.d $t8, $t8, 2 - ori $s0, $s5, 3788 - stx.h $t8, $s4, $s0 - st.h $t8, $s6, 364 - alsl.d $t4, $t4, $t4, 1 - add.d $t4, $t7, $t4 - addi.d $t4, $t4, 2 - srli.d $t4, $t4, 2 - st.h $t4, $s6, 380 - add.d $t3, $t1, $t3 + vstelm.h $vr2, $a0, 64, 0 + vstelm.h $vr2, $a0, 36, 0 + vstelm.h $vr2, $a0, 8, 0 + vstelm.h $vr2, $a0, 52, 2 + vstelm.h $vr2, $a0, 24, 2 + vstelm.h $vr2, $a0, 96, 4 + vstelm.h $vr2, $a0, 68, 4 + vstelm.h $vr2, $a0, 40, 4 + vstelm.h $vr2, $a0, 12, 4 + vstelm.h $vr2, $a0, 112, 6 + vstelm.h $vr2, $a0, 56, 6 + vstelm.h $vr2, $a0, 28, 6 + vstelm.h $vr1, $a0, 100, 0 + vstelm.h $vr1, $a0, 72, 0 + vstelm.h $vr1, $a0, 44, 0 + vstelm.h $vr1, $a0, 116, 2 + vstelm.h $vr1, $a0, 60, 2 + vstelm.h $vr1, $a0, 104, 4 + vstelm.h $vr1, $a0, 76, 4 + vstelm.h $vr1, $a0, 120, 6 + add.d $t3, $t2, $t3 addi.d $t3, $t3, 1 srli.d $t3, $t3, 1 - ori $t4, $s3, 112 - stx.h $t3, $s4, $t4 - st.h $t3, $s6, 768 - add.d $t1, $t1, $t0 - addi.d $t1, $t1, 1 + ld.d $t8, $sp, 40 # 8-byte Folded Reload + stx.h $t3, $s4, $t8 + ld.hu $t3, $fp, 32 + ori $t8, $s5, 3668 + stx.h $a2, $s4, $t8 + alsl.d $t7, $t6, $t7, 1 + add.d $t7, $t7, $t3 + addi.d $t7, $t7, 2 + srli.d $t7, $t7, 2 + ori $t8, $s5, 3788 + stx.h $t7, $s4, $t8 + st.h $t7, $a0, 108 + alsl.d $t3, $t3, $t3, 1 + add.d $t3, $t6, $t3 + addi.d $t3, $t3, 2 + srli.d $t3, $t3, 2 + st.h $t3, $a0, 124 + add.d $t2, $a7, $t2 + addi.d $t2, $t2, 1 + srli.d $t2, $t2, 1 + ori $t3, $s3, 112 + stx.h $t2, $s4, $t3 + st.h $t2, $a0, 512 + add.d $a7, $a7, $t0 + addi.d $a7, $a7, 1 + srli.d $a7, $a7, 1 + ori $t2, $s3, 144 + stx.h $a7, $s4, $t2 + ori $t2, $s3, 84 + stx.h $a7, $s4, $t2 + st.h $a7, $a0, 544 + add.d $a7, $t0, $t1 + addi.d $a7, $a7, 1 + srli.d $a7, $a7, 1 + ori $t0, $s3, 176 + stx.h $a7, $s4, $t0 + ori $t0, $s3, 116 + stx.h $a7, $s4, $t0 + st.h $a7, $a0, 576 + st.h $a7, $a0, 516 + add.d $a7, $t1, $t5 + addi.d $a7, $a7, 1 + srli.d $a7, $a7, 1 + ori $t0, $s3, 148 + stx.h $a7, $s4, $t0 + ori $t0, $s3, 88 + stx.h $a7, $s4, $t0 + st.h $a7, $a0, 608 + st.h $a7, $a0, 548 + add.d $a7, $t5, $t4 + addi.d $a7, $a7, 1 + srli.d $a7, $a7, 1 + ori $t0, $s3, 180 + stx.h $a7, $s4, $t0 + st.h $a7, $a0, 580 + ori $t0, $s3, 120 + stx.h $a7, $s4, $t0 + st.h $a7, $a0, 520 + add.d $a7, $t4, $a4 + addi.d $a7, $a7, 1 + srli.d $a7, $a7, 1 + st.h $a7, $a0, 612 + ori $t0, $s3, 152 + stx.h $a7, $s4, $t0 + st.h $a7, $a0, 552 + ori $t0, $s3, 92 + stx.h $a7, $s4, $t0 + vpickve2gr.w $a7, $vr8, 3 + bstrpick.d $a7, $a7, 15, 0 + or $t0, $a4, $a7 + xor $t1, $a4, $a7 srli.d $t1, $t1, 1 - ori $t3, $s3, 144 - stx.h $t1, $s4, $t3 - ori $t3, $s3, 84 - stx.h $t1, $s4, $t3 - st.h $t1, $s6, 800 - add.d $t0, $t0, $t2 - addi.d $t0, $t0, 1 - srli.d $t0, $t0, 1 - ori $t1, $s3, 176 - stx.h $t0, $s4, $t1 - ori $t1, $s3, 116 - stx.h $t0, $s4, $t1 - st.h $t0, $s6, 832 - st.h $t0, $s6, 772 - add.d $t0, $t2, $t6 - addi.d $t0, $t0, 1 - srli.d $t0, $t0, 1 - ori $t1, $s3, 148 - stx.h $t0, $s4, $t1 - ori $t1, $s3, 88 - stx.h $t0, $s4, $t1 - st.h $t0, $s6, 864 - st.h $t0, $s6, 804 - add.d $t0, $t6, $t5 - addi.d $t0, $t0, 1 - srli.d $t0, $t0, 1 - ori $t1, $s3, 180 - stx.h $t0, $s4, $t1 - st.h $t0, $s6, 836 - ori $t1, $s3, 120 - stx.h $t0, $s4, $t1 - st.h $t0, $s6, 776 - add.d $t0, $t5, $a3 - addi.d $t0, $t0, 1 - srli.d $t0, $t0, 1 - st.h $t0, $s6, 868 - ori $t1, $s3, 152 + sub.d $t0, $t0, $t1 + ori $t1, $s3, 184 stx.h $t0, $s4, $t1 - st.h $t0, $s6, 808 - ori $t1, $s3, 92 + st.h $t0, $a0, 584 + ori $t1, $s3, 124 stx.h $t0, $s4, $t1 - vpickve2gr.w $t0, $vr8, 3 + st.h $t0, $a0, 524 + vpickve2gr.w $t0, $vr7, 0 bstrpick.d $t0, $t0, 15, 0 - or $t1, $a3, $t0 - xor $t2, $a3, $t0 - srli.d $t2, $t2, 1 - sub.d $t1, $t1, $t2 - ori $t2, $s3, 184 - stx.h $t1, $s4, $t2 - st.h $t1, $s6, 840 - ori $t2, $s3, 124 - stx.h $t1, $s4, $t2 - st.h $t1, $s6, 780 - vpickve2gr.w $t1, $vr7, 0 - bstrpick.d $t1, $t1, 15, 0 - or $t2, $t0, $t1 - xor $t0, $t0, $t1 + or $t1, $a7, $t0 + xor $a7, $a7, $t0 + srli.d $a7, $a7, 1 + sub.d $a7, $t1, $a7 + st.h $a7, $a0, 616 + ori $t1, $s3, 156 + stx.h $a7, $s4, $t1 + st.h $a7, $a0, 556 + vpickve2gr.w $a7, $vr6, 1 + bstrpick.d $a7, $a7, 15, 0 + or $t1, $t0, $a7 + xor $t0, $t0, $a7 srli.d $t0, $t0, 1 - sub.d $t0, $t2, $t0 - st.h $t0, $s6, 872 - ori $t2, $s3, 156 - stx.h $t0, $s4, $t2 - st.h $t0, $s6, 812 - vpickve2gr.w $t0, $vr6, 1 - bstrpick.d $t0, $t0, 15, 0 - or $t2, $t1, $t0 - xor $t1, $t1, $t0 - srli.d $t1, $t1, 1 - sub.d $t1, $t2, $t1 - ori $t2, $s3, 188 - stx.h $t1, $s4, $t2 - st.h $t1, $s6, 844 - ori $t1, $s3, 96 - stx.h $a2, $s4, $t1 - vpickve2gr.w $a2, $vr6, 2 - bstrpick.d $a2, $a2, 15, 0 - or $t1, $t0, $a2 - xor $a2, $t0, $a2 - srli.d $a2, $a2, 1 - sub.d $a2, $t1, $a2 - st.h $a2, $s6, 876 - ori $a2, $s3, 128 - stx.h $a0, $s4, $a2 - st.h $a0, $s6, 784 - ori $a0, $s3, 160 - st.h $a1, $s6, 816 - ld.hu $a2, $fp, 8 - stx.h $a1, $s4, $a0 - ori $a0, $s3, 100 - stx.h $a1, $s4, $a0 - add.d $a0, $a5, $a2 - srli.d $a0, $a0, 2 - ori $a1, $s3, 192 - stx.h $a0, $s4, $a1 - st.h $a0, $s6, 848 - st.h $a0, $s6, 788 - ld.hu $a1, $fp, 10 - ld.hu $a2, $fp, 12 - ori $a5, $s3, 132 - stx.h $a0, $s4, $a5 - add.d $a0, $a4, $a1 - alsl.d $a0, $a2, $a0, 1 - srli.d $a0, $a0, 2 - st.h $a0, $s6, 880 - ori $a1, $s3, 164 - stx.h $a0, $s4, $a1 - st.h $a0, $s6, 820 - ori $a1, $s3, 104 - ld.hu $a4, $fp, 14 - stx.h $a0, $s4, $a1 - vpickve2gr.w $a0, $vr4, 2 - add.d $a0, $a0, $a2 - alsl.d $a0, $a4, $a0, 1 - bstrpick.d $a0, $a0, 18, 2 - ori $a1, $s3, 196 - stx.h $a0, $s4, $a1 - ori $a1, $s3, 136 - stx.h $a0, $s4, $a1 - st.h $a0, $s6, 852 - st.h $a0, $s6, 792 - vpickve2gr.w $a0, $vr4, 3 - alsl.d $a0, $a3, $a0, 1 - add.d $a0, $a0, $a4 - bstrpick.d $a0, $a0, 18, 2 - st.h $a0, $s6, 884 - ori $a1, $s3, 168 - stx.h $a0, $s4, $a1 - st.h $a0, $s6, 824 - ori $a1, $s3, 108 - stx.h $a0, $s4, $a1 - ld.hu $a0, $fp, 16 - vpickve2gr.w $a1, $vr3, 0 - vpickve2gr.w $a2, $vr5, 2 - add.d $a1, $a1, $a2 - add.d $a0, $a1, $a0 - bstrpick.d $a0, $a0, 18, 2 - ori $a1, $s3, 200 - stx.h $a0, $s4, $a1 - st.h $a0, $s6, 856 - ori $a1, $s3, 140 - stx.h $a0, $s4, $a1 - st.h $a0, $s6, 796 - ld.hu $a0, $fp, 18 - vpickve2gr.w $a1, $vr3, 1 - vpickve2gr.w $a2, $vr5, 3 - add.d $a1, $a1, $a2 - add.d $a0, $a1, $a0 - bstrpick.d $a0, $a0, 18, 2 - ori $a1, $s3, 172 - stx.h $a0, $s4, $a1 - st.h $a0, $s6, 888 - st.h $a0, $s6, 828 - st.h $a7, $s6, 860 - st.h $a6, $s6, 892 + sub.d $t0, $t1, $t0 + ori $t1, $s3, 188 + stx.h $t0, $s4, $t1 + st.h $t0, $a0, 588 + ori $t0, $s3, 96 + stx.h $a3, $s4, $t0 + vpickve2gr.w $a3, $vr6, 2 + bstrpick.d $a3, $a3, 15, 0 + or $t0, $a7, $a3 + xor $a3, $a7, $a3 + srli.d $a3, $a3, 1 + sub.d $a3, $t0, $a3 + st.h $a3, $a0, 620 + ori $a3, $s3, 128 + stx.h $a1, $s4, $a3 + st.h $a1, $a0, 528 + ori $a1, $s3, 160 + st.h $a2, $a0, 560 + ld.hu $a3, $fp, 8 + stx.h $a2, $s4, $a1 + ori $a1, $s3, 100 + stx.h $a2, $s4, $a1 + add.d $a1, $a6, $a3 + srli.d $a1, $a1, 2 + ori $a2, $s3, 192 + stx.h $a1, $s4, $a2 + st.h $a1, $a0, 592 + st.h $a1, $a0, 532 + ld.hu $a2, $fp, 10 + ld.hu $a3, $fp, 12 + ori $a6, $s3, 132 + stx.h $a1, $s4, $a6 + add.d $a1, $a5, $a2 + alsl.d $a1, $a3, $a1, 1 + srli.d $a1, $a1, 2 + st.h $a1, $a0, 624 + ori $a2, $s3, 164 + stx.h $a1, $s4, $a2 + st.h $a1, $a0, 564 + ori $a2, $s3, 104 + ld.hu $a5, $fp, 14 + stx.h $a1, $s4, $a2 + vpickve2gr.w $a1, $vr5, 2 + add.d $a1, $a1, $a3 + alsl.d $a1, $a5, $a1, 1 + bstrpick.d $a1, $a1, 18, 2 + ori $a2, $s3, 196 + stx.h $a1, $s4, $a2 + ori $a2, $s3, 136 + stx.h $a1, $s4, $a2 + st.h $a1, $a0, 596 + st.h $a1, $a0, 536 + vpickve2gr.w $a1, $vr5, 3 + alsl.d $a1, $a4, $a1, 1 + add.d $a1, $a1, $a5 + bstrpick.d $a1, $a1, 18, 2 + st.h $a1, $a0, 628 + ori $a2, $s3, 168 + stx.h $a1, $s4, $a2 + st.h $a1, $a0, 568 + ori $a2, $s3, 108 + stx.h $a1, $s4, $a2 + ld.hu $a1, $fp, 16 + vpickve2gr.w $a2, $vr3, 0 + vpickve2gr.w $a3, $vr4, 2 + add.d $a2, $a2, $a3 + add.d $a1, $a2, $a1 + bstrpick.d $a1, $a1, 18, 2 + ori $a2, $s3, 200 + stx.h $a1, $s4, $a2 + st.h $a1, $a0, 600 + ori $a2, $s3, 140 + stx.h $a1, $s4, $a2 + st.h $a1, $a0, 540 + ld.hu $a1, $fp, 18 + vpickve2gr.w $a2, $vr3, 1 + vpickve2gr.w $a3, $vr4, 3 + add.d $a2, $a2, $a3 + add.d $a1, $a2, $a1 + bstrpick.d $a1, $a1, 18, 2 + ori $a2, $s3, 172 + stx.h $a1, $s4, $a2 + st.h $a1, $a0, 632 + st.h $a1, $a0, 572 + vpickve2gr.h $a1, $vr1, 0 + st.h $a1, $a0, 604 + vpickve2gr.h $a1, $vr1, 2 + st.h $a1, $a0, 636 ori $a0, $s5, 3716 add.d $a0, $s4, $a0 vstelm.h $vr2, $a0, 0, 0 @@ -2744,13 +2579,13 @@ intrapred_luma8x8: # @intrapred_luma8x8 add.d $a0, $s4, $a0 vstelm.h $vr1, $a0, 0, 0 .LBB2_57: - sltui $a0, $s2, 1 - ld.d $a1, $sp, 88 # 8-byte Folded Reload + sltui $a0, $s1, 1 + ld.d $a1, $sp, 72 # 8-byte Folded Reload xori $a1, $a1, 1 or $a0, $a1, $a0 bnez $a0, .LBB2_59 # %bb.58: # %.thread - ld.d $s6, $sp, 80 # 8-byte Folded Reload + ld.d $s6, $sp, 64 # 8-byte Folded Reload ld.d $a0, $s6, 0 ori $a1, $s5, 3810 ld.hu $a2, $fp, 44 @@ -2911,7 +2746,7 @@ intrapred_luma8x8: # @intrapred_luma8x8 vstelm.h $vr1, $a1, 28, 2 vstelm.h $vr1, $a6, 0, 2 vstelm.h $vr1, $a1, 12, 4 - ld.d $a6, $sp, 16 # 8-byte Folded Reload + ld.d $a6, $sp, 8 # 8-byte Folded Reload vstx $vr0, $a0, $a6 add.d $a5, $a4, $a5 addi.d $a5, $a5, 1 @@ -2920,7 +2755,7 @@ intrapred_luma8x8: # @intrapred_luma8x8 ori $a6, $s5, 3988 stx.h $a5, $a0, $a6 st.h $a5, $a1, 144 - ld.d $a6, $sp, 24 # 8-byte Folded Reload + ld.d $a6, $sp, 16 # 8-byte Folded Reload stx.h $a5, $a0, $a6 add.d $a4, $a4, $a3 addi.d $a4, $a4, 1 @@ -3092,7 +2927,7 @@ intrapred_luma8x8: # @intrapred_luma8x8 stx.h $s1, $a1, $s2 ori $s2, $s5, 4068 stx.h $s1, $a1, $s2 - ld.d $s2, $sp, 32 # 8-byte Folded Reload + ld.d $s2, $sp, 24 # 8-byte Folded Reload stx.h $s1, $a1, $s2 add.d $s1, $t6, $a3 addi.d $s1, $s1, 1 @@ -3246,7 +3081,7 @@ intrapred_luma8x8: # @intrapred_luma8x8 st.h $a1, $a0, 12 b .LBB2_60 .LBB2_59: - ld.d $s6, $sp, 80 # 8-byte Folded Reload + ld.d $s6, $sp, 64 # 8-byte Folded Reload beqz $ra, .LBB2_61 .LBB2_60: ld.d $a0, $s6, 0 @@ -3258,7 +3093,7 @@ intrapred_luma8x8: # @intrapred_luma8x8 ld.hu $a5, $fp, 38 addi.d $a6, $a6, 1 srli.d $a6, $a6, 1 - ld.d $a7, $sp, 56 # 8-byte Folded Reload + ld.d $a7, $sp, 48 # 8-byte Folded Reload stx.h $a6, $a0, $a7 add.d $a6, $a3, $a5 addi.d $a6, $a6, 1 @@ -3334,18 +3169,9 @@ intrapred_luma8x8: # @intrapred_luma8x8 st.h $t4, $a4, 116 ori $t2, $s3, 324 stx.h $t4, $a0, $t2 - st.h $t4, $a4, 112 - ori $t2, $s3, 320 - stx.h $t4, $a0, $t2 - st.h $t4, $a4, 108 - ori $t2, $s3, 316 - stx.h $t4, $a0, $t2 - st.h $t4, $a4, 104 - ori $t2, $s3, 312 - stx.h $t4, $a0, $t2 - st.h $t4, $a4, 100 + vreplgr2vr.h $vr0, $t4 ori $t2, $s3, 308 - stx.h $t4, $a0, $t2 + vstx $vr0, $a0, $t2 st.h $t4, $a4, 92 ori $t2, $s3, 300 stx.h $t4, $a0, $t2 @@ -3403,18 +3229,18 @@ intrapred_luma8x8: # @intrapred_luma8x8 srli.d $a2, $a2, 2 stx.h $a2, $a0, $a1 .LBB2_61: - ld.d $s8, $sp, 360 # 8-byte Folded Reload - ld.d $s7, $sp, 368 # 8-byte Folded Reload - ld.d $s6, $sp, 376 # 8-byte Folded Reload - ld.d $s5, $sp, 384 # 8-byte Folded Reload - ld.d $s4, $sp, 392 # 8-byte Folded Reload - ld.d $s3, $sp, 400 # 8-byte Folded Reload - ld.d $s2, $sp, 408 # 8-byte Folded Reload - ld.d $s1, $sp, 416 # 8-byte Folded Reload - ld.d $s0, $sp, 424 # 8-byte Folded Reload - ld.d $fp, $sp, 432 # 8-byte Folded Reload - ld.d $ra, $sp, 440 # 8-byte Folded Reload - addi.d $sp, $sp, 448 + ld.d $s8, $sp, 344 # 8-byte Folded Reload + ld.d $s7, $sp, 352 # 8-byte Folded Reload + ld.d $s6, $sp, 360 # 8-byte Folded Reload + ld.d $s5, $sp, 368 # 8-byte Folded Reload + ld.d $s4, $sp, 376 # 8-byte Folded Reload + ld.d $s3, $sp, 384 # 8-byte Folded Reload + ld.d $s2, $sp, 392 # 8-byte Folded Reload + ld.d $s1, $sp, 400 # 8-byte Folded Reload + ld.d $s0, $sp, 408 # 8-byte Folded Reload + ld.d $fp, $sp, 416 # 8-byte Folded Reload + ld.d $ra, $sp, 424 # 8-byte Folded Reload + addi.d $sp, $sp, 432 ret .Lfunc_end2: .size intrapred_luma8x8, .Lfunc_end2-intrapred_luma8x8 diff --git a/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/weighted_prediction.s b/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/weighted_prediction.s index 04ca0318..79894f35 100644 --- a/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/weighted_prediction.s +++ b/results/MultiSource/Applications/JM/lencod/CMakeFiles/lencod.dir/weighted_prediction.s @@ -517,25 +517,30 @@ estimate_weighting_factor_B_slice: # @estimate_weighting_factor_B_slice pcalau12i $a1, %pc_hi20(active_pps) blez $a0, .LBB1_24 # %bb.9: # %.preheader263.lr.ph - ld.wu $a3, $fp, 4 - addi.w $a4, $a3, 0 - pcalau12i $a2, %got_pc_hi20(listX) - ld.d $a2, $a2, %got_pc_lo12(listX) - ld.d $a5, $a2, 8 - ld.d $a6, $a2, 0 - pcalau12i $a2, %got_pc_hi20(enc_picture) - ld.d $a2, $a2, %got_pc_lo12(enc_picture) + ld.wu $a2, $fp, 4 + addi.w $a3, $a2, 0 + pcalau12i $a4, %got_pc_hi20(listX) + ld.d $a4, $a4, %got_pc_lo12(listX) + ld.d $a5, $a4, 8 + ld.d $a6, $a4, 0 + pcalau12i $a4, %got_pc_hi20(enc_picture) + ld.d $a4, $a4, %got_pc_lo12(enc_picture) move $a7, $zero - ld.d $t0, $a2, 0 - lu12i.w $a2, 3 - ori $a2, $a2, 8 + ld.d $t0, $a4, 0 + lu12i.w $a4, 3 + ori $a4, $a4, 8 addi.d $t1, $sp, 56 - add.d $t1, $t1, $a2 + add.d $t1, $t1, $a4 addi.w $t2, $zero, -128 ori $t3, $zero, 127 lu12i.w $t4, 4 + vrepli.w $vr0, 32 addi.w $t5, $zero, -1024 + vreplgr2vr.w $vr1, $t5 ori $t6, $zero, 1023 + vreplgr2vr.w $vr2, $t6 + vrepli.w $vr3, -129 + vrepli.w $vr4, -193 ori $t7, $zero, 32 ori $t8, $zero, 64 b .LBB1_11 @@ -548,47 +553,49 @@ estimate_weighting_factor_B_slice: # @estimate_weighting_factor_B_slice .LBB1_11: # %.preheader263 # =>This Loop Header: Depth=1 # Child Loop BB1_15 Depth 2 - blez $a4, .LBB1_10 + blez $a3, .LBB1_10 # %bb.12: # %.lr.ph # in Loop: Header=BB1_11 Depth=1 slli.d $s0, $a7, 3 ldx.d $s0, $a6, $s0 - ld.w $s0, $s0, 4 - ld.w $s1, $t0, 4 - sub.w $s1, $s1, $s0 - slt $s2, $t2, $s1 - maskeqz $s1, $s1, $s2 + ld.w $s1, $s0, 4 + ld.w $s0, $t0, 4 + sub.w $s0, $s0, $s1 + slt $s2, $t2, $s0 + maskeqz $s0, $s0, $s2 masknez $s2, $t2, $s2 - or $s1, $s1, $s2 - slti $s2, $s1, 127 - maskeqz $s1, $s1, $s2 + or $s0, $s0, $s2 + slti $s2, $s0, 127 + maskeqz $s0, $s0, $s2 masknez $s2, $t3, $s2 - or $s1, $s1, $s2 - move $s2, $t1 - move $s4, $a3 - move $s5, $a5 + or $s2, $s0, $s2 + vinsgr2vr.w $vr5, $s2, 0 + vinsgr2vr.w $vr5, $s2, 1 + move $s4, $t1 + move $s5, $a2 + move $s0, $a5 b .LBB1_15 .p2align 4, , 16 .LBB1_13: # in Loop: Header=BB1_15 Depth=2 ori $s6, $zero, 32 ori $s7, $zero, 32 lu32i.d $s7, 32 - st.d $s7, $s2, -8 - stptr.d $s7, $s2, -12296 - st.w $s6, $s2, 0 + st.d $s7, $s4, -8 + stptr.d $s7, $s4, -12296 + st.w $s6, $s4, 0 .LBB1_14: # in Loop: Header=BB1_15 Depth=2 - stptr.w $s6, $s2, -12288 - addi.d $s5, $s5, 8 - addi.d $s4, $s4, -1 - addi.d $s2, $s2, 12 - beqz $s4, .LBB1_10 + stptr.w $s6, $s4, -12288 + addi.d $s0, $s0, 8 + addi.d $s5, $s5, -1 + addi.d $s4, $s4, 12 + beqz $s5, .LBB1_10 .LBB1_15: # Parent Loop BB1_11 Depth=1 # => This Inner Loop Header: Depth=2 - ld.d $s6, $s5, 0 + ld.d $s6, $s0, 0 ld.w $s6, $s6, 4 - beq $s6, $s0, .LBB1_13 + beq $s6, $s1, .LBB1_13 # %bb.16: # in Loop: Header=BB1_15 Depth=2 - sub.w $s6, $s6, $s0 + sub.w $s6, $s6, $s1 slt $s7, $t2, $s6 maskeqz $s6, $s6, $s7 masknez $s7, $t2, $s7 @@ -605,10 +612,40 @@ estimate_weighting_factor_B_slice: # @estimate_weighting_factor_B_slice xor $s7, $s7, $s8 sub.d $s7, $s7, $s8 or $s7, $s7, $t4 - ext.w.h $s6, $s6 div.d $s6, $s7, $s6 + vinsgr2vr.h $vr6, $s6, 0 + vinsgr2vr.h $vr6, $s6, 2 + vslli.w $vr6, $vr6, 16 + vsrai.w $vr6, $vr6, 16 + vori.b $vr7, $vr0, 0 + vmadd.w $vr7, $vr5, $vr6 + vsrai.w $vr6, $vr7, 6 + vmax.w $vr6, $vr6, $vr1 + vmin.w $vr6, $vr6, $vr2 + vsrai.w $vr6, $vr6, 2 + vadd.w $vr7, $vr6, $vr3 + vslt.wu $vr7, $vr7, $vr4 + vshuf4i.w $vr8, $vr7, 16 + vpickve2gr.d $s7, $vr8, 1 + andi $s7, $s7, 1 + vpickve2gr.w $s8, $vr6, 1 + masknez $s8, $s8, $s7 + maskeqz $s7, $t7, $s7 + or $s7, $s7, $s8 + st.w $s7, $s4, -8 + sub.d $s7, $t8, $s7 + stptr.w $s7, $s4, -12296 + vpickve2gr.d $s7, $vr7, 0 + andi $s7, $s7, 1 + vpickve2gr.w $s8, $vr6, 0 + masknez $s8, $s8, $s7 + maskeqz $s7, $t7, $s7 + or $s7, $s7, $s8 + st.w $s7, $s4, -4 + sub.d $s7, $t8, $s7 + stptr.w $s7, $s4, -12292 ext.w.h $s6, $s6 - mul.d $s6, $s1, $s6 + mul.d $s6, $s2, $s6 addi.w $s6, $s6, 32 srai.d $s6, $s6, 6 slt $s7, $t5, $s6 @@ -624,26 +661,22 @@ estimate_weighting_factor_B_slice: # @estimate_weighting_factor_B_slice sltui $s7, $s7, -193 masknez $s6, $s6, $s7 maskeqz $s7, $t7, $s7 - or $s7, $s7, $s6 - st.w $s7, $s2, -8 - sub.d $s6, $t8, $s7 - stptr.w $s6, $s2, -12296 - st.w $s7, $s2, -4 - stptr.w $s6, $s2, -12292 - st.w $s7, $s2, 0 + or $s6, $s7, $s6 + st.w $s6, $s4, 0 + sub.d $s6, $t8, $s6 b .LBB1_14 .LBB1_17: # %._crit_edge274 ld.d $a1, $a1, %pc_lo12(active_pps) - ld.w $a7, $a1, 196 + ld.w $a2, $a1, 196 ori $a1, $zero, 2 - bne $a7, $a1, .LBB1_30 + bne $a2, $a1, .LBB1_30 # %bb.18: # %.preheader246.lr.ph move $a1, $zero - pcalau12i $a3, %pc_hi20(wbp_weight) - ld.d $a3, $a3, %pc_lo12(wbp_weight) + pcalau12i $a2, %pc_hi20(wbp_weight) + ld.d $a2, $a2, %pc_lo12(wbp_weight) ld.w $t0, $fp, 4 - addi.d $a4, $sp, 56 - add.d $a2, $a4, $a2 + addi.d $a3, $sp, 56 + add.d $a3, $a3, $a4 lu12i.w $a5, -4 ori $a4, $a5, 4088 ori $a5, $a5, 4092 @@ -653,7 +686,7 @@ estimate_weighting_factor_B_slice: # @estimate_weighting_factor_B_slice .LBB1_19: # %._crit_edge308 # in Loop: Header=BB1_20 Depth=1 addi.d $a1, $a1, 1 - addi.d $a2, $a2, 384 + addi.d $a3, $a3, 384 bge $a1, $a0, .LBB1_25 .LBB1_20: # %.preheader246 # =>This Loop Header: Depth=1 @@ -661,13 +694,13 @@ estimate_weighting_factor_B_slice: # @estimate_weighting_factor_B_slice blez $t0, .LBB1_19 # %bb.21: # %.preheader245.lr.ph # in Loop: Header=BB1_20 Depth=1 - ld.d $a0, $a3, 8 - ld.d $a7, $a3, 0 + ld.d $a0, $a2, 8 + ld.d $a7, $a2, 0 slli.d $t0, $a1, 3 ldx.d $a0, $a0, $t0 ldx.d $a7, $a7, $t0 move $t1, $zero - move $t2, $a2 + move $t2, $a3 .p2align 4, , 16 .LBB1_22: # %.preheader245 # Parent Loop BB1_20 Depth=1 @@ -698,9 +731,9 @@ estimate_weighting_factor_B_slice: # @estimate_weighting_factor_B_slice b .LBB1_19 .LBB1_24: # %._crit_edge274.thread ld.d $a0, $a1, %pc_lo12(active_pps) - ld.w $a7, $a0, 196 + ld.w $a2, $a0, 196 ori $a0, $zero, 2 - bne $a7, $a0, .LBB1_30 + bne $a2, $a0, .LBB1_30 .LBB1_25: # %.preheader244 move $a0, $zero ld.d $a1, $sp, 32 # 8-byte Folded Reload @@ -745,6 +778,7 @@ estimate_weighting_factor_B_slice: # @estimate_weighting_factor_B_slice blt $a7, $t1, .LBB1_29 b .LBB1_26 .LBB1_30: # %.preheader262 + st.d $a2, $sp, 16 # 8-byte Folded Spill ld.w $s5, $s3, 68 movgr2fr.d $fs0, $zero fmov.d $fs1, $fs0 @@ -787,7 +821,6 @@ estimate_weighting_factor_B_slice: # @estimate_weighting_factor_B_slice fld.d $fa1, $a0, %pc_lo12(.LCPI1_0) fmul.d $fs1, $fa0, $fa1 .LBB1_37: # %.preheader260 - st.d $a7, $sp, 16 # 8-byte Folded Spill addi.w $a0, $s5, 19 ori $a1, $zero, 20 slt $a2, $a1, $a0 @@ -1699,22 +1732,27 @@ test_wp_B_slice: # @test_wp_B_slice ld.wu $a1, $s3, 4 addi.w $a2, $a1, 0 pcalau12i $a3, %got_pc_hi20(listX) - ld.d $a3, $a3, %got_pc_lo12(listX) - ld.d $a4, $a3, 8 - ld.d $a5, $a3, 0 - pcalau12i $a3, %got_pc_hi20(enc_picture) - ld.d $a3, $a3, %got_pc_lo12(enc_picture) + ld.d $a4, $a3, %got_pc_lo12(listX) + ld.d $a3, $a4, 8 + ld.d $a4, $a4, 0 + pcalau12i $a5, %got_pc_hi20(enc_picture) + ld.d $a5, $a5, %got_pc_lo12(enc_picture) move $a6, $zero - ld.d $a7, $a3, 0 - lu12i.w $a3, 3 - ori $a3, $a3, 8 + ld.d $a7, $a5, 0 + lu12i.w $a5, 3 + ori $a5, $a5, 8 addi.d $t0, $sp, 56 - add.d $t0, $t0, $a3 + add.d $t0, $t0, $a5 addi.w $t1, $zero, -128 ori $t2, $zero, 127 lu12i.w $t3, 4 + vrepli.w $vr0, 32 addi.w $t4, $zero, -1024 + vreplgr2vr.w $vr1, $t4 ori $t5, $zero, 1023 + vreplgr2vr.w $vr2, $t5 + vrepli.w $vr3, -129 + vrepli.w $vr4, -193 ori $t6, $zero, 32 ori $t7, $zero, 64 b .LBB3_11 @@ -1731,7 +1769,7 @@ test_wp_B_slice: # @test_wp_B_slice # %bb.12: # %.lr.ph # in Loop: Header=BB3_11 Depth=1 slli.d $t8, $a6, 3 - ldx.d $t8, $a5, $t8 + ldx.d $t8, $a4, $t8 ld.w $t8, $t8, 4 ld.w $fp, $a7, 4 sub.w $fp, $fp, $t8 @@ -1743,9 +1781,11 @@ test_wp_B_slice: # @test_wp_B_slice maskeqz $fp, $fp, $s2 masknez $s2, $t2, $s2 or $fp, $fp, $s2 + vinsgr2vr.w $vr5, $fp, 0 + vinsgr2vr.w $vr5, $fp, 1 move $s2, $t0 move $s4, $a1 - move $s6, $a4 + move $s6, $a3 b .LBB3_15 .p2align 4, , 16 .LBB3_13: # in Loop: Header=BB3_15 Depth=2 @@ -1784,8 +1824,38 @@ test_wp_B_slice: # @test_wp_B_slice xor $s8, $s8, $ra sub.d $s8, $s8, $ra or $s8, $s8, $t3 - ext.w.h $s7, $s7 div.d $s7, $s8, $s7 + vinsgr2vr.h $vr6, $s7, 0 + vinsgr2vr.h $vr6, $s7, 2 + vslli.w $vr6, $vr6, 16 + vsrai.w $vr6, $vr6, 16 + vori.b $vr7, $vr0, 0 + vmadd.w $vr7, $vr5, $vr6 + vsrai.w $vr6, $vr7, 6 + vmax.w $vr6, $vr6, $vr1 + vmin.w $vr6, $vr6, $vr2 + vsrai.w $vr6, $vr6, 2 + vadd.w $vr7, $vr6, $vr3 + vslt.wu $vr7, $vr7, $vr4 + vshuf4i.w $vr8, $vr7, 16 + vpickve2gr.d $s8, $vr8, 1 + andi $s8, $s8, 1 + vpickve2gr.w $ra, $vr6, 1 + masknez $ra, $ra, $s8 + maskeqz $s8, $t6, $s8 + or $s8, $s8, $ra + st.w $s8, $s2, -8 + sub.d $s8, $t7, $s8 + stptr.w $s8, $s2, -12296 + vpickve2gr.d $s8, $vr7, 0 + andi $s8, $s8, 1 + vpickve2gr.w $ra, $vr6, 0 + masknez $ra, $ra, $s8 + maskeqz $s8, $t6, $s8 + or $s8, $s8, $ra + st.w $s8, $s2, -4 + sub.d $s8, $t7, $s8 + stptr.w $s8, $s2, -12292 ext.w.h $s7, $s7 mul.d $s7, $fp, $s7 addi.w $s7, $s7, 32 @@ -1803,13 +1873,9 @@ test_wp_B_slice: # @test_wp_B_slice sltui $s8, $s8, -193 masknez $s7, $s7, $s8 maskeqz $s8, $t6, $s8 - or $s8, $s8, $s7 - st.w $s8, $s2, -8 - sub.d $s7, $t7, $s8 - stptr.w $s7, $s2, -12296 - st.w $s8, $s2, -4 - stptr.w $s7, $s2, -12292 - st.w $s8, $s2, 0 + or $s7, $s8, $s7 + st.w $s7, $s2, 0 + sub.d $s7, $t7, $s7 b .LBB3_14 .LBB3_17: # %._crit_edge311 ori $a1, $zero, 1 @@ -1820,8 +1886,8 @@ test_wp_B_slice: # @test_wp_B_slice pcalau12i $a2, %pc_hi20(wbp_weight) ld.d $a2, $a2, %pc_lo12(wbp_weight) ld.w $t0, $s3, 4 - addi.d $a4, $sp, 56 - add.d $a3, $a4, $a3 + addi.d $a3, $sp, 56 + add.d $a3, $a3, $a5 lu12i.w $a5, -4 ori $a4, $a5, 4088 ori $a5, $a5, 4092 diff --git a/results/MultiSource/Applications/oggenc/CMakeFiles/oggenc.dir/oggenc.s b/results/MultiSource/Applications/oggenc/CMakeFiles/oggenc.dir/oggenc.s index 131bb50f..8eb96d6b 100644 --- a/results/MultiSource/Applications/oggenc/CMakeFiles/oggenc.dir/oggenc.s +++ b/results/MultiSource/Applications/oggenc/CMakeFiles/oggenc.dir/oggenc.s @@ -7899,30 +7899,20 @@ vorbis_encode_setup_init: # @vorbis_encode_setup_init .LBB41_14: # %.preheader.i ld.w $a1, $a0, 0 ld.w $a2, $a0, 8 - stptr.w $a1, $a0, 5316 - stptr.w $a2, $a0, 5376 - stptr.w $a1, $a0, 5320 - stptr.w $a2, $a0, 5380 - stptr.w $a1, $a0, 5324 - stptr.w $a2, $a0, 5384 - stptr.w $a1, $a0, 5328 - stptr.w $a2, $a0, 5388 - stptr.w $a1, $a0, 5332 - stptr.w $a2, $a0, 5392 - stptr.w $a1, $a0, 5336 - stptr.w $a2, $a0, 5396 - stptr.w $a1, $a0, 5340 - stptr.w $a2, $a0, 5400 - stptr.w $a1, $a0, 5344 - stptr.w $a2, $a0, 5404 - stptr.w $a1, $a0, 5348 - stptr.w $a2, $a0, 5408 - stptr.w $a1, $a0, 5352 - stptr.w $a2, $a0, 5412 - stptr.w $a1, $a0, 5356 - stptr.w $a2, $a0, 5416 - stptr.w $a1, $a0, 5360 - stptr.w $a2, $a0, 5420 + vreplgr2vr.w $vr0, $a1 + ori $a3, $s8, 1220 + vstx $vr0, $a0, $a3 + vreplgr2vr.w $vr1, $a2 + ori $a3, $s8, 1280 + vstx $vr1, $a0, $a3 + ori $a3, $s8, 1236 + vstx $vr0, $a0, $a3 + ori $a3, $s8, 1296 + vstx $vr1, $a0, $a3 + ori $a3, $s8, 1252 + vstx $vr0, $a0, $a3 + ori $a3, $s8, 1312 + vstx $vr1, $a0, $a3 stptr.w $a1, $a0, 5364 stptr.w $a2, $a0, 5424 stptr.w $a1, $a0, 5368 @@ -46320,13 +46310,6 @@ floor1_fit: # @floor1_fit .type accumulate_fit,@function accumulate_fit: # @accumulate_fit # %bb.0: - addi.d $sp, $sp, -48 - st.d $fp, $sp, 40 # 8-byte Folded Spill - st.d $s0, $sp, 32 # 8-byte Folded Spill - st.d $s1, $sp, 24 # 8-byte Folded Spill - st.d $s2, $sp, 16 # 8-byte Folded Spill - st.d $s3, $sp, 8 # 8-byte Folded Spill - st.d $s4, $sp, 0 # 8-byte Folded Spill vrepli.b $vr0, 0 vst $vr0, $a4, 48 vst $vr0, $a4, 32 @@ -46336,127 +46319,102 @@ accumulate_fit: # @accumulate_fit slt $a7, $a3, $a5 masknez $a5, $a5, $a7 maskeqz $a7, $a3, $a7 - or $fp, $a7, $a5 + or $t1, $a7, $a5 st.d $a3, $a4, 8 - bge $fp, $a2, .LBB255_3 + bge $t1, $a2, .LBB255_3 # %bb.1: - move $t2, $zero - move $t1, $zero - move $t0, $zero - move $a7, $zero move $a5, $zero move $a3, $zero - move $t8, $zero - move $t7, $zero - move $t6, $zero - move $t5, $zero - move $t4, $zero - move $t3, $zero + move $t0, $zero + move $a7, $zero + vori.b $vr3, $vr0, 0 + vori.b $vr1, $vr0, 0 + vori.b $vr2, $vr0, 0 .LBB255_2: # %._crit_edge - add.d $a0, $t8, $t2 - add.d $a1, $t7, $t1 - add.d $a2, $t6, $t0 - add.d $t5, $t5, $a7 - add.d $t4, $t4, $a5 - fld.s $fa0, $a6, 1108 - add.d $a6, $t3, $a3 - movgr2fr.d $fa1, $a6 - ffint.s.l $fa1, $fa1 - fmul.s $fa0, $fa0, $fa1 - addi.d $t3, $a3, 1 - movgr2fr.d $fa1, $t3 - ffint.s.l $fa1, $fa1 - fdiv.s $fa0, $fa0, $fa1 - ftintrz.w.s $fa0, $fa0 - movfr2gr.s $t3, $fa0 - mul.d $t2, $t2, $t3 - add.d $a0, $a0, $t2 - st.d $a0, $a4, 16 - mul.d $a0, $t1, $t3 - add.d $a0, $a1, $a0 - st.d $a0, $a4, 24 - mul.d $a0, $t0, $t3 - add.d $a0, $a2, $a0 - st.d $a0, $a4, 32 - mul.d $a0, $a7, $t3 - add.d $a0, $t5, $a0 - st.d $a0, $a4, 40 - mul.d $a0, $a5, $t3 - add.d $a0, $t4, $a0 + vadd.d $vr0, $vr0, $vr2 + vadd.d $vr3, $vr3, $vr1 + add.d $a0, $t0, $a5 + fld.s $fa4, $a6, 1108 + add.d $a1, $a7, $a3 + movgr2fr.d $fa5, $a1 + ffint.s.l $fa5, $fa5 + fmul.s $fa4, $fa4, $fa5 + addi.d $a2, $a3, 1 + movgr2fr.d $fa5, $a2 + ffint.s.l $fa5, $fa5 + fdiv.s $fa4, $fa4, $fa5 + ftintrz.w.s $fa4, $fa4 + movfr2gr.s $a2, $fa4 + vreplgr2vr.d $vr4, $a2 + vmadd.d $vr3, $vr1, $vr4 + vmadd.d $vr0, $vr2, $vr4 + vst $vr0, $a4, 32 + vst $vr3, $a4, 16 + mul.d $a5, $a5, $a2 + add.d $a0, $a0, $a5 st.d $a0, $a4, 48 - mul.d $a0, $a3, $t3 - add.d $a1, $a0, $a6 + mul.d $a0, $a3, $a2 + add.d $a1, $a0, $a1 addi.w $a0, $a3, 0 st.d $a1, $a4, 56 - ld.d $s4, $sp, 0 # 8-byte Folded Reload - ld.d $s3, $sp, 8 # 8-byte Folded Reload - ld.d $s2, $sp, 16 # 8-byte Folded Reload - ld.d $s1, $sp, 24 # 8-byte Folded Reload - ld.d $s0, $sp, 32 # 8-byte Folded Reload - ld.d $fp, $sp, 40 # 8-byte Folded Reload - addi.d $sp, $sp, 48 ret .LBB255_3: # %.lr.ph - move $t3, $zero - move $t4, $zero - move $t5, $zero - move $t6, $zero - move $t7, $zero - move $t8, $zero - move $a3, $zero - move $a5, $zero move $a7, $zero move $t0, $zero - move $t1, $zero - move $t2, $zero - addi.d $fp, $fp, 1 - pcalau12i $s0, %pc_hi20(.LCPI255_0) - fld.s $fa0, $s0, %pc_lo12(.LCPI255_0) - pcalau12i $s0, %pc_hi20(.LCPI255_1) - fld.s $fa1, $s0, %pc_lo12(.LCPI255_1) + move $a3, $zero + move $a5, $zero + addi.d $t1, $t1, 1 alsl.d $a1, $a2, $a1, 2 + pcalau12i $t2, %pc_hi20(.LCPI255_0) + fld.s $fa4, $t2, %pc_lo12(.LCPI255_0) + pcalau12i $t2, %pc_hi20(.LCPI255_1) + fld.s $fa5, $t2, %pc_lo12(.LCPI255_1) alsl.d $a0, $a2, $a0, 2 - ori $s0, $zero, 1023 + vrepli.b $vr3, 0 + ori $t2, $zero, 1023 + vori.b $vr0, $vr3, 0 + vori.b $vr1, $vr3, 0 + vori.b $vr2, $vr3, 0 b .LBB255_6 .p2align 4, , 16 .LBB255_4: # in Loop: Header=BB255_6 Depth=1 - add.d $t8, $a2, $t8 - add.d $t7, $t7, $s2 - add.d $t6, $s3, $t6 - add.d $t5, $t5, $s1 - add.d $t4, $s4, $t4 - addi.d $t3, $t3, 1 + vadd.d $vr0, $vr7, $vr0 + vadd.d $vr3, $vr6, $vr3 + add.d $t0, $t3, $t0 + addi.d $a7, $a7, 1 .LBB255_5: # in Loop: Header=BB255_6 Depth=1 addi.d $a2, $a2, 1 addi.d $a1, $a1, 4 addi.d $a0, $a0, 4 - beq $fp, $a2, .LBB255_2 + beq $t1, $a2, .LBB255_2 .LBB255_6: # =>This Inner Loop Header: Depth=1 - fld.s $fa2, $a0, 0 - fmul.s $fa3, $fa2, $fa0 - fadd.s $fa3, $fa3, $fa1 - ftintrz.w.s $fa3, $fa3 - movfr2gr.s $s1, $fa3 - blez $s1, .LBB255_5 + fld.s $fa6, $a0, 0 + fmul.s $fa7, $fa6, $fa4 + fadd.s $fa7, $fa7, $fa5 + ftintrz.w.s $fa7, $fa7 + movfr2gr.s $t3, $fa7 + blez $t3, .LBB255_5 # %bb.7: # in Loop: Header=BB255_6 Depth=1 - sltui $s2, $s1, 1023 - fld.s $fa3, $a1, 0 - fld.s $fa4, $a6, 1112 - masknez $s3, $s0, $s2 - maskeqz $s1, $s1, $s2 - or $s2, $s1, $s3 - fadd.s $fa3, $fa3, $fa4 - mul.d $s3, $a2, $a2 - mul.d $s1, $s2, $s2 - fcmp.cult.s $fcc0, $fa3, $fa2 - mul.d $s4, $a2, $s2 + sltui $t4, $t3, 1023 + fld.s $fa7, $a1, 0 + fld.s $ft0, $a6, 1112 + masknez $t5, $t2, $t4 + maskeqz $t3, $t3, $t4 + or $t5, $t3, $t5 + fadd.s $fa7, $fa7, $ft0 + mul.d $t6, $a2, $a2 + mul.d $t4, $t5, $t5 + fcmp.cult.s $fcc0, $fa7, $fa6 + mul.d $t3, $a2, $t5 + vinsgr2vr.d $vr6, $a2, 0 + vinsgr2vr.d $vr6, $t5, 1 + vinsgr2vr.d $vr7, $t6, 0 + vinsgr2vr.d $vr7, $t4, 1 bcnez $fcc0, .LBB255_4 # %bb.8: # in Loop: Header=BB255_6 Depth=1 - add.d $t2, $a2, $t2 - add.d $t1, $t1, $s2 - add.d $t0, $s3, $t0 - add.d $a7, $a7, $s1 - add.d $a5, $s4, $a5 + vadd.d $vr2, $vr7, $vr2 + vadd.d $vr1, $vr6, $vr1 + add.d $a5, $t3, $a5 addi.d $a3, $a3, 1 b .LBB255_5 .Lfunc_end255: diff --git a/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/CPP/7zip/Archive/Tar/TarHandler.s b/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/CPP/7zip/Archive/Tar/TarHandler.s index d2b62200..311f0db3 100644 --- a/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/CPP/7zip/Archive/Tar/TarHandler.s +++ b/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/CPP/7zip/Archive/Tar/TarHandler.s @@ -237,12 +237,10 @@ _ZN8NArchive4NTar8CHandler9ReadItem2EP19ISequentialInStreamRbRNS0_7CItemExE: # @ bnez $a0, .LBB5_2 # %bb.1: ld.wu $a1, $s0, 120 - ld.d $a2, $fp, 224 - ld.d $a3, $fp, 232 - add.d $a2, $a2, $a1 - st.d $a2, $fp, 224 - add.d $a1, $a3, $a1 - st.d $a1, $fp, 232 + vld $vr0, $fp, 224 + vreplgr2vr.d $vr1, $a1 + vadd.d $vr0, $vr0, $vr1 + vst $vr0, $fp, 224 .LBB5_2: ld.d $s0, $sp, 8 # 8-byte Folded Reload ld.d $fp, $sp, 16 # 8-byte Folded Reload @@ -423,15 +421,13 @@ _ZN8NArchive4NTar8CHandler5Open2EP9IInStreamP20IArchiveOpenCallback: # @_ZN8NArc .p2align 4, , 16 .LBB6_12: # in Loop: Header=BB6_6 Depth=1 ld.wu $a0, $sp, 248 - ld.d $a1, $s1, 224 - ld.d $a2, $s1, 232 - add.d $a1, $a1, $a0 - ld.bu $a3, $sp, 88 - st.d $a1, $s1, 224 - add.d $a0, $a2, $a0 - st.d $a0, $s1, 232 + vld $vr0, $s2, 0 + ld.bu $a1, $sp, 88 + vreplgr2vr.d $vr1, $a0 + vadd.d $vr0, $vr0, $vr1 + vst $vr0, $s2, 0 ori $s7, $zero, 2 - beqz $a3, .LBB6_27 + beqz $a1, .LBB6_27 # %bb.13: # in Loop: Header=BB6_6 Depth=1 .Ltmp25: # EH_LABEL ori $a0, $zero, 128 @@ -2022,14 +2018,12 @@ _ZN8NArchive4NTar8CHandler6SkipToEj: # @_ZN8NArchive4NTar8CHandler6SkipToEj bnez $a0, .LBB18_3 # %bb.12: # in Loop: Header=BB18_7 Depth=1 ld.wu $a0, $fp, 216 - ld.d $a1, $fp, 224 - ld.d $a2, $fp, 232 - ld.bu $a3, $sp, 15 - add.d $a1, $a1, $a0 - st.d $a1, $fp, 224 - add.d $a0, $a2, $a0 - st.d $a0, $fp, 232 - bnez $a3, .LBB18_5 + vld $vr0, $fp, 224 + ld.bu $a1, $sp, 15 + vreplgr2vr.d $vr1, $a0 + vadd.d $vr0, $vr0, $vr1 + vst $vr0, $fp, 224 + bnez $a1, .LBB18_5 # %bb.13: ori $a0, $zero, 1 st.b $a0, $fp, 240 diff --git a/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/CPP/7zip/Common/LimitedStreams.s b/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/CPP/7zip/Common/LimitedStreams.s index c170734e..5f266f38 100644 --- a/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/CPP/7zip/Common/LimitedStreams.s +++ b/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/CPP/7zip/Common/LimitedStreams.s @@ -127,12 +127,10 @@ _ZN16CLimitedInStream4ReadEPvjPj: # @_ZN16CLimitedInStream4ReadEPvjPj # %bb.8: st.w $a1, $s0, 0 .LBB1_9: - ld.d $a2, $fp, 32 - ld.d $a3, $fp, 24 - add.d $a2, $a2, $a1 - st.d $a2, $fp, 32 - add.d $a1, $a3, $a1 - st.d $a1, $fp, 24 + vld $vr0, $fp, 24 + vreplgr2vr.d $vr1, $a1 + vadd.d $vr0, $vr0, $vr1 + vst $vr0, $fp, 24 b .LBB1_11 .LBB1_10: xor $a0, $a3, $a0 @@ -243,14 +241,12 @@ _ZN16CClusterInStream4ReadEPvjPj: # @_ZN16CClusterInStream4ReadEPvjPj # %bb.7: st.w $a1, $s0, 0 .LBB3_8: - ld.d $a2, $fp, 24 - ld.d $a3, $fp, 16 - add.d $a2, $a2, $a1 - ld.w $a4, $fp, 32 - st.d $a2, $fp, 24 - add.d $a2, $a3, $a1 - st.d $a2, $fp, 16 - sub.d $a1, $a4, $a1 + vld $vr0, $fp, 16 + ld.w $a2, $fp, 32 + vreplgr2vr.d $vr1, $a1 + vadd.d $vr0, $vr0, $vr1 + vst $vr0, $fp, 16 + sub.d $a1, $a2, $a1 st.w $a1, $fp, 32 b .LBB3_10 .LBB3_9: diff --git a/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/CPP/7zip/UI/Console/BenchCon.s b/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/CPP/7zip/UI/Console/BenchCon.s index 5bdb5a47..73e66733 100644 --- a/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/CPP/7zip/UI/Console/BenchCon.s +++ b/results/MultiSource/Benchmarks/7zip/CMakeFiles/7zip-benchmark.dir/CPP/7zip/UI/Console/BenchCon.s @@ -218,20 +218,18 @@ _ZN14CBenchCallback15SetDecodeResultERK10CBenchInfob: # @_ZN14CBenchCallback15Se ori $s2, $zero, 1 pcaddu18i $ra, %call36(fwrite) jirl $ra, $ra, 0 - vld $vr0, $s0, 16 ld.d $a0, $s0, 48 + vld $vr0, $s0, 16 vld $vr1, $s0, 0 - vst $vr0, $sp, 32 st.d $a0, $sp, 64 - ld.d $a1, $s0, 32 + vst $vr0, $sp, 32 + vld $vr0, $s0, 32 vst $vr1, $sp, 16 - ld.d $a2, $s0, 40 bstrpick.d $a0, $a0, 31, 0 - mul.d $a1, $a1, $a0 - st.d $a1, $sp, 48 - mul.d $a1, $a2, $a0 + vreplgr2vr.d $vr1, $a0 + vmul.d $vr0, $vr0, $vr1 ld.d $a0, $fp, 72 - st.d $a1, $sp, 56 + vst $vr0, $sp, 48 st.w $s2, $sp, 64 addi.d $a3, $fp, 40 addi.d $a1, $sp, 16 @@ -523,14 +521,12 @@ _Z12LzmaBenchConP8_IO_FILEjjj: # @_Z12LzmaBenchConP8_IO_FILEjjj ld.d $a0, $sp, 24 beqz $a0, .LBB3_20 # %bb.19: - ld.d $a1, $sp, 32 - ld.d $a2, $sp, 40 - div.du $a1, $a1, $a0 - ld.d $a3, $sp, 48 - st.d $a1, $sp, 32 - div.du $a1, $a2, $a0 - st.d $a1, $sp, 40 - div.du $a0, $a3, $a0 + vld $vr0, $sp, 32 + ld.d $a1, $sp, 48 + vreplgr2vr.d $vr1, $a0 + vdiv.du $vr0, $vr0, $vr1 + vst $vr0, $sp, 32 + div.du $a0, $a1, $a0 st.d $a0, $sp, 48 ori $a0, $zero, 1 st.d $a0, $sp, 24 @@ -538,14 +534,12 @@ _Z12LzmaBenchConP8_IO_FILEjjj: # @_Z12LzmaBenchConP8_IO_FILEjjj ld.d $a0, $sp, 56 beqz $a0, .LBB3_22 # %bb.21: - ld.d $a1, $sp, 64 - ld.d $a2, $sp, 72 - div.du $a1, $a1, $a0 - ld.d $a3, $sp, 80 - st.d $a1, $sp, 64 - div.du $a1, $a2, $a0 - st.d $a1, $sp, 72 - div.du $a0, $a3, $a0 + vld $vr0, $sp, 64 + ld.d $a1, $sp, 80 + vreplgr2vr.d $vr1, $a0 + vdiv.du $vr0, $vr0, $vr1 + vst $vr0, $sp, 64 + div.du $a0, $a1, $a0 st.d $a0, $sp, 80 ori $a0, $zero, 1 st.d $a0, $sp, 56 diff --git a/results/MultiSource/Benchmarks/Bullet/CMakeFiles/bullet.dir/btQuantizedBvh.s b/results/MultiSource/Benchmarks/Bullet/CMakeFiles/bullet.dir/btQuantizedBvh.s index b3e78ca8..f4c20995 100644 --- a/results/MultiSource/Benchmarks/Bullet/CMakeFiles/bullet.dir/btQuantizedBvh.s +++ b/results/MultiSource/Benchmarks/Bullet/CMakeFiles/bullet.dir/btQuantizedBvh.s @@ -1037,7 +1037,13 @@ _ZN14btQuantizedBvh30assignInternalNodeFromLeafNodeEii: # @_ZN14btQuantizedBvh30 .Lfunc_end9: .size _ZN14btQuantizedBvh30assignInternalNodeFromLeafNodeEii, .Lfunc_end9-_ZN14btQuantizedBvh30assignInternalNodeFromLeafNodeEii # -- End function - .globl _ZN14btQuantizedBvh17calcSplittingAxisEii # -- Begin function _ZN14btQuantizedBvh17calcSplittingAxisEii + .section .rodata.cst16,"aM",@progbits,16 + .p2align 4, 0x0 # -- Begin function _ZN14btQuantizedBvh17calcSplittingAxisEii +.LCPI10_0: + .dword 0 # 0x0 + .dword 32 # 0x20 + .text + .globl _ZN14btQuantizedBvh17calcSplittingAxisEii .p2align 5 .type _ZN14btQuantizedBvh17calcSplittingAxisEii,@function _ZN14btQuantizedBvh17calcSplittingAxisEii: # @_ZN14btQuantizedBvh17calcSplittingAxisEii @@ -1048,46 +1054,48 @@ _ZN14btQuantizedBvh17calcSplittingAxisEii: # @_ZN14btQuantizedBvh17calcSplitting # %bb.1: # %.lr.ph ld.bu $a6, $a0, 64 ld.d $a7, $a0, 88 + ld.d $a3, $a0, 40 ld.d $t0, $a0, 152 - fld.s $fa0, $a0, 40 - fld.s $fa1, $a0, 44 fld.s $fa2, $a0, 48 - fld.s $fa3, $a0, 8 - fld.s $fa4, $a0, 12 - fld.s $fa6, $a0, 16 + ld.d $t1, $a0, 8 + vinsgr2vr.d $vr3, $a3, 0 + fld.s $fa4, $a0, 44 + fld.s $fa5, $a0, 40 + vinsgr2vr.d $vr6, $t1, 0 + vreplvei.w $vr7, $vr6, 0 + vreplvei.w $vr10, $vr6, 1 + fld.s $ft3, $a0, 16 slli.d $a3, $a1, 6 add.d $a7, $a3, $a7 addi.d $a7, $a7, 16 alsl.d $t0, $a1, $t0, 4 addi.d $t0, $t0, 6 - movgr2fr.w $fa5, $zero - vldi $vr7, -1184 - fmov.s $ft0, $fa5 - fmov.s $ft1, $fa5 + vrepli.b $vr9, 0 + movgr2fr.w $ft0, $zero + pcalau12i $t1, %pc_hi20(.LCPI10_0) + vld $vr0, $t1, %pc_lo12(.LCPI10_0) + lu12i.w $t1, 258048 + vreplgr2vr.w $vr1, $t1 + vldi $vr12, -1184 b .LBB10_4 .p2align 4, , 16 .LBB10_2: # in Loop: Header=BB10_4 Depth=1 ld.d $t2, $a7, 0 - fld.s $ft2, $a7, 8 + fld.s $ft6, $a7, 8 ld.d $t1, $a7, -16 - fld.s $ft5, $a7, -8 - movgr2fr.w $ft3, $t2 - srli.d $t2, $t2, 32 - movgr2fr.w $ft4, $t2 + fld.s $ft7, $a7, -8 + vinsgr2vr.d $vr13, $t2, 0 .LBB10_3: # %_ZNK14btQuantizedBvh10getAabbMinEi.exit # in Loop: Header=BB10_4 Depth=1 - movgr2fr.w $ft6, $t1 - srli.d $t1, $t1, 32 - movgr2fr.w $ft7, $t1 - fadd.s $ft3, $ft3, $ft6 - fadd.s $ft4, $ft4, $ft7 - fadd.s $ft2, $ft2, $ft5 - fmul.s $ft3, $ft3, $fa7 - fmul.s $ft4, $ft4, $fa7 - fmul.s $ft2, $ft2, $fa7 - fadd.s $ft1, $ft1, $ft3 - fadd.s $ft0, $ft0, $ft4 - fadd.s $fa5, $fa5, $ft2 + fadd.s $ft6, $ft6, $ft7 + vreplgr2vr.d $vr15, $t1 + vsrl.d $vr15, $vr15, $vr0 + vshuf4i.w $vr15, $vr15, 8 + vfadd.s $vr13, $vr13, $vr15 + vfmul.s $vr13, $vr13, $vr1 + fmul.s $ft6, $ft6, $ft4 + vfadd.s $vr9, $vr9, $vr13 + fadd.s $ft0, $ft0, $ft6 addi.d $a5, $a5, -1 addi.d $a7, $a7, 64 addi.d $t0, $t0, 16 @@ -1096,54 +1104,55 @@ _ZN14btQuantizedBvh17calcSplittingAxisEii: # @_ZN14btQuantizedBvh17calcSplitting beqz $a6, .LBB10_2 # %bb.5: # in Loop: Header=BB10_4 Depth=1 ld.hu $t1, $t0, 0 - movgr2fr.w $ft2, $t1 - ld.hu $t1, $t0, 2 - ffint.s.w $ft2, $ft2 - fdiv.s $ft2, $ft2, $fa0 - ld.hu $t2, $t0, 4 - movgr2fr.w $ft3, $t1 - ffint.s.w $ft3, $ft3 - fdiv.s $ft4, $ft3, $fa1 - movgr2fr.w $ft3, $t2 - ffint.s.w $ft3, $ft3 - fdiv.s $ft5, $ft3, $fa2 - ld.hu $t1, $t0, -6 - fadd.s $ft3, $ft2, $fa3 - fadd.s $ft4, $ft4, $fa4 - fadd.s $ft2, $ft5, $fa6 + ld.hu $t2, $t0, 2 movgr2fr.w $ft5, $t1 - ld.hu $t1, $t0, -4 ffint.s.w $ft5, $ft5 - fdiv.s $ft5, $ft5, $fa0 - ld.hu $t2, $t0, -2 + movgr2fr.w $ft6, $t2 + ld.hu $t1, $t0, 4 + ffint.s.w $ft6, $ft6 + vextrins.w $vr13, $vr14, 16 + vfdiv.s $vr13, $vr13, $vr3 movgr2fr.w $ft6, $t1 ffint.s.w $ft6, $ft6 - fdiv.s $ft6, $ft6, $fa1 - movgr2fr.w $ft7, $t2 + ld.hu $t1, $t0, -6 + fdiv.s $ft6, $ft6, $fa2 + vfadd.s $vr13, $vr13, $vr6 + fadd.s $ft6, $ft6, $ft3 + movgr2fr.w $ft7, $t1 + ld.hu $t1, $t0, -4 ffint.s.w $ft7, $ft7 - fdiv.s $ft7, $ft7, $fa2 - fadd.s $ft8, $ft5, $fa3 - fadd.s $ft6, $ft6, $fa4 - fadd.s $ft5, $ft7, $fa6 - movfr2gr.s $t1, $ft8 - movfr2gr.s $t2, $ft6 + fdiv.s $ft7, $ft7, $fa5 + ld.hu $t2, $t0, -2 + movgr2fr.w $ft8, $t1 + ffint.s.w $ft8, $ft8 + fdiv.s $ft8, $ft8, $fa4 + movgr2fr.w $ft9, $t2 + ffint.s.w $ft9, $ft9 + fdiv.s $ft9, $ft9, $fa2 + fadd.s $ft10, $ft7, $fa7 + fadd.s $ft8, $ft8, $ft2 + fadd.s $ft7, $ft9, $ft3 + movfr2gr.s $t1, $ft10 + movfr2gr.s $t2, $ft8 bstrins.d $t1, $t2, 63, 32 b .LBB10_3 .LBB10_6: # %._crit_edge.thread movgr2fr.w $fa0, $a4 - movgr2fr.w $fa5, $zero - ffint.s.w $fa0, $fa0 - fmov.s $fa7, $fa5 - fmov.s $fa1, $fa5 + ffint.s.w $fa2, $fa0 + vrepli.b $vr5, 0 + movgr2fr.w $fa6, $zero .LBB10_7: # %._crit_edge203 - vldi $vr2, -1040 - fadd.s $fa0, $fa0, $fa2 + vldi $vr0, -1040 + fadd.s $fa0, $fa2, $fa0 frecip.s $fa0, $fa0 - fmul.s $fa2, $fa0, $fa5 - fmul.s $fa3, $fa0, $fa7 - fmul.s $fa0, $fa0, $fa1 - fcmp.clt.s $fcc0, $fa2, $fa3 - fsel $fa1, $fa2, $fa3, $fcc0 + vori.b $vr1, $vr0, 0 + vextrins.w $vr1, $vr0, 16 + vfmul.s $vr1, $vr1, $vr5 + fmul.s $fa0, $fa0, $fa6 + vreplvei.w $vr2, $vr1, 0 + vreplvei.w $vr1, $vr1, 1 + fcmp.clt.s $fcc0, $fa2, $fa1 + fsel $fa1, $fa2, $fa1, $fcc0 fcmp.clt.s $fcc1, $fa1, $fa0 movcf2gr $a0, $fcc0 movcf2gr $a1, $fcc1 @@ -1153,60 +1162,58 @@ _ZN14btQuantizedBvh17calcSplittingAxisEii: # @_ZN14btQuantizedBvh17calcSplitting or $a0, $a1, $a0 ret .LBB10_8: # %._crit_edge - movgr2fr.w $fa0, $a4 - ffint.s.w $fa0, $fa0 - frecip.s $fa1, $fa0 - fmul.s $fa2, $fa1, $ft1 - fmul.s $fa3, $fa1, $ft0 - fmul.s $fa4, $fa1, $fa5 + movgr2fr.w $fa2, $a4 + ffint.s.w $fa2, $fa2 + frecip.s $fa4, $fa2 + vori.b $vr3, $vr4, 0 + vextrins.w $vr3, $vr4, 16 + vfmul.s $vr3, $vr3, $vr9 + fmul.s $fa4, $fa4, $ft0 + ld.d $a5, $a0, 40 ld.bu $a4, $a0, 64 - ld.d $a5, $a0, 88 - ld.d $a6, $a0, 152 - fld.s $fa6, $a0, 40 - fld.s $ft0, $a0, 44 - fld.s $ft1, $a0, 48 - fld.s $ft2, $a0, 8 - fld.s $ft3, $a0, 12 - fld.s $ft4, $a0, 16 + ld.d $a6, $a0, 88 + ld.d $a7, $a0, 152 + vinsgr2vr.d $vr7, $a5, 0 + ld.d $a5, $a0, 8 + fld.s $ft0, $a0, 48 + vreplvei.w $vr9, $vr7, 0 + vreplvei.w $vr10, $vr7, 1 + vinsgr2vr.d $vr11, $a5, 0 + vreplvei.w $vr12, $vr11, 0 + vreplvei.w $vr13, $vr11, 1 + fld.s $ft6, $a0, 16 sub.d $a0, $a2, $a1 - add.d $a2, $a3, $a5 + add.d $a2, $a3, $a6 addi.d $a2, $a2, 16 - alsl.d $a1, $a1, $a6, 4 + alsl.d $a1, $a1, $a7, 4 addi.d $a1, $a1, 6 - movgr2fr.w $fa1, $zero - vldi $vr13, -1184 - fmov.s $fa7, $fa1 - fmov.s $fa5, $fa1 + vrepli.b $vr5, 0 + movgr2fr.w $fa6, $zero + vldi $vr15, -1184 b .LBB10_11 .p2align 4, , 16 .LBB10_9: # in Loop: Header=BB10_11 Depth=1 - ld.d $a5, $a2, 0 - fld.s $ft6, $a2, 8 + fld.s $ft8, $a2, 8 + vldrepl.d $vr17, $a2, 0 ld.d $a3, $a2, -16 - fld.s $ft9, $a2, -8 - movgr2fr.w $ft7, $a5 - srli.d $a5, $a5, 32 - movgr2fr.w $ft8, $a5 + fld.s $ft10, $a2, -8 + vsrl.d $vr17, $vr17, $vr0 + vshuf4i.w $vr17, $vr17, 8 .LBB10_10: # %_ZNK14btQuantizedBvh10getAabbMinEi.exit90 # in Loop: Header=BB10_11 Depth=1 - movgr2fr.w $ft10, $a3 - srli.d $a3, $a3, 32 - movgr2fr.w $ft11, $a3 - fadd.s $ft7, $ft7, $ft10 - fadd.s $ft8, $ft8, $ft11 - fadd.s $ft6, $ft6, $ft9 - fmul.s $ft7, $ft7, $ft5 - fmul.s $ft8, $ft8, $ft5 - fmul.s $ft6, $ft6, $ft5 - fsub.s $ft7, $ft7, $fa2 - fsub.s $ft8, $ft8, $fa3 - fsub.s $ft6, $ft6, $fa4 - fmul.s $ft7, $ft7, $ft7 + fadd.s $ft8, $ft8, $ft10 + fmul.s $ft8, $ft8, $ft7 + fsub.s $ft8, $ft8, $fa4 + vreplgr2vr.d $vr18, $a3 + vsrl.d $vr18, $vr18, $vr0 + vshuf4i.w $vr18, $vr18, 8 + vfadd.s $vr17, $vr17, $vr18 + vfmul.s $vr17, $vr17, $vr1 + vfsub.s $vr17, $vr17, $vr3 + vfmul.s $vr17, $vr17, $vr17 fmul.s $ft8, $ft8, $ft8 - fmul.s $ft6, $ft6, $ft6 - fadd.s $fa5, $fa5, $ft7 - fadd.s $fa7, $fa7, $ft8 - fadd.s $fa1, $fa1, $ft6 + vfadd.s $vr5, $vr5, $vr17 + fadd.s $fa6, $fa6, $ft8 addi.d $a0, $a0, -1 addi.d $a2, $a2, 64 addi.d $a1, $a1, 16 @@ -1214,38 +1221,41 @@ _ZN14btQuantizedBvh17calcSplittingAxisEii: # @_ZN14btQuantizedBvh17calcSplitting .LBB10_11: # =>This Inner Loop Header: Depth=1 beqz $a4, .LBB10_9 # %bb.12: # in Loop: Header=BB10_11 Depth=1 - ld.hu $a3, $a1, 0 - movgr2fr.w $ft6, $a3 - ld.hu $a3, $a1, 2 - ffint.s.w $ft6, $ft6 - fdiv.s $ft6, $ft6, $fa6 - ld.hu $a5, $a1, 4 - movgr2fr.w $ft7, $a3 - ffint.s.w $ft7, $ft7 - fdiv.s $ft8, $ft7, $ft0 - movgr2fr.w $ft7, $a5 - ffint.s.w $ft7, $ft7 - fdiv.s $ft9, $ft7, $ft1 - ld.hu $a3, $a1, -6 - fadd.s $ft7, $ft6, $ft2 - fadd.s $ft8, $ft8, $ft3 - fadd.s $ft6, $ft9, $ft4 + ld.w $a3, $a1, 0 + vinsgr2vr.w $vr16, $a3, 0 + vpickve2gr.h $a3, $vr16, 1 + bstrpick.d $a3, $a3, 15, 0 movgr2fr.w $ft9, $a3 - ld.hu $a3, $a1, -4 ffint.s.w $ft9, $ft9 - fdiv.s $ft9, $ft9, $fa6 - ld.hu $a5, $a1, -2 + vpickve2gr.h $a3, $vr16, 0 + bstrpick.d $a3, $a3, 15, 0 + movgr2fr.w $ft8, $a3 + ld.hu $a3, $a1, 4 + ffint.s.w $ft8, $ft8 + vextrins.w $vr16, $vr17, 16 + vfdiv.s $vr16, $vr16, $vr7 + movgr2fr.w $ft9, $a3 + ffint.s.w $ft9, $ft9 + ld.hu $a3, $a1, -6 + fdiv.s $ft10, $ft9, $ft0 + vfadd.s $vr17, $vr16, $vr11 + fadd.s $ft8, $ft10, $ft6 movgr2fr.w $ft10, $a3 + ld.hu $a3, $a1, -4 ffint.s.w $ft10, $ft10 - fdiv.s $ft10, $ft10, $ft0 - movgr2fr.w $ft11, $a5 + fdiv.s $ft10, $ft10, $ft1 + ld.hu $a5, $a1, -2 + movgr2fr.w $ft11, $a3 ffint.s.w $ft11, $ft11 - fdiv.s $ft11, $ft11, $ft1 - fadd.s $ft12, $ft9, $ft2 - fadd.s $ft10, $ft10, $ft3 - fadd.s $ft9, $ft11, $ft4 - movfr2gr.s $a3, $ft12 - movfr2gr.s $a5, $ft10 + fdiv.s $ft11, $ft11, $ft2 + movgr2fr.w $ft12, $a5 + ffint.s.w $ft12, $ft12 + fdiv.s $ft12, $ft12, $ft0 + fadd.s $ft13, $ft10, $ft4 + fadd.s $ft11, $ft11, $ft5 + fadd.s $ft10, $ft12, $ft6 + movfr2gr.s $a3, $ft13 + movfr2gr.s $a5, $ft11 bstrins.d $a3, $a5, 63, 32 b .LBB10_10 .Lfunc_end10: diff --git a/results/MultiSource/Benchmarks/Bullet/CMakeFiles/bullet.dir/btSoftBody.s b/results/MultiSource/Benchmarks/Bullet/CMakeFiles/bullet.dir/btSoftBody.s index 6ad4169f..cbcf9d52 100644 --- a/results/MultiSource/Benchmarks/Bullet/CMakeFiles/bullet.dir/btSoftBody.s +++ b/results/MultiSource/Benchmarks/Bullet/CMakeFiles/bullet.dir/btSoftBody.s @@ -4304,26 +4304,23 @@ _ZN10btSoftBody17pointersToIndicesEv: # @_ZN10btSoftBody17pointersToIndicesEv b .LBB32_2 .LBB32_5: # %._crit_edge ld.w $a3, $a0, 852 + vreplgr2vr.d $vr0, $a1 lu12i.w $a2, -69906 blez $a3, .LBB32_8 # %bb.6: # %.lr.ph51 ld.d $a4, $a0, 864 - addi.d $a4, $a4, 24 + addi.d $a4, $a4, 16 ori $a5, $a2, 3823 lu32i.d $a5, -69906 lu52i.d $a5, $a5, -274 + vreplgr2vr.d $vr1, $a5 .p2align 4, , 16 .LBB32_7: # =>This Inner Loop Header: Depth=1 - ld.d $a6, $a4, -8 - sub.d $a6, $a6, $a1 - ld.d $a7, $a4, 0 - srai.d $a6, $a6, 3 - mul.d $a6, $a6, $a5 - st.d $a6, $a4, -8 - sub.d $a6, $a7, $a1 - srai.d $a6, $a6, 3 - mul.d $a6, $a6, $a5 - st.d $a6, $a4, 0 + vld $vr2, $a4, 0 + vsub.d $vr2, $vr2, $vr0 + vsrai.d $vr2, $vr2, 3 + vmul.d $vr2, $vr2, $vr1 + vst $vr2, $a4, 0 addi.d $a3, $a3, -1 addi.d $a4, $a4, 72 bnez $a3, .LBB32_7 @@ -4339,6 +4336,7 @@ _ZN10btSoftBody17pointersToIndicesEv: # @_ZN10btSoftBody17pointersToIndicesEv ori $a6, $a2, 3823 lu32i.d $a6, -69906 lu52i.d $a6, $a6, -274 + vreplgr2vr.d $vr1, $a6 b .LBB32_11 .p2align 4, , 16 .LBB32_10: # in Loop: Header=BB32_11 Depth=1 @@ -4349,23 +4347,18 @@ _ZN10btSoftBody17pointersToIndicesEv: # @_ZN10btSoftBody17pointersToIndicesEv .LBB32_11: # =>This Inner Loop Header: Depth=1 ld.d $a7, $a0, 896 add.d $t0, $a7, $a3 - ld.d $t1, $t0, 16 + vld $vr2, $t0, 16 + vsub.d $vr2, $vr2, $vr0 + vsrai.d $vr2, $vr2, 3 + ld.d $t1, $t0, 32 + vmul.d $vr2, $vr2, $vr1 + vst $vr2, $t0, 16 + ld.d $t2, $t0, 64 sub.d $t1, $t1, $a1 - ld.d $t2, $t0, 24 - srai.d $t1, $t1, 3 - mul.d $t1, $t1, $a6 - st.d $t1, $t0, 16 - sub.d $t1, $t2, $a1 srai.d $t1, $t1, 3 - ld.d $t2, $t0, 32 mul.d $t1, $t1, $a6 - st.d $t1, $t0, 24 - ld.d $t1, $t0, 64 - sub.d $t2, $t2, $a1 - srai.d $t2, $t2, 3 - mul.d $t2, $t2, $a6 - st.d $t2, $t0, 32 - beqz $t1, .LBB32_10 + st.d $t1, $t0, 32 + beqz $t2, .LBB32_10 # %bb.12: # in Loop: Header=BB32_11 Depth=1 ld.d $t0, $sp, 8 addi.w $t1, $t0, 0 @@ -4436,7 +4429,6 @@ _ZN10btSoftBody17pointersToIndicesEv: # @_ZN10btSoftBody17pointersToIndicesEv # %bb.22: # %.preheader.lr.ph ld.d $a0, $a0, 800 move $a4, $zero - vreplgr2vr.d $vr0, $a1 addi.d $a5, $a0, 56 addi.d $a6, $a0, 40 ori $a7, $zero, 88 @@ -15090,78 +15082,69 @@ _ZN10btSoftBody6refineEPNS_10ImplicitFnEfb: # @_ZN10btSoftBody6refineEPNS_10Impl ld.w $a0, $s8, 884 blez $a0, .LBB91_23 # %bb.21: # %.lr.ph551 - ld.d $a1, $s8, 896 move $a0, $zero - addi.d $a1, $a1, 32 - lu12i.w $a2, -69906 - ori $a2, $a2, 3823 + ld.d $a2, $s8, 896 + vreplgr2vr.d $vr0, $s6 + lu12i.w $a1, -69906 + ori $a1, $a1, 3823 + addi.d $a2, $a2, 32 + move $a3, $a1 + lu32i.d $a3, -69906 + lu52i.d $a3, $a3, -274 + vreplgr2vr.d $vr1, $a3 addi.w $a3, $zero, -1 lu32i.d $a3, 0 .p2align 4, , 16 .LBB91_22: # =>This Inner Loop Header: Depth=1 - ld.d $a4, $a1, -16 - ld.d $a5, $a1, -8 + ld.d $a4, $a2, -8 + ld.d $a5, $a2, -16 sub.d $a4, $a4, $s6 + ld.d $a6, $a2, 0 srli.d $a4, $a4, 3 - mul.w $a4, $a4, $a2 - sub.d $a5, $a5, $s6 - srli.d $a5, $a5, 3 - mul.w $a5, $a5, $a2 - slt $a6, $a4, $a5 - masknez $a7, $a5, $a6 - maskeqz $a6, $a4, $a6 + mul.d $a4, $a4, $a1 + vinsgr2vr.d $vr2, $a5, 0 + vinsgr2vr.d $vr2, $a6, 1 + vsub.d $vr2, $vr2, $vr0 + vsrli.d $vr2, $vr2, 3 + vmul.d $vr2, $vr2, $vr1 + vshuf4i.w $vr3, $vr2, 8 + vinsgr2vr.w $vr4, $a4, 0 + vinsgr2vr.w $vr4, $a4, 1 + vmin.w $vr5, $vr4, $vr3 + vmax.w $vr3, $vr4, $vr3 + vaddi.wu $vr4, $vr3, 1 + vmul.w $vr6, $vr4, $vr3 + vsrli.w $vr6, $vr6, 31 + vmadd.w $vr6, $vr4, $vr3 + vsrai.w $vr3, $vr6, 1 + vadd.w $vr3, $vr3, $vr5 + vpickve2gr.w $a4, $vr3, 0 + slli.d $a4, $a4, 2 + stx.w $a3, $s7, $a4 + vpickve2gr.w $a4, $vr3, 1 + slli.d $a4, $a4, 2 + stx.w $a3, $s7, $a4 + vpickve2gr.w $a4, $vr2, 0 + vpickve2gr.w $a5, $vr2, 2 + slt $a6, $a5, $a4 + masknez $a7, $a4, $a6 + maskeqz $a6, $a5, $a6 or $a6, $a6, $a7 - slt $a7, $a5, $a4 - masknez $t0, $a5, $a7 - maskeqz $a7, $a4, $a7 - or $a7, $a7, $t0 - addi.w $t0, $a7, 1 - mul.d $a7, $t0, $a7 - bstrpick.d $t0, $a7, 31, 31 - add.w $a7, $a7, $t0 - srli.d $a7, $a7, 1 - add.w $a6, $a7, $a6 - slli.d $a6, $a6, 2 - stx.w $a3, $s7, $a6 - ld.d $a6, $a1, 0 - sub.d $a6, $a6, $s6 - srli.d $a6, $a6, 3 - mul.w $a6, $a6, $a2 - slt $a7, $a5, $a6 - masknez $t0, $a6, $a7 - maskeqz $a7, $a5, $a7 - or $a7, $a7, $t0 - slt $t0, $a6, $a5 - masknez $t1, $a6, $t0 - maskeqz $a5, $a5, $t0 - or $a5, $a5, $t1 - addi.w $t0, $a5, 1 - mul.d $a5, $t0, $a5 - bstrpick.d $t0, $a5, 31, 31 - add.w $a5, $a5, $t0 - srli.d $a5, $a5, 1 - add.w $a5, $a5, $a7 - slli.d $a5, $a5, 2 - stx.w $a3, $s7, $a5 - slt $a5, $a6, $a4 - masknez $a7, $a4, $a5 - maskeqz $a5, $a6, $a5 - or $a5, $a5, $a7 - slt $a7, $a4, $a6 + slt $a7, $a4, $a5 masknez $a4, $a4, $a7 - maskeqz $a6, $a6, $a7 - or $a4, $a6, $a4 - addi.w $a6, $a4, 1 - mul.d $a4, $a6, $a4 - bstrpick.d $a6, $a4, 31, 31 - add.w $a4, $a4, $a6 - srli.d $a4, $a4, 1 + maskeqz $a5, $a5, $a7 + or $a4, $a5, $a4 + addi.w $a5, $a4, 1 + mul.d $a4, $a5, $a4 + bstrpick.d $a5, $a4, 31, 31 add.w $a4, $a4, $a5 + srli.d $a4, $a4, 1 + add.w $a4, $a4, $a6 slli.d $a4, $a4, 2 stx.w $a3, $s7, $a4 ld.w $a4, $s8, 884 addi.d $a0, $a0, 1 - addi.d $a1, $a1, 72 + addi.d $a2, $a2, 72 blt $a0, $a4, .LBB91_22 .LBB91_23: # %.preheader531 st.d $s8, $sp, 40 # 8-byte Folded Spill @@ -15744,7 +15727,7 @@ _ZN10btSoftBody6refineEPNS_10ImplicitFnEfb: # @_ZN10btSoftBody6refineEPNS_10Impl .LBB91_82: # %._crit_edge566 ld.d $s8, $sp, 40 # 8-byte Folded Reload ld.d $a0, $sp, 8 # 8-byte Folded Reload - beqz $a0, .LBB91_146 + beqz $a0, .LBB91_147 # %bb.83: ld.w $fp, $s8, 820 blez $fp, .LBB91_95 @@ -15834,13 +15817,13 @@ _ZN10btSoftBody6refineEPNS_10ImplicitFnEfb: # @_ZN10btSoftBody6refineEPNS_10Impl move $s2, $zero .LBB91_96: # %._crit_edge569 ld.w $a0, $s8, 852 - ld.d $s5, $s8, 832 + ld.d $s4, $s8, 832 blez $a0, .LBB91_112 # %bb.97: # %.lr.ph574 move $fp, $zero move $s3, $zero slli.d $a1, $a0, 6 - alsl.d $s4, $a0, $a1, 3 + alsl.d $s5, $a0, $a1, 3 lu12i.w $a0, -69906 ori $a0, $a0, 3823 lu32i.d $a0, -69906 @@ -15855,19 +15838,19 @@ _ZN10btSoftBody6refineEPNS_10ImplicitFnEfb: # @_ZN10btSoftBody6refineEPNS_10Impl # in Loop: Header=BB91_99 Depth=1 addi.d $fp, $fp, 72 addi.w $s3, $s3, 1 - beq $s4, $fp, .LBB91_112 + beq $s5, $fp, .LBB91_112 .LBB91_99: # =>This Inner Loop Header: Depth=1 ld.d $a0, $sp, 40 # 8-byte Folded Reload ld.d $a0, $a0, 864 add.d $a1, $a0, $fp ld.d $a0, $a1, 16 - sub.d $a0, $a0, $s5 + sub.d $a0, $a0, $s4 srli.d $a0, $a0, 3 mul.w $a0, $a0, $s6 ld.d $a1, $a1, 24 slli.d $a2, $a0, 2 ldx.w $a2, $s2, $a2 - sub.d $a1, $a1, $s5 + sub.d $a1, $a1, $s4 srai.d $a1, $a1, 3 mul.w $s0, $a1, $s6 beqz $a2, .LBB91_103 @@ -15932,7 +15915,7 @@ _ZN10btSoftBody6refineEPNS_10ImplicitFnEfb: # @_ZN10btSoftBody6refineEPNS_10Impl alsl.d $a0, $a0, $a2, 3 add.d $a0, $a1, $a0 ld.d $a1, $a0, 16 - sub.d $a1, $a1, $s5 + sub.d $a1, $a1, $s4 srli.d $a1, $a1, 3 mul.d $a1, $a1, $s8 srai.d $a1, $a1, 30 @@ -15945,7 +15928,7 @@ _ZN10btSoftBody6refineEPNS_10ImplicitFnEfb: # @_ZN10btSoftBody6refineEPNS_10Impl st.d $a1, $a0, 16 .LBB91_110: # in Loop: Header=BB91_99 Depth=1 ld.d $a1, $a0, 24 - sub.d $a1, $a1, $s5 + sub.d $a1, $a1, $s4 srli.d $a1, $a1, 3 mul.d $a1, $a1, $s8 srai.d $a1, $a1, 30 @@ -15968,7 +15951,7 @@ _ZN10btSoftBody6refineEPNS_10ImplicitFnEfb: # @_ZN10btSoftBody6refineEPNS_10Impl alsl.d $s3, $a0, $a1, 3 ori $a0, $zero, 0 lu32i.d $a0, -69905 - lu52i.d $s4, $a0, -274 + lu52i.d $s5, $a0, -274 ori $s6, $zero, 120 b .LBB91_115 .p2align 4, , 16 @@ -16020,9 +16003,9 @@ _ZN10btSoftBody6refineEPNS_10ImplicitFnEfb: # @_ZN10btSoftBody6refineEPNS_10Impl # %bb.121: # %.preheader526.preheader # in Loop: Header=BB91_115 Depth=1 ld.d $a0, $s0, 16 - sub.d $a0, $a0, $s5 + sub.d $a0, $a0, $s4 srli.d $a0, $a0, 3 - mul.d $a0, $a0, $s4 + mul.d $a0, $a0, $s5 srai.d $a0, $a0, 30 ldx.w $a0, $s2, $a0 beqz $a0, .LBB91_123 @@ -16034,9 +16017,9 @@ _ZN10btSoftBody6refineEPNS_10ImplicitFnEfb: # @_ZN10btSoftBody6refineEPNS_10Impl .LBB91_123: # %.preheader526.1 # in Loop: Header=BB91_115 Depth=1 ld.d $a0, $s0, 24 - sub.d $a0, $a0, $s5 + sub.d $a0, $a0, $s4 srli.d $a0, $a0, 3 - mul.d $a0, $a0, $s4 + mul.d $a0, $a0, $s5 srai.d $a0, $a0, 30 ldx.w $a0, $s2, $a0 beqz $a0, .LBB91_125 @@ -16048,9 +16031,9 @@ _ZN10btSoftBody6refineEPNS_10ImplicitFnEfb: # @_ZN10btSoftBody6refineEPNS_10Impl .LBB91_125: # %.preheader526.2 # in Loop: Header=BB91_115 Depth=1 ld.d $a0, $s0, 32 - sub.d $a0, $a0, $s5 + sub.d $a0, $a0, $s4 srli.d $a0, $a0, 3 - mul.d $a0, $a0, $s4 + mul.d $a0, $a0, $s5 srai.d $a0, $a0, 30 ldx.w $a0, $s2, $a0 beqz $a0, .LBB91_114 @@ -16077,71 +16060,68 @@ _ZN10btSoftBody6refineEPNS_10ImplicitFnEfb: # @_ZN10btSoftBody6refineEPNS_10Impl move $a2, $s3 pcaddu18i $ra, %call36(memset) jirl $ra, $ra, 0 - ld.w $a0, $s8, 852 - bgtz $a0, .LBB91_131 - b .LBB91_133 + b .LBB91_131 .LBB91_130: move $s1, $zero +.LBB91_131: # %_ZN20btAlignedObjectArrayIiE6resizeEiRKi.exit382 + ld.d $s7, $sp, 48 # 8-byte Folded Reload ld.w $a0, $s8, 852 - blez $a0, .LBB91_133 -.LBB91_131: # %.preheader525.lr.ph + vreplgr2vr.d $vr2, $s4 + blez $a0, .LBB91_134 +# %bb.132: # %.preheader525.lr.ph ld.d $a1, $s8, 864 - addi.d $a1, $a1, 24 + addi.d $a1, $a1, 16 ori $a2, $zero, 0 lu32i.d $a2, -69905 lu52i.d $a2, $a2, -274 + vreplgr2vr.d $vr0, $a2 .p2align 4, , 16 -.LBB91_132: # %.preheader525 +.LBB91_133: # %.preheader525 # =>This Inner Loop Header: Depth=1 - ld.d $a3, $a1, -8 - sub.d $a3, $a3, $s5 - srli.d $a3, $a3, 3 - mul.d $a3, $a3, $a2 - srai.d $a3, $a3, 30 - ldx.w $a4, $s1, $a3 - addi.d $a4, $a4, 1 - stx.w $a4, $s1, $a3 - ld.d $a3, $a1, 0 - sub.d $a3, $a3, $s5 - srli.d $a3, $a3, 3 - mul.d $a3, $a3, $a2 - srai.d $a3, $a3, 30 - ldx.w $a4, $s1, $a3 - addi.d $a4, $a4, 1 - stx.w $a4, $s1, $a3 + vld $vr1, $a1, 0 + vsub.d $vr1, $vr1, $vr2 + vsrli.d $vr1, $vr1, 3 + vmul.d $vr1, $vr1, $vr0 + vsrai.d $vr1, $vr1, 30 + vpickve2gr.d $a2, $vr1, 0 + ldx.w $a3, $s1, $a2 + addi.d $a3, $a3, 1 + stx.w $a3, $s1, $a2 + vpickve2gr.d $a2, $vr1, 1 + ldx.w $a3, $s1, $a2 + addi.d $a3, $a3, 1 + stx.w $a3, $s1, $a2 addi.d $a0, $a0, -1 addi.d $a1, $a1, 72 - bnez $a0, .LBB91_132 -.LBB91_133: # %._crit_edge583 + bnez $a0, .LBB91_133 +.LBB91_134: # %._crit_edge583 ld.w $a0, $s8, 884 - blez $a0, .LBB91_136 -# %bb.134: # %.preheader524.lr.ph + blez $a0, .LBB91_137 +# %bb.135: # %.preheader524.lr.ph ld.d $a1, $s8, 896 addi.d $a1, $a1, 32 ori $a2, $zero, 0 lu32i.d $a2, -69905 lu52i.d $a2, $a2, -274 + vreplgr2vr.d $vr0, $a2 .p2align 4, , 16 -.LBB91_135: # %.preheader524 +.LBB91_136: # %.preheader524 # =>This Inner Loop Header: Depth=1 - ld.d $a3, $a1, -16 - sub.d $a3, $a3, $s5 - srli.d $a3, $a3, 3 - mul.d $a3, $a3, $a2 - srai.d $a3, $a3, 30 + vld $vr1, $a1, -16 + vsub.d $vr1, $vr1, $vr2 + vsrli.d $vr1, $vr1, 3 + vmul.d $vr1, $vr1, $vr0 + vsrai.d $vr1, $vr1, 30 + vpickve2gr.d $a3, $vr1, 0 ldx.w $a4, $s1, $a3 addi.d $a4, $a4, 1 stx.w $a4, $s1, $a3 - ld.d $a3, $a1, -8 - sub.d $a3, $a3, $s5 - srli.d $a3, $a3, 3 - mul.d $a3, $a3, $a2 - srai.d $a3, $a3, 30 + vpickve2gr.d $a3, $vr1, 1 ldx.w $a4, $s1, $a3 addi.d $a4, $a4, 1 stx.w $a4, $s1, $a3 ld.d $a3, $a1, 0 - sub.d $a3, $a3, $s5 + sub.d $a3, $a3, $s4 srli.d $a3, $a3, 3 mul.d $a3, $a3, $a2 srai.d $a3, $a3, 30 @@ -16150,27 +16130,30 @@ _ZN10btSoftBody6refineEPNS_10ImplicitFnEfb: # @_ZN10btSoftBody6refineEPNS_10Impl stx.w $a4, $s1, $a3 addi.d $a0, $a0, -1 addi.d $a1, $a1, 72 - bnez $a0, .LBB91_135 -.LBB91_136: # %.preheader + bnez $a0, .LBB91_136 +.LBB91_137: # %.preheader ld.w $a0, $s8, 852 - blez $a0, .LBB91_142 -# %bb.137: # %.lr.ph587 + blez $a0, .LBB91_143 +# %bb.138: # %.lr.ph587 move $fp, $zero ori $a1, $zero, 0 lu32i.d $a1, -69905 - lu52i.d $s0, $a1, -274 - ori $s6, $zero, 1 - ori $s7, $zero, 72 - b .LBB91_140 + lu52i.d $a1, $a1, -274 + vreplgr2vr.d $vr1, $a1 + ori $s0, $zero, 1 + ori $s5, $zero, 72 + vst $vr2, $sp, 80 # 16-byte Folded Spill + vst $vr1, $sp, 64 # 16-byte Folded Spill + b .LBB91_141 .p2align 4, , 16 -.LBB91_138: # in Loop: Header=BB91_140 Depth=1 - addi.d $a0, $a3, -1 - stx.w $a0, $s1, $a2 - ldx.w $a0, $s1, $a4 +.LBB91_139: # in Loop: Header=BB91_141 Depth=1 + addi.d $a0, $a4, -1 + stx.w $a0, $s1, $a3 + ldx.w $a0, $s1, $a2 addi.d $a0, $a0, -1 - stx.w $a0, $s1, $a4 + stx.w $a0, $s1, $a2 ld.w $a0, $s8, 852 - mul.d $a0, $a0, $s7 + mul.d $a0, $a0, $s5 add.d $a0, $a1, $a0 addi.d $s4, $a0, -72 addi.d $a0, $sp, 128 @@ -16188,60 +16171,58 @@ _ZN10btSoftBody6refineEPNS_10ImplicitFnEfb: # @_ZN10btSoftBody6refineEPNS_10Impl move $a0, $s4 pcaddu18i $ra, %call36(memcpy) jirl $ra, $ra, 0 + vld $vr1, $sp, 64 # 16-byte Folded Reload + vld $vr2, $sp, 80 # 16-byte Folded Reload ld.w $a0, $s8, 852 addi.w $a0, $a0, -1 st.w $a0, $s8, 852 addi.d $fp, $fp, -1 -.LBB91_139: # in Loop: Header=BB91_140 Depth=1 +.LBB91_140: # in Loop: Header=BB91_141 Depth=1 addi.w $fp, $fp, 1 - bge $fp, $a0, .LBB91_143 -.LBB91_140: # =>This Inner Loop Header: Depth=1 + bge $fp, $a0, .LBB91_144 +.LBB91_141: # =>This Inner Loop Header: Depth=1 ld.d $a1, $s8, 864 slli.d $a2, $fp, 6 alsl.d $a2, $fp, $a2, 3 add.d $s3, $a1, $a2 - ld.d $a2, $s3, 16 - sub.d $a2, $a2, $s5 - srli.d $a2, $a2, 3 - ld.d $a4, $s3, 24 - mul.d $a2, $a2, $s0 - srai.d $a2, $a2, 30 - ldx.w $a3, $s1, $a2 - sub.d $a4, $a4, $s5 - srli.d $a4, $a4, 3 - mul.d $a4, $a4, $s0 - srai.d $a4, $a4, 30 - beq $a3, $s6, .LBB91_138 -# %bb.141: # in Loop: Header=BB91_140 Depth=1 - ldx.w $a5, $s1, $a4 - bne $a5, $s6, .LBB91_139 - b .LBB91_138 -.LBB91_142: # %_ZN20btAlignedObjectArrayIiED2Ev.exit388 - beqz $s1, .LBB91_144 -.LBB91_143: # %_ZN20btAlignedObjectArrayIiED2Ev.exit388.thread + vld $vr0, $s3, 16 + vsub.d $vr0, $vr0, $vr2 + vsrli.d $vr0, $vr0, 3 + vmul.d $vr0, $vr0, $vr1 + vsrai.d $vr0, $vr0, 30 + vpickve2gr.d $a3, $vr0, 0 + ldx.w $a4, $s1, $a3 + vpickve2gr.d $a2, $vr0, 1 + beq $a4, $s0, .LBB91_139 +# %bb.142: # in Loop: Header=BB91_141 Depth=1 + ldx.w $a5, $s1, $a2 + bne $a5, $s0, .LBB91_140 + b .LBB91_139 +.LBB91_143: # %_ZN20btAlignedObjectArrayIiED2Ev.exit388 + beqz $s1, .LBB91_145 +.LBB91_144: # %_ZN20btAlignedObjectArrayIiED2Ev.exit388.thread .Ltmp364: # EH_LABEL move $a0, $s1 pcaddu18i $ra, %call36(_Z21btAlignedFreeInternalPv) jirl $ra, $ra, 0 .Ltmp365: # EH_LABEL -.LBB91_144: # %_ZN20btAlignedObjectArrayIiED2Ev.exit392 - ld.d $s7, $sp, 48 # 8-byte Folded Reload - beqz $s2, .LBB91_146 -# %bb.145: +.LBB91_145: # %_ZN20btAlignedObjectArrayIiED2Ev.exit392 + beqz $s2, .LBB91_147 +# %bb.146: .Ltmp369: # EH_LABEL move $a0, $s2 pcaddu18i $ra, %call36(_Z21btAlignedFreeInternalPv) jirl $ra, $ra, 0 .Ltmp370: # EH_LABEL -.LBB91_146: # %_ZN20btAlignedObjectArrayIiED2Ev.exit396 +.LBB91_147: # %_ZN20btAlignedObjectArrayIiED2Ev.exit396 ori $a0, $zero, 1 st.b $a0, $s8, 1140 - beqz $s7, .LBB91_148 -# %bb.147: + beqz $s7, .LBB91_149 +# %bb.148: move $a0, $s7 pcaddu18i $ra, %call36(_Z21btAlignedFreeInternalPv) jirl $ra, $ra, 0 -.LBB91_148: # %_ZN11btSymMatrixIiED2Ev.exit +.LBB91_149: # %_ZN11btSymMatrixIiED2Ev.exit fld.d $fs7, $sp, 200 # 8-byte Folded Reload fld.d $fs6, $sp, 208 # 8-byte Folded Reload fld.d $fs5, $sp, 216 # 8-byte Folded Reload @@ -16263,86 +16244,86 @@ _ZN10btSoftBody6refineEPNS_10ImplicitFnEfb: # @_ZN10btSoftBody6refineEPNS_10Impl ld.d $ra, $sp, 344 # 8-byte Folded Reload addi.d $sp, $sp, 352 ret -.LBB91_149: +.LBB91_150: .Ltmp371: # EH_LABEL - b .LBB91_159 -.LBB91_150: # %_ZN20btAlignedObjectArrayIiED2Ev.exit400 + b .LBB91_160 +.LBB91_151: # %_ZN20btAlignedObjectArrayIiED2Ev.exit400 .Ltmp363: # EH_LABEL - b .LBB91_164 -.LBB91_151: # %.thread506 + b .LBB91_165 +.LBB91_152: # %.thread506 .Ltmp340: # EH_LABEL - b .LBB91_159 -.LBB91_152: -.Ltmp366: # EH_LABEL - b .LBB91_164 + b .LBB91_160 .LBB91_153: -.Ltmp343: # EH_LABEL - b .LBB91_164 +.Ltmp366: # EH_LABEL + b .LBB91_165 .LBB91_154: -.Ltmp331: # EH_LABEL - b .LBB91_167 +.Ltmp343: # EH_LABEL + b .LBB91_165 .LBB91_155: -.Ltmp346: # EH_LABEL - b .LBB91_164 +.Ltmp331: # EH_LABEL + b .LBB91_168 .LBB91_156: -.Ltmp328: # EH_LABEL - b .LBB91_169 +.Ltmp346: # EH_LABEL + b .LBB91_165 .LBB91_157: -.Ltmp314: # EH_LABEL - b .LBB91_159 +.Ltmp328: # EH_LABEL + b .LBB91_170 .LBB91_158: +.Ltmp314: # EH_LABEL + b .LBB91_160 +.LBB91_159: .Ltmp317: # EH_LABEL -.LBB91_159: # %_ZN20btAlignedObjectArrayIiED2Ev.exit404 +.LBB91_160: # %_ZN20btAlignedObjectArrayIiED2Ev.exit404 move $s0, $a0 - b .LBB91_170 -.LBB91_160: # %.thread499 + b .LBB91_171 +.LBB91_161: # %.thread499 .Ltmp353: # EH_LABEL move $s0, $a0 ld.d $s7, $sp, 48 # 8-byte Folded Reload - b .LBB91_165 -.LBB91_161: -.Ltmp334: # EH_LABEL - b .LBB91_167 + b .LBB91_166 .LBB91_162: -.Ltmp337: # EH_LABEL - b .LBB91_167 +.Ltmp334: # EH_LABEL + b .LBB91_168 .LBB91_163: -.Ltmp360: # EH_LABEL +.Ltmp337: # EH_LABEL + b .LBB91_168 .LBB91_164: +.Ltmp360: # EH_LABEL +.LBB91_165: move $s0, $a0 ld.d $s7, $sp, 48 # 8-byte Folded Reload - beqz $s2, .LBB91_170 -.LBB91_165: + beqz $s2, .LBB91_171 +.LBB91_166: .Ltmp367: # EH_LABEL move $a0, $s2 pcaddu18i $ra, %call36(_Z21btAlignedFreeInternalPv) jirl $ra, $ra, 0 .Ltmp368: # EH_LABEL - b .LBB91_170 -.LBB91_166: # %_ZN20btAlignedObjectArrayIiED2Ev.exit404.thread707 + b .LBB91_171 +.LBB91_167: # %_ZN20btAlignedObjectArrayIiED2Ev.exit404.thread707 .Ltmp322: # EH_LABEL -.LBB91_167: # %_ZN20btAlignedObjectArrayIiED2Ev.exit404.thread +.LBB91_168: # %_ZN20btAlignedObjectArrayIiED2Ev.exit404.thread move $s0, $a0 ld.d $s7, $sp, 48 # 8-byte Folded Reload - b .LBB91_171 -.LBB91_168: # %.loopexit529 + b .LBB91_172 +.LBB91_169: # %.loopexit529 .Ltmp325: # EH_LABEL -.LBB91_169: # %_ZN20btAlignedObjectArrayIiED2Ev.exit404 +.LBB91_170: # %_ZN20btAlignedObjectArrayIiED2Ev.exit404 move $s0, $a0 ld.d $s7, $sp, 48 # 8-byte Folded Reload -.LBB91_170: # %_ZN20btAlignedObjectArrayIiED2Ev.exit404 - beqz $s7, .LBB91_172 -.LBB91_171: # %_ZN20btAlignedObjectArrayIiED2Ev.exit404.thread +.LBB91_171: # %_ZN20btAlignedObjectArrayIiED2Ev.exit404 + beqz $s7, .LBB91_173 +.LBB91_172: # %_ZN20btAlignedObjectArrayIiED2Ev.exit404.thread .Ltmp372: # EH_LABEL move $a0, $s7 pcaddu18i $ra, %call36(_Z21btAlignedFreeInternalPv) jirl $ra, $ra, 0 .Ltmp373: # EH_LABEL -.LBB91_172: # %common.resume +.LBB91_173: # %common.resume move $a0, $s0 pcaddu18i $ra, %call36(_Unwind_Resume) jirl $ra, $ra, 0 -.LBB91_173: +.LBB91_174: .Ltmp374: # EH_LABEL pcaddu18i $ra, %call36(__clang_call_terminate) jirl $ra, $ra, 0 diff --git a/results/MultiSource/Benchmarks/Bullet/CMakeFiles/bullet.dir/btSolve2LinearConstraint.s b/results/MultiSource/Benchmarks/Bullet/CMakeFiles/bullet.dir/btSolve2LinearConstraint.s index 0a8ffe9e..4b4a10fa 100644 --- a/results/MultiSource/Benchmarks/Bullet/CMakeFiles/bullet.dir/btSolve2LinearConstraint.s +++ b/results/MultiSource/Benchmarks/Bullet/CMakeFiles/bullet.dir/btSolve2LinearConstraint.s @@ -10,23 +10,21 @@ _ZN24btSolve2LinearConstraint31resolveUnilateralPairConstraintEP11btRigidBodyS1_RK11btMatrix3x3S4_RK9btVector3fS7_S7_S7_S7_fS7_S7_S7_fS7_S7_S7_fS7_RfS8_: # @_ZN24btSolve2LinearConstraint31resolveUnilateralPairConstraintEP11btRigidBodyS1_RK11btMatrix3x3S4_RK9btVector3fS7_S7_S7_S7_fS7_S7_S7_fS7_S7_S7_fS7_RfS8_ .cfi_startproc # %bb.0: - addi.d $sp, $sp, -336 - .cfi_def_cfa_offset 336 - st.d $ra, $sp, 328 # 8-byte Folded Spill - st.d $fp, $sp, 320 # 8-byte Folded Spill - st.d $s0, $sp, 312 # 8-byte Folded Spill - st.d $s1, $sp, 304 # 8-byte Folded Spill - st.d $s2, $sp, 296 # 8-byte Folded Spill - st.d $s3, $sp, 288 # 8-byte Folded Spill - st.d $s4, $sp, 280 # 8-byte Folded Spill - st.d $s5, $sp, 272 # 8-byte Folded Spill - st.d $s6, $sp, 264 # 8-byte Folded Spill - st.d $s7, $sp, 256 # 8-byte Folded Spill - st.d $s8, $sp, 248 # 8-byte Folded Spill - fst.d $fs0, $sp, 240 # 8-byte Folded Spill - fst.d $fs1, $sp, 232 # 8-byte Folded Spill - fst.d $fs2, $sp, 224 # 8-byte Folded Spill - fst.d $fs3, $sp, 216 # 8-byte Folded Spill + addi.d $sp, $sp, -368 + .cfi_def_cfa_offset 368 + st.d $ra, $sp, 360 # 8-byte Folded Spill + st.d $fp, $sp, 352 # 8-byte Folded Spill + st.d $s0, $sp, 344 # 8-byte Folded Spill + st.d $s1, $sp, 336 # 8-byte Folded Spill + st.d $s2, $sp, 328 # 8-byte Folded Spill + st.d $s3, $sp, 320 # 8-byte Folded Spill + st.d $s4, $sp, 312 # 8-byte Folded Spill + st.d $s5, $sp, 304 # 8-byte Folded Spill + st.d $s6, $sp, 296 # 8-byte Folded Spill + st.d $s7, $sp, 288 # 8-byte Folded Spill + st.d $s8, $sp, 280 # 8-byte Folded Spill + fst.d $fs0, $sp, 272 # 8-byte Folded Spill + fst.d $fs1, $sp, 264 # 8-byte Folded Spill .cfi_offset 1, -8 .cfi_offset 22, -16 .cfi_offset 23, -24 @@ -40,11 +38,9 @@ _ZN24btSolve2LinearConstraint31resolveUnilateralPairConstraintEP11btRigidBodyS1_ .cfi_offset 31, -88 .cfi_offset 56, -96 .cfi_offset 57, -104 - .cfi_offset 58, -112 - .cfi_offset 59, -120 - ld.d $t0, $sp, 408 - ld.d $a7, $sp, 416 - ld.d $s1, $sp, 376 + ld.d $t0, $sp, 440 + ld.d $a7, $sp, 448 + ld.d $s1, $sp, 408 st.w $zero, $t0, 0 st.w $zero, $a7, 0 fld.s $fa4, $s1, 4 @@ -61,37 +57,41 @@ _ZN24btSolve2LinearConstraint31resolveUnilateralPairConstraintEP11btRigidBodyS1_ fadd.s $fa4, $fa4, $fa6 fabs.s $fa4, $fa4 fcmp.cle.s $fcc0, $fa5, $fa4 + # kill: def $f1 killed $f1 def $vr1 + # kill: def $f0 killed $f0 def $vr0 bcnez $fcc0, .LBB0_2 # %bb.1: - fmov.s $fs2, $fa3 - fmov.s $fs3, $fa2 + fmov.s $fs0, $fa3 + fmov.s $fs1, $fa2 move $s5, $a5 move $s6, $a4 move $s7, $a3 move $s3, $a2 move $s4, $a1 - ld.d $a1, $sp, 400 + ld.d $a1, $sp, 432 st.d $a1, $sp, 16 # 8-byte Folded Spill - ld.d $s8, $sp, 392 - ld.d $s2, $sp, 384 - ld.d $a4, $sp, 368 - ld.d $fp, $sp, 344 - ld.d $s0, $sp, 336 + ld.d $s8, $sp, 424 + ld.d $s2, $sp, 416 + ld.d $a4, $sp, 400 + ld.d $fp, $sp, 376 + ld.d $s0, $sp, 368 st.d $a0, $sp, 24 # 8-byte Folded Spill - addi.d $a0, $sp, 132 + addi.d $a0, $sp, 176 move $a1, $a3 move $a2, $s6 move $a3, $s0 move $a5, $s1 move $a6, $s5 - fmov.s $fs1, $fa0 + vst $vr0, $sp, 64 # 16-byte Folded Spill + # kill: def $f0 killed $f0 killed $vr0 st.d $a7, $sp, 40 # 8-byte Folded Spill move $a7, $fp - fmov.s $fs0, $fa1 + vst $vr1, $sp, 48 # 16-byte Folded Spill + # kill: def $f1 killed $f1 killed $vr1 st.d $t0, $sp, 32 # 8-byte Folded Spill pcaddu18i $ra, %call36(_ZN15btJacobianEntryC2ERK11btMatrix3x3S2_RK9btVector3S5_S5_S5_fS5_f) jirl $ra, $ra, 0 - addi.d $a0, $sp, 48 + addi.d $a0, $sp, 88 move $a1, $s7 move $a2, $s6 move $a3, $s2 @@ -99,9 +99,11 @@ _ZN24btSolve2LinearConstraint31resolveUnilateralPairConstraintEP11btRigidBodyS1_ ld.d $s6, $sp, 16 # 8-byte Folded Reload move $a5, $s6 move $a6, $s5 - fmov.s $fa0, $fs1 + vld $vr0, $sp, 64 # 16-byte Folded Reload + # kill: def $f0 killed $f0 killed $vr0 move $a7, $fp - fmov.s $fa1, $fs0 + vld $vr1, $sp, 48 # 16-byte Folded Reload + # kill: def $f1 killed $f1 killed $vr1 pcaddu18i $ra, %call36(_ZN15btJacobianEntryC2ERK11btMatrix3x3S2_RK9btVector3S5_S5_S5_fS5_f) jirl $ra, $ra, 0 fld.s $fa3, $s0, 4 @@ -185,94 +187,91 @@ _ZN24btSolve2LinearConstraint31resolveUnilateralPairConstraintEP11btRigidBodyS1_ ld.d $a0, $sp, 24 # 8-byte Folded Reload fld.s $fa2, $a0, 0 fld.s $fa0, $a0, 4 - fadd.s $fa4, $fs1, $fs0 + vld $vr10, $sp, 48 # 16-byte Folded Reload + vld $vr11, $sp, 64 # 16-byte Folded Reload + fadd.s $fa4, $ft3, $ft2 frecip.s $fa4, $fa4 - fmul.s $fa5, $fs3, $fa2 + fmul.s $fa5, $fs1, $fa2 fneg.s $fa6, $fa0 fmul.s $fa0, $fa3, $fa6 fmadd.s $fa0, $fa5, $fa4, $fa0 - fld.s $fa3, $sp, 132 - fld.s $fa5, $sp, 48 - fmul.s $fa2, $fs2, $fa2 + fmul.s $fa2, $fs0, $fa2 fmul.s $fa1, $fa1, $fa6 + fld.s $fa3, $sp, 184 + fld.s $fa5, $sp, 96 + fld.s $fa6, $sp, 232 + fld.s $fa7, $sp, 112 + fld.s $ft0, $sp, 248 + fld.s $ft1, $sp, 128 fmadd.s $fa1, $fa2, $fa4, $fa1 fmul.s $fa2, $fa3, $fa5 - fld.s $fa3, $sp, 136 - fld.s $fa4, $sp, 52 - fld.s $fa5, $sp, 140 - fld.s $fa6, $sp, 56 - fld.s $fa7, $sp, 180 - fld.s $ft0, $sp, 64 - fld.s $ft1, $sp, 184 - fld.s $ft2, $sp, 68 - fmul.s $fa3, $fa3, $fa4 - fmul.s $fa4, $fa5, $fa6 - fmul.s $fa5, $fa7, $ft0 - fmul.s $fa6, $ft1, $ft2 - fld.s $fa7, $sp, 188 - fld.s $ft0, $sp, 72 - fld.s $ft1, $sp, 196 - fld.s $ft2, $sp, 80 - fld.s $ft3, $sp, 200 - fld.s $ft4, $sp, 84 - fld.s $ft5, $sp, 204 - fld.s $ft6, $sp, 88 - fmul.s $fa7, $fa7, $ft0 - fmul.s $ft0, $ft1, $ft2 - fmul.s $ft1, $ft3, $ft4 - fmul.s $ft2, $ft5, $ft6 - fmul.s $ft3, $fs1, $fa2 - fmul.s $ft4, $fs1, $fa3 - fmul.s $ft5, $fs1, $fa4 - fmul.s $fa2, $fs0, $fa2 - fmul.s $fa3, $fs0, $fa3 - fmul.s $fa4, $fs0, $fa4 - fadd.s $fa5, $fa5, $ft0 - fadd.s $fa6, $fa6, $ft1 - fadd.s $fa7, $fa7, $ft2 - fadd.s $fa5, $ft3, $fa5 - fadd.s $fa6, $ft4, $fa6 - fadd.s $fa7, $ft5, $fa7 - fadd.s $fa2, $fa2, $fa5 - fadd.s $fa3, $fa3, $fa6 - fadd.s $fa4, $fa4, $fa7 + fmul.s $fa3, $fa6, $fa7 + fmul.s $fa4, $ft0, $ft1 + fmul.s $fa5, $ft3, $fa2 + fmul.s $fa2, $ft2, $fa2 + ld.d $a0, $sp, 176 + fadd.s $fa3, $fa3, $fa4 + ld.d $a1, $sp, 88 + fadd.s $fa3, $fa5, $fa3 + vinsgr2vr.d $vr4, $a0, 0 + ld.d $a0, $sp, 224 + vinsgr2vr.d $vr5, $a1, 0 + vfmul.s $vr4, $vr4, $vr5 + ld.d $a1, $sp, 104 + vinsgr2vr.d $vr5, $a0, 0 + ld.d $a0, $sp, 240 + ld.d $a2, $sp, 120 + vinsgr2vr.d $vr6, $a1, 0 + vfmul.s $vr5, $vr5, $vr6 + vinsgr2vr.d $vr6, $a0, 0 + vinsgr2vr.d $vr7, $a2, 0 + vfmul.s $vr6, $vr6, $vr7 + vori.b $vr7, $vr11, 0 + vextrins.w $vr7, $vr11, 16 + vfmul.s $vr7, $vr7, $vr4 + vextrins.w $vr10, $vr10, 16 + vfmul.s $vr4, $vr10, $vr4 + vfadd.s $vr5, $vr5, $vr6 + vfadd.s $vr5, $vr7, $vr5 + vfadd.s $vr4, $vr4, $vr5 fadd.s $fa2, $fa2, $fa3 - fld.s $fa3, $sp, 212 - fld.s $fa5, $sp, 128 - fadd.s $fa2, $fa2, $fa4 - fneg.s $fa4, $fa2 - fmul.s $fa2, $fa2, $fa4 - fmadd.s $fa2, $fa3, $fa5, $fa2 + vreplvei.w $vr3, $vr4, 0 + vreplvei.w $vr4, $vr4, 1 + fadd.s $fa3, $fa3, $fa4 + fld.s $fa4, $sp, 256 + fld.s $fa5, $sp, 168 + fadd.s $fa2, $fa3, $fa2 + fneg.s $fa3, $fa2 + fmul.s $fa2, $fa2, $fa3 + fmadd.s $fa2, $fa4, $fa5, $fa2 frecip.s $fa2, $fa2 - fmul.s $fa3, $fa0, $fa3 - fmul.s $fa6, $fa1, $fa4 + fmul.s $fa4, $fa0, $fa4 + fmul.s $fa6, $fa1, $fa3 fmul.s $fa6, $fa6, $fa2 - fmadd.s $fa3, $fa3, $fa2, $fa6 + fmadd.s $fa4, $fa4, $fa2, $fa6 ld.d $a0, $sp, 32 # 8-byte Folded Reload - fst.s $fa3, $a0, 0 + fst.s $fa4, $a0, 0 fmul.s $fa1, $fa1, $fa5 - fmul.s $fa0, $fa0, $fa4 + fmul.s $fa0, $fa0, $fa3 fmul.s $fa0, $fa0, $fa2 fmadd.s $fa0, $fa1, $fa2, $fa0 ld.d $a0, $sp, 40 # 8-byte Folded Reload fst.s $fa0, $a0, 0 .LBB0_2: - fld.d $fs3, $sp, 216 # 8-byte Folded Reload - fld.d $fs2, $sp, 224 # 8-byte Folded Reload - fld.d $fs1, $sp, 232 # 8-byte Folded Reload - fld.d $fs0, $sp, 240 # 8-byte Folded Reload - ld.d $s8, $sp, 248 # 8-byte Folded Reload - ld.d $s7, $sp, 256 # 8-byte Folded Reload - ld.d $s6, $sp, 264 # 8-byte Folded Reload - ld.d $s5, $sp, 272 # 8-byte Folded Reload - ld.d $s4, $sp, 280 # 8-byte Folded Reload - ld.d $s3, $sp, 288 # 8-byte Folded Reload - ld.d $s2, $sp, 296 # 8-byte Folded Reload - ld.d $s1, $sp, 304 # 8-byte Folded Reload - ld.d $s0, $sp, 312 # 8-byte Folded Reload - ld.d $fp, $sp, 320 # 8-byte Folded Reload - ld.d $ra, $sp, 328 # 8-byte Folded Reload - addi.d $sp, $sp, 336 + fld.d $fs1, $sp, 264 # 8-byte Folded Reload + fld.d $fs0, $sp, 272 # 8-byte Folded Reload + ld.d $s8, $sp, 280 # 8-byte Folded Reload + ld.d $s7, $sp, 288 # 8-byte Folded Reload + ld.d $s6, $sp, 296 # 8-byte Folded Reload + ld.d $s5, $sp, 304 # 8-byte Folded Reload + ld.d $s4, $sp, 312 # 8-byte Folded Reload + ld.d $s3, $sp, 320 # 8-byte Folded Reload + ld.d $s2, $sp, 328 # 8-byte Folded Reload + ld.d $s1, $sp, 336 # 8-byte Folded Reload + ld.d $s0, $sp, 344 # 8-byte Folded Reload + ld.d $fp, $sp, 352 # 8-byte Folded Reload + ld.d $ra, $sp, 360 # 8-byte Folded Reload + addi.d $sp, $sp, 368 ret .Lfunc_end0: .size _ZN24btSolve2LinearConstraint31resolveUnilateralPairConstraintEP11btRigidBodyS1_RK11btMatrix3x3S4_RK9btVector3fS7_S7_S7_S7_fS7_S7_S7_fS7_S7_S7_fS7_RfS8_, .Lfunc_end0-_ZN24btSolve2LinearConstraint31resolveUnilateralPairConstraintEP11btRigidBodyS1_RK11btMatrix3x3S4_RK9btVector3fS7_S7_S7_S7_fS7_S7_S7_fS7_S7_S7_fS7_RfS8_ @@ -413,23 +412,21 @@ _ZN15btJacobianEntryC2ERK11btMatrix3x3S2_RK9btVector3S5_S5_S5_fS5_f: # @_ZN15btJ _ZN24btSolve2LinearConstraint30resolveBilateralPairConstraintEP11btRigidBodyS1_RK11btMatrix3x3S4_RK9btVector3fS7_S7_S7_S7_fS7_S7_S7_fS7_S7_S7_fS7_RfS8_: # @_ZN24btSolve2LinearConstraint30resolveBilateralPairConstraintEP11btRigidBodyS1_RK11btMatrix3x3S4_RK9btVector3fS7_S7_S7_S7_fS7_S7_S7_fS7_S7_S7_fS7_RfS8_ .cfi_startproc # %bb.0: - addi.d $sp, $sp, -336 - .cfi_def_cfa_offset 336 - st.d $ra, $sp, 328 # 8-byte Folded Spill - st.d $fp, $sp, 320 # 8-byte Folded Spill - st.d $s0, $sp, 312 # 8-byte Folded Spill - st.d $s1, $sp, 304 # 8-byte Folded Spill - st.d $s2, $sp, 296 # 8-byte Folded Spill - st.d $s3, $sp, 288 # 8-byte Folded Spill - st.d $s4, $sp, 280 # 8-byte Folded Spill - st.d $s5, $sp, 272 # 8-byte Folded Spill - st.d $s6, $sp, 264 # 8-byte Folded Spill - st.d $s7, $sp, 256 # 8-byte Folded Spill - st.d $s8, $sp, 248 # 8-byte Folded Spill - fst.d $fs0, $sp, 240 # 8-byte Folded Spill - fst.d $fs1, $sp, 232 # 8-byte Folded Spill - fst.d $fs2, $sp, 224 # 8-byte Folded Spill - fst.d $fs3, $sp, 216 # 8-byte Folded Spill + addi.d $sp, $sp, -368 + .cfi_def_cfa_offset 368 + st.d $ra, $sp, 360 # 8-byte Folded Spill + st.d $fp, $sp, 352 # 8-byte Folded Spill + st.d $s0, $sp, 344 # 8-byte Folded Spill + st.d $s1, $sp, 336 # 8-byte Folded Spill + st.d $s2, $sp, 328 # 8-byte Folded Spill + st.d $s3, $sp, 320 # 8-byte Folded Spill + st.d $s4, $sp, 312 # 8-byte Folded Spill + st.d $s5, $sp, 304 # 8-byte Folded Spill + st.d $s6, $sp, 296 # 8-byte Folded Spill + st.d $s7, $sp, 288 # 8-byte Folded Spill + st.d $s8, $sp, 280 # 8-byte Folded Spill + fst.d $fs0, $sp, 272 # 8-byte Folded Spill + fst.d $fs1, $sp, 264 # 8-byte Folded Spill .cfi_offset 1, -8 .cfi_offset 22, -16 .cfi_offset 23, -24 @@ -443,11 +440,9 @@ _ZN24btSolve2LinearConstraint30resolveBilateralPairConstraintEP11btRigidBodyS1_R .cfi_offset 31, -88 .cfi_offset 56, -96 .cfi_offset 57, -104 - .cfi_offset 58, -112 - .cfi_offset 59, -120 - ld.d $s2, $sp, 408 - ld.d $s8, $sp, 416 - ld.d $s1, $sp, 376 + ld.d $s2, $sp, 440 + ld.d $s8, $sp, 448 + ld.d $s1, $sp, 408 st.w $zero, $s2, 0 st.w $zero, $s8, 0 fld.s $fa4, $s1, 4 @@ -464,37 +459,41 @@ _ZN24btSolve2LinearConstraint30resolveBilateralPairConstraintEP11btRigidBodyS1_R fadd.s $fa4, $fa4, $fa6 fabs.s $fa4, $fa4 fcmp.cle.s $fcc0, $fa5, $fa4 + # kill: def $f1 killed $f1 def $vr1 + # kill: def $f0 killed $f0 def $vr0 bcnez $fcc0, .LBB2_6 # %bb.1: - fmov.s $fs2, $fa3 - fmov.s $fs3, $fa2 + fmov.s $fs0, $fa3 + fmov.s $fs1, $fa2 move $s5, $a5 move $s6, $a4 move $s7, $a3 move $s3, $a2 move $s4, $a1 - ld.d $a1, $sp, 400 + ld.d $a1, $sp, 432 st.d $a1, $sp, 32 # 8-byte Folded Spill - ld.d $a1, $sp, 392 + ld.d $a1, $sp, 424 st.d $a1, $sp, 24 # 8-byte Folded Spill - ld.d $a1, $sp, 384 + ld.d $a1, $sp, 416 st.d $a1, $sp, 16 # 8-byte Folded Spill - ld.d $a4, $sp, 368 - ld.d $fp, $sp, 344 - ld.d $s0, $sp, 336 + ld.d $a4, $sp, 400 + ld.d $fp, $sp, 376 + ld.d $s0, $sp, 368 st.d $a0, $sp, 40 # 8-byte Folded Spill - addi.d $a0, $sp, 132 + addi.d $a0, $sp, 176 move $a1, $a3 move $a2, $s6 move $a3, $s0 move $a5, $s1 move $a6, $s5 - fmov.s $fs1, $fa0 + vst $vr0, $sp, 64 # 16-byte Folded Spill + # kill: def $f0 killed $f0 killed $vr0 move $a7, $fp - fmov.s $fs0, $fa1 + vst $vr1, $sp, 48 # 16-byte Folded Spill + # kill: def $f1 killed $f1 killed $vr1 pcaddu18i $ra, %call36(_ZN15btJacobianEntryC2ERK11btMatrix3x3S2_RK9btVector3S5_S5_S5_fS5_f) jirl $ra, $ra, 0 - addi.d $a0, $sp, 48 + addi.d $a0, $sp, 88 move $a1, $s7 move $a2, $s6 ld.d $s7, $sp, 16 # 8-byte Folded Reload @@ -503,9 +502,11 @@ _ZN24btSolve2LinearConstraint30resolveBilateralPairConstraintEP11btRigidBodyS1_R ld.d $s6, $sp, 32 # 8-byte Folded Reload move $a5, $s6 move $a6, $s5 - fmov.s $fa0, $fs1 + vld $vr0, $sp, 64 # 16-byte Folded Reload + # kill: def $f0 killed $f0 killed $vr0 move $a7, $fp - fmov.s $fa1, $fs0 + vld $vr1, $sp, 48 # 16-byte Folded Reload + # kill: def $f1 killed $f1 killed $vr1 pcaddu18i $ra, %call36(_ZN15btJacobianEntryC2ERK11btMatrix3x3S2_RK9btVector3S5_S5_S5_fS5_f) jirl $ra, $ra, 0 fld.s $fa3, $s0, 4 @@ -578,68 +579,67 @@ _ZN24btSolve2LinearConstraint30resolveBilateralPairConstraintEP11btRigidBodyS1_R fadd.s $fa5, $ft2, $fa5 fadd.s $fa6, $ft10, $fa6 fsub.s $fa2, $fa2, $fa4 - fsub.s $fa1, $fa1, $fa5 fld.s $fa4, $s6, 4 + fsub.s $fa1, $fa1, $fa5 fsub.s $fa0, $fa0, $fa6 fld.s $fa5, $s6, 0 - fld.s $fa6, $s6, 8 fmul.s $fa1, $fa4, $fa1 - ld.d $a2, $sp, 40 # 8-byte Folded Reload - fld.s $fa4, $a2, 4 + fld.s $fa4, $s6, 8 + ld.d $a0, $sp, 40 # 8-byte Folded Reload + fld.s $fa6, $a0, 4 fmadd.s $fa1, $fa5, $fa2, $fa1 - fmadd.s $fa1, $fa6, $fa0, $fa1 - fld.s $fa2, $a2, 0 - fneg.s $fa4, $fa4 + fld.s $fa2, $a0, 0 + fmadd.s $fa1, $fa4, $fa0, $fa1 + fneg.s $fa4, $fa6 fmul.s $fa0, $fa3, $fa4 - fld.s $fa3, $sp, 132 - fld.s $fa5, $sp, 48 - fmadd.s $fa0, $fs3, $fa2, $fa0 + fmadd.s $fa0, $fs1, $fa2, $fa0 fmul.s $fa1, $fa1, $fa4 - fmadd.s $fa1, $fs2, $fa2, $fa1 - fmul.s $fa2, $fa3, $fa5 - fld.s $fa3, $sp, 136 - fld.s $fa4, $sp, 52 - fld.s $fa5, $sp, 140 - fld.s $fa6, $sp, 56 - fld.s $fa7, $sp, 180 - fld.s $ft0, $sp, 64 - fld.s $ft1, $sp, 184 - fld.s $ft2, $sp, 68 - fmul.s $fa3, $fa3, $fa4 - fmul.s $fa4, $fa5, $fa6 - fmul.s $fa5, $fa7, $ft0 - fmul.s $fa6, $ft1, $ft2 - fld.s $fa7, $sp, 188 - fld.s $ft0, $sp, 72 - fld.s $ft1, $sp, 196 - fld.s $ft2, $sp, 80 - fld.s $ft3, $sp, 200 - fld.s $ft4, $sp, 84 - fld.s $ft5, $sp, 204 - fld.s $ft6, $sp, 88 - fmul.s $fa7, $fa7, $ft0 - fmul.s $ft0, $ft1, $ft2 - fmul.s $ft1, $ft3, $ft4 - fmul.s $ft2, $ft5, $ft6 - fmul.s $ft3, $fs1, $fa2 - fmul.s $ft4, $fs1, $fa3 - fmul.s $ft5, $fs1, $fa4 - fmul.s $fa2, $fs0, $fa2 - fmul.s $fa3, $fs0, $fa3 - fmul.s $fa4, $fs0, $fa4 - fadd.s $fa5, $fa5, $ft0 - fadd.s $fa6, $fa6, $ft1 - fadd.s $fa7, $fa7, $ft2 - fadd.s $fa5, $ft3, $fa5 - fadd.s $fa6, $ft4, $fa6 - fadd.s $fa7, $ft5, $fa7 - fadd.s $fa2, $fa2, $fa5 - fadd.s $fa3, $fa3, $fa6 - fadd.s $fa5, $fa4, $fa7 + fld.s $fa3, $sp, 184 + fld.s $fa4, $sp, 96 + fld.s $fa5, $sp, 232 + fld.s $fa6, $sp, 112 + fld.s $fa7, $sp, 248 + fld.s $ft0, $sp, 128 + fmadd.s $fa1, $fs0, $fa2, $fa1 + fmul.s $fa2, $fa3, $fa4 + fmul.s $fa3, $fa5, $fa6 + fmul.s $fa4, $fa7, $ft0 + vld $vr9, $sp, 64 # 16-byte Folded Reload + fmul.s $fa5, $ft1, $fa2 + vld $vr8, $sp, 48 # 16-byte Folded Reload + fmul.s $fa2, $ft0, $fa2 + ld.d $a0, $sp, 176 + fadd.s $fa3, $fa3, $fa4 + ld.d $a1, $sp, 88 + fadd.s $fa3, $fa5, $fa3 + vinsgr2vr.d $vr4, $a0, 0 + ld.d $a0, $sp, 224 + vinsgr2vr.d $vr5, $a1, 0 + vfmul.s $vr4, $vr4, $vr5 + ld.d $a1, $sp, 104 + vinsgr2vr.d $vr5, $a0, 0 + ld.d $a0, $sp, 240 + ld.d $a2, $sp, 120 + vinsgr2vr.d $vr6, $a1, 0 + vfmul.s $vr5, $vr5, $vr6 + vinsgr2vr.d $vr6, $a0, 0 + vinsgr2vr.d $vr7, $a2, 0 + vfmul.s $vr6, $vr6, $vr7 + vori.b $vr7, $vr9, 0 + vextrins.w $vr7, $vr9, 16 + vfmul.s $vr7, $vr7, $vr4 + vextrins.w $vr8, $vr8, 16 + vfmul.s $vr4, $vr8, $vr4 + vfadd.s $vr5, $vr5, $vr6 + vfadd.s $vr5, $vr7, $vr5 + vfadd.s $vr4, $vr4, $vr5 fadd.s $fa3, $fa2, $fa3 - fld.s $fa2, $sp, 212 - fld.s $fa4, $sp, 128 - fadd.s $fa3, $fa3, $fa5 + vreplvei.w $vr2, $vr4, 0 + vreplvei.w $vr4, $vr4, 1 + fadd.s $fa5, $fa2, $fa4 + fld.s $fa2, $sp, 256 + fld.s $fa4, $sp, 168 + fadd.s $fa3, $fa5, $fa3 fneg.s $fa5, $fa3 fmul.s $fa3, $fa3, $fa5 fmadd.s $fa3, $fa2, $fa4, $fa3 @@ -677,22 +677,20 @@ _ZN24btSolve2LinearConstraint30resolveBilateralPairConstraintEP11btRigidBodyS1_R # %bb.5: # %.sink.split st.w $zero, $s2, 0 .LBB2_6: - fld.d $fs3, $sp, 216 # 8-byte Folded Reload - fld.d $fs2, $sp, 224 # 8-byte Folded Reload - fld.d $fs1, $sp, 232 # 8-byte Folded Reload - fld.d $fs0, $sp, 240 # 8-byte Folded Reload - ld.d $s8, $sp, 248 # 8-byte Folded Reload - ld.d $s7, $sp, 256 # 8-byte Folded Reload - ld.d $s6, $sp, 264 # 8-byte Folded Reload - ld.d $s5, $sp, 272 # 8-byte Folded Reload - ld.d $s4, $sp, 280 # 8-byte Folded Reload - ld.d $s3, $sp, 288 # 8-byte Folded Reload - ld.d $s2, $sp, 296 # 8-byte Folded Reload - ld.d $s1, $sp, 304 # 8-byte Folded Reload - ld.d $s0, $sp, 312 # 8-byte Folded Reload - ld.d $fp, $sp, 320 # 8-byte Folded Reload - ld.d $ra, $sp, 328 # 8-byte Folded Reload - addi.d $sp, $sp, 336 + fld.d $fs1, $sp, 264 # 8-byte Folded Reload + fld.d $fs0, $sp, 272 # 8-byte Folded Reload + ld.d $s8, $sp, 280 # 8-byte Folded Reload + ld.d $s7, $sp, 288 # 8-byte Folded Reload + ld.d $s6, $sp, 296 # 8-byte Folded Reload + ld.d $s5, $sp, 304 # 8-byte Folded Reload + ld.d $s4, $sp, 312 # 8-byte Folded Reload + ld.d $s3, $sp, 320 # 8-byte Folded Reload + ld.d $s2, $sp, 328 # 8-byte Folded Reload + ld.d $s1, $sp, 336 # 8-byte Folded Reload + ld.d $s0, $sp, 344 # 8-byte Folded Reload + ld.d $fp, $sp, 352 # 8-byte Folded Reload + ld.d $ra, $sp, 360 # 8-byte Folded Reload + addi.d $sp, $sp, 368 ret .Lfunc_end2: .size _ZN24btSolve2LinearConstraint30resolveBilateralPairConstraintEP11btRigidBodyS1_RK11btMatrix3x3S4_RK9btVector3fS7_S7_S7_S7_fS7_S7_S7_fS7_S7_S7_fS7_RfS8_, .Lfunc_end2-_ZN24btSolve2LinearConstraint30resolveBilateralPairConstraintEP11btRigidBodyS1_RK11btMatrix3x3S4_RK9btVector3fS7_S7_S7_S7_fS7_S7_S7_fS7_S7_S7_fS7_RfS8_ diff --git a/results/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/CMakeFiles/CLAMR.dir/Bounds.s b/results/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/CMakeFiles/CLAMR.dir/Bounds.s index 54dde076..0294bffc 100644 --- a/results/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/CMakeFiles/CLAMR.dir/Bounds.s +++ b/results/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/CMakeFiles/CLAMR.dir/Bounds.s @@ -55,18 +55,14 @@ Bounds_AddBounds: # @Bounds_AddBounds .type Bounds_AddEpsilon,@function Bounds_AddEpsilon: # @Bounds_AddEpsilon # %bb.0: - fld.d $fa1, $a0, 0 - fld.d $fa2, $a0, 8 - fsub.d $fa1, $fa1, $fa0 - fst.d $fa1, $a0, 0 - fld.d $fa1, $a0, 16 - fsub.d $fa2, $fa2, $fa0 - fld.d $fa3, $a0, 24 - fst.d $fa2, $a0, 8 - fadd.d $fa1, $fa0, $fa1 - fst.d $fa1, $a0, 16 - fadd.d $fa0, $fa0, $fa3 - fst.d $fa0, $a0, 24 + vld $vr1, $a0, 0 + # kill: def $f0_64 killed $f0_64 def $vr0 + vld $vr2, $a0, 16 + vreplvei.d $vr0, $vr0, 0 + vfsub.d $vr1, $vr1, $vr0 + vst $vr1, $a0, 0 + vfadd.d $vr0, $vr0, $vr2 + vst $vr0, $a0, 16 ret .Lfunc_end3: .size Bounds_AddEpsilon, .Lfunc_end3-Bounds_AddEpsilon diff --git a/results/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/CMakeFiles/CLAMR.dir/mesh.s b/results/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/CMakeFiles/CLAMR.dir/mesh.s index 1bf7e467..a239b312 100644 --- a/results/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/CMakeFiles/CLAMR.dir/mesh.s +++ b/results/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/CMakeFiles/CLAMR.dir/mesh.s @@ -2920,6 +2920,9 @@ _ZN4Mesh36compare_mpot_cpu_local_to_cpu_globalEjPiS0_S0_S0_i: # @_ZN4Mesh36compa .section .rodata.cst16,"aM",@progbits,16 .p2align 4, 0x0 # -- Begin function _ZN4MeshC2Eiiiiddiii .LCPI13_0: + .dword 0xbfe0000000000000 # double -0.5 + .dword 0x3fe0000000000000 # double 0.5 +.LCPI13_1: .word 0 # 0x0 .word 1 # 0x1 .word 2 # 0x2 @@ -2934,21 +2937,19 @@ _ZN4MeshC2Eiiiiddiii: # @_ZN4MeshC2Eiiiiddiii .cfi_personality 155, DW.ref.__gxx_personality_v0 .cfi_lsda 27, .Lexception5 # %bb.0: # %.preheader215 - addi.d $sp, $sp, -288 - .cfi_def_cfa_offset 288 - st.d $ra, $sp, 280 # 8-byte Folded Spill - st.d $fp, $sp, 272 # 8-byte Folded Spill - st.d $s0, $sp, 264 # 8-byte Folded Spill - st.d $s1, $sp, 256 # 8-byte Folded Spill - st.d $s2, $sp, 248 # 8-byte Folded Spill - st.d $s3, $sp, 240 # 8-byte Folded Spill - st.d $s4, $sp, 232 # 8-byte Folded Spill - st.d $s5, $sp, 224 # 8-byte Folded Spill - st.d $s6, $sp, 216 # 8-byte Folded Spill - st.d $s7, $sp, 208 # 8-byte Folded Spill - st.d $s8, $sp, 200 # 8-byte Folded Spill - fst.d $fs0, $sp, 192 # 8-byte Folded Spill - fst.d $fs1, $sp, 184 # 8-byte Folded Spill + addi.d $sp, $sp, -304 + .cfi_def_cfa_offset 304 + st.d $ra, $sp, 296 # 8-byte Folded Spill + st.d $fp, $sp, 288 # 8-byte Folded Spill + st.d $s0, $sp, 280 # 8-byte Folded Spill + st.d $s1, $sp, 272 # 8-byte Folded Spill + st.d $s2, $sp, 264 # 8-byte Folded Spill + st.d $s3, $sp, 256 # 8-byte Folded Spill + st.d $s4, $sp, 248 # 8-byte Folded Spill + st.d $s5, $sp, 240 # 8-byte Folded Spill + st.d $s6, $sp, 232 # 8-byte Folded Spill + st.d $s7, $sp, 224 # 8-byte Folded Spill + st.d $s8, $sp, 216 # 8-byte Folded Spill .cfi_offset 1, -8 .cfi_offset 22, -16 .cfi_offset 23, -24 @@ -2960,14 +2961,14 @@ _ZN4MeshC2Eiiiiddiii: # @_ZN4MeshC2Eiiiiddiii .cfi_offset 29, -72 .cfi_offset 30, -80 .cfi_offset 31, -88 - .cfi_offset 56, -96 - .cfi_offset 57, -104 - st.d $a6, $sp, 144 # 8-byte Folded Spill + st.d $a6, $sp, 176 # 8-byte Folded Spill move $s2, $a5 - fmov.d $fs0, $fa1 - fmov.d $fs1, $fa0 + # kill: def $f1_64 killed $f1_64 def $vr1 + vst $vr1, $sp, 144 # 16-byte Folded Spill + # kill: def $f0_64 killed $f0_64 def $vr0 + vst $vr0, $sp, 128 # 16-byte Folded Spill move $s1, $a4 - move $s4, $a3 + move $s7, $a3 move $s5, $a2 move $s8, $a1 move $fp, $a0 @@ -2996,7 +2997,7 @@ _ZN4MeshC2Eiiiiddiii: # @_ZN4MeshC2Eiiiiddiii st.d $a0, $fp, 184 addi.d $a0, $fp, 192 addi.d $s0, $fp, 680 - addi.d $s7, $fp, 856 + addi.d $s4, $fp, 856 st.d $zero, $fp, 1320 vrepli.b $vr0, 0 vst $vr0, $sp, 96 # 16-byte Folded Spill @@ -3010,14 +3011,14 @@ _ZN4MeshC2Eiiiiddiii: # @_ZN4MeshC2Eiiiiddiii vld $vr0, $sp, 96 # 16-byte Folded Reload vst $vr0, $fp, 640 ori $a2, $zero, 96 - st.d $s0, $sp, 136 # 8-byte Folded Spill + st.d $s0, $sp, 168 # 8-byte Folded Spill move $a0, $s0 move $a1, $zero pcaddu18i $ra, %call36(memset) jirl $ra, $ra, 0 ori $a2, $zero, 264 - st.d $s7, $sp, 64 # 8-byte Folded Spill - move $a0, $s7 + st.d $s4, $sp, 64 # 8-byte Folded Spill + move $a0, $s4 move $a1, $zero pcaddu18i $ra, %call36(memset) jirl $ra, $ra, 0 @@ -3032,8 +3033,10 @@ _ZN4MeshC2Eiiiiddiii: # @_ZN4MeshC2Eiiiiddiii move $a1, $zero pcaddu18i $ra, %call36(memset) jirl $ra, $ra, 0 + vld $vr1, $sp, 128 # 16-byte Folded Reload + vld $vr3, $sp, 144 # 16-byte Folded Reload st.w $s1, $fp, 0 - st.w $s4, $fp, 1120 + st.w $s7, $fp, 1120 st.d $zero, $fp, 664 ori $a0, $zero, 1 st.w $a0, $fp, 672 @@ -3042,13 +3045,13 @@ _ZN4MeshC2Eiiiiddiii: # @_ZN4MeshC2Eiiiiddiii st.d $a0, $fp, 620 st.d $zero, $fp, 1160 st.d $zero, $fp, 1176 - ld.d $a0, $sp, 144 # 8-byte Folded Reload + ld.d $a0, $sp, 176 # 8-byte Folded Reload st.w $a0, $fp, 628 st.d $zero, $fp, 632 lu12i.w $a0, 260096 st.w $a0, $fp, 656 - fst.d $fs1, $fp, 1280 - fst.d $fs0, $fp, 1288 + fst.d $fa1, $fp, 1280 + fst.d $fa3, $fp, 1288 st.w $s2, $fp, 1124 st.w $zero, $fp, 1132 st.w $zero, $fp, 1140 @@ -3072,33 +3075,31 @@ _ZN4MeshC2Eiiiiddiii: # @_ZN4MeshC2Eiiiiddiii addi.d $a0, $fp, 952 st.d $a0, $sp, 80 # 8-byte Folded Spill addi.d $a0, $fp, 1024 + pcalau12i $a1, %pc_hi20(.LCPI13_0) + vld $vr0, $a1, %pc_lo12(.LCPI13_0) addi.d $a1, $fp, 1048 st.d $a1, $sp, 72 # 8-byte Folded Spill addi.d $a1, $fp, 1072 st.d $a1, $sp, 88 # 8-byte Folded Spill - vldi $vr0, -800 - fmul.d $fa1, $fs1, $fa0 + vreplvei.d $vr1, $vr1, 0 + vfmul.d $vr1, $vr1, $vr0 movgr2fr.w $fa2, $s8 ffint.d.w $fa2, $fa2 - fmul.d $fa1, $fa1, $fa2 - fst.d $fa1, $fp, 1184 - fmul.d $fa0, $fs0, $fa0 - movgr2fr.w $fa1, $s5 - ffint.d.w $fa1, $fa1 - fmul.d $fa0, $fa0, $fa1 - fst.d $fa0, $fp, 1200 - vldi $vr0, -928 - fmul.d $fa3, $fs1, $fa0 - fmul.d $fa2, $fa3, $fa2 - fst.d $fa2, $fp, 1192 + vreplvei.d $vr3, $vr3, 0 + vfmul.d $vr0, $vr3, $vr0 + movgr2fr.w $fa3, $s5 + ffint.d.w $fa3, $fa3 + vreplvei.d $vr2, $vr2, 0 + vfmul.d $vr1, $vr1, $vr2 + vst $vr1, $fp, 1184 ld.d $a3, $fp, 1032 ld.d $a1, $fp, 1024 - fmul.d $fa0, $fs0, $fa0 - fmul.d $fa0, $fa0, $fa1 - addi.w $s1, $s4, 1 + vreplvei.d $vr1, $vr3, 0 + vfmul.d $vr0, $vr0, $vr1 + addi.w $s1, $s7, 1 sub.d $a2, $a3, $a1 srai.d $a2, $a2, 2 - fst.d $fa0, $fp, 1208 + vst $vr0, $fp, 1200 st.d $s0, $sp, 56 # 8-byte Folded Spill bgeu $a2, $s1, .LBB13_4 # %bb.3: @@ -3340,17 +3341,17 @@ _ZN4MeshC2Eiiiiddiii: # @_ZN4MeshC2Eiiiiddiii move $a3, $zero ld.w $a1, $fp, 1136 st.w $zero, $fp, 1128 - st.d $zero, $sp, 168 + st.d $zero, $sp, 200 ld.w $a2, $fp, 1144 - st.w $a1, $sp, 176 - st.w $a1, $sp, 180 - st.w $zero, $sp, 152 - st.w $a2, $sp, 156 - st.w $zero, $sp, 160 - st.w $a2, $sp, 164 + st.w $a1, $sp, 208 + st.w $a1, $sp, 212 + st.w $zero, $sp, 184 + st.w $a2, $sp, 188 + st.w $zero, $sp, 192 + st.w $a2, $sp, 196 addi.w $a1, $zero, -4 lu52i.d $a1, $a1, 2047 - st.d $a1, $sp, 144 # 8-byte Folded Spill + st.d $a1, $sp, 176 # 8-byte Folded Spill ori $s6, $zero, 1 addi.w $a1, $zero, -1 lu52i.d $s3, $a1, 511 @@ -3358,7 +3359,7 @@ _ZN4MeshC2Eiiiiddiii: # @_ZN4MeshC2Eiiiiddiii .p2align 4, , 16 .LBB13_48: # %._crit_edge229 # in Loop: Header=BB13_49 Depth=1 - ld.d $a3, $sp, 112 # 8-byte Folded Reload + ld.d $a3, $sp, 120 # 8-byte Folded Reload addi.d $a3, $a3, 1 ori $a1, $zero, 4 beq $a3, $a1, .LBB13_73 @@ -3367,32 +3368,32 @@ _ZN4MeshC2Eiiiiddiii: # @_ZN4MeshC2Eiiiiddiii # Child Loop BB13_56 Depth 3 ori $a1, $zero, 2 sll.w $a1, $a1, $a0 - st.d $a3, $sp, 112 # 8-byte Folded Spill + st.d $a3, $sp, 120 # 8-byte Folded Spill blez $a1, .LBB13_48 # %bb.50: # %.lr.ph228 # in Loop: Header=BB13_49 Depth=1 slli.d $a2, $a3, 2 - addi.d $a3, $sp, 152 + addi.d $a3, $sp, 184 ldx.w $a3, $a2, $a3 - addi.d $a4, $sp, 168 + addi.d $a4, $sp, 200 ldx.w $a2, $a2, $a4 addi.d $a4, $a3, 1 - st.d $a4, $sp, 120 # 8-byte Folded Spill + st.d $a4, $sp, 128 # 8-byte Folded Spill mul.d $s0, $a1, $a3 - st.d $a2, $sp, 128 # 8-byte Folded Spill + st.d $a2, $sp, 144 # 8-byte Folded Spill addi.d $s8, $a2, 1 b .LBB13_52 .p2align 4, , 16 .LBB13_51: # %._crit_edge225 # in Loop: Header=BB13_52 Depth=2 addi.w $s0, $s0, 1 - ld.d $a2, $sp, 120 # 8-byte Folded Reload + ld.d $a2, $sp, 128 # 8-byte Folded Reload mul.w $a2, $a1, $a2 bge $s0, $a2, .LBB13_48 .LBB13_52: # Parent Loop BB13_49 Depth=1 # => This Loop Header: Depth=2 # Child Loop BB13_56 Depth 3 - ld.d $a2, $sp, 128 # 8-byte Folded Reload + ld.d $a2, $sp, 144 # 8-byte Folded Reload mul.w $s2, $a2, $a1 ori $a1, $zero, 2 sll.w $a1, $a1, $a0 @@ -3430,10 +3431,10 @@ _ZN4MeshC2Eiiiiddiii: # @_ZN4MeshC2Eiiiiddiii b .LBB13_65 .p2align 4, , 16 .LBB13_58: # in Loop: Header=BB13_56 Depth=3 - ld.d $a1, $sp, 136 # 8-byte Folded Reload + ld.d $a1, $sp, 168 # 8-byte Folded Reload ld.d $s4, $a1, 0 sub.d $s1, $a0, $s4 - ld.d $a0, $sp, 144 # 8-byte Folded Reload + ld.d $a0, $sp, 176 # 8-byte Folded Reload beq $s1, $a0, .LBB13_79 # %bb.59: # %_ZNSt12_Vector_baseIiSaIiEE11_M_allocateEm.exit.i.i # in Loop: Header=BB13_56 Depth=3 @@ -3491,7 +3492,7 @@ _ZN4MeshC2Eiiiiddiii: # @_ZN4MeshC2Eiiiiddiii # %bb.66: # in Loop: Header=BB13_56 Depth=3 ld.d $s4, $fp, 704 sub.d $s1, $a0, $s4 - ld.d $a0, $sp, 144 # 8-byte Folded Reload + ld.d $a0, $sp, 176 # 8-byte Folded Reload beq $s1, $a0, .LBB13_79 # %bb.67: # %_ZNSt12_Vector_baseIiSaIiEE11_M_allocateEm.exit.i.i99 # in Loop: Header=BB13_56 Depth=3 @@ -3549,20 +3550,18 @@ _ZN4MeshC2Eiiiiddiii: # @_ZN4MeshC2Eiiiiddiii vld $vr0, $sp, 96 # 16-byte Folded Reload vst $vr0, $fp, 1368 vst $vr0, $fp, 1384 - fld.d $fs1, $sp, 184 # 8-byte Folded Reload - fld.d $fs0, $sp, 192 # 8-byte Folded Reload - ld.d $s8, $sp, 200 # 8-byte Folded Reload - ld.d $s7, $sp, 208 # 8-byte Folded Reload - ld.d $s6, $sp, 216 # 8-byte Folded Reload - ld.d $s5, $sp, 224 # 8-byte Folded Reload - ld.d $s4, $sp, 232 # 8-byte Folded Reload - ld.d $s3, $sp, 240 # 8-byte Folded Reload - ld.d $s2, $sp, 248 # 8-byte Folded Reload - ld.d $s1, $sp, 256 # 8-byte Folded Reload - ld.d $s0, $sp, 264 # 8-byte Folded Reload - ld.d $fp, $sp, 272 # 8-byte Folded Reload - ld.d $ra, $sp, 280 # 8-byte Folded Reload - addi.d $sp, $sp, 288 + ld.d $s8, $sp, 216 # 8-byte Folded Reload + ld.d $s7, $sp, 224 # 8-byte Folded Reload + ld.d $s6, $sp, 232 # 8-byte Folded Reload + ld.d $s5, $sp, 240 # 8-byte Folded Reload + ld.d $s4, $sp, 248 # 8-byte Folded Reload + ld.d $s3, $sp, 256 # 8-byte Folded Reload + ld.d $s2, $sp, 264 # 8-byte Folded Reload + ld.d $s1, $sp, 272 # 8-byte Folded Reload + ld.d $s0, $sp, 280 # 8-byte Folded Reload + ld.d $fp, $sp, 288 # 8-byte Folded Reload + ld.d $ra, $sp, 296 # 8-byte Folded Reload + addi.d $sp, $sp, 304 ret .LBB13_74: # %vector.scevcheck addi.d $a2, $s1, -1 @@ -3576,8 +3575,8 @@ _ZN4MeshC2Eiiiiddiii: # @_ZN4MeshC2Eiiiiddiii bnez $a2, .LBB13_44 # %bb.76: # %vector.ph bstrpick.d $a1, $s1, 32, 3 - pcalau12i $a2, %pc_hi20(.LCPI13_0) - vld $vr0, $a2, %pc_lo12(.LCPI13_0) + pcalau12i $a2, %pc_hi20(.LCPI13_1) + vld $vr0, $a2, %pc_lo12(.LCPI13_1) slli.d $a1, $a1, 3 addi.d $a2, $a0, 16 vrepli.w $vr1, 2 @@ -3826,7 +3825,7 @@ _ZN4MeshC2Eiiiiddiii: # @_ZN4MeshC2Eiiiiddiii pcaddu18i $ra, %call36(_ZdlPvm) jirl $ra, $ra, 0 .LBB13_143: # %_ZNSt6vectorIiSaIiEED2Ev.exit205 - ld.d $a0, $sp, 136 # 8-byte Folded Reload + ld.d $a0, $sp, 168 # 8-byte Folded Reload ld.d $a0, $a0, 0 beqz $a0, .LBB13_145 # %bb.144: diff --git a/results/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/CMakeFiles/PENNANT.dir/Hydro.s b/results/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/CMakeFiles/PENNANT.dir/Hydro.s index 7b713bdd..c3571c31 100644 --- a/results/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/CMakeFiles/PENNANT.dir/Hydro.s +++ b/results/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/CMakeFiles/PENNANT.dir/Hydro.s @@ -2292,35 +2292,32 @@ _ZN5Hydro7doCycleEd: # @_ZN5Hydro7doCycleEd bge $s1, $s2, .LBB5_37 # %bb.43: # %.cont.preheader.i # in Loop: Header=BB5_38 Depth=1 + ld.d $a5, $fp, 272 ld.d $a3, $fp, 264 - ld.d $a6, $fp, 272 - ld.d $a5, $fp, 280 + ld.d $a4, $fp, 280 sub.d $a2, $s2, $s1 - alsl.d $a4, $s1, $a3, 4 slli.d $a7, $s1, 4 - addi.d $a4, $a4, 8 - alsl.d $a5, $s1, $a5, 3 - alsl.d $a6, $s1, $a6, 4 - addi.d $a6, $a6, 8 + alsl.d $a3, $s1, $a3, 4 + alsl.d $a4, $s1, $a4, 3 + alsl.d $a5, $s1, $a5, 4 + move $a6, $a3 move $t0, $a2 .p2align 4, , 16 .LBB5_44: # %.cont.i # Parent Loop BB5_38 Depth=1 # => This Inner Loop Header: Depth=2 - fld.d $fa0, $a5, 0 + fld.d $fa0, $a4, 0 fcmp.clt.d $fcc0, $fa0, $fs0 - fld.d $fa1, $a6, -8 - fld.d $fa2, $a6, 0 + vld $vr1, $a5, 0 fsel $fa0, $fa0, $fs0, $fcc0 frecip.d $fa0, $fa0 - fmul.d $fa1, $fa1, $fa0 - fmul.d $fa0, $fa2, $fa0 - fst.d $fa1, $a4, -8 - fst.d $fa0, $a4, 0 + vreplvei.d $vr0, $vr0, 0 + vfmul.d $vr0, $vr1, $vr0 + vst $vr0, $a6, 0 addi.d $t0, $t0, -1 - addi.d $a4, $a4, 16 - addi.d $a5, $a5, 8 addi.d $a6, $a6, 16 + addi.d $a4, $a4, 8 + addi.d $a5, $a5, 16 bnez $t0, .LBB5_44 # %bb.45: # %.lr.ph.preheader.i184 # in Loop: Header=BB5_38 Depth=1 @@ -2331,7 +2328,6 @@ _ZN5Hydro7doCycleEd: # @_ZN5Hydro7doCycleEd ld.d $a5, $sp, 48 # 8-byte Folded Reload add.d $a5, $a5, $a7 add.d $a6, $a6, $a7 - add.d $a3, $a3, $a7 add.d $a7, $t0, $a7 .p2align 4, , 16 .LBB5_46: # %.lr.ph.i186 @@ -2959,27 +2955,23 @@ _ZN5Hydro9calcAccelEPK7double2PKdPS0_ii: # @_ZN5Hydro9calcAccelEPK7double2PKdPS0 # %bb.0: bge $a4, $a5, .LBB10_3 # %bb.1: # %.cont.preheader - slli.d $a0, $a4, 4 - addi.d $a6, $a0, 8 - add.d $a0, $a1, $a6 + alsl.d $a0, $a4, $a1, 4 pcalau12i $a1, %pc_hi20(.LCPI10_0) fld.d $fa0, $a1, %pc_lo12(.LCPI10_0) alsl.d $a1, $a4, $a2, 3 - add.d $a2, $a3, $a6 + alsl.d $a2, $a4, $a3, 4 sub.d $a3, $a5, $a4 .p2align 4, , 16 .LBB10_2: # %.cont # =>This Inner Loop Header: Depth=1 fld.d $fa1, $a1, 0 fcmp.clt.d $fcc0, $fa1, $fa0 - fld.d $fa2, $a0, -8 - fld.d $fa3, $a0, 0 + vld $vr2, $a0, 0 fsel $fa1, $fa1, $fa0, $fcc0 frecip.d $fa1, $fa1 - fmul.d $fa2, $fa2, $fa1 - fmul.d $fa1, $fa3, $fa1 - fst.d $fa2, $a2, -8 - fst.d $fa1, $a2, 0 + vreplvei.d $vr1, $vr1, 0 + vfmul.d $vr1, $vr2, $vr1 + vst $vr1, $a2, 0 addi.d $a0, $a0, 16 addi.d $a1, $a1, 8 addi.d $a3, $a3, -1 diff --git a/results/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/CMakeFiles/PENNANT.dir/HydroBC.s b/results/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/CMakeFiles/PENNANT.dir/HydroBC.s index f30c7d6d..f614bf20 100644 --- a/results/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/CMakeFiles/PENNANT.dir/HydroBC.s +++ b/results/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/CMakeFiles/PENNANT.dir/HydroBC.s @@ -198,33 +198,29 @@ _ZN7HydroBC12applyFixedBCEP7double2S1_ii: # @_ZN7HydroBC12applyFixedBCEP7double2 .p2align 4, , 16 .LBB2_2: # =>This Inner Loop Header: Depth=1 ld.w $a4, $a5, 0 - alsl.d $a6, $a4, $a1, 4 - slli.d $a7, $a4, 4 - fldx.d $fa0, $a1, $a7 - fld.d $fa1, $a6, 8 - fld.d $fa2, $a0, 24 - fld.d $fa3, $a0, 16 - fmul.d $fa4, $fa1, $fa2 - fmadd.d $fa4, $fa0, $fa3, $fa4 - fmul.d $fa3, $fa3, $fa4 - fmul.d $fa2, $fa2, $fa4 - fsub.d $fa0, $fa0, $fa3 - fsub.d $fa1, $fa1, $fa2 - fstx.d $fa0, $a1, $a7 - fst.d $fa1, $a6, 8 - alsl.d $a4, $a4, $a2, 4 - fldx.d $fa0, $a2, $a7 - fld.d $fa1, $a4, 8 - fld.d $fa2, $a0, 24 - fld.d $fa3, $a0, 16 - fmul.d $fa4, $fa1, $fa2 - fmadd.d $fa4, $fa0, $fa3, $fa4 - fmul.d $fa3, $fa3, $fa4 - fmul.d $fa2, $fa2, $fa4 - fsub.d $fa0, $fa0, $fa3 - fsub.d $fa1, $fa1, $fa2 - fstx.d $fa0, $a2, $a7 - fst.d $fa1, $a4, 8 + slli.d $a4, $a4, 4 + vldx $vr0, $a1, $a4 + vld $vr1, $a0, 16 + vfmul.d $vr2, $vr0, $vr1 + vreplvei.d $vr2, $vr2, 1 + vreplvei.d $vr3, $vr0, 0 + vreplvei.d $vr4, $vr1, 0 + fmadd.d $fa2, $fa3, $fa4, $fa2 + vreplvei.d $vr2, $vr2, 0 + vfmul.d $vr1, $vr1, $vr2 + vfsub.d $vr0, $vr0, $vr1 + vstx $vr0, $a1, $a4 + vldx $vr0, $a2, $a4 + vld $vr1, $a0, 16 + vfmul.d $vr2, $vr0, $vr1 + vreplvei.d $vr2, $vr2, 1 + vreplvei.d $vr3, $vr0, 0 + vreplvei.d $vr4, $vr1, 0 + fmadd.d $fa2, $fa3, $fa4, $fa2 + vreplvei.d $vr2, $vr2, 0 + vfmul.d $vr1, $vr1, $vr2 + vfsub.d $vr0, $vr0, $vr1 + vstx $vr0, $a2, $a4 addi.d $a3, $a3, -1 addi.d $a5, $a5, 4 bnez $a3, .LBB2_2 diff --git a/results/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/CMakeFiles/PENNANT.dir/Mesh.s b/results/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/CMakeFiles/PENNANT.dir/Mesh.s index 66171be8..d765979b 100644 --- a/results/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/CMakeFiles/PENNANT.dir/Mesh.s +++ b/results/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/CMakeFiles/PENNANT.dir/Mesh.s @@ -3656,40 +3656,41 @@ _ZN4Mesh8calcCtrsEPK7double2PS0_S3_ii: # @_ZN4Mesh8calcCtrsEPK7double2PS0_S3_ii .cfi_offset 29, -72 .cfi_offset 30, -80 .cfi_offset 31, -88 - move $fp, $a0 - ld.d $s0, $a0, 104 + move $s0, $a0 + ld.d $s8, $a0, 104 ld.w $a0, $a0, 72 move $s4, $a5 move $s5, $a4 slli.d $a4, $a4, 2 - ldx.w $s7, $s0, $a4 + ldx.w $s7, $s8, $a4 slt $a0, $a5, $a0 - alsl.d $a4, $a5, $s0, 2 - addi.d $a5, $fp, 68 + alsl.d $a4, $a5, $s8, 2 + addi.d $a5, $s0, 68 maskeqz $a4, $a4, $a0 masknez $a0, $a5, $a0 or $a0, $a4, $a0 - ld.w $s8, $a0, 0 + ld.w $t0, $a0, 0 move $s1, $a3 move $s2, $a2 move $s3, $a1 - alsl.d $a0, $s7, $a3, 4 - sub.d $s6, $s8, $s7 - beq $s7, $s8, .LBB9_2 + alsl.d $fp, $s7, $a3, 4 + sub.d $s6, $t0, $s7 + beq $s7, $t0, .LBB9_2 # %bb.1: # %.lr.ph.i.i.i.preheader slli.d $a2, $s6, 4 + move $a0, $fp move $a1, $zero - st.d $a0, $sp, 16 # 8-byte Folded Spill + st.d $t0, $sp, 16 # 8-byte Folded Spill pcaddu18i $ra, %call36(memset) jirl $ra, $ra, 0 - ld.d $a0, $sp, 16 # 8-byte Folded Reload + ld.d $t0, $sp, 16 # 8-byte Folded Reload .LBB9_2: # %_ZSt4fillIP7double2S0_EvT_S2_RKT0_.exit bge $s5, $s4, .LBB9_5 # %bb.3: # %.lr.ph - ld.d $a1, $fp, 88 - ld.d $a2, $fp, 96 - ld.d $a3, $fp, 112 - alsl.d $t0, $s5, $s0, 2 + ld.d $a1, $s0, 88 + ld.d $a2, $s0, 96 + ld.d $a3, $s0, 112 + alsl.d $a0, $s5, $s8, 2 alsl.d $a1, $s5, $a1, 2 alsl.d $a2, $s5, $a2, 2 alsl.d $a3, $s5, $a3, 2 @@ -3705,7 +3706,7 @@ _ZN4Mesh8calcCtrsEPK7double2PS0_S3_ii: # @_ZN4Mesh8calcCtrsEPK7double2PS0_S3_ii slli.d $a6, $a6, 4 vldx $vr1, $s3, $a5 vldx $vr2, $s3, $a6 - ld.w $a6, $t0, 0 + ld.w $a6, $a0, 0 slli.d $a7, $a7, 4 vfadd.d $vr1, $vr1, $vr2 vfmul.d $vr1, $vr1, $vr0 @@ -3719,28 +3720,25 @@ _ZN4Mesh8calcCtrsEPK7double2PS0_S3_ii: # @_ZN4Mesh8calcCtrsEPK7double2PS0_S3_ii addi.d $a2, $a2, 4 addi.d $a3, $a3, 4 addi.d $a4, $a4, -1 - addi.d $t0, $t0, 4 + addi.d $a0, $a0, 4 bnez $a4, .LBB9_4 .LBB9_5: # %.preheader - bge $s7, $s8, .LBB9_8 + bge $s7, $t0, .LBB9_8 # %bb.6: # %.lr.ph49 - ld.d $a1, $fp, 232 - alsl.d $a3, $s7, $a1, 2 - addi.d $a1, $a0, 8 + ld.d $a0, $s0, 232 + alsl.d $a0, $s7, $a0, 2 .p2align 4, , 16 .LBB9_7: # =>This Inner Loop Header: Depth=1 - ld.w $a2, $a3, 0 - fld.d $fa0, $a1, -8 - movgr2fr.w $fa1, $a2 - fld.d $fa2, $a1, 0 + ld.w $a1, $a0, 0 + vld $vr0, $fp, 0 + movgr2fr.w $fa1, $a1 ffint.d.w $fa1, $fa1 - fdiv.d $fa0, $fa0, $fa1 - fst.d $fa0, $a1, -8 - fdiv.d $fa0, $fa2, $fa1 - fst.d $fa0, $a1, 0 - addi.d $a3, $a3, 4 + vreplvei.d $vr1, $vr1, 0 + vfdiv.d $vr0, $vr0, $vr1 + vst $vr0, $fp, 0 + addi.d $a0, $a0, 4 addi.d $s6, $s6, -1 - addi.d $a1, $a1, 16 + addi.d $fp, $fp, 16 bnez $s6, .LBB9_7 .LBB9_8: # %._crit_edge ld.d $s8, $sp, 24 # 8-byte Folded Reload diff --git a/results/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/CMakeFiles/PENNANT.dir/PolyGas.s b/results/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/CMakeFiles/PENNANT.dir/PolyGas.s index 1a16396e..66a5f447 100644 --- a/results/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/CMakeFiles/PENNANT.dir/PolyGas.s +++ b/results/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/CMakeFiles/PENNANT.dir/PolyGas.s @@ -532,23 +532,19 @@ _ZN7PolyGas9calcForceEPKdPK7double2PS2_ii: # @_ZN7PolyGas9calcForceEPKdPK7double ld.d $a0, $a0, 0 ld.d $a0, $a0, 104 alsl.d $a0, $a4, $a0, 2 - slli.d $a6, $a4, 4 - addi.d $a6, $a6, 8 - add.d $a2, $a2, $a6 - add.d $a3, $a3, $a6 + alsl.d $a2, $a4, $a2, 4 + alsl.d $a3, $a4, $a3, 4 sub.d $a4, $a5, $a4 .p2align 4, , 16 .LBB3_2: # =>This Inner Loop Header: Depth=1 ld.w $a5, $a0, 0 slli.d $a5, $a5, 3 fldx.d $fa0, $a1, $a5 - fld.d $fa1, $a2, -8 - fld.d $fa2, $a2, 0 + vld $vr1, $a2, 0 fneg.d $fa0, $fa0 - fmul.d $fa1, $fa1, $fa0 - fmul.d $fa0, $fa2, $fa0 - fst.d $fa1, $a3, -8 - fst.d $fa0, $a3, 0 + vreplvei.d $vr0, $vr0, 0 + vfmul.d $vr0, $vr1, $vr0 + vst $vr0, $a3, 0 addi.d $a0, $a0, 4 addi.d $a2, $a2, 16 addi.d $a4, $a4, -1 diff --git a/results/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/CMakeFiles/PENNANT.dir/QCS.s b/results/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/CMakeFiles/PENNANT.dir/QCS.s index c15e0ba7..b4bda846 100644 --- a/results/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/CMakeFiles/PENNANT.dir/QCS.s +++ b/results/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/CMakeFiles/PENNANT.dir/QCS.s @@ -307,27 +307,27 @@ _ZN3QCS9calcForceEP7double2ii: # @_ZN3QCS9calcForceEP7double2ii _ZN3QCS12setCornerDivEPdS0_S0_S0_S0_ii: # @_ZN3QCS12setCornerDivEPdS0_S0_S0_S0_ii .cfi_startproc # %bb.0: - addi.d $sp, $sp, -272 - .cfi_def_cfa_offset 272 - st.d $ra, $sp, 264 # 8-byte Folded Spill - st.d $fp, $sp, 256 # 8-byte Folded Spill - st.d $s0, $sp, 248 # 8-byte Folded Spill - st.d $s1, $sp, 240 # 8-byte Folded Spill - st.d $s2, $sp, 232 # 8-byte Folded Spill - st.d $s3, $sp, 224 # 8-byte Folded Spill - st.d $s4, $sp, 216 # 8-byte Folded Spill - st.d $s5, $sp, 208 # 8-byte Folded Spill - st.d $s6, $sp, 200 # 8-byte Folded Spill - st.d $s7, $sp, 192 # 8-byte Folded Spill - st.d $s8, $sp, 184 # 8-byte Folded Spill - fst.d $fs0, $sp, 176 # 8-byte Folded Spill - fst.d $fs1, $sp, 168 # 8-byte Folded Spill - fst.d $fs2, $sp, 160 # 8-byte Folded Spill - fst.d $fs3, $sp, 152 # 8-byte Folded Spill - fst.d $fs4, $sp, 144 # 8-byte Folded Spill - fst.d $fs5, $sp, 136 # 8-byte Folded Spill - fst.d $fs6, $sp, 128 # 8-byte Folded Spill - fst.d $fs7, $sp, 120 # 8-byte Folded Spill + addi.d $sp, $sp, -368 + .cfi_def_cfa_offset 368 + st.d $ra, $sp, 360 # 8-byte Folded Spill + st.d $fp, $sp, 352 # 8-byte Folded Spill + st.d $s0, $sp, 344 # 8-byte Folded Spill + st.d $s1, $sp, 336 # 8-byte Folded Spill + st.d $s2, $sp, 328 # 8-byte Folded Spill + st.d $s3, $sp, 320 # 8-byte Folded Spill + st.d $s4, $sp, 312 # 8-byte Folded Spill + st.d $s5, $sp, 304 # 8-byte Folded Spill + st.d $s6, $sp, 296 # 8-byte Folded Spill + st.d $s7, $sp, 288 # 8-byte Folded Spill + st.d $s8, $sp, 280 # 8-byte Folded Spill + fst.d $fs0, $sp, 272 # 8-byte Folded Spill + fst.d $fs1, $sp, 264 # 8-byte Folded Spill + fst.d $fs2, $sp, 256 # 8-byte Folded Spill + fst.d $fs3, $sp, 248 # 8-byte Folded Spill + fst.d $fs4, $sp, 240 # 8-byte Folded Spill + fst.d $fs5, $sp, 232 # 8-byte Folded Spill + fst.d $fs6, $sp, 224 # 8-byte Folded Spill + fst.d $fs7, $sp, 216 # 8-byte Folded Spill .cfi_offset 1, -8 .cfi_offset 22, -16 .cfi_offset 23, -24 @@ -358,15 +358,15 @@ _ZN3QCS12setCornerDivEPdS0_S0_S0_S0_ii: # @_ZN3QCS12setCornerDivEPdS0_S0_S0_S0_i ld.d $s8, $a0, 248 ld.d $a2, $s4, 104 ld.d $a0, $s4, 264 - st.d $a0, $sp, 88 # 8-byte Folded Spill + st.d $a0, $sp, 152 # 8-byte Folded Spill ld.d $a0, $s4, 272 - st.d $a0, $sp, 80 # 8-byte Folded Spill - st.d $a6, $sp, 112 # 8-byte Folded Spill + st.d $a0, $sp, 144 # 8-byte Folded Spill + st.d $a6, $sp, 192 # 8-byte Folded Spill slli.d $a0, $a6, 2 ldx.w $s6, $a2, $a0 slt $a0, $a7, $a1 - st.d $a2, $sp, 96 # 8-byte Folded Spill - st.d $a7, $sp, 104 # 8-byte Folded Spill + st.d $a2, $sp, 160 # 8-byte Folded Spill + st.d $a7, $sp, 176 # 8-byte Folded Spill alsl.d $a1, $a7, $a2, 2 addi.d $a2, $s4, 68 maskeqz $a1, $a1, $a0 @@ -374,33 +374,33 @@ _ZN3QCS12setCornerDivEPdS0_S0_S0_S0_ii: # @_ZN3QCS12setCornerDivEPdS0_S0_S0_S0_i or $a0, $a1, $a0 ld.w $s7, $a0, 0 ld.d $a0, $s4, 280 - st.d $a0, $sp, 72 # 8-byte Folded Spill + st.d $a0, $sp, 136 # 8-byte Folded Spill ld.d $a0, $s4, 376 - st.d $a0, $sp, 64 # 8-byte Folded Spill + st.d $a0, $sp, 128 # 8-byte Folded Spill ld.d $a0, $s4, 232 - st.d $a0, $sp, 48 # 8-byte Folded Spill + st.d $a0, $sp, 168 # 8-byte Folded Spill sub.d $s5, $s7, $s6 addi.w $a0, $s5, 0 slli.d $a0, $a0, 4 - st.d $a0, $sp, 40 # 8-byte Folded Spill + st.d $a0, $sp, 112 # 8-byte Folded Spill pcaddu18i $ra, %call36(malloc) jirl $ra, $ra, 0 move $t5, $s6 beq $s7, $s6, .LBB3_2 # %bb.1: # %.lr.ph.i.i.i.preheader - st.d $a0, $sp, 56 # 8-byte Folded Spill - ld.d $a0, $sp, 56 # 8-byte Folded Reload + st.d $a0, $sp, 120 # 8-byte Folded Spill + ld.d $a0, $sp, 120 # 8-byte Folded Reload move $a1, $zero - ld.d $a2, $sp, 40 # 8-byte Folded Reload + ld.d $a2, $sp, 112 # 8-byte Folded Reload move $s6, $t5 pcaddu18i $ra, %call36(memset) jirl $ra, $ra, 0 move $t5, $s6 - ld.d $a0, $sp, 56 # 8-byte Folded Reload + ld.d $a0, $sp, 120 # 8-byte Folded Reload .LBB3_2: # %_ZSt4fillIP7double2S0_EvT_S2_RKT0_.exit - ld.d $t2, $sp, 96 # 8-byte Folded Reload - ld.d $a6, $sp, 112 # 8-byte Folded Reload - ld.d $a7, $sp, 104 # 8-byte Folded Reload + ld.d $t2, $sp, 160 # 8-byte Folded Reload + ld.d $a6, $sp, 192 # 8-byte Folded Reload + ld.d $a7, $sp, 176 # 8-byte Folded Reload bge $a6, $a7, .LBB3_5 # %bb.3: # %.lr.ph ld.d $a2, $s4, 88 @@ -423,27 +423,25 @@ _ZN3QCS12setCornerDivEPdS0_S0_S0_S0_ii: # @_ZN3QCS12setCornerDivEPdS0_S0_S0_S0_i addi.d $a1, $a1, 4 bnez $a3, .LBB3_4 .LBB3_5: # %.preheader359 - ld.d $t3, $sp, 88 # 8-byte Folded Reload - ld.d $t4, $sp, 80 # 8-byte Folded Reload - ld.d $t6, $sp, 72 # 8-byte Folded Reload - ld.d $t7, $sp, 64 # 8-byte Folded Reload + ld.d $t3, $sp, 152 # 8-byte Folded Reload + ld.d $t4, $sp, 144 # 8-byte Folded Reload + ld.d $t6, $sp, 136 # 8-byte Folded Reload + ld.d $t7, $sp, 128 # 8-byte Folded Reload bge $t5, $s7, .LBB3_8 # %bb.6: # %.lr.ph362.preheader - ld.d $a1, $sp, 48 # 8-byte Folded Reload + ld.d $a1, $sp, 168 # 8-byte Folded Reload alsl.d $a1, $t5, $a1, 2 - addi.d $a2, $a0, 8 + move $a2, $a0 .p2align 4, , 16 .LBB3_7: # %.lr.ph362 # =>This Inner Loop Header: Depth=1 ld.w $a3, $a1, 0 - fld.d $fa0, $a2, -8 + vld $vr0, $a2, 0 movgr2fr.w $fa1, $a3 - fld.d $fa2, $a2, 0 ffint.d.w $fa1, $fa1 - fdiv.d $fa0, $fa0, $fa1 - fst.d $fa0, $a2, -8 - fdiv.d $fa0, $fa2, $fa1 - fst.d $fa0, $a2, 0 + vreplvei.d $vr1, $vr1, 0 + vfdiv.d $vr0, $vr0, $vr1 + vst $vr0, $a2, 0 addi.d $a1, $a1, 4 addi.d $s5, $s5, -1 addi.d $a2, $a2, 16 @@ -456,15 +454,18 @@ _ZN3QCS12setCornerDivEPdS0_S0_S0_S0_ii: # @_ZN3QCS12setCornerDivEPdS0_S0_S0_S0_i ld.d $t8, $s4, 96 ld.d $ra, $s4, 112 ld.d $a2, $s4, 88 - st.d $a2, $sp, 104 # 8-byte Folded Spill + st.d $a2, $sp, 168 # 8-byte Folded Spill alsl.d $s4, $a6, $a1, 2 alsl.d $s7, $a6, $t8, 2 alsl.d $s5, $a6, $ra, 2 - vldi $vr23, -928 + lu52i.d $a1, $zero, 1022 + vreplgr2vr.d $vr23, $a1 + vldi $vr28, -928 pcalau12i $a1, %pc_hi20(.LCPI3_0) - fld.d $ft12, $a1, %pc_lo12(.LCPI3_0) - movgr2fr.d $ft13, $zero - vldi $vr22, -944 + fld.d $fs0, $a1, %pc_lo12(.LCPI3_0) + movgr2fr.d $fs1, $zero + vldi $vr29, -944 + vldi $vr30, -1008 .p2align 4, , 16 .LBB3_10: # =>This Inner Loop Header: Depth=1 ld.w $a1, $s4, 0 @@ -472,172 +473,167 @@ _ZN3QCS12setCornerDivEPdS0_S0_S0_S0_ii: # @_ZN3QCS12setCornerDivEPdS0_S0_S0_S0_i ldx.w $a2, $t2, $a1 sub.w $a3, $a2, $t5 ldx.w $a4, $t8, $a1 - ld.d $a5, $sp, 104 # 8-byte Folded Reload + ld.d $a5, $sp, 168 # 8-byte Folded Reload ldx.w $a5, $a5, $a1 ld.w $a6, $s7, 0 ldx.w $a1, $ra, $a1 ld.w $a7, $s5, 0 alsl.d $t0, $a4, $s8, 4 slli.d $t1, $a4, 4 - fldx.d $fs2, $s8, $t1 - fld.d $fs3, $t0, 8 + fldx.d $fa0, $s8, $t1 + fld.d $fa1, $t0, 8 alsl.d $a4, $a4, $t3, 4 - fldx.d $fa0, $t3, $t1 - fld.d $fa1, $a4, 8 - alsl.d $a4, $a6, $s8, 4 - slli.d $a6, $a6, 4 - fldx.d $ft1, $s8, $a6 - fld.d $ft2, $a4, 8 - alsl.d $a4, $a7, $t4, 4 - slli.d $a6, $a7, 4 - fldx.d $fa3, $t4, $a6 + fldx.d $fa3, $t3, $t1 fld.d $fa4, $a4, 8 - alsl.d $a4, $a3, $a0, 4 - alsl.d $a6, $a2, $t6, 4 - slli.d $a2, $a2, 4 - fldx.d $fa5, $t6, $a2 + slli.d $a4, $a6, 4 + alsl.d $a6, $a7, $t4, 4 + slli.d $t0, $a7, 4 + fldx.d $fa5, $t4, $t0 fld.d $fa6, $a6, 8 - alsl.d $a2, $a5, $s8, 4 - slli.d $a5, $a5, 4 - fldx.d $ft5, $s8, $a5 - fld.d $ft6, $a2, 8 + alsl.d $a6, $a3, $a0, 4 + alsl.d $t0, $a2, $t6, 4 + slli.d $a2, $a2, 4 + fldx.d $fa7, $t6, $a2 + fld.d $ft0, $t0, 8 + slli.d $a2, $a5, 4 + vldx $vr13, $s8, $a4 + vldx $vr15, $s8, $a2 alsl.d $a2, $a1, $t4, 4 - slli.d $a5, $a1, 4 - fldx.d $fa7, $t4, $a5 - fld.d $ft0, $a2, 8 - fsub.d $ft4, $fa5, $fa0 - fsub.d $ft3, $fa6, $fa1 - fsub.d $fa2, $fa7, $fa3 - fsub.d $ft7, $ft0, $fa4 + slli.d $a4, $a1, 4 + fldx.d $ft1, $t4, $a4 + fld.d $ft2, $a2, 8 + fsub.d $ft4, $fa7, $fa3 + fsub.d $ft3, $ft0, $fa4 + fsub.d $fa2, $ft1, $fa5 + fsub.d $ft6, $ft2, $fa6 fneg.d $ft8, $fa2 fmul.d $fa2, $ft3, $ft8 - fmadd.d $fa2, $ft4, $ft7, $fa2 - fmul.d $fa2, $fa2, $ft15 + fmadd.d $fa2, $ft4, $ft6, $fa2 + fmul.d $fa2, $fa2, $fs4 fst.d $fa2, $s3, 0 slli.d $a1, $a1, 3 fldx.d $ft9, $t7, $a1 slli.d $a1, $a7, 3 fldx.d $ft10, $t7, $a1 slli.d $a1, $a3, 4 - fldx.d $fs4, $a0, $a1 - fld.d $fs5, $a4, 8 + fldx.d $ft12, $a0, $a1 + fld.d $fs7, $a6, 8 fcmp.clt.d $fcc0, $ft10, $ft9 - fsel $ft11, $ft9, $ft10, $fcc0 - fst.d $ft11, $sp, 112 # 8-byte Folded Spill - fcmp.clt.d $fcc0, $ft11, $ft12 - fmov.d $ft11, $ft13 + fsel $fs2, $ft9, $ft10, $fcc0 + fcmp.clt.d $fcc0, $fs2, $fs0 + fmov.d $ft11, $fs1 bcnez $fcc0, .LBB3_12 # %bb.11: # in Loop: Header=BB3_10 Depth=1 - fsub.d $ft11, $fa4, $fa1 - fmov.d $fs0, $ft12 - fsub.d $ft12, $fa3, $fa0 - fmov.d $fs1, $ft13 - fsub.d $ft13, $ft0, $fa1 - fsub.d $ft14, $fa7, $fa0 + fsub.d $ft11, $fa6, $fa4 + vori.b $vr27, $vr20, 0 + fsub.d $ft12, $fa5, $fa3 + fsub.d $ft13, $ft2, $fa4 + fsub.d $ft14, $ft1, $fa3 fmul.d $ft11, $ft11, $ft13 - fmov.d $ft13, $fs1 fmadd.d $ft11, $ft14, $ft12, $ft11 - vldi $vr22, -944 - fmov.d $ft12, $fs0 - vldi $vr24, -1008 - fmul.d $ft11, $ft11, $fs0 + vori.b $vr20, $vr27, 0 + fmul.d $ft11, $ft11, $fs6 fmul.d $ft9, $ft9, $ft10 fdiv.d $ft11, $ft11, $ft9 .LBB3_12: # in Loop: Header=BB3_10 Depth=1 - fadd.d $ft1, $fs2, $ft1 - fadd.d $ft2, $fs3, $ft2 - fmul.d $fs7, $ft1, $ft15 - fmul.d $fs0, $ft2, $ft15 - fadd.d $ft1, $fs2, $ft5 - fadd.d $ft2, $fs3, $ft6 - fmul.d $fs1, $ft1, $ft15 - fmul.d $fs6, $ft2, $ft15 + vreplvei.d $vr21, $vr0, 0 + vpackev.d $vr17, $vr13, $vr15 + vfadd.d $vr17, $vr21, $vr17 + vreplvei.d $vr18, $vr1, 0 + vpackod.d $vr13, $vr13, $vr15 + vst $vr18, $sp, 192 # 16-byte Folded Spill + vfadd.d $vr13, $vr18, $vr13 + vfmul.d $vr17, $vr17, $vr23 + vfmul.d $vr22, $vr13, $vr23 fst.d $ft11, $fp, 0 - fsub.d $ft1, $fs4, $fs2 - fsub.d $ft2, $fs5, $fs3 - fmul.d $ft2, $ft2, $ft8 - fmadd.d $ft1, $ft1, $ft7, $ft2 - fsub.d $ft2, $fs1, $fs7 - fsub.d $ft5, $fs6, $fs0 + fsub.d $ft5, $ft12, $fa0 + fsub.d $ft7, $fs7, $fa1 + fmul.d $ft7, $ft7, $ft8 + fmadd.d $ft5, $ft5, $ft6, $ft7 + vreplvei.d $vr14, $vr17, 0 + vst $vr17, $sp, 176 # 16-byte Folded Spill + vreplvei.d $vr15, $vr17, 1 + fsub.d $ft8, $ft6, $ft7 + vreplvei.d $vr17, $vr22, 0 + vreplvei.d $vr18, $vr22, 1 + fsub.d $ft11, $ft9, $ft10 fneg.d $ft4, $ft4 - fmul.d $ft4, $ft5, $ft4 - fmadd.d $ft2, $ft2, $ft3, $ft4 - fsub.d $ft1, $ft1, $ft2 - fadd.d $ft2, $fa2, $fa2 - fdiv.d $ft3, $ft1, $ft2 - fadd.d $ft1, $fa3, $fa5 - fadd.d $ft2, $fa4, $fa6 - fsub.d $ft1, $ft1, $fa0 - fsub.d $ft2, $ft2, $fa1 - fsub.d $ft1, $ft1, $fa7 - fsub.d $ft2, $ft2, $ft0 - fmul.d $ft1, $ft1, $ft15 - fmul.d $ft2, $ft2, $ft15 - fadd.d $fa5, $fa5, $fa7 - fadd.d $fa6, $fa6, $ft0 - fsub.d $fa0, $fa5, $fa0 - fsub.d $fa1, $fa6, $fa1 - fsub.d $fa0, $fa0, $fa3 - fsub.d $fa1, $fa1, $fa4 - fmul.d $fa0, $fa0, $ft15 - fmul.d $fa1, $fa1, $ft15 - fmul.d $fa3, $ft2, $ft2 - fmadd.d $fa3, $ft1, $ft1, $fa3 - fsqrt.d $fa3, $fa3 - fmul.d $fa4, $fa1, $fa1 - fmadd.d $fa4, $fa0, $fa0, $fa4 - fsqrt.d $fa4, $fa4 - fadd.d $fa5, $fs2, $fs7 - fadd.d $fa6, $fs3, $fs0 - fadd.d $fa5, $fa5, $fs4 - fadd.d $fa6, $fa6, $fs5 - fadd.d $fa5, $fa5, $fs1 - fadd.d $fa6, $fa6, $fs6 - fmul.d $fa5, $fa5, $ft14 - fmul.d $fa6, $fa6, $ft14 - fmul.d $fa7, $ft2, $fa6 - fmadd.d $fa7, $ft1, $fa5, $fa7 - fmul.d $fa7, $fa7, $fa4 + fmul.d $ft4, $ft11, $ft4 + fmadd.d $ft3, $ft8, $ft3, $ft4 + fsub.d $ft3, $ft5, $ft3 + fadd.d $ft4, $fa2, $fa2 + fdiv.d $fs3, $ft3, $ft4 + fadd.d $ft3, $fa5, $fa7 + fadd.d $ft4, $fa6, $ft0 + fsub.d $ft3, $ft3, $fa3 + fsub.d $ft4, $ft4, $fa4 + fsub.d $ft3, $ft3, $ft1 + fsub.d $ft4, $ft4, $ft2 + fmul.d $ft3, $ft3, $fs4 + fmul.d $ft4, $ft4, $fs4 + fadd.d $fa7, $fa7, $ft1 + fadd.d $ft0, $ft0, $ft2 + fsub.d $fa3, $fa7, $fa3 + fsub.d $fa4, $ft0, $fa4 + fsub.d $fa3, $fa3, $fa5 + fsub.d $fa4, $fa4, $fa6 + fmul.d $fa3, $fa3, $fs4 + fmul.d $fa4, $fa4, $fs4 + fmul.d $fa5, $ft4, $ft4 + fmadd.d $fa5, $ft3, $ft3, $fa5 + fsqrt.d $fa5, $fa5 + fmul.d $fa6, $fa4, $fa4 + fmadd.d $fa6, $fa3, $fa3, $fa6 + fsqrt.d $fa6, $fa6 + fadd.d $fa0, $fa0, $ft7 + fadd.d $fa1, $fa1, $ft10 + fadd.d $fa0, $fa0, $ft12 + fadd.d $fa1, $fa1, $fs7 + fadd.d $fa0, $fa0, $ft6 + fadd.d $fa1, $fa1, $ft9 + fmul.d $fa0, $fa0, $fs5 + fmul.d $fa1, $fa1, $fs5 + fmul.d $fa7, $ft4, $fa1 + fmadd.d $fa7, $ft3, $fa0, $fa7 + fmul.d $fa7, $fa7, $fa6 fabs.d $fa7, $fa7 - fmul.d $fa1, $fa6, $fa1 - fmadd.d $fa0, $fa0, $fa5, $fa1 - fmul.d $fa0, $fa3, $fa0 + fmul.d $fa1, $fa1, $fa4 + fmadd.d $fa0, $fa3, $fa0, $fa1 + fmul.d $fa0, $fa5, $fa0 fabs.d $fa0, $fa0 fcmp.clt.d $fcc0, $fa0, $fa7 - fsel $fa0, $fa4, $fa3, $fcc0 - fsel $fa1, $fa3, $fa4, $fcc0 + fsel $fa0, $fa6, $fa5, $fcc0 + fsel $fa1, $fa5, $fa6, $fcc0 fdiv.d $fa0, $fa0, $fa1 - vldi $vr1, -1008 - fmul.d $fa1, $fa2, $fa1 + fmul.d $fa1, $fa2, $fs6 fmul.d $fa1, $fa1, $fa0 fsqrt.d $fa0, $fa1 fcmp.cor.d $fcc0, $fa0, $fa0 - fst.d $ft3, $s2, 0 + fst.d $fs3, $s2, 0 bceqz $fcc0, .LBB3_14 .LBB3_13: # %.split # in Loop: Header=BB3_10 Depth=1 - fld.d $fa1, $sp, 112 # 8-byte Folded Reload - fadd.d $fa1, $fa1, $fa1 + fadd.d $fa1, $fs2, $fs2 fcmp.clt.d $fcc0, $fa1, $fa0 fsel $fa0, $fa0, $fa1, $fcc0 - fadd.d $fa1, $fs7, $fs4 - fadd.d $fa2, $fs0, $fs5 - fsub.d $fa1, $fa1, $fs2 - fsub.d $fa2, $fa2, $fs3 - fsub.d $fa1, $fa1, $fs1 - fsub.d $fa2, $fa2, $fs6 - fmul.d $fa2, $fa2, $fa2 - fmadd.d $fa1, $fa1, $fa1, $fa2 - fadd.d $fa2, $fs4, $fs1 - fadd.d $fa3, $fs5, $fs6 - fsub.d $fa2, $fa2, $fs2 - fsub.d $fa3, $fa3, $fs3 - fsub.d $fa2, $fa2, $fs7 - fsub.d $fa3, $fa3, $fs0 - fmul.d $fa3, $fa3, $fa3 - fmadd.d $fa2, $fa2, $fa2, $fa3 + vreplvei.d $vr1, $vr20, 0 + vld $vr4, $sp, 176 # 16-byte Folded Reload + vfadd.d $vr1, $vr4, $vr1 + vreplvei.d $vr2, $vr31, 0 + vfadd.d $vr2, $vr22, $vr2 + vfsub.d $vr1, $vr1, $vr21 + vld $vr3, $sp, 192 # 16-byte Folded Reload + vfsub.d $vr2, $vr2, $vr3 + vshuf4i.d $vr4, $vr0, 1 + vfsub.d $vr1, $vr1, $vr4 + vshuf4i.d $vr22, $vr0, 1 + vfsub.d $vr2, $vr2, $vr22 + vfmul.d $vr2, $vr2, $vr2 + vfmadd.d $vr1, $vr1, $vr1, $vr2 + vreplvei.d $vr2, $vr1, 0 + vreplvei.d $vr1, $vr1, 1 movgr2fr.d $fa3, $zero - fcmp.clt.d $fcc0, $ft3, $fa3 + fcmp.clt.d $fcc0, $fs3, $fa3 fsel $fa0, $fa3, $fa0, $fcc0 fst.d $fa0, $s1, 0 fld.d $fa0, $s2, 0 @@ -661,51 +657,56 @@ _ZN3QCS12setCornerDivEPdS0_S0_S0_S0_ii: # @_ZN3QCS12setCornerDivEPdS0_S0_S0_S0_i .LBB3_14: # %call.sqrt # in Loop: Header=BB3_10 Depth=1 fmov.d $fa0, $fa1 - st.d $a0, $sp, 56 # 8-byte Folded Spill - st.d $t5, $sp, 48 # 8-byte Folded Spill - st.d $t8, $sp, 40 # 8-byte Folded Spill - st.d $ra, $sp, 32 # 8-byte Folded Spill - fst.d $ft12, $sp, 24 # 8-byte Folded Spill - fst.d $ft13, $sp, 16 # 8-byte Folded Spill - fst.d $ft3, $sp, 8 # 8-byte Folded Spill + st.d $a0, $sp, 120 # 8-byte Folded Spill + st.d $t5, $sp, 112 # 8-byte Folded Spill + st.d $t8, $sp, 104 # 8-byte Folded Spill + st.d $ra, $sp, 96 # 8-byte Folded Spill + vst $vr23, $sp, 80 # 16-byte Folded Spill + vst $vr31, $sp, 64 # 16-byte Folded Spill + vst $vr20, $sp, 48 # 16-byte Folded Spill + vst $vr21, $sp, 32 # 16-byte Folded Spill + vst $vr22, $sp, 16 # 16-byte Folded Spill pcaddu18i $ra, %call36(sqrt) jirl $ra, $ra, 0 - fld.d $ft3, $sp, 8 # 8-byte Folded Reload - vldi $vr22, -944 - fld.d $ft13, $sp, 16 # 8-byte Folded Reload - fld.d $ft12, $sp, 24 # 8-byte Folded Reload - vldi $vr23, -928 - ld.d $ra, $sp, 32 # 8-byte Folded Reload - ld.d $t8, $sp, 40 # 8-byte Folded Reload - ld.d $t7, $sp, 64 # 8-byte Folded Reload - ld.d $t6, $sp, 72 # 8-byte Folded Reload - ld.d $t5, $sp, 48 # 8-byte Folded Reload - ld.d $t4, $sp, 80 # 8-byte Folded Reload - ld.d $t3, $sp, 88 # 8-byte Folded Reload - ld.d $t2, $sp, 96 # 8-byte Folded Reload - ld.d $a0, $sp, 56 # 8-byte Folded Reload + vld $vr22, $sp, 16 # 16-byte Folded Reload + vld $vr21, $sp, 32 # 16-byte Folded Reload + vld $vr20, $sp, 48 # 16-byte Folded Reload + vld $vr31, $sp, 64 # 16-byte Folded Reload + vldi $vr30, -1008 + vldi $vr29, -944 + vldi $vr28, -928 + vld $vr23, $sp, 80 # 16-byte Folded Reload + ld.d $ra, $sp, 96 # 8-byte Folded Reload + ld.d $t8, $sp, 104 # 8-byte Folded Reload + ld.d $t7, $sp, 128 # 8-byte Folded Reload + ld.d $t6, $sp, 136 # 8-byte Folded Reload + ld.d $t5, $sp, 112 # 8-byte Folded Reload + ld.d $t4, $sp, 144 # 8-byte Folded Reload + ld.d $t3, $sp, 152 # 8-byte Folded Reload + ld.d $t2, $sp, 160 # 8-byte Folded Reload + ld.d $a0, $sp, 120 # 8-byte Folded Reload b .LBB3_13 .LBB3_15: # %._crit_edge - fld.d $fs7, $sp, 120 # 8-byte Folded Reload - fld.d $fs6, $sp, 128 # 8-byte Folded Reload - fld.d $fs5, $sp, 136 # 8-byte Folded Reload - fld.d $fs4, $sp, 144 # 8-byte Folded Reload - fld.d $fs3, $sp, 152 # 8-byte Folded Reload - fld.d $fs2, $sp, 160 # 8-byte Folded Reload - fld.d $fs1, $sp, 168 # 8-byte Folded Reload - fld.d $fs0, $sp, 176 # 8-byte Folded Reload - ld.d $s8, $sp, 184 # 8-byte Folded Reload - ld.d $s7, $sp, 192 # 8-byte Folded Reload - ld.d $s6, $sp, 200 # 8-byte Folded Reload - ld.d $s5, $sp, 208 # 8-byte Folded Reload - ld.d $s4, $sp, 216 # 8-byte Folded Reload - ld.d $s3, $sp, 224 # 8-byte Folded Reload - ld.d $s2, $sp, 232 # 8-byte Folded Reload - ld.d $s1, $sp, 240 # 8-byte Folded Reload - ld.d $s0, $sp, 248 # 8-byte Folded Reload - ld.d $fp, $sp, 256 # 8-byte Folded Reload - ld.d $ra, $sp, 264 # 8-byte Folded Reload - addi.d $sp, $sp, 272 + fld.d $fs7, $sp, 216 # 8-byte Folded Reload + fld.d $fs6, $sp, 224 # 8-byte Folded Reload + fld.d $fs5, $sp, 232 # 8-byte Folded Reload + fld.d $fs4, $sp, 240 # 8-byte Folded Reload + fld.d $fs3, $sp, 248 # 8-byte Folded Reload + fld.d $fs2, $sp, 256 # 8-byte Folded Reload + fld.d $fs1, $sp, 264 # 8-byte Folded Reload + fld.d $fs0, $sp, 272 # 8-byte Folded Reload + ld.d $s8, $sp, 280 # 8-byte Folded Reload + ld.d $s7, $sp, 288 # 8-byte Folded Reload + ld.d $s6, $sp, 296 # 8-byte Folded Reload + ld.d $s5, $sp, 304 # 8-byte Folded Reload + ld.d $s4, $sp, 312 # 8-byte Folded Reload + ld.d $s3, $sp, 320 # 8-byte Folded Reload + ld.d $s2, $sp, 328 # 8-byte Folded Reload + ld.d $s1, $sp, 336 # 8-byte Folded Reload + ld.d $s0, $sp, 344 # 8-byte Folded Reload + ld.d $fp, $sp, 352 # 8-byte Folded Reload + ld.d $ra, $sp, 360 # 8-byte Folded Reload + addi.d $sp, $sp, 368 pcaddu18i $t8, %call36(free) jr $t8 .Lfunc_end3: diff --git a/results/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/CMakeFiles/PENNANT.dir/TTS.s b/results/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/CMakeFiles/PENNANT.dir/TTS.s index bc89f229..928aa660 100644 --- a/results/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/CMakeFiles/PENNANT.dir/TTS.s +++ b/results/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT/CMakeFiles/PENNANT.dir/TTS.s @@ -157,11 +157,9 @@ _ZN3TTS9calcForceEPKdS1_S1_S1_S1_PK7double2PS2_ii: # @_ZN3TTS9calcForceEPKdS1_S1 alsl.d $t0, $t2, $t0, 2 alsl.d $a4, $t2, $a4, 3 alsl.d $a5, $t2, $a5, 3 + alsl.d $a6, $t2, $a6, 4 + alsl.d $a7, $t2, $a7, 4 sub.d $t1, $t1, $t2 - slli.d $t2, $t2, 4 - addi.d $t2, $t2, 8 - add.d $a6, $a6, $t2 - add.d $a7, $a7, $t2 .p2align 4, , 16 .LBB2_2: # =>This Inner Loop Header: Depth=1 ld.w $t2, $t0, 0 @@ -180,19 +178,17 @@ _ZN3TTS9calcForceEPKdS1_S1_S1_S1_PK7double2PS2_ii: # @_ZN3TTS9calcForceEPKdS1_S1 fmul.d $fa4, $fa0, $fa2 fmul.d $fa2, $fa2, $fa4 fsub.d $fa1, $fa1, $fa3 - fld.d $fa3, $a6, -8 - fld.d $fa4, $a6, 0 + vld $vr3, $a6, 0 fneg.d $fa1, $fa1 fmul.d $fa1, $fa2, $fa1 - fmul.d $fa2, $fa3, $fa1 - fmul.d $fa1, $fa4, $fa1 - fst.d $fa2, $a7, -8 - fst.d $fa1, $a7, 0 + vreplvei.d $vr1, $vr1, 0 + vfmul.d $vr1, $vr3, $vr1 + vst $vr1, $a7, 0 addi.d $t0, $t0, 4 addi.d $a4, $a4, 8 addi.d $a5, $a5, 8 - addi.d $t1, $t1, -1 addi.d $a6, $a6, 16 + addi.d $t1, $t1, -1 addi.d $a7, $a7, 16 bnez $t1, .LBB2_2 .LBB2_3: # %._crit_edge diff --git a/results/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/CMakeFiles/CoMD.dir/eam.s b/results/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/CMakeFiles/CoMD.dir/eam.s index a3adb40c..ddadb0ba 100644 --- a/results/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/CMakeFiles/CoMD.dir/eam.s +++ b/results/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/CMakeFiles/CoMD.dir/eam.s @@ -895,30 +895,30 @@ initEamPot: # @initEamPot .type eamForce,@function eamForce: # @eamForce # %bb.0: - addi.d $sp, $sp, -336 - st.d $ra, $sp, 328 # 8-byte Folded Spill - st.d $fp, $sp, 320 # 8-byte Folded Spill - st.d $s0, $sp, 312 # 8-byte Folded Spill - st.d $s1, $sp, 304 # 8-byte Folded Spill - st.d $s2, $sp, 296 # 8-byte Folded Spill - st.d $s3, $sp, 288 # 8-byte Folded Spill - st.d $s4, $sp, 280 # 8-byte Folded Spill - st.d $s5, $sp, 272 # 8-byte Folded Spill - st.d $s6, $sp, 264 # 8-byte Folded Spill - st.d $s7, $sp, 256 # 8-byte Folded Spill - st.d $s8, $sp, 248 # 8-byte Folded Spill - fst.d $fs0, $sp, 240 # 8-byte Folded Spill - fst.d $fs1, $sp, 232 # 8-byte Folded Spill - fst.d $fs2, $sp, 224 # 8-byte Folded Spill - fst.d $fs3, $sp, 216 # 8-byte Folded Spill - fst.d $fs4, $sp, 208 # 8-byte Folded Spill - fst.d $fs5, $sp, 200 # 8-byte Folded Spill + addi.d $sp, $sp, -368 + st.d $ra, $sp, 360 # 8-byte Folded Spill + st.d $fp, $sp, 352 # 8-byte Folded Spill + st.d $s0, $sp, 344 # 8-byte Folded Spill + st.d $s1, $sp, 336 # 8-byte Folded Spill + st.d $s2, $sp, 328 # 8-byte Folded Spill + st.d $s3, $sp, 320 # 8-byte Folded Spill + st.d $s4, $sp, 312 # 8-byte Folded Spill + st.d $s5, $sp, 304 # 8-byte Folded Spill + st.d $s6, $sp, 296 # 8-byte Folded Spill + st.d $s7, $sp, 288 # 8-byte Folded Spill + st.d $s8, $sp, 280 # 8-byte Folded Spill + fst.d $fs0, $sp, 272 # 8-byte Folded Spill + fst.d $fs1, $sp, 264 # 8-byte Folded Spill + fst.d $fs2, $sp, 256 # 8-byte Folded Spill + fst.d $fs3, $sp, 248 # 8-byte Folded Spill + fst.d $fs4, $sp, 240 # 8-byte Folded Spill + fst.d $fs5, $sp, 232 # 8-byte Folded Spill move $a2, $a0 ld.d $a3, $a0, 64 ld.d $a0, $a3, 104 ld.d $s0, $a2, 24 - st.d $a2, $sp, 80 # 8-byte Folded Spill - st.d $a3, $sp, 72 # 8-byte Folded Spill + st.d $a2, $sp, 112 # 8-byte Folded Spill + st.d $a3, $sp, 104 # 8-byte Folded Spill bnez $a0, .LBB1_2 # %bb.1: ld.w $a0, $s0, 20 @@ -927,26 +927,26 @@ eamForce: # @eamForce move $a0, $s1 pcaddu18i $ra, %call36(malloc) jirl $ra, $ra, 0 - ld.d $a1, $sp, 72 # 8-byte Folded Reload + ld.d $a1, $sp, 104 # 8-byte Folded Reload st.d $a0, $a1, 96 move $a0, $s1 pcaddu18i $ra, %call36(malloc) jirl $ra, $ra, 0 - ld.d $a1, $sp, 80 # 8-byte Folded Reload + ld.d $a1, $sp, 112 # 8-byte Folded Reload ld.d $a1, $a1, 16 - ld.d $a2, $sp, 72 # 8-byte Folded Reload + ld.d $a2, $sp, 104 # 8-byte Folded Reload st.d $a0, $a2, 88 move $a0, $a1 move $a1, $s0 pcaddu18i $ra, %call36(initForceHaloExchange) jirl $ra, $ra, 0 - ld.d $a1, $sp, 72 # 8-byte Folded Reload + ld.d $a1, $sp, 104 # 8-byte Folded Reload st.d $a0, $a1, 104 ori $a0, $zero, 16 pcaddu18i $ra, %call36(malloc) jirl $ra, $ra, 0 - ld.d $a3, $sp, 72 # 8-byte Folded Reload - ld.d $a2, $sp, 80 # 8-byte Folded Reload + ld.d $a3, $sp, 104 # 8-byte Folded Reload + ld.d $a2, $sp, 112 # 8-byte Folded Reload ld.d $a1, $a3, 96 ld.d $s0, $a2, 24 st.d $a0, $a3, 112 @@ -965,9 +965,9 @@ eamForce: # @eamForce move $fp, $a3 pcaddu18i $ra, %call36(memset) jirl $ra, $ra, 0 - ld.d $a0, $sp, 80 # 8-byte Folded Reload + ld.d $a0, $sp, 112 # 8-byte Folded Reload ld.d $a0, $a0, 24 - ld.d $a1, $sp, 80 # 8-byte Folded Reload + ld.d $a1, $sp, 112 # 8-byte Folded Reload ld.d $a1, $a1, 32 ld.w $a2, $a0, 20 ld.d $a0, $a1, 48 @@ -976,7 +976,7 @@ eamForce: # @eamForce move $a1, $zero pcaddu18i $ra, %call36(memset) jirl $ra, $ra, 0 - ld.d $a0, $sp, 80 # 8-byte Folded Reload + ld.d $a0, $sp, 112 # 8-byte Folded Reload ld.d $a0, $a0, 24 ld.w $a1, $a0, 20 ld.d $a0, $fp, 96 @@ -985,7 +985,7 @@ eamForce: # @eamForce move $a1, $zero pcaddu18i $ra, %call36(memset) jirl $ra, $ra, 0 - ld.d $a0, $sp, 80 # 8-byte Folded Reload + ld.d $a0, $sp, 112 # 8-byte Folded Reload ld.d $a0, $a0, 24 ld.w $a1, $a0, 20 ld.d $a0, $fp, 88 @@ -994,7 +994,7 @@ eamForce: # @eamForce move $a1, $zero pcaddu18i $ra, %call36(memset) jirl $ra, $ra, 0 - ld.d $a0, $sp, 80 # 8-byte Folded Reload + ld.d $a0, $sp, 112 # 8-byte Folded Reload ld.d $a0, $a0, 24 ld.w $a1, $a0, 12 blez $a1, .LBB1_34 @@ -1006,8 +1006,8 @@ eamForce: # @eamForce b .LBB1_6 .p2align 4, , 16 .LBB1_4: # in Loop: Header=BB1_6 Depth=1 - ld.d $t5, $sp, 80 # 8-byte Folded Reload - ld.d $t6, $sp, 72 # 8-byte Folded Reload + ld.d $t5, $sp, 112 # 8-byte Folded Reload + ld.d $t6, $sp, 104 # 8-byte Folded Reload .LBB1_5: # %._crit_edge258 # in Loop: Header=BB1_6 Depth=1 ld.d $a0, $t5, 24 @@ -1022,8 +1022,8 @@ eamForce: # @eamForce ld.d $a1, $a0, 120 slli.d $a2, $s0, 2 ldx.w $a1, $a1, $a2 - st.d $a1, $sp, 64 # 8-byte Folded Spill - addi.d $a2, $sp, 92 + st.d $a1, $sp, 96 # 8-byte Folded Spill + addi.d $a2, $sp, 124 move $a1, $s0 pcaddu18i $ra, %call36(getNeighborBoxes) jirl $ra, $ra, 0 @@ -1032,12 +1032,12 @@ eamForce: # @eamForce # in Loop: Header=BB1_6 Depth=1 move $a7, $zero bstrpick.d $t0, $s2, 31, 0 - ld.d $t5, $sp, 80 # 8-byte Folded Reload - ld.d $t6, $sp, 72 # 8-byte Folded Reload - addi.d $a5, $sp, 92 + ld.d $t5, $sp, 112 # 8-byte Folded Reload + ld.d $t6, $sp, 104 # 8-byte Folded Reload + addi.d $a5, $sp, 124 vldi $vr14, -928 vldi $vr15, -896 - ld.d $a6, $sp, 64 # 8-byte Folded Reload + ld.d $a6, $sp, 96 # 8-byte Folded Reload b .LBB1_9 .p2align 4, , 16 .LBB1_8: # %.loopexit239 @@ -1087,70 +1087,64 @@ eamForce: # @eamForce .p2align 4, , 16 .LBB1_15: # %interpolate.exit214 # in Loop: Header=BB1_17 Depth=4 - fsub.d $ft2, $fa7, $ft0 - fsub.d $ft3, $fa5, $fa6 - fsub.d $fa7, $ft1, $fa3 - fsub.d $fa7, $fa7, $ft3 - fmadd.d $fa7, $ft2, $fa7, $ft3 - fmul.d $fa7, $fa7, $ft6 - fmul.d $ft1, $fa2, $fa7 + fsub.d $fa7, $fa7, $ft0 + fsub.d $ft0, $fa5, $fa6 + fsub.d $ft1, $ft1, $fa3 + fsub.d $ft1, $ft1, $ft0 + fmadd.d $ft1, $fa7, $ft1, $ft0 + fmul.d $ft1, $ft1, $ft6 + fmul.d $fa1, $fa1, $ft1 alsl.d $a3, $a2, $a1, 3 slli.d $a2, $a2, 3 - fld.d $fa7, $a3, 8 - ld.d $a4, $fp, 40 - fld.d $ft0, $a3, -8 - slli.d $a3, $s5, 4 - alsl.d $a3, $s5, $a3, 3 - fldx.d $ft4, $a4, $a3 - fldx.d $fa2, $a1, $a2 - fmul.d $ft5, $ft1, $fs5 - fdiv.d $ft5, $ft5, $fa0 - fsub.d $ft4, $ft4, $ft5 - fstx.d $ft4, $a4, $a3 - fldx.d $ft4, $a4, $s8 - fadd.d $ft4, $ft5, $ft4 - add.d $a1, $a4, $a3 - fstx.d $ft4, $a4, $s8 - fld.d $ft4, $a1, 8 - add.d $a2, $a4, $s8 - fmul.d $ft5, $ft1, $fs4 - fdiv.d $ft5, $ft5, $fa0 - fsub.d $ft4, $ft4, $ft5 - fst.d $ft4, $a1, 8 - fld.d $ft4, $a2, 8 + fld.d $ft1, $a3, 8 + fld.d $ft2, $a3, -8 + ld.d $a3, $fp, 40 + slli.d $a4, $s5, 4 + alsl.d $a4, $s5, $a4, 3 + vreplvei.d $vr11, $vr1, 0 + vfmul.d $vr11, $vr11, $vr12 + vldx $vr12, $a3, $a4 + vreplvei.d $vr13, $vr0, 0 + vfdiv.d $vr11, $vr11, $vr13 + fldx.d $ft5, $a1, $a2 + vfsub.d $vr12, $vr12, $vr11 + vstx $vr12, $a3, $a4 + vldx $vr12, $a3, $s8 fadd.d $fa5, $fa5, $fa6 - fadd.d $fa6, $ft5, $ft4 - fst.d $fa6, $a2, 8 + vfadd.d $vr6, $vr11, $vr12 + add.d $a1, $a3, $a4 + vstx $vr6, $a3, $s8 + fmul.d $fa1, $fa1, $fs3 fld.d $fa6, $a1, 16 - fmul.d $ft4, $ft2, $ft6 - fmul.d $ft1, $ft1, $fs3 - fdiv.d $fa0, $ft1, $fa0 - fsub.d $fa6, $fa6, $fa0 - fst.d $fa6, $a1, 16 - fld.d $fa6, $a2, 16 + fmul.d $ft3, $fa7, $ft6 + add.d $a2, $a3, $s8 + fdiv.d $fa0, $fa1, $fa0 + fsub.d $fa1, $fa6, $fa0 + fst.d $fa1, $a1, 16 + fld.d $fa1, $a2, 16 fmadd.d $fa5, $fa3, $ft7, $fa5 - fmadd.d $fa5, $ft2, $fa5, $ft3 - fmadd.d $fa3, $ft4, $fa5, $fa3 - fadd.d $fa0, $fa0, $fa6 + fmadd.d $fa5, $fa7, $fa5, $ft0 + fmadd.d $fa3, $ft3, $fa5, $fa3 + fadd.d $fa0, $fa0, $fa1 ld.d $a1, $fp, 48 fst.d $fa0, $a2, 16 - fsub.d $fa0, $fa1, $fa4 + fsub.d $fa0, $fa2, $fa4 slli.d $a2, $s5, 3 fldx.d $fa1, $a1, $a2 - fsub.d $fa4, $fa7, $ft0 - fmul.d $fa5, $fa0, $ft6 - fadd.d $fa6, $fa7, $ft0 + fsub.d $fa2, $ft1, $ft2 + fmul.d $fa4, $fa0, $ft6 + fadd.d $fa5, $ft1, $ft2 fmadd.d $fa1, $fa3, $ft6, $fa1 fstx.d $fa1, $a1, $a2 fldx.d $fa1, $a1, $s1 - fmadd.d $fa6, $fa2, $ft7, $fa6 + fmadd.d $fa5, $ft5, $ft7, $fa5 ld.d $a3, $t6, 88 ld.w $a4, $t2, 12 fmadd.d $fa1, $fa3, $ft6, $fa1 fstx.d $fa1, $a1, $s1 fldx.d $fa1, $a3, $a2 - fmadd.d $fa0, $fa0, $fa6, $fa4 - fmadd.d $fa0, $fa5, $fa0, $fa2 + fmadd.d $fa0, $fa0, $fa5, $fa2 + fmadd.d $fa0, $fa4, $fa0, $ft5 slt $a1, $t1, $a4 fadd.d $fa1, $fa0, $fa1 fstx.d $fa1, $a3, $a2 @@ -1180,20 +1174,19 @@ eamForce: # @eamForce ld.d $a1, $fp, 24 alsl.d $a2, $s5, $s5, 1 slli.d $a2, $a2, 3 - add.d $a3, $a1, $a2 - fldx.d $fa0, $a1, $a2 - fldx.d $fa1, $a1, $s8 + vldx $vr0, $a1, $a2 + vldx $vr1, $a1, $s8 + add.d $a2, $a1, $a2 add.d $a1, $a1, $s8 - fld.d $fa2, $a3, 8 - fld.d $fa3, $a1, 8 - fsub.d $fs5, $fa0, $fa1 - fld.d $fa0, $a3, 16 - fld.d $fa1, $a1, 16 - fsub.d $fs4, $fa2, $fa3 - fmadd.d $fa2, $fs5, $fs5, $fs2 - fmadd.d $fa2, $fs4, $fs4, $fa2 - fsub.d $fs3, $fa0, $fa1 - fmadd.d $fa1, $fs3, $fs3, $fa2 + vfsub.d $vr12, $vr0, $vr1 + vreplvei.d $vr0, $vr12, 0 + fld.d $fa1, $a2, 16 + fld.d $fa2, $a1, 16 + fmadd.d $fa0, $fa0, $fa0, $fs2 + vreplvei.d $vr3, $vr12, 1 + fmadd.d $fa0, $fa3, $fa3, $fa0 + fsub.d $fs3, $fa1, $fa2 + fmadd.d $fa1, $fs3, $fs3, $fa0 fcmp.clt.d $fcc0, $fs0, $fa1 bcnez $fcc0, .LBB1_16 # %bb.20: # in Loop: Header=BB1_17 Depth=4 @@ -1203,25 +1196,25 @@ eamForce: # @eamForce .LBB1_21: # %.split # in Loop: Header=BB1_17 Depth=4 ld.d $a2, $t6, 64 - fld.d $fa1, $a2, 8 + fld.d $fa2, $a2, 8 ld.d $a1, $a2, 24 - fld.d $fa2, $a2, 16 - fcmp.clt.d $fcc0, $fa0, $fa1 - fsel $fa3, $fa0, $fa1, $fcc0 - fsub.d $fa1, $fa3, $fa1 - fmul.d $fa7, $fa2, $fa1 - vreplvei.d $vr1, $vr7, 0 + fld.d $fa1, $a2, 16 + fcmp.clt.d $fcc0, $fa0, $fa2 + fsel $fa3, $fa0, $fa2, $fcc0 + fsub.d $fa2, $fa3, $fa2 + fmul.d $fa7, $fa1, $fa2 + vreplvei.d $vr2, $vr7, 0 ld.w $a3, $a2, 0 - vfrintrm.d $vr8, $vr1 - ftintrz.w.d $fa1, $ft0 - movfr2gr.s $a2, $fa1 + vfrintrm.d $vr8, $vr2 + ftintrz.w.d $fa2, $ft0 + movfr2gr.s $a2, $fa2 bge $a3, $a2, .LBB1_23 # %bb.22: # in Loop: Header=BB1_17 Depth=4 - movgr2fr.w $fa1, $a3 - ffint.d.w $fa1, $fa1 - fdiv.d $fa7, $fa1, $fa2 - vreplvei.d $vr1, $vr7, 0 - vfrintrm.d $vr8, $vr1 + movgr2fr.w $fa2, $a3 + ffint.d.w $fa2, $fa2 + fdiv.d $fa7, $fa2, $fa1 + vreplvei.d $vr2, $vr7, 0 + vfrintrm.d $vr8, $vr2 move $a2, $a3 .LBB1_23: # %interpolate.exit # in Loop: Header=BB1_17 Depth=4 @@ -1231,55 +1224,58 @@ eamForce: # @eamForce fld.d $fa6, $a3, -8 fld.d $ft1, $a3, 16 slli.d $a2, $a2, 3 - fld.d $fa1, $a4, 8 + fld.d $fa2, $a4, 8 fldx.d $fa3, $a1, $a2 ld.d $a1, $a4, 24 fld.d $ft2, $a4, 16 - fcmp.clt.d $fcc0, $fa0, $fa1 - fsel $fa4, $fa0, $fa1, $fcc0 - fsub.d $fa1, $fa4, $fa1 - fmul.d $fa1, $ft2, $fa1 - vreplvei.d $vr4, $vr1, 0 + fcmp.clt.d $fcc0, $fa0, $fa2 + fsel $fa4, $fa0, $fa2, $fcc0 + fsub.d $fa2, $fa4, $fa2 + fmul.d $fa2, $ft2, $fa2 + vreplvei.d $vr4, $vr2, 0 ld.w $a3, $a4, 0 vfrintrm.d $vr4, $vr4 ftintrz.w.d $ft3, $fa4 movfr2gr.s $a2, $ft3 bge $a3, $a2, .LBB1_15 # %bb.24: # in Loop: Header=BB1_17 Depth=4 - movgr2fr.w $fa1, $a3 - ffint.d.w $fa1, $fa1 - fdiv.d $fa1, $fa1, $ft2 - vreplvei.d $vr4, $vr1, 0 + movgr2fr.w $fa2, $a3 + ffint.d.w $fa2, $fa2 + fdiv.d $fa2, $fa2, $ft2 + vreplvei.d $vr4, $vr2, 0 vfrintrm.d $vr4, $vr4 move $a2, $a3 b .LBB1_15 .LBB1_25: # %call.sqrt # in Loop: Header=BB1_17 Depth=4 fmov.d $fa0, $fa1 - st.d $s2, $sp, 56 # 8-byte Folded Spill + st.d $s2, $sp, 88 # 8-byte Folded Spill move $s2, $a0 - st.d $a7, $sp, 48 # 8-byte Folded Spill - st.d $t0, $sp, 40 # 8-byte Folded Spill - st.d $t1, $sp, 32 # 8-byte Folded Spill - st.d $t2, $sp, 24 # 8-byte Folded Spill - st.d $t3, $sp, 16 # 8-byte Folded Spill - st.d $t4, $sp, 8 # 8-byte Folded Spill + st.d $a7, $sp, 80 # 8-byte Folded Spill + st.d $t0, $sp, 72 # 8-byte Folded Spill + st.d $t1, $sp, 64 # 8-byte Folded Spill + st.d $t2, $sp, 56 # 8-byte Folded Spill + st.d $t3, $sp, 48 # 8-byte Folded Spill + st.d $t4, $sp, 40 # 8-byte Folded Spill + vst $vr12, $sp, 16 # 16-byte Folded Spill pcaddu18i $ra, %call36(sqrt) jirl $ra, $ra, 0 - ld.d $t4, $sp, 8 # 8-byte Folded Reload - ld.d $t3, $sp, 16 # 8-byte Folded Reload - ld.d $t2, $sp, 24 # 8-byte Folded Reload - ld.d $t1, $sp, 32 # 8-byte Folded Reload - ld.d $t0, $sp, 40 # 8-byte Folded Reload - ld.d $a7, $sp, 48 # 8-byte Folded Reload - ld.d $a6, $sp, 64 # 8-byte Folded Reload + vld $vr12, $sp, 16 # 16-byte Folded Reload + ld.d $t4, $sp, 40 # 8-byte Folded Reload + ld.d $t3, $sp, 48 # 8-byte Folded Reload + ld.d $t2, $sp, 56 # 8-byte Folded Reload + ld.d $t1, $sp, 64 # 8-byte Folded Reload + ld.d $t0, $sp, 72 # 8-byte Folded Reload + ld.d $a7, $sp, 80 # 8-byte Folded Reload + ld.d $a6, $sp, 96 # 8-byte Folded Reload vldi $vr15, -896 vldi $vr14, -928 - addi.d $a5, $sp, 92 - ld.d $t6, $sp, 72 # 8-byte Folded Reload - ld.d $t5, $sp, 80 # 8-byte Folded Reload + addi.d $a5, $sp, 124 + ld.d $t6, $sp, 104 # 8-byte Folded Reload + ld.d $t5, $sp, 112 # 8-byte Folded Reload move $a0, $s2 - ld.d $s2, $sp, 56 # 8-byte Folded Reload + ld.d $s2, $sp, 88 # 8-byte Folded Reload + # kill: def $f0_64 killed $f0_64 def $vr0 b .LBB1_21 .LBB1_26: # %.preheader blez $a1, .LBB1_35 @@ -1371,7 +1367,7 @@ eamForce: # @eamForce b .LBB1_31 .LBB1_34: movgr2fr.d $fs1, $zero - ld.d $t6, $sp, 72 # 8-byte Folded Reload + ld.d $t6, $sp, 104 # 8-byte Folded Reload .LBB1_35: # %._crit_edge277 ori $a0, $zero, 8 move $fp, $t6 @@ -1384,7 +1380,7 @@ eamForce: # @eamForce ori $a0, $zero, 8 pcaddu18i $ra, %call36(profileStop) jirl $ra, $ra, 0 - ld.d $a5, $sp, 80 # 8-byte Folded Reload + ld.d $a5, $sp, 112 # 8-byte Folded Reload ld.d $a0, $a5, 24 ld.w $a1, $a0, 12 blez $a1, .LBB1_56 @@ -1405,19 +1401,19 @@ eamForce: # @eamForce # Child Loop BB1_41 Depth 2 # Child Loop BB1_45 Depth 3 # Child Loop BB1_49 Depth 4 - st.d $a7, $sp, 64 # 8-byte Folded Spill + st.d $a7, $sp, 96 # 8-byte Folded Spill ld.d $a1, $a0, 120 slli.d $a2, $s0, 2 ldx.w $fp, $a1, $a2 - addi.d $a2, $sp, 92 + addi.d $a2, $sp, 124 move $a1, $s0 pcaddu18i $ra, %call36(getNeighborBoxes) jirl $ra, $ra, 0 vldi $vr8, -928 - addi.d $t0, $sp, 92 - ld.d $a7, $sp, 64 # 8-byte Folded Reload - ld.d $a5, $sp, 80 # 8-byte Folded Reload - ld.d $a6, $sp, 72 # 8-byte Folded Reload + addi.d $t0, $sp, 124 + ld.d $a7, $sp, 96 # 8-byte Folded Reload + ld.d $a5, $sp, 112 # 8-byte Folded Reload + ld.d $a6, $sp, 104 # 8-byte Folded Reload blez $a0, .LBB1_37 # %bb.39: # %.lr.ph294 # in Loop: Header=BB1_38 Depth=1 @@ -1466,11 +1462,11 @@ eamForce: # @eamForce blez $s1, .LBB1_44 # %bb.46: # %.lr.ph286.preheader # in Loop: Header=BB1_45 Depth=3 - move $fp, $zero + move $s3, $zero alsl.d $a1, $s6, $s6, 1 - slli.d $s3, $a1, 3 - move $s5, $t5 - move $s4, $t4 + slli.d $s5, $a1, 3 + move $s4, $t5 + move $fp, $t4 b .LBB1_49 .p2align 4, , 16 .LBB1_47: # %interpolate.exit226 @@ -1491,7 +1487,7 @@ eamForce: # @eamForce fmul.d $fa1, $fa1, $fa2 slli.d $a2, $s6, 3 fldx.d $fa2, $a1, $a2 - fldx.d $fa3, $a1, $s5 + fldx.d $fa3, $a1, $s4 ld.d $a3, $s2, 40 slli.d $a4, $s6, 4 alsl.d $a4, $s6, $a4, 3 @@ -1503,16 +1499,16 @@ eamForce: # @eamForce fsub.d $fa2, $fa3, $fa2 fstx.d $fa2, $a3, $a4 fldx.d $fa2, $a1, $a2 - fldx.d $fa3, $a1, $s5 + fldx.d $fa3, $a1, $s4 fadd.d $fa2, $fa2, $fa3 - fldx.d $fa3, $a3, $s4 + fldx.d $fa3, $a3, $fp fmul.d $fa2, $fa1, $fa2 fmul.d $fa2, $fs5, $fa2 fdiv.d $fa2, $fa2, $fa0 fadd.d $fa2, $fa3, $fa2 - fstx.d $fa2, $a3, $s4 + fstx.d $fa2, $a3, $fp fldx.d $fa2, $a1, $a2 - fldx.d $fa3, $a1, $s5 + fldx.d $fa3, $a1, $s4 add.d $a4, $a3, $a4 fadd.d $fa2, $fa2, $fa3 fld.d $fa3, $a4, 8 @@ -1522,8 +1518,8 @@ eamForce: # @eamForce fsub.d $fa2, $fa3, $fa2 fst.d $fa2, $a4, 8 fldx.d $fa2, $a1, $a2 - fldx.d $fa3, $a1, $s5 - add.d $a3, $a3, $s4 + fldx.d $fa3, $a1, $s4 + add.d $a3, $a3, $fp fadd.d $fa2, $fa2, $fa3 fld.d $fa3, $a3, 8 fmul.d $fa2, $fa1, $fa2 @@ -1532,7 +1528,7 @@ eamForce: # @eamForce fadd.d $fa2, $fa3, $fa2 fst.d $fa2, $a3, 8 fldx.d $fa2, $a1, $a2 - fldx.d $fa3, $a1, $s5 + fldx.d $fa3, $a1, $s4 fadd.d $fa2, $fa2, $fa3 fld.d $fa3, $a4, 16 fmul.d $fa2, $fa1, $fa2 @@ -1541,7 +1537,7 @@ eamForce: # @eamForce fsub.d $fa2, $fa3, $fa2 fst.d $fa2, $a4, 16 fldx.d $fa2, $a1, $a2 - fldx.d $fa3, $a1, $s5 + fldx.d $fa3, $a1, $s4 fadd.d $fa2, $fa2, $fa3 fld.d $fa3, $a3, 16 fmul.d $fa1, $fa1, $fa2 @@ -1551,10 +1547,10 @@ eamForce: # @eamForce fst.d $fa0, $a3, 16 .LBB1_48: # %.loopexit # in Loop: Header=BB1_49 Depth=4 - addi.w $fp, $fp, 1 - addi.d $s4, $s4, 24 - addi.d $s5, $s5, 8 - beq $s1, $fp, .LBB1_44 + addi.w $s3, $s3, 1 + addi.d $fp, $fp, 24 + addi.d $s4, $s4, 8 + beq $s1, $s3, .LBB1_44 .LBB1_49: # %.lr.ph286 # Parent Loop BB1_38 Depth=1 # Parent Loop BB1_41 Depth=2 @@ -1563,14 +1559,14 @@ eamForce: # @eamForce bne $s0, $s8, .LBB1_51 # %bb.50: # %.lr.ph286 # in Loop: Header=BB1_49 Depth=4 - bgeu $s7, $fp, .LBB1_48 + bgeu $s7, $s3, .LBB1_48 .LBB1_51: # in Loop: Header=BB1_49 Depth=4 ld.d $s2, $a5, 32 ld.d $a1, $s2, 24 - add.d $a2, $a1, $s3 - fldx.d $fa0, $a1, $s3 - fldx.d $fa1, $a1, $s4 - add.d $a1, $a1, $s4 + add.d $a2, $a1, $s5 + fldx.d $fa0, $a1, $s5 + fldx.d $fa1, $a1, $fp + add.d $a1, $a1, $fp fld.d $fa2, $a2, 8 fld.d $fa3, $a1, 8 fsub.d $fs5, $fa0, $fa1 @@ -1587,7 +1583,7 @@ eamForce: # @eamForce fsqrt.d $fa0, $fa1 fcmp.cor.d $fcc0, $fa0, $fa0 bceqz $fcc0, .LBB1_55 -.LBB1_53: # %.split421 +.LBB1_53: # %.split417 # in Loop: Header=BB1_49 Depth=4 ld.d $a2, $a6, 72 fld.d $fa2, $a2, 8 @@ -1611,50 +1607,50 @@ eamForce: # @eamForce vfrintrm.d $vr3, $vr3 move $a3, $a2 b .LBB1_47 -.LBB1_55: # %call.sqrt422 +.LBB1_55: # %call.sqrt418 # in Loop: Header=BB1_49 Depth=4 fmov.d $fa0, $fa1 - st.d $a0, $sp, 56 # 8-byte Folded Spill - st.d $t1, $sp, 48 # 8-byte Folded Spill - st.d $t2, $sp, 40 # 8-byte Folded Spill - st.d $t3, $sp, 32 # 8-byte Folded Spill - st.d $t4, $sp, 24 # 8-byte Folded Spill - st.d $t5, $sp, 16 # 8-byte Folded Spill + st.d $a0, $sp, 88 # 8-byte Folded Spill + st.d $t1, $sp, 80 # 8-byte Folded Spill + st.d $t2, $sp, 72 # 8-byte Folded Spill + st.d $t3, $sp, 64 # 8-byte Folded Spill + st.d $t4, $sp, 56 # 8-byte Folded Spill + st.d $t5, $sp, 48 # 8-byte Folded Spill pcaddu18i $ra, %call36(sqrt) jirl $ra, $ra, 0 - ld.d $t5, $sp, 16 # 8-byte Folded Reload - ld.d $t4, $sp, 24 # 8-byte Folded Reload - ld.d $t3, $sp, 32 # 8-byte Folded Reload - ld.d $t2, $sp, 40 # 8-byte Folded Reload - ld.d $t1, $sp, 48 # 8-byte Folded Reload + ld.d $t5, $sp, 48 # 8-byte Folded Reload + ld.d $t4, $sp, 56 # 8-byte Folded Reload + ld.d $t3, $sp, 64 # 8-byte Folded Reload + ld.d $t2, $sp, 72 # 8-byte Folded Reload + ld.d $t1, $sp, 80 # 8-byte Folded Reload vldi $vr8, -928 - addi.d $t0, $sp, 92 - ld.d $a7, $sp, 64 # 8-byte Folded Reload - ld.d $a6, $sp, 72 # 8-byte Folded Reload - ld.d $a5, $sp, 80 # 8-byte Folded Reload - ld.d $a0, $sp, 56 # 8-byte Folded Reload + addi.d $t0, $sp, 124 + ld.d $a7, $sp, 96 # 8-byte Folded Reload + ld.d $a6, $sp, 104 # 8-byte Folded Reload + ld.d $a5, $sp, 112 # 8-byte Folded Reload + ld.d $a0, $sp, 88 # 8-byte Folded Reload b .LBB1_53 .LBB1_56: # %._crit_edge299 fst.d $fs1, $a5, 48 move $a0, $zero - fld.d $fs5, $sp, 200 # 8-byte Folded Reload - fld.d $fs4, $sp, 208 # 8-byte Folded Reload - fld.d $fs3, $sp, 216 # 8-byte Folded Reload - fld.d $fs2, $sp, 224 # 8-byte Folded Reload - fld.d $fs1, $sp, 232 # 8-byte Folded Reload - fld.d $fs0, $sp, 240 # 8-byte Folded Reload - ld.d $s8, $sp, 248 # 8-byte Folded Reload - ld.d $s7, $sp, 256 # 8-byte Folded Reload - ld.d $s6, $sp, 264 # 8-byte Folded Reload - ld.d $s5, $sp, 272 # 8-byte Folded Reload - ld.d $s4, $sp, 280 # 8-byte Folded Reload - ld.d $s3, $sp, 288 # 8-byte Folded Reload - ld.d $s2, $sp, 296 # 8-byte Folded Reload - ld.d $s1, $sp, 304 # 8-byte Folded Reload - ld.d $s0, $sp, 312 # 8-byte Folded Reload - ld.d $fp, $sp, 320 # 8-byte Folded Reload - ld.d $ra, $sp, 328 # 8-byte Folded Reload - addi.d $sp, $sp, 336 + fld.d $fs5, $sp, 232 # 8-byte Folded Reload + fld.d $fs4, $sp, 240 # 8-byte Folded Reload + fld.d $fs3, $sp, 248 # 8-byte Folded Reload + fld.d $fs2, $sp, 256 # 8-byte Folded Reload + fld.d $fs1, $sp, 264 # 8-byte Folded Reload + fld.d $fs0, $sp, 272 # 8-byte Folded Reload + ld.d $s8, $sp, 280 # 8-byte Folded Reload + ld.d $s7, $sp, 288 # 8-byte Folded Reload + ld.d $s6, $sp, 296 # 8-byte Folded Reload + ld.d $s5, $sp, 304 # 8-byte Folded Reload + ld.d $s4, $sp, 312 # 8-byte Folded Reload + ld.d $s3, $sp, 320 # 8-byte Folded Reload + ld.d $s2, $sp, 328 # 8-byte Folded Reload + ld.d $s1, $sp, 336 # 8-byte Folded Reload + ld.d $s0, $sp, 344 # 8-byte Folded Reload + ld.d $fp, $sp, 352 # 8-byte Folded Reload + ld.d $ra, $sp, 360 # 8-byte Folded Reload + addi.d $sp, $sp, 368 ret .Lfunc_end1: .size eamForce, .Lfunc_end1-eamForce diff --git a/results/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/CMakeFiles/CoMD.dir/initAtoms.s b/results/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/CMakeFiles/CoMD.dir/initAtoms.s index 3dcb9540..7534f082 100644 --- a/results/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/CMakeFiles/CoMD.dir/initAtoms.s +++ b/results/MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/CMakeFiles/CoMD.dir/initAtoms.s @@ -718,6 +718,7 @@ setTemperature: # @setTemperature ld.d $a1, $a1, 120 move $a2, $zero move $a3, $zero + vreplvei.d $vr1, $vr0, 0 b .LBB4_13 .p2align 4, , 16 .LBB4_12: # %._crit_edge72 @@ -741,15 +742,12 @@ setTemperature: # @setTemperature .p2align 4, , 16 .LBB4_15: # Parent Loop BB4_13 Depth=1 # => This Inner Loop Header: Depth=2 - fld.d $fa1, $a5, -16 - fld.d $fa2, $a5, -8 - fmul.d $fa1, $fa0, $fa1 + vld $vr2, $a5, -16 fld.d $fa3, $a5, 0 - fst.d $fa1, $a5, -16 - fmul.d $fa1, $fa0, $fa2 - fst.d $fa1, $a5, -8 - fmul.d $fa1, $fa0, $fa3 - fst.d $fa1, $a5, 0 + vfmul.d $vr2, $vr1, $vr2 + vst $vr2, $a5, -16 + fmul.d $fa2, $fa0, $fa3 + fst.d $fa2, $a5, 0 addi.w $a4, $a4, -1 addi.d $a5, $a5, 24 bnez $a4, .LBB4_15 @@ -775,10 +773,11 @@ setTemperature: # @setTemperature ld.d $ra, $sp, 136 # 8-byte Folded Reload addi.d $sp, $sp, 144 ret -.LBB4_18: # %call.sqrt106 +.LBB4_18: # %call.sqrt105 fmov.d $fa0, $fa1 pcaddu18i $ra, %call36(sqrt) jirl $ra, $ra, 0 + # kill: def $f0_64 killed $f0_64 def $vr0 b .LBB4_10 .Lfunc_end4: .size setTemperature, .Lfunc_end4-setTemperature diff --git a/results/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/CMakeFiles/SimpleMOC.dir/solver.s b/results/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/CMakeFiles/SimpleMOC.dir/solver.s index e761c261..e15bb8ad 100644 --- a/results/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/CMakeFiles/SimpleMOC.dir/solver.s +++ b/results/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/CMakeFiles/SimpleMOC.dir/solver.s @@ -5,449 +5,460 @@ .type attenuate_fluxes,@function attenuate_fluxes: # @attenuate_fluxes # %bb.0: - addi.d $sp, $sp, -128 - st.d $ra, $sp, 120 # 8-byte Folded Spill - st.d $fp, $sp, 112 # 8-byte Folded Spill - st.d $s0, $sp, 104 # 8-byte Folded Spill - st.d $s1, $sp, 96 # 8-byte Folded Spill - st.d $s2, $sp, 88 # 8-byte Folded Spill - st.d $s3, $sp, 80 # 8-byte Folded Spill - st.d $s4, $sp, 72 # 8-byte Folded Spill - st.d $s5, $sp, 64 # 8-byte Folded Spill - st.d $s6, $sp, 56 # 8-byte Folded Spill - st.d $s7, $sp, 48 # 8-byte Folded Spill - st.d $s8, $sp, 40 # 8-byte Folded Spill + addi.d $sp, $sp, -112 + st.d $ra, $sp, 104 # 8-byte Folded Spill + st.d $fp, $sp, 96 # 8-byte Folded Spill + st.d $s0, $sp, 88 # 8-byte Folded Spill + st.d $s1, $sp, 80 # 8-byte Folded Spill + st.d $s2, $sp, 72 # 8-byte Folded Spill + st.d $s3, $sp, 64 # 8-byte Folded Spill + st.d $s4, $sp, 56 # 8-byte Folded Spill + st.d $s5, $sp, 48 # 8-byte Folded Spill + st.d $s6, $sp, 40 # 8-byte Folded Spill + st.d $s7, $sp, 32 # 8-byte Folded Spill + st.d $s8, $sp, 24 # 8-byte Folded Spill # kill: def $f1 killed $f1 def $vr1 # kill: def $f0 killed $f0 def $vr0 - ld.w $s0, $a3, 8 - ld.w $t1, $a3, 12 - ld.w $a7, $a3, 36 - ld.w $s1, $a3, 44 - fld.s $fa5, $a3, 60 - ld.d $s2, $a4, 40 - fld.s $fa3, $a4, 48 - fld.s $fa4, $a4, 52 - ld.d $t6, $a5, 0 - ld.d $t3, $a5, 8 - ld.d $fp, $a5, 16 - ld.d $t7, $a5, 24 - ld.d $t4, $a5, 32 - ld.d $t5, $a5, 40 - ld.d $t0, $a5, 48 - ld.d $t2, $a5, 56 - ld.d $a6, $a5, 64 - ld.d $t8, $a5, 72 - ld.d $a3, $a5, 80 - ld.d $a4, $a5, 88 - ld.d $s5, $a5, 96 - ld.d $s6, $a5, 104 - mul.d $a5, $t1, $s0 - mul.d $a5, $a5, $s1 - fld.s $fa6, $a0, 4 - movgr2fr.w $fa7, $a5 - ffint.s.w $fa7, $fa7 - fdiv.s $ft0, $fa5, $fa7 - fdiv.s $fa5, $fa6, $ft0 - ftintrz.w.s $fa5, $fa5 - movfr2gr.s $a5, $fa5 - movgr2fr.w $fa5, $a5 - ffint.s.w $fa7, $fa5 - vldi $vr9, -1184 - ld.d $s1, $a2, 0 - mod.w $s0, $a5, $t1 - fld.s $fa5, $a0, 0 - slli.d $a5, $s0, 3 - ldx.d $s1, $s1, $a5 - fadd.s $fa7, $fa7, $ft1 - fneg.s $ft1, $ft0 - fmadd.s $fa6, $ft1, $fa7, $fa6 - beqz $s0, .LBB0_7 + ld.w $t3, $a3, 8 + ld.w $t2, $a3, 12 + ld.w $a6, $a3, 36 + ld.w $t4, $a3, 44 + fld.s $ft1, $a3, 60 + ld.d $s1, $a4, 40 + fld.s $fa7, $a4, 48 + fld.s $ft0, $a4, 52 + ld.d $t0, $a5, 0 + ld.d $a7, $a5, 8 + ld.d $t7, $a5, 16 + vld $vr5, $a5, 24 + vld $vr6, $a5, 40 + ld.d $s7, $a5, 56 + ld.d $a4, $a5, 64 + ld.d $t1, $a5, 72 + vld $vr3, $a5, 80 + vld $vr4, $a5, 96 + mul.d $a3, $t2, $t3 + mul.d $a3, $a3, $t4 + fld.s $ft2, $a0, 4 + movgr2fr.w $ft3, $a3 + ffint.s.w $ft3, $ft3 + fdiv.s $ft4, $ft1, $ft3 + fdiv.s $ft1, $ft2, $ft4 + ftintrz.w.s $ft1, $ft1 + movfr2gr.s $a3, $ft1 + movgr2fr.w $ft1, $a3 + ffint.s.w $ft3, $ft1 + vldi $vr13, -1184 + ld.d $t3, $a2, 0 + mod.w $a5, $a3, $t2 + fld.s $ft1, $a0, 0 + slli.d $a3, $a5, 3 + ldx.d $t3, $t3, $a3 + fadd.s $ft3, $ft3, $ft5 + fneg.s $ft5, $ft4 + fmadd.s $ft2, $ft5, $ft3, $ft2 + beqz $a5, .LBB0_7 # %bb.1: - addi.w $t1, $t1, -1 - bne $s0, $t1, .LBB0_12 + addi.w $t2, $t2, -1 + bne $a5, $t2, .LBB0_12 # %bb.2: - blez $a7, .LBB0_96 + blez $a6, .LBB0_92 # %bb.3: # %.lr.ph338 - st.d $s1, $sp, 16 # 8-byte Folded Spill - st.d $s5, $sp, 32 # 8-byte Folded Spill - ld.d $t1, $a2, 8 - fadd.s $fa6, $ft0, $fa6 - alsl.d $s0, $s0, $t1, 3 - ld.d $s1, $s0, -16 - ld.d $s3, $s0, -8 - ldx.d $s4, $t1, $a5 - fadd.s $fa7, $ft0, $ft0 - ori $a5, $zero, 16 - fmul.s $ft0, $ft0, $fa7 - st.d $s6, $sp, 24 # 8-byte Folded Spill - bgeu $a7, $a5, .LBB0_145 + st.d $t3, $sp, 16 # 8-byte Folded Spill + ld.d $t2, $a2, 8 + fadd.s $ft2, $ft4, $ft2 + alsl.d $a5, $a5, $t2, 3 + ld.d $t3, $a5, -16 + ld.d $t4, $a5, -8 + ldx.d $t5, $t2, $a3 + fadd.s $ft3, $ft4, $ft4 + ori $a3, $zero, 16 + fmul.s $ft4, $ft4, $ft3 + bgeu $a6, $a3, .LBB0_140 # %bb.4: - move $s5, $zero + move $t6, $zero .LBB0_5: # %scalar.ph500.preheader - alsl.d $t1, $s5, $fp, 2 - alsl.d $s0, $s5, $t3, 2 - alsl.d $s6, $s5, $t6, 2 - alsl.d $s4, $s5, $s4, 2 - alsl.d $s3, $s5, $s3, 2 - alsl.d $s1, $s5, $s1, 2 - sub.d $s5, $a7, $s5 - vldi $vr9, -1152 + alsl.d $a3, $t6, $t7, 2 + alsl.d $a5, $t6, $a7, 2 + alsl.d $t2, $t6, $t0, 2 + alsl.d $t5, $t6, $t5, 2 + alsl.d $t4, $t6, $t4, 2 + alsl.d $t3, $t6, $t3, 2 + sub.d $t6, $a6, $t6 + vldi $vr13, -1152 .p2align 4, , 16 .LBB0_6: # %scalar.ph500 # =>This Inner Loop Header: Depth=1 - fld.s $ft2, $s1, 0 - fld.s $ft3, $s4, 0 - fld.s $ft4, $s3, 0 - fsub.s $ft5, $ft2, $ft3 - fdiv.s $ft5, $ft5, $fa7 - fmadd.s $ft2, $ft4, $ft1, $ft2 - fadd.s $ft2, $ft2, $ft3 - fdiv.s $ft2, $ft2, $ft0 - fmadd.s $ft3, $ft5, $fa6, $ft4 - fmul.s $ft4, $fa6, $ft2 - fmadd.s $ft3, $ft4, $fa6, $ft3 - fst.s $ft3, $s6, 0 - fadd.s $ft3, $ft2, $ft2 - fmadd.s $ft3, $ft3, $fa6, $ft5 - fst.s $ft3, $s0, 0 - fst.s $ft2, $t1, 0 - addi.d $t1, $t1, 4 - addi.d $s0, $s0, 4 - addi.d $s6, $s6, 4 - addi.d $s4, $s4, 4 - addi.d $s3, $s3, 4 - addi.d $s5, $s5, -1 - addi.d $s1, $s1, 4 - bnez $s5, .LBB0_6 + fld.s $ft6, $t3, 0 + fld.s $ft7, $t5, 0 + fld.s $ft8, $t4, 0 + fsub.s $ft9, $ft6, $ft7 + fdiv.s $ft9, $ft9, $ft3 + fmadd.s $ft6, $ft8, $ft5, $ft6 + fadd.s $ft6, $ft6, $ft7 + fdiv.s $ft6, $ft6, $ft4 + fmadd.s $ft7, $ft9, $ft2, $ft8 + fmul.s $ft8, $ft2, $ft6 + fmadd.s $ft7, $ft8, $ft2, $ft7 + fst.s $ft7, $t2, 0 + fadd.s $ft7, $ft6, $ft6 + fmadd.s $ft7, $ft7, $ft2, $ft9 + fst.s $ft7, $a5, 0 + fst.s $ft6, $a3, 0 + addi.d $a3, $a3, 4 + addi.d $a5, $a5, 4 + addi.d $t2, $t2, 4 + addi.d $t5, $t5, 4 + addi.d $t4, $t4, 4 + addi.d $t6, $t6, -1 + addi.d $t3, $t3, 4 + bnez $t6, .LBB0_6 b .LBB0_17 .LBB0_7: - blez $a7, .LBB0_96 + blez $a6, .LBB0_92 # %bb.8: # %.lr.ph340 - st.d $s1, $sp, 16 # 8-byte Folded Spill - ld.d $t1, $a2, 8 - fsub.s $fa6, $fa6, $ft0 - ldx.d $s1, $t1, $a5 - ld.d $s3, $t1, 8 - ld.d $s4, $t1, 16 - fadd.s $fa7, $ft0, $ft0 - ori $t1, $zero, 16 - fmul.s $ft0, $ft0, $fa7 - st.d $s5, $sp, 32 # 8-byte Folded Spill - st.d $s6, $sp, 24 # 8-byte Folded Spill - bgeu $a7, $t1, .LBB0_130 + st.d $t3, $sp, 16 # 8-byte Folded Spill + ld.d $a5, $a2, 8 + fsub.s $ft2, $ft2, $ft4 + ldx.d $t3, $a5, $a3 + ld.d $t4, $a5, 8 + ld.d $t5, $a5, 16 + fadd.s $ft3, $ft4, $ft4 + ori $a3, $zero, 16 + fmul.s $ft4, $ft4, $ft3 + bgeu $a6, $a3, .LBB0_119 # %bb.9: - move $a5, $zero + move $t6, $zero .LBB0_10: # %scalar.ph544.preheader - alsl.d $t1, $a5, $fp, 2 - alsl.d $s0, $a5, $t3, 2 - alsl.d $s6, $a5, $t6, 2 - alsl.d $s4, $a5, $s4, 2 - alsl.d $s3, $a5, $s3, 2 - alsl.d $s1, $a5, $s1, 2 - sub.d $s5, $a7, $a5 - vldi $vr9, -1152 + alsl.d $a3, $t6, $t7, 2 + alsl.d $a5, $t6, $a7, 2 + alsl.d $t2, $t6, $t0, 2 + alsl.d $t5, $t6, $t5, 2 + alsl.d $t4, $t6, $t4, 2 + alsl.d $t3, $t6, $t3, 2 + sub.d $t6, $a6, $t6 + vldi $vr13, -1152 .p2align 4, , 16 .LBB0_11: # %scalar.ph544 # =>This Inner Loop Header: Depth=1 - fld.s $ft2, $s1, 0 - fld.s $ft3, $s4, 0 - fld.s $ft4, $s3, 0 - fsub.s $ft5, $ft2, $ft3 - fdiv.s $ft5, $ft5, $fa7 - fmadd.s $ft2, $ft4, $ft1, $ft2 - fadd.s $ft2, $ft2, $ft3 - fdiv.s $ft2, $ft2, $ft0 - fmadd.s $ft3, $ft5, $fa6, $ft4 - fmul.s $ft4, $fa6, $ft2 - fmadd.s $ft3, $ft4, $fa6, $ft3 - fst.s $ft3, $s6, 0 - fadd.s $ft3, $ft2, $ft2 - fmadd.s $ft3, $ft3, $fa6, $ft5 - fst.s $ft3, $s0, 0 - fst.s $ft2, $t1, 0 - addi.d $t1, $t1, 4 - addi.d $s0, $s0, 4 - addi.d $s6, $s6, 4 - addi.d $s4, $s4, 4 - addi.d $s3, $s3, 4 - addi.d $s5, $s5, -1 - addi.d $s1, $s1, 4 - bnez $s5, .LBB0_11 + fld.s $ft6, $t3, 0 + fld.s $ft7, $t5, 0 + fld.s $ft8, $t4, 0 + fsub.s $ft9, $ft6, $ft7 + fdiv.s $ft9, $ft9, $ft3 + fmadd.s $ft6, $ft8, $ft5, $ft6 + fadd.s $ft6, $ft6, $ft7 + fdiv.s $ft6, $ft6, $ft4 + fmadd.s $ft7, $ft9, $ft2, $ft8 + fmul.s $ft8, $ft2, $ft6 + fmadd.s $ft7, $ft8, $ft2, $ft7 + fst.s $ft7, $t2, 0 + fadd.s $ft7, $ft6, $ft6 + fmadd.s $ft7, $ft7, $ft2, $ft9 + fst.s $ft7, $a5, 0 + fst.s $ft6, $a3, 0 + addi.d $a3, $a3, 4 + addi.d $a5, $a5, 4 + addi.d $t2, $t2, 4 + addi.d $t5, $t5, 4 + addi.d $t4, $t4, 4 + addi.d $t6, $t6, -1 + addi.d $t3, $t3, 4 + bnez $t6, .LBB0_11 b .LBB0_17 .LBB0_12: # %.preheader334 - blez $a7, .LBB0_96 + blez $a6, .LBB0_92 # %bb.13: # %.lr.ph - st.d $s1, $sp, 16 # 8-byte Folded Spill - st.d $s5, $sp, 32 # 8-byte Folded Spill - ld.d $t1, $a2, 8 - alsl.d $s0, $s0, $t1, 3 - ld.d $s1, $s0, -8 - ldx.d $s3, $t1, $a5 - ld.d $s4, $s0, 8 - fadd.s $fa7, $ft0, $ft0 - ori $a5, $zero, 16 - fmul.s $ft0, $ft0, $fa7 - st.d $s6, $sp, 24 # 8-byte Folded Spill - bgeu $a7, $a5, .LBB0_160 + st.d $t3, $sp, 16 # 8-byte Folded Spill + ld.d $t2, $a2, 8 + alsl.d $a5, $a5, $t2, 3 + ld.d $t3, $a5, -8 + ldx.d $t4, $t2, $a3 + ld.d $t5, $a5, 8 + fadd.s $ft3, $ft4, $ft4 + ori $a3, $zero, 16 + fmul.s $ft4, $ft4, $ft3 + bgeu $a6, $a3, .LBB0_155 # %bb.14: - move $s5, $zero + move $t6, $zero .LBB0_15: # %scalar.ph.preheader - alsl.d $t1, $s5, $fp, 2 - alsl.d $s0, $s5, $t3, 2 - alsl.d $s6, $s5, $t6, 2 - alsl.d $s4, $s5, $s4, 2 - alsl.d $s3, $s5, $s3, 2 - alsl.d $s1, $s5, $s1, 2 - sub.d $s5, $a7, $s5 - vldi $vr9, -1152 + alsl.d $a3, $t6, $t7, 2 + alsl.d $a5, $t6, $a7, 2 + alsl.d $t2, $t6, $t0, 2 + alsl.d $t5, $t6, $t5, 2 + alsl.d $t4, $t6, $t4, 2 + alsl.d $t3, $t6, $t3, 2 + sub.d $t6, $a6, $t6 + vldi $vr13, -1152 .p2align 4, , 16 .LBB0_16: # %scalar.ph # =>This Inner Loop Header: Depth=1 - fld.s $ft2, $s1, 0 - fld.s $ft3, $s4, 0 - fld.s $ft4, $s3, 0 - fsub.s $ft5, $ft2, $ft3 - fdiv.s $ft5, $ft5, $fa7 - fmadd.s $ft2, $ft4, $ft1, $ft2 - fadd.s $ft2, $ft2, $ft3 - fdiv.s $ft2, $ft2, $ft0 - fmadd.s $ft3, $ft5, $fa6, $ft4 - fmul.s $ft4, $fa6, $ft2 - fmadd.s $ft3, $ft4, $fa6, $ft3 - fst.s $ft3, $s6, 0 - fadd.s $ft3, $ft2, $ft2 - fmadd.s $ft3, $ft3, $fa6, $ft5 - fst.s $ft3, $s0, 0 - fst.s $ft2, $t1, 0 - addi.d $t1, $t1, 4 - addi.d $s0, $s0, 4 - addi.d $s6, $s6, 4 - addi.d $s4, $s4, 4 - addi.d $s3, $s3, 4 - addi.d $s5, $s5, -1 - addi.d $s1, $s1, 4 - bnez $s5, .LBB0_16 + fld.s $ft6, $t3, 0 + fld.s $ft7, $t5, 0 + fld.s $ft8, $t4, 0 + fsub.s $ft9, $ft6, $ft7 + fdiv.s $ft9, $ft9, $ft3 + fmadd.s $ft6, $ft8, $ft5, $ft6 + fadd.s $ft6, $ft6, $ft7 + fdiv.s $ft6, $ft6, $ft4 + fmadd.s $ft7, $ft9, $ft2, $ft8 + fmul.s $ft8, $ft2, $ft6 + fmadd.s $ft7, $ft8, $ft2, $ft7 + fst.s $ft7, $t2, 0 + fadd.s $ft7, $ft6, $ft6 + fmadd.s $ft7, $ft7, $ft2, $ft9 + fst.s $ft7, $a5, 0 + fst.s $ft6, $a3, 0 + addi.d $a3, $a3, 4 + addi.d $a5, $a5, 4 + addi.d $t2, $t2, 4 + addi.d $t5, $t5, 4 + addi.d $t4, $t4, 4 + addi.d $t6, $t6, -1 + addi.d $t3, $t3, 4 + bnez $t6, .LBB0_16 .LBB0_17: # %.lr.ph342 - bstrpick.d $s1, $a7, 31, 0 + bstrpick.d $t3, $a6, 31, 0 + vpickve2gr.d $t8, $vr6, 1 + vpickve2gr.d $s0, $vr6, 0 + vpickve2gr.d $fp, $vr5, 1 + vpickve2gr.d $s2, $vr5, 0 + vpickve2gr.d $t5, $vr4, 1 + vpickve2gr.d $a3, $vr4, 0 + st.d $a3, $sp, 8 # 8-byte Folded Spill + vpickve2gr.d $t6, $vr3, 1 + vpickve2gr.d $a5, $vr3, 0 ld.d $a2, $a2, 24 - fmul.s $fa5, $fa2, $fa5 - ori $a5, $zero, 20 + fmul.s $ft1, $fa2, $ft1 + ori $a3, $zero, 20 fmul.s $fa2, $fa1, $fa1 - bgeu $a7, $a5, .LBB0_22 + bgeu $a6, $a3, .LBB0_22 # %bb.18: move $s3, $zero .LBB0_19: # %scalar.ph588.preheader - alsl.d $a5, $s3, $t5, 2 - alsl.d $t1, $s3, $t4, 2 - alsl.d $s0, $s3, $t7, 2 + alsl.d $a3, $s3, $s0, 2 + alsl.d $t2, $s3, $fp, 2 + alsl.d $s4, $s3, $s2, 2 alsl.d $a2, $s3, $a2, 2 - sub.d $s3, $s1, $s3 + sub.d $s3, $t3, $s3 .p2align 4, , 16 .LBB0_20: # %scalar.ph588 # =>This Inner Loop Header: Depth=1 - fld.s $fa6, $a2, 0 - fst.s $fa6, $s0, 0 - fmul.s $fa6, $fa0, $fa6 - fst.s $fa6, $t1, 0 - fld.s $fa6, $s0, 0 - fmul.s $fa6, $fa6, $fa6 - fst.s $fa6, $a5, 0 - addi.d $a5, $a5, 4 - addi.d $t1, $t1, 4 - addi.d $s0, $s0, 4 + fld.s $ft2, $a2, 0 + fst.s $ft2, $s4, 0 + fmul.s $ft2, $fa0, $ft2 + fst.s $ft2, $t2, 0 + fld.s $ft2, $s4, 0 + fmul.s $ft2, $ft2, $ft2 + fst.s $ft2, $a3, 0 + addi.d $a3, $a3, 4 + addi.d $t2, $t2, 4 + addi.d $s4, $s4, 4 addi.d $s3, $s3, -1 addi.d $a2, $a2, 4 bnez $s3, .LBB0_20 # %bb.21: - move $t1, $s1 - move $s0, $t4 - move $a5, $t0 + move $a3, $t3 + move $t2, $fp + move $s4, $t8 b .LBB0_33 .LBB0_22: # %vector.memcheck564 - alsl.d $s0, $s1, $t7, 2 - alsl.d $a5, $s1, $t4, 2 - sltu $t1, $t7, $a5 - sltu $s3, $t4, $s0 - and $t1, $t1, $s3 + alsl.d $s4, $t3, $s2, 2 + alsl.d $a3, $t3, $fp, 2 + sltu $t2, $s2, $a3 + sltu $s3, $fp, $s4 + and $t2, $t2, $s3 move $s3, $zero - bnez $t1, .LBB0_19 + bnez $t2, .LBB0_19 # %bb.23: # %vector.memcheck564 - alsl.d $t1, $s1, $t5, 2 - sltu $s4, $t7, $t1 - sltu $s5, $t5, $s0 - and $s4, $s4, $s5 - bnez $s4, .LBB0_19 + alsl.d $t2, $t3, $s0, 2 + sltu $s5, $s2, $t2 + sltu $s6, $s0, $s4 + and $s5, $s5, $s6 + bnez $s5, .LBB0_19 # %bb.24: # %vector.memcheck564 - alsl.d $s4, $s1, $a2, 2 - sltu $s5, $t7, $s4 - sltu $s0, $a2, $s0 - and $s0, $s5, $s0 - bnez $s0, .LBB0_19 + alsl.d $s5, $t3, $a2, 2 + sltu $s6, $s2, $s5 + sltu $s4, $a2, $s4 + and $s4, $s6, $s4 + bnez $s4, .LBB0_19 # %bb.25: # %vector.memcheck564 - sltu $s0, $t4, $t1 - sltu $s5, $t5, $a5 - and $s0, $s0, $s5 - bnez $s0, .LBB0_19 + sltu $s4, $fp, $t2 + sltu $s6, $s0, $a3 + and $s4, $s4, $s6 + bnez $s4, .LBB0_19 # %bb.26: # %vector.memcheck564 - sltu $s0, $t4, $s4 - sltu $a5, $a2, $a5 - and $a5, $s0, $a5 - bnez $a5, .LBB0_19 + sltu $s4, $fp, $s5 + sltu $a3, $a2, $a3 + and $a3, $s4, $a3 + bnez $a3, .LBB0_19 # %bb.27: # %vector.memcheck564 - sltu $a5, $t5, $s4 - sltu $t1, $a2, $t1 - and $a5, $a5, $t1 - bnez $a5, .LBB0_19 + sltu $a3, $s0, $s5 + sltu $t2, $a2, $t2 + and $a3, $a3, $t2 + bnez $a3, .LBB0_19 # %bb.28: # %vector.ph590 - bstrpick.d $a5, $s1, 30, 2 - slli.d $s3, $a5, 2 - vreplvei.w $vr6, $vr0, 0 - move $a5, $a2 - move $t1, $t7 - move $s0, $t4 - move $s4, $t5 - move $s5, $s3 + bstrpick.d $a3, $t3, 30, 2 + slli.d $s3, $a3, 2 + vreplvei.w $vr10, $vr0, 0 + move $a3, $a2 + move $t2, $s2 + move $s4, $fp + move $s5, $s0 + move $s6, $s3 .p2align 4, , 16 .LBB0_29: # %vector.body595 # =>This Inner Loop Header: Depth=1 - vld $vr7, $a5, 0 - vst $vr7, $t1, 0 - vfmul.s $vr8, $vr6, $vr7 - vst $vr8, $s0, 0 - vfmul.s $vr7, $vr7, $vr7 - vst $vr7, $s4, 0 - addi.d $s5, $s5, -4 + vld $vr11, $a3, 0 + vst $vr11, $t2, 0 + vfmul.s $vr12, $vr10, $vr11 + vst $vr12, $s4, 0 + vfmul.s $vr11, $vr11, $vr11 + vst $vr11, $s5, 0 + addi.d $s6, $s6, -4 + addi.d $s5, $s5, 16 addi.d $s4, $s4, 16 - addi.d $s0, $s0, 16 - addi.d $t1, $t1, 16 - addi.d $a5, $a5, 16 - bnez $s5, .LBB0_29 + addi.d $t2, $t2, 16 + addi.d $a3, $a3, 16 + bnez $s6, .LBB0_29 # %bb.30: # %middle.block600 - move $t1, $s1 - move $s0, $t4 - move $a5, $t0 - beq $s3, $s1, .LBB0_33 + move $a3, $t3 + move $t2, $fp + move $s4, $t8 + beq $s3, $t3, .LBB0_33 b .LBB0_19 .p2align 4, , 16 .LBB0_31: # in Loop: Header=BB0_33 Depth=1 vldi $vr0, -1168 .LBB0_32: # %interpolateTable.exit # in Loop: Header=BB0_33 Depth=1 - fst.s $fa0, $a5, 0 - addi.d $a5, $a5, 4 - addi.d $t1, $t1, -1 - addi.d $s0, $s0, 4 - beqz $t1, .LBB0_35 + fst.s $fa0, $s4, 0 + addi.d $s4, $s4, 4 + addi.d $a3, $a3, -1 + addi.d $t2, $t2, 4 + beqz $a3, .LBB0_35 .LBB0_33: # %.lr.ph344 # =>This Inner Loop Header: Depth=1 - fld.s $fa0, $s0, 0 - fcmp.clt.s $fcc0, $fa4, $fa0 + fld.s $fa0, $t2, 0 + fcmp.clt.s $fcc0, $ft0, $fa0 bcnez $fcc0, .LBB0_31 # %bb.34: # in Loop: Header=BB0_33 Depth=1 - fdiv.s $fa6, $fa0, $fa3 - vldi $vr7, -1184 - fmadd.s $fa6, $fa3, $fa7, $fa6 - ftintrz.w.s $fa6, $fa6 - movfr2gr.s $a2, $fa6 + fdiv.s $ft2, $fa0, $fa7 + vldi $vr11, -1184 + fmadd.s $ft2, $fa7, $ft3, $ft2 + ftintrz.w.s $ft2, $ft2 + movfr2gr.s $a2, $ft2 slli.w $a2, $a2, 1 - alsl.d $s3, $a2, $s2, 2 + alsl.d $s3, $a2, $s1, 2 slli.d $a2, $a2, 2 - fldx.s $fa6, $s2, $a2 - fld.s $fa7, $s3, 4 - fmadd.s $fa0, $fa6, $fa0, $fa7 + fldx.s $ft2, $s1, $a2 + fld.s $ft3, $s3, 4 + fmadd.s $fa0, $ft2, $fa0, $ft3 b .LBB0_32 .LBB0_35: # %.lr.ph346.preheader ori $a2, $zero, 8 - bgeu $a7, $a2, .LBB0_48 -# %bb.36: + vrepli.d $vr0, 32 + bltu $a6, $a2, .LBB0_37 +# %bb.36: # %vector.memcheck603 + vreplgr2vr.d $vr7, $s7 + vsub.d $vr6, $vr7, $vr6 + vsub.d $vr5, $vr7, $vr5 + vslt.du $vr5, $vr5, $vr0 + vslt.du $vr6, $vr6, $vr0 + vpickev.w $vr5, $vr6, $vr5 + vmskltz.w $vr5, $vr5 + vpickve2gr.hu $a2, $vr5, 0 + beqz $a2, .LBB0_134 +.LBB0_37: move $a2, $zero -.LBB0_37: # %.lr.ph346.preheader838 - alsl.d $t1, $a2, $t2, 2 - alsl.d $s0, $a2, $t5, 2 - alsl.d $s2, $a2, $t7, 2 - alsl.d $s3, $a2, $t0, 2 - alsl.d $s4, $a2, $t4, 2 - sub.d $a2, $s1, $a2 - vldi $vr0, -1152 +.LBB0_38: # %.lr.ph346.preheader839 + alsl.d $a3, $a2, $s7, 2 + alsl.d $t2, $a2, $s0, 2 + alsl.d $s1, $a2, $s2, 2 + alsl.d $s3, $a2, $t8, 2 + alsl.d $s4, $a2, $fp, 2 + sub.d $a2, $t3, $a2 + vldi $vr5, -1152 .p2align 4, , 16 -.LBB0_38: # %.lr.ph346 +.LBB0_39: # %.lr.ph346 # =>This Inner Loop Header: Depth=1 - fld.s $fa3, $s4, 0 - fld.s $fa4, $s3, 0 - fld.s $fa6, $s2, 0 - fld.s $fa7, $s0, 0 - fadd.s $ft0, $fa3, $fa0 - fadd.s $fa4, $fa4, $fa4 - fmul.s $fa6, $fa6, $fa7 - fdiv.s $fa4, $fa4, $fa6 - fmadd.s $fa3, $fa3, $ft0, $fa4 - fst.s $fa3, $t1, 0 - addi.d $t1, $t1, 4 - addi.d $s0, $s0, 4 - addi.d $s2, $s2, 4 + fld.s $fa6, $s4, 0 + fld.s $fa7, $s3, 0 + fld.s $ft0, $s1, 0 + fld.s $ft2, $t2, 0 + fadd.s $ft3, $fa6, $fa5 + fadd.s $fa7, $fa7, $fa7 + fmul.s $ft0, $ft0, $ft2 + fdiv.s $fa7, $fa7, $ft0 + fmadd.s $fa6, $fa6, $ft3, $fa7 + fst.s $fa6, $a3, 0 + addi.d $a3, $a3, 4 + addi.d $t2, $t2, 4 + addi.d $s1, $s1, 4 addi.d $s3, $s3, 4 addi.d $a2, $a2, -1 addi.d $s4, $s4, 4 - bnez $a2, .LBB0_38 -.LBB0_39: # %.lr.ph349.preheader + bnez $a2, .LBB0_39 +.LBB0_40: # %.lr.ph349.preheader ori $a2, $zero, 32 masknez $a2, $a2, $a1 - ori $a5, $zero, 24 - maskeqz $a1, $a5, $a1 + ori $a3, $zero, 24 + maskeqz $a1, $a3, $a1 or $a1, $a1, $a2 ldx.d $a0, $a0, $a1 ori $a1, $zero, 8 - bgeu $a7, $a1, .LBB0_55 -# %bb.40: + bgeu $a6, $a1, .LBB0_49 +# %bb.41: + move $t4, $s7 move $a1, $zero -.LBB0_41: # %.lr.ph349.preheader837 +.LBB0_42: # %.lr.ph349.preheader838 slli.d $a2, $a1, 2 - sub.d $a1, $s1, $a1 - vldi $vr0, -1144 - vldi $vr3, -1256 - vldi $vr4, -1128 - vldi $vr6, -1272 - move $s2, $t6 - move $s3, $t4 - move $s4, $t7 + sub.d $a1, $t3, $a1 + vldi $vr5, -1144 + vldi $vr6, -1256 + vldi $vr7, -1128 + vldi $vr8, -1272 + move $s1, $t0 + move $s3, $fp + move $s4, $s2 move $s5, $a0 - move $s6, $t0 - move $s7, $t5 - move $s8, $t3 - move $ra, $t2 - move $t1, $fp - move $s0, $a6 + move $s6, $t8 + move $s7, $s0 + move $s8, $a7 + move $ra, $t4 + move $a3, $t7 + move $t2, $a4 .p2align 4, , 16 -.LBB0_42: # %.lr.ph349 +.LBB0_43: # %.lr.ph349 # =>This Inner Loop Header: Depth=1 - fldx.s $fa7, $s2, $a2 - fldx.s $ft0, $s4, $a2 - fldx.s $ft1, $s5, $a2 - fldx.s $ft2, $s6, $a2 - fldx.s $ft3, $s3, $a2 - fmsub.s $ft0, $ft0, $ft1, $fa7 - fmul.s $ft0, $ft0, $ft2 - fmadd.s $fa7, $fa7, $ft3, $ft0 - fldx.s $ft0, $s7, $a2 - fldx.s $ft1, $s8, $a2 - fldx.s $ft4, $ra, $a2 - fldx.s $ft5, $t1, $a2 - fdiv.s $fa7, $fa7, $ft0 - fmul.s $ft1, $fa1, $ft1 - fmadd.s $fa7, $ft1, $ft4, $fa7 - fmul.s $ft1, $fa2, $ft5 - fadd.s $ft4, $ft3, $fa0 - fmadd.s $ft4, $ft3, $ft4, $fa3 - fmul.s $ft2, $ft2, $fa4 - fmadd.s $ft2, $ft3, $ft4, $ft2 - fmul.s $ft1, $ft2, $ft1 - fmul.s $ft2, $ft0, $fa6 - fmul.s $ft0, $ft0, $ft2 - fdiv.s $ft0, $ft1, $ft0 - fadd.s $fa7, $fa7, $ft0 - fstx.s $fa7, $s0, $a2 - addi.d $s0, $s0, 4 - addi.d $t1, $t1, 4 + fldx.s $ft2, $s1, $a2 + fldx.s $ft3, $s4, $a2 + fldx.s $ft4, $s5, $a2 + fldx.s $ft5, $s6, $a2 + fldx.s $ft6, $s3, $a2 + fmsub.s $ft3, $ft3, $ft4, $ft2 + fmul.s $ft3, $ft3, $ft5 + fmadd.s $ft2, $ft2, $ft6, $ft3 + fldx.s $ft3, $s7, $a2 + fldx.s $ft4, $s8, $a2 + fldx.s $ft7, $ra, $a2 + fldx.s $ft8, $a3, $a2 + fdiv.s $ft2, $ft2, $ft3 + fmul.s $ft4, $fa1, $ft4 + fmadd.s $ft2, $ft4, $ft7, $ft2 + fmul.s $ft4, $fa2, $ft8 + fadd.s $ft7, $ft6, $fa5 + fmadd.s $ft7, $ft6, $ft7, $fa6 + fmul.s $ft5, $ft5, $fa7 + fmadd.s $ft5, $ft6, $ft7, $ft5 + fmul.s $ft4, $ft5, $ft4 + fmul.s $ft5, $ft3, $ft0 + fmul.s $ft3, $ft3, $ft5 + fdiv.s $ft3, $ft4, $ft3 + fadd.s $ft2, $ft2, $ft3 + fstx.s $ft2, $t2, $a2 + addi.d $t2, $t2, 4 + addi.d $a3, $a3, 4 addi.d $ra, $ra, 4 addi.d $s8, $s8, 4 addi.d $s7, $s7, 4 @@ -456,368 +467,321 @@ attenuate_fluxes: # @attenuate_fluxes addi.d $s4, $s4, 4 addi.d $s3, $s3, 4 addi.d $a1, $a1, -1 - addi.d $s2, $s2, 4 - bnez $a1, .LBB0_42 -.LBB0_43: # %.lr.ph351.preheader + addi.d $s1, $s1, 4 + bnez $a1, .LBB0_43 +.LBB0_44: # %.lr.ph351.preheader ori $a2, $zero, 8 move $a1, $zero - bltu $a7, $a2, .LBB0_67 -# %bb.44: # %.lr.ph351.preheader - sub.d $a2, $t8, $a6 - ori $a5, $zero, 32 - ld.d $s0, $sp, 32 # 8-byte Folded Reload - ld.d $s2, $sp, 24 # 8-byte Folded Reload - ld.d $s3, $sp, 16 # 8-byte Folded Reload - bltu $a2, $a5, .LBB0_68 -# %bb.45: # %vector.ph677 - bstrpick.d $a1, $s1, 30, 3 + bltu $a6, $a2, .LBB0_61 +# %bb.45: # %.lr.ph351.preheader + sub.d $a2, $t1, $a4 + ori $a3, $zero, 32 + move $s1, $t4 + ld.d $t4, $sp, 16 # 8-byte Folded Reload + bltu $a2, $a3, .LBB0_62 +# %bb.46: # %vector.ph677 + bstrpick.d $a1, $t3, 30, 3 slli.d $a1, $a1, 3 - vreplvei.w $vr0, $vr5, 0 - addi.d $a2, $t8, 16 - addi.d $a5, $a6, 16 - move $t1, $a1 + vreplvei.w $vr5, $vr9, 0 + addi.d $a2, $t1, 16 + addi.d $a3, $a4, 16 + move $t2, $a1 .p2align 4, , 16 -.LBB0_46: # %vector.body682 +.LBB0_47: # %vector.body682 # =>This Inner Loop Header: Depth=1 - vld $vr3, $a5, -16 - vld $vr4, $a5, 0 - vfmul.s $vr3, $vr0, $vr3 - vfmul.s $vr4, $vr0, $vr4 - vst $vr3, $a2, -16 - vst $vr4, $a2, 0 - addi.d $t1, $t1, -8 + vld $vr6, $a3, -16 + vld $vr7, $a3, 0 + vfmul.s $vr6, $vr5, $vr6 + vfmul.s $vr7, $vr5, $vr7 + vst $vr6, $a2, -16 + vst $vr7, $a2, 0 + addi.d $t2, $t2, -8 addi.d $a2, $a2, 32 - addi.d $a5, $a5, 32 - bnez $t1, .LBB0_46 -# %bb.47: # %middle.block687 - bne $a1, $s1, .LBB0_68 - b .LBB0_70 -.LBB0_48: # %vector.memcheck603 - sub.d $t1, $t2, $t4 - ori $a5, $zero, 32 - move $a2, $zero - bltu $t1, $a5, .LBB0_37 -# %bb.49: # %vector.memcheck603 - sub.d $t1, $t2, $t0 - bltu $t1, $a5, .LBB0_37 -# %bb.50: # %vector.memcheck603 - sub.d $t1, $t2, $t7 - ori $a5, $zero, 32 - bltu $t1, $a5, .LBB0_37 -# %bb.51: # %vector.memcheck603 - sub.d $t1, $t2, $t5 - bltu $t1, $a5, .LBB0_37 -# %bb.52: # %vector.ph613 - bstrpick.d $a2, $s1, 30, 3 - slli.d $a2, $a2, 3 - addi.d $t1, $t4, 16 - addi.d $s0, $t2, 16 - addi.d $s2, $t0, 16 - addi.d $s3, $t5, 16 - addi.d $s4, $t7, 16 - lu12i.w $a5, -262144 - vreplgr2vr.w $vr0, $a5 - move $s5, $a2 - .p2align 4, , 16 -.LBB0_53: # %vector.body616 - # =>This Inner Loop Header: Depth=1 - vld $vr3, $t1, -16 - vld $vr4, $t1, 0 - vfadd.s $vr6, $vr3, $vr0 - vfadd.s $vr7, $vr4, $vr0 - vld $vr8, $s2, -16 - vld $vr9, $s2, 0 - vld $vr10, $s4, -16 - vld $vr11, $s4, 0 - vld $vr12, $s3, -16 - vld $vr13, $s3, 0 - vfadd.s $vr8, $vr8, $vr8 - vfadd.s $vr9, $vr9, $vr9 - vfmul.s $vr10, $vr10, $vr12 - vfmul.s $vr11, $vr11, $vr13 - vfdiv.s $vr8, $vr8, $vr10 - vfdiv.s $vr9, $vr9, $vr11 - vfmadd.s $vr3, $vr3, $vr6, $vr8 - vfmadd.s $vr4, $vr4, $vr7, $vr9 - vst $vr3, $s0, -16 - vst $vr4, $s0, 0 - addi.d $s5, $s5, -8 - addi.d $t1, $t1, 32 - addi.d $s0, $s0, 32 - addi.d $s2, $s2, 32 - addi.d $s3, $s3, 32 - addi.d $s4, $s4, 32 - bnez $s5, .LBB0_53 -# %bb.54: # %middle.block627 - bne $a2, $s1, .LBB0_37 - b .LBB0_39 -.LBB0_55: # %vector.memcheck630 - sub.d $a5, $a6, $t6 + addi.d $a3, $a3, 32 + bnez $t2, .LBB0_47 +# %bb.48: # %middle.block687 + bne $a1, $t3, .LBB0_62 + b .LBB0_64 +.LBB0_49: # %vector.memcheck630 + sub.d $a3, $a4, $t0 ori $a2, $zero, 16 move $a1, $zero - bltu $a5, $a2, .LBB0_41 -# %bb.56: # %vector.memcheck630 - sub.d $a5, $a6, $t4 - bltu $a5, $a2, .LBB0_41 -# %bb.57: # %vector.memcheck630 - sub.d $a5, $a6, $t7 + bltu $a3, $a2, .LBB0_170 +# %bb.50: # %vector.memcheck630 + sub.d $a3, $a4, $fp + bltu $a3, $a2, .LBB0_170 +# %bb.51: # %vector.memcheck630 + sub.d $a3, $a4, $s2 ori $a2, $zero, 16 - bltu $a5, $a2, .LBB0_41 -# %bb.58: # %vector.memcheck630 - sub.d $a5, $a6, $a0 - bltu $a5, $a2, .LBB0_41 -# %bb.59: # %vector.memcheck630 - sub.d $a5, $a6, $t0 + bltu $a3, $a2, .LBB0_170 +# %bb.52: # %vector.memcheck630 + sub.d $a3, $a4, $a0 + bltu $a3, $a2, .LBB0_170 +# %bb.53: # %vector.memcheck630 + sub.d $a3, $a4, $t8 ori $a2, $zero, 16 - bltu $a5, $a2, .LBB0_41 -# %bb.60: # %vector.memcheck630 - sub.d $a5, $a6, $t5 - bltu $a5, $a2, .LBB0_41 -# %bb.61: # %vector.memcheck630 - sub.d $a5, $a6, $t3 + bltu $a3, $a2, .LBB0_170 +# %bb.54: # %vector.memcheck630 + sub.d $a3, $a4, $s0 + bltu $a3, $a2, .LBB0_170 +# %bb.55: # %vector.memcheck630 + sub.d $a3, $a4, $a7 ori $a2, $zero, 16 - bltu $a5, $a2, .LBB0_41 -# %bb.62: # %vector.memcheck630 - sub.d $a5, $a6, $t2 - bltu $a5, $a2, .LBB0_41 -# %bb.63: # %vector.memcheck630 - sub.d $a2, $a6, $fp - ori $a5, $zero, 16 - bltu $a2, $a5, .LBB0_41 -# %bb.64: # %vector.ph651 - bstrpick.d $a1, $s1, 30, 2 + bltu $a3, $a2, .LBB0_170 +# %bb.56: # %vector.memcheck630 + move $t4, $s7 + sub.d $a3, $a4, $s7 + bltu $a3, $a2, .LBB0_42 +# %bb.57: # %vector.memcheck630 + sub.d $a2, $a4, $t7 + ori $a3, $zero, 16 + bltu $a2, $a3, .LBB0_42 +# %bb.58: # %vector.ph651 + bstrpick.d $a1, $t3, 30, 2 slli.d $a1, $a1, 2 - vreplvei.w $vr0, $vr1, 0 - vreplvei.w $vr3, $vr2, 0 + vreplvei.w $vr5, $vr1, 0 + vreplvei.w $vr6, $vr2, 0 lu12i.w $a2, -261120 - vreplgr2vr.w $vr4, $a2 + vreplgr2vr.w $vr7, $a2 lu12i.w $a2, 265216 - vreplgr2vr.w $vr6, $a2 + vreplgr2vr.w $vr8, $a2 lu12i.w $a2, -259072 - vreplgr2vr.w $vr7, $a2 + vreplgr2vr.w $vr10, $a2 lu12i.w $a2, 263168 - vreplgr2vr.w $vr8, $a2 - move $a2, $t6 - move $s2, $t4 - move $s3, $t7 + vreplgr2vr.w $vr11, $a2 + move $a2, $t0 + move $s1, $fp + move $s3, $s2 move $s4, $a0 - move $s5, $t0 - move $s6, $t5 - move $s7, $t3 - move $s8, $t2 - move $t1, $fp - move $s0, $a6 + move $s5, $t8 + move $s6, $s0 + move $s7, $a7 + move $s8, $t4 + move $a3, $t7 + move $t2, $a4 move $ra, $a1 .p2align 4, , 16 -.LBB0_65: # %vector.body658 +.LBB0_59: # %vector.body658 # =>This Inner Loop Header: Depth=1 - vld $vr9, $a2, 0 - vld $vr10, $s3, 0 - vld $vr11, $s4, 0 - vld $vr12, $s5, 0 - vld $vr13, $s2, 0 - vfmsub.s $vr10, $vr10, $vr11, $vr9 - vfmul.s $vr10, $vr10, $vr12 - vfmadd.s $vr9, $vr9, $vr13, $vr10 - vld $vr10, $s6, 0 - vld $vr11, $s7, 0 - vld $vr14, $s8, 0 - vld $vr15, $t1, 0 - vfdiv.s $vr9, $vr9, $vr10 - vfmul.s $vr11, $vr0, $vr11 - vfmadd.s $vr9, $vr11, $vr14, $vr9 - vfmul.s $vr11, $vr3, $vr15 - vfadd.s $vr14, $vr13, $vr4 - vfmadd.s $vr14, $vr13, $vr14, $vr6 - vfmul.s $vr12, $vr12, $vr7 - vfmadd.s $vr12, $vr13, $vr14, $vr12 - vfmul.s $vr11, $vr12, $vr11 - vfmul.s $vr12, $vr10, $vr8 - vfmul.s $vr10, $vr10, $vr12 - vfdiv.s $vr10, $vr11, $vr10 - vfadd.s $vr9, $vr9, $vr10 - vst $vr9, $s0, 0 + vld $vr12, $a2, 0 + vld $vr13, $s3, 0 + vld $vr14, $s4, 0 + vld $vr15, $s5, 0 + vld $vr16, $s1, 0 + vfmsub.s $vr13, $vr13, $vr14, $vr12 + vfmul.s $vr13, $vr13, $vr15 + vfmadd.s $vr12, $vr12, $vr16, $vr13 + vld $vr13, $s6, 0 + vld $vr14, $s7, 0 + vld $vr17, $s8, 0 + vld $vr18, $a3, 0 + vfdiv.s $vr12, $vr12, $vr13 + vfmul.s $vr14, $vr5, $vr14 + vfmadd.s $vr12, $vr14, $vr17, $vr12 + vfmul.s $vr14, $vr6, $vr18 + vfadd.s $vr17, $vr16, $vr7 + vfmadd.s $vr17, $vr16, $vr17, $vr8 + vfmul.s $vr15, $vr15, $vr10 + vfmadd.s $vr15, $vr16, $vr17, $vr15 + vfmul.s $vr14, $vr15, $vr14 + vfmul.s $vr15, $vr13, $vr11 + vfmul.s $vr13, $vr13, $vr15 + vfdiv.s $vr13, $vr14, $vr13 + vfadd.s $vr12, $vr12, $vr13 + vst $vr12, $t2, 0 addi.d $ra, $ra, -4 - addi.d $s0, $s0, 16 - addi.d $t1, $t1, 16 + addi.d $t2, $t2, 16 + addi.d $a3, $a3, 16 addi.d $s8, $s8, 16 addi.d $s7, $s7, 16 addi.d $s6, $s6, 16 addi.d $s5, $s5, 16 addi.d $s4, $s4, 16 addi.d $s3, $s3, 16 - addi.d $s2, $s2, 16 + addi.d $s1, $s1, 16 addi.d $a2, $a2, 16 - bnez $ra, .LBB0_65 -# %bb.66: # %middle.block670 - bne $a1, $s1, .LBB0_41 - b .LBB0_43 -.LBB0_67: - ld.d $s0, $sp, 32 # 8-byte Folded Reload - ld.d $s2, $sp, 24 # 8-byte Folded Reload - ld.d $s3, $sp, 16 # 8-byte Folded Reload -.LBB0_68: # %.lr.ph351.preheader836 - alsl.d $a2, $a1, $t8, 2 - alsl.d $a5, $a1, $a6, 2 - sub.d $a1, $s1, $a1 + bnez $ra, .LBB0_59 +# %bb.60: # %middle.block670 + bne $a1, $t3, .LBB0_42 + b .LBB0_44 +.LBB0_61: + move $s1, $t4 + ld.d $t4, $sp, 16 # 8-byte Folded Reload +.LBB0_62: # %.lr.ph351.preheader837 + alsl.d $a2, $a1, $t1, 2 + alsl.d $a3, $a1, $a4, 2 + sub.d $a1, $t3, $a1 .p2align 4, , 16 -.LBB0_69: # %.lr.ph351 +.LBB0_63: # %.lr.ph351 # =>This Inner Loop Header: Depth=1 - fld.s $fa0, $a5, 0 - fmul.s $fa0, $fa5, $fa0 - fst.s $fa0, $a2, 0 + fld.s $fa5, $a3, 0 + fmul.s $fa5, $ft1, $fa5 + fst.s $fa5, $a2, 0 addi.d $a2, $a2, 4 addi.d $a1, $a1, -1 - addi.d $a5, $a5, 4 - bnez $a1, .LBB0_69 -.LBB0_70: # %.lr.ph353.preheader + addi.d $a3, $a3, 4 + bnez $a1, .LBB0_63 +.LBB0_64: # %.lr.ph353.preheader ori $a1, $zero, 8 - bltu $a7, $a1, .LBB0_73 -# %bb.71: # %vector.memcheck690 - alsl.d $a1, $s1, $t8, 2 - bgeu $s3, $a1, .LBB0_127 -# %bb.72: # %vector.memcheck690 - alsl.d $a1, $s1, $s3, 2 - bgeu $t8, $a1, .LBB0_127 -.LBB0_73: + bltu $a6, $a1, .LBB0_67 +# %bb.65: # %vector.memcheck690 + alsl.d $a1, $t3, $t1, 2 + bgeu $t4, $a1, .LBB0_116 +# %bb.66: # %vector.memcheck690 + alsl.d $a1, $t3, $t4, 2 + bgeu $t1, $a1, .LBB0_116 +.LBB0_67: move $a1, $zero -.LBB0_74: # %.lr.ph353.preheader835 - alsl.d $a2, $a1, $s3, 2 - alsl.d $a5, $a1, $t8, 2 - sub.d $a1, $s1, $a1 +.LBB0_68: # %.lr.ph353.preheader836 + alsl.d $a2, $a1, $t4, 2 + alsl.d $a3, $a1, $t1, 2 + sub.d $a1, $t3, $a1 .p2align 4, , 16 -.LBB0_75: # %.lr.ph353 +.LBB0_69: # %.lr.ph353 # =>This Inner Loop Header: Depth=1 - fld.s $fa0, $a5, 0 - fld.s $fa3, $a2, 0 - fadd.s $fa0, $fa0, $fa3 - fst.s $fa0, $a2, 0 + fld.s $fa5, $a3, 0 + fld.s $fa6, $a2, 0 + fadd.s $fa5, $fa5, $fa6 + fst.s $fa5, $a2, 0 addi.d $a2, $a2, 4 addi.d $a1, $a1, -1 - addi.d $a5, $a5, 4 - bnez $a1, .LBB0_75 -.LBB0_76: # %.lr.ph355.preheader + addi.d $a3, $a3, 4 + bnez $a1, .LBB0_69 +.LBB0_70: # %.lr.ph355.preheader ori $a1, $zero, 8 - bgeu $a7, $a1, .LBB0_97 -# %bb.77: + bgeu $a6, $a1, .LBB0_93 +# %bb.71: move $a1, $zero -.LBB0_78: # %.lr.ph355.preheader834 - alsl.d $a2, $a1, $a3, 2 - alsl.d $a5, $a1, $t7, 2 - alsl.d $a6, $a1, $t0, 2 - alsl.d $t1, $a1, $t6, 2 - sub.d $a1, $s1, $a1 + move $t4, $t5 +.LBB0_72: # %.lr.ph355.preheader835 + ld.d $t5, $sp, 8 # 8-byte Folded Reload +.LBB0_73: # %.lr.ph355.preheader835 + alsl.d $a2, $a1, $a5, 2 + alsl.d $a3, $a1, $s2, 2 + alsl.d $a4, $a1, $t8, 2 + alsl.d $t0, $a1, $t0, 2 + sub.d $a1, $t3, $a1 .p2align 4, , 16 -.LBB0_79: # %.lr.ph355 +.LBB0_74: # %.lr.ph355 # =>This Inner Loop Header: Depth=1 - fld.s $fa0, $t1, 0 - fld.s $fa3, $a6, 0 - fld.s $fa4, $a5, 0 - fmul.s $fa0, $fa0, $fa3 - fdiv.s $fa0, $fa0, $fa4 - fst.s $fa0, $a2, 0 + fld.s $fa5, $t0, 0 + fld.s $fa6, $a4, 0 + fld.s $fa7, $a3, 0 + fmul.s $fa5, $fa5, $fa6 + fdiv.s $fa5, $fa5, $fa7 + fst.s $fa5, $a2, 0 addi.d $a2, $a2, 4 - addi.d $a5, $a5, 4 - addi.d $a6, $a6, 4 + addi.d $a3, $a3, 4 + addi.d $a4, $a4, 4 addi.d $a1, $a1, -1 - addi.d $t1, $t1, 4 - bnez $a1, .LBB0_79 -.LBB0_80: # %.lr.ph357.preheader + addi.d $t0, $t0, 4 + bnez $a1, .LBB0_74 +.LBB0_75: # %.lr.ph357.preheader ori $a1, $zero, 8 - bgeu $a7, $a1, .LBB0_103 -# %bb.81: + bgeu $a6, $a1, .LBB0_99 +# %bb.76: move $a1, $zero -.LBB0_82: # %.lr.ph357.preheader833 - alsl.d $a2, $a1, $a4, 2 - alsl.d $a5, $a1, $t5, 2 - alsl.d $a6, $a1, $t0, 2 - alsl.d $t1, $a1, $t4, 2 - alsl.d $t3, $a1, $t3, 2 - sub.d $a1, $s1, $a1 +.LBB0_77: # %.lr.ph357.preheader834 + alsl.d $a2, $a1, $t6, 2 + alsl.d $a3, $a1, $s0, 2 + alsl.d $a4, $a1, $t8, 2 + alsl.d $t0, $a1, $fp, 2 + alsl.d $a7, $a1, $a7, 2 + sub.d $a1, $t3, $a1 .p2align 4, , 16 -.LBB0_83: # %.lr.ph357 +.LBB0_78: # %.lr.ph357 # =>This Inner Loop Header: Depth=1 - fld.s $fa0, $t1, 0 - fld.s $fa3, $a6, 0 - fld.s $fa4, $t3, 0 - fld.s $fa5, $a5, 0 - fsub.s $fa0, $fa0, $fa3 - fmul.s $fa3, $fa1, $fa4 - fmul.s $fa0, $fa3, $fa0 - fdiv.s $fa0, $fa0, $fa5 - fst.s $fa0, $a2, 0 + fld.s $fa5, $t0, 0 + fld.s $fa6, $a4, 0 + fld.s $fa7, $a7, 0 + fld.s $ft0, $a3, 0 + fsub.s $fa5, $fa5, $fa6 + fmul.s $fa6, $fa1, $fa7 + fmul.s $fa5, $fa6, $fa5 + fdiv.s $fa5, $fa5, $ft0 + fst.s $fa5, $a2, 0 addi.d $a2, $a2, 4 - addi.d $a5, $a5, 4 - addi.d $a6, $a6, 4 - addi.d $t1, $t1, 4 + addi.d $a3, $a3, 4 + addi.d $a4, $a4, 4 + addi.d $t0, $t0, 4 addi.d $a1, $a1, -1 - addi.d $t3, $t3, 4 - bnez $a1, .LBB0_83 -.LBB0_84: # %.lr.ph359.preheader + addi.d $a7, $a7, 4 + bnez $a1, .LBB0_78 +.LBB0_79: # %.lr.ph359.preheader ori $a1, $zero, 8 - bgeu $a7, $a1, .LBB0_110 -# %bb.85: + bgeu $a6, $a1, .LBB0_106 +# %bb.80: move $a1, $zero -.LBB0_86: # %.lr.ph359.preheader832 - alsl.d $a2, $a1, $s0, 2 - alsl.d $a5, $a1, $t2, 2 - alsl.d $a6, $a1, $fp, 2 - sub.d $a1, $s1, $a1 +.LBB0_81: # %.lr.ph359.preheader833 + alsl.d $a2, $a1, $t5, 2 + alsl.d $a3, $a1, $s1, 2 + alsl.d $a4, $a1, $t7, 2 + sub.d $a1, $t3, $a1 .p2align 4, , 16 -.LBB0_87: # %.lr.ph359 +.LBB0_82: # %.lr.ph359 # =>This Inner Loop Header: Depth=1 - fld.s $fa0, $a6, 0 - fld.s $fa1, $a5, 0 - fmul.s $fa0, $fa2, $fa0 - fmul.s $fa0, $fa0, $fa1 - fst.s $fa0, $a2, 0 + fld.s $fa1, $a4, 0 + fld.s $fa5, $a3, 0 + fmul.s $fa1, $fa2, $fa1 + fmul.s $fa1, $fa1, $fa5 + fst.s $fa1, $a2, 0 addi.d $a2, $a2, 4 - addi.d $a5, $a5, 4 + addi.d $a3, $a3, 4 addi.d $a1, $a1, -1 - addi.d $a6, $a6, 4 - bnez $a1, .LBB0_87 -.LBB0_88: # %.lr.ph361.preheader + addi.d $a4, $a4, 4 + bnez $a1, .LBB0_82 +.LBB0_83: # %.lr.ph361.preheader ori $a1, $zero, 8 - bgeu $a7, $a1, .LBB0_115 -# %bb.89: + bgeu $a6, $a1, .LBB0_111 +# %bb.84: move $a1, $zero -.LBB0_90: # %.lr.ph361.preheader831 - alsl.d $a2, $a1, $s2, 2 - alsl.d $a5, $a1, $t0, 2 - alsl.d $a6, $a1, $a0, 2 - sub.d $a1, $s1, $a1 - vldi $vr0, -1168 +.LBB0_85: # %.lr.ph361.preheader832 + alsl.d $a2, $a1, $t4, 2 + alsl.d $a3, $a1, $t8, 2 + alsl.d $a4, $a1, $a0, 2 + sub.d $a1, $t3, $a1 + vldi $vr1, -1168 .p2align 4, , 16 -.LBB0_91: # %.lr.ph361 +.LBB0_86: # %.lr.ph361 # =>This Inner Loop Header: Depth=1 - fld.s $fa1, $a5, 0 - fld.s $fa2, $a6, 0 - fsub.s $fa1, $fa0, $fa1 - fmul.s $fa1, $fa2, $fa1 - fst.s $fa1, $a2, 0 + fld.s $fa2, $a3, 0 + fld.s $fa5, $a4, 0 + fsub.s $fa2, $fa1, $fa2 + fmul.s $fa2, $fa5, $fa2 + fst.s $fa2, $a2, 0 addi.d $a2, $a2, 4 - addi.d $a5, $a5, 4 + addi.d $a3, $a3, 4 addi.d $a1, $a1, -1 - addi.d $a6, $a6, 4 - bnez $a1, .LBB0_91 -.LBB0_92: # %.lr.ph363.preheader + addi.d $a4, $a4, 4 + bnez $a1, .LBB0_86 +.LBB0_87: # %.lr.ph363.preheader ori $a1, $zero, 8 - bgeu $a7, $a1, .LBB0_120 -# %bb.93: + bltu $a6, $a1, .LBB0_89 +# %bb.88: # %vector.memcheck803 + vreplgr2vr.d $vr1, $a0 + vsub.d $vr2, $vr1, $vr4 + vsub.d $vr1, $vr1, $vr3 + vslt.du $vr1, $vr1, $vr0 + vslt.du $vr0, $vr2, $vr0 + vpickev.w $vr0, $vr0, $vr1 + vmskltz.w $vr0, $vr0 + vpickve2gr.hu $a1, $vr0, 0 + beqz $a1, .LBB0_137 +.LBB0_89: move $a1, $zero -.LBB0_94: # %.lr.ph363.preheader830 +.LBB0_90: # %.lr.ph363.preheader831 alsl.d $a0, $a1, $a0, 2 - alsl.d $a2, $a1, $s2, 2 - alsl.d $a5, $a1, $s0, 2 - alsl.d $a4, $a1, $a4, 2 - alsl.d $a3, $a1, $a3, 2 - sub.d $a1, $s1, $a1 + alsl.d $a2, $a1, $t4, 2 + alsl.d $a3, $a1, $t5, 2 + alsl.d $a4, $a1, $t6, 2 + alsl.d $a5, $a1, $a5, 2 + sub.d $a1, $t3, $a1 .p2align 4, , 16 -.LBB0_95: # %.lr.ph363 +.LBB0_91: # %.lr.ph363 # =>This Inner Loop Header: Depth=1 - fld.s $fa0, $a3, 0 + fld.s $fa0, $a5, 0 fld.s $fa1, $a4, 0 - fld.s $fa2, $a5, 0 + fld.s $fa2, $a3, 0 fld.s $fa3, $a2, 0 fadd.s $fa0, $fa0, $fa1 fadd.s $fa0, $fa0, $fa2 @@ -825,541 +789,575 @@ attenuate_fluxes: # @attenuate_fluxes fst.s $fa0, $a0, 0 addi.d $a0, $a0, 4 addi.d $a2, $a2, 4 - addi.d $a5, $a5, 4 + addi.d $a3, $a3, 4 addi.d $a4, $a4, 4 addi.d $a1, $a1, -1 - addi.d $a3, $a3, 4 - bnez $a1, .LBB0_95 -.LBB0_96: # %._crit_edge364 - ld.d $s8, $sp, 40 # 8-byte Folded Reload - ld.d $s7, $sp, 48 # 8-byte Folded Reload - ld.d $s6, $sp, 56 # 8-byte Folded Reload - ld.d $s5, $sp, 64 # 8-byte Folded Reload - ld.d $s4, $sp, 72 # 8-byte Folded Reload - ld.d $s3, $sp, 80 # 8-byte Folded Reload - ld.d $s2, $sp, 88 # 8-byte Folded Reload - ld.d $s1, $sp, 96 # 8-byte Folded Reload - ld.d $s0, $sp, 104 # 8-byte Folded Reload - ld.d $fp, $sp, 112 # 8-byte Folded Reload - ld.d $ra, $sp, 120 # 8-byte Folded Reload - addi.d $sp, $sp, 128 + addi.d $a5, $a5, 4 + bnez $a1, .LBB0_91 +.LBB0_92: # %._crit_edge364 + ld.d $s8, $sp, 24 # 8-byte Folded Reload + ld.d $s7, $sp, 32 # 8-byte Folded Reload + ld.d $s6, $sp, 40 # 8-byte Folded Reload + ld.d $s5, $sp, 48 # 8-byte Folded Reload + ld.d $s4, $sp, 56 # 8-byte Folded Reload + ld.d $s3, $sp, 64 # 8-byte Folded Reload + ld.d $s2, $sp, 72 # 8-byte Folded Reload + ld.d $s1, $sp, 80 # 8-byte Folded Reload + ld.d $s0, $sp, 88 # 8-byte Folded Reload + ld.d $fp, $sp, 96 # 8-byte Folded Reload + ld.d $ra, $sp, 104 # 8-byte Folded Reload + addi.d $sp, $sp, 112 ret -.LBB0_97: # %vector.memcheck711 - sub.d $a5, $a3, $t6 +.LBB0_93: # %vector.memcheck711 + sub.d $a3, $a5, $t0 ori $a2, $zero, 32 move $a1, $zero - bltu $a5, $a2, .LBB0_78 -# %bb.98: # %vector.memcheck711 - sub.d $a5, $a3, $t0 - bltu $a5, $a2, .LBB0_78 -# %bb.99: # %vector.memcheck711 - sub.d $a2, $a3, $t7 - ori $a5, $zero, 32 - bltu $a2, $a5, .LBB0_78 -# %bb.100: # %vector.ph719 - bstrpick.d $a1, $s1, 30, 3 + move $t4, $t5 + bltu $a3, $a2, .LBB0_72 +# %bb.94: # %vector.memcheck711 + sub.d $a3, $a5, $t8 + ld.d $t5, $sp, 8 # 8-byte Folded Reload + bltu $a3, $a2, .LBB0_73 +# %bb.95: # %vector.memcheck711 + sub.d $a2, $a5, $s2 + ori $a3, $zero, 32 + bltu $a2, $a3, .LBB0_73 +# %bb.96: # %vector.ph719 + bstrpick.d $a1, $t3, 30, 3 slli.d $a1, $a1, 3 - addi.d $a2, $a3, 16 - addi.d $a5, $t7, 16 - addi.d $a6, $t0, 16 - addi.d $t1, $t6, 16 - move $t8, $a1 + addi.d $a2, $a5, 16 + addi.d $a3, $s2, 16 + addi.d $a4, $t8, 16 + addi.d $t1, $t0, 16 + move $t2, $a1 .p2align 4, , 16 -.LBB0_101: # %vector.body722 +.LBB0_97: # %vector.body722 # =>This Inner Loop Header: Depth=1 - vld $vr0, $t1, -16 - vld $vr3, $t1, 0 - vld $vr4, $a6, -16 - vld $vr5, $a6, 0 - vld $vr6, $a5, -16 - vld $vr7, $a5, 0 - vfmul.s $vr0, $vr0, $vr4 - vfmul.s $vr3, $vr3, $vr5 - vfdiv.s $vr0, $vr0, $vr6 - vfdiv.s $vr3, $vr3, $vr7 - vst $vr0, $a2, -16 - vst $vr3, $a2, 0 - addi.d $t8, $t8, -8 + vld $vr5, $t1, -16 + vld $vr6, $t1, 0 + vld $vr7, $a4, -16 + vld $vr8, $a4, 0 + vld $vr9, $a3, -16 + vld $vr10, $a3, 0 + vfmul.s $vr5, $vr5, $vr7 + vfmul.s $vr6, $vr6, $vr8 + vfdiv.s $vr5, $vr5, $vr9 + vfdiv.s $vr6, $vr6, $vr10 + vst $vr5, $a2, -16 + vst $vr6, $a2, 0 + addi.d $t2, $t2, -8 addi.d $a2, $a2, 32 - addi.d $a5, $a5, 32 - addi.d $a6, $a6, 32 + addi.d $a3, $a3, 32 + addi.d $a4, $a4, 32 addi.d $t1, $t1, 32 - bnez $t8, .LBB0_101 -# %bb.102: # %middle.block731 - bne $a1, $s1, .LBB0_78 - b .LBB0_80 -.LBB0_103: # %vector.memcheck734 - sub.d $a5, $a4, $t3 + bnez $t2, .LBB0_97 +# %bb.98: # %middle.block731 + bne $a1, $t3, .LBB0_73 + b .LBB0_75 +.LBB0_99: # %vector.memcheck734 + sub.d $a3, $t6, $a7 ori $a2, $zero, 32 move $a1, $zero - bltu $a5, $a2, .LBB0_82 -# %bb.104: # %vector.memcheck734 - sub.d $a5, $a4, $t4 - bltu $a5, $a2, .LBB0_82 -# %bb.105: # %vector.memcheck734 - sub.d $a5, $a4, $t0 + bltu $a3, $a2, .LBB0_77 +# %bb.100: # %vector.memcheck734 + sub.d $a3, $t6, $fp + bltu $a3, $a2, .LBB0_77 +# %bb.101: # %vector.memcheck734 + sub.d $a3, $t6, $t8 ori $a2, $zero, 32 - bltu $a5, $a2, .LBB0_82 -# %bb.106: # %vector.memcheck734 - sub.d $a5, $a4, $t5 - bltu $a5, $a2, .LBB0_82 -# %bb.107: # %vector.ph744 - bstrpick.d $a1, $s1, 30, 3 + bltu $a3, $a2, .LBB0_77 +# %bb.102: # %vector.memcheck734 + sub.d $a3, $t6, $s0 + bltu $a3, $a2, .LBB0_77 +# %bb.103: # %vector.ph744 + bstrpick.d $a1, $t3, 30, 3 slli.d $a1, $a1, 3 - vreplvei.w $vr0, $vr1, 0 - addi.d $a2, $t3, 16 - addi.d $a5, $a4, 16 - addi.d $a6, $t4, 16 - addi.d $t1, $t5, 16 - addi.d $t6, $t0, 16 - move $t7, $a1 + vreplvei.w $vr5, $vr1, 0 + addi.d $a2, $a7, 16 + addi.d $a3, $t6, 16 + addi.d $a4, $fp, 16 + addi.d $t0, $s0, 16 + addi.d $t1, $t8, 16 + move $t2, $a1 .p2align 4, , 16 -.LBB0_108: # %vector.body749 +.LBB0_104: # %vector.body749 # =>This Inner Loop Header: Depth=1 - vld $vr3, $a2, -16 - vld $vr4, $a2, 0 - vld $vr5, $a6, -16 - vld $vr6, $a6, 0 - vld $vr7, $t6, -16 - vld $vr8, $t6, 0 - vfmul.s $vr3, $vr0, $vr3 - vfmul.s $vr4, $vr0, $vr4 - vfsub.s $vr5, $vr5, $vr7 - vfsub.s $vr6, $vr6, $vr8 - vld $vr7, $t1, -16 - vld $vr8, $t1, 0 - vfmul.s $vr3, $vr3, $vr5 - vfmul.s $vr4, $vr4, $vr6 - vfdiv.s $vr3, $vr3, $vr7 - vfdiv.s $vr4, $vr4, $vr8 - vst $vr3, $a5, -16 - vst $vr4, $a5, 0 - addi.d $t7, $t7, -8 + vld $vr6, $a2, -16 + vld $vr7, $a2, 0 + vld $vr8, $a4, -16 + vld $vr9, $a4, 0 + vld $vr10, $t1, -16 + vld $vr11, $t1, 0 + vfmul.s $vr6, $vr5, $vr6 + vfmul.s $vr7, $vr5, $vr7 + vfsub.s $vr8, $vr8, $vr10 + vfsub.s $vr9, $vr9, $vr11 + vld $vr10, $t0, -16 + vld $vr11, $t0, 0 + vfmul.s $vr6, $vr6, $vr8 + vfmul.s $vr7, $vr7, $vr9 + vfdiv.s $vr6, $vr6, $vr10 + vfdiv.s $vr7, $vr7, $vr11 + vst $vr6, $a3, -16 + vst $vr7, $a3, 0 + addi.d $t2, $t2, -8 addi.d $a2, $a2, 32 - addi.d $a5, $a5, 32 - addi.d $a6, $a6, 32 + addi.d $a3, $a3, 32 + addi.d $a4, $a4, 32 + addi.d $t0, $t0, 32 addi.d $t1, $t1, 32 - addi.d $t6, $t6, 32 - bnez $t7, .LBB0_108 -# %bb.109: # %middle.block760 - bne $a1, $s1, .LBB0_82 - b .LBB0_84 -.LBB0_110: # %vector.memcheck763 - sub.d $a5, $s0, $fp + bnez $t2, .LBB0_104 +# %bb.105: # %middle.block760 + bne $a1, $t3, .LBB0_77 + b .LBB0_79 +.LBB0_106: # %vector.memcheck763 + sub.d $a3, $t5, $t7 ori $a2, $zero, 32 move $a1, $zero - bltu $a5, $a2, .LBB0_86 -# %bb.111: # %vector.memcheck763 - sub.d $a5, $s0, $t2 - bltu $a5, $a2, .LBB0_86 -# %bb.112: # %vector.ph769 - bstrpick.d $a1, $s1, 30, 3 + bltu $a3, $a2, .LBB0_81 +# %bb.107: # %vector.memcheck763 + sub.d $a3, $t5, $s1 + bltu $a3, $a2, .LBB0_81 +# %bb.108: # %vector.ph769 + bstrpick.d $a1, $t3, 30, 3 slli.d $a1, $a1, 3 - vreplvei.w $vr0, $vr2, 0 - addi.d $a2, $s0, 16 - addi.d $a5, $t2, 16 - addi.d $a6, $fp, 16 - move $t1, $a1 + vreplvei.w $vr1, $vr2, 0 + addi.d $a2, $t5, 16 + addi.d $a3, $s1, 16 + addi.d $a4, $t7, 16 + move $a7, $a1 .p2align 4, , 16 -.LBB0_113: # %vector.body774 +.LBB0_109: # %vector.body774 # =>This Inner Loop Header: Depth=1 - vld $vr1, $a6, -16 - vld $vr3, $a6, 0 - vld $vr4, $a5, -16 - vld $vr5, $a5, 0 - vfmul.s $vr1, $vr0, $vr1 - vfmul.s $vr3, $vr0, $vr3 - vfmul.s $vr1, $vr1, $vr4 - vfmul.s $vr3, $vr3, $vr5 - vst $vr1, $a2, -16 - vst $vr3, $a2, 0 - addi.d $t1, $t1, -8 + vld $vr5, $a4, -16 + vld $vr6, $a4, 0 + vld $vr7, $a3, -16 + vld $vr8, $a3, 0 + vfmul.s $vr5, $vr1, $vr5 + vfmul.s $vr6, $vr1, $vr6 + vfmul.s $vr5, $vr5, $vr7 + vfmul.s $vr6, $vr6, $vr8 + vst $vr5, $a2, -16 + vst $vr6, $a2, 0 + addi.d $a7, $a7, -8 addi.d $a2, $a2, 32 - addi.d $a5, $a5, 32 - addi.d $a6, $a6, 32 - bnez $t1, .LBB0_113 -# %bb.114: # %middle.block781 - bne $a1, $s1, .LBB0_86 - b .LBB0_88 -.LBB0_115: # %vector.memcheck784 - sub.d $a5, $s2, $a0 + addi.d $a3, $a3, 32 + addi.d $a4, $a4, 32 + bnez $a7, .LBB0_109 +# %bb.110: # %middle.block781 + bne $a1, $t3, .LBB0_81 + b .LBB0_83 +.LBB0_111: # %vector.memcheck784 + sub.d $a3, $t4, $a0 ori $a2, $zero, 32 move $a1, $zero - bltu $a5, $a2, .LBB0_90 -# %bb.116: # %vector.memcheck784 - sub.d $a5, $s2, $t0 - bltu $a5, $a2, .LBB0_90 -# %bb.117: # %vector.ph790 - bstrpick.d $a1, $s1, 30, 3 + bltu $a3, $a2, .LBB0_85 +# %bb.112: # %vector.memcheck784 + sub.d $a3, $t4, $t8 + bltu $a3, $a2, .LBB0_85 +# %bb.113: # %vector.ph790 + bstrpick.d $a1, $t3, 30, 3 slli.d $a1, $a1, 3 - addi.d $a2, $s2, 16 - addi.d $a5, $t0, 16 - addi.d $a6, $a0, 16 - lu12i.w $t1, 260096 - vreplgr2vr.w $vr0, $t1 - move $t1, $a1 + addi.d $a2, $t4, 16 + addi.d $a3, $t8, 16 + addi.d $a4, $a0, 16 + lu12i.w $a7, 260096 + vreplgr2vr.w $vr1, $a7 + move $a7, $a1 .p2align 4, , 16 -.LBB0_118: # %vector.body793 +.LBB0_114: # %vector.body793 # =>This Inner Loop Header: Depth=1 - vld $vr1, $a5, -16 - vld $vr2, $a5, 0 - vld $vr3, $a6, -16 - vld $vr4, $a6, 0 - vfsub.s $vr1, $vr0, $vr1 - vfsub.s $vr2, $vr0, $vr2 - vfmul.s $vr1, $vr3, $vr1 - vfmul.s $vr2, $vr4, $vr2 - vst $vr1, $a2, -16 - vst $vr2, $a2, 0 - addi.d $t1, $t1, -8 + vld $vr2, $a3, -16 + vld $vr5, $a3, 0 + vld $vr6, $a4, -16 + vld $vr7, $a4, 0 + vfsub.s $vr2, $vr1, $vr2 + vfsub.s $vr5, $vr1, $vr5 + vfmul.s $vr2, $vr6, $vr2 + vfmul.s $vr5, $vr7, $vr5 + vst $vr2, $a2, -16 + vst $vr5, $a2, 0 + addi.d $a7, $a7, -8 addi.d $a2, $a2, 32 - addi.d $a5, $a5, 32 - addi.d $a6, $a6, 32 - bnez $t1, .LBB0_118 -# %bb.119: # %middle.block800 - bne $a1, $s1, .LBB0_90 - b .LBB0_92 -.LBB0_120: # %vector.memcheck803 - sub.d $a5, $a0, $a3 - ori $a2, $zero, 32 - move $a1, $zero - bltu $a5, $a2, .LBB0_94 -# %bb.121: # %vector.memcheck803 - sub.d $a5, $a0, $a4 - bltu $a5, $a2, .LBB0_94 -# %bb.122: # %vector.memcheck803 - sub.d $a5, $a0, $s0 - ori $a2, $zero, 32 - bltu $a5, $a2, .LBB0_94 -# %bb.123: # %vector.memcheck803 - sub.d $a5, $a0, $s2 - bltu $a5, $a2, .LBB0_94 -# %bb.124: # %vector.ph813 - bstrpick.d $a1, $s1, 30, 3 + addi.d $a3, $a3, 32 + addi.d $a4, $a4, 32 + bnez $a7, .LBB0_114 +# %bb.115: # %middle.block800 + bne $a1, $t3, .LBB0_85 + b .LBB0_87 +.LBB0_116: # %vector.ph698 + bstrpick.d $a1, $t3, 30, 3 slli.d $a1, $a1, 3 - addi.d $a2, $a3, 16 - addi.d $a5, $a0, 16 - addi.d $a6, $a4, 16 - addi.d $a7, $s2, 16 - addi.d $t0, $s0, 16 - move $t1, $a1 + addi.d $a2, $t4, 16 + addi.d $a3, $t1, 16 + move $a4, $a1 + .p2align 4, , 16 +.LBB0_117: # %vector.body701 + # =>This Inner Loop Header: Depth=1 + vld $vr5, $a3, -16 + vld $vr6, $a3, 0 + vld $vr7, $a2, -16 + vld $vr8, $a2, 0 + vfadd.s $vr5, $vr5, $vr7 + vfadd.s $vr6, $vr6, $vr8 + vst $vr5, $a2, -16 + vst $vr6, $a2, 0 + addi.d $a4, $a4, -8 + addi.d $a2, $a2, 32 + addi.d $a3, $a3, 32 + bnez $a4, .LBB0_117 +# %bb.118: # %middle.block708 + beq $a1, $t3, .LBB0_70 + b .LBB0_68 +.LBB0_119: # %vector.memcheck520 + sub.d $a5, $a7, $t0 + move $t6, $zero + bltu $a5, $a3, .LBB0_10 +# %bb.120: # %vector.memcheck520 + sub.d $a5, $t7, $t0 + ori $a3, $zero, 16 + bltu $a5, $a3, .LBB0_10 +# %bb.121: # %vector.memcheck520 + sub.d $a5, $t0, $t3 + bltu $a5, $a3, .LBB0_10 +# %bb.122: # %vector.memcheck520 + sub.d $a5, $t0, $t4 + ori $a3, $zero, 16 + bltu $a5, $a3, .LBB0_10 +# %bb.123: # %vector.memcheck520 + sub.d $a5, $t0, $t5 + bltu $a5, $a3, .LBB0_10 +# %bb.124: # %vector.memcheck520 + sub.d $a5, $t7, $a7 + ori $a3, $zero, 16 + bltu $a5, $a3, .LBB0_10 +# %bb.125: # %vector.memcheck520 + sub.d $a5, $a7, $t3 + bltu $a5, $a3, .LBB0_10 +# %bb.126: # %vector.memcheck520 + sub.d $a5, $a7, $t4 + ori $a3, $zero, 16 + bltu $a5, $a3, .LBB0_10 +# %bb.127: # %vector.memcheck520 + sub.d $a5, $a7, $t5 + bltu $a5, $a3, .LBB0_10 +# %bb.128: # %vector.memcheck520 + sub.d $a5, $t7, $t3 + ori $a3, $zero, 16 + bltu $a5, $a3, .LBB0_10 +# %bb.129: # %vector.memcheck520 + sub.d $a5, $t7, $t4 + bltu $a5, $a3, .LBB0_10 +# %bb.130: # %vector.memcheck520 + sub.d $a3, $t7, $t5 + ori $a5, $zero, 16 + bltu $a3, $a5, .LBB0_10 +# %bb.131: # %vector.ph546 + bstrpick.d $a3, $a6, 30, 2 + slli.d $t6, $a3, 2 + vreplvei.w $vr13, $vr11, 0 + vreplvei.w $vr14, $vr12, 0 + vreplvei.w $vr15, $vr10, 0 + lu12i.w $a3, -262144 + vreplgr2vr.w $vr16, $a3 + move $a3, $t3 + move $a5, $t4 + move $t2, $t5 + move $t8, $t0 + move $fp, $a7 + move $s0, $t7 + move $s2, $t6 + .p2align 4, , 16 +.LBB0_132: # %vector.body555 + # =>This Inner Loop Header: Depth=1 + vld $vr17, $a3, 0 + vld $vr18, $t2, 0 + vld $vr19, $a5, 0 + vfsub.s $vr20, $vr17, $vr18 + vfdiv.s $vr20, $vr20, $vr13 + vfmadd.s $vr17, $vr19, $vr16, $vr17 + vfadd.s $vr17, $vr17, $vr18 + vfdiv.s $vr17, $vr17, $vr14 + vfmadd.s $vr18, $vr20, $vr15, $vr19 + vfmul.s $vr19, $vr15, $vr17 + vfmadd.s $vr18, $vr19, $vr15, $vr18 + vst $vr18, $t8, 0 + vfadd.s $vr18, $vr17, $vr17 + vfmadd.s $vr18, $vr18, $vr15, $vr20 + vst $vr18, $fp, 0 + vst $vr17, $s0, 0 + addi.d $s2, $s2, -4 + addi.d $s0, $s0, 16 + addi.d $fp, $fp, 16 + addi.d $t8, $t8, 16 + addi.d $t2, $t2, 16 + addi.d $a5, $a5, 16 + addi.d $a3, $a3, 16 + bnez $s2, .LBB0_132 +# %bb.133: # %middle.block561 + bne $t6, $a6, .LBB0_10 + b .LBB0_17 +.LBB0_134: # %vector.ph613 + bstrpick.d $a2, $t3, 30, 3 + slli.d $a2, $a2, 3 + addi.d $a3, $fp, 16 + addi.d $t2, $s7, 16 + addi.d $s1, $t8, 16 + addi.d $s3, $s0, 16 + addi.d $s4, $s2, 16 + lu12i.w $s5, -262144 + vreplgr2vr.w $vr5, $s5 + move $s5, $a2 + .p2align 4, , 16 +.LBB0_135: # %vector.body616 + # =>This Inner Loop Header: Depth=1 + vld $vr6, $a3, -16 + vld $vr7, $a3, 0 + vfadd.s $vr8, $vr6, $vr5 + vfadd.s $vr10, $vr7, $vr5 + vld $vr11, $s1, -16 + vld $vr12, $s1, 0 + vld $vr13, $s4, -16 + vld $vr14, $s4, 0 + vld $vr15, $s3, -16 + vld $vr16, $s3, 0 + vfadd.s $vr11, $vr11, $vr11 + vfadd.s $vr12, $vr12, $vr12 + vfmul.s $vr13, $vr13, $vr15 + vfmul.s $vr14, $vr14, $vr16 + vfdiv.s $vr11, $vr11, $vr13 + vfdiv.s $vr12, $vr12, $vr14 + vfmadd.s $vr6, $vr6, $vr8, $vr11 + vfmadd.s $vr7, $vr7, $vr10, $vr12 + vst $vr6, $t2, -16 + vst $vr7, $t2, 0 + addi.d $s5, $s5, -8 + addi.d $a3, $a3, 32 + addi.d $t2, $t2, 32 + addi.d $s1, $s1, 32 + addi.d $s3, $s3, 32 + addi.d $s4, $s4, 32 + bnez $s5, .LBB0_135 +# %bb.136: # %middle.block627 + beq $a2, $t3, .LBB0_40 + b .LBB0_38 +.LBB0_137: # %vector.ph813 + bstrpick.d $a1, $t3, 30, 3 + slli.d $a1, $a1, 3 + addi.d $a2, $a5, 16 + addi.d $a3, $a0, 16 + addi.d $a4, $t6, 16 + addi.d $a6, $t4, 16 + addi.d $a7, $t5, 16 + move $t0, $a1 .p2align 4, , 16 -.LBB0_125: # %vector.body816 +.LBB0_138: # %vector.body816 # =>This Inner Loop Header: Depth=1 vld $vr0, $a2, -16 vld $vr1, $a2, 0 - vld $vr2, $a6, -16 - vld $vr3, $a6, 0 + vld $vr2, $a4, -16 + vld $vr3, $a4, 0 vfadd.s $vr0, $vr0, $vr2 vfadd.s $vr1, $vr1, $vr3 - vld $vr2, $t0, -16 - vld $vr3, $t0, 0 - vld $vr4, $a7, -16 - vld $vr5, $a7, 0 + vld $vr2, $a7, -16 + vld $vr3, $a7, 0 + vld $vr4, $a6, -16 + vld $vr5, $a6, 0 vfadd.s $vr0, $vr0, $vr2 vfadd.s $vr1, $vr1, $vr3 vfadd.s $vr0, $vr0, $vr4 vfadd.s $vr1, $vr1, $vr5 - vst $vr0, $a5, -16 - vst $vr1, $a5, 0 - addi.d $t1, $t1, -8 + vst $vr0, $a3, -16 + vst $vr1, $a3, 0 + addi.d $t0, $t0, -8 addi.d $a2, $a2, 32 - addi.d $a5, $a5, 32 + addi.d $a3, $a3, 32 + addi.d $a4, $a4, 32 addi.d $a6, $a6, 32 addi.d $a7, $a7, 32 - addi.d $t0, $t0, 32 - bnez $t1, .LBB0_125 -# %bb.126: # %middle.block827 - bne $a1, $s1, .LBB0_94 - b .LBB0_96 -.LBB0_127: # %vector.ph698 - bstrpick.d $a1, $s1, 30, 3 - slli.d $a1, $a1, 3 - addi.d $a2, $s3, 16 - addi.d $a5, $t8, 16 - move $a6, $a1 - .p2align 4, , 16 -.LBB0_128: # %vector.body701 - # =>This Inner Loop Header: Depth=1 - vld $vr0, $a5, -16 - vld $vr3, $a5, 0 - vld $vr4, $a2, -16 - vld $vr5, $a2, 0 - vfadd.s $vr0, $vr0, $vr4 - vfadd.s $vr3, $vr3, $vr5 - vst $vr0, $a2, -16 - vst $vr3, $a2, 0 - addi.d $a6, $a6, -8 - addi.d $a2, $a2, 32 - addi.d $a5, $a5, 32 - bnez $a6, .LBB0_128 -# %bb.129: # %middle.block708 - beq $a1, $s1, .LBB0_76 - b .LBB0_74 -.LBB0_130: # %vector.memcheck520 - sub.d $s0, $t3, $t6 - move $a5, $zero - bltu $s0, $t1, .LBB0_10 -# %bb.131: # %vector.memcheck520 - sub.d $s0, $fp, $t6 - ori $t1, $zero, 16 - bltu $s0, $t1, .LBB0_10 -# %bb.132: # %vector.memcheck520 - sub.d $s0, $t6, $s1 - bltu $s0, $t1, .LBB0_10 -# %bb.133: # %vector.memcheck520 - sub.d $s0, $t6, $s3 - ori $t1, $zero, 16 - bltu $s0, $t1, .LBB0_10 -# %bb.134: # %vector.memcheck520 - sub.d $s0, $t6, $s4 - bltu $s0, $t1, .LBB0_10 -# %bb.135: # %vector.memcheck520 - sub.d $s0, $fp, $t3 - ori $t1, $zero, 16 - bltu $s0, $t1, .LBB0_10 -# %bb.136: # %vector.memcheck520 - sub.d $s0, $t3, $s1 - bltu $s0, $t1, .LBB0_10 -# %bb.137: # %vector.memcheck520 - sub.d $s0, $t3, $s3 - ori $t1, $zero, 16 - bltu $s0, $t1, .LBB0_10 -# %bb.138: # %vector.memcheck520 - sub.d $s0, $t3, $s4 - bltu $s0, $t1, .LBB0_10 -# %bb.139: # %vector.memcheck520 - sub.d $s0, $fp, $s1 - ori $t1, $zero, 16 - bltu $s0, $t1, .LBB0_10 -# %bb.140: # %vector.memcheck520 - sub.d $s0, $fp, $s3 - bltu $s0, $t1, .LBB0_10 -# %bb.141: # %vector.memcheck520 - sub.d $t1, $fp, $s4 - ori $s0, $zero, 16 - bltu $t1, $s0, .LBB0_10 -# %bb.142: # %vector.ph546 - bstrpick.d $a5, $a7, 30, 2 - slli.d $a5, $a5, 2 - vreplvei.w $vr9, $vr7, 0 - vreplvei.w $vr10, $vr8, 0 - vreplvei.w $vr11, $vr6, 0 - lu12i.w $t1, -262144 - vreplgr2vr.w $vr12, $t1 - move $t1, $s1 - move $s0, $s3 - move $s6, $s4 - move $s7, $t6 - move $s8, $t3 - move $ra, $fp - move $s5, $a5 - .p2align 4, , 16 -.LBB0_143: # %vector.body555 - # =>This Inner Loop Header: Depth=1 - vld $vr13, $t1, 0 - vld $vr14, $s6, 0 - vld $vr15, $s0, 0 - vfsub.s $vr16, $vr13, $vr14 - vfdiv.s $vr16, $vr16, $vr9 - vfmadd.s $vr13, $vr15, $vr12, $vr13 - vfadd.s $vr13, $vr13, $vr14 - vfdiv.s $vr13, $vr13, $vr10 - vfmadd.s $vr14, $vr16, $vr11, $vr15 - vfmul.s $vr15, $vr11, $vr13 - vfmadd.s $vr14, $vr15, $vr11, $vr14 - vst $vr14, $s7, 0 - vfadd.s $vr14, $vr13, $vr13 - vfmadd.s $vr14, $vr14, $vr11, $vr16 - vst $vr14, $s8, 0 - vst $vr13, $ra, 0 - addi.d $s5, $s5, -4 - addi.d $ra, $ra, 16 - addi.d $s8, $s8, 16 - addi.d $s7, $s7, 16 - addi.d $s6, $s6, 16 - addi.d $s0, $s0, 16 - addi.d $t1, $t1, 16 - bnez $s5, .LBB0_143 -# %bb.144: # %middle.block561 - bne $a5, $a7, .LBB0_10 - b .LBB0_17 -.LBB0_145: # %vector.memcheck476 - sub.d $t1, $t3, $t6 - move $s5, $zero - bltu $t1, $a5, .LBB0_5 + bnez $t0, .LBB0_138 +# %bb.139: # %middle.block827 + beq $a1, $t3, .LBB0_92 + b .LBB0_90 +.LBB0_140: # %vector.memcheck476 + sub.d $a5, $a7, $t0 + move $t6, $zero + bltu $a5, $a3, .LBB0_5 +# %bb.141: # %vector.memcheck476 + sub.d $a5, $t7, $t0 + ori $a3, $zero, 16 + bltu $a5, $a3, .LBB0_5 +# %bb.142: # %vector.memcheck476 + sub.d $a5, $t0, $t3 + bltu $a5, $a3, .LBB0_5 +# %bb.143: # %vector.memcheck476 + sub.d $a5, $t0, $t4 + ori $a3, $zero, 16 + bltu $a5, $a3, .LBB0_5 +# %bb.144: # %vector.memcheck476 + sub.d $a5, $t0, $t5 + bltu $a5, $a3, .LBB0_5 +# %bb.145: # %vector.memcheck476 + sub.d $a5, $t7, $a7 + ori $a3, $zero, 16 + bltu $a5, $a3, .LBB0_5 # %bb.146: # %vector.memcheck476 - sub.d $t1, $fp, $t6 - ori $a5, $zero, 16 - bltu $t1, $a5, .LBB0_5 + sub.d $a5, $a7, $t3 + bltu $a5, $a3, .LBB0_5 # %bb.147: # %vector.memcheck476 - sub.d $t1, $t6, $s1 - bltu $t1, $a5, .LBB0_5 + sub.d $a5, $a7, $t4 + ori $a3, $zero, 16 + bltu $a5, $a3, .LBB0_5 # %bb.148: # %vector.memcheck476 - sub.d $t1, $t6, $s3 - ori $a5, $zero, 16 - bltu $t1, $a5, .LBB0_5 + sub.d $a5, $a7, $t5 + bltu $a5, $a3, .LBB0_5 # %bb.149: # %vector.memcheck476 - sub.d $t1, $t6, $s4 - bltu $t1, $a5, .LBB0_5 + sub.d $a5, $t7, $t3 + ori $a3, $zero, 16 + bltu $a5, $a3, .LBB0_5 # %bb.150: # %vector.memcheck476 - sub.d $t1, $fp, $t3 - ori $a5, $zero, 16 - bltu $t1, $a5, .LBB0_5 + sub.d $a5, $t7, $t4 + bltu $a5, $a3, .LBB0_5 # %bb.151: # %vector.memcheck476 - sub.d $t1, $t3, $s1 - bltu $t1, $a5, .LBB0_5 -# %bb.152: # %vector.memcheck476 - sub.d $t1, $t3, $s3 - ori $a5, $zero, 16 - bltu $t1, $a5, .LBB0_5 -# %bb.153: # %vector.memcheck476 - sub.d $t1, $t3, $s4 - bltu $t1, $a5, .LBB0_5 -# %bb.154: # %vector.memcheck476 - sub.d $t1, $fp, $s1 + sub.d $a3, $t7, $t5 ori $a5, $zero, 16 - bltu $t1, $a5, .LBB0_5 -# %bb.155: # %vector.memcheck476 - sub.d $t1, $fp, $s3 - bltu $t1, $a5, .LBB0_5 -# %bb.156: # %vector.memcheck476 - sub.d $a5, $fp, $s4 - ori $t1, $zero, 16 - bltu $a5, $t1, .LBB0_5 -# %bb.157: # %vector.ph502 - bstrpick.d $a5, $a7, 30, 2 - slli.d $s5, $a5, 2 - vreplvei.w $vr9, $vr7, 0 - vreplvei.w $vr10, $vr8, 0 - vreplvei.w $vr11, $vr6, 0 - lu12i.w $a5, -262144 - vreplgr2vr.w $vr12, $a5 - move $t1, $s1 - move $s0, $s3 - move $s6, $s4 - move $s7, $t6 - move $s8, $t3 - move $ra, $fp - move $a5, $s5 + bltu $a3, $a5, .LBB0_5 +# %bb.152: # %vector.ph502 + bstrpick.d $a3, $a6, 30, 2 + slli.d $t6, $a3, 2 + vreplvei.w $vr13, $vr11, 0 + vreplvei.w $vr14, $vr12, 0 + vreplvei.w $vr15, $vr10, 0 + lu12i.w $a3, -262144 + vreplgr2vr.w $vr16, $a3 + move $a3, $t3 + move $a5, $t4 + move $t2, $t5 + move $t8, $t0 + move $fp, $a7 + move $s0, $t7 + move $s2, $t6 .p2align 4, , 16 -.LBB0_158: # %vector.body511 +.LBB0_153: # %vector.body511 # =>This Inner Loop Header: Depth=1 - vld $vr13, $t1, 0 - vld $vr14, $s6, 0 - vld $vr15, $s0, 0 - vfsub.s $vr16, $vr13, $vr14 - vfdiv.s $vr16, $vr16, $vr9 - vfmadd.s $vr13, $vr15, $vr12, $vr13 - vfadd.s $vr13, $vr13, $vr14 - vfdiv.s $vr13, $vr13, $vr10 - vfmadd.s $vr14, $vr16, $vr11, $vr15 - vfmul.s $vr15, $vr11, $vr13 - vfmadd.s $vr14, $vr15, $vr11, $vr14 - vst $vr14, $s7, 0 - vfadd.s $vr14, $vr13, $vr13 - vfmadd.s $vr14, $vr14, $vr11, $vr16 - vst $vr14, $s8, 0 - vst $vr13, $ra, 0 - addi.d $a5, $a5, -4 - addi.d $ra, $ra, 16 - addi.d $s8, $s8, 16 - addi.d $s7, $s7, 16 - addi.d $s6, $s6, 16 + vld $vr17, $a3, 0 + vld $vr18, $t2, 0 + vld $vr19, $a5, 0 + vfsub.s $vr20, $vr17, $vr18 + vfdiv.s $vr20, $vr20, $vr13 + vfmadd.s $vr17, $vr19, $vr16, $vr17 + vfadd.s $vr17, $vr17, $vr18 + vfdiv.s $vr17, $vr17, $vr14 + vfmadd.s $vr18, $vr20, $vr15, $vr19 + vfmul.s $vr19, $vr15, $vr17 + vfmadd.s $vr18, $vr19, $vr15, $vr18 + vst $vr18, $t8, 0 + vfadd.s $vr18, $vr17, $vr17 + vfmadd.s $vr18, $vr18, $vr15, $vr20 + vst $vr18, $fp, 0 + vst $vr17, $s0, 0 + addi.d $s2, $s2, -4 addi.d $s0, $s0, 16 - addi.d $t1, $t1, 16 - bnez $a5, .LBB0_158 -# %bb.159: # %middle.block517 - bne $s5, $a7, .LBB0_5 + addi.d $fp, $fp, 16 + addi.d $t8, $t8, 16 + addi.d $t2, $t2, 16 + addi.d $a5, $a5, 16 + addi.d $a3, $a3, 16 + bnez $s2, .LBB0_153 +# %bb.154: # %middle.block517 + bne $t6, $a6, .LBB0_5 b .LBB0_17 -.LBB0_160: # %vector.memcheck - sub.d $t1, $t3, $t6 - move $s5, $zero - bltu $t1, $a5, .LBB0_15 +.LBB0_155: # %vector.memcheck + sub.d $a5, $a7, $t0 + move $t6, $zero + bltu $a5, $a3, .LBB0_15 +# %bb.156: # %vector.memcheck + sub.d $a5, $t7, $t0 + ori $a3, $zero, 16 + bltu $a5, $a3, .LBB0_15 +# %bb.157: # %vector.memcheck + sub.d $a5, $t0, $t3 + bltu $a5, $a3, .LBB0_15 +# %bb.158: # %vector.memcheck + sub.d $a5, $t0, $t4 + ori $a3, $zero, 16 + bltu $a5, $a3, .LBB0_15 +# %bb.159: # %vector.memcheck + sub.d $a5, $t0, $t5 + bltu $a5, $a3, .LBB0_15 +# %bb.160: # %vector.memcheck + sub.d $a5, $t7, $a7 + ori $a3, $zero, 16 + bltu $a5, $a3, .LBB0_15 # %bb.161: # %vector.memcheck - sub.d $t1, $fp, $t6 - ori $a5, $zero, 16 - bltu $t1, $a5, .LBB0_15 + sub.d $a5, $a7, $t3 + bltu $a5, $a3, .LBB0_15 # %bb.162: # %vector.memcheck - sub.d $t1, $t6, $s1 - bltu $t1, $a5, .LBB0_15 + sub.d $a5, $a7, $t4 + ori $a3, $zero, 16 + bltu $a5, $a3, .LBB0_15 # %bb.163: # %vector.memcheck - sub.d $t1, $t6, $s3 - ori $a5, $zero, 16 - bltu $t1, $a5, .LBB0_15 + sub.d $a5, $a7, $t5 + bltu $a5, $a3, .LBB0_15 # %bb.164: # %vector.memcheck - sub.d $t1, $t6, $s4 - bltu $t1, $a5, .LBB0_15 + sub.d $a5, $t7, $t3 + ori $a3, $zero, 16 + bltu $a5, $a3, .LBB0_15 # %bb.165: # %vector.memcheck - sub.d $t1, $fp, $t3 - ori $a5, $zero, 16 - bltu $t1, $a5, .LBB0_15 + sub.d $a5, $t7, $t4 + bltu $a5, $a3, .LBB0_15 # %bb.166: # %vector.memcheck - sub.d $t1, $t3, $s1 - bltu $t1, $a5, .LBB0_15 -# %bb.167: # %vector.memcheck - sub.d $t1, $t3, $s3 - ori $a5, $zero, 16 - bltu $t1, $a5, .LBB0_15 -# %bb.168: # %vector.memcheck - sub.d $t1, $t3, $s4 - bltu $t1, $a5, .LBB0_15 -# %bb.169: # %vector.memcheck - sub.d $t1, $fp, $s1 + sub.d $a3, $t7, $t5 ori $a5, $zero, 16 - bltu $t1, $a5, .LBB0_15 -# %bb.170: # %vector.memcheck - sub.d $t1, $fp, $s3 - bltu $t1, $a5, .LBB0_15 -# %bb.171: # %vector.memcheck - sub.d $a5, $fp, $s4 - ori $t1, $zero, 16 - bltu $a5, $t1, .LBB0_15 -# %bb.172: # %vector.ph - bstrpick.d $a5, $a7, 30, 2 - slli.d $s5, $a5, 2 - vreplvei.w $vr9, $vr7, 0 - vreplvei.w $vr10, $vr8, 0 - vreplvei.w $vr11, $vr6, 0 - lu12i.w $a5, -262144 - vreplgr2vr.w $vr12, $a5 - move $t1, $s1 - move $s0, $s3 - move $s6, $s4 - move $s7, $t6 - move $s8, $t3 - move $ra, $fp - move $a5, $s5 + bltu $a3, $a5, .LBB0_15 +# %bb.167: # %vector.ph + bstrpick.d $a3, $a6, 30, 2 + slli.d $t6, $a3, 2 + vreplvei.w $vr13, $vr11, 0 + vreplvei.w $vr14, $vr12, 0 + vreplvei.w $vr15, $vr10, 0 + lu12i.w $a3, -262144 + vreplgr2vr.w $vr16, $a3 + move $a3, $t3 + move $a5, $t4 + move $t2, $t5 + move $t8, $t0 + move $fp, $a7 + move $s0, $t7 + move $s2, $t6 .p2align 4, , 16 -.LBB0_173: # %vector.body +.LBB0_168: # %vector.body # =>This Inner Loop Header: Depth=1 - vld $vr13, $t1, 0 - vld $vr14, $s6, 0 - vld $vr15, $s0, 0 - vfsub.s $vr16, $vr13, $vr14 - vfdiv.s $vr16, $vr16, $vr9 - vfmadd.s $vr13, $vr15, $vr12, $vr13 - vfadd.s $vr13, $vr13, $vr14 - vfdiv.s $vr13, $vr13, $vr10 - vfmadd.s $vr14, $vr16, $vr11, $vr15 - vfmul.s $vr15, $vr11, $vr13 - vfmadd.s $vr14, $vr15, $vr11, $vr14 - vst $vr14, $s7, 0 - vfadd.s $vr14, $vr13, $vr13 - vfmadd.s $vr14, $vr14, $vr11, $vr16 - vst $vr14, $s8, 0 - vst $vr13, $ra, 0 - addi.d $a5, $a5, -4 - addi.d $ra, $ra, 16 - addi.d $s8, $s8, 16 - addi.d $s7, $s7, 16 - addi.d $s6, $s6, 16 + vld $vr17, $a3, 0 + vld $vr18, $t2, 0 + vld $vr19, $a5, 0 + vfsub.s $vr20, $vr17, $vr18 + vfdiv.s $vr20, $vr20, $vr13 + vfmadd.s $vr17, $vr19, $vr16, $vr17 + vfadd.s $vr17, $vr17, $vr18 + vfdiv.s $vr17, $vr17, $vr14 + vfmadd.s $vr18, $vr20, $vr15, $vr19 + vfmul.s $vr19, $vr15, $vr17 + vfmadd.s $vr18, $vr19, $vr15, $vr18 + vst $vr18, $t8, 0 + vfadd.s $vr18, $vr17, $vr17 + vfmadd.s $vr18, $vr18, $vr15, $vr20 + vst $vr18, $fp, 0 + vst $vr17, $s0, 0 + addi.d $s2, $s2, -4 addi.d $s0, $s0, 16 - addi.d $t1, $t1, 16 - bnez $a5, .LBB0_173 -# %bb.174: # %middle.block - bne $s5, $a7, .LBB0_15 + addi.d $fp, $fp, 16 + addi.d $t8, $t8, 16 + addi.d $t2, $t2, 16 + addi.d $a5, $a5, 16 + addi.d $a3, $a3, 16 + bnez $s2, .LBB0_168 +# %bb.169: # %middle.block + bne $t6, $a6, .LBB0_15 b .LBB0_17 +.LBB0_170: + move $t4, $s7 + b .LBB0_42 .Lfunc_end0: .size attenuate_fluxes, .Lfunc_end0-attenuate_fluxes # -- End function diff --git a/results/MultiSource/Benchmarks/DOE-ProxyApps-C/XSBench/CMakeFiles/XSBench.dir/CalculateXS.s b/results/MultiSource/Benchmarks/DOE-ProxyApps-C/XSBench/CMakeFiles/XSBench.dir/CalculateXS.s index 71fd80ae..bcf11c5a 100644 --- a/results/MultiSource/Benchmarks/DOE-ProxyApps-C/XSBench/CMakeFiles/XSBench.dir/CalculateXS.s +++ b/results/MultiSource/Benchmarks/DOE-ProxyApps-C/XSBench/CMakeFiles/XSBench.dir/CalculateXS.s @@ -71,8 +71,8 @@ calculate_macro_xs: # @calculate_macro_xs srli.d $t3, $t3, 1 add.d $t3, $t3, $a1 slli.d $t4, $t3, 4 - fldx.d $fa1, $a5, $t4 - fcmp.clt.d $fcc0, $fa0, $fa1 + fldx.d $fa2, $a5, $t4 + fcmp.clt.d $fcc0, $fa0, $fa2 movcf2gr $t4, $fcc0 maskeqz $t5, $t3, $t4 masknez $t2, $t2, $t4 @@ -96,12 +96,9 @@ calculate_macro_xs: # @calculate_macro_xs add.d $a4, $a5, $t1 ld.d $a4, $a4, 8 addi.d $a2, $a2, -1 - movgr2fr.d $fa1, $zero + movgr2fr.d $fa2, $zero addi.w $a5, $zero, -48 - fmov.d $fa2, $fa1 - fmov.d $fa3, $fa1 - fmov.d $fa4, $fa1 - fmov.d $fa5, $fa1 + vori.b $vr3, $vr1, 0 .p2align 4, , 16 .LBB1_6: # =>This Inner Loop Header: Depth=1 ld.w $a7, $a0, 0 @@ -116,48 +113,38 @@ calculate_macro_xs: # @calculate_macro_xs add.d $a7, $a7, $t1 maskeqz $t1, $a5, $t2 add.d $t2, $a7, $t1 - fld.d $fa6, $t2, 48 - fldx.d $fa7, $a7, $t1 - fld.d $ft0, $t2, 56 - fld.d $ft1, $t2, 8 - fsub.d $ft2, $fa6, $fa0 - fsub.d $fa6, $fa6, $fa7 - fsub.d $fa7, $ft0, $ft1 - fneg.d $ft1, $ft2 - fdiv.d $fa6, $ft1, $fa6 - fld.d $ft1, $t2, 64 - fld.d $ft2, $t2, 16 - fld.d $ft3, $t2, 72 - fld.d $ft4, $t2, 24 - fmadd.d $fa7, $fa6, $fa7, $ft0 - fsub.d $ft0, $ft1, $ft2 - fmadd.d $ft0, $fa6, $ft0, $ft1 - fsub.d $ft1, $ft3, $ft4 - fmadd.d $ft1, $fa6, $ft1, $ft3 - fld.d $ft2, $t2, 80 - fld.d $ft3, $t2, 32 - fld.d $ft4, $t2, 88 - fld.d $ft5, $t2, 40 - fld.d $ft6, $a3, 0 - fsub.d $ft3, $ft2, $ft3 - fmadd.d $ft2, $fa6, $ft3, $ft2 - fsub.d $ft3, $ft4, $ft5 - fmadd.d $fa6, $fa6, $ft3, $ft4 - fmadd.d $fa5, $fa7, $ft6, $fa5 - fmadd.d $fa4, $ft0, $ft6, $fa4 - fmadd.d $fa3, $ft1, $ft6, $fa3 - fmadd.d $fa2, $ft2, $ft6, $fa2 - fmadd.d $fa1, $fa6, $ft6, $fa1 + fld.d $fa4, $t2, 48 + fldx.d $fa5, $a7, $t1 + fsub.d $fa6, $fa4, $fa0 + vld $vr7, $t2, 56 + vld $vr8, $t2, 8 + fsub.d $fa4, $fa4, $fa5 + fneg.d $fa5, $fa6 + fdiv.d $fa4, $fa5, $fa4 + vfsub.d $vr5, $vr7, $vr8 + vreplvei.d $vr6, $vr4, 0 + vfmadd.d $vr5, $vr6, $vr5, $vr7 + vld $vr7, $t2, 72 + vld $vr8, $t2, 24 + fld.d $ft1, $t2, 88 + fld.d $ft2, $t2, 40 + fld.d $ft3, $a3, 0 + vfsub.d $vr8, $vr7, $vr8 + vfmadd.d $vr6, $vr6, $vr8, $vr7 + fsub.d $fa7, $ft1, $ft2 + fmadd.d $fa4, $fa4, $fa7, $ft1 + vreplvei.d $vr7, $vr11, 0 + vfmadd.d $vr1, $vr5, $vr7, $vr1 + vfmadd.d $vr3, $vr6, $vr7, $vr3 + fmadd.d $fa2, $fa4, $ft3, $fa2 addi.d $a3, $a3, 8 addi.d $a1, $a1, -1 addi.d $a0, $a0, 4 bnez $a1, .LBB1_6 # %bb.7: # %._crit_edge.loopexit - fst.d $fa5, $t0, 0 - fst.d $fa4, $t0, 8 - fst.d $fa3, $t0, 16 - fst.d $fa2, $t0, 24 - fst.d $fa1, $t0, 32 + vst $vr1, $t0, 0 + vst $vr3, $t0, 16 + fst.d $fa2, $t0, 32 .LBB1_8: # %._crit_edge ret .Lfunc_end1: diff --git a/results/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/CMakeFiles/miniGMG.dir/operators.ompif.s b/results/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/CMakeFiles/miniGMG.dir/operators.ompif.s index 4fc98450..6ed88cfc 100644 --- a/results/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/CMakeFiles/miniGMG.dir/operators.ompif.s +++ b/results/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/CMakeFiles/miniGMG.dir/operators.ompif.s @@ -781,284 +781,302 @@ rebuild_lambda: # @rebuild_lambda .type smooth,@function smooth: # @smooth # %bb.0: - addi.d $sp, $sp, -1008 - st.d $ra, $sp, 1000 # 8-byte Folded Spill - st.d $fp, $sp, 992 # 8-byte Folded Spill - st.d $s0, $sp, 984 # 8-byte Folded Spill - st.d $s1, $sp, 976 # 8-byte Folded Spill - st.d $s2, $sp, 968 # 8-byte Folded Spill - st.d $s3, $sp, 960 # 8-byte Folded Spill - st.d $s4, $sp, 952 # 8-byte Folded Spill - st.d $s5, $sp, 944 # 8-byte Folded Spill - st.d $s6, $sp, 936 # 8-byte Folded Spill - st.d $s7, $sp, 928 # 8-byte Folded Spill - st.d $s8, $sp, 920 # 8-byte Folded Spill - fst.d $fs0, $sp, 912 # 8-byte Folded Spill - move $fp, $a0 - ld.w $a4, $a0, 1612 - ori $a0, $zero, 2 + addi.d $sp, $sp, -1040 + st.d $ra, $sp, 1032 # 8-byte Folded Spill + st.d $fp, $sp, 1024 # 8-byte Folded Spill + st.d $s0, $sp, 1016 # 8-byte Folded Spill + st.d $s1, $sp, 1008 # 8-byte Folded Spill + st.d $s2, $sp, 1000 # 8-byte Folded Spill + st.d $s3, $sp, 992 # 8-byte Folded Spill + st.d $s4, $sp, 984 # 8-byte Folded Spill + st.d $s5, $sp, 976 # 8-byte Folded Spill + st.d $s6, $sp, 968 # 8-byte Folded Spill + st.d $s7, $sp, 960 # 8-byte Folded Spill + st.d $s8, $sp, 952 # 8-byte Folded Spill + fst.d $fs0, $sp, 944 # 8-byte Folded Spill + move $s0, $a0 + ld.w $a0, $a0, 1612 + ori $fp, $zero, 2 fmov.d $fs0, $fa1 # kill: def $f0_64 killed $f0_64 def $vr0 move $s3, $a3 - st.d $a2, $sp, 144 # 8-byte Folded Spill - move $s0, $a1 - st.d $a4, $sp, 104 # 8-byte Folded Spill - vst $vr0, $sp, 80 # 16-byte Folded Spill - blt $a4, $a0, .LBB3_2 + st.d $a2, $sp, 160 # 8-byte Folded Spill + move $s1, $a1 + st.d $a0, $sp, 120 # 8-byte Folded Spill + vst $vr0, $sp, 96 # 16-byte Folded Spill + blt $a0, $fp, .LBB3_2 # %bb.1: ori $a3, $zero, 1 ori $a4, $zero, 1 ori $a5, $zero, 1 - move $a0, $fp - move $a1, $s0 + move $a0, $s0 + move $a1, $s1 move $a2, $s3 pcaddu18i $ra, %call36(exchange_boundary) jirl $ra, $ra, 0 - vld $vr0, $sp, 80 # 16-byte Folded Reload + vld $vr0, $sp, 96 # 16-byte Folded Reload .LBB3_2: move $a1, $zero ori $a2, $zero, 1 - ld.d $a0, $sp, 104 # 8-byte Folded Reload + ld.d $a0, $sp, 120 # 8-byte Folded Reload slt $a0, $a2, $a0 - st.d $a0, $sp, 64 # 8-byte Folded Spill - alsl.d $a0, $s0, $fp, 3 - st.d $a0, $sp, 56 # 8-byte Folded Spill + st.d $a0, $sp, 80 # 8-byte Folded Spill + alsl.d $a0, $s1, $s0, 3 + st.d $a0, $sp, 72 # 8-byte Folded Spill addi.d $a0, $a0, 1616 - st.d $a0, $sp, 136 # 8-byte Folded Spill + st.d $a0, $sp, 152 # 8-byte Folded Spill vreplvei.d $vr0, $vr0, 0 - vst $vr0, $sp, 32 # 16-byte Folded Spill + vst $vr0, $sp, 48 # 16-byte Folded Spill ori $a0, $zero, 216 - mul.d $a0, $s0, $a0 - st.d $a0, $sp, 160 # 8-byte Folded Spill + mul.d $a0, $s1, $a0 + st.d $a0, $sp, 176 # 8-byte Folded Spill slli.d $a0, $s3, 3 - st.d $a0, $sp, 128 # 8-byte Folded Spill + st.d $a0, $sp, 144 # 8-byte Folded Spill + lu32i.d $fp, 1 + vreplgr2vr.d $vr0, $fp + vst $vr0, $sp, 32 # 16-byte Folded Spill lu12i.w $a0, 349525 ori $a0, $a0, 1365 lu32i.d $a0, 349525 lu52i.d $a0, $a0, -1026 vreplgr2vr.d $vr0, $a0 vst $vr0, $sp, 16 # 16-byte Folded Spill - st.d $fp, $sp, 112 # 8-byte Folded Spill - st.d $s0, $sp, 72 # 8-byte Folded Spill + st.d $s0, $sp, 128 # 8-byte Folded Spill + st.d $s1, $sp, 88 # 8-byte Folded Spill b .LBB3_4 .p2align 4, , 16 .LBB3_3: # %._crit_edge226 # in Loop: Header=BB3_4 Depth=1 pcaddu18i $ra, %call36(CycleTime) jirl $ra, $ra, 0 - ld.d $a3, $sp, 56 # 8-byte Folded Reload + ld.d $a3, $sp, 72 # 8-byte Folded Reload ld.d $a1, $a3, 0 - ld.d $a2, $sp, 120 # 8-byte Folded Reload + ld.d $a2, $sp, 136 # 8-byte Folded Reload sub.d $a0, $a0, $a2 add.d $a0, $a0, $a1 - ld.d $a1, $sp, 104 # 8-byte Folded Reload - ld.d $a2, $sp, 152 # 8-byte Folded Reload + ld.d $a1, $sp, 120 # 8-byte Folded Reload + ld.d $a2, $sp, 168 # 8-byte Folded Reload add.w $a1, $a2, $a1 st.d $a0, $a3, 0 - ld.d $fp, $sp, 112 # 8-byte Folded Reload - ld.d $s0, $sp, 72 # 8-byte Folded Reload + ld.d $s0, $sp, 128 # 8-byte Folded Reload + ld.d $s1, $sp, 88 # 8-byte Folded Reload ori $a0, $zero, 4 - bge $a1, $a0, .LBB3_43 + bge $a1, $a0, .LBB3_40 .LBB3_4: # =>This Loop Header: Depth=1 # Child Loop BB3_7 Depth 2 # Child Loop BB3_10 Depth 3 # Child Loop BB3_13 Depth 4 # Child Loop BB3_16 Depth 5 - # Child Loop BB3_41 Depth 6 + # Child Loop BB3_38 Depth 6 # Child Loop BB3_19 Depth 6 - st.d $a1, $sp, 152 # 8-byte Folded Spill + st.d $a1, $sp, 168 # 8-byte Folded Spill andi $a0, $a1, 1 sltui $a0, $a0, 1 ori $a1, $zero, 10 masknez $a1, $a1, $a0 - ld.d $a2, $sp, 144 # 8-byte Folded Reload + ld.d $a2, $sp, 160 # 8-byte Folded Reload maskeqz $a0, $a2, $a0 or $a2, $a0, $a1 ori $a3, $zero, 1 - move $a0, $fp - move $a1, $s0 - ld.d $a4, $sp, 64 # 8-byte Folded Reload + move $a0, $s0 + move $a1, $s1 + ld.d $a4, $sp, 80 # 8-byte Folded Reload move $a5, $a4 pcaddu18i $ra, %call36(exchange_boundary) jirl $ra, $ra, 0 pcaddu18i $ra, %call36(CycleTime) jirl $ra, $ra, 0 - ld.w $a1, $fp, 1600 - st.d $a0, $sp, 120 # 8-byte Folded Spill - vld $vr9, $sp, 80 # 16-byte Folded Reload - vld $vr10, $sp, 32 # 16-byte Folded Reload - vld $vr11, $sp, 16 # 16-byte Folded Reload - st.d $a1, $sp, 176 # 8-byte Folded Spill + ld.w $a1, $s0, 1600 + st.d $a0, $sp, 136 # 8-byte Folded Spill + vld $vr13, $sp, 96 # 16-byte Folded Reload + vld $vr14, $sp, 48 # 16-byte Folded Reload + vld $vr15, $sp, 32 # 16-byte Folded Reload + vld $vr16, $sp, 16 # 16-byte Folded Reload + st.d $a1, $sp, 192 # 8-byte Folded Spill blez $a1, .LBB3_3 # %bb.5: # %.lr.ph225 # in Loop: Header=BB3_4 Depth=1 - ld.d $a0, $sp, 112 # 8-byte Folded Reload + ld.d $a0, $sp, 128 # 8-byte Folded Reload ld.d $a0, $a0, 1776 - st.d $a0, $sp, 168 # 8-byte Folded Spill + st.d $a0, $sp, 184 # 8-byte Folded Spill move $a1, $zero b .LBB3_7 .p2align 4, , 16 .LBB3_6: # %._crit_edge222 # in Loop: Header=BB3_7 Depth=2 - ld.d $a1, $sp, 184 # 8-byte Folded Reload + ld.d $a1, $sp, 200 # 8-byte Folded Reload addi.d $a1, $a1, 1 - ld.d $a0, $sp, 176 # 8-byte Folded Reload + ld.d $a0, $sp, 192 # 8-byte Folded Reload beq $a1, $a0, .LBB3_3 .LBB3_7: # Parent Loop BB3_4 Depth=1 # => This Loop Header: Depth=2 # Child Loop BB3_10 Depth 3 # Child Loop BB3_13 Depth 4 # Child Loop BB3_16 Depth 5 - # Child Loop BB3_41 Depth 6 + # Child Loop BB3_38 Depth 6 # Child Loop BB3_19 Depth 6 - st.d $a1, $sp, 184 # 8-byte Folded Spill + st.d $a1, $sp, 200 # 8-byte Folded Spill slli.d $a0, $a1, 8 - ld.d $a1, $sp, 168 # 8-byte Folded Reload + ld.d $a1, $sp, 184 # 8-byte Folded Reload add.d $a0, $a1, $a0 ld.d $a0, $a0, 248 - ld.d $a1, $sp, 160 # 8-byte Folded Reload + ld.d $a1, $sp, 176 # 8-byte Folded Reload add.d $a0, $a0, $a1 ld.w $a1, $a0, 44 - st.d $a1, $sp, 488 # 8-byte Folded Spill + st.d $a1, $sp, 512 # 8-byte Folded Spill blez $a1, .LBB3_6 # %bb.8: # %.lr.ph221 # in Loop: Header=BB3_7 Depth=2 - ld.w $a4, $a0, 48 - ld.d $a1, $a0, 176 - ld.w $a5, $a0, 52 - st.d $zero, $sp, 480 # 8-byte Folded Spill - ld.d $a2, $sp, 128 # 8-byte Folded Reload - ldx.d $a7, $a1, $a2 - add.d $t0, $a4, $a5 - addi.w $a2, $t0, 1 - ld.d $a6, $sp, 488 # 8-byte Folded Reload - mul.w $t1, $a6, $a2 - ld.d $t2, $a1, 16 - ld.d $t3, $a1, 40 - ld.d $t4, $a1, 48 - ld.d $t5, $a1, 56 - ld.d $t6, $a1, 32 - ld.d $a3, $sp, 136 # 8-byte Folded Reload - fld.d $fa0, $a3, 0 - ld.w $a3, $a0, 20 - ld.w $t7, $a0, 24 - st.d $t7, $sp, 280 # 8-byte Folded Spill + ld.d $a1, $a0, 48 + st.d $zero, $sp, 504 # 8-byte Folded Spill + ld.d $a2, $a0, 176 + vinsgr2vr.d $vr1, $a1, 0 + vpickve2gr.w $a4, $vr1, 0 + vpickve2gr.w $a5, $vr1, 1 + ld.d $a1, $sp, 144 # 8-byte Folded Reload + ldx.d $a6, $a2, $a1 + add.d $a7, $a4, $a5 + addi.w $t0, $a7, 1 + ld.d $a3, $sp, 512 # 8-byte Folded Reload + mul.w $t1, $a3, $t0 + ld.d $t2, $a2, 16 + ld.d $t3, $a2, 40 + ld.d $t7, $a2, 48 + ld.d $t8, $a2, 56 + ld.d $fp, $a2, 32 + ld.d $a1, $sp, 152 # 8-byte Folded Reload + fld.d $fa0, $a1, 0 + ld.w $a1, $a0, 20 + ld.w $t4, $a0, 24 + st.d $t4, $sp, 296 # 8-byte Folded Spill ld.w $a0, $a0, 28 - st.d $a0, $sp, 368 # 8-byte Folded Spill + st.d $a0, $sp, 392 # 8-byte Folded Spill fmul.d $fa0, $fa0, $fa0 frecip.d $fa0, $fa0 - ld.d $a0, $sp, 144 # 8-byte Folded Reload - alsl.d $a0, $a0, $a1, 3 - st.d $a0, $sp, 272 # 8-byte Folded Spill - addi.d $a0, $a1, 80 - st.d $a0, $sp, 264 # 8-byte Folded Spill + ld.d $a0, $sp, 160 # 8-byte Folded Reload + alsl.d $a0, $a0, $a2, 3 + st.d $a0, $sp, 288 # 8-byte Folded Spill + addi.d $a0, $a2, 80 + st.d $a0, $sp, 280 # 8-byte Folded Spill fmul.d $fa0, $fs0, $fa0 ori $a0, $zero, 1 - sub.w $a1, $a0, $a6 - st.d $a3, $sp, 288 # 8-byte Folded Spill - add.d $a0, $a3, $a6 - addi.d $a3, $a0, -1 - st.d $t0, $sp, 376 # 8-byte Folded Spill - st.d $a1, $sp, 904 # 8-byte Folded Spill - mulw.d.w $a0, $t0, $a1 - addi.d $t0, $a0, 1 - bstrpick.d $a1, $a2, 31, 0 - st.d $a1, $sp, 248 # 8-byte Folded Spill - sub.d $a1, $a0, $a6 - addi.d $a1, $a1, 2 - st.d $a1, $sp, 240 # 8-byte Folded Spill - add.d $a1, $a4, $a0 - st.d $a1, $sp, 464 # 8-byte Folded Spill - sub.d $a1, $a1, $a6 - addi.w $t7, $a1, 1 + sub.w $a2, $a0, $a3 + st.d $a1, $sp, 304 # 8-byte Folded Spill + add.d $a0, $a1, $a3 + addi.d $s0, $a0, -1 + st.d $a7, $sp, 400 # 8-byte Folded Spill + st.d $a2, $sp, 936 # 8-byte Folded Spill + mulw.d.w $a0, $a7, $a2 + vaddi.wu $vr1, $vr1, 1 + vreplgr2vr.w $vr2, $a0 + vpackev.d $vr1, $vr1, $vr15 + vadd.w $vr1, $vr2, $vr1 + vreplgr2vr.w $vr2, $a3 + vsub.w $vr1, $vr1, $vr2 + addi.d $a7, $a0, 1 + sub.d $a1, $t1, $a5 + slli.d $a1, $a1, 3 + st.d $a1, $sp, 264 # 8-byte Folded Spill addi.d $a1, $a5, 1 add.d $a2, $a1, $a0 - sub.w $t8, $a2, $a6 - sub.d $a2, $t1, $a5 - slli.d $a2, $a2, 3 - st.d $a2, $sp, 216 # 8-byte Folded Spill + sub.w $s2, $a2, $a3 sub.d $a2, $t1, $a4 slli.d $a2, $a2, 3 - st.d $a2, $sp, 208 # 8-byte Folded Spill - vreplvei.d $vr1, $vr0, 0 + st.d $a2, $sp, 248 # 8-byte Folded Spill + add.d $a2, $a4, $a0 + st.d $a2, $sp, 488 # 8-byte Folded Spill + sub.d $a2, $a2, $a3 + addi.w $s3, $a2, 1 + sub.d $a2, $a0, $a3 + addi.d $a2, $a2, 2 + st.d $a2, $sp, 232 # 8-byte Folded Spill + vreplvei.d $vr2, $vr0, 0 add.d $a1, $a1, $a4 - st.d $a1, $sp, 360 # 8-byte Folded Spill + st.d $a1, $sp, 384 # 8-byte Folded Spill + sub.d $a1, $a7, $a4 + sub.w $a1, $a1, $a3 + st.d $a1, $sp, 480 # 8-byte Folded Spill + sub.d $a1, $a7, $a5 + sub.w $a1, $a1, $a3 + st.d $a1, $sp, 472 # 8-byte Folded Spill + st.d $a4, $sp, 904 # 8-byte Folded Spill sub.d $a1, $a0, $a4 - st.d $a1, $sp, 456 # 8-byte Folded Spill + st.d $a1, $sp, 464 # 8-byte Folded Spill add.d $a1, $a5, $a0 - st.d $a1, $sp, 448 # 8-byte Folded Spill + st.d $a1, $sp, 456 # 8-byte Folded Spill + st.d $a5, $sp, 592 # 8-byte Folded Spill sub.d $a0, $a0, $a5 - st.d $a0, $sp, 440 # 8-byte Folded Spill - st.d $a7, $sp, 344 # 8-byte Folded Spill - alsl.d $s6, $t1, $a7, 3 - st.d $t2, $sp, 328 # 8-byte Folded Spill - alsl.d $a7, $t1, $t2, 3 - st.d $t3, $sp, 320 # 8-byte Folded Spill - alsl.d $t2, $t1, $t3, 3 - st.d $t4, $sp, 312 # 8-byte Folded Spill - alsl.d $t3, $t1, $t4, 3 - st.d $t5, $sp, 304 # 8-byte Folded Spill - alsl.d $t4, $t1, $t5, 3 - st.d $t6, $sp, 296 # 8-byte Folded Spill - alsl.d $t5, $t1, $t6, 3 - ld.d $a0, $sp, 152 # 8-byte Folded Reload - add.w $a1, $a6, $a0 - st.d $a1, $sp, 352 # 8-byte Folded Spill - st.d $t0, $sp, 472 # 8-byte Folded Spill - sub.w $a1, $t0, $a6 - st.d $t1, $sp, 336 # 8-byte Folded Spill - slli.d $a2, $t1, 3 - st.d $a2, $sp, 192 # 8-byte Folded Spill - st.d $a4, $sp, 872 # 8-byte Folded Spill - sub.w $a2, $a1, $a4 - st.d $a2, $sp, 432 # 8-byte Folded Spill - st.d $a5, $sp, 568 # 8-byte Folded Spill - sub.w $a2, $a1, $a5 - st.d $a2, $sp, 424 # 8-byte Folded Spill - st.d $a3, $sp, 256 # 8-byte Folded Spill - st.d $a3, $sp, 416 # 8-byte Folded Spill - st.d $t8, $sp, 224 # 8-byte Folded Spill - st.d $t8, $sp, 408 # 8-byte Folded Spill - st.d $t7, $sp, 232 # 8-byte Folded Spill - st.d $t7, $sp, 400 # 8-byte Folded Spill - st.d $a1, $sp, 200 # 8-byte Folded Spill - st.d $a1, $sp, 392 # 8-byte Folded Spill - st.d $a0, $sp, 384 # 8-byte Folded Spill + st.d $a0, $sp, 448 # 8-byte Folded Spill + st.d $a6, $sp, 368 # 8-byte Folded Spill + alsl.d $t4, $t1, $a6, 3 + st.d $t2, $sp, 344 # 8-byte Folded Spill + alsl.d $t5, $t1, $t2, 3 + st.d $t3, $sp, 336 # 8-byte Folded Spill + alsl.d $t6, $t1, $t3, 3 + st.d $t7, $sp, 328 # 8-byte Folded Spill + alsl.d $t7, $t1, $t7, 3 + st.d $t8, $sp, 320 # 8-byte Folded Spill + alsl.d $t8, $t1, $t8, 3 + st.d $fp, $sp, 312 # 8-byte Folded Spill + alsl.d $s1, $t1, $fp, 3 + ld.d $a0, $sp, 168 # 8-byte Folded Reload + add.w $a1, $a3, $a0 + st.d $a1, $sp, 376 # 8-byte Folded Spill + st.d $t1, $sp, 352 # 8-byte Folded Spill + slli.d $a1, $t1, 3 + st.d $a1, $sp, 224 # 8-byte Folded Spill + st.d $a7, $sp, 496 # 8-byte Folded Spill + sub.w $a1, $a7, $a3 + st.d $t0, $sp, 360 # 8-byte Folded Spill + bstrpick.d $a2, $t0, 31, 0 + st.d $a2, $sp, 208 # 8-byte Folded Spill + st.d $s0, $sp, 272 # 8-byte Folded Spill + st.d $s0, $sp, 440 # 8-byte Folded Spill + st.d $s2, $sp, 256 # 8-byte Folded Spill + st.d $s2, $sp, 432 # 8-byte Folded Spill + st.d $s3, $sp, 240 # 8-byte Folded Spill + st.d $s3, $sp, 424 # 8-byte Folded Spill + st.d $a1, $sp, 216 # 8-byte Folded Spill + st.d $a1, $sp, 416 # 8-byte Folded Spill + st.d $a0, $sp, 408 # 8-byte Folded Spill b .LBB3_10 .p2align 4, , 16 .LBB3_9: # %._crit_edge215 # in Loop: Header=BB3_10 Depth=3 - ld.d $a2, $sp, 384 # 8-byte Folded Reload + ld.d $a2, $sp, 408 # 8-byte Folded Reload addi.w $a2, $a2, 1 - ld.d $a0, $sp, 904 # 8-byte Folded Reload + ld.d $a0, $sp, 936 # 8-byte Folded Reload addi.w $a0, $a0, 1 - st.d $a0, $sp, 904 # 8-byte Folded Spill - ld.d $a0, $sp, 480 # 8-byte Folded Reload + st.d $a0, $sp, 936 # 8-byte Folded Spill + ld.d $a0, $sp, 504 # 8-byte Folded Reload addi.d $a0, $a0, 1 - st.d $a0, $sp, 480 # 8-byte Folded Spill - ld.d $a0, $sp, 360 # 8-byte Folded Reload - ld.d $a1, $sp, 392 # 8-byte Folded Reload + st.d $a0, $sp, 504 # 8-byte Folded Spill + ld.d $a0, $sp, 384 # 8-byte Folded Reload + ld.d $a1, $sp, 416 # 8-byte Folded Reload add.w $a1, $a1, $a0 - st.d $a1, $sp, 392 # 8-byte Folded Spill - ld.d $a1, $sp, 400 # 8-byte Folded Reload + st.d $a1, $sp, 416 # 8-byte Folded Spill + ld.d $a1, $sp, 424 # 8-byte Folded Reload + add.w $a1, $a1, $a0 + st.d $a1, $sp, 424 # 8-byte Folded Spill + ld.d $a1, $sp, 480 # 8-byte Folded Reload add.w $a1, $a1, $a0 - st.d $a1, $sp, 400 # 8-byte Folded Spill + st.d $a1, $sp, 480 # 8-byte Folded Spill ld.d $a1, $sp, 432 # 8-byte Folded Reload add.w $a1, $a1, $a0 st.d $a1, $sp, 432 # 8-byte Folded Spill - ld.d $a1, $sp, 408 # 8-byte Folded Reload - add.w $a1, $a1, $a0 - st.d $a1, $sp, 408 # 8-byte Folded Spill - ld.d $a1, $sp, 424 # 8-byte Folded Reload + ld.d $a1, $sp, 472 # 8-byte Folded Reload add.w $a1, $a1, $a0 - st.d $a1, $sp, 424 # 8-byte Folded Spill - ld.d $a0, $sp, 416 # 8-byte Folded Reload + st.d $a1, $sp, 472 # 8-byte Folded Spill + ld.d $a0, $sp, 440 # 8-byte Folded Reload addi.d $a0, $a0, -1 - st.d $a0, $sp, 416 # 8-byte Folded Spill - ld.d $a0, $sp, 376 # 8-byte Folded Reload - ld.d $a1, $sp, 472 # 8-byte Folded Reload + st.d $a0, $sp, 440 # 8-byte Folded Spill + ld.d $a0, $sp, 400 # 8-byte Folded Reload + ld.d $a1, $sp, 496 # 8-byte Folded Reload add.d $a1, $a1, $a0 - st.d $a1, $sp, 472 # 8-byte Folded Spill + st.d $a1, $sp, 496 # 8-byte Folded Spill + ld.d $a1, $sp, 488 # 8-byte Folded Reload + add.d $a1, $a1, $a0 + st.d $a1, $sp, 488 # 8-byte Folded Spill ld.d $a1, $sp, 464 # 8-byte Folded Reload add.d $a1, $a1, $a0 st.d $a1, $sp, 464 # 8-byte Folded Spill @@ -1068,297 +1086,301 @@ smooth: # @smooth ld.d $a1, $sp, 448 # 8-byte Folded Reload add.d $a1, $a1, $a0 st.d $a1, $sp, 448 # 8-byte Folded Spill - ld.d $a1, $sp, 440 # 8-byte Folded Reload - add.d $a1, $a1, $a0 - st.d $a1, $sp, 440 # 8-byte Folded Spill - ld.d $a0, $sp, 352 # 8-byte Folded Reload - st.d $a2, $sp, 384 # 8-byte Folded Spill + ld.d $a0, $sp, 376 # 8-byte Folded Reload + st.d $a2, $sp, 408 # 8-byte Folded Spill bge $a2, $a0, .LBB3_6 .LBB3_10: # Parent Loop BB3_4 Depth=1 # Parent Loop BB3_7 Depth=2 # => This Loop Header: Depth=3 # Child Loop BB3_13 Depth 4 # Child Loop BB3_16 Depth 5 - # Child Loop BB3_41 Depth 6 + # Child Loop BB3_38 Depth 6 # Child Loop BB3_19 Depth 6 - ld.d $a0, $sp, 488 # 8-byte Folded Reload + ld.d $a0, $sp, 512 # 8-byte Folded Reload ori $a1, $zero, 1 sub.d $a2, $a1, $a0 addi.d $a0, $a0, -1 - st.d $a0, $sp, 488 # 8-byte Folded Spill - ld.d $a1, $sp, 368 # 8-byte Folded Reload + st.d $a0, $sp, 512 # 8-byte Folded Spill + ld.d $a1, $sp, 392 # 8-byte Folded Reload add.d $a0, $a0, $a1 - st.d $a2, $sp, 864 # 8-byte Folded Spill - st.d $a0, $sp, 552 # 8-byte Folded Spill + st.d $a2, $sp, 896 # 8-byte Folded Spill + st.d $a0, $sp, 576 # 8-byte Folded Spill bge $a2, $a0, .LBB3_9 # %bb.11: # %.preheader210.lr.ph # in Loop: Header=BB3_10 Depth=3 - st.d $zero, $sp, 736 # 8-byte Folded Spill - ld.d $t8, $sp, 904 # 8-byte Folded Reload - addi.d $a0, $t8, 1 - ld.d $a3, $sp, 416 # 8-byte Folded Reload + st.d $zero, $sp, 792 # 8-byte Folded Spill + ld.d $t2, $sp, 936 # 8-byte Folded Reload + addi.d $a0, $t2, 1 + ld.d $a3, $sp, 440 # 8-byte Folded Reload slt $a1, $a0, $a3 masknez $a2, $a0, $a1 maskeqz $a1, $a3, $a1 or $a1, $a1, $a2 - sub.d $a1, $a1, $t8 + sub.d $a1, $a1, $t2 bstrins.d $a1, $zero, 0, 0 - st.d $a1, $sp, 544 # 8-byte Folded Spill - ld.d $a3, $sp, 480 # 8-byte Folded Reload - ld.d $a1, $sp, 256 # 8-byte Folded Reload - sub.d $a1, $a1, $a3 + st.d $a1, $sp, 568 # 8-byte Folded Spill + ld.d $a4, $sp, 504 # 8-byte Folded Reload + ld.d $a1, $sp, 272 # 8-byte Folded Reload + sub.d $a1, $a1, $a4 slt $a2, $a0, $a1 maskeqz $a1, $a1, $a2 masknez $a0, $a0, $a2 or $a0, $a1, $a0 - sub.d $fp, $a0, $t8 - ld.d $a1, $sp, 248 # 8-byte Folded Reload - mul.d $a1, $a3, $a1 - ld.d $a2, $sp, 200 # 8-byte Folded Reload - add.d $a2, $a2, $a1 - st.d $a2, $sp, 520 # 8-byte Folded Spill - ld.d $a2, $sp, 224 # 8-byte Folded Reload + sub.d $fp, $a0, $t2 + ld.d $a1, $sp, 208 # 8-byte Folded Reload + mul.d $a1, $a4, $a1 + ld.d $a2, $sp, 216 # 8-byte Folded Reload add.d $a2, $a2, $a1 - st.d $a2, $sp, 512 # 8-byte Folded Spill - ld.d $a2, $sp, 232 # 8-byte Folded Reload + st.d $a2, $sp, 544 # 8-byte Folded Spill + ld.d $a2, $sp, 256 # 8-byte Folded Reload add.d $a2, $a2, $a1 - st.d $a2, $sp, 504 # 8-byte Folded Spill + st.d $a2, $sp, 536 # 8-byte Folded Spill ld.d $a2, $sp, 240 # 8-byte Folded Reload + add.d $a2, $a2, $a1 + st.d $a2, $sp, 528 # 8-byte Folded Spill + ld.d $a2, $sp, 232 # 8-byte Folded Reload add.d $a1, $a2, $a1 - st.d $a1, $sp, 496 # 8-byte Folded Spill - ld.d $a1, $sp, 384 # 8-byte Folded Reload + st.d $a1, $sp, 520 # 8-byte Folded Spill + nor $a1, $t2, $zero + add.d $s0, $a0, $a1 + ld.d $a1, $sp, 408 # 8-byte Folded Reload andi $a1, $a1, 1 sltui $a1, $a1, 1 - ld.d $a5, $sp, 264 # 8-byte Folded Reload - masknez $a2, $a5, $a1 - ld.d $a4, $sp, 272 # 8-byte Folded Reload - maskeqz $a3, $a4, $a1 + ld.d $a6, $sp, 280 # 8-byte Folded Reload + masknez $a2, $a6, $a1 + ld.d $a5, $sp, 288 # 8-byte Folded Reload + maskeqz $a3, $a5, $a1 or $a2, $a3, $a2 - masknez $a3, $a4, $a1 - maskeqz $a1, $a5, $a1 + masknez $a3, $a5, $a1 + maskeqz $a1, $a6, $a1 or $a1, $a1, $a3 - ld.d $a3, $a1, 0 + ld.d $a1, $a1, 0 ld.d $a2, $a2, 0 - nor $a1, $t8, $zero - add.d $a1, $a0, $a1 - st.d $a1, $sp, 848 # 8-byte Folded Spill - ld.d $t6, $sp, 336 # 8-byte Folded Reload - alsl.d $a6, $t6, $a3, 3 - alsl.d $a1, $t6, $a2, 3 - ld.d $a4, $sp, 872 # 8-byte Folded Reload - add.d $a4, $a4, $t8 - ld.d $a5, $sp, 568 # 8-byte Folded Reload - add.d $a5, $a5, $t8 - slli.d $t0, $t8, 3 - ld.d $t1, $sp, 488 # 8-byte Folded Reload - ld.d $t7, $sp, 280 # 8-byte Folded Reload - add.d $t7, $t1, $t7 - st.d $t7, $sp, 856 # 8-byte Folded Spill - ld.d $t7, $sp, 288 # 8-byte Folded Reload - add.d $ra, $t1, $t7 - ld.d $t1, $sp, 192 # 8-byte Folded Reload - add.d $t7, $a3, $t1 - st.d $t7, $sp, 648 # 8-byte Folded Spill - sub.d $a3, $a3, $t0 - st.d $a3, $sp, 640 # 8-byte Folded Spill - add.d $a0, $a0, $t6 - st.d $a0, $sp, 832 # 8-byte Folded Spill - ld.d $a0, $sp, 328 # 8-byte Folded Reload - sub.d $a0, $a0, $t0 - st.d $a0, $sp, 632 # 8-byte Folded Spill - ld.d $a0, $sp, 216 # 8-byte Folded Reload + ld.d $a3, $sp, 360 # 8-byte Folded Reload + mul.d $a3, $a3, $a4 + vinsgr2vr.w $vr3, $a3, 0 + ld.d $t0, $sp, 352 # 8-byte Folded Reload + alsl.d $a7, $t0, $a1, 3 + alsl.d $t3, $t0, $a2, 3 + ld.d $a3, $sp, 904 # 8-byte Folded Reload + add.d $a3, $a3, $t2 + ld.d $a4, $sp, 592 # 8-byte Folded Reload + add.d $a4, $a4, $t2 + slli.d $a5, $t2, 3 + ld.d $a6, $sp, 512 # 8-byte Folded Reload + ld.d $t1, $sp, 296 # 8-byte Folded Reload + add.d $t1, $a6, $t1 + st.d $t1, $sp, 888 # 8-byte Folded Spill + ld.d $t1, $sp, 304 # 8-byte Folded Reload + add.d $s3, $a6, $t1 + ld.d $a6, $sp, 224 # 8-byte Folded Reload + add.d $t1, $a1, $a6 + st.d $t1, $sp, 672 # 8-byte Folded Spill + sub.d $a1, $a1, $a5 + st.d $a1, $sp, 664 # 8-byte Folded Spill + add.d $a0, $a0, $t0 + st.d $a0, $sp, 872 # 8-byte Folded Spill + ld.d $a0, $sp, 344 # 8-byte Folded Reload + sub.d $a0, $a0, $a5 + st.d $a0, $sp, 656 # 8-byte Folded Spill + ld.d $a0, $sp, 264 # 8-byte Folded Reload add.d $a0, $a2, $a0 - st.d $a0, $sp, 624 # 8-byte Folded Spill - slli.d $a0, $a5, 3 + st.d $a0, $sp, 648 # 8-byte Folded Spill + slli.d $a0, $a4, 3 sub.d $a0, $a2, $a0 - st.d $a0, $sp, 616 # 8-byte Folded Spill - add.d $a3, $a2, $t1 - sub.d $a0, $a2, $t0 - st.d $a0, $sp, 824 # 8-byte Folded Spill - ld.d $a0, $sp, 208 # 8-byte Folded Reload + st.d $a0, $sp, 640 # 8-byte Folded Spill + add.d $a1, $a2, $a6 + sub.d $a0, $a2, $a5 + st.d $a0, $sp, 864 # 8-byte Folded Spill + ld.d $a0, $sp, 248 # 8-byte Folded Reload add.d $a0, $a2, $a0 - st.d $a0, $sp, 608 # 8-byte Folded Spill - slli.d $a0, $a4, 3 + st.d $a0, $sp, 632 # 8-byte Folded Spill + slli.d $a0, $a3, 3 sub.d $a0, $a2, $a0 - st.d $a0, $sp, 600 # 8-byte Folded Spill + st.d $a0, $sp, 624 # 8-byte Folded Spill + ld.d $a0, $sp, 336 # 8-byte Folded Reload + sub.d $a0, $a0, $a5 + st.d $a0, $sp, 848 # 8-byte Folded Spill + ld.d $a0, $sp, 328 # 8-byte Folded Reload + sub.d $a0, $a0, $a5 + st.d $a0, $sp, 840 # 8-byte Folded Spill ld.d $a0, $sp, 320 # 8-byte Folded Reload - sub.d $a0, $a0, $t0 - st.d $a0, $sp, 800 # 8-byte Folded Spill + sub.d $a0, $a0, $a5 + st.d $a0, $sp, 808 # 8-byte Folded Spill ld.d $a0, $sp, 312 # 8-byte Folded Reload - sub.d $a0, $a0, $t0 - st.d $a0, $sp, 792 # 8-byte Folded Spill - ld.d $a0, $sp, 304 # 8-byte Folded Reload - sub.d $a0, $a0, $t0 - st.d $a0, $sp, 768 # 8-byte Folded Spill - ld.d $a0, $sp, 296 # 8-byte Folded Reload - sub.d $a0, $a0, $t0 - st.d $a0, $sp, 576 # 8-byte Folded Spill - ld.d $a0, $sp, 344 # 8-byte Folded Reload - sub.d $a0, $a0, $t0 - st.d $a0, $sp, 560 # 8-byte Folded Spill - st.d $fp, $sp, 840 # 8-byte Folded Spill + sub.d $a0, $a0, $a5 + st.d $a0, $sp, 600 # 8-byte Folded Spill + ld.d $a0, $sp, 368 # 8-byte Folded Reload + sub.d $a0, $a0, $a5 + st.d $a0, $sp, 584 # 8-byte Folded Spill + st.d $s0, $sp, 800 # 8-byte Folded Spill + vreplgr2vr.w $vr4, $s0 + vadd.w $vr4, $vr1, $vr4 + st.d $fp, $sp, 880 # 8-byte Folded Spill bstrins.d $fp, $zero, 0, 0 - st.d $a3, $sp, 816 # 8-byte Folded Spill - addi.d $a0, $a3, -8 - st.d $a0, $sp, 592 # 8-byte Folded Spill - st.d $fp, $sp, 536 # 8-byte Folded Spill - add.d $a0, $fp, $t8 - st.d $a0, $sp, 528 # 8-byte Folded Spill - ld.d $a0, $sp, 440 # 8-byte Folded Reload - st.d $a0, $sp, 728 # 8-byte Folded Spill + st.d $a1, $sp, 856 # 8-byte Folded Spill + addi.d $a0, $a1, -8 + st.d $a0, $sp, 616 # 8-byte Folded Spill + st.d $fp, $sp, 560 # 8-byte Folded Spill + add.d $a0, $fp, $t2 + st.d $a0, $sp, 552 # 8-byte Folded Spill ld.d $a0, $sp, 448 # 8-byte Folded Reload - st.d $a0, $sp, 720 # 8-byte Folded Spill + st.d $a0, $sp, 784 # 8-byte Folded Spill ld.d $a0, $sp, 456 # 8-byte Folded Reload - st.d $a0, $sp, 712 # 8-byte Folded Spill + st.d $a0, $sp, 776 # 8-byte Folded Spill ld.d $a0, $sp, 464 # 8-byte Folded Reload - st.d $a0, $sp, 704 # 8-byte Folded Spill + st.d $a0, $sp, 768 # 8-byte Folded Spill + ld.d $a0, $sp, 488 # 8-byte Folded Reload + st.d $a0, $sp, 760 # 8-byte Folded Spill + ld.d $a0, $sp, 496 # 8-byte Folded Reload + st.d $a0, $sp, 752 # 8-byte Folded Spill ld.d $a0, $sp, 472 # 8-byte Folded Reload - st.d $a0, $sp, 696 # 8-byte Folded Spill - ld.d $a0, $sp, 424 # 8-byte Folded Reload - st.d $a0, $sp, 688 # 8-byte Folded Spill - ld.d $a0, $sp, 408 # 8-byte Folded Reload - st.d $a0, $sp, 680 # 8-byte Folded Spill + st.d $a0, $sp, 744 # 8-byte Folded Spill ld.d $a0, $sp, 432 # 8-byte Folded Reload - st.d $a0, $sp, 672 # 8-byte Folded Spill - ld.d $a0, $sp, 400 # 8-byte Folded Reload - st.d $a0, $sp, 664 # 8-byte Folded Spill - ld.d $a0, $sp, 392 # 8-byte Folded Reload - st.d $a0, $sp, 656 # 8-byte Folded Spill - ld.d $a3, $sp, 864 # 8-byte Folded Reload + st.d $a0, $sp, 736 # 8-byte Folded Spill + ld.d $a0, $sp, 480 # 8-byte Folded Reload + st.d $a0, $sp, 728 # 8-byte Folded Spill + ld.d $a0, $sp, 424 # 8-byte Folded Reload + st.d $a0, $sp, 720 # 8-byte Folded Spill + ld.d $a0, $sp, 416 # 8-byte Folded Reload + st.d $a0, $sp, 712 # 8-byte Folded Spill + ld.d $a2, $sp, 896 # 8-byte Folded Reload b .LBB3_13 .p2align 4, , 16 .LBB3_12: # %._crit_edge213 # in Loop: Header=BB3_13 Depth=4 - ld.d $a3, $sp, 584 # 8-byte Folded Reload - addi.w $a3, $a3, 1 - ld.d $a0, $sp, 736 # 8-byte Folded Reload + ld.d $a2, $sp, 608 # 8-byte Folded Reload + addi.w $a2, $a2, 1 + ld.d $a0, $sp, 792 # 8-byte Folded Reload addi.d $a0, $a0, 1 - st.d $a0, $sp, 736 # 8-byte Folded Spill - ld.d $a0, $sp, 568 # 8-byte Folded Reload - ld.d $a2, $sp, 656 # 8-byte Folded Reload - add.w $a2, $a2, $a0 - st.d $a2, $sp, 656 # 8-byte Folded Spill - ld.d $a2, $sp, 664 # 8-byte Folded Reload - add.w $a2, $a2, $a0 - st.d $a2, $sp, 664 # 8-byte Folded Spill - ld.d $a2, $sp, 672 # 8-byte Folded Reload - add.w $a2, $a2, $a0 - st.d $a2, $sp, 672 # 8-byte Folded Spill - ld.d $a2, $sp, 680 # 8-byte Folded Reload - add.w $a2, $a2, $a0 - st.d $a2, $sp, 680 # 8-byte Folded Spill - ld.d $a2, $sp, 688 # 8-byte Folded Reload - add.w $a2, $a2, $a0 - st.d $a2, $sp, 688 # 8-byte Folded Spill - ld.d $a2, $sp, 696 # 8-byte Folded Reload - add.d $a2, $a2, $a0 - st.d $a2, $sp, 696 # 8-byte Folded Spill - ld.d $a2, $sp, 704 # 8-byte Folded Reload - add.d $a2, $a2, $a0 - st.d $a2, $sp, 704 # 8-byte Folded Spill - ld.d $a2, $sp, 712 # 8-byte Folded Reload - add.d $a2, $a2, $a0 - st.d $a2, $sp, 712 # 8-byte Folded Spill - ld.d $a2, $sp, 720 # 8-byte Folded Reload - add.d $a2, $a2, $a0 - st.d $a2, $sp, 720 # 8-byte Folded Spill - ld.d $a2, $sp, 728 # 8-byte Folded Reload - add.d $a2, $a2, $a0 - st.d $a2, $sp, 728 # 8-byte Folded Spill - ld.d $a0, $sp, 552 # 8-byte Folded Reload - bge $a3, $a0, .LBB3_9 + st.d $a0, $sp, 792 # 8-byte Folded Spill + ld.d $a0, $sp, 592 # 8-byte Folded Reload + ld.d $a1, $sp, 712 # 8-byte Folded Reload + add.w $a1, $a1, $a0 + st.d $a1, $sp, 712 # 8-byte Folded Spill + ld.d $a1, $sp, 720 # 8-byte Folded Reload + add.w $a1, $a1, $a0 + st.d $a1, $sp, 720 # 8-byte Folded Spill + ld.d $a1, $sp, 728 # 8-byte Folded Reload + add.w $a1, $a1, $a0 + st.d $a1, $sp, 728 # 8-byte Folded Spill + ld.d $a1, $sp, 736 # 8-byte Folded Reload + add.w $a1, $a1, $a0 + st.d $a1, $sp, 736 # 8-byte Folded Spill + ld.d $a1, $sp, 744 # 8-byte Folded Reload + add.w $a1, $a1, $a0 + st.d $a1, $sp, 744 # 8-byte Folded Spill + ld.d $a1, $sp, 752 # 8-byte Folded Reload + add.d $a1, $a1, $a0 + st.d $a1, $sp, 752 # 8-byte Folded Spill + ld.d $a1, $sp, 760 # 8-byte Folded Reload + add.d $a1, $a1, $a0 + st.d $a1, $sp, 760 # 8-byte Folded Spill + ld.d $a1, $sp, 768 # 8-byte Folded Reload + add.d $a1, $a1, $a0 + st.d $a1, $sp, 768 # 8-byte Folded Spill + ld.d $a1, $sp, 776 # 8-byte Folded Reload + add.d $a1, $a1, $a0 + st.d $a1, $sp, 776 # 8-byte Folded Spill + ld.d $a1, $sp, 784 # 8-byte Folded Reload + add.d $a1, $a1, $a0 + st.d $a1, $sp, 784 # 8-byte Folded Spill + ld.d $a0, $sp, 576 # 8-byte Folded Reload + bge $a2, $a0, .LBB3_9 .LBB3_13: # %.preheader210 # Parent Loop BB3_4 Depth=1 # Parent Loop BB3_7 Depth=2 # Parent Loop BB3_10 Depth=3 # => This Loop Header: Depth=4 # Child Loop BB3_16 Depth 5 - # Child Loop BB3_41 Depth 6 + # Child Loop BB3_38 Depth 6 # Child Loop BB3_19 Depth 6 - st.d $a3, $sp, 584 # 8-byte Folded Spill - ld.d $a0, $sp, 864 # 8-byte Folded Reload - ld.d $a2, $sp, 856 # 8-byte Folded Reload - bge $a0, $a2, .LBB3_12 + st.d $a2, $sp, 608 # 8-byte Folded Spill + ld.d $a0, $sp, 896 # 8-byte Folded Reload + ld.d $a1, $sp, 888 # 8-byte Folded Reload + bge $a0, $a1, .LBB3_12 # %bb.14: # %.preheader.lr.ph # in Loop: Header=BB3_13 Depth=4 - st.d $zero, $sp, 896 # 8-byte Folded Spill - ld.d $a0, $sp, 568 # 8-byte Folded Reload - ld.d $a2, $sp, 736 # 8-byte Folded Reload - mul.d $a0, $a0, $a2 - ld.d $a2, $sp, 520 # 8-byte Folded Reload - add.d $a2, $a2, $a0 - st.d $a2, $sp, 752 # 8-byte Folded Spill - ld.d $a2, $sp, 512 # 8-byte Folded Reload - add.d $a2, $a2, $a0 - st.d $a2, $sp, 808 # 8-byte Folded Spill - ld.d $a2, $sp, 504 # 8-byte Folded Reload - add.d $a2, $a2, $a0 - st.d $a2, $sp, 760 # 8-byte Folded Spill - ld.d $a2, $sp, 496 # 8-byte Folded Reload - add.d $a0, $a2, $a0 - st.d $a0, $sp, 744 # 8-byte Folded Spill - ld.d $t1, $sp, 728 # 8-byte Folded Reload - ld.d $t0, $sp, 720 # 8-byte Folded Reload - ld.d $s1, $sp, 712 # 8-byte Folded Reload - ld.d $s2, $sp, 704 # 8-byte Folded Reload - ld.d $t6, $sp, 696 # 8-byte Folded Reload - ld.d $a0, $sp, 688 # 8-byte Folded Reload - st.d $a0, $sp, 888 # 8-byte Folded Spill - ld.d $a0, $sp, 680 # 8-byte Folded Reload - st.d $a0, $sp, 880 # 8-byte Folded Spill - ld.d $a5, $sp, 672 # 8-byte Folded Reload - ld.d $s0, $sp, 664 # 8-byte Folded Reload - ld.d $a4, $sp, 656 # 8-byte Folded Reload - ld.d $s8, $sp, 864 # 8-byte Folded Reload + st.d $zero, $sp, 928 # 8-byte Folded Spill + ld.d $a0, $sp, 592 # 8-byte Folded Reload + ld.d $a1, $sp, 792 # 8-byte Folded Reload + mul.d $a0, $a0, $a1 + ld.d $a1, $sp, 544 # 8-byte Folded Reload + add.d $a1, $a1, $a0 + st.d $a1, $sp, 704 # 8-byte Folded Spill + ld.d $a1, $sp, 536 # 8-byte Folded Reload + add.d $a1, $a1, $a0 + st.d $a1, $sp, 696 # 8-byte Folded Spill + ld.d $a1, $sp, 528 # 8-byte Folded Reload + add.d $a1, $a1, $a0 + st.d $a1, $sp, 688 # 8-byte Folded Spill + ld.d $a1, $sp, 520 # 8-byte Folded Reload + add.d $a1, $a1, $a0 + st.d $a1, $sp, 680 # 8-byte Folded Spill + vinsgr2vr.w $vr5, $a0, 0 + vadd.w $vr5, $vr5, $vr3 + ld.d $t2, $sp, 784 # 8-byte Folded Reload + ld.d $t0, $sp, 776 # 8-byte Folded Reload + ld.d $s0, $sp, 768 # 8-byte Folded Reload + ld.d $s2, $sp, 760 # 8-byte Folded Reload + ld.d $s6, $sp, 752 # 8-byte Folded Reload + ld.d $a0, $sp, 744 # 8-byte Folded Reload + st.d $a0, $sp, 920 # 8-byte Folded Spill + ld.d $a0, $sp, 736 # 8-byte Folded Reload + st.d $a0, $sp, 912 # 8-byte Folded Spill + ld.d $fp, $sp, 728 # 8-byte Folded Reload + ld.d $a3, $sp, 720 # 8-byte Folded Reload + ld.d $a6, $sp, 712 # 8-byte Folded Reload + ld.d $t1, $sp, 896 # 8-byte Folded Reload b .LBB3_16 .p2align 4, , 16 .LBB3_15: # %._crit_edge # in Loop: Header=BB3_16 Depth=5 - addi.w $s8, $s8, 1 - ld.d $a0, $sp, 896 # 8-byte Folded Reload + addi.w $t1, $t1, 1 + ld.d $a0, $sp, 928 # 8-byte Folded Reload addi.d $a0, $a0, 1 - st.d $a0, $sp, 896 # 8-byte Folded Spill - ld.d $a0, $sp, 872 # 8-byte Folded Reload - add.w $a4, $a4, $a0 - add.w $s0, $s0, $a0 - add.w $a5, $a5, $a0 - ld.d $a2, $sp, 880 # 8-byte Folded Reload - add.w $a2, $a2, $a0 - st.d $a2, $sp, 880 # 8-byte Folded Spill - ld.d $a2, $sp, 888 # 8-byte Folded Reload - add.w $a2, $a2, $a0 - st.d $a2, $sp, 888 # 8-byte Folded Spill - add.d $t6, $t6, $a0 + st.d $a0, $sp, 928 # 8-byte Folded Spill + ld.d $a0, $sp, 904 # 8-byte Folded Reload + add.w $a6, $a6, $a0 + add.w $a3, $a3, $a0 + add.w $fp, $fp, $a0 + ld.d $a1, $sp, 912 # 8-byte Folded Reload + add.w $a1, $a1, $a0 + st.d $a1, $sp, 912 # 8-byte Folded Spill + ld.d $a1, $sp, 920 # 8-byte Folded Reload + add.w $a1, $a1, $a0 + st.d $a1, $sp, 920 # 8-byte Folded Spill + add.d $s6, $s6, $a0 add.d $s2, $s2, $a0 - add.d $s1, $s1, $a0 + add.d $s0, $s0, $a0 add.d $t0, $t0, $a0 - add.d $t1, $t1, $a0 - ld.d $a0, $sp, 856 # 8-byte Folded Reload - bge $s8, $a0, .LBB3_12 + add.d $t2, $t2, $a0 + ld.d $a0, $sp, 888 # 8-byte Folded Reload + bge $t1, $a0, .LBB3_12 .LBB3_16: # %.preheader # Parent Loop BB3_4 Depth=1 # Parent Loop BB3_7 Depth=2 # Parent Loop BB3_10 Depth=3 # Parent Loop BB3_13 Depth=4 # => This Loop Header: Depth=5 - # Child Loop BB3_41 Depth 6 + # Child Loop BB3_38 Depth 6 # Child Loop BB3_19 Depth 6 - ld.d $a0, $sp, 864 # 8-byte Folded Reload - bge $a0, $ra, .LBB3_15 + ld.d $a0, $sp, 896 # 8-byte Folded Reload + bge $a0, $s3, .LBB3_15 # %bb.17: # %.lr.ph # in Loop: Header=BB3_16 Depth=5 - ld.d $a3, $sp, 904 # 8-byte Folded Reload + ld.d $a5, $sp, 936 # 8-byte Folded Reload ori $a0, $zero, 8 - ld.d $a2, $sp, 840 # 8-byte Folded Reload - bgeu $a2, $a0, .LBB3_20 + ld.d $a1, $sp, 880 # 8-byte Folded Reload + bgeu $a1, $a0, .LBB3_20 .LBB3_18: # %scalar.ph.preheader # in Loop: Header=BB3_16 Depth=5 - move $a2, $t1 - move $t7, $t0 - move $t8, $s1 - move $s4, $s2 - move $s5, $t6 - move $fp, $a3 + move $a1, $t2 + move $a2, $t0 + move $s4, $s0 + move $s7, $s2 + move $ra, $s6 + move $s8, $a5 .p2align 4, , 16 .LBB3_19: # %scalar.ph # Parent Loop BB3_4 Depth=1 @@ -1367,371 +1389,361 @@ smooth: # @smooth # Parent Loop BB3_13 Depth=4 # Parent Loop BB3_16 Depth=5 # => This Inner Loop Header: Depth=6 - add.w $s3, $a3, $s5 - addi.w $s7, $s3, -1 - slli.d $a0, $s7, 3 - fldx.d $fa2, $a7, $a0 - alsl.d $s7, $s7, $a1, 3 - fldx.d $fa3, $a1, $a0 - slli.d $s3, $s3, 3 - fldx.d $fa4, $a1, $s3 - fld.d $fa5, $s7, -8 - fldx.d $fa6, $t2, $s3 - fldx.d $fa7, $t2, $a0 - fsub.d $fa4, $fa4, $fa3 - fsub.d $fa5, $fa3, $fa5 - fneg.d $fa5, $fa5 - fmul.d $fa5, $fa7, $fa5 - fmadd.d $fa4, $fa6, $fa4, $fa5 - add.w $s3, $a3, $s4 - slli.d $s3, $s3, 3 - fldx.d $fa5, $t3, $s3 - fldx.d $fa6, $a1, $s3 - add.w $s3, $a3, $t8 - slli.d $s3, $s3, 3 - fldx.d $fa7, $a1, $s3 - fsub.d $fa6, $fa6, $fa3 - fmadd.d $fa4, $fa5, $fa6, $fa4 - fldx.d $fa5, $t3, $a0 - fsub.d $fa6, $fa3, $fa7 - add.w $s3, $a3, $t7 - slli.d $s3, $s3, 3 - fldx.d $fa7, $a1, $s3 - fneg.d $fa5, $fa5 - fmadd.d $fa4, $fa5, $fa6, $fa4 - fldx.d $fa5, $t4, $s3 - fsub.d $fa6, $fa7, $fa3 - add.w $s3, $a3, $a2 - slli.d $s3, $s3, 3 - fldx.d $fa7, $a1, $s3 - fldx.d $ft0, $t4, $a0 - fmadd.d $fa4, $fa5, $fa6, $fa4 - fmul.d $fa2, $ft1, $fa2 - fsub.d $fa5, $fa3, $fa7 - fneg.d $fa6, $ft0 - fnmadd.d $fa4, $fa6, $fa5, $fa4 - fldx.d $fa5, $t5, $a0 - fldx.d $fa6, $s6, $a0 - pcalau12i $s3, %pc_hi20(.LCPI3_0) - fld.d $fa7, $s3, %pc_lo12(.LCPI3_0) - fmul.d $fa4, $fa0, $fa4 - fmadd.d $fa2, $fa2, $fa3, $fa4 - fsub.d $fa2, $fa2, $fa6 - fmul.d $fa4, $fa5, $fa7 - fmadd.d $fa2, $fa4, $fa2, $fa3 - fstx.d $fa2, $a6, $a0 - addi.d $fp, $fp, 1 - addi.d $s5, $s5, 1 + add.w $a0, $a5, $ra + addi.w $a4, $a0, -1 + slli.d $s5, $a4, 3 + fldx.d $fa6, $t5, $s5 + alsl.d $a4, $a4, $t3, 3 + fldx.d $fa7, $t3, $s5 + slli.d $a0, $a0, 3 + fldx.d $ft0, $t3, $a0 + fld.d $ft1, $a4, -8 + fldx.d $ft2, $t6, $a0 + fldx.d $ft3, $t6, $s5 + fsub.d $ft0, $ft0, $fa7 + fsub.d $ft1, $fa7, $ft1 + fneg.d $ft1, $ft1 + fmul.d $ft1, $ft3, $ft1 + fmadd.d $ft0, $ft2, $ft0, $ft1 + add.w $a0, $a5, $s7 + slli.d $a0, $a0, 3 + fldx.d $ft1, $t7, $a0 + fldx.d $ft2, $t3, $a0 + add.w $a0, $a5, $s4 + slli.d $a0, $a0, 3 + fldx.d $ft3, $t3, $a0 + fsub.d $ft2, $ft2, $fa7 + fmadd.d $ft0, $ft1, $ft2, $ft0 + fldx.d $ft1, $t7, $s5 + fsub.d $ft2, $fa7, $ft3 + add.w $a0, $a5, $a2 + slli.d $a0, $a0, 3 + fldx.d $ft3, $t3, $a0 + fneg.d $ft1, $ft1 + fmadd.d $ft0, $ft1, $ft2, $ft0 + fldx.d $ft1, $t8, $a0 + fsub.d $ft2, $ft3, $fa7 + add.w $a0, $a5, $a1 + slli.d $a0, $a0, 3 + fldx.d $ft3, $t3, $a0 + fldx.d $ft4, $t8, $s5 + fmadd.d $ft0, $ft1, $ft2, $ft0 + fmul.d $fa6, $ft5, $fa6 + fsub.d $ft1, $fa7, $ft3 + fneg.d $ft2, $ft4 + fnmadd.d $ft0, $ft2, $ft1, $ft0 + fldx.d $ft1, $s1, $s5 + fldx.d $ft2, $t4, $s5 + pcalau12i $a0, %pc_hi20(.LCPI3_0) + fld.d $ft3, $a0, %pc_lo12(.LCPI3_0) + fmul.d $ft0, $fa0, $ft0 + fmadd.d $fa6, $fa6, $fa7, $ft0 + fsub.d $fa6, $fa6, $ft2 + fmul.d $ft0, $ft1, $ft3 + fmadd.d $fa6, $ft0, $fa6, $fa7 + fstx.d $fa6, $a7, $s5 + addi.d $s8, $s8, 1 + addi.d $ra, $ra, 1 + addi.d $s7, $s7, 1 addi.d $s4, $s4, 1 - addi.d $t8, $t8, 1 - addi.d $t7, $t7, 1 addi.d $a2, $a2, 1 - blt $fp, $ra, .LBB3_19 + addi.d $a1, $a1, 1 + blt $s8, $s3, .LBB3_19 b .LBB3_15 .p2align 4, , 16 .LBB3_20: # %vector.scevcheck # in Loop: Header=BB3_16 Depth=5 - ld.d $a0, $sp, 872 # 8-byte Folded Reload - ld.d $a2, $sp, 896 # 8-byte Folded Reload - mul.d $a0, $a0, $a2 - ld.d $a2, $sp, 808 # 8-byte Folded Reload - add.w $s3, $a2, $a0 - ld.d $a2, $sp, 848 # 8-byte Folded Reload - add.w $t7, $s3, $a2 - ld.d $a3, $sp, 904 # 8-byte Folded Reload - blt $t7, $s3, .LBB3_18 + ld.d $a0, $sp, 904 # 8-byte Folded Reload + ld.d $a1, $sp, 928 # 8-byte Folded Reload + mul.d $s8, $a0, $a1 + vinsgr2vr.w $vr6, $s8, 0 + vadd.w $vr6, $vr6, $vr5 + vreplvei.w $vr6, $vr6, 0 + vadd.w $vr7, $vr6, $vr1 + vadd.w $vr6, $vr6, $vr4 + vslt.w $vr6, $vr6, $vr7 + vmskltz.w $vr6, $vr6 + vpickve2gr.hu $a1, $vr6, 0 + ld.d $a5, $sp, 936 # 8-byte Folded Reload + bnez $a1, .LBB3_18 # %bb.21: # %vector.scevcheck # in Loop: Header=BB3_16 Depth=5 - ld.d $a2, $sp, 760 # 8-byte Folded Reload - add.w $s7, $a2, $a0 - ld.d $a2, $sp, 848 # 8-byte Folded Reload - add.w $t7, $s7, $a2 - ld.d $a3, $sp, 904 # 8-byte Folded Reload - blt $t7, $s7, .LBB3_18 -# %bb.22: # %vector.scevcheck + ld.d $a0, $sp, 800 # 8-byte Folded Reload + srli.d $a1, $a0, 32 + ld.d $a5, $sp, 936 # 8-byte Folded Reload + bnez $a1, .LBB3_18 +# %bb.22: # %vector.memcheck # in Loop: Header=BB3_16 Depth=5 - ld.d $a2, $sp, 752 # 8-byte Folded Reload - add.w $t8, $a2, $a0 - ld.d $a2, $sp, 848 # 8-byte Folded Reload - add.w $t7, $t8, $a2 - ld.d $a3, $sp, 904 # 8-byte Folded Reload - blt $t7, $t8, .LBB3_18 -# %bb.23: # %vector.scevcheck + ld.d $a0, $sp, 704 # 8-byte Folded Reload + add.w $s4, $a0, $s8 + ld.d $a0, $sp, 672 # 8-byte Folded Reload + alsl.d $a1, $s4, $a0, 3 + ld.d $a0, $sp, 872 # 8-byte Folded Reload + add.d $s7, $a0, $s4 + ld.d $a0, $sp, 664 # 8-byte Folded Reload + alsl.d $ra, $s7, $a0, 3 + alsl.d $a2, $s4, $t5, 3 + ld.d $a0, $sp, 656 # 8-byte Folded Reload + alsl.d $a5, $s7, $a0, 3 + sltu $a5, $a1, $a5 + sltu $a2, $a2, $ra + and $a2, $a5, $a2 + ld.d $a5, $sp, 936 # 8-byte Folded Reload + bnez $a2, .LBB3_18 +# %bb.23: # %vector.memcheck # in Loop: Header=BB3_16 Depth=5 - ld.d $a2, $sp, 744 # 8-byte Folded Reload - add.w $fp, $a2, $a0 - ld.d $a0, $sp, 848 # 8-byte Folded Reload - add.w $a0, $fp, $a0 - ld.d $a3, $sp, 904 # 8-byte Folded Reload - blt $a0, $fp, .LBB3_18 -# %bb.24: # %vector.scevcheck + ld.d $a0, $sp, 648 # 8-byte Folded Reload + alsl.d $a2, $s4, $a0, 3 + ld.d $a0, $sp, 640 # 8-byte Folded Reload + alsl.d $a5, $s7, $a0, 3 + sltu $a5, $a1, $a5 + sltu $a2, $a2, $ra + and $a2, $a5, $a2 + ld.d $a5, $sp, 936 # 8-byte Folded Reload + bnez $a2, .LBB3_18 +# %bb.24: # %vector.memcheck # in Loop: Header=BB3_16 Depth=5 - ld.d $a0, $sp, 848 # 8-byte Folded Reload - srli.d $a0, $a0, 32 - ld.d $a3, $sp, 904 # 8-byte Folded Reload - bnez $a0, .LBB3_18 + ld.d $a0, $sp, 696 # 8-byte Folded Reload + add.w $a4, $a0, $s8 + ld.d $a0, $sp, 856 # 8-byte Folded Reload + alsl.d $a2, $a4, $a0, 3 + ld.d $a0, $sp, 872 # 8-byte Folded Reload + add.d $a5, $a0, $a4 + ld.d $a0, $sp, 864 # 8-byte Folded Reload + st.d $a5, $sp, 816 # 8-byte Folded Spill + alsl.d $a5, $a5, $a0, 3 + sltu $a5, $a1, $a5 + sltu $a2, $a2, $ra + and $a2, $a5, $a2 + ld.d $a5, $sp, 936 # 8-byte Folded Reload + bnez $a2, .LBB3_18 # %bb.25: # %vector.memcheck # in Loop: Header=BB3_16 Depth=5 - ld.d $a0, $sp, 648 # 8-byte Folded Reload - alsl.d $t7, $t8, $a0, 3 - ld.d $a0, $sp, 832 # 8-byte Folded Reload - add.d $s4, $a0, $t8 - ld.d $a0, $sp, 640 # 8-byte Folded Reload - alsl.d $s5, $s4, $a0, 3 - alsl.d $a0, $t8, $a7, 3 - ld.d $a2, $sp, 632 # 8-byte Folded Reload - alsl.d $a3, $s4, $a2, 3 - sltu $a3, $t7, $a3 - sltu $a0, $a0, $s5 - and $a0, $a3, $a0 - ld.d $a3, $sp, 904 # 8-byte Folded Reload - bnez $a0, .LBB3_18 + ld.d $a0, $sp, 632 # 8-byte Folded Reload + alsl.d $a2, $s4, $a0, 3 + ld.d $a0, $sp, 624 # 8-byte Folded Reload + alsl.d $a5, $s7, $a0, 3 + sltu $a5, $a1, $a5 + sltu $a2, $a2, $ra + and $a2, $a5, $a2 + ld.d $a5, $sp, 936 # 8-byte Folded Reload + bnez $a2, .LBB3_18 # %bb.26: # %vector.memcheck # in Loop: Header=BB3_16 Depth=5 - ld.d $a0, $sp, 624 # 8-byte Folded Reload - alsl.d $a0, $t8, $a0, 3 - ld.d $a2, $sp, 616 # 8-byte Folded Reload - alsl.d $a3, $s4, $a2, 3 - sltu $a3, $t7, $a3 - sltu $a0, $a0, $s5 - and $a0, $a3, $a0 - ld.d $a3, $sp, 904 # 8-byte Folded Reload - bnez $a0, .LBB3_18 + ld.d $a0, $sp, 688 # 8-byte Folded Reload + add.w $a5, $a0, $s8 + ld.d $a0, $sp, 856 # 8-byte Folded Reload + alsl.d $a2, $a5, $a0, 3 + ld.d $a0, $sp, 872 # 8-byte Folded Reload + st.d $a5, $sp, 832 # 8-byte Folded Spill + add.d $a5, $a0, $a5 + ld.d $a0, $sp, 864 # 8-byte Folded Reload + st.d $a5, $sp, 824 # 8-byte Folded Spill + alsl.d $a5, $a5, $a0, 3 + sltu $a5, $a1, $a5 + sltu $a2, $a2, $ra + and $a2, $a5, $a2 + ld.d $a5, $sp, 936 # 8-byte Folded Reload + bnez $a2, .LBB3_18 # %bb.27: # %vector.memcheck # in Loop: Header=BB3_16 Depth=5 - ld.d $a0, $sp, 816 # 8-byte Folded Reload - alsl.d $a0, $s3, $a0, 3 - ld.d $a2, $sp, 832 # 8-byte Folded Reload - add.d $a3, $a2, $s3 - ld.d $a2, $sp, 824 # 8-byte Folded Reload - st.d $a3, $sp, 776 # 8-byte Folded Spill - alsl.d $a3, $a3, $a2, 3 - sltu $a3, $t7, $a3 - sltu $a0, $a0, $s5 - and $a0, $a3, $a0 - ld.d $a3, $sp, 904 # 8-byte Folded Reload - bnez $a0, .LBB3_18 + ld.d $a0, $sp, 616 # 8-byte Folded Reload + alsl.d $a2, $s4, $a0, 3 + ld.d $a0, $sp, 864 # 8-byte Folded Reload + alsl.d $a5, $s7, $a0, 3 + sltu $a5, $a1, $a5 + sltu $a2, $a2, $ra + and $a2, $a5, $a2 + ld.d $a5, $sp, 936 # 8-byte Folded Reload + bnez $a2, .LBB3_18 # %bb.28: # %vector.memcheck # in Loop: Header=BB3_16 Depth=5 - ld.d $a0, $sp, 608 # 8-byte Folded Reload - alsl.d $a0, $t8, $a0, 3 - ld.d $a2, $sp, 600 # 8-byte Folded Reload - alsl.d $a3, $s4, $a2, 3 - sltu $a3, $t7, $a3 - sltu $a0, $a0, $s5 - and $a0, $a3, $a0 - ld.d $a3, $sp, 904 # 8-byte Folded Reload - bnez $a0, .LBB3_18 + ld.d $a0, $sp, 680 # 8-byte Folded Reload + add.w $s8, $a0, $s8 + ld.d $a0, $sp, 856 # 8-byte Folded Reload + alsl.d $a5, $s8, $a0, 3 + ld.d $a0, $sp, 872 # 8-byte Folded Reload + add.d $a2, $a0, $s8 + ld.d $a0, $sp, 864 # 8-byte Folded Reload + alsl.d $s5, $a2, $a0, 3 + sltu $s5, $a1, $s5 + sltu $a5, $a5, $ra + and $s5, $s5, $a5 + ld.d $a5, $sp, 936 # 8-byte Folded Reload + bnez $s5, .LBB3_18 # %bb.29: # %vector.memcheck # in Loop: Header=BB3_16 Depth=5 - ld.d $a0, $sp, 816 # 8-byte Folded Reload - alsl.d $a0, $s7, $a0, 3 - ld.d $a2, $sp, 832 # 8-byte Folded Reload - add.d $a3, $a2, $s7 - ld.d $a2, $sp, 824 # 8-byte Folded Reload - st.d $a3, $sp, 784 # 8-byte Folded Spill - alsl.d $a3, $a3, $a2, 3 - sltu $a3, $t7, $a3 - sltu $a0, $a0, $s5 - and $a0, $a3, $a0 - ld.d $a3, $sp, 904 # 8-byte Folded Reload - bnez $a0, .LBB3_18 + alsl.d $a5, $s4, $t6, 3 + ld.d $a0, $sp, 848 # 8-byte Folded Reload + alsl.d $s5, $s7, $a0, 3 + sltu $s5, $a1, $s5 + sltu $a5, $a5, $ra + and $s5, $s5, $a5 + ld.d $a5, $sp, 936 # 8-byte Folded Reload + bnez $s5, .LBB3_18 # %bb.30: # %vector.memcheck # in Loop: Header=BB3_16 Depth=5 - ld.d $a0, $sp, 592 # 8-byte Folded Reload - alsl.d $a0, $t8, $a0, 3 - ld.d $a2, $sp, 824 # 8-byte Folded Reload - alsl.d $a3, $s4, $a2, 3 - sltu $a3, $t7, $a3 - sltu $a0, $a0, $s5 - and $a0, $a3, $a0 - ld.d $a3, $sp, 904 # 8-byte Folded Reload - bnez $a0, .LBB3_18 + alsl.d $a5, $s8, $t6, 3 + ld.d $a0, $sp, 848 # 8-byte Folded Reload + alsl.d $a2, $a2, $a0, 3 + sltu $a2, $a1, $a2 + sltu $a5, $a5, $ra + and $a2, $a2, $a5 + ld.d $a5, $sp, 936 # 8-byte Folded Reload + bnez $a2, .LBB3_18 # %bb.31: # %vector.memcheck # in Loop: Header=BB3_16 Depth=5 - ld.d $a0, $sp, 816 # 8-byte Folded Reload - alsl.d $a3, $fp, $a0, 3 - ld.d $a0, $sp, 832 # 8-byte Folded Reload - add.d $a0, $a0, $fp - ld.d $a2, $sp, 824 # 8-byte Folded Reload - alsl.d $a2, $a0, $a2, 3 - sltu $a2, $t7, $a2 - sltu $a3, $a3, $s5 - and $a2, $a2, $a3 - ld.d $a3, $sp, 904 # 8-byte Folded Reload + alsl.d $a2, $s4, $t7, 3 + ld.d $a0, $sp, 840 # 8-byte Folded Reload + alsl.d $a5, $s7, $a0, 3 + sltu $a5, $a1, $a5 + sltu $a2, $a2, $ra + and $a2, $a5, $a2 + ld.d $a5, $sp, 936 # 8-byte Folded Reload bnez $a2, .LBB3_18 # %bb.32: # %vector.memcheck # in Loop: Header=BB3_16 Depth=5 - alsl.d $a2, $t8, $t2, 3 - ld.d $a3, $sp, 800 # 8-byte Folded Reload - alsl.d $a3, $s4, $a3, 3 - sltu $a3, $t7, $a3 - sltu $a2, $a2, $s5 - and $a2, $a3, $a2 - ld.d $a3, $sp, 904 # 8-byte Folded Reload + ld.d $a0, $sp, 832 # 8-byte Folded Reload + alsl.d $a2, $a0, $t7, 3 + ld.d $a0, $sp, 840 # 8-byte Folded Reload + ld.d $a5, $sp, 824 # 8-byte Folded Reload + alsl.d $a5, $a5, $a0, 3 + sltu $a5, $a1, $a5 + sltu $a2, $a2, $ra + and $a2, $a5, $a2 + ld.d $a5, $sp, 936 # 8-byte Folded Reload bnez $a2, .LBB3_18 # %bb.33: # %vector.memcheck # in Loop: Header=BB3_16 Depth=5 - alsl.d $a2, $fp, $t2, 3 - ld.d $a3, $sp, 800 # 8-byte Folded Reload - alsl.d $a0, $a0, $a3, 3 - sltu $a0, $t7, $a0 - sltu $a2, $a2, $s5 - and $a0, $a0, $a2 - ld.d $a3, $sp, 904 # 8-byte Folded Reload - bnez $a0, .LBB3_18 + alsl.d $a2, $s4, $t8, 3 + ld.d $a0, $sp, 808 # 8-byte Folded Reload + alsl.d $a5, $s7, $a0, 3 + sltu $a5, $a1, $a5 + sltu $a2, $a2, $ra + and $a2, $a5, $a2 + ld.d $a5, $sp, 936 # 8-byte Folded Reload + bnez $a2, .LBB3_18 # %bb.34: # %vector.memcheck # in Loop: Header=BB3_16 Depth=5 - alsl.d $a0, $t8, $t3, 3 - ld.d $a2, $sp, 792 # 8-byte Folded Reload - alsl.d $a2, $s4, $a2, 3 - sltu $a2, $t7, $a2 - sltu $a0, $a0, $s5 - and $a0, $a2, $a0 - ld.d $a3, $sp, 904 # 8-byte Folded Reload - bnez $a0, .LBB3_18 + alsl.d $a2, $a4, $t8, 3 + ld.d $a0, $sp, 808 # 8-byte Folded Reload + ld.d $a4, $sp, 816 # 8-byte Folded Reload + alsl.d $a5, $a4, $a0, 3 + sltu $a5, $a1, $a5 + sltu $a2, $a2, $ra + and $a2, $a5, $a2 + ld.d $a5, $sp, 936 # 8-byte Folded Reload + bnez $a2, .LBB3_18 # %bb.35: # %vector.memcheck # in Loop: Header=BB3_16 Depth=5 - alsl.d $a0, $s7, $t3, 3 - ld.d $a2, $sp, 792 # 8-byte Folded Reload - ld.d $a3, $sp, 784 # 8-byte Folded Reload - alsl.d $a2, $a3, $a2, 3 - sltu $a2, $t7, $a2 - sltu $a0, $a0, $s5 - and $a0, $a2, $a0 - ld.d $a3, $sp, 904 # 8-byte Folded Reload - bnez $a0, .LBB3_18 + alsl.d $a2, $s4, $s1, 3 + ld.d $a0, $sp, 600 # 8-byte Folded Reload + alsl.d $a5, $s7, $a0, 3 + sltu $a5, $a1, $a5 + sltu $a2, $a2, $ra + and $a2, $a5, $a2 + ld.d $a5, $sp, 936 # 8-byte Folded Reload + bnez $a2, .LBB3_18 # %bb.36: # %vector.memcheck # in Loop: Header=BB3_16 Depth=5 - alsl.d $a0, $t8, $t4, 3 - ld.d $a2, $sp, 768 # 8-byte Folded Reload - alsl.d $a2, $s4, $a2, 3 - sltu $a2, $t7, $a2 - sltu $a0, $a0, $s5 - and $a0, $a2, $a0 - ld.d $a3, $sp, 904 # 8-byte Folded Reload - bnez $a0, .LBB3_18 -# %bb.37: # %vector.memcheck - # in Loop: Header=BB3_16 Depth=5 - alsl.d $a0, $s3, $t4, 3 - ld.d $a2, $sp, 768 # 8-byte Folded Reload - ld.d $a3, $sp, 776 # 8-byte Folded Reload - alsl.d $a2, $a3, $a2, 3 - sltu $a2, $t7, $a2 - sltu $a0, $a0, $s5 - and $a0, $a2, $a0 - ld.d $a3, $sp, 904 # 8-byte Folded Reload - bnez $a0, .LBB3_18 -# %bb.38: # %vector.memcheck - # in Loop: Header=BB3_16 Depth=5 - alsl.d $a0, $t8, $t5, 3 - ld.d $a2, $sp, 576 # 8-byte Folded Reload - alsl.d $a2, $s4, $a2, 3 - sltu $a2, $t7, $a2 - sltu $a0, $a0, $s5 - and $a0, $a2, $a0 - ld.d $a3, $sp, 904 # 8-byte Folded Reload - bnez $a0, .LBB3_18 -# %bb.39: # %vector.memcheck - # in Loop: Header=BB3_16 Depth=5 - alsl.d $a0, $t8, $s6, 3 - ld.d $a2, $sp, 560 # 8-byte Folded Reload - alsl.d $a2, $s4, $a2, 3 - sltu $a2, $t7, $a2 - sltu $a0, $a0, $s5 - and $a0, $a2, $a0 - ld.d $a3, $sp, 904 # 8-byte Folded Reload - bnez $a0, .LBB3_18 -# %bb.40: # %vector.ph + alsl.d $a2, $s4, $t4, 3 + ld.d $a0, $sp, 584 # 8-byte Folded Reload + alsl.d $a5, $s7, $a0, 3 + sltu $a1, $a1, $a5 + sltu $a2, $a2, $ra + and $a1, $a1, $a2 + ld.d $a5, $sp, 936 # 8-byte Folded Reload + bnez $a1, .LBB3_18 +# %bb.37: # %vector.ph # in Loop: Header=BB3_16 Depth=5 - ld.d $a2, $sp, 544 # 8-byte Folded Reload - ld.d $a3, $sp, 888 # 8-byte Folded Reload - ld.d $t7, $sp, 880 # 8-byte Folded Reload - move $t8, $a5 - move $fp, $s0 - move $s4, $a4 - .p2align 4, , 16 -.LBB3_41: # %vector.body + ld.d $a1, $sp, 568 # 8-byte Folded Reload + ld.d $a2, $sp, 920 # 8-byte Folded Reload + ld.d $a5, $sp, 912 # 8-byte Folded Reload + move $s4, $fp + move $s7, $a3 + move $s8, $a6 + .p2align 4, , 16 +.LBB3_38: # %vector.body # Parent Loop BB3_4 Depth=1 # Parent Loop BB3_7 Depth=2 # Parent Loop BB3_10 Depth=3 # Parent Loop BB3_13 Depth=4 # Parent Loop BB3_16 Depth=5 # => This Inner Loop Header: Depth=6 + slli.d $s5, $s8, 3 + vldx $vr6, $t5, $s5 + alsl.d $ra, $s8, $t3, 3 + vldx $vr7, $t3, $s5 + addi.w $a0, $s8, 1 + slli.d $a0, $a0, 3 + vldx $vr8, $t3, $a0 + vld $vr9, $ra, -8 + vldx $vr10, $t6, $a0 + vldx $vr11, $t6, $s5 + vfsub.d $vr8, $vr8, $vr7 + vfsub.d $vr9, $vr7, $vr9 + vbitrevi.d $vr9, $vr9, 63 + vfmul.d $vr9, $vr11, $vr9 + vfmadd.d $vr8, $vr10, $vr8, $vr9 + slli.d $a0, $s7, 3 + vldx $vr9, $t3, $a0 + vldx $vr10, $t7, $a0 slli.d $a0, $s4, 3 - vldx $vr2, $a7, $a0 - alsl.d $s5, $s4, $a1, 3 - vldx $vr3, $a1, $a0 - addi.w $s3, $s4, 1 - slli.d $s3, $s3, 3 - vldx $vr4, $a1, $s3 - vld $vr5, $s5, -8 - vldx $vr6, $t2, $s3 - vldx $vr7, $t2, $a0 - vfsub.d $vr4, $vr4, $vr3 - vfsub.d $vr5, $vr3, $vr5 - vbitrevi.d $vr5, $vr5, 63 - vfmul.d $vr5, $vr7, $vr5 - vfmadd.d $vr4, $vr6, $vr4, $vr5 - slli.d $s3, $fp, 3 - vldx $vr5, $a1, $s3 - vldx $vr6, $t3, $s3 - slli.d $s3, $t8, 3 - vldx $vr7, $a1, $s3 - vfsub.d $vr5, $vr5, $vr3 - vfmadd.d $vr4, $vr6, $vr5, $vr4 - vldx $vr5, $t3, $a0 - vfsub.d $vr6, $vr3, $vr7 - slli.d $s3, $t7, 3 - vldx $vr7, $a1, $s3 - vbitrevi.d $vr5, $vr5, 63 - vfmadd.d $vr4, $vr5, $vr6, $vr4 - vldx $vr5, $t4, $s3 - vfsub.d $vr6, $vr7, $vr3 - slli.d $s3, $a3, 3 - vldx $vr7, $a1, $s3 - vldx $vr8, $t4, $a0 - vfmadd.d $vr4, $vr5, $vr6, $vr4 - vfmul.d $vr2, $vr10, $vr2 - vfsub.d $vr5, $vr3, $vr7 - vbitrevi.d $vr6, $vr8, 63 - vfnmadd.d $vr4, $vr6, $vr5, $vr4 - vldx $vr5, $s6, $a0 - vldx $vr6, $t5, $a0 - vfmul.d $vr4, $vr1, $vr4 - vfmadd.d $vr2, $vr2, $vr3, $vr4 - vfsub.d $vr2, $vr2, $vr5 - vfmul.d $vr4, $vr6, $vr11 - vfmadd.d $vr2, $vr4, $vr2, $vr3 - vstx $vr2, $a6, $a0 + vldx $vr11, $t3, $a0 + vfsub.d $vr9, $vr9, $vr7 + vfmadd.d $vr8, $vr10, $vr9, $vr8 + vldx $vr9, $t7, $s5 + vfsub.d $vr10, $vr7, $vr11 + slli.d $a0, $a5, 3 + vldx $vr11, $t3, $a0 + vbitrevi.d $vr9, $vr9, 63 + vfmadd.d $vr8, $vr9, $vr10, $vr8 + vldx $vr9, $t8, $a0 + vfsub.d $vr10, $vr11, $vr7 + slli.d $a0, $a2, 3 + vldx $vr11, $t3, $a0 + vldx $vr12, $t8, $s5 + vfmadd.d $vr8, $vr9, $vr10, $vr8 + vfmul.d $vr6, $vr14, $vr6 + vfsub.d $vr9, $vr7, $vr11 + vbitrevi.d $vr10, $vr12, 63 + vfnmadd.d $vr8, $vr10, $vr9, $vr8 + vldx $vr9, $t4, $s5 + vldx $vr10, $s1, $s5 + vfmul.d $vr8, $vr2, $vr8 + vfmadd.d $vr6, $vr6, $vr7, $vr8 + vfsub.d $vr6, $vr6, $vr9 + vfmul.d $vr8, $vr10, $vr16 + vfmadd.d $vr6, $vr8, $vr6, $vr7 + vstx $vr6, $a7, $s5 + addi.w $s8, $s8, 2 + addi.w $s7, $s7, 2 addi.w $s4, $s4, 2 - addi.w $fp, $fp, 2 - addi.w $t8, $t8, 2 - addi.w $t7, $t7, 2 - addi.d $a2, $a2, -2 - addi.w $a3, $a3, 2 - bnez $a2, .LBB3_41 -# %bb.42: # %middle.block + addi.w $a5, $a5, 2 + addi.d $a1, $a1, -2 + addi.w $a2, $a2, 2 + bnez $a1, .LBB3_38 +# %bb.39: # %middle.block # in Loop: Header=BB3_16 Depth=5 - ld.d $a3, $sp, 528 # 8-byte Folded Reload - ld.d $a0, $sp, 840 # 8-byte Folded Reload - ld.d $a2, $sp, 536 # 8-byte Folded Reload - beq $a0, $a2, .LBB3_15 + ld.d $a5, $sp, 552 # 8-byte Folded Reload + ld.d $a0, $sp, 880 # 8-byte Folded Reload + ld.d $a1, $sp, 560 # 8-byte Folded Reload + beq $a0, $a1, .LBB3_15 b .LBB3_18 -.LBB3_43: - fld.d $fs0, $sp, 912 # 8-byte Folded Reload - ld.d $s8, $sp, 920 # 8-byte Folded Reload - ld.d $s7, $sp, 928 # 8-byte Folded Reload - ld.d $s6, $sp, 936 # 8-byte Folded Reload - ld.d $s5, $sp, 944 # 8-byte Folded Reload - ld.d $s4, $sp, 952 # 8-byte Folded Reload - ld.d $s3, $sp, 960 # 8-byte Folded Reload - ld.d $s2, $sp, 968 # 8-byte Folded Reload - ld.d $s1, $sp, 976 # 8-byte Folded Reload - ld.d $s0, $sp, 984 # 8-byte Folded Reload - ld.d $fp, $sp, 992 # 8-byte Folded Reload - ld.d $ra, $sp, 1000 # 8-byte Folded Reload - addi.d $sp, $sp, 1008 +.LBB3_40: + fld.d $fs0, $sp, 944 # 8-byte Folded Reload + ld.d $s8, $sp, 952 # 8-byte Folded Reload + ld.d $s7, $sp, 960 # 8-byte Folded Reload + ld.d $s6, $sp, 968 # 8-byte Folded Reload + ld.d $s5, $sp, 976 # 8-byte Folded Reload + ld.d $s4, $sp, 984 # 8-byte Folded Reload + ld.d $s3, $sp, 992 # 8-byte Folded Reload + ld.d $s2, $sp, 1000 # 8-byte Folded Reload + ld.d $s1, $sp, 1008 # 8-byte Folded Reload + ld.d $s0, $sp, 1016 # 8-byte Folded Reload + ld.d $fp, $sp, 1024 # 8-byte Folded Reload + ld.d $ra, $sp, 1032 # 8-byte Folded Reload + addi.d $sp, $sp, 1040 ret .Lfunc_end3: .size smooth, .Lfunc_end3-smooth diff --git a/results/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/CMakeFiles/miniGMG.dir/solver.s b/results/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/CMakeFiles/miniGMG.dir/solver.s index acc4501a..82a1ec3d 100644 --- a/results/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/CMakeFiles/miniGMG.dir/solver.s +++ b/results/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/CMakeFiles/miniGMG.dir/solver.s @@ -3588,47 +3588,52 @@ CABiCGStab: # @CABiCGStab ori $a1, $a1, 2448 add.d $a1, $sp, $a1 vld $vr2, $a1, 0 - vfadd.d $vr13, $vr1, $vr0 + vfadd.d $vr14, $vr1, $vr0 + vld $vr0, $sp, 944 # 16-byte Folded Reload + vfmul.d $vr0, $vr11, $vr0 lu12i.w $a1, 1 ori $a1, $a1, 2464 add.d $a1, $sp, $a1 - vld $vr0, $a1, 0 - vld $vr1, $sp, 944 # 16-byte Folded Reload - vfmul.d $vr1, $vr11, $vr1 - vfadd.d $vr12, $vr2, $vr1 - vld $vr1, $sp, 928 # 16-byte Folded Reload - vfmul.d $vr1, $vr11, $vr1 - vfadd.d $vr10, $vr0, $vr1 + vld $vr1, $a1, 0 + vfadd.d $vr13, $vr2, $vr0 lu12i.w $a1, 1 - ori $a1, $a1, 2480 + ori $a1, $a1, 2776 add.d $a1, $sp, $a1 fld.d $fa0, $a1, 0 + vld $vr2, $sp, 928 # 16-byte Folded Reload + vfmul.d $vr2, $vr11, $vr2 + vfadd.d $vr12, $vr1, $vr2 lu12i.w $a1, 1 - ori $a1, $a1, 2776 + ori $a1, $a1, 2480 add.d $a1, $sp, $a1 vld $vr1, $a1, 0 + vextrins.d $vr20, $vr0, 16 + lu12i.w $a1, 1 + ori $a1, $a1, 2784 + add.d $a1, $sp, $a1 + vld $vr0, $a1, 0 lu12i.w $a1, 1 - ori $a1, $a1, 2488 + ori $a1, $a1, 2496 add.d $a1, $sp, $a1 vld $vr2, $a1, 0 - fmul.d $fa3, $ft15, $ft12 - fadd.d $ft6, $fa0, $fa3 - vfmul.d $vr0, $vr11, $vr1 + vfmul.d $vr3, $vr11, $vr20 + vfadd.d $vr10, $vr1, $vr3 + vfmul.d $vr0, $vr11, $vr0 vfadd.d $vr9, $vr2, $vr0 lu12i.w $a1, 1 - ori $a1, $a1, 2792 + ori $a1, $a1, 2800 add.d $a1, $sp, $a1 vld $vr0, $a1, 0 lu12i.w $a1, 1 - ori $a1, $a1, 2504 + ori $a1, $a1, 2512 add.d $a1, $sp, $a1 vld $vr1, $a1, 0 lu12i.w $a1, 1 - ori $a1, $a1, 2808 + ori $a1, $a1, 2816 add.d $a1, $sp, $a1 vld $vr2, $a1, 0 lu12i.w $a1, 1 - ori $a1, $a1, 2520 + ori $a1, $a1, 2528 add.d $a1, $sp, $a1 vld $vr3, $a1, 0 vfmul.d $vr0, $vr11, $vr0 @@ -3636,25 +3641,25 @@ CABiCGStab: # @CABiCGStab vfmul.d $vr0, $vr11, $vr2 vfadd.d $vr7, $vr3, $vr0 lu12i.w $a1, 1 - ori $a1, $a1, 2824 + ori $a1, $a1, 2832 add.d $a1, $sp, $a1 vld $vr0, $a1, 0 lu12i.w $a1, 1 - ori $a1, $a1, 2536 + ori $a1, $a1, 2544 add.d $a1, $sp, $a1 vld $vr1, $a1, 0 lu12i.w $a1, 1 - ori $a1, $a1, 2840 + ori $a1, $a1, 2848 add.d $a1, $sp, $a1 - vld $vr2, $a1, 0 + fld.d $fa2, $a1, 0 lu12i.w $a1, 1 - ori $a1, $a1, 2552 + ori $a1, $a1, 2560 add.d $a1, $sp, $a1 - vld $vr3, $a1, 0 + fld.d $fa3, $a1, 0 vfmul.d $vr0, $vr11, $vr0 - vfadd.d $vr18, $vr1, $vr0 - vfmul.d $vr0, $vr11, $vr2 - vfadd.d $vr17, $vr3, $vr0 + vfadd.d $vr17, $vr1, $vr0 + fmul.d $fa0, $ft15, $fa2 + fadd.d $ft10, $fa3, $fa0 lu12i.w $a1, 1 ori $a1, $a1, 2288 add.d $a1, $sp, $a1 @@ -3790,39 +3795,39 @@ CABiCGStab: # @CABiCGStab vst $vr1, $sp, 432 # 16-byte Folded Spill vreplvei.d $vr1, $vr4, 1 vst $vr1, $sp, 416 # 16-byte Folded Spill - vst $vr13, $sp, 256 # 16-byte Folded Spill + vst $vr14, $sp, 256 # 16-byte Folded Spill lu12i.w $a1, 1 ori $a1, $a1, 2432 add.d $a1, $sp, $a1 - vst $vr13, $a1, 0 - vst $vr12, $sp, 272 # 16-byte Folded Spill + vst $vr14, $a1, 0 + vst $vr13, $sp, 272 # 16-byte Folded Spill lu12i.w $a1, 1 ori $a1, $a1, 2448 add.d $a1, $sp, $a1 - vst $vr12, $a1, 0 - vst $vr10, $sp, 288 # 16-byte Folded Spill + vst $vr13, $a1, 0 + vst $vr12, $sp, 288 # 16-byte Folded Spill lu12i.w $a1, 1 ori $a1, $a1, 2464 add.d $a1, $sp, $a1 - vst $vr10, $a1, 0 - fst.d $ft6, $sp, 248 # 8-byte Folded Spill + vst $vr12, $a1, 0 + vst $vr10, $sp, 304 # 16-byte Folded Spill lu12i.w $a1, 1 ori $a1, $a1, 2480 add.d $a1, $sp, $a1 - fst.d $ft6, $a1, 0 - vst $vr9, $sp, 304 # 16-byte Folded Spill + vst $vr10, $a1, 0 + vst $vr9, $sp, 320 # 16-byte Folded Spill lu12i.w $a1, 1 - ori $a1, $a1, 2488 + ori $a1, $a1, 2496 add.d $a1, $sp, $a1 vst $vr9, $a1, 0 - vst $vr8, $sp, 320 # 16-byte Folded Spill + vst $vr8, $sp, 336 # 16-byte Folded Spill lu12i.w $a1, 1 - ori $a1, $a1, 2504 + ori $a1, $a1, 2512 add.d $a1, $sp, $a1 vst $vr8, $a1, 0 - vst $vr7, $sp, 336 # 16-byte Folded Spill + vst $vr7, $sp, 352 # 16-byte Folded Spill lu12i.w $a1, 1 - ori $a1, $a1, 2520 + ori $a1, $a1, 2528 add.d $a1, $sp, $a1 vst $vr7, $a1, 0 lu12i.w $a1, 2 @@ -3954,16 +3959,16 @@ CABiCGStab: # @CABiCGStab ori $a1, $a1, 3640 add.d $a1, $sp, $a1 fld.d $fa3, $a1, 0 - vst $vr18, $sp, 352 # 16-byte Folded Spill - lu12i.w $a1, 1 - ori $a1, $a1, 2536 - add.d $a1, $sp, $a1 - vst $vr18, $a1, 0 vst $vr17, $sp, 368 # 16-byte Folded Spill lu12i.w $a1, 1 - ori $a1, $a1, 2552 + ori $a1, $a1, 2544 add.d $a1, $sp, $a1 vst $vr17, $a1, 0 + fst.d $ft10, $sp, 248 # 8-byte Folded Spill + lu12i.w $a1, 1 + ori $a1, $a1, 2560 + add.d $a1, $sp, $a1 + fst.d $ft10, $a1, 0 lu12i.w $a1, 2 ori $a1, $a1, 3648 add.d $a1, $sp, $a1 @@ -4214,7 +4219,7 @@ CABiCGStab: # @CABiCGStab # %bb.25: # %.lr.ph.i399.preheader # in Loop: Header=BB1_8 Depth=2 lu12i.w $a0, 1 - ori $a0, $a0, 2624 + ori $a0, $a0, 2704 add.d $a0, $sp, $a0 fld.d $fa1, $a0, 0 move $a0, $zero @@ -4261,132 +4266,128 @@ CABiCGStab: # @CABiCGStab vld $vr7, $a1, 0 vfadd.d $vr3, $vr3, $vr5 lu12i.w $a1, 1 + ori $a1, $a1, 2320 + add.d $a1, $sp, $a1 + vld $vr5, $a1, 0 + lu12i.w $a1, 1 ori $a1, $a1, 2448 add.d $a1, $sp, $a1 vst $vr3, $a1, 0 + vfmul.d $vr3, $vr1, $vr7 + vld $vr7, $sp, 288 # 16-byte Folded Reload + vfadd.d $vr3, $vr7, $vr3 + vfmul.d $vr5, $vr6, $vr5 lu12i.w $a1, 1 - ori $a1, $a1, 2320 + ori $a1, $a1, 2624 add.d $a1, $sp, $a1 - vld $vr3, $a1, 0 - vfmul.d $vr5, $vr1, $vr7 - vld $vr7, $sp, 288 # 16-byte Folded Reload - vfadd.d $vr5, $vr7, $vr5 + vld $vr7, $a1, 0 + vfadd.d $vr3, $vr3, $vr5 lu12i.w $a1, 1 ori $a1, $a1, 2336 add.d $a1, $sp, $a1 - fld.d $fa7, $a1, 0 - vfmul.d $vr3, $vr6, $vr3 - vfadd.d $vr3, $vr5, $vr3 + vld $vr5, $a1, 0 lu12i.w $a1, 1 ori $a1, $a1, 2464 add.d $a1, $sp, $a1 vst $vr3, $a1, 0 - fmul.d $fa3, $fa4, $fa7 + vfmul.d $vr3, $vr1, $vr7 + vld $vr7, $sp, 304 # 16-byte Folded Reload + vfadd.d $vr3, $vr7, $vr3 + vfmul.d $vr5, $vr6, $vr5 lu12i.w $a1, 1 - ori $a1, $a1, 2632 + ori $a1, $a1, 2640 add.d $a1, $sp, $a1 - vld $vr4, $a1, 0 - fadd.d $fa2, $fa2, $fa3 + vld $vr7, $a1, 0 + vfadd.d $vr3, $vr3, $vr5 lu12i.w $a1, 1 - ori $a1, $a1, 2344 + ori $a1, $a1, 2352 add.d $a1, $sp, $a1 - vld $vr3, $a1, 0 + vld $vr5, $a1, 0 lu12i.w $a1, 1 ori $a1, $a1, 2480 add.d $a1, $sp, $a1 - fst.d $fa2, $a1, 0 - vfmul.d $vr2, $vr1, $vr4 - vld $vr4, $sp, 304 # 16-byte Folded Reload - vfadd.d $vr2, $vr4, $vr2 - vfmul.d $vr3, $vr6, $vr3 - lu12i.w $a1, 1 - ori $a1, $a1, 2648 - add.d $a1, $sp, $a1 - vld $vr4, $a1, 0 - vfadd.d $vr2, $vr2, $vr3 - lu12i.w $a1, 1 - ori $a1, $a1, 2360 - add.d $a1, $sp, $a1 - vld $vr3, $a1, 0 + vst $vr3, $a1, 0 + vfmul.d $vr3, $vr1, $vr7 + vld $vr7, $sp, 320 # 16-byte Folded Reload + vfadd.d $vr3, $vr7, $vr3 + vfmul.d $vr5, $vr6, $vr5 lu12i.w $a1, 1 - ori $a1, $a1, 2488 + ori $a1, $a1, 2656 add.d $a1, $sp, $a1 - vst $vr2, $a1, 0 - vfmul.d $vr2, $vr1, $vr4 - vld $vr4, $sp, 320 # 16-byte Folded Reload - vfadd.d $vr2, $vr4, $vr2 - vfmul.d $vr3, $vr6, $vr3 + vld $vr7, $a1, 0 + vfadd.d $vr3, $vr3, $vr5 lu12i.w $a1, 1 - ori $a1, $a1, 2664 + ori $a1, $a1, 2368 add.d $a1, $sp, $a1 - vld $vr4, $a1, 0 - vfadd.d $vr2, $vr2, $vr3 + vld $vr5, $a1, 0 lu12i.w $a1, 1 - ori $a1, $a1, 2376 + ori $a1, $a1, 2496 add.d $a1, $sp, $a1 - vld $vr3, $a1, 0 + vst $vr3, $a1, 0 + vfmul.d $vr3, $vr1, $vr7 + vld $vr7, $sp, 336 # 16-byte Folded Reload + vfadd.d $vr3, $vr7, $vr3 + vfmul.d $vr5, $vr6, $vr5 lu12i.w $a1, 1 - ori $a1, $a1, 2504 + ori $a1, $a1, 2672 add.d $a1, $sp, $a1 - vst $vr2, $a1, 0 - vfmul.d $vr2, $vr1, $vr4 - vld $vr4, $sp, 336 # 16-byte Folded Reload - vfadd.d $vr2, $vr4, $vr2 - vfmul.d $vr3, $vr6, $vr3 + vld $vr7, $a1, 0 + vfadd.d $vr3, $vr3, $vr5 lu12i.w $a1, 1 - ori $a1, $a1, 2680 + ori $a1, $a1, 2384 add.d $a1, $sp, $a1 - vld $vr4, $a1, 0 - vfadd.d $vr2, $vr2, $vr3 + vld $vr5, $a1, 0 lu12i.w $a1, 1 - ori $a1, $a1, 2392 + ori $a1, $a1, 2512 add.d $a1, $sp, $a1 - vld $vr3, $a1, 0 + vst $vr3, $a1, 0 + vfmul.d $vr3, $vr1, $vr7 + vld $vr7, $sp, 352 # 16-byte Folded Reload + vfadd.d $vr3, $vr7, $vr3 + vfmul.d $vr5, $vr6, $vr5 lu12i.w $a1, 1 - ori $a1, $a1, 2520 + ori $a1, $a1, 2688 add.d $a1, $sp, $a1 - vst $vr2, $a1, 0 - vfmul.d $vr2, $vr1, $vr4 - vld $vr4, $sp, 352 # 16-byte Folded Reload - vfadd.d $vr2, $vr4, $vr2 - vfmul.d $vr3, $vr6, $vr3 + vld $vr7, $a1, 0 + vfadd.d $vr3, $vr3, $vr5 lu12i.w $a1, 1 - ori $a1, $a1, 2696 + ori $a1, $a1, 2528 add.d $a1, $sp, $a1 - vld $vr4, $a1, 0 - vfadd.d $vr2, $vr2, $vr3 + vst $vr3, $a1, 0 lu12i.w $a1, 1 - ori $a1, $a1, 2408 + ori $a1, $a1, 2400 add.d $a1, $sp, $a1 vld $vr3, $a1, 0 + vfmul.d $vr5, $vr1, $vr7 + vld $vr7, $sp, 368 # 16-byte Folded Reload + vfadd.d $vr5, $vr7, $vr5 lu12i.w $a1, 1 - ori $a1, $a1, 2536 + ori $a1, $a1, 2416 add.d $a1, $sp, $a1 - vst $vr2, $a1, 0 - vfmul.d $vr2, $vr1, $vr4 - vld $vr4, $sp, 368 # 16-byte Folded Reload - vfadd.d $vr2, $vr4, $vr2 + fld.d $fa7, $a1, 0 vfmul.d $vr3, $vr6, $vr3 - vfadd.d $vr2, $vr2, $vr3 + vfadd.d $vr3, $vr5, $vr3 lu12i.w $a1, 1 - ori $a1, $a1, 2272 + ori $a1, $a1, 2544 add.d $a1, $sp, $a1 - fld.d $fa3, $a1, 0 + vst $vr3, $a1, 0 + fmul.d $fa3, $fa4, $fa7 lu12i.w $a1, 1 - ori $a1, $a1, 2704 + ori $a1, $a1, 2272 add.d $a1, $sp, $a1 fld.d $fa4, $a1, 0 lu12i.w $a1, 1 - ori $a1, $a1, 2416 + ori $a1, $a1, 2704 add.d $a1, $sp, $a1 fld.d $fa5, $a1, 0 + fadd.d $fa2, $fa2, $fa3 lu12i.w $a1, 1 - ori $a1, $a1, 2552 + ori $a1, $a1, 2560 add.d $a1, $sp, $a1 - vst $vr2, $a1, 0 - fmul.d $fa2, $ft2, $fa3 - fsub.d $fa2, $fa4, $fa2 - fmul.d $fa3, $ft15, $fa5 + fst.d $fa2, $a1, 0 + fmul.d $fa2, $ft2, $fa4 + fsub.d $fa2, $fa5, $fa2 + fmul.d $fa3, $ft15, $fa7 fsub.d $fa2, $fa2, $fa3 lu12i.w $a1, 1 ori $a1, $a1, 2144 diff --git a/results/MultiSource/Benchmarks/FreeBench/distray/CMakeFiles/distray.dir/distray.s b/results/MultiSource/Benchmarks/FreeBench/distray/CMakeFiles/distray.dir/distray.s index 405bd3d4..2d13c0d6 100644 --- a/results/MultiSource/Benchmarks/FreeBench/distray/CMakeFiles/distray.dir/distray.s +++ b/results/MultiSource/Benchmarks/FreeBench/distray/CMakeFiles/distray.dir/distray.s @@ -619,29 +619,26 @@ TraceLine: # @TraceLine jirl $ra, $ra, 0 pcalau12i $a0, %pc_hi20(.LCPI1_1) fld.d $fa1, $a0, %pc_lo12(.LCPI1_1) - fmul.d $fa2, $fa0, $fa1 + fmul.d $fa1, $fa0, $fa1 b .LBB1_10 .LBB1_9: - vldi $vr2, -912 + vldi $vr1, -912 .LBB1_10: pcalau12i $a0, %pc_hi20(Skycolor) addi.d $a0, $a0, %pc_lo12(Skycolor) fld.d $fa0, $a0, 0 - fld.d $fa1, $a0, 24 + fld.d $fa2, $a0, 24 vldi $vr3, -912 - fsub.d $fa3, $fa3, $fa2 + fsub.d $fa3, $fa3, $fa1 fmul.d $fa0, $fa3, $fa0 - fmadd.d $fa0, $fa1, $fa2, $fa0 + fmadd.d $fa0, $fa2, $fa1, $fa0 fst.d $fa0, $fp, 0 - fld.d $fa1, $a0, 8 - fld.d $fa4, $a0, 32 - fmul.d $fa1, $fa3, $fa1 - fmadd.d $fa1, $fa4, $fa2, $fa1 - fst.d $fa1, $fp, 8 - fld.d $fa4, $a0, 16 - fld.d $fa5, $a0, 40 - fmul.d $fa3, $fa3, $fa4 - fmadd.d $fa2, $fa5, $fa2, $fa3 + vld $vr2, $a0, 8 + vld $vr4, $a0, 32 + vreplvei.d $vr3, $vr3, 0 + vfmul.d $vr2, $vr3, $vr2 + vreplvei.d $vr1, $vr1, 0 + vfmadd.d $vr2, $vr4, $vr1, $vr2 b .LBB1_27 .LBB1_11: vst $vr4, $sp, 16 # 16-byte Folded Spill @@ -718,12 +715,12 @@ TraceLine: # @TraceLine fst.d $fa0, $fp, 0 fld.d $fa1, $a0, %pc_lo12(Ambient) fld.d $fa2, $s2, 8 - fadd.d $fa3, $fa3, $fa1 - fmul.d $fa1, $fa2, $fa3 - fst.d $fa1, $fp, 8 - fld.d $fa2, $s2, 16 - fmul.d $fa2, $fa3, $fa2 - fst.d $fa2, $fp, 16 + fadd.d $fa1, $fa3, $fa1 + fmul.d $fa2, $fa2, $fa1 + fst.d $fa2, $fp, 8 + fld.d $fa3, $s2, 16 + fmul.d $fa1, $fa1, $fa3 + fst.d $fa1, $fp, 16 fld.d $fa3, $s2, 32 fcmp.cule.d $fcc0, $fa3, $fs4 bcnez $fcc0, .LBB1_28 @@ -792,7 +789,7 @@ TraceLine: # @TraceLine fmadd.d $fa0, $fa4, $fa4, $fa0 fsqrt.d $fa0, $fa0 fdiv.d $fs0, $fa1, $fa0 - vrepli.b $vr3, 0 + vrepli.b $vr2, 0 pcalau12i $s4, %pc_hi20(rnd) lu12i.w $a0, 269412 pcalau12i $a1, %pc_hi20(.LCPI1_2) @@ -804,7 +801,7 @@ TraceLine: # @TraceLine .p2align 4, , 16 .LBB1_22: # %DistribVector.exit80 # =>This Inner Loop Header: Depth=1 - vst $vr3, $sp, 128 # 16-byte Folded Spill + vst $vr2, $sp, 128 # 16-byte Folded Spill fld.d $fa0, $s2, 40 ld.d $a0, $s4, %pc_lo12(rnd) fmul.d $fa1, $fa0, $fs6 @@ -857,13 +854,13 @@ TraceLine: # @TraceLine move $a3, $s0 pcaddu18i $ra, %call36(TraceLine) jirl $ra, $ra, 0 - vld $vr3, $sp, 128 # 16-byte Folded Reload + vld $vr2, $sp, 128 # 16-byte Folded Reload fld.d $fa0, $sp, 216 vld $vr1, $sp, 224 ld.w $a0, $s1, %pc_lo12(DISTRIB) fadd.d $fs7, $fa0, $fs7 addi.w $s3, $s3, 1 - vfadd.d $vr3, $vr1, $vr3 + vfadd.d $vr2, $vr1, $vr2 blt $s3, $a0, .LBB1_22 b .LBB1_25 .LBB1_23: @@ -874,33 +871,32 @@ TraceLine: # @TraceLine pcaddu18i $ra, %call36(TraceLine) jirl $ra, $ra, 0 fld.d $fa0, $sp, 240 - fld.d $fa1, $sp, 248 - fld.d $fa2, $sp, 256 + vld $vr1, $sp, 248 b .LBB1_26 .LBB1_24: movgr2fr.d $fs7, $zero - vld $vr3, $sp, 128 # 16-byte Folded Reload + vld $vr2, $sp, 128 # 16-byte Folded Reload .LBB1_25: # %._crit_edge movgr2fr.w $fa0, $a0 ffint.d.w $fa0, $fa0 - frecip.d $fa2, $fa0 - fmul.d $fa0, $fa2, $fs7 - vreplvei.d $vr1, $vr3, 0 - fmul.d $fa1, $fa2, $fa1 - vreplvei.d $vr3, $vr3, 1 - fmul.d $fa2, $fa2, $fa3 + frecip.d $fa1, $fa0 + fmul.d $fa0, $fa1, $fs7 + vreplvei.d $vr1, $vr1, 0 + vfmul.d $vr1, $vr1, $vr2 .LBB1_26: - fld.d $fa3, $s2, 32 - fld.d $fa4, $fp, 0 - fld.d $fa5, $fp, 8 - fmadd.d $fa0, $fa0, $fa3, $fa4 - fld.d $fa4, $fp, 16 + fld.d $fa2, $s2, 32 + fld.d $fa3, $fp, 0 + vld $vr4, $fp, 8 + fmadd.d $fa0, $fa0, $fa2, $fa3 fst.d $fa0, $fp, 0 - fmadd.d $fa1, $fa1, $fa3, $fa5 - fst.d $fa1, $fp, 8 - fmadd.d $fa2, $fa2, $fa3, $fa4 + vreplvei.d $vr2, $vr2, 0 + vfmadd.d $vr2, $vr1, $vr2, $vr4 .LBB1_27: # %.sink.split - fst.d $fa2, $fp, 16 + vreplvei.d $vr1, $vr2, 0 + fst.d $fa1, $fp, 8 + vreplvei.d $vr1, $vr2, 1 + vstelm.d $vr2, $fp, 16, 1 + vreplvei.d $vr2, $vr2, 0 .LBB1_28: vldi $vr3, -912 fcmp.cule.d $fcc0, $fa0, $fa3 @@ -910,13 +906,13 @@ TraceLine: # @TraceLine st.d $a0, $fp, 0 .LBB1_30: vldi $vr0, -912 - fcmp.cule.d $fcc0, $fa1, $fa0 + fcmp.cule.d $fcc0, $fa2, $fa0 bcnez $fcc0, .LBB1_32 # %bb.31: lu52i.d $a0, $zero, 1023 st.d $a0, $fp, 8 .LBB1_32: - fcmp.cule.d $fcc0, $fa2, $fa0 + fcmp.cule.d $fcc0, $fa1, $fa0 bcnez $fcc0, .LBB1_34 # %bb.33: lu52i.d $a0, $zero, 1023 @@ -962,11 +958,27 @@ TraceLine: # @TraceLine .type IntersectObjs,@function IntersectObjs: # @IntersectObjs # %bb.0: + addi.d $sp, $sp, -192 + st.d $ra, $sp, 184 # 8-byte Folded Spill + st.d $fp, $sp, 176 # 8-byte Folded Spill + st.d $s0, $sp, 168 # 8-byte Folded Spill + st.d $s1, $sp, 160 # 8-byte Folded Spill + st.d $s2, $sp, 152 # 8-byte Folded Spill + st.d $s3, $sp, 144 # 8-byte Folded Spill + st.d $s4, $sp, 136 # 8-byte Folded Spill + st.d $s5, $sp, 128 # 8-byte Folded Spill + st.d $s6, $sp, 120 # 8-byte Folded Spill + st.d $s7, $sp, 112 # 8-byte Folded Spill + fst.d $fs0, $sp, 104 # 8-byte Folded Spill + fst.d $fs1, $sp, 96 # 8-byte Folded Spill + fst.d $fs2, $sp, 88 # 8-byte Folded Spill + fst.d $fs3, $sp, 80 # 8-byte Folded Spill + fst.d $fs4, $sp, 72 # 8-byte Folded Spill fld.d $fa1, $a1, 16 pcalau12i $a5, %pc_hi20(.LCPI2_0) - fld.d $fa7, $a5, %pc_lo12(.LCPI2_0) + fld.d $fs0, $a5, %pc_lo12(.LCPI2_0) fabs.d $fa0, $fa1 - fcmp.cule.d $fcc0, $fa0, $fa7 + fcmp.cule.d $fcc0, $fa0, $fs0 bcnez $fcc0, .LBB2_4 # %bb.1: pcalau12i $a5, %pc_hi20(Groundpos) @@ -974,7 +986,7 @@ IntersectObjs: # @IntersectObjs fld.d $fa2, $a0, 16 fsub.d $fa0, $fa0, $fa2 fdiv.d $fa0, $fa0, $fa1 - fcmp.cule.d $fcc0, $fa0, $fa7 + fcmp.cule.d $fcc0, $fa0, $fs0 vldi $vr5, -784 bcnez $fcc0, .LBB2_5 # %bb.2: @@ -983,14 +995,11 @@ IntersectObjs: # @IntersectObjs fcmp.cule.d $fcc0, $fa3, $fa0 bcnez $fcc0, .LBB2_5 # %bb.3: - fld.d $fa3, $a0, 0 - fld.d $fa4, $a1, 0 - fld.d $fa5, $a0, 8 - fld.d $fa6, $a1, 8 - fmadd.d $fa3, $fa4, $fa0, $fa3 - fst.d $fa3, $a2, 0 - fmadd.d $fa3, $fa6, $fa0, $fa5 - fst.d $fa3, $a2, 8 + vld $vr3, $a0, 0 + vld $vr4, $a1, 0 + vreplvei.d $vr5, $vr0, 0 + vfmadd.d $vr3, $vr4, $vr5, $vr3 + vst $vr3, $a2, 0 fmadd.d $fa1, $fa1, $fa0, $fa2 fst.d $fa1, $a2, 16 vrepli.b $vr1, 0 @@ -1022,25 +1031,6 @@ IntersectObjs: # @IntersectObjs .LBB2_4: vldi $vr5, -784 .LBB2_5: - addi.d $sp, $sp, -192 - st.d $ra, $sp, 184 # 8-byte Folded Spill - st.d $fp, $sp, 176 # 8-byte Folded Spill - st.d $s0, $sp, 168 # 8-byte Folded Spill - st.d $s1, $sp, 160 # 8-byte Folded Spill - st.d $s2, $sp, 152 # 8-byte Folded Spill - st.d $s3, $sp, 144 # 8-byte Folded Spill - st.d $s4, $sp, 136 # 8-byte Folded Spill - st.d $s5, $sp, 128 # 8-byte Folded Spill - st.d $s6, $sp, 120 # 8-byte Folded Spill - st.d $s7, $sp, 112 # 8-byte Folded Spill - fst.d $fs0, $sp, 104 # 8-byte Folded Spill - fst.d $fs1, $sp, 96 # 8-byte Folded Spill - fst.d $fs2, $sp, 88 # 8-byte Folded Spill - fst.d $fs3, $sp, 80 # 8-byte Folded Spill - fst.d $fs4, $sp, 72 # 8-byte Folded Spill - fst.d $fs5, $sp, 64 # 8-byte Folded Spill - fst.d $fs6, $sp, 56 # 8-byte Folded Spill - fst.d $fs7, $sp, 48 # 8-byte Folded Spill pcalau12i $a5, %pc_hi20(objs) addi.d $fp, $a5, %pc_lo12(objs) move $s0, $zero @@ -1048,17 +1038,14 @@ IntersectObjs: # @IntersectObjs ori $s1, $zero, 320 b .LBB2_8 .LBB2_6: # in Loop: Header=BB2_8 Depth=1 - fmul.d $fa1, $fs7, $fa0 - fst.d $fa1, $a2, 0 - fmul.d $fa2, $fs5, $fa0 - fst.d $fa2, $a2, 8 - fmul.d $fa3, $fs6, $fa0 - fst.d $fa3, $a2, 16 - fsub.d $fa1, $fa1, $fs3 - fst.d $fa1, $a3, 0 + vreplvei.d $vr1, $vr0, 0 + vfmul.d $vr1, $vr7, $vr1 + vst $vr1, $a2, 0 + fmul.d $fa2, $fs3, $fa0 + fst.d $fa2, $a2, 16 + vfsub.d $vr1, $vr1, $vr6 + vst $vr1, $a3, 0 fsub.d $fa1, $fa2, $fs2 - fst.d $fa1, $a3, 8 - fsub.d $fa1, $fa3, $fs4 fst.d $fa1, $a3, 16 vld $vr1, $a0, 0 vld $vr2, $a2, 0 @@ -1077,36 +1064,36 @@ IntersectObjs: # @IntersectObjs beq $s0, $s1, .LBB2_14 .LBB2_8: # =>This Inner Loop Header: Depth=1 add.d $s2, $fp, $s0 - fldx.d $fa0, $fp, $s0 - fld.d $fa1, $s2, 8 - fld.d $fa2, $a0, 0 - fld.d $fa3, $a0, 8 - fld.d $fa4, $s2, 16 - fsub.d $fs3, $fa0, $fa2 - fsub.d $fs2, $fa1, $fa3 - fld.d $fa0, $a0, 16 - fld.d $fs5, $a1, 8 - fld.d $fs7, $a1, 0 - fld.d $fs6, $a1, 16 - fsub.d $fs4, $fa4, $fa0 - fmul.d $fa0, $fs5, $fs5 - fmadd.d $fa0, $fs7, $fs7, $fa0 - fmadd.d $fa0, $fs6, $fs6, $fa0 - frecip.d $fa0, $fa0 - fmul.d $fa1, $fs2, $fs5 - fmadd.d $fa1, $fs3, $fs7, $fa1 - fmadd.d $fa1, $fs4, $fs6, $fa1 - fld.d $fa2, $s2, 24 - fmul.d $fs0, $fa1, $fa0 - fneg.d $fa1, $fs3 - fmul.d $fa1, $fs3, $fa1 + fld.d $fa0, $s2, 16 + fld.d $fa1, $a0, 16 + vldx $vr2, $fp, $s0 + vld $vr3, $a0, 0 + vld $vr7, $a1, 0 + fsub.d $fs2, $fa0, $fa1 + vfsub.d $vr6, $vr2, $vr3 + vreplvei.d $vr0, $vr7, 1 + fld.d $fs3, $a1, 16 + fmul.d $fa1, $fa0, $fa0 + vreplvei.d $vr2, $vr7, 0 fmadd.d $fa1, $fa2, $fa2, $fa1 + fmadd.d $fa1, $fs3, $fs3, $fa1 + frecip.d $fa1, $fa1 + vreplvei.d $vr3, $vr6, 1 + fmul.d $fa0, $fa3, $fa0 + vreplvei.d $vr4, $vr6, 0 + fmadd.d $fa0, $fa4, $fa2, $fa0 + fmadd.d $fa0, $fs2, $fs3, $fa0 + fld.d $fa2, $s2, 24 + fmul.d $fs4, $fa0, $fa1 + fneg.d $fa0, $fa4 + fmul.d $fa0, $fa4, $fa0 + fmadd.d $fa0, $fa2, $fa2, $fa0 + fneg.d $fa2, $fa3 + fmadd.d $fa0, $fa2, $fa3, $fa0 fneg.d $fa2, $fs2 - fmadd.d $fa1, $fa2, $fs2, $fa1 - fneg.d $fa2, $fs4 - fmadd.d $fa1, $fa2, $fs4, $fa1 - fmul.d $fa0, $fa0, $fa1 - fmadd.d $fa0, $fs0, $fs0, $fa0 + fmadd.d $fa0, $fa2, $fs2, $fa0 + fmul.d $fa0, $fa1, $fa0 + fmadd.d $fa0, $fs4, $fs4, $fa0 fcmp.cule.d $fcc0, $fa0, $fs1 bcnez $fcc0, .LBB2_7 # %bb.9: # in Loop: Header=BB2_8 Depth=1 @@ -1115,11 +1102,11 @@ IntersectObjs: # @IntersectObjs bceqz $fcc0, .LBB2_13 .LBB2_10: # %.split # in Loop: Header=BB2_8 Depth=1 - fsub.d $fa0, $fs0, $fa1 - fadd.d $fa1, $fs0, $fa1 - fcmp.clt.d $fcc0, $fa0, $fa7 + fsub.d $fa0, $fs4, $fa1 + fadd.d $fa1, $fs4, $fa1 + fcmp.clt.d $fcc0, $fa0, $fs0 fsel $fa0, $fa0, $fa1, $fcc0 - fcmp.cule.d $fcc0, $fa0, $fa7 + fcmp.cule.d $fcc0, $fa0, $fs0 bcnez $fcc0, .LBB2_7 # %bb.11: # in Loop: Header=BB2_8 Depth=1 fcmp.clt.d $fcc0, $fa5, $fs1 @@ -1135,12 +1122,14 @@ IntersectObjs: # @IntersectObjs move $s4, $a3 move $s7, $a2 move $s5, $a0 - fst.d $fa7, $sp, 40 # 8-byte Folded Spill - vst $vr5, $sp, 16 # 16-byte Folded Spill + vst $vr5, $sp, 48 # 16-byte Folded Spill + vst $vr6, $sp, 32 # 16-byte Folded Spill + vst $vr7, $sp, 16 # 16-byte Folded Spill pcaddu18i $ra, %call36(sqrt) jirl $ra, $ra, 0 - vld $vr5, $sp, 16 # 16-byte Folded Reload - fld.d $fa7, $sp, 40 # 8-byte Folded Reload + vld $vr7, $sp, 16 # 16-byte Folded Reload + vld $vr6, $sp, 32 # 16-byte Folded Reload + vld $vr5, $sp, 48 # 16-byte Folded Reload move $a0, $s5 move $a2, $s7 move $a3, $s4 @@ -1150,9 +1139,6 @@ IntersectObjs: # @IntersectObjs b .LBB2_10 .LBB2_14: fmov.d $fa0, $fa5 - fld.d $fs7, $sp, 48 # 8-byte Folded Reload - fld.d $fs6, $sp, 56 # 8-byte Folded Reload - fld.d $fs5, $sp, 64 # 8-byte Folded Reload fld.d $fs4, $sp, 72 # 8-byte Folded Reload fld.d $fs3, $sp, 80 # 8-byte Folded Reload fld.d $fs2, $sp, 88 # 8-byte Folded Reload diff --git a/results/MultiSource/Benchmarks/FreeBench/pifft/CMakeFiles/pifft.dir/pifft.s b/results/MultiSource/Benchmarks/FreeBench/pifft/CMakeFiles/pifft.dir/pifft.s index faab0012..5ba7a1d0 100644 --- a/results/MultiSource/Benchmarks/FreeBench/pifft/CMakeFiles/pifft.dir/pifft.s +++ b/results/MultiSource/Benchmarks/FreeBench/pifft/CMakeFiles/pifft.dir/pifft.s @@ -2286,33 +2286,31 @@ mp_mul: # @mp_mul ori $a0, $zero, 4 blt $s2, $a0, .LBB8_35 # %bb.33: # %.lr.ph.preheader.i117 - ld.d $a0, $sp, 112 # 8-byte Folded Reload - addi.d $a0, $a0, 32 - addi.d $a1, $s8, 32 - addi.d $a2, $s0, 32 + addi.d $a0, $s8, 32 + addi.d $a1, $s0, 32 + ld.d $a2, $sp, 112 # 8-byte Folded Reload + addi.d $a2, $a2, 24 ori $a3, $zero, 3 .p2align 4, , 16 .LBB8_34: # %.lr.ph.i118 # =>This Inner Loop Header: Depth=1 - fld.d $fa0, $a1, -8 - fld.d $fa1, $a2, 0 - fld.d $fa2, $a1, 0 - fld.d $fa3, $a2, -8 - fld.d $fa4, $a0, -8 - fneg.d $fa5, $fa1 - fmul.d $fa5, $fa2, $fa5 - fmadd.d $fa5, $fa0, $fa3, $fa5 - fadd.d $fa4, $fa4, $fa5 - fld.d $fa5, $a0, 0 - fst.d $fa4, $a0, -8 - fmul.d $fa2, $fa2, $fa3 - fmadd.d $fa0, $fa0, $fa1, $fa2 - fadd.d $fa0, $fa0, $fa5 - fst.d $fa0, $a0, 0 + fld.d $fa0, $a1, 0 + fld.d $fa1, $a0, 0 + vld $vr2, $a1, -8 + fld.d $fa3, $a0, -8 + fneg.d $fa0, $fa0 + vreplvei.d $vr1, $vr1, 0 + vpackev.d $vr0, $vr2, $vr0 + vld $vr4, $a2, 0 + vfmul.d $vr0, $vr1, $vr0 + vreplvei.d $vr1, $vr3, 0 + vfmadd.d $vr0, $vr1, $vr2, $vr0 + vfadd.d $vr0, $vr4, $vr0 + vst $vr0, $a2, 0 addi.d $a3, $a3, 2 - addi.d $a0, $a0, 16 addi.d $a2, $a2, 16 addi.d $a1, $a1, 16 + addi.d $a0, $a0, 16 bltu $a3, $s2, .LBB8_34 .LBB8_35: # %mp_mul_cmuladd.exit sub.w $a0, $s3, $s7 @@ -4668,28 +4666,26 @@ mp_mul_cmuladd: # @mp_mul_cmuladd fst.d $fa0, $a3, 16 blt $a0, $a4, .LBB28_3 # %bb.1: # %.lr.ph.preheader - addi.d $a4, $a3, 32 - addi.d $a5, $a2, 32 - addi.d $a6, $a1, 32 + addi.d $a4, $a2, 32 + addi.d $a5, $a1, 32 + addi.d $a6, $a3, 24 ori $a7, $zero, 3 .p2align 4, , 16 .LBB28_2: # %.lr.ph # =>This Inner Loop Header: Depth=1 - fld.d $fa0, $a6, -8 + fld.d $fa0, $a4, 0 fld.d $fa1, $a5, 0 - fld.d $fa2, $a6, 0 + vld $vr2, $a4, -8 fld.d $fa3, $a5, -8 - fld.d $fa4, $a4, -8 - fneg.d $fa5, $fa1 - fmul.d $fa5, $fa2, $fa5 - fmadd.d $fa5, $fa0, $fa3, $fa5 - fadd.d $fa4, $fa4, $fa5 - fld.d $fa5, $a4, 0 - fst.d $fa4, $a4, -8 - fmul.d $fa2, $fa2, $fa3 - fmadd.d $fa0, $fa0, $fa1, $fa2 - fadd.d $fa0, $fa0, $fa5 - fst.d $fa0, $a4, 0 + fneg.d $fa0, $fa0 + vreplvei.d $vr1, $vr1, 0 + vpackev.d $vr0, $vr2, $vr0 + vld $vr4, $a6, 0 + vfmul.d $vr0, $vr1, $vr0 + vreplvei.d $vr1, $vr3, 0 + vfmadd.d $vr0, $vr1, $vr2, $vr0 + vfadd.d $vr0, $vr4, $vr0 + vst $vr0, $a6, 0 addi.d $a7, $a7, 2 addi.d $a4, $a4, 16 addi.d $a5, $a5, 16 diff --git a/results/MultiSource/Benchmarks/MallocBench/gs/CMakeFiles/gs.dir/gsimage.s b/results/MultiSource/Benchmarks/MallocBench/gs/CMakeFiles/gs.dir/gsimage.s index 3ed755fd..e94bda2f 100644 --- a/results/MultiSource/Benchmarks/MallocBench/gs/CMakeFiles/gs.dir/gsimage.s +++ b/results/MultiSource/Benchmarks/MallocBench/gs/CMakeFiles/gs.dir/gsimage.s @@ -129,50 +129,56 @@ image_init: # @image_init st.w $s6, $s7, 0 move $t1, $s5 st.w $s5, $s7, 4 - fld.s $fa0, $sp, 16 + fld.s $fa1, $sp, 16 pcalau12i $a2, %pc_hi20(.LCPI1_0) - fld.s $fa2, $a2, %pc_lo12(.LCPI1_0) + fld.s $fa0, $a2, %pc_lo12(.LCPI1_0) move $a2, $s4 st.w $s4, $s7, 8 move $a5, $s3 st.w $s3, $s7, 12 move $a3, $s2 st.w $s2, $s7, 16 - fmul.s $fa0, $fa0, $fa2 - fld.s $fa1, $sp, 64 - ftintrz.l.s $fa0, $fa0 - movfr2gr.d $t2, $fa0 - fst.d $fa0, $s7, 24 - fmul.s $fa0, $fa1, $fa2 - ld.d $t4, $sp, 32 - ld.d $t5, $sp, 48 - ftintrz.l.s $fa0, $fa0 - movfr2gr.d $t3, $fa0 - fst.d $fa0, $s7, 48 - or $t0, $t5, $t4 + fmul.s $fa1, $fa1, $fa0 + fld.s $fa2, $sp, 64 + ftintrz.l.s $fa1, $fa1 + movfr2gr.d $t3, $fa1 + fst.d $fa1, $s7, 24 + fmul.s $fa1, $fa2, $fa0 + ld.d $t2, $sp, 32 + ld.d $t4, $sp, 48 + ftintrz.l.s $fa1, $fa1 + movfr2gr.d $t5, $fa1 + fst.d $fa1, $s7, 48 + or $t0, $t4, $t2 bstrpick.d $t0, $t0, 62, 0 sltu $t6, $zero, $t0 + vinsgr2vr.w $vr1, $t2, 0 + vinsgr2vr.w $vr1, $t4, 1 + lu12i.w $t7, 284672 + vreplgr2vr.w $vr2, $t7 + vfmul.s $vr1, $vr1, $vr2 + vreplvei.w $vr2, $vr1, 0 + ftintrz.l.s $fa2, $fa2 + movfr2gr.d $t7, $fa2 + vinsgr2vr.d $vr2, $t7, 0 + vreplvei.w $vr1, $vr1, 1 + ftintrz.l.s $fa1, $fa1 + movfr2gr.d $t7, $fa1 + vinsgr2vr.d $vr2, $t7, 1 + vreplgr2vr.d $vr1, $t6 + vslli.d $vr1, $vr1, 63 + vsrai.d $vr1, $vr1, 63 + vand.v $vr1, $vr1, $vr2 + fld.s $fa2, $sp, 80 st.w $t6, $s7, 100 - movgr2fr.w $fa0, $t4 - movgr2fr.w $fa1, $t5 - fmul.s $fa0, $fa0, $fa2 - ftintrz.l.s $fa0, $fa0 - movfr2gr.d $t4, $fa0 - fmul.s $fa0, $fa1, $fa2 - ftintrz.l.s $fa0, $fa0 - movfr2gr.d $t5, $fa0 - maskeqz $t7, $t4, $t6 - maskeqz $t6, $t5, $t6 - fld.s $fa0, $sp, 80 - st.d $t7, $s7, 32 - st.d $t6, $s7, 40 - fld.s $fa3, $sp, 96 - fmul.s $fa0, $fa0, $fa2 - ftintrz.l.s $fa1, $fa0 - fst.d $fa1, $s7, 160 - fmul.s $fa0, $fa3, $fa2 - ftintrz.l.s $fa0, $fa0 - fst.d $fa0, $s7, 168 + vst $vr1, $s7, 32 + fld.s $fa1, $sp, 96 + fmul.s $fa2, $fa2, $fa0 + ftintrz.l.s $fa2, $fa2 + fst.d $fa2, $s7, 160 + fmul.s $fa1, $fa1, $fa0 + ftintrz.l.s $fa1, $fa1 + fst.d $fa1, $s7, 168 st.d $fp, $s7, 72 st.d $a0, $s7, 80 st.w $s0, $s7, 88 @@ -245,45 +251,53 @@ image_init: # @image_init stptr.w $zero, $a1, 8360 .LBB1_17: ld.d $t6, $fp, 264 - mul.d $a0, $t2, $a4 - mul.d $t2, $t3, $t1 + mul.d $a0, $t3, $a4 + mul.d $t3, $t5, $t1 beqz $t0, .LBB1_19 # %bb.18: - mul.d $t1, $t5, $t1 - add.d $t5, $t1, $a0 - mul.d $t1, $t4, $a4 - add.d $t2, $t1, $t2 + movgr2fr.w $fa3, $t4 + fmul.s $fa3, $fa3, $fa0 + ftintrz.l.s $fa3, $fa3 + movfr2gr.d $t4, $fa3 + mul.d $t1, $t4, $t1 + add.d $t4, $t1, $a0 + movgr2fr.w $fa3, $t2 + fmul.s $fa0, $fa3, $fa0 + ftintrz.l.s $fa0, $fa0 + movfr2gr.d $t1, $fa0 + mul.d $t1, $t1, $a4 + add.d $t3, $t1, $t3 b .LBB1_20 .LBB1_19: - move $t5, $a0 + move $t4, $a0 .LBB1_20: ld.d $t8, $t6, 56 - ld.d $t4, $t6, 64 + ld.d $t5, $t6, 64 ld.d $t7, $t6, 72 - ld.d $t3, $t6, 80 - movfr2gr.d $t1, $fa1 - bltz $t5, .LBB1_23 + ld.d $t2, $t6, 80 + movfr2gr.d $t1, $fa2 + bltz $t4, .LBB1_23 # %bb.21: blt $t1, $t8, .LBB1_27 # %bb.22: - add.d $t5, $t5, $t1 - bge $t7, $t5, .LBB1_25 + add.d $t4, $t4, $t1 + bge $t7, $t4, .LBB1_25 b .LBB1_27 .LBB1_23: - add.d $t5, $t5, $t1 - blt $t5, $t8, .LBB1_27 + add.d $t4, $t4, $t1 + blt $t4, $t8, .LBB1_27 # %bb.24: blt $t7, $t1, .LBB1_27 .LBB1_25: - movfr2gr.d $t5, $fa0 - bltz $t2, .LBB1_41 + movfr2gr.d $t4, $fa1 + bltz $t3, .LBB1_41 # %bb.26: - slt $t4, $t5, $t4 - xori $t4, $t4, 1 - add.d $t2, $t2, $t5 - slt $t2, $t3, $t2 + slt $t5, $t4, $t5 + xori $t5, $t5, 1 + add.d $t3, $t3, $t4 + slt $t2, $t2, $t3 xori $t2, $t2, 1 - and $t3, $t4, $t2 + and $t3, $t5, $t2 b .LBB1_42 .LBB1_27: # %.critedge st.w $zero, $a1, 96 @@ -360,12 +374,12 @@ image_init: # @image_init addi.d $sp, $sp, 192 ret .LBB1_41: - add.d $t2, $t2, $t5 - slt $t2, $t2, $t4 - xori $t2, $t2, 1 + add.d $t3, $t3, $t4 slt $t3, $t3, $t5 xori $t3, $t3, 1 - and $t3, $t2, $t3 + slt $t2, $t2, $t4 + xori $t2, $t2, 1 + and $t3, $t3, $t2 .LBB1_42: st.w $t3, $a1, 96 ori $t2, $zero, 1 diff --git a/results/MultiSource/Benchmarks/MiBench/consumer-lame/CMakeFiles/consumer-lame.dir/layer3.s b/results/MultiSource/Benchmarks/MiBench/consumer-lame/CMakeFiles/consumer-lame.dir/layer3.s index 75aa0fdd..7ff7bf79 100644 --- a/results/MultiSource/Benchmarks/MiBench/consumer-lame/CMakeFiles/consumer-lame.dir/layer3.s +++ b/results/MultiSource/Benchmarks/MiBench/consumer-lame/CMakeFiles/consumer-lame.dir/layer3.s @@ -79,15 +79,15 @@ .dword 0x0000000000000000 # double 0 .dword 0x8000000000000000 # double -0 .LCPI0_28: - .word 12544 # 0x3100 - .word 12608 # 0x3140 - .word 12296 # 0x3008 - .word 12360 # 0x3048 -.LCPI0_29: .word 12288 # 0x3000 .word 12352 # 0x3040 .word 12416 # 0x3080 .word 12480 # 0x30c0 +.LCPI0_29: + .word 12544 # 0x3100 + .word 12608 # 0x3140 + .word 12296 # 0x3008 + .word 12360 # 0x3048 .LCPI0_30: .word 12424 # 0x3088 .word 12488 # 0x30c8 @@ -119,256 +119,261 @@ .word 12328 # 0x3028 .word 12392 # 0x3068 .LCPI0_36: + .word 12456 # 0x30a8 + .word 12520 # 0x30e8 + .word 12584 # 0x3128 + .word 12648 # 0x3168 +.LCPI0_37: .word 16384 # 0x4000 .word 16448 # 0x4040 .word 16512 # 0x4080 .word 16576 # 0x40c0 -.LCPI0_37: +.LCPI0_38: .word 16392 # 0x4008 .word 16456 # 0x4048 .word 16520 # 0x4088 .word 16584 # 0x40c8 -.LCPI0_38: +.LCPI0_39: .word 16400 # 0x4010 .word 16464 # 0x4050 .word 16528 # 0x4090 .word 16592 # 0x40d0 -.LCPI0_39: +.LCPI0_40: .word 16408 # 0x4018 .word 16472 # 0x4058 .word 16536 # 0x4098 .word 16600 # 0x40d8 -.LCPI0_40: +.LCPI0_41: .word 16385 # 0x4001 .word 16449 # 0x4041 .word 16513 # 0x4081 .word 16577 # 0x40c1 -.LCPI0_41: +.LCPI0_42: .word 16393 # 0x4009 .word 16457 # 0x4049 .word 16521 # 0x4089 .word 16585 # 0x40c9 -.LCPI0_42: +.LCPI0_43: .word 16401 # 0x4011 .word 16465 # 0x4051 .word 16529 # 0x4091 .word 16593 # 0x40d1 -.LCPI0_43: +.LCPI0_44: .word 16409 # 0x4019 .word 16473 # 0x4059 .word 16537 # 0x4099 .word 16601 # 0x40d9 -.LCPI0_44: +.LCPI0_45: .word 16386 # 0x4002 .word 16450 # 0x4042 .word 16514 # 0x4082 .word 16578 # 0x40c2 -.LCPI0_45: +.LCPI0_46: .word 16394 # 0x400a .word 16458 # 0x404a .word 16522 # 0x408a .word 16586 # 0x40ca -.LCPI0_46: +.LCPI0_47: .word 16402 # 0x4012 .word 16466 # 0x4052 .word 16530 # 0x4092 .word 16594 # 0x40d2 -.LCPI0_47: +.LCPI0_48: .word 16410 # 0x401a .word 16474 # 0x405a .word 16538 # 0x409a .word 16602 # 0x40da -.LCPI0_48: +.LCPI0_49: .word 16387 # 0x4003 .word 16451 # 0x4043 .word 16515 # 0x4083 .word 16579 # 0x40c3 -.LCPI0_49: +.LCPI0_50: .word 16395 # 0x400b .word 16459 # 0x404b .word 16523 # 0x408b .word 16587 # 0x40cb -.LCPI0_50: +.LCPI0_51: .word 16403 # 0x4013 .word 16467 # 0x4053 .word 16531 # 0x4093 .word 16595 # 0x40d3 -.LCPI0_51: +.LCPI0_52: .word 16411 # 0x401b .word 16475 # 0x405b .word 16539 # 0x409b .word 16603 # 0x40db -.LCPI0_52: +.LCPI0_53: .word 20480 # 0x5000 .word 20488 # 0x5008 .word 20496 # 0x5010 .word 20481 # 0x5001 -.LCPI0_53: +.LCPI0_54: .word 40960 # 0xa000 .word 40968 # 0xa008 .word 40976 # 0xa010 .word 40961 # 0xa001 -.LCPI0_54: +.LCPI0_55: .word 20489 # 0x5009 .word 20497 # 0x5011 .word 20482 # 0x5002 .word 20490 # 0x500a -.LCPI0_55: +.LCPI0_56: .word 40969 # 0xa009 .word 40977 # 0xa011 .word 40962 # 0xa002 .word 40970 # 0xa00a -.LCPI0_56: +.LCPI0_57: .word 20498 # 0x5012 .word 20483 # 0x5003 .word 20491 # 0x500b .word 20499 # 0x5013 -.LCPI0_57: +.LCPI0_58: .word 40978 # 0xa012 .word 40963 # 0xa003 .word 40971 # 0xa00b .word 40979 # 0xa013 -.LCPI0_58: +.LCPI0_59: .word 0 # 0x0 .word 512 # 0x200 .word 1024 # 0x400 .word 1536 # 0x600 -.LCPI0_59: +.LCPI0_60: .word 64 # 0x40 .word 576 # 0x240 .word 1088 # 0x440 .word 1600 # 0x640 -.LCPI0_60: +.LCPI0_61: .word 128 # 0x80 .word 640 # 0x280 .word 1152 # 0x480 .word 1664 # 0x680 -.LCPI0_61: +.LCPI0_62: .word 192 # 0xc0 .word 704 # 0x2c0 .word 1216 # 0x4c0 .word 1728 # 0x6c0 -.LCPI0_62: +.LCPI0_63: .word 4096 # 0x1000 .word 4160 # 0x1040 .word 4224 # 0x1080 .word 4288 # 0x10c0 -.LCPI0_63: +.LCPI0_64: .word 4104 # 0x1008 .word 4168 # 0x1048 .word 4232 # 0x1088 .word 4296 # 0x10c8 -.LCPI0_64: +.LCPI0_65: .word 4112 # 0x1010 .word 4176 # 0x1050 .word 4240 # 0x1090 .word 4304 # 0x10d0 -.LCPI0_65: +.LCPI0_66: .word 4120 # 0x1018 .word 4184 # 0x1058 .word 4248 # 0x1098 .word 4312 # 0x10d8 -.LCPI0_66: +.LCPI0_67: .word 4128 # 0x1020 .word 4192 # 0x1060 .word 4256 # 0x10a0 .word 4320 # 0x10e0 -.LCPI0_67: +.LCPI0_68: .word 4097 # 0x1001 .word 4161 # 0x1041 .word 4225 # 0x1081 .word 4289 # 0x10c1 -.LCPI0_68: +.LCPI0_69: .word 4105 # 0x1009 .word 4169 # 0x1049 .word 4233 # 0x1089 .word 4297 # 0x10c9 -.LCPI0_69: +.LCPI0_70: .word 4113 # 0x1011 .word 4177 # 0x1051 .word 4241 # 0x1091 .word 4305 # 0x10d1 -.LCPI0_70: +.LCPI0_71: .word 4121 # 0x1019 .word 4185 # 0x1059 .word 4249 # 0x1099 .word 4313 # 0x10d9 -.LCPI0_71: +.LCPI0_72: .word 4129 # 0x1021 .word 4193 # 0x1061 .word 4257 # 0x10a1 .word 4321 # 0x10e1 -.LCPI0_72: +.LCPI0_73: .word 4098 # 0x1002 .word 4162 # 0x1042 .word 4226 # 0x1082 .word 4290 # 0x10c2 -.LCPI0_73: +.LCPI0_74: .word 4106 # 0x100a .word 4170 # 0x104a .word 4234 # 0x108a .word 4298 # 0x10ca -.LCPI0_74: +.LCPI0_75: .word 4114 # 0x1012 .word 4178 # 0x1052 .word 4242 # 0x1092 .word 4306 # 0x10d2 -.LCPI0_75: +.LCPI0_76: .word 4122 # 0x101a .word 4186 # 0x105a .word 4250 # 0x109a .word 4314 # 0x10da -.LCPI0_76: +.LCPI0_77: .word 4130 # 0x1022 .word 4194 # 0x1062 .word 4258 # 0x10a2 .word 4322 # 0x10e2 -.LCPI0_77: +.LCPI0_78: .word 4099 # 0x1003 .word 4163 # 0x1043 .word 4227 # 0x1083 .word 4291 # 0x10c3 -.LCPI0_78: +.LCPI0_79: .word 4107 # 0x100b .word 4171 # 0x104b .word 4235 # 0x108b .word 4299 # 0x10cb -.LCPI0_79: +.LCPI0_80: .word 4115 # 0x1013 .word 4179 # 0x1053 .word 4243 # 0x1093 .word 4307 # 0x10d3 -.LCPI0_80: +.LCPI0_81: .word 4123 # 0x101b .word 4187 # 0x105b .word 4251 # 0x109b .word 4315 # 0x10db -.LCPI0_81: +.LCPI0_82: .word 4131 # 0x1023 .word 4195 # 0x1063 .word 4259 # 0x10a3 .word 4323 # 0x10e3 -.LCPI0_82: +.LCPI0_83: .word 4100 # 0x1004 .word 4164 # 0x1044 .word 4228 # 0x1084 .word 4292 # 0x10c4 -.LCPI0_83: +.LCPI0_84: .word 4108 # 0x100c .word 4172 # 0x104c .word 4236 # 0x108c .word 4300 # 0x10cc -.LCPI0_84: +.LCPI0_85: .word 4116 # 0x1014 .word 4180 # 0x1054 .word 4244 # 0x1094 .word 4308 # 0x10d4 -.LCPI0_85: +.LCPI0_86: .word 4124 # 0x101c .word 4188 # 0x105c .word 4252 # 0x109c .word 4316 # 0x10dc -.LCPI0_86: +.LCPI0_87: .word 4132 # 0x1024 .word 4196 # 0x1064 .word 4260 # 0x10a4 @@ -1764,113 +1769,104 @@ init_layer3: # @init_layer3 # %bb.38: # %.preheader328.preheader pcalau12i $a0, %pc_hi20(i_slen2) addi.d $a1, $a0, %pc_lo12(i_slen2) + pcalau12i $a0, %pc_hi20(.LCPI0_28) + vld $vr0, $a0, %pc_lo12(.LCPI0_28) + pcalau12i $a0, %pc_hi20(.LCPI0_29) + vld $vr1, $a0, %pc_lo12(.LCPI0_29) + pcalau12i $a0, %pc_hi20(.LCPI0_30) + vld $vr2, $a0, %pc_lo12(.LCPI0_30) + pcalau12i $a0, %pc_hi20(.LCPI0_31) + vld $vr3, $a0, %pc_lo12(.LCPI0_31) + pcalau12i $a0, %pc_hi20(.LCPI0_32) + vld $vr4, $a0, %pc_lo12(.LCPI0_32) + pcalau12i $a0, %pc_hi20(.LCPI0_33) + vld $vr5, $a0, %pc_lo12(.LCPI0_33) + pcalau12i $a0, %pc_hi20(.LCPI0_34) + vld $vr6, $a0, %pc_lo12(.LCPI0_34) + pcalau12i $a0, %pc_hi20(.LCPI0_35) + vld $vr7, $a0, %pc_lo12(.LCPI0_35) + pcalau12i $a0, %pc_hi20(.LCPI0_36) + vld $vr8, $a0, %pc_lo12(.LCPI0_36) move $a0, $zero move $a2, $zero - pcalau12i $a3, %pc_hi20(.LCPI0_28) - vld $vr0, $a3, %pc_lo12(.LCPI0_28) - pcalau12i $a3, %pc_hi20(.LCPI0_29) - vld $vr1, $a3, %pc_lo12(.LCPI0_29) - pcalau12i $a3, %pc_hi20(.LCPI0_30) - vld $vr2, $a3, %pc_lo12(.LCPI0_30) - pcalau12i $a3, %pc_hi20(.LCPI0_31) - vld $vr3, $a3, %pc_lo12(.LCPI0_31) - pcalau12i $a3, %pc_hi20(.LCPI0_32) - vld $vr4, $a3, %pc_lo12(.LCPI0_32) - pcalau12i $a3, %pc_hi20(.LCPI0_33) - vld $vr5, $a3, %pc_lo12(.LCPI0_33) - pcalau12i $a3, %pc_hi20(.LCPI0_34) - vld $vr6, $a3, %pc_lo12(.LCPI0_34) - pcalau12i $a3, %pc_hi20(.LCPI0_35) - vld $vr7, $a3, %pc_lo12(.LCPI0_35) - lu12i.w $a6, 3 - ori $a3, $a6, 296 - ori $a4, $a6, 168 - ori $a5, $a6, 232 - ori $a6, $a6, 360 - ori $a7, $zero, 720 + ori $a3, $zero, 720 .p2align 4, , 16 .LBB0_39: # %.preheader328 # =>This Inner Loop Header: Depth=1 - add.d $t0, $a1, $a0 - vreplgr2vr.w $vr8, $a2 - vadd.w $vr9, $vr8, $vr0 - vadd.w $vr10, $vr8, $vr1 + add.d $a4, $a1, $a0 + vreplgr2vr.w $vr9, $a2 + vadd.w $vr10, $vr9, $vr0 vstx $vr10, $a1, $a0 - vst $vr9, $t0, 16 - vadd.w $vr9, $vr8, $vr2 - vadd.w $vr10, $vr8, $vr3 - vst $vr10, $t0, 48 - vst $vr9, $t0, 32 - vadd.w $vr9, $vr8, $vr4 - vadd.w $vr10, $vr8, $vr5 - vst $vr10, $t0, 80 - vst $vr9, $t0, 64 - vadd.w $vr9, $vr8, $vr6 - vadd.w $vr8, $vr8, $vr7 - vst $vr8, $t0, 112 - vst $vr9, $t0, 96 - add.d $t1, $a2, $a3 - add.d $t2, $a2, $a4 - st.w $t2, $t0, 128 - add.d $t2, $a2, $a5 - st.w $t2, $t0, 132 - st.w $t1, $t0, 136 - add.d $t1, $a2, $a6 - st.w $t1, $t0, 140 + vadd.w $vr10, $vr9, $vr1 + vst $vr10, $a4, 16 + vadd.w $vr10, $vr9, $vr2 + vst $vr10, $a4, 32 + vadd.w $vr10, $vr9, $vr3 + vst $vr10, $a4, 48 + vadd.w $vr10, $vr9, $vr4 + vst $vr10, $a4, 64 + vadd.w $vr10, $vr9, $vr5 + vst $vr10, $a4, 80 + vadd.w $vr10, $vr9, $vr6 + vst $vr10, $a4, 96 + vadd.w $vr10, $vr9, $vr7 + vst $vr10, $a4, 112 + vadd.w $vr9, $vr9, $vr8 + vst $vr9, $a4, 128 addi.d $a0, $a0, 144 addi.d $a2, $a2, 1 - bne $a0, $a7, .LBB0_39 + bne $a0, $a3, .LBB0_39 # %bb.40: # %.preheader325.preheader - pcalau12i $a0, %pc_hi20(.LCPI0_36) - vld $vr0, $a0, %pc_lo12(.LCPI0_36) pcalau12i $a0, %pc_hi20(.LCPI0_37) - vld $vr1, $a0, %pc_lo12(.LCPI0_37) + vld $vr0, $a0, %pc_lo12(.LCPI0_37) + pcalau12i $a0, %pc_hi20(.LCPI0_38) + vld $vr1, $a0, %pc_lo12(.LCPI0_38) vst $vr0, $a1, 720 vst $vr1, $a1, 736 - pcalau12i $a0, %pc_hi20(.LCPI0_38) - vld $vr0, $a0, %pc_lo12(.LCPI0_38) pcalau12i $a0, %pc_hi20(.LCPI0_39) - vld $vr1, $a0, %pc_lo12(.LCPI0_39) + vld $vr0, $a0, %pc_lo12(.LCPI0_39) pcalau12i $a0, %pc_hi20(.LCPI0_40) - vld $vr2, $a0, %pc_lo12(.LCPI0_40) + vld $vr1, $a0, %pc_lo12(.LCPI0_40) pcalau12i $a0, %pc_hi20(.LCPI0_41) - vld $vr3, $a0, %pc_lo12(.LCPI0_41) + vld $vr2, $a0, %pc_lo12(.LCPI0_41) + pcalau12i $a0, %pc_hi20(.LCPI0_42) + vld $vr3, $a0, %pc_lo12(.LCPI0_42) vst $vr0, $a1, 752 vst $vr1, $a1, 768 vst $vr2, $a1, 784 vst $vr3, $a1, 800 - pcalau12i $a0, %pc_hi20(.LCPI0_42) - vld $vr0, $a0, %pc_lo12(.LCPI0_42) pcalau12i $a0, %pc_hi20(.LCPI0_43) - vld $vr1, $a0, %pc_lo12(.LCPI0_43) + vld $vr0, $a0, %pc_lo12(.LCPI0_43) pcalau12i $a0, %pc_hi20(.LCPI0_44) - vld $vr2, $a0, %pc_lo12(.LCPI0_44) + vld $vr1, $a0, %pc_lo12(.LCPI0_44) pcalau12i $a0, %pc_hi20(.LCPI0_45) - vld $vr3, $a0, %pc_lo12(.LCPI0_45) + vld $vr2, $a0, %pc_lo12(.LCPI0_45) + pcalau12i $a0, %pc_hi20(.LCPI0_46) + vld $vr3, $a0, %pc_lo12(.LCPI0_46) vst $vr0, $a1, 816 vst $vr1, $a1, 832 vst $vr2, $a1, 848 vst $vr3, $a1, 864 - pcalau12i $a0, %pc_hi20(.LCPI0_46) - vld $vr0, $a0, %pc_lo12(.LCPI0_46) pcalau12i $a0, %pc_hi20(.LCPI0_47) - vld $vr1, $a0, %pc_lo12(.LCPI0_47) + vld $vr0, $a0, %pc_lo12(.LCPI0_47) pcalau12i $a0, %pc_hi20(.LCPI0_48) - vld $vr2, $a0, %pc_lo12(.LCPI0_48) + vld $vr1, $a0, %pc_lo12(.LCPI0_48) pcalau12i $a0, %pc_hi20(.LCPI0_49) - vld $vr3, $a0, %pc_lo12(.LCPI0_49) + vld $vr2, $a0, %pc_lo12(.LCPI0_49) + pcalau12i $a0, %pc_hi20(.LCPI0_50) + vld $vr3, $a0, %pc_lo12(.LCPI0_50) vst $vr0, $a1, 880 vst $vr1, $a1, 896 vst $vr2, $a1, 912 vst $vr3, $a1, 928 - pcalau12i $a0, %pc_hi20(.LCPI0_50) - vld $vr0, $a0, %pc_lo12(.LCPI0_50) pcalau12i $a0, %pc_hi20(.LCPI0_51) - vld $vr1, $a0, %pc_lo12(.LCPI0_51) + vld $vr0, $a0, %pc_lo12(.LCPI0_51) pcalau12i $a0, %pc_hi20(.LCPI0_52) - vld $vr2, $a0, %pc_lo12(.LCPI0_52) + vld $vr1, $a0, %pc_lo12(.LCPI0_52) pcalau12i $a0, %pc_hi20(.LCPI0_53) - vld $vr3, $a0, %pc_lo12(.LCPI0_53) + vld $vr2, $a0, %pc_lo12(.LCPI0_53) + pcalau12i $a0, %pc_hi20(.LCPI0_54) + vld $vr3, $a0, %pc_lo12(.LCPI0_54) vst $vr0, $a1, 944 vst $vr1, $a1, 960 vst $vr2, $a1, 976 @@ -1878,26 +1874,26 @@ init_layer3: # @init_layer3 addi.d $a0, $a0, %pc_lo12(n_slen2) move $a2, $zero move $a3, $zero - pcalau12i $a4, %pc_hi20(.LCPI0_54) - vld $vr0, $a4, %pc_lo12(.LCPI0_54) pcalau12i $a4, %pc_hi20(.LCPI0_55) - vld $vr1, $a4, %pc_lo12(.LCPI0_55) + vld $vr0, $a4, %pc_lo12(.LCPI0_55) pcalau12i $a4, %pc_hi20(.LCPI0_56) - vld $vr2, $a4, %pc_lo12(.LCPI0_56) + vld $vr1, $a4, %pc_lo12(.LCPI0_56) + pcalau12i $a4, %pc_hi20(.LCPI0_57) + vld $vr2, $a4, %pc_lo12(.LCPI0_57) vst $vr3, $a0, 2000 vst $vr0, $a1, 992 vst $vr1, $a0, 2016 vst $vr2, $a1, 1008 - pcalau12i $a1, %pc_hi20(.LCPI0_57) - vld $vr4, $a1, %pc_lo12(.LCPI0_57) pcalau12i $a1, %pc_hi20(.LCPI0_58) - vld $vr0, $a1, %pc_lo12(.LCPI0_58) + vld $vr4, $a1, %pc_lo12(.LCPI0_58) pcalau12i $a1, %pc_hi20(.LCPI0_59) - vld $vr1, $a1, %pc_lo12(.LCPI0_59) + vld $vr0, $a1, %pc_lo12(.LCPI0_59) pcalau12i $a1, %pc_hi20(.LCPI0_60) - vld $vr2, $a1, %pc_lo12(.LCPI0_60) + vld $vr1, $a1, %pc_lo12(.LCPI0_60) pcalau12i $a1, %pc_hi20(.LCPI0_61) - vld $vr3, $a1, %pc_lo12(.LCPI0_61) + vld $vr2, $a1, %pc_lo12(.LCPI0_61) + pcalau12i $a1, %pc_hi20(.LCPI0_62) + vld $vr3, $a1, %pc_lo12(.LCPI0_62) vst $vr4, $a0, 2032 addi.d $a1, $a0, 160 ori $a4, $zero, 5 @@ -1905,131 +1901,131 @@ init_layer3: # @init_layer3 .LBB0_41: # %.preheader320 # =>This Inner Loop Header: Depth=1 vreplgr2vr.w $vr4, $a2 - vadd.w $vr5, $vr4, $vr0 - vadd.w $vr4, $vr4, $vr1 - vst $vr4, $a1, -144 - vst $vr5, $a1, -160 + vadd.w $vr4, $vr4, $vr0 + vst $vr4, $a1, -160 vreplgr2vr.w $vr4, $a3 + vadd.w $vr5, $vr4, $vr1 + vst $vr5, $a1, -144 vadd.w $vr5, $vr4, $vr2 + vst $vr5, $a1, -128 vadd.w $vr4, $vr4, $vr3 vst $vr4, $a1, -112 - vst $vr5, $a1, -128 addi.d $a5, $a3, 8 vreplgr2vr.w $vr4, $a5 vadd.w $vr5, $vr4, $vr0 - vadd.w $vr6, $vr4, $vr1 - vst $vr6, $a1, -80 vst $vr5, $a1, -96 + vadd.w $vr5, $vr4, $vr1 + vst $vr5, $a1, -80 vadd.w $vr5, $vr4, $vr2 + vst $vr5, $a1, -64 vadd.w $vr4, $vr4, $vr3 vst $vr4, $a1, -48 - vst $vr5, $a1, -64 addi.d $a5, $a3, 16 vreplgr2vr.w $vr4, $a5 vadd.w $vr5, $vr4, $vr0 - vadd.w $vr6, $vr4, $vr1 - vst $vr6, $a1, -16 vst $vr5, $a1, -32 + vadd.w $vr5, $vr4, $vr1 + vst $vr5, $a1, -16 vadd.w $vr5, $vr4, $vr2 + vst $vr5, $a1, 0 vadd.w $vr4, $vr4, $vr3 vst $vr4, $a1, 16 - vst $vr5, $a1, 0 addi.d $a5, $a3, 24 vreplgr2vr.w $vr4, $a5 vadd.w $vr5, $vr4, $vr0 - vadd.w $vr6, $vr4, $vr1 - vst $vr6, $a1, 48 vst $vr5, $a1, 32 + vadd.w $vr5, $vr4, $vr1 + vst $vr5, $a1, 48 vadd.w $vr5, $vr4, $vr2 + vst $vr5, $a1, 64 vadd.w $vr4, $vr4, $vr3 vst $vr4, $a1, 80 - vst $vr5, $a1, 64 addi.d $a5, $a3, 32 vreplgr2vr.w $vr4, $a5 vadd.w $vr5, $vr4, $vr0 - vadd.w $vr6, $vr4, $vr1 - vst $vr6, $a1, 112 vst $vr5, $a1, 96 + vadd.w $vr5, $vr4, $vr1 + vst $vr5, $a1, 112 vadd.w $vr5, $vr4, $vr2 + vst $vr5, $a1, 128 vadd.w $vr4, $vr4, $vr3 vst $vr4, $a1, 144 - vst $vr5, $a1, 128 addi.d $a3, $a3, 1 addi.d $a2, $a2, 1 addi.d $a1, $a1, 320 bne $a3, $a4, .LBB0_41 # %bb.42: # %.preheader316.preheader - pcalau12i $a1, %pc_hi20(.LCPI0_62) - vld $vr0, $a1, %pc_lo12(.LCPI0_62) - vst $vr0, $a0, 1600 pcalau12i $a1, %pc_hi20(.LCPI0_63) vld $vr0, $a1, %pc_lo12(.LCPI0_63) + vst $vr0, $a0, 1600 pcalau12i $a1, %pc_hi20(.LCPI0_64) - vld $vr1, $a1, %pc_lo12(.LCPI0_64) + vld $vr0, $a1, %pc_lo12(.LCPI0_64) pcalau12i $a1, %pc_hi20(.LCPI0_65) - vld $vr2, $a1, %pc_lo12(.LCPI0_65) + vld $vr1, $a1, %pc_lo12(.LCPI0_65) pcalau12i $a1, %pc_hi20(.LCPI0_66) - vld $vr3, $a1, %pc_lo12(.LCPI0_66) + vld $vr2, $a1, %pc_lo12(.LCPI0_66) + pcalau12i $a1, %pc_hi20(.LCPI0_67) + vld $vr3, $a1, %pc_lo12(.LCPI0_67) vst $vr0, $a0, 1616 vst $vr1, $a0, 1632 vst $vr2, $a0, 1648 vst $vr3, $a0, 1664 - pcalau12i $a1, %pc_hi20(.LCPI0_67) - vld $vr0, $a1, %pc_lo12(.LCPI0_67) pcalau12i $a1, %pc_hi20(.LCPI0_68) - vld $vr1, $a1, %pc_lo12(.LCPI0_68) + vld $vr0, $a1, %pc_lo12(.LCPI0_68) pcalau12i $a1, %pc_hi20(.LCPI0_69) - vld $vr2, $a1, %pc_lo12(.LCPI0_69) + vld $vr1, $a1, %pc_lo12(.LCPI0_69) pcalau12i $a1, %pc_hi20(.LCPI0_70) - vld $vr3, $a1, %pc_lo12(.LCPI0_70) + vld $vr2, $a1, %pc_lo12(.LCPI0_70) + pcalau12i $a1, %pc_hi20(.LCPI0_71) + vld $vr3, $a1, %pc_lo12(.LCPI0_71) vst $vr0, $a0, 1680 vst $vr1, $a0, 1696 vst $vr2, $a0, 1712 vst $vr3, $a0, 1728 - pcalau12i $a1, %pc_hi20(.LCPI0_71) - vld $vr0, $a1, %pc_lo12(.LCPI0_71) pcalau12i $a1, %pc_hi20(.LCPI0_72) - vld $vr1, $a1, %pc_lo12(.LCPI0_72) + vld $vr0, $a1, %pc_lo12(.LCPI0_72) pcalau12i $a1, %pc_hi20(.LCPI0_73) - vld $vr2, $a1, %pc_lo12(.LCPI0_73) + vld $vr1, $a1, %pc_lo12(.LCPI0_73) pcalau12i $a1, %pc_hi20(.LCPI0_74) - vld $vr3, $a1, %pc_lo12(.LCPI0_74) + vld $vr2, $a1, %pc_lo12(.LCPI0_74) + pcalau12i $a1, %pc_hi20(.LCPI0_75) + vld $vr3, $a1, %pc_lo12(.LCPI0_75) vst $vr0, $a0, 1744 vst $vr1, $a0, 1760 vst $vr2, $a0, 1776 vst $vr3, $a0, 1792 - pcalau12i $a1, %pc_hi20(.LCPI0_75) - vld $vr0, $a1, %pc_lo12(.LCPI0_75) pcalau12i $a1, %pc_hi20(.LCPI0_76) - vld $vr1, $a1, %pc_lo12(.LCPI0_76) + vld $vr0, $a1, %pc_lo12(.LCPI0_76) pcalau12i $a1, %pc_hi20(.LCPI0_77) - vld $vr2, $a1, %pc_lo12(.LCPI0_77) + vld $vr1, $a1, %pc_lo12(.LCPI0_77) pcalau12i $a1, %pc_hi20(.LCPI0_78) - vld $vr3, $a1, %pc_lo12(.LCPI0_78) + vld $vr2, $a1, %pc_lo12(.LCPI0_78) + pcalau12i $a1, %pc_hi20(.LCPI0_79) + vld $vr3, $a1, %pc_lo12(.LCPI0_79) vst $vr0, $a0, 1808 vst $vr1, $a0, 1824 vst $vr2, $a0, 1840 vst $vr3, $a0, 1856 - pcalau12i $a1, %pc_hi20(.LCPI0_79) - vld $vr0, $a1, %pc_lo12(.LCPI0_79) pcalau12i $a1, %pc_hi20(.LCPI0_80) - vld $vr1, $a1, %pc_lo12(.LCPI0_80) + vld $vr0, $a1, %pc_lo12(.LCPI0_80) pcalau12i $a1, %pc_hi20(.LCPI0_81) - vld $vr2, $a1, %pc_lo12(.LCPI0_81) + vld $vr1, $a1, %pc_lo12(.LCPI0_81) pcalau12i $a1, %pc_hi20(.LCPI0_82) - vld $vr3, $a1, %pc_lo12(.LCPI0_82) + vld $vr2, $a1, %pc_lo12(.LCPI0_82) + pcalau12i $a1, %pc_hi20(.LCPI0_83) + vld $vr3, $a1, %pc_lo12(.LCPI0_83) vst $vr0, $a0, 1872 vst $vr1, $a0, 1888 vst $vr2, $a0, 1904 vst $vr3, $a0, 1920 - pcalau12i $a1, %pc_hi20(.LCPI0_83) - vld $vr0, $a1, %pc_lo12(.LCPI0_83) pcalau12i $a1, %pc_hi20(.LCPI0_84) - vld $vr1, $a1, %pc_lo12(.LCPI0_84) + vld $vr0, $a1, %pc_lo12(.LCPI0_84) pcalau12i $a1, %pc_hi20(.LCPI0_85) - vld $vr2, $a1, %pc_lo12(.LCPI0_85) + vld $vr1, $a1, %pc_lo12(.LCPI0_85) pcalau12i $a1, %pc_hi20(.LCPI0_86) - vld $vr3, $a1, %pc_lo12(.LCPI0_86) + vld $vr2, $a1, %pc_lo12(.LCPI0_86) + pcalau12i $a1, %pc_hi20(.LCPI0_87) + vld $vr3, $a1, %pc_lo12(.LCPI0_87) vst $vr0, $a0, 1936 vst $vr1, $a0, 1952 vst $vr2, $a0, 1968 diff --git a/results/MultiSource/Benchmarks/MiBench/consumer-lame/CMakeFiles/consumer-lame.dir/newmdct.s b/results/MultiSource/Benchmarks/MiBench/consumer-lame/CMakeFiles/consumer-lame.dir/newmdct.s index 3afbd69b..08ae1d39 100644 --- a/results/MultiSource/Benchmarks/MiBench/consumer-lame/CMakeFiles/consumer-lame.dir/newmdct.s +++ b/results/MultiSource/Benchmarks/MiBench/consumer-lame/CMakeFiles/consumer-lame.dir/newmdct.s @@ -1770,111 +1770,108 @@ mdct_init48: # @mdct_init48 vst $vr2, $s0, 80 vst $vr3, $s0, 96 pcalau12i $a0, %pc_hi20(enwindow) - addi.d $a1, $a0, %pc_lo12(enwindow) - fld.d $fa4, $a1, 1984 - fld.d $fa0, $a1, 0 - move $a0, $zero + addi.d $a0, $a0, %pc_lo12(enwindow) + fld.d $fa4, $a0, 1984 + fld.d $fa0, $a0, 0 + move $a1, $zero fdiv.d $fa1, $fa0, $fa4 - vld $vr2, $a1, 8 - vld $vr3, $a1, 24 + vld $vr2, $a0, 8 fst.d $fa1, $sp, 264 vreplvei.d $vr1, $vr0, 0 + vld $vr3, $a0, 24 vfdiv.d $vr2, $vr2, $vr1 - vfdiv.d $vr1, $vr3, $vr1 - fld.d $fa3, $a1, 40 - vst $vr1, $a1, 16 - vst $vr2, $a1, 0 - fld.d $fa1, $a1, 48 - fdiv.d $fa2, $fa3, $fa0 - fst.d $fa2, $a1, 32 - fld.d $fa2, $a1, 56 - fdiv.d $fa1, $fa1, $fa0 - fst.d $fa1, $a1, 40 - addi.d $a6, $a1, 56 - fdiv.d $fa0, $fa2, $fa0 - fst.d $fa0, $a1, 48 + vst $vr2, $a0, 0 + vld $vr2, $a0, 40 + vfdiv.d $vr3, $vr3, $vr1 + vst $vr3, $a0, 16 + fld.d $fa3, $a0, 56 + vfdiv.d $vr1, $vr2, $vr1 + vst $vr1, $a0, 32 + addi.d $a6, $a0, 56 + fdiv.d $fa0, $fa3, $fa0 + fst.d $fa0, $a0, 48 addi.d $a3, $sp, 256 - addi.d $a1, $a1, 120 + addi.d $a0, $a0, 120 addi.d $a4, $sp, 272 ori $a5, $zero, 120 .p2align 4, , 16 .LBB1_5: # %.preheader198 # =>This Inner Loop Header: Depth=1 - fld.d $fa0, $a1, -56 - fld.d $fa1, $a1, -48 + fld.d $fa0, $a0, -56 + fld.d $fa1, $a0, -48 move $a2, $a6 fdiv.d $fa1, $fa1, $fa0 fst.d $fa1, $a6, 0 - fld.d $fa1, $a1, -40 + fld.d $fa1, $a0, -40 fdiv.d $fa1, $fa1, $fa0 fst.d $fa1, $a6, 8 - fld.d $fa1, $a1, -32 + fld.d $fa1, $a0, -32 fdiv.d $fa1, $fa1, $fa0 fst.d $fa1, $a6, 16 - fld.d $fa1, $a1, -24 + fld.d $fa1, $a0, -24 fdiv.d $fa1, $fa1, $fa0 fst.d $fa1, $a6, 24 - fld.d $fa1, $a1, -16 + fld.d $fa1, $a0, -16 fdiv.d $fa1, $fa1, $fa0 fst.d $fa1, $a6, 32 - fld.d $fa1, $a1, -8 + fld.d $fa1, $a0, -8 fdiv.d $fa1, $fa1, $fa0 fst.d $fa1, $a6, 40 - fld.d $fa1, $a1, 0 + fld.d $fa1, $a0, 0 fdiv.d $fa1, $fa1, $fa0 fst.d $fa1, $a6, 48 - fld.d $fa1, $a1, 8 + fld.d $fa1, $a0, 8 fdiv.d $fa1, $fa1, $fa0 fst.d $fa1, $a6, 56 - fld.d $fa1, $a1, 16 + fld.d $fa1, $a0, 16 fdiv.d $fa1, $fa1, $fa0 fst.d $fa1, $a6, 64 - fld.d $fa1, $a1, 24 + fld.d $fa1, $a0, 24 fdiv.d $fa1, $fa1, $fa0 fst.d $fa1, $a6, 72 - fld.d $fa1, $a1, 32 + fld.d $fa1, $a0, 32 fdiv.d $fa1, $fa1, $fa0 fst.d $fa1, $a6, 80 - fld.d $fa1, $a1, 40 + fld.d $fa1, $a0, 40 fdiv.d $fa1, $fa1, $fa0 fst.d $fa1, $a6, 88 - fld.d $fa1, $a1, 48 + fld.d $fa1, $a0, 48 fdiv.d $fa1, $fa1, $fa0 fst.d $fa1, $a6, 96 - fld.d $fa1, $a1, 56 + fld.d $fa1, $a0, 56 fdiv.d $fa1, $fa1, $fa0 fst.d $fa1, $a6, 104 - fld.d $fa1, $a1, 64 + fld.d $fa1, $a0, 64 fdiv.d $fa2, $fa0, $fa4 - fstx.d $fa2, $a4, $a0 + fstx.d $fa2, $a4, $a1 fst.d $fa2, $a3, 0 fdiv.d $fa0, $fa1, $fa0 fst.d $fa0, $a6, 112 - addi.d $a0, $a0, 8 + addi.d $a1, $a1, 8 addi.d $a3, $a3, -8 - addi.d $a1, $a1, 128 + addi.d $a0, $a0, 128 addi.d $a6, $a6, 120 - bne $a0, $a5, .LBB1_5 + bne $a1, $a5, .LBB1_5 # %bb.6: # %.preheader197.preheader - fld.d $fa0, $a1, -48 + fld.d $fa0, $a0, -48 fdiv.d $fa0, $fa0, $fa4 fst.d $fa0, $a2, 120 - fld.d $fa0, $a1, -40 + fld.d $fa0, $a0, -40 fdiv.d $fa0, $fa0, $fa4 fst.d $fa0, $a2, 128 - fld.d $fa0, $a1, -32 + fld.d $fa0, $a0, -32 fdiv.d $fa0, $fa0, $fa4 fst.d $fa0, $a2, 136 - fld.d $fa0, $a1, -24 + fld.d $fa0, $a0, -24 fdiv.d $fa0, $fa0, $fa4 fst.d $fa0, $a2, 144 - fld.d $fa0, $a1, -16 + fld.d $fa0, $a0, -16 fdiv.d $fa0, $fa0, $fa4 fst.d $fa0, $a2, 152 - fld.d $fa0, $a1, -8 + fld.d $fa0, $a0, -8 fdiv.d $fa0, $fa0, $fa4 fst.d $fa0, $a2, 160 - fld.d $fa0, $a1, 0 + fld.d $fa0, $a0, 0 vst $vr4, $sp, 128 # 16-byte Folded Spill fdiv.d $fa0, $fa0, $fa4 fst.d $fa0, $a2, 168 @@ -2682,7 +2679,7 @@ all: .type enwindow,@object # @enwindow .data - .p2align 5, 0x0 + .p2align 4, 0x0 enwindow: .dword 0x3fa251e002c5be4c # double 0.035780907000000001 .dword 0x3f924e1ffc2760f6 # double 0.017876148000000001 diff --git a/results/MultiSource/Benchmarks/MiBench/consumer-lame/CMakeFiles/consumer-lame.dir/psymodel.s b/results/MultiSource/Benchmarks/MiBench/consumer-lame/CMakeFiles/consumer-lame.dir/psymodel.s index 7d552932..3d9d4c48 100644 --- a/results/MultiSource/Benchmarks/MiBench/consumer-lame/CMakeFiles/consumer-lame.dir/psymodel.s +++ b/results/MultiSource/Benchmarks/MiBench/consumer-lame/CMakeFiles/consumer-lame.dir/psymodel.s @@ -1292,28 +1292,26 @@ L3psycho_anal: # @L3psycho_anal bge $s5, $a2, .LBB0_124 # %bb.110: # %.lr.ph1039 # in Loop: Header=BB0_78 Depth=1 - alsl.d $a0, $s5, $s3, 2 - addi.d $s8, $a0, 8 + alsl.d $s8, $s5, $s3, 2 addi.d $a0, $s5, 2 andi $a0, $a0, 12 sub.d $a1, $t6, $a0 ld.d $a2, $sp, 440 # 8-byte Folded Reload - addi.d $a2, $a2, 2047 - addi.d $a2, $a2, 1 - add.d $s7, $a2, $a1 + add.d $a1, $a2, $a1 + addi.d $a1, $a1, 2047 + addi.d $s7, $a1, 1 + add.d $a1, $t6, $a0 + add.d $a1, $a2, $a1 + addi.d $s2, $a1, 1024 pcalau12i $a1, %pc_hi20(L3psycho_anal.energy_s) addi.d $a1, $a1, %pc_lo12(L3psycho_anal.energy_s) - add.d $a1, $a1, $a0 - addi.d $s2, $a1, 516 - add.d $a0, $t6, $a0 - add.d $s1, $a2, $a0 + add.d $a0, $a1, $a0 + addi.d $s1, $a0, 516 b .LBB0_112 .p2align 4, , 16 .LBB0_111: # in Loop: Header=BB0_112 Depth=2 - fst.s $fa0, $s8, -8 - fst.s $fa0, $s8, 4 - fst.s $fa0, $s8, 0 - fst.s $fa0, $s8, -4 + vreplvei.w $vr0, $vr0, 0 + vst $vr0, $s8, 0 addi.d $s5, $s5, 4 ld.d $a0, $sp, 504 # 8-byte Folded Reload ld.w $a2, $a0, %pc_lo12(L3psycho_anal.cw_upper_index) @@ -1324,12 +1322,12 @@ L3psycho_anal: # @L3psycho_anal bge $s5, $a2, .LBB0_124 .LBB0_112: # Parent Loop BB0_78 Depth=1 # => This Inner Loop Header: Depth=2 - fld.s $fa5, $s2, -516 + fld.s $fa5, $s1, -516 movgr2fr.w $fs4, $zero fcmp.ceq.s $fcc0, $fa5, $fs4 bcnez $fcc0, .LBB0_115 # %bb.113: # in Loop: Header=BB0_112 Depth=2 - fld.s $fa0, $s1, -2048 + fld.s $fa0, $s2, -1024 fld.s $fa1, $s7, -1024 fmul.s $ft2, $fa0, $fa1 fneg.s $fa2, $fa1 @@ -1339,7 +1337,7 @@ L3psycho_anal: # @L3psycho_anal fcmp.cor.s $fcc0, $fs0, $fs0 fmul.s $fs3, $fa0, $fa6 bcnez $fcc0, .LBB0_116 -# %bb.114: # %call.sqrt1988 +# %bb.114: # %call.sqrt1985 # in Loop: Header=BB0_112 Depth=2 fmov.s $fa0, $fa5 move $s4, $t4 @@ -1364,16 +1362,16 @@ L3psycho_anal: # @L3psycho_anal fmov.s $fs3, $fs4 vldi $vr5, -1168 .LBB0_116: # in Loop: Header=BB0_112 Depth=2 - fld.s $fs1, $s2, 516 + fld.s $fs1, $s1, 516 fcmp.ceq.s $fcc0, $fs1, $fs4 bcnez $fcc0, .LBB0_119 # %bb.117: # in Loop: Header=BB0_112 Depth=2 - fld.s $fs4, $s1, 0 + fld.s $fs4, $s2, 1024 fld.s $fs2, $s7, 1024 fsqrt.s $fa0, $fs1 fcmp.cor.s $fcc0, $fa0, $fa0 bceqz $fcc0, .LBB0_123 -.LBB0_118: # %.split1989 +.LBB0_118: # %.split1986 # in Loop: Header=BB0_112 Depth=2 fadd.s $fa1, $ft2, $fs3 fadd.s $fa2, $fs4, $fs2 @@ -1387,11 +1385,11 @@ L3psycho_anal: # @L3psycho_anal fmov.s $fs1, $fa0 fmov.s $fs3, $fa2 .LBB0_119: # in Loop: Header=BB0_112 Depth=2 - fld.s $fa1, $s2, 0 + fld.s $fa1, $s1, 0 fsqrt.s $fa0, $fa1 fcmp.cor.s $fcc0, $fa0, $fa0 bceqz $fcc0, .LBB0_122 -.LBB0_120: # %.split1991 +.LBB0_120: # %.split1988 # in Loop: Header=BB0_112 Depth=2 fmsub.s $fa1, $fs0, $ft1, $fs1 fabs.s $fa2, $fa1 @@ -1400,7 +1398,7 @@ L3psycho_anal: # @L3psycho_anal fcmp.ceq.s $fcc0, $fa0, $fa2 bcnez $fcc0, .LBB0_111 # %bb.121: # in Loop: Header=BB0_112 Depth=2 - fld.s $fa2, $s1, -1024 + fld.s $fa2, $s2, 0 fld.s $fa3, $s7, 0 fadd.s $fa4, $fa2, $fa3 fneg.s $fa1, $fa1 @@ -1418,7 +1416,7 @@ L3psycho_anal: # @L3psycho_anal fdiv.d $fa0, $fa1, $fa0 fcvt.s.d $fa0, $fa0 b .LBB0_111 -.LBB0_122: # %call.sqrt1992 +.LBB0_122: # %call.sqrt1989 # in Loop: Header=BB0_112 Depth=2 fmov.s $fa0, $fa1 move $s4, $t4 @@ -1435,7 +1433,7 @@ L3psycho_anal: # @L3psycho_anal ld.d $t1, $sp, 472 # 8-byte Folded Reload ld.d $t0, $sp, 496 # 8-byte Folded Reload b .LBB0_120 -.LBB0_123: # %call.sqrt1990 +.LBB0_123: # %call.sqrt1987 # in Loop: Header=BB0_112 Depth=2 fmov.s $fa0, $fs1 move $s4, $t4 diff --git a/results/MultiSource/Benchmarks/MiBench/consumer-lame/CMakeFiles/consumer-lame.dir/quantize-pvt.s b/results/MultiSource/Benchmarks/MiBench/consumer-lame/CMakeFiles/consumer-lame.dir/quantize-pvt.s index e2fec85a..ebd201c5 100644 --- a/results/MultiSource/Benchmarks/MiBench/consumer-lame/CMakeFiles/consumer-lame.dir/quantize-pvt.s +++ b/results/MultiSource/Benchmarks/MiBench/consumer-lame/CMakeFiles/consumer-lame.dir/quantize-pvt.s @@ -2719,35 +2719,10 @@ quantize_xrpow: # @quantize_xrpow ori $a4, $zero, 1 .p2align 4, , 16 .LBB12_1: # =>This Inner Loop Header: Depth=1 - vld $vr3, $a0, 0 - vld $vr1, $a0, 48 - vld $vr2, $a0, 32 - vld $vr4, $a0, 16 - addi.d $a0, $a0, 64 + vld $vr1, $a0, 16 + vld $vr2, $a0, 0 vfmul.d $vr1, $vr0, $vr1 vfmul.d $vr2, $vr0, $vr2 - vfmul.d $vr7, $vr0, $vr4 - vfmul.d $vr8, $vr0, $vr3 - vreplvei.d $vr3, $vr8, 1 - ftintrz.w.d $fa3, $fa3 - movfr2gr.s $a5, $fa3 - vreplvei.d $vr3, $vr8, 0 - ftintrz.w.d $fa3, $fa3 - movfr2gr.s $a6, $fa3 - slli.d $a6, $a6, 3 - fldx.d $ft1, $a3, $a6 - slli.d $a5, $a5, 3 - fldx.d $ft2, $a3, $a5 - vreplvei.d $vr3, $vr7, 1 - ftintrz.w.d $fa3, $fa3 - movfr2gr.s $a5, $fa3 - vreplvei.d $vr3, $vr7, 0 - ftintrz.w.d $fa3, $fa3 - movfr2gr.s $a6, $fa3 - slli.d $a6, $a6, 3 - fldx.d $ft3, $a3, $a6 - slli.d $a5, $a5, 3 - fldx.d $ft4, $a3, $a5 vreplvei.d $vr3, $vr2, 1 ftintrz.w.d $fa3, $fa3 movfr2gr.s $a5, $fa3 @@ -2768,51 +2743,75 @@ quantize_xrpow: # @quantize_xrpow fldx.d $fa5, $a3, $a6 slli.d $a5, $a5, 3 fldx.d $fa6, $a3, $a5 - vextrins.d $vr9, $vr10, 16 - vextrins.d $vr11, $vr12, 16 - vfadd.d $vr7, $vr7, $vr11 - vfadd.d $vr8, $vr8, $vr9 - vreplvei.d $vr9, $vr8, 0 - ftintrz.w.d $ft1, $ft1 - movfr2gr.s $a5, $ft1 - vinsgr2vr.w $vr9, $a5, 0 - vreplvei.d $vr8, $vr8, 1 - ftintrz.w.d $ft0, $ft0 - movfr2gr.s $a5, $ft0 - vinsgr2vr.w $vr9, $a5, 1 - vreplvei.d $vr8, $vr7, 0 - ftintrz.w.d $ft0, $ft0 - movfr2gr.s $a5, $ft0 - vinsgr2vr.w $vr9, $a5, 2 - vreplvei.d $vr7, $vr7, 1 - ftintrz.w.d $fa7, $fa7 - movfr2gr.s $a5, $fa7 - vinsgr2vr.w $vr9, $a5, 3 - vst $vr9, $a1, 0 - addi.d $a5, $a1, 32 vextrins.d $vr3, $vr4, 16 vextrins.d $vr5, $vr6, 16 vfadd.d $vr1, $vr1, $vr5 vfadd.d $vr2, $vr2, $vr3 vreplvei.d $vr3, $vr2, 0 ftintrz.w.d $fa3, $fa3 - movfr2gr.s $a6, $fa3 - vinsgr2vr.w $vr3, $a6, 0 + movfr2gr.s $a5, $fa3 + vinsgr2vr.w $vr3, $a5, 0 vreplvei.d $vr2, $vr2, 1 ftintrz.w.d $fa2, $fa2 - movfr2gr.s $a6, $fa2 - vinsgr2vr.w $vr3, $a6, 1 + movfr2gr.s $a5, $fa2 + vinsgr2vr.w $vr3, $a5, 1 vreplvei.d $vr2, $vr1, 0 ftintrz.w.d $fa2, $fa2 - movfr2gr.s $a6, $fa2 - vinsgr2vr.w $vr3, $a6, 2 + movfr2gr.s $a5, $fa2 + vinsgr2vr.w $vr3, $a5, 2 vreplvei.d $vr1, $vr1, 1 + vld $vr2, $a0, 48 + vld $vr4, $a0, 32 ftintrz.w.d $fa1, $fa1 - movfr2gr.s $a6, $fa1 - vinsgr2vr.w $vr3, $a6, 3 - addi.w $a2, $a2, -1 + movfr2gr.s $a5, $fa1 + vfmul.d $vr1, $vr0, $vr2 + vfmul.d $vr2, $vr0, $vr4 + vreplvei.d $vr4, $vr2, 1 + ftintrz.w.d $fa4, $fa4 + vreplvei.d $vr5, $vr2, 0 + ftintrz.w.d $fa5, $fa5 + movfr2gr.s $a6, $fa5 + slli.d $a6, $a6, 3 + fldx.d $fa5, $a3, $a6 + movfr2gr.s $a6, $fa4 + slli.d $a6, $a6, 3 + fldx.d $fa4, $a3, $a6 + vreplvei.d $vr6, $vr1, 1 + ftintrz.w.d $fa6, $fa6 + vreplvei.d $vr7, $vr1, 0 + ftintrz.w.d $fa7, $fa7 + movfr2gr.s $a6, $fa7 + slli.d $a6, $a6, 3 + fldx.d $fa7, $a3, $a6 + movfr2gr.s $a6, $fa6 + slli.d $a6, $a6, 3 + fldx.d $fa6, $a3, $a6 + vinsgr2vr.w $vr3, $a5, 3 + vst $vr3, $a1, 0 + vextrins.d $vr5, $vr4, 16 + vextrins.d $vr7, $vr6, 16 + vfadd.d $vr1, $vr1, $vr7 + vfadd.d $vr2, $vr2, $vr5 + vreplvei.d $vr3, $vr2, 0 + ftintrz.w.d $fa3, $fa3 + movfr2gr.s $a5, $fa3 + vinsgr2vr.w $vr3, $a5, 0 + vreplvei.d $vr2, $vr2, 1 + ftintrz.w.d $fa2, $fa2 + movfr2gr.s $a5, $fa2 + vinsgr2vr.w $vr3, $a5, 1 + vreplvei.d $vr2, $vr1, 0 + ftintrz.w.d $fa2, $fa2 + movfr2gr.s $a5, $fa2 + vinsgr2vr.w $vr3, $a5, 2 + vreplvei.d $vr1, $vr1, 1 + ftintrz.w.d $fa1, $fa1 + movfr2gr.s $a5, $fa1 + vinsgr2vr.w $vr3, $a5, 3 vst $vr3, $a1, 16 - move $a1, $a5 + addi.w $a2, $a2, -1 + addi.d $a0, $a0, 64 + addi.d $a1, $a1, 32 bltu $a4, $a2, .LBB12_1 # %bb.2: ret diff --git a/results/MultiSource/Benchmarks/MiBench/consumer-lame/CMakeFiles/consumer-lame.dir/vbrquantize.s b/results/MultiSource/Benchmarks/MiBench/consumer-lame/CMakeFiles/consumer-lame.dir/vbrquantize.s index a1e38f1b..7acb3f2d 100644 --- a/results/MultiSource/Benchmarks/MiBench/consumer-lame/CMakeFiles/consumer-lame.dir/vbrquantize.s +++ b/results/MultiSource/Benchmarks/MiBench/consumer-lame/CMakeFiles/consumer-lame.dir/vbrquantize.s @@ -998,168 +998,168 @@ VBR_iteration_loop_new: # @VBR_iteration_loop_new # %bb.20: # %.preheader.preheader # in Loop: Header=BB4_10 Depth=2 lu12i.w $a1, 1 - ori $a1, $a1, 912 + ori $a1, $a1, 896 add.d $a1, $sp, $a1 vld $vr1, $a1, 0 + vreplvei.d $vr0, $vr4, 0 lu12i.w $a1, 1 - ori $a1, $a1, 896 + ori $a1, $a1, 912 add.d $a1, $sp, $a1 vld $vr2, $a1, 0 - vreplvei.d $vr0, $vr4, 0 vfsub.d $vr1, $vr1, $vr0 - vfsub.d $vr2, $vr2, $vr0 + lu12i.w $a1, 1 + ori $a1, $a1, 896 + add.d $a1, $sp, $a1 + vst $vr1, $a1, 0 lu12i.w $a1, 1 ori $a1, $a1, 928 add.d $a1, $sp, $a1 - vld $vr3, $a1, 0 + vld $vr1, $a1, 0 + vfsub.d $vr2, $vr2, $vr0 lu12i.w $a1, 1 - ori $a1, $a1, 896 + ori $a1, $a1, 912 add.d $a1, $sp, $a1 vst $vr2, $a1, 0 lu12i.w $a1, 1 - ori $a1, $a1, 912 + ori $a1, $a1, 944 + add.d $a1, $sp, $a1 + vld $vr2, $a1, 0 + vfsub.d $vr1, $vr1, $vr0 + lu12i.w $a1, 1 + ori $a1, $a1, 928 add.d $a1, $sp, $a1 vst $vr1, $a1, 0 lu12i.w $a1, 1 - ori $a1, $a1, 944 + ori $a1, $a1, 960 add.d $a1, $sp, $a1 vld $vr1, $a1, 0 - vfsub.d $vr2, $vr3, $vr0 + vfsub.d $vr2, $vr2, $vr0 lu12i.w $a1, 1 - ori $a1, $a1, 928 + ori $a1, $a1, 944 add.d $a1, $sp, $a1 vst $vr2, $a1, 0 lu12i.w $a1, 1 ori $a1, $a1, 976 add.d $a1, $sp, $a1 vld $vr2, $a1, 0 - lu12i.w $a1, 1 - ori $a1, $a1, 960 - add.d $a1, $sp, $a1 - vld $vr3, $a1, 0 vfsub.d $vr1, $vr1, $vr0 lu12i.w $a1, 1 - ori $a1, $a1, 944 + ori $a1, $a1, 960 add.d $a1, $sp, $a1 vst $vr1, $a1, 0 - vfsub.d $vr1, $vr2, $vr0 - vfsub.d $vr2, $vr3, $vr0 lu12i.w $a1, 1 ori $a1, $a1, 992 add.d $a1, $sp, $a1 - vld $vr3, $a1, 0 + vld $vr1, $a1, 0 + vfsub.d $vr2, $vr2, $vr0 lu12i.w $a1, 1 - ori $a1, $a1, 960 + ori $a1, $a1, 976 add.d $a1, $sp, $a1 vst $vr2, $a1, 0 lu12i.w $a1, 1 - ori $a1, $a1, 976 + ori $a1, $a1, 1008 + add.d $a1, $sp, $a1 + vld $vr2, $a1, 0 + vfsub.d $vr1, $vr1, $vr0 + lu12i.w $a1, 1 + ori $a1, $a1, 992 add.d $a1, $sp, $a1 vst $vr1, $a1, 0 lu12i.w $a1, 1 - ori $a1, $a1, 1008 + ori $a1, $a1, 1024 add.d $a1, $sp, $a1 vld $vr1, $a1, 0 - vfsub.d $vr2, $vr3, $vr0 + vfsub.d $vr2, $vr2, $vr0 lu12i.w $a1, 1 - ori $a1, $a1, 992 + ori $a1, $a1, 1008 add.d $a1, $sp, $a1 vst $vr2, $a1, 0 lu12i.w $a1, 1 ori $a1, $a1, 1040 add.d $a1, $sp, $a1 vld $vr2, $a1, 0 - lu12i.w $a1, 1 - ori $a1, $a1, 1024 - add.d $a1, $sp, $a1 - vld $vr3, $a1, 0 vfsub.d $vr1, $vr1, $vr0 lu12i.w $a1, 1 - ori $a1, $a1, 1008 + ori $a1, $a1, 1024 add.d $a1, $sp, $a1 vst $vr1, $a1, 0 - vfsub.d $vr1, $vr2, $vr0 - vfsub.d $vr2, $vr3, $vr0 lu12i.w $a1, 1 ori $a1, $a1, 1056 add.d $a1, $sp, $a1 - vld $vr3, $a1, 0 + vld $vr1, $a1, 0 + vfsub.d $vr2, $vr2, $vr0 lu12i.w $a1, 1 - ori $a1, $a1, 1024 + ori $a1, $a1, 1040 add.d $a1, $sp, $a1 vst $vr2, $a1, 0 lu12i.w $a1, 1 - ori $a1, $a1, 1040 + ori $a1, $a1, 1072 + add.d $a1, $sp, $a1 + vld $vr2, $a1, 0 + vfsub.d $vr1, $vr1, $vr0 + lu12i.w $a1, 1 + ori $a1, $a1, 1056 add.d $a1, $sp, $a1 vst $vr1, $a1, 0 lu12i.w $a1, 1 - ori $a1, $a1, 1072 + ori $a1, $a1, 1088 add.d $a1, $sp, $a1 vld $vr1, $a1, 0 - vfsub.d $vr2, $vr3, $vr0 + vfsub.d $vr2, $vr2, $vr0 lu12i.w $a1, 1 - ori $a1, $a1, 1056 + ori $a1, $a1, 1072 add.d $a1, $sp, $a1 vst $vr2, $a1, 0 lu12i.w $a1, 1 ori $a1, $a1, 1104 add.d $a1, $sp, $a1 vld $vr2, $a1, 0 - lu12i.w $a1, 1 - ori $a1, $a1, 1088 - add.d $a1, $sp, $a1 - vld $vr3, $a1, 0 vfsub.d $vr1, $vr1, $vr0 lu12i.w $a1, 1 - ori $a1, $a1, 1072 + ori $a1, $a1, 1088 add.d $a1, $sp, $a1 vst $vr1, $a1, 0 - vfsub.d $vr1, $vr2, $vr0 - vfsub.d $vr2, $vr3, $vr0 lu12i.w $a1, 1 ori $a1, $a1, 1120 add.d $a1, $sp, $a1 - vld $vr3, $a1, 0 - lu12i.w $a1, 1 - ori $a1, $a1, 1088 - add.d $a1, $sp, $a1 - vst $vr2, $a1, 0 + vld $vr1, $a1, 0 + vfsub.d $vr2, $vr2, $vr0 lu12i.w $a1, 1 ori $a1, $a1, 1104 add.d $a1, $sp, $a1 - vst $vr1, $a1, 0 + vst $vr2, $a1, 0 lu12i.w $a1, 1 ori $a1, $a1, 1136 add.d $a1, $sp, $a1 - vld $vr1, $a1, 0 - vfsub.d $vr2, $vr3, $vr0 + vld $vr2, $a1, 0 + vfsub.d $vr1, $vr1, $vr0 lu12i.w $a1, 1 ori $a1, $a1, 1120 add.d $a1, $sp, $a1 - vst $vr2, $a1, 0 + vst $vr1, $a1, 0 lu12i.w $a1, 1 - ori $a1, $a1, 1168 + ori $a1, $a1, 1152 add.d $a1, $sp, $a1 - vld $vr2, $a1, 0 + vld $vr1, $a1, 0 + vfsub.d $vr2, $vr2, $vr0 lu12i.w $a1, 1 - ori $a1, $a1, 1152 + ori $a1, $a1, 1168 add.d $a1, $sp, $a1 vld $vr3, $a1, 0 - vfsub.d $vr1, $vr1, $vr0 lu12i.w $a1, 1 ori $a1, $a1, 1136 add.d $a1, $sp, $a1 - vst $vr1, $a1, 0 - vfsub.d $vr1, $vr2, $vr0 - vfsub.d $vr0, $vr3, $vr0 + vst $vr2, $a1, 0 + vfsub.d $vr1, $vr1, $vr0 lu12i.w $a1, 1 ori $a1, $a1, 1152 add.d $a1, $sp, $a1 - vst $vr0, $a1, 0 + vst $vr1, $a1, 0 + vfsub.d $vr0, $vr3, $vr0 lu12i.w $a1, 1 ori $a1, $a1, 1168 add.d $a1, $sp, $a1 - vst $vr1, $a1, 0 + vst $vr0, $a1, 0 st.w $zero, $s1, 68 ld.d $a1, $sp, 120 # 8-byte Folded Reload add.d $a0, $a1, $a0 @@ -1193,93 +1193,93 @@ VBR_iteration_loop_new: # @VBR_iteration_loop_new ori $a1, $a1, 720 add.d $a1, $sp, $a1 vld $vr0, $a1, 0 + vreplvei.d $vr1, $vr4, 0 lu12i.w $a1, 1 ori $a1, $a1, 736 add.d $a1, $sp, $a1 - vld $vr1, $a1, 0 - vreplvei.d $vr2, $vr4, 0 - vfsub.d $vr0, $vr0, $vr2 + vld $vr2, $a1, 0 + vfsub.d $vr0, $vr0, $vr1 lu12i.w $a1, 1 ori $a1, $a1, 720 add.d $a1, $sp, $a1 vst $vr0, $a1, 0 lu12i.w $a1, 1 - ori $a1, $a1, 768 - add.d $a1, $sp, $a1 - vld $vr0, $a1, 0 - lu12i.w $a1, 1 ori $a1, $a1, 752 add.d $a1, $sp, $a1 - vld $vr3, $a1, 0 - vfsub.d $vr1, $vr1, $vr2 + vld $vr0, $a1, 0 + vfsub.d $vr2, $vr2, $vr1 lu12i.w $a1, 1 ori $a1, $a1, 736 add.d $a1, $sp, $a1 - vst $vr1, $a1, 0 - vfsub.d $vr0, $vr0, $vr2 - vfsub.d $vr1, $vr3, $vr2 + vst $vr2, $a1, 0 lu12i.w $a1, 1 - ori $a1, $a1, 784 + ori $a1, $a1, 768 add.d $a1, $sp, $a1 - vld $vr3, $a1, 0 + vld $vr2, $a1, 0 + vfsub.d $vr0, $vr0, $vr1 lu12i.w $a1, 1 ori $a1, $a1, 752 add.d $a1, $sp, $a1 - vst $vr1, $a1, 0 + vst $vr0, $a1, 0 + lu12i.w $a1, 1 + ori $a1, $a1, 784 + add.d $a1, $sp, $a1 + vld $vr0, $a1, 0 + vfsub.d $vr2, $vr2, $vr1 lu12i.w $a1, 1 ori $a1, $a1, 768 add.d $a1, $sp, $a1 - vst $vr0, $a1, 0 + vst $vr2, $a1, 0 lu12i.w $a1, 1 ori $a1, $a1, 800 add.d $a1, $sp, $a1 - vld $vr0, $a1, 0 - vfsub.d $vr1, $vr3, $vr2 + vld $vr2, $a1, 0 + vfsub.d $vr0, $vr0, $vr1 lu12i.w $a1, 1 ori $a1, $a1, 784 add.d $a1, $sp, $a1 - vst $vr1, $a1, 0 - lu12i.w $a1, 1 - ori $a1, $a1, 832 - add.d $a1, $sp, $a1 - vld $vr1, $a1, 0 + vst $vr0, $a1, 0 lu12i.w $a1, 1 ori $a1, $a1, 816 add.d $a1, $sp, $a1 - vld $vr3, $a1, 0 - vfsub.d $vr0, $vr0, $vr2 + vld $vr0, $a1, 0 + vfsub.d $vr2, $vr2, $vr1 lu12i.w $a1, 1 ori $a1, $a1, 800 add.d $a1, $sp, $a1 - vst $vr0, $a1, 0 - vfsub.d $vr0, $vr1, $vr2 - vfsub.d $vr1, $vr3, $vr2 + vst $vr2, $a1, 0 lu12i.w $a1, 1 - ori $a1, $a1, 864 + ori $a1, $a1, 832 add.d $a1, $sp, $a1 - vld $vr3, $a1, 0 + vld $vr2, $a1, 0 + vfsub.d $vr0, $vr0, $vr1 lu12i.w $a1, 1 ori $a1, $a1, 816 add.d $a1, $sp, $a1 - vst $vr1, $a1, 0 + vst $vr0, $a1, 0 lu12i.w $a1, 1 ori $a1, $a1, 848 add.d $a1, $sp, $a1 - vld $vr1, $a1, 0 + vld $vr0, $a1, 0 + vfsub.d $vr2, $vr2, $vr1 lu12i.w $a1, 1 ori $a1, $a1, 832 add.d $a1, $sp, $a1 - vst $vr0, $a1, 0 - vfsub.d $vr0, $vr3, $vr2 + vst $vr2, $a1, 0 + lu12i.w $a1, 1 + ori $a1, $a1, 864 + add.d $a1, $sp, $a1 + vld $vr2, $a1, 0 + vfsub.d $vr0, $vr0, $vr1 lu12i.w $a1, 1 ori $a1, $a1, 880 add.d $a1, $sp, $a1 fld.d $fa3, $a1, 0 - vfsub.d $vr1, $vr1, $vr2 lu12i.w $a1, 1 ori $a1, $a1, 848 add.d $a1, $sp, $a1 - vst $vr1, $a1, 0 + vst $vr0, $a1, 0 + vfsub.d $vr0, $vr2, $vr1 lu12i.w $a1, 1 ori $a1, $a1, 864 add.d $a1, $sp, $a1 diff --git a/results/MultiSource/Benchmarks/MiBench/telecomm-gsm/CMakeFiles/telecomm-gsm.dir/gsm_decode.s b/results/MultiSource/Benchmarks/MiBench/telecomm-gsm/CMakeFiles/telecomm-gsm.dir/gsm_decode.s index d8fffb79..e7fb6bfd 100644 --- a/results/MultiSource/Benchmarks/MiBench/telecomm-gsm/CMakeFiles/telecomm-gsm.dir/gsm_decode.s +++ b/results/MultiSource/Benchmarks/MiBench/telecomm-gsm/CMakeFiles/telecomm-gsm.dir/gsm_decode.s @@ -48,35 +48,36 @@ gsm_decode: # @gsm_decode st.h $a3, $sp, 136 srli.d $a3, $a2, 7 bstrins.d $a3, $a4, 5, 1 - ld.bu $a4, $a1, 8 st.h $a3, $sp, 120 - bstrpick.d $a3, $a2, 6, 4 - bstrpick.d $a5, $a2, 3, 1 - srli.d $a6, $a4, 6 - bstrins.d $a6, $a2, 2, 2 + vinsgr2vr.b $vr0, $a2, 0 + vinsgr2vr.b $vr0, $a2, 1 + ld.bu $a3, $a1, 8 + vrepli.h $vr1, 260 + vsrl.b $vr0, $vr0, $vr1 + vandi.b $vr1, $vr0, 7 + srli.d $a4, $a3, 6 + bstrins.d $a4, $a2, 2, 2 ld.bu $a2, $a1, 9 - bstrpick.d $t0, $a4, 5, 3 - ld.bu $t1, $a1, 10 - andi $a4, $a4, 7 - srli.d $t2, $a2, 5 - bstrpick.d $t3, $a2, 4, 2 - srli.d $t4, $t1, 7 - bstrins.d $t4, $a2, 2, 1 - vinsgr2vr.b $vr1, $a3, 0 - vinsgr2vr.b $vr1, $a5, 1 - vinsgr2vr.b $vr1, $a6, 2 - vinsgr2vr.b $vr1, $t0, 3 - vinsgr2vr.b $vr1, $a4, 4 - vinsgr2vr.b $vr1, $t2, 5 - vinsgr2vr.b $vr1, $t3, 6 - vinsgr2vr.b $vr1, $t4, 7 + bstrpick.d $a5, $a3, 5, 3 + ld.bu $a6, $a1, 10 + andi $a3, $a3, 7 + srli.d $t0, $a2, 5 + bstrpick.d $t1, $a2, 4, 2 + srli.d $t2, $a6, 7 + bstrins.d $t2, $a2, 2, 1 + vinsgr2vr.b $vr1, $a4, 2 + vinsgr2vr.b $vr1, $a5, 3 + vinsgr2vr.b $vr1, $a3, 4 + vinsgr2vr.b $vr1, $t0, 5 + vinsgr2vr.b $vr1, $t1, 6 + vinsgr2vr.b $vr1, $t2, 7 vrepli.b $vr0, 0 vilvl.b $vr1, $vr0, $vr1 vst $vr1, $sp, 16 - bstrpick.d $a2, $t1, 6, 4 + bstrpick.d $a2, $a6, 6, 4 st.h $a2, $sp, 32 ld.bu $a2, $a1, 12 - bstrpick.d $a3, $t1, 3, 1 + bstrpick.d $a3, $a6, 3, 1 ld.bu $a4, $a1, 13 st.h $a3, $sp, 34 srli.d $a3, $a2, 1 @@ -92,7 +93,7 @@ gsm_decode: # @gsm_decode bstrins.d $a5, $a4, 5, 1 st.h $a5, $sp, 122 srli.d $a4, $a3, 6 - bstrins.d $a4, $t1, 2, 2 + bstrins.d $a4, $a6, 2, 2 st.h $a4, $sp, 36 vinsgr2vr.b $vr1, $a3, 0 vinsgr2vr.b $vr1, $a2, 1 diff --git a/results/MultiSource/Benchmarks/MiBench/telecomm-gsm/CMakeFiles/telecomm-gsm.dir/lpc.s b/results/MultiSource/Benchmarks/MiBench/telecomm-gsm/CMakeFiles/telecomm-gsm.dir/lpc.s index f17a0b2d..9ff6ebc4 100644 --- a/results/MultiSource/Benchmarks/MiBench/telecomm-gsm/CMakeFiles/telecomm-gsm.dir/lpc.s +++ b/results/MultiSource/Benchmarks/MiBench/telecomm-gsm/CMakeFiles/telecomm-gsm.dir/lpc.s @@ -69,26 +69,26 @@ .type Gsm_LPC_Analysis,@function Gsm_LPC_Analysis: # @Gsm_LPC_Analysis # %bb.0: # %vector.ph - addi.d $sp, $sp, -160 - st.d $ra, $sp, 152 # 8-byte Folded Spill - st.d $fp, $sp, 144 # 8-byte Folded Spill - st.d $s0, $sp, 136 # 8-byte Folded Spill - st.d $s1, $sp, 128 # 8-byte Folded Spill - st.d $s2, $sp, 120 # 8-byte Folded Spill - st.d $s3, $sp, 112 # 8-byte Folded Spill - st.d $s4, $sp, 104 # 8-byte Folded Spill - st.d $s5, $sp, 96 # 8-byte Folded Spill - st.d $s6, $sp, 88 # 8-byte Folded Spill - st.d $s7, $sp, 80 # 8-byte Folded Spill - st.d $s8, $sp, 72 # 8-byte Folded Spill - fst.d $fs0, $sp, 64 # 8-byte Folded Spill + addi.d $sp, $sp, -224 + st.d $ra, $sp, 216 # 8-byte Folded Spill + st.d $fp, $sp, 208 # 8-byte Folded Spill + st.d $s0, $sp, 200 # 8-byte Folded Spill + st.d $s1, $sp, 192 # 8-byte Folded Spill + st.d $s2, $sp, 184 # 8-byte Folded Spill + st.d $s3, $sp, 176 # 8-byte Folded Spill + st.d $s4, $sp, 168 # 8-byte Folded Spill + st.d $s5, $sp, 160 # 8-byte Folded Spill + st.d $s6, $sp, 152 # 8-byte Folded Spill + st.d $s7, $sp, 144 # 8-byte Folded Spill + st.d $s8, $sp, 136 # 8-byte Folded Spill + fst.d $fs0, $sp, 128 # 8-byte Folded Spill move $s0, $a1 vld $vr2, $a1, 0 vld $vr3, $a1, 16 vslti.h $vr4, $vr2, 0 vslti.h $vr5, $vr3, 0 - lu12i.w $a0, 8 - vreplgr2vr.h $vr0, $a0 + lu12i.w $s2, 8 + vreplgr2vr.h $vr0, $s2 vseq.h $vr6, $vr2, $vr0 vseq.h $vr7, $vr3, $vr0 vneg.h $vr8, $vr2 @@ -235,7 +235,7 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vmax.hu $vr0, $vr1, $vr0 vpickve2gr.h $a0, $vr0, 0 bstrpick.d $a0, $a0, 15, 0 - move $s2, $a2 + move $s8, $a2 beqz $a0, .LBB0_6 # %bb.1: slli.d $a0, $a0, 16 @@ -769,7 +769,7 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vst $vr18, $s0, 288 vst $vr16, $s0, 304 .LBB0_18: # %Autocorrelation.exit - st.d $s2, $sp, 16 # 8-byte Folded Spill + st.d $s8, $sp, 80 # 8-byte Folded Spill beqz $s1, .LBB0_21 # %bb.19: # %.preheader69.preheader.i vadd.d $vr0, $vr7, $vr0 @@ -788,76 +788,66 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vhaddw.q.d $vr5, $vr5, $vr5 vhaddw.q.d $vr6, $vr6, $vr6 vhaddw.q.d $vr7, $vr7, $vr7 - vpickve2gr.d $a0, $vr0, 0 - vpickve2gr.d $a1, $vr1, 0 - vpickve2gr.d $a2, $vr2, 0 - vpickve2gr.d $a3, $vr3, 0 - vpickve2gr.d $a4, $vr4, 0 - vpickve2gr.d $a5, $vr5, 0 - vpickve2gr.d $a6, $vr6, 0 - vpickve2gr.d $a7, $vr7, 0 - slli.d $fp, $a7, 1 - slli.d $s0, $a6, 1 - slli.d $s2, $a5, 1 - slli.d $s3, $a4, 1 - slli.d $s6, $a3, 1 - slli.d $s7, $a2, 1 - slli.d $s8, $a1, 1 - slli.d $s5, $a0, 1 + vextrins.d $vr0, $vr1, 16 + vextrins.d $vr2, $vr3, 16 + vextrins.d $vr4, $vr5, 16 + vextrins.d $vr6, $vr7, 16 + vslli.d $vr1, $vr6, 1 + vst $vr1, $sp, 48 # 16-byte Folded Spill + vslli.d $vr1, $vr4, 1 + vst $vr1, $sp, 64 # 16-byte Folded Spill + vslli.d $vr1, $vr2, 1 + vst $vr1, $sp, 16 # 16-byte Folded Spill + vslli.d $vr0, $vr0, 1 + vst $vr0, $sp, 32 # 16-byte Folded Spill move $a0, $s1 pcaddu18i $ra, %call36(gsm_norm) jirl $ra, $ra, 0 - sll.d $a1, $s1, $a0 - srli.d $a1, $a1, 16 + bstrpick.d $a2, $a0, 31, 0 + sll.d $a0, $s1, $a0 + srli.d $a1, $a0, 16 ext.w.h $a3, $a1 - sll.d $a2, $s5, $a0 - srli.d $s5, $a2, 16 - bstrpick.d $a2, $a2, 31, 16 - ext.w.h $a4, $s5 - sll.d $a5, $s8, $a0 - srli.d $a5, $a5, 16 - sll.d $a6, $s7, $a0 - srli.d $a6, $a6, 16 - sll.d $a7, $s6, $a0 - srli.d $a7, $a7, 16 - sll.d $t0, $s3, $a0 - srli.d $t0, $t0, 16 - sll.d $t1, $s2, $a0 - ld.d $t3, $sp, 16 # 8-byte Folded Reload - srli.d $t1, $t1, 16 - sll.d $t2, $s0, $a0 - srli.d $t2, $t2, 16 - sll.d $a0, $fp, $a0 - srli.d $a0, $a0, 16 - st.h $s5, $sp, 30 - st.h $a5, $sp, 32 - st.h $a6, $sp, 34 - st.h $a7, $sp, 36 - st.h $t0, $sp, 38 - st.h $t1, $sp, 40 - st.h $t2, $sp, 42 - st.h $s5, $sp, 48 - st.h $a5, $sp, 50 - st.h $a6, $sp, 52 - st.h $a7, $sp, 54 - st.h $t0, $sp, 56 - st.h $t1, $sp, 58 - st.h $t2, $sp, 60 - st.h $a0, $sp, 62 - slti $a0, $a4, 0 - lu12i.w $a4, 8 - xor $a2, $a2, $a4 + vreplgr2vr.d $vr0, $a2 + vld $vr1, $sp, 16 # 16-byte Folded Reload + vsll.d $vr1, $vr1, $vr0 + vld $vr2, $sp, 32 # 16-byte Folded Reload + vsll.d $vr2, $vr2, $vr0 + vld $vr3, $sp, 48 # 16-byte Folded Reload + vsll.d $vr3, $vr3, $vr0 + vld $vr4, $sp, 64 # 16-byte Folded Reload + vsll.d $vr0, $vr4, $vr0 + vsrli.d $vr0, $vr0, 16 + vsrli.d $vr3, $vr3, 16 + vsrli.d $vr2, $vr2, 16 + vsrli.d $vr1, $vr1, 16 + vpickev.w $vr1, $vr1, $vr2 + vpickev.w $vr0, $vr3, $vr0 + vpickev.h $vr2, $vr0, $vr1 + vpickve2gr.h $a0, $vr1, 0 + bstrpick.d $a2, $a0, 15, 0 + ext.w.h $fp, $a0 + vstelm.h $vr1, $sp, 94, 0 + vstelm.h $vr1, $sp, 96, 2 + vstelm.h $vr1, $sp, 98, 4 + vstelm.h $vr1, $sp, 100, 6 + vstelm.h $vr0, $sp, 102, 0 + vstelm.h $vr0, $sp, 104, 2 + vstelm.h $vr0, $sp, 106, 4 + vst $vr2, $sp, 112 + slti $a4, $fp, 0 + move $s3, $s2 + xor $a2, $a2, $s2 sltui $a2, $a2, 1 - sub.d $a4, $zero, $s5 - masknez $a4, $a4, $a2 + sub.d $a5, $zero, $a0 + masknez $a5, $a5, $a2 maskeqz $a2, $s4, $a2 - or $a2, $a2, $a4 - masknez $a4, $s5, $a0 - maskeqz $a0, $a2, $a0 - or $a0, $a0, $a4 + or $a2, $a2, $a5 + masknez $a0, $a0, $a4 + maskeqz $a2, $a2, $a4 + or $a0, $a2, $a0 ext.w.h $a0, $a0 ori $a2, $zero, 16 - move $s0, $t3 + move $s0, $s8 bge $a3, $a0, .LBB0_26 .LBB0_20: # %.lr.ph82.preheader.i move $a0, $s0 @@ -866,15 +856,15 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis jirl $ra, $ra, 0 b .LBB0_22 .LBB0_21: # %.preheader.preheader.i + move $s3, $s2 vrepli.b $vr0, 0 - vst $vr0, $s2, 0 + vst $vr0, $s8, 0 .LBB0_22: # %Reflection_coefficients.exit - ld.d $s2, $sp, 16 # 8-byte Folded Reload + ld.d $s2, $sp, 80 # 8-byte Folded Reload ld.hu $a1, $s2, 0 ext.w.h $a0, $a1 slti $a0, $a0, 0 - lu12i.w $fp, 8 - xor $a2, $a1, $fp + xor $a2, $a1, $s3 sltui $a2, $a2, 1 sub.d $a3, $zero, $a1 masknez $a3, $a3, $a2 @@ -902,13 +892,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis add.d $a2, $a1, $a2 b .LBB0_34 .LBB0_26: # %.lr.ph.preheader - move $s6, $zero - ori $fp, $zero, 8 - ori $s7, $zero, 0 - lu32i.d $s7, 32768 - lu12i.w $s3, -8 - ori $s8, $zero, 7 - move $s0, $t3 + move $s5, $zero + ori $s6, $zero, 0 + lu32i.d $s6, 32768 + lu12i.w $s2, -8 + ori $s7, $zero, 7 + move $s0, $s8 ori $a2, $zero, 1 b .LBB0_28 .p2align 4, , 16 @@ -916,97 +905,97 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis # in Loop: Header=BB0_28 Depth=1 mul.d $a0, $a1, $a0 slli.d $a0, $a0, 33 - add.d $a0, $a0, $s7 + add.d $a0, $a0, $s6 srai.d $a0, $a0, 48 add.d $a0, $a0, $s1 slt $a1, $a0, $s4 maskeqz $a0, $a0, $a1 masknez $a1, $s4, $a1 or $a0, $a0, $a1 - slt $a1, $s3, $a0 + slt $a1, $s2, $a0 maskeqz $a0, $a0, $a1 - masknez $a1, $s3, $a1 + masknez $a1, $s2, $a1 or $a1, $a0, $a1 - addi.d $a2, $s2, 1 + addi.d $a2, $s8, 1 addi.d $s0, $s0, 2 - addi.d $s6, $s6, 1 - ext.w.h $a0, $s5 + addi.d $s5, $s5, 1 + ext.w.h $a0, $fp slti $a0, $a0, 0 - bstrpick.d $a3, $s5, 15, 0 - lu12i.w $a4, 8 - xor $a3, $a3, $a4 + bstrpick.d $a3, $fp, 15, 0 + xor $a3, $a3, $s3 sltui $a3, $a3, 1 - sub.d $a4, $zero, $s5 + sub.d $a4, $zero, $fp masknez $a4, $a4, $a3 maskeqz $a3, $s4, $a3 or $a3, $a3, $a4 maskeqz $a3, $a3, $a0 - masknez $a0, $s5, $a0 + masknez $a0, $fp, $a0 or $a0, $a3, $a0 ext.w.h $a0, $a0 ext.w.h $a3, $a1 - addi.d $s8, $s8, -1 + addi.d $s7, $s7, -1 blt $a3, $a0, .LBB0_70 .LBB0_28: # %.lr.ph # =>This Loop Header: Depth=1 # Child Loop BB0_31 Depth 2 - move $s2, $a2 + move $s8, $a2 ext.w.h $a0, $a0 ext.w.h $s1, $a1 move $a1, $s1 pcaddu18i $ra, %call36(gsm_div) jirl $ra, $ra, 0 - ext.w.h $a1, $s5 + ext.w.h $a1, $fp slt $a2, $zero, $a1 sub.d $a3, $zero, $a0 masknez $a0, $a0, $a2 maskeqz $a2, $a3, $a2 or $a0, $a2, $a0 st.h $a0, $s0, 0 - beq $s2, $fp, .LBB0_22 + ori $a2, $zero, 8 + beq $s8, $a2, .LBB0_22 # %bb.29: # in Loop: Header=BB0_28 Depth=1 ext.w.h $a0, $a0 ori $a2, $zero, 7 - bltu $a2, $s2, .LBB0_27 + bltu $a2, $s8, .LBB0_27 # %bb.30: # %.lr.ph.i # in Loop: Header=BB0_28 Depth=1 ori $a3, $zero, 1 - sltu $a2, $a3, $s8 + sltu $a2, $a3, $s7 masknez $a3, $a3, $a2 - maskeqz $a2, $s8, $a2 + maskeqz $a2, $s7, $a2 or $a2, $a2, $a3 slli.d $a3, $a0, 33 - addi.d $a4, $sp, 30 - addi.d $a5, $sp, 50 + addi.d $a4, $sp, 94 + addi.d $a5, $sp, 114 .p2align 4, , 16 .LBB0_31: # Parent Loop BB0_28 Depth=1 # => This Inner Loop Header: Depth=2 ld.h $a6, $a4, 0 ld.h $a7, $a5, 0 mul.d $t0, $a3, $a6 - add.d $t0, $t0, $s7 + add.d $t0, $t0, $s6 srai.d $t0, $t0, 48 add.d $t0, $t0, $a7 slt $t1, $t0, $s4 maskeqz $t0, $t0, $t1 masknez $t1, $s4, $t1 or $t0, $t0, $t1 - slt $t1, $s3, $t0 + slt $t1, $s2, $t0 maskeqz $t0, $t0, $t1 - masknez $t1, $s3, $t1 + masknez $t1, $s2, $t1 or $t0, $t0, $t1 st.h $t0, $a5, -2 mul.d $a7, $a3, $a7 - add.d $a7, $a7, $s7 + add.d $a7, $a7, $s6 srai.d $a7, $a7, 48 add.d $a6, $a7, $a6 slt $a7, $a6, $s4 maskeqz $a6, $a6, $a7 masknez $a7, $s4, $a7 or $a6, $a6, $a7 - slt $a7, $s3, $a6 + slt $a7, $s2, $a6 maskeqz $a6, $a6, $a7 - masknez $a7, $s3, $a7 + masknez $a7, $s2, $a7 or $a6, $a6, $a7 st.h $a6, $a4, 0 addi.d $a2, $a2, -1 @@ -1015,7 +1004,7 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis bnez $a2, .LBB0_31 # %bb.32: # %._crit_edge.i.loopexit # in Loop: Header=BB0_28 Depth=1 - ld.hu $s5, $sp, 48 + ld.hu $fp, $sp, 112 b .LBB0_27 .LBB0_33: lu12i.w $a2, 6 @@ -1025,7 +1014,7 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis ld.hu $a4, $s2, 2 ext.w.h $a1, $a4 slti $a1, $a1, 0 - xor $a5, $a4, $fp + xor $a5, $a4, $s3 sltui $a5, $a5, 1 sub.d $a6, $zero, $a4 masknez $a6, $a6, $a5 @@ -1057,7 +1046,7 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis ld.hu $a6, $s2, 4 ext.w.h $a4, $a6 slti $a4, $a4, 0 - xor $a7, $a6, $fp + xor $a7, $a6, $s3 sltui $a7, $a7, 1 sub.d $t0, $zero, $a6 masknez $t0, $t0, $a7 @@ -1088,7 +1077,7 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis ld.hu $t0, $s2, 6 ext.w.h $a7, $t0 slti $a7, $a7, 0 - xor $t1, $t0, $fp + xor $t1, $t0, $s3 sltui $t1, $t1, 1 sub.d $t2, $zero, $t0 masknez $t2, $t2, $t1 @@ -1119,7 +1108,7 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis ld.hu $t2, $s2, 8 ext.w.h $t0, $t2 slti $t0, $t0, 0 - xor $t3, $t2, $fp + xor $t3, $t2, $s3 sltui $t3, $t3, 1 sub.d $t4, $zero, $t2 masknez $t4, $t4, $t3 @@ -1150,7 +1139,7 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis ld.hu $t4, $s2, 10 ext.w.h $t2, $t4 slti $t2, $t2, 0 - xor $t5, $t4, $fp + xor $t5, $t4, $s3 sltui $t5, $t5, 1 sub.d $t6, $zero, $t4 masknez $t6, $t6, $t5 @@ -1181,7 +1170,7 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis ld.hu $t6, $s2, 12 ext.w.h $t4, $t6 slti $t4, $t4, 0 - xor $t7, $t6, $fp + xor $t7, $t6, $s3 sltui $t7, $t7, 1 sub.d $t8, $zero, $t6 masknez $t8, $t8, $t7 @@ -1212,7 +1201,7 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis ld.hu $t8, $s2, 14 ext.w.h $t6, $t8 slti $t6, $t6, 0 - xor $fp, $t8, $fp + xor $fp, $t8, $s3 sltui $fp, $fp, 1 sub.d $s0, $zero, $t8 masknez $s0, $s0, $fp @@ -1335,25 +1324,25 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vadd.h $vr0, $vr0, $vr4 vbitsel.v $vr0, $vr0, $vr5, $vr3 vst $vr0, $s2, 0 - fld.d $fs0, $sp, 64 # 8-byte Folded Reload - ld.d $s8, $sp, 72 # 8-byte Folded Reload - ld.d $s7, $sp, 80 # 8-byte Folded Reload - ld.d $s6, $sp, 88 # 8-byte Folded Reload - ld.d $s5, $sp, 96 # 8-byte Folded Reload - ld.d $s4, $sp, 104 # 8-byte Folded Reload - ld.d $s3, $sp, 112 # 8-byte Folded Reload - ld.d $s2, $sp, 120 # 8-byte Folded Reload - ld.d $s1, $sp, 128 # 8-byte Folded Reload - ld.d $s0, $sp, 136 # 8-byte Folded Reload - ld.d $fp, $sp, 144 # 8-byte Folded Reload - ld.d $ra, $sp, 152 # 8-byte Folded Reload - addi.d $sp, $sp, 160 + fld.d $fs0, $sp, 128 # 8-byte Folded Reload + ld.d $s8, $sp, 136 # 8-byte Folded Reload + ld.d $s7, $sp, 144 # 8-byte Folded Reload + ld.d $s6, $sp, 152 # 8-byte Folded Reload + ld.d $s5, $sp, 160 # 8-byte Folded Reload + ld.d $s4, $sp, 168 # 8-byte Folded Reload + ld.d $s3, $sp, 176 # 8-byte Folded Reload + ld.d $s2, $sp, 184 # 8-byte Folded Reload + ld.d $s1, $sp, 192 # 8-byte Folded Reload + ld.d $s0, $sp, 200 # 8-byte Folded Reload + ld.d $fp, $sp, 208 # 8-byte Folded Reload + ld.d $ra, $sp, 216 # 8-byte Folded Reload + addi.d $sp, $sp, 224 ret .LBB0_70: # %.preheader64.i ori $a0, $zero, 7 - bltu $a0, $s2, .LBB0_22 + bltu $a0, $s8, .LBB0_22 # %bb.71: - slli.d $a0, $s6, 1 + slli.d $a0, $s5, 1 ori $a1, $zero, 16 sub.d $a2, $a1, $a0 b .LBB0_20 diff --git a/results/MultiSource/Benchmarks/MiBench/telecomm-gsm/CMakeFiles/telecomm-gsm.dir/short_term.s b/results/MultiSource/Benchmarks/MiBench/telecomm-gsm/CMakeFiles/telecomm-gsm.dir/short_term.s index 0250c0e1..8dc262de 100644 --- a/results/MultiSource/Benchmarks/MiBench/telecomm-gsm/CMakeFiles/telecomm-gsm.dir/short_term.s +++ b/results/MultiSource/Benchmarks/MiBench/telecomm-gsm/CMakeFiles/telecomm-gsm.dir/short_term.s @@ -1,19 +1,31 @@ .file "short_term.c" + .section .rodata.cst16,"aM",@progbits,16 + .p2align 4, 0x0 # -- Begin function Gsm_Short_Term_Analysis_Filter +.LCPI0_0: + .half 0 # 0x0 + .half 4 # 0x4 + .half 65535 # 0xffff + .half 65535 # 0xffff + .half 65535 # 0xffff + .half 65535 # 0xffff + .half 65535 # 0xffff + .half 65535 # 0xffff .text - .globl Gsm_Short_Term_Analysis_Filter # -- Begin function Gsm_Short_Term_Analysis_Filter + .globl Gsm_Short_Term_Analysis_Filter .p2align 5 .type Gsm_Short_Term_Analysis_Filter,@function Gsm_Short_Term_Analysis_Filter: # @Gsm_Short_Term_Analysis_Filter # %bb.0: # %vector.ph - addi.d $sp, $sp, -96 - st.d $ra, $sp, 88 # 8-byte Folded Spill - st.d $fp, $sp, 80 # 8-byte Folded Spill - st.d $s0, $sp, 72 # 8-byte Folded Spill - st.d $s1, $sp, 64 # 8-byte Folded Spill - st.d $s2, $sp, 56 # 8-byte Folded Spill - st.d $s3, $sp, 48 # 8-byte Folded Spill - st.d $s4, $sp, 40 # 8-byte Folded Spill - st.d $s5, $sp, 32 # 8-byte Folded Spill + addi.d $sp, $sp, -112 + st.d $ra, $sp, 104 # 8-byte Folded Spill + st.d $fp, $sp, 96 # 8-byte Folded Spill + st.d $s0, $sp, 88 # 8-byte Folded Spill + st.d $s1, $sp, 80 # 8-byte Folded Spill + st.d $s2, $sp, 72 # 8-byte Folded Spill + st.d $s3, $sp, 64 # 8-byte Folded Spill + st.d $s4, $sp, 56 # 8-byte Folded Spill + st.d $s5, $sp, 48 # 8-byte Folded Spill + fst.d $fs0, $sp, 40 # 8-byte Folded Spill move $s1, $a0 ld.h $a3, $a0, 628 move $fp, $a2 @@ -40,409 +52,391 @@ Gsm_Short_Term_Analysis_Filter: # @Gsm_Short_Term_Analysis_Filter vreplgr2vr.w $vr3, $a0 vsra.h $vr0, $vr0, $vr3 vadd.h $vr0, $vr0, $vr2 - vadd.h $vr9, $vr0, $vr1 - vslti.h $vr10, $vr9, 0 + vadd.h $vr0, $vr0, $vr1 + vslti.h $vr10, $vr0, 0 lu12i.w $a0, 2 ori $a1, $a0, 2867 - vreplgr2vr.h $vr1, $a1 - vslt.hu $vr11, $vr9, $vr1 - vsrli.h $vr0, $vr9, 2 + vreplgr2vr.h $vr2, $a1 + vslt.hu $vr11, $vr0, $vr2 + vsrli.h $vr1, $vr0, 2 lu12i.w $a1, 6 ori $a2, $a1, 1536 - vreplgr2vr.h $vr3, $a2 - vadd.h $vr12, $vr0, $vr3 + vreplgr2vr.h $vr4, $a2 + vadd.h $vr12, $vr1, $vr4 lu12i.w $a2, 13 ori $a2, $a2, 1229 - vreplgr2vr.h $vr0, $a2 - vadd.h $vr2, $vr9, $vr0 + vreplgr2vr.h $vr1, $a2 + vadd.h $vr3, $vr0, $vr1 ori $a0, $a0, 819 - vreplgr2vr.h $vr5, $a0 - vslt.hu $vr13, $vr2, $vr5 - vadd.h $vr14, $vr9, $vr1 - vslli.h $vr15, $vr9, 1 - lu12i.w $a2, 8 - vreplgr2vr.h $vr6, $a2 - vseq.h $vr4, $vr9, $vr6 - vneg.h $vr7, $vr9 - lu12i.w $a0, 7 - ori $a0, $a0, 4095 - vreplgr2vr.h $vr2, $a0 - vbitsel.v $vr16, $vr7, $vr2, $vr4 - vslt.hu $vr17, $vr16, $vr1 - lu12i.w $a3, 4 - ori $a3, $a3, 3686 - vreplgr2vr.h $vr7, $a3 - vslt.hu $vr18, $vr16, $vr7 + vreplgr2vr.h $vr6, $a0 + vslt.hu $vr13, $vr3, $vr6 + vadd.h $vr14, $vr0, $vr2 + vslli.h $vr15, $vr0, 1 + lu12i.w $a0, 8 + vreplgr2vr.h $vr7, $a0 + vseq.h $vr5, $vr0, $vr7 + vneg.h $vr8, $vr0 + lu12i.w $a2, 7 + ori $a2, $a2, 4095 + vreplgr2vr.h $vr3, $a2 + vbitsel.v $vr16, $vr8, $vr3, $vr5 + vslt.hu $vr17, $vr16, $vr2 + lu12i.w $a2, 4 + ori $a2, $a2, 3686 + vreplgr2vr.h $vr8, $a2 + vslt.hu $vr18, $vr16, $vr8 ori $a1, $a1, 2047 - vreplgr2vr.h $vr4, $a1 - vslt.hu $vr19, $vr4, $vr16 - vxor.v $vr8, $vr19, $vr18 - vnor.v $vr8, $vr17, $vr8 - vand.v $vr20, $vr10, $vr8 + vreplgr2vr.h $vr5, $a1 + vslt.hu $vr19, $vr5, $vr16 + vxor.v $vr9, $vr19, $vr18 + vnor.v $vr9, $vr17, $vr9 + vand.v $vr20, $vr10, $vr9 vsrli.h $vr21, $vr16, 2 lu12i.w $a1, 9 ori $a1, $a1, 2560 - vreplgr2vr.h $vr8, $a1 - vsub.h $vr21, $vr8, $vr21 + vreplgr2vr.h $vr9, $a1 + vsub.h $vr21, $vr9, $vr21 vxor.v $vr18, $vr17, $vr18 vand.v $vr18, $vr10, $vr18 - vsub.h $vr22, $vr0, $vr16 + vsub.h $vr22, $vr1, $vr16 vand.v $vr17, $vr10, $vr17 vslli.h $vr16, $vr16, 1 vneg.h $vr16, $vr16 - vslt.h $vr9, $vr4, $vr9 - vand.v $vr10, $vr10, $vr19 - vbitsel.v $vr9, $vr12, $vr2, $vr9 - vbitsel.v $vr9, $vr9, $vr14, $vr13 - vbitsel.v $vr9, $vr9, $vr15, $vr11 - vbitsel.v $vr11, $vr9, $vr21, $vr20 - ori $a1, $a2, 1 - vreplgr2vr.h $vr9, $a1 - vbitsel.v $vr10, $vr11, $vr9, $vr10 - vbitsel.v $vr10, $vr10, $vr22, $vr18 - vbitsel.v $vr10, $vr10, $vr16, $vr17 - vst $vr10, $sp, 16 - addi.d $a1, $s1, 580 - ori $a3, $zero, 13 - addi.d $a4, $sp, 16 - ori $a5, $zero, 0 - lu32i.d $a5, 32768 - lu12i.w $a2, -8 - ori $a6, $zero, 16 - move $a7, $fp + vslt.h $vr0, $vr5, $vr0 + vand.v $vr19, $vr10, $vr19 + vbitsel.v $vr0, $vr12, $vr3, $vr0 + vbitsel.v $vr0, $vr0, $vr14, $vr13 + vbitsel.v $vr0, $vr0, $vr15, $vr11 + vbitsel.v $vr0, $vr0, $vr21, $vr20 + ori $a0, $a0, 1 + vreplgr2vr.h $vr10, $a0 + vbitsel.v $vr0, $vr0, $vr10, $vr19 + vbitsel.v $vr0, $vr0, $vr22, $vr18 + vbitsel.v $vr0, $vr0, $vr16, $vr17 + vst $vr0, $sp, 16 + addi.d $a0, $s1, 580 + ori $a1, $zero, 13 + addi.d $a2, $sp, 16 + ori $a3, $zero, 0 + pcalau12i $a4, %pc_hi20(.LCPI0_0) + vld $vr0, $a4, %pc_lo12(.LCPI0_0) + lu32i.d $a3, 32768 + vreplgr2vr.d $vr11, $a3 + ori $a3, $zero, 16 + move $a4, $fp .p2align 4, , 16 .LBB0_1: # =>This Loop Header: Depth=1 # Child Loop BB0_2 Depth 2 - ld.hu $t1, $a7, 0 - move $t0, $zero - move $t2, $t1 + ld.h $a6, $a4, 0 + move $a5, $zero + vinsgr2vr.h $vr12, $a6, 0 + vinsgr2vr.h $vr12, $a6, 1 .p2align 4, , 16 .LBB0_2: # Parent Loop BB0_1 Depth=1 # => This Inner Loop Header: Depth=2 - ldx.h $t3, $t0, $a4 - ldx.h $t4, $a1, $t0 - stx.h $t1, $a1, $t0 - ext.w.h $t2, $t2 - mul.d $t1, $t2, $t3 - slli.d $t1, $t1, 33 - add.d $t1, $t1, $a5 - srai.d $t1, $t1, 48 - add.d $t1, $t1, $t4 - slt $t5, $t1, $a0 - maskeqz $t1, $t1, $t5 - masknez $t5, $a0, $t5 - or $t1, $t1, $t5 - slt $t5, $a2, $t1 - maskeqz $t1, $t1, $t5 - masknez $t5, $a2, $t5 - or $t1, $t1, $t5 - mul.d $t3, $t4, $t3 - slli.d $t3, $t3, 33 - add.d $t3, $t3, $a5 - srai.d $t3, $t3, 48 - add.d $t2, $t3, $t2 - slt $t3, $t2, $a0 - maskeqz $t2, $t2, $t3 - masknez $t3, $a0, $t3 - or $t2, $t2, $t3 - slt $t3, $a2, $t2 - maskeqz $t2, $t2, $t3 - masknez $t3, $a2, $t3 - addi.d $t0, $t0, 2 - or $t2, $t2, $t3 - bne $t0, $a6, .LBB0_2 + ldx.h $a6, $a0, $a5 + add.d $a7, $a0, $a5 + ldx.h $t0, $a5, $a2 + vstelm.h $vr12, $a7, 0, 1 + vinsgr2vr.h $vr13, $a6, 0 + vpickev.h $vr13, $vr12, $vr13 + vslli.d $vr13, $vr13, 48 + vsrai.d $vr13, $vr13, 48 + vreplgr2vr.d $vr14, $t0 + vmul.d $vr13, $vr13, $vr14 + vslli.d $vr13, $vr13, 33 + vadd.d $vr13, $vr13, $vr11 + vsrli.d $vr13, $vr13, 48 + vori.b $vr14, $vr0, 0 + vshuf.h $vr14, $vr0, $vr13 + vinsgr2vr.h $vr12, $a6, 1 + vadd.h $vr13, $vr14, $vr12 + vslt.h $vr14, $vr13, $vr14 + vslti.h $vr12, $vr12, 0 + vxor.v $vr12, $vr12, $vr14 + vsrai.h $vr14, $vr13, 15 + vbitrevi.h $vr14, $vr14, 15 + addi.d $a5, $a5, 2 + vbitsel.v $vr12, $vr13, $vr14, $vr12 + bne $a5, $a3, .LBB0_2 # %bb.3: # in Loop: Header=BB0_1 Depth=1 - addi.w $a3, $a3, -1 - st.h $t2, $a7, 0 - addi.d $a7, $a7, 2 - bnez $a3, .LBB0_1 + addi.w $a1, $a1, -1 + vstelm.h $vr12, $a4, 0, 0 + addi.d $a4, $a4, 2 + bnez $a1, .LBB0_1 # %bb.4: # %Short_term_analysis_filtering.exit - vld $vr10, $s2, 0 - vld $vr11, $s0, 0 - vsrai.h $vr10, $vr10, 1 + vld $vr11, $s2, 0 + vld $vr12, $s0, 0 vsrai.h $vr11, $vr11, 1 - vadd.h $vr10, $vr11, $vr10 - vslti.h $vr11, $vr10, 0 - vslt.hu $vr12, $vr10, $vr1 - vsrli.h $vr13, $vr10, 2 - vadd.h $vr13, $vr13, $vr3 - vadd.h $vr14, $vr10, $vr0 - vslt.hu $vr14, $vr14, $vr5 - vadd.h $vr15, $vr10, $vr1 - vslli.h $vr16, $vr10, 1 - vseq.h $vr17, $vr10, $vr6 - vneg.h $vr18, $vr10 - vbitsel.v $vr17, $vr18, $vr2, $vr17 - vslt.hu $vr18, $vr17, $vr1 - vslt.hu $vr19, $vr17, $vr7 - vslt.hu $vr20, $vr4, $vr17 - vxor.v $vr21, $vr20, $vr19 - vnor.v $vr21, $vr18, $vr21 - vand.v $vr21, $vr11, $vr21 - vsrli.h $vr22, $vr17, 2 - vsub.h $vr22, $vr8, $vr22 - vxor.v $vr19, $vr18, $vr19 - vand.v $vr19, $vr11, $vr19 - vsub.h $vr23, $vr0, $vr17 - vand.v $vr18, $vr11, $vr18 - vslli.h $vr17, $vr17, 1 - vneg.h $vr17, $vr17 - vslt.h $vr10, $vr4, $vr10 - vand.v $vr11, $vr11, $vr20 - vbitsel.v $vr10, $vr13, $vr2, $vr10 - vbitsel.v $vr10, $vr10, $vr15, $vr14 - vbitsel.v $vr10, $vr10, $vr16, $vr12 - vbitsel.v $vr10, $vr10, $vr22, $vr21 - vbitsel.v $vr10, $vr10, $vr9, $vr11 - vbitsel.v $vr10, $vr10, $vr23, $vr19 - vbitsel.v $vr10, $vr10, $vr17, $vr18 - vst $vr10, $sp, 16 - addi.d $a3, $fp, 26 - ori $a4, $zero, 14 - addi.d $a5, $sp, 16 - ori $a6, $zero, 0 - lu32i.d $a6, 32768 - ori $a7, $zero, 16 + vsrai.h $vr12, $vr12, 1 + vadd.h $vr11, $vr12, $vr11 + vslti.h $vr12, $vr11, 0 + vslt.hu $vr13, $vr11, $vr2 + vsrli.h $vr14, $vr11, 2 + vadd.h $vr14, $vr14, $vr4 + vadd.h $vr15, $vr11, $vr1 + vslt.hu $vr15, $vr15, $vr6 + vadd.h $vr16, $vr11, $vr2 + vslli.h $vr17, $vr11, 1 + vseq.h $vr18, $vr11, $vr7 + vneg.h $vr19, $vr11 + vbitsel.v $vr18, $vr19, $vr3, $vr18 + vslt.hu $vr19, $vr18, $vr2 + vslt.hu $vr20, $vr18, $vr8 + vslt.hu $vr21, $vr5, $vr18 + vxor.v $vr22, $vr21, $vr20 + vnor.v $vr22, $vr19, $vr22 + vand.v $vr22, $vr12, $vr22 + vsrli.h $vr23, $vr18, 2 + vsub.h $vr23, $vr9, $vr23 + vxor.v $vr20, $vr19, $vr20 + vand.v $vr20, $vr12, $vr20 + vsub.h $vr24, $vr1, $vr18 + vand.v $vr19, $vr12, $vr19 + vslli.h $vr18, $vr18, 1 + vneg.h $vr18, $vr18 + vslt.h $vr11, $vr5, $vr11 + vand.v $vr12, $vr12, $vr21 + vbitsel.v $vr11, $vr14, $vr3, $vr11 + vbitsel.v $vr11, $vr11, $vr16, $vr15 + vbitsel.v $vr11, $vr11, $vr17, $vr13 + vbitsel.v $vr11, $vr11, $vr23, $vr22 + vbitsel.v $vr11, $vr11, $vr10, $vr12 + vbitsel.v $vr11, $vr11, $vr24, $vr20 + vbitsel.v $vr11, $vr11, $vr18, $vr19 + vst $vr11, $sp, 16 + addi.d $a1, $fp, 26 + ori $a2, $zero, 14 + addi.d $a3, $sp, 16 + ori $a4, $zero, 0 + lu32i.d $a4, 32768 + vreplgr2vr.d $vr11, $a4 + ori $a4, $zero, 16 .p2align 4, , 16 .LBB0_5: # =>This Loop Header: Depth=1 # Child Loop BB0_6 Depth 2 - ld.hu $t1, $a3, 0 - move $t0, $zero - move $t2, $t1 + ld.h $a6, $a1, 0 + move $a5, $zero + vinsgr2vr.h $vr12, $a6, 0 + vinsgr2vr.h $vr12, $a6, 1 .p2align 4, , 16 .LBB0_6: # Parent Loop BB0_5 Depth=1 # => This Inner Loop Header: Depth=2 - ldx.h $t3, $t0, $a5 - ldx.h $t4, $a1, $t0 - stx.h $t1, $a1, $t0 - ext.w.h $t2, $t2 - mul.d $t1, $t2, $t3 - slli.d $t1, $t1, 33 - add.d $t1, $t1, $a6 - srai.d $t1, $t1, 48 - add.d $t1, $t1, $t4 - slt $t5, $t1, $a0 - maskeqz $t1, $t1, $t5 - masknez $t5, $a0, $t5 - or $t1, $t1, $t5 - slt $t5, $a2, $t1 - maskeqz $t1, $t1, $t5 - masknez $t5, $a2, $t5 - or $t1, $t1, $t5 - mul.d $t3, $t4, $t3 - slli.d $t3, $t3, 33 - add.d $t3, $t3, $a6 - srai.d $t3, $t3, 48 - add.d $t2, $t3, $t2 - slt $t3, $t2, $a0 - maskeqz $t2, $t2, $t3 - masknez $t3, $a0, $t3 - or $t2, $t2, $t3 - slt $t3, $a2, $t2 - maskeqz $t2, $t2, $t3 - masknez $t3, $a2, $t3 - addi.d $t0, $t0, 2 - or $t2, $t2, $t3 - bne $t0, $a7, .LBB0_6 + ldx.h $a6, $a0, $a5 + add.d $a7, $a0, $a5 + ldx.h $t0, $a5, $a3 + vstelm.h $vr12, $a7, 0, 1 + vinsgr2vr.h $vr13, $a6, 0 + vpickev.h $vr13, $vr12, $vr13 + vslli.d $vr13, $vr13, 48 + vsrai.d $vr13, $vr13, 48 + vreplgr2vr.d $vr14, $t0 + vmul.d $vr13, $vr13, $vr14 + vslli.d $vr13, $vr13, 33 + vadd.d $vr13, $vr13, $vr11 + vsrli.d $vr13, $vr13, 48 + vori.b $vr14, $vr0, 0 + vshuf.h $vr14, $vr0, $vr13 + vinsgr2vr.h $vr12, $a6, 1 + vadd.h $vr13, $vr14, $vr12 + vslt.h $vr14, $vr13, $vr14 + vslti.h $vr12, $vr12, 0 + vxor.v $vr12, $vr12, $vr14 + vsrai.h $vr14, $vr13, 15 + vbitrevi.h $vr14, $vr14, 15 + addi.d $a5, $a5, 2 + vbitsel.v $vr12, $vr13, $vr14, $vr12 + bne $a5, $a4, .LBB0_6 # %bb.7: # in Loop: Header=BB0_5 Depth=1 - addi.w $a4, $a4, -1 - st.h $t2, $a3, 0 - addi.d $a3, $a3, 2 - bnez $a4, .LBB0_5 + addi.w $a2, $a2, -1 + vstelm.h $vr12, $a1, 0, 0 + addi.d $a1, $a1, 2 + bnez $a2, .LBB0_5 # %bb.8: # %Short_term_analysis_filtering.exit49 - vld $vr10, $s2, 0 - vld $vr11, $s0, 0 - vsrai.h $vr10, $vr10, 2 - vsrai.h $vr12, $vr11, 2 - vadd.h $vr10, $vr12, $vr10 - vsrai.h $vr11, $vr11, 1 - vadd.h $vr10, $vr10, $vr11 - vslti.h $vr11, $vr10, 0 - vslt.hu $vr12, $vr10, $vr1 - vsrli.h $vr13, $vr10, 2 - vadd.h $vr13, $vr13, $vr3 - vadd.h $vr14, $vr10, $vr0 - vslt.hu $vr14, $vr14, $vr5 - vadd.h $vr15, $vr10, $vr1 - vslli.h $vr16, $vr10, 1 - vseq.h $vr17, $vr10, $vr6 - vneg.h $vr18, $vr10 - vbitsel.v $vr17, $vr18, $vr2, $vr17 - vslt.hu $vr18, $vr17, $vr1 - vslt.hu $vr19, $vr17, $vr7 - vslt.hu $vr20, $vr4, $vr17 - vxor.v $vr21, $vr20, $vr19 - vnor.v $vr21, $vr18, $vr21 - vand.v $vr21, $vr11, $vr21 - vsrli.h $vr22, $vr17, 2 - vsub.h $vr22, $vr8, $vr22 - vxor.v $vr19, $vr18, $vr19 - vand.v $vr19, $vr11, $vr19 - vsub.h $vr23, $vr0, $vr17 - vand.v $vr18, $vr11, $vr18 - vslli.h $vr17, $vr17, 1 - vneg.h $vr17, $vr17 - vslt.h $vr10, $vr4, $vr10 - vand.v $vr11, $vr11, $vr20 - vbitsel.v $vr10, $vr13, $vr2, $vr10 - vbitsel.v $vr10, $vr10, $vr15, $vr14 - vbitsel.v $vr10, $vr10, $vr16, $vr12 - vbitsel.v $vr10, $vr10, $vr22, $vr21 - vbitsel.v $vr10, $vr10, $vr9, $vr11 - vbitsel.v $vr10, $vr10, $vr23, $vr19 - vbitsel.v $vr10, $vr10, $vr17, $vr18 - vst $vr10, $sp, 16 - addi.d $a3, $fp, 54 - ori $a4, $zero, 13 - addi.d $a5, $sp, 16 - ori $a6, $zero, 0 - lu32i.d $a6, 32768 - ori $a7, $zero, 16 + vld $vr11, $s2, 0 + vld $vr12, $s0, 0 + vsrai.h $vr11, $vr11, 2 + vsrai.h $vr13, $vr12, 2 + vadd.h $vr11, $vr13, $vr11 + vsrai.h $vr12, $vr12, 1 + vadd.h $vr11, $vr11, $vr12 + vslti.h $vr12, $vr11, 0 + vslt.hu $vr13, $vr11, $vr2 + vsrli.h $vr14, $vr11, 2 + vadd.h $vr14, $vr14, $vr4 + vadd.h $vr15, $vr11, $vr1 + vslt.hu $vr15, $vr15, $vr6 + vadd.h $vr16, $vr11, $vr2 + vslli.h $vr17, $vr11, 1 + vseq.h $vr18, $vr11, $vr7 + vneg.h $vr19, $vr11 + vbitsel.v $vr18, $vr19, $vr3, $vr18 + vslt.hu $vr19, $vr18, $vr2 + vslt.hu $vr20, $vr18, $vr8 + vslt.hu $vr21, $vr5, $vr18 + vxor.v $vr22, $vr21, $vr20 + vnor.v $vr22, $vr19, $vr22 + vand.v $vr22, $vr12, $vr22 + vsrli.h $vr23, $vr18, 2 + vsub.h $vr23, $vr9, $vr23 + vxor.v $vr20, $vr19, $vr20 + vand.v $vr20, $vr12, $vr20 + vsub.h $vr24, $vr1, $vr18 + vand.v $vr19, $vr12, $vr19 + vslli.h $vr18, $vr18, 1 + vneg.h $vr18, $vr18 + vslt.h $vr11, $vr5, $vr11 + vand.v $vr12, $vr12, $vr21 + vbitsel.v $vr11, $vr14, $vr3, $vr11 + vbitsel.v $vr11, $vr11, $vr16, $vr15 + vbitsel.v $vr11, $vr11, $vr17, $vr13 + vbitsel.v $vr11, $vr11, $vr23, $vr22 + vbitsel.v $vr11, $vr11, $vr10, $vr12 + vbitsel.v $vr11, $vr11, $vr24, $vr20 + vbitsel.v $vr11, $vr11, $vr18, $vr19 + vst $vr11, $sp, 16 + addi.d $a1, $fp, 54 + ori $a2, $zero, 13 + addi.d $a3, $sp, 16 + ori $a4, $zero, 0 + lu32i.d $a4, 32768 + vreplgr2vr.d $vr11, $a4 + ori $a4, $zero, 16 .p2align 4, , 16 .LBB0_9: # =>This Loop Header: Depth=1 # Child Loop BB0_10 Depth 2 - ld.hu $t1, $a3, 0 - move $t0, $zero - move $t2, $t1 + ld.h $a6, $a1, 0 + move $a5, $zero + vinsgr2vr.h $vr12, $a6, 0 + vinsgr2vr.h $vr12, $a6, 1 .p2align 4, , 16 .LBB0_10: # Parent Loop BB0_9 Depth=1 # => This Inner Loop Header: Depth=2 - ldx.h $t3, $t0, $a5 - ldx.h $t4, $a1, $t0 - stx.h $t1, $a1, $t0 - ext.w.h $t2, $t2 - mul.d $t1, $t2, $t3 - slli.d $t1, $t1, 33 - add.d $t1, $t1, $a6 - srai.d $t1, $t1, 48 - add.d $t1, $t1, $t4 - slt $t5, $t1, $a0 - maskeqz $t1, $t1, $t5 - masknez $t5, $a0, $t5 - or $t1, $t1, $t5 - slt $t5, $a2, $t1 - maskeqz $t1, $t1, $t5 - masknez $t5, $a2, $t5 - or $t1, $t1, $t5 - mul.d $t3, $t4, $t3 - slli.d $t3, $t3, 33 - add.d $t3, $t3, $a6 - srai.d $t3, $t3, 48 - add.d $t2, $t3, $t2 - slt $t3, $t2, $a0 - maskeqz $t2, $t2, $t3 - masknez $t3, $a0, $t3 - or $t2, $t2, $t3 - slt $t3, $a2, $t2 - maskeqz $t2, $t2, $t3 - masknez $t3, $a2, $t3 - addi.d $t0, $t0, 2 - or $t2, $t2, $t3 - bne $t0, $a7, .LBB0_10 + ldx.h $a6, $a0, $a5 + add.d $a7, $a0, $a5 + ldx.h $t0, $a5, $a3 + vstelm.h $vr12, $a7, 0, 1 + vinsgr2vr.h $vr13, $a6, 0 + vpickev.h $vr13, $vr12, $vr13 + vslli.d $vr13, $vr13, 48 + vsrai.d $vr13, $vr13, 48 + vreplgr2vr.d $vr14, $t0 + vmul.d $vr13, $vr13, $vr14 + vslli.d $vr13, $vr13, 33 + vadd.d $vr13, $vr13, $vr11 + vsrli.d $vr13, $vr13, 48 + vori.b $vr14, $vr0, 0 + vshuf.h $vr14, $vr0, $vr13 + vinsgr2vr.h $vr12, $a6, 1 + vadd.h $vr13, $vr14, $vr12 + vslt.h $vr14, $vr13, $vr14 + vslti.h $vr12, $vr12, 0 + vxor.v $vr12, $vr12, $vr14 + vsrai.h $vr14, $vr13, 15 + vbitrevi.h $vr14, $vr14, 15 + addi.d $a5, $a5, 2 + vbitsel.v $vr12, $vr13, $vr14, $vr12 + bne $a5, $a4, .LBB0_10 # %bb.11: # in Loop: Header=BB0_9 Depth=1 - addi.w $a4, $a4, -1 - st.h $t2, $a3, 0 - addi.d $a3, $a3, 2 - bnez $a4, .LBB0_9 + addi.w $a2, $a2, -1 + vstelm.h $vr12, $a1, 0, 0 + addi.d $a1, $a1, 2 + bnez $a2, .LBB0_9 # %bb.12: # %Short_term_analysis_filtering.exit84 - vld $vr10, $s0, 0 - vslti.h $vr11, $vr10, 0 - vslt.hu $vr12, $vr10, $vr1 - vsrli.h $vr13, $vr10, 2 - vadd.h $vr3, $vr13, $vr3 - vadd.h $vr13, $vr10, $vr0 - vslt.hu $vr5, $vr13, $vr5 - vadd.h $vr13, $vr10, $vr1 - vslli.h $vr14, $vr10, 1 - vseq.h $vr6, $vr10, $vr6 - vneg.h $vr15, $vr10 - vbitsel.v $vr6, $vr15, $vr2, $vr6 - vslt.hu $vr1, $vr6, $vr1 - vslt.hu $vr7, $vr6, $vr7 - vslt.hu $vr15, $vr4, $vr6 - vxor.v $vr16, $vr15, $vr7 - vnor.v $vr16, $vr1, $vr16 - vand.v $vr16, $vr11, $vr16 - vsrli.h $vr17, $vr6, 2 - vsub.h $vr8, $vr8, $vr17 - vxor.v $vr7, $vr1, $vr7 - vand.v $vr7, $vr11, $vr7 - vsub.h $vr0, $vr0, $vr6 - vand.v $vr1, $vr11, $vr1 - vslli.h $vr6, $vr6, 1 - vneg.h $vr6, $vr6 - vslt.h $vr4, $vr4, $vr10 - vand.v $vr10, $vr11, $vr15 - vbitsel.v $vr2, $vr3, $vr2, $vr4 - vbitsel.v $vr2, $vr2, $vr13, $vr5 - vbitsel.v $vr2, $vr2, $vr14, $vr12 - vbitsel.v $vr2, $vr2, $vr8, $vr16 - vbitsel.v $vr2, $vr2, $vr9, $vr10 - vbitsel.v $vr0, $vr2, $vr0, $vr7 - vbitsel.v $vr0, $vr0, $vr6, $vr1 - vst $vr0, $sp, 16 - addi.d $a3, $fp, 80 - ori $a4, $zero, 120 - addi.d $a5, $sp, 16 - ori $a6, $zero, 0 - lu32i.d $a6, 32768 - ori $a7, $zero, 16 + vld $vr11, $s0, 0 + vslti.h $vr12, $vr11, 0 + vslt.hu $vr13, $vr11, $vr2 + vsrli.h $vr14, $vr11, 2 + vadd.h $vr4, $vr14, $vr4 + vadd.h $vr14, $vr11, $vr1 + vslt.hu $vr6, $vr14, $vr6 + vadd.h $vr14, $vr11, $vr2 + vslli.h $vr15, $vr11, 1 + vseq.h $vr7, $vr11, $vr7 + vneg.h $vr16, $vr11 + vbitsel.v $vr7, $vr16, $vr3, $vr7 + vslt.hu $vr2, $vr7, $vr2 + vslt.hu $vr8, $vr7, $vr8 + vslt.hu $vr16, $vr5, $vr7 + vxor.v $vr17, $vr16, $vr8 + vnor.v $vr17, $vr2, $vr17 + vand.v $vr17, $vr12, $vr17 + vsrli.h $vr18, $vr7, 2 + vsub.h $vr9, $vr9, $vr18 + vxor.v $vr8, $vr2, $vr8 + vand.v $vr8, $vr12, $vr8 + vsub.h $vr1, $vr1, $vr7 + vand.v $vr2, $vr12, $vr2 + vslli.h $vr7, $vr7, 1 + vneg.h $vr7, $vr7 + vslt.h $vr5, $vr5, $vr11 + vand.v $vr11, $vr12, $vr16 + vbitsel.v $vr3, $vr4, $vr3, $vr5 + vbitsel.v $vr3, $vr3, $vr14, $vr6 + vbitsel.v $vr3, $vr3, $vr15, $vr13 + vbitsel.v $vr3, $vr3, $vr9, $vr17 + vbitsel.v $vr3, $vr3, $vr10, $vr11 + vbitsel.v $vr1, $vr3, $vr1, $vr8 + vbitsel.v $vr1, $vr1, $vr7, $vr2 + vst $vr1, $sp, 16 + addi.d $a1, $fp, 80 + ori $a2, $zero, 120 + addi.d $a3, $sp, 16 + ori $a4, $zero, 0 + lu32i.d $a4, 32768 + vreplgr2vr.d $vr1, $a4 + ori $a4, $zero, 16 .p2align 4, , 16 .LBB0_13: # =>This Loop Header: Depth=1 # Child Loop BB0_14 Depth 2 - ld.hu $t1, $a3, 0 - move $t0, $zero - move $t2, $t1 + ld.h $a6, $a1, 0 + move $a5, $zero + vinsgr2vr.h $vr2, $a6, 0 + vinsgr2vr.h $vr2, $a6, 1 .p2align 4, , 16 .LBB0_14: # Parent Loop BB0_13 Depth=1 # => This Inner Loop Header: Depth=2 - ldx.h $t3, $t0, $a5 - ldx.h $t4, $a1, $t0 - stx.h $t1, $a1, $t0 - ext.w.h $t2, $t2 - mul.d $t1, $t2, $t3 - slli.d $t1, $t1, 33 - add.d $t1, $t1, $a6 - srai.d $t1, $t1, 48 - add.d $t1, $t1, $t4 - slt $t5, $t1, $a0 - maskeqz $t1, $t1, $t5 - masknez $t5, $a0, $t5 - or $t1, $t1, $t5 - slt $t5, $a2, $t1 - maskeqz $t1, $t1, $t5 - masknez $t5, $a2, $t5 - or $t1, $t1, $t5 - mul.d $t3, $t4, $t3 - slli.d $t3, $t3, 33 - add.d $t3, $t3, $a6 - srai.d $t3, $t3, 48 - add.d $t2, $t3, $t2 - slt $t3, $t2, $a0 - maskeqz $t2, $t2, $t3 - masknez $t3, $a0, $t3 - or $t2, $t2, $t3 - slt $t3, $a2, $t2 - maskeqz $t2, $t2, $t3 - masknez $t3, $a2, $t3 - addi.d $t0, $t0, 2 - or $t2, $t2, $t3 - bne $t0, $a7, .LBB0_14 + ldx.h $a6, $a0, $a5 + add.d $a7, $a0, $a5 + ldx.h $t0, $a5, $a3 + vstelm.h $vr2, $a7, 0, 1 + vinsgr2vr.h $vr3, $a6, 0 + vpickev.h $vr3, $vr2, $vr3 + vslli.d $vr3, $vr3, 48 + vsrai.d $vr3, $vr3, 48 + vreplgr2vr.d $vr4, $t0 + vmul.d $vr3, $vr3, $vr4 + vslli.d $vr3, $vr3, 33 + vadd.d $vr3, $vr3, $vr1 + vsrli.d $vr3, $vr3, 48 + vori.b $vr4, $vr0, 0 + vshuf.h $vr4, $vr0, $vr3 + vinsgr2vr.h $vr2, $a6, 1 + vadd.h $vr3, $vr4, $vr2 + vslt.h $vr4, $vr3, $vr4 + vslti.h $vr2, $vr2, 0 + vxor.v $vr2, $vr2, $vr4 + vsrai.h $vr4, $vr3, 15 + vbitrevi.h $vr4, $vr4, 15 + addi.d $a5, $a5, 2 + vbitsel.v $vr2, $vr3, $vr4, $vr2 + bne $a5, $a4, .LBB0_14 # %bb.15: # in Loop: Header=BB0_13 Depth=1 - addi.w $a4, $a4, -1 - st.h $t2, $a3, 0 - addi.d $a3, $a3, 2 - bnez $a4, .LBB0_13 + addi.w $a2, $a2, -1 + vstelm.h $vr2, $a1, 0, 0 + addi.d $a1, $a1, 2 + bnez $a2, .LBB0_13 # %bb.16: # %Short_term_analysis_filtering.exit103 - ld.d $s5, $sp, 32 # 8-byte Folded Reload - ld.d $s4, $sp, 40 # 8-byte Folded Reload - ld.d $s3, $sp, 48 # 8-byte Folded Reload - ld.d $s2, $sp, 56 # 8-byte Folded Reload - ld.d $s1, $sp, 64 # 8-byte Folded Reload - ld.d $s0, $sp, 72 # 8-byte Folded Reload - ld.d $fp, $sp, 80 # 8-byte Folded Reload - ld.d $ra, $sp, 88 # 8-byte Folded Reload - addi.d $sp, $sp, 96 + fld.d $fs0, $sp, 40 # 8-byte Folded Reload + ld.d $s5, $sp, 48 # 8-byte Folded Reload + ld.d $s4, $sp, 56 # 8-byte Folded Reload + ld.d $s3, $sp, 64 # 8-byte Folded Reload + ld.d $s2, $sp, 72 # 8-byte Folded Reload + ld.d $s1, $sp, 80 # 8-byte Folded Reload + ld.d $s0, $sp, 88 # 8-byte Folded Reload + ld.d $fp, $sp, 96 # 8-byte Folded Reload + ld.d $ra, $sp, 104 # 8-byte Folded Reload + addi.d $sp, $sp, 112 ret .Lfunc_end0: .size Gsm_Short_Term_Analysis_Filter, .Lfunc_end0-Gsm_Short_Term_Analysis_Filter diff --git a/results/MultiSource/Benchmarks/Olden/bh/CMakeFiles/bh.dir/newbh.s b/results/MultiSource/Benchmarks/Olden/bh/CMakeFiles/bh.dir/newbh.s index b1c6c0b1..fa794400 100644 --- a/results/MultiSource/Benchmarks/Olden/bh/CMakeFiles/bh.dir/newbh.s +++ b/results/MultiSource/Benchmarks/Olden/bh/CMakeFiles/bh.dir/newbh.s @@ -620,57 +620,53 @@ uniform_testdata: # @uniform_testdata .type intcoord,@function intcoord: # @intcoord # %bb.0: - fld.d $fa0, $a0, 16 - fld.d $fa2, $a1, 0 + vld $vr0, $a0, 16 + vld $vr3, $a1, 0 fld.d $fa1, $a1, 24 - move $a2, $zero - fld.d $fa4, $a0, 24 - fsub.d $fa0, $fa0, $fa2 - fdiv.d $fa0, $fa0, $fa1 - movgr2fr.d $fa2, $zero - fcmp.cle.d $fcc0, $fa2, $fa0 - vldi $vr3, -912 - fld.d $fa5, $a1, 8 - fcmp.clt.d $fcc1, $fa0, $fa3 - fld.d $fa3, $a0, 32 - movcf2gr $a0, $fcc0 - movcf2gr $a3, $fcc1 - and $a0, $a0, $a3 - fsub.d $fa4, $fa4, $fa5 - fdiv.d $fa4, $fa4, $fa1 - fcmp.cult.d $fcc0, $fa4, $fa2 - pcalau12i $a3, %pc_hi20(.LCPI3_0) - bcnez $fcc0, .LBB3_3 + fld.d $fa2, $a0, 32 + vfsub.d $vr0, $vr0, $vr3 + vreplvei.d $vr3, $vr1, 0 + vfdiv.d $vr0, $vr0, $vr3 + lu52i.d $a0, $zero, 1023 + vreplgr2vr.d $vr3, $a0 + vfcmp.clt.d $vr3, $vr0, $vr3 + vrepli.b $vr4, 0 + vfcmp.cle.d $vr4, $vr4, $vr0 + vand.v $vr3, $vr4, $vr3 + vpickve2gr.d $a0, $vr3, 0 + vpickve2gr.d $a2, $vr3, 1 + andi $a3, $a2, 1 + andi $a0, $a0, 1 + pcalau12i $a2, %pc_hi20(.LCPI3_0) + beqz $a3, .LBB3_2 # %bb.1: - vldi $vr5, -912 - fcmp.cule.d $fcc0, $fa5, $fa4 - move $a4, $a2 - bcnez $fcc0, .LBB3_4 -# %bb.2: - fld.d $fa5, $a3, %pc_lo12(.LCPI3_0) + fld.d $fa3, $a2, %pc_lo12(.LCPI3_0) slli.d $a4, $a0, 32 - fmul.d $fa4, $fa4, $fa5 - vreplvei.d $vr4, $vr4, 0 - vfrintrm.d $vr4, $vr4 - ftintrz.w.d $fa4, $fa4 - movfr2gr.s $a2, $fa4 - slli.d $a2, $a2, 32 - b .LBB3_4 + vreplvei.d $vr4, $vr0, 1 + fmul.d $fa3, $fa4, $fa3 + vreplvei.d $vr3, $vr3, 0 + vfrintrm.d $vr3, $vr3 + ftintrz.w.d $fa3, $fa3 + movfr2gr.s $a3, $fa3 + slli.d $a3, $a3, 32 + b .LBB3_3 +.LBB3_2: + move $a3, $zero + move $a4, $zero .LBB3_3: - move $a4, $a2 -.LBB3_4: - fld.d $fa4, $a1, 16 - fsub.d $fa3, $fa3, $fa4 - fdiv.d $fa1, $fa3, $fa1 + fld.d $fa3, $a1, 16 + fsub.d $fa2, $fa2, $fa3 + fdiv.d $fa1, $fa2, $fa1 + movgr2fr.d $fa2, $zero fcmp.cult.d $fcc0, $fa1, $fa2 move $a1, $zero - bcnez $fcc0, .LBB3_7 -# %bb.5: + bcnez $fcc0, .LBB3_6 +# %bb.4: vldi $vr2, -912 fcmp.cule.d $fcc0, $fa2, $fa1 - bcnez $fcc0, .LBB3_7 -# %bb.6: - fld.d $fa2, $a3, %pc_lo12(.LCPI3_0) + bcnez $fcc0, .LBB3_6 +# %bb.5: + fld.d $fa2, $a2, %pc_lo12(.LCPI3_0) fmul.d $fa1, $fa1, $fa2 vreplvei.d $vr1, $vr1, 0 vfrintrm.d $vr1, $vr1 @@ -678,16 +674,17 @@ intcoord: # @intcoord movfr2gr.s $a1, $fa1 bstrpick.d $a1, $a1, 31, 0 or $a1, $a4, $a1 -.LBB3_7: - fld.d $fa1, $a3, %pc_lo12(.LCPI3_0) +.LBB3_6: + fld.d $fa1, $a2, %pc_lo12(.LCPI3_0) + vreplvei.d $vr0, $vr0, 0 fmul.d $fa0, $fa0, $fa1 vreplvei.d $vr0, $vr0, 0 vfrintrm.d $vr0, $vr0 ftintrz.w.d $fa0, $fa0 - movfr2gr.s $a3, $fa0 - bstrpick.d $a3, $a3, 31, 0 - maskeqz $a0, $a3, $a0 - or $a0, $a2, $a0 + movfr2gr.s $a2, $fa0 + bstrpick.d $a2, $a2, 31, 0 + maskeqz $a0, $a2, $a0 + or $a0, $a3, $a0 ret .Lfunc_end3: .size intcoord, .Lfunc_end3-intcoord @@ -2155,14 +2152,11 @@ loadtree: # @loadtree .type hackcofm,@function hackcofm: # @hackcofm # %bb.0: - addi.d $sp, $sp, -64 - st.d $ra, $sp, 56 # 8-byte Folded Spill - st.d $fp, $sp, 48 # 8-byte Folded Spill - st.d $s0, $sp, 40 # 8-byte Folded Spill - fst.d $fs0, $sp, 32 # 8-byte Folded Spill - fst.d $fs1, $sp, 24 # 8-byte Folded Spill - fst.d $fs2, $sp, 16 # 8-byte Folded Spill - fst.d $fs3, $sp, 8 # 8-byte Folded Spill + addi.d $sp, $sp, -80 + st.d $ra, $sp, 72 # 8-byte Folded Spill + st.d $fp, $sp, 64 # 8-byte Folded Spill + st.d $s0, $sp, 56 # 8-byte Folded Spill + fst.d $fs0, $sp, 48 # 8-byte Folded Spill move $fp, $a0 ld.hu $a0, $a0, 0 ori $a1, $zero, 2 @@ -2174,162 +2168,178 @@ hackcofm: # @hackcofm move $a0, $s0 pcaddu18i $ra, %call36(hackcofm) jirl $ra, $ra, 0 - fld.d $fa1, $s0, 16 - fld.d $fa2, $s0, 24 - fld.d $fa3, $s0, 32 - fmul.d $fa1, $fa0, $fa1 + vld $vr1, $s0, 16 + fld.d $fa2, $s0, 32 + # kill: def $f0_64 killed $f0_64 def $vr0 + vreplvei.d $vr3, $vr0, 0 + vfmul.d $vr1, $vr3, $vr1 fmul.d $fa2, $fa0, $fa2 - fmul.d $fa3, $fa0, $fa3 - movgr2fr.d $fa4, $zero - fadd.d $fs1, $fa1, $fa4 - fadd.d $fs2, $fa2, $fa4 - fadd.d $fs3, $fa3, $fa4 - fadd.d $fs0, $fa0, $fa4 + vrepli.b $vr3, 0 + vfadd.d $vr3, $vr1, $vr3 + movgr2fr.d $fa1, $zero + fadd.d $fs0, $fa2, $fa1 + fadd.d $fa1, $fa0, $fa1 ld.d $s0, $fp, 56 beqz $s0, .LBB22_4 .LBB22_3: # %.preheader.preheader.1 move $a0, $s0 + vst $vr1, $sp, 32 # 16-byte Folded Spill + vst $vr3, $sp, 16 # 16-byte Folded Spill pcaddu18i $ra, %call36(hackcofm) jirl $ra, $ra, 0 - fld.d $fa1, $s0, 16 - fld.d $fa2, $s0, 24 - fld.d $fa3, $s0, 32 - fmul.d $fa1, $fa0, $fa1 + vld $vr1, $s0, 16 + fld.d $fa2, $s0, 32 + # kill: def $f0_64 killed $f0_64 def $vr0 + vreplvei.d $vr3, $vr0, 0 + vfmul.d $vr1, $vr3, $vr1 + vld $vr3, $sp, 16 # 16-byte Folded Reload fmul.d $fa2, $fa0, $fa2 - fmul.d $fa3, $fa0, $fa3 - fadd.d $fs1, $fs1, $fa1 - fadd.d $fs2, $fs2, $fa2 - fadd.d $fs3, $fs3, $fa3 - fadd.d $fs0, $fs0, $fa0 + vfadd.d $vr3, $vr3, $vr1 + vld $vr1, $sp, 32 # 16-byte Folded Reload + fadd.d $fs0, $fs0, $fa2 + fadd.d $fa1, $fa1, $fa0 .LBB22_4: # %.loopexit45.1 ld.d $s0, $fp, 64 beqz $s0, .LBB22_6 # %bb.5: # %.preheader.preheader.2 move $a0, $s0 + vst $vr1, $sp, 32 # 16-byte Folded Spill + vst $vr3, $sp, 16 # 16-byte Folded Spill pcaddu18i $ra, %call36(hackcofm) jirl $ra, $ra, 0 - fld.d $fa1, $s0, 16 - fld.d $fa2, $s0, 24 - fld.d $fa3, $s0, 32 - fmul.d $fa1, $fa0, $fa1 + vld $vr1, $s0, 16 + fld.d $fa2, $s0, 32 + # kill: def $f0_64 killed $f0_64 def $vr0 + vreplvei.d $vr3, $vr0, 0 + vfmul.d $vr1, $vr3, $vr1 + vld $vr3, $sp, 16 # 16-byte Folded Reload fmul.d $fa2, $fa0, $fa2 - fmul.d $fa3, $fa0, $fa3 - fadd.d $fs1, $fs1, $fa1 - fadd.d $fs2, $fs2, $fa2 - fadd.d $fs3, $fs3, $fa3 - fadd.d $fs0, $fs0, $fa0 + vfadd.d $vr3, $vr3, $vr1 + vld $vr1, $sp, 32 # 16-byte Folded Reload + fadd.d $fs0, $fs0, $fa2 + fadd.d $fa1, $fa1, $fa0 .LBB22_6: # %.loopexit45.2 ld.d $s0, $fp, 72 beqz $s0, .LBB22_8 # %bb.7: # %.preheader.preheader.3 move $a0, $s0 + vst $vr1, $sp, 32 # 16-byte Folded Spill + vst $vr3, $sp, 16 # 16-byte Folded Spill pcaddu18i $ra, %call36(hackcofm) jirl $ra, $ra, 0 - fld.d $fa1, $s0, 16 - fld.d $fa2, $s0, 24 - fld.d $fa3, $s0, 32 - fmul.d $fa1, $fa0, $fa1 + vld $vr1, $s0, 16 + fld.d $fa2, $s0, 32 + # kill: def $f0_64 killed $f0_64 def $vr0 + vreplvei.d $vr3, $vr0, 0 + vfmul.d $vr1, $vr3, $vr1 + vld $vr3, $sp, 16 # 16-byte Folded Reload fmul.d $fa2, $fa0, $fa2 - fmul.d $fa3, $fa0, $fa3 - fadd.d $fs1, $fs1, $fa1 - fadd.d $fs2, $fs2, $fa2 - fadd.d $fs3, $fs3, $fa3 - fadd.d $fs0, $fs0, $fa0 + vfadd.d $vr3, $vr3, $vr1 + vld $vr1, $sp, 32 # 16-byte Folded Reload + fadd.d $fs0, $fs0, $fa2 + fadd.d $fa1, $fa1, $fa0 .LBB22_8: # %.loopexit45.3 ld.d $s0, $fp, 80 beqz $s0, .LBB22_10 # %bb.9: # %.preheader.preheader.4 move $a0, $s0 + vst $vr1, $sp, 32 # 16-byte Folded Spill + vst $vr3, $sp, 16 # 16-byte Folded Spill pcaddu18i $ra, %call36(hackcofm) jirl $ra, $ra, 0 - fld.d $fa1, $s0, 16 - fld.d $fa2, $s0, 24 - fld.d $fa3, $s0, 32 - fmul.d $fa1, $fa0, $fa1 + vld $vr1, $s0, 16 + fld.d $fa2, $s0, 32 + # kill: def $f0_64 killed $f0_64 def $vr0 + vreplvei.d $vr3, $vr0, 0 + vfmul.d $vr1, $vr3, $vr1 + vld $vr3, $sp, 16 # 16-byte Folded Reload fmul.d $fa2, $fa0, $fa2 - fmul.d $fa3, $fa0, $fa3 - fadd.d $fs1, $fs1, $fa1 - fadd.d $fs2, $fs2, $fa2 - fadd.d $fs3, $fs3, $fa3 - fadd.d $fs0, $fs0, $fa0 + vfadd.d $vr3, $vr3, $vr1 + vld $vr1, $sp, 32 # 16-byte Folded Reload + fadd.d $fs0, $fs0, $fa2 + fadd.d $fa1, $fa1, $fa0 .LBB22_10: # %.loopexit45.4 ld.d $s0, $fp, 88 beqz $s0, .LBB22_12 # %bb.11: # %.preheader.preheader.5 move $a0, $s0 + vst $vr1, $sp, 32 # 16-byte Folded Spill + vst $vr3, $sp, 16 # 16-byte Folded Spill pcaddu18i $ra, %call36(hackcofm) jirl $ra, $ra, 0 - fld.d $fa1, $s0, 16 - fld.d $fa2, $s0, 24 - fld.d $fa3, $s0, 32 - fmul.d $fa1, $fa0, $fa1 + vld $vr1, $s0, 16 + fld.d $fa2, $s0, 32 + # kill: def $f0_64 killed $f0_64 def $vr0 + vreplvei.d $vr3, $vr0, 0 + vfmul.d $vr1, $vr3, $vr1 + vld $vr3, $sp, 16 # 16-byte Folded Reload fmul.d $fa2, $fa0, $fa2 - fmul.d $fa3, $fa0, $fa3 - fadd.d $fs1, $fs1, $fa1 - fadd.d $fs2, $fs2, $fa2 - fadd.d $fs3, $fs3, $fa3 - fadd.d $fs0, $fs0, $fa0 + vfadd.d $vr3, $vr3, $vr1 + vld $vr1, $sp, 32 # 16-byte Folded Reload + fadd.d $fs0, $fs0, $fa2 + fadd.d $fa1, $fa1, $fa0 .LBB22_12: # %.loopexit45.5 ld.d $s0, $fp, 96 beqz $s0, .LBB22_14 # %bb.13: # %.preheader.preheader.6 move $a0, $s0 + vst $vr1, $sp, 32 # 16-byte Folded Spill + vst $vr3, $sp, 16 # 16-byte Folded Spill pcaddu18i $ra, %call36(hackcofm) jirl $ra, $ra, 0 - fld.d $fa1, $s0, 16 - fld.d $fa2, $s0, 24 - fld.d $fa3, $s0, 32 - fmul.d $fa1, $fa0, $fa1 + vld $vr1, $s0, 16 + fld.d $fa2, $s0, 32 + # kill: def $f0_64 killed $f0_64 def $vr0 + vreplvei.d $vr3, $vr0, 0 + vfmul.d $vr1, $vr3, $vr1 + vld $vr3, $sp, 16 # 16-byte Folded Reload fmul.d $fa2, $fa0, $fa2 - fmul.d $fa3, $fa0, $fa3 - fadd.d $fs1, $fs1, $fa1 - fadd.d $fs2, $fs2, $fa2 - fadd.d $fs3, $fs3, $fa3 - fadd.d $fs0, $fs0, $fa0 + vfadd.d $vr3, $vr3, $vr1 + vld $vr1, $sp, 32 # 16-byte Folded Reload + fadd.d $fs0, $fs0, $fa2 + fadd.d $fa1, $fa1, $fa0 .LBB22_14: # %.loopexit45.6 ld.d $s0, $fp, 104 beqz $s0, .LBB22_16 # %bb.15: # %.preheader.preheader.7 move $a0, $s0 + vst $vr1, $sp, 32 # 16-byte Folded Spill + vst $vr3, $sp, 16 # 16-byte Folded Spill pcaddu18i $ra, %call36(hackcofm) jirl $ra, $ra, 0 - fld.d $fa1, $s0, 16 - fld.d $fa2, $s0, 24 - fld.d $fa3, $s0, 32 - fmul.d $fa1, $fa0, $fa1 + vld $vr1, $s0, 16 + fld.d $fa2, $s0, 32 + # kill: def $f0_64 killed $f0_64 def $vr0 + vreplvei.d $vr3, $vr0, 0 + vfmul.d $vr1, $vr3, $vr1 + vld $vr3, $sp, 16 # 16-byte Folded Reload fmul.d $fa2, $fa0, $fa2 - fmul.d $fa3, $fa0, $fa3 - fadd.d $fs1, $fs1, $fa1 - fadd.d $fs2, $fs2, $fa2 - fadd.d $fs3, $fs3, $fa3 - fadd.d $fs0, $fs0, $fa0 + vfadd.d $vr3, $vr3, $vr1 + vld $vr1, $sp, 32 # 16-byte Folded Reload + fadd.d $fs0, $fs0, $fa2 + fadd.d $fa1, $fa1, $fa0 .LBB22_16: # %.loopexit45.7 - fst.d $fs0, $fp, 8 - fdiv.d $fa0, $fs1, $fs0 - fst.d $fa0, $fp, 16 - fdiv.d $fa0, $fs2, $fs0 - fst.d $fa0, $fp, 24 - fdiv.d $fa0, $fs3, $fs0 + fst.d $fa1, $fp, 8 + vreplvei.d $vr0, $vr1, 0 + vfdiv.d $vr0, $vr3, $vr0 + vst $vr0, $fp, 16 + fdiv.d $fa0, $fs0, $fa1 fst.d $fa0, $fp, 32 b .LBB22_18 .LBB22_17: - fld.d $fs0, $fp, 8 + fld.d $fa1, $fp, 8 .LBB22_18: # %.loopexit - fmov.d $fa0, $fs0 - fld.d $fs3, $sp, 8 # 8-byte Folded Reload - fld.d $fs2, $sp, 16 # 8-byte Folded Reload - fld.d $fs1, $sp, 24 # 8-byte Folded Reload - fld.d $fs0, $sp, 32 # 8-byte Folded Reload - ld.d $s0, $sp, 40 # 8-byte Folded Reload - ld.d $fp, $sp, 48 # 8-byte Folded Reload - ld.d $ra, $sp, 56 # 8-byte Folded Reload - addi.d $sp, $sp, 64 + fmov.d $fa0, $fa1 + fld.d $fs0, $sp, 48 # 8-byte Folded Reload + ld.d $s0, $sp, 56 # 8-byte Folded Reload + ld.d $fp, $sp, 64 # 8-byte Folded Reload + ld.d $ra, $sp, 72 # 8-byte Folded Reload + addi.d $sp, $sp, 80 ret .LBB22_19: - movgr2fr.d $fs1, $zero - fmov.d $fs2, $fs1 - fmov.d $fs3, $fs1 - fmov.d $fs0, $fs1 + movgr2fr.d $fs0, $zero + vrepli.b $vr3, 0 + fmov.d $fa1, $fs0 ld.d $s0, $fp, 56 bnez $s0, .LBB22_3 b .LBB22_4 @@ -2344,35 +2354,31 @@ ic_test: # @ic_test fld.d $fa0, $a0, 16 fld.d $fa1, $a1, 0 fld.d $fa2, $a1, 24 - fld.d $fa3, $a0, 24 fsub.d $fa0, $fa0, $fa1 fdiv.d $fa0, $fa0, $fa2 movgr2fr.d $fa1, $zero fcmp.cle.d $fcc0, $fa1, $fa0 - fld.d $fa4, $a1, 8 - vldi $vr5, -912 - fcmp.clt.d $fcc1, $fa0, $fa5 - fld.d $fa0, $a0, 32 + vldi $vr1, -912 + vld $vr3, $a0, 24 + vld $vr4, $a1, 8 + fcmp.clt.d $fcc1, $fa0, $fa1 movcf2gr $a0, $fcc0 - movcf2gr $a2, $fcc1 - fsub.d $fa3, $fa3, $fa4 - fdiv.d $fa3, $fa3, $fa2 - fld.d $fa4, $a1, 16 - fcmp.cle.d $fcc0, $fa1, $fa3 - fcmp.clt.d $fcc1, $fa3, $fa5 - and $a0, $a0, $a2 - movcf2gr $a1, $fcc0 - movcf2gr $a2, $fcc1 - fsub.d $fa0, $fa0, $fa4 - fdiv.d $fa0, $fa0, $fa2 - fcmp.cle.d $fcc0, $fa1, $fa0 - fcmp.clt.d $fcc1, $fa0, $fa5 - and $a1, $a1, $a2 - movcf2gr $a2, $fcc0 - movcf2gr $a3, $fcc1 - and $a2, $a2, $a3 + movcf2gr $a1, $fcc1 + and $a0, $a0, $a1 + vfsub.d $vr0, $vr3, $vr4 + vreplvei.d $vr1, $vr2, 0 + vfdiv.d $vr0, $vr0, $vr1 + lu52i.d $a1, $zero, 1023 + vreplgr2vr.d $vr1, $a1 + vfcmp.clt.d $vr1, $vr0, $vr1 + vrepli.b $vr2, 0 + vfcmp.cle.d $vr0, $vr2, $vr0 + vand.v $vr0, $vr0, $vr1 + vpickve2gr.d $a1, $vr0, 0 + vpickve2gr.d $a2, $vr0, 1 and $a1, $a2, $a1 and $a0, $a1, $a0 + andi $a0, $a0, 1 ret .Lfunc_end23: .size ic_test, .Lfunc_end23-ic_test diff --git a/results/MultiSource/Benchmarks/Olden/health/CMakeFiles/health.dir/health.s b/results/MultiSource/Benchmarks/Olden/health/CMakeFiles/health.dir/health.s index 51e6cb15..d9b4e116 100644 --- a/results/MultiSource/Benchmarks/Olden/health/CMakeFiles/health.dir/health.s +++ b/results/MultiSource/Benchmarks/Olden/health/CMakeFiles/health.dir/health.s @@ -113,106 +113,99 @@ get_results: # @get_results # %bb.0: beqz $a0, .LBB1_5 # %bb.1: # %.preheader.preheader - addi.d $sp, $sp, -96 - st.d $ra, $sp, 88 # 8-byte Folded Spill - st.d $fp, $sp, 80 # 8-byte Folded Spill - st.d $s0, $sp, 72 # 8-byte Folded Spill - st.d $s1, $sp, 64 # 8-byte Folded Spill - st.d $s2, $sp, 56 # 8-byte Folded Spill - st.d $s3, $sp, 48 # 8-byte Folded Spill - st.d $s4, $sp, 40 # 8-byte Folded Spill - st.d $s5, $sp, 32 # 8-byte Folded Spill - fst.d $fs0, $sp, 24 # 8-byte Folded Spill - fst.d $fs1, $sp, 16 # 8-byte Folded Spill - fst.d $fs2, $sp, 8 # 8-byte Folded Spill + addi.d $sp, $sp, -80 + st.d $ra, $sp, 72 # 8-byte Folded Spill + st.d $fp, $sp, 64 # 8-byte Folded Spill + st.d $s0, $sp, 56 # 8-byte Folded Spill + st.d $s1, $sp, 48 # 8-byte Folded Spill + st.d $s2, $sp, 40 # 8-byte Folded Spill + fst.d $fs0, $sp, 32 # 8-byte Folded Spill + fst.d $fs1, $sp, 24 # 8-byte Folded Spill + fst.d $fs2, $sp, 16 # 8-byte Folded Spill ld.d $a1, $a0, 24 - move $s5, $a0 + move $s2, $a0 move $a0, $a1 pcaddu18i $ra, %call36(get_results) jirl $ra, $ra, 0 - ld.d $a2, $s5, 16 + ld.d $a2, $s2, 16 move $fp, $a0 - move $s0, $a1 - movgr2fr.w $fs0, $a0 + movgr2fr.w $fs0, $a1 move $a0, $a2 pcaddu18i $ra, %call36(get_results) jirl $ra, $ra, 0 - ld.d $a2, $s5, 8 - move $s1, $a0 - move $s2, $a1 - movgr2fr.w $fs1, $a0 + ld.d $a2, $s2, 8 + move $s0, $a0 + movgr2fr.w $fs1, $a1 move $a0, $a2 pcaddu18i $ra, %call36(get_results) jirl $ra, $ra, 0 - ld.d $a2, $s5, 0 - move $s3, $a0 - move $s4, $a1 - movgr2fr.w $fs2, $a0 + ld.d $a2, $s2, 0 + move $s1, $a0 + movgr2fr.w $fs2, $a1 move $a0, $a2 pcaddu18i $ra, %call36(get_results) jirl $ra, $ra, 0 - pcalau12i $a2, %pc_hi20(.LCPI1_0) - vld $vr0, $a2, %pc_lo12(.LCPI1_0) - movgr2fr.w $fa1, $a0 - movgr2fr.w $fa2, $zero - fadd.s $fa2, $fs0, $fa2 - fadd.s $fa2, $fa2, $fs1 - fadd.s $fa2, $fa2, $fs2 - vinsgr2vr.d $vr3, $s0, 0 - vinsgr2vr.d $vr3, $fp, 1 - vsrl.d $vr3, $vr3, $vr0 + movgr2fr.w $fa0, $a1 + movgr2fr.w $fa1, $zero + fadd.s $fa1, $fs0, $fa1 + pcalau12i $a1, %pc_hi20(.LCPI1_0) + vld $vr2, $a1, %pc_lo12(.LCPI1_0) + fadd.s $fa1, $fa1, $fs1 + fadd.s $fa1, $fa1, $fs2 + vreplgr2vr.d $vr3, $fp + vsrl.d $vr3, $vr3, $vr2 vshuf4i.w $vr3, $vr3, 8 - vinsgr2vr.d $vr4, $s2, 0 - vinsgr2vr.d $vr4, $s1, 1 - vsrl.d $vr4, $vr4, $vr0 + vreplgr2vr.d $vr4, $s0 + vsrl.d $vr4, $vr4, $vr2 vshuf4i.w $vr4, $vr4, 8 - vinsgr2vr.d $vr5, $s4, 0 - vinsgr2vr.d $vr5, $s3, 1 - vsrl.d $vr5, $vr5, $vr0 + vreplgr2vr.d $vr5, $s1 + vsrl.d $vr5, $vr5, $vr2 vshuf4i.w $vr5, $vr5, 8 - vinsgr2vr.d $vr6, $a1, 0 - vinsgr2vr.d $vr6, $a0, 1 - vsrl.d $vr0, $vr6, $vr0 - vshuf4i.w $vr0, $vr0, 8 + vreplgr2vr.d $vr6, $a0 + vsrl.d $vr2, $vr6, $vr2 + vshuf4i.w $vr2, $vr2, 8 vrepli.b $vr6, 0 vfadd.s $vr3, $vr3, $vr6 - ld.d $a0, $s5, 40 + ld.d $a0, $s2, 40 vfadd.s $vr3, $vr3, $vr4 vfadd.s $vr3, $vr3, $vr5 - fadd.s $fa1, $fa2, $fa1 - vfadd.s $vr0, $vr3, $vr0 + fadd.s $fa0, $fa1, $fa0 + vfadd.s $vr1, $vr3, $vr2 beqz $a0, .LBB1_4 # %bb.2: # %.lr.ph.preheader - vldi $vr2, -1168 + lu12i.w $a1, 260096 + vreplgr2vr.w $vr2, $a1 .p2align 4, , 16 .LBB1_3: # %.lr.ph # =>This Inner Loop Header: Depth=1 ld.d $a1, $a0, 8 - ld.d $a1, $a1, 0 + ld.w $a2, $a1, 0 + movgr2fr.w $fa3, $a2 + ld.w $a1, $a1, 4 + ffint.s.w $fa3, $fa3 + fadd.s $fa0, $fa0, $fa3 ld.d $a0, $a0, 0 - vinsgr2vr.d $vr3, $a1, 0 - vffint.s.w $vr3, $vr3 - vfadd.s $vr0, $vr0, $vr3 - fadd.s $fa1, $fa1, $fa2 + movgr2fr.w $fa3, $a1 + ffint.s.w $fa3, $fa3 + vori.b $vr4, $vr2, 0 + vextrins.w $vr4, $vr3, 16 + vfadd.s $vr1, $vr1, $vr4 bnez $a0, .LBB1_3 .LBB1_4: # %._crit_edge - movfr2gr.s $a0, $fa1 - vpickve2gr.w $a2, $vr0, 1 - vpickve2gr.w $a1, $vr0, 0 + vpickve2gr.w $a0, $vr1, 0 + vpickve2gr.w $a2, $vr1, 1 + movfr2gr.s $a1, $fa0 bstrpick.d $a1, $a1, 31, 0 bstrins.d $a0, $a2, 63, 32 - fld.d $fs2, $sp, 8 # 8-byte Folded Reload - fld.d $fs1, $sp, 16 # 8-byte Folded Reload - fld.d $fs0, $sp, 24 # 8-byte Folded Reload - ld.d $s5, $sp, 32 # 8-byte Folded Reload - ld.d $s4, $sp, 40 # 8-byte Folded Reload - ld.d $s3, $sp, 48 # 8-byte Folded Reload - ld.d $s2, $sp, 56 # 8-byte Folded Reload - ld.d $s1, $sp, 64 # 8-byte Folded Reload - ld.d $s0, $sp, 72 # 8-byte Folded Reload - ld.d $fp, $sp, 80 # 8-byte Folded Reload - ld.d $ra, $sp, 88 # 8-byte Folded Reload - addi.d $sp, $sp, 96 + fld.d $fs2, $sp, 16 # 8-byte Folded Reload + fld.d $fs1, $sp, 24 # 8-byte Folded Reload + fld.d $fs0, $sp, 32 # 8-byte Folded Reload + ld.d $s2, $sp, 40 # 8-byte Folded Reload + ld.d $s1, $sp, 48 # 8-byte Folded Reload + ld.d $s0, $sp, 56 # 8-byte Folded Reload + ld.d $fp, $sp, 64 # 8-byte Folded Reload + ld.d $ra, $sp, 72 # 8-byte Folded Reload + addi.d $sp, $sp, 80 ret .LBB1_5: move $a1, $zero diff --git a/results/MultiSource/Benchmarks/Olden/power/CMakeFiles/power.dir/compute.s b/results/MultiSource/Benchmarks/Olden/power/CMakeFiles/power.dir/compute.s index 03e30a7c..fc5063f0 100644 --- a/results/MultiSource/Benchmarks/Olden/power/CMakeFiles/power.dir/compute.s +++ b/results/MultiSource/Benchmarks/Olden/power/CMakeFiles/power.dir/compute.s @@ -947,45 +947,48 @@ make_orthogonal: # @make_orthogonal .type find_gradient_f,@function find_gradient_f: # @find_gradient_f # %bb.0: - addi.d $sp, $sp, -32 - st.d $ra, $sp, 24 # 8-byte Folded Spill - st.d $fp, $sp, 16 # 8-byte Folded Spill - fst.d $fs0, $sp, 8 # 8-byte Folded Spill - fst.d $fs1, $sp, 0 # 8-byte Folded Spill pcalau12i $a1, %pc_hi20(P) fld.d $fa2, $a1, %pc_lo12(P) - vldi $vr3, -912 pcalau12i $a1, %pc_hi20(Q) - fld.d $fa4, $a1, %pc_lo12(Q) - fadd.d $fa2, $fa2, $fa3 - frecip.d $fa2, $fa2 - fsub.d $fs0, $fa2, $fa0 - fadd.d $fa0, $fa4, $fa3 - frecip.d $fa0, $fa0 - fsub.d $fs1, $fa0, $fa1 - movgr2fr.d $fa0, $zero - fmadd.d $fa0, $fs0, $fs0, $fa0 - fmadd.d $fa1, $fs1, $fs1, $fa0 + fld.d $fa3, $a1, %pc_lo12(Q) + # kill: def $f1_64 killed $f1_64 def $vr1 + # kill: def $f0_64 killed $f0_64 def $vr0 + vextrins.d $vr2, $vr3, 16 + lu52i.d $a1, $zero, 1023 + vreplgr2vr.d $vr3, $a1 + vfadd.d $vr2, $vr2, $vr3 + vfrecip.d $vr2, $vr2 + vextrins.d $vr0, $vr1, 16 + vfsub.d $vr2, $vr2, $vr0 + vreplvei.d $vr0, $vr2, 0 + movgr2fr.d $fa1, $zero + fmadd.d $fa0, $fa0, $fa0, $fa1 + vreplvei.d $vr1, $vr2, 1 + fmadd.d $fa1, $fa1, $fa1, $fa0 fsqrt.d $fa0, $fa1 fcmp.cor.d $fcc0, $fa0, $fa0 bceqz $fcc0, .LBB8_2 .LBB8_1: # %.split - fdiv.d $fa1, $fs0, $fa0 - fst.d $fa1, $a0, 0 - fdiv.d $fa1, $fs1, $fa0 - fst.d $fa1, $a0, 8 - fld.d $fs1, $sp, 0 # 8-byte Folded Reload - fld.d $fs0, $sp, 8 # 8-byte Folded Reload - ld.d $fp, $sp, 16 # 8-byte Folded Reload - ld.d $ra, $sp, 24 # 8-byte Folded Reload - addi.d $sp, $sp, 32 + vreplvei.d $vr1, $vr0, 0 + vfdiv.d $vr1, $vr2, $vr1 + vst $vr1, $a0, 0 + # kill: def $f0_64 killed $f0_64 killed $vr0 ret .LBB8_2: # %call.sqrt + addi.d $sp, $sp, -32 + st.d $ra, $sp, 24 # 8-byte Folded Spill + st.d $fp, $sp, 16 # 8-byte Folded Spill fmov.d $fa0, $fa1 move $fp, $a0 + vst $vr2, $sp, 0 # 16-byte Folded Spill pcaddu18i $ra, %call36(sqrt) jirl $ra, $ra, 0 + vld $vr2, $sp, 0 # 16-byte Folded Reload move $a0, $fp + # kill: def $f0_64 killed $f0_64 def $vr0 + ld.d $fp, $sp, 16 # 8-byte Folded Reload + ld.d $ra, $sp, 24 # 8-byte Folded Reload + addi.d $sp, $sp, 32 b .LBB8_1 .Lfunc_end8: .size find_gradient_f, .Lfunc_end8-find_gradient_f diff --git a/results/MultiSource/Benchmarks/Olden/tsp/CMakeFiles/tsp.dir/tsp.s b/results/MultiSource/Benchmarks/Olden/tsp/CMakeFiles/tsp.dir/tsp.s index 1c588a9e..ac32bf9c 100644 --- a/results/MultiSource/Benchmarks/Olden/tsp/CMakeFiles/tsp.dir/tsp.s +++ b/results/MultiSource/Benchmarks/Olden/tsp/CMakeFiles/tsp.dir/tsp.s @@ -32,15 +32,15 @@ tsp: # @tsp move $a2, $s1 pcaddu18i $ra, %call36(tsp) jirl $ra, $ra, 0 - fld.d $fa0, $s3, 8 - fld.d $fa2, $fp, 8 + fld.d $fa1, $s3, 8 + fld.d $fa4, $fp, 8 move $a4, $s3 - fld.d $fa1, $s3, 16 - fld.d $fa4, $fp, 16 + fld.d $fa2, $s3, 16 + fld.d $fa0, $fp, 16 move $a1, $a0 - fsub.d $fa3, $fa0, $fa2 + fsub.d $fa3, $fa1, $fa4 ld.d $a2, $fp, 40 - fsub.d $fa5, $fa1, $fa4 + fsub.d $fa5, $fa2, $fa0 fmul.d $fa5, $fa5, $fa5 fmadd.d $fa3, $fa3, $fa3, $fa5 fsqrt.d $fa3, $fa3 @@ -50,25 +50,25 @@ tsp: # @tsp .p2align 4, , 16 .LBB0_3: # %.lr.ph.i18 # =>This Inner Loop Header: Depth=1 - fld.d $fa2, $a2, 8 + fld.d $fa0, $a2, 8 fld.d $fa4, $a2, 16 - fsub.d $fa2, $fa2, $fa0 - fsub.d $fa4, $fa4, $fa1 + fsub.d $fa0, $fa0, $fa1 + fsub.d $fa4, $fa4, $fa2 fmul.d $fa4, $fa4, $fa4 - fmadd.d $fa2, $fa2, $fa2, $fa4 - fsqrt.d $fa2, $fa2 - fcmp.clt.d $fcc0, $fa2, $fa3 + fmadd.d $fa0, $fa0, $fa0, $fa4 + fsqrt.d $fa0, $fa0 + fcmp.clt.d $fcc0, $fa0, $fa3 movcf2gr $a0, $fcc0 maskeqz $a5, $a2, $a0 ld.d $a2, $a2, 40 - fsel $fa3, $fa3, $fa2, $fcc0 + fsel $fa3, $fa3, $fa0, $fcc0 masknez $a0, $a3, $a0 or $a3, $a5, $a0 bne $a2, $fp, .LBB0_3 # %bb.4: # %._crit_edge.loopexit.i20 ld.d $a2, $a3, 40 - fld.d $fa2, $a3, 8 - fld.d $fa4, $a3, 16 + fld.d $fa4, $a3, 8 + fld.d $fa0, $a3, 16 b .LBB0_17 .LBB0_5: pcaddu18i $ra, %call36(makelist) @@ -132,37 +132,30 @@ tsp: # @tsp move $a3, $a0 .LBB0_14: # %._crit_edge.i # in Loop: Header=BB0_9 Depth=1 - fld.d $fa4, $a3, 8 - fld.d $fa5, $a3, 16 - fld.d $fa6, $a2, 8 - fld.d $fa7, $a2, 16 ld.d $a5, $a3, 48 - fsub.d $ft0, $fa4, $fa6 - fsub.d $ft1, $fa5, $fa7 - fmul.d $ft1, $ft1, $ft1 - fmadd.d $ft0, $ft0, $ft0, $ft1 - fld.d $ft1, $a5, 8 - fld.d $ft2, $a5, 16 ld.d $a4, $a1, 40 - fsqrt.d $ft0, $ft0 - fsub.d $fa4, $fa4, $ft1 - fsub.d $fa5, $fa5, $ft2 - fmul.d $fa5, $fa5, $fa5 - fmadd.d $fa4, $fa4, $fa4, $fa5 - fsqrt.d $fa4, $fa4 - fsub.d $fa5, $fa2, $fa6 - fsub.d $fa6, $fa3, $fa7 - fmul.d $fa6, $fa6, $fa6 - fmadd.d $fa5, $fa5, $fa5, $fa6 - fsqrt.d $fa5, $fa5 - fsub.d $fa2, $fa2, $ft1 - fsub.d $fa3, $fa3, $ft2 - fmul.d $fa3, $fa3, $fa3 - fmadd.d $fa2, $fa2, $fa2, $fa3 - fsqrt.d $fa2, $fa2 - fsub.d $fa2, $fa2, $fa4 - fsub.d $fa3, $fa5, $ft0 - fcmp.cule.d $fcc0, $fa3, $fa2 + vld $vr4, $a2, 8 + vld $vr5, $a5, 8 + vldrepl.d $vr6, $a3, 8 + vldrepl.d $vr7, $a3, 16 + vpackev.d $vr8, $vr4, $vr5 + vfsub.d $vr6, $vr6, $vr8 + vpackod.d $vr4, $vr4, $vr5 + vfsub.d $vr5, $vr7, $vr4 + vfmul.d $vr5, $vr5, $vr5 + vfmadd.d $vr5, $vr6, $vr6, $vr5 + vfsqrt.d $vr5, $vr5 + vreplvei.d $vr2, $vr2, 0 + vfsub.d $vr2, $vr2, $vr8 + vreplvei.d $vr3, $vr3, 0 + vfsub.d $vr3, $vr3, $vr4 + vfmul.d $vr3, $vr3, $vr3 + vfmadd.d $vr2, $vr2, $vr2, $vr3 + vfsqrt.d $vr2, $vr2 + vfsub.d $vr2, $vr2, $vr5 + vreplvei.d $vr3, $vr2, 0 + vreplvei.d $vr2, $vr2, 1 + fcmp.cule.d $fcc0, $fa2, $fa3 bceqz $fcc0, .LBB0_7 # %bb.15: # in Loop: Header=BB0_9 Depth=1 st.d $a1, $a2, 48 @@ -174,110 +167,102 @@ tsp: # @tsp move $a3, $fp .LBB0_17: # %._crit_edge.i23 move $a0, $a4 - fld.d $fa5, $a2, 8 - fld.d $fa6, $a2, 16 ld.d $a4, $a3, 48 - fsub.d $fa7, $fa2, $fa5 - fsub.d $ft0, $fa4, $fa6 - fmul.d $ft0, $ft0, $ft0 - fld.d $ft1, $a4, 8 - fld.d $ft2, $a4, 16 - fmadd.d $fa7, $fa7, $fa7, $ft0 - fsqrt.d $fa7, $fa7 - fsub.d $fa2, $fa2, $ft1 - fsub.d $fa4, $fa4, $ft2 - fmul.d $fa4, $fa4, $fa4 - fmadd.d $fa2, $fa2, $fa2, $fa4 - fsqrt.d $fa2, $fa2 - fsub.d $fa4, $fa0, $fa5 - fsub.d $fa5, $fa1, $fa6 - fmul.d $fa5, $fa5, $fa5 - fmadd.d $fa4, $fa4, $fa4, $fa5 - fsqrt.d $fa4, $fa4 - fsub.d $fa5, $fa0, $ft1 - fsub.d $fa6, $fa1, $ft2 - fmul.d $fa6, $fa6, $fa6 - fmadd.d $fa5, $fa5, $fa5, $fa6 - fsqrt.d $fa5, $fa5 - fsub.d $fa2, $fa5, $fa2 - fsub.d $fa6, $fa4, $fa7 - fcmp.clt.d $fcc0, $fa2, $fa6 - fsel $fa2, $fa4, $fa3, $fcc0 - fsel $fa3, $fa3, $fa5, $fcc0 + vld $vr5, $a2, 8 + vld $vr6, $a4, 8 + vreplvei.d $vr4, $vr4, 0 + vpackev.d $vr7, $vr5, $vr6 + vfsub.d $vr4, $vr4, $vr7 + vreplvei.d $vr0, $vr0, 0 + vpackod.d $vr6, $vr5, $vr6 + vfsub.d $vr0, $vr0, $vr6 + vfmul.d $vr0, $vr0, $vr0 + vfmadd.d $vr0, $vr4, $vr4, $vr0 + vfsqrt.d $vr0, $vr0 + vreplvei.d $vr4, $vr1, 0 + vfsub.d $vr7, $vr4, $vr7 + vreplvei.d $vr5, $vr2, 0 + vfsub.d $vr6, $vr5, $vr6 + vfmul.d $vr6, $vr6, $vr6 + vfmadd.d $vr6, $vr7, $vr7, $vr6 + vfsqrt.d $vr6, $vr6 + vfsub.d $vr0, $vr6, $vr0 + vreplvei.d $vr7, $vr0, 0 + vreplvei.d $vr0, $vr0, 1 + fcmp.clt.d $fcc0, $fa7, $fa0 + vreplvei.d $vr0, $vr6, 1 + fsel $fa0, $fa0, $fa3, $fcc0 + vreplvei.d $vr6, $vr6, 0 + fsel $fa3, $fa3, $fa6, $fcc0 movcf2gr $a5, $fcc0 masknez $a2, $a2, $a5 maskeqz $a6, $a3, $a5 or $a2, $a6, $a2 masknez $a3, $a3, $a5 - fld.d $fa5, $a1, 8 + fld.d $ft0, $a1, 8 maskeqz $a4, $a4, $a5 - fld.d $fa6, $a1, 16 + fld.d $fa7, $a1, 16 or $a3, $a4, $a3 - fsub.d $fa4, $fa0, $fa5 + fsub.d $fa6, $fa1, $ft0 ld.d $a4, $a1, 40 - fsub.d $fa7, $fa1, $fa6 - fmul.d $fa7, $fa7, $fa7 - fmadd.d $fa4, $fa4, $fa4, $fa7 - fsqrt.d $fa4, $fa4 + fsub.d $ft1, $fa2, $fa7 + fmul.d $ft1, $ft1, $ft1 + fmadd.d $fa6, $fa6, $fa6, $ft1 + fsqrt.d $fa6, $fa6 beq $a4, $a1, .LBB0_21 # %bb.18: # %.lr.ph212.i.preheader move $a5, $a1 .p2align 4, , 16 .LBB0_19: # %.lr.ph212.i # =>This Inner Loop Header: Depth=1 - fld.d $fa5, $a4, 8 - fld.d $fa6, $a4, 16 - fsub.d $fa5, $fa5, $fa0 - fsub.d $fa6, $fa6, $fa1 - fmul.d $fa6, $fa6, $fa6 - fmadd.d $fa5, $fa5, $fa5, $fa6 - fsqrt.d $fa5, $fa5 - fcmp.clt.d $fcc0, $fa5, $fa4 + fld.d $fa7, $a4, 8 + fld.d $ft0, $a4, 16 + fsub.d $fa7, $fa7, $fa1 + fsub.d $ft0, $ft0, $fa2 + fmul.d $ft0, $ft0, $ft0 + fmadd.d $fa7, $fa7, $fa7, $ft0 + fsqrt.d $fa7, $fa7 + fcmp.clt.d $fcc0, $fa7, $fa6 movcf2gr $a6, $fcc0 maskeqz $a7, $a4, $a6 ld.d $a4, $a4, 40 - fsel $fa4, $fa4, $fa5, $fcc0 + fsel $fa6, $fa6, $fa7, $fcc0 masknez $a5, $a5, $a6 or $a5, $a7, $a5 bne $a4, $a1, .LBB0_19 # %bb.20: # %._crit_edge213.loopexit.i ld.d $a4, $a5, 40 - fld.d $fa5, $a5, 8 - fld.d $fa6, $a5, 16 + fld.d $ft0, $a5, 8 + fld.d $fa7, $a5, 16 b .LBB0_22 .LBB0_21: move $a5, $a1 .LBB0_22: # %._crit_edge213.i - fld.d $fa7, $a4, 8 - fld.d $ft0, $a4, 16 ld.d $a6, $a5, 48 - fsub.d $ft1, $fa5, $fa7 - fsub.d $ft2, $fa6, $ft0 - fmul.d $ft2, $ft2, $ft2 - fld.d $ft3, $a6, 8 - fld.d $ft4, $a6, 16 - fmadd.d $ft1, $ft1, $ft1, $ft2 - fsqrt.d $ft1, $ft1 - fsub.d $fa5, $fa5, $ft3 - fsub.d $fa6, $fa6, $ft4 - fmul.d $fa6, $fa6, $fa6 - fmadd.d $fa5, $fa5, $fa5, $fa6 - fsqrt.d $fa5, $fa5 - fsub.d $fa6, $fa0, $fa7 - fsub.d $fa7, $fa1, $ft0 - fmul.d $fa7, $fa7, $fa7 - fmadd.d $fa6, $fa6, $fa6, $fa7 - fsqrt.d $fa6, $fa6 - fsub.d $fa0, $fa0, $ft3 - fsub.d $fa1, $fa1, $ft4 - fmul.d $fa1, $fa1, $fa1 - fmadd.d $fa0, $fa0, $fa0, $fa1 - fsqrt.d $fa0, $fa0 - fsub.d $fa1, $fa0, $fa5 - fsub.d $fa5, $fa6, $ft1 - fcmp.clt.d $fcc0, $fa1, $fa5 - fsel $fa1, $fa6, $fa4, $fcc0 - fsel $fa0, $fa4, $fa0, $fcc0 + vld $vr1, $a4, 8 + vld $vr2, $a6, 8 + vreplvei.d $vr8, $vr8, 0 + vpackev.d $vr9, $vr1, $vr2 + vfsub.d $vr8, $vr8, $vr9 + vreplvei.d $vr7, $vr7, 0 + vpackod.d $vr1, $vr1, $vr2 + vfsub.d $vr2, $vr7, $vr1 + vfmul.d $vr2, $vr2, $vr2 + vfmadd.d $vr2, $vr8, $vr8, $vr2 + vfsqrt.d $vr2, $vr2 + vfsub.d $vr4, $vr4, $vr9 + vfsub.d $vr1, $vr5, $vr1 + vfmul.d $vr1, $vr1, $vr1 + vfmadd.d $vr1, $vr4, $vr4, $vr1 + vfsqrt.d $vr1, $vr1 + vfsub.d $vr2, $vr1, $vr2 + vreplvei.d $vr4, $vr2, 0 + vreplvei.d $vr2, $vr2, 1 + fcmp.clt.d $fcc0, $fa4, $fa2 + vreplvei.d $vr2, $vr1, 1 + fsel $fa2, $fa2, $fa6, $fcc0 + vreplvei.d $vr1, $vr1, 0 + fsel $fa1, $fa6, $fa1, $fcc0 movcf2gr $a7, $fcc0 masknez $a1, $a4, $a7 maskeqz $a4, $a5, $a7 @@ -313,25 +298,25 @@ tsp: # @tsp fmul.d $fa7, $fa7, $fa7 fmadd.d $fa5, $fa5, $fa5, $fa7 fsqrt.d $fa5, $fa5 - fadd.d $fa7, $fa3, $fa0 + fadd.d $fa7, $fa3, $fa1 fadd.d $fa7, $fa7, $ft0 - fadd.d $fa3, $fa3, $fa1 + fadd.d $fa3, $fa3, $fa2 fadd.d $fa3, $fa3, $fa4 fcmp.clt.d $fcc0, $fa3, $fa7 fsel $fa3, $fa7, $fa3, $fcc0 movcf2gr $a5, $fcc0 addi.d $a5, $a5, 1 - fadd.d $fa0, $fa2, $fa0 - fadd.d $fa0, $fa0, $fa6 - fcmp.clt.d $fcc0, $fa0, $fa3 - fsel $fa0, $fa3, $fa0, $fcc0 + fadd.d $fa1, $fa0, $fa1 + fadd.d $fa1, $fa1, $fa6 + fcmp.clt.d $fcc0, $fa1, $fa3 + fsel $fa1, $fa3, $fa1, $fcc0 movcf2gr $a6, $fcc0 masknez $a5, $a5, $a6 ori $a7, $zero, 3 maskeqz $a6, $a7, $a6 - fadd.d $fa1, $fa2, $fa1 - fadd.d $fa1, $fa1, $fa5 - fcmp.clt.d $fcc0, $fa1, $fa0 + fadd.d $fa0, $fa0, $fa2 + fadd.d $fa0, $fa0, $fa5 + fcmp.clt.d $fcc0, $fa0, $fa1 or $a5, $a6, $a5 movcf2gr $a6, $fcc0 masknez $a5, $a5, $a6 diff --git a/results/MultiSource/Benchmarks/Olden/voronoi/CMakeFiles/voronoi.dir/vector.s b/results/MultiSource/Benchmarks/Olden/voronoi/CMakeFiles/voronoi.dir/vector.s index e3a15a6f..f9789c7a 100644 --- a/results/MultiSource/Benchmarks/Olden/voronoi/CMakeFiles/voronoi.dir/vector.s +++ b/results/MultiSource/Benchmarks/Olden/voronoi/CMakeFiles/voronoi.dir/vector.s @@ -36,12 +36,11 @@ V2_dot: # @V2_dot .type V2_times,@function V2_times: # @V2_times # %bb.0: - fld.d $fa1, $a1, 0 - fld.d $fa2, $a1, 8 - fmul.d $fa1, $fa0, $fa1 - fst.d $fa1, $a0, 0 - fmul.d $fa0, $fa0, $fa2 - fst.d $fa0, $a0, 8 + vld $vr1, $a1, 0 + # kill: def $f0_64 killed $f0_64 def $vr0 + vreplvei.d $vr0, $vr0, 0 + vfmul.d $vr0, $vr0, $vr1 + vst $vr0, $a0, 0 ret .Lfunc_end2: .size V2_times, .Lfunc_end2-V2_times diff --git a/results/MultiSource/Benchmarks/Prolangs-C/TimberWolfMC/CMakeFiles/timberwolfmc.dir/genorient.s b/results/MultiSource/Benchmarks/Prolangs-C/TimberWolfMC/CMakeFiles/timberwolfmc.dir/genorient.s index f5ba09d5..d794a486 100644 --- a/results/MultiSource/Benchmarks/Prolangs-C/TimberWolfMC/CMakeFiles/timberwolfmc.dir/genorient.s +++ b/results/MultiSource/Benchmarks/Prolangs-C/TimberWolfMC/CMakeFiles/timberwolfmc.dir/genorient.s @@ -5,25 +5,25 @@ .type genorient,@function genorient: # @genorient # %bb.0: - addi.d $sp, $sp, -240 - st.d $ra, $sp, 232 # 8-byte Folded Spill - st.d $fp, $sp, 224 # 8-byte Folded Spill - st.d $s0, $sp, 216 # 8-byte Folded Spill - st.d $s1, $sp, 208 # 8-byte Folded Spill - st.d $s2, $sp, 200 # 8-byte Folded Spill - st.d $s3, $sp, 192 # 8-byte Folded Spill - st.d $s4, $sp, 184 # 8-byte Folded Spill - st.d $s5, $sp, 176 # 8-byte Folded Spill - st.d $s6, $sp, 168 # 8-byte Folded Spill - st.d $s7, $sp, 160 # 8-byte Folded Spill - st.d $s8, $sp, 152 # 8-byte Folded Spill + addi.d $sp, $sp, -256 + st.d $ra, $sp, 248 # 8-byte Folded Spill + st.d $fp, $sp, 240 # 8-byte Folded Spill + st.d $s0, $sp, 232 # 8-byte Folded Spill + st.d $s1, $sp, 224 # 8-byte Folded Spill + st.d $s2, $sp, 216 # 8-byte Folded Spill + st.d $s3, $sp, 208 # 8-byte Folded Spill + st.d $s4, $sp, 200 # 8-byte Folded Spill + st.d $s5, $sp, 192 # 8-byte Folded Spill + st.d $s6, $sp, 184 # 8-byte Folded Spill + st.d $s7, $sp, 176 # 8-byte Folded Spill + st.d $s8, $sp, 168 # 8-byte Folded Spill pcalau12i $a0, %got_pc_hi20(numcells) ld.d $a0, $a0, %got_pc_lo12(numcells) - st.d $a0, $sp, 24 # 8-byte Folded Spill + st.d $a0, $sp, 32 # 8-byte Folded Spill ld.w $a0, $a0, 0 pcalau12i $a1, %got_pc_hi20(numpads) ld.d $a1, $a1, %got_pc_lo12(numpads) - st.d $a1, $sp, 16 # 8-byte Folded Spill + st.d $a1, $sp, 24 # 8-byte Folded Spill ld.w $a1, $a1, 0 add.w $a0, $a1, $a0 blez $a0, .LBB0_161 @@ -31,22 +31,22 @@ genorient: # @genorient ori $a0, $zero, 1 pcalau12i $a1, %got_pc_hi20(cellarray) ld.d $a1, $a1, %got_pc_lo12(cellarray) - st.d $a1, $sp, 32 # 8-byte Folded Spill - ori $s8, $zero, 7 + st.d $a1, $sp, 40 # 8-byte Folded Spill + ori $s7, $zero, 7 pcalau12i $a1, %pc_hi20(.Lswitch.table.genorient) addi.d $a1, $a1, %pc_lo12(.Lswitch.table.genorient) - st.d $a1, $sp, 56 # 8-byte Folded Spill + st.d $a1, $sp, 64 # 8-byte Folded Spill # implicit-def: $r5 # kill: killed $r5 b .LBB0_4 .p2align 4, , 16 .LBB0_2: # in Loop: Header=BB0_4 Depth=1 - ld.d $a2, $sp, 72 # 8-byte Folded Reload + ld.d $a2, $sp, 80 # 8-byte Folded Reload .LBB0_3: # %.loopexit934 # in Loop: Header=BB0_4 Depth=1 - ld.d $a0, $sp, 24 # 8-byte Folded Reload + ld.d $a0, $sp, 32 # 8-byte Folded Reload ld.w $a0, $a0, 0 - ld.d $a1, $sp, 16 # 8-byte Folded Reload + ld.d $a1, $sp, 24 # 8-byte Folded Reload ld.w $a1, $a1, 0 add.w $a1, $a1, $a0 addi.d $a0, $a2, 1 @@ -56,36 +56,35 @@ genorient: # @genorient # Child Loop BB0_34 Depth 2 # Child Loop BB0_79 Depth 2 # Child Loop BB0_70 Depth 2 - ld.d $a1, $sp, 32 # 8-byte Folded Reload + ld.d $a1, $sp, 40 # 8-byte Folded Reload ld.d $a1, $a1, 0 move $a2, $a0 slli.d $a0, $a0, 3 - ldx.d $a0, $a1, $a0 - st.d $a0, $sp, 144 # 8-byte Folded Spill - ld.w $s1, $a0, 56 + ldx.d $a1, $a1, $a0 + ld.w $s1, $a1, 56 beqz $s1, .LBB0_3 # %bb.5: # in Loop: Header=BB0_4 Depth=1 - st.d $a2, $sp, 72 # 8-byte Folded Spill - bltu $s8, $s1, .LBB0_7 + st.d $a2, $sp, 80 # 8-byte Folded Spill + bltu $s7, $s1, .LBB0_7 # %bb.6: # %switch.lookup # in Loop: Header=BB0_4 Depth=1 - ld.d $a0, $sp, 56 # 8-byte Folded Reload + ld.d $a0, $sp, 64 # 8-byte Folded Reload alsl.d $a0, $s1, $a0, 2 ld.w $a0, $a0, -4 - st.d $a0, $sp, 112 # 8-byte Folded Spill + st.d $a0, $sp, 120 # 8-byte Folded Spill .LBB0_7: # in Loop: Header=BB0_4 Depth=1 - ld.d $a0, $sp, 144 # 8-byte Folded Reload - ld.d $s5, $a0, 152 + ld.d $s5, $a1, 152 vld $vr0, $s5, 56 - vst $vr0, $sp, 128 # 16-byte Folded Spill - addi.d $fp, $a0, 152 + vst $vr0, $sp, 144 # 16-byte Folded Spill + st.d $a1, $sp, 128 # 8-byte Folded Spill + addi.d $fp, $a1, 152 ori $a0, $zero, 104 pcaddu18i $ra, %call36(malloc) jirl $ra, $ra, 0 - vld $vr4, $sp, 128 # 16-byte Folded Reload + vld $vr4, $sp, 144 # 16-byte Folded Reload move $s0, $a0 slli.d $a0, $s1, 3 - st.d $fp, $sp, 96 # 8-byte Folded Spill + st.d $fp, $sp, 112 # 8-byte Folded Spill stx.d $s0, $fp, $a0 st.d $zero, $s0, 0 vst $vr4, $s0, 56 @@ -99,14 +98,14 @@ genorient: # @genorient fst.d $fa1, $s0, 16 fld.d $fa0, $s5, 32 fst.d $fa2, $s0, 24 - ld.d $a0, $sp, 112 # 8-byte Folded Reload + ld.d $a0, $sp, 120 # 8-byte Folded Reload addi.w $fp, $a0, 0 - addi.d $s6, $fp, -1 + addi.d $s8, $fp, -1 fst.d $fa0, $s0, 32 ori $a0, $zero, 6 - bltu $a0, $s6, .LBB0_20 + bltu $a0, $s8, .LBB0_20 # %bb.8: # in Loop: Header=BB0_4 Depth=1 - slli.d $a0, $s6, 2 + slli.d $a0, $s8, 2 pcalau12i $a1, %pc_hi20(.LJTI0_0) addi.d $a1, $a1, %pc_lo12(.LJTI0_0) ldx.w $a0, $a1, $a0 @@ -161,18 +160,17 @@ genorient: # @genorient # in Loop: Header=BB0_4 Depth=1 fst.d $fa0, $s5, 32 .LBB0_20: # in Loop: Header=BB0_4 Depth=1 - st.d $s1, $sp, 64 # 8-byte Folded Spill + st.d $s1, $sp, 72 # 8-byte Folded Spill addi.d $s1, $s5, 68 addi.d $s2, $s5, 64 addi.d $s3, $s5, 60 addi.d $s4, $s5, 56 vpickve2gr.w $a0, $vr4, 2 vpickve2gr.w $a1, $vr4, 3 - sub.w $s7, $a1, $a0 - vpickve2gr.w $a0, $vr4, 0 - vpickve2gr.w $a1, $vr4, 1 - sub.w $a0, $a1, $a0 - st.d $a0, $sp, 104 # 8-byte Folded Spill + sub.w $s6, $a1, $a0 + vreplvei.w $vr0, $vr4, 1 + vsub.w $vr0, $vr0, $vr4 + vpickve2gr.w $s7, $vr0, 0 move $a0, $fp pcaddu18i $ra, %call36(move) jirl $ra, $ra, 0 @@ -182,20 +180,22 @@ genorient: # @genorient move $a3, $s1 pcaddu18i $ra, %call36(rect) jirl $ra, $ra, 0 - ld.d $a0, $sp, 112 # 8-byte Folded Reload + ld.d $a0, $sp, 120 # 8-byte Folded Reload bstrins.d $a0, $zero, 1, 1 addi.w $a0, $a0, 0 addi.d $a1, $a0, -4 sltui $a1, $a1, 1 - and $a1, $s7, $a1 - st.d $a1, $sp, 88 # 8-byte Folded Spill - bnez $a1, .LBB0_23 + and $a2, $s6, $a1 + st.d $s7, $sp, 96 # 8-byte Folded Spill + andi $a1, $s7, 1 + st.d $a1, $sp, 136 # 8-byte Folded Spill + st.d $a2, $sp, 104 # 8-byte Folded Spill + bnez $a2, .LBB0_23 # %bb.21: # in Loop: Header=BB0_4 Depth=1 - ld.d $a1, $sp, 104 # 8-byte Folded Reload - andi $a1, $a1, 1 + ld.d $a1, $sp, 136 # 8-byte Folded Reload beqz $a1, .LBB0_24 # %bb.22: # in Loop: Header=BB0_4 Depth=1 - ld.d $a1, $sp, 112 # 8-byte Folded Reload + ld.d $a1, $sp, 120 # 8-byte Folded Reload bstrins.d $a1, $zero, 0, 0 addi.w $a1, $a1, 0 ori $a2, $zero, 2 @@ -213,13 +213,14 @@ genorient: # @genorient ld.w $a2, $s4, 0 ld.w $a1, $s3, 0 .LBB0_25: # in Loop: Header=BB0_4 Depth=1 - andi $a3, $s7, 1 + ori $a4, $zero, 7 + andi $a3, $s6, 1 st.w $a2, $s5, 72 addi.d $a0, $a0, -1 sltui $a0, $a0, 1 and $a0, $a3, $a0 st.w $a1, $s5, 76 - st.d $a0, $sp, 128 # 8-byte Folded Spill + st.d $a0, $sp, 144 # 8-byte Folded Spill beqz $a0, .LBB0_27 .LBB0_26: # in Loop: Header=BB0_4 Depth=1 ld.w $a0, $s2, 0 @@ -231,11 +232,10 @@ genorient: # @genorient b .LBB0_30 .p2align 4, , 16 .LBB0_27: # in Loop: Header=BB0_4 Depth=1 - ld.d $a0, $sp, 104 # 8-byte Folded Reload - andi $a0, $a0, 1 + ld.d $a0, $sp, 136 # 8-byte Folded Reload beqz $a0, .LBB0_30 # %bb.28: # in Loop: Header=BB0_4 Depth=1 - beq $fp, $s8, .LBB0_26 + beq $fp, $a4, .LBB0_26 # %bb.29: # in Loop: Header=BB0_4 Depth=1 ori $a0, $zero, 4 beq $fp, $a0, .LBB0_26 @@ -243,25 +243,23 @@ genorient: # @genorient .LBB0_30: # in Loop: Header=BB0_4 Depth=1 ld.d $a0, $s5, 64 st.d $a0, $s5, 80 - ld.d $a0, $sp, 144 # 8-byte Folded Reload + ld.d $a0, $sp, 128 # 8-byte Folded Reload ld.w $a0, $a0, 60 - ld.d $a1, $sp, 112 # 8-byte Folded Reload + ld.d $a1, $sp, 120 # 8-byte Folded Reload bstrpick.d $a1, $a1, 31, 1 - st.d $a1, $sp, 80 # 8-byte Folded Spill + st.d $a1, $sp, 88 # 8-byte Folded Spill blez $a0, .LBB0_53 # %bb.31: # %.lr.ph # in Loop: Header=BB0_4 Depth=1 move $s7, $zero - ld.d $a1, $sp, 104 # 8-byte Folded Reload - andi $a0, $a1, 1 - st.d $a0, $sp, 120 # 8-byte Folded Spill - ld.d $a0, $sp, 80 # 8-byte Folded Reload + ld.d $a0, $sp, 88 # 8-byte Folded Reload slli.w $a0, $a0, 1 addi.d $a0, $a0, -2 sltui $a0, $a0, 1 + ld.d $a1, $sp, 96 # 8-byte Folded Reload and $a0, $a1, $a0 - ld.d $a1, $sp, 88 # 8-byte Folded Reload - or $s8, $a1, $a0 + ld.d $a1, $sp, 104 # 8-byte Folded Reload + or $s6, $a1, $a0 b .LBB0_34 .p2align 4, , 16 .LBB0_32: # in Loop: Header=BB0_34 Depth=2 @@ -274,7 +272,7 @@ genorient: # @genorient .LBB0_33: # in Loop: Header=BB0_34 Depth=2 ld.d $a0, $s5, 64 st.d $a0, $s5, 80 - ld.d $a0, $sp, 144 # 8-byte Folded Reload + ld.d $a0, $sp, 128 # 8-byte Folded Reload ld.w $a0, $a0, 60 addi.w $s7, $s7, 1 bge $s7, $a0, .LBB0_53 @@ -302,9 +300,9 @@ genorient: # @genorient fst.d $fa1, $a0, 24 fst.d $fa0, $a0, 32 ori $a0, $zero, 6 - bltu $a0, $s6, .LBB0_47 + bltu $a0, $s8, .LBB0_47 # %bb.35: # in Loop: Header=BB0_34 Depth=2 - slli.d $a0, $s6, 2 + slli.d $a0, $s8, 2 pcalau12i $a1, %pc_hi20(.LJTI0_1) addi.d $a1, $a1, %pc_lo12(.LJTI0_1) ldx.w $a0, $a1, $a0 @@ -373,7 +371,7 @@ genorient: # @genorient jirl $ra, $ra, 0 ld.w $a1, $s4, 0 ld.w $a0, $s3, 0 - beqz $s8, .LBB0_49 + beqz $s6, .LBB0_49 # %bb.48: # in Loop: Header=BB0_34 Depth=2 addi.d $a1, $a1, 1 st.w $a1, $s4, 0 @@ -382,10 +380,10 @@ genorient: # @genorient .LBB0_49: # in Loop: Header=BB0_34 Depth=2 st.w $a1, $s5, 72 st.w $a0, $s5, 76 - ld.d $a0, $sp, 128 # 8-byte Folded Reload + ld.d $a0, $sp, 144 # 8-byte Folded Reload bnez $a0, .LBB0_32 # %bb.50: # in Loop: Header=BB0_34 Depth=2 - ld.d $a0, $sp, 120 # 8-byte Folded Reload + ld.d $a0, $sp, 136 # 8-byte Folded Reload beqz $a0, .LBB0_33 # %bb.51: # in Loop: Header=BB0_34 Depth=2 ori $a0, $zero, 7 @@ -397,15 +395,15 @@ genorient: # @genorient .p2align 4, , 16 .LBB0_53: # %._crit_edge # in Loop: Header=BB0_4 Depth=1 - ld.d $a0, $sp, 144 # 8-byte Folded Reload - ld.w $a0, $a0, 68 - ld.d $a1, $sp, 96 # 8-byte Folded Reload - ld.d $a2, $sp, 64 # 8-byte Folded Reload + ld.d $s8, $sp, 128 # 8-byte Folded Reload + ld.w $a0, $s8, 68 + ld.d $a1, $sp, 112 # 8-byte Folded Reload + ld.d $a2, $sp, 72 # 8-byte Folded Reload alsl.d $s3, $a2, $a1, 3 - ori $s8, $zero, 7 + ori $s7, $zero, 7 beqz $a0, .LBB0_65 # %bb.54: # in Loop: Header=BB0_4 Depth=1 - ld.d $a0, $sp, 96 # 8-byte Folded Reload + ld.d $a0, $sp, 112 # 8-byte Folded Reload ld.d $a0, $a0, 0 ld.d $s4, $a0, 88 ori $a0, $zero, 32 @@ -428,14 +426,13 @@ genorient: # @genorient move $a1, $s1 pcaddu18i $ra, %call36(point) jirl $ra, $ra, 0 - ld.d $a0, $sp, 88 # 8-byte Folded Reload + ld.d $a0, $sp, 104 # 8-byte Folded Reload bnez $a0, .LBB0_57 # %bb.55: # in Loop: Header=BB0_4 Depth=1 - ld.d $a0, $sp, 104 # 8-byte Folded Reload - andi $a0, $a0, 1 + ld.d $a0, $sp, 136 # 8-byte Folded Reload beqz $a0, .LBB0_58 # %bb.56: # in Loop: Header=BB0_4 Depth=1 - ld.d $a0, $sp, 112 # 8-byte Folded Reload + ld.d $a0, $sp, 120 # 8-byte Folded Reload bstrins.d $a0, $zero, 0, 0 addi.w $a0, $a0, 0 ori $a1, $zero, 2 @@ -450,7 +447,7 @@ genorient: # @genorient ld.w $a0, $s2, 0 .LBB0_59: # in Loop: Header=BB0_4 Depth=1 st.w $a0, $s4, 16 - ld.d $a0, $sp, 128 # 8-byte Folded Reload + ld.d $a0, $sp, 144 # 8-byte Folded Reload beqz $a0, .LBB0_61 .LBB0_60: # in Loop: Header=BB0_4 Depth=1 ld.w $a0, $s1, 0 @@ -458,29 +455,26 @@ genorient: # @genorient st.w $a0, $s1, 0 b .LBB0_64 .LBB0_61: # in Loop: Header=BB0_4 Depth=1 - ld.d $a0, $sp, 104 # 8-byte Folded Reload - andi $a0, $a0, 1 + ld.d $a0, $sp, 136 # 8-byte Folded Reload beqz $a0, .LBB0_64 # %bb.62: # in Loop: Header=BB0_4 Depth=1 - beq $fp, $s8, .LBB0_60 + beq $fp, $s7, .LBB0_60 # %bb.63: # in Loop: Header=BB0_4 Depth=1 ori $a0, $zero, 4 beq $fp, $a0, .LBB0_60 .p2align 4, , 16 .LBB0_64: # in Loop: Header=BB0_4 Depth=1 ld.w $a0, $s4, 12 - ld.d $a1, $sp, 144 # 8-byte Folded Reload - ld.w $a1, $a1, 68 + ld.w $a1, $s8, 68 st.w $a0, $s4, 20 ori $a0, $zero, 2 bge $a1, $a0, .LBB0_76 .LBB0_65: # %.loopexit935 # in Loop: Header=BB0_4 Depth=1 - ld.d $a0, $sp, 144 # 8-byte Folded Reload - ld.w $s1, $a0, 128 + ld.w $s1, $s8, 128 beqz $s1, .LBB0_2 # %bb.66: # in Loop: Header=BB0_4 Depth=1 - ld.d $a0, $sp, 96 # 8-byte Folded Reload + ld.d $a0, $sp, 112 # 8-byte Folded Reload ld.d $a0, $a0, 0 ld.d $s0, $a0, 96 slli.d $a0, $s1, 4 @@ -493,16 +487,15 @@ genorient: # @genorient # %bb.67: # %.lr.ph949 # in Loop: Header=BB0_4 Depth=1 move $s2, $zero - ld.d $a2, $sp, 104 # 8-byte Folded Reload - andi $s3, $a2, 1 - ld.d $a1, $sp, 80 # 8-byte Folded Reload + ld.d $a1, $sp, 88 # 8-byte Folded Reload slli.w $a1, $a1, 1 addi.d $a1, $a1, -2 sltui $a1, $a1, 1 + ld.d $a2, $sp, 96 # 8-byte Folded Reload and $a1, $a2, $a1 - ld.d $a2, $sp, 88 # 8-byte Folded Reload - or $s4, $a2, $a1 - addi.d $s5, $a0, 16 + ld.d $a2, $sp, 104 # 8-byte Folded Reload + or $s3, $a2, $a1 + addi.d $s4, $a0, 16 addi.d $s0, $s0, 16 b .LBB0_70 .p2align 4, , 16 @@ -512,18 +505,17 @@ genorient: # @genorient st.w $a0, $s1, 0 .LBB0_69: # in Loop: Header=BB0_70 Depth=2 ld.w $a0, $s0, 4 - ld.d $a1, $sp, 144 # 8-byte Folded Reload - ld.w $a1, $a1, 128 + ld.w $a1, $s8, 128 st.w $a0, $s0, 12 addi.d $s2, $s2, 1 - addi.d $s5, $s5, 16 + addi.d $s4, $s4, 16 addi.d $s0, $s0, 16 bge $s2, $a1, .LBB0_2 .LBB0_70: # Parent Loop BB0_4 Depth=1 # => This Inner Loop Header: Depth=2 vld $vr0, $s0, 0 addi.d $s1, $s0, 4 - vst $vr0, $s5, 0 + vst $vr0, $s4, 0 move $a0, $fp pcaddu18i $ra, %call36(move) jirl $ra, $ra, 0 @@ -532,34 +524,34 @@ genorient: # @genorient pcaddu18i $ra, %call36(point) jirl $ra, $ra, 0 ld.w $a0, $s0, 0 - beqz $s4, .LBB0_72 + beqz $s3, .LBB0_72 # %bb.71: # in Loop: Header=BB0_70 Depth=2 addi.d $a0, $a0, 1 st.w $a0, $s0, 0 .LBB0_72: # in Loop: Header=BB0_70 Depth=2 st.w $a0, $s0, 8 - ld.d $a0, $sp, 128 # 8-byte Folded Reload + ld.d $a0, $sp, 144 # 8-byte Folded Reload bnez $a0, .LBB0_68 # %bb.73: # in Loop: Header=BB0_70 Depth=2 - beqz $s3, .LBB0_69 + ld.d $a0, $sp, 136 # 8-byte Folded Reload + beqz $a0, .LBB0_69 # %bb.74: # in Loop: Header=BB0_70 Depth=2 - beq $fp, $s8, .LBB0_68 + beq $fp, $s7, .LBB0_68 # %bb.75: # in Loop: Header=BB0_70 Depth=2 ori $a0, $zero, 4 beq $fp, $a0, .LBB0_68 b .LBB0_69 .LBB0_76: # %.lr.ph945 # in Loop: Header=BB0_4 Depth=1 - ld.d $a1, $sp, 104 # 8-byte Folded Reload - andi $s5, $a1, 1 - ld.d $a0, $sp, 80 # 8-byte Folded Reload + ld.d $a0, $sp, 88 # 8-byte Folded Reload slli.w $a0, $a0, 1 addi.d $a0, $a0, -2 sltui $a0, $a0, 1 + ld.d $a1, $sp, 96 # 8-byte Folded Reload and $a0, $a1, $a0 - ld.d $a1, $sp, 88 # 8-byte Folded Reload - or $s6, $a1, $a0 - ori $s7, $zero, 1 + ld.d $a1, $sp, 104 # 8-byte Folded Reload + or $s5, $a1, $a0 + ori $s6, $zero, 1 b .LBB0_79 .p2align 4, , 16 .LBB0_77: # in Loop: Header=BB0_79 Depth=2 @@ -568,11 +560,10 @@ genorient: # @genorient st.w $a0, $s1, 0 .LBB0_78: # in Loop: Header=BB0_79 Depth=2 ld.w $a0, $s4, 12 - ld.d $a1, $sp, 144 # 8-byte Folded Reload - ld.w $a1, $a1, 68 - addi.w $s7, $s7, 1 + ld.w $a1, $s8, 68 + addi.w $s6, $s6, 1 st.w $a0, $s4, 20 - bge $s7, $a1, .LBB0_65 + bge $s6, $a1, .LBB0_65 .LBB0_79: # Parent Loop BB0_4 Depth=1 # => This Inner Loop Header: Depth=2 ld.d $s4, $s4, 0 @@ -597,18 +588,19 @@ genorient: # @genorient pcaddu18i $ra, %call36(point) jirl $ra, $ra, 0 ld.w $a0, $s4, 8 - beqz $s6, .LBB0_81 + beqz $s5, .LBB0_81 # %bb.80: # in Loop: Header=BB0_79 Depth=2 addi.d $a0, $a0, 1 st.w $a0, $s2, 0 .LBB0_81: # in Loop: Header=BB0_79 Depth=2 st.w $a0, $s4, 16 - ld.d $a0, $sp, 128 # 8-byte Folded Reload + ld.d $a0, $sp, 144 # 8-byte Folded Reload bnez $a0, .LBB0_77 # %bb.82: # in Loop: Header=BB0_79 Depth=2 - beqz $s5, .LBB0_78 + ld.d $a0, $sp, 136 # 8-byte Folded Reload + beqz $a0, .LBB0_78 # %bb.83: # in Loop: Header=BB0_79 Depth=2 - beq $fp, $s8, .LBB0_77 + beq $fp, $s7, .LBB0_77 # %bb.84: # in Loop: Header=BB0_79 Depth=2 ori $a0, $zero, 4 beq $fp, $a0, .LBB0_77 @@ -619,34 +611,34 @@ genorient: # @genorient ori $a2, $zero, 1 pcalau12i $a0, %pc_hi20(.Lswitch.table.genorient.5) addi.d $a0, $a0, %pc_lo12(.Lswitch.table.genorient.5) - st.d $a0, $sp, 72 # 8-byte Folded Spill + st.d $a0, $sp, 80 # 8-byte Folded Spill pcalau12i $a0, %pc_hi20(.Lswitch.table.genorient.6) addi.d $a0, $a0, %pc_lo12(.Lswitch.table.genorient.6) - st.d $a0, $sp, 64 # 8-byte Folded Spill + st.d $a0, $sp, 72 # 8-byte Folded Spill pcalau12i $a0, %pc_hi20(.Lswitch.table.genorient.7) addi.d $a0, $a0, %pc_lo12(.Lswitch.table.genorient.7) - st.d $a0, $sp, 56 # 8-byte Folded Spill + st.d $a0, $sp, 64 # 8-byte Folded Spill pcalau12i $a0, %pc_hi20(.Lswitch.table.genorient.8) addi.d $a0, $a0, %pc_lo12(.Lswitch.table.genorient.8) - st.d $a0, $sp, 48 # 8-byte Folded Spill + st.d $a0, $sp, 56 # 8-byte Folded Spill b .LBB0_88 .p2align 4, , 16 .LBB0_87: # in Loop: Header=BB0_88 Depth=1 - ld.d $a0, $sp, 24 # 8-byte Folded Reload + ld.d $a0, $sp, 32 # 8-byte Folded Reload ld.w $a0, $a0, 0 - ld.d $a1, $sp, 16 # 8-byte Folded Reload + ld.d $a1, $sp, 24 # 8-byte Folded Reload ld.w $a1, $a1, 0 add.w $a3, $a1, $a0 - ld.d $a4, $sp, 8 # 8-byte Folded Reload + ld.d $a4, $sp, 16 # 8-byte Folded Reload addi.d $a2, $a4, 1 bge $a4, $a3, .LBB0_113 .LBB0_88: # %.lr.ph965 # =>This Loop Header: Depth=1 # Child Loop BB0_90 Depth 2 # Child Loop BB0_105 Depth 3 - ld.d $a0, $sp, 32 # 8-byte Folded Reload + ld.d $a0, $sp, 40 # 8-byte Folded Reload ld.d $a0, $a0, 0 - st.d $a2, $sp, 8 # 8-byte Folded Spill + st.d $a2, $sp, 16 # 8-byte Folded Spill slli.d $a1, $a2, 3 ldx.d $a4, $a0, $a1 ld.d $a0, $a4, 152 @@ -655,16 +647,16 @@ genorient: # @genorient ld.w $a3, $a0, 60 ld.w $a0, $a0, 56 addi.d $a5, $a4, 152 - st.d $a5, $sp, 40 # 8-byte Folded Spill + st.d $a5, $sp, 48 # 8-byte Folded Spill sub.d $a1, $a1, $a2 sub.d $a0, $a3, $a0 st.d $a4, $sp, 144 # 8-byte Folded Spill addi.d $a2, $a4, 20 - st.d $a2, $sp, 80 # 8-byte Folded Spill + st.d $a2, $sp, 88 # 8-byte Folded Spill andi $a1, $a1, 1 - st.d $a1, $sp, 88 # 8-byte Folded Spill + st.d $a1, $sp, 96 # 8-byte Folded Spill andi $a0, $a0, 1 - st.d $a0, $sp, 128 # 8-byte Folded Spill + st.d $a0, $sp, 136 # 8-byte Folded Spill ori $fp, $zero, 1 b .LBB0_90 .p2align 4, , 16 @@ -681,12 +673,12 @@ genorient: # @genorient beq $fp, $a0, .LBB0_89 # %bb.91: # in Loop: Header=BB0_90 Depth=2 slli.d $a0, $fp, 2 - ld.d $a1, $sp, 80 # 8-byte Folded Reload + ld.d $a1, $sp, 88 # 8-byte Folded Reload ldx.w $a0, $a1, $a0 beqz $a0, .LBB0_89 # %bb.92: # %switch.lookup1096 # in Loop: Header=BB0_90 Depth=2 - ld.d $s1, $sp, 40 # 8-byte Folded Reload + ld.d $s1, $sp, 48 # 8-byte Folded Reload ld.d $s7, $s1, 0 ori $a0, $zero, 104 pcaddu18i $ra, %call36(malloc) @@ -695,21 +687,21 @@ genorient: # @genorient slli.d $a0, $fp, 3 stx.d $s0, $s1, $a0 addi.d $s2, $a0, -8 - ld.d $a0, $sp, 72 # 8-byte Folded Reload + ld.d $a0, $sp, 80 # 8-byte Folded Reload ldx.d $a0, $a0, $s2 fldx.d $fa0, $s7, $a0 - ld.d $a0, $sp, 64 # 8-byte Folded Reload + ld.d $a0, $sp, 72 # 8-byte Folded Reload ldx.d $a0, $a0, $s2 vld $vr1, $s7, 56 fst.d $fa0, $s0, 8 fldx.d $fa0, $s7, $a0 - ld.d $a0, $sp, 56 # 8-byte Folded Reload + ld.d $a0, $sp, 64 # 8-byte Folded Reload ldx.d $a0, $a0, $s2 vst $vr1, $s0, 56 st.d $zero, $s0, 0 fst.d $fa0, $s0, 16 fldx.d $fa0, $s7, $a0 - ld.d $a0, $sp, 48 # 8-byte Folded Reload + ld.d $a0, $sp, 56 # 8-byte Folded Reload ldx.d $a0, $a0, $s2 addi.d $s1, $s0, 56 addi.d $s5, $s0, 60 @@ -733,12 +725,12 @@ genorient: # @genorient andi $a2, $fp, 5 addi.d $a0, $a2, -4 sltui $a0, $a0, 1 - ld.d $a1, $sp, 88 # 8-byte Folded Reload + ld.d $a1, $sp, 96 # 8-byte Folded Reload and $a0, $a1, $a0 andi $a1, $fp, 6 bnez $a0, .LBB0_95 # %bb.93: # in Loop: Header=BB0_90 Depth=2 - ld.d $a3, $sp, 128 # 8-byte Folded Reload + ld.d $a3, $sp, 136 # 8-byte Folded Reload beqz $a3, .LBB0_96 # %bb.94: # in Loop: Header=BB0_90 Depth=2 ori $a3, $zero, 2 @@ -755,7 +747,7 @@ genorient: # @genorient .LBB0_96: # in Loop: Header=BB0_90 Depth=2 addi.d $a2, $a2, -1 sltui $a2, $a2, 1 - ld.d $a3, $sp, 88 # 8-byte Folded Reload + ld.d $a3, $sp, 96 # 8-byte Folded Reload and $s6, $a3, $a2 beqz $s6, .LBB0_98 .LBB0_97: # in Loop: Header=BB0_90 Depth=2 @@ -769,7 +761,7 @@ genorient: # @genorient st.w $a3, $s0, 84 b .LBB0_101 .LBB0_98: # in Loop: Header=BB0_90 Depth=2 - ld.d $a2, $sp, 128 # 8-byte Folded Reload + ld.d $a2, $sp, 136 # 8-byte Folded Reload beqz $a2, .LBB0_101 # %bb.99: # in Loop: Header=BB0_90 Depth=2 ori $a2, $zero, 7 @@ -785,12 +777,15 @@ genorient: # @genorient # %bb.102: # %.lr.ph960 # in Loop: Header=BB0_90 Depth=2 move $s5, $zero - ld.d $a2, $sp, 128 # 8-byte Folded Reload + ld.d $a2, $sp, 136 # 8-byte Folded Reload sltu $a2, $zero, $a2 addi.d $a1, $a1, -2 sltui $a1, $a1, 1 and $a1, $a2, $a1 or $s8, $a0, $a1 + ld.d $a0, $sp, 80 # 8-byte Folded Reload + add.d $a0, $a0, $s2 + st.d $a0, $sp, 128 # 8-byte Folded Spill ld.d $a0, $sp, 72 # 8-byte Folded Reload add.d $a0, $a0, $s2 st.d $a0, $sp, 120 # 8-byte Folded Spill @@ -800,9 +795,6 @@ genorient: # @genorient ld.d $a0, $sp, 56 # 8-byte Folded Reload add.d $a0, $a0, $s2 st.d $a0, $sp, 104 # 8-byte Folded Spill - ld.d $a0, $sp, 48 # 8-byte Folded Reload - add.d $a0, $a0, $s2 - st.d $a0, $sp, 96 # 8-byte Folded Spill b .LBB0_105 .p2align 4, , 16 .LBB0_103: # in Loop: Header=BB0_105 Depth=3 @@ -837,18 +829,18 @@ genorient: # @genorient bltu $a0, $fp, .LBB0_107 # %bb.106: # %switch.lookup1106 # in Loop: Header=BB0_105 Depth=3 - ld.d $a0, $sp, 120 # 8-byte Folded Reload + ld.d $a0, $sp, 128 # 8-byte Folded Reload ld.d $a0, $a0, 0 fldx.d $fa0, $s7, $a0 - ld.d $a0, $sp, 112 # 8-byte Folded Reload + ld.d $a0, $sp, 120 # 8-byte Folded Reload ld.d $a0, $a0, 0 fst.d $fa0, $s0, 8 fldx.d $fa0, $s7, $a0 - ld.d $a0, $sp, 104 # 8-byte Folded Reload + ld.d $a0, $sp, 112 # 8-byte Folded Reload ld.d $a0, $a0, 0 fst.d $fa0, $s0, 16 fldx.d $fa0, $s7, $a0 - ld.d $a0, $sp, 96 # 8-byte Folded Reload + ld.d $a0, $sp, 104 # 8-byte Folded Reload ld.d $a0, $a0, 0 fst.d $fa0, $s0, 24 fldx.d $fa0, $s7, $a0 @@ -882,7 +874,7 @@ genorient: # @genorient .LBB0_109: # in Loop: Header=BB0_105 Depth=3 bnez $s6, .LBB0_103 # %bb.110: # in Loop: Header=BB0_105 Depth=3 - ld.d $a0, $sp, 128 # 8-byte Folded Reload + ld.d $a0, $sp, 136 # 8-byte Folded Reload beqz $a0, .LBB0_104 # %bb.111: # in Loop: Header=BB0_105 Depth=3 ori $a0, $zero, 7 @@ -899,11 +891,11 @@ genorient: # @genorient .p2align 4, , 16 .LBB0_115: # %.loopexit930.loopexit # in Loop: Header=BB0_117 Depth=1 - ld.d $a0, $sp, 24 # 8-byte Folded Reload + ld.d $a0, $sp, 32 # 8-byte Folded Reload ld.w $a0, $a0, 0 - ld.d $a1, $sp, 16 # 8-byte Folded Reload + ld.d $a1, $sp, 24 # 8-byte Folded Reload ld.w $a1, $a1, 0 - ld.d $a4, $sp, 112 # 8-byte Folded Reload + ld.d $a4, $sp, 120 # 8-byte Folded Reload .LBB0_116: # %.loopexit930 # in Loop: Header=BB0_117 Depth=1 add.w $a3, $a1, $a0 @@ -913,7 +905,7 @@ genorient: # @genorient # =>This Loop Header: Depth=1 # Child Loop BB0_120 Depth 2 # Child Loop BB0_135 Depth 3 - ld.d $a3, $sp, 32 # 8-byte Folded Reload + ld.d $a3, $sp, 40 # 8-byte Folded Reload ld.d $a3, $a3, 0 move $a4, $a2 slli.d $a2, $a2, 3 @@ -921,18 +913,18 @@ genorient: # @genorient ld.w $a2, $s8, 68 beqz $a2, .LBB0_116 # %bb.118: # in Loop: Header=BB0_117 Depth=1 - st.d $a4, $sp, 112 # 8-byte Folded Spill + st.d $a4, $sp, 120 # 8-byte Folded Spill ld.d $a0, $s8, 152 ld.w $a1, $a0, 68 ld.w $a2, $a0, 64 ld.w $a3, $a0, 60 ld.w $a0, $a0, 56 addi.d $a4, $s8, 152 - st.d $a4, $sp, 120 # 8-byte Folded Spill + st.d $a4, $sp, 128 # 8-byte Folded Spill sub.d $a1, $a1, $a2 sub.d $a0, $a3, $a0 addi.d $a2, $s8, 20 - st.d $a2, $sp, 128 # 8-byte Folded Spill + st.d $a2, $sp, 136 # 8-byte Folded Spill andi $a1, $a1, 1 st.d $a1, $sp, 144 # 8-byte Folded Spill andi $s3, $a0, 1 @@ -951,11 +943,11 @@ genorient: # @genorient beq $fp, $a0, .LBB0_119 # %bb.121: # in Loop: Header=BB0_120 Depth=2 slli.d $a0, $fp, 2 - ld.d $a1, $sp, 128 # 8-byte Folded Reload + ld.d $a1, $sp, 136 # 8-byte Folded Reload ldx.w $a0, $a1, $a0 beqz $a0, .LBB0_119 # %bb.122: # in Loop: Header=BB0_120 Depth=2 - ld.d $s0, $sp, 120 # 8-byte Folded Reload + ld.d $s0, $sp, 128 # 8-byte Folded Reload ld.d $a0, $s0, 0 ld.d $s4, $a0, 88 ori $a0, $zero, 32 @@ -1094,11 +1086,11 @@ genorient: # @genorient .p2align 4, , 16 .LBB0_143: # %.loopexit928.loopexit # in Loop: Header=BB0_145 Depth=1 - ld.d $a0, $sp, 24 # 8-byte Folded Reload + ld.d $a0, $sp, 32 # 8-byte Folded Reload ld.w $a0, $a0, 0 - ld.d $a1, $sp, 16 # 8-byte Folded Reload + ld.d $a1, $sp, 24 # 8-byte Folded Reload ld.w $a1, $a1, 0 - ld.d $a4, $sp, 104 # 8-byte Folded Reload + ld.d $a4, $sp, 112 # 8-byte Folded Reload .LBB0_144: # %.loopexit928 # in Loop: Header=BB0_145 Depth=1 add.w $a3, $a1, $a0 @@ -1108,7 +1100,7 @@ genorient: # @genorient # =>This Loop Header: Depth=1 # Child Loop BB0_149 Depth 2 # Child Loop BB0_155 Depth 3 - ld.d $a3, $sp, 32 # 8-byte Folded Reload + ld.d $a3, $sp, 40 # 8-byte Folded Reload ld.d $a3, $a3, 0 move $a4, $a2 slli.d $a2, $a2, 3 @@ -1119,24 +1111,24 @@ genorient: # @genorient ld.w $s1, $s6, 128 beqz $s1, .LBB0_144 # %bb.147: # in Loop: Header=BB0_145 Depth=1 - st.d $a4, $sp, 104 # 8-byte Folded Spill + st.d $a4, $sp, 112 # 8-byte Folded Spill ld.d $a0, $s6, 152 ld.w $a1, $a0, 68 ld.w $a2, $a0, 64 ld.w $a3, $a0, 60 ld.w $a4, $a0, 56 addi.d $a5, $s6, 152 - st.d $a5, $sp, 128 # 8-byte Folded Spill + st.d $a5, $sp, 136 # 8-byte Folded Spill sub.d $a1, $a1, $a2 sub.d $a2, $a3, $a4 ld.d $a0, $a0, 96 addi.d $a3, $s6, 20 st.d $a3, $sp, 144 # 8-byte Folded Spill andi $a1, $a1, 1 - st.d $a1, $sp, 120 # 8-byte Folded Spill + st.d $a1, $sp, 128 # 8-byte Folded Spill andi $s7, $a2, 1 addi.d $a0, $a0, 20 - st.d $a0, $sp, 112 # 8-byte Folded Spill + st.d $a0, $sp, 120 # 8-byte Folded Spill ori $fp, $zero, 1 b .LBB0_149 .p2align 4, , 16 @@ -1161,7 +1153,7 @@ genorient: # @genorient pcaddu18i $ra, %call36(malloc) jirl $ra, $ra, 0 slli.d $a1, $fp, 3 - ld.d $a2, $sp, 128 # 8-byte Folded Reload + ld.d $a2, $sp, 136 # 8-byte Folded Reload ldx.d $a1, $a2, $a1 st.d $a0, $a1, 96 blez $s1, .LBB0_148 @@ -1176,14 +1168,14 @@ genorient: # @genorient and $a1, $a1, $a3 addi.d $a3, $a2, -4 sltui $a3, $a3, 1 - ld.d $a4, $sp, 120 # 8-byte Folded Reload + ld.d $a4, $sp, 128 # 8-byte Folded Reload and $a3, $a4, $a3 addi.d $a2, $a2, -1 sltui $a2, $a2, 1 and $s4, $a4, $a2 or $s5, $a3, $a1 addi.d $s0, $a0, 16 - ld.d $s8, $sp, 112 # 8-byte Folded Reload + ld.d $s8, $sp, 120 # 8-byte Folded Reload b .LBB0_155 .p2align 4, , 16 .LBB0_153: # in Loop: Header=BB0_155 Depth=3 @@ -1273,18 +1265,18 @@ genorient: # @genorient bnez $s3, .LBB0_165 b .LBB0_163 .LBB0_166: # %._crit_edge997 - ld.d $s8, $sp, 152 # 8-byte Folded Reload - ld.d $s7, $sp, 160 # 8-byte Folded Reload - ld.d $s6, $sp, 168 # 8-byte Folded Reload - ld.d $s5, $sp, 176 # 8-byte Folded Reload - ld.d $s4, $sp, 184 # 8-byte Folded Reload - ld.d $s3, $sp, 192 # 8-byte Folded Reload - ld.d $s2, $sp, 200 # 8-byte Folded Reload - ld.d $s1, $sp, 208 # 8-byte Folded Reload - ld.d $s0, $sp, 216 # 8-byte Folded Reload - ld.d $fp, $sp, 224 # 8-byte Folded Reload - ld.d $ra, $sp, 232 # 8-byte Folded Reload - addi.d $sp, $sp, 240 + ld.d $s8, $sp, 168 # 8-byte Folded Reload + ld.d $s7, $sp, 176 # 8-byte Folded Reload + ld.d $s6, $sp, 184 # 8-byte Folded Reload + ld.d $s5, $sp, 192 # 8-byte Folded Reload + ld.d $s4, $sp, 200 # 8-byte Folded Reload + ld.d $s3, $sp, 208 # 8-byte Folded Reload + ld.d $s2, $sp, 216 # 8-byte Folded Reload + ld.d $s1, $sp, 224 # 8-byte Folded Reload + ld.d $s0, $sp, 232 # 8-byte Folded Reload + ld.d $fp, $sp, 240 # 8-byte Folded Reload + ld.d $ra, $sp, 248 # 8-byte Folded Reload + addi.d $sp, $sp, 256 ret .Lfunc_end0: .size genorient, .Lfunc_end0-genorient diff --git a/results/MultiSource/Benchmarks/Prolangs-C/TimberWolfMC/CMakeFiles/timberwolfmc.dir/savewolf.s b/results/MultiSource/Benchmarks/Prolangs-C/TimberWolfMC/CMakeFiles/timberwolfmc.dir/savewolf.s index b95ae7b9..a234ab0b 100644 --- a/results/MultiSource/Benchmarks/Prolangs-C/TimberWolfMC/CMakeFiles/timberwolfmc.dir/savewolf.s +++ b/results/MultiSource/Benchmarks/Prolangs-C/TimberWolfMC/CMakeFiles/timberwolfmc.dir/savewolf.s @@ -588,9 +588,9 @@ TW_oldinput: # @TW_oldinput sub.d $s4, $a2, $a1 ld.d $a1, $sp, 48 # 8-byte Folded Reload ld.d $s8, $a1, 0 - vpickve2gr.w $a1, $vr0, 0 - vpickve2gr.w $a2, $vr0, 1 - sub.d $s0, $a2, $a1 + vreplvei.w $vr1, $vr0, 1 + vsub.w $vr1, $vr1, $vr0 + vpickve2gr.w $s0, $vr1, 0 addi.d $s3, $s8, 56 addi.d $s2, $s8, 60 addi.d $s1, $s8, 64 diff --git a/results/MultiSource/Benchmarks/Prolangs-C/TimberWolfMC/CMakeFiles/timberwolfmc.dir/uaspect.s b/results/MultiSource/Benchmarks/Prolangs-C/TimberWolfMC/CMakeFiles/timberwolfmc.dir/uaspect.s index 84b0d0b1..715e0795 100644 --- a/results/MultiSource/Benchmarks/Prolangs-C/TimberWolfMC/CMakeFiles/timberwolfmc.dir/uaspect.s +++ b/results/MultiSource/Benchmarks/Prolangs-C/TimberWolfMC/CMakeFiles/timberwolfmc.dir/uaspect.s @@ -526,15 +526,15 @@ uaspect: # @uaspect vld $vr0, $s1, 56 vpickve2gr.w $a0, $vr0, 2 vpickve2gr.w $a1, $vr0, 3 - sub.w $a4, $a1, $a0 - vpickve2gr.w $a1, $vr0, 0 - vpickve2gr.w $a2, $vr0, 1 + sub.w $a3, $a1, $a0 + vreplvei.w $vr1, $vr0, 1 + vsub.w $vr1, $vr1, $vr0 addi.d $a0, $s0, -2 - ori $a3, $zero, 5 - sub.d $a1, $a2, $a1 - st.d $a1, $sp, 64 # 8-byte Folded Spill - st.d $a4, $sp, 40 # 8-byte Folded Spill - bltu $a3, $a0, .LBB0_33 + ori $a1, $zero, 5 + vpickve2gr.w $a2, $vr1, 0 + st.d $a2, $sp, 64 # 8-byte Folded Spill + st.d $a3, $sp, 40 # 8-byte Folded Spill + bltu $a1, $a0, .LBB0_33 # %bb.25: slli.d $a0, $a0, 2 pcalau12i $a1, %pc_hi20(.LJTI0_0) diff --git a/results/MultiSource/Benchmarks/Trimaran/enc-3des/CMakeFiles/enc-3des.dir/des.s b/results/MultiSource/Benchmarks/Trimaran/enc-3des/CMakeFiles/enc-3des.dir/des.s index dc3a63cc..8c622b92 100644 --- a/results/MultiSource/Benchmarks/Trimaran/enc-3des/CMakeFiles/enc-3des.dir/des.s +++ b/results/MultiSource/Benchmarks/Trimaran/enc-3des/CMakeFiles/enc-3des.dir/des.s @@ -2,108 +2,111 @@ .section .rodata.cst16,"aM",@progbits,16 .p2align 4, 0x0 # -- Begin function des_main_ks .LCPI0_0: + .dword 1 # 0x1 + .dword 2 # 0x2 +.LCPI0_1: + .dword 10 # 0xa + .dword 6 # 0x6 +.LCPI0_2: .dword 18 # 0x12 .dword 10 # 0xa -.LCPI0_1: +.LCPI0_3: .dword 14 # 0xe .dword 15 # 0xf -.LCPI0_2: +.LCPI0_4: .dword 134217728 # 0x8000000 .dword 536870912 # 0x20000000 -.LCPI0_3: +.LCPI0_5: .dword 4 # 0x4 .dword 17 # 0x11 -.LCPI0_4: +.LCPI0_6: .dword 603979776 # 0x24000000 .dword 268435456 # 0x10000000 -.LCPI0_5: +.LCPI0_7: .dword 10 # 0xa .dword 16 # 0x10 -.LCPI0_6: +.LCPI0_8: .dword 34078720 # 0x2080000 .dword 134217728 # 0x8000000 -.LCPI0_7: +.LCPI0_9: .dword 6 # 0x6 .dword 22 # 0x16 -.LCPI0_8: +.LCPI0_10: .dword 16777216 # 0x1000000 .dword 67108864 # 0x4000000 -.LCPI0_9: +.LCPI0_11: .dword 1048576 # 0x100000 .dword 33554432 # 0x2000000 -.LCPI0_10: +.LCPI0_12: .dword 9 # 0x9 .dword 1 # 0x1 -.LCPI0_11: +.LCPI0_13: .dword 2097152 # 0x200000 .dword 16777216 # 0x1000000 -.LCPI0_12: +.LCPI0_14: .dword 262144 # 0x40000 .dword 2097152 # 0x200000 -.LCPI0_13: +.LCPI0_15: .dword 2 # 0x2 .dword 11 # 0xb -.LCPI0_14: +.LCPI0_16: .dword 131072 # 0x20000 .dword 1048576 # 0x100000 -.LCPI0_15: +.LCPI0_17: .dword 65536 # 0x10000 .dword 262144 # 0x40000 -.LCPI0_16: +.LCPI0_18: .dword 256 # 0x100 .dword 65536 # 0x10000 -.LCPI0_17: +.LCPI0_19: .dword 268435456 # 0x10000000 .dword 512 # 0x200 -.LCPI0_18: +.LCPI0_20: .dword 14 # 0xe .dword 3 # 0x3 -.LCPI0_19: +.LCPI0_21: .dword 24 # 0x18 .dword 14 # 0xe -.LCPI0_20: +.LCPI0_22: .dword 4096 # 0x1000 .dword 131072 # 0x20000 -.LCPI0_21: - .dword 1 # 0x1 - .dword 2 # 0x2 -.LCPI0_22: +.LCPI0_23: .dword 1024 # 0x400 .dword 8192 # 0x2000 -.LCPI0_23: +.LCPI0_24: .dword 2 # 0x2 .dword 4096 # 0x1000 -.LCPI0_24: +.LCPI0_25: .dword 1 # 0x1 .dword 2056 # 0x808 -.LCPI0_25: +.LCPI0_26: .dword 5 # 0x5 .dword 9 # 0x9 -.LCPI0_26: +.LCPI0_27: .dword 32 # 0x20 .dword 1024 # 0x400 -.LCPI0_27: +.LCPI0_28: .dword 6 # 0x6 .dword 7 # 0x7 -.LCPI0_28: +.LCPI0_29: .dword 2048 # 0x800 .dword 256 # 0x100 -.LCPI0_29: +.LCPI0_30: .dword 10 # 0xa .dword 7 # 0x7 -.LCPI0_30: +.LCPI0_31: .dword 16 # 0x10 .dword 32 # 0x20 -.LCPI0_31: +.LCPI0_32: .dword 512 # 0x200 .dword 17 # 0x11 -.LCPI0_32: +.LCPI0_33: .dword 8 # 0x8 .dword 4 # 0x4 -.LCPI0_33: +.LCPI0_34: .dword 18 # 0x12 .dword 21 # 0x15 -.LCPI0_34: +.LCPI0_35: .dword 4 # 0x4 .dword 2 # 0x2 .text @@ -113,15 +116,14 @@ des_main_ks: # @des_main_ks # %bb.0: addi.d $sp, $sp, -256 - st.d $fp, $sp, 248 # 8-byte Folded Spill - fst.d $fs0, $sp, 240 # 8-byte Folded Spill - fst.d $fs1, $sp, 232 # 8-byte Folded Spill - fst.d $fs2, $sp, 224 # 8-byte Folded Spill - fst.d $fs3, $sp, 216 # 8-byte Folded Spill - fst.d $fs4, $sp, 208 # 8-byte Folded Spill - fst.d $fs5, $sp, 200 # 8-byte Folded Spill - fst.d $fs6, $sp, 192 # 8-byte Folded Spill - fst.d $fs7, $sp, 184 # 8-byte Folded Spill + fst.d $fs0, $sp, 248 # 8-byte Folded Spill + fst.d $fs1, $sp, 240 # 8-byte Folded Spill + fst.d $fs2, $sp, 232 # 8-byte Folded Spill + fst.d $fs3, $sp, 224 # 8-byte Folded Spill + fst.d $fs4, $sp, 216 # 8-byte Folded Spill + fst.d $fs5, $sp, 208 # 8-byte Folded Spill + fst.d $fs6, $sp, 200 # 8-byte Folded Spill + fst.d $fs7, $sp, 192 # 8-byte Folded Spill ld.bu $a3, $a1, 0 ld.bu $a2, $a1, 1 ld.bu $a4, $a1, 2 @@ -232,45 +234,46 @@ des_main_ks: # @des_main_ks or $a2, $a2, $a5 slli.d $a4, $a4, 4 or $a2, $a2, $a4 - bstrpick.d $a5, $a3, 27, 0 - bstrpick.d $a4, $a2, 27, 0 + bstrpick.d $a4, $a3, 27, 0 + bstrpick.d $a3, $a2, 27, 0 ori $a2, $zero, 15 - lu12i.w $a3, 128 - pcalau12i $a6, %pc_hi20(.LCPI0_0) - vld $vr0, $a6, %pc_lo12(.LCPI0_0) - vst $vr0, $sp, 160 # 16-byte Folded Spill + pcalau12i $a5, %pc_hi20(.LCPI0_0) + vld $vr0, $a5, %pc_lo12(.LCPI0_0) + lu12i.w $a5, 128 + pcalau12i $a6, %pc_hi20(.LCPI0_1) + vld $vr1, $a6, %pc_lo12(.LCPI0_1) + vst $vr1, $sp, 176 # 16-byte Folded Spill + pcalau12i $a6, %pc_hi20(.LCPI0_2) + vld $vr1, $a6, %pc_lo12(.LCPI0_2) + vst $vr1, $sp, 160 # 16-byte Folded Spill lu12i.w $a6, 2 - pcalau12i $a7, %pc_hi20(.LCPI0_1) - vld $vr0, $a7, %pc_lo12(.LCPI0_1) - vst $vr0, $sp, 144 # 16-byte Folded Spill - pcalau12i $a7, %pc_hi20(.LCPI0_2) - vld $vr0, $a7, %pc_lo12(.LCPI0_2) - vst $vr0, $sp, 128 # 16-byte Folded Spill pcalau12i $a7, %pc_hi20(.LCPI0_3) - vld $vr0, $a7, %pc_lo12(.LCPI0_3) - vst $vr0, $sp, 112 # 16-byte Folded Spill + vld $vr1, $a7, %pc_lo12(.LCPI0_3) + vst $vr1, $sp, 144 # 16-byte Folded Spill pcalau12i $a7, %pc_hi20(.LCPI0_4) - vld $vr0, $a7, %pc_lo12(.LCPI0_4) - vst $vr0, $sp, 96 # 16-byte Folded Spill + vld $vr1, $a7, %pc_lo12(.LCPI0_4) + vst $vr1, $sp, 128 # 16-byte Folded Spill pcalau12i $a7, %pc_hi20(.LCPI0_5) - vld $vr0, $a7, %pc_lo12(.LCPI0_5) - vst $vr0, $sp, 80 # 16-byte Folded Spill + vld $vr1, $a7, %pc_lo12(.LCPI0_5) + vst $vr1, $sp, 112 # 16-byte Folded Spill pcalau12i $a7, %pc_hi20(.LCPI0_6) - vld $vr0, $a7, %pc_lo12(.LCPI0_6) - vst $vr0, $sp, 64 # 16-byte Folded Spill + vld $vr1, $a7, %pc_lo12(.LCPI0_6) + vst $vr1, $sp, 96 # 16-byte Folded Spill pcalau12i $a7, %pc_hi20(.LCPI0_7) - vld $vr0, $a7, %pc_lo12(.LCPI0_7) - vst $vr0, $sp, 48 # 16-byte Folded Spill + vld $vr1, $a7, %pc_lo12(.LCPI0_7) + vst $vr1, $sp, 80 # 16-byte Folded Spill pcalau12i $a7, %pc_hi20(.LCPI0_8) - vld $vr0, $a7, %pc_lo12(.LCPI0_8) - vst $vr0, $sp, 32 # 16-byte Folded Spill + vld $vr1, $a7, %pc_lo12(.LCPI0_8) + vst $vr1, $sp, 64 # 16-byte Folded Spill pcalau12i $a7, %pc_hi20(.LCPI0_9) - vld $vr0, $a7, %pc_lo12(.LCPI0_9) - vst $vr0, $sp, 16 # 16-byte Folded Spill + vld $vr1, $a7, %pc_lo12(.LCPI0_9) + vst $vr1, $sp, 48 # 16-byte Folded Spill pcalau12i $a7, %pc_hi20(.LCPI0_10) - vld $vr10, $a7, %pc_lo12(.LCPI0_10) + vld $vr1, $a7, %pc_lo12(.LCPI0_10) + vst $vr1, $sp, 32 # 16-byte Folded Spill pcalau12i $a7, %pc_hi20(.LCPI0_11) - vld $vr11, $a7, %pc_lo12(.LCPI0_11) + vld $vr1, $a7, %pc_lo12(.LCPI0_11) + vst $vr1, $sp, 16 # 16-byte Folded Spill pcalau12i $a7, %pc_hi20(.LCPI0_12) vld $vr12, $a7, %pc_lo12(.LCPI0_12) pcalau12i $a7, %pc_hi20(.LCPI0_13) @@ -314,135 +317,133 @@ des_main_ks: # @des_main_ks pcalau12i $t1, %pc_hi20(.LCPI0_32) pcalau12i $t2, %pc_hi20(.LCPI0_33) pcalau12i $t3, %pc_hi20(.LCPI0_34) + pcalau12i $t4, %pc_hi20(.LCPI0_35) ori $a7, $zero, 16 ori $t0, $zero, 1 - vld $vr0, $t1, %pc_lo12(.LCPI0_32) - vld $vr1, $t2, %pc_lo12(.LCPI0_33) - vld $vr2, $t3, %pc_lo12(.LCPI0_34) + vld $vr1, $t1, %pc_lo12(.LCPI0_32) + vld $vr2, $t2, %pc_lo12(.LCPI0_33) + vld $vr3, $t3, %pc_lo12(.LCPI0_34) + vld $vr4, $t4, %pc_lo12(.LCPI0_35) lu12i.w $t1, 8 ori $t1, $t1, 259 - vrepli.d $vr5, 28 + vrepli.d $vr9, 28 b .LBB0_3 .LBB0_1: # in Loop: Header=BB0_3 Depth=1 ori $t2, $zero, 26 ori $t3, $zero, 2 .p2align 4, , 16 .LBB0_2: # in Loop: Header=BB0_3 Depth=1 - sll.d $t4, $a5, $t3 - srl.d $t5, $a5, $t2 - bstrpick.d $a5, $t4, 27, 1 - slli.d $a5, $a5, 1 - or $a5, $a5, $t5 - srli.d $t6, $t4, 1 - srli.d $t7, $t4, 10 - srli.d $t8, $t4, 2 - srli.d $fp, $t4, 6 + sll.d $t4, $a4, $t3 + srl.d $t5, $a4, $t2 + bstrpick.d $a4, $t4, 27, 1 + slli.d $a4, $a4, 1 + or $a4, $a4, $t5 + vreplgr2vr.d $vr6, $t4 + vsrl.d $vr7, $vr6, $vr0 + vld $vr5, $sp, 176 # 16-byte Folded Reload + vsrl.d $vr6, $vr6, $vr5 srli.d $t4, $t4, 4 - vinsgr2vr.d $vr4, $t5, 0 - vinsgr2vr.d $vr4, $a4, 1 - vori.b $vr3, $vr5, 0 - vinsgr2vr.d $vr3, $t3, 1 - vsll.d $vr3, $vr4, $vr3 - vreplgr2vr.d $vr6, $a5 - vinsgr2vr.d $vr4, $t6, 0 - vinsgr2vr.d $vr4, $t8, 1 - vld $vr7, $sp, 16 # 16-byte Folded Reload - vand.v $vr7, $vr4, $vr7 - vinsgr2vr.d $vr4, $t7, 0 - vinsgr2vr.d $vr4, $fp, 1 - vand.v $vr8, $vr4, $vr15 - vreplvei.d $vr4, $vr3, 1 - vor.v $vr7, $vr8, $vr7 - vori.b $vr8, $vr4, 0 - vinsgr2vr.d $vr8, $t4, 1 - vand.v $vr8, $vr8, $vr16 - vor.v $vr7, $vr7, $vr8 - vld $vr8, $sp, 144 # 16-byte Folded Reload - vsll.d $vr8, $vr6, $vr8 - vand.v $vr3, $vr3, $vr17 - vor.v $vr3, $vr7, $vr3 + vinsgr2vr.d $vr8, $t5, 0 + vinsgr2vr.d $vr8, $a3, 1 + vori.b $vr5, $vr9, 0 + vinsgr2vr.d $vr5, $t3, 1 + vsll.d $vr5, $vr8, $vr5 + vreplgr2vr.d $vr8, $a4 + vld $vr10, $sp, 16 # 16-byte Folded Reload + vand.v $vr7, $vr7, $vr10 + vand.v $vr10, $vr6, $vr17 + vreplvei.d $vr6, $vr5, 1 + vor.v $vr7, $vr10, $vr7 + vori.b $vr10, $vr6, 0 + vinsgr2vr.d $vr10, $t4, 1 + vand.v $vr10, $vr10, $vr18 + vor.v $vr7, $vr7, $vr10 + vld $vr10, $sp, 144 # 16-byte Folded Reload + vsll.d $vr10, $vr8, $vr10 + vand.v $vr5, $vr5, $vr19 + vor.v $vr5, $vr7, $vr5 vld $vr7, $sp, 112 # 16-byte Folded Reload - vsll.d $vr7, $vr6, $vr7 - vld $vr9, $sp, 96 # 16-byte Folded Reload - vand.v $vr7, $vr7, $vr9 - vor.v $vr3, $vr3, $vr7 + vsll.d $vr7, $vr8, $vr7 + vld $vr11, $sp, 96 # 16-byte Folded Reload + vand.v $vr7, $vr7, $vr11 + vor.v $vr5, $vr5, $vr7 vld $vr7, $sp, 128 # 16-byte Folded Reload - vand.v $vr7, $vr8, $vr7 - vor.v $vr3, $vr3, $vr7 + vand.v $vr7, $vr10, $vr7 + vor.v $vr5, $vr5, $vr7 vld $vr7, $sp, 160 # 16-byte Folded Reload - vsll.d $vr7, $vr6, $vr7 - vld $vr9, $sp, 64 # 16-byte Folded Reload - vand.v $vr7, $vr7, $vr9 - vor.v $vr3, $vr3, $vr7 + vsll.d $vr7, $vr8, $vr7 + vld $vr11, $sp, 64 # 16-byte Folded Reload + vand.v $vr7, $vr7, $vr11 + vor.v $vr5, $vr5, $vr7 vld $vr7, $sp, 48 # 16-byte Folded Reload - vsll.d $vr7, $vr6, $vr7 - vld $vr9, $sp, 32 # 16-byte Folded Reload - vand.v $vr7, $vr7, $vr9 - vor.v $vr3, $vr3, $vr7 - vsll.d $vr7, $vr6, $vr10 + vsll.d $vr7, $vr8, $vr7 + vld $vr11, $sp, 32 # 16-byte Folded Reload vand.v $vr7, $vr7, $vr11 - vor.v $vr3, $vr3, $vr7 + vor.v $vr5, $vr5, $vr7 + vsll.d $vr7, $vr8, $vr12 + vand.v $vr7, $vr7, $vr13 + vor.v $vr5, $vr5, $vr7 vld $vr7, $sp, 80 # 16-byte Folded Reload - vsll.d $vr7, $vr6, $vr7 - vand.v $vr7, $vr7, $vr12 - vor.v $vr3, $vr3, $vr7 - slli.d $t4, $a5, 3 - and $t4, $t4, $a3 - sll.d $t3, $a4, $t3 + vsll.d $vr7, $vr8, $vr7 + vand.v $vr7, $vr7, $vr14 + vor.v $vr5, $vr5, $vr7 + slli.d $t4, $a4, 3 + and $t4, $t4, $a5 + sll.d $t3, $a3, $t3 srli.d $t5, $t3, 13 and $t5, $t5, $a6 - vsll.d $vr6, $vr6, $vr13 - vand.v $vr6, $vr6, $vr14 - vor.v $vr3, $vr3, $vr6 - vinsgr2vr.d $vr6, $t5, 0 - vinsgr2vr.d $vr6, $t4, 1 - vor.v $vr3, $vr3, $vr6 + vsll.d $vr7, $vr8, $vr15 + vand.v $vr7, $vr7, $vr16 + vor.v $vr5, $vr5, $vr7 + vinsgr2vr.d $vr7, $t5, 0 + vinsgr2vr.d $vr7, $t4, 1 + vor.v $vr5, $vr5, $vr7 srli.d $t4, $t3, 4 - vinsgr2vr.d $vr8, $t4, 0 - vand.v $vr6, $vr8, $vr20 - vor.v $vr3, $vr3, $vr6 - vsrl.d $vr6, $vr4, $vr21 - vand.v $vr6, $vr6, $vr22 - vor.v $vr3, $vr3, $vr6 - vsrl.d $vr6, $vr4, $vr25 - vand.v $vr6, $vr6, $vr26 - vor.v $vr3, $vr3, $vr6 - vsrl.d $vr6, $vr4, $vr29 - vand.v $vr6, $vr6, $vr30 - vor.v $vr3, $vr3, $vr6 - vori.b $vr6, $vr4, 0 - vsrl.d $vr4, $vr4, $vr1 - srl.d $t2, $a4, $t2 - bstrpick.d $a4, $t3, 27, 1 - slli.d $a4, $a4, 1 - or $a4, $a4, $t2 + vinsgr2vr.d $vr10, $t4, 0 + vand.v $vr7, $vr10, $vr22 + vor.v $vr5, $vr5, $vr7 + vsrl.d $vr7, $vr6, $vr0 + vand.v $vr7, $vr7, $vr23 + vor.v $vr5, $vr5, $vr7 + vsrl.d $vr7, $vr6, $vr26 + vand.v $vr7, $vr7, $vr27 + vor.v $vr5, $vr5, $vr7 + vsrl.d $vr7, $vr6, $vr30 + vand.v $vr7, $vr7, $vr31 + vor.v $vr5, $vr5, $vr7 + vori.b $vr7, $vr6, 0 + vsrl.d $vr6, $vr6, $vr3 + srl.d $t2, $a3, $t2 + bstrpick.d $a3, $t3, 27, 1 + slli.d $a3, $a3, 1 + or $a3, $a3, $t2 srli.d $t3, $t3, 26 - slli.d $t4, $a4, 8 - vand.v $vr4, $vr4, $vr2 - vor.v $vr3, $vr3, $vr4 - vinsgr2vr.d $vr4, $t3, 0 - vinsgr2vr.d $vr4, $t4, 1 - vand.v $vr4, $vr4, $vr23 - vor.v $vr3, $vr3, $vr4 - vreplgr2vr.d $vr4, $a4 - vinsgr2vr.d $vr6, $a4, 1 - vsrl.d $vr6, $vr6, $vr19 + slli.d $t4, $a3, 8 + vand.v $vr6, $vr6, $vr4 + vor.v $vr5, $vr5, $vr6 + vinsgr2vr.d $vr6, $t3, 0 + vinsgr2vr.d $vr6, $t4, 1 vand.v $vr6, $vr6, $vr24 - vor.v $vr3, $vr3, $vr6 - vsrl.d $vr6, $vr4, $vr18 - vsll.d $vr4, $vr4, $vr27 - vand.v $vr4, $vr4, $vr28 - vor.v $vr3, $vr3, $vr4 - vand.v $vr4, $vr6, $vr31 - vor.v $vr3, $vr3, $vr4 + vor.v $vr5, $vr5, $vr6 + vreplgr2vr.d $vr6, $a3 + vinsgr2vr.d $vr7, $a3, 1 + vsrl.d $vr7, $vr7, $vr21 + vand.v $vr7, $vr7, $vr25 + vor.v $vr5, $vr5, $vr7 + vsrl.d $vr7, $vr6, $vr20 + vsll.d $vr6, $vr6, $vr28 + vand.v $vr6, $vr6, $vr29 + vor.v $vr5, $vr5, $vr6 + vand.v $vr6, $vr7, $vr1 + vor.v $vr5, $vr5, $vr6 slli.d $t2, $t2, 2 - vreplvei.d $vr4, $vr6, 1 - vinsgr2vr.d $vr4, $t2, 1 - vand.v $vr4, $vr4, $vr0 - vor.v $vr3, $vr3, $vr4 + vreplvei.d $vr6, $vr7, 1 + vinsgr2vr.d $vr6, $t2, 1 + vand.v $vr6, $vr6, $vr2 + vor.v $vr5, $vr5, $vr6 addi.d $t2, $a0, 16 addi.w $a1, $a1, 1 - vst $vr3, $a0, 0 + vst $vr5, $a0, 0 move $a0, $t2 beq $a1, $a7, .LBB0_5 .LBB0_3: # =>This Inner Loop Header: Depth=1 @@ -456,15 +457,14 @@ des_main_ks: # @des_main_ks b .LBB0_1 .LBB0_5: move $a0, $zero - fld.d $fs7, $sp, 184 # 8-byte Folded Reload - fld.d $fs6, $sp, 192 # 8-byte Folded Reload - fld.d $fs5, $sp, 200 # 8-byte Folded Reload - fld.d $fs4, $sp, 208 # 8-byte Folded Reload - fld.d $fs3, $sp, 216 # 8-byte Folded Reload - fld.d $fs2, $sp, 224 # 8-byte Folded Reload - fld.d $fs1, $sp, 232 # 8-byte Folded Reload - fld.d $fs0, $sp, 240 # 8-byte Folded Reload - ld.d $fp, $sp, 248 # 8-byte Folded Reload + fld.d $fs7, $sp, 192 # 8-byte Folded Reload + fld.d $fs6, $sp, 200 # 8-byte Folded Reload + fld.d $fs5, $sp, 208 # 8-byte Folded Reload + fld.d $fs4, $sp, 216 # 8-byte Folded Reload + fld.d $fs3, $sp, 224 # 8-byte Folded Reload + fld.d $fs2, $sp, 232 # 8-byte Folded Reload + fld.d $fs1, $sp, 240 # 8-byte Folded Reload + fld.d $fs0, $sp, 248 # 8-byte Folded Reload addi.d $sp, $sp, 256 ret .Lfunc_end0: diff --git a/results/MultiSource/Benchmarks/mafft/CMakeFiles/pairlocalalign.dir/constants.s b/results/MultiSource/Benchmarks/mafft/CMakeFiles/pairlocalalign.dir/constants.s index 3af63b5a..068be4e0 100644 --- a/results/MultiSource/Benchmarks/mafft/CMakeFiles/pairlocalalign.dir/constants.s +++ b/results/MultiSource/Benchmarks/mafft/CMakeFiles/pairlocalalign.dir/constants.s @@ -2076,51 +2076,53 @@ constants: # @constants movgr2fr.w $fa1, $a5 ld.w $a5, $a4, 316 fst.d $fa0, $a2, 24 - ffint.d.w $fa4, $fa1 - fst.d $fa4, $a3, 0 + ffint.d.w $fa2, $fa1 + fst.d $fa2, $a3, 0 movgr2fr.w $fa0, $a5 ld.w $a5, $a4, 320 - ffint.d.w $fa5, $fa0 - fst.d $fa5, $a3, 8 + ffint.d.w $fa3, $fa0 + fst.d $fa3, $a3, 8 ld.w $a4, $a4, 324 movgr2fr.w $fa0, $a5 - ffint.d.w $fa6, $fa0 - fst.d $fa6, $a3, 16 + ffint.d.w $fa4, $fa0 + fst.d $fa4, $a3, 16 movgr2fr.w $fa0, $a4 - ffint.d.w $fa7, $fa0 - fst.d $fa7, $a3, 24 - fld.d $fa0, $a0, 0 - movgr2fr.d $ft0, $zero - fld.d $fa1, $a0, 8 - fld.d $fa2, $a0, 16 - fld.d $fa3, $a0, 24 - fadd.d $ft0, $fa0, $ft0 - fadd.d $ft0, $ft0, $fa1 - fadd.d $ft0, $ft0, $fa2 - fadd.d $ft0, $ft0, $fa3 - fld.d $ft1, $a1, 0 - fld.d $ft2, $a1, 8 - fld.d $ft3, $a1, 16 - fld.d $ft4, $a1, 24 - fadd.d $ft0, $ft0, $ft1 - fadd.d $ft0, $ft0, $ft2 - fadd.d $ft0, $ft0, $ft3 - fadd.d $ft0, $ft0, $ft4 - fld.d $ft1, $a2, 0 - fld.d $ft2, $a2, 8 - fld.d $ft3, $a2, 16 - fld.d $ft4, $a2, 24 - fadd.d $ft0, $ft0, $ft1 - fadd.d $ft0, $ft0, $ft2 - fadd.d $ft0, $ft0, $ft3 - fadd.d $ft0, $ft0, $ft4 - fadd.d $fa4, $ft0, $fa4 + ffint.d.w $fa5, $fa0 + fst.d $fa5, $a3, 24 + vld $vr0, $a0, 0 + vreplvei.d $vr6, $vr0, 0 + movgr2fr.d $fa7, $zero + vld $vr1, $a0, 16 + fadd.d $fa6, $fa6, $fa7 + vreplvei.d $vr7, $vr0, 1 + fadd.d $fa6, $fa6, $fa7 + vreplvei.d $vr7, $vr1, 0 + fadd.d $fa6, $fa6, $fa7 + vreplvei.d $vr7, $vr1, 1 + fadd.d $fa6, $fa6, $fa7 + fld.d $fa7, $a1, 0 + fld.d $ft0, $a1, 8 + fld.d $ft1, $a1, 16 + fld.d $ft2, $a1, 24 + fadd.d $fa6, $fa6, $fa7 + fadd.d $fa6, $fa6, $ft0 + fadd.d $fa6, $fa6, $ft1 + fadd.d $fa6, $fa6, $ft2 + fld.d $fa7, $a2, 0 + fld.d $ft0, $a2, 8 + fld.d $ft1, $a2, 16 + fld.d $ft2, $a2, 24 + fadd.d $fa6, $fa6, $fa7 + fadd.d $fa6, $fa6, $ft0 + fadd.d $fa6, $fa6, $ft1 + fadd.d $fa6, $fa6, $ft2 + fadd.d $fa2, $fa6, $fa2 pcalau12i $a4, %pc_hi20(.LCPI2_9) - fld.d $ft0, $a4, %pc_lo12(.LCPI2_9) - fadd.d $fa4, $fa4, $fa5 - fadd.d $fa4, $fa4, $fa6 - fadd.d $fa4, $fa4, $fa7 - fmul.d $fa4, $fa4, $ft0 + fld.d $fa6, $a4, %pc_lo12(.LCPI2_9) + fadd.d $fa2, $fa2, $fa3 + fadd.d $fa2, $fa2, $fa4 + fadd.d $fa2, $fa2, $fa5 + fmul.d $fa3, $fa2, $fa6 pcalau12i $a4, %got_pc_hi20(disp) ld.d $a4, $a4, %got_pc_lo12(disp) ld.w $a4, $a4, 0 @@ -2129,110 +2131,104 @@ constants: # @constants pcalau12i $a0, %got_pc_hi20(stderr) ld.d $a0, $a0, %got_pc_lo12(stderr) ld.d $a0, $a0, 0 - movfr2gr.d $a2, $fa4 + movfr2gr.d $a2, $fa3 pcalau12i $a1, %pc_hi20(.L.str.4) addi.d $a1, $a1, %pc_lo12(.L.str.4) - vst $vr4, $sp, 128 # 16-byte Folded Spill + vst $vr3, $sp, 128 # 16-byte Folded Spill pcaddu18i $ra, %call36(fprintf) jirl $ra, $ra, 0 - vld $vr4, $sp, 128 # 16-byte Folded Reload + vld $vr3, $sp, 128 # 16-byte Folded Reload ld.d $a0, $s1, 0 - fld.d $fa0, $a0, 0 - fld.d $fa1, $a0, 8 - fld.d $fa2, $a0, 16 - fld.d $fa3, $a0, 24 + vld $vr0, $a0, 0 + vld $vr1, $a0, 16 ld.d $a1, $s1, 8 ld.d $a2, $s1, 16 ld.d $a3, $s1, 24 .LBB2_24: # %.preheader1134 - fsub.d $fa0, $fa0, $fa4 - fst.d $fa0, $a0, 0 - fsub.d $fa0, $fa1, $fa4 - fst.d $fa0, $a0, 8 - fsub.d $fa0, $fa2, $fa4 - fst.d $fa0, $a0, 16 - fsub.d $fa0, $fa3, $fa4 - fst.d $fa0, $a0, 24 - vld $vr0, $a1, 16 - vld $vr1, $a1, 0 - vreplvei.d $vr2, $vr4, 0 + vreplvei.d $vr2, $vr3, 0 vfsub.d $vr0, $vr0, $vr2 - vfsub.d $vr1, $vr1, $vr2 - vst $vr1, $a1, 0 + vst $vr0, $a0, 0 + vfsub.d $vr0, $vr1, $vr2 + vst $vr0, $a0, 16 + vld $vr0, $a1, 0 + vld $vr1, $a1, 16 + vfsub.d $vr0, $vr0, $vr2 + vst $vr0, $a1, 0 + vfsub.d $vr0, $vr1, $vr2 vst $vr0, $a1, 16 - vld $vr0, $a2, 16 - vld $vr1, $a2, 0 + vld $vr0, $a2, 0 + vld $vr1, $a2, 16 vfsub.d $vr0, $vr0, $vr2 - vfsub.d $vr1, $vr1, $vr2 - vst $vr1, $a2, 0 + vst $vr0, $a2, 0 + vfsub.d $vr0, $vr1, $vr2 vst $vr0, $a2, 16 - vld $vr0, $a3, 16 - vld $vr1, $a3, 0 - pcalau12i $a4, %pc_hi20(.LCPI2_5) - fld.d $fa3, $a4, %pc_lo12(.LCPI2_5) + vld $vr0, $a3, 0 vfsub.d $vr0, $vr0, $vr2 + vld $vr1, $a3, 16 + vst $vr0, $a3, 0 + pcalau12i $a4, %pc_hi20(.LCPI2_5) + fld.d $fa0, $a4, %pc_lo12(.LCPI2_5) vfsub.d $vr1, $vr1, $vr2 - vst $vr1, $a3, 0 - vst $vr0, $a3, 16 - vld $vr0, $a0, 16 + vst $vr1, $a3, 16 vld $vr1, $a0, 0 - fdiv.d $fa2, $fa3, $fa4 - vreplvei.d $vr2, $vr2, 0 - vfmul.d $vr0, $vr2, $vr0 - vfmul.d $vr1, $vr2, $vr1 + fdiv.d $fa0, $fa0, $fa3 + vld $vr2, $a0, 16 + vreplvei.d $vr0, $vr0, 0 + vfmul.d $vr1, $vr0, $vr1 vst $vr1, $a0, 0 - vst $vr0, $a0, 16 - vld $vr0, $a1, 16 + vfmul.d $vr1, $vr0, $vr2 + vst $vr1, $a0, 16 vld $vr1, $a1, 0 - vfmul.d $vr0, $vr2, $vr0 - vfmul.d $vr1, $vr2, $vr1 + vld $vr2, $a1, 16 + vfmul.d $vr1, $vr0, $vr1 vst $vr1, $a1, 0 - vst $vr0, $a1, 16 - vld $vr0, $a2, 16 + vfmul.d $vr1, $vr0, $vr2 + vst $vr1, $a1, 16 vld $vr1, $a2, 0 - vfmul.d $vr0, $vr2, $vr0 + vld $vr2, $a2, 16 + vfmul.d $vr1, $vr0, $vr1 ld.d $a0, $s1, 24 - vfmul.d $vr1, $vr2, $vr1 vst $vr1, $a2, 0 - vst $vr0, $a2, 16 - vld $vr0, $a0, 16 + vfmul.d $vr1, $vr0, $vr2 + vst $vr1, $a2, 16 vld $vr1, $a0, 0 - vfmul.d $vr0, $vr2, $vr0 - vfmul.d $vr1, $vr2, $vr1 + vfmul.d $vr1, $vr0, $vr1 + vld $vr2, $a0, 16 + vst $vr1, $a0, 0 ld.w $a2, $s4, 0 ld.d $a1, $s1, 0 - vst $vr1, $a0, 0 + vfmul.d $vr0, $vr0, $vr2 vst $vr0, $a0, 16 movgr2fr.w $fa0, $a2 - vld $vr1, $a1, 16 + vld $vr1, $a1, 0 ffint.d.w $fa0, $fa0 - vld $vr2, $a1, 0 vreplvei.d $vr0, $vr0, 0 + vld $vr2, $a1, 16 vfsub.d $vr1, $vr1, $vr0 ld.d $a2, $s1, 8 - vfsub.d $vr2, $vr2, $vr0 - vst $vr2, $a1, 0 + vst $vr1, $a1, 0 + vfsub.d $vr1, $vr2, $vr0 vst $vr1, $a1, 16 - vld $vr1, $a2, 16 - vld $vr2, $a2, 0 + vld $vr1, $a2, 0 + vld $vr2, $a2, 16 vfsub.d $vr1, $vr1, $vr0 ld.d $a3, $s1, 16 - vfsub.d $vr2, $vr2, $vr0 - vst $vr2, $a2, 0 + vst $vr1, $a2, 0 + vfsub.d $vr1, $vr2, $vr0 vst $vr1, $a2, 16 - vld $vr1, $a3, 16 - vld $vr2, $a3, 0 + vld $vr1, $a3, 0 + vld $vr2, $a3, 16 vfsub.d $vr1, $vr1, $vr0 - vfsub.d $vr2, $vr2, $vr0 - vst $vr2, $a3, 0 + vst $vr1, $a3, 0 + vfsub.d $vr1, $vr2, $vr0 vst $vr1, $a3, 16 - vld $vr1, $a0, 16 - vld $vr2, $a0, 0 + vld $vr1, $a0, 0 + vld $vr2, $a0, 16 vfsub.d $vr1, $vr1, $vr0 + vst $vr1, $a0, 0 vfsub.d $vr0, $vr2, $vr0 - vst $vr0, $a0, 0 - vst $vr1, $a0, 16 - vreplvei.d $vr0, $vr0, 0 + vst $vr0, $a0, 16 + vreplvei.d $vr0, $vr1, 0 b .LBB2_248 .LBB2_25: pcalau12i $a1, %got_pc_hi20(scoremtx) @@ -3806,35 +3802,35 @@ constants: # @constants .LBB2_114: # %.preheader1190 # =>This Inner Loop Header: Depth=1 ldx.d $a2, $s7, $a0 - vld $vr1, $a2, 16 - vld $vr2, $a2, 0 + vld $vr1, $a2, 0 + vld $vr2, $a2, 16 vfsub.d $vr1, $vr1, $vr0 + vst $vr1, $a2, 0 + vld $vr1, $a2, 32 vfsub.d $vr2, $vr2, $vr0 - vld $vr3, $a2, 32 - vst $vr2, $a2, 0 - vst $vr1, $a2, 16 - vld $vr1, $a2, 48 - vfsub.d $vr2, $vr3, $vr0 - vst $vr2, $a2, 32 + vst $vr2, $a2, 16 + vld $vr2, $a2, 48 + vfsub.d $vr1, $vr1, $vr0 + vst $vr1, $a2, 32 + vld $vr1, $a2, 64 + vfsub.d $vr2, $vr2, $vr0 + vst $vr2, $a2, 48 vld $vr2, $a2, 80 - vld $vr3, $a2, 64 vfsub.d $vr1, $vr1, $vr0 - vst $vr1, $a2, 48 - vfsub.d $vr1, $vr2, $vr0 - vfsub.d $vr2, $vr3, $vr0 - vld $vr3, $a2, 96 - vst $vr2, $a2, 64 - vst $vr1, $a2, 80 - vld $vr1, $a2, 112 - vfsub.d $vr2, $vr3, $vr0 - vst $vr2, $a2, 96 - vld $vr2, $a2, 144 - vld $vr3, $a2, 128 + vst $vr1, $a2, 64 + vld $vr1, $a2, 96 + vfsub.d $vr2, $vr2, $vr0 + vst $vr2, $a2, 80 + vld $vr2, $a2, 112 vfsub.d $vr1, $vr1, $vr0 - vst $vr1, $a2, 112 - vfsub.d $vr1, $vr2, $vr0 - vfsub.d $vr2, $vr3, $vr0 - vst $vr2, $a2, 128 + vst $vr1, $a2, 96 + vld $vr1, $a2, 128 + vfsub.d $vr2, $vr2, $vr0 + vld $vr3, $a2, 144 + vst $vr2, $a2, 112 + vfsub.d $vr1, $vr1, $vr0 + vst $vr1, $a2, 128 + vfsub.d $vr1, $vr3, $vr0 addi.d $a0, $a0, 8 vst $vr1, $a2, 144 bne $a0, $a1, .LBB2_114 @@ -3863,35 +3859,35 @@ constants: # @constants .LBB2_118: # %.preheader1187 # =>This Inner Loop Header: Depth=1 ldx.d $a2, $s7, $a0 - vld $vr1, $a2, 16 - vld $vr2, $a2, 0 + vld $vr1, $a2, 0 + vld $vr2, $a2, 16 vfmul.d $vr1, $vr0, $vr1 + vst $vr1, $a2, 0 + vld $vr1, $a2, 32 vfmul.d $vr2, $vr0, $vr2 - vld $vr3, $a2, 32 - vst $vr2, $a2, 0 - vst $vr1, $a2, 16 - vld $vr1, $a2, 48 - vfmul.d $vr2, $vr0, $vr3 - vst $vr2, $a2, 32 + vst $vr2, $a2, 16 + vld $vr2, $a2, 48 + vfmul.d $vr1, $vr0, $vr1 + vst $vr1, $a2, 32 + vld $vr1, $a2, 64 + vfmul.d $vr2, $vr0, $vr2 + vst $vr2, $a2, 48 vld $vr2, $a2, 80 - vld $vr3, $a2, 64 vfmul.d $vr1, $vr0, $vr1 - vst $vr1, $a2, 48 - vfmul.d $vr1, $vr0, $vr2 - vfmul.d $vr2, $vr0, $vr3 - vld $vr3, $a2, 96 - vst $vr2, $a2, 64 - vst $vr1, $a2, 80 - vld $vr1, $a2, 112 - vfmul.d $vr2, $vr0, $vr3 - vst $vr2, $a2, 96 - vld $vr2, $a2, 144 - vld $vr3, $a2, 128 + vst $vr1, $a2, 64 + vld $vr1, $a2, 96 + vfmul.d $vr2, $vr0, $vr2 + vst $vr2, $a2, 80 + vld $vr2, $a2, 112 vfmul.d $vr1, $vr0, $vr1 - vst $vr1, $a2, 112 - vfmul.d $vr1, $vr0, $vr2 - vfmul.d $vr2, $vr0, $vr3 - vst $vr2, $a2, 128 + vst $vr1, $a2, 96 + vld $vr1, $a2, 128 + vfmul.d $vr2, $vr0, $vr2 + vld $vr3, $a2, 144 + vst $vr2, $a2, 112 + vfmul.d $vr1, $vr0, $vr1 + vst $vr1, $a2, 128 + vfmul.d $vr1, $vr0, $vr3 addi.d $a0, $a0, 8 vst $vr1, $a2, 144 bne $a0, $a1, .LBB2_118 @@ -3907,35 +3903,35 @@ constants: # @constants .LBB2_120: # %.preheader1185 # =>This Inner Loop Header: Depth=1 ldx.d $a2, $s7, $a0 - vld $vr1, $a2, 16 - vld $vr2, $a2, 0 + vld $vr1, $a2, 0 + vld $vr2, $a2, 16 vfsub.d $vr1, $vr1, $vr0 + vst $vr1, $a2, 0 + vld $vr1, $a2, 32 vfsub.d $vr2, $vr2, $vr0 - vld $vr3, $a2, 32 - vst $vr2, $a2, 0 - vst $vr1, $a2, 16 - vld $vr1, $a2, 48 - vfsub.d $vr2, $vr3, $vr0 - vst $vr2, $a2, 32 + vst $vr2, $a2, 16 + vld $vr2, $a2, 48 + vfsub.d $vr1, $vr1, $vr0 + vst $vr1, $a2, 32 + vld $vr1, $a2, 64 + vfsub.d $vr2, $vr2, $vr0 + vst $vr2, $a2, 48 vld $vr2, $a2, 80 - vld $vr3, $a2, 64 vfsub.d $vr1, $vr1, $vr0 - vst $vr1, $a2, 48 - vfsub.d $vr1, $vr2, $vr0 - vfsub.d $vr2, $vr3, $vr0 - vld $vr3, $a2, 96 - vst $vr2, $a2, 64 - vst $vr1, $a2, 80 - vld $vr1, $a2, 112 - vfsub.d $vr2, $vr3, $vr0 - vst $vr2, $a2, 96 - vld $vr2, $a2, 144 - vld $vr3, $a2, 128 + vst $vr1, $a2, 64 + vld $vr1, $a2, 96 + vfsub.d $vr2, $vr2, $vr0 + vst $vr2, $a2, 80 + vld $vr2, $a2, 112 vfsub.d $vr1, $vr1, $vr0 - vst $vr1, $a2, 112 - vfsub.d $vr1, $vr2, $vr0 - vfsub.d $vr2, $vr3, $vr0 - vst $vr2, $a2, 128 + vst $vr1, $a2, 96 + vld $vr1, $a2, 128 + vfsub.d $vr2, $vr2, $vr0 + vld $vr3, $a2, 144 + vst $vr2, $a2, 112 + vfsub.d $vr1, $vr1, $vr0 + vst $vr1, $a2, 128 + vfsub.d $vr1, $vr3, $vr0 addi.d $a0, $a0, 8 vst $vr1, $a2, 144 bne $a0, $a1, .LBB2_120 @@ -4439,159 +4435,157 @@ constants: # @constants vldi $vr3, -944 .LBB2_128: # %.preheader1149 ld.d $s4, $sp, 112 # 8-byte Folded Reload - ld.d $a1, $s1, 0 - fld.d $fa4, $a1, 0 - fld.d $fa5, $a1, 8 - fmul.d $fa6, $fa4, $fa3 - fmadd.d $fa7, $fa6, $fa3, $fs0 - fmul.d $ft0, $fa5, $fa3 - fld.d $fa6, $a1, 16 - fmadd.d $ft0, $ft0, $fa2, $fa7 ld.d $a0, $s1, 8 - fld.d $fa7, $a1, 24 - fmul.d $ft1, $fa6, $fa3 - fmadd.d $ft0, $ft1, $fa1, $ft0 - fld.d $ft1, $a0, 0 - fmul.d $ft2, $fa7, $fa3 - fld.d $ft3, $a0, 8 - fmadd.d $ft0, $ft2, $fa0, $ft0 - fmul.d $ft1, $ft1, $fa2 - fmadd.d $ft0, $ft1, $fa3, $ft0 - fmul.d $ft1, $ft3, $fa2 - fld.d $ft2, $a0, 16 - fmadd.d $ft0, $ft1, $fa2, $ft0 - ld.d $a2, $s1, 16 - fld.d $ft1, $a0, 24 - fmul.d $ft2, $ft2, $fa2 - fmadd.d $ft0, $ft2, $fa1, $ft0 - fld.d $ft2, $a2, 0 - fmul.d $ft1, $ft1, $fa2 - fld.d $ft3, $a2, 8 - fmadd.d $ft0, $ft1, $fa0, $ft0 - fmul.d $ft1, $ft2, $fa1 - fmadd.d $ft0, $ft1, $fa3, $ft0 - fmul.d $ft1, $ft3, $fa1 - fld.d $ft2, $a2, 16 - fmadd.d $ft0, $ft1, $fa2, $ft0 - ld.d $a3, $s1, 24 - fld.d $ft1, $a2, 24 - fmul.d $ft2, $ft2, $fa1 - fmadd.d $ft0, $ft2, $fa1, $ft0 - fld.d $ft2, $a3, 0 + fld.d $fa4, $a0, 0 + ld.d $a3, $s1, 0 + fmul.d $fa4, $fa4, $fa2 + ld.d $a1, $s1, 16 + fld.d $fa5, $a0, 8 + fld.d $fa6, $a0, 16 + fld.d $fa7, $a0, 24 + fld.d $ft0, $a1, 0 + fmul.d $fa5, $fa5, $fa2 + fmul.d $fa6, $fa6, $fa2 + fmul.d $fa7, $fa7, $fa2 + fmul.d $ft0, $ft0, $fa1 + fld.d $ft1, $a1, 8 + ld.d $a2, $s1, 24 + fld.d $ft2, $a1, 16 + fld.d $ft3, $a1, 24 fmul.d $ft1, $ft1, $fa1 - fmadd.d $ft0, $ft1, $fa0, $ft0 - fld.d $ft1, $a3, 8 - fmul.d $ft2, $ft2, $fa0 - fmadd.d $fa3, $ft2, $fa3, $ft0 - fld.d $ft0, $a3, 16 - fmul.d $ft1, $ft1, $fa0 - fld.d $ft2, $a3, 24 - fmadd.d $fa2, $ft1, $fa2, $fa3 - fmul.d $fa3, $ft0, $fa0 - fmadd.d $fa1, $fa3, $fa1, $fa2 - fmul.d $fa2, $ft2, $fa0 - fmadd.d $fa0, $fa2, $fa0, $fa1 - fsub.d $fa1, $fa4, $fa0 - fst.d $fa1, $a1, 0 - fsub.d $fa1, $fa5, $fa0 - fst.d $fa1, $a1, 8 - fsub.d $fa1, $fa6, $fa0 - fst.d $fa1, $a1, 16 - fsub.d $fa1, $fa7, $fa0 - fst.d $fa1, $a1, 24 - vld $vr1, $a0, 16 - vld $vr2, $a0, 0 + fld.d $ft4, $a2, 0 + fmul.d $ft2, $ft2, $fa1 + fmul.d $ft3, $ft3, $fa1 + fld.d $ft5, $a2, 8 + fmul.d $ft4, $ft4, $fa0 + fld.d $ft6, $a2, 16 + fld.d $ft7, $a2, 24 + fmul.d $ft5, $ft5, $fa0 + vld $vr16, $a3, 0 + fmul.d $ft6, $ft6, $fa0 + fmul.d $ft7, $ft7, $fa0 + vld $vr17, $a3, 16 + vreplvei.d $vr18, $vr16, 0 + fmul.d $ft10, $ft10, $fa3 + fmadd.d $ft10, $ft10, $fa3, $fs0 + vreplvei.d $vr19, $vr16, 1 + fmul.d $ft11, $ft11, $fa3 + fmadd.d $ft10, $ft11, $fa2, $ft10 + vreplvei.d $vr19, $vr17, 0 + fmul.d $ft11, $ft11, $fa3 + fmadd.d $ft10, $ft11, $fa1, $ft10 + vreplvei.d $vr19, $vr17, 1 + fmul.d $ft11, $ft11, $fa3 + fmadd.d $ft10, $ft11, $fa0, $ft10 + fmadd.d $fa4, $fa4, $fa3, $ft10 + fmadd.d $fa4, $fa5, $fa2, $fa4 + fmadd.d $fa4, $fa6, $fa1, $fa4 + fmadd.d $fa4, $fa7, $fa0, $fa4 + fmadd.d $fa4, $ft0, $fa3, $fa4 + fmadd.d $fa4, $ft1, $fa2, $fa4 + fmadd.d $fa4, $ft2, $fa1, $fa4 + fmadd.d $fa4, $ft3, $fa0, $fa4 + fmadd.d $fa3, $ft4, $fa3, $fa4 + fmadd.d $fa2, $ft5, $fa2, $fa3 + fmadd.d $fa1, $ft6, $fa1, $fa2 + fmadd.d $fa0, $ft7, $fa0, $fa1 vreplvei.d $vr0, $vr0, 0 + vfsub.d $vr1, $vr16, $vr0 + vfsub.d $vr2, $vr17, $vr0 + vst $vr2, $a3, 16 + vst $vr1, $a3, 0 + vld $vr1, $a0, 0 + vld $vr2, $a0, 16 vfsub.d $vr1, $vr1, $vr0 - vfsub.d $vr2, $vr2, $vr0 - vst $vr2, $a0, 0 + vst $vr1, $a0, 0 + vfsub.d $vr1, $vr2, $vr0 vst $vr1, $a0, 16 - vld $vr1, $a2, 16 - vld $vr2, $a2, 0 + vld $vr1, $a1, 0 + vld $vr2, $a1, 16 vfsub.d $vr1, $vr1, $vr0 - vfsub.d $vr2, $vr2, $vr0 - vst $vr2, $a2, 0 - vst $vr1, $a2, 16 - vld $vr1, $a3, 16 - vld $vr2, $a3, 0 + vst $vr1, $a1, 0 + vfsub.d $vr1, $vr2, $vr0 + vst $vr1, $a1, 16 + vld $vr1, $a2, 0 + vld $vr2, $a2, 16 vfsub.d $vr1, $vr1, $vr0 + vst $vr1, $a2, 0 vfsub.d $vr0, $vr2, $vr0 - vst $vr0, $a3, 0 - vst $vr1, $a3, 16 - fld.d $fa0, $a0, 8 - fld.d $fa2, $a2, 16 + vst $vr0, $a2, 16 + fld.d $fa1, $a0, 8 + fld.d $fa2, $a1, 16 ld.d $a2, $s1, 0 vldi $vr3, -944 - fmul.d $fa0, $fa0, $fa3 + fmul.d $fa1, $fa1, $fa3 fmul.d $fa2, $fa2, $fa3 - vreplvei.d $vr1, $vr1, 1 fld.d $fa4, $a2, 0 - fmul.d $fa1, $fa1, $fa3 + vreplvei.d $vr0, $vr0, 1 + fmul.d $fa0, $fa0, $fa3 vld $vr5, $a2, 0 - vld $vr6, $a2, 16 fmul.d $fa3, $fa4, $fa3 fadd.d $fa3, $fa3, $fs0 pcalau12i $a0, %pc_hi20(.LCPI2_5) fld.d $fa4, $a0, %pc_lo12(.LCPI2_5) - fadd.d $fa0, $fa3, $fa0 - fadd.d $fa0, $fa0, $fa2 - fadd.d $fa0, $fa0, $fa1 + fadd.d $fa1, $fa3, $fa1 + fadd.d $fa1, $fa1, $fa2 + fadd.d $fa0, $fa1, $fa0 fdiv.d $fa0, $fa4, $fa0 vreplvei.d $vr0, $vr0, 0 - vfmul.d $vr1, $vr0, $vr6 - ld.d $a3, $s1, 8 + vld $vr1, $a2, 16 vfmul.d $vr2, $vr0, $vr5 + ld.d $a3, $s1, 8 vst $vr2, $a2, 0 + vfmul.d $vr1, $vr0, $vr1 vst $vr1, $a2, 16 - vld $vr1, $a3, 16 - vld $vr2, $a3, 0 + vld $vr1, $a3, 0 + vld $vr2, $a3, 16 vfmul.d $vr1, $vr0, $vr1 ld.d $a1, $s1, 16 - vfmul.d $vr2, $vr0, $vr2 - vst $vr2, $a3, 0 + vst $vr1, $a3, 0 + vfmul.d $vr1, $vr0, $vr2 vst $vr1, $a3, 16 - vld $vr1, $a1, 16 - vld $vr2, $a1, 0 + vld $vr1, $a1, 0 + vld $vr2, $a1, 16 vfmul.d $vr1, $vr0, $vr1 ld.d $a0, $s1, 24 - vfmul.d $vr2, $vr0, $vr2 - vst $vr2, $a1, 0 + vst $vr1, $a1, 0 + vfmul.d $vr1, $vr0, $vr2 vst $vr1, $a1, 16 - vld $vr1, $a0, 16 - vld $vr2, $a0, 0 + vld $vr1, $a0, 0 + vld $vr2, $a0, 16 vfmul.d $vr1, $vr0, $vr1 ld.w $a4, $s4, 0 + vst $vr1, $a0, 0 vfmul.d $vr0, $vr0, $vr2 - vst $vr0, $a0, 0 - vst $vr1, $a0, 16 + vst $vr0, $a0, 16 movgr2fr.w $fa0, $a4 - vld $vr1, $a2, 16 - vld $vr2, $a2, 0 + vld $vr1, $a2, 0 ffint.d.w $fa0, $fa0 + vld $vr2, $a2, 16 vreplvei.d $vr0, $vr0, 0 vfsub.d $vr1, $vr1, $vr0 - vfsub.d $vr2, $vr2, $vr0 - vst $vr2, $a2, 0 + vst $vr1, $a2, 0 + vfsub.d $vr1, $vr2, $vr0 vst $vr1, $a2, 16 - vld $vr1, $a3, 16 - vld $vr2, $a3, 0 + vld $vr1, $a3, 0 + vld $vr2, $a3, 16 vfsub.d $vr1, $vr1, $vr0 - vfsub.d $vr2, $vr2, $vr0 - vst $vr2, $a3, 0 + vst $vr1, $a3, 0 + vfsub.d $vr1, $vr2, $vr0 vst $vr1, $a3, 16 - vld $vr1, $a1, 16 - vld $vr2, $a1, 0 + vld $vr1, $a1, 0 + vld $vr2, $a1, 16 vfsub.d $vr1, $vr1, $vr0 - vfsub.d $vr2, $vr2, $vr0 - vst $vr2, $a1, 0 + vst $vr1, $a1, 0 + vfsub.d $vr1, $vr2, $vr0 vst $vr1, $a1, 16 - vld $vr1, $a0, 16 - vld $vr2, $a0, 0 + vld $vr1, $a0, 0 + vld $vr2, $a0, 16 vfsub.d $vr1, $vr1, $vr0 ld.d $a2, $s1, 0 + vst $vr1, $a0, 0 vfsub.d $vr0, $vr2, $vr0 - vst $vr0, $a0, 0 - vst $vr1, $a0, 16 + vst $vr0, $a0, 16 fld.d $fa0, $a2, 0 fcmp.cule.d $fcc0, $fa0, $fs0 bcnez $fcc0, .LBB2_130 @@ -4842,35 +4836,35 @@ constants: # @constants .LBB2_141: # %.preheader1171 # =>This Inner Loop Header: Depth=1 ldx.d $a2, $fp, $a0 - vld $vr1, $a2, 16 - vld $vr2, $a2, 0 + vld $vr1, $a2, 0 + vld $vr2, $a2, 16 vfsub.d $vr1, $vr1, $vr0 + vst $vr1, $a2, 0 + vld $vr1, $a2, 32 vfsub.d $vr2, $vr2, $vr0 - vld $vr3, $a2, 32 - vst $vr2, $a2, 0 - vst $vr1, $a2, 16 - vld $vr1, $a2, 48 - vfsub.d $vr2, $vr3, $vr0 - vst $vr2, $a2, 32 + vst $vr2, $a2, 16 + vld $vr2, $a2, 48 + vfsub.d $vr1, $vr1, $vr0 + vst $vr1, $a2, 32 + vld $vr1, $a2, 64 + vfsub.d $vr2, $vr2, $vr0 + vst $vr2, $a2, 48 vld $vr2, $a2, 80 - vld $vr3, $a2, 64 vfsub.d $vr1, $vr1, $vr0 - vst $vr1, $a2, 48 - vfsub.d $vr1, $vr2, $vr0 - vfsub.d $vr2, $vr3, $vr0 - vld $vr3, $a2, 96 - vst $vr2, $a2, 64 - vst $vr1, $a2, 80 - vld $vr1, $a2, 112 - vfsub.d $vr2, $vr3, $vr0 - vst $vr2, $a2, 96 - vld $vr2, $a2, 144 - vld $vr3, $a2, 128 + vst $vr1, $a2, 64 + vld $vr1, $a2, 96 + vfsub.d $vr2, $vr2, $vr0 + vst $vr2, $a2, 80 + vld $vr2, $a2, 112 vfsub.d $vr1, $vr1, $vr0 - vst $vr1, $a2, 112 - vfsub.d $vr1, $vr2, $vr0 - vfsub.d $vr2, $vr3, $vr0 - vst $vr2, $a2, 128 + vst $vr1, $a2, 96 + vld $vr1, $a2, 128 + vfsub.d $vr2, $vr2, $vr0 + vld $vr3, $a2, 144 + vst $vr2, $a2, 112 + vfsub.d $vr1, $vr1, $vr0 + vst $vr1, $a2, 128 + vfsub.d $vr1, $vr3, $vr0 addi.d $a0, $a0, 8 vst $vr1, $a2, 144 bne $a0, $a1, .LBB2_141 @@ -4898,35 +4892,35 @@ constants: # @constants .LBB2_145: # %.preheader1168 # =>This Inner Loop Header: Depth=1 ldx.d $a2, $fp, $a0 - vld $vr1, $a2, 16 - vld $vr2, $a2, 0 + vld $vr1, $a2, 0 + vld $vr2, $a2, 16 vfmul.d $vr1, $vr0, $vr1 + vst $vr1, $a2, 0 + vld $vr1, $a2, 32 vfmul.d $vr2, $vr0, $vr2 - vld $vr3, $a2, 32 - vst $vr2, $a2, 0 - vst $vr1, $a2, 16 - vld $vr1, $a2, 48 - vfmul.d $vr2, $vr0, $vr3 - vst $vr2, $a2, 32 + vst $vr2, $a2, 16 + vld $vr2, $a2, 48 + vfmul.d $vr1, $vr0, $vr1 + vst $vr1, $a2, 32 + vld $vr1, $a2, 64 + vfmul.d $vr2, $vr0, $vr2 + vst $vr2, $a2, 48 vld $vr2, $a2, 80 - vld $vr3, $a2, 64 vfmul.d $vr1, $vr0, $vr1 - vst $vr1, $a2, 48 - vfmul.d $vr1, $vr0, $vr2 - vfmul.d $vr2, $vr0, $vr3 - vld $vr3, $a2, 96 - vst $vr2, $a2, 64 - vst $vr1, $a2, 80 - vld $vr1, $a2, 112 - vfmul.d $vr2, $vr0, $vr3 - vst $vr2, $a2, 96 - vld $vr2, $a2, 144 - vld $vr3, $a2, 128 + vst $vr1, $a2, 64 + vld $vr1, $a2, 96 + vfmul.d $vr2, $vr0, $vr2 + vst $vr2, $a2, 80 + vld $vr2, $a2, 112 vfmul.d $vr1, $vr0, $vr1 - vst $vr1, $a2, 112 - vfmul.d $vr1, $vr0, $vr2 - vfmul.d $vr2, $vr0, $vr3 - vst $vr2, $a2, 128 + vst $vr1, $a2, 96 + vld $vr1, $a2, 128 + vfmul.d $vr2, $vr0, $vr2 + vld $vr3, $a2, 144 + vst $vr2, $a2, 112 + vfmul.d $vr1, $vr0, $vr1 + vst $vr1, $a2, 128 + vfmul.d $vr1, $vr0, $vr3 addi.d $a0, $a0, 8 vst $vr1, $a2, 144 bne $a0, $a1, .LBB2_145 @@ -4942,35 +4936,35 @@ constants: # @constants .LBB2_147: # %.preheader1166 # =>This Inner Loop Header: Depth=1 ldx.d $a2, $fp, $a0 - vld $vr1, $a2, 16 - vld $vr2, $a2, 0 + vld $vr1, $a2, 0 + vld $vr2, $a2, 16 + vfsub.d $vr1, $vr1, $vr0 + vst $vr1, $a2, 0 + vld $vr1, $a2, 32 + vfsub.d $vr2, $vr2, $vr0 + vst $vr2, $a2, 16 + vld $vr2, $a2, 48 vfsub.d $vr1, $vr1, $vr0 + vst $vr1, $a2, 32 + vld $vr1, $a2, 64 vfsub.d $vr2, $vr2, $vr0 - vld $vr3, $a2, 32 - vst $vr2, $a2, 0 - vst $vr1, $a2, 16 - vld $vr1, $a2, 48 - vfsub.d $vr2, $vr3, $vr0 - vst $vr2, $a2, 32 + vst $vr2, $a2, 48 vld $vr2, $a2, 80 - vld $vr3, $a2, 64 vfsub.d $vr1, $vr1, $vr0 - vst $vr1, $a2, 48 - vfsub.d $vr1, $vr2, $vr0 - vfsub.d $vr2, $vr3, $vr0 - vld $vr3, $a2, 96 - vst $vr2, $a2, 64 - vst $vr1, $a2, 80 - vld $vr1, $a2, 112 - vfsub.d $vr2, $vr3, $vr0 - vst $vr2, $a2, 96 - vld $vr2, $a2, 144 - vld $vr3, $a2, 128 + vst $vr1, $a2, 64 + vld $vr1, $a2, 96 + vfsub.d $vr2, $vr2, $vr0 + vst $vr2, $a2, 80 + vld $vr2, $a2, 112 vfsub.d $vr1, $vr1, $vr0 - vst $vr1, $a2, 112 - vfsub.d $vr1, $vr2, $vr0 - vfsub.d $vr2, $vr3, $vr0 - vst $vr2, $a2, 128 + vst $vr1, $a2, 96 + vld $vr1, $a2, 128 + vfsub.d $vr2, $vr2, $vr0 + vld $vr3, $a2, 144 + vst $vr2, $a2, 112 + vfsub.d $vr1, $vr1, $vr0 + vst $vr1, $a2, 128 + vfsub.d $vr1, $vr3, $vr0 addi.d $a0, $a0, 8 vst $vr1, $a2, 144 bne $a0, $a1, .LBB2_147 @@ -6402,7 +6396,7 @@ constants: # @constants jirl $ra, $ra, 0 move $a0, $zero addi.d $a1, $s2, 20 - movgr2fr.d $ft13, $zero + movgr2fr.d $ft9, $zero vldi $vr0, -928 ori $a2, $zero, 80 vldi $vr1, -800 @@ -6424,14 +6418,14 @@ constants: # @constants # =>This Inner Loop Header: Depth=1 ldx.d $a3, $s1, $a0 fld.d $fa2, $a3, 0 - fcmp.cule.d $fcc0, $fa2, $ft13 + fcmp.cule.d $fcc0, $fa2, $ft9 bcnez $fcc0, .LBB2_257 # %bb.256: # in Loop: Header=BB2_255 Depth=1 fadd.d $fa2, $fa2, $fa0 b .LBB2_259 .p2align 4, , 16 .LBB2_257: # in Loop: Header=BB2_255 Depth=1 - fcmp.cule.d $fcc0, $ft13, $fa2 + fcmp.cule.d $fcc0, $ft9, $fa2 bcnez $fcc0, .LBB2_260 # %bb.258: # in Loop: Header=BB2_255 Depth=1 fadd.d $fa2, $fa2, $fa1 @@ -6446,7 +6440,7 @@ constants: # @constants .LBB2_261: # %shishagonyuu.exit1038 # in Loop: Header=BB2_255 Depth=1 fld.d $fa2, $a3, 8 - fcmp.clt.d $fcc0, $ft13, $fa2 + fcmp.clt.d $fcc0, $ft9, $fa2 st.w $a4, $a1, -20 bceqz $fcc0, .LBB2_263 # %bb.262: # in Loop: Header=BB2_255 Depth=1 @@ -6454,7 +6448,7 @@ constants: # @constants b .LBB2_265 .p2align 4, , 16 .LBB2_263: # in Loop: Header=BB2_255 Depth=1 - fcmp.cule.d $fcc0, $ft13, $fa2 + fcmp.cule.d $fcc0, $ft9, $fa2 bcnez $fcc0, .LBB2_266 # %bb.264: # in Loop: Header=BB2_255 Depth=1 fadd.d $fa2, $fa2, $fa1 @@ -6469,7 +6463,7 @@ constants: # @constants .LBB2_267: # %shishagonyuu.exit1038.1 # in Loop: Header=BB2_255 Depth=1 fld.d $fa2, $a3, 16 - fcmp.clt.d $fcc0, $ft13, $fa2 + fcmp.clt.d $fcc0, $ft9, $fa2 st.w $a4, $a1, -16 bceqz $fcc0, .LBB2_269 # %bb.268: # in Loop: Header=BB2_255 Depth=1 @@ -6477,7 +6471,7 @@ constants: # @constants b .LBB2_271 .p2align 4, , 16 .LBB2_269: # in Loop: Header=BB2_255 Depth=1 - fcmp.cule.d $fcc0, $ft13, $fa2 + fcmp.cule.d $fcc0, $ft9, $fa2 bcnez $fcc0, .LBB2_272 # %bb.270: # in Loop: Header=BB2_255 Depth=1 fadd.d $fa2, $fa2, $fa1 @@ -6492,7 +6486,7 @@ constants: # @constants .LBB2_273: # %shishagonyuu.exit1038.2 # in Loop: Header=BB2_255 Depth=1 fld.d $fa2, $a3, 24 - fcmp.clt.d $fcc0, $ft13, $fa2 + fcmp.clt.d $fcc0, $ft9, $fa2 st.w $a4, $a1, -12 bceqz $fcc0, .LBB2_275 # %bb.274: # in Loop: Header=BB2_255 Depth=1 @@ -6500,7 +6494,7 @@ constants: # @constants b .LBB2_277 .p2align 4, , 16 .LBB2_275: # in Loop: Header=BB2_255 Depth=1 - fcmp.cule.d $fcc0, $ft13, $fa2 + fcmp.cule.d $fcc0, $ft9, $fa2 bcnez $fcc0, .LBB2_278 # %bb.276: # in Loop: Header=BB2_255 Depth=1 fadd.d $fa2, $fa2, $fa1 @@ -6515,7 +6509,7 @@ constants: # @constants .LBB2_279: # %shishagonyuu.exit1038.3 # in Loop: Header=BB2_255 Depth=1 fld.d $fa2, $a3, 32 - fcmp.clt.d $fcc0, $ft13, $fa2 + fcmp.clt.d $fcc0, $ft9, $fa2 st.w $a4, $a1, -8 bceqz $fcc0, .LBB2_281 # %bb.280: # in Loop: Header=BB2_255 Depth=1 @@ -6523,7 +6517,7 @@ constants: # @constants b .LBB2_283 .p2align 4, , 16 .LBB2_281: # in Loop: Header=BB2_255 Depth=1 - fcmp.cule.d $fcc0, $ft13, $fa2 + fcmp.cule.d $fcc0, $ft9, $fa2 bcnez $fcc0, .LBB2_284 # %bb.282: # in Loop: Header=BB2_255 Depth=1 fadd.d $fa2, $fa2, $fa1 @@ -6538,7 +6532,7 @@ constants: # @constants .LBB2_285: # %shishagonyuu.exit1038.4 # in Loop: Header=BB2_255 Depth=1 fld.d $fa2, $a3, 40 - fcmp.clt.d $fcc0, $ft13, $fa2 + fcmp.clt.d $fcc0, $ft9, $fa2 st.w $a4, $a1, -4 bceqz $fcc0, .LBB2_287 # %bb.286: # in Loop: Header=BB2_255 Depth=1 @@ -6546,7 +6540,7 @@ constants: # @constants b .LBB2_289 .p2align 4, , 16 .LBB2_287: # in Loop: Header=BB2_255 Depth=1 - fcmp.cule.d $fcc0, $ft13, $fa2 + fcmp.cule.d $fcc0, $ft9, $fa2 bcnez $fcc0, .LBB2_290 # %bb.288: # in Loop: Header=BB2_255 Depth=1 fadd.d $fa2, $fa2, $fa1 @@ -6561,7 +6555,7 @@ constants: # @constants .LBB2_291: # %shishagonyuu.exit1038.5 # in Loop: Header=BB2_255 Depth=1 fld.d $fa2, $a3, 48 - fcmp.clt.d $fcc0, $ft13, $fa2 + fcmp.clt.d $fcc0, $ft9, $fa2 st.w $a4, $a1, 0 bceqz $fcc0, .LBB2_293 # %bb.292: # in Loop: Header=BB2_255 Depth=1 @@ -6569,7 +6563,7 @@ constants: # @constants b .LBB2_295 .p2align 4, , 16 .LBB2_293: # in Loop: Header=BB2_255 Depth=1 - fcmp.cule.d $fcc0, $ft13, $fa2 + fcmp.cule.d $fcc0, $ft9, $fa2 bcnez $fcc0, .LBB2_296 # %bb.294: # in Loop: Header=BB2_255 Depth=1 fadd.d $fa2, $fa2, $fa1 @@ -6584,7 +6578,7 @@ constants: # @constants .LBB2_297: # %shishagonyuu.exit1038.6 # in Loop: Header=BB2_255 Depth=1 fld.d $fa2, $a3, 56 - fcmp.clt.d $fcc0, $ft13, $fa2 + fcmp.clt.d $fcc0, $ft9, $fa2 st.w $a4, $a1, 4 bceqz $fcc0, .LBB2_299 # %bb.298: # in Loop: Header=BB2_255 Depth=1 @@ -6592,7 +6586,7 @@ constants: # @constants b .LBB2_301 .p2align 4, , 16 .LBB2_299: # in Loop: Header=BB2_255 Depth=1 - fcmp.cule.d $fcc0, $ft13, $fa2 + fcmp.cule.d $fcc0, $ft9, $fa2 bcnez $fcc0, .LBB2_302 # %bb.300: # in Loop: Header=BB2_255 Depth=1 fadd.d $fa2, $fa2, $fa1 @@ -6607,7 +6601,7 @@ constants: # @constants .LBB2_303: # %shishagonyuu.exit1038.7 # in Loop: Header=BB2_255 Depth=1 fld.d $fa2, $a3, 64 - fcmp.clt.d $fcc0, $ft13, $fa2 + fcmp.clt.d $fcc0, $ft9, $fa2 st.w $a4, $a1, 8 bceqz $fcc0, .LBB2_305 # %bb.304: # in Loop: Header=BB2_255 Depth=1 @@ -6615,7 +6609,7 @@ constants: # @constants b .LBB2_307 .p2align 4, , 16 .LBB2_305: # in Loop: Header=BB2_255 Depth=1 - fcmp.cule.d $fcc0, $ft13, $fa2 + fcmp.cule.d $fcc0, $ft9, $fa2 bcnez $fcc0, .LBB2_308 # %bb.306: # in Loop: Header=BB2_255 Depth=1 fadd.d $fa2, $fa2, $fa1 @@ -6630,11 +6624,11 @@ constants: # @constants .LBB2_309: # %shishagonyuu.exit1038.8 # in Loop: Header=BB2_255 Depth=1 fld.d $fa2, $a3, 72 - fcmp.clt.d $fcc0, $ft13, $fa2 + fcmp.clt.d $fcc0, $ft9, $fa2 st.w $a4, $a1, 12 bcnez $fcc0, .LBB2_252 # %bb.310: # in Loop: Header=BB2_255 Depth=1 - fcmp.cule.d $fcc0, $ft13, $fa2 + fcmp.cule.d $fcc0, $ft9, $fa2 bcnez $fcc0, .LBB2_312 # %bb.311: # in Loop: Header=BB2_255 Depth=1 fadd.d $fa2, $fa2, $fa1 @@ -6648,7 +6642,7 @@ constants: # @constants st.d $s1, $sp, 64 # 8-byte Folded Spill beqz $a0, .LBB2_319 # %bb.314: - vst $vr21, $sp, 128 # 16-byte Folded Spill + vst $vr17, $sp, 128 # 16-byte Folded Spill move $s7, $s5 move $s6, $s4 pcalau12i $a0, %got_pc_hi20(stderr) @@ -6694,94 +6688,87 @@ constants: # @constants jirl $ra, $ra, 0 move $s4, $s6 move $s5, $s7 - vld $vr21, $sp, 128 # 16-byte Folded Reload + vld $vr17, $sp, 128 # 16-byte Folded Reload .LBB2_319: # %.preheader1118 - fld.d $fa4, $s8, 0 + fld.d $fa0, $s8, 0 + fld.d $fa1, $s8, 8 + fld.d $fa2, $s8, 16 + fld.d $fa3, $s8, 24 pcalau12i $a0, %pc_hi20(ribosum4) addi.d $s7, $a0, %pc_lo12(ribosum4) - fld.d $fa0, $s7, 0 - fmul.d $fa2, $fa0, $fa4 - fld.d $fa1, $s7, 8 - fld.d $fa5, $s8, 8 - fmadd.d $fa3, $fa2, $fa4, $ft13 - fld.d $fa2, $s7, 16 - fmul.d $fa6, $fa1, $fa4 - fmadd.d $fa6, $fa6, $fa5, $fa3 - fld.d $fa7, $s8, 16 - fmul.d $ft0, $fa2, $fa4 - fld.d $fa3, $s7, 24 - fld.d $ft1, $s8, 24 - fmadd.d $ft0, $ft0, $fa7, $fa6 - fld.d $fa6, $s7, 32 - fmul.d $ft2, $fa3, $fa4 - fmadd.d $ft2, $ft2, $ft1, $ft0 - fld.d $ft0, $s7, 40 - fmul.d $ft3, $fa6, $fa5 - fmadd.d $ft2, $ft3, $fa4, $ft2 - fld.d $ft3, $s7, 48 - fmul.d $ft4, $ft0, $fa5 - fmadd.d $ft2, $ft4, $fa5, $ft2 - fld.d $ft4, $s7, 56 - fmul.d $ft5, $ft3, $fa5 - fmadd.d $ft2, $ft5, $fa7, $ft2 - fld.d $ft5, $s7, 64 - fmul.d $ft6, $ft4, $fa5 - fmadd.d $ft2, $ft6, $ft1, $ft2 - fld.d $ft6, $s7, 72 - fmul.d $ft7, $ft5, $fa7 - fmadd.d $ft2, $ft7, $fa4, $ft2 - fld.d $ft7, $s7, 80 - fmul.d $ft8, $ft6, $fa7 - fmadd.d $ft2, $ft8, $fa5, $ft2 - fld.d $ft8, $s7, 88 - fmul.d $ft9, $ft7, $fa7 - fmadd.d $ft2, $ft9, $fa7, $ft2 - fld.d $ft9, $s7, 96 - fmul.d $ft10, $ft8, $fa7 - fmadd.d $ft2, $ft10, $ft1, $ft2 - fld.d $ft10, $s7, 104 - fmul.d $ft11, $ft9, $ft1 - fmadd.d $fa4, $ft11, $fa4, $ft2 - fld.d $ft2, $s7, 112 - fmul.d $ft11, $ft10, $ft1 - fld.d $ft12, $s7, 120 - fmadd.d $fa4, $ft11, $fa5, $fa4 - fmul.d $fa5, $ft2, $ft1 - fmadd.d $fa4, $fa5, $fa7, $fa4 - fmul.d $fa5, $ft12, $ft1 - fmadd.d $fa4, $fa5, $ft1, $fa4 - fsub.d $fa0, $fa0, $fa4 - fst.d $fa0, $s7, 0 - fsub.d $fa0, $fa1, $fa4 - fst.d $fa0, $s7, 8 - fsub.d $fa0, $fa2, $fa4 - fst.d $fa0, $s7, 16 - fsub.d $fa0, $fa3, $fa4 - fst.d $fa0, $s7, 24 - fsub.d $fa0, $fa6, $fa4 - fst.d $fa0, $s7, 32 - fsub.d $fa0, $ft0, $fa4 - fst.d $fa0, $s7, 40 - fsub.d $fa0, $ft3, $fa4 - fst.d $fa0, $s7, 48 - fsub.d $fa0, $ft4, $fa4 - fst.d $fa0, $s7, 56 - fsub.d $fa0, $ft5, $fa4 - fst.d $fa0, $s7, 64 - fsub.d $fa0, $ft6, $fa4 - fst.d $fa0, $s7, 72 - fsub.d $fa0, $ft7, $fa4 - fst.d $fa0, $s7, 80 - fsub.d $fa0, $ft8, $fa4 - fst.d $fa0, $s7, 88 - fsub.d $fa0, $ft9, $fa4 - fst.d $fa0, $s7, 96 - fsub.d $fa0, $ft10, $fa4 - fst.d $fa0, $s7, 104 - fsub.d $fa0, $ft2, $fa4 - fst.d $fa0, $s7, 112 - fsub.d $fa0, $ft12, $fa4 - fst.d $fa0, $s7, 120 + vld $vr4, $s7, 0 + vld $vr5, $s7, 16 + vreplvei.d $vr6, $vr4, 0 + fmul.d $fa6, $fa6, $fa0 + fmadd.d $fa6, $fa6, $fa0, $ft9 + vreplvei.d $vr7, $vr4, 1 + fmul.d $fa7, $fa7, $fa0 + fmadd.d $fa6, $fa7, $fa1, $fa6 + vreplvei.d $vr7, $vr5, 0 + fmul.d $fa7, $fa7, $fa0 + fmadd.d $fa7, $fa7, $fa2, $fa6 + vreplvei.d $vr8, $vr5, 1 + vld $vr6, $s7, 32 + fmul.d $ft0, $ft0, $fa0 + fmadd.d $ft0, $ft0, $fa3, $fa7 + vld $vr7, $s7, 48 + vreplvei.d $vr9, $vr6, 0 + fmul.d $ft1, $ft1, $fa1 + vreplvei.d $vr10, $vr6, 1 + fmul.d $ft2, $ft2, $fa1 + vreplvei.d $vr11, $vr7, 0 + fmul.d $ft3, $ft3, $fa1 + vreplvei.d $vr12, $vr7, 1 + fmul.d $ft4, $ft4, $fa1 + fmadd.d $ft0, $ft1, $fa0, $ft0 + fmadd.d $ft0, $ft2, $fa1, $ft0 + vld $vr9, $s7, 64 + fmadd.d $ft0, $ft3, $fa2, $ft0 + fmadd.d $ft0, $ft4, $fa3, $ft0 + vld $vr10, $s7, 80 + vreplvei.d $vr11, $vr9, 0 + fmul.d $ft3, $ft3, $fa2 + vreplvei.d $vr12, $vr9, 1 + fmul.d $ft4, $ft4, $fa2 + vreplvei.d $vr13, $vr10, 0 + fmul.d $ft5, $ft5, $fa2 + vreplvei.d $vr14, $vr10, 1 + fmul.d $ft6, $ft6, $fa2 + fmadd.d $ft0, $ft3, $fa0, $ft0 + fmadd.d $ft0, $ft4, $fa1, $ft0 + vld $vr11, $s7, 96 + fmadd.d $ft0, $ft5, $fa2, $ft0 + fmadd.d $ft0, $ft6, $fa3, $ft0 + vld $vr12, $s7, 112 + vreplvei.d $vr13, $vr11, 0 + fmul.d $ft5, $ft5, $fa3 + vreplvei.d $vr14, $vr11, 1 + fmul.d $ft6, $ft6, $fa3 + vreplvei.d $vr15, $vr12, 0 + fmul.d $ft7, $ft7, $fa3 + vreplvei.d $vr16, $vr12, 1 + fmul.d $ft8, $ft8, $fa3 + fmadd.d $fa0, $ft5, $fa0, $ft0 + fmadd.d $fa0, $ft6, $fa1, $fa0 + fmadd.d $fa0, $ft7, $fa2, $fa0 + fmadd.d $fa0, $ft8, $fa3, $fa0 + vreplvei.d $vr0, $vr0, 0 + vfsub.d $vr1, $vr5, $vr0 + vfsub.d $vr2, $vr4, $vr0 + vst $vr2, $s7, 0 + vst $vr1, $s7, 16 + vfsub.d $vr1, $vr7, $vr0 + vfsub.d $vr2, $vr6, $vr0 + vst $vr2, $s7, 32 + vst $vr1, $s7, 48 + vfsub.d $vr1, $vr10, $vr0 + vfsub.d $vr2, $vr9, $vr0 + vst $vr2, $s7, 64 + vst $vr1, $s7, 80 + vfsub.d $vr1, $vr12, $vr0 + vfsub.d $vr0, $vr11, $vr0 + vst $vr0, $s7, 96 + vst $vr1, $s7, 112 fld.d $fa0, $s8, 0 fld.d $fa1, $s8, 24 fld.d $fa2, $s8, 8 @@ -6810,7 +6797,7 @@ constants: # @constants fld.d $fa7, $a5, -56 fmul.d $fa6, $fa6, $fa5 fmul.d $fa6, $fa6, $fa0 - fmadd.d $fa6, $fa6, $fa0, $ft13 + fmadd.d $fa6, $fa6, $fa0, $ft9 fmul.d $fa7, $fa7, $fa4 fld.d $ft0, $a5, -48 fmul.d $fa7, $fa7, $fa5 @@ -6884,7 +6871,7 @@ constants: # @constants fmul.d $fa7, $ft0, $fa4 fmul.d $fa5, $fa7, $fa5 fmul.d $fa5, $fa5, $fa1 - fmadd.d $ft13, $fa5, $fa1, $fa6 + fmadd.d $ft9, $fa5, $fa1, $fa6 addi.d $a4, $a4, 8 addi.d $a5, $a5, 128 bne $a4, $a2, .LBB2_321 @@ -6894,7 +6881,7 @@ constants: # @constants bne $a0, $a3, .LBB2_320 # %bb.323: # %.preheader1110.preheader move $a0, $zero - vreplvei.d $vr0, $vr21, 0 + vreplvei.d $vr0, $vr17, 0 ori $a1, $zero, 2048 .p2align 4, , 16 .LBB2_324: # %.preheader1110 @@ -6904,213 +6891,178 @@ constants: # @constants vld $vr2, $a2, 16 vfsub.d $vr1, $vr1, $vr0 vstx $vr1, $fp, $a0 - vld $vr1, $a2, 48 - vld $vr3, $a2, 32 + vld $vr1, $a2, 32 vfsub.d $vr2, $vr2, $vr0 vst $vr2, $a2, 16 + vld $vr2, $a2, 48 vfsub.d $vr1, $vr1, $vr0 - vfsub.d $vr2, $vr3, $vr0 - vld $vr3, $a2, 64 - vst $vr2, $a2, 32 - vst $vr1, $a2, 48 - vld $vr1, $a2, 80 - vfsub.d $vr2, $vr3, $vr0 - vst $vr2, $a2, 64 - vld $vr2, $a2, 112 - vld $vr3, $a2, 96 + vst $vr1, $a2, 32 + vld $vr1, $a2, 64 + vfsub.d $vr2, $vr2, $vr0 + vst $vr2, $a2, 48 + vld $vr2, $a2, 80 vfsub.d $vr1, $vr1, $vr0 - vst $vr1, $a2, 80 - vfsub.d $vr1, $vr2, $vr0 - vfsub.d $vr2, $vr3, $vr0 - vst $vr2, $a2, 96 + vst $vr1, $a2, 64 + vld $vr1, $a2, 96 + vfsub.d $vr2, $vr2, $vr0 + vld $vr3, $a2, 112 + vst $vr2, $a2, 80 + vfsub.d $vr1, $vr1, $vr0 + vst $vr1, $a2, 96 + vfsub.d $vr1, $vr3, $vr0 addi.d $a0, $a0, 128 vst $vr1, $a2, 112 bne $a0, $a1, .LBB2_324 # %bb.325: # %.preheader1109.preheader - fld.d $fa1, $s7, 0 - fld.d $fa2, $s8, 0 + fld.d $fa1, $s8, 0 + fld.d $fa2, $s8, 8 + fld.d $fa3, $s8, 16 + fld.d $fa4, $s8, 24 + fld.d $fa5, $s7, 0 + vld $vr6, $s7, 0 + vld $vr7, $s7, 16 movgr2fr.d $fa0, $zero - fmadd.d $fa2, $fa1, $fa2, $fa0 - fld.d $fa6, $s7, 40 - fld.d $fa3, $s8, 8 - fld.d $ft3, $s7, 80 - fld.d $fa4, $s8, 16 - fld.d $ft8, $s7, 120 - fld.d $fa5, $s8, 24 + fmadd.d $fa1, $fa5, $fa1, $fa0 + fld.d $fa5, $s7, 40 + vld $vr8, $s7, 32 + vld $vr10, $s7, 48 + fld.d $ft1, $s7, 80 + fmadd.d $fa1, $fa5, $fa2, $fa1 + vld $vr11, $s7, 64 + vld $vr12, $s7, 80 + fmadd.d $fa1, $ft1, $fa3, $fa1 + fld.d $fa2, $s7, 120 pcalau12i $a0, %pc_hi20(.LCPI2_5) - fld.d $ft9, $a0, %pc_lo12(.LCPI2_5) - fmadd.d $fa2, $fa6, $fa3, $fa2 - fmadd.d $fa2, $ft3, $fa4, $fa2 - fmadd.d $fa2, $ft8, $fa5, $fa2 - fdiv.d $ft10, $ft9, $fa2 - fld.d $fa2, $s7, 8 - fmul.d $fa1, $ft10, $fa1 - fst.d $fa1, $s7, 0 - fld.d $fa3, $s7, 16 - fmul.d $fa2, $ft10, $fa2 - fst.d $fa2, $s7, 8 - fld.d $fa4, $s7, 24 - fmul.d $fa3, $ft10, $fa3 - fld.d $fa5, $s7, 32 - fst.d $fa3, $s7, 16 - fmul.d $fa4, $ft10, $fa4 - fst.d $fa4, $s7, 24 - fmul.d $fa5, $ft10, $fa5 - fst.d $fa5, $s7, 32 - fld.d $fa7, $s7, 48 - fmul.d $fa6, $ft10, $fa6 - fst.d $fa6, $s7, 40 - fld.d $ft0, $s7, 56 - fmul.d $fa7, $ft10, $fa7 - fst.d $fa7, $s7, 48 - fld.d $ft1, $s7, 64 - fmul.d $ft0, $ft10, $ft0 - fld.d $ft2, $s7, 72 - fst.d $ft0, $s7, 56 - fmul.d $ft1, $ft10, $ft1 - fst.d $ft1, $s7, 64 - fmul.d $ft2, $ft10, $ft2 - fst.d $ft2, $s7, 72 - fld.d $ft4, $s7, 88 - fmul.d $ft3, $ft10, $ft3 - fst.d $ft3, $s7, 80 - fld.d $ft5, $s7, 96 - fmul.d $ft4, $ft10, $ft4 - fst.d $ft4, $s7, 88 - fld.d $ft6, $s7, 104 - fmul.d $ft5, $ft10, $ft5 - fld.d $ft7, $s7, 112 - fst.d $ft5, $s7, 96 - fmul.d $ft6, $ft10, $ft6 - fst.d $ft6, $s7, 104 - fmul.d $ft7, $ft10, $ft7 - fst.d $ft7, $s7, 112 - fmul.d $ft8, $ft10, $ft8 - fst.d $ft8, $s7, 120 - fld.d $ft10, $fp, 408 - fld.d $ft11, $s8, 0 - fld.d $ft12, $s8, 24 - fld.d $ft13, $fp, 1632 + fld.d $ft1, $a0, %pc_lo12(.LCPI2_5) + vld $vr13, $s7, 96 + vld $vr14, $s7, 112 + fmadd.d $fa1, $fa2, $fa4, $fa1 + fdiv.d $fa1, $ft1, $fa1 + vreplvei.d $vr15, $vr1, 0 + vfmul.d $vr1, $vr15, $vr7 + vfmul.d $vr5, $vr15, $vr6 + vst $vr5, $s7, 0 + vst $vr1, $s7, 16 + vfmul.d $vr2, $vr15, $vr10 + vfmul.d $vr6, $vr15, $vr8 + vst $vr6, $s7, 32 + vst $vr2, $s7, 48 + vfmul.d $vr3, $vr15, $vr12 + vfmul.d $vr7, $vr15, $vr11 + vst $vr7, $s7, 64 + vst $vr3, $s7, 80 + vfmul.d $vr4, $vr15, $vr14 + vfmul.d $vr8, $vr15, $vr13 + vst $vr8, $s7, 96 + vst $vr4, $s7, 112 + fld.d $ft2, $fp, 408 + fld.d $ft3, $s8, 0 + fld.d $ft4, $s8, 24 + fld.d $ft5, $fp, 1632 move $a0, $zero - fmul.d $ft10, $ft10, $ft11 - fmadd.d $ft10, $ft10, $ft12, $fa0 - fmul.d $ft13, $ft12, $ft13 - fld.d $ft14, $fp, 816 - fld.d $ft15, $s8, 8 - fld.d $fs0, $s8, 16 - fmadd.d $ft10, $ft13, $ft11, $ft10 - fld.d $ft11, $fp, 1224 - fmul.d $ft13, $ft14, $ft15 - fmadd.d $ft10, $ft13, $fs0, $ft10 - fld.d $ft13, $fp, 952 - fmul.d $ft11, $fs0, $ft11 - fld.d $ft14, $fp, 1768 - fmadd.d $ft10, $ft11, $ft15, $ft10 - fmul.d $ft11, $ft15, $ft13 - fmadd.d $ft10, $ft11, $ft12, $ft10 - fmul.d $ft11, $ft12, $ft14 - fmadd.d $ft10, $ft11, $ft15, $ft10 - fdiv.d $ft9, $ft9, $ft10 - vreplvei.d $vr17, $vr17, 0 + fmul.d $ft2, $ft2, $ft3 + fmadd.d $ft2, $ft2, $ft4, $fa0 + fmul.d $ft5, $ft4, $ft5 + fld.d $ft6, $fp, 816 + fld.d $ft7, $s8, 8 + fld.d $ft8, $s8, 16 + fmadd.d $ft2, $ft5, $ft3, $ft2 + fld.d $ft3, $fp, 1224 + fmul.d $ft5, $ft6, $ft7 + fmadd.d $ft2, $ft5, $ft8, $ft2 + fld.d $ft5, $fp, 952 + fmul.d $ft3, $ft8, $ft3 + fld.d $ft6, $fp, 1768 + fmadd.d $ft2, $ft3, $ft7, $ft2 + fmul.d $ft3, $ft7, $ft5 + fmadd.d $ft2, $ft3, $ft4, $ft2 + fmul.d $ft3, $ft4, $ft6 + fmadd.d $ft2, $ft3, $ft7, $ft2 + fdiv.d $ft1, $ft1, $ft2 + vreplvei.d $vr9, $vr9, 0 ori $a1, $zero, 2048 .p2align 4, , 16 .LBB2_326: # %.preheader1106 # =>This Inner Loop Header: Depth=1 - vldx $vr18, $fp, $a0 + vldx $vr10, $fp, $a0 add.d $a2, $fp, $a0 - vld $vr19, $a2, 16 - vfmul.d $vr18, $vr17, $vr18 - vstx $vr18, $fp, $a0 - vld $vr18, $a2, 48 - vld $vr20, $a2, 32 - vfmul.d $vr19, $vr17, $vr19 - vst $vr19, $a2, 16 - vfmul.d $vr18, $vr17, $vr18 - vfmul.d $vr19, $vr17, $vr20 - vld $vr20, $a2, 64 - vst $vr19, $a2, 32 - vst $vr18, $a2, 48 - vld $vr18, $a2, 80 - vfmul.d $vr19, $vr17, $vr20 - vst $vr19, $a2, 64 - vld $vr19, $a2, 112 - vld $vr20, $a2, 96 - vfmul.d $vr18, $vr17, $vr18 - vst $vr18, $a2, 80 - vfmul.d $vr18, $vr17, $vr19 - vfmul.d $vr19, $vr17, $vr20 - vst $vr19, $a2, 96 + vld $vr11, $a2, 16 + vfmul.d $vr10, $vr9, $vr10 + vstx $vr10, $fp, $a0 + vld $vr10, $a2, 32 + vfmul.d $vr11, $vr9, $vr11 + vst $vr11, $a2, 16 + vld $vr11, $a2, 48 + vfmul.d $vr10, $vr9, $vr10 + vst $vr10, $a2, 32 + vld $vr10, $a2, 64 + vfmul.d $vr11, $vr9, $vr11 + vst $vr11, $a2, 48 + vld $vr11, $a2, 80 + vfmul.d $vr10, $vr9, $vr10 + vst $vr10, $a2, 64 + vld $vr10, $a2, 96 + vfmul.d $vr11, $vr9, $vr11 + vld $vr12, $a2, 112 + vst $vr11, $a2, 80 + vfmul.d $vr10, $vr9, $vr10 + vst $vr10, $a2, 96 + vfmul.d $vr10, $vr9, $vr12 addi.d $a0, $a0, 128 - vst $vr18, $a2, 112 + vst $vr10, $a2, 112 bne $a0, $a1, .LBB2_326 # %bb.327: # %.preheader1105 ld.w $a1, $s4, 0 move $a0, $zero - movgr2fr.w $ft9, $a1 - ffint.d.w $ft9, $ft9 - fsub.d $fa1, $fa1, $ft9 - fst.d $fa1, $s7, 0 - fsub.d $fa1, $fa2, $ft9 - fst.d $fa1, $s7, 8 - fsub.d $fa1, $fa3, $ft9 - fst.d $fa1, $s7, 16 - fsub.d $fa1, $fa4, $ft9 - fst.d $fa1, $s7, 24 - fsub.d $fa1, $fa5, $ft9 - fst.d $fa1, $s7, 32 - fsub.d $fa1, $fa6, $ft9 - fst.d $fa1, $s7, 40 - fsub.d $fa1, $fa7, $ft9 - fst.d $fa1, $s7, 48 - fsub.d $fa1, $ft0, $ft9 - fst.d $fa1, $s7, 56 - fsub.d $fa1, $ft1, $ft9 - fst.d $fa1, $s7, 64 - fsub.d $fa1, $ft2, $ft9 - fst.d $fa1, $s7, 72 - fsub.d $fa1, $ft3, $ft9 - fst.d $fa1, $s7, 80 - fsub.d $fa1, $ft4, $ft9 - fst.d $fa1, $s7, 88 - fsub.d $fa1, $ft5, $ft9 - fst.d $fa1, $s7, 96 - fsub.d $fa1, $ft6, $ft9 - fst.d $fa1, $s7, 104 - fsub.d $fa1, $ft7, $ft9 - fst.d $fa1, $s7, 112 - fsub.d $fa1, $ft8, $ft9 - fst.d $fa1, $s7, 120 - vreplvei.d $vr1, $vr17, 0 + movgr2fr.w $ft1, $a1 + ffint.d.w $ft1, $ft1 + vreplvei.d $vr9, $vr9, 0 + vfsub.d $vr5, $vr5, $vr9 + vfsub.d $vr1, $vr1, $vr9 + vst $vr1, $s7, 16 + vst $vr5, $s7, 0 + vfsub.d $vr1, $vr6, $vr9 + vfsub.d $vr2, $vr2, $vr9 + vst $vr2, $s7, 48 + vst $vr1, $s7, 32 + vfsub.d $vr1, $vr7, $vr9 + vfsub.d $vr2, $vr3, $vr9 + vst $vr2, $s7, 80 + vst $vr1, $s7, 64 + vfsub.d $vr1, $vr8, $vr9 + vfsub.d $vr2, $vr4, $vr9 + vst $vr2, $s7, 112 + vst $vr1, $s7, 96 ori $a1, $zero, 2048 .p2align 4, , 16 .LBB2_328: # %.preheader1102 # =>This Inner Loop Header: Depth=1 - vldx $vr2, $fp, $a0 + vldx $vr1, $fp, $a0 add.d $a2, $fp, $a0 - vld $vr3, $a2, 16 - vfsub.d $vr2, $vr2, $vr1 - vstx $vr2, $fp, $a0 + vld $vr2, $a2, 16 + vfsub.d $vr1, $vr1, $vr9 + vstx $vr1, $fp, $a0 + vld $vr1, $a2, 32 + vfsub.d $vr2, $vr2, $vr9 + vst $vr2, $a2, 16 vld $vr2, $a2, 48 - vld $vr4, $a2, 32 - vfsub.d $vr3, $vr3, $vr1 - vst $vr3, $a2, 16 - vfsub.d $vr2, $vr2, $vr1 - vfsub.d $vr3, $vr4, $vr1 - vld $vr4, $a2, 64 - vst $vr3, $a2, 32 + vfsub.d $vr1, $vr1, $vr9 + vst $vr1, $a2, 32 + vld $vr1, $a2, 64 + vfsub.d $vr2, $vr2, $vr9 vst $vr2, $a2, 48 vld $vr2, $a2, 80 - vfsub.d $vr3, $vr4, $vr1 - vst $vr3, $a2, 64 + vfsub.d $vr1, $vr1, $vr9 + vst $vr1, $a2, 64 + vld $vr1, $a2, 96 + vfsub.d $vr2, $vr2, $vr9 vld $vr3, $a2, 112 - vld $vr4, $a2, 96 - vfsub.d $vr2, $vr2, $vr1 vst $vr2, $a2, 80 - vfsub.d $vr2, $vr3, $vr1 - vfsub.d $vr3, $vr4, $vr1 - vst $vr3, $a2, 96 + vfsub.d $vr1, $vr1, $vr9 + vst $vr1, $a2, 96 + vfsub.d $vr1, $vr3, $vr9 addi.d $a0, $a0, 128 - vst $vr2, $a2, 112 + vst $vr1, $a2, 112 bne $a0, $a1, .LBB2_328 # %bb.329: # %.preheader1100 fld.d $fa1, $s7, 0 @@ -10002,7 +9954,7 @@ constants: # @constants fld.d $ft15, $sp, 120 # 8-byte Folded Reload # kill: def $f0_64 killed $f0_64 def $vr0 b .LBB2_460 -.LBB2_484: # %call.sqrt2651 +.LBB2_484: # %call.sqrt2669 fmov.d $fa0, $fa1 fst.d $ft15, $sp, 128 # 8-byte Folded Spill fst.d $ft14, $sp, 120 # 8-byte Folded Spill diff --git a/results/MultiSource/Benchmarks/mafft/CMakeFiles/pairlocalalign.dir/fft.s b/results/MultiSource/Benchmarks/mafft/CMakeFiles/pairlocalalign.dir/fft.s index f321de87..8243f8e6 100644 --- a/results/MultiSource/Benchmarks/mafft/CMakeFiles/pairlocalalign.dir/fft.s +++ b/results/MultiSource/Benchmarks/mafft/CMakeFiles/pairlocalalign.dir/fft.s @@ -333,17 +333,14 @@ fft: # @fft # %bb.43: # %.lr.ph130 movgr2fr.d $fa0, $s4 ffint.d.l $fa0, $fa0 - addi.d $a0, $fp, 8 + vreplvei.d $vr0, $vr0, 0 .p2align 4, , 16 .LBB0_44: # =>This Inner Loop Header: Depth=1 - fld.d $fa1, $a0, -8 - fld.d $fa2, $a0, 0 - fdiv.d $fa1, $fa1, $fa0 - fst.d $fa1, $a0, -8 - fdiv.d $fa1, $fa2, $fa0 - fst.d $fa1, $a0, 0 + vld $vr1, $fp, 0 + vfdiv.d $vr1, $vr1, $vr0 + vst $vr1, $fp, 0 addi.d $s3, $s3, -1 - addi.d $a0, $a0, 16 + addi.d $fp, $fp, 16 bnez $s3, .LBB0_44 .LBB0_45: move $a0, $zero diff --git a/results/MultiSource/Benchmarks/mediabench/gsm/toast/CMakeFiles/toast.dir/gsm_decode.s b/results/MultiSource/Benchmarks/mediabench/gsm/toast/CMakeFiles/toast.dir/gsm_decode.s index d8fffb79..e7fb6bfd 100644 --- a/results/MultiSource/Benchmarks/mediabench/gsm/toast/CMakeFiles/toast.dir/gsm_decode.s +++ b/results/MultiSource/Benchmarks/mediabench/gsm/toast/CMakeFiles/toast.dir/gsm_decode.s @@ -48,35 +48,36 @@ gsm_decode: # @gsm_decode st.h $a3, $sp, 136 srli.d $a3, $a2, 7 bstrins.d $a3, $a4, 5, 1 - ld.bu $a4, $a1, 8 st.h $a3, $sp, 120 - bstrpick.d $a3, $a2, 6, 4 - bstrpick.d $a5, $a2, 3, 1 - srli.d $a6, $a4, 6 - bstrins.d $a6, $a2, 2, 2 + vinsgr2vr.b $vr0, $a2, 0 + vinsgr2vr.b $vr0, $a2, 1 + ld.bu $a3, $a1, 8 + vrepli.h $vr1, 260 + vsrl.b $vr0, $vr0, $vr1 + vandi.b $vr1, $vr0, 7 + srli.d $a4, $a3, 6 + bstrins.d $a4, $a2, 2, 2 ld.bu $a2, $a1, 9 - bstrpick.d $t0, $a4, 5, 3 - ld.bu $t1, $a1, 10 - andi $a4, $a4, 7 - srli.d $t2, $a2, 5 - bstrpick.d $t3, $a2, 4, 2 - srli.d $t4, $t1, 7 - bstrins.d $t4, $a2, 2, 1 - vinsgr2vr.b $vr1, $a3, 0 - vinsgr2vr.b $vr1, $a5, 1 - vinsgr2vr.b $vr1, $a6, 2 - vinsgr2vr.b $vr1, $t0, 3 - vinsgr2vr.b $vr1, $a4, 4 - vinsgr2vr.b $vr1, $t2, 5 - vinsgr2vr.b $vr1, $t3, 6 - vinsgr2vr.b $vr1, $t4, 7 + bstrpick.d $a5, $a3, 5, 3 + ld.bu $a6, $a1, 10 + andi $a3, $a3, 7 + srli.d $t0, $a2, 5 + bstrpick.d $t1, $a2, 4, 2 + srli.d $t2, $a6, 7 + bstrins.d $t2, $a2, 2, 1 + vinsgr2vr.b $vr1, $a4, 2 + vinsgr2vr.b $vr1, $a5, 3 + vinsgr2vr.b $vr1, $a3, 4 + vinsgr2vr.b $vr1, $t0, 5 + vinsgr2vr.b $vr1, $t1, 6 + vinsgr2vr.b $vr1, $t2, 7 vrepli.b $vr0, 0 vilvl.b $vr1, $vr0, $vr1 vst $vr1, $sp, 16 - bstrpick.d $a2, $t1, 6, 4 + bstrpick.d $a2, $a6, 6, 4 st.h $a2, $sp, 32 ld.bu $a2, $a1, 12 - bstrpick.d $a3, $t1, 3, 1 + bstrpick.d $a3, $a6, 3, 1 ld.bu $a4, $a1, 13 st.h $a3, $sp, 34 srli.d $a3, $a2, 1 @@ -92,7 +93,7 @@ gsm_decode: # @gsm_decode bstrins.d $a5, $a4, 5, 1 st.h $a5, $sp, 122 srli.d $a4, $a3, 6 - bstrins.d $a4, $t1, 2, 2 + bstrins.d $a4, $a6, 2, 2 st.h $a4, $sp, 36 vinsgr2vr.b $vr1, $a3, 0 vinsgr2vr.b $vr1, $a2, 1 diff --git a/results/MultiSource/Benchmarks/mediabench/gsm/toast/CMakeFiles/toast.dir/lpc.s b/results/MultiSource/Benchmarks/mediabench/gsm/toast/CMakeFiles/toast.dir/lpc.s index f17a0b2d..9ff6ebc4 100644 --- a/results/MultiSource/Benchmarks/mediabench/gsm/toast/CMakeFiles/toast.dir/lpc.s +++ b/results/MultiSource/Benchmarks/mediabench/gsm/toast/CMakeFiles/toast.dir/lpc.s @@ -69,26 +69,26 @@ .type Gsm_LPC_Analysis,@function Gsm_LPC_Analysis: # @Gsm_LPC_Analysis # %bb.0: # %vector.ph - addi.d $sp, $sp, -160 - st.d $ra, $sp, 152 # 8-byte Folded Spill - st.d $fp, $sp, 144 # 8-byte Folded Spill - st.d $s0, $sp, 136 # 8-byte Folded Spill - st.d $s1, $sp, 128 # 8-byte Folded Spill - st.d $s2, $sp, 120 # 8-byte Folded Spill - st.d $s3, $sp, 112 # 8-byte Folded Spill - st.d $s4, $sp, 104 # 8-byte Folded Spill - st.d $s5, $sp, 96 # 8-byte Folded Spill - st.d $s6, $sp, 88 # 8-byte Folded Spill - st.d $s7, $sp, 80 # 8-byte Folded Spill - st.d $s8, $sp, 72 # 8-byte Folded Spill - fst.d $fs0, $sp, 64 # 8-byte Folded Spill + addi.d $sp, $sp, -224 + st.d $ra, $sp, 216 # 8-byte Folded Spill + st.d $fp, $sp, 208 # 8-byte Folded Spill + st.d $s0, $sp, 200 # 8-byte Folded Spill + st.d $s1, $sp, 192 # 8-byte Folded Spill + st.d $s2, $sp, 184 # 8-byte Folded Spill + st.d $s3, $sp, 176 # 8-byte Folded Spill + st.d $s4, $sp, 168 # 8-byte Folded Spill + st.d $s5, $sp, 160 # 8-byte Folded Spill + st.d $s6, $sp, 152 # 8-byte Folded Spill + st.d $s7, $sp, 144 # 8-byte Folded Spill + st.d $s8, $sp, 136 # 8-byte Folded Spill + fst.d $fs0, $sp, 128 # 8-byte Folded Spill move $s0, $a1 vld $vr2, $a1, 0 vld $vr3, $a1, 16 vslti.h $vr4, $vr2, 0 vslti.h $vr5, $vr3, 0 - lu12i.w $a0, 8 - vreplgr2vr.h $vr0, $a0 + lu12i.w $s2, 8 + vreplgr2vr.h $vr0, $s2 vseq.h $vr6, $vr2, $vr0 vseq.h $vr7, $vr3, $vr0 vneg.h $vr8, $vr2 @@ -235,7 +235,7 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vmax.hu $vr0, $vr1, $vr0 vpickve2gr.h $a0, $vr0, 0 bstrpick.d $a0, $a0, 15, 0 - move $s2, $a2 + move $s8, $a2 beqz $a0, .LBB0_6 # %bb.1: slli.d $a0, $a0, 16 @@ -769,7 +769,7 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vst $vr18, $s0, 288 vst $vr16, $s0, 304 .LBB0_18: # %Autocorrelation.exit - st.d $s2, $sp, 16 # 8-byte Folded Spill + st.d $s8, $sp, 80 # 8-byte Folded Spill beqz $s1, .LBB0_21 # %bb.19: # %.preheader69.preheader.i vadd.d $vr0, $vr7, $vr0 @@ -788,76 +788,66 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vhaddw.q.d $vr5, $vr5, $vr5 vhaddw.q.d $vr6, $vr6, $vr6 vhaddw.q.d $vr7, $vr7, $vr7 - vpickve2gr.d $a0, $vr0, 0 - vpickve2gr.d $a1, $vr1, 0 - vpickve2gr.d $a2, $vr2, 0 - vpickve2gr.d $a3, $vr3, 0 - vpickve2gr.d $a4, $vr4, 0 - vpickve2gr.d $a5, $vr5, 0 - vpickve2gr.d $a6, $vr6, 0 - vpickve2gr.d $a7, $vr7, 0 - slli.d $fp, $a7, 1 - slli.d $s0, $a6, 1 - slli.d $s2, $a5, 1 - slli.d $s3, $a4, 1 - slli.d $s6, $a3, 1 - slli.d $s7, $a2, 1 - slli.d $s8, $a1, 1 - slli.d $s5, $a0, 1 + vextrins.d $vr0, $vr1, 16 + vextrins.d $vr2, $vr3, 16 + vextrins.d $vr4, $vr5, 16 + vextrins.d $vr6, $vr7, 16 + vslli.d $vr1, $vr6, 1 + vst $vr1, $sp, 48 # 16-byte Folded Spill + vslli.d $vr1, $vr4, 1 + vst $vr1, $sp, 64 # 16-byte Folded Spill + vslli.d $vr1, $vr2, 1 + vst $vr1, $sp, 16 # 16-byte Folded Spill + vslli.d $vr0, $vr0, 1 + vst $vr0, $sp, 32 # 16-byte Folded Spill move $a0, $s1 pcaddu18i $ra, %call36(gsm_norm) jirl $ra, $ra, 0 - sll.d $a1, $s1, $a0 - srli.d $a1, $a1, 16 + bstrpick.d $a2, $a0, 31, 0 + sll.d $a0, $s1, $a0 + srli.d $a1, $a0, 16 ext.w.h $a3, $a1 - sll.d $a2, $s5, $a0 - srli.d $s5, $a2, 16 - bstrpick.d $a2, $a2, 31, 16 - ext.w.h $a4, $s5 - sll.d $a5, $s8, $a0 - srli.d $a5, $a5, 16 - sll.d $a6, $s7, $a0 - srli.d $a6, $a6, 16 - sll.d $a7, $s6, $a0 - srli.d $a7, $a7, 16 - sll.d $t0, $s3, $a0 - srli.d $t0, $t0, 16 - sll.d $t1, $s2, $a0 - ld.d $t3, $sp, 16 # 8-byte Folded Reload - srli.d $t1, $t1, 16 - sll.d $t2, $s0, $a0 - srli.d $t2, $t2, 16 - sll.d $a0, $fp, $a0 - srli.d $a0, $a0, 16 - st.h $s5, $sp, 30 - st.h $a5, $sp, 32 - st.h $a6, $sp, 34 - st.h $a7, $sp, 36 - st.h $t0, $sp, 38 - st.h $t1, $sp, 40 - st.h $t2, $sp, 42 - st.h $s5, $sp, 48 - st.h $a5, $sp, 50 - st.h $a6, $sp, 52 - st.h $a7, $sp, 54 - st.h $t0, $sp, 56 - st.h $t1, $sp, 58 - st.h $t2, $sp, 60 - st.h $a0, $sp, 62 - slti $a0, $a4, 0 - lu12i.w $a4, 8 - xor $a2, $a2, $a4 + vreplgr2vr.d $vr0, $a2 + vld $vr1, $sp, 16 # 16-byte Folded Reload + vsll.d $vr1, $vr1, $vr0 + vld $vr2, $sp, 32 # 16-byte Folded Reload + vsll.d $vr2, $vr2, $vr0 + vld $vr3, $sp, 48 # 16-byte Folded Reload + vsll.d $vr3, $vr3, $vr0 + vld $vr4, $sp, 64 # 16-byte Folded Reload + vsll.d $vr0, $vr4, $vr0 + vsrli.d $vr0, $vr0, 16 + vsrli.d $vr3, $vr3, 16 + vsrli.d $vr2, $vr2, 16 + vsrli.d $vr1, $vr1, 16 + vpickev.w $vr1, $vr1, $vr2 + vpickev.w $vr0, $vr3, $vr0 + vpickev.h $vr2, $vr0, $vr1 + vpickve2gr.h $a0, $vr1, 0 + bstrpick.d $a2, $a0, 15, 0 + ext.w.h $fp, $a0 + vstelm.h $vr1, $sp, 94, 0 + vstelm.h $vr1, $sp, 96, 2 + vstelm.h $vr1, $sp, 98, 4 + vstelm.h $vr1, $sp, 100, 6 + vstelm.h $vr0, $sp, 102, 0 + vstelm.h $vr0, $sp, 104, 2 + vstelm.h $vr0, $sp, 106, 4 + vst $vr2, $sp, 112 + slti $a4, $fp, 0 + move $s3, $s2 + xor $a2, $a2, $s2 sltui $a2, $a2, 1 - sub.d $a4, $zero, $s5 - masknez $a4, $a4, $a2 + sub.d $a5, $zero, $a0 + masknez $a5, $a5, $a2 maskeqz $a2, $s4, $a2 - or $a2, $a2, $a4 - masknez $a4, $s5, $a0 - maskeqz $a0, $a2, $a0 - or $a0, $a0, $a4 + or $a2, $a2, $a5 + masknez $a0, $a0, $a4 + maskeqz $a2, $a2, $a4 + or $a0, $a2, $a0 ext.w.h $a0, $a0 ori $a2, $zero, 16 - move $s0, $t3 + move $s0, $s8 bge $a3, $a0, .LBB0_26 .LBB0_20: # %.lr.ph82.preheader.i move $a0, $s0 @@ -866,15 +856,15 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis jirl $ra, $ra, 0 b .LBB0_22 .LBB0_21: # %.preheader.preheader.i + move $s3, $s2 vrepli.b $vr0, 0 - vst $vr0, $s2, 0 + vst $vr0, $s8, 0 .LBB0_22: # %Reflection_coefficients.exit - ld.d $s2, $sp, 16 # 8-byte Folded Reload + ld.d $s2, $sp, 80 # 8-byte Folded Reload ld.hu $a1, $s2, 0 ext.w.h $a0, $a1 slti $a0, $a0, 0 - lu12i.w $fp, 8 - xor $a2, $a1, $fp + xor $a2, $a1, $s3 sltui $a2, $a2, 1 sub.d $a3, $zero, $a1 masknez $a3, $a3, $a2 @@ -902,13 +892,12 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis add.d $a2, $a1, $a2 b .LBB0_34 .LBB0_26: # %.lr.ph.preheader - move $s6, $zero - ori $fp, $zero, 8 - ori $s7, $zero, 0 - lu32i.d $s7, 32768 - lu12i.w $s3, -8 - ori $s8, $zero, 7 - move $s0, $t3 + move $s5, $zero + ori $s6, $zero, 0 + lu32i.d $s6, 32768 + lu12i.w $s2, -8 + ori $s7, $zero, 7 + move $s0, $s8 ori $a2, $zero, 1 b .LBB0_28 .p2align 4, , 16 @@ -916,97 +905,97 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis # in Loop: Header=BB0_28 Depth=1 mul.d $a0, $a1, $a0 slli.d $a0, $a0, 33 - add.d $a0, $a0, $s7 + add.d $a0, $a0, $s6 srai.d $a0, $a0, 48 add.d $a0, $a0, $s1 slt $a1, $a0, $s4 maskeqz $a0, $a0, $a1 masknez $a1, $s4, $a1 or $a0, $a0, $a1 - slt $a1, $s3, $a0 + slt $a1, $s2, $a0 maskeqz $a0, $a0, $a1 - masknez $a1, $s3, $a1 + masknez $a1, $s2, $a1 or $a1, $a0, $a1 - addi.d $a2, $s2, 1 + addi.d $a2, $s8, 1 addi.d $s0, $s0, 2 - addi.d $s6, $s6, 1 - ext.w.h $a0, $s5 + addi.d $s5, $s5, 1 + ext.w.h $a0, $fp slti $a0, $a0, 0 - bstrpick.d $a3, $s5, 15, 0 - lu12i.w $a4, 8 - xor $a3, $a3, $a4 + bstrpick.d $a3, $fp, 15, 0 + xor $a3, $a3, $s3 sltui $a3, $a3, 1 - sub.d $a4, $zero, $s5 + sub.d $a4, $zero, $fp masknez $a4, $a4, $a3 maskeqz $a3, $s4, $a3 or $a3, $a3, $a4 maskeqz $a3, $a3, $a0 - masknez $a0, $s5, $a0 + masknez $a0, $fp, $a0 or $a0, $a3, $a0 ext.w.h $a0, $a0 ext.w.h $a3, $a1 - addi.d $s8, $s8, -1 + addi.d $s7, $s7, -1 blt $a3, $a0, .LBB0_70 .LBB0_28: # %.lr.ph # =>This Loop Header: Depth=1 # Child Loop BB0_31 Depth 2 - move $s2, $a2 + move $s8, $a2 ext.w.h $a0, $a0 ext.w.h $s1, $a1 move $a1, $s1 pcaddu18i $ra, %call36(gsm_div) jirl $ra, $ra, 0 - ext.w.h $a1, $s5 + ext.w.h $a1, $fp slt $a2, $zero, $a1 sub.d $a3, $zero, $a0 masknez $a0, $a0, $a2 maskeqz $a2, $a3, $a2 or $a0, $a2, $a0 st.h $a0, $s0, 0 - beq $s2, $fp, .LBB0_22 + ori $a2, $zero, 8 + beq $s8, $a2, .LBB0_22 # %bb.29: # in Loop: Header=BB0_28 Depth=1 ext.w.h $a0, $a0 ori $a2, $zero, 7 - bltu $a2, $s2, .LBB0_27 + bltu $a2, $s8, .LBB0_27 # %bb.30: # %.lr.ph.i # in Loop: Header=BB0_28 Depth=1 ori $a3, $zero, 1 - sltu $a2, $a3, $s8 + sltu $a2, $a3, $s7 masknez $a3, $a3, $a2 - maskeqz $a2, $s8, $a2 + maskeqz $a2, $s7, $a2 or $a2, $a2, $a3 slli.d $a3, $a0, 33 - addi.d $a4, $sp, 30 - addi.d $a5, $sp, 50 + addi.d $a4, $sp, 94 + addi.d $a5, $sp, 114 .p2align 4, , 16 .LBB0_31: # Parent Loop BB0_28 Depth=1 # => This Inner Loop Header: Depth=2 ld.h $a6, $a4, 0 ld.h $a7, $a5, 0 mul.d $t0, $a3, $a6 - add.d $t0, $t0, $s7 + add.d $t0, $t0, $s6 srai.d $t0, $t0, 48 add.d $t0, $t0, $a7 slt $t1, $t0, $s4 maskeqz $t0, $t0, $t1 masknez $t1, $s4, $t1 or $t0, $t0, $t1 - slt $t1, $s3, $t0 + slt $t1, $s2, $t0 maskeqz $t0, $t0, $t1 - masknez $t1, $s3, $t1 + masknez $t1, $s2, $t1 or $t0, $t0, $t1 st.h $t0, $a5, -2 mul.d $a7, $a3, $a7 - add.d $a7, $a7, $s7 + add.d $a7, $a7, $s6 srai.d $a7, $a7, 48 add.d $a6, $a7, $a6 slt $a7, $a6, $s4 maskeqz $a6, $a6, $a7 masknez $a7, $s4, $a7 or $a6, $a6, $a7 - slt $a7, $s3, $a6 + slt $a7, $s2, $a6 maskeqz $a6, $a6, $a7 - masknez $a7, $s3, $a7 + masknez $a7, $s2, $a7 or $a6, $a6, $a7 st.h $a6, $a4, 0 addi.d $a2, $a2, -1 @@ -1015,7 +1004,7 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis bnez $a2, .LBB0_31 # %bb.32: # %._crit_edge.i.loopexit # in Loop: Header=BB0_28 Depth=1 - ld.hu $s5, $sp, 48 + ld.hu $fp, $sp, 112 b .LBB0_27 .LBB0_33: lu12i.w $a2, 6 @@ -1025,7 +1014,7 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis ld.hu $a4, $s2, 2 ext.w.h $a1, $a4 slti $a1, $a1, 0 - xor $a5, $a4, $fp + xor $a5, $a4, $s3 sltui $a5, $a5, 1 sub.d $a6, $zero, $a4 masknez $a6, $a6, $a5 @@ -1057,7 +1046,7 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis ld.hu $a6, $s2, 4 ext.w.h $a4, $a6 slti $a4, $a4, 0 - xor $a7, $a6, $fp + xor $a7, $a6, $s3 sltui $a7, $a7, 1 sub.d $t0, $zero, $a6 masknez $t0, $t0, $a7 @@ -1088,7 +1077,7 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis ld.hu $t0, $s2, 6 ext.w.h $a7, $t0 slti $a7, $a7, 0 - xor $t1, $t0, $fp + xor $t1, $t0, $s3 sltui $t1, $t1, 1 sub.d $t2, $zero, $t0 masknez $t2, $t2, $t1 @@ -1119,7 +1108,7 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis ld.hu $t2, $s2, 8 ext.w.h $t0, $t2 slti $t0, $t0, 0 - xor $t3, $t2, $fp + xor $t3, $t2, $s3 sltui $t3, $t3, 1 sub.d $t4, $zero, $t2 masknez $t4, $t4, $t3 @@ -1150,7 +1139,7 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis ld.hu $t4, $s2, 10 ext.w.h $t2, $t4 slti $t2, $t2, 0 - xor $t5, $t4, $fp + xor $t5, $t4, $s3 sltui $t5, $t5, 1 sub.d $t6, $zero, $t4 masknez $t6, $t6, $t5 @@ -1181,7 +1170,7 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis ld.hu $t6, $s2, 12 ext.w.h $t4, $t6 slti $t4, $t4, 0 - xor $t7, $t6, $fp + xor $t7, $t6, $s3 sltui $t7, $t7, 1 sub.d $t8, $zero, $t6 masknez $t8, $t8, $t7 @@ -1212,7 +1201,7 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis ld.hu $t8, $s2, 14 ext.w.h $t6, $t8 slti $t6, $t6, 0 - xor $fp, $t8, $fp + xor $fp, $t8, $s3 sltui $fp, $fp, 1 sub.d $s0, $zero, $t8 masknez $s0, $s0, $fp @@ -1335,25 +1324,25 @@ Gsm_LPC_Analysis: # @Gsm_LPC_Analysis vadd.h $vr0, $vr0, $vr4 vbitsel.v $vr0, $vr0, $vr5, $vr3 vst $vr0, $s2, 0 - fld.d $fs0, $sp, 64 # 8-byte Folded Reload - ld.d $s8, $sp, 72 # 8-byte Folded Reload - ld.d $s7, $sp, 80 # 8-byte Folded Reload - ld.d $s6, $sp, 88 # 8-byte Folded Reload - ld.d $s5, $sp, 96 # 8-byte Folded Reload - ld.d $s4, $sp, 104 # 8-byte Folded Reload - ld.d $s3, $sp, 112 # 8-byte Folded Reload - ld.d $s2, $sp, 120 # 8-byte Folded Reload - ld.d $s1, $sp, 128 # 8-byte Folded Reload - ld.d $s0, $sp, 136 # 8-byte Folded Reload - ld.d $fp, $sp, 144 # 8-byte Folded Reload - ld.d $ra, $sp, 152 # 8-byte Folded Reload - addi.d $sp, $sp, 160 + fld.d $fs0, $sp, 128 # 8-byte Folded Reload + ld.d $s8, $sp, 136 # 8-byte Folded Reload + ld.d $s7, $sp, 144 # 8-byte Folded Reload + ld.d $s6, $sp, 152 # 8-byte Folded Reload + ld.d $s5, $sp, 160 # 8-byte Folded Reload + ld.d $s4, $sp, 168 # 8-byte Folded Reload + ld.d $s3, $sp, 176 # 8-byte Folded Reload + ld.d $s2, $sp, 184 # 8-byte Folded Reload + ld.d $s1, $sp, 192 # 8-byte Folded Reload + ld.d $s0, $sp, 200 # 8-byte Folded Reload + ld.d $fp, $sp, 208 # 8-byte Folded Reload + ld.d $ra, $sp, 216 # 8-byte Folded Reload + addi.d $sp, $sp, 224 ret .LBB0_70: # %.preheader64.i ori $a0, $zero, 7 - bltu $a0, $s2, .LBB0_22 + bltu $a0, $s8, .LBB0_22 # %bb.71: - slli.d $a0, $s6, 1 + slli.d $a0, $s5, 1 ori $a1, $zero, 16 sub.d $a2, $a1, $a0 b .LBB0_20 diff --git a/results/MultiSource/Benchmarks/mediabench/gsm/toast/CMakeFiles/toast.dir/short_term.s b/results/MultiSource/Benchmarks/mediabench/gsm/toast/CMakeFiles/toast.dir/short_term.s index 0250c0e1..8dc262de 100644 --- a/results/MultiSource/Benchmarks/mediabench/gsm/toast/CMakeFiles/toast.dir/short_term.s +++ b/results/MultiSource/Benchmarks/mediabench/gsm/toast/CMakeFiles/toast.dir/short_term.s @@ -1,19 +1,31 @@ .file "short_term.c" + .section .rodata.cst16,"aM",@progbits,16 + .p2align 4, 0x0 # -- Begin function Gsm_Short_Term_Analysis_Filter +.LCPI0_0: + .half 0 # 0x0 + .half 4 # 0x4 + .half 65535 # 0xffff + .half 65535 # 0xffff + .half 65535 # 0xffff + .half 65535 # 0xffff + .half 65535 # 0xffff + .half 65535 # 0xffff .text - .globl Gsm_Short_Term_Analysis_Filter # -- Begin function Gsm_Short_Term_Analysis_Filter + .globl Gsm_Short_Term_Analysis_Filter .p2align 5 .type Gsm_Short_Term_Analysis_Filter,@function Gsm_Short_Term_Analysis_Filter: # @Gsm_Short_Term_Analysis_Filter # %bb.0: # %vector.ph - addi.d $sp, $sp, -96 - st.d $ra, $sp, 88 # 8-byte Folded Spill - st.d $fp, $sp, 80 # 8-byte Folded Spill - st.d $s0, $sp, 72 # 8-byte Folded Spill - st.d $s1, $sp, 64 # 8-byte Folded Spill - st.d $s2, $sp, 56 # 8-byte Folded Spill - st.d $s3, $sp, 48 # 8-byte Folded Spill - st.d $s4, $sp, 40 # 8-byte Folded Spill - st.d $s5, $sp, 32 # 8-byte Folded Spill + addi.d $sp, $sp, -112 + st.d $ra, $sp, 104 # 8-byte Folded Spill + st.d $fp, $sp, 96 # 8-byte Folded Spill + st.d $s0, $sp, 88 # 8-byte Folded Spill + st.d $s1, $sp, 80 # 8-byte Folded Spill + st.d $s2, $sp, 72 # 8-byte Folded Spill + st.d $s3, $sp, 64 # 8-byte Folded Spill + st.d $s4, $sp, 56 # 8-byte Folded Spill + st.d $s5, $sp, 48 # 8-byte Folded Spill + fst.d $fs0, $sp, 40 # 8-byte Folded Spill move $s1, $a0 ld.h $a3, $a0, 628 move $fp, $a2 @@ -40,409 +52,391 @@ Gsm_Short_Term_Analysis_Filter: # @Gsm_Short_Term_Analysis_Filter vreplgr2vr.w $vr3, $a0 vsra.h $vr0, $vr0, $vr3 vadd.h $vr0, $vr0, $vr2 - vadd.h $vr9, $vr0, $vr1 - vslti.h $vr10, $vr9, 0 + vadd.h $vr0, $vr0, $vr1 + vslti.h $vr10, $vr0, 0 lu12i.w $a0, 2 ori $a1, $a0, 2867 - vreplgr2vr.h $vr1, $a1 - vslt.hu $vr11, $vr9, $vr1 - vsrli.h $vr0, $vr9, 2 + vreplgr2vr.h $vr2, $a1 + vslt.hu $vr11, $vr0, $vr2 + vsrli.h $vr1, $vr0, 2 lu12i.w $a1, 6 ori $a2, $a1, 1536 - vreplgr2vr.h $vr3, $a2 - vadd.h $vr12, $vr0, $vr3 + vreplgr2vr.h $vr4, $a2 + vadd.h $vr12, $vr1, $vr4 lu12i.w $a2, 13 ori $a2, $a2, 1229 - vreplgr2vr.h $vr0, $a2 - vadd.h $vr2, $vr9, $vr0 + vreplgr2vr.h $vr1, $a2 + vadd.h $vr3, $vr0, $vr1 ori $a0, $a0, 819 - vreplgr2vr.h $vr5, $a0 - vslt.hu $vr13, $vr2, $vr5 - vadd.h $vr14, $vr9, $vr1 - vslli.h $vr15, $vr9, 1 - lu12i.w $a2, 8 - vreplgr2vr.h $vr6, $a2 - vseq.h $vr4, $vr9, $vr6 - vneg.h $vr7, $vr9 - lu12i.w $a0, 7 - ori $a0, $a0, 4095 - vreplgr2vr.h $vr2, $a0 - vbitsel.v $vr16, $vr7, $vr2, $vr4 - vslt.hu $vr17, $vr16, $vr1 - lu12i.w $a3, 4 - ori $a3, $a3, 3686 - vreplgr2vr.h $vr7, $a3 - vslt.hu $vr18, $vr16, $vr7 + vreplgr2vr.h $vr6, $a0 + vslt.hu $vr13, $vr3, $vr6 + vadd.h $vr14, $vr0, $vr2 + vslli.h $vr15, $vr0, 1 + lu12i.w $a0, 8 + vreplgr2vr.h $vr7, $a0 + vseq.h $vr5, $vr0, $vr7 + vneg.h $vr8, $vr0 + lu12i.w $a2, 7 + ori $a2, $a2, 4095 + vreplgr2vr.h $vr3, $a2 + vbitsel.v $vr16, $vr8, $vr3, $vr5 + vslt.hu $vr17, $vr16, $vr2 + lu12i.w $a2, 4 + ori $a2, $a2, 3686 + vreplgr2vr.h $vr8, $a2 + vslt.hu $vr18, $vr16, $vr8 ori $a1, $a1, 2047 - vreplgr2vr.h $vr4, $a1 - vslt.hu $vr19, $vr4, $vr16 - vxor.v $vr8, $vr19, $vr18 - vnor.v $vr8, $vr17, $vr8 - vand.v $vr20, $vr10, $vr8 + vreplgr2vr.h $vr5, $a1 + vslt.hu $vr19, $vr5, $vr16 + vxor.v $vr9, $vr19, $vr18 + vnor.v $vr9, $vr17, $vr9 + vand.v $vr20, $vr10, $vr9 vsrli.h $vr21, $vr16, 2 lu12i.w $a1, 9 ori $a1, $a1, 2560 - vreplgr2vr.h $vr8, $a1 - vsub.h $vr21, $vr8, $vr21 + vreplgr2vr.h $vr9, $a1 + vsub.h $vr21, $vr9, $vr21 vxor.v $vr18, $vr17, $vr18 vand.v $vr18, $vr10, $vr18 - vsub.h $vr22, $vr0, $vr16 + vsub.h $vr22, $vr1, $vr16 vand.v $vr17, $vr10, $vr17 vslli.h $vr16, $vr16, 1 vneg.h $vr16, $vr16 - vslt.h $vr9, $vr4, $vr9 - vand.v $vr10, $vr10, $vr19 - vbitsel.v $vr9, $vr12, $vr2, $vr9 - vbitsel.v $vr9, $vr9, $vr14, $vr13 - vbitsel.v $vr9, $vr9, $vr15, $vr11 - vbitsel.v $vr11, $vr9, $vr21, $vr20 - ori $a1, $a2, 1 - vreplgr2vr.h $vr9, $a1 - vbitsel.v $vr10, $vr11, $vr9, $vr10 - vbitsel.v $vr10, $vr10, $vr22, $vr18 - vbitsel.v $vr10, $vr10, $vr16, $vr17 - vst $vr10, $sp, 16 - addi.d $a1, $s1, 580 - ori $a3, $zero, 13 - addi.d $a4, $sp, 16 - ori $a5, $zero, 0 - lu32i.d $a5, 32768 - lu12i.w $a2, -8 - ori $a6, $zero, 16 - move $a7, $fp + vslt.h $vr0, $vr5, $vr0 + vand.v $vr19, $vr10, $vr19 + vbitsel.v $vr0, $vr12, $vr3, $vr0 + vbitsel.v $vr0, $vr0, $vr14, $vr13 + vbitsel.v $vr0, $vr0, $vr15, $vr11 + vbitsel.v $vr0, $vr0, $vr21, $vr20 + ori $a0, $a0, 1 + vreplgr2vr.h $vr10, $a0 + vbitsel.v $vr0, $vr0, $vr10, $vr19 + vbitsel.v $vr0, $vr0, $vr22, $vr18 + vbitsel.v $vr0, $vr0, $vr16, $vr17 + vst $vr0, $sp, 16 + addi.d $a0, $s1, 580 + ori $a1, $zero, 13 + addi.d $a2, $sp, 16 + ori $a3, $zero, 0 + pcalau12i $a4, %pc_hi20(.LCPI0_0) + vld $vr0, $a4, %pc_lo12(.LCPI0_0) + lu32i.d $a3, 32768 + vreplgr2vr.d $vr11, $a3 + ori $a3, $zero, 16 + move $a4, $fp .p2align 4, , 16 .LBB0_1: # =>This Loop Header: Depth=1 # Child Loop BB0_2 Depth 2 - ld.hu $t1, $a7, 0 - move $t0, $zero - move $t2, $t1 + ld.h $a6, $a4, 0 + move $a5, $zero + vinsgr2vr.h $vr12, $a6, 0 + vinsgr2vr.h $vr12, $a6, 1 .p2align 4, , 16 .LBB0_2: # Parent Loop BB0_1 Depth=1 # => This Inner Loop Header: Depth=2 - ldx.h $t3, $t0, $a4 - ldx.h $t4, $a1, $t0 - stx.h $t1, $a1, $t0 - ext.w.h $t2, $t2 - mul.d $t1, $t2, $t3 - slli.d $t1, $t1, 33 - add.d $t1, $t1, $a5 - srai.d $t1, $t1, 48 - add.d $t1, $t1, $t4 - slt $t5, $t1, $a0 - maskeqz $t1, $t1, $t5 - masknez $t5, $a0, $t5 - or $t1, $t1, $t5 - slt $t5, $a2, $t1 - maskeqz $t1, $t1, $t5 - masknez $t5, $a2, $t5 - or $t1, $t1, $t5 - mul.d $t3, $t4, $t3 - slli.d $t3, $t3, 33 - add.d $t3, $t3, $a5 - srai.d $t3, $t3, 48 - add.d $t2, $t3, $t2 - slt $t3, $t2, $a0 - maskeqz $t2, $t2, $t3 - masknez $t3, $a0, $t3 - or $t2, $t2, $t3 - slt $t3, $a2, $t2 - maskeqz $t2, $t2, $t3 - masknez $t3, $a2, $t3 - addi.d $t0, $t0, 2 - or $t2, $t2, $t3 - bne $t0, $a6, .LBB0_2 + ldx.h $a6, $a0, $a5 + add.d $a7, $a0, $a5 + ldx.h $t0, $a5, $a2 + vstelm.h $vr12, $a7, 0, 1 + vinsgr2vr.h $vr13, $a6, 0 + vpickev.h $vr13, $vr12, $vr13 + vslli.d $vr13, $vr13, 48 + vsrai.d $vr13, $vr13, 48 + vreplgr2vr.d $vr14, $t0 + vmul.d $vr13, $vr13, $vr14 + vslli.d $vr13, $vr13, 33 + vadd.d $vr13, $vr13, $vr11 + vsrli.d $vr13, $vr13, 48 + vori.b $vr14, $vr0, 0 + vshuf.h $vr14, $vr0, $vr13 + vinsgr2vr.h $vr12, $a6, 1 + vadd.h $vr13, $vr14, $vr12 + vslt.h $vr14, $vr13, $vr14 + vslti.h $vr12, $vr12, 0 + vxor.v $vr12, $vr12, $vr14 + vsrai.h $vr14, $vr13, 15 + vbitrevi.h $vr14, $vr14, 15 + addi.d $a5, $a5, 2 + vbitsel.v $vr12, $vr13, $vr14, $vr12 + bne $a5, $a3, .LBB0_2 # %bb.3: # in Loop: Header=BB0_1 Depth=1 - addi.w $a3, $a3, -1 - st.h $t2, $a7, 0 - addi.d $a7, $a7, 2 - bnez $a3, .LBB0_1 + addi.w $a1, $a1, -1 + vstelm.h $vr12, $a4, 0, 0 + addi.d $a4, $a4, 2 + bnez $a1, .LBB0_1 # %bb.4: # %Short_term_analysis_filtering.exit - vld $vr10, $s2, 0 - vld $vr11, $s0, 0 - vsrai.h $vr10, $vr10, 1 + vld $vr11, $s2, 0 + vld $vr12, $s0, 0 vsrai.h $vr11, $vr11, 1 - vadd.h $vr10, $vr11, $vr10 - vslti.h $vr11, $vr10, 0 - vslt.hu $vr12, $vr10, $vr1 - vsrli.h $vr13, $vr10, 2 - vadd.h $vr13, $vr13, $vr3 - vadd.h $vr14, $vr10, $vr0 - vslt.hu $vr14, $vr14, $vr5 - vadd.h $vr15, $vr10, $vr1 - vslli.h $vr16, $vr10, 1 - vseq.h $vr17, $vr10, $vr6 - vneg.h $vr18, $vr10 - vbitsel.v $vr17, $vr18, $vr2, $vr17 - vslt.hu $vr18, $vr17, $vr1 - vslt.hu $vr19, $vr17, $vr7 - vslt.hu $vr20, $vr4, $vr17 - vxor.v $vr21, $vr20, $vr19 - vnor.v $vr21, $vr18, $vr21 - vand.v $vr21, $vr11, $vr21 - vsrli.h $vr22, $vr17, 2 - vsub.h $vr22, $vr8, $vr22 - vxor.v $vr19, $vr18, $vr19 - vand.v $vr19, $vr11, $vr19 - vsub.h $vr23, $vr0, $vr17 - vand.v $vr18, $vr11, $vr18 - vslli.h $vr17, $vr17, 1 - vneg.h $vr17, $vr17 - vslt.h $vr10, $vr4, $vr10 - vand.v $vr11, $vr11, $vr20 - vbitsel.v $vr10, $vr13, $vr2, $vr10 - vbitsel.v $vr10, $vr10, $vr15, $vr14 - vbitsel.v $vr10, $vr10, $vr16, $vr12 - vbitsel.v $vr10, $vr10, $vr22, $vr21 - vbitsel.v $vr10, $vr10, $vr9, $vr11 - vbitsel.v $vr10, $vr10, $vr23, $vr19 - vbitsel.v $vr10, $vr10, $vr17, $vr18 - vst $vr10, $sp, 16 - addi.d $a3, $fp, 26 - ori $a4, $zero, 14 - addi.d $a5, $sp, 16 - ori $a6, $zero, 0 - lu32i.d $a6, 32768 - ori $a7, $zero, 16 + vsrai.h $vr12, $vr12, 1 + vadd.h $vr11, $vr12, $vr11 + vslti.h $vr12, $vr11, 0 + vslt.hu $vr13, $vr11, $vr2 + vsrli.h $vr14, $vr11, 2 + vadd.h $vr14, $vr14, $vr4 + vadd.h $vr15, $vr11, $vr1 + vslt.hu $vr15, $vr15, $vr6 + vadd.h $vr16, $vr11, $vr2 + vslli.h $vr17, $vr11, 1 + vseq.h $vr18, $vr11, $vr7 + vneg.h $vr19, $vr11 + vbitsel.v $vr18, $vr19, $vr3, $vr18 + vslt.hu $vr19, $vr18, $vr2 + vslt.hu $vr20, $vr18, $vr8 + vslt.hu $vr21, $vr5, $vr18 + vxor.v $vr22, $vr21, $vr20 + vnor.v $vr22, $vr19, $vr22 + vand.v $vr22, $vr12, $vr22 + vsrli.h $vr23, $vr18, 2 + vsub.h $vr23, $vr9, $vr23 + vxor.v $vr20, $vr19, $vr20 + vand.v $vr20, $vr12, $vr20 + vsub.h $vr24, $vr1, $vr18 + vand.v $vr19, $vr12, $vr19 + vslli.h $vr18, $vr18, 1 + vneg.h $vr18, $vr18 + vslt.h $vr11, $vr5, $vr11 + vand.v $vr12, $vr12, $vr21 + vbitsel.v $vr11, $vr14, $vr3, $vr11 + vbitsel.v $vr11, $vr11, $vr16, $vr15 + vbitsel.v $vr11, $vr11, $vr17, $vr13 + vbitsel.v $vr11, $vr11, $vr23, $vr22 + vbitsel.v $vr11, $vr11, $vr10, $vr12 + vbitsel.v $vr11, $vr11, $vr24, $vr20 + vbitsel.v $vr11, $vr11, $vr18, $vr19 + vst $vr11, $sp, 16 + addi.d $a1, $fp, 26 + ori $a2, $zero, 14 + addi.d $a3, $sp, 16 + ori $a4, $zero, 0 + lu32i.d $a4, 32768 + vreplgr2vr.d $vr11, $a4 + ori $a4, $zero, 16 .p2align 4, , 16 .LBB0_5: # =>This Loop Header: Depth=1 # Child Loop BB0_6 Depth 2 - ld.hu $t1, $a3, 0 - move $t0, $zero - move $t2, $t1 + ld.h $a6, $a1, 0 + move $a5, $zero + vinsgr2vr.h $vr12, $a6, 0 + vinsgr2vr.h $vr12, $a6, 1 .p2align 4, , 16 .LBB0_6: # Parent Loop BB0_5 Depth=1 # => This Inner Loop Header: Depth=2 - ldx.h $t3, $t0, $a5 - ldx.h $t4, $a1, $t0 - stx.h $t1, $a1, $t0 - ext.w.h $t2, $t2 - mul.d $t1, $t2, $t3 - slli.d $t1, $t1, 33 - add.d $t1, $t1, $a6 - srai.d $t1, $t1, 48 - add.d $t1, $t1, $t4 - slt $t5, $t1, $a0 - maskeqz $t1, $t1, $t5 - masknez $t5, $a0, $t5 - or $t1, $t1, $t5 - slt $t5, $a2, $t1 - maskeqz $t1, $t1, $t5 - masknez $t5, $a2, $t5 - or $t1, $t1, $t5 - mul.d $t3, $t4, $t3 - slli.d $t3, $t3, 33 - add.d $t3, $t3, $a6 - srai.d $t3, $t3, 48 - add.d $t2, $t3, $t2 - slt $t3, $t2, $a0 - maskeqz $t2, $t2, $t3 - masknez $t3, $a0, $t3 - or $t2, $t2, $t3 - slt $t3, $a2, $t2 - maskeqz $t2, $t2, $t3 - masknez $t3, $a2, $t3 - addi.d $t0, $t0, 2 - or $t2, $t2, $t3 - bne $t0, $a7, .LBB0_6 + ldx.h $a6, $a0, $a5 + add.d $a7, $a0, $a5 + ldx.h $t0, $a5, $a3 + vstelm.h $vr12, $a7, 0, 1 + vinsgr2vr.h $vr13, $a6, 0 + vpickev.h $vr13, $vr12, $vr13 + vslli.d $vr13, $vr13, 48 + vsrai.d $vr13, $vr13, 48 + vreplgr2vr.d $vr14, $t0 + vmul.d $vr13, $vr13, $vr14 + vslli.d $vr13, $vr13, 33 + vadd.d $vr13, $vr13, $vr11 + vsrli.d $vr13, $vr13, 48 + vori.b $vr14, $vr0, 0 + vshuf.h $vr14, $vr0, $vr13 + vinsgr2vr.h $vr12, $a6, 1 + vadd.h $vr13, $vr14, $vr12 + vslt.h $vr14, $vr13, $vr14 + vslti.h $vr12, $vr12, 0 + vxor.v $vr12, $vr12, $vr14 + vsrai.h $vr14, $vr13, 15 + vbitrevi.h $vr14, $vr14, 15 + addi.d $a5, $a5, 2 + vbitsel.v $vr12, $vr13, $vr14, $vr12 + bne $a5, $a4, .LBB0_6 # %bb.7: # in Loop: Header=BB0_5 Depth=1 - addi.w $a4, $a4, -1 - st.h $t2, $a3, 0 - addi.d $a3, $a3, 2 - bnez $a4, .LBB0_5 + addi.w $a2, $a2, -1 + vstelm.h $vr12, $a1, 0, 0 + addi.d $a1, $a1, 2 + bnez $a2, .LBB0_5 # %bb.8: # %Short_term_analysis_filtering.exit49 - vld $vr10, $s2, 0 - vld $vr11, $s0, 0 - vsrai.h $vr10, $vr10, 2 - vsrai.h $vr12, $vr11, 2 - vadd.h $vr10, $vr12, $vr10 - vsrai.h $vr11, $vr11, 1 - vadd.h $vr10, $vr10, $vr11 - vslti.h $vr11, $vr10, 0 - vslt.hu $vr12, $vr10, $vr1 - vsrli.h $vr13, $vr10, 2 - vadd.h $vr13, $vr13, $vr3 - vadd.h $vr14, $vr10, $vr0 - vslt.hu $vr14, $vr14, $vr5 - vadd.h $vr15, $vr10, $vr1 - vslli.h $vr16, $vr10, 1 - vseq.h $vr17, $vr10, $vr6 - vneg.h $vr18, $vr10 - vbitsel.v $vr17, $vr18, $vr2, $vr17 - vslt.hu $vr18, $vr17, $vr1 - vslt.hu $vr19, $vr17, $vr7 - vslt.hu $vr20, $vr4, $vr17 - vxor.v $vr21, $vr20, $vr19 - vnor.v $vr21, $vr18, $vr21 - vand.v $vr21, $vr11, $vr21 - vsrli.h $vr22, $vr17, 2 - vsub.h $vr22, $vr8, $vr22 - vxor.v $vr19, $vr18, $vr19 - vand.v $vr19, $vr11, $vr19 - vsub.h $vr23, $vr0, $vr17 - vand.v $vr18, $vr11, $vr18 - vslli.h $vr17, $vr17, 1 - vneg.h $vr17, $vr17 - vslt.h $vr10, $vr4, $vr10 - vand.v $vr11, $vr11, $vr20 - vbitsel.v $vr10, $vr13, $vr2, $vr10 - vbitsel.v $vr10, $vr10, $vr15, $vr14 - vbitsel.v $vr10, $vr10, $vr16, $vr12 - vbitsel.v $vr10, $vr10, $vr22, $vr21 - vbitsel.v $vr10, $vr10, $vr9, $vr11 - vbitsel.v $vr10, $vr10, $vr23, $vr19 - vbitsel.v $vr10, $vr10, $vr17, $vr18 - vst $vr10, $sp, 16 - addi.d $a3, $fp, 54 - ori $a4, $zero, 13 - addi.d $a5, $sp, 16 - ori $a6, $zero, 0 - lu32i.d $a6, 32768 - ori $a7, $zero, 16 + vld $vr11, $s2, 0 + vld $vr12, $s0, 0 + vsrai.h $vr11, $vr11, 2 + vsrai.h $vr13, $vr12, 2 + vadd.h $vr11, $vr13, $vr11 + vsrai.h $vr12, $vr12, 1 + vadd.h $vr11, $vr11, $vr12 + vslti.h $vr12, $vr11, 0 + vslt.hu $vr13, $vr11, $vr2 + vsrli.h $vr14, $vr11, 2 + vadd.h $vr14, $vr14, $vr4 + vadd.h $vr15, $vr11, $vr1 + vslt.hu $vr15, $vr15, $vr6 + vadd.h $vr16, $vr11, $vr2 + vslli.h $vr17, $vr11, 1 + vseq.h $vr18, $vr11, $vr7 + vneg.h $vr19, $vr11 + vbitsel.v $vr18, $vr19, $vr3, $vr18 + vslt.hu $vr19, $vr18, $vr2 + vslt.hu $vr20, $vr18, $vr8 + vslt.hu $vr21, $vr5, $vr18 + vxor.v $vr22, $vr21, $vr20 + vnor.v $vr22, $vr19, $vr22 + vand.v $vr22, $vr12, $vr22 + vsrli.h $vr23, $vr18, 2 + vsub.h $vr23, $vr9, $vr23 + vxor.v $vr20, $vr19, $vr20 + vand.v $vr20, $vr12, $vr20 + vsub.h $vr24, $vr1, $vr18 + vand.v $vr19, $vr12, $vr19 + vslli.h $vr18, $vr18, 1 + vneg.h $vr18, $vr18 + vslt.h $vr11, $vr5, $vr11 + vand.v $vr12, $vr12, $vr21 + vbitsel.v $vr11, $vr14, $vr3, $vr11 + vbitsel.v $vr11, $vr11, $vr16, $vr15 + vbitsel.v $vr11, $vr11, $vr17, $vr13 + vbitsel.v $vr11, $vr11, $vr23, $vr22 + vbitsel.v $vr11, $vr11, $vr10, $vr12 + vbitsel.v $vr11, $vr11, $vr24, $vr20 + vbitsel.v $vr11, $vr11, $vr18, $vr19 + vst $vr11, $sp, 16 + addi.d $a1, $fp, 54 + ori $a2, $zero, 13 + addi.d $a3, $sp, 16 + ori $a4, $zero, 0 + lu32i.d $a4, 32768 + vreplgr2vr.d $vr11, $a4 + ori $a4, $zero, 16 .p2align 4, , 16 .LBB0_9: # =>This Loop Header: Depth=1 # Child Loop BB0_10 Depth 2 - ld.hu $t1, $a3, 0 - move $t0, $zero - move $t2, $t1 + ld.h $a6, $a1, 0 + move $a5, $zero + vinsgr2vr.h $vr12, $a6, 0 + vinsgr2vr.h $vr12, $a6, 1 .p2align 4, , 16 .LBB0_10: # Parent Loop BB0_9 Depth=1 # => This Inner Loop Header: Depth=2 - ldx.h $t3, $t0, $a5 - ldx.h $t4, $a1, $t0 - stx.h $t1, $a1, $t0 - ext.w.h $t2, $t2 - mul.d $t1, $t2, $t3 - slli.d $t1, $t1, 33 - add.d $t1, $t1, $a6 - srai.d $t1, $t1, 48 - add.d $t1, $t1, $t4 - slt $t5, $t1, $a0 - maskeqz $t1, $t1, $t5 - masknez $t5, $a0, $t5 - or $t1, $t1, $t5 - slt $t5, $a2, $t1 - maskeqz $t1, $t1, $t5 - masknez $t5, $a2, $t5 - or $t1, $t1, $t5 - mul.d $t3, $t4, $t3 - slli.d $t3, $t3, 33 - add.d $t3, $t3, $a6 - srai.d $t3, $t3, 48 - add.d $t2, $t3, $t2 - slt $t3, $t2, $a0 - maskeqz $t2, $t2, $t3 - masknez $t3, $a0, $t3 - or $t2, $t2, $t3 - slt $t3, $a2, $t2 - maskeqz $t2, $t2, $t3 - masknez $t3, $a2, $t3 - addi.d $t0, $t0, 2 - or $t2, $t2, $t3 - bne $t0, $a7, .LBB0_10 + ldx.h $a6, $a0, $a5 + add.d $a7, $a0, $a5 + ldx.h $t0, $a5, $a3 + vstelm.h $vr12, $a7, 0, 1 + vinsgr2vr.h $vr13, $a6, 0 + vpickev.h $vr13, $vr12, $vr13 + vslli.d $vr13, $vr13, 48 + vsrai.d $vr13, $vr13, 48 + vreplgr2vr.d $vr14, $t0 + vmul.d $vr13, $vr13, $vr14 + vslli.d $vr13, $vr13, 33 + vadd.d $vr13, $vr13, $vr11 + vsrli.d $vr13, $vr13, 48 + vori.b $vr14, $vr0, 0 + vshuf.h $vr14, $vr0, $vr13 + vinsgr2vr.h $vr12, $a6, 1 + vadd.h $vr13, $vr14, $vr12 + vslt.h $vr14, $vr13, $vr14 + vslti.h $vr12, $vr12, 0 + vxor.v $vr12, $vr12, $vr14 + vsrai.h $vr14, $vr13, 15 + vbitrevi.h $vr14, $vr14, 15 + addi.d $a5, $a5, 2 + vbitsel.v $vr12, $vr13, $vr14, $vr12 + bne $a5, $a4, .LBB0_10 # %bb.11: # in Loop: Header=BB0_9 Depth=1 - addi.w $a4, $a4, -1 - st.h $t2, $a3, 0 - addi.d $a3, $a3, 2 - bnez $a4, .LBB0_9 + addi.w $a2, $a2, -1 + vstelm.h $vr12, $a1, 0, 0 + addi.d $a1, $a1, 2 + bnez $a2, .LBB0_9 # %bb.12: # %Short_term_analysis_filtering.exit84 - vld $vr10, $s0, 0 - vslti.h $vr11, $vr10, 0 - vslt.hu $vr12, $vr10, $vr1 - vsrli.h $vr13, $vr10, 2 - vadd.h $vr3, $vr13, $vr3 - vadd.h $vr13, $vr10, $vr0 - vslt.hu $vr5, $vr13, $vr5 - vadd.h $vr13, $vr10, $vr1 - vslli.h $vr14, $vr10, 1 - vseq.h $vr6, $vr10, $vr6 - vneg.h $vr15, $vr10 - vbitsel.v $vr6, $vr15, $vr2, $vr6 - vslt.hu $vr1, $vr6, $vr1 - vslt.hu $vr7, $vr6, $vr7 - vslt.hu $vr15, $vr4, $vr6 - vxor.v $vr16, $vr15, $vr7 - vnor.v $vr16, $vr1, $vr16 - vand.v $vr16, $vr11, $vr16 - vsrli.h $vr17, $vr6, 2 - vsub.h $vr8, $vr8, $vr17 - vxor.v $vr7, $vr1, $vr7 - vand.v $vr7, $vr11, $vr7 - vsub.h $vr0, $vr0, $vr6 - vand.v $vr1, $vr11, $vr1 - vslli.h $vr6, $vr6, 1 - vneg.h $vr6, $vr6 - vslt.h $vr4, $vr4, $vr10 - vand.v $vr10, $vr11, $vr15 - vbitsel.v $vr2, $vr3, $vr2, $vr4 - vbitsel.v $vr2, $vr2, $vr13, $vr5 - vbitsel.v $vr2, $vr2, $vr14, $vr12 - vbitsel.v $vr2, $vr2, $vr8, $vr16 - vbitsel.v $vr2, $vr2, $vr9, $vr10 - vbitsel.v $vr0, $vr2, $vr0, $vr7 - vbitsel.v $vr0, $vr0, $vr6, $vr1 - vst $vr0, $sp, 16 - addi.d $a3, $fp, 80 - ori $a4, $zero, 120 - addi.d $a5, $sp, 16 - ori $a6, $zero, 0 - lu32i.d $a6, 32768 - ori $a7, $zero, 16 + vld $vr11, $s0, 0 + vslti.h $vr12, $vr11, 0 + vslt.hu $vr13, $vr11, $vr2 + vsrli.h $vr14, $vr11, 2 + vadd.h $vr4, $vr14, $vr4 + vadd.h $vr14, $vr11, $vr1 + vslt.hu $vr6, $vr14, $vr6 + vadd.h $vr14, $vr11, $vr2 + vslli.h $vr15, $vr11, 1 + vseq.h $vr7, $vr11, $vr7 + vneg.h $vr16, $vr11 + vbitsel.v $vr7, $vr16, $vr3, $vr7 + vslt.hu $vr2, $vr7, $vr2 + vslt.hu $vr8, $vr7, $vr8 + vslt.hu $vr16, $vr5, $vr7 + vxor.v $vr17, $vr16, $vr8 + vnor.v $vr17, $vr2, $vr17 + vand.v $vr17, $vr12, $vr17 + vsrli.h $vr18, $vr7, 2 + vsub.h $vr9, $vr9, $vr18 + vxor.v $vr8, $vr2, $vr8 + vand.v $vr8, $vr12, $vr8 + vsub.h $vr1, $vr1, $vr7 + vand.v $vr2, $vr12, $vr2 + vslli.h $vr7, $vr7, 1 + vneg.h $vr7, $vr7 + vslt.h $vr5, $vr5, $vr11 + vand.v $vr11, $vr12, $vr16 + vbitsel.v $vr3, $vr4, $vr3, $vr5 + vbitsel.v $vr3, $vr3, $vr14, $vr6 + vbitsel.v $vr3, $vr3, $vr15, $vr13 + vbitsel.v $vr3, $vr3, $vr9, $vr17 + vbitsel.v $vr3, $vr3, $vr10, $vr11 + vbitsel.v $vr1, $vr3, $vr1, $vr8 + vbitsel.v $vr1, $vr1, $vr7, $vr2 + vst $vr1, $sp, 16 + addi.d $a1, $fp, 80 + ori $a2, $zero, 120 + addi.d $a3, $sp, 16 + ori $a4, $zero, 0 + lu32i.d $a4, 32768 + vreplgr2vr.d $vr1, $a4 + ori $a4, $zero, 16 .p2align 4, , 16 .LBB0_13: # =>This Loop Header: Depth=1 # Child Loop BB0_14 Depth 2 - ld.hu $t1, $a3, 0 - move $t0, $zero - move $t2, $t1 + ld.h $a6, $a1, 0 + move $a5, $zero + vinsgr2vr.h $vr2, $a6, 0 + vinsgr2vr.h $vr2, $a6, 1 .p2align 4, , 16 .LBB0_14: # Parent Loop BB0_13 Depth=1 # => This Inner Loop Header: Depth=2 - ldx.h $t3, $t0, $a5 - ldx.h $t4, $a1, $t0 - stx.h $t1, $a1, $t0 - ext.w.h $t2, $t2 - mul.d $t1, $t2, $t3 - slli.d $t1, $t1, 33 - add.d $t1, $t1, $a6 - srai.d $t1, $t1, 48 - add.d $t1, $t1, $t4 - slt $t5, $t1, $a0 - maskeqz $t1, $t1, $t5 - masknez $t5, $a0, $t5 - or $t1, $t1, $t5 - slt $t5, $a2, $t1 - maskeqz $t1, $t1, $t5 - masknez $t5, $a2, $t5 - or $t1, $t1, $t5 - mul.d $t3, $t4, $t3 - slli.d $t3, $t3, 33 - add.d $t3, $t3, $a6 - srai.d $t3, $t3, 48 - add.d $t2, $t3, $t2 - slt $t3, $t2, $a0 - maskeqz $t2, $t2, $t3 - masknez $t3, $a0, $t3 - or $t2, $t2, $t3 - slt $t3, $a2, $t2 - maskeqz $t2, $t2, $t3 - masknez $t3, $a2, $t3 - addi.d $t0, $t0, 2 - or $t2, $t2, $t3 - bne $t0, $a7, .LBB0_14 + ldx.h $a6, $a0, $a5 + add.d $a7, $a0, $a5 + ldx.h $t0, $a5, $a3 + vstelm.h $vr2, $a7, 0, 1 + vinsgr2vr.h $vr3, $a6, 0 + vpickev.h $vr3, $vr2, $vr3 + vslli.d $vr3, $vr3, 48 + vsrai.d $vr3, $vr3, 48 + vreplgr2vr.d $vr4, $t0 + vmul.d $vr3, $vr3, $vr4 + vslli.d $vr3, $vr3, 33 + vadd.d $vr3, $vr3, $vr1 + vsrli.d $vr3, $vr3, 48 + vori.b $vr4, $vr0, 0 + vshuf.h $vr4, $vr0, $vr3 + vinsgr2vr.h $vr2, $a6, 1 + vadd.h $vr3, $vr4, $vr2 + vslt.h $vr4, $vr3, $vr4 + vslti.h $vr2, $vr2, 0 + vxor.v $vr2, $vr2, $vr4 + vsrai.h $vr4, $vr3, 15 + vbitrevi.h $vr4, $vr4, 15 + addi.d $a5, $a5, 2 + vbitsel.v $vr2, $vr3, $vr4, $vr2 + bne $a5, $a4, .LBB0_14 # %bb.15: # in Loop: Header=BB0_13 Depth=1 - addi.w $a4, $a4, -1 - st.h $t2, $a3, 0 - addi.d $a3, $a3, 2 - bnez $a4, .LBB0_13 + addi.w $a2, $a2, -1 + vstelm.h $vr2, $a1, 0, 0 + addi.d $a1, $a1, 2 + bnez $a2, .LBB0_13 # %bb.16: # %Short_term_analysis_filtering.exit103 - ld.d $s5, $sp, 32 # 8-byte Folded Reload - ld.d $s4, $sp, 40 # 8-byte Folded Reload - ld.d $s3, $sp, 48 # 8-byte Folded Reload - ld.d $s2, $sp, 56 # 8-byte Folded Reload - ld.d $s1, $sp, 64 # 8-byte Folded Reload - ld.d $s0, $sp, 72 # 8-byte Folded Reload - ld.d $fp, $sp, 80 # 8-byte Folded Reload - ld.d $ra, $sp, 88 # 8-byte Folded Reload - addi.d $sp, $sp, 96 + fld.d $fs0, $sp, 40 # 8-byte Folded Reload + ld.d $s5, $sp, 48 # 8-byte Folded Reload + ld.d $s4, $sp, 56 # 8-byte Folded Reload + ld.d $s3, $sp, 64 # 8-byte Folded Reload + ld.d $s2, $sp, 72 # 8-byte Folded Reload + ld.d $s1, $sp, 80 # 8-byte Folded Reload + ld.d $s0, $sp, 88 # 8-byte Folded Reload + ld.d $fp, $sp, 96 # 8-byte Folded Reload + ld.d $ra, $sp, 104 # 8-byte Folded Reload + addi.d $sp, $sp, 112 ret .Lfunc_end0: .size Gsm_Short_Term_Analysis_Filter, .Lfunc_end0-Gsm_Short_Term_Analysis_Filter diff --git a/results/MultiSource/Benchmarks/nbench/CMakeFiles/nbench.dir/emfloat.s b/results/MultiSource/Benchmarks/nbench/CMakeFiles/nbench.dir/emfloat.s index f8a8994c..ae889fda 100644 --- a/results/MultiSource/Benchmarks/nbench/CMakeFiles/nbench.dir/emfloat.s +++ b/results/MultiSource/Benchmarks/nbench/CMakeFiles/nbench.dir/emfloat.s @@ -934,9 +934,9 @@ DoEmFloatIteration: # @DoEmFloatIteration st.h $a1, $s4, 4 st.h $a2, $s4, 2 .LBB2_27: # in Loop: Header=BB2_7 Depth=2 - vpickve2gr.h $a1, $vr1, 0 - vpickve2gr.h $a2, $vr1, 1 - or $a1, $a2, $a1 + vreplvei.h $vr0, $vr1, 1 + vor.v $vr0, $vr0, $vr1 + vpickve2gr.h $a1, $vr0, 0 or $a0, $a1, $a0 vpickve2gr.h $a1, $vr1, 2 or $a0, $a0, $a1 diff --git a/results/MultiSource/Benchmarks/nbench/CMakeFiles/nbench.dir/nbench0.s b/results/MultiSource/Benchmarks/nbench/CMakeFiles/nbench.dir/nbench0.s index 1c64ced4..fd1cb501 100644 --- a/results/MultiSource/Benchmarks/nbench/CMakeFiles/nbench.dir/nbench0.s +++ b/results/MultiSource/Benchmarks/nbench/CMakeFiles/nbench.dir/nbench0.s @@ -396,16 +396,11 @@ main: # @main sltui $a0, $a0, 1 ld.d $a2, $sp, 32 # 8-byte Folded Reload st.w $a0, $a2, %pc_lo12(global_custrun) + vreplgr2vr.w $vr0, $a1 pcalau12i $a0, %pc_hi20(tests_to_do) addi.d $a0, $a0, %pc_lo12(tests_to_do) - st.w $a1, $a0, 0 - st.w $a1, $a0, 4 - st.w $a1, $a0, 8 - st.w $a1, $a0, 12 - st.w $a1, $a0, 16 - st.w $a1, $a0, 20 - st.w $a1, $a0, 24 - st.w $a1, $a0, 28 + vst $vr0, $a0, 0 + vst $vr0, $a0, 16 st.w $a1, $a0, 32 st.w $a1, $a0, 36 b .LBB0_15 diff --git a/results/SingleSource/Benchmarks/BenchmarkGame/CMakeFiles/n-body.dir/n-body.s b/results/SingleSource/Benchmarks/BenchmarkGame/CMakeFiles/n-body.dir/n-body.s index 4613018c..b02fe4e5 100644 --- a/results/SingleSource/Benchmarks/BenchmarkGame/CMakeFiles/n-body.dir/n-body.s +++ b/results/SingleSource/Benchmarks/BenchmarkGame/CMakeFiles/n-body.dir/n-body.s @@ -5,6 +5,7 @@ .type advance,@function advance: # @advance # %bb.0: + # kill: def $f0_64 killed $f0_64 def $vr0 blez $a0, .LBB0_8 # %bb.1: # %.lr.ph69.preheader move $a3, $zero @@ -28,75 +29,65 @@ advance: # @advance # in Loop: Header=BB0_3 Depth=1 mul.d $a7, $a6, $a5 add.d $a6, $a1, $a7 - fldx.d $fa1, $a1, $a7 - fld.d $fa2, $a6, 8 + vldx $vr1, $a1, $a7 + fld.d $fa2, $a6, 48 fld.d $fa3, $a6, 16 - fld.d $fa4, $a6, 48 - move $a7, $a2 - move $t0, $a0 + vreplvei.d $vr4, $vr2, 0 + move $a7, $a0 + move $t0, $a2 .p2align 4, , 16 .LBB0_5: # Parent Loop BB0_3 Depth=1 # => This Inner Loop Header: Depth=2 - fld.d $fa5, $a7, -48 - fld.d $fa6, $a7, -40 - fld.d $fa7, $a7, -32 - fsub.d $fa5, $fa1, $fa5 - fsub.d $fa6, $fa2, $fa6 - fsub.d $fa7, $fa3, $fa7 - fmul.d $ft0, $fa6, $fa6 - fmadd.d $ft0, $fa5, $fa5, $ft0 - fmadd.d $ft0, $fa7, $fa7, $ft0 - fsqrt.d $ft0, $ft0 - fmul.d $ft1, $ft0, $ft0 - fld.d $ft2, $a7, 0 - fmul.d $ft0, $ft0, $ft1 - fdiv.d $ft0, $fa0, $ft0 - fld.d $ft1, $a6, 24 - fneg.d $ft2, $ft2 - fmul.d $ft3, $fa5, $ft2 - fld.d $ft4, $a6, 32 - fmadd.d $ft1, $ft3, $ft0, $ft1 - fst.d $ft1, $a6, 24 - fmul.d $ft1, $fa6, $ft2 - fmadd.d $ft1, $ft1, $ft0, $ft4 - fld.d $ft3, $a6, 40 - fst.d $ft1, $a6, 32 - fmul.d $ft1, $fa7, $ft2 - fld.d $ft2, $a7, -24 - fmadd.d $ft1, $ft1, $ft0, $ft3 - fst.d $ft1, $a6, 40 - fmul.d $fa5, $fa5, $fa4 - fmadd.d $fa5, $fa5, $ft0, $ft2 - fld.d $ft1, $a7, -16 - fst.d $fa5, $a7, -24 - fmul.d $fa5, $fa6, $fa4 - fld.d $fa6, $a7, -8 - fmadd.d $fa5, $fa5, $ft0, $ft1 - fst.d $fa5, $a7, -16 - fmul.d $fa5, $fa7, $fa4 - fmadd.d $fa5, $fa5, $ft0, $fa6 - fst.d $fa5, $a7, -8 - addi.d $t0, $t0, -1 - addi.d $a7, $a7, 56 - bne $a4, $t0, .LBB0_5 + fld.d $fa5, $t0, -32 + fld.d $fa6, $t0, 0 + fsub.d $fa5, $fa3, $fa5 + vld $vr7, $t0, -48 + fneg.d $fa6, $fa6 + fld.d $ft0, $a6, 40 + fmul.d $ft1, $fa5, $fa6 + vfsub.d $vr7, $vr1, $vr7 + vfmul.d $vr10, $vr7, $vr7 + vreplvei.d $vr10, $vr10, 1 + vreplvei.d $vr11, $vr7, 0 + fmadd.d $ft2, $ft3, $ft3, $ft2 + fmadd.d $ft2, $fa5, $fa5, $ft2 + fsqrt.d $ft2, $ft2 + fmul.d $ft3, $ft2, $ft2 + fmul.d $ft2, $ft2, $ft3 + fdiv.d $ft2, $fa0, $ft2 + vld $vr11, $a6, 24 + vreplvei.d $vr6, $vr6, 0 + vfmul.d $vr6, $vr7, $vr6 + vreplvei.d $vr12, $vr10, 0 + vfmadd.d $vr6, $vr6, $vr12, $vr11 + vst $vr6, $a6, 24 + fmadd.d $fa6, $ft1, $ft2, $ft0 + fst.d $fa6, $a6, 40 + vld $vr6, $t0, -24 + vfmul.d $vr7, $vr7, $vr4 + fld.d $ft0, $t0, -8 + vfmadd.d $vr6, $vr7, $vr12, $vr6 + vst $vr6, $t0, -24 + fmul.d $fa5, $fa5, $fa2 + fmadd.d $fa5, $fa5, $ft2, $ft0 + fst.d $fa5, $t0, -8 + addi.d $a7, $a7, -1 + addi.d $t0, $t0, 56 + bne $a4, $a7, .LBB0_5 b .LBB0_2 .LBB0_6: # %.lr.ph71.preheader - addi.d $a1, $a1, 24 + vreplvei.d $vr1, $vr0, 0 .p2align 4, , 16 .LBB0_7: # %.lr.ph71 # =>This Inner Loop Header: Depth=1 - fld.d $fa1, $a1, 0 - fld.d $fa2, $a1, -24 - fmadd.d $fa1, $fa0, $fa1, $fa2 - fld.d $fa2, $a1, 8 - fld.d $fa3, $a1, -16 - fld.d $fa4, $a1, 16 - fld.d $fa5, $a1, -8 - fst.d $fa1, $a1, -24 - fmadd.d $fa1, $fa0, $fa2, $fa3 - fst.d $fa1, $a1, -16 - fmadd.d $fa1, $fa0, $fa4, $fa5 - fst.d $fa1, $a1, -8 + vld $vr2, $a1, 24 + vld $vr3, $a1, 0 + fld.d $fa4, $a1, 40 + fld.d $fa5, $a1, 16 + vfmadd.d $vr2, $vr1, $vr2, $vr3 + vst $vr2, $a1, 0 + fmadd.d $fa2, $fa0, $fa4, $fa5 + fst.d $fa2, $a1, 16 addi.d $a0, $a0, -1 addi.d $a1, $a1, 56 bnez $a0, .LBB0_7 @@ -242,50 +233,49 @@ main: # @main fst.d $fs0, $sp, 0 # 8-byte Folded Spill pcalau12i $a0, %pc_hi20(bodies) addi.d $a0, $a0, %pc_lo12(bodies) - move $a1, $zero - fld.d $fa1, $a0, 24 fld.d $fa2, $a0, 48 + fld.d $fa3, $a0, 40 + fld.d $fa1, $a0, 104 + fld.d $fa4, $a0, 96 + move $a1, $zero movgr2fr.d $fa0, $zero - fld.d $fa3, $a0, 32 - fld.d $fa4, $a0, 40 - fld.d $fa5, $a0, 80 - fld.d $fa6, $a0, 104 - fmadd.d $fa1, $fa1, $fa2, $fa0 fmadd.d $fa3, $fa3, $fa2, $fa0 - fmadd.d $fa2, $fa4, $fa2, $fa0 - fmadd.d $fa1, $fa5, $fa6, $fa1 - fld.d $fa4, $a0, 88 - fld.d $fa5, $a0, 96 - fld.d $fa7, $a0, 136 - fld.d $ft0, $a0, 160 - fld.d $ft1, $a0, 144 - fmadd.d $fa3, $fa4, $fa6, $fa3 - fmadd.d $fa2, $fa5, $fa6, $fa2 - fmadd.d $fa1, $fa7, $ft0, $fa1 - fmadd.d $fa3, $ft1, $ft0, $fa3 - fld.d $fa4, $a0, 152 - fld.d $fa5, $a0, 192 + fmadd.d $fa3, $fa4, $fa1, $fa3 + fld.d $fa4, $a0, 160 + fld.d $fa5, $a0, 152 fld.d $fa6, $a0, 216 - fld.d $fa7, $a0, 200 - fld.d $ft1, $a0, 208 - fmadd.d $fa2, $fa4, $ft0, $fa2 - fmadd.d $fa1, $fa5, $fa6, $fa1 - fmadd.d $fa3, $fa7, $fa6, $fa3 - fmadd.d $fa2, $ft1, $fa6, $fa2 - fld.d $fa4, $a0, 248 - fld.d $fa5, $a0, 272 - fld.d $fa6, $a0, 256 - fld.d $fa7, $a0, 264 - pcalau12i $a2, %pc_hi20(.LCPI3_0) - fld.d $ft0, $a2, %pc_lo12(.LCPI3_0) - fmadd.d $fa1, $fa4, $fa5, $fa1 - fmadd.d $fa3, $fa6, $fa5, $fa3 - fmadd.d $fa2, $fa7, $fa5, $fa2 - fdiv.d $fa1, $fa1, $ft0 - fst.d $fa1, $a0, 24 - fdiv.d $fa1, $fa3, $ft0 - fst.d $fa1, $a0, 32 - fdiv.d $fa1, $fa2, $ft0 + vld $vr7, $a0, 24 + fld.d $ft0, $a0, 208 + vreplvei.d $vr2, $vr2, 0 + vrepli.b $vr9, 0 + vfmadd.d $vr2, $vr7, $vr2, $vr9 + fld.d $fa7, $a0, 272 + fld.d $ft1, $a0, 264 + fmadd.d $fa3, $fa5, $fa4, $fa3 + fmadd.d $fa3, $ft0, $fa6, $fa3 + vld $vr5, $a0, 80 + fmadd.d $fa3, $ft1, $fa7, $fa3 + vreplvei.d $vr1, $vr1, 0 + vld $vr8, $a0, 136 + vfmadd.d $vr1, $vr5, $vr1, $vr2 + vreplvei.d $vr2, $vr4, 0 + vld $vr4, $a0, 192 + vfmadd.d $vr1, $vr8, $vr2, $vr1 + vld $vr2, $a0, 248 + vreplvei.d $vr5, $vr6, 0 + vfmadd.d $vr1, $vr4, $vr5, $vr1 + vreplvei.d $vr4, $vr7, 0 + vfmadd.d $vr1, $vr2, $vr4, $vr1 + lu12i.w $a2, -222236 + ori $a2, $a2, 1502 + lu32i.d $a2, 245052 + lu52i.d $a2, $a2, -1020 + pcalau12i $a3, %pc_hi20(.LCPI3_0) + fld.d $fa2, $a3, %pc_lo12(.LCPI3_0) + vreplgr2vr.d $vr4, $a2 + vfdiv.d $vr1, $vr1, $vr4 + vst $vr1, $a0, 24 + fdiv.d $fa1, $fa3, $fa2 fst.d $fa1, $a0, 40 addi.d $a2, $a0, 56 ori $a3, $zero, 4 diff --git a/results/SingleSource/Benchmarks/CoyoteBench/CMakeFiles/fftbench.dir/fftbench.s b/results/SingleSource/Benchmarks/CoyoteBench/CMakeFiles/fftbench.dir/fftbench.s index 77a60625..54afcd8f 100644 --- a/results/SingleSource/Benchmarks/CoyoteBench/CMakeFiles/fftbench.dir/fftbench.s +++ b/results/SingleSource/Benchmarks/CoyoteBench/CMakeFiles/fftbench.dir/fftbench.s @@ -1914,17 +1914,14 @@ _ZN10polynomialIdE11inverse_fftERKS_ISt7complexIdEE: # @_ZN10polynomialIdE11inve bstrins.d $a1, $a2, 63, 32 movgr2fr.d $fa1, $a1 fadd.d $fa0, $fa1, $fa0 - addi.d $a1, $s5, 8 + vreplvei.d $vr0, $vr0, 0 .p2align 4, , 16 .LBB6_31: # =>This Inner Loop Header: Depth=1 - fld.d $fa1, $a1, -8 - fld.d $fa2, $a1, 0 - fdiv.d $fa1, $fa1, $fa0 - fdiv.d $fa2, $fa2, $fa0 - fst.d $fa1, $a1, -8 - fst.d $fa2, $a1, 0 + vld $vr1, $s5, 0 + vfdiv.d $vr1, $vr1, $vr0 + vst $vr1, $s5, 0 addi.d $a0, $a0, 1 - addi.d $a1, $a1, 16 + addi.d $s5, $s5, 16 bltu $a0, $s4, .LBB6_31 .LBB6_32: # %._crit_edge80 fld.d $fs6, $sp, 80 # 8-byte Folded Reload diff --git a/results/SingleSource/Benchmarks/Linpack/CMakeFiles/linpack-pc.dir/linpack-pc.s b/results/SingleSource/Benchmarks/Linpack/CMakeFiles/linpack-pc.dir/linpack-pc.s index 574e6363..8d6d0cbb 100644 --- a/results/SingleSource/Benchmarks/Linpack/CMakeFiles/linpack-pc.dir/linpack-pc.s +++ b/results/SingleSource/Benchmarks/Linpack/CMakeFiles/linpack-pc.dir/linpack-pc.s @@ -2970,25 +2970,25 @@ dmxpy: # @dmxpy move $a6, $a2 bstrins.d $a6, $zero, 30, 1 ori $a7, $zero, 1 - move $s1, $a4 - move $s5, $a1 + move $s2, $a4 + move $s1, $a1 move $s6, $a0 bne $a6, $a7, .LBB6_6 # %bb.1: blez $s6, .LBB6_6 # %bb.2: # %.lr.ph.preheader ori $a0, $zero, 8 - bgeu $s6, $a0, .LBB6_37 + bgeu $s6, $a0, .LBB6_23 # %bb.3: move $a0, $zero -.LBB6_4: # %.lr.ph.preheader753 - alsl.d $a1, $a0, $s5, 2 +.LBB6_4: # %.lr.ph.preheader755 + alsl.d $a1, $a0, $s1, 2 alsl.d $a4, $a0, $a5, 2 sub.d $a0, $s6, $a0 .p2align 4, , 16 .LBB6_5: # %.lr.ph # =>This Inner Loop Header: Depth=1 - fld.s $fa0, $s1, 0 + fld.s $fa0, $s2, 0 fld.s $fa1, $a4, 0 fld.s $fa2, $a1, 0 fmul.s $fa0, $fa0, $fa1 @@ -3011,12 +3011,12 @@ dmxpy: # @dmxpy # %bb.8: # %.lr.ph248 addi.w $a6, $a0, -1 addi.w $a1, $a0, -2 - alsl.d $a0, $a1, $s1, 2 + alsl.d $a0, $a1, $s2, 2 mul.w $a4, $a1, $a3 - alsl.d $a1, $a6, $s1, 2 + alsl.d $a1, $a6, $s2, 2 ori $a7, $zero, 16 mul.w $a6, $a6, $a3 - bgeu $s6, $a7, .LBB6_72 + bgeu $s6, $a7, .LBB6_41 # %bb.9: move $a7, $zero .LBB6_10: # %scalar.ph372.preheader @@ -3025,7 +3025,7 @@ dmxpy: # @dmxpy add.d $a6, $a5, $a6 alsl.d $a4, $a4, $t0, 2 add.d $a4, $a5, $a4 - alsl.d $t0, $a7, $s5, 2 + alsl.d $t0, $a7, $s1, 2 sub.d $a7, $s6, $a7 .p2align 4, , 16 .LBB6_11: # %scalar.ph372 @@ -3068,22 +3068,22 @@ dmxpy: # @dmxpy addi.d $t1, $a4, -1 addi.d $a1, $a4, -4 bstrpick.d $a0, $a1, 31, 0 - alsl.d $a0, $a0, $s1, 2 + alsl.d $a0, $a0, $s2, 2 mul.w $a7, $a1, $a3 addi.d $a6, $a4, -3 bstrpick.d $a1, $a6, 31, 0 - alsl.d $a1, $a1, $s1, 2 + alsl.d $a1, $a1, $s2, 2 mul.w $t0, $a6, $a3 addi.d $a6, $a4, -2 bstrpick.d $a4, $a6, 31, 0 - alsl.d $a4, $a4, $s1, 2 + alsl.d $a4, $a4, $s2, 2 mul.w $t2, $a6, $a3 bstrpick.d $a6, $t1, 31, 0 - alsl.d $a6, $a6, $s1, 2 + alsl.d $a6, $a6, $s2, 2 mul.w $t3, $t1, $a3 ori $t4, $zero, 16 bstrpick.d $t1, $s6, 31, 0 - bgeu $s6, $t4, .LBB6_61 + bgeu $s6, $t4, .LBB6_30 # %bb.16: move $t4, $zero .LBB6_17: # %scalar.ph434.preheader @@ -3096,7 +3096,7 @@ dmxpy: # @dmxpy add.d $t0, $a5, $t0 alsl.d $a7, $a7, $t5, 2 add.d $a7, $a5, $a7 - alsl.d $t5, $t4, $s5, 2 + alsl.d $t5, $t4, $s1, 2 sub.d $t1, $t1, $t4 .p2align 4, , 16 .LBB6_18: # %scalar.ph434 @@ -3127,423 +3127,782 @@ dmxpy: # @dmxpy addi.d $t5, $t5, 4 bnez $t1, .LBB6_18 .LBB6_19: # %.loopexit242 + st.d $s2, $sp, 32 # 8-byte Folded Spill bstrpick.d $a0, $a2, 62, 59 add.d $a0, $a2, $a0 bstrpick.d $a0, $a0, 31, 4 slli.d $a0, $a0, 4 sub.w $a6, $a2, $a0 ori $a0, $zero, 8 - st.d $s1, $sp, 32 # 8-byte Folded Spill - st.d $s6, $sp, 72 # 8-byte Folded Spill - blt $a6, $a0, .LBB6_25 + st.d $s1, $sp, 48 # 8-byte Folded Spill + blt $a6, $a0, .LBB6_53 # %bb.20: # %.loopexit242 - blez $s6, .LBB6_25 + blez $s6, .LBB6_53 # %bb.21: # %.lr.ph252 - addi.d $a1, $a6, -1 - addi.d $a0, $a6, -8 - bstrpick.d $a4, $a0, 31, 0 - alsl.d $a7, $a4, $s1, 2 + st.d $a2, $sp, 288 # 8-byte Folded Spill + addi.d $a0, $a6, -1 + addi.d $a1, $a6, -8 + bstrpick.d $a4, $a1, 31, 0 + ld.d $t6, $sp, 32 # 8-byte Folded Reload + alsl.d $a7, $a4, $t6, 2 + mul.w $a2, $a1, $a3 + addi.d $a1, $a6, -7 + bstrpick.d $a4, $a1, 31, 0 + alsl.d $t0, $a4, $t6, 2 + mul.w $ra, $a1, $a3 + addi.d $a1, $a6, -6 + bstrpick.d $a4, $a1, 31, 0 + alsl.d $t1, $a4, $t6, 2 + mul.w $t8, $a1, $a3 + addi.d $a1, $a6, -5 + bstrpick.d $a4, $a1, 31, 0 + alsl.d $t2, $a4, $t6, 2 + mul.w $s3, $a1, $a3 + addi.d $a1, $a6, -4 + bstrpick.d $a4, $a1, 31, 0 + alsl.d $t3, $a4, $t6, 2 + mul.w $s4, $a1, $a3 + addi.d $a1, $a6, -3 + bstrpick.d $a4, $a1, 31, 0 + alsl.d $t4, $a4, $t6, 2 + mul.w $s7, $a1, $a3 + addi.d $a1, $a6, -2 + bstrpick.d $a4, $a1, 31, 0 + alsl.d $t5, $a4, $t6, 2 + mul.w $s5, $a1, $a3 + bstrpick.d $a1, $a0, 31, 0 + alsl.d $t6, $a1, $t6, 2 mul.w $s8, $a0, $a3 - addi.d $a0, $a6, -7 - bstrpick.d $a4, $a0, 31, 0 - alsl.d $t0, $a4, $s1, 2 - mul.w $a4, $a0, $a3 - addi.d $a0, $a6, -6 - bstrpick.d $t1, $a0, 31, 0 - alsl.d $t1, $t1, $s1, 2 - mul.w $fp, $a0, $a3 - addi.d $a0, $a6, -5 - bstrpick.d $t2, $a0, 31, 0 - alsl.d $t2, $t2, $s1, 2 - mul.w $s0, $a0, $a3 - addi.d $a0, $a6, -4 - bstrpick.d $t3, $a0, 31, 0 - alsl.d $t3, $t3, $s1, 2 - mul.w $s2, $a0, $a3 - addi.d $a0, $a6, -3 - bstrpick.d $t4, $a0, 31, 0 - alsl.d $t4, $t4, $s1, 2 - mul.w $ra, $a0, $a3 - addi.d $a0, $a6, -2 - bstrpick.d $t5, $a0, 31, 0 - alsl.d $t5, $t5, $s1, 2 - mul.w $a0, $a0, $a3 - bstrpick.d $t6, $a1, 31, 0 - alsl.d $t6, $t6, $s1, 2 - mul.w $a1, $a1, $a3 - alsl.d $t7, $s8, $a5, 2 - alsl.d $t8, $a4, $a5, 2 - st.d $fp, $sp, 272 # 8-byte Folded Spill - alsl.d $fp, $fp, $a5, 2 - st.d $s0, $sp, 280 # 8-byte Folded Spill - alsl.d $s0, $s0, $a5, 2 - st.d $s2, $sp, 288 # 8-byte Folded Spill - alsl.d $s1, $s2, $a5, 2 - alsl.d $s2, $ra, $a5, 2 - alsl.d $s3, $a0, $a5, 2 - ori $s7, $zero, 20 - alsl.d $s4, $a1, $a5, 2 - bgeu $s6, $s7, .LBB6_42 + alsl.d $t7, $a2, $a5, 2 + alsl.d $a1, $ra, $a5, 2 + st.d $t8, $sp, 280 # 8-byte Folded Spill + alsl.d $a4, $t8, $a5, 2 + alsl.d $t8, $s3, $a5, 2 + alsl.d $fp, $s4, $a5, 2 + alsl.d $a0, $s7, $a5, 2 + alsl.d $s0, $s5, $a5, 2 + ori $s2, $zero, 20 + alsl.d $s1, $s8, $a5, 2 + bgeu $s6, $s2, .LBB6_28 # %bb.22: - move $s8, $zero -.LBB6_23: # %scalar.ph539.preheader - slli.d $a1, $s8, 2 - sub.d $a0, $s6, $s8 - move $a4, $s5 + move $s3, $zero + ld.d $s5, $sp, 48 # 8-byte Folded Reload + ld.d $a2, $sp, 288 # 8-byte Folded Reload + b .LBB6_51 +.LBB6_23: # %vector.memcheck + alsl.d $a1, $s6, $s1, 2 + addi.d $a0, $s2, 4 + sltu $a0, $s1, $a0 + sltu $a4, $s2, $a1 + and $a4, $a0, $a4 + move $a0, $zero + bnez $a4, .LBB6_4 +# %bb.24: # %vector.memcheck + alsl.d $a4, $s6, $a5, 2 + sltu $a4, $s1, $a4 + sltu $a1, $a5, $a1 + and $a1, $a4, $a1 + bnez $a1, .LBB6_4 +# %bb.25: # %vector.ph + bstrpick.d $a0, $s6, 30, 3 + vldrepl.w $vr0, $s2, 0 + slli.d $a0, $a0, 3 + addi.d $a1, $a5, 16 + addi.d $a4, $s1, 16 + move $a6, $a0 + .p2align 4, , 16 +.LBB6_26: # %vector.body + # =>This Inner Loop Header: Depth=1 + vld $vr1, $a1, -16 + vld $vr2, $a1, 0 + vld $vr3, $a4, -16 + vld $vr4, $a4, 0 + vfmul.s $vr1, $vr0, $vr1 + vfmul.s $vr2, $vr0, $vr2 + vfadd.s $vr1, $vr3, $vr1 + vfadd.s $vr2, $vr4, $vr2 + vst $vr1, $a4, -16 + vst $vr2, $a4, 0 + addi.d $a6, $a6, -8 + addi.d $a1, $a1, 32 + addi.d $a4, $a4, 32 + bnez $a6, .LBB6_26 +# %bb.27: # %middle.block + bne $a0, $s6, .LBB6_4 + b .LBB6_6 +.LBB6_28: # %vector.memcheck458 + add.d $s2, $ra, $s6 + alsl.d $s2, $s2, $a5, 2 + vinsgr2vr.d $vr0, $s2, 0 + add.d $a2, $a2, $s6 + alsl.d $a2, $a2, $a5, 2 + vinsgr2vr.d $vr0, $a2, 1 + add.d $a2, $s3, $s6 + alsl.d $a2, $a2, $a5, 2 + vinsgr2vr.d $vr1, $a2, 0 + ld.d $a2, $sp, 280 # 8-byte Folded Reload + add.d $a2, $a2, $s6 + alsl.d $a2, $a2, $a5, 2 + vinsgr2vr.d $vr1, $a2, 1 + add.d $a2, $s7, $s6 + alsl.d $a2, $a2, $a5, 2 + vinsgr2vr.d $vr2, $a2, 0 + add.d $a2, $s4, $s6 + alsl.d $a2, $a2, $a5, 2 + vinsgr2vr.d $vr2, $a2, 1 + add.d $a2, $s8, $s6 + alsl.d $a2, $a2, $a5, 2 + vinsgr2vr.d $vr3, $a2, 0 + add.d $a2, $s5, $s6 + alsl.d $a2, $a2, $a5, 2 + vinsgr2vr.d $vr3, $a2, 1 + addi.d $a2, $t0, 4 + vinsgr2vr.d $vr4, $a2, 0 + addi.d $a2, $a7, 4 + vinsgr2vr.d $vr4, $a2, 1 + addi.d $a2, $t2, 4 + vinsgr2vr.d $vr5, $a2, 0 + addi.d $a2, $t1, 4 + vinsgr2vr.d $vr5, $a2, 1 + addi.d $a2, $t4, 4 + vinsgr2vr.d $vr6, $a2, 0 + addi.d $a2, $t3, 4 + vinsgr2vr.d $vr6, $a2, 1 + addi.d $a2, $t6, 4 + vinsgr2vr.d $vr7, $a2, 0 + addi.d $a2, $t5, 4 + vinsgr2vr.d $vr7, $a2, 1 + ld.d $s5, $sp, 48 # 8-byte Folded Reload + alsl.d $a2, $s6, $s5, 2 + vreplgr2vr.d $vr8, $s5 + vslt.du $vr7, $vr8, $vr7 + vslt.du $vr6, $vr8, $vr6 + vpickev.w $vr6, $vr6, $vr7 + vslt.du $vr5, $vr8, $vr5 + vslt.du $vr4, $vr8, $vr4 + vpickev.w $vr4, $vr4, $vr5 + vpickev.h $vr4, $vr4, $vr6 + vslt.du $vr3, $vr8, $vr3 + vslt.du $vr2, $vr8, $vr2 + vpickev.w $vr2, $vr2, $vr3 + vslt.du $vr1, $vr8, $vr1 + vslt.du $vr0, $vr8, $vr0 + vpickev.w $vr0, $vr0, $vr1 + vpickev.h $vr0, $vr0, $vr2 + vpickev.b $vr0, $vr0, $vr4 + vinsgr2vr.d $vr1, $a1, 0 + vinsgr2vr.d $vr1, $t7, 1 + vinsgr2vr.d $vr2, $t8, 0 + vinsgr2vr.d $vr2, $a4, 1 + vinsgr2vr.d $vr3, $a0, 0 + vinsgr2vr.d $vr3, $fp, 1 + vinsgr2vr.d $vr4, $s1, 0 + vinsgr2vr.d $vr4, $s0, 1 + vinsgr2vr.d $vr5, $t0, 0 + vinsgr2vr.d $vr5, $a7, 1 + vinsgr2vr.d $vr6, $t2, 0 + vinsgr2vr.d $vr6, $t1, 1 + vinsgr2vr.d $vr7, $t4, 0 + vinsgr2vr.d $vr7, $t3, 1 + vinsgr2vr.d $vr8, $t6, 0 + vinsgr2vr.d $vr8, $t5, 1 + vreplgr2vr.d $vr9, $a2 + vslt.du $vr8, $vr8, $vr9 + vslt.du $vr7, $vr7, $vr9 + vpickev.w $vr7, $vr7, $vr8 + vslt.du $vr6, $vr6, $vr9 + vslt.du $vr5, $vr5, $vr9 + vpickev.w $vr5, $vr5, $vr6 + vpickev.h $vr5, $vr5, $vr7 + vslt.du $vr4, $vr4, $vr9 + vslt.du $vr3, $vr3, $vr9 + vpickev.w $vr3, $vr3, $vr4 + vslt.du $vr2, $vr2, $vr9 + vslt.du $vr1, $vr1, $vr9 + vpickev.w $vr1, $vr1, $vr2 + vpickev.h $vr1, $vr1, $vr3 + vpickev.b $vr1, $vr1, $vr5 + vand.v $vr0, $vr0, $vr1 + vslli.b $vr0, $vr0, 7 + vmskltz.b $vr0, $vr0 + vpickve2gr.hu $a2, $vr0, 0 + beqz $a2, .LBB6_48 +# %bb.29: + move $s3, $zero + ld.d $a2, $sp, 288 # 8-byte Folded Reload + b .LBB6_51 +.LBB6_30: # %vector.memcheck393 + alsl.d $fp, $t1, $s1, 2 + addi.d $t4, $a6, 4 + sltu $t4, $s1, $t4 + sltu $t5, $a6, $fp + and $t5, $t4, $t5 + move $t4, $zero + bnez $t5, .LBB6_17 +# %bb.31: # %vector.memcheck393 + addi.d $t5, $a4, 4 + sltu $t5, $s1, $t5 + sltu $t6, $a4, $fp + and $t5, $t5, $t6 + bnez $t5, .LBB6_17 +# %bb.32: # %vector.memcheck393 + addi.d $t5, $a1, 4 + sltu $t5, $s1, $t5 + sltu $t6, $a1, $fp + and $t5, $t5, $t6 + bnez $t5, .LBB6_17 +# %bb.33: # %vector.memcheck393 + addi.d $t5, $a0, 4 + sltu $t5, $s1, $t5 + sltu $t6, $a0, $fp + and $t5, $t5, $t6 + bnez $t5, .LBB6_17 +# %bb.34: # %vector.memcheck393 + alsl.d $t5, $t3, $a5, 2 + add.d $t6, $t3, $t1 + alsl.d $t6, $t6, $a5, 2 + sltu $t6, $s1, $t6 + sltu $t7, $t5, $fp + and $t6, $t6, $t7 + bnez $t6, .LBB6_17 +# %bb.35: # %vector.memcheck393 + alsl.d $t6, $t2, $a5, 2 + add.d $t7, $t2, $t1 + alsl.d $t7, $t7, $a5, 2 + sltu $t7, $s1, $t7 + sltu $t8, $t6, $fp + and $t7, $t7, $t8 + bnez $t7, .LBB6_17 +# %bb.36: # %vector.memcheck393 + alsl.d $t7, $t0, $a5, 2 + add.d $t8, $t0, $t1 + alsl.d $t8, $t8, $a5, 2 + sltu $t8, $s1, $t8 + sltu $s0, $t7, $fp + and $t8, $t8, $s0 + bnez $t8, .LBB6_17 +# %bb.37: # %vector.memcheck393 + alsl.d $t8, $a7, $a5, 2 + add.d $s0, $a7, $t1 + alsl.d $s0, $s0, $a5, 2 + sltu $s0, $s1, $s0 + sltu $fp, $t8, $fp + and $fp, $s0, $fp + bnez $fp, .LBB6_17 +# %bb.38: # %vector.ph436 + vldrepl.w $vr0, $a0, 0 + vldrepl.w $vr1, $a1, 0 + vldrepl.w $vr2, $a4, 0 + vldrepl.w $vr3, $a6, 0 + bstrpick.d $t4, $t1, 31, 2 + slli.d $t4, $t4, 2 + move $fp, $s1 + move $s0, $t4 .p2align 4, , 16 -.LBB6_24: # %scalar.ph539 +.LBB6_39: # %vector.body439 # =>This Inner Loop Header: Depth=1 - fldx.s $fa0, $a4, $a1 + vld $vr4, $t8, 0 + vld $vr5, $fp, 0 + vld $vr6, $t7, 0 + vfmul.s $vr4, $vr0, $vr4 + vfadd.s $vr4, $vr5, $vr4 + vld $vr5, $t6, 0 + vfmul.s $vr6, $vr1, $vr6 + vld $vr7, $t5, 0 + vfadd.s $vr4, $vr4, $vr6 + vfmul.s $vr5, $vr2, $vr5 + vfadd.s $vr4, $vr4, $vr5 + vfmul.s $vr5, $vr3, $vr7 + vfadd.s $vr4, $vr4, $vr5 + vst $vr4, $fp, 0 + addi.d $s0, $s0, -4 + addi.d $t5, $t5, 16 + addi.d $t6, $t6, 16 + addi.d $t7, $t7, 16 + addi.d $t8, $t8, 16 + addi.d $fp, $fp, 16 + bnez $s0, .LBB6_39 +# %bb.40: # %middle.block455 + bne $t4, $t1, .LBB6_17 + b .LBB6_19 +.LBB6_41: # %vector.memcheck351 + alsl.d $t0, $s6, $s1, 2 + addi.d $a7, $a1, 4 + sltu $a7, $s1, $a7 + sltu $t1, $a1, $t0 + and $t1, $a7, $t1 + move $a7, $zero + bnez $t1, .LBB6_10 +# %bb.42: # %vector.memcheck351 + addi.d $t1, $a0, 4 + sltu $t1, $s1, $t1 + sltu $t2, $a0, $t0 + and $t1, $t1, $t2 + bnez $t1, .LBB6_10 +# %bb.43: # %vector.memcheck351 + alsl.d $t1, $a6, $a5, 2 + add.d $t2, $a6, $s6 + alsl.d $t2, $t2, $a5, 2 + sltu $t2, $s1, $t2 + sltu $t3, $t1, $t0 + and $t2, $t2, $t3 + bnez $t2, .LBB6_10 +# %bb.44: # %vector.memcheck351 + alsl.d $t2, $a4, $a5, 2 + add.d $t3, $a4, $s6 + alsl.d $t3, $t3, $a5, 2 + sltu $t3, $s1, $t3 + sltu $t0, $t2, $t0 + and $t0, $t3, $t0 + bnez $t0, .LBB6_10 +# %bb.45: # %vector.ph374 + bstrpick.d $a7, $s6, 30, 3 + slli.d $a7, $a7, 3 + vldrepl.w $vr0, $a0, 0 + vldrepl.w $vr1, $a1, 0 + addi.d $t0, $s1, 16 + addi.d $t1, $t1, 16 + addi.d $t2, $t2, 16 + move $t3, $a7 + .p2align 4, , 16 +.LBB6_46: # %vector.body377 + # =>This Inner Loop Header: Depth=1 + vld $vr2, $t2, -16 + vld $vr3, $t0, -16 + vld $vr4, $t0, 0 + vld $vr5, $t2, 0 + vfmul.s $vr2, $vr0, $vr2 + vfadd.s $vr2, $vr3, $vr2 + vld $vr3, $t1, -16 + vld $vr6, $t1, 0 + vfmul.s $vr5, $vr0, $vr5 + vfadd.s $vr4, $vr4, $vr5 + vfmul.s $vr3, $vr1, $vr3 + vfmul.s $vr5, $vr1, $vr6 + vfadd.s $vr2, $vr2, $vr3 + vfadd.s $vr3, $vr4, $vr5 + vst $vr2, $t0, -16 + vst $vr3, $t0, 0 + addi.d $t3, $t3, -8 + addi.d $t0, $t0, 32 + addi.d $t1, $t1, 32 + addi.d $t2, $t2, 32 + bnez $t3, .LBB6_46 +# %bb.47: # %middle.block390 + bne $a7, $s6, .LBB6_10 + b .LBB6_12 +.LBB6_48: # %vector.ph541 + move $s2, $zero + bstrpick.d $a2, $s6, 30, 2 + slli.d $s3, $a2, 2 + vldrepl.w $vr0, $a7, 0 + vldrepl.w $vr1, $t0, 0 + vldrepl.w $vr2, $t1, 0 + vldrepl.w $vr3, $t2, 0 + vldrepl.w $vr4, $t3, 0 + vldrepl.w $vr5, $t4, 0 + vldrepl.w $vr6, $t5, 0 + vldrepl.w $vr7, $t6, 0 + slli.d $a2, $s6, 2 + bstrpick.d $a2, $a2, 32, 4 + slli.d $s4, $a2, 4 + ld.d $a2, $sp, 288 # 8-byte Folded Reload + .p2align 4, , 16 +.LBB6_49: # %vector.body544 + # =>This Inner Loop Header: Depth=1 + vldx $vr8, $t7, $s2 + vldx $vr9, $s5, $s2 + vldx $vr10, $a1, $s2 + vfmul.s $vr8, $vr0, $vr8 + vfadd.s $vr8, $vr9, $vr8 + vldx $vr9, $a4, $s2 + vfmul.s $vr10, $vr1, $vr10 + vfadd.s $vr8, $vr8, $vr10 + vldx $vr10, $t8, $s2 + vfmul.s $vr9, $vr2, $vr9 + vfadd.s $vr8, $vr8, $vr9 + vldx $vr9, $fp, $s2 + vfmul.s $vr10, $vr3, $vr10 + vfadd.s $vr8, $vr8, $vr10 + vldx $vr10, $a0, $s2 + vfmul.s $vr9, $vr4, $vr9 + vfadd.s $vr8, $vr8, $vr9 + vldx $vr9, $s0, $s2 + vfmul.s $vr10, $vr5, $vr10 + vldx $vr11, $s1, $s2 + vfadd.s $vr8, $vr8, $vr10 + vfmul.s $vr9, $vr6, $vr9 + vfadd.s $vr8, $vr8, $vr9 + vfmul.s $vr9, $vr7, $vr11 + vfadd.s $vr8, $vr8, $vr9 + vstx $vr8, $s5, $s2 + addi.d $s2, $s2, 16 + bne $s4, $s2, .LBB6_49 +# %bb.50: # %middle.block572 + beq $s3, $s6, .LBB6_53 +.LBB6_51: # %scalar.ph539.preheader + slli.d $s2, $s3, 2 + sub.d $s3, $s6, $s3 + move $s4, $s5 + .p2align 4, , 16 +.LBB6_52: # %scalar.ph539 + # =>This Inner Loop Header: Depth=1 + fldx.s $fa0, $s4, $s2 fld.s $fa1, $a7, 0 - fldx.s $fa2, $t7, $a1 + fldx.s $fa2, $t7, $s2 fld.s $fa3, $t0, 0 - fldx.s $fa4, $t8, $a1 + fldx.s $fa4, $a1, $s2 fmul.s $fa1, $fa1, $fa2 fadd.s $fa0, $fa0, $fa1 fmul.s $fa1, $fa3, $fa4 fld.s $fa2, $t1, 0 - fldx.s $fa3, $fp, $a1 + fldx.s $fa3, $a4, $s2 fld.s $fa4, $t2, 0 - fldx.s $fa5, $s0, $a1 + fldx.s $fa5, $t8, $s2 fadd.s $fa0, $fa0, $fa1 fmul.s $fa1, $fa2, $fa3 fadd.s $fa0, $fa0, $fa1 fmul.s $fa1, $fa4, $fa5 fld.s $fa2, $t3, 0 - fldx.s $fa3, $s1, $a1 + fldx.s $fa3, $fp, $s2 fld.s $fa4, $t4, 0 - fldx.s $fa5, $s2, $a1 + fldx.s $fa5, $a0, $s2 fadd.s $fa0, $fa0, $fa1 fmul.s $fa1, $fa2, $fa3 fadd.s $fa0, $fa0, $fa1 fmul.s $fa1, $fa4, $fa5 fld.s $fa2, $t5, 0 - fldx.s $fa3, $s3, $a1 + fldx.s $fa3, $s0, $s2 fld.s $fa4, $t6, 0 - fldx.s $fa5, $s4, $a1 + fldx.s $fa5, $s1, $s2 fadd.s $fa0, $fa0, $fa1 fmul.s $fa1, $fa2, $fa3 fadd.s $fa0, $fa0, $fa1 fmul.s $fa1, $fa4, $fa5 fadd.s $fa0, $fa0, $fa1 - fstx.s $fa0, $a4, $a1 - addi.d $s4, $s4, 4 - addi.d $s3, $s3, 4 - addi.d $s2, $s2, 4 + fstx.s $fa0, $s4, $s2 addi.d $s1, $s1, 4 addi.d $s0, $s0, 4 + addi.d $a0, $a0, 4 addi.d $fp, $fp, 4 addi.d $t8, $t8, 4 - addi.d $t7, $t7, 4 - addi.d $a0, $a0, -1 addi.d $a4, $a4, 4 - bnez $a0, .LBB6_24 -.LBB6_25: # %.loopexit - st.d $s5, $sp, 48 # 8-byte Folded Spill + addi.d $a1, $a1, 4 + addi.d $t7, $t7, 4 + addi.d $s3, $s3, -1 + addi.d $s4, $s4, 4 + bnez $s3, .LBB6_52 +.LBB6_53: # %.loopexit addi.d $a0, $a6, 15 addi.w $a1, $a0, 0 - bge $a1, $a2, .LBB6_36 -# %bb.26: # %.preheader.lr.ph - bstrpick.d $s1, $a0, 31, 0 - bstrpick.d $a4, $a2, 31, 0 - bstrpick.d $s2, $s6, 31, 0 - ld.d $s0, $sp, 48 # 8-byte Folded Reload - alsl.d $a0, $s2, $s0, 2 - ld.d $a6, $sp, 32 # 8-byte Folded Reload - alsl.d $a1, $s1, $a6, 2 - addi.d $t3, $a1, -60 - addi.d $a1, $s1, 16 - sltu $a2, $a4, $a1 - st.d $a4, $sp, 64 # 8-byte Folded Spill - masknez $a4, $a4, $a2 - maskeqz $a1, $a1, $a2 - or $a1, $a1, $a4 - nor $a2, $s1, $zero - add.d $a1, $a1, $a2 - slli.d $a1, $a1, 2 - bstrins.d $a1, $zero, 5, 0 - alsl.d $a1, $s1, $a1, 2 - add.d $a2, $a6, $a1 - addi.d $t7, $a2, 4 - mul.d $a2, $a3, $s1 - alsl.d $s3, $a2, $a5, 2 - mul.d $a2, $a1, $a3 + bge $a1, $a2, .LBB6_64 +# %bb.54: # %.preheader.lr.ph + bstrpick.d $s2, $a0, 31, 0 + bstrpick.d $a6, $a2, 31, 0 + bstrpick.d $s3, $s6, 31, 0 + ld.d $s1, $sp, 48 # 8-byte Folded Reload + alsl.d $a0, $s3, $s1, 2 + ld.d $a7, $sp, 32 # 8-byte Folded Reload + alsl.d $a1, $s2, $a7, 2 + addi.d $a1, $a1, -60 + addi.d $a2, $s2, 16 + sltu $a4, $a6, $a2 + st.d $a6, $sp, 64 # 8-byte Folded Spill + masknez $a6, $a6, $a4 + maskeqz $a2, $a2, $a4 + or $a2, $a2, $a6 + nor $a4, $s2, $zero + add.d $a2, $a2, $a4 + slli.d $a2, $a2, 2 + bstrins.d $a2, $zero, 5, 0 + alsl.d $a4, $s2, $a2, 2 + add.d $a2, $a7, $a4 + addi.d $a7, $a2, 4 + mul.d $a2, $a3, $s2 + alsl.d $s4, $a2, $a5, 2 + mul.d $a2, $a4, $a3 add.d $a2, $a5, $a2 - alsl.d $fp, $s2, $a2, 2 - addi.d $a2, $a1, -4 + alsl.d $t1, $s3, $a2, 2 + addi.d $a2, $a4, -4 mul.d $a2, $a2, $a3 add.d $a2, $a5, $a2 - alsl.d $t8, $s2, $a2, 2 + alsl.d $t4, $s3, $a2, 2 addi.w $a2, $zero, -2 lu52i.d $a2, $a2, 1023 - add.d $a2, $s1, $a2 - mul.d $a2, $a2, $a3 - alsl.d $t4, $a2, $a5, 2 - addi.d $a2, $a1, -8 - mul.d $a2, $a2, $a3 - add.d $a2, $a5, $a2 - alsl.d $t6, $s2, $a2, 2 - addi.w $a2, $zero, -3 - lu52i.d $a2, $a2, 1023 - add.d $a2, $s1, $a2 - mul.d $a2, $a2, $a3 - alsl.d $t1, $a2, $a5, 2 - addi.d $a2, $a1, -12 - mul.d $a2, $a2, $a3 - add.d $a2, $a5, $a2 - alsl.d $t5, $s2, $a2, 2 - addi.w $a2, $zero, -4 - lu52i.d $a2, $a2, 1023 - add.d $a2, $s1, $a2 - mul.d $a2, $a2, $a3 - alsl.d $a7, $a2, $a5, 2 - addi.d $a2, $a1, -16 - mul.d $a2, $a2, $a3 - add.d $a2, $a5, $a2 - alsl.d $t2, $s2, $a2, 2 - addi.w $a2, $zero, -5 - lu52i.d $a2, $a2, 1023 - add.d $a2, $s1, $a2 - mul.d $a2, $a2, $a3 - alsl.d $a4, $a2, $a5, 2 - addi.d $a2, $a1, -20 - mul.d $a2, $a2, $a3 - add.d $a2, $a5, $a2 - alsl.d $t0, $s2, $a2, 2 - addi.w $a2, $zero, -6 - lu52i.d $a2, $a2, 1023 - add.d $a2, $s1, $a2 + add.d $a2, $s2, $a2 mul.d $a2, $a2, $a3 alsl.d $a2, $a2, $a5, 2 - addi.d $a6, $a1, -24 + addi.d $a6, $a4, -8 mul.d $a6, $a6, $a3 add.d $a6, $a5, $a6 - alsl.d $a6, $s2, $a6, 2 - sltu $t7, $s0, $t7 - sltu $t3, $t3, $a0 - and $t7, $t7, $t3 - sltu $t3, $s0, $fp - st.d $s3, $sp, 192 # 8-byte Folded Spill - sltu $fp, $s3, $a0 - and $fp, $t3, $fp - slti $t3, $a3, 0 - or $fp, $fp, $t3 - or $fp, $t7, $fp - sltu $t7, $s0, $t8 - addi.d $t8, $s1, -1 + alsl.d $t5, $s3, $a6, 2 + addi.w $a6, $zero, -3 + lu52i.d $a6, $a6, 1023 + add.d $a6, $s2, $a6 + mul.d $a6, $a6, $a3 + alsl.d $a6, $a6, $a5, 2 + addi.d $t0, $a4, -12 + mul.d $t0, $t0, $a3 + add.d $t0, $a5, $t0 + alsl.d $t6, $s3, $t0, 2 + addi.w $t0, $zero, -4 + lu52i.d $t0, $t0, 1023 + add.d $t0, $s2, $t0 + mul.d $t0, $t0, $a3 + alsl.d $t0, $t0, $a5, 2 + addi.d $t2, $a4, -16 + mul.d $t2, $t2, $a3 + add.d $t2, $a5, $t2 + alsl.d $t7, $s3, $t2, 2 + addi.w $t2, $zero, -5 + lu52i.d $t2, $t2, 1023 + add.d $t2, $s2, $t2 + mul.d $t2, $t2, $a3 + alsl.d $t2, $t2, $a5, 2 + addi.d $t3, $a4, -20 + mul.d $t3, $t3, $a3 + add.d $t3, $a5, $t3 + alsl.d $t8, $s3, $t3, 2 + addi.w $t3, $zero, -6 + lu52i.d $t3, $t3, 1023 + add.d $t3, $s2, $t3 + mul.d $t3, $t3, $a3 + alsl.d $t3, $t3, $a5, 2 + addi.d $fp, $a4, -24 + mul.d $fp, $fp, $a3 + add.d $fp, $a5, $fp + alsl.d $fp, $s3, $fp, 2 + addi.d $s0, $a4, -52 + mul.d $s0, $s0, $a3 + add.d $s0, $a5, $s0 + alsl.d $s0, $s3, $s0, 2 + vinsgr2vr.d $vr0, $s0, 0 + addi.d $s0, $a4, -56 + mul.d $s0, $s0, $a3 + add.d $s0, $a5, $s0 + alsl.d $s0, $s3, $s0, 2 + vinsgr2vr.d $vr0, $s0, 1 + addi.d $s0, $a4, -44 + mul.d $s0, $s0, $a3 + add.d $s0, $a5, $s0 + alsl.d $s0, $s3, $s0, 2 + vinsgr2vr.d $vr1, $s0, 0 + addi.d $s0, $a4, -48 + mul.d $s0, $s0, $a3 + add.d $s0, $a5, $s0 + alsl.d $s0, $s3, $s0, 2 + vinsgr2vr.d $vr1, $s0, 1 + addi.d $s0, $a4, -36 + mul.d $s0, $s0, $a3 + add.d $s0, $a5, $s0 + alsl.d $s0, $s3, $s0, 2 + vinsgr2vr.d $vr2, $s0, 0 + addi.d $s0, $a4, -40 + mul.d $s0, $s0, $a3 + add.d $s0, $a5, $s0 + alsl.d $s0, $s3, $s0, 2 + vinsgr2vr.d $vr2, $s0, 1 + addi.d $s0, $a4, -28 + mul.d $s0, $s0, $a3 + add.d $s0, $a5, $s0 + alsl.d $s0, $s3, $s0, 2 + vinsgr2vr.d $vr3, $s0, 0 + addi.d $s0, $a4, -32 + mul.d $s0, $s0, $a3 + add.d $s0, $a5, $s0 + alsl.d $s0, $s3, $s0, 2 + vinsgr2vr.d $vr3, $s0, 1 + addi.w $s0, $zero, -7 + lu52i.d $s0, $s0, 1023 + add.d $s0, $s2, $s0 + mul.d $s0, $s0, $a3 + alsl.d $s0, $s0, $a5, 2 + vinsgr2vr.d $vr4, $t8, 0 + addi.w $t8, $zero, -8 + lu52i.d $t8, $t8, 1023 + add.d $t8, $s2, $t8 mul.d $t8, $t8, $a3 alsl.d $t8, $t8, $a5, 2 - st.d $t8, $sp, 184 # 8-byte Folded Spill - sltu $t8, $t8, $a0 - and $t8, $t7, $t8 - addi.w $t7, $zero, -7 - lu52i.d $t7, $t7, 1023 - add.d $t7, $s1, $t7 - mul.d $t7, $t7, $a3 - alsl.d $t7, $t7, $a5, 2 - or $fp, $t8, $fp - addi.d $t8, $a1, -28 - mul.d $t8, $t8, $a3 - add.d $t8, $a5, $t8 - alsl.d $t8, $s2, $t8, 2 - sltu $t6, $s0, $t6 - sltu $t4, $t4, $a0 - and $t4, $t6, $t4 - addi.w $t6, $zero, -8 + vinsgr2vr.d $vr4, $fp, 1 + addi.w $fp, $zero, -9 + lu52i.d $fp, $fp, 1023 + add.d $fp, $s2, $fp + mul.d $fp, $fp, $a3 + alsl.d $fp, $fp, $a5, 2 + vinsgr2vr.d $vr5, $t6, 0 + addi.w $t6, $zero, -10 lu52i.d $t6, $t6, 1023 - add.d $t6, $s1, $t6 + add.d $t6, $s2, $t6 mul.d $t6, $t6, $a3 alsl.d $t6, $t6, $a5, 2 - or $t4, $t4, $fp - addi.d $fp, $a1, -32 - mul.d $fp, $fp, $a3 - add.d $fp, $a5, $fp - alsl.d $fp, $s2, $fp, 2 - sltu $t5, $s0, $t5 - sltu $t1, $t1, $a0 - and $t5, $t5, $t1 - addi.w $t1, $zero, -9 + vinsgr2vr.d $vr5, $t7, 1 + addi.w $t7, $zero, -11 + lu52i.d $t7, $t7, 1023 + add.d $t7, $s2, $t7 + mul.d $t7, $t7, $a3 + alsl.d $t7, $t7, $a5, 2 + vinsgr2vr.d $vr6, $t4, 0 + addi.w $t4, $zero, -12 + lu52i.d $t4, $t4, 1023 + add.d $t4, $s2, $t4 + mul.d $t4, $t4, $a3 + alsl.d $t4, $t4, $a5, 2 + vinsgr2vr.d $vr6, $t5, 1 + addi.w $t5, $zero, -13 + lu52i.d $t5, $t5, 1023 + add.d $t5, $s2, $t5 + mul.d $t5, $t5, $a3 + alsl.d $t5, $t5, $a5, 2 + vinsgr2vr.d $vr7, $t1, 0 + addi.w $t1, $zero, -14 lu52i.d $t1, $t1, 1023 - add.d $t1, $s1, $t1 + add.d $t1, $s2, $t1 mul.d $t1, $t1, $a3 alsl.d $t1, $t1, $a5, 2 - or $t5, $t5, $t4 - addi.d $t4, $a1, -36 - mul.d $t4, $t4, $a3 - add.d $t4, $a5, $t4 - alsl.d $t4, $s2, $t4, 2 - sltu $t2, $s0, $t2 - sltu $a7, $a7, $a0 - and $t2, $t2, $a7 - addi.w $a7, $zero, -10 + vinsgr2vr.d $vr7, $a7, 1 + addi.w $a7, $zero, -15 lu52i.d $a7, $a7, 1023 - add.d $a7, $s1, $a7 + add.d $a7, $s2, $a7 mul.d $a7, $a7, $a3 alsl.d $a7, $a7, $a5, 2 - or $t5, $t2, $t5 - addi.d $t2, $a1, -40 - mul.d $t2, $t2, $a3 - add.d $t2, $a5, $t2 - alsl.d $t2, $s2, $t2, 2 - sltu $t0, $s0, $t0 - sltu $a4, $a4, $a0 - and $t0, $t0, $a4 - addi.w $a4, $zero, -11 - lu52i.d $a4, $a4, 1023 - add.d $a4, $s1, $a4 + addi.d $a4, $a4, -60 mul.d $a4, $a4, $a3 - alsl.d $a4, $a4, $a5, 2 - or $t5, $t0, $t5 - addi.d $t0, $a1, -44 - mul.d $t0, $t0, $a3 - add.d $t0, $a5, $t0 - alsl.d $t0, $s2, $t0, 2 - sltu $a6, $s0, $a6 - sltu $a2, $a2, $a0 - and $a2, $a6, $a2 - addi.w $a6, $zero, -12 - lu52i.d $a6, $a6, 1023 - add.d $a6, $s1, $a6 + add.d $a4, $a5, $a4 + st.d $s3, $sp, 40 # 8-byte Folded Spill + alsl.d $a4, $s3, $a4, 2 + vreplgr2vr.d $vr8, $s1 + vinsgr2vr.d $vr9, $t5, 0 + vinsgr2vr.d $vr9, $t1, 1 + vinsgr2vr.d $vr10, $t7, 0 + vinsgr2vr.d $vr10, $t4, 1 + vinsgr2vr.d $vr11, $fp, 0 + vinsgr2vr.d $vr11, $t6, 1 + vinsgr2vr.d $vr12, $s0, 0 + vinsgr2vr.d $vr12, $t8, 1 + vinsgr2vr.d $vr13, $t2, 0 + vinsgr2vr.d $vr13, $t3, 1 + vinsgr2vr.d $vr14, $a6, 0 + vinsgr2vr.d $vr14, $t0, 1 + addi.d $a6, $s2, -1 mul.d $a6, $a6, $a3 alsl.d $a6, $a6, $a5, 2 - or $a2, $a2, $t5 - addi.d $t5, $a1, -48 - mul.d $t5, $t5, $a3 - add.d $t5, $a5, $t5 - alsl.d $t5, $s2, $t5, 2 - sltu $t8, $s0, $t8 - sltu $t7, $t7, $a0 - and $t7, $t8, $t7 - addi.w $t8, $zero, -13 - lu52i.d $t8, $t8, 1023 - add.d $t8, $s1, $t8 - mul.d $t8, $t8, $a3 - alsl.d $t8, $t8, $a5, 2 - or $a2, $t7, $a2 - addi.d $t7, $a1, -52 - mul.d $t7, $t7, $a3 - add.d $t7, $a5, $t7 - alsl.d $t7, $s2, $t7, 2 - sltu $fp, $s0, $fp - sltu $t6, $t6, $a0 - and $t6, $fp, $t6 - addi.w $fp, $zero, -14 - lu52i.d $fp, $fp, 1023 - add.d $fp, $s1, $fp - mul.d $fp, $fp, $a3 - alsl.d $fp, $fp, $a5, 2 - or $t3, $t6, $t3 - addi.d $t6, $a1, -56 - mul.d $t6, $t6, $a3 - add.d $t6, $a5, $t6 - alsl.d $t6, $s2, $t6, 2 - or $a2, $a2, $t3 - addi.w $t3, $zero, -15 - lu52i.d $t3, $t3, 1023 - add.d $t3, $s1, $t3 - mul.d $t3, $t3, $a3 - alsl.d $t3, $t3, $a5, 2 - addi.d $a1, $a1, -60 - mul.d $a1, $a1, $a3 - add.d $a1, $a5, $a1 - st.d $s2, $sp, 40 # 8-byte Folded Spill - alsl.d $a1, $s2, $a1, 2 - sltu $t4, $s0, $t4 - sltu $t1, $t1, $a0 - and $t1, $t4, $t1 - or $a2, $t1, $a2 - sltu $t1, $s0, $t2 - sltu $a7, $a7, $a0 - and $a7, $t1, $a7 - or $a2, $a7, $a2 - sltu $a7, $s0, $t0 - sltu $a4, $a4, $a0 - and $a4, $a7, $a4 - or $a2, $a4, $a2 - sltu $a4, $s0, $t5 - sltu $a6, $a6, $a0 - and $a4, $a4, $a6 - or $a2, $a4, $a2 - sltu $a4, $s0, $t7 - sltu $a6, $t8, $a0 - and $a4, $a4, $a6 - or $a2, $a4, $a2 - sltu $a4, $s0, $t6 - sltu $a6, $fp, $a0 - and $a4, $a4, $a6 - or $a2, $a4, $a2 - sltu $a1, $s0, $a1 - sltu $a0, $t3, $a0 - and $a0, $a1, $a0 - or $a0, $a0, $a2 + st.d $a6, $sp, 184 # 8-byte Folded Spill + vinsgr2vr.d $vr15, $a6, 0 + vinsgr2vr.d $vr15, $a2, 1 + st.d $s4, $sp, 192 # 8-byte Folded Spill + vinsgr2vr.d $vr16, $s4, 0 + vinsgr2vr.d $vr16, $a1, 1 + vreplgr2vr.d $vr17, $a0 + vslt.du $vr7, $vr8, $vr7 + vslt.du $vr6, $vr8, $vr6 + vpickev.w $vr6, $vr6, $vr7 + vslt.du $vr5, $vr8, $vr5 + vslt.du $vr4, $vr8, $vr4 + vpickev.w $vr4, $vr4, $vr5 + vpickev.h $vr4, $vr4, $vr6 + vslt.du $vr3, $vr8, $vr3 + vslt.du $vr2, $vr8, $vr2 + vpickev.w $vr2, $vr2, $vr3 + vslt.du $vr1, $vr8, $vr1 + vslt.du $vr0, $vr8, $vr0 + vpickev.w $vr0, $vr0, $vr1 + vpickev.h $vr0, $vr0, $vr2 + vpickev.b $vr0, $vr0, $vr4 + vslt.du $vr1, $vr16, $vr17 + vslt.du $vr2, $vr15, $vr17 + vpickev.w $vr1, $vr2, $vr1 + vslt.du $vr2, $vr14, $vr17 + vslt.du $vr3, $vr13, $vr17 + vpickev.w $vr2, $vr3, $vr2 + vpickev.h $vr1, $vr2, $vr1 + vslt.du $vr2, $vr12, $vr17 + vslt.du $vr3, $vr11, $vr17 + vpickev.w $vr2, $vr3, $vr2 + vslt.du $vr3, $vr10, $vr17 + vslt.du $vr4, $vr9, $vr17 + vpickev.w $vr3, $vr4, $vr3 + vpickev.h $vr2, $vr3, $vr2 + vpickev.b $vr1, $vr2, $vr1 + slti $a1, $a3, 0 + vand.v $vr0, $vr0, $vr1 + sltu $a2, $s1, $a4 + sltu $a0, $a7, $a0 + and $a0, $a2, $a0 + vslli.b $vr0, $vr0, 7 + vmskltz.b $vr0, $vr0 + vpickve2gr.hu $a2, $vr0, 0 + sltu $a2, $zero, $a2 + or $a0, $a2, $a0 + or $a0, $a0, $a1 st.d $a0, $sp, 24 # 8-byte Folded Spill bstrpick.d $a0, $s6, 30, 2 slli.d $a0, $a0, 2 st.d $a0, $sp, 16 # 8-byte Folded Spill - addi.d $a0, $s1, -15 + addi.d $a0, $s2, -15 mul.d $a0, $a0, $a3 alsl.d $a0, $a0, $a5, 2 st.d $a0, $sp, 208 # 8-byte Folded Spill - addi.d $a0, $s1, -2 + addi.d $a0, $s2, -2 mul.d $a0, $a0, $a3 alsl.d $a0, $a0, $a5, 2 st.d $a0, $sp, 176 # 8-byte Folded Spill - addi.d $a0, $s1, -3 + addi.d $a0, $s2, -3 mul.d $a0, $a0, $a3 alsl.d $a0, $a0, $a5, 2 st.d $a0, $sp, 168 # 8-byte Folded Spill - addi.d $a0, $s1, -4 + addi.d $a0, $s2, -4 mul.d $a0, $a0, $a3 alsl.d $a0, $a0, $a5, 2 st.d $a0, $sp, 160 # 8-byte Folded Spill - addi.d $a0, $s1, -5 + addi.d $a0, $s2, -5 mul.d $a0, $a0, $a3 alsl.d $a0, $a0, $a5, 2 st.d $a0, $sp, 152 # 8-byte Folded Spill - addi.d $a0, $s1, -6 + addi.d $a0, $s2, -6 mul.d $a0, $a0, $a3 alsl.d $a0, $a0, $a5, 2 st.d $a0, $sp, 144 # 8-byte Folded Spill - addi.d $a0, $s1, -7 + addi.d $a0, $s2, -7 mul.d $a0, $a0, $a3 alsl.d $a0, $a0, $a5, 2 st.d $a0, $sp, 136 # 8-byte Folded Spill - addi.d $a0, $s1, -8 + addi.d $a0, $s2, -8 mul.d $a0, $a0, $a3 alsl.d $a0, $a0, $a5, 2 st.d $a0, $sp, 128 # 8-byte Folded Spill - addi.d $a0, $s1, -9 + addi.d $a0, $s2, -9 mul.d $a0, $a0, $a3 alsl.d $a0, $a0, $a5, 2 st.d $a0, $sp, 120 # 8-byte Folded Spill - addi.d $a0, $s1, -10 + addi.d $a0, $s2, -10 mul.d $a0, $a0, $a3 alsl.d $a0, $a0, $a5, 2 st.d $a0, $sp, 112 # 8-byte Folded Spill - addi.d $a0, $s1, -11 + addi.d $a0, $s2, -11 mul.d $a0, $a0, $a3 alsl.d $a0, $a0, $a5, 2 st.d $a0, $sp, 104 # 8-byte Folded Spill - addi.d $a0, $s1, -12 + addi.d $a0, $s2, -12 mul.d $a0, $a0, $a3 alsl.d $a0, $a0, $a5, 2 st.d $a0, $sp, 96 # 8-byte Folded Spill - addi.d $a0, $s1, -13 + addi.d $a0, $s2, -13 mul.d $a0, $a0, $a3 alsl.d $a0, $a0, $a5, 2 st.d $a0, $sp, 88 # 8-byte Folded Spill - st.d $s1, $sp, 200 # 8-byte Folded Spill - addi.d $a0, $s1, -14 + st.d $s2, $sp, 200 # 8-byte Folded Spill + addi.d $a0, $s2, -14 mul.d $a0, $a0, $a3 alsl.d $a0, $a0, $a5, 2 st.d $a0, $sp, 80 # 8-byte Folded Spill slli.d $a0, $a3, 6 st.d $a0, $sp, 56 # 8-byte Folded Spill slli.d $a3, $a3, 2 - b .LBB6_28 + st.d $s6, $sp, 72 # 8-byte Folded Spill + b .LBB6_56 .p2align 4, , 16 -.LBB6_27: # %._crit_edge - # in Loop: Header=BB6_28 Depth=1 +.LBB6_55: # %._crit_edge + # in Loop: Header=BB6_56 Depth=1 ld.d $a0, $sp, 200 # 8-byte Folded Reload addi.d $a0, $a0, 16 ld.d $a1, $sp, 208 # 8-byte Folded Reload @@ -3598,14 +3957,14 @@ dmxpy: # @dmxpy ld.d $s6, $sp, 72 # 8-byte Folded Reload st.d $a0, $sp, 200 # 8-byte Folded Spill ld.d $a1, $sp, 64 # 8-byte Folded Reload - bgeu $a0, $a1, .LBB6_36 -.LBB6_28: # %.preheader + bgeu $a0, $a1, .LBB6_64 +.LBB6_56: # %.preheader # =>This Loop Header: Depth=1 - # Child Loop BB6_32 Depth 2 - # Child Loop BB6_35 Depth 2 - blez $s6, .LBB6_27 -# %bb.29: # %.lr.ph254 - # in Loop: Header=BB6_28 Depth=1 + # Child Loop BB6_60 Depth 2 + # Child Loop BB6_63 Depth 2 + blez $s6, .LBB6_55 +# %bb.57: # %.lr.ph254 + # in Loop: Header=BB6_56 Depth=1 ld.d $a0, $sp, 72 # 8-byte Folded Reload sltui $a0, $a0, 8 ld.d $a1, $sp, 32 # 8-byte Folded Reload @@ -3638,20 +3997,20 @@ dmxpy: # @dmxpy st.d $t4, $sp, 240 # 8-byte Folded Spill st.d $a1, $sp, 232 # 8-byte Folded Spill st.d $fp, $sp, 224 # 8-byte Folded Spill - beqz $a0, .LBB6_31 -# %bb.30: # in Loop: Header=BB6_28 Depth=1 + beqz $a0, .LBB6_59 +# %bb.58: # in Loop: Header=BB6_56 Depth=1 move $a0, $zero move $t0, $s0 - move $t1, $s1 - move $a6, $s2 + move $a6, $s1 + move $t1, $s2 move $t5, $s3 move $t6, $s4 move $t7, $s5 move $t8, $s6 - b .LBB6_34 + b .LBB6_62 .p2align 4, , 16 -.LBB6_31: # %vector.ph695 - # in Loop: Header=BB6_28 Depth=1 +.LBB6_59: # %vector.ph695 + # in Loop: Header=BB6_56 Depth=1 vldrepl.w $vr0, $a2, 0 vldrepl.w $vr1, $a4, 0 vldrepl.w $vr2, $a5, 0 @@ -3679,8 +4038,8 @@ dmxpy: # @dmxpy ld.d $a1, $sp, 208 # 8-byte Folded Reload ld.d $a2, $sp, 16 # 8-byte Folded Reload .p2align 4, , 16 -.LBB6_32: # %vector.body698 - # Parent Loop BB6_28 Depth=1 +.LBB6_60: # %vector.body698 + # Parent Loop BB6_56 Depth=1 # => This Inner Loop Header: Depth=2 vld $vr16, $a1, 0 vld $vr17, $a0, 0 @@ -3749,22 +4108,22 @@ dmxpy: # @dmxpy addi.d $a2, $a2, -4 addi.d $a1, $a1, 16 addi.d $a0, $a0, 16 - bnez $a2, .LBB6_32 -# %bb.33: # %middle.block750 - # in Loop: Header=BB6_28 Depth=1 + bnez $a2, .LBB6_60 +# %bb.61: # %middle.block750 + # in Loop: Header=BB6_56 Depth=1 ld.d $a2, $sp, 16 # 8-byte Folded Reload move $a0, $a2 ld.d $a1, $sp, 40 # 8-byte Folded Reload move $t0, $a5 - move $t1, $a7 - move $a6, $t2 + move $a6, $a7 + move $t1, $t2 move $t5, $t3 move $t6, $t4 move $t7, $fp move $t8, $s1 - beq $a2, $a1, .LBB6_27 -.LBB6_34: # %scalar.ph693.preheader - # in Loop: Header=BB6_28 Depth=1 + beq $a2, $a1, .LBB6_55 +.LBB6_62: # %scalar.ph693.preheader + # in Loop: Header=BB6_56 Depth=1 slli.d $fp, $a0, 2 ld.d $a1, $sp, 40 # 8-byte Folded Reload sub.d $s0, $a1, $a0 @@ -3778,23 +4137,23 @@ dmxpy: # @dmxpy ld.d $s2, $sp, 120 # 8-byte Folded Reload ld.d $s7, $sp, 128 # 8-byte Folded Reload ld.d $ra, $sp, 136 # 8-byte Folded Reload - ld.d $t2, $sp, 144 # 8-byte Folded Reload - ld.d $a4, $sp, 152 # 8-byte Folded Reload - ld.d $a2, $sp, 160 # 8-byte Folded Reload - ld.d $a0, $sp, 168 # 8-byte Folded Reload + ld.d $a2, $sp, 144 # 8-byte Folded Reload + ld.d $a1, $sp, 152 # 8-byte Folded Reload + ld.d $a4, $sp, 160 # 8-byte Folded Reload + ld.d $t2, $sp, 168 # 8-byte Folded Reload ld.d $a7, $sp, 176 # 8-byte Folded Reload - ld.d $a1, $sp, 184 # 8-byte Folded Reload + ld.d $a0, $sp, 184 # 8-byte Folded Reload ld.d $t3, $sp, 192 # 8-byte Folded Reload st.d $t8, $sp, 216 # 8-byte Folded Spill .p2align 4, , 16 -.LBB6_35: # %scalar.ph693 - # Parent Loop BB6_28 Depth=1 +.LBB6_63: # %scalar.ph693 + # Parent Loop BB6_56 Depth=1 # => This Inner Loop Header: Depth=2 fldx.s $fa0, $t4, $fp move $t8, $t7 move $t7, $t6 move $t6, $t5 - move $t5, $a6 + move $t5, $t1 ld.d $s8, $sp, 288 # 8-byte Folded Reload fld.s $fa1, $s8, 0 fldx.s $fa2, $s1, $fp @@ -3827,7 +4186,7 @@ dmxpy: # @dmxpy ld.d $s8, $sp, 240 # 8-byte Folded Reload fld.s $fa2, $s8, 0 ld.d $s8, $sp, 224 # 8-byte Folded Reload - move $a6, $t5 + move $t1, $t5 move $t5, $t6 move $t6, $t7 move $t7, $t8 @@ -3839,24 +4198,24 @@ dmxpy: # @dmxpy fmul.s $fa1, $fa2, $fa3 fadd.s $fa0, $fa0, $fa1 fmul.s $fa1, $fa4, $fa5 - fld.s $fa2, $t1, 0 + fld.s $fa2, $a6, 0 fldx.s $fa3, $ra, $fp - fld.s $fa4, $a6, 0 - fldx.s $fa5, $t2, $fp + fld.s $fa4, $t1, 0 + fldx.s $fa5, $a2, $fp fadd.s $fa0, $fa0, $fa1 fmul.s $fa1, $fa2, $fa3 fadd.s $fa0, $fa0, $fa1 fmul.s $fa1, $fa4, $fa5 fld.s $fa2, $t5, 0 - fldx.s $fa3, $a4, $fp + fldx.s $fa3, $a1, $fp fld.s $fa4, $t6, 0 - fldx.s $fa5, $a2, $fp + fldx.s $fa5, $a4, $fp fadd.s $fa0, $fa0, $fa1 fmul.s $fa1, $fa2, $fa3 fadd.s $fa0, $fa0, $fa1 fmul.s $fa1, $fa4, $fa5 fld.s $fa2, $t7, 0 - fldx.s $fa3, $a0, $fp + fldx.s $fa3, $t2, $fp fld.s $fa4, $t8, 0 fldx.s $fa5, $a7, $fp fadd.s $fa0, $fa0, $fa1 @@ -3865,7 +4224,7 @@ dmxpy: # @dmxpy fmul.s $fa1, $fa4, $fa5 ld.d $s8, $sp, 232 # 8-byte Folded Reload fld.s $fa2, $s8, 0 - fldx.s $fa3, $a1, $fp + fldx.s $fa3, $a0, $fp fld.s $fa4, $t0, 0 fldx.s $fa5, $t3, $fp fadd.s $fa0, $fa0, $fa1 @@ -3875,12 +4234,12 @@ dmxpy: # @dmxpy fadd.s $fa0, $fa0, $fa1 fstx.s $fa0, $t4, $fp addi.d $t3, $t3, 4 - addi.d $a1, $a1, 4 - addi.d $a7, $a7, 4 addi.d $a0, $a0, 4 - addi.d $a2, $a2, 4 - addi.d $a4, $a4, 4 + addi.d $a7, $a7, 4 addi.d $t2, $t2, 4 + addi.d $a4, $a4, 4 + addi.d $a1, $a1, 4 + addi.d $a2, $a2, 4 addi.d $ra, $ra, 4 addi.d $s7, $s7, 4 addi.d $s2, $s2, 4 @@ -3892,9 +4251,9 @@ dmxpy: # @dmxpy addi.d $s1, $s1, 4 addi.d $s0, $s0, -1 addi.d $t4, $t4, 4 - bnez $s0, .LBB6_35 - b .LBB6_27 -.LBB6_36: # %._crit_edge256 + bnez $s0, .LBB6_63 + b .LBB6_55 +.LBB6_64: # %._crit_edge256 ld.d $s8, $sp, 296 # 8-byte Folded Reload ld.d $s7, $sp, 304 # 8-byte Folded Reload ld.d $s6, $sp, 312 # 8-byte Folded Reload @@ -3908,386 +4267,6 @@ dmxpy: # @dmxpy ld.d $ra, $sp, 376 # 8-byte Folded Reload addi.d $sp, $sp, 384 ret -.LBB6_37: # %vector.memcheck - alsl.d $a1, $s6, $s5, 2 - addi.d $a0, $s1, 4 - sltu $a0, $s5, $a0 - sltu $a4, $s1, $a1 - and $a4, $a0, $a4 - move $a0, $zero - bnez $a4, .LBB6_4 -# %bb.38: # %vector.memcheck - alsl.d $a4, $s6, $a5, 2 - sltu $a4, $s5, $a4 - sltu $a1, $a5, $a1 - and $a1, $a4, $a1 - bnez $a1, .LBB6_4 -# %bb.39: # %vector.ph - bstrpick.d $a0, $s6, 30, 3 - vldrepl.w $vr0, $s1, 0 - slli.d $a0, $a0, 3 - addi.d $a1, $a5, 16 - addi.d $a4, $s5, 16 - move $a6, $a0 - .p2align 4, , 16 -.LBB6_40: # %vector.body - # =>This Inner Loop Header: Depth=1 - vld $vr1, $a1, -16 - vld $vr2, $a1, 0 - vld $vr3, $a4, -16 - vld $vr4, $a4, 0 - vfmul.s $vr1, $vr0, $vr1 - vfmul.s $vr2, $vr0, $vr2 - vfadd.s $vr1, $vr3, $vr1 - vfadd.s $vr2, $vr4, $vr2 - vst $vr1, $a4, -16 - vst $vr2, $a4, 0 - addi.d $a6, $a6, -8 - addi.d $a1, $a1, 32 - addi.d $a4, $a4, 32 - bnez $a6, .LBB6_40 -# %bb.41: # %middle.block - bne $a0, $s6, .LBB6_4 - b .LBB6_6 -.LBB6_42: # %vector.memcheck458 - st.d $a4, $sp, 256 # 8-byte Folded Spill - st.d $s8, $sp, 248 # 8-byte Folded Spill - alsl.d $s7, $s6, $s5, 2 - addi.d $a4, $t6, 4 - sltu $a4, $s5, $a4 - move $s6, $s5 - sltu $s5, $t6, $s7 - and $a4, $a4, $s5 - move $s8, $zero - bnez $a4, .LBB6_80 -# %bb.43: # %vector.memcheck458 - addi.d $a4, $t5, 4 - sltu $a4, $s6, $a4 - sltu $s5, $t5, $s7 - and $a4, $a4, $s5 - bnez $a4, .LBB6_80 -# %bb.44: # %vector.memcheck458 - addi.d $a4, $t4, 4 - sltu $a4, $s6, $a4 - sltu $s5, $t4, $s7 - and $a4, $a4, $s5 - bnez $a4, .LBB6_80 -# %bb.45: # %vector.memcheck458 - addi.d $a4, $t3, 4 - sltu $a4, $s6, $a4 - sltu $s5, $t3, $s7 - and $a4, $a4, $s5 - bnez $a4, .LBB6_80 -# %bb.46: # %vector.memcheck458 - addi.d $a4, $t2, 4 - sltu $a4, $s6, $a4 - sltu $s5, $t2, $s7 - and $a4, $a4, $s5 - bnez $a4, .LBB6_80 -# %bb.47: # %vector.memcheck458 - addi.d $a4, $t1, 4 - sltu $a4, $s6, $a4 - sltu $s5, $t1, $s7 - and $a4, $a4, $s5 - bnez $a4, .LBB6_80 -# %bb.48: # %vector.memcheck458 - addi.d $a4, $t0, 4 - sltu $a4, $s6, $a4 - sltu $s5, $t0, $s7 - and $a4, $a4, $s5 - bnez $a4, .LBB6_80 -# %bb.49: # %vector.memcheck458 - addi.d $a4, $a7, 4 - sltu $a4, $s6, $a4 - sltu $s5, $a7, $s7 - and $a4, $a4, $s5 - bnez $a4, .LBB6_80 -# %bb.50: # %vector.memcheck458 - st.d $s8, $sp, 264 # 8-byte Folded Spill - ld.d $s8, $sp, 72 # 8-byte Folded Reload - add.d $a1, $a1, $s8 - alsl.d $a1, $a1, $a5, 2 - sltu $a1, $s6, $a1 - sltu $a4, $s4, $s7 - and $a1, $a1, $a4 - move $s5, $s6 - bnez $a1, .LBB6_81 -# %bb.51: # %vector.memcheck458 - add.d $a0, $a0, $s8 - alsl.d $a0, $a0, $a5, 2 - sltu $a0, $s5, $a0 - sltu $a1, $s3, $s7 - and $a0, $a0, $a1 - move $s6, $s8 - bnez $a0, .LBB6_79 -# %bb.52: # %vector.memcheck458 - add.d $a0, $ra, $s6 - alsl.d $a0, $a0, $a5, 2 - sltu $a0, $s5, $a0 - sltu $a1, $s2, $s7 - and $a0, $a0, $a1 - ld.d $s8, $sp, 264 # 8-byte Folded Reload - bnez $a0, .LBB6_23 -# %bb.53: # %vector.memcheck458 - ld.d $a0, $sp, 288 # 8-byte Folded Reload - add.d $a0, $a0, $s6 - alsl.d $a0, $a0, $a5, 2 - sltu $a0, $s5, $a0 - sltu $a1, $s1, $s7 - and $a0, $a0, $a1 - bnez $a0, .LBB6_23 -# %bb.54: # %vector.memcheck458 - ld.d $a0, $sp, 280 # 8-byte Folded Reload - add.d $a0, $a0, $s6 - alsl.d $a0, $a0, $a5, 2 - sltu $a0, $s5, $a0 - sltu $a1, $s0, $s7 - and $a0, $a0, $a1 - bnez $a0, .LBB6_23 -# %bb.55: # %vector.memcheck458 - ld.d $a0, $sp, 272 # 8-byte Folded Reload - add.d $a0, $a0, $s6 - alsl.d $a0, $a0, $a5, 2 - sltu $a0, $s5, $a0 - sltu $a1, $fp, $s7 - and $a0, $a0, $a1 - bnez $a0, .LBB6_23 -# %bb.56: # %vector.memcheck458 - ld.d $a0, $sp, 256 # 8-byte Folded Reload - add.d $a0, $a0, $s6 - alsl.d $a0, $a0, $a5, 2 - sltu $a0, $s5, $a0 - sltu $a1, $t8, $s7 - and $a0, $a0, $a1 - bnez $a0, .LBB6_23 -# %bb.57: # %vector.memcheck458 - ld.d $a0, $sp, 248 # 8-byte Folded Reload - add.d $a0, $a0, $s6 - alsl.d $a0, $a0, $a5, 2 - sltu $a0, $s5, $a0 - sltu $a1, $t7, $s7 - and $a0, $a0, $a1 - bnez $a0, .LBB6_23 -# %bb.58: # %vector.ph541 - move $a0, $zero - bstrpick.d $a1, $s6, 30, 2 - slli.d $s8, $a1, 2 - vldrepl.w $vr0, $a7, 0 - vldrepl.w $vr1, $t0, 0 - vldrepl.w $vr2, $t1, 0 - vldrepl.w $vr3, $t2, 0 - vldrepl.w $vr4, $t3, 0 - vldrepl.w $vr5, $t4, 0 - vldrepl.w $vr6, $t5, 0 - vldrepl.w $vr7, $t6, 0 - slli.d $a1, $s6, 2 - bstrpick.d $a1, $a1, 32, 4 - slli.d $a1, $a1, 4 - .p2align 4, , 16 -.LBB6_59: # %vector.body544 - # =>This Inner Loop Header: Depth=1 - vldx $vr8, $t7, $a0 - vldx $vr9, $s5, $a0 - vldx $vr10, $t8, $a0 - vfmul.s $vr8, $vr0, $vr8 - vfadd.s $vr8, $vr9, $vr8 - vldx $vr9, $fp, $a0 - vfmul.s $vr10, $vr1, $vr10 - vfadd.s $vr8, $vr8, $vr10 - vldx $vr10, $s0, $a0 - vfmul.s $vr9, $vr2, $vr9 - vfadd.s $vr8, $vr8, $vr9 - vldx $vr9, $s1, $a0 - vfmul.s $vr10, $vr3, $vr10 - vfadd.s $vr8, $vr8, $vr10 - vldx $vr10, $s2, $a0 - vfmul.s $vr9, $vr4, $vr9 - vfadd.s $vr8, $vr8, $vr9 - vldx $vr9, $s3, $a0 - vfmul.s $vr10, $vr5, $vr10 - vldx $vr11, $s4, $a0 - vfadd.s $vr8, $vr8, $vr10 - vfmul.s $vr9, $vr6, $vr9 - vfadd.s $vr8, $vr8, $vr9 - vfmul.s $vr9, $vr7, $vr11 - vfadd.s $vr8, $vr8, $vr9 - vstx $vr8, $s5, $a0 - addi.d $a0, $a0, 16 - bne $a1, $a0, .LBB6_59 -# %bb.60: # %middle.block572 - bne $s8, $s6, .LBB6_23 - b .LBB6_25 -.LBB6_61: # %vector.memcheck393 - alsl.d $fp, $t1, $s5, 2 - addi.d $t4, $a6, 4 - sltu $t4, $s5, $t4 - sltu $t5, $a6, $fp - and $t5, $t4, $t5 - move $t4, $zero - bnez $t5, .LBB6_17 -# %bb.62: # %vector.memcheck393 - addi.d $t5, $a4, 4 - sltu $t5, $s5, $t5 - sltu $t6, $a4, $fp - and $t5, $t5, $t6 - bnez $t5, .LBB6_17 -# %bb.63: # %vector.memcheck393 - addi.d $t5, $a1, 4 - sltu $t5, $s5, $t5 - sltu $t6, $a1, $fp - and $t5, $t5, $t6 - bnez $t5, .LBB6_17 -# %bb.64: # %vector.memcheck393 - addi.d $t5, $a0, 4 - sltu $t5, $s5, $t5 - sltu $t6, $a0, $fp - and $t5, $t5, $t6 - bnez $t5, .LBB6_17 -# %bb.65: # %vector.memcheck393 - alsl.d $t5, $t3, $a5, 2 - add.d $t6, $t3, $t1 - alsl.d $t6, $t6, $a5, 2 - sltu $t6, $s5, $t6 - sltu $t7, $t5, $fp - and $t6, $t6, $t7 - bnez $t6, .LBB6_17 -# %bb.66: # %vector.memcheck393 - alsl.d $t6, $t2, $a5, 2 - add.d $t7, $t2, $t1 - alsl.d $t7, $t7, $a5, 2 - sltu $t7, $s5, $t7 - sltu $t8, $t6, $fp - and $t7, $t7, $t8 - bnez $t7, .LBB6_17 -# %bb.67: # %vector.memcheck393 - alsl.d $t7, $t0, $a5, 2 - add.d $t8, $t0, $t1 - alsl.d $t8, $t8, $a5, 2 - sltu $t8, $s5, $t8 - sltu $s0, $t7, $fp - and $t8, $t8, $s0 - bnez $t8, .LBB6_17 -# %bb.68: # %vector.memcheck393 - alsl.d $t8, $a7, $a5, 2 - add.d $s0, $a7, $t1 - alsl.d $s0, $s0, $a5, 2 - sltu $s0, $s5, $s0 - sltu $fp, $t8, $fp - and $fp, $s0, $fp - bnez $fp, .LBB6_17 -# %bb.69: # %vector.ph436 - vldrepl.w $vr0, $a0, 0 - vldrepl.w $vr1, $a1, 0 - vldrepl.w $vr2, $a4, 0 - vldrepl.w $vr3, $a6, 0 - bstrpick.d $t4, $t1, 31, 2 - slli.d $t4, $t4, 2 - move $fp, $s5 - move $s0, $t4 - .p2align 4, , 16 -.LBB6_70: # %vector.body439 - # =>This Inner Loop Header: Depth=1 - vld $vr4, $t8, 0 - vld $vr5, $fp, 0 - vld $vr6, $t7, 0 - vfmul.s $vr4, $vr0, $vr4 - vfadd.s $vr4, $vr5, $vr4 - vld $vr5, $t6, 0 - vfmul.s $vr6, $vr1, $vr6 - vld $vr7, $t5, 0 - vfadd.s $vr4, $vr4, $vr6 - vfmul.s $vr5, $vr2, $vr5 - vfadd.s $vr4, $vr4, $vr5 - vfmul.s $vr5, $vr3, $vr7 - vfadd.s $vr4, $vr4, $vr5 - vst $vr4, $fp, 0 - addi.d $s0, $s0, -4 - addi.d $t5, $t5, 16 - addi.d $t6, $t6, 16 - addi.d $t7, $t7, 16 - addi.d $t8, $t8, 16 - addi.d $fp, $fp, 16 - bnez $s0, .LBB6_70 -# %bb.71: # %middle.block455 - bne $t4, $t1, .LBB6_17 - b .LBB6_19 -.LBB6_72: # %vector.memcheck351 - alsl.d $t0, $s6, $s5, 2 - addi.d $a7, $a1, 4 - sltu $a7, $s5, $a7 - sltu $t1, $a1, $t0 - and $t1, $a7, $t1 - move $a7, $zero - bnez $t1, .LBB6_10 -# %bb.73: # %vector.memcheck351 - addi.d $t1, $a0, 4 - sltu $t1, $s5, $t1 - sltu $t2, $a0, $t0 - and $t1, $t1, $t2 - bnez $t1, .LBB6_10 -# %bb.74: # %vector.memcheck351 - alsl.d $t1, $a6, $a5, 2 - add.d $t2, $a6, $s6 - alsl.d $t2, $t2, $a5, 2 - sltu $t2, $s5, $t2 - sltu $t3, $t1, $t0 - and $t2, $t2, $t3 - bnez $t2, .LBB6_10 -# %bb.75: # %vector.memcheck351 - alsl.d $t2, $a4, $a5, 2 - add.d $t3, $a4, $s6 - alsl.d $t3, $t3, $a5, 2 - sltu $t3, $s5, $t3 - sltu $t0, $t2, $t0 - and $t0, $t3, $t0 - bnez $t0, .LBB6_10 -# %bb.76: # %vector.ph374 - bstrpick.d $a7, $s6, 30, 3 - slli.d $a7, $a7, 3 - vldrepl.w $vr0, $a0, 0 - vldrepl.w $vr1, $a1, 0 - addi.d $t0, $s5, 16 - addi.d $t1, $t1, 16 - addi.d $t2, $t2, 16 - move $t3, $a7 - .p2align 4, , 16 -.LBB6_77: # %vector.body377 - # =>This Inner Loop Header: Depth=1 - vld $vr2, $t2, -16 - vld $vr3, $t0, -16 - vld $vr4, $t0, 0 - vld $vr5, $t2, 0 - vfmul.s $vr2, $vr0, $vr2 - vfadd.s $vr2, $vr3, $vr2 - vld $vr3, $t1, -16 - vld $vr6, $t1, 0 - vfmul.s $vr5, $vr0, $vr5 - vfadd.s $vr4, $vr4, $vr5 - vfmul.s $vr3, $vr1, $vr3 - vfmul.s $vr5, $vr1, $vr6 - vfadd.s $vr2, $vr2, $vr3 - vfadd.s $vr3, $vr4, $vr5 - vst $vr2, $t0, -16 - vst $vr3, $t0, 0 - addi.d $t3, $t3, -8 - addi.d $t0, $t0, 32 - addi.d $t1, $t1, 32 - addi.d $t2, $t2, 32 - bnez $t3, .LBB6_77 -# %bb.78: # %middle.block390 - bne $a7, $s6, .LBB6_10 - b .LBB6_12 -.LBB6_79: - ld.d $s8, $sp, 264 # 8-byte Folded Reload - b .LBB6_23 -.LBB6_80: - move $s5, $s6 - ld.d $s6, $sp, 72 # 8-byte Folded Reload - b .LBB6_23 -.LBB6_81: - move $s6, $s8 - ld.d $s8, $sp, 264 # 8-byte Folded Reload - b .LBB6_23 .Lfunc_end6: .size dmxpy, .Lfunc_end6-dmxpy # -- End function diff --git a/results/SingleSource/Benchmarks/Misc-C++/Large/CMakeFiles/ray.dir/ray.s b/results/SingleSource/Benchmarks/Misc-C++/Large/CMakeFiles/ray.dir/ray.s index 14c5e6c7..5b245198 100644 --- a/results/SingleSource/Benchmarks/Misc-C++/Large/CMakeFiles/ray.dir/ray.s +++ b/results/SingleSource/Benchmarks/Misc-C++/Large/CMakeFiles/ray.dir/ray.s @@ -43,15 +43,14 @@ _ZmiRK3VecS1_: # @_ZmiRK3VecS1_ .type _ZmldRK3Vec,@function _ZmldRK3Vec: # @_ZmldRK3Vec # %bb.0: - fld.d $fa1, $a1, 0 - fld.d $fa2, $a1, 8 - fld.d $fa3, $a1, 16 + fld.d $fa1, $a1, 16 + vld $vr2, $a1, 0 + # kill: def $f0_64 killed $f0_64 def $vr0 fmul.d $fa1, $fa0, $fa1 - fmul.d $fa2, $fa0, $fa2 - fmul.d $fa0, $fa0, $fa3 - fst.d $fa1, $a0, 0 - fst.d $fa2, $a0, 8 - fst.d $fa0, $a0, 16 + vreplvei.d $vr0, $vr0, 0 + vfmul.d $vr0, $vr0, $vr2 + vst $vr0, $a0, 0 + fst.d $fa1, $a0, 16 ret .Lfunc_end2: .size _ZmldRK3Vec, .Lfunc_end2-_ZmldRK3Vec @@ -255,18 +254,17 @@ _Z6createiRK3Vecd: # @_Z6createiRK3Vecd .cfi_personality 155, DW.ref.__gxx_personality_v0 .cfi_lsda 27, .Lexception0 # %bb.0: - addi.d $sp, $sp, -192 - .cfi_def_cfa_offset 192 - st.d $ra, $sp, 184 # 8-byte Folded Spill - st.d $fp, $sp, 176 # 8-byte Folded Spill - st.d $s0, $sp, 168 # 8-byte Folded Spill - st.d $s1, $sp, 160 # 8-byte Folded Spill - st.d $s2, $sp, 152 # 8-byte Folded Spill - st.d $s3, $sp, 144 # 8-byte Folded Spill - st.d $s4, $sp, 136 # 8-byte Folded Spill - fst.d $fs0, $sp, 128 # 8-byte Folded Spill - fst.d $fs1, $sp, 120 # 8-byte Folded Spill - fst.d $fs2, $sp, 112 # 8-byte Folded Spill + addi.d $sp, $sp, -224 + .cfi_def_cfa_offset 224 + st.d $ra, $sp, 216 # 8-byte Folded Spill + st.d $fp, $sp, 208 # 8-byte Folded Spill + st.d $s0, $sp, 200 # 8-byte Folded Spill + st.d $s1, $sp, 192 # 8-byte Folded Spill + st.d $s2, $sp, 184 # 8-byte Folded Spill + st.d $s3, $sp, 176 # 8-byte Folded Spill + st.d $s4, $sp, 168 # 8-byte Folded Spill + fst.d $fs0, $sp, 160 # 8-byte Folded Spill + fst.d $fs1, $sp, 152 # 8-byte Folded Spill .cfi_offset 1, -8 .cfi_offset 22, -16 .cfi_offset 23, -24 @@ -276,7 +274,6 @@ _Z6createiRK3Vecd: # @_Z6createiRK3Vecd .cfi_offset 27, -56 .cfi_offset 56, -64 .cfi_offset 57, -72 - .cfi_offset 58, -80 fmov.d $fs0, $fa0 move $s0, $a1 move $s1, $a0 @@ -295,10 +292,10 @@ _Z6createiRK3Vecd: # @_Z6createiRK3Vecd fst.d $fs0, $fp, 32 beq $s1, $a0, .LBB7_25 # %bb.1: - addi.d $a0, $sp, 88 - st.d $a0, $sp, 96 - st.d $a0, $sp, 88 - st.d $zero, $sp, 104 + addi.d $a0, $sp, 128 + st.d $a0, $sp, 136 + st.d $a0, $sp, 128 + st.d $zero, $sp, 144 .Ltmp0: # EH_LABEL ori $a0, $zero, 24 pcaddu18i $ra, %call36(_Znwm) @@ -306,31 +303,32 @@ _Z6createiRK3Vecd: # @_Z6createiRK3Vecd .Ltmp1: # EH_LABEL # %bb.2: # %.preheader st.d $fp, $a0, 16 - addi.d $a1, $sp, 88 + addi.d $a1, $sp, 128 pcaddu18i $ra, %call36(_ZNSt8__detail15_List_node_base7_M_hookEPS0_) jirl $ra, $ra, 0 - ld.d $a0, $sp, 104 + ld.d $a0, $sp, 144 pcalau12i $a1, %pc_hi20(.LCPI7_0) fld.d $fa0, $a1, %pc_lo12(.LCPI7_0) addi.d $a0, $a0, 1 - st.d $a0, $sp, 104 + st.d $a0, $sp, 144 vldi $vr1, -1016 fmul.d $fs1, $fs0, $fa1 - fdiv.d $fs2, $fs1, $fa0 + fdiv.d $fa3, $fs1, $fa0 vldi $vr0, -928 fmul.d $fs0, $fs0, $fa0 fld.d $fa0, $s0, 0 fld.d $fa1, $s0, 8 fld.d $fa2, $s0, 16 addi.w $fp, $s1, -1 - fsub.d $fa0, $fa0, $fs2 - fadd.d $fa1, $fs2, $fa1 - fsub.d $fa2, $fa2, $fs2 - fst.d $fa0, $sp, 64 - fst.d $fa1, $sp, 72 - fst.d $fa2, $sp, 80 + fsub.d $fa0, $fa0, $fa3 + fadd.d $fa1, $fa3, $fa1 + vst $vr3, $sp, 32 # 16-byte Folded Spill + fsub.d $fa2, $fa2, $fa3 + fst.d $fa0, $sp, 96 + fst.d $fa1, $sp, 104 + fst.d $fa2, $sp, 112 .Ltmp3: # EH_LABEL - addi.d $a1, $sp, 64 + addi.d $a1, $sp, 96 move $a0, $fp fmov.d $fa0, $fs0 pcaddu18i $ra, %call36(_Z6createiRK3Vecd) @@ -345,23 +343,23 @@ _Z6createiRK3Vecd: # @_Z6createiRK3Vecd .Ltmp6: # EH_LABEL # %bb.4: st.d $s1, $a0, 16 - addi.d $a1, $sp, 88 + addi.d $a1, $sp, 128 pcaddu18i $ra, %call36(_ZNSt8__detail15_List_node_base7_M_hookEPS0_) jirl $ra, $ra, 0 - ld.d $a0, $sp, 104 + ld.d $a0, $sp, 144 + fld.d $fa0, $s0, 16 addi.d $a0, $a0, 1 - fld.d $fa0, $s0, 0 - fld.d $fa1, $s0, 8 - fld.d $fa2, $s0, 16 - st.d $a0, $sp, 104 - fadd.d $fa0, $fs2, $fa0 - fadd.d $fa1, $fs2, $fa1 - fsub.d $fa2, $fa2, $fs2 - fst.d $fa0, $sp, 64 - fst.d $fa1, $sp, 72 - fst.d $fa2, $sp, 80 + vld $vr1, $s0, 0 + st.d $a0, $sp, 144 + vld $vr2, $sp, 32 # 16-byte Folded Reload + fsub.d $fa0, $fa0, $fa2 + vreplvei.d $vr2, $vr2, 0 + vst $vr2, $sp, 16 # 16-byte Folded Spill + vfadd.d $vr1, $vr2, $vr1 + vst $vr1, $sp, 96 + fst.d $fa0, $sp, 112 .Ltmp7: # EH_LABEL - addi.d $a1, $sp, 64 + addi.d $a1, $sp, 96 move $a0, $fp fmov.d $fa0, $fs0 pcaddu18i $ra, %call36(_Z6createiRK3Vecd) @@ -376,23 +374,22 @@ _Z6createiRK3Vecd: # @_Z6createiRK3Vecd .Ltmp10: # EH_LABEL # %bb.6: # %.preheader.1 st.d $s1, $a0, 16 - addi.d $a1, $sp, 88 + addi.d $a1, $sp, 128 pcaddu18i $ra, %call36(_ZNSt8__detail15_List_node_base7_M_hookEPS0_) jirl $ra, $ra, 0 - ld.d $a0, $sp, 104 - addi.d $a0, $a0, 1 + ld.d $a0, $sp, 144 fld.d $fa0, $s0, 0 - fld.d $fa1, $s0, 8 - fld.d $fa2, $s0, 16 - st.d $a0, $sp, 104 - fsub.d $fa0, $fa0, $fs2 - fadd.d $fa1, $fs2, $fa1 - fadd.d $fa2, $fs2, $fa2 - fst.d $fa0, $sp, 64 - fst.d $fa1, $sp, 72 - fst.d $fa2, $sp, 80 + addi.d $a0, $a0, 1 + vld $vr1, $s0, 8 + st.d $a0, $sp, 144 + vld $vr2, $sp, 32 # 16-byte Folded Reload + fsub.d $fa0, $fa0, $fa2 + fst.d $fa0, $sp, 96 + vld $vr0, $sp, 16 # 16-byte Folded Reload + vfadd.d $vr0, $vr0, $vr1 + vst $vr0, $sp, 104 .Ltmp11: # EH_LABEL - addi.d $a1, $sp, 64 + addi.d $a1, $sp, 96 move $a0, $fp fmov.d $fa0, $fs0 pcaddu18i $ra, %call36(_Z6createiRK3Vecd) @@ -407,23 +404,22 @@ _Z6createiRK3Vecd: # @_Z6createiRK3Vecd .Ltmp14: # EH_LABEL # %bb.8: st.d $s1, $a0, 16 - addi.d $a1, $sp, 88 + addi.d $a1, $sp, 128 pcaddu18i $ra, %call36(_ZNSt8__detail15_List_node_base7_M_hookEPS0_) jirl $ra, $ra, 0 - ld.d $a0, $sp, 104 + ld.d $a0, $sp, 144 + fld.d $fa0, $s0, 16 + vld $vr1, $s0, 0 addi.d $a0, $a0, 1 - fld.d $fa0, $s0, 0 - fld.d $fa1, $s0, 8 - fld.d $fa2, $s0, 16 - st.d $a0, $sp, 104 - fadd.d $fa0, $fs2, $fa0 - fadd.d $fa1, $fs2, $fa1 - fadd.d $fa2, $fs2, $fa2 - fst.d $fa0, $sp, 64 - fst.d $fa1, $sp, 72 - fst.d $fa2, $sp, 80 + st.d $a0, $sp, 144 + vld $vr2, $sp, 32 # 16-byte Folded Reload + fadd.d $fa0, $fa2, $fa0 + vld $vr2, $sp, 16 # 16-byte Folded Reload + vfadd.d $vr1, $vr2, $vr1 + vst $vr1, $sp, 96 + fst.d $fa0, $sp, 112 .Ltmp15: # EH_LABEL - addi.d $a1, $sp, 64 + addi.d $a1, $sp, 96 move $a0, $fp fmov.d $fa0, $fs0 pcaddu18i $ra, %call36(_Z6createiRK3Vecd) @@ -438,13 +434,13 @@ _Z6createiRK3Vecd: # @_Z6createiRK3Vecd .Ltmp18: # EH_LABEL # %bb.10: st.d $fp, $a0, 16 - addi.d $a1, $sp, 88 - addi.d $s4, $sp, 88 + addi.d $a1, $sp, 128 + addi.d $s4, $sp, 128 pcaddu18i $ra, %call36(_ZNSt8__detail15_List_node_base7_M_hookEPS0_) jirl $ra, $ra, 0 - ld.d $a0, $sp, 104 + ld.d $a0, $sp, 144 addi.d $a0, $a0, 1 - st.d $a0, $sp, 104 + st.d $a0, $sp, 144 .Ltmp20: # EH_LABEL ori $a0, $zero, 72 pcaddu18i $ra, %call36(_Znwm) @@ -454,17 +450,17 @@ _Z6createiRK3Vecd: # @_Z6createiRK3Vecd move $fp, $a0 ld.d $a0, $s0, 16 vld $vr0, $s0, 0 - st.d $a0, $sp, 56 - vst $vr0, $sp, 40 - addi.d $s3, $sp, 16 - ld.d $s1, $sp, 88 - st.d $s3, $sp, 24 - st.d $s3, $sp, 16 - st.d $zero, $sp, 32 - addi.d $s0, $sp, 16 + st.d $a0, $sp, 88 + vst $vr0, $sp, 72 + addi.d $s3, $sp, 48 + ld.d $s1, $sp, 128 + st.d $s3, $sp, 56 + st.d $s3, $sp, 48 + st.d $zero, $sp, 64 + addi.d $s0, $sp, 48 beq $s1, $s4, .LBB7_16 # %bb.12: # %.lr.ph.i.i.preheader - addi.d $s0, $sp, 88 + addi.d $s0, $sp, 128 .p2align 4, , 16 .LBB7_13: # %.lr.ph.i.i # =>This Inner Loop Header: Depth=1 @@ -477,21 +473,21 @@ _Z6createiRK3Vecd: # @_Z6createiRK3Vecd # in Loop: Header=BB7_13 Depth=1 ld.d $a1, $s1, 16 st.d $a1, $a0, 16 - addi.d $a1, $sp, 16 + addi.d $a1, $sp, 48 pcaddu18i $ra, %call36(_ZNSt8__detail15_List_node_base7_M_hookEPS0_) jirl $ra, $ra, 0 - ld.d $a0, $sp, 32 + ld.d $a0, $sp, 64 ld.d $s1, $s1, 0 addi.d $a0, $a0, 1 - st.d $a0, $sp, 32 + st.d $a0, $sp, 64 bne $s1, $s0, .LBB7_13 # %bb.15: # %_ZNSt7__cxx114listIP5SceneSaIS2_EEC2ERKS4_.exit.loopexit - ld.d $s0, $sp, 16 + ld.d $s0, $sp, 48 .LBB7_16: # %_ZNSt7__cxx114listIP5SceneSaIS2_EEC2ERKS4_.exit pcalau12i $a0, %pc_hi20(_ZTV5Group+16) addi.d $a0, $a0, %pc_lo12(_ZTV5Group+16) - ld.d $a1, $sp, 40 - vld $vr0, $sp, 48 + ld.d $a1, $sp, 72 + vld $vr0, $sp, 80 st.d $a0, $fp, 0 st.d $s2, $fp, 8 st.d $a1, $fp, 16 @@ -503,7 +499,7 @@ _Z6createiRK3Vecd: # @_Z6createiRK3Vecd st.d $zero, $fp, 64 beq $s0, $s3, .LBB7_21 # %bb.17: # %.lr.ph.i.i.i30.preheader - addi.d $s2, $sp, 16 + addi.d $s2, $sp, 48 .p2align 4, , 16 .LBB7_18: # %.lr.ph.i.i.i30 # =>This Inner Loop Header: Depth=1 @@ -525,9 +521,9 @@ _Z6createiRK3Vecd: # @_Z6createiRK3Vecd st.d $a0, $fp, 64 bne $s0, $s2, .LBB7_18 # %bb.20: # %_ZN5GroupC2E6SphereNSt7__cxx114listIP5SceneSaIS4_EEE.exit.loopexit - ld.d $s0, $sp, 16 + ld.d $s0, $sp, 48 .LBB7_21: # %_ZN5GroupC2E6SphereNSt7__cxx114listIP5SceneSaIS4_EEE.exit - addi.d $s1, $sp, 16 + addi.d $s1, $sp, 48 beq $s0, $s1, .LBB7_23 .p2align 4, , 16 .LBB7_22: # %.lr.ph.i.i34 @@ -540,8 +536,8 @@ _Z6createiRK3Vecd: # @_Z6createiRK3Vecd move $s0, $s2 bne $s2, $s1, .LBB7_22 .LBB7_23: # %_ZNSt7__cxx1110_List_baseIP5SceneSaIS2_EED2Ev.exit - ld.d $a0, $sp, 88 - addi.d $s0, $sp, 88 + ld.d $a0, $sp, 128 + addi.d $s0, $sp, 128 beq $a0, $s0, .LBB7_25 .p2align 4, , 16 .LBB7_24: # %.lr.ph.i.i37 @@ -554,17 +550,16 @@ _Z6createiRK3Vecd: # @_Z6createiRK3Vecd bne $s1, $s0, .LBB7_24 .LBB7_25: move $a0, $fp - fld.d $fs2, $sp, 112 # 8-byte Folded Reload - fld.d $fs1, $sp, 120 # 8-byte Folded Reload - fld.d $fs0, $sp, 128 # 8-byte Folded Reload - ld.d $s4, $sp, 136 # 8-byte Folded Reload - ld.d $s3, $sp, 144 # 8-byte Folded Reload - ld.d $s2, $sp, 152 # 8-byte Folded Reload - ld.d $s1, $sp, 160 # 8-byte Folded Reload - ld.d $s0, $sp, 168 # 8-byte Folded Reload - ld.d $fp, $sp, 176 # 8-byte Folded Reload - ld.d $ra, $sp, 184 # 8-byte Folded Reload - addi.d $sp, $sp, 192 + fld.d $fs1, $sp, 152 # 8-byte Folded Reload + fld.d $fs0, $sp, 160 # 8-byte Folded Reload + ld.d $s4, $sp, 168 # 8-byte Folded Reload + ld.d $s3, $sp, 176 # 8-byte Folded Reload + ld.d $s2, $sp, 184 # 8-byte Folded Reload + ld.d $s1, $sp, 192 # 8-byte Folded Reload + ld.d $s0, $sp, 200 # 8-byte Folded Reload + ld.d $fp, $sp, 208 # 8-byte Folded Reload + ld.d $ra, $sp, 216 # 8-byte Folded Reload + addi.d $sp, $sp, 224 ret .LBB7_26: .Ltmp22: # EH_LABEL @@ -595,8 +590,8 @@ _Z6createiRK3Vecd: # @_Z6createiRK3Vecd move $a0, $s2 bne $s2, $s1, .LBB7_31 .LBB7_32: # %.body32 - ld.d $a0, $sp, 16 - addi.d $s1, $sp, 16 + ld.d $a0, $sp, 48 + addi.d $s1, $sp, 48 beq $a0, $s1, .LBB7_37 .p2align 4, , 16 .LBB7_33: # %.lr.ph.i.i42 @@ -610,8 +605,8 @@ _Z6createiRK3Vecd: # @_Z6createiRK3Vecd b .LBB7_37 .LBB7_34: .Ltmp25: # EH_LABEL - ld.d $a1, $sp, 16 - addi.d $s1, $sp, 16 + ld.d $a1, $sp, 48 + addi.d $s1, $sp, 48 move $s0, $a0 beq $a1, $s1, .LBB7_37 # %bb.35: # %.lr.ph.i.i.i.preheader @@ -631,8 +626,8 @@ _Z6createiRK3Vecd: # @_Z6createiRK3Vecd pcaddu18i $ra, %call36(_ZdlPvm) jirl $ra, $ra, 0 .LBB7_38: - ld.d $a0, $sp, 88 - addi.d $fp, $sp, 88 + ld.d $a0, $sp, 128 + addi.d $fp, $sp, 128 beq $a0, $fp, .LBB7_40 .p2align 4, , 16 .LBB7_39: # %.lr.ph.i.i47 @@ -1378,17 +1373,16 @@ _ZN5GroupD0Ev: # @_ZN5GroupD0Ev _ZNK5Group9intersectERKSt4pairId3VecERK3Ray: # @_ZNK5Group9intersectERKSt4pairId3VecERK3Ray .cfi_startproc # %bb.0: - addi.d $sp, $sp, -144 - .cfi_def_cfa_offset 144 - st.d $ra, $sp, 136 # 8-byte Folded Spill - st.d $fp, $sp, 128 # 8-byte Folded Spill - st.d $s0, $sp, 120 # 8-byte Folded Spill - st.d $s1, $sp, 112 # 8-byte Folded Spill - st.d $s2, $sp, 104 # 8-byte Folded Spill - st.d $s3, $sp, 96 # 8-byte Folded Spill - st.d $s4, $sp, 88 # 8-byte Folded Spill - fst.d $fs0, $sp, 80 # 8-byte Folded Spill - fst.d $fs1, $sp, 72 # 8-byte Folded Spill + addi.d $sp, $sp, -160 + .cfi_def_cfa_offset 160 + st.d $ra, $sp, 152 # 8-byte Folded Spill + st.d $fp, $sp, 144 # 8-byte Folded Spill + st.d $s0, $sp, 136 # 8-byte Folded Spill + st.d $s1, $sp, 128 # 8-byte Folded Spill + st.d $s2, $sp, 120 # 8-byte Folded Spill + st.d $s3, $sp, 112 # 8-byte Folded Spill + st.d $s4, $sp, 104 # 8-byte Folded Spill + fst.d $fs0, $sp, 96 # 8-byte Folded Spill .cfi_offset 1, -8 .cfi_offset 22, -16 .cfi_offset 23, -24 @@ -1397,40 +1391,41 @@ _ZNK5Group9intersectERKSt4pairId3VecERK3Ray: # @_ZNK5Group9intersectERKSt4pairId .cfi_offset 26, -48 .cfi_offset 27, -56 .cfi_offset 56, -64 - .cfi_offset 57, -72 - vld $vr0, $a2, 16 move $s0, $a3 move $fp, $a0 - vst $vr0, $sp, 56 - fld.d $fa0, $a1, 16 - fld.d $fa1, $a3, 0 - fld.d $fa2, $a3, 24 - vld $vr3, $a1, 24 - vld $vr4, $a3, 8 - vld $vr5, $a3, 32 - fsub.d $fa0, $fa0, $fa1 - fmul.d $fa1, $fa0, $fa2 + vld $vr0, $a2, 16 + fld.d $fa1, $a1, 32 + fld.d $fa2, $a3, 16 + vld $vr3, $a1, 16 + vld $vr4, $a3, 0 + vst $vr0, $sp, 80 + vld $vr0, $a3, 24 + fsub.d $fa1, $fa1, $fa2 vfsub.d $vr2, $vr3, $vr4 - vfmul.d $vr3, $vr2, $vr5 - vreplvei.d $vr4, $vr3, 0 - fadd.d $fa1, $fa1, $fa4 - vld $vr4, $a2, 0 - vreplvei.d $vr3, $vr3, 1 - fadd.d $fs0, $fa1, $fa3 - fmul.d $fa0, $fa0, $fa0 - vfmul.d $vr1, $vr2, $vr2 - vreplvei.d $vr2, $vr1, 0 - fadd.d $fa0, $fa0, $fa2 - vreplvei.d $vr1, $vr1, 1 - fld.d $fa2, $a1, 40 - fadd.d $fa0, $fa0, $fa1 - fmul.d $fa1, $fs0, $fs0 - fsub.d $fa0, $fa1, $fa0 - fmul.d $fa1, $fa2, $fa2 + fld.d $fa3, $a3, 40 + vori.b $vr4, $vr2, 0 + vshuf4i.d $vr4, $vr0, 12 + vfmul.d $vr4, $vr2, $vr4 + vshuf4i.d $vr0, $vr2, 3 + vshuf4i.d $vr2, $vr0, 1 + vfmul.d $vr0, $vr2, $vr0 + vld $vr2, $a2, 0 + vreplvei.d $vr1, $vr1, 0 + vfadd.d $vr0, $vr4, $vr0 + vori.b $vr4, $vr1, 0 + vextrins.d $vr4, $vr3, 16 + vfmul.d $vr1, $vr1, $vr4 + vfadd.d $vr0, $vr0, $vr1 + vreplvei.d $vr4, $vr0, 1 + fld.d $fa1, $a1, 40 + fmul.d $fa3, $fa4, $fa4 + vreplvei.d $vr0, $vr0, 0 + fsub.d $fa0, $fa3, $fa0 + fmul.d $fa1, $fa1, $fa1 fadd.d $fa0, $fa1, $fa0 - movgr2fr.d $fs1, $zero - fcmp.cule.d $fcc0, $fs1, $fa0 - vst $vr4, $sp, 40 + movgr2fr.d $fs0, $zero + fcmp.cule.d $fcc0, $fs0, $fa0 + vst $vr2, $sp, 64 bcnez $fcc0, .LBB14_2 # %bb.1: pcalau12i $a0, %pc_hi20(infinity) @@ -1441,13 +1436,13 @@ _ZNK5Group9intersectERKSt4pairId3VecERK3Ray: # @_ZNK5Group9intersectERKSt4pairId fcmp.cor.d $fcc0, $fa1, $fa1 bceqz $fcc0, .LBB14_11 .LBB14_3: # %.split - fadd.d $fa0, $fs0, $fa1 + fadd.d $fa0, $fa4, $fa1 pcalau12i $a0, %pc_hi20(infinity) fld.d $fa2, $a0, %pc_lo12(infinity) - fsub.d $fa1, $fs0, $fa1 - fcmp.clt.d $fcc0, $fs1, $fa1 + fsub.d $fa1, $fa4, $fa1 + fcmp.clt.d $fcc0, $fs0, $fa1 fsel $fa1, $fa0, $fa1, $fcc0 - fcmp.clt.d $fcc0, $fa0, $fs1 + fcmp.clt.d $fcc0, $fa0, $fs0 fsel $fa0, $fa1, $fa2, $fcc0 .LBB14_4: # %_ZNK6Sphere10ray_sphereERK3Ray.exit fld.d $fa1, $a2, 0 @@ -1458,28 +1453,28 @@ _ZNK5Group9intersectERKSt4pairId3VecERK3Ray: # @_ZNK5Group9intersectERKSt4pairId addi.d $s2, $a1, 48 beq $s1, $s2, .LBB14_8 # %bb.6: # %.lr.ph - addi.d $s3, $sp, 16 - addi.d $s4, $sp, 48 + addi.d $s3, $sp, 40 + addi.d $s4, $sp, 72 .p2align 4, , 16 .LBB14_7: # =>This Inner Loop Header: Depth=1 ld.d $a1, $s1, 16 ld.d $a0, $a1, 0 ld.d $a4, $a0, 16 - addi.d $a0, $sp, 8 - addi.d $a2, $sp, 40 + addi.d $a0, $sp, 32 + addi.d $a2, $sp, 64 move $a3, $s0 jirl $ra, $a4, 0 ld.d $a0, $s3, 0 vld $vr0, $s3, 8 - fld.d $fa1, $sp, 8 + fld.d $fa1, $sp, 32 st.d $a0, $s4, 0 vst $vr0, $s4, 8 ld.d $s1, $s1, 0 - fst.d $fa1, $sp, 40 + fst.d $fa1, $sp, 64 bne $s1, $s2, .LBB14_7 .LBB14_8: # %._crit_edge - vld $vr0, $sp, 56 - vld $vr1, $sp, 40 + vld $vr0, $sp, 80 + vld $vr1, $sp, 64 b .LBB14_10 .LBB14_9: vld $vr0, $a2, 16 @@ -1487,22 +1482,23 @@ _ZNK5Group9intersectERKSt4pairId3VecERK3Ray: # @_ZNK5Group9intersectERKSt4pairId .LBB14_10: vst $vr0, $fp, 16 vst $vr1, $fp, 0 - fld.d $fs1, $sp, 72 # 8-byte Folded Reload - fld.d $fs0, $sp, 80 # 8-byte Folded Reload - ld.d $s4, $sp, 88 # 8-byte Folded Reload - ld.d $s3, $sp, 96 # 8-byte Folded Reload - ld.d $s2, $sp, 104 # 8-byte Folded Reload - ld.d $s1, $sp, 112 # 8-byte Folded Reload - ld.d $s0, $sp, 120 # 8-byte Folded Reload - ld.d $fp, $sp, 128 # 8-byte Folded Reload - ld.d $ra, $sp, 136 # 8-byte Folded Reload - addi.d $sp, $sp, 144 + fld.d $fs0, $sp, 96 # 8-byte Folded Reload + ld.d $s4, $sp, 104 # 8-byte Folded Reload + ld.d $s3, $sp, 112 # 8-byte Folded Reload + ld.d $s2, $sp, 120 # 8-byte Folded Reload + ld.d $s1, $sp, 128 # 8-byte Folded Reload + ld.d $s0, $sp, 136 # 8-byte Folded Reload + ld.d $fp, $sp, 144 # 8-byte Folded Reload + ld.d $ra, $sp, 152 # 8-byte Folded Reload + addi.d $sp, $sp, 160 ret .LBB14_11: # %call.sqrt move $s1, $a2 move $s2, $a1 + vst $vr4, $sp, 16 # 16-byte Folded Spill pcaddu18i $ra, %call36(sqrt) jirl $ra, $ra, 0 + vld $vr4, $sp, 16 # 16-byte Folded Reload move $a1, $s2 move $a2, $s1 fmov.d $fa1, $fa0 diff --git a/results/SingleSource/Benchmarks/Misc-C++/Large/CMakeFiles/sphereflake.dir/sphereflake.s b/results/SingleSource/Benchmarks/Misc-C++/Large/CMakeFiles/sphereflake.dir/sphereflake.s index 786e589d..fef9dd64 100644 --- a/results/SingleSource/Benchmarks/Misc-C++/Large/CMakeFiles/sphereflake.dir/sphereflake.s +++ b/results/SingleSource/Benchmarks/Misc-C++/Large/CMakeFiles/sphereflake.dir/sphereflake.s @@ -42,27 +42,26 @@ main: # @main .cfi_startproc # %bb.0: - addi.d $sp, $sp, -512 - .cfi_def_cfa_offset 512 - st.d $ra, $sp, 504 # 8-byte Folded Spill - st.d $fp, $sp, 496 # 8-byte Folded Spill - st.d $s0, $sp, 488 # 8-byte Folded Spill - st.d $s1, $sp, 480 # 8-byte Folded Spill - st.d $s2, $sp, 472 # 8-byte Folded Spill - st.d $s3, $sp, 464 # 8-byte Folded Spill - st.d $s4, $sp, 456 # 8-byte Folded Spill - st.d $s5, $sp, 448 # 8-byte Folded Spill - st.d $s6, $sp, 440 # 8-byte Folded Spill - st.d $s7, $sp, 432 # 8-byte Folded Spill - st.d $s8, $sp, 424 # 8-byte Folded Spill - fst.d $fs0, $sp, 416 # 8-byte Folded Spill - fst.d $fs1, $sp, 408 # 8-byte Folded Spill - fst.d $fs2, $sp, 400 # 8-byte Folded Spill - fst.d $fs3, $sp, 392 # 8-byte Folded Spill - fst.d $fs4, $sp, 384 # 8-byte Folded Spill - fst.d $fs5, $sp, 376 # 8-byte Folded Spill - fst.d $fs6, $sp, 368 # 8-byte Folded Spill - fst.d $fs7, $sp, 360 # 8-byte Folded Spill + addi.d $sp, $sp, -544 + .cfi_def_cfa_offset 544 + st.d $ra, $sp, 536 # 8-byte Folded Spill + st.d $fp, $sp, 528 # 8-byte Folded Spill + st.d $s0, $sp, 520 # 8-byte Folded Spill + st.d $s1, $sp, 512 # 8-byte Folded Spill + st.d $s2, $sp, 504 # 8-byte Folded Spill + st.d $s3, $sp, 496 # 8-byte Folded Spill + st.d $s4, $sp, 488 # 8-byte Folded Spill + st.d $s5, $sp, 480 # 8-byte Folded Spill + st.d $s6, $sp, 472 # 8-byte Folded Spill + st.d $s7, $sp, 464 # 8-byte Folded Spill + st.d $s8, $sp, 456 # 8-byte Folded Spill + fst.d $fs0, $sp, 448 # 8-byte Folded Spill + fst.d $fs1, $sp, 440 # 8-byte Folded Spill + fst.d $fs2, $sp, 432 # 8-byte Folded Spill + fst.d $fs3, $sp, 424 # 8-byte Folded Spill + fst.d $fs4, $sp, 416 # 8-byte Folded Spill + fst.d $fs5, $sp, 408 # 8-byte Folded Spill + fst.d $fs6, $sp, 400 # 8-byte Folded Spill .cfi_offset 1, -8 .cfi_offset 22, -16 .cfi_offset 23, -24 @@ -81,7 +80,6 @@ main: # @main .cfi_offset 60, -128 .cfi_offset 61, -136 .cfi_offset 62, -144 - .cfi_offset 63, -152 ori $s1, $zero, 2 ori $fp, $zero, 6 bne $a0, $s1, .LBB0_3 @@ -134,10 +132,10 @@ main: # @main add.d $a1, $a0, $s1 pcalau12i $a2, %pc_hi20(_ZL3end) st.d $a1, $a2, %pc_lo12(_ZL3end) - st.d $zero, $sp, 72 + st.d $zero, $sp, 120 vrepli.b $vr0, 0 - vst $vr0, $sp, 32 # 16-byte Folded Spill - vst $vr0, $sp, 80 + vst $vr0, $sp, 80 # 16-byte Folded Spill + vst $vr0, $sp, 128 addi.w $a1, $zero, -99 pcalau12i $a2, %pc_hi20(.LCPI0_0) fld.d $fs0, $a2, %pc_lo12(.LCPI0_0) @@ -166,11 +164,11 @@ main: # @main fmul.d $fa1, $fa0, $fa1 vldi $vr2, -800 fmul.d $fa2, $fa0, $fa2 - fst.d $fa1, $sp, 48 - fst.d $fa0, $sp, 56 - fst.d $fa2, $sp, 64 - addi.d $a3, $sp, 72 - addi.d $a4, $sp, 48 + fst.d $fa1, $sp, 96 + fst.d $fa0, $sp, 104 + fst.d $fa2, $sp, 112 + addi.d $a3, $sp, 120 + addi.d $a4, $sp, 96 vldi $vr0, -912 move $a1, $fp move $a2, $s0 @@ -206,8 +204,8 @@ main: # @main ori $a2, $zero, 5 pcaddu18i $ra, %call36(_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l) jirl $ra, $ra, 0 - vld $vr0, $sp, 32 # 16-byte Folded Reload - vst $vr0, $sp, 192 + vld $vr0, $sp, 80 # 16-byte Folded Reload + vst $vr0, $sp, 240 ori $a0, $zero, 0 lu32i.d $a0, 131072 pcalau12i $a1, %pc_hi20(.LCPI0_1) @@ -215,9 +213,9 @@ main: # @main pcalau12i $a1, %pc_hi20(.LCPI0_2) vld $vr1, $a1, %pc_lo12(.LCPI0_2) lu52i.d $a0, $a0, -1023 - st.d $a0, $sp, 208 - vst $vr0, $sp, 96 - vst $vr1, $sp, 112 + st.d $a0, $sp, 256 + vst $vr0, $sp, 144 + vst $vr1, $sp, 160 pcalau12i $a0, %pc_hi20(.LCPI0_3) vld $vr0, $a0, %pc_lo12(.LCPI0_3) pcalau12i $a0, %pc_hi20(.LCPI0_4) @@ -226,21 +224,21 @@ main: # @main vld $vr2, $a0, %pc_lo12(.LCPI0_5) pcalau12i $a0, %pc_hi20(.LCPI0_6) vld $vr3, $a0, %pc_lo12(.LCPI0_6) - vst $vr0, $sp, 128 - vst $vr1, $sp, 144 - vst $vr2, $sp, 160 - vst $vr3, $sp, 176 + vst $vr0, $sp, 176 + vst $vr1, $sp, 192 + vst $vr2, $sp, 208 + vst $vr3, $sp, 224 pcalau12i $a0, %pc_hi20(.LCPI0_7) - fld.d $fs1, $a0, %pc_lo12(.LCPI0_7) - movgr2fr.d $fs2, $zero - addi.d $s1, $sp, 96 + fld.d $fa0, $a0, %pc_lo12(.LCPI0_7) pcalau12i $a0, %pc_hi20(.LCPI0_8) - fld.d $fs3, $a0, %pc_lo12(.LCPI0_8) + fld.d $fa1, $a0, %pc_lo12(.LCPI0_8) + vst $vr1, $sp, 16 # 16-byte Folded Spill + movgr2fr.d $fs1, $zero + addi.d $s1, $sp, 144 pcalau12i $a0, %pc_hi20(.LCPI0_9) - fld.d $fs4, $a0, %pc_lo12(.LCPI0_9) + fld.d $fs2, $a0, %pc_lo12(.LCPI0_9) pcalau12i $a0, %pc_hi20(.LCPI0_10) - fld.d $fa0, $a0, %pc_lo12(.LCPI0_10) - fst.d $fa0, $sp, 8 # 8-byte Folded Spill + fld.d $fs3, $a0, %pc_lo12(.LCPI0_10) lu52i.d $s3, $zero, 2047 ori $s4, $zero, 4 pcalau12i $a0, %pc_hi20(_ZL5light) @@ -249,29 +247,32 @@ main: # @main ori $a0, $a0, 2577 lu32i.d $a0, 104345 lu52i.d $a0, $a0, 983 - vreplgr2vr.d $vr0, $a0 - vst $vr0, $sp, 16 # 16-byte Folded Spill + vreplgr2vr.d $vr1, $a0 + vst $vr1, $sp, 48 # 16-byte Folded Spill addi.w $s6, $zero, -99 - vldi $vr6, -928 b .LBB0_12 .p2align 4, , 16 .LBB0_11: # in Loop: Header=BB0_12 Depth=1 addi.w $s2, $s2, -1 - vldi $vr0, -784 - fadd.d $fs1, $fs1, $fa0 + vld $vr0, $sp, 32 # 16-byte Folded Reload + vldi $vr1, -784 + fadd.d $fa0, $fa0, $fa1 beqz $s2, .LBB0_23 .LBB0_12: # %.preheader37.i # =>This Loop Header: Depth=1 # Child Loop BB0_14 Depth 2 # Child Loop BB0_16 Depth 3 # Child Loop BB0_18 Depth 4 + vst $vr0, $sp, 32 # 16-byte Folded Spill + vld $vr1, $sp, 16 # 16-byte Folded Reload + vpackev.d $vr0, $vr1, $vr0 + vst $vr0, $sp, 64 # 16-byte Folded Spill ori $s7, $zero, 1024 - fmov.d $fs6, $fs2 + fmov.d $fs4, $fs1 b .LBB0_14 .p2align 4, , 16 .LBB0_13: # in Loop: Header=BB0_14 Depth=2 - fld.d $fa0, $sp, 8 # 8-byte Folded Reload - fmul.d $fa0, $fs7, $fa0 + fmul.d $fa0, $fs5, $fs3 ftintrz.w.d $fa0, $fa0 movfr2gr.s $a1, $fa0 move $a0, $fp @@ -281,10 +282,9 @@ main: # @main move $a1, $s0 pcaddu18i $ra, %call36(_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l) jirl $ra, $ra, 0 - vldi $vr6, -928 addi.w $s7, $s7, -1 vldi $vr0, -912 - fadd.d $fs6, $fs6, $fa0 + fadd.d $fs4, $fs4, $fa0 beqz $s7, .LBB0_11 .LBB0_14: # %.preheader.i # Parent Loop BB0_12 Depth=1 @@ -292,14 +292,13 @@ main: # @main # Child Loop BB0_16 Depth 3 # Child Loop BB0_18 Depth 4 move $s8, $zero - fmov.d $fs7, $fs2 + fmov.d $fs5, $fs1 b .LBB0_16 .p2align 4, , 16 .LBB0_15: # %_ZL9ray_tracePK6node_tRK5ray_t.exit.i # in Loop: Header=BB0_16 Depth=3 addi.d $s8, $s8, 1 - fadd.d $fs7, $fs7, $fa5 - vldi $vr6, -928 + fadd.d $fs5, $fs5, $fa5 beq $s8, $s4, .LBB0_13 .LBB0_16: # Parent Loop BB0_12 Depth=1 # Parent Loop BB0_14 Depth=2 @@ -309,26 +308,27 @@ main: # @main alsl.d $a0, $s8, $a0, 3 add.d $a1, $s1, $a0 fldx.d $fa0, $a0, $s1 - fld.d $fa1, $a1, 8 - fld.d $fa2, $a1, 16 - fadd.d $fa0, $fs6, $fa0 - fadd.d $fa1, $fs1, $fa1 - fadd.d $fa2, $fa2, $fs3 - fmul.d $fa3, $fa0, $fa0 - fmul.d $fa4, $fa1, $fa1 - fadd.d $fa3, $fa3, $fa4 - fmul.d $fa4, $fa2, $fa2 - fadd.d $fa3, $fa3, $fa4 - fclass.d $fa4, $fa3 - movfr2gr.d $a0, $fa4 + vld $vr2, $a1, 8 + fadd.d $fa1, $fs4, $fa0 + vld $vr0, $sp, 64 # 16-byte Folded Reload + vfadd.d $vr0, $vr0, $vr2 + fmul.d $fa2, $fa1, $fa1 + vfmul.d $vr3, $vr0, $vr0 + vreplvei.d $vr4, $vr3, 0 + fadd.d $fa2, $fa2, $fa4 + vreplvei.d $vr3, $vr3, 1 + fadd.d $fa2, $fa2, $fa3 + fclass.d $fa3, $fa2 + movfr2gr.d $a0, $fa3 andi $a0, $a0, 64 sltu $a0, $zero, $a0 andi $a0, $a0, 1 - fmov.d $fa4, $fs4 + fmov.d $fa3, $fs2 + vldi $vr5, -928 bnez $a0, .LBB0_20 # %bb.17: # %.preheader.i.i.i.preheader # in Loop: Header=BB0_16 Depth=3 - vldi $vr4, -912 + vldi $vr3, -912 move $a0, $s6 .p2align 4, , 16 .LBB0_18: # %.preheader.i.i.i @@ -336,13 +336,13 @@ main: # @main # Parent Loop BB0_14 Depth=2 # Parent Loop BB0_16 Depth=3 # => This Inner Loop Header: Depth=4 - fmov.d $fa5, $fa4 - fdiv.d $fa4, $fa3, $fa4 - fadd.d $fa4, $fa5, $fa4 - fmul.d $fa4, $fa4, $fa6 - fsub.d $fa5, $fa4, $fa5 - fabs.d $fa5, $fa5 - fcmp.cule.d $fcc0, $fa5, $fs0 + fmov.d $fa4, $fa3 + fdiv.d $fa3, $fa2, $fa3 + fadd.d $fa3, $fa4, $fa3 + fmul.d $fa3, $fa3, $fa5 + fsub.d $fa4, $fa3, $fa4 + fabs.d $fa4, $fa4 + fcmp.cule.d $fcc0, $fa4, $fs0 bcnez $fcc0, .LBB0_20 # %bb.19: # %.preheader.i.i.i # in Loop: Header=BB0_18 Depth=4 @@ -352,81 +352,80 @@ main: # @main .p2align 4, , 16 .LBB0_20: # %_ZNK3v_t4normEv.exit.i # in Loop: Header=BB0_16 Depth=3 - frecip.d $fa3, $fa4 - fmul.d $fa0, $fa0, $fa3 - fmul.d $fa1, $fa1, $fa3 - fmul.d $fa2, $fa2, $fa3 - fst.d $fa0, $sp, 216 - fst.d $fa1, $sp, 224 - fst.d $fa2, $sp, 232 - st.d $zero, $sp, 336 - vld $vr0, $sp, 32 # 16-byte Folded Reload - vst $vr0, $sp, 320 - st.d $s3, $sp, 344 - addi.d $a0, $sp, 192 - addi.d $a1, $sp, 320 + frecip.d $fa2, $fa3 + fmul.d $fa1, $fa1, $fa2 + fst.d $fa1, $sp, 264 + vreplvei.d $vr1, $vr2, 0 + vfmul.d $vr0, $vr0, $vr1 + vst $vr0, $sp, 272 + st.d $zero, $sp, 384 + vld $vr0, $sp, 80 # 16-byte Folded Reload + vst $vr0, $sp, 368 + st.d $s3, $sp, 392 + addi.d $a0, $sp, 240 + addi.d $a1, $sp, 368 pcaddu18i $ra, %call36(_ZN6node_t9intersectILb0EEEvRK5ray_tR5hit_t) jirl $ra, $ra, 0 - fld.d $fa0, $sp, 344 + fld.d $fa0, $sp, 392 fclass.d $fa1, $fa0 movfr2gr.d $a0, $fa1 andi $a0, $a0, 64 sltu $a0, $zero, $a0 andi $a0, $a0, 1 - fmov.d $fa5, $fs2 + fmov.d $fa5, $fs1 bnez $a0, .LBB0_15 # %bb.21: # in Loop: Header=BB0_16 Depth=3 - vld $vr3, $sp, 320 + vld $vr3, $sp, 368 vld $vr1, $s5, 0 vfmul.d $vr5, $vr3, $vr1 - fld.d $fa4, $sp, 336 + fld.d $fa4, $sp, 384 fld.d $fa2, $s5, 16 vreplvei.d $vr6, $vr5, 0 vreplvei.d $vr5, $vr5, 1 fadd.d $fa5, $fa6, $fa5 fmul.d $fa6, $fa4, $fa2 fadd.d $fa6, $fa5, $fa6 - fcmp.cle.d $fcc0, $fs2, $fa6 - fmov.d $fa5, $fs2 + fcmp.cle.d $fcc0, $fs1, $fa6 + fmov.d $fa5, $fs1 bcnez $fcc0, .LBB0_15 # %bb.22: # in Loop: Header=BB0_16 Depth=3 - fld.d $fa5, $sp, 232 - fld.d $fa7, $sp, 208 - fneg.d $fs5, $fa6 + fld.d $fa5, $sp, 280 + fld.d $fa7, $sp, 256 + fneg.d $fs6, $fa6 fmul.d $fa5, $fa0, $fa5 fadd.d $fa5, $fa5, $fa7 - vld $vr6, $sp, 16 # 16-byte Folded Reload + vld $vr6, $sp, 48 # 16-byte Folded Reload vfmul.d $vr3, $vr3, $vr6 fmul.d $fa4, $fa4, $fs0 fadd.d $fa4, $fa4, $fa5 vbitrevi.d $vr1, $vr1, 63 - vld $vr5, $sp, 216 - vld $vr6, $sp, 192 + vld $vr5, $sp, 264 + vld $vr6, $sp, 240 fneg.d $fa2, $fa2 vreplvei.d $vr0, $vr0, 0 vfmul.d $vr0, $vr0, $vr5 vfadd.d $vr0, $vr0, $vr6 vfadd.d $vr0, $vr3, $vr0 - vst $vr0, $sp, 272 - fst.d $fa4, $sp, 288 - vst $vr1, $sp, 296 - fst.d $fa2, $sp, 312 - st.d $zero, $sp, 256 - vld $vr0, $sp, 32 # 16-byte Folded Reload - vst $vr0, $sp, 240 - st.d $s3, $sp, 264 - addi.d $a0, $sp, 272 - addi.d $a1, $sp, 240 + vst $vr0, $sp, 320 + fst.d $fa4, $sp, 336 + vst $vr1, $sp, 344 + fst.d $fa2, $sp, 360 + st.d $zero, $sp, 304 + vld $vr0, $sp, 80 # 16-byte Folded Reload + vst $vr0, $sp, 288 + st.d $s3, $sp, 312 + addi.d $a0, $sp, 320 + addi.d $a1, $sp, 288 pcaddu18i $ra, %call36(_ZN6node_t9intersectILb1EEEvRK5ray_tR5hit_t) jirl $ra, $ra, 0 - fld.d $fa0, $sp, 264 + fld.d $fa0, $sp, 312 fclass.d $fa0, $fa0 movfr2gr.d $a0, $fa0 andi $a0, $a0, 64 sltu $a0, $zero, $a0 andi $a0, $a0, 1 movgr2cf $fcc0, $a0 - fsel $fa5, $fs2, $fs5, $fcc0 + fsel $fa5, $fs1, $fs6, $fcc0 b .LBB0_15 .LBB0_23: ld.d $a0, $fp, 0 @@ -458,26 +457,25 @@ main: # @main pcaddu18i $ra, %call36(_ZNSo5flushEv) jirl $ra, $ra, 0 move $a0, $zero - fld.d $fs7, $sp, 360 # 8-byte Folded Reload - fld.d $fs6, $sp, 368 # 8-byte Folded Reload - fld.d $fs5, $sp, 376 # 8-byte Folded Reload - fld.d $fs4, $sp, 384 # 8-byte Folded Reload - fld.d $fs3, $sp, 392 # 8-byte Folded Reload - fld.d $fs2, $sp, 400 # 8-byte Folded Reload - fld.d $fs1, $sp, 408 # 8-byte Folded Reload - fld.d $fs0, $sp, 416 # 8-byte Folded Reload - ld.d $s8, $sp, 424 # 8-byte Folded Reload - ld.d $s7, $sp, 432 # 8-byte Folded Reload - ld.d $s6, $sp, 440 # 8-byte Folded Reload - ld.d $s5, $sp, 448 # 8-byte Folded Reload - ld.d $s4, $sp, 456 # 8-byte Folded Reload - ld.d $s3, $sp, 464 # 8-byte Folded Reload - ld.d $s2, $sp, 472 # 8-byte Folded Reload - ld.d $s1, $sp, 480 # 8-byte Folded Reload - ld.d $s0, $sp, 488 # 8-byte Folded Reload - ld.d $fp, $sp, 496 # 8-byte Folded Reload - ld.d $ra, $sp, 504 # 8-byte Folded Reload - addi.d $sp, $sp, 512 + fld.d $fs6, $sp, 400 # 8-byte Folded Reload + fld.d $fs5, $sp, 408 # 8-byte Folded Reload + fld.d $fs4, $sp, 416 # 8-byte Folded Reload + fld.d $fs3, $sp, 424 # 8-byte Folded Reload + fld.d $fs2, $sp, 432 # 8-byte Folded Reload + fld.d $fs1, $sp, 440 # 8-byte Folded Reload + fld.d $fs0, $sp, 448 # 8-byte Folded Reload + ld.d $s8, $sp, 456 # 8-byte Folded Reload + ld.d $s7, $sp, 464 # 8-byte Folded Reload + ld.d $s6, $sp, 472 # 8-byte Folded Reload + ld.d $s5, $sp, 480 # 8-byte Folded Reload + ld.d $s4, $sp, 488 # 8-byte Folded Reload + ld.d $s3, $sp, 496 # 8-byte Folded Reload + ld.d $s2, $sp, 504 # 8-byte Folded Reload + ld.d $s1, $sp, 512 # 8-byte Folded Reload + ld.d $s0, $sp, 520 # 8-byte Folded Reload + ld.d $fp, $sp, 528 # 8-byte Folded Reload + ld.d $ra, $sp, 536 # 8-byte Folded Reload + addi.d $sp, $sp, 544 ret .LBB0_28: pcaddu18i $ra, %call36(_ZSt16__throw_bad_castv) @@ -1292,13 +1290,14 @@ _ZN6node_t9intersectILb0EEEvRK5ray_tR5hit_t: # @_ZN6node_t9intersectILb0EEEvRK5r ld.d $a3, $a3, %pc_lo12(_ZL3end) bgeu $a2, $a3, .LBB3_19 # %bb.1: # %.lr.ph - fld.d $fa3, $a1, 24 + fld.d $fa4, $a1, 24 pcalau12i $a4, %pc_hi20(.LCPI3_0) fld.d $fa0, $a4, %pc_lo12(.LCPI3_0) - movgr2fr.d $fa1, $zero + pcalau12i $a5, %pc_hi20(.LCPI3_1) + fld.d $fa1, $a5, %pc_lo12(.LCPI3_1) + movgr2fr.d $fa2, $zero addi.w $a5, $zero, -99 - vldi $vr2, -928 - pcalau12i $a6, %pc_hi20(.LCPI3_1) + vldi $vr3, -928 b .LBB3_3 .p2align 4, , 16 .LBB3_2: # %_ZNK8sphere_t9intersectERK5ray_t.exit28.thread @@ -1308,172 +1307,170 @@ _ZN6node_t9intersectILb0EEEvRK5ray_tR5hit_t: # @_ZN6node_t9intersectILb0EEEvRK5r .LBB3_3: # =>This Loop Header: Depth=1 # Child Loop BB3_7 Depth 2 # Child Loop BB3_14 Depth 2 - fld.d $fa6, $a2, 0 - fld.d $fa4, $a0, 0 - fld.d $fa7, $a2, 8 - fld.d $fa5, $a0, 8 - fsub.d $ft3, $fa6, $fa4 - fld.d $ft1, $a2, 16 + fld.d $ft0, $a2, 16 fld.d $fa6, $a0, 16 - fsub.d $ft4, $fa7, $fa5 - fld.d $fa7, $a0, 24 - fld.d $ft0, $a0, 32 - fsub.d $ft5, $ft1, $fa6 - fld.d $ft1, $a0, 40 - fmul.d $ft2, $ft3, $fa7 - fmul.d $ft6, $ft4, $ft0 - fadd.d $ft2, $ft2, $ft6 - fmul.d $ft6, $ft5, $ft1 - fadd.d $ft2, $ft2, $ft6 - fmul.d $ft6, $ft2, $ft2 + vld $vr5, $a0, 0 + vld $vr9, $a2, 0 + vld $vr7, $a0, 24 + fsub.d $ft2, $ft0, $fa6 + vfsub.d $vr9, $vr9, $vr5 + fld.d $ft0, $a0, 40 + vori.b $vr11, $vr9, 0 + vshuf4i.d $vr11, $vr7, 12 + vfmul.d $vr11, $vr9, $vr11 + vori.b $vr12, $vr7, 0 + vshuf4i.d $vr12, $vr9, 3 + vshuf4i.d $vr9, $vr0, 1 + vfmul.d $vr9, $vr9, $vr12 + vreplvei.d $vr10, $vr10, 0 + vfadd.d $vr9, $vr11, $vr9 + vori.b $vr11, $vr10, 0 + vextrins.d $vr11, $vr8, 16 + vfmul.d $vr10, $vr10, $vr11 + vfadd.d $vr10, $vr9, $vr10 + vreplvei.d $vr9, $vr10, 1 + fld.d $ft3, $a2, 24 + fmul.d $ft4, $ft1, $ft1 + vreplvei.d $vr10, $vr10, 0 + fsub.d $ft2, $ft4, $ft2 fmul.d $ft3, $ft3, $ft3 - fmul.d $ft4, $ft4, $ft4 - fadd.d $ft3, $ft3, $ft4 - fld.d $ft4, $a2, 24 - fmul.d $ft5, $ft5, $ft5 - fadd.d $ft3, $ft3, $ft5 - fsub.d $ft3, $ft6, $ft3 - fmul.d $ft4, $ft4, $ft4 - fadd.d $ft3, $ft4, $ft3 - fcmp.clt.d $fcc0, $ft3, $fa1 - fmov.d $ft4, $fa0 + fadd.d $ft2, $ft3, $ft2 + fcmp.clt.d $fcc0, $ft2, $fa2 + fmov.d $ft3, $fa0 bcnez $fcc0, .LBB3_10 # %bb.4: # in Loop: Header=BB3_3 Depth=1 - fclass.d $ft4, $ft3 - movfr2gr.d $a7, $ft4 - andi $a7, $a7, 64 - sltu $a7, $zero, $a7 - andi $a7, $a7, 1 - beqz $a7, .LBB3_6 + fclass.d $ft3, $ft2 + movfr2gr.d $a6, $ft3 + andi $a6, $a6, 64 + sltu $a6, $zero, $a6 + andi $a6, $a6, 1 + beqz $a6, .LBB3_6 # %bb.5: # in Loop: Header=BB3_3 Depth=1 - fld.d $ft4, $a4, %pc_lo12(.LCPI3_0) + fld.d $ft3, $a4, %pc_lo12(.LCPI3_0) b .LBB3_9 .LBB3_6: # %.preheader.i.i.preheader # in Loop: Header=BB3_3 Depth=1 - vldi $vr12, -912 - move $a7, $a5 + vldi $vr11, -912 + move $a6, $a5 .p2align 4, , 16 .LBB3_7: # %.preheader.i.i # Parent Loop BB3_3 Depth=1 # => This Inner Loop Header: Depth=2 - fmov.d $ft5, $ft4 - fdiv.d $ft4, $ft3, $ft4 - fadd.d $ft4, $ft5, $ft4 - fld.d $ft6, $a6, %pc_lo12(.LCPI3_1) - fmul.d $ft4, $ft4, $fa2 - fsub.d $ft5, $ft4, $ft5 - fabs.d $ft5, $ft5 - fcmp.cule.d $fcc0, $ft5, $ft6 + fmov.d $ft4, $ft3 + fdiv.d $ft3, $ft2, $ft3 + fadd.d $ft3, $ft4, $ft3 + fmul.d $ft3, $ft3, $fa3 + fsub.d $ft4, $ft3, $ft4 + fabs.d $ft4, $ft4 + fcmp.cule.d $fcc0, $ft4, $fa1 bcnez $fcc0, .LBB3_9 # %bb.8: # %.preheader.i.i # in Loop: Header=BB3_7 Depth=2 - move $t0, $a7 - addi.w $a7, $a7, 1 - bnez $t0, .LBB3_7 + move $a7, $a6 + addi.w $a6, $a6, 1 + bnez $a7, .LBB3_7 .p2align 4, , 16 .LBB3_9: # %_ZL8LLVMsqrtd.exit.i # in Loop: Header=BB3_3 Depth=1 - fadd.d $ft3, $ft2, $ft4 - fsub.d $ft2, $ft2, $ft4 - fcmp.clt.d $fcc0, $fa1, $ft2 - fsel $ft2, $ft3, $ft2, $fcc0 - fcmp.clt.d $fcc0, $ft3, $fa1 - fsel $ft4, $ft2, $fa0, $fcc0 + fadd.d $ft2, $ft1, $ft3 + fsub.d $ft1, $ft1, $ft3 + fcmp.clt.d $fcc0, $fa2, $ft1 + fsel $ft1, $ft2, $ft1, $fcc0 + fcmp.clt.d $fcc0, $ft2, $fa2 + fsel $ft3, $ft1, $fa0, $fcc0 .LBB3_10: # %_ZNK8sphere_t9intersectERK5ray_t.exit # in Loop: Header=BB3_3 Depth=1 - fcmp.cult.d $fcc0, $ft4, $fa3 + fcmp.cult.d $fcc0, $ft3, $fa4 bceqz $fcc0, .LBB3_18 # %bb.11: # in Loop: Header=BB3_3 Depth=1 - fld.d $ft2, $a2, 32 - fld.d $ft3, $a2, 40 - fld.d $ft4, $a2, 48 - fsub.d $ft5, $ft2, $fa4 - fsub.d $ft7, $ft3, $fa5 - fsub.d $ft8, $ft4, $fa6 - fmul.d $ft6, $fa7, $ft5 - fmul.d $ft9, $ft0, $ft7 - fadd.d $ft6, $ft6, $ft9 - fmul.d $ft9, $ft1, $ft8 - fadd.d $ft6, $ft6, $ft9 - fmul.d $ft9, $ft6, $ft6 - fmul.d $ft5, $ft5, $ft5 - fmul.d $ft7, $ft7, $ft7 - fadd.d $ft7, $ft5, $ft7 - fld.d $ft5, $a2, 56 - fmul.d $ft8, $ft8, $ft8 - fadd.d $ft7, $ft7, $ft8 - fsub.d $ft7, $ft9, $ft7 - fmul.d $ft8, $ft5, $ft5 - fadd.d $ft7, $ft8, $ft7 - fcmp.clt.d $fcc0, $ft7, $fa1 + vld $vr9, $a2, 32 + fld.d $ft2, $a2, 48 + vfsub.d $vr11, $vr9, $vr5 + fsub.d $ft4, $ft2, $fa6 + vori.b $vr13, $vr11, 0 + vshuf4i.d $vr13, $vr7, 12 + vfmul.d $vr13, $vr13, $vr11 + vori.b $vr14, $vr7, 0 + vshuf4i.d $vr14, $vr11, 3 + vshuf4i.d $vr11, $vr0, 1 + vfmul.d $vr11, $vr14, $vr11 + vfadd.d $vr11, $vr13, $vr11 + vreplvei.d $vr13, $vr12, 0 + vextrins.d $vr12, $vr8, 16 + vfmul.d $vr12, $vr12, $vr13 + vfadd.d $vr13, $vr11, $vr12 + vreplvei.d $vr12, $vr13, 1 + fld.d $ft3, $a2, 56 + fmul.d $ft6, $ft4, $ft4 + vreplvei.d $vr13, $vr13, 0 + fsub.d $ft5, $ft6, $ft5 + fmul.d $ft6, $ft3, $ft3 + fadd.d $ft5, $ft6, $ft5 + fcmp.clt.d $fcc0, $ft5, $fa2 bcnez $fcc0, .LBB3_2 # %bb.12: # in Loop: Header=BB3_3 Depth=1 - fclass.d $ft8, $ft7 - movfr2gr.d $a7, $ft8 - andi $a7, $a7, 64 - sltu $a7, $zero, $a7 - andi $a7, $a7, 1 - fmov.d $ft8, $fa0 - bnez $a7, .LBB3_16 + fclass.d $ft6, $ft5 + movfr2gr.d $a6, $ft6 + andi $a6, $a6, 64 + sltu $a6, $zero, $a6 + andi $a6, $a6, 1 + fmov.d $ft6, $fa0 + bnez $a6, .LBB3_16 # %bb.13: # %.preheader.i.i19.preheader # in Loop: Header=BB3_3 Depth=1 - vldi $vr16, -912 - move $a7, $a5 + vldi $vr14, -912 + move $a6, $a5 .p2align 4, , 16 .LBB3_14: # %.preheader.i.i19 # Parent Loop BB3_3 Depth=1 # => This Inner Loop Header: Depth=2 - fmov.d $ft9, $ft8 - fdiv.d $ft8, $ft7, $ft8 - fadd.d $ft8, $ft9, $ft8 - fld.d $ft10, $a6, %pc_lo12(.LCPI3_1) - fmul.d $ft8, $ft8, $fa2 - fsub.d $ft9, $ft8, $ft9 - fabs.d $ft9, $ft9 - fcmp.cule.d $fcc0, $ft9, $ft10 + fmov.d $ft7, $ft6 + fdiv.d $ft6, $ft5, $ft6 + fadd.d $ft6, $ft7, $ft6 + fmul.d $ft6, $ft6, $fa3 + fsub.d $ft7, $ft6, $ft7 + fabs.d $ft7, $ft7 + fcmp.cule.d $fcc0, $ft7, $fa1 bcnez $fcc0, .LBB3_16 # %bb.15: # %.preheader.i.i19 # in Loop: Header=BB3_14 Depth=2 - move $t0, $a7 - addi.w $a7, $a7, 1 - bnez $t0, .LBB3_14 + move $a7, $a6 + addi.w $a6, $a6, 1 + bnez $a7, .LBB3_14 .p2align 4, , 16 .LBB3_16: # %_ZNK8sphere_t9intersectERK5ray_t.exit28 # in Loop: Header=BB3_3 Depth=1 - fadd.d $ft7, $ft6, $ft8 - fsub.d $ft6, $ft6, $ft8 - fcmp.clt.d $fcc0, $fa1, $ft6 - fsel $ft6, $ft7, $ft6, $fcc0 - fcmp.clt.d $fcc0, $ft7, $fa1 - fsel $ft6, $ft6, $fa0, $fcc0 - fcmp.cule.d $fcc0, $fa3, $ft6 + fadd.d $ft5, $ft4, $ft6 + fsub.d $ft4, $ft4, $ft6 + fcmp.clt.d $fcc0, $fa2, $ft4 + fsel $ft4, $ft5, $ft4, $fcc0 + fcmp.clt.d $fcc0, $ft5, $fa2 + fsel $ft4, $ft4, $fa0, $fcc0 + fcmp.cule.d $fcc0, $fa4, $ft4 bcnez $fcc0, .LBB3_2 # %bb.17: # in Loop: Header=BB3_3 Depth=1 - fst.d $ft6, $a1, 24 - fmul.d $fa3, $fa7, $ft6 - fmul.d $fa7, $ft6, $ft0 - fmul.d $ft0, $ft6, $ft1 - fadd.d $fa3, $fa4, $fa3 - fadd.d $fa4, $fa5, $fa7 - fadd.d $fa5, $fa6, $ft0 - fsub.d $fa3, $fa3, $ft2 - fsub.d $fa4, $fa4, $ft3 - fsub.d $fa5, $fa5, $ft4 - frecip.d $fa6, $ft5 - fmul.d $fa3, $fa6, $fa3 - fmul.d $fa4, $fa6, $fa4 - fmul.d $fa5, $fa6, $fa5 - fst.d $fa3, $a1, 0 - fst.d $fa4, $a1, 8 - fst.d $fa5, $a1, 16 - fmov.d $fa3, $ft6 + fst.d $ft4, $a1, 24 + vreplvei.d $vr4, $vr12, 0 + vfmul.d $vr4, $vr7, $vr4 + fmul.d $fa7, $ft4, $ft0 + fadd.d $fa6, $fa6, $fa7 + fsub.d $fa6, $fa6, $ft2 + frecip.d $fa7, $ft3 + fmul.d $fa6, $fa7, $fa6 + vfadd.d $vr4, $vr5, $vr4 + vfsub.d $vr4, $vr4, $vr9 + vreplvei.d $vr5, $vr7, 0 + vfmul.d $vr4, $vr5, $vr4 + vst $vr4, $a1, 0 + fst.d $fa6, $a1, 16 + fmov.d $fa4, $ft4 b .LBB3_2 .p2align 4, , 16 .LBB3_18: # in Loop: Header=BB3_3 Depth=1 - ld.d $a7, $a2, 64 - slli.d $t0, $a7, 6 - alsl.d $a7, $a7, $t0, 3 - add.d $a2, $a2, $a7 + ld.d $a6, $a2, 64 + slli.d $a7, $a6, 6 + alsl.d $a6, $a6, $a7, 3 + add.d $a2, $a2, $a6 bltu $a2, $a3, .LBB3_3 .LBB3_19: # %._crit_edge ret @@ -1678,9 +1675,12 @@ _ZN6node_t9intersectILb1EEEvRK5ray_tR5hit_t: # @_ZN6node_t9intersectILb1EEEvRK5r .LCPI5_1: .dword 0x3d719799812dea11 # double 9.9999999999999998E-13 .LCPI5_2: - .dword 0xbfe4cccccccccccd # double -0.65000000000000002 -.LCPI5_3: .dword 0x3feccccccccccccd # double 0.90000000000000002 + .section .rodata.cst16,"aM",@progbits,16 + .p2align 4, 0x0 +.LCPI5_3: + .dword 0xbfe0000000000000 # double -0.5 + .dword 0xbfe4cccccccccccd # double -0.65000000000000002 .section .text.startup,"ax",@progbits .p2align 5 .type _GLOBAL__sub_I_sphereflake.cpp,@function @@ -1689,20 +1689,20 @@ _GLOBAL__sub_I_sphereflake.cpp: # @_GLOBAL__sub_I_sphereflake.cpp pcalau12i $a0, %pc_hi20(.LCPI5_0) fld.d $fa0, $a0, %pc_lo12(.LCPI5_0) pcalau12i $a0, %pc_hi20(.LCPI5_1) - fld.d $fa1, $a0, %pc_lo12(.LCPI5_1) + fld.d $fa2, $a0, %pc_lo12(.LCPI5_1) addi.w $a0, $zero, -99 - vldi $vr3, -912 - vldi $vr2, -928 + vldi $vr1, -912 + vldi $vr3, -928 .p2align 4, , 16 .LBB5_1: # %.preheader.i.i.i # =>This Inner Loop Header: Depth=1 - fmov.d $fa4, $fa3 - fdiv.d $fa3, $fa0, $fa3 - fadd.d $fa3, $fa4, $fa3 - fmul.d $fa3, $fa3, $fa2 - fsub.d $fa4, $fa3, $fa4 + fmov.d $fa4, $fa1 + fdiv.d $fa1, $fa0, $fa1 + fadd.d $fa1, $fa4, $fa1 + fmul.d $fa1, $fa1, $fa3 + fsub.d $fa4, $fa1, $fa4 fabs.d $fa4, $fa4 - fcmp.cule.d $fcc0, $fa4, $fa1 + fcmp.cule.d $fcc0, $fa4, $fa2 bcnez $fcc0, .LBB5_3 # %bb.2: # %.preheader.i.i.i # in Loop: Header=BB5_1 Depth=1 @@ -1710,19 +1710,17 @@ _GLOBAL__sub_I_sphereflake.cpp: # @_GLOBAL__sub_I_sphereflake.cpp addi.w $a0, $a0, 1 bnez $a1, .LBB5_1 .LBB5_3: # %__cxx_global_var_init.exit - frecip.d $fa0, $fa3 pcalau12i $a0, %pc_hi20(.LCPI5_2) - fld.d $fa1, $a0, %pc_lo12(.LCPI5_2) + fld.d $fa0, $a0, %pc_lo12(.LCPI5_2) pcalau12i $a0, %pc_hi20(.LCPI5_3) - fld.d $fa2, $a0, %pc_lo12(.LCPI5_3) - vldi $vr3, -800 - fmul.d $fa3, $fa0, $fa3 - fmul.d $fa1, $fa0, $fa1 - fmul.d $fa0, $fa0, $fa2 + vld $vr2, $a0, %pc_lo12(.LCPI5_3) + frecip.d $fa1, $fa1 + fmul.d $fa0, $fa1, $fa0 + vreplvei.d $vr1, $vr1, 0 + vfmul.d $vr1, $vr1, $vr2 pcalau12i $a0, %pc_hi20(_ZL5light) addi.d $a0, $a0, %pc_lo12(_ZL5light) - fst.d $fa3, $a0, 0 - fst.d $fa1, $a0, 8 + vst $vr1, $a0, 0 fst.d $fa0, $a0, 16 ret .Lfunc_end5: diff --git a/results/SingleSource/Benchmarks/Shootout-C++/CMakeFiles/Shootout-C++-ary2.dir/ary2.s b/results/SingleSource/Benchmarks/Shootout-C++/CMakeFiles/Shootout-C++-ary2.dir/ary2.s index df79c624..41b61d74 100644 --- a/results/SingleSource/Benchmarks/Shootout-C++/CMakeFiles/Shootout-C++-ary2.dir/ary2.s +++ b/results/SingleSource/Benchmarks/Shootout-C++/CMakeFiles/Shootout-C++-ary2.dir/ary2.s @@ -6,20 +6,17 @@ .section .rodata.cst16,"aM",@progbits,16 .p2align 4, 0x0 # -- Begin function main .LCPI0_0: - .word 4 # 0x4 - .word 6 # 0x6 - .word 0 # 0x0 - .word 1 # 0x1 + .dword 4 # 0x4 + .dword 5 # 0x5 .LCPI0_1: - .word 0 # 0x0 - .word 0 # 0x0 - .word 2 # 0x2 - .word 3 # 0x3 + .dword 2 # 0x2 + .dword 3 # 0x3 .LCPI0_2: - .word 4 # 0x4 - .word 5 # 0x5 - .word 6 # 0x6 - .word 7 # 0x7 + .dword 8 # 0x8 + .dword 9 # 0x9 +.LCPI0_3: + .dword 6 # 0x6 + .dword 7 # 0x7 .text .globl main .p2align 5 @@ -30,20 +27,22 @@ main: # @main .cfi_personality 155, DW.ref.__gxx_personality_v0 .cfi_lsda 27, .Lexception0 # %bb.0: - addi.d $sp, $sp, -48 - .cfi_def_cfa_offset 48 - st.d $ra, $sp, 40 # 8-byte Folded Spill - st.d $fp, $sp, 32 # 8-byte Folded Spill - st.d $s0, $sp, 24 # 8-byte Folded Spill - st.d $s1, $sp, 16 # 8-byte Folded Spill - st.d $s2, $sp, 8 # 8-byte Folded Spill - st.d $s3, $sp, 0 # 8-byte Folded Spill + addi.d $sp, $sp, -64 + .cfi_def_cfa_offset 64 + st.d $ra, $sp, 56 # 8-byte Folded Spill + st.d $fp, $sp, 48 # 8-byte Folded Spill + st.d $s0, $sp, 40 # 8-byte Folded Spill + st.d $s1, $sp, 32 # 8-byte Folded Spill + st.d $s2, $sp, 24 # 8-byte Folded Spill + st.d $s3, $sp, 16 # 8-byte Folded Spill + st.d $s4, $sp, 8 # 8-byte Folded Spill .cfi_offset 1, -8 .cfi_offset 22, -16 .cfi_offset 23, -24 .cfi_offset 24, -32 .cfi_offset 25, -40 .cfi_offset 26, -48 + .cfi_offset 27, -56 ori $a2, $zero, 2 bne $a0, $a2, .LBB0_3 # %bb.1: @@ -56,24 +55,25 @@ main: # @main bltz $a1, .LBB0_20 # %bb.2: # %_ZNSt6vectorIiSaIiEE17_S_check_init_lenEmRKS0_.exit.i slli.d $a1, $a0, 3 - alsl.w $s3, $a0, $a1, 1 - slli.d $fp, $s3, 2 + alsl.w $s4, $a0, $a1, 1 + slli.d $fp, $s4, 2 b .LBB0_4 .LBB0_3: lu12i.w $a0, 8789 ori $fp, $a0, 256 lu12i.w $a0, 2197 - ori $s3, $a0, 1088 + ori $s4, $a0, 1088 .LBB0_4: # %_ZNSt6vectorIiSaIiEE17_S_check_init_lenEmRKS0_.exit.i.thread move $a0, $fp pcaddu18i $ra, %call36(_Znwm) jirl $ra, $ra, 0 move $s0, $a0 st.w $zero, $a0, 0 - addi.d $a0, $a0, 4 - addi.d $s2, $fp, -4 + addi.d $s2, $a0, 4 + addi.d $s3, $fp, -4 + move $a0, $s2 move $a1, $zero - move $a2, $s2 + move $a2, $s3 pcaddu18i $ra, %call36(memset) jirl $ra, $ra, 0 .Ltmp0: # EH_LABEL @@ -86,7 +86,7 @@ main: # @main st.w $zero, $a0, 0 addi.d $a0, $a0, 4 move $a1, $zero - move $a2, $s2 + move $a2, $s3 pcaddu18i $ra, %call36(memset) jirl $ra, $ra, 0 pcalau12i $a0, %pc_hi20(.LCPI0_0) @@ -95,34 +95,37 @@ main: # @main vld $vr1, $a0, %pc_lo12(.LCPI0_1) pcalau12i $a0, %pc_hi20(.LCPI0_2) vld $vr2, $a0, %pc_lo12(.LCPI0_2) + pcalau12i $a0, %pc_hi20(.LCPI0_3) + vld $vr3, $a0, %pc_lo12(.LCPI0_3) move $a1, $zero + move $a2, $zero add.d $a0, $s1, $fp - addi.d $a2, $s0, 32 .p2align 4, , 16 .LBB0_6: # %.lr.ph # =>This Inner Loop Header: Depth=1 - addi.d $a3, $a1, 1 - vinsgr2vr.d $vr3, $a1, 0 - vinsgr2vr.d $vr3, $a3, 1 - vreplgr2vr.d $vr4, $a1 - vshuf4i.w $vr5, $vr4, 8 - vori.b $vr6, $vr0, 0 - vshuf.w $vr6, $vr3, $vr5 - vshuf4i.w $vr3, $vr4, 136 - vadd.w $vr4, $vr6, $vr1 - vadd.w $vr3, $vr3, $vr2 - vst $vr3, $a2, -16 - vst $vr4, $a2, -32 - addi.d $a3, $a1, 8 - st.w $a3, $a2, 0 - addi.d $a3, $a1, 9 - st.w $a3, $a2, 4 + st.w $a1, $s2, -4 + addi.d $a3, $a2, 1 + st.w $a3, $s2, 0 + vreplgr2vr.d $vr4, $a2 + vadd.d $vr5, $vr4, $vr0 + vadd.d $vr6, $vr4, $vr1 + vpickve2gr.d $a3, $vr6, 0 + slli.d $a3, $a3, 2 + vpickev.w $vr5, $vr5, $vr6 + vstx $vr5, $s0, $a3 + vadd.d $vr5, $vr4, $vr2 + vadd.d $vr4, $vr4, $vr3 + vpickve2gr.d $a3, $vr4, 0 + slli.d $a3, $a3, 2 + vpickev.w $vr4, $vr5, $vr4 + vstx $vr4, $s0, $a3 + addi.d $a2, $a2, 10 addi.d $a1, $a1, 10 - addi.d $a2, $a2, 40 - bltu $a1, $s3, .LBB0_6 + addi.d $s2, $s2, 40 + bltu $a2, $s4, .LBB0_6 # %bb.7: # %.lr.ph164.preheader - addi.d $a1, $s3, 10 - slli.d $a2, $s3, 2 + addi.d $a1, $s4, 10 + slli.d $a2, $s4, 2 ori $a3, $zero, 10 .p2align 4, , 16 .LBB0_8: # %.lr.ph164 @@ -195,13 +198,14 @@ main: # @main pcaddu18i $ra, %call36(_ZdlPvm) jirl $ra, $ra, 0 move $a0, $zero - ld.d $s3, $sp, 0 # 8-byte Folded Reload - ld.d $s2, $sp, 8 # 8-byte Folded Reload - ld.d $s1, $sp, 16 # 8-byte Folded Reload - ld.d $s0, $sp, 24 # 8-byte Folded Reload - ld.d $fp, $sp, 32 # 8-byte Folded Reload - ld.d $ra, $sp, 40 # 8-byte Folded Reload - addi.d $sp, $sp, 48 + ld.d $s4, $sp, 8 # 8-byte Folded Reload + ld.d $s3, $sp, 16 # 8-byte Folded Reload + ld.d $s2, $sp, 24 # 8-byte Folded Reload + ld.d $s1, $sp, 32 # 8-byte Folded Reload + ld.d $s0, $sp, 40 # 8-byte Folded Reload + ld.d $fp, $sp, 48 # 8-byte Folded Reload + ld.d $ra, $sp, 56 # 8-byte Folded Reload + addi.d $sp, $sp, 64 ret .LBB0_18: .Ltmp13: # EH_LABEL diff --git a/results/SingleSource/Benchmarks/Shootout-C++/CMakeFiles/Shootout-C++-matrix.dir/matrix.s b/results/SingleSource/Benchmarks/Shootout-C++/CMakeFiles/Shootout-C++-matrix.dir/matrix.s index e6914fae..a56e4a22 100644 --- a/results/SingleSource/Benchmarks/Shootout-C++/CMakeFiles/Shootout-C++-matrix.dir/matrix.s +++ b/results/SingleSource/Benchmarks/Shootout-C++/CMakeFiles/Shootout-C++-matrix.dir/matrix.s @@ -350,25 +350,25 @@ _Z5mmultiiPPiS0_S0_: # @_Z5mmultiiPPiS0_S0_ .section .rodata.cst16,"aM",@progbits,16 .p2align 4, 0x0 # -- Begin function main .LCPI4_0: + .word 4 # 0x4 + .word 5 # 0x5 + .word 6 # 0x6 + .word 7 # 0x7 +.LCPI4_1: + .word 0 # 0x0 + .word 1 # 0x1 + .word 2 # 0x2 + .word 3 # 0x3 +.LCPI4_2: .word 8 # 0x8 .word 9 # 0x9 .word 10 # 0xa .word 11 # 0xb -.LCPI4_1: +.LCPI4_3: .word 12 # 0xc .word 13 # 0xd .word 14 # 0xe .word 15 # 0xf -.LCPI4_2: - .word 0 # 0x0 - .word 1 # 0x1 - .word 2 # 0x2 - .word 3 # 0x3 -.LCPI4_3: - .word 4 # 0x4 - .word 5 # 0x5 - .word 6 # 0x6 - .word 7 # 0x7 .LCPI4_4: .word 16 # 0x10 .word 17 # 0x11 @@ -379,6 +379,11 @@ _Z5mmultiiPPiS0_S0_: # @_Z5mmultiiPPiS0_S0_ .word 21 # 0x15 .word 22 # 0x16 .word 23 # 0x17 +.LCPI4_6: + .word 24 # 0x18 + .word 25 # 0x19 + .word 26 # 0x1a + .word 27 # 0x1b .text .globl main .p2align 5 @@ -386,19 +391,19 @@ _Z5mmultiiPPiS0_S0_: # @_Z5mmultiiPPiS0_S0_ main: # @main .cfi_startproc # %bb.0: - addi.d $sp, $sp, -368 - .cfi_def_cfa_offset 368 - st.d $ra, $sp, 360 # 8-byte Folded Spill - st.d $fp, $sp, 352 # 8-byte Folded Spill - st.d $s0, $sp, 344 # 8-byte Folded Spill - st.d $s1, $sp, 336 # 8-byte Folded Spill - st.d $s2, $sp, 328 # 8-byte Folded Spill - st.d $s3, $sp, 320 # 8-byte Folded Spill - st.d $s4, $sp, 312 # 8-byte Folded Spill - st.d $s5, $sp, 304 # 8-byte Folded Spill - st.d $s6, $sp, 296 # 8-byte Folded Spill - st.d $s7, $sp, 288 # 8-byte Folded Spill - st.d $s8, $sp, 280 # 8-byte Folded Spill + addi.d $sp, $sp, -384 + .cfi_def_cfa_offset 384 + st.d $ra, $sp, 376 # 8-byte Folded Spill + st.d $fp, $sp, 368 # 8-byte Folded Spill + st.d $s0, $sp, 360 # 8-byte Folded Spill + st.d $s1, $sp, 352 # 8-byte Folded Spill + st.d $s2, $sp, 344 # 8-byte Folded Spill + st.d $s3, $sp, 336 # 8-byte Folded Spill + st.d $s4, $sp, 328 # 8-byte Folded Spill + st.d $s5, $sp, 320 # 8-byte Folded Spill + st.d $s6, $sp, 312 # 8-byte Folded Spill + st.d $s7, $sp, 304 # 8-byte Folded Spill + st.d $s8, $sp, 296 # 8-byte Folded Spill .cfi_offset 1, -8 .cfi_offset 22, -16 .cfi_offset 23, -24 @@ -430,21 +435,24 @@ main: # @main jirl $ra, $ra, 0 pcalau12i $a1, %pc_hi20(.LCPI4_0) vld $vr0, $a1, %pc_lo12(.LCPI4_0) - vst $vr0, $sp, 256 # 16-byte Folded Spill + vst $vr0, $sp, 272 # 16-byte Folded Spill pcalau12i $a1, %pc_hi20(.LCPI4_1) vld $vr0, $a1, %pc_lo12(.LCPI4_1) - vst $vr0, $sp, 240 # 16-byte Folded Spill + vst $vr0, $sp, 256 # 16-byte Folded Spill pcalau12i $a1, %pc_hi20(.LCPI4_2) vld $vr0, $a1, %pc_lo12(.LCPI4_2) - vst $vr0, $sp, 224 # 16-byte Folded Spill + vst $vr0, $sp, 240 # 16-byte Folded Spill pcalau12i $a1, %pc_hi20(.LCPI4_3) vld $vr0, $a1, %pc_lo12(.LCPI4_3) - vst $vr0, $sp, 208 # 16-byte Folded Spill + vst $vr0, $sp, 224 # 16-byte Folded Spill pcalau12i $a1, %pc_hi20(.LCPI4_4) vld $vr0, $a1, %pc_lo12(.LCPI4_4) - vst $vr0, $sp, 192 # 16-byte Folded Spill + vst $vr0, $sp, 208 # 16-byte Folded Spill pcalau12i $a1, %pc_hi20(.LCPI4_5) vld $vr0, $a1, %pc_lo12(.LCPI4_5) + vst $vr0, $sp, 192 # 16-byte Folded Spill + pcalau12i $a1, %pc_hi20(.LCPI4_6) + vld $vr0, $a1, %pc_lo12(.LCPI4_6) vst $vr0, $sp, 176 # 16-byte Folded Spill move $s3, $a0 move $s0, $zero @@ -457,32 +465,27 @@ main: # @main jirl $ra, $ra, 0 stx.d $a0, $s3, $s0 vreplgr2vr.w $vr0, $s1 - vld $vr1, $sp, 256 # 16-byte Folded Reload + vld $vr1, $sp, 272 # 16-byte Folded Reload vadd.w $vr1, $vr0, $vr1 + vld $vr2, $sp, 256 # 16-byte Folded Reload + vadd.w $vr2, $vr0, $vr2 + vst $vr2, $a0, 0 vld $vr2, $sp, 240 # 16-byte Folded Reload vadd.w $vr2, $vr0, $vr2 - vld $vr3, $sp, 224 # 16-byte Folded Reload - vadd.w $vr3, $vr0, $vr3 - vld $vr4, $sp, 208 # 16-byte Folded Reload - vadd.w $vr4, $vr0, $vr4 - vst $vr4, $a0, 16 - vst $vr3, $a0, 0 - vld $vr3, $sp, 192 # 16-byte Folded Reload - vadd.w $vr3, $vr0, $vr3 - vld $vr4, $sp, 176 # 16-byte Folded Reload - vadd.w $vr0, $vr0, $vr4 - vst $vr2, $a0, 48 - vst $vr1, $a0, 32 - vst $vr0, $a0, 80 - vst $vr3, $a0, 64 - addi.d $a1, $s1, 24 - st.w $a1, $a0, 96 - addi.d $a1, $s1, 25 - st.w $a1, $a0, 100 - addi.d $a1, $s1, 26 - st.w $a1, $a0, 104 - addi.d $a1, $s1, 27 - st.w $a1, $a0, 108 + vst $vr1, $a0, 16 + vld $vr1, $sp, 224 # 16-byte Folded Reload + vadd.w $vr1, $vr0, $vr1 + vst $vr2, $a0, 32 + vld $vr2, $sp, 208 # 16-byte Folded Reload + vadd.w $vr2, $vr0, $vr2 + vst $vr1, $a0, 48 + vld $vr1, $sp, 192 # 16-byte Folded Reload + vadd.w $vr1, $vr0, $vr1 + vst $vr2, $a0, 64 + vld $vr2, $sp, 176 # 16-byte Folded Reload + vadd.w $vr0, $vr0, $vr2 + vst $vr1, $a0, 80 + vst $vr0, $a0, 96 addi.d $a1, $s1, 28 st.w $a1, $a0, 112 addi.d $a1, $s1, 29 @@ -506,32 +509,27 @@ main: # @main jirl $ra, $ra, 0 stx.d $a0, $s4, $s0 vreplgr2vr.w $vr0, $s1 - vld $vr1, $sp, 256 # 16-byte Folded Reload + vld $vr1, $sp, 272 # 16-byte Folded Reload vadd.w $vr1, $vr0, $vr1 + vld $vr2, $sp, 256 # 16-byte Folded Reload + vadd.w $vr2, $vr0, $vr2 + vst $vr2, $a0, 0 vld $vr2, $sp, 240 # 16-byte Folded Reload vadd.w $vr2, $vr0, $vr2 - vld $vr3, $sp, 224 # 16-byte Folded Reload - vadd.w $vr3, $vr0, $vr3 - vld $vr4, $sp, 208 # 16-byte Folded Reload - vadd.w $vr4, $vr0, $vr4 - vst $vr4, $a0, 16 - vst $vr3, $a0, 0 - vld $vr3, $sp, 192 # 16-byte Folded Reload - vadd.w $vr3, $vr0, $vr3 - vld $vr4, $sp, 176 # 16-byte Folded Reload - vadd.w $vr0, $vr0, $vr4 - vst $vr2, $a0, 48 - vst $vr1, $a0, 32 - vst $vr0, $a0, 80 - vst $vr3, $a0, 64 - addi.d $a1, $s1, 24 - st.w $a1, $a0, 96 - addi.d $a1, $s1, 25 - st.w $a1, $a0, 100 - addi.d $a1, $s1, 26 - st.w $a1, $a0, 104 - addi.d $a1, $s1, 27 - st.w $a1, $a0, 108 + vst $vr1, $a0, 16 + vld $vr1, $sp, 224 # 16-byte Folded Reload + vadd.w $vr1, $vr0, $vr1 + vst $vr2, $a0, 32 + vld $vr2, $sp, 208 # 16-byte Folded Reload + vadd.w $vr2, $vr0, $vr2 + vst $vr1, $a0, 48 + vld $vr1, $sp, 192 # 16-byte Folded Reload + vadd.w $vr1, $vr0, $vr1 + vst $vr2, $a0, 64 + vld $vr2, $sp, 176 # 16-byte Folded Reload + vadd.w $vr0, $vr0, $vr2 + vst $vr1, $a0, 80 + vst $vr0, $a0, 96 addi.d $a1, $s1, 28 st.w $a1, $a0, 112 addi.d $a1, $s1, 29 @@ -540,8 +538,8 @@ main: # @main addi.d $s1, $s1, 30 bne $s0, $fp, .LBB4_6 # %bb.7: # %_Z8mkmatrixii.exit25 - st.d $s4, $sp, 8 # 8-byte Folded Spill - st.d $s3, $sp, 136 # 8-byte Folded Spill + st.d $s4, $sp, 16 # 8-byte Folded Spill + st.d $s3, $sp, 144 # 8-byte Folded Spill ori $a0, $zero, 240 ori $fp, $zero, 240 pcaddu18i $ra, %call36(malloc) @@ -557,32 +555,27 @@ main: # @main jirl $ra, $ra, 0 stx.d $a0, $s3, $s0 vreplgr2vr.w $vr0, $s1 - vld $vr1, $sp, 256 # 16-byte Folded Reload + vld $vr1, $sp, 272 # 16-byte Folded Reload vadd.w $vr1, $vr0, $vr1 + vld $vr2, $sp, 256 # 16-byte Folded Reload + vadd.w $vr2, $vr0, $vr2 + vst $vr2, $a0, 0 vld $vr2, $sp, 240 # 16-byte Folded Reload vadd.w $vr2, $vr0, $vr2 - vld $vr3, $sp, 224 # 16-byte Folded Reload - vadd.w $vr3, $vr0, $vr3 - vld $vr4, $sp, 208 # 16-byte Folded Reload - vadd.w $vr4, $vr0, $vr4 - vst $vr4, $a0, 16 - vst $vr3, $a0, 0 - vld $vr3, $sp, 192 # 16-byte Folded Reload - vadd.w $vr3, $vr0, $vr3 - vld $vr4, $sp, 176 # 16-byte Folded Reload - vadd.w $vr0, $vr0, $vr4 - vst $vr2, $a0, 48 - vst $vr1, $a0, 32 - vst $vr0, $a0, 80 - vst $vr3, $a0, 64 - addi.d $a1, $s1, 24 - st.w $a1, $a0, 96 - addi.d $a1, $s1, 25 - st.w $a1, $a0, 100 - addi.d $a1, $s1, 26 - st.w $a1, $a0, 104 - addi.d $a1, $s1, 27 - st.w $a1, $a0, 108 + vst $vr1, $a0, 16 + vld $vr1, $sp, 224 # 16-byte Folded Reload + vadd.w $vr1, $vr0, $vr1 + vst $vr2, $a0, 32 + vld $vr2, $sp, 208 # 16-byte Folded Reload + vadd.w $vr2, $vr0, $vr2 + vst $vr1, $a0, 48 + vld $vr1, $sp, 192 # 16-byte Folded Reload + vadd.w $vr1, $vr0, $vr1 + vst $vr2, $a0, 64 + vld $vr2, $sp, 176 # 16-byte Folded Reload + vadd.w $vr0, $vr0, $vr2 + vst $vr1, $a0, 80 + vst $vr0, $a0, 96 addi.d $a1, $s1, 28 st.w $a1, $a0, 112 addi.d $a1, $s1, 29 @@ -591,12 +584,12 @@ main: # @main addi.d $s1, $s1, 30 bne $s0, $fp, .LBB4_8 # %bb.9: # %_Z8mkmatrixii.exit36.preheader - st.d $s3, $sp, 128 # 8-byte Folded Spill + st.d $s3, $sp, 136 # 8-byte Folded Spill addi.w $a0, $s2, 0 - st.d $a0, $sp, 16 # 8-byte Folded Spill + st.d $a0, $sp, 24 # 8-byte Folded Spill blez $a0, .LBB4_16 # %bb.10: # %.preheader24.i.preheader.preheader - ld.d $a2, $sp, 8 # 8-byte Folded Reload + ld.d $a2, $sp, 16 # 8-byte Folded Reload ld.d $t2, $a2, 208 ld.d $t3, $a2, 216 ld.d $t4, $a2, 176 @@ -608,78 +601,78 @@ main: # @main ld.d $s3, $a2, 80 ld.d $s4, $a2, 88 ld.d $a0, $a2, 48 - st.d $a0, $sp, 240 # 8-byte Folded Spill + st.d $a0, $sp, 256 # 8-byte Folded Spill ld.d $a0, $a2, 56 - st.d $a0, $sp, 224 # 8-byte Folded Spill + st.d $a0, $sp, 240 # 8-byte Folded Spill ld.d $a0, $a2, 16 - st.d $a0, $sp, 208 # 8-byte Folded Spill + st.d $a0, $sp, 224 # 8-byte Folded Spill ld.d $a0, $a2, 24 - st.d $a0, $sp, 192 # 8-byte Folded Spill + st.d $a0, $sp, 208 # 8-byte Folded Spill ld.d $a0, $a2, 192 - st.d $a0, $sp, 176 # 8-byte Folded Spill + st.d $a0, $sp, 192 # 8-byte Folded Spill ld.d $a0, $a2, 200 - st.d $a0, $sp, 168 # 8-byte Folded Spill + st.d $a0, $sp, 176 # 8-byte Folded Spill ld.d $a0, $a2, 160 - st.d $a0, $sp, 160 # 8-byte Folded Spill + st.d $a0, $sp, 168 # 8-byte Folded Spill ld.d $a0, $a2, 168 - st.d $a0, $sp, 152 # 8-byte Folded Spill + st.d $a0, $sp, 160 # 8-byte Folded Spill ld.d $a0, $a2, 128 - st.d $a0, $sp, 120 # 8-byte Folded Spill + st.d $a0, $sp, 128 # 8-byte Folded Spill ld.d $a0, $a2, 136 - st.d $a0, $sp, 112 # 8-byte Folded Spill + st.d $a0, $sp, 120 # 8-byte Folded Spill ld.d $a0, $a2, 96 - st.d $a0, $sp, 104 # 8-byte Folded Spill + st.d $a0, $sp, 112 # 8-byte Folded Spill ld.d $a0, $a2, 104 - st.d $a0, $sp, 96 # 8-byte Folded Spill + st.d $a0, $sp, 104 # 8-byte Folded Spill ld.d $a0, $a2, 64 - st.d $a0, $sp, 88 # 8-byte Folded Spill + st.d $a0, $sp, 96 # 8-byte Folded Spill ld.d $a0, $a2, 72 - st.d $a0, $sp, 80 # 8-byte Folded Spill + st.d $a0, $sp, 88 # 8-byte Folded Spill ld.d $a0, $a2, 32 - st.d $a0, $sp, 72 # 8-byte Folded Spill + st.d $a0, $sp, 80 # 8-byte Folded Spill ld.d $a0, $a2, 40 - st.d $a0, $sp, 64 # 8-byte Folded Spill + st.d $a0, $sp, 72 # 8-byte Folded Spill ld.d $a0, $a2, 0 - st.d $a0, $sp, 56 # 8-byte Folded Spill + st.d $a0, $sp, 64 # 8-byte Folded Spill ld.d $a0, $a2, 8 - st.d $a0, $sp, 48 # 8-byte Folded Spill + st.d $a0, $sp, 56 # 8-byte Folded Spill ld.d $a0, $a2, 224 - st.d $a0, $sp, 40 # 8-byte Folded Spill + st.d $a0, $sp, 48 # 8-byte Folded Spill ld.d $a0, $a2, 232 - st.d $a0, $sp, 32 # 8-byte Folded Spill + st.d $a0, $sp, 40 # 8-byte Folded Spill move $a0, $zero .p2align 4, , 16 .LBB4_11: # %.preheader24.i.preheader # =>This Loop Header: Depth=1 # Child Loop BB4_12 Depth 2 # Child Loop BB4_13 Depth 3 - st.d $a0, $sp, 24 # 8-byte Folded Spill + st.d $a0, $sp, 32 # 8-byte Folded Spill move $a1, $zero .p2align 4, , 16 .LBB4_12: # %.preheader24.i # Parent Loop BB4_11 Depth=1 # => This Loop Header: Depth=2 # Child Loop BB4_13 Depth 3 - st.d $a1, $sp, 144 # 8-byte Folded Spill + st.d $a1, $sp, 152 # 8-byte Folded Spill slli.d $a4, $a1, 3 - ld.d $a3, $sp, 128 # 8-byte Folded Reload + ld.d $a3, $sp, 136 # 8-byte Folded Reload ldx.d $a0, $a3, $a4 - st.d $a0, $sp, 256 # 8-byte Folded Spill - ld.d $a5, $sp, 136 # 8-byte Folded Reload + st.d $a0, $sp, 272 # 8-byte Folded Spill + ld.d $a5, $sp, 144 # 8-byte Folded Reload ldx.d $a4, $a5, $a4 move $a5, $zero - ld.d $s1, $sp, 120 # 8-byte Folded Reload - ld.d $a1, $sp, 112 # 8-byte Folded Reload - ld.d $a2, $sp, 104 # 8-byte Folded Reload - ld.d $a3, $sp, 96 # 8-byte Folded Reload - ld.d $s5, $sp, 88 # 8-byte Folded Reload - ld.d $s6, $sp, 80 # 8-byte Folded Reload - ld.d $s7, $sp, 72 # 8-byte Folded Reload - ld.d $s8, $sp, 64 # 8-byte Folded Reload - ld.d $ra, $sp, 56 # 8-byte Folded Reload - ld.d $a0, $sp, 48 # 8-byte Folded Reload - ld.d $s0, $sp, 40 # 8-byte Folded Reload - ld.d $fp, $sp, 32 # 8-byte Folded Reload + ld.d $s1, $sp, 128 # 8-byte Folded Reload + ld.d $a1, $sp, 120 # 8-byte Folded Reload + ld.d $a2, $sp, 112 # 8-byte Folded Reload + ld.d $a3, $sp, 104 # 8-byte Folded Reload + ld.d $s5, $sp, 96 # 8-byte Folded Reload + ld.d $s6, $sp, 88 # 8-byte Folded Reload + ld.d $s7, $sp, 80 # 8-byte Folded Reload + ld.d $s8, $sp, 72 # 8-byte Folded Reload + ld.d $ra, $sp, 64 # 8-byte Folded Reload + ld.d $a0, $sp, 56 # 8-byte Folded Reload + ld.d $s0, $sp, 48 # 8-byte Folded Reload + ld.d $fp, $sp, 40 # 8-byte Folded Reload .p2align 4, , 16 .LBB4_13: # %vector.ph # Parent Loop BB4_11 Depth=1 @@ -714,18 +707,18 @@ main: # @main vinsgr2vr.w $vr9, $t0, 0 vinsgr2vr.w $vr9, $t1, 1 ld.d $a6, $a4, 40 - ld.d $a7, $sp, 240 # 8-byte Folded Reload + ld.d $a7, $sp, 256 # 8-byte Folded Reload ldx.w $a7, $a7, $a5 - ld.d $t0, $sp, 224 # 8-byte Folded Reload + ld.d $t0, $sp, 240 # 8-byte Folded Reload ldx.w $t0, $t0, $a5 ld.d $t1, $a4, 24 vinsgr2vr.d $vr10, $a6, 0 vinsgr2vr.w $vr0, $a7, 0 vinsgr2vr.w $vr0, $t0, 1 vinsgr2vr.d $vr11, $t1, 0 - ld.d $a6, $sp, 208 # 8-byte Folded Reload + ld.d $a6, $sp, 224 # 8-byte Folded Reload ldx.w $a6, $a6, $a5 - ld.d $a7, $sp, 192 # 8-byte Folded Reload + ld.d $a7, $sp, 208 # 8-byte Folded Reload ldx.w $a7, $a7, $a5 ld.d $t0, $a4, 8 vmul.w $vr0, $vr0, $vr11 @@ -736,18 +729,18 @@ main: # @main vmadd.w $vr0, $vr9, $vr10 vmadd.w $vr0, $vr7, $vr8 vmadd.w $vr0, $vr5, $vr6 - ld.d $a6, $sp, 176 # 8-byte Folded Reload + ld.d $a6, $sp, 192 # 8-byte Folded Reload ldx.w $a6, $a6, $a5 vmadd.w $vr0, $vr3, $vr4 - ld.d $a7, $sp, 168 # 8-byte Folded Reload + ld.d $a7, $sp, 176 # 8-byte Folded Reload ldx.w $a7, $a7, $a5 vmadd.w $vr0, $vr1, $vr2 vinsgr2vr.w $vr1, $a6, 0 ld.d $a6, $a4, 96 vinsgr2vr.w $vr1, $a7, 1 - ld.d $a7, $sp, 160 # 8-byte Folded Reload + ld.d $a7, $sp, 168 # 8-byte Folded Reload ldx.w $a7, $a7, $a5 - ld.d $t0, $sp, 152 # 8-byte Folded Reload + ld.d $t0, $sp, 160 # 8-byte Folded Reload ldx.w $t0, $t0, $a5 vinsgr2vr.d $vr2, $a6, 0 ld.d $a6, $a4, 80 @@ -802,25 +795,25 @@ main: # @main add.d $a6, $a6, $a7 mul.d $a7, $t1, $t0 add.d $a6, $a7, $a6 - ld.d $a7, $sp, 256 # 8-byte Folded Reload + ld.d $a7, $sp, 272 # 8-byte Folded Reload stx.w $a6, $a7, $a5 ori $a6, $zero, 120 addi.d $a5, $a5, 4 bne $a5, $a6, .LBB4_13 # %bb.14: # %._crit_edge28.i # in Loop: Header=BB4_12 Depth=2 - ld.d $a1, $sp, 144 # 8-byte Folded Reload + ld.d $a1, $sp, 152 # 8-byte Folded Reload addi.d $a1, $a1, 1 ori $a0, $zero, 30 bne $a1, $a0, .LBB4_12 # %bb.15: # %_Z5mmultiiPPiS0_S0_.exit # in Loop: Header=BB4_11 Depth=1 - ld.d $a0, $sp, 24 # 8-byte Folded Reload + ld.d $a0, $sp, 32 # 8-byte Folded Reload addi.w $a0, $a0, 1 - ld.d $a3, $sp, 16 # 8-byte Folded Reload + ld.d $a3, $sp, 24 # 8-byte Folded Reload bne $a0, $a3, .LBB4_11 .LBB4_16: # %_Z8mkmatrixii.exit36._crit_edge - ld.d $s5, $sp, 128 # 8-byte Folded Reload + ld.d $s5, $sp, 136 # 8-byte Folded Reload ld.d $a0, $s5, 0 ld.w $a1, $a0, 0 pcalau12i $a0, %got_pc_hi20(_ZSt4cout) @@ -867,8 +860,8 @@ main: # @main beqz $s2, .LBB4_27 # %bb.17: # %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit.i.i ld.bu $a1, $s2, 56 - ld.d $s3, $sp, 136 # 8-byte Folded Reload - ld.d $s4, $sp, 8 # 8-byte Folded Reload + ld.d $s3, $sp, 144 # 8-byte Folded Reload + ld.d $s4, $sp, 16 # 8-byte Folded Reload beqz $a1, .LBB4_19 # %bb.18: ld.bu $a1, $s2, 67 @@ -940,18 +933,18 @@ main: # @main pcaddu18i $ra, %call36(free) jirl $ra, $ra, 0 move $a0, $zero - ld.d $s8, $sp, 280 # 8-byte Folded Reload - ld.d $s7, $sp, 288 # 8-byte Folded Reload - ld.d $s6, $sp, 296 # 8-byte Folded Reload - ld.d $s5, $sp, 304 # 8-byte Folded Reload - ld.d $s4, $sp, 312 # 8-byte Folded Reload - ld.d $s3, $sp, 320 # 8-byte Folded Reload - ld.d $s2, $sp, 328 # 8-byte Folded Reload - ld.d $s1, $sp, 336 # 8-byte Folded Reload - ld.d $s0, $sp, 344 # 8-byte Folded Reload - ld.d $fp, $sp, 352 # 8-byte Folded Reload - ld.d $ra, $sp, 360 # 8-byte Folded Reload - addi.d $sp, $sp, 368 + ld.d $s8, $sp, 296 # 8-byte Folded Reload + ld.d $s7, $sp, 304 # 8-byte Folded Reload + ld.d $s6, $sp, 312 # 8-byte Folded Reload + ld.d $s5, $sp, 320 # 8-byte Folded Reload + ld.d $s4, $sp, 328 # 8-byte Folded Reload + ld.d $s3, $sp, 336 # 8-byte Folded Reload + ld.d $s2, $sp, 344 # 8-byte Folded Reload + ld.d $s1, $sp, 352 # 8-byte Folded Reload + ld.d $s0, $sp, 360 # 8-byte Folded Reload + ld.d $fp, $sp, 368 # 8-byte Folded Reload + ld.d $ra, $sp, 376 # 8-byte Folded Reload + addi.d $sp, $sp, 384 ret .LBB4_27: pcaddu18i $ra, %call36(_ZSt16__throw_bad_castv) diff --git a/results/SingleSource/Benchmarks/Shootout/CMakeFiles/Shootout-matrix.dir/matrix.s b/results/SingleSource/Benchmarks/Shootout/CMakeFiles/Shootout-matrix.dir/matrix.s index a2ca6462..de02aa48 100644 --- a/results/SingleSource/Benchmarks/Shootout/CMakeFiles/Shootout-matrix.dir/matrix.s +++ b/results/SingleSource/Benchmarks/Shootout/CMakeFiles/Shootout-matrix.dir/matrix.s @@ -346,15 +346,15 @@ mmult: # @mmult .section .rodata.cst16,"aM",@progbits,16 .p2align 4, 0x0 # -- Begin function main .LCPI4_0: - .word 0 # 0x0 - .word 1 # 0x1 - .word 2 # 0x2 - .word 3 # 0x3 -.LCPI4_1: .word 4 # 0x4 .word 5 # 0x5 .word 6 # 0x6 .word 7 # 0x7 +.LCPI4_1: + .word 0 # 0x0 + .word 1 # 0x1 + .word 2 # 0x2 + .word 3 # 0x3 .text .globl main .p2align 5 @@ -412,8 +412,8 @@ main: # @main vadd.w $vr1, $vr0, $vr1 vld $vr2, $sp, 16 # 16-byte Folded Reload vadd.w $vr0, $vr0, $vr2 - vst $vr0, $a0, 16 - vst $vr1, $a0, 0 + vst $vr0, $a0, 0 + vst $vr1, $a0, 16 addi.d $a1, $s3, 8 st.w $a1, $a0, 32 addi.d $a1, $s3, 9 @@ -441,8 +441,8 @@ main: # @main vadd.w $vr1, $vr0, $vr1 vld $vr2, $sp, 16 # 16-byte Folded Reload vadd.w $vr0, $vr0, $vr2 - vst $vr0, $a0, 16 - vst $vr1, $a0, 0 + vst $vr0, $a0, 0 + vst $vr1, $a0, 16 addi.d $a1, $s4, 8 st.w $a1, $a0, 32 addi.d $a1, $s4, 9 @@ -470,8 +470,8 @@ main: # @main vadd.w $vr1, $vr0, $vr1 vld $vr2, $sp, 16 # 16-byte Folded Reload vadd.w $vr0, $vr0, $vr2 - vst $vr0, $a0, 16 - vst $vr1, $a0, 0 + vst $vr0, $a0, 0 + vst $vr1, $a0, 16 addi.d $a1, $s5, 8 st.w $a1, $a0, 32 addi.d $a1, $s5, 9 diff --git a/results/SingleSource/Benchmarks/SmallPT/CMakeFiles/smallpt.dir/smallpt.s b/results/SingleSource/Benchmarks/SmallPT/CMakeFiles/smallpt.dir/smallpt.s index 1435aa1e..c7fc3c5c 100644 --- a/results/SingleSource/Benchmarks/SmallPT/CMakeFiles/smallpt.dir/smallpt.s +++ b/results/SingleSource/Benchmarks/SmallPT/CMakeFiles/smallpt.dir/smallpt.s @@ -21,43 +21,45 @@ .type _Z8radianceRK3RayiPt,@function _Z8radianceRK3RayiPt: # @_Z8radianceRK3RayiPt # %bb.0: - addi.d $sp, $sp, -432 - st.d $ra, $sp, 424 # 8-byte Folded Spill - st.d $fp, $sp, 416 # 8-byte Folded Spill - st.d $s0, $sp, 408 # 8-byte Folded Spill - st.d $s1, $sp, 400 # 8-byte Folded Spill - st.d $s2, $sp, 392 # 8-byte Folded Spill - st.d $s3, $sp, 384 # 8-byte Folded Spill - st.d $s4, $sp, 376 # 8-byte Folded Spill - st.d $s5, $sp, 368 # 8-byte Folded Spill - st.d $s6, $sp, 360 # 8-byte Folded Spill - st.d $s7, $sp, 352 # 8-byte Folded Spill - st.d $s8, $sp, 344 # 8-byte Folded Spill - fst.d $fs0, $sp, 336 # 8-byte Folded Spill - fst.d $fs1, $sp, 328 # 8-byte Folded Spill - fst.d $fs2, $sp, 320 # 8-byte Folded Spill - fst.d $fs3, $sp, 312 # 8-byte Folded Spill - fst.d $fs4, $sp, 304 # 8-byte Folded Spill - fst.d $fs5, $sp, 296 # 8-byte Folded Spill - fst.d $fs6, $sp, 288 # 8-byte Folded Spill - fst.d $fs7, $sp, 280 # 8-byte Folded Spill - fld.d $fs4, $a1, 0 - fld.d $fs5, $a1, 8 - fld.d $fs6, $a1, 16 - fld.d $fa4, $a1, 24 - fld.d $fs2, $a1, 32 - fld.d $fs1, $a1, 40 - pcalau12i $s1, %pc_hi20(.LCPI0_0) - fld.d $fa5, $s1, %pc_lo12(.LCPI0_0) + addi.d $sp, $sp, -464 + st.d $ra, $sp, 456 # 8-byte Folded Spill + st.d $fp, $sp, 448 # 8-byte Folded Spill + st.d $s0, $sp, 440 # 8-byte Folded Spill + st.d $s1, $sp, 432 # 8-byte Folded Spill + st.d $s2, $sp, 424 # 8-byte Folded Spill + st.d $s3, $sp, 416 # 8-byte Folded Spill + st.d $s4, $sp, 408 # 8-byte Folded Spill + st.d $s5, $sp, 400 # 8-byte Folded Spill + st.d $s6, $sp, 392 # 8-byte Folded Spill + st.d $s7, $sp, 384 # 8-byte Folded Spill + st.d $s8, $sp, 376 # 8-byte Folded Spill + fst.d $fs0, $sp, 368 # 8-byte Folded Spill + fst.d $fs1, $sp, 360 # 8-byte Folded Spill + fst.d $fs2, $sp, 352 # 8-byte Folded Spill + fst.d $fs3, $sp, 344 # 8-byte Folded Spill + fst.d $fs4, $sp, 336 # 8-byte Folded Spill + fst.d $fs5, $sp, 328 # 8-byte Folded Spill + fst.d $fs6, $sp, 320 # 8-byte Folded Spill + fst.d $fs7, $sp, 312 # 8-byte Folded Spill move $fp, $a0 + vld $vr6, $a1, 0 + fld.d $fs1, $a1, 16 + vld $vr7, $a1, 24 + fld.d $fs0, $a1, 40 + vreplvei.d $vr9, $vr6, 0 + vreplvei.d $vr10, $vr6, 1 + pcalau12i $s1, %pc_hi20(.LCPI0_0) + fld.d $ft0, $s1, %pc_lo12(.LCPI0_0) + vreplvei.d $vr4, $vr7, 0 + vreplvei.d $vr5, $vr7, 1 ori $s2, $zero, 8 pcalau12i $a0, %pc_hi20(spheres) addi.d $s0, $a0, %pc_lo12(spheres) pcalau12i $a0, %pc_hi20(.LCPI0_1) - fld.d $fs3, $a0, %pc_lo12(.LCPI0_1) + fld.d $fs2, $a0, %pc_lo12(.LCPI0_1) move $s4, $zero move $s3, $zero - movgr2fr.d $fs7, $zero + movgr2fr.d $fs5, $zero addi.w $s5, $zero, -792 b .LBB0_2 .p2align 4, , 16 @@ -71,19 +73,19 @@ _Z8radianceRK3RayiPt: # @_Z8radianceRK3RayiPt fld.d $fa0, $a0, 712 fld.d $fa1, $a0, 720 fld.d $fa2, $a0, 728 - fsub.d $fa0, $fa0, $fs4 - fsub.d $fa1, $fa1, $fs5 - fsub.d $fa2, $fa2, $fs6 - fmul.d $fa3, $fa1, $fs2 + fsub.d $fa0, $fa0, $ft1 + fsub.d $fa1, $fa1, $ft2 + fsub.d $fa2, $fa2, $fs1 + fmul.d $fa3, $fa1, $fa5 fmadd.d $fa3, $fa0, $fa4, $fa3 fmul.d $fa1, $fa1, $fa1 fmadd.d $fa0, $fa0, $fa0, $fa1 fld.d $fa1, $a0, 704 - fmadd.d $fs0, $fa2, $fs1, $fa3 + fmadd.d $fs3, $fa2, $fs0, $fa3 fmadd.d $fa0, $fa2, $fa2, $fa0 - fmsub.d $fa0, $fs0, $fs0, $fa0 + fmsub.d $fa0, $fs3, $fs3, $fa0 fmadd.d $fa0, $fa1, $fa1, $fa0 - fcmp.clt.d $fcc0, $fa0, $fs7 + fcmp.clt.d $fcc0, $fa0, $fs5 bcnez $fcc0, .LBB0_1 # %bb.3: # %_ZNK6Sphere9intersectERK3Ray.exit.i # in Loop: Header=BB0_2 Depth=1 @@ -92,33 +94,43 @@ _Z8radianceRK3RayiPt: # @_Z8radianceRK3RayiPt bceqz $fcc0, .LBB0_7 .LBB0_4: # %_ZNK6Sphere9intersectERK3Ray.exit.i.split # in Loop: Header=BB0_2 Depth=1 - fsub.d $fa0, $fs0, $fa1 - fadd.d $fa1, $fs0, $fa1 - fcmp.clt.d $fcc0, $fs3, $fa1 - fsel $fa1, $fs7, $fa1, $fcc0 - fcmp.clt.d $fcc0, $fs3, $fa0 + fsub.d $fa0, $fs3, $fa1 + fadd.d $fa1, $fs3, $fa1 + fcmp.clt.d $fcc0, $fs2, $fa1 + fsel $fa1, $fs5, $fa1, $fcc0 + fcmp.clt.d $fcc0, $fs2, $fa0 fsel $fa0, $fa1, $fa0, $fcc0 - fcmp.ceq.d $fcc0, $fa0, $fs7 + fcmp.ceq.d $fcc0, $fa0, $fs5 bcnez $fcc0, .LBB0_1 # %bb.5: # %_ZNK6Sphere9intersectERK3Ray.exit.i.split # in Loop: Header=BB0_2 Depth=1 - fcmp.cule.d $fcc0, $fa5, $fa0 + fcmp.cule.d $fcc0, $ft0, $fa0 bcnez $fcc0, .LBB0_1 # %bb.6: # in Loop: Header=BB0_2 Depth=1 move $s3, $s2 - fmov.d $fa5, $fa0 + fmov.d $ft0, $fa0 b .LBB0_1 .LBB0_7: # %call.sqrt # in Loop: Header=BB0_2 Depth=1 move $s6, $a3 move $s8, $a2 move $s7, $a1 - fst.d $fa4, $sp, 80 # 8-byte Folded Spill - fst.d $fa5, $sp, 72 # 8-byte Folded Spill + vst $vr4, $sp, 112 # 16-byte Folded Spill + vst $vr5, $sp, 96 # 16-byte Folded Spill + vst $vr6, $sp, 80 # 16-byte Folded Spill + vst $vr7, $sp, 64 # 16-byte Folded Spill + vst $vr8, $sp, 48 # 16-byte Folded Spill + vst $vr9, $sp, 128 # 16-byte Folded Spill + vst $vr10, $sp, 32 # 16-byte Folded Spill pcaddu18i $ra, %call36(sqrt) jirl $ra, $ra, 0 - fld.d $fa5, $sp, 72 # 8-byte Folded Reload - fld.d $fa4, $sp, 80 # 8-byte Folded Reload + vld $vr10, $sp, 32 # 16-byte Folded Reload + vld $vr9, $sp, 128 # 16-byte Folded Reload + vld $vr8, $sp, 48 # 16-byte Folded Reload + vld $vr7, $sp, 64 # 16-byte Folded Reload + vld $vr6, $sp, 80 # 16-byte Folded Reload + vld $vr5, $sp, 96 # 16-byte Folded Reload + vld $vr4, $sp, 112 # 16-byte Folded Reload move $a1, $s7 move $a2, $s8 move $a3, $s6 @@ -126,197 +138,201 @@ _Z8radianceRK3RayiPt: # @_Z8radianceRK3RayiPt b .LBB0_4 .LBB0_8: # %_Z9intersectRK3RayRdRi.exit fld.d $fa0, $s1, %pc_lo12(.LCPI0_0) - fcmp.clt.d $fcc0, $fa5, $fa0 + fcmp.clt.d $fcc0, $ft0, $fa0 bceqz $fcc0, .LBB0_13 # %bb.9: ori $a0, $zero, 88 mul.d $a0, $s3, $a0 add.d $s3, $s0, $a0 - fld.d $fs3, $s3, 8 - fld.d $fa3, $s3, 16 - fld.d $fa6, $s3, 24 + fld.d $fs2, $s3, 24 + vld $vr2, $s3, 8 vld $vr1, $s3, 56 - fld.d $fs0, $s3, 72 + fld.d $fs3, $s3, 72 ori $a0, $zero, 5 - vst $vr1, $sp, 112 # 16-byte Folded Spill + vst $vr1, $sp, 128 # 16-byte Folded Spill blt $a2, $a0, .LBB0_15 # %bb.10: - fst.d $fa6, $sp, 48 # 8-byte Folded Spill - fst.d $fa3, $sp, 64 # 8-byte Folded Spill - fst.d $fa5, $sp, 72 # 8-byte Folded Spill - fst.d $fa4, $sp, 80 # 8-byte Folded Spill + vst $vr2, $sp, 32 # 16-byte Folded Spill + vst $vr8, $sp, 48 # 16-byte Folded Spill + vst $vr7, $sp, 64 # 16-byte Folded Spill + vst $vr6, $sp, 80 # 16-byte Folded Spill + vst $vr5, $sp, 96 # 16-byte Folded Spill + vst $vr4, $sp, 112 # 16-byte Folded Spill move $s0, $a1 move $s1, $a2 vreplvei.d $vr0, $vr1, 0 vreplvei.d $vr1, $vr1, 1 - fcmp.clt.d $fcc0, $fs0, $fa1 - fsel $fa2, $fs0, $fa1, $fcc0 - fst.d $fs0, $sp, 104 # 8-byte Folded Spill - fcmp.clt.d $fcc0, $fs0, $fa0 + fcmp.clt.d $fcc0, $fs3, $fa1 + fsel $fa2, $fs3, $fa1, $fcc0 + fcmp.clt.d $fcc0, $fs3, $fa0 fsel $fa3, $fa2, $fa0, $fcc0 fcmp.clt.d $fcc0, $fa1, $fa0 - fsel $fs0, $fa2, $fa3, $fcc0 + fsel $fs4, $fa2, $fa3, $fcc0 move $s2, $a3 move $a0, $a3 pcaddu18i $ra, %call36(erand48) jirl $ra, $ra, 0 - fcmp.clt.d $fcc0, $fa0, $fs0 + fcmp.clt.d $fcc0, $fa0, $fs4 bceqz $fcc0, .LBB0_12 # %bb.11: ori $a0, $zero, 128 - fld.d $fa4, $sp, 80 # 8-byte Folded Reload - fld.d $fa5, $sp, 72 # 8-byte Folded Reload bltu $s1, $a0, .LBB0_14 .LBB0_12: ld.d $a0, $s3, 48 vld $vr0, $s3, 32 st.d $a0, $fp, 16 vst $vr0, $fp, 0 - b .LBB0_25 + b .LBB0_24 .LBB0_13: st.d $zero, $fp, 16 vrepli.b $vr0, 0 vst $vr0, $fp, 0 - b .LBB0_25 + b .LBB0_24 .LBB0_14: move $a2, $s1 move $a1, $s0 - frecip.d $fa0, $fs0 + frecip.d $fa0, $fs4 vreplvei.d $vr1, $vr0, 0 - vld $vr2, $sp, 112 # 16-byte Folded Reload + vld $vr2, $sp, 128 # 16-byte Folded Reload vfmul.d $vr2, $vr2, $vr1 - vst $vr2, $sp, 112 # 16-byte Folded Spill - fld.d $fs0, $sp, 104 # 8-byte Folded Reload - fmul.d $fs0, $fs0, $fa0 + vst $vr2, $sp, 128 # 16-byte Folded Spill + fmul.d $fs3, $fs3, $fa0 move $a3, $s2 - fld.d $fa3, $sp, 64 # 8-byte Folded Reload - fld.d $fa6, $sp, 48 # 8-byte Folded Reload + vld $vr4, $sp, 112 # 16-byte Folded Reload + vld $vr5, $sp, 96 # 16-byte Folded Reload + vld $vr6, $sp, 80 # 16-byte Folded Reload + vld $vr7, $sp, 64 # 16-byte Folded Reload + vld $vr8, $sp, 48 # 16-byte Folded Reload + vld $vr2, $sp, 32 # 16-byte Folded Reload .LBB0_15: # %.thread - fmul.d $fa0, $fa4, $fa5 - fmul.d $fa1, $fa5, $fs2 - fmul.d $fa2, $fa5, $fs1 - fadd.d $ft1, $fs4, $fa0 - fadd.d $ft0, $fs5, $fa1 - fadd.d $fa7, $fs6, $fa2 - fsub.d $fa0, $ft1, $fs3 - fsub.d $fa1, $ft0, $fa3 - fsub.d $fa2, $fa7, $fa6 - fmul.d $fa3, $fa1, $fa1 - fmadd.d $fa3, $fa0, $fa0, $fa3 - fmadd.d $fa3, $fa2, $fa2, $fa3 - frsqrt.d $fa3, $fa3 - fmul.d $fs3, $fa0, $fa3 + vreplvei.d $vr0, $vr8, 0 + vfmul.d $vr0, $vr7, $vr0 + fmul.d $fa1, $ft0, $fs0 + fadd.d $ft0, $fs1, $fa1 + fsub.d $fa1, $ft0, $fs2 + vfadd.d $vr9, $vr6, $vr0 + vfsub.d $vr0, $vr9, $vr2 + vfmul.d $vr2, $vr0, $vr0 + vreplvei.d $vr2, $vr2, 1 + vreplvei.d $vr3, $vr0, 0 + fmadd.d $fa2, $fa3, $fa3, $fa2 + fmadd.d $fa2, $fa1, $fa1, $fa2 + frsqrt.d $fa2, $fa2 + vreplvei.d $vr3, $vr2, 0 + vfmul.d $vr0, $vr0, $vr3 ld.w $a0, $s3, 80 - fmul.d $fs4, $fa1, $fa3 - fmul.d $fs5, $fa2, $fa3 + fmul.d $fs1, $fa1, $fa2 + vreplvei.d $vr10, $vr0, 0 ori $a4, $zero, 1 addi.w $s0, $a2, 1 beq $a0, $a4, .LBB0_20 # %bb.16: # %.thread - fst.d $fs0, $sp, 104 # 8-byte Folded Spill - fmul.d $fa0, $fs2, $fs4 - fmadd.d $fa0, $fs3, $fa4, $fa0 - fmadd.d $fa0, $fs5, $fs1, $fa0 - fneg.d $fa1, $fs3 - fneg.d $fa2, $fs4 - fneg.d $fa3, $fs5 - fcmp.clt.d $fcc0, $fa0, $fs7 - fsel $fs0, $fa1, $fs3, $fcc0 - fsel $fs1, $fa2, $fs4, $fcc0 - fsel $fs6, $fa3, $fs5, $fcc0 + vreplvei.d $vr11, $vr0, 1 + fmul.d $fa1, $fa5, $ft3 + fmadd.d $fa1, $ft2, $fa4, $fa1 + fmadd.d $fa1, $fs1, $fs0, $fa1 + fneg.d $fa2, $ft2 + fneg.d $fa3, $ft3 + fneg.d $fa4, $fs1 + fcmp.clt.d $fcc0, $fa1, $fs5 + fsel $fs7, $fa2, $ft2, $fcc0 + fsel $fs4, $fa3, $ft3, $fcc0 + fsel $fs6, $fa4, $fs1, $fcc0 bnez $a0, .LBB0_21 # %bb.17: - fst.d $ft1, $sp, 64 # 8-byte Folded Spill - fst.d $ft0, $sp, 72 # 8-byte Folded Spill - fst.d $fa7, $sp, 80 # 8-byte Folded Spill + vst $vr10, $sp, 80 # 16-byte Folded Spill + vst $vr9, $sp, 48 # 16-byte Folded Spill + fst.d $ft0, $sp, 112 # 8-byte Folded Spill + fst.d $fs3, $sp, 64 # 8-byte Folded Spill move $a0, $a3 move $s2, $a3 pcaddu18i $ra, %call36(erand48) jirl $ra, $ra, 0 pcalau12i $a0, %pc_hi20(.LCPI0_2) fld.d $fa1, $a0, %pc_lo12(.LCPI0_2) - fmul.d $fs4, $fa0, $fa1 + fmul.d $fs1, $fa0, $fa1 move $s1, $s2 move $a0, $s2 pcaddu18i $ra, %call36(erand48) jirl $ra, $ra, 0 - fst.d $fa0, $sp, 48 # 8-byte Folded Spill + fst.d $fa0, $sp, 96 # 8-byte Folded Spill fsqrt.d $fs2, $fa0 fcmp.cor.d $fcc0, $fs2, $fs2 - bceqz $fcc0, .LBB0_34 + bceqz $fcc0, .LBB0_33 .LBB0_18: # %.split pcalau12i $a0, %pc_hi20(.LCPI0_3) fld.d $fa0, $a0, %pc_lo12(.LCPI0_3) - fabs.d $fa1, $fs3 + vld $vr1, $sp, 80 # 16-byte Folded Reload + fabs.d $fa1, $fa1 fcmp.clt.d $fcc0, $fa0, $fa1 vldi $vr1, -912 - fsel $fa0, $fa1, $fs7, $fcc0 - fsel $fa1, $fs7, $fa1, $fcc0 - fneg.d $fa2, $fs7 - fmul.d $fa2, $fs1, $fa2 + fsel $fa0, $fa1, $fs5, $fcc0 + fsel $fa1, $fs5, $fa1, $fcc0 + fneg.d $fa2, $fs5 + fmul.d $fa2, $fs4, $fa2 fmadd.d $fa2, $fa1, $fs6, $fa2 fneg.d $fa3, $fs6 fmul.d $fa3, $fa0, $fa3 - fmadd.d $fa3, $fs0, $fs7, $fa3 - fneg.d $fa4, $fs0 + fmadd.d $fa3, $fs7, $fs5, $fa3 + fneg.d $fa4, $fs7 fmul.d $fa1, $fa1, $fa4 - fmadd.d $fa0, $fa0, $fs1, $fa1 + fmadd.d $fa0, $fa0, $fs4, $fa1 fmul.d $fa1, $fa3, $fa3 fmadd.d $fa1, $fa2, $fa2, $fa1 fmadd.d $fa1, $fa0, $fa0, $fa1 frsqrt.d $fa1, $fa1 - fmul.d $fs3, $fa2, $fa1 - fmul.d $fs7, $fa3, $fa1 - fmul.d $fs5, $fa0, $fa1 - fneg.d $fa0, $fs7 + fmul.d $fs5, $fa2, $fa1 + fmul.d $fs3, $fa3, $fa1 + fmul.d $fs0, $fa0, $fa1 + fneg.d $fa0, $fs3 fmul.d $fa0, $fs6, $fa0 - fmadd.d $fa0, $fs1, $fs5, $fa0 + fmadd.d $fa0, $fs4, $fs0, $fa0 + fst.d $fa0, $sp, 80 # 8-byte Folded Spill + fneg.d $fa0, $fs0 + fmul.d $fa0, $fs7, $fa0 + fmadd.d $fa0, $fs6, $fs5, $fa0 fst.d $fa0, $sp, 32 # 8-byte Folded Spill fneg.d $fa0, $fs5 - fmul.d $fa0, $fs0, $fa0 - fmadd.d $fa0, $fs6, $fs3, $fa0 + fmul.d $fa0, $fs4, $fa0 + fmadd.d $fa0, $fs7, $fs3, $fa0 fst.d $fa0, $sp, 24 # 8-byte Folded Spill - fneg.d $fa0, $fs3 - fmul.d $fa0, $fs1, $fa0 - fmadd.d $fa0, $fs0, $fs7, $fa0 - fst.d $fa0, $sp, 16 # 8-byte Folded Spill - fmov.d $fa0, $fs4 + fmov.d $fa0, $fs1 pcaddu18i $ra, %call36(cos) jirl $ra, $ra, 0 - fmul.d $fa1, $fs3, $fa0 - fmul.d $fa2, $fs7, $fa0 - fmul.d $fa0, $fs5, $fa0 - fmul.d $fs3, $fs2, $fa1 - fmul.d $fs5, $fs2, $fa2 - fmul.d $fs7, $fs2, $fa0 - fmov.d $fa0, $fs4 + fmul.d $fa1, $fs5, $fa0 + fmul.d $fa2, $fs3, $fa0 + fmul.d $fa0, $fs0, $fa0 + fmul.d $fs0, $fs2, $fa1 + fmul.d $fs3, $fs2, $fa2 + fmul.d $fs5, $fs2, $fa0 + fmov.d $fa0, $fs1 pcaddu18i $ra, %call36(sin) jirl $ra, $ra, 0 - fld.d $fa1, $sp, 32 # 8-byte Folded Reload + fld.d $fa1, $sp, 80 # 8-byte Folded Reload fmul.d $fa1, $fa1, $fa0 - fld.d $fa2, $sp, 24 # 8-byte Folded Reload + fld.d $fa2, $sp, 32 # 8-byte Folded Reload fmul.d $fa2, $fa2, $fa0 - fld.d $fa3, $sp, 16 # 8-byte Folded Reload + fld.d $fa3, $sp, 24 # 8-byte Folded Reload fmul.d $fa0, $fa3, $fa0 fmul.d $fa1, $fs2, $fa1 fmul.d $fa2, $fs2, $fa2 fmul.d $fa3, $fs2, $fa0 - fadd.d $fs2, $fs3, $fa1 - fadd.d $fs3, $fs5, $fa2 - fld.d $fa0, $sp, 48 # 8-byte Folded Reload + fadd.d $fs1, $fs0, $fa1 + fadd.d $fs2, $fs3, $fa2 + fld.d $fa0, $sp, 96 # 8-byte Folded Reload vldi $vr1, -912 fsub.d $fa1, $fa1, $fa0 fsqrt.d $fa0, $fa1 fcmp.cor.d $fcc0, $fa0, $fa0 - fadd.d $fs4, $fs7, $fa3 - bceqz $fcc0, .LBB0_35 + fadd.d $fs0, $fs5, $fa3 + bceqz $fcc0, .LBB0_34 .LBB0_19: # %.split.split - fmul.d $fa1, $fs0, $fa0 - fmul.d $fa2, $fs1, $fa0 + fmul.d $fa1, $fs7, $fa0 + fmul.d $fa2, $fs4, $fa0 fmul.d $fa0, $fs6, $fa0 - fadd.d $fa1, $fa1, $fs2 - fadd.d $fa2, $fa2, $fs3 - fadd.d $fa0, $fa0, $fs4 + fadd.d $fa1, $fa1, $fs1 + fadd.d $fa2, $fa2, $fs2 + fadd.d $fa0, $fa0, $fs0 fmul.d $fa3, $fa2, $fa2 fmadd.d $fa3, $fa1, $fa1, $fa3 fmadd.d $fa3, $fa0, $fa0, $fa3 @@ -324,166 +340,156 @@ _Z8radianceRK3RayiPt: # @_Z8radianceRK3RayiPt fmul.d $fa1, $fa1, $fa3 fmul.d $fa2, $fa2, $fa3 fmul.d $fa0, $fa0, $fa3 - fld.d $fa3, $sp, 64 # 8-byte Folded Reload - fst.d $fa3, $sp, 232 - fld.d $fa3, $sp, 72 # 8-byte Folded Reload - fst.d $fa3, $sp, 240 - fld.d $fa3, $sp, 80 # 8-byte Folded Reload - fst.d $fa3, $sp, 248 - fst.d $fa1, $sp, 256 - fst.d $fa2, $sp, 264 - fst.d $fa0, $sp, 272 - addi.d $a0, $sp, 160 - addi.d $a1, $sp, 232 + vld $vr3, $sp, 48 # 16-byte Folded Reload + vst $vr3, $sp, 256 + fld.d $fa3, $sp, 112 # 8-byte Folded Reload + fst.d $fa3, $sp, 272 + fst.d $fa1, $sp, 280 + fst.d $fa2, $sp, 288 + fst.d $fa0, $sp, 296 + addi.d $a0, $sp, 176 + addi.d $a1, $sp, 256 move $a2, $s0 move $a3, $s1 - b .LBB0_23 -.LBB0_20: - fadd.d $fa0, $fs3, $fs3 - fld.d $fa1, $a1, 32 - fld.d $fa2, $a1, 24 - fadd.d $fa3, $fs4, $fs4 - fld.d $fa4, $a1, 40 - fmul.d $fa5, $fs4, $fa1 - fmadd.d $fa5, $fs3, $fa2, $fa5 - fadd.d $fa6, $fs5, $fs5 - fmadd.d $fa5, $fs5, $fa4, $fa5 - fmul.d $fa0, $fa0, $fa5 - fmul.d $fa3, $fa3, $fa5 - fmul.d $fa5, $fa6, $fa5 - fsub.d $fa0, $fa2, $fa0 - fsub.d $fa1, $fa1, $fa3 - fsub.d $fa2, $fa4, $fa5 - fst.d $ft1, $sp, 232 - fst.d $ft0, $sp, 240 - fst.d $fa7, $sp, 248 - fst.d $fa0, $sp, 256 - fst.d $fa1, $sp, 264 - fst.d $fa2, $sp, 272 - addi.d $a0, $sp, 160 - addi.d $a1, $sp, 232 - move $a2, $s0 pcaddu18i $ra, %call36(_Z8radianceRK3RayiPt) jirl $ra, $ra, 0 - fld.d $fa0, $sp, 176 + fld.d $fa0, $sp, 192 fld.d $fa1, $s3, 48 - vld $vr2, $sp, 160 + vld $vr2, $sp, 176 vld $vr3, $s3, 32 - fmul.d $fa0, $fs0, $fa0 - b .LBB0_24 + fld.d $fa4, $sp, 64 # 8-byte Folded Reload + fmul.d $fa0, $fa4, $fa0 + b .LBB0_23 +.LBB0_20: + fadd.d $fa1, $fs1, $fs1 + fld.d $fa2, $a1, 40 + vld $vr3, $a1, 24 + vst $vr9, $sp, 256 + fst.d $ft0, $sp, 272 + vfadd.d $vr4, $vr0, $vr0 + vfmul.d $vr0, $vr0, $vr3 + vreplvei.d $vr0, $vr0, 1 + vreplvei.d $vr5, $vr3, 0 + fmadd.d $fa0, $ft2, $fa5, $fa0 + fmadd.d $fa0, $fs1, $fa2, $fa0 + vreplvei.d $vr5, $vr0, 0 + vfmul.d $vr4, $vr4, $vr5 + fmul.d $fa0, $fa1, $fa0 + vfsub.d $vr1, $vr3, $vr4 + fsub.d $fa0, $fa2, $fa0 + vst $vr1, $sp, 280 + fst.d $fa0, $sp, 296 + b .LBB0_22 .LBB0_21: - fadd.d $fa0, $fs3, $fs3 - fadd.d $fa4, $fs4, $fs4 - fld.d $fa1, $a1, 32 - fld.d $fa3, $a1, 24 + vfadd.d $vr0, $vr0, $vr0 + fadd.d $fa4, $fs1, $fs1 + vld $vr5, $a1, 24 fld.d $fa2, $a1, 40 - fadd.d $fa5, $fs5, $fs5 - fmul.d $fa6, $fs4, $fa1 - fmadd.d $fa6, $fs3, $fa3, $fa6 - fmadd.d $fa6, $fs5, $fa2, $fa6 - fmul.d $fa0, $fa0, $fa6 + vst $vr9, $sp, 256 + fst.d $ft0, $sp, 272 + vreplvei.d $vr1, $vr5, 1 + fmul.d $fa6, $ft3, $fa1 + vreplvei.d $vr3, $vr5, 0 + fmadd.d $fa6, $ft2, $fa3, $fa6 + fmadd.d $fa6, $fs1, $fa2, $fa6 + vreplvei.d $vr7, $vr6, 0 + vfmul.d $vr0, $vr0, $vr7 fmul.d $fa4, $fa4, $fa6 - fmul.d $fa5, $fa5, $fa6 - fsub.d $fa0, $fa3, $fa0 - fsub.d $fa4, $fa1, $fa4 - fsub.d $fa5, $fa2, $fa5 - fst.d $ft1, $sp, 232 - fst.d $ft0, $sp, 240 - fst.d $fa7, $sp, 248 - fst.d $fa0, $sp, 256 - fst.d $fa4, $sp, 264 - fmul.d $fa0, $fs4, $fs1 - fmadd.d $fa0, $fs3, $fs0, $fa0 + vfsub.d $vr0, $vr5, $vr0 + fsub.d $fa4, $fa2, $fa4 + vst $vr0, $sp, 280 + fmul.d $fa0, $ft3, $fs4 + fmadd.d $fa0, $ft2, $fs7, $fa0 pcalau12i $a0, %pc_hi20(.LCPI0_4) - fld.d $fa4, $a0, %pc_lo12(.LCPI0_4) - fmadd.d $fa0, $fs5, $fs6, $fa0 - fcmp.clt.d $fcc1, $fs7, $fa0 + fld.d $fa5, $a0, %pc_lo12(.LCPI0_4) + fmadd.d $fa0, $fs1, $fs6, $fa0 + fcmp.clt.d $fcc1, $fs5, $fa0 vldi $vr0, -904 - fsel $fs2, $fa0, $fa4, $fcc1 - fmul.d $fa0, $fs1, $fa1 - fmadd.d $fa0, $fa3, $fs0, $fa0 - fmadd.d $fs0, $fa2, $fs6, $fa0 + fsel $fs0, $fa0, $fa5, $fcc1 + fmul.d $fa0, $fs4, $fa1 + fmadd.d $fa0, $fa3, $fs7, $fa0 + fmadd.d $fs2, $fa2, $fs6, $fa0 fnmadd.d $fs6, $fa2, $fs6, $fa0 vldi $vr0, -912 - fmadd.d $fa4, $fs6, $fs0, $fa0 - fneg.d $fa6, $fs2 - fmul.d $fa6, $fs2, $fa6 - fmadd.d $fa0, $fa6, $fa4, $fa0 - fcmp.cule.d $fcc0, $fs7, $fa0 - fst.d $fa5, $sp, 272 - bcnez $fcc0, .LBB0_26 -# %bb.22: - addi.d $a0, $sp, 160 - addi.d $a1, $sp, 232 + fmadd.d $fa5, $fs6, $fs2, $fa0 + fneg.d $fa6, $fs0 + fmul.d $fa6, $fs0, $fa6 + fmadd.d $fa0, $fa6, $fa5, $fa0 + fcmp.cule.d $fcc0, $fs5, $fa0 + fst.d $fa4, $sp, 296 + bcnez $fcc0, .LBB0_25 +.LBB0_22: + addi.d $a0, $sp, 176 + addi.d $a1, $sp, 256 move $a2, $s0 -.LBB0_23: pcaddu18i $ra, %call36(_Z8radianceRK3RayiPt) jirl $ra, $ra, 0 - fld.d $fa0, $sp, 176 + fld.d $fa0, $sp, 192 fld.d $fa1, $s3, 48 - vld $vr2, $sp, 160 + vld $vr2, $sp, 176 vld $vr3, $s3, 32 - fld.d $fa4, $sp, 104 # 8-byte Folded Reload - fmul.d $fa0, $fa4, $fa0 -.LBB0_24: + fmul.d $fa0, $fs3, $fa0 +.LBB0_23: fadd.d $fa0, $fa0, $fa1 - vld $vr1, $sp, 112 # 16-byte Folded Reload + vld $vr1, $sp, 128 # 16-byte Folded Reload vfmul.d $vr1, $vr1, $vr2 vfadd.d $vr1, $vr1, $vr3 vst $vr1, $fp, 0 fst.d $fa0, $fp, 16 -.LBB0_25: - fld.d $fs7, $sp, 280 # 8-byte Folded Reload - fld.d $fs6, $sp, 288 # 8-byte Folded Reload - fld.d $fs5, $sp, 296 # 8-byte Folded Reload - fld.d $fs4, $sp, 304 # 8-byte Folded Reload - fld.d $fs3, $sp, 312 # 8-byte Folded Reload - fld.d $fs2, $sp, 320 # 8-byte Folded Reload - fld.d $fs1, $sp, 328 # 8-byte Folded Reload - fld.d $fs0, $sp, 336 # 8-byte Folded Reload - ld.d $s8, $sp, 344 # 8-byte Folded Reload - ld.d $s7, $sp, 352 # 8-byte Folded Reload - ld.d $s6, $sp, 360 # 8-byte Folded Reload - ld.d $s5, $sp, 368 # 8-byte Folded Reload - ld.d $s4, $sp, 376 # 8-byte Folded Reload - ld.d $s3, $sp, 384 # 8-byte Folded Reload - ld.d $s2, $sp, 392 # 8-byte Folded Reload - ld.d $s1, $sp, 400 # 8-byte Folded Reload - ld.d $s0, $sp, 408 # 8-byte Folded Reload - ld.d $fp, $sp, 416 # 8-byte Folded Reload - ld.d $ra, $sp, 424 # 8-byte Folded Reload - addi.d $sp, $sp, 432 +.LBB0_24: + fld.d $fs7, $sp, 312 # 8-byte Folded Reload + fld.d $fs6, $sp, 320 # 8-byte Folded Reload + fld.d $fs5, $sp, 328 # 8-byte Folded Reload + fld.d $fs4, $sp, 336 # 8-byte Folded Reload + fld.d $fs3, $sp, 344 # 8-byte Folded Reload + fld.d $fs2, $sp, 352 # 8-byte Folded Reload + fld.d $fs1, $sp, 360 # 8-byte Folded Reload + fld.d $fs0, $sp, 368 # 8-byte Folded Reload + ld.d $s8, $sp, 376 # 8-byte Folded Reload + ld.d $s7, $sp, 384 # 8-byte Folded Reload + ld.d $s6, $sp, 392 # 8-byte Folded Reload + ld.d $s5, $sp, 400 # 8-byte Folded Reload + ld.d $s4, $sp, 408 # 8-byte Folded Reload + ld.d $s3, $sp, 416 # 8-byte Folded Reload + ld.d $s2, $sp, 424 # 8-byte Folded Reload + ld.d $s1, $sp, 432 # 8-byte Folded Reload + ld.d $s0, $sp, 440 # 8-byte Folded Reload + ld.d $fp, $sp, 448 # 8-byte Folded Reload + ld.d $ra, $sp, 456 # 8-byte Folded Reload + addi.d $sp, $sp, 464 ret -.LBB0_26: - fmul.d $fa3, $fs2, $fa3 - fmul.d $fs7, $fs2, $fa1 +.LBB0_25: + vreplvei.d $vr4, $vr9, 0 + vst $vr4, $sp, 48 # 16-byte Folded Spill + vreplvei.d $vr4, $vr9, 1 + vst $vr4, $sp, 64 # 16-byte Folded Spill + fmul.d $fs4, $fs0, $fa3 + fmul.d $fs5, $fs0, $fa1 fsqrt.d $fa1, $fa0 fcmp.cor.d $fcc0, $fa1, $fa1 - fmul.d $fs1, $fs2, $fa2 - fst.d $fa7, $sp, 80 # 8-byte Folded Spill - fst.d $ft0, $sp, 72 # 8-byte Folded Spill - fst.d $ft1, $sp, 64 # 8-byte Folded Spill - bceqz $fcc0, .LBB0_36 -.LBB0_27: # %.split326 - fmadd.d $fa0, $fs0, $fs2, $fa1 - fnmadd.d $fa1, $fs0, $fs2, $fa1 + fmul.d $fs7, $fs0, $fa2 + fst.d $ft0, $sp, 112 # 8-byte Folded Spill + bceqz $fcc0, .LBB0_35 +.LBB0_26: # %.split329 + fmadd.d $fa0, $fs2, $fs0, $fa1 + fnmadd.d $fa1, $fs2, $fs0, $fa1 fsel $fa0, $fa1, $fa0, $fcc1 - fmul.d $fa1, $fs3, $fa0 - fmul.d $fa2, $fs4, $fa0 - fmul.d $fa0, $fs5, $fa0 - fsub.d $fa1, $fa3, $fa1 - fsub.d $fa2, $fs7, $fa2 - fsub.d $fa0, $fs1, $fa0 + fmul.d $fa1, $ft2, $fa0 + fmul.d $fa2, $ft3, $fa0 + fmul.d $fa0, $fs1, $fa0 + fsub.d $fa1, $fs4, $fa1 + fsub.d $fa2, $fs5, $fa2 + fsub.d $fa0, $fs7, $fa0 fmul.d $fa3, $fa2, $fa2 fmadd.d $fa3, $fa1, $fa1, $fa3 fmadd.d $fa3, $fa0, $fa0, $fa3 frsqrt.d $fa3, $fa3 - fmul.d $fs2, $fa1, $fa3 - fmul.d $fs1, $fa2, $fa3 + fmul.d $fs4, $fa1, $fa3 + fmul.d $fs2, $fa2, $fa3 fmul.d $fs0, $fa0, $fa3 - fmul.d $fa0, $fs4, $fs1 - fmadd.d $fa0, $fs2, $fs3, $fa0 - fmadd.d $fa0, $fs0, $fs5, $fa0 + fmul.d $fa0, $ft3, $fs2 + fmadd.d $fa0, $fs4, $ft2, $fa0 + fmadd.d $fa0, $fs0, $fs1, $fa0 pcalau12i $a0, %pc_hi20(.LCPI0_5) fld.d $fa1, $a0, %pc_lo12(.LCPI0_5) fsel $fa0, $fa0, $fs6, $fcc1 @@ -498,137 +504,138 @@ _Z8radianceRK3RayiPt: # @_Z8radianceRK3RayiPt fmadd.d $fa2, $fa1, $fa0, $fa2 ori $a0, $zero, 2 fsub.d $fa0, $fa3, $fa2 - vst $vr0, $sp, 32 # 16-byte Folded Spill - vst $vr2, $sp, 48 # 16-byte Folded Spill - blt $a2, $a0, .LBB0_30 -# %bb.28: + vst $vr0, $sp, 80 # 16-byte Folded Spill + vst $vr2, $sp, 96 # 16-byte Folded Spill + blt $a2, $a0, .LBB0_29 +# %bb.27: vldi $vr0, -944 vldi $vr1, -928 - fmadd.d $fs3, $fa2, $fa1, $fa0 + fmadd.d $fs1, $fa2, $fa1, $fa0 move $s1, $a3 move $a0, $a3 pcaddu18i $ra, %call36(erand48) jirl $ra, $ra, 0 - fcmp.cule.d $fcc0, $fs3, $fa0 - bcnez $fcc0, .LBB0_31 -# %bb.29: - vld $vr0, $sp, 48 # 16-byte Folded Reload - fdiv.d $fa0, $fa0, $fs3 - vst $vr0, $sp, 80 # 16-byte Folded Spill - addi.d $a0, $sp, 160 - addi.d $a1, $sp, 232 + fcmp.cule.d $fcc0, $fs1, $fa0 + bcnez $fcc0, .LBB0_30 +# %bb.28: + vld $vr0, $sp, 96 # 16-byte Folded Reload + fdiv.d $fa0, $fa0, $fs1 + vst $vr0, $sp, 112 # 16-byte Folded Spill + addi.d $a0, $sp, 176 + addi.d $a1, $sp, 256 move $a2, $s0 move $a3, $s1 pcaddu18i $ra, %call36(_Z8radianceRK3RayiPt) jirl $ra, $ra, 0 - vld $vr0, $sp, 160 - fld.d $fa1, $sp, 176 - vld $vr3, $sp, 80 # 16-byte Folded Reload - b .LBB0_32 -.LBB0_30: - addi.d $a0, $sp, 208 - addi.d $a1, $sp, 232 + vld $vr0, $sp, 176 + fld.d $fa1, $sp, 192 + vld $vr3, $sp, 112 # 16-byte Folded Reload + b .LBB0_31 +.LBB0_29: + addi.d $a0, $sp, 224 + addi.d $a1, $sp, 256 move $a2, $s0 move $s1, $a3 pcaddu18i $ra, %call36(_Z8radianceRK3RayiPt) jirl $ra, $ra, 0 - fld.d $fa0, $sp, 224 - vld $vr1, $sp, 48 # 16-byte Folded Reload - fmul.d $fs3, $fa1, $fa0 - fld.d $fa0, $sp, 64 # 8-byte Folded Reload - fst.d $fa0, $sp, 160 - fld.d $fa0, $sp, 72 # 8-byte Folded Reload - fst.d $fa0, $sp, 168 - fld.d $fa0, $sp, 80 # 8-byte Folded Reload + fld.d $fa0, $sp, 240 + vld $vr1, $sp, 96 # 16-byte Folded Reload + fmul.d $fs1, $fa1, $fa0 + vld $vr0, $sp, 48 # 16-byte Folded Reload fst.d $fa0, $sp, 176 - fst.d $fs2, $sp, 184 - fst.d $fs1, $sp, 192 - fst.d $fs0, $sp, 200 - addi.d $a0, $sp, 128 - addi.d $a1, $sp, 160 + vld $vr0, $sp, 64 # 16-byte Folded Reload + fst.d $fa0, $sp, 184 + fld.d $fa0, $sp, 112 # 8-byte Folded Reload + fst.d $fa0, $sp, 192 + fst.d $fs4, $sp, 200 + fst.d $fs2, $sp, 208 + fst.d $fs0, $sp, 216 + addi.d $a0, $sp, 144 + addi.d $a1, $sp, 176 move $a2, $s0 move $a3, $s1 pcaddu18i $ra, %call36(_Z8radianceRK3RayiPt) jirl $ra, $ra, 0 - vld $vr0, $sp, 208 - vld $vr1, $sp, 48 # 16-byte Folded Reload + vld $vr0, $sp, 224 + vld $vr1, $sp, 96 # 16-byte Folded Reload vreplvei.d $vr1, $vr1, 0 - vld $vr2, $sp, 128 - fld.d $fa3, $sp, 144 + vld $vr2, $sp, 144 + fld.d $fa3, $sp, 160 vfmul.d $vr0, $vr1, $vr0 - vld $vr4, $sp, 32 # 16-byte Folded Reload + vld $vr4, $sp, 80 # 16-byte Folded Reload vreplvei.d $vr1, $vr4, 0 vfmul.d $vr1, $vr1, $vr2 fmul.d $fa2, $fa4, $fa3 vfadd.d $vr0, $vr0, $vr1 - fadd.d $fa1, $fs3, $fa2 - b .LBB0_33 -.LBB0_31: + fadd.d $fa1, $fs1, $fa2 + b .LBB0_32 +.LBB0_30: vldi $vr0, -912 - fsub.d $fa0, $fa0, $fs3 - vld $vr1, $sp, 32 # 16-byte Folded Reload + fsub.d $fa0, $fa0, $fs1 + vld $vr1, $sp, 80 # 16-byte Folded Reload fdiv.d $fa0, $fa1, $fa0 - vst $vr0, $sp, 48 # 16-byte Folded Spill - fld.d $fa0, $sp, 64 # 8-byte Folded Reload - fst.d $fa0, $sp, 160 - fld.d $fa0, $sp, 72 # 8-byte Folded Reload - fst.d $fa0, $sp, 168 - fld.d $fa0, $sp, 80 # 8-byte Folded Reload + vst $vr0, $sp, 96 # 16-byte Folded Spill + vld $vr0, $sp, 48 # 16-byte Folded Reload fst.d $fa0, $sp, 176 - fst.d $fs2, $sp, 184 - fst.d $fs1, $sp, 192 - fst.d $fs0, $sp, 200 - addi.d $a0, $sp, 208 - addi.d $a1, $sp, 160 + vld $vr0, $sp, 64 # 16-byte Folded Reload + fst.d $fa0, $sp, 184 + fld.d $fa0, $sp, 112 # 8-byte Folded Reload + fst.d $fa0, $sp, 192 + fst.d $fs4, $sp, 200 + fst.d $fs2, $sp, 208 + fst.d $fs0, $sp, 216 + addi.d $a0, $sp, 224 + addi.d $a1, $sp, 176 move $a2, $s0 move $a3, $s1 pcaddu18i $ra, %call36(_Z8radianceRK3RayiPt) jirl $ra, $ra, 0 - vld $vr0, $sp, 208 - fld.d $fa1, $sp, 224 - vld $vr3, $sp, 48 # 16-byte Folded Reload -.LBB0_32: + vld $vr0, $sp, 224 + fld.d $fa1, $sp, 240 + vld $vr3, $sp, 96 # 16-byte Folded Reload +.LBB0_31: vreplvei.d $vr2, $vr3, 0 vfmul.d $vr0, $vr2, $vr0 fmul.d $fa1, $fa3, $fa1 -.LBB0_33: +.LBB0_32: fld.d $fa2, $s3, 48 vld $vr3, $s3, 32 - fld.d $fa4, $sp, 104 # 8-byte Folded Reload - fmul.d $fa1, $fa4, $fa1 + fmul.d $fa1, $fs3, $fa1 fadd.d $fa1, $fa1, $fa2 - vld $vr2, $sp, 112 # 16-byte Folded Reload + vld $vr2, $sp, 128 # 16-byte Folded Reload vfmul.d $vr0, $vr2, $vr0 vfadd.d $vr0, $vr0, $vr3 vst $vr0, $fp, 0 fst.d $fa1, $fp, 16 - b .LBB0_25 -.LBB0_34: # %call.sqrt324 - fld.d $fa0, $sp, 48 # 8-byte Folded Reload + b .LBB0_24 +.LBB0_33: # %call.sqrt327 + fld.d $fa0, $sp, 96 # 8-byte Folded Reload pcaddu18i $ra, %call36(sqrt) jirl $ra, $ra, 0 fmov.d $fs2, $fa0 b .LBB0_18 -.LBB0_35: # %call.sqrt325 +.LBB0_34: # %call.sqrt328 fmov.d $fa0, $fa1 pcaddu18i $ra, %call36(sqrt) jirl $ra, $ra, 0 b .LBB0_19 -.LBB0_36: # %call.sqrt327 +.LBB0_35: # %call.sqrt330 move $s1, $a3 move $s2, $a2 + vst $vr10, $sp, 80 # 16-byte Folded Spill + vst $vr11, $sp, 96 # 16-byte Folded Spill movcf2gr $a0, $fcc1 - st.d $a0, $sp, 48 - fst.d $fa3, $sp, 32 # 8-byte Folded Spill + st.d $a0, $sp, 32 pcaddu18i $ra, %call36(sqrt) jirl $ra, $ra, 0 - fld.d $fa3, $sp, 32 # 8-byte Folded Reload - ld.d $a0, $sp, 48 + ld.d $a0, $sp, 32 movgr2cf $fcc1, $a0 + vld $vr11, $sp, 96 # 16-byte Folded Reload + vld $vr10, $sp, 80 # 16-byte Folded Reload move $a2, $s2 move $a3, $s1 fmov.d $fa1, $fa0 - b .LBB0_27 + b .LBB0_26 .Lfunc_end0: .size _Z8radianceRK3RayiPt, .Lfunc_end0-_Z8radianceRK3RayiPt # -- End function @@ -914,7 +921,7 @@ main: # @main fsqrt.d $fa0, $fa1 fcmp.cor.d $fcc0, $fa0, $fa0 bceqz $fcc0, .LBB1_23 -.LBB1_13: # %.split158 +.LBB1_13: # %.split159 # in Loop: Header=BB1_15 Depth=5 fsub.d $fa0, $fa2, $fa0 .LBB1_14: # in Loop: Header=BB1_15 Depth=5 @@ -954,19 +961,18 @@ main: # @main fmul.d $fa3, $fa1, $fa3 vfadd.d $vr2, $vr2, $vr4 fadd.d $fa3, $fa3, $fa5 - vreplvei.d $vr4, $vr0, 1 - fmul.d $fa5, $fa4, $fa4 - vreplvei.d $vr0, $vr0, 0 - fmadd.d $fa5, $fa0, $fa0, $fa5 - fmadd.d $fa5, $fa1, $fa1, $fa5 - frsqrt.d $fa5, $fa5 - fmul.d $fa0, $fa0, $fa5 - fmul.d $fa4, $fa4, $fa5 - fmul.d $fa1, $fa1, $fa5 + vfmul.d $vr4, $vr0, $vr0 + vreplvei.d $vr4, $vr4, 1 + vreplvei.d $vr5, $vr0, 0 + fmadd.d $fa4, $fa5, $fa5, $fa4 + fmadd.d $fa4, $fa1, $fa1, $fa4 + frsqrt.d $fa4, $fa4 + fmul.d $fa1, $fa1, $fa4 vst $vr2, $sp, 224 fst.d $fa3, $sp, 240 - fst.d $fa0, $sp, 248 - fst.d $fa4, $sp, 256 + vreplvei.d $vr2, $vr4, 0 + vfmul.d $vr0, $vr0, $vr2 + vst $vr0, $sp, 248 fst.d $fa1, $sp, 264 addi.d $a0, $sp, 272 addi.d $a1, $sp, 224 @@ -1015,7 +1021,7 @@ main: # @main fsqrt.d $fa0, $fa1 fcmp.cor.d $fcc0, $fa0, $fa0 bceqz $fcc0, .LBB1_25 -.LBB1_19: # %.split154 +.LBB1_19: # %.split155 # in Loop: Header=BB1_15 Depth=5 fsub.d $fs0, $fa2, $fa0 .LBB1_20: # in Loop: Header=BB1_15 Depth=5 @@ -1031,12 +1037,12 @@ main: # @main fsqrt.d $fa1, $fa0 fcmp.cor.d $fcc0, $fa1, $fa1 bceqz $fcc0, .LBB1_26 -.LBB1_22: # %.split156 +.LBB1_22: # %.split157 # in Loop: Header=BB1_15 Depth=5 vldi $vr0, -784 fadd.d $fa0, $fa1, $fa0 b .LBB1_14 -.LBB1_23: # %call.sqrt159 +.LBB1_23: # %call.sqrt160 # in Loop: Header=BB1_15 Depth=5 fmov.d $fa0, $fa1 pcaddu18i $ra, %call36(sqrt) @@ -1049,14 +1055,14 @@ main: # @main jirl $ra, $ra, 0 fmov.d $fa1, $fa0 b .LBB1_17 -.LBB1_25: # %call.sqrt155 +.LBB1_25: # %call.sqrt156 # in Loop: Header=BB1_15 Depth=5 fmov.d $fa0, $fa1 pcaddu18i $ra, %call36(sqrt) jirl $ra, $ra, 0 vldi $vr2, -912 b .LBB1_19 -.LBB1_26: # %call.sqrt157 +.LBB1_26: # %call.sqrt158 # in Loop: Header=BB1_15 Depth=5 pcaddu18i $ra, %call36(sqrt) jirl $ra, $ra, 0 diff --git a/results/SingleSource/Regression/C/gcc-c-torture/execute/CMakeFiles/GCC-C-execute-20050826-2.dir/20050826-2.s b/results/SingleSource/Regression/C/gcc-c-torture/execute/CMakeFiles/GCC-C-execute-20050826-2.dir/20050826-2.s index 1a2b7efa..52c99466 100644 --- a/results/SingleSource/Regression/C/gcc-c-torture/execute/CMakeFiles/GCC-C-execute-20050826-2.dir/20050826-2.s +++ b/results/SingleSource/Regression/C/gcc-c-torture/execute/CMakeFiles/GCC-C-execute-20050826-2.dir/20050826-2.s @@ -38,7 +38,19 @@ inet_check_attr: # @inet_check_attr .Lfunc_end0: .size inet_check_attr, .Lfunc_end0-inet_check_attr # -- End function - .globl main # -- Begin function main + .section .rodata.cst16,"aM",@progbits,16 + .p2align 4, 0x0 # -- Begin function main +.LCPI1_0: + .half 0 # 0x0 + .half 1 # 0x1 + .half 2 # 0x2 + .half 3 # 0x3 + .half 8 # 0x8 + .half 10 # 0xa + .half 12 # 0xc + .half 14 # 0xe + .text + .globl main .p2align 5 .type main,@function main: # @main @@ -48,7 +60,7 @@ main: # @main st.d $fp, $sp, 176 # 8-byte Folded Spill st.d $s0, $sp, 168 # 8-byte Folded Spill st.d $s1, $sp, 160 # 8-byte Folded Spill - ori $s0, $zero, 12 + ori $fp, $zero, 12 ori $a0, $zero, 12 lu32i.d $a0, 12 st.d $a0, $sp, 152 @@ -70,70 +82,62 @@ main: # @main addi.d $a1, $sp, 40 pcaddu18i $ra, %call36(inet_check_attr) jirl $ra, $ra, 0 - bnez $a0, .LBB1_20 + bnez $a0, .LBB1_13 # %bb.1: # %.preheader30.preheader - addi.d $fp, $sp, 156 - vld $vr0, $sp, 40 - vld $vr1, $sp, 56 - vld $vr2, $sp, 72 - vld $vr3, $sp, 88 - vinsgr2vr.d $vr4, $fp, 0 + addi.d $s0, $sp, 156 + vld $vr0, $sp, 72 + vld $vr1, $sp, 88 + vld $vr2, $sp, 40 + vld $vr3, $sp, 56 + vinsgr2vr.d $vr4, $s0, 0 vinsgr2vr.d $vr4, $s1, 1 - vreplgr2vr.d $vr5, $fp + vreplgr2vr.d $vr5, $s0 vseq.d $vr0, $vr0, $vr5 - vrepli.b $vr6, -1 - vxor.v $vr0, $vr0, $vr6 - vseq.d $vr1, $vr1, $vr5 - vxor.v $vr1, $vr1, $vr6 - vpickev.w $vr1, $vr1, $vr0 - vpickve2gr.h $a0, $vr1, 2 - andi $a0, $a0, 1 - vpickve2gr.h $a1, $vr0, 0 - bstrins.d $a1, $a0, 63, 1 - vpickve2gr.h $a0, $vr1, 4 - bstrins.d $a1, $a0, 2, 2 - vpickve2gr.h $a0, $vr1, 6 - bstrins.d $a1, $a0, 3, 3 - vseq.d $vr0, $vr2, $vr5 - vxor.v $vr0, $vr0, $vr6 - vseq.d $vr1, $vr3, $vr4 - vst $vr6, $sp, 16 # 16-byte Folded Spill - vxor.v $vr1, $vr1, $vr6 + vseq.d $vr1, $vr1, $vr4 vpickev.w $vr0, $vr1, $vr0 - vpickve2gr.h $a0, $vr0, 0 - bstrins.d $a1, $a0, 4, 4 - vpickve2gr.h $a0, $vr0, 2 - bstrins.d $a1, $a0, 5, 5 - vpickve2gr.h $a0, $vr0, 4 - andi $a0, $a0, 1 - slli.d $a0, $a0, 6 - or $a0, $a1, $a0 - vpickve2gr.h $a1, $vr0, 6 - slli.d $a1, $a1, 7 - or $a0, $a0, $a1 - andi $a0, $a0, 255 - bnez $a0, .LBB1_20 + vseq.d $vr1, $vr2, $vr5 + vseq.d $vr2, $vr3, $vr5 + vpickev.w $vr1, $vr2, $vr1 + vpickve2gr.h $a0, $vr1, 6 + vpickve2gr.h $a1, $vr1, 4 + vld $vr2, $sp, 112 + vld $vr3, $sp, 128 + vpickve2gr.h $a2, $vr1, 2 + vpickve2gr.h $a3, $vr1, 0 + vseq.d $vr1, $vr2, $vr5 + vst $vr5, $sp, 16 # 16-byte Folded Spill + vseq.d $vr2, $vr3, $vr5 + vpickev.w $vr1, $vr2, $vr1 + vinsgr2vr.w $vr2, $a3, 0 + vinsgr2vr.w $vr2, $a2, 1 + vinsgr2vr.w $vr2, $a1, 2 + vinsgr2vr.w $vr2, $a0, 3 + vand.v $vr1, $vr2, $vr1 + vpickve2gr.w $a0, $vr1, 0 + vinsgr2vr.h $vr2, $a0, 0 + vpickve2gr.w $a0, $vr1, 1 + vinsgr2vr.h $vr2, $a0, 1 + vpickve2gr.w $a0, $vr1, 2 + pcalau12i $a1, %pc_hi20(.LCPI1_0) + vld $vr3, $a1, %pc_lo12(.LCPI1_0) + vinsgr2vr.h $vr2, $a0, 2 + vpickve2gr.w $a0, $vr1, 3 + vinsgr2vr.h $vr2, $a0, 3 + vshuf.h $vr3, $vr0, $vr2 + vslli.h $vr0, $vr3, 15 + vmskltz.h $vr0, $vr0 + vpickve2gr.hu $a0, $vr0, 0 + ori $a1, $zero, 255 + bne $a0, $a1, .LBB1_13 # %bb.2: # %.preheader30.preheader ld.d $a0, $sp, 104 xor $a0, $a0, $s1 sltui $a0, $a0, 1 - beqz $a0, .LBB1_20 + beqz $a0, .LBB1_13 # %bb.3: # %.preheader30.preheader - ld.d $a0, $sp, 112 - bne $a0, $fp, .LBB1_20 -# %bb.4: # %.preheader30.preheader - ld.d $a0, $sp, 120 - bne $a0, $fp, .LBB1_20 -# %bb.5: # %.preheader30.preheader - ld.d $a0, $sp, 128 - bne $a0, $fp, .LBB1_20 -# %bb.6: # %.preheader30.preheader - ld.d $a0, $sp, 136 - bne $a0, $fp, .LBB1_20 -# %bb.7: # %.preheader30.preheader ld.d $a0, $sp, 144 - bne $a0, $fp, .LBB1_20 -# %bb.8: # %.preheader29.preheader + bne $a0, $s0, .LBB1_13 +# %bb.4: # %.preheader29.preheader addi.d $a0, $sp, 152 st.d $a0, $sp, 40 st.d $a0, $sp, 56 @@ -148,57 +152,57 @@ main: # @main st.d $a0, $sp, 136 st.d $a0, $sp, 144 st.d $zero, $sp, 48 - addi.d $a0, $s0, -8 + addi.d $a0, $fp, -8 st.h $a0, $sp, 156 - st.d $fp, $sp, 80 + st.d $s0, $sp, 80 addi.d $a1, $sp, 40 pcaddu18i $ra, %call36(inet_check_attr) jirl $ra, $ra, 0 addi.w $a1, $zero, -22 - bne $a0, $a1, .LBB1_20 -# %bb.9: + bne $a0, $a1, .LBB1_13 +# %bb.5: ld.d $a0, $sp, 40 - vld $vr2, $sp, 16 # 16-byte Folded Reload - bne $a0, $fp, .LBB1_20 -# %bb.10: # %.peel.next + bne $a0, $s0, .LBB1_13 +# %bb.6: # %.peel.next ld.d $a0, $sp, 48 - bnez $a0, .LBB1_20 -# %bb.11: # %.peel.next35 - ld.d $a0, $sp, 56 - bne $a0, $fp, .LBB1_20 -# %bb.12: # %.peel.next35 - ld.d $a0, $sp, 64 - bne $a0, $fp, .LBB1_20 -# %bb.13: # %.peel.next35 - ld.d $a0, $sp, 72 - bne $a0, $fp, .LBB1_20 -# %bb.14: # %.peel.next35 - ld.d $a0, $sp, 80 - bne $a0, $fp, .LBB1_20 -# %bb.15: # %vector.ph + bnez $a0, .LBB1_13 +# %bb.7: # %.peel.next35 + vld $vr0, $sp, 56 + vld $vr1, $sp, 72 + vld $vr3, $sp, 16 # 16-byte Folded Reload + vseq.d $vr2, $vr0, $vr3 + vrepli.b $vr0, -1 + vxor.v $vr2, $vr2, $vr0 + vseq.d $vr1, $vr1, $vr3 + vxor.v $vr1, $vr1, $vr0 + vpickev.w $vr1, $vr1, $vr2 + vmskltz.w $vr1, $vr1 + vpickve2gr.hu $a0, $vr1, 0 + bnez $a0, .LBB1_13 +# %bb.8: # %vector.ph move $a3, $zero addi.d $a0, $sp, 152 - vreplgr2vr.d $vr0, $a0 + vreplgr2vr.d $vr1, $a0 addi.d $a0, $sp, 88 ori $a1, $zero, 48 .p2align 4, , 16 -.LBB1_16: # %vector.body +.LBB1_9: # %vector.body # =>This Inner Loop Header: Depth=1 - vldx $vr1, $a0, $a3 - vseq.d $vr1, $vr1, $vr0 - vxor.v $vr1, $vr1, $vr2 - vmskltz.d $vr1, $vr1 - vpickve2gr.hu $a4, $vr1, 0 - bnez $a4, .LBB1_18 -# %bb.17: # %vector.body - # in Loop: Header=BB1_16 Depth=1 + vldx $vr2, $a0, $a3 + vseq.d $vr2, $vr2, $vr1 + vxor.v $vr2, $vr2, $vr0 + vmskltz.d $vr2, $vr2 + vpickve2gr.hu $a4, $vr2, 0 + bnez $a4, .LBB1_11 +# %bb.10: # %vector.body + # in Loop: Header=BB1_9 Depth=1 move $a2, $a3 addi.d $a3, $a3, 16 - bne $a2, $a1, .LBB1_16 -.LBB1_18: # %middle.split + bne $a2, $a1, .LBB1_9 +.LBB1_11: # %middle.split andi $a0, $a4, 3 - bnez $a0, .LBB1_20 -# %bb.19: # %.loopexit78 + bnez $a0, .LBB1_13 +# %bb.12: # %.loopexit78 move $a0, $zero ld.d $s1, $sp, 160 # 8-byte Folded Reload ld.d $s0, $sp, 168 # 8-byte Folded Reload @@ -206,7 +210,7 @@ main: # @main ld.d $ra, $sp, 184 # 8-byte Folded Reload addi.d $sp, $sp, 192 ret -.LBB1_20: +.LBB1_13: pcaddu18i $ra, %call36(abort) jirl $ra, $ra, 0 .Lfunc_end1: diff --git a/results/SingleSource/Regression/C/gcc-c-torture/execute/CMakeFiles/GCC-C-execute-980526-2.dir/980526-2.s b/results/SingleSource/Regression/C/gcc-c-torture/execute/CMakeFiles/GCC-C-execute-980526-2.dir/980526-2.s index 026e1c07..03721957 100644 --- a/results/SingleSource/Regression/C/gcc-c-torture/execute/CMakeFiles/GCC-C-execute-980526-2.dir/980526-2.s +++ b/results/SingleSource/Regression/C/gcc-c-torture/execute/CMakeFiles/GCC-C-execute-980526-2.dir/980526-2.s @@ -20,37 +20,48 @@ do_mknod: # @do_mknod .Lfunc_end0: .size do_mknod, .Lfunc_end0-do_mknod # -- End function - .globl getname # -- Begin function getname + .section .rodata.cst16,"aM",@progbits,16 + .p2align 4, 0x0 # -- Begin function getname +.LCPI1_0: + .dword 9 # 0x9 + .dword 11 # 0xb +.LCPI1_1: + .dword 10 # 0xa + .dword 12 # 0xc + .text + .globl getname .p2align 5 .type getname,@function getname: # @getname # %bb.0: alsl.d $a1, $a0, $a0, 2 - addi.d $a2, $a1, 1 - slli.d $a3, $a0, 2 - alsl.d $a3, $a0, $a3, 1 - addi.d $a3, $a3, 2 - slli.d $a4, $a0, 3 - sub.d $a5, $a4, $a0 - addi.d $a5, $a5, 3 - addi.d $a6, $a4, 4 - alsl.d $a7, $a0, $a0, 3 - addi.d $a7, $a7, 5 - alsl.d $t0, $a0, $a4, 1 - addi.d $t0, $t0, 5 - alsl.d $a1, $a1, $a0, 1 - addi.d $a1, $a1, 5 - alsl.d $a4, $a0, $a4, 2 - addi.d $a4, $a4, 5 - alsl.d $t1, $a0, $a0, 1 - alsl.d $a0, $t1, $a0, 2 - mul.d $a2, $a2, $a3 - mul.d $a3, $a5, $a6 - mul.d $a5, $a7, $t0 - mul.d $a1, $a1, $a4 - add.d $a0, $a0, $a3 - add.d $a0, $a0, $a2 - add.d $a0, $a0, $a5 + addi.d $a1, $a1, 1 + slli.d $a2, $a0, 2 + alsl.d $a2, $a0, $a2, 1 + addi.d $a2, $a2, 2 + slli.d $a3, $a0, 3 + sub.d $a4, $a3, $a0 + addi.d $a4, $a4, 3 + addi.d $a3, $a3, 4 + alsl.d $a5, $a0, $a0, 1 + alsl.d $a5, $a5, $a0, 2 + mul.d $a1, $a1, $a2 + mul.d $a2, $a4, $a3 + pcalau12i $a3, %pc_hi20(.LCPI1_0) + vld $vr0, $a3, %pc_lo12(.LCPI1_0) + pcalau12i $a3, %pc_hi20(.LCPI1_1) + vld $vr1, $a3, %pc_lo12(.LCPI1_1) + vreplgr2vr.d $vr2, $a0 + vmul.d $vr0, $vr2, $vr0 + vaddi.du $vr0, $vr0, 5 + vmul.d $vr1, $vr2, $vr1 + vaddi.du $vr1, $vr1, 5 + vmul.d $vr0, $vr0, $vr1 + add.d $a0, $a5, $a2 + add.d $a0, $a0, $a1 + vpickve2gr.d $a1, $vr0, 0 + add.d $a0, $a0, $a1 + vpickve2gr.d $a1, $vr0, 1 add.d $a0, $a0, $a1 addi.d $a0, $a0, 5 bstrpick.d $a0, $a0, 31, 0 diff --git a/results/SingleSource/Regression/C/gcc-c-torture/execute/CMakeFiles/GCC-C-execute-990513-1.dir/990513-1.s b/results/SingleSource/Regression/C/gcc-c-torture/execute/CMakeFiles/GCC-C-execute-990513-1.dir/990513-1.s index 96753b37..fb28c88e 100644 --- a/results/SingleSource/Regression/C/gcc-c-torture/execute/CMakeFiles/GCC-C-execute-990513-1.dir/990513-1.s +++ b/results/SingleSource/Regression/C/gcc-c-torture/execute/CMakeFiles/GCC-C-execute-990513-1.dir/990513-1.s @@ -5,24 +5,71 @@ .type foo,@function foo: # @foo # %bb.0: # %vector.ph - move $a2, $zero - addi.d $a0, $a0, 1020 - addi.w $a3, $zero, -1024 - .p2align 4, , 16 -.LBB0_1: # %vector.body - # =>This Inner Loop Header: Depth=1 - add.d $a4, $a0, $a2 - stx.w $a1, $a0, $a2 - st.w $a1, $a4, -16 - st.w $a1, $a4, -4 - st.w $a1, $a4, -20 - st.w $a1, $a4, -8 - st.w $a1, $a4, -24 - st.w $a1, $a4, -12 - addi.d $a2, $a2, -32 - st.w $a1, $a4, -28 - bne $a2, $a3, .LBB0_1 -# %bb.2: # %middle.block + vreplgr2vr.w $vr0, $a1 + vst $vr0, $a0, 1008 + vst $vr0, $a0, 992 + vst $vr0, $a0, 976 + vst $vr0, $a0, 960 + vst $vr0, $a0, 944 + vst $vr0, $a0, 928 + vst $vr0, $a0, 912 + vst $vr0, $a0, 896 + vst $vr0, $a0, 880 + vst $vr0, $a0, 864 + vst $vr0, $a0, 848 + vst $vr0, $a0, 832 + vst $vr0, $a0, 816 + vst $vr0, $a0, 800 + vst $vr0, $a0, 784 + vst $vr0, $a0, 768 + vst $vr0, $a0, 752 + vst $vr0, $a0, 736 + vst $vr0, $a0, 720 + vst $vr0, $a0, 704 + vst $vr0, $a0, 688 + vst $vr0, $a0, 672 + vst $vr0, $a0, 656 + vst $vr0, $a0, 640 + vst $vr0, $a0, 624 + vst $vr0, $a0, 608 + vst $vr0, $a0, 592 + vst $vr0, $a0, 576 + vst $vr0, $a0, 560 + vst $vr0, $a0, 544 + vst $vr0, $a0, 528 + vst $vr0, $a0, 512 + vst $vr0, $a0, 496 + vst $vr0, $a0, 480 + vst $vr0, $a0, 464 + vst $vr0, $a0, 448 + vst $vr0, $a0, 432 + vst $vr0, $a0, 416 + vst $vr0, $a0, 400 + vst $vr0, $a0, 384 + vst $vr0, $a0, 368 + vst $vr0, $a0, 352 + vst $vr0, $a0, 336 + vst $vr0, $a0, 320 + vst $vr0, $a0, 304 + vst $vr0, $a0, 288 + vst $vr0, $a0, 272 + vst $vr0, $a0, 256 + vst $vr0, $a0, 240 + vst $vr0, $a0, 224 + vst $vr0, $a0, 208 + vst $vr0, $a0, 192 + vst $vr0, $a0, 176 + vst $vr0, $a0, 160 + vst $vr0, $a0, 144 + vst $vr0, $a0, 128 + vst $vr0, $a0, 112 + vst $vr0, $a0, 96 + vst $vr0, $a0, 80 + vst $vr0, $a0, 64 + vst $vr0, $a0, 48 + vst $vr0, $a0, 32 + vst $vr0, $a0, 16 + vst $vr0, $a0, 0 ret .Lfunc_end0: .size foo, .Lfunc_end0-foo diff --git a/results/SingleSource/Regression/C/gcc-c-torture/execute/CMakeFiles/GCC-C-execute-builtin-bitops-1.dir/builtin-bitops-1.s b/results/SingleSource/Regression/C/gcc-c-torture/execute/CMakeFiles/GCC-C-execute-builtin-bitops-1.dir/builtin-bitops-1.s index 4dbc058f..a7529a78 100644 --- a/results/SingleSource/Regression/C/gcc-c-torture/execute/CMakeFiles/GCC-C-execute-builtin-bitops-1.dir/builtin-bitops-1.s +++ b/results/SingleSource/Regression/C/gcc-c-torture/execute/CMakeFiles/GCC-C-execute-builtin-bitops-1.dir/builtin-bitops-1.s @@ -179,74 +179,96 @@ my_clrsb: # @my_clrsb .Lfunc_end3: .size my_clrsb, .Lfunc_end3-my_clrsb # -- End function - .globl my_popcount # -- Begin function my_popcount + .section .rodata.cst16,"aM",@progbits,16 + .p2align 4, 0x0 # -- Begin function my_popcount +.LCPI4_0: + .word 8 # 0x8 + .word 9 # 0x9 + .word 10 # 0xa + .word 11 # 0xb +.LCPI4_1: + .word 24 # 0x18 + .word 25 # 0x19 + .word 26 # 0x1a + .word 27 # 0x1b +.LCPI4_2: + .word 1 # 0x1 + .word 0 # 0x0 + .word 2 # 0x2 + .word 3 # 0x3 +.LCPI4_3: + .word 16 # 0x10 + .word 17 # 0x11 + .word 18 # 0x12 + .word 19 # 0x13 +.LCPI4_4: + .word 4 # 0x4 + .word 5 # 0x5 + .word 6 # 0x6 + .word 7 # 0x7 +.LCPI4_5: + .word 20 # 0x14 + .word 21 # 0x15 + .word 22 # 0x16 + .word 23 # 0x17 +.LCPI4_6: + .word 12 # 0xc + .word 13 # 0xd + .word 14 # 0xe + .word 15 # 0xf + .text + .globl my_popcount .p2align 5 .type my_popcount,@function my_popcount: # @my_popcount # %bb.0: - andi $a1, $a0, 1 - bstrpick.d $a2, $a0, 1, 1 - add.d $a1, $a2, $a1 - bstrpick.d $a2, $a0, 2, 2 - add.d $a1, $a2, $a1 - bstrpick.d $a2, $a0, 3, 3 - add.d $a1, $a2, $a1 - bstrpick.d $a2, $a0, 4, 4 - add.d $a1, $a2, $a1 - bstrpick.d $a2, $a0, 5, 5 - add.d $a1, $a2, $a1 - bstrpick.d $a2, $a0, 6, 6 - add.d $a1, $a2, $a1 - bstrpick.d $a2, $a0, 7, 7 - add.d $a1, $a2, $a1 - bstrpick.d $a2, $a0, 8, 8 - add.d $a1, $a2, $a1 - bstrpick.d $a2, $a0, 9, 9 - add.d $a1, $a2, $a1 - bstrpick.d $a2, $a0, 10, 10 - add.d $a1, $a2, $a1 - bstrpick.d $a2, $a0, 11, 11 - add.d $a1, $a2, $a1 - bstrpick.d $a2, $a0, 12, 12 - add.d $a1, $a2, $a1 - bstrpick.d $a2, $a0, 13, 13 - add.d $a1, $a2, $a1 - bstrpick.d $a2, $a0, 14, 14 - add.d $a1, $a2, $a1 - bstrpick.d $a2, $a0, 15, 15 - add.d $a1, $a2, $a1 - bstrpick.d $a2, $a0, 16, 16 - add.d $a1, $a2, $a1 - bstrpick.d $a2, $a0, 17, 17 - add.d $a1, $a2, $a1 - bstrpick.d $a2, $a0, 18, 18 - add.d $a1, $a2, $a1 - bstrpick.d $a2, $a0, 19, 19 - add.d $a1, $a2, $a1 - bstrpick.d $a2, $a0, 20, 20 - add.d $a1, $a2, $a1 - bstrpick.d $a2, $a0, 21, 21 - add.d $a1, $a2, $a1 - bstrpick.d $a2, $a0, 22, 22 - add.d $a1, $a2, $a1 - bstrpick.d $a2, $a0, 23, 23 - add.d $a1, $a2, $a1 - bstrpick.d $a2, $a0, 24, 24 - add.d $a1, $a2, $a1 - bstrpick.d $a2, $a0, 25, 25 - add.d $a1, $a2, $a1 - bstrpick.d $a2, $a0, 26, 26 - add.d $a1, $a2, $a1 - bstrpick.d $a2, $a0, 27, 27 - add.d $a1, $a2, $a1 - bstrpick.d $a2, $a0, 28, 28 - add.d $a1, $a2, $a1 + pcalau12i $a1, %pc_hi20(.LCPI4_0) + vld $vr0, $a1, %pc_lo12(.LCPI4_0) + pcalau12i $a1, %pc_hi20(.LCPI4_1) + vld $vr1, $a1, %pc_lo12(.LCPI4_1) + pcalau12i $a1, %pc_hi20(.LCPI4_2) + vld $vr2, $a1, %pc_lo12(.LCPI4_2) + vreplgr2vr.w $vr3, $a0 + vsrl.w $vr0, $vr3, $vr0 + vsrl.w $vr1, $vr3, $vr1 + vsrl.w $vr2, $vr3, $vr2 + pcalau12i $a1, %pc_hi20(.LCPI4_3) + vld $vr4, $a1, %pc_lo12(.LCPI4_3) + pcalau12i $a1, %pc_hi20(.LCPI4_4) + vld $vr5, $a1, %pc_lo12(.LCPI4_4) + pcalau12i $a1, %pc_hi20(.LCPI4_5) + vld $vr6, $a1, %pc_lo12(.LCPI4_5) + pcalau12i $a1, %pc_hi20(.LCPI4_6) + vld $vr7, $a1, %pc_lo12(.LCPI4_6) + vsrl.w $vr4, $vr3, $vr4 + vsrl.w $vr5, $vr3, $vr5 + vsrl.w $vr6, $vr3, $vr6 + vsrl.w $vr3, $vr3, $vr7 + vrepli.w $vr7, 1 + vand.v $vr3, $vr3, $vr7 + vand.v $vr6, $vr6, $vr7 + vand.v $vr5, $vr5, $vr7 + vand.v $vr4, $vr4, $vr7 + vand.v $vr2, $vr2, $vr7 + vand.v $vr1, $vr1, $vr7 + vand.v $vr0, $vr0, $vr7 + bstrpick.d $a1, $a0, 28, 28 bstrpick.d $a2, $a0, 29, 29 - add.d $a1, $a2, $a1 - bstrpick.d $a2, $a0, 30, 30 - add.d $a1, $a2, $a1 + bstrpick.d $a3, $a0, 30, 30 bstrpick.d $a0, $a0, 31, 31 - add.w $a0, $a0, $a1 + vadd.w $vr0, $vr0, $vr1 + vadd.w $vr1, $vr2, $vr4 + vadd.w $vr0, $vr1, $vr0 + vadd.w $vr1, $vr5, $vr6 + vadd.w $vr1, $vr1, $vr3 + vadd.w $vr0, $vr0, $vr1 + vhaddw.d.w $vr0, $vr0, $vr0 + vhaddw.q.d $vr0, $vr0, $vr0 + vpickve2gr.d $a4, $vr0, 0 + add.d $a1, $a4, $a1 + add.d $a2, $a2, $a3 + add.d $a1, $a1, $a2 + add.w $a0, $a1, $a0 ret .Lfunc_end4: .size my_popcount, .Lfunc_end4-my_popcount diff --git a/results/SingleSource/Regression/C/gcc-c-torture/execute/CMakeFiles/GCC-C-execute-loop-13.dir/loop-13.s b/results/SingleSource/Regression/C/gcc-c-torture/execute/CMakeFiles/GCC-C-execute-loop-13.dir/loop-13.s index f8978561..79f21569 100644 --- a/results/SingleSource/Regression/C/gcc-c-torture/execute/CMakeFiles/GCC-C-execute-loop-13.dir/loop-13.s +++ b/results/SingleSource/Regression/C/gcc-c-torture/execute/CMakeFiles/GCC-C-execute-loop-13.dir/loop-13.s @@ -7,25 +7,21 @@ scale: # @scale # %bb.0: ld.d $a3, $a0, 0 ori $a4, $zero, 1 - beq $a3, $a4, .LBB0_4 + beq $a3, $a4, .LBB0_3 # %bb.1: - blez $a2, .LBB0_4 -# %bb.2: # %.lr.ph.preheader - addi.d $a1, $a1, 8 + blez $a2, .LBB0_3 .p2align 4, , 16 -.LBB0_3: # %.lr.ph +.LBB0_2: # %.lr.ph # =>This Inner Loop Header: Depth=1 ld.d $a3, $a0, 0 - ld.d $a4, $a1, -8 - ld.d $a5, $a1, 0 - mul.d $a4, $a4, $a3 - mul.d $a3, $a5, $a3 - st.d $a4, $a1, -8 - st.d $a3, $a1, 0 + vld $vr0, $a1, 0 + vreplgr2vr.d $vr1, $a3 + vmul.d $vr0, $vr0, $vr1 + vst $vr0, $a1, 0 addi.w $a2, $a2, -1 addi.d $a1, $a1, 16 - bnez $a2, .LBB0_3 -.LBB0_4: # %.loopexit + bnez $a2, .LBB0_2 +.LBB0_3: # %.loopexit ret .Lfunc_end0: .size scale, .Lfunc_end0-scale diff --git a/results/SingleSource/UnitTests/CMakeFiles/matrix-types-spec.dir/matrix-types-spec.s b/results/SingleSource/UnitTests/CMakeFiles/matrix-types-spec.dir/matrix-types-spec.s index 4c22c915..a62322ab 100644 --- a/results/SingleSource/UnitTests/CMakeFiles/matrix-types-spec.dir/matrix-types-spec.s +++ b/results/SingleSource/UnitTests/CMakeFiles/matrix-types-spec.dir/matrix-types-spec.s @@ -26871,8 +26871,6 @@ _Z12testMultiplyIdLj3ELj3ELj3EEvv: # @_Z12testMultiplyIdLj3ELj3ELj3EEvv fst.d $fs0, $sp, 408 # 8-byte Folded Spill fst.d $fs1, $sp, 400 # 8-byte Folded Spill fst.d $fs2, $sp, 392 # 8-byte Folded Spill - fst.d $fs3, $sp, 384 # 8-byte Folded Spill - fst.d $fs4, $sp, 376 # 8-byte Folded Spill .cfi_offset 1, -8 .cfi_offset 22, -16 .cfi_offset 23, -24 @@ -26886,8 +26884,6 @@ _Z12testMultiplyIdLj3ELj3ELj3EEvv: # @_Z12testMultiplyIdLj3ELj3ELj3EEvv .cfi_offset 56, -88 .cfi_offset 57, -96 .cfi_offset 58, -104 - .cfi_offset 59, -112 - .cfi_offset 60, -120 lu12i.w $a0, -128 lu32i.d $a0, -131073 lu52i.d $a1, $a0, 1025 @@ -26930,7 +26926,7 @@ _Z12testMultiplyIdLj3ELj3ELj3EEvv: # @_Z12testMultiplyIdLj3ELj3ELj3EEvv vldi $vr4, -912 vldi $vr5, -860 vldi $vr6, -972 - addi.d $s6, $sp, 304 + addi.d $s6, $sp, 320 ori $s7, $zero, 9 .p2align 4, , 16 .LBB7_1: # =>This Loop Header: Depth=1 @@ -26993,7 +26989,7 @@ _Z12testMultiplyIdLj3ELj3ELj3EEvv: # @_Z12testMultiplyIdLj3ELj3ELj3EEvv vldi $vr4, -912 vldi $vr5, -860 vldi $vr6, -972 - addi.d $s5, $sp, 232 + addi.d $s5, $sp, 248 ori $s6, $zero, 9 .p2align 4, , 16 .LBB7_7: # %_Z10initRandomIdTnNSt9enable_ifIXsr3std17is_floating_pointIT_EE5valueEiE4typeELi0EEvPS1_jj.exit @@ -27049,195 +27045,177 @@ _Z12testMultiplyIdLj3ELj3ELj3EEvv: # @_Z12testMultiplyIdLj3ELj3ELj3EEvv vldi $vr4, -912 b .LBB7_10 .LBB7_12: # %.preheader.i.preheader - st.d $zero, $sp, 224 + st.d $zero, $sp, 240 vrepli.b $vr0, 0 + vst $vr0, $sp, 16 # 16-byte Folded Spill + vst $vr0, $sp, 224 vst $vr0, $sp, 208 vst $vr0, $sp, 192 - vst $vr0, $sp, 176 - addi.d $a0, $sp, 88 + addi.d $a0, $sp, 104 ori $a2, $zero, 72 move $a1, $zero pcaddu18i $ra, %call36(memset) jirl $ra, $ra, 0 - addi.d $a0, $sp, 16 + addi.d $a0, $sp, 32 ori $a2, $zero, 72 move $a1, $zero pcaddu18i $ra, %call36(memset) jirl $ra, $ra, 0 - fld.d $fa1, $sp, 304 - fld.d $fa4, $sp, 232 - fld.d $fa2, $sp, 328 - fld.d $fa5, $sp, 240 - fld.d $fa3, $sp, 352 - fld.d $fa6, $sp, 248 + fld.d $fa1, $sp, 320 + fld.d $fa4, $sp, 248 + fld.d $fa2, $sp, 344 + fld.d $fa5, $sp, 256 + fld.d $fa3, $sp, 368 + fld.d $fa6, $sp, 264 movgr2fr.d $fa0, $zero fmadd.d $fa7, $fa1, $fa4, $fa0 fmadd.d $fa7, $fa2, $fa5, $fa7 fmadd.d $fa7, $fa3, $fa6, $fa7 - fld.d $ft0, $sp, 256 - fld.d $ft1, $sp, 264 - fld.d $ft2, $sp, 272 - fst.d $fa7, $sp, 160 + fld.d $ft0, $sp, 272 + fld.d $ft1, $sp, 280 + fld.d $ft2, $sp, 288 + fst.d $fa7, $sp, 176 fmadd.d $fa7, $fa1, $ft0, $fa0 fmadd.d $fa7, $fa2, $ft1, $fa7 fmadd.d $fa7, $fa3, $ft2, $fa7 - fld.d $ft3, $sp, 280 - fld.d $ft4, $sp, 288 - fld.d $ft5, $sp, 296 - fst.d $fa7, $sp, 184 + fld.d $ft3, $sp, 296 + fld.d $ft4, $sp, 304 + fld.d $ft5, $sp, 312 + fst.d $fa7, $sp, 200 fmadd.d $fa1, $fa1, $ft3, $fa0 fmadd.d $fa1, $fa2, $ft4, $fa1 fmadd.d $fa1, $fa3, $ft5, $fa1 - fld.d $fa2, $sp, 312 - fld.d $fa3, $sp, 336 - fld.d $fa7, $sp, 360 - fst.d $fa1, $sp, 208 + fld.d $fa2, $sp, 328 + fld.d $fa3, $sp, 352 + fld.d $fa7, $sp, 376 + fst.d $fa1, $sp, 224 fmadd.d $fa1, $fa2, $fa4, $fa0 fmadd.d $fa1, $fa3, $fa5, $fa1 fmadd.d $fa1, $fa7, $fa6, $fa1 - fst.d $fa1, $sp, 168 + fst.d $fa1, $sp, 184 fmadd.d $fa1, $fa2, $ft0, $fa0 fmadd.d $fa1, $fa3, $ft1, $fa1 fmadd.d $fa1, $fa7, $ft2, $fa1 - fst.d $fa1, $sp, 192 + fst.d $fa1, $sp, 208 fmadd.d $fa1, $fa2, $ft3, $fa0 fmadd.d $fa1, $fa3, $ft4, $fa1 fmadd.d $fa1, $fa7, $ft5, $fa1 - fld.d $fa7, $sp, 176 - fld.d $fa2, $sp, 320 - fst.d $fa1, $sp, 216 - fld.d $fa3, $sp, 344 - fld.d $fa1, $sp, 368 + fld.d $fa7, $sp, 192 + fld.d $fa2, $sp, 336 + fst.d $fa1, $sp, 232 + fld.d $fa3, $sp, 360 + fld.d $fa1, $sp, 384 fmadd.d $fa4, $fa2, $fa4, $fa7 - fld.d $fa7, $sp, 200 + fld.d $fa7, $sp, 216 fmadd.d $fa4, $fa3, $fa5, $fa4 fmadd.d $fa4, $fa1, $fa6, $fa4 - fst.d $fa4, $sp, 176 + fst.d $fa4, $sp, 192 fmadd.d $fa4, $fa2, $ft0, $fa7 - fld.d $fa5, $sp, 224 + fld.d $fa5, $sp, 240 fmadd.d $fa4, $fa3, $ft1, $fa4 fmadd.d $fa4, $fa1, $ft2, $fa4 - fst.d $fa4, $sp, 200 + fst.d $fa4, $sp, 216 fmadd.d $fa4, $fa2, $ft3, $fa5 fmadd.d $fa4, $fa3, $ft4, $fa4 fmadd.d $fa4, $fa1, $ft5, $fa4 - fst.d $fa4, $sp, 224 - vld $vr5, $sp, 304 - vld $vr6, $sp, 328 - vld $vr4, $sp, 352 - fld.d $ft2, $sp, 248 - vld $vr12, $sp, 232 - fld.d $ft0, $sp, 272 - vld $vr11, $sp, 256 - fld.d $fa7, $sp, 296 - vld $vr9, $sp, 280 - vreplvei.d $vr15, $vr5, 0 - vreplvei.d $vr13, $vr12, 0 - fmadd.d $ft8, $ft7, $ft5, $fa0 - vreplvei.d $vr17, $vr6, 0 - vreplvei.d $vr14, $vr12, 1 - fmadd.d $ft8, $ft9, $ft6, $ft8 - vreplvei.d $vr18, $vr4, 0 - fmadd.d $ft8, $ft10, $ft2, $ft8 - vreplvei.d $vr19, $vr5, 1 - fmadd.d $ft12, $ft11, $ft5, $fa0 - vreplvei.d $vr21, $vr6, 1 - fmadd.d $ft12, $ft13, $ft6, $ft12 - vreplvei.d $vr22, $vr4, 1 - fmadd.d $ft12, $ft14, $ft2, $ft12 - fmadd.d $ft15, $fa2, $ft5, $fa0 - fmadd.d $ft15, $fa3, $ft6, $ft15 - fmadd.d $ft15, $fa1, $ft2, $ft15 - vextrins.d $vr16, $vr20, 16 - vreplvei.d $vr20, $vr11, 0 - fmadd.d $fs0, $ft7, $ft12, $fa0 - vreplvei.d $vr25, $vr11, 1 - fmadd.d $fs0, $ft9, $fs1, $fs0 - fmadd.d $fs0, $ft10, $ft0, $fs0 - fmadd.d $fs2, $ft11, $ft12, $fa0 - fmadd.d $fs2, $ft13, $fs1, $fs2 - fmadd.d $fs2, $ft14, $ft0, $fs2 - fmadd.d $fs3, $fa2, $ft12, $fa0 - fmadd.d $fs3, $fa3, $fs1, $fs3 - fmadd.d $fs3, $fa1, $ft0, $fs3 - vextrins.d $vr24, $vr26, 16 - vreplvei.d $vr26, $vr9, 0 - fmadd.d $ft7, $ft7, $fs2, $fa0 - vreplvei.d $vr28, $vr9, 1 - fmadd.d $ft7, $ft9, $fs4, $ft7 - fmadd.d $ft7, $ft10, $fa7, $ft7 - fmadd.d $ft9, $ft11, $fs2, $fa0 - fmadd.d $ft9, $ft13, $fs4, $ft9 - fmadd.d $ft9, $ft14, $fa7, $ft9 - fmadd.d $fa0, $fa2, $fs2, $fa0 - fmadd.d $fa0, $fa3, $fs4, $fa0 + fst.d $fa4, $sp, 240 + vld $vr5, $sp, 320 + vld $vr6, $sp, 344 + vld $vr4, $sp, 368 + fld.d $ft0, $sp, 264 + vld $vr9, $sp, 248 + fld.d $ft2, $sp, 288 + vld $vr11, $sp, 272 + fld.d $fa7, $sp, 312 + vld $vr12, $sp, 296 + vreplvei.d $vr13, $vr9, 0 + vreplvei.d $vr14, $vr9, 1 + vreplvei.d $vr15, $vr8, 0 + vld $vr26, $sp, 16 # 16-byte Folded Reload + vfmadd.d $vr16, $vr5, $vr13, $vr26 + vfmadd.d $vr16, $vr6, $vr14, $vr16 + vfmadd.d $vr16, $vr4, $vr15, $vr16 + fmadd.d $ft9, $fa2, $ft5, $fa0 + fmadd.d $ft9, $fa3, $ft6, $ft9 + fmadd.d $ft9, $fa1, $ft0, $ft9 + vreplvei.d $vr18, $vr11, 0 + vreplvei.d $vr19, $vr11, 1 + vreplvei.d $vr20, $vr10, 0 + vfmadd.d $vr21, $vr5, $vr18, $vr26 + vfmadd.d $vr21, $vr6, $vr19, $vr21 + vfmadd.d $vr21, $vr4, $vr20, $vr21 + fmadd.d $ft14, $fa2, $ft10, $fa0 + fmadd.d $ft14, $fa3, $ft11, $ft14 + fmadd.d $ft14, $fa1, $ft2, $ft14 + vreplvei.d $vr23, $vr12, 0 + vreplvei.d $vr24, $vr12, 1 + vreplvei.d $vr25, $vr7, 0 + vfmadd.d $vr26, $vr5, $vr23, $vr26 + vfmadd.d $vr26, $vr6, $vr24, $vr26 + vfmadd.d $vr26, $vr4, $vr25, $vr26 + fmadd.d $fa0, $fa2, $ft15, $fa0 + fmadd.d $fa0, $fa3, $fs0, $fa0 fmadd.d $fa0, $fa1, $fa7, $fa0 - vextrins.d $vr15, $vr17, 16 - fst.d $ft15, $sp, 104 - vst $vr16, $sp, 88 - fst.d $fs3, $sp, 128 - vst $vr24, $sp, 112 - fst.d $fa0, $sp, 152 - vst $vr15, $sp, 136 - vfmul.d $vr0, $vr2, $vr12 - vfmul.d $vr12, $vr5, $vr13 + vst $vr16, $sp, 104 + fst.d $ft9, $sp, 120 + vst $vr21, $sp, 128 + fst.d $ft14, $sp, 144 + vst $vr26, $sp, 152 + fst.d $fa0, $sp, 168 + vfmul.d $vr0, $vr2, $vr9 + vfmul.d $vr9, $vr5, $vr13 vfmul.d $vr13, $vr3, $vr14 vfmul.d $vr14, $vr6, $vr14 - vfadd.d $vr12, $vr12, $vr14 + vfadd.d $vr9, $vr9, $vr14 vfadd.d $vr0, $vr0, $vr13 - vreplvei.d $vr13, $vr10, 0 - vfmul.d $vr13, $vr4, $vr13 - vfmul.d $vr10, $vr1, $vr10 - vfadd.d $vr0, $vr0, $vr10 - vfadd.d $vr10, $vr12, $vr13 - vfmul.d $vr11, $vr2, $vr11 - vfmul.d $vr12, $vr5, $vr20 - vfmul.d $vr13, $vr3, $vr25 - vfmul.d $vr14, $vr6, $vr25 - vfadd.d $vr12, $vr12, $vr14 - vfadd.d $vr11, $vr11, $vr13 - vreplvei.d $vr13, $vr8, 0 - vfmul.d $vr13, $vr4, $vr13 + vfmul.d $vr13, $vr4, $vr15 vfmul.d $vr8, $vr1, $vr8 - vfadd.d $vr8, $vr11, $vr8 - vfadd.d $vr11, $vr12, $vr13 - vfmul.d $vr2, $vr2, $vr9 - vfmul.d $vr5, $vr5, $vr26 - vfmul.d $vr3, $vr3, $vr28 - vfmul.d $vr6, $vr6, $vr28 + vfadd.d $vr0, $vr0, $vr8 + vfadd.d $vr8, $vr9, $vr13 + vfmul.d $vr9, $vr2, $vr11 + vfmul.d $vr11, $vr5, $vr18 + vfmul.d $vr13, $vr3, $vr19 + vfmul.d $vr14, $vr6, $vr19 + vfadd.d $vr11, $vr11, $vr14 + vfadd.d $vr9, $vr9, $vr13 + vfmul.d $vr13, $vr4, $vr20 + vfmul.d $vr10, $vr1, $vr10 + vfadd.d $vr9, $vr9, $vr10 + vfadd.d $vr10, $vr11, $vr13 + vfmul.d $vr2, $vr2, $vr12 + vfmul.d $vr5, $vr5, $vr23 + vfmul.d $vr3, $vr3, $vr24 + vfmul.d $vr6, $vr6, $vr24 vfadd.d $vr5, $vr5, $vr6 vfadd.d $vr2, $vr2, $vr3 - vreplvei.d $vr3, $vr7, 0 - vfmul.d $vr3, $vr4, $vr3 + vfmul.d $vr3, $vr4, $vr25 vfmul.d $vr1, $vr1, $vr7 vfadd.d $vr1, $vr2, $vr1 vfadd.d $vr2, $vr5, $vr3 - vst $vr10, $sp, 16 - vstelm.d $vr0, $sp, 32, 0 - vst $vr11, $sp, 40 - vstelm.d $vr8, $sp, 56, 0 - vst $vr2, $sp, 64 - vstelm.d $vr1, $sp, 80, 0 - addi.d $a0, $sp, 88 - addi.d $a1, $sp, 16 + vst $vr8, $sp, 32 + vstelm.d $vr0, $sp, 48, 0 + vst $vr10, $sp, 56 + vstelm.d $vr9, $sp, 72, 0 + vst $vr2, $sp, 80 + vstelm.d $vr1, $sp, 96, 0 + addi.d $a0, $sp, 104 + addi.d $a1, $sp, 32 ori $a2, $zero, 3 ori $a3, $zero, 3 pcaddu18i $ra, %call36(_Z14expectMatrixEQIdTnNSt9enable_ifIXsr3std17is_floating_pointIT_EE5valueEiE4typeELi0EEvPS1_S4_jj) jirl $ra, $ra, 0 - addi.d $a0, $sp, 160 - addi.d $a1, $sp, 16 + addi.d $a0, $sp, 176 + addi.d $a1, $sp, 32 ori $a2, $zero, 3 ori $a3, $zero, 3 pcaddu18i $ra, %call36(_Z14expectMatrixEQIdTnNSt9enable_ifIXsr3std17is_floating_pointIT_EE5valueEiE4typeELi0EEvPS1_S4_jj) jirl $ra, $ra, 0 - addi.d $a0, $sp, 160 - addi.d $a1, $sp, 88 + addi.d $a0, $sp, 176 + addi.d $a1, $sp, 104 ori $a2, $zero, 3 ori $a3, $zero, 3 pcaddu18i $ra, %call36(_Z14expectMatrixEQIdTnNSt9enable_ifIXsr3std17is_floating_pointIT_EE5valueEiE4typeELi0EEvPS1_S4_jj) jirl $ra, $ra, 0 - fld.d $fs4, $sp, 376 # 8-byte Folded Reload - fld.d $fs3, $sp, 384 # 8-byte Folded Reload fld.d $fs2, $sp, 392 # 8-byte Folded Reload fld.d $fs1, $sp, 400 # 8-byte Folded Reload fld.d $fs0, $sp, 408 # 8-byte Folded Reload diff --git a/results/SingleSource/UnitTests/HashRecognize/CMakeFiles/crc32.be.dir/crc32.be.s b/results/SingleSource/UnitTests/HashRecognize/CMakeFiles/crc32.be.dir/crc32.be.s index dd11daaf..88b6a37f 100644 --- a/results/SingleSource/UnitTests/HashRecognize/CMakeFiles/crc32.be.dir/crc32.be.s +++ b/results/SingleSource/UnitTests/HashRecognize/CMakeFiles/crc32.be.dir/crc32.be.s @@ -2,56 +2,57 @@ .section .rodata.cst16,"aM",@progbits,16 .p2align 4, 0x0 # -- Begin function main .LCPI0_0: - .word 169000 # 0x29428 - .word 202800 # 0x31830 - .word 236600 # 0x39c38 - .word 270400 # 0x42040 -.LCPI0_1: .word 33800 # 0x8408 .word 67600 # 0x10810 .word 101400 # 0x18c18 .word 135200 # 0x21020 +.LCPI0_1: + .word 169000 # 0x29428 + .word 202800 # 0x31830 + .word 236600 # 0x39c38 + .word 270400 # 0x42040 .LCPI0_2: - .word 439400 # 0x6b468 - .word 473200 # 0x73870 - .word 507000 # 0x7bc78 - .word 540800 # 0x84080 -.LCPI0_3: .word 304200 # 0x4a448 .word 338000 # 0x52850 .word 371800 # 0x5ac58 .word 405600 # 0x63060 +.LCPI0_3: + .word 439400 # 0x6b468 + .word 473200 # 0x73870 + .word 507000 # 0x7bc78 + .word 540800 # 0x84080 .LCPI0_4: - .word 709800 # 0xad4a8 - .word 743600 # 0xb58b0 - .word 777400 # 0xbdcb8 - .word 811200 # 0xc60c0 -.LCPI0_5: .word 574600 # 0x8c488 .word 608400 # 0x94890 .word 642200 # 0x9cc98 .word 676000 # 0xa50a0 +.LCPI0_5: + .word 709800 # 0xad4a8 + .word 743600 # 0xb58b0 + .word 777400 # 0xbdcb8 + .word 811200 # 0xc60c0 +.LCPI0_6: + .word 845000 # 0xce4c8 + .word 878800 # 0xd68d0 + .word 912600 # 0xdecd8 + .word 946400 # 0xe70e0 .text .globl main .p2align 5 .type main,@function main: # @main # %bb.0: - addi.d $sp, $sp, -128 - st.d $fp, $sp, 120 # 8-byte Folded Spill - st.d $s0, $sp, 112 # 8-byte Folded Spill - st.d $s1, $sp, 104 # 8-byte Folded Spill - st.d $s2, $sp, 96 # 8-byte Folded Spill - st.d $s3, $sp, 88 # 8-byte Folded Spill - st.d $s4, $sp, 80 # 8-byte Folded Spill - fst.d $fs0, $sp, 72 # 8-byte Folded Spill - fst.d $fs1, $sp, 64 # 8-byte Folded Spill - fst.d $fs2, $sp, 56 # 8-byte Folded Spill - fst.d $fs3, $sp, 48 # 8-byte Folded Spill - fst.d $fs4, $sp, 40 # 8-byte Folded Spill - fst.d $fs5, $sp, 32 # 8-byte Folded Spill - fst.d $fs6, $sp, 24 # 8-byte Folded Spill - fst.d $fs7, $sp, 16 # 8-byte Folded Spill + addi.d $sp, $sp, -112 + st.d $fp, $sp, 104 # 8-byte Folded Spill + st.d $s0, $sp, 96 # 8-byte Folded Spill + fst.d $fs0, $sp, 88 # 8-byte Folded Spill + fst.d $fs1, $sp, 80 # 8-byte Folded Spill + fst.d $fs2, $sp, 72 # 8-byte Folded Spill + fst.d $fs3, $sp, 64 # 8-byte Folded Spill + fst.d $fs4, $sp, 56 # 8-byte Folded Spill + fst.d $fs5, $sp, 48 # 8-byte Folded Spill + fst.d $fs6, $sp, 40 # 8-byte Folded Spill + fst.d $fs7, $sp, 32 # 8-byte Folded Spill pcalau12i $a0, %pc_hi20(CRCTable) addi.d $a1, $a0, %pc_lo12(CRCTable) ld.w $a0, $a1, 0 @@ -59,306 +60,292 @@ main: # @main vld $vr0, $a2, %pc_lo12(.LCPI0_0) pcalau12i $a2, %pc_hi20(.LCPI0_1) vld $vr1, $a2, %pc_lo12(.LCPI0_1) - vreplgr2vr.w $vr5, $a0 - vxor.v $vr0, $vr5, $vr0 - vxor.v $vr1, $vr5, $vr1 pcalau12i $a2, %pc_hi20(.LCPI0_2) vld $vr2, $a2, %pc_lo12(.LCPI0_2) + vreplgr2vr.w $vr6, $a0 + vxor.v $vr0, $vr6, $vr0 + vst $vr0, $sp, 16 # 16-byte Folded Spill + vxor.v $vr1, $vr6, $vr1 + vxor.v $vr2, $vr6, $vr2 pcalau12i $a2, %pc_hi20(.LCPI0_3) vld $vr3, $a2, %pc_lo12(.LCPI0_3) pcalau12i $a2, %pc_hi20(.LCPI0_4) vld $vr4, $a2, %pc_lo12(.LCPI0_4) pcalau12i $a2, %pc_hi20(.LCPI0_5) - vld $vr6, $a2, %pc_lo12(.LCPI0_5) - vxor.v $vr2, $vr5, $vr2 - vxor.v $vr3, $vr5, $vr3 - vxor.v $vr4, $vr5, $vr4 - vxor.v $vr5, $vr5, $vr6 - lu12i.w $a2, 206 - ori $a2, $a2, 1224 + vld $vr5, $a2, %pc_lo12(.LCPI0_5) + pcalau12i $a2, %pc_hi20(.LCPI0_6) + vld $vr7, $a2, %pc_lo12(.LCPI0_6) + vxor.v $vr3, $vr6, $vr3 + vxor.v $vr4, $vr6, $vr4 + vxor.v $vr5, $vr6, $vr5 + vxor.v $vr6, $vr6, $vr7 + lu12i.w $a2, 239 + ori $a2, $a2, 1256 xor $a2, $a0, $a2 - lu12i.w $a3, 214 - ori $a3, $a3, 2256 + lu12i.w $a3, 247 + ori $a3, $a3, 2288 xor $a3, $a0, $a3 - lu12i.w $a4, 222 - ori $a4, $a4, 3288 + lu12i.w $a4, 255 + ori $a4, $a4, 3320 xor $a4, $a0, $a4 - lu12i.w $a5, 231 - ori $a5, $a5, 224 - xor $a5, $a0, $a5 - lu12i.w $a6, 239 - ori $a6, $a6, 1256 - xor $a6, $a0, $a6 - lu12i.w $a7, 247 - ori $a7, $a7, 2288 - xor $a7, $a0, $a7 - lu12i.w $t0, 255 - ori $t0, $t0, 3320 - xor $t0, $a0, $t0 - ori $t1, $zero, 28 + ori $a5, $zero, 28 pcalau12i $a0, %pc_hi20(main.sample) - addi.d $t2, $a0, %pc_lo12(main.sample) + addi.d $a6, $a0, %pc_lo12(main.sample) lu12i.w $a0, 264 ori $a0, $a0, 256 - vreplgr2vr.w $vr6, $a0 + vreplgr2vr.w $vr7, $a0 lu12i.w $a0, 528 ori $a0, $a0, 512 - vreplgr2vr.w $vr7, $a0 + vreplgr2vr.w $vr8, $a0 lu12i.w $a0, 792 ori $a0, $a0, 768 - vreplgr2vr.w $vr8, $a0 + vreplgr2vr.w $vr9, $a0 lu12i.w $a0, 1056 ori $a0, $a0, 1024 - vreplgr2vr.w $vr9, $a0 + vreplgr2vr.w $vr10, $a0 lu12i.w $a0, 1320 ori $a0, $a0, 1280 - vreplgr2vr.w $vr10, $a0 + vreplgr2vr.w $vr11, $a0 lu12i.w $a0, 1584 ori $a0, $a0, 1536 - vreplgr2vr.w $vr11, $a0 + vreplgr2vr.w $vr12, $a0 lu12i.w $a0, 1848 ori $a0, $a0, 1792 - vreplgr2vr.w $vr12, $a0 - lu12i.w $t3, -4096 - lu32i.d $t3, 0 + vreplgr2vr.w $vr13, $a0 + lu12i.w $a7, -4096 + lu32i.d $a7, 0 pcalau12i $a0, %pc_hi20(.L.crctable) - addi.d $t4, $a0, %pc_lo12(.L.crctable) - move $t5, $zero + addi.d $t0, $a0, %pc_lo12(.L.crctable) + move $t1, $zero move $a0, $zero - ori $t6, $zero, 1 - addi.w $t7, $zero, -4 + ori $t2, $zero, 1 + addi.w $t3, $zero, -4 b .LBB0_2 .p2align 4, , 16 .LBB0_1: # %crc_table.exit # in Loop: Header=BB0_2 Depth=1 - ldx.w $t8, $t2, $t5 - ldx.w $fp, $t2, $t1 - xor $s0, $fp, $t8 - and $s1, $s0, $t3 - srli.d $s1, $s1, 22 - ldx.w $s2, $a1, $s1 - slli.d $s0, $s0, 8 - xor $s3, $s2, $s0 - and $s3, $s3, $t3 - srli.d $s3, $s3, 22 - ldx.w $s3, $a1, $s3 - slli.d $t8, $t8, 16 - slli.d $s2, $s2, 8 - xor $s2, $s3, $s2 - xor $s2, $s2, $t8 - slli.d $s3, $fp, 16 - xor $s4, $s2, $s3 - and $s4, $s4, $t3 - srli.d $s4, $s4, 22 - ldx.w $s4, $a1, $s4 - ldx.w $s1, $t4, $s1 - slli.d $s2, $s2, 8 - xor $s2, $s2, $s4 - slli.d $fp, $fp, 24 - xor $s0, $s1, $s0 - and $s0, $s0, $t3 - srli.d $s0, $s0, 22 - ldx.w $s0, $t4, $s0 - xor $s4, $s2, $fp - and $s4, $s4, $t3 - slli.d $s1, $s1, 8 - xor $s0, $s0, $s1 - xor $t8, $s0, $t8 - xor $s0, $t8, $s3 - and $s0, $s0, $t3 + ldx.w $t4, $a6, $t1 + ldx.w $t5, $a6, $a5 + xor $t6, $t5, $t4 + and $t7, $t6, $a7 + srli.d $t7, $t7, 22 + ldx.w $t8, $a1, $t7 + slli.d $t6, $t6, 8 + xor $fp, $t8, $t6 + and $fp, $fp, $a7 + srli.d $fp, $fp, 22 + ldx.w $fp, $a1, $fp + slli.d $t4, $t4, 16 + slli.d $t8, $t8, 8 + xor $t8, $fp, $t8 + xor $t8, $t8, $t4 + slli.d $fp, $t5, 16 + xor $s0, $t8, $fp + and $s0, $s0, $a7 srli.d $s0, $s0, 22 - ldx.w $s0, $t4, $s0 - srli.d $s1, $s4, 22 - ldx.w $s1, $a1, $s1 + ldx.w $s0, $a1, $s0 + ldx.w $t7, $t0, $t7 slli.d $t8, $t8, 8 xor $t8, $t8, $s0 - xor $fp, $t8, $fp - and $fp, $fp, $t3 - srli.d $fp, $fp, 22 - ldx.w $fp, $t4, $fp - slli.w $s0, $s2, 8 - xor $s0, $s0, $s1 - slli.w $t8, $t8, 8 - xor $t8, $t8, $fp - xor $t8, $s0, $t8 - sltui $t8, $t8, 1 - masknez $fp, $t6, $t8 - maskeqz $a0, $a0, $t8 - or $a0, $a0, $fp - addi.d $t1, $t1, -4 - addi.d $t5, $t5, 4 - beq $t1, $t7, .LBB0_4 + slli.d $t5, $t5, 24 + xor $t6, $t7, $t6 + and $t6, $t6, $a7 + srli.d $t6, $t6, 22 + ldx.w $t6, $t0, $t6 + xor $s0, $t8, $t5 + and $s0, $s0, $a7 + slli.d $t7, $t7, 8 + xor $t6, $t6, $t7 + xor $t4, $t6, $t4 + xor $t6, $t4, $fp + and $t6, $t6, $a7 + srli.d $t6, $t6, 22 + ldx.w $t6, $t0, $t6 + srli.d $t7, $s0, 22 + ldx.w $t7, $a1, $t7 + slli.d $t4, $t4, 8 + xor $t4, $t4, $t6 + xor $t5, $t4, $t5 + and $t5, $t5, $a7 + srli.d $t5, $t5, 22 + ldx.w $t5, $t0, $t5 + slli.w $t6, $t8, 8 + xor $t6, $t6, $t7 + slli.w $t4, $t4, 8 + xor $t4, $t4, $t5 + xor $t4, $t6, $t4 + sltui $t4, $t4, 1 + masknez $t5, $t2, $t4 + maskeqz $a0, $a0, $t4 + or $a0, $a0, $t5 + addi.d $a5, $a5, -4 + addi.d $t1, $t1, 4 + beq $a5, $t3, .LBB0_4 .LBB0_2: # =>This Inner Loop Header: Depth=1 - ld.w $t8, $a1, 1020 - bnez $t8, .LBB0_1 + ld.w $t4, $a1, 1020 + bnez $t4, .LBB0_1 # %bb.3: # %vector.ph20 # in Loop: Header=BB0_2 Depth=1 - vst $vr1, $a1, 4 - vst $vr0, $a1, 20 - vst $vr3, $a1, 36 - vst $vr2, $a1, 52 - vst $vr5, $a1, 68 - vst $vr4, $a1, 84 - st.w $a2, $a1, 100 - st.w $a3, $a1, 104 - st.w $a4, $a1, 108 - st.w $a5, $a1, 112 - st.w $a6, $a1, 116 - vld $vr13, $a1, 0 - vld $vr14, $a1, 16 - st.w $a7, $a1, 120 - st.w $t0, $a1, 124 - vxor.v $vr17, $vr13, $vr6 - vxor.v $vr18, $vr14, $vr6 - vld $vr15, $a1, 32 - vld $vr16, $a1, 48 - vst $vr17, $a1, 128 - vst $vr18, $a1, 144 - vxor.v $vr19, $vr15, $vr6 - vxor.v $vr20, $vr16, $vr6 - vld $vr17, $a1, 64 - vld $vr18, $a1, 80 - vst $vr19, $a1, 160 - vst $vr20, $a1, 176 - vxor.v $vr21, $vr17, $vr6 - vxor.v $vr22, $vr18, $vr6 - vld $vr19, $a1, 96 - vld $vr20, $a1, 112 - vst $vr21, $a1, 192 - vst $vr22, $a1, 208 - vxor.v $vr23, $vr19, $vr6 - vxor.v $vr24, $vr20, $vr6 - vld $vr21, $a1, 0 - vld $vr22, $a1, 16 - vst $vr23, $a1, 224 - vst $vr24, $a1, 240 + vld $vr0, $sp, 16 # 16-byte Folded Reload + vst $vr0, $a1, 4 + vst $vr1, $a1, 20 + vst $vr2, $a1, 36 + vst $vr3, $a1, 52 + vst $vr4, $a1, 68 + vst $vr5, $a1, 84 + vst $vr6, $a1, 100 + st.w $a2, $a1, 116 + vld $vr14, $a1, 0 + vld $vr15, $a1, 16 + st.w $a3, $a1, 120 + st.w $a4, $a1, 124 + vxor.v $vr18, $vr14, $vr7 + vxor.v $vr19, $vr15, $vr7 + vld $vr16, $a1, 32 + vld $vr17, $a1, 48 + vst $vr18, $a1, 128 + vst $vr19, $a1, 144 + vxor.v $vr20, $vr16, $vr7 + vxor.v $vr21, $vr17, $vr7 + vld $vr18, $a1, 64 + vld $vr19, $a1, 80 + vst $vr20, $a1, 160 + vst $vr21, $a1, 176 + vxor.v $vr22, $vr18, $vr7 + vxor.v $vr23, $vr19, $vr7 + vld $vr20, $a1, 96 + vld $vr21, $a1, 112 + vst $vr22, $a1, 192 + vst $vr23, $a1, 208 + vxor.v $vr24, $vr20, $vr7 vxor.v $vr25, $vr21, $vr7 - vxor.v $vr26, $vr22, $vr7 - vld $vr23, $a1, 32 - vld $vr24, $a1, 48 - vst $vr25, $a1, 256 - vst $vr26, $a1, 272 - vxor.v $vr27, $vr23, $vr7 - vxor.v $vr28, $vr24, $vr7 - vld $vr25, $a1, 64 - vld $vr26, $a1, 80 - vst $vr27, $a1, 288 - vst $vr28, $a1, 304 - vxor.v $vr29, $vr25, $vr7 - vxor.v $vr30, $vr26, $vr7 - vld $vr27, $a1, 96 - vld $vr28, $a1, 112 - vst $vr29, $a1, 320 - vst $vr30, $a1, 336 - vxor.v $vr29, $vr27, $vr7 - vxor.v $vr30, $vr28, $vr7 - vst $vr29, $a1, 352 - vst $vr30, $a1, 368 - vxor.v $vr29, $vr13, $vr8 - vxor.v $vr30, $vr14, $vr8 - vst $vr29, $a1, 384 - vst $vr30, $a1, 400 - vxor.v $vr29, $vr15, $vr8 - vxor.v $vr30, $vr16, $vr8 - vst $vr29, $a1, 416 - vst $vr30, $a1, 432 - vxor.v $vr29, $vr17, $vr8 - vxor.v $vr30, $vr18, $vr8 - vst $vr29, $a1, 448 - vst $vr30, $a1, 464 - vxor.v $vr29, $vr19, $vr8 - vxor.v $vr30, $vr20, $vr8 - vld $vr31, $a1, 0 - vst $vr29, $a1, 480 - vst $vr30, $a1, 496 - vld $vr29, $a1, 16 - vxor.v $vr30, $vr31, $vr9 - vst $vr30, $a1, 512 - vld $vr30, $a1, 32 + vld $vr22, $a1, 0 + vld $vr23, $a1, 16 + vst $vr24, $a1, 224 + vst $vr25, $a1, 240 + vxor.v $vr26, $vr22, $vr8 + vxor.v $vr27, $vr23, $vr8 + vld $vr24, $a1, 32 + vld $vr25, $a1, 48 + vst $vr26, $a1, 256 + vst $vr27, $a1, 272 + vxor.v $vr28, $vr24, $vr8 + vxor.v $vr29, $vr25, $vr8 + vld $vr26, $a1, 64 + vld $vr27, $a1, 80 + vst $vr28, $a1, 288 + vst $vr29, $a1, 304 + vxor.v $vr30, $vr26, $vr8 + vxor.v $vr31, $vr27, $vr8 + vld $vr28, $a1, 96 + vld $vr29, $a1, 112 + vst $vr30, $a1, 320 + vst $vr31, $a1, 336 + vxor.v $vr30, $vr28, $vr8 + vxor.v $vr31, $vr29, $vr8 + vst $vr30, $a1, 352 + vst $vr31, $a1, 368 + vxor.v $vr30, $vr14, $vr9 + vxor.v $vr31, $vr15, $vr9 + vst $vr30, $a1, 384 + vst $vr31, $a1, 400 + vxor.v $vr30, $vr16, $vr9 + vxor.v $vr31, $vr17, $vr9 + vst $vr30, $a1, 416 + vst $vr31, $a1, 432 + vxor.v $vr30, $vr18, $vr9 + vxor.v $vr31, $vr19, $vr9 + vst $vr30, $a1, 448 + vst $vr31, $a1, 464 + vxor.v $vr30, $vr20, $vr9 + vxor.v $vr31, $vr21, $vr9 + vld $vr0, $a1, 0 + vst $vr30, $a1, 480 + vst $vr31, $a1, 496 + vld $vr30, $a1, 16 + vxor.v $vr0, $vr0, $vr10 + vst $vr0, $a1, 512 + vld $vr0, $a1, 32 vld $vr31, $a1, 48 - vxor.v $vr29, $vr29, $vr9 - vst $vr29, $a1, 528 - vxor.v $vr29, $vr30, $vr9 - vxor.v $vr30, $vr31, $vr9 + vxor.v $vr30, $vr30, $vr10 + vst $vr30, $a1, 528 + vxor.v $vr0, $vr0, $vr10 + vxor.v $vr30, $vr31, $vr10 vld $vr31, $a1, 64 - vst $vr29, $a1, 544 + vst $vr0, $a1, 544 vst $vr30, $a1, 560 - vld $vr29, $a1, 80 - vxor.v $vr30, $vr31, $vr9 + vld $vr0, $a1, 80 + vxor.v $vr30, $vr31, $vr10 vst $vr30, $a1, 576 vld $vr30, $a1, 96 vld $vr31, $a1, 112 - vxor.v $vr29, $vr29, $vr9 - vst $vr29, $a1, 592 - vxor.v $vr29, $vr30, $vr9 - vxor.v $vr30, $vr31, $vr9 - vst $vr29, $a1, 608 + vxor.v $vr0, $vr0, $vr10 + vst $vr0, $a1, 592 + vxor.v $vr0, $vr30, $vr10 + vxor.v $vr30, $vr31, $vr10 + vst $vr0, $a1, 608 vst $vr30, $a1, 624 - vxor.v $vr29, $vr13, $vr10 - vxor.v $vr30, $vr14, $vr10 - vst $vr29, $a1, 640 + vxor.v $vr0, $vr14, $vr11 + vxor.v $vr30, $vr15, $vr11 + vst $vr0, $a1, 640 vst $vr30, $a1, 656 - vxor.v $vr29, $vr15, $vr10 - vxor.v $vr30, $vr16, $vr10 - vst $vr29, $a1, 672 + vxor.v $vr0, $vr16, $vr11 + vxor.v $vr30, $vr17, $vr11 + vst $vr0, $a1, 672 vst $vr30, $a1, 688 - vxor.v $vr29, $vr17, $vr10 - vxor.v $vr30, $vr18, $vr10 - vst $vr29, $a1, 704 + vxor.v $vr0, $vr18, $vr11 + vxor.v $vr30, $vr19, $vr11 + vst $vr0, $a1, 704 vst $vr30, $a1, 720 - vxor.v $vr29, $vr19, $vr10 - vxor.v $vr30, $vr20, $vr10 - vst $vr29, $a1, 736 + vxor.v $vr0, $vr20, $vr11 + vxor.v $vr30, $vr21, $vr11 + vst $vr0, $a1, 736 vst $vr30, $a1, 752 - vxor.v $vr21, $vr21, $vr11 - vxor.v $vr22, $vr22, $vr11 - vst $vr21, $a1, 768 + vxor.v $vr0, $vr22, $vr12 + vxor.v $vr22, $vr23, $vr12 + vst $vr0, $a1, 768 vst $vr22, $a1, 784 - vxor.v $vr21, $vr23, $vr11 - vxor.v $vr22, $vr24, $vr11 - vst $vr21, $a1, 800 + vxor.v $vr0, $vr24, $vr12 + vxor.v $vr22, $vr25, $vr12 + vst $vr0, $a1, 800 vst $vr22, $a1, 816 - vxor.v $vr21, $vr25, $vr11 - vxor.v $vr22, $vr26, $vr11 - vst $vr21, $a1, 832 + vxor.v $vr0, $vr26, $vr12 + vxor.v $vr22, $vr27, $vr12 + vst $vr0, $a1, 832 vst $vr22, $a1, 848 - vxor.v $vr21, $vr27, $vr11 - vxor.v $vr22, $vr28, $vr11 - vst $vr21, $a1, 864 + vxor.v $vr0, $vr28, $vr12 + vxor.v $vr22, $vr29, $vr12 + vst $vr0, $a1, 864 vst $vr22, $a1, 880 - vxor.v $vr13, $vr13, $vr12 - vxor.v $vr14, $vr14, $vr12 - vst $vr13, $a1, 896 + vxor.v $vr0, $vr14, $vr13 + vxor.v $vr14, $vr15, $vr13 + vst $vr0, $a1, 896 vst $vr14, $a1, 912 - vxor.v $vr13, $vr15, $vr12 - vxor.v $vr14, $vr16, $vr12 - vst $vr13, $a1, 928 + vxor.v $vr0, $vr16, $vr13 + vxor.v $vr14, $vr17, $vr13 + vst $vr0, $a1, 928 vst $vr14, $a1, 944 - vxor.v $vr13, $vr17, $vr12 - vxor.v $vr14, $vr18, $vr12 - vst $vr13, $a1, 960 + vxor.v $vr0, $vr18, $vr13 + vxor.v $vr14, $vr19, $vr13 + vst $vr0, $a1, 960 vst $vr14, $a1, 976 - vxor.v $vr13, $vr19, $vr12 - vxor.v $vr14, $vr20, $vr12 - vst $vr13, $a1, 992 + vxor.v $vr0, $vr20, $vr13 + vxor.v $vr14, $vr21, $vr13 + vst $vr0, $a1, 992 vst $vr14, $a1, 1008 b .LBB0_1 .LBB0_4: - fld.d $fs7, $sp, 16 # 8-byte Folded Reload - fld.d $fs6, $sp, 24 # 8-byte Folded Reload - fld.d $fs5, $sp, 32 # 8-byte Folded Reload - fld.d $fs4, $sp, 40 # 8-byte Folded Reload - fld.d $fs3, $sp, 48 # 8-byte Folded Reload - fld.d $fs2, $sp, 56 # 8-byte Folded Reload - fld.d $fs1, $sp, 64 # 8-byte Folded Reload - fld.d $fs0, $sp, 72 # 8-byte Folded Reload - ld.d $s4, $sp, 80 # 8-byte Folded Reload - ld.d $s3, $sp, 88 # 8-byte Folded Reload - ld.d $s2, $sp, 96 # 8-byte Folded Reload - ld.d $s1, $sp, 104 # 8-byte Folded Reload - ld.d $s0, $sp, 112 # 8-byte Folded Reload - ld.d $fp, $sp, 120 # 8-byte Folded Reload - addi.d $sp, $sp, 128 + fld.d $fs7, $sp, 32 # 8-byte Folded Reload + fld.d $fs6, $sp, 40 # 8-byte Folded Reload + fld.d $fs5, $sp, 48 # 8-byte Folded Reload + fld.d $fs4, $sp, 56 # 8-byte Folded Reload + fld.d $fs3, $sp, 64 # 8-byte Folded Reload + fld.d $fs2, $sp, 72 # 8-byte Folded Reload + fld.d $fs1, $sp, 80 # 8-byte Folded Reload + fld.d $fs0, $sp, 88 # 8-byte Folded Reload + ld.d $s0, $sp, 96 # 8-byte Folded Reload + ld.d $fp, $sp, 104 # 8-byte Folded Reload + addi.d $sp, $sp, 112 ret .Lfunc_end0: .size main, .Lfunc_end0-main diff --git a/results/SingleSource/UnitTests/Vector/CMakeFiles/Vector-build.dir/build.s b/results/SingleSource/UnitTests/Vector/CMakeFiles/Vector-build.dir/build.s index 93de6247..7c205ad7 100644 --- a/results/SingleSource/UnitTests/Vector/CMakeFiles/Vector-build.dir/build.s +++ b/results/SingleSource/UnitTests/Vector/CMakeFiles/Vector-build.dir/build.s @@ -5,15 +5,10 @@ .word 0x3f9df3b6 # float 1.23399997 .word 0x401874d1 # float 2.38212991 .section .rodata.cst16,"aM",@progbits,16 - .p2align 4, 0x0 + .p2align 3, 0x0 .LCPI0_1: .dword 0x3ff85d3540000000 # double 1.5227558612823486 .dword 0x4016b2bb60000000 # double 5.6745429039001465 -.LCPI0_2: - .word 0 # 0x0 - .word 0 # 0x0 - .word 6 # 0x6 - .word 7 # 0x7 .text .globl main .p2align 5 @@ -31,7 +26,8 @@ main: # @main pcalau12i $a1, %pc_hi20(.LCPI0_0) addi.d $a1, $a1, %pc_lo12(.LCPI0_0) fldx.s $fa0, $a1, $a0 - vst $vr0, $sp, 0 # 16-byte Folded Spill + vreplvei.w $vr1, $vr0, 0 + vst $vr1, $sp, 0 # 16-byte Folded Spill fcvt.d.s $fa0, $fa0 movfr2gr.d $fp, $fa0 pcalau12i $a0, %pc_hi20(.L.str) @@ -53,12 +49,10 @@ main: # @main move $a4, $a1 pcaddu18i $ra, %call36(printf) jirl $ra, $ra, 0 - pcalau12i $a0, %pc_hi20(.LCPI0_2) - vld $vr2, $a0, %pc_lo12(.LCPI0_2) vrepli.b $vr0, 0 vld $vr1, $sp, 0 # 16-byte Folded Reload - vshuf.w $vr2, $vr0, $vr1 - vst $vr2, $sp, 0 # 16-byte Folded Spill + vshuf4i.d $vr1, $vr0, 12 + vst $vr1, $sp, 0 # 16-byte Folded Spill move $a0, $s0 move $a1, $fp move $a2, $fp