@@ -1050,33 +1050,118 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
		p.From.Offset = int64(condCode)
		p.To.Type = obj.TYPE_REG
		p.To.Reg = v.Reg()
-	case ssa.OpARM64DUFFZERO:
-		// runtime.duffzero expects start address in R20
-		p := s.Prog(obj.ADUFFZERO)
-		p.To.Type = obj.TYPE_MEM
-		p.To.Name = obj.NAME_EXTERN
-		p.To.Sym = ir.Syms.Duffzero
-		p.To.Offset = v.AuxInt
	case ssa.OpARM64LoweredZero:
-		// STP.P (ZR,ZR), 16(R16)
-		// CMP Rarg1, R16
-		// BLE -2(PC)
-		// arg1 is the address of the last 16-byte unit to zero
-		p := s.Prog(arm64.ASTP)
-		p.Scond = arm64.C_XPOST
-		p.From.Type = obj.TYPE_REGREG
-		p.From.Reg = arm64.REGZERO
-		p.From.Offset = int64(arm64.REGZERO)
-		p.To.Type = obj.TYPE_MEM
-		p.To.Reg = arm64.REG_R16
-		p.To.Offset = 16
-		p2 := s.Prog(arm64.ACMP)
-		p2.From.Type = obj.TYPE_REG
-		p2.From.Reg = v.Args[1].Reg()
-		p2.Reg = arm64.REG_R16
-		p3 := s.Prog(arm64.ABLE)
-		p3.To.Type = obj.TYPE_BRANCH
-		p3.To.SetTarget(p)
+		ptrReg := v.Args[0].Reg()
+		n := v.AuxInt
+		if n < 16 {
+			v.Fatalf("Zero too small %d", n)
+		}
+
+		// Generate zeroing instructions.
+		var off int64
+		for n >= 16 {
+			// STP (ZR, ZR), off(ptrReg)
+			zero16(s, ptrReg, off, false)
+			off += 16
+			n -= 16
+		}
+		// Write any fractional portion.
+		// An overlapping 16-byte write can't be used here
+		// because STP's offsets must be a multiple of 8.
+		if n > 8 {
+			// MOVD ZR, off(ptrReg)
+			zero8(s, ptrReg, off)
+			off += 8
+			n -= 8
+		}
+		if n != 0 {
+			// MOVD ZR, off+n-8(ptrReg)
+			// TODO: for n<=4 we could use a smaller write.
+			zero8(s, ptrReg, off+n-8)
+		}
+	case ssa.OpARM64LoweredZeroLoop:
+		ptrReg := v.Args[0].Reg()
+		countReg := v.RegTmp()
+		n := v.AuxInt
+		loopSize := int64(64)
+		if n < 3*loopSize {
+			// - a loop count of 0 won't work.
+			// - a loop count of 1 is useless.
+			// - a loop count of 2 is a code size ~tie
+			//     3 instructions to implement the loop
+			//     4 instructions in the loop body
+			//   vs
+			//     8 instructions in the straightline code
+			//   Might as well use straightline code.
+			v.Fatalf("ZeroLoop size too small %d", n)
+		}
+
+		// Put iteration count in a register.
+		// MOVD $n, countReg
+		p := s.Prog(arm64.AMOVD)
+		p.From.Type = obj.TYPE_CONST
+		p.From.Offset = n / loopSize
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = countReg
+		cntInit := p
+
+		// Zero loopSize bytes starting at ptrReg.
+		// Increment ptrReg by loopSize as a side effect.
+		for range loopSize / 16 {
+			// STP.P (ZR, ZR), 16(ptrReg)
+			zero16(s, ptrReg, 0, true)
+			// TODO: should we use the postincrement form,
+			// or use a separate += 64 instruction?
+			// postincrement saves an instruction, but maybe
+			// it requires more integer units to do the +=16s.
+		}
+		// Decrement loop count.
+		// SUB $1, countReg
+		p = s.Prog(arm64.ASUB)
+		p.From.Type = obj.TYPE_CONST
+		p.From.Offset = 1
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = countReg
+		// Jump to loop header if we're not done yet.
+		// CBNZ head
+		p = s.Prog(arm64.ACBNZ)
+		p.From.Type = obj.TYPE_REG
+		p.From.Reg = countReg
+		p.To.Type = obj.TYPE_BRANCH
+		p.To.SetTarget(cntInit.Link)
+
+		// Multiples of the loop size are now done.
+		n %= loopSize
+
+		// Write any fractional portion.
+		var off int64
+		for n >= 16 {
+			// STP (ZR, ZR), off(ptrReg)
+			zero16(s, ptrReg, off, false)
+			off += 16
+			n -= 16
+		}
+		if n > 8 {
+			// Note: an overlapping 16-byte write can't be used
+			// here because STP's offsets must be a multiple of 8.
+			// MOVD ZR, off(ptrReg)
+			zero8(s, ptrReg, off)
+			off += 8
+			n -= 8
+		}
+		if n != 0 {
+			// MOVD ZR, off+n-8(ptrReg)
+			// TODO: for n<=4 we could use a smaller write.
+			zero8(s, ptrReg, off+n-8)
+		}
+		// TODO: maybe we should use the count register to instead
+		// hold an end pointer and compare against that?
+		//   ADD $n, ptrReg, endReg
+		// then
+		//   CMP ptrReg, endReg
+		//   BNE loop
+		// There's a past-the-end pointer here, any problem with that?
+
	case ssa.OpARM64DUFFCOPY:
		p := s.Prog(obj.ADUFFCOPY)
		p.To.Type = obj.TYPE_MEM
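For reference, here is a minimal standalone sketch (not part of the change) of the store schedule the new straightline `OpARM64LoweredZero` case produces: 16-byte `STP`s while at least 16 bytes remain, an aligned 8-byte store when more than 8 bytes are left, and a final 8-byte store at `off+n-8` that may overlap the previous one. The `planZero` helper and its printed output are hypothetical illustrations; the real lowering emits `obj.Prog` instructions through `zero16`/`zero8`.

```go
package main

import "fmt"

// planZero prints the stores used to zero n bytes (n >= 16) starting at ptr+0,
// mirroring the straightline OpARM64LoweredZero schedule above.
func planZero(n int64) {
	var off int64
	for n >= 16 {
		fmt.Printf("STP (ZR, ZR), %d(ptr)\n", off) // 16-byte store
		off += 16
		n -= 16
	}
	if n > 8 {
		// STP offsets must be a multiple of 8, so no overlapping 16-byte store here.
		fmt.Printf("MOVD ZR, %d(ptr)\n", off) // aligned 8-byte store
		off += 8
		n -= 8
	}
	if n != 0 {
		// Final 8-byte store, possibly overlapping the previous one.
		fmt.Printf("MOVD ZR, %d(ptr)\n", off+n-8)
	}
}

func main() {
	planZero(44) // STP at 0 and 16, MOVD at 32, then an overlapping MOVD at 36
}
```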
@@ -1482,3 +1567,35 @@ func spillArgReg(pp *objw.Progs, p *obj.Prog, f *ssa.Func, t *types.Type, reg in
	p.Pos = p.Pos.WithNotStmt()
	return p
}
+
+// zero16 zeroes 16 bytes at reg+off.
+// If postInc is true, increment reg by 16.
+func zero16(s *ssagen.State, reg int16, off int64, postInc bool) {
+	// STP (ZR, ZR), off(reg)
+	p := s.Prog(arm64.ASTP)
+	p.From.Type = obj.TYPE_REGREG
+	p.From.Reg = arm64.REGZERO
+	p.From.Offset = int64(arm64.REGZERO)
+	p.To.Type = obj.TYPE_MEM
+	p.To.Reg = reg
+	p.To.Offset = off
+	if postInc {
+		if off != 0 {
+			panic("can't postinc with non-zero offset")
+		}
+		// STP.P (ZR, ZR), 16(reg)
+		p.Scond = arm64.C_XPOST
+		p.To.Offset = 16
+	}
+}
+
+// zero8 zeroes 8 bytes at reg+off.
+func zero8(s *ssagen.State, reg int16, off int64) {
+	// MOVD ZR, off(reg)
+	p := s.Prog(arm64.AMOVD)
+	p.From.Type = obj.TYPE_REG
+	p.From.Reg = arm64.REGZERO
+	p.To.Type = obj.TYPE_MEM
+	p.To.Reg = reg
+	p.To.Offset = off
+}
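To round out the picture, a hypothetical sketch (again not part of the change) of the instruction shape the `OpARM64LoweredZeroLoop` case emits for larger sizes: the temporary register is initialized to `n/64`, each iteration issues four post-incrementing `STP`s (64 bytes) while advancing the pointer, `CBNZ` branches back to the first `STP.P` (the instruction at `cntInit.Link`), and the trailing `n % 64` bytes are finished with the straightline tail shown above. Register names and the printing helper below are invented for illustration.

```go
package main

import "fmt"

// sketchZeroLoop prints the rough instruction shape for zeroing n bytes
// (n >= 3*64) with the loop strategy; register names are made up.
func sketchZeroLoop(n int64) {
	const loopSize = 64
	fmt.Printf("MOVD $%d, Rcount\n", n/loopSize) // iteration count (cntInit)
	fmt.Println("loop:")                         // cntInit.Link: the first STP.P below
	for i := 0; i < loopSize/16; i++ {
		fmt.Println("  STP.P (ZR, ZR), 16(Rptr)") // zero 16 bytes, Rptr += 16
	}
	fmt.Println("  SUB  $1, Rcount")
	fmt.Println("  CBNZ Rcount, loop")
	// The trailing n % loopSize bytes are finished with the same
	// straightline zero16/zero8 tail as in the non-loop case.
	fmt.Printf("... plus straightline stores for the remaining %d bytes\n", n%loopSize)
}

func main() {
	sketchZeroLoop(200) // 3 iterations zero 192 bytes; an 8-byte tail store remains
}
```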