@@ -114,3 +114,65 @@ BB0_11:
114114 WORD $ 0xfd000040 // str d0 , [ x2 ]
115115 WORD $ 0xa8c17bfd // ldp x29 , x30 , [ sp ], # 16 ; 16-byte Folded Reload
116116 WORD $ 0xd65f03c0 // ret
117+
// f32_dot_product computes the dot product of two arrays of 32-bit floats
// and stores the sum, widened to a 64-bit float, at the address in `result`.
//
// Arguments (read from the Go argument frame below):
//   x      +0(FP)  -> R0 : address of the first array of 4-byte floats
//   y      +8(FP)  -> R1 : address of the second array of 4-byte floats
//   result +16(FP) -> R2 : address that receives one 8-byte double
//   size   +24(FP) -> R3 : number of elements read from each of x and y
//
// Behaviour:
//   size == 0 : stores 0.0 and returns.
//   size <  8 : scalar loop only, one element per iteration.
//   size >= 8 : 8 elements per iteration in two 4-lane accumulators (v0, v1),
//               horizontal reduction into s0, then the scalar loop for the
//               remaining size % 8 elements.
// All accumulation is done in single precision; only the final sum is
// converted to double before the store.
//
// Registers written: x8-x11, v0-v5, condition flags (x29/x30 are saved and
// restored around the body).
//
// IMPORTANT: every branch is a raw WORD whose PC-relative offset is baked into
// the encoding. The BB1_* labels only document the targets. Do not insert,
// remove or reorder instructions without re-encoding the branch words.
//
// NOTE(review): the prologue pushes 16 bytes below SP although the TEXT
// directive declares a frame size of 0 -- confirm this is acceptable for the
// Go toolchain/runtime this file targets.
118+ TEXT ·f32_dot_product(SB) , $ 0 - 32
119+ MOVD x + 0 (FP) , R0
120+ MOVD y + 8 (FP) , R1
121+ MOVD result + 16 (FP) , R2
122+ MOVD size + 24 (FP) , R3
// Prologue: save frame pointer / link register, set up x29.
123+ WORD $ 0xa9bf7bfd // stp x29 , x30 , [ sp , # - 16 ] ! ; 16-byte Folded Spill
124+ WORD $ 0x910003fd // mov x29 , sp
// size == 0 -> BB1_3 (store 0.0 and return).
125+ WORD $ 0xb40000c3 // cbz x3 , LBB1_3
// size >= 8 (unsigned compare) -> BB1_4 (vector path).
126+ WORD $ 0xf100207f // cmp x3 , # 8
127+ WORD $ 0x54000102 // b.hs LBB1_4
// 0 < size < 8: start index x8 = 0, accumulator s0 = 0, go to the scalar loop setup.
128+ WORD $ 0xd2800008 // mov x8 , # 0
129+ WORD $ 0x2f00e400 // movi d0 , # 0000000000000000
130+ WORD $ 0x14000018 // b LBB1_7
131+
// Empty input: write a 64-bit zero to *result and return.
132+ BB1_3:
133+ WORD $ 0x2f00e400 // movi d0 , # 0000000000000000
134+ WORD $ 0xfd000040 // str d0 , [ x2 ]
135+ WORD $ 0xa8c17bfd // ldp x29 , x30 , [ sp ], # 16 ; 16-byte Folded Reload
136+ WORD $ 0xd65f03c0 // ret
137+
// Vector path setup:
//   x8  = size rounded down to a multiple of 8 (elements handled by the vector loop)
//   x9  = x + 16 bytes, x10 = y + 16 bytes (the loop loads from [ptr - 16])
//   x11 = remaining vector-loop element count
//   v0, v1 = two independent 4-lane accumulators, both zeroed
138+ BB1_4:
139+ WORD $ 0x927df068 // and x8 , x3 , # 0xfffffffffffffff8
140+ WORD $ 0x91004009 // add x9 , x0 , # 16
141+ WORD $ 0x9100402a // add x10 , x1 , # 16
142+ WORD $ 0x6f00e400 // movi.2d v0 , # 0000000000000000
143+ WORD $ 0xaa0803eb // mov x11 , x8
144+ WORD $ 0x6f00e401 // movi.2d v1 , # 0000000000000000
145+
// Vector loop: each iteration loads 32 bytes (8 floats) from each input,
// does v0 += v4 * v2 and v1 += v5 * v3 lane-wise, advances both pointers by
// 32 bytes and decrements x11 by 8. x11 is a non-zero multiple of 8 on entry,
// so the loop exits exactly when it reaches 0.
146+ BB1_5:
147+ WORD $ 0xad7f8d22 // ldp q2 , q3 , [ x9 , # - 16 ]
148+ WORD $ 0xad7f9544 // ldp q4 , q5 , [ x10 , # - 16 ]
149+ WORD $ 0x4e22cc80 // fmla.4s v0 , v4 , v2
150+ WORD $ 0x4e23cca1 // fmla.4s v1 , v5 , v3
151+ WORD $ 0x91008129 // add x9 , x9 , # 32
152+ WORD $ 0x9100814a // add x10 , x10 , # 32
153+ WORD $ 0xf100216b // subs x11 , x11 , # 8
154+ WORD $ 0x54ffff21 // b.ne LBB1_5
// Reduction: add the two accumulators, then two pairwise adds collapse the
// four lanes into the scalar s0.
155+ WORD $ 0x4e20d420 // fadd .4s v0 , v1 , v0
156+ WORD $ 0x6e20d400 // faddp .4s v0 , v0 , v0
157+ WORD $ 0x7e30d800 // faddp .2s s0 , v0
// If size was a multiple of 8 there is no tail: skip straight to the store.
158+ WORD $ 0xeb03011f // cmp x8 , x3
159+ WORD $ 0x54000140 // b.eq LBB1_9
160+
// Scalar tail setup (x8 = index of the first unprocessed element, s0 = running sum):
//   x9  = size - x8          (elements left, always > 0 here)
//   x10 = x8 * 4             (byte offset), then x8 = y + offset, x10 = x + offset
161+ BB1_7:
162+ WORD $ 0xcb080069 // sub x9 , x3 , x8
163+ WORD $ 0xd37ef50a // lsl x10 , x8 , # 2
164+ WORD $ 0x8b0a0028 // add x8 , x1 , x10
165+ WORD $ 0x8b0a000a // add x10 , x0 , x10
166+
// Scalar loop: load one float from each input (post-incrementing the pointer
// by 4 bytes), s0 = s2 * s1 + s0, decrement the remaining count.
167+ BB1_8:
168+ WORD $ 0xbc404541 // ldr s1 , [ x10 ], # 4
169+ WORD $ 0xbc404502 // ldr s2 , [ x8 ], # 4
170+ WORD $ 0x1f010040 // fmadd s0 , s2 , s1 , s0
171+ WORD $ 0xf1000529 // subs x9 , x9 , # 1
172+ WORD $ 0x54ffff81 // b.ne LBB1_8
173+
// Epilogue: widen the single-precision sum to double, store it to *result,
// restore x29/x30 and return.
174+ BB1_9:
175+ WORD $ 0x1e22c000 // fcvt d0 , s0
176+ WORD $ 0xfd000040 // str d0 , [ x2 ]
177+ WORD $ 0xa8c17bfd // ldp x29 , x30 , [ sp ], # 16 ; 16-byte Folded Reload
178+ WORD $ 0xd65f03c0 // ret
0 commit comments