Skip to content

Commit 6167e0a

Browse files
committed
optimized llmulhu with exx
1 parent a7e99c1 commit 6167e0a

File tree

1 file changed

+106
-93
lines changed

1 file changed

+106
-93
lines changed

src/crt/llmulhu.src

Lines changed: 106 additions & 93 deletions
Original file line numberDiff line numberDiff line change
@@ -6,105 +6,118 @@
66

77
; BC:UDE:UHL = ((uint128_t)BC:UDE:UHL * (uint128_t)(SP64)) >> 64
88
__llmulhu:
9+
; Modified version of __llmulu that keeps the upper 64 bits of the 128-bit product in the shadow registers (switched in with exx).
10+
; __llmulhu runs slightly faster than two calls to __llmulu, and is much faster
11+
; than the naive implementation of __llmulhu that calls __llmulu four times.
12+
push af
13+
ld a, i
14+
di
15+
push af
16+
917
push ix
1018
push iy
11-
ld ix, -36
12-
add ix, sp
13-
ld sp, ix
14-
lea ix, ix + 36
15-
16-
ld (ix - 3), bc
17-
ld (ix - 6), de
18-
ld (ix - 9), hl
19-
20-
ld bc, 0
21-
ld (ix - 10), b
22-
ld (ix - 13), bc
23-
ld (ix - 30), bc
24-
ld c, (ix + 12)
25-
ld (ix - 33), bc
26-
ld iy, (ix + 9)
27-
ld (ix - 36), iy
28-
29-
; x_lo * y_lo
30-
ld c, b
31-
ld d, b
32-
inc de
33-
dec.s de
34-
call __llmulu
35-
inc bc
36-
dec.s bc
37-
ld (ix - 16), bc
38-
ld (ix - 19), de
39-
ld b, 0
40-
ld c, b
41-
42-
; x_hi * y_lo
43-
inc.s de
44-
ld d, b
45-
ld e, (ix - 2)
46-
ld hl, (ix - 5)
47-
call __llmulu
48-
inc bc
49-
dec.s bc
50-
ld (ix - 21), bc
51-
ld (ix - 24), de
52-
ld (ix - 27), hl
53-
54-
ld c, (ix + 16)
55-
ld (ix - 33), c
56-
ld iy, (ix + 13)
57-
ld (ix - 36), iy
58-
59-
; x_lo * y_hi
60-
ld b, 0
61-
ld c, b
62-
inc.s de
63-
ld d, b
64-
ld e, (ix - 6)
65-
ld hl, (ix - 9)
66-
call __llmulu
67-
inc bc
68-
dec.s bc
69-
lea iy, ix - 27
70-
call __llmulhu_i72add
71-
lea iy, ix - 18
72-
call __llmulhu_i72add
73-
ld (ix - 16), bc
74-
ld (ix - 19), de
75-
ld bc, 0
76-
77-
; x_hi * y_hi
78-
inc.s de
79-
ld d, b
80-
ld e, (ix - 2)
81-
ld hl, (ix - 5)
82-
call __llmulu
83-
inc bc
84-
dec.s bc
85-
lea iy, ix - 18
86-
call __llmulhu_i72add
87-
ld sp, ix
88-
pop iy
89-
pop ix
90-
ret
9119

92-
__llmulhu_i72add:
93-
; similar to __lladd, except iy points to the stack and is destroyed
94-
push bc
95-
ld bc, (iy + 0)
96-
add hl, bc
20+
ld ix, 0
21+
lea iy, ix - 6
22+
add iy, sp ; cf=1
23+
24+
push de
25+
push hl
26+
ld l, c
27+
ld h, b
28+
ld.s sp, hl
29+
30+
lea hl, iy + 21
31+
ld b, 8
32+
.push_loop:
33+
push af
34+
ld a, (hl)
35+
inc hl
36+
or a, a ; cf=0
37+
djnz .push_loop
38+
39+
sbc hl, hl
40+
ld e, l
41+
ld d, h
42+
43+
exx
44+
sbc hl, hl
45+
ex de, hl
46+
sbc hl, hl
47+
ld c, l
48+
ld b, l
49+
exx
50+
51+
.byte_loop:
52+
scf
53+
adc a, a
54+
55+
.bit_loop:
56+
ex af, af'
57+
58+
add ix, ix
59+
adc hl, hl
9760
ex de, hl
61+
adc.s hl, hl
62+
ex de, hl
63+
64+
exx
65+
adc hl, hl
66+
ex de, hl
67+
adc hl, hl
68+
ex de, hl
69+
rl c
70+
rl b
71+
exx
72+
73+
ex af, af'
74+
75+
jr nc, .add_end
76+
ld bc, (iy)
77+
add ix, bc
9878
ld bc, (iy + 3)
9979
adc hl, bc
10080
ex de, hl
101-
pop bc
102-
jr nc, .no_carry48
81+
adc.s hl, sp
82+
ex de, hl
83+
jr nc, .add_end
84+
exx
85+
inc hl
86+
add hl, de
87+
or a, a
88+
sbc hl, de
89+
jr nz, .add_end_exx
90+
inc de
91+
sbc hl, de
92+
add hl, de
93+
jr nz, .add_end_exx
10394
inc bc
104-
.no_carry48:
105-
ld iy, (iy + 6)
106-
add iy, bc
107-
lea bc, iy
108-
ret
95+
.add_end_exx:
96+
exx
97+
.add_end:
98+
99+
add a, a
100+
jr nz, .bit_loop
101+
102+
pop af
103+
jr nc, .byte_loop
104+
105+
; ld b, d
106+
; ld c, e
107+
; ex de, hl
108+
; lea hl, ix
109+
; BC:UDE:UHL = lower 64 bits (only if the four moves above are enabled; as written they stay in DE:UHL:UIX)
110+
; shadow BC:UDE:UHL = upper 64 bits
111+
exx
109112

110-
extern __llmulu
113+
pop af ; reset SP
114+
pop af ; reset SP
115+
pop iy
116+
pop ix
117+
118+
pop af
119+
jp po, .skipEI
120+
ei
121+
.skipEI:
122+
pop af
123+
ret

0 commit comments

Comments
 (0)