Skip to content

Commit a391d60

Browse files
committed
Add module delay_cycles for cycle accurate delay
This module contains the inline assembly and metaprogramming details. It is then wrapped at the root of lib.
1 parent 6ba963c commit a391d60

File tree

1 file changed

+294
-0
lines changed

1 file changed

+294
-0
lines changed

src/delay_cycles.rs

Lines changed: 294 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,294 @@
1+
use core::arch::asm;
/// `Delayer::delay_impl()` generates the inline assembly to delay by an exact amount of cycles.
///
/// The total number of cycles is computed as CYCLES * MUL / DIV.
/// With a maximum of 25_769_803_784 cycles.
///
/// Zero cycles does nothing. One cycle emits a `nop` instruction. 2 cycles one `rjmp`. Above 5
/// cycles, we get into loops. With counters starting at 8 bits, and progressing through 16, 24,
/// and ultimately 32 bits. For a maximum of 11 instructions.
///
/// Two nightly features are required for this implementation:
/// `#![feature(asm_experimental_arch)]`
/// `#![feature(asm_const)]`
///
/// When the rustc `feature(generic_const_exprs)` is complete
/// (https://github.com/rust-lang/rust/issues/76560) it will become possible to do this directly:
/// ```ignore
/// fn delay_ms<const SECS: u64>() {
///     Delayer::<{SECS * CPU_FREQUENCY_HZ / 1000}>::delay_impl();
/// }
/// ```
///
/// This is also why the code is structured in such a way, with everything as associated consts:
/// associated consts do support evaluation of expressions at compile time just fine, contrary to
/// const generics. The implementation goes from generic consts, to associated consts on the
/// `Delayer` struct. And in turn those associated consts are fed to the `asm!` macro.
///
/// The rustc `feature(asm_const)` is also a work in progress
/// (https://github.com/rust-lang/rust/issues/93332). It appears to work well in the present code.
/// It also depends on the feature discussed in the next paragraph.
///
/// When `feature(inline_const)` (https://github.com/rust-lang/rust/issues/76001) is complete, all
/// the conditionals used in `delay_impl()` can be wrapped within `const {}` blocks. To ensure
/// beyond a shadow of a doubt that the whole function is fully linearised at compile time.
/// Nevertheless; thanks to constant propagation; this already happens implicitly.
///
/// The maximum number of cycles is 25_769_803_784 when `delay_cycles_u32()` iterates 2^32
/// times, `delay_2cycles()` is used twice, and `delay_1cycle()` once.
///
/// Every `delay_cycles_u*()` function has a minimum and maximum number of cycles it can consume.
/// The minimum is: (cycles per run).
/// The maximum is: (cycles per run) + (cycles per iteration) * (counter-1).
/// Note that a counter of zero iterates 2^bits times.
///
/// Example with `delay_cycles_u32()`.
/// Minimum: 9 cycles with 1 iteration.
/// Maximum: 9 + 6 * (2^32-1) == 25_769_803_779 cycles with 2^32 iterations.
///
/// Cycles 1..=5 are implemented by a combination of up to two `delay_2cycles()` and up to one
/// `delay_1cycle()`. Which gets us our maximum of 25_769_803_779 + 5 == 25_769_803_784.
///
/// Technically, beyond this value, the counters of various sizes will be combined until they are
/// all used up. This means the absolute limit is the sum of the maximum cycles of all counters
/// combined plus five:
/// (3+3*0xFF) + (5+4*0xFFFF) + (7+5*0xFF_FFFF) + (9+6*0xFFFF_FFFF) + 5 == 25_853_952_779.
/// But at this point, this is costing 23 instructions, for very little gain (~3.5s at 24MHz).
/// Calling delay_cycles twice would be far more efficient.
pub struct Delayer<const CYCLES: u64, const MUL: u64, const DIV: u64>;
/// Static description of one `delay_cycles_u*()` loop flavour.
struct Cycles {
    /// All-ones mask for the counter width (e.g. 0xFF for the 8-bit loop).
    counter_mask: u64,
    /// Fixed cost of one run: counter setup plus the cheaper final iteration.
    cycles_per_run: u64,
    /// Cost of each additional loop iteration.
    cycles_per_iter: u64,
    /// Largest cycle count this flavour can consume on its own
    /// (counter of zero, i.e. a full 2^bits wrap).
    max_cycles: u64,
}
/// Outcome of trying to serve a cycle budget with one loop flavour.
struct Selection {
    /// True when this loop flavour is emitted at all.
    selected: bool,
    /// Value loaded into the counter register(s); zero encodes a full
    /// wrap-around (2^bits iterations).
    counter: u64,
    /// Cycles left over for the next (smaller) flavour to consume.
    remainder: u64,
}
74+
75+
const fn cycles(counter_mask: u64, cycles_per_run: u64, cycles_per_iter: u64) -> Cycles {
76+
Cycles {
77+
counter_mask,
78+
cycles_per_run,
79+
cycles_per_iter,
80+
max_cycles: cycles_per_run + cycles_per_iter * counter_mask,
81+
}
82+
}
83+
84+
const fn select(info: Cycles, cycles: u64, above: u64) -> Selection {
85+
if !(cycles > above) {
86+
return Selection { selected: false, counter: 0, remainder: cycles };
87+
}
88+
let counter = (cycles - info.cycles_per_run) / info.cycles_per_iter + 1;
89+
let counter = if counter > info.counter_mask {
90+
info.counter_mask + 1
91+
} else {
92+
counter
93+
};
94+
Selection {
95+
selected: true,
96+
counter: if counter > info.counter_mask {
97+
0 // Counter wrap around.
98+
} else {
99+
counter
100+
},
101+
remainder: cycles - (info.cycles_per_run + info.cycles_per_iter * (counter - 1))
102+
}
103+
}
104+
105+
impl<const CYCLES: u64, const MUL: u64, const DIV: u64> Delayer<CYCLES, MUL, DIV> {
106+
// Multiply first to avoid precision loss.
107+
// With a u64 there is no overflow when MUL is lower than:
108+
// (2^64-1)/25_769_803_784 == 715_827_882.
109+
// Since MUL is usually CPU_FREQUENCY_HZ, this allows up to 715.83 MHz.
110+
const TOTAL_CYCLES: u64 = CYCLES * MUL / DIV;
111+
112+
// With `feature(generic_const_exprs) it becomes possible to construct a static assertion.
113+
//const _: [(); 0 - ((Self::TOTAL_CYCLES > 25_769_803_784) as usize)] = [];
114+
115+
// counter mask, cycles per run, cycles per iteration. | cost + worst case remainder cost
116+
const U32_INFO: Cycles = cycles(0xFFFF_FFFF, 9, 6); // 8 + 3
117+
const U24_INFO: Cycles = cycles( 0xFF_FFFF, 7, 5); // 6 + 2
118+
const U16_INFO: Cycles = cycles( 0xFFFF, 5, 4); // 4 + 2
119+
const U8_INFO: Cycles = cycles( 0xFF, 3, 3); // 3 + 1
120+
121+
// The selection process stops at the smallest counter size that can handle the number of
122+
// cycles to consume with a remainder of up to 5 cycles. This will not always produce the
123+
// smallest possible number of instructions. In some cases, the cost of U16+U8 might be one
124+
// instruction lower than that of the U24. This is because the U16+U8 would have no remainder
125+
// contrary to the U24. Many combinations of the various counter sizes are possible, dividing
126+
// the number of cycles more or less evenly. Implementing this without
127+
// `feature(generic_const_exprs) seems daunting. It would require to compute the various
128+
// combinations and compare the cost. Note that gcc-avr intrinsics delay_cycles
129+
// doesn't bother to optimize this if this can be of any consolation.
130+
const U32: Selection = select(Self::U32_INFO, Self::TOTAL_CYCLES, Self::U24_INFO.max_cycles + 4);
131+
const U24: Selection = select(Self::U24_INFO, Self::U32.remainder, Self::U16_INFO.max_cycles + 5);
132+
const U16: Selection = select(Self::U16_INFO, Self::U24.remainder, Self::U8_INFO.max_cycles + 4);
133+
const U8 : Selection = select(Self::U8_INFO, Self::U16.remainder, 5);
134+
// The extras +4, +5, and +4 cycles take into account that even though the number of cycles is
135+
// beyond the capacity of the counter, the overflow can be served by the 1.=5 cycles
136+
// implementation. In those instances, it so happens that the counter of the next size up would
137+
// take more instructions because it also requires a remainder.
138+
139+
// The counters leave up to 5 cycles as a remainder. They are consumed with up to two `rjump`
140+
// and a `nop`.
141+
// 5 cycles => 3 instructions.
142+
// 4 cycles => 2 instructions.
143+
// 3 cycles => 2 instructions.
144+
// 2 cycles => 1 instruction.
145+
// 1 cycle => 1 instruction.
146+
147+
/// 8 instructions.
148+
/// 9 cycles per run.
149+
/// 6 cycles per iteration.
150+
#[inline(always)]
151+
fn delay_cycles_u32() {
152+
unsafe {
153+
asm!(
154+
"ldi {r0:l}, {b0}",
155+
"ldi {r0:h}, {b1}",
156+
"ldi {r2}, {b2}",
157+
"ldi {r3}, {b3}",
158+
"1:",
159+
"sbiw {r0}, 1",
160+
"sbci {r2}, 0",
161+
"sbci {r3}, 0",
162+
"brne 1b",
163+
r0 = out(reg_iw) _,
164+
r2 = out(reg_upper) _,
165+
r3 = out(reg_upper) _,
166+
b0 = const (Self::U32.counter >> 0) as u8,
167+
b1 = const (Self::U32.counter >> 8) as u8,
168+
b2 = const (Self::U32.counter >> 16) as u8,
169+
b3 = const (Self::U32.counter >> 24) as u8,
170+
options(nomem, nostack),
171+
)
172+
}
173+
}
174+
175+
/// 6 instructions.
176+
/// 7 cycles per run.
177+
/// 5 cycles per iteration.
178+
#[inline(always)]
179+
fn delay_cycles_u24() {
180+
// Some way to static assert that COUNTER < 2^24 would be nice.
181+
unsafe {
182+
asm!(
183+
"ldi {r0:l}, {b0}",
184+
"ldi {r0:h}, {b1}",
185+
"ldi {r2}, {b2}",
186+
"1:",
187+
"sbiw {r0}, 1",
188+
"sbci {r2}, 0",
189+
"brne 1b",
190+
r0 = out(reg_iw) _,
191+
r2 = out(reg_upper) _,
192+
b0 = const (Self::U24.counter >> 0) as u8,
193+
b1 = const (Self::U24.counter >> 8) as u8,
194+
b2 = const (Self::U24.counter >> 16) as u8,
195+
options(nomem, nostack),
196+
)
197+
}
198+
}
199+
200+
/// 4 instructions.
201+
/// 5 cycles per run.
202+
/// 4 cycles per iteration.
203+
#[inline(always)]
204+
fn delay_cycles_u16() {
205+
unsafe {
206+
asm!(
207+
"ldi {r0:l}, {b0}",
208+
"ldi {r0:h}, {b1}",
209+
"1:",
210+
"sbiw {r0}, 1",
211+
"brne 1b",
212+
r0 = out(reg_iw) _,
213+
b0 = const (Self::U16.counter >> 0) as u8,
214+
b1 = const (Self::U16.counter >> 8) as u8,
215+
options(nomem, nostack),
216+
)
217+
}
218+
}
219+
220+
/// 3 instructions.
221+
/// 3 cycles per run.
222+
/// 3 cycles per iteration.
223+
#[inline(always)]
224+
fn delay_cycles_u8() {
225+
unsafe {
226+
asm!(
227+
"ldi {r0}, {b0}",
228+
"1:",
229+
"dec {r0}",
230+
"brne 1b",
231+
r0 = out(reg_upper) _,
232+
b0 = const Self::U8.counter,
233+
options(nomem, nostack),
234+
// The carry flag is not touched by `dec`.
235+
// That's the difference between `dec` and `sub 1`.
236+
// Is it possible to tell `asm!` that the carry is untouched?
237+
// Something like `preserves_carry_flag`.
238+
// The compiler wouldn't have to save the carry flag when delay_cycles_u8 is used
239+
// within an outer loop using multiple-precision computations.
240+
)
241+
}
242+
}
243+
244+
/// 1 instruction.
245+
/// 2 cycles per run.
246+
#[inline(always)]
247+
fn delay_2cycles() {
248+
unsafe { asm!("rjmp .", options(nomem, nostack, preserves_flags),) }
249+
}
250+
251+
/// 1 instruction.
252+
/// 1 cycle per run.
253+
#[inline(always)]
254+
fn delay_1cycle() {
255+
unsafe { asm!("nop", options(nomem, nostack, preserves_flags),) }
256+
}
257+
258+
#[inline(always)]
259+
pub fn delay_impl() {
260+
// Cycles 83_886_083 + 4 .. 25_769_803_779 (9+6*0xFFFF_FFFF) + 5
261+
if Self::U32.selected {
262+
Self::delay_cycles_u32();
263+
}
264+
265+
// Cycles 262_146 + 5 ..= 83_886_082 (7+5*0xFF_FFFF) + 4
266+
if Self::U24.selected {
267+
Self::delay_cycles_u24();
268+
}
269+
270+
// Cycles 769 + 4 ..= 262_145 (5+4*0xFFFF) + 5
271+
if Self::U16.selected {
272+
Self::delay_cycles_u16();
273+
}
274+
275+
// Cycles 6 ..= 768 (3+3*0xFF) + 4
276+
if Self::U8.selected {
277+
Self::delay_cycles_u8();
278+
}
279+
280+
// Remaining cycles 1..=5.
281+
282+
if Self::U8.remainder >= 4 {
283+
Self::delay_2cycles();
284+
}
285+
286+
if Self::U8.remainder >= 2 {
287+
Self::delay_2cycles();
288+
}
289+
290+
if Self::U8.remainder % 2 == 1 {
291+
Self::delay_1cycle();
292+
}
293+
}
294+
}

0 commit comments

Comments
 (0)