|
| 1 | +use core::arch::asm; |
| 2 | + |
/// Delayer::delay_impl() generates the inline assembly to delay by an exact number of cycles.
///
/// The total number of cycles is computed as CYCLES * MUL / DIV.
/// With a maximum of 25_769_803_784 cycles.
///
/// Zero cycles does nothing. One cycle emits a `nop` instruction, two cycles one `rjmp`.
/// Above 5 cycles, we get into loops. With counters starting at 8 bits, and progressing
/// through 16, 24, and ultimately 32 bits. For a maximum of 11 instructions.
///
///
/// Two nightly features are required for this implementation:
/// `#![feature(asm_experimental_arch)]`
/// `#![feature(asm_const)]`
///
/// When the rustc `feature(generic_const_exprs)` is complete
/// (https://github.com/rust-lang/rust/issues/76560) it will become possible to do this directly:
/// ```ignore
/// fn delay_ms<const SECS: u64>() {
///     Delayer::<{SECS * CPU_FREQUENCY_HZ / 1000}>::delay_impl();
/// }
/// ```
///
/// This is also why the code is structured the way it is, with everything as associated
/// consts: associated consts do support evaluation of expressions at compile time just fine,
/// contrary to const generics. The implementation goes from generic consts, to associated
/// consts on the Delayer struct. And in turn those associated consts are fed to the `asm!`
/// macro.
///
/// The rustc `feature(asm_const)` is also a work in progress
/// (https://github.com/rust-lang/rust/issues/93332). It appears to work well in the present code.
/// It also depends on the feature discussed in the next paragraph.
///
/// When `feature(inline_const)` (https://github.com/rust-lang/rust/issues/76001) is complete, all
/// the conditionals used in `delay_impl()` can be wrapped within `const {}` blocks. To ensure
/// beyond a shadow of a doubt that the whole function is fully linearised at compile time.
/// Nevertheless; thanks to constant propagation; this already happens implicitly.
///
/// The maximum number of cycles is 25_769_803_784 when `delay_cycles_u32()` iterates 2^32
/// times, `delay_2cycles()` is used twice, and `delay_1cycle()` once.
///
/// Every `delay_cycles_u*()` function has a minimum and maximum number of cycles it can consume.
/// The minimum is: (cycles per run).
/// The maximum is: (cycles per run) + (cycles per iteration) * (counter-1).
/// Note that a counter of zero iterates 2^bits times.
///
/// Example with `delay_cycles_u32()`:
/// Minimum: 9 cycles with 1 iteration.
/// Maximum: 9 + 6 * (2^32-1) == 25_769_803_779 cycles with 2^32 iterations.
///
/// Cycles 1..=5 are implemented by a combination of up to two `delay_2cycles()` and up to one
/// `delay_1cycle()`. Which gets us our maximum of 25_769_803_779 + 5 == 25_769_803_784.
///
/// Technically, beyond this value, the counters of various sizes will be combined until they are
/// all used up. This means the absolute limit is the sum of the maximum cycles of all counters
/// combined plus five:
/// (3+3*0xFF) + (5+4*0xFFFF) + (7+5*0xFF_FFFF) + (9+6*0xFFFF_FFFF) + 5 == 25_853_952_779.
/// But at this point, this is costing 23 instructions, for very little gain (~3.5s at 24MHz).
/// Calling delay_cycles twice would be far more efficient.
pub struct Delayer<const CYCLES: u64, const MUL: u64, const DIV: u64>;
| 61 | + |
/// Static description of one `delay_cycles_u*()` loop flavour.
struct Cycles {
    // All-ones mask of the counter width (e.g. 0xFF for the 8-bit loop).
    counter_mask: u64,
    // Cycles consumed by a single-iteration run (the loop's minimum cost).
    cycles_per_run: u64,
    // Cycles consumed by each additional iteration.
    cycles_per_iter: u64,
    // Most cycles the loop can consume on its own:
    // cycles_per_run + cycles_per_iter * counter_mask (counter 0 wraps to 2^bits iterations).
    max_cycles: u64,
}
| 68 | + |
/// Outcome of `select()` for one loop flavour.
struct Selection {
    // True when this loop flavour participates in the delay.
    selected: bool,
    // Initial counter value for the loop; 0 encodes a full wrap-around (2^bits iterations).
    counter: u64,
    // Cycles left over for the next (smaller) flavour to consume.
    remainder: u64,
}
| 74 | + |
| 75 | +const fn cycles(counter_mask: u64, cycles_per_run: u64, cycles_per_iter: u64) -> Cycles { |
| 76 | + Cycles { |
| 77 | + counter_mask, |
| 78 | + cycles_per_run, |
| 79 | + cycles_per_iter, |
| 80 | + max_cycles: cycles_per_run + cycles_per_iter * counter_mask, |
| 81 | + } |
| 82 | +} |
| 83 | + |
| 84 | +const fn select(info: Cycles, cycles: u64, above: u64) -> Selection { |
| 85 | + if !(cycles > above) { |
| 86 | + return Selection { selected: false, counter: 0, remainder: cycles }; |
| 87 | + } |
| 88 | + let counter = (cycles - info.cycles_per_run) / info.cycles_per_iter + 1; |
| 89 | + let counter = if counter > info.counter_mask { |
| 90 | + info.counter_mask + 1 |
| 91 | + } else { |
| 92 | + counter |
| 93 | + }; |
| 94 | + Selection { |
| 95 | + selected: true, |
| 96 | + counter: if counter > info.counter_mask { |
| 97 | + 0 // Counter wrap around. |
| 98 | + } else { |
| 99 | + counter |
| 100 | + }, |
| 101 | + remainder: cycles - (info.cycles_per_run + info.cycles_per_iter * (counter - 1)) |
| 102 | + } |
| 103 | +} |
| 104 | + |
impl<const CYCLES: u64, const MUL: u64, const DIV: u64> Delayer<CYCLES, MUL, DIV> {
    // Multiply first to avoid precision loss.
    // With a u64 there is no overflow when MUL is lower than:
    // (2^64-1)/25_769_803_784 == 715_827_882.
    // Since MUL is usually CPU_FREQUENCY_HZ, this allows up to 715.83 MHz.
    const TOTAL_CYCLES: u64 = CYCLES * MUL / DIV;

    // With `feature(generic_const_exprs)` it becomes possible to construct a static assertion.
    //const _: [(); 0 - ((Self::TOTAL_CYCLES > 25_769_803_784) as usize)] = [];

    // counter mask, cycles per run, cycles per iteration. | cost + worst case remainder cost
    const U32_INFO: Cycles = cycles(0xFFFF_FFFF, 9, 6); // 8 + 3
    const U24_INFO: Cycles = cycles(  0xFF_FFFF, 7, 5); // 6 + 2
    const U16_INFO: Cycles = cycles(     0xFFFF, 5, 4); // 4 + 2
    const U8_INFO: Cycles = cycles(       0xFF, 3, 3); // 3 + 1

    // The selection process stops at the smallest counter size that can handle the number of
    // cycles to consume with a remainder of up to 5 cycles. This will not always produce the
    // smallest possible number of instructions. In some cases, the cost of U16+U8 might be one
    // instruction lower than that of the U24. This is because the U16+U8 would have no remainder
    // contrary to the U24. Many combinations of the various counter sizes are possible, dividing
    // the number of cycles more or less evenly. Implementing this without
    // `feature(generic_const_exprs)` seems daunting. It would require to compute the various
    // combinations and compare the cost. Note that gcc-avr's delay_cycles intrinsic
    // doesn't bother to optimize this, if this can be of any consolation.
    const U32: Selection = select(Self::U32_INFO, Self::TOTAL_CYCLES, Self::U24_INFO.max_cycles + 4);
    const U24: Selection = select(Self::U24_INFO, Self::U32.remainder, Self::U16_INFO.max_cycles + 5);
    const U16: Selection = select(Self::U16_INFO, Self::U24.remainder, Self::U8_INFO.max_cycles + 4);
    const U8 : Selection = select(Self::U8_INFO, Self::U16.remainder, 5);
    // The extra +4, +5, and +4 cycles take into account that even though the number of cycles is
    // beyond the capacity of the counter, the overflow can be served by the 1..=5 cycles
    // implementation. In those instances, it so happens that the counter of the next size up would
    // take more instructions because it also requires a remainder.

    // The counters leave up to 5 cycles as a remainder. They are consumed with up to two `rjmp`
    // and a `nop`.
    // 5 cycles => 3 instructions.
    // 4 cycles => 2 instructions.
    // 3 cycles => 2 instructions.
    // 2 cycles => 1 instruction.
    // 1 cycle  => 1 instruction.

    /// 8 instructions.
    /// 9 cycles per run.
    /// 6 cycles per iteration.
    ///
    /// Counts a 32-bit value down to zero: `sbiw` decrements the low 16-bit
    /// register pair and the two `sbci`s propagate the borrow into bytes 2
    /// and 3. On AVR the Z flag accumulates across `sbci`, so `brne` only
    /// falls through once all four bytes are zero (see the AVR instruction
    /// set manual).
    #[inline(always)]
    fn delay_cycles_u32() {
        // SAFETY: pure busy-wait. No memory or stack is touched (`nomem`,
        // `nostack`) and every register written is declared as an output.
        unsafe {
            asm!(
                "ldi {r0:l}, {b0}",
                "ldi {r0:h}, {b1}",
                "ldi {r2}, {b2}",
                "ldi {r3}, {b3}",
                "1:",
                "sbiw {r0}, 1",
                "sbci {r2}, 0",
                "sbci {r3}, 0",
                "brne 1b",
                r0 = out(reg_iw) _,
                r2 = out(reg_upper) _,
                r3 = out(reg_upper) _,
                // The compile-time counter is split into immediate bytes for the `ldi`s.
                b0 = const (Self::U32.counter >> 0) as u8,
                b1 = const (Self::U32.counter >> 8) as u8,
                b2 = const (Self::U32.counter >> 16) as u8,
                b3 = const (Self::U32.counter >> 24) as u8,
                options(nomem, nostack),
            )
        }
    }

    /// 6 instructions.
    /// 7 cycles per run.
    /// 5 cycles per iteration.
    ///
    /// Same down-count scheme as `delay_cycles_u32()`, with a 24-bit counter.
    #[inline(always)]
    fn delay_cycles_u24() {
        // Some way to static assert that COUNTER < 2^24 would be nice.
        // SAFETY: pure busy-wait. No memory or stack is touched (`nomem`,
        // `nostack`) and every register written is declared as an output.
        unsafe {
            asm!(
                "ldi {r0:l}, {b0}",
                "ldi {r0:h}, {b1}",
                "ldi {r2}, {b2}",
                "1:",
                "sbiw {r0}, 1",
                "sbci {r2}, 0",
                "brne 1b",
                r0 = out(reg_iw) _,
                r2 = out(reg_upper) _,
                b0 = const (Self::U24.counter >> 0) as u8,
                b1 = const (Self::U24.counter >> 8) as u8,
                b2 = const (Self::U24.counter >> 16) as u8,
                options(nomem, nostack),
            )
        }
    }

    /// 4 instructions.
    /// 5 cycles per run.
    /// 4 cycles per iteration.
    ///
    /// 16-bit down-count: `sbiw` handles the whole register pair at once.
    #[inline(always)]
    fn delay_cycles_u16() {
        // SAFETY: pure busy-wait. No memory or stack is touched (`nomem`,
        // `nostack`) and the register pair written is declared as an output.
        unsafe {
            asm!(
                "ldi {r0:l}, {b0}",
                "ldi {r0:h}, {b1}",
                "1:",
                "sbiw {r0}, 1",
                "brne 1b",
                r0 = out(reg_iw) _,
                b0 = const (Self::U16.counter >> 0) as u8,
                b1 = const (Self::U16.counter >> 8) as u8,
                options(nomem, nostack),
            )
        }
    }

    /// 3 instructions.
    /// 3 cycles per run.
    /// 3 cycles per iteration.
    #[inline(always)]
    fn delay_cycles_u8() {
        // SAFETY: pure busy-wait. No memory or stack is touched (`nomem`,
        // `nostack`) and the register written is declared as an output.
        unsafe {
            asm!(
                "ldi {r0}, {b0}",
                "1:",
                "dec {r0}",
                "brne 1b",
                r0 = out(reg_upper) _,
                b0 = const Self::U8.counter,
                options(nomem, nostack),
                // The carry flag is not touched by `dec`.
                // That's the difference between `dec` and `sub 1`.
                // Is it possible to tell `asm!` that the carry is untouched?
                // Something like `preserves_carry_flag`.
                // The compiler wouldn't have to save the carry flag when delay_cycles_u8 is used
                // within an outer loop using multiple-precision computations.
            )
        }
    }

    /// 1 instruction.
    /// 2 cycles per run.
    ///
    /// `rjmp .` is the idiomatic AVR two-cycle no-op: a relative jump to the
    /// immediately following instruction.
    #[inline(always)]
    fn delay_2cycles() {
        // SAFETY: single jump forward; no registers, memory, or flags are affected.
        unsafe { asm!("rjmp .", options(nomem, nostack, preserves_flags),) }
    }

    /// 1 instruction.
    /// 1 cycle per run.
    #[inline(always)]
    fn delay_1cycle() {
        // SAFETY: `nop` has no effect other than consuming one cycle.
        unsafe { asm!("nop", options(nomem, nostack, preserves_flags),) }
    }

    /// Emits the full delay sequence for `TOTAL_CYCLES`.
    ///
    /// Every condition below is a compile-time constant, so after constant
    /// propagation only the selected loops and padding instructions remain.
    #[inline(always)]
    pub fn delay_impl() {
        // Cycles 83_886_083 + 4 .. 25_769_803_779 (9+6*0xFFFF_FFFF) + 5
        if Self::U32.selected {
            Self::delay_cycles_u32();
        }

        // Cycles 262_146 + 5 ..= 83_886_082 (7+5*0xFF_FFFF) + 4
        if Self::U24.selected {
            Self::delay_cycles_u24();
        }

        // Cycles 769 + 4 ..= 262_145 (5+4*0xFFFF) + 5
        if Self::U16.selected {
            Self::delay_cycles_u16();
        }

        // Cycles 6 ..= 768 (3+3*0xFF) + 4
        if Self::U8.selected {
            Self::delay_cycles_u8();
        }

        // Remaining cycles 1..=5, padded with two-cycle and one-cycle no-ops.

        if Self::U8.remainder >= 4 {
            Self::delay_2cycles();
        }

        if Self::U8.remainder >= 2 {
            Self::delay_2cycles();
        }

        if Self::U8.remainder % 2 == 1 {
            Self::delay_1cycle();
        }
    }
}
0 commit comments