Skip to content

Commit 5bc8571

Browse files
authored
fix #17, more optimizations (#18)
* fix #17, more optimizations
1 parent d926090 commit 5bc8571

File tree

9 files changed

+200
-56
lines changed

9 files changed

+200
-56
lines changed

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/)
66
and this project adheres to [Semantic Versioning](http://semver.org/).
77

88

9+
## [0.4.1] - 2024-10-31
10+
- fix #17, add more optimizations, kudos to nt314p
11+
912
## [0.4.0] - 2024-09-03
1013
- fix #15, loop unroll option, improving performance, kudos to nt314p
1114
- fixed bug in test program (see #15)

FastShiftOut.cpp

Lines changed: 52 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
//
22
// FILE: FastShiftOut.cpp
33
// AUTHOR: Rob Tillaart
4-
// VERSION: 0.4.0
4+
// VERSION: 0.4.1
55
// PURPOSE: ShiftOut that implements the Print interface
66
// DATE: 2013-08-22
77
// URL: https://github.com/RobTillaart/FastShiftOut
@@ -170,8 +170,12 @@ size_t FastShiftOut::writeLSBFIRST(uint8_t data)
170170
uint8_t oldSREG = SREG;
171171
noInterrupts();
172172

173-
if ((value & 0x01) == 0) *localDataOutRegister &= outmask2;
174-
else *localDataOutRegister |= outmask1;
173+
// See discussion #17
174+
uint8_t d0 = *localDataOutRegister & outmask2; // cache 0
175+
uint8_t d1 = d0 | outmask1; // cache 1
176+
177+
if ((value & 0x01) == 0) *localDataOutRegister = d0;
178+
else *localDataOutRegister = d1;
175179
// *_clockRegister |= cbmask1;
176180
// *_clockRegister &= cbmask2;
177181
// following code is allowed as interrupts are disabled.
@@ -180,44 +184,44 @@ size_t FastShiftOut::writeLSBFIRST(uint8_t data)
180184
*localClockRegister = r | cbmask1; // set one bit
181185
*localClockRegister = r; // reset bit
182186

183-
if ((value & 0x02) == 0) *localDataOutRegister &= outmask2;
184-
else *localDataOutRegister |= outmask1;
187+
if ((value & 0x02) == 0) *localDataOutRegister = d0;
188+
else *localDataOutRegister = d1;
185189
r = *localClockRegister;
186190
*localClockRegister = r | cbmask1; // set one bit
187191
*localClockRegister = r; // reset it
188192

189-
if ((value & 0x04) == 0) *localDataOutRegister &= outmask2;
190-
else *localDataOutRegister |= outmask1;
193+
if ((value & 0x04) == 0) *localDataOutRegister = d0;
194+
else *localDataOutRegister = d1;
191195
r = *localClockRegister;
192196
*localClockRegister = r | cbmask1; // set one bit
193197
*localClockRegister = r; // reset it
194198

195-
if ((value & 0x08) == 0) *localDataOutRegister &= outmask2;
196-
else *localDataOutRegister |= outmask1;
199+
if ((value & 0x08) == 0) *localDataOutRegister = d0;
200+
else *localDataOutRegister = d1;
197201
r = *localClockRegister;
198202
*localClockRegister = r | cbmask1; // set one bit
199203
*localClockRegister = r; // reset it
200204

201-
if ((value & 0x10) == 0) *localDataOutRegister &= outmask2;
202-
else *localDataOutRegister |= outmask1;
205+
if ((value & 0x10) == 0) *localDataOutRegister = d0;
206+
else *localDataOutRegister = d1;
203207
r = *localClockRegister;
204208
*localClockRegister = r | cbmask1; // set one bit
205209
*localClockRegister = r; // reset it
206210

207-
if ((value & 0x20) == 0) *localDataOutRegister &= outmask2;
208-
else *localDataOutRegister |= outmask1;
211+
if ((value & 0x20) == 0) *localDataOutRegister = d0;
212+
else *localDataOutRegister = d1;
209213
r = *localClockRegister;
210214
*localClockRegister = r | cbmask1; // set one bit
211215
*localClockRegister = r; // reset it
212216

213-
if ((value & 0x40) == 0) *localDataOutRegister &= outmask2;
214-
else *localDataOutRegister |= outmask1;
217+
if ((value & 0x40) == 0) *localDataOutRegister = d0;
218+
else *localDataOutRegister = d1;
215219
r = *localClockRegister;
216220
*localClockRegister = r | cbmask1; // set one bit
217221
*localClockRegister = r; // reset it
218222

219-
if ((value & 0x80) == 0) *localDataOutRegister &= outmask2;
220-
else *localDataOutRegister |= outmask1;
223+
if ((value & 0x80) == 0) *localDataOutRegister = d0;
224+
else *localDataOutRegister = d1;
221225
r = *localClockRegister;
222226
*localClockRegister = r | cbmask1; // set one bit
223227
*localClockRegister = r; // reset it
@@ -238,11 +242,14 @@ size_t FastShiftOut::writeLSBFIRST(uint8_t data)
238242
uint8_t oldSREG = SREG;
239243
noInterrupts();
240244

245+
// See discussion #17
246+
uint8_t d0 = *localDataOutRegister & outmask2; // cache 0
247+
uint8_t d1 = d0 | outmask1; // cache 1
241248
for (uint8_t m = 1; m > 0; m <<= 1)
242249
{
243250
// process one bit
244-
if ((value & m) == 0) *localDataOutRegister &= outmask2;
245-
else *localDataOutRegister |= outmask1;
251+
if ((value & m) == 0) *localDataOutRegister = d0;
252+
else *localDataOutRegister = d1;
246253
uint8_t r = *localClockRegister;
247254
*localClockRegister = r | cbmask1; // set one bit
248255
*localClockRegister = r; // reset it
@@ -284,8 +291,12 @@ size_t FastShiftOut::writeMSBFIRST(uint8_t data)
284291
uint8_t oldSREG = SREG;
285292
noInterrupts();
286293

287-
if ((value & 0x80) == 0) *localDataOutRegister &= outmask2;
288-
else *localDataOutRegister |= outmask1;
294+
// See discussion #17
295+
uint8_t d0 = *localDataOutRegister & outmask2; // cache 0
296+
uint8_t d1 = d0 | outmask1; // cache 1
297+
298+
if ((value & 0x80) == 0) *localDataOutRegister = d0;
299+
else *localDataOutRegister = d1;
289300
// *localClockRegister |= cbmask1;
290301
// *localClockRegister &= cbmask2;
291302
// following code is allowed as interrupts are disabled.
@@ -294,44 +305,44 @@ size_t FastShiftOut::writeMSBFIRST(uint8_t data)
294305
*localClockRegister = r | cbmask1; // set one bit
295306
*localClockRegister = r; // reset it
296307

297-
if ((value & 0x40) == 0) *localDataOutRegister &= outmask2;
298-
else *localDataOutRegister |= outmask1;
308+
if ((value & 0x40) == 0) *localDataOutRegister = d0;
309+
else *localDataOutRegister = d1;
299310
r = *localClockRegister;
300311
*localClockRegister = r | cbmask1; // set one bit
301312
*localClockRegister = r; // reset it
302313

303-
if ((value & 0x20) == 0) *localDataOutRegister &= outmask2;
304-
else *localDataOutRegister |= outmask1;
314+
if ((value & 0x20) == 0) *localDataOutRegister = d0;
315+
else *localDataOutRegister = d1;
305316
r = *localClockRegister;
306317
*localClockRegister = r | cbmask1; // set one bit
307318
*localClockRegister = r; // reset it
308319

309-
if ((value & 0x10) == 0) *localDataOutRegister &= outmask2;
310-
else *localDataOutRegister |= outmask1;
320+
if ((value & 0x10) == 0) *localDataOutRegister = d0;
321+
else *localDataOutRegister = d1;
311322
r = *localClockRegister;
312323
*localClockRegister = r | cbmask1; // set one bit
313324
*localClockRegister = r; // reset it
314325

315-
if ((value & 0x08) == 0) *localDataOutRegister &= outmask2;
316-
else *localDataOutRegister |= outmask1;
326+
if ((value & 0x08) == 0) *localDataOutRegister = d0;
327+
else *localDataOutRegister = d1;
317328
r = *localClockRegister;
318329
*localClockRegister = r | cbmask1; // set one bit
319330
*localClockRegister = r; // reset it
320331

321-
if ((value & 0x04) == 0) *localDataOutRegister &= outmask2;
322-
else *localDataOutRegister |= outmask1;
332+
if ((value & 0x04) == 0) *localDataOutRegister = d0;
333+
else *localDataOutRegister = d1;
323334
r = *localClockRegister;
324335
*localClockRegister = r | cbmask1; // set one bit
325336
*localClockRegister = r; // reset it
326337

327-
if ((value & 0x02) == 0) *localDataOutRegister &= outmask2;
328-
else *localDataOutRegister |= outmask1;
338+
if ((value & 0x02) == 0) *localDataOutRegister = d0;
339+
else *localDataOutRegister = d1;
329340
r = *localClockRegister;
330341
*localClockRegister = r | cbmask1; // set one bit
331342
*localClockRegister = r; // reset it
332343

333-
if ((value & 0x01) == 0) *localDataOutRegister &= outmask2;
334-
else *localDataOutRegister |= outmask1;
344+
if ((value & 0x01) == 0) *localDataOutRegister = d0;
345+
else *localDataOutRegister = d1;
335346
r = *localClockRegister;
336347
*localClockRegister = r | cbmask1; // set one bit
337348
*localClockRegister = r; // reset it
@@ -352,16 +363,20 @@ size_t FastShiftOut::writeMSBFIRST(uint8_t data)
352363
uint8_t oldSREG = SREG;
353364
noInterrupts();
354365

366+
// See discussion #17
367+
uint8_t d0 = *localDataOutRegister & outmask2; // cache 0
368+
uint8_t d1 = d0 | outmask1; // cache 1
355369
for (uint8_t m = 0x80; m > 0; m >>= 1)
356370
{
357371
// process one bit
358-
if ((value & m) == 0) *localDataOutRegister &= outmask2;
359-
else *localDataOutRegister |= outmask1;
372+
if ((value & m) == 0) *localDataOutRegister = d0;
373+
else *localDataOutRegister = d1;
360374
uint8_t r = *localClockRegister;
361375
*localClockRegister = r | cbmask1; // set one bit
362376
*localClockRegister = r; // reset it
363377
}
364378

379+
365380
// restore interrupt state
366381
SREG = oldSREG;
367382

FastShiftOut.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
//
33
// FILE: FastShiftOut.h
44
// AUTHOR: Rob Tillaart
5-
// VERSION: 0.4.0
5+
// VERSION: 0.4.1
66
// PURPOSE: shiftOut class that implements the Print interface
77
// DATE: 2013-08-22
88
// URL: https://github.com/RobTillaart/FastShiftOut
@@ -11,10 +11,10 @@
1111
#include "Arduino.h"
1212
#include "Print.h"
1313

14-
#define FASTSHIFTOUT_LIB_VERSION (F("0.4.0"))
14+
#define FASTSHIFTOUT_LIB_VERSION (F("0.4.1"))
1515

1616
// uncomment next line to get SPEED OPTIMIZED CODE
17-
#define FASTSHIFTOUT_AVR_LOOP_UNROLLED 1
17+
// #define FASTSHIFTOUT_AVR_LOOP_UNROLLED 1
1818

1919

2020
class FastShiftOut : public Print

README.md

Lines changed: 15 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -58,23 +58,25 @@ Numbers may vary depending on bit-order flag.
5858
Indicative time in microseconds, Arduino UNO, IDE 1.8.19, measured over 1000 calls.
5959
(delta between 2 calls and 1 call to eliminate overhead)
6060

61-
| function | 0.2.4 | 0.3.1 | 0.3.3 | 0.4.0 | 0.4.0L |
62-
|:-------------------------|--------:|---------:|---------:|---------:|---------:|
63-
| write() | 21.66 | 22.48 | 22.27 | 14.10 | 11.51 |
64-
| writeLSBFIRST() | 22.94 | 23.37 | 22.25 | 14.09 | 11.50 |
65-
| writeMSBFIRST() | 20.30 | 21.86 | 22.26 | 14.08 | 11.50 |
66-
| reference shiftOut() | 89.74 | 89.74 | 89.59 | 89.60 | 89.60 |
67-
| write16() | na | na | 45.39 | 29.06 | 23.89 |
68-
| write24() | na | na | 67.66 | 43.12 | 35.40 |
69-
| write32() | na | na | 89.91 | 57.22 | 46.90 |
70-
| println("Hello world") | na | 328.92 | 328.92 | 222.68 | 189.20 |
71-
| println(1357) | na | 313.56 | 311.60 | 262.60 | 247.12 |
72-
| println(3.14159265, 4) | na | 717.36 | 716.04 | 650.68 | 629.96 |
61+
| function | 0.2.4 | 0.3.1 | 0.3.3 | 0.4.0 | 0.4.0L | 0.4.1 | 0.4.1L |
62+
|:-------------------------|--------:|---------:|---------:|---------:|---------:|---------:|---------:|
63+
| write() | 21.66 | 22.48 | 22.27 | 14.10 | 11.51 | 12.83 | 9.12 |
64+
| writeLSBFIRST() | 22.94 | 23.37 | 22.25 | 14.09 | 11.50 | 12.82 | 9.12 |
65+
| writeMSBFIRST() | 20.30 | 21.86 | 22.26 | 14.08 | 11.50 | 12.82 | 9.12 |
66+
| reference shiftOut() | 89.74 | 89.74 | 89.59 | 89.60 | 89.60 | 89.59 | 89.60 |
67+
| write16() | na | na | 45.39 | 29.06 | 23.89 | 26.34 | 19.11 |
68+
| write24() | na | na | 67.66 | 43.12 | 35.40 | 39.36 | 28.23 |
69+
| write32() | na | na | 89.91 | 57.22 | 46.90 | 52.18 | 37.34 |
70+
| println("Hello world") | na | 328.92 | 328.92 | 222.68 | 189.20 | 206.32 | 158.12 |
71+
| println(1357) | na | 313.56 | 311.60 | 262.60 | 247.12 | 255.04 | 232.80 |
72+
| println(3.14159265, 4) | na | 717.36 | 716.04 | 650.68 | 629.96 | 640.52 | 610.92 |
7373

7474
- Note: 0.3.3 improved the measurement only, not the code itself.
7575
- Note: 0.3.3 numbers fixed when implementing 0.4.0. (error in test sketch).
7676
- Note: 0.4.0 measured with loop unroll flag disabled.
7777
- Note: 0.4.0L measured with loop unrolled flag enabled.
78+
- Note: 0.4.1 / 0.4.1L measured the same way (loop unroll flag disabled / enabled respectively).
79+
- Note: The loop unrolled build is (8046 - 7818 = 228) bytes larger in size.
7880

7981

8082
### Related
@@ -84,6 +86,7 @@ Indicative time in microseconds, Arduino UNO, IDE 1.8.19, measured over 1000 cal
8486
- https://github.com/RobTillaart/FastShiftOut
8587
- https://github.com/RobTillaart/ShiftInSlow
8688
- https://github.com/RobTillaart/ShiftOutSlow
89+
- https://github.com/RobTillaart/SWSPI (experimental)
8790

8891

8992
## Interface

examples/FastShiftOut_scope_test/FastShiftOut_scope_test.ino

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,8 +62,8 @@ void loop()
6262
// shiftOut(12, 13, MSBFIRST, 0x55);
6363

6464
FSO.write(0x55);
65-
delayMicroseconds(100);
65+
delayMicroseconds(50);
6666
}
6767

6868

69-
// -- END OF FILE --
69+
// -- END OF FILE --

examples/FastShiftOut_test/performance_0.4.0.txt

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,3 +43,41 @@ println(3.14159265, 4): 629.96
4343

4444
done ...
4545

46+
47+
no loop unroll version
48+
49+
Performance - time in us
50+
write: 15.34
51+
write: 29.43
52+
Delta: 14.10
53+
54+
writeLSBFIRST: 14.34
55+
writeLSBFIRST: 28.42
56+
Delta: 14.09
57+
58+
writeMSBFIRST: 14.34
59+
writeMSBFIRST: 28.42
60+
Delta: 14.08
61+
62+
Standard shiftOut1: 89.85
63+
Standard shiftOut2: 179.44
64+
Delta: 89.60
65+
66+
write16: 29.31
67+
write16: 58.35
68+
Delta: 29.04
69+
70+
write24: 43.38
71+
write24: 86.51
72+
Delta: 43.13
73+
74+
write32: 57.47
75+
write32: 114.68
76+
Delta: 57.22
77+
78+
79+
Test print interface
80+
println("Hello world"): 222.68
81+
println(1357): 262.60
82+
println(3.14159265, 4): 650.68
83+

0 commit comments

Comments
 (0)