Skip to content

Commit 4bf443c

Browse files
authored
Merge pull request #113 from Neotron-Compute/speedup-chunky4
Now 640x480@4bpp works in the RP2040.
2 parents 351571c + c2e9e31 commit 4bf443c

File tree

3 files changed

+72
-144
lines changed

3 files changed

+72
-144
lines changed

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ version = "0.7.0"
99

1010
[dependencies]
1111
# Useful Cortex-M specific functions (e.g. SysTick)
12-
cortex-m = "0.7"
12+
cortex-m = {version = "0.7", features = ["inline-asm"]}
1313
# The Raspberry Pi RP2040 HAL (so we can turn defmt on)
1414
rp2040-hal = { version = "0.10", features = [ "defmt", "rt", "critical-section-impl", "rom-func-cache" ] }
1515
# Cortex-M run-time (or start-up) code

memory.x

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,18 +25,18 @@ MEMORY {
2525
/*
2626
* This is the bottom of the four striped banks of SRAM in the RP2040.
2727
*/
28-
RAM_OS : ORIGIN = 0x20000000, LENGTH = 0x42000 - 0x9300
28+
RAM_OS : ORIGIN = 0x20000000, LENGTH = 0x42000 - 0x9680
2929
/*
3030
* This is the top of the four striped banks of SRAM in the RP2040, plus
3131
* SRAM_BANK4 and SRAM_BANK5.
3232
*
3333
* This is carefully calculated to give us 8 KiB of stack space and ensure
3434
* the defmt buffer doesn't span across SRAM_BANK3 and SRAM_BANK4.
3535
*
36-
* 0x9300 should be the (size of .data + size of .bss + size of .uninit +
36+
* 0x9680 should be the (size of .data + size of .bss + size of .uninit +
3737
* 0x2000 for the stack).
3838
*/
39-
RAM : ORIGIN = 0x20042000 - 0x9300, LENGTH = 0x9300
39+
RAM : ORIGIN = 0x20042000 - 0x9680, LENGTH = 0x9680
4040
}
4141

4242
/*

src/vga/mod.rs

Lines changed: 68 additions & 140 deletions
Original file line numberDiff line numberDiff line change
@@ -445,8 +445,8 @@ impl RenderEngine {
445445
let line_start_bytes = unsafe { base_ptr.add(line_start_offset_bytes) };
446446
// Get a pointer into our scan-line buffer
447447
let mut scan_line_buffer_ptr = scan_line_buffer.pixel_ptr();
448-
let palette_ptr = VIDEO_PALETTE.as_ptr() as *const RGBColour;
449448
if is_double {
449+
let palette_ptr = VIDEO_PALETTE.as_ptr() as *const RGBColour;
450450
// Double-width mode.
451451
// four RGB pixels (two pairs) per byte
452452
for col in 0..line_len_bytes {
@@ -462,146 +462,10 @@ impl RenderEngine {
462462
}
463463
}
464464
} else {
465-
// Single-width mode - using the interpolator.
466-
//
467-
// It's able to shift and mask out the 4 bits we want from the
468-
// chunky byte, and add the palette pointer, all in a single clock
469-
// cycle.
470-
//
471-
// two RGB pixels (one pair) per byte, so eight RGB pixels (four pairs)
472-
// per word
473-
//
474-
// We give the interpolator `palette_ptr` and `chunky_pixels`.
475-
//
476-
// We want to know the corresponding palette addresses for each of
477-
// the eight 4-bit pixels in `chunky_pixels`. For each pixel `p` in
478-
// the 32-bit word `pppppppp` we want `palette_ptr.offset(p)`, which
479-
// is `palette_ptr + (p * 2)`.
480-
//
481-
// The interpolator calculates `palette_ptr + ((w >> 3) & 0b11110)`
482-
// for our left pixel and `palette_ptr + ((w << 1) & 0b11110)` for
483-
// our right pixel so we can pull out two pixels for each
484-
// interpolator write.
485-
let palette_ptr = VIDEO_PALETTE.as_ptr() as *const RGBColour;
486-
487-
// Set up the interpolator. This is safe because core1 has its own.
488-
let sio = unsafe { &*crate::pac::SIO::ptr() };
489-
sio.interp0_base0().write(|w| {
490-
unsafe { w.bits(palette_ptr as u32) };
491-
w
492-
});
493-
sio.interp0_base1().write(|w| {
494-
unsafe { w.bits(palette_ptr as u32) };
495-
w
496-
});
497-
// lane0 will pull out the palette address of the higher of the two 4-bit chunky pixels in the bottom byte
498-
sio.interp0_ctrl_lane0().write(|w| {
499-
unsafe {
500-
w.shift().bits(3);
501-
w.mask_lsb().bits(1);
502-
w.mask_msb().bits(4);
503-
}
504-
w
505-
});
506-
// lane1 will pull out the palette address of the lower of the two 4-bit chunky pixels in the bottom byte
507-
sio.interp0_ctrl_lane1().write(|w| {
508-
unsafe {
509-
w.shift().bits(31);
510-
w.mask_lsb().bits(1);
511-
w.mask_msb().bits(4);
512-
}
513-
w
514-
});
515-
let line_start_words = line_start_bytes as *const u32;
516-
let line_len_words = line_len_bytes / 4;
517-
for col in 0..line_len_words {
465+
for col in 0..line_len_bytes {
518466
unsafe {
519-
let mut chunky_pixels = line_start_words.add(col).read();
520-
521-
// ========================================================
522-
// First byte (containing two 4-bit pixels)
523-
// ========================================================
524-
525-
sio.interp0_accum0().write(|w| {
526-
w.bits(chunky_pixels);
527-
w
528-
});
529-
sio.interp0_accum1().write(|w| {
530-
w.bits(chunky_pixels);
531-
w
532-
});
533-
// now we get the palette address for the left pixel
534-
let left_addr = sio.interp0_peek_lane0().read().bits() as usize as *const u16;
535-
// and we get the palette address for the right pixel
536-
let right_addr = sio.interp0_peek_lane1().read().bits() as usize as *const u16;
537-
// read from the palette, pair up, and put in the buffer
538-
let pair = RGBPair::from_pixels(RGBColour(*left_addr), RGBColour(*right_addr));
539-
scan_line_buffer_ptr.write(pair);
540-
scan_line_buffer_ptr = scan_line_buffer_ptr.add(1);
541-
542-
// ========================================================
543-
// Second byte (containing two 4-bit pixels)
544-
// ========================================================
545-
546-
chunky_pixels >>= 8;
547-
sio.interp0_accum0().write(|w| {
548-
w.bits(chunky_pixels);
549-
w
550-
});
551-
sio.interp0_accum1().write(|w| {
552-
w.bits(chunky_pixels);
553-
w
554-
});
555-
// now we get the palette address for the left pixel
556-
let left_addr = sio.interp0_peek_lane0().read().bits() as usize as *const u16;
557-
// and we get the palette address for the right pixel
558-
let right_addr = sio.interp0_peek_lane1().read().bits() as usize as *const u16;
559-
// read from the palette, pair up, and put in the buffer
560-
let pair = RGBPair::from_pixels(RGBColour(*left_addr), RGBColour(*right_addr));
561-
scan_line_buffer_ptr.write(pair);
562-
scan_line_buffer_ptr = scan_line_buffer_ptr.add(1);
563-
564-
// ========================================================
565-
// Third byte (containing two 4-bit pixels)
566-
// ========================================================
567-
568-
chunky_pixels >>= 8;
569-
sio.interp0_accum0().write(|w| {
570-
w.bits(chunky_pixels);
571-
w
572-
});
573-
sio.interp0_accum1().write(|w| {
574-
w.bits(chunky_pixels);
575-
w
576-
});
577-
// now we get the palette address for the left pixel
578-
let left_addr = sio.interp0_peek_lane0().read().bits() as usize as *const u16;
579-
// and we get the palette address for the right pixel
580-
let right_addr = sio.interp0_peek_lane1().read().bits() as usize as *const u16;
581-
// read from the palette, pair up, and put in the buffer
582-
let pair = RGBPair::from_pixels(RGBColour(*left_addr), RGBColour(*right_addr));
583-
scan_line_buffer_ptr.write(pair);
584-
scan_line_buffer_ptr = scan_line_buffer_ptr.add(1);
585-
586-
// ========================================================
587-
// Fourth byte (containing two 4-bit pixels)
588-
// ========================================================
589-
590-
chunky_pixels >>= 8;
591-
sio.interp0_accum0().write(|w| {
592-
w.bits(chunky_pixels);
593-
w
594-
});
595-
sio.interp0_accum1().write(|w| {
596-
w.bits(chunky_pixels);
597-
w
598-
});
599-
// now we get the palette address for the left pixel
600-
let left_addr = sio.interp0_peek_lane0().read().bits() as usize as *const u16;
601-
// and we get the palette address for the right pixel
602-
let right_addr = sio.interp0_peek_lane1().read().bits() as usize as *const u16;
603-
// read from the palette, pair up, and put in the buffer
604-
let pair = RGBPair::from_pixels(RGBColour(*left_addr), RGBColour(*right_addr));
467+
let pixel_pair = line_start_bytes.add(col).read();
468+
let pair = CHUNKY4_COLOUR_LOOKUP.lookup(pixel_pair);
605469
scan_line_buffer_ptr.write(pair);
606470
scan_line_buffer_ptr = scan_line_buffer_ptr.add(1);
607471
}
@@ -1143,6 +1007,63 @@ impl TextBuffer {
11431007

11441008
unsafe impl Sync for TextBuffer {}
11451009

1010+
/// A 256-entry table mapping a byte holding two 4-bit pixels to a packed RGB pixel pair. See [`CHUNKY4_COLOUR_LOOKUP`].
1011+
struct Chunky4ColourLookup {
1012+
entries: [AtomicU32; 256],
1013+
}
1014+
1015+
impl Chunky4ColourLookup {
1016+
/// Create a blank look-up table, with every entry set to zero.
1017+
const fn blank() -> Chunky4ColourLookup {
1018+
Chunky4ColourLookup {
1019+
entries: [const { AtomicU32::new(0) }; 256],
1020+
}
1021+
}
1022+
1023+
/// Initialise all 256 entries from the first 16 entries of `palette` (panics if `palette` has fewer than 16 entries).
1024+
fn init(&self, palette: &[AtomicU16]) {
1025+
let palette = &palette[0..16];
1026+
for (left_idx, left_colour) in palette.iter().enumerate() {
1027+
for (right_idx, right_colour) in palette.iter().enumerate() {
1028+
let left_colour = left_colour.load(Ordering::Relaxed);
1029+
let right_colour = right_colour.load(Ordering::Relaxed);
1030+
let index = (left_idx << 4) + right_idx;
1031+
let pair = RGBPair::from_pixels(RGBColour(left_colour), RGBColour(right_colour));
1032+
self.entries[index].store(pair.0, Ordering::Relaxed);
1033+
}
1034+
}
1035+
}
1036+
1037+
/// Refresh every look-up table entry that uses the given palette entry as its left or right pixel.
1038+
///
1039+
/// The `updated_palette_entry` is an index from 0..16 into the main palette
1040+
/// (given as `palette`); an index of 16 or more matches nothing, so no entry is rewritten.
1041+
fn update_index(&self, updated_palette_entry: u8, palette: &[AtomicU16]) {
1042+
let palette = &palette[0..16];
1043+
let updated_palette_entry = usize::from(updated_palette_entry);
1044+
for (left_idx, left_colour) in palette.iter().enumerate() {
1045+
for (right_idx, right_colour) in palette.iter().enumerate() {
1046+
if left_idx == updated_palette_entry || right_idx == updated_palette_entry {
1047+
let left_colour = left_colour.load(Ordering::Relaxed);
1048+
let right_colour = right_colour.load(Ordering::Relaxed);
1049+
let index = (left_idx << 4) + right_idx;
1050+
let pair =
1051+
RGBPair::from_pixels(RGBColour(left_colour), RGBColour(right_colour));
1052+
self.entries[index].store(pair.0, Ordering::Relaxed);
1053+
}
1054+
}
1055+
}
1056+
}
1057+
1058+
/// Turn a pair of chunky4 pixels (in a `u8`) into a pair of RGB pixels; the upper nibble is the left palette index and the lower nibble the right.
1059+
#[inline]
1060+
fn lookup(&self, pixel_pair: u8) -> RGBPair {
1061+
let index = usize::from(pixel_pair);
1062+
let raw = self.entries[index].load(Ordering::Relaxed);
1063+
RGBPair(raw)
1064+
}
1065+
}
1066+
11461067
// -----------------------------------------------------------------------------
11471068
// Static and Const Data
11481069
// -----------------------------------------------------------------------------
@@ -1177,6 +1098,11 @@ pub static GLYPH_ATTR_ARRAY: TextBuffer = TextBuffer::new();
11771098
/// Copied at the start of every frame by the code on Core 1.
11781099
pub static VIDEO_MODE: VideoMode = VideoMode::new();
11791100

1101+
/// Holds one RGB pixel pair for every combination of two of the first 16 palette entries (256 pairs in total).
1102+
///
1103+
/// Allows a fast lookup of an RGB pixel pair given two 4-bpp pixels packed into a byte.
1104+
static CHUNKY4_COLOUR_LOOKUP: Chunky4ColourLookup = Chunky4ColourLookup::blank();
1105+
11801106
/// Holds the 256-entry palette for indexed colour modes.
11811107
///
11821108
/// Note, the first eight entries should match
@@ -1963,6 +1889,7 @@ pub fn init(
19631889

19641890
// No-one else is looking at this right now.
19651891
TEXT_COLOUR_LOOKUP.init(&VIDEO_PALETTE);
1892+
CHUNKY4_COLOUR_LOOKUP.init(&VIDEO_PALETTE);
19661893

19671894
unsafe {
19681895
crate::multicore::launch_core1_with_stack(
@@ -2059,6 +1986,7 @@ pub fn set_palette(index: u8, colour: RGBColour) {
20591986
// Update the text cache
20601987
if index <= 15 {
20611988
TEXT_COLOUR_LOOKUP.update_index(index, &VIDEO_PALETTE);
1989+
CHUNKY4_COLOUR_LOOKUP.update_index(index, &VIDEO_PALETTE);
20621990
}
20631991
}
20641992

0 commit comments

Comments
 (0)