Skip to content

Commit 2443943

Browse files
committed
Hand-roll assembly for chunky4 output.
Now 640x480 @ 16 colours works on RP2040.
1 parent 41cee80 commit 2443943

File tree

1 file changed

+34
-7
lines changed

1 file changed

+34
-7
lines changed

src/vga/mod.rs

Lines changed: 34 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -465,13 +465,40 @@ impl RenderEngine {
465465
}
466466
}
467467
} else {
468-
for col in 0..line_len_bytes {
469-
unsafe {
470-
let pixel_pair = line_start_bytes.add(col).read();
471-
let pair = CHUNKY4_COLOUR_LOOKUP.lookup(pixel_pair);
472-
scan_line_buffer_ptr.write(pair);
473-
scan_line_buffer_ptr = scan_line_buffer_ptr.add(1);
474-
}
468+
// // This code optimises poorly, leaving a load from the literal pool in the middle of the for loop.
469+
//
470+
// for col in 0..line_len_bytes {
471+
// unsafe {
472+
// let pixel_pair = line_start_bytes.add(col).read();
473+
// let pair = CHUNKY4_COLOUR_LOOKUP.lookup(pixel_pair);
474+
// scan_line_buffer_ptr.write(pair);
475+
// scan_line_buffer_ptr = scan_line_buffer_ptr.add(1);
476+
// }
477+
// }
478+
479+
// So I wrote it by hand in assembly instead, saving two clock cycles per loop
480+
unsafe {
481+
core::arch::asm!(
482+
"0:",
483+
// load a byte from line_start_bytes
484+
"ldrb {tmp}, [{lsb}]",
485+
// multiply it by sizeof(u32)
486+
"lsls {tmp}, {tmp}, #0x2",
487+
// load the 32-bit word at CHUNKY4_COLOUR_LOOKUP + tmp (the table entry for the byte just read)
488+
"ldr {tmp}, [{chunky}, {tmp}]",
489+
// store the 32-bit word to the scanline buffer, and increment
490+
"stm {slbp}!, {{ {tmp} }}",
491+
// advance lsb (the source byte pointer) by one byte
492+
"adds {lsb}, {lsb}, #0x1",
493+
// loop until we're done
494+
"cmp {lsb}, {lsb_max}",
495+
"bne 0b",
496+
lsb = in(reg) line_start_bytes,
497+
lsb_max = in(reg) line_start_bytes.add(line_len_bytes),
498+
chunky = in(reg) core::ptr::addr_of!(CHUNKY4_COLOUR_LOOKUP),
499+
tmp = in(reg) 0,
500+
slbp = in(reg) scan_line_buffer_ptr,
501+
);
475502
}
476503
}
477504
}

0 commit comments

Comments
 (0)