@@ -465,13 +465,40 @@ impl RenderEngine {
                 }
             }
         } else {
-            for col in 0..line_len_bytes {
-                unsafe {
-                    let pixel_pair = line_start_bytes.add(col).read();
-                    let pair = CHUNKY4_COLOUR_LOOKUP.lookup(pixel_pair);
-                    scan_line_buffer_ptr.write(pair);
-                    scan_line_buffer_ptr = scan_line_buffer_ptr.add(1);
-                }
+            // This code optimises poorly, leaving a load from the literal pool in the middle of the for loop.
+            //
+            // for col in 0..line_len_bytes {
+            //     unsafe {
+            //         let pixel_pair = line_start_bytes.add(col).read();
+            //         let pair = CHUNKY4_COLOUR_LOOKUP.lookup(pixel_pair);
+            //         scan_line_buffer_ptr.write(pair);
+            //         scan_line_buffer_ptr = scan_line_buffer_ptr.add(1);
+            //     }
+            // }
+
+            // So I wrote it by hand in assembly instead, saving two clock cycles per loop.
+            unsafe {
+                core::arch::asm!(
+                    "0:",
+                    // load a byte from line_start_bytes
+                    "ldrb {tmp}, [{lsb}]",
+                    // multiply it by sizeof(u32)
+                    "lsls {tmp}, {tmp}, #0x2",
+                    // load a 32-bit word from CHUNKY4_COLOUR_LOOKUP at that byte's offset
+                    "ldr {tmp}, [{chunky}, {tmp}]",
+                    // store the 32-bit word to the scanline buffer, and increment
+                    "stm {slbp}!, {{ {tmp} }}",
+                    // increment the lsb
+                    "adds {lsb}, {lsb}, #0x1",
+                    // loop until we're done
+                    "cmp {lsb}, {lsb_max}",
+                    "bne 0b",
+                    lsb = inout(reg) line_start_bytes => _, // modified by the loop, final value unused
+                    lsb_max = in(reg) line_start_bytes.add(line_len_bytes),
+                    chunky = in(reg) core::ptr::addr_of!(CHUNKY4_COLOUR_LOOKUP),
+                    tmp = out(reg) _, // scratch register written inside the loop
+                    slbp = inout(reg) scan_line_buffer_ptr, // advanced by the stm writeback
+                );
             }
         }
     }
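
A side note on the lookup the loop relies on: below is a minimal sketch, assuming each source byte packs two 4-bit colour indices and each CHUNKY4_COLOUR_LOOKUP entry packs the two matching 16-bit palette colours into one u32. The struct name, constructor, and pixel ordering are illustrative guesses, not taken from this commit.

    /// Hypothetical 256-entry table: one u32 of two output pixels per input byte.
    pub struct Chunky4ColourLookup {
        entries: [u32; 256],
    }

    impl Chunky4ColourLookup {
        /// Build the table from a 16-entry palette of 16-bit colours.
        pub const fn new(palette: [u16; 16]) -> Chunky4ColourLookup {
            let mut entries = [0u32; 256];
            let mut byte = 0;
            while byte < 256 {
                // High nibble is one pixel, low nibble is the other.
                let left = palette[byte >> 4] as u32;
                let right = palette[byte & 0x0F] as u32;
                // Pack both pixels into one 32-bit word (ordering is a guess).
                entries[byte] = (right << 16) | left;
                byte += 1;
            }
            Chunky4ColourLookup { entries }
        }

        /// One byte holding two 4-bit pixels in, one 32-bit word holding two pixels out.
        pub fn lookup(&self, pixel_pair: u8) -> u32 {
            self.entries[pixel_pair as usize]
        }
    }

With a table like this, each pair of pixels costs one byte load, one word load and one word store, which is exactly the sequence the hand-written loop above performs.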