I've been hacking on the rom disk driver for the dougg3 rom simm, and implemented support for compressed disks using
liblzg. This allows cramming a 50% larger rom disk image into the same size simm. The compressor is a regular Windows/Mac/Linux program, and the decompressor is a hand-written 680000 assembly routine from the lzg authors. It works great, but decompression is kind of slow, even though I specifically chose lzg for its simplicity and fast decompressing. On a Mac IIsi or IIci, it runs about 600K/sec of decompressed data, so it takes around 5-20 seconds to decompress the whole rom disk depending on its size and the Mac CPU speed.
The meat of the 68000 decompression routine isn't too long. It's a fairly simple Lempel-Ziv algorithm that encodes repeated data as (distance,length) references to the first appearance of the data. There's a brief summary of lzg's specific algorithm
here. Does anyone see any obvious ways to substantially optimize this code? It was written for a vanilla 68000, but for the rom simm it'll always be running on a 68020 or '030. Maybe there are some '030-specific instructions that could be used to help speed it up? Or some kind of cache prefetch? My knowledge of such things is weak. There's also some bounds-checking code that could be removed, though the liblzg web site says this provides only a ~12% improvement.
The meat-of-the-meat where data gets copied from source to dest is a two-instruction dbf loop:
_loop1: move.b (a4)+,(a1)+
dbf d6,_loop1
If any '030-specific tricks could improve that, it would help the most. Here's the entirety of the decompressor's main loop:
_LZG_LENGTH_DECODE_LUT:
dc.b 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
dc.b 17,18,19,20,21,22,23,24,25,26,27,28,34,47,71,127
_lzg_decompress:
// a0 = src
// a1 = dst
// a2 = inEnd = in + inSize
// a3 = outEnd = out + decodedSize
// a6 = out
move.b (a0)+,d1 // d1 = marker1
move.b (a0)+,d2 // d2 = marker2
move.b (a0)+,d3 // d3 = marker3
move.b (a0)+,d4 // d4 = marker4
lea _LZG_LENGTH_DECODE_LUT(pc),a5 // a5 = _LZG_LENGTH_DECODE_LUT
// Main decompression loop
move.l #2056,d0 // Keep the constant 2056 in d0 (for marker1)
cmp.l a2,a0
_mainloop:
bcc.s _fail // Note: cmp.l a2,a0 must be performed prior to this!
move.b (a0)+,d7 // d7 = symbol
cmp.b d1,d7 // marker1?
beq.s _marker1
cmp.b d2,d7 // marker2?
beq.s _marker2
cmp.b d3,d7 // marker3?
beq.s _marker3
cmp.b d4,d7 // marker4?
beq.s _marker4
_literal:
cmp.l a3,a1
bcc.s _fail
move.b d7,(a1)+
cmp.l a2,a0
bcs.s _mainloop
// We're done
_done:
// irrelevant code removed
bra _exit
// marker4 - "Near copy (incl. RLE)"
_marker4:
cmp.l a2,a0
bcc.s _fail
moveq #0,d5
move.b (a0)+,d5
beq.s _literal // Single occurance of the marker symbol (rare)
move.l d5,d6
and.b #0x1f,d6
move.b (a5,d6.w),d6 // length-1 = _LZG_LENGTH_DECODE_LUT[b & 0x1f]
lsr.b #5,d5
addq.w #1,d5 // offset = (b >> 5) + 1
bra.s _copy
// marker3 - "Short copy"
_marker3:
cmp.l a2,a0
bcc.s _fail
moveq #0,d5
move.b (a0)+,d5
beq.s _literal // Single occurance of the marker symbol (rare)
move.l d5,d6
lsr.b #6,d6
addq.w #2,d6 // length-1 = (b >> 6) + 2
and.b #0x3f,d5
addq.w #8,d5 // offset = (b & 0x3f) + 8
bra.s _copy
// marker2 - "Medium copy"
_marker2:
cmp.l a2,a0
bcc.s _fail
moveq #0,d5
move.b (a0)+,d5
beq.s _literal // Single occurance of the marker symbol (rare)
cmp.l a2,a0
bcc.s _fail
move.l d5,d6
and.b #0x1f,d6
move.b (a5,d6.w),d6 // length-1 = _LZG_LENGTH_DECODE_LUT[b & 0x1f]
lsl.w #3,d5
move.b (a0)+,d5
addq.w #8,d5 // offset = (((b & 0xe0) << 3) | b2) + 8
bra.s _copy
// marker1 - "Distant copy"
_marker1:
cmp.l a2,a0
bcc.s _fail
moveq #0,d5
move.b (a0)+,d5
beq.s _literal // Single occurance of the marker symbol (rare)
lea 1(a0),a4
cmp.l a2,a4
bcc.s _fail
move.l d5,d6
and.b #0x1f,d6
move.b (a5,d6.w),d6 // length-1 = _LZG_LENGTH_DECODE_LUT[b & 0x1f]
lsr.w #5,d5
swap d5
move.b (a0)+,d5
lsl.w #8,d5
move.b (a0)+,d5
add.l d0,d5 // offset = (((b & 0xe0) << 11) | (b2 << 8) | (*src++)) + 2056
// Copy corresponding data from history window
// d5 = offset
// d6 = length-1
_copy:
lea (a1,d6.l),a4
cmp.l a3,a4
bcc _fail
move.l a1,a4
sub.l d5,a4
cmp.l a6,a4
bcs _fail
_loop1: move.b (a4)+,(a1)+
dbf d6,_loop1
cmp.l a2,a0
bcs _mainloop
bra _done