don't click here

Improved Enigma compression

Discussion in 'Engineering & Reverse Engineering' started by RealMalachi, Mar 29, 2024.

  1. RealMalachi

    RealMalachi

    you can call me mal Member
    NOTE: I recommend using the latest version. The previous versions are mainly there for archival purposes.

    I started this a while ago due to a conversation on discord, regarding improving Enigma and taking notes for a potential "Enigma+". It's gone through some needed revisions over time (my original attempt was embarrassing in hindsight), but I'm confident enough in its current state to release it.

    It comes with four flags: two for balancing between size and speed, one for enabling error-free odd-addressing, and one for compatibility with Sega's original implementation. They didn't trash d0/a0-a1, and I planned to trash them because they usually aren't that useful when decompression is over. However, an incompatibility is an incompatibility, so I made a flag to switch between either. By default, it favours size and compatibility.
    This was made for AS. I'm not gonna bother porting it to asm68k, but it should be simple enough to port over.

    You can either read the code here or get an .asm file from the .zip file
    Code (Text):
    1. ; ---------------------------------------------------------------------------
    2. ; Enigma Decompression Algorithm
    3. ; For format explanation see http://info.sonicretro.org/Enigma_compression
    4. ; this one is optimised from the original, with the more rom-intensive
    5. ; speedups enabled by some flags down below
    6. ; ---------------------------------------------------------------------------
    7. ; INPUTS:
    8. ; d0 = starting art tile (added to each 8x8 before writing to destination)
    9. ; a0 = source address
    10. ; a1 = destination address
    11. ; TRASHES:
    12. ; d0,a0,a1
    13. ; STACK:
    14. ; - saved registers d1-d7/a2-a6 (12x4 bytes), or d0-d7/a1-a6 (14x4 bytes) in compatibility mode
    15. ; - 4 bytes for one bsr (GetInlineCopyVal and ChkGetNextByte)
    16. ; - 2 bytes for word conversion
    17. ; ---------------------------------------------------------------------------
    18. _Eni_CompatibilityMode    = 1
    19. ; if 1, stay compatible with the original Enigma
    20. ; (they saved d0 and a1, and made a0 point to the end of the file)
    21. _Eni_EvenAligned    = 0
    22. ; if 1, allows Enigma compressed files to be at an odd numbered address (header words are then read byte by byte)
    23. _Eni_RemoveJmpTable    = 0
    24. ; if 1, saves 22 cycles per loop (12 for SubE) at the cost of some rom space
    25. _Eni_InlineBitStream    = 0
    26. ; if 1, inlines ChkGetNextByte in EniDec_Loop, for a speedup of 34 cycles per loop
    27. ; funny how this simple speedup greatly overshadows _Eni_RemoveJmpTable
    28. ; that one required infinitely more effort than this. oh well.
    29.  
    30. ; for _Eni_RemoveJmpTable, each routine needs to start 16 ($10) bytes before the next routine
    31. ; the only exception is SubE; the last one
    32. ; this macro should be easy enough to port to more primitive macro formats
    33. enidecpad16 macro routine
    34.     if *-routine>16        ; if it exceeds 16, throw an error
    35.     fatal "ADDR ERROR - EXCEED: routine exceeds 16 bytes! ($\{*-routine} bytes)"
    36.     elseif *-routine<16    ; if it's below 16, pad it to 16
    37. ;    message "routine got padded by $\{16-(*-routine)} bytes"    ; debug
    38.     dc.b [16-(*-routine)]$69    ; filler bytes up to the 16-byte boundary (each padded routine ends in jmp (a5) before this)
    39.     endif
    40.     endm
    41. ; handles one tile flag: if its permit bit (top of d1.b) is set, consume one stream bit and, when that bit is 1, set `bit` in d3
    42. enidec_checktileflags macro bit,setmode
    43.     add.b    d1,d1        ; shift the next permit bit out into carry
    44.     bcc.s    .skip        ; if that bit wasn't set, branch
    45.     subq.w    #1,d6        ; get next bit number
    46.     btst    d6,d5        ; is this tile flag bit set?
    47.     beq.s    .skip        ; if not, branch
    48.     if setmode=0        ; setmode picks the instruction: 0 = ori, anything else = addi
    49.     ori.w    #1<<bit,d3
    50.     else
    51.     addi.w    #1<<bit,d3
    52.     endif
    53. .skip
    54.     endm
    55. ; ===========================================================================
    56.  
    57. EniDec:
    58.     if _Eni_CompatibilityMode=0
    59.     movem.l    d1-d7/a2-a6,-(sp)
    60.     else
    61.     movem.l    d0-d7/a1-a6,-(sp)
    62.     endif
    63. ; Reserve one word of stack as the byte-to-word ram buffer.
    64. ; It used to live at -6(sp), i.e. BELOW the stack pointer, where the exception
    65. ; frame of an interrupt (SR + PC = 6 bytes) could overwrite it between the two
    66. ; byte writes and the word read, corrupting the decompressed data.
    67. ; With sp moved below the buffer, neither interrupts nor our own bsrs can reach it.
    68. ; EniDec_End releases the word again before restoring the saved registers.
    69.     subq.l    #2,sp        ; allocate the 2-byte buffer
    70.     movea.l    sp,a6        ; load byte-to-word ram buffer in a6
    71.  
    72. ; set subroutine loop address
    73. ; compared to a bra, jmp (aN) saves 2 cycles per loop
    74.     lea    EniDec_Loop(pc),a5
    75.  
    76.     movea.w    d0,a3        ; store starting art tile
    77.  
    78.     move.b    (a0)+,d0
    79.     ext.w    d0
    80.     movea.w    d0,a2        ; set initial bit amount for inline copy
    81.  
    82.     move.b    (a0)+,d0    ; 000PCCHV ; set vram flag permits
    83.     lsl.b    #3,d0        ; PCCHV000 ; shift by 3
    84.     move.w    d0,d2        ; store in the high word of d2
    85.     swap    d2
    86. ; set increment word
    87.     if _Eni_EvenAligned=0
    88.     move.w    (a0)+,d4
    89.     else
    90.     move.b    (a0)+,(a6)+
    91.     move.b    (a0)+,(a6)+
    92.     move.w    -(a6),d4
    93.     endif
    94.     add.w    a3,d4        ; add starting art tile
    95. ; set static word
    96.     if _Eni_EvenAligned=0
    97.     move.w    (a0)+,d0
    98.     else
    99.     move.b    (a0)+,(a6)+
    100.     move.b    (a0)+,(a6)+
    101.     move.w    -(a6),d0
    102.     endif
    103.     add.w    a3,d0        ; add starting art tile
    104.     movea.w    d0,a4        ; store in a4 (moves and adds are faster on dN.w, saves 4 cycles)
    105. ; set initial subroutine flag
    106.     if _Eni_EvenAligned=0
    107.     move.w    (a0)+,d5
    108.     else
    109.     move.b    (a0)+,(a6)+
    110.     move.b    (a0)+,(a6)+
    111.     move.w    -(a6),d5
    112.     endif
    113. ; set bit counter
    114.     moveq    #16,d6        ; 16 bits = 2 bytes
    115. EniDec_Loop:
    116.     moveq    #7,d0            ; process 7 bits at a time
    117.     move.w    d6,d7            ; move d6 to d7
    118.     sub.w    d0,d7            ; subtract by 7 (convenient)
    119.     move.w    d5,d1            ; copy d5 into d1
    120.     lsr.w    d7,d1            ; right shift by value in d7
    121.  
    122.     move.w    d1,d2            ; move d1 to d2
    123.     andi.w    #%01110000,d1        ; keep only 3 bits. Lower 4 are for d2, sign bit unused
    124.  
    125.     cmpi.w    #1<<6,d1        ; is bit 6 set?
    126.     bhs.s    .7bitcommand        ; if it is, branch
    127.     moveq    #6,d0            ; if not, process 6 bits instead of 7
    128.     lsr.w    #1,d2            ; bitfield now becomes TTSSSS instead of TTTSSSS
    129. .7bitcommand:
    130.     if _Eni_InlineBitStream=0
    131.     bsr.w    EniDec_ChkGetNextByte    ; uses d0, doesn't touch d1 or d2
    132.     else
    133. ;EniDec_ChkGetNextByte:
    134.     sub.w    d0,d6        ; subtract d0 from d6
    135.     cmpi.w    #8,d6        ; has it hit 8 or lower?
    136.     bhi.s    .nonewbyte    ; if not, branch
    137.     addq.w    #8,d6        ; 8 bits = 1 byte
    138.  
    139.     asl.w    #8,d5        ; shift up by a byte
    140.     move.b    (a0)+,d5    ; store next byte in lower register byte
    141. .nonewbyte:
    142.     endif
    143.  
    144.     moveq    #$F,d3            ; d3 is also used for SubE
    145.     and.w    d3,d2            ; keep only lower nybble
    146.     if _Eni_RemoveJmpTable=0
    147. ; JmpTable addresses are word-sized.
    148. ; Due to its placement in rom, SubE just falls into itself
    149.     lsr.w    #4-1,d1            ; d1 = 3-bit command field (0-7) multiplied by 2
    150.     jmp    EniDec_JmpTable(pc,d1.w)
    151.     else
    152. ; all subroutines are offset by 16 bytes. Some of them barely fit, I'm quite proud of that
    153. ; SubE exceeds this, but it's the last one so it doesn't matter
    154.     jmp    EniDec_Sub0(pc,d1.w)
    155.     endif
    156. ; ---------------------------------------------------------------------------
    157. EniDec_Sub0:
    158. .loop:
    159.     move.w    d4,(a1)+        ; write to destination
    160.     addq.w    #1,d4            ; increment
    161.     dbra    d2,.loop        ; repeat
    162.     jmp    (a5)        ; EniDec_Loop
    163.     if _Eni_RemoveJmpTable<>0
    164.     enidecpad16 EniDec_Sub0
    165. EniDec_Sub2:    ; for 6-bit commands, bit 4 of d1 is the top repeat-count bit, hence the duplicate of Sub0
    166. .loop:
    167.     move.w    d4,(a1)+        ; write to destination
    168.     addq.w    #1,d4            ; increment
    169.     dbra    d2,.loop        ; repeat
    170.     jmp    (a5)        ; EniDec_Loop
    171.     enidecpad16 EniDec_Sub2
    172.     endif
    173. ; ---------------------------------------------------------------------------
    174. EniDec_Sub4:
    175. .loop:
    176.     move.w    a4,(a1)+        ; write to destination
    177.     dbra    d2,.loop        ; repeat
    178.     jmp    (a5)        ; EniDec_Loop
    179.     if _Eni_RemoveJmpTable<>0
    180.     enidecpad16 EniDec_Sub4
    181. EniDec_Sub6:    ; duplicate of Sub4, for the same reason as Sub2
    182. .loop:
    183.     move.w    a4,(a1)+        ; write to destination
    184.     dbra    d2,.loop        ; repeat
    185.     jmp    (a5)        ; EniDec_Loop
    186.     enidecpad16 EniDec_Sub6
    187.     endif
    188. ; ---------------------------------------------------------------------------
    189. EniDec_Sub8:
    190.     bsr.s    EniDec_GetInlineCopyVal
    191. .loop:
    192.     move.w    d1,(a1)+
    193.     dbra    d2,.loop
    194.     jmp    (a5)        ; EniDec_Loop
    195.     if _Eni_RemoveJmpTable<>0
    196.     enidecpad16 EniDec_Sub8
    197.     endif
    198. ; ---------------------------------------------------------------------------
    199. EniDec_SubA:
    200.     bsr.s    EniDec_GetInlineCopyVal
    201. .loop:
    202.     move.w    d1,(a1)+
    203.     addq.w    #1,d1
    204.     dbra    d2,.loop
    205.     jmp    (a5)        ; EniDec_Loop
    206.     if _Eni_RemoveJmpTable<>0
    207.     enidecpad16 EniDec_SubA
    208.     endif
    209. ; ---------------------------------------------------------------------------
    210. EniDec_SubC:
    211.     bsr.s    EniDec_GetInlineCopyVal
    212. .loop:
    213.     move.w    d1,(a1)+
    214.     subq.w    #1,d1
    215.     dbra    d2,.loop
    216.     jmp    (a5)        ; EniDec_Loop
    217.     if _Eni_RemoveJmpTable<>0
    218.     enidecpad16 EniDec_SubC
    219.     else
    220. ; ---------------------------------------------------------------------------
    221. EniDec_JmpTable:
    222.     bra.s    EniDec_Sub0
    223.     bra.s    EniDec_Sub0    ; Sub2
    224.     bra.s    EniDec_Sub4
    225.     bra.s    EniDec_Sub4    ; Sub6
    226.  
    227.     bra.s    EniDec_Sub8
    228.     bra.s    EniDec_SubA
    229.     bra.s    EniDec_SubC
    230.     ;bra.s    EniDec_SubE    ; fall into SubE
    231.     endif
    232. ; ---------------------------------------------------------------------------
    233. ; EniDec_SubE is truly a special case
    234. EniDec_SubE:
    235.     cmp.w    d3,d2            ; d3 = $F ; is the loop set to 16?
    236.     beq.s    EniDec_End        ; if so, branch (signifies the end of the data)
    237. .loop:
    238.     bsr.s    EniDec_GetInlineCopyVal
    239.     move.w    d1,(a1)+
    240.     dbra    d2,.loop
    241.     jmp    (a5)        ; EniDec_Loop
    242. EniDec_End:
    243.     addq.l    #2,sp            ; release the byte-to-word ram buffer allocated at EniDec
    244.     if _Eni_CompatibilityMode=0
    245.     movem.l    (sp)+,d1-d7/a2-a6
    246.     else
    247.     subq.w    #1,a0            ; this code figures out where a0 should end
    248.     cmpi.w    #16,d6            ; were we going to start on a completely new byte?
    249.     bne.s    .got_byte        ; if not, branch
    250.     subq.w    #1,a0
    251. .got_byte:
    252.     move.w    a0,d0
    253.     lsr.w    #1,d0            ; are we on an odd byte?
    254.     bhs.s    .even_loc        ; if not, branch
    255.     addq.w    #1,a0            ; ensure we're on an even byte
    256. .even_loc:
    257.     movem.l    (sp)+,d0-d7/a1-a6
    258.     endif
    259.     rts
    260. ; ===========================================================================
    261.  
    262. EniDec_GetInlineCopyVal:
    263.     move.w    a3,d3            ; starting art tile (the tile flag bits get merged into d3 below)
    264. ; original didn't need to use a high word
    265. ; this is a 4 cycle loss, though it's usually made up for everywhere else
    266.     move.l    d2,d1            ; get vram tile flags
    267.     swap    d1            ; (it's in the high word of d2)
    268.     enidec_checktileflags 15,0    ; one call per permit bit of PCCHV000, highest bit first
    269.     enidec_checktileflags 14,1
    270.     enidec_checktileflags 13,1
    271.     enidec_checktileflags 12,0
    272.     enidec_checktileflags 11,0
    273.  
    274.     move.w    d5,d1            ; d1 = copy of the current bitstream word
    275.     move.w    d6,d7            ; get remaining bits
    276.     sub.w    a2,d7            ; subtract minimum bit number
    277.     bhs.s    .got_enough        ; if we're beyond that, branch
    278.     move.w    d7,d6            ; d7 is negative here (not enough bits left)
    279.     addi.w    #16,d6            ; 16 bits = 2 bytes
    280.     neg.w    d7            ; calculate bit deficit
    281.     lsl.w    d7,d1            ; make space for this many bits
    282.     move.b    (a0),d5            ; get next byte
    283.     rol.b    d7,d5            ; make the upper X bits the lower X bits
    284.     add.w    d7,d7            ; deficit * 2 = word offset into .andvalues
    285.     and.w    .andvalues-2(pc,d7.w),d5; only keep X lower bits
    286.     add.w    d5,d1            ; compensate for the bit deficit
    287. .got_field:
    288.     move.w    a2,d0
    289.     add.w    d0,d0            ; bit count * 2 = word offset into .andvalues
    290.     and.w    .andvalues-2(pc,d0.w),d1; only keep as many bits as required
    291.     add.w    d3,d1            ; add starting art tile
    292.  
    293. ;    move.b    (a0)+,d5    ; 08 ; get current byte, move onto next byte
    294. ;    lsl.w    #8,d5        ; 22 ; shift up by a byte
    295. ;    move.b    (a0)+,d5    ; 08 ; store next byte in lower register byte
    296.                 ; 38
    297.  
    298. ; saves 4 cycles per branch, at the cost of saving and restoring a6, and setting up the register
    299. ; those caveats add around 24 cycles, but from my tests, it usually results in a speedup
    300.     move.b    (a0)+,(a6)+    ; 12 ; temporarily write into the destination
    301.     move.b    (a0)+,(a6)+    ; 12
    302.     move.w    -(a6),d5    ; 10 ; move result to d5, set destination back to correct spot
    303.                 ; 34
    304.     rts
    305. ; ---------------------------------------------------------------------------
    306. .andvalues:    ; entry N (1-based) is a mask with the lowest N bits set; indexed as .andvalues-2 + N*2
    307.     dc.w     1,    3,    7,   $F
    308.     dc.w   $1F,  $3F,  $7F,  $FF
    309.     dc.w  $1FF, $3FF, $7FF, $FFF
    310.     dc.w $1FFF,$3FFF,$7FFF,$FFFF
    311. ; ---------------------------------------------------------------------------
    312. .got_exact:
    313.     moveq    #16,d6        ; 16 bits = 2 bytes
    314.     bra.s    .got_field    ; .got_field masks d1 and then reloads d5 with the next two bytes
    315. ; ---------------------------------------------------------------------------
    316. .got_enough:
    317.     beq.s    .got_exact    ; if the exact number of bits are leftover, branch
    318.     lsr.w    d7,d1        ; remove unneeded bits
    319.     move.w    a2,d0
    320.     add.w    d0,d0
    321.     and.w    .andvalues-2(pc,d0.w),d1    ; only keep as many bits as required
    322.     add.w    d3,d1        ; add starting art tile
    323.     move.w    a2,d0        ; store number of bits used up by inline copy
    324. ;    bra.s    EniDec_ChkGetNextByte    ; move onto next byte
    325. EniDec_ChkGetNextByte:    ; in: d0 = number of bits just consumed ; .got_enough falls through into here
    326.     sub.w    d0,d6        ; subtract d0 from d6
    327.     cmpi.w    #8,d6        ; has it hit 8 or lower?
    328.     bhi.s    .nonewbyte    ; if not, branch
    329.     addq.w    #8,d6        ; 8 bits = 1 byte
    330. ; shift lowest byte to highest byte, and load a new value into low byte
    331.     asl.w    #8,d5        ; 22 ; shift up by a byte
    332.     move.b    (a0)+,d5    ; 08 ; store next byte in lower register byte
    333.                 ; 30
    334.  
    335. ;    move.b    d5,(a6)+    ; 08
    336. ;    move.b    (a0)+,(a6)+    ; 12
    337. ;    move.w    -(a6),d5    ; 10
    338.                 ; 30, sad.
    339. .nonewbyte:
    340.     rts
    341. ; ---------------------------------------------------------------------------
     

    Attached Files:

    Last edited: Apr 16, 2024
  2. OrionNavattan

    OrionNavattan

    Tech Member
    166
    164
    43
    Oregon
    Went ahead and made an ASM68K version (mostly just a matter of modifying the macros). And FWIW, threw in an optimization to EniDec_End.got_byte.

    Code (ASM):
    1. ; ---------------------------------------------------------------------------
    2. ; Enigma Decompression Algorithm
    3. ; For format explanation see http://info.sonicretro.org/Enigma_compression
    4. ; this one is optimised from the original, with the more rom-intensive
    5. ; speedups enabled by some flags down below
    6. ; ---------------------------------------------------------------------------
    7. ; INPUTS:
    8. ; d0 = starting art tile (added to each 8x8 before writing to destination)
    9. ; a0 = source address
    10. ; a1 = destination address
    11. ; TRASHES:
    12. ; d0,a0,a1
    13. ; STACK:
    14. ; - saved registers d1-d7/a2-a6 (12x4 bytes), or d0-d7/a1-a6 (14x4 bytes) in compatibility mode
    15. ; - 4 bytes for one bsr (GetInlineCopyVal and ChkGetNextByte)
    16. ; - 2 bytes for word conversion
    17. ; ---------------------------------------------------------------------------
    18.  
    19.    pusho        ; save current options (restored by popo at the end of the file)
    20.    opt l.       ; use . as local label symbol
    21.  
    22. _Eni_CompatibilityMode:   equ 0
    23. ; if 1, stay compatible with the original Enigma
    24. ; (they saved d0 and a1, and made a0 point to the end of the file)
    25. _Eni_EvenAligned:   equ 1
    26. ; if 1, allows Enigma compressed files to be at an odd numbered address (header words are then read byte by byte)
    27. _Eni_RemoveJmpTable:   equ 1
    28. ; if 1, saves 22 cycles per loop (12 for SubE) at the cost of some rom space
    29. _Eni_InlineBitStream:   equ 1
    30. ; if 1, inlines ChkGetNextByte in EniDec_Loop, for a speedup of 34 cycles per loop
    31. ; funny how this simple speedup greatly overshadows _Eni_RemoveJmpTable
    32. ; that one required infinitely more effort than this. oh well.
    33.  
    34. ; for removejmpto, routines needs to be 16 ($10) bytes from the next routine
    35. ; the only exception is SubE; the last one
    36. ; this macro should be easy enough to port to more primative macro formats
    37. enidecpad16: macro routine
    38.    if (*-routine)>16       ; if it exceeds 16, throw an error
    39.    inform 3,"ADDR ERROR - EXCEED: routine exceeds 16 bytes! ($%h bytes)",*-routine
    40.    elseif (*-routine)<16   ; if it's below 16, pad it to 16
    41. ;   inform 0,"routine got padded by $%h bytes",*-routine   ; debug
    42.    dcb.b 16-(*-routine),0   ; zero filler up to the 16-byte boundary (each padded routine ends in jmp (a5) before this)
    43.    endif
    44.    endm
    45. ; handles one tile flag: if its permit bit (top of d1.b) is set, consume one stream bit and, when that bit is 1, set `bit` in d3
    46. enidec_checktileflags: macro bit,setmode
    47.    add.b   d1,d1       ; shift the next permit bit out into carry
    48.    bcc.s   .skip\@       ; if that bit wasn't set, branch
    49.    subq.w   #1,d6       ; get next bit number
    50.    btst   d6,d5       ; is this tile flag bit set?
    51.    beq.s   .skip\@       ; if not, branch
    52.    if setmode=0       ; setmode picks the instruction: 0 = ori, anything else = addi
    53.    ori.w   #1<<bit,d3
    54.    else
    55.    addi.w   #1<<bit,d3
    56.    endif
    57. .skip\@:
    58.    endm
    59. ; ===========================================================================
    60.  
    61. EniDec:
    62.    if _Eni_CompatibilityMode=0
    63.    movem.l   d1-d7/a2-a6,-(sp)
    64.    else
    65.    movem.l   d0-d7/a1-a6,-(sp)
    66.    endif
    67. ; Reserve one word of stack as the byte-to-word ram buffer.
    68. ; It used to live at -6(sp), i.e. BELOW the stack pointer, where the exception
    69. ; frame of an interrupt (SR + PC = 6 bytes) could overwrite it between the two
    70. ; byte writes and the word read, corrupting the decompressed data.
    71. ; With sp moved below the buffer, neither interrupts nor our own bsrs can reach it.
    72. ; EniDec_End releases the word again before restoring the saved registers.
    73.    subq.l   #2,sp       ; allocate the 2-byte buffer
    74.    movea.l   sp,a6       ; load byte-to-word ram buffer in a6
    75.  
    76. ; set subroutine loop address
    77. ; compared to a bra, jmp (aN) saves 2 cycles per loop
    78.    lea   EniDec_Loop(pc),a5
    79.  
    80.    movea.w   d0,a3       ; store starting art tile
    81.  
    82.    move.b   (a0)+,d0
    83.    ext.w   d0
    84.    movea.w   d0,a2       ; set initial bit amount for inline copy
    85.  
    86.    move.b   (a0)+,d0   ; 000PCCHV ; set vram flag permits
    87.    lsl.b   #3,d0       ; PCCHV000 ; shift by 3
    88.    move.w   d0,d2       ; store in the high word of d2
    89.    swap   d2
    90. ; set increment word
    91.    if _Eni_EvenAligned=0
    92.    move.w   (a0)+,d4
    93.    else
    94.    move.b   (a0)+,(a6)+
    95.    move.b   (a0)+,(a6)+
    96.    move.w   -(a6),d4
    97.    endif
    98.    add.w   a3,d4       ; add starting art tile
    99. ; set static word
    100.    if _Eni_EvenAligned=0
    101.    move.w   (a0)+,d0
    102.    else
    103.    move.b   (a0)+,(a6)+
    104.    move.b   (a0)+,(a6)+
    105.    move.w   -(a6),d0
    106.    endif
    107.    add.w   a3,d0       ; add starting art tile
    108.    movea.w   d0,a4       ; store in a4 (moves and adds are faster on dN.w, saves 4 cycles)
    109. ; set initial subroutine flag
    110.    if _Eni_EvenAligned=0
    111.    move.w   (a0)+,d5
    112.    else
    113.    move.b   (a0)+,(a6)+
    114.    move.b   (a0)+,(a6)+
    115.    move.w   -(a6),d5
    116.    endif
    117. ; set bit counter
    118.    moveq   #16,d6       ; 16 bits = 2 bytes
    119. EniDec_Loop:
    120.    moveq   #7,d0           ; process 7 bits at a time
    121.    move.w   d6,d7           ; move d6 to d7
    122.    sub.w   d0,d7           ; subtract by 7 (convenient)
    123.    move.w   d5,d1           ; copy d5 into d1
    124.    lsr.w   d7,d1           ; right shift by value in d7
    125.  
    126.    move.w   d1,d2           ; move d1 to d2
    127.    andi.w   #%01110000,d1       ; keep only 3 bits. Lower 4 are for d2, sign bit unused
    128.  
    129.    cmpi.w   #1<<6,d1       ; is bit 6 set?
    130.    bhs.s   .7bitcommand       ; if it is, branch
    131.    moveq   #6,d0           ; if not, process 6 bits instead of 7
    132.    lsr.w   #1,d2           ; bitfield now becomes TTSSSS instead of TTTSSSS
    133. .7bitcommand:
    134.    if _Eni_InlineBitStream=0
    135.    bsr.w   EniDec_ChkGetNextByte   ; uses d0, doesn't touch d1 or d2
    136.    else
    137. ;EniDec_ChkGetNextByte:
    138.    sub.w   d0,d6       ; subtract d0 from d6
    139.    cmpi.w   #8,d6       ; has it hit 8 or lower?
    140.    bhi.s   .nonewbyte   ; if not, branch
    141.    addq.w   #8,d6       ; 8 bits = 1 byte
    142.  
    143.    asl.w   #8,d5       ; shift up by a byte
    144.    move.b   (a0)+,d5   ; store next byte in lower register byte
    145. .nonewbyte:
    146.    endif
    147.  
    148.    moveq   #$F,d3           ; d3 is also used for SubE
    149.    and.w   d3,d2           ; keep only lower nybble
    150.    if _Eni_RemoveJmpTable=0
    151. ; JmpTable addresses are word-sized.
    152. ; Due to its placement in rom, SubE just falls into itself
    153.    lsr.w   #4-1,d1           ; d1 = 3-bit command field (0-7) multiplied by 2
    154.    jmp   EniDec_JmpTable(pc,d1.w)
    155.    else
    156. ; all subroutines are offset by 16 bytes. Some of them barely fit, I'm quite proud of that
    157. ; SubE exceeds this, but it's the last one so it doesn't matter
    158.    jmp   EniDec_Sub0(pc,d1.w)
    159.    endif
    160. ; ---------------------------------------------------------------------------
    161. EniDec_Sub0:
    162. .loop:
    163.    move.w   d4,(a1)+       ; write to destination
    164.    addq.w   #1,d4           ; increment
    165.    dbra   d2,.loop       ; repeat
    166.    jmp   (a5)       ; EniDec_Loop
    167.    if _Eni_RemoveJmpTable<>0
    168.    enidecpad16 EniDec_Sub0
    169. EniDec_Sub2:   ; for 6-bit commands, bit 4 of d1 is the top repeat-count bit, hence the duplicate of Sub0
    170. .loop:
    171.    move.w   d4,(a1)+       ; write to destination
    172.    addq.w   #1,d4           ; increment
    173.    dbra   d2,.loop       ; repeat
    174.    jmp   (a5)       ; EniDec_Loop
    175.    enidecpad16 EniDec_Sub2
    176.    endif
    177. ; ---------------------------------------------------------------------------
    178. EniDec_Sub4:
    179. .loop:
    180.    move.w   a4,(a1)+       ; write to destination
    181.    dbra   d2,.loop       ; repeat
    182.    jmp   (a5)       ; EniDec_Loop
    183.    if _Eni_RemoveJmpTable<>0
    184.    enidecpad16 EniDec_Sub4
    185. EniDec_Sub6:   ; duplicate of Sub4, for the same reason as Sub2
    186. .loop:
    187.    move.w   a4,(a1)+       ; write to destination
    188.    dbra   d2,.loop       ; repeat
    189.    jmp   (a5)       ; EniDec_Loop
    190.    enidecpad16 EniDec_Sub6
    191.    endif
    192. ; ---------------------------------------------------------------------------
    193. EniDec_Sub8:
    194.    bsr.s   EniDec_GetInlineCopyVal
    195. .loop:
    196.    move.w   d1,(a1)+
    197.    dbra   d2,.loop
    198.    jmp   (a5)       ; EniDec_Loop
    199.    if _Eni_RemoveJmpTable<>0
    200.    enidecpad16 EniDec_Sub8
    201.    endif
    202. ; ---------------------------------------------------------------------------
    203. EniDec_SubA:
    204.    bsr.s   EniDec_GetInlineCopyVal
    205. .loop:
    206.    move.w   d1,(a1)+
    207.    addq.w   #1,d1
    208.    dbra   d2,.loop
    209.    jmp   (a5)       ; EniDec_Loop
    210.    if _Eni_RemoveJmpTable<>0
    211.    enidecpad16 EniDec_SubA
    212.    endif
    213. ; ---------------------------------------------------------------------------
    214. EniDec_SubC:
    215.    bsr.s   EniDec_GetInlineCopyVal
    216. .loop:
    217.    move.w   d1,(a1)+
    218.    subq.w   #1,d1
    219.    dbra   d2,.loop
    220.    jmp   (a5)       ; EniDec_Loop
    221.    if _Eni_RemoveJmpTable<>0
    222.    enidecpad16 EniDec_SubC
    223.    else
    224. ; ---------------------------------------------------------------------------
    225. EniDec_JmpTable:
    226.    bra.s   EniDec_Sub0
    227.    bra.s   EniDec_Sub0   ; Sub2
    228.    bra.s   EniDec_Sub4
    229.    bra.s   EniDec_Sub4   ; Sub6
    230.  
    231.    bra.s   EniDec_Sub8
    232.    bra.s   EniDec_SubA
    233.    bra.s   EniDec_SubC
    234.    ;bra.s   EniDec_SubE   ; fall into SubE
    235.    endif
    236. ; ---------------------------------------------------------------------------
    237. ; EniDec_SubE is truly a special case
    238. EniDec_SubE:
    239.    cmp.w   d3,d2           ; d3 = $F ; is the loop set to 16?
    240.    beq.s   EniDec_End       ; if so, branch (signifies the end of the data)
    241. .loop:
    242.    bsr.s   EniDec_GetInlineCopyVal
    243.    move.w   d1,(a1)+
    244.    dbra   d2,.loop
    245.    jmp   (a5)       ; EniDec_Loop
    246. EniDec_End:
    247.    addq.l   #2,sp           ; release the byte-to-word ram buffer allocated at EniDec
    248.    if _Eni_CompatibilityMode=0
    249.    movem.l   (sp)+,d1-d7/a2-a6
    250.    else
    251.    subq.w   #1,a0           ; this code figures out where a0 should end
    252.    cmpi.w   #16,d6           ; were we going to start on a completely new byte?
    253.    bne.s   .got_byte       ; if not, branch
    254.    subq.w   #1,a0
    255.  
    256. .got_byte:
    257. ; small optimization, saves 8-10 cycles
    258.    move.w   a0,d0
    259.    andi.w   #1,d0
    260.    adda.w   d0,a0           ; ensure we're on an even byte
    261.  
    262.    movem.l   (sp)+,d0-d7/a1-a6
    263.    endif
    264.    rts
    265. ; ===========================================================================
    266.  
    267. EniDec_GetInlineCopyVal:
    268.    move.w   a3,d3           ; starting art tile (the tile flag bits get merged into d3 below)
    269. ; original didn't need to use a high word
    270. ; this is a 4 cycle loss, though it's usually made up for everywhere else
    271.    move.l   d2,d1           ; get vram tile flags
    272.    swap   d1           ; (it's in the high word of d2)
    273.    enidec_checktileflags 15,0   ; one call per permit bit of PCCHV000, highest bit first
    274.    enidec_checktileflags 14,1
    275.    enidec_checktileflags 13,1
    276.    enidec_checktileflags 12,0
    277.    enidec_checktileflags 11,0
    278.  
    279.    move.w   d5,d1           ; d1 = copy of the current bitstream word
    280.    move.w   d6,d7           ; get remaining bits
    281.    sub.w   a2,d7           ; subtract minimum bit number
    282.    bhs.s   .got_enough       ; if we're beyond that, branch
    283.    move.w   d7,d6           ; d7 is negative here (not enough bits left)
    284.    addi.w   #16,d6           ; 16 bits = 2 bytes
    285.    neg.w   d7           ; calculate bit deficit
    286.    lsl.w   d7,d1           ; make space for this many bits
    287.    move.b   (a0),d5           ; get next byte
    288.    rol.b   d7,d5           ; make the upper X bits the lower X bits
    289.    add.w   d7,d7           ; deficit * 2 = word offset into .andvalues
    290.    and.w   .andvalues-2(pc,d7.w),d5; only keep X lower bits
    291.    add.w   d5,d1           ; compensate for the bit deficit
    292. .got_field:
    293.    move.w   a2,d0
    294.    add.w   d0,d0           ; bit count * 2 = word offset into .andvalues
    295.    and.w   .andvalues-2(pc,d0.w),d1; only keep as many bits as required
    296.    add.w   d3,d1           ; add starting art tile
    297.  
    298. ;   move.b   (a0)+,d5   ; 08 ; get current byte, move onto next byte
    299. ;   lsl.w   #8,d5       ; 22 ; shift up by a byte
    300. ;   move.b   (a0)+,d5   ; 08 ; store next byte in lower register byte
    301.                ; 38
    302.  
    303. ; saves 4 cycles per branch, at the cost of saving and restoring a6, and setting up the register
    304. ; those caveats add around 24 cycles, but from my tests, it usually results in a speedup
    305.    move.b   (a0)+,(a6)+   ; 12 ; temporarily write into the destination
    306.    move.b   (a0)+,(a6)+   ; 12
    307.    move.w   -(a6),d5   ; 10 ; move result to d5, set destination back to correct spot
    308.                ; 34
    309.    rts
    310. ; ---------------------------------------------------------------------------
    311. .andvalues:   ; entry N (1-based) is a mask with the lowest N bits set; indexed as .andvalues-2 + N*2
    312.    dc.w    1,    3,    7,   $F
    313.    dc.w   $1F,  $3F,  $7F,  $FF
    314.    dc.w  $1FF, $3FF, $7FF, $FFF
    315.    dc.w $1FFF,$3FFF,$7FFF,$FFFF
    316. ; ---------------------------------------------------------------------------
    317. .got_exact:
    318.    moveq   #16,d6       ; 16 bits = 2 bytes
    319.    bra.s   .got_field   ; .got_field masks d1 and then reloads d5 with the next two bytes
    320. ; ---------------------------------------------------------------------------
    321. .got_enough:
    322.    beq.s   .got_exact   ; if the exact number of bits are leftover, branch
    323.    lsr.w   d7,d1       ; remove unneeded bits
    324.    move.w   a2,d0
    325.    add.w   d0,d0
    326.    and.w   .andvalues-2(pc,d0.w),d1   ; only keep as many bits as required
    327.    add.w   d3,d1       ; add starting art tile
    328.    move.w   a2,d0       ; store number of bits used up by inline copy
    329. ;   bra.s   EniDec_ChkGetNextByte   ; move onto next byte
    330. EniDec_ChkGetNextByte:   ; in: d0 = number of bits just consumed ; .got_enough falls through into here
    331.    sub.w   d0,d6       ; subtract d0 from d6
    332.    cmpi.w   #8,d6       ; has it hit 8 or lower?
    333.    bhi.s   .nonewbyte   ; if not, branch
    334.    addq.w   #8,d6       ; 8 bits = 1 byte
    335. ; shift lowest byte to highest byte, and load a new value into low byte
    336.    asl.w   #8,d5       ; 22 ; shift up by a byte
    337.    move.b   (a0)+,d5   ; 08 ; store next byte in lower register byte
    338.                ; 30
    339.  
    340. ;   move.b   d5,(a6)+   ; 08
    341. ;   move.b   (a0)+,(a6)+   ; 12
    342. ;   move.w   -(a6),d5   ; 10
    343.                ; 30, sad.
    344. .nonewbyte:
    345.    rts
    346. ; ---------------------------------------------------------------------------
    347.  
    348.    popo   ; restore previous options
    349.  
    350.  
     
  3. RealMalachi

    RealMalachi

    you can call me mal Member
    I'm doing another release, for two reasons:

    Firstly, Orion ported it to ASM68K. I knew it was possible, but didn't bother; mainly due to my lack of knowledge on ASM68K macros. Now there's native support... with another flag. _Eni_Assembler changes what assembler it targets: 0 for ASM68K, 1 for AS. Big thanks Orion, I'll definitely be taking notes from this for the future.

    Secondly, my original implementation had a race condition with some stack data, potentially causing issues with decompression when interrupts are enabled.
    Register a6 is used to retrieve potentially odd-addressed word-sized data (reading such a word directly would cause an address error). To do that quickly without requiring extra RAM, a6 was positioned just past the end of the stack space that the decompressor's code uses. However, that spot is unprotected from, say, vertical interrupts, meaning there is a short window where the data can be corrupted. This can vary wildly in severity, but most recorded cases are dire.
    For example, Sonic 1 uses Enigma compression for its block data, and those can sometimes break, resulting in this:
    [​IMG]
    Now it properly allocates a safe area of stack, instead of being just outside of it. Thanks to DSK for finding this before I did, that really helped to narrow it down.

    I'll be keeping the original file up, but highly recommend using v1.1, or whatever the highest revision is at the time. The zip file updates will be uploaded onto the first post, so you don't have to dig for it.
    v1.1 code
    Code (Text):
    1. ; ---------------------------------------------------------------------------
    2. ; Enigma Decompression Algorithm
    3. ; For format explanation see http://info.sonicretro.org/Enigma_compression
    4. ; this one is optimised from the original, but with the more rom-intensive
    5. ; speedups locked behind some flags down below
    6. ; ---------------------------------------------------------------------------
    7. ; proper ASM68K support added by OrionNavattan
    8. _Eni_Assembler:        equ 1    ; ASM68K = 0, AS = 1
    9. ; ---------------------------------------------------------------------------
    10. ; INPUTS:
    11. ; d0 = starting art tile (added to each 8x8 before writing to destination)
    12. ; a0 = source address
    13. ; a1 = destination address
    14. ; TRASHES:
    15. ; d0,a0,a1
    16. ; STACK:
    17. ; - saved registers d1-d7/a2-a6 (12x4 bytes), or d0-d7/a1-a6 (14x4 bytes) in compatibility mode
    18. ; - 4 bytes for one bsr (EniDec_GetInlineCopyVal and EniDec_ChkGetNextByte)
    19. ; - 2 bytes for word conversion
    20. ; ---------------------------------------------------------------------------
    21. ; equ instead of = for ASM68K compatibility
    22. _Eni_CompatibilityMode:    equ 1
    23. ; if 1, stay compatible with the original Enigma
    24. ; (they saved d0 and a1, and made a0 point to the end of the file)
    25. _Eni_EvenAligned:    equ 0
    26. ; if 1, allows Enigma compressed files to be at an odd numbered address
    27. _Eni_RemoveJmpTable:    equ 0
    28. ; if 1, saves 22 cycles per loop (12 for SubE) at the cost of some rom space
    29. _Eni_InlineBitStream:    equ 0
    30. ; if 1, inlines ChkGetNextByte in EniDec_Loop, for a speedup of 34 cycles per loop
    31. ; funny how this simple speedup greatly overshadows _Eni_RemoveJmpTable
    32. ; that one required infinitely more effort than this. oh well.
    33.  
    34. ; macro explanations
    35. ; enidecpad16:
    36. ; - for RemoveJmpTable, routines need to be aligned in 16($10) byte chunks
    37. ;   none of the routines can exceed that boundary, or the code won't work
    38. ;   the only exception to this is SubE; the last one
    39. ; enidec_checktileflags:
    40. ; - this was just repetitive
    41.     if _Eni_Assembler=0    ; ASM68K versions of the macros
    42.     pusho            ; save current options
    43.     opt l.            ; use "." as local label symbol
    44. enidecpad16: macro routine
    45.     if (*-routine)>16    ; if the code since the routine label exceeds 16 bytes, throw an error
    46.     inform 3,"ADDR ERROR - EXCEED: routine exceeds 16 bytes! ($%h bytes)",*-routine
    47.     elseif (*-routine)<16    ; if it's below 16, pad it to 16
    48. ;    inform 0,"routine got padded by $%h bytes",16-(*-routine)   ; debug
    49.     dcb.b 16-(*-routine),0
    50.     endif
    51.     endm
    52. enidec_checktileflags: macro bit,setmode
    53.     add.b    d1,d1        ; shift the next flag out of the top of d1.b into carry
    54.     bcc.s    .skip\@        ; if that bit wasn't set, branch
    55.     subq.w    #1,d6        ; consume one stream bit; d6 is now its bit number in d5
    56.     btst    d6,d5        ; is this tile flag bit set?
    57.     beq.s    .skip\@        ; if not, branch
    58.     if setmode=0        ; setmode selects how the flag is merged into d3
    59.     ori.w    #1<<bit,d3    ; setmode 0: OR the flag bit into d3
    60.     else
    61.     addi.w    #1<<bit,d3    ; otherwise: add the flag bit to d3
    62.     endif
    63. .skip\@:
    64.     endm
    65.     else            ; AS versions of the same macros
    66. enidecpad16 macro routine
    67.     if *-routine>16        ; if the code since the routine label exceeds 16 bytes, throw an error
    68.     fatal "ADDR ERROR - EXCEED: routine exceeds 16 bytes! ($\{*-routine} bytes)"
    69.     elseif *-routine<16    ; if it's below 16, pad it to 16
    70. ;    message "routine got padded by $\{16-(*-routine)} bytes"    ; debug
    71.     dc.b [16-(*-routine)]$69
    72.     endif
    73.     endm
    74. enidec_checktileflags macro bit,setmode
    75.     add.b    d1,d1        ; shift the next flag out of the top of d1.b into carry
    76.     bcc.s    .skip        ; if that bit wasn't set, branch
    77.     subq.w    #1,d6        ; consume one stream bit; d6 is now its bit number in d5
    78.     btst    d6,d5        ; is this tile flag bit set?
    79.     beq.s    .skip        ; if not, branch
    80.     if setmode=0        ; setmode selects how the flag is merged into d3
    81.     ori.w    #1<<bit,d3    ; setmode 0: OR the flag bit into d3
    82.     else
    83.     addi.w    #1<<bit,d3    ; otherwise: add the flag bit to d3
    84.     endif
    85. .skip
    86.     endm
    87.     endif
    88. ; ===========================================================================
    89.  
    90. EniDec:
    91.     if _Eni_CompatibilityMode=0
    92.     movem.l    d1-d7/a2-a6,-(sp)    ; d0, a0 and a1 are not saved in this mode
    93.     else
    94.     movem.l    d0-d7/a1-a6,-(sp)    ; compatibility mode: also preserve d0 and a1
    95.     endif
    96.  
    97. ; compared to my original implementation, this prevents a race condition
    98. ; big thanks to DSK for finding this first
    99.     subq.l    #2,sp        ; allocate a 2-byte scratch word on the stack
    100.     lea    (sp),a6        ; a6 = scratch word, used to build a word out of two byte reads
    101.  
    102. ; set subroutine loop address
    103. ; compared to a bra, jmp (aN) saves 2 cycles per-loop
    104.     lea    EniDec_Loop(pc),a5    ; a5 = EniDec_Loop; the handlers return with jmp (a5)
    105.  
    106.     movea.w    d0,a3        ; a3 = starting art tile
    107.  
    108.     move.b    (a0)+,d0    ; first header byte
    109.     ext.w    d0        ; extend it to a word
    110.     movea.w    d0,a2        ; a2 = number of bits in an inline copy value
    111.  
    112.     move.b    (a0)+,d0    ; 000PCCHV ; second header byte: vram flag permits
    113.     lsl.b    #3,d0        ; PCCHV000 ; left-align so add.b can shift each flag into carry
    114.     move.w    d0,d2        ; copy to d2...
    115.     swap    d2        ; ...and keep it in the high word; the low word holds the repeat count
    116. ; set increment word
    117.     if _Eni_EvenAligned=0
    118.     move.w    (a0)+,d4    ; direct word read (needs a0 to be even)
    119.     else
    120.     move.b    (a0)+,(a6)+    ; copy two bytes into the scratch word...
    121.     move.b    (a0)+,(a6)+
    122.     move.w    -(a6),d4    ; ...and read them back as one word (a6 returns to its start)
    123.     endif
    124.     add.w    a3,d4        ; add starting art tile
    125. ; set static word
    126.     if _Eni_EvenAligned=0
    127.     move.w    (a0)+,d0    ; direct word read (needs a0 to be even)
    128.     else
    129.     move.b    (a0)+,(a6)+    ; same byte-wise read through the scratch word
    130.     move.b    (a0)+,(a6)+
    131.     move.w    -(a6),d0
    132.     endif
    133.     add.w    a3,d0        ; add starting art tile
    134.     movea.w    d0,a4        ; store in a4 (moves and adds are faster on dN.w, saves 4 cycles)
    135. ; preload the first 16 bits of the bitstream into d5
    136.     if _Eni_EvenAligned=0
    137.     move.w    (a0)+,d5    ; direct word read (needs a0 to be even)
    138.     else
    139.     move.b    (a0)+,(a6)+    ; same byte-wise read through the scratch word
    140.     move.b    (a0)+,(a6)+
    141.     move.w    -(a6),d5
    142.     endif
    143. ; set bit counter
    144.     moveq    #16,d6        ; d6 = number of unread bits in d5 (16 bits = 2 bytes)
    145. EniDec_Loop:
    146.     moveq    #7,d0            ; assume a 7-bit entry header (TTTSSSS)
    147.     move.w    d6,d7            ; d7 = unread bits in d5...
    148.     sub.w    d0,d7            ; ...minus 7 = shift that brings the next 7 bits down to bits 6-0
    149.     move.w    d5,d1            ; d1 = copy of the bit buffer
    150.     lsr.w    d7,d1            ; next 7 unread bits are now in bits 6-0 of d1
    151.  
    152.     move.w    d1,d2            ; keep a copy in d2 for the repeat count
    153.     andi.w    #%01110000,d1        ; d1 = the 3 type bits, still in bits 6-4
    154.  
    155.     cmpi.w    #1<<6,d1        ; is the top type bit (bit 6) set?
    156.     bhs.s    .prcocess7bits        ; if it is, the header really is 7 bits
    157.     moveq    #6,d0            ; if not, process 6 bits instead of 7
    158.     lsr.w    #1,d2            ; bitfield is TTSSSS: drop the extra bit so SSSS sits in bits 3-0
    159. .prcocess7bits:
    160.     if _Eni_InlineBitStream=0
    161.     bsr.w    EniDec_ChkGetNextByte    ; consume d0 bits, refill d5 if needed; doesn't touch d1 or d2
    162.     else
    163. ;EniDec_ChkGetNextByte:
    164.     sub.w    d0,d6        ; d6 = unread bits left in d5 after consuming d0 bits
    165.     cmpi.w    #8,d6        ; are 8 or fewer unread bits left?
    166.     bhi.s    .nonewbyte    ; if more than 8 remain, no refill is needed
    167.     addq.w    #8,d6        ; the new byte adds 8 unread bits
    168.  
    169.     asl.w    #8,d5        ; move the low byte up into the high byte
    170.     move.b    (a0)+,d5    ; append the next source byte as the new low byte
    171. .nonewbyte:
    172.     endif
    173.  
    174.     moveq    #$F,d3            ; d3 = $F: count mask, also compared against in SubE
    175.     and.w    d3,d2            ; d2 = repeat count (the handlers' dbra loops run d2+1 times)
    176.     if _Eni_RemoveJmpTable=0
    177. ; each JmpTable entry is a 2-byte bra.s, so the type index is scaled by 2
    178. ; the table has no entry for SubE: index 7 lands on EniDec_SubE, which directly follows the table
    179.     lsr.w    #4-1,d1            ; d1 = type (0-7) multiplied by 2
    180.     jmp    EniDec_JmpTable(pc,d1.w)
    181.     else
    182. ; d1 is already type*16 and every handler is padded to 16 bytes, so d1 indexes the handlers directly
    183. ; SubE exceeds this, but it's the last one so it doesn't matter
    184.     jmp    EniDec_Sub0(pc,d1.w)
    185.     endif
    186. ; ---------------------------------------------------------------------------
    187. EniDec_Sub0:
    188. .loop:
    189.     move.w    d4,(a1)+        ; write the incrementing word to the destination
    190.     addq.w    #1,d4            ; advance it for the next write
    191.     dbra    d2,.loop        ; repeat d2+1 times
    192.     jmp    (a5)        ; EniDec_Loop
    193.     if _Eni_RemoveJmpTable<>0    ; without the jump table, type index 1 needs its own padded copy
    194.     enidecpad16 EniDec_Sub0
    195. EniDec_Sub2:
    196. .loop:
    197.     move.w    d4,(a1)+        ; write the incrementing word to the destination
    198.     addq.w    #1,d4            ; advance it for the next write
    199.     dbra    d2,.loop        ; repeat d2+1 times
    200.     jmp    (a5)        ; EniDec_Loop
    201.     enidecpad16 EniDec_Sub2
    202.     endif
    203. ; ---------------------------------------------------------------------------
    204. EniDec_Sub4:
    205. .loop:
    206.     move.w    a4,(a1)+        ; write the static word (a4) to the destination
    207.     dbra    d2,.loop        ; repeat d2+1 times
    208.     jmp    (a5)        ; EniDec_Loop
    209.     if _Eni_RemoveJmpTable<>0    ; without the jump table, type index 3 needs its own padded copy
    210.     enidecpad16 EniDec_Sub4
    211. EniDec_Sub6:
    212. .loop:
    213.     move.w    a4,(a1)+        ; write the static word (a4) to the destination
    214.     dbra    d2,.loop        ; repeat d2+1 times
    215.     jmp    (a5)        ; EniDec_Loop
    216.     enidecpad16 EniDec_Sub6
    217.     endif
    218. ; ---------------------------------------------------------------------------
    219. EniDec_Sub8:
    220.     bsr.s    EniDec_GetInlineCopyVal    ; d1 = inline value read from the bitstream
    221. .loop:
    222.     move.w    d1,(a1)+        ; write the same value every time
    223.     dbra    d2,.loop        ; repeat d2+1 times
    224.     jmp    (a5)        ; EniDec_Loop
    225.     if _Eni_RemoveJmpTable<>0    ; pad to 16 bytes so d1 can index the handlers directly
    226.     enidecpad16 EniDec_Sub8
    227.     endif
    228. ; ---------------------------------------------------------------------------
    229. EniDec_SubA:
    230.     bsr.s    EniDec_GetInlineCopyVal    ; d1 = inline value read from the bitstream
    231. .loop:
    232.     move.w    d1,(a1)+        ; write the current value
    233.     addq.w    #1,d1            ; count up for the next write
    234.     dbra    d2,.loop        ; repeat d2+1 times
    235.     jmp    (a5)        ; EniDec_Loop
    236.     if _Eni_RemoveJmpTable<>0    ; pad to 16 bytes so d1 can index the handlers directly
    237.     enidecpad16 EniDec_SubA
    238.     endif
    239. ; ---------------------------------------------------------------------------
    240. EniDec_SubC:
    241.     bsr.s    EniDec_GetInlineCopyVal    ; d1 = inline value read from the bitstream
    242. .loop:
    243.     move.w    d1,(a1)+        ; write the current value
    244.     subq.w    #1,d1            ; count down for the next write
    245.     dbra    d2,.loop        ; repeat d2+1 times
    246.     jmp    (a5)        ; EniDec_Loop
    247.     if _Eni_RemoveJmpTable<>0    ; pad to 16 bytes so d1 can index the handlers directly
    248.     enidecpad16 EniDec_SubC
    249.     else
    250. ; ---------------------------------------------------------------------------
    251. EniDec_JmpTable:
    252.     bra.s    EniDec_Sub0    ; one 2-byte entry per type index, reached via jmp EniDec_JmpTable(pc,d1.w)
    253.     bra.s    EniDec_Sub0    ; Sub2
    254.     bra.s    EniDec_Sub4
    255.     bra.s    EniDec_Sub4    ; Sub6
    256.  
    257.     bra.s    EniDec_Sub8
    258.     bra.s    EniDec_SubA
    259.     bra.s    EniDec_SubC
    260.     ;bra.s    EniDec_SubE    ; not needed: the eighth slot is the start of SubE itself
    261.     endif
    262. ; ---------------------------------------------------------------------------
    263. ; EniDec_SubE is truly a special case
    264. EniDec_SubE:
    265.     cmp.w    d3,d2            ; d3 = $F ; is the repeat count field $F?
    266.     beq.s    EniDec_End        ; if so, branch (signifies the end of the data)
    267. .loop:
    268.     bsr.s    EniDec_GetInlineCopyVal    ; read a fresh inline value for every write
    269.     move.w    d1,(a1)+        ; write it to the destination
    270.     dbra    d2,.loop        ; repeat d2+1 times
    271.     jmp    (a5)        ; EniDec_Loop
    272. EniDec_End:
    273.     addq.l    #2,sp        ; deallocate the 2-byte scratch word
    274.  
    275.     if _Eni_CompatibilityMode=0
    276.     movem.l    (sp)+,d1-d7/a2-a6    ; restore the registers saved at EniDec
    277.     else
    278. ; this code figures out where a0 should end
    279.     subq.w    #1,a0            ; step back over one byte that was read ahead into d5
    280.     cmpi.w    #16,d6            ; were we going to start on a completely new byte?
    281.     bne.s    .got_byte        ; if not, branch
    282.     subq.w    #1,a0            ; d6 = 16: no bit of d5 was consumed, so step back one more byte
    283. .got_byte:
    284.     if _Eni_EvenAligned=0    ; TODO: thorough testing
    285. ; Orion: small optimization, saves 8-10 cycles
    286.     move.w    a0,d0            ; d0 = low word of a0 (d0 is restored by the movem below)
    287.     andi.w    #1,d0            ; d0 = 1 if a0 is odd, 0 if it is even
    288.     adda.w    d0,a0            ; ensure we're on an even byte (rounds up)
    289.     endif
    290.  
    291.     movem.l    (sp)+,d0-d7/a1-a6    ; restore the registers saved at EniDec; a0 keeps its new value
    292.     endif
    293.     rts
    294. ; ===========================================================================
    295.  
    296. EniDec_GetInlineCopyVal:
    297.     move.w    a3,d3            ; d3 = starting art tile; permitted flag bits are merged in below
    298. ; original didn't need to use a high word
    299. ; this is a 4 cycle loss, though it's usually made up for everywhere else
    300.     move.l    d2,d1            ; get vram tile flags
    301.     swap    d1            ; (it's in the high word of d2)
    302.     enidec_checktileflags 15,0
    303.     enidec_checktileflags 14,1
    304.     enidec_checktileflags 13,1
    305.     enidec_checktileflags 12,0
    306.     enidec_checktileflags 11,0
    307.  
    308.     move.w    d5,d1            ; d1 = copy of the bit buffer
    309.     move.w    d6,d7            ; get remaining bits
    310.     sub.w    a2,d7            ; subtract the inline value's bit width (a2)
    311.     bhs.s    .got_enough        ; if d5 still holds enough bits, branch
    312.     move.w    d7,d6            ; d7 is negative here: minus the number of missing bits
    313.     addi.w    #16,d6            ; d6 = 16 - missing bits: unread count once d5 is reloaded at .got_field
    314.     neg.w    d7            ; calculate bit deficit
    315.     lsl.w    d7,d1            ; make space for this many bits
    316.     move.b    (a0),d5            ; peek at the next byte (a0 is not advanced)
    317.     rol.b    d7,d5            ; make the upper X bits the lower X bits
    318.     add.w    d7,d7            ; scale to a word index into .andvalues
    319.     and.w    .andvalues-2(pc,d7.w),d5; only keep X lower bits
    320.     add.w    d5,d1            ; compensate for the bit deficit
    321. .got_field:
    322.     move.w    a2,d0            ; d0 = inline value bit width
    323.     add.w    d0,d0            ; scale to a word index into .andvalues
    324.     and.w    .andvalues-2(pc,d0.w),d1; only keep as many bits as required
    325.     add.w    d3,d1            ; add starting art tile (and any flag bits merged into d3)
    326.  
    327. ;    move.b    (a0)+,d5    ; 08 ; get current byte, move onto next byte
    328. ;    lsl.w    #8,d5        ; 22 ; shift up by a byte
    329. ;    move.b    (a0)+,d5    ; 08 ; store next byte in lower register byte
    330.                 ; 38
    331.  
    332. ; saves 4 cycles per branch, at the cost of saving and restoring a6, and setting up the register
    333. ; those caveats add around 24 cycles, but from my tests, it usually results in a speedup
    334.     move.b    (a0)+,(a6)+    ; 12 ; copy the next source byte into the scratch word (a6)
    335.     move.b    (a0)+,(a6)+    ; 12 ; and the byte after it
    336.     move.w    -(a6),d5    ; 10 ; reload d5 with both bytes; a6 is back at the start of the scratch word
    337.                 ; 34
    338.     rts
    339. ; ---------------------------------------------------------------------------
    340. .andvalues:                ; masks for the low 1 to 16 bits, indexed by (bit count*2)-2
    341.     dc.w     1,    3,    7,   $F
    342.     dc.w   $1F,  $3F,  $7F,  $FF
    343.     dc.w  $1FF, $3FF, $7FF, $FFF
    344.     dc.w $1FFF,$3FFF,$7FFF,$FFFF
    345. ; ---------------------------------------------------------------------------
    346. .got_exact:
    347.     moveq    #16,d6        ; d5 is used up entirely; .got_field reloads all 16 bits
    348.     bra.s    .got_field
    349. ; ---------------------------------------------------------------------------
    350. .got_enough:
    351.     beq.s    .got_exact    ; if the exact number of bits are leftover, branch
    352.     lsr.w    d7,d1        ; shift out the bits that stay unread
    353.     move.w    a2,d0        ; d0 = inline value bit width
    354.     add.w    d0,d0        ; scale to a word index into .andvalues
    355.     and.w    .andvalues-2(pc,d0.w),d1    ; only keep as many bits as required
    356.     add.w    d3,d1        ; add starting art tile (and any flag bits merged into d3)
    357.     move.w    a2,d0        ; d0 = bits consumed, for EniDec_ChkGetNextByte which follows
    358. ;    bra.s    EniDec_ChkGetNextByte    ; move onto next byte
    359. EniDec_ChkGetNextByte:
    360.     sub.w    d0,d6        ; d6 = unread bits left in d5 after consuming d0 bits
    361.     cmpi.w    #8,d6        ; are 8 or fewer unread bits left?
    362.     bhi.s    .nonewbyte    ; if more than 8 remain, no refill is needed
    363.     addq.w    #8,d6        ; the new byte adds 8 unread bits
    364. ; shift lowest byte to highest byte, and load a new value into low byte
    365.     asl.w    #8,d5        ; 22 ; move the low byte up into the high byte
    366.     move.b    (a0)+,d5    ; 08 ; append the next source byte as the new low byte
    367.                 ; 30
    368.  
    369. ;    move.b    d5,(a6)+    ; 08
    370. ;    move.b    (a0)+,(a6)+    ; 12
    371. ;    move.w    -(a6),d5    ; 10
    372.                 ; also 30, so the scratch-word variant gains nothing here
    373. .nonewbyte:
    374.     rts
    375. ; ---------------------------------------------------------------------------
    376.     if _Eni_Assembler=0
    377.     popo            ; restore previous options
    378.     endif