don't click here

Everything That I Know About Sonic the Hedgehog's Source Code

Discussion in 'General Sonic Discussion' started by Clownacy, Mar 30, 2022.

  1. Devon

    Devon

    La mer va embrassé moi et délivré moi lakay. Tech Member
    1,515
    1,863
    93
    your mom
    BenoitRen actually decompiled D.A. Garden, and looking at the code, "unlze" is a direct translation of KosDec, even going as far as to emulate the descriptor field byte swapping via the stack pointer:
    Code (C):
    1. void unlze(unsigned char* pSrc, unsigned char* pDst) { /* Line 105, Address: 0x1003540 */
    2.   short_union sp;
    3.   short sD5, sD4, sD3, sD2, sD1, sD0;
    4.   unsigned char CarryFlg, XFlg;
    5.  
    6.   sp.b.l = *pSrc++; /* Line 110, Address: 0x100356c */
    7.   sp.b.h = *pSrc++; /* Line 111, Address: 0x1003580 */
    8.   sD5 = sp.w; /* Line 112, Address: 0x1003594 */
    9.   sD4 = 15; /* Line 113, Address: 0x10035a0 */
    10.  
    11.   while (1)
    12.   {
    13.     CarryFlg = sD5 & 1; /* Line 117, Address: 0x10035ac */
    14.     XFlg = 0; /* Line 118, Address: 0x10035c0 */
    15.     sD5 = (unsigned short)sD5 >> 1; /* Line 119, Address: 0x10035c4 */
    16.     if (--sD4 < 0) /* Line 120, Address: 0x10035e0 */
    17.     {
    18.       sp.b.l = *pSrc++; /* Line 122, Address: 0x1003604 */
    19.       sp.b.h = *pSrc++; /* Line 123, Address: 0x1003618 */
    20.       sD5 = sp.w; /* Line 124, Address: 0x100362c */
    21.       sD4 = 15; /* Line 125, Address: 0x1003638 */
    22.     }
    23.  
    24.     if (CarryFlg != 0) /* Line 128, Address: 0x1003644 */
    25.     {
    26.       *pDst++ = *pSrc++; /* Line 130, Address: 0x1003650 */
    27.       continue; /* Line 131, Address: 0x1003670 */
    28.     }
    29.  
    30.     sD3 = 0; /* Line 134, Address: 0x1003678 */
    31.     CarryFlg = sD5 & 1; /* Line 135, Address: 0x100367c */
    32.     XFlg = 0; /* Line 136, Address: 0x1003690 */
    33.     if (--sD4 < 0) /* Line 137, Address: 0x1003694 */
    34.     {
    35.       sp.b.l = *pSrc++; /* Line 139, Address: 0x10036b8 */
    36.       sp.b.h = *pSrc++; /* Line 140, Address: 0x10036cc */
    37.       sD5 = sp.w; /* Line 141, Address: 0x10036e0 */
    38.       sD4 = 15; /* Line 142, Address: 0x10036ec */
    39.     }
    40.  
    41.     if (CarryFlg == 0) /* Line 145, Address: 0x10036f8 */
    42.     {
    43.       sD5 = (unsigned short)sD5 >> 1; /* Line 147, Address: 0x1003704 */
    44.       if (--sD4 < 0) /* Line 148, Address: 0x1003720 */
    45.       {
    46.         sp.b.l = *pSrc++; /* Line 150, Address: 0x1003744 */
    47.         sp.b.h = *pSrc++; /* Line 151, Address: 0x1003758 */
    48.         sD5 = sp.w; /* Line 152, Address: 0x100376c */
    49.         sD4 = 15; /* Line 153, Address: 0x1003778 */
    50.       }
    51.  
    52.       CarryFlg = (sD3 & 32768) >> 15; /* Line 156, Address: 0x1003784 */
    53.       sD3 = (unsigned short)sD3 << 1 | XFlg; /* Line 157, Address: 0x100379c */
    54.       XFlg = CarryFlg; /* Line 158, Address: 0x10037c0 */
    55.       ++sD3; /* Line 159, Address: 0x10037c4 */
    56.       sD2 = -1; /* Line 160, Address: 0x10037d0 */
    57.       sD2 = sD2 & 65280 | *pSrc++; /* Line 161, Address: 0x10037dc */
    58.     } /* Line 162, Address: 0x1003810 */
    59.     else
    60.     {
    61.  
    62.       sD0 = *pSrc++; /* Line 166, Address: 0x1003818 */
    63.       sD1 = *pSrc++; /* Line 167, Address: 0x1003834 */
    64.       sD2 = -1; /* Line 168, Address: 0x1003850 */
    65.       sD2 = sD2 & 65280 | (sD1 & 255); /* Line 169, Address: 0x100385c */
    66.       sD2 = (unsigned short)sD2 << 5; /* Line 170, Address: 0x1003888 */
    67.       sD2 = sD2 & 65280 | (sD0 & 255); /* Line 171, Address: 0x10038a4 */
    68.       if (!(sD1 & 7)) /* Line 172, Address: 0x10038d0 */
    69.       {
    70.  
    71.         if ((sD1 = *pSrc++) == 0) return; /* Line 175, Address: 0x10038e4 */
    72.         if (sD1 == 1) continue; /* Line 176, Address: 0x1003918 */
    73.         sD3 = sD1; /* Line 177, Address: 0x100392c */
    74.       } /* Line 178, Address: 0x1003934 */
    75.       else
    76.       {
    77.         sD3 = sD1 + 1; /* Line 181, Address: 0x100393c */
    78.       }
    79.     }
    80.  
    81.     do
    82.     {
    83.       sD0 = pDst[sD2]; /* Line 187, Address: 0x1003958 */
    84.       *pDst++ = (unsigned char)sD0; /* Line 188, Address: 0x1003978 */
    85.     } while (--sD3 >= 0); /* Line 189, Address: 0x100398c */
    86.   } /* Line 190, Address: 0x10039b0 */
    87. } /* Line 191, Address: 0x10039b8 */

    So yeah, I think it's safe to say that KosDec = unlze. You can find it here.

    Example comparisons:
    Code (ASM):
    1.         subq.l  #2,sp   ; make space for 2 bytes on the stack
    2.         move.b  (a0)+,1(sp)
    3.         move.b  (a0)+,(sp)
    4.         move.w  (sp),d5 ; get first description field
    5.         moveq   #$F,d4  ; set to loop for 16 bits
    Code (C):
    1.   sp.b.l = *pSrc++; /* Line 110, Address: 0x100356c */
    2.   sp.b.h = *pSrc++; /* Line 111, Address: 0x1003580 */
    3.   sD5 = sp.w; /* Line 112, Address: 0x1003594 */
    4.   sD4 = 15; /* Line 113, Address: 0x10035a0 */

    Code (ASM):
    1.         lsr.w   #1,d5   ; shift bit into the c flag
    2.         move    sr,d6
    3.         dbf     d4,.chkbit
    4.         move.b  (a0)+,1(sp)
    5.         move.b  (a0)+,(sp)
    6.         move.w  (sp),d5
    7.         moveq   #$F,d4
    8.  
    9. .chkbit:
    Code (C):
    1.     CarryFlg = sD5 & 1; /* Line 117, Address: 0x10035ac */
    2.     XFlg = 0; /* Line 118, Address: 0x10035c0 */
    3.     sD5 = (unsigned short)sD5 >> 1; /* Line 119, Address: 0x10035c4 */
    4.     if (--sD4 < 0) /* Line 120, Address: 0x10035e0 */
    5.     {
    6.       sp.b.l = *pSrc++; /* Line 122, Address: 0x1003604 */
    7.       sp.b.h = *pSrc++; /* Line 123, Address: 0x1003618 */
    8.       sD5 = sp.w; /* Line 124, Address: 0x100362c */
    9.       sD4 = 15; /* Line 125, Address: 0x1003638 */
    10.     }
     
    Last edited: Aug 24, 2024
    • Like Like x 2
    • Informative Informative x 2
    • List
  2. Kilo

    Kilo

    Starting new projects every week Tech Member
    1,243
    1,189
    93
    Canada
    Changes with the weather
    Well it's good to be right even if cgdata_unlze wasn't DrawBackground! Getting the right solution from the wrong formula is somethin I seem to be good at. :eng99:
     
  3. Devon

    Devon

    La mer va embrassé moi et délivré moi lakay. Tech Member
    1,515
    1,863
    93
    your mom
    This is false. Graphics still tend to be compressed in Nemesis (for some reason stage 16x16 blocks are also stored in Nemesis??)
    Actually, looking at the code, it actually seems to emulate how the MCD graphics operation works (it's referred to as "kaiten"), with the calculation functions being direct translations of the original 68000 code.
     
    • Informative Informative x 1
    • List
  4. Kilo

    Kilo

    Starting new projects every week Tech Member
    1,243
    1,189
    93
    Canada
    Changes with the weather
    I forgot about Nemesis, you're right. I meant to say it didn't use much Kosinski compression, since there were only like 6 or so calls to it in your disasm. Granted, neither did Sonic 1 or 2. The point I should've made is just that the chunks are uncompressed so I wasn't going to go looking at the main game to figure out what KosDec was called.
     
  5. Devon

    Devon

    La mer va embrassé moi et délivré moi lakay. Tech Member
    1,515
    1,863
    93
    your mom
    I looked a bit more in the Sonic CD 1996 decomp, and I also noticed these variables, which are definitely related to keeping track of in-progress Nemesis graphics decompressions via the PLC system:
    Code (C):
    1. unsigned short bitdevadr;
    2. short bitdevcnt;
    3. short bitdevcnt2;

    I definitely feel like bitdevcnt is the overall number of tiles left to decompress in a set of graphics data, and bitdevcnt2 tracks how many tiles to decompress in a frame (there are 2 functions for this, 1 "fast" function, and 1 "slow" function, which basically just adjusts how many tiles are decompressed). No clue about bitdevadr.
     
    Last edited: Aug 24, 2024
    • Informative Informative x 1
    • List
  6. Kilo

    Kilo

    Starting new projects every week Tech Member
    1,243
    1,189
    93
    Canada
    Changes with the weather
    bitdevadr is most definitely v_plc_ptrnemcode or $F6E0 (in S1). In fact, it's probably the only variable we can discern here since bitdev count is such a generic variable name that it could probably apply to any of the 7 PLC work variables.
     
  7. Clownacy

    Clownacy

    Tech Member
    1,127
    769
    93
    Oh my god
    OH MY GOD
    HOLY SHIT
    I FOUND KOSINSKI'S ORIGIN

    I Googled 'unlze' and found this: https://notabug.org/pgimeno/z80unlze/src/master/z80unlze.asm

    It's a decompressor for a format called 'LZEXE'. I found a specification for it, and just look at it:
    https://cosmodoc.org/topics/lzexe/

    It's Kosinski! After all these damn years I finally know where Kosinski came from! It's a format for compressing old DOS executables! That explains why it uses little-endian integers, and has that strange dummy data every 0xA000 bytes!

    And guess what? According to LZEXE's developer, the compressor (and format) were derived from Okumura's compressor. You might know Okumura's compressor as the Saxman compressor. That's right: Kosinski is a descendant of Saxman. That also explains why the Kosinski compressor had the same bug/optimisation that Saxman's compressor does; because they use the same code.

    Fucking hell, this has made my day.
     
    • Informative Informative x 10
    • Like Like x 5
    • List
  8. Devon

    Devon

    La mer va embrassé moi et délivré moi lakay. Tech Member
    1,515
    1,863
    93
    your mom
    It comes full circle, especially since @saxman was the one keeping contact with Brett Kosinski all those years ago.
     
  9. saxman

    saxman

    Oldbie Tech Member
    I AM YOUR FATHER
     
  10. BenoitRen

    BenoitRen

    Tech Member
    885
    527
    93
    Thanks, everyone! :)

    I didn't expect to have unwittingly contributed to the answer with my decompilation, much less that the answer would lead us to the origin of the compression format. Awesome!

    I've added this new information to the wiki in Kosinski compression's Origin section, placed at about the same place as the same section on the page about Saxman compression.

    Would it be appropriate to also add information as to how we found the link with LZEXE? For completeness's sake, but also to give credit where credit's due.

    Speaking of coming full circle:
    • 1989/1990: Bellard rewrites Okumura's LZSS implementation, which was written in C, into x86 ASM and releases it as LZEXE.
    • 1990 or 1991: Sega converts LZEXE to 68000 ASM for use in its products.
    • 1996: Sega converts Sonic 3's 68000 ASM, which includes an implementation of LZEXE, to x86 ASM for use in Sonic & Knuckles Collection.
    • 2024: Some guy converts Sonic & Knuckles Collection's x86 ASM to C.
    • 1995: Sega converts Sonic CD's 68000 ASM, which includes an implementation of LZEXE, to C.
     
    Last edited: Aug 24, 2024
  11. Kilo

    Kilo

    Starting new projects every week Tech Member
    1,243
    1,189
    93
    Canada
    Changes with the weather
    Where are you getting SKC from this? We found this through CD.
     
  12. BenoitRen

    BenoitRen

    Tech Member
    885
    527
    93
    We did find this through Sonic CD, but all Sonic games have an implementation of LZEXE, including Sonic 3 (& Knuckles), and it's very likely that their source code shares the routine name.

    I'm currently porting Sonic & Knuckles Collection's version of the routine to C (which is why I wanted to know its official name), which makes the routine's journey come full circle: C -> x86 ASM -> 68000 ASM -> x86 ASM -> C.

    EDIT: Come to think of it, Sega themselves already made it come full circle back in 1995 when they converted Sonic CD's 68000 ASM source code to C.
     
    Last edited: Aug 24, 2024
  13. Brainulator

    Brainulator

    Regular garden-variety member Member
    Nice work regarding Kosinski, I mean LZEXE. Does this mean that we can now call Kosinski+ LZEXE+? :eng99:

    Anyway, about the Nemesis/bitdev variables: it's worth noting that, if this post is to be believed, then bitdev_d6 is $FFF6F4. (dmadivide_a0_3 is $FFFF70, suggesting it refers to Kosinski Moduled (LZEXE Moduled?), which fits with it dividing DMA transfers, pretty much.)
     
  14. Cooljerk

    Cooljerk

    Professional Electromancer Oldbie
    4,984
    652
    93
    This is fucking great, awesome work! Is this going up on any of the wiki pages?
     
  15. Kilo

    Kilo

    Starting new projects every week Tech Member
    1,243
    1,189
    93
    Canada
    Changes with the weather
    I dunno if this was posted here yet, but here's the level tile hierarchy with their names:
    8x8: zoneXXcg
    16x16: zonexxmap
    256x256: zonexxblk
    It might also be possible that zonexxmap is the layout file, since J2ME's layout files are mc_XX_map_data. But map in the MegaDrive games usually refers to tilemaps, which 16x16 is the abstraction level of data that's sent to the VDP when reloading tiles (iirc).
     
  16. BenoitRen

    BenoitRen

    Tech Member
    885
    527
    93
  17. Cooljerk

    Cooljerk

    Professional Electromancer Oldbie
    4,984
    652
    93
    Along the same lines, is there an article discussing naming conventions? I've seen posts throughout a couple of different topics where people explain what these label conventions mean, like how "wk" means "work" and such. It'd be great to have a central guide out there detailing sega's internal styles guide. If one doesn't exist, I'd be willing to start one.
     
  18. Kilo

    Kilo

    Starting new projects every week Tech Member
    1,243
    1,189
    93
    Canada
    Changes with the weather
    Benoit started a page for that: https://info.sonicretro.org/Source_code/Technical_information
    And I've been meaning to contribute since I have some minor gripes with it and how it's laid out, but I first need to get off my ass and get a mod to help me access my old wiki acc.

    Side note because I don't wanna double post, but I think I finally figured out the splash object's name.
    This was something that was of major contention for me for quite some time. Since it's an object spawned by code rather than object placement, it doesn't show up in S2NA's symbol table. So I went off J2ME, which told me it was called mizu, but that was evidently wrong: mizuflag told me that was actually the waterfall slide. Then I thought it was plawa, since it'd be attached to the player, but that turned out to be the bubbles the player generates. And after giving the CD 96 decomp a close read I'm settling on it being exit2.
    What's exit1? No clue. It could also be the case that it can't be called exit since that's a fairly common name for the equivalent of IDA's locrets.
     
    Last edited: Aug 25, 2024
  19. BenoitRen

    BenoitRen

    Tech Member
    885
    527
    93
    I just updated the mentioned wiki page with the decompression routine names and "wk" (thanks for the reference, Cooljerk ;) ).

    I'd like to add more global variables, but it's preferable to add information backed up with a reference. So here's a list of global variables (that is actually a more elaborate version of this list I already made) that were found by cross-referencing symbol data hidden in Sonic Jam with the wiki's RAM values reference:
    • cartridge - Confusingly, this is 0xFFFF when Sonic & Knuckles is not locked on to Sonic 3. If it's zero, Sonic 3 & Knuckles starts.
    • emy_wrt_flg - This one isn't documented on the wiki yet, and I'm not sure if I'm going to add it, because I'm not entirely clear on what it's for. I've encountered it while working on my C port of Sonic & Knuckles Collection, where routines emy_wrt_init and clr_wchg_bit use it as a table to keep track of something. "wrt" suggests it's related to DPLC.
    • mdstatus - According to the wiki, this contains the value of a hardware register, and is used to detect the console's region (Japan or overseas). If the region is Japan, trademark symbols are not displayed, and Tails is called Miles. Sidenote: this variable can also be found as part of Sonic CD PC's global variables, but there it goes unused.
    • pl2ring_f2 - Same as plring_f2, but for player 2. A bitfield used to keep track of the extra lives the player has earned through collecting rings in the current act.
    • plresetflag - Wiki: "If set, the sprite and ring loaders will not clear the respawn table when the level loads. Used for returning from special and bonus stages."
    • ramf1 - Used to point at the scroll plane buffer.
    • ramf8 - Used to point at the sprite table buffer.
    • specspeedmax - The player's maximum speed in the current Special Stage. Is increased as you spend more time completing it.
    • ssonicflag - Super Saiyan Super Sonic flag. Also used for Hyper Sonic, where the value will be 0xFF instead of 1.
    • stageno_is2 - "apparent zone" and "apparent act" from which the Bonus/Special Stage was entered. Because we know that stageno_s2 is the same thing for stageno, it can be deduced that the name of the variable this is a backup of is probably stageno_i.
    • stageno_s2 - Backs up the value of stageno at the time of entering a Bonus/Special Stage.
    • waterflag - The wiki only says that this is a flag that needs to be 1 for other water-related variables to have any effect. Taking a peek at Sonic CD's source code, this only seems to be 1 when there's water on-screen.
    • waterposi - The current water level.
    • whoplay1 - Who player 1 picked as their character in Competition Mode.
    EDIT: Corrected value of cartridge.
     
    Last edited: Aug 31, 2024
    • Informative Informative x 1
    • List
  20. Kilo

    Kilo

    Starting new projects every week Tech Member
    1,243
    1,189
    93
    Canada
    Changes with the weather
    Proposal for reserved object RAM names
    $D040 - scorewk
    $D180 - bariawk
    $D200 - mutekiwk
    $D240 - mutekiwk2
    $D280 - mutekiwk3
    $D2C0 - mutekiwk4
    $D300 - exitwk
    $D340 - plawawk
    $D380 - plawawk2
    $D780 - wavewk
    $D7C0 - wavewk2

    Though the CD 96 code does just use actwk+slot so ¯\_(ツ)_/¯

    I'm also thinking for object ID's $81 (Sonic on the continue screen) and object $87 (Sonic on the ending sequence) they should be called play03 and play04.
     
    Last edited: Aug 25, 2024