Recently I've started a project working on a decompressing and re-compressing MKD archives from a game.
I managed to get an IDA dump of the decompression routine, and put together a test program in C.
That was easy enough, but wow, the format makes no sense, and I have no idea where to start
with recompression, it appears to be a hybrid LZW/RLE implementation.
I'm attaching my code, and would love any suggestions...because I just can't seem to wrap my head around it
for(input_pos=12,output_pos=0,op1=0,op2=0; output_pos<size; op1--,op2>>=1)
{
if(op1==0)
{
op1=8;
op2=input[input_pos++];
}
if(!(op2 & 1)) // uncompressed data, only copies input (seems to be at most8 bytes long)
{
output[output_pos++]=input[input_pos++];
}
else
{
unsigned char low=input[input_pos++]; // these vars are used to calculate block size
unsigned char high=input[input_pos++]; // and data to be repeated
switch(low & 0x0F)
{
case 0:
for(int i=0; i<(input[input_pos]+16); i++) // reuses a previous block of data,
// seems to be able to handle bigger sizes than the
// default switch case
{
output[output_pos]=output[output_pos-((high<<4) + (low>>4))];
output_pos++;
}
input_pos++;
break;
case 1: // repeated data, high=data to repeat, low/16+3= times
for(int i=0; i<((low>>4) + 3); i++)
{
output[output_pos++]=high;
}
break;
case 2:
for(int i=0; i<((high<<4) + (low>>4) + 18); i++) // uncompressed data, it seems that it copies
{ // data blocks with bigger sizes than line 17
output[output_pos++]=input[input_pos++];
}
break;
default: // reuse a previous block of data of size at most 15
for(int i=0; i<(low & 0xF); i++)
{
output[output_pos]=output[output_pos-((high<<4) + (low>>4))];
output_pos++;
}
break;
}
}
}
return output_pos;
}
int usize=((int*)(&cdata[8]))[0]; // uncompressed data size
unsigned char *udata=(unsigned char*)malloc(usize); // create uncompressed buffer
decompress(cdata,udata);
save_file(udata,usize,argv[2]); // save decompressed data to file
return 0;
}
Well, here's a very simple encoder: http://nishi.dreamhosters.com/u/mkd_v0.rar
Implementing a proper LZ encoder with parsing optimization is kinda hard, so it uses deflate encoder (7z in test.bat),
then extracts the deflate stream with reflate's rawdet and removes deflate's entropy coding with raw2dec.
Then mkd_enc (modified dec2unp) encodes deflate's lz tokens using this format.
It doesn't implement literal runs and RLE, so compression is worse than original (matches with dist=4k..32k getting unpacked to literals is probably also bad),
but seems to work otherwise.
Btw, there's also a tricky case with zero distance in matches ( ((high<<4) + (low>>4))==0 ).
I treated it as zero runs, but it can be anything really - it would be good to also have unpacked version of that sample file
(and I mean unpacked with original decoder).
The test.bat supposedly also tests it - 00000000.mrg file there is compressed with mkd_enc.
Then its decoded with the same decoder and md5 hashes are computed (seem to match).
In other words, I test it by decoding original file and re-encoded file with the same decoder, and comparing outputs.
Recently I've started a project working on a decompressing and re-compressing MKD archives from a game.
I managed to get an IDA dump of the decompression routine, and put together a test program in C.
That was easy enough, but wow, the format makes no sense, and I have no idea where to start
with recompression, it appears to be a hybrid LZW/RLE implementation.
I'm attaching my code, and would love any suggestions...because I just can't seem to wrap my head around it
for(input_pos=12,output_pos=0,op1=0,op2=0; output_pos<size; op1--,op2>>=1)
{
if(op1==0)
{
op1=8;
op2=input[input_pos++];
}
if(!(op2 & 1)) // uncompressed data, only copies input (seems to be at most8 bytes long)
{
output[output_pos++]=input[input_pos++];
}
else
{
unsigned char low=input[input_pos++]; // these vars are used to calculate block size
unsigned char high=input[input_pos++]; // and data to be repeated
switch(low & 0x0F)
{
case 0:
for(int i=0; i<(input[input_pos]+16); i++) // reuses a previous block of data,
// seems to be able to handle bigger sizes than the
// default switch case
{
output[output_pos]=output[output_pos-((high<<4) + (low>>4))];
output_pos++;
}
input_pos++;
break;
case 1: // repeated data, high=data to repeat, low/16+3= times
for(int i=0; i<((low>>4) + 3); i++)
{
output[output_pos++]=high;
}
break;
case 2:
for(int i=0; i<((high<<4) + (low>>4) + 18); i++) // uncompressed data, it seems that it copies
{ // data blocks with bigger sizes than line 17
output[output_pos++]=input[input_pos++];
}
break;
default: // reuse a previous block of data of size at most 15
for(int i=0; i<(low & 0xF); i++)
{
output[output_pos]=output[output_pos-((high<<4) + (low>>4))];
output_pos++;
}
break;
}
}
}
return output_pos;
}
int usize=((int*)(&cdata[8]))[0]; // uncompressed data size
unsigned char *udata=(unsigned char*)malloc(usize); // create uncompressed buffer
decompress(cdata,udata);
save_file(udata,usize,argv[2]); // save decompressed data to file
return 0;
}