Here's a test program that packs a buffer with optimal parsing and unpacks it later. Hopefully someone will find this useful. Note that this is reference code and is not optimized for speed.

enwik8 packs and codes to 29,511 KB (30,219,262 bytes) in ~13 seconds:

Code:

pack_buffer 13.234 6
matches 10.988 6
optimal parsing 1.509 6
packing 0.221 6

Matching is slower than greedy or lazy parsing because I have to evaluate all matches at every position. I derive LITERAL_COST from the entropy of the current block and MATCH_COST from the entropy of the match indices of the previous block. These are estimates used by the optimal parser, and they are hard-coded as constants here just for reference.

Code:

// Estimated bit costs used by the optimal parser (see post: derived from
// block entropies in the real encoder, hard-coded here for reference).
const int LITERAL_COST = 6;  // bits to code one literal symbol
const int MATCH_COST = 10;   // bits to code one match index
const static int BLOCK_BITS = 24;
const static int TABLE_BITS = 12;
const static int HASH_BITS = 14;
const static int MATCH_MIN = 4;   // shortest match worth coding
const static int MATCH_MAX = 254; // len encoded in a byte
const static int MAX_TESTS = 8;   // max hash-chain nodes probed per position
const static int BLOCK_SIZE = 1 << BLOCK_BITS;
const static int BLOCK_MASK = BLOCK_SIZE - 1;
const static int HASH_SIZE = 1 << HASH_BITS;
const static int HASH_MASK = HASH_SIZE - 1;
// Fix: the following are referenced throughout (Table, EncoderTable,
// model_update, pack_buffer, unpack_buffer) but were never defined in the
// posted code, so it could not compile as shown. TABLE_BITS exists solely
// to size them, which pins down the intended definitions.
const static int TABLE_SIZE = 1 << TABLE_BITS;
const static int TABLE_MASK = TABLE_SIZE - 1;
// "No node" marker matching the memset(-1) initialization of the int32
// root[]/next[] chains; an int node of -1 converts to this in comparisons.
const static unsigned int INVALID_U32 = 0xFFFFFFFFu;
// Candidate matches collected at one input position during pass 1 of
// pack_buffer. Pass 2 (optimal parsing) later overwrites max_len with the
// length the parse actually chose (0 = emit a literal).
struct Slot
{
const static int NUM_ENTRIES = 4; // max candidates kept per position
byte max_len;      // longest candidate found; repurposed as chosen length
byte num_entries;  // valid pairs in lengths[]/indices[]
byte lengths[NUM_ENTRIES];   // candidate lengths, stored in increasing order
uint16 indices[NUM_ENTRIES]; // backwards ring distances paired with lengths[]
};
// Per-context match history: a ring buffer of recent absolute input
// positions. One Table exists per order-1 context (the previous byte).
// Maintained identically by packer and unpacker so transmitted ring
// indices resolve to the same positions on both sides.
struct Table
{
int32 offsets[TABLE_SIZE]; // absolute input position per ring slot (0 = unused)
int16 head;                // next ring slot to overwrite
};
// Encoder-only acceleration index over a Table's ring: hash chains that
// let the match finder visit only slots whose next 4 bytes share a hash.
// Not needed by the decoder, which replays indices directly.
struct EncoderTable
{
int16 hash[TABLE_SIZE]; // hash per ring slot — NOTE(review): written in model_update but never read in the visible code
int32 next[TABLE_SIZE]; // chains ring slots with equal hash, newest first (-1 terminated)
int32 root[HASH_SIZE];  // newest ring slot per hash bucket (-1 = empty)
};
// Knuth-style multiplicative hash of the 4 bytes at in, reduced to the
// top HASH_BITS bits (so the result is already < HASH_SIZE).
// Fix: the original read *(uint32*)in, which violates strict aliasing and
// may fault on alignment-strict targets; memcpy is well-defined and
// compiles to the same single load on mainstream compilers at -O2.
uint32 hash_calc(byte* in)
{
const int SHIFT = 32 - HASH_BITS;
uint32 v;
memcpy(&v, in, sizeof(v));
return (2654435761u * v) >> SHIFT;
}
// Encoder-side model update: record absolute position pos in the context's
// ring at the current head slot, and link that slot into the hash chain
// for the 4 bytes at in + pos. When head wraps, old chain entries still
// point at the recycled slot; the match loop in pack_buffer detects this
// via its "offsets must strictly decrease along the chain" check.
void model_update(Table* table, EncoderTable* enc_table, byte* in, int pos)
{
uint32 hash = hash_calc(in + pos) & HASH_MASK; // mask is redundant — hash_calc already yields < HASH_SIZE
enc_table->hash[table->head] = hash;
enc_table->next[table->head] = enc_table->root[hash]; // prepend: new slot becomes chain head
enc_table->root[hash] = table->head;
table->offsets[table->head] = pos;
table->head = (table->head + 1) & TABLE_MASK;
}
// Decoder-side model update: record position pos at the ring head and
// advance the head, wrapping at TABLE_SIZE. Mirrors the encoder overload
// minus the hash-chain bookkeeping the decoder never needs.
void model_update(Table* table, int pos)
{
int slot = table->head;
table->offsets[slot] = pos;
table->head = (slot + 1) & TABLE_MASK;
}
// Pack in[0..in_size) into a stream of 16-bit symbols in out[]:
//   symbol < 256  : literal byte
//   symbol >= 256 : match length (symbol - 256 + MATCH_MIN), followed by a
//                   second symbol holding the backwards ring index.
// Three passes: (1) collect candidate matches per position via per-context
// hash chains, (2) backward dynamic program choosing the cheapest parse,
// (3) emit symbols. Returns the number of uint16 symbols written.
// NOTE(review): depends on TABLE_SIZE/TABLE_MASK/INVALID_U32, min, ASSERT
// and the byte/uint16/uint32/int32 typedefs being defined elsewhere.
int pack_buffer(byte* in, uint16* out, int in_size)
{
int wpos = 0;
int rpos = 0;
// Per-position candidate matches (pass 1 output, pass 2 input/output).
Slot* slots = new Slot[in_size];
// One history ring + hash index per order-1 context (previous byte).
Table* tables = new Table[0x100];
EncoderTable* enc_tables = new EncoderTable[0x100];
// cost_to_end[p] = estimated bits to code the suffix in[p..in_size).
int* cost_to_end = new int[in_size];
memset(cost_to_end, 0, sizeof(int) * in_size);
memset(slots, 0, sizeof(Slot) * in_size);
for (int i = 0; i < 0x100; i++)
{
tables[i].head = 0;
memset(tables[i].offsets, 0, sizeof(tables[i].offsets)); // 0 marks unused ring slots
// Byte fill of -1 makes every int32 entry -1, the "no node" marker.
memset(enc_tables[i].hash, -1, sizeof(enc_tables[i].hash));
memset(enc_tables[i].root, -1, sizeof(enc_tables[i].root));
memset(enc_tables[i].next, -1, sizeof(enc_tables[i].next));
}
// NOTE(review): sizeof expressions are size_t; "%d" is the wrong printf
// specifier (should be "%zu" or a cast) — works on common ABIs by luck.
printf("Tables: %d MB\n", (sizeof(Table) * 0x100) >> 20);
printf("Encoder tables: %d MB\n", (sizeof(EncoderTable) * 0x100) >> 20);
printf("Slots: %d MB\n", (sizeof(Slot) * in_size) >> 20);
printf("Costs: %d MB\n", (sizeof(int) * in_size) >> 20);
// Pass 1: at every position (needs in[pos-1] as context, hence pos = 1),
// walk the context's hash chain and record up to Slot::NUM_ENTRIES
// candidates of strictly increasing length. The loop bound leaves a
// MATCH_MAX tail so the 4/8-byte compares below never read past the end.
for (int pos = 1; pos < in_size - MATCH_MAX; pos++)
{
byte ctx = in[pos - 1];
Table* t = tables + ctx;
EncoderTable* enc_t = enc_tables + ctx;
Slot* slot = slots + pos;
uint32 hash = hash_calc(in + pos) & HASH_MASK;
int node = enc_t->root[hash];
int match_len = MATCH_MIN - 1; // only keep matches of at least MATCH_MIN
if (node != INVALID_U32)
{
// Loop bound guarantees pos + MATCH_MAX < in_size, so this is pos + MATCH_MAX.
uint32 match_end = min(in_size, pos + MATCH_MAX);
for (int tests = 0; tests < MAX_TESTS; tests++)
{
uint32 offs = t->offsets[node];
if (!offs) { break; } // 0 = ring slot never filled; chain exhausted
uint32 t0 = offs;       // cursor in the earlier (source) copy
uint32 len = pos;       // cursor in the current copy; absolute until the subtract below
// Compare 4 (then 8) bytes at once before the byte-by-byte tail.
// NOTE(review): *(uint32*) reads violate strict aliasing, same as hash_calc.
if (*(uint32*)(in + t0) == *(uint32*)(in + len))
{
t0 += 4;
len += 4;
if (*(uint32*)(in + t0) == *(uint32*)(in + len))
{
t0 += 4;
len += 4;
}
while (len < match_end && in[t0] == in[len])
{
++t0;
++len;
}
}
len -= pos; // convert absolute end cursor to match length
if (len > match_len)
{
ASSERT(t0 > 0);
// Store backwards distance in ring slots (what the decoder is sent),
// not a byte offset — it survives ring wraparound on both sides.
int idx = (t->head - node) & TABLE_MASK;
if (slot->num_entries < Slot::NUM_ENTRIES)
{
slot->lengths[slot->num_entries] = len;
slot->indices[slot->num_entries] = idx;
++slot->num_entries;
}
else
{
// Slot full: replace the last (previous longest) entry.
slot->lengths[slot->num_entries - 1] = len;
slot->indices[slot->num_entries - 1] = idx;
}
match_len = len;
slot->max_len = len;
if (len == MATCH_MAX) { break; } // cannot do better
}
node = enc_t->next[node];
// Stop at chain end, or when offsets stop strictly decreasing — that
// means the chain has wrapped onto recycled ring slots (stale links).
if (node == INVALID_U32 || offs <= t->offsets[node])
{
break;
}
}
}
model_update(t, enc_t, in, pos);
}
// Lazy parsing
//for (int pos = 0; pos < in_size - MATCH_MAX; pos++)
//{
// Slot* slot = slots + pos;
// if (slot->max_len == 0) { continue; }
// for (int j = 1; j <= slot->max_len; j++)
// {
// Slot* slot2 = slots + pos + j;
// if (slot2->max_len - j > slot->max_len)
// {
// slot->max_len = j;
// break;
// }
// }
//}
// Optimal parsing
// Pass 2: backward DP. For each position try every truncation of its
// longest match (truncations below MATCH_MIN are costed as literals) and
// keep the choice minimizing cost to reach the end of the buffer.
for (int pos = in_size - 1; pos >= 0; pos--)
{
Slot* slot = slots + pos;
if (slot->max_len > 0)
{
byte best_len = 0;
int best_cost = in_size * 8; // "infinity": worst case is all raw bytes
for (byte cur_len = slot->max_len; cur_len >= 1; cur_len--)
{
int cur_cost = cost_to_end[pos + cur_len];
cur_cost += (cur_len < MATCH_MIN) ? (LITERAL_COST * cur_len) : MATCH_COST;
if (cur_cost < best_cost)
{
best_len = cur_len;
best_cost = cur_cost;
}
}
slot->max_len = best_len; // repurpose max_len as the chosen parse length
}
if (pos == in_size - 1)
{
cost_to_end[pos] = LITERAL_COST; // last byte can only be a literal
slot->max_len = 0;
}
else if (slot->max_len < MATCH_MIN)
{
cost_to_end[pos] = cost_to_end[pos + 1] + LITERAL_COST;
slot->max_len = 0; // too short to code as a match: literal
}
else
{
// A coded match spends one sym-coder symbol (costed like a literal)
// plus one match-coder index. NOTE(review): the selection loop above
// charges only MATCH_COST, so selection and recorded cost disagree by
// LITERAL_COST — harmless as an estimate, but inconsistent.
cost_to_end[pos] = cost_to_end[pos + slot->max_len] + MATCH_COST + LITERAL_COST;
}
}
printf("Optimized cost: %d KB (%d bits)\n", cost_to_end[0] / (8 * 1024), cost_to_end[0]);
// Pass 3: walk the chosen parse and emit literal / (length, index) symbols.
int matched_bytes = 0;
while (rpos < in_size)
{
Slot* slot = slots + rpos;
if (slot->max_len >= MATCH_MIN)
{
// Pick the first stored candidate that is long enough; entries were
// appended newest-chain-node-first, so this favors the smallest index.
int idx = -1;
for (int i = 0; i < slot->num_entries; i++)
{
if (slot->lengths[i] >= slot->max_len)
{
idx = slot->indices[i];
break;
}
}
ASSERT(idx > -1);
ASSERT(idx < TABLE_SIZE);
out[wpos++] = slot->max_len - MATCH_MIN + 256; // length symbol, range [256, 506]
out[wpos++] = idx;
rpos += slot->max_len;
matched_bytes += slot->max_len;
}
else
{
ASSERT(slot->max_len == 0);
out[wpos++] = in[rpos];
++rpos;
}
}
delete[] cost_to_end;
delete[] slots;
delete[] tables;
delete[] enc_tables;
return wpos;
}

Code:

// Reverse of pack_buffer: expand in_size uint16 symbols from in[] into
// out_size bytes at out[]. The decoder rebuilds the same per-context
// position rings as the packer — one model_update per output byte, using
// the previous byte as context — so transmitted ring indices resolve to
// the same absolute positions. The update order here must stay exactly
// in lockstep with the packer's pass-1 updates.
void unpack_buffer(uint16* in, byte* out, int in_size, int out_size)
{
int wpos = 0;
int rpos = 0;
Table* tables = new Table[0x100];
for (int i = 0; i < 0x100; i++)
{
tables[i].head = 0;
memset(tables[i].offsets, 0, sizeof(tables[i].offsets));
}
while (rpos < in_size)
{
if (in[rpos] < 256)
{
// Literal byte. Update the model for the context of the byte we just
// completed (skipped at wpos == 0, matching the packer's pos = 1 start).
out[wpos] = in[rpos];
if (wpos > 0)
{
model_update(tables + out[wpos - 1], wpos);
}
wpos++;
rpos++;
}
else
{
// Match: length symbol then ring-index symbol.
byte ctx = out[wpos - 1];
int length = in[rpos++] - 256 + MATCH_MIN;
int idx = in[rpos++];
ASSERT(length + 256 - MATCH_MIN < 512);
ASSERT(idx < TABLE_SIZE);
// Convert the backwards ring distance back to an absolute ring slot,
// then to the absolute source position recorded there.
int recon_idx = (tables[ctx].head - idx) & TABLE_MASK;
uint32 src_pos = tables[ctx].offsets[recon_idx];
ASSERT(src_pos < wpos && src_pos > 0); // NOTE(review): uint32 vs int comparison; fine while wpos >= 0
// Byte-by-byte copy: supports overlapping matches and performs the
// same per-position model update the packer did while scanning.
while (length > 0)
{
model_update(tables + out[wpos - 1], wpos);
out[wpos++] = out[src_pos++];
length--;
}
}
}
ASSERT(wpos == out_size);
delete[] tables;
}

Coding is just this:

Code:

// Entropy-code the packed symbol stream. Length symbols lie in
// [256, 506] (max is MATCH_MAX - MATCH_MIN + 256 = 506 < 512), so testing
// bit 8 cleanly separates them from literals (< 256). Each length symbol
// is followed by a ring index coded with its own adaptive model.
for (int i = 0; i < packed_block_size; i++)
{
sym_coder.EncodeSym(io, coded_block[i]);
if (coded_block[i] & 256)
{
i++; // consume the paired index symbol
match_coder.EncodeSym(io, coded_block[i]);
}
}