This is mainly for RichSelian, perhaps others will also find it useful:

Here is the source for an experiment in optimal parsing with 64k tables. enwik8 compresses to 24,617k, with the decoder slightly slower than before (no code changed, only table size and memory usage). I didn't measure encoder timings because I haven't tried a better match finder yet (just a hash chain, not a hash table as usual), so match finding is slow. The optimal-parse path finding itself, run on 16 MB blocks over the full 100 MB, takes 1.5 seconds.

Code:

const int OPTIMAL_NUM_SLOTS = 8; // max (offset, length) match candidates remembered per position
...
// One entry per window position: the match candidates found by the match
// finder at that position, plus the parse decision (best_len) that the
// optimal pass writes back.
struct OptimalSlot
{
int bucket_idx[OPTIMAL_NUM_SLOTS]; // offset/bucket index of each candidate match
int bucket_len[OPTIMAL_NUM_SLOTS]; // match length available at the corresponding bucket_idx
int num_slots; // number of valid entries in bucket_idx/bucket_len
int best_len; // in: longest match found here (0 = none); out: chosen emit length (<= 1 = literal)
};
// Add buckets as more matches are found, when num_slots == OPTIMAL_NUM_SLOTS, the last one is overwritten by longer matches.
...
// Literal cost model for the optimal parse: estimate the entropy cost
// (in bits) of each byte value by Huffman-coding the whole window once.
int bits_for_sym[0x100];
{
Coder<0x100> coder;
coder.MemInit();
// Gather byte frequencies over the entire window.
for (int i = 0; i < window_n; i++)
{
coder.UpdateFreq(window[i]);
}
coder.update_lengths(); // derive Huffman code lengths from the frequencies
for (int i = 0; i < 0x100; i++)
{
bits_for_sym[i] = coder.code_lengths[i];
}
coder.MemFree();
}
// Backward dynamic program: cost_to_end[pos] = cheapest bit count found to
// encode window[pos..window_n). For each position, compare emitting a
// literal against every candidate match length and record the winner in
// slot->best_len (<= 1 means "emit a literal").
for (int pos = window_n - 1; pos >= 0; pos--)
{
OptimalSlot* slot = slots + pos;
// Cost of the literal alternative: literal bits + cost of the remainder.
int cur_cost = bits_for_sym[window[pos]];
if (pos < window_n - 1)
{
cur_cost += cost_to_end[pos + 1];
}
int match_len = 1; // default: literal
if (slot->best_len > 0)
{
int best_len = 0;
// Try every length from the longest candidate down to 2.
for (int cur_len = slot->best_len; cur_len > 1; cur_len--)
{
// Take the first stored candidate whose length covers cur_len.
// NOTE(review): assumes candidate ordering makes "first hit" the
// preferred offset for this length — confirm against bucket insertion.
int idx = -1;
for (int i = 0; i < slot->num_slots; i++)
{
if (slot->bucket_len[i] >= cur_len)
{
idx = slot->bucket_idx[i];
break;
}
}
ASSERT(idx != -1);
// Each offset class imposes a minimum usable match length.
int min_len = min_length_for_offset(idx);
if (min_len > cur_len) { continue; }
// Match cost = cost of the remainder after the match
// + length code (average symbol cost + extra bits)
// + distance code (average symbol cost + extra bits).
int match_cost = cost_to_end[pos + cur_len];
int lidx = base_idx(cur_len, lengthcode_base, lnum, lengthcode_bits);
match_cost += average_bits_for_length_sym;
match_cost += lengthcode_bits[lidx];
int didx = base_idx(idx, distcode_base, dnum, distcode_bits);
match_cost += average_bits_for_distance_sym;
match_cost += distcode_bits[didx];
if (match_cost < cur_cost)
{
best_len = cur_len;
cur_cost = match_cost;
}
}
// best_len stays 0 if no match beat the literal; the forward pass
// treats anything <= 1 as a literal.
match_len = best_len;
}
slot->best_len = match_len;
cost_to_end[pos] = cur_cost;
}
// Forward pass: walk the chosen parse from the window start and emit the
// token stream into codeblock. A match becomes two tokens (256 + biased
// length, then the offset index); a literal is the raw byte value.
while (rpos < window_n)
{
OptimalSlot* slot = slots + rpos;
if (slot->best_len > 1)
{
// Re-run the same candidate scan as the cost pass so the identical
// offset index is selected for best_len.
int idx = -1;
for (int i = 0; i < slot->num_slots; i++)
{
if (slot->bucket_len[i] >= slot->best_len)
{
idx = slot->bucket_idx[i];
break;
}
}
ASSERT(idx > -1);
int min_len = min_length_for_offset(idx);
ASSERT(slot->best_len >= min_len);
// Lengths are stored biased by the per-offset minimum length.
codeblock[wpos++] = 256 + slot->best_len - min_len;
codeblock[wpos++] = idx;
rpos += slot->best_len;
}
else
{
// Literal: pass the byte through unchanged (always < 0x100).
codeblock[wpos++] = window[rpos];
++rpos;
}
}
delete[] cost_to_end;
delete[] slots;
...
Set up entropy
...
// Entropy-code the token stream produced by the forward pass.
// Literals (< 0x100) are coded directly; a match is coded as its length
// symbol (+ raw extra bits), then its distance symbol (+ raw extra bits).
for (int i = 0; i < wpos; i++)
{
int sym = codeblock[i];
if (sym < 0x100)
{
// Literal byte.
coder.EncodeSym(io, sym);
num_syms++;
}
else
{
num_syms++;
num_dists++;
int match_len = sym - 256; // biased length written by the forward pass
int match_dist = codeblock[++i]; // offset index follows the length token
int lidx = base_idx(match_len, lengthcode_base, lnum, lengthcode_bits);
int didx = base_idx(match_dist, distcode_base, dnum, distcode_bits);
num_extra_bits += lengthcode_bits[lidx];
num_extra_bits += distcode_bits[didx];
// Length: entropy symbol in the 256+ range, then the raw extra bits.
coder.EncodeSym(io, 256 + lidx);
if (lengthcode_bits[lidx] > 0) { io.Write(match_len - lengthcode_base[lidx], lengthcode_bits[lidx]); }
// Distance: symbol from the separate distance coder, then raw extra bits.
coder_dist.EncodeSym(io, didx);
// FIX: guard on the extra-bit COUNT (mirroring the length path above),
// not on the base value. The old test `distcode_base[didx] > 0` would
// drop the extra bits for any bucket with base == 0 but bits > 0,
// desynchronizing the decoder.
if (distcode_bits[didx] > 0) { io.Write(match_dist - distcode_base[didx], distcode_bits[didx]); }
}
}

I do entropy coding for lengths, distances and literals. Lengths and distances use a log-exp scheme (similar to the WinRAR decoder, but with a 2k max length and 64k max distance); only the logarithm part is entropy coded, and the exponent is stored as raw bits. bits_for_sym is calculated by Huffman coding.

If there are any questions, please ask. This is not optimized at all; I'm still experimenting. Using greedy + lookahead with 64k tables compresses 2-5% worse than the normal 4k tables. Using a BALZ-style predictor is ~0% better, and 1% better if I context-code the flag (match/literal) with an LZMA-style context table (kStates) and don't context-code length or distance.

Edit: Forgot to mention, for optimal parsing to work, I do ROLZ update and match for every symbol, not phrase. 4k tables are worse than before because offsets scroll out faster, and the decoder is slightly slower because now every byte is a (fast) update.