Regarding http://www.maximumcompression.com/data/dict.php
Edited: I originally claimed that best results were achieved by reversing the dictionary; this was a bug; my code makes the assumption that the word-list is sorted in ascending order. I've fixed the claimed results.
This transform simply encodes each trailing character deleted from the previous word as '<'.
I'd be most interested in any other results and any improvements!
This literally has not been thought out, I just wrote it in five minutes because I thought it'd work.
input:
Code:
1080
10th
1st
2
2nd
output:
Code:
1080<<th<<<st<<<2>nd
('>' denotes same stem as previous word)
Curious fact: I had to special-case the fact that "animate" appears twice in the file.
Program usage (Linux; reports of line-ending problems on Windows):
Code:
g++ -o dic1 dic1.cpp
./dic1 c < english.dic > english.dic1
./dic1 d < english.dic1 > english.dic2
cmp english.dic english.dic2
english.dic is 4,067,439 bytes.
dic1 transform of english.dic is 2,055,914 bytes.
But the point is not to use it as a stand-alone compressor (although it might well be the very fastest one), but rather to compress its output with a mainstream algorithm:
Results with some compressors I had lying around:
Code:
bbb before: 1,170,539 after: 431,644
paq8l after: 418,086
zip (Info-ZIP 3.0 as on Ubuntu) before: 1,049,959 after: 629,219
paq8hp12any after: 416,783
And the source:
Code:
#include <stdio.h>
#include <string.h>
#include <assert.h>
/*
 * Count the leading characters shared by two NUL-terminated strings.
 *
 * a, b: the strings to compare; neither is modified.
 * Returns the length of their longest common prefix. For identical strings
 * this is the full string length.
 */
static int common_prefix(const char* a,const char* b) {
	int common = 0;
	/*
	 * Stop at the terminator. Without this test two identical strings compare
	 * their NULs as equal and the scan runs off the end of both buffers.
	 * Checking *a alone is enough: if only b has ended, *a != *b.
	 */
	while (*a != '\0' && *a == *b) {
		a++;
		b++;
		common++;
	}
	return common;
}
/*
 * dic1 [c|d] -- prefix-delta transform for a word list sorted in ascending
 * order, reading stdin and writing stdout.
 *
 *   c  encode: every word after the first is written relative to the word
 *      before it: one '<' per trailing character of the previous word that
 *      must be dropped, or a single '>' when the whole previous word is kept,
 *      followed by the new suffix. A word identical to its predecessor is
 *      written as a lone '>' and reported on stderr.
 *   d  decode: rebuild the word list, terminating every word with "\r\n".
 *
 * Words must not contain '<' or '>' because those bytes are the markers.
 *
 * Returns 0 on success and -1 on a usage error, an unsupported command,
 * unusable input or an I/O failure.
 */
int main(int argc,char** args) {
	enum {
		LONGEST_LINE = 35, // wc -L says 31
	};
	if (2 != argc) {
		fprintf(stderr,"Usage: dic1 [c|d]\n");
		return -1;
	}
	if (!strcmp(args[1],"c")) {
		char prev[LONGEST_LINE] = {0};
		char word[LONGEST_LINE];
		int prev_len = 0;
		int have_prev = 0; /* nothing to diff the first word against */
		while (fgets(word,sizeof(word),stdin)) {
			size_t n = strlen(word);
			/*
			 * Strip the terminator that is actually present instead of
			 * assuming two bytes of CRLF, so LF-only files and an
			 * unterminated last line keep all of their characters.
			 */
			if (n > 0 && word[n-1] == '\n') {
				word[--n] = '\0';
			} else if (!feof(stdin)) {
				/* fgets filled the buffer mid-line; encoding the
				 * fragments as separate words would corrupt the list. */
				fprintf(stderr,"Error: input line does not fit in %d bytes\n",LONGEST_LINE-1);
				return -1;
			}
			if (n > 0 && word[n-1] == '\r')
				word[--n] = '\0';
			const int len = (int)n;
			if (have_prev && !strcmp(word,prev)) {
				fprintf(stderr,"Oh! %s\n",word);
				putchar('>');
				continue;
			}
			/* word and prev differ here, so the scan stops inside both. */
			const int common = have_prev ? common_prefix(word,prev) : 0;
			if (have_prev) {
				if (common == prev_len)
					putchar('>'); /* previous word is kept whole */
				for (int i = prev_len; i > common; i--)
					putchar('<'); /* drop one trailing character */
			}
			fputs(word+common,stdout);
			memcpy(prev,word,n+1);
			prev_len = len;
			have_prev = 1;
		}
	} else if (!strcmp(args[1],"d")) {
		char word[LONGEST_LINE] = {0}; /* current word; not NUL-terminated */
		int ofs = 0;                   /* number of valid bytes in word */
		int prev = 0;                  /* previous input byte */
		for (int ch = getchar(); ch != EOF; prev = ch, ch = getchar()) {
			if ('>' == ch) {
				/* New word that starts with the whole previous word. */
				fputs("\r\n",stdout);
				fwrite(word,1,(size_t)ofs,stdout);
			} else if ('<' == ch) {
				/* The first '<' of a run ends the previous word. */
				if ('<' != prev)
					fputs("\r\n",stdout);
				if (ofs == 0) {
					fprintf(stderr,"Error: corrupt input: '<' with nothing to delete\n");
					return -1;
				}
				ofs--;
			} else {
				/* First suffix byte after deletions: the surviving
				 * prefix has not been printed on this line yet. */
				if ('<' == prev)
					fwrite(word,1,(size_t)ofs,stdout);
				if (ofs >= LONGEST_LINE) {
					fprintf(stderr,"Error: corrupt input: word longer than %d bytes\n",LONGEST_LINE);
					return -1;
				}
				word[ofs++] = (char)ch;
				putchar(ch);
			}
		}
		fputs("\r\n",stdout);
	} else {
		fprintf(stderr,"Error: command \"%s\" not supported\n",args[1]);
		return -1;
	}
	if (ferror(stdin) || fflush(stdout) == EOF) {
		fprintf(stderr,"Error: I/O failure\n");
		return -1;
	}
	return 0;
}