I need a fast algorithm for estimating the mutual information in files in order to sort them for archiving. The idea is to compress similar files together in the same block. The algorithm I am using assumes that files share information if there are strings of several bytes that appear in both files. To test this, I use a rolling 32-bit hash that depends on the last 6 non-repeating bytes and take samples when the hash value is below a threshold. Then I use the low 8 bits of the hash to index a 256 bit vector and set it to the next higher bit in the hash. To account for files of different sizes, the threshold is initially 2^30 (selecting patterns with probability 1/4) and then decreases by 1/256 after each match. This results in selecting around 1000 to 2000 patterns so that each bit is set 4 to 8 times. The final bit setting determines the signature.
The following shows some results. The first table lists each file, assigns it a number starting with 0, shows the size and the number of pattern matches, then shows the numbers of the 3 closest matches to earlier files sorted by increasing Hamming distance. For example, it shows that the closest matches to world95.txt are files number 10 (paper1), 3 (bib), and 5 (book2) with Hamming distances of 67, 74, and 75 respectively. The second table shows the distance matrix between each file and each of the first 18 files. For testing I added 3 files: empty (size 0), zero (contains all 0 bytes), and r256 (contains random bytes).
The matrix shows that there are no close matches to zero, r256, or to most of the binary files like geo, obj1, pic, a10.jpg, flashmx.pdf. Most of the text files show similarity to each other.Code:>sim empty ..\data\zero ..\data\r256 ..\calgary\* ..\maxcomp\* 0 0 0 empty 1 100000 1173 0=118 ..\data\zero 2 100000 1174 0=123 1=135 ..\data\r256 3 111261 1150 0=65 2=118 1=125 ..\calgary\BIB 4 768771 1583 3=71 0=78 1=126 ..\calgary\BOOK1 5 610856 1632 4=70 3=73 0=78 ..\calgary\BOOK2 6 102400 1174 0=101 5=117 4=119 ..\calgary\GEO 7 377109 1446 4=88 5=96 3=97 ..\calgary\NEWS 8 21504 825 0=102 3=105 4=106 ..\calgary\OBJ1 9 246814 1379 3=90 0=101 4=105 ..\calgary\OBJ2 10 53161 981 3=53 0=62 5=66 ..\calgary\PAPER1 11 82199 1109 10=60 0=62 4=66 ..\calgary\PAPER2 12 513216 1589 8=108 11=114 3=117 ..\calgary\PIC 13 39611 946 11=74 4=76 0=82 ..\calgary\PROGC 14 71646 1096 0=64 10=64 11=64 ..\calgary\PROGL 15 49379 985 0=72 11=76 3=79 ..\calgary\PROGP 16 93695 1110 11=78 13=82 7=88 ..\calgary\TRANS 17 842468 1693 6=123 0=126 1=126 ..\maxcomp\a10.jpg 18 3870784 2071 4=109 14=109 15=109 ..\maxcomp\acrord32.exe 19 4067439 2132 0=44 5=56 11=58 ..\maxcomp\english.dic 20 4526946 2096 8=120 15=126 2=127 ..\maxcomp\FlashMX.pdf 21 20617071 2086 0=50 10=60 19=62 ..\maxcomp\fp.log 22 3782416 2058 7=114 13=116 18=117 ..\maxcomp\mso97.dll 23 4168192 1833 3=104 0=107 7=113 ..\maxcomp\ohs.doc 24 4149414 2020 0=44 21=60 19=78 ..\maxcomp\rafale.bmp 25 4121418 1977 4=96 7=96 11=98 ..\maxcomp\vcfiu.hlp 26 2988578 2013 10=67 3=74 5=75 ..\maxcomp\world95.txt 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 0 0 118 123 65 78 78 101 102 102 101 62 62 128 82 64 72 90 126 1 118 0 135 125 126 122 121 120 128 123 118 122 124 126 126 128 130 126 2 123 135 0 118 133 129 142 143 121 122 117 137 129 127 115 125 123 127 3 65 125 118 0 71 73 128 97 105 90 53 69 117 97 77 79 91 133 4 78 126 133 71 0 70 119 88 106 105 68 66 126 76 76 88 90 144 5 78 122 129 73 70 0 117 96 110 105 66 68 126 82 80 84 92 130 6 101 121 142 128 119 117 0 125 123 126 109 103 119 119 123 115 127 123 7 102 120 143 97 88 96 125 0 108 113 98 88 138 86 94 114 88 136 8 102 128 121 105 106 110 123 108 0 117 110 100 108 116 116 104 110 132 9 101 123 122 90 105 105 126 113 117 0 91 93 117 105 95 107 113 131 10 62 118 117 53 68 66 109 98 110 91 0 60 120 84 64 84 90 130 11 62 122 137 69 66 68 103 88 100 93 60 0 114 74 64 76 78 132 12 128 124 129 117 126 126 119 138 108 117 120 114 0 140 120 130 124 130 13 82 126 127 97 76 82 119 86 116 105 84 74 140 0 76 92 82 138 14 64 126 115 77 76 80 123 94 116 95 64 64 120 76 0 82 88 132 15 72 128 125 79 88 84 115 114 104 107 84 76 130 92 82 0 94 132 16 90 130 123 91 90 92 127 88 110 113 90 78 124 82 88 94 0 132 17 126 126 127 133 144 130 123 136 132 131 130 132 130 138 132 132 132 0 18 113 123 110 126 109 111 130 119 127 132 125 121 143 123 109 109 123 121 19 44 128 129 69 66 56 107 92 96 95 64 58 132 76 76 62 80 124 20 136 128 127 127 134 134 135 128 120 131 134 144 132 144 130 126 138 130 21 50 116 125 67 74 80 111 94 98 89 60 68 122 74 68 84 82 130 22 120 126 135 125 122 126 137 114 130 131 130 122 142 116 118 124 126 124 23 107 133 134 104 119 129 120 113 119 130 113 125 123 133 115 121 117 119 24 44 122 119 79 92 98 97 104 104 109 80 84 122 94 82 98 102 128 25 104 126 137 99 96 102 113 96 110 123 100 98 128 98 100 104 102 126 26 85 129 122 74 85 75 118 95 107 96 67 75 119 81 79 83 83 127
Program is below. If no filename arguments are given, then it reads file names from stdin, like dir/s/b | sim.exe
Code:// Estimate file similarity: sim * (or) dir/s/b | sim // Written by Matt Mahoney. Public domain. #include <stdio.h> #include <stdlib.h> #include <stdint.h> #include <string> #include <vector> using namespace std; const int H=8; // use bit vectors of size 2^H vector<vector<bool> > v; // file signatures // Return Hamming distance int dist(const vector<bool>& v1, const vector<bool>& v2) { int n=0; for (unsigned i=0; i<v1.size() && i<v2.size(); ++i) n+=v1[i]!=v2[i]; return n; } // Compute 1 signature, store in v, and report the 3 closest so far void scan(const char* filename) { // Open file FILE* in=fopen(filename, "rb"); if (!in) return; // Compute signature and store in v.back() vector<bool> z(1<<H); v.push_back(z); int c, c1=0, sz=0; unsigned h=0, n=0, limit=1<<30; while ((c=getc(in))!=EOF) { ++sz; if (c!=c1) h<<=6; c1=c; h=(h+c+1)*314159265u; if (h<limit) { v.back()[h&(1<<H)-1]=(h>>H)&1; ++n; limit-=limit>>H; } } fclose(in); printf("%4d %8d %4d", v.size()-1, sz, n); // Find the N most similar files const int N=3; int bi[N]={0}; for (int i=0; i<N && i<int(v.size())-1; ++i) { int bd=(1<<H)+1; for (int j=0; j<int(v.size())-1; ++j) { bool done=false; for (int k=0; k<i && !done; ++k) done|=bi[k]==j; if (!done) { int d=dist(v[j], v.back()); if (d<bd) bd=d, bi[i]=j; } } printf(" %4d=%-3d", bi[i], bd); } for (int i=int(v.size())-1; i<N; ++i) printf(" "); printf(" %s\n", filename); } int main(int argc, char** argv) { // Scan files from command line args for (int i=1; i<argc; ++i) scan(argv[i]); // Scan files from stdin (e.g. piped from "dir/s/b") if (argc==1) { string s; int c; while ((c=getchar())!=EOF) { if (c<' ') scan(s.c_str()), s=""; else s+=char(c); } } // Print the difference matrix for the first 18 files printf("\n "); for (unsigned i=0; i<v.size() && i<18; ++i) printf(" %3d", i); printf("\n"); for (unsigned i=0; i<v.size(); ++i) { printf("%2d", i); for (unsigned j=0; j<v.size() && j<18; ++j) printf(" %3d", dist(v[i], v[j])); printf("\n"); } return 0; }