One way of doing this is with a trivial entropy calculation based on observed frequencies. Say, for each 1k block, determine whether entropy(previous_block_freqs) + entropy(current_block_freqs) is lower than entropy(previous_block_freqs + current_block_freqs), with a bit of overhead to account for the cost of starting a new frequency table. This is naive, but in this case sufficient.
Eg:
Code:
#include <stdio.h>
#include <unistd.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>
#include <inttypes.h>
#define EBASE2 256
/* Order-0 entropy, in bytes, of one frequency table (F2 == NULL) or of
 * the two tables combined; len is the total count in the table(s). */
double e256(int *F1, int *F2, int len) {
    double e = 0;
    int i;

    if (F2) {
        for (i = 0; i < EBASE2; i++) {
            if (F1[i]+F2[i])
                e += -log((double)(F1[i]+F2[i])/len) * (F1[i]+F2[i]);
        }
    } else {
        for (i = 0; i < EBASE2; i++) {
            if (F1[i])
                e += -log((double)F1[i]/len) * F1[i];
        }
    }

    return e / log(EBASE2);
}
#ifndef BSIZE
#define BSIZE 1024
#endif
#ifndef OVERHEAD
#define OVERHEAD 10
#endif
int main(void) {
    unsigned char buf[BSIZE];
    int len, last_len = 0, i;
    int F_last[256] = {0}, F_curr[256], pos = 0;
    double e_last, e_curr;

    while ((len = read(0, buf, BSIZE)) > 0) {
        /* frequencies for this block only */
        memset(F_curr, 0, 256*sizeof(int));
        for (i = 0; i < len; i++)
            F_curr[buf[i]]++;

        e_last = e256(F_last, NULL, last_len);
        e_curr = e256(F_curr, NULL, len);

        if (pos > BSIZE) {
            /* cost of coding old+new together vs separately */
            double e_sum = e256(F_last, F_curr, last_len + len);
            if (e_sum > e_last + e_curr + OVERHEAD) {
                printf("%d\t%.1f vs %.1f\t%.1f(%d) %.1f(%d)\n",
                       pos, e_sum, e_last + e_curr,
                       e_last, last_len, e_curr, len);
                /* break-point: start a fresh accumulated table */
                memset(F_last, 0, 256*sizeof(int));
                last_len = 0;
            }
        }

        /* fold this block into the stats since the last break-point */
        for (i = 0; i < 256; i++)
            F_last[i] += F_curr[i];
        last_len += len;
        pos += len;
    }

    return 0;
}
When run with a small OVERHEAD it shows up the break points (offset, then combined vs split entropy in bytes, then each part's entropy with its length):
Code:
@ deskpro107386[/tmp]; cc -g -DBSIZE=1024 -DOVERHEAD=5 -Wall a.c -lm && ./a.out < dat
110592 13905.1 vs 13880.6 13770.9(110592) 109.7(1024)
328704 23705.1 vs 23674.6 23548.0(218112) 126.6(1024)
1587200 156903.5 vs 156878.8 156855.9(1258496) 22.9(581)
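If you don't have a handy file with an obvious change of statistics in it, something like this made-up little generator (nothing to do with the dat file above) produces two halves with the same order-0 entropy but over disjoint byte ranges, so it should report just one break, at the 262144 mark:
Code:
#include <stdio.h>
#include <stdlib.h>

/* Hypothetical test-data generator: both halves are ~4 bits/byte, but the
 * first half only uses byte values 0..15 and the second only 240..255, so
 * the distribution changes abruptly at the halfway point (offset 262144). */
int main(void) {
    long i;

    for (i = 0; i < 256*1024; i++)
        putchar(rand() % 16);           /* values 0..15 */
    for (i = 0; i < 256*1024; i++)
        putchar(240 + rand() % 16);     /* values 240..255 */

    return 0;
}
E.g. compile it as gen and run ./gen | ./a.out.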
I guess you'd want a larger block size initially and, on spotting a variance, successively reduce the block size to home in on the boundary point giving the biggest size reduction.
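Something like this, maybe (purely a sketch of that idea: refine_break, the 64k starting step and the slurp-everything main are all made up, and it recomputes the entropies from scratch at each candidate cut rather than updating the tables incrementally):
Code:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <unistd.h>

/* order-0 entropy of buf[0..len) in bytes; same formula as e256() above,
 * just counting straight from the buffer */
static double ent_bytes(const unsigned char *buf, long len) {
    long f[256] = {0}, i;
    double e = 0;

    if (len <= 0) return 0;
    for (i = 0; i < len; i++) f[buf[i]]++;
    for (i = 0; i < 256; i++)
        if (f[i]) e += -log((double)f[i]/len) * f[i];
    return e / log(256.0);
}

/* Try every cut of buf[0..len) at a coarse step, keep the one that
 * maximises ent(whole) - (ent(left) + ent(right)), then halve the step
 * and search again in a window around that cut, down to min_step. */
static long refine_break(const unsigned char *buf, long len,
                         long step, long min_step) {
    long lo = 0, hi = len, best = len / 2;
    double e_whole = ent_bytes(buf, len);

    while (step >= min_step) {
        double best_gain = -1;
        long cut;
        for (cut = lo + step; cut < hi; cut += step) {
            double gain = e_whole - ent_bytes(buf, cut)
                                  - ent_bytes(buf + cut, len - cut);
            if (gain > best_gain) { best_gain = gain; best = cut; }
        }
        lo = best > step ? best - step : 0;          /* narrow the window */
        hi = best + step < len ? best + step : len;
        step /= 2;
    }
    return best;
}

int main(void) {
    /* slurp stdin; fine for a demo, obviously not for huge inputs */
    unsigned char *data = NULL, tmp[65536];
    long size = 0, cap = 0;
    int n;

    while ((n = read(0, tmp, sizeof tmp)) > 0) {
        if (size + n > cap) {
            cap = (size + n) * 2;
            data = realloc(data, cap);
            if (!data) return 1;
        }
        memcpy(data + size, tmp, n);
        size += n;
    }
    if (size > 0)
        printf("best split at %ld\n", refine_break(data, size, 65536, 64));
    free(data);
    return 0;
}
In practice you'd only run it over a window either side of a coarse hit rather than over the whole input, and you'd update the two frequency tables incrementally as the cut moves instead of recounting.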
Edit: it needs to not reset F_last on every block, but just keep accumulating, so we see the impact of adding this block to all the data since the last break-point rather than just this block vs the last one. Anyway, you get the idea.