Code:

#include <stdlib.h>
#include "omp.h"
#define RADIX_TBL_SIZE 256 * 256 * 4
unsigned int * bq; // main index buffer and output buffer
unsigned int bwtIndex; // resulting index
unsigned char * data, * bQlfc; // data - input data buffer; bQlfc - indexes
unsigned int fileSize // input size
int cmpBranch( unsigned int b1, unsigned int b2, unsigned int depth = 2 ) {
b1 += depth, b2 += depth; // skip (2 or 4 byte) radix sort processed data
while( ( ( unsigned int *) &data[ b1 ] )[ 0 ] ==
( (unsigned int *) &data[ b2 ] )[ 0 ] ) {
b1 += 4, b2 += 4;
if( b1 >= fileSize + 3 ) b1 -= fileSize;
if( b2 >= fileSize + 3 ) b2 -= fileSize;
}
unsigned int p1 = ( ( unsigned int *) &data[ b1 ] )[ 0 ],
p2 = ( ( unsigned int *) &data[ b2 ] )[ 0 ];
return __bswap_32( p1 ) < __bswap_32( p2 ); // __constant_cpu_to_be32 endian conversion
}
void insertionSort( unsigned int * idx, unsigned int l, unsigned int p,
unsigned int depth = 2 ) {
unsigned int j, swp, i;
for( unsigned int i = l + 1; i <= p; i++ )
{
j = i;
while( j > l && cmpBranch( idx[ j ], idx[ j - 1 ], depth ) ) {
swp = idx[ j - 1 ]; // top-down addition probably faster
idx[ j - 1 ] = idx[ j ];
idx[ j ] = swp;
j--;
}
}
}
void quickSort( unsigned int * idx, unsigned int l, unsigned int p, unsigned int de = 2 ) {
if( p - l <= 7 ) { insertionSort( idx, l, p ); return; }
unsigned int lf = l, ri = p, mid = idx[ ( l + p ) >> 1 ], swp;
do {
while( ( idx[ lf ] != mid ) &&
( 1 == cmpBranch( idx[ lf ], mid, de /*depth of already sorted data*/ ) ) ) lf++;
while( ( idx[ ri ] != mid ) &&
( 0 == cmpBranch( idx[ ri ], mid, de ) ) ) ri--; // { if( p - l == 1 ) return; ri--; }
if( (lf <= ri) ) { swp = idx[ lf ];
idx[ lf ] = idx[ ri ];
idx[ ri ] = swp; lf += 1, ri -= 1; }
} while( lf < ri );
if( l < ri ) quickSort( idx, l, ri );
if( lf < p ) quickSort( idx, lf, p );
}
void rBuckQSort( unsigned int freq, unsigned int * prevIndex, unsigned int depth = 4, unsigned int abs = 0 ) {
unsigned int * newIndex = (unsigned int *) malloc( freq * sizeof( unsigned int ) ),
* cnt = (unsigned int *) malloc( RADIX_TBL_SIZE );
memset( cnt, 0, RADIX_TBL_SIZE );
for ( int r = 0; r < freq; r++ )
cnt[ __bswap_16( ( (unsigned short *) &data[ prevIndex[ r ] + depth ])[ 0 ] ) ]++;
int ttlfreq = 0, lfreq;
for ( int r = 0; r < RADIX_TBL_SIZE / sizeof( unsigned int ); r++ )
lfreq = cnt[ r ], cnt[ r ] = ttlfreq, ttlfreq += lfreq;
for ( int r = 0; r < freq; r++ )
newIndex[ cnt[ __bswap_16( ( ( unsigned short *)
&data[ prevIndex[ r ] + depth ] )[ 0 ] ) ]++ ] = prevIndex[ r ];
lfreq = 0;
unsigned int nfreq;
for ( int r = 0; r < RADIX_TBL_SIZE / sizeof( unsigned int ); r++ ) {
if( nfreq = cnt[ r ] - lfreq ) {
if( nfreq > 8192 && depth <= 6 ) // stop recursion at given depth 2, 4, etc.
rBuckQSort( nfreq, newIndex + lfreq, depth + 2, abs + lfreq );
else {
if( nfreq > 1 )
quickSort( newIndex, lfreq, cnt[ r ] - 1, depth + 2 );
for( int j = lfreq; j < cnt[ r ]; j++ ) {
if( newIndex[ j ] ) bq[ abs + j ] = data[ newIndex[ j ] - 1 ];
else bwtIndex = abs + j, bq[ abs + j ] = data[ fileSize - 1 ];
}
}
}
lfreq = cnt[ r ];
}
free( newIndex ), free( cnt );
}
#define BUCKET_QSORT( coreN, from, to ) \
maxSeg[ coreN ] >>= 8; \
for( int b = from; b < to; b += sizeof( unsigned int ) ) { \
while( !( ( unsigned long *) &rdxSrt02[ b ] )[ 0 ] ) { if( b + 8 >= to ) break; b += 8; } \
if( freq = ( ( ( unsigned int *) &rdxSrt02[ b ] )[ 0 ] ) ) { \
freq -= lstFreq; \
if( freq > ( maxSeg[ coreN ] ) ) { \
rBuckQSort( freq, bq + lstFreq, 2, lstFreq ); \
} else { \
if( freq > 1 ) \
quickSort( bq, lstFreq, lstFreq + freq - 1 ); \
for( int r = 0; r < freq; r++ ) { \
if( bq[ lstFreq + r ] ) bq[ lstFreq + r ] = data[ bq[ lstFreq + r ] - 1 ]; \
else bwtIndex = lstFreq + r, bq[ lstFreq + r ] = data[ fileSize - 1 ]; \
} \
} \
lstFreq = freq + lstFreq; \
} \
} \
for( long l = 0; l <= 6; l++ ) { // trailing characters for beter alignment
data[ fileSize + l ] = data[ l ]; // (for BWT data buffer)
}
unsigned char * rdxSrt02 = ( unsigned char *) malloc( RADIX_TBL_SIZE );
unsigned int * rdxSrt = ( unsigned int *) rdxSrt02; bq = ( unsigned int *) bQlfc;
memset( rdxSrt02, 0, RADIX_TBL_SIZE );
#pragma omp parallel for
for( unsigned int c = 0; c < fileSize; c++ ) {
#pragma omp atomic
rdxSrt[ __bswap_16( ( (unsigned short *) &data[ t ] )[ 0 ] ) ]++;
}
// prepare paralel sections borders and statistics for the 2 byte radix sort
unsigned int freq, totalFreq = 0, procs = omp_get_num_procs(), parSec[ procs + 1 ][ 2 ], maxSeg[ procs ];
memset( parSec, -1, ( procs + 1 ) * 2 * sizeof( unsigned int ) );
memset( maxSeg, 0, procs * sizeof( unsigned int ) );
for(unsigned int j = 0; j < RADIX_TBL_SIZE; j += 4) {
while( !( (unsigned long *) &rdxSrt02[ j ] )[ 0 ] ) { if( j + 8 >= RADIX_TBL_SIZE ) break; j += 8; }
if( freq = ( (unsigned int *) &rdxSrt02[ j ] )[ 0 ] ) {
( (unsigned int *) &rdxSrt02[ j ] )[ 0 ] = totalFreq;
for( int k = procs - 1; k > 0; k-- )
if( totalFreq > ( ( fileSize / procs ) * k ) ) {
if( totalFreq < parSec[ k ][ 1 ] ) parSec[ k ][ 1 ] = totalFreq, parSec[ k ][ 0 ] = j;
if( freq > maxSeg[ k ] ) maxSeg[ k ] = freq;
}
if( totalFreq <= ( fileSize / procs ) && freq > maxSeg[ 0 ] ) maxSeg[ 0 ] = freq;
totalFreq += freq;
}
}
// failed to split symbol distribution on sections for parallel sorting? =1 thread, not optimal
parSec[ procs ][ 0 ] = RADIX_TBL_SIZE, parSec[ 0 ][ 0 ] = parSec[ 0 ][ 1 ] = 0;
if( procs > 1 ) {
if( parSec[ procs - 1 ][ 1 ] == -1 )
parSec[ procs - 1 ][ 1 ] = fileSize, parSec[ procs - 1 ][ 0 ] = RADIX_TBL_SIZE;
for( int s = 1; s <= procs - 1; s++ ) // for all sections
if( parSec[ s ][ 1 ] == -1 || parSec[ 1 ][ 0 ] == RADIX_TBL_SIZE )
parSec[ s ][ 1 ] = parSec[ procs - 1 ][ 1 ], parSec[ s ][ 0 ] = RADIX_TBL_SIZE;
}
// fill rank sort indexes on positions in out. buffer
for( unsigned int t = 0; t < fileSize; t++ ) {
bq[ rdxSrt[ __bswap_16( ( ( unsigned short *) &data[ t ] )[ 0 ] ) ]++ ] = t;
}
// quick sorts running in parallel on sections of unsorted data after 2-byte radix sort
#pragma omp parallel
{
for( int r = 0; r < procs; r++ )
#pragma omp sections nowait
{
#pragma omp section
{
unsigned int lstFreq = parSec[ r ][ 1 ], freq;
BUCKET_QSORT( r, parSec[ r ][ 0 ], parSec[ r + 1 ][ 0 ] )
}
}
}
inverse transform( forward / backward ):
unsigned char * idxs = ( unsigned char *) malloc( fileSize << 2 );
unsigned int * id = ( unsigned int *) idxs;
if( idxs == NULL ) cout << "\n Not enough free memory", exit( 1 );
//for each symbol to be decoded, sf and sf2 - symbol freq, relative or cumulative, (already stored in archive)
id[ OutSize ] = sf[ decChar ] - sf2[ decChar ]; // inicialise inverse BWT table, backward tranform
//bQlfc[ sf[ decChar ] ] = decChar, id[ sf[ decChar ] ] = OutSize++; // inicialise inverse BWT, forward tranform
// inverse BWT backward
for( unsigned int c = 0; c < OutSize; c++ ) {
decode[ OutSize - c - 1 ] = bQlfc[ bwtIndex ]; // print decoded data / characters
bwtIndex = sf2[ bQlfc[ bwtIndex ] ] + ( id[ bwtIndex ] /*& 0x00FFFFFF*/ ); // next character's index
}
// inverse BWT forward
//for( unsigned int c = 0; c < OutSize; c++ ) // print decoded data / character
// decode[ c ] = bQlfc[ bwtIndex ], bwtIndex = id[ bwtIndex ];