Here's my newly invented (or not, probably someone did such thing before, as it's really simple) blocksorting algorithm, which I'll probably use as building block in my complex QRotSort algorithm.

I've tried stuffing suffix.lcp'th byte from each suffix into suffix.lcp, so then sometimes only comparing those bytes could suffice to figure out the suffix order and lcps, but in the end the gain was really small (and complexity is higher) and probably some other technique will bring some improvement.

Currently I'm thinking about multi-way MergeSort, but that will be tricky. Also the number of ways is yet to be decided.

Code:
/* 
 * File:   main.cpp
 * Author: Piotr Tarsa
 *
 */

#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <ctime>

#include <algorithm>

#define COMPUTE_STATISTICS

struct _index_t {
    int32_t index;
    int32_t lcp;
};

typedef _index_t index_t;

size_t length;
index_t *SAin;
index_t *SAout;
u_int8_t *sourceBuffer;
u_int8_t *outputBuffer;

int32_t min(int32_t former, int32_t latter) {
    return former < latter ? former : latter;
}

int32_t clampOnce(int32_t value, int32_t limit) {
    return (value >= limit) ? value - limit : value;
}

/*
 * 
 */
int main(int argc, char** argv) {

    freopen("report.txt", "w", stderr);

    //FILE *sourceFile = fopen("/home/piotrek/Dokumenty/BWT/qsufsort.c", "rb");
    FILE *sourceFile = fopen("/home/piotrek/Pobrane/corpora/enwik/enwik8", "rb");

    fseek(sourceFile, 0, SEEK_END);
    length = ftell(sourceFile);
    const int32_t DepthLimit = min(8000000, length);
    printf("File length: %ld\n", length);
    fseek(sourceFile, 0, SEEK_SET);

    SAin = new index_t[length];
    SAout = new index_t[length];

    sourceBuffer = new u_int8_t[length];
    outputBuffer = new u_int8_t[length];

    size_t readBytes = fread(sourceBuffer, sizeof (u_int8_t), length, sourceFile);
    assert(readBytes == length);

    fclose(sourceFile);

    for (int32_t i = 0; i < length; i++) {
        SAin[i].index = i;
        SAin[i].lcp = 0;
    }


#ifdef COMPUTE_STATISTICS
    int64_t comparisons = 0;
    int64_t quickChecks = 0;
#endif

    int32_t sortedLength = 1;
    while (sortedLength < length) {

        int32_t leftStart = 0;
        int32_t leftEnd = sortedLength;
        int32_t rightStart = sortedLength;
        int32_t rightEnd = min(length, sortedLength * 2);
        int32_t outputIndex = 0;

        do {
            int32_t toLeftLcp = 0;
            int32_t toRightLcp = 0;
            int32_t lastIndex = 0;

            {
                int32_t firstPtr = SAin[leftStart].index;
                int32_t secondPtr = SAin[rightStart].index;
                int32_t equal = 0;
                for (; equal < DepthLimit; equal++) {
                    if (sourceBuffer[firstPtr] != sourceBuffer[secondPtr]) {
                        break;
                    }
                    firstPtr = ((firstPtr + 1) == length) ? 0 : firstPtr + 1;
                    secondPtr = ((secondPtr + 1) == length) ? 0 : secondPtr + 1;
                }
#ifdef COMPUTE_STATISTICS
                comparisons++;
#endif
                if (sourceBuffer[firstPtr] <= sourceBuffer[secondPtr]) {
                    lastIndex = SAin[leftStart].index;
                    toLeftLcp = SAin[leftStart].lcp;
                    leftStart++;
                    toRightLcp = equal;
                } else {
                    lastIndex = SAin[rightStart].index;
                    toRightLcp = SAin[rightStart].lcp;
                    rightStart++;
                    toLeftLcp = equal;
                }
            }

            while ((leftStart < leftEnd) && (rightStart < rightEnd)) {

                SAout[outputIndex].index = lastIndex;

                if (toLeftLcp > toRightLcp) {
                    SAout[outputIndex++].lcp = toLeftLcp;
                    lastIndex = SAin[leftStart].index;
                    toLeftLcp = SAin[leftStart].lcp;
                    leftStart++;
#ifdef COMPUTE_STATISTICS
                    quickChecks++;
#endif
                } else if (toLeftLcp < toRightLcp) {
                    SAout[outputIndex++].lcp = toRightLcp;
                    lastIndex = SAin[rightStart].index;
                    toRightLcp = SAin[rightStart].lcp;
                    rightStart++;
#ifdef COMPUTE_STATISTICS
                    quickChecks++;
#endif
                } else { // toLeftLcp == toRightLcp
                    int32_t firstPtr = clampOnce(SAin[leftStart].index + toLeftLcp, length);
                    int32_t secondPtr = clampOnce(SAin[rightStart].index + toLeftLcp, length);
                    int32_t equal = toLeftLcp;
                    for (; equal < DepthLimit; equal++) {
                        if (sourceBuffer[firstPtr] != sourceBuffer[secondPtr]) {
                            break;
                        }
                        firstPtr = ((firstPtr + 1) == length) ? 0 : firstPtr + 1;
                        secondPtr = ((secondPtr + 1) == length) ? 0 : secondPtr + 1;
                    }
#ifdef COMPUTE_STATISTICS
                    comparisons++;
#endif
                    if (sourceBuffer[firstPtr] <= sourceBuffer[secondPtr]) {
                        SAout[outputIndex++].lcp = toLeftLcp;
                        lastIndex = SAin[leftStart].index;
                        toLeftLcp = SAin[leftStart].lcp;
                        leftStart++;
                        toRightLcp = equal;
                    } else {
                        SAout[outputIndex++].lcp = toRightLcp;
                        lastIndex = SAin[rightStart].index;
                        toRightLcp = SAin[rightStart].lcp;
                        rightStart++;
                        toLeftLcp = equal;
                    }
                }
            }

            if (leftStart == leftEnd) {
                SAout[outputIndex].index = lastIndex;
                SAout[outputIndex].lcp = toRightLcp;
                outputIndex++;
#ifdef COMPUTE_STATISTICS
                quickChecks++;
#endif
                while (rightStart < rightEnd) {
                    SAout[outputIndex++] = SAin[rightStart++];
#ifdef COMPUTE_STATISTICS
                    quickChecks++;
#endif
                }
            } else { // rightStart == rightEnd
                SAout[outputIndex].index = lastIndex;
                SAout[outputIndex].lcp = toLeftLcp;
                outputIndex++;
#ifdef COMPUTE_STATISTICS
                quickChecks++;
#endif
                while (leftStart < leftEnd) {
                    SAout[outputIndex++] = SAin[leftStart++];
#ifdef COMPUTE_STATISTICS
                    quickChecks++;
#endif
                }
            }

            leftStart = rightEnd;
            leftEnd = min(leftStart + sortedLength, length);
            rightStart = leftEnd;
            rightEnd = min(rightStart + sortedLength, length);
        } while (rightStart < rightEnd);

        while (leftStart < leftEnd) {
            SAout[outputIndex++] = SAin[leftStart++];
#ifdef COMPUTE_STATISTICS
            quickChecks++;
#endif
        }

        {
            index_t *temp = SAin;
            SAin = SAout;
            SAout = temp;
        }
        sortedLength *= 2;
    }

#ifdef COMPUTE_STATISTICS
    printf("Comparisons:           %11ld\n", comparisons);
    printf("Quick checks:          %11ld\n", quickChecks);
#endif


    for (int32_t i = 0; i < length; i++) {
        outputBuffer[i] = sourceBuffer[clampOnce(SAin[i].index + length - 1, length)];
    }

    {
        clock_t time = clock();
        for (int32_t i = 1; i < length; i++) {
            int32_t firstPtr = SAin[i - 1].index;
            int32_t secondPtr = SAin[i].index;
            int32_t equal = 0;
            for (; equal < DepthLimit; equal++) {
                if (sourceBuffer[firstPtr] > sourceBuffer[secondPtr]) {
                    printf("Wrong order on position: %d, equal prefix length: %d\n", i - 1, equal);
                    break;
                } else if (sourceBuffer[firstPtr] < sourceBuffer[secondPtr]) {
                    break;
                }
                firstPtr = ((firstPtr + 1) == length) ? 0 : firstPtr + 1;
                secondPtr = ((secondPtr + 1) == length) ? 0 : secondPtr + 1;
            }
        }
        printf("Rotation order checking time: %ld\n", clock() - time);
    }

    FILE *outputFile = fopen("/home/piotrek/Pulpit/bwtoutput.mergerotsort", "wb");

    size_t writtenBytes = fwrite(outputBuffer, sizeof (u_int8_t), length, outputFile);
    assert(writtenBytes == length);

    fclose(outputFile);

    return 0;
}