I have the same experience with LLVM - didn't appear useful.

And as an example of IntelC auto-vectorization, I'd suggest this:

Code:

// for( i=0; i<10; i++ ) result.b[i] %= 62;
//
// vecmod(a,b): branch-free form of "a >= b ? a-b : a", written only with
// min / add / and / sub so that each step has a per-byte SIMD counterpart.
//   __min(a,b)-b          is 0 when a >= b, and a-b (negative) otherwise;
//   byte(...)             keeps that nonzero for a < b
//                         (assumes byte is an 8-bit type and 0 < b < 256 -
//                         byte is declared elsewhere, TODO confirm);
//   __min(1,...)-1        turns it into -1 (all bits set) when a >= b, else 0;
//   ... & b               selects b when a >= b, else 0;
//   a - ...               performs the conditional subtraction.
// Applying it with 4*62, 2*62 and 1*62 in turn reduces any value in 0..255
// modulo 62.
//
// Every parameter use is parenthesized so that arguments containing
// low-precedence operators (e.g. x|1) expand correctly. The arguments are
// still evaluated more than once, so pass only side-effect-free expressions.
#define vecmod(a,b) ((a)-((__min(1,byte(__min((a),(b))-(b)))-1)&(b)))
// The loop covers 16 bytes rather than the 10 of the scalar version above -
// presumably so that it fills exactly one 128-bit register.
for( i=0; i<16; i++ ) {
  result.b[i] = vecmod( result.b[i], 4*62 );
  result.b[i] = vecmod( result.b[i], 2*62 );
  result.b[i] = vecmod( result.b[i], 1*62 );
}

It's from my cracker for Winny "trip" codes.

(there's stuff like qwerpoiu -> Iws0WmvT3N)

I won't say that this example itself is optimal or anything,

I just made it specifically to test IntelC,

and here's what it generates:

Code:

; Intel C output for the 16-iteration vecmod loop. No loop remains: the three
; source statements below become three straight-line passes over one 128-bit
; register (xmm1) holding 16 bytes of `result`. Each pass has the same shape
; as the macro:  pminub / paddb / pminub / paddb / pand / psubb.
; NOTE(review): the values of the _2il0floatpacket$N constants are not shown
; in this listing; the roles given below are inferred from the macro and from
; how each register is used - confirm against the full compiler output.
;;; result.b[i] = vecmod( result.b[i], 4*62 );
;;; result.b[i] = vecmod( result.b[i], 2*62 );
;;; result.b[i] = vecmod( result.b[i], 1*62 );
movdqa xmm6, XMMWORD PTR _2il0floatpacket$3 ; pass 1: pminub operand (via xmm7) and pand mask - presumably 4*62 in every byte
movdqa xmm5, XMMWORD PTR _2il0floatpacket$4 ; pass 1: addend of the first paddb - presumably -(4*62) in every byte
movdqa xmm3, XMMWORD PTR _2il0floatpacket$5 ; all passes: operand of the second pminub - presumably 1 in every byte
movdqa xmm0, XMMWORD PTR _2il0floatpacket$6 ; pass 2: pminub operand (via xmm5) and pand mask - presumably 2*62 in every byte
; The poster elided part of the listing here. xmm2 is read below but never
; written in the visible lines, so it is presumably loaded in this gap
; (as the "-1" addend, i.e. 0xFF in every byte - TODO confirm).
[...]
movdqa xmm7, xmm6 ; working copy, so xmm6 stays intact for the pand below
movdqa xmm4, XMMWORD PTR _2il0floatpacket$8 ; pass 3: pand mask - presumably 1*62 in every byte
movdqa xmm1, XMMWORD PTR ?result@@3UHashValue@@A ; xmm1 = a: 16 bytes of result, updated in place by each pass
; ---- pass 1 (source line with 4*62) ----
pminub xmm7, xmm1 ; __min(a,b), unsigned per byte
paddb xmm7, xmm5 ; ... - b, done as an add of the (presumably negated) constant
movdqa xmm5, xmm0 ; xmm5 is no longer needed by pass 1: make the pass 2 working copy
pminub xmm7, xmm3 ; __min(1, ...)
paddb xmm7, xmm2 ; ... - 1, done as an add of xmm2
pand xmm7, xmm6 ; ... & b
psubb xmm1, xmm7 ; a = a - (...)
; ---- pass 2 (source line with 2*62) ----
pminub xmm5, xmm1 ; __min(a,b)
paddb xmm5, XMMWORD PTR _2il0floatpacket$7 ; ... - b, addend taken straight from memory
pminub xmm5, xmm3 ; __min(1, ...)
paddb xmm5, xmm2 ; ... - 1
pand xmm5, xmm0 ; ... & b
psubb xmm1, xmm5 ; a = a - (...)
; ---- pass 3 (source line with 1*62) ----
movdqa xmm0, XMMWORD PTR _2il0floatpacket$8 ; working copy of the pass 3 constant (same packet already held in xmm4)
pminub xmm0, xmm1 ; __min(a,b)
paddb xmm0, XMMWORD PTR _2il0floatpacket$9 ; ... - b, addend taken straight from memory
pminub xmm0, xmm3 ; __min(1, ...)
paddb xmm0, xmm2 ; ... - 1
pand xmm0, xmm4 ; ... & b
psubb xmm1, xmm0 ; a = a - (...)
movdqa XMMWORD PTR ?result@@3UHashValue@@A, xmm1 ; write all 16 reduced bytes back to result

and gcc is still lacking that afaik.