I guess I'll post some samples for lazy people:
Source:
Code:
/*
 * 16-byte aligned constant used as the paddsw memory operand in train().
 * Each 32-bit element is 0x00010001, i.e. two 16-bit halves equal to 1,
 * so the four elements together hold the value 1 in all eight 16-bit lanes.
 */
__declspec(align(16)) int mask[4] = {
    0x00010001, 0x00010001, 0x00010001, 0x00010001
};
void train(short *t, short *w, int n, int err) {
<asm>
input a t=t
input d w=w
input c n=n
input rm err=err
input m mask=mask[0]
clobber xmm0,xmm1
add ecx, 7 ; n/8 rounding up
and ecx, -8
jz 1f
sub eax, 16
sub edx, 16
movd xmm0,err
pshuflw xmm0,xmm0,0
punpcklqdq xmm0,xmm0
2: ; each iteration adjusts 8 weights
movdqa xmm3, [eax+ecx*2] ; t[i]
movdqa xmm2, [edx+ecx*2] ; w[i]
paddsw xmm3, xmm3 ; t[i]*2
pmulhw xmm3, xmm0 ; t[i]*err*2 >> 16
paddsw xmm3, mask ; (t[i]*err*2 >> 16)+1
psraw xmm3, 1 ; (t[i]*err*2 >> 16)+1 >> 1
paddsw xmm2, xmm3 ; w[i] + xmm3
movdqa [edx+ecx*2], xmm2
sub ecx, 8
ja 2b
1:
</asm>
}
Preprocessed output:
Code:
/*
 * 16-byte aligned constant used as the paddsw memory operand in train().
 * Each 32-bit element is 0x00010001, i.e. two 16-bit halves equal to 1,
 * so the four elements together hold the value 1 in all eight 16-bit lanes.
 */
__declspec(align(16)) int mask[4] = {
    0x00010001, 0x00010001, 0x00010001, 0x00010001
};
/*
 * train - SSE2 weight update, eight 16-bit weights per loop iteration
 * (GNU extended-asm form, AT&T operand order).
 *
 * For every 16-bit lane the loop computes
 *     w[i] = sat16( w[i] + ( ((sat16(2*t[i]) * e) >> 16) + 1 ) >> 1 )
 * where e is the low 16 bits of err replicated into all eight lanes and
 * sat16 is the signed saturation performed by paddsw.
 *
 * Operands: %0 = t in eax, %1 = w in edx, %2 = n in ecx,
 *           %3 = err (register or memory), %4 = mask[0] (memory).
 *
 * t, w - input and weight arrays. Both are accessed with movdqa, so they
 *        must be 16-byte aligned.
 * n    - element count. It is rounded up to a multiple of 8, so up to 7
 *        elements past n are read from t and read/written in w. Nothing
 *        is done when the rounded count is 0.
 * err  - error term; only its low 16 bits take part in the multiply.
 *
 * The clobber list has to name every XMM register the code writes:
 * xmm0 (broadcast err), xmm2 (w[i]) and xmm3 (t[i]). xmm1 is not used.
 *
 * NOTE(review): eax, edx and ecx are listed as input-only operands but are
 * modified by the code, and w[] is written without a "memory" clobber;
 * GCC's extended-asm rules call for "+a"/"+d"/"+c" read-write operands
 * and a "memory" clobber. Left as generated - confirm and fix at the
 * preprocessor level.
 * NOTE(review): n <= -8 passes the jz guard and enters the loop; callers
 * presumably always pass n > 0 - verify.
 */
void train(short *t, short *w, int n, int err) {
ASM ("\
add $7,%%ecx; \
and $-8,%%ecx; \
jz 1f; \
sub $16,%%eax; \
sub $16,%%edx; \
movd %3,%%xmm0; \
pshuflw $0,%%xmm0,%%xmm0; \
punpcklqdq %%xmm0,%%xmm0; \
2: movdqa (%%eax,%%ecx,2),%%xmm3; \
movdqa (%%edx,%%ecx,2),%%xmm2; \
paddsw %%xmm3,%%xmm3; \
pmulhw %%xmm0,%%xmm3; \
paddsw %4,%%xmm3; \
psraw $1,%%xmm3; \
paddsw %%xmm3,%%xmm2; \
movdqa %%xmm2,(%%edx,%%ecx,2); \
sub $8,%%ecx; \
ja 2b; \
1: " : : "a"(t),"d"(w),"c"(n),"rm"(err),"m"(mask[0]) : "xmm0","xmm2","xmm3"
);
}
Basically, the main problem is with these input/output
specifications... GNU syntax requires _counting_ them.
And that instruction syntax isn't convenient anyway.
But with a preprocessor it's OK now.
If it's like that, I may even start using inline assembly again.