I want to write AVX2-accelerated `train` and `dot_product` functions. For some reason they do not work.

Can someone help me figure out what I am doing wrong? This is my first attempt at SIMD intrinsics.

Code:

static int dot_product (const short* const t, const short* const w, int n) {
assert(n == ((n + 15) & -16));
__m256i sum = _mm256_setzero_si256 ();
while ((n -= 16) >= 0) { // Each loop sums 16products
__m256i tmp = _mm256_madd_epi16 (*(__m256i *) &t[n], *(__m256i *) &w[n]); // t[n] * w[n] + t[n+1] * w[n+1]
tmp = _mm256_srai_epi32 (tmp, 8); // (t[n] * w[n] + t[n+1] * w[n+1]) >> 8
sum = _mm256_add_epi32 (sum, tmp); // sum += (t[n] * w[n] + t[n+1] * w[n+1]) >> 8
}
sum = _mm256_add_epi32 (sum, _mm256_srli_si256 (sum, 16));
sum = _mm256_add_epi32 (sum, _mm256_srli_si256 (sum, 8)); // Add eight sums together ...
sum = _mm256_add_epi32 (sum, _mm256_srli_si256 (sum, 4));
__m128i low = _mm256_castsi256_si128(sum);
return _mm_cvtsi128_si32 (low); // ... and scale back to integer
}
static void train (const short* const t, short* const w, int n, const int e) {
assert(n == ((n + 15) & -16));
if (e) {
const __m256i one = _mm256_set1_epi16 (1);
const __m256i err = _mm256_set1_epi16 (short(e));
while ((n -= 16) >= 0) { // Each iteration adjusts 16 weights
__m256i tmp = _mm256_adds_epi16 (*(__m256i *) &t[n], *(__m256i *) &t[n]); // t[n] * 2
tmp = _mm256_mulhi_epi16 (tmp, err); // (t[n] * 2 * err) >> 16
tmp = _mm256_adds_epi16 (tmp, one); // ((t[n] * 2 * err) >> 16) + 1
tmp = _mm256_srai_epi16 (tmp, 1); // (((t[n] * 2 * err) >> 16) + 1) >> 1
tmp = _mm256_adds_epi16 (tmp, *(__m256i *) &w[n]); // ((((t[n] * 2 * err) >> 16) + 1) >> 1) + w[n]
*(__m256i *) &w[n] = tmp; // save the new eight weights, bounded to +- 32K
}
}
}