10th October 2019, 06:36
To implement the ZrHa_update routine on NEON, we need three permutations of the sum available at the same time: DCBA, AC, and BD (or CA/DB).
I'm thinking something like this:
@-----------------------------------------------------------------------
@ ZrHa_update_neon32 - one ZrHa update step on a 128-bit state (ARMv7 NEON)
@
@ With x = state + data viewed as four 32-bit lanes [A, B, C, D]
@ (A = lowest lane), the new state is, per 64-bit lane:
@     [D, C, B, A] + { A*B, C*D }
@
@ In:    r0 = state pointer (read, then written back)
@        r1 = data pointer
@        NOTE(review): the address operands were missing from the listing;
@        [r0]/[r1] are reconstructed from the usual argument order - confirm.
@ Clobb: q0, q1 (d0-d3)
@-----------------------------------------------------------------------
ZrHa_update_neon32:
        vld1.64   {d0, d1}, [r0]        @ q0 = state
        vld1.8    {d2, d3}, [r1]        @ q1 = data (loaded as byte elements)
        vadd.i64  q0, q0, q1            @ q0 = x = [A, B, C, D]
        vrev64.32 q1, q0                @ q1 = [B, A, D, C]
        vtrn.32   d0, d1                @ d0 = [A, C], d1 = [B, D]
        vswp      d2, d3                @ q1 = [D, C, B, A]
        vmlal.u32 q1, d0, d1            @ q1 += { A*B, C*D }
        vst1.64   {d2, d3}, [r0]        @ result lives in q1, i.e. d2:d3
        bx        lr
//-----------------------------------------------------------------------
// ZrHa_update_neon64 - one ZrHa update step on a 128-bit state (AArch64)
//
// With x = state + data viewed as four 32-bit lanes [A, B, C, D]
// (A = lowest lane), the new state is, per 64-bit lane:
//     [D, C, B, A] + { A*B, C*D }
//
// In:    x0 = state pointer (read, then written back)
//        x1 = data pointer
//        NOTE(review): the address operands were missing from the listing
//        and "ld1 q0" is not valid A64 syntax; the register lists and
//        [x0]/[x1] are reconstructed - confirm against the original.
// Clobb: v0, v1, v2
//-----------------------------------------------------------------------
ZrHa_update_neon64:
        ld1     {v0.2d}, [x0]                   // v0 = state
        ld1     {v1.16b}, [x1]                  // v1 = data (byte elements, like vld1.8 on ARMv7)
        add     v0.2d, v0.2d, v1.2d             // v0 = x = [A, B, C, D]
        xtn     v1.2s, v0.2d                    // v1 = [A, C]  (low halves)
        shrn    v2.2s, v0.2d, #32               // v2 = [B, D]  (high halves)
        rev64   v0.4s, v0.4s                    // v0 = [B, A, D, C]
        ext     v0.16b, v0.16b, v0.16b, #8      // v0 = [D, C, B, A]
        umlal   v0.2d, v1.2s, v2.2s             // v0 += { A*B, C*D }
        st1     {v0.2d}, [x0]                   // write the new state back
        ret
Not as clean as SSE2 (Wow, never thought I'd say that one!)
//-----------------------------------------------------------------------
// ZrHa_update_sse2 - one ZrHa update step on a 128-bit state (SSE2)
//
// With x = state + data viewed as four 32-bit lanes [A, B, C, D]
// (A = lowest lane), the new state is, per 64-bit lane:
//     [D, C, B, A] + { A*B, C*D }
//
// In:    rdi = state pointer; must be 16-byte aligned because it is used
//              as a paddq memory operand and as the movdqa destination
//        rsi = data pointer (loaded with movdqu)
//        NOTE(review): the memory operands were missing from the listing;
//        [rdi]/[rsi] are reconstructed from the SysV argument order - confirm.
// Clobb: xmm0, xmm1, xmm2
//-----------------------------------------------------------------------
ZrHa_update_sse2: // x86_64, sysv
        movdqu  xmm0, xmmword ptr [rsi]         // xmm0 = data
        paddq   xmm0, xmmword ptr [rdi]         // xmm0 = x = [A, B, C, D]
        pshufd  xmm1, xmm0, 0xB1                // _MM_SHUFFLE(2, 3, 0, 1): [B, A, D, C]
        pshufd  xmm2, xmm0, 0x1B                // _MM_SHUFFLE(0, 1, 2, 3): [D, C, B, A]
        pmuludq xmm0, xmm1                      // xmm0 = { A*B, C*D }
        paddq   xmm0, xmm2                      // add the lane-reversed sum
        movdqa  xmmword ptr [rdi], xmm0         // write the new state back
        ret
"XXH3_64b_round":
@-----------------------------------------------------------------------
@ XXH3_64b_round_neon32 - one 128-bit accumulate step (ARMv7 NEON)
@
@ Per 64-bit lane:
@     acc += data
@     t    = third_input ^ data
@     acc += (low 32 bits of t) * (high 32 bits of t)
@
@ In:    r0 = accumulator pointer (read, then written back)
@        r1 = data pointer
@        r2 = pointer to the third input (presumably the key - confirm)
@        NOTE(review): the address operands were missing from the listing;
@        [r0]/[r1]/[r2] are reconstructed from the usual argument order.
@ Clobb: q0, q1, q2 (d0-d5)
@-----------------------------------------------------------------------
XXH3_64b_round_neon32:
        vld1.64   {d0, d1}, [r0]        @ q0 = acc
        vld1.8    {d2, d3}, [r1]        @ q1 = data (loaded as byte elements)
        vadd.i64  q0, q0, q1            @ acc += data
        vld1.8    {d4, d5}, [r2]        @ q2 = third input
        veor      q2, q2, q1            @ q2 = third input ^ data
        vtrn.32   d4, d5                @ d4 = low halves, d5 = high halves
        vmlal.u32 q0, d4, d5            @ acc += low * high, per 64-bit lane
        vst1.64   {d0, d1}, [r0]        @ write the accumulator back
        bx        lr
//-----------------------------------------------------------------------
// XXH3_64b_round_neon64 - one 128-bit accumulate step (AArch64)
//
// Per 64-bit lane:
//     acc += data
//     t    = third_input ^ data
//     acc += (low 32 bits of t) * (high 32 bits of t)
//
// In:    x0 = accumulator pointer (read, then written back)
//        x1 = data pointer
//        x2 = pointer to the third input (presumably the key - confirm)
//        NOTE(review): the address operands were missing from the listing
//        and "ld1 q0" is not valid A64 syntax; the register lists and
//        [x0]/[x1]/[x2] are reconstructed - confirm against the original.
// Clobb: v0, v1, v2
//-----------------------------------------------------------------------
XXH3_64b_round_neon64:
        ld1     {v0.2d}, [x0]                   // v0 = acc
        ld1     {v1.16b}, [x1]                  // v1 = data
        add     v0.2d, v0.2d, v1.2d             // acc += data
        ld1     {v2.16b}, [x2]                  // v2 = third input
        eor     v2.16b, v2.16b, v1.16b          // v2 = third input ^ data
        xtn     v1.2s, v2.2d                    // v1 = low halves (data no longer needed)
        shrn    v2.2s, v2.2d, #32               // v2 = high halves
        umlal   v0.2d, v1.2s, v2.2s             // acc += low * high, per 64-bit lane
        st1     {v0.2d}, [x0]                   // write the accumulator back
        ret