Code:
// sha3 - Compute SHA3-512 hashes of files
// Written by Matt Mahoney. Released to public domain.
// Compile: g++ -O3 -ssse3 sha3.cpp -o sha3
#define _CRT_DISABLE_PERFCRIT_LOCKS // speed up getc() and putc() in VC++
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <assert.h>
#include <x86intrin.h>
// Keccak code derived from public domain http://keccak.noekeon.org/
typedef unsigned char UINT8;
typedef unsigned long long int UINT64;
#if defined(__GNUC__)
#define ALIGN __attribute__ ((aligned(32)))
#elif defined(_MSC_VER)
#define ALIGN __declspec(align(32))
#else
#define ALIGN
#endif
typedef __m128i V64;
typedef __m128i V128;
typedef union {
V128 v128;
UINT64 v64[2];
} V6464;
#define ANDnu64(a, b) _mm_andnot_si128(a, b)
#define LOAD64(a) _mm_loadl_epi64((const V64 *)&(a))
#define CONST64(a) _mm_loadl_epi64((const V64 *)&(a))
#define ROL64(a, o) _mm_or_si128(_mm_slli_epi64(a, o), _mm_srli_epi64(a, 64-(o)))
#define STORE64(a, b) _mm_storel_epi64((V64 *)&(a), b)
#define XOR64(a, b) _mm_xor_si128(a, b)
#define XOReq64(a, b) a = _mm_xor_si128(a, b)
#define SHUFFLEBYTES128(a, b) _mm_shuffle_epi8(a, b)
#define ANDnu128(a, b) _mm_andnot_si128(a, b)
#define LOAD6464(a, b) _mm_set_epi64((__m64)(a), (__m64)(b))
#define CONST128(a) _mm_load_si128((const V128 *)&(a))
#define LOAD128(a) _mm_load_si128((const V128 *)&(a))
#define LOAD128u(a) _mm_loadu_si128((const V128 *)&(a))
#define ROL64in128(a, o) _mm_or_si128(_mm_slli_epi64(a, o), _mm_srli_epi64(a, 64-(o)))
#define STORE128(a, b) _mm_store_si128((V128 *)&(a), b)
#define XOR128(a, b) _mm_xor_si128(a, b)
#define XOReq128(a, b) a = _mm_xor_si128(a, b)
#define GET64LOLO(a, b) _mm_unpacklo_epi64(a, b)
#define GET64HIHI(a, b) _mm_unpackhi_epi64(a, b)
#define COPY64HI2LO(a) _mm_shuffle_epi32(a, 0xEE)
#define COPY64LO2HI(a) _mm_shuffle_epi32(a, 0x44)
#define ZERO128() _mm_setzero_si128()
void KeccakPermutationOnWords(UINT64 *state)
{
ALIGN static const UINT64 rho8_56[2]={0x0605040302010007, 0x080F0E0D0C0B0A09};
static const UINT64 KeccakF1600RoundConstants[24] = {
0x0000000000000001ULL,
0x0000000000008082ULL,
0x800000000000808aULL,
0x8000000080008000ULL,
0x000000000000808bULL,
0x0000000080000001ULL,
0x8000000080008081ULL,
0x8000000000008009ULL,
0x000000000000008aULL,
0x0000000000000088ULL,
0x0000000080008009ULL,
0x000000008000000aULL,
0x000000008000808bULL,
0x800000000000008bULL,
0x8000000000008089ULL,
0x8000000000008003ULL,
0x8000000000008002ULL,
0x8000000000000080ULL,
0x000000000000800aULL,
0x800000008000000aULL,
0x8000000080008081ULL,
0x8000000000008080ULL,
0x0000000080000001ULL,
0x8000000080008008ULL };
V6464 Abage, Abegi;
V6464 Akame, Akemi;
V6464 Abae, Abio, Agae, Agio, Akae, Akio, Amae, Amio, Asae, Asio;
V64 Aba, Abe, Abi, Abo, Abu;
V64 Aga, Age, Agi, Ago, Agu;
V64 Aka, Ake, Aki, Ako, Aku;
V64 Ama, Ame, Ami, Amo, Amu;
V64 Asa, Ase, Asi, Aso, Asu;
V128 Bbage, Bbegi, Bbigo, Bbogu, Bbuga;
V128 Bkame, Bkemi, Bkimo, Bkomu, Bkuma;
V64 Bba, Bbe, Bbi, Bbo, Bbu;
V64 Bga, Bge, Bgi, Bgo, Bgu;
V64 Bka, Bke, Bki, Bku;
V64 Bma, Bme, Bmi, Bmo;
V64 Bsa, Bse, Bsi, Bso, Bsu;
V128 Cae, Cei, Cio, Cou, Cua, Dei, Dou;
V64 Ca, Ce, Ci, Co, Cu;
V64 Da, De, Di, Do, Du;
V6464 Ebage, Ebegi, Ebigo, Ebogu, Ebuga;
V6464 Ekame, Ekemi, Ekimo, Ekomu, Ekuma;
V64 Ebi, Ebo, Ebu;
V64 Ega, Ego, Egu;
V64 Eki, Eko, Eku;
V64 Ema, Emo, Emu;
V64 Esa, Ese, Esi, Eso, Esu;
V128 Zero;
Abae.v128 = LOAD128(state[ 0]);
Aba = Abae.v128;
Abe = GET64HIHI(Abae.v128, Abae.v128);
Cae = Abae.v128;
Abio.v128 = LOAD128(state[ 2]);
Abi = Abio.v128;
Abo = GET64HIHI(Abio.v128, Abio.v128);
Cio = Abio.v128;
Abu = LOAD64(state[ 4]);
Cu = Abu;
Agae.v128 = LOAD128u(state[ 5]);
Aga = Agae.v128;
Age = GET64HIHI(Agae.v128, Agae.v128);
Abage.v128 = GET64LOLO(Aba, Age);
XOReq128(Cae, Agae.v128);
Agio.v128 = LOAD128u(state[ 7]);
Agi = Agio.v128;
Abegi.v128 = GET64LOLO(Abe, Agi);
Ago = GET64HIHI(Agio.v128, Agio.v128);
XOReq128(Cio, Agio.v128);
Agu = LOAD64(state[ 9]);
XOReq64(Cu, Agu);
Akae.v128 = LOAD128(state[10]);
Aka = Akae.v128;
Ake = GET64HIHI(Akae.v128, Akae.v128);
XOReq128(Cae, Akae.v128);
Akio.v128 = LOAD128(state[12]);
Aki = Akio.v128;
Ako = GET64HIHI(Akio.v128, Akio.v128);
XOReq128(Cio, Akio.v128);
Aku = LOAD64(state[14]);
XOReq64(Cu, Aku);
Amae.v128 = LOAD128u(state[15]);
Ama = Amae.v128;
Ame = GET64HIHI(Amae.v128, Amae.v128);
Akame.v128 = GET64LOLO(Aka, Ame);
XOReq128(Cae, Amae.v128);
Amio.v128 = LOAD128u(state[17]);
Ami = Amio.v128;
Akemi.v128 = GET64LOLO(Ake, Ami);
Amo = GET64HIHI(Amio.v128, Amio.v128);
XOReq128(Cio, Amio.v128);
Amu = LOAD64(state[19]);
XOReq64(Cu, Amu);
Asae.v128 = LOAD128(state[20]);
Asa = Asae.v128;
Ase = GET64HIHI(Asae.v128, Asae.v128);
XOReq128(Cae, Asae.v128);
Asio.v128 = LOAD128(state[22]);
Asi = Asio.v128;
Aso = GET64HIHI(Asio.v128, Asio.v128);
XOReq128(Cio, Asio.v128);
Asu = LOAD64(state[24]);
XOReq64(Cu, Asu);
for(int i=0; i<24; i++) {
Cua = GET64LOLO(Cu, Cae);
Dei = XOR128(Cae, ROL64in128(Cio, 1));
Dou = XOR128(Cio, ROL64in128(Cua, 1));
Da = XOR64(Cu, ROL64in128(COPY64HI2LO(Cae), 1));
De = Dei;
Di = COPY64HI2LO(Dei);
Do = Dou;
Du = COPY64HI2LO(Dou);
Aba = LOAD64(Abage.v64[0]);
XOReq64(Aba, Da);
Bba = Aba;
XOReq64(Agu, Du);
Bge = ROL64(Agu, 20);
Bbage = GET64LOLO(Bba, Bge);
Age = LOAD64(Abage.v64[1]);
XOReq64(Age, De);
Bbe = ROL64(Age, 44);
Aka = LOAD64(Akame.v64[0]);
XOReq64(Aka, Da);
Bgi = ROL64(Aka, 3);
Bbegi = GET64LOLO(Bbe, Bgi);
XOReq64(Aki, Di);
Bbi = ROL64(Aki, 43);
Ame = LOAD64(Akame.v64[1]);
XOReq64(Ame, De);
Bgo = ROL64(Ame, 45);
Bbigo = GET64LOLO(Bbi, Bgo);
Ebage.v128 = XOR128(Bbage, ANDnu128(Bbegi, Bbigo));
XOReq128(Ebage.v128, CONST64(KeccakF1600RoundConstants[i]));
Cae = Ebage.v128;
XOReq64(Amo, Do);
Bbo = ROL64(Amo, 21);
XOReq64(Asi, Di);
Bgu = ROL64(Asi, 61);
Bbogu = GET64LOLO(Bbo, Bgu);
Ebegi.v128 = XOR128(Bbegi, ANDnu128(Bbigo, Bbogu));
Cei = Ebegi.v128;
XOReq64(Asu, Du);
Bbu = ROL64(Asu, 14);
XOReq64(Abo, Do);
Bga = ROL64(Abo, 28);
Bbuga = GET64LOLO(Bbu, Bga);
Ebigo.v128 = XOR128(Bbigo, ANDnu128(Bbogu, Bbuga));
Ebi = Ebigo.v128;
Ego = GET64HIHI(Ebigo.v128, Ebigo.v128);
Cio = Ebigo.v128;
Ebogu.v128 = XOR128(Bbogu, ANDnu128(Bbuga, Bbage));
Ebo = Ebogu.v128;
Egu = GET64HIHI(Ebogu.v128, Ebogu.v128);
Cou = Ebogu.v128;
Ebuga.v128 = XOR128(Bbuga, ANDnu128(Bbage, Bbegi));
Ebu = Ebuga.v128;
Ega = GET64HIHI(Ebuga.v128, Ebuga.v128);
Cua = Ebuga.v128;
Abe = LOAD64(Abegi.v64[0]);
XOReq64(Abe, De);
Bka = ROL64(Abe, 1);
XOReq64(Aga, Da);
Bme = ROL64(Aga, 36);
Bkame = GET64LOLO(Bka, Bme);
Agi = LOAD64(Abegi.v64[1]);
XOReq64(Agi, Di);
Bke = ROL64(Agi, 6);
Ake = LOAD64(Akemi.v64[0]);
XOReq64(Ake, De);
Bmi = ROL64(Ake, 10);
Bkemi = GET64LOLO(Bke, Bmi);
XOReq64(Ako, Do);
Bki = ROL64(Ako, 25);
Ami = LOAD64(Akemi.v64[1]);
XOReq64(Ami, Di);
Bmo = ROL64(Ami, 15);
Bkimo = GET64LOLO(Bki, Bmo);
Ekame.v128 = XOR128(Bkame, ANDnu128(Bkemi, Bkimo));
XOReq128(Cae, Ekame.v128);
Bkomu = GET64LOLO(XOR64(Amu, Du), XOR64(Aso, Do));
Bkomu = SHUFFLEBYTES128(Bkomu, CONST128(rho8_56));
Ekemi.v128 = XOR128(Bkemi, ANDnu128(Bkimo, Bkomu));
XOReq128(Cei, Ekemi.v128);
XOReq64(Asa, Da);
Bku = ROL64(Asa, 18);
XOReq64(Abu, Du);
Bma = ROL64(Abu, 27);
Bkuma = GET64LOLO(Bku, Bma);
Ekimo.v128 = XOR128(Bkimo, ANDnu128(Bkomu, Bkuma));
Eki = Ekimo.v128;
Emo = GET64HIHI(Ekimo.v128, Ekimo.v128);
XOReq128(Cio, Ekimo.v128);
Ekomu.v128 = XOR128(Bkomu, ANDnu128(Bkuma, Bkame));
Eko = Ekomu.v128;
Emu = GET64HIHI(Ekomu.v128, Ekomu.v128);
XOReq128(Cou, Ekomu.v128);
Ekuma.v128 = XOR128(Bkuma, ANDnu128(Bkame, Bkemi));
Eku = Ekuma.v128;
Ema = GET64HIHI(Ekuma.v128, Ekuma.v128);
XOReq128(Cua, Ekuma.v128);
XOReq64(Abi, Di);
Bsa = ROL64(Abi, 62);
XOReq64(Ago, Do);
Bse = ROL64(Ago, 55);
XOReq64(Aku, Du);
Bsi = ROL64(Aku, 39);
Esa = XOR64(Bsa, ANDnu64(Bse, Bsi));
Ca = Esa;
XOReq64(Ama, Da);
Bso = ROL64(Ama, 41);
Ese = XOR64(Bse, ANDnu64(Bsi, Bso));
Ce = Ese;
XOReq128(Cae, GET64LOLO(Ca, Ce));
XOReq64(Ase, De);
Bsu = ROL64(Ase, 2);
Esi = XOR64(Bsi, ANDnu64(Bso, Bsu));
Ci = Esi;
Eso = XOR64(Bso, ANDnu64(Bsu, Bsa));
Co = Eso;
XOReq128(Cio, GET64LOLO(Ci, Co));
Esu = XOR64(Bsu, ANDnu64(Bsa, Bse));
Cu = Esu;
Zero = ZERO128();
XOReq128(Cae, GET64HIHI(Cua, Zero));
XOReq128(Cae, GET64LOLO(Zero, Cei));
XOReq128(Cio, GET64HIHI(Cei, Zero));
XOReq128(Cio, GET64LOLO(Zero, Cou));
XOReq128(Cua, GET64HIHI(Cou, Zero));
XOReq64(Cu, Cua);
Abage = Ebage;
Abegi = Ebegi;
Abi = Ebi;
Abo = Ebo;
Abu = Ebu;
Aga = Ega;
Ago = Ego;
Agu = Egu;
Akame = Ekame;
Akemi = Ekemi;
Aki = Eki;
Ako = Eko;
Aku = Eku;
Ama = Ema;
Amo = Emo;
Amu = Emu;
Asa = Esa;
Ase = Ese;
Asi = Esi;
Aso = Eso;
Asu = Esu;
}
state[ 0] = Abage.v64[0];
state[ 1] = Abegi.v64[0];
STORE64(state[ 2], Abi);
STORE64(state[ 3], Abo);
STORE64(state[ 4], Abu);
STORE64(state[ 5], Aga);
state[ 6] = Abage.v64[1];
state[ 7] = Abegi.v64[1];
STORE64(state[ 8], Ago);
STORE64(state[ 9], Agu);
state[10] = Akame.v64[0];
state[11] = Akemi.v64[0];
STORE64(state[12], Aki);
STORE64(state[13], Ako);
STORE64(state[14], Aku);
STORE64(state[15], Ama);
state[16] = Akame.v64[1];
state[17] = Akemi.v64[1];
STORE64(state[18], Amo);
STORE64(state[19], Amu);
STORE64(state[20], Asa);
STORE64(state[21], Ase);
STORE64(state[22], Asi);
STORE64(state[23], Aso);
STORE64(state[24], Asu);
}
// Compute a SHA3 (Keccak[r,c]) hash with rate r
// To compute a hash, put() the input bytes and get() the result.
// For SHA3-{224,256,384,512} set r={1152,1088,832,576}.
template <int r>
class Keccak {
public:
void put(int ch); // Hash 1 byte
int get(); // Retrieve next byte of hash
void init(); // Reset state to compute a new hash
Keccak() {init();}
private:
UINT64 state[25]; // Keccak hash state
unsigned ptr; // next byte to get/put (range 0..r/8)
enum {ABSORB, SQUEEZE} spongeState; // putting or getting?
void f(); // Keccak-F function
};
// Reset hash state to accept a new input message.
template <int r>
void Keccak<r>::init() {
memset(state, 0, sizeof(state));
ptr=0;
spongeState=ABSORB;
}
// Hash 1 byte. Reset state if this is the first byte.
template <int r>
void Keccak<r>::put(int ch) {
assert(ptr>=0 && ptr<r/8);
assert(ch>=0 && ch<=255);
assert(spongeState==ABSORB);
state[ptr/8]^=UINT64(ch)<<(ptr%8*8);
if (++ptr==r/8) f();
}
// Get 1 byte of hash. If this is the first byte then pad the input.
template <int r>
int Keccak<r>::get() {
if (spongeState==ABSORB) { // first get()?
assert(ptr>=0 && ptr<r/8);
if (ptr==r/8-1)
put(0x81);
else {
put(1);
while (ptr<r/8-1) put(0);
put(0x80);
}
spongeState=SQUEEZE;
assert(ptr==0);
}
assert(ptr>=0 && ptr<=r/8);
if (ptr==r/8) f();
int ch=(state[ptr/8]>>(ptr%8*8))&255;
++ptr;
return ch;
}
// Compute rounds
template <int r>
void Keccak<r>::f() {
assert(ptr==r/8);
KeccakPermutationOnWords(state);
ptr=0;
}
// Compute hash of each arg
int main(int argc, char** argv) {
Keccak<576> keccak;
for (int i=1; i<argc; ++i) {
FILE* in=fopen(argv[i], "rb");
if (!in)
perror(argv[i]);
else {
keccak.init();
for (int c; (c=getc(in))!=EOF; keccak.put(c));
fclose(in);
for (int j=0; j<64; ++j) printf("%02x", keccak.get());
printf(" %s\n", argv[i]);
}
}
return 0;
}
Edit: there are 6 variants of the optimized code. I used #define UseSSE. I could not get UseXOP to work, and the others would be slower. I couldn't get any of the faster versions to work in VC++. There are also options to unroll the rounds, but I found it was slower that way. I kept the byte oriented version to keep it simple, even though it could probably be faster using block I/O.