oxen-core/src/crypto/cn_heavy_hash_hard_arm.cpp

// Copyright (c) 2017, SUMOKOIN
//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without modification, are
// permitted provided that the following conditions are met:
//
// 1. Redistributions of source code must retain the above copyright notice, this list of
//    conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright notice, this list
//    of conditions and the following disclaimer in the documentation and/or other
//    materials provided with the distribution.
//
// 3. Neither the name of the copyright holder nor the names of its contributors may be
//    used to endorse or promote products derived from this software without specific
//    prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
// THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
// THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Parts of this file are originally copyright (c) 2014-2017, The Monero Project
// Parts of this file are originally copyright (c) 2012-2013, The Cryptonote developers

#if defined(__aarch64__)

#include "cn_heavy_hash.hpp"
extern "C" {
#include "keccak.h"
#include "jh.h"
#include "skein.h"
}

#if !defined(__APPLE__)
#include <sys/auxv.h>
#include <asm/hwcap.h>
#endif
#include <arm_neon.h>

static bool hw_check_aes()
{
#if !defined(__APPLE__)
	return (getauxval(AT_HWCAP) & HWCAP_AES) != 0;
#else
    return true;
#endif
}

extern "C" const bool cpu_aes_enabled = hw_check_aes() && !force_software_aes();

extern const uint8_t saes_sbox[256];

struct aeskeydata
{
	uint32_t x0;
	uint32_t x1;
	uint32_t x2;
	uint32_t x3;

	aeskeydata(const uint8_t* memory)
	{
		const uint32_t* mem = reinterpret_cast<const uint32_t*>(memory);
		x0 = mem[0];
		x1 = mem[1];
		x2 = mem[2];
		x3 = mem[3];
	}

	inline uint8x16_t store()
	{
		uint32x4_t tmp = {x0, x1, x2, x3};
		return vreinterpretq_u8_u32(tmp);
	}

	inline aeskeydata& operator^=(uint32_t rhs) noexcept
	{
		x0 ^= rhs;
		x1 ^= rhs;
		x2 ^= rhs;
		x3 ^= rhs;
		return *this;
	}
};

// sl_xor(a1 a2 a3 a4) = a1 (a2^a1) (a3^a2^a1) (a4^a3^a2^a1)
inline void sl_xor(aeskeydata& x)
{
	x.x1 ^= x.x0;
	x.x2 ^= x.x1;
	x.x3 ^= x.x2;
}

inline uint32_t sub_word(uint32_t key)
{
	return (saes_sbox[key >> 24 ] << 24)   | (saes_sbox[(key >> 16) & 0xff] << 16 ) |
		(saes_sbox[(key >> 8)  & 0xff] << 8  ) | saes_sbox[key & 0xff];
}

inline uint32_t rotr(uint32_t value, uint32_t amount)
{
	return (value >> amount) | (value << ((32 - amount) & 31));
}

template<uint8_t rcon>
inline void soft_aes_genkey_sub(aeskeydata& xout0, aeskeydata& xout2)
{
	uint32_t tmp;
	sl_xor(xout0);
	xout0 ^= rotr(sub_word(xout2.x3), 8) ^ rcon;
	sl_xor(xout2);
	xout2 ^= sub_word(xout0.x3);
}

inline void aes_genkey(const uint8_t* memory, uint8x16_t& k0, uint8x16_t& k1, uint8x16_t& k2, uint8x16_t& k3, uint8x16_t& k4,
	uint8x16_t& k5, uint8x16_t& k6, uint8x16_t& k7, uint8x16_t& k8, uint8x16_t& k9)
{
	aeskeydata xout0(memory);
	aeskeydata xout2(memory + 16);

	k0 = xout0.store();
	k1 = xout2.store();

	soft_aes_genkey_sub<0x01>(xout0, xout2);
	k2 = xout0.store();
	k3 = xout2.store();

	soft_aes_genkey_sub<0x02>(xout0, xout2);
	k4 = xout0.store();
	k5 = xout2.store();

	soft_aes_genkey_sub<0x04>(xout0, xout2);
	k6 = xout0.store();
	k7 = xout2.store();

	soft_aes_genkey_sub<0x08>(xout0, xout2);
	k8 = xout0.store();
	k9 = xout2.store();
}

inline void aes_round10(uint8x16_t& x, const uint8x16_t& k0, const uint8x16_t& k1, const uint8x16_t& k2, const uint8x16_t& k3,
				const uint8x16_t& k4, const uint8x16_t& k5, const uint8x16_t& k6, const uint8x16_t& k7, const uint8x16_t& k8, const uint8x16_t& k9)
{
	x = vaesmcq_u8(vaeseq_u8(x, vdupq_n_u8(0)));
	x = vaesmcq_u8(vaeseq_u8(x, k0));
	x = vaesmcq_u8(vaeseq_u8(x, k1));
	x = vaesmcq_u8(vaeseq_u8(x, k2));
	x = vaesmcq_u8(vaeseq_u8(x, k3));
	x = vaesmcq_u8(vaeseq_u8(x, k4));
	x = vaesmcq_u8(vaeseq_u8(x, k5));
	x = vaesmcq_u8(vaeseq_u8(x, k6));
	x = vaesmcq_u8(vaeseq_u8(x, k7));
	x = vaesmcq_u8(vaeseq_u8(x, k8));
	x = veorq_u8(x, k9);
}

inline void xor_shift(uint8x16_t& x0, uint8x16_t& x1, uint8x16_t& x2, uint8x16_t& x3, uint8x16_t& x4, uint8x16_t& x5, uint8x16_t& x6, uint8x16_t& x7)
{
	uint8x16_t tmp0 = x0;
	x0 ^= x1;
	x1 ^= x2;
	x2 ^= x3;
	x3 ^= x4;
	x4 ^= x5;
	x5 ^= x6;
	x6 ^= x7;
	x7 ^= tmp0;
}

inline void mem_load(cn_sptr& lpad, size_t i,uint8x16_t& x0, uint8x16_t& x1, uint8x16_t& x2, uint8x16_t& x3, uint8x16_t& x4, uint8x16_t& x5, uint8x16_t& x6, uint8x16_t& x7)
{
	x0 ^= vld1q_u8(lpad.as_byte() + i);
	x1 ^= vld1q_u8(lpad.as_byte() + i + 16);
	x2 ^= vld1q_u8(lpad.as_byte() + i + 32);
	x3 ^= vld1q_u8(lpad.as_byte() + i + 48);
	x4 ^= vld1q_u8(lpad.as_byte() + i + 64);
	x5 ^= vld1q_u8(lpad.as_byte() + i + 80);
	x6 ^= vld1q_u8(lpad.as_byte() + i + 96);
	x7 ^= vld1q_u8(lpad.as_byte() + i + 112);
}

template<size_t MEMORY, size_t ITER, size_t VERSION>
void cn_heavy_hash<MEMORY,ITER,VERSION>::implode_scratchpad_hard()
{
	uint8x16_t x0, x1, x2, x3, x4, x5, x6, x7;
	uint8x16_t k0, k1, k2, k3, k4, k5, k6, k7, k8, k9;

	aes_genkey(spad.as_byte() + 32, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9);

	x0 = vld1q_u8(spad.as_byte() + 64);
	x1 = vld1q_u8(spad.as_byte() + 80);
	x2 = vld1q_u8(spad.as_byte() + 96);
	x3 = vld1q_u8(spad.as_byte() + 112);
	x4 = vld1q_u8(spad.as_byte() + 128);
	x5 = vld1q_u8(spad.as_byte() + 144);
	x6 = vld1q_u8(spad.as_byte() + 160);
	x7 = vld1q_u8(spad.as_byte() + 176);

	for (size_t i = 0; i < MEMORY; i += 128)
	{
		mem_load(lpad, i, x0, x1, x2, x3, x4, x5, x6, x7);

		aes_round10(x0, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9);
		aes_round10(x1, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9);
		aes_round10(x2, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9);
		aes_round10(x3, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9);
		aes_round10(x4, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9);
		aes_round10(x5, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9);
		aes_round10(x6, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9);
		aes_round10(x7, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9);

		if(VERSION > 0)
			xor_shift(x0, x1, x2, x3, x4, x5, x6, x7);
	}

	for (size_t i = 0; VERSION > 0 && i < MEMORY; i += 128)
	{
		mem_load(lpad, i, x0, x1, x2, x3, x4, x5, x6, x7);

		aes_round10(x0, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9);
		aes_round10(x1, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9);
		aes_round10(x2, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9);
		aes_round10(x3, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9);
		aes_round10(x4, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9);
		aes_round10(x5, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9);
		aes_round10(x6, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9);
		aes_round10(x7, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9);

		xor_shift(x0, x1, x2, x3, x4, x5, x6, x7);
	}

	for (size_t i = 0; VERSION > 0 && i < 16; i++)
	{
		aes_round10(x0, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9);
		aes_round10(x1, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9);
		aes_round10(x2, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9);
		aes_round10(x3, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9);
		aes_round10(x4, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9);
		aes_round10(x5, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9);
		aes_round10(x6, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9);
		aes_round10(x7, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9);

		xor_shift(x0, x1, x2, x3, x4, x5, x6, x7);
	}

	vst1q_u8(spad.as_byte() + 64, x0);
	vst1q_u8(spad.as_byte() + 80, x1);
	vst1q_u8(spad.as_byte() + 96, x2);
	vst1q_u8(spad.as_byte() + 112, x3);
	vst1q_u8(spad.as_byte() + 128, x4);
	vst1q_u8(spad.as_byte() + 144, x5);
	vst1q_u8(spad.as_byte() + 160, x6);
	vst1q_u8(spad.as_byte() + 176, x7);
}

template<size_t MEMORY, size_t ITER, size_t VERSION>
void cn_heavy_hash<MEMORY,ITER,VERSION>::explode_scratchpad_hard()
{
	uint8x16_t x0, x1, x2, x3, x4, x5, x6, x7;
	uint8x16_t k0, k1, k2, k3, k4, k5, k6, k7, k8, k9;

	aes_genkey(spad.as_byte(), k0, k1, k2, k3, k4, k5, k6, k7, k8, k9);

	x0 = vld1q_u8(spad.as_byte() + 64);
	x1 = vld1q_u8(spad.as_byte() + 80);
	x2 = vld1q_u8(spad.as_byte() + 96);
	x3 = vld1q_u8(spad.as_byte() + 112);
	x4 = vld1q_u8(spad.as_byte() + 128);
	x5 = vld1q_u8(spad.as_byte() + 144);
	x6 = vld1q_u8(spad.as_byte() + 160);
	x7 = vld1q_u8(spad.as_byte() + 176);

	for (size_t i = 0; VERSION > 0 && i < 16; i++)
	{
		aes_round10(x0, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9);
		aes_round10(x1, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9);
		aes_round10(x2, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9);
		aes_round10(x3, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9);
		aes_round10(x4, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9);
		aes_round10(x5, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9);
		aes_round10(x6, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9);
		aes_round10(x7, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9);

		xor_shift(x0, x1, x2, x3, x4, x5, x6, x7);
	}

	for(size_t i = 0; i < MEMORY; i += 128)
	{
		aes_round10(x0, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9);
		aes_round10(x1, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9);
		aes_round10(x2, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9);
		aes_round10(x3, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9);
		aes_round10(x4, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9);
		aes_round10(x5, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9);
		aes_round10(x6, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9);
		aes_round10(x7, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9);

		vst1q_u8(lpad.as_byte() + i, x0);
		vst1q_u8(lpad.as_byte() + i + 16, x1);
		vst1q_u8(lpad.as_byte() + i + 32, x2);
		vst1q_u8(lpad.as_byte() + i + 48, x3);
		vst1q_u8(lpad.as_byte() + i + 64, x4);
		vst1q_u8(lpad.as_byte() + i + 80, x5);
		vst1q_u8(lpad.as_byte() + i + 96, x6);
		vst1q_u8(lpad.as_byte() + i + 112, x7);
	}
}

inline uint64_t _umul128(uint64_t a, uint64_t b, uint64_t* hi)
{
	unsigned __int128 r = (unsigned __int128)a * (unsigned __int128)b;
	*hi = r >> 64;
	return (uint64_t)r;
}

extern "C" void blake256_hash(uint8_t*, const uint8_t*, uint64_t);
extern "C" void groestl(const unsigned char*, unsigned long long, unsigned char*);

inline uint8x16_t _mm_set_epi64x(const uint64_t a, const uint64_t b)
{
    return vreinterpretq_u8_u64(vcombine_u64(vcreate_u64(b), vcreate_u64(a)));
}

template<size_t MEMORY, size_t ITER, size_t VERSION>
void cn_heavy_hash<MEMORY,ITER,VERSION>::hardware_hash(const void* in, size_t len, void* out, bool prehashed)
{
	if (!prehashed)
		keccak((const uint8_t *)in, len, spad.as_byte(), 200);

	explode_scratchpad_hard();

	uint64_t* h0 = spad.as_uqword();

	uint64_t al0 = h0[0] ^ h0[4];
	uint64_t ah0 = h0[1] ^ h0[5];
	uint8x16_t bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);

	uint64_t idx0 = h0[0] ^ h0[4];

	const uint8x16_t zero = vdupq_n_u8(0);
	// Optim - 90% time boundary
	for(size_t i = 0; i < ITER; i++)
	{
		uint8x16_t cx;
		cx = vld1q_u8(scratchpad_ptr(idx0).as_byte());

		cx = vaesmcq_u8(vaeseq_u8(cx, zero)) ^ _mm_set_epi64x(ah0, al0);

		vst1q_u8(scratchpad_ptr(idx0).as_byte(), bx0 ^ cx);

		idx0 = vgetq_lane_u64(vreinterpretq_u64_u8(cx), 0);
		bx0 = cx;

		uint64_t hi, lo, cl, ch;
		cl = scratchpad_ptr(idx0).as_uqword(0);
		ch = scratchpad_ptr(idx0).as_uqword(1);

		lo = _umul128(idx0, cl, &hi);

		al0 += hi;
		ah0 += lo;
		scratchpad_ptr(idx0).as_uqword(0) = al0;
		scratchpad_ptr(idx0).as_uqword(1) = ah0;
		ah0 ^= ch;
		al0 ^= cl;
		idx0 = al0;

		if(VERSION > 0)
		{
			int64_t n  = scratchpad_ptr(idx0).as_qword(0);
			int32_t d  = scratchpad_ptr(idx0).as_dword(2);
			int64_t q = n / (d | 5);
			scratchpad_ptr(idx0).as_qword(0) = n ^ q;
			idx0 = d ^ q;
		}
	}

	implode_scratchpad_hard();

	keccakf(spad.as_uqword(), 24);

	switch(spad.as_byte(0) & 3)
	{
	case 0:
		blake256_hash((uint8_t*)out, spad.as_byte(), 200);
		break;
	case 1:
		groestl(spad.as_byte(), 200 * 8, (uint8_t*)out);
		break;
	case 2:
		jh_hash(32 * 8, spad.as_byte(), 8 * 200, (uint8_t*)out);
		break;
	case 3:
		skein_hash(8 * 32, spad.as_byte(), 8 * 200, (uint8_t*)out);
		break;
	}
}

template class cn_heavy_hash<2*1024*1024, 0x80000, 0>;
template class cn_heavy_hash<4*1024*1024, 0x40000, 1>;

#endif