This commit is contained in:
Ryan Tharp 2018-08-16 10:38:02 +00:00
commit 94b6b828c1
129 changed files with 5572 additions and 770 deletions

View File

@ -47,6 +47,9 @@ else()
set(THREAD_LIB pthread)
endif()
add_cflags("-march=native")
add_cxxflags("-march=native")
if(STATIC_LINK)
add_cflags("-static -Wl,--whole-archive -lpthread -Wl,--no-whole-archive")
add_cxxflags("-static -Wl,--whole-archive -lpthread -Wl,--no-whole-archive")
@ -179,14 +182,14 @@ else()
endif(UNIX)
if(TUNTAP)
set(LIBTUNTAP_SRC
set(LIBTUNTAP_SRC_BASE
${TT_ROOT}/tuntap.c
${TT_ROOT}/tuntap_log.cpp
${LIBTUNTAP_IMPL})
if (UNIX)
set(LIBTUNTAP_SRC
${TT_ROOT}/tuntap-unix.c
${LIBTUNTAP_SRC})
${LIBTUNTAP_SRC_BASE})
endif()
else()
set(LIBTUNTAP_SRC "")
@ -238,6 +241,8 @@ set(LIB_PLATFORM_SRC
llarp/threadpool.cpp
# for android shim
${ANDROID_PLATFORM_SRC}
# tun
${LIBTUNTAP_SRC}
# win32 inline procs
llarp/win32_inet.c
llarp/win32_intrnl.c
@ -245,9 +250,54 @@ set(LIB_PLATFORM_SRC
contrib/msc/getopt1.c
)
set(NTRU_AVX_SRC
crypto/libntrup/src/avx/randomsmall.c
crypto/libntrup/src/avx/weight.c
crypto/libntrup/src/avx/swap.c
crypto/libntrup/src/avx/rq_round3.c
crypto/libntrup/src/avx/rq_recip3.c
crypto/libntrup/src/avx/small.c
crypto/libntrup/src/avx/randomweightw.c
crypto/libntrup/src/avx/dec.c
crypto/libntrup/src/avx/r3_recip.c
crypto/libntrup/src/avx/keypair.c
crypto/libntrup/src/avx/rq_rounded.c
crypto/libntrup/src/avx/mult.c
crypto/libntrup/src/avx/enc.c
crypto/libntrup/src/avx/int32_sort.c
crypto/libntrup/src/avx/rq.c
crypto/libntrup/src/avx/rq_mod3.c
)
set(NTRU_REF_SRC
crypto/libntrup/src/ref/randomsmall.c
crypto/libntrup/src/ref/swap.c
crypto/libntrup/src/ref/rq_round3.c
crypto/libntrup/src/ref/rq_recip3.c
crypto/libntrup/src/ref/small.c
crypto/libntrup/src/ref/rq_mult.c
crypto/libntrup/src/ref/randomweightw.c
crypto/libntrup/src/ref/random32.c
crypto/libntrup/src/ref/dec.c
crypto/libntrup/src/ref/r3_mult.c
crypto/libntrup/src/ref/r3_recip.c
crypto/libntrup/src/ref/keypair.c
crypto/libntrup/src/ref/rq_rounded.c
crypto/libntrup/src/ref/enc.c
crypto/libntrup/src/ref/int32_sort.c
crypto/libntrup/src/ref/rq.c
)
set(LIB_SRC
include_directories(crypto/libntrup/include)
set(NTRU_SRC
${NTRU_AVX_SRC}
${NTRU_REF_SRC}
crypto/libntrup/src/ntru.cpp
)
set(LIB_SRC
${NTRU_SRC}
llarp/address_info.cpp
llarp/bencode.cpp
llarp/buffer.cpp
@ -308,7 +358,7 @@ set(LIB_SRC
llarp/service/protocol.cpp
llarp/service/tag.cpp
llarp/service/info.cpp
${LIBTUNTAP_SRC}
)
set(DNS_SRC
@ -326,6 +376,7 @@ set(TEST_SRC
test/dht_unittest.cpp
test/encrypted_frame_unittest.cpp
test/hiddenservice_unittest.cpp
test/pq_unittest.cpp
)
@ -361,7 +412,7 @@ link_directories(contrib/msc/lib)
endif()
if(SHADOW)
add_shadow_plugin(shadow-plugin-${SHARED_LIB} ${EXE_SRC} ${LIB_SRC})
add_shadow_plugin(shadow-plugin-${SHARED_LIB} ${EXE_SRC} ${LIB_SRC} ${LIB_PLATFORM_SRC} ${CPP_BACKPORT_SRC})
target_link_libraries(shadow-plugin-${SHARED_LIB} ${LIBS})
install(TARGETS shadow-plugin-${SHARED_LIB} DESTINATION plugins)
else()
@ -371,16 +422,12 @@ else()
add_executable(dns ${DNS_SRC})
if(WITH_TESTS)
enable_testing()
if(NOT WIN32)
add_subdirectory(${GTEST_DIR})
include_directories(${GTEST_DIR}/include ${GTEST_DIR})
add_executable(${TEST_EXE} ${TEST_SRC})
add_test(runAllTests ${TEST_EXE})
target_link_libraries(${TEST_EXE} ${STATIC_LINK_LIBS} gtest_main ${STATIC_LIB})
endif()
if(WITH_STATIC)
add_library(${STATIC_LIB} STATIC ${LIB_SRC})
if(NOT HAVE_CXX17_FILESYSTEM)

View File

@ -1,5 +1,5 @@
all: debug
all: test
SIGN = gpg --sign --detach
@ -27,6 +27,7 @@ TESTNET_CONF=$(TESTNET_ROOT)/supervisor.conf
TESTNET_LOG=$(TESTNET_ROOT)/testnet.log
EXE = $(REPO)/lokinet
TEST_EXE = $(REPO)/testAll
TESTNET_EXE=$(REPO)/lokinet-testnet
TESTNET_CLIENTS ?= 50
@ -42,18 +43,16 @@ clean:
rm -f *.a *.so
debug-configure:
cmake -GNinja -DCMAKE_BUILD_TYPE=Debug -DWITH_TESTS=ON -DCMAKE_C_COMPILER=$(CC) -DCMAKE_CXX_COMPILER=$(CXX) -DTUNTAP=ON
cmake -GNinja -DCMAKE_BUILD_TYPE=Debug -DCMAKE_C_COMPILER=$(CC) -DCMAKE_CXX_COMPILER=$(CXX) -DTUNTAP=ON
release-configure: clean
cmake -GNinja -DSTATIC_LINK=ON -DCMAKE_BUILD_TYPE=Release -DRELEASE_MOTTO="$(shell cat motto.txt)" -DCMAKE_C_COMPILER=$(CC) -DCMAKE_CXX_COMPILER=$(CXX) -DTUNTAP=ON
debug: debug-configure
ninja
ninja test
release-compile: release-configure
ninja
cp llarpd lokinet
strip $(TARGETS)
$(TARGETS): release-compile
@ -101,9 +100,8 @@ testnet:
python3 contrib/testnet/genconf.py --bin=$(TESTNET_EXE) --svc=$(TESTNET_SERVERS) --clients=$(TESTNET_CLIENTS) --dir=$(TESTNET_ROOT) --out $(TESTNET_CONF)
LLARP_DEBUG=$(TESTNET_DEBUG) supervisord -n -d $(TESTNET_ROOT) -l $(TESTNET_LOG) -c $(TESTNET_CONF)
test: debug-configure
ninja
ninja test
test: debug
$(TEST_EXE)
format:
clang-format -i $$(find daemon llarp include | grep -E '\.[h,c](pp)?$$')

View File

@ -56,9 +56,14 @@ def makeBase(settings, name, id):
def makeClient(settings, name, id):
peer = makeBase(settings, name, id)
nodeconf(peer['config'], getSetting(settings, 'baseDir', 'tmp'), name)
basedir = getSetting(settings, 'baseDir', 'tmp')
nodeconf(peer['config'], basedir, name)
fname = os.path.join(basedir, "test-service.ini")
peer['config']['services'] = {
'test-service': fname
}
with open(fname, 'w') as f:
f.write("[test-service]")
return peer
@ -81,7 +86,8 @@ def genconf(settings, outf):
kill = etree.SubElement(root, 'kill')
kill.attrib['time'] = getSetting(settings, 'runFor', '600')
baseDir = getSetting(settings, 'baseDir', 'tmp')
baseDir = getSetting(settings, 'baseDir',
os.path.join('/tmp', 'lokinet-shadow'))
if not os.path.exists(baseDir):
os.mkdir(baseDir)
@ -125,6 +131,7 @@ def genconf(settings, outf):
if __name__ == '__main__':
settings = {
'baseDir': os.path.join("/tmp", "lokinet-shadow"),
'topology': os.path.join(shadowRoot, 'share', 'topology.graphml.xml'),
'runFor': '{}'.format(60 * 10 * 10)
}

View File

@ -0,0 +1 @@
#include <sodium/crypto_hash_sha512.h>

View File

@ -0,0 +1,2 @@
#include <stdint.h>
typedef int16_t crypto_int16;

View File

@ -0,0 +1,2 @@
#include <stdint.h>
typedef int32_t crypto_int32;

View File

@ -0,0 +1,2 @@
#include <stdint.h>
typedef int64_t crypto_int64;

View File

@ -0,0 +1,2 @@
#include <stdint.h>
typedef int8_t crypto_int8;

View File

@ -0,0 +1 @@
#include <libntrup/ntru.h>

View File

@ -0,0 +1,2 @@
#include <stdint.h>
typedef uint16_t crypto_uint16;

View File

@ -0,0 +1,2 @@
#include <stdint.h>
typedef uint32_t crypto_uint32;

View File

@ -0,0 +1 @@
#include <sodium/crypto_verify_32.h>

View File

@ -0,0 +1,37 @@
#ifndef LIBNTRUP_NTRU_H
#define LIBNTRUP_NTRU_H
#ifdef __cplusplus
extern "C"
{
#endif
#include "ntru_api.h"
void
ntru_init(int force_no_avx2);
int
crypto_kem_enc(unsigned char *cstr, unsigned char *k,
const unsigned char *pk);
int
crypto_kem_dec(unsigned char *k, const unsigned char *cstr,
const unsigned char *sk);
int
crypto_kem_keypair(unsigned char *pk, unsigned char *sk);
#define crypto_kem_SECRETKEYBYTES 1600
#define crypto_kem_PUBLICKEYBYTES 1218
#define crypto_kem_CIPHERTEXTBYTES 1047
#define NTRU_SECRETKEYBYTES CRYPTO_SECRETKEYBYTES
#define NTRU_PUBLICKEYBYTES CRYPTO_PUBLICKEYBYTES
#define NTRU_CIPHERTEXTBYTES CRYPTO_CIPHERTEXTBYTES
#define CRYPTO_BYTES 32
#ifdef __cplusplus
}
#endif
#endif

View File

@ -0,0 +1,12 @@
int crypto_kem_enc_ref(unsigned char *cstr, unsigned char *k, const unsigned char *pk);
int crypto_kem_dec_ref(unsigned char *k, const unsigned char *cstr, const unsigned char *sk);
int crypto_kem_keypair_ref(unsigned char *pk, unsigned char * sk);
int crypto_kem_enc_avx2(unsigned char *cstr, unsigned char *k, const unsigned char *pk);
int crypto_kem_dec_avx2(unsigned char *k, const unsigned char *cstr, const unsigned char *sk);
int crypto_kem_keypair_avx2(unsigned char *pk, unsigned char * sk);

View File

@ -0,0 +1 @@
#include <sodium/randombytes.h>

View File

@ -0,0 +1,4 @@
#define CRYPTO_SECRETKEYBYTES 1600
#define CRYPTO_PUBLICKEYBYTES 1218
#define CRYPTO_CIPHERTEXTBYTES 1047
#define CRYPTO_BYTES 32

View File

@ -0,0 +1,72 @@
#ifdef KAT
#include <stdio.h>
#endif
#include "params.h"
#include "small.h"
#include "mod3.h"
#include "rq.h"
#include "r3.h"
#include "crypto_hash_sha512.h"
#include "crypto_verify_32.h"
#include "crypto_kem.h"
int crypto_kem_dec_avx2(
unsigned char *k,
const unsigned char *cstr,
const unsigned char *sk
)
{
#if __AVX2__
small f[768];
modq h[768];
small grecip[768];
modq c[768];
modq t[768];
small t3[768];
small r[768];
modq hr[768];
unsigned char rstr[small_encode_len];
unsigned char hash[64];
int i;
int result = 0;
small_decode(f,sk);
small_decode(grecip,sk + small_encode_len);
rq_decode(h,sk + 2 * small_encode_len);
rq_decoderounded(c,cstr + 32);
rq_mult(t,c,f);
rq_mod3(t3,t);
r3_mult(r,t3,grecip);
#ifdef KAT
{
int j;
printf("decrypt r:");
for (j = 0;j < p;++j)
if (r[j] == 1) printf(" +%d",j);
else if (r[j] == -1) printf(" -%d",j);
printf("\n");
}
#endif
result |= r3_weightw_mask(r);
rq_mult(hr,h,r);
rq_round3(hr,hr);
for (i = 0;i < p;++i) result |= modq_nonzero_mask(hr[i] - c[i]);
small_encode(rstr,r);
crypto_hash_sha512(hash,rstr,sizeof rstr);
result |= crypto_verify_32(hash,cstr);
for (i = 0;i < 32;++i) k[i] = (hash[32 + i] & ~result);
return result;
#else
return -1;
#endif
}

View File

@ -0,0 +1,52 @@
#ifdef KAT
#include <stdio.h>
#endif
#include <string.h>
#include "params.h"
#include "small.h"
#include "rq.h"
#include "crypto_hash_sha512.h"
#include "crypto_kem.h"
int crypto_kem_enc_avx2(
unsigned char *cstr,
unsigned char *k,
const unsigned char *pk
)
{
#if __AVX2__
small r[768];
modq h[768];
modq c[768];
unsigned char rstr[small_encode_len];
unsigned char hash[64];
small_random_weightw(r);
#ifdef KAT
{
int i;
printf("encrypt r:");
for (i = 0;i < p;++i)
if (r[i] == 1) printf(" +%d",i);
else if (r[i] == -1) printf(" -%d",i);
printf("\n");
}
#endif
small_encode(rstr,r);
crypto_hash_sha512(hash,rstr,sizeof rstr);
rq_decode(h,pk);
rq_mult(c,h,r);
memcpy(k,hash + 32,32);
memcpy(cstr,hash,32);
rq_roundencode(cstr + 32,c);
return 0;
#else
return -1;
#endif
}

View File

@ -0,0 +1,5 @@
Alphabetical order:
Daniel J. Bernstein
Chitchanok Chuengsatiansup
Tanja Lange
Christine van Vredendaal

View File

@ -0,0 +1,427 @@
#ifdef __AVX2__
#include "int32_sort.h"
#include <immintrin.h>
typedef crypto_int32 int32;
static inline void minmax(int32 *x,int32 *y)
{
asm("movl (%0),%%eax;movl (%1),%%ebx;cmpl %%ebx,%%eax;mov %%eax,%%edx;cmovg %%ebx,%%eax;cmovg %%edx,%%ebx;movl %%eax,(%0);movl %%ebx,(%1)"
: : "r"(x),"r"(y) : "%eax","%ebx","%edx");
}
/* sort x0,x2; sort x1,x3; ... sort x13, x15 */
static inline void minmax02through1315(int32 *x)
{
__m256i a = _mm256_loadu_si256((__m256i *) x);
__m256i b = _mm256_loadu_si256((__m256i *) (x + 8));
__m256i c = _mm256_unpacklo_epi64(a,b); /* a01b01a45b45 */
__m256i d = _mm256_unpackhi_epi64(a,b); /* a23b23a67b67 */
__m256i g = _mm256_min_epi32(c,d);
__m256i h = _mm256_max_epi32(c,d);
a = _mm256_unpacklo_epi64(g,h);
b = _mm256_unpackhi_epi64(g,h);
_mm256_storeu_si256((__m256i *) x,a);
_mm256_storeu_si256((__m256i *) (x + 8),b);
}
/* sort x0,x2; sort x1,x3; sort x4,x6; sort x5,x7 */
static inline void minmax02134657(int32 *x)
{
__m256i a = _mm256_loadu_si256((__m256i *) x);
__m256i b = _mm256_shuffle_epi32(a,0x4e);
__m256i c = _mm256_cmpgt_epi32(a,b);
c = _mm256_shuffle_epi32(c,0x44);
__m256i abc = c & (a ^ b);
a ^= abc;
_mm256_storeu_si256((__m256i *) x,a);
}
static void multiminmax2plus2(
int32 *x,
int n)
{
while (n >= 16) {
minmax02through1315(x);
n -= 16;
x += 16;
}
if (n >= 8) {
minmax02134657(x);
n -= 8;
x += 8;
}
if (n >= 4) {
minmax(x,x + 2);
minmax(x + 1,x + 3);
n -= 4;
x += 4;
}
if (n > 0) {
minmax(x,x + 2);
if (n > 1) minmax(x + 1,x + 3);
}
}
static void multiminmax2plus6(
int32 *x,
int n)
{
while (n >= 4) {
minmax(x,x + 6);
minmax(x + 1,x + 7);
n -= 4;
x += 4;
}
if (n > 0) {
minmax(x,x + 6);
if (n > 1) minmax(x + 1,x + 7);
}
}
static void multiminmax2plus14(
int32 *x,
int n)
{
while (n >= 8) {
minmax(x,x + 14);
minmax(x + 1,x + 15);
minmax(x + 4,x + 18);
minmax(x + 5,x + 19);
n -= 8;
x += 8;
}
if (n >= 4) {
minmax(x,x + 14);
minmax(x + 1,x + 15);
n -= 4;
x += 4;
}
if (n > 0) {
minmax(x,x + 14);
if (n > 1) minmax(x + 1,x + 15);
}
}
/* sort x[i],y[i] for i in 0,1,4,5,8,9,12,13 */
/* all of x0...x15 and y0...y15 must exist; no aliasing */
static inline void minmax0145891213(int32 *x,int32 *y)
{
__m256i a01234567 = _mm256_loadu_si256((__m256i *) x);
__m256i a89101112131415 = _mm256_loadu_si256((__m256i *) (x + 8));
__m256i b01234567 = _mm256_loadu_si256((__m256i *) y);
__m256i b89101112131415 = _mm256_loadu_si256((__m256i *) (y + 8));
__m256i a0189451213 = _mm256_unpacklo_epi64(a01234567,a89101112131415);
__m256i b0189451213 = _mm256_unpacklo_epi64(b01234567,b89101112131415);
__m256i c0189451213 = _mm256_min_epi32(a0189451213,b0189451213);
__m256i d0189451213 = _mm256_max_epi32(a0189451213,b0189451213);
__m256i c01234567 = _mm256_blend_epi32(a01234567,c0189451213,0x33);
__m256i d01234567 = _mm256_blend_epi32(b01234567,d0189451213,0x33);
__m256i c89101112131415 = _mm256_unpackhi_epi64(c0189451213,a89101112131415);
__m256i d89101112131415 = _mm256_unpackhi_epi64(d0189451213,b89101112131415);
_mm256_storeu_si256((__m256i *) x,c01234567);
_mm256_storeu_si256((__m256i *) (x + 8),c89101112131415);
_mm256_storeu_si256((__m256i *) y,d01234567);
_mm256_storeu_si256((__m256i *) (y + 8),d89101112131415);
}
/* offset >= 30 */
static void multiminmax2plusmore(
int32 *x,
int n,
int offset)
{
while (n >= 16) {
minmax0145891213(x,x + offset);
n -= 16;
x += 16;
}
if (n >= 8) {
minmax(x,x + offset);
minmax(x + 1,x + 1 + offset);
minmax(x + 4,x + 4 + offset);
minmax(x + 5,x + 5 + offset);
n -= 8;
x += 8;
}
if (n >= 4) {
minmax(x,x + offset);
minmax(x + 1,x + 1 + offset);
n -= 4;
x += 4;
}
if (n > 0) {
minmax(x,x + offset);
if (n > 1) minmax(x + 1,x + 1 + offset);
}
}
/* sort x0,x1; ... sort x14, x15 */
static inline void minmax01through1415(int32 *x)
{
__m256i a = _mm256_loadu_si256((__m256i *) x);
__m256i b = _mm256_loadu_si256((__m256i *) (x + 8));
__m256i c = _mm256_unpacklo_epi32(a,b); /* ab0ab1ab4ab5 */
__m256i d = _mm256_unpackhi_epi32(a,b); /* ab2ab3ab6ab7 */
__m256i e = _mm256_unpacklo_epi32(c,d); /* a02b02a46b46 */
__m256i f = _mm256_unpackhi_epi32(c,d); /* a13b13a57b57 */
__m256i g = _mm256_min_epi32(e,f); /* a02b02a46b46 */
__m256i h = _mm256_max_epi32(e,f); /* a13b13a57b57 */
a = _mm256_unpacklo_epi32(g,h);
b = _mm256_unpackhi_epi32(g,h);
_mm256_storeu_si256((__m256i *) x,a);
_mm256_storeu_si256((__m256i *) (x + 8),b);
}
/* sort x0,x1; sort x2,x3; sort x4,x5; sort x6,x7 */
static inline void minmax01234567(int32 *x)
{
__m256i a = _mm256_loadu_si256((__m256i *) x);
__m256i b = _mm256_shuffle_epi32(a,0xb1);
__m256i c = _mm256_cmpgt_epi32(a,b);
c = _mm256_shuffle_epi32(c,0xa0);
__m256i abc = c & (a ^ b);
a ^= abc;
_mm256_storeu_si256((__m256i *) x,a);
}
static void multiminmax1plus1(
int32 *x,
int n)
{
while (n >= 16) {
minmax01through1415(x);
n -= 16;
x += 16;
}
if (n >= 8) {
minmax01234567(x);
n -= 8;
x += 8;
}
if (n >= 4) {
minmax(x,x + 1);
minmax(x + 2,x + 3);
n -= 4;
x += 4;
}
if (n >= 2) {
minmax(x,x + 1);
n -= 2;
x += 2;
}
if (n > 0)
minmax(x,x + 1);
}
static void multiminmax1(
int32 *x,
int n,
int offset)
{
while (n >= 16) {
minmax(x,x + offset);
minmax(x + 2,x + 2 + offset);
minmax(x + 4,x + 4 + offset);
minmax(x + 6,x + 6 + offset);
minmax(x + 8,x + 8 + offset);
minmax(x + 10,x + 10 + offset);
minmax(x + 12,x + 12 + offset);
minmax(x + 14,x + 14 + offset);
n -= 16;
x += 16;
}
if (n >= 8) {
minmax(x,x + offset);
minmax(x + 2,x + 2 + offset);
minmax(x + 4,x + 4 + offset);
minmax(x + 6,x + 6 + offset);
n -= 8;
x += 8;
}
if (n >= 4) {
minmax(x,x + offset);
minmax(x + 2,x + 2 + offset);
n -= 4;
x += 4;
}
if (n >= 2) {
minmax(x,x + offset);
n -= 2;
x += 2;
}
if (n > 0)
minmax(x,x + offset);
}
/* sort x[i],y[i] for i in 0,2,4,6,8,10,12,14 */
/* all of x0...x15 and y0...y15 must exist; no aliasing */
static inline void minmax02468101214(int32 *x,int32 *y)
{
__m256i a01234567 = _mm256_loadu_si256((__m256i *) x);
__m256i a89101112131415 = _mm256_loadu_si256((__m256i *) (x + 8));
__m256i b01234567 = _mm256_loadu_si256((__m256i *) y);
__m256i b89101112131415 = _mm256_loadu_si256((__m256i *) (y + 8));
__m256i a0819412513 = _mm256_unpacklo_epi32(a01234567,a89101112131415);
__m256i a210311614715 = _mm256_unpackhi_epi32(a01234567,a89101112131415);
__m256i a02810461214 = _mm256_unpacklo_epi32(a0819412513,a210311614715);
__m256i a13911571315 = _mm256_unpackhi_epi32(a0819412513,a210311614715);
__m256i b0819412513 = _mm256_unpacklo_epi32(b01234567,b89101112131415);
__m256i b210311614715 = _mm256_unpackhi_epi32(b01234567,b89101112131415);
__m256i b02810461214 = _mm256_unpacklo_epi32(b0819412513,b210311614715);
__m256i b13911571315 = _mm256_unpackhi_epi32(b0819412513,b210311614715);
__m256i c02810461214 = _mm256_min_epi32(a02810461214,b02810461214);
__m256i d02810461214 = _mm256_max_epi32(a02810461214,b02810461214);
__m256i c01234567 = _mm256_unpacklo_epi32(c02810461214,a13911571315);
__m256i c89101112131415 = _mm256_unpackhi_epi32(c02810461214,a13911571315);
__m256i d01234567 = _mm256_unpacklo_epi32(d02810461214,b13911571315);
__m256i d89101112131415 = _mm256_unpackhi_epi32(d02810461214,b13911571315);
_mm256_storeu_si256((__m256i *) x,c01234567);
_mm256_storeu_si256((__m256i *) (x + 8),c89101112131415);
_mm256_storeu_si256((__m256i *) y,d01234567);
_mm256_storeu_si256((__m256i *) (y + 8),d89101112131415);
}
/* assumes offset >= 31 */
static void multiminmax1plusmore(
int32 *x,
int n,
int offset)
{
while (n >= 16) {
minmax02468101214(x,x + offset);
n -= 16;
x += 16;
}
if (n >= 8) {
minmax(x,x + offset);
minmax(x + 2,x + 2 + offset);
minmax(x + 4,x + 4 + offset);
minmax(x + 6,x + 6 + offset);
n -= 8;
x += 8;
}
if (n >= 4) {
minmax(x,x + offset);
minmax(x + 2,x + 2 + offset);
n -= 4;
x += 4;
}
if (n >= 2) {
minmax(x,x + offset);
n -= 2;
x += 2;
}
if (n > 0)
minmax(x,x + offset);
}
/* sort x0,y0; sort x1,y1; ...; sort x7,y7 */
static inline void minmax8(int32 *x,int32 *y)
{
__m256i a = _mm256_loadu_si256((__m256i *) x);
__m256i b = _mm256_loadu_si256((__m256i *) y);
_mm256_storeu_si256((__m256i *) x,_mm256_min_epi32(a,b));
_mm256_storeu_si256((__m256i *) y,_mm256_max_epi32(a,b));
}
/* assumes p >= 8; implies offset >= 8 */
static void multiminmax_atleast8(int p,
int32 *x,
int n,
int offset)
{
int i;
while (n >= 2 * p) {
for (i = 0;i < p;i += 8)
minmax8(x + i,x + i + offset);
n -= 2 * p;
x += 2 * p;
}
for (i = 0;i + 8 <= n;i += 8) {
if (i & p) return;
minmax8(x + i,x + i + offset);
}
for (;i < n;++i) {
if (i & p) return;
minmax(x + i,x + i + offset);
}
}
/* sort x0,y0; sort x1,y1; sort x2,y2; sort x3,y3 */
static inline void minmax4(int32 *x,int32 *y)
{
__m128i a = _mm_loadu_si128((__m128i *) x);
__m128i b = _mm_loadu_si128((__m128i *) y);
_mm_storeu_si128((__m128i *) x,_mm_min_epi32(a,b));
_mm_storeu_si128((__m128i *) y,_mm_max_epi32(a,b));
}
static void multiminmax4(
int32 *x,
int n,
int offset)
{
int i;
while (n >= 8) {
minmax4(x,x + offset);
n -= 8;
x += 8;
}
if (n >= 4)
minmax4(x,x + offset);
else
for (i = 0;i < n;++i)
minmax(x + i,x + i + offset);
}
void int32_sort(int32 *x,int n)
{
int top,p,q;
if (n < 2) return;
top = 1;
while (top < n - top) top += top;
for (p = top;p >= 8;p >>= 1) {
multiminmax_atleast8(p,x,n - p,p);
for (q = top;q > p;q >>= 1)
multiminmax_atleast8(p,x + p,n - q,q - p);
}
if (p >= 4) {
multiminmax4(x,n - 4,4);
for (q = top;q > 4;q >>= 1)
multiminmax4(x + 4,n - q,q - 4);
}
if (p >= 2) {
multiminmax2plus2(x,n - 2);
for (q = top;q >= 32;q >>= 1)
multiminmax2plusmore(x + 2,n - q,q - 2);
if (q >= 16)
multiminmax2plus14(x + 2,n - 16);
if (q >= 8)
multiminmax2plus6(x + 2,n - 8);
if (q >= 4)
multiminmax2plus2(x + 2,n - 4);
}
multiminmax1plus1(x,n - 1);
for (q = top;q >= 32;q >>= 1)
multiminmax1plusmore(x + 1,n - q,q - 1);
if (q >= 16)
multiminmax1(x + 1,n - 16,15);
if (q >= 8)
multiminmax1(x + 1,n - 8,7);
if (q >= 4)
multiminmax1(x + 1,n - 4,3);
if (q >= 2)
multiminmax1plus1(x + 1,n - 2);
}
#endif

View File

@ -0,0 +1,9 @@
#ifndef int32_sort_h
#define int32_sort_h
#include "crypto_int32.h"
#define int32_sort crypto_kem_sntrup4591761_avx_int32_sort
extern void int32_sort(crypto_int32 *,int);
#endif

View File

@ -0,0 +1,43 @@
#include <string.h>
#include "modq.h"
#include "params.h"
#include "r3.h"
#include "small.h"
#include "rq.h"
#include "crypto_kem.h"
#if crypto_kem_PUBLICKEYBYTES != rq_encode_len
#error "crypto_kem_PUBLICKEYBYTES must match rq_encode_len"
#endif
#if crypto_kem_SECRETKEYBYTES != rq_encode_len + 2 * small_encode_len
#error "crypto_kem_SECRETKEYBYTES must match rq_encode_len + 2 * small_encode_len"
#endif
int crypto_kem_keypair_avx2(unsigned char *pk,unsigned char *sk)
{
#if __AVX2__
small g[768];
small grecip[768];
small f[768];
modq f3recip[768];
modq h[768];
do
small_random(g);
while (r3_recip(grecip,g) != 0);
small_random_weightw(f);
rq_recip3(f3recip,f);
rq_mult(h,f3recip,g);
rq_encode(pk,h);
small_encode(sk,f);
small_encode(sk + small_encode_len,grecip);
memcpy(sk + 2 * small_encode_len,pk,rq_encode_len);
return 0;
#else
return -1;
#endif
}

View File

@ -0,0 +1,60 @@
#ifndef mod3_h
#define mod3_h
#include "small.h"
#include "crypto_int32.h"
/* -1 if x is nonzero, 0 otherwise */
static inline int mod3_nonzero_mask(small x)
{
return -x*x;
}
/* input between -100000 and 100000 */
/* output between -1 and 1 */
static inline small mod3_freeze(crypto_int32 a)
{
a -= 3 * ((10923 * a) >> 15);
a -= 3 * ((89478485 * a + 134217728) >> 28);
return a;
}
static inline small mod3_minusproduct(small a,small b,small c)
{
crypto_int32 A = a;
crypto_int32 B = b;
crypto_int32 C = c;
return mod3_freeze(A - B * C);
}
static inline small mod3_plusproduct(small a,small b,small c)
{
crypto_int32 A = a;
crypto_int32 B = b;
crypto_int32 C = c;
return mod3_freeze(A + B * C);
}
static inline small mod3_product(small a,small b)
{
return a * b;
}
static inline small mod3_sum(small a,small b)
{
crypto_int32 A = a;
crypto_int32 B = b;
return mod3_freeze(A + B);
}
static inline small mod3_reciprocal(small a1)
{
return a1;
}
static inline small mod3_quotient(small num,small den)
{
return mod3_product(num,mod3_reciprocal(den));
}
#endif

View File

@ -0,0 +1,91 @@
#ifndef modq_h
#define modq_h
#include "crypto_int16.h"
#include "crypto_int32.h"
#include "crypto_uint16.h"
typedef crypto_int16 modq;
/* input between -9000000 and 9000000 */
/* output between -2295 and 2295 */
static inline modq modq_freeze(crypto_int32 a)
{
a -= 4591 * ((228 * a) >> 20);
a -= 4591 * ((58470 * a + 134217728) >> 28);
return a;
}
static inline modq modq_minusproduct(modq a,modq b,modq c)
{
crypto_int32 A = a;
crypto_int32 B = b;
crypto_int32 C = c;
return modq_freeze(A - B * C);
}
static inline modq modq_plusproduct(modq a,modq b,modq c)
{
crypto_int32 A = a;
crypto_int32 B = b;
crypto_int32 C = c;
return modq_freeze(A + B * C);
}
static inline modq modq_product(modq a,modq b)
{
crypto_int32 A = a;
crypto_int32 B = b;
return modq_freeze(A * B);
}
static inline modq modq_square(modq a)
{
crypto_int32 A = a;
return modq_freeze(A * A);
}
static inline modq modq_sum(modq a,modq b)
{
crypto_int32 A = a;
crypto_int32 B = b;
return modq_freeze(A + B);
}
static inline modq modq_reciprocal(modq a1)
{
modq a2 = modq_square(a1);
modq a3 = modq_product(a2,a1);
modq a4 = modq_square(a2);
modq a8 = modq_square(a4);
modq a16 = modq_square(a8);
modq a32 = modq_square(a16);
modq a35 = modq_product(a32,a3);
modq a70 = modq_square(a35);
modq a140 = modq_square(a70);
modq a143 = modq_product(a140,a3);
modq a286 = modq_square(a143);
modq a572 = modq_square(a286);
modq a1144 = modq_square(a572);
modq a1147 = modq_product(a1144,a3);
modq a2294 = modq_square(a1147);
modq a4588 = modq_square(a2294);
modq a4589 = modq_product(a4588,a1);
return a4589;
}
static inline modq modq_quotient(modq num,modq den)
{
return modq_product(num,modq_reciprocal(den));
}
/* -1 if x is nonzero, 0 otherwise */
static inline int modq_nonzero_mask(modq x)
{
crypto_int32 r = (crypto_uint16) x;
r = -r;
r >>= 30;
return r;
}
#endif

View File

@ -0,0 +1,764 @@
#if __AVX2__
#include <string.h>
#include <immintrin.h>
#include "mod3.h"
#include "rq.h"
#include "r3.h"
#define MULSTEP_gcc(j,h0,h1,h2,h3,h4) \
gj = g[j]; \
h0 += f0 * gj; \
_mm256_storeu_ps(&h[i + j],h0); \
h1 += f1 * gj; \
h2 += f2 * gj; \
h3 += f3 * gj; \
h4 += f4 * gj; \
h0 = _mm256_loadu_ps(&h[i + j + 5]); \
h0 += f5 * gj;
#define MULSTEP_asm(j,h0,h1,h2,h3,h4) \
gj = g[j]; \
__asm__( \
"vfmadd231ps %5,%6,%0 \n\t" \
"vmovups %0,%12 \n\t" \
"vmovups %13,%0 \n\t" \
"vfmadd231ps %5,%7,%1 \n\t" \
"vfmadd231ps %5,%8,%2 \n\t" \
"vfmadd231ps %5,%9,%3 \n\t" \
"vfmadd231ps %5,%10,%4 \n\t" \
"vfmadd231ps %5,%11,%0 \n\t" \
: "+x"(h0),"+x"(h1),"+x"(h2),"+x"(h3),"+x"(h4) \
: "x"(gj),"x"(f0),"x"(f1),"x"(f2),"x"(f3),"x"(f4),"x"(f5),"m"(h[i+j]),"m"(h[i+j+5]));
#define MULSTEP MULSTEP_asm
#define MULSTEP_noload(j,h0,h1,h2,h3,h4) \
gj = g[j]; \
__asm__( \
"vfmadd231ps %5,%6,%0 \n\t" \
"vmovups %0,%12 \n\t" \
"vfmadd231ps %5,%7,%1 \n\t" \
"vfmadd231ps %5,%8,%2 \n\t" \
"vfmadd231ps %5,%9,%3 \n\t" \
"vfmadd231ps %5,%10,%4 \n\t" \
"vmulps %5,%11,%0 \n\t" \
: "+x"(h0),"+x"(h1),"+x"(h2),"+x"(h3),"+x"(h4) \
: "x"(gj),"x"(f0),"x"(f1),"x"(f2),"x"(f3),"x"(f4),"x"(f5),"m"(h[i+j]));
#define MULSTEP_fromzero(j,h0,h1,h2,h3,h4) \
gj = g[j]; \
__asm__( \
"vmulps %5,%6,%0 \n\t" \
"vmovups %0,%12 \n\t" \
"vmulps %5,%7,%1 \n\t" \
"vmulps %5,%8,%2 \n\t" \
"vmulps %5,%9,%3 \n\t" \
"vmulps %5,%10,%4 \n\t" \
"vmulps %5,%11,%0 \n\t" \
: "=&x"(h0),"=&x"(h1),"=&x"(h2),"=&x"(h3),"=&x"(h4) \
: "x"(gj),"x"(f0),"x"(f1),"x"(f2),"x"(f3),"x"(f4),"x"(f5),"m"(h[i+j]));
static inline __m128i _mm_load_cvtepi8_epi16(const long long *x)
{
__m128i result;
__asm__("vpmovsxbw %1, %0" : "=x"(result) : "m"(*x));
return result;
}
#define v0 _mm256_set1_epi32(0)
#define v0_128 _mm_set1_epi32(0)
#define v7 _mm256_set1_epi16(7)
#define v4591_16 _mm256_set1_epi16(4591)
#define v2296_16 _mm256_set1_epi16(2296)
#define alpha_32 _mm256_set1_epi32(0x4b400000)
#define alpha_32_128 _mm_set1_epi32(0x4b400000)
#define alpha_float _mm256_set1_ps(12582912.0)
#define v0_float _mm256_set1_ps(0)
#define v1_float _mm256_set1_ps(1)
#define vm1_float _mm256_set1_ps(-1)
#define vm4591_float _mm256_set1_ps(-4591)
#define recip4591_float _mm256_set1_ps(0.00021781746896101067305597908952297974298)
static inline __m256 add(__m256 x,__m256 y)
{
return x + y;
}
static inline __m256 fastadd(__m256 x,__m256 y)
{
return _mm256_fmadd_ps(y,v1_float,x);
}
static inline __m256 fastsub(__m256 x,__m256 y)
{
return _mm256_fmadd_ps(y,vm1_float,x);
}
static inline __m256 reduce(__m256 x)
{
__m256 q = x * recip4591_float;
q = _mm256_round_ps(q,8);
return _mm256_fmadd_ps(q,vm4591_float,x);
}
static inline __m256i squeeze(__m256i x)
{
__m256i q = _mm256_mulhrs_epi16(x,v7);
q = _mm256_mullo_epi16(q,v4591_16);
return _mm256_sub_epi16(x,q);
}
static inline __m256i squeezeadd16(__m256i x,__m256i y)
{
__m256i q;
x = _mm256_add_epi16(x,y);
q = _mm256_mulhrs_epi16(x,v7);
q = _mm256_mullo_epi16(q,v4591_16);
return _mm256_sub_epi16(x,q);
}
static inline __m256i freeze(__m256i x)
{
__m256i mask, x2296, x4591;
x4591 = _mm256_add_epi16(x,v4591_16);
mask = _mm256_srai_epi16(x,15);
x = _mm256_blendv_epi8(x,x4591,mask);
x2296 = _mm256_sub_epi16(x,v2296_16);
mask = _mm256_srai_epi16(x2296,15);
x4591 = _mm256_sub_epi16(x,v4591_16);
x = _mm256_blendv_epi8(x4591,x,mask);
return x;
}
/* 24*8*float32 f inputs between -10000 and 10000 */
/* 24*8*float32 g inputs between -32 and 32 */
/* 48*8*float32 h outputs between -7680000 and 7680000 */
static void mult24x8_float(__m256 h[48],const __m256 f[24],const __m256 g[24])
{
int i, j;
__m256 f0, f1, f2, f3, f4, f5, gj, h0, h1, h2, h3, h4;
i = 0;
f0 = f[i];
f1 = f[i + 1];
f2 = f[i + 2];
f3 = f[i + 3];
f4 = f[i + 4];
f5 = f[i + 5];
MULSTEP_fromzero(0,h0,h1,h2,h3,h4)
for (j = 0;j < 20;j += 5) {
MULSTEP_noload(j + 1,h1,h2,h3,h4,h0)
MULSTEP_noload(j + 2,h2,h3,h4,h0,h1)
MULSTEP_noload(j + 3,h3,h4,h0,h1,h2)
MULSTEP_noload(j + 4,h4,h0,h1,h2,h3)
MULSTEP_noload(j + 5,h0,h1,h2,h3,h4)
}
MULSTEP_noload(j + 1,h1,h2,h3,h4,h0)
MULSTEP_noload(j + 2,h2,h3,h4,h0,h1)
MULSTEP_noload(j + 3,h3,h4,h0,h1,h2)
h[i + j + 4] = h4;
h[i + j + 5] = h0;
h[i + j + 6] = h1;
h[i + j + 7] = h2;
h[i + j + 8] = h3;
for (i = 6;i < 24;i += 6) {
f0 = f[i];
f1 = f[i + 1];
f2 = f[i + 2];
f3 = f[i + 3];
f4 = f[i + 4];
f5 = f[i + 5];
h0 = h[i];
h1 = h[i + 1];
h2 = h[i + 2];
h3 = h[i + 3];
h4 = h[i + 4];
for (j = 0;j < 15;j += 5) {
MULSTEP(j + 0,h0,h1,h2,h3,h4)
MULSTEP(j + 1,h1,h2,h3,h4,h0)
MULSTEP(j + 2,h2,h3,h4,h0,h1)
MULSTEP(j + 3,h3,h4,h0,h1,h2)
MULSTEP(j + 4,h4,h0,h1,h2,h3)
}
MULSTEP(j + 0,h0,h1,h2,h3,h4)
MULSTEP(j + 1,h1,h2,h3,h4,h0)
MULSTEP(j + 2,h2,h3,h4,h0,h1)
MULSTEP_noload(j + 3,h3,h4,h0,h1,h2)
MULSTEP_noload(j + 4,h4,h0,h1,h2,h3)
MULSTEP_noload(j + 5,h0,h1,h2,h3,h4)
MULSTEP_noload(j + 6,h1,h2,h3,h4,h0)
MULSTEP_noload(j + 7,h2,h3,h4,h0,h1)
MULSTEP_noload(j + 8,h3,h4,h0,h1,h2)
h[i + j + 9] = h4;
h[i + j + 10] = h0;
h[i + j + 11] = h1;
h[i + j + 12] = h2;
h[i + j + 13] = h3;
}
h[47] = v0_float;
}
/* 48*8*float32 f inputs between -5000 and 5000 */
/* 48*8*float32 g inputs between -16 and 16 */
/* 96*8*float32 h outputs between -3840000 and 3840000 */
static void mult48x8_float(__m256 h[96],const __m256 f[48],const __m256 g[48])
{
__m256 h01[48];
__m256 g01[24];
__m256 *f01 = h01 + 24;
int i;
for (i = 24;i > 0;) {
i -= 2;
f01[i] = f[i] + f[i + 24];
g01[i] = g[i] + g[i + 24];
f01[i + 1] = f[i + 1] + f[i + 1 + 24];
g01[i + 1] = g[i + 1] + g[i + 1 + 24];
}
mult24x8_float(h,f,g);
mult24x8_float(h + 48,f + 24,g + 24);
mult24x8_float(h01,f01,g01);
for (i = 0;i < 24;++i) {
__m256 h0i = h[i];
__m256 h0itop = h[i + 24];
__m256 h1i = h[i + 48];
__m256 h1itop = h[i + 72];
__m256 h01i = h01[i];
__m256 h01itop = h01[i + 24];
__m256 c = fastsub(h0itop,h1i);
h[i + 24] = c + fastsub(h01i,h0i);
h[i + 48] = fastsub(h01itop,h1itop) - c;
}
}
/* 96*8*float32 f inputs between -2500 and 2500 */
/* 96*8*float32 g inputs between -8 and 8 */
/* 192*8*float32 h outputs between -1920000 and 1920000 */
static void mult96x8_float(__m256 h[192],const __m256 f[96],const __m256 g[96])
{
__m256 h01[96];
__m256 g01[48];
__m256 *f01 = h01 + 48;
int i;
for (i = 48;i > 0;) {
i -= 4;
f01[i] = f[i] + f[i + 48];
g01[i] = g[i] + g[i + 48];
f01[i + 1] = f[i + 1] + f[i + 1 + 48];
g01[i + 1] = g[i + 1] + g[i + 1 + 48];
f01[i + 2] = f[i + 2] + f[i + 2 + 48];
g01[i + 2] = g[i + 2] + g[i + 2 + 48];
f01[i + 3] = f[i + 3] + f[i + 3 + 48];
g01[i + 3] = g[i + 3] + g[i + 3 + 48];
}
mult48x8_float(h,f,g);
mult48x8_float(h + 96,f + 48,g + 48);
mult48x8_float(h01,f01,g01);
for (i = 0;i < 48;++i) {
__m256 h0i = h[i];
__m256 h0itop = h[i + 48];
__m256 h1i = h[i + 96];
__m256 h1itop = h[i + 144];
__m256 h01i = h01[i];
__m256 h01itop = h01[i + 48];
__m256 c = fastsub(h0itop,h1i);
h[i + 48] = c + fastsub(h01i,h0i);
h[i + 96] = fastsub(h01itop,h1itop) - c;
}
}
/* 96*16*int16 f inputs between -2500 and 2500 */
/* 96*(16*int8 stored in 32*int8) g inputs between -8 and 8 */
/* 192*16*int16 h outputs between -2400 and 2400 */
static void mult96x16(__m256i h[192],const __m256i f[96],const __m256i g[96])
{
__m256 hfloat[192];
__m256 gfloat[96];
__m256 *ffloat = hfloat + 96;
int i, p;
for (p = 0;p < 2;++p) {
for (i = 96;i > 0;) {
i -= 2;
__m256i fi = _mm256_cvtepi16_epi32(_mm_loadu_si128(p + (const __m128i *) &f[i]));
__m256i gi = _mm256_cvtepi16_epi32(_mm_load_cvtepi8_epi16(p + (const long long *) &g[i]));
__m256 storage;
*(__m256i *) &storage = _mm256_add_epi32(fi,alpha_32);
ffloat[i] = storage - alpha_float;
*(__m256i *) &storage = _mm256_add_epi32(gi,alpha_32);
gfloat[i] = storage - alpha_float;
fi = _mm256_cvtepi16_epi32(_mm_loadu_si128(p + (const __m128i *) &f[i + 1]));
gi = _mm256_cvtepi16_epi32(_mm_load_cvtepi8_epi16(p + (const long long *) &g[i + 1]));
*(__m256i *) &storage = _mm256_add_epi32(fi,alpha_32);
ffloat[i + 1] = storage - alpha_float;
*(__m256i *) &storage = _mm256_add_epi32(gi,alpha_32);
gfloat[i + 1] = storage - alpha_float;
}
mult96x8_float(hfloat,ffloat,gfloat);
for (i = 192;i > 0;) {
__m128i h0, h1;
i -= 4;
hfloat[i] = add(alpha_float,reduce(hfloat[i]));
hfloat[i + 1] = fastadd(alpha_float,reduce(hfloat[i + 1]));
hfloat[i + 2] = add(alpha_float,reduce(hfloat[i + 2]));
hfloat[i + 3] = fastadd(alpha_float,reduce(hfloat[i + 3]));
h0 = 0[(__m128i *) &hfloat[i]]; h0 = _mm_sub_epi32(h0,alpha_32_128);
h1 = 1[(__m128i *) &hfloat[i]]; h1 = _mm_sub_epi32(h1,alpha_32_128);
_mm_storeu_si128(p + (__m128i *) &h[i],_mm_packs_epi32(h0,h1));
h0 = 0[(__m128i *) &hfloat[i + 1]]; h0 = _mm_sub_epi32(h0,alpha_32_128);
h1 = 1[(__m128i *) &hfloat[i + 1]]; h1 = _mm_sub_epi32(h1,alpha_32_128);
_mm_storeu_si128(p + (__m128i *) &h[i + 1],_mm_packs_epi32(h0,h1));
h0 = 0[(__m128i *) &hfloat[i + 2]]; h0 = _mm_sub_epi32(h0,alpha_32_128);
h1 = 1[(__m128i *) &hfloat[i + 2]]; h1 = _mm_sub_epi32(h1,alpha_32_128);
_mm_storeu_si128(p + (__m128i *) &h[i + 2],_mm_packs_epi32(h0,h1));
h0 = 0[(__m128i *) &hfloat[i + 3]]; h0 = _mm_sub_epi32(h0,alpha_32_128);
h1 = 1[(__m128i *) &hfloat[i + 3]]; h1 = _mm_sub_epi32(h1,alpha_32_128);
_mm_storeu_si128(p + (__m128i *) &h[i + 3],_mm_packs_epi32(h0,h1));
}
}
}
/* int16 i of output x[j] is int16 j of input x[i] */
static void transpose16(__m256i x[16])
{
const static int rev[4] = {0,4,2,6};
int i;
__m256i y[16];
for (i = 0;i < 16;i += 4) {
__m256i a0 = x[i];
__m256i a1 = x[i + 1];
__m256i a2 = x[i + 2];
__m256i a3 = x[i + 3];
__m256i b0 = _mm256_unpacklo_epi16(a0,a1);
__m256i b1 = _mm256_unpackhi_epi16(a0,a1);
__m256i b2 = _mm256_unpacklo_epi16(a2,a3);
__m256i b3 = _mm256_unpackhi_epi16(a2,a3);
__m256i c0 = _mm256_unpacklo_epi32(b0,b2);
__m256i c2 = _mm256_unpackhi_epi32(b0,b2);
__m256i c1 = _mm256_unpacklo_epi32(b1,b3);
__m256i c3 = _mm256_unpackhi_epi32(b1,b3);
y[i] = c0;
y[i + 2] = c2;
y[i + 1] = c1;
y[i + 3] = c3;
}
for (i = 0;i < 4;++i) {
int r = rev[i];
__m256i c0 = y[i];
__m256i c4 = y[i + 4];
__m256i c8 = y[i + 8];
__m256i c12 = y[i + 12];
__m256i d0 = _mm256_unpacklo_epi64(c0,c4);
__m256i d4 = _mm256_unpackhi_epi64(c0,c4);
__m256i d8 = _mm256_unpacklo_epi64(c8,c12);
__m256i d12 = _mm256_unpackhi_epi64(c8,c12);
__m256i e0 = _mm256_permute2x128_si256(d0,d8,0x20);
__m256i e8 = _mm256_permute2x128_si256(d0,d8,0x31);
__m256i e4 = _mm256_permute2x128_si256(d4,d12,0x20);
__m256i e12 = _mm256_permute2x128_si256(d4,d12,0x31);
x[r] = e0;
x[r + 8] = e8;
x[r + 1] = e4;
x[r + 9] = e12;
}
}
/* byte i of output x[j] is byte j of input x[i] */
static void transpose32(__m256i x[32])
{
const static int rev[4] = {0,8,4,12};
int i;
__m256i y[32];
for (i = 0;i < 32;i += 4) {
__m256i a0 = x[i];
__m256i a1 = x[i + 1];
__m256i a2 = x[i + 2];
__m256i a3 = x[i + 3];
__m256i b0 = _mm256_unpacklo_epi8(a0,a1);
__m256i b1 = _mm256_unpackhi_epi8(a0,a1);
__m256i b2 = _mm256_unpacklo_epi8(a2,a3);
__m256i b3 = _mm256_unpackhi_epi8(a2,a3);
__m256i c0 = _mm256_unpacklo_epi16(b0,b2);
__m256i c2 = _mm256_unpackhi_epi16(b0,b2);
__m256i c1 = _mm256_unpacklo_epi16(b1,b3);
__m256i c3 = _mm256_unpackhi_epi16(b1,b3);
y[i] = c0;
y[i + 2] = c2;
y[i + 1] = c1;
y[i + 3] = c3;
}
for (i = 0;i < 4;++i) {
int r = rev[i];
__m256i c0 = y[i];
__m256i c8 = y[i + 8];
__m256i c16 = y[i + 16];
__m256i c24 = y[i + 24];
__m256i c4 = y[i + 4];
__m256i c12 = y[i + 12];
__m256i c20 = y[i + 20];
__m256i c28 = y[i + 28];
__m256i d0 = _mm256_unpacklo_epi32(c0,c4);
__m256i d4 = _mm256_unpackhi_epi32(c0,c4);
__m256i d8 = _mm256_unpacklo_epi32(c8,c12);
__m256i d12 = _mm256_unpackhi_epi32(c8,c12);
__m256i d16 = _mm256_unpacklo_epi32(c16,c20);
__m256i d20 = _mm256_unpackhi_epi32(c16,c20);
__m256i d24 = _mm256_unpacklo_epi32(c24,c28);
__m256i d28 = _mm256_unpackhi_epi32(c24,c28);
__m256i e0 = _mm256_unpacklo_epi64(d0,d8);
__m256i e8 = _mm256_unpackhi_epi64(d0,d8);
__m256i e16 = _mm256_unpacklo_epi64(d16,d24);
__m256i e24 = _mm256_unpackhi_epi64(d16,d24);
__m256i e4 = _mm256_unpacklo_epi64(d4,d12);
__m256i e12 = _mm256_unpackhi_epi64(d4,d12);
__m256i e20 = _mm256_unpacklo_epi64(d20,d28);
__m256i e28 = _mm256_unpackhi_epi64(d20,d28);
__m256i f0 = _mm256_permute2x128_si256(e0,e16,0x20);
__m256i f16 = _mm256_permute2x128_si256(e0,e16,0x31);
__m256i f8 = _mm256_permute2x128_si256(e8,e24,0x20);
__m256i f24 = _mm256_permute2x128_si256(e8,e24,0x31);
__m256i f4 = _mm256_permute2x128_si256(e4,e20,0x20);
__m256i f20 = _mm256_permute2x128_si256(e4,e20,0x31);
__m256i f12 = _mm256_permute2x128_si256(e12,e28,0x20);
__m256i f28 = _mm256_permute2x128_si256(e12,e28,0x31);
x[r] = f0;
x[r + 16] = f16;
x[r + 1] = f8;
x[r + 17] = f24;
x[r + 2] = f4;
x[r + 18] = f20;
x[r + 3] = f12;
x[r + 19] = f28;
}
}
/* 48*16*int16 f inputs between -2295 and 2295 */
/* 24*32*int8 g inputs between -1 and 1 */
/* 96*16*int16 h outputs between -2295 and 2295 */
static void mult768_mix2_m256i(__m256i h[96],const __m256i f[48],const __m256i g[24])
{
__m256i hkara[24][16];
__m256i gkara[3][32];
#define fkara hkara
int i;
for (i = 6;i-- > 0;) {
__m256i f0, f1, f2, f3, f4, f5, f6, f7;
__m256i f01, f23, f45, f67;
__m256i f02, f46, f04, f26, f0426;
__m256i f13, f57, f15, f37, f1537;
__m256i f0213, f4657, f04261537, f0415, f2637;
f0 = _mm256_loadu_si256(&f[i + 0]);
f1 = _mm256_loadu_si256(&f[i + 6]);
f2 = _mm256_loadu_si256(&f[i + 12]);
f3 = _mm256_loadu_si256(&f[i + 18]);
f4 = _mm256_loadu_si256(&f[i + 24]);
f5 = _mm256_loadu_si256(&f[i + 30]);
f6 = _mm256_loadu_si256(&f[i + 36]);
f7 = _mm256_loadu_si256(&f[i + 42]);
f01 = squeezeadd16(f0,f1); fkara[i][8] = f01;
f23 = squeezeadd16(f2,f3); fkara[i][9] = f23;
f45 = squeezeadd16(f4,f5); fkara[i][10] = f45;
f67 = squeezeadd16(f6,f7); fkara[i][11] = f67;
fkara[i][0] = f0;
fkara[i][2] = f2;
fkara[i][4] = f4;
fkara[i][6] = f6;
f02 = squeezeadd16(f0,f2); fkara[i + 6][0] = f02;
f04 = squeezeadd16(f0,f4); fkara[i + 6][6] = f04;
f46 = squeezeadd16(f4,f6); fkara[i + 6][3] = f46;
f26 = squeezeadd16(f2,f6); fkara[i + 6][8] = f26;
fkara[i][1] = f1;
fkara[i][3] = f3;
fkara[i][5] = f5;
fkara[i][7] = f7;
f13 = squeezeadd16(f1,f3); fkara[i + 6][1] = f13;
f15 = squeezeadd16(f1,f5); fkara[i + 6][7] = f15;
f57 = squeezeadd16(f5,f7); fkara[i + 6][4] = f57;
f37 = squeezeadd16(f3,f7); fkara[i + 6][9] = f37;
f0426 = squeezeadd16(f04,f26); fkara[i + 6][12] = f0426;
f1537 = squeezeadd16(f15,f37); fkara[i + 6][13] = f1537;
f0213 = squeezeadd16(f02,f13); fkara[i + 6][2] = f0213;
f4657 = squeezeadd16(f46,f57); fkara[i + 6][5] = f4657;
f0415 = squeezeadd16(f04,f15); fkara[i + 6][10] = f0415;
f2637 = squeezeadd16(f26,f37); fkara[i + 6][11] = f2637;
f04261537 = squeezeadd16(f0426,f1537); fkara[i + 6][14] = f04261537;
fkara[i][12] = v0;
fkara[i][13] = v0;
fkara[i][14] = v0;
fkara[i][15] = v0;
fkara[i + 6][15] = v0;
}
for (i = 3;i-- > 0;) {
__m256i g0, g1, g2, g3, g4, g5, g6, g7;
__m256i g01, g23, g45, g67;
__m256i g02, g46, g04, g26, g0426;
__m256i g13, g57, g15, g37, g1537;
__m256i g0213, g4657, g04261537, g0415, g2637;
g0 = _mm256_loadu_si256(&g[i + 0]);
g1 = _mm256_loadu_si256(&g[i + 3]);
g2 = _mm256_loadu_si256(&g[i + 6]);
g3 = _mm256_loadu_si256(&g[i + 9]);
g4 = _mm256_loadu_si256(&g[i + 12]);
g5 = _mm256_loadu_si256(&g[i + 15]);
g6 = _mm256_loadu_si256(&g[i + 18]);
g7 = _mm256_loadu_si256(&g[i + 21]);
g01 = _mm256_add_epi8(g0,g1); gkara[i][8] = g01;
g23 = _mm256_add_epi8(g2,g3); gkara[i][9] = g23;
g45 = _mm256_add_epi8(g4,g5); gkara[i][10] = g45;
g67 = _mm256_add_epi8(g6,g7); gkara[i][11] = g67;
gkara[i][0] = g0;
gkara[i][2] = g2;
gkara[i][4] = g4;
gkara[i][6] = g6;
g02 = _mm256_add_epi8(g0,g2); gkara[i][16] = g02;
g04 = _mm256_add_epi8(g0,g4); gkara[i][22] = g04;
g46 = _mm256_add_epi8(g4,g6); gkara[i][19] = g46;
g26 = _mm256_add_epi8(g2,g6); gkara[i][24] = g26;
gkara[i][1] = g1;
gkara[i][3] = g3;
gkara[i][5] = g5;
gkara[i][7] = g7;
g13 = _mm256_add_epi8(g1,g3); gkara[i][17] = g13;
g15 = _mm256_add_epi8(g1,g5); gkara[i][23] = g15;
g57 = _mm256_add_epi8(g5,g7); gkara[i][20] = g57;
g37 = _mm256_add_epi8(g3,g7); gkara[i][25] = g37;
g0426 = _mm256_add_epi8(g04,g26); gkara[i][28] = g0426;
g1537 = _mm256_add_epi8(g15,g37); gkara[i][29] = g1537;
g0213 = _mm256_add_epi8(g02,g13); gkara[i][18] = g0213;
g4657 = _mm256_add_epi8(g46,g57); gkara[i][21] = g4657;
g0415 = _mm256_add_epi8(g04,g15); gkara[i][26] = g0415;
g2637 = _mm256_add_epi8(g26,g37); gkara[i][27] = g2637;
g04261537 = _mm256_add_epi8(g0426,g1537); gkara[i][30] = g04261537;
gkara[i][12] = v0;
gkara[i][13] = v0;
gkara[i][14] = v0;
gkara[i][15] = v0;
gkara[i][31] = v0;
}
for (i = 12;i-- > 0;)
transpose16(fkara[i]);
for (i = 3;i-- > 0;)
transpose32(gkara[i]);
mult96x16(hkara[12],fkara[6],(__m256i *) (1 + (__m128i *) gkara));
mult96x16(hkara[0],fkara[0],gkara[0]);
for (i = 24;i-- > 0;)
transpose16(hkara[i]);
for (i = 6;i-- > 0;) {
__m256i h0,h1,h2,h3,h4,h5,h6,h7,h8,h9;
__m256i h10,h11,h12,h13,h14,h15,h16,h17,h18,h19;
__m256i h20,h21,h22,h23;
__m256i h32,h33,h34,h35,h36,h37,h38,h39;
__m256i h40,h41,h42,h43,h44,h45,h46,h47,h48,h49;
__m256i h50,h51,h52,h53,h54,h55,h56,h57,h58,h59;
__m256i h60,h61;
__m256i c;
#define COMBINE(h0,h1,h2,h3,x0,x1) \
c = _mm256_sub_epi16(h1,h2); \
h1 = _mm256_sub_epi16(_mm256_add_epi16(c,x0),h0); \
h2 = _mm256_sub_epi16(x1,_mm256_add_epi16(c,h3)); \
h1 = squeeze(h1); \
h2 = squeeze(h2);
h56 = hkara[i + 12][12];
h57 = hkara[i + 18][12];
h58 = hkara[i + 12][13];
h59 = hkara[i + 18][13];
h60 = hkara[i + 12][14];
h61 = hkara[i + 18][14];
COMBINE(h56,h57,h58,h59,h60,h61)
h44 = hkara[i + 12][6];
h45 = hkara[i + 18][6];
h46 = hkara[i + 12][7];
h47 = hkara[i + 18][7];
h52 = hkara[i + 12][10];
h53 = hkara[i + 18][10];
COMBINE(h44,h45,h46,h47,h52,h53)
h48 = hkara[i + 12][8];
h49 = hkara[i + 18][8];
h50 = hkara[i + 12][9];
h51 = hkara[i + 18][9];
h54 = hkara[i + 12][11];
h55 = hkara[i + 18][11];
COMBINE(h48,h49,h50,h51,h54,h55)
COMBINE(h44,h46,h48,h50,h56,h58)
COMBINE(h45,h47,h49,h51,h57,h59)
h0 = hkara[i][0];
h1 = hkara[i + 6][0];
h2 = hkara[i][1];
h3 = hkara[i + 6][1];
h16 = hkara[i][8];
h17 = hkara[i + 6][8];
COMBINE(h0,h1,h2,h3,h16,h17)
h4 = hkara[i][2];
h5 = hkara[i + 6][2];
h6 = hkara[i][3];
h7 = hkara[i + 6][3];
h18 = hkara[i][9];
h19 = hkara[i + 6][9];
COMBINE(h4,h5,h6,h7,h18,h19)
h32 = hkara[i + 12][0];
h33 = hkara[i + 18][0];
h34 = hkara[i + 12][1];
h35 = hkara[i + 18][1];
h36 = hkara[i + 12][2];
h37 = hkara[i + 18][2];
COMBINE(h32,h33,h34,h35,h36,h37)
COMBINE(h1,h3,h5,h7,h33,h35)
COMBINE(h0,h2,h4,h6,h32,h34)
h8 = hkara[i][4];
h9 = hkara[i + 6][4];
h10 = hkara[i][5];
h11 = hkara[i + 6][5];
h20 = hkara[i][10];
h21 = hkara[i + 6][10];
COMBINE(h8,h9,h10,h11,h20,h21)
h12 = hkara[i][6];
h13 = hkara[i + 6][6];
h14 = hkara[i][7];
h15 = hkara[i + 6][7];
h22 = hkara[i][11];
h23 = hkara[i + 6][11];
COMBINE(h12,h13,h14,h15,h22,h23)
h38 = hkara[i + 12][3];
h39 = hkara[i + 18][3];
h40 = hkara[i + 12][4];
h41 = hkara[i + 18][4];
h42 = hkara[i + 12][5];
h43 = hkara[i + 18][5];
COMBINE(h38,h39,h40,h41,h42,h43)
COMBINE(h8,h10,h12,h14,h38,h40)
COMBINE(h9,h11,h13,h15,h39,h41)
COMBINE(h0,h4,h8,h12,h44,h48)
h0 = freeze(h0);
h4 = freeze(h4);
h8 = freeze(h8);
h12 = freeze(h12);
_mm256_storeu_si256(&h[i + 0],h0);
_mm256_storeu_si256(&h[i + 24],h4);
_mm256_storeu_si256(&h[i + 48],h8);
_mm256_storeu_si256(&h[i + 72],h12);
COMBINE(h1,h5,h9,h13,h45,h49)
h1 = freeze(h1);
h5 = freeze(h5);
h9 = freeze(h9);
h13 = freeze(h13);
_mm256_storeu_si256(&h[i + 6],h1);
_mm256_storeu_si256(&h[i + 30],h5);
_mm256_storeu_si256(&h[i + 54],h9);
_mm256_storeu_si256(&h[i + 78],h13);
COMBINE(h2,h6,h10,h14,h46,h50)
h2 = freeze(h2);
h6 = freeze(h6);
h10 = freeze(h10);
h14 = freeze(h14);
_mm256_storeu_si256(&h[i + 12],h2);
_mm256_storeu_si256(&h[i + 36],h6);
_mm256_storeu_si256(&h[i + 60],h10);
_mm256_storeu_si256(&h[i + 84],h14);
COMBINE(h3,h7,h11,h15,h47,h51)
h3 = freeze(h3);
h7 = freeze(h7);
h11 = freeze(h11);
h15 = freeze(h15);
_mm256_storeu_si256(&h[i + 18],h3);
_mm256_storeu_si256(&h[i + 42],h7);
_mm256_storeu_si256(&h[i + 66],h11);
_mm256_storeu_si256(&h[i + 90],h15);
}
}
#define p 761
/* 761 f inputs between -2295 and 2295 */
/* 761 g inputs between -1 and 1 */
/* 761 h outputs between -2295 and 2295 */
void rq_mult(modq *h,const modq *f,const small *g)
{
__m256i fgvec[96];
modq *fg;
int i;
mult768_mix2_m256i(fgvec,(__m256i *) f,(__m256i *) g);
fg = (modq *) fgvec;
h[0] = modq_freeze(fg[0] + fg[p]);
for (i = 1;i < 9;++i)
h[i] = modq_freeze(fg[i] + fg[i + p - 1] + fg[i + p]);
for (i = 9;i < 761;i += 16) {
__m256i fgi = _mm256_loadu_si256((__m256i *) &fg[i]);
__m256i fgip = _mm256_loadu_si256((__m256i *) &fg[i + p]);
__m256i fgip1 = _mm256_loadu_si256((__m256i *) &fg[i + p - 1]);
__m256i x = _mm256_add_epi16(fgi,_mm256_add_epi16(fgip,fgip1));
x = freeze(squeeze(x));
_mm256_storeu_si256((__m256i *) &h[i],x);
}
for (i = 761;i < 768;++i)
h[i] = 0;
}
void r3_mult(small *h,const small *f,const small *g)
{
__m256i fgvec[96];
__m256i fvec[48];
modq *fg;
int i;
memset(fvec,0,sizeof fvec);
for (i = 0;i < 761;++i)
i[(modq *) fvec] = f[i];
mult768_mix2_m256i(fgvec,fvec,(__m256i *) g);
fg = (modq *) fgvec;
h[0] = mod3_freeze(fg[0] + fg[p]);
for (i = 1;i < p;++i)
h[i] = mod3_freeze(fg[i] + fg[i + p - 1] + fg[i + p]);
for (i = p;i < 768;++i)
h[i] = 0;
}
#endif

View File

@ -0,0 +1,14 @@
#ifndef params_h
#define params_h
#define q 4591
/* XXX: also built into modq in various ways */
#define qshift 2295
#define p 761
#define w 286
#define rq_encode_len 1218
#define small_encode_len 191
#endif

View File

@ -0,0 +1,15 @@
#ifndef r3_h
#define r3_h
#include "small.h"
#define r3_mult crypto_kem_sntrup4591761_avx_r3_mult
extern void r3_mult(small *,const small *,const small *);
#define r3_recip crypto_kem_sntrup4591761_avx_r3_recip
extern int r3_recip(small *,const small *);
#define r3_weightw_mask crypto_kem_sntrup4591761_avx_r3_weightw_mask
extern int r3_weightw_mask(const small *);
#endif

View File

@ -0,0 +1,194 @@
#if __AVX2__
#include <immintrin.h>
#include "params.h"
#include "mod3.h"
#include "swap.h"
#include "r3.h"
/* caller must ensure that x-y does not overflow */
static int smaller_mask(int x,int y)
{
return (x - y) >> 31;
}
static void vectormod3_product(small *z,int len,const small *x,const small c)
{
int i;
int minusmask = c;
int plusmask = -c;
__m256i minusvec, plusvec, zerovec;
minusmask >>= 31;
plusmask >>= 31;
minusvec = _mm256_set1_epi32(minusmask);
plusvec = _mm256_set1_epi32(plusmask);
zerovec = _mm256_set1_epi32(0);
while (len >= 32) {
__m256i xi = _mm256_loadu_si256((__m256i *) x);
xi = (xi & plusvec) | (_mm256_sub_epi8(zerovec,xi) & minusvec);
_mm256_storeu_si256((__m256i *) z,xi);
x += 32;
z += 32;
len -= 32;
}
for (i = 0;i < len;++i) z[i] = mod3_product(x[i],c);
}
static void vectormod3_minusproduct(small *z,int len,const small *x,const small *y,const small c)
{
int i;
int minusmask = c;
int plusmask = -c;
__m256i minusvec, plusvec, zerovec, twovec, fourvec;
minusmask >>= 31;
plusmask >>= 31;
minusvec = _mm256_set1_epi32(minusmask);
plusvec = _mm256_set1_epi32(plusmask);
zerovec = _mm256_set1_epi32(0);
twovec = _mm256_set1_epi32(0x02020202);
fourvec = _mm256_set1_epi32(0x04040404);
while (len >= 32) {
__m256i xi = _mm256_loadu_si256((__m256i *) x);
__m256i yi = _mm256_loadu_si256((__m256i *) y);
__m256i r;
yi = (yi & plusvec) | (_mm256_sub_epi8(zerovec,yi) & minusvec);
xi = _mm256_sub_epi8(xi,yi);
r = _mm256_add_epi8(xi,twovec);
r &= fourvec;
r = _mm256_srli_epi32(r,2);
xi = _mm256_sub_epi8(xi,r);
r = _mm256_add_epi8(r,r);
xi = _mm256_sub_epi8(xi,r);
r = _mm256_sub_epi8(twovec,xi);
r &= fourvec;
r = _mm256_srli_epi32(r,2);
xi = _mm256_add_epi8(xi,r);
r = _mm256_add_epi8(r,r);
xi = _mm256_add_epi8(xi,r);
_mm256_storeu_si256((__m256i *) z,xi);
x += 32;
y += 32;
z += 32;
len -= 32;
}
for (i = 0;i < len;++i) z[i] = mod3_minusproduct(x[i],y[i],c);
}
static void vectormod3_shift(small *z,int len)
{
int i;
while (len >= 33) {
__m256i zi = _mm256_loadu_si256((__m256i *) (z + len - 33));
_mm256_storeu_si256((__m256i *) (z + len - 32),zi);
len -= 32;
}
for (i = len - 1;i > 0;--i) z[i] = z[i - 1];
z[0] = 0;
}
/*
r = s^(-1) mod m, returning 0, if s is invertible mod m
or returning -1 if s is not invertible mod m
r,s are polys of degree <p
m is x^p-x-1
*/
int r3_recip(small *r,const small *s)
{
const int loops = 2*p + 1;
int loop;
small f[768];
small g[769];
small u[1536];
small v[1537];
small c;
int i;
int d = p;
int e = p;
int swapmask;
for (i = 2;i < p;++i) f[i] = 0;
f[0] = -1;
f[1] = -1;
f[p] = 1;
/* generalization: can initialize f to any polynomial m */
/* requirements: m has degree exactly p, nonzero constant coefficient */
for (i = 0;i < p;++i) g[i] = s[i];
g[p] = 0;
for (i = 0;i <= loops;++i) u[i] = 0;
v[0] = 1;
for (i = 1;i <= loops;++i) v[i] = 0;
loop = 0;
for (;;) {
/* e == -1 or d + e + loop <= 2*p */
/* f has degree p: i.e., f[p]!=0 */
/* f[i]==0 for i < p-d */
/* g has degree <=p (so it fits in p+1 coefficients) */
/* g[i]==0 for i < p-e */
/* u has degree <=loop (so it fits in loop+1 coefficients) */
/* u[i]==0 for i < p-d */
/* if invertible: u[i]==0 for i < loop-p (so can look at just p+1 coefficients) */
/* v has degree <=loop (so it fits in loop+1 coefficients) */
/* v[i]==0 for i < p-e */
/* v[i]==0 for i < loop-p (so can look at just p+1 coefficients) */
if (loop >= loops) break;
c = mod3_quotient(g[p],f[p]);
vectormod3_minusproduct(g,768,g,f,c);
vectormod3_shift(g,769);
#ifdef SIMPLER
vectormod3_minusproduct(v,1536,v,u,c);
vectormod3_shift(v,1537);
#else
if (loop < p) {
vectormod3_minusproduct(v,loop + 1,v,u,c);
vectormod3_shift(v,loop + 2);
} else {
vectormod3_minusproduct(v + loop - p,p + 1,v + loop - p,u + loop - p,c);
vectormod3_shift(v + loop - p,p + 2);
}
#endif
e -= 1;
++loop;
swapmask = smaller_mask(e,d) & mod3_nonzero_mask(g[p]);
swap(&e,&d,sizeof e,swapmask);
swap(f,g,(p + 1) * sizeof(small),swapmask);
#ifdef SIMPLER
swap(u,v,1536 * sizeof(small),swapmask);
#else
if (loop < p) {
swap(u,v,(loop + 1) * sizeof(small),swapmask);
} else {
swap(u + loop - p,v + loop - p,(p + 1) * sizeof(small),swapmask);
}
#endif
}
c = mod3_reciprocal(f[p]);
vectormod3_product(r,p,u + p,c);
for (i = p;i < 768;++i) r[i] = 0;
return smaller_mask(0,d);
}
#endif

View File

@ -0,0 +1,17 @@
#include "params.h"
#include "randombytes.h"
#include "crypto_uint32.h"
#include "small.h"
void small_random(small *g)
{
crypto_uint32 r[p];
int i;
randombytes((unsigned char *) r,sizeof r);
for (i = 0;i < p;++i)
g[i] = (small) (((r[i] & 1073741823) * 3) >> 30) - 1;
/* bias is miniscule */
for (i = p;i < 768;++i)
g[i] = 0;
}

View File

@ -0,0 +1,17 @@
#include "params.h"
#include "randombytes.h"
#include "int32_sort.h"
#include "small.h"
void small_random_weightw(small *f)
{
crypto_int32 r[p];
int i;
randombytes((unsigned char *) r,sizeof r);
for (i = 0;i < w;++i) r[i] &= -2;
for (i = w;i < p;++i) r[i] = (r[i] & -3) | 1;
int32_sort(r,p);
for (i = 0;i < p;++i) f[i] = ((small) (r[i] & 3)) - 1;
for (i = p;i < 768;++i) f[i] = 0;
}

View File

@ -0,0 +1,160 @@
#if __AVX2__
#include <immintrin.h>
#include "params.h"
#include "crypto_uint32.h"
#include "crypto_int64.h"
#include "rq.h"
#define v2295_16 _mm256_set1_epi16(2295)
#define v2295_16_128 _mm_set1_epi16(2295)
#define alpha_top _mm256_set1_epi32(0x43380000)
#define alpha _mm256_set1_pd(6755399441055744.0)
#define alpha_64 _mm256_set1_epi64(0x4338000000000000)
/* each reciprocal is rounded _up_ to nearest floating-point number */
#define recip54 0.0185185185185185209599811884118025773204863071441650390625
#define recip4591 0.000217817468961010681817447309782664888189174234867095947265625
#define recip6144 0.0001627604166666666847367028747584072334575466811656951904296875
#define recip331776 0.00000301408179012345704632478034235010255770248477347195148468017578125
#define recip37748736 0.000000026490953233506946282623583451172610825352649044361896812915802001953125
#define broadcast(r) _mm256_set1_pd(r)
#define floor(x) _mm256_floor_pd(x)
void rq_encode(unsigned char *c,const modq *f)
{
crypto_int32 f0, f1, f2, f3, f4;
int i;
for (i = 0;i < p/5;++i) {
f0 = *f++ + qshift;
f1 = *f++ + qshift;
f2 = *f++ + qshift;
f3 = *f++ + qshift;
f4 = *f++ + qshift;
/* now want f0 + 6144*f1 + ... as a 64-bit integer */
f1 *= 3;
f2 *= 9;
f3 *= 27;
f4 *= 81;
/* now want f0 + f1<<11 + f2<<22 + f3<<33 + f4<<44 */
f0 += f1 << 11;
*c++ = f0; f0 >>= 8;
*c++ = f0; f0 >>= 8;
f0 += f2 << 6;
*c++ = f0; f0 >>= 8;
*c++ = f0; f0 >>= 8;
f0 += f3 << 1;
*c++ = f0; f0 >>= 8;
f0 += f4 << 4;
*c++ = f0; f0 >>= 8;
*c++ = f0; f0 >>= 8;
*c++ = f0;
}
/* XXX: using p mod 5 = 1 */
f0 = *f++ + qshift;
*c++ = f0; f0 >>= 8;
*c++ = f0;
}
void rq_decode(modq *f,const unsigned char *c)
{
crypto_uint32 c0, c1;
int i;
for (i = 0;i < 152;i += 4) {
__m256i abcd, ac, bd, abcd0, abcd1;
__m256d x0, x1, f4, f3, f2, f1, f0;
__m128i if4, if3, if2, if1, if0;
__m128i x01, x23, x02, x13, xab, xcd;
/* f0 + f1*6144 + f2*6144^2 + f3*6144^3 + f4*6144^4 */
/* = c0 + c1*256 + ... + c6*256^6 + c7*256^7 */
/* with each f between 0 and 4590 */
/* could use _mm256_cvtepi32_pd instead; but beware uint32 */
abcd = _mm256_loadu_si256((__m256i *) c); /* a0 a1 b0 b1 c0 c1 d0 d1 */
c += 32;
ac = _mm256_unpacklo_epi32(abcd,alpha_top); /* a0 a1 c0 c1 */
bd = _mm256_unpackhi_epi32(abcd,alpha_top); /* b0 b1 d0 d1 */
abcd1 = _mm256_unpackhi_epi64(ac,bd); /* a1 b1 c1 d1 */
abcd0 = _mm256_unpacklo_epi64(ac,bd); /* a0 b0 c0 d0 */
x1 = *(__m256d *) &abcd1;
x0 = *(__m256d *) &abcd0;
x1 -= alpha;
x0 -= alpha;
/* x1 is [0,41] + [0,4590]*54 + f4*331776 */
f4 = broadcast(recip331776) * x1;
f4 = floor(f4);
x1 -= broadcast(331776.0) * f4;
/* x1 is [0,41] + f3*54 */
f3 = broadcast(recip54) * x1;
f3 = floor(f3);
x1 -= broadcast(54.0) * f3;
x0 += broadcast(4294967296.0) * x1;
/* x0 is [0,4590] + [0,4590]*6144 + f2*6144^2 */
f2 = broadcast(recip37748736) * x0;
f2 = floor(f2);
x0 -= broadcast(37748736.0) * f2;
/* x0 is [0,4590] + f1*6144 */
f1 = broadcast(recip6144) * x0;
f1 = floor(f1);
x0 -= broadcast(6144.0) * f1;
f0 = x0;
f4 -= broadcast(4591.0) * floor(broadcast(recip4591) * f4);
f3 -= broadcast(4591.0) * floor(broadcast(recip4591) * f3);
f2 -= broadcast(4591.0) * floor(broadcast(recip4591) * f2);
f1 -= broadcast(4591.0) * floor(broadcast(recip4591) * f1);
f0 -= broadcast(4591.0) * floor(broadcast(recip4591) * f0);
if4 = _mm256_cvtpd_epi32(f4); /* a4 0 b4 0 c4 0 d4 0 */
if3 = _mm256_cvtpd_epi32(f3); /* a3 0 b3 0 c3 0 d3 0 */
if2 = _mm256_cvtpd_epi32(f2); /* a2 0 b2 0 c2 0 d2 0 */
if1 = _mm256_cvtpd_epi32(f1); /* a1 0 b1 0 c1 0 d1 0 */
if0 = _mm256_cvtpd_epi32(f0); /* a0 0 b0 0 c0 0 d0 0 */
if4 = _mm_sub_epi16(if4,v2295_16_128);
f[4] = _mm_extract_epi32(if4,0);
f[9] = _mm_extract_epi32(if4,1);
f[14] = _mm_extract_epi32(if4,2);
f[19] = _mm_extract_epi32(if4,3);
x23 = _mm_packs_epi32(if2,if3); /* a2 b2 c2 d2 a3 b3 c3 d3 */
x01 = _mm_packs_epi32(if0,if1); /* a0 b0 c0 d0 a1 b1 c1 d1 */
x02 = _mm_unpacklo_epi16(x01,x23); /* a0 a2 b0 b2 c0 c2 d0 d2 */
x13 = _mm_unpackhi_epi16(x01,x23); /* a1 a3 b1 b3 c1 c3 d1 d3 */
xab = _mm_unpacklo_epi16(x02,x13); /* a0 a1 a2 a3 b0 b1 b2 b3 */
xcd = _mm_unpackhi_epi16(x02,x13); /* c0 c1 c2 c3 d0 d1 d2 d3 */
xab = _mm_sub_epi16(xab,v2295_16_128);
xcd = _mm_sub_epi16(xcd,v2295_16_128);
*(crypto_int64 *) (f + 0) = _mm_extract_epi64(xab,0);
*(crypto_int64 *) (f + 5) = _mm_extract_epi64(xab,1);
*(crypto_int64 *) (f + 10) = _mm_extract_epi64(xcd,0);
*(crypto_int64 *) (f + 15) = _mm_extract_epi64(xcd,1);
f += 20;
}
c0 = *c++;
c1 = *c++;
c0 += c1 << 8;
*f++ = modq_freeze(c0 + q - qshift);
*f++ = 0;
*f++ = 0;
*f++ = 0;
*f++ = 0;
*f++ = 0;
*f++ = 0;
*f++ = 0;
}
#endif

View File

@ -0,0 +1,31 @@
#ifndef rq_h
#define rq_h
#include "modq.h"
#include "small.h"
#define rq_encode crypto_kem_sntrup4591761_avx_rq_encode
extern void rq_encode(unsigned char *,const modq *);
#define rq_decode crypto_kem_sntrup4591761_avx_rq_decode
extern void rq_decode(modq *,const unsigned char *);
#define rq_roundencode crypto_kem_sntrup4591761_avx_rq_roundencode
extern void rq_roundencode(unsigned char *,const modq *);
#define rq_decoderounded crypto_kem_sntrup4591761_avx_rq_decoderounded
extern void rq_decoderounded(modq *,const unsigned char *);
#define rq_round3 crypto_kem_sntrup4591761_avx_rq_round3
extern void rq_round3(modq *,const modq *);
#define rq_mod3 crypto_kem_sntrup4591761_avx_rq_mod3
extern void rq_mod3(small *,const modq *);
#define rq_mult crypto_kem_sntrup4591761_avx_rq_mult
extern void rq_mult(modq *,const modq *,const small *);
#define rq_recip3 crypto_kem_sntrup4591761_avx_rq_recip3
int rq_recip3(modq *,const small *);
#endif

View File

@ -0,0 +1,51 @@
#if __AVX2__
#include <immintrin.h>
#include "mod3.h"
#include "rq.h"
#define v3 _mm256_set1_epi16(3)
#define v7 _mm256_set1_epi16(7)
#define v2296_16 _mm256_set1_epi16(2296)
#define v4591_16 _mm256_set1_epi16(4591)
#define v10923_16 _mm256_set1_epi16(10923)
static inline __m256i squeeze(__m256i x)
{
__m256i q = _mm256_mulhrs_epi16(x,v7);
q = _mm256_mullo_epi16(q,v4591_16);
return _mm256_sub_epi16(x,q);
}
static inline __m256i freeze(__m256i x)
{
__m256i mask, x2296, x4591;
x4591 = _mm256_add_epi16(x,v4591_16);
mask = _mm256_srai_epi16(x,15);
x = _mm256_blendv_epi8(x,x4591,mask);
x2296 = _mm256_sub_epi16(x,v2296_16);
mask = _mm256_srai_epi16(x2296,15);
x4591 = _mm256_sub_epi16(x,v4591_16);
x = _mm256_blendv_epi8(x4591,x,mask);
return x;
}
void rq_mod3(small *g,const modq *f)
{
int i;
for (i = 0;i < 768;i += 16) {
__m256i x = _mm256_loadu_si256((__m256i *) &f[i]);
__m256i q;
x = _mm256_mullo_epi16(x,v3);
x = squeeze(x);
x = freeze(x);
q = _mm256_mulhrs_epi16(x,v10923_16);
x = _mm256_sub_epi16(x,q);
q = _mm256_add_epi16(q,q);
x = _mm256_sub_epi16(x,q); /* g0 g1 ... g15 */
x = _mm256_packs_epi16(x,x); /* g0 ... g7 g0 ... g7 g8 ... g15 g8 ... g15 */
0[(long long *) &g[i]] = _mm_extract_epi64(_mm256_extracti128_si256(x,0),0);
1[(long long *) &g[i]] = _mm_extract_epi64(_mm256_extracti128_si256(x,1),0);
}
}
#endif

View File

@ -0,0 +1,217 @@
#if __AVX2__
#include <immintrin.h>
#include "params.h"
#include "swap.h"
#include "rq.h"
#define v7 _mm256_set1_epi16(7)
#define v1827_16 _mm256_set1_epi16(1827)
#define v4591_16 _mm256_set1_epi16(4591)
#define v29234_16 _mm256_set1_epi16(29234)
/* caller must ensure that x-y does not overflow */
static int smaller_mask(int x,int y)
{
return (x - y) >> 31;
}
static inline __m256i product(__m256i x,__m256i y)
{
__m256i lo, hi, r0, r1, t0, t1, t, s0, s1;
lo = _mm256_mullo_epi16(x,y);
hi = _mm256_mulhi_epi16(x,y);
r0 = _mm256_unpacklo_epi16(lo,hi);
r1 = _mm256_unpackhi_epi16(lo,hi);
t0 = _mm256_srai_epi32(r0,16);
t1 = _mm256_srai_epi32(r1,16);
t = _mm256_packs_epi32(t0,t1);
t = _mm256_mulhrs_epi16(t,v29234_16);
lo = _mm256_mullo_epi16(t,v4591_16);
hi = _mm256_mulhi_epi16(t,v4591_16);
s0 = _mm256_unpacklo_epi16(lo,hi);
s1 = _mm256_unpackhi_epi16(lo,hi);
s0 = _mm256_slli_epi32(s0,4);
s1 = _mm256_slli_epi32(s1,4);
r0 = _mm256_sub_epi32(r0,s0);
r1 = _mm256_sub_epi32(r1,s1);
t0 = _mm256_srai_epi32(r0,8);
t1 = _mm256_srai_epi32(r1,8);
t = _mm256_packs_epi32(t0,t1);
t = _mm256_mulhrs_epi16(t,v1827_16);
lo = _mm256_mullo_epi16(t,v4591_16);
hi = _mm256_mulhi_epi16(t,v4591_16);
s0 = _mm256_unpacklo_epi16(lo,hi);
s1 = _mm256_unpackhi_epi16(lo,hi);
r0 = _mm256_sub_epi32(r0,s0);
r1 = _mm256_sub_epi32(r1,s1);
x = _mm256_packs_epi32(r0,r1);
return x;
}
static inline __m256i minusproduct(__m256i x,__m256i y,__m256i z)
{
__m256i t;
x = _mm256_sub_epi16(x,product(y,z));
t = _mm256_mulhrs_epi16(x,v7);
t = _mm256_mullo_epi16(t,v4591_16);
x = _mm256_sub_epi16(x,t);
return x;
}
static void vectormodq_product(modq *z,int len,const modq *x,const modq c)
{
__m256i cvec = _mm256_set1_epi16(c);
while (len >= 16) {
__m256i xi = _mm256_loadu_si256((__m256i *) x);
xi = product(xi,cvec);
_mm256_storeu_si256((__m256i *) z,xi);
x += 16;
z += 16;
len -= 16;
}
while (len > 0) {
*z = modq_product(*x,c);
++x;
++z;
--len;
}
}
static void vectormodq_minusproduct(modq *z,int len,const modq *x,const modq *y,const modq c)
{
__m256i cvec = _mm256_set1_epi16(c);
while (len >= 16) {
__m256i xi = _mm256_loadu_si256((__m256i *) x);
__m256i yi = _mm256_loadu_si256((__m256i *) y);
xi = minusproduct(xi,yi,cvec);
_mm256_storeu_si256((__m256i *) z,xi);
x += 16;
y += 16;
z += 16;
len -= 16;
}
while (len > 0) {
*z = modq_minusproduct(*x,*y,c);
++x;
++y;
++z;
--len;
}
}
static void vectormodq_shift(modq *z,int len)
{
int i;
while (len >= 17) {
__m256i zi = _mm256_loadu_si256((__m256i *) (z + len - 17));
_mm256_storeu_si256((__m256i *) (z + len - 16),zi);
len -= 16;
}
for (i = len - 1;i > 0;--i) z[i] = z[i - 1];
z[0] = 0;
}
/*
r = (3s)^(-1) mod m, returning 0, if s is invertible mod m
or returning -1 if s is not invertible mod m
r,s are polys of degree <p
m is x^p-x-1
*/
int rq_recip3(modq *r,const small *s)
{
const int loops = 2*p + 1;
int loop;
modq f[768];
modq g[769];
modq u[1536];
modq v[1537];
modq c;
int i;
int d = p;
int e = p;
int swapmask;
for (i = 2;i < p;++i) f[i] = 0;
f[0] = -1;
f[1] = -1;
f[p] = 1;
/* generalization: can initialize f to any polynomial m */
/* requirements: m has degree exactly p, nonzero constant coefficient */
for (i = 0;i < p;++i) g[i] = 3 * s[i];
g[p] = 0;
for (i = 0;i <= loops;++i) u[i] = 0;
v[0] = 1;
for (i = 1;i <= loops;++i) v[i] = 0;
loop = 0;
for (;;) {
/* e == -1 or d + e + loop <= 2*p */
/* f has degree p: i.e., f[p]!=0 */
/* f[i]==0 for i < p-d */
/* g has degree <=p (so it fits in p+1 coefficients) */
/* g[i]==0 for i < p-e */
/* u has degree <=loop (so it fits in loop+1 coefficients) */
/* u[i]==0 for i < p-d */
/* if invertible: u[i]==0 for i < loop-p (so can look at just p+1 coefficients) */
/* v has degree <=loop (so it fits in loop+1 coefficients) */
/* v[i]==0 for i < p-e */
/* v[i]==0 for i < loop-p (so can look at just p+1 coefficients) */
if (loop >= loops) break;
c = modq_quotient(g[p],f[p]);
vectormodq_minusproduct(g,768,g,f,c);
vectormodq_shift(g,769);
#ifdef SIMPLER
vectormodq_minusproduct(v,1536,v,u,c);
vectormodq_shift(v,1537);
#else
if (loop < p) {
vectormodq_minusproduct(v,loop + 1,v,u,c);
vectormodq_shift(v,loop + 2);
} else {
vectormodq_minusproduct(v + loop - p,p + 1,v + loop - p,u + loop - p,c);
vectormodq_shift(v + loop - p,p + 2);
}
#endif
e -= 1;
++loop;
swapmask = smaller_mask(e,d) & modq_nonzero_mask(g[p]);
swap(&e,&d,sizeof e,swapmask);
swap(f,g,768 * sizeof(modq),swapmask);
#ifdef SIMPLER
swap(u,v,1536 * sizeof(modq),swapmask);
#else
if (loop < p) {
swap(u,v,(loop + 1) * sizeof(modq),swapmask);
} else {
swap(u + loop - p,v + loop - p,(p + 1) * sizeof(modq),swapmask);
}
#endif
}
c = modq_reciprocal(f[p]);
vectormodq_product(r,p,u + p,c);
for (i = 0;i < p;++i) r[i] = modq_freeze(r[i]);
for (i = p;i < 768;++i) r[i] = 0;
return smaller_mask(0,d);
}
#endif

View File

@ -0,0 +1,22 @@
#if __AVX2__
#include <immintrin.h>
#include "params.h"
#include "rq.h"
#define v3_16 _mm256_set1_epi16(3)
#define v10923_16 _mm256_set1_epi16(10923)
void rq_round3(modq *h,const modq *f)
{
int i;
for (i = 0;i < 768;i += 16) {
__m256i x = _mm256_loadu_si256((__m256i *) &f[i]);
__m256i x2;
x = _mm256_mulhrs_epi16(x,v10923_16);
x2 = _mm256_add_epi16(x,x);
x = _mm256_add_epi16(x,x2);
_mm256_storeu_si256((__m256i *) &h[i],x);
}
}
#endif

View File

@ -0,0 +1,262 @@
#if __AVX2__
#include <immintrin.h>
#include "params.h"
#include "crypto_uint32.h"
#include "rq.h"
#define alpha_top _mm256_set1_epi32(0x43380000)
#define alpha _mm256_set1_pd(6755399441055744.0)
#define v10923_16 _mm256_set1_epi16(10923)
#define floor(x) _mm256_floor_pd(x)
void rq_roundencode(unsigned char *c,const modq *f)
{
int i;
__m256i h[50];
for (i = 0;i < 208;i += 16) {
__m256i a0, a1, a2, b0, b1, b2, c0, c1, c2, d0, d1, d2;
__m256i e0, e1, f0, f1, g0, g1;
a0 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *) &f[0]));
a1 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *) &f[8]));
a2 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *) &f[16]));
a0 = _mm256_inserti128_si256(a0,_mm_loadu_si128((__m128i *) &f[24]),1);
a1 = _mm256_inserti128_si256(a1,_mm_loadu_si128((__m128i *) &f[32]),1);
a2 = _mm256_inserti128_si256(a2,_mm_loadu_si128((__m128i *) &f[40]),1);
f += 48;
a0 = _mm256_mulhrs_epi16(a0,v10923_16);
a1 = _mm256_mulhrs_epi16(a1,v10923_16);
a2 = _mm256_mulhrs_epi16(a2,v10923_16);
/* a0: a0 a1 a2 b0 b1 b2 c0 c1 and similar second half */
/* a1: c2 d0 d1 d2 e0 e1 e2 f0 */
/* a2: f1 f2 g0 g1 g2 h0 h1 h2 */
b1 = _mm256_blend_epi16(a2,a0,0xf0);
b1 = _mm256_shuffle_epi32(b1,0x4e);
b0 = _mm256_blend_epi16(a0,a1,0xf0);
b2 = _mm256_blend_epi16(a1,a2,0xf0);
/* XXX: use shufps instead? */
/* b0: a0 a1 a2 b0 e0 e1 e2 f0 */
/* b1: b1 b2 c0 c1 f1 f2 g0 g1 */
/* b2: c2 d0 d1 d2 g2 h0 h1 h2 */
c1 = _mm256_blend_epi16(b2,b0,0xcc);
c1 = _mm256_shuffle_epi32(c1,0xb1);
c0 = _mm256_blend_epi16(b0,b1,0xcc);
c2 = _mm256_blend_epi16(b1,b2,0xcc);
/* c0: a0 a1 c0 c1 e0 e1 g0 g1 */
/* c1: a2 b0 c2 d0 e2 f0 g2 h0 */
/* c2: b1 b2 d1 d2 f1 f2 h1 h2 */
d1 = _mm256_blend_epi16(c2,c0,0xaa);
d1 = _mm256_shufflelo_epi16(d1,0xb1);
d1 = _mm256_shufflehi_epi16(d1,0xb1);
d0 = _mm256_blend_epi16(c0,c1,0xaa);
d2 = _mm256_blend_epi16(c1,c2,0xaa);
/* d0: a0 b0 c0 d0 e0 f0 g0 h0 */
/* d1: a1 b1 c1 d1 e1 f1 g1 h1 */
/* d2: a2 b2 c2 d2 e2 f2 g2 h2 */
d0 = _mm256_add_epi16(d0,_mm256_set1_epi16(765));
d1 = _mm256_add_epi16(d1,_mm256_set1_epi16(765));
d2 = _mm256_add_epi16(d2,_mm256_set1_epi16(765));
/* want bytes of d0 + 1536*d1 + 1536*1536*d2 */
e0 = d0 & _mm256_set1_epi16(0xff);
d0 = _mm256_srli_epi16(d0,8);
/* want e0, d0 + 6*d1 + 6*1536*d2 */
d1 = _mm256_mullo_epi16(d1,_mm256_set1_epi16(6));
d0 = _mm256_add_epi16(d0,d1);
/* want e0, d0 + 6*1536*d2 */
e1 = _mm256_slli_epi16(d0,8);
e0 = _mm256_add_epi16(e0,e1);
d0 = _mm256_srli_epi16(d0,8);
/* want e0, d0 + 36*d2 */
d2 = _mm256_mullo_epi16(d2,_mm256_set1_epi16(36));
e1 = _mm256_add_epi16(d0,d2);
/* want e0, e1 */
/* e0: out0 out1 out4 out5 out8 out9 ... */
/* e1: out2 out3 out6 out7 out10 out11 ... */
f0 = _mm256_unpacklo_epi16(e0,e1);
f1 = _mm256_unpackhi_epi16(e0,e1);
g0 = _mm256_permute2x128_si256(f0,f1,0x20);
g1 = _mm256_permute2x128_si256(f0,f1,0x31);
_mm256_storeu_si256((__m256i *) c,g0);
_mm256_storeu_si256((__m256i *) (c + 32),g1);
c += 64;
}
for (i = 0;i < 9;++i) {
__m256i x = _mm256_loadu_si256((__m256i *) &f[16 * i]);
_mm256_storeu_si256(&h[i],_mm256_mulhrs_epi16(x,v10923_16));
}
f = (const modq *) h;
for (i = 208;i < 253;++i) {
crypto_int32 f0, f1, f2;
f0 = *f++;
f1 = *f++;
f2 = *f++;
f0 += 1806037245;
f1 *= 3;
f2 *= 9;
f0 += f1 << 9;
f0 += f2 << 18;
*(crypto_int32 *) c = f0;
c += 4;
}
{
crypto_int32 f0, f1;
f0 = *f++;
f1 = *f++;
f0 += 1175805;
f1 *= 3;
f0 += f1 << 9;
*c++ = f0; f0 >>= 8;
*c++ = f0; f0 >>= 8;
*c++ = f0;
}
}
void rq_decoderounded(modq *f,const unsigned char *c)
{
crypto_uint32 c0, c1, c2, c3;
crypto_uint32 f0, f1, f2;
int i;
for (i = 0;i < 248;i += 8) {
__m256i abcdefgh, todo[2];
__m256d x, f2, f1, f0;
__m128i if2, if1, if0;
int j;
abcdefgh = _mm256_loadu_si256((__m256i *) c);
c += 32;
todo[0] = _mm256_unpacklo_epi32(abcdefgh,alpha_top);
todo[1] = _mm256_unpackhi_epi32(abcdefgh,alpha_top);
for (j = 0;j < 2;++j) {
x = *(__m256d *) &todo[j];
x -= alpha;
/* x is f0 + f1*1536 + f2*1536^2 */
/* with each f between 0 and 1530 */
f2 = x * _mm256_set1_pd(0.00000042385525173611114052197733521876177320564238470979034900665283203125);
f2 = floor(f2);
x -= f2 * _mm256_set1_pd(2359296.0);
f1 = x * _mm256_set1_pd(0.00065104166666666673894681149903362893383018672466278076171875);
f1 = floor(f1);
x -= f1 * _mm256_set1_pd(1536.0);
f0 = x;
f2 -= _mm256_set1_pd(1531.0) * floor(f2 * _mm256_set1_pd(0.0006531678641410842804659875326933615724556148052215576171875));
f1 -= _mm256_set1_pd(1531.0) * floor(f1 * _mm256_set1_pd(0.0006531678641410842804659875326933615724556148052215576171875));
f0 -= _mm256_set1_pd(1531.0) * floor(f0 * _mm256_set1_pd(0.0006531678641410842804659875326933615724556148052215576171875));
f2 *= _mm256_set1_pd(3.0); f2 -= _mm256_set1_pd(2295.0);
f1 *= _mm256_set1_pd(3.0); f1 -= _mm256_set1_pd(2295.0);
f0 *= _mm256_set1_pd(3.0); f0 -= _mm256_set1_pd(2295.0);
if2 = _mm256_cvtpd_epi32(f2); /* a2 b2 e2 f2 */
if1 = _mm256_cvtpd_epi32(f1); /* a1 b1 e1 f1 */
if0 = _mm256_cvtpd_epi32(f0); /* a0 b0 e0 f0 */
f[6*j + 0] = _mm_extract_epi32(if0,0);
f[6*j + 1] = _mm_extract_epi32(if1,0);
f[6*j + 2] = _mm_extract_epi32(if2,0);
f[6*j + 3] = _mm_extract_epi32(if0,1);
f[6*j + 4] = _mm_extract_epi32(if1,1);
f[6*j + 5] = _mm_extract_epi32(if2,1);
f[6*j + 12] = _mm_extract_epi32(if0,2);
f[6*j + 13] = _mm_extract_epi32(if1,2);
f[6*j + 14] = _mm_extract_epi32(if2,2);
f[6*j + 15] = _mm_extract_epi32(if0,3);
f[6*j + 16] = _mm_extract_epi32(if1,3);
f[6*j + 17] = _mm_extract_epi32(if2,3);
}
f += 24;
}
for (i = 248;i < 253;++i) {
c0 = *c++;
c1 = *c++;
c2 = *c++;
c3 = *c++;
/* f0 + f1*1536 + f2*1536^2 */
/* = c0 + c1*256 + c2*256^2 + c3*256^3 */
/* with each f between 0 and 1530 */
/* f2 = (64/9)c3 + (1/36)c2 + (1/9216)c1 + (1/2359296)c0 - [0,0.99675] */
/* claim: 2^21 f2 < x < 2^21(f2+1) */
/* where x = 14913081*c3 + 58254*c2 + 228*(c1+2) */
/* proof: x - 2^21 f2 = 456 - (8/9)c0 + (4/9)c1 - (2/9)c2 + (1/9)c3 + 2^21 [0,0.99675] */
/* at least 456 - (8/9)255 - (2/9)255 > 0 */
/* at most 456 + (4/9)255 + (1/9)255 + 2^21 0.99675 < 2^21 */
f2 = (14913081*c3 + 58254*c2 + 228*(c1+2)) >> 21;
c2 += c3 << 8;
c2 -= (f2 * 9) << 2;
/* f0 + f1*1536 */
/* = c0 + c1*256 + c2*256^2 */
/* c2 <= 35 = floor((1530+1530*1536)/256^2) */
/* f1 = (128/3)c2 + (1/6)c1 + (1/1536)c0 - (1/1536)f0 */
/* claim: 2^21 f1 < x < 2^21(f1+1) */
/* where x = 89478485*c2 + 349525*c1 + 1365*(c0+1) */
/* proof: x - 2^21 f1 = 1365 - (1/3)c2 - (1/3)c1 - (1/3)c0 + (4096/3)f0 */
/* at least 1365 - (1/3)35 - (1/3)255 - (1/3)255 > 0 */
/* at most 1365 + (4096/3)1530 < 2^21 */
f1 = (89478485*c2 + 349525*c1 + 1365*(c0+1)) >> 21;
c1 += c2 << 8;
c1 -= (f1 * 3) << 1;
c0 += c1 << 8;
f0 = c0;
*f++ = modq_freeze(f0 * 3 + q - qshift);
*f++ = modq_freeze(f1 * 3 + q - qshift);
*f++ = modq_freeze(f2 * 3 + q - qshift);
}
c0 = *c++;
c1 = *c++;
c2 = *c++;
f1 = (89478485*c2 + 349525*c1 + 1365*(c0+1)) >> 21;
c1 += c2 << 8;
c1 -= (f1 * 3) << 1;
c0 += c1 << 8;
f0 = c0;
*f++ = modq_freeze(f0 * 3 + q - qshift);
*f++ = modq_freeze(f1 * 3 + q - qshift);
*f++ = 0;
*f++ = 0;
*f++ = 0;
*f++ = 0;
*f++ = 0;
*f++ = 0;
*f++ = 0;
}
#endif

View File

@ -0,0 +1,45 @@
#include <immintrin.h>
#include "params.h"
#include "small.h"
/* XXX: these functions rely on p mod 4 = 1 */
/* all coefficients in -1, 0, 1 */
void small_encode(unsigned char *c,const small *f)
{
small c0;
int i;
for (i = 0;i < p/4;++i) {
c0 = *f++ + 1;
c0 += (*f++ + 1) << 2;
c0 += (*f++ + 1) << 4;
c0 += (*f++ + 1) << 6;
*c++ = c0;
}
c0 = *f++ + 1;
*c++ = c0;
}
void small_decode(small *f,const unsigned char *c)
{
unsigned char c0;
int i;
for (i = 0;i < p/4;++i) {
c0 = *c++;
*f++ = ((small) (c0 & 3)) - 1; c0 >>= 2;
*f++ = ((small) (c0 & 3)) - 1; c0 >>= 2;
*f++ = ((small) (c0 & 3)) - 1; c0 >>= 2;
*f++ = ((small) (c0 & 3)) - 1;
}
c0 = *c++;
*f++ = ((small) (c0 & 3)) - 1;
*f++ = 0;
*f++ = 0;
*f++ = 0;
*f++ = 0;
*f++ = 0;
*f++ = 0;
*f++ = 0;
}

View File

@ -0,0 +1,20 @@
#ifndef small_h
#define small_h
#include "crypto_int8.h"
typedef crypto_int8 small;
#define small_encode crypto_kem_sntrup4591761_avx_small_encode
extern void small_encode(unsigned char *,const small *);
#define small_decode crypto_kem_sntrup4591761_avx_small_decode
extern void small_decode(small *,const unsigned char *);
#define small_random crypto_kem_sntrup4591761_avx_small_random
extern void small_random(small *);
#define small_random_weightw crypto_kem_sntrup4591761_avx_small_random_weightw
extern void small_random_weightw(small *);
#endif

View File

@ -0,0 +1,34 @@
#if __AVX2__
#include <immintrin.h>
#include "swap.h"
void swap(void *x,void *y,int bytes,int mask)
{
char c = mask;
__m256i maskvec = _mm256_set1_epi32(mask);
while (bytes >= 32) {
__m256i xi = _mm256_loadu_si256(x);
__m256i yi = _mm256_loadu_si256(y);
__m256i xinew = _mm256_blendv_epi8(xi,yi,maskvec);
__m256i yinew = _mm256_blendv_epi8(yi,xi,maskvec);
_mm256_storeu_si256(x,xinew);
_mm256_storeu_si256(y,yinew);
x = 32 + (char *) x;
y = 32 + (char *) y;
bytes -= 32;
}
while (bytes > 0) {
char xi = *(char *) x;
char yi = *(char *) y;
char t = c & (xi ^ yi);
xi ^= t;
yi ^= t;
*(char *) x = xi;
*(char *) y = yi;
++x;
++y;
--bytes;
}
}
#endif

View File

@ -0,0 +1,7 @@
#ifndef swap_h
#define swap_h
#define swap crypto_kem_sntrup4591761_avx_swap
extern void swap(void *,void *,int,int);
#endif

View File

@ -0,0 +1,31 @@
#if __AVX2__
#include <immintrin.h>
#include "params.h"
#include "r3.h"
#include "crypto_uint16.h"
#include "crypto_int32.h"
int r3_weightw_mask(const small *r)
{
int weight;
int i;
__m256i tally = _mm256_set1_epi32(0);
for (i = 0;i < 768;i += 16) {
__m256i x = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i *) &r[i]));
x &= _mm256_set1_epi32(0x00010001);
tally = _mm256_add_epi16(tally,x);
}
tally = _mm256_hadd_epi16(tally,tally);
tally = _mm256_hadd_epi16(tally,tally);
tally = _mm256_hadd_epi16(tally,tally);
weight = _mm_extract_epi16(_mm256_extracti128_si256(tally,0),0) + _mm_extract_epi16(_mm256_extracti128_si256(tally,1),0);
weight -= w;
return (-(crypto_int32) (crypto_uint16) weight) >> 30;
}
#endif

View File

@ -0,0 +1,32 @@
crypto/libntrup/src/ref/randomsmall.c
crypto/libntrup/src/ref/swap.c
crypto/libntrup/src/ref/rq_round3.c
crypto/libntrup/src/ref/rq_recip3.c
crypto/libntrup/src/ref/small.c
crypto/libntrup/src/ref/rq_mult.c
crypto/libntrup/src/ref/randomweightw.c
crypto/libntrup/src/ref/random32.c
crypto/libntrup/src/ref/dec.c
crypto/libntrup/src/ref/r3_mult.c
crypto/libntrup/src/ref/r3_recip.c
crypto/libntrup/src/ref/keypair.c
crypto/libntrup/src/ref/rq_rounded.c
crypto/libntrup/src/ref/enc.c
crypto/libntrup/src/ref/int32_sort.c
crypto/libntrup/src/ref/rq.c
crypto/libntrup/src/avx/randomsmall.c
crypto/libntrup/src/avx/weight.c
crypto/libntrup/src/avx/swap.c
crypto/libntrup/src/avx/rq_round3.c
crypto/libntrup/src/avx/rq_recip3.c
crypto/libntrup/src/avx/small.c
crypto/libntrup/src/avx/randomweightw.c
crypto/libntrup/src/avx/dec.c
crypto/libntrup/src/avx/r3_recip.c
crypto/libntrup/src/avx/keypair.c
crypto/libntrup/src/avx/rq_rounded.c
crypto/libntrup/src/avx/mult.c
crypto/libntrup/src/avx/enc.c
crypto/libntrup/src/avx/int32_sort.c
crypto/libntrup/src/avx/rq.c
crypto/libntrup/src/avx/rq_mod3.c

View File

@ -0,0 +1,80 @@
#include <libntrup/ntru.h>
#include <stdbool.h>
#if __AVX2__
#include <cpuid.h>
#include <array>
std::array< int, 4 >
CPUID(int funcno)
{
std::array< int, 4 > cpuinfo;
__cpuid(funcno, cpuinfo[0], cpuinfo[1], cpuinfo[2], cpuinfo[3]);
return cpuinfo;
}
bool
supports_avx2()
{
return CPUID(0).at(0) >= 7 && CPUID(7).at(1) & (1 << 5);
}
#else
bool
supports_avx2()
{
printf("AVX2 disabled on compile time\n");
return false;
}
#endif
int (*__crypto_kem_enc)(unsigned char *cstr, unsigned char *k,
const unsigned char *pk);
int (*__crypto_kem_dec)(unsigned char *k, const unsigned char *cstr,
const unsigned char *sk);
int (*__crypto_kem_keypair)(unsigned char *pk, unsigned char *sk);
extern "C"
{
void
ntru_init(int force_no_avx2)
{
if(supports_avx2() && !force_no_avx2)
{
__crypto_kem_dec = &crypto_kem_dec_avx2;
__crypto_kem_enc = &crypto_kem_enc_avx2;
__crypto_kem_dec = &crypto_kem_dec_avx2;
__crypto_kem_keypair = &crypto_kem_keypair_avx2;
}
else
{
__crypto_kem_dec = &crypto_kem_dec_ref;
__crypto_kem_enc = &crypto_kem_enc_ref;
__crypto_kem_dec = &crypto_kem_dec_ref;
__crypto_kem_keypair = &crypto_kem_keypair_ref;
}
}
int
crypto_kem_enc(unsigned char *cstr, unsigned char *k, const unsigned char *pk)
{
return __crypto_kem_enc(cstr, k, pk);
}
int
crypto_kem_dec(unsigned char *k, const unsigned char *cstr,
const unsigned char *sk)
{
return __crypto_kem_dec(k, cstr, sk);
}
int
crypto_kem_keypair(unsigned char *pk, unsigned char *sk)
{
return __crypto_kem_keypair(pk, sk);
}
}

View File

@ -0,0 +1,32 @@
This is a reference implementation of Streamlined NTRU Prime 4591^761.
This implementation is designed primarily for clarity, subject to the
following constraints:
* The implementation is written in C. The Sage implementation in the
NTRU Prime paper is considerably more concise (and compatible).
* The implementation avoids data-dependent branches and array
indices. For example, conditional swaps are computed by arithmetic
rather than by branches.
* The implementation avoids other C operations that often take
variable time. For example, divisions by 3 are computed via
multiplications and shifts.
This implementation does _not_ sacrifice clarity for speed.
This implementation has not yet been reviewed for correctness or for
constant-time behavior. It does pass various tests and has no known
bugs, but there are at least some platforms where multiplications take
variable time, and fixing this requires platform-specific effort; see
https://www.bearssl.org/ctmul.html and http://repository.tue.nl/800603.
This implementation allows "benign malleability" of ciphertexts, as
defined in http://www.shoup.net/papers/iso-2_1.pdf. Specifically, each
32-bit ciphertext word encodes three integers between 0 and 1530; if
larger integers appear then they are silently reduced modulo 1531.
Similar comments apply to public keys.
There is a separate "avx" implementation where similar comments apply,
except that "avx" _does_ sacrifice clarity for speed on CPUs with AVX2
instructions.

View File

@ -0,0 +1,4 @@
#define CRYPTO_SECRETKEYBYTES 1600
#define CRYPTO_PUBLICKEYBYTES 1218
#define CRYPTO_CIPHERTEXTBYTES 1047
#define CRYPTO_BYTES 32

View File

@ -0,0 +1,71 @@
#ifdef KAT
#include <stdio.h>
#endif
#include "params.h"
#include "small.h"
#include "mod3.h"
#include "rq.h"
#include "r3.h"
#include "crypto_hash_sha512.h"
#include "crypto_verify_32.h"
#include "crypto_kem.h"
int crypto_kem_dec_ref(
unsigned char *k,
const unsigned char *cstr,
const unsigned char *sk
)
{
small f[p];
modq h[p];
small grecip[p];
modq c[p];
modq t[p];
small t3[p];
small r[p];
modq hr[p];
unsigned char rstr[small_encode_len];
unsigned char hash[64];
int i;
int result = 0;
int weight;
small_decode(f,sk);
small_decode(grecip,sk + small_encode_len);
rq_decode(h,sk + 2 * small_encode_len);
rq_decoderounded(c,cstr + 32);
rq_mult(t,c,f);
for (i = 0;i < p;++i) t3[i] = mod3_freeze(modq_freeze(3*t[i]));
r3_mult(r,t3,grecip);
#ifdef KAT
{
int j;
printf("decrypt r:");
for (j = 0;j < p;++j)
if (r[j] == 1) printf(" +%d",j);
else if (r[j] == -1) printf(" -%d",j);
printf("\n");
}
#endif
weight = 0;
for (i = 0;i < p;++i) weight += (1 & r[i]);
weight -= w;
result |= modq_nonzero_mask(weight); /* XXX: puts limit on p */
rq_mult(hr,h,r);
rq_round3(hr,hr);
for (i = 0;i < p;++i) result |= modq_nonzero_mask(hr[i] - c[i]);
small_encode(rstr,r);
crypto_hash_sha512(hash,rstr,sizeof rstr);
result |= crypto_verify_32(hash,cstr);
for (i = 0;i < 32;++i) k[i] = (hash[32 + i] & ~result);
return result;
}

View File

@ -0,0 +1,49 @@
#ifdef KAT
#include <stdio.h>
#endif
#include <string.h>
#include "params.h"
#include "small.h"
#include "rq.h"
#include "crypto_hash_sha512.h"
#include "crypto_kem.h"
int crypto_kem_enc_ref(
unsigned char *cstr,
unsigned char *k,
const unsigned char *pk
)
{
small r[p];
modq h[p];
modq c[p];
unsigned char rstr[small_encode_len];
unsigned char hash[64];
small_random_weightw(r);
#ifdef KAT
{
int i;
printf("encrypt r:");
for (i = 0;i < p;++i)
if (r[i] == 1) printf(" +%d",i);
else if (r[i] == -1) printf(" -%d",i);
printf("\n");
}
#endif
small_encode(rstr,r);
crypto_hash_sha512(hash,rstr,sizeof rstr);
rq_decode(h,pk);
rq_mult(c,h,r);
rq_round3(c,c);
memcpy(k,hash + 32,32);
memcpy(cstr,hash,32);
rq_encoderounded(cstr + 32,c);
return 0;
}

View File

@ -0,0 +1,5 @@
Alphabetical order:
Daniel J. Bernstein
Chitchanok Chuengsatiansup
Tanja Lange
Christine van Vredendaal

View File

@ -0,0 +1,35 @@
#include "int32_sort.h"
#include "crypto_uint32.h"
static void minmax(crypto_int32 *x,crypto_int32 *y)
{
crypto_uint32 xi = *x;
crypto_uint32 yi = *y;
crypto_uint32 xy = xi ^ yi;
crypto_uint32 c = yi - xi;
c ^= xy & (c ^ yi);
c >>= 31;
c = -c;
c &= xy;
*x = xi ^ c;
*y = yi ^ c;
}
void int32_sort(crypto_int32 *x,int n)
{
int top,p,q,i;
if (n < 2) return;
top = 1;
while (top < n - top) top += top;
for (p = top;p > 0;p >>= 1) {
for (i = 0;i < n - p;++i)
if (!(i & p))
minmax(x + i,x + i + p);
for (q = top;q > p;q >>= 1)
for (i = 0;i < n - q;++i)
if (!(i & p))
minmax(x + i + p,x + i + q);
}
}

View File

@ -0,0 +1,9 @@
#ifndef int32_sort_h
#define int32_sort_h
#include "crypto_int32.h"
#define int32_sort crypto_kem_sntrup4591761_ref_int32_sort
extern void int32_sort(crypto_int32 *,int);
#endif

View File

@ -0,0 +1,39 @@
#include <string.h>
#include "modq.h"
#include "params.h"
#include "r3.h"
#include "small.h"
#include "rq.h"
#include "crypto_kem.h"
#if crypto_kem_PUBLICKEYBYTES != rq_encode_len
#error "crypto_kem_PUBLICKEYBYTES must match rq_encode_len"
#endif
#if crypto_kem_SECRETKEYBYTES != rq_encode_len + 2 * small_encode_len
#error "crypto_kem_SECRETKEYBYTES must match rq_encode_len + 2 * small_encode_len"
#endif
int crypto_kem_keypair_ref(unsigned char *pk,unsigned char *sk)
{
small g[p];
small grecip[p];
small f[p];
modq f3recip[p];
modq h[p];
do
small_random(g);
while (r3_recip(grecip,g) != 0);
small_random_weightw(f);
rq_recip3(f3recip,f);
rq_mult(h,f3recip,g);
rq_encode(pk,h);
small_encode(sk,f);
small_encode(sk + small_encode_len,grecip);
memcpy(sk + 2 * small_encode_len,pk,rq_encode_len);
return 0;
}

View File

@ -0,0 +1,60 @@
#ifndef mod3_h
#define mod3_h
#include "small.h"
#include "crypto_int32.h"
/* -1 if x is nonzero, 0 otherwise */
static inline int mod3_nonzero_mask(small x)
{
return -x*x;
}
/* input between -100000 and 100000 */
/* output between -1 and 1 */
static inline small mod3_freeze(crypto_int32 a)
{
a -= 3 * ((10923 * a) >> 15);
a -= 3 * ((89478485 * a + 134217728) >> 28);
return a;
}
static inline small mod3_minusproduct(small a,small b,small c)
{
crypto_int32 A = a;
crypto_int32 B = b;
crypto_int32 C = c;
return mod3_freeze(A - B * C);
}
static inline small mod3_plusproduct(small a,small b,small c)
{
crypto_int32 A = a;
crypto_int32 B = b;
crypto_int32 C = c;
return mod3_freeze(A + B * C);
}
static inline small mod3_product(small a,small b)
{
return a * b;
}
static inline small mod3_sum(small a,small b)
{
crypto_int32 A = a;
crypto_int32 B = b;
return mod3_freeze(A + B);
}
static inline small mod3_reciprocal(small a1)
{
return a1;
}
static inline small mod3_quotient(small num,small den)
{
return mod3_product(num,mod3_reciprocal(den));
}
#endif

View File

@ -0,0 +1,92 @@
#ifndef modq_h
#define modq_h
#include "crypto_int16.h"
#include "crypto_int32.h"
#include "crypto_uint16.h"
#include "crypto_uint32.h"
typedef crypto_int16 modq;
/* -1 if x is nonzero, 0 otherwise */
static inline int modq_nonzero_mask(modq x)
{
crypto_int32 r = (crypto_uint16) x;
r = -r;
r >>= 30;
return r;
}
/* input between -9000000 and 9000000 */
/* output between -2295 and 2295 */
static inline modq modq_freeze(crypto_int32 a)
{
a -= 4591 * ((228 * a) >> 20);
a -= 4591 * ((58470 * a + 134217728) >> 28);
return a;
}
static inline modq modq_minusproduct(modq a,modq b,modq c)
{
crypto_int32 A = a;
crypto_int32 B = b;
crypto_int32 C = c;
return modq_freeze(A - B * C);
}
static inline modq modq_plusproduct(modq a,modq b,modq c)
{
crypto_int32 A = a;
crypto_int32 B = b;
crypto_int32 C = c;
return modq_freeze(A + B * C);
}
static inline modq modq_product(modq a,modq b)
{
crypto_int32 A = a;
crypto_int32 B = b;
return modq_freeze(A * B);
}
static inline modq modq_square(modq a)
{
crypto_int32 A = a;
return modq_freeze(A * A);
}
static inline modq modq_sum(modq a,modq b)
{
crypto_int32 A = a;
crypto_int32 B = b;
return modq_freeze(A + B);
}
static inline modq modq_reciprocal(modq a1)
{
modq a2 = modq_square(a1);
modq a3 = modq_product(a2,a1);
modq a4 = modq_square(a2);
modq a8 = modq_square(a4);
modq a16 = modq_square(a8);
modq a32 = modq_square(a16);
modq a35 = modq_product(a32,a3);
modq a70 = modq_square(a35);
modq a140 = modq_square(a70);
modq a143 = modq_product(a140,a3);
modq a286 = modq_square(a143);
modq a572 = modq_square(a286);
modq a1144 = modq_square(a572);
modq a1147 = modq_product(a1144,a3);
modq a2294 = modq_square(a1147);
modq a4588 = modq_square(a2294);
modq a4589 = modq_product(a4588,a1);
return a4589;
}
static inline modq modq_quotient(modq num,modq den)
{
return modq_product(num,modq_reciprocal(den));
}
#endif

View File

@ -0,0 +1,14 @@
#ifndef params_h
#define params_h
#define q 4591
/* XXX: also built into modq in various ways */
#define qshift 2295
#define p 761
#define w 286
#define rq_encode_len 1218
#define small_encode_len 191
#endif

View File

@ -0,0 +1,12 @@
#ifndef r3_h
#define r3_h
#include "small.h"
#define r3_mult crypto_kem_sntrup4591761_ref_r3_mult
extern void r3_mult(small *,const small *,const small *);
#define r3_recip crypto_kem_sntrup4591761_ref_r3_recip
extern int r3_recip(small *,const small *);
#endif

View File

@ -0,0 +1,31 @@
#include "params.h"
#include "mod3.h"
#include "r3.h"
void r3_mult(small *h,const small *f,const small *g)
{
small fg[p + p - 1];
small result;
int i, j;
for (i = 0;i < p;++i) {
result = 0;
for (j = 0;j <= i;++j)
result = mod3_plusproduct(result,f[j],g[i - j]);
fg[i] = result;
}
for (i = p;i < p + p - 1;++i) {
result = 0;
for (j = i - p + 1;j < p;++j)
result = mod3_plusproduct(result,f[j],g[i - j]);
fg[i] = result;
}
for (i = p + p - 2;i >= p;--i) {
fg[i - p] = mod3_sum(fg[i - p],fg[i]);
fg[i - p + 1] = mod3_sum(fg[i - p + 1],fg[i]);
}
for (i = 0;i < p;++i)
h[i] = fg[i];
}

View File

@ -0,0 +1,126 @@
#include "params.h"
#include "mod3.h"
#include "swap.h"
#include "r3.h"
/* caller must ensure that x-y does not overflow */
static int smaller_mask(int x,int y)
{
return (x - y) >> 31;
}
static void vectormod3_product(small *z,int len,const small *x,const small c)
{
int i;
for (i = 0;i < len;++i) z[i] = mod3_product(x[i],c);
}
static void vectormod3_minusproduct(small *z,int len,const small *x,const small *y,const small c)
{
int i;
for (i = 0;i < len;++i) z[i] = mod3_minusproduct(x[i],y[i],c);
}
static void vectormod3_shift(small *z,int len)
{
int i;
for (i = len - 1;i > 0;--i) z[i] = z[i - 1];
z[0] = 0;
}
/*
r = s^(-1) mod m, returning 0, if s is invertible mod m
or returning -1 if s is not invertible mod m
r,s are polys of degree <p
m is x^p-x-1
*/
int r3_recip(small *r,const small *s)
{
const int loops = 2*p + 1;
int loop;
small f[p + 1];
small g[p + 1];
small u[loops + 1];
small v[loops + 1];
small c;
int i;
int d = p;
int e = p;
int swapmask;
for (i = 2;i < p;++i) f[i] = 0;
f[0] = -1;
f[1] = -1;
f[p] = 1;
/* generalization: can initialize f to any polynomial m */
/* requirements: m has degree exactly p, nonzero constant coefficient */
for (i = 0;i < p;++i) g[i] = s[i];
g[p] = 0;
for (i = 0;i <= loops;++i) u[i] = 0;
v[0] = 1;
for (i = 1;i <= loops;++i) v[i] = 0;
loop = 0;
for (;;) {
/* e == -1 or d + e + loop <= 2*p */
/* f has degree p: i.e., f[p]!=0 */
/* f[i]==0 for i < p-d */
/* g has degree <=p (so it fits in p+1 coefficients) */
/* g[i]==0 for i < p-e */
/* u has degree <=loop (so it fits in loop+1 coefficients) */
/* u[i]==0 for i < p-d */
/* if invertible: u[i]==0 for i < loop-p (so can look at just p+1 coefficients) */
/* v has degree <=loop (so it fits in loop+1 coefficients) */
/* v[i]==0 for i < p-e */
/* v[i]==0 for i < loop-p (so can look at just p+1 coefficients) */
if (loop >= loops) break;
c = mod3_quotient(g[p],f[p]);
vectormod3_minusproduct(g,p + 1,g,f,c);
vectormod3_shift(g,p + 1);
#ifdef SIMPLER
vectormod3_minusproduct(v,loops + 1,v,u,c);
vectormod3_shift(v,loops + 1);
#else
if (loop < p) {
vectormod3_minusproduct(v,loop + 1,v,u,c);
vectormod3_shift(v,loop + 2);
} else {
vectormod3_minusproduct(v + loop - p,p + 1,v + loop - p,u + loop - p,c);
vectormod3_shift(v + loop - p,p + 2);
}
#endif
e -= 1;
++loop;
swapmask = smaller_mask(e,d) & mod3_nonzero_mask(g[p]);
swap(&e,&d,sizeof e,swapmask);
swap(f,g,(p + 1) * sizeof(small),swapmask);
#ifdef SIMPLER
swap(u,v,(loops + 1) * sizeof(small),swapmask);
#else
if (loop < p) {
swap(u,v,(loop + 1) * sizeof(small),swapmask);
} else {
swap(u + loop - p,v + loop - p,(p + 1) * sizeof(small),swapmask);
}
#endif
}
c = mod3_reciprocal(f[p]);
vectormod3_product(r,p,u + p,c);
return smaller_mask(0,d);
}

View File

@ -0,0 +1,24 @@
#include "randombytes.h"
#include "small.h"
#ifdef KAT
/* NIST KAT generator fails to provide chunk-independence */
static unsigned char x[4*761];
static long long pos = 4*761;
#endif
crypto_int32 small_random32(void)
{
#ifdef KAT
if (pos == 4*761) {
randombytes(x,sizeof x);
pos = 0;
}
pos += 4;
return x[pos - 4] + (x[pos - 3] << 8) + (x[pos - 2] << 16) + (x[pos - 1] << 24);
#else
unsigned char x[4];
randombytes(x,4);
return x[0] + (x[1] << 8) + (x[2] << 16) + (x[3] << 24);
#endif
}

View File

@ -0,0 +1,14 @@
#include "params.h"
#include "randombytes.h"
#include "crypto_uint32.h"
#include "small.h"
void small_random(small *g)
{
int i;
for (i = 0;i < p;++i) {
crypto_uint32 r = small_random32();
g[i] = (small) (((1073741823 & r) * 3) >> 30) - 1;
}
}

View File

@ -0,0 +1,16 @@
#include "params.h"
#include "randombytes.h"
#include "int32_sort.h"
#include "small.h"
void small_random_weightw(small *f)
{
crypto_int32 r[p];
int i;
for (i = 0;i < p;++i) r[i] = small_random32();
for (i = 0;i < w;++i) r[i] &= -2;
for (i = w;i < p;++i) r[i] = (r[i] & -3) | 1;
int32_sort(r,p);
for (i = 0;i < p;++i) f[i] = ((small) (r[i] & 3)) - 1;
}

View File

@ -0,0 +1,128 @@
#include "params.h"
#include "crypto_uint32.h"
#include "rq.h"
void rq_encode(unsigned char *c,const modq *f)
{
crypto_int32 f0, f1, f2, f3, f4;
int i;
for (i = 0;i < p/5;++i) {
f0 = *f++ + qshift;
f1 = *f++ + qshift;
f2 = *f++ + qshift;
f3 = *f++ + qshift;
f4 = *f++ + qshift;
/* now want f0 + 6144*f1 + ... as a 64-bit integer */
f1 *= 3;
f2 *= 9;
f3 *= 27;
f4 *= 81;
/* now want f0 + f1<<11 + f2<<22 + f3<<33 + f4<<44 */
f0 += f1 << 11;
*c++ = f0; f0 >>= 8;
*c++ = f0; f0 >>= 8;
f0 += f2 << 6;
*c++ = f0; f0 >>= 8;
*c++ = f0; f0 >>= 8;
f0 += f3 << 1;
*c++ = f0; f0 >>= 8;
f0 += f4 << 4;
*c++ = f0; f0 >>= 8;
*c++ = f0; f0 >>= 8;
*c++ = f0;
}
/* XXX: using p mod 5 = 1 */
f0 = *f++ + qshift;
*c++ = f0; f0 >>= 8;
*c++ = f0;
}
void rq_decode(modq *f,const unsigned char *c)
{
crypto_uint32 c0, c1, c2, c3, c4, c5, c6, c7;
crypto_uint32 f0, f1, f2, f3, f4;
int i;
for (i = 0;i < p/5;++i) {
c0 = *c++;
c1 = *c++;
c2 = *c++;
c3 = *c++;
c4 = *c++;
c5 = *c++;
c6 = *c++;
c7 = *c++;
/* f0 + f1*6144 + f2*6144^2 + f3*6144^3 + f4*6144^4 */
/* = c0 + c1*256 + ... + c6*256^6 + c7*256^7 */
/* with each f between 0 and 4590 */
c6 += c7 << 8;
/* c6 <= 23241 = floor(4591*6144^4/2^48) */
/* f4 = (16/81)c6 + (1/1296)(c5+[0,1]) - [0,0.75] */
/* claim: 2^19 f4 < x < 2^19(f4+1) */
/* where x = 103564 c6 + 405(c5+1) */
/* proof: x - 2^19 f4 = (76/81)c6 + (37/81)c5 + 405 - (32768/81)[0,1] + 2^19[0,0.75] */
/* at least 405 - 32768/81 > 0 */
/* at most (76/81)23241 + (37/81)255 + 405 + 2^19 0.75 < 2^19 */
f4 = (103564*c6 + 405*(c5+1)) >> 19;
c5 += c6 << 8;
c5 -= (f4 * 81) << 4;
c4 += c5 << 8;
/* f0 + f1*6144 + f2*6144^2 + f3*6144^3 */
/* = c0 + c1*256 + c2*256^2 + c3*256^3 + c4*256^4 */
/* c4 <= 247914 = floor(4591*6144^3/2^32) */
/* f3 = (1/54)(c4+[0,1]) - [0,0.75] */
/* claim: 2^19 f3 < x < 2^19(f3+1) */
/* where x = 9709(c4+2) */
/* proof: x - 2^19 f3 = 19418 - (1/27)c4 - (262144/27)[0,1] + 2^19[0,0.75] */
/* at least 19418 - 247914/27 - 262144/27 > 0 */
/* at most 19418 + 2^19 0.75 < 2^19 */
f3 = (9709*(c4+2)) >> 19;
c4 -= (f3 * 27) << 1;
c3 += c4 << 8;
/* f0 + f1*6144 + f2*6144^2 */
/* = c0 + c1*256 + c2*256^2 + c3*256^3 */
/* c3 <= 10329 = floor(4591*6144^2/2^24) */
/* f2 = (4/9)c3 + (1/576)c2 + (1/147456)c1 + (1/37748736)c0 - [0,0.75] */
/* claim: 2^19 f2 < x < 2^19(f2+1) */
/* where x = 233017 c3 + 910(c2+2) */
/* proof: x - 2^19 f2 = 1820 + (1/9)c3 - (2/9)c2 - (32/9)c1 - (1/72)c0 + 2^19[0,0.75] */
/* at least 1820 - (2/9)255 - (32/9)255 - (1/72)255 > 0 */
/* at most 1820 + (1/9)10329 + 2^19 0.75 < 2^19 */
f2 = (233017*c3 + 910*(c2+2)) >> 19;
c2 += c3 << 8;
c2 -= (f2 * 9) << 6;
c1 += c2 << 8;
/* f0 + f1*6144 */
/* = c0 + c1*256 */
/* c1 <= 110184 = floor(4591*6144/2^8) */
/* f1 = (1/24)c1 + (1/6144)c0 - (1/6144)f0 */
/* claim: 2^19 f1 < x < 2^19(f1+1) */
/* where x = 21845(c1+2) + 85 c0 */
/* proof: x - 2^19 f1 = 43690 - (1/3)c1 - (1/3)c0 + 2^19 [0,0.75] */
/* at least 43690 - (1/3)110184 - (1/3)255 > 0 */
/* at most 43690 + 2^19 0.75 < 2^19 */
f1 = (21845*(c1+2) + 85*c0) >> 19;
c1 -= (f1 * 3) << 3;
c0 += c1 << 8;
f0 = c0;
*f++ = modq_freeze(f0 + q - qshift);
*f++ = modq_freeze(f1 + q - qshift);
*f++ = modq_freeze(f2 + q - qshift);
*f++ = modq_freeze(f3 + q - qshift);
*f++ = modq_freeze(f4 + q - qshift);
}
c0 = *c++;
c1 = *c++;
c0 += c1 << 8;
*f++ = modq_freeze(c0 + q - qshift);
}

View File

@ -0,0 +1,28 @@
#ifndef rq_h
#define rq_h
#include "modq.h"
#include "small.h"
#define rq_encode crypto_kem_sntrup4591761_ref_rq_encode
extern void rq_encode(unsigned char *,const modq *);
#define rq_decode crypto_kem_sntrup4591761_ref_rq_decode
extern void rq_decode(modq *,const unsigned char *);
#define rq_encoderounded crypto_kem_sntrup4591761_ref_rq_encoderounded
extern void rq_encoderounded(unsigned char *,const modq *);
#define rq_decoderounded crypto_kem_sntrup4591761_ref_rq_decoderounded
extern void rq_decoderounded(modq *,const unsigned char *);
#define rq_round3 crypto_kem_sntrup4591761_ref_rq_round
extern void rq_round3(modq *,const modq *);
#define rq_mult crypto_kem_sntrup4591761_ref_rq_mult
extern void rq_mult(modq *,const modq *,const small *);
#define rq_recip3 crypto_kem_sntrup4591761_ref_rq_recip3
int rq_recip3(modq *,const small *);
#endif

View File

@ -0,0 +1,30 @@
#include "params.h"
#include "rq.h"
void rq_mult(modq *h,const modq *f,const small *g)
{
modq fg[p + p - 1];
modq result;
int i, j;
for (i = 0;i < p;++i) {
result = 0;
for (j = 0;j <= i;++j)
result = modq_plusproduct(result,f[j],g[i - j]);
fg[i] = result;
}
for (i = p;i < p + p - 1;++i) {
result = 0;
for (j = i - p + 1;j < p;++j)
result = modq_plusproduct(result,f[j],g[i - j]);
fg[i] = result;
}
for (i = p + p - 2;i >= p;--i) {
fg[i - p] = modq_sum(fg[i - p],fg[i]);
fg[i - p + 1] = modq_sum(fg[i - p + 1],fg[i]);
}
for (i = 0;i < p;++i)
h[i] = fg[i];
}

View File

@ -0,0 +1,125 @@
#include "params.h"
#include "swap.h"
#include "rq.h"
/* caller must ensure that x-y does not overflow */
static int smaller_mask(int x,int y)
{
return (x - y) >> 31;
}
static void vectormodq_product(modq *z,int len,const modq *x,const modq c)
{
int i;
for (i = 0;i < len;++i) z[i] = modq_product(x[i],c);
}
static void vectormodq_minusproduct(modq *z,int len,const modq *x,const modq *y,const modq c)
{
int i;
for (i = 0;i < len;++i) z[i] = modq_minusproduct(x[i],y[i],c);
}
static void vectormodq_shift(modq *z,int len)
{
int i;
for (i = len - 1;i > 0;--i) z[i] = z[i - 1];
z[0] = 0;
}
/*
r = (3s)^(-1) mod m, returning 0, if s is invertible mod m
or returning -1 if s is not invertible mod m
r,s are polys of degree <p
m is x^p-x-1
*/
int rq_recip3(modq *r,const small *s)
{
const int loops = 2*p + 1;
int loop;
modq f[p + 1];
modq g[p + 1];
modq u[loops + 1];
modq v[loops + 1];
modq c;
int i;
int d = p;
int e = p;
int swapmask;
for (i = 2;i < p;++i) f[i] = 0;
f[0] = -1;
f[1] = -1;
f[p] = 1;
/* generalization: can initialize f to any polynomial m */
/* requirements: m has degree exactly p, nonzero constant coefficient */
for (i = 0;i < p;++i) g[i] = 3 * s[i];
g[p] = 0;
for (i = 0;i <= loops;++i) u[i] = 0;
v[0] = 1;
for (i = 1;i <= loops;++i) v[i] = 0;
loop = 0;
for (;;) {
/* e == -1 or d + e + loop <= 2*p */
/* f has degree p: i.e., f[p]!=0 */
/* f[i]==0 for i < p-d */
/* g has degree <=p (so it fits in p+1 coefficients) */
/* g[i]==0 for i < p-e */
/* u has degree <=loop (so it fits in loop+1 coefficients) */
/* u[i]==0 for i < p-d */
/* if invertible: u[i]==0 for i < loop-p (so can look at just p+1 coefficients) */
/* v has degree <=loop (so it fits in loop+1 coefficients) */
/* v[i]==0 for i < p-e */
/* v[i]==0 for i < loop-p (so can look at just p+1 coefficients) */
if (loop >= loops) break;
c = modq_quotient(g[p],f[p]);
vectormodq_minusproduct(g,p + 1,g,f,c);
vectormodq_shift(g,p + 1);
#ifdef SIMPLER
vectormodq_minusproduct(v,loops + 1,v,u,c);
vectormodq_shift(v,loops + 1);
#else
if (loop < p) {
vectormodq_minusproduct(v,loop + 1,v,u,c);
vectormodq_shift(v,loop + 2);
} else {
vectormodq_minusproduct(v + loop - p,p + 1,v + loop - p,u + loop - p,c);
vectormodq_shift(v + loop - p,p + 2);
}
#endif
e -= 1;
++loop;
swapmask = smaller_mask(e,d) & modq_nonzero_mask(g[p]);
swap(&e,&d,sizeof e,swapmask);
swap(f,g,(p + 1) * sizeof(modq),swapmask);
#ifdef SIMPLER
swap(u,v,(loops + 1) * sizeof(modq),swapmask);
#else
if (loop < p) {
swap(u,v,(loop + 1) * sizeof(modq),swapmask);
} else {
swap(u + loop - p,v + loop - p,(p + 1) * sizeof(modq),swapmask);
}
#endif
}
c = modq_reciprocal(f[p]);
vectormodq_product(r,p,u + p,c);
return smaller_mask(0,d);
}

View File

@ -0,0 +1,10 @@
#include "params.h"
#include "rq.h"
void rq_round3(modq *h,const modq *f)
{
int i;
for (i = 0;i < p;++i)
h[i] = ((21846 * (f[i] + 2295) + 32768) >> 16) * 3 - 2295;
}

View File

@ -0,0 +1,101 @@
#include "params.h"
#include "crypto_uint32.h"
#include "rq.h"
void rq_encoderounded(unsigned char *c,const modq *f)
{
crypto_int32 f0, f1, f2;
int i;
for (i = 0;i < p/3;++i) {
f0 = *f++ + qshift;
f1 = *f++ + qshift;
f2 = *f++ + qshift;
f0 = (21846 * f0) >> 16;
f1 = (21846 * f1) >> 16;
f2 = (21846 * f2) >> 16;
/* now want f0 + f1*1536 + f2*1536^2 as a 32-bit integer */
f2 *= 3;
f1 += f2 << 9;
f1 *= 3;
f0 += f1 << 9;
*c++ = f0; f0 >>= 8;
*c++ = f0; f0 >>= 8;
*c++ = f0; f0 >>= 8;
*c++ = f0;
}
/* XXX: using p mod 3 = 2 */
f0 = *f++ + qshift;
f1 = *f++ + qshift;
f0 = (21846 * f0) >> 16;
f1 = (21846 * f1) >> 16;
f1 *= 3;
f0 += f1 << 9;
*c++ = f0; f0 >>= 8;
*c++ = f0; f0 >>= 8;
*c++ = f0;
}
void rq_decoderounded(modq *f,const unsigned char *c)
{
crypto_uint32 c0, c1, c2, c3;
crypto_uint32 f0, f1, f2;
int i;
for (i = 0;i < p/3;++i) {
c0 = *c++;
c1 = *c++;
c2 = *c++;
c3 = *c++;
/* f0 + f1*1536 + f2*1536^2 */
/* = c0 + c1*256 + c2*256^2 + c3*256^3 */
/* with each f between 0 and 1530 */
/* f2 = (64/9)c3 + (1/36)c2 + (1/9216)c1 + (1/2359296)c0 - [0,0.99675] */
/* claim: 2^21 f2 < x < 2^21(f2+1) */
/* where x = 14913081*c3 + 58254*c2 + 228*(c1+2) */
/* proof: x - 2^21 f2 = 456 - (8/9)c0 + (4/9)c1 - (2/9)c2 + (1/9)c3 + 2^21 [0,0.99675] */
/* at least 456 - (8/9)255 - (2/9)255 > 0 */
/* at most 456 + (4/9)255 + (1/9)255 + 2^21 0.99675 < 2^21 */
f2 = (14913081*c3 + 58254*c2 + 228*(c1+2)) >> 21;
c2 += c3 << 8;
c2 -= (f2 * 9) << 2;
/* f0 + f1*1536 */
/* = c0 + c1*256 + c2*256^2 */
/* c2 <= 35 = floor((1530+1530*1536)/256^2) */
/* f1 = (128/3)c2 + (1/6)c1 + (1/1536)c0 - (1/1536)f0 */
/* claim: 2^21 f1 < x < 2^21(f1+1) */
/* where x = 89478485*c2 + 349525*c1 + 1365*(c0+1) */
/* proof: x - 2^21 f1 = 1365 - (1/3)c2 - (1/3)c1 - (1/3)c0 + (4096/3)f0 */
/* at least 1365 - (1/3)35 - (1/3)255 - (1/3)255 > 0 */
/* at most 1365 + (4096/3)1530 < 2^21 */
f1 = (89478485*c2 + 349525*c1 + 1365*(c0+1)) >> 21;
c1 += c2 << 8;
c1 -= (f1 * 3) << 1;
c0 += c1 << 8;
f0 = c0;
*f++ = modq_freeze(f0 * 3 + q - qshift);
*f++ = modq_freeze(f1 * 3 + q - qshift);
*f++ = modq_freeze(f2 * 3 + q - qshift);
}
c0 = *c++;
c1 = *c++;
c2 = *c++;
f1 = (89478485*c2 + 349525*c1 + 1365*(c0+1)) >> 21;
c1 += c2 << 8;
c1 -= (f1 * 3) << 1;
c0 += c1 << 8;
f0 = c0;
*f++ = modq_freeze(f0 * 3 + q - qshift);
*f++ = modq_freeze(f1 * 3 + q - qshift);
}

View File

@ -0,0 +1,37 @@
#include "params.h"
#include "small.h"
/* XXX: these functions rely on p mod 4 = 1 */
/* all coefficients in -1, 0, 1 */
void small_encode(unsigned char *c,const small *f)
{
small c0;
int i;
for (i = 0;i < p/4;++i) {
c0 = *f++ + 1;
c0 += (*f++ + 1) << 2;
c0 += (*f++ + 1) << 4;
c0 += (*f++ + 1) << 6;
*c++ = c0;
}
c0 = *f++ + 1;
*c++ = c0;
}
void small_decode(small *f,const unsigned char *c)
{
unsigned char c0;
int i;
for (i = 0;i < p/4;++i) {
c0 = *c++;
*f++ = ((small) (c0 & 3)) - 1; c0 >>= 2;
*f++ = ((small) (c0 & 3)) - 1; c0 >>= 2;
*f++ = ((small) (c0 & 3)) - 1; c0 >>= 2;
*f++ = ((small) (c0 & 3)) - 1;
}
c0 = *c++;
*f++ = ((small) (c0 & 3)) - 1;
}

View File

@ -0,0 +1,24 @@
#ifndef small_h
#define small_h
#include "crypto_int8.h"
#include "crypto_int32.h"
typedef crypto_int8 small;
#define small_encode crypto_kem_sntrup4591761_ref_small_encode
extern void small_encode(unsigned char *,const small *);
#define small_decode crypto_kem_sntrup4591761_ref_small_decode
extern void small_decode(small *,const unsigned char *);
#define small_random32 crypto_kem_sntrup4591761_ref_small_random32
extern crypto_int32 small_random32(void);
#define small_random crypto_kem_sntrup4591761_ref_small_random
extern void small_random(small *);
#define small_random_weightw crypto_kem_sntrup4591761_ref_small_random_weightw
extern void small_random_weightw(small *);
#endif

View File

@ -0,0 +1,19 @@
#include "swap.h"
void swap(void *x,void *y,int bytes,int mask)
{
int i;
char xi, yi, c, t;
c = mask;
for (i = 0;i < bytes;++i) {
xi = i[(char *) x];
yi = i[(char *) y];
t = c & (xi ^ yi);
xi ^= t;
yi ^= t;
i[(char *) x] = xi;
i[(char *) y] = yi;
}
}

View File

@ -0,0 +1,7 @@
#ifndef swap_h
#define swap_h
#define swap crypto_kem_sntrup4591761_ref_swap
extern void swap(void *,void *,int,int);
#endif

View File

@ -0,0 +1 @@
83705d49d3a8cb2e16028b86ea6bd44a969b51c2e5114ee02767cf2ddf1aac26

View File

@ -0,0 +1 @@
336647fe0ed2f6e0d4b15d05e68faec67a81312d769ad3cbee8e0f2de83c2dde

View File

@ -0,0 +1 @@
Streamlined NTRU Prime 4591^761

View File

@ -0,0 +1,5 @@
Alphabetical order:
Daniel J. Bernstein
Chitchanok Chuengsatiansup
Tanja Lange
Christine van Vredendaal

View File

@ -317,7 +317,7 @@ main(int argc, char *argv[])
llarp_findOrCreateIdentity(&crypt, ident_keyfile.string().c_str(),
identity);
// get identity public key
uint8_t *pubkey = llarp::seckey_topublic(identity);
const uint8_t *pubkey = llarp::seckey_topublic(identity);
llarp_rc_set_pubsigkey(&rc, pubkey);
llarp_rc_sign(&crypt, identity, &rc);

View File

@ -9,19 +9,18 @@ MDS(x, k) is 256 bit blake2b hmac of x with secret value k
SE(k, n, x) is chacha20 encrypt data x using symettric key k and nounce n
SD(k, n, x) is chacha20 dectypt data x using symettric key k and nounce n
S(k, x) is sign x with ed25519 using secret key k
EDKG() is generate ec keypair (p, s) public key p (32 bytes), secret key s (643 bytes)
EDKG() is generate ec keypair (p, s) public key p (32 bytes), secret key s (64 bytes)
V(k, x, sig) is verify x data using signature sig using public key k
EDDH(a, b) is curve25519 scalar multiplication of a and b
HKE(a, b, x) is hashed key exchange between a and b using a secret key x HS(a + b + EDDH(x, b))
TKE(a, b, sk, n) is a transport shared secret kdf using MDS(n, HKE(a, b, sk))
TKE(a, b, x, n) is a transport shared secret kdf using MDS(n, HKE(a, b, x))
when A is client and B is server where n is a 32 bytes shared random
client computes TKE(A.pk, B.pk, A.sk, n)
server computes TKE(A.pk, B.pk, B.sk, n)
PDH(a, b, x) is path shared secret generation HS(a + b + curve41417_scalar_mult(x, b))
PDH(a, b, x) is path shared secret generation HS(a + b + EDDH(x, b))
PKE(a, b, x, n) is a path shared secret kdf using MDS(n, PDH(a, b, x))
@ -34,4 +33,7 @@ S_a is equal to S_b
RAND(n) is n random bytes
PQKG() is generate a sntrup4591761 key pair (sk, pk)
PQKE_A(pk) is alice generating (x, k) where x is sntrup4591761 ciphertext block and k is the session key
PQKE_B(x, sk) is bob calculating k where x is sntrup4591761 ciphertext block, sk is bob's sntrup4591761 secretkey and k is the session key

View File

@ -176,8 +176,9 @@ x is the timestamp seconds since epoch that this introduction expires at
introduction set (IS)
a signed set of introductions for a hidden service
a is the service info
a is the service info of the publisher
i is the list of introductions that this service is advertising with
k is the public key to use when doing encryption to this hidden service
n is a 16 byte null padded utf-8 encoded string tagging the hidden service in
a topic searchable via a lookup (optional)
v is the protocol version
@ -188,6 +189,7 @@ service's signing key.
{
a: SI,
i: [ I, I, I, ... ],
k: "<1218 bytes sntrup4591761 public key block>",
n: "<16 bytes service topic (optional)>",
v: 0,
w: optional proof of work,
@ -528,54 +530,6 @@ B is set to a backoff value.
R contains additional metadata text describing why the exit was rejected.
hidden service frame (HSF)
TODO: document this better
intro message (variant 1)
start a new session
{
A: "H",
D: "<N bytes encrypted HSD>",
H: "<32 bytes ephemeral public encryption key>",
N: "<32 bytes nonce for key exchange>",
S: 0,
V: 0,
Z: "<64 bytes signature of entire message using sender's signing key>"
}
D is encrypted with session key K which is derived by
K = PKE(H, SI.enckey, N)
ordered data message (variant 2)
{
A: "H",
D: "<N bytes encrypted HSD>",
N: "<32 bytes nonce for symettric cipher>",
S: sequence_number_uint64,
T: "<16 bytes converstation tag>",
V: 0,
Z: "<64 bytes signature using sender's signing key>"
}
hidden service data (HSD)
data sent anonymously over the network to a recipiant from a sender.
sent inside a HSFM encrypted with a shared secret.
{
a: protocol_number_uint,
d: "<N bytes payload>",
i: Introduction for reply,
s: SI of sender,
t: "<16 bytes converstation tag present only in message 0>",
v: 0
}
transfer data fragment message (TDFM)
transfer data between paths.
@ -592,6 +546,132 @@ transfer data to another path with id P on the local router place a random 32 by
into y and z values into a LRDM message (respectively) and send it in the
downstream direction.
hidden service data (HSD)
data sent anonymously over the network to a recipiant from a sender.
sent inside a HSFM encrypted with a shared secret.
{
a: protocol_number_uint,
d: "<N bytes payload>",
i: Introduction for reply,
n: uint_message_sequence_number,
o: N seconds until this converstation plans terminate,
s: SI of sender,
t: "<16 bytes converstation tag present only when n is 0>",
v: 0
}
hidden service frame (HSF)
TODO: document this better
intro message (variant 1)
start a new session
{
A: "H",
C: "<1048 bytes ciphertext block>",
D: "<N bytes encrypted HSD>",
N: "<32 bytes nonce for key exchange>",
V: 0,
Z: "<64 bytes signature of entire message using sender's signing key>"
}
alice (A) wants to talk to bob (B) over the network, both have hidden services
set up and are online on the network.
A and B are both SI.
A_sk is alice's private signing key.
for alice (A) to send the string "beep" to bob (B), alice picks an introduction
to use on one of her paths (I_A) such that I_A is aligning with one of bobs's
paths (I_B)
alice generates:
T = RAND(16)
m = {
a: 0,
d: "beep",
i: I_A,
n: 0,
s: A,
t: T,
v: 0
}
X = BE(m)
C, K = PQKE_A(I_B.k)
N = RAND(32)
D = SE(X, K, N)
M = {
A: "T",
P: I_B.P,
S: uint64_sequence_number,
T: {
A: "H",
C: C,
D: D,
N: N,
V: 0,
Z: "\x00" * 32
},
V: 0
}
Z = S(A_sk, BE(M))
alice transmits a TDFM to router with public key I_B.K via her path that ends
with router with public key I_B.k
{
A: "T",
P: I_B.P,
S: uint64_sequence_number,
T: {
A: "H",
C: C,
D: D,
N: N,
V: 0,
Z: Z
},
V: 0
}
the shared secret (S) for further message encryption is:
S = HS(K + PKE(A, B, sk, N))
given sk is the local secret encryption key used by the current hidden service
please note:
signature verification can only be done after decryption
TODO: explain bob's side too (it's invsere of alice's process)
data from a previously made session (variant 2)
transfer data on a session previously made
{
A: "H",
D: "<N bytes encrypted HSD>",
N: "<32 bytes nonce for symettric cipher>",
T: "<16 bytes converstation tag>",
V: 0,
Z: "<64 bytes signature using sender's signing key>"
}
transfer ip traffic message (TITM)
transfer ip traffic for exit

View File

@ -11,12 +11,10 @@
namespace llarp
{
/// aligned buffer, sz must be multiple of 8 bytes
/// aligned buffer, aligns to the nears 8 bytes
template < size_t sz, bool randomize = false >
struct AlignedBuffer
{
static_assert(sz % 8 == 0, "aligned buffer size is not a multiple of 8");
AlignedBuffer()
{
if(randomize)
@ -121,20 +119,13 @@ namespace llarp
bool
IsZero() const
{
size_t idx = sz / 8;
while(idx)
{
if(l[--idx])
return false;
}
return true;
return sodium_is_zero(b, sz) != 0;
}
void
Zero()
{
for(size_t idx = 0; idx * 8 < sz; ++idx)
l[idx] = 0;
sodium_memzero(l, sz);
}
void
@ -212,7 +203,7 @@ namespace llarp
protected:
union {
byte_t b[sz];
uint64_t l[sz / 8];
uint64_t l[(sz / 8) + (sz % 8)];
};
};

View File

@ -2,8 +2,9 @@
#define LLARP_BENCODE_HPP
#include <llarp/bencode.h>
#include <llarp/buffer.hpp>
#include <llarp/logger.hpp>
#include <llarp/mem.hpp>
#include <set>
namespace llarp
@ -197,6 +198,16 @@ namespace llarp
r->buffer);
return true;
}
template < size_t bufsz, size_t align = 128 >
void
Dump() const
{
byte_t tmp[bufsz] = {0};
auto buf = llarp::StackBuffer< decltype(tmp) >(tmp);
if(BEncode(&buf))
llarp::DumpBuffer< decltype(buf), align >(buf);
}
};
} // namespace llarp

44
include/llarp/buffer.hpp Normal file
View File

@ -0,0 +1,44 @@
#ifndef LLARP_BUFFER_HPP
#define LLARP_BUFFER_HPP
#include <llarp/buffer.h>
namespace llarp
{
template < typename T >
llarp_buffer_t
StackBuffer(T& stack)
{
llarp_buffer_t buff;
buff.base = &stack[0];
buff.cur = buff.base;
buff.sz = sizeof(stack);
return buff;
}
/** initialize llarp_buffer_t from container */
template < typename T >
llarp_buffer_t
Buffer(T& t)
{
llarp_buffer_t buff;
buff.base = &t[0];
buff.cur = buff.base;
buff.sz = t.size();
return buff;
}
template < typename T >
llarp_buffer_t
ConstBuffer(const T& t)
{
llarp_buffer_t buff;
buff.base = (byte_t*)&t[0];
buff.cur = buff.base;
buff.sz = t.size();
return buff;
}
} // namespace llarp
#endif

View File

@ -24,26 +24,22 @@
#define HMACSIZE 32
#define PATHIDSIZE 16
/*
typedef byte_t llarp_pubkey_t[PUBKEYSIZE];
typedef byte_t llarp_seckey_t[SECKEYSIZE];
typedef byte_t llarp_nonce_t[NONCESIZE];
typedef byte_t llarp_sharedkey_t[SHAREDKEYSIZE];
typedef byte_t llarp_hash_t[HASHSIZE];
typedef byte_t llarp_shorthash_t[SHORTHASHSIZE];
typedef byte_t llarp_hmac_t[HMACSIZE];
typedef byte_t llarp_hmacsec_t[HMACSECSIZE];
typedef byte_t llarp_sig_t[SIGSIZE];
typedef byte_t llarp_tunnel_nonce_t[TUNNONCESIZE];
*/
#include <libntrup/ntru.h>
#define PQ_CIPHERTEXTSIZE crypto_kem_CIPHERTEXTBYTES
#define PQ_PUBKEYSIZE crypto_kem_PUBLICKEYBYTES
#define PQ_SECRETKEYSIZE crypto_kem_SECRETKEYBYTES
#define PQ_KEYPAIRSIZE (PQ_SECRETKEYSIZE + PQ_SECRETKEYSIZE)
/// label functors
/// PKE(result, publickey, secretkey, nonce)
typedef bool (*llarp_path_dh_func)(byte_t *, byte_t *, byte_t *, byte_t *);
typedef bool (*llarp_path_dh_func)(byte_t *, const byte_t *, const byte_t *,
const byte_t *);
/// TKE(result, publickey, secretkey, nonce)
typedef bool (*llarp_transport_dh_func)(byte_t *, byte_t *, byte_t *, byte_t *);
typedef bool (*llarp_transport_dh_func)(byte_t *, const byte_t *,
const byte_t *, const byte_t *);
/// SD/SE(buffer, key, nonce)
typedef bool (*llarp_sym_cipher_func)(llarp_buffer_t, const byte_t *,
@ -96,6 +92,12 @@ struct llarp_crypto
void (*identity_keygen)(byte_t *);
/// generate encryption keypair
void (*encryption_keygen)(byte_t *);
/// generate post quantum encrytion key
void (*pqe_keygen)(byte_t *);
/// post quantum decrypt (buffer, sharedkey_dst, sec)
bool (*pqe_decrypt)(const byte_t *, byte_t *, const byte_t *);
/// post quantum encrypt (buffer, sharedkey_dst, pub)
bool (*pqe_encrypt)(byte_t *, byte_t *, const byte_t *);
};
/// set crypto function pointers to use libsodium

View File

@ -11,36 +11,26 @@ namespace llarp
const byte_t*
seckey_topublic(const byte_t* secret);
byte_t*
seckey_topublic(byte_t* secret);
const byte_t*
pq_keypair_to_public(const byte_t* keypair);
typedef AlignedBuffer< 32 > SharedSecret;
const byte_t*
pq_keypair_to_secret(const byte_t* keypair);
typedef AlignedBuffer< SHAREDKEYSIZE > SharedSecret;
typedef AlignedBuffer< 32 > KeyExchangeNonce;
typedef AlignedBuffer< PUBKEYSIZE > PubKey;
struct PubKeyHash
{
std::size_t
operator()(PubKey const& a) const noexcept
{
size_t sz = 0;
memcpy(&sz, a.data(), sizeof(size_t));
return sz;
}
};
typedef AlignedBuffer< SECKEYSIZE > SecretKey;
typedef AlignedBuffer< SHORTHASHSIZE > ShortHash;
typedef AlignedBuffer< SIGSIZE > Signature;
typedef AlignedBuffer< TUNNONCESIZE > TunnelNonce;
typedef AlignedBuffer< 24 > SymmNonce;
typedef AlignedBuffer< NONCESIZE > SymmNonce;
typedef AlignedBuffer< 32 > SymmKey;
typedef AlignedBuffer< PQ_CIPHERTEXTSIZE + 1 > PQCipherBlock;
typedef AlignedBuffer< PQ_PUBKEYSIZE > PQPubKey;
typedef AlignedBuffer< PQ_KEYPAIRSIZE > PQKeyPair;
} // namespace llarp
#endif

View File

@ -26,7 +26,7 @@ namespace llarp
FindPendingTX(const Key_t& owner, uint64_t txid);
void
RemovePendingLookup(const Key_t& owner, uint64_t txid);
RemovePendingTX(const Key_t& owner, uint64_t txid);
void
LookupServiceDirect(const Key_t& target, const Key_t& whoasked,
@ -64,7 +64,7 @@ namespace llarp
const llarp::PathID_t& path, Key_t askpeer);
template < typename Job, typename Result >
void
bool
TryLookupAgain(Job* j, Result r, uint64_t R)
{
const Key_t targetKey = j->target.ToKey();
@ -73,14 +73,12 @@ namespace llarp
if(!nodes->FindCloseExcluding(targetKey, askpeer, exclude))
{
j->Exausted();
delete j;
return;
return true;
}
if((OurKey() ^ targetKey) < (askpeer ^ targetKey))
{
j->Exausted();
delete j;
return;
return true;
}
auto id = ++ids;
TXOwner ownerKey;
@ -94,6 +92,7 @@ namespace llarp
" with txid=", id);
DHTSendTo(askpeer, msg);
j->asked.insert(std::move(askpeer));
return false;
}
void

View File

@ -24,14 +24,24 @@ namespace llarp
return bencode_write_bytestring(buf, _data, _sz);
}
Encrypted&
operator=(const Encrypted& other)
{
return (*this) = other.Buffer();
}
Encrypted&
operator=(llarp_buffer_t buf)
{
if(_data)
delete[] _data;
_data = nullptr;
_sz = buf.sz;
_data = new byte_t[_sz];
memcpy(_data, buf.base, _sz);
if(_sz)
{
_data = new byte_t[_sz];
memcpy(_data, buf.base, _sz);
}
UpdateBuffer();
return *this;
}

View File

@ -22,7 +22,7 @@ namespace llarp
{
}
EncryptedFrame(byte_t* buf, size_t sz) : Encrypted(buf, sz)
EncryptedFrame(const byte_t* buf, size_t sz) : Encrypted(buf, sz)
{
}
EncryptedFrame(size_t sz)
@ -42,10 +42,11 @@ namespace llarp
}
bool
DecryptInPlace(byte_t* seckey, llarp_crypto* crypto);
DecryptInPlace(const byte_t* seckey, llarp_crypto* crypto);
bool
EncryptInPlace(byte_t* seckey, byte_t* other, llarp_crypto* crypto);
EncryptInPlace(const byte_t* seckey, const byte_t* other,
llarp_crypto* crypto);
};
/// TOOD: can only handle 1 frame at a time
@ -116,7 +117,8 @@ namespace llarp
ctx->result(nullptr, ctx->context);
}
AsyncFrameDecrypter(llarp_crypto* c, byte_t* secretkey, DecryptHandler h)
AsyncFrameDecrypter(llarp_crypto* c, const byte_t* secretkey,
DecryptHandler h)
: result(h), crypto(c), seckey(secretkey)
{
}
@ -124,7 +126,7 @@ namespace llarp
DecryptHandler result;
User* context;
llarp_crypto* crypto;
byte_t* seckey;
const byte_t* seckey;
EncryptedFrame* target;
void

View File

@ -8,13 +8,14 @@
#define ssize_t long
#endif
#else
#include <netinet/in.h>
#include <sys/socket.h>
#endif
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
#include <tuntap.h>
/**
* ev.h
*
@ -74,4 +75,33 @@ llarp_ev_udp_sendto(struct llarp_udp_io *udp, const struct sockaddr *to,
int
llarp_ev_close_udp(struct llarp_udp_io *udp);
#ifdef _WIN32
#define IFNAMSIZ (16)
#endif
struct llarp_tun_io
{
// TODO: more info?
char ifaddr[128];
int netmask;
char ifname[IFNAMSIZ + 1];
void *user;
void *impl;
struct llarp_ev_loop *parent;
/// called every event loop tick after reads
void (*tick)(struct llarp_tun_io *);
void (*recvpkt)(struct llarp_tun_io *, const void *, ssize_t);
};
/// create tun interface with network interface name ifname
/// returns true on success otherwise returns false
bool
llarp_ev_add_tun(struct llarp_ev_loop *ev, struct llarp_tun_io *tun);
/// async write a packet on tun interface
/// returns true if queued, returns false on drop
bool
llarp_ev_tun_async_write(struct llarp_tun_io *tun, const void *pkt, size_t sz);
#endif

View File

@ -72,6 +72,14 @@ struct llarp_link
void
MapAddr(const llarp::Addr &src, const llarp::PubKey &identity);
/// does nothing if we have no session already established
void
KeepAliveSessionTo(const byte_t *pubkey);
/// does nothing if we have no session already established
void
CloseSessionTo(const byte_t *pubkey);
bool
has_session_to(const byte_t *pubkey);
@ -112,7 +120,7 @@ struct llarp_link
void
RemoveSession(llarp_link_session *s);
uint8_t *
const uint8_t *
pubkey();
bool

View File

@ -87,6 +87,9 @@ struct llarp_link_session
bool
Tick(llarp_time_t now);
void
keepalive();
void
PumpCryptoOutbound();

60
include/llarp/mem.hpp Normal file
View File

@ -0,0 +1,60 @@
#ifndef LLARP_MEM_HPP
#define LLARP_MEM_HPP
#include <llarp/buffer.h>
#include <llarp/mem.h>
#include <cctype>
#include <cstdio>
namespace llarp
{
void
Zero(void *ptr, size_t sz);
template < typename T >
void
dumphex(const uint8_t *t)
{
size_t idx = 0;
while(idx < sizeof(T))
{
printf("%.2x ", t[idx++]);
if(idx % 8 == 0)
printf("\n");
}
}
template < typename T, size_t align = 128 >
void
DumpBuffer(const T &buff)
{
size_t idx = 0;
printf("buffer of size %zu\n", buff.sz);
while(idx < buff.sz)
{
if(buff.base + idx == buff.cur)
{
printf("%c[1;31m", 27);
}
if(std::isprint(buff.base[idx]))
{
printf("%c", buff.base[idx]);
}
else
{
printf("X");
}
if(buff.base + idx == buff.cur)
{
printf("%c[0;0m", 27);
}
++idx;
if(idx % align == 0)
printf("\n");
}
printf("\n");
fflush(stdout);
}
} // namespace llarp
#endif

View File

@ -5,7 +5,7 @@
#define MAXHOPS (8)
#define DEFAULT_PATH_LIFETIME (10 * 60 * 1000)
#define PATH_BUILD_TIMEOUT (30 * 1000)
#define PATH_BUILD_TIMEOUT (10 * 1000)
#define MESSAGE_PAD_SIZE (1024)
struct llarp_path_hop

View File

@ -235,6 +235,12 @@ namespace llarp
m_DataHandler = func;
}
llarp_time_t
ExpireTime() const
{
return buildStarted + hops[0].lifetime;
}
bool
Expired(llarp_time_t now) const;

View File

@ -15,6 +15,7 @@ namespace llarp
{
llarp::SecretKey enckey;
llarp::SecretKey signkey;
llarp::PQKeyPair pq;
uint64_t version = 0;
VanityNonce vanity;
@ -37,11 +38,18 @@ namespace llarp
bool
EnsureKeys(const std::string& fpath, llarp_crypto* c);
bool
KeyExchange(llarp_path_dh_func dh, byte_t* sharedkey,
const ServiceInfo& other, const byte_t* N) const;
bool
DecodeKey(llarp_buffer_t key, llarp_buffer_t* buf);
bool
SignIntroSet(IntroSet& i, llarp_crypto* c) const;
bool
Sign(llarp_crypto*, byte_t* sig, llarp_buffer_t buf) const;
};
} // namespace service
} // namespace llarp

View File

@ -49,8 +49,8 @@ namespace llarp
return crypto->verify(signkey, payload, sig);
}
byte_t*
EncryptionPublicKey()
const byte_t*
EncryptionPublicKey() const
{
return enckey;
}

View File

@ -15,12 +15,13 @@ namespace llarp
{
namespace service
{
constexpr std::size_t MAX_INTROSET_SIZE = 1024;
constexpr std::size_t MAX_INTROSET_SIZE = 4096;
struct IntroSet : public llarp::IBEncodeMessage
{
ServiceInfo A;
std::vector< Introduction > I;
PQPubKey K;
Tag topic;
llarp::PoW* W = nullptr;
llarp::Signature Z;
@ -31,6 +32,7 @@ namespace llarp
{
A = std::move(other.A);
I = std::move(other.I);
K = std::move(other.K);
version = std::move(other.version);
topic = std::move(other.topic);
W = std::move(other.W);
@ -41,6 +43,7 @@ namespace llarp
{
A = other.A;
I = other.I;
K = other.K;
version = other.version;
topic = other.topic;
if(other.W)
@ -55,6 +58,7 @@ namespace llarp
{
A = other.A;
I = other.I;
K = other.K;
version = other.version;
topic = other.topic;
if(W)
@ -80,6 +84,7 @@ namespace llarp
out << intro << ",";
}
out << "]";
out << "K=" << i.K;
auto topic = i.topic.ToString();
if(topic.size())
{

View File

@ -97,10 +97,10 @@ namespace llarp
void
EnsureRouterIsKnown(const RouterID& router);
Identity*
const Identity&
GetIdentity()
{
return &m_Identity;
return m_Identity;
}
void
@ -146,9 +146,6 @@ namespace llarp
bool
HandleHiddenServiceFrame(const ProtocolFrame* frame);
void
PutLookup(IServiceLookup* lookup, uint64_t txid);
std::string
Name() const;
@ -157,10 +154,10 @@ namespace llarp
OnIntroSetUpdate(const IntroSet* i);
void
EncryptAndSendTo(llarp_buffer_t payload);
EncryptAndSendTo(path::Path* p, llarp_buffer_t payload, ProtocolType t);
void
AsyncGenIntro(llarp_buffer_t payload);
AsyncGenIntro(path::Path* p, llarp_buffer_t payload, ProtocolType t);
/// send a fully encrypted hidden service frame
void
@ -194,7 +191,7 @@ namespace llarp
bool
GetCachedSessionKeyFor(const ConvoTag& remote,
SharedSecret& secret) const;
const byte_t*& secret) const;
void
PutCachedSessionKeyFor(const ConvoTag& remote,
const SharedSecret& secret);
@ -263,7 +260,28 @@ namespace llarp
std::unordered_map< Address, PathEnsureHook, Address::Hash >
m_PendingServiceLookups;
std::unordered_map< RouterID, uint64_t, RouterID::Hash > m_PendingRouters;
struct RouterLookupJob
{
RouterLookupJob(Endpoint* p)
{
started = llarp_time_now_ms();
txid = p->GenTXID();
}
uint64_t txid;
llarp_time_t started;
bool
IsExpired(llarp_time_t now) const
{
if(now < started)
return false;
return now - started > 5000;
}
};
std::unordered_map< RouterID, RouterLookupJob, RouterID::Hash >
m_PendingRouters;
uint64_t m_CurrentPublishTX = 0;
llarp_time_t m_LastPublish = 0;
@ -271,7 +289,8 @@ namespace llarp
/// our introset
service::IntroSet m_IntroSet;
/// pending remote service lookups by id
std::unordered_map< uint64_t, service::IServiceLookup* > m_PendingLookups;
std::unordered_map< uint64_t, std::unique_ptr< service::IServiceLookup > >
m_PendingLookups;
/// prefetch remote address list
std::set< Address > m_PrefetchAddrs;
/// hidden service tag
@ -293,7 +312,7 @@ namespace llarp
/// sessions
std::unordered_map< ConvoTag, Session, ConvoTag::Hash > m_Sessions;
struct CachedTagResult : public IServiceLookup
struct CachedTagResult
{
const static llarp_time_t TTL = 10000;
llarp_time_t lastRequest = 0;
@ -301,12 +320,13 @@ namespace llarp
std::set< IntroSet > result;
Tag tag;
CachedTagResult(Endpoint* p, const Tag& t, uint64_t tx)
: IServiceLookup(p, tx), tag(t)
CachedTagResult(const Tag& t) : tag(t)
{
}
~CachedTagResult();
~CachedTagResult()
{
}
void
Expire(llarp_time_t now);
@ -320,12 +340,39 @@ namespace llarp
}
llarp::routing::IMessage*
BuildRequestMessage();
BuildRequestMessage(uint64_t txid);
bool
HandleResponse(const std::set< IntroSet >& results);
};
struct TagLookupJob : public IServiceLookup
{
TagLookupJob(Endpoint* parent, CachedTagResult* result)
: IServiceLookup(parent, parent->GenTXID(), "taglookup")
, m_result(result)
{
}
~TagLookupJob()
{
}
llarp::routing::IMessage*
BuildRequestMessage()
{
return m_result->BuildRequestMessage(txid);
}
bool
HandleResponse(const std::set< IntroSet >& results)
{
return m_result->HandleResponse(results);
}
CachedTagResult* m_result;
};
std::unordered_map< Tag, CachedTagResult, Tag::Hash > m_PrefetchedTags;
};
} // namespace service

View File

@ -17,7 +17,7 @@ namespace llarp
virtual bool
GetCachedSessionKeyFor(const ConvoTag& remote,
SharedSecret& secret) const = 0;
const byte_t*& secret) const = 0;
virtual void
PutCachedSessionKeyFor(const ConvoTag& remote,
const SharedSecret& secret) = 0;

View File

@ -20,13 +20,15 @@ namespace llarp
struct IServiceLookup
{
IServiceLookup(ILookupHolder* parent, uint64_t tx);
IServiceLookup() = delete;
virtual ~IServiceLookup(){};
/// handle lookup result
virtual bool
HandleResponse(const std::set< IntroSet >& results) = 0;
HandleResponse(const std::set< IntroSet >& results)
{
return false;
}
/// determine if this request has timed out
bool
@ -47,8 +49,12 @@ namespace llarp
ILookupHolder* parent;
uint64_t txid;
const std::string name;
protected:
IServiceLookup(ILookupHolder* parent, uint64_t tx,
const std::string& name);
llarp_time_t m_created;
};

View File

@ -5,6 +5,7 @@
#include <llarp/crypto.hpp>
#include <llarp/encrypted.hpp>
#include <llarp/routing/message.hpp>
#include <llarp/service/Identity.hpp>
#include <llarp/service/Info.hpp>
#include <llarp/service/Intro.hpp>
#include <llarp/service/handler.hpp>
@ -51,29 +52,32 @@ namespace llarp
/// outer message
struct ProtocolFrame : public llarp::routing::IMessage
{
llarp::PQCipherBlock C;
llarp::Encrypted D;
llarp::PubKey H;
llarp::KeyExchangeNonce N;
llarp::Signature Z;
llarp::service::ConvoTag T;
ProtocolFrame();
ProtocolFrame(const ProtocolFrame& other);
~ProtocolFrame();
ProtocolFrame&
operator=(const ProtocolFrame& other);
bool
EncryptAndSign(llarp_crypto* c, const ProtocolMessage* msg,
byte_t* sharedkey, byte_t* signingkey);
EncryptAndSign(llarp_crypto* c, const ProtocolMessage& msg,
const byte_t* sharedkey, const Identity& localIdent);
bool
AsyncDecryptAndVerify(llarp_logic* logic, llarp_crypto* c,
llarp_threadpool* worker, byte_t* localSecret,
llarp_threadpool* worker,
const Identity& localIdent,
IDataHandler* handler) const;
bool
DecryptPayloadInto(llarp_crypto* c, byte_t* sharedkey,
ProtocolMessage* into) const;
DecryptPayloadInto(llarp_crypto* c, const byte_t* sharedkey,
ProtocolMessage& into) const;
bool
DecodeKey(llarp_buffer_t key, llarp_buffer_t* val);

View File

@ -13,6 +13,7 @@ namespace llarp
~TunEndpoint();
device* m_tunif;
std::string m_IfName;
};
} // namespace service
} // namespace llarp

View File

@ -29,41 +29,9 @@ namespace llarp
}
};
/// a condition variable that does nothing
struct NullCondition
{
void
wait(NullLock& l)
{
}
void
notify_one()
{
}
void
notify_all()
{
}
template < typename Interval >
void
wait_for(NullLock& l, Interval i)
{
std::this_thread::sleep_for(i);
}
};
#ifdef SHADOW_TESTNET
typedef NullMutex mtx_t;
typedef NullLock lock_t;
typedef NullCondition cond_t;
#else
typedef std::mutex mtx_t;
typedef std::unique_lock< std::mutex > lock_t;
typedef std::condition_variable cond_t;
#endif
struct Mutex
{

Some files were not shown because too many files have changed in this diff Show More