Merge branch 'dev'

4 years ago · 908fa76fca
parent c1bcf317a1 215106e9e6
commit 908fa76fca
4 changed files with 612 additions and 0 deletions
--- a/misc/Makefile
+++ b/misc/Makefile
@ -0,0 +1,6 @@
+all: low_test
+
+CFLAGS=-I ../include/ -m32 -Ofast -Wall -Wextra -DNDEBUG
+
+low_test: low_test.cpp ../src/low_funct.hpp
+	$(CXX) -o low_test low_test.cpp $(CFLAGS)
--- a/misc/low_test.cpp
+++ b/misc/low_test.cpp
@ -0,0 +1,234 @@
+#include <stdio.h>
+#include <stdint.h>
+
+void dump(const char *msg, const uint32_t *x, size_t n)
+{
+	printf("%s", msg);
+	for (size_t i = 0; i < n; i++) {
+		printf("%08x", x[n - 1 - i]);
+	}
+	printf("\n");
+}
+#include "../src/low_funct.hpp"
+
+#define MCL_USE_VINT
+#define MCL_VINT_FIXED_BUFFER
+#define MCL_SIZEOF_UNIT 4
+#define MCL_MAX_BIT_SIZE 768
+#include <mcl/vint.hpp>
+#include <cybozu/test.hpp>
+#include <cybozu/xorshift.hpp>
+#include <cybozu/benchmark.hpp>
+#include <mcl/util.hpp>
+
+const int C = 10000;
+
+template<class RG>
+void setRand(uint32_t *x, size_t n, RG& rg)
+{
+	for (size_t i = 0; i < n; i++) {
+		x[i] = rg.get32();
+	}
+}
+
+/*
+g++ -Ofast -DNDEBUG -Wall -Wextra -m32 -I ./include/ misc/low_test.cpp
+Core i7-8700
+         mulT  karatsuba
+N =  6, 182clk   225clk
+N =  8, 300clk   350clk
+N = 12, 594clk   730clk
+*/
+template<size_t N>
+void mulTest()
+{
+	printf("N=%zd (%zdbit)\n", N, N * 32);
+	cybozu::XorShift rg;
+	uint32_t x[N];
+	uint32_t y[N];
+	uint32_t z[N * 2];
+	for (size_t i = 0; i < 1000; i++) {
+		setRand(x, N, rg);
+		setRand(y, N, rg);
+		// remove MSB
+		x[N - 1] &= 0x7fffffff;
+		y[N - 1] &= 0x7fffffff;
+		mcl::Vint vx, vy;
+		vx.setArray(x, N);
+		vy.setArray(y, N);
+		vx *= vy;
+		mcl::mulT<N>(z, x, y);
+		CYBOZU_TEST_EQUAL_ARRAY(z, vx.getUnit(), N * 2);
+		memset(z, 0, sizeof(z));
+		mcl::karatsubaT<N>(z, x, y);
+		CYBOZU_TEST_EQUAL_ARRAY(z, vx.getUnit(), N * 2);
+	}
+	CYBOZU_BENCH_C("mulT", C, mcl::mulT<N>, z, x, y);
+	CYBOZU_BENCH_C("kara", C, mcl::karatsubaT<N>, z, x, y);
+}
+
+CYBOZU_TEST_AUTO(mulT)
+{
+	mulTest<8>();
+	mulTest<12>();
+}
+
+template<size_t N>
+void sqrTest()
+{
+	printf("N=%zd (%zdbit)\n", N, N * 32);
+	cybozu::XorShift rg;
+	uint32_t x[N];
+	uint32_t y[N * 2];
+	for (size_t i = 0; i < 1000; i++) {
+		setRand(x, N, rg);
+		// remove MSB
+		x[N - 1] &= 0x7fffffff;
+		mcl::Vint vx;
+		vx.setArray(x, N);
+		vx *= vx;
+		mcl::sqrT<N>(y, x);
+		CYBOZU_TEST_EQUAL_ARRAY(y, vx.getUnit(), N * 2);
+	}
+	CYBOZU_BENCH_C("sqrT", C, mcl::sqrT<N>, y, x);
+}
+
+CYBOZU_TEST_AUTO(sqrT)
+{
+	sqrTest<8>();
+	sqrTest<12>();
+}
+
+struct Montgomery {
+	mcl::Vint p_;
+	mcl::Vint R_; // (1 << (pn_ * 64)) % p
+	mcl::Vint RR_; // (R * R) % p
+	uint32_t rp_; // rp * p = -1 mod M = 1 << 64
+	size_t pn_;
+	Montgomery() {}
+	explicit Montgomery(const mcl::Vint& p)
+	{
+		p_ = p;
+		rp_ = mcl::fp::getMontgomeryCoeff(p.getUnit()[0]);
+		pn_ = p.getUnitSize();
+		R_ = 1;
+		R_ = (R_ << (pn_ * 64)) % p_;
+		RR_ = (R_ * R_) % p_;
+	}
+
+	void toMont(mcl::Vint& x) const { mul(x, x, RR_); }
+	void fromMont(mcl::Vint& x) const { mul(x, x, 1); }
+
+	void mul(mcl::Vint& z, const mcl::Vint& x, const mcl::Vint& y) const
+	{
+		const size_t ySize = y.getUnitSize();
+		mcl::Vint c = x * y.getUnit()[0];
+		uint32_t q = c.getUnit()[0] * rp_;
+		c += p_ * q;
+		c >>= sizeof(uint32_t) * 8;
+		for (size_t i = 1; i < pn_; i++) {
+			if (i < ySize) {
+				c += x * y.getUnit()[i];
+			}
+			uint32_t q = c.getUnit()[0] * rp_;
+			c += p_ * q;
+			c >>= sizeof(uint32_t) * 8;
+		}
+		if (c >= p_) {
+			c -= p_;
+		}
+		z = c;
+	}
+	void mod(mcl::Vint& z, const mcl::Vint& xy) const
+	{
+		z = xy;
+		for (size_t i = 0; i < pn_; i++) {
+			uint32_t q = z.getUnit()[0] * rp_;
+			mcl::Vint t = q;
+			z += p_ * t;
+			z >>= 32;
+		}
+		if (z >= p_) {
+			z -= p_;
+		}
+	}
+};
+
+template<size_t N>
+void mulMontTest(const char *pStr)
+{
+	mcl::Vint vp;
+	vp.setStr(pStr);
+	Montgomery mont(vp);
+
+	cybozu::XorShift rg;
+	uint32_t x[N];
+	uint32_t y[N];
+	uint32_t z[N];
+	uint32_t _p[N + 1];
+	uint32_t *const p = _p + 1;
+	vp.getArray(p, N);
+	p[-1] = mont.rp_;
+
+	for (size_t i = 0; i < 1000; i++) {
+		setRand(x, N, rg);
+		setRand(y, N, rg);
+		// remove MSB
+		x[N - 1] &= 0x7fffffff;
+		y[N - 1] &= 0x7fffffff;
+		mcl::Vint vx, vy, vz;
+		vx.setArray(x, N);
+		vy.setArray(y, N);
+		mont.mul(vz, vx, vy);
+		mcl::mulMontT<N>(z, x, y, p);
+		CYBOZU_TEST_EQUAL_ARRAY(z, vz.getUnit(), N);
+
+		mont.mul(vz, vx, vx);
+		mcl::sqrMontT<N>(z, x, p);
+		CYBOZU_TEST_EQUAL_ARRAY(z, vz.getUnit(), N);
+	}
+	CYBOZU_BENCH_C("mulMontT", C, mcl::mulMontT<N>, x, x, y, p);
+	CYBOZU_BENCH_C("sqrMontT", C, mcl::sqrMontT<N>, x, x, p);
+}
+
+template<size_t N>
+void modTest(const char *pStr)
+{
+	mcl::Vint vp;
+	vp.setStr(pStr);
+	Montgomery mont(vp);
+
+	cybozu::XorShift rg;
+	uint32_t xy[N * 2];
+	uint32_t z[N];
+	uint32_t _p[N + 1];
+	uint32_t *const p = _p + 1;
+	vp.getArray(p, N);
+	p[-1] = mont.rp_;
+
+	for (size_t i = 0; i < 1000; i++) {
+		setRand(xy, N * 2, rg);
+		// remove MSB
+		xy[N * 2 - 1] &= 0x7fffffff;
+		mcl::Vint vxy, vz;
+		vxy.setArray(xy, N * 2);
+		mont.mod(vz, vxy);
+		mcl::modT<N>(z, xy, p);
+		CYBOZU_TEST_EQUAL_ARRAY(z, vz.getUnit(), N);
+	}
+	CYBOZU_BENCH_C("modT", C, mcl::modT<N>, z, xy, p);
+}
+
+CYBOZU_TEST_AUTO(mont)
+{
+	const char *pBN254 = "0x2523648240000001ba344d80000000086121000000000013a700000000000013";
+	puts("BN254");
+	mulMontTest<8>(pBN254);
+	modTest<8>(pBN254);
+
+	const char *pBLS12_381 = "0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaab";
+	puts("BLS12");
+	mulMontTest<12>(pBLS12_381);
+	modTest<12>(pBLS12_381);
+}
+
--- a/src/fp.cpp
+++ b/src/fp.cpp
@ -3,6 +3,10 @@
 #include <cybozu/sha2.hpp>
 #include <cybozu/endian.hpp>
 #include <mcl/conversion.hpp>
+#if defined(__EMSCRIPTEN__) && MCL_SIZEOF_UNIT == 4
+#define FOR_WASM
+#include "low_funct.hpp"
+#endif

 #if defined(MCL_STATIC_CODE) || defined(MCL_USE_XBYAK) || (defined(MCL_USE_LLVM) && (CYBOZU_HOST == CYBOZU_HOST_INTEL))

@ -407,6 +411,25 @@ static bool initForMont(Op& op, const Unit *p, Mode mode)
 	return true;
 }

+#ifdef FOR_WASM
+template<size_t N>
+void setWasmOp(Op& op)
+{
+	if (!(op.isMont && !op.isFullBit)) return;
+EM_ASM({console.log($0)}, N);
+//	op.fp_addPre = mcl::addT<N>;
+//	op.fp_subPre = mcl::subT<N>;
+//	op.fpDbl_addPre = mcl::addT<N * 2>;
+//	op.fpDbl_subPre = mcl::subT<N * 2>;
+	op.fp_add = mcl::addModT<N>;
+	op.fp_sub = mcl::subModT<N>;
+	op.fp_mul = mcl::mulMontT<N>;
+	op.fp_sqr = mcl::sqrMontT<N>;
+	op.fpDbl_mulPre = mulT<N>;
+	op.fpDbl_mod = modT<N>;
+}
+#endif
+
 bool Op::init(const mpz_class& _p, size_t maxBitSize, int _xi_a, Mode mode, size_t mclMaxBitSize)
 {
 	if (mclMaxBitSize != MCL_MAX_BIT_SIZE) return false;
@ -547,6 +570,13 @@ bool Op::init(const mpz_class& _p, size_t maxBitSize, int _xi_a, Mode mode, size
 	default:
 		return false;
 	}
+#ifdef FOR_WASM
+	if (N == 8) {
+		setWasmOp<8>(*this);
+	} else if (N == 12) {
+		setWasmOp<12>(*this);
+	}
+#endif
 #ifdef MCL_USE_LLVM
 	if (primeMode == PM_NIST_P192) {
 		fp_mul = &mcl_fp_mulNIST_P192L;
--- a/src/low_funct.hpp
+++ b/src/low_funct.hpp
@ -0,0 +1,342 @@
+#pragma once
+/**
+	@file
+	@author MITSUNARI Shigeo(@herumi)
+	@license modified new BSD license
+	http://opensource.org/licenses/BSD-3-Clause
+	@note for only 32bit not full bit prime version
+	assert((p[N - 1] & 0x80000000) == 0);
+*/
+#include <stdint.h>
+#include <stdlib.h>
+#include <assert.h>
+
+
+namespace mcl {
+
+template<size_t N>
+void copyT(uint32_t y[N], const uint32_t x[N])
+{
+	for (size_t i = 0; i < N; i++) {
+		y[i] = x[i];
+	}
+}
+
+template<size_t N>
+uint32_t shlT(uint32_t y[N], const uint32_t x[N], size_t bit)
+{
+	assert(0 < bit && bit < 32);
+	assert((N % 2) == 0);
+	size_t rBit = sizeof(uint32_t) * 8 - bit;
+	uint32_t keep = x[N - 1];
+	uint32_t prev = keep;
+	for (size_t i = N - 1; i > 0; i--) {
+		uint32_t t = x[i - 1];
+		y[i] = (prev << bit) | (t >> rBit);
+		prev = t;
+	}
+	y[0] = prev << bit;
+	return keep >> rBit;
+}
+
+// [return:y[N]] += x
+template<size_t N>
+inline uint32_t addUnitT(uint32_t y[N], uint32_t x)
+{
+	uint64_t v = uint64_t(y[0]) + x;
+	y[0] = uint32_t(v);
+	uint32_t c = v >> 32;
+	if (c == 0) return 0;
+	for (size_t i = 1; i < N; i++) {
+		v = uint64_t(y[i]) + 1;
+		y[i] = uint32_t(v);
+		if ((v >> 32) == 0) return 0;
+	}
+	return 1;
+}
+
+template<size_t N>
+uint32_t addT(uint32_t z[N], const uint32_t x[N], const uint32_t y[N])
+{
+	uint32_t c = 0;
+	for (size_t i = 0; i < N; i++) {
+		uint64_t v = uint64_t(x[i]) + y[i] + c;
+		z[i] = uint32_t(v);
+		c = uint32_t(v >> 32);
+	}
+	return c;
+}
+
+template<size_t N>
+uint32_t subT(uint32_t z[N], const uint32_t x[N], const uint32_t y[N])
+{
+	uint32_t c = 0;
+	for (size_t i = 0; i < N; i++) {
+		uint64_t v = uint64_t(x[i]) - y[i] - c;
+		z[i] = uint32_t(v);
+		c = uint32_t(v >> 63);
+	}
+	return c;
+}
+
+// [return:z[N]] = x[N] * y
+template<size_t N>
+uint32_t mulUnitT(uint32_t z[N], const uint32_t x[N], uint32_t y)
+{
+	uint32_t H = 0;
+	for (size_t i = 0; i < N; i++) {
+		uint64_t v = uint64_t(x[i]) * y;
+		v += H;
+		z[i] = uint32_t(v);
+		H = uint32_t(v >> 32);
+	}
+	return H;
+}
+
+// [return:z[N]] = z[N] + x[N] * y
+template<size_t N>
+uint32_t addMulUnitT(uint32_t z[N], const uint32_t x[N], uint32_t y)
+{
+	uint32_t H = 0;
+	for (size_t i = 0; i < N; i++) {
+		uint64_t v = uint64_t(x[i]) * y;
+		v += H;
+		v += z[i];
+		z[i] = uint32_t(v);
+		H = uint32_t(v >> 32);
+	}
+	return H;
+}
+
+// z[N * 2] = x[N] * y[N]
+template<size_t N>
+void mulT(uint32_t z[N * 2], const uint32_t x[N], const uint32_t y[N])
+{
+	z[N] = mulUnitT<N>(z, x, y[0]);
+	for (size_t i = 1; i < N; i++) {
+		z[N + i] = addMulUnitT<N>(&z[i], x, y[i]);
+	}
+}
+
+#if 0
+// slower than mulT
+template<size_t N>
+uint32_t mulUnitWithTblT(uint32_t z[N], const uint64_t *tbl_j)
+{
+	uint32_t H = 0;
+	for (size_t i = 0; i < N; i++) {
+		uint64_t v = tbl_j[i];
+		v += H;
+		z[i] = uint32_t(v);
+		H = uint32_t(v >> 32);
+	}
+	return H;
+}
+
+template<size_t N>
+uint32_t addMulUnitWithTblT(uint32_t z[N], const uint64_t *tbl_j)
+{
+	uint32_t H = 0;
+	for (size_t i = 0; i < N; i++) {
+		uint64_t v = tbl_j[i];
+		v += H;
+		v += z[i];
+		z[i] = uint32_t(v);
+		H = uint32_t(v >> 32);
+	}
+	return H;
+}
+
+// y[N * 2] = x[N] * x[N]
+template<size_t N>
+void sqrT(uint32_t y[N * 2], const uint32_t x[N])
+{
+	uint64_t tbl[N * N]; // x[i]x[j]
+	for (size_t i = 0; i < N; i++) {
+		uint64_t xi = x[i];
+		tbl[i * N + i] = xi * xi;
+		for (size_t j = i + 1; j < N; j++) {
+			uint64_t v = xi * x[j];
+			tbl[i * N + j] = v;
+			tbl[j * N + i] = v;
+		}
+	}
+	y[N] = mulUnitWithTblT<N>(y, tbl);
+	for (size_t i = 1; i < N; i++) {
+		y[N + i] = addMulUnitWithTblT<N>(&y[i], tbl + N * i);
+	}
+}
+#endif
+
+/*
+	z[N * 2] = x[N] * y[N]
+	H = N/2
+	W = 1 << (H * 32)
+	x = aW + b, y = cW + d
+	assume a < W/2, c < W/2
+	(aW + b)(cW + d) = acW^2 + (ad + bc)W + bd
+	ad + bc = (a + b)(c + d) - ac - bd < (1 << (N * 32))
+	slower than mulT on Core i7 with -m32 for N <= 12
+*/
+template<size_t N>
+void karatsubaT(uint32_t z[N * 2], const uint32_t x[N], const uint32_t y[N])
+{
+	assert((N % 2) == 0);
+	assert((x[N - 1] & 0x80000000) == 0);
+	assert((y[N - 1] & 0x80000000) == 0);
+	const size_t H = N / 2;
+	uint32_t a_b[H];
+	uint32_t c_d[H];
+	uint32_t c1 = addT<H>(a_b, x, x + H); // a + b
+	uint32_t c2 = addT<H>(c_d, y, y + H); // c + d
+	uint32_t tmp[N];
+	mulT<H>(tmp, a_b, c_d);
+	if (c1) {
+		addT<H>(tmp + H, tmp + H, c_d);
+	}
+	if (c2) {
+		addT<H>(tmp + H, tmp + H, a_b);
+	}
+	mulT<H>(z, x, y); // bd
+	mulT<H>(z + N, x + H, y + H); // ac
+	// c:tmp[N] = (a + b)(c + d)
+	subT<N>(tmp, tmp, z);
+	subT<N>(tmp, tmp, z + N);
+	// c:tmp[N] = ad + bc
+	if (addT<N>(z + H, z + H, tmp)) {
+		addUnitT<H>(z + N + H, 1);
+	}
+}
+
+/*
+	y[N * 2] = x[N] * x[N]
+	(aW + b)^2 = a^2 W + b^2 + 2abW
+	(a+b)^2 - a^2 - b^2
+*/
+template<size_t N>
+void sqrT(uint32_t y[N * 2], const uint32_t x[N])
+{
+	assert((N % 2) == 0);
+	assert((x[N - 1] & 0x80000000) == 0);
+	const size_t H = N / 2;
+	uint32_t a_b[H];
+	uint32_t c = addT<H>(a_b, x, x + H); // a + b
+	uint32_t tmp[N];
+	mulT<H>(tmp, a_b, a_b);
+	if (c) {
+		shlT<H>(a_b, a_b, 1);
+		addT<H>(tmp + H, tmp + H, a_b);
+	}
+	mulT<H>(y, x, x); // b^2
+	mulT<H>(y + N, x + H, x + H); // a^2
+	// tmp[N] = (a + b)^2
+	subT<N>(tmp, tmp, y);
+	subT<N>(tmp, tmp, y + N);
+	// tmp[N] = 2ab
+	if (addT<N>(y + H, y + H, tmp)) {
+		addUnitT<H>(y + N + H, 1);
+	}
+}
+
+template<size_t N>
+void addModT(uint32_t z[N], const uint32_t x[N], const uint32_t y[N], const uint32_t p[N])
+{
+	uint32_t t[N];
+	addT<N>(z, x, y);
+	uint32_t c = subT<N>(t, z, p);
+	if (!c) {
+		copyT<N>(z, t);
+	}
+}
+
+template<size_t N>
+void subModT(uint32_t z[N], const uint32_t x[N], const uint32_t y[N], const uint32_t p[N])
+{
+	uint32_t c = subT<N>(z, x, y);
+	if (c) {
+		addT<N>(z, z, p);
+	}
+}
+
+/*
+	z[N] = Montgomery(x[N], y[N], p[N])
+	@remark : assume p[-1] = rp
+*/
+template<size_t N>
+void mulMontT(uint32_t z[N], const uint32_t x[N], const uint32_t y[N], const uint32_t p[N])
+{
+	const uint32_t rp = p[-1];
+	assert((p[N - 1] & 0x80000000) == 0);
+	uint32_t buf[N * 2];
+	buf[N] = mulUnitT<N>(buf, x, y[0]);
+	uint32_t q = buf[0] * rp;
+	buf[N] += addMulUnitT<N>(buf, p, q);
+	for (size_t i = 1; i < N; i++) {
+		buf[N + i] = addMulUnitT<N>(buf + i, x, y[i]);
+		uint32_t q = buf[i] * rp;
+		buf[N + i] += addMulUnitT<N>(buf + i, p, q);
+	}
+	if (subT<N>(z, buf + N, p)) {
+		copyT<N>(z, buf + N);
+	}
+}
+
+// [return:z[N+1]] = z[N+1] + x[N] * y + (cc << (N * 32))
+template<size_t N>
+uint32_t addMulUnit2T(uint32_t z[N + 1], const uint32_t x[N], uint32_t y, const uint32_t *cc = 0)
+{
+	uint32_t H = 0;
+	for (size_t i = 0; i < N; i++) {
+		uint64_t v = uint64_t(x[i]) * y;
+		v += H;
+		v += z[i];
+		z[i] = uint32_t(v);
+		H = uint32_t(v >> 32);
+	}
+	if (cc) H += *cc;
+	uint64_t v = uint64_t(z[N]);
+	v += H;
+	z[N] = uint32_t(v);
+	return uint32_t(v >> 32);
+}
+
+/*
+	z[N] = Montgomery reduction(y[N], xy[N], p[N])
+	@remark : assume p[-1] = rp
+*/
+template<size_t N>
+void modT(uint32_t y[N], const uint32_t xy[N * 2], const uint32_t p[N])
+{
+	const uint32_t rp = p[-1];
+	assert((p[N - 1] & 0x80000000) == 0);
+	uint32_t buf[N * 2];
+	copyT<N * 2>(buf, xy);
+	uint32_t c = 0;
+	for (size_t i = 0; i < N; i++) {
+		uint32_t q = buf[i] * rp;
+		c = addMulUnit2T<N>(buf + i, p, q, &c);
+	}
+	if (subT<N>(y, buf + N, p)) {
+		copyT<N>(y, buf + N);
+	}
+}
+
+/*
+	z[N] = Montgomery(x[N], y[N], p[N])
+	@remark : assume p[-1] = rp
+*/
+template<size_t N>
+void sqrMontT(uint32_t y[N], const uint32_t x[N], const uint32_t p[N])
+{
+#if 1
+	mulMontT<N>(y, x, x, p);
+#else
+	// slower
+	uint32_t xx[N * 2];
+	sqrT<N>(xx, x);
+	modT<N>(y, xx, p);
+#endif
+}
+
+} // mcl
+