Merge branch 'dev'

4 years ago · 7bfe60c537
parent 4fb3fec3db 98fc193f5f
commit 7bfe60c537
12 changed files with 155 additions and 26 deletions
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@ -0,0 +1,14 @@
+name: test
+on: [push]
+
+jobs:
+  build:
+    name: test
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v2
+    - run: make test_ci DEBUG=1 -j3
+    - run: make clean
+    - run: make test_ci DEBUG=1 -j3 CXX=clang++
+    - run: make clean
+    - run: make test_go
--- a/.travis.yml
+++ b/.travis.yml
@ -1,17 +0,0 @@
-sudo: true
-dist: trusty
-language: cpp
-compiler:
-  - gcc
-  - clang
-addons:
-  apt:
-    packages:
-      - libgmp-dev
-script:
-  - make test_ci DEBUG=1 -j3
-  - make clean
-  - make test_ci CFLAGS_USER=-DMCL_DONT_USE_XBYAK -j3
-  - make clean
-  - make test_go
- 
--- a/common.mk
+++ b/common.mk
@ -91,7 +91,7 @@ else
    CFLAGS_OPT+=-O3
  else
    ifeq ($(shell expr $(GCC_VER) \> 4.6.0),1)
-      CFLAGS_OPT+=-Ofast
+      CFLAGS_OPT+=-O3
    else
      CFLAGS_OPT+=-O3
    endif
--- a/include/mcl/ec.hpp
+++ b/include/mcl/ec.hpp
@ -301,7 +301,7 @@ void dblJacobi(E& R, const E& P)
 	xy += xy; // 4xy^2
 	switch (E::specialA_) {
 	case Zero:
-		F::add(t, x2, x2);
+		F::mul2(t, x2);
 		x2 += t;
 		break;
 	case Minus3:
@ -312,7 +312,7 @@ void dblJacobi(E& R, const E& P)
 			F::sqr(t, t);
 			x2 -= t;
 		}
-		F::add(t, x2, x2);
+		F::mul2(t, x2);
 		x2 += t;
 		break;
 	case GenericA:
@ -325,7 +325,7 @@ void dblJacobi(E& R, const E& P)
 			t *= E::a_;
 		}
 		t += x2;
-		x2 += x2;
+		F::mul2(x2, x2);
 		x2 += t;
 		break;
 	}
@ -337,12 +337,12 @@ void dblJacobi(E& R, const E& P)
 	} else {
 		F::mul(R.z, P.y, P.z);
 	}
-	R.z += R.z;
+	F::mul2(R.z, R.z);
 	F::sub(R.y, xy, R.x);
 	R.y *= x2;
-	y2 += y2;
-	y2 += y2;
-	y2 += y2;
+	F::mul2(y2, y2);
+	F::mul2(y2, y2);
+	F::mul2(y2, y2);
 	R.y -= y2;
 }

--- a/include/mcl/fp.hpp
+++ b/include/mcl/fp.hpp
@ -165,6 +165,8 @@ public:
 		if (sqr == 0) sqr = sqrC;
 		mul2 = fp::func_ptr_cast<void (*)(FpT& y, const FpT& x)>(op_.fp_mul2A_);
 		if (mul2 == 0) mul2 = mul2C;
+		mul9 = fp::func_ptr_cast<void (*)(FpT& y, const FpT& x)>(op_.fp_mul9A_);
+		if (mul9 == 0) mul9 = mul9C;
 #endif
 		*pb = true;
 	}
@ -499,6 +501,8 @@ public:
 	static inline void sqrC(FpT& y, const FpT& x) { op_.fp_sqr(y.v_, x.v_, op_.p); }
 	static void (*mul2)(FpT& y, const FpT& x);
 	static inline void mul2C(FpT& y, const FpT& x) { op_.fp_mul2(y.v_, x.v_, op_.p); }
+	static void (*mul9)(FpT& y, const FpT& x);
+	// y = 9 * x via mulSmall; installed as mul9 when no specialized fp_mul9A_ routine is provided
+	static inline void mul9C(FpT& y, const FpT& x)
+	{
+		mulSmall(y, x, 9);
+	}
 #else
 	static inline void add(FpT& z, const FpT& x, const FpT& y) { op_.fp_add(z.v_, x.v_, y.v_, op_.p); }
 	static inline void sub(FpT& z, const FpT& x, const FpT& y) { op_.fp_sub(z.v_, x.v_, y.v_, op_.p); }
@ -506,9 +510,20 @@ public:
 	static inline void mul(FpT& z, const FpT& x, const FpT& y) { op_.fp_mul(z.v_, x.v_, y.v_, op_.p); }
 	static inline void sqr(FpT& y, const FpT& x) { op_.fp_sqr(y.v_, x.v_, op_.p); }
 	static inline void mul2(FpT& y, const FpT& x) { op_.fp_mul2(y.v_, x.v_, op_.p); }
+	// y = 9 * x, computed through mulSmall
+	static inline void mul9(FpT& y, const FpT& x)
+	{
+		mulSmall(y, x, 9);
+	}
 #endif
 	static inline void addPre(FpT& z, const FpT& x, const FpT& y) { op_.fp_addPre(z.v_, x.v_, y.v_); }
 	static inline void subPre(FpT& z, const FpT& x, const FpT& y) { op_.fp_subPre(z.v_, x.v_, y.v_); }
+	/*
+		z = x * y for a small multiplier y (asserted y <= SmallModp::maxMulN).
+		The unreduced product is formed with fp_mulUnitPre, the multiple of p
+		selected by smallModp from the product's top bits is subtracted, and a
+		final fp_sub with p finishes the reduction.
+		NOTE(review): fp_subPre writes into z.v_; confirm the intermediate value
+		(which can be as large as 2p - 1) fits for primes that fill every Unit.
+	*/
+	static inline void mulSmall(FpT& z, const FpT& x, const uint32_t y)
+	{
+		assert(y <= op_.smallModp.maxMulN);
+		Unit prod[maxSize + 1]; // presumably one Unit wider than x to hold the carry of x * y
+		op_.fp_mulUnitPre(prod, x.v_, y);
+		// table entry (estimated quotient of prod / p) * p
+		const Unit *qp = op_.smallModp.getPmul(op_.smallModp.approxMul(prod));
+		op_.fp_subPre(z.v_, prod, qp);
+		// p is passed both as the subtrahend and as the modulus
+		op_.fp_sub(z.v_, z.v_, op_.p, op_.p);
+	}
 	static inline void mulUnit(FpT& z, const FpT& x, const Unit y)
 	{
 		if (mulSmallUnit(z, x, y)) return;
@ -746,6 +761,7 @@ template<class tag, size_t maxBitSize> void (*FpT<tag, maxBitSize>::neg)(FpT& y,
 template<class tag, size_t maxBitSize> void (*FpT<tag, maxBitSize>::mul)(FpT& z, const FpT& x, const FpT& y);
 template<class tag, size_t maxBitSize> void (*FpT<tag, maxBitSize>::sqr)(FpT& y, const FpT& x);
 template<class tag, size_t maxBitSize> void (*FpT<tag, maxBitSize>::mul2)(FpT& y, const FpT& x);
+template<class tag, size_t maxBitSize> void (*FpT<tag, maxBitSize>::mul9)(FpT& y, const FpT& x);
 #endif

 } // mcl
--- a/include/mcl/gmp_util.hpp
+++ b/include/mcl/gmp_util.hpp
@ -942,6 +942,85 @@ public:
 #endif
 };

+/*
+	x mod p for a small value x < (pMulTblN * p).
+
+	With k = bit size of p, top = x >> (k - 1) indexes idxTbl_, which holds
+	q = (top << (k - 1)) / p. Because (top << (k - 1)) <= x < (top << (k - 1)) + p,
+	q equals the integer quotient of x by p or is one less than it,
+	so x - q * p is always smaller than 2 * p.
+*/
+struct SmallModp {
+	typedef mcl::fp::Unit Unit;
+	static const size_t unitBitSize = sizeof(Unit) * 8;
+	static const size_t maxTblSize = (MCL_MAX_BIT_SIZE + unitBitSize - 1) / unitBitSize + 1;
+	static const size_t maxMulN = 9;
+	static const size_t pMulTblN = maxMulN + 1;
+	int N_; // number of Units that hold p
+	uint32_t shiftL_; // unitBitSize - shiftR_
+	uint32_t shiftR_; // (pBitSize - 1) % unitBitSize
+	uint32_t maxIdx_; // getTop(p * maxMulN), the last index of idxTbl_ filled by init()
+	// pMulTbl_[i] = p * i
+	Unit pMulTbl_[pMulTblN][maxTblSize];
+	// idxTbl_[t] = (t << (pBitSize - 1)) / p
+	uint8_t idxTbl_[pMulTblN * 2];
+	SmallModp()
+		: N_(0)
+		, shiftL_(0)
+		, shiftR_(0)
+		, maxIdx_(0)
+		, pMulTbl_()
+		, idxTbl_()
+	{
+	}
+	// return idxTbl_[x >> (pBitSize - 1)], the estimated quotient of x by p
+	uint32_t approxMul(const Unit *x) const
+	{
+		const uint32_t top = getTop(x);
+		assert(top <= maxIdx_);
+		return idxTbl_[top];
+	}
+	// return the table row holding p * v
+	const Unit *getPmul(size_t v) const
+	{
+		assert(v < pMulTblN);
+		return pMulTbl_[v];
+	}
+	// return x >> (pBitSize - 1), truncated to uint32_t
+	uint32_t getTop(const Unit *x) const
+	{
+		const Unit low = x[N_ - 1] >> shiftR_;
+		/*
+			shiftR_ == 0 gives shiftL_ == unitBitSize, and shifting a Unit by its
+			full width is undefined behavior. In that case x[N_] could only
+			contribute bits at positions >= unitBitSize, which the uint32_t
+			result drops anyway, so the low part alone is the answer.
+		*/
+		if (shiftR_ == 0) return uint32_t(low);
+		return uint32_t(low | (x[N_] << shiftL_));
+	}
+	// convert x, which must fit in a single Unit, to uint32_t
+	uint32_t cvtInt(const mpz_class& x) const
+	{
+		assert(mcl::gmp::getUnitSize(x) <= 1);
+		if (x == 0) {
+			return 0;
+		} else {
+			return uint32_t(mcl::gmp::getUnit(x)[0]);
+		}
+	}
+	// build pMulTbl_ and idxTbl_ for the modulus p
+	void init(const mpz_class& p)
+	{
+		const size_t pBitSize = mcl::gmp::getBitSize(p);
+		N_ = int((pBitSize + unitBitSize - 1) / unitBitSize);
+		shiftR_ = uint32_t((pBitSize - 1) % unitBitSize);
+		shiftL_ = uint32_t(unitBitSize - shiftR_);
+		mpz_class t = 0;
+		for (size_t i = 0; i < pMulTblN; i++) {
+			bool b;
+			mcl::gmp::getArray(&b, pMulTbl_[i], maxTblSize, t);
+			assert(b);
+			(void)b;
+			t += p;
+		}
+		// the largest multiple in the table bounds every index approxMul may see
+		maxIdx_ = getTop(pMulTbl_[pMulTblN - 1]);
+		assert(maxIdx_ < CYBOZU_NUM_OF_ARRAY(idxTbl_));
+
+		for (uint32_t i = 0; i <= maxIdx_; i++) {
+			const uint32_t q = cvtInt((mpz_class(int(i)) << (pBitSize - 1)) / p);
+			// (i << (pBitSize - 1)) <= p * maxMulN for i <= maxIdx_, so q stays a valid pMulTbl_ row
+			assert(q <= maxMulN);
+			idxTbl_[i] = uint8_t(q);
+		}
+	}
+};
+
+
 /*
 	Barrett Reduction
 	for non GMP version
--- a/include/mcl/op.hpp
+++ b/include/mcl/op.hpp
@ -191,6 +191,7 @@ struct Op {
 	uint32_t pmod4;
 	mcl::SquareRoot sq;
 	mcl::Modp modp;
+	mcl::SmallModp smallModp;
 	Unit half[maxUnitSize]; // (p + 1) / 2
 	Unit oneRep[maxUnitSize]; // 1(=inv R if Montgomery)
 	/*
@ -215,6 +216,7 @@ struct Op {
 	void3u fp_mulA_;
 	void2u fp_sqrA_;
 	void2u fp_mul2A_;
+	void2u fp_mul9A_;
 	void3u fp2_addA_;
 	void3u fp2_subA_;
 	void2u fp2_negA_;
@ -304,6 +306,7 @@ struct Op {
 		fp_mulA_ = 0;
 		fp_sqrA_ = 0;
 		fp_mul2A_ = 0;
+		fp_mul9A_ = 0;
 		fp2_addA_ = 0;
 		fp2_subA_ = 0;
 		fp2_negA_ = 0;
--- a/misc/snark-p.py
+++ b/misc/snark-p.py
@ -0,0 +1,13 @@
+p=21888242871839275222246405745257275088696311157297823662689037894645226208583
+
+print("over 253 bit")
+for i in range (10):
+	print(i, (p * i) >> 253)
+
+def maxarg(x):
+	return x // p
+
+print("maxarg")
+for i in range(16):
+	print(i, maxarg(i << 253))
+
--- a/readme.md
+++ b/readme.md
@ -1,4 +1,4 @@
-[![Build Status](https://api.travis-ci.com/herumi/mcl.svg?branch=master)](https://travis-ci.com/github/herumi/mcl)
+[![Build Status](https://github.com/herumi/mcl/actions/workflows/main.yml/badge.svg)](https://github.com/herumi/mcl/actions/workflows/main.yml)

 # mcl

--- a/src/fp.cpp
+++ b/src/fp.cpp
@ -639,6 +639,7 @@ bool Op::init(const mpz_class& _p, size_t maxBitSize, int _xi_a, Mode mode, size
 		if (!b) return false;
 	}
 	modp.init(mp);
+	smallModp.init(mp);
 	return fp::initForMont(*this, p, mode);
 }

--- a/test/bench.hpp
+++ b/test/bench.hpp
@ -116,6 +116,10 @@ void testBench(const G1& P, const G2& Q)
 	CYBOZU_BENCH_C("Fp::sub       ", C3, Fp::sub, x, x, y);
 	CYBOZU_BENCH_C("Fp::add 2     ", C3, Fp::add, x, x, x);
 	CYBOZU_BENCH_C("Fp::mul2      ", C3, Fp::mul2, x, x);
+	CYBOZU_BENCH_C("Fp::mulSmall8 ", C3, Fp::mulSmall, x, x, 8);
+	CYBOZU_BENCH_C("Fp::mulUnit8  ", C3, Fp::mulUnit, x, x, 8);
+	CYBOZU_BENCH_C("Fp::mul9      ", C3, Fp::mul9, x, x);
+	CYBOZU_BENCH_C("Fp::mulUnit9  ", C3, Fp::mulUnit, x, x, 9);
 	CYBOZU_BENCH_C("Fp::neg       ", C3, Fp::neg, x, x);
 	CYBOZU_BENCH_C("Fp::mul       ", C3, Fp::mul, x, x, y);
 	CYBOZU_BENCH_C("Fp::sqr       ", C3, Fp::sqr, x, x);
--- a/test/common_test.hpp
+++ b/test/common_test.hpp
@ -183,8 +183,24 @@ void testFp2Dbl_mul_xi1()
 	}
 }

+/*
+	Compare Fp::mulSmall(z1, x, y) with the generic product x * y
+	for every multiplier y in [0, 10), using 40 values of x drawn from
+	a cybozu::XorShift generator per multiplier.
+*/
+void testMulSmall()
+{
+	puts("testMulSmall");
+	const int multiplierEnd = 10;
+	const int trialsPerMultiplier = 40;
+	cybozu::XorShift rng;
+	for (int mult = 0; mult < multiplierEnd; ++mult) {
+		for (int trial = 0; trial < trialsPerMultiplier; ++trial) {
+			Fp src;
+			src.setByCSPRNG(rng);
+			// z1/z2 keep their names: CYBOZU_TEST_EQUAL reports the argument text on failure
+			Fp z1;
+			Fp::mulSmall(z1, src, mult);
+			const Fp z2 = src * mult;
+			CYBOZU_TEST_EQUAL(z1, z2);
+		}
+	}
+}
+
 void testCommon(const G1& P, const G2& Q)
 {
+	testMulSmall();
 	testFp2Dbl_mul_xi1();
 	testABCD();
 	testMul2();