add gmp version of Mont with not full prime

8 years ago · 389bb658ef
parent 21324c9e0a
commit 389bb658ef
6 changed files with 131 additions and 55 deletions
--- a/readme.md
+++ b/readme.md
@ -89,7 +89,9 @@ make MCL_USE_LLVM=1 LLVM_VER=-3.8 ARCH=arm

 A benchmark of a BN curve over the 254-bit prime.
 * x64, x86 ; Inte Core i7-6700 3.4GHz(Skylake)
-    * `sudo cpufreq-set -g performance`
+    ```
+    sudo cpufreq-set -g performance
+    ```
 * arm ; 900MHz quad-core ARM Cortex-A7 on Raspberry Pi2, Linux 4.4.11-v7+
 * arm64 ; 1.2GHz ARM Cortex-A53 [HiKey](http://www.96boards.org/product/hikey/)

--- a/src/fp.cpp
+++ b/src/fp.cpp
@ -191,8 +191,13 @@ void setOp2(Op& op)
 		op.fp_sub = Sub<N, false, Tag>::f;
 	}
 	if (op.isMont) {
-		op.fp_mul = Mont<N, Tag>::f;
-		op.fp_sqr = SqrMont<N, Tag>::f;
+		if (op.isFullBit) {
+			op.fp_mul = Mont<N, true, Tag>::f;
+			op.fp_sqr = SqrMont<N, true, Tag>::f;
+		} else {
+			op.fp_mul = Mont<N, false, Tag>::f;
+			op.fp_sqr = SqrMont<N, false, Tag>::f;
+		}
 		op.fpDbl_mod = MontRed<N, Tag>::f;
 	} else {
 		op.fp_mul = Mul<N, Tag>::f;
--- a/src/gen.cpp
+++ b/src/gen.cpp
@ -738,7 +738,7 @@ struct Code : public mcl::Generator {
 		generic_fpDbl_mul(py, px, px);
 		endFunc();
 	}
-	void gen_mcl_fp_mont()
+	void gen_mcl_fp_mont(bool isFullBit = true)
 	{
 		const int bu = bit + unit;
 		const int bu2 = bit + unit * 2;
@ -747,39 +747,75 @@ struct Code : public mcl::Generator {
 		Operand px(IntPtr, unit);
 		Operand py(IntPtr, unit);
 		Operand pp(IntPtr, unit);
-		std::string name = "mcl_fp_mont" + cybozu::itoa(N) + "L";
+		std::string name = "mcl_fp_mont";
+		if (!isFullBit) {
+			name += "NF";
+		}
+		name += cybozu::itoa(N) + "L";
 		mcl_fp_montM[N] = Function(name, Void, pz, px, py, pp);
 		mcl_fp_montM[N].setAlias();
 		verifyAndSetPrivate(mcl_fp_montM[N]);
 		beginFunc(mcl_fp_montM[N]);
 		Operand rp = load(getelementptr(pp, -1));
-		Operand p = loadN(pp, N);
 		Operand z, s, a;
-		for (uint32_t i = 0; i < N; i++) {
-			Operand y = load(getelementptr(py, i));
-			Operand xy = call(mulPvM[bit], px, y);
-			Operand at;
-			if (i == 0) {
-				a = zext(xy, bu2);
-				at = trunc(xy, unit);
-			} else {
-				xy = zext(xy, bu2);
-				a = add(s, xy);
-				at = trunc(a, unit);
+		if (1 || isFullBit) {
+			for (uint32_t i = 0; i < N; i++) {
+				Operand y = load(getelementptr(py, i));
+				Operand xy = call(mulPvM[bit], px, y);
+				Operand at;
+				if (i == 0) {
+					a = zext(xy, bu2);
+					at = trunc(xy, unit);
+				} else {
+					xy = zext(xy, bu2);
+					a = add(s, xy);
+					at = trunc(a, unit);
+				}
+				Operand q = mul(at, rp);
+				Operand pq = call(mulPvM[bit], pp, q);
+				pq = zext(pq, bu2);
+				Operand t = add(a, pq);
+				s = lshr(t, unit);
 			}
-			Operand q = mul(at, rp);
-			Operand pq = call(mulPvM[bit], pp, q);
-			pq = zext(pq, bu2);
-			Operand t = add(a, pq);
-			s = lshr(t, unit);
+			s = trunc(s, bu);
+			Operand p = zext(loadN(pp, N), bu);
+			Operand vc = sub(s, p);
+			Operand c = trunc(lshr(vc, bit), 1);
+			z = select(c, s, vc);
+			z = trunc(z, bit);
+			storeN(z, pz);
+		} else {
+			for (uint32_t i = 0; i < N; i++) {
+				Operand y = load(getelementptr(py, i));
+				Operand xy = call(mulPvM[bit], px, y);
+				Operand at;
+				if (i == 0) {
+					a = xy;
+					at = trunc(xy, unit);
+					Operand q = mul(at, rp);
+					Operand pq = call(mulPvM[bit], pp, q);
+					pq = zext(pq, bu2);
+					Operand t = add(a, pq);
+					s = lshr(t, unit);
+				} else {
+					xy = zext(xy, bu2);
+					a = add(s, xy);
+					at = trunc(a, unit);
+					Operand q = mul(at, rp);
+					Operand pq = call(mulPvM[bit], pp, q);
+					pq = zext(pq, bu2);
+					Operand t = add(a, pq);
+					s = lshr(t, unit);
+				}
+			}
+			s = trunc(s, bu);
+			Operand p = zext(loadN(pp, N), bu);
+			Operand vc = sub(s, p);
+			Operand c = trunc(lshr(vc, bit), 1);
+			z = select(c, s, vc);
+			z = trunc(z, bit);
+			storeN(z, pz);
 		}
-		s = trunc(s, bu);
-		p = zext(p, bu);
-		Operand vc = sub(s, p);
-		Operand c = trunc(lshr(vc, bit), 1);
-		z = select(c, s, vc);
-		z = trunc(z, bit);
-		storeN(z, pz);
 		ret(Void);
 		endFunc();
 	}
@ -840,7 +876,8 @@ struct Code : public mcl::Generator {
 		gen_mcl_fp_mulUnitPre();
 		gen_mcl_fpDbl_mulPre();
 		gen_mcl_fpDbl_sqrPre();
-		gen_mcl_fp_mont();
+		gen_mcl_fp_mont(true);
+		gen_mcl_fp_mont(false);
 		gen_mcl_fp_montRed();
 	}
 	void setBit(uint32_t bit)
--- a/src/low_func.hpp
+++ b/src/low_func.hpp
@ -507,7 +507,7 @@ const void3u MontRed<N, Tag>::f = MontRed<N, Tag>::func;
 	z[N] <- Montgomery(x[N], y[N], p[N])
 	REMARK : assume p[-1] = rp
 */
-template<size_t N, class Tag = Gtag>
+template<size_t N, bool isFullBit, class Tag = Gtag>
 struct Mont {
 	static inline void func(Unit *z, const Unit *x, const Unit *y, const Unit *p)
 	{
@ -517,26 +517,56 @@ struct Mont {
 		MontRed<N, Tag>::f(z, xy, p);
 #else
 		const Unit rp = p[-1];
-		Unit buf[N * 2 + 2];
-		Unit *c = buf;
-		MulUnitPre<N, Tag>::f(c, x, y[0]); // x * y[0]
-		Unit q = c[0] * rp;
-		Unit t[N + 2];
-		MulUnitPre<N, Tag>::f(t, p, q); // p * q
-		t[N + 1] = 0; // always zero
-		c[N + 1] = AddPre<N + 1, Tag>::f(c, c, t);
-		c++;
-		for (size_t i = 1; i < N; i++) {
-			MulUnitPre<N, Tag>::f(t, x, y[i]);
+		if (isFullBit) {
+			Unit buf[N * 2 + 2];
+			Unit *c = buf;
+			MulUnitPre<N, Tag>::f(c, x, y[0]); // x * y[0]
+			Unit q = c[0] * rp;
+			Unit t[N + 2];
+			MulUnitPre<N, Tag>::f(t, p, q); // p * q
+			t[N + 1] = 0; // always zero
 			c[N + 1] = AddPre<N + 1, Tag>::f(c, c, t);
-			q = c[0] * rp;
-			MulUnitPre<N, Tag>::f(t, p, q);
-			AddPre<N + 2, Tag>::f(c, c, t);
 			c++;
-		}
-		if (c[N]) {
-			SubPre<N, Tag>::f(z, c, p);
+			for (size_t i = 1; i < N; i++) {
+				MulUnitPre<N, Tag>::f(t, x, y[i]);
+				c[N + 1] = AddPre<N + 1, Tag>::f(c, c, t);
+				q = c[0] * rp;
+				MulUnitPre<N, Tag>::f(t, p, q);
+				AddPre<N + 2, Tag>::f(c, c, t);
+				c++;
+			}
+			if (c[N]) {
+				SubPre<N, Tag>::f(z, c, p);
+			} else {
+				if (SubPre<N, Tag>::f(z, c, p)) {
+					memcpy(z, c, N * sizeof(Unit));
+				}
+			}
 		} else {
+			Unit carry;
+			(void)carry;
+			Unit buf[N * 2 + 1];
+			Unit *c = buf;
+			MulUnitPre<N, Tag>::f(c, x, y[0]); // x * y[0]
+			Unit q = c[0] * rp;
+			Unit t[N + 1];
+			MulUnitPre<N, Tag>::f(t, p, q); // p * q
+			carry = AddPre<N + 1, Tag>::f(c, c, t);
+			assert(carry == 0);
+			c++;
+			c[N] = 0;
+			for (size_t i = 1; i < N; i++) {
+				c[N + 1] = 0;
+				MulUnitPre<N, Tag>::f(t, x, y[i]);
+				carry = AddPre<N + 1, Tag>::f(c, c, t);
+				assert(carry == 0);
+				q = c[0] * rp;
+				MulUnitPre<N, Tag>::f(t, p, q);
+				carry = AddPre<N + 1, Tag>::f(c, c, t);
+				assert(carry == 0);
+				c++;
+			}
+			assert(c[N] == 0);
 			if (SubPre<N, Tag>::f(z, c, p)) {
 				memcpy(z, c, N * sizeof(Unit));
 			}
@ -546,11 +576,11 @@ struct Mont {
 	static const void4u f;
 };

-template<size_t N, class Tag>
-const void4u Mont<N, Tag>::f = Mont<N, Tag>::func;
+template<size_t N, bool isFullBit, class Tag>
+const void4u Mont<N, isFullBit, Tag>::f = Mont<N, isFullBit, Tag>::func;

 // z[N] <- Montgomery(x[N], x[N], p[N])
-template<size_t N, class Tag = Gtag>
+template<size_t N, bool isFullBit, class Tag = Gtag>
 struct SqrMont {
 	static inline void func(Unit *y, const Unit *x, const Unit *p)
 	{
@ -559,13 +589,13 @@ struct SqrMont {
 		SqrPre<N, Tag>::f(xx, x);
 		MontRed<N, Tag>::f(y, xx, p);
 #else
-		Mont<N, Tag>::f(y, x, x, p);
+		Mont<N, isFullBit, Tag>::f(y, x, x, p);
 #endif
 	}
 	static const void3u f;
 };
-template<size_t N, class Tag>
-const void3u SqrMont<N, Tag>::f = SqrMont<N, Tag>::func;
+template<size_t N, bool isFullBit, class Tag>
+const void3u SqrMont<N, isFullBit, Tag>::f = SqrMont<N, isFullBit, Tag>::func;

 // z[N] <- (x[N] * y[N]) % p[N]
 template<size_t N, class Tag = Gtag>
--- a/src/low_func_llvm.hpp
+++ b/src/low_func_llvm.hpp
@ -35,7 +35,8 @@ template<>const void4u Add<n, true, Ltag>::f = &mcl_fp_add ## n ## L; \
 template<>const void4u Add<n, false, Ltag>::f = &mcl_fp_addNF ## n ## L; \
 template<>const void4u Sub<n, true, Ltag>::f = &mcl_fp_sub ## n ## L; \
 template<>const void4u Sub<n, false, Ltag>::f = &mcl_fp_subNF ## n ## L; \
-template<>const void4u Mont<n, Ltag>::f = &mcl_fp_mont ## n ## L; \
+template<>const void4u Mont<n, true, Ltag>::f = &mcl_fp_mont ## n ## L; \
+template<>const void4u Mont<n, false, Ltag>::f = &mcl_fp_mont ## n ## L; \
 template<>const void3u MontRed<n, Ltag>::f = &mcl_fp_montRed ## n ## L; \
 template<>const void4u DblAdd<n, Ltag>::f = &mcl_fpDbl_add ## n ## L; \
 template<>const void4u DblSub<n, Ltag>::f = &mcl_fpDbl_sub ## n ## L; \
--- a/src/proto.hpp
+++ b/src/proto.hpp
@ -20,6 +20,7 @@ void mcl_fp_mulUnitPre ## n ## suf(mcl::fp::Unit* z, const mcl::fp::Unit* x, mcl
 void mcl_fpDbl_mulPre ## n ## suf(mcl::fp::Unit* z, const mcl::fp::Unit* x, const mcl::fp::Unit* y); \
 void mcl_fpDbl_sqrPre ## n ## suf(mcl::fp::Unit* y, const mcl::fp::Unit* x); \
 void mcl_fp_mont ## n ## suf(mcl::fp::Unit* z, const mcl::fp::Unit* x, const mcl::fp::Unit* y, const mcl::fp::Unit* p); \
+void mcl_fp_montNF ## n ## suf(mcl::fp::Unit* z, const mcl::fp::Unit* x, const mcl::fp::Unit* y, const mcl::fp::Unit* p); \
 void mcl_fp_montRed ## n ## suf(mcl::fp::Unit* z, const mcl::fp::Unit* xy, const mcl::fp::Unit* p); \
 void mcl_fpDbl_add ## n ## suf(mcl::fp::Unit* z, const mcl::fp::Unit* x, const mcl::fp::Unit* y, const mcl::fp::Unit* p); \
 void mcl_fpDbl_sub ## n ## suf(mcl::fp::Unit* z, const mcl::fp::Unit* x, const mcl::fp::Unit* y, const mcl::fp::Unit* p);