fast Fp4::sqrPre

4 years ago · bc4ed309e3
parent 285b44546a
commit bc4ed309e3
2 changed files with 76 additions and 2 deletions
--- a/src/fp_generator.hpp
+++ b/src/fp_generator.hpp
@ -1936,12 +1936,83 @@ private:
 		mul2x2(px, py, t4, t3, t2, t1, t0);
 		store_mr(pz, Pack(t3, t2, t1, t0));
 	}
+	/*
+		Emit code for py[7..0] = px[3..0] ^ 2 using mulx.
+		t must provide at least 11 registers (t[0] .. t[10]);
+		rax and rdx are used as scratch by the emitted code.
+		(i, j) denotes the partial product x[i] * x[j]; layout of the result words:
+		(3, 3)(2, 2)(1, 1)(0, 0)
+		   t5 t4 t3 t2 t1 t0
+		   (3, 2)(2, 1)(1, 0)x2
+		      (3, 1)(2, 0)x2
+		         (3, 0)x2
+		Step 1: the six cross products are summed once into [t5:t4:t3:t2:t1:t0]
+		        (t0 lines up with result word 1, t5 with result word 6).
+		Step 2: that sum is doubled by shl1; H presumably receives the bit shifted
+		        out of t5 -- TODO confirm against shl1.
+		Step 3: the squares x[i]^2 are computed one at a time and added to the
+		        doubled sum with a single carry chain while storing to py.
+		mov and mulx do not modify the carry flag, which is why they can sit
+		between the add/adc instructions of a chain.
+	*/
+	void sqrPre4NF(const Reg64& py, const Reg64& px, const Pack& t)
+	{
+		const Reg64& t0 = t[0]; // t0..t5 : accumulator for the cross products
+		const Reg64& t1 = t[1];
+		const Reg64& t2 = t[2];
+		const Reg64& t3 = t[3];
+		const Reg64& t4 = t[4];
+		const Reg64& t5 = t[5];
+		const Reg64& x0 = t[6]; // x0..x3 : register copy of the input words
+		const Reg64& x1 = t[7];
+		const Reg64& x2 = t[8];
+		const Reg64& x3 = t[9];
+		const Reg64& H = t[10]; // top word produced by the doubling step
+
+		load_rm(Pack(x3, x2, x1, x0), px); // presumably x[3..0] <- px[3..0]; verify against load_rm
+		mov(rdx, x0); // mulx multiplies its source operand by rdx
+		mulx(t3, t2, x3); // (3, 0) : [t3:t2] = x3 * x0
+		mulx(rax, t1, x2); // (2, 0) : [rax:t1] = x2 * x0
+		add(t2, rax); // starts a carry chain (t2 -> t3 -> t4)
+		mov(rdx, x1);
+		mulx(t4, rax, x3); // (3, 1) : [t4:rax] = x3 * x1
+		adc(t3, rax);
+		adc(t4, 0); // [t4:t3:t2:t1]
+		mulx(rax, t0, x0); // (1, 0) : [rax:t0] = x1 * x0 (rdx still holds x1)
+		add(t1, rax); // starts a second carry chain (t1 -> t2 -> t3 -> t4 -> t5)
+		mulx(rdx, rax, x2); // (2, 1) : [rdx:rax] = x2 * x1; rdx (x1) is overwritten by the high word
+		adc(t2, rax);
+		adc(t3, rdx);
+		mov(rdx, x3); // carry from adc(t3, rdx) stays pending across mov/mulx
+		mulx(t5, rax, x2); // (3, 2) : [t5:rax] = x2 * x3
+		adc(t4, rax);
+		adc(t5, 0); // [t5:t4:t3:t2:t1:t0] = sum of all cross products
+
+		shl1(Pack(t5, t4, t3, t2, t1, t0), &H); // double the cross-product sum; presumably [H:t5:..:t0] = [t5:..:t0] << 1 -- confirm in shl1
+		mov(rdx, x0);
+		mulx(rdx, rax, rdx); // (0, 0) : [rdx:rax] = x0 * x0
+		mov(ptr[py + 8 * 0], rax); // word 0 receives no cross-product contribution
+		add(rdx, t0); // starts the final carry chain that runs through word 7
+		mov(ptr[py + 8 * 1], rdx);
+		mov(rdx, x1);
+		mulx(rdx, rax, rdx); // (1, 1) : [rdx:rax] = x1 * x1
+		adc(rax, t1);
+		mov(ptr[py + 8 * 2], rax);
+		adc(rdx, t2);
+		mov(ptr[py + 8 * 3], rdx);
+		mov(rdx, x2);
+		mulx(rdx, rax, rdx); // (2, 2) : [rdx:rax] = x2 * x2
+		adc(rax, t3);
+		mov(ptr[py + 8 * 4], rax);
+		adc(rdx, t4);
+		mov(ptr[py + 8 * 5], rdx);
+		mov(rdx, x3);
+		mulx(rdx, rax, rdx); // (3, 3) : [rdx:rax] = x3 * x3
+		adc(rax, t5);
+		mov(ptr[py + 8 * 6], rax);
+		adc(rdx, H); // top word: high half of x3^2 + H + carry
+		mov(ptr[py + 8 * 7], rdx);
+	}
 	/*
 		py[7..0] = px[3..0] ^ 2
 		use xmm0
 	*/
-	void sqrPre4(const RegExp& py, const RegExp& px, const Pack& t)
+	void sqrPre4(const Reg64& py, const Reg64& px, const Pack& t)
 	{
+#if 1
+		if (useMulx_ && useAdx_) {
+			sqrPre4NF(py, px, t);
+			return;
+		}
+#endif
 		const Reg64& t0 = t[0];
 		const Reg64& t1 = t[1];
 		const Reg64& t2 = t[2];
@ -2250,7 +2321,6 @@ private:
 	/*
 		@input (z, xy)
 		z[5..0] <- montgomery reduction(x[11..0])
-		use xm0, xm1, xm2
 	*/
 	void gen_fpDbl_mod6(const Reg64& z, const Reg64& xy, const Pack& t)
 	{
--- a/test/fp_tower_test.cpp
+++ b/test/fp_tower_test.cpp
@ -453,11 +453,15 @@ void testAll()
 		"0x0000000000000001000000000000000000000000000000000000000000000085", // min prime
 		"0x2523648240000001ba344d80000000086121000000000013a700000000000013",
 		"0x7523648240000001ba344d80000000086121000000000013a700000000000017",
+		// max prime less than 2**256/4
+		"0x3fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff0b",
 		"0x800000000000000000000000000000000000000000000000000000000000005f",
 		"0xffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff43", // max prime
 #if MCL_MAX_BIT_SIZE >= 384
 		// N = 6
 		"0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaab",
+		// max prime less than 2**384/4
+		"0x3fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff97",
 		"0xfffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffeffffffff0000000000000000ffffffff",
 #endif
 #if MCL_MAX_BIT_SIZE >= 768