remove non-mulx-code in jit

4 years ago · c4a356038a
parent 3f9cce8741
commit c4a356038a
1 changed files with 134 additions and 431 deletions
--- a/src/fp_generator.hpp
+++ b/src/fp_generator.hpp
@ -588,58 +588,34 @@ private:
 			mov(ptr [pz + 8], rax);
 			return;
 		}
-		if (useMulx_) {
+		assert(wk.size() > 0 && wk.isReg(0));
-			assert(wk.size() > 0 && wk.isReg(0));
+		const Reg64& t1 = wk.getReg(0);
-			const Reg64& t1 = wk.getReg(0);
+		// mulx(H, L, x) = [H:L] = x * rdx
-			// mulx(H, L, x) = [H:L] = x * rdx
+		mov(rdx, y);
-			mov(rdx, y);
+		mulx(t1, rax, ptr [px]); // [t1:rax] = px[0] * y
-			mulx(t1, rax, ptr [px]); // [t1:rax] = px[0] * y
+		mov(ptr [pz], rax);
-			mov(ptr [pz], rax);
+		const Reg64 *pt0 = &t;
-			const Reg64 *pt0 = &t;
+		const Reg64 *pt1 = &t1;
 			const Reg64 *pt1 = &t1;
 			for (size_t i = 1; i < n - 1; i++) {
 				mulx(*pt0, rax, ptr [px + i * 8]);
 				if (i == 1) {
 					add(rax, *pt1);
 				} else {
 					adc(rax, *pt1);
 				}
 				mov(ptr [pz + i * 8], rax);
 				std::swap(pt0, pt1);
 			}
 			mulx(rdx, rax, ptr [px + (n - 1) * 8]);
 			adc(rax, *pt1);
 			mov(ptr [pz + (n - 1) * 8], rax);
 			adc(rdx, 0);
 			return;
 		}
 		assert(wk.size() >= n - 1);
 		for (size_t i = 0; i < n; i++) {
 			mov(rax, ptr [px + i * 8]);
 			mul(y);
 			if (i < n - 1) {
 				mov(ptr [pz + i * 8], rax);
 				g_mov(wk[i], rdx);
 			}
 		}
 		for (size_t i = 1; i < n - 1; i++) {
-			mov(t, ptr [pz + i * 8]);
+			mulx(*pt0, rax, ptr [px + i * 8]);
 			if (i == 1) {
-				g_add(t, wk[i - 1]);
+				add(rax, *pt1);
 			} else {
-				g_adc(t, wk[i - 1]);
+				adc(rax, *pt1);
 			}
-			mov(ptr [pz + i * 8], t);
+			mov(ptr [pz + i * 8], rax);
 			std::swap(pt0, pt1);
 		}
-		g_adc(rax, wk[n - 2]);
+		mulx(rdx, rax, ptr [px + (n - 1) * 8]);
 		adc(rax, *pt1);
 		mov(ptr [pz + (n - 1) * 8], rax);
 		adc(rdx, 0);
 	}
 	void gen_mulUnit()
 	{
 //		assert(pn_ >= 2);
-		const int regNum = useMulx_ ? 2 : (1 + (std::min)(pn_ - 1, 8));
+		const int regNum = 2;
-		const int stackSize = useMulx_ ? 0 : (pn_ - 1) * 8;
+		const int stackSize = 0;
 		StackFrame sf(this, 3, regNum | UseRDX, stackSize);
 		const Reg64& pz = sf.p[0];
 		const Reg64& px = sf.p[1];
@ -870,7 +846,7 @@ private:
 			gen_montMul4();
 			return func;
 		}
-		if (pn_ == 6 && !isFullBit_ && useMulx_ && useAdx_) {
+		if (pn_ == 6 && !isFullBit_) {
 #if 1
 			// a little faster
 			gen_montMul6();
@ -907,7 +883,6 @@ private:
 		const Reg64& t0 = sf.t[0];
 		const Reg64& t1 = sf.t[1];
 		const Reg64& t2 = sf.t[2];
 		const Reg64& t3 = sf.t[3];
 		const Reg64& t4 = sf.t[4];
 		const Reg64& t5 = sf.t[5];
 		const Reg64& t6 = sf.t[6];
@ -925,7 +900,7 @@ private:
 		mov(t7, a); // q
 		// [d:t7:t1] = p * q
-		mul2x1(t0, t7, t1, t8);
+		mul2x1(t0, t7, t1);
 		xor_(t8, t8);
 		if (isFullBit_) {
@ -944,7 +919,7 @@ private:
 		mov(t6, a); // q
 		// [d:t6:xy] = p * q
-		mul2x1(t0, t6, xy, t3);
+		mul2x1(t0, t6, xy);
 		add_rr(Pack(t8, t4, t7), Pack(d, t6, xy));
 		// [t8:t4]
@ -994,7 +969,7 @@ private:
 		mov(t7, a); // q
 		// [d:t7:t2:t1] = p * q
-		mul3x1(t0, t7, t4, t2, t1, t8);
+		mul3x1(t0, t7, t2, t1, t8);
 		xor_(t8, t8);
 		xor_(t9, t9);
@ -1014,7 +989,7 @@ private:
 		mov(t10, a); // q
 		// [d:t10:t6:xy] = p * q
-		mul3x1(t0, t10, t1, t6, xy, t3);
+		mul3x1(t0, t10, t6, xy, t3);
 		add_rr(Pack(t8, t4, t7, t2), Pack(d, t10, t6, xy));
 		adc(t9, 0); // [t9:t8:t4:t7]
@ -1027,7 +1002,7 @@ private:
 		mov(t10, a); // q
 		// [d:t10:xy:t6] = p * q
-		mul3x1(t0, t10, t1, xy, t6, t2);
+		mul3x1(t0, t10, xy, t6, t2);
 		add_rr(Pack(t9, t8, t4, t7), Pack(d, t10, xy, t6));
 		// [t9:t8:t4]
@ -1095,7 +1070,7 @@ private:
 	*/
 	void gen_fpDbl_mod4(const Reg64& z, const Reg64& xy, const Pack& t)
 	{
-		if (!isFullBit_ && useMulx_ && useAdx_) {
+		if (!isFullBit_) {
 			gen_fpDbl_mod4NF(z, xy, t);
 			return;
 		}
@ -1123,7 +1098,7 @@ private:
 		mov(t7, a); // q
 		// [d:t7:t3:t2:t1] = p * q
-		mul4x1(t0, t7, t4, t3, t2, t1, t8);
+		mul4x1(t0, t7, t3, t2, t1);
 		xor_(t8, t8);
 		xor_(t9, t9);
@ -1152,7 +1127,7 @@ private:
 		vmovq(xm1, t10);
 		// [d:z:t5:t6:xy] = p * q
-		mul4x1(t0, z, t1, t5, t6, xy, t10);
+		mul4x1(t0, z, t5, t6, xy);
 		vmovq(t10, xm1);
 		add_rr(Pack(t8, t4, t7, t3, t2), Pack(d, z, t5, t6, xy));
@ -1171,7 +1146,7 @@ private:
 		mov(z, a); // q
 		// [d:z:t5:xy:t6] = p * q
-		mul4x1(t0, z, t1, t5, xy, t6, t2);
+		mul4x1(t0, z, t5, xy, t6);
 		add_rr(Pack(t9, t8, t4, t7, t3), Pack(d, z, t5, xy, t6));
 		adc(t10, 0); // c' = [t10:t9:t8:t4:t7]
@ -1187,7 +1162,7 @@ private:
 		mov(z, a); // q
 		// [d:z:t5:xy:t6] = p * q
-		mul4x1(t0, z, t1, t5, xy, t6, t2);
+		mul4x1(t0, z, t5, xy, t6);
 		add_rr(Pack(t10, t9, t8, t4, t7), Pack(d, z, t5, xy, t6));
 		// [t10:t9:t8:t4]
@ -1242,7 +1217,7 @@ private:
 			ret();
 			return func;
 		}
-		if (pn_ == 6 && !isFullBit_ && useMulx_ && useAdx_) {
+		if (pn_ == 6 && !isFullBit_) {
 			StackFrame sf(this, 3, 10 | UseRDX, 0, false);
 			call(fpDbl_modL);
 			sf.close();
@ -1270,7 +1245,7 @@ private:
 			gen_montSqr3();
 			return func;
 		}
-		if (pn_ == 4 && useMulx_) {
+		if (pn_ == 4) {
 #if 0
 			// sqr(y, x) = mul(y, x, x)
 #ifdef XBYAK64_WIN
@ -1290,7 +1265,7 @@ private:
 #endif
 			return func;
 		}
-		if (pn_ == 6 && !isFullBit_ && useMulx_ && useAdx_) {
+		if (pn_ == 6 && !isFullBit_) {
 #if 1
 			StackFrame sf(this, 3, 10 | UseRDX);
 			Pack t = sf.t;
@ -1352,27 +1327,25 @@ private:
 		const Reg64& t5 = sf.t[5];
 		const Reg64& t6 = sf.t[6];
 		const Reg64& t7 = sf.t[7];
 		const Reg64& t8 = sf.t[8];
 		const Reg64& t9 = sf.t[9];
 	L(fp_mulL);
 		vmovq(xm0, p0); // save p0
 		lea(p0, ptr[rip+pL_]);
 		vmovq(xm1, p2);
 		mov(p2, ptr [p2]);
-		montgomery4_1(rp_, t0, t7, t3, t2, t1, p1, p2, p0, t4, t5, t6, t8, t9, true, xm2);
+		montgomery4_1(rp_, t0, t7, t3, t2, t1, p1, p2, p0, t4, t5, t6, true, xm2);
 		vmovq(p2, xm1);
 		mov(p2, ptr [p2 + 8]);
-		montgomery4_1(rp_, t1, t0, t7, t3, t2, p1, p2, p0, t4, t5, t6, t8, t9, false, xm2);
+		montgomery4_1(rp_, t1, t0, t7, t3, t2, p1, p2, p0, t4, t5, t6, false, xm2);
 		vmovq(p2, xm1);
 		mov(p2, ptr [p2 + 16]);
-		montgomery4_1(rp_, t2, t1, t0, t7, t3, p1, p2, p0, t4, t5, t6, t8, t9, false, xm2);
+		montgomery4_1(rp_, t2, t1, t0, t7, t3, p1, p2, p0, t4, t5, t6, false, xm2);
 		vmovq(p2, xm1);
 		mov(p2, ptr [p2 + 24]);
-		montgomery4_1(rp_, t3, t2, t1, t0, t7, p1, p2, p0, t4, t5, t6, t8, t9, false, xm2);
+		montgomery4_1(rp_, t3, t2, t1, t0, t7, p1, p2, p0, t4, t5, t6, false, xm2);
 		// [t7:t3:t2:t1:t0]
 		mov(t4, t0);
@ -1475,7 +1448,7 @@ private:
 	*/
 	void gen_montMul6()
 	{
-		assert(!isFullBit_ && useMulx_ && useAdx_);
+		assert(!isFullBit_);
 		StackFrame sf(this, 3, 10 | UseRDX, 0, false);
 		call(fp_mulL);
 		sf.close();
@ -1543,12 +1516,12 @@ private:
 		lea(t7, ptr[rip+pL_]);
 		mov(t9, ptr [p2]);
 		//                c3, c2, c1, c0, px, y,  p,
-		montgomery3_1(rp_, t0, t3, t2, t1, p1, t9, t7, t4, t5, t6, t8, p0, true);
+		montgomery3_1(rp_, t0, t3, t2, t1, p1, t9, t7, t4, t5, t8, p0, true);
 		mov(t9, ptr [p2 + 8]);
-		montgomery3_1(rp_, t1, t0, t3, t2, p1, t9, t7, t4, t5, t6, t8, p0, false);
+		montgomery3_1(rp_, t1, t0, t3, t2, p1, t9, t7, t4, t5, t8, p0, false);
 		mov(t9, ptr [p2 + 16]);
-		montgomery3_1(rp_, t2, t1, t0, t3, p1, t9, t7, t4, t5, t6, t8, p0, false);
+		montgomery3_1(rp_, t2, t1, t0, t3, p1, t9, t7, t4, t5, t8, p0, false);
 		// [(t3):t2:t1:t0]
 		mov(t4, t0);
@ -1590,19 +1563,19 @@ private:
 		mov(t9, ptr [px]);
 		mul3x1_sqr1(px, t9, t3, t2, t1, t0);
 		mov(t0, rdx);
-		montgomery3_sub(rp_, t0, t9, t2, t1, px, t3, t7, t4, t5, t6, t8, pz, true);
+		montgomery3_sub(rp_, t0, t9, t2, t1, px, t3, t7, t4, t5, t8, pz, true);
 		mov(t3, ptr [px + 8]);
 		mul3x1_sqr2(px, t3, t6, t5, t4);
 		add_rr(Pack(t1, t0, t9, t2), Pack(rdx, rax, t5, t4));
 		if (isFullBit_) setc(pz.cvt8());
-		montgomery3_sub(rp_, t1, t3, t9, t2, px, t0, t7, t4, t5, t6, t8, pz, false);
+		montgomery3_sub(rp_, t1, t3, t9, t2, px, t0, t7, t4, t5, t8, pz, false);
 		mov(t0, ptr [px + 16]);
 		mul3x1_sqr3(t0, t5, t4);
 		add_rr(Pack(t2, t1, t3, t9), Pack(rdx, rax, t5, t4));
 		if (isFullBit_) setc(pz.cvt8());
-		montgomery3_sub(rp_, t2, t0, t3, t9, px, t1, t7, t4, t5, t6, t8, pz, false);
+		montgomery3_sub(rp_, t2, t0, t3, t9, px, t1, t7, t4, t5, t8, pz, false);
 		// [t9:t2:t0:t3]
 		mov(t4, t3);
@ -1636,58 +1609,25 @@ private:
 		const Reg64& t9 = t[9];
 		const Reg64& t10 = t[10];
-		if (useMulx_) {
+		mov(d, ptr [px + 8 * 0]);
-			mov(d, ptr [px + 8 * 0]);
+		mulx(t0, a, d);
-			mulx(t0, a, d);
+		mov(ptr [py + 8 * 0], a);
 			mov(ptr [py + 8 * 0], a);
-			mov(t7, ptr [px + 8 * 1]);
+		mov(t7, ptr [px + 8 * 1]);
-			mov(t9, ptr [px + 8 * 2]);
+		mov(t9, ptr [px + 8 * 2]);
-			mulx(t2, t1, t7);
+		mulx(t2, t1, t7);
-			mulx(t4, t3, t9);
+		mulx(t4, t3, t9);
-			mov(t5, t2);
+		mov(t5, t2);
-			mov(t6, t4);
+		mov(t6, t4);
-			add(t0, t1);
+		add(t0, t1);
-			adc(t5, t3);
+		adc(t5, t3);
-			adc(t6, 0); // [t6:t5:t0]
+		adc(t6, 0); // [t6:t5:t0]
-			mov(d, t7);
+		mov(d, t7);
-			mulx(t8, t7, d);
+		mulx(t8, t7, d);
-			mulx(t10, t9, t9);
+		mulx(t10, t9, t9);
 		} else {
 			mov(t9, ptr [px + 8 * 0]);
 			mov(a, t9);
 			mul(t9);
 			mov(ptr [py + 8 * 0], a);
 			mov(t0, d);
 			mov(a, ptr [px + 8 * 1]);
 			mul(t9);
 			mov(t1, a);
 			mov(t2, d);
 			mov(a, ptr [px + 8 * 2]);
 			mul(t9);
 			mov(t3, a);
 			mov(t4, d);
 			mov(t5, t2);
 			mov(t6, t4);
 			add(t0, t1);
 			adc(t5, t3);
 			adc(t6, 0); // [t6:t5:t0]
 			mov(t9, ptr [px + 8 * 1]);
 			mov(a, t9);
 			mul(t9);
 			mov(t7, a);
 			mov(t8, d);
 			mov(a, ptr [px + 8 * 2]);
 			mul(t9);
 			mov(t9, a);
 			mov(t10, d);
 		}
 		add(t2, t7);
 		adc(t8, t9);
 		mov(t7, t10);
@ -1738,21 +1678,10 @@ private:
 		mov(d, ptr [px]);
 		mulx(pd[0], a, ptr [py + 8 * 0]);
 		mov(ptr [pz + 8 * 0], a);
-		if (useAdx_) {
+		xor_(a, a);
-			xor_(a, a);
+		for (size_t i = 1; i < pd.size(); i++) {
-			for (size_t i = 1; i < pd.size(); i++) {
+			mulx(pd[i], a, ptr [py + 8 * i]);
-				mulx(pd[i], a, ptr [py + 8 * i]);
+			adcx(pd[i - 1], a);
 				adcx(pd[i - 1], a);
 			}
 		} else {
 			for (size_t i = 1; i < pd.size(); i++) {
 				mulx(pd[i], a, ptr [py + 8 * i]);
 				if (i == 1) {
 					add(pd[i - 1], a);
 				} else {
 					adc(pd[i - 1], a);
 				}
 			}
 		}
 		adc(pd[pd.size() - 1], 0);
 	}
@ -1823,58 +1752,34 @@ private:
 	*/
 	void mulPre3(const RegExp& pz, const RegExp& px, const RegExp& py, const Pack& t)
 	{
 		const Reg64& a = rax;
 		const Reg64& d = rdx;
 		const Reg64& t0 = t[0];
 		const Reg64& t1 = t[1];
 		const Reg64& t2 = t[2];
 		const Reg64& t3 = t[3];
 		const Reg64& t4 = t[4];
 		const Reg64& t5 = t[5];
 		const Reg64& t6 = t[6];
 		const Reg64& t7 = t[7];
 		const Reg64& t8 = t[8];
 		const Reg64& t9 = t[9];
-		if (useMulx_) {
+		mulPack(pz, px, py, Pack(t2, t1, t0));
 			mulPack(pz, px, py, Pack(t2, t1, t0));
 #if 0 // a little slow
-			if (useAdx_) {
+		if (useAdx_) {
-				// [t2:t1:t0]
+			// [t2:t1:t0]
-				mulPackAdd(pz + 8 * 1, px + 8 * 1, py, t3, Pack(t2, t1, t0));
+			mulPackAdd(pz + 8 * 1, px + 8 * 1, py, t3, Pack(t2, t1, t0));
-				// [t3:t2:t1]
+			// [t3:t2:t1]
-				mulPackAdd(pz + 8 * 2, px + 8 * 2, py, t4, Pack(t3, t2, t1));
+			mulPackAdd(pz + 8 * 2, px + 8 * 2, py, t4, Pack(t3, t2, t1));
-				// [t4:t3:t2]
+			// [t4:t3:t2]
-				store_mr(pz + 8 * 3, Pack(t4, t3, t2));
+			store_mr(pz + 8 * 3, Pack(t4, t3, t2));
-				return;
+			return;
 			}
 #endif
 		} else {
 			mov(t5, ptr [px]);
 			mov(a, ptr [py + 8 * 0]);
 			mul(t5);
 			mov(ptr [pz + 8 * 0], a);
 			mov(t0, d);
 			mov(a, ptr [py + 8 * 1]);
 			mul(t5);
 			mov(t3, a);
 			mov(t1, d);
 			mov(a, ptr [py + 8 * 2]);
 			mul(t5);
 			mov(t4, a);
 			mov(t2, d);
 			add(t0, t3);
 			mov(t2, 0);
 			adc(t1, a);
 			adc(t2, d); // [t2:t1:t0:pz[0]] = px[0] * py[2..0]
 		}
-
+#endif
 		// here [t2:t1:t0]
 		mov(t9, ptr [px + 8]);
 		// [d:t9:t6:t5] = px[1] * py[2..0]
-		mul3x1(py, t9, t7, t6, t5, t4);
+		mul3x1(py, t9, t6, t5, t4);
 		add_rr(Pack(t2, t1, t0), Pack(t9, t6, t5));
 		adc(d, 0);
 		mov(t8, d);
@ -1884,7 +1789,7 @@ private:
 		mov(t9, ptr [px + 16]);
 		// [d:t9:t5:t4]
-		mul3x1(py, t9, t6, t5, t4, t0);
+		mul3x1(py, t9, t5, t4, t0);
 		add_rr(Pack(t8, t2, t1), Pack(t9, t5, t4));
 		adc(d, 0);
 		store_mr(pz + 8 * 2, Pack(d, t8, t2, t1));
@ -1909,7 +1814,6 @@ private:
 	*/
 	void sqr2(const Reg64& y3, const Reg64& y2, const Reg64& y1, const Reg64& y0, const Reg64& x1, const Reg64& x0, const Reg64& t1, const Reg64& t0)
 	{
-		assert(useMulx_);
 		mov(rdx, x0);
 		mulx(y1, y0, x0); // x0^2
 		mov(rdx, x1);
@ -1928,7 +1832,6 @@ private:
 	*/
 	void mul2x2(const RegExp& px, const RegExp& py, const Reg64& t4, const Reg64& t3, const Reg64& t2, const Reg64& t1, const Reg64& t0)
 	{
-		assert(useMulx_);
 #if 0
 		// # of add is less, but a little slower
 		mov(t4, ptr [py + 8 * 0]);
@ -2043,58 +1946,7 @@ private:
 	*/
 	void sqrPre4(const Reg64& py, const Reg64& px, const Pack& t)
 	{
-#if 1
+		sqrPre4NF(py, px, t);
 		if (useMulx_ && useAdx_) {
 			sqrPre4NF(py, px, t);
 			return;
 		}
 #endif
 		const Reg64& t0 = t[0];
 		const Reg64& t1 = t[1];
 		const Reg64& t2 = t[2];
 		const Reg64& t3 = t[3];
 		const Reg64& t4 = t[4];
 		const Reg64& t5 = t[5];
 		const Reg64& t6 = t[6];
 		const Reg64& t7 = t[7];
 		const Reg64& t8 = t[8];
 		const Reg64& t9 = t[9];
 		const Reg64& t10 = t[10];
 		const Reg64& a = rax;
 		const Reg64& d = rdx;
 		/*
 			(aN + b)^2 = a^2 N^2 + 2ab N + b^2
 		*/
 		load_rm(Pack(t9, t8), px);
 		sqr2(t3, t2, t1, t0, t9, t8, t7, t6);
 		// [t3:t2:t1:t0] = b^2
 		store_mr(py, Pack(t1, t0));
 		vmovq(xm0, t2);
 		mul2x2(px, px + 2 * 8, t6, t5, t4, t1, t0);
 		// [t5:t4:t1:t0] = ab
 		xor_(t6, t6);
 		add_rr(Pack(t6, t5, t4, t1, t0), Pack(t6, t5, t4, t1, t0));
 		// [t6:t5:t4:t1:t0] = 2ab
 		load_rm(Pack(t8, t7), px + 2 * 8);
 		// free t10, t9, rax, rdx
 		/*
 			[d:t8:t10:t9] = [t8:t7]^2
 		*/
 		mov(d, t7);
 		mulx(t10, t9, t7); // [t10:t9] = t7^2
 		mulx(t7, t2, t8); // [t7:t2] = t7 t8
 		xor_(a, a);
 		add_rr(Pack(a, t7, t2), Pack(a, t7, t2));
 		// [a:t7:t2] = 2 t7 t8
 		mov(d, t8);
 		mulx(d, t8, t8); // [d:t8] = t8^2
 		add_rr(Pack(d, t8, t10), Pack(a, t7, t2));
 		// [d:t8:t10:t9] = [t8:t7]^2
 		vmovq(t2, xm0);
 		add_rr(Pack(t8, t10, t9, t3, t2), Pack(t6, t5, t4, t1, t0));
 		adc(d, 0);
 		store_mr(py + 2 * 8, Pack(d, t8, t10, t9, t3, t2));
 	}
 	/*
 		(5, 5)(4, 4)(3, 3)(2, 2)(1, 1)(0, 0)
@ -2221,7 +2073,6 @@ private:
 	*/
 	void mulPre4(const RegExp& pz, const RegExp& px, const RegExp& py, const Pack& t)
 	{
 		const Reg64& a = rax;
 		const Reg64& d = rdx;
 		const Reg64& t0 = t[0];
 		const Reg64& t1 = t[1];
@ -2235,14 +2086,12 @@ private:
 		const Reg64& t9 = t[9];
 #if 0 // a little slower
-		if (useMulx_ && useAdx_) {
+		mulPack(pz, px, py, Pack(t3, t2, t1, t0));
-			mulPack(pz, px, py, Pack(t3, t2, t1, t0));
+		mulPackAdd(pz + 8 * 1, px + 8 * 1, py, t4, Pack(t3, t2, t1, t0));
-			mulPackAdd(pz + 8 * 1, px + 8 * 1, py, t4, Pack(t3, t2, t1, t0));
+		mulPackAdd(pz + 8 * 2, px + 8 * 2, py, t0, Pack(t4, t3, t2, t1));
-			mulPackAdd(pz + 8 * 2, px + 8 * 2, py, t0, Pack(t4, t3, t2, t1));
+		mulPackAdd(pz + 8 * 3, px + 8 * 3, py, t1, Pack(t0, t4, t3, t2));
-			mulPackAdd(pz + 8 * 3, px + 8 * 3, py, t1, Pack(t0, t4, t3, t2));
+		store_mr(pz + 8 * 4, Pack(t1, t0, t4, t3));
-			store_mr(pz + 8 * 4, Pack(t1, t0, t4, t3));
+		return;
 			return;
 		}
 #endif
 #if 0
 		// a little slower
@ -2266,37 +2115,14 @@ private:
 		adc(t7, 0);
 		store_mr(pz + 8 * 2, Pack(t7, t4, t3, t2, t1, t0));
 #else
-		if (useMulx_) {
+		mulPack(pz, px, py, Pack(t3, t2, t1, t0));
 			mulPack(pz, px, py, Pack(t3, t2, t1, t0));
 		} else {
 			mov(t5, ptr [px]);
 			mov(a, ptr [py + 8 * 0]);
 			mul(t5);
 			mov(ptr [pz + 8 * 0], a);
 			mov(t0, d);
 			mov(a, ptr [py + 8 * 1]);
 			mul(t5);
 			mov(t3, a);
 			mov(t1, d);
 			mov(a, ptr [py + 8 * 2]);
 			mul(t5);
 			mov(t4, a);
 			mov(t2, d);
 			mov(a, ptr [py + 8 * 3]);
 			mul(t5);
 			add(t0, t3);
 			mov(t3, 0);
 			adc(t1, t4);
 			adc(t2, a);
 			adc(t3, d); // [t3:t2:t1:t0:pz[0]] = px[0] * py[3..0]
 		}
 		// here [t3:t2:t1:t0]
 		mov(t9, ptr [px + 8]);
 		// [d:t9:t7:t6:t5] = px[1] * py[3..0]
-		mul4x1(py, t9, t8, t7, t6, t5, t4);
+		mul4x1(py, t9, t7, t6, t5);
 		add_rr(Pack(t3, t2, t1, t0), Pack(t9, t7, t6, t5));
 		adc(d, 0);
 		mov(t8, d);
@ -2306,7 +2132,7 @@ private:
 		mov(t9, ptr [px + 16]);
 		// [d:t9:t6:t5:t4]
-		mul4x1(py, t9, t7, t6, t5, t4, t0);
+		mul4x1(py, t9, t6, t5, t4);
 		add_rr(Pack(t8, t3, t2, t1), Pack(t9, t6, t5, t4));
 		adc(d, 0);
 		mov(t7, d);
@ -2315,7 +2141,7 @@ private:
 		mov(t9, ptr [px + 24]);
 		// [d:t9:t5:t4:t1]
-		mul4x1(py, t9, t6, t5, t4, t1, t0);
+		mul4x1(py, t9, t5, t4, t1);
 		add_rr(Pack(t7, t8, t3, t2), Pack(t9, t5, t4, t1));
 		adc(d, 0);
 		store_mr(pz + 8 * 3, Pack(t7, t8, t3, t2));
@ -2502,7 +2328,6 @@ private:
 	}
 	void gen_fpDbl_sqrPre(void2u& f)
 	{
-		if (!(useMulx_ && useAdx_)) return;
 		void2u func = getCurr<void2u>();
 		switch (pn_) {
 		case 2:
@ -2547,7 +2372,6 @@ private:
 	}
 	void gen_fpDbl_mulPre(void3u& f)
 	{
-		if (!useMulx_ || (pn_ == 6 && !useAdx_)) return;
 		void3u func = getCurr<void3u>();
 		switch (pn_) {
 		case 2:
@ -3245,78 +3069,40 @@ private:
 	}
 	/*
 		[rdx:x:t0] <- py[1:0] * x
-		destroy x, t
+		destroy x, t0
 	*/
-	void mul2x1(const RegExp& py, const Reg64& x, const Reg64& t0, const Reg64& t)
+	void mul2x1(const RegExp& py, const Reg64& x, const Reg64& t0)
 	{
-		if (useMulx_) {
+		// mulx(H, L, x) = [H:L] = x * rdx
-			// mulx(H, L, x) = [H:L] = x * rdx
+		/*
-			/*
+			rdx:x
-				rdx:x
+			   rax:t0
-				   rax:t0
+		*/
-			*/
+		mov(rdx, x);
-			mov(rdx, x);
+		mulx(rax, t0, ptr [py]); // [rax:t0] = py[0] * x
-			mulx(rax, t0, ptr [py]); // [rax:t0] = py[0] * x
+		mulx(rdx, x, ptr [py + 8]); // [rdx:x] = py[1] * x
-			mulx(rdx, x, ptr [py + 8]); // [rdx:x] = py[1] * x
+		add(x, rax);
-			add(x, rax);
+		adc(rdx, 0);
 			adc(rdx, 0);
 		} else {
 			mov(rax, ptr [py]);
 			mul(x);
 			mov(t0, rax);
 			mov(t, rdx);
 			mov(rax, ptr [py + 8]);
 			mul(x);
 			/*
 				rdx:rax
 				     t:t0
 			*/
 			add(rax, t);
 			adc(rdx, 0);
 			mov(x, rax);
 		}
 	}
 	/*
 		[rdx:x:t1:t0] <- py[2:1:0] * x
 		destroy x, t
 	*/
-	void mul3x1(const RegExp& py, const Reg64& x, const Reg64& t2, const Reg64& t1, const Reg64& t0, const Reg64& t)
+	void mul3x1(const RegExp& py, const Reg64& x, const Reg64& t1, const Reg64& t0, const Reg64& t)
 	{
-		if (useMulx_) {
+		// mulx(H, L, x) = [H:L] = x * rdx
-			// mulx(H, L, x) = [H:L] = x * rdx
+		/*
-			/*
+			rdx:x
-				rdx:x
+			    t:t1
-				    t:t1
+			      rax:t0
-				      rax:t0
+		*/
-			*/
+		mov(rdx, x);
-			mov(rdx, x);
+		mulx(rax, t0, ptr [py]); // [rax:t0] = py[0] * x
-			mulx(rax, t0, ptr [py]); // [rax:t0] = py[0] * x
+		mulx(t, t1, ptr [py + 8]); // [t:t1] = py[1] * x
-			mulx(t, t1, ptr [py + 8]); // [t:t1] = py[1] * x
+		add(t1, rax);
-			add(t1, rax);
+		mulx(rdx, x, ptr [py + 8 * 2]);
-			mulx(rdx, x, ptr [py + 8 * 2]);
+		adc(x, t);
-			adc(x, t);
+		adc(rdx, 0);
 			adc(rdx, 0);
 		} else {
 			mov(rax, ptr [py]);
 			mul(x);
 			mov(t0, rax);
 			mov(t1, rdx);
 			mov(rax, ptr [py + 8]);
 			mul(x);
 			mov(t, rax);
 			mov(t2, rdx);
 			mov(rax, ptr [py + 8 * 2]);
 			mul(x);
 			/*
 				rdx:rax
 				     t2:t
 				        t1:t0
 			*/
 			add(t1, t);
 			adc(rax, t2);
 			adc(rdx, 0);
 			mov(x, rax);
 		}
 	}
 	/*
 		[x2:x1:x0] * x0
@ -3399,7 +3185,7 @@ private:
 	*/
 	void montgomery3_sub(uint64_t pp, const Reg64& c3, const Reg64& c2, const Reg64& c1, const Reg64& c0,
 		const Reg64& /*px*/, const Reg64& y, const Reg64& p,
-		const Reg64& t0, const Reg64& t1, const Reg64& t2, const Reg64& t3, const Reg64& t4, bool isFirst)
+		const Reg64& t0, const Reg64& t1, const Reg64& t3, const Reg64& t4, bool isFirst)
 	{
 		// input [c3:y:c1:0]
 		// [t4:c3:y:c1:c0]
@ -3407,7 +3193,7 @@ private:
 		mov(rax, pp);
 		mul(c0); // q = rax
 		mov(c2, rax);
-		mul3x1(p, c2, t2, t1, t0, t3);
+		mul3x1(p, c2, t1, t0, t3);
 		// [rdx:c2:t1:t0] = p * q
 		add(c0, t0); // always c0 is zero
 		adc(c1, t1);
@ -3433,118 +3219,35 @@ private:
 	*/
 	void montgomery3_1(uint64_t pp, const Reg64& c3, const Reg64& c2, const Reg64& c1, const Reg64& c0,
 		const Reg64& px, const Reg64& y, const Reg64& p,
-		const Reg64& t0, const Reg64& t1, const Reg64& t2, const Reg64& t3, const Reg64& t4, bool isFirst)
+		const Reg64& t0, const Reg64& t1, const Reg64& t3, const Reg64& t4, bool isFirst)
 	{
 		if (isFirst) {
-			mul3x1(px, y, c2, c1, c0, c3);
+			mul3x1(px, y, c1, c0, c3);
 			mov(c3, rdx);
 			// [c3:y:c1:c0] = px[2..0] * y
 		} else {
-			mul3x1(px, y, t2, t1, t0, t3);
+			mul3x1(px, y, t1, t0, t3);
 			// [rdx:y:t1:t0] = px[2..0] * y
 			add_rr(Pack(c3, y, c1, c0), Pack(rdx, c2, t1, t0));
 			if (isFullBit_) setc(t4.cvt8());
 		}
-		montgomery3_sub(pp, c3, c2, c1, c0, px, y, p, t0, t1, t2, t3, t4, isFirst);
+		montgomery3_sub(pp, c3, c2, c1, c0, px, y, p, t0, t1, t3, t4, isFirst);
 	}
 	/*
 		pc[0..n] += x[0..n-1] * y ; pc[] = 0 if isFirst
 		pc[n + 1] is temporary used if isFullBit_
 		q = uint64_t(pc[0] * pp)
 		pc[] = (pc[] + q * p) >> 64
 		input : pc[], px[], y, p[], pw1[], pw2[]
 		output : pc[0..n]   ; if isFullBit_
 		         pc[0..n-1] ; if !isFullBit_
 		destroy y
 		use
 		pw1[0] if useMulx_
 		pw1[0..n-2] otherwise
 		pw2[0..n-1]
 	*/
 	void montgomeryN_1(uint64_t pp, int n, const RegExp& pc, const RegExp& px, const Reg64& y, const Reg64& p, const Reg64& t, const MixPack& pw1, const RegExp& pw2, bool isFirst)
 	{
 		// pc[] += x[] * y
 		if (isFirst) {
 			gen_raw_mulUnit(pc, px, y, pw1, t, n);
 			mov(ptr [pc + n * 8], rdx);
 		} else {
 			gen_raw_mulUnit(pw2, px, y, pw1, t, n);
 			mov(t, ptr [pw2 + 0 * 8]);
 			add(ptr [pc + 0 * 8], t);
 			for (int i = 1; i < n; i++) {
 				mov(t, ptr [pw2 + i * 8]);
 				adc(ptr [pc + i * 8], t);
 			}
 			adc(ptr [pc + n * 8], rdx);
 			if (isFullBit_) {
 				mov(t, 0);
 				adc(t, 0);
 				mov(qword [pc + (n + 1) * 8], t);
 			}
 		}
 		mov(rax, pp);
 		mul(qword [pc]);
 		mov(y, rax); // y = q
 		gen_raw_mulUnit(pw2, p, y, pw1, t, n);
 		// c[] = (c[] + pw2[]) >> 64
 		mov(t, ptr [pw2 + 0 * 8]);
 		add(t, ptr [pc + 0 * 8]);
 		for (int i = 1; i < n; i++) {
 			mov(t, ptr [pw2 + i * 8]);
 			adc(t, ptr [pc + i * 8]);
 			mov(ptr [pc + (i - 1) * 8], t);
 		}
 		adc(rdx, ptr [pc + n * 8]);
 		mov(ptr [pc + (n - 1) * 8], rdx);
 		if (isFullBit_) {
 			if (isFirst) {
 				mov(t, 0);
 			} else {
 				mov(t, ptr [pc + (n + 1) * 8]);
 			}
 			adc(t, 0);
 			mov(qword [pc + n * 8], t);
 		} else {
 			xor_(eax, eax);
 			mov(ptr [pc + n * 8], rax);
 		}
 	}
 	/*
 		[rdx:x:t2:t1:t0] <- py[3:2:1:0] * x
 		destroy x, t
 	*/
-	void mul4x1(const RegExp& py, const Reg64& x, const Reg64& t3, const Reg64& t2, const Reg64& t1, const Reg64& t0, const Reg64& t)
+	void mul4x1(const RegExp& py, const Reg64& x, const Reg64& t2, const Reg64& t1, const Reg64& t0)
-	{
+	{
-		if (useMulx_) {
+		mov(rdx, x);
-			mov(rdx, x);
+		mulx(t1, t0, ptr [py + 8 * 0]);
-			mulx(t1, t0, ptr [py + 8 * 0]);
+		mulx(t2, rax, ptr [py + 8 * 1]);
-			mulx(t2, rax, ptr [py + 8 * 1]);
+		add(t1, rax);
-			add(t1, rax);
+		mulx(x, rax, ptr [py + 8 * 2]);
-			mulx(x, rax, ptr [py + 8 * 2]);
+		adc(t2, rax);
-			adc(t2, rax);
+		mulx(rdx, rax, ptr [py + 8 * 3]);
-			mulx(rdx, rax, ptr [py + 8 * 3]);
+		adc(x, rax);
-			adc(x, rax);
+		adc(rdx, 0);
 			adc(rdx, 0);
 		} else {
 			mov(rax, ptr [py]);
 			mul(x);
 			mov(t0, rax);
 			mov(t1, rdx);
 			mov(rax, ptr [py + 8]);
 			mul(x);
 			mov(t, rax);
 			mov(t2, rdx);
 			mov(rax, ptr [py + 8 * 2]);
 			mul(x);
 			mov(t3, rax);
 			mov(rax, x);
 			mov(x, rdx);
 			mul(qword [py + 8 * 3]);
 			add(t1, t);
 			adc(t2, t3);
 			adc(x, rax);
 			adc(rdx, 0);
 		}
 	}
 	/*
@ -3560,14 +3263,14 @@ private:
 	*/
 	void montgomery4_1(uint64_t pp, const Reg64& c4, const Reg64& c3, const Reg64& c2, const Reg64& c1, const Reg64& c0,
 		const Reg64& px, const Reg64& y, const Reg64& p,
-		const Reg64& t0, const Reg64& t1, const Reg64& t2, const Reg64& t3, const Reg64& t4, bool isFirst, const Xmm& xt)
+		const Reg64& t0, const Reg64& t1, const Reg64& t2, bool isFirst, const Xmm& xt)
 	{
 		if (isFirst) {
-			mul4x1(px, y, c3, c2, c1, c0, c4);
+			mul4x1(px, y, c2, c1, c0);
 			mov(c4, rdx);
 			// [c4:y:c2:c1:c0] = px[3..0] * y
 		} else {
-			mul4x1(px, y, t3, t2, t1, t0, t4);
+			mul4x1(px, y, t2, t1, t0);
 			// [rdx:y:t2:t1:t0] = px[3..0] * y
 			if (isFullBit_) {
 				vmovq(xt, px);
@ -3583,7 +3286,7 @@ private:
 		mov(rax, pp);
 		mul(c0); // q = rax
 		mov(c3, rax);
-		mul4x1(p, c3, t3, t2, t1, t0, t4);
+		mul4x1(p, c3, t2, t1, t0);
 		add(c0, t0); // always c0 is zero
 		adc(c1, t1);
 		adc(c2, t2);
@ -3601,7 +3304,7 @@ private:
 	void3u gen_fp2Dbl_mulPre()
 	{
 		if (isFullBit_) return 0;
-		if (pn_ != 4 && !(pn_ == 6 && useMulx_ && useAdx_)) return 0;
+		if (!(pn_ == 4 || pn_ == 6)) return 0;
 		void3u func = getCurr<void3u>();
 		bool embedded = pn_ == 4;
@ -3881,7 +3584,7 @@ private:
 	void3u gen_fp2_mul()
 	{
 		if (isFullBit_) return 0;
-		if (pn_ != 4 && !(pn_ == 6 && useMulx_ && useAdx_)) return 0;
+		if (!(pn_ == 4 || pn_ == 6)) return 0;
 		void3u func = getCurr<void3u>();
 		int stackSize = 8 + FpByte_ * 4;
 		StackFrame sf(this, 3, 10 | UseRDX, stackSize);
@ -3903,7 +3606,7 @@ private:
 	void2u gen_fp2_sqr()
 	{
 		if (isFullBit_) return 0;
-		if (pn_ != 4 && !(pn_ == 6 && useMulx_ && useAdx_)) return 0;
+		if (!(pn_ == 4 || pn_ == 6)) return 0;
 		bool nocarry = (p_[pn_ - 1] >> 62) == 0;
 		if (!nocarry) return 0;
 		void2u func = getCurr<void2u>();