remove non-mulx-code in jit

update-fork
MITSUNARI Shigeo 4 years ago
parent 3f9cce8741
commit c4a356038a
  1. 565
      src/fp_generator.hpp

@ -588,58 +588,34 @@ private:
mov(ptr [pz + 8], rax); mov(ptr [pz + 8], rax);
return; return;
} }
if (useMulx_) { assert(wk.size() > 0 && wk.isReg(0));
assert(wk.size() > 0 && wk.isReg(0)); const Reg64& t1 = wk.getReg(0);
const Reg64& t1 = wk.getReg(0); // mulx(H, L, x) = [H:L] = x * rdx
// mulx(H, L, x) = [H:L] = x * rdx mov(rdx, y);
mov(rdx, y); mulx(t1, rax, ptr [px]); // [y:rax] = px * y
mulx(t1, rax, ptr [px]); // [y:rax] = px * y mov(ptr [pz], rax);
mov(ptr [pz], rax); const Reg64 *pt0 = &t;
const Reg64 *pt0 = &t; const Reg64 *pt1 = &t1;
const Reg64 *pt1 = &t1;
for (size_t i = 1; i < n - 1; i++) {
mulx(*pt0, rax, ptr [px + i * 8]);
if (i == 1) {
add(rax, *pt1);
} else {
adc(rax, *pt1);
}
mov(ptr [pz + i * 8], rax);
std::swap(pt0, pt1);
}
mulx(rdx, rax, ptr [px + (n - 1) * 8]);
adc(rax, *pt1);
mov(ptr [pz + (n - 1) * 8], rax);
adc(rdx, 0);
return;
}
assert(wk.size() >= n - 1);
for (size_t i = 0; i < n; i++) {
mov(rax, ptr [px + i * 8]);
mul(y);
if (i < n - 1) {
mov(ptr [pz + i * 8], rax);
g_mov(wk[i], rdx);
}
}
for (size_t i = 1; i < n - 1; i++) { for (size_t i = 1; i < n - 1; i++) {
mov(t, ptr [pz + i * 8]); mulx(*pt0, rax, ptr [px + i * 8]);
if (i == 1) { if (i == 1) {
g_add(t, wk[i - 1]); add(rax, *pt1);
} else { } else {
g_adc(t, wk[i - 1]); adc(rax, *pt1);
} }
mov(ptr [pz + i * 8], t); mov(ptr [pz + i * 8], rax);
std::swap(pt0, pt1);
} }
g_adc(rax, wk[n - 2]); mulx(rdx, rax, ptr [px + (n - 1) * 8]);
adc(rax, *pt1);
mov(ptr [pz + (n - 1) * 8], rax); mov(ptr [pz + (n - 1) * 8], rax);
adc(rdx, 0); adc(rdx, 0);
} }
void gen_mulUnit() void gen_mulUnit()
{ {
// assert(pn_ >= 2); // assert(pn_ >= 2);
const int regNum = useMulx_ ? 2 : (1 + (std::min)(pn_ - 1, 8)); const int regNum = 2;
const int stackSize = useMulx_ ? 0 : (pn_ - 1) * 8; const int stackSize = 0;
StackFrame sf(this, 3, regNum | UseRDX, stackSize); StackFrame sf(this, 3, regNum | UseRDX, stackSize);
const Reg64& pz = sf.p[0]; const Reg64& pz = sf.p[0];
const Reg64& px = sf.p[1]; const Reg64& px = sf.p[1];
@ -870,7 +846,7 @@ private:
gen_montMul4(); gen_montMul4();
return func; return func;
} }
if (pn_ == 6 && !isFullBit_ && useMulx_ && useAdx_) { if (pn_ == 6 && !isFullBit_) {
#if 1 #if 1
// a little faster // a little faster
gen_montMul6(); gen_montMul6();
@ -907,7 +883,6 @@ private:
const Reg64& t0 = sf.t[0]; const Reg64& t0 = sf.t[0];
const Reg64& t1 = sf.t[1]; const Reg64& t1 = sf.t[1];
const Reg64& t2 = sf.t[2]; const Reg64& t2 = sf.t[2];
const Reg64& t3 = sf.t[3];
const Reg64& t4 = sf.t[4]; const Reg64& t4 = sf.t[4];
const Reg64& t5 = sf.t[5]; const Reg64& t5 = sf.t[5];
const Reg64& t6 = sf.t[6]; const Reg64& t6 = sf.t[6];
@ -925,7 +900,7 @@ private:
mov(t7, a); // q mov(t7, a); // q
// [d:t7:t1] = p * q // [d:t7:t1] = p * q
mul2x1(t0, t7, t1, t8); mul2x1(t0, t7, t1);
xor_(t8, t8); xor_(t8, t8);
if (isFullBit_) { if (isFullBit_) {
@ -944,7 +919,7 @@ private:
mov(t6, a); // q mov(t6, a); // q
// [d:t6:xy] = p * q // [d:t6:xy] = p * q
mul2x1(t0, t6, xy, t3); mul2x1(t0, t6, xy);
add_rr(Pack(t8, t4, t7), Pack(d, t6, xy)); add_rr(Pack(t8, t4, t7), Pack(d, t6, xy));
// [t8:t4] // [t8:t4]
@ -994,7 +969,7 @@ private:
mov(t7, a); // q mov(t7, a); // q
// [d:t7:t2:t1] = p * q // [d:t7:t2:t1] = p * q
mul3x1(t0, t7, t4, t2, t1, t8); mul3x1(t0, t7, t2, t1, t8);
xor_(t8, t8); xor_(t8, t8);
xor_(t9, t9); xor_(t9, t9);
@ -1014,7 +989,7 @@ private:
mov(t10, a); // q mov(t10, a); // q
// [d:t10:t6:xy] = p * q // [d:t10:t6:xy] = p * q
mul3x1(t0, t10, t1, t6, xy, t3); mul3x1(t0, t10, t6, xy, t3);
add_rr(Pack(t8, t4, t7, t2), Pack(d, t10, t6, xy)); add_rr(Pack(t8, t4, t7, t2), Pack(d, t10, t6, xy));
adc(t9, 0); // [t9:t8:t4:t7] adc(t9, 0); // [t9:t8:t4:t7]
@ -1027,7 +1002,7 @@ private:
mov(t10, a); // q mov(t10, a); // q
// [d:t10:xy:t6] = p * q // [d:t10:xy:t6] = p * q
mul3x1(t0, t10, t1, xy, t6, t2); mul3x1(t0, t10, xy, t6, t2);
add_rr(Pack(t9, t8, t4, t7), Pack(d, t10, xy, t6)); add_rr(Pack(t9, t8, t4, t7), Pack(d, t10, xy, t6));
// [t9:t8:t4] // [t9:t8:t4]
@ -1095,7 +1070,7 @@ private:
*/ */
void gen_fpDbl_mod4(const Reg64& z, const Reg64& xy, const Pack& t) void gen_fpDbl_mod4(const Reg64& z, const Reg64& xy, const Pack& t)
{ {
if (!isFullBit_ && useMulx_ && useAdx_) { if (!isFullBit_) {
gen_fpDbl_mod4NF(z, xy, t); gen_fpDbl_mod4NF(z, xy, t);
return; return;
} }
@ -1123,7 +1098,7 @@ private:
mov(t7, a); // q mov(t7, a); // q
// [d:t7:t3:t2:t1] = p * q // [d:t7:t3:t2:t1] = p * q
mul4x1(t0, t7, t4, t3, t2, t1, t8); mul4x1(t0, t7, t3, t2, t1);
xor_(t8, t8); xor_(t8, t8);
xor_(t9, t9); xor_(t9, t9);
@ -1152,7 +1127,7 @@ private:
vmovq(xm1, t10); vmovq(xm1, t10);
// [d:z:t5:t6:xy] = p * q // [d:z:t5:t6:xy] = p * q
mul4x1(t0, z, t1, t5, t6, xy, t10); mul4x1(t0, z, t5, t6, xy);
vmovq(t10, xm1); vmovq(t10, xm1);
add_rr(Pack(t8, t4, t7, t3, t2), Pack(d, z, t5, t6, xy)); add_rr(Pack(t8, t4, t7, t3, t2), Pack(d, z, t5, t6, xy));
@ -1171,7 +1146,7 @@ private:
mov(z, a); // q mov(z, a); // q
// [d:z:t5:xy:t6] = p * q // [d:z:t5:xy:t6] = p * q
mul4x1(t0, z, t1, t5, xy, t6, t2); mul4x1(t0, z, t5, xy, t6);
add_rr(Pack(t9, t8, t4, t7, t3), Pack(d, z, t5, xy, t6)); add_rr(Pack(t9, t8, t4, t7, t3), Pack(d, z, t5, xy, t6));
adc(t10, 0); // c' = [t10:t9:t8:t4:t7] adc(t10, 0); // c' = [t10:t9:t8:t4:t7]
@ -1187,7 +1162,7 @@ private:
mov(z, a); // q mov(z, a); // q
// [d:z:t5:xy:t6] = p * q // [d:z:t5:xy:t6] = p * q
mul4x1(t0, z, t1, t5, xy, t6, t2); mul4x1(t0, z, t5, xy, t6);
add_rr(Pack(t10, t9, t8, t4, t7), Pack(d, z, t5, xy, t6)); add_rr(Pack(t10, t9, t8, t4, t7), Pack(d, z, t5, xy, t6));
// [t10:t9:t8:t4] // [t10:t9:t8:t4]
@ -1242,7 +1217,7 @@ private:
ret(); ret();
return func; return func;
} }
if (pn_ == 6 && !isFullBit_ && useMulx_ && useAdx_) { if (pn_ == 6 && !isFullBit_) {
StackFrame sf(this, 3, 10 | UseRDX, 0, false); StackFrame sf(this, 3, 10 | UseRDX, 0, false);
call(fpDbl_modL); call(fpDbl_modL);
sf.close(); sf.close();
@ -1270,7 +1245,7 @@ private:
gen_montSqr3(); gen_montSqr3();
return func; return func;
} }
if (pn_ == 4 && useMulx_) { if (pn_ == 4) {
#if 0 #if 0
// sqr(y, x) = mul(y, x, x) // sqr(y, x) = mul(y, x, x)
#ifdef XBYAK64_WIN #ifdef XBYAK64_WIN
@ -1290,7 +1265,7 @@ private:
#endif #endif
return func; return func;
} }
if (pn_ == 6 && !isFullBit_ && useMulx_ && useAdx_) { if (pn_ == 6 && !isFullBit_) {
#if 1 #if 1
StackFrame sf(this, 3, 10 | UseRDX); StackFrame sf(this, 3, 10 | UseRDX);
Pack t = sf.t; Pack t = sf.t;
@ -1352,27 +1327,25 @@ private:
const Reg64& t5 = sf.t[5]; const Reg64& t5 = sf.t[5];
const Reg64& t6 = sf.t[6]; const Reg64& t6 = sf.t[6];
const Reg64& t7 = sf.t[7]; const Reg64& t7 = sf.t[7];
const Reg64& t8 = sf.t[8];
const Reg64& t9 = sf.t[9];
L(fp_mulL); L(fp_mulL);
vmovq(xm0, p0); // save p0 vmovq(xm0, p0); // save p0
lea(p0, ptr[rip+pL_]); lea(p0, ptr[rip+pL_]);
vmovq(xm1, p2); vmovq(xm1, p2);
mov(p2, ptr [p2]); mov(p2, ptr [p2]);
montgomery4_1(rp_, t0, t7, t3, t2, t1, p1, p2, p0, t4, t5, t6, t8, t9, true, xm2); montgomery4_1(rp_, t0, t7, t3, t2, t1, p1, p2, p0, t4, t5, t6, true, xm2);
vmovq(p2, xm1); vmovq(p2, xm1);
mov(p2, ptr [p2 + 8]); mov(p2, ptr [p2 + 8]);
montgomery4_1(rp_, t1, t0, t7, t3, t2, p1, p2, p0, t4, t5, t6, t8, t9, false, xm2); montgomery4_1(rp_, t1, t0, t7, t3, t2, p1, p2, p0, t4, t5, t6, false, xm2);
vmovq(p2, xm1); vmovq(p2, xm1);
mov(p2, ptr [p2 + 16]); mov(p2, ptr [p2 + 16]);
montgomery4_1(rp_, t2, t1, t0, t7, t3, p1, p2, p0, t4, t5, t6, t8, t9, false, xm2); montgomery4_1(rp_, t2, t1, t0, t7, t3, p1, p2, p0, t4, t5, t6, false, xm2);
vmovq(p2, xm1); vmovq(p2, xm1);
mov(p2, ptr [p2 + 24]); mov(p2, ptr [p2 + 24]);
montgomery4_1(rp_, t3, t2, t1, t0, t7, p1, p2, p0, t4, t5, t6, t8, t9, false, xm2); montgomery4_1(rp_, t3, t2, t1, t0, t7, p1, p2, p0, t4, t5, t6, false, xm2);
// [t7:t3:t2:t1:t0] // [t7:t3:t2:t1:t0]
mov(t4, t0); mov(t4, t0);
@ -1475,7 +1448,7 @@ private:
*/ */
void gen_montMul6() void gen_montMul6()
{ {
assert(!isFullBit_ && useMulx_ && useAdx_); assert(!isFullBit_);
StackFrame sf(this, 3, 10 | UseRDX, 0, false); StackFrame sf(this, 3, 10 | UseRDX, 0, false);
call(fp_mulL); call(fp_mulL);
sf.close(); sf.close();
@ -1543,12 +1516,12 @@ private:
lea(t7, ptr[rip+pL_]); lea(t7, ptr[rip+pL_]);
mov(t9, ptr [p2]); mov(t9, ptr [p2]);
// c3, c2, c1, c0, px, y, p, // c3, c2, c1, c0, px, y, p,
montgomery3_1(rp_, t0, t3, t2, t1, p1, t9, t7, t4, t5, t6, t8, p0, true); montgomery3_1(rp_, t0, t3, t2, t1, p1, t9, t7, t4, t5, t8, p0, true);
mov(t9, ptr [p2 + 8]); mov(t9, ptr [p2 + 8]);
montgomery3_1(rp_, t1, t0, t3, t2, p1, t9, t7, t4, t5, t6, t8, p0, false); montgomery3_1(rp_, t1, t0, t3, t2, p1, t9, t7, t4, t5, t8, p0, false);
mov(t9, ptr [p2 + 16]); mov(t9, ptr [p2 + 16]);
montgomery3_1(rp_, t2, t1, t0, t3, p1, t9, t7, t4, t5, t6, t8, p0, false); montgomery3_1(rp_, t2, t1, t0, t3, p1, t9, t7, t4, t5, t8, p0, false);
// [(t3):t2:t1:t0] // [(t3):t2:t1:t0]
mov(t4, t0); mov(t4, t0);
@ -1590,19 +1563,19 @@ private:
mov(t9, ptr [px]); mov(t9, ptr [px]);
mul3x1_sqr1(px, t9, t3, t2, t1, t0); mul3x1_sqr1(px, t9, t3, t2, t1, t0);
mov(t0, rdx); mov(t0, rdx);
montgomery3_sub(rp_, t0, t9, t2, t1, px, t3, t7, t4, t5, t6, t8, pz, true); montgomery3_sub(rp_, t0, t9, t2, t1, px, t3, t7, t4, t5, t8, pz, true);
mov(t3, ptr [px + 8]); mov(t3, ptr [px + 8]);
mul3x1_sqr2(px, t3, t6, t5, t4); mul3x1_sqr2(px, t3, t6, t5, t4);
add_rr(Pack(t1, t0, t9, t2), Pack(rdx, rax, t5, t4)); add_rr(Pack(t1, t0, t9, t2), Pack(rdx, rax, t5, t4));
if (isFullBit_) setc(pz.cvt8()); if (isFullBit_) setc(pz.cvt8());
montgomery3_sub(rp_, t1, t3, t9, t2, px, t0, t7, t4, t5, t6, t8, pz, false); montgomery3_sub(rp_, t1, t3, t9, t2, px, t0, t7, t4, t5, t8, pz, false);
mov(t0, ptr [px + 16]); mov(t0, ptr [px + 16]);
mul3x1_sqr3(t0, t5, t4); mul3x1_sqr3(t0, t5, t4);
add_rr(Pack(t2, t1, t3, t9), Pack(rdx, rax, t5, t4)); add_rr(Pack(t2, t1, t3, t9), Pack(rdx, rax, t5, t4));
if (isFullBit_) setc(pz.cvt8()); if (isFullBit_) setc(pz.cvt8());
montgomery3_sub(rp_, t2, t0, t3, t9, px, t1, t7, t4, t5, t6, t8, pz, false); montgomery3_sub(rp_, t2, t0, t3, t9, px, t1, t7, t4, t5, t8, pz, false);
// [t9:t2:t0:t3] // [t9:t2:t0:t3]
mov(t4, t3); mov(t4, t3);
@ -1636,58 +1609,25 @@ private:
const Reg64& t9 = t[9]; const Reg64& t9 = t[9];
const Reg64& t10 = t[10]; const Reg64& t10 = t[10];
if (useMulx_) { mov(d, ptr [px + 8 * 0]);
mov(d, ptr [px + 8 * 0]); mulx(t0, a, d);
mulx(t0, a, d); mov(ptr [py + 8 * 0], a);
mov(ptr [py + 8 * 0], a);
mov(t7, ptr [px + 8 * 1]); mov(t7, ptr [px + 8 * 1]);
mov(t9, ptr [px + 8 * 2]); mov(t9, ptr [px + 8 * 2]);
mulx(t2, t1, t7); mulx(t2, t1, t7);
mulx(t4, t3, t9); mulx(t4, t3, t9);
mov(t5, t2); mov(t5, t2);
mov(t6, t4); mov(t6, t4);
add(t0, t1); add(t0, t1);
adc(t5, t3); adc(t5, t3);
adc(t6, 0); // [t6:t5:t0] adc(t6, 0); // [t6:t5:t0]
mov(d, t7); mov(d, t7);
mulx(t8, t7, d); mulx(t8, t7, d);
mulx(t10, t9, t9); mulx(t10, t9, t9);
} else {
mov(t9, ptr [px + 8 * 0]);
mov(a, t9);
mul(t9);
mov(ptr [py + 8 * 0], a);
mov(t0, d);
mov(a, ptr [px + 8 * 1]);
mul(t9);
mov(t1, a);
mov(t2, d);
mov(a, ptr [px + 8 * 2]);
mul(t9);
mov(t3, a);
mov(t4, d);
mov(t5, t2);
mov(t6, t4);
add(t0, t1);
adc(t5, t3);
adc(t6, 0); // [t6:t5:t0]
mov(t9, ptr [px + 8 * 1]);
mov(a, t9);
mul(t9);
mov(t7, a);
mov(t8, d);
mov(a, ptr [px + 8 * 2]);
mul(t9);
mov(t9, a);
mov(t10, d);
}
add(t2, t7); add(t2, t7);
adc(t8, t9); adc(t8, t9);
mov(t7, t10); mov(t7, t10);
@ -1738,21 +1678,10 @@ private:
mov(d, ptr [px]); mov(d, ptr [px]);
mulx(pd[0], a, ptr [py + 8 * 0]); mulx(pd[0], a, ptr [py + 8 * 0]);
mov(ptr [pz + 8 * 0], a); mov(ptr [pz + 8 * 0], a);
if (useAdx_) { xor_(a, a);
xor_(a, a); for (size_t i = 1; i < pd.size(); i++) {
for (size_t i = 1; i < pd.size(); i++) { mulx(pd[i], a, ptr [py + 8 * i]);
mulx(pd[i], a, ptr [py + 8 * i]); adcx(pd[i - 1], a);
adcx(pd[i - 1], a);
}
} else {
for (size_t i = 1; i < pd.size(); i++) {
mulx(pd[i], a, ptr [py + 8 * i]);
if (i == 1) {
add(pd[i - 1], a);
} else {
adc(pd[i - 1], a);
}
}
} }
adc(pd[pd.size() - 1], 0); adc(pd[pd.size() - 1], 0);
} }
@ -1823,58 +1752,34 @@ private:
*/ */
void mulPre3(const RegExp& pz, const RegExp& px, const RegExp& py, const Pack& t) void mulPre3(const RegExp& pz, const RegExp& px, const RegExp& py, const Pack& t)
{ {
const Reg64& a = rax;
const Reg64& d = rdx; const Reg64& d = rdx;
const Reg64& t0 = t[0]; const Reg64& t0 = t[0];
const Reg64& t1 = t[1]; const Reg64& t1 = t[1];
const Reg64& t2 = t[2]; const Reg64& t2 = t[2];
const Reg64& t3 = t[3];
const Reg64& t4 = t[4]; const Reg64& t4 = t[4];
const Reg64& t5 = t[5]; const Reg64& t5 = t[5];
const Reg64& t6 = t[6]; const Reg64& t6 = t[6];
const Reg64& t7 = t[7];
const Reg64& t8 = t[8]; const Reg64& t8 = t[8];
const Reg64& t9 = t[9]; const Reg64& t9 = t[9];
if (useMulx_) { mulPack(pz, px, py, Pack(t2, t1, t0));
mulPack(pz, px, py, Pack(t2, t1, t0));
#if 0 // a little slow #if 0 // a little slow
if (useAdx_) { if (useAdx_) {
// [t2:t1:t0] // [t2:t1:t0]
mulPackAdd(pz + 8 * 1, px + 8 * 1, py, t3, Pack(t2, t1, t0)); mulPackAdd(pz + 8 * 1, px + 8 * 1, py, t3, Pack(t2, t1, t0));
// [t3:t2:t1] // [t3:t2:t1]
mulPackAdd(pz + 8 * 2, px + 8 * 2, py, t4, Pack(t3, t2, t1)); mulPackAdd(pz + 8 * 2, px + 8 * 2, py, t4, Pack(t3, t2, t1));
// [t4:t3:t2] // [t4:t3:t2]
store_mr(pz + 8 * 3, Pack(t4, t3, t2)); store_mr(pz + 8 * 3, Pack(t4, t3, t2));
return; return;
}
#endif
} else {
mov(t5, ptr [px]);
mov(a, ptr [py + 8 * 0]);
mul(t5);
mov(ptr [pz + 8 * 0], a);
mov(t0, d);
mov(a, ptr [py + 8 * 1]);
mul(t5);
mov(t3, a);
mov(t1, d);
mov(a, ptr [py + 8 * 2]);
mul(t5);
mov(t4, a);
mov(t2, d);
add(t0, t3);
mov(t2, 0);
adc(t1, a);
adc(t2, d); // [t2:t1:t0:pz[0]] = px[0] * py[2..0]
} }
#endif
// here [t2:t1:t0] // here [t2:t1:t0]
mov(t9, ptr [px + 8]); mov(t9, ptr [px + 8]);
// [d:t9:t6:t5] = px[1] * py[2..0] // [d:t9:t6:t5] = px[1] * py[2..0]
mul3x1(py, t9, t7, t6, t5, t4); mul3x1(py, t9, t6, t5, t4);
add_rr(Pack(t2, t1, t0), Pack(t9, t6, t5)); add_rr(Pack(t2, t1, t0), Pack(t9, t6, t5));
adc(d, 0); adc(d, 0);
mov(t8, d); mov(t8, d);
@ -1884,7 +1789,7 @@ private:
mov(t9, ptr [px + 16]); mov(t9, ptr [px + 16]);
// [d:t9:t5:t4] // [d:t9:t5:t4]
mul3x1(py, t9, t6, t5, t4, t0); mul3x1(py, t9, t5, t4, t0);
add_rr(Pack(t8, t2, t1), Pack(t9, t5, t4)); add_rr(Pack(t8, t2, t1), Pack(t9, t5, t4));
adc(d, 0); adc(d, 0);
store_mr(pz + 8 * 2, Pack(d, t8, t2, t1)); store_mr(pz + 8 * 2, Pack(d, t8, t2, t1));
@ -1909,7 +1814,6 @@ private:
*/ */
void sqr2(const Reg64& y3, const Reg64& y2, const Reg64& y1, const Reg64& y0, const Reg64& x1, const Reg64& x0, const Reg64& t1, const Reg64& t0) void sqr2(const Reg64& y3, const Reg64& y2, const Reg64& y1, const Reg64& y0, const Reg64& x1, const Reg64& x0, const Reg64& t1, const Reg64& t0)
{ {
assert(useMulx_);
mov(rdx, x0); mov(rdx, x0);
mulx(y1, y0, x0); // x0^2 mulx(y1, y0, x0); // x0^2
mov(rdx, x1); mov(rdx, x1);
@ -1928,7 +1832,6 @@ private:
*/ */
void mul2x2(const RegExp& px, const RegExp& py, const Reg64& t4, const Reg64& t3, const Reg64& t2, const Reg64& t1, const Reg64& t0) void mul2x2(const RegExp& px, const RegExp& py, const Reg64& t4, const Reg64& t3, const Reg64& t2, const Reg64& t1, const Reg64& t0)
{ {
assert(useMulx_);
#if 0 #if 0
// # of add is less, but a little slower // # of add is less, but a little slower
mov(t4, ptr [py + 8 * 0]); mov(t4, ptr [py + 8 * 0]);
@ -2043,58 +1946,7 @@ private:
*/ */
void sqrPre4(const Reg64& py, const Reg64& px, const Pack& t) void sqrPre4(const Reg64& py, const Reg64& px, const Pack& t)
{ {
#if 1 sqrPre4NF(py, px, t);
if (useMulx_ && useAdx_) {
sqrPre4NF(py, px, t);
return;
}
#endif
const Reg64& t0 = t[0];
const Reg64& t1 = t[1];
const Reg64& t2 = t[2];
const Reg64& t3 = t[3];
const Reg64& t4 = t[4];
const Reg64& t5 = t[5];
const Reg64& t6 = t[6];
const Reg64& t7 = t[7];
const Reg64& t8 = t[8];
const Reg64& t9 = t[9];
const Reg64& t10 = t[10];
const Reg64& a = rax;
const Reg64& d = rdx;
/*
(aN + b)^2 = a^2 N^2 + 2ab N + b^2
*/
load_rm(Pack(t9, t8), px);
sqr2(t3, t2, t1, t0, t9, t8, t7, t6);
// [t3:t2:t1:t0] = b^2
store_mr(py, Pack(t1, t0));
vmovq(xm0, t2);
mul2x2(px, px + 2 * 8, t6, t5, t4, t1, t0);
// [t5:t4:t1:t0] = ab
xor_(t6, t6);
add_rr(Pack(t6, t5, t4, t1, t0), Pack(t6, t5, t4, t1, t0));
// [t6:t5:t4:t1:t0] = 2ab
load_rm(Pack(t8, t7), px + 2 * 8);
// free t10, t9, rax, rdx
/*
[d:t8:t10:t9] = [t8:t7]^2
*/
mov(d, t7);
mulx(t10, t9, t7); // [t10:t9] = t7^2
mulx(t7, t2, t8); // [t7:t2] = t7 t8
xor_(a, a);
add_rr(Pack(a, t7, t2), Pack(a, t7, t2));
// [a:t7:t2] = 2 t7 t8
mov(d, t8);
mulx(d, t8, t8); // [d:t8] = t8^2
add_rr(Pack(d, t8, t10), Pack(a, t7, t2));
// [d:t8:t10:t9] = [t8:t7]^2
vmovq(t2, xm0);
add_rr(Pack(t8, t10, t9, t3, t2), Pack(t6, t5, t4, t1, t0));
adc(d, 0);
store_mr(py + 2 * 8, Pack(d, t8, t10, t9, t3, t2));
} }
/* /*
(5, 5)(4, 4)(3, 3)(2, 2)(1, 1)(0, 0) (5, 5)(4, 4)(3, 3)(2, 2)(1, 1)(0, 0)
@ -2221,7 +2073,6 @@ private:
*/ */
void mulPre4(const RegExp& pz, const RegExp& px, const RegExp& py, const Pack& t) void mulPre4(const RegExp& pz, const RegExp& px, const RegExp& py, const Pack& t)
{ {
const Reg64& a = rax;
const Reg64& d = rdx; const Reg64& d = rdx;
const Reg64& t0 = t[0]; const Reg64& t0 = t[0];
const Reg64& t1 = t[1]; const Reg64& t1 = t[1];
@ -2235,14 +2086,12 @@ private:
const Reg64& t9 = t[9]; const Reg64& t9 = t[9];
#if 0 // a little slower #if 0 // a little slower
if (useMulx_ && useAdx_) { mulPack(pz, px, py, Pack(t3, t2, t1, t0));
mulPack(pz, px, py, Pack(t3, t2, t1, t0)); mulPackAdd(pz + 8 * 1, px + 8 * 1, py, t4, Pack(t3, t2, t1, t0));
mulPackAdd(pz + 8 * 1, px + 8 * 1, py, t4, Pack(t3, t2, t1, t0)); mulPackAdd(pz + 8 * 2, px + 8 * 2, py, t0, Pack(t4, t3, t2, t1));
mulPackAdd(pz + 8 * 2, px + 8 * 2, py, t0, Pack(t4, t3, t2, t1)); mulPackAdd(pz + 8 * 3, px + 8 * 3, py, t1, Pack(t0, t4, t3, t2));
mulPackAdd(pz + 8 * 3, px + 8 * 3, py, t1, Pack(t0, t4, t3, t2)); store_mr(pz + 8 * 4, Pack(t1, t0, t4, t3));
store_mr(pz + 8 * 4, Pack(t1, t0, t4, t3)); return;
return;
}
#endif #endif
#if 0 #if 0
// a little slower // a little slower
@ -2266,37 +2115,14 @@ private:
adc(t7, 0); adc(t7, 0);
store_mr(pz + 8 * 2, Pack(t7, t4, t3, t2, t1, t0)); store_mr(pz + 8 * 2, Pack(t7, t4, t3, t2, t1, t0));
#else #else
if (useMulx_) { mulPack(pz, px, py, Pack(t3, t2, t1, t0));
mulPack(pz, px, py, Pack(t3, t2, t1, t0));
} else {
mov(t5, ptr [px]);
mov(a, ptr [py + 8 * 0]);
mul(t5);
mov(ptr [pz + 8 * 0], a);
mov(t0, d);
mov(a, ptr [py + 8 * 1]);
mul(t5);
mov(t3, a);
mov(t1, d);
mov(a, ptr [py + 8 * 2]);
mul(t5);
mov(t4, a);
mov(t2, d);
mov(a, ptr [py + 8 * 3]);
mul(t5);
add(t0, t3);
mov(t3, 0);
adc(t1, t4);
adc(t2, a);
adc(t3, d); // [t3:t2:t1:t0:pz[0]] = px[0] * py[3..0]
}
// here [t3:t2:t1:t0] // here [t3:t2:t1:t0]
mov(t9, ptr [px + 8]); mov(t9, ptr [px + 8]);
// [d:t9:t7:t6:t5] = px[1] * py[3..0] // [d:t9:t7:t6:t5] = px[1] * py[3..0]
mul4x1(py, t9, t8, t7, t6, t5, t4); mul4x1(py, t9, t7, t6, t5);
add_rr(Pack(t3, t2, t1, t0), Pack(t9, t7, t6, t5)); add_rr(Pack(t3, t2, t1, t0), Pack(t9, t7, t6, t5));
adc(d, 0); adc(d, 0);
mov(t8, d); mov(t8, d);
@ -2306,7 +2132,7 @@ private:
mov(t9, ptr [px + 16]); mov(t9, ptr [px + 16]);
// [d:t9:t6:t5:t4] // [d:t9:t6:t5:t4]
mul4x1(py, t9, t7, t6, t5, t4, t0); mul4x1(py, t9, t6, t5, t4);
add_rr(Pack(t8, t3, t2, t1), Pack(t9, t6, t5, t4)); add_rr(Pack(t8, t3, t2, t1), Pack(t9, t6, t5, t4));
adc(d, 0); adc(d, 0);
mov(t7, d); mov(t7, d);
@ -2315,7 +2141,7 @@ private:
mov(t9, ptr [px + 24]); mov(t9, ptr [px + 24]);
// [d:t9:t5:t4:t1] // [d:t9:t5:t4:t1]
mul4x1(py, t9, t6, t5, t4, t1, t0); mul4x1(py, t9, t5, t4, t1);
add_rr(Pack(t7, t8, t3, t2), Pack(t9, t5, t4, t1)); add_rr(Pack(t7, t8, t3, t2), Pack(t9, t5, t4, t1));
adc(d, 0); adc(d, 0);
store_mr(pz + 8 * 3, Pack(t7, t8, t3, t2)); store_mr(pz + 8 * 3, Pack(t7, t8, t3, t2));
@ -2502,7 +2328,6 @@ private:
} }
void gen_fpDbl_sqrPre(void2u& f) void gen_fpDbl_sqrPre(void2u& f)
{ {
if (!(useMulx_ && useAdx_)) return;
void2u func = getCurr<void2u>(); void2u func = getCurr<void2u>();
switch (pn_) { switch (pn_) {
case 2: case 2:
@ -2547,7 +2372,6 @@ private:
} }
void gen_fpDbl_mulPre(void3u& f) void gen_fpDbl_mulPre(void3u& f)
{ {
if (!useMulx_ || (pn_ == 6 && !useAdx_)) return;
void3u func = getCurr<void3u>(); void3u func = getCurr<void3u>();
switch (pn_) { switch (pn_) {
case 2: case 2:
@ -3245,78 +3069,40 @@ private:
} }
/* /*
[rdx:x:t0] <- py[1:0] * x [rdx:x:t0] <- py[1:0] * x
destroy x, t destroy x, t0
*/ */
void mul2x1(const RegExp& py, const Reg64& x, const Reg64& t0, const Reg64& t) void mul2x1(const RegExp& py, const Reg64& x, const Reg64& t0)
{ {
if (useMulx_) { // mulx(H, L, x) = [H:L] = x * rdx
// mulx(H, L, x) = [H:L] = x * rdx /*
/* rdx:x
rdx:x rax:t0
rax:t0 */
*/ mov(rdx, x);
mov(rdx, x); mulx(rax, t0, ptr [py]); // [rax:t0] = py[0] * x
mulx(rax, t0, ptr [py]); // [rax:t0] = py[0] * x mulx(rdx, x, ptr [py + 8]); // [t:t1] = py[1] * x
mulx(rdx, x, ptr [py + 8]); // [t:t1] = py[1] * x add(x, rax);
add(x, rax); adc(rdx, 0);
adc(rdx, 0);
} else {
mov(rax, ptr [py]);
mul(x);
mov(t0, rax);
mov(t, rdx);
mov(rax, ptr [py + 8]);
mul(x);
/*
rdx:rax
t:t0
*/
add(rax, t);
adc(rdx, 0);
mov(x, rax);
}
} }
/* /*
[rdx:x:t1:t0] <- py[2:1:0] * x [rdx:x:t1:t0] <- py[2:1:0] * x
destroy x, t destroy x, t
*/ */
void mul3x1(const RegExp& py, const Reg64& x, const Reg64& t2, const Reg64& t1, const Reg64& t0, const Reg64& t) void mul3x1(const RegExp& py, const Reg64& x, const Reg64& t1, const Reg64& t0, const Reg64& t)
{ {
if (useMulx_) { // mulx(H, L, x) = [H:L] = x * rdx
// mulx(H, L, x) = [H:L] = x * rdx /*
/* rdx:x
rdx:x t:t1
t:t1 rax:t0
rax:t0 */
*/ mov(rdx, x);
mov(rdx, x); mulx(rax, t0, ptr [py]); // [rax:t0] = py[0] * x
mulx(rax, t0, ptr [py]); // [rax:t0] = py[0] * x mulx(t, t1, ptr [py + 8]); // [t:t1] = py[1] * x
mulx(t, t1, ptr [py + 8]); // [t:t1] = py[1] * x add(t1, rax);
add(t1, rax); mulx(rdx, x, ptr [py + 8 * 2]);
mulx(rdx, x, ptr [py + 8 * 2]); adc(x, t);
adc(x, t); adc(rdx, 0);
adc(rdx, 0);
} else {
mov(rax, ptr [py]);
mul(x);
mov(t0, rax);
mov(t1, rdx);
mov(rax, ptr [py + 8]);
mul(x);
mov(t, rax);
mov(t2, rdx);
mov(rax, ptr [py + 8 * 2]);
mul(x);
/*
rdx:rax
t2:t
t1:t0
*/
add(t1, t);
adc(rax, t2);
adc(rdx, 0);
mov(x, rax);
}
} }
/* /*
[x2:x1:x0] * x0 [x2:x1:x0] * x0
@ -3399,7 +3185,7 @@ private:
*/ */
void montgomery3_sub(uint64_t pp, const Reg64& c3, const Reg64& c2, const Reg64& c1, const Reg64& c0, void montgomery3_sub(uint64_t pp, const Reg64& c3, const Reg64& c2, const Reg64& c1, const Reg64& c0,
const Reg64& /*px*/, const Reg64& y, const Reg64& p, const Reg64& /*px*/, const Reg64& y, const Reg64& p,
const Reg64& t0, const Reg64& t1, const Reg64& t2, const Reg64& t3, const Reg64& t4, bool isFirst) const Reg64& t0, const Reg64& t1, const Reg64& t3, const Reg64& t4, bool isFirst)
{ {
// input [c3:y:c1:0] // input [c3:y:c1:0]
// [t4:c3:y:c1:c0] // [t4:c3:y:c1:c0]
@ -3407,7 +3193,7 @@ private:
mov(rax, pp); mov(rax, pp);
mul(c0); // q = rax mul(c0); // q = rax
mov(c2, rax); mov(c2, rax);
mul3x1(p, c2, t2, t1, t0, t3); mul3x1(p, c2, t1, t0, t3);
// [rdx:c2:t1:t0] = p * q // [rdx:c2:t1:t0] = p * q
add(c0, t0); // always c0 is zero add(c0, t0); // always c0 is zero
adc(c1, t1); adc(c1, t1);
@ -3433,118 +3219,35 @@ private:
*/ */
void montgomery3_1(uint64_t pp, const Reg64& c3, const Reg64& c2, const Reg64& c1, const Reg64& c0, void montgomery3_1(uint64_t pp, const Reg64& c3, const Reg64& c2, const Reg64& c1, const Reg64& c0,
const Reg64& px, const Reg64& y, const Reg64& p, const Reg64& px, const Reg64& y, const Reg64& p,
const Reg64& t0, const Reg64& t1, const Reg64& t2, const Reg64& t3, const Reg64& t4, bool isFirst) const Reg64& t0, const Reg64& t1, const Reg64& t3, const Reg64& t4, bool isFirst)
{ {
if (isFirst) { if (isFirst) {
mul3x1(px, y, c2, c1, c0, c3); mul3x1(px, y, c1, c0, c3);
mov(c3, rdx); mov(c3, rdx);
// [c3:y:c1:c0] = px[2..0] * y // [c3:y:c1:c0] = px[2..0] * y
} else { } else {
mul3x1(px, y, t2, t1, t0, t3); mul3x1(px, y, t1, t0, t3);
// [rdx:y:t1:t0] = px[2..0] * y // [rdx:y:t1:t0] = px[2..0] * y
add_rr(Pack(c3, y, c1, c0), Pack(rdx, c2, t1, t0)); add_rr(Pack(c3, y, c1, c0), Pack(rdx, c2, t1, t0));
if (isFullBit_) setc(t4.cvt8()); if (isFullBit_) setc(t4.cvt8());
} }
montgomery3_sub(pp, c3, c2, c1, c0, px, y, p, t0, t1, t2, t3, t4, isFirst); montgomery3_sub(pp, c3, c2, c1, c0, px, y, p, t0, t1, t3, t4, isFirst);
}
/*
pc[0..n] += x[0..n-1] * y ; pc[] = 0 if isFirst
pc[n + 1] is temporary used if isFullBit_
q = uint64_t(pc[0] * pp)
pc[] = (pc[] + q * p) >> 64
input : pc[], px[], y, p[], pw1[], pw2[]
output : pc[0..n] ; if isFullBit_
pc[0..n-1] ; if !isFullBit_
destroy y
use
pw1[0] if useMulx_
pw1[0..n-2] otherwise
pw2[0..n-1]
*/
void montgomeryN_1(uint64_t pp, int n, const RegExp& pc, const RegExp& px, const Reg64& y, const Reg64& p, const Reg64& t, const MixPack& pw1, const RegExp& pw2, bool isFirst)
{
// pc[] += x[] * y
if (isFirst) {
gen_raw_mulUnit(pc, px, y, pw1, t, n);
mov(ptr [pc + n * 8], rdx);
} else {
gen_raw_mulUnit(pw2, px, y, pw1, t, n);
mov(t, ptr [pw2 + 0 * 8]);
add(ptr [pc + 0 * 8], t);
for (int i = 1; i < n; i++) {
mov(t, ptr [pw2 + i * 8]);
adc(ptr [pc + i * 8], t);
}
adc(ptr [pc + n * 8], rdx);
if (isFullBit_) {
mov(t, 0);
adc(t, 0);
mov(qword [pc + (n + 1) * 8], t);
}
}
mov(rax, pp);
mul(qword [pc]);
mov(y, rax); // y = q
gen_raw_mulUnit(pw2, p, y, pw1, t, n);
// c[] = (c[] + pw2[]) >> 64
mov(t, ptr [pw2 + 0 * 8]);
add(t, ptr [pc + 0 * 8]);
for (int i = 1; i < n; i++) {
mov(t, ptr [pw2 + i * 8]);
adc(t, ptr [pc + i * 8]);
mov(ptr [pc + (i - 1) * 8], t);
}
adc(rdx, ptr [pc + n * 8]);
mov(ptr [pc + (n - 1) * 8], rdx);
if (isFullBit_) {
if (isFirst) {
mov(t, 0);
} else {
mov(t, ptr [pc + (n + 1) * 8]);
}
adc(t, 0);
mov(qword [pc + n * 8], t);
} else {
xor_(eax, eax);
mov(ptr [pc + n * 8], rax);
}
} }
/* /*
[rdx:x:t2:t1:t0] <- py[3:2:1:0] * x [rdx:x:t2:t1:t0] <- py[3:2:1:0] * x
destroy x, t destroy x, t
*/ */
void mul4x1(const RegExp& py, const Reg64& x, const Reg64& t3, const Reg64& t2, const Reg64& t1, const Reg64& t0, const Reg64& t) void mul4x1(const RegExp& py, const Reg64& x, const Reg64& t2, const Reg64& t1, const Reg64& t0)
{ {
if (useMulx_) { mov(rdx, x);
mov(rdx, x); mulx(t1, t0, ptr [py + 8 * 0]);
mulx(t1, t0, ptr [py + 8 * 0]); mulx(t2, rax, ptr [py + 8 * 1]);
mulx(t2, rax, ptr [py + 8 * 1]); add(t1, rax);
add(t1, rax); mulx(x, rax, ptr [py + 8 * 2]);
mulx(x, rax, ptr [py + 8 * 2]); adc(t2, rax);
adc(t2, rax); mulx(rdx, rax, ptr [py + 8 * 3]);
mulx(rdx, rax, ptr [py + 8 * 3]); adc(x, rax);
adc(x, rax); adc(rdx, 0);
adc(rdx, 0);
} else {
mov(rax, ptr [py]);
mul(x);
mov(t0, rax);
mov(t1, rdx);
mov(rax, ptr [py + 8]);
mul(x);
mov(t, rax);
mov(t2, rdx);
mov(rax, ptr [py + 8 * 2]);
mul(x);
mov(t3, rax);
mov(rax, x);
mov(x, rdx);
mul(qword [py + 8 * 3]);
add(t1, t);
adc(t2, t3);
adc(x, rax);
adc(rdx, 0);
}
} }
/* /*
@ -3560,14 +3263,14 @@ private:
*/ */
void montgomery4_1(uint64_t pp, const Reg64& c4, const Reg64& c3, const Reg64& c2, const Reg64& c1, const Reg64& c0, void montgomery4_1(uint64_t pp, const Reg64& c4, const Reg64& c3, const Reg64& c2, const Reg64& c1, const Reg64& c0,
const Reg64& px, const Reg64& y, const Reg64& p, const Reg64& px, const Reg64& y, const Reg64& p,
const Reg64& t0, const Reg64& t1, const Reg64& t2, const Reg64& t3, const Reg64& t4, bool isFirst, const Xmm& xt) const Reg64& t0, const Reg64& t1, const Reg64& t2, bool isFirst, const Xmm& xt)
{ {
if (isFirst) { if (isFirst) {
mul4x1(px, y, c3, c2, c1, c0, c4); mul4x1(px, y, c2, c1, c0);
mov(c4, rdx); mov(c4, rdx);
// [c4:y:c2:c1:c0] = px[3..0] * y // [c4:y:c2:c1:c0] = px[3..0] * y
} else { } else {
mul4x1(px, y, t3, t2, t1, t0, t4); mul4x1(px, y, t2, t1, t0);
// [rdx:y:t2:t1:t0] = px[3..0] * y // [rdx:y:t2:t1:t0] = px[3..0] * y
if (isFullBit_) { if (isFullBit_) {
vmovq(xt, px); vmovq(xt, px);
@ -3583,7 +3286,7 @@ private:
mov(rax, pp); mov(rax, pp);
mul(c0); // q = rax mul(c0); // q = rax
mov(c3, rax); mov(c3, rax);
mul4x1(p, c3, t3, t2, t1, t0, t4); mul4x1(p, c3, t2, t1, t0);
add(c0, t0); // always c0 is zero add(c0, t0); // always c0 is zero
adc(c1, t1); adc(c1, t1);
adc(c2, t2); adc(c2, t2);
@ -3601,7 +3304,7 @@ private:
void3u gen_fp2Dbl_mulPre() void3u gen_fp2Dbl_mulPre()
{ {
if (isFullBit_) return 0; if (isFullBit_) return 0;
if (pn_ != 4 && !(pn_ == 6 && useMulx_ && useAdx_)) return 0; if (!(pn_ == 4 || pn_ == 6)) return 0;
void3u func = getCurr<void3u>(); void3u func = getCurr<void3u>();
bool embedded = pn_ == 4; bool embedded = pn_ == 4;
@ -3881,7 +3584,7 @@ private:
void3u gen_fp2_mul() void3u gen_fp2_mul()
{ {
if (isFullBit_) return 0; if (isFullBit_) return 0;
if (pn_ != 4 && !(pn_ == 6 && useMulx_ && useAdx_)) return 0; if (!(pn_ == 4 || pn_ == 6)) return 0;
void3u func = getCurr<void3u>(); void3u func = getCurr<void3u>();
int stackSize = 8 + FpByte_ * 4; int stackSize = 8 + FpByte_ * 4;
StackFrame sf(this, 3, 10 | UseRDX, stackSize); StackFrame sf(this, 3, 10 | UseRDX, stackSize);
@ -3903,7 +3606,7 @@ private:
void2u gen_fp2_sqr() void2u gen_fp2_sqr()
{ {
if (isFullBit_) return 0; if (isFullBit_) return 0;
if (pn_ != 4 && !(pn_ == 6 && useMulx_ && useAdx_)) return 0; if (!(pn_ == 4 || pn_ == 6)) return 0;
bool nocarry = (p_[pn_ - 1] >> 62) == 0; bool nocarry = (p_[pn_ - 1] >> 62) == 0;
if (!nocarry) return 0; if (!nocarry) return 0;
void2u func = getCurr<void2u>(); void2u func = getCurr<void2u>();

Loading…
Cancel
Save