refactor montgomery6_1

update-fork
MITSUNARI Shigeo 4 years ago
parent 4e0e7c9089
commit 70e8b3f992
  1. 125
      src/fp_generator.hpp

@ -1450,6 +1450,27 @@ private:
adox(c[n], t0);
adc(c[n], 0);
}
/*
(h, c[n..0]) = c[n..0] + px[n-1..0] * rdx + (cc << n)
h = 0 or 1
use rax, t0
*/
void mulAdd2(const Reg64& h, const Pack& c, int n, const RegExp& px, const Reg64& t0, const Reg64 *cc = 0, bool updateCarry = true)
{
assert(!isFullBit_);
const Reg64& a = rax;
xor_(h, h); // h = 0
for (int i = 0; i < n; i++) {
mulx(t0, a, ptr [px + i * 8]);
adox(c[i], a);
if (i == n - 1) break;
adcx(c[i + 1], t0);
}
adox(t0, h); // no carry
if (cc) adox(t0, *cc); // no carry
adcx(c[n], t0);
if (updateCarry) adc(h, h);
}
/*
input
c[5..0]
@ -1466,30 +1487,20 @@ private:
c += p * q
c >>= 64
*/
void montgomery6_1(const Pack& c, const RegExp& px, const RegExp& pp, const Reg64& t0, const Reg64& t1, bool isFirst)
void montgomery6_1(const Pack& c, const RegExp& px, const RegExp& pp, const Reg64& t1, bool isFirst)
{
assert(!isFullBit_);
const int n = 6;
const Reg64& a = rax;
const Reg64& d = rdx;
if (isFirst) {
// c[6..0] = px[5..0] * rdx
mulx(c[1], c[0], ptr [px + 0 * 8]);
for (int i = 1; i < n; i++) {
mulx(c[i + 1], a, ptr[px + i * 8]);
if (i == 1) {
add(c[i], a);
} else {
adc(c[i], a);
}
}
adc(c[n], 0);
mulPack1(c, n, px);
} else {
// c[6..0] = c[5..0] + px[5..0] * rdx because of not fuill bit
mulAdd(c, 6, px, t1, true);
}
mov(d, rp_);
imul(d, c[0]); // q = d
imul(d, c[0]); // d = q = uint64_t(d * c[0])
// c[6..0] += p * q because of not fuill bit
mulAdd(c, 6, pp, t1, false);
}
@ -1521,17 +1532,17 @@ private:
L(fp_mulL);
lea(pp, ptr[rip+pL_]);
mov(rdx, ptr [py + 0 * 8]);
montgomery6_1(Pack(t6, t5, t4, t3, t2, t1, t0), px, pp, t7, t8, true);
montgomery6_1(Pack(t6, t5, t4, t3, t2, t1, t0), px, pp, t8, true);
mov(rdx, ptr [py + 1 * 8]);
montgomery6_1(Pack(t0, t6, t5, t4, t3, t2, t1), px, pp, t7, t8, false);
montgomery6_1(Pack(t0, t6, t5, t4, t3, t2, t1), px, pp, t8, false);
mov(rdx, ptr [py + 2 * 8]);
montgomery6_1(Pack(t1, t0, t6, t5, t4, t3, t2), px, pp, t7, t8, false);
montgomery6_1(Pack(t1, t0, t6, t5, t4, t3, t2), px, pp, t8, false);
mov(rdx, ptr [py + 3 * 8]);
montgomery6_1(Pack(t2, t1, t0, t6, t5, t4, t3), px, pp, t7, t8, false);
montgomery6_1(Pack(t2, t1, t0, t6, t5, t4, t3), px, pp, t8, false);
mov(rdx, ptr [py + 4 * 8]);
montgomery6_1(Pack(t3, t2, t1, t0, t6, t5, t4), px, pp, t7, t8, false);
montgomery6_1(Pack(t3, t2, t1, t0, t6, t5, t4), px, pp, t8, false);
mov(rdx, ptr [py + 5 * 8]);
montgomery6_1(Pack(t4, t3, t2, t1, t0, t6, t5), px, pp, t7, t8, false);
montgomery6_1(Pack(t4, t3, t2, t1, t0, t6, t5), px, pp, t8, false);
const Pack z = Pack(t4, t3, t2, t1, t0, t6);
const Pack keep = Pack(rdx, rax, px, py, t7, t8);
@ -1736,6 +1747,23 @@ private:
adc(d, 0);
store_mr(py + 8 * 2, Pack(d, t7, t6, t2));
}
/*
c[n..0] = px[n-1..0] * rdx
use rax
*/
void mulPack1(const Pack& c, int n, const RegExp& px)
{
mulx(c[1], c[0], ptr [px + 0 * 8]);
for (int i = 1; i < n; i++) {
mulx(c[i + 1], rax, ptr[px + i * 8]);
if (i == 1) {
add(c[i], rax);
} else {
adc(c[i], rax);
}
}
adc(c[n], 0);
}
/*
[pd:pz[0]] <- py[n-1..0] * px[0]
*/
@ -2301,17 +2329,65 @@ private:
const Reg64& t7 = t[7];
const Reg64& t8 = t[8];
const Reg64& t9 = t[9];
const Reg64& t10 = t[10];
const Reg64& a = rax;
const Reg64& d = rdx;
#if 0
const Reg64& pp = t[10];
lea(pp, ptr[rip + pL_]);
mov(a, ptr[xy + 0 * 8]);
mov(d, rp_);
imul(d, a); // q
load_rm(Pack(t6, t5, t4, t3, t2, t1, t0), xy);
mulAdd2(t9, Pack(t6, t5, t4, t3, t2, t1, t0), 6, pp, t8);
// t9 : carry, [t6:t5:t4:t3:t2:t1:t0] += p * q
mov(a, ptr[xy + 1 * 8]);
mov(d, rp_);
imul(d, a);
mov(t0, ptr[xy + 7 * 8]);
mulAdd2(t7, Pack(t0, t6, t5, t4, t3, t2, t1), 6, pp, t8, &t9);
mov(a, ptr[xy + 2 * 8]);
mov(d, rp_);
imul(d, a);
mov(t1, ptr[xy + 8 * 8]);
mulAdd2(t9, Pack(t1, t7, t6, t5, t4, t3, t2), 6, pp, t8, &t7);
mov(a, ptr[xy + 3 * 8]);
mov(d, rp_);
imul(d, a);
mov(t2, ptr[xy + 9 * 8]);
mulAdd2(t7, Pack(t2, t1, t7, t6, t5, t4, t3), 6, pp, t8, &t9);
mov(a, ptr[xy + 4 * 8]);
mov(d, rp_);
imul(d, a);
mov(t3, ptr[xy + 10 * 8]);
mulAdd2(t9, Pack(t3, t2, t1, t7, t6, t5, t4), 6, pp, t8, &t7);
mov(a, ptr[xy + 5 * 8]);
mov(d, rp_);
imul(d, a);
mov(t4, ptr[xy + 11 * 8]);
mulAdd2(t7, Pack(t4, t3, t2, t1, t7, t6, t5), 6, pp, t8, &t9, false);
// z = [t4:t3:t2:t1:t7:t6]
Pack zp = Pack(t4, t3, t2, t1, t7, t6);
Pack keep = Pack(t0, xy, rax, rdx, t5, t8);
mov_rr(keep, zp);
sub_rm(zp, pp); // z -= p
cmovc_rr(zp, keep);
store_mr(z, zp);
#else
const Reg64& t10 = t[10];
vmovq(xm0, z);
mov(z, ptr [xy + 0 * 8]);
mov(a, rp_);
mul(z);
mov(a, ptr [xy + 0 * 8]);
mov(d, rp_);
imul(d, a); // q
lea(t0, ptr [rip + pL_]);
load_rm(Pack(t7, t6, t5, t4, t3, t2, t1), xy);
mov(d, a); // q
mulPackAddShr(Pack(t7, t6, t5, t4, t3, t2, t1), t0, t10);
load_rm(Pack(t1, t0, t10, t9, t8), xy + 7 * 8);
adc(t8, rax);
@ -2373,6 +2449,7 @@ private:
cmovc_rr(zp, keep);
vmovq(z, xm0);
store_mr(z, zp);
#endif
}
void2u gen_fpDbl_sqrPre()
{

Loading…
Cancel
Save