remove unnecessary add

update-fork
MITSUNARI Shigeo 4 years ago
parent 106082b9ac
commit 0731c980b9
  1. 23
      src/fp_generator.hpp

@ -1428,11 +1428,12 @@ private:
ret(); ret();
} }
/* /*
c[n+2] = c[n+1] + px[n] * rdx c[n..0] = c[n..0] + px[n-1..0] * rdx
use rax, t0 use rax, t0
*/ */
void mulAdd(const Pack& c, int n, const RegExp& px, const Reg64& t0) void mulAdd(const Pack& c, int n, const RegExp& px, const Reg64& t0)
{ {
assert(!isFullBit_);
const Reg64& a = rax; const Reg64& a = rax;
xor_(c[n + 1], c[n + 1]); // c[n + 1] = 0 xor_(c[n + 1], c[n + 1]); // c[n + 1] = 0
for (int i = 0; i < n; i++) { for (int i = 0; i < n; i++) {
@ -1441,17 +1442,16 @@ private:
if (i == n - 1) break; if (i == n - 1) break;
adcx(c[i + 1], t0); adcx(c[i + 1], t0);
} }
adox(t0, c[n + 1]); // carry o adox(c[n], t0);
adcx(c[n], t0); adc(c[n], 0);
adc(c[n + 1], 0);
} }
/* /*
input input
c[6..0] c[5..0]
rdx = yi rdx = yi
use rax, rdx use rax, rdx
output output
c[7..1] c[6..1]
if first: if first:
c = x[5..0] * rdx c = x[5..0] * rdx
@ -1463,6 +1463,7 @@ private:
*/ */
void montgomery6_1(const Pack& c, const RegExp& px, const Reg64& t0, const Reg64& t1, bool isFirst) void montgomery6_1(const Pack& c, const RegExp& px, const Reg64& t0, const Reg64& t1, bool isFirst)
{ {
assert(!isFullBit_);
const int n = 6; const int n = 6;
const Reg64& a = rax; const Reg64& a = rax;
const Reg64& d = rdx; const Reg64& d = rdx;
@ -1483,13 +1484,13 @@ private:
adc(*pt0, 0); adc(*pt0, 0);
mov(c[n], *pt0); mov(c[n], *pt0);
} else { } else {
// c[7..0] = c[6..0] + px[5..0] * rdx // c[5..0] = c[5..0] + px[5..0] * rdx because of not full bit
mulAdd(c, 6, px, t1); mulAdd(c, 6, px, t1);
} }
mov(d, rp_); mov(d, rp_);
imul(d, c[0]); // q = d imul(d, c[0]); // q = d
lea(t0, ptr[rip+pL_]); lea(t0, ptr[rip+pL_]);
// c += p * q // c[5..0] += p * q because of not full bit
mulAdd(c, 6, t0, t1); mulAdd(c, 6, t0, t1);
} }
/* /*
@ -1520,22 +1521,16 @@ private:
L(fp_mulL); L(fp_mulL);
mov(rdx, ptr [py + 0 * 8]); mov(rdx, ptr [py + 0 * 8]);
montgomery6_1(Pack(t7, t6, t5, t4, t3, t2, t1, t0), px, t8, t9, true); montgomery6_1(Pack(t7, t6, t5, t4, t3, t2, t1, t0), px, t8, t9, true);
xor_(t7, t7);
mov(rdx, ptr [py + 1 * 8]); mov(rdx, ptr [py + 1 * 8]);
montgomery6_1(Pack(t0, t7, t6, t5, t4, t3, t2, t1), px, t8, t9, false); montgomery6_1(Pack(t0, t7, t6, t5, t4, t3, t2, t1), px, t8, t9, false);
xor_(t0, t0);
mov(rdx, ptr [py + 2 * 8]); mov(rdx, ptr [py + 2 * 8]);
montgomery6_1(Pack(t1, t0, t7, t6, t5, t4, t3, t2), px, t8, t9, false); montgomery6_1(Pack(t1, t0, t7, t6, t5, t4, t3, t2), px, t8, t9, false);
xor_(t1, t1);
mov(rdx, ptr [py + 3 * 8]); mov(rdx, ptr [py + 3 * 8]);
montgomery6_1(Pack(t2, t1, t0, t7, t6, t5, t4, t3), px, t8, t9, false); montgomery6_1(Pack(t2, t1, t0, t7, t6, t5, t4, t3), px, t8, t9, false);
xor_(t2, t2);
mov(rdx, ptr [py + 4 * 8]); mov(rdx, ptr [py + 4 * 8]);
montgomery6_1(Pack(t3, t2, t1, t0, t7, t6, t5, t4), px, t8, t9, false); montgomery6_1(Pack(t3, t2, t1, t0, t7, t6, t5, t4), px, t8, t9, false);
xor_(t3, t3);
mov(rdx, ptr [py + 5 * 8]); mov(rdx, ptr [py + 5 * 8]);
montgomery6_1(Pack(t4, t3, t2, t1, t0, t7, t6, t5), px, t8, t9, false); montgomery6_1(Pack(t4, t3, t2, t1, t0, t7, t6, t5), px, t8, t9, false);
xor_(t4, t4);
// [t4:t3:t2:t1:t0:t7:t6] // [t4:t3:t2:t1:t0:t7:t6]
const Pack z = Pack(t3, t2, t1, t0, t7, t6); const Pack z = Pack(t3, t2, t1, t0, t7, t6);
const Pack keep = Pack(rdx, rax, px, py, t8, t9); const Pack keep = Pack(rdx, rax, px, py, t8, t9);

Loading…
Cancel
Save