diff --git a/src/fp_generator.hpp b/src/fp_generator.hpp index f09c265..fd5d59b 100644 --- a/src/fp_generator.hpp +++ b/src/fp_generator.hpp @@ -1450,6 +1450,27 @@ private: adox(c[n], t0); adc(c[n], 0); } + /* + (h, c[n..0]) = c[n..0] + px[n-1..0] * rdx + (cc << n) + h = 0 or 1 + use rax, t0 + */ + void mulAdd2(const Reg64& h, const Pack& c, int n, const RegExp& px, const Reg64& t0, const Reg64 *cc = 0, bool updateCarry = true) + { + assert(!isFullBit_); + const Reg64& a = rax; + xor_(h, h); // h = 0 + for (int i = 0; i < n; i++) { + mulx(t0, a, ptr [px + i * 8]); + adox(c[i], a); + if (i == n - 1) break; + adcx(c[i + 1], t0); + } + adox(t0, h); // no carry + if (cc) adox(t0, *cc); // no carry + adcx(c[n], t0); + if (updateCarry) adc(h, h); + } /* input c[5..0] @@ -1466,30 +1487,20 @@ private: c += p * q c >>= 64 */ - void montgomery6_1(const Pack& c, const RegExp& px, const RegExp& pp, const Reg64& t0, const Reg64& t1, bool isFirst) + void montgomery6_1(const Pack& c, const RegExp& px, const RegExp& pp, const Reg64& t1, bool isFirst) { assert(!isFullBit_); const int n = 6; - const Reg64& a = rax; const Reg64& d = rdx; if (isFirst) { // c[6..0] = px[5..0] * rdx - mulx(c[1], c[0], ptr [px + 0 * 8]); - for (int i = 1; i < n; i++) { - mulx(c[i + 1], a, ptr[px + i * 8]); - if (i == 1) { - add(c[i], a); - } else { - adc(c[i], a); - } - } - adc(c[n], 0); + mulPack1(c, n, px); } else { // c[6..0] = c[5..0] + px[5..0] * rdx because of not fuill bit mulAdd(c, 6, px, t1, true); } mov(d, rp_); - imul(d, c[0]); // q = d + imul(d, c[0]); // d = q = uint64_t(d * c[0]) // c[6..0] += p * q because of not fuill bit mulAdd(c, 6, pp, t1, false); } @@ -1521,17 +1532,17 @@ private: L(fp_mulL); lea(pp, ptr[rip+pL_]); mov(rdx, ptr [py + 0 * 8]); - montgomery6_1(Pack(t6, t5, t4, t3, t2, t1, t0), px, pp, t7, t8, true); + montgomery6_1(Pack(t6, t5, t4, t3, t2, t1, t0), px, pp, t8, true); mov(rdx, ptr [py + 1 * 8]); - montgomery6_1(Pack(t0, t6, t5, t4, t3, t2, t1), px, pp, t7, t8, false); + montgomery6_1(Pack(t0, t6, t5, t4, t3, t2, t1), px, pp, t8, false); mov(rdx, ptr [py + 2 * 8]); - montgomery6_1(Pack(t1, t0, t6, t5, t4, t3, t2), px, pp, t7, t8, false); + montgomery6_1(Pack(t1, t0, t6, t5, t4, t3, t2), px, pp, t8, false); mov(rdx, ptr [py + 3 * 8]); - montgomery6_1(Pack(t2, t1, t0, t6, t5, t4, t3), px, pp, t7, t8, false); + montgomery6_1(Pack(t2, t1, t0, t6, t5, t4, t3), px, pp, t8, false); mov(rdx, ptr [py + 4 * 8]); - montgomery6_1(Pack(t3, t2, t1, t0, t6, t5, t4), px, pp, t7, t8, false); + montgomery6_1(Pack(t3, t2, t1, t0, t6, t5, t4), px, pp, t8, false); mov(rdx, ptr [py + 5 * 8]); - montgomery6_1(Pack(t4, t3, t2, t1, t0, t6, t5), px, pp, t7, t8, false); + montgomery6_1(Pack(t4, t3, t2, t1, t0, t6, t5), px, pp, t8, false); const Pack z = Pack(t4, t3, t2, t1, t0, t6); const Pack keep = Pack(rdx, rax, px, py, t7, t8); @@ -1736,6 +1747,23 @@ private: adc(d, 0); store_mr(py + 8 * 2, Pack(d, t7, t6, t2)); } + /* + c[n..0] = px[n-1..0] * rdx + use rax + */ + void mulPack1(const Pack& c, int n, const RegExp& px) + { + mulx(c[1], c[0], ptr [px + 0 * 8]); + for (int i = 1; i < n; i++) { + mulx(c[i + 1], rax, ptr[px + i * 8]); + if (i == 1) { + add(c[i], rax); + } else { + adc(c[i], rax); + } + } + adc(c[n], 0); + } /* [pd:pz[0]] <- py[n-1..0] * px[0] */ @@ -2301,17 +2329,65 @@ private: const Reg64& t7 = t[7]; const Reg64& t8 = t[8]; const Reg64& t9 = t[9]; - const Reg64& t10 = t[10]; const Reg64& a = rax; const Reg64& d = rdx; +#if 0 + const Reg64& pp = t[10]; + lea(pp, ptr[rip + pL_]); + + mov(a, ptr[xy + 0 * 8]); + mov(d, rp_); + imul(d, a); // q + load_rm(Pack(t6, t5, t4, t3, t2, t1, t0), xy); + mulAdd2(t9, Pack(t6, t5, t4, t3, t2, t1, t0), 6, pp, t8); + // t9 : carry, [t6:t5:t4:t3:t2:t1:t0] += p * q + + mov(a, ptr[xy + 1 * 8]); + mov(d, rp_); + imul(d, a); + mov(t0, ptr[xy + 7 * 8]); + mulAdd2(t7, Pack(t0, t6, t5, t4, t3, t2, t1), 6, pp, t8, &t9); + + mov(a, ptr[xy + 2 * 8]); + mov(d, rp_); + imul(d, a); + mov(t1, ptr[xy + 8 * 8]); + mulAdd2(t9, Pack(t1, t7, t6, t5, t4, t3, t2), 6, pp, t8, &t7); + + mov(a, ptr[xy + 3 * 8]); + mov(d, rp_); + imul(d, a); + mov(t2, ptr[xy + 9 * 8]); + mulAdd2(t7, Pack(t2, t1, t7, t6, t5, t4, t3), 6, pp, t8, &t9); + + mov(a, ptr[xy + 4 * 8]); + mov(d, rp_); + imul(d, a); + mov(t3, ptr[xy + 10 * 8]); + mulAdd2(t9, Pack(t3, t2, t1, t7, t6, t5, t4), 6, pp, t8, &t7); + + mov(a, ptr[xy + 5 * 8]); + mov(d, rp_); + imul(d, a); + mov(t4, ptr[xy + 11 * 8]); + mulAdd2(t7, Pack(t4, t3, t2, t1, t7, t6, t5), 6, pp, t8, &t9, false); + + // z = [t4:t3:t2:t1:t7:t6] + Pack zp = Pack(t4, t3, t2, t1, t7, t6); + Pack keep = Pack(t0, xy, rax, rdx, t5, t8); + mov_rr(keep, zp); + sub_rm(zp, pp); // z -= p + cmovc_rr(zp, keep); + store_mr(z, zp); +#else + const Reg64& t10 = t[10]; vmovq(xm0, z); - mov(z, ptr [xy + 0 * 8]); - mov(a, rp_); - mul(z); + mov(a, ptr [xy + 0 * 8]); + mov(d, rp_); + imul(d, a); // q lea(t0, ptr [rip + pL_]); load_rm(Pack(t7, t6, t5, t4, t3, t2, t1), xy); - mov(d, a); // q mulPackAddShr(Pack(t7, t6, t5, t4, t3, t2, t1), t0, t10); load_rm(Pack(t1, t0, t10, t9, t8), xy + 7 * 8); adc(t8, rax); @@ -2373,6 +2449,7 @@ private: cmovc_rr(zp, keep); vmovq(z, xm0); store_mr(z, zp); +#endif } void2u gen_fpDbl_sqrPre() {