diff --git a/src/fp_generator.hpp b/src/fp_generator.hpp
index 7fe808e..392e3ba 100644
--- a/src/fp_generator.hpp
+++ b/src/fp_generator.hpp
@@ -340,21 +340,36 @@ private:
 					this function calls mulPreL directly.
 				*/
 				StackFrame sf(this, 3, 10 | UseRDX, 0, false);
-#if 0
-				call(mulPreL);
-#else
 				mulPre4(gp0, gp1, gp2, sf.t);
-#endif
 				sf.close(); // make epilog
 				L(mulPreL); // called only from asm code
 				mulPre4(gp0, gp1, gp2, sf.t);
 				ret();
+			} else if (op.N == 6 && useAdx_) {
+#if 1
+				StackFrame sf(this, 3, 7 | UseRDX, 0, false);
+				mulPre6(gp0, gp1, gp2, sf.t); // body emitted inline for the C entry, as in the N == 4 case
+				sf.close(); // make epilog
+				L(mulPreL); // called only from asm code
+				mulPre6(gp0, gp1, gp2, sf.t); // second copy: no prolog/epilog, just ret
+				ret();
+#else
+				{
+					StackFrame sf(this, 3, 7 | UseRDX);
+					mulPre6(gp0, gp1, gp2, sf.t);
+				}
+				{
+					StackFrame sf(this, 3, 10 | UseRDX, 0, false);
+					L(mulPreL); // called only from asm code
+					mulPre6(gp0, gp1, gp2, sf.t);
+					ret();
+				}
+#endif
 			} else {
 				gen_fpDbl_mulPre();
 			}
 		}
-		if (op.N > 4) return;
-		if (op.N == 2 || op.N == 3 || op.N == 4) {
+		if (op.N == 2 || op.N == 3 || op.N == 4 || (op.N == 6 && !isFullBit_ && useAdx_)) {
 			align(16);
 			op.fpDbl_modA_ = getCurr();
 			if (op.N == 4) {
@@ -364,10 +379,20 @@ private:
 				L(fpDbl_modL);
 				gen_fpDbl_mod4(gp0, gp1, sf.t, gp2);
 				ret();
+			} else if (op.N == 6 && !isFullBit_ && useAdx_) {
+				StackFrame sf(this, 3, 10 | UseRDX, 0, false);
+				call(fpDbl_modL);
+				sf.close();
+				L(fpDbl_modL); // shared body, reached through the call above
+				Pack t = sf.t;
+				t.append(gp2); // gen_fpDbl_mod6 reads t[0]..t[10], so add gp2 as the 11th register
+				gen_fpDbl_mod6(gp0, gp1, t);
+				ret();
 			} else {
 				gen_fpDbl_mod(op);
 			}
 		}
+		if (op.N > 4) return;
 		if ((useMulx_ && op.N == 2) || op.N == 3 || op.N == 4) {
 			align(16);
 			op.fpDbl_sqrPreA_ = getCurr();
@@ -1524,6 +1549,48 @@ private:
 		adcx(hi, d);
 		adox(hi, d);
 	}
+	/*
+		input : z[n] (n >= 3), p[n-1] (words at address p), rdx(implicit multiplier), H : scratch register
+		output: z[] += p[] * rdx, rax = 0 and set CF (last = true selects the final-round variant)
+		use rax, rdx, H (rdx is destroyed unless last is true)
+	*/
+	void mulPackAddShr(const Pack& z, const RegExp& p, const Reg64& H, bool last = false)
+	{
+		const Reg64& a = rax;
+		const size_t n = z.size();
+		assert(n >= 3); // must follow the declaration of n; the tail below indexes z[n - 3]
+		// clear CF and OF
+		xor_(a, a);
+		const size_t loop = last ? n - 1 : n - 3;
+		for (size_t i = 0; i < loop; i++) {
+			// mulx(H, L, x) = [H:L] = x * rdx
+			mulx(H, a, ptr [p + i * 8]);
+			adox(z[i], a);
+			adcx(z[i + 1], H);
+		}
+		if (last) {
+			mov(a, 0); // mov, not xor: the pending CF/OF must stay intact
+			adox(z[n - 1], a);
+			return;
+		}
+		/*
+			reorder addition not to propagate OF outside this routine
+			         H
+			         +
+			  rdx    a
+			   |     |
+			   v     v
+			z[n-1] z[n-2]
+		*/
+		mulx(H, a, ptr [p + (n - 3) * 8]);
+		adox(z[n - 3], a);
+		mulx(rdx, a, ptr [p + (n - 2) * 8]); // destroy rdx
+		adox(H, a);
+		mov(a, 0); // mov, not xor: the pending CF/OF must stay intact
+		adox(rdx, a);
+		adcx(z[n - 2], H);
+		adcx(z[n - 1], rdx);
+	}
 	/*
 		pz[5..0] <- px[2..0] * py[2..0]
 	*/
@@ -1845,6 +1912,97 @@ private:
 		mulPackAdd(pz + 8 * 5, px + 8 * 5, py, t3, Pack(t2, t1, t0, t6, t5, t4)); // [t3:t2:t1:t0:t6:t5]
 		store_mr(pz + 8 * 6, Pack(t3, t2, t1, t0, t6, t5));
 	}
+	/*
+		@input (z, xy), t : 11 scratch registers t[0]..t[10]
+		z[5..0] <- montgomery reduction(x[11..0]) where x is the 12-word value at xy
+		use xm0, xm1, xm2, rax, rdx; xy is overwritten, z is used as a temporary and restored from xm0
+	*/
+	void gen_fpDbl_mod6(const Reg64& z, const Reg64& xy, const Pack& t)
+	{
+		assert(!isFullBit_);
+		const Reg64& t0 = t[0];
+		const Reg64& t1 = t[1];
+		const Reg64& t2 = t[2];
+		const Reg64& t3 = t[3];
+		const Reg64& t4 = t[4];
+		const Reg64& t5 = t[5];
+		const Reg64& t6 = t[6];
+		const Reg64& t7 = t[7];
+		const Reg64& t8 = t[8];
+		const Reg64& t9 = t[9];
+		const Reg64& t10 = t[10];
+
+		const Reg64& a = rax;
+		const Reg64& d = rdx;
+		movq(xm0, z); // save the output pointer; z is reused as a temporary
+		mov(z, ptr [xy + 0 * 8]);
+		mov(a, rp_);
+		mul(z);
+		lea(t0, ptr [rip + *pL_]);
+		load_rm(Pack(t7, t6, t5, t4, t3, t2, t1), xy);
+		mov(d, a); // q
+		mulPackAddShr(Pack(t7, t6, t5, t4, t3, t2, t1), t0, t10);
+		load_rm(Pack(t1, t0, t10, t9, t8), xy + 7 * 8);
+		adc(t8, rax);
+		adc(t9, rax);
+		adc(t10, rax);
+		adc(t0, rax);
+		adc(t1, rax);
+		// z = [t1:t0:t10:t9:t8:t7:t6:t5:t4:t3:t2]
+		mov(a, rp_);
+		mul(t2);
+		movq(xm1, t0); // save
+		lea(t0, ptr [rip + *pL_]);
+		mov(d, a);
+		movq(xm2, t10);
+		mulPackAddShr(Pack(t8, t7, t6, t5, t4, t3, t2), t0, t10);
+		movq(t10, xm2);
+		adc(t9, rax);
+		adc(t10, rax);
+		movq(t0, xm1); // load
+		adc(t0, rax);
+		adc(t1, rax);
+		// z = [t1:t0:t10:t9:t8:t7:t6:t5:t4:t3]
+		mov(a, rp_);
+		mul(t3);
+		lea(t2, ptr [rip + *pL_]);
+		mov(d, a);
+		movq(xm2, t10);
+		mulPackAddShr(Pack(t9, t8, t7, t6, t5, t4, t3), t2, t10);
+		movq(t10, xm2);
+		adc(t10, rax);
+		adc(t0, rax);
+		adc(t1, rax);
+		// z = [t1:t0:t10:t9:t8:t7:t6:t5:t4]
+		mov(a, rp_);
+		mul(t4);
+		lea(t2, ptr [rip + *pL_]);
+		mov(d, a);
+		mulPackAddShr(Pack(t10, t9, t8, t7, t6, t5, t4), t2, t3);
+		adc(t0, rax);
+		adc(t1, rax);
+		// z = [t1:t0:t10:t9:t8:t7:t6:t5]
+		mov(a, rp_);
+		mul(t5);
+		lea(t2, ptr [rip + *pL_]);
+		mov(d, a);
+		mulPackAddShr(Pack(t0, t10, t9, t8, t7, t6, t5), t2, t3);
+		adc(t1, a);
+		// z = [t1:t0:t10:t9:t8:t7:t6]
+		mov(a, rp_);
+		mul(t6);
+		lea(t2, ptr [rip + *pL_]);
+		mov(d, a);
+		mulPackAddShr(Pack(t1, t0, t10, t9, t8, t7, t6), t2, t3, true);
+		// z = [t1:t0:t10:t9:t8:t7]
+		Pack zp = Pack(t1, t0, t10, t9, t8, t7);
+		Pack keep = Pack(z, xy, rax, rdx, t3, t6);
+		mov_rr(keep, zp);
+		sub_rm(zp, t2); // z -= p
+		cmovc_rr(zp, keep); // on borrow, take back the copy saved before the subtraction
+		movq(z, xm0); // restore the output pointer
+		store_mr(z, zp);
+	}
 	void gen_fpDbl_sqrPre(mcl::fp::Op& op)
 	{
 		if (useMulx_ && pn_ == 2) {
@@ -1881,16 +2039,8 @@ private:
 			mulPre3(sf.p[0], sf.p[1], sf.p[2], sf.t);
 			return;
 		}
-		if (pn_ == 4) {
-			StackFrame sf(this, 3, 10 | UseRDX);
-			mulPre4(sf.p[0], sf.p[1], sf.p[2], sf.t);
-			return;
-		}
-		// 64clk -> 56clk
-		if (pn_ == 6 && useAdx_) {
-			StackFrame sf(this, 3, 10 | UseRDX); // 7 is ok, but to use same api
-			mulPre6(sf.p[0], sf.p[1], sf.p[2], sf.t);
-		}
+		assert(0); // no generator for this size remains in gen_fpDbl_mulPre
+		exit(1);
 	}
 	static inline void debug_put_inner(const uint64_t *ptr, int n)
 	{