diff --git a/include/mcl/fp_generator.hpp b/include/mcl/fp_generator.hpp index 14f9524..1eb04ce 100644 --- a/include/mcl/fp_generator.hpp +++ b/include/mcl/fp_generator.hpp @@ -194,7 +194,7 @@ struct FpGenerator : Xbyak::CodeGenerator { gen_fp_add(); align(16); op.fp_sub = getCurr(); - gen_sub(); + gen_fp_sub(); if (op.isFullBit) { op.fp_addNC = op.fp_add; @@ -234,8 +234,12 @@ struct FpGenerator : Xbyak::CodeGenerator { align(16); op.fpDbl_add = getCurr(); gen_fpDbl_add(); + align(16); + op.fpDbl_sub = getCurr(); + gen_fpDbl_sub(); if (op.isFullBit) { op.fpDbl_addNC = op.fpDbl_add; + op.fpDbl_subNC = op.fpDbl_sub; } else { align(16); op.fpDbl_addNC = getCurr(); @@ -429,69 +433,49 @@ struct FpGenerator : Xbyak::CodeGenerator { if (fullReg) { sbb(*fullReg, 0); } - cmovc_rr(p1, p0); + for (size_t i = 0; i < p1.size(); i++) { + cmovc(p1[i], p0[i]); + } store_mr(pz, p1); } - void gen_in_fp_add(const RegExp& pz, const RegExp& px, const RegExp& py, const Pack& p0, const Pack& p1, bool withCarry, const Reg64 *fullReg) + /* + pz[] = px[] - py[] mod p[] + use rax, t + */ + void gen_in_fp_sub(const RegExp& pz, const RegExp& px, const RegExp& py, const Pack& t, bool withCarry) { + const Pack& p0 = t.sub(0, pn_); + const Pack& p1 = t.sub(pn_, pn_); load_rm(p0, px); - add_rm(p0, py, withCarry); - mov_rr(p1, p0); - if (isFullBit_) { - mov(*fullReg, 0); - adc(*fullReg, 0); - } + sub_rm(p0, py, withCarry); mov(rax, (size_t)p_); - sub_rm(p1, rax); - if (isFullBit_) { - sbb(*fullReg, 0); + load_rm(p1, rax); + sbb(rax, rax); // rax = (x >= y) ? 0 : -1 + for (size_t i = 0; i < p1.size(); i++) { + and_(p1[i], rax); } - cmovc_rr(p1, p0); - store_mr(pz, p1); + add_rr(p0, p1); + store_mr(pz, p0); } void gen_fp_add_le4() { - const bool withCarry = false; assert(pn_ <= 4); const int tn = pn_ * 2 + (isFullBit_ ? 
1 : 0); StackFrame sf(this, 3, tn); const Reg64& pz = sf.p[0]; const Reg64& px = sf.p[1]; const Reg64& py = sf.p[2]; - gen_in_fp_add(pz, px, py, sf.t, withCarry); + gen_in_fp_add(pz, px, py, sf.t, false); } - void gen_subMod_le4(int n) + void gen_fp_sub_le4() { - assert(2 <= n && n <= 4); - StackFrame sf(this, 3, (n - 1) * 2); + assert(pn_ <= 4); + const int tn = pn_ * 2; + StackFrame sf(this, 3, tn); const Reg64& pz = sf.p[0]; const Reg64& px = sf.p[1]; const Reg64& py = sf.p[2]; - - Pack rx = sf.t.sub(0, n - 1); - rx.append(px); // rx = [px, t1, t0] - Pack ry = sf.t.sub(n - 1, n - 1); - ry.append(rax); // ry = [rax, t3, t2] - - load_rm(rx, px); // destroy px - sub_rm(rx, py); -#if 0 - sbb(ry[0], ry[0]); // rx[0] = (x > y) ? 0 : -1 - for (int i = 1; i < n; i++) mov(ry[i], ry[0]); - mov(py, (size_t)p_); - for (int i = 0; i < n; i++) and_(ry[i], qword [py + 8 * i]); - add_rr(rx, ry); -#else - // a little faster - sbb(py, py); // py = (x > y) ? 0 : -1 - mov(rax, (size_t)p_); - load_rm(ry, rax); // destroy rax - for (size_t i = 0; i < ry.size(); i++) { - and_(ry[i], py); - } - add_rr(rx, ry); -#endif - store_mr(pz, rx); + gen_in_fp_sub(pz, px, py, sf.t, false); } void gen_fp_add() { @@ -545,10 +529,21 @@ struct FpGenerator : Xbyak::CodeGenerator { gen_raw_add(pz, px, py, rax, pn_); gen_in_fp_add(pz + 8 * pn_, px + 8 * pn_, py + 8 * pn_, sf.t, true); } - void gen_sub() + void gen_fpDbl_sub() + { + assert(pn_ <= 4); + int tn = pn_ * 2; + StackFrame sf(this, 3, tn); + const Reg64& pz = sf.p[0]; + const Reg64& px = sf.p[1]; + const Reg64& py = sf.p[2]; + gen_raw_sub(pz, px, py, rax, pn_); + gen_in_fp_sub(pz + 8 * pn_, px + 8 * pn_, py + 8 * pn_, sf.t, true); + } + void gen_fp_sub() { if (pn_ <= 4) { - gen_subMod_le4(pn_); + gen_fp_sub_le4(); return; } StackFrame sf(this, 3); @@ -1594,9 +1589,13 @@ private: /* z[] -= m[] */ - void sub_rm(const Pack& z, const RegExp& m) + void sub_rm(const Pack& z, const RegExp& m, bool withCarry = false) { - sub(z[0], ptr [m + 8 * 0]); + 
if (withCarry) { + sbb(z[0], ptr [m + 8 * 0]); + } else { + sub(z[0], ptr [m + 8 * 0]); + } for (int i = 1, n = (int)z.size(); i < n; i++) { sbb(z[i], ptr [m + 8 * i]); }