diff --git a/src/fp_generator.hpp b/src/fp_generator.hpp index 3f727e8..3a4981d 100644 --- a/src/fp_generator.hpp +++ b/src/fp_generator.hpp @@ -755,6 +755,21 @@ private: } return 0; } + void gen_raw_fp_sub_2(const RegExp& pz, const RegExp& px, const RegExp& py, const Pack& t, bool withCarry) + { + Pack t1 = t.sub(0, pn_); + Pack t2 = t.sub(pn_, pn_); + load_rm(t1, px); + sub_rm(t1, py, withCarry); + push(t1[0]); + lea(t1[0], ptr[rip + pL_]); + load_rm(t2, t1[0]); + sbb(t1[0], t1[0]); + and_pr(t2, t1[0]); + pop(t1[0]); + add_rr(t1, t2); + store_mr(pz, t1); + } void gen_raw_fp_sub6(const RegExp& pz, const RegExp& px, const RegExp& py, int offset, const Pack& t, bool withCarry) { load_rm(t, px + offset); @@ -767,39 +782,22 @@ private: L("@@"); store_mr(pz + offset, t); } - void gen_fp_sub6() - { - StackFrame sf(this, 3, 4); - const Reg64& pz = sf.p[0]; - const Reg64& px = sf.p[1]; - const Reg64& py = sf.p[2]; - Pack t = sf.t; - t.append(rax); - t.append(px); // |t| = 6 - gen_raw_fp_sub6(pz, px, py, 0, t, false); - } void3u gen_fp_sub() { + if (pn_ > 6) return 0; void3u func = getCurr(); - if (pn_ <= 4) { - gen_fp_sub_le4(); - return func; - } - if (pn_ == 6) { - gen_fp_sub6(); - return func; - } - StackFrame sf(this, 3); + /* + the jmp version is faster than the and-mask version in micro-benchmarks, + but the jmp version is slower for pairings, so the and-mask version is used + */ + int n = pn_ * 2 - 1; + StackFrame sf(this, 3, n); const Reg64& pz = sf.p[0]; const Reg64& px = sf.p[1]; const Reg64& py = sf.p[2]; - const Xbyak::CodeGenerator::LabelType jmpMode = pn_ < 5 ? 
T_AUTO : T_NEAR; - Label exit; - gen_raw_sub(pz, px, py, rax, pn_); - jnc(exit, jmpMode); - lea(px, ptr[rip+pL_]); - gen_raw_add(pz, pz, px, rax, pn_); - L(exit); + Pack t = sf.t; + t.append(rax); + gen_raw_fp_sub_2(pz, px, py, t, false); return func; } void2u gen_fp_neg() @@ -3513,8 +3511,6 @@ private: // almost same for pn_ == 6 if (pn_ != 4) return 0; void2u func = getCurr(); - // almost same for pn_ == 6 - if (pn_ != 4) return 0; const RegExp y = rsp + 0 * 8; const RegExp x = rsp + 1 * 8; const Ext1 t1(FpByte_, rsp, 2 * 8);