refactor gen_fp_add

dev
MITSUNARI Shigeo 9 years ago
parent e29d182b4b
commit 1a84d4eeb8
  1. 122
      include/mcl/fp_generator.hpp

@ -191,7 +191,7 @@ struct FpGenerator : Xbyak::CodeGenerator {
setSize(0); // reset code setSize(0); // reset code
align(16); align(16);
op.fp_add = getCurr<void3u>(); op.fp_add = getCurr<void3u>();
gen_add(); gen_fp_add();
align(16); align(16);
op.fp_sub = getCurr<void3u>(); op.fp_sub = getCurr<void3u>();
gen_sub(); gen_sub();
@ -408,76 +408,57 @@ struct FpGenerator : Xbyak::CodeGenerator {
mov(ptr [pz + i * 8], t); mov(ptr [pz + i * 8], t);
} }
} }
void gen_inAddMod3(const RegExp& pz, const RegExp& px, const RegExp& py, const StackFrame& sf, bool withCarry) /*
{ pz[] = px[] + py[] mod p[]
const Reg64& t0 = sf.t[0]; use rax, t
const Reg64& t1 = sf.t[1]; */
const Reg64& t2 = sf.t[2]; void gen_in_fp_add(const RegExp& pz, const RegExp& px, const RegExp& py, const Pack& t, bool withCarry)
const Reg64& t3 = sf.t[3]; {
const Reg64& t4 = sf.t[4]; const Pack& p0 = t.sub(0, pn_);
const Reg64& t5 = sf.t[5]; const Pack& p1 = t.sub(pn_, pn_);
const Reg64 *fullReg = isFullBit_ ? &t[pn_ * 2] : 0;
load_rm(Pack(t2, t1, t0), px); load_rm(p0, px);
add_rm(Pack(t2, t1, t0), py, withCarry); add_rm(p0, py, withCarry);
mov_rr(Pack(t5, t4, t3), Pack(t2, t1, t0)); mov_rr(p1, p0);
if (isFullBit_) { if (fullReg) {
mov(sf.t[6], 0); mov(*fullReg, 0);
adc(sf.t[6], 0); adc(*fullReg, 0);
} }
mov(rax, (size_t)p_); mov(rax, (size_t)p_);
sub_rm(Pack(t5, t4, t3), rax); sub_rm(p1, rax);
if (isFullBit_) { if (fullReg) {
sbb(sf.t[6], 0); sbb(*fullReg, 0);
} }
cmovc(t5, t2); cmovc_rr(p1, p0);
cmovc(t4, t1); store_mr(pz, p1);
cmovc(t3, t0);
store_mr(pz, Pack(t5, t4, t3));
} }
void gen_inAddMod4(const RegExp& pz, const RegExp& px, const RegExp& py, const StackFrame& sf, bool withCarry) void gen_in_fp_add(const RegExp& pz, const RegExp& px, const RegExp& py, const Pack& p0, const Pack& p1, bool withCarry, const Reg64 *fullReg)
{ {
const Reg64& t0 = sf.t[0]; load_rm(p0, px);
const Reg64& t1 = sf.t[1]; add_rm(p0, py, withCarry);
const Reg64& t2 = sf.t[2]; mov_rr(p1, p0);
const Reg64& t3 = sf.t[3];
const Reg64& t4 = sf.t[4];
const Reg64& t5 = sf.t[5];
const Reg64& t6 = sf.t[6];
const Reg64& t7 = sf.t[7];
load_rm(Pack(t3, t2, t1, t0), px);
add_rm(Pack(t3, t2, t1, t0), py, withCarry);
mov_rr(Pack(t7, t6, t5, t4), Pack(t3, t2, t1, t0));
if (isFullBit_) { if (isFullBit_) {
mov(sf.t[8], 0); mov(*fullReg, 0);
adc(sf.t[8], 0); adc(*fullReg, 0);
} }
mov(rax, (size_t)p_); mov(rax, (size_t)p_);
sub_rm(Pack(t7, t6, t5, t4), rax); sub_rm(p1, rax);
if (isFullBit_) { if (isFullBit_) {
sbb(sf.t[8], 0); sbb(*fullReg, 0);
} }
cmovc(t7, t3); cmovc_rr(p1, p0);
cmovc(t6, t2); store_mr(pz, p1);
cmovc(t5, t1);
cmovc(t4, t0);
store_mr(pz, Pack(t7, t6, t5, t4));
} }
void gen_addMod3() void gen_fp_add_le4()
{ {
StackFrame sf(this, 3, isFullBit_ ? 7 : 6); const bool withCarry = false;
const Reg64& pz = sf.p[0]; assert(pn_ <= 4);
const Reg64& px = sf.p[1]; const int tn = pn_ * 2 + (isFullBit_ ? 1 : 0);
const Reg64& py = sf.p[2]; StackFrame sf(this, 3, tn);
gen_inAddMod3(pz, px, py, sf, false);
}
void gen_addMod4()
{
StackFrame sf(this, 3, isFullBit_ ? 9 : 8);
const Reg64& pz = sf.p[0]; const Reg64& pz = sf.p[0];
const Reg64& px = sf.p[1]; const Reg64& px = sf.p[1];
const Reg64& py = sf.p[2]; const Reg64& py = sf.p[2];
gen_inAddMod4(pz, px, py, sf, false); gen_in_fp_add(pz, px, py, sf.t, withCarry);
} }
void gen_subMod_le4(int n) void gen_subMod_le4(int n)
{ {
@ -512,14 +493,10 @@ struct FpGenerator : Xbyak::CodeGenerator {
#endif #endif
store_mr(pz, rx); store_mr(pz, rx);
} }
void gen_add() void gen_fp_add()
{ {
if (pn_ == 3) { if (pn_ <= 4) {
gen_addMod3(); gen_fp_add_le4();
return;
}
if (pn_ == 4) {
gen_addMod4();
return; return;
} }
StackFrame sf(this, 3, 0, pn_ * 8); StackFrame sf(this, 3, 0, pn_ * 8);
@ -560,22 +537,13 @@ struct FpGenerator : Xbyak::CodeGenerator {
void gen_fpDbl_add() void gen_fpDbl_add()
{ {
assert(pn_ <= 4); assert(pn_ <= 4);
int tn = 0; int tn = pn_ * 2 + (isFullBit_ ? 1 : 0);
if (pn_ == 3) {
tn = isFullBit_ ? 7 : 6;
} else if (pn_ == 4) {
tn = isFullBit_ ? 9 : 8;
}
StackFrame sf(this, 3, tn); StackFrame sf(this, 3, tn);
const Reg64& pz = sf.p[0]; const Reg64& pz = sf.p[0];
const Reg64& px = sf.p[1]; const Reg64& px = sf.p[1];
const Reg64& py = sf.p[2]; const Reg64& py = sf.p[2];
gen_raw_add(pz, px, py, rax, pn_); gen_raw_add(pz, px, py, rax, pn_);
if (pn_ == 3) { gen_in_fp_add(pz + 8 * pn_, px + 8 * pn_, py + 8 * pn_, sf.t, true);
gen_inAddMod3(pz + 8 * pn_, px + 8 * pn_, py + 8 * pn_, sf, true);
} else {
gen_inAddMod4(pz + 8 * pn_, px + 8 * pn_, py + 8 * pn_, sf, true);
}
} }
void gen_sub() void gen_sub()
{ {
@ -1633,6 +1601,12 @@ private:
sbb(z[i], ptr [m + 8 * i]); sbb(z[i], ptr [m + 8 * i]);
} }
} }
/*
	emit cmovc(z[i], x[i]) for every i in [0, z.size())
	the number of emitted instructions is taken from z, so x must provide
	at least as many registers as z
*/
void cmovc_rr(const Pack& z, const Pack& x)
{
	const int regNum = (int)z.size();
	int idx = 0;
	while (idx < regNum) {
		cmovc(z[idx], x[idx]);
		++idx;
	}
}
/* /*
t = all or z[i] t = all or z[i]
ZF = z is zero ZF = z is zero

Loading…
Cancel
Save