refactor fp_add

update-fork
MITSUNARI Shigeo 4 years ago
parent d59dd09a34
commit aa6b6d0e15
  1. 112
      src/fp_generator.hpp

@ -649,25 +649,6 @@ private:
mov(ptr [pz + i * 8], t);
}
}
/*
pz[] = px[] + py[] mod p[]
use rax, t
*/
void gen_raw_fp_add(const RegExp& pz, const RegExp& px, const RegExp& py, const Pack& t, bool withCarry)
{
const Pack& p0 = t.sub(0, pn_);
const Pack& p1 = t.sub(pn_, pn_);
const Reg64 *fullReg = isFullBit_ ? &t[pn_ * 2] : 0;
load_rm(p0, px);
add_rm(p0, py, withCarry);
if (isFullBit_) {
mov(*fullReg, 0);
adc(*fullReg, 0);
}
lea(rax, ptr[rip+pL_]);
sub_p_mod(p1, p0, rax, fullReg);
store_mr(pz, p1);
}
/*
pz[] = px[] - py[] mod p[]
use rax, t
@ -687,16 +668,6 @@ private:
add_rr(p0, p1);
store_mr(pz, p0);
}
void gen_fp_add_le4()
{
assert(pn_ <= 4);
const int tn = pn_ * 2 + (isFullBit_ ? 1 : 0);
StackFrame sf(this, 3, tn);
const Reg64& pz = sf.p[0];
const Reg64& px = sf.p[1];
const Reg64& py = sf.p[2];
gen_raw_fp_add(pz, px, py, sf.t, false);
}
void gen_fp_sub_le4()
{
assert(pn_ <= 4);
@ -707,32 +678,7 @@ private:
const Reg64& py = sf.p[2];
gen_raw_fp_sub(pz, px, py, sf.t, false);
}
/*
add(pz, px, py);
size of t1, t2 == 6
destroy t0, t1
*/
void gen_raw_fp_add6(const RegExp& pz, const RegExp& px, const RegExp& py, const Pack& t1, const Pack& t2, bool withCarry)
{
load_rm(t1, px);
add_rm(t1, py, withCarry);
Label exit;
if (isFullBit_) {
jnc("@f");
lea(t2[0], ptr[rip+pL_]); // t2[0] is not used
sub_rm(t1, t2[0]);
jmp(exit);
L("@@");
}
mov_rr(t2, t1);
sub_rm(t2, rip + pL_);
for (int i = 0; i < 6; i++) {
cmovnc(t1[i], t2[i]);
}
L(exit);
store_mr(pz, t1);
}
void gen_raw_fp_add_2(const RegExp& pz, const RegExp& px, const RegExp& py, const Pack& t, bool withCarry = false, const Reg64 *H = 0)
void gen_raw_fp_add(const RegExp& pz, const RegExp& px, const RegExp& py, const Pack& t, bool withCarry = false, const Reg64 *H = 0)
{
const Pack& t1 = t.sub(0, pn_);
const Pack& t2 = t.sub(pn_, pn_);
@ -761,7 +707,7 @@ private:
Pack t = sf.t;
t.append(rax);
const Reg64 *H = isFullBit_ ? &rax : 0;
gen_raw_fp_add_2(pz, px, py, t, false, H);
gen_raw_fp_add(pz, px, py, t, false, H);
return func;
}
void3u gen_fpDbl_add()
@ -780,7 +726,7 @@ private:
t.append(rax);
const Reg64 *H = isFullBit_ ? &rax : 0;
gen_raw_add(pz, px, py, rax, pn_);
gen_raw_fp_add_2(pz + 8 * pn_, px + 8 * pn_, py + 8 * pn_, t, true, H);
gen_raw_fp_add(pz + 8 * pn_, px + 8 * pn_, py + 8 * pn_, t, true, H);
return func;
}
void3u gen_fpDbl_sub()
@ -2584,7 +2530,7 @@ private:
g_adc(z[i], x[i], t);
}
}
void add_m_m(const RegExp& mz, const RegExp& mx, const Reg64& t, int n)
void add_mm(const RegExp& mz, const RegExp& mx, const Reg64& t, int n)
{
for (int i = 0; i < n; i++) {
mov(t, ptr [mx + i * 8]);
@ -2843,7 +2789,7 @@ private:
sub_m_mp_m(t3, t2, rr, t);
jnc("@f");
// pr[] += p[]
add_m_m(t3, t2, t, pn_);
add_mm(t3, t2, t, pn_);
L("@@");
outLocalLabel();
}
@ -3683,8 +3629,8 @@ private:
Pack t = sf.t;
t.append(rax);
const Reg64 *H = isFullBit_ ? &rax : 0;
gen_raw_fp_add_2(pz, px, py, t, false, H);
gen_raw_fp_add_2(pz + FpByte_, px + FpByte_, py + FpByte_, t, false, H);
gen_raw_fp_add(pz, px, py, t, false, H);
gen_raw_fp_add(pz + FpByte_, px + FpByte_, py + FpByte_, t, false, H);
return func;
}
void3u gen_fp2_sub()
@ -3876,6 +3822,8 @@ private:
{
if (isFullBit_) return 0;
if (pn_ != 4 && !(pn_ == 6 && useMulx_ && useAdx_)) return 0;
bool nocarry = (p_[pn_ - 1] >> 62) == 0;
if (!nocarry) return 0;
void2u func = getCurr<void2u>();
const RegExp y = rsp + 0 * 8;
@ -3883,39 +3831,23 @@ private:
const Ext1 t1(FpByte_, rsp, 2 * 8);
const Ext1 t2(FpByte_, rsp, t1.next);
const Ext1 t3(FpByte_, rsp, t2.next);
bool nocarry = (p_[pn_ - 1] >> 62) == 0;
StackFrame sf(this, 3, 10 | UseRDX, t3.next);
mov(ptr [y], gp0);
mov(ptr [x], gp1);
// t1 = b + b
lea(gp0, ptr [t1]);
if (nocarry) {
for (int i = 0; i < pn_; i++) {
mov(rax, ptr [gp1 + FpByte_ + i * 8]);
if (i == 0) {
add(rax, rax);
} else {
adc(rax, rax);
}
mov(ptr [gp0 + i * 8], rax);
}
} else {
if (pn_ == 4) {
gen_raw_fp_add(gp0, gp1 + FpByte_, gp1 + FpByte_, sf.t, false);
} else {
assert(pn_ == 6);
Pack t = sf.t.sub(6, 4);
t.append(rax);
t.append(rdx);
gen_raw_fp_add6(gp0, gp1 + FpByte_, gp1 + FpByte_, sf.t.sub(0, 6), t, false);
}
{
Pack t = sf.t.sub(0, pn_);
load_rm(t, gp1 + FpByte_);
shl1(t);
store_mr(gp0, t);
}
// t1 = 2ab
mov(gp1, gp0);
mov(gp2, ptr [x]);
call(fp_mulL);
if (nocarry) {
{
Pack t = sf.t;
t.append(rdx);
t.append(gp1);
@ -3939,20 +3871,6 @@ private:
add_rm(a, rax);
sub_rr(a, b);
store_mr(t3, a);
} else {
mov(gp0, ptr [x]);
if (pn_ == 4) {
gen_raw_fp_add(t2, gp0, gp0 + FpByte_, sf.t, false);
gen_raw_fp_sub(t3, gp0, gp0 + FpByte_, sf.t, false);
} else {
assert(pn_ == 6);
Pack p1 = sf.t.sub(0, 6);
Pack p2 = sf.t.sub(6, 4);
p2.append(rax);
p2.append(rdx);
gen_raw_fp_add6(t2, gp0, gp0 + FpByte_, p1, p2, false);
gen_raw_fp_sub6(t3, gp0, gp0 + FpByte_, 0, p1, false);
}
}
mov(gp0, ptr [y]);

Loading…
Cancel
Save