under refactoring fp_add

update-fork
MITSUNARI Shigeo 4 years ago
parent d3d1504063
commit 765ca7b89e
  1. 86
      src/fp_generator.hpp

@ -252,6 +252,7 @@ struct FpGenerator : Xbyak::CodeGenerator {
Label mulPreL; Label mulPreL;
Label fpDbl_modL; Label fpDbl_modL;
Label fp_mulL; Label fp_mulL;
Label fp_addL;
const uint64_t *p_; const uint64_t *p_;
uint64_t rp_; uint64_t rp_;
int pn_; int pn_;
@ -490,12 +491,13 @@ private:
*/ */
void gen_raw_add(const RegExp& pz, const RegExp& px, const RegExp& py, const Reg64& t, int n) void gen_raw_add(const RegExp& pz, const RegExp& px, const RegExp& py, const Reg64& t, int n)
{ {
mov(t, ptr [px]); for (int i = 0; i < n; i++) {
add(t, ptr [py]);
mov(ptr [pz], t);
for (int i = 1; i < n; i++) {
mov(t, ptr [px + i * 8]); mov(t, ptr [px + i * 8]);
adc(t, ptr [py + i * 8]); if (i == 0) {
add(t, ptr [py + i * 8]);
} else {
adc(t, ptr [py + i * 8]);
}
mov(ptr [pz + i * 8], t); mov(ptr [pz + i * 8], t);
} }
} }
@ -659,19 +661,12 @@ private:
const Reg64 *fullReg = isFullBit_ ? &t[pn_ * 2] : 0; const Reg64 *fullReg = isFullBit_ ? &t[pn_ * 2] : 0;
load_rm(p0, px); load_rm(p0, px);
add_rm(p0, py, withCarry); add_rm(p0, py, withCarry);
mov_rr(p1, p0);
if (isFullBit_) { if (isFullBit_) {
mov(*fullReg, 0); mov(*fullReg, 0);
adc(*fullReg, 0); adc(*fullReg, 0);
} }
lea(rax, ptr[rip+pL_]); lea(rax, ptr[rip+pL_]);
sub_rm(p1, rax); sub_p_mod(p1, p0, rax, fullReg);
if (fullReg) {
sbb(*fullReg, 0);
}
for (size_t i = 0; i < p1.size(); i++) {
cmovc(p1[i], p0[i]);
}
store_mr(pz, p1); store_mr(pz, p1);
} }
/* /*
@ -738,8 +733,36 @@ private:
L(exit); L(exit);
store_mr(pz, t1); store_mr(pz, t1);
} }
void gen_raw_fp_add6_2(const RegExp& pz, const RegExp& px, const RegExp& py, const Pack& t, bool withCarry = false, const Reg64 *H = 0)
{
const Pack& t1 = t.sub(0, pn_);
const Pack& t2 = t.sub(pn_, pn_);
load_rm(t1, px);
add_rm(t1, py, withCarry);
if (H) {
mov(*H, 0);
adc(*H, 0);
}
sub_p_mod(t2, t1, rip + pL_, H);
store_mr(pz, t2);
}
void gen_fp_add6() void gen_fp_add6()
{ {
#if 1
const int n = pn_ * 2 - 2;
StackFrame sf(this, 3, n | UseRDX, 0, false);
call(fp_addL);
sf.close();
const Reg64& pz = sf.p[0];
const Reg64& px = sf.p[1];
const Reg64& py = sf.p[2];
Pack t = sf.t;
t.append(rdx);
t.append(rax);
L(fp_addL);
gen_raw_fp_add6_2(pz, px, py, t);
ret();
#else
/* /*
cmov is faster than jmp cmov is faster than jmp
*/ */
@ -752,10 +775,33 @@ private:
t2.append(rax); t2.append(rax);
t2.append(px); // destory after used t2.append(px); // destory after used
gen_raw_fp_add6(pz, px, py, t1, t2, false); gen_raw_fp_add6(pz, px, py, t1, t2, false);
#endif
} }
void3u gen_fp_add() void3u gen_fp_add()
{ {
if (!(pn_ < 6 || (pn_ == 6 && !isFullBit_))) return 0;
void3u func = getCurr<void3u>(); void3u func = getCurr<void3u>();
#if 1
int n = pn_ * 2 - 2;
if (isFullBit_) {
n++;
}
StackFrame sf(this, 3, n | UseRDX, 0, false);
call(fp_addL);
sf.close();
const Reg64& pz = sf.p[0];
const Reg64& px = sf.p[1];
const Reg64& py = sf.p[2];
Pack t = sf.t;
t.append(rdx);
t.append(rax);
const Reg64 *H = isFullBit_ ? &t[t.size() - 1] : 0;
L(fp_addL);
gen_raw_fp_add6_2(pz, px, py, t, false, H);
ret();
return func;
#else
if (pn_ <= 4) { if (pn_ <= 4) {
gen_fp_add_le4(); gen_fp_add_le4();
return func; return func;
@ -799,6 +845,7 @@ private:
#endif #endif
outLocalLabel(); outLocalLabel();
return func; return func;
#endif
} }
void3u gen_fpDbl_add() void3u gen_fpDbl_add()
{ {
@ -944,7 +991,8 @@ private:
/* /*
y = (x >= p[]) x - p[] : x y = (x >= p[]) x - p[] : x
*/ */
void sub_p_mod(const Pack& y, const Pack& x, const RegExp& p, const Reg64 *H = 0) template<class ADDR>
void sub_p_mod(const Pack& y, const Pack& x, const ADDR& p, const Reg64 *H = 0)
{ {
mov_rr(y, x); mov_rr(y, x);
sub_rm(y, p); sub_rm(y, p);
@ -3673,15 +3721,7 @@ private:
const RegExp& xa = sf.p[1]; const RegExp& xa = sf.p[1];
const RegExp& xb = sf.p[1] + FpByte_ * 2; const RegExp& xb = sf.p[1] + FpByte_ * 2;
// [rsp] = x.a + x.b // [rsp] = x.a + x.b
for (int i = 0; i < pn_ * 2; i++) { gen_raw_add(rsp, xa, xb, rax, pn_ * 2);
mov(rax, ptr[xa + i * 8]);
if (i == 0) {
add(rax, ptr[xb + i * 8]);
} else {
adc(rax, ptr[xb + i * 8]);
}
mov(ptr[rsp + i * 8], rax);
}
// low : x.a = x.a - x.b // low : x.a = x.a - x.b
load_rm(t1, xa); load_rm(t1, xa);
sub_rm(t1, xb); sub_rm(t1, xb);

Loading…
Cancel
Save