allocate the data area in front of the code

dev
MITSUNARI Shigeo 6 years ago
parent ee824b12f9
commit 9aa4ba14a6
  1. 99
      src/fp_generator.hpp
  2. 2
      test/bls12_test.cpp

@ -192,6 +192,7 @@ struct FpGenerator : Xbyak::CodeGenerator {
const Reg64& gt8; const Reg64& gt8;
const Reg64& gt9; const Reg64& gt9;
const mcl::fp::Op *op_; const mcl::fp::Op *op_;
Label *pL_; // valid only in init_inner
const uint64_t *p_; const uint64_t *p_;
uint64_t rp_; uint64_t rp_;
int pn_; int pn_;
@ -218,7 +219,7 @@ struct FpGenerator : Xbyak::CodeGenerator {
@param op [in] ; use op.p, op.N, op.isFullBit @param op [in] ; use op.p, op.N, op.isFullBit
*/ */
FpGenerator() FpGenerator()
: CodeGenerator(4096 * 8, Xbyak::DontSetProtectRWE) : CodeGenerator(4096 * 9, Xbyak::DontSetProtectRWE)
#ifdef XBYAK64_WIN #ifdef XBYAK64_WIN
, gp0(rcx) , gp0(rcx)
, gp1(r11) , gp1(r11)
@ -267,15 +268,26 @@ private:
Label mulPreL; Label mulPreL;
Label fpDbl_modL; Label fpDbl_modL;
Label fp_mulL; Label fp_mulL;
Label pL; // label to p_
op_ = &op; op_ = &op;
p_ = op.p; pL_ = &pL;
/*
the first 4096 bytes are the data area
the remainder is the code area
*/
L(pL);
p_ = reinterpret_cast<const uint64_t*>(getCurr());
for (size_t i = 0; i < op.N; i++) {
dq(op.p[i]);
}
rp_ = fp::getMontgomeryCoeff(p_[0]); rp_ = fp::getMontgomeryCoeff(p_[0]);
pn_ = (int)op.N; pn_ = (int)op.N;
FpByte_ = int(op.maxN * sizeof(uint64_t)); FpByte_ = int(op.maxN * sizeof(uint64_t));
isFullBit_ = op.isFullBit; isFullBit_ = op.isFullBit;
// printf("p=%p, pn_=%d, isFullBit_=%d\n", p_, pn_, isFullBit_); // printf("p=%p, pn_=%d, isFullBit_=%d\n", p_, pn_, isFullBit_);
// code from here
align(16); setSize(4096);
assert((getCurr<size_t>() & 4095) == 0);
op.fp_addPre = getCurr<u3u>(); op.fp_addPre = getCurr<u3u>();
gen_addSubPre(true, pn_); gen_addSubPre(true, pn_);
align(16); align(16);
@ -585,7 +597,7 @@ private:
load_rm(p0, px); load_rm(p0, px);
add_rm(p0, py, withCarry); add_rm(p0, py, withCarry);
mov_rr(p1, p0); mov_rr(p1, p0);
if (fullReg) { if (isFullBit_) {
mov(*fullReg, 0); mov(*fullReg, 0);
adc(*fullReg, 0); adc(*fullReg, 0);
} }
@ -640,33 +652,28 @@ private:
} }
/* /*
add(pz + offset, px + offset, py + offset); add(pz + offset, px + offset, py + offset);
t.size() == 10 size of t1, t2 == 6
destroy px, py, rax destroy t0, t1
*/ */
void gen_raw_fp_add6(const Reg64& pz, const Reg64& px, const Reg64& py, int offset, Pack t, bool withCarry) void gen_raw_fp_add6(const Reg64& pz, const Reg64& px, const Reg64& py, int offset, const Pack& t1, const Pack& t2, bool withCarry)
{ {
Pack t2 = t.sub(6); load_rm(t1, px + offset);
t = t.sub(0, 6); add_rm(t1, py + offset, withCarry);
t2.append(rax);
t2.append(px);
load_rm(t, px + offset);
add_rm(t, py + offset, withCarry);
Label exit; Label exit;
if (isFullBit_) { if (isFullBit_) {
jnc("@f"); jnc("@f");
mov(py, (size_t)p_); mov(t2[0], *pL_); // t2 is not used
sub_rm(t, py); sub_rm(t1, t2[0]);
jmp(exit); jmp(exit);
L("@@"); L("@@");
} }
mov_rr(t2, t); // destroy px mov_rr(t2, t1);
mov(py, (size_t)p_); sub_rm(t2, rip + *pL_);
sub_rm(t2, py);
for (int i = 0; i < 6; i++) { for (int i = 0; i < 6; i++) {
cmovnc(t[i], t2[i]); cmovnc(t1[i], t2[i]);
} }
L(exit); L(exit);
store_mr(pz + offset, t); store_mr(pz + offset, t1);
} }
void gen_fp_add6() void gen_fp_add6()
{ {
@ -677,7 +684,11 @@ private:
const Reg64& pz = sf.p[0]; const Reg64& pz = sf.p[0];
const Reg64& px = sf.p[1]; const Reg64& px = sf.p[1];
const Reg64& py = sf.p[2]; const Reg64& py = sf.p[2];
gen_raw_fp_add6(pz, px, py, 0, sf.t, false); Pack t1 = sf.t.sub(0, 6);
Pack t2 = sf.t.sub(6);
t2.append(rax);
t2.append(px); // destory after used
gen_raw_fp_add6(pz, px, py, 0, t1, t2, false);
} }
void gen_fp_add() void gen_fp_add()
{ {
@ -740,7 +751,11 @@ private:
const Reg64& px = sf.p[1]; const Reg64& px = sf.p[1];
const Reg64& py = sf.p[2]; const Reg64& py = sf.p[2];
gen_raw_add(pz, px, py, rax, pn_); gen_raw_add(pz, px, py, rax, pn_);
gen_raw_fp_add6(pz, px, py, pn_ * 8, sf.t, true); Pack t1 = sf.t.sub(0, 6);
Pack t2 = sf.t.sub(6);
t2.append(rax);
t2.append(py);
gen_raw_fp_add6(pz, px, py, pn_ * 8, t1, t2, true);
} else { } else {
assert(0); assert(0);
exit(1); exit(1);
@ -757,6 +772,18 @@ private:
gen_raw_sub(pz, px, py, rax, pn_); gen_raw_sub(pz, px, py, rax, pn_);
gen_raw_fp_sub(pz + 8 * pn_, px + 8 * pn_, py + 8 * pn_, sf.t, true); gen_raw_fp_sub(pz + 8 * pn_, px + 8 * pn_, py + 8 * pn_, sf.t, true);
} }
/*
emit code for: pz[offset..] = px[offset..] - py[offset..],
then add back the values stored at label *pL_ if the subtraction borrowed
@param pz [in] base register of the destination
@param px [in] base register of the minuend
@param py [in] base register of the subtrahend
@param offset [in] byte offset applied to pz, px and py
@param t [in] work registers; t.size() words are loaded, processed and stored
(the name and the caller gen_fp_sub6 suggest |t| = 6 -- TODO confirm)
@note *pL_ is set by the init code to the label placed in front of the emitted copy of op.p,
so the add-back operand is presumably the modulus p -- confirm against init code
NOTE(review): gen_fp_sub6 passes a t that contains px itself, so px looks to be
overwritten by the load below -- verify that px is the last element loaded
*/
void gen_raw_fp_sub6(const Reg64& pz, const Reg64& px, const Reg64& py, int offset, const Pack& t)
{
// t[] = px[offset..]
load_rm(t, px + offset);
// t[] -= py[offset..] ; assumes the final borrow is left in the carry flag
sub_rm(t, py + offset);
/*
jmp is faster than and-mask without jmp
*/
// carry clear : skip the add-back and go to the next "@@" label
jnc("@f");
// t[] += values at *pL_, addressed rip-relative
add_rm(t, rip + *pL_);
L("@@");
// pz[offset..] = t[]
store_mr(pz + offset, t);
}
void gen_fp_sub6() void gen_fp_sub6()
{ {
StackFrame sf(this, 3, 4); StackFrame sf(this, 3, 4);
@ -766,16 +793,7 @@ private:
Pack t = sf.t; Pack t = sf.t;
t.append(rax); t.append(rax);
t.append(px); // |t| = 6 t.append(px); // |t| = 6
load_rm(t, px); // destroy px gen_raw_fp_sub6(pz, px, py, 0, t);
sub_rm(t, py);
/*
jmp is faster than and-mask without jmp
*/
jnc("@f");
mov(py, (size_t)p_); // destory py
add_rm(t, py);
L("@@");
store_mr(pz, t);
} }
void gen_fp_sub() void gen_fp_sub()
{ {
@ -2453,10 +2471,17 @@ private:
mov(ptr [m + 8 * i], x[i]); mov(ptr [m + 8 * i], x[i]);
} }
} }
/*
	m[] = x[]
	overload taking a Xbyak::RegRip base address;
	emits one mov per element of x, writing x[k] to [m + 8 * k] in ascending order
*/
void store_mr(const Xbyak::RegRip& m, const Pack& x)
{
	const int count = (int)x.size();
	int idx = 0;
	while (idx < count) {
		mov(ptr [m + 8 * idx], x[idx]);
		idx++;
	}
}
/* /*
x[] = m[] x[] = m[]
*/ */
void load_rm(const Pack& z, const RegExp& m) template<class ADDR>
void load_rm(const Pack& z, const ADDR& m)
{ {
for (int i = 0, n = (int)z.size(); i < n; i++) { for (int i = 0, n = (int)z.size(); i < n; i++) {
mov(z[i], ptr [m + 8 * i]); mov(z[i], ptr [m + 8 * i]);
@ -2487,7 +2512,8 @@ private:
/* /*
z[] += m[] z[] += m[]
*/ */
void add_rm(const Pack& z, const RegExp& m, bool withCarry = false) template<class ADDR>
void add_rm(const Pack& z, const ADDR& m, bool withCarry = false)
{ {
if (withCarry) { if (withCarry) {
adc(z[0], ptr [m + 8 * 0]); adc(z[0], ptr [m + 8 * 0]);
@ -2501,7 +2527,8 @@ private:
/* /*
z[] -= m[] z[] -= m[]
*/ */
void sub_rm(const Pack& z, const RegExp& m, bool withCarry = false) template<class ADDR>
void sub_rm(const Pack& z, const ADDR& m, bool withCarry = false)
{ {
if (withCarry) { if (withCarry) {
sbb(z[0], ptr [m + 8 * 0]); sbb(z[0], ptr [m + 8 * 0]);

@ -687,7 +687,7 @@ int main(int argc, char *argv[])
} }
FpDbl dx; FpDbl dx;
FpDbl::mulPre(dx, xv[0], xv[0]); FpDbl::mulPre(dx, xv[0], xv[0]);
CYBOZU_BENCH_C("addDbl", 10000000, FpDbl::add, dx, dx, dx); CYBOZU_BENCH_C("subDbl", 10000000, FpDbl::sub, dx, dx, dx);
// CYBOZU_BENCH_C("mul", 10000000 / n, f, xv, yv, xv); // CYBOZU_BENCH_C("mul", 10000000 / n, f, xv, yv, xv);
// CYBOZU_BENCH_C("mulPre", 10000000, FpDbl::mulPre, dx, xv[0], yv[0]); // CYBOZU_BENCH_C("mulPre", 10000000, FpDbl::mulPre, dx, xv[0], yv[0]);
return 0; return 0;

Loading…
Cancel
Save