dev
MITSUNARI Shigeo 6 years ago
parent 98a32e5c16
commit 629496d1a1
  1. 56
      src/fp_generator.hpp

@ -296,7 +296,7 @@ private:
op.fp_negA_ = gen_fp_neg(); op.fp_negA_ = gen_fp_neg();
const void* func = 0; void* func = 0;
// setup fp_tower // setup fp_tower
op.fp2_mulNF = 0; op.fp2_mulNF = 0;
func = gen_fpDbl_add(); func = gen_fpDbl_add();
@ -320,10 +320,11 @@ private:
op.fp_mul = reinterpret_cast<void4u>(func); // used in toMont/fromMont op.fp_mul = reinterpret_cast<void4u>(func); // used in toMont/fromMont
op.fp_mulA_ = reinterpret_cast<void3u>(func); op.fp_mulA_ = reinterpret_cast<void3u>(func);
} }
func = gen_sqr();
if (func) {
op.fp_sqrA_ = reinterpret_cast<void2u>(func);
}
if (op.N > 4) return; if (op.N > 4) return;
align(16);
op.fp_sqrA_ = getCurr<void2u>();
gen_sqr();
if (op.primeMode != PM_NIST_P192 && op.N <= 4) { // support general op.N but not fast for op.N > 4 if (op.primeMode != PM_NIST_P192 && op.N <= 4) { // support general op.N but not fast for op.N > 4
align(16); align(16);
op.fp_preInv = getCurr<int2u>(); op.fp_preInv = getCurr<int2u>();
@ -685,10 +686,10 @@ private:
outLocalLabel(); outLocalLabel();
return func; return func;
} }
const void* gen_fpDbl_add() void* gen_fpDbl_add()
{ {
align(16); align(16);
const void* func = getCurr<void*>(); void* func = getCurr<void*>();
if (pn_ <= 4) { if (pn_ <= 4) {
int tn = pn_ * 2 + (isFullBit_ ? 1 : 0); int tn = pn_ * 2 + (isFullBit_ ? 1 : 0);
StackFrame sf(this, 3, tn); StackFrame sf(this, 3, tn);
@ -713,10 +714,10 @@ private:
} }
return 0; return 0;
} }
const void* gen_fpDbl_sub() void* gen_fpDbl_sub()
{ {
align(16); align(16);
const void* func = getCurr<void*>(); void* func = getCurr<void*>();
if (pn_ <= 4) { if (pn_ <= 4) {
int tn = pn_ * 2; int tn = pn_ * 2;
StackFrame sf(this, 3, tn); StackFrame sf(this, 3, tn);
@ -817,10 +818,10 @@ private:
mov(ptr [pz + (pn_ - 1) * 8], *t0); mov(ptr [pz + (pn_ - 1) * 8], *t0);
return func; return func;
} }
const void* gen_mul() void* gen_mul()
{ {
align(16); align(16);
const void* func = getCurr<void*>(); void* func = getCurr<void*>();
if (op_->primeMode == PM_NIST_P192) { if (op_->primeMode == PM_NIST_P192) {
StackFrame sf(this, 3, 10 | UseRDX, 8 * 6); StackFrame sf(this, 3, 10 | UseRDX, 8 * 6);
mulPre3(rsp, sf.p[1], sf.p[2], sf.t); mulPre3(rsp, sf.p[1], sf.p[2], sf.t);
@ -835,7 +836,7 @@ private:
gen_montMul4(); gen_montMul4();
return func; return func;
} }
if (pn_ == 6 && useAdx_) { if (pn_ == 6 && useMulx_ && useAdx_) {
// gen_montMul6(p_, rp_); // gen_montMul6(p_, rp_);
StackFrame sf(this, 3, 10 | UseRDX, (1 + 12) * 8); StackFrame sf(this, 3, 10 | UseRDX, (1 + 12) * 8);
mov(ptr[rsp + 12 * 8], gp0); mov(ptr[rsp + 12 * 8], gp0);
@ -1120,10 +1121,10 @@ private:
movq(z, xm0); movq(z, xm0);
store_mr(z, Pack(t10, t9, t8, t4)); store_mr(z, Pack(t10, t9, t8, t4));
} }
const void* gen_fpDbl_mod(const fp::Op& op) void* gen_fpDbl_mod(const fp::Op& op)
{ {
align(16); align(16);
const void* func = getCurr<void*>(); void* func = getCurr<void*>();
if (op.primeMode == PM_NIST_P192) { if (op.primeMode == PM_NIST_P192) {
StackFrame sf(this, 2, 6 | UseRDX); StackFrame sf(this, 2, 6 | UseRDX);
fpDbl_mod_NIST_P192(sf.p[0], sf.p[1], sf.t); fpDbl_mod_NIST_P192(sf.p[0], sf.p[1], sf.t);
@ -1166,18 +1167,32 @@ private:
} }
return 0; return 0;
} }
void gen_sqr() void* gen_sqr()
{ {
align(16);
void* func = getCurr<void*>();
if (op_->primeMode == PM_NIST_P192) { if (op_->primeMode == PM_NIST_P192) {
StackFrame sf(this, 3, 10 | UseRDX, 6 * 8); StackFrame sf(this, 3, 10 | UseRDX, 6 * 8);
Pack t = sf.t; Pack t = sf.t;
t.append(sf.p[2]); t.append(sf.p[2]);
sqrPre3(rsp, sf.p[1], t); sqrPre3(rsp, sf.p[1], t);
fpDbl_mod_NIST_P192(sf.p[0], rsp, sf.t); fpDbl_mod_NIST_P192(sf.p[0], rsp, sf.t);
return func;
} }
if (pn_ == 3) { if (pn_ == 3) {
gen_montSqr3(); gen_montSqr3();
return; return func;
}
if (pn_ == 6 && useMulx_ && useAdx_) {
StackFrame sf(this, 3, 10 | UseRDX, (1 + 12) * 8);
mov(ptr[rsp + 12 * 8], gp0);
mov(gp0, rsp);
mov(gp2, gp1);
call(mulPreL); // gp0, x, y
mov(gp0, ptr[rsp + 12 * 8]);
mov(gp1, rsp);
call(fpDbl_modL);
return func;
} }
#if 0 // (sqrPre + mod) is slower than mul #if 0 // (sqrPre + mod) is slower than mul
if (pn_ == 4 && useMulx_) { if (pn_ == 4 && useMulx_) {
@ -1188,7 +1203,7 @@ private:
mov(gp0, sf.p[0]); mov(gp0, sf.p[0]);
mov(gp1, rsp); mov(gp1, rsp);
call(fpDbl_modL); call(fpDbl_modL);
return; return func;
} }
#endif #endif
// sqr(y, x) = mul(y, x, x) // sqr(y, x) = mul(y, x, x)
@ -1198,6 +1213,7 @@ private:
mov(rdx, rsi); mov(rdx, rsi);
#endif #endif
jmp((const void*)op_->fp_mulA_); jmp((const void*)op_->fp_mulA_);
return func;
} }
/* /*
input (pz[], px[], py[]) input (pz[], px[], py[])
@ -2134,10 +2150,10 @@ private:
movq(z, xm0); movq(z, xm0);
store_mr(z, zp); store_mr(z, zp);
} }
const void* gen_fpDbl_sqrPre(const fp::Op&/* op */) void* gen_fpDbl_sqrPre(const fp::Op&/* op */)
{ {
align(16); align(16);
const void* func = getCurr<void*>(); void* func = getCurr<void*>();
if (pn_ == 2 && useMulx_) { if (pn_ == 2 && useMulx_) {
StackFrame sf(this, 2, 7 | UseRDX); StackFrame sf(this, 2, 7 | UseRDX);
sqrPre2(sf.p[0], sf.p[1], sf.t); sqrPre2(sf.p[0], sf.p[1], sf.t);
@ -2175,10 +2191,10 @@ private:
return func; return func;
#endif #endif
} }
const void* gen_fpDbl_mulPre() void* gen_fpDbl_mulPre()
{ {
align(16); align(16);
const void* func = getCurr<void*>(); void* func = getCurr<void*>();
if (pn_ == 2 && useMulx_) { if (pn_ == 2 && useMulx_) {
StackFrame sf(this, 3, 5 | UseRDX); StackFrame sf(this, 3, 5 | UseRDX);
mulPre2(sf.p[0], sf.p[1], sf.p[2], sf.t); mulPre2(sf.p[0], sf.p[1], sf.p[2], sf.t);

Loading…
Cancel
Save