dev
MITSUNARI Shigeo 6 years ago
parent 98a32e5c16
commit 629496d1a1
  1. 56
      src/fp_generator.hpp

@ -296,7 +296,7 @@ private:
op.fp_negA_ = gen_fp_neg();
const void* func = 0;
void* func = 0;
// setup fp_tower
op.fp2_mulNF = 0;
func = gen_fpDbl_add();
@ -320,10 +320,11 @@ private:
op.fp_mul = reinterpret_cast<void4u>(func); // used in toMont/fromMont
op.fp_mulA_ = reinterpret_cast<void3u>(func);
}
func = gen_sqr();
if (func) {
op.fp_sqrA_ = reinterpret_cast<void2u>(func);
}
if (op.N > 4) return;
align(16);
op.fp_sqrA_ = getCurr<void2u>();
gen_sqr();
if (op.primeMode != PM_NIST_P192 && op.N <= 4) { // support general op.N but not fast for op.N > 4
align(16);
op.fp_preInv = getCurr<int2u>();
@ -685,10 +686,10 @@ private:
outLocalLabel();
return func;
}
const void* gen_fpDbl_add()
void* gen_fpDbl_add()
{
align(16);
const void* func = getCurr<void*>();
void* func = getCurr<void*>();
if (pn_ <= 4) {
int tn = pn_ * 2 + (isFullBit_ ? 1 : 0);
StackFrame sf(this, 3, tn);
@ -713,10 +714,10 @@ private:
}
return 0;
}
const void* gen_fpDbl_sub()
void* gen_fpDbl_sub()
{
align(16);
const void* func = getCurr<void*>();
void* func = getCurr<void*>();
if (pn_ <= 4) {
int tn = pn_ * 2;
StackFrame sf(this, 3, tn);
@ -817,10 +818,10 @@ private:
mov(ptr [pz + (pn_ - 1) * 8], *t0);
return func;
}
const void* gen_mul()
void* gen_mul()
{
align(16);
const void* func = getCurr<void*>();
void* func = getCurr<void*>();
if (op_->primeMode == PM_NIST_P192) {
StackFrame sf(this, 3, 10 | UseRDX, 8 * 6);
mulPre3(rsp, sf.p[1], sf.p[2], sf.t);
@ -835,7 +836,7 @@ private:
gen_montMul4();
return func;
}
if (pn_ == 6 && useAdx_) {
if (pn_ == 6 && useMulx_ && useAdx_) {
// gen_montMul6(p_, rp_);
StackFrame sf(this, 3, 10 | UseRDX, (1 + 12) * 8);
mov(ptr[rsp + 12 * 8], gp0);
@ -1120,10 +1121,10 @@ private:
movq(z, xm0);
store_mr(z, Pack(t10, t9, t8, t4));
}
const void* gen_fpDbl_mod(const fp::Op& op)
void* gen_fpDbl_mod(const fp::Op& op)
{
align(16);
const void* func = getCurr<void*>();
void* func = getCurr<void*>();
if (op.primeMode == PM_NIST_P192) {
StackFrame sf(this, 2, 6 | UseRDX);
fpDbl_mod_NIST_P192(sf.p[0], sf.p[1], sf.t);
@ -1166,18 +1167,32 @@ private:
}
return 0;
}
void gen_sqr()
void* gen_sqr()
{
align(16);
void* func = getCurr<void*>();
if (op_->primeMode == PM_NIST_P192) {
StackFrame sf(this, 3, 10 | UseRDX, 6 * 8);
Pack t = sf.t;
t.append(sf.p[2]);
sqrPre3(rsp, sf.p[1], t);
fpDbl_mod_NIST_P192(sf.p[0], rsp, sf.t);
return func;
}
if (pn_ == 3) {
gen_montSqr3();
return;
return func;
}
if (pn_ == 6 && useMulx_ && useAdx_) {
StackFrame sf(this, 3, 10 | UseRDX, (1 + 12) * 8);
mov(ptr[rsp + 12 * 8], gp0);
mov(gp0, rsp);
mov(gp2, gp1);
call(mulPreL); // gp0, x, y
mov(gp0, ptr[rsp + 12 * 8]);
mov(gp1, rsp);
call(fpDbl_modL);
return func;
}
#if 0 // (sqrPre + mod) is slower than mul
if (pn_ == 4 && useMulx_) {
@ -1188,7 +1203,7 @@ private:
mov(gp0, sf.p[0]);
mov(gp1, rsp);
call(fpDbl_modL);
return;
return func;
}
#endif
// sqr(y, x) = mul(y, x, x)
@ -1198,6 +1213,7 @@ private:
mov(rdx, rsi);
#endif
jmp((const void*)op_->fp_mulA_);
return func;
}
/*
input (pz[], px[], py[])
@ -2134,10 +2150,10 @@ private:
movq(z, xm0);
store_mr(z, zp);
}
const void* gen_fpDbl_sqrPre(const fp::Op&/* op */)
void* gen_fpDbl_sqrPre(const fp::Op&/* op */)
{
align(16);
const void* func = getCurr<void*>();
void* func = getCurr<void*>();
if (pn_ == 2 && useMulx_) {
StackFrame sf(this, 2, 7 | UseRDX);
sqrPre2(sf.p[0], sf.p[1], sf.t);
@ -2175,10 +2191,10 @@ private:
return func;
#endif
}
const void* gen_fpDbl_mulPre()
void* gen_fpDbl_mulPre()
{
align(16);
const void* func = getCurr<void*>();
void* func = getCurr<void*>();
if (pn_ == 2 && useMulx_) {
StackFrame sf(this, 3, 5 | UseRDX);
mulPre2(sf.p[0], sf.p[1], sf.p[2], sf.t);

Loading…
Cancel
Save