enable Fp2Dbl_mulPre

update-fork
MITSUNARI Shigeo 4 years ago
parent 0785143e9a
commit bf703d617a
  1. 77
      src/fp_generator.hpp
  2. 2
      src/fp_static_code.hpp

@ -3423,10 +3423,9 @@ private:
void3u gen_fp2Dbl_mulPre() void3u gen_fp2Dbl_mulPre()
{ {
if (isFullBit_) return 0; if (isFullBit_) return 0;
// if (pn_ != 4 && !(pn_ == 6 && useMulx_ && useAdx_)) return 0; if (pn_ != 4 && !(pn_ == 6 && useMulx_ && useAdx_)) return 0;
// almost same for pn_ == 6
if (pn_ != 4) return 0;
void3u func = getCurr<void3u>(); void3u func = getCurr<void3u>();
bool embedded = pn_ == 4;
const RegExp z = rsp + 0 * 8; const RegExp z = rsp + 0 * 8;
const RegExp x = rsp + 1 * 8; const RegExp x = rsp + 1 * 8;
@ -3436,51 +3435,63 @@ private:
const Ext1 d2(FpByte_ * 2, rsp, t.next); const Ext1 d2(FpByte_ * 2, rsp, t.next);
const int SS = d2.next; const int SS = d2.next;
StackFrame sf(this, 3, 10 | UseRDX, SS); StackFrame sf(this, 3, 10 | UseRDX, SS);
mov(ptr [z], gp0); mov(ptr[z], gp0);
mov(ptr [x], gp1); mov(ptr[x], gp1);
mov(ptr [y], gp2); mov(ptr[y], gp2);
// s = a + b // s = a + b
gen_raw_add(s, gp1, gp1 + FpByte_, rax, pn_); gen_raw_add(s, gp1, gp1 + FpByte_, rax, pn_);
// t = c + d // t = c + d
gen_raw_add(t, gp2, gp2 + FpByte_, rax, pn_); gen_raw_add(t, gp2, gp2 + FpByte_, rax, pn_);
// d1 = (a + b)(c + d) // d1 = (a + b)(c + d)
mov(gp0, ptr [z]); lea(gp0, ptr [gp0 + FpByte_ * 2]);
add(gp0, FpByte_ * 2); // d1 if (embedded) {
lea(gp1, ptr [s]); mulPre4(gp0, s, t, sf.t);
lea(gp2, ptr [t]); } else {
call(mulPreL); lea(gp1, ptr [s]);
// d0 = a c lea(gp2, ptr [t]);
call(mulPreL);
}
// d0 = z.a = a c
mov(gp0, ptr [z]); mov(gp0, ptr [z]);
mov(gp1, ptr [x]); mov(gp1, ptr [x]);
mov(gp2, ptr [y]); mov(gp2, ptr [y]);
call(mulPreL); if (embedded) {
mulPre4(gp0, gp1, gp2, sf.t);
// d2 = b d } else {
lea(gp0, ptr [d2]); call(mulPreL);
}
// d2 = z.b = b d
mov(gp1, ptr [x]); mov(gp1, ptr [x]);
add(gp1, FpByte_); add(gp1, FpByte_);
mov(gp2, ptr [y]); mov(gp2, ptr [y]);
add(gp2, FpByte_); add(gp2, FpByte_);
call(mulPreL); if (embedded) {
mulPre4(d2, gp1, gp2, sf.t);
} else {
lea(gp0, ptr [d2]);
call(mulPreL);
}
mov(gp0, ptr [z]); {
add(gp0, FpByte_ * 2); // d1 Pack t = sf.t;
mov(gp1, gp0); if (pn_ == 4) {
mov(gp2, ptr [z]); t = t.sub(0, pn_ * 2);
gen_raw_sub(gp0, gp1, gp2, rax, pn_ * 2); } else if (pn_ == 6) {
lea(gp2, ptr [d2]); t.append(gp1);
gen_raw_sub(gp0, gp1, gp2, rax, pn_ * 2); t.append(gp2);
}
assert(t.size() == pn_ * 2);
mov(gp0, ptr [z]); mov(gp0, ptr [z]);
mov(gp1, gp0); load_rm(t, gp0 + FpByte_ * 2);
lea(gp2, ptr [d2]); sub_rm(t, gp0); // d1 -= d0
sub_rm(t, (RegExp)d2); // d1 -= d2
store_mr(gp0 + FpByte_ * 2, t);
gen_raw_sub(gp0, gp1, gp2, rax, pn_); gen_raw_sub(gp0, gp0, d2, rax, pn_);
if (pn_ == 4) { const RegExp& d0H = gp0 + pn_ * 8;
gen_raw_fp_sub(gp0 + pn_ * 8, gp1 + pn_ * 8, gp2 + pn_ * 8, Pack(gt0, gt1, gt2, gt3, gt4, gt5, gt6, gt7), true); const RegExp& d2H = (RegExp)d2 + pn_ * 8;
} else { gen_raw_fp_sub_2(d0H, d0H, d2H, t, true);
assert(pn_ == 6);
gen_raw_fp_sub6(gp0, gp1, gp2, pn_ * 8, sf.t.sub(0, 6), true);
} }
return func; return func;
} }

@ -82,7 +82,7 @@ void setStaticCode(mcl::fp::Op& op)
op.fp2_sqrA_ = mclx_Fp2_sqr; op.fp2_sqrA_ = mclx_Fp2_sqr;
op.fp2_mul2A_ = mclx_Fp2_mul2; op.fp2_mul2A_ = mclx_Fp2_mul2;
op.fp2_mul_xiA_ = mclx_Fp2_mul_xi; op.fp2_mul_xiA_ = mclx_Fp2_mul_xi;
op.fp2Dbl_mulPreA_ = 0;//mclx_Fp2Dbl_mulPre; op.fp2Dbl_mulPreA_ = mclx_Fp2Dbl_mulPre;
op.fp2Dbl_sqrPreA_ = 0;//mclx_Fp2Dbl_sqrPre; op.fp2Dbl_sqrPreA_ = 0;//mclx_Fp2Dbl_sqrPre;
op.fp2Dbl_mul_xiA_ = mclx_Fp2Dbl_mul_xi; op.fp2Dbl_mul_xiA_ = mclx_Fp2Dbl_mul_xi;
op.fp_preInv = mclx_Fp_preInv; op.fp_preInv = mclx_Fp_preInv;

Loading…
Cancel
Save