diff --git a/src/fp_generator.hpp b/src/fp_generator.hpp index 4628ff8..2dca191 100644 --- a/src/fp_generator.hpp +++ b/src/fp_generator.hpp @@ -324,16 +324,14 @@ private: if (func) { op.fp_sqrA_ = reinterpret_cast(func); } - if (op.N > 4) return; if (op.primeMode != PM_NIST_P192 && op.N <= 4) { // support general op.N but not fast for op.N > 4 align(16); op.fp_preInv = getCurr(); gen_preInv(); } + op.fp2_addA_ = gen_fp2_add(); + if (op.N == 4 && !isFullBit_) { - align(16); - op.fp2_addA_ = getCurr(); - gen_fp2_add4(); align(16); op.fp2_subA_ = getCurr(); gen_fp2_sub4(); @@ -3505,6 +3503,36 @@ private: gen_raw_fp_add(sf.p[0], sf.p[1], sf.p[2], sf.t, false); gen_raw_fp_add(sf.p[0] + FpByte_, sf.p[1] + FpByte_, sf.p[2] + FpByte_, sf.t, false); } + void gen_fp2_add6() + { + assert(!isFullBit_); + StackFrame sf(this, 3, 10); + const Reg64& pz = sf.p[0]; + const Reg64& px = sf.p[1]; + const Reg64& py = sf.p[2]; + Pack t1 = sf.t.sub(0, 6); + Pack t2 = sf.t.sub(6); + t2.append(rax); + t2.append(px); // destroyed after use + movq(xm0, px); + gen_raw_fp_add6(pz, px, py, 0, t1, t2, false); + movq(px, xm0); + gen_raw_fp_add6(pz, px, py, FpByte_, t1, t2, false); + } + void3u gen_fp2_add() + { + align(16); + void3u func = getCurr(); + if (pn_ == 4 && !isFullBit_) { + gen_fp2_add4(); + return func; + } + if (pn_ == 6 && !isFullBit_) { + gen_fp2_add6(); + return func; + } + return 0; + } void gen_fp2_sub4() { assert(!isFullBit_); diff --git a/test/bls12_test.cpp b/test/bls12_test.cpp index 42a013d..8722e76 100644 --- a/test/bls12_test.cpp +++ b/test/bls12_test.cpp @@ -687,6 +687,11 @@ int main(int argc, char *argv[]) } FpDbl dx; FpDbl::mulPre(dx, xv[0], yv[0]); + Fp2 x2, y2; + x2.a.setByCSPRNG(rg); + x2.b.setByCSPRNG(rg); + y2.a.setByCSPRNG(rg); + y2.b.setByCSPRNG(rg); if(0){ puts("----------"); xv[0].dump(); @@ -695,12 +700,13 @@ if(0){ puts("----------"); // exit(1); } -// CYBOZU_BENCH_C("subDbl", 10000000, FpDbl::sub, dx, dx, dx); + CYBOZU_BENCH_C("Fp2::add", 10000000, 
Fp2::add, x2, x2, y2); + CYBOZU_BENCH_C("Fp2::sub", 10000000, Fp2::sub, x2, x2, y2); // CYBOZU_BENCH_C("mulPre", 100000000, FpDbl::mulPre, dx, xv[0], yv[0]); // CYBOZU_BENCH_C("sqrPre", 100000000, FpDbl::sqrPre, dx, xv[0]); // CYBOZU_BENCH_C("mod ", 100000000, FpDbl::mod, xv[0], dx); - CYBOZU_BENCH_C("mul ", 100000000, Fp::mul, xv[0], yv[0], xv[0]); - CYBOZU_BENCH_C("sqr ", 100000000, Fp::sqr, xv[0], xv[0]); +// CYBOZU_BENCH_C("mul ", 100000000, Fp::mul, xv[0], yv[0], xv[0]); +// CYBOZU_BENCH_C("sqr ", 100000000, Fp::sqr, xv[0], xv[0]); return 0; #endif return cybozu::test::autoRun.run(argc, argv);