From df455d0ba4c7b380c82c531d1c8e8e54b0371f06 Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Tue, 30 Oct 2018 17:48:22 +0900 Subject: [PATCH] Karatsuba of mulPre6 is slower --- src/fp_generator.hpp | 154 +++++++++++++++++++++++++++++++++---------- test/bench.hpp | 15 +++-- test/bls12_test.cpp | 16 +++-- 3 files changed, 140 insertions(+), 45 deletions(-) diff --git a/src/fp_generator.hpp b/src/fp_generator.hpp index 4408203..80410e0 100644 --- a/src/fp_generator.hpp +++ b/src/fp_generator.hpp @@ -345,25 +345,12 @@ private: mulPre4(gp0, gp1, gp2, sf.t); ret(); } else if (op.N == 6 && useAdx_) { -#if 1 - StackFrame sf(this, 3, 7 | UseRDX, 0, false); - mulPre6(gp0, gp1, gp2, sf.t); + StackFrame sf(this, 3, 10 | UseRDX, 0, false); + call(mulPreL); sf.close(); // make epilog L(mulPreL); // called only from asm code - mulPre6(gp0, gp1, gp2, sf.t); + mulPre6(sf.t); ret(); -#else - { - StackFrame sf(this, 3, 7 | UseRDX); - mulPre6(gp0, gp1, gp2, sf.t); - } - { - StackFrame sf(this, 3, 10 | UseRDX, 0, false); - L(mulPreL); // called only from asm code - mulPre6(gp0, gp1, gp2, sf.t); - ret(); - } -#endif } else { gen_fpDbl_mulPre(); } @@ -1546,13 +1533,13 @@ private: const Reg64& a = rax; const Reg64& d = rdx; mov(d, ptr [px]); - mulx(hi, a, ptr [py + 8 * 0]); - adox(pd[0], a); - mov(ptr [pz], pd[0]); - for (size_t i = 1; i < pd.size(); i++) { - adcx(pd[i], hi); - mulx(hi, a, ptr [py + 8 * i]); + xor_(a, a); + for (size_t i = 0; i < pd.size(); i++) { + mulx(hi, a, ptr [py + i * 8]); adox(pd[i], a); + if (i == 0) mov(ptr[pz], pd[0]); + if (i == pd.size() - 1) break; + adcx(pd[i + 1], hi); } mov(d, 0); adcx(hi, d); @@ -1814,6 +1801,16 @@ private: const Reg64& t8 = t[8]; const Reg64& t9 = t[9]; +#if 0 // a little slower + if (useMulx_ && useAdx_) { + mulPack(pz, px, py, Pack(t3, t2, t1, t0)); + mulPackAdd(pz + 8 * 1, px + 8 * 1, py, t4, Pack(t3, t2, t1, t0)); + mulPackAdd(pz + 8 * 2, px + 8 * 2, py, t0, Pack(t4, t3, t2, t1)); + mulPackAdd(pz + 8 * 3, px + 8 * 3, py, t1, 
Pack(t0, t4, t3, t2)); + store_mr(pz + 8 * 4, Pack(t1, t0, t4, t3)); + return; + } +#endif #if 0 // a little slower if (!useMulx_) { @@ -1838,17 +1835,6 @@ private: #else if (useMulx_) { mulPack(pz, px, py, Pack(t3, t2, t1, t0)); - if (0 && useAdx_) { // a little slower? - // [t3:t2:t1:t0] - mulPackAdd(pz + 8 * 1, px + 8 * 1, py, t4, Pack(t3, t2, t1, t0)); - // [t4:t3:t2:t1] - mulPackAdd(pz + 8 * 2, px + 8 * 2, py, t5, Pack(t4, t3, t2, t1)); - // [t5:t4:t3:t2] - mulPackAdd(pz + 8 * 3, px + 8 * 3, py, t0, Pack(t5, t4, t3, t2)); - // [t0:t5:t4:t3] - store_mr(pz + 8 * 4, Pack(t0, t5, t4, t3)); - return; - } } else { mov(t5, ptr [px]); mov(a, ptr [py + 8 * 0]); @@ -1903,12 +1889,111 @@ private: mov(ptr [pz + 8 * 7], d); #endif } - void mulPre6(const RegExp& pz, const RegExp& px, const RegExp& py, const Pack& t) + // [gp0] <- [gp1] * [gp2] + void mulPre6(const Pack& t) { + const Reg64& pz = gp0; + const Reg64& px = gp1; + const Reg64& py = gp2; const Reg64& t0 = t[0]; const Reg64& t1 = t[1]; const Reg64& t2 = t[2]; const Reg64& t3 = t[3]; +#if 0 // slower than basic multiplication(56clk -> 67clk) +// const Reg64& t7 = t[7]; +// const Reg64& t8 = t[8]; +// const Reg64& t9 = t[9]; + const Reg64& a = rax; + const Reg64& d = rdx; + const int stackSize = (3 + 3 + 6 + 1 + 1 + 1) * 8; // a+b, c+d, (a+b)(c+d), z, y, x + const int abPos = 0; + const int cdPos = abPos + 3 * 8; + const int abcdPos = cdPos + 3 * 8; + const int zPos = abcdPos + 6 * 8; + const int yPos = zPos + 8; + const int xPos = yPos + 8; + + sub(rsp, stackSize); + mov(ptr[rsp + zPos], pz); + mov(ptr[rsp + xPos], px); + mov(ptr[rsp + yPos], py); + /* + x = aN + b, y = cN + d + xy = acN^2 + ((a+b)(c+d) - ac - bd)N + bd + */ + xor_(a, a); + load_rm(Pack(t2, t1, t0), px); // b + add_rm(Pack(t2, t1, t0), px + 3 * 8); // a + b + adc(a, 0); + store_mr(pz, Pack(t2, t1, t0)); + movq(xm0, a); // carry1 + + xor_(a, a); + load_rm(Pack(t2, t1, t0), py); // d + add_rm(Pack(t2, t1, t0), py + 3 * 8); // c + d + adc(a, 0); + 
store_mr(pz + 3 * 8, Pack(t2, t1, t0)); + movq(xm1, a); // carry2 + + mulPre3(rsp + abcdPos, pz, pz + 3 * 8, t); // (a+b)(c+d) + + movq(a, xm0); + movq(d, xm1); + mov(t3, a); + and_(t3, d); // t3 = carry1 & carry2; NOTE(review): the je below also skips the fixup when exactly one carry is set - confirm intended + Label doNothing; + je(doNothing); + load_rm(Pack(t2, t1, t0), rsp + abcdPos + 3 * 8); + test(a, a); + je("@f"); + // add (c+d) + add_rm(Pack(t2, t1, t0), pz + 3 * 8); + adc(t3, 0); + L("@@"); + test(d, d); + je("@f"); + // add(a+b) + add_rm(Pack(t2, t1, t0), pz); + adc(t3, 0); + L("@@"); + store_mr(rsp + abcdPos + 3 * 8, Pack(t2, t1, t0)); + L(doNothing); + movq(xm0, t3); // save new carry + + + mov(gp0, ptr [rsp + zPos]); + mov(gp1, ptr [rsp + xPos]); + mov(gp2, ptr [rsp + yPos]); + mulPre3(gp0, gp1, gp2, t); // [gp0] <- bd + + mov(gp0, ptr [rsp + zPos]); + mov(gp1, ptr [rsp + xPos]); + mov(gp2, ptr [rsp + yPos]); + mulPre3(gp0 + 6 * 8, gp1 + 3 * 8, gp2 + 3 * 8, t); // [gp0 + 6 * 8] <- ac + + mov(pz, ptr[rsp + zPos]); + movq(d, xm0); + for (int i = 0; i < 6; i++) { // adds (a+b)(c+d) into pz[3..8]; NOTE(review): no subtraction of ac and bd appears in this disabled branch - confirm + mov(a, ptr[pz + (3 + i) * 8]); + if (i == 0) { + add(a, ptr[rsp + abcdPos + i * 8]); + } else { + adc(a, ptr[rsp + abcdPos + i * 8]); + } + mov(ptr[pz + (3 + i) * 8], a); + } + mov(a, ptr[pz + 9 * 8]); + adc(a, d); + mov(ptr[pz + 9 * 8], a); + jnc("@f"); + for (int i = 10; i < 12; i++) { + mov(a, ptr[pz + i * 8]); + adc(a, 0); + mov(ptr[pz + i * 8], a); + } + L("@@"); + add(rsp, stackSize); +#else const Reg64& t4 = t[4]; const Reg64& t5 = t[5]; const Reg64& t6 = t[6]; @@ -1920,6 +2005,7 @@ private: mulPackAdd(pz + 8 * 4, px + 8 * 4, py, t2, Pack(t1, t0, t6, t5, t4, t3)); // [t2:t1:t0:t6:t5:t4] mulPackAdd(pz + 8 * 5, px + 8 * 5, py, t3, Pack(t2, t1, t0, t6, t5, t4)); // [t3:t2:t1:t0:t6:t5] store_mr(pz + 8 * 6, Pack(t3, t2, t1, t0, t6, t5)); +#endif } /* @input (z, xy) diff --git a/test/bench.hpp b/test/bench.hpp index 8693a71..65850fa 100644 --- a/test/bench.hpp +++ b/test/bench.hpp @@ -8,12 +8,12 @@ void testBench(const G1& P, const G2& Q) pairing(e1, P, Q); Fp12::pow(e2, e1, 12345); const 
int C = 500; - const int C2 = 1000; const int C3 = 10000; Fp x, y; x.setHashOf("abc"); y.setHashOf("xyz"); #if 1 + const int C2 = 1000; mpz_class a = x.getMpz(); CYBOZU_BENCH_C("G1::mulCT ", C, G1::mulCT, Pa, P, a); CYBOZU_BENCH_C("G1::mul ", C, G1::mul, Pa, Pa, a); @@ -52,6 +52,9 @@ void testBench(const G1& P, const G2& Q) xx.b = 3; yy.a = y; yy.b = -5; + FpDbl d0, d1; + x = 9; + y = 3; #if 1 CYBOZU_BENCH_C("Fp2::add ", C3, Fp2::add, xx, xx, yy); CYBOZU_BENCH_C("Fp2::sub ", C3, Fp2::sub, xx, xx, yy); @@ -60,9 +63,6 @@ void testBench(const G1& P, const G2& Q) CYBOZU_BENCH_C("Fp2::mul_xi ", C3, Fp2::mul_xi, xx, xx); CYBOZU_BENCH_C("Fp2::sqr ", C3, Fp2::sqr, xx, xx); CYBOZU_BENCH_C("Fp2::inv ", C3, Fp2::inv, xx, xx); - FpDbl d0, d1; - x = 9; - y = 3; CYBOZU_BENCH_C("FpDbl::addPre ", C3, FpDbl::addPre, d1, d1, d0); CYBOZU_BENCH_C("FpDbl::subPre ", C3, FpDbl::subPre, d1, d1, d0); CYBOZU_BENCH_C("FpDbl::add ", C3, FpDbl::add, d1, d1, d0); @@ -79,9 +79,10 @@ void testBench(const G1& P, const G2& Q) CYBOZU_BENCH_C("GT::sqr ", C2, GT::sqr, e1, e1); CYBOZU_BENCH_C("GT::inv ", C2, GT::inv, e1, e1); #endif - CYBOZU_BENCH_C("pairing ", C, pairing, e1, P, Q); - CYBOZU_BENCH_C("millerLoop ", C, millerLoop, e1, P, Q); - CYBOZU_BENCH_C("finalExp ", C, finalExp, e1, e1); + CYBOZU_BENCH_C("FpDbl::mulPre ", 10000000, FpDbl::mulPre, d0, x, y); + CYBOZU_BENCH_C("pairing ", C3, pairing, e1, P, Q); + CYBOZU_BENCH_C("millerLoop ", C3, millerLoop, e1, P, Q); + CYBOZU_BENCH_C("finalExp ", C3, finalExp, e1, e1); //exit(1); std::vector Qcoeff; precomputeG2(Qcoeff, Q); diff --git a/test/bls12_test.cpp b/test/bls12_test.cpp index 9be7307..501603a 100644 --- a/test/bls12_test.cpp +++ b/test/bls12_test.cpp @@ -686,10 +686,18 @@ int main(int argc, char *argv[]) yv[i].setByCSPRNG(rg); } FpDbl dx; - FpDbl::mulPre(dx, xv[0], xv[0]); - CYBOZU_BENCH_C("subDbl", 10000000, FpDbl::sub, dx, dx, dx); -// CYBOZU_BENCH_C("mul", 10000000 / n, f, xv, yv, xv); -// CYBOZU_BENCH_C("mulPre", 10000000, 
FpDbl::mulPre, dx, xv[0], yv[0]); + FpDbl::mulPre(dx, xv[0], yv[0]); +if(0){ + puts("----------"); + xv[0].dump(); + yv[0].dump(); + dx.dump(); + puts("----------"); +// exit(1); +} +// CYBOZU_BENCH_C("subDbl", 10000000, FpDbl::sub, dx, dx, dx); + CYBOZU_BENCH_C("mul", 10000000 / n, f, xv, yv, xv); + CYBOZU_BENCH_C("mulPre", 10000000, FpDbl::mulPre, dx, xv[0], yv[0]); return 0; #endif return cybozu::test::autoRun.run(argc, argv);