fast Fp::sqrPre

update-fork
MITSUNARI Shigeo 4 years ago
parent b7a47dc519
commit e1fc81a551
  1. 138
      src/fp_generator.hpp
  2. 1
      test/bench.hpp

@ -2091,38 +2091,124 @@ private:
store_mr(py + 2 * 8, Pack(d, t8, t10, t9, t3, t2)); store_mr(py + 2 * 8, Pack(d, t8, t10, t9, t3, t2));
} }
/* old comment (left column of the diff):
py[11..0] = px[5..0] ^ 2
use rax, rdx, stack[6 * 8]
*/
/* new comment (right column of the diff):
(5, 5)(4, 4)(3, 3)(2, 2)(1, 1)(0, 0)
t9 t8 t7 t6 t5 t4 t3 t2 t1 t0
(5, 4)(4, 3)(3, 2)(2, 1)(1, 0)
(5, 3)(4, 2)(3, 1)(2, 0)
(5, 2)(4, 1)(3, 0)
(5, 1)(4, 0)
(5, 0)
*/
void sqrPre6(const RegExp& py, const RegExp& px, const Pack& t) void sqrPre6(const RegExp& py, const RegExp& px, const Pack& t)
{ {
const Reg64& t0 = t[0]; const Reg64& t0 = t[0];
const Reg64& t1 = t[1]; const Reg64& t1 = t[1];
const Reg64& t2 = t[2]; const Reg64& t2 = t[2];
/* const Reg64& t3 = t[3];
(aN + b)^2 = a^2 N^2 + 2ab N + b^2 const Reg64& t4 = t[4];
*/ const Reg64& t5 = t[5];
sqrPre3(py, px, t); // [py] <- b^2 const Reg64& t6 = t[6];
sqrPre3(py + 6 * 8, px + 3 * 8, t); // [py + 6 * 8] <- a^2 const Reg64& t7 = t[7];
mulPre3(rsp, px, px + 3 * 8, t); // ab const Reg64& t8 = t[8];
Pack ab = t.sub(0, 6); const Reg64& t9 = t[9];
load_rm(ab, rsp); const Reg64& H = t[10];
xor_(rax, rax);
for (int i = 0; i < 6; i++) { mov(rdx, ptr[px + 8 * 0]);
if (i == 0) { mulx(t5, t4, ptr[px + 8 * 5]); // [t5:t4] = (5, 0)
add(ab[i], ab[i]); mulx(rax, t3, ptr[px + 8 * 4]); // (4, 0)
} else { add(t4, rax);
adc(ab[i], ab[i]); mov(rdx, ptr[px + 8 * 1]);
} mulx(t6, rax, ptr[px + 8 * 5]); // (5, 1)
} adc(t5, rax);
adc(rax, rax); adc(t6, 0); // [t6:t5:t4:t3]
add_rm(ab, py + 3 * 8); mov(rdx, ptr[px + 8 * 0]);
store_mr(py + 3 * 8, ab); mulx(rax, t2, ptr[px + 8 * 3]);
load_rm(Pack(t2, t1, t0), py + 9 * 8); add(t3, rax);
adc(t0, rax); mov(rdx, ptr[px + 8 * 1]);
adc(t1, 0); mulx(H, rax, ptr[px + 8 * 4]);
adc(t2, 0); adc(t4, rax);
store_mr(py + 9 * 8, Pack(t2, t1, t0)); adc(t5, H);
mov(rdx, ptr[px + 8 * 2]);
mulx(t7, rax, ptr[px + 8 * 5]);
adc(t6, rax);
adc(t7, 0); // [t7:...:t2]
mov(rdx, ptr[px + 8 * 0]);
mulx(H, t1, ptr[px + 8 * 2]);
adc(t2, H);
mov(rdx, ptr[px + 8 * 1]);
mulx(H, rax, ptr[px + 8 * 3]);
adc(t3, rax);
adc(t4, H);
mov(rdx, ptr[px + 8 * 2]);
mulx(H, rax, ptr[px + 8 * 4]);
adc(t5, rax);
adc(t6, H);
mov(rdx, ptr[px + 8 * 3]);
mulx(t8, rax, ptr[px + 8 * 5]);
adc(t7, rax);
adc(t8, 0); // [t8:...:t1]
mov(rdx, ptr[px + 8 * 0]);
mulx(H, t0, ptr[px + 8 * 1]);
add(t1, H);
mov(rdx, ptr[px + 8 * 1]);
mulx(H, rax, ptr[px + 8 * 2]);
adc(t2, rax);
adc(t3, H);
mov(rdx, ptr[px + 8 * 2]);
mulx(H, rax, ptr[px + 8 * 3]);
adc(t4, rax);
adc(t5, H);
mov(rdx, ptr[px + 8 * 3]);
mulx(H, rax, ptr[px + 8 * 4]);
adc(t6, rax);
adc(t7, H);
mov(rdx, ptr[px + 8 * 4]);
mulx(t9, rax, ptr[px + 8 * 5]);
adc(t8, rax);
adc(t9, 0); // [t9...:t0]
shl1(Pack(t9, t8, t7, t6, t5, t4, t3, t2, t1, t0), &H);
mov(rdx, ptr[px + 8 * 0]);
mulx(rdx, rax, rdx);
mov(ptr[py + 8 * 0], rax);
add(t0, rdx);
mov(ptr[py + 8 * 1], t0);
mov(rdx, ptr[px + 8 * 1]);
mulx(rdx, rax, rdx);
adc(t1, rax);
mov(ptr[py + 8 * 2], t1);
adc(t2, rdx);
mov(ptr[py + 8 * 3], t2);
mov(rdx, ptr[px + 8 * 2]);
mulx(rdx, rax, rdx);
adc(t3, rax);
mov(ptr[py + 8 * 4], t3);
adc(t4, edx);
mov(ptr[py + 8 * 5], t4);
mov(rdx, ptr[px + 8 * 3]);
mulx(rdx, rax, rdx);
adc(t5, rax);
mov(ptr[py + 8 * 6], t5);
adc(t6, rdx);
mov(ptr[py + 8 * 7], t6);
mov(rdx, ptr[px + 8 * 4]);
mulx(rdx, rax, rdx);
adc(t7, rax);
mov(ptr[py + 8 * 8], t7);
adc(t8, rdx);
mov(ptr[py + 8 * 9], t8);
mov(rdx, ptr[px + 8 * 5]);
mulx(rdx, rax, rdx);
adc(t9, rax);
mov(ptr[py + 8 * 10], t9);
adc(rdx, H);
mov(ptr[py + 8 * 11], rdx);
} }
/* /*
pz[7..0] <- px[3..0] * py[3..0] pz[7..0] <- px[3..0] * py[3..0]

@ -168,7 +168,6 @@ void testBench(const G1& P, const G2& Q)
CYBOZU_BENCH_C("GT::sqr ", C2, GT::sqr, e1, e1); CYBOZU_BENCH_C("GT::sqr ", C2, GT::sqr, e1, e1);
CYBOZU_BENCH_C("GT::inv ", C2, GT::inv, e1, e1); CYBOZU_BENCH_C("GT::inv ", C2, GT::inv, e1, e1);
#endif #endif
CYBOZU_BENCH_C("FpDbl::mulPre ", C3, FpDbl::mulPre, d0, x, y);
CYBOZU_BENCH_C("pairing ", 3000, pairing, e1, P, Q); CYBOZU_BENCH_C("pairing ", 3000, pairing, e1, P, Q);
CYBOZU_BENCH_C("millerLoop ", 3000, millerLoop, e1, P, Q); CYBOZU_BENCH_C("millerLoop ", 3000, millerLoop, e1, P, Q);
CYBOZU_BENCH_C("finalExp ", 3000, finalExp, e1, e1); CYBOZU_BENCH_C("finalExp ", 3000, finalExp, e1, e1);

Loading…
Cancel
Save