dev
MITSUNARI Shigeo 9 years ago
parent 1b442abe46
commit 27a623806c
  1. 82
      include/mcl/fp_generator.hpp

@ -253,10 +253,10 @@ struct FpGenerator : Xbyak::CodeGenerator {
op.fp_mod = getCurr<void2u>();
gen_fp_mod();
}
if (op.N == 4) {
if (op.N == 3 || op.N == 4) {
align(16);
op.fp_mulPre = getCurr<void3u>();
gen_fp_mulPre4();
gen_fp_mulPre();
}
}
void gen_addSubNC(bool isAdd, int n)
@ -1048,6 +1048,73 @@ struct FpGenerator : Xbyak::CodeGenerator {
movq(pz, xm0);
store_mr(pz, Pack(t2, t0, t3));
}
/*
pz[5..0] <- px[2..0] * py[2..0]
*/
void mul3x3(const RegExp& pz, const RegExp& px, const RegExp& py, const Pack& t)
{
const Reg64& a = rax;
const Reg64& d = rdx;
const Reg64& t0 = t[0];
const Reg64& t1 = t[1];
const Reg64& t2 = t[2];
const Reg64& t3 = t[3];
const Reg64& t4 = t[4];
const Reg64& t5 = t[5];
const Reg64& t6 = t[6];
const Reg64& t7 = t[7];
const Reg64& t8 = t[8];
const Reg64& t9 = t[9];
if (useMulx_) {
mov(d, ptr [px]);
mulx(t0, a, ptr [py + 8 * 0]);
mov(ptr [pz + 8 * 0], a);
mulx(t1, a, ptr [py + 8 * 1]);
add(t0, a);
mulx(t2, a, ptr [py + 8 * 2]);
adc(t1, a);
adc(t2, 0);
} else {
mov(t5, ptr [px]);
mov(a, ptr [py + 8 * 0]);
mul(t5);
mov(ptr [pz + 8 * 0], a);
mov(t0, d);
mov(a, ptr [py + 8 * 1]);
mul(t5);
mov(t3, a);
mov(t1, d);
mov(a, ptr [py + 8 * 2]);
mul(t5);
mov(t4, a);
mov(t2, d);
add(t0, t3);
mov(t2, 0);
adc(t1, a);
adc(t2, d); // [t2:t1:t0:pz[0]] = px[0] * py[2..0]
}
// here [t2:t1:t0]
mov(t9, ptr [px + 8]);
// [d:t9:t6:t5] = px[1] * py[2..0]
mul3x1(py, t9, t7, t6, t5, t4);
add_rr(Pack(t2, t1, t0), Pack(t9, t6, t5));
adc(d, 0);
mov(t8, d);
mov(ptr [pz + 8], t0);
// here [t8:t2:t1]
mov(t9, ptr [px + 16]);
// [d:t9:t5:t4]
mul3x1(py, t9, t6, t5, t4, t0);
add_rr(Pack(t8, t2, t1), Pack(t9, t5, t4));
adc(d, 0);
store_mr(pz + 8 * 2, Pack(d, t8, t2, t1));
}
/*
pz[7..0] <- px[3..0] * py[3..0]
*/
@ -1130,10 +1197,15 @@ struct FpGenerator : Xbyak::CodeGenerator {
store_mr(pz + 8 * 3, Pack(t7, t8, t3, t2));
mov(ptr [pz + 8 * 7], d);
}
void gen_fp_mulPre4()
void gen_fp_mulPre()
{
StackFrame sf(this, 3, 10 | UseRDX);
mul4x4(sf.p[0], sf.p[1], sf.p[2], sf.t);
if (pn_ == 3) {
StackFrame sf(this, 3, 10 | UseRDX);
mul3x3(sf.p[0], sf.p[1], sf.p[2], sf.t);
} else if (pn_ == 4) {
StackFrame sf(this, 3, 10 | UseRDX);
mul4x4(sf.p[0], sf.p[1], sf.p[2], sf.t);
}
}
static inline void debug_put_inner(const uint64_t *ptr, int n)
{

Loading…
Cancel
Save