optimize divBy2

dev
MITSUNARI Shigeo 8 years ago
parent f03372873e
commit b0f23bb8fd
  1. 8
      include/mcl/fp.hpp
  2. 2
      include/mcl/op.hpp
  3. 9
      sample/rawbench.cpp
  4. 11
      src/fp.cpp
  5. 17
      src/fp_generator.hpp
  6. 14
      src/fp_proto.hpp

@ -341,7 +341,15 @@ public:
static inline void sqr(FpT& y, const FpT& x) { op_.fp_sqr(y.v_, x.v_, op_.p); } static inline void sqr(FpT& y, const FpT& x) { op_.fp_sqr(y.v_, x.v_, op_.p); }
static inline void divBy2(FpT& y, const FpT& x) static inline void divBy2(FpT& y, const FpT& x)
{ {
#if 0
mul(y, x, inv2_); // QQQ : optimize later mul(y, x, inv2_); // QQQ : optimize later
#else
bool odd = (x.v_[0] & 1) != 0;
op_.fp_shr1(y.v_, x.v_);
if (odd) {
op_.fp_addPre(y.v_, y.v_, op_.half);
}
#endif
} }
bool isZero() const { return op_.fp_isZero(v_); } bool isZero() const { return op_.fp_isZero(v_); }
bool isOne() const { return fp::isEqualArray(v_, op_.oneRep, op_.N); } bool isOne() const { return fp::isEqualArray(v_, op_.oneRep, op_.N); }

@ -92,6 +92,7 @@ struct Op {
bool (*fp_isZero)(const Unit*); bool (*fp_isZero)(const Unit*);
void1u fp_clear; void1u fp_clear;
void2u fp_copy; void2u fp_copy;
void2u fp_shr1;
void3u fp_neg; void3u fp_neg;
void4u fp_add; void4u fp_add;
void4u fp_sub; void4u fp_sub;
@ -159,6 +160,7 @@ struct Op {
fp_isZero = 0; fp_isZero = 0;
fp_clear = 0; fp_clear = 0;
fp_copy = 0; fp_copy = 0;
fp_shr1 = 0;
fp_neg = 0; fp_neg = 0;
fp_add = 0; fp_add = 0;
fp_sub = 0; fp_sub = 0;

@ -38,13 +38,8 @@ void benchRaw(const char *p, mcl::fp::Mode mode)
double fp2_sqrT, fp2_mulT; double fp2_sqrT, fp2_mulT;
CYBOZU_BENCH_T(fp_addT, op.fp_add, uz, ux, uy, op.p); CYBOZU_BENCH_T(fp_addT, op.fp_add, uz, ux, uy, op.p);
CYBOZU_BENCH_T(fp_subT, op.fp_sub, uz, uy, ux, op.p); CYBOZU_BENCH_T(fp_subT, op.fp_sub, uz, uy, ux, op.p);
if (op.fp_addPre) { CYBOZU_BENCH_T(fp_addPreT, op.fp_addPre, uz, ux, uy);
CYBOZU_BENCH_T(fp_addPreT, op.fp_addPre, uz, ux, uy); CYBOZU_BENCH_T(fp_subPreT, op.fp_subPre, uz, uy, ux);
CYBOZU_BENCH_T(fp_subPreT, op.fp_subPre, uz, uy, ux);
} else {
fp_addPreT = 0;
fp_subPreT = 0;
}
CYBOZU_BENCH_T(fp_sqrT, op.fp_sqr, uz, ux, op.p); CYBOZU_BENCH_T(fp_sqrT, op.fp_sqr, uz, ux, op.p);
CYBOZU_BENCH_T(fp_mulT, op.fp_mul, uz, ux, uy, op.p); CYBOZU_BENCH_T(fp_mulT, op.fp_mul, uz, ux, uy, op.p);
CYBOZU_BENCH_T(fp_mulUnitT, op.fp_mulUnit, uz, ux, 12345678, op.p); CYBOZU_BENCH_T(fp_mulUnitT, op.fp_mulUnit, uz, ux, 12345678, op.p);

@ -153,16 +153,17 @@ template<size_t N>
struct SetFpDbl<N, true> { struct SetFpDbl<N, true> {
static inline void exec(Op& op) static inline void exec(Op& op)
{ {
if (!op.isFullBit) { // if (!op.isFullBit) {
op.fpDbl_addPre = AddPre<N * 2, Ltag>::f; op.fpDbl_addPre = AddPre<N * 2, Ltag>::f;
op.fpDbl_subPre = SubPre<N * 2, Ltag>::f; op.fpDbl_subPre = SubPre<N * 2, Ltag>::f;
} // }
} }
}; };
template<size_t N, class Tag, bool enableFpDbl> template<size_t N, class Tag, bool enableFpDbl>
void setOpSub(Op& op) void setOpSub(Op& op)
{ {
op.fp_shr1 = Shr1<N, Tag>::f;
op.fp_neg = Neg<N, Tag>::f; op.fp_neg = Neg<N, Tag>::f;
op.fp_add = Add<N, Tag>::f; op.fp_add = Add<N, Tag>::f;
op.fp_sub = Sub<N, Tag>::f; op.fp_sub = Sub<N, Tag>::f;
@ -182,10 +183,8 @@ void setOpSub(Op& op)
op.fpN1_mod = N1_Mod<N, Tag>::f; op.fpN1_mod = N1_Mod<N, Tag>::f;
op.fpDbl_add = DblAdd<N, Tag>::f; op.fpDbl_add = DblAdd<N, Tag>::f;
op.fpDbl_sub = DblSub<N, Tag>::f; op.fpDbl_sub = DblSub<N, Tag>::f;
if (!op.isFullBit) { op.fp_addPre = AddPre<N, Tag>::f;
op.fp_addPre = AddPre<N, Tag>::f; op.fp_subPre = SubPre<N, Tag>::f;
op.fp_subPre = SubPre<N, Tag>::f;
}
SetFpDbl<N, enableFpDbl>::exec(op); SetFpDbl<N, enableFpDbl>::exec(op);
} }

@ -197,17 +197,12 @@ struct FpGenerator : Xbyak::CodeGenerator {
op.fp_sub = getCurr<void4u>(); op.fp_sub = getCurr<void4u>();
gen_fp_sub(); gen_fp_sub();
if (op.isFullBit) { align(16);
op.fp_addPre = 0; op.fp_addPre = getCurr<u3u>();
op.fp_subPre = 0; gen_addSubPre(true, pn_);
} else { align(16);
align(16); op.fp_subPre = getCurr<u3u>();
op.fp_addPre = getCurr<u3u>(); gen_addSubPre(false, pn_);
gen_addSubPre(true, pn_);
align(16);
op.fp_subPre = getCurr<u3u>();
gen_addSubPre(false, pn_);
}
align(16); align(16);
shr1_ = getCurr<void2op>(); shr1_ = getCurr<void2op>();
gen_shr1(); gen_shr1();

@ -43,7 +43,6 @@ template<size_t N, class Tag = Gtag>
struct AddPre { struct AddPre {
static inline Unit func(Unit *z, const Unit *x, const Unit *y) static inline Unit func(Unit *z, const Unit *x, const Unit *y)
{ {
if (N == 0) return 0;
return mpn_add_n((mp_limb_t*)z, (const mp_limb_t*)x, (const mp_limb_t*)y, N); return mpn_add_n((mp_limb_t*)z, (const mp_limb_t*)x, (const mp_limb_t*)y, N);
} }
static const u3u f; static const u3u f;
@ -87,6 +86,19 @@ struct SubPre {
template<size_t N, class Tag> template<size_t N, class Tag>
const u3u SubPre<N, Tag>::f = SubPre<N, Tag>::func; const u3u SubPre<N, Tag>::f = SubPre<N, Tag>::func;
// y[N] <- (x[N] >> 1)
template<size_t N, class Tag = Gtag>
struct Shr1 {
static inline void func(Unit *y, const Unit *x)
{
mpn_rshift((mp_limb_t*)y, (const mp_limb_t*)x, (int)N, 1);
}
static const void2u f;
};
template<size_t N, class Tag>
const void2u Shr1<N, Tag>::f = Shr1<N, Tag>::func;
// y[N] <- (-x[N]) % p[N] // y[N] <- (-x[N]) % p[N]
template<size_t N, class Tag = Gtag> template<size_t N, class Tag = Gtag>
struct Neg { struct Neg {

Loading…
Cancel
Save