diff --git a/include/mcl/fp.hpp b/include/mcl/fp.hpp index b6c9cc0..57e5cfa 100644 --- a/include/mcl/fp.hpp +++ b/include/mcl/fp.hpp @@ -341,7 +341,15 @@ public: static inline void sqr(FpT& y, const FpT& x) { op_.fp_sqr(y.v_, x.v_, op_.p); } static inline void divBy2(FpT& y, const FpT& x) { +#if 0 mul(y, x, inv2_); // QQQ : optimize later +#else + bool odd = (x.v_[0] & 1) != 0; + op_.fp_shr1(y.v_, x.v_); + if (odd) { + op_.fp_addPre(y.v_, y.v_, op_.half); + } +#endif } bool isZero() const { return op_.fp_isZero(v_); } bool isOne() const { return fp::isEqualArray(v_, op_.oneRep, op_.N); } diff --git a/include/mcl/op.hpp b/include/mcl/op.hpp index cdf71c5..0e5cba9 100644 --- a/include/mcl/op.hpp +++ b/include/mcl/op.hpp @@ -92,6 +92,7 @@ struct Op { bool (*fp_isZero)(const Unit*); void1u fp_clear; void2u fp_copy; + void2u fp_shr1; void3u fp_neg; void4u fp_add; void4u fp_sub; @@ -159,6 +160,7 @@ struct Op { fp_isZero = 0; fp_clear = 0; fp_copy = 0; + fp_shr1 = 0; fp_neg = 0; fp_add = 0; fp_sub = 0; diff --git a/sample/rawbench.cpp b/sample/rawbench.cpp index 81f261a..2030c30 100644 --- a/sample/rawbench.cpp +++ b/sample/rawbench.cpp @@ -38,13 +38,8 @@ void benchRaw(const char *p, mcl::fp::Mode mode) double fp2_sqrT, fp2_mulT; CYBOZU_BENCH_T(fp_addT, op.fp_add, uz, ux, uy, op.p); CYBOZU_BENCH_T(fp_subT, op.fp_sub, uz, uy, ux, op.p); - if (op.fp_addPre) { - CYBOZU_BENCH_T(fp_addPreT, op.fp_addPre, uz, ux, uy); - CYBOZU_BENCH_T(fp_subPreT, op.fp_subPre, uz, uy, ux); - } else { - fp_addPreT = 0; - fp_subPreT = 0; - } + CYBOZU_BENCH_T(fp_addPreT, op.fp_addPre, uz, ux, uy); + CYBOZU_BENCH_T(fp_subPreT, op.fp_subPre, uz, uy, ux); CYBOZU_BENCH_T(fp_sqrT, op.fp_sqr, uz, ux, op.p); CYBOZU_BENCH_T(fp_mulT, op.fp_mul, uz, ux, uy, op.p); CYBOZU_BENCH_T(fp_mulUnitT, op.fp_mulUnit, uz, ux, 12345678, op.p); diff --git a/src/fp.cpp b/src/fp.cpp index a789d96..2f5b12d 100644 --- a/src/fp.cpp +++ b/src/fp.cpp @@ -153,16 +153,17 @@ template struct SetFpDbl { static inline void exec(Op& op) { - if (!op.isFullBit) { +// if (!op.isFullBit) { op.fpDbl_addPre = AddPre::f; op.fpDbl_subPre = SubPre::f; - } +// } } }; template void setOpSub(Op& op) { + op.fp_shr1 = Shr1::f; op.fp_neg = Neg::f; op.fp_add = Add::f; op.fp_sub = Sub::f; @@ -182,10 +183,8 @@ void setOpSub(Op& op) op.fpN1_mod = N1_Mod::f; op.fpDbl_add = DblAdd::f; op.fpDbl_sub = DblSub::f; - if (!op.isFullBit) { - op.fp_addPre = AddPre::f; - op.fp_subPre = SubPre::f; - } + op.fp_addPre = AddPre::f; + op.fp_subPre = SubPre::f; SetFpDbl::exec(op); } diff --git a/src/fp_generator.hpp b/src/fp_generator.hpp index 18089cd..7947e2d 100644 --- a/src/fp_generator.hpp +++ b/src/fp_generator.hpp @@ -197,17 +197,12 @@ struct FpGenerator : Xbyak::CodeGenerator { op.fp_sub = getCurr(); gen_fp_sub(); - if (op.isFullBit) { - op.fp_addPre = 0; - op.fp_subPre = 0; - } else { - align(16); - op.fp_addPre = getCurr(); - gen_addSubPre(true, pn_); - align(16); - op.fp_subPre = getCurr(); - gen_addSubPre(false, pn_); - } + align(16); + op.fp_addPre = getCurr(); + gen_addSubPre(true, pn_); + align(16); + op.fp_subPre = getCurr(); + gen_addSubPre(false, pn_); align(16); shr1_ = getCurr(); gen_shr1(); diff --git a/src/fp_proto.hpp b/src/fp_proto.hpp index 6b538d7..7c755b6 100644 --- a/src/fp_proto.hpp +++ b/src/fp_proto.hpp @@ -43,7 +43,6 @@ template struct AddPre { static inline Unit func(Unit *z, const Unit *x, const Unit *y) { - if (N == 0) return 0; return mpn_add_n((mp_limb_t*)z, (const mp_limb_t*)x, (const mp_limb_t*)y, N); } static const u3u f; @@ -87,6 +86,19 @@ struct SubPre { template const u3u SubPre::f = SubPre::func; +// y[N] <- (x[N] >> 1) +template +struct Shr1 { + static inline void func(Unit *y, const Unit *x) + { + mpn_rshift((mp_limb_t*)y, (const mp_limb_t*)x, (int)N, 1); + } + static const void2u f; +}; + +template +const void2u Shr1::f = Shr1::func; + // y[N] <- (-x[N]) % p[N] template struct Neg {