direct call asm

dev
MITSUNARI Shigeo 6 years ago
parent a22538c8b5
commit 7f5f70f673
  1. 1
      Makefile
  2. 24
      include/mcl/fp.hpp
  3. 6
      include/mcl/op.hpp
  4. 5
      src/fp_generator.hpp
  5. 3
      test/bench.hpp

@ -15,6 +15,7 @@ ifeq ($(CPU),x86-64)
TEST_SRC+=low_test.cpp
endif
ifeq ($(MCL_USE_XBYAK),1)
CFLAGS+=-DMCL_XBYAK_DIRECT_CALL
TEST_SRC+=fp_generator_test.cpp
endif
endif

@ -129,6 +129,14 @@ public:
if (!*pb) return;
}
inv(inv2_, 2);
#ifdef MCL_XBYAK_DIRECT_CALL
add = (void (*)(FpT& z, const FpT& x, const FpT& y))op_.fp_addA_;
if (add == 0) add = addC;
sub = (void (*)(FpT& z, const FpT& x, const FpT& y))op_.fp_subA_;
if (sub == 0) sub = subC;
mul = (void (*)(FpT& z, const FpT& x, const FpT& y))op_.fp_mulA_;
if (mul == 0) mul = mulC;
#endif
*pb = true;
}
static inline void init(bool *pb, const char *mstr, fp::Mode mode = fp::FP_AUTO)
@ -344,11 +352,20 @@ public:
}
setArray(pb, gmp::getUnit(x), gmp::getUnitSize(x));
}
#ifdef MCL_XBYAK_DIRECT_CALL
// With MCL_XBYAK_DIRECT_CALL defined, add/sub/mul are static function
// pointers instead of inline functions. The init code assigns each one from
// op_.fp_addA_ / op_.fp_subA_ / op_.fp_mulA_, and falls back to the matching
// *C wrapper below when that entry is null.
static void (*add)(FpT& z, const FpT& x, const FpT& y);
// Fallback for add: forwards to op_.fp_add, passing op_.p as the extra argument.
static inline void addC(FpT& z, const FpT& x, const FpT& y) { op_.fp_add(z.v_, x.v_, y.v_, op_.p); }
static void (*sub)(FpT& z, const FpT& x, const FpT& y);
// Fallback for sub: forwards to op_.fp_sub, passing op_.p as the extra argument.
static inline void subC(FpT& z, const FpT& x, const FpT& y) { op_.fp_sub(z.v_, x.v_, y.v_, op_.p); }
static void (*mul)(FpT& z, const FpT& x, const FpT& y);
// Fallback for mul: forwards to op_.fp_mul, passing op_.p as the extra argument.
static inline void mulC(FpT& z, const FpT& x, const FpT& y) { op_.fp_mul(z.v_, x.v_, y.v_, op_.p); }
#else
// Without MCL_XBYAK_DIRECT_CALL, add/sub/mul are plain inline wrappers that
// call through op_ with op_.p as the extra argument on every call.
static inline void add(FpT& z, const FpT& x, const FpT& y) { op_.fp_add(z.v_, x.v_, y.v_, op_.p); }
static inline void sub(FpT& z, const FpT& x, const FpT& y) { op_.fp_sub(z.v_, x.v_, y.v_, op_.p); }
static inline void mul(FpT& z, const FpT& x, const FpT& y) { op_.fp_mul(z.v_, x.v_, y.v_, op_.p); }
#endif
static inline void addPre(FpT& z, const FpT& x, const FpT& y) { op_.fp_addPre(z.v_, x.v_, y.v_); }
static inline void subPre(FpT& z, const FpT& x, const FpT& y) { op_.fp_subPre(z.v_, x.v_, y.v_); }
static inline void mul(FpT& z, const FpT& x, const FpT& y) { op_.fp_mul(z.v_, x.v_, y.v_, op_.p); }
static inline void mulUnit(FpT& z, const FpT& x, const Unit y)
{
if (mulSmallUnit(z, x, y)) return;
@ -563,6 +580,11 @@ public:
template<class tag, size_t maxBitSize> fp::Op FpT<tag, maxBitSize>::op_;
template<class tag, size_t maxBitSize> FpT<tag, maxBitSize> FpT<tag, maxBitSize>::inv2_;
template<class tag, size_t maxBitSize> int FpT<tag, maxBitSize>::ioMode_ = IoAuto;
#ifdef MCL_XBYAK_DIRECT_CALL
// Out-of-class definitions of the static function-pointer members add/sub/mul
// that FpT declares when MCL_XBYAK_DIRECT_CALL is defined. No initializer is
// given here; the init code assigns the actual targets.
template<class tag, size_t maxBitSize> void (*FpT<tag, maxBitSize>::add)(FpT& z, const FpT& x, const FpT& y);
template<class tag, size_t maxBitSize> void (*FpT<tag, maxBitSize>::sub)(FpT& z, const FpT& x, const FpT& y);
template<class tag, size_t maxBitSize> void (*FpT<tag, maxBitSize>::mul)(FpT& z, const FpT& x, const FpT& y);
#endif
} // mcl

@ -179,6 +179,9 @@ struct Op {
FpGenerator *fg;
mcl::Array<Unit> invTbl;
#endif
void3u fp_addA_;
void3u fp_subA_;
void3u fp_mulA_;
size_t maxN;
size_t N;
size_t bitSize;
@ -256,6 +259,9 @@ struct Op {
fg = 0;
invTbl.clear();
#endif
fp_addA_ = 0;
fp_subA_ = 0;
fp_mulA_ = 0;
maxN = 0;
N = 0;
bitSize = 0;

@ -200,7 +200,7 @@ struct FpGenerator : Xbyak::CodeGenerator {
typedef bool (*bool3op)(uint64_t*, const uint64_t*, const uint64_t*);
// add/sub with mod
typedef void (*void3op)(uint64_t*, const uint64_t*, const uint64_t*);
// typedef void (*void3op)(uint64_t*, const uint64_t*, const uint64_t*);
// mul without carry. return top of z
typedef uint64_t (*uint3opI)(uint64_t*, const uint64_t*, uint64_t);
@ -268,9 +268,11 @@ struct FpGenerator : Xbyak::CodeGenerator {
setSize(0); // reset code
align(16);
op.fp_add = getCurr<void4u>();
op.fp_addA_ = getCurr<void3u>();
gen_fp_add();
align(16);
op.fp_sub = getCurr<void4u>();
op.fp_subA_ = getCurr<void3u>();
gen_fp_sub();
align(16);
@ -293,6 +295,7 @@ struct FpGenerator : Xbyak::CodeGenerator {
align(16);
mul_ = getCurr<void4u>();
op.fp_mul = mul_;
op.fp_mulA_ = getCurr<void3u>();
gen_mul();
align(16);
op.fp_sqr = getCurr<void3u>();

@ -43,11 +43,11 @@ void testBench(const G1& P, const G2& Q)
verifyOrderG2(true);
CYBOZU_BENCH_C("hashAndMapToG1", C, hashAndMapToG1, PP, "abc", 3);
CYBOZU_BENCH_C("hashAndMapToG2", C, hashAndMapToG2, QQ, "abc", 3);
#endif
CYBOZU_BENCH_C("Fp::add ", C3, Fp::add, x, x, y);
CYBOZU_BENCH_C("Fp::mul ", C3, Fp::mul, x, x, y);
CYBOZU_BENCH_C("Fp::sqr ", C3, Fp::sqr, x, x);
CYBOZU_BENCH_C("Fp::inv ", C3, Fp::inv, x, x);
#endif
Fp2 xx, yy;
xx.a = x;
xx.b = 3;
@ -75,6 +75,7 @@ void testBench(const G1& P, const G2& Q)
CYBOZU_BENCH_C("pairing ", C, pairing, e1, P, Q);
CYBOZU_BENCH_C("millerLoop ", C, millerLoop, e1, P, Q);
CYBOZU_BENCH_C("finalExp ", C, finalExp, e1, e1);
//exit(1);
std::vector<Fp6> Qcoeff;
precomputeG2(Qcoeff, Q);
CYBOZU_BENCH_C("precomputedML ", C, precomputedMillerLoop, e2, P, Qcoeff);

Loading…
Cancel
Save