diff --git a/include/mcl/bn.hpp b/include/mcl/bn.hpp index 3668da2..8710f55 100644 --- a/include/mcl/bn.hpp +++ b/include/mcl/bn.hpp @@ -875,9 +875,9 @@ struct Param { assert((p % 6) == 1); r = local::evalPoly(z, rCoff); } - Fr::init(pb, r, mode); + Fr::init(pb, r, mode, "Fr"); if (!*pb) return; - Fp::init(pb, cp.xi_a, p, mode); + Fp::init(pb, cp.xi_a, p, mode, "Fp"); if (!*pb) return; Fp2::init(); const Fp2 xi(cp.xi_a, 1); diff --git a/include/mcl/fp.hpp b/include/mcl/fp.hpp index 6c5b0b0..c8b5a6d 100644 --- a/include/mcl/fp.hpp +++ b/include/mcl/fp.hpp @@ -130,10 +130,10 @@ public: xi_a is used for Fp2::mul_xi(), where xi = xi_a + i and i^2 = -1 if xi_a = 0 then asm functions for Fp2 are not generated. */ - static inline void init(bool *pb, int xi_a, const mpz_class& p, fp::Mode mode = fp::FP_AUTO) + static inline void init(bool *pb, int xi_a, const mpz_class& p, fp::Mode mode = fp::FP_AUTO, const char *suf = 0) { assert(maxBitSize <= MCL_MAX_BIT_SIZE); - *pb = op_.init(p, maxBitSize, xi_a, mode); + *pb = op_.init(p, maxBitSize, xi_a, mode, suf); if (!*pb) return; { // set oneRep FpT& one = *reinterpret_cast(op_.oneRep); @@ -163,16 +163,16 @@ public: #endif *pb = true; } - static inline void init(bool *pb, const mpz_class& p, fp::Mode mode = fp::FP_AUTO) + static inline void init(bool *pb, const mpz_class& p, fp::Mode mode = fp::FP_AUTO, const char *suf = 0) { - init(pb, 0, p, mode); + init(pb, 0, p, mode, suf); } - static inline void init(bool *pb, const char *mstr, fp::Mode mode = fp::FP_AUTO) + static inline void init(bool *pb, const char *mstr, fp::Mode mode = fp::FP_AUTO, const char *suf = 0) { mpz_class p; gmp::setStr(pb, p, mstr); if (!*pb) return; - init(pb, p, mode); + init(pb, p, mode, suf); } static inline size_t getModulo(char *buf, size_t bufSize) { diff --git a/include/mcl/op.hpp b/include/mcl/op.hpp index 99c0e4d..45320e5 100644 --- a/include/mcl/op.hpp +++ b/include/mcl/op.hpp @@ -364,7 +364,7 @@ struct Op { */ fp_mul(y, x, R2, p); } - bool init(const mpz_class& p, size_t maxBitSize, int xi_a, Mode mode, size_t mclMaxBitSize = MCL_MAX_BIT_SIZE); + bool init(const mpz_class& p, size_t maxBitSize, int xi_a, Mode mode, const char *suf = 0, size_t mclMaxBitSize = MCL_MAX_BIT_SIZE); #ifdef MCL_USE_XBYAK static FpGenerator* createFpGenerator(); static void destroyFpGenerator(FpGenerator *fg); diff --git a/src/fp.cpp b/src/fp.cpp index b3b07d1..998a53b 100644 --- a/src/fp.cpp +++ b/src/fp.cpp @@ -346,7 +346,7 @@ static void initInvTbl(Op& op) } #endif -static bool initForMont(Op& op, const Unit *p, Mode mode) +static bool initForMont(Op& op, const Unit *p, Mode mode, const char *suf) { const size_t N = op.N; bool b; @@ -366,17 +366,19 @@ static bool initForMont(Op& op, const Unit *p, Mode mode) if (mode != FP_XBYAK) return true; #ifdef MCL_USE_XBYAK if (op.fg == 0) op.fg = Op::createFpGenerator(); - bool useXbyak = op.fg->init(op); + bool useXbyak = op.fg->init(op, suf); if (useXbyak && op.isMont && N <= 4) { op.fp_invOp = &invOpForMontC; initInvTbl(op); } +#else + (void)suf; #endif return true; } -bool Op::init(const mpz_class& _p, size_t maxBitSize, int _xi_a, Mode mode, size_t mclMaxBitSize) +bool Op::init(const mpz_class& _p, size_t maxBitSize, int _xi_a, Mode mode, const char *suf, size_t mclMaxBitSize) { if (mclMaxBitSize != MCL_MAX_BIT_SIZE) return false; #ifdef MCL_USE_VINT @@ -534,7 +536,7 @@ bool Op::init(const mpz_class& _p, size_t maxBitSize, int _xi_a, Mode mode, size if (!b) return false; } modp.init(mp); - return fp::initForMont(*this, p, mode); + return fp::initForMont(*this, p, mode, suf); } void copyUnitToByteAsLE(uint8_t *dst, const Unit *src, size_t byteSize) diff --git a/src/fp_generator.hpp b/src/fp_generator.hpp index 97ce9ae..08d5844 100644 --- a/src/fp_generator.hpp +++ b/src/fp_generator.hpp @@ -23,8 +23,59 @@ #pragma warning(disable : 4458) #endif +//#define MCL_FREEZE_JIT + namespace mcl { +#ifdef MCL_FREEZE_JIT +struct Profiler { + FILE *fp_; + const uint8_t *prev_; + std::string suf_; + Profiler() + : fp_(0) + , prev_(0) + { + } + ~Profiler() + { + if (fp_) fclose(fp_); + } + void open(const std::string& fileName) + { + fp_ = fopen(fileName.c_str(), "wb"); + } + void setStartAddr(const uint8_t *addr) + { + prev_ = addr; + } + void setNameSuffix(const char *suf) + { + suf_ = suf; + } + void set(const char *name, const uint8_t *end) + { + fprintf(fp_, "global %s%s\n", suf_.c_str(), name); + fprintf(fp_, "align 16\n"); + fprintf(fp_, "%s%s:\n", suf_.c_str(), name); + const uint8_t *p = prev_; + size_t remain = end - prev_; + while (remain > 0) { + size_t n = remain >= 16 ? 16 : remain; + fprintf(fp_, "db "); + for (size_t i = 0; i < n; i++) { + fprintf(fp_, "0x%02x,", *p++); + } + fprintf(fp_, "\n"); + remain -= n; + } + prev_ = end; + } +}; +#else +typedef Xbyak::util::Profiler Profiler; +#endif + namespace fp_gen_local { class MemReg { @@ -203,7 +254,7 @@ struct FpGenerator : Xbyak::CodeGenerator { int pn_; int FpByte_; bool isFullBit_; - Xbyak::util::Profiler prof_; + Profiler prof_; /* @param op [in] ; use op.p, op.N, op.isFullBit @@ -242,12 +293,12 @@ struct FpGenerator : Xbyak::CodeGenerator { useMulx_ = cpu_.has(Xbyak::util::Cpu::tBMI2); useAdx_ = cpu_.has(Xbyak::util::Cpu::tADX); } - bool init(Op& op) + bool init(Op& op, const char *suf) { if (!cpu_.has(Xbyak::util::Cpu::tAVX)) return false; reset(); // reset jit code for reuse setProtectModeRW(); // read/write memory - init_inner(op); + init_inner(op, suf); // ToDo : recover op if false if (Xbyak::GetError()) return false; // printf("code size=%d\n", (int)getSize()); @@ -255,7 +306,7 @@ struct FpGenerator : Xbyak::CodeGenerator { return true; } private: - void init_inner(Op& op) + void init_inner(Op& op, const char *suf) { op_ = &op; L(pL_); @@ -269,7 +320,6 @@ private: isFullBit_ = op.isFullBit; // printf("p=%p, pn_=%d, isFullBit_=%d\n", p_, pn_, isFullBit_); #ifdef MCL_USE_PROF - static char suf[] = "_0"; int profMode = 0; #ifdef XBYAK_USE_VTUNE profMode = 2; @@ -281,89 +331,92 @@ private: if (profMode) { prof_.init(profMode); prof_.setStartAddr(getCurr()); + if (suf == 0) suf = "fp"; prof_.setNameSuffix(suf); suf[1]++; } +#else + (void)suf; #endif op.fp_addPre = gen_addSubPre(true, pn_); - prof_.set("Fp_addPre", getCurr()); + prof_.set("_addPre", getCurr()); op.fp_subPre = gen_addSubPre(false, pn_); - prof_.set("Fp_subPre", getCurr()); + prof_.set("_subPre", getCurr()); op.fp_addA_ = gen_fp_add(); - prof_.set("Fp_add", getCurr()); + prof_.set("_add", getCurr()); op.fp_subA_ = gen_fp_sub(); - prof_.set("Fp_sub", getCurr()); + prof_.set("_sub", getCurr()); op.fp_shr1 = gen_shr1(); - prof_.set("Fp_shr1", getCurr()); + prof_.set("_shr1", getCurr()); op.fp_negA_ = gen_fp_neg(); - prof_.set("Fp_neg", getCurr()); + prof_.set("_neg", getCurr()); op.fpDbl_addA_ = gen_fpDbl_add(); - prof_.set("FpDbl_add", getCurr()); + prof_.set("Dbl_add", getCurr()); op.fpDbl_subA_ = gen_fpDbl_sub(); - prof_.set("FpDbl_sub", getCurr()); + prof_.set("Dbl_sub", getCurr()); op.fpDbl_addPre = gen_addSubPre(true, pn_ * 2); - prof_.set("FpDbl_addPre", getCurr()); + prof_.set("Dbl_addPre", getCurr()); op.fpDbl_subPre = gen_addSubPre(false, pn_ * 2); - prof_.set("FpDbl_subPre", getCurr()); + prof_.set("Dbl_subPre", getCurr()); op.fpDbl_mulPreA_ = gen_fpDbl_mulPre(); - prof_.set("FpDbl_mulPre", getCurr()); + prof_.set("Dbl_mulPre", getCurr()); op.fpDbl_sqrPreA_ = gen_fpDbl_sqrPre(); - prof_.set("FpDbl_sqrPre", getCurr()); + prof_.set("Dbl_sqrPre", getCurr()); op.fpDbl_modA_ = gen_fpDbl_mod(op); - prof_.set("FpDbl_mod", getCurr()); + prof_.set("Dbl_mod", getCurr()); op.fp_mulA_ = gen_mul(); - prof_.set("Fp_mul", getCurr()); + prof_.set("_mul", getCurr()); if (op.fp_mulA_) { op.fp_mul = fp::func_ptr_cast(op.fp_mulA_); // used in toMont/fromMont } op.fp_sqrA_ = gen_sqr(); - prof_.set("Fp_sqr", getCurr()); + prof_.set("_sqr", getCurr()); if (op.primeMode != PM_NIST_P192 && op.N <= 4) { // support general op.N but not fast for op.N > 4 align(16); op.fp_preInv = getCurr(); gen_preInv(); - prof_.set("preInv", getCurr()); + prof_.set("_preInv", getCurr()); } if (op.xi_a == 0) return; // Fp2 is not used op.fp2_addA_ = gen_fp2_add(); - prof_.set("Fp2_add", getCurr()); + prof_.set("2_add", getCurr()); op.fp2_subA_ = gen_fp2_sub(); - prof_.set("Fp2_sub", getCurr()); + prof_.set("2_sub", getCurr()); op.fp2_negA_ = gen_fp2_neg(); - prof_.set("Fp2_neg", getCurr()); + prof_.set("2_neg", getCurr()); op.fp2_mulNF = 0; op.fp2Dbl_mulPreA_ = gen_fp2Dbl_mulPre(); - prof_.set("Fp2Dbl_mulPre", getCurr()); + prof_.set("2Dbl_mulPre", getCurr()); op.fp2Dbl_sqrPreA_ = gen_fp2Dbl_sqrPre(); - prof_.set("Fp2Dbl_sqrPre", getCurr()); + prof_.set("2Dbl_sqrPre", getCurr()); op.fp2_mulA_ = gen_fp2_mul(); - prof_.set("Fp2_mul", getCurr()); + prof_.set("2_mul", getCurr()); op.fp2_sqrA_ = gen_fp2_sqr(); - prof_.set("Fp2_sqr", getCurr()); + prof_.set("2_sqr", getCurr()); op.fp2_mul_xiA_ = gen_fp2_mul_xi(); - prof_.set("Fp2_mul_xi", getCurr()); + prof_.set("2_mul_xi", getCurr()); } u3u gen_addSubPre(bool isAdd, int n) {