refactor profiler

update-fork
MITSUNARI Shigeo 4 years ago
parent 9bfc47ca8d
commit 10621c6299
  1. 4
      include/mcl/bn.hpp
  2. 12
      include/mcl/fp.hpp
  3. 2
      include/mcl/op.hpp
  4. 10
      src/fp.cpp
  5. 111
      src/fp_generator.hpp

@ -875,9 +875,9 @@ struct Param {
assert((p % 6) == 1); assert((p % 6) == 1);
r = local::evalPoly(z, rCoff); r = local::evalPoly(z, rCoff);
} }
Fr::init(pb, r, mode); Fr::init(pb, r, mode, "Fr");
if (!*pb) return; if (!*pb) return;
Fp::init(pb, cp.xi_a, p, mode); Fp::init(pb, cp.xi_a, p, mode, "Fp");
if (!*pb) return; if (!*pb) return;
Fp2::init(); Fp2::init();
const Fp2 xi(cp.xi_a, 1); const Fp2 xi(cp.xi_a, 1);

@ -130,10 +130,10 @@ public:
xi_a is used for Fp2::mul_xi(), where xi = xi_a + i and i^2 = -1 xi_a is used for Fp2::mul_xi(), where xi = xi_a + i and i^2 = -1
if xi_a = 0 then asm functions for Fp2 are not generated. if xi_a = 0 then asm functions for Fp2 are not generated.
*/ */
static inline void init(bool *pb, int xi_a, const mpz_class& p, fp::Mode mode = fp::FP_AUTO) static inline void init(bool *pb, int xi_a, const mpz_class& p, fp::Mode mode = fp::FP_AUTO, const char *suf = 0)
{ {
assert(maxBitSize <= MCL_MAX_BIT_SIZE); assert(maxBitSize <= MCL_MAX_BIT_SIZE);
*pb = op_.init(p, maxBitSize, xi_a, mode); *pb = op_.init(p, maxBitSize, xi_a, mode, suf);
if (!*pb) return; if (!*pb) return;
{ // set oneRep { // set oneRep
FpT& one = *reinterpret_cast<FpT*>(op_.oneRep); FpT& one = *reinterpret_cast<FpT*>(op_.oneRep);
@ -163,16 +163,16 @@ public:
#endif #endif
*pb = true; *pb = true;
} }
static inline void init(bool *pb, const mpz_class& p, fp::Mode mode = fp::FP_AUTO) static inline void init(bool *pb, const mpz_class& p, fp::Mode mode = fp::FP_AUTO, const char *suf = 0)
{ {
init(pb, 0, p, mode); init(pb, 0, p, mode, suf);
} }
static inline void init(bool *pb, const char *mstr, fp::Mode mode = fp::FP_AUTO) static inline void init(bool *pb, const char *mstr, fp::Mode mode = fp::FP_AUTO, const char *suf = 0)
{ {
mpz_class p; mpz_class p;
gmp::setStr(pb, p, mstr); gmp::setStr(pb, p, mstr);
if (!*pb) return; if (!*pb) return;
init(pb, p, mode); init(pb, p, mode, suf);
} }
static inline size_t getModulo(char *buf, size_t bufSize) static inline size_t getModulo(char *buf, size_t bufSize)
{ {

@ -364,7 +364,7 @@ struct Op {
*/ */
fp_mul(y, x, R2, p); fp_mul(y, x, R2, p);
} }
bool init(const mpz_class& p, size_t maxBitSize, int xi_a, Mode mode, size_t mclMaxBitSize = MCL_MAX_BIT_SIZE); bool init(const mpz_class& p, size_t maxBitSize, int xi_a, Mode mode, const char *suf = 0, size_t mclMaxBitSize = MCL_MAX_BIT_SIZE);
#ifdef MCL_USE_XBYAK #ifdef MCL_USE_XBYAK
static FpGenerator* createFpGenerator(); static FpGenerator* createFpGenerator();
static void destroyFpGenerator(FpGenerator *fg); static void destroyFpGenerator(FpGenerator *fg);

@ -346,7 +346,7 @@ static void initInvTbl(Op& op)
} }
#endif #endif
static bool initForMont(Op& op, const Unit *p, Mode mode) static bool initForMont(Op& op, const Unit *p, Mode mode, const char *suf)
{ {
const size_t N = op.N; const size_t N = op.N;
bool b; bool b;
@ -366,17 +366,19 @@ static bool initForMont(Op& op, const Unit *p, Mode mode)
if (mode != FP_XBYAK) return true; if (mode != FP_XBYAK) return true;
#ifdef MCL_USE_XBYAK #ifdef MCL_USE_XBYAK
if (op.fg == 0) op.fg = Op::createFpGenerator(); if (op.fg == 0) op.fg = Op::createFpGenerator();
bool useXbyak = op.fg->init(op); bool useXbyak = op.fg->init(op, suf);
if (useXbyak && op.isMont && N <= 4) { if (useXbyak && op.isMont && N <= 4) {
op.fp_invOp = &invOpForMontC; op.fp_invOp = &invOpForMontC;
initInvTbl(op); initInvTbl(op);
} }
#else
(void)suf;
#endif #endif
return true; return true;
} }
bool Op::init(const mpz_class& _p, size_t maxBitSize, int _xi_a, Mode mode, size_t mclMaxBitSize) bool Op::init(const mpz_class& _p, size_t maxBitSize, int _xi_a, Mode mode, const char *suf, size_t mclMaxBitSize)
{ {
if (mclMaxBitSize != MCL_MAX_BIT_SIZE) return false; if (mclMaxBitSize != MCL_MAX_BIT_SIZE) return false;
#ifdef MCL_USE_VINT #ifdef MCL_USE_VINT
@ -534,7 +536,7 @@ bool Op::init(const mpz_class& _p, size_t maxBitSize, int _xi_a, Mode mode, size
if (!b) return false; if (!b) return false;
} }
modp.init(mp); modp.init(mp);
return fp::initForMont(*this, p, mode); return fp::initForMont(*this, p, mode, suf);
} }
void copyUnitToByteAsLE(uint8_t *dst, const Unit *src, size_t byteSize) void copyUnitToByteAsLE(uint8_t *dst, const Unit *src, size_t byteSize)

@ -23,8 +23,59 @@
#pragma warning(disable : 4458) #pragma warning(disable : 4458)
#endif #endif
//#define MCL_FREEZE_JIT
namespace mcl { namespace mcl {
#ifdef MCL_FREEZE_JIT
/*
	Dumps consecutive regions of generated machine code to a text file as
	assembler source: for each region a "global" line, an "align 16" line,
	a label, and the raw bytes as "db" lines (at most 16 bytes per line).

	Usage: open() -> setStartAddr() -> setNameSuffix() -> set() per region.
	Every method is a harmless no-op on the file when no file is open,
	so a failed open() never crashes the code generator.
*/
struct Profiler {
	FILE *fp_; // output file; 0 until open() succeeds
	const uint8_t *prev_; // first byte of the next region to dump
	std::string suf_; // text written immediately before each symbol name
	Profiler()
		: fp_(0)
		, prev_(0)
	{
	}
	~Profiler()
	{
		close();
	}
	/*
		Open fileName for writing (truncating it).
		Any file opened earlier by this object is closed first so the handle
		is not leaked. On failure fp_ stays 0 and later set() calls write nothing.
	*/
	void open(const std::string& fileName)
	{
		close();
		fp_ = fopen(fileName.c_str(), "wb");
	}
	/*
		Close the output file if one is open; safe to call repeatedly.
	*/
	void close()
	{
		if (fp_) {
			fclose(fp_);
			fp_ = 0;
		}
	}
	/*
		Set the address where the first region to be dumped begins.
	*/
	void setStartAddr(const uint8_t *addr)
	{
		prev_ = addr;
	}
	/*
		Set the text emitted in front of every symbol name.
		A null pointer is treated as an empty string
		(std::string cannot be built from a null pointer).
	*/
	void setNameSuffix(const char *suf)
	{
		suf_ = suf ? suf : "";
	}
	/*
		Dump the bytes in [prev_, end) under the symbol suf_ + name,
		then make end the start of the next region.
		@param name [in] symbol name (appended to suf_); null is treated as ""
		@param end [in] one past the last byte of the region
	*/
	void set(const char *name, const uint8_t *end)
	{
		if (fp_ == 0) {
			// nothing to write to, but keep the region bookkeeping consistent
			prev_ = end;
			return;
		}
		if (name == 0) name = "";
		fprintf(fp_, "global %s%s\n", suf_.c_str(), name);
		fprintf(fp_, "align 16\n");
		fprintf(fp_, "%s%s:\n", suf_.c_str(), name);
		const uint8_t *p = prev_;
		// guard against an unset start address or a region that runs backwards;
		// the unsigned subtraction would otherwise yield a huge byte count
		size_t remain = (p != 0 && end != 0 && p < end) ? size_t(end - p) : 0;
		while (remain > 0) {
			const size_t n = remain >= 16 ? 16 : remain;
			fprintf(fp_, "db ");
			for (size_t i = 0; i < n; i++) {
				fprintf(fp_, "0x%02x,", *p++);
			}
			fprintf(fp_, "\n");
			remain -= n;
		}
		prev_ = end;
	}
private:
	// the object owns fp_; copying would close the same FILE twice
	Profiler(const Profiler&);
	Profiler& operator=(const Profiler&);
};
#else
typedef Xbyak::util::Profiler Profiler;
#endif
namespace fp_gen_local { namespace fp_gen_local {
class MemReg { class MemReg {
@ -203,7 +254,7 @@ struct FpGenerator : Xbyak::CodeGenerator {
int pn_; int pn_;
int FpByte_; int FpByte_;
bool isFullBit_; bool isFullBit_;
Xbyak::util::Profiler prof_; Profiler prof_;
/* /*
@param op [in] ; use op.p, op.N, op.isFullBit @param op [in] ; use op.p, op.N, op.isFullBit
@ -242,12 +293,12 @@ struct FpGenerator : Xbyak::CodeGenerator {
useMulx_ = cpu_.has(Xbyak::util::Cpu::tBMI2); useMulx_ = cpu_.has(Xbyak::util::Cpu::tBMI2);
useAdx_ = cpu_.has(Xbyak::util::Cpu::tADX); useAdx_ = cpu_.has(Xbyak::util::Cpu::tADX);
} }
bool init(Op& op) bool init(Op& op, const char *suf)
{ {
if (!cpu_.has(Xbyak::util::Cpu::tAVX)) return false; if (!cpu_.has(Xbyak::util::Cpu::tAVX)) return false;
reset(); // reset jit code for reuse reset(); // reset jit code for reuse
setProtectModeRW(); // read/write memory setProtectModeRW(); // read/write memory
init_inner(op); init_inner(op, suf);
// ToDo : recover op if false // ToDo : recover op if false
if (Xbyak::GetError()) return false; if (Xbyak::GetError()) return false;
// printf("code size=%d\n", (int)getSize()); // printf("code size=%d\n", (int)getSize());
@ -255,7 +306,7 @@ struct FpGenerator : Xbyak::CodeGenerator {
return true; return true;
} }
private: private:
void init_inner(Op& op) void init_inner(Op& op, const char *suf)
{ {
op_ = &op; op_ = &op;
L(pL_); L(pL_);
@ -269,7 +320,6 @@ private:
isFullBit_ = op.isFullBit; isFullBit_ = op.isFullBit;
// printf("p=%p, pn_=%d, isFullBit_=%d\n", p_, pn_, isFullBit_); // printf("p=%p, pn_=%d, isFullBit_=%d\n", p_, pn_, isFullBit_);
#ifdef MCL_USE_PROF #ifdef MCL_USE_PROF
static char suf[] = "_0";
int profMode = 0; int profMode = 0;
#ifdef XBYAK_USE_VTUNE #ifdef XBYAK_USE_VTUNE
profMode = 2; profMode = 2;
@ -281,89 +331,92 @@ private:
if (profMode) { if (profMode) {
prof_.init(profMode); prof_.init(profMode);
prof_.setStartAddr(getCurr()); prof_.setStartAddr(getCurr());
if (suf == 0) suf = "fp";
prof_.setNameSuffix(suf); prof_.setNameSuffix(suf);
suf[1]++; suf[1]++;
} }
#else
(void)suf;
#endif #endif
op.fp_addPre = gen_addSubPre(true, pn_); op.fp_addPre = gen_addSubPre(true, pn_);
prof_.set("Fp_addPre", getCurr()); prof_.set("_addPre", getCurr());
op.fp_subPre = gen_addSubPre(false, pn_); op.fp_subPre = gen_addSubPre(false, pn_);
prof_.set("Fp_subPre", getCurr()); prof_.set("_subPre", getCurr());
op.fp_addA_ = gen_fp_add(); op.fp_addA_ = gen_fp_add();
prof_.set("Fp_add", getCurr()); prof_.set("_add", getCurr());
op.fp_subA_ = gen_fp_sub(); op.fp_subA_ = gen_fp_sub();
prof_.set("Fp_sub", getCurr()); prof_.set("_sub", getCurr());
op.fp_shr1 = gen_shr1(); op.fp_shr1 = gen_shr1();
prof_.set("Fp_shr1", getCurr()); prof_.set("_shr1", getCurr());
op.fp_negA_ = gen_fp_neg(); op.fp_negA_ = gen_fp_neg();
prof_.set("Fp_neg", getCurr()); prof_.set("_neg", getCurr());
op.fpDbl_addA_ = gen_fpDbl_add(); op.fpDbl_addA_ = gen_fpDbl_add();
prof_.set("FpDbl_add", getCurr()); prof_.set("Dbl_add", getCurr());
op.fpDbl_subA_ = gen_fpDbl_sub(); op.fpDbl_subA_ = gen_fpDbl_sub();
prof_.set("FpDbl_sub", getCurr()); prof_.set("Dbl_sub", getCurr());
op.fpDbl_addPre = gen_addSubPre(true, pn_ * 2); op.fpDbl_addPre = gen_addSubPre(true, pn_ * 2);
prof_.set("FpDbl_addPre", getCurr()); prof_.set("Dbl_addPre", getCurr());
op.fpDbl_subPre = gen_addSubPre(false, pn_ * 2); op.fpDbl_subPre = gen_addSubPre(false, pn_ * 2);
prof_.set("FpDbl_subPre", getCurr()); prof_.set("Dbl_subPre", getCurr());
op.fpDbl_mulPreA_ = gen_fpDbl_mulPre(); op.fpDbl_mulPreA_ = gen_fpDbl_mulPre();
prof_.set("FpDbl_mulPre", getCurr()); prof_.set("Dbl_mulPre", getCurr());
op.fpDbl_sqrPreA_ = gen_fpDbl_sqrPre(); op.fpDbl_sqrPreA_ = gen_fpDbl_sqrPre();
prof_.set("FpDbl_sqrPre", getCurr()); prof_.set("Dbl_sqrPre", getCurr());
op.fpDbl_modA_ = gen_fpDbl_mod(op); op.fpDbl_modA_ = gen_fpDbl_mod(op);
prof_.set("FpDbl_mod", getCurr()); prof_.set("Dbl_mod", getCurr());
op.fp_mulA_ = gen_mul(); op.fp_mulA_ = gen_mul();
prof_.set("Fp_mul", getCurr()); prof_.set("_mul", getCurr());
if (op.fp_mulA_) { if (op.fp_mulA_) {
op.fp_mul = fp::func_ptr_cast<void4u>(op.fp_mulA_); // used in toMont/fromMont op.fp_mul = fp::func_ptr_cast<void4u>(op.fp_mulA_); // used in toMont/fromMont
} }
op.fp_sqrA_ = gen_sqr(); op.fp_sqrA_ = gen_sqr();
prof_.set("Fp_sqr", getCurr()); prof_.set("_sqr", getCurr());
if (op.primeMode != PM_NIST_P192 && op.N <= 4) { // support general op.N but not fast for op.N > 4 if (op.primeMode != PM_NIST_P192 && op.N <= 4) { // support general op.N but not fast for op.N > 4
align(16); align(16);
op.fp_preInv = getCurr<int2u>(); op.fp_preInv = getCurr<int2u>();
gen_preInv(); gen_preInv();
prof_.set("preInv", getCurr()); prof_.set("_preInv", getCurr());
} }
if (op.xi_a == 0) return; // Fp2 is not used if (op.xi_a == 0) return; // Fp2 is not used
op.fp2_addA_ = gen_fp2_add(); op.fp2_addA_ = gen_fp2_add();
prof_.set("Fp2_add", getCurr()); prof_.set("2_add", getCurr());
op.fp2_subA_ = gen_fp2_sub(); op.fp2_subA_ = gen_fp2_sub();
prof_.set("Fp2_sub", getCurr()); prof_.set("2_sub", getCurr());
op.fp2_negA_ = gen_fp2_neg(); op.fp2_negA_ = gen_fp2_neg();
prof_.set("Fp2_neg", getCurr()); prof_.set("2_neg", getCurr());
op.fp2_mulNF = 0; op.fp2_mulNF = 0;
op.fp2Dbl_mulPreA_ = gen_fp2Dbl_mulPre(); op.fp2Dbl_mulPreA_ = gen_fp2Dbl_mulPre();
prof_.set("Fp2Dbl_mulPre", getCurr()); prof_.set("2Dbl_mulPre", getCurr());
op.fp2Dbl_sqrPreA_ = gen_fp2Dbl_sqrPre(); op.fp2Dbl_sqrPreA_ = gen_fp2Dbl_sqrPre();
prof_.set("Fp2Dbl_sqrPre", getCurr()); prof_.set("2Dbl_sqrPre", getCurr());
op.fp2_mulA_ = gen_fp2_mul(); op.fp2_mulA_ = gen_fp2_mul();
prof_.set("Fp2_mul", getCurr()); prof_.set("2_mul", getCurr());
op.fp2_sqrA_ = gen_fp2_sqr(); op.fp2_sqrA_ = gen_fp2_sqr();
prof_.set("Fp2_sqr", getCurr()); prof_.set("2_sqr", getCurr());
op.fp2_mul_xiA_ = gen_fp2_mul_xi(); op.fp2_mul_xiA_ = gen_fp2_mul_xi();
prof_.set("Fp2_mul_xi", getCurr()); prof_.set("2_mul_xi", getCurr());
} }
u3u gen_addSubPre(bool isAdd, int n) u3u gen_addSubPre(bool isAdd, int n)
{ {

Loading…
Cancel
Save