support perf if MCL_PERF=1

dev
MITSUNARI Shigeo 6 years ago
parent 543e5fc19d
commit 3a6b1c2124
  1. 111
      src/fp_generator.hpp

@ -127,6 +127,71 @@ if (rm.isReg()) { \
namespace fp { namespace fp {
/*
	Writes a symbol map for JIT-generated code to /tmp/perf-<pid>.map.
	Recording is active only on Linux and only when the environment variable
	MCL_PERF is exactly "1"; otherwise every method is a no-op.
	Each record is one text line: "<start address in hex> <size in hex> <name><suffix>".

	NOTE(review): the object owns fp_ and closes it in the destructor, but the
	implicitly generated copy operations copy the raw FILE*; do not copy an
	active Profiler.
*/
struct Profiler {
	FILE *fp_; // map file, 0 while recording is disabled
	const char *suf_; // suffix appended to every symbol name; not copied, must outlive this object
	const uint8_t *prev_; // end of the previously recorded region (start of the next one)
	Profiler()
		: fp_(0)
		, suf_(0)
		, prev_(0)
	{
	}
	/*
		Start recording if MCL_PERF=1.
		@param suf [in] suffix appended to every symbol name
		@param prev [in] address where the first region to be recorded begins
		@note throws cybozu::Exception if the map file can't be opened
	*/
	void init(const char *suf, const uint8_t *prev)
	{
#ifdef __linux__
		close();
		const char *s = getenv("MCL_PERF");
		if (s == 0 || strcmp(s, "1") != 0) return;
		fprintf(stderr, "use perf suf=%s\n", suf);
		suf_ = suf;
		const int pid = getpid();
		char name[128];
		snprintf(name, sizeof(name), "/tmp/perf-%d.map", pid);
		// Open in append mode: init() may be called more than once per process
		// (once per generator, each with its own suffix), and truncating here
		// would discard or overwrite the symbols recorded by earlier calls.
		fp_ = fopen(name, "ab");
		if (fp_ == 0) throw cybozu::Exception("PerMap") << name;
		prev_ = prev;
#else
		(void)suf;
		(void)prev;
#endif
	}
	~Profiler()
	{
		close();
	}
	/*
		Stop recording and close the map file. Safe to call when nothing is open.
	*/
	void close()
	{
#ifdef __linux__
		if (fp_ == 0) return;
		fclose(fp_);
		fp_ = 0;
		prev_ = 0;
#endif
	}
	/*
		Record the region [p, p + n) under the symbol name + suffix.
		@param p [in] start address of the region
		@param n [in] size of the region in bytes
		@param name [in] symbol name
	*/
	void set(const uint8_t *p, size_t n, const char *name) const
	{
#ifdef __linux__
		if (fp_ == 0) return;
		// %llx takes an unsigned long long, so convert the address through uintptr_t
		const unsigned long long addr = static_cast<unsigned long long>(reinterpret_cast<uintptr_t>(p));
		fprintf(fp_, "%llx %zx %s%s\n", addr, n, name, suf_);
#else
		(void)p;
		(void)n;
		(void)name;
#endif
	}
	/*
		Record the region from the end of the previous region up to cur,
		then make cur the start of the next region.
		@param name [in] symbol name
		@param cur [in] address just past the end of the region
	*/
	void set(const char *name, const uint8_t *cur)
	{
#ifdef __linux__
		// Nothing to record while disabled; this also avoids subtracting
		// the null prev_ from cur.
		if (fp_ == 0) return;
		set(prev_, static_cast<size_t>(cur - prev_), name);
		prev_ = cur;
#else
		(void)name;
		(void)cur;
#endif
	}
};
struct FpGenerator : Xbyak::CodeGenerator { struct FpGenerator : Xbyak::CodeGenerator {
typedef Xbyak::RegExp RegExp; typedef Xbyak::RegExp RegExp;
typedef Xbyak::Reg64 Reg64; typedef Xbyak::Reg64 Reg64;
@ -203,6 +268,7 @@ struct FpGenerator : Xbyak::CodeGenerator {
int pn_; int pn_;
int FpByte_; int FpByte_;
bool isFullBit_; bool isFullBit_;
Profiler prof_;
/* /*
@param op [in] ; use op.p, op.N, op.isFullBit @param op [in] ; use op.p, op.N, op.isFullBit
@ -264,45 +330,88 @@ private:
FpByte_ = int(op.maxN * sizeof(uint64_t)); FpByte_ = int(op.maxN * sizeof(uint64_t));
isFullBit_ = op.isFullBit; isFullBit_ = op.isFullBit;
// printf("p=%p, pn_=%d, isFullBit_=%d\n", p_, pn_, isFullBit_); // printf("p=%p, pn_=%d, isFullBit_=%d\n", p_, pn_, isFullBit_);
static char suf[] = "_0";
prof_.init(suf, getCurr());
suf[1]++;
op.fp_addPre = gen_addSubPre(true, pn_); op.fp_addPre = gen_addSubPre(true, pn_);
prof_.set("Fp_addPre", getCurr());
op.fp_subPre = gen_addSubPre(false, pn_); op.fp_subPre = gen_addSubPre(false, pn_);
op.fp_subA_ = gen_fp_sub(); prof_.set("Fp_subPre", getCurr());
op.fp_addA_ = gen_fp_add(); op.fp_addA_ = gen_fp_add();
prof_.set("Fp_add", getCurr());
op.fp_subA_ = gen_fp_sub();
prof_.set("Fp_sub", getCurr());
op.fp_shr1 = gen_shr1(); op.fp_shr1 = gen_shr1();
prof_.set("Fp_shr1", getCurr());
op.fp_negA_ = gen_fp_neg(); op.fp_negA_ = gen_fp_neg();
prof_.set("Fp_neg", getCurr());
op.fpDbl_addA_ = gen_fpDbl_add(); op.fpDbl_addA_ = gen_fpDbl_add();
prof_.set("FpDbl_add", getCurr());
op.fpDbl_subA_ = gen_fpDbl_sub(); op.fpDbl_subA_ = gen_fpDbl_sub();
prof_.set("FpDbl_sub", getCurr());
op.fpDbl_addPre = gen_addSubPre(true, pn_ * 2); op.fpDbl_addPre = gen_addSubPre(true, pn_ * 2);
prof_.set("FpDbl_addPre", getCurr());
op.fpDbl_subPre = gen_addSubPre(false, pn_ * 2); op.fpDbl_subPre = gen_addSubPre(false, pn_ * 2);
prof_.set("FpDbl_subPre", getCurr());
op.fpDbl_mulPreA_ = gen_fpDbl_mulPre(); op.fpDbl_mulPreA_ = gen_fpDbl_mulPre();
prof_.set("FpDbl_mulPre", getCurr());
op.fpDbl_sqrPreA_ = gen_fpDbl_sqrPre(); op.fpDbl_sqrPreA_ = gen_fpDbl_sqrPre();
prof_.set("FpDbl_sqrPre", getCurr());
op.fpDbl_modA_ = gen_fpDbl_mod(op); op.fpDbl_modA_ = gen_fpDbl_mod(op);
prof_.set("FpDbl_mod", getCurr());
op.fp_mulA_ = gen_mul(); op.fp_mulA_ = gen_mul();
prof_.set("Fp_mul", getCurr());
if (op.fp_mulA_) { if (op.fp_mulA_) {
op.fp_mul = reinterpret_cast<void4u>(op.fp_mulA_); // used in toMont/fromMont op.fp_mul = reinterpret_cast<void4u>(op.fp_mulA_); // used in toMont/fromMont
} }
op.fp_sqrA_ = gen_sqr(); op.fp_sqrA_ = gen_sqr();
prof_.set("Fp_sqr", getCurr());
if (op.primeMode != PM_NIST_P192 && op.N <= 4) { // support general op.N but not fast for op.N > 4 if (op.primeMode != PM_NIST_P192 && op.N <= 4) { // support general op.N but not fast for op.N > 4
align(16); align(16);
op.fp_preInv = getCurr<int2u>(); op.fp_preInv = getCurr<int2u>();
gen_preInv(); gen_preInv();
prof_.set("preInv", getCurr());
} }
if (op.xi_a == 0) return; // Fp2 is not used if (op.xi_a == 0) return; // Fp2 is not used
op.fp2_addA_ = gen_fp2_add(); op.fp2_addA_ = gen_fp2_add();
prof_.set("Fp2_add", getCurr());
op.fp2_subA_ = gen_fp2_sub(); op.fp2_subA_ = gen_fp2_sub();
prof_.set("Fp2_sub", getCurr());
op.fp2_negA_ = gen_fp2_neg(); op.fp2_negA_ = gen_fp2_neg();
prof_.set("Fp2_neg", getCurr());
op.fp2_mulNF = 0; op.fp2_mulNF = 0;
op.fp2Dbl_mulPreA_ = gen_fp2Dbl_mulPre(); op.fp2Dbl_mulPreA_ = gen_fp2Dbl_mulPre();
prof_.set("Fp2Dbl_mulPre", getCurr());
op.fp2Dbl_sqrPreA_ = gen_fp2Dbl_sqrPre(); op.fp2Dbl_sqrPreA_ = gen_fp2Dbl_sqrPre();
prof_.set("Fp2Dbl_sqrPre", getCurr());
op.fp2_mulA_ = gen_fp2_mul(); op.fp2_mulA_ = gen_fp2_mul();
prof_.set("Fp2_mul", getCurr());
op.fp2_sqrA_ = gen_fp2_sqr(); op.fp2_sqrA_ = gen_fp2_sqr();
prof_.set("Fp2_sqr", getCurr());
op.fp2_mul_xiA_ = gen_fp2_mul_xi(); op.fp2_mul_xiA_ = gen_fp2_mul_xi();
prof_.set("Fp2_mul_xi", getCurr());
} }
u3u gen_addSubPre(bool isAdd, int n) u3u gen_addSubPre(bool isAdd, int n)
{ {

Loading…
Cancel
Save