diff --git a/Makefile b/Makefile index 346be6a..21f5f15 100644 --- a/Makefile +++ b/Makefile @@ -11,6 +11,13 @@ TEST_SRC+=bls12_test.cpp TEST_SRC+=mapto_wb19_test.cpp TEST_SRC+=ecdsa_c_test.cpp TEST_SRC+=modp_test.cpp +ifeq ($(MCL_STATIC_CODE),1) + MCL_USE_XBYAK=0 + MCL_MAX_BIT_SIZE=384 + CFLAGS+=-DMCL_STATIC_CODE + LIB_OBJ=obj/static_code.o + TEST_SRC=bls12_test.cpp +endif ifeq ($(CPU),x86-64) MCL_USE_XBYAK?=1 TEST_SRC+=mont_fp_test.cpp sq_test.cpp @@ -86,7 +93,7 @@ ifneq ($(CPU),) ASM_SRC=$(ASM_SRC_PATH_NAME).s endif ASM_OBJ=$(OBJ_DIR)/$(CPU).o -LIB_OBJ=$(OBJ_DIR)/fp.o +LIB_OBJ+=$(OBJ_DIR)/fp.o BN256_OBJ=$(OBJ_DIR)/bn_c256.o BN384_OBJ=$(OBJ_DIR)/bn_c384.o BN384_256_OBJ=$(OBJ_DIR)/bn_c384_256.o @@ -106,7 +113,9 @@ ifeq ($(MCL_USE_LLVM),1) LIB_OBJ+=$(ASM_OBJ) # special case for intel with bmi2 ifeq ($(INTEL),1) - LIB_OBJ+=$(OBJ_DIR)/$(CPU).bmi2.o + ifneq ($(MCL_STATIC_CODE),1) + LIB_OBJ+=$(OBJ_DIR)/$(CPU).bmi2.o + endif endif endif LLVM_SRC=src/base$(BIT).ll @@ -237,6 +246,18 @@ endif $(GEN_EXE): src/gen.cpp src/llvm_gen.hpp $(CXX) -o $@ $< $(CFLAGS) +src/dump_code: src/dump_code.cpp src/fp.cpp src/fp_generator.hpp + $(CXX) -o $@ src/dump_code.cpp src/fp.cpp -g -I include -DMCL_DUMP_JIT -DMCL_MAX_BIT_SIZE=384 -DMCL_DONT_USE_OPENSSL -DMCL_USE_VINT -DMCL_SIZEOF_UNIT=8 -DMCL_VINT_FIXED_BUFFER + +src/static_code.asm: src/dump_code + $< > $@ + +obj/static_code.o: src/static_code.asm + nasm -felf64 -o $@ $< + +bin/static_code_test.exe: test/static_code_test.cpp src/fp.cpp obj/static_code.o + $(CXX) -o $@ -O3 $^ -g -DMCL_DONT_USE_XBYAK -DMCL_STATIC_CODE -DMCL_MAX_BIT_SIZE=384 -DMCL_DONT_USE_OPENSSL -DMCL_USE_VINT -DMCL_SIZEOF_UNIT=8 -DMCL_VINT_FIXED_BUFFER -I include -Wall -Wextra + asm: $(LLVM_SRC) $(LLVM_OPT) -O3 -o - $(LLVM_SRC) | $(LLVM_LLC) -O3 $(LLVM_FLAGS) -x86-asm-syntax=intel @@ -388,7 +409,7 @@ update_cybozulib: cp -a $(addprefix ../cybozulib/,$(wildcard include/cybozu/*.hpp)) include/cybozu/ clean: - $(RM) $(LIB_DIR)/*.a $(LIB_DIR)/*.$(LIB_SUF) $(OBJ_DIR)/*.o $(OBJ_DIR)/*.obj $(OBJ_DIR)/*.d $(EXE_DIR)/*.exe $(GEN_EXE) $(ASM_OBJ) $(LIB_OBJ) $(BN256_OBJ) $(BN384_OBJ) $(BN512_OBJ) $(FUNC_LIST) src/*.ll lib/*.a + $(RM) $(LIB_DIR)/*.a $(LIB_DIR)/*.$(LIB_SUF) $(OBJ_DIR)/*.o $(OBJ_DIR)/*.obj $(OBJ_DIR)/*.d $(EXE_DIR)/*.exe $(GEN_EXE) $(ASM_OBJ) $(LIB_OBJ) $(BN256_OBJ) $(BN384_OBJ) $(BN512_OBJ) $(FUNC_LIST) src/*.ll lib/*.a src/static_code.asm src/dump_code ALL_SRC=$(SRC_SRC) $(TEST_SRC) $(SAMPLE_SRC) DEPEND_FILE=$(addprefix $(OBJ_DIR)/, $(addsuffix .d,$(basename $(ALL_SRC)))) diff --git a/common.mk b/common.mk index 4816049..6f4ed72 100644 --- a/common.mk +++ b/common.mk @@ -91,7 +91,7 @@ else CFLAGS_OPT+=$(MARCH) endif endif -CFLAGS_WARN=-Wall -Wextra -Wformat=2 -Wcast-qual -Wcast-align -Wwrite-strings -Wfloat-equal -Wpointer-arith +CFLAGS_WARN=-Wall -Wextra -Wformat=2 -Wcast-qual -Wcast-align -Wwrite-strings -Wfloat-equal -Wpointer-arith -Wundef CFLAGS+=-g3 INC_OPT=-I include -I test CFLAGS+=$(CFLAGS_WARN) $(BIT_OPT) $(INC_OPT) diff --git a/include/mcl/bn.hpp b/include/mcl/bn.hpp index 3668da2..ab9e15c 100644 --- a/include/mcl/bn.hpp +++ b/include/mcl/bn.hpp @@ -854,6 +854,12 @@ struct Param { { this->cp = cp; isBLS12 = cp.curveType == MCL_BLS12_381; +#ifdef MCL_STATIC_CODE + if (!isBLS12) { + *pb = false; + return; + } +#endif gmp::setStr(pb, z, cp.z); if (!*pb) return; isNegative = z < 0; diff --git a/include/mcl/op.hpp b/include/mcl/op.hpp index 99c0e4d..22a78b1 100644 --- a/include/mcl/op.hpp +++ b/include/mcl/op.hpp @@ -16,6 +16,9 @@ #endif #if !defined(MCL_DONT_USE_XBYAK) && (defined(_WIN64) || defined(__x86_64__)) && (MCL_SIZEOF_UNIT == 8) #define MCL_USE_XBYAK +#endif +#if defined(MCL_USE_XBYAK) || defined(MCL_STATIC_CODE) + #define MCL_X64_ASM #define MCL_XBYAK_DIRECT_CALL #endif @@ -202,6 +205,8 @@ struct Op { Unit R3[maxUnitSize]; #ifdef MCL_USE_XBYAK FpGenerator *fg; +#endif +#ifdef MCL_X64_ASM mcl::Array invTbl; #endif void3u fp_addA_; @@ -288,7 +293,7 @@ struct Op { memset(one, 0, sizeof(one)); memset(R2, 0, sizeof(R2)); memset(R3, 0, sizeof(R3)); -#ifdef MCL_USE_XBYAK +#ifdef MCL_X64_ASM invTbl.clear(); #endif fp_addA_ = 0; diff --git a/sample/bench.cpp b/sample/bench.cpp index de81f25..d3c101c 100644 --- a/sample/bench.cpp +++ b/sample/bench.cpp @@ -68,7 +68,7 @@ void benchFp(size_t bitSize, int mode) if (mode & 4) benchFpSub(tbl[i].p, tbl[i].x, tbl[i].y, mcl::fp::FP_LLVM); if (mode & 8) benchFpSub(tbl[i].p, tbl[i].x, tbl[i].y, mcl::fp::FP_LLVM_MONT); #endif -#ifdef MCL_USE_XBYAK +#ifdef MCL_X64_ASM if (mode & 16) benchFpSub(tbl[i].p, tbl[i].x, tbl[i].y, mcl::fp::FP_XBYAK); #endif } @@ -122,7 +122,7 @@ void benchEc(size_t bitSize, int mode, mcl::ec::Mode ecMode) if (mode & 4) benchEcSub(tbl[i], mcl::fp::FP_LLVM, ecMode); if (mode & 8) benchEcSub(tbl[i], mcl::fp::FP_LLVM_MONT, ecMode); #endif -#ifdef MCL_USE_XBYAK +#ifdef MCL_X64_ASM if (mode & 16) benchEcSub(tbl[i], mcl::fp::FP_XBYAK, ecMode); #endif } diff --git a/sample/rawbench.cpp b/sample/rawbench.cpp index 4d7506e..cc74bc3 100644 --- a/sample/rawbench.cpp +++ b/sample/rawbench.cpp @@ -168,7 +168,7 @@ int main(int argc, char *argv[]) benchRaw(tbl[i], mcl::fp::FP_LLVM); benchRaw(tbl[i], mcl::fp::FP_LLVM_MONT); #endif -#ifdef MCL_USE_XBYAK +#ifdef MCL_X64_ASM if (bitSize <= 384) { benchRaw(tbl[i], mcl::fp::FP_XBYAK); } diff --git a/src/dump_code.cpp b/src/dump_code.cpp new file mode 100644 index 0000000..f1655e9 --- /dev/null +++ b/src/dump_code.cpp @@ -0,0 +1,7 @@ +#include + +int main() +{ + mcl::bn::initPairing(mcl::BLS12_381); +} + diff --git a/src/fp.cpp b/src/fp.cpp index b3b07d1..ab3a1a7 100644 --- a/src/fp.cpp +++ b/src/fp.cpp @@ -3,12 +3,14 @@ #include #include #include +#ifdef MCL_STATIC_CODE +#include "fp_static_code.hpp" +#endif #ifdef MCL_USE_XBYAK #include "fp_generator.hpp" #else #define XBYAK_ONLY_CLASS_CPU #include "xbyak/xbyak_util.h" -//#include "detect_cpu.hpp" #endif #include "low_func.hpp" #ifdef MCL_USE_LLVM @@ -315,7 +317,7 @@ void setOp(Op& op, Mode mode) #endif } -#ifdef MCL_USE_XBYAK +#ifdef MCL_X64_ASM inline void invOpForMontC(Unit *y, const Unit *x, const Op& op) { Unit r[maxUnitSize]; @@ -372,6 +374,12 @@ static bool initForMont(Op& op, const Unit *p, Mode mode) op.fp_invOp = &invOpForMontC; initInvTbl(op); } +#elif defined(MCL_STATIC_CODE) + fp::setStaticCode(op); + if (op.isMont && N <= 4) { + op.fp_invOp = &invOpForMontC; + initInvTbl(op); + } #endif return true; } @@ -403,14 +411,25 @@ bool Op::init(const mpz_class& _p, size_t maxBitSize, int _xi_a, Mode mode, size priority : MCL_USE_XBYAK > MCL_USE_LLVM > none Xbyak > llvm_mont > llvm > gmp_mont > gmp */ -#ifdef MCL_USE_XBYAK +#ifdef MCL_X64_ASM if (mode == FP_AUTO) mode = FP_XBYAK; if (mode == FP_XBYAK && bitSize > 384) { mode = FP_AUTO; } +#ifdef MCL_USE_XBYAK if (!isEnableJIT()) { mode = FP_AUTO; } +#elif MCL_STATIC_CODE + { + // static jit code uses avx, mulx, adox, adcx + using namespace Xbyak::util; + Cpu cpu; + if (!(cpu.has(Cpu::tAVX) && cpu.has(Cpu::tBMI2) && cpu.has(Cpu::tADX))) { + mode = FP_AUTO; + } + } +#endif #else if (mode == FP_XBYAK) mode = FP_AUTO; #endif diff --git a/src/fp_generator.hpp b/src/fp_generator.hpp index 97ce9ae..4243368 100644 --- a/src/fp_generator.hpp +++ b/src/fp_generator.hpp @@ -7,7 +7,6 @@ http://opensource.org/licenses/BSD-3-Clause */ #if CYBOZU_HOST == CYBOZU_HOST_INTEL -#define XBYAK_NO_OP_NAMES #define XBYAK_DISABLE_AVX512 #include "xbyak/xbyak_util.h" @@ -25,6 +24,61 @@ namespace mcl { +#ifdef MCL_DUMP_JIT +struct DumpCode { + FILE *fp_; + DumpCode() + : fp_(stdout) + { + } + void set(const std::string& name, const uint8_t *begin, const size_t size) + { + fprintf(fp_, "segment .text\n"); + fprintf(fp_, "global %s\n", name.c_str()); + fprintf(fp_, "align 16\n"); + fprintf(fp_, "%s:\n", name.c_str()); + const uint8_t *p = begin; + size_t remain = size; + while (remain > 0) { + size_t n = remain >= 16 ? 16 : remain; + fprintf(fp_, "db "); + for (size_t i = 0; i < n; i++) { + fprintf(fp_, "0x%02x,", *p++); + } + fprintf(fp_, "\n"); + remain -= n; + } + } + void dumpData(const void *begin, const void *end) + { + fprintf(fp_, "align 16\n"); + fprintf(fp_, "dq "); + const uint64_t *p = (const uint64_t*)begin; + const uint64_t *pe = (const uint64_t*)end; + const size_t n = pe - p; + for (size_t i = 0; i < n; i++) { + fprintf(fp_, "0x%016llx,", (unsigned long long)*p++); + } + fprintf(fp_, "\n"); + } +}; +template +void setFuncInfo(DumpCode& prof, const char *suf, const char *name, const T& begin, const uint8_t* end) +{ + if (suf == 0) suf = ""; + const uint8_t*p = (const uint8_t*)begin; + prof.set(std::string("mclx_") + suf + name, p, end - p); +} +#else +template +void setFuncInfo(Xbyak::util::Profiler& prof, const char *suf, const char *name, const T& begin, const uint8_t* end) +{ + if (suf == 0) suf = ""; + const uint8_t*p = (const uint8_t*)begin; + prof.set((std::string("mclx_") + suf + name).c_str(), p, end - p); +} +#endif + namespace fp_gen_local { class MemReg { @@ -203,7 +257,11 @@ struct FpGenerator : Xbyak::CodeGenerator { int pn_; int FpByte_; bool isFullBit_; +#ifdef MCL_DUMP_JIT + DumpCode prof_; +#else Xbyak::util::Profiler prof_; +#endif /* @param op [in] ; use op.p, op.N, op.isFullBit @@ -257,19 +315,22 @@ struct FpGenerator : Xbyak::CodeGenerator { private: void init_inner(Op& op) { + const char *suf = op.xi_a ? "Fp" : "Fr"; op_ = &op; L(pL_); p_ = reinterpret_cast(getCurr()); for (size_t i = 0; i < op.N; i++) { dq(op.p[i]); } +#ifdef MCL_DUMP_JIT + prof_.dumpData(p_, getCurr()); +#endif rp_ = fp::getMontgomeryCoeff(p_[0]); pn_ = (int)op.N; FpByte_ = int(op.maxN * sizeof(uint64_t)); isFullBit_ = op.isFullBit; // printf("p=%p, pn_=%d, isFullBit_=%d\n", p_, pn_, isFullBit_); #ifdef MCL_USE_PROF - static char suf[] = "_0"; int profMode = 0; #ifdef XBYAK_USE_VTUNE profMode = 2; @@ -281,94 +342,116 @@ private: if (profMode) { prof_.init(profMode); prof_.setStartAddr(getCurr()); - prof_.setNameSuffix(suf); - suf[1]++; } +#else + (void)suf; #endif + align(16); op.fp_addPre = gen_addSubPre(true, pn_); - prof_.set("Fp_addPre", getCurr()); + setFuncInfo(prof_, suf, "_addPre", op.fp_addPre, getCurr()); + align(16); op.fp_subPre = gen_addSubPre(false, pn_); - prof_.set("Fp_subPre", getCurr()); + setFuncInfo(prof_, suf, "_subPre", op.fp_subPre, getCurr()); + align(16); op.fp_addA_ = gen_fp_add(); - prof_.set("Fp_add", getCurr()); + setFuncInfo(prof_, suf, "_add", op.fp_addA_, getCurr()); + align(16); op.fp_subA_ = gen_fp_sub(); - prof_.set("Fp_sub", getCurr()); + setFuncInfo(prof_, suf, "_sub", op.fp_subA_, getCurr()); + align(16); op.fp_shr1 = gen_shr1(); - prof_.set("Fp_shr1", getCurr()); + setFuncInfo(prof_, suf, "_shr1", op.fp_shr1, getCurr()); + align(16); op.fp_negA_ = gen_fp_neg(); - prof_.set("Fp_neg", getCurr()); - - op.fpDbl_addA_ = gen_fpDbl_add(); - prof_.set("FpDbl_add", getCurr()); - - op.fpDbl_subA_ = gen_fpDbl_sub(); - prof_.set("FpDbl_sub", getCurr()); - - op.fpDbl_addPre = gen_addSubPre(true, pn_ * 2); - prof_.set("FpDbl_addPre", getCurr()); - - op.fpDbl_subPre = gen_addSubPre(false, pn_ * 2); - prof_.set("FpDbl_subPre", getCurr()); - - op.fpDbl_mulPreA_ = gen_fpDbl_mulPre(); - prof_.set("FpDbl_mulPre", getCurr()); - - op.fpDbl_sqrPreA_ = gen_fpDbl_sqrPre(); - prof_.set("FpDbl_sqrPre", getCurr()); - + setFuncInfo(prof_, suf, "_neg", op.fp_negA_, getCurr()); + align(16); op.fpDbl_modA_ = gen_fpDbl_mod(op); - prof_.set("FpDbl_mod", getCurr()); - + setFuncInfo(prof_, suf, "Dbl_mod", op.fpDbl_modA_, getCurr()); + align(16); op.fp_mulA_ = gen_mul(); - prof_.set("Fp_mul", getCurr()); + setFuncInfo(prof_, suf, "_mul", op.fp_mulA_, getCurr()); + if (op.fp_mulA_) { op.fp_mul = fp::func_ptr_cast(op.fp_mulA_); // used in toMont/fromMont } + + align(16); op.fp_sqrA_ = gen_sqr(); - prof_.set("Fp_sqr", getCurr()); + setFuncInfo(prof_, suf, "_sqr", op.fp_sqrA_, getCurr()); if (op.primeMode != PM_NIST_P192 && op.N <= 4) { // support general op.N but not fast for op.N > 4 align(16); op.fp_preInv = getCurr(); gen_preInv(); - prof_.set("preInv", getCurr()); + setFuncInfo(prof_, suf, "_preInv", op.fp_preInv, getCurr()); } if (op.xi_a == 0) return; // Fp2 is not used + align(16); + op.fpDbl_addA_ = gen_fpDbl_add(); + setFuncInfo(prof_, suf, "Dbl_add", op.fpDbl_addA_, getCurr()); + + align(16); + op.fpDbl_subA_ = gen_fpDbl_sub(); + setFuncInfo(prof_, suf, "Dbl_sub", op.fpDbl_subA_, getCurr()); + + align(16); + op.fpDbl_addPre = gen_addSubPre(true, pn_ * 2); + setFuncInfo(prof_, suf, "Dbl_addPre", op.fpDbl_addPre, getCurr()); + + align(16); + op.fpDbl_subPre = gen_addSubPre(false, pn_ * 2); + setFuncInfo(prof_, suf, "Dbl_subPre", op.fpDbl_subPre, getCurr()); + + align(16); + op.fpDbl_mulPreA_ = gen_fpDbl_mulPre(); + setFuncInfo(prof_, suf, "Dbl_mulPre", op.fpDbl_mulPreA_, getCurr()); + + align(16); + op.fpDbl_sqrPreA_ = gen_fpDbl_sqrPre(); + setFuncInfo(prof_, suf, "Dbl_sqrPre", op.fpDbl_sqrPreA_, getCurr()); + + align(16); op.fp2_addA_ = gen_fp2_add(); - prof_.set("Fp2_add", getCurr()); + setFuncInfo(prof_, suf, "2_add", op.fp2_addA_, getCurr()); + align(16); op.fp2_subA_ = gen_fp2_sub(); - prof_.set("Fp2_sub", getCurr()); + setFuncInfo(prof_, suf, "2_sub", op.fp2_subA_, getCurr()); + align(16); op.fp2_negA_ = gen_fp2_neg(); - prof_.set("Fp2_neg", getCurr()); + setFuncInfo(prof_, suf, "2_neg", op.fp2_negA_, getCurr()); op.fp2_mulNF = 0; + align(16); op.fp2Dbl_mulPreA_ = gen_fp2Dbl_mulPre(); - prof_.set("Fp2Dbl_mulPre", getCurr()); + if (op.fp2Dbl_mulPreA_) setFuncInfo(prof_, suf, "2Dbl_mulPre", op.fp2Dbl_mulPreA_, getCurr()); + align(16); op.fp2Dbl_sqrPreA_ = gen_fp2Dbl_sqrPre(); - prof_.set("Fp2Dbl_sqrPre", getCurr()); + if (op.fp2Dbl_sqrPreA_) setFuncInfo(prof_, suf, "2Dbl_sqrPre", op.fp2Dbl_sqrPreA_, getCurr()); + align(16); op.fp2_mulA_ = gen_fp2_mul(); - prof_.set("Fp2_mul", getCurr()); + setFuncInfo(prof_, suf, "2_mul", op.fp2_mulA_, getCurr()); + align(16); op.fp2_sqrA_ = gen_fp2_sqr(); - prof_.set("Fp2_sqr", getCurr()); + setFuncInfo(prof_, suf, "2_sqr", op.fp2_sqrA_, getCurr()); + align(16); op.fp2_mul_xiA_ = gen_fp2_mul_xi(); - prof_.set("Fp2_mul_xi", getCurr()); + setFuncInfo(prof_, suf, "2_mul_xi", op.fp2_mul_xiA_, getCurr()); } u3u gen_addSubPre(bool isAdd, int n) { // if (isFullBit_) return 0; - align(16); u3u func = getCurr(); StackFrame sf(this, 3); if (isAdd) { @@ -429,7 +512,7 @@ private: } jmp(exit); L(nonZero); - mov(rax, pL_); + lea(rax, ptr[rip+pL_]); for (size_t i = 0; i < t.size(); i++) { mov(rdx, ptr [rax + i * 8]); if (i == 0) { @@ -557,7 +640,7 @@ private: mov(*fullReg, 0); adc(*fullReg, 0); } - mov(rax, pL_); + lea(rax, ptr[rip+pL_]); sub_rm(p1, rax); if (fullReg) { sbb(*fullReg, 0); @@ -577,7 +660,7 @@ private: const Pack& p1 = t.sub(pn_, pn_); load_rm(p0, px); sub_rm(p0, py, withCarry); - mov(rax, pL_); + lea(rax, ptr[rip+pL_]); load_rm(p1, rax); sbb(rax, rax); // rax = (x > y) ? 0 : -1 for (size_t i = 0; i < p1.size(); i++) { @@ -618,7 +701,7 @@ private: Label exit; if (isFullBit_) { jnc("@f"); - mov(t2[0], pL_); // t2 is not used + lea(t2[0], ptr[rip+pL_]); // t2[0] is not used sub_rm(t1, t2[0]); jmp(exit); L("@@"); @@ -648,7 +731,6 @@ private: } void3u gen_fp_add() { - align(16); void3u func = getCurr(); if (pn_ <= 4) { gen_fp_add_le4(); @@ -666,7 +748,7 @@ private: inLocalLabel(); gen_raw_add(pz, px, py, rax, pn_); - mov(px, pL_); // destroy px + lea(px, ptr[rip+pL_]); if (isFullBit_) { jc(".over", jmpMode); } @@ -696,7 +778,6 @@ private: } void3u gen_fpDbl_add() { - align(16); void3u func = getCurr(); if (pn_ <= 4) { int tn = pn_ * 2 + (isFullBit_ ? 1 : 0); @@ -724,7 +805,6 @@ private: } void3u gen_fpDbl_sub() { - align(16); void3u func = getCurr(); if (pn_ <= 4) { int tn = pn_ * 2; @@ -774,7 +854,6 @@ private: } void3u gen_fp_sub() { - align(16); void3u func = getCurr(); if (pn_ <= 4) { gen_fp_sub_le4(); @@ -792,14 +871,13 @@ private: Label exit; gen_raw_sub(pz, px, py, rax, pn_); jnc(exit, jmpMode); - mov(px, pL_); + lea(px, ptr[rip+pL_]); gen_raw_add(pz, pz, px, rax, pn_); L(exit); return func; } void2u gen_fp_neg() { - align(16); void2u func = getCurr(); StackFrame sf(this, 2, UseRDX | pn_); gen_raw_neg(sf.p[0], sf.p[1], sf.t); @@ -807,7 +885,6 @@ private: } void2u gen_shr1() { - align(16); void2u func = getCurr(); const int c = 1; StackFrame sf(this, 2, 1); @@ -828,7 +905,6 @@ private: } void3u gen_mul() { - align(16); void3u func = getCurr(); if (op_->primeMode == PM_NIST_P192) { StackFrame sf(this, 3, 10 | UseRDX, 8 * 6); @@ -901,7 +977,7 @@ private: mov(a, rp_); mul(t6); - mov(t0, pL_); + lea(t0, ptr[rip+pL_]); mov(t7, a); // q // [d:t7:t1] = p * q @@ -970,7 +1046,7 @@ private: mov(a, rp_); mul(t10); - mov(t0, pL_); + lea(t0, ptr[rip+pL_]); mov(t7, a); // q // [d:t7:t2:t1] = p * q @@ -1050,7 +1126,7 @@ private: mov(a, rp_); mul(z); - mov(t0, pL_); + lea(t0, ptr[rip+pL_]); mov(t7, a); // q // [d:t7:t3:t2:t1] = p * q @@ -1141,7 +1217,6 @@ private: } void2u gen_fpDbl_mod(const fp::Op& op) { - align(16); void2u func = getCurr(); if (op.primeMode == PM_NIST_P192) { StackFrame sf(this, 2, 6 | UseRDX); @@ -1187,7 +1262,6 @@ private: } void2u gen_sqr() { - align(16); void2u func = getCurr(); if (op_->primeMode == PM_NIST_P192) { StackFrame sf(this, 3, 10 | UseRDX, 6 * 8); @@ -1308,7 +1382,7 @@ private: L(fp_mulL); vmovq(xm0, p0); // save p0 - mov(p0, pL_); + lea(p0, ptr[rip+pL_]); vmovq(xm1, p2); mov(p2, ptr [p2]); montgomery4_1(rp_, t0, t7, t3, t2, t1, p1, p2, p0, t4, t5, t6, t8, t9, true, xm2); @@ -1404,7 +1478,7 @@ private: mov(a, rp_); mul(c[0]); // q = a mov(d, a); - mov(t1, pL_); + lea(t1, ptr[rip+pL_]); // c += p * q mulAdd(c, 6, t1); } @@ -1450,7 +1524,7 @@ private: const Pack z = Pack(t3, t2, t1, t0, t7, t6); const Pack keep = Pack(rdx, rax, px, py, t8, t9); mov_rr(keep, z); - mov(t5, pL_); + lea(t5, ptr[rip+pL_]); sub_rm(z, t5); cmovc_rr(z, keep); store_mr(pz, z); @@ -1480,7 +1554,7 @@ private: const Reg64& t9 = sf.t[9]; vmovq(xm0, p0); // save p0 - mov(t7, pL_); + lea(t7, ptr[rip+pL_]); mov(t9, ptr [p2]); // c3, c2, c1, c0, px, y, p, montgomery3_1(rp_, t0, t3, t2, t1, p1, t9, t7, t4, t5, t6, t8, p0, true); @@ -1526,7 +1600,7 @@ private: const Reg64& t9 = sf.t[9]; vmovq(xm0, pz); // save pz - mov(t7, pL_); + lea(t7, ptr[rip+pL_]); mov(t9, ptr [px]); mul3x1_sqr1(px, t9, t3, t2, t1, t0); mov(t0, rdx); @@ -2291,7 +2365,6 @@ private: } void2u gen_fpDbl_sqrPre() { - align(16); void2u func = getCurr(); if (pn_ == 2 && useMulx_) { StackFrame sf(this, 2, 7 | UseRDX); @@ -2332,7 +2405,6 @@ private: } void3u gen_fpDbl_mulPre() { - align(16); void3u func = getCurr(); if (pn_ == 2 && useMulx_) { StackFrame sf(this, 3, 5 | UseRDX); @@ -2630,7 +2702,7 @@ private: mov(rax, px); // px is free frome here load_mp(vv, rax, t); // v = x - mov(rax, pL_); + lea(rax, ptr[rip+pL_]); load_mp(uu, rax, t); // u = p_ // k = 0 xor_(rax, rax); @@ -2708,7 +2780,7 @@ private: const Reg64& t2 = ss.getReg(0); const Reg64& t3 = rdx; - mov(t2, pL_); + lea(t2, ptr[rip+pL_]); if (isFullBit_) { mov(t, ptr [rTop]); test(t, t); @@ -3373,7 +3445,6 @@ private: // if (pn_ != 4 && !(pn_ == 6 && useMulx_ && useAdx_)) return 0; // almost same for pn_ == 6 if (pn_ != 4) return 0; - align(16); void3u func = getCurr(); const RegExp z = rsp + 0 * 8; @@ -3438,7 +3509,6 @@ private: // if (pn_ != 4 && !(pn_ == 6 && useMulx_ && useAdx_)) return 0; // almost same for pn_ == 6 if (pn_ != 4) return 0; - align(16); void2u func = getCurr(); // almost same for pn_ == 6 if (pn_ != 4) return 0; @@ -3524,7 +3594,6 @@ private: } void3u gen_fp2_add() { - align(16); void3u func = getCurr(); if (pn_ == 4 && !isFullBit_) { gen_fp2_add4(); @@ -3538,7 +3607,6 @@ private: } void3u gen_fp2_sub() { - align(16); void3u func = getCurr(); if (pn_ == 4 && !isFullBit_) { gen_fp2_sub4(); @@ -3584,7 +3652,7 @@ private: } } sub_rr(a, b); - mov(rax, pL_); + lea(rax, ptr[rip+pL_]); load_rm(b, rax); sbb(rax, rax); for (int i = 0; i < pn_; i++) { @@ -3592,7 +3660,7 @@ private: } add_rr(a, b); store_mr(py, a); - mov(rax, pL_); + lea(rax, ptr[rip+pL_]); mov_rr(a, t); sub_rm(t, rax); cmovc_rr(t, a); @@ -3610,7 +3678,7 @@ private: mov_rr(b, a); add_rm(b, px + FpByte_); sub_rm(a, px + FpByte_); - mov(rax, pL_); + lea(rax, ptr[rip+pL_]); jnc("@f"); add_rm(a, rax); L("@@"); @@ -3624,7 +3692,6 @@ private: { if (isFullBit_) return 0; if (op_->xi_a != 1) return 0; - align(16); void2u func = getCurr(); if (pn_ == 4) { gen_fp2_mul_xi4(); @@ -3638,7 +3705,6 @@ private: } void2u gen_fp2_neg() { - align(16); void2u func = getCurr(); if (pn_ <= 6) { StackFrame sf(this, 2, UseRDX | pn_); @@ -3652,7 +3718,6 @@ private: { if (isFullBit_) return 0; if (pn_ != 4 && !(pn_ == 6 && useMulx_ && useAdx_)) return 0; - align(16); void3u func = getCurr(); bool embedded = pn_ == 4; @@ -3729,7 +3794,6 @@ private: { if (isFullBit_) return 0; if (pn_ != 4 && !(pn_ == 6 && useMulx_ && useAdx_)) return 0; - align(16); void2u func = getCurr(); const RegExp y = rsp + 0 * 8; @@ -3789,7 +3853,7 @@ private: mov(ptr [(RegExp)t2 + i * 8], rax); } // t3 = a + p - b - mov(rax, pL_); + lea(rax, ptr[rip+pL_]); add_rm(a, rax); sub_rr(a, b); store_mr(t3, a); diff --git a/src/fp_static_code.hpp b/src/fp_static_code.hpp new file mode 100644 index 0000000..832062e --- /dev/null +++ b/src/fp_static_code.hpp @@ -0,0 +1,92 @@ +#pragma once +/** + @file + @brief Fp generator + @author MITSUNARI Shigeo(@herumi) + @license modified new BSD license + http://opensource.org/licenses/BSD-3-Clause +*/ +#ifndef MCL_STATIC_CODE + #error "define MCL_STATIC_CODE" +#endif + +namespace mcl { namespace fp { + +extern "C" { + +Unit mclx_Fp_addPre(Unit*, const Unit*, const Unit*); +Unit mclx_Fp_subPre(Unit*, const Unit*, const Unit*); +void mclx_Fp_add(Unit*, const Unit*, const Unit*); +void mclx_Fp_sub(Unit*, const Unit*, const Unit*); +void mclx_Fp_shr1(Unit*, const Unit*); +void mclx_Fp_neg(Unit*, const Unit*); +void mclx_FpDbl_mod(Unit*, const Unit*); +void mclx_Fp_mul(Unit*, const Unit*, const Unit*); +void mclx_Fp_sqr(Unit*, const Unit*); +void mclx_FpDbl_add(Unit*, const Unit*, const Unit*); +void mclx_FpDbl_sub(Unit*, const Unit*, const Unit*); +Unit mclx_FpDbl_addPre(Unit*, const Unit*, const Unit*); +Unit mclx_FpDbl_subPre(Unit*, const Unit*, const Unit*); +void mclx_FpDbl_mulPre(Unit*, const Unit*, const Unit*); +void mclx_FpDbl_sqrPre(Unit*, const Unit*); +void mclx_Fp2_add(Unit*, const Unit*, const Unit*); +void mclx_Fp2_sub(Unit*, const Unit*, const Unit*); +void mclx_Fp2_neg(Unit*, const Unit*); +void mclx_Fp2_mul(Unit*, const Unit*, const Unit*); +void mclx_Fp2_sqr(Unit*, const Unit*); +void mclx_Fp2_mul_xi(Unit*, const Unit*); + +Unit mclx_Fr_addPre(Unit*, const Unit*, const Unit*); +Unit mclx_Fr_subPre(Unit*, const Unit*, const Unit*); +void mclx_Fr_add(Unit*, const Unit*, const Unit*); +void mclx_Fr_sub(Unit*, const Unit*, const Unit*); +void mclx_Fr_shr1(Unit*, const Unit*); +void mclx_Fr_neg(Unit*, const Unit*); +void mclx_Fr_mul(Unit*, const Unit*, const Unit*); +void mclx_Fr_sqr(Unit*, const Unit*); +int mclx_Fr_preInv(Unit*, const Unit*); +} // extern "C" + +void setStaticCode(mcl::fp::Op& op) +{ + if (op.xi_a) { + // Fp, sizeof(Fp) = 48, supports Fp2 + op.fp_addPre = mclx_Fp_addPre; + op.fp_subPre = mclx_Fp_subPre; + op.fp_addA_ = mclx_Fp_add; + op.fp_subA_ = mclx_Fp_sub; + op.fp_shr1 = mclx_Fp_shr1; + op.fp_negA_ = mclx_Fp_neg; + op.fpDbl_modA_ = mclx_FpDbl_mod; + op.fp_mulA_ = mclx_Fp_mul; + op.fp_sqrA_ = mclx_Fp_sqr; + op.fpDbl_addA_ = mclx_FpDbl_add; + op.fpDbl_subA_ = mclx_FpDbl_sub; + op.fpDbl_addPre = mclx_FpDbl_addPre; + op.fpDbl_subPre = mclx_FpDbl_subPre; + op.fpDbl_mulPreA_ = mclx_FpDbl_mulPre; + op.fpDbl_sqrPreA_ = mclx_FpDbl_sqrPre; + op.fp2_addA_ = mclx_Fp2_add; + op.fp2_subA_ = mclx_Fp2_sub; + op.fp2_negA_ = mclx_Fp2_neg; + op.fp2_mulNF = 0; + op.fp2_mulA_ = mclx_Fp2_mul; + op.fp2_sqrA_ = mclx_Fp2_sqr; + op.fp2_mul_xiA_ = mclx_Fp2_mul_xi; + } else { + // Fr, sizeof(Fr) = 32 + op.fp_addPre = mclx_Fr_addPre; + op.fp_subPre = mclx_Fr_subPre; + op.fp_addA_ = mclx_Fr_add; + op.fp_subA_ = mclx_Fr_sub; + op.fp_shr1 = mclx_Fr_shr1; + op.fp_negA_ = mclx_Fr_neg; + op.fp_mulA_ = mclx_Fr_mul; + op.fp_sqrA_ = mclx_Fr_sqr; + op.fp_preInv = mclx_Fr_preInv; + } + op.fp_mul = fp::func_ptr_cast(op.fp_mulA_); +} + +} } // mcl::fp + diff --git a/src/low_func.hpp b/src/low_func.hpp index 89a748e..9192e51 100644 --- a/src/low_func.hpp +++ b/src/low_func.hpp @@ -16,8 +16,10 @@ #endif #ifndef MCL_LLVM_BMI2 - #if (CYBOZU_HOST == CYBOZU_HOST_INTEL) && defined(MCL_USE_XBYAK) && !defined(MCL_USE_VINT) + #if (CYBOZU_HOST == CYBOZU_HOST_INTEL) && !defined(MCL_STATIC_CODE) && !defined(MCL_USE_VINT) #define MCL_LLVM_BMI2 1 + #else + #define MCL_LLVM_BMI2 0 #endif #endif diff --git a/test/bench.hpp b/test/bench.hpp index c8c3911..f7acfce 100644 --- a/test/bench.hpp +++ b/test/bench.hpp @@ -100,6 +100,19 @@ void testBench(const G1& P, const G2& Q) CYBOZU_BENCH_C("Fp::mul ", C3, Fp::mul, x, x, y); CYBOZU_BENCH_C("Fp::sqr ", C3, Fp::sqr, x, x); CYBOZU_BENCH_C("Fp::inv ", C3, Fp::inv, x, x); + CYBOZU_BENCH_C("Fp::pow ", C3, Fp::pow, x, x, y); + { + Fr a, b, c; + a.setHashOf("abc", 3); + b.setHashOf("123", 3); + CYBOZU_BENCH_C("Fr::add ", C3, Fr::add, a, a, b); + CYBOZU_BENCH_C("Fr::sub ", C3, Fr::sub, a, a, b); + CYBOZU_BENCH_C("Fr::neg ", C3, Fr::neg, a, a); + CYBOZU_BENCH_C("Fr::mul ", C3, Fr::mul, a, a, b); + CYBOZU_BENCH_C("Fr::sqr ", C3, Fr::sqr, a, a); + CYBOZU_BENCH_C("Fr::inv ", C3, Fr::inv, a, a); + CYBOZU_BENCH_C("Fr::pow ", C3, Fr::pow, a, a, b); + } Fp2 xx, yy; xx.a = x; xx.b = 3; diff --git a/test/bls12_test.cpp b/test/bls12_test.cpp index 723bf3a..ec4204c 100644 --- a/test/bls12_test.cpp +++ b/test/bls12_test.cpp @@ -688,6 +688,8 @@ CYBOZU_TEST_AUTO(multi) G1 P; G2 Q; int i; + +#ifndef MCL_STATIC_CODE puts("BN254"); testCurve(mcl::BN254); i = 1; @@ -695,6 +697,7 @@ CYBOZU_TEST_AUTO(multi) CYBOZU_BENCH_C("naiveG2", 100, (BN::param.mapTo.naiveMapTo), P, i++); CYBOZU_BENCH_C("calcBN2", 100, (BN::param.mapTo.calcBN), Q, i++); CYBOZU_BENCH_C("naiveG2", 100, (BN::param.mapTo.naiveMapTo), Q, i++); +#endif puts("BLS12_381"); testCurve(mcl::BLS12_381); i = 1; @@ -861,7 +864,11 @@ int main(int argc, char *argv[]) return 1; } g_mode = mcl::fp::StrToMode(mode); +#ifdef MCL_STATIC_CODE + printf("static code for BLS12-381\n"); +#else printf("JIT %d\n", mcl::fp::isEnableJIT()); +#endif #if 0 initPairing(mcl::BLS12_381); cybozu::XorShift rg; diff --git a/test/ec_test.cpp b/test/ec_test.cpp index a3e79e5..855ceba 100644 --- a/test/ec_test.cpp +++ b/test/ec_test.cpp @@ -602,7 +602,7 @@ void test_sub(const mcl::EcParam *para, size_t paraNum) test_sub_sub(para[i], mcl::fp::FP_LLVM); test_sub_sub(para[i], mcl::fp::FP_LLVM_MONT); #endif -#ifdef MCL_USE_XBYAK +#ifdef MCL_X64_ASM test_sub_sub(para[i], mcl::fp::FP_XBYAK); #endif mulVec(para[i]); diff --git a/test/fp_test.cpp b/test/fp_test.cpp index 469f35d..70fef8a 100644 --- a/test/fp_test.cpp +++ b/test/fp_test.cpp @@ -876,7 +876,7 @@ void modpTest() } #include -#if (defined(MCL_USE_LLVM) || defined(MCL_USE_XBYAK)) && (MCL_MAX_BIT_SIZE >= 521) +#if (defined(MCL_USE_LLVM) || defined(MCL_X64_ASM)) && (MCL_MAX_BIT_SIZE >= 521) CYBOZU_TEST_AUTO(mod_NIST_P521) { const size_t len = 521; @@ -908,7 +908,7 @@ CYBOZU_TEST_AUTO(mod_NIST_P521) mcl_fpDbl_mod_NIST_P521L(ex, in, Fp::getOp().p); CYBOZU_TEST_EQUAL_ARRAY(ex, ok, N + 1); #endif -#ifdef MCL_USE_XBYAK +#ifdef MCL_X64_ASM const mcl::fp::Op& op = Fp::getOp(); if (!op.isMont) { op.fpDbl_mod(ex, in, op.p); @@ -1014,7 +1014,7 @@ CYBOZU_TEST_AUTO(main) sub(mcl::fp::FP_LLVM_MONT); } #endif -#ifdef MCL_USE_XBYAK +#ifdef MCL_X64_ASM if (g_mode.empty() || g_mode == "xbyak") { sub(mcl::fp::FP_XBYAK); } diff --git a/test/fp_tower_test.cpp b/test/fp_tower_test.cpp index c26c5d7..4576376 100644 --- a/test/fp_tower_test.cpp +++ b/test/fp_tower_test.cpp @@ -465,7 +465,7 @@ void testAll() test(p, mcl::fp::FP_LLVM); test(p, mcl::fp::FP_LLVM_MONT); #endif -#ifdef MCL_USE_XBYAK +#ifdef MCL_X64_ASM test(p, mcl::fp::FP_XBYAK); #endif } diff --git a/test/static_code_test.cpp b/test/static_code_test.cpp new file mode 100644 index 0000000..3751741 --- /dev/null +++ b/test/static_code_test.cpp @@ -0,0 +1,70 @@ +#include +#include + +using namespace mcl::bn; + +CYBOZU_TEST_AUTO(init) +{ + initPairing(mcl::BLS12_381); +} + +CYBOZU_TEST_AUTO(Fr) +{ + Fr x, y; + x = 3; + y = 5; + CYBOZU_TEST_EQUAL(x + y, 8); + CYBOZU_TEST_EQUAL(x - y, -2); + CYBOZU_TEST_EQUAL(x * y, 15); +} + +CYBOZU_TEST_AUTO(Fp) +{ + Fp x, y; + x = 3; + y = 5; + CYBOZU_TEST_EQUAL(x + y, 8); + CYBOZU_TEST_EQUAL(x - y, -2); + CYBOZU_TEST_EQUAL(x * y, 15); +} + +CYBOZU_TEST_AUTO(Fp2) +{ + Fp2 x, y; + x.a = 3; + x.b = 2; + y.a = 1; + y.b = 4; + /* + (3+2i)(1+4i)=3-8+(12+2)i + */ + CYBOZU_TEST_EQUAL(x + y, Fp2(4, 6)); + CYBOZU_TEST_EQUAL(x - y, Fp2(2, -2)); + CYBOZU_TEST_EQUAL(x * y, Fp2(-5, 14)); +} + +CYBOZU_TEST_AUTO(G1) +{ + G1 P, Q; + hashAndMapToG1(P, "abc", 3); + Fr r1, r2; + r1.setHashOf("abc", 3); + r2 = -r1; + G1::mul(Q, P, r1); + Q = -Q; + P *= r2; + CYBOZU_TEST_EQUAL(P, Q); +} + +CYBOZU_TEST_AUTO(G2) +{ + G2 P, Q; + hashAndMapToG2(P, "abc", 3); + Fr r1, r2; + r1.setHashOf("abc", 3); + r2 = -r1; + G2::mul(Q, P, r1); + Q = -Q; + P *= r2; + CYBOZU_TEST_EQUAL(P, Q); +}