From 10621c6299d3db1c88fd0c27e63654edada08049 Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Tue, 15 Sep 2020 16:20:14 +0900 Subject: [PATCH 01/15] refactor profiler --- include/mcl/bn.hpp | 4 +- include/mcl/fp.hpp | 12 ++--- include/mcl/op.hpp | 2 +- src/fp.cpp | 10 ++-- src/fp_generator.hpp | 111 ++++++++++++++++++++++++++++++++----------- 5 files changed, 97 insertions(+), 42 deletions(-) diff --git a/include/mcl/bn.hpp b/include/mcl/bn.hpp index 3668da2..8710f55 100644 --- a/include/mcl/bn.hpp +++ b/include/mcl/bn.hpp @@ -875,9 +875,9 @@ struct Param { assert((p % 6) == 1); r = local::evalPoly(z, rCoff); } - Fr::init(pb, r, mode); + Fr::init(pb, r, mode, "Fr"); if (!*pb) return; - Fp::init(pb, cp.xi_a, p, mode); + Fp::init(pb, cp.xi_a, p, mode, "Fp"); if (!*pb) return; Fp2::init(); const Fp2 xi(cp.xi_a, 1); diff --git a/include/mcl/fp.hpp b/include/mcl/fp.hpp index 6c5b0b0..c8b5a6d 100644 --- a/include/mcl/fp.hpp +++ b/include/mcl/fp.hpp @@ -130,10 +130,10 @@ public: xi_a is used for Fp2::mul_xi(), where xi = xi_a + i and i^2 = -1 if xi_a = 0 then asm functions for Fp2 are not generated. */ - static inline void init(bool *pb, int xi_a, const mpz_class& p, fp::Mode mode = fp::FP_AUTO) + static inline void init(bool *pb, int xi_a, const mpz_class& p, fp::Mode mode = fp::FP_AUTO, const char *suf = 0) { assert(maxBitSize <= MCL_MAX_BIT_SIZE); - *pb = op_.init(p, maxBitSize, xi_a, mode); + *pb = op_.init(p, maxBitSize, xi_a, mode, suf); if (!*pb) return; { // set oneRep FpT& one = *reinterpret_cast(op_.oneRep); @@ -163,16 +163,16 @@ public: #endif *pb = true; } - static inline void init(bool *pb, const mpz_class& p, fp::Mode mode = fp::FP_AUTO) + static inline void init(bool *pb, const mpz_class& p, fp::Mode mode = fp::FP_AUTO, const char *suf = 0) { - init(pb, 0, p, mode); + init(pb, 0, p, mode, suf); } - static inline void init(bool *pb, const char *mstr, fp::Mode mode = fp::FP_AUTO) + static inline void init(bool *pb, const char *mstr, fp::Mode mode = fp::FP_AUTO, const char *suf = 0) { mpz_class p; gmp::setStr(pb, p, mstr); if (!*pb) return; - init(pb, p, mode); + init(pb, p, mode, suf); } static inline size_t getModulo(char *buf, size_t bufSize) { diff --git a/include/mcl/op.hpp b/include/mcl/op.hpp index 99c0e4d..45320e5 100644 --- a/include/mcl/op.hpp +++ b/include/mcl/op.hpp @@ -364,7 +364,7 @@ struct Op { */ fp_mul(y, x, R2, p); } - bool init(const mpz_class& p, size_t maxBitSize, int xi_a, Mode mode, size_t mclMaxBitSize = MCL_MAX_BIT_SIZE); + bool init(const mpz_class& p, size_t maxBitSize, int xi_a, Mode mode, const char *suf = 0, size_t mclMaxBitSize = MCL_MAX_BIT_SIZE); #ifdef MCL_USE_XBYAK static FpGenerator* createFpGenerator(); static void destroyFpGenerator(FpGenerator *fg); diff --git a/src/fp.cpp b/src/fp.cpp index b3b07d1..998a53b 100644 --- a/src/fp.cpp +++ b/src/fp.cpp @@ -346,7 +346,7 @@ static void initInvTbl(Op& op) } #endif -static bool initForMont(Op& op, const Unit *p, Mode mode) +static bool initForMont(Op& op, const Unit *p, Mode mode, const char *suf) { const size_t N = op.N; bool b; @@ -366,17 +366,19 @@ static bool initForMont(Op& op, const Unit *p, Mode mode) if (mode != FP_XBYAK) return true; #ifdef MCL_USE_XBYAK if (op.fg == 0) op.fg = Op::createFpGenerator(); - bool useXbyak = op.fg->init(op); + bool useXbyak = op.fg->init(op, suf); if (useXbyak && op.isMont && N <= 4) { op.fp_invOp = &invOpForMontC; initInvTbl(op); } +#else + (void)suf; #endif return true; } -bool Op::init(const mpz_class& _p, size_t maxBitSize, int _xi_a, Mode mode, size_t mclMaxBitSize) +bool Op::init(const mpz_class& _p, size_t maxBitSize, int _xi_a, Mode mode, const char *suf, size_t mclMaxBitSize) { if (mclMaxBitSize != MCL_MAX_BIT_SIZE) return false; #ifdef MCL_USE_VINT @@ -534,7 +536,7 @@ bool Op::init(const mpz_class& _p, size_t maxBitSize, int _xi_a, Mode mode, size if (!b) return false; } modp.init(mp); - return fp::initForMont(*this, p, mode); + return fp::initForMont(*this, p, mode, suf); } void copyUnitToByteAsLE(uint8_t *dst, const Unit *src, size_t byteSize) diff --git a/src/fp_generator.hpp b/src/fp_generator.hpp index 97ce9ae..08d5844 100644 --- a/src/fp_generator.hpp +++ b/src/fp_generator.hpp @@ -23,8 +23,59 @@ #pragma warning(disable : 4458) #endif +//#define MCL_FREEZE_JIT + namespace mcl { +#ifdef MCL_FREEZE_JIT +struct Profiler { + FILE *fp_; + const uint8_t *prev_; + std::string suf_; + Profiler() + : fp_(0) + , prev_(0) + { + } + ~Profiler() + { + if (fp_) fclose(fp_); + } + void open(const std::string& fileName) + { + fp_ = fopen(fileName.c_str(), "wb"); + } + void setStartAddr(const uint8_t *addr) + { + prev_ = addr; + } + void setNameSuffix(const char *suf) + { + suf_ = suf; + } + void set(const char *name, const uint8_t *end) + { + fprintf(fp_, "global %s%s\n", suf_.c_str(), name); + fprintf(fp_, "align 16\n"); + fprintf(fp_, "%s%s:\n", suf_.c_str(), name); + const uint8_t *p = prev_; + size_t remain = end - prev_; + while (remain > 0) { + size_t n = remain >= 16 ? 16 : remain; + fprintf(fp_, "db "); + for (size_t i = 0; i < n; i++) { + fprintf(fp_, "0x%02x,", *p++); + } + fprintf(fp_, "\n"); + remain -= n; + } + prev_ = end; + } +}; +#else +typedef Xbyak::util::Profiler Profiler; +#endif + namespace fp_gen_local { class MemReg { @@ -203,7 +254,7 @@ struct FpGenerator : Xbyak::CodeGenerator { int pn_; int FpByte_; bool isFullBit_; - Xbyak::util::Profiler prof_; + Profiler prof_; /* @param op [in] ; use op.p, op.N, op.isFullBit @@ -242,12 +293,12 @@ struct FpGenerator : Xbyak::CodeGenerator { useMulx_ = cpu_.has(Xbyak::util::Cpu::tBMI2); useAdx_ = cpu_.has(Xbyak::util::Cpu::tADX); } - bool init(Op& op) + bool init(Op& op, const char *suf) { if (!cpu_.has(Xbyak::util::Cpu::tAVX)) return false; reset(); // reset jit code for reuse setProtectModeRW(); // read/write memory - init_inner(op); + init_inner(op, suf); // ToDo : recover op if false if (Xbyak::GetError()) return false; // printf("code size=%d\n", (int)getSize()); @@ -255,7 +306,7 @@ struct FpGenerator : Xbyak::CodeGenerator { return true; } private: - void init_inner(Op& op) + void init_inner(Op& op, const char *suf) { op_ = &op; L(pL_); @@ -269,7 +320,6 @@ private: isFullBit_ = op.isFullBit; // printf("p=%p, pn_=%d, isFullBit_=%d\n", p_, pn_, isFullBit_); #ifdef MCL_USE_PROF - static char suf[] = "_0"; int profMode = 0; #ifdef XBYAK_USE_VTUNE profMode = 2; @@ -281,89 +331,92 @@ private: if (profMode) { prof_.init(profMode); prof_.setStartAddr(getCurr()); + if (suf == 0) suf = "fp"; prof_.setNameSuffix(suf); suf[1]++; } +#else + (void)suf; #endif op.fp_addPre = gen_addSubPre(true, pn_); - prof_.set("Fp_addPre", getCurr()); + prof_.set("_addPre", getCurr()); op.fp_subPre = gen_addSubPre(false, pn_); - prof_.set("Fp_subPre", getCurr()); + prof_.set("_subPre", getCurr()); op.fp_addA_ = gen_fp_add(); - prof_.set("Fp_add", getCurr()); + prof_.set("_add", getCurr()); op.fp_subA_ = gen_fp_sub(); - prof_.set("Fp_sub", getCurr()); + prof_.set("_sub", getCurr()); op.fp_shr1 = gen_shr1(); - prof_.set("Fp_shr1", getCurr()); + prof_.set("_shr1", getCurr()); op.fp_negA_ = gen_fp_neg(); - prof_.set("Fp_neg", getCurr()); + prof_.set("_neg", getCurr()); op.fpDbl_addA_ = gen_fpDbl_add(); - prof_.set("FpDbl_add", getCurr()); + prof_.set("Dbl_add", getCurr()); op.fpDbl_subA_ = gen_fpDbl_sub(); - prof_.set("FpDbl_sub", getCurr()); + prof_.set("Dbl_sub", getCurr()); op.fpDbl_addPre = gen_addSubPre(true, pn_ * 2); - prof_.set("FpDbl_addPre", getCurr()); + prof_.set("Dbl_addPre", getCurr()); op.fpDbl_subPre = gen_addSubPre(false, pn_ * 2); - prof_.set("FpDbl_subPre", getCurr()); + prof_.set("Dbl_subPre", getCurr()); op.fpDbl_mulPreA_ = gen_fpDbl_mulPre(); - prof_.set("FpDbl_mulPre", getCurr()); + prof_.set("Dbl_mulPre", getCurr()); op.fpDbl_sqrPreA_ = gen_fpDbl_sqrPre(); - prof_.set("FpDbl_sqrPre", getCurr()); + prof_.set("Dbl_sqrPre", getCurr()); op.fpDbl_modA_ = gen_fpDbl_mod(op); - prof_.set("FpDbl_mod", getCurr()); + prof_.set("Dbl_mod", getCurr()); op.fp_mulA_ = gen_mul(); - prof_.set("Fp_mul", getCurr()); + prof_.set("_mul", getCurr()); if (op.fp_mulA_) { op.fp_mul = fp::func_ptr_cast(op.fp_mulA_); // used in toMont/fromMont } op.fp_sqrA_ = gen_sqr(); - prof_.set("Fp_sqr", getCurr()); + prof_.set("_sqr", getCurr()); if (op.primeMode != PM_NIST_P192 && op.N <= 4) { // support general op.N but not fast for op.N > 4 align(16); op.fp_preInv = getCurr(); gen_preInv(); - prof_.set("preInv", getCurr()); + prof_.set("_preInv", getCurr()); } if (op.xi_a == 0) return; // Fp2 is not used op.fp2_addA_ = gen_fp2_add(); - prof_.set("Fp2_add", getCurr()); + prof_.set("2_add", getCurr()); op.fp2_subA_ = gen_fp2_sub(); - prof_.set("Fp2_sub", getCurr()); + prof_.set("2_sub", getCurr()); op.fp2_negA_ = gen_fp2_neg(); - prof_.set("Fp2_neg", getCurr()); + prof_.set("2_neg", getCurr()); op.fp2_mulNF = 0; op.fp2Dbl_mulPreA_ = gen_fp2Dbl_mulPre(); - prof_.set("Fp2Dbl_mulPre", getCurr()); + prof_.set("2Dbl_mulPre", getCurr()); op.fp2Dbl_sqrPreA_ = gen_fp2Dbl_sqrPre(); - prof_.set("Fp2Dbl_sqrPre", getCurr()); + prof_.set("2Dbl_sqrPre", getCurr()); op.fp2_mulA_ = gen_fp2_mul(); - prof_.set("Fp2_mul", getCurr()); + prof_.set("2_mul", getCurr()); op.fp2_sqrA_ = gen_fp2_sqr(); - prof_.set("Fp2_sqr", getCurr()); + prof_.set("2_sqr", getCurr()); op.fp2_mul_xiA_ = gen_fp2_mul_xi(); - prof_.set("Fp2_mul_xi", getCurr()); + prof_.set("2_mul_xi", getCurr()); } u3u gen_addSubPre(bool isAdd, int n) { From 7146cfd0f425acb17f4aff1951dd307388da9075 Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Tue, 15 Sep 2020 16:43:55 +0900 Subject: [PATCH 02/15] add dump_code --- Makefile | 3 +++ src/dump_code.cpp | 7 +++++++ src/fp_generator.hpp | 26 +++++++++++++++++++++++--- 3 files changed, 33 insertions(+), 3 deletions(-) create mode 100644 src/dump_code.cpp diff --git a/Makefile b/Makefile index 346be6a..75cca34 100644 --- a/Makefile +++ b/Makefile @@ -237,6 +237,9 @@ endif $(GEN_EXE): src/gen.cpp src/llvm_gen.hpp $(CXX) -o $@ $< $(CFLAGS) +src/dump_code: src/dump_code.cpp src/fp.cpp src/fp_generator.hpp + $(CXX) -o $@ src/dump_code.cpp src/fp.cpp -I include -DMCL_FREEZE_JIT -DMCL_MAX_BIT_SIZE=384 -DMCL_DONT_USE_OPENSSL -DMCL_USE_VINT -DMCL_SIZEOF_UNIT=8 -DMCL_VINT_FIXED_BUFFER + asm: $(LLVM_SRC) $(LLVM_OPT) -O3 -o - $(LLVM_SRC) | $(LLVM_LLC) -O3 $(LLVM_FLAGS) -x86-asm-syntax=intel diff --git a/src/dump_code.cpp b/src/dump_code.cpp new file mode 100644 index 0000000..f1655e9 --- /dev/null +++ b/src/dump_code.cpp @@ -0,0 +1,7 @@ +#include + +int main() +{ + mcl::bn::initPairing(mcl::BLS12_381); +} + diff --git a/src/fp_generator.hpp b/src/fp_generator.hpp index 08d5844..5b00fa6 100644 --- a/src/fp_generator.hpp +++ b/src/fp_generator.hpp @@ -28,28 +28,31 @@ namespace mcl { #ifdef MCL_FREEZE_JIT +// not profiler, but dump jit code struct Profiler { FILE *fp_; const uint8_t *prev_; std::string suf_; Profiler() - : fp_(0) + : fp_(stdout) , prev_(0) { } ~Profiler() { - if (fp_) fclose(fp_); +// if (fp_) fclose(fp_); } +#if 0 void open(const std::string& fileName) { fp_ = fopen(fileName.c_str(), "wb"); } +#endif void setStartAddr(const uint8_t *addr) { prev_ = addr; } - void setNameSuffix(const char *suf) + void setNameSuffix(const std::string& suf) { suf_ = suf; } @@ -71,6 +74,18 @@ struct Profiler { } prev_ = end; } + void dumpData(const void *begin, const void *end) + { + fprintf(fp_, "align 16\n"); + fprintf(fp_, "dq "); + const uint64_t *p = (const uint64_t*)begin; + const uint64_t *pe = (const uint64_t*)end; + const size_t n = pe - p; + for (size_t i = 0; i < n; i++) { + fprintf(fp_, "0x%016llx,", (unsigned long long)*p++); + } + fprintf(fp_, "\n"); + } }; #else typedef Xbyak::util::Profiler Profiler; @@ -314,6 +329,11 @@ private: for (size_t i = 0; i < op.N; i++) { dq(op.p[i]); } +#ifdef MCL_FREEZE_JIT + prof_.dumpData(p_, getCurr()); + prof_.setStartAddr(getCurr()); + prof_.setNameSuffix(std::string("mclx_") + suf); +#endif rp_ = fp::getMontgomeryCoeff(p_[0]); pn_ = (int)op.N; FpByte_ = int(op.maxN * sizeof(uint64_t)); From c29157cc9a17a36e80cd73d4c32cd7d40220d508 Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Tue, 15 Sep 2020 17:23:08 +0900 Subject: [PATCH 03/15] test static_code --- Makefile | 13 +++++++++++-- src/fp_generator.hpp | 6 +++--- test/static_code_test.cpp | 11 +++++++++++ 3 files changed, 25 insertions(+), 5 deletions(-) create mode 100644 test/static_code_test.cpp diff --git a/Makefile b/Makefile index 75cca34..dbc2a41 100644 --- a/Makefile +++ b/Makefile @@ -238,8 +238,17 @@ $(GEN_EXE): src/gen.cpp src/llvm_gen.hpp $(CXX) -o $@ $< $(CFLAGS) src/dump_code: src/dump_code.cpp src/fp.cpp src/fp_generator.hpp - $(CXX) -o $@ src/dump_code.cpp src/fp.cpp -I include -DMCL_FREEZE_JIT -DMCL_MAX_BIT_SIZE=384 -DMCL_DONT_USE_OPENSSL -DMCL_USE_VINT -DMCL_SIZEOF_UNIT=8 -DMCL_VINT_FIXED_BUFFER + $(CXX) -o $@ src/dump_code.cpp src/fp.cpp -I include -DMCL_DUMP_JIT -DMCL_MAX_BIT_SIZE=384 -DMCL_DONT_USE_OPENSSL -DMCL_USE_VINT -DMCL_SIZEOF_UNIT=8 -DMCL_VINT_FIXED_BUFFER +src/static_code.asm: src/dump_code + $< > $@ + +obj/static_code.o: src/static_code.asm + nasm -felf64 -o $@ $< + +bin/static_code_test.exe: test/static_code_test.cpp src/fp.cpp obj/static_code.o + $(CXX) -o $@ -O3 $^ -DMCL_STATIC_JIT -DMCL_MAX_BIT_SIZE=384 -DMCL_DONT_USE_OPENSSL -DMCL_USE_VINT -DMCL_SIZEOF_UNIT=8 -DMCL_VINT_FIXED_BUFFER -I include -Wall -Wextra + asm: $(LLVM_SRC) $(LLVM_OPT) -O3 -o - $(LLVM_SRC) | $(LLVM_LLC) -O3 $(LLVM_FLAGS) -x86-asm-syntax=intel @@ -391,7 +400,7 @@ update_cybozulib: cp -a $(addprefix ../cybozulib/,$(wildcard include/cybozu/*.hpp)) include/cybozu/ clean: - $(RM) $(LIB_DIR)/*.a $(LIB_DIR)/*.$(LIB_SUF) $(OBJ_DIR)/*.o $(OBJ_DIR)/*.obj $(OBJ_DIR)/*.d $(EXE_DIR)/*.exe $(GEN_EXE) $(ASM_OBJ) $(LIB_OBJ) $(BN256_OBJ) $(BN384_OBJ) $(BN512_OBJ) $(FUNC_LIST) src/*.ll lib/*.a + $(RM) $(LIB_DIR)/*.a $(LIB_DIR)/*.$(LIB_SUF) $(OBJ_DIR)/*.o $(OBJ_DIR)/*.obj $(OBJ_DIR)/*.d $(EXE_DIR)/*.exe $(GEN_EXE) $(ASM_OBJ) $(LIB_OBJ) $(BN256_OBJ) $(BN384_OBJ) $(BN512_OBJ) $(FUNC_LIST) src/*.ll lib/*.a src/static_code.asm src/dump_code ALL_SRC=$(SRC_SRC) $(TEST_SRC) $(SAMPLE_SRC) DEPEND_FILE=$(addprefix $(OBJ_DIR)/, $(addsuffix .d,$(basename $(ALL_SRC)))) diff --git a/src/fp_generator.hpp b/src/fp_generator.hpp index 5b00fa6..7a3771f 100644 --- a/src/fp_generator.hpp +++ b/src/fp_generator.hpp @@ -23,11 +23,11 @@ #pragma warning(disable : 4458) #endif -//#define MCL_FREEZE_JIT +//#define MCL_DUMP_JIT namespace mcl { -#ifdef MCL_FREEZE_JIT +#ifdef MCL_DUMP_JIT // not profiler, but dump jit code struct Profiler { FILE *fp_; @@ -329,7 +329,7 @@ private: for (size_t i = 0; i < op.N; i++) { dq(op.p[i]); } -#ifdef MCL_FREEZE_JIT +#ifdef MCL_DUMP_JIT prof_.dumpData(p_, getCurr()); prof_.setStartAddr(getCurr()); prof_.setNameSuffix(std::string("mclx_") + suf); diff --git a/test/static_code_test.cpp b/test/static_code_test.cpp new file mode 100644 index 0000000..56d8420 --- /dev/null +++ b/test/static_code_test.cpp @@ -0,0 +1,11 @@ +#include + +using namespace mcl::bn; + +int main() +{ + initPairing(mcl::BLS12_381); + Fr x; + x = 3; + printf("%s\n", x.getStr(16).c_str()); +} From f11b3be1ab34abe0b15e56e605749030202f22ae Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Wed, 16 Sep 2020 17:39:28 +0900 Subject: [PATCH 04/15] refactor DumpCode --- src/fp_generator.hpp | 182 ++++++++++++++++++++------------------ test/static_code_test.cpp | 8 +- 2 files changed, 104 insertions(+), 86 deletions(-) diff --git a/src/fp_generator.hpp b/src/fp_generator.hpp index 7a3771f..2ce90ba 100644 --- a/src/fp_generator.hpp +++ b/src/fp_generator.hpp @@ -23,46 +23,34 @@ #pragma warning(disable : 4458) #endif -//#define MCL_DUMP_JIT - namespace mcl { +#ifdef MCL_STATIC_JIT +typedef fp::Unit Unit; +extern "C" { +Unit mclx_Fr_addPre(Unit*, const Unit*, const Unit*); +void mclx_Fr_add(Unit*, const Unit*, const Unit*); + +Unit mclx_Fp_addPre(Unit*, const Unit*, const Unit*); +void mclx_Fp_add(Unit*, const Unit*, const Unit*); +} +#endif + #ifdef MCL_DUMP_JIT -// not profiler, but dump jit code -struct Profiler { +struct DumpCode { FILE *fp_; - const uint8_t *prev_; - std::string suf_; - Profiler() + DumpCode() : fp_(stdout) - , prev_(0) - { - } - ~Profiler() - { -// if (fp_) fclose(fp_); - } -#if 0 - void open(const std::string& fileName) { - fp_ = fopen(fileName.c_str(), "wb"); } -#endif - void setStartAddr(const uint8_t *addr) + void set(const std::string& name, const uint8_t *begin, const size_t size) { - prev_ = addr; - } - void setNameSuffix(const std::string& suf) - { - suf_ = suf; - } - void set(const char *name, const uint8_t *end) - { - fprintf(fp_, "global %s%s\n", suf_.c_str(), name); + fprintf(fp_, "segment .text\n"); + fprintf(fp_, "global %s\n", name.c_str()); fprintf(fp_, "align 16\n"); - fprintf(fp_, "%s%s:\n", suf_.c_str(), name); - const uint8_t *p = prev_; - size_t remain = end - prev_; + fprintf(fp_, "%s:\n", name.c_str()); + const uint8_t *p = begin; + size_t remain = size; while (remain > 0) { size_t n = remain >= 16 ? 16 : remain; fprintf(fp_, "db "); @@ -72,7 +60,6 @@ struct Profiler { fprintf(fp_, "\n"); remain -= n; } - prev_ = end; } void dumpData(const void *begin, const void *end) { @@ -87,8 +74,19 @@ struct Profiler { fprintf(fp_, "\n"); } }; +template +void setFuncInfo(DumpCode& prof, const char *suf, const char *name, const T& begin, const uint8_t* end) +{ + const uint8_t*p = (const uint8_t*)begin; + prof.set(std::string("mclx_") + suf + name, p, end - p); +} #else -typedef Xbyak::util::Profiler Profiler; +template +void setFuncInfo(Xbyak::util::Profiler& prof, const char *suf, const char *name, const T& begin, const uint8_t* end) +{ + const uint8_t*p = (const uint8_t*)begin; + prof.set((std::string("mclx_") + suf + name).c_str(), p, end - p); +} #endif namespace fp_gen_local { @@ -269,7 +267,11 @@ struct FpGenerator : Xbyak::CodeGenerator { int pn_; int FpByte_; bool isFullBit_; - Profiler prof_; +#ifdef MCL_DUMP_JIT + DumpCode prof_; +#else + Xbyak::util::Profiler prof_; +#endif /* @param op [in] ; use op.p, op.N, op.isFullBit @@ -331,8 +333,6 @@ private: } #ifdef MCL_DUMP_JIT prof_.dumpData(p_, getCurr()); - prof_.setStartAddr(getCurr()); - prof_.setNameSuffix(std::string("mclx_") + suf); #endif rp_ = fp::getMontgomeryCoeff(p_[0]); pn_ = (int)op.N; @@ -351,97 +351,130 @@ private: if (profMode) { prof_.init(profMode); prof_.setStartAddr(getCurr()); - if (suf == 0) suf = "fp"; - prof_.setNameSuffix(suf); - suf[1]++; } #else (void)suf; #endif + align(16); op.fp_addPre = gen_addSubPre(true, pn_); - prof_.set("_addPre", getCurr()); + setFuncInfo(prof_, suf, "_addPre", op.fp_addPre, getCurr()); + align(16); op.fp_subPre = gen_addSubPre(false, pn_); - prof_.set("_subPre", getCurr()); + setFuncInfo(prof_, suf, "_subPre", op.fp_subPre, getCurr()); + align(16); op.fp_addA_ = gen_fp_add(); - prof_.set("_add", getCurr()); + setFuncInfo(prof_, suf, "_add", op.fp_addA_, getCurr()); op.fp_subA_ = gen_fp_sub(); - prof_.set("_sub", getCurr()); + setFuncInfo(prof_, suf, "_sub", op.fp_subA_, getCurr()); + align(16); op.fp_shr1 = gen_shr1(); - prof_.set("_shr1", getCurr()); + setFuncInfo(prof_, suf, "_shr1", op.fp_shr1, getCurr()); + align(16); op.fp_negA_ = gen_fp_neg(); - prof_.set("_neg", getCurr()); + setFuncInfo(prof_, suf, "_neg", op.fp_negA_, getCurr()); + align(16); op.fpDbl_addA_ = gen_fpDbl_add(); - prof_.set("Dbl_add", getCurr()); + setFuncInfo(prof_, suf, "Dbl_add", op.fpDbl_addA_, getCurr()); + align(16); op.fpDbl_subA_ = gen_fpDbl_sub(); - prof_.set("Dbl_sub", getCurr()); + setFuncInfo(prof_, suf, "Dbl_sub", op.fpDbl_subA_, getCurr()); + align(16); op.fpDbl_addPre = gen_addSubPre(true, pn_ * 2); - prof_.set("Dbl_addPre", getCurr()); + setFuncInfo(prof_, suf, "Dbl_addPre", op.fpDbl_addPre, getCurr()); + align(16); op.fpDbl_subPre = gen_addSubPre(false, pn_ * 2); - prof_.set("Dbl_subPre", getCurr()); + setFuncInfo(prof_, suf, "Dbl_subPre", op.fpDbl_subPre, getCurr()); + align(16); op.fpDbl_mulPreA_ = gen_fpDbl_mulPre(); - prof_.set("Dbl_mulPre", getCurr()); + setFuncInfo(prof_, suf, "Dbl_mulPre", op.fpDbl_mulPreA_, getCurr()); + align(16); op.fpDbl_sqrPreA_ = gen_fpDbl_sqrPre(); - prof_.set("Dbl_sqrPre", getCurr()); + setFuncInfo(prof_, suf, "Dbl_sqrPre", op.fpDbl_sqrPreA_, getCurr()); + align(16); op.fpDbl_modA_ = gen_fpDbl_mod(op); - prof_.set("Dbl_mod", getCurr()); + setFuncInfo(prof_, suf, "Dbl_mod", op.fpDbl_modA_, getCurr()); + align(16); op.fp_mulA_ = gen_mul(); - prof_.set("_mul", getCurr()); + setFuncInfo(prof_, suf, "_mul", op.fp_mulA_, getCurr()); + align(16); + if (op.fp_mulA_) { op.fp_mul = fp::func_ptr_cast(op.fp_mulA_); // used in toMont/fromMont } + op.fp_sqrA_ = gen_sqr(); - prof_.set("_sqr", getCurr()); + setFuncInfo(prof_, suf, "_sqr", op.fp_sqrA_, getCurr()); + align(16); if (op.primeMode != PM_NIST_P192 && op.N <= 4) { // support general op.N but not fast for op.N > 4 - align(16); op.fp_preInv = getCurr(); gen_preInv(); - prof_.set("_preInv", getCurr()); + setFuncInfo(prof_, suf, "_preInv", op.fp_preInv, getCurr()); + align(16); } if (op.xi_a == 0) return; // Fp2 is not used op.fp2_addA_ = gen_fp2_add(); - prof_.set("2_add", getCurr()); + setFuncInfo(prof_, suf, "2_add", op.fp2_addA_, getCurr()); + align(16); op.fp2_subA_ = gen_fp2_sub(); - prof_.set("2_sub", getCurr()); + setFuncInfo(prof_, suf, "2_sub", op.fp2_subA_, getCurr()); + align(16); op.fp2_negA_ = gen_fp2_neg(); - prof_.set("2_neg", getCurr()); + setFuncInfo(prof_, suf, "2_neg", op.fp2_negA_, getCurr()); + align(16); op.fp2_mulNF = 0; op.fp2Dbl_mulPreA_ = gen_fp2Dbl_mulPre(); - prof_.set("2Dbl_mulPre", getCurr()); + if (op.fp2Dbl_mulPreA_) setFuncInfo(prof_, suf, "2Dbl_mulPre", op.fp2Dbl_mulPreA_, getCurr()); + align(16); op.fp2Dbl_sqrPreA_ = gen_fp2Dbl_sqrPre(); - prof_.set("2Dbl_sqrPre", getCurr()); + if (op.fp2Dbl_sqrPreA_) setFuncInfo(prof_, suf, "2Dbl_sqrPre", op.fp2Dbl_sqrPreA_, getCurr()); + align(16); op.fp2_mulA_ = gen_fp2_mul(); - prof_.set("2_mul", getCurr()); + setFuncInfo(prof_, suf, "2_mul", op.fp2_mulA_, getCurr()); + align(16); op.fp2_sqrA_ = gen_fp2_sqr(); - prof_.set("2_sqr", getCurr()); + setFuncInfo(prof_, suf, "2_sqr", op.fp2_sqrA_, getCurr()); + align(16); op.fp2_mul_xiA_ = gen_fp2_mul_xi(); - prof_.set("2_mul_xi", getCurr()); + setFuncInfo(prof_, suf, "2_mul_xi", op.fp2_mul_xiA_, getCurr()); + align(16); + +#ifdef MCL_STATIC_JIT + const bool isFp = strcmp(suf, "Fp") == 0; +printf("isFp=%d\n", isFp); + if (isFp) { + op.fp_addPre = mclx_Fp_addPre; + op.fp_addA_ = mclx_Fr_add; + } else { + op.fp_addPre = mclx_Fr_addPre; + op.fp_addA_ = mclx_Fr_add; + } +#endif } u3u gen_addSubPre(bool isAdd, int n) { // if (isFullBit_) return 0; - align(16); u3u func = getCurr(); StackFrame sf(this, 3); if (isAdd) { @@ -721,7 +754,6 @@ private: } void3u gen_fp_add() { - align(16); void3u func = getCurr(); if (pn_ <= 4) { gen_fp_add_le4(); @@ -769,7 +801,6 @@ private: } void3u gen_fpDbl_add() { - align(16); void3u func = getCurr(); if (pn_ <= 4) { int tn = pn_ * 2 + (isFullBit_ ? 1 : 0); @@ -797,7 +828,6 @@ private: } void3u gen_fpDbl_sub() { - align(16); void3u func = getCurr(); if (pn_ <= 4) { int tn = pn_ * 2; @@ -847,7 +877,6 @@ private: } void3u gen_fp_sub() { - align(16); void3u func = getCurr(); if (pn_ <= 4) { gen_fp_sub_le4(); @@ -872,7 +901,6 @@ private: } void2u gen_fp_neg() { - align(16); void2u func = getCurr(); StackFrame sf(this, 2, UseRDX | pn_); gen_raw_neg(sf.p[0], sf.p[1], sf.t); @@ -880,7 +908,6 @@ private: } void2u gen_shr1() { - align(16); void2u func = getCurr(); const int c = 1; StackFrame sf(this, 2, 1); @@ -901,7 +928,6 @@ private: } void3u gen_mul() { - align(16); void3u func = getCurr(); if (op_->primeMode == PM_NIST_P192) { StackFrame sf(this, 3, 10 | UseRDX, 8 * 6); @@ -1214,7 +1240,6 @@ private: } void2u gen_fpDbl_mod(const fp::Op& op) { - align(16); void2u func = getCurr(); if (op.primeMode == PM_NIST_P192) { StackFrame sf(this, 2, 6 | UseRDX); @@ -1260,7 +1285,6 @@ private: } void2u gen_sqr() { - align(16); void2u func = getCurr(); if (op_->primeMode == PM_NIST_P192) { StackFrame sf(this, 3, 10 | UseRDX, 6 * 8); @@ -2364,7 +2388,6 @@ private: } void2u gen_fpDbl_sqrPre() { - align(16); void2u func = getCurr(); if (pn_ == 2 && useMulx_) { StackFrame sf(this, 2, 7 | UseRDX); @@ -2405,7 +2428,6 @@ private: } void3u gen_fpDbl_mulPre() { - align(16); void3u func = getCurr(); if (pn_ == 2 && useMulx_) { StackFrame sf(this, 3, 5 | UseRDX); @@ -3446,7 +3468,6 @@ private: // if (pn_ != 4 && !(pn_ == 6 && useMulx_ && useAdx_)) return 0; // almost same for pn_ == 6 if (pn_ != 4) return 0; - align(16); void3u func = getCurr(); const RegExp z = rsp + 0 * 8; @@ -3511,7 +3532,6 @@ private: // if (pn_ != 4 && !(pn_ == 6 && useMulx_ && useAdx_)) return 0; // almost same for pn_ == 6 if (pn_ != 4) return 0; - align(16); void2u func = getCurr(); // almost same for pn_ == 6 if (pn_ != 4) return 0; @@ -3597,7 +3617,6 @@ private: } void3u gen_fp2_add() { - align(16); void3u func = getCurr(); if (pn_ == 4 && !isFullBit_) { gen_fp2_add4(); @@ -3611,7 +3630,6 @@ private: } void3u gen_fp2_sub() { - align(16); void3u func = getCurr(); if (pn_ == 4 && !isFullBit_) { gen_fp2_sub4(); @@ -3697,7 +3715,6 @@ private: { if (isFullBit_) return 0; if (op_->xi_a != 1) return 0; - align(16); void2u func = getCurr(); if (pn_ == 4) { gen_fp2_mul_xi4(); @@ -3711,7 +3728,6 @@ private: } void2u gen_fp2_neg() { - align(16); void2u func = getCurr(); if (pn_ <= 6) { StackFrame sf(this, 2, UseRDX | pn_); @@ -3725,7 +3741,6 @@ private: { if (isFullBit_) return 0; if (pn_ != 4 && !(pn_ == 6 && useMulx_ && useAdx_)) return 0; - align(16); void3u func = getCurr(); bool embedded = pn_ == 4; @@ -3802,7 +3817,6 @@ private: { if (isFullBit_) return 0; if (pn_ != 4 && !(pn_ == 6 && useMulx_ && useAdx_)) return 0; - align(16); void2u func = getCurr(); const RegExp y = rsp + 0 * 8; diff --git a/test/static_code_test.cpp b/test/static_code_test.cpp index 56d8420..93dc223 100644 --- a/test/static_code_test.cpp +++ b/test/static_code_test.cpp @@ -5,7 +5,11 @@ using namespace mcl::bn; int main() { initPairing(mcl::BLS12_381); - Fr x; + Fp x, y, z; x = 3; - printf("%s\n", x.getStr(16).c_str()); + y = 5; + z = x + y; + printf("x=%s\n", x.getStr(16).c_str()); + printf("y=%s\n", y.getStr(16).c_str()); + printf("z=%s\n", z.getStr(16).c_str()); } From 0c6b2c59b8630a3e74aca267d546b52e3c3a1ad4 Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Wed, 16 Sep 2020 17:49:01 +0900 Subject: [PATCH 05/15] use rip instead of abs addr --- src/fp_generator.hpp | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/src/fp_generator.hpp b/src/fp_generator.hpp index 2ce90ba..a7536e6 100644 --- a/src/fp_generator.hpp +++ b/src/fp_generator.hpp @@ -77,6 +77,7 @@ struct DumpCode { template void setFuncInfo(DumpCode& prof, const char *suf, const char *name, const T& begin, const uint8_t* end) { + if (suf == 0) suf = ""; const uint8_t*p = (const uint8_t*)begin; prof.set(std::string("mclx_") + suf + name, p, end - p); } @@ -84,6 +85,7 @@ void setFuncInfo(DumpCode& prof, const char *suf, const char *name, const T& beg template void setFuncInfo(Xbyak::util::Profiler& prof, const char *suf, const char *name, const T& begin, const uint8_t* end) { + if (suf == 0) suf = ""; const uint8_t*p = (const uint8_t*)begin; prof.set((std::string("mclx_") + suf + name).c_str(), p, end - p); } @@ -535,7 +537,7 @@ printf("isFp=%d\n", isFp); } jmp(exit); L(nonZero); - mov(rax, pL_); + lea(rax, ptr[rip+pL_]); for (size_t i = 0; i < t.size(); i++) { mov(rdx, ptr [rax + i * 8]); if (i == 0) { @@ -663,7 +665,7 @@ printf("isFp=%d\n", isFp); mov(*fullReg, 0); adc(*fullReg, 0); } - mov(rax, pL_); + lea(rax, ptr[rip+pL_]); sub_rm(p1, rax); if (fullReg) { sbb(*fullReg, 0); @@ -683,7 +685,7 @@ printf("isFp=%d\n", isFp); const Pack& p1 = t.sub(pn_, pn_); load_rm(p0, px); sub_rm(p0, py, withCarry); - mov(rax, pL_); + lea(rax, ptr[rip+pL_]); load_rm(p1, rax); sbb(rax, rax); // rax = (x > y) ? 0 : -1 for (size_t i = 0; i < p1.size(); i++) { @@ -724,7 +726,7 @@ printf("isFp=%d\n", isFp); Label exit; if (isFullBit_) { jnc("@f"); - mov(t2[0], pL_); // t2 is not used + lea(t2[0], ptr[rip+pL_]); // t2[0] is not used sub_rm(t1, t2[0]); jmp(exit); L("@@"); @@ -771,7 +773,7 @@ printf("isFp=%d\n", isFp); inLocalLabel(); gen_raw_add(pz, px, py, rax, pn_); - mov(px, pL_); // destroy px + lea(px, ptr[rip+pL_]); if (isFullBit_) { jc(".over", jmpMode); } @@ -894,7 +896,7 @@ printf("isFp=%d\n", isFp); Label exit; gen_raw_sub(pz, px, py, rax, pn_); jnc(exit, jmpMode); - mov(px, pL_); + lea(px, ptr[rip+pL_]); gen_raw_add(pz, pz, px, rax, pn_); L(exit); return func; @@ -1000,7 +1002,7 @@ printf("isFp=%d\n", isFp); mov(a, rp_); mul(t6); - mov(t0, pL_); + lea(t0, ptr[rip+pL_]); mov(t7, a); // q // [d:t7:t1] = p * q @@ -1069,7 +1071,7 @@ printf("isFp=%d\n", isFp); mov(a, rp_); mul(t10); - mov(t0, pL_); + lea(t0, ptr[rip+pL_]); mov(t7, a); // q // [d:t7:t2:t1] = p * q @@ -1149,7 +1151,7 @@ printf("isFp=%d\n", isFp); mov(a, rp_); mul(z); - mov(t0, pL_); + lea(t0, ptr[rip+pL_]); mov(t7, a); // q // [d:t7:t3:t2:t1] = p * q @@ -1405,7 +1407,7 @@ printf("isFp=%d\n", isFp); L(fp_mulL); vmovq(xm0, p0); // save p0 - mov(p0, pL_); + lea(p0, ptr[rip+pL_]); vmovq(xm1, p2); mov(p2, ptr [p2]); montgomery4_1(rp_, t0, t7, t3, t2, t1, p1, p2, p0, t4, t5, t6, t8, t9, true, xm2); @@ -1501,7 +1503,7 @@ printf("isFp=%d\n", isFp); mov(a, rp_); mul(c[0]); // q = a mov(d, a); - mov(t1, pL_); + lea(t1, ptr[rip+pL_]); // c += p * q mulAdd(c, 6, t1); } @@ -1547,7 +1549,7 @@ printf("isFp=%d\n", isFp); const Pack z = Pack(t3, t2, t1, t0, t7, t6); const Pack keep = Pack(rdx, rax, px, py, t8, t9); mov_rr(keep, z); - mov(t5, pL_); + lea(t5, ptr[rip+pL_]); sub_rm(z, t5); cmovc_rr(z, keep); store_mr(pz, z); @@ -1577,7 +1579,7 @@ printf("isFp=%d\n", isFp); const Reg64& t9 = sf.t[9]; vmovq(xm0, p0); // save p0 - mov(t7, pL_); + lea(t7, ptr[rip+pL_]); mov(t9, ptr [p2]); // c3, c2, c1, c0, px, y, p, montgomery3_1(rp_, t0, t3, t2, t1, p1, t9, t7, t4, t5, t6, t8, p0, true); @@ -1623,7 +1625,7 @@ printf("isFp=%d\n", isFp); const Reg64& t9 = sf.t[9]; vmovq(xm0, pz); // save pz - mov(t7, pL_); + lea(t7, ptr[rip+pL_]); mov(t9, ptr [px]); mul3x1_sqr1(px, t9, t3, t2, t1, t0); mov(t0, rdx); From 3768ebfedf27cbf94b572ad900131e931bed7268 Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Wed, 16 Sep 2020 18:39:41 +0900 Subject: [PATCH 06/15] test mclx_Fp_mul --- src/fp_generator.hpp | 93 ++++++++++++++++++++++++++++++++++++--- test/bench.hpp | 1 + test/static_code_test.cpp | 35 ++++++++++++--- 3 files changed, 118 insertions(+), 11 deletions(-) diff --git a/src/fp_generator.hpp b/src/fp_generator.hpp index a7536e6..2feaf7b 100644 --- a/src/fp_generator.hpp +++ b/src/fp_generator.hpp @@ -28,11 +28,53 @@ namespace mcl { #ifdef MCL_STATIC_JIT typedef fp::Unit Unit; extern "C" { -Unit mclx_Fr_addPre(Unit*, const Unit*, const Unit*); -void mclx_Fr_add(Unit*, const Unit*, const Unit*); - Unit mclx_Fp_addPre(Unit*, const Unit*, const Unit*); +Unit mclx_Fp_subPre(Unit*, const Unit*, const Unit*); void mclx_Fp_add(Unit*, const Unit*, const Unit*); +void mclx_Fp_sub(Unit*, const Unit*, const Unit*); +void mclx_Fp_shr1(Unit*, const Unit*); +void mclx_Fp_neg(Unit*, const Unit*); +void mclx_FpDbl_add(Unit*, const Unit*, const Unit*); +void mclx_FpDbl_sub(Unit*, const Unit*, const Unit*); +void mclx_FpDbl_add(Unit*, const Unit*, const Unit*); +void mclx_FpDbl_sub(Unit*, const Unit*, const Unit*); +Unit mclx_FpDbl_addPre(Unit*, const Unit*, const Unit*); +Unit mclx_FpDbl_subPre(Unit*, const Unit*, const Unit*); +void mclx_FpDbl_mulPre(Unit*, const Unit*, const Unit*); +void mclx_FpDbl_sqrPre(Unit*, const Unit*); +void mclx_FpDbl_mod(Unit*, const Unit*); +void mclx_Fp_mul(Unit*, const Unit*, const Unit*); +void mclx_Fp_sqr(Unit*, const Unit*); +void mclx_Fp2_add(Unit*, const Unit*, const Unit*); +void mclx_Fp2_sub(Unit*, const Unit*, const Unit*); +void mclx_Fp2_neg(Unit*, const Unit*); +void mclx_Fp2_mul(Unit*, const Unit*, const Unit*); +void mclx_Fp2_sqr(Unit*, const Unit*); +void mclx_Fp2_mul_xi(Unit*, const Unit*); + +Unit mclx_Fr_addPre(Unit*, const Unit*, const Unit*); +Unit mclx_Fr_subPre(Unit*, const Unit*, const Unit*); +void mclx_Fr_add(Unit*, const Unit*, const Unit*); +void mclx_Fr_sub(Unit*, const Unit*, const Unit*); +void mclx_Fr_shr1(Unit*, const Unit*); +void mclx_Fr_neg(Unit*, const Unit*); +void mclx_FrDbl_add(Unit*, const Unit*, const Unit*); +void mclx_FrDbl_sub(Unit*, const Unit*, const Unit*); +void mclx_FrDbl_add(Unit*, const Unit*, const Unit*); +void mclx_FrDbl_sub(Unit*, const Unit*, const Unit*); +Unit mclx_FrDbl_addPre(Unit*, const Unit*, const Unit*); +Unit mclx_FrDbl_subPre(Unit*, const Unit*, const Unit*); +void mclx_FrDbl_mulPre(Unit*, const Unit*, const Unit*); +void mclx_FrDbl_sqrPre(Unit*, const Unit*); +void mclx_FrDbl_mod(Unit*, const Unit*); +void mclx_Fr_mul(Unit*, const Unit*, const Unit*); +void mclx_Fr_sqr(Unit*, const Unit*); +void mclx_Fr2_add(Unit*, const Unit*, const Unit*); +void mclx_Fr2_sub(Unit*, const Unit*, const Unit*); +void mclx_Fr2_neg(Unit*, const Unit*); +void mclx_Fr2_mul(Unit*, const Unit*, const Unit*); +void mclx_Fr2_sqr(Unit*, const Unit*); +void mclx_Fr2_mul_xi(Unit*, const Unit*); } #endif @@ -327,6 +369,7 @@ struct FpGenerator : Xbyak::CodeGenerator { private: void init_inner(Op& op, const char *suf) { + const bool isFp = suf && suf[0] == 'F' && suf[1] == 'p'; op_ = &op; L(pL_); p_ = reinterpret_cast(getCurr()); @@ -382,6 +425,7 @@ private: setFuncInfo(prof_, suf, "_neg", op.fp_negA_, getCurr()); align(16); +if (op.xi_a) { op.fpDbl_addA_ = gen_fpDbl_add(); setFuncInfo(prof_, suf, "Dbl_add", op.fpDbl_addA_, getCurr()); align(16); @@ -409,6 +453,7 @@ private: op.fpDbl_modA_ = gen_fpDbl_mod(op); setFuncInfo(prof_, suf, "Dbl_mod", op.fpDbl_modA_, getCurr()); align(16); +} op.fp_mulA_ = gen_mul(); setFuncInfo(prof_, suf, "_mul", op.fp_mulA_, getCurr()); @@ -463,14 +508,50 @@ private: align(16); #ifdef MCL_STATIC_JIT - const bool isFp = strcmp(suf, "Fp") == 0; -printf("isFp=%d\n", isFp); if (isFp) { + // Fp, sizeof(Fp) = 48 op.fp_addPre = mclx_Fp_addPre; - op.fp_addA_ = mclx_Fr_add; + op.fp_subPre = mclx_Fp_subPre; + op.fp_addA_ = mclx_Fp_add; + op.fp_subA_ = mclx_Fp_sub; + op.fp_shr1 = mclx_Fp_shr1; + op.fp_negA_ = mclx_Fp_neg; + op.fpDbl_addA_ = mclx_FpDbl_add; + op.fpDbl_subA_ = mclx_FpDbl_sub; + op.fpDbl_addPre = mclx_FpDbl_addPre; + op.fpDbl_subPre = mclx_FpDbl_subPre; + op.fpDbl_mulPreA_ = mclx_FpDbl_mulPre; + op.fpDbl_sqrPreA_ = mclx_FpDbl_sqrPre; + op.fpDbl_modA_ = mclx_FpDbl_mod; + op.fp_mulA_ = mclx_Fp_mul; + op.fp_sqrA_ = mclx_Fp_sqr; +#if 0 +// op.fp_preInv = mclx_Fp_preInv; + op.fp2_addA_ = mclx_Fp2_add; + op.fp2_subA_ = mclx_Fp2_sub; + op.fp2_negA_ = mclx_Fp2_neg; + op.fp2_mulA_ = mclx_Fp2_mul; + op.fp2_sqrA_ = mclx_Fp2_sqr; + op.fp2_mul_xiA_ = mclx_Fp2_mul_xi; +#endif } else { + // Fr, sizeof(Fr) = 32 op.fp_addPre = mclx_Fr_addPre; + op.fp_subPre = mclx_Fr_subPre; op.fp_addA_ = mclx_Fr_add; + op.fp_subA_ = mclx_Fr_sub; + op.fp_shr1 = mclx_Fr_shr1; + op.fp_negA_ = mclx_Fr_neg; + op.fpDbl_addA_ = mclx_FpDbl_add; + op.fpDbl_subA_ = mclx_FpDbl_sub; + op.fpDbl_addPre = mclx_FpDbl_addPre; + op.fpDbl_subPre = mclx_FpDbl_subPre; + op.fpDbl_mulPreA_ = mclx_FpDbl_mulPre; + op.fpDbl_sqrPreA_ = mclx_FpDbl_sqrPre; + op.fpDbl_modA_ = mclx_FpDbl_mod; + op.fp_mulA_ = mclx_Fr_mul; + op.fp_sqrA_ = mclx_Fr_sqr; + op.fp_preInv = mclx_Fr_preInv; } #endif } diff --git a/test/bench.hpp b/test/bench.hpp index c8c3911..b4a8bd2 100644 --- a/test/bench.hpp +++ b/test/bench.hpp @@ -100,6 +100,7 @@ void testBench(const G1& P, const G2& Q) CYBOZU_BENCH_C("Fp::mul ", C3, Fp::mul, x, x, y); CYBOZU_BENCH_C("Fp::sqr ", C3, Fp::sqr, x, x); CYBOZU_BENCH_C("Fp::inv ", C3, Fp::inv, x, x); + CYBOZU_BENCH_C("Fp::pow ", C3, Fp::pow, x, x, y); Fp2 xx, yy; xx.a = x; xx.b = 3; diff --git a/test/static_code_test.cpp b/test/static_code_test.cpp index 93dc223..e69fda7 100644 --- a/test/static_code_test.cpp +++ b/test/static_code_test.cpp @@ -2,14 +2,39 @@ using namespace mcl::bn; -int main() +void testFr() +{ + Fr x, y, z; + x = 3; + y = 5; + z = x + y; + printf("x=%s\n", x.getStr().c_str()); + printf("y=%s\n", y.getStr().c_str()); + printf("z=%s\n", z.getStr().c_str()); + z = x * y; + printf("z=%s\n", z.getStr().c_str()); + Fr::sqr(z, x); + printf("z=%s\n", z.getStr().c_str()); +} + +void testFp() { - initPairing(mcl::BLS12_381); Fp x, y, z; x = 3; y = 5; z = x + y; - printf("x=%s\n", x.getStr(16).c_str()); - printf("y=%s\n", y.getStr(16).c_str()); - printf("z=%s\n", z.getStr(16).c_str()); + printf("x=%s\n", x.getStr().c_str()); + printf("y=%s\n", y.getStr().c_str()); + printf("z=%s\n", z.getStr().c_str()); + z = x * y; + printf("z=%s\n", z.getStr().c_str()); + Fp::sqr(z, x); + printf("z=%s\n", z.getStr().c_str()); +} + +int main() +{ + initPairing(mcl::BLS12_381); + testFr(); + testFp(); } From df3e118538c40072e6b1a4cab65ff07d18b62fd2 Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Thu, 17 Sep 2020 10:16:53 +0900 Subject: [PATCH 07/15] remove suf --- include/mcl/bn.hpp | 4 +- include/mcl/fp.hpp | 12 ++--- include/mcl/op.hpp | 2 +- src/fp.cpp | 10 ++-- src/fp_generator.hpp | 118 ++++++++++++++++--------------------------- 5 files changed, 56 insertions(+), 90 deletions(-) diff --git a/include/mcl/bn.hpp b/include/mcl/bn.hpp index 8710f55..3668da2 100644 --- a/include/mcl/bn.hpp +++ b/include/mcl/bn.hpp @@ -875,9 +875,9 @@ struct Param { assert((p % 6) == 1); r = local::evalPoly(z, rCoff); } - Fr::init(pb, r, mode, "Fr"); + Fr::init(pb, r, mode); if (!*pb) return; - Fp::init(pb, cp.xi_a, p, mode, "Fp"); + Fp::init(pb, cp.xi_a, p, mode); if (!*pb) return; Fp2::init(); const Fp2 xi(cp.xi_a, 1); diff --git a/include/mcl/fp.hpp b/include/mcl/fp.hpp index c8b5a6d..6c5b0b0 100644 --- a/include/mcl/fp.hpp +++ b/include/mcl/fp.hpp @@ -130,10 +130,10 @@ public: xi_a is used for Fp2::mul_xi(), where xi = xi_a + i and i^2 = -1 if xi_a = 0 then asm functions for Fp2 are not generated. */ - static inline void init(bool *pb, int xi_a, const mpz_class& p, fp::Mode mode = fp::FP_AUTO, const char *suf = 0) + static inline void init(bool *pb, int xi_a, const mpz_class& p, fp::Mode mode = fp::FP_AUTO) { assert(maxBitSize <= MCL_MAX_BIT_SIZE); - *pb = op_.init(p, maxBitSize, xi_a, mode, suf); + *pb = op_.init(p, maxBitSize, xi_a, mode); if (!*pb) return; { // set oneRep FpT& one = *reinterpret_cast(op_.oneRep); @@ -163,16 +163,16 @@ public: #endif *pb = true; } - static inline void init(bool *pb, const mpz_class& p, fp::Mode mode = fp::FP_AUTO, const char *suf = 0) + static inline void init(bool *pb, const mpz_class& p, fp::Mode mode = fp::FP_AUTO) { - init(pb, 0, p, mode, suf); + init(pb, 0, p, mode); } - static inline void init(bool *pb, const char *mstr, fp::Mode mode = fp::FP_AUTO, const char *suf = 0) + static inline void init(bool *pb, const char *mstr, fp::Mode mode = fp::FP_AUTO) { mpz_class p; gmp::setStr(pb, p, mstr); if (!*pb) return; - init(pb, p, mode, suf); + init(pb, p, mode); } static inline size_t getModulo(char *buf, size_t bufSize) { diff --git a/include/mcl/op.hpp b/include/mcl/op.hpp index 45320e5..99c0e4d 100644 --- a/include/mcl/op.hpp +++ b/include/mcl/op.hpp @@ -364,7 +364,7 @@ struct Op { */ fp_mul(y, x, R2, p); } - bool init(const mpz_class& p, size_t maxBitSize, int xi_a, Mode mode, const char *suf = 0, size_t mclMaxBitSize = MCL_MAX_BIT_SIZE); + bool init(const mpz_class& p, size_t maxBitSize, int xi_a, Mode mode, size_t mclMaxBitSize = MCL_MAX_BIT_SIZE); #ifdef MCL_USE_XBYAK static FpGenerator* createFpGenerator(); static void destroyFpGenerator(FpGenerator *fg); diff --git a/src/fp.cpp b/src/fp.cpp index 998a53b..b3b07d1 100644 --- a/src/fp.cpp +++ b/src/fp.cpp @@ -346,7 +346,7 @@ static void initInvTbl(Op& op) } #endif -static bool initForMont(Op& op, const Unit *p, Mode mode, const char *suf) +static bool initForMont(Op& op, const Unit *p, Mode mode) { const size_t N = op.N; bool b; @@ -366,19 +366,17 @@ static bool initForMont(Op& op, const Unit *p, Mode mode, const char *suf) if (mode != FP_XBYAK) return true; #ifdef MCL_USE_XBYAK if (op.fg == 0) op.fg = Op::createFpGenerator(); - bool useXbyak = op.fg->init(op, suf); + bool useXbyak = op.fg->init(op); if (useXbyak && op.isMont && N <= 4) { op.fp_invOp = &invOpForMontC; initInvTbl(op); } -#else - (void)suf; #endif return true; } -bool Op::init(const mpz_class& _p, size_t maxBitSize, int _xi_a, Mode mode, const char *suf, size_t mclMaxBitSize) +bool Op::init(const mpz_class& _p, size_t maxBitSize, int _xi_a, Mode mode, size_t mclMaxBitSize) { if (mclMaxBitSize != MCL_MAX_BIT_SIZE) return false; #ifdef MCL_USE_VINT @@ -536,7 +534,7 @@ bool Op::init(const mpz_class& _p, size_t maxBitSize, int _xi_a, Mode mode, cons if (!b) return false; } modp.init(mp); - return fp::initForMont(*this, p, mode, suf); + return fp::initForMont(*this, p, mode); } void copyUnitToByteAsLE(uint8_t *dst, const Unit *src, size_t byteSize) diff --git a/src/fp_generator.hpp b/src/fp_generator.hpp index 2feaf7b..f4a626a 100644 --- a/src/fp_generator.hpp +++ b/src/fp_generator.hpp @@ -34,6 +34,8 @@ void mclx_Fp_add(Unit*, const Unit*, const Unit*); void mclx_Fp_sub(Unit*, const Unit*, const Unit*); void mclx_Fp_shr1(Unit*, const Unit*); void mclx_Fp_neg(Unit*, const Unit*); +void mclx_Fp_mul(Unit*, const Unit*, const Unit*); +void mclx_Fp_sqr(Unit*, const Unit*); void mclx_FpDbl_add(Unit*, const Unit*, const Unit*); void mclx_FpDbl_sub(Unit*, const Unit*, const Unit*); void mclx_FpDbl_add(Unit*, const Unit*, const Unit*); @@ -43,8 +45,6 @@ Unit mclx_FpDbl_subPre(Unit*, const Unit*, const Unit*); void mclx_FpDbl_mulPre(Unit*, const Unit*, const Unit*); void mclx_FpDbl_sqrPre(Unit*, const Unit*); void mclx_FpDbl_mod(Unit*, const Unit*); -void mclx_Fp_mul(Unit*, const Unit*, const Unit*); -void mclx_Fp_sqr(Unit*, const Unit*); void mclx_Fp2_add(Unit*, const Unit*, const Unit*); void mclx_Fp2_sub(Unit*, const Unit*, const Unit*); void mclx_Fp2_neg(Unit*, const Unit*); @@ -58,23 +58,9 @@ void mclx_Fr_add(Unit*, const Unit*, const Unit*); void mclx_Fr_sub(Unit*, const Unit*, const Unit*); void mclx_Fr_shr1(Unit*, const Unit*); void mclx_Fr_neg(Unit*, const Unit*); -void mclx_FrDbl_add(Unit*, const Unit*, const Unit*); -void mclx_FrDbl_sub(Unit*, const Unit*, const Unit*); -void mclx_FrDbl_add(Unit*, const Unit*, const Unit*); -void mclx_FrDbl_sub(Unit*, const Unit*, const Unit*); -Unit mclx_FrDbl_addPre(Unit*, const Unit*, const Unit*); -Unit mclx_FrDbl_subPre(Unit*, const Unit*, const Unit*); -void mclx_FrDbl_mulPre(Unit*, const Unit*, const Unit*); -void mclx_FrDbl_sqrPre(Unit*, const Unit*); -void mclx_FrDbl_mod(Unit*, const Unit*); void mclx_Fr_mul(Unit*, const Unit*, const Unit*); void mclx_Fr_sqr(Unit*, const Unit*); -void mclx_Fr2_add(Unit*, const Unit*, const Unit*); -void mclx_Fr2_sub(Unit*, const Unit*, const Unit*); -void mclx_Fr2_neg(Unit*, const Unit*); -void mclx_Fr2_mul(Unit*, const Unit*, const Unit*); -void mclx_Fr2_sqr(Unit*, const Unit*); -void mclx_Fr2_mul_xi(Unit*, const Unit*); +int mclx_Fr_preInv(Unit*, const Unit*); } #endif @@ -354,12 +340,12 @@ struct FpGenerator : Xbyak::CodeGenerator { useMulx_ = cpu_.has(Xbyak::util::Cpu::tBMI2); useAdx_ = cpu_.has(Xbyak::util::Cpu::tADX); } - bool init(Op& op, const char *suf) + bool init(Op& op) { if (!cpu_.has(Xbyak::util::Cpu::tAVX)) return false; reset(); // reset jit code for reuse setProtectModeRW(); // read/write memory - init_inner(op, suf); + init_inner(op); // ToDo : recover op if false if (Xbyak::GetError()) return false; // printf("code size=%d\n", (int)getSize()); @@ -367,9 +353,9 @@ struct FpGenerator : Xbyak::CodeGenerator { return true; } private: - void init_inner(Op& op, const char *suf) + void init_inner(Op& op) { - const bool isFp = suf && suf[0] == 'F' && suf[1] == 'p'; + const char *suf = op.xi_a ? "Fp" : "Fr"; op_ = &op; L(pL_); p_ = reinterpret_cast(getCurr()); @@ -413,102 +399,100 @@ private: op.fp_addA_ = gen_fp_add(); setFuncInfo(prof_, suf, "_add", op.fp_addA_, getCurr()); + align(16); op.fp_subA_ = gen_fp_sub(); setFuncInfo(prof_, suf, "_sub", op.fp_subA_, getCurr()); - align(16); + align(16); op.fp_shr1 = gen_shr1(); setFuncInfo(prof_, suf, "_shr1", op.fp_shr1, getCurr()); - align(16); + align(16); op.fp_negA_ = gen_fp_neg(); setFuncInfo(prof_, suf, "_neg", op.fp_negA_, getCurr()); + align(16); + op.fp_mulA_ = gen_mul(); + setFuncInfo(prof_, suf, "_mul", op.fp_mulA_, getCurr()); + + if (op.fp_mulA_) { + op.fp_mul = fp::func_ptr_cast(op.fp_mulA_); // used in toMont/fromMont + } -if (op.xi_a) { + align(16); + op.fp_sqrA_ = gen_sqr(); + setFuncInfo(prof_, suf, "_sqr", op.fp_sqrA_, getCurr()); + + if (op.primeMode != PM_NIST_P192 && op.N <= 4) { // support general op.N but not fast for op.N > 4 + align(16); + op.fp_preInv = getCurr(); + gen_preInv(); + setFuncInfo(prof_, suf, "_preInv", op.fp_preInv, getCurr()); + } + if (op.xi_a == 0) return; // Fp2 is not used + align(16); op.fpDbl_addA_ = gen_fpDbl_add(); setFuncInfo(prof_, suf, "Dbl_add", op.fpDbl_addA_, getCurr()); - align(16); + align(16); op.fpDbl_subA_ = gen_fpDbl_sub(); setFuncInfo(prof_, suf, "Dbl_sub", op.fpDbl_subA_, getCurr()); - align(16); + align(16); op.fpDbl_addPre = gen_addSubPre(true, pn_ * 2); setFuncInfo(prof_, suf, "Dbl_addPre", op.fpDbl_addPre, getCurr()); - align(16); + align(16); op.fpDbl_subPre = gen_addSubPre(false, pn_ * 2); setFuncInfo(prof_, suf, "Dbl_subPre", op.fpDbl_subPre, getCurr()); - align(16); + align(16); op.fpDbl_mulPreA_ = gen_fpDbl_mulPre(); setFuncInfo(prof_, suf, "Dbl_mulPre", op.fpDbl_mulPreA_, getCurr()); - align(16); + align(16); op.fpDbl_sqrPreA_ = gen_fpDbl_sqrPre(); setFuncInfo(prof_, suf, "Dbl_sqrPre", op.fpDbl_sqrPreA_, getCurr()); - align(16); + align(16); op.fpDbl_modA_ = gen_fpDbl_mod(op); setFuncInfo(prof_, suf, "Dbl_mod", op.fpDbl_modA_, getCurr()); - align(16); -} - op.fp_mulA_ = gen_mul(); - setFuncInfo(prof_, suf, "_mul", op.fp_mulA_, getCurr()); align(16); - - if (op.fp_mulA_) { - op.fp_mul = fp::func_ptr_cast(op.fp_mulA_); // used in toMont/fromMont - } - - op.fp_sqrA_ = gen_sqr(); - setFuncInfo(prof_, suf, "_sqr", op.fp_sqrA_, getCurr()); - align(16); - - if (op.primeMode != PM_NIST_P192 && op.N <= 4) { // support general op.N but not fast for op.N > 4 - op.fp_preInv = getCurr(); - gen_preInv(); - setFuncInfo(prof_, suf, "_preInv", op.fp_preInv, getCurr()); - align(16); - } - if (op.xi_a == 0) return; // Fp2 is not used op.fp2_addA_ = gen_fp2_add(); setFuncInfo(prof_, suf, "2_add", op.fp2_addA_, getCurr()); - align(16); + align(16); op.fp2_subA_ = gen_fp2_sub(); setFuncInfo(prof_, suf, "2_sub", op.fp2_subA_, getCurr()); - align(16); + align(16); op.fp2_negA_ = gen_fp2_neg(); setFuncInfo(prof_, suf, "2_neg", op.fp2_negA_, getCurr()); - align(16); op.fp2_mulNF = 0; + align(16); op.fp2Dbl_mulPreA_ = gen_fp2Dbl_mulPre(); if (op.fp2Dbl_mulPreA_) setFuncInfo(prof_, suf, "2Dbl_mulPre", op.fp2Dbl_mulPreA_, getCurr()); - align(16); + align(16); op.fp2Dbl_sqrPreA_ = gen_fp2Dbl_sqrPre(); if (op.fp2Dbl_sqrPreA_) setFuncInfo(prof_, suf, "2Dbl_sqrPre", op.fp2Dbl_sqrPreA_, getCurr()); - align(16); + align(16); op.fp2_mulA_ = gen_fp2_mul(); setFuncInfo(prof_, suf, "2_mul", op.fp2_mulA_, getCurr()); - align(16); + align(16); op.fp2_sqrA_ = gen_fp2_sqr(); setFuncInfo(prof_, suf, "2_sqr", op.fp2_sqrA_, getCurr()); - align(16); + align(16); op.fp2_mul_xiA_ = gen_fp2_mul_xi(); setFuncInfo(prof_, suf, "2_mul_xi", op.fp2_mul_xiA_, getCurr()); - align(16); #ifdef MCL_STATIC_JIT - if (isFp) { + if (op.xi_a) { // Fp, sizeof(Fp) = 48 op.fp_addPre = mclx_Fp_addPre; op.fp_subPre = mclx_Fp_subPre; @@ -525,15 +509,6 @@ if (op.xi_a) { op.fpDbl_modA_ = mclx_FpDbl_mod; op.fp_mulA_ = mclx_Fp_mul; op.fp_sqrA_ = mclx_Fp_sqr; -#if 0 -// op.fp_preInv = mclx_Fp_preInv; - op.fp2_addA_ = mclx_Fp2_add; - op.fp2_subA_ = mclx_Fp2_sub; - op.fp2_negA_ = mclx_Fp2_neg; - op.fp2_mulA_ = mclx_Fp2_mul; - op.fp2_sqrA_ = mclx_Fp2_sqr; - op.fp2_mul_xiA_ = mclx_Fp2_mul_xi; -#endif } else { // Fr, sizeof(Fr) = 32 op.fp_addPre = mclx_Fr_addPre; @@ -542,13 +517,6 @@ if (op.xi_a) { op.fp_subA_ = mclx_Fr_sub; op.fp_shr1 = mclx_Fr_shr1; op.fp_negA_ = mclx_Fr_neg; - op.fpDbl_addA_ = mclx_FpDbl_add; - op.fpDbl_subA_ = mclx_FpDbl_sub; - op.fpDbl_addPre = mclx_FpDbl_addPre; - op.fpDbl_subPre = mclx_FpDbl_subPre; - op.fpDbl_mulPreA_ = mclx_FpDbl_mulPre; - op.fpDbl_sqrPreA_ = mclx_FpDbl_sqrPre; - op.fpDbl_modA_ = mclx_FpDbl_mod; op.fp_mulA_ = mclx_Fr_mul; op.fp_sqrA_ = mclx_Fr_sqr; op.fp_preInv = mclx_Fr_preInv; From 11a752cb6cefded11603a54d5be594fd81626a44 Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Thu, 17 Sep 2020 10:41:07 +0900 Subject: [PATCH 08/15] Fr, Fp, Fp2 test ok --- Makefile | 2 +- src/fp_generator.hpp | 10 +++----- test/static_code_test.cpp | 53 +++++++++++++++++++++------------------ 3 files changed, 34 insertions(+), 31 deletions(-) diff --git a/Makefile b/Makefile index dbc2a41..9e37876 100644 --- a/Makefile +++ b/Makefile @@ -238,7 +238,7 @@ $(GEN_EXE): src/gen.cpp src/llvm_gen.hpp $(CXX) -o $@ $< $(CFLAGS) src/dump_code: src/dump_code.cpp src/fp.cpp src/fp_generator.hpp - $(CXX) -o $@ src/dump_code.cpp src/fp.cpp -I include -DMCL_DUMP_JIT -DMCL_MAX_BIT_SIZE=384 -DMCL_DONT_USE_OPENSSL -DMCL_USE_VINT -DMCL_SIZEOF_UNIT=8 -DMCL_VINT_FIXED_BUFFER + $(CXX) -o $@ src/dump_code.cpp src/fp.cpp -g -I include -DMCL_DUMP_JIT -DMCL_MAX_BIT_SIZE=384 -DMCL_DONT_USE_OPENSSL -DMCL_USE_VINT -DMCL_SIZEOF_UNIT=8 -DMCL_VINT_FIXED_BUFFER src/static_code.asm: src/dump_code $< > $@ diff --git a/src/fp_generator.hpp b/src/fp_generator.hpp index f4a626a..b5d4628 100644 --- a/src/fp_generator.hpp +++ b/src/fp_generator.hpp @@ -410,7 +410,9 @@ private: align(16); op.fp_negA_ = gen_fp_neg(); setFuncInfo(prof_, suf, "_neg", op.fp_negA_, getCurr()); - + align(16); + op.fpDbl_modA_ = gen_fpDbl_mod(op); + setFuncInfo(prof_, suf, "Dbl_mod", op.fpDbl_modA_, getCurr()); align(16); op.fp_mulA_ = gen_mul(); setFuncInfo(prof_, suf, "_mul", op.fp_mulA_, getCurr()); @@ -454,10 +456,6 @@ private: op.fpDbl_sqrPreA_ = gen_fpDbl_sqrPre(); setFuncInfo(prof_, suf, "Dbl_sqrPre", op.fpDbl_sqrPreA_, getCurr()); - align(16); - op.fpDbl_modA_ = gen_fpDbl_mod(op); - setFuncInfo(prof_, suf, "Dbl_mod", op.fpDbl_modA_, getCurr()); - align(16); op.fp2_addA_ = gen_fp2_add(); setFuncInfo(prof_, suf, "2_add", op.fp2_addA_, getCurr()); @@ -493,7 +491,7 @@ private: #ifdef MCL_STATIC_JIT if (op.xi_a) { - // Fp, sizeof(Fp) = 48 + // Fp, sizeof(Fp) = 48, supports Fp2 op.fp_addPre = mclx_Fp_addPre; op.fp_subPre = mclx_Fp_subPre; op.fp_addA_ = mclx_Fp_add; diff --git a/test/static_code_test.cpp b/test/static_code_test.cpp index e69fda7..8238e48 100644 --- a/test/static_code_test.cpp +++ b/test/static_code_test.cpp @@ -1,40 +1,45 @@ +#include #include using namespace mcl::bn; -void testFr() +CYBOZU_TEST_AUTO(init) { - Fr x, y, z; + initPairing(mcl::BLS12_381); +} + +CYBOZU_TEST_AUTO(Fr) +{ + Fr x, y; x = 3; y = 5; - z = x + y; - printf("x=%s\n", x.getStr().c_str()); - printf("y=%s\n", y.getStr().c_str()); - printf("z=%s\n", z.getStr().c_str()); - z = x * y; - printf("z=%s\n", z.getStr().c_str()); - Fr::sqr(z, x); - printf("z=%s\n", z.getStr().c_str()); + CYBOZU_TEST_EQUAL(x + y, 8); + CYBOZU_TEST_EQUAL(x - y, -2); + CYBOZU_TEST_EQUAL(x * y, 15); } -void testFp() +CYBOZU_TEST_AUTO(Fp) { - Fp x, y, z; + Fp x, y; x = 3; y = 5; - z = x + y; - printf("x=%s\n", x.getStr().c_str()); - printf("y=%s\n", y.getStr().c_str()); - printf("z=%s\n", z.getStr().c_str()); - z = x * y; - printf("z=%s\n", z.getStr().c_str()); - Fp::sqr(z, x); - printf("z=%s\n", z.getStr().c_str()); + CYBOZU_TEST_EQUAL(x + y, 8); + CYBOZU_TEST_EQUAL(x - y, -2); + CYBOZU_TEST_EQUAL(x * y, 15); } -int main() +CYBOZU_TEST_AUTO(Fp2) { - initPairing(mcl::BLS12_381); - testFr(); - testFp(); + Fp2 x, y; + x.a = 3; + x.b = 2; + y.a = 1; + y.b = 4; + /* + (3+2i)(1+4i)=3-8+(12+2)i + */ + CYBOZU_TEST_EQUAL(x + y, Fp2(4, 6)); + CYBOZU_TEST_EQUAL(x - y, Fp2(2, -2)); + CYBOZU_TEST_EQUAL(x * y, Fp2(-5, 14)); } + From a522fd532d207e698665efa631166a26e96b9f35 Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Thu, 17 Sep 2020 11:07:09 +0900 Subject: [PATCH 09/15] test of G1 and G2 --- test/static_code_test.cpp | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/test/static_code_test.cpp b/test/static_code_test.cpp index 8238e48..3751741 100644 --- a/test/static_code_test.cpp +++ b/test/static_code_test.cpp @@ -43,3 +43,28 @@ CYBOZU_TEST_AUTO(Fp2) CYBOZU_TEST_EQUAL(x * y, Fp2(-5, 14)); } +CYBOZU_TEST_AUTO(G1) +{ + G1 P, Q; + hashAndMapToG1(P, "abc", 3); + Fr r1, r2; + r1.setHashOf("abc", 3); + r2 = -r1; + G1::mul(Q, P, r1); + Q = -Q; + P *= r2; + CYBOZU_TEST_EQUAL(P, Q); +} + +CYBOZU_TEST_AUTO(G2) +{ + G2 P, Q; + hashAndMapToG2(P, "abc", 3); + Fr r1, r2; + r1.setHashOf("abc", 3); + r2 = -r1; + G2::mul(Q, P, r1); + Q = -Q; + P *= r2; + CYBOZU_TEST_EQUAL(P, Q); +} From eaabb2337b011fb4989752a42fcf2d4eefa65fcf Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Thu, 17 Sep 2020 16:44:20 +0900 Subject: [PATCH 10/15] fix pic code --- Makefile | 2 +- include/mcl/op.hpp | 7 +++- sample/bench.cpp | 4 +- sample/rawbench.cpp | 2 +- src/fp.cpp | 25 ++++++++++-- src/fp_generator.hpp | 84 +++------------------------------------- src/fp_static_code.hpp | 87 ++++++++++++++++++++++++++++++++++++++++++ src/low_func.hpp | 2 +- test/ec_test.cpp | 2 +- test/fp_test.cpp | 6 +-- test/fp_tower_test.cpp | 2 +- 11 files changed, 131 insertions(+), 92 deletions(-) create mode 100644 src/fp_static_code.hpp diff --git a/Makefile b/Makefile index 9e37876..1b59ce7 100644 --- a/Makefile +++ b/Makefile @@ -247,7 +247,7 @@ obj/static_code.o: src/static_code.asm nasm -felf64 -o $@ $< bin/static_code_test.exe: test/static_code_test.cpp src/fp.cpp obj/static_code.o - $(CXX) -o $@ -O3 $^ -DMCL_STATIC_JIT -DMCL_MAX_BIT_SIZE=384 -DMCL_DONT_USE_OPENSSL -DMCL_USE_VINT -DMCL_SIZEOF_UNIT=8 -DMCL_VINT_FIXED_BUFFER -I include -Wall -Wextra + $(CXX) -o $@ -O3 $^ -g -DMCL_DONT_USE_XBYAK -DMCL_STATIC_CODE -DMCL_MAX_BIT_SIZE=384 -DMCL_DONT_USE_OPENSSL -DMCL_USE_VINT -DMCL_SIZEOF_UNIT=8 -DMCL_VINT_FIXED_BUFFER -I include -Wall -Wextra asm: $(LLVM_SRC) $(LLVM_OPT) -O3 -o - $(LLVM_SRC) | $(LLVM_LLC) -O3 $(LLVM_FLAGS) -x86-asm-syntax=intel diff --git a/include/mcl/op.hpp b/include/mcl/op.hpp index 99c0e4d..22a78b1 100644 --- a/include/mcl/op.hpp +++ b/include/mcl/op.hpp @@ -16,6 +16,9 @@ #endif #if !defined(MCL_DONT_USE_XBYAK) && (defined(_WIN64) || defined(__x86_64__)) && (MCL_SIZEOF_UNIT == 8) #define MCL_USE_XBYAK +#endif +#if defined(MCL_USE_XBYAK) || defined(MCL_STATIC_CODE) + #define MCL_X64_ASM #define MCL_XBYAK_DIRECT_CALL #endif @@ -202,6 +205,8 @@ struct Op { Unit R3[maxUnitSize]; #ifdef MCL_USE_XBYAK FpGenerator *fg; +#endif +#ifdef MCL_X64_ASM mcl::Array invTbl; #endif void3u fp_addA_; @@ -288,7 +293,7 @@ struct Op { memset(one, 0, sizeof(one)); memset(R2, 0, sizeof(R2)); memset(R3, 0, sizeof(R3)); -#ifdef MCL_USE_XBYAK +#ifdef MCL_X64_ASM invTbl.clear(); #endif fp_addA_ = 0; diff --git a/sample/bench.cpp b/sample/bench.cpp index de81f25..d3c101c 100644 --- a/sample/bench.cpp +++ b/sample/bench.cpp @@ -68,7 +68,7 @@ void benchFp(size_t bitSize, int mode) if (mode & 4) benchFpSub(tbl[i].p, tbl[i].x, tbl[i].y, mcl::fp::FP_LLVM); if (mode & 8) benchFpSub(tbl[i].p, tbl[i].x, tbl[i].y, mcl::fp::FP_LLVM_MONT); #endif -#ifdef MCL_USE_XBYAK +#ifdef MCL_X64_ASM if (mode & 16) benchFpSub(tbl[i].p, tbl[i].x, tbl[i].y, mcl::fp::FP_XBYAK); #endif } @@ -122,7 +122,7 @@ void benchEc(size_t bitSize, int mode, mcl::ec::Mode ecMode) if (mode & 4) benchEcSub(tbl[i], mcl::fp::FP_LLVM, ecMode); if (mode & 8) benchEcSub(tbl[i], mcl::fp::FP_LLVM_MONT, ecMode); #endif -#ifdef MCL_USE_XBYAK +#ifdef MCL_X64_ASM if (mode & 16) benchEcSub(tbl[i], mcl::fp::FP_XBYAK, ecMode); #endif } diff --git a/sample/rawbench.cpp b/sample/rawbench.cpp index 4d7506e..cc74bc3 100644 --- a/sample/rawbench.cpp +++ b/sample/rawbench.cpp @@ -168,7 +168,7 @@ int main(int argc, char *argv[]) benchRaw(tbl[i], mcl::fp::FP_LLVM); benchRaw(tbl[i], mcl::fp::FP_LLVM_MONT); #endif -#ifdef MCL_USE_XBYAK +#ifdef MCL_X64_ASM if (bitSize <= 384) { benchRaw(tbl[i], mcl::fp::FP_XBYAK); } diff --git a/src/fp.cpp b/src/fp.cpp index b3b07d1..ab3a1a7 100644 --- a/src/fp.cpp +++ b/src/fp.cpp @@ -3,12 +3,14 @@ #include #include #include +#ifdef MCL_STATIC_CODE +#include "fp_static_code.hpp" +#endif #ifdef MCL_USE_XBYAK #include "fp_generator.hpp" #else #define XBYAK_ONLY_CLASS_CPU #include "xbyak/xbyak_util.h" -//#include "detect_cpu.hpp" #endif #include "low_func.hpp" #ifdef MCL_USE_LLVM @@ -315,7 +317,7 @@ void setOp(Op& op, Mode mode) #endif } -#ifdef MCL_USE_XBYAK +#ifdef MCL_X64_ASM inline void invOpForMontC(Unit *y, const Unit *x, const Op& op) { Unit r[maxUnitSize]; @@ -372,6 +374,12 @@ static bool initForMont(Op& op, const Unit *p, Mode mode) op.fp_invOp = &invOpForMontC; initInvTbl(op); } +#elif defined(MCL_STATIC_CODE) + fp::setStaticCode(op); + if (op.isMont && N <= 4) { + op.fp_invOp = &invOpForMontC; + initInvTbl(op); + } #endif return true; } @@ -403,14 +411,25 @@ bool Op::init(const mpz_class& _p, size_t maxBitSize, int _xi_a, Mode mode, size priority : MCL_USE_XBYAK > MCL_USE_LLVM > none Xbyak > llvm_mont > llvm > gmp_mont > gmp */ -#ifdef MCL_USE_XBYAK +#ifdef MCL_X64_ASM if (mode == FP_AUTO) mode = FP_XBYAK; if (mode == FP_XBYAK && bitSize > 384) { mode = FP_AUTO; } +#ifdef MCL_USE_XBYAK if (!isEnableJIT()) { mode = FP_AUTO; } +#elif MCL_STATIC_CODE + { + // static jit code uses avx, mulx, adox, adcx + using namespace Xbyak::util; + Cpu cpu; + if (!(cpu.has(Cpu::tAVX) && cpu.has(Cpu::tBMI2) && cpu.has(Cpu::tADX))) { + mode = FP_AUTO; + } + } +#endif #else if (mode == FP_XBYAK) mode = FP_AUTO; #endif diff --git a/src/fp_generator.hpp b/src/fp_generator.hpp index b5d4628..4243368 100644 --- a/src/fp_generator.hpp +++ b/src/fp_generator.hpp @@ -7,7 +7,6 @@ http://opensource.org/licenses/BSD-3-Clause */ #if CYBOZU_HOST == CYBOZU_HOST_INTEL -#define XBYAK_NO_OP_NAMES #define XBYAK_DISABLE_AVX512 #include "xbyak/xbyak_util.h" @@ -25,45 +24,6 @@ namespace mcl { -#ifdef MCL_STATIC_JIT -typedef fp::Unit Unit; -extern "C" { -Unit mclx_Fp_addPre(Unit*, const Unit*, const Unit*); -Unit mclx_Fp_subPre(Unit*, const Unit*, const Unit*); -void mclx_Fp_add(Unit*, const Unit*, const Unit*); -void mclx_Fp_sub(Unit*, const Unit*, const Unit*); -void mclx_Fp_shr1(Unit*, const Unit*); -void mclx_Fp_neg(Unit*, const Unit*); -void mclx_Fp_mul(Unit*, const Unit*, const Unit*); -void mclx_Fp_sqr(Unit*, const Unit*); -void mclx_FpDbl_add(Unit*, const Unit*, const Unit*); -void mclx_FpDbl_sub(Unit*, const Unit*, const Unit*); -void mclx_FpDbl_add(Unit*, const Unit*, const Unit*); -void mclx_FpDbl_sub(Unit*, const Unit*, const Unit*); -Unit mclx_FpDbl_addPre(Unit*, const Unit*, const Unit*); -Unit mclx_FpDbl_subPre(Unit*, const Unit*, const Unit*); -void mclx_FpDbl_mulPre(Unit*, const Unit*, const Unit*); -void mclx_FpDbl_sqrPre(Unit*, const Unit*); -void mclx_FpDbl_mod(Unit*, const Unit*); -void mclx_Fp2_add(Unit*, const Unit*, const Unit*); -void mclx_Fp2_sub(Unit*, const Unit*, const Unit*); -void mclx_Fp2_neg(Unit*, const Unit*); -void mclx_Fp2_mul(Unit*, const Unit*, const Unit*); -void mclx_Fp2_sqr(Unit*, const Unit*); -void mclx_Fp2_mul_xi(Unit*, const Unit*); - -Unit mclx_Fr_addPre(Unit*, const Unit*, const Unit*); -Unit mclx_Fr_subPre(Unit*, const Unit*, const Unit*); -void mclx_Fr_add(Unit*, const Unit*, const Unit*); -void mclx_Fr_sub(Unit*, const Unit*, const Unit*); -void mclx_Fr_shr1(Unit*, const Unit*); -void mclx_Fr_neg(Unit*, const Unit*); -void mclx_Fr_mul(Unit*, const Unit*, const Unit*); -void mclx_Fr_sqr(Unit*, const Unit*); -int mclx_Fr_preInv(Unit*, const Unit*); -} -#endif - #ifdef MCL_DUMP_JIT struct DumpCode { FILE *fp_; @@ -488,38 +448,6 @@ private: align(16); op.fp2_mul_xiA_ = gen_fp2_mul_xi(); setFuncInfo(prof_, suf, "2_mul_xi", op.fp2_mul_xiA_, getCurr()); - -#ifdef MCL_STATIC_JIT - if (op.xi_a) { - // Fp, sizeof(Fp) = 48, supports Fp2 - op.fp_addPre = mclx_Fp_addPre; - op.fp_subPre = mclx_Fp_subPre; - op.fp_addA_ = mclx_Fp_add; - op.fp_subA_ = mclx_Fp_sub; - op.fp_shr1 = mclx_Fp_shr1; - op.fp_negA_ = mclx_Fp_neg; - op.fpDbl_addA_ = mclx_FpDbl_add; - op.fpDbl_subA_ = mclx_FpDbl_sub; - op.fpDbl_addPre = mclx_FpDbl_addPre; - op.fpDbl_subPre = mclx_FpDbl_subPre; - op.fpDbl_mulPreA_ = mclx_FpDbl_mulPre; - op.fpDbl_sqrPreA_ = mclx_FpDbl_sqrPre; - op.fpDbl_modA_ = mclx_FpDbl_mod; - op.fp_mulA_ = mclx_Fp_mul; - op.fp_sqrA_ = mclx_Fp_sqr; - } else { - // Fr, sizeof(Fr) = 32 - op.fp_addPre = mclx_Fr_addPre; - op.fp_subPre = mclx_Fr_subPre; - op.fp_addA_ = mclx_Fr_add; - op.fp_subA_ = mclx_Fr_sub; - op.fp_shr1 = mclx_Fr_shr1; - op.fp_negA_ = mclx_Fr_neg; - op.fp_mulA_ = mclx_Fr_mul; - op.fp_sqrA_ = mclx_Fr_sqr; - op.fp_preInv = mclx_Fr_preInv; - } -#endif } u3u gen_addSubPre(bool isAdd, int n) { @@ -2774,7 +2702,7 @@ private: mov(rax, px); // px is free frome here load_mp(vv, rax, t); // v = x - mov(rax, pL_); + lea(rax, ptr[rip+pL_]); load_mp(uu, rax, t); // u = p_ // k = 0 xor_(rax, rax); @@ -2852,7 +2780,7 @@ private: const Reg64& t2 = ss.getReg(0); const Reg64& t3 = rdx; - mov(t2, pL_); + lea(t2, ptr[rip+pL_]); if (isFullBit_) { mov(t, ptr [rTop]); test(t, t); @@ -3724,7 +3652,7 @@ private: } } sub_rr(a, b); - mov(rax, pL_); + lea(rax, ptr[rip+pL_]); load_rm(b, rax); sbb(rax, rax); for (int i = 0; i < pn_; i++) { @@ -3732,7 +3660,7 @@ private: } add_rr(a, b); store_mr(py, a); - mov(rax, pL_); + lea(rax, ptr[rip+pL_]); mov_rr(a, t); sub_rm(t, rax); cmovc_rr(t, a); @@ -3750,7 +3678,7 @@ private: mov_rr(b, a); add_rm(b, px + FpByte_); sub_rm(a, px + FpByte_); - mov(rax, pL_); + lea(rax, ptr[rip+pL_]); jnc("@f"); add_rm(a, rax); L("@@"); @@ -3925,7 +3853,7 @@ private: mov(ptr [(RegExp)t2 + i * 8], rax); } // t3 = a + p - b - mov(rax, pL_); + lea(rax, ptr[rip+pL_]); add_rm(a, rax); sub_rr(a, b); store_mr(t3, a); diff --git a/src/fp_static_code.hpp b/src/fp_static_code.hpp new file mode 100644 index 0000000..0da39cb --- /dev/null +++ b/src/fp_static_code.hpp @@ -0,0 +1,87 @@ +#pragma once +/** + @file + @brief Fp generator + @author MITSUNARI Shigeo(@herumi) + @license modified new BSD license + http://opensource.org/licenses/BSD-3-Clause +*/ +#ifndef MCL_STATIC_CODE + #error "define MCL_STATIC_CODE" +#endif + +namespace mcl { namespace fp { + +extern "C" { + +Unit mclx_Fp_addPre(Unit*, const Unit*, const Unit*); +Unit mclx_Fp_subPre(Unit*, const Unit*, const Unit*); +void mclx_Fp_add(Unit*, const Unit*, const Unit*); +void mclx_Fp_sub(Unit*, const Unit*, const Unit*); +void mclx_Fp_shr1(Unit*, const Unit*); +void mclx_Fp_neg(Unit*, const Unit*); +void mclx_Fp_mul(Unit*, const Unit*, const Unit*); +void mclx_Fp_sqr(Unit*, const Unit*); +void mclx_FpDbl_add(Unit*, const Unit*, const Unit*); +void mclx_FpDbl_sub(Unit*, const Unit*, const Unit*); +void mclx_FpDbl_add(Unit*, const Unit*, const Unit*); +void mclx_FpDbl_sub(Unit*, const Unit*, const Unit*); +Unit mclx_FpDbl_addPre(Unit*, const Unit*, const Unit*); +Unit mclx_FpDbl_subPre(Unit*, const Unit*, const Unit*); +void mclx_FpDbl_mulPre(Unit*, const Unit*, const Unit*); +void mclx_FpDbl_sqrPre(Unit*, const Unit*); +void mclx_FpDbl_mod(Unit*, const Unit*); +void mclx_Fp2_add(Unit*, const Unit*, const Unit*); +void mclx_Fp2_sub(Unit*, const Unit*, const Unit*); +void mclx_Fp2_neg(Unit*, const Unit*); +void mclx_Fp2_mul(Unit*, const Unit*, const Unit*); +void mclx_Fp2_sqr(Unit*, const Unit*); +void mclx_Fp2_mul_xi(Unit*, const Unit*); + +Unit mclx_Fr_addPre(Unit*, const Unit*, const Unit*); +Unit mclx_Fr_subPre(Unit*, const Unit*, const Unit*); +void mclx_Fr_add(Unit*, const Unit*, const Unit*); +void mclx_Fr_sub(Unit*, const Unit*, const Unit*); +void mclx_Fr_shr1(Unit*, const Unit*); +void mclx_Fr_neg(Unit*, const Unit*); +void mclx_Fr_mul(Unit*, const Unit*, const Unit*); +void mclx_Fr_sqr(Unit*, const Unit*); +int mclx_Fr_preInv(Unit*, const Unit*); +} // extern "C" + +void setStaticCode(mcl::fp::Op& op) +{ + if (op.xi_a) { + // Fp, sizeof(Fp) = 48, supports Fp2 + op.fp_addPre = mclx_Fp_addPre; + op.fp_subPre = mclx_Fp_subPre; + op.fp_addA_ = mclx_Fp_add; + op.fp_subA_ = mclx_Fp_sub; + op.fp_shr1 = mclx_Fp_shr1; + op.fp_negA_ = mclx_Fp_neg; + op.fpDbl_addA_ = mclx_FpDbl_add; + op.fpDbl_subA_ = mclx_FpDbl_sub; + op.fpDbl_addPre = mclx_FpDbl_addPre; + op.fpDbl_subPre = mclx_FpDbl_subPre; + op.fpDbl_mulPreA_ = mclx_FpDbl_mulPre; + op.fpDbl_sqrPreA_ = mclx_FpDbl_sqrPre; + op.fpDbl_modA_ = mclx_FpDbl_mod; + op.fp_mulA_ = mclx_Fp_mul; + op.fp_sqrA_ = mclx_Fp_sqr; + } else { + // Fr, sizeof(Fr) = 32 + op.fp_addPre = mclx_Fr_addPre; + op.fp_subPre = mclx_Fr_subPre; + op.fp_addA_ = mclx_Fr_add; + op.fp_subA_ = mclx_Fr_sub; + op.fp_shr1 = mclx_Fr_shr1; + op.fp_negA_ = mclx_Fr_neg; + op.fp_mulA_ = mclx_Fr_mul; + op.fp_sqrA_ = mclx_Fr_sqr; + op.fp_preInv = mclx_Fr_preInv; + } + op.fp_mul = fp::func_ptr_cast(op.fp_mulA_); +} + +} } // mcl::fp + diff --git a/src/low_func.hpp b/src/low_func.hpp index 89a748e..2db815e 100644 --- a/src/low_func.hpp +++ b/src/low_func.hpp @@ -16,7 +16,7 @@ #endif #ifndef MCL_LLVM_BMI2 - #if (CYBOZU_HOST == CYBOZU_HOST_INTEL) && defined(MCL_USE_XBYAK) && !defined(MCL_USE_VINT) + #if (CYBOZU_HOST == CYBOZU_HOST_INTEL) && defined(MCL_X64_ASM) && !defined(MCL_USE_VINT) #define MCL_LLVM_BMI2 1 #endif #endif diff --git a/test/ec_test.cpp b/test/ec_test.cpp index a3e79e5..855ceba 100644 --- a/test/ec_test.cpp +++ b/test/ec_test.cpp @@ -602,7 +602,7 @@ void test_sub(const mcl::EcParam *para, size_t paraNum) test_sub_sub(para[i], mcl::fp::FP_LLVM); test_sub_sub(para[i], mcl::fp::FP_LLVM_MONT); #endif -#ifdef MCL_USE_XBYAK +#ifdef MCL_X64_ASM test_sub_sub(para[i], mcl::fp::FP_XBYAK); #endif mulVec(para[i]); diff --git a/test/fp_test.cpp b/test/fp_test.cpp index 469f35d..70fef8a 100644 --- a/test/fp_test.cpp +++ b/test/fp_test.cpp @@ -876,7 +876,7 @@ void modpTest() } #include -#if (defined(MCL_USE_LLVM) || defined(MCL_USE_XBYAK)) && (MCL_MAX_BIT_SIZE >= 521) +#if (defined(MCL_USE_LLVM) || defined(MCL_X64_ASM)) && (MCL_MAX_BIT_SIZE >= 521) CYBOZU_TEST_AUTO(mod_NIST_P521) { const size_t len = 521; @@ -908,7 +908,7 @@ CYBOZU_TEST_AUTO(mod_NIST_P521) mcl_fpDbl_mod_NIST_P521L(ex, in, Fp::getOp().p); CYBOZU_TEST_EQUAL_ARRAY(ex, ok, N + 1); #endif -#ifdef MCL_USE_XBYAK +#ifdef MCL_X64_ASM const mcl::fp::Op& op = Fp::getOp(); if (!op.isMont) { op.fpDbl_mod(ex, in, op.p); @@ -1014,7 +1014,7 @@ CYBOZU_TEST_AUTO(main) sub(mcl::fp::FP_LLVM_MONT); } #endif -#ifdef MCL_USE_XBYAK +#ifdef MCL_X64_ASM if (g_mode.empty() || g_mode == "xbyak") { sub(mcl::fp::FP_XBYAK); } diff --git a/test/fp_tower_test.cpp b/test/fp_tower_test.cpp index c26c5d7..4576376 100644 --- a/test/fp_tower_test.cpp +++ b/test/fp_tower_test.cpp @@ -465,7 +465,7 @@ void testAll() test(p, mcl::fp::FP_LLVM); test(p, mcl::fp::FP_LLVM_MONT); #endif -#ifdef MCL_USE_XBYAK +#ifdef MCL_X64_ASM test(p, mcl::fp::FP_XBYAK); #endif } From 32453e25a20c382ba6ceff3a078cd201115ada07 Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Thu, 17 Sep 2020 16:59:46 +0900 Subject: [PATCH 11/15] add MCL_STATIC_CODE --- Makefile | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 1b59ce7..123249f 100644 --- a/Makefile +++ b/Makefile @@ -11,6 +11,13 @@ TEST_SRC+=bls12_test.cpp TEST_SRC+=mapto_wb19_test.cpp TEST_SRC+=ecdsa_c_test.cpp TEST_SRC+=modp_test.cpp +ifeq ($(MCL_STATIC_CODE),1) + MCL_USE_XBYAK=0 + MCL_MAX_BIT_SIZE=384 + CFLAGS+=-DMCL_STATI_CODE + LIB_OBJ=obj/static_code.o + TEST_SRC=bls12_test.cpp +endif ifeq ($(CPU),x86-64) MCL_USE_XBYAK?=1 TEST_SRC+=mont_fp_test.cpp sq_test.cpp @@ -86,7 +93,7 @@ ifneq ($(CPU),) ASM_SRC=$(ASM_SRC_PATH_NAME).s endif ASM_OBJ=$(OBJ_DIR)/$(CPU).o -LIB_OBJ=$(OBJ_DIR)/fp.o +LIB_OBJ+=$(OBJ_DIR)/fp.o BN256_OBJ=$(OBJ_DIR)/bn_c256.o BN384_OBJ=$(OBJ_DIR)/bn_c384.o BN384_256_OBJ=$(OBJ_DIR)/bn_c384_256.o From 938e15432a82c91c526168ea790fa3084b96702e Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Fri, 18 Sep 2020 11:15:42 +0900 Subject: [PATCH 12/15] fix typo --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 123249f..ce7fc34 100644 --- a/Makefile +++ b/Makefile @@ -14,7 +14,7 @@ TEST_SRC+=modp_test.cpp ifeq ($(MCL_STATIC_CODE),1) MCL_USE_XBYAK=0 MCL_MAX_BIT_SIZE=384 - CFLAGS+=-DMCL_STATI_CODE + CFLAGS+=-DMCL_STATIC_CODE LIB_OBJ=obj/static_code.o TEST_SRC=bls12_test.cpp endif From ad7b7891fa66388a36387769290ff8658a8620c1 Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Fri, 18 Sep 2020 11:16:01 +0900 Subject: [PATCH 13/15] fix bls12_test for static code --- include/mcl/bn.hpp | 6 ++++++ src/fp_static_code.hpp | 17 +++++++++++------ test/bls12_test.cpp | 3 +++ 3 files changed, 20 insertions(+), 6 deletions(-) diff --git a/include/mcl/bn.hpp b/include/mcl/bn.hpp index 3668da2..ab9e15c 100644 --- a/include/mcl/bn.hpp +++ b/include/mcl/bn.hpp @@ -854,6 +854,12 @@ struct Param { { this->cp = cp; isBLS12 = cp.curveType == MCL_BLS12_381; +#ifdef MCL_STATIC_CODE + if (!isBLS12) { + *pb = false; + return; + } +#endif gmp::setStr(pb, z, cp.z); if (!*pb) return; isNegative = z < 0; diff --git a/src/fp_static_code.hpp b/src/fp_static_code.hpp index 0da39cb..832062e 100644 --- a/src/fp_static_code.hpp +++ b/src/fp_static_code.hpp @@ -20,17 +20,15 @@ void mclx_Fp_add(Unit*, const Unit*, const Unit*); void mclx_Fp_sub(Unit*, const Unit*, const Unit*); void mclx_Fp_shr1(Unit*, const Unit*); void mclx_Fp_neg(Unit*, const Unit*); +void mclx_FpDbl_mod(Unit*, const Unit*); void mclx_Fp_mul(Unit*, const Unit*, const Unit*); void mclx_Fp_sqr(Unit*, const Unit*); void mclx_FpDbl_add(Unit*, const Unit*, const Unit*); void mclx_FpDbl_sub(Unit*, const Unit*, const Unit*); -void mclx_FpDbl_add(Unit*, const Unit*, const Unit*); -void mclx_FpDbl_sub(Unit*, const Unit*, const Unit*); Unit mclx_FpDbl_addPre(Unit*, const Unit*, const Unit*); Unit mclx_FpDbl_subPre(Unit*, const Unit*, const Unit*); void mclx_FpDbl_mulPre(Unit*, const Unit*, const Unit*); void mclx_FpDbl_sqrPre(Unit*, const Unit*); -void mclx_FpDbl_mod(Unit*, const Unit*); void mclx_Fp2_add(Unit*, const Unit*, const Unit*); void mclx_Fp2_sub(Unit*, const Unit*, const Unit*); void mclx_Fp2_neg(Unit*, const Unit*); @@ -59,15 +57,22 @@ void setStaticCode(mcl::fp::Op& op) op.fp_subA_ = mclx_Fp_sub; op.fp_shr1 = mclx_Fp_shr1; op.fp_negA_ = mclx_Fp_neg; + op.fpDbl_modA_ = mclx_FpDbl_mod; + op.fp_mulA_ = mclx_Fp_mul; + op.fp_sqrA_ = mclx_Fp_sqr; op.fpDbl_addA_ = mclx_FpDbl_add; op.fpDbl_subA_ = mclx_FpDbl_sub; op.fpDbl_addPre = mclx_FpDbl_addPre; op.fpDbl_subPre = mclx_FpDbl_subPre; op.fpDbl_mulPreA_ = mclx_FpDbl_mulPre; op.fpDbl_sqrPreA_ = mclx_FpDbl_sqrPre; - op.fpDbl_modA_ = mclx_FpDbl_mod; - op.fp_mulA_ = mclx_Fp_mul; - op.fp_sqrA_ = mclx_Fp_sqr; + op.fp2_addA_ = mclx_Fp2_add; + op.fp2_subA_ = mclx_Fp2_sub; + op.fp2_negA_ = mclx_Fp2_neg; + op.fp2_mulNF = 0; + op.fp2_mulA_ = mclx_Fp2_mul; + op.fp2_sqrA_ = mclx_Fp2_sqr; + op.fp2_mul_xiA_ = mclx_Fp2_mul_xi; } else { // Fr, sizeof(Fr) = 32 op.fp_addPre = mclx_Fr_addPre; diff --git a/test/bls12_test.cpp b/test/bls12_test.cpp index 723bf3a..94dce59 100644 --- a/test/bls12_test.cpp +++ b/test/bls12_test.cpp @@ -688,6 +688,8 @@ CYBOZU_TEST_AUTO(multi) G1 P; G2 Q; int i; + +#ifndef MCL_STATIC_CODE puts("BN254"); testCurve(mcl::BN254); i = 1; @@ -695,6 +697,7 @@ CYBOZU_TEST_AUTO(multi) CYBOZU_BENCH_C("naiveG2", 100, (BN::param.mapTo.naiveMapTo), P, i++); CYBOZU_BENCH_C("calcBN2", 100, (BN::param.mapTo.calcBN), Q, i++); CYBOZU_BENCH_C("naiveG2", 100, (BN::param.mapTo.naiveMapTo), Q, i++); +#endif puts("BLS12_381"); testCurve(mcl::BLS12_381); i = 1; From 4ee23f5fd3797e556898c22ce849871e72b7f342 Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Fri, 18 Sep 2020 11:18:31 +0900 Subject: [PATCH 14/15] add -Wundef --- common.mk | 2 +- test/bls12_test.cpp | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/common.mk b/common.mk index 4816049..6f4ed72 100644 --- a/common.mk +++ b/common.mk @@ -91,7 +91,7 @@ else CFLAGS_OPT+=$(MARCH) endif endif -CFLAGS_WARN=-Wall -Wextra -Wformat=2 -Wcast-qual -Wcast-align -Wwrite-strings -Wfloat-equal -Wpointer-arith +CFLAGS_WARN=-Wall -Wextra -Wformat=2 -Wcast-qual -Wcast-align -Wwrite-strings -Wfloat-equal -Wpointer-arith -Wundef CFLAGS+=-g3 INC_OPT=-I include -I test CFLAGS+=$(CFLAGS_WARN) $(BIT_OPT) $(INC_OPT) diff --git a/test/bls12_test.cpp b/test/bls12_test.cpp index 94dce59..ec4204c 100644 --- a/test/bls12_test.cpp +++ b/test/bls12_test.cpp @@ -864,7 +864,11 @@ int main(int argc, char *argv[]) return 1; } g_mode = mcl::fp::StrToMode(mode); +#ifdef MCL_STATIC_CODE + printf("static code for BLS12-381\n"); +#else printf("JIT %d\n", mcl::fp::isEnableJIT()); +#endif #if 0 initPairing(mcl::BLS12_381); cybozu::XorShift rg; From 4fd5fef8edbe886989dbff4b4d91ccd5bfdc2dca Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Fri, 18 Sep 2020 11:55:55 +0900 Subject: [PATCH 15/15] static_code does not need llvm-bmi2 --- Makefile | 4 +++- src/low_func.hpp | 4 +++- test/bench.hpp | 12 ++++++++++++ 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index ce7fc34..21f5f15 100644 --- a/Makefile +++ b/Makefile @@ -113,7 +113,9 @@ ifeq ($(MCL_USE_LLVM),1) LIB_OBJ+=$(ASM_OBJ) # special case for intel with bmi2 ifeq ($(INTEL),1) - LIB_OBJ+=$(OBJ_DIR)/$(CPU).bmi2.o + ifneq ($(MCL_STATIC_CODE),1) + LIB_OBJ+=$(OBJ_DIR)/$(CPU).bmi2.o + endif endif endif LLVM_SRC=src/base$(BIT).ll diff --git a/src/low_func.hpp b/src/low_func.hpp index 2db815e..9192e51 100644 --- a/src/low_func.hpp +++ b/src/low_func.hpp @@ -16,8 +16,10 @@ #endif #ifndef MCL_LLVM_BMI2 - #if (CYBOZU_HOST == CYBOZU_HOST_INTEL) && defined(MCL_X64_ASM) && !defined(MCL_USE_VINT) + #if (CYBOZU_HOST == CYBOZU_HOST_INTEL) && !defined(MCL_STATIC_CODE) && !defined(MCL_USE_VINT) #define MCL_LLVM_BMI2 1 + #else + #define MCL_LLVM_BMI2 0 #endif #endif diff --git a/test/bench.hpp b/test/bench.hpp index b4a8bd2..f7acfce 100644 --- a/test/bench.hpp +++ b/test/bench.hpp @@ -101,6 +101,18 @@ void testBench(const G1& P, const G2& Q) CYBOZU_BENCH_C("Fp::sqr ", C3, Fp::sqr, x, x); CYBOZU_BENCH_C("Fp::inv ", C3, Fp::inv, x, x); CYBOZU_BENCH_C("Fp::pow ", C3, Fp::pow, x, x, y); + { + Fr a, b, c; + a.setHashOf("abc", 3); + b.setHashOf("123", 3); + CYBOZU_BENCH_C("Fr::add ", C3, Fr::add, a, a, b); + CYBOZU_BENCH_C("Fr::sub ", C3, Fr::sub, a, a, b); + CYBOZU_BENCH_C("Fr::neg ", C3, Fr::neg, a, a); + CYBOZU_BENCH_C("Fr::mul ", C3, Fr::mul, a, a, b); + CYBOZU_BENCH_C("Fr::sqr ", C3, Fr::sqr, a, a); + CYBOZU_BENCH_C("Fr::inv ", C3, Fr::inv, a, a); + CYBOZU_BENCH_C("Fr::pow ", C3, Fr::pow, a, a, b); + } Fp2 xx, yy; xx.a = x; xx.b = 3;