diff --git a/Makefile b/Makefile index 9e37876..1b59ce7 100644 --- a/Makefile +++ b/Makefile @@ -247,7 +247,7 @@ obj/static_code.o: src/static_code.asm nasm -felf64 -o $@ $< bin/static_code_test.exe: test/static_code_test.cpp src/fp.cpp obj/static_code.o - $(CXX) -o $@ -O3 $^ -DMCL_STATIC_JIT -DMCL_MAX_BIT_SIZE=384 -DMCL_DONT_USE_OPENSSL -DMCL_USE_VINT -DMCL_SIZEOF_UNIT=8 -DMCL_VINT_FIXED_BUFFER -I include -Wall -Wextra + $(CXX) -o $@ -O3 $^ -g -DMCL_DONT_USE_XBYAK -DMCL_STATIC_CODE -DMCL_MAX_BIT_SIZE=384 -DMCL_DONT_USE_OPENSSL -DMCL_USE_VINT -DMCL_SIZEOF_UNIT=8 -DMCL_VINT_FIXED_BUFFER -I include -Wall -Wextra asm: $(LLVM_SRC) $(LLVM_OPT) -O3 -o - $(LLVM_SRC) | $(LLVM_LLC) -O3 $(LLVM_FLAGS) -x86-asm-syntax=intel diff --git a/include/mcl/op.hpp b/include/mcl/op.hpp index 99c0e4d..22a78b1 100644 --- a/include/mcl/op.hpp +++ b/include/mcl/op.hpp @@ -16,6 +16,9 @@ #endif #if !defined(MCL_DONT_USE_XBYAK) && (defined(_WIN64) || defined(__x86_64__)) && (MCL_SIZEOF_UNIT == 8) #define MCL_USE_XBYAK +#endif +#if defined(MCL_USE_XBYAK) || defined(MCL_STATIC_CODE) + #define MCL_X64_ASM #define MCL_XBYAK_DIRECT_CALL #endif @@ -202,6 +205,8 @@ struct Op { Unit R3[maxUnitSize]; #ifdef MCL_USE_XBYAK FpGenerator *fg; +#endif +#ifdef MCL_X64_ASM mcl::Array invTbl; #endif void3u fp_addA_; @@ -288,7 +293,7 @@ struct Op { memset(one, 0, sizeof(one)); memset(R2, 0, sizeof(R2)); memset(R3, 0, sizeof(R3)); -#ifdef MCL_USE_XBYAK +#ifdef MCL_X64_ASM invTbl.clear(); #endif fp_addA_ = 0; diff --git a/sample/bench.cpp b/sample/bench.cpp index de81f25..d3c101c 100644 --- a/sample/bench.cpp +++ b/sample/bench.cpp @@ -68,7 +68,7 @@ void benchFp(size_t bitSize, int mode) if (mode & 4) benchFpSub(tbl[i].p, tbl[i].x, tbl[i].y, mcl::fp::FP_LLVM); if (mode & 8) benchFpSub(tbl[i].p, tbl[i].x, tbl[i].y, mcl::fp::FP_LLVM_MONT); #endif -#ifdef MCL_USE_XBYAK +#ifdef MCL_X64_ASM if (mode & 16) benchFpSub(tbl[i].p, tbl[i].x, tbl[i].y, mcl::fp::FP_XBYAK); #endif } @@ -122,7 +122,7 @@ void benchEc(size_t 
bitSize, int mode, mcl::ec::Mode ecMode) if (mode & 4) benchEcSub(tbl[i], mcl::fp::FP_LLVM, ecMode); if (mode & 8) benchEcSub(tbl[i], mcl::fp::FP_LLVM_MONT, ecMode); #endif -#ifdef MCL_USE_XBYAK +#ifdef MCL_X64_ASM if (mode & 16) benchEcSub(tbl[i], mcl::fp::FP_XBYAK, ecMode); #endif } diff --git a/sample/rawbench.cpp b/sample/rawbench.cpp index 4d7506e..cc74bc3 100644 --- a/sample/rawbench.cpp +++ b/sample/rawbench.cpp @@ -168,7 +168,7 @@ int main(int argc, char *argv[]) benchRaw(tbl[i], mcl::fp::FP_LLVM); benchRaw(tbl[i], mcl::fp::FP_LLVM_MONT); #endif -#ifdef MCL_USE_XBYAK +#ifdef MCL_X64_ASM if (bitSize <= 384) { benchRaw(tbl[i], mcl::fp::FP_XBYAK); } diff --git a/src/fp.cpp b/src/fp.cpp index b3b07d1..ab3a1a7 100644 --- a/src/fp.cpp +++ b/src/fp.cpp @@ -3,12 +3,14 @@ #include #include #include +#ifdef MCL_STATIC_CODE +#include "fp_static_code.hpp" +#endif #ifdef MCL_USE_XBYAK #include "fp_generator.hpp" #else #define XBYAK_ONLY_CLASS_CPU #include "xbyak/xbyak_util.h" -//#include "detect_cpu.hpp" #endif #include "low_func.hpp" #ifdef MCL_USE_LLVM @@ -315,7 +317,7 @@ void setOp(Op& op, Mode mode) #endif } -#ifdef MCL_USE_XBYAK +#ifdef MCL_X64_ASM inline void invOpForMontC(Unit *y, const Unit *x, const Op& op) { Unit r[maxUnitSize]; @@ -372,6 +374,12 @@ static bool initForMont(Op& op, const Unit *p, Mode mode) op.fp_invOp = &invOpForMontC; initInvTbl(op); } +#elif defined(MCL_STATIC_CODE) + fp::setStaticCode(op); + if (op.isMont && N <= 4) { + op.fp_invOp = &invOpForMontC; + initInvTbl(op); + } #endif return true; } @@ -403,14 +411,25 @@ bool Op::init(const mpz_class& _p, size_t maxBitSize, int _xi_a, Mode mode, size priority : MCL_USE_XBYAK > MCL_USE_LLVM > none Xbyak > llvm_mont > llvm > gmp_mont > gmp */ -#ifdef MCL_USE_XBYAK +#ifdef MCL_X64_ASM if (mode == FP_AUTO) mode = FP_XBYAK; if (mode == FP_XBYAK && bitSize > 384) { mode = FP_AUTO; } +#ifdef MCL_USE_XBYAK if (!isEnableJIT()) { mode = FP_AUTO; } +#elif defined(MCL_STATIC_CODE) // test definedness (as initForMont does), not the macro's value + { + // static jit code uses avx, 
mulx, adox, adcx + using namespace Xbyak::util; + Cpu cpu; + if (!(cpu.has(Cpu::tAVX) && cpu.has(Cpu::tBMI2) && cpu.has(Cpu::tADX))) { + mode = FP_AUTO; + } + } +#endif #else if (mode == FP_XBYAK) mode = FP_AUTO; #endif diff --git a/src/fp_generator.hpp b/src/fp_generator.hpp index b5d4628..4243368 100644 --- a/src/fp_generator.hpp +++ b/src/fp_generator.hpp @@ -7,7 +7,6 @@ http://opensource.org/licenses/BSD-3-Clause */ #if CYBOZU_HOST == CYBOZU_HOST_INTEL -#define XBYAK_NO_OP_NAMES #define XBYAK_DISABLE_AVX512 #include "xbyak/xbyak_util.h" @@ -25,45 +24,6 @@ namespace mcl { -#ifdef MCL_STATIC_JIT -typedef fp::Unit Unit; -extern "C" { -Unit mclx_Fp_addPre(Unit*, const Unit*, const Unit*); -Unit mclx_Fp_subPre(Unit*, const Unit*, const Unit*); -void mclx_Fp_add(Unit*, const Unit*, const Unit*); -void mclx_Fp_sub(Unit*, const Unit*, const Unit*); -void mclx_Fp_shr1(Unit*, const Unit*); -void mclx_Fp_neg(Unit*, const Unit*); -void mclx_Fp_mul(Unit*, const Unit*, const Unit*); -void mclx_Fp_sqr(Unit*, const Unit*); -void mclx_FpDbl_add(Unit*, const Unit*, const Unit*); -void mclx_FpDbl_sub(Unit*, const Unit*, const Unit*); -void mclx_FpDbl_add(Unit*, const Unit*, const Unit*); -void mclx_FpDbl_sub(Unit*, const Unit*, const Unit*); -Unit mclx_FpDbl_addPre(Unit*, const Unit*, const Unit*); -Unit mclx_FpDbl_subPre(Unit*, const Unit*, const Unit*); -void mclx_FpDbl_mulPre(Unit*, const Unit*, const Unit*); -void mclx_FpDbl_sqrPre(Unit*, const Unit*); -void mclx_FpDbl_mod(Unit*, const Unit*); -void mclx_Fp2_add(Unit*, const Unit*, const Unit*); -void mclx_Fp2_sub(Unit*, const Unit*, const Unit*); -void mclx_Fp2_neg(Unit*, const Unit*); -void mclx_Fp2_mul(Unit*, const Unit*, const Unit*); -void mclx_Fp2_sqr(Unit*, const Unit*); -void mclx_Fp2_mul_xi(Unit*, const Unit*); - -Unit mclx_Fr_addPre(Unit*, const Unit*, const Unit*); -Unit mclx_Fr_subPre(Unit*, const Unit*, const Unit*); -void mclx_Fr_add(Unit*, const Unit*, const Unit*); -void mclx_Fr_sub(Unit*, const Unit*, const 
Unit*); -void mclx_Fr_shr1(Unit*, const Unit*); -void mclx_Fr_neg(Unit*, const Unit*); -void mclx_Fr_mul(Unit*, const Unit*, const Unit*); -void mclx_Fr_sqr(Unit*, const Unit*); -int mclx_Fr_preInv(Unit*, const Unit*); -} -#endif - #ifdef MCL_DUMP_JIT struct DumpCode { FILE *fp_; @@ -488,38 +448,6 @@ private: align(16); op.fp2_mul_xiA_ = gen_fp2_mul_xi(); setFuncInfo(prof_, suf, "2_mul_xi", op.fp2_mul_xiA_, getCurr()); - -#ifdef MCL_STATIC_JIT - if (op.xi_a) { - // Fp, sizeof(Fp) = 48, supports Fp2 - op.fp_addPre = mclx_Fp_addPre; - op.fp_subPre = mclx_Fp_subPre; - op.fp_addA_ = mclx_Fp_add; - op.fp_subA_ = mclx_Fp_sub; - op.fp_shr1 = mclx_Fp_shr1; - op.fp_negA_ = mclx_Fp_neg; - op.fpDbl_addA_ = mclx_FpDbl_add; - op.fpDbl_subA_ = mclx_FpDbl_sub; - op.fpDbl_addPre = mclx_FpDbl_addPre; - op.fpDbl_subPre = mclx_FpDbl_subPre; - op.fpDbl_mulPreA_ = mclx_FpDbl_mulPre; - op.fpDbl_sqrPreA_ = mclx_FpDbl_sqrPre; - op.fpDbl_modA_ = mclx_FpDbl_mod; - op.fp_mulA_ = mclx_Fp_mul; - op.fp_sqrA_ = mclx_Fp_sqr; - } else { - // Fr, sizeof(Fr) = 32 - op.fp_addPre = mclx_Fr_addPre; - op.fp_subPre = mclx_Fr_subPre; - op.fp_addA_ = mclx_Fr_add; - op.fp_subA_ = mclx_Fr_sub; - op.fp_shr1 = mclx_Fr_shr1; - op.fp_negA_ = mclx_Fr_neg; - op.fp_mulA_ = mclx_Fr_mul; - op.fp_sqrA_ = mclx_Fr_sqr; - op.fp_preInv = mclx_Fr_preInv; - } -#endif } u3u gen_addSubPre(bool isAdd, int n) { @@ -2774,7 +2702,7 @@ private: mov(rax, px); // px is free frome here load_mp(vv, rax, t); // v = x - mov(rax, pL_); + lea(rax, ptr[rip+pL_]); load_mp(uu, rax, t); // u = p_ // k = 0 xor_(rax, rax); @@ -2852,7 +2780,7 @@ private: const Reg64& t2 = ss.getReg(0); const Reg64& t3 = rdx; - mov(t2, pL_); + lea(t2, ptr[rip+pL_]); if (isFullBit_) { mov(t, ptr [rTop]); test(t, t); @@ -3724,7 +3652,7 @@ private: } } sub_rr(a, b); - mov(rax, pL_); + lea(rax, ptr[rip+pL_]); load_rm(b, rax); sbb(rax, rax); for (int i = 0; i < pn_; i++) { @@ -3732,7 +3660,7 @@ private: } add_rr(a, b); store_mr(py, a); - mov(rax, pL_); + lea(rax, 
ptr[rip+pL_]); mov_rr(a, t); sub_rm(t, rax); cmovc_rr(t, a); @@ -3750,7 +3678,7 @@ private: mov_rr(b, a); add_rm(b, px + FpByte_); sub_rm(a, px + FpByte_); - mov(rax, pL_); + lea(rax, ptr[rip+pL_]); jnc("@f"); add_rm(a, rax); L("@@"); @@ -3925,7 +3853,7 @@ private: mov(ptr [(RegExp)t2 + i * 8], rax); } // t3 = a + p - b - mov(rax, pL_); + lea(rax, ptr[rip+pL_]); add_rm(a, rax); sub_rr(a, b); store_mr(t3, a); diff --git a/src/fp_static_code.hpp b/src/fp_static_code.hpp new file mode 100644 index 0000000..0da39cb --- /dev/null +++ b/src/fp_static_code.hpp @@ -0,0 +1,87 @@ +#pragma once +/** + @file + @brief Fp generator + @author MITSUNARI Shigeo(@herumi) + @license modified new BSD license + http://opensource.org/licenses/BSD-3-Clause +*/ +#ifndef MCL_STATIC_CODE + #error "define MCL_STATIC_CODE" +#endif + +namespace mcl { namespace fp { + +extern "C" { + +Unit mclx_Fp_addPre(Unit*, const Unit*, const Unit*); +Unit mclx_Fp_subPre(Unit*, const Unit*, const Unit*); +void mclx_Fp_add(Unit*, const Unit*, const Unit*); +void mclx_Fp_sub(Unit*, const Unit*, const Unit*); +void mclx_Fp_shr1(Unit*, const Unit*); +void mclx_Fp_neg(Unit*, const Unit*); +void mclx_Fp_mul(Unit*, const Unit*, const Unit*); +void mclx_Fp_sqr(Unit*, const Unit*); +void mclx_FpDbl_add(Unit*, const Unit*, const Unit*); +void mclx_FpDbl_sub(Unit*, const Unit*, const Unit*); +void mclx_FpDbl_add(Unit*, const Unit*, const Unit*); +void mclx_FpDbl_sub(Unit*, const Unit*, const Unit*); +Unit mclx_FpDbl_addPre(Unit*, const Unit*, const Unit*); +Unit mclx_FpDbl_subPre(Unit*, const Unit*, const Unit*); +void mclx_FpDbl_mulPre(Unit*, const Unit*, const Unit*); +void mclx_FpDbl_sqrPre(Unit*, const Unit*); +void mclx_FpDbl_mod(Unit*, const Unit*); +void mclx_Fp2_add(Unit*, const Unit*, const Unit*); +void mclx_Fp2_sub(Unit*, const Unit*, const Unit*); +void mclx_Fp2_neg(Unit*, const Unit*); +void mclx_Fp2_mul(Unit*, const Unit*, const Unit*); +void mclx_Fp2_sqr(Unit*, const Unit*); +void mclx_Fp2_mul_xi(Unit*, 
const Unit*); + +Unit mclx_Fr_addPre(Unit*, const Unit*, const Unit*); +Unit mclx_Fr_subPre(Unit*, const Unit*, const Unit*); +void mclx_Fr_add(Unit*, const Unit*, const Unit*); +void mclx_Fr_sub(Unit*, const Unit*, const Unit*); +void mclx_Fr_shr1(Unit*, const Unit*); +void mclx_Fr_neg(Unit*, const Unit*); +void mclx_Fr_mul(Unit*, const Unit*, const Unit*); +void mclx_Fr_sqr(Unit*, const Unit*); +int mclx_Fr_preInv(Unit*, const Unit*); +} // extern "C" + +inline void setStaticCode(mcl::fp::Op& op) // inline: defined in a header, so it must be safe to include from several translation units +{ + if (op.xi_a) { + // Fp, sizeof(Fp) = 48, supports Fp2 + op.fp_addPre = mclx_Fp_addPre; + op.fp_subPre = mclx_Fp_subPre; + op.fp_addA_ = mclx_Fp_add; + op.fp_subA_ = mclx_Fp_sub; + op.fp_shr1 = mclx_Fp_shr1; + op.fp_negA_ = mclx_Fp_neg; + op.fpDbl_addA_ = mclx_FpDbl_add; + op.fpDbl_subA_ = mclx_FpDbl_sub; + op.fpDbl_addPre = mclx_FpDbl_addPre; + op.fpDbl_subPre = mclx_FpDbl_subPre; + op.fpDbl_mulPreA_ = mclx_FpDbl_mulPre; + op.fpDbl_sqrPreA_ = mclx_FpDbl_sqrPre; + op.fpDbl_modA_ = mclx_FpDbl_mod; + op.fp_mulA_ = mclx_Fp_mul; + op.fp_sqrA_ = mclx_Fp_sqr; + } else { + // Fr, sizeof(Fr) = 32 + op.fp_addPre = mclx_Fr_addPre; + op.fp_subPre = mclx_Fr_subPre; + op.fp_addA_ = mclx_Fr_add; + op.fp_subA_ = mclx_Fr_sub; + op.fp_shr1 = mclx_Fr_shr1; + op.fp_negA_ = mclx_Fr_neg; + op.fp_mulA_ = mclx_Fr_mul; + op.fp_sqrA_ = mclx_Fr_sqr; + op.fp_preInv = mclx_Fr_preInv; + } + op.fp_mul = fp::func_ptr_cast(op.fp_mulA_); +} + +} } // mcl::fp + diff --git a/src/low_func.hpp b/src/low_func.hpp index 89a748e..2db815e 100644 --- a/src/low_func.hpp +++ b/src/low_func.hpp @@ -16,7 +16,7 @@ #endif #ifndef MCL_LLVM_BMI2 - #if (CYBOZU_HOST == CYBOZU_HOST_INTEL) && defined(MCL_USE_XBYAK) && !defined(MCL_USE_VINT) + #if (CYBOZU_HOST == CYBOZU_HOST_INTEL) && defined(MCL_X64_ASM) && !defined(MCL_USE_VINT) #define MCL_LLVM_BMI2 1 #endif #endif diff --git a/test/ec_test.cpp b/test/ec_test.cpp index a3e79e5..855ceba 100644 --- a/test/ec_test.cpp +++ b/test/ec_test.cpp @@ -602,7 +602,7 @@ void 
test_sub(const mcl::EcParam *para, size_t paraNum) test_sub_sub(para[i], mcl::fp::FP_LLVM); test_sub_sub(para[i], mcl::fp::FP_LLVM_MONT); #endif -#ifdef MCL_USE_XBYAK +#ifdef MCL_X64_ASM test_sub_sub(para[i], mcl::fp::FP_XBYAK); #endif mulVec(para[i]); diff --git a/test/fp_test.cpp b/test/fp_test.cpp index 469f35d..70fef8a 100644 --- a/test/fp_test.cpp +++ b/test/fp_test.cpp @@ -876,7 +876,7 @@ void modpTest() } #include -#if (defined(MCL_USE_LLVM) || defined(MCL_USE_XBYAK)) && (MCL_MAX_BIT_SIZE >= 521) +#if (defined(MCL_USE_LLVM) || defined(MCL_X64_ASM)) && (MCL_MAX_BIT_SIZE >= 521) CYBOZU_TEST_AUTO(mod_NIST_P521) { const size_t len = 521; @@ -908,7 +908,7 @@ CYBOZU_TEST_AUTO(mod_NIST_P521) mcl_fpDbl_mod_NIST_P521L(ex, in, Fp::getOp().p); CYBOZU_TEST_EQUAL_ARRAY(ex, ok, N + 1); #endif -#ifdef MCL_USE_XBYAK +#ifdef MCL_X64_ASM const mcl::fp::Op& op = Fp::getOp(); if (!op.isMont) { op.fpDbl_mod(ex, in, op.p); @@ -1014,7 +1014,7 @@ CYBOZU_TEST_AUTO(main) sub(mcl::fp::FP_LLVM_MONT); } #endif -#ifdef MCL_USE_XBYAK +#ifdef MCL_X64_ASM if (g_mode.empty() || g_mode == "xbyak") { sub(mcl::fp::FP_XBYAK); } diff --git a/test/fp_tower_test.cpp b/test/fp_tower_test.cpp index c26c5d7..4576376 100644 --- a/test/fp_tower_test.cpp +++ b/test/fp_tower_test.cpp @@ -465,7 +465,7 @@ void testAll() test(p, mcl::fp::FP_LLVM); test(p, mcl::fp::FP_LLVM_MONT); #endif -#ifdef MCL_USE_XBYAK +#ifdef MCL_X64_ASM test(p, mcl::fp::FP_XBYAK); #endif }