From 2bdbeda2be82bae67698adfcbd5f33ca80c42cd5 Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Wed, 23 Mar 2016 21:53:56 +0900 Subject: [PATCH] add fpDbl_mod_NIST_P192 --- include/mcl/ec.hpp | 9 +++++++ include/mcl/op.hpp | 2 ++ include/mcl/util.hpp | 9 +++++++ src/fp.cpp | 11 ++++---- src/fp_generator.hpp | 61 +++++++++++++++++++++++++++++++++++++++++--- 5 files changed, 83 insertions(+), 9 deletions(-) diff --git a/include/mcl/ec.hpp b/include/mcl/ec.hpp index a21b046..abb24bd 100644 --- a/include/mcl/ec.hpp +++ b/include/mcl/ec.hpp @@ -14,6 +14,11 @@ //#define MCL_EC_USE_AFFINE +#ifdef _MSC_VER + #pragma warning(push) + #pragma warning(disable : 4458) +#endif + namespace mcl { namespace ec { @@ -659,3 +664,7 @@ struct hash > { }; CYBOZU_NAMESPACE_TR1_END } // std + +#ifdef _MSC_VER + #pragma warning(pop) +#endif diff --git a/include/mcl/op.hpp b/include/mcl/op.hpp index 8ac5037..ef240b4 100644 --- a/include/mcl/op.hpp +++ b/include/mcl/op.hpp @@ -99,6 +99,7 @@ struct Op { bool isFullBit; // true if bitSize % uniSize == 0 bool isMont; // true if use Montgomery + bool isNIST_P192; // true if p is NIST_P192 bool isFastMod; // true if modulo is fast /* same fp_add, fp_sub if isFullBit @@ -161,6 +162,7 @@ struct Op { , fp_mulI(0) , isFullBit(true) , isMont(false) + , isNIST_P192(false) , isFastMod(false) , fp_addNC(0), fp_subNC(0) , fp_preInv(0) diff --git a/include/mcl/util.hpp b/include/mcl/util.hpp index 397cf1c..001a6ad 100644 --- a/include/mcl/util.hpp +++ b/include/mcl/util.hpp @@ -9,6 +9,12 @@ #include #include +#ifdef _MSC_VER + #pragma warning(push) + #pragma warning(disable : 4456) + #pragma warning(disable : 4459) +#endif + namespace mcl { namespace fp { /* @@ -252,3 +258,6 @@ void powerGeneric(G& out, const G& x, const T *y, size_t n, void mul(G&, const G } } // mcl::fp +#ifdef _MSC_VER + #pragma warning(pop) +#endif diff --git a/src/fp.cpp b/src/fp.cpp index 7454d70..854069c 100644 --- a/src/fp.cpp +++ b/src/fp.cpp @@ -391,7 +391,7 @@ static void 
initForMont(Op& op, const Unit *p, Mode mode) if (fg == 0) return; fg->init(op); - if (N <= 4) { + if (op.isMont && N <= 4) { op.fp_invOp = &invOpForMontC; initInvTbl(op); } @@ -410,8 +410,9 @@ void Op::init(const std::string& mstr, int base, size_t maxBitSize, Mode mode) isFullBit = (bitSize % UnitBitSize) == 0; const size_t roundBit = (bitSize + UnitBitSize - 1) & ~(UnitBitSize - 1); -#ifdef MCL_USE_LLVM - const bool isNIST_P192 = (mode == FP_AUTO || mode == FP_LLVM) && mp == mpz_class("0xfffffffffffffffffffffffffffffffeffffffffffffffff"); +#if defined(MCL_USE_LLVM) || defined(MCL_USE_XBYAK) + isNIST_P192 = (mode == FP_AUTO || mode == FP_LLVM || mode == FP_XBYAK) + && mp == mpz_class("0xfffffffffffffffffffffffffffffffeffffffffffffffff"); if (isNIST_P192) { isMont = false; isFastMod = true; @@ -449,9 +450,7 @@ void Op::init(const std::string& mstr, int base, size_t maxBitSize, Mode mode) fpDbl_mod = &mcl_fpDbl_mod_NIST_P192; } #endif - if (isMont) { - fp::initForMont(*this, p, mode); - } + fp::initForMont(*this, p, mode); sq.set(mp); } diff --git a/src/fp_generator.hpp b/src/fp_generator.hpp index 253e3c7..0364082 100644 --- a/src/fp_generator.hpp +++ b/src/fp_generator.hpp @@ -141,6 +141,7 @@ struct FpGenerator : Xbyak::CodeGenerator { static const int UseRCX = Xbyak::util::UseRCX; Xbyak::util::Cpu cpu_; bool useMulx_; + const mcl::fp::Op *op_; const uint64_t *p_; uint64_t rp_; int pn_; @@ -165,6 +166,7 @@ struct FpGenerator : Xbyak::CodeGenerator { void2op shr1_; FpGenerator() : CodeGenerator(4096 * 8) + , op_(0) , p_(0) , rp_(0) , pn_(0) @@ -182,6 +184,7 @@ struct FpGenerator : Xbyak::CodeGenerator { void init(Op& op) { if (op.N < 2) throw cybozu::Exception("mcl:FpGenerator:small pn") << op.N; + op_ = &op; p_ = op.p; rp_ = fp::getMontgomeryCoeff(p_[0]); pn_ = (int)op.N; @@ -224,7 +227,7 @@ struct FpGenerator : Xbyak::CodeGenerator { align(16); shr1_ = getCurr(); gen_shr1(); - if (op.N <= 4) { // support general op.N but not fast for op.N > 4 + if 
(!op.isNIST_P192 && op.N <= 4) { // support general op.N but not fast for op.N > 4 align(16); op.fp_preInv = getCurr(); gen_preInv(); @@ -251,7 +254,7 @@ struct FpGenerator : Xbyak::CodeGenerator { if (op.N == 2 || op.N == 3 || op.N == 4) { align(16); op.fpDbl_mod = getCurr(); - gen_fpDbl_mod(); + gen_fpDbl_mod(op); } if ((useMulx_ && op.N == 2) || op.N == 3 || op.N == 4) { align(16); @@ -597,6 +600,11 @@ struct FpGenerator : Xbyak::CodeGenerator { } void gen_mul() { + if (op_->isNIST_P192) { + StackFrame sf(this, 3, 10 | UseRDX, 8 * 6); + mulPre3(rsp, sf.p[1], sf.p[2], sf.t); + fpDbl_mod_NIST_P192(sf.p[0], rsp, sf.t); + } if (pn_ == 3) { gen_montMul3(p_, rp_); } else if (pn_ == 4) { @@ -878,8 +886,13 @@ struct FpGenerator : Xbyak::CodeGenerator { movq(z, xm0); store_mr(z, Pack(t10, t9, t8, t4)); } - void gen_fpDbl_mod() + void gen_fpDbl_mod(const mcl::fp::Op& op) { + if (op.isNIST_P192) { + StackFrame sf(this, 2, 6 | UseRDX); + fpDbl_mod_NIST_P192(sf.p[0], sf.p[1], sf.t); + return; + } switch (pn_) { case 2: gen_fpDbl_mod2(); @@ -896,6 +909,11 @@ struct FpGenerator : Xbyak::CodeGenerator { } void gen_sqr() { + if (op_->isNIST_P192) { + StackFrame sf(this, 2, 10 | UseRDX | UseRCX, 8 * 6); + sqrPre3(rsp, sf.p[1], sf.t); + fpDbl_mod_NIST_P192(sf.p[0], rsp, sf.t); + } if (pn_ == 3) { gen_montSqr3(p_, rp_); return; @@ -1104,6 +1122,7 @@ struct FpGenerator : Xbyak::CodeGenerator { } /* py[5..0] <- px[2..0]^2 + @note use rax, rdx, rcx! 
*/
 	void sqrPre3(const RegExp& py, const RegExp& px, const Pack& t)
 	{
@@ -1933,6 +1952,42 @@ struct FpGenerator : Xbyak::CodeGenerator {
 		L("@@");
 		outLocalLabel();
 	}
+	void fpDbl_mod_NIST_P192(const RegExp &py, const RegExp& px, const Pack& t) // py[2..0] <- px[5..0] mod p, p = 2^192 - 2^64 - 1
+	{ // folds H into L using 2^192 = 2^64 + 1 (mod p); reads t[0..5], overwrites rax and rdx
+		const Reg64& t0 = t[0];
+		const Reg64& t1 = t[1];
+		const Reg64& t2 = t[2];
+		const Reg64& t3 = t[3];
+		const Reg64& t4 = t[4];
+		const Reg64& t5 = t[5];
+		load_rm(Pack(t2, t1, t0), px); // L=[t2:t1:t0]
+		load_rm(Pack(rax, t5, t4), px + 8 * 3); // H = [rax:t5:t4]
+		xor_(t3, t3);
+		add_rr(Pack(t3, t2, t1, t0), Pack(t3, rax, t5, t4)); // [t3:t2:t1:t0] = L + H
+		add_rr(Pack(t2, t1, t0), Pack(t5, t4, rax));
+		adc(t3, 0); // [t3:t2:t1:t0] = L + H + [H1:H0:H2]
+		add(t1, rax); // + (H2 << 64)
+		adc(t2, 0);
+		adc(t3, 0); // e = t3, t = [t2:t1:t0]
+		xor_(t4, t4);
+		add(t0, t3);
+		adc(t1, 0);
+		adc(t2, 0);
+		adc(t4, 0); // t + e
+		add(t1, t3);
+		adc(t2, 0);
+		adc(t4, 0); // t + e + (e << 64)
+		// p = [ffffffffffffffff:fffffffffffffffe:ffffffffffffffff]
+		mov(rax, size_t(-1));
+		mov(rdx, size_t(-2));
+		jz("@f"); // mov leaves flags intact: ZF still reflects the last adc (t4 == 0)
+		sub_rr(Pack(t2, t1, t0), Pack(rax, rdx, rax)); // carried past 192 bits: subtract p once
+		L("@@");
+		mov_rr(Pack(t5, t4, t3), Pack(t2, t1, t0)); // keep a copy in case the next subtraction borrows
+		sub_rr(Pack(t2, t1, t0), Pack(rax, rdx, rax)); // subtract p (middle limb is rdx = -2), not 2^192 - 1
+		cmovc_rr(Pack(t2, t1, t0), Pack(t5, t4, t3)); // on borrow the value was already < p: restore the copy
+		store_mr(py, Pack(t2, t1, t0));
+	}
 	void mov32c(const Reg64& r, uint64_t c)
 	{
 		if (c & 0xffffffff00000000ULL) {