|
|
|
@ -7,7 +7,6 @@ |
|
|
|
|
http://opensource.org/licenses/BSD-3-Clause
|
|
|
|
|
*/ |
|
|
|
|
#if CYBOZU_HOST == CYBOZU_HOST_INTEL |
|
|
|
|
#define XBYAK_NO_OP_NAMES |
|
|
|
|
#define XBYAK_DISABLE_AVX512 |
|
|
|
|
#include "xbyak/xbyak_util.h" |
|
|
|
|
|
|
|
|
@ -25,6 +24,61 @@ |
|
|
|
|
|
|
|
|
|
namespace mcl { |
|
|
|
|
|
|
|
|
|
#ifdef MCL_DUMP_JIT |
|
|
|
|
struct DumpCode { |
|
|
|
|
FILE *fp_; |
|
|
|
|
DumpCode() |
|
|
|
|
: fp_(stdout) |
|
|
|
|
{ |
|
|
|
|
} |
|
|
|
|
void set(const std::string& name, const uint8_t *begin, const size_t size) |
|
|
|
|
{ |
|
|
|
|
fprintf(fp_, "segment .text\n"); |
|
|
|
|
fprintf(fp_, "global %s\n", name.c_str()); |
|
|
|
|
fprintf(fp_, "align 16\n"); |
|
|
|
|
fprintf(fp_, "%s:\n", name.c_str()); |
|
|
|
|
const uint8_t *p = begin; |
|
|
|
|
size_t remain = size; |
|
|
|
|
while (remain > 0) { |
|
|
|
|
size_t n = remain >= 16 ? 16 : remain; |
|
|
|
|
fprintf(fp_, "db "); |
|
|
|
|
for (size_t i = 0; i < n; i++) { |
|
|
|
|
fprintf(fp_, "0x%02x,", *p++); |
|
|
|
|
} |
|
|
|
|
fprintf(fp_, "\n"); |
|
|
|
|
remain -= n; |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
void dumpData(const void *begin, const void *end) |
|
|
|
|
{ |
|
|
|
|
fprintf(fp_, "align 16\n"); |
|
|
|
|
fprintf(fp_, "dq "); |
|
|
|
|
const uint64_t *p = (const uint64_t*)begin; |
|
|
|
|
const uint64_t *pe = (const uint64_t*)end; |
|
|
|
|
const size_t n = pe - p; |
|
|
|
|
for (size_t i = 0; i < n; i++) { |
|
|
|
|
fprintf(fp_, "0x%016llx,", (unsigned long long)*p++); |
|
|
|
|
} |
|
|
|
|
fprintf(fp_, "\n"); |
|
|
|
|
} |
|
|
|
|
}; |
|
|
|
|
template<class T> |
|
|
|
|
void setFuncInfo(DumpCode& prof, const char *suf, const char *name, const T& begin, const uint8_t* end) |
|
|
|
|
{ |
|
|
|
|
if (suf == 0) suf = ""; |
|
|
|
|
const uint8_t*p = (const uint8_t*)begin; |
|
|
|
|
prof.set(std::string("mclx_") + suf + name, p, end - p); |
|
|
|
|
} |
|
|
|
|
#else |
|
|
|
|
template<class T> |
|
|
|
|
void setFuncInfo(Xbyak::util::Profiler& prof, const char *suf, const char *name, const T& begin, const uint8_t* end) |
|
|
|
|
{ |
|
|
|
|
if (suf == 0) suf = ""; |
|
|
|
|
const uint8_t*p = (const uint8_t*)begin; |
|
|
|
|
prof.set((std::string("mclx_") + suf + name).c_str(), p, end - p); |
|
|
|
|
} |
|
|
|
|
#endif |
|
|
|
|
|
|
|
|
|
namespace fp_gen_local { |
|
|
|
|
|
|
|
|
|
class MemReg { |
|
|
|
@ -203,7 +257,11 @@ struct FpGenerator : Xbyak::CodeGenerator { |
|
|
|
|
int pn_; |
|
|
|
|
int FpByte_; |
|
|
|
|
bool isFullBit_; |
|
|
|
|
#ifdef MCL_DUMP_JIT |
|
|
|
|
DumpCode prof_; |
|
|
|
|
#else |
|
|
|
|
Xbyak::util::Profiler prof_; |
|
|
|
|
#endif |
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
@param op [in] ; use op.p, op.N, op.isFullBit |
|
|
|
@ -257,19 +315,22 @@ struct FpGenerator : Xbyak::CodeGenerator { |
|
|
|
|
private: |
|
|
|
|
void init_inner(Op& op) |
|
|
|
|
{ |
|
|
|
|
const char *suf = op.xi_a ? "Fp" : "Fr"; |
|
|
|
|
op_ = &op; |
|
|
|
|
L(pL_); |
|
|
|
|
p_ = reinterpret_cast<const uint64_t*>(getCurr()); |
|
|
|
|
for (size_t i = 0; i < op.N; i++) { |
|
|
|
|
dq(op.p[i]); |
|
|
|
|
} |
|
|
|
|
#ifdef MCL_DUMP_JIT |
|
|
|
|
prof_.dumpData(p_, getCurr()); |
|
|
|
|
#endif |
|
|
|
|
rp_ = fp::getMontgomeryCoeff(p_[0]); |
|
|
|
|
pn_ = (int)op.N; |
|
|
|
|
FpByte_ = int(op.maxN * sizeof(uint64_t)); |
|
|
|
|
isFullBit_ = op.isFullBit; |
|
|
|
|
// printf("p=%p, pn_=%d, isFullBit_=%d\n", p_, pn_, isFullBit_);
|
|
|
|
|
#ifdef MCL_USE_PROF |
|
|
|
|
static char suf[] = "_0"; |
|
|
|
|
int profMode = 0; |
|
|
|
|
#ifdef XBYAK_USE_VTUNE |
|
|
|
|
profMode = 2; |
|
|
|
@ -281,94 +342,116 @@ private: |
|
|
|
|
if (profMode) { |
|
|
|
|
prof_.init(profMode); |
|
|
|
|
prof_.setStartAddr(getCurr()); |
|
|
|
|
prof_.setNameSuffix(suf); |
|
|
|
|
suf[1]++; |
|
|
|
|
} |
|
|
|
|
#else |
|
|
|
|
(void)suf; |
|
|
|
|
#endif |
|
|
|
|
|
|
|
|
|
align(16); |
|
|
|
|
op.fp_addPre = gen_addSubPre(true, pn_); |
|
|
|
|
prof_.set("Fp_addPre", getCurr()); |
|
|
|
|
setFuncInfo(prof_, suf, "_addPre", op.fp_addPre, getCurr()); |
|
|
|
|
|
|
|
|
|
align(16); |
|
|
|
|
op.fp_subPre = gen_addSubPre(false, pn_); |
|
|
|
|
prof_.set("Fp_subPre", getCurr()); |
|
|
|
|
setFuncInfo(prof_, suf, "_subPre", op.fp_subPre, getCurr()); |
|
|
|
|
|
|
|
|
|
align(16); |
|
|
|
|
op.fp_addA_ = gen_fp_add(); |
|
|
|
|
prof_.set("Fp_add", getCurr()); |
|
|
|
|
setFuncInfo(prof_, suf, "_add", op.fp_addA_, getCurr()); |
|
|
|
|
|
|
|
|
|
align(16); |
|
|
|
|
op.fp_subA_ = gen_fp_sub(); |
|
|
|
|
prof_.set("Fp_sub", getCurr()); |
|
|
|
|
setFuncInfo(prof_, suf, "_sub", op.fp_subA_, getCurr()); |
|
|
|
|
|
|
|
|
|
align(16); |
|
|
|
|
op.fp_shr1 = gen_shr1(); |
|
|
|
|
prof_.set("Fp_shr1", getCurr()); |
|
|
|
|
setFuncInfo(prof_, suf, "_shr1", op.fp_shr1, getCurr()); |
|
|
|
|
|
|
|
|
|
align(16); |
|
|
|
|
op.fp_negA_ = gen_fp_neg(); |
|
|
|
|
prof_.set("Fp_neg", getCurr()); |
|
|
|
|
|
|
|
|
|
op.fpDbl_addA_ = gen_fpDbl_add(); |
|
|
|
|
prof_.set("FpDbl_add", getCurr()); |
|
|
|
|
|
|
|
|
|
op.fpDbl_subA_ = gen_fpDbl_sub(); |
|
|
|
|
prof_.set("FpDbl_sub", getCurr()); |
|
|
|
|
|
|
|
|
|
op.fpDbl_addPre = gen_addSubPre(true, pn_ * 2); |
|
|
|
|
prof_.set("FpDbl_addPre", getCurr()); |
|
|
|
|
|
|
|
|
|
op.fpDbl_subPre = gen_addSubPre(false, pn_ * 2); |
|
|
|
|
prof_.set("FpDbl_subPre", getCurr()); |
|
|
|
|
|
|
|
|
|
op.fpDbl_mulPreA_ = gen_fpDbl_mulPre(); |
|
|
|
|
prof_.set("FpDbl_mulPre", getCurr()); |
|
|
|
|
|
|
|
|
|
op.fpDbl_sqrPreA_ = gen_fpDbl_sqrPre(); |
|
|
|
|
prof_.set("FpDbl_sqrPre", getCurr()); |
|
|
|
|
|
|
|
|
|
setFuncInfo(prof_, suf, "_neg", op.fp_negA_, getCurr()); |
|
|
|
|
align(16); |
|
|
|
|
op.fpDbl_modA_ = gen_fpDbl_mod(op); |
|
|
|
|
prof_.set("FpDbl_mod", getCurr()); |
|
|
|
|
|
|
|
|
|
setFuncInfo(prof_, suf, "Dbl_mod", op.fpDbl_modA_, getCurr()); |
|
|
|
|
align(16); |
|
|
|
|
op.fp_mulA_ = gen_mul(); |
|
|
|
|
prof_.set("Fp_mul", getCurr()); |
|
|
|
|
setFuncInfo(prof_, suf, "_mul", op.fp_mulA_, getCurr()); |
|
|
|
|
|
|
|
|
|
if (op.fp_mulA_) { |
|
|
|
|
op.fp_mul = fp::func_ptr_cast<void4u>(op.fp_mulA_); // used in toMont/fromMont
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
align(16); |
|
|
|
|
op.fp_sqrA_ = gen_sqr(); |
|
|
|
|
prof_.set("Fp_sqr", getCurr()); |
|
|
|
|
setFuncInfo(prof_, suf, "_sqr", op.fp_sqrA_, getCurr()); |
|
|
|
|
|
|
|
|
|
if (op.primeMode != PM_NIST_P192 && op.N <= 4) { // support general op.N but not fast for op.N > 4
|
|
|
|
|
align(16); |
|
|
|
|
op.fp_preInv = getCurr<int2u>(); |
|
|
|
|
gen_preInv(); |
|
|
|
|
prof_.set("preInv", getCurr()); |
|
|
|
|
setFuncInfo(prof_, suf, "_preInv", op.fp_preInv, getCurr()); |
|
|
|
|
} |
|
|
|
|
if (op.xi_a == 0) return; // Fp2 is not used
|
|
|
|
|
align(16); |
|
|
|
|
op.fpDbl_addA_ = gen_fpDbl_add(); |
|
|
|
|
setFuncInfo(prof_, suf, "Dbl_add", op.fpDbl_addA_, getCurr()); |
|
|
|
|
|
|
|
|
|
align(16); |
|
|
|
|
op.fpDbl_subA_ = gen_fpDbl_sub(); |
|
|
|
|
setFuncInfo(prof_, suf, "Dbl_sub", op.fpDbl_subA_, getCurr()); |
|
|
|
|
|
|
|
|
|
align(16); |
|
|
|
|
op.fpDbl_addPre = gen_addSubPre(true, pn_ * 2); |
|
|
|
|
setFuncInfo(prof_, suf, "Dbl_addPre", op.fpDbl_addPre, getCurr()); |
|
|
|
|
|
|
|
|
|
align(16); |
|
|
|
|
op.fpDbl_subPre = gen_addSubPre(false, pn_ * 2); |
|
|
|
|
setFuncInfo(prof_, suf, "Dbl_subPre", op.fpDbl_subPre, getCurr()); |
|
|
|
|
|
|
|
|
|
align(16); |
|
|
|
|
op.fpDbl_mulPreA_ = gen_fpDbl_mulPre(); |
|
|
|
|
setFuncInfo(prof_, suf, "Dbl_mulPre", op.fpDbl_mulPreA_, getCurr()); |
|
|
|
|
|
|
|
|
|
align(16); |
|
|
|
|
op.fpDbl_sqrPreA_ = gen_fpDbl_sqrPre(); |
|
|
|
|
setFuncInfo(prof_, suf, "Dbl_sqrPre", op.fpDbl_sqrPreA_, getCurr()); |
|
|
|
|
|
|
|
|
|
align(16); |
|
|
|
|
op.fp2_addA_ = gen_fp2_add(); |
|
|
|
|
prof_.set("Fp2_add", getCurr()); |
|
|
|
|
setFuncInfo(prof_, suf, "2_add", op.fp2_addA_, getCurr()); |
|
|
|
|
|
|
|
|
|
align(16); |
|
|
|
|
op.fp2_subA_ = gen_fp2_sub(); |
|
|
|
|
prof_.set("Fp2_sub", getCurr()); |
|
|
|
|
setFuncInfo(prof_, suf, "2_sub", op.fp2_subA_, getCurr()); |
|
|
|
|
|
|
|
|
|
align(16); |
|
|
|
|
op.fp2_negA_ = gen_fp2_neg(); |
|
|
|
|
prof_.set("Fp2_neg", getCurr()); |
|
|
|
|
setFuncInfo(prof_, suf, "2_neg", op.fp2_negA_, getCurr()); |
|
|
|
|
|
|
|
|
|
op.fp2_mulNF = 0; |
|
|
|
|
align(16); |
|
|
|
|
op.fp2Dbl_mulPreA_ = gen_fp2Dbl_mulPre(); |
|
|
|
|
prof_.set("Fp2Dbl_mulPre", getCurr()); |
|
|
|
|
if (op.fp2Dbl_mulPreA_) setFuncInfo(prof_, suf, "2Dbl_mulPre", op.fp2Dbl_mulPreA_, getCurr()); |
|
|
|
|
|
|
|
|
|
align(16); |
|
|
|
|
op.fp2Dbl_sqrPreA_ = gen_fp2Dbl_sqrPre(); |
|
|
|
|
prof_.set("Fp2Dbl_sqrPre", getCurr()); |
|
|
|
|
if (op.fp2Dbl_sqrPreA_) setFuncInfo(prof_, suf, "2Dbl_sqrPre", op.fp2Dbl_sqrPreA_, getCurr()); |
|
|
|
|
|
|
|
|
|
align(16); |
|
|
|
|
op.fp2_mulA_ = gen_fp2_mul(); |
|
|
|
|
prof_.set("Fp2_mul", getCurr()); |
|
|
|
|
setFuncInfo(prof_, suf, "2_mul", op.fp2_mulA_, getCurr()); |
|
|
|
|
|
|
|
|
|
align(16); |
|
|
|
|
op.fp2_sqrA_ = gen_fp2_sqr(); |
|
|
|
|
prof_.set("Fp2_sqr", getCurr()); |
|
|
|
|
setFuncInfo(prof_, suf, "2_sqr", op.fp2_sqrA_, getCurr()); |
|
|
|
|
|
|
|
|
|
align(16); |
|
|
|
|
op.fp2_mul_xiA_ = gen_fp2_mul_xi(); |
|
|
|
|
prof_.set("Fp2_mul_xi", getCurr()); |
|
|
|
|
setFuncInfo(prof_, suf, "2_mul_xi", op.fp2_mul_xiA_, getCurr()); |
|
|
|
|
} |
|
|
|
|
u3u gen_addSubPre(bool isAdd, int n) |
|
|
|
|
{ |
|
|
|
|
// if (isFullBit_) return 0;
|
|
|
|
|
align(16); |
|
|
|
|
u3u func = getCurr<u3u>(); |
|
|
|
|
StackFrame sf(this, 3); |
|
|
|
|
if (isAdd) { |
|
|
|
@ -429,7 +512,7 @@ private: |
|
|
|
|
} |
|
|
|
|
jmp(exit); |
|
|
|
|
L(nonZero); |
|
|
|
|
mov(rax, pL_); |
|
|
|
|
lea(rax, ptr[rip+pL_]); |
|
|
|
|
for (size_t i = 0; i < t.size(); i++) { |
|
|
|
|
mov(rdx, ptr [rax + i * 8]); |
|
|
|
|
if (i == 0) { |
|
|
|
@ -557,7 +640,7 @@ private: |
|
|
|
|
mov(*fullReg, 0); |
|
|
|
|
adc(*fullReg, 0); |
|
|
|
|
} |
|
|
|
|
mov(rax, pL_); |
|
|
|
|
lea(rax, ptr[rip+pL_]); |
|
|
|
|
sub_rm(p1, rax); |
|
|
|
|
if (fullReg) { |
|
|
|
|
sbb(*fullReg, 0); |
|
|
|
@ -577,7 +660,7 @@ private: |
|
|
|
|
const Pack& p1 = t.sub(pn_, pn_); |
|
|
|
|
load_rm(p0, px); |
|
|
|
|
sub_rm(p0, py, withCarry); |
|
|
|
|
mov(rax, pL_); |
|
|
|
|
lea(rax, ptr[rip+pL_]); |
|
|
|
|
load_rm(p1, rax); |
|
|
|
|
sbb(rax, rax); // rax = (x > y) ? 0 : -1
|
|
|
|
|
for (size_t i = 0; i < p1.size(); i++) { |
|
|
|
@ -618,7 +701,7 @@ private: |
|
|
|
|
Label exit; |
|
|
|
|
if (isFullBit_) { |
|
|
|
|
jnc("@f"); |
|
|
|
|
mov(t2[0], pL_); // t2 is not used
|
|
|
|
|
lea(t2[0], ptr[rip+pL_]); // t2[0] is not used
|
|
|
|
|
sub_rm(t1, t2[0]); |
|
|
|
|
jmp(exit); |
|
|
|
|
L("@@"); |
|
|
|
@ -648,7 +731,6 @@ private: |
|
|
|
|
} |
|
|
|
|
void3u gen_fp_add() |
|
|
|
|
{ |
|
|
|
|
align(16); |
|
|
|
|
void3u func = getCurr<void3u>(); |
|
|
|
|
if (pn_ <= 4) { |
|
|
|
|
gen_fp_add_le4(); |
|
|
|
@ -666,7 +748,7 @@ private: |
|
|
|
|
|
|
|
|
|
inLocalLabel(); |
|
|
|
|
gen_raw_add(pz, px, py, rax, pn_); |
|
|
|
|
mov(px, pL_); // destroy px
|
|
|
|
|
lea(px, ptr[rip+pL_]); |
|
|
|
|
if (isFullBit_) { |
|
|
|
|
jc(".over", jmpMode); |
|
|
|
|
} |
|
|
|
@ -696,7 +778,6 @@ private: |
|
|
|
|
} |
|
|
|
|
void3u gen_fpDbl_add() |
|
|
|
|
{ |
|
|
|
|
align(16); |
|
|
|
|
void3u func = getCurr<void3u>(); |
|
|
|
|
if (pn_ <= 4) { |
|
|
|
|
int tn = pn_ * 2 + (isFullBit_ ? 1 : 0); |
|
|
|
@ -724,7 +805,6 @@ private: |
|
|
|
|
} |
|
|
|
|
void3u gen_fpDbl_sub() |
|
|
|
|
{ |
|
|
|
|
align(16); |
|
|
|
|
void3u func = getCurr<void3u>(); |
|
|
|
|
if (pn_ <= 4) { |
|
|
|
|
int tn = pn_ * 2; |
|
|
|
@ -774,7 +854,6 @@ private: |
|
|
|
|
} |
|
|
|
|
void3u gen_fp_sub() |
|
|
|
|
{ |
|
|
|
|
align(16); |
|
|
|
|
void3u func = getCurr<void3u>(); |
|
|
|
|
if (pn_ <= 4) { |
|
|
|
|
gen_fp_sub_le4(); |
|
|
|
@ -792,14 +871,13 @@ private: |
|
|
|
|
Label exit; |
|
|
|
|
gen_raw_sub(pz, px, py, rax, pn_); |
|
|
|
|
jnc(exit, jmpMode); |
|
|
|
|
mov(px, pL_); |
|
|
|
|
lea(px, ptr[rip+pL_]); |
|
|
|
|
gen_raw_add(pz, pz, px, rax, pn_); |
|
|
|
|
L(exit); |
|
|
|
|
return func; |
|
|
|
|
} |
|
|
|
|
void2u gen_fp_neg() |
|
|
|
|
{ |
|
|
|
|
align(16); |
|
|
|
|
void2u func = getCurr<void2u>(); |
|
|
|
|
StackFrame sf(this, 2, UseRDX | pn_); |
|
|
|
|
gen_raw_neg(sf.p[0], sf.p[1], sf.t); |
|
|
|
@ -807,7 +885,6 @@ private: |
|
|
|
|
} |
|
|
|
|
void2u gen_shr1() |
|
|
|
|
{ |
|
|
|
|
align(16); |
|
|
|
|
void2u func = getCurr<void2u>(); |
|
|
|
|
const int c = 1; |
|
|
|
|
StackFrame sf(this, 2, 1); |
|
|
|
@ -828,7 +905,6 @@ private: |
|
|
|
|
} |
|
|
|
|
void3u gen_mul() |
|
|
|
|
{ |
|
|
|
|
align(16); |
|
|
|
|
void3u func = getCurr<void3u>(); |
|
|
|
|
if (op_->primeMode == PM_NIST_P192) { |
|
|
|
|
StackFrame sf(this, 3, 10 | UseRDX, 8 * 6); |
|
|
|
@ -901,7 +977,7 @@ private: |
|
|
|
|
|
|
|
|
|
mov(a, rp_); |
|
|
|
|
mul(t6); |
|
|
|
|
mov(t0, pL_); |
|
|
|
|
lea(t0, ptr[rip+pL_]); |
|
|
|
|
mov(t7, a); // q
|
|
|
|
|
|
|
|
|
|
// [d:t7:t1] = p * q
|
|
|
|
@ -970,7 +1046,7 @@ private: |
|
|
|
|
|
|
|
|
|
mov(a, rp_); |
|
|
|
|
mul(t10); |
|
|
|
|
mov(t0, pL_); |
|
|
|
|
lea(t0, ptr[rip+pL_]); |
|
|
|
|
mov(t7, a); // q
|
|
|
|
|
|
|
|
|
|
// [d:t7:t2:t1] = p * q
|
|
|
|
@ -1050,7 +1126,7 @@ private: |
|
|
|
|
|
|
|
|
|
mov(a, rp_); |
|
|
|
|
mul(z); |
|
|
|
|
mov(t0, pL_); |
|
|
|
|
lea(t0, ptr[rip+pL_]); |
|
|
|
|
mov(t7, a); // q
|
|
|
|
|
|
|
|
|
|
// [d:t7:t3:t2:t1] = p * q
|
|
|
|
@ -1141,7 +1217,6 @@ private: |
|
|
|
|
} |
|
|
|
|
void2u gen_fpDbl_mod(const fp::Op& op) |
|
|
|
|
{ |
|
|
|
|
align(16); |
|
|
|
|
void2u func = getCurr<void2u>(); |
|
|
|
|
if (op.primeMode == PM_NIST_P192) { |
|
|
|
|
StackFrame sf(this, 2, 6 | UseRDX); |
|
|
|
@ -1187,7 +1262,6 @@ private: |
|
|
|
|
} |
|
|
|
|
void2u gen_sqr() |
|
|
|
|
{ |
|
|
|
|
align(16); |
|
|
|
|
void2u func = getCurr<void2u>(); |
|
|
|
|
if (op_->primeMode == PM_NIST_P192) { |
|
|
|
|
StackFrame sf(this, 3, 10 | UseRDX, 6 * 8); |
|
|
|
@ -1308,7 +1382,7 @@ private: |
|
|
|
|
|
|
|
|
|
L(fp_mulL); |
|
|
|
|
vmovq(xm0, p0); // save p0
|
|
|
|
|
mov(p0, pL_); |
|
|
|
|
lea(p0, ptr[rip+pL_]); |
|
|
|
|
vmovq(xm1, p2); |
|
|
|
|
mov(p2, ptr [p2]); |
|
|
|
|
montgomery4_1(rp_, t0, t7, t3, t2, t1, p1, p2, p0, t4, t5, t6, t8, t9, true, xm2); |
|
|
|
@ -1404,7 +1478,7 @@ private: |
|
|
|
|
mov(a, rp_); |
|
|
|
|
mul(c[0]); // q = a
|
|
|
|
|
mov(d, a); |
|
|
|
|
mov(t1, pL_); |
|
|
|
|
lea(t1, ptr[rip+pL_]); |
|
|
|
|
// c += p * q
|
|
|
|
|
mulAdd(c, 6, t1); |
|
|
|
|
} |
|
|
|
@ -1450,7 +1524,7 @@ private: |
|
|
|
|
const Pack z = Pack(t3, t2, t1, t0, t7, t6); |
|
|
|
|
const Pack keep = Pack(rdx, rax, px, py, t8, t9); |
|
|
|
|
mov_rr(keep, z); |
|
|
|
|
mov(t5, pL_); |
|
|
|
|
lea(t5, ptr[rip+pL_]); |
|
|
|
|
sub_rm(z, t5); |
|
|
|
|
cmovc_rr(z, keep); |
|
|
|
|
store_mr(pz, z); |
|
|
|
@ -1480,7 +1554,7 @@ private: |
|
|
|
|
const Reg64& t9 = sf.t[9]; |
|
|
|
|
|
|
|
|
|
vmovq(xm0, p0); // save p0
|
|
|
|
|
mov(t7, pL_); |
|
|
|
|
lea(t7, ptr[rip+pL_]); |
|
|
|
|
mov(t9, ptr [p2]); |
|
|
|
|
// c3, c2, c1, c0, px, y, p,
|
|
|
|
|
montgomery3_1(rp_, t0, t3, t2, t1, p1, t9, t7, t4, t5, t6, t8, p0, true); |
|
|
|
@ -1526,7 +1600,7 @@ private: |
|
|
|
|
const Reg64& t9 = sf.t[9]; |
|
|
|
|
|
|
|
|
|
vmovq(xm0, pz); // save pz
|
|
|
|
|
mov(t7, pL_); |
|
|
|
|
lea(t7, ptr[rip+pL_]); |
|
|
|
|
mov(t9, ptr [px]); |
|
|
|
|
mul3x1_sqr1(px, t9, t3, t2, t1, t0); |
|
|
|
|
mov(t0, rdx); |
|
|
|
@ -2291,7 +2365,6 @@ private: |
|
|
|
|
} |
|
|
|
|
void2u gen_fpDbl_sqrPre() |
|
|
|
|
{ |
|
|
|
|
align(16); |
|
|
|
|
void2u func = getCurr<void2u>(); |
|
|
|
|
if (pn_ == 2 && useMulx_) { |
|
|
|
|
StackFrame sf(this, 2, 7 | UseRDX); |
|
|
|
@ -2332,7 +2405,6 @@ private: |
|
|
|
|
} |
|
|
|
|
void3u gen_fpDbl_mulPre() |
|
|
|
|
{ |
|
|
|
|
align(16); |
|
|
|
|
void3u func = getCurr<void3u>(); |
|
|
|
|
if (pn_ == 2 && useMulx_) { |
|
|
|
|
StackFrame sf(this, 3, 5 | UseRDX); |
|
|
|
@ -2630,7 +2702,7 @@ private: |
|
|
|
|
mov(rax, px); |
|
|
|
|
// px is free frome here
|
|
|
|
|
load_mp(vv, rax, t); // v = x
|
|
|
|
|
mov(rax, pL_); |
|
|
|
|
lea(rax, ptr[rip+pL_]); |
|
|
|
|
load_mp(uu, rax, t); // u = p_
|
|
|
|
|
// k = 0
|
|
|
|
|
xor_(rax, rax); |
|
|
|
@ -2708,7 +2780,7 @@ private: |
|
|
|
|
const Reg64& t2 = ss.getReg(0); |
|
|
|
|
const Reg64& t3 = rdx; |
|
|
|
|
|
|
|
|
|
mov(t2, pL_); |
|
|
|
|
lea(t2, ptr[rip+pL_]); |
|
|
|
|
if (isFullBit_) { |
|
|
|
|
mov(t, ptr [rTop]); |
|
|
|
|
test(t, t); |
|
|
|
@ -3373,7 +3445,6 @@ private: |
|
|
|
|
// if (pn_ != 4 && !(pn_ == 6 && useMulx_ && useAdx_)) return 0;
|
|
|
|
|
// almost same for pn_ == 6
|
|
|
|
|
if (pn_ != 4) return 0; |
|
|
|
|
align(16); |
|
|
|
|
void3u func = getCurr<void3u>(); |
|
|
|
|
|
|
|
|
|
const RegExp z = rsp + 0 * 8; |
|
|
|
@ -3438,7 +3509,6 @@ private: |
|
|
|
|
// if (pn_ != 4 && !(pn_ == 6 && useMulx_ && useAdx_)) return 0;
|
|
|
|
|
// almost same for pn_ == 6
|
|
|
|
|
if (pn_ != 4) return 0; |
|
|
|
|
align(16); |
|
|
|
|
void2u func = getCurr<void2u>(); |
|
|
|
|
// almost same for pn_ == 6
|
|
|
|
|
if (pn_ != 4) return 0; |
|
|
|
@ -3524,7 +3594,6 @@ private: |
|
|
|
|
} |
|
|
|
|
void3u gen_fp2_add() |
|
|
|
|
{ |
|
|
|
|
align(16); |
|
|
|
|
void3u func = getCurr<void3u>(); |
|
|
|
|
if (pn_ == 4 && !isFullBit_) { |
|
|
|
|
gen_fp2_add4(); |
|
|
|
@ -3538,7 +3607,6 @@ private: |
|
|
|
|
} |
|
|
|
|
void3u gen_fp2_sub() |
|
|
|
|
{ |
|
|
|
|
align(16); |
|
|
|
|
void3u func = getCurr<void3u>(); |
|
|
|
|
if (pn_ == 4 && !isFullBit_) { |
|
|
|
|
gen_fp2_sub4(); |
|
|
|
@ -3584,7 +3652,7 @@ private: |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
sub_rr(a, b); |
|
|
|
|
mov(rax, pL_); |
|
|
|
|
lea(rax, ptr[rip+pL_]); |
|
|
|
|
load_rm(b, rax); |
|
|
|
|
sbb(rax, rax); |
|
|
|
|
for (int i = 0; i < pn_; i++) { |
|
|
|
@ -3592,7 +3660,7 @@ private: |
|
|
|
|
} |
|
|
|
|
add_rr(a, b); |
|
|
|
|
store_mr(py, a); |
|
|
|
|
mov(rax, pL_); |
|
|
|
|
lea(rax, ptr[rip+pL_]); |
|
|
|
|
mov_rr(a, t); |
|
|
|
|
sub_rm(t, rax); |
|
|
|
|
cmovc_rr(t, a); |
|
|
|
@ -3610,7 +3678,7 @@ private: |
|
|
|
|
mov_rr(b, a); |
|
|
|
|
add_rm(b, px + FpByte_); |
|
|
|
|
sub_rm(a, px + FpByte_); |
|
|
|
|
mov(rax, pL_); |
|
|
|
|
lea(rax, ptr[rip+pL_]); |
|
|
|
|
jnc("@f"); |
|
|
|
|
add_rm(a, rax); |
|
|
|
|
L("@@"); |
|
|
|
@ -3624,7 +3692,6 @@ private: |
|
|
|
|
{ |
|
|
|
|
if (isFullBit_) return 0; |
|
|
|
|
if (op_->xi_a != 1) return 0; |
|
|
|
|
align(16); |
|
|
|
|
void2u func = getCurr<void2u>(); |
|
|
|
|
if (pn_ == 4) { |
|
|
|
|
gen_fp2_mul_xi4(); |
|
|
|
@ -3638,7 +3705,6 @@ private: |
|
|
|
|
} |
|
|
|
|
void2u gen_fp2_neg() |
|
|
|
|
{ |
|
|
|
|
align(16); |
|
|
|
|
void2u func = getCurr<void2u>(); |
|
|
|
|
if (pn_ <= 6) { |
|
|
|
|
StackFrame sf(this, 2, UseRDX | pn_); |
|
|
|
@ -3652,7 +3718,6 @@ private: |
|
|
|
|
{ |
|
|
|
|
if (isFullBit_) return 0; |
|
|
|
|
if (pn_ != 4 && !(pn_ == 6 && useMulx_ && useAdx_)) return 0; |
|
|
|
|
align(16); |
|
|
|
|
void3u func = getCurr<void3u>(); |
|
|
|
|
bool embedded = pn_ == 4; |
|
|
|
|
|
|
|
|
@ -3729,7 +3794,6 @@ private: |
|
|
|
|
{ |
|
|
|
|
if (isFullBit_) return 0; |
|
|
|
|
if (pn_ != 4 && !(pn_ == 6 && useMulx_ && useAdx_)) return 0; |
|
|
|
|
align(16); |
|
|
|
|
void2u func = getCurr<void2u>(); |
|
|
|
|
|
|
|
|
|
const RegExp y = rsp + 0 * 8; |
|
|
|
@ -3789,7 +3853,7 @@ private: |
|
|
|
|
mov(ptr [(RegExp)t2 + i * 8], rax); |
|
|
|
|
} |
|
|
|
|
// t3 = a + p - b
|
|
|
|
|
mov(rax, pL_); |
|
|
|
|
lea(rax, ptr[rip+pL_]); |
|
|
|
|
add_rm(a, rax); |
|
|
|
|
sub_rr(a, b); |
|
|
|
|
store_mr(t3, a); |
|
|
|
|