diff --git a/src/fp_generator.hpp b/src/fp_generator.hpp index 9e42414..1907e10 100644 --- a/src/fp_generator.hpp +++ b/src/fp_generator.hpp @@ -836,6 +836,7 @@ private: gen_montMul4(); return func; } + return 0; if (pn_ == 6 && useMulx_ && useAdx_) { // gen_montMul6(p_, rp_); StackFrame sf(this, 3, 10 | UseRDX, (1 + 12) * 8); @@ -1183,6 +1184,27 @@ private: gen_montSqr3(); return func; } + if (pn_ == 4 && useMulx_) { +#if 1 + // sqr(y, x) = mul(y, x, x) +#ifdef XBYAK64_WIN + mov(r8, rdx); +#else + mov(rdx, rsi); +#endif + jmp((const void*)op_->fp_mulA_); +#else // (sqrPre + mod) is slower than mul + StackFrame sf(this, 3, 10 | UseRDX, 8 * 8); + Pack t = sf.t; + t.append(sf.p[2]); + sqrPre4(rsp, sf.p[1], t); + mov(gp0, sf.p[0]); + mov(gp1, rsp); + call(fpDbl_modL); +#endif + return func; + } +return 0; if (pn_ == 6 && useMulx_ && useAdx_) { StackFrame sf(this, 3, 10 | UseRDX, (1 + 12) * 8); mov(ptr[rsp + 12 * 8], gp0); @@ -1194,26 +1216,7 @@ private: call(fpDbl_modL); return func; } -#if 0 // (sqrPre + mod) is slower than mul - if (pn_ == 4 && useMulx_) { - StackFrame sf(this, 3, 10 | UseRDX, 8 * 8); - Pack t = sf.t; - t.append(sf.p[2]); - sqrPre4(rsp, sf.p[1], t); - mov(gp0, sf.p[0]); - mov(gp1, rsp); - call(fpDbl_modL); - return func; - } -#endif - // sqr(y, x) = mul(y, x, x) -#ifdef XBYAK64_WIN - mov(r8, rdx); -#else - mov(rdx, rsi); -#endif - jmp((const void*)op_->fp_mulA_); - return func; + return 0; } /* input (pz[], px[], py[])