diff --git a/common.mk b/common.mk index 5c749e1..8c4dc63 100644 --- a/common.mk +++ b/common.mk @@ -79,6 +79,7 @@ else ifeq ($(MARCH),) ifeq ($(INTEL),1) # CFLAGS_OPT+=-march=native + CFLAGS_OPT+=-mavx endif else CFLAGS_OPT+=$(MARCH) diff --git a/src/fp_generator.hpp b/src/fp_generator.hpp index c62ecab..4d45725 100644 --- a/src/fp_generator.hpp +++ b/src/fp_generator.hpp @@ -270,6 +270,7 @@ private: void init_inner(Op& op) { op_ = &op; + if (!cpu_.has(Xbyak::util::Cpu::tAVX)) return; /* first 4096-byte is data area remain is code area @@ -829,10 +830,10 @@ private: rsp [0, ..12 * 8) ; mul(x, y) */ - movq(xm3, gp0); + vmovq(xm3, gp0); mov(gp0, rsp); call(mulPreL); // gp0, x, y - movq(gp0, xm3); + vmovq(gp0, xm3); mov(gp1, rsp); call(fpDbl_modL); #endif @@ -1018,7 +1019,7 @@ private: const Reg64& a = rax; const Reg64& d = rdx; - movq(xm0, z); + vmovq(xm0, z); mov(z, ptr [xy + 8 * 0]); mov(a, rp_); @@ -1045,7 +1046,7 @@ private: if (isFullBit_) { mov(t5, 0); adc(t5, 0); - movq(xm2, t5); + vmovq(xm2, t5); } // free z, t0, t1, t5, t6, xy @@ -1054,18 +1055,18 @@ private: mul(t2); mov(z, a); // q - movq(xm1, t10); + vmovq(xm1, t10); // [d:z:t5:t6:xy] = p * q mul4x1(t0, z, t1, t5, t6, xy, t10); - movq(t10, xm1); + vmovq(t10, xm1); add_rr(Pack(t8, t4, t7, t3, t2), Pack(d, z, t5, t6, xy)); adc(t9, 0); adc(t10, 0); // [t10:t9:t8:t4:t7:t3] if (isFullBit_) { - movq(t5, xm2); + vmovq(t5, xm2); adc(t5, 0); - movq(xm2, t5); + vmovq(xm2, t5); } // free z, t0, t1, t2, t5, t6, xy @@ -1080,7 +1081,7 @@ private: add_rr(Pack(t9, t8, t4, t7, t3), Pack(d, z, t5, xy, t6)); adc(t10, 0); // c' = [t10:t9:t8:t4:t7] if (isFullBit_) { - movq(t3, xm2); + vmovq(t3, xm2); adc(t3, 0); } @@ -1109,7 +1110,7 @@ private: cmovc(t9, t2); cmovc(t10, t6); - movq(z, xm0); + vmovq(z, xm0); store_mr(z, Pack(t10, t9, t8, t4)); } void* gen_fpDbl_mod(const fp::Op& op) @@ -1203,13 +1204,13 @@ private: [6 * 8, (12 + 6) * 8) ; sqrPre(x, x) [0..6 * 8) ; stack for sqrPre6 */ - movq(xm3, gp0); + vmovq(xm3, gp0); Pack t = sf.t; t.append(sf.p[2]); // sqrPre6 uses 6 * 8 bytes stack sqrPre6(rsp + 6 * 8, sf.p[1], t); mov(gp0, ptr[rsp + (12 + 6) * 8]); - movq(gp0, xm3); + vmovq(gp0, xm3); lea(gp1, ptr[rsp + 6 * 8]); call(fpDbl_modL); return func; @@ -1280,21 +1281,21 @@ private: const Reg64& t9 = sf.t[9]; L(fp_mulL); - movq(xm0, p0); // save p0 + vmovq(xm0, p0); // save p0 mov(p0, pL_); - movq(xm1, p2); + vmovq(xm1, p2); mov(p2, ptr [p2]); montgomery4_1(rp_, t0, t7, t3, t2, t1, p1, p2, p0, t4, t5, t6, t8, t9, true, xm2); - movq(p2, xm1); + vmovq(p2, xm1); mov(p2, ptr [p2 + 8]); montgomery4_1(rp_, t1, t0, t7, t3, t2, p1, p2, p0, t4, t5, t6, t8, t9, false, xm2); - movq(p2, xm1); + vmovq(p2, xm1); mov(p2, ptr [p2 + 16]); montgomery4_1(rp_, t2, t1, t0, t7, t3, p1, p2, p0, t4, t5, t6, t8, t9, false, xm2); - movq(p2, xm1); + vmovq(p2, xm1); mov(p2, ptr [p2 + 24]); montgomery4_1(rp_, t3, t2, t1, t0, t7, p1, p2, p0, t4, t5, t6, t8, t9, false, xm2); // [t7:t3:t2:t1:t0] @@ -1310,7 +1311,7 @@ private: cmovc(t2, t6); cmovc(t3, rdx); - movq(p0, xm0); // load p0 + vmovq(p0, xm0); // load p0 store_mr(p0, Pack(t3, t2, t1, t0)); ret(); } @@ -1452,7 +1453,7 @@ private: const Reg64& t8 = sf.t[8]; const Reg64& t9 = sf.t[9]; - movq(xm0, p0); // save p0 + vmovq(xm0, p0); // save p0 mov(t7, pL_); mov(t9, ptr [p2]); // c3, c2, c1, c0, px, y, p, @@ -1472,7 +1473,7 @@ private: cmovc(t0, t4); cmovc(t1, t5); cmovc(t2, t6); - movq(p0, xm0); + vmovq(p0, xm0); store_mr(p0, Pack(t2, t1, t0)); } /* @@ -1498,7 +1499,7 @@ private: const Reg64& t8 = sf.t[8]; const Reg64& t9 = sf.t[9]; - movq(xm0, pz); // save pz + vmovq(xm0, pz); // save pz mov(t7, pL_); mov(t9, ptr [px]); mul3x1_sqr1(px, t9, t3, t2, t1, t0); @@ -1526,7 +1527,7 @@ private: cmovc(t3, t4); cmovc(t0, t5); cmovc(t2, t6); - movq(pz, xm0); + vmovq(pz, xm0); store_mr(pz, Pack(t2, t0, t3)); } /* @@ -1891,7 +1892,7 @@ private: sqr2(t3, t2, t1, t0, t9, t8, t7, t6); // [t3:t2:t1:t0] = b^2 store_mr(py, Pack(t1, t0)); - movq(xm0, t2); + vmovq(xm0, t2); mul2x2(px, px + 2 * 8, t6, t5, t4, t1, t0); // [t5:t4:t1:t0] = ab xor_(t6, t6); @@ -1912,7 +1913,7 @@ private: mulx(d, t8, t8); // [d:t8] = t8^2 add_rr(Pack(d, t8, t10), Pack(a, t7, t2)); // [d:t8:t10:t9] = [t8:t7]^2 - movq(t2, xm0); + vmovq(t2, xm0); add_rr(Pack(t8, t10, t9, t3, t2), Pack(t6, t5, t4, t1, t0)); adc(d, 0); store_mr(py + 2 * 8, Pack(d, t8, t10, t9, t3, t2)); @@ -1992,11 +1993,11 @@ private: mul2x2(px + 8 * 0, py + 8 * 0, t9, t8, t7, t6, t5); store_mr(pz, Pack(t6, t5)); // [t8:t7] - movq(xm0, t7); - movq(xm1, t8); + vmovq(xm0, t7); + vmovq(xm1, t8); mul2x2(px + 8 * 2, py + 8 * 2, t8, t7, t9, t6, t5); - movq(a, xm0); - movq(d, xm1); + vmovq(a, xm0); + vmovq(d, xm1); add_rr(Pack(t4, t3, t2, t1, t0), Pack(t9, t6, t5, d, a)); adc(t7, 0); store_mr(pz + 8 * 2, Pack(t7, t4, t3, t2, t1, t0)); @@ -2094,19 +2095,19 @@ private: add_rm(Pack(t2, t1, t0), px + 3 * 8); // a + b adc(a, 0); store_mr(pz, Pack(t2, t1, t0)); - movq(xm0, a); // carry1 + vmovq(xm0, a); // carry1 xor_(a, a); load_rm(Pack(t2, t1, t0), py); // d add_rm(Pack(t2, t1, t0), py + 3 * 8); // c + d adc(a, 0); store_mr(pz + 3 * 8, Pack(t2, t1, t0)); - movq(xm1, a); // carry2 + vmovq(xm1, a); // carry2 mulPre3(rsp + abcdPos, pz, pz + 3 * 8, t); // (a+b)(c+d) - movq(a, xm0); - movq(d, xm1); + vmovq(a, xm0); + vmovq(d, xm1); mov(t3, a); and_(t3, d); // t3 = carry1 & carry2 Label doNothing; @@ -2126,7 +2127,7 @@ private: L("@@"); store_mr(rsp + abcdPos + 3 * 8, Pack(t2, t1, t0)); L(doNothing); - movq(xm0, t3); // save new carry + vmovq(xm0, t3); // save new carry mov(gp0, ptr [rsp + zPos]); @@ -2140,7 +2141,7 @@ private: mulPre3(gp0 + 6 * 8, gp1 + 3 * 8, gp2 + 3 * 8, t); // [rsp + 6 * 8] <- ac mov(pz, ptr[rsp + zPos]); - movq(d, xm0); + vmovq(d, xm0); for (int i = 0; i < 6; i++) { mov(a, ptr[pz + (3 + i) * 8]); if (i == 0) { @@ -2197,7 +2198,7 @@ private: const Reg64& a = rax; const Reg64& d = rdx; - movq(xm0, z); + vmovq(xm0, z); mov(z, ptr [xy + 0 * 8]); mov(a, rp_); mul(z); @@ -2214,15 +2215,15 @@ private: // z = [t1:t0:t10:t9:t8:t7:t6:t5:t4:t3:t2] mov(a, rp_); mul(t2); - movq(xm1, t0); // save + vmovq(xm1, t0); // save lea(t0, ptr [rip + pL_]); mov(d, a); - movq(xm2, t10); + vmovq(xm2, t10); mulPackAddShr(Pack(t8, t7, t6, t5, t4, t3, t2), t0, t10); - movq(t10, xm2); + vmovq(t10, xm2); adc(t9, rax); adc(t10, rax); - movq(t0, xm1); // load + vmovq(t0, xm1); // load adc(t0, rax); adc(t1, rax); // z = [t1:t0:t10:t9:t8:t7:t6:t5:t4:t3] @@ -2230,9 +2231,9 @@ private: mul(t3); lea(t2, ptr [rip + pL_]); mov(d, a); - movq(xm2, t10); + vmovq(xm2, t10); mulPackAddShr(Pack(t9, t8, t7, t6, t5, t4, t3), t2, t10); - movq(t10, xm2); + vmovq(t10, xm2); adc(t10, rax); adc(t0, rax); adc(t1, rax); @@ -2263,7 +2264,7 @@ private: mov_rr(keep, zp); sub_rm(zp, t2); // z -= p cmovc_rr(zp, keep); - movq(z, xm0); + vmovq(z, xm0); store_mr(z, zp); } void* gen_fpDbl_sqrPre(const fp::Op&/* op */) @@ -2553,10 +2554,10 @@ private: { if (n >= 10) exit(1); static uint64_t buf[10]; - movq(xm0, rax); + vmovq(xm0, rax); mov(rax, (size_t)buf); store_mp(rax, mp, t); - movq(rax, xm0); + vmovq(rax, xm0); push(rax); mov(rax, (size_t)buf); debug_put(rax, n); @@ -3360,7 +3361,7 @@ private: mul4x1(px, y, t3, t2, t1, t0, t4); // [rdx:y:t2:t1:t0] = px[3..0] * y if (isFullBit_) { - movq(xt, px); + vmovq(xt, px); xor_(px, px); } add_rr(Pack(c4, y, c2, c1, c0), Pack(rdx, c3, t2, t1, t0)); @@ -3384,7 +3385,7 @@ private: adc(c0, 0); } else { adc(c0, px); - movq(px, xt); + vmovq(px, xt); } } } @@ -3517,9 +3518,9 @@ private: Pack t2 = sf.t.sub(6); t2.append(rax); t2.append(px); // destory after used - movq(xm0, px); + vmovq(xm0, px); gen_raw_fp_add6(pz, px, py, 0, t1, t2, false); - movq(px, xm0); + vmovq(px, xm0); gen_raw_fp_add6(pz, px, py, FpByte_, t1, t2, false); } void3u gen_fp2_add()