From 64be787592b97467312b70ccc03f259d9b192c96 Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Mon, 8 Feb 2021 14:20:18 +0900 Subject: [PATCH 1/4] reduce one wrapper of mulPre for wasm --- include/mcl/fp_tower.hpp | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/include/mcl/fp_tower.hpp b/include/mcl/fp_tower.hpp index 4802038..b39f7b3 100644 --- a/include/mcl/fp_tower.hpp +++ b/include/mcl/fp_tower.hpp @@ -113,24 +113,25 @@ public: static void (*mod)(Fp& z, const FpDblT& xy); static void (*addPre)(FpDblT& z, const FpDblT& x, const FpDblT& y); static void (*subPre)(FpDblT& z, const FpDblT& x, const FpDblT& y); + static void (*mulPre)(FpDblT& xy, const Fp& x, const Fp& y); static void addC(FpDblT& z, const FpDblT& x, const FpDblT& y) { Fp::op_.fpDbl_add(z.v_, x.v_, y.v_, Fp::op_.p); } static void subC(FpDblT& z, const FpDblT& x, const FpDblT& y) { Fp::op_.fpDbl_sub(z.v_, x.v_, y.v_, Fp::op_.p); } static void modC(Fp& z, const FpDblT& xy) { Fp::op_.fpDbl_mod(z.v_, xy.v_, Fp::op_.p); } static void addPreC(FpDblT& z, const FpDblT& x, const FpDblT& y) { Fp::op_.fpDbl_addPre(z.v_, x.v_, y.v_); } static void subPreC(FpDblT& z, const FpDblT& x, const FpDblT& y) { Fp::op_.fpDbl_subPre(z.v_, x.v_, y.v_); } + static void mulPreC(FpDblT& xy, const Fp& x, const Fp& y) { Fp::op_.fpDbl_mulPre(xy.v_, x.v_, y.v_); } #else static void add(FpDblT& z, const FpDblT& x, const FpDblT& y) { Fp::op_.fpDbl_add(z.v_, x.v_, y.v_, Fp::op_.p); } static void sub(FpDblT& z, const FpDblT& x, const FpDblT& y) { Fp::op_.fpDbl_sub(z.v_, x.v_, y.v_, Fp::op_.p); } static void mod(Fp& z, const FpDblT& xy) { Fp::op_.fpDbl_mod(z.v_, xy.v_, Fp::op_.p); } static void addPre(FpDblT& z, const FpDblT& x, const FpDblT& y) { Fp::op_.fpDbl_addPre(z.v_, x.v_, y.v_); } static void subPre(FpDblT& z, const FpDblT& x, const FpDblT& y) { Fp::op_.fpDbl_subPre(z.v_, x.v_, y.v_); } + static void mulPre(FpDblT& xy, const Fp& x, const Fp& y) { Fp::op_.fpDbl_mulPre(xy.v_, x.v_, y.v_); } #endif - static void mulPreC(FpDblT& xy, const Fp& x, const Fp& y) { Fp::op_.fpDbl_mulPre(xy.v_, x.v_, y.v_); } static void sqrPreC(FpDblT& xx, const Fp& x) { Fp::op_.fpDbl_sqrPre(xx.v_, x.v_); } /* mul(z, x, y) = mulPre(xy, x, y) + mod(z, xy) */ - static void (*mulPre)(FpDblT& xy, const Fp& x, const Fp& y); static void (*sqrPre)(FpDblT& xx, const Fp& x); static void mulUnit(FpDblT& z, const FpDblT& x, Unit y) { @@ -151,12 +152,9 @@ public: if (addPre == 0) addPre = addPreC; subPre = fp::func_ptr_cast(op.fpDbl_subPre); if (subPre == 0) subPre = subPreC; + mulPre = fp::func_ptr_cast(op.fpDbl_mulPreA_); + if (mulPre == 0) mulPre = mulPreC; #endif - if (op.fpDbl_mulPreA_) { - mulPre = fp::func_ptr_cast(op.fpDbl_mulPreA_); - } else { - mulPre = mulPreC; - } if (op.fpDbl_sqrPreA_) { sqrPre = fp::func_ptr_cast(op.fpDbl_sqrPreA_); } else { @@ -173,8 +171,8 @@ template void (*FpDblT::sub)(FpDblT&, const FpDblT&, const FpDblT& template void (*FpDblT::mod)(Fp&, const FpDblT&); template void (*FpDblT::addPre)(FpDblT&, const FpDblT&, const FpDblT&); template void (*FpDblT::subPre)(FpDblT&, const FpDblT&, const FpDblT&); -#endif template void (*FpDblT::mulPre)(FpDblT&, const Fp&, const Fp&); +#endif template void (*FpDblT::sqrPre)(FpDblT&, const Fp&); template struct Fp12T; From ebbb5cf6cc96b89ff76eb996d2a43d3e2d5a3668 Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Mon, 8 Feb 2021 14:47:48 +0900 Subject: [PATCH 2/4] remove fpDbl_sqrPreA_ --- include/mcl/fp_tower.hpp | 7 +------ include/mcl/op.hpp | 2 -- src/fp_generator.hpp | 4 ++-- src/fp_static_code.hpp | 2 +- 4 files changed, 4 insertions(+), 11 deletions(-) diff --git a/include/mcl/fp_tower.hpp b/include/mcl/fp_tower.hpp index b39f7b3..65aedd7 100644 --- a/include/mcl/fp_tower.hpp +++ b/include/mcl/fp_tower.hpp @@ -128,7 +128,6 @@ public: static void subPre(FpDblT& z, const FpDblT& x, const FpDblT& y) { Fp::op_.fpDbl_subPre(z.v_, x.v_, y.v_); } static void mulPre(FpDblT& xy, const Fp& x, const Fp& y) { Fp::op_.fpDbl_mulPre(xy.v_, x.v_, y.v_); } #endif - static void sqrPreC(FpDblT& xx, const Fp& x) { Fp::op_.fpDbl_sqrPre(xx.v_, x.v_); } /* mul(z, x, y) = mulPre(xy, x, y) + mod(z, xy) */ @@ -155,11 +154,7 @@ public: mulPre = fp::func_ptr_cast(op.fpDbl_mulPreA_); if (mulPre == 0) mulPre = mulPreC; #endif - if (op.fpDbl_sqrPreA_) { - sqrPre = fp::func_ptr_cast(op.fpDbl_sqrPreA_); - } else { - sqrPre = sqrPreC; - } + sqrPre = fp::func_ptr_cast(op.fpDbl_sqrPre); } void operator+=(const FpDblT& x) { add(*this, *this, x); } void operator-=(const FpDblT& x) { sub(*this, *this, x); } diff --git a/include/mcl/op.hpp b/include/mcl/op.hpp index 29ca9f8..b8c1dbe 100644 --- a/include/mcl/op.hpp +++ b/include/mcl/op.hpp @@ -222,7 +222,6 @@ struct Op { void3u fpDbl_addA_; void3u fpDbl_subA_; void3u fpDbl_mulPreA_; - void2u fpDbl_sqrPreA_; void2u fpDbl_modA_; void3u fp2Dbl_mulPreA_; void2u fp2Dbl_sqrPreA_; @@ -309,7 +308,6 @@ struct Op { fpDbl_addA_ = 0; fpDbl_subA_ = 0; fpDbl_mulPreA_ = 0; - fpDbl_sqrPreA_ = 0; fpDbl_modA_ = 0; fp2Dbl_mulPreA_ = 0; fp2Dbl_sqrPreA_ = 0; diff --git a/src/fp_generator.hpp b/src/fp_generator.hpp index 3227d93..5ab7f9c 100644 --- a/src/fp_generator.hpp +++ b/src/fp_generator.hpp @@ -425,8 +425,8 @@ private: setFuncInfo(prof_, suf, "Dbl_mulPre", op.fpDbl_mulPreA_, getCurr()); align(16); - op.fpDbl_sqrPreA_ = gen_fpDbl_sqrPre(); - setFuncInfo(prof_, suf, "Dbl_sqrPre", op.fpDbl_sqrPreA_, getCurr()); + op.fpDbl_sqrPre = gen_fpDbl_sqrPre(); + setFuncInfo(prof_, suf, "Dbl_sqrPre", op.fpDbl_sqrPre, getCurr()); align(16); op.fp2_addA_ = gen_fp2_add(); diff --git a/src/fp_static_code.hpp b/src/fp_static_code.hpp index 09d4d01..6584410 100644 --- a/src/fp_static_code.hpp +++ b/src/fp_static_code.hpp @@ -66,7 +66,7 @@ void setStaticCode(mcl::fp::Op& op) op.fpDbl_addPre = mclx_FpDbl_addPre; op.fpDbl_subPre = mclx_FpDbl_subPre; op.fpDbl_mulPreA_ = mclx_FpDbl_mulPre; - op.fpDbl_sqrPreA_ = mclx_FpDbl_sqrPre; + op.fpDbl_sqrPre = mclx_FpDbl_sqrPre; op.fp2_addA_ = mclx_Fp2_add; op.fp2_subA_ = mclx_Fp2_sub; op.fp2_negA_ = mclx_Fp2_neg; From 52a9f4d2135782a43fc2bf64a880ef97232e8f27 Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Mon, 8 Feb 2021 15:28:31 +0900 Subject: [PATCH 3/4] remove mulPreC --- include/mcl/fp_tower.hpp | 9 +++------ include/mcl/op.hpp | 2 -- src/fp_generator.hpp | 28 +++++++++++++--------------- src/fp_static_code.hpp | 2 +- 4 files changed, 17 insertions(+), 24 deletions(-) diff --git a/include/mcl/fp_tower.hpp b/include/mcl/fp_tower.hpp index 65aedd7..730a044 100644 --- a/include/mcl/fp_tower.hpp +++ b/include/mcl/fp_tower.hpp @@ -113,24 +113,22 @@ public: static void (*mod)(Fp& z, const FpDblT& xy); static void (*addPre)(FpDblT& z, const FpDblT& x, const FpDblT& y); static void (*subPre)(FpDblT& z, const FpDblT& x, const FpDblT& y); - static void (*mulPre)(FpDblT& xy, const Fp& x, const Fp& y); static void addC(FpDblT& z, const FpDblT& x, const FpDblT& y) { Fp::op_.fpDbl_add(z.v_, x.v_, y.v_, Fp::op_.p); } static void subC(FpDblT& z, const FpDblT& x, const FpDblT& y) { Fp::op_.fpDbl_sub(z.v_, x.v_, y.v_, Fp::op_.p); } static void modC(Fp& z, const FpDblT& xy) { Fp::op_.fpDbl_mod(z.v_, xy.v_, Fp::op_.p); } static void addPreC(FpDblT& z, const FpDblT& x, const FpDblT& y) { Fp::op_.fpDbl_addPre(z.v_, x.v_, y.v_); } static void subPreC(FpDblT& z, const FpDblT& x, const FpDblT& y) { Fp::op_.fpDbl_subPre(z.v_, x.v_, y.v_); } - static void mulPreC(FpDblT& xy, const Fp& x, const Fp& y) { Fp::op_.fpDbl_mulPre(xy.v_, x.v_, y.v_); } #else static void add(FpDblT& z, const FpDblT& x, const FpDblT& y) { Fp::op_.fpDbl_add(z.v_, x.v_, y.v_, Fp::op_.p); } static void sub(FpDblT& z, const FpDblT& x, const FpDblT& y) { Fp::op_.fpDbl_sub(z.v_, x.v_, y.v_, Fp::op_.p); } static void mod(Fp& z, const FpDblT& xy) { Fp::op_.fpDbl_mod(z.v_, xy.v_, Fp::op_.p); } static void addPre(FpDblT& z, const FpDblT& x, const FpDblT& y) { Fp::op_.fpDbl_addPre(z.v_, x.v_, y.v_); } static void subPre(FpDblT& z, const FpDblT& x, const FpDblT& y) { Fp::op_.fpDbl_subPre(z.v_, x.v_, y.v_); } - static void mulPre(FpDblT& xy, const Fp& x, const Fp& y) { Fp::op_.fpDbl_mulPre(xy.v_, x.v_, y.v_); } #endif /* mul(z, x, y) = mulPre(xy, x, y) + mod(z, xy) */ + static void (*mulPre)(FpDblT& xy, const Fp& x, const Fp& y); static void (*sqrPre)(FpDblT& xx, const Fp& x); static void mulUnit(FpDblT& z, const FpDblT& x, Unit y) { @@ -151,9 +149,8 @@ public: if (addPre == 0) addPre = addPreC; subPre = fp::func_ptr_cast(op.fpDbl_subPre); if (subPre == 0) subPre = subPreC; - mulPre = fp::func_ptr_cast(op.fpDbl_mulPreA_); - if (mulPre == 0) mulPre = mulPreC; #endif + mulPre = fp::func_ptr_cast(op.fpDbl_mulPre); sqrPre = fp::func_ptr_cast(op.fpDbl_sqrPre); } void operator+=(const FpDblT& x) { add(*this, *this, x); } @@ -166,8 +163,8 @@ template void (*FpDblT::sub)(FpDblT&, const FpDblT&, const FpDblT& template void (*FpDblT::mod)(Fp&, const FpDblT&); template void (*FpDblT::addPre)(FpDblT&, const FpDblT&, const FpDblT&); template void (*FpDblT::subPre)(FpDblT&, const FpDblT&, const FpDblT&); -template void (*FpDblT::mulPre)(FpDblT&, const Fp&, const Fp&); #endif +template void (*FpDblT::mulPre)(FpDblT&, const Fp&, const Fp&); template void (*FpDblT::sqrPre)(FpDblT&, const Fp&); template struct Fp12T; diff --git a/include/mcl/op.hpp b/include/mcl/op.hpp index b8c1dbe..e3d78d8 100644 --- a/include/mcl/op.hpp +++ b/include/mcl/op.hpp @@ -221,7 +221,6 @@ struct Op { void2u fp2_sqrA_; void3u fpDbl_addA_; void3u fpDbl_subA_; - void3u fpDbl_mulPreA_; void2u fpDbl_modA_; void3u fp2Dbl_mulPreA_; void2u fp2Dbl_sqrPreA_; @@ -307,7 +306,6 @@ struct Op { fp2_sqrA_ = 0; fpDbl_addA_ = 0; fpDbl_subA_ = 0; - fpDbl_mulPreA_ = 0; fpDbl_modA_ = 0; fp2Dbl_mulPreA_ = 0; fp2Dbl_sqrPreA_ = 0; diff --git a/src/fp_generator.hpp b/src/fp_generator.hpp index 5ab7f9c..ef38b63 100644 --- a/src/fp_generator.hpp +++ b/src/fp_generator.hpp @@ -421,11 +421,11 @@ private: setFuncInfo(prof_, suf, "Dbl_subPre", op.fpDbl_subPre, getCurr()); align(16); - op.fpDbl_mulPreA_ = gen_fpDbl_mulPre(); - setFuncInfo(prof_, suf, "Dbl_mulPre", op.fpDbl_mulPreA_, getCurr()); + gen_fpDbl_mulPre(op.fpDbl_mulPre); + setFuncInfo(prof_, suf, "Dbl_mulPre", op.fpDbl_mulPre, getCurr()); align(16); - op.fpDbl_sqrPre = gen_fpDbl_sqrPre(); + gen_fpDbl_sqrPre(op.fpDbl_sqrPre); setFuncInfo(prof_, suf, "Dbl_sqrPre", op.fpDbl_sqrPre, getCurr()); align(16); @@ -2373,36 +2373,35 @@ private: cmovc_rr(zp, keep); store_mr(z, zp); } - void2u gen_fpDbl_sqrPre() + void gen_fpDbl_sqrPre(void2u& f) { void2u func = getCurr(); if (pn_ == 2 && useMulx_) { StackFrame sf(this, 2, 7 | UseRDX); sqrPre2(sf.p[0], sf.p[1], sf.t); - return func; + f = func; } if (pn_ == 3) { StackFrame sf(this, 3, 10 | UseRDX); Pack t = sf.t; t.append(sf.p[2]); sqrPre3(sf.p[0], sf.p[1], t); - return func; + f = func; } if (pn_ == 4 && useMulx_) { StackFrame sf(this, 3, 10 | UseRDX); Pack t = sf.t; t.append(sf.p[2]); sqrPre4(sf.p[0], sf.p[1], t); - return func; + f = func; } if (pn_ == 6 && useMulx_ && useAdx_) { StackFrame sf(this, 3, 10 | UseRDX, 6 * 8); Pack t = sf.t; t.append(sf.p[2]); sqrPre6(sf.p[0], sf.p[1], t); - return func; + f = func; } - return 0; #if 0 #ifdef XBYAK64_WIN mov(r8, rdx); @@ -2413,18 +2412,18 @@ private: return func; #endif } - void3u gen_fpDbl_mulPre() + void gen_fpDbl_mulPre(void3u& f) { void3u func = getCurr(); if (pn_ == 2 && useMulx_) { StackFrame sf(this, 3, 5 | UseRDX); mulPre2(sf.p[0], sf.p[1], sf.p[2], sf.t); - return func; + f = func; } if (pn_ == 3) { StackFrame sf(this, 3, 10 | UseRDX); mulPre3(sf.p[0], sf.p[1], sf.p[2], sf.t); - return func; + f = func; } if (pn_ == 4) { /* @@ -2437,7 +2436,7 @@ private: L(mulPreL); // called only from asm code mulPre4(gp0, gp1, gp2, sf.t); ret(); - return func; + f = func; } if (pn_ == 6 && useAdx_) { StackFrame sf(this, 3, 10 | UseRDX, 0, false); @@ -2446,9 +2445,8 @@ private: L(mulPreL); // called only from asm code mulPre6(sf.t); ret(); - return func; + f = func; } - return 0; } static inline void debug_put_inner(const uint64_t *ptr, int n) { diff --git a/src/fp_static_code.hpp b/src/fp_static_code.hpp index 6584410..7421f0a 100644 --- a/src/fp_static_code.hpp +++ b/src/fp_static_code.hpp @@ -65,7 +65,7 @@ void setStaticCode(mcl::fp::Op& op) op.fpDbl_subA_ = mclx_FpDbl_sub; op.fpDbl_addPre = mclx_FpDbl_addPre; op.fpDbl_subPre = mclx_FpDbl_subPre; - op.fpDbl_mulPreA_ = mclx_FpDbl_mulPre; + op.fpDbl_mulPre = mclx_FpDbl_mulPre; op.fpDbl_sqrPre = mclx_FpDbl_sqrPre; op.fp2_addA_ = mclx_Fp2_add; op.fp2_subA_ = mclx_Fp2_sub; From 6afa976ad2beb81a852bf39419235da440c0dc6c Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Mon, 8 Feb 2021 16:21:59 +0900 Subject: [PATCH 4/4] rename local template function --- include/mcl/fp_tower.hpp | 6 +++--- src/fp.cpp | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/include/mcl/fp_tower.hpp b/include/mcl/fp_tower.hpp index 730a044..26d96e1 100644 --- a/include/mcl/fp_tower.hpp +++ b/include/mcl/fp_tower.hpp @@ -680,9 +680,9 @@ struct Fp2DblT { mulPre = fp::func_ptr_cast(op.fp2Dbl_mulPreA_); } else { if (op.isFullBit) { - mulPre = fp2Dbl_mulPreW; + mulPre = fp2Dbl_mulPreTW; } else { - mulPre = fp2Dbl_mulPreW; + mulPre = fp2Dbl_mulPreTW; } } if (op.fp2Dbl_sqrPreA_) { @@ -700,7 +700,7 @@ struct Fp2DblT { @note mod of NIST_P192 is fast */ template - static void fp2Dbl_mulPreW(Fp2DblT& z, const Fp2& x, const Fp2& y) + static void fp2Dbl_mulPreTW(Fp2DblT& z, const Fp2& x, const Fp2& y) { const Fp& a = x.a; const Fp& b = x.b; diff --git a/src/fp.cpp b/src/fp.cpp index cd3266e..0534580 100644 --- a/src/fp.cpp +++ b/src/fp.cpp @@ -426,6 +426,7 @@ void setWasmOp(Op& op) op.fp_mul = mcl::mulMontT; op.fp_sqr = mcl::sqrMontT; op.fpDbl_mulPre = mulT; +// op.fpDbl_sqrPre = sqrT; op.fpDbl_mod = modT; } #endif