From c0be1e91cec8077335a41ca17237958eb4c05a0f Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Sun, 7 Feb 2021 12:12:50 +0900 Subject: [PATCH 1/4] rename low_funct.hpp to low_func_wasm.hpp --- misc/Makefile | 2 +- misc/low_test.cpp | 2 +- src/fp.cpp | 10 +++++----- src/{low_funct.hpp => low_func_wasm.hpp} | 0 4 files changed, 7 insertions(+), 7 deletions(-) rename src/{low_funct.hpp => low_func_wasm.hpp} (100%) diff --git a/misc/Makefile b/misc/Makefile index 25a7c27..c9dec20 100644 --- a/misc/Makefile +++ b/misc/Makefile @@ -2,5 +2,5 @@ all: low_test CFLAGS=-I ../include/ -m32 -Ofast -Wall -Wextra -DNDEBUG -low_test: low_test.cpp ../src/low_funct.hpp +low_test: low_test.cpp ../src/low_func_wasm.hpp $(CXX) -o low_test low_test.cpp $(CFLAGS) diff --git a/misc/low_test.cpp b/misc/low_test.cpp index 91af412..6d3e9b4 100644 --- a/misc/low_test.cpp +++ b/misc/low_test.cpp @@ -9,7 +9,7 @@ void dump(const char *msg, const uint32_t *x, size_t n) } printf("\n"); } -#include "../src/low_funct.hpp" +#include "../src/low_func_wasm.hpp" #define MCL_USE_VINT #define MCL_VINT_FIXED_BUFFER diff --git a/src/fp.cpp b/src/fp.cpp index 484ad43..cd3266e 100644 --- a/src/fp.cpp +++ b/src/fp.cpp @@ -4,8 +4,8 @@ #include #include #if defined(__EMSCRIPTEN__) && MCL_SIZEOF_UNIT == 4 -#define FOR_WASM -#include "low_funct.hpp" +#define USE_WASM +#include "low_func_wasm.hpp" #endif #if defined(MCL_STATIC_CODE) || defined(MCL_USE_XBYAK) || (defined(MCL_USE_LLVM) && (CYBOZU_HOST == CYBOZU_HOST_INTEL)) @@ -411,12 +411,12 @@ static bool initForMont(Op& op, const Unit *p, Mode mode) return true; } -#ifdef FOR_WASM +#ifdef USE_WASM template void setWasmOp(Op& op) { if (!(op.isMont && !op.isFullBit)) return; -EM_ASM({console.log($0)}, N); +//EM_ASM({console.log($0)}, N); // op.fp_addPre = mcl::addT; // op.fp_subPre = mcl::subT; // op.fpDbl_addPre = mcl::addT; @@ -570,7 +570,7 @@ bool Op::init(const mpz_class& _p, size_t maxBitSize, int _xi_a, Mode mode, size default: return false; } -#ifdef FOR_WASM +#ifdef USE_WASM if (N == 8) { setWasmOp<8>(*this); } else if (N == 12) { diff --git a/src/low_funct.hpp b/src/low_func_wasm.hpp similarity index 100% rename from src/low_funct.hpp rename to src/low_func_wasm.hpp From 54155986b4b64856739ea544add15701d9f20855 Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Sun, 7 Feb 2021 12:13:08 +0900 Subject: [PATCH 2/4] replace bool to uint32_t --- misc/low_test.cpp | 14 ++++++++------ src/low_func_wasm.hpp | 40 ++++++++++++++++++++-------------------- 2 files changed, 28 insertions(+), 26 deletions(-) diff --git a/misc/low_test.cpp b/misc/low_test.cpp index 6d3e9b4..735a7d6 100644 --- a/misc/low_test.cpp +++ b/misc/low_test.cpp @@ -21,6 +21,8 @@ void dump(const char *msg, const uint32_t *x, size_t n) #include #include +const int C = 10000; + template void setRand(uint32_t *x, size_t n, RG& rg) { @@ -61,8 +63,8 @@ void mulTest() mcl::karatsubaT(z, x, y); CYBOZU_TEST_EQUAL_ARRAY(z, vx.getUnit(), N * 2); } - CYBOZU_BENCH_C("mulT", 10000, mcl::mulT, z, x, y); - CYBOZU_BENCH_C("kara", 10000, mcl::karatsubaT, z, x, y); + CYBOZU_BENCH_C("mulT", C, mcl::mulT, z, x, y); + CYBOZU_BENCH_C("kara", C, mcl::karatsubaT, z, x, y); } CYBOZU_TEST_AUTO(mulT) @@ -88,7 +90,7 @@ void sqrTest() mcl::sqrT(y, x); CYBOZU_TEST_EQUAL_ARRAY(y, vx.getUnit(), N * 2); } - CYBOZU_BENCH_C("sqrT", 10000, mcl::sqrT, y, x); + CYBOZU_BENCH_C("sqrT", C, mcl::sqrT, y, x); } CYBOZU_TEST_AUTO(sqrT) @@ -185,8 +187,8 @@ void mulMontTest(const char *pStr) mcl::sqrMontT(z, x, p); CYBOZU_TEST_EQUAL_ARRAY(z, vz.getUnit(), N); } - CYBOZU_BENCH_C("mulMontT", 10000, mcl::mulMontT, x, x, y, p); - CYBOZU_BENCH_C("sqrMontT", 10000, mcl::sqrMontT, x, x, p); + CYBOZU_BENCH_C("mulMontT", C, mcl::mulMontT, x, x, y, p); + CYBOZU_BENCH_C("sqrMontT", C, mcl::sqrMontT, x, x, p); } template @@ -214,7 +216,7 @@ void modTest(const char *pStr) mcl::modT(z, xy, p); CYBOZU_TEST_EQUAL_ARRAY(z, vz.getUnit(), N); } - CYBOZU_BENCH_C("modT", 10000, mcl::modT, z, xy, p); + CYBOZU_BENCH_C("modT", C, mcl::modT, z, xy, p); } CYBOZU_TEST_AUTO(mont) diff --git a/src/low_func_wasm.hpp b/src/low_func_wasm.hpp index 082c2df..885b16a 100644 --- a/src/low_func_wasm.hpp +++ b/src/low_func_wasm.hpp @@ -41,40 +41,40 @@ uint32_t shlT(uint32_t y[N], const uint32_t x[N], size_t bit) // [return:y[N]] += x template -inline bool addUnitT(uint32_t y[N], uint32_t x) +inline uint32_t addUnitT(uint32_t y[N], uint32_t x) { uint64_t v = uint64_t(y[0]) + x; y[0] = uint32_t(v); - bool c = (v >> 32) != 0; - if (!c) return false; + uint32_t c = v >> 32; + if (c == 0) return 0; for (size_t i = 1; i < N; i++) { v = uint64_t(y[i]) + 1; y[i] = uint32_t(v); - if ((v >> 32) == 0) return false; + if ((v >> 32) == 0) return 0; } - return true; + return 1; } template -bool addT(uint32_t z[N], const uint32_t x[N], const uint32_t y[N]) +uint32_t addT(uint32_t z[N], const uint32_t x[N], const uint32_t y[N]) { - bool c = false; + uint32_t c = 0; for (size_t i = 0; i < N; i++) { uint64_t v = uint64_t(x[i]) + y[i] + c; z[i] = uint32_t(v); - c = (v >> 32) != 0; + c = uint32_t(v >> 32); } return c; } template -bool subT(uint32_t z[N], const uint32_t x[N], const uint32_t y[N]) +uint32_t subT(uint32_t z[N], const uint32_t x[N], const uint32_t y[N]) { - bool c = false; + uint32_t c = 0; for (size_t i = 0; i < N; i++) { uint64_t v = uint64_t(x[i]) - y[i] - c; z[i] = uint32_t(v); - c = (v >> 32) != 0; + c = uint32_t(v >> 63); } return c; } @@ -187,8 +187,8 @@ void karatsubaT(uint32_t z[N * 2], const uint32_t x[N], const uint32_t y[N]) const size_t H = N / 2; uint32_t a_b[H]; uint32_t c_d[H]; - bool c1 = addT(a_b, x, x + H); // a + b - bool c2 = addT(c_d, y, y + H); // c + d + uint32_t c1 = addT(a_b, x, x + H); // a + b + uint32_t c2 = addT(c_d, y, y + H); // c + d uint32_t tmp[N]; mulT(tmp, a_b, c_d); if (c1) { @@ -220,11 +220,10 @@ void sqrT(uint32_t y[N * 2], const uint32_t x[N]) assert((x[N - 1] & 0x80000000) == 0); const size_t H = N / 2; uint32_t a_b[H]; - bool c = addT(a_b, x, x + H); // a + b + uint32_t c = addT(a_b, x, x + H); // a + b uint32_t tmp[N]; mulT(tmp, a_b, a_b); if (c) { -// addT(a_b, a_b, a_b); shlT(a_b, a_b, 1); addT(tmp + H, tmp + H, a_b); } @@ -244,7 +243,7 @@ void addModT(uint32_t z[N], const uint32_t x[N], const uint32_t y[N], const uint { uint32_t t[N]; addT(z, x, y); - bool c = subT(t, z, p); + uint32_t c = subT(t, z, p); if (!c) { copyT(z, t); } @@ -253,7 +252,7 @@ void addModT(uint32_t z[N], const uint32_t x[N], const uint32_t y[N], const uint template void subModT(uint32_t z[N], const uint32_t x[N], const uint32_t y[N], const uint32_t p[N]) { - bool c = subT(z, x, y); + uint32_t c = subT(z, x, y); if (c) { addT(z, z, p); } @@ -284,7 +283,7 @@ void mulMontT(uint32_t z[N], const uint32_t x[N], const uint32_t y[N], const uin // [return:z[N+1]] = z[N+1] + x[N] * y + (cc << (N * 32)) template -bool addMulUnit2T(uint32_t z[N + 1], const uint32_t x[N], uint32_t y, const bool *cc = 0) +uint32_t addMulUnit2T(uint32_t z[N + 1], const uint32_t x[N], uint32_t y, const uint32_t *cc = 0) { uint32_t H = 0; for (size_t i = 0; i < N; i++) { @@ -298,7 +297,7 @@ bool addMulUnit2T(uint32_t z[N + 1], const uint32_t x[N], uint32_t y, const bool uint64_t v = uint64_t(z[N]); v += H; z[N] = uint32_t(v); - return (v >> 32) != 0; + return uint32_t(v >> 32); } /* @@ -312,7 +311,7 @@ void modT(uint32_t y[N], const uint32_t xy[N * 2], const uint32_t p[N]) assert((p[N - 1] & 0x80000000) == 0); uint32_t buf[N * 2]; copyT(buf, xy); - bool c = 0; + uint32_t c = 0; for (size_t i = 0; i < N; i++) { uint32_t q = buf[i] * rp; c = addMulUnit2T(buf + i, p, q, &c); @@ -332,6 +331,7 @@ void sqrMontT(uint32_t y[N], const uint32_t x[N], const uint32_t p[N]) #if 1 mulMontT(y, x, x, p); #else + // slower uint32_t xx[N * 2]; sqrT(xx, x); modT(y, xx, p); From c0d65655eaa853c7f43955d91d81b7d1d0ada182 Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Sun, 7 Feb 2021 12:56:34 +0900 Subject: [PATCH 3/4] remove unused code --- src/low_func_wasm.hpp | 50 ------------------------------------------- 1 file changed, 50 deletions(-) diff --git a/src/low_func_wasm.hpp b/src/low_func_wasm.hpp index 885b16a..352d446 100644 --- a/src/low_func_wasm.hpp +++ b/src/low_func_wasm.hpp @@ -118,56 +118,6 @@ void mulT(uint32_t z[N * 2], const uint32_t x[N], const uint32_t y[N]) } } -#if 0 -// slower than mulT -template -uint32_t mulUnitWithTblT(uint32_t z[N], const uint64_t *tbl_j) -{ - uint32_t H = 0; - for (size_t i = 0; i < N; i++) { - uint64_t v = tbl_j[i]; - v += H; - z[i] = uint32_t(v); - H = uint32_t(v >> 32); - } - return H; -} - -template -uint32_t addMulUnitWithTblT(uint32_t z[N], const uint64_t *tbl_j) -{ - uint32_t H = 0; - for (size_t i = 0; i < N; i++) { - uint64_t v = tbl_j[i]; - v += H; - v += z[i]; - z[i] = uint32_t(v); - H = uint32_t(v >> 32); - } - return H; -} - -// y[N * 2] = x[N] * x[N] -template -void sqrT(uint32_t y[N * 2], const uint32_t x[N]) -{ - uint64_t tbl[N * N]; // x[i]x[j] - for (size_t i = 0; i < N; i++) { - uint64_t xi = x[i]; - tbl[i * N + i] = xi * xi; - for (size_t j = i + 1; j < N; j++) { - uint64_t v = xi * x[j]; - tbl[i * N + j] = v; - tbl[j * N + i] = v; - } - } - y[N] = mulUnitWithTblT(y, tbl); - for (size_t i = 1; i < N; i++) { - y[N + i] = addMulUnitWithTblT(&y[i], tbl + N * i); - } -} -#endif - /* z[N * 2] = x[N] * y[N] H = N/2 From 2378fd27434810e2aadac4bec704959e1d2e4ce0 Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Sun, 7 Feb 2021 14:56:24 +0900 Subject: [PATCH 4/4] a little optimization of portable mulUnit --- include/mcl/vint.hpp | 40 ++++++++++++---------------------------- 1 file changed, 12 insertions(+), 28 deletions(-) diff --git a/include/mcl/vint.hpp b/include/mcl/vint.hpp index 13c4483..3bed527 100644 --- a/include/mcl/vint.hpp +++ b/include/mcl/vint.hpp @@ -90,35 +90,19 @@ inline uint32_t mulUnit(uint32_t *pH, uint32_t x, uint32_t y) inline uint64_t mulUnit(uint64_t *pH, uint64_t x, uint64_t y) { #ifdef MCL_VINT_64BIT_PORTABLE - uint32_t a = uint32_t(x >> 32); - uint32_t b = uint32_t(x); - uint32_t c = uint32_t(y >> 32); - uint32_t d = uint32_t(y); - - uint64_t ad = uint64_t(d) * a; - uint64_t bd = uint64_t(d) * b; - uint64_t L = uint32_t(bd); - ad += bd >> 32; // [ad:L] - - uint64_t ac = uint64_t(c) * a; - uint64_t bc = uint64_t(c) * b; - uint64_t H = uint32_t(bc); - ac += bc >> 32; // [ac:H] - /* - adL - acH - */ - uint64_t t = (ac << 32) | H; - ac >>= 32; - H = t + ad; - if (H < t) { - ac++; - } - /* - ac:H:L - */ + const uint64_t mask = 0xffffffff; + uint64_t v = (x & mask) * (y & mask); + uint64_t L = uint32_t(v); + uint64_t H = v >> 32; + uint64_t ad = (x & mask) * uint32_t(y >> 32); + uint64_t bc = uint32_t(x >> 32) * (y & mask); + H += uint32_t(ad); + H += uint32_t(bc); L |= H << 32; - H = (ac << 32) | uint32_t(H >> 32); + H >>= 32; + H += ad >> 32; + H += bc >> 32; + H += (x >> 32) * (y >> 32); *pH = H; return L; #elif defined(_WIN64) && !defined(__INTEL_COMPILER)