From 8d64a0cfed1c88c19d38f5ca220c6cdee390b268 Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Tue, 2 Feb 2021 09:40:31 +0900 Subject: [PATCH 01/12] test of wasm --- src/fp.cpp | 18 +++++++++++++ src/low_funct.hpp | 67 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 85 insertions(+) create mode 100644 src/low_funct.hpp diff --git a/src/fp.cpp b/src/fp.cpp index eb8a7de..3a70db5 100644 --- a/src/fp.cpp +++ b/src/fp.cpp @@ -3,6 +3,10 @@ #include #include #include +#if defined(__EMSCRIPTEN__) && MCL_SIZEOF_UNIT == 4 +#define FOR_WASM +#include "low_funct.hpp" +#endif #if defined(MCL_STATIC_CODE) || defined(MCL_USE_XBYAK) || (defined(MCL_USE_LLVM) && (CYBOZU_HOST == CYBOZU_HOST_INTEL)) @@ -268,6 +272,20 @@ void setOp2(Op& op) } else { op.fp_add = Add::f; op.fp_sub = Sub::f; +#ifdef FOR_WASM + switch (N) { + case 8: + op.fp_add = mcl::addModT<8>; + op.fp_sub = mcl::subModT<8>; + break; + case 12: + op.fp_add = mcl::addModT<12>; + op.fp_sub = mcl::subModT<12>; + break; + default: + break; + } +#endif } if (op.isMont) { if (op.isFullBit) { diff --git a/src/low_funct.hpp b/src/low_funct.hpp new file mode 100644 index 0000000..a9751dd --- /dev/null +++ b/src/low_funct.hpp @@ -0,0 +1,67 @@ +#pragma once +/** + @file + @author MITSUNARI Shigeo(@herumi) + @license modified new BSD license + http://opensource.org/licenses/BSD-3-Clause +*/ +#include +#include + +// for 32bit not full version + +namespace mcl { + +template +void copyT(uint32_t *y, const uint32_t *x) +{ + for (size_t i = 0; i < N; i++) { + y[i] = x[i]; + } +} + +template +void addT(uint32_t *z, const uint32_t *x, const uint32_t *y) +{ + bool c = false; + for (size_t i = 0; i < N; i++) { + uint64_t v = uint64_t(x[i]) + y[i] + c; + z[i] = uint32_t(v); + c = (v >> 32) != 0; + } +} + +template +bool subT(uint32_t *z, const uint32_t *x, const uint32_t *y) +{ + bool c = false; + for (size_t i = 0; i < N; i++) { + uint64_t v = uint64_t(x[i]) - y[i] - c; + z[i] = uint32_t(v); + c = (v >> 32) != 0; + } + return c; +} + +template +void addModT(uint32_t *z, const uint32_t *x, const uint32_t *y, const uint32_t *p) +{ + uint32_t t[N]; + addT(z, x, y); + bool c = subT(t, z, p); + if (!c) { + copyT(z, t); + } +} + +template +void subModT(uint32_t *z, const uint32_t *x, const uint32_t *y, const uint32_t *p) +{ + bool c = subT(z, x, y); + if (c) { + addT(z, z, p); + } +} + +} // mcl + From a31bde314fc4124e5a3b791f73a12f301c0d13e1 Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Tue, 2 Feb 2021 10:53:18 +0900 Subject: [PATCH 02/12] add mulT --- misc/low_test.cpp | 42 ++++++++++++++++++++++++++++++++++++++ src/low_funct.hpp | 51 ++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 88 insertions(+), 5 deletions(-) create mode 100644 misc/low_test.cpp diff --git a/misc/low_test.cpp b/misc/low_test.cpp new file mode 100644 index 0000000..c5bef6c --- /dev/null +++ b/misc/low_test.cpp @@ -0,0 +1,42 @@ +#include "../src/low_funct.hpp" + +#define MCL_VINT_FIXED_BUFFER +#define MCL_SIZEOF_UNIT 4 +#define MCL_MAX_BIT_SIZE 384 +#include +#include +#include + +void mul3(uint32_t z[6], const uint32_t x[3], uint32_t y[3]) +{ + return mcl::mulT<3>(z, x, y); +} + +template +void setRand(uint32_t *x, size_t n, RG& rg) +{ + for (size_t i = 0; i < n; i++) { + x[i] = rg.get32(); + } +} + +CYBOZU_TEST_AUTO(mul3) +{ + cybozu::XorShift rg; + uint32_t x[3]; + uint32_t y[3]; + uint32_t z[6]; + for (size_t i = 0; i < 1000; i++) { + setRand(x, 3, rg); + setRand(y, 3, rg); + mcl::Vint vx, vy; + vx.setArray(x, 3); + vy.setArray(y, 3); + printf("vx=%s\n", vx.getStr(16).c_str()); + printf("vy=%s\n", vy.getStr(16).c_str()); + vx *= vy; + printf("xy=%s\n", vx.getStr(16).c_str()); + mul3(z, x, y); + CYBOZU_TEST_EQUAL_ARRAY(z, vx.getUnit(), 6); + } +} diff --git a/src/low_funct.hpp b/src/low_funct.hpp index a9751dd..af9dd79 100644 --- a/src/low_funct.hpp +++ b/src/low_funct.hpp @@ -7,13 +7,14 @@ */ #include #include +#include -// for 32bit not full version +// only for 32bit not full bit prime version namespace mcl { template -void copyT(uint32_t *y, const uint32_t *x) +void copyT(uint32_t y[N], const uint32_t x[N]) { for (size_t i = 0; i < N; i++) { y[i] = x[i]; @@ -21,7 +22,7 @@ void copyT(uint32_t *y, const uint32_t *x) } template -void addT(uint32_t *z, const uint32_t *x, const uint32_t *y) +void addT(uint32_t z[N], const uint32_t x[N], const uint32_t y[N]) { bool c = false; for (size_t i = 0; i < N; i++) { @@ -29,10 +30,11 @@ void addT(uint32_t *z, const uint32_t *x, const uint32_t *y) z[i] = uint32_t(v); c = (v >> 32) != 0; } + assert(!c); } template -bool subT(uint32_t *z, const uint32_t *x, const uint32_t *y) +bool subT(uint32_t z[N], const uint32_t x[N], const uint32_t y[N]) { bool c = false; for (size_t i = 0; i < N; i++) { @@ -43,8 +45,47 @@ bool subT(uint32_t *z, const uint32_t *x, const uint32_t *y) return c; } +// [return:z[N]] = x[N] * y template -void addModT(uint32_t *z, const uint32_t *x, const uint32_t *y, const uint32_t *p) +uint32_t mulUnitT(uint32_t z[N], const uint32_t x[N], uint32_t y) +{ + uint32_t H = 0; + for (size_t i = 0; i < N; i++) { + uint64_t v = uint64_t(x[i]) * y; + v += H; + z[i] = uint32_t(v); + H = uint32_t(v >> 32); + } + return H; +} + +// [return:z[N]] = z[N] + x[N] * z +template +uint32_t addMulUnitT(uint32_t z[N], const uint32_t x[N], uint32_t y) +{ + uint32_t H = 0; + for (size_t i = 0; i < N; i++) { + uint64_t v = uint64_t(x[i]) * y; + v += H; + v += z[i]; + z[i] = uint32_t(v); + H = uint32_t(v >> 32); + } + return H; +} + +// z[N * 2] = x[N] * y[N] +template +void mulT(uint32_t z[N * 2], const uint32_t x[N], const uint32_t y[N]) +{ + z[N] = mulUnitT(z, x, y[0]); + for (size_t i = 1; i < N; i++) { + z[N + i] = addMulUnitT(&z[i], x, y[i]); + } +} + +template +void addModT(uint32_t z[N], const uint32_t x[N], const uint32_t y[N], const uint32_t p[N]) { uint32_t t[N]; addT(z, x, y); From a3dd0f8a55403899c4d651ba94a675bc6b810fd0 Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Tue, 2 Feb 2021 15:28:51 +0900 Subject: [PATCH 03/12] add karatsuba --- misc/low_test.cpp | 51 +++++++++++++++++++++++++--------------- src/low_funct.hpp | 59 +++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 89 insertions(+), 21 deletions(-) diff --git a/misc/low_test.cpp b/misc/low_test.cpp index c5bef6c..a0e0905 100644 --- a/misc/low_test.cpp +++ b/misc/low_test.cpp @@ -1,16 +1,14 @@ +#include #include "../src/low_funct.hpp" +#define MCL_USE_VINT #define MCL_VINT_FIXED_BUFFER #define MCL_SIZEOF_UNIT 4 -#define MCL_MAX_BIT_SIZE 384 +#define MCL_MAX_BIT_SIZE 768 #include #include #include - -void mul3(uint32_t z[6], const uint32_t x[3], uint32_t y[3]) -{ - return mcl::mulT<3>(z, x, y); -} +#include template void setRand(uint32_t *x, size_t n, RG& rg) @@ -20,23 +18,38 @@ void setRand(uint32_t *x, size_t n, RG& rg) } } -CYBOZU_TEST_AUTO(mul3) +/* +g++ -Ofast -DNDEBUG -Wall -Wextra -m32 -I ./include/ misc/low_test.cpp +Core i7-8700 + mulT karatsuba +N = 6, 182clk 225clk +N = 8, 300clk 350clk +N = 12, 594clk 730clk +*/ +CYBOZU_TEST_AUTO(mulT) { cybozu::XorShift rg; - uint32_t x[3]; - uint32_t y[3]; - uint32_t z[6]; + const size_t N = 12; + uint32_t x[N]; + uint32_t y[N]; + uint32_t z[N * 2]; for (size_t i = 0; i < 1000; i++) { - setRand(x, 3, rg); - setRand(y, 3, rg); + setRand(x, N, rg); + setRand(y, N, rg); + // remove MSB + x[N - 1] &= 0x7fffffff; + y[N - 1] &= 0x7fffffff; mcl::Vint vx, vy; - vx.setArray(x, 3); - vy.setArray(y, 3); - printf("vx=%s\n", vx.getStr(16).c_str()); - printf("vy=%s\n", vy.getStr(16).c_str()); + vx.setArray(x, N); + vy.setArray(y, N); vx *= vy; - printf("xy=%s\n", vx.getStr(16).c_str()); - mul3(z, x, y); - CYBOZU_TEST_EQUAL_ARRAY(z, vx.getUnit(), 6); + mcl::mulT(z, x, y); + CYBOZU_TEST_EQUAL_ARRAY(z, vx.getUnit(), N * 2); + memset(z, 0, sizeof(z)); + mcl::karatsubaT(z, x, y); + CYBOZU_TEST_EQUAL_ARRAY(z, vx.getUnit(), N * 2); } + CYBOZU_BENCH_C("mulT", 10000, mcl::mulT, z, x, y); + CYBOZU_BENCH_C("kara", 10000, mcl::karatsubaT, z, x, y); } + diff --git a/src/low_funct.hpp b/src/low_funct.hpp index af9dd79..919c574 100644 --- a/src/low_funct.hpp +++ b/src/low_funct.hpp @@ -21,8 +21,24 @@ void copyT(uint32_t y[N], const uint32_t x[N]) } } +// [return:y[N]] += x template -void addT(uint32_t z[N], const uint32_t x[N], const uint32_t y[N]) +inline bool addUnitT(uint32_t y[N], uint32_t x) +{ + uint64_t v = uint64_t(y[0]) + x; + y[0] = uint32_t(v); + bool c = (v >> 32) != 0; + if (!c) return false; + for (size_t i = 1; i < N; i++) { + v = uint64_t(y[i]) + 1; + y[i] = uint32_t(v); + if ((v >> 32) == 0) return false; + } + return true; +} + +template +bool addT(uint32_t z[N], const uint32_t x[N], const uint32_t y[N]) { bool c = false; for (size_t i = 0; i < N; i++) { @@ -30,7 +46,7 @@ void addT(uint32_t z[N], const uint32_t x[N], const uint32_t y[N]) z[i] = uint32_t(v); c = (v >> 32) != 0; } - assert(!c); + return c; } template @@ -84,6 +100,45 @@ void mulT(uint32_t z[N * 2], const uint32_t x[N], const uint32_t y[N]) } } +/* + z[N * 2] = x[N] * y[N] + H = N/2 + W = 1 << (H * 32) + x = aW + b, y = cW + d + assume a < W/2, c < W/2 + (aW + b)(cW + d) = acW^2 + (ad + bc)W + bd + ad + bc = (a + b)(c + d) - ac - bd < (1 << (N * 32)) +*/ +template +void karatsubaT(uint32_t z[N * 2], const uint32_t x[N], const uint32_t y[N]) +{ + assert((N % 2) == 0); + assert((x[N - 1] & 0x80000000) == 0); + assert((y[N - 1] & 0x80000000) == 0); + const size_t H = N / 2; + mulT(z, x, y); // bd + mulT(z + N, x + H, y + H); // ac + uint32_t a_b[H]; + uint32_t c_d[H]; + bool c1 = addT(a_b, x, x + H); // a + b + bool c2 = addT(c_d, y, y + H); // c + d + uint32_t tmp[N]; + mulT(tmp, a_b, c_d); + if (c1) { + addT(tmp + H, tmp + H, c_d); + } + if (c2) { + addT(tmp + H, tmp + H, a_b); + } + // c:tmp[N] = (a + b)(c + d) + subT(tmp, tmp, z); + subT(tmp, tmp, z + N); + // c:tmp[N] = ad + bc + if (addT(z + H, z + H, tmp)) { + addUnitT(z + N + H, 1); + } +} + template void addModT(uint32_t z[N], const uint32_t x[N], const uint32_t y[N], const uint32_t p[N]) { From 32b1f2f7e7116e447853bd7d234a5d5ba8a6a404 Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Tue, 2 Feb 2021 15:44:14 +0900 Subject: [PATCH 04/12] test N = 8, 12 --- misc/low_test.cpp | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/misc/low_test.cpp b/misc/low_test.cpp index a0e0905..7163902 100644 --- a/misc/low_test.cpp +++ b/misc/low_test.cpp @@ -26,10 +26,11 @@ N = 6, 182clk 225clk N = 8, 300clk 350clk N = 12, 594clk 730clk */ -CYBOZU_TEST_AUTO(mulT) +template +void mulTest() { + printf("N=%zd (%zdbit)\n", N, N * 32); cybozu::XorShift rg; - const size_t N = 12; uint32_t x[N]; uint32_t y[N]; uint32_t z[N * 2]; @@ -53,3 +54,8 @@ CYBOZU_TEST_AUTO(mulT) CYBOZU_BENCH_C("kara", 10000, mcl::karatsubaT, z, x, y); } +CYBOZU_TEST_AUTO(mulT) +{ + mulTest<8>(); + mulTest<12>(); +} From 7dae2ec939adb09968f07ea967eb0615db511ee7 Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Tue, 2 Feb 2021 18:20:02 +0900 Subject: [PATCH 05/12] add montT --- misc/low_test.cpp | 90 +++++++++++++++++++++++++++++++++++++++++++++++ src/low_funct.hpp | 26 +++++++++++++- 2 files changed, 115 insertions(+), 1 deletion(-) diff --git a/misc/low_test.cpp b/misc/low_test.cpp index 7163902..62f84af 100644 --- a/misc/low_test.cpp +++ b/misc/low_test.cpp @@ -1,4 +1,14 @@ #include +#include + +void dump(const char *msg, const uint32_t *x, size_t n) +{ + printf("%s", msg); + for (size_t i = 0; i < n; i++) { + printf("%08x", x[n - 1 - i]); + } + printf("\n"); +} #include "../src/low_funct.hpp" #define MCL_USE_VINT @@ -9,6 +19,7 @@ #include #include #include +#include template void setRand(uint32_t *x, size_t n, RG& rg) @@ -59,3 +70,82 @@ CYBOZU_TEST_AUTO(mulT) mulTest<8>(); mulTest<12>(); } + +struct Montgomery { + mcl::Vint p_; + mcl::Vint R_; // (1 << (pn_ * 64)) % p + mcl::Vint RR_; // (R * R) % p + uint32_t rp_; // rp * p = -1 mod M = 1 << 64 + size_t pn_; + Montgomery() {} + explicit Montgomery(const mcl::Vint& p) + { + p_ = p; + rp_ = mcl::fp::getMontgomeryCoeff(p.getUnit()[0]); + pn_ = p.getUnitSize(); + R_ = 1; + R_ = (R_ << (pn_ * 64)) % p_; + RR_ = (R_ * R_) % p_; + } + + void toMont(mcl::Vint& x) const { mul(x, x, RR_); } + void fromMont(mcl::Vint& x) const { mul(x, x, 1); } + + void mul(mcl::Vint& z, const mcl::Vint& x, const mcl::Vint& y) const + { + const size_t ySize = y.getUnitSize(); + mcl::Vint c = x * y.getUnit()[0]; + uint32_t q = c.getUnit()[0] * rp_; + c += p_ * q; + c >>= sizeof(uint32_t) * 8; + for (size_t i = 1; i < pn_; i++) { + if (i < ySize) { + c += x * y.getUnit()[i]; + } + uint32_t q = c.getUnit()[0] * rp_; + c += p_ * q; + c >>= sizeof(uint32_t) * 8; + } + if (c >= p_) { + c -= p_; + } + z = c; + } +}; + +template +void montTest(const char *pStr) +{ + mcl::Vint vp; + vp.setStr(pStr); + Montgomery mont(vp); + + cybozu::XorShift rg; + uint32_t x[N]; + uint32_t y[N]; + uint32_t z[N]; + uint32_t _p[N + 1]; + uint32_t *const p = _p + 1; + vp.getArray(p, N); + p[-1] = mont.rp_; + + for (size_t i = 0; i < 1000; i++) { + setRand(x, N, rg); + setRand(y, N, rg); + // remove MSB + x[N - 1] &= 0x7fffffff; + y[N - 1] &= 0x7fffffff; + mcl::Vint vx, vy, vz; + vx.setArray(x, N); + vy.setArray(y, N); + mont.mul(vz, vx, vy); + mcl::montT(z, x, y, p); + CYBOZU_TEST_EQUAL_ARRAY(z, vz.getUnit(), N); + } +} + +CYBOZU_TEST_AUTO(mont) +{ + const char *pStr = "0x2523648240000001ba344d80000000086121000000000013a700000000000013"; + montTest<8>(pStr); +} diff --git a/src/low_funct.hpp b/src/low_funct.hpp index 919c574..23e5885 100644 --- a/src/low_funct.hpp +++ b/src/low_funct.hpp @@ -4,12 +4,13 @@ @author MITSUNARI Shigeo(@herumi) @license modified new BSD license http://opensource.org/licenses/BSD-3-Clause + @note for only 32bit not full bit prime version + assert((p[N - 1] & 0x80000000) == 0); */ #include #include #include -// only for 32bit not full bit prime version namespace mcl { @@ -159,5 +160,28 @@ void subModT(uint32_t *z, const uint32_t *x, const uint32_t *y, const uint32_t * } } +/* + z[N] = Montgomery(x[N], y[N], p[N]) + @remark : assume p[-1] = rp +*/ +template +void montT(uint32_t *z, const uint32_t *x, const uint32_t *y, const uint32_t *p) +{ + const uint32_t rp = p[-1]; + assert((p[N - 1] & 0x80000000) == 0); + uint32_t buf[N * 2]; + buf[N] = mulUnitT(buf, x, y[0]); + uint32_t q = buf[0] * rp; + buf[N] += addMulUnitT(buf, p, q); + for (size_t i = 1; i < N; i++) { + buf[N + i] = addMulUnitT(buf + i, x, y[i]); + uint32_t q = buf[i] * rp; + buf[N + i] += addMulUnitT(buf + i, p, q); + } + if (subT(z, buf + N, p)) { + copyT(z, buf + N); + } +} + } // mcl From e61e56abdf4ed7b0d7bcfd86da06d14bf72c7beb Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Wed, 3 Feb 2021 11:17:48 +0900 Subject: [PATCH 06/12] add test of mont12 --- misc/Makefile | 6 ++++++ misc/low_test.cpp | 7 +++++-- 2 files changed, 11 insertions(+), 2 deletions(-) create mode 100644 misc/Makefile diff --git a/misc/Makefile b/misc/Makefile new file mode 100644 index 0000000..1a727f3 --- /dev/null +++ b/misc/Makefile @@ -0,0 +1,6 @@ +all: low_test + +CFLAGS=-I ../include/ -m32 -Ofast -Wall -Wextra -DNDEBUG + +low_test: low_test.cpp + $(CXX) -o low_test low_test.cpp $(CFLAGS) diff --git a/misc/low_test.cpp b/misc/low_test.cpp index 62f84af..856be61 100644 --- a/misc/low_test.cpp +++ b/misc/low_test.cpp @@ -142,10 +142,13 @@ void montTest(const char *pStr) mcl::montT(z, x, y, p); CYBOZU_TEST_EQUAL_ARRAY(z, vz.getUnit(), N); } + CYBOZU_BENCH_C("montT", 10000, mcl::montT, z, x, y, p); } CYBOZU_TEST_AUTO(mont) { - const char *pStr = "0x2523648240000001ba344d80000000086121000000000013a700000000000013"; - montTest<8>(pStr); + const char *pBN254 = "0x2523648240000001ba344d80000000086121000000000013a700000000000013"; + montTest<8>(pBN254); + const char *pBLS12_381 = "0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaab"; + montTest<12>(pBLS12_381); } From f2f39faf99db59ac1a10bc9f87c42ad5e5b854ac Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Wed, 3 Feb 2021 12:06:23 +0900 Subject: [PATCH 07/12] fix comment --- src/low_funct.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/low_funct.hpp b/src/low_funct.hpp index 23e5885..5db871b 100644 --- a/src/low_funct.hpp +++ b/src/low_funct.hpp @@ -76,7 +76,7 @@ uint32_t mulUnitT(uint32_t z[N], const uint32_t x[N], uint32_t y) return H; } -// [return:z[N]] = z[N] + x[N] * z +// [return:z[N]] = z[N] + x[N] * y template uint32_t addMulUnitT(uint32_t z[N], const uint32_t x[N], uint32_t y) { From a7728b22865189e4d107b94dab6c7499f4e4aeba Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Wed, 3 Feb 2021 12:18:21 +0900 Subject: [PATCH 08/12] add sqrT --- misc/low_test.cpp | 33 ++++++++++++++++++++++++++++++++ src/low_funct.hpp | 48 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 81 insertions(+) diff --git a/misc/low_test.cpp b/misc/low_test.cpp index 856be61..a77ed36 100644 --- a/misc/low_test.cpp +++ b/misc/low_test.cpp @@ -69,6 +69,39 @@ CYBOZU_TEST_AUTO(mulT) { mulTest<8>(); mulTest<12>(); + mulTest<16>(); +} + +template +void sqrTest() +{ + printf("N=%zd (%zdbit)\n", N, N * 32); + cybozu::XorShift rg; + uint32_t x[N]; + uint32_t y[N * 2]; + for (size_t i = 0; i < 1000; i++) { + setRand(x, N, rg); + // remove MSB + x[N - 1] &= 0x7fffffff; + mcl::Vint vx; + vx.setArray(x, N); + vx *= vx; + mcl::sqrT(y, x); + CYBOZU_TEST_EQUAL_ARRAY(y, vx.getUnit(), N * 2); +#if 0 + memset(z, 0, sizeof(z)); + mcl::karatsubaT(z, x, y); + CYBOZU_TEST_EQUAL_ARRAY(z, vx.getUnit(), N * 2); +#endif + } + CYBOZU_BENCH_C("sqrT", 10000, mcl::sqrT, y, x); +} + +CYBOZU_TEST_AUTO(sqrT) +{ + sqrTest<8>(); + sqrTest<12>(); + sqrTest<16>(); } struct Montgomery { diff --git a/src/low_funct.hpp b/src/low_funct.hpp index 5db871b..f25559c 100644 --- a/src/low_funct.hpp +++ b/src/low_funct.hpp @@ -101,6 +101,54 @@ void mulT(uint32_t z[N * 2], const uint32_t x[N], const uint32_t y[N]) } } +template +uint32_t mulUnitWithTblT(uint32_t z[N], const uint64_t *tbl_j) +{ + uint32_t H = 0; + for (size_t i = 0; i < N; i++) { + uint64_t v = tbl_j[i]; + v += H; + z[i] = uint32_t(v); + H = uint32_t(v >> 32); + } + return H; +} + +template +uint32_t addMulUnitWithTblT(uint32_t z[N], const uint64_t *tbl_j) +{ + uint32_t H = 0; + for (size_t i = 0; i < N; i++) { + uint64_t v = tbl_j[i]; + v += H; + v += z[i]; + z[i] = uint32_t(v); + H = uint32_t(v >> 32); + } + return H; +} + + +// y[N * 2] = x[N] * x[N] +template +void sqrT(uint32_t y[N * 2], const uint32_t x[N]) +{ + uint64_t tbl[N * N]; // x[i]x[j] + for (size_t i = 0; i < N; i++) { + uint64_t xi = x[i]; + tbl[i * N + i] = xi * xi; + for (size_t j = i + 1; j < N; j++) { + uint64_t v = xi * x[j]; + tbl[i * N + j] = v; + tbl[j * N + i] = v; + } + } + y[N] = mulUnitWithTblT(y, tbl); + for (size_t i = 1; i < N; i++) { + y[N + i] = addMulUnitWithTblT(&y[i], tbl + N * i); + } +} + /* z[N * 2] = x[N] * y[N] H = N/2 From a3293c2c85582e4874e5e8599948711babfafdea Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Wed, 3 Feb 2021 17:06:29 +0900 Subject: [PATCH 09/12] add sqrT --- misc/Makefile | 2 +- misc/low_test.cpp | 7 ------ src/low_funct.hpp | 57 ++++++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 55 insertions(+), 11 deletions(-) diff --git a/misc/Makefile b/misc/Makefile index 1a727f3..25a7c27 100644 --- a/misc/Makefile +++ b/misc/Makefile @@ -2,5 +2,5 @@ all: low_test CFLAGS=-I ../include/ -m32 -Ofast -Wall -Wextra -DNDEBUG -low_test: low_test.cpp +low_test: low_test.cpp ../src/low_funct.hpp $(CXX) -o low_test low_test.cpp $(CFLAGS) diff --git a/misc/low_test.cpp b/misc/low_test.cpp index a77ed36..540e3d9 100644 --- a/misc/low_test.cpp +++ b/misc/low_test.cpp @@ -69,7 +69,6 @@ CYBOZU_TEST_AUTO(mulT) { mulTest<8>(); mulTest<12>(); - mulTest<16>(); } template @@ -88,11 +87,6 @@ void sqrTest() vx *= vx; mcl::sqrT(y, x); CYBOZU_TEST_EQUAL_ARRAY(y, vx.getUnit(), N * 2); -#if 0 - memset(z, 0, sizeof(z)); - mcl::karatsubaT(z, x, y); - CYBOZU_TEST_EQUAL_ARRAY(z, vx.getUnit(), N * 2); -#endif } CYBOZU_BENCH_C("sqrT", 10000, mcl::sqrT, y, x); } @@ -101,7 +95,6 @@ CYBOZU_TEST_AUTO(sqrT) { sqrTest<8>(); sqrTest<12>(); - sqrTest<16>(); } struct Montgomery { diff --git a/src/low_funct.hpp b/src/low_funct.hpp index f25559c..2f3a2a1 100644 --- a/src/low_funct.hpp +++ b/src/low_funct.hpp @@ -22,6 +22,23 @@ void copyT(uint32_t y[N], const uint32_t x[N]) } } +template +uint32_t shlT(uint32_t y[N], const uint32_t x[N], size_t bit) +{ + assert(0 < bit && bit < 32); + assert((N % 2) == 0); + size_t rBit = sizeof(uint32_t) * 8 - bit; + uint32_t keep = x[N - 1]; + uint32_t prev = keep; + for (size_t i = N - 1; i > 0; i--) { + uint32_t t = x[i - 1]; + y[i] = (prev << bit) | (t >> rBit); + prev = t; + } + y[0] = prev << bit; + return keep >> rBit; +} + // [return:y[N]] += x template inline bool addUnitT(uint32_t y[N], uint32_t x) @@ -101,6 +118,8 @@ void mulT(uint32_t z[N * 2], const uint32_t x[N], const uint32_t y[N]) } } +#if 0 +// slower than mulT template uint32_t mulUnitWithTblT(uint32_t z[N], const uint64_t *tbl_j) { @@ -128,7 +147,6 @@ uint32_t addMulUnitWithTblT(uint32_t z[N], const uint64_t *tbl_j) return H; } - // y[N * 2] = x[N] * x[N] template void sqrT(uint32_t y[N * 2], const uint32_t x[N]) @@ -148,6 +166,7 @@ void sqrT(uint32_t y[N * 2], const uint32_t x[N]) y[N + i] = addMulUnitWithTblT(&y[i], tbl + N * i); } } +#endif /* z[N * 2] = x[N] * y[N] @@ -157,6 +176,7 @@ void sqrT(uint32_t y[N * 2], const uint32_t x[N]) assume a < W/2, c < W/2 (aW + b)(cW + d) = acW^2 + (ad + bc)W + bd ad + bc = (a + b)(c + d) - ac - bd < (1 << (N * 32)) + slower than mulT on Core i7 with -m32 for N <= 12 */ template void karatsubaT(uint32_t z[N * 2], const uint32_t x[N], const uint32_t y[N]) @@ -165,8 +185,6 @@ void karatsubaT(uint32_t z[N * 2], const uint32_t x[N], const uint32_t y[N]) assert((x[N - 1] & 0x80000000) == 0); assert((y[N - 1] & 0x80000000) == 0); const size_t H = N / 2; - mulT(z, x, y); // bd - mulT(z + N, x + H, y + H); // ac uint32_t a_b[H]; uint32_t c_d[H]; bool c1 = addT(a_b, x, x + H); // a + b @@ -179,6 +197,8 @@ void karatsubaT(uint32_t z[N * 2], const uint32_t x[N], const uint32_t y[N]) if (c2) { addT(tmp + H, tmp + H, a_b); } + mulT(z, x, y); // bd + mulT(z + N, x + H, y + H); // ac // c:tmp[N] = (a + b)(c + d) subT(tmp, tmp, z); subT(tmp, tmp, z + N); @@ -188,6 +208,37 @@ void karatsubaT(uint32_t z[N * 2], const uint32_t x[N], const uint32_t y[N]) } } +/* + y[N * 2] = x[N] * x[N] + (aW + b)^2 = a^2 W + b^2 + 2abW + (a+b)^2 - a^2 - b^2 +*/ +template +void sqrT(uint32_t y[N * 2], const uint32_t x[N]) +{ + assert((N % 2) == 0); + assert((x[N - 1] & 0x80000000) == 0); + const size_t H = N / 2; + uint32_t a_b[H]; + bool c = addT(a_b, x, x + H); // a + b + uint32_t tmp[N]; + mulT(tmp, a_b, a_b); + if (c) { +// addT(a_b, a_b, a_b); + shlT(a_b, a_b, 1); + addT(tmp + H, tmp + H, a_b); + } + mulT(y, x, x); // b^2 + mulT(y + N, x + H, x + H); // a^2 + // tmp[N] = (a + b)^2 + subT(tmp, tmp, y); + subT(tmp, tmp, y + N); + // tmp[N] = 2ab + if (addT(y + H, y + H, tmp)) { + addUnitT(y + N + H, 1); + } +} + template void addModT(uint32_t z[N], const uint32_t x[N], const uint32_t y[N], const uint32_t p[N]) { From c346f747d7768c5f6497e888d858de003b9963c0 Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Thu, 4 Feb 2021 15:09:45 +0900 Subject: [PATCH 10/12] add modT --- misc/low_test.cpp | 47 +++++++++++++++++++++++++++++++++++++++++++++++ src/low_funct.hpp | 44 ++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 89 insertions(+), 2 deletions(-) diff --git a/misc/low_test.cpp b/misc/low_test.cpp index 540e3d9..547e27c 100644 --- a/misc/low_test.cpp +++ b/misc/low_test.cpp @@ -137,6 +137,19 @@ struct Montgomery { } z = c; } + void mod(mcl::Vint& z, const mcl::Vint& xy) const + { + z = xy; + for (size_t i = 0; i < pn_; i++) { + uint32_t q = z.getUnit()[0] * rp_; + mcl::Vint t = q; + z += p_ * t; + z >>= 32; + } + if (z >= p_) { + z -= p_; + } + } }; template @@ -171,10 +184,44 @@ void montTest(const char *pStr) CYBOZU_BENCH_C("montT", 10000, mcl::montT, z, x, y, p); } +template +void modTest(const char *pStr) +{ + mcl::Vint vp; + vp.setStr(pStr); + Montgomery mont(vp); + + cybozu::XorShift rg; + uint32_t xy[N * 2]; + uint32_t z[N]; + uint32_t _p[N + 1]; + uint32_t *const p = _p + 1; + vp.getArray(p, N); + p[-1] = mont.rp_; + + for (size_t i = 0; i < 1000; i++) { + setRand(xy, N * 2, rg); + // remove MSB + xy[N * 2 - 1] &= 0x7fffffff; + mcl::Vint vxy, vz; + vxy.setArray(xy, N * 2); + mont.mod(vz, vxy); + mcl::modT(z, xy, p); + CYBOZU_TEST_EQUAL_ARRAY(z, vz.getUnit(), N); + } + CYBOZU_BENCH_C("modT", 10000, mcl::modT, z, xy, p); +} + CYBOZU_TEST_AUTO(mont) { const char *pBN254 = "0x2523648240000001ba344d80000000086121000000000013a700000000000013"; + puts("BN254"); montTest<8>(pBN254); + modTest<8>(pBN254); + const char *pBLS12_381 = "0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaab"; + puts("BLS12"); montTest<12>(pBLS12_381); + modTest<12>(pBLS12_381); } + diff --git a/src/low_funct.hpp b/src/low_funct.hpp index 2f3a2a1..104d6fc 100644 --- a/src/low_funct.hpp +++ b/src/low_funct.hpp @@ -251,7 +251,7 @@ void addModT(uint32_t z[N], const uint32_t x[N], const uint32_t y[N], const uint } template -void subModT(uint32_t *z, const uint32_t *x, const uint32_t *y, const uint32_t *p) +void subModT(uint32_t z[N], const uint32_t x[N], const uint32_t y[N], const uint32_t p[N]) { bool c = subT(z, x, y); if (c) { @@ -264,7 +264,7 @@ void subModT(uint32_t *z, const uint32_t *x, const uint32_t *y, const uint32_t * @remark : assume p[-1] = rp */ template -void montT(uint32_t *z, const uint32_t *x, const uint32_t *y, const uint32_t *p) +void montT(uint32_t z[N], const uint32_t x[N], const uint32_t y[N], const uint32_t p[N]) { const uint32_t rp = p[-1]; assert((p[N - 1] & 0x80000000) == 0); @@ -282,5 +282,45 @@ void montT(uint32_t *z, const uint32_t *x, const uint32_t *y, const uint32_t *p) } } +// [return:z[N+1]] = z[N+1] + x[N] * y + (cc << (N * 32)) +template +bool addMulUnit2T(uint32_t z[N + 1], const uint32_t x[N], uint32_t y, const bool *cc = 0) +{ + uint32_t H = 0; + for (size_t i = 0; i < N; i++) { + uint64_t v = uint64_t(x[i]) * y; + v += H; + v += z[i]; + z[i] = uint32_t(v); + H = uint32_t(v >> 32); + } + if (cc) H += *cc; + uint64_t v = uint64_t(z[N]); + v += H; + z[N] = uint32_t(v); + return (v >> 32) != 0; +} + +/* + z[N] = Montgomery reduction(y[N], xy[N], p[N]) + @remark : assume p[-1] = rp +*/ +template +void modT(uint32_t y[N], const uint32_t xy[N * 2], const uint32_t p[N]) +{ + const uint32_t rp = p[-1]; + assert((p[N - 1] & 0x80000000) == 0); + uint32_t buf[N * 2]; + copyT(buf, xy); + bool c = 0; + for (size_t i = 0; i < N; i++) { + uint32_t q = buf[i] * rp; + c = addMulUnit2T(buf + i, p, q, &c); + } + if (subT(y, buf + N, p)) { + copyT(y, buf + N); + } +} + } // mcl From dc8964db5ab4326329354a7deccd60198c61118d Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Thu, 4 Feb 2021 17:18:36 +0900 Subject: [PATCH 11/12] add sqrMont --- misc/low_test.cpp | 15 ++++++++++----- src/fp.cpp | 40 ++++++++++++++++++++++++++-------------- src/low_funct.hpp | 18 +++++++++++++++++- 3 files changed, 53 insertions(+), 20 deletions(-) diff --git a/misc/low_test.cpp b/misc/low_test.cpp index 547e27c..91af412 100644 --- a/misc/low_test.cpp +++ b/misc/low_test.cpp @@ -153,7 +153,7 @@ struct Montgomery { }; template -void montTest(const char *pStr) +void mulMontTest(const char *pStr) { mcl::Vint vp; vp.setStr(pStr); @@ -178,10 +178,15 @@ void montTest(const char *pStr) vx.setArray(x, N); vy.setArray(y, N); mont.mul(vz, vx, vy); - mcl::montT(z, x, y, p); + mcl::mulMontT(z, x, y, p); + CYBOZU_TEST_EQUAL_ARRAY(z, vz.getUnit(), N); + + mont.mul(vz, vx, vx); + mcl::sqrMontT(z, x, p); CYBOZU_TEST_EQUAL_ARRAY(z, vz.getUnit(), N); } - CYBOZU_BENCH_C("montT", 10000, mcl::montT, z, x, y, p); + CYBOZU_BENCH_C("mulMontT", 10000, mcl::mulMontT, x, x, y, p); + CYBOZU_BENCH_C("sqrMontT", 10000, mcl::sqrMontT, x, x, p); } template @@ -216,12 +221,12 @@ CYBOZU_TEST_AUTO(mont) { const char *pBN254 = "0x2523648240000001ba344d80000000086121000000000013a700000000000013"; puts("BN254"); - montTest<8>(pBN254); + mulMontTest<8>(pBN254); modTest<8>(pBN254); const char *pBLS12_381 = "0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaab"; puts("BLS12"); - montTest<12>(pBLS12_381); + mulMontTest<12>(pBLS12_381); modTest<12>(pBLS12_381); } diff --git a/src/fp.cpp b/src/fp.cpp index 3a70db5..484ad43 100644 --- a/src/fp.cpp +++ b/src/fp.cpp @@ -272,20 +272,6 @@ void setOp2(Op& op) } else { op.fp_add = Add::f; op.fp_sub = Sub::f; -#ifdef FOR_WASM - switch (N) { - case 8: - op.fp_add = mcl::addModT<8>; - op.fp_sub = mcl::subModT<8>; - break; - case 12: - op.fp_add = mcl::addModT<12>; - op.fp_sub = mcl::subModT<12>; - break; - default: - break; - } -#endif } if (op.isMont) { if (op.isFullBit) { @@ -425,6 +411,25 @@ static bool initForMont(Op& op, const Unit *p, Mode mode) return true; } +#ifdef FOR_WASM +template +void setWasmOp(Op& op) +{ + if (!(op.isMont && !op.isFullBit)) return; +EM_ASM({console.log($0)}, N); +// op.fp_addPre = mcl::addT; +// op.fp_subPre = mcl::subT; +// op.fpDbl_addPre = mcl::addT; +// op.fpDbl_subPre = mcl::subT; + op.fp_add = mcl::addModT; + op.fp_sub = mcl::subModT; + op.fp_mul = mcl::mulMontT; + op.fp_sqr = mcl::sqrMontT; + op.fpDbl_mulPre = mulT; + op.fpDbl_mod = modT; +} +#endif + bool Op::init(const mpz_class& _p, size_t maxBitSize, int _xi_a, Mode mode, size_t mclMaxBitSize) { if (mclMaxBitSize != MCL_MAX_BIT_SIZE) return false; @@ -565,6 +570,13 @@ bool Op::init(const mpz_class& _p, size_t maxBitSize, int _xi_a, Mode mode, size default: return false; } +#ifdef FOR_WASM + if (N == 8) { + setWasmOp<8>(*this); + } else if (N == 12) { + setWasmOp<12>(*this); + } +#endif #ifdef MCL_USE_LLVM if (primeMode == PM_NIST_P192) { fp_mul = &mcl_fp_mulNIST_P192L; diff --git a/src/low_funct.hpp b/src/low_funct.hpp index 104d6fc..082c2df 100644 --- a/src/low_funct.hpp +++ b/src/low_funct.hpp @@ -264,7 +264,7 @@ void subModT(uint32_t z[N], const uint32_t x[N], const uint32_t y[N], const uint @remark : assume p[-1] = rp */ template -void montT(uint32_t z[N], const uint32_t x[N], const uint32_t y[N], const uint32_t p[N]) +void mulMontT(uint32_t z[N], const uint32_t x[N], const uint32_t y[N], const uint32_t p[N]) { const uint32_t rp = p[-1]; assert((p[N - 1] & 0x80000000) == 0); @@ -322,5 +322,21 @@ void modT(uint32_t y[N], const uint32_t xy[N * 2], const uint32_t p[N]) } } +/* + z[N] = Montgomery(x[N], y[N], p[N]) + @remark : assume p[-1] = rp +*/ +template +void sqrMontT(uint32_t y[N], const uint32_t x[N], const uint32_t p[N]) +{ +#if 1 + mulMontT(y, x, x, p); +#else + uint32_t xx[N * 2]; + sqrT(xx, x); + modT(y, xx, p); +#endif +} + } // mcl From 215106e9e6365fbc3ccd98ea89fc583510843885 Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Fri, 5 Feb 2021 17:00:47 +0900 Subject: [PATCH 12/12] replace bool to unit32_t --- misc/low_test.cpp | 14 ++++++++------ src/low_funct.hpp | 40 ++++++++++++++++++++-------------------- 2 files changed, 28 insertions(+), 26 deletions(-) diff --git a/misc/low_test.cpp b/misc/low_test.cpp index 91af412..06418f9 100644 --- a/misc/low_test.cpp +++ b/misc/low_test.cpp @@ -21,6 +21,8 @@ void dump(const char *msg, const uint32_t *x, size_t n) #include #include +const int C = 10000; + template void setRand(uint32_t *x, size_t n, RG& rg) { @@ -61,8 +63,8 @@ void mulTest() mcl::karatsubaT(z, x, y); CYBOZU_TEST_EQUAL_ARRAY(z, vx.getUnit(), N * 2); } - CYBOZU_BENCH_C("mulT", 10000, mcl::mulT, z, x, y); - CYBOZU_BENCH_C("kara", 10000, mcl::karatsubaT, z, x, y); + CYBOZU_BENCH_C("mulT", C, mcl::mulT, z, x, y); + CYBOZU_BENCH_C("kara", C, mcl::karatsubaT, z, x, y); } CYBOZU_TEST_AUTO(mulT) @@ -88,7 +90,7 @@ void sqrTest() mcl::sqrT(y, x); CYBOZU_TEST_EQUAL_ARRAY(y, vx.getUnit(), N * 2); } - CYBOZU_BENCH_C("sqrT", 10000, mcl::sqrT, y, x); + CYBOZU_BENCH_C("sqrT", C, mcl::sqrT, y, x); } CYBOZU_TEST_AUTO(sqrT) @@ -185,8 +187,8 @@ void mulMontTest(const char *pStr) mcl::sqrMontT(z, x, p); CYBOZU_TEST_EQUAL_ARRAY(z, vz.getUnit(), N); } - CYBOZU_BENCH_C("mulMontT", 10000, mcl::mulMontT, x, x, y, p); - CYBOZU_BENCH_C("sqrMontT", 10000, mcl::sqrMontT, x, x, p); + CYBOZU_BENCH_C("mulMontT", C, mcl::mulMontT, x, x, y, p); + CYBOZU_BENCH_C("sqrMontT", C, mcl::sqrMontT, x, x, p); } template @@ -214,7 +216,7 @@ void modTest(const char *pStr) mcl::modT(z, xy, p); CYBOZU_TEST_EQUAL_ARRAY(z, vz.getUnit(), N); } - CYBOZU_BENCH_C("modT", 10000, mcl::modT, z, xy, p); + CYBOZU_BENCH_C("modT", C, mcl::modT, z, xy, p); } CYBOZU_TEST_AUTO(mont) diff --git a/src/low_funct.hpp b/src/low_funct.hpp index 082c2df..885b16a 100644 --- a/src/low_funct.hpp +++ b/src/low_funct.hpp @@ -41,40 +41,40 @@ uint32_t shlT(uint32_t y[N], const uint32_t x[N], size_t bit) // [return:y[N]] += x template -inline bool addUnitT(uint32_t y[N], uint32_t x) +inline uint32_t addUnitT(uint32_t y[N], uint32_t x) { uint64_t v = uint64_t(y[0]) + x; y[0] = uint32_t(v); - bool c = (v >> 32) != 0; - if (!c) return false; + uint32_t c = v >> 32; + if (c == 0) return 0; for (size_t i = 1; i < N; i++) { v = uint64_t(y[i]) + 1; y[i] = uint32_t(v); - if ((v >> 32) == 0) return false; + if ((v >> 32) == 0) return 0; } - return true; + return 1; } template -bool addT(uint32_t z[N], const uint32_t x[N], const uint32_t y[N]) +uint32_t addT(uint32_t z[N], const uint32_t x[N], const uint32_t y[N]) { - bool c = false; + uint32_t c = 0; for (size_t i = 0; i < N; i++) { uint64_t v = uint64_t(x[i]) + y[i] + c; z[i] = uint32_t(v); - c = (v >> 32) != 0; + c = uint32_t(v >> 32); } return c; } template -bool subT(uint32_t z[N], const uint32_t x[N], const uint32_t y[N]) +uint32_t subT(uint32_t z[N], const uint32_t x[N], const uint32_t y[N]) { - bool c = false; + uint32_t c = 0; for (size_t i = 0; i < N; i++) { uint64_t v = uint64_t(x[i]) - y[i] - c; z[i] = uint32_t(v); - c = (v >> 32) != 0; + c = uint32_t(v >> 63); } return c; } @@ -187,8 +187,8 @@ void karatsubaT(uint32_t z[N * 2], const uint32_t x[N], const uint32_t y[N]) const size_t H = N / 2; uint32_t a_b[H]; uint32_t c_d[H]; - bool c1 = addT(a_b, x, x + H); // a + b - bool c2 = addT(c_d, y, y + H); // c + d + uint32_t c1 = addT(a_b, x, x + H); // a + b + uint32_t c2 = addT(c_d, y, y + H); // c + d uint32_t tmp[N]; mulT(tmp, a_b, c_d); if (c1) { @@ -220,11 +220,10 @@ void sqrT(uint32_t y[N * 2], const uint32_t x[N]) assert((x[N - 1] & 0x80000000) == 0); const size_t H = N / 2; uint32_t a_b[H]; - bool c = addT(a_b, x, x + H); // a + b + uint32_t c = addT(a_b, x, x + H); // a + b uint32_t tmp[N]; mulT(tmp, a_b, a_b); if (c) { -// addT(a_b, a_b, a_b); shlT(a_b, a_b, 1); addT(tmp + H, tmp + H, a_b); } @@ -244,7 +243,7 @@ void addModT(uint32_t z[N], const uint32_t x[N], const uint32_t y[N], const uint { uint32_t t[N]; addT(z, x, y); - bool c = subT(t, z, p); + uint32_t c = subT(t, z, p); if (!c) { copyT(z, t); } @@ -253,7 +252,7 @@ void addModT(uint32_t z[N], const uint32_t x[N], const uint32_t y[N], const uint template void subModT(uint32_t z[N], const uint32_t x[N], const uint32_t y[N], const uint32_t p[N]) { - bool c = subT(z, x, y); + uint32_t c = subT(z, x, y); if (c) { addT(z, z, p); } @@ -284,7 +283,7 @@ void mulMontT(uint32_t z[N], const uint32_t x[N], const uint32_t y[N], const uin // [return:z[N+1]] = z[N+1] + x[N] * y + (cc << (N * 32)) template -bool addMulUnit2T(uint32_t z[N + 1], const uint32_t x[N], uint32_t y, const bool *cc = 0) +uint32_t addMulUnit2T(uint32_t z[N + 1], const uint32_t x[N], uint32_t y, const uint32_t *cc = 0) { uint32_t H = 0; for (size_t i = 0; i < N; i++) { @@ -298,7 +297,7 @@ bool addMulUnit2T(uint32_t z[N + 1], const uint32_t x[N], uint32_t y, const bool uint64_t v = uint64_t(z[N]); v += H; z[N] = uint32_t(v); - return (v >> 32) != 0; + return uint32_t(v >> 32); } /* @@ -312,7 +311,7 @@ void modT(uint32_t y[N], const uint32_t xy[N * 2], const uint32_t p[N]) assert((p[N - 1] & 0x80000000) == 0); uint32_t buf[N * 2]; copyT(buf, xy); - bool c = 0; + uint32_t c = 0; for (size_t i = 0; i < N; i++) { uint32_t q = buf[i] * rp; c = addMulUnit2T(buf + i, p, q, &c); @@ -332,6 +331,7 @@ void sqrMontT(uint32_t y[N], const uint32_t x[N], const uint32_t p[N]) #if 1 mulMontT(y, x, x, p); #else + // slower uint32_t xx[N * 2]; sqrT(xx, x); modT(y, xx, p);