update-fork
MITSUNARI Shigeo 4 years ago
parent a7728b2286
commit a3293c2c85
  1. 2
      misc/Makefile
  2. 7
      misc/low_test.cpp
  3. 57
      src/low_funct.hpp

@ -2,5 +2,5 @@ all: low_test
CFLAGS=-I ../include/ -m32 -Ofast -Wall -Wextra -DNDEBUG
low_test: low_test.cpp
low_test: low_test.cpp ../src/low_funct.hpp
$(CXX) -o low_test low_test.cpp $(CFLAGS)

@ -69,7 +69,6 @@ CYBOZU_TEST_AUTO(mulT)
{
mulTest<8>();
mulTest<12>();
mulTest<16>();
}
template<size_t N>
@ -88,11 +87,6 @@ void sqrTest()
vx *= vx;
mcl::sqrT<N>(y, x);
CYBOZU_TEST_EQUAL_ARRAY(y, vx.getUnit(), N * 2);
#if 0
memset(z, 0, sizeof(z));
mcl::karatsubaT<N>(z, x, y);
CYBOZU_TEST_EQUAL_ARRAY(z, vx.getUnit(), N * 2);
#endif
}
CYBOZU_BENCH_C("sqrT", 10000, mcl::sqrT<N>, y, x);
}
@ -101,7 +95,6 @@ CYBOZU_TEST_AUTO(sqrT)
{
sqrTest<8>();
sqrTest<12>();
sqrTest<16>();
}
struct Montgomery {

@ -22,6 +22,23 @@ void copyT(uint32_t y[N], const uint32_t x[N])
}
}
template<size_t N>
uint32_t shlT(uint32_t y[N], const uint32_t x[N], size_t bit)
{
assert(0 < bit && bit < 32);
assert((N % 2) == 0);
size_t rBit = sizeof(uint32_t) * 8 - bit;
uint32_t keep = x[N - 1];
uint32_t prev = keep;
for (size_t i = N - 1; i > 0; i--) {
uint32_t t = x[i - 1];
y[i] = (prev << bit) | (t >> rBit);
prev = t;
}
y[0] = prev << bit;
return keep >> rBit;
}
// [return:y[N]] += x
template<size_t N>
inline bool addUnitT(uint32_t y[N], uint32_t x)
@ -101,6 +118,8 @@ void mulT(uint32_t z[N * 2], const uint32_t x[N], const uint32_t y[N])
}
}
#if 0
// slower than mulT
template<size_t N>
uint32_t mulUnitWithTblT(uint32_t z[N], const uint64_t *tbl_j)
{
@ -128,7 +147,6 @@ uint32_t addMulUnitWithTblT(uint32_t z[N], const uint64_t *tbl_j)
return H;
}
// y[N * 2] = x[N] * x[N]
template<size_t N>
void sqrT(uint32_t y[N * 2], const uint32_t x[N])
@ -148,6 +166,7 @@ void sqrT(uint32_t y[N * 2], const uint32_t x[N])
y[N + i] = addMulUnitWithTblT<N>(&y[i], tbl + N * i);
}
}
#endif
/*
z[N * 2] = x[N] * y[N]
@ -157,6 +176,7 @@ void sqrT(uint32_t y[N * 2], const uint32_t x[N])
assume a < W/2, c < W/2
(aW + b)(cW + d) = acW^2 + (ad + bc)W + bd
ad + bc = (a + b)(c + d) - ac - bd < (1 << (N * 32))
slower than mulT on Core i7 with -m32 for N <= 12
*/
template<size_t N>
void karatsubaT(uint32_t z[N * 2], const uint32_t x[N], const uint32_t y[N])
@ -165,8 +185,6 @@ void karatsubaT(uint32_t z[N * 2], const uint32_t x[N], const uint32_t y[N])
assert((x[N - 1] & 0x80000000) == 0);
assert((y[N - 1] & 0x80000000) == 0);
const size_t H = N / 2;
mulT<H>(z, x, y); // bd
mulT<H>(z + N, x + H, y + H); // ac
uint32_t a_b[H];
uint32_t c_d[H];
bool c1 = addT<H>(a_b, x, x + H); // a + b
@ -179,6 +197,8 @@ void karatsubaT(uint32_t z[N * 2], const uint32_t x[N], const uint32_t y[N])
if (c2) {
addT<H>(tmp + H, tmp + H, a_b);
}
mulT<H>(z, x, y); // bd
mulT<H>(z + N, x + H, y + H); // ac
// c:tmp[N] = (a + b)(c + d)
subT<N>(tmp, tmp, z);
subT<N>(tmp, tmp, z + N);
@ -188,6 +208,37 @@ void karatsubaT(uint32_t z[N * 2], const uint32_t x[N], const uint32_t y[N])
}
}
/*
y[N * 2] = x[N] * x[N]
(aW + b)^2 = a^2 W + b^2 + 2abW
(a+b)^2 - a^2 - b^2
*/
template<size_t N>
void sqrT(uint32_t y[N * 2], const uint32_t x[N])
{
assert((N % 2) == 0);
assert((x[N - 1] & 0x80000000) == 0);
const size_t H = N / 2;
uint32_t a_b[H];
bool c = addT<H>(a_b, x, x + H); // a + b
uint32_t tmp[N];
mulT<H>(tmp, a_b, a_b);
if (c) {
// addT<H>(a_b, a_b, a_b);
shlT<H>(a_b, a_b, 1);
addT<H>(tmp + H, tmp + H, a_b);
}
mulT<H>(y, x, x); // b^2
mulT<H>(y + N, x + H, x + H); // a^2
// tmp[N] = (a + b)^2
subT<N>(tmp, tmp, y);
subT<N>(tmp, tmp, y + N);
// tmp[N] = 2ab
if (addT<N>(y + H, y + H, tmp)) {
addUnitT<H>(y + N + H, 1);
}
}
template<size_t N>
void addModT(uint32_t z[N], const uint32_t x[N], const uint32_t y[N], const uint32_t p[N])
{

Loading…
Cancel
Save