diff --git a/misc/Makefile b/misc/Makefile
index 1a727f3..25a7c27 100644
--- a/misc/Makefile
+++ b/misc/Makefile
@@ -2,5 +2,5 @@ all: low_test
 
 CFLAGS=-I ../include/ -m32 -Ofast -Wall -Wextra -DNDEBUG
 
-low_test: low_test.cpp
+low_test: low_test.cpp ../src/low_funct.hpp
 	$(CXX) -o low_test low_test.cpp $(CFLAGS)
diff --git a/misc/low_test.cpp b/misc/low_test.cpp
index a77ed36..540e3d9 100644
--- a/misc/low_test.cpp
+++ b/misc/low_test.cpp
@@ -69,7 +69,6 @@ CYBOZU_TEST_AUTO(mulT)
 {
 	mulTest<8>();
 	mulTest<12>();
-	mulTest<16>();
 }
 
 template<size_t N>
@@ -88,11 +87,6 @@ void sqrTest()
 		vx *= vx;
 		mcl::sqrT<N>(y, x);
 		CYBOZU_TEST_EQUAL_ARRAY(y, vx.getUnit(), N * 2);
-#if 0
-		memset(z, 0, sizeof(z));
-		mcl::karatsubaT<N>(z, x, y);
-		CYBOZU_TEST_EQUAL_ARRAY(z, vx.getUnit(), N * 2);
-#endif
 	}
 	CYBOZU_BENCH_C("sqrT", 10000, mcl::sqrT<N>, y, x);
 }
@@ -101,7 +95,6 @@ CYBOZU_TEST_AUTO(sqrT)
 {
 	sqrTest<8>();
 	sqrTest<12>();
-	sqrTest<16>();
 }
 
 struct Montgomery {
diff --git a/src/low_funct.hpp b/src/low_funct.hpp
index f25559c..2f3a2a1 100644
--- a/src/low_funct.hpp
+++ b/src/low_funct.hpp
@@ -22,6 +22,23 @@ void copyT(uint32_t y[N], const uint32_t x[N])
 	}
 }
 
+template<size_t N>
+uint32_t shlT(uint32_t y[N], const uint32_t x[N], size_t bit)
+{
+	assert(0 < bit && bit < 32);
+	assert((N % 2) == 0);
+	size_t rBit = sizeof(uint32_t) * 8 - bit;
+	uint32_t keep = x[N - 1];
+	uint32_t prev = keep;
+	for (size_t i = N - 1; i > 0; i--) {
+		uint32_t t = x[i - 1];
+		y[i] = (prev << bit) | (t >> rBit);
+		prev = t;
+	}
+	y[0] = prev << bit;
+	return keep >> rBit;
+}
+
 // [return:y[N]] += x
 template<size_t N>
 inline bool addUnitT(uint32_t y[N], uint32_t x)
@@ -101,6 +118,8 @@ void mulT(uint32_t z[N * 2], const uint32_t x[N], const uint32_t y[N])
 	}
 }
 
+#if 0
+// slower than mulT
 template<size_t N>
 uint32_t mulUnitWithTblT(uint32_t z[N], const uint64_t *tbl_j)
 {
@@ -128,7 +147,6 @@ uint32_t addMulUnitWithTblT(uint32_t z[N], const uint64_t *tbl_j)
 	return H;
 }
 
-
 // y[N * 2] = x[N] * x[N]
 template<size_t N>
 void sqrT(uint32_t y[N * 2], const uint32_t x[N])
@@ -148,6 +166,7 @@ void sqrT(uint32_t y[N * 2], const uint32_t x[N])
 		y[N + i] = addMulUnitWithTblT<N>(&y[i], tbl + N * i);
 	}
 }
+#endif
 
 /*
 	z[N * 2] = x[N] * y[N]
@@ -157,6 +176,7 @@ void sqrT(uint32_t y[N * 2], const uint32_t x[N])
 	assume a < W/2, c < W/2
 	(aW + b)(cW + d) = acW^2 + (ad + bc)W + bd
 	ad + bc = (a + b)(c + d) - ac - bd < (1 << (N * 32))
+	slower than mulT on Core i7 with -m32 for N <= 12
 */
 template<size_t N>
 void karatsubaT(uint32_t z[N * 2], const uint32_t x[N], const uint32_t y[N])
@@ -165,8 +185,6 @@ void karatsubaT(uint32_t z[N * 2], const uint32_t x[N], const uint32_t y[N])
 	assert((x[N - 1] & 0x80000000) == 0);
 	assert((y[N - 1] & 0x80000000) == 0);
 	const size_t H = N / 2;
-	mulT<H>(z, x, y); // bd
-	mulT<H>(z + N, x + H, y + H); // ac
 	uint32_t a_b[H];
 	uint32_t c_d[H];
 	bool c1 = addT<H>(a_b, x, x + H); // a + b
@@ -179,6 +197,8 @@ void karatsubaT(uint32_t z[N * 2], const uint32_t x[N], const uint32_t y[N])
 	if (c2) {
 		addT<H>(tmp + H, tmp + H, a_b);
 	}
+	mulT<H>(z, x, y); // bd
+	mulT<H>(z + N, x + H, y + H); // ac
 	// c:tmp[N] = (a + b)(c + d)
 	subT<N>(tmp, tmp, z);
 	subT<N>(tmp, tmp, z + N);
@@ -188,6 +208,37 @@ void karatsubaT(uint32_t z[N * 2], const uint32_t x[N], const uint32_t y[N])
 	}
 }
 
+/*
+	y[N * 2] = x[N] * x[N]
+	(aW + b)^2 = a^2 W + b^2 + 2abW
+	(a+b)^2 - a^2 - b^2
+*/
+template<size_t N>
+void sqrT(uint32_t y[N * 2], const uint32_t x[N])
+{
+	assert((N % 2) == 0);
+	assert((x[N - 1] & 0x80000000) == 0);
+	const size_t H = N / 2;
+	uint32_t a_b[H];
+	bool c = addT<H>(a_b, x, x + H); // a + b
+	uint32_t tmp[N];
+	mulT<H>(tmp, a_b, a_b);
+	if (c) {
+//		addT<H>(a_b, a_b, a_b);
+		shlT<H>(a_b, a_b, 1);
+		addT<H>(tmp + H, tmp + H, a_b);
+	}
+	mulT<H>(y, x, x); // b^2
+	mulT<H>(y + N, x + H, x + H); // a^2
+	// tmp[N] = (a + b)^2
+	subT<N>(tmp, tmp, y);
+	subT<N>(tmp, tmp, y + N);
+	// tmp[N] = 2ab
+	if (addT<N>(y + H, y + H, tmp)) {
+		addUnitT<H>(y + N + H, 1);
+	}
+}
+
 template<size_t N>
 void addModT(uint32_t z[N], const uint32_t x[N], const uint32_t y[N], const uint32_t p[N])
 {