diff --git a/sample/large.cpp b/sample/large.cpp
index 72de4a2..cd79412 100644
--- a/sample/large.cpp
+++ b/sample/large.cpp
@@ -108,11 +108,11 @@ void test(const std::string& pStr, mcl::fp::Mode mode)
 	}
 	CYBOZU_BENCH("mulPre", op.fpDbl_mulPre, ux, ux, uy);
 	CYBOZU_BENCH("sqrPre", op.fpDbl_sqrPre, ux, ux);
-	CYBOZU_BENCH("add", op.fpDbl_add, ux, ux, ux);
-	CYBOZU_BENCH("sub", op.fpDbl_sub, ux, ux, ux);
+	CYBOZU_BENCH("add", op.fpDbl_add, ux, ux, ux, op.p);
+	CYBOZU_BENCH("sub", op.fpDbl_sub, ux, ux, ux, op.p);
 	CYBOZU_BENCH("addNC", op.fpDbl_addNC, ux, ux, ux);
 	CYBOZU_BENCH("subNC", op.fpDbl_subNC, ux, ux, ux);
-	CYBOZU_BENCH("mont", op.fpDbl_mod, ux, ux);
+	CYBOZU_BENCH("mont", op.fpDbl_mod, ux, ux, op.p);
 	CYBOZU_BENCH("mul", Fp::mul, x, x, x);
 	compareGmp(pStr);
 }
diff --git a/sample/rawbench.cpp b/sample/rawbench.cpp
index 083d0cf..ddfe733 100644
--- a/sample/rawbench.cpp
+++ b/sample/rawbench.cpp
@@ -36,19 +36,19 @@ void benchRaw(const char *p, mcl::fp::Mode mode)
 	double fpDbl_addT, fpDbl_subT;
 	double fpDbl_sqrPreT, fpDbl_mulPreT, fpDbl_modT;
 	double fp2_sqrT, fp2_mulT;
-	CYBOZU_BENCH_T(fp_addT, op.fp_add, uz, ux, uy);
-	CYBOZU_BENCH_T(fp_subT, op.fp_sub, uz, uy, ux);
+	CYBOZU_BENCH_T(fp_addT, op.fp_add, uz, ux, uy, op.p);
+	CYBOZU_BENCH_T(fp_subT, op.fp_sub, uz, uy, ux, op.p);
 	CYBOZU_BENCH_T(fp_addNCT, op.fp_addNC, uz, ux, uy);
 	CYBOZU_BENCH_T(fp_subNCT, op.fp_subNC, uz, uy, ux);
-	CYBOZU_BENCH_T(fp_sqrT, op.fp_sqr, uz, ux);
-	CYBOZU_BENCH_T(fp_mulT, op.fp_mul, uz, ux, uy);
-	CYBOZU_BENCH_T(fp_mul_UnitT, op.fp_mul_Unit, uz, ux, 12345678);
+	CYBOZU_BENCH_T(fp_sqrT, op.fp_sqr, uz, ux, op.p);
+	CYBOZU_BENCH_T(fp_mulT, op.fp_mul, uz, ux, uy, op.p);
+	CYBOZU_BENCH_T(fp_mul_UnitT, op.fp_mul_Unit, uz, ux, 12345678, op.p);
 	CYBOZU_BENCH_T(fp_mul_UnitPreT, op.fp_mul_UnitPre, ux, ux, 12345678);
-	CYBOZU_BENCH_T(fpDbl_addT, op.fpDbl_add, uz, ux, uy);
-	CYBOZU_BENCH_T(fpDbl_subT, op.fpDbl_sub, uz, uy, ux);
+	CYBOZU_BENCH_T(fpDbl_addT, op.fpDbl_add, uz, ux, uy, op.p);
+	CYBOZU_BENCH_T(fpDbl_subT, op.fpDbl_sub, uz, uy, ux, op.p);
 	CYBOZU_BENCH_T(fpDbl_sqrPreT, op.fpDbl_sqrPre, uz, ux);
 	CYBOZU_BENCH_T(fpDbl_mulPreT, op.fpDbl_mulPre, uz, ux, uy);
-	CYBOZU_BENCH_T(fpDbl_modT, op.fpDbl_mod, uz, ux);
+	CYBOZU_BENCH_T(fpDbl_modT, op.fpDbl_mod, uz, ux, op.p);
 	Fp2 f2x, f2y;
 	f2x.a = fx;
 	f2x.b = fy;
diff --git a/src/fp.cpp b/src/fp.cpp
index bcaa295..97648d2 100644
--- a/src/fp.cpp
+++ b/src/fp.cpp
@@ -107,6 +107,70 @@ Mode StrToMode(const std::string& s)
 	throw cybozu::Exception("StrToMode") << s;
 }
 
+#ifdef MCL_USE_LLVM
+
+#define MCL_DEF_LLVM_FUNC(bit) /* for N = bit / UnitBitSize, point f of every Ltag functor below at the generated symbol whose name ends in <bit>L */ \
+template<>const u3u AddNC<bit / mcl::fp::UnitBitSize, Ltag>::f = &mcl_fp_addNC ## bit ## L; \
+template<>const u3u SubNC<bit / mcl::fp::UnitBitSize, Ltag>::f = &mcl_fp_subNC ## bit ## L; \
+template<>const void3u MulPre<bit / mcl::fp::UnitBitSize, Ltag>::f = &mcl_fpDbl_mulPre ## bit ## L; \
+template<>const void2u SqrPre<bit / mcl::fp::UnitBitSize, Ltag>::f = &mcl_fpDbl_sqrPre ## bit ## L; \
+template<>const void2uI Mul_UnitPre<bit / mcl::fp::UnitBitSize, Ltag>::f = &mcl_fp_mul_UnitPre ## bit ## L; \
+template<>const void4u Add<bit / mcl::fp::UnitBitSize, Ltag>::f = &mcl_fp_add ## bit ## L; \
+template<>const void4u Sub<bit / mcl::fp::UnitBitSize, Ltag>::f = &mcl_fp_sub ## bit ## L; \
+template<>const void4u Mont<bit / mcl::fp::UnitBitSize, Ltag>::f = &mcl_fp_mont ## bit ## L; \
+template<>const void3u MontRed<bit / mcl::fp::UnitBitSize, Ltag>::f = &mcl_fp_montRed ## bit ## L; \
+template<>const void4u DblAdd<bit / mcl::fp::UnitBitSize, Ltag>::f = &mcl_fpDbl_add ## bit ## L; \
+template<>const void4u DblSub<bit / mcl::fp::UnitBitSize, Ltag>::f = &mcl_fpDbl_sub ## bit ## L; \
+
+template<size_t N> // Ltag modular multiply: z[N] <- (x[N] * y[N]) % p[N]
+struct Mul<N, Ltag> {
+	static inline void func(Unit *z, const Unit *x, const Unit *y, const Unit *p)
+	{
+		Unit xy[N * 2]; // scratch for the double-width product written by MulPre
+		MulPre<N, Ltag>::f(xy, x, y);
+		Dbl_Mod<N, Gtag>::f(z, xy, p); // the reduction is delegated to the Gtag Dbl_Mod, not to an Ltag routine
+	}
+	static const void4u f; // function-pointer handle to func, defined right after the struct
+};
+
+template<size_t N>
+const void4u Mul<N, Ltag>::f = Mul<N, Ltag>::func;
+
+template<size_t N> // Ltag modular square: y[N] <- (x[N] * x[N]) % p[N]
+struct Sqr<N, Ltag> {
+	static inline void func(Unit *y, const Unit *x, const Unit *p)
+	{
+		Unit xx[N * 2]; // scratch for the double-width square written by SqrPre
+		SqrPre<N, Ltag>::f(xx, x);
+		Dbl_Mod<N, Gtag>::f(y, xx, p); // the reduction is delegated to the Gtag Dbl_Mod, not to an Ltag routine
+	}
+	static const void3u f; // function-pointer handle to func, defined right after the struct
+};
+
+template<size_t N>
+const void3u Sqr<N, Ltag>::f = Sqr<N, Ltag>::func;
+
+MCL_DEF_LLVM_FUNC(64)
+MCL_DEF_LLVM_FUNC(128)
+MCL_DEF_LLVM_FUNC(192)
+MCL_DEF_LLVM_FUNC(256)
+MCL_DEF_LLVM_FUNC(320)
+MCL_DEF_LLVM_FUNC(384)
+MCL_DEF_LLVM_FUNC(448)
+MCL_DEF_LLVM_FUNC(512)
+#if CYBOZU_OS_BIT == 32
+MCL_DEF_LLVM_FUNC(160)
+MCL_DEF_LLVM_FUNC(224)
+MCL_DEF_LLVM_FUNC(288)
+MCL_DEF_LLVM_FUNC(352)
+MCL_DEF_LLVM_FUNC(416)
+MCL_DEF_LLVM_FUNC(480)
+MCL_DEF_LLVM_FUNC(544)
+#else
+MCL_DEF_LLVM_FUNC(576)
+#endif
+
+#endif
 
 template<size_t bitSize>
 struct OpeFunc {
@@ -136,43 +200,6 @@ struct OpeFunc {
 	{
 		copyArray(y, x, N);
 	}
-	static inline void fp_addC(Unit *z, const Unit *x, const Unit *y, const Unit *p)
-	{
-		if (AddPre<N, Gtag>::f(z, x, y)) {
-			SubPre<N, Gtag>::f(z, z, p);
-			return;
-		}
-		Unit tmp[N];
-		if (SubPre<N, Gtag>::f(tmp, z, p) == 0) {
-			memcpy(z, tmp, sizeof(tmp));
-		}
-	}
-	static inline void fp_subC(Unit *z, const Unit *x, const Unit *y, const Unit *p)
-	{
-		if (SubPre<N, Gtag>::f(z, x, y)) {
-			AddPre<N, Gtag>::f(z, z, p);
-		}
-	}
-	/*
-		z[N * 2] <- x[N * 2] + y[N * 2] mod p[N] << (N * UnitBitSize)
-	*/
-	static inline void fpDbl_addC(Unit *z, const Unit *x, const Unit *y, const Unit *p)
-	{
-		if (AddPre<N * 2, Gtag>::f(z, x, y)) {
-			SubPre<N, Gtag>::f(z + N, z + N, p);
-			return;
-		}
-		Unit tmp[N];
-		if (SubPre<N, Gtag>::f(tmp, z + N, p) == 0) {
-			memcpy(z + N, tmp, sizeof(tmp));
-		}
-	}
-	static inline void fpDbl_subC(Unit *z, const Unit *x, const Unit *y, const Unit *p)
-	{
-		if (SubPre<N * 2, Gtag>::f(z, x, y)) {
-			AddPre<N, Gtag>::f(z + N, z + N, p);
-		}
-	}
 	// z[N] <- mont(x[N], y[N])
 	static inline void fp_mulMontC(Unit *z, const Unit *x, const Unit *y, const Unit *p)
 	{
@@ -189,20 +216,20 @@ struct OpeFunc {
 		Unit t[N + 2];
 		Mul_UnitPre<N, Gtag>::f(t, p, q); // p * q
 		t[N + 1] = 0; // always zero
-		c[N + 1] = AddPre<N + 1, Gtag>::f(c, c, t);
+		c[N + 1] = AddNC<N + 1, Gtag>::f(c, c, t);
 		c++;
 		for (size_t i = 1; i < N; i++) {
 			Mul_UnitPre<N, Gtag>::f(t, x, y[i]);
-			c[N + 1] = AddPre<N + 1, Gtag>::f(c, c, t);
+			c[N + 1] = AddNC<N + 1, Gtag>::f(c, c, t);
 			q = c[0] * rp;
 			Mul_UnitPre<N, Gtag>::f(t, p, q);
-			AddPre<N + 2, Gtag>::f(c, c, t);
+			AddNC<N + 2, Gtag>::f(c, c, t);
 			c++;
 		}
 		if (c[N]) {
-			SubPre<N, Gtag>::f(z, c, p);
+			SubNC<N, Gtag>::f(z, c, p);
 		} else {
-			if (SubPre<N, Gtag>::f(z, c, p)) {
+			if (SubNC<N, Gtag>::f(z, c, p)) {
 				memcpy(z, c, N * sizeof(Unit));
 			}
 		}
@@ -221,7 +248,7 @@ struct OpeFunc {
 		Unit *c = buf;
 		Unit q = xy[0] * rp;
 		Mul_UnitPre<N, Gtag>::f(t, p, q);
-		buf[N * 2] = AddPre<N * 2, Gtag>::f(buf, xy, t);
+		buf[N * 2] = AddNC<N * 2, Gtag>::f(buf, xy, t);
 		c++;
 		for (size_t i = 1; i < N; i++) {
 			q = c[0] * rp;
@@ -231,9 +258,9 @@ struct OpeFunc {
 			c++;
 		}
 		if (c[N]) {
-			SubPre<N, Gtag>::f(z, c, p);
+			SubNC<N, Gtag>::f(z, c, p);
 		} else {
-			if (SubPre<N, Gtag>::f(z, c, p)) {
+			if (SubNC<N, Gtag>::f(z, c, p)) {
 				memcpy(z, c, N * sizeof(Unit));
 			}
 		}
@@ -289,39 +316,48 @@ struct OpeFunc {
 			if (x != y) fp_clearC(y);
 			return;
 		}
-		fp_subC(y, p, x, p);
+		SubNC<N, Gtag>::f(y, p, x);
 	}
 };
 
 #ifdef MCL_USE_LLVM
-	#define SET_OP_LLVM(bit) \
+	#define SET_OP_LLVM /* takes no argument: relies on n, mode and isFullBit being visible where it is expanded */ \
 		if (mode == FP_LLVM || mode == FP_LLVM_MONT) { \
-			fp_add = mcl_fp_add ## bit ## L; \
-			fp_sub = mcl_fp_sub ## bit ## L; \
-			if (!isFullBit) { \
-				fp_addNC = mcl_fp_addNC ## bit ## L; \
-				fp_subNC = mcl_fp_subNC ## bit ## L; \
-			} \
-			fpDbl_mulPre = mcl_fpDbl_mulPre ## bit ## L; \
-			fp_mul_UnitPre = mcl_fp_mul_UnitPre ## bit ## L; \
-			fpDbl_sqrPre = mcl_fpDbl_sqrPre ## bit ## L; \
+			fp_add = Add<n, Ltag>::f; \
+			fp_sub = Sub<n, Ltag>::f; \
+			fpDbl_add = DblAdd<n, Ltag>::f; \
+			fpDbl_sub = DblSub<n, Ltag>::f; \
 			if (mode == FP_LLVM_MONT) { \
-				fpDbl_mod = mcl_fp_montRed ## bit ## L; \
-				fp_mul = mcl_fp_mont ## bit ## L; \
+				fp_mul = Mont<n, Ltag>::f; \
+				fp_sqr = SqrMont<n, Ltag>::f; \
+				fpDbl_mod = MontRed<n, Ltag>::f; \
+			} else { \
+				fp_mul = Mul<n, Ltag>::f; \
+				fp_sqr = Sqr<n, Ltag>::f; \
+			} \
+			fpDbl_mulPre = MulPre<n, Ltag>::f; \
+			fpDbl_sqrPre = SqrPre<n, Ltag>::f; \
+			fp_mul_UnitPre = Mul_UnitPre<n, Ltag>::f; \
+			if (!isFullBit) { \
+				fp_addNC = AddNC<n, Ltag>::f; \
+				fp_subNC = SubNC<n, Ltag>::f; \
 			} \
 		}
-	#define SET_OP_DBL_LLVM(bit, n2) \
+
+#define SET_OP_LLVM2(bit) /* selects the Ltag addNC/subNC for the double-width (n * 2 unit) operands, only when !isFullBit */ \
+	{ \
+		const int n = bit / UnitBitSize; \
 		if (mode == FP_LLVM || mode == FP_LLVM_MONT) { \
-			fpDbl_add = mcl_fpDbl_add ## bit ## L; \
-			fpDbl_sub = mcl_fpDbl_sub ## bit ## L; \
 			if (!isFullBit) { \
-				fpDbl_addNC = mcl_fp_addNC ## n2 ## L; \
-				fpDbl_subNC = mcl_fp_subNC ## n2 ## L; \
+				fpDbl_addNC = AddNC<n * 2, Ltag>::f; \
+				fpDbl_subNC = SubNC<n * 2, Ltag>::f; \
 			} \
-		}
+		} \
+	}
+
 #else
-	#define SET_OP_LLVM(bit)
-	#define SET_OP_DBL_LLVM(bit, n2)
+	#define SET_OP_LLVM
+	#define SET_OP_LLVM2(bit)
 #endif
 
 #define SET_OP(bit) \
@@ -332,8 +368,8 @@ struct OpeFunc {
 		fp_clear = OpeFunc<bit>::fp_clearC; \
 		fp_copy = OpeFunc<bit>::fp_copyC; \
 		fp_neg = OpeFunc<bit>::fp_negC; \
-		fp_add = OpeFunc<bit>::fp_addC; \
-		fp_sub = OpeFunc<bit>::fp_subC; \
+		fp_add = Add<n, Gtag>::f; \
+		fp_sub = Sub<n, Gtag>::f; \
 		if (isMont) { \
 			fp_mul = OpeFunc<bit>::fp_mulMontC; \
 			fp_sqr = OpeFunc<bit>::fp_sqrMontC; \
@@ -350,15 +386,15 @@ struct OpeFunc {
 		fpDbl_sqrPre = SqrPre<n, Gtag>::f; \
 		fp_mul_UnitPre = Mul_UnitPre<n, Gtag>::f; \
 		fpN1_mod = N1_Mod<n, Gtag>::f; \
-		fpDbl_add = OpeFunc<bit>::fpDbl_addC; \
-		fpDbl_sub = OpeFunc<bit>::fpDbl_subC; \
+		fpDbl_add = DblAdd<n, Gtag>::f; \
+		fpDbl_sub = DblSub<n, Gtag>::f; \
 		if (!isFullBit) { \
-			fp_addNC = AddPre<n, Gtag>::f; \
-			fp_subNC = SubPre<n, Gtag>::f; \
-			fpDbl_addNC = AddPre<n * 2, Gtag>::f; \
-			fpDbl_subNC = SubPre<n * 2, Gtag>::f; \
+			fp_addNC = AddNC<n, Gtag>::f; \
+			fp_subNC = SubNC<n, Gtag>::f; \
+			fpDbl_addNC = AddNC<n * 2, Gtag>::f; \
+			fpDbl_subNC = SubNC<n * 2, Gtag>::f; \
 		} \
-		SET_OP_LLVM(bit) \
+		SET_OP_LLVM \
 	}
 
 #ifdef MCL_USE_XBYAK
@@ -476,41 +512,26 @@ void Op::init(const std::string& mstr, size_t maxBitSize, Mode mode)
 	}
 #endif
 	switch (roundBit) {
-	case 64: SET_OP(64); SET_OP_DBL_LLVM(64, 128); break;
-	case 128: SET_OP(128); SET_OP_DBL_LLVM(128, 256); break;
-	case 192: SET_OP(192); SET_OP_DBL_LLVM(192, 384); break;
-	case 256: SET_OP(256); SET_OP_DBL_LLVM(256, 512); break;
+	case 64:  SET_OP(64);  SET_OP_LLVM2(64);  break;
+	case 128: SET_OP(128); SET_OP_LLVM2(128); break;
+	case 192: SET_OP(192); SET_OP_LLVM2(192); break;
+	case 256: SET_OP(256); SET_OP_LLVM2(256); break;
 	case 320: SET_OP(320); break;
 	case 384: SET_OP(384); break;
 	case 448: SET_OP(448); break;
-	case 512: SET_OP(512);
-		// QQQ : need refactor for large prime
-#if MCL_MAX_OP_BIT_SIZE == 768
-		SET_OP_DBL_LLVM(512, 1024);
-#endif
-		break;
+	case 512: SET_OP(512); break;
 #if CYBOZU_OS_BIT == 64
-	case 576: SET_OP(576);
-#if MCL_MAX_OP_BIT_SIZE == 768
-		SET_OP_DBL_LLVM(576, 1152);
-#endif
-		break;
+	case 576: SET_OP(576); break;
 #if MCL_MAX_OP_BIT_SIZE == 768
-	case 640: SET_OP(640);
-		SET_OP_DBL_LLVM(640, 1280);
-		break;
-	case 704: SET_OP(704);
-		SET_OP_DBL_LLVM(704, 1408);
-		break;
-	case 768: SET_OP(768);
-		SET_OP_DBL_LLVM(768, 1536);
-		break;
+	case 640: SET_OP(640); break;
+	case 704: SET_OP(704); break;
+	case 768: SET_OP(768); break;
 #endif
 #else
-	case 32: SET_OP(32); SET_OP_DBL_LLVM(32, 64); break;
-	case 96: SET_OP(96); SET_OP_DBL_LLVM(96, 192); break;
-	case 160: SET_OP(160); SET_OP_DBL_LLVM(160, 320); break;
-	case 224: SET_OP(224); SET_OP_DBL_LLVM(224, 448); break;
+	case 32:  SET_OP(32);  SET_OP_LLVM2(32);  break;
+	case 96:  SET_OP(96);  SET_OP_LLVM2(96);  break;
+	case 160: SET_OP(160); SET_OP_LLVM2(160); break;
+	case 224: SET_OP(224); SET_OP_LLVM2(224); break;
 	case 288: SET_OP(288); break;
 	case 352: SET_OP(352); break;
 	case 416: SET_OP(416); break;
diff --git a/src/fp_proto.hpp b/src/fp_proto.hpp
index 99763e6..a30730b 100644
--- a/src/fp_proto.hpp
+++ b/src/fp_proto.hpp
@@ -10,32 +10,124 @@
 
 namespace mcl { namespace fp {
 
+struct Ltag;
+struct Atag;
+
 // (carry, z[N]) <- x[N] + y[N]
-template<size_t N, class Tag>class AddPre { static const u3u f; };
+template<size_t N, class Tag>struct AddNC { static const u3u f; };
 // (carry, z[N]) <- x[N] - y[N]
-template<size_t N, class Tag>class SubPre { static const u3u f; };
+template<size_t N, class Tag>struct SubNC { static const u3u f; };
 // z[N * 2] <- x[N] * y[N]
-template<size_t N, class Tag>class MulPre { static const void3u f; };
+template<size_t N, class Tag>struct MulPre { static const void3u f; };
 // z[N * 2] <- x[N] * x[N]
-template<size_t N, class Tag>class SqrPre { static const void2u f; };
+template<size_t N, class Tag>struct SqrPre { static const void2u f; };
 // z[N + 1] <- x[N] * y
-template<size_t N, class Tag>class Mul_UnitPre { static const void2uI f; };
+template<size_t N, class Tag>struct Mul_UnitPre { static const void2uI f; };
 // z[N] <- x[N + 1] % p[N]
-template<size_t N, class Tag>class N1_Mod { static const void3u f; };
+template<size_t N, class Tag>struct N1_Mod { static const void3u f; };
 // z[N] <- x[N * 2] % p[N]
-template<size_t N, class Tag>class Dbl_Mod { static const void3u f; };
+template<size_t N, class Tag>struct Dbl_Mod { static const void3u f; };
+// z[N] <- Montgomery(x[N], y[N], p[N])
+template<size_t N, class Tag>struct Mont { static const void4u f; };
+// z[N] <- MontRed(xy[N * 2], p[N])
+template<size_t N, class Tag>struct MontRed { static const void3u f; };
+
+// z[N] <- (x[N] * y[N]) % p[N]
+template<size_t N, class Tag>struct Mul { static const void4u f; };
+// z[N] <- (x[N] ^ 2) % p[N]
+template<size_t N, class Tag>struct Sqr { static const void3u f; };
+
+// y[N] <- Montgomery(x[N], x[N], p[N]) : squaring done by passing x as both operands of Mont<N, Tag>
+template<size_t N, class Tag>
+struct SqrMont {
+	static inline void func(Unit *y, const Unit *x, const Unit *p)
+	{
+		Mont<N, Tag>::f(y, x, x, p);
+	}
+	static const void3u f; // function-pointer handle to func, defined right after the struct
+};
+template<size_t N, class Tag>
+const void3u SqrMont<N, Tag>::f = SqrMont<N, Tag>::func;
+
+// z[N] <- (x[N] + y[N]) % p[N], built from the Tag's AddNC/SubNC with at most one subtraction of p
+template<size_t N, class Tag>
+struct Add {
+	static inline void func(Unit *z, const Unit *x, const Unit *y, const Unit *p)
+	{
+		if (AddNC<N, Tag>::f(z, x, y)) { // nonzero return: the N-unit sum produced a carry
+			SubNC<N, Tag>::f(z, z, p); // carry case: subtract p in place and finish
+			return;
+		}
+		Unit tmp[N]; // trial value z - p, so z stays intact if the subtraction borrows
+		if (SubNC<N, Tag>::f(tmp, z, p) == 0) { // zero return: no borrow, i.e. z >= p
+			memcpy(z, tmp, sizeof(tmp)); // keep the reduced value
+		}
+	}
+	static const void4u f; // function-pointer handle to func, defined right after the struct
+};
+
+template<size_t N, class Tag>
+const void4u Add<N, Tag>::f = Add<N, Tag>::func;
+
+// z[N] <- (x[N] - y[N]) % p[N], built from the Tag's SubNC/AddNC
+template<size_t N, class Tag>
+struct Sub {
+	static inline void func(Unit *z, const Unit *x, const Unit *y, const Unit *p)
+	{
+		if (SubNC<N, Tag>::f(z, x, y)) { // nonzero return: the subtraction borrowed (x < y)
+			AddNC<N, Tag>::f(z, z, p); // add p back once; the carry returned here is ignored
+		}
+	}
+	static const void4u f; // function-pointer handle to func, defined right after the struct
+};
+
+template<size_t N, class Tag>
+const void4u Sub<N, Tag>::f = Sub<N, Tag>::func;
+
+//	z[N * 2] <- (x[N * 2] + y[N * 2]) mod p[N] << (N * UnitBitSize) : only the upper N units are corrected by p
+template<size_t N, class Tag>
+struct DblAdd {
+	static inline void func(Unit *z, const Unit *x, const Unit *y, const Unit *p)
+	{
+		if (AddNC<N * 2, Tag>::f(z, x, y)) { // nonzero return: the 2N-unit sum produced a carry
+			SubNC<N, Tag>::f(z + N, z + N, p); // carry case: subtract p from the upper half in place and finish
+			return;
+		}
+		Unit tmp[N]; // trial value (upper half - p), so z stays intact if the subtraction borrows
+		if (SubNC<N, Tag>::f(tmp, z + N, p) == 0) { // zero return: no borrow, i.e. upper half >= p
+			memcpy(z + N, tmp, sizeof(tmp)); // keep the reduced upper half
+		}
+	}
+	static const void4u f; // function-pointer handle to func, defined right after the struct
+};
+
+template<size_t N, class Tag>
+const void4u DblAdd<N, Tag>::f = DblAdd<N, Tag>::func;
+
+//	z[N * 2] <- (x[N * 2] - y[N * 2]) mod p[N] << (N * UnitBitSize) : only the upper N units are corrected by p
+template<size_t N, class Tag>
+struct DblSub {
+	static inline void func(Unit *z, const Unit *x, const Unit *y, const Unit *p)
+	{
+		if (SubNC<N * 2, Tag>::f(z, x, y)) { // nonzero return: the 2N-unit subtraction borrowed
+			AddNC<N, Tag>::f(z + N, z + N, p); // add p back to the upper half; the carry returned here is ignored
+		}
+	}
+	static const void4u f; // function-pointer handle to func, defined right after the struct
+};
+
+template<size_t N, class Tag>
+const void4u DblSub<N, Tag>::f = DblSub<N, Tag>::func;
 
 } } // mcl::fp
 
 #ifdef MCL_USE_LLVM
 
-extern "C" {
-
 #define MCL_FP_DEF_FUNC_SUB(len, suf) \
 void mcl_fp_add ## len ## suf(mcl::fp::Unit* z, const mcl::fp::Unit* x, const mcl::fp::Unit* y, const mcl::fp::Unit* p); \
 void mcl_fp_sub ## len ## suf(mcl::fp::Unit* z, const mcl::fp::Unit* x, const mcl::fp::Unit* y, const mcl::fp::Unit* p); \
-void mcl_fp_addNC ## len ## suf(mcl::fp::Unit* z, const mcl::fp::Unit* x, const mcl::fp::Unit* y); \
-void mcl_fp_subNC ## len ## suf(mcl::fp::Unit* z, const mcl::fp::Unit* x, const mcl::fp::Unit* y); \
+mcl::fp::Unit mcl_fp_addNC ## len ## suf(mcl::fp::Unit* z, const mcl::fp::Unit* x, const mcl::fp::Unit* y); \
+mcl::fp::Unit mcl_fp_subNC ## len ## suf(mcl::fp::Unit* z, const mcl::fp::Unit* x, const mcl::fp::Unit* y); \
 void mcl_fp_mul_UnitPre ## len ## suf(mcl::fp::Unit* z, const mcl::fp::Unit* x, mcl::fp::Unit y); \
 void mcl_fpDbl_mulPre ## len ## suf(mcl::fp::Unit* z, const mcl::fp::Unit* x, const mcl::fp::Unit* y); \
 void mcl_fpDbl_sqrPre ## len ## suf(mcl::fp::Unit* y, const mcl::fp::Unit* x); \
@@ -45,15 +137,16 @@ void mcl_fpDbl_add ## len ## suf(mcl::fp::Unit* z, const mcl::fp::Unit* x, const
 void mcl_fpDbl_sub ## len ## suf(mcl::fp::Unit* z, const mcl::fp::Unit* x, const mcl::fp::Unit* y, const mcl::fp::Unit* p);
 
 #define MCL_FP_DEF_FUNC(len) \
-	MCL_FP_DEF_FUNC_SUB(len, G) \
 	MCL_FP_DEF_FUNC_SUB(len, L) \
 	MCL_FP_DEF_FUNC_SUB(len, A)
 
 #define MCL_FP_DEF_FUNC_SPECIAL(suf) \
-	void mcl_fpDbl_mod_NIST_P192 ## suf(mcl::fp::Unit* z, const mcl::fp::Unit* xy, const mcl::fp::Unit* /* dummy */); \
-	void mcl_fp_mul_NIST_P192 ## suf(mcl::fp::Unit* z, const mcl::fp::Unit* x, const mcl::fp::Unit* y, const mcl::fp::Unit* /* dummy */); \
-	void mcl_fp_sqr_NIST_P192 ## suf(mcl::fp::Unit* y, const mcl::fp::Unit* x, const mcl::fp::Unit* /* dummy */); \
-	void mcl_fpDbl_mod_NIST_P521 ## suf(mcl::fp::Unit* z, const mcl::fp::Unit* xy, const mcl::fp::Unit* /* dummy */);
+void mcl_fpDbl_mod_NIST_P192 ## suf(mcl::fp::Unit* z, const mcl::fp::Unit* xy, const mcl::fp::Unit* /* dummy */); \
+void mcl_fp_mul_NIST_P192 ## suf(mcl::fp::Unit* z, const mcl::fp::Unit* x, const mcl::fp::Unit* y, const mcl::fp::Unit* /* dummy */); \
+void mcl_fp_sqr_NIST_P192 ## suf(mcl::fp::Unit* y, const mcl::fp::Unit* x, const mcl::fp::Unit* /* dummy */); \
+void mcl_fpDbl_mod_NIST_P521 ## suf(mcl::fp::Unit* z, const mcl::fp::Unit* xy, const mcl::fp::Unit* /* dummy */);
+
+extern "C" {
 
 MCL_FP_DEF_FUNC(64)
 MCL_FP_DEF_FUNC(128)
@@ -83,14 +176,13 @@ MCL_FP_DEF_FUNC(1408)
 MCL_FP_DEF_FUNC(1536)
 #endif
 
-MCL_FP_DEF_FUNC_SPECIAL(G)
 MCL_FP_DEF_FUNC_SPECIAL(L)
 MCL_FP_DEF_FUNC_SPECIAL(A)
 
+}
+
 #undef MCL_FP_DEF_FUNC_SUB
 #undef MCL_FP_DEF_FUNC
 
-}
-
 #endif
 
diff --git a/src/gen.cpp b/src/gen.cpp
index b7d9f9f..155a5b6 100644
--- a/src/gen.cpp
+++ b/src/gen.cpp
@@ -296,57 +296,65 @@ struct Code : public mcl::Generator {
 	void gen_mcl_fp_addsubNC(bool isAdd)
 	{
 		resetGlobalIdx();
+		Operand r(Int, unit); // return value of the generated function: one unit-wide integer
 		Operand pz(IntPtr, bit);
 		Operand px(IntPtr, bit);
 		Operand py(IntPtr, bit);
 		std::string name;
 		if (isAdd) {
 			name = "mcl_fp_addNC" + cybozu::itoa(bit) + "L";
-			mcl_fp_addNCM[bit] = Function(name, Void, pz, px, py);
+			mcl_fp_addNCM[bit] = Function(name, r, pz, px, py);
 			verifyAndSetPrivate(mcl_fp_addNCM[bit]);
 			beginFunc(mcl_fp_addNCM[bit]);
 		} else {
 			name = "mcl_fp_subNC" + cybozu::itoa(bit) + "L";
-			mcl_fp_subNCM[bit] = Function(name, Void, pz, px, py);
+			mcl_fp_subNCM[bit] = Function(name, r, pz, px, py);
 			verifyAndSetPrivate(mcl_fp_subNCM[bit]);
 			beginFunc(mcl_fp_subNCM[bit]);
 		}
-		Operand x = load(px);
-		Operand y = load(py);
+		Operand x = zext(load(px), bit + unit); // widen by one unit so the carry/borrow is not lost
+		Operand y = zext(load(py), bit + unit);
 		Operand z;
 		if (isAdd) {
 			z = add(x, y);
+			store(trunc(z, bit), pz); // the low 'bit' bits are the result written through pz
+			r = trunc(lshr(z, bit), unit); // the bits above position 'bit' are returned as the carry
 		} else {
 			z = sub(x, y);
+			store(trunc(z, bit), pz); // the low 'bit' bits are the result written through pz
+			r = _and(trunc(lshr(z, bit), unit), makeImm(unit, 1)); // a wrapped difference sets every upper bit, so mask down to a 0/1 borrow
 		}
-		store(z, pz);
-		ret(Void);
+		ret(r);
 		endFunc();
 	}
-#if 0
-	void gen_mcl_fp_addS()
+#if 0 // void-return version
+	void gen_mcl_fp_addsubNC(bool isAdd)
 	{
 		resetGlobalIdx();
 		Operand pz(IntPtr, bit);
 		Operand px(IntPtr, bit);
 		Operand py(IntPtr, bit);
-		Operand pp(IntPtr, bit);
-		std::string name = "mcl_fp_add" + cybozu::itoa(bit) + "S";
-		mcl_fp_addM[bit] = Function(name, Void, pz, px, py, pp);
-		beginFunc(mcl_fp_addM[bit]);
+		std::string name;
+		if (isAdd) {
+			name = "mcl_fp_addNC" + cybozu::itoa(bit) + "L";
+			mcl_fp_addNCM[bit] = Function(name, Void, pz, px, py);
+			verifyAndSetPrivate(mcl_fp_addNCM[bit]);
+			beginFunc(mcl_fp_addNCM[bit]);
+		} else {
+			name = "mcl_fp_subNC" + cybozu::itoa(bit) + "L";
+			mcl_fp_subNCM[bit] = Function(name, Void, pz, px, py);
+			verifyAndSetPrivate(mcl_fp_subNCM[bit]);
+			beginFunc(mcl_fp_subNCM[bit]);
+		}
 		Operand x = load(px);
 		Operand y = load(py);
-		Operand p = load(pp);
-		x = zext(x, bit + unit);
-		y = zext(y, bit + unit);
-		p = zext(p, bit + unit);
-		Operand t0 = add(x, y);
-		Operand t1 = sub(t0, p);
-		Operand t = lshr(t1, bit);
-		t = trunc(t, 1);
-		t = select(t, t0, t1);
-		t = trunc(t, bit);
-		store(t, pz);
+		Operand z;
+		if (isAdd) {
+			z = add(x, y);
+		} else {
+			z = sub(x, y);
+		}
+		store(z, pz);
 		ret(Void);
 		endFunc();
 	}
@@ -385,33 +393,6 @@ struct Code : public mcl::Generator {
 		ret(Void);
 		endFunc();
 	}
-#if 0
-	void gen_mcl_fp_subS()
-	{
-		resetGlobalIdx();
-		Operand pz(IntPtr, bit);
-		Operand px(IntPtr, bit);
-		Operand py(IntPtr, bit);
-		Operand pp(IntPtr, bit);
-		std::string name = "mcl_fp_sub" + cybozu::itoa(bit) + "S";
-		mcl_fp_subM[bit] = Function(name, Void, pz, px, py, pp);
-		beginFunc(mcl_fp_subM[bit]);
-		Operand x = load(px);
-		Operand y = load(py);
-		x = zext(x, bit + unit);
-		y = zext(y, bit + unit);
-		Operand vc = sub(x, y);
-		Operand v = trunc(vc, bit); // v = x - y
-		Operand c = lshr(vc, bit);
-		c = trunc(c, 1);
-		Operand p = load(pp);
-		Operand z = select(c, p, makeImm(bit, 0));
-		v = add(v, z);
-		store(v, pz);
-		ret(Void);
-		endFunc();
-	}
-#endif
 	void gen_mcl_fp_sub()
 	{
 		resetGlobalIdx();
diff --git a/src/low_gmp.hpp b/src/low_gmp.hpp
index 44477d5..d11a30f 100644
--- a/src/low_gmp.hpp
+++ b/src/low_gmp.hpp
@@ -7,7 +7,7 @@ namespace mcl { namespace fp {
 struct Gtag;
 
 template<size_t N>
-struct AddPre<N, Gtag> {
+struct AddNC<N, Gtag> {
 	static inline Unit func(Unit *z, const Unit *x, const Unit *y)
 	{
 		return mpn_add_n((mp_limb_t*)z, (const mp_limb_t*)x, (const mp_limb_t*)y, N);
@@ -16,10 +16,10 @@ struct AddPre<N, Gtag> {
 };
 
 template<size_t N>
-const u3u AddPre<N, Gtag>::f = &AddPre<N, Gtag>::func;
+const u3u AddNC<N, Gtag>::f = &AddNC<N, Gtag>::func;
 
 template<size_t N>
-struct SubPre<N, Gtag> {
+struct SubNC<N, Gtag> {
 	static inline Unit func(Unit *z, const Unit *x, const Unit *y)
 	{
 		return mpn_sub_n((mp_limb_t*)z, (const mp_limb_t*)x, (const mp_limb_t*)y, N);
@@ -28,7 +28,7 @@ struct SubPre<N, Gtag> {
 };
 
 template<size_t N>
-const u3u SubPre<N, Gtag>::f = &SubPre<N, Gtag>::func;
+const u3u SubNC<N, Gtag>::f = &SubNC<N, Gtag>::func;
 
 template<size_t N>
 struct MulPre<N, Gtag> {