From 7806835e9c03543e88f4f7a0f52b8761ff48e241 Mon Sep 17 00:00:00 2001
From: MITSUNARI Shigeo <herumi@nifty.com>
Date: Fri, 7 May 2021 17:27:20 +0900
Subject: [PATCH 01/23] [pedantic] avoid undefined behavior of abs

---
 include/mcl/fp.hpp   |  2 +-
 include/mcl/util.hpp | 17 +++++++++++++++--
 2 files changed, 16 insertions(+), 3 deletions(-)
diff --git a/include/mcl/fp.hpp b/include/mcl/fp.hpp
index 706d9fa..f41d4f8 100644
--- a/include/mcl/fp.hpp
+++ b/include/mcl/fp.hpp
@@ -227,7 +227,7 @@ public:
 		} else {
 			clear();
 			if (x) {
-				int64_t y = x < 0 ? -x : x;
+				uint64_t y = fp::abs_(x);
 				if (sizeof(Unit) == 8) {
 					v_[0] = y;
 				} else {
diff --git a/include/mcl/util.hpp b/include/mcl/util.hpp
index 8915c88..b35801e 100644
--- a/include/mcl/util.hpp
+++ b/include/mcl/util.hpp
@@ -17,8 +17,21 @@
 namespace mcl { namespace fp {
 
 // some environments do not have utility
-template<class T>
-T abs_(T x) { return x < 0 ? -x : x; }
+inline uint32_t abs_(int32_t x)
+{
+	if (x >= 0) return uint32_t(x);
+	// -x overflows int32_t when x is the minimum value (undefined behavior), so return 2^31 directly
+	if (x == -2147483647 - 1) return 2147483648u;
+	return uint32_t(-x);
+}
+
+inline uint64_t abs_(int64_t x)
+{
+	if (x >= 0) return uint64_t(x);
+	// -x overflows int64_t when x is the minimum value (undefined behavior), so return 2^63 directly
+	if (x == -9223372036854775807ll - 1) return 9223372036854775808ull;
+	return uint64_t(-x);
+}
 
 template<class T>
 T min_(T x, T y) { return x < y ? x : y; }

From 093e916151f8728071d97fb91c4e87c99a6b6c5e Mon Sep 17 00:00:00 2001
From: MITSUNARI Shigeo <herumi@nifty.com>
Date: Fri, 7 May 2021 17:39:20 +0900
Subject: [PATCH 02/23] fix for (bitlen(p) + 1) % 64 = 0

---
 include/mcl/gmp_util.hpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/mcl/gmp_util.hpp b/include/mcl/gmp_util.hpp
index e444993..4111c37 100644
--- a/include/mcl/gmp_util.hpp
+++ b/include/mcl/gmp_util.hpp
@@ -960,6 +960,7 @@ struct SmallModp {
 	}
 	uint32_t getTop(const Unit *x) const
 	{
+		if (shiftR_ == 0) return x[N_ - 1]; // nothing to shift: x[N_ - 1] is the result and x[N_] is not read
 		return (x[N_ - 1] >> shiftR_) | (x[N_] << shiftL_);
 	}
 	uint32_t cvtInt(const mpz_class& x) const

From 1653541c0a9e484083d2025ced19b1120d637ec3 Mon Sep 17 00:00:00 2001
From: MITSUNARI Shigeo <herumi@nifty.com>
Date: Sun, 9 May 2021 17:57:35 +0900
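Subject: [PATCH 03/23] fix init bool

Set *pb = true once the g/g2/g3 tables have been computed, so that the
success path writes the result flag explicitly.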

---
 include/mcl/fp_tower.hpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/mcl/fp_tower.hpp b/include/mcl/fp_tower.hpp
index 451436b..93d5654 100644
--- a/include/mcl/fp_tower.hpp
+++ b/include/mcl/fp_tower.hpp
@@ -442,6 +442,7 @@ public:
 			Fp2T::mul(g2[i], t, g[i]);
 			g3[i] = g[i] * g2[i];
 		}
+		*pb = true;
 	}
 #ifndef CYBOZU_DONT_USE_EXCEPTION
 	static void init()

From 5dcd5c53b7e496ca8eac4ccc5e154941d8146592 Mon Sep 17 00:00:00 2001
From: MITSUNARI Shigeo <herumi@nifty.com>
Date: Sun, 9 May 2021 18:34:10 +0900
Subject: [PATCH 04/23] avoid cast of fpDbl_add

---
 include/mcl/fp_tower.hpp | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/include/mcl/fp_tower.hpp b/include/mcl/fp_tower.hpp
index 93d5654..5b039a1 100644
--- a/include/mcl/fp_tower.hpp
+++ b/include/mcl/fp_tower.hpp
@@ -113,19 +113,25 @@ public:
 	{
 		gmp::setArray(pb, x, v_, Fp::op_.N * 2);
 	}
+	static inline void add(FpDblT& z, const FpDblT& x, const FpDblT& y)
+	{
+#ifdef MCL_XBYAK_DIRECT_CALL
+		Fp::op_.fpDbl_addA_(z.v_, x.v_, y.v_);
+#else
+		Fp::op_.fpDbl_add(z.v_, x.v_, y.v_, Fp::op_.p);
+#endif
+	}
 #ifdef MCL_XBYAK_DIRECT_CALL
-	static void (*add)(FpDblT& z, const FpDblT& x, const FpDblT& y);
+	static void addA(Unit *z, const Unit *x, const Unit *y) { Fp::op_.fpDbl_add(z, x, y, Fp::op_.p); }
 	static void (*sub)(FpDblT& z, const FpDblT& x, const FpDblT& y);
 	static void (*mod)(Fp& z, const FpDblT& xy);
 	static void (*addPre)(FpDblT& z, const FpDblT& x, const FpDblT& y);
 	static void (*subPre)(FpDblT& z, const FpDblT& x, const FpDblT& y);
-	static void addC(FpDblT& z, const FpDblT& x, const FpDblT& y) { Fp::op_.fpDbl_add(z.v_, x.v_, y.v_, Fp::op_.p); }
 	static void subC(FpDblT& z, const FpDblT& x, const FpDblT& y) { Fp::op_.fpDbl_sub(z.v_, x.v_, y.v_, Fp::op_.p); }
 	static void modC(Fp& z, const FpDblT& xy) { Fp::op_.fpDbl_mod(z.v_, xy.v_, Fp::op_.p); }
 	static void addPreC(FpDblT& z, const FpDblT& x, const FpDblT& y) { Fp::op_.fpDbl_addPre(z.v_, x.v_, y.v_); }
 	static void subPreC(FpDblT& z, const FpDblT& x, const FpDblT& y) { Fp::op_.fpDbl_subPre(z.v_, x.v_, y.v_); }
 #else
-	static void add(FpDblT& z, const FpDblT& x, const FpDblT& y) { Fp::op_.fpDbl_add(z.v_, x.v_, y.v_, Fp::op_.p); }
 	static void sub(FpDblT& z, const FpDblT& x, const FpDblT& y) { Fp::op_.fpDbl_sub(z.v_, x.v_, y.v_, Fp::op_.p); }
 	static void mod(Fp& z, const FpDblT& xy) { Fp::op_.fpDbl_mod(z.v_, xy.v_, Fp::op_.p); }
 	static void addPre(FpDblT& z, const FpDblT& x, const FpDblT& y) { Fp::op_.fpDbl_addPre(z.v_, x.v_, y.v_); }
@@ -152,9 +158,10 @@ public:
 	static void init()
 	{
 #ifdef MCL_XBYAK_DIRECT_CALL
-		const mcl::fp::Op& op = Fp::getOp();
-		add = fp::func_ptr_cast<void (*)(FpDblT&, const FpDblT&, const FpDblT&)>(op.fpDbl_addA_);
-		if (add == 0) add = addC;
+		mcl::fp::Op& op = Fp::getOpNonConst();
+		if (op.fpDbl_addA_ == 0) {
+			op.fpDbl_addA_ = addA;
+		}
 		sub = fp::func_ptr_cast<void (*)(FpDblT&, const FpDblT&, const FpDblT&)>(op.fpDbl_subA_);
 		if (sub == 0) sub = subC;
 		mod = fp::func_ptr_cast<void (*)(Fp&, const FpDblT&)>(op.fpDbl_modA_);
@@ -170,7 +177,6 @@ public:
 };
 
 #ifdef MCL_XBYAK_DIRECT_CALL
-template<class Fp> void (*FpDblT<Fp>::add)(FpDblT&, const FpDblT&, const FpDblT&);
 template<class Fp> void (*FpDblT<Fp>::sub)(FpDblT&, const FpDblT&, const FpDblT&);
 template<class Fp> void (*FpDblT<Fp>::mod)(Fp&, const FpDblT&);
 template<class Fp> void (*FpDblT<Fp>::addPre)(FpDblT&, const FpDblT&, const FpDblT&);

From 52a83ac5267f48d1dc2b9d15eb2fb675dbc40239 Mon Sep 17 00:00:00 2001
From: MITSUNARI Shigeo <herumi@nifty.com>
Date: Mon, 10 May 2021 09:25:43 +0900
Subject: [PATCH 05/23] avoid cast of fpDbl_sub

---
 include/mcl/fp_tower.hpp | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/include/mcl/fp_tower.hpp b/include/mcl/fp_tower.hpp
index 5b039a1..61ce62b 100644
--- a/include/mcl/fp_tower.hpp
+++ b/include/mcl/fp_tower.hpp
@@ -119,20 +119,26 @@ public:
 		Fp::op_.fpDbl_addA_(z.v_, x.v_, y.v_);
 #else
 		Fp::op_.fpDbl_add(z.v_, x.v_, y.v_, Fp::op_.p);
+#endif
+	}
+	static inline void sub(FpDblT& z, const FpDblT& x, const FpDblT& y)
+	{
+#ifdef MCL_XBYAK_DIRECT_CALL
+		Fp::op_.fpDbl_subA_(z.v_, x.v_, y.v_);
+#else
+		Fp::op_.fpDbl_sub(z.v_, x.v_, y.v_, Fp::op_.p);
 #endif
 	}
 #ifdef MCL_XBYAK_DIRECT_CALL
 	static void addA(Unit *z, const Unit *x, const Unit *y) { Fp::op_.fpDbl_add(z, x, y, Fp::op_.p); }
-	static void (*sub)(FpDblT& z, const FpDblT& x, const FpDblT& y);
+	static void subA(Unit *z, const Unit *x, const Unit *y) { Fp::op_.fpDbl_sub(z, x, y, Fp::op_.p); }
 	static void (*mod)(Fp& z, const FpDblT& xy);
 	static void (*addPre)(FpDblT& z, const FpDblT& x, const FpDblT& y);
 	static void (*subPre)(FpDblT& z, const FpDblT& x, const FpDblT& y);
-	static void subC(FpDblT& z, const FpDblT& x, const FpDblT& y) { Fp::op_.fpDbl_sub(z.v_, x.v_, y.v_, Fp::op_.p); }
 	static void modC(Fp& z, const FpDblT& xy) { Fp::op_.fpDbl_mod(z.v_, xy.v_, Fp::op_.p); }
 	static void addPreC(FpDblT& z, const FpDblT& x, const FpDblT& y) { Fp::op_.fpDbl_addPre(z.v_, x.v_, y.v_); }
 	static void subPreC(FpDblT& z, const FpDblT& x, const FpDblT& y) { Fp::op_.fpDbl_subPre(z.v_, x.v_, y.v_); }
 #else
-	static void sub(FpDblT& z, const FpDblT& x, const FpDblT& y) { Fp::op_.fpDbl_sub(z.v_, x.v_, y.v_, Fp::op_.p); }
 	static void mod(Fp& z, const FpDblT& xy) { Fp::op_.fpDbl_mod(z.v_, xy.v_, Fp::op_.p); }
 	static void addPre(FpDblT& z, const FpDblT& x, const FpDblT& y) { Fp::op_.fpDbl_addPre(z.v_, x.v_, y.v_); }
 	static void subPre(FpDblT& z, const FpDblT& x, const FpDblT& y) { Fp::op_.fpDbl_subPre(z.v_, x.v_, y.v_); }
@@ -162,8 +168,9 @@ public:
 		if (op.fpDbl_addA_ == 0) {
 			op.fpDbl_addA_ = addA;
 		}
-		sub = fp::func_ptr_cast<void (*)(FpDblT&, const FpDblT&, const FpDblT&)>(op.fpDbl_subA_);
-		if (sub == 0) sub = subC;
+		if (op.fpDbl_subA_ == 0) {
+			op.fpDbl_subA_ = subA;
+		}
 		mod = fp::func_ptr_cast<void (*)(Fp&, const FpDblT&)>(op.fpDbl_modA_);
 		if (mod == 0) mod = modC;
 		addPre = fp::func_ptr_cast<void (*)(FpDblT&, const FpDblT&, const FpDblT&)>(op.fpDbl_addPre);
@@ -177,7 +184,6 @@ public:
 };
 
 #ifdef MCL_XBYAK_DIRECT_CALL
-template<class Fp> void (*FpDblT<Fp>::sub)(FpDblT&, const FpDblT&, const FpDblT&);
 template<class Fp> void (*FpDblT<Fp>::mod)(Fp&, const FpDblT&);
 template<class Fp> void (*FpDblT<Fp>::addPre)(FpDblT&, const FpDblT&, const FpDblT&);
 template<class Fp> void (*FpDblT<Fp>::subPre)(FpDblT&, const FpDblT&, const FpDblT&);

From 55cd686b2259a9ca09d833e061582db69903c0ae Mon Sep 17 00:00:00 2001
From: MITSUNARI Shigeo <herumi@nifty.com>
Date: Mon, 10 May 2021 09:32:12 +0900
Subject: [PATCH 06/23] avoid cast of fpDbl_mod

---
 include/mcl/fp_tower.hpp | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/include/mcl/fp_tower.hpp b/include/mcl/fp_tower.hpp
index 61ce62b..3acff3f 100644
--- a/include/mcl/fp_tower.hpp
+++ b/include/mcl/fp_tower.hpp
@@ -127,19 +127,25 @@ public:
 		Fp::op_.fpDbl_subA_(z.v_, x.v_, y.v_);
 #else
 		Fp::op_.fpDbl_sub(z.v_, x.v_, y.v_, Fp::op_.p);
+#endif
+	}
+	static inline void mod(Fp& z, const FpDblT& xy)
+	{
+#ifdef MCL_XBYAK_DIRECT_CALL
+		Fp::op_.fpDbl_modA_(z.v_, xy.v_);
+#else
+		Fp::op_.fpDbl_mod(z.v_, xy.v_, Fp::op_.p);
 #endif
 	}
 #ifdef MCL_XBYAK_DIRECT_CALL
 	static void addA(Unit *z, const Unit *x, const Unit *y) { Fp::op_.fpDbl_add(z, x, y, Fp::op_.p); }
 	static void subA(Unit *z, const Unit *x, const Unit *y) { Fp::op_.fpDbl_sub(z, x, y, Fp::op_.p); }
-	static void (*mod)(Fp& z, const FpDblT& xy);
+	static void modA(Unit *z, const Unit *xy) { Fp::op_.fpDbl_mod(z, xy, Fp::op_.p); }
 	static void (*addPre)(FpDblT& z, const FpDblT& x, const FpDblT& y);
 	static void (*subPre)(FpDblT& z, const FpDblT& x, const FpDblT& y);
-	static void modC(Fp& z, const FpDblT& xy) { Fp::op_.fpDbl_mod(z.v_, xy.v_, Fp::op_.p); }
 	static void addPreC(FpDblT& z, const FpDblT& x, const FpDblT& y) { Fp::op_.fpDbl_addPre(z.v_, x.v_, y.v_); }
 	static void subPreC(FpDblT& z, const FpDblT& x, const FpDblT& y) { Fp::op_.fpDbl_subPre(z.v_, x.v_, y.v_); }
 #else
-	static void mod(Fp& z, const FpDblT& xy) { Fp::op_.fpDbl_mod(z.v_, xy.v_, Fp::op_.p); }
 	static void addPre(FpDblT& z, const FpDblT& x, const FpDblT& y) { Fp::op_.fpDbl_addPre(z.v_, x.v_, y.v_); }
 	static void subPre(FpDblT& z, const FpDblT& x, const FpDblT& y) { Fp::op_.fpDbl_subPre(z.v_, x.v_, y.v_); }
 #endif
@@ -171,8 +177,9 @@ public:
 		if (op.fpDbl_subA_ == 0) {
 			op.fpDbl_subA_ = subA;
 		}
-		mod = fp::func_ptr_cast<void (*)(Fp&, const FpDblT&)>(op.fpDbl_modA_);
-		if (mod == 0) mod = modC;
+		if (op.fpDbl_modA_ == 0) {
+			op.fpDbl_modA_ = modA;
+		}
 		addPre = fp::func_ptr_cast<void (*)(FpDblT&, const FpDblT&, const FpDblT&)>(op.fpDbl_addPre);
 		if (addPre == 0) addPre = addPreC;
 		subPre = fp::func_ptr_cast<void (*)(FpDblT&, const FpDblT&, const FpDblT&)>(op.fpDbl_subPre);
@@ -184,7 +191,6 @@ public:
 };
 
 #ifdef MCL_XBYAK_DIRECT_CALL
-template<class Fp> void (*FpDblT<Fp>::mod)(Fp&, const FpDblT&);
 template<class Fp> void (*FpDblT<Fp>::addPre)(FpDblT&, const FpDblT&, const FpDblT&);
 template<class Fp> void (*FpDblT<Fp>::subPre)(FpDblT&, const FpDblT&, const FpDblT&);
 #endif

From 676488081a60c1371511bd00388de81acc0d8801 Mon Sep 17 00:00:00 2001
From: MITSUNARI Shigeo <herumi@nifty.com>
Date: Mon, 10 May 2021 10:03:41 +0900
Subject: [PATCH 07/23] avoid cast of fpDbl_addPre/subPre

---
 include/mcl/fp_tower.hpp | 30 ++++--------------------------
 1 file changed, 4 insertions(+), 26 deletions(-)

diff --git a/include/mcl/fp_tower.hpp b/include/mcl/fp_tower.hpp
index 3acff3f..31f97c8 100644
--- a/include/mcl/fp_tower.hpp
+++ b/include/mcl/fp_tower.hpp
@@ -141,27 +141,14 @@ public:
 	static void addA(Unit *z, const Unit *x, const Unit *y) { Fp::op_.fpDbl_add(z, x, y, Fp::op_.p); }
 	static void subA(Unit *z, const Unit *x, const Unit *y) { Fp::op_.fpDbl_sub(z, x, y, Fp::op_.p); }
 	static void modA(Unit *z, const Unit *xy) { Fp::op_.fpDbl_mod(z, xy, Fp::op_.p); }
-	static void (*addPre)(FpDblT& z, const FpDblT& x, const FpDblT& y);
-	static void (*subPre)(FpDblT& z, const FpDblT& x, const FpDblT& y);
-	static void addPreC(FpDblT& z, const FpDblT& x, const FpDblT& y) { Fp::op_.fpDbl_addPre(z.v_, x.v_, y.v_); }
-	static void subPreC(FpDblT& z, const FpDblT& x, const FpDblT& y) { Fp::op_.fpDbl_subPre(z.v_, x.v_, y.v_); }
-#else
+#endif
 	static void addPre(FpDblT& z, const FpDblT& x, const FpDblT& y) { Fp::op_.fpDbl_addPre(z.v_, x.v_, y.v_); }
 	static void subPre(FpDblT& z, const FpDblT& x, const FpDblT& y) { Fp::op_.fpDbl_subPre(z.v_, x.v_, y.v_); }
-#endif
 	/*
 		mul(z, x, y) = mulPre(xy, x, y) + mod(z, xy)
 	*/
-	static void mulPre(FpDblT& xy, const Fp& x, const Fp& y)
-	{
-		const mcl::fp::Op& op = Fp::getOp();
-		op.fpDbl_mulPre(xy.v_, x.v_, y.v_);
-	}
-	static void sqrPre(FpDblT& xx, const Fp& x)
-	{
-		const mcl::fp::Op& op = Fp::getOp();
-		op.fpDbl_sqrPre(xx.v_, x.v_);
-	}
+	static void mulPre(FpDblT& xy, const Fp& x, const Fp& y) { Fp::op_.fpDbl_mulPre(xy.v_, x.v_, y.v_); }
+	static void sqrPre(FpDblT& xx, const Fp& x) { Fp::op_.fpDbl_sqrPre(xx.v_, x.v_); }
 	static void mulUnit(FpDblT& z, const FpDblT& x, Unit y)
 	{
 		if (mulSmallUnit(z, x, y)) return;
@@ -170,7 +157,7 @@ public:
 	static void init()
 	{
 #ifdef MCL_XBYAK_DIRECT_CALL
-		mcl::fp::Op& op = Fp::getOpNonConst();
+		mcl::fp::Op& op = Fp::op_;
 		if (op.fpDbl_addA_ == 0) {
 			op.fpDbl_addA_ = addA;
 		}
@@ -180,21 +167,12 @@ public:
 		if (op.fpDbl_modA_ == 0) {
 			op.fpDbl_modA_ = modA;
 		}
-		addPre = fp::func_ptr_cast<void (*)(FpDblT&, const FpDblT&, const FpDblT&)>(op.fpDbl_addPre);
-		if (addPre == 0) addPre = addPreC;
-		subPre = fp::func_ptr_cast<void (*)(FpDblT&, const FpDblT&, const FpDblT&)>(op.fpDbl_subPre);
-		if (subPre == 0) subPre = subPreC;
 #endif
 	}
 	void operator+=(const FpDblT& x) { add(*this, *this, x); }
 	void operator-=(const FpDblT& x) { sub(*this, *this, x); }
 };
 
-#ifdef MCL_XBYAK_DIRECT_CALL
-template<class Fp> void (*FpDblT<Fp>::addPre)(FpDblT&, const FpDblT&, const FpDblT&);
-template<class Fp> void (*FpDblT<Fp>::subPre)(FpDblT&, const FpDblT&, const FpDblT&);
-#endif
-
 /*
 	beta = -1
 	Fp2 = F[i] / (i^2 + 1)

From bb83774fadbaa0b6daf137700b325bc121df2c79 Mon Sep 17 00:00:00 2001
From: MITSUNARI Shigeo <herumi@nifty.com>
Date: Mon, 10 May 2021 11:12:00 +0900
Subject: [PATCH 08/23] avoid cast of Fp2::add

---
 include/mcl/fp_tower.hpp | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/include/mcl/fp_tower.hpp b/include/mcl/fp_tower.hpp
index 31f97c8..41039ba 100644
--- a/include/mcl/fp_tower.hpp
+++ b/include/mcl/fp_tower.hpp
@@ -223,15 +223,21 @@ public:
 		a = a_;
 		b = b_;
 	}
+	static void add(Fp2T& z, const Fp2T& x, const Fp2T& y)
+	{
+#ifdef MCL_XBYAK_DIRECT_CALL
+		Fp::op_.fp2_addA_(z.a.v_, x.a.v_, y.a.v_);
+#else
+		addA(z.a.v_, x.a.v_, y.a.v_);
+#endif
+	}
 #ifdef MCL_XBYAK_DIRECT_CALL
-	static void (*add)(Fp2T& z, const Fp2T& x, const Fp2T& y);
 	static void (*sub)(Fp2T& z, const Fp2T& x, const Fp2T& y);
 	static void (*neg)(Fp2T& y, const Fp2T& x);
 	static void (*mul)(Fp2T& z, const Fp2T& x, const Fp2T& y);
 	static void (*sqr)(Fp2T& y, const Fp2T& x);
 	static void (*mul2)(Fp2T& y, const Fp2T& x);
 #else
-	static void add(Fp2T& z, const Fp2T& x, const Fp2T& y) { addC(z, x, y); }
 	static void sub(Fp2T& z, const Fp2T& x, const Fp2T& y) { subC(z, x, y); }
 	static void neg(Fp2T& y, const Fp2T& x) { negC(y, x); }
 	static void mul(Fp2T& z, const Fp2T& x, const Fp2T& y) { mulC(z, x, y); }
@@ -376,7 +382,6 @@ public:
 	static uint32_t get_xi_a() { return Fp::getOp().xi_a; }
 	static void init(bool *pb)
 	{
-//		assert(Fp::maxSize <= 256);
 		mcl::fp::Op& op = Fp::op_;
 		assert(op.xi_a);
 		// assume p < W/4 where W = 1 << (N * sizeof(Unit) * 8)
@@ -386,8 +391,9 @@ public:
 		}
 		mul_xi = 0;
 #ifdef MCL_XBYAK_DIRECT_CALL
-		add = fp::func_ptr_cast<void (*)(Fp2T& z, const Fp2T& x, const Fp2T& y)>(op.fp2_addA_);
-		if (add == 0) add = addC;
+		if (op.fp2_addA_ == 0) {
+			op.fp2_addA_ = addA;
+		}
 		sub = fp::func_ptr_cast<void (*)(Fp2T& z, const Fp2T& x, const Fp2T& y)>(op.fp2_subA_);
 		if (sub == 0) sub = subC;
 		neg = fp::func_ptr_cast<void (*)(Fp2T& y, const Fp2T& x)>(op.fp2_negA_);
@@ -487,8 +493,11 @@ private:
 		default Fp2T operator
 		Fp2T = Fp[i]/(i^2 + 1)
 	*/
-	static void addC(Fp2T& z, const Fp2T& x, const Fp2T& y)
+	static void addA(Unit *pz, const Unit *px, const Unit *py)
 	{
+		Fp2T& z = *reinterpret_cast<Fp2T*>(pz);
+		const Fp2T& x = *reinterpret_cast<const Fp2T*>(px);
+		const Fp2T& y = *reinterpret_cast<const Fp2T*>(py);
 		Fp::add(z.a, x.a, y.a);
 		Fp::add(z.b, x.b, y.b);
 	}
@@ -594,7 +603,6 @@ private:
 };
 
 #ifdef MCL_XBYAK_DIRECT_CALL
-template<class Fp_> void (*Fp2T<Fp_>::add)(Fp2T& z, const Fp2T& x, const Fp2T& y);
 template<class Fp_> void (*Fp2T<Fp_>::sub)(Fp2T& z, const Fp2T& x, const Fp2T& y);
 template<class Fp_> void (*Fp2T<Fp_>::neg)(Fp2T& y, const Fp2T& x);
 template<class Fp_> void (*Fp2T<Fp_>::mul)(Fp2T& z, const Fp2T& x, const Fp2T& y);

From a36e6f276bfcf6c42634a3c50c897feac6742723 Mon Sep 17 00:00:00 2001
From: MITSUNARI Shigeo <herumi@nifty.com>
Date: Mon, 10 May 2021 11:14:02 +0900
Subject: [PATCH 09/23] avoid cast of Fp2::sub

---
 include/mcl/fp_tower.hpp | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/include/mcl/fp_tower.hpp b/include/mcl/fp_tower.hpp
index 41039ba..04c972e 100644
--- a/include/mcl/fp_tower.hpp
+++ b/include/mcl/fp_tower.hpp
@@ -229,16 +229,22 @@ public:
 		Fp::op_.fp2_addA_(z.a.v_, x.a.v_, y.a.v_);
 #else
 		addA(z.a.v_, x.a.v_, y.a.v_);
+#endif
+	}
+	static void sub(Fp2T& z, const Fp2T& x, const Fp2T& y)
+	{
+#ifdef MCL_XBYAK_DIRECT_CALL
+		Fp::op_.fp2_subA_(z.a.v_, x.a.v_, y.a.v_);
+#else
+		subA(z.a.v_, x.a.v_, y.a.v_);
 #endif
 	}
 #ifdef MCL_XBYAK_DIRECT_CALL
-	static void (*sub)(Fp2T& z, const Fp2T& x, const Fp2T& y);
 	static void (*neg)(Fp2T& y, const Fp2T& x);
 	static void (*mul)(Fp2T& z, const Fp2T& x, const Fp2T& y);
 	static void (*sqr)(Fp2T& y, const Fp2T& x);
 	static void (*mul2)(Fp2T& y, const Fp2T& x);
 #else
-	static void sub(Fp2T& z, const Fp2T& x, const Fp2T& y) { subC(z, x, y); }
 	static void neg(Fp2T& y, const Fp2T& x) { negC(y, x); }
 	static void mul(Fp2T& z, const Fp2T& x, const Fp2T& y) { mulC(z, x, y); }
 	static void sqr(Fp2T& y, const Fp2T& x) { sqrC(y, x); }
@@ -394,8 +400,9 @@ public:
 		if (op.fp2_addA_ == 0) {
 			op.fp2_addA_ = addA;
 		}
-		sub = fp::func_ptr_cast<void (*)(Fp2T& z, const Fp2T& x, const Fp2T& y)>(op.fp2_subA_);
-		if (sub == 0) sub = subC;
+		if (op.fp2_subA_ == 0) {
+			op.fp2_subA_ = subA;
+		}
 		neg = fp::func_ptr_cast<void (*)(Fp2T& y, const Fp2T& x)>(op.fp2_negA_);
 		if (neg == 0) neg = negC;
 		mul = fp::func_ptr_cast<void (*)(Fp2T& z, const Fp2T& x, const Fp2T& y)>(op.fp2_mulA_);
@@ -501,8 +508,11 @@ private:
 		Fp::add(z.a, x.a, y.a);
 		Fp::add(z.b, x.b, y.b);
 	}
-	static void subC(Fp2T& z, const Fp2T& x, const Fp2T& y)
+	static void subA(Unit *pz, const Unit *px, const Unit *py)
 	{
+		Fp2T& z = *reinterpret_cast<Fp2T*>(pz);
+		const Fp2T& x = *reinterpret_cast<const Fp2T*>(px);
+		const Fp2T& y = *reinterpret_cast<const Fp2T*>(py);
 		Fp::sub(z.a, x.a, y.a);
 		Fp::sub(z.b, x.b, y.b);
 	}
@@ -603,7 +613,6 @@ private:
 };
 
 #ifdef MCL_XBYAK_DIRECT_CALL
-template<class Fp_> void (*Fp2T<Fp_>::sub)(Fp2T& z, const Fp2T& x, const Fp2T& y);
 template<class Fp_> void (*Fp2T<Fp_>::neg)(Fp2T& y, const Fp2T& x);
 template<class Fp_> void (*Fp2T<Fp_>::mul)(Fp2T& z, const Fp2T& x, const Fp2T& y);
 template<class Fp_> void (*Fp2T<Fp_>::sqr)(Fp2T& y, const Fp2T& x);

From 76e87c23c679725811fe2d1406e12e246e84b264 Mon Sep 17 00:00:00 2001
From: MITSUNARI Shigeo <herumi@nifty.com>
Date: Mon, 10 May 2021 11:32:21 +0900
Subject: [PATCH 10/23] avoid cast of Fp2::neg

---
 include/mcl/fp_tower.hpp | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/include/mcl/fp_tower.hpp b/include/mcl/fp_tower.hpp
index 04c972e..3d2c317 100644
--- a/include/mcl/fp_tower.hpp
+++ b/include/mcl/fp_tower.hpp
@@ -237,15 +237,21 @@ public:
 		Fp::op_.fp2_subA_(z.a.v_, x.a.v_, y.a.v_);
 #else
 		subA(z.a.v_, x.a.v_, y.a.v_);
+#endif
+	}
+	static void neg(Fp2T& y, const Fp2T& x)
+	{
+#ifdef MCL_XBYAK_DIRECT_CALL
+		Fp::op_.fp2_negA_(y.a.v_, x.a.v_);
+#else
+		negA(y.a.v_, x.a.v_);
 #endif
 	}
 #ifdef MCL_XBYAK_DIRECT_CALL
-	static void (*neg)(Fp2T& y, const Fp2T& x);
 	static void (*mul)(Fp2T& z, const Fp2T& x, const Fp2T& y);
 	static void (*sqr)(Fp2T& y, const Fp2T& x);
 	static void (*mul2)(Fp2T& y, const Fp2T& x);
 #else
-	static void neg(Fp2T& y, const Fp2T& x) { negC(y, x); }
 	static void mul(Fp2T& z, const Fp2T& x, const Fp2T& y) { mulC(z, x, y); }
 	static void sqr(Fp2T& y, const Fp2T& x) { sqrC(y, x); }
 	static void mul2(Fp2T& y, const Fp2T& x) { mul2C(y, x); }
@@ -403,8 +409,9 @@ public:
 		if (op.fp2_subA_ == 0) {
 			op.fp2_subA_ = subA;
 		}
-		neg = fp::func_ptr_cast<void (*)(Fp2T& y, const Fp2T& x)>(op.fp2_negA_);
-		if (neg == 0) neg = negC;
+		if (op.fp2_negA_ == 0) {
+			op.fp2_negA_ = negA;
+		}
 		mul = fp::func_ptr_cast<void (*)(Fp2T& z, const Fp2T& x, const Fp2T& y)>(op.fp2_mulA_);
 		if (mul == 0) mul = mulC;
 		sqr = fp::func_ptr_cast<void (*)(Fp2T& y, const Fp2T& x)>(op.fp2_sqrA_);
@@ -516,8 +523,10 @@ private:
 		Fp::sub(z.a, x.a, y.a);
 		Fp::sub(z.b, x.b, y.b);
 	}
-	static void negC(Fp2T& y, const Fp2T& x)
+	static void negA(Unit *py, const Unit *px)
 	{
+		Fp2T& y = *reinterpret_cast<Fp2T*>(py);
+		const Fp2T& x = *reinterpret_cast<const Fp2T*>(px);
 		Fp::neg(y.a, x.a);
 		Fp::neg(y.b, x.b);
 	}
@@ -613,7 +622,6 @@ private:
 };
 
 #ifdef MCL_XBYAK_DIRECT_CALL
-template<class Fp_> void (*Fp2T<Fp_>::neg)(Fp2T& y, const Fp2T& x);
 template<class Fp_> void (*Fp2T<Fp_>::mul)(Fp2T& z, const Fp2T& x, const Fp2T& y);
 template<class Fp_> void (*Fp2T<Fp_>::sqr)(Fp2T& y, const Fp2T& x);
 template<class Fp_> void (*Fp2T<Fp_>::mul2)(Fp2T& y, const Fp2T& x);

From 646f008ded65498f52173d32e6b888f8dc23cc2f Mon Sep 17 00:00:00 2001
From: MITSUNARI Shigeo <herumi@nifty.com>
Date: Mon, 10 May 2021 11:40:28 +0900
Subject: [PATCH 11/23] avoid cast of Fp2::mul

---
 include/mcl/fp_tower.hpp | 31 ++++++++++++++++++++-----------
 1 file changed, 20 insertions(+), 11 deletions(-)

diff --git a/include/mcl/fp_tower.hpp b/include/mcl/fp_tower.hpp
index 3d2c317..bdd0cdc 100644
--- a/include/mcl/fp_tower.hpp
+++ b/include/mcl/fp_tower.hpp
@@ -245,14 +245,20 @@ public:
 		Fp::op_.fp2_negA_(y.a.v_, x.a.v_);
 #else
 		negA(y.a.v_, x.a.v_);
+#endif
+	}
+	static void mul(Fp2T& z, const Fp2T& x, const Fp2T& y)
+	{
+#ifdef MCL_XBYAK_DIRECT_CALL
+		Fp::op_.fp2_mulA_(z.a.v_, x.a.v_, y.a.v_);
+#else
+		mulA(z.a.v_, x.a.v_, y.a.v_);
 #endif
 	}
 #ifdef MCL_XBYAK_DIRECT_CALL
-	static void (*mul)(Fp2T& z, const Fp2T& x, const Fp2T& y);
 	static void (*sqr)(Fp2T& y, const Fp2T& x);
 	static void (*mul2)(Fp2T& y, const Fp2T& x);
 #else
-	static void mul(Fp2T& z, const Fp2T& x, const Fp2T& y) { mulC(z, x, y); }
 	static void sqr(Fp2T& y, const Fp2T& x) { sqrC(y, x); }
 	static void mul2(Fp2T& y, const Fp2T& x) { mul2C(y, x); }
 #endif
@@ -412,8 +418,9 @@ public:
 		if (op.fp2_negA_ == 0) {
 			op.fp2_negA_ = negA;
 		}
-		mul = fp::func_ptr_cast<void (*)(Fp2T& z, const Fp2T& x, const Fp2T& y)>(op.fp2_mulA_);
-		if (mul == 0) mul = mulC;
+		if (op.fp2_mulA_ == 0) {
+			op.fp2_mulA_ = mulA;
+		}
 		sqr = fp::func_ptr_cast<void (*)(Fp2T& y, const Fp2T& x)>(op.fp2_sqrA_);
 		if (sqr == 0) sqr = sqrC;
 		mul2 = fp::func_ptr_cast<void (*)(Fp2T& y, const Fp2T& x)>(op.fp2_mul2A_);
@@ -530,18 +537,21 @@ private:
 		Fp::neg(y.a, x.a);
 		Fp::neg(y.b, x.b);
 	}
-	static void mul2C(Fp2T& y, const Fp2T& x)
-	{
-		Fp::mul2(y.a, x.a);
-		Fp::mul2(y.b, x.b);
-	}
-	static void mulC(Fp2T& z, const Fp2T& x, const Fp2T& y)
+	static void mulA(Unit *pz, const Unit *px, const Unit *py)
 	{
+		Fp2T& z = *reinterpret_cast<Fp2T*>(pz);
+		const Fp2T& x = *reinterpret_cast<const Fp2T*>(px);
+		const Fp2T& y = *reinterpret_cast<const Fp2T*>(py);
 		Fp2Dbl d;
 		Fp2Dbl::mulPre(d, x, y);
 		FpDbl::mod(z.a, d.a);
 		FpDbl::mod(z.b, d.b);
 	}
+	static void mul2C(Fp2T& y, const Fp2T& x)
+	{
+		Fp::mul2(y.a, x.a);
+		Fp::mul2(y.b, x.b);
+	}
 	/*
 		x = a + bi, i^2 = -1
 		y = x^2 = (a + bi)^2 = (a + b)(a - b) + 2abi
@@ -622,7 +632,6 @@ private:
 };
 
 #ifdef MCL_XBYAK_DIRECT_CALL
-template<class Fp_> void (*Fp2T<Fp_>::mul)(Fp2T& z, const Fp2T& x, const Fp2T& y);
 template<class Fp_> void (*Fp2T<Fp_>::sqr)(Fp2T& y, const Fp2T& x);
 template<class Fp_> void (*Fp2T<Fp_>::mul2)(Fp2T& y, const Fp2T& x);
 #endif

From 7d56a7fbf82b6a8ba6ecea9aa84e059f5c4c8b59 Mon Sep 17 00:00:00 2001
From: MITSUNARI Shigeo <herumi@nifty.com>
Date: Mon, 10 May 2021 11:46:27 +0900
Subject: [PATCH 12/23] avoid cast of Fp2::sqr

---
 include/mcl/fp_tower.hpp | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/include/mcl/fp_tower.hpp b/include/mcl/fp_tower.hpp
index bdd0cdc..3bb7b97 100644
--- a/include/mcl/fp_tower.hpp
+++ b/include/mcl/fp_tower.hpp
@@ -253,13 +253,19 @@ public:
 		Fp::op_.fp2_mulA_(z.a.v_, x.a.v_, y.a.v_);
 #else
 		mulA(z.a.v_, x.a.v_, y.a.v_);
+#endif
+	}
+	static void sqr(Fp2T& y, const Fp2T& x)
+	{
+#ifdef MCL_XBYAK_DIRECT_CALL
+		Fp::op_.fp2_sqrA_(y.a.v_, x.a.v_);
+#else
+		sqrA(y.a.v_, x.a.v_);
 #endif
 	}
 #ifdef MCL_XBYAK_DIRECT_CALL
-	static void (*sqr)(Fp2T& y, const Fp2T& x);
 	static void (*mul2)(Fp2T& y, const Fp2T& x);
 #else
-	static void sqr(Fp2T& y, const Fp2T& x) { sqrC(y, x); }
 	static void mul2(Fp2T& y, const Fp2T& x) { mul2C(y, x); }
 #endif
 	static void (*mul_xi)(Fp2T& y, const Fp2T& x);
@@ -421,8 +427,9 @@ public:
 		if (op.fp2_mulA_ == 0) {
 			op.fp2_mulA_ = mulA;
 		}
-		sqr = fp::func_ptr_cast<void (*)(Fp2T& y, const Fp2T& x)>(op.fp2_sqrA_);
-		if (sqr == 0) sqr = sqrC;
+		if (op.fp2_sqrA_ == 0) {
+			op.fp2_sqrA_ = sqrA;
+		}
 		mul2 = fp::func_ptr_cast<void (*)(Fp2T& y, const Fp2T& x)>(op.fp2_mul2A_);
 		if (mul2 == 0) mul2 = mul2C;
 		mul_xi = fp::func_ptr_cast<void (*)(Fp2T&, const Fp2T&)>(op.fp2_mul_xiA_);
@@ -556,8 +563,10 @@ private:
 		x = a + bi, i^2 = -1
 		y = x^2 = (a + bi)^2 = (a + b)(a - b) + 2abi
 	*/
-	static void sqrC(Fp2T& y, const Fp2T& x)
+	static void sqrA(Unit *py, const Unit *px)
 	{
+		Fp2T& y = *reinterpret_cast<Fp2T*>(py);
+		const Fp2T& x = *reinterpret_cast<const Fp2T*>(px);
 		const Fp& a = x.a;
 		const Fp& b = x.b;
 #if 1 // faster than using FpDbl
@@ -632,7 +641,6 @@ private:
 };
 
 #ifdef MCL_XBYAK_DIRECT_CALL
-template<class Fp_> void (*Fp2T<Fp_>::sqr)(Fp2T& y, const Fp2T& x);
 template<class Fp_> void (*Fp2T<Fp_>::mul2)(Fp2T& y, const Fp2T& x);
 #endif
 template<class Fp_> void (*Fp2T<Fp_>::mul_xi)(Fp2T& y, const Fp2T& x);

From 5f95d70767d6343c563f618a631bfe389eabf30a Mon Sep 17 00:00:00 2001
From: MITSUNARI Shigeo <herumi@nifty.com>
Date: Mon, 10 May 2021 11:55:15 +0900
Subject: [PATCH 13/23] avoid cast of Fp2::mul2

---
 include/mcl/fp_tower.hpp | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/include/mcl/fp_tower.hpp b/include/mcl/fp_tower.hpp
index 3bb7b97..7fa885b 100644
--- a/include/mcl/fp_tower.hpp
+++ b/include/mcl/fp_tower.hpp
@@ -263,11 +263,14 @@ public:
 		sqrA(y.a.v_, x.a.v_);
 #endif
 	}
+	static void mul2(Fp2T& y, const Fp2T& x)
+	{
 #ifdef MCL_XBYAK_DIRECT_CALL
-	static void (*mul2)(Fp2T& y, const Fp2T& x);
+		Fp::op_.fp2_mul2A_(y.a.v_, x.a.v_);
 #else
-	static void mul2(Fp2T& y, const Fp2T& x) { mul2C(y, x); }
+		mul2A(y.a.v_, x.a.v_);
 #endif
+	}
 	static void (*mul_xi)(Fp2T& y, const Fp2T& x);
 	static void addPre(Fp2T& z, const Fp2T& x, const Fp2T& y) { Fp::addPre(z.a, x.a, y.a); Fp::addPre(z.b, x.b, y.b); }
 	static void inv(Fp2T& y, const Fp2T& x) { Fp::op_.fp2_inv(y.a.v_, x.a.v_); }
@@ -430,8 +433,9 @@ public:
 		if (op.fp2_sqrA_ == 0) {
 			op.fp2_sqrA_ = sqrA;
 		}
-		mul2 = fp::func_ptr_cast<void (*)(Fp2T& y, const Fp2T& x)>(op.fp2_mul2A_);
-		if (mul2 == 0) mul2 = mul2C;
+		if (op.fp2_mul2A_ == 0) {
+			op.fp2_mul2A_ = mul2A;
+		}
 		mul_xi = fp::func_ptr_cast<void (*)(Fp2T&, const Fp2T&)>(op.fp2_mul_xiA_);
 #endif
 		op.fp2_inv = fp2_invW;
@@ -554,8 +558,10 @@ private:
 		FpDbl::mod(z.a, d.a);
 		FpDbl::mod(z.b, d.b);
 	}
-	static void mul2C(Fp2T& y, const Fp2T& x)
+	static void mul2A(Unit *py, const Unit *px)
 	{
+		Fp2T& y = *reinterpret_cast<Fp2T*>(py);
+		const Fp2T& x = *reinterpret_cast<const Fp2T*>(px);
 		Fp::mul2(y.a, x.a);
 		Fp::mul2(y.b, x.b);
 	}
@@ -640,9 +646,6 @@ private:
 	}
 };
 
-#ifdef MCL_XBYAK_DIRECT_CALL
-template<class Fp_> void (*Fp2T<Fp_>::mul2)(Fp2T& y, const Fp2T& x);
-#endif
 template<class Fp_> void (*Fp2T<Fp_>::mul_xi)(Fp2T& y, const Fp2T& x);
 
 template<class Fp>

From 02a51c8077784f8e015b3c7215cde8da09c5fed2 Mon Sep 17 00:00:00 2001
From: MITSUNARI Shigeo <herumi@nifty.com>
Date: Mon, 10 May 2021 12:18:07 +0900
Subject: [PATCH 14/23] avoid cast of Fp2::mul_xi

---
 include/mcl/fp_tower.hpp | 25 ++++++++++++++-----------
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/include/mcl/fp_tower.hpp b/include/mcl/fp_tower.hpp
index 7fa885b..f75b4c2 100644
--- a/include/mcl/fp_tower.hpp
+++ b/include/mcl/fp_tower.hpp
@@ -271,7 +271,10 @@ public:
 		mul2A(y.a.v_, x.a.v_);
 #endif
 	}
-	static void (*mul_xi)(Fp2T& y, const Fp2T& x);
+	static void mul_xi(Fp2T& y, const Fp2T& x)
+	{
+		Fp::op_.fp2_mul_xiA_(y.a.v_, x.a.v_);
+	}
 	static void addPre(Fp2T& z, const Fp2T& x, const Fp2T& y) { Fp::addPre(z.a, x.a, y.a); Fp::addPre(z.b, x.b, y.b); }
 	static void inv(Fp2T& y, const Fp2T& x) { Fp::op_.fp2_inv(y.a.v_, x.a.v_); }
 	static void divBy2(Fp2T& y, const Fp2T& x)
@@ -416,7 +419,6 @@ public:
 			*pb = false;
 			return;
 		}
-		mul_xi = 0;
 #ifdef MCL_XBYAK_DIRECT_CALL
 		if (op.fp2_addA_ == 0) {
 			op.fp2_addA_ = addA;
@@ -436,16 +438,15 @@ public:
 		if (op.fp2_mul2A_ == 0) {
 			op.fp2_mul2A_ = mul2A;
 		}
-		mul_xi = fp::func_ptr_cast<void (*)(Fp2T&, const Fp2T&)>(op.fp2_mul_xiA_);
 #endif
-		op.fp2_inv = fp2_invW;
-		if (mul_xi == 0) {
+		if (op.fp2_mul_xiA_ == 0) {
 			if (op.xi_a == 1) {
-				mul_xi = fp2_mul_xi_1_1iC;
+				op.fp2_mul_xiA_ = fp2_mul_xi_1_1iA;
 			} else {
-				mul_xi = fp2_mul_xiC;
+				op.fp2_mul_xiA_ = fp2_mul_xiA;
 			}
 		}
+		op.fp2_inv = fp2_invW;
 		FpDblT<Fp>::init();
 		Fp2DblT<Fp>::init();
 		// call init before Fp2::pow because FpDbl is used in Fp2T
@@ -601,8 +602,10 @@ private:
 		y = (a + bi)xi = (a + bi)(xi_a + i)
 		=(a * x_ia - b) + (a + b xi_a)i
 	*/
-	static void fp2_mul_xiC(Fp2T& y, const Fp2T& x)
+	static void fp2_mul_xiA(Unit *py, const Unit *px)
 	{
+		Fp2T& y = *reinterpret_cast<Fp2T*>(py);
+		const Fp2T& x = *reinterpret_cast<const Fp2T*>(px);
 		const Fp& a = x.a;
 		const Fp& b = x.b;
 		Fp t;
@@ -616,8 +619,10 @@ private:
 		xi = 1 + i ; xi_a = 1
 		y = (a + bi)xi = (a - b) + (a + b)i
 	*/
-	static void fp2_mul_xi_1_1iC(Fp2T& y, const Fp2T& x)
+	static void fp2_mul_xi_1_1iA(Unit *py, const Unit *px)
 	{
+		Fp2T& y = *reinterpret_cast<Fp2T*>(py);
+		const Fp2T& x = *reinterpret_cast<const Fp2T*>(px);
 		const Fp& a = x.a;
 		const Fp& b = x.b;
 		Fp t;
@@ -646,8 +651,6 @@ private:
 	}
 };
 
-template<class Fp_> void (*Fp2T<Fp_>::mul_xi)(Fp2T& y, const Fp2T& x);
-
 template<class Fp>
 struct Fp2DblT {
 	typedef FpDblT<Fp> FpDbl;

From f474245e874f386b279899afbee762290f91b49b Mon Sep 17 00:00:00 2001
From: MITSUNARI Shigeo <herumi@nifty.com>
Date: Mon, 10 May 2021 12:24:45 +0900
Subject: [PATCH 15/23] avoid cast in Fp2::inv

---
 include/mcl/fp_tower.hpp | 39 ++++++++++++++++++---------------------
 include/mcl/op.hpp       |  2 --
 2 files changed, 18 insertions(+), 23 deletions(-)

diff --git a/include/mcl/fp_tower.hpp b/include/mcl/fp_tower.hpp
index f75b4c2..247f7e9 100644
--- a/include/mcl/fp_tower.hpp
+++ b/include/mcl/fp_tower.hpp
@@ -275,8 +275,25 @@ public:
 	{
 		Fp::op_.fp2_mul_xiA_(y.a.v_, x.a.v_);
 	}
+	/*
+		x = a + bi
+		1 / x = (a - bi) / (a^2 + b^2)
+	*/
+	static void inv(Fp2T& y, const Fp2T& x)
+	{
+		assert(!x.isZero());
+		const Fp& a = x.a;
+		const Fp& b = x.b;
+		Fp aa, bb;
+		Fp::sqr(aa, a);
+		Fp::sqr(bb, b);
+		aa += bb;
+		Fp::inv(aa, aa); // aa = 1 / (a^2 + b^2)
+		Fp::mul(y.a, a, aa);
+		Fp::mul(y.b, b, aa);
+		Fp::neg(y.b, y.b);
+	}
 	static void addPre(Fp2T& z, const Fp2T& x, const Fp2T& y) { Fp::addPre(z.a, x.a, y.a); Fp::addPre(z.b, x.b, y.b); }
-	static void inv(Fp2T& y, const Fp2T& x) { Fp::op_.fp2_inv(y.a.v_, x.a.v_); }
 	static void divBy2(Fp2T& y, const Fp2T& x)
 	{
 		Fp::divBy2(y.a, x.a);
@@ -446,7 +463,6 @@ public:
 				op.fp2_mul_xiA_ = fp2_mul_xiA;
 			}
 		}
-		op.fp2_inv = fp2_invW;
 		FpDblT<Fp>::init();
 		Fp2DblT<Fp>::init();
 		// call init before Fp2::pow because FpDbl is used in Fp2T
@@ -630,25 +646,6 @@ private:
 		Fp::sub(y.a, a, b);
 		y.b = t;
 	}
-	/*
-		x = a + bi
-		1 / x = (a - bi) / (a^2 + b^2)
-	*/
-	static void fp2_invW(Unit *y, const Unit *x)
-	{
-		const Fp *px = reinterpret_cast<const Fp*>(x);
-		Fp *py = reinterpret_cast<Fp*>(y);
-		const Fp& a = px[0];
-		const Fp& b = px[1];
-		Fp aa, bb;
-		Fp::sqr(aa, a);
-		Fp::sqr(bb, b);
-		aa += bb;
-		Fp::inv(aa, aa); // aa = 1 / (a^2 + b^2)
-		Fp::mul(py[0], a, aa);
-		Fp::mul(py[1], b, aa);
-		Fp::neg(py[1], py[1]);
-	}
 };
 
 template<class Fp>
diff --git a/include/mcl/op.hpp b/include/mcl/op.hpp
index b1085da..25d6bce 100644
--- a/include/mcl/op.hpp
+++ b/include/mcl/op.hpp
@@ -258,7 +258,6 @@ struct Op {
 	*/
 	int xi_a; // xi = xi_a + u
 	void4u fp2_mulNF;
-	void2u fp2_inv;
 	void2u fp2_mul_xiA_;
 	uint32_t (*hash)(void *out, uint32_t maxOutSize, const void *msg, uint32_t msgSize);
 
@@ -345,7 +344,6 @@ struct Op {
 
 		xi_a = 0;
 		fp2_mulNF = 0;
-		fp2_inv = 0;
 		fp2_mul_xiA_ = 0;
 		hash = 0;
 

From 125451649c6f7b219f316f383fffaddc9a2b10cf Mon Sep 17 00:00:00 2001
From: MITSUNARI Shigeo <herumi@nifty.com>
Date: Mon, 10 May 2021 12:34:51 +0900
Subject: [PATCH 16/23] use FpDbl in Fp2::inv

---
 include/mcl/fp_tower.hpp | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/include/mcl/fp_tower.hpp b/include/mcl/fp_tower.hpp
index 247f7e9..1dea498 100644
--- a/include/mcl/fp_tower.hpp
+++ b/include/mcl/fp_tower.hpp
@@ -284,13 +284,15 @@ public:
 		assert(!x.isZero());
 		const Fp& a = x.a;
 		const Fp& b = x.b;
-		Fp aa, bb;
-		Fp::sqr(aa, a);
-		Fp::sqr(bb, b);
-		aa += bb;
-		Fp::inv(aa, aa); // aa = 1 / (a^2 + b^2)
-		Fp::mul(y.a, a, aa);
-		Fp::mul(y.b, b, aa);
+		FpDbl AA, BB;
+		FpDbl::sqrPre(AA, a);
+		FpDbl::sqrPre(BB, b);
+		FpDbl::addPre(AA, AA, BB);
+		Fp r;
+		FpDbl::mod(r, AA);
+		Fp::inv(r, r); // r = 1 / (a^2 + b^2)
+		Fp::mul(y.a, a, r);
+		Fp::mul(y.b, b, r);
 		Fp::neg(y.b, y.b);
 	}
 	static void addPre(Fp2T& z, const Fp2T& x, const Fp2T& y) { Fp::addPre(z.a, x.a, y.a); Fp::addPre(z.b, x.b, y.b); }

From e25fc2fa62767d5dd5d547b3da4321c0e4e5055f Mon Sep 17 00:00:00 2001
From: MITSUNARI Shigeo <herumi@nifty.com>
Date: Mon, 10 May 2021 14:30:39 +0900
Subject: [PATCH 17/23] refactor cast of Fp2T

---
 include/mcl/fp_tower.hpp | 40 +++++++++++++++++++++-------------------
 1 file changed, 21 insertions(+), 19 deletions(-)

diff --git a/include/mcl/fp_tower.hpp b/include/mcl/fp_tower.hpp
index 1dea498..b92e750 100644
--- a/include/mcl/fp_tower.hpp
+++ b/include/mcl/fp_tower.hpp
@@ -540,38 +540,40 @@ public:
 	}
 #endif
 private:
+	static Fp2T& cast(Unit *x) { return *reinterpret_cast<Fp2T*>(x); }
+	static const Fp2T& cast(const Unit *x) { return *reinterpret_cast<const Fp2T*>(x); }
 	/*
 		default Fp2T operator
 		Fp2T = Fp[i]/(i^2 + 1)
 	*/
 	static void addA(Unit *pz, const Unit *px, const Unit *py)
 	{
-		Fp2T& z = *reinterpret_cast<Fp2T*>(pz);
-		const Fp2T& x = *reinterpret_cast<const Fp2T*>(px);
-		const Fp2T& y = *reinterpret_cast<const Fp2T*>(py);
+		Fp2T& z = cast(pz);
+		const Fp2T& x = cast(px);
+		const Fp2T& y = cast(py);
 		Fp::add(z.a, x.a, y.a);
 		Fp::add(z.b, x.b, y.b);
 	}
 	static void subA(Unit *pz, const Unit *px, const Unit *py)
 	{
-		Fp2T& z = *reinterpret_cast<Fp2T*>(pz);
-		const Fp2T& x = *reinterpret_cast<const Fp2T*>(px);
-		const Fp2T& y = *reinterpret_cast<const Fp2T*>(py);
+		Fp2T& z = cast(pz);
+		const Fp2T& x = cast(px);
+		const Fp2T& y = cast(py);
 		Fp::sub(z.a, x.a, y.a);
 		Fp::sub(z.b, x.b, y.b);
 	}
 	static void negA(Unit *py, const Unit *px)
 	{
-		Fp2T& y = *reinterpret_cast<Fp2T*>(py);
-		const Fp2T& x = *reinterpret_cast<const Fp2T*>(px);
+		Fp2T& y = cast(py);
+		const Fp2T& x = cast(px);
 		Fp::neg(y.a, x.a);
 		Fp::neg(y.b, x.b);
 	}
 	static void mulA(Unit *pz, const Unit *px, const Unit *py)
 	{
-		Fp2T& z = *reinterpret_cast<Fp2T*>(pz);
-		const Fp2T& x = *reinterpret_cast<const Fp2T*>(px);
-		const Fp2T& y = *reinterpret_cast<const Fp2T*>(py);
+		Fp2T& z = cast(pz);
+		const Fp2T& x = cast(px);
+		const Fp2T& y = cast(py);
 		Fp2Dbl d;
 		Fp2Dbl::mulPre(d, x, y);
 		FpDbl::mod(z.a, d.a);
@@ -579,8 +581,8 @@ private:
 	}
 	static void mul2A(Unit *py, const Unit *px)
 	{
-		Fp2T& y = *reinterpret_cast<Fp2T*>(py);
-		const Fp2T& x = *reinterpret_cast<const Fp2T*>(px);
+		Fp2T& y = cast(py);
+		const Fp2T& x = cast(px);
 		Fp::mul2(y.a, x.a);
 		Fp::mul2(y.b, x.b);
 	}
@@ -590,8 +592,8 @@ private:
 	*/
 	static void sqrA(Unit *py, const Unit *px)
 	{
-		Fp2T& y = *reinterpret_cast<Fp2T*>(py);
-		const Fp2T& x = *reinterpret_cast<const Fp2T*>(px);
+		Fp2T& y = cast(py);
+		const Fp2T& x = cast(px);
 		const Fp& a = x.a;
 		const Fp& b = x.b;
 #if 1 // faster than using FpDbl
@@ -622,8 +624,8 @@ private:
 	*/
 	static void fp2_mul_xiA(Unit *py, const Unit *px)
 	{
-		Fp2T& y = *reinterpret_cast<Fp2T*>(py);
-		const Fp2T& x = *reinterpret_cast<const Fp2T*>(px);
+		Fp2T& y = cast(py);
+		const Fp2T& x = cast(px);
 		const Fp& a = x.a;
 		const Fp& b = x.b;
 		Fp t;
@@ -639,8 +641,8 @@ private:
 	*/
 	static void fp2_mul_xi_1_1iA(Unit *py, const Unit *px)
 	{
-		Fp2T& y = *reinterpret_cast<Fp2T*>(py);
-		const Fp2T& x = *reinterpret_cast<const Fp2T*>(px);
+		Fp2T& y = cast(py);
+		const Fp2T& x = cast(px);
 		const Fp& a = x.a;
 		const Fp& b = x.b;
 		Fp t;

From 0f141988bd7b84d79362202b10e7998005d6fe20 Mon Sep 17 00:00:00 2001
From: MITSUNARI Shigeo <herumi@nifty.com>
Date: Mon, 10 May 2021 15:13:03 +0900
Subject: [PATCH 18/23] tweak

---
 include/mcl/fp_tower.hpp | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/include/mcl/fp_tower.hpp b/include/mcl/fp_tower.hpp
index b92e750..13d922c 100644
--- a/include/mcl/fp_tower.hpp
+++ b/include/mcl/fp_tower.hpp
@@ -284,18 +284,18 @@ public:
 		assert(!x.isZero());
 		const Fp& a = x.a;
 		const Fp& b = x.b;
-		FpDbl AA, BB;
-		FpDbl::sqrPre(AA, a);
-		FpDbl::sqrPre(BB, b);
-		FpDbl::addPre(AA, AA, BB);
 		Fp r;
-		FpDbl::mod(r, AA);
+		norm(r, x);
 		Fp::inv(r, r); // r = 1 / (a^2 + b^2)
 		Fp::mul(y.a, a, r);
 		Fp::mul(y.b, b, r);
 		Fp::neg(y.b, y.b);
 	}
-	static void addPre(Fp2T& z, const Fp2T& x, const Fp2T& y) { Fp::addPre(z.a, x.a, y.a); Fp::addPre(z.b, x.b, y.b); }
+	static void addPre(Fp2T& z, const Fp2T& x, const Fp2T& y)
+	{
+		Fp::addPre(z.a, x.a, y.a);
+		Fp::addPre(z.b, x.b, y.b);
+	}
 	static void divBy2(Fp2T& y, const Fp2T& x)
 	{
 		Fp::divBy2(y.a, x.a);
@@ -400,12 +400,14 @@ public:
 		Fp::mul(y.b, x.b, t2);
 		return true;
 	}
+	// y = a^2 + b^2
 	static void inline norm(Fp& y, const Fp2T& x)
 	{
-		Fp aa, bb;
-		Fp::sqr(aa, x.a);
-		Fp::sqr(bb, x.b);
-		Fp::add(y, aa, bb);
+		FpDbl AA, BB;
+		FpDbl::sqrPre(AA, x.a);
+		FpDbl::sqrPre(BB, x.b);
+		FpDbl::addPre(AA, AA, BB);
+		FpDbl::mod(y, AA);
 	}
 	/*
 		Frobenius

From de21f2ea4ce48455bfa025fb67a32242aec953cc Mon Sep 17 00:00:00 2001
From: MITSUNARI Shigeo <herumi@nifty.com>
Date: Mon, 10 May 2021 15:43:25 +0900
Subject: [PATCH 19/23] avoid cast of Fp2Dbl::mulPre

---
 include/mcl/fp_tower.hpp | 33 ++++++++++++++++++++-------------
 1 file changed, 20 insertions(+), 13 deletions(-)

diff --git a/include/mcl/fp_tower.hpp b/include/mcl/fp_tower.hpp
index 13d922c..d7ab91b 100644
--- a/include/mcl/fp_tower.hpp
+++ b/include/mcl/fp_tower.hpp
@@ -656,6 +656,7 @@ private:
 
 template<class Fp>
 struct Fp2DblT {
+	typedef Fp2DblT<Fp> Fp2Dbl;
 	typedef FpDblT<Fp> FpDbl;
 	typedef Fp2T<Fp> Fp2;
 	typedef fp::Unit Unit;
@@ -711,11 +712,13 @@ struct Fp2DblT {
 		FpDbl::add(y.b, y.b, x.a);
 		y.a = t;
 	}
-	static void (*mulPre)(Fp2DblT&, const Fp2&, const Fp2&);
+	static void mulPre(Fp2DblT& z, const Fp2& x, const Fp2& y)
+	{
+		Fp::getOp().fp2Dbl_mulPreA_(z.a.v_, x.getUnit(), y.getUnit());
+	}
 	static void sqrPre(Fp2DblT& y, const Fp2& x)
 	{
-		const mcl::fp::Op& op = Fp::getOp();
-		op.fp2Dbl_sqrPreA_(y.a.v_, x.getUnit());
+		Fp::getOp().fp2Dbl_sqrPreA_(y.a.v_, x.getUnit());
 	}
 	static void (*mul_xi)(Fp2DblT&, const Fp2DblT&);
 	static void mod(Fp2& y, const Fp2DblT& x)
@@ -735,13 +738,11 @@ struct Fp2DblT {
  	{
 		assert(!Fp::getOp().isFullBit);
 		mcl::fp::Op& op = Fp::getOpNonConst();
-		if (op.fp2Dbl_mulPreA_) {
-			mulPre = fp::func_ptr_cast<void (*)(Fp2DblT&, const Fp2&, const Fp2&)>(op.fp2Dbl_mulPreA_);
-		} else {
-			mulPre = fp2Dbl_mulPreW;
+		if (op.fp2Dbl_mulPreA_ == 0) {
+			op.fp2Dbl_mulPreA_ = mulPreA;
 		}
 		if (op.fp2Dbl_sqrPreA_ == 0) {
-			op.fp2Dbl_sqrPreA_ = fp2Dbl_sqrPreC;
+			op.fp2Dbl_sqrPreA_ = sqrPreA;
 		}
 		const uint32_t xi_a = Fp2::get_xi_a();
 		switch (xi_a) {
@@ -756,12 +757,19 @@ struct Fp2DblT {
 			break;
 		}
 	}
+private:
+	static Fp2 cast(Unit *x) { return *reinterpret_cast<Fp2*>(x); }
+	static const Fp2 cast(const Unit *x) { return *reinterpret_cast<const Fp2*>(x); }
+	static Fp2Dbl& castD(Unit *x) { return *reinterpret_cast<Fp2Dbl*>(x); }
 	/*
 		Fp2Dbl::mulPre by FpDblT
 		@note mod of NIST_P192 is fast
 	*/
-	static void fp2Dbl_mulPreW(Fp2DblT& z, const Fp2& x, const Fp2& y)
+	static void mulPreA(Unit *pz, const Unit *px, const Unit *py)
 	{
+		Fp2Dbl& z = castD(pz);
+		const Fp2& x = cast(px);
+		const Fp2& y = cast(py);
 		assert(!Fp::getOp().isFullBit);
 		const Fp& a = x.a;
 		const Fp& b = x.b;
@@ -780,11 +788,11 @@ struct Fp2DblT {
 		FpDbl::subPre(d1, d1, d2);
 		FpDbl::sub(d0, d0, d2); // ac - bd
 	}
-	static void fp2Dbl_sqrPreC(Unit *py, const Unit *px)
+	static void sqrPreA(Unit *py, const Unit *px)
 	{
 		assert(!Fp::getOp().isFullBit);
-		const Fp2& x = *reinterpret_cast<const Fp2*>(px);
-		Fp2DblT& y = *reinterpret_cast<Fp2DblT*>(py);
+		Fp2Dbl& y = castD(py);
+		const Fp2& x = cast(px);
 		Fp t1, t2;
 		Fp::addPre(t1, x.b, x.b); // 2b
 		Fp::addPre(t2, x.a, x.b); // a + b
@@ -794,7 +802,6 @@ struct Fp2DblT {
 	}
 };
 
-template<class Fp> void (*Fp2DblT<Fp>::mulPre)(Fp2DblT&, const Fp2T<Fp>&, const Fp2T<Fp>&);
 template<class Fp> void (*Fp2DblT<Fp>::mul_xi)(Fp2DblT<Fp>&, const Fp2DblT<Fp>&);
 
 template<class Fp> Fp2T<Fp> Fp2T<Fp>::g[Fp2T<Fp>::gN];

From 8f195134f7ba1cbbfcedcf69739d0252a02e0361 Mon Sep 17 00:00:00 2001
From: MITSUNARI Shigeo <herumi@nifty.com>
Date: Mon, 10 May 2021 15:52:26 +0900
Subject: [PATCH 20/23] avoid cast in Fp2Dbl::mul_xi

---
 include/mcl/fp_tower.hpp | 32 +++++++++++++++++---------------
 1 file changed, 17 insertions(+), 15 deletions(-)

diff --git a/include/mcl/fp_tower.hpp b/include/mcl/fp_tower.hpp
index d7ab91b..d4536a9 100644
--- a/include/mcl/fp_tower.hpp
+++ b/include/mcl/fp_tower.hpp
@@ -695,16 +695,20 @@ struct Fp2DblT {
 		FpDbl::neg(y.a, x.a);
 		FpDbl::neg(y.b, x.b);
 	}
-	static void mul_xi_1C(Fp2DblT& y, const Fp2DblT& x)
+	static void mul_xi_1A(Unit *py, const Unit *px)
 	{
+		Fp2Dbl& y = castD(py);
+		const Fp2Dbl& x = castD(px);
 		FpDbl t;
 		FpDbl::add(t, x.a, x.b);
 		FpDbl::sub(y.a, x.a, x.b);
 		y.b = t;
 	}
-	static void mul_xi_genericC(Fp2DblT& y, const Fp2DblT& x)
+	static void mul_xi_genericA(Unit *py, const Unit *px)
 	{
 		const uint32_t xi_a = Fp2::get_xi_a();
+		Fp2Dbl& y = castD(py);
+		const Fp2Dbl& x = castD(px);
 		FpDbl t;
 		FpDbl::mulUnit(t, x.a, xi_a);
 		FpDbl::sub(t, t, x.b);
@@ -720,7 +724,10 @@ struct Fp2DblT {
 	{
 		Fp::getOp().fp2Dbl_sqrPreA_(y.a.v_, x.getUnit());
 	}
-	static void (*mul_xi)(Fp2DblT&, const Fp2DblT&);
+	static void mul_xi(Fp2DblT& y, const Fp2DblT& x)
+	{
+		Fp::getOp().fp2Dbl_mul_xiA_(y.a.v_, x.a.getUnit());
+	}
 	static void mod(Fp2& y, const Fp2DblT& x)
 	{
 		FpDbl::mod(y.a, x.a);
@@ -744,23 +751,20 @@ struct Fp2DblT {
 		if (op.fp2Dbl_sqrPreA_ == 0) {
 			op.fp2Dbl_sqrPreA_ = sqrPreA;
 		}
-		const uint32_t xi_a = Fp2::get_xi_a();
-		switch (xi_a) {
-		case 1:
-			mul_xi = mul_xi_1C;
-			if (op.fp2Dbl_mul_xiA_) {
-				mul_xi = fp::func_ptr_cast<void (*)(Fp2DblT&, const Fp2DblT&)>(op.fp2Dbl_mul_xiA_);
+		if (op.fp2Dbl_mul_xiA_ == 0) {
+			const uint32_t xi_a = Fp2::get_xi_a();
+			if (xi_a == 1) {
+				op.fp2Dbl_mul_xiA_ = mul_xi_1A;
+			} else {
+				op.fp2Dbl_mul_xiA_ = mul_xi_genericA;
 			}
-			break;
-		default:
-			mul_xi = mul_xi_genericC;
-			break;
 		}
 	}
 private:
 	static Fp2 cast(Unit *x) { return *reinterpret_cast<Fp2*>(x); }
 	static const Fp2 cast(const Unit *x) { return *reinterpret_cast<const Fp2*>(x); }
 	static Fp2Dbl& castD(Unit *x) { return *reinterpret_cast<Fp2Dbl*>(x); }
+	static const Fp2Dbl& castD(const Unit *x) { return *reinterpret_cast<const Fp2Dbl*>(x); }
 	/*
 		Fp2Dbl::mulPre by FpDblT
 		@note mod of NIST_P192 is fast
@@ -802,8 +806,6 @@ private:
 	}
 };
 
-template<class Fp> void (*Fp2DblT<Fp>::mul_xi)(Fp2DblT<Fp>&, const Fp2DblT<Fp>&);
-
 template<class Fp> Fp2T<Fp> Fp2T<Fp>::g[Fp2T<Fp>::gN];
 template<class Fp> Fp2T<Fp> Fp2T<Fp>::g2[Fp2T<Fp>::gN];
 template<class Fp> Fp2T<Fp> Fp2T<Fp>::g3[Fp2T<Fp>::gN];

From 3be037a85375d4f1ad71d1ab0d3a197d06899fbf Mon Sep 17 00:00:00 2001
From: MITSUNARI Shigeo <herumi@nifty.com>
Date: Mon, 10 May 2021 16:30:07 +0900
Subject: [PATCH 21/23] tweak

---
 include/mcl/fp_tower.hpp | 42 ++++++++++++++++++++--------------------
 test/common_test.hpp     | 12 +++++++++---
 2 files changed, 30 insertions(+), 24 deletions(-)

diff --git a/include/mcl/fp_tower.hpp b/include/mcl/fp_tower.hpp
index d4536a9..a2cf930 100644
--- a/include/mcl/fp_tower.hpp
+++ b/include/mcl/fp_tower.hpp
@@ -695,27 +695,6 @@ struct Fp2DblT {
 		FpDbl::neg(y.a, x.a);
 		FpDbl::neg(y.b, x.b);
 	}
-	static void mul_xi_1A(Unit *py, const Unit *px)
-	{
-		Fp2Dbl& y = castD(py);
-		const Fp2Dbl& x = castD(px);
-		FpDbl t;
-		FpDbl::add(t, x.a, x.b);
-		FpDbl::sub(y.a, x.a, x.b);
-		y.b = t;
-	}
-	static void mul_xi_genericA(Unit *py, const Unit *px)
-	{
-		const uint32_t xi_a = Fp2::get_xi_a();
-		Fp2Dbl& y = castD(py);
-		const Fp2Dbl& x = castD(px);
-		FpDbl t;
-		FpDbl::mulUnit(t, x.a, xi_a);
-		FpDbl::sub(t, t, x.b);
-		FpDbl::mulUnit(y.b, x.b, xi_a);
-		FpDbl::add(y.b, y.b, x.a);
-		y.a = t;
-	}
 	static void mulPre(Fp2DblT& z, const Fp2& x, const Fp2& y)
 	{
 		Fp::getOp().fp2Dbl_mulPreA_(z.a.v_, x.getUnit(), y.getUnit());
@@ -804,6 +783,27 @@ private:
 		Fp::sub(t1, x.a, x.b); // a - b
 		FpDbl::mulPre(y.a, t1, t2); // (a + b)(a - b)
 	}
+	static void mul_xi_1A(Unit *py, const Unit *px)
+	{
+		Fp2Dbl& y = castD(py);
+		const Fp2Dbl& x = castD(px);
+		FpDbl t;
+		FpDbl::add(t, x.a, x.b);
+		FpDbl::sub(y.a, x.a, x.b);
+		y.b = t;
+	}
+	static void mul_xi_genericA(Unit *py, const Unit *px)
+	{
+		const uint32_t xi_a = Fp2::get_xi_a();
+		Fp2Dbl& y = castD(py);
+		const Fp2Dbl& x = castD(px);
+		FpDbl t;
+		FpDbl::mulUnit(t, x.a, xi_a);
+		FpDbl::sub(t, t, x.b);
+		FpDbl::mulUnit(y.b, x.b, xi_a);
+		FpDbl::add(y.b, y.b, x.a);
+		y.a = t;
+	}
 };
 
 template<class Fp> Fp2T<Fp> Fp2T<Fp>::g[Fp2T<Fp>::gN];
diff --git a/test/common_test.hpp b/test/common_test.hpp
index 74a745c..5deb9f1 100644
--- a/test/common_test.hpp
+++ b/test/common_test.hpp
@@ -163,10 +163,11 @@ void testABCD()
 
 void testFp2Dbl_mul_xi1()
 {
-	if (Fp2::get_xi_a() != 1) return;
+	const uint32_t xi_a = Fp2::get_xi_a();
+	if (xi_a != 1) return;
 	puts("testFp2Dbl_mul_xi1");
 	cybozu::XorShift rg;
-	for (int i = 0; i < 100; i++) {
+	for (int i = 0; i < 10; i++) {
 		Fp a1, a2;
 		a1.setByCSPRNG(rg);
 		a2.setByCSPRNG(rg);
@@ -176,7 +177,12 @@ void testFp2Dbl_mul_xi1()
 		a2.setByCSPRNG(rg);
 		FpDbl::mulPre(x.b, a1, a2);
 		Fp2Dbl ok;
-		Fp2Dbl::mul_xi_1C(ok, x);
+		{
+			FpDbl::mulUnit(ok.a, x.a, xi_a);
+			ok.a -= x.b;
+			FpDbl::mulUnit(ok.b, x.b, xi_a);
+			ok.b += x.a;
+		}
 		Fp2Dbl::mul_xi(x, x);
 		CYBOZU_TEST_EQUAL_ARRAY(ok.a.getUnit(), x.a.getUnit(), ok.a.getUnitSize());
 		CYBOZU_TEST_EQUAL_ARRAY(ok.b.getUnit(), x.b.getUnit(), ok.b.getUnitSize());

From ead99eb002ea98eeb424f134431d50f5005a3ed6 Mon Sep 17 00:00:00 2001
From: MITSUNARI Shigeo <herumi@nifty.com>
Date: Mon, 10 May 2021 16:38:34 +0900
Subject: [PATCH 22/23] avoid cast of Fp::add

---
 include/mcl/fp.hpp | 23 +++++++++++++++++------
 1 file changed, 17 insertions(+), 6 deletions(-)

diff --git a/include/mcl/fp.hpp b/include/mcl/fp.hpp
index f41d4f8..f33e905 100644
--- a/include/mcl/fp.hpp
+++ b/include/mcl/fp.hpp
@@ -146,8 +146,9 @@ public:
 		ioMode_ = 0;
 		isETHserialization_ = false;
 #ifdef MCL_XBYAK_DIRECT_CALL
-		add = fp::func_ptr_cast<void (*)(FpT& z, const FpT& x, const FpT& y)>(op_.fp_addA_);
-		if (add == 0) add = addC;
+		if (op_.fp_addA_ == 0) {
+			op_.fp_addA_ = addA;
+		}
 		sub = fp::func_ptr_cast<void (*)(FpT& z, const FpT& x, const FpT& y)>(op_.fp_subA_);
 		if (sub == 0) sub = subC;
 		neg = fp::func_ptr_cast<void (*)(FpT& y, const FpT& x)>(op_.fp_negA_);
@@ -518,9 +519,21 @@ public:
 		}
 		setArray(pb, gmp::getUnit(x), gmp::getUnitSize(x));
 	}
+	static void add(FpT& z, const FpT& x, const FpT& y)
+	{
+#ifdef MCL_XBYAK_DIRECT_CALL
+		op_.fp_addA_(z.v_, x.v_, y.v_);
+#else
+		op_.fp_add(z.v_, x.v_, y.v_, op_.p);
+#endif
+	}
+#ifdef MCL_XBYAK_DIRECT_CALL
+	static inline void addA(Unit *z, const Unit *x, const Unit *y)
+	{
+		op_.fp_add(z, x, y, op_.p);
+	}
+#endif
 #ifdef MCL_XBYAK_DIRECT_CALL
-	static void (*add)(FpT& z, const FpT& x, const FpT& y);
-	static inline void addC(FpT& z, const FpT& x, const FpT& y) { op_.fp_add(z.v_, x.v_, y.v_, op_.p); }
 	static void (*sub)(FpT& z, const FpT& x, const FpT& y);
 	static inline void subC(FpT& z, const FpT& x, const FpT& y) { op_.fp_sub(z.v_, x.v_, y.v_, op_.p); }
 	static void (*neg)(FpT& y, const FpT& x);
@@ -534,7 +547,6 @@ public:
 	static void (*mul9)(FpT& y, const FpT& x);
 	static inline void mul9C(FpT& y, const FpT& x) { mulSmall(y, x, 9); }
 #else
-	static inline void add(FpT& z, const FpT& x, const FpT& y) { op_.fp_add(z.v_, x.v_, y.v_, op_.p); }
 	static inline void sub(FpT& z, const FpT& x, const FpT& y) { op_.fp_sub(z.v_, x.v_, y.v_, op_.p); }
 	static inline void neg(FpT& y, const FpT& x) { op_.fp_neg(y.v_, x.v_, op_.p); }
 	static inline void mul(FpT& z, const FpT& x, const FpT& y) { op_.fp_mul(z.v_, x.v_, y.v_, op_.p); }
@@ -789,7 +801,6 @@ template<class tag, size_t maxBitSize> FpT<tag, maxBitSize> FpT<tag, maxBitSize>
 template<class tag, size_t maxBitSize> int FpT<tag, maxBitSize>::ioMode_ = IoAuto;
 template<class tag, size_t maxBitSize> bool FpT<tag, maxBitSize>::isETHserialization_ = false;
 #ifdef MCL_XBYAK_DIRECT_CALL
-template<class tag, size_t maxBitSize> void (*FpT<tag, maxBitSize>::add)(FpT& z, const FpT& x, const FpT& y);
 template<class tag, size_t maxBitSize> void (*FpT<tag, maxBitSize>::sub)(FpT& z, const FpT& x, const FpT& y);
 template<class tag, size_t maxBitSize> void (*FpT<tag, maxBitSize>::neg)(FpT& y, const FpT& x);
 template<class tag, size_t maxBitSize> void (*FpT<tag, maxBitSize>::mul)(FpT& z, const FpT& x, const FpT& y);

From 86dd59e3c3faf532d58fd9e61168eedb1ba9005a Mon Sep 17 00:00:00 2001
From: MITSUNARI Shigeo <herumi@nifty.com>
Date: Mon, 10 May 2021 17:16:40 +0900
Subject: [PATCH 23/23] avoid cast of Fp::sub/neg/mul/sqr/mul2/mul9

---
 include/mcl/fp.hpp | 148 +++++++++++++++++++++++++++++++--------------
 1 file changed, 101 insertions(+), 47 deletions(-)

diff --git a/include/mcl/fp.hpp b/include/mcl/fp.hpp
index f33e905..7cf258e 100644
--- a/include/mcl/fp.hpp
+++ b/include/mcl/fp.hpp
@@ -100,6 +100,47 @@ private:
 	template<class Fp> friend class FpDblT;
 	template<class Fp> friend class Fp2T;
 	template<class Fp> friend struct Fp6T;
+#ifdef MCL_XBYAK_DIRECT_CALL
+	static inline void addA(Unit *z, const Unit *x, const Unit *y)
+	{
+		op_.fp_add(z, x, y, op_.p);
+	}
+	static inline void subA(Unit *z, const Unit *x, const Unit *y)
+	{
+		op_.fp_sub(z, x, y, op_.p);
+	}
+	static inline void negA(Unit *y, const Unit *x)
+	{
+		op_.fp_neg(y, x, op_.p);
+	}
+	static inline void mulA(Unit *z, const Unit *x, const Unit *y)
+	{
+		op_.fp_mul(z, x, y, op_.p);
+	}
+	static inline void sqrA(Unit *y, const Unit *x)
+	{
+		op_.fp_sqr(y, x, op_.p);
+	}
+	static inline void mul2A(Unit *y, const Unit *x)
+	{
+		op_.fp_mul2(y, x, op_.p);
+	}
+#endif
+	static inline void mul9A(Unit *y, const Unit *x)
+	{
+		mulSmall(y, x, 9);
+		// NOTE: the direct op_.fp_mul9(y, x, op_.p) call is disabled; mulSmall with multiplier 9 is used instead
+	}
+	static inline void mulSmall(Unit *z, const Unit *x, const uint32_t y)
+	{
+		assert(y <= op_.smallModp.maxMulN);
+		Unit xy[maxSize + 1];
+		op_.fp_mulUnitPre(xy, x, y);
+		int v = op_.smallModp.approxMul(xy);
+		const Unit *pv = op_.smallModp.getPmul(v);
+		op_.fp_subPre(z, xy, pv);
+		op_.fp_sub(z, z, op_.p, op_.p);
+	}
 public:
 	typedef FpT<tag, maxBitSize> BaseFp;
 	// return pointer to array v_[]
@@ -149,18 +190,24 @@ public:
 		if (op_.fp_addA_ == 0) {
 			op_.fp_addA_ = addA;
 		}
-		sub = fp::func_ptr_cast<void (*)(FpT& z, const FpT& x, const FpT& y)>(op_.fp_subA_);
-		if (sub == 0) sub = subC;
-		neg = fp::func_ptr_cast<void (*)(FpT& y, const FpT& x)>(op_.fp_negA_);
-		if (neg == 0) neg = negC;
-		mul = fp::func_ptr_cast<void (*)(FpT& z, const FpT& x, const FpT& y)>(op_.fp_mulA_);
-		if (mul == 0) mul = mulC;
-		sqr = fp::func_ptr_cast<void (*)(FpT& y, const FpT& x)>(op_.fp_sqrA_);
-		if (sqr == 0) sqr = sqrC;
-		mul2 = fp::func_ptr_cast<void (*)(FpT& y, const FpT& x)>(op_.fp_mul2A_);
-		if (mul2 == 0) mul2 = mul2C;
-		mul9 = fp::func_ptr_cast<void (*)(FpT& y, const FpT& x)>(op_.fp_mul9A_);
-		if (mul9 == 0) mul9 = mul9C;
+		if (op_.fp_subA_ == 0) {
+			op_.fp_subA_ = subA;
+		}
+		if (op_.fp_negA_ == 0) {
+			op_.fp_negA_ = negA;
+		}
+		if (op_.fp_mulA_ == 0) {
+			op_.fp_mulA_ = mulA;
+		}
+		if (op_.fp_sqrA_ == 0) {
+			op_.fp_sqrA_ = sqrA;
+		}
+		if (op_.fp_mul2A_ == 0) {
+			op_.fp_mul2A_ = mul2A;
+		}
+		if (op_.fp_mul9A_ == 0) {
+			op_.fp_mul9A_ = mul9A;
+		}
 #endif
 		*pb = true;
 	}
@@ -527,44 +574,59 @@ public:
 		op_.fp_add(z.v_, x.v_, y.v_, op_.p);
 #endif
 	}
+	static void sub(FpT& z, const FpT& x, const FpT& y)
+	{
 #ifdef MCL_XBYAK_DIRECT_CALL
-	static inline void addA(Unit *z, const Unit *x, const Unit *y)
+		op_.fp_subA_(z.v_, x.v_, y.v_);
+#else
+		op_.fp_sub(z.v_, x.v_, y.v_, op_.p);
+#endif
+	}
+	static void neg(FpT& y, const FpT& x)
 	{
-		op_.fp_add(z, x, y, op_.p);
+#ifdef MCL_XBYAK_DIRECT_CALL
+		op_.fp_negA_(y.v_, x.v_);
+#else
+		op_.fp_neg(y.v_, x.v_, op_.p);
+#endif
+	}
+	static void mul(FpT& z, const FpT& x, const FpT& y)
+	{
+#ifdef MCL_XBYAK_DIRECT_CALL
+		op_.fp_mulA_(z.v_, x.v_, y.v_);
+#else
+		op_.fp_mul(z.v_, x.v_, y.v_, op_.p);
+#endif
+	}
+	static void sqr(FpT& y, const FpT& x)
+	{
+#ifdef MCL_XBYAK_DIRECT_CALL
+		op_.fp_sqrA_(y.v_, x.v_);
+#else
+		op_.fp_sqr(y.v_, x.v_, op_.p);
+#endif
 	}
+	static void mul2(FpT& y, const FpT& x)
+	{
+#ifdef MCL_XBYAK_DIRECT_CALL
+		op_.fp_mul2A_(y.v_, x.v_);
+#else
+		op_.fp_mul2(y.v_, x.v_, op_.p);
 #endif
+	}
+	static void mul9(FpT& y, const FpT& x)
+	{
 #ifdef MCL_XBYAK_DIRECT_CALL
-	static void (*sub)(FpT& z, const FpT& x, const FpT& y);
-	static inline void subC(FpT& z, const FpT& x, const FpT& y) { op_.fp_sub(z.v_, x.v_, y.v_, op_.p); }
-	static void (*neg)(FpT& y, const FpT& x);
-	static inline void negC(FpT& y, const FpT& x) { op_.fp_neg(y.v_, x.v_, op_.p); }
-	static void (*mul)(FpT& z, const FpT& x, const FpT& y);
-	static inline void mulC(FpT& z, const FpT& x, const FpT& y) { op_.fp_mul(z.v_, x.v_, y.v_, op_.p); }
-	static void (*sqr)(FpT& y, const FpT& x);
-	static inline void sqrC(FpT& y, const FpT& x) { op_.fp_sqr(y.v_, x.v_, op_.p); }
-	static void (*mul2)(FpT& y, const FpT& x);
-	static inline void mul2C(FpT& y, const FpT& x) { op_.fp_mul2(y.v_, x.v_, op_.p); }
-	static void (*mul9)(FpT& y, const FpT& x);
-	static inline void mul9C(FpT& y, const FpT& x) { mulSmall(y, x, 9); }
+		op_.fp_mul9A_(y.v_, x.v_);
 #else
-	static inline void sub(FpT& z, const FpT& x, const FpT& y) { op_.fp_sub(z.v_, x.v_, y.v_, op_.p); }
-	static inline void neg(FpT& y, const FpT& x) { op_.fp_neg(y.v_, x.v_, op_.p); }
-	static inline void mul(FpT& z, const FpT& x, const FpT& y) { op_.fp_mul(z.v_, x.v_, y.v_, op_.p); }
-	static inline void sqr(FpT& y, const FpT& x) { op_.fp_sqr(y.v_, x.v_, op_.p); }
-	static inline void mul2(FpT& y, const FpT& x) { op_.fp_mul2(y.v_, x.v_, op_.p); }
-	static inline void mul9(FpT& y, const FpT& x) { mulSmall(y, x, 9); }
+		mul9A(y.v_, x.v_);
 #endif
+	}
 	static inline void addPre(FpT& z, const FpT& x, const FpT& y) { op_.fp_addPre(z.v_, x.v_, y.v_); }
 	static inline void subPre(FpT& z, const FpT& x, const FpT& y) { op_.fp_subPre(z.v_, x.v_, y.v_); }
 	static inline void mulSmall(FpT& z, const FpT& x, const uint32_t y)
 	{
-		assert(y <= op_.smallModp.maxMulN);
-		Unit xy[maxSize + 1];
-		op_.fp_mulUnitPre(xy, x.v_, y);
-		int v = op_.smallModp.approxMul(xy);
-		const Unit *pv = op_.smallModp.getPmul(v);
-		op_.fp_subPre(z.v_, xy, pv);
-		op_.fp_sub(z.v_, z.v_, op_.p, op_.p);
+		mulSmall(z.v_, x.v_, y);
 	}
 	static inline void mulUnit(FpT& z, const FpT& x, const Unit y)
 	{
@@ -800,14 +862,6 @@ template<class tag, size_t maxBitSize> fp::Op FpT<tag, maxBitSize>::op_;
 template<class tag, size_t maxBitSize> FpT<tag, maxBitSize> FpT<tag, maxBitSize>::inv2_;
 template<class tag, size_t maxBitSize> int FpT<tag, maxBitSize>::ioMode_ = IoAuto;
 template<class tag, size_t maxBitSize> bool FpT<tag, maxBitSize>::isETHserialization_ = false;
-#ifdef MCL_XBYAK_DIRECT_CALL
-template<class tag, size_t maxBitSize> void (*FpT<tag, maxBitSize>::sub)(FpT& z, const FpT& x, const FpT& y);
-template<class tag, size_t maxBitSize> void (*FpT<tag, maxBitSize>::neg)(FpT& y, const FpT& x);
-template<class tag, size_t maxBitSize> void (*FpT<tag, maxBitSize>::mul)(FpT& z, const FpT& x, const FpT& y);
-template<class tag, size_t maxBitSize> void (*FpT<tag, maxBitSize>::sqr)(FpT& y, const FpT& x);
-template<class tag, size_t maxBitSize> void (*FpT<tag, maxBitSize>::mul2)(FpT& y, const FpT& x);
-template<class tag, size_t maxBitSize> void (*FpT<tag, maxBitSize>::mul9)(FpT& y, const FpT& x);
-#endif
 
 } // mcl