diff --git a/include/mcl/fp.hpp b/include/mcl/fp.hpp
index c52ce78..a223317 100644
--- a/include/mcl/fp.hpp
+++ b/include/mcl/fp.hpp
@@ -118,6 +118,7 @@ public:
 	static inline size_t getBitSize() { return op_.bitSize; }
 	static inline size_t getByteSize() { return (op_.bitSize + 7) / 8; }
 	static inline const fp::Op& getOp() { return op_; }
+	static inline fp::Op& getOpNonConst() { return op_; } // mutable access to op_; Fp2DblT::init() uses it to fill in fp2Dbl_sqrPreA_ when it is unset
 	void dump() const
 	{
 		const size_t N = op_.N;
diff --git a/include/mcl/fp_tower.hpp b/include/mcl/fp_tower.hpp
index b5ed9a6..34dd062 100644
--- a/include/mcl/fp_tower.hpp
+++ b/include/mcl/fp_tower.hpp
@@ -10,10 +10,15 @@
 
 namespace mcl {
 
+template<class Fp> struct Fp12T;
+template<class Fp> class BNT;
+template<class Fp> struct Fp2DblT; // declared ahead of FpDblT because FpDblT names Fp2DblT<Fp> as a friend
+
 template<class Fp>
 class FpDblT : public fp::Serializable<FpDblT<Fp> > {
 	typedef fp::Unit Unit;
 	Unit v_[Fp::maxSize * 2];
+	friend struct Fp2DblT<Fp>;
 public:
 	static size_t getUnitSize() { return Fp::op_.N * 2; }
 	const fp::Unit *getUnit() const { return v_; }
@@ -172,9 +177,6 @@ template<class Fp> void (*FpDblT<Fp>::addPre)(FpDblT&, const FpDblT&, const FpDb
 template<class Fp> void (*FpDblT<Fp>::subPre)(FpDblT&, const FpDblT&, const FpDblT&);
 #endif
 
-template<class Fp> struct Fp12T;
-template<class Fp> class BNT;
-template<class Fp> struct Fp2DblT;
 /*
 	beta = -1
 	Fp2 = F[i] / (i^2 + 1)
@@ -662,7 +664,11 @@ struct Fp2DblT {
 		y.a = t;
 	}
 	static void (*mulPre)(Fp2DblT&, const Fp2&, const Fp2&);
-	static void (*sqrPre)(Fp2DblT&, const Fp2&);
+	static void sqrPre(Fp2DblT& y, const Fp2& x) // forwards to the Op-level fp2Dbl_sqrPreA_ routine on raw Unit buffers
+	{
+		const mcl::fp::Op& op = Fp::getOp();
+		op.fp2Dbl_sqrPreA_(y.a.v_, x.getUnit()); // y.a.v_ is FpDblT's private storage (reachable via the friend declaration); NOTE(review): init() installs fp2Dbl_sqrPreC when this pointer is 0 - confirm init() always runs before the first call
+	}
 	static void (*mul_xi)(Fp2DblT&, const Fp2DblT&);
 	static void mod(Fp2& y, const Fp2DblT& x)
 	{
@@ -680,16 +686,14 @@ struct Fp2DblT {
 	static void init()
  	{
 		assert(!Fp::getOp().isFullBit);
-		const mcl::fp::Op& op = Fp::getOp();
+		mcl::fp::Op& op = Fp::getOpNonConst();
 		if (op.fp2Dbl_mulPreA_) {
 			mulPre = fp::func_ptr_cast<void (*)(Fp2DblT&, const Fp2&, const Fp2&)>(op.fp2Dbl_mulPreA_);
 		} else {
 			mulPre = fp2Dbl_mulPreW;
 		}
-		if (op.fp2Dbl_sqrPreA_) {
-			sqrPre = fp::func_ptr_cast<void (*)(Fp2DblT&, const Fp2&)>(op.fp2Dbl_sqrPreA_);
-		} else {
-			sqrPre = fp2Dbl_sqrPreW;
+		if (op.fp2Dbl_sqrPreA_ == 0) {
+			op.fp2Dbl_sqrPreA_ = fp2Dbl_sqrPreC;
 		}
 		const uint32_t xi_a = Fp2::get_xi_a();
 		switch (xi_a) {
@@ -728,9 +732,11 @@ struct Fp2DblT {
 		FpDbl::subPre(d1, d1, d2);
 		FpDbl::sub(d0, d0, d2); // ac - bd
 	}
-	static void fp2Dbl_sqrPreW(Fp2DblT& y, const Fp2& x)
+	static void fp2Dbl_sqrPreC(Unit *py, const Unit *px)
 	{
 		assert(!Fp::getOp().isFullBit);
+		const Fp2& x = *reinterpret_cast<const Fp2*>(px);
+		Fp2DblT& y = *reinterpret_cast<Fp2DblT*>(py);
 		Fp t1, t2;
 		Fp::addPre(t1, x.b, x.b); // 2b
 		Fp::addPre(t2, x.a, x.b); // a + b
@@ -741,7 +747,6 @@ struct Fp2DblT {
 };
 
 template<class Fp> void (*Fp2DblT<Fp>::mulPre)(Fp2DblT&, const Fp2T<Fp>&, const Fp2T<Fp>&);
-template<class Fp> void (*Fp2DblT<Fp>::sqrPre)(Fp2DblT&, const Fp2T<Fp>&);
 template<class Fp> void (*Fp2DblT<Fp>::mul_xi)(Fp2DblT<Fp>&, const Fp2DblT<Fp>&);
 
 template<class Fp> Fp2T<Fp> Fp2T<Fp>::g[Fp2T<Fp>::gN];