use gmp for >256

9 years ago · 089367e159
parent f21002a6d6
commit 089367e159
3 changed files with 32 additions and 3 deletions
--- a/include/mcl/fp_proto.hpp
+++ b/include/mcl/fp_proto.hpp
@@ -18,6 +18,7 @@ void mcl_fp_add ## len ## L(mcl::fp::Unit*, const mcl::fp::Unit*, const mcl::fp:
 void mcl_fp_sub ## len ## S(mcl::fp::Unit*, const mcl::fp::Unit*, const mcl::fp::Unit*, const mcl::fp::Unit*); \
 void mcl_fp_sub ## len ## L(mcl::fp::Unit*, const mcl::fp::Unit*, const mcl::fp::Unit*, const mcl::fp::Unit*); \
 void mcl_fp_mulPre ## len(mcl::fp::Unit*, const mcl::fp::Unit*, const mcl::fp::Unit*); \
+void mcl_fp_sqrPre ## len(mcl::fp::Unit*, const mcl::fp::Unit*); \
 void mcl_fp_mont ## len(mcl::fp::Unit*, const mcl::fp::Unit*, const mcl::fp::Unit*, const mcl::fp::Unit*, mcl::fp::Unit);

 MCL_FP_DEF_FUNC(128)
--- a/src/fp.cpp
+++ b/src/fp.cpp
@@ -194,6 +194,9 @@ struct OpeFunc {
 			addP = mcl_fp_add ## n ##S; \
 			subP = mcl_fp_sub ## n ##S; \
 			mulPreP = mcl_fp_mulPre ## n; \
+			if (n <= 256) { \
+				sqrPreP = mcl_fp_sqrPre ## n; \
+			} \
 			mont = mcl_fp_mont ## n; \
 		}
 #else
@@ -272,10 +275,12 @@ static void initForMont(Op& op, const Unit *p, Mode mode)
 	op.sub = Xbyak::CastTo<void3u>(fg->sub_);
 	op.mul = Xbyak::CastTo<void3u>(fg->mul_);
 	op.sqr = Xbyak::CastTo<void2u>(fg->sqr_);
-	op.preInv = Xbyak::CastTo<int2u>(op.fg->preInv_);
-	op.invOp = &invOpForMont;
+	if (N <= 4) {
+		op.preInv = Xbyak::CastTo<int2u>(op.fg->preInv_);
+		op.invOp = &invOpForMont;
+		initInvTbl(op);
+	}

-	initInvTbl(op);
 #endif
 }

--- a/src/mul.txt
+++ b/src/mul.txt
@@ -42,6 +42,29 @@ define void @mcl_fp_mulPre$(bit)(i$(unit)* %pz, i$(bit)* %px, i$(bit)* %py) {
  store i$(bu) %sum$(N-1), i$(bu)* %p
  ret void
 }
+define void @mcl_fp_sqrPre$(bit)(i$(unit)* %py, i$(bit)* %px) { ; squaring without reduction: multiplies the value at %px by each of its own limbs and writes the accumulated result to %py
+  %x = load i$(bit)* %px ; the whole operand as a single wide integer
+@for i, 0, N
+  %x$(i) = call i$(unit) @extract$(bit)(i$(bit) %x, i$(bit) $(unit*i)) ; presumably limb i of x (bit offset unit*i); helper is defined elsewhere, confirm
+@endfor
+  %sum0 = call i$(bu) @mul$(bit)x$(unit)(i$(bit) %x, i$(unit) %x0) ; first partial product, presumably x times limb 0 (helper defined elsewhere)
+  %t0 = trunc i$(bu) %sum0 to i$(unit) ; lowest limb of the running sum is already final
+  store i$(unit) %t0, i$(unit)* %py
+@for i, 1, N

+  %s$(i-1) = lshr i$(bu) %sum$(i-1), $(unit) ; discard the limb that has already been written out
+  %xx$(i) = call i$(bu) @mul$(bit)x$(unit)(i$(bit) %x, i$(unit) %x$(i)) ; next partial product, using limb i
+  %sum$(i) = add i$(bu) %s$(i-1), %xx$(i) ; carry-in from the previous round plus the new partial product
+  %y$(i) = getelementptr i$(unit)* %py, i32 $(i) ; address of output limb i
+  @if i < N - 1
+  %ts$(i) = trunc i$(bu) %sum$(i) to i$(unit) ; low limb of this round
+  store i$(unit) %ts$(i), i$(unit)* %y$(i)
+  @endif
+@endfor
+  %p = bitcast i$(unit)* %y$(N-1) to i$(bu)* ; the last round stored nothing above, so its entire sum is written here in one wide store
+  store i$(bu) %sum$(N-1), i$(bu)* %p
+  ret void
+}

@define bu = bit + unit
@define bu2 = bit + unit * 2