From eaabb2337b011fb4989752a42fcf2d4eefa65fcf Mon Sep 17 00:00:00 2001
From: MITSUNARI Shigeo <herumi@nifty.com>
Date: Thu, 17 Sep 2020 16:44:20 +0900
Subject: [PATCH] fix pic code

---
 Makefile               |  2 +-
 include/mcl/op.hpp     |  7 +++-
 sample/bench.cpp       |  4 +-
 sample/rawbench.cpp    |  2 +-
 src/fp.cpp             | 25 ++++++++++--
 src/fp_generator.hpp   | 84 +++-------------------------------------
 src/fp_static_code.hpp | 87 ++++++++++++++++++++++++++++++++++++++++++
 src/low_func.hpp       |  2 +-
 test/ec_test.cpp       |  2 +-
 test/fp_test.cpp       |  6 +--
 test/fp_tower_test.cpp |  2 +-
 11 files changed, 131 insertions(+), 92 deletions(-)
 create mode 100644 src/fp_static_code.hpp
diff --git a/Makefile b/Makefile
index 9e37876..1b59ce7 100644
--- a/Makefile
+++ b/Makefile
@@ -247,7 +247,7 @@ obj/static_code.o: src/static_code.asm
 	nasm -felf64 -o $@ $<
 
 bin/static_code_test.exe: test/static_code_test.cpp src/fp.cpp obj/static_code.o
-	$(CXX) -o $@ -O3 $^ -DMCL_STATIC_JIT -DMCL_MAX_BIT_SIZE=384 -DMCL_DONT_USE_OPENSSL -DMCL_USE_VINT -DMCL_SIZEOF_UNIT=8 -DMCL_VINT_FIXED_BUFFER -I include -Wall -Wextra
+	$(CXX) -o $@ -O3 $^ -g -DMCL_DONT_USE_XBYAK -DMCL_STATIC_CODE -DMCL_MAX_BIT_SIZE=384 -DMCL_DONT_USE_OPENSSL -DMCL_USE_VINT -DMCL_SIZEOF_UNIT=8 -DMCL_VINT_FIXED_BUFFER -I include -Wall -Wextra
  
 asm: $(LLVM_SRC)
 	$(LLVM_OPT) -O3 -o - $(LLVM_SRC) | $(LLVM_LLC) -O3 $(LLVM_FLAGS) -x86-asm-syntax=intel
diff --git a/include/mcl/op.hpp b/include/mcl/op.hpp
index 99c0e4d..22a78b1 100644
--- a/include/mcl/op.hpp
+++ b/include/mcl/op.hpp
@@ -16,6 +16,9 @@
 #endif
 #if !defined(MCL_DONT_USE_XBYAK) && (defined(_WIN64) || defined(__x86_64__)) && (MCL_SIZEOF_UNIT == 8)
 	#define MCL_USE_XBYAK
+#endif
+#if defined(MCL_USE_XBYAK) || defined(MCL_STATIC_CODE)
+	#define MCL_X64_ASM
 	#define MCL_XBYAK_DIRECT_CALL
 #endif
 
@@ -202,6 +205,8 @@ struct Op {
 	Unit R3[maxUnitSize];
 #ifdef MCL_USE_XBYAK
 	FpGenerator *fg;
+#endif
+#ifdef MCL_X64_ASM
 	mcl::Array<Unit> invTbl;
 #endif
 	void3u fp_addA_;
@@ -288,7 +293,7 @@ struct Op {
 		memset(one, 0, sizeof(one));
 		memset(R2, 0, sizeof(R2));
 		memset(R3, 0, sizeof(R3));
-#ifdef MCL_USE_XBYAK
+#ifdef MCL_X64_ASM
 		invTbl.clear();
 #endif
 		fp_addA_ = 0;
diff --git a/sample/bench.cpp b/sample/bench.cpp
index de81f25..d3c101c 100644
--- a/sample/bench.cpp
+++ b/sample/bench.cpp
@@ -68,7 +68,7 @@ void benchFp(size_t bitSize, int mode)
 		if (mode & 4) benchFpSub(tbl[i].p, tbl[i].x, tbl[i].y, mcl::fp::FP_LLVM);
 		if (mode & 8) benchFpSub(tbl[i].p, tbl[i].x, tbl[i].y, mcl::fp::FP_LLVM_MONT);
 #endif
-#ifdef MCL_USE_XBYAK
+#ifdef MCL_X64_ASM
 		if (mode & 16) benchFpSub(tbl[i].p, tbl[i].x, tbl[i].y, mcl::fp::FP_XBYAK);
 #endif
 	}
@@ -122,7 +122,7 @@ void benchEc(size_t bitSize, int mode, mcl::ec::Mode ecMode)
 		if (mode & 4) benchEcSub(tbl[i], mcl::fp::FP_LLVM, ecMode);
 		if (mode & 8) benchEcSub(tbl[i], mcl::fp::FP_LLVM_MONT, ecMode);
 #endif
-#ifdef MCL_USE_XBYAK
+#ifdef MCL_X64_ASM
 		if (mode & 16) benchEcSub(tbl[i], mcl::fp::FP_XBYAK, ecMode);
 #endif
 	}
diff --git a/sample/rawbench.cpp b/sample/rawbench.cpp
index 4d7506e..cc74bc3 100644
--- a/sample/rawbench.cpp
+++ b/sample/rawbench.cpp
@@ -168,7 +168,7 @@ int main(int argc, char *argv[])
 		benchRaw(tbl[i], mcl::fp::FP_LLVM);
 		benchRaw(tbl[i], mcl::fp::FP_LLVM_MONT);
 #endif
-#ifdef MCL_USE_XBYAK
+#ifdef MCL_X64_ASM
 		if (bitSize <= 384) {
 			benchRaw(tbl[i], mcl::fp::FP_XBYAK);
 		}
diff --git a/src/fp.cpp b/src/fp.cpp
index b3b07d1..ab3a1a7 100644
--- a/src/fp.cpp
+++ b/src/fp.cpp
@@ -3,12 +3,14 @@
 #include <cybozu/sha2.hpp>
 #include <cybozu/endian.hpp>
 #include <mcl/conversion.hpp>
+#ifdef MCL_STATIC_CODE
+#include "fp_static_code.hpp"
+#endif
 #ifdef MCL_USE_XBYAK
 #include "fp_generator.hpp"
 #else
 #define XBYAK_ONLY_CLASS_CPU
 #include "xbyak/xbyak_util.h"
-//#include "detect_cpu.hpp"
 #endif
 #include "low_func.hpp"
 #ifdef MCL_USE_LLVM
@@ -315,7 +317,7 @@ void setOp(Op& op, Mode mode)
 #endif
 }
 
-#ifdef MCL_USE_XBYAK
+#ifdef MCL_X64_ASM
 inline void invOpForMontC(Unit *y, const Unit *x, const Op& op)
 {
 	Unit r[maxUnitSize];
@@ -372,6 +374,12 @@ static bool initForMont(Op& op, const Unit *p, Mode mode)
 		op.fp_invOp = &invOpForMontC;
 		initInvTbl(op);
 	}
+#elif defined(MCL_STATIC_CODE)
+	fp::setStaticCode(op);
+	if (op.isMont && N <= 4) {
+		op.fp_invOp = &invOpForMontC;
+		initInvTbl(op);
+	}
 #endif
 	return true;
 }
@@ -403,14 +411,25 @@ bool Op::init(const mpz_class& _p, size_t maxBitSize, int _xi_a, Mode mode, size
 	priority : MCL_USE_XBYAK > MCL_USE_LLVM > none
 	Xbyak > llvm_mont > llvm > gmp_mont > gmp
 */
-#ifdef MCL_USE_XBYAK
+#ifdef MCL_X64_ASM
 	if (mode == FP_AUTO) mode = FP_XBYAK;
 	if (mode == FP_XBYAK && bitSize > 384) {
 		mode = FP_AUTO;
 	}
+#ifdef MCL_USE_XBYAK
 	if (!isEnableJIT()) {
 		mode = FP_AUTO;
 	}
+#elif defined(MCL_STATIC_CODE)
+	{
+		// the static code uses avx, mulx(BMI2), adox/adcx(ADX); drop back to FP_AUTO if the CPU lacks any of them
+		using namespace Xbyak::util;
+		Cpu cpu;
+		if (!(cpu.has(Cpu::tAVX) && cpu.has(Cpu::tBMI2) && cpu.has(Cpu::tADX))) {
+			mode = FP_AUTO;
+		}
+	}
+#endif
 #else
 	if (mode == FP_XBYAK) mode = FP_AUTO;
 #endif
diff --git a/src/fp_generator.hpp b/src/fp_generator.hpp
index b5d4628..4243368 100644
--- a/src/fp_generator.hpp
+++ b/src/fp_generator.hpp
@@ -7,7 +7,6 @@
 	http://opensource.org/licenses/BSD-3-Clause
 */
 #if CYBOZU_HOST == CYBOZU_HOST_INTEL
-#define XBYAK_NO_OP_NAMES
 #define XBYAK_DISABLE_AVX512
 #include "xbyak/xbyak_util.h"
 
@@ -25,45 +24,6 @@
 
 namespace mcl {
 
-#ifdef MCL_STATIC_JIT
-typedef fp::Unit Unit;
-extern "C" {
-Unit mclx_Fp_addPre(Unit*, const Unit*, const Unit*);
-Unit mclx_Fp_subPre(Unit*, const Unit*, const Unit*);
-void mclx_Fp_add(Unit*, const Unit*, const Unit*);
-void mclx_Fp_sub(Unit*, const Unit*, const Unit*);
-void mclx_Fp_shr1(Unit*, const Unit*);
-void mclx_Fp_neg(Unit*, const Unit*);
-void mclx_Fp_mul(Unit*, const Unit*, const Unit*);
-void mclx_Fp_sqr(Unit*, const Unit*);
-void mclx_FpDbl_add(Unit*, const Unit*, const Unit*);
-void mclx_FpDbl_sub(Unit*, const Unit*, const Unit*);
-void mclx_FpDbl_add(Unit*, const Unit*, const Unit*);
-void mclx_FpDbl_sub(Unit*, const Unit*, const Unit*);
-Unit mclx_FpDbl_addPre(Unit*, const Unit*, const Unit*);
-Unit mclx_FpDbl_subPre(Unit*, const Unit*, const Unit*);
-void mclx_FpDbl_mulPre(Unit*, const Unit*, const Unit*);
-void mclx_FpDbl_sqrPre(Unit*, const Unit*);
-void mclx_FpDbl_mod(Unit*, const Unit*);
-void mclx_Fp2_add(Unit*, const Unit*, const Unit*);
-void mclx_Fp2_sub(Unit*, const Unit*, const Unit*);
-void mclx_Fp2_neg(Unit*, const Unit*);
-void mclx_Fp2_mul(Unit*, const Unit*, const Unit*);
-void mclx_Fp2_sqr(Unit*, const Unit*);
-void mclx_Fp2_mul_xi(Unit*, const Unit*);
-
-Unit mclx_Fr_addPre(Unit*, const Unit*, const Unit*);
-Unit mclx_Fr_subPre(Unit*, const Unit*, const Unit*);
-void mclx_Fr_add(Unit*, const Unit*, const Unit*);
-void mclx_Fr_sub(Unit*, const Unit*, const Unit*);
-void mclx_Fr_shr1(Unit*, const Unit*);
-void mclx_Fr_neg(Unit*, const Unit*);
-void mclx_Fr_mul(Unit*, const Unit*, const Unit*);
-void mclx_Fr_sqr(Unit*, const Unit*);
-int mclx_Fr_preInv(Unit*, const Unit*);
-}
-#endif
-
 #ifdef MCL_DUMP_JIT
 struct DumpCode {
 	FILE *fp_;
@@ -488,38 +448,6 @@ private:
 		align(16);
 		op.fp2_mul_xiA_ = gen_fp2_mul_xi();
 		setFuncInfo(prof_, suf, "2_mul_xi", op.fp2_mul_xiA_, getCurr());
-
-#ifdef MCL_STATIC_JIT
-		if (op.xi_a) {
-			// Fp, sizeof(Fp) = 48, supports Fp2
-			op.fp_addPre = mclx_Fp_addPre;
-			op.fp_subPre = mclx_Fp_subPre;
-			op.fp_addA_ = mclx_Fp_add;
-			op.fp_subA_ = mclx_Fp_sub;
-			op.fp_shr1 = mclx_Fp_shr1;
-			op.fp_negA_ = mclx_Fp_neg;
-			op.fpDbl_addA_ = mclx_FpDbl_add;
-			op.fpDbl_subA_ = mclx_FpDbl_sub;
-			op.fpDbl_addPre = mclx_FpDbl_addPre;
-			op.fpDbl_subPre = mclx_FpDbl_subPre;
-			op.fpDbl_mulPreA_ = mclx_FpDbl_mulPre;
-			op.fpDbl_sqrPreA_ = mclx_FpDbl_sqrPre;
-			op.fpDbl_modA_ = mclx_FpDbl_mod;
-			op.fp_mulA_ = mclx_Fp_mul;
-			op.fp_sqrA_ = mclx_Fp_sqr;
-		} else {
-			// Fr, sizeof(Fr) = 32
-			op.fp_addPre = mclx_Fr_addPre;
-			op.fp_subPre = mclx_Fr_subPre;
-			op.fp_addA_ = mclx_Fr_add;
-			op.fp_subA_ = mclx_Fr_sub;
-			op.fp_shr1 = mclx_Fr_shr1;
-			op.fp_negA_ = mclx_Fr_neg;
-			op.fp_mulA_ = mclx_Fr_mul;
-			op.fp_sqrA_ = mclx_Fr_sqr;
-			op.fp_preInv = mclx_Fr_preInv;
-		}
-#endif
 	}
 	u3u gen_addSubPre(bool isAdd, int n)
 	{
@@ -2774,7 +2702,7 @@ private:
 		mov(rax, px);
 		// px is free frome here
 		load_mp(vv, rax, t); // v = x
-		mov(rax, pL_);
+		lea(rax, ptr[rip+pL_]);
 		load_mp(uu, rax, t); // u = p_
 		// k = 0
 		xor_(rax, rax);
@@ -2852,7 +2780,7 @@ private:
 		const Reg64& t2 = ss.getReg(0);
 		const Reg64& t3 = rdx;
 
-		mov(t2, pL_);
+		lea(t2, ptr[rip+pL_]);
 		if (isFullBit_) {
 			mov(t, ptr [rTop]);
 			test(t, t);
@@ -3724,7 +3652,7 @@ private:
 			}
 		}
 		sub_rr(a, b);
-		mov(rax, pL_);
+		lea(rax, ptr[rip+pL_]);
 		load_rm(b, rax);
 		sbb(rax, rax);
 		for (int i = 0; i < pn_; i++) {
@@ -3732,7 +3660,7 @@ private:
 		}
 		add_rr(a, b);
 		store_mr(py, a);
-		mov(rax, pL_);
+		lea(rax, ptr[rip+pL_]);
 		mov_rr(a, t);
 		sub_rm(t, rax);
 		cmovc_rr(t, a);
@@ -3750,7 +3678,7 @@ private:
 		mov_rr(b, a);
 		add_rm(b, px + FpByte_);
 		sub_rm(a, px + FpByte_);
-		mov(rax, pL_);
+		lea(rax, ptr[rip+pL_]);
 		jnc("@f");
 		add_rm(a, rax);
 	L("@@");
@@ -3925,7 +3853,7 @@ private:
 				mov(ptr [(RegExp)t2 + i * 8], rax);
 			}
 			// t3 = a + p - b
-			mov(rax, pL_);
+			lea(rax, ptr[rip+pL_]);
 			add_rm(a, rax);
 			sub_rr(a, b);
 			store_mr(t3, a);
diff --git a/src/fp_static_code.hpp b/src/fp_static_code.hpp
new file mode 100644
index 0000000..0da39cb
--- /dev/null
+++ b/src/fp_static_code.hpp
@@ -0,0 +1,87 @@
+#pragma once
+/**
+	@file
+	@brief declarations of the external mclx_* routines and setStaticCode() to install them into Op
+	@author MITSUNARI Shigeo(@herumi)
+	@license modified new BSD license
+	http://opensource.org/licenses/BSD-3-Clause
+*/
+#ifndef MCL_STATIC_CODE
+	#error "define MCL_STATIC_CODE"
+#endif
+
+namespace mcl { namespace fp {
+
+extern "C" {
+// defined outside this header (C linkage); only declared here
+Unit mclx_Fp_addPre(Unit*, const Unit*, const Unit*);
+Unit mclx_Fp_subPre(Unit*, const Unit*, const Unit*);
+void mclx_Fp_add(Unit*, const Unit*, const Unit*);
+void mclx_Fp_sub(Unit*, const Unit*, const Unit*);
+void mclx_Fp_shr1(Unit*, const Unit*);
+void mclx_Fp_neg(Unit*, const Unit*);
+void mclx_Fp_mul(Unit*, const Unit*, const Unit*);
+void mclx_Fp_sqr(Unit*, const Unit*);
+void mclx_FpDbl_add(Unit*, const Unit*, const Unit*);
+void mclx_FpDbl_sub(Unit*, const Unit*, const Unit*);
+Unit mclx_FpDbl_addPre(Unit*, const Unit*, const Unit*);
+Unit mclx_FpDbl_subPre(Unit*, const Unit*, const Unit*);
+void mclx_FpDbl_mulPre(Unit*, const Unit*, const Unit*);
+void mclx_FpDbl_sqrPre(Unit*, const Unit*);
+void mclx_FpDbl_mod(Unit*, const Unit*);
+void mclx_Fp2_add(Unit*, const Unit*, const Unit*);
+void mclx_Fp2_sub(Unit*, const Unit*, const Unit*);
+void mclx_Fp2_neg(Unit*, const Unit*);
+void mclx_Fp2_mul(Unit*, const Unit*, const Unit*);
+void mclx_Fp2_sqr(Unit*, const Unit*);
+void mclx_Fp2_mul_xi(Unit*, const Unit*);
+
+Unit mclx_Fr_addPre(Unit*, const Unit*, const Unit*);
+Unit mclx_Fr_subPre(Unit*, const Unit*, const Unit*);
+void mclx_Fr_add(Unit*, const Unit*, const Unit*);
+void mclx_Fr_sub(Unit*, const Unit*, const Unit*);
+void mclx_Fr_shr1(Unit*, const Unit*);
+void mclx_Fr_neg(Unit*, const Unit*);
+void mclx_Fr_mul(Unit*, const Unit*, const Unit*);
+void mclx_Fr_sqr(Unit*, const Unit*);
+int mclx_Fr_preInv(Unit*, const Unit*);
+} // extern "C"
+
+// Install the mclx_* routines into op: the Fp set if op.xi_a is nonzero, else
+// the Fr set. inline because this definition lives in a header.
+inline void setStaticCode(mcl::fp::Op& op)
+{
+	if (op.xi_a) {
+		// Fp, sizeof(Fp) = 48, supports Fp2
+		op.fp_addPre = mclx_Fp_addPre;
+		op.fp_subPre = mclx_Fp_subPre;
+		op.fp_addA_ = mclx_Fp_add;
+		op.fp_subA_ = mclx_Fp_sub;
+		op.fp_shr1 = mclx_Fp_shr1;
+		op.fp_negA_ = mclx_Fp_neg;
+		op.fpDbl_addA_ = mclx_FpDbl_add;
+		op.fpDbl_subA_ = mclx_FpDbl_sub;
+		op.fpDbl_addPre = mclx_FpDbl_addPre;
+		op.fpDbl_subPre = mclx_FpDbl_subPre;
+		op.fpDbl_mulPreA_ = mclx_FpDbl_mulPre;
+		op.fpDbl_sqrPreA_ = mclx_FpDbl_sqrPre;
+		op.fpDbl_modA_ = mclx_FpDbl_mod;
+		op.fp_mulA_ = mclx_Fp_mul;
+		op.fp_sqrA_ = mclx_Fp_sqr;
+	} else {
+		// Fr, sizeof(Fr) = 32
+		op.fp_addPre = mclx_Fr_addPre;
+		op.fp_subPre = mclx_Fr_subPre;
+		op.fp_addA_ = mclx_Fr_add;
+		op.fp_subA_ = mclx_Fr_sub;
+		op.fp_shr1 = mclx_Fr_shr1;
+		op.fp_negA_ = mclx_Fr_neg;
+		op.fp_mulA_ = mclx_Fr_mul;
+		op.fp_sqrA_ = mclx_Fr_sqr;
+		op.fp_preInv = mclx_Fr_preInv;
+	}
+	op.fp_mul = fp::func_ptr_cast<void4u>(op.fp_mulA_);
+}
+
+} } // mcl::fp
+
diff --git a/src/low_func.hpp b/src/low_func.hpp
index 89a748e..2db815e 100644
--- a/src/low_func.hpp
+++ b/src/low_func.hpp
@@ -16,7 +16,7 @@
 #endif
 
 #ifndef MCL_LLVM_BMI2
-	#if (CYBOZU_HOST == CYBOZU_HOST_INTEL) && defined(MCL_USE_XBYAK) && !defined(MCL_USE_VINT)
+	#if (CYBOZU_HOST == CYBOZU_HOST_INTEL) && defined(MCL_X64_ASM) && !defined(MCL_USE_VINT)
 		#define MCL_LLVM_BMI2 1
 	#endif
 #endif
diff --git a/test/ec_test.cpp b/test/ec_test.cpp
index a3e79e5..855ceba 100644
--- a/test/ec_test.cpp
+++ b/test/ec_test.cpp
@@ -602,7 +602,7 @@ void test_sub(const mcl::EcParam *para, size_t paraNum)
 		test_sub_sub(para[i], mcl::fp::FP_LLVM);
 		test_sub_sub(para[i], mcl::fp::FP_LLVM_MONT);
 #endif
-#ifdef MCL_USE_XBYAK
+#ifdef MCL_X64_ASM
 		test_sub_sub(para[i], mcl::fp::FP_XBYAK);
 #endif
 		mulVec(para[i]);
diff --git a/test/fp_test.cpp b/test/fp_test.cpp
index 469f35d..70fef8a 100644
--- a/test/fp_test.cpp
+++ b/test/fp_test.cpp
@@ -876,7 +876,7 @@ void modpTest()
 }
 
 #include <iostream>
-#if (defined(MCL_USE_LLVM) || defined(MCL_USE_XBYAK)) && (MCL_MAX_BIT_SIZE >= 521)
+#if (defined(MCL_USE_LLVM) || defined(MCL_X64_ASM)) && (MCL_MAX_BIT_SIZE >= 521)
 CYBOZU_TEST_AUTO(mod_NIST_P521)
 {
 	const size_t len = 521;
@@ -908,7 +908,7 @@ CYBOZU_TEST_AUTO(mod_NIST_P521)
 		mcl_fpDbl_mod_NIST_P521L(ex, in, Fp::getOp().p);
 		CYBOZU_TEST_EQUAL_ARRAY(ex, ok, N + 1);
 #endif
-#ifdef MCL_USE_XBYAK
+#ifdef MCL_X64_ASM
 		const mcl::fp::Op& op = Fp::getOp();
 		if (!op.isMont) {
 			op.fpDbl_mod(ex, in, op.p);
@@ -1014,7 +1014,7 @@ CYBOZU_TEST_AUTO(main)
 		sub(mcl::fp::FP_LLVM_MONT);
 	}
 #endif
-#ifdef MCL_USE_XBYAK
+#ifdef MCL_X64_ASM
 	if (g_mode.empty() || g_mode == "xbyak") {
 		sub(mcl::fp::FP_XBYAK);
 	}
diff --git a/test/fp_tower_test.cpp b/test/fp_tower_test.cpp
index c26c5d7..4576376 100644
--- a/test/fp_tower_test.cpp
+++ b/test/fp_tower_test.cpp
@@ -465,7 +465,7 @@ void testAll()
 		test(p, mcl::fp::FP_LLVM);
 		test(p, mcl::fp::FP_LLVM_MONT);
 #endif
-#ifdef MCL_USE_XBYAK
+#ifdef MCL_X64_ASM
 		test(p, mcl::fp::FP_XBYAK);
 #endif
 	}