Merge branch 'dev'

update-fork
MITSUNARI Shigeo 4 years ago
commit b390d6d4ff
  1. 27
      Makefile
  2. 2
      common.mk
  3. 6
      include/mcl/bn.hpp
  4. 7
      include/mcl/op.hpp
  5. 4
      sample/bench.cpp
  6. 2
      sample/rawbench.cpp
  7. 7
      src/dump_code.cpp
  8. 25
      src/fp.cpp
  9. 228
      src/fp_generator.hpp
  10. 92
      src/fp_static_code.hpp
  11. 4
      src/low_func.hpp
  12. 13
      test/bench.hpp
  13. 7
      test/bls12_test.cpp
  14. 2
      test/ec_test.cpp
  15. 6
      test/fp_test.cpp
  16. 2
      test/fp_tower_test.cpp
  17. 70
      test/static_code_test.cpp

@ -11,6 +11,13 @@ TEST_SRC+=bls12_test.cpp
TEST_SRC+=mapto_wb19_test.cpp TEST_SRC+=mapto_wb19_test.cpp
TEST_SRC+=ecdsa_c_test.cpp TEST_SRC+=ecdsa_c_test.cpp
TEST_SRC+=modp_test.cpp TEST_SRC+=modp_test.cpp
# Static-code build (MCL_STATIC_CODE=1): turn the Xbyak JIT off, fix the
# maximum bit size at 384, pass -DMCL_STATIC_CODE to the compiler and start
# LIB_OBJ with the pre-assembled obj/static_code.o.
ifeq ($(MCL_STATIC_CODE),1)
MCL_USE_XBYAK=0
MCL_MAX_BIT_SIZE=384
CFLAGS+=-DMCL_STATIC_CODE
LIB_OBJ=obj/static_code.o
# plain '=' discards the TEST_SRC entries accumulated above; only bls12_test.cpp remains
TEST_SRC=bls12_test.cpp
endif
ifeq ($(CPU),x86-64) ifeq ($(CPU),x86-64)
MCL_USE_XBYAK?=1 MCL_USE_XBYAK?=1
TEST_SRC+=mont_fp_test.cpp sq_test.cpp TEST_SRC+=mont_fp_test.cpp sq_test.cpp
@ -86,7 +93,7 @@ ifneq ($(CPU),)
ASM_SRC=$(ASM_SRC_PATH_NAME).s ASM_SRC=$(ASM_SRC_PATH_NAME).s
endif endif
ASM_OBJ=$(OBJ_DIR)/$(CPU).o ASM_OBJ=$(OBJ_DIR)/$(CPU).o
LIB_OBJ=$(OBJ_DIR)/fp.o LIB_OBJ+=$(OBJ_DIR)/fp.o
BN256_OBJ=$(OBJ_DIR)/bn_c256.o BN256_OBJ=$(OBJ_DIR)/bn_c256.o
BN384_OBJ=$(OBJ_DIR)/bn_c384.o BN384_OBJ=$(OBJ_DIR)/bn_c384.o
BN384_256_OBJ=$(OBJ_DIR)/bn_c384_256.o BN384_256_OBJ=$(OBJ_DIR)/bn_c384_256.o
@ -106,7 +113,9 @@ ifeq ($(MCL_USE_LLVM),1)
LIB_OBJ+=$(ASM_OBJ) LIB_OBJ+=$(ASM_OBJ)
# special case for intel with bmi2 # special case for intel with bmi2
ifeq ($(INTEL),1) ifeq ($(INTEL),1)
LIB_OBJ+=$(OBJ_DIR)/$(CPU).bmi2.o ifneq ($(MCL_STATIC_CODE),1)
LIB_OBJ+=$(OBJ_DIR)/$(CPU).bmi2.o
endif
endif endif
endif endif
LLVM_SRC=src/base$(BIT).ll LLVM_SRC=src/base$(BIT).ll
@ -237,6 +246,18 @@ endif
$(GEN_EXE): src/gen.cpp src/llvm_gen.hpp $(GEN_EXE): src/gen.cpp src/llvm_gen.hpp
$(CXX) -o $@ $< $(CFLAGS) $(CXX) -o $@ $< $(CFLAGS)
# Build the dump tool: dump_code.cpp plus fp.cpp compiled with -DMCL_DUMP_JIT.
src/dump_code: src/dump_code.cpp src/fp.cpp src/fp_generator.hpp
$(CXX) -o $@ src/dump_code.cpp src/fp.cpp -g -I include -DMCL_DUMP_JIT -DMCL_MAX_BIT_SIZE=384 -DMCL_DONT_USE_OPENSSL -DMCL_USE_VINT -DMCL_SIZEOF_UNIT=8 -DMCL_VINT_FIXED_BUFFER
# Run the dump tool ($<) and capture its stdout as the assembly source.
src/static_code.asm: src/dump_code
$< > $@
# Assemble the dumped source into a 64-bit ELF object with nasm.
obj/static_code.o: src/static_code.asm
nasm -felf64 -o $@ $<
# Test executable linked against the static object; built with Xbyak disabled
# (-DMCL_DONT_USE_XBYAK) and -DMCL_STATIC_CODE.
bin/static_code_test.exe: test/static_code_test.cpp src/fp.cpp obj/static_code.o
$(CXX) -o $@ -O3 $^ -g -DMCL_DONT_USE_XBYAK -DMCL_STATIC_CODE -DMCL_MAX_BIT_SIZE=384 -DMCL_DONT_USE_OPENSSL -DMCL_USE_VINT -DMCL_SIZEOF_UNIT=8 -DMCL_VINT_FIXED_BUFFER -I include -Wall -Wextra
asm: $(LLVM_SRC) asm: $(LLVM_SRC)
$(LLVM_OPT) -O3 -o - $(LLVM_SRC) | $(LLVM_LLC) -O3 $(LLVM_FLAGS) -x86-asm-syntax=intel $(LLVM_OPT) -O3 -o - $(LLVM_SRC) | $(LLVM_LLC) -O3 $(LLVM_FLAGS) -x86-asm-syntax=intel
@ -388,7 +409,7 @@ update_cybozulib:
cp -a $(addprefix ../cybozulib/,$(wildcard include/cybozu/*.hpp)) include/cybozu/ cp -a $(addprefix ../cybozulib/,$(wildcard include/cybozu/*.hpp)) include/cybozu/
clean: clean:
$(RM) $(LIB_DIR)/*.a $(LIB_DIR)/*.$(LIB_SUF) $(OBJ_DIR)/*.o $(OBJ_DIR)/*.obj $(OBJ_DIR)/*.d $(EXE_DIR)/*.exe $(GEN_EXE) $(ASM_OBJ) $(LIB_OBJ) $(BN256_OBJ) $(BN384_OBJ) $(BN512_OBJ) $(FUNC_LIST) src/*.ll lib/*.a $(RM) $(LIB_DIR)/*.a $(LIB_DIR)/*.$(LIB_SUF) $(OBJ_DIR)/*.o $(OBJ_DIR)/*.obj $(OBJ_DIR)/*.d $(EXE_DIR)/*.exe $(GEN_EXE) $(ASM_OBJ) $(LIB_OBJ) $(BN256_OBJ) $(BN384_OBJ) $(BN512_OBJ) $(FUNC_LIST) src/*.ll lib/*.a src/static_code.asm src/dump_code
ALL_SRC=$(SRC_SRC) $(TEST_SRC) $(SAMPLE_SRC) ALL_SRC=$(SRC_SRC) $(TEST_SRC) $(SAMPLE_SRC)
DEPEND_FILE=$(addprefix $(OBJ_DIR)/, $(addsuffix .d,$(basename $(ALL_SRC)))) DEPEND_FILE=$(addprefix $(OBJ_DIR)/, $(addsuffix .d,$(basename $(ALL_SRC))))

@ -91,7 +91,7 @@ else
CFLAGS_OPT+=$(MARCH) CFLAGS_OPT+=$(MARCH)
endif endif
endif endif
CFLAGS_WARN=-Wall -Wextra -Wformat=2 -Wcast-qual -Wcast-align -Wwrite-strings -Wfloat-equal -Wpointer-arith CFLAGS_WARN=-Wall -Wextra -Wformat=2 -Wcast-qual -Wcast-align -Wwrite-strings -Wfloat-equal -Wpointer-arith -Wundef
CFLAGS+=-g3 CFLAGS+=-g3
INC_OPT=-I include -I test INC_OPT=-I include -I test
CFLAGS+=$(CFLAGS_WARN) $(BIT_OPT) $(INC_OPT) CFLAGS+=$(CFLAGS_WARN) $(BIT_OPT) $(INC_OPT)

@ -854,6 +854,12 @@ struct Param {
{ {
this->cp = cp; this->cp = cp;
isBLS12 = cp.curveType == MCL_BLS12_381; isBLS12 = cp.curveType == MCL_BLS12_381;
#ifdef MCL_STATIC_CODE
if (!isBLS12) {
*pb = false;
return;
}
#endif
gmp::setStr(pb, z, cp.z); gmp::setStr(pb, z, cp.z);
if (!*pb) return; if (!*pb) return;
isNegative = z < 0; isNegative = z < 0;

@ -16,6 +16,9 @@
#endif #endif
#if !defined(MCL_DONT_USE_XBYAK) && (defined(_WIN64) || defined(__x86_64__)) && (MCL_SIZEOF_UNIT == 8) #if !defined(MCL_DONT_USE_XBYAK) && (defined(_WIN64) || defined(__x86_64__)) && (MCL_SIZEOF_UNIT == 8)
#define MCL_USE_XBYAK #define MCL_USE_XBYAK
#endif
#if defined(MCL_USE_XBYAK) || defined(MCL_STATIC_CODE)
#define MCL_X64_ASM
#define MCL_XBYAK_DIRECT_CALL #define MCL_XBYAK_DIRECT_CALL
#endif #endif
@ -202,6 +205,8 @@ struct Op {
Unit R3[maxUnitSize]; Unit R3[maxUnitSize];
#ifdef MCL_USE_XBYAK #ifdef MCL_USE_XBYAK
FpGenerator *fg; FpGenerator *fg;
#endif
#ifdef MCL_X64_ASM
mcl::Array<Unit> invTbl; mcl::Array<Unit> invTbl;
#endif #endif
void3u fp_addA_; void3u fp_addA_;
@ -288,7 +293,7 @@ struct Op {
memset(one, 0, sizeof(one)); memset(one, 0, sizeof(one));
memset(R2, 0, sizeof(R2)); memset(R2, 0, sizeof(R2));
memset(R3, 0, sizeof(R3)); memset(R3, 0, sizeof(R3));
#ifdef MCL_USE_XBYAK #ifdef MCL_X64_ASM
invTbl.clear(); invTbl.clear();
#endif #endif
fp_addA_ = 0; fp_addA_ = 0;

@ -68,7 +68,7 @@ void benchFp(size_t bitSize, int mode)
if (mode & 4) benchFpSub(tbl[i].p, tbl[i].x, tbl[i].y, mcl::fp::FP_LLVM); if (mode & 4) benchFpSub(tbl[i].p, tbl[i].x, tbl[i].y, mcl::fp::FP_LLVM);
if (mode & 8) benchFpSub(tbl[i].p, tbl[i].x, tbl[i].y, mcl::fp::FP_LLVM_MONT); if (mode & 8) benchFpSub(tbl[i].p, tbl[i].x, tbl[i].y, mcl::fp::FP_LLVM_MONT);
#endif #endif
#ifdef MCL_USE_XBYAK #ifdef MCL_X64_ASM
if (mode & 16) benchFpSub(tbl[i].p, tbl[i].x, tbl[i].y, mcl::fp::FP_XBYAK); if (mode & 16) benchFpSub(tbl[i].p, tbl[i].x, tbl[i].y, mcl::fp::FP_XBYAK);
#endif #endif
} }
@ -122,7 +122,7 @@ void benchEc(size_t bitSize, int mode, mcl::ec::Mode ecMode)
if (mode & 4) benchEcSub(tbl[i], mcl::fp::FP_LLVM, ecMode); if (mode & 4) benchEcSub(tbl[i], mcl::fp::FP_LLVM, ecMode);
if (mode & 8) benchEcSub(tbl[i], mcl::fp::FP_LLVM_MONT, ecMode); if (mode & 8) benchEcSub(tbl[i], mcl::fp::FP_LLVM_MONT, ecMode);
#endif #endif
#ifdef MCL_USE_XBYAK #ifdef MCL_X64_ASM
if (mode & 16) benchEcSub(tbl[i], mcl::fp::FP_XBYAK, ecMode); if (mode & 16) benchEcSub(tbl[i], mcl::fp::FP_XBYAK, ecMode);
#endif #endif
} }

@ -168,7 +168,7 @@ int main(int argc, char *argv[])
benchRaw(tbl[i], mcl::fp::FP_LLVM); benchRaw(tbl[i], mcl::fp::FP_LLVM);
benchRaw(tbl[i], mcl::fp::FP_LLVM_MONT); benchRaw(tbl[i], mcl::fp::FP_LLVM_MONT);
#endif #endif
#ifdef MCL_USE_XBYAK #ifdef MCL_X64_ASM
if (bitSize <= 384) { if (bitSize <= 384) {
benchRaw(tbl[i], mcl::fp::FP_XBYAK); benchRaw(tbl[i], mcl::fp::FP_XBYAK);
} }

@ -0,0 +1,7 @@
#include <mcl/bls12_381.hpp>
/*
	Entry point of the dump tool.
	It only initializes the BLS12-381 pairing. The Makefile builds this file
	with -DMCL_DUMP_JIT and redirects stdout into src/static_code.asm, so the
	assembly text is presumably produced as a side effect of this call.
*/
int main()
{
	mcl::bn::initPairing(mcl::BLS12_381);
	return 0;
}

@ -3,12 +3,14 @@
#include <cybozu/sha2.hpp> #include <cybozu/sha2.hpp>
#include <cybozu/endian.hpp> #include <cybozu/endian.hpp>
#include <mcl/conversion.hpp> #include <mcl/conversion.hpp>
#ifdef MCL_STATIC_CODE
#include "fp_static_code.hpp"
#endif
#ifdef MCL_USE_XBYAK #ifdef MCL_USE_XBYAK
#include "fp_generator.hpp" #include "fp_generator.hpp"
#else #else
#define XBYAK_ONLY_CLASS_CPU #define XBYAK_ONLY_CLASS_CPU
#include "xbyak/xbyak_util.h" #include "xbyak/xbyak_util.h"
//#include "detect_cpu.hpp"
#endif #endif
#include "low_func.hpp" #include "low_func.hpp"
#ifdef MCL_USE_LLVM #ifdef MCL_USE_LLVM
@ -315,7 +317,7 @@ void setOp(Op& op, Mode mode)
#endif #endif
} }
#ifdef MCL_USE_XBYAK #ifdef MCL_X64_ASM
inline void invOpForMontC(Unit *y, const Unit *x, const Op& op) inline void invOpForMontC(Unit *y, const Unit *x, const Op& op)
{ {
Unit r[maxUnitSize]; Unit r[maxUnitSize];
@ -372,6 +374,12 @@ static bool initForMont(Op& op, const Unit *p, Mode mode)
op.fp_invOp = &invOpForMontC; op.fp_invOp = &invOpForMontC;
initInvTbl(op); initInvTbl(op);
} }
#elif defined(MCL_STATIC_CODE)
fp::setStaticCode(op);
if (op.isMont && N <= 4) {
op.fp_invOp = &invOpForMontC;
initInvTbl(op);
}
#endif #endif
return true; return true;
} }
@ -403,14 +411,25 @@ bool Op::init(const mpz_class& _p, size_t maxBitSize, int _xi_a, Mode mode, size
priority : MCL_USE_XBYAK > MCL_USE_LLVM > none priority : MCL_USE_XBYAK > MCL_USE_LLVM > none
Xbyak > llvm_mont > llvm > gmp_mont > gmp Xbyak > llvm_mont > llvm > gmp_mont > gmp
*/ */
#ifdef MCL_USE_XBYAK #ifdef MCL_X64_ASM
if (mode == FP_AUTO) mode = FP_XBYAK; if (mode == FP_AUTO) mode = FP_XBYAK;
if (mode == FP_XBYAK && bitSize > 384) { if (mode == FP_XBYAK && bitSize > 384) {
mode = FP_AUTO; mode = FP_AUTO;
} }
#ifdef MCL_USE_XBYAK
if (!isEnableJIT()) { if (!isEnableJIT()) {
mode = FP_AUTO; mode = FP_AUTO;
} }
#elif MCL_STATIC_CODE
{
// static jit code uses avx, mulx, adox, adcx
using namespace Xbyak::util;
Cpu cpu;
if (!(cpu.has(Cpu::tAVX) && cpu.has(Cpu::tBMI2) && cpu.has(Cpu::tADX))) {
mode = FP_AUTO;
}
}
#endif
#else #else
if (mode == FP_XBYAK) mode = FP_AUTO; if (mode == FP_XBYAK) mode = FP_AUTO;
#endif #endif

@ -7,7 +7,6 @@
http://opensource.org/licenses/BSD-3-Clause http://opensource.org/licenses/BSD-3-Clause
*/ */
#if CYBOZU_HOST == CYBOZU_HOST_INTEL #if CYBOZU_HOST == CYBOZU_HOST_INTEL
#define XBYAK_NO_OP_NAMES
#define XBYAK_DISABLE_AVX512 #define XBYAK_DISABLE_AVX512
#include "xbyak/xbyak_util.h" #include "xbyak/xbyak_util.h"
@ -25,6 +24,61 @@
namespace mcl { namespace mcl {
#ifdef MCL_DUMP_JIT
/*
	Writer used in place of Xbyak::util::Profiler when MCL_DUMP_JIT is defined.
	It prints generated machine code to fp_ (stdout by default) as assembler
	source text: each function becomes a global, 16-byte aligned label followed
	by `db` lines, and raw data becomes a single `dq` line.
*/
struct DumpCode {
	FILE *fp_; // output stream, stdout unless reassigned
	DumpCode() : fp_(stdout) {}
	/*
		emit one function as a labelled sequence of `db` lines (16 bytes per line)
		@param name [in] label and global symbol name
		@param begin [in] first byte of the code
		@param size [in] number of bytes to dump
	*/
	void set(const std::string& name, const uint8_t *begin, const size_t size)
	{
		const char *label = name.c_str();
		fprintf(fp_, "segment .text\n");
		fprintf(fp_, "global %s\n", label);
		fprintf(fp_, "align 16\n");
		fprintf(fp_, "%s:\n", label);
		const size_t bytesPerLine = 16;
		size_t pos = 0;
		while (pos < size) {
			// the last line may hold fewer than bytesPerLine bytes
			const size_t lineEnd = (size - pos > bytesPerLine) ? pos + bytesPerLine : size;
			fprintf(fp_, "db ");
			for (size_t i = pos; i < lineEnd; i++) {
				fprintf(fp_, "0x%02x,", begin[i]);
			}
			fprintf(fp_, "\n");
			pos = lineEnd;
		}
	}
	/*
		emit the 64-bit words in [begin, end) as one 16-byte aligned `dq` line
		@param begin [in] address of the first word
		@param end [in] address just past the last word
	*/
	void dumpData(const void *begin, const void *end)
	{
		const uint64_t *first = (const uint64_t*)begin;
		const uint64_t *last = (const uint64_t*)end;
		const size_t count = last - first;
		fprintf(fp_, "align 16\n");
		fprintf(fp_, "dq ");
		for (size_t i = 0; i < count; i++) {
			fprintf(fp_, "0x%016llx,", (unsigned long long)first[i]);
		}
		fprintf(fp_, "\n");
	}
};
/*
	pass one generated function to the code dumper under the name "mclx_" + suf + name
	@param prof [in] dumper that receives the code bytes
	@param suf [in] text placed right after "mclx_" ; 0 is treated as ""
	@param name [in] text placed after suf
	@param begin [in] start of the function (converted to const uint8_t*)
	@param end [in] address just past the last byte of the function
*/
template<class T>
void setFuncInfo(DumpCode& prof, const char *suf, const char *name, const T& begin, const uint8_t* end)
{
	const uint8_t *const top = (const uint8_t*)begin;
	std::string label("mclx_");
	label += suf ? suf : "";
	label += name;
	prof.set(label, top, end - top);
}
#else
/*
	register one generated function with the profiler under the name "mclx_" + suf + name
	@param prof [in] profiler that receives the name, start address and size
	@param suf [in] text placed right after "mclx_" ; 0 is treated as ""
	@param name [in] text placed after suf
	@param begin [in] start of the function (converted to const uint8_t*)
	@param end [in] address just past the last byte of the function
*/
template<class T>
void setFuncInfo(Xbyak::util::Profiler& prof, const char *suf, const char *name, const T& begin, const uint8_t* end)
{
	const uint8_t *const top = (const uint8_t*)begin;
	std::string label("mclx_");
	label += suf ? suf : "";
	label += name;
	prof.set(label.c_str(), top, end - top);
}
#endif
namespace fp_gen_local { namespace fp_gen_local {
class MemReg { class MemReg {
@ -203,7 +257,11 @@ struct FpGenerator : Xbyak::CodeGenerator {
int pn_; int pn_;
int FpByte_; int FpByte_;
bool isFullBit_; bool isFullBit_;
#ifdef MCL_DUMP_JIT
DumpCode prof_;
#else
Xbyak::util::Profiler prof_; Xbyak::util::Profiler prof_;
#endif
/* /*
@param op [in] ; use op.p, op.N, op.isFullBit @param op [in] ; use op.p, op.N, op.isFullBit
@ -257,19 +315,22 @@ struct FpGenerator : Xbyak::CodeGenerator {
private: private:
void init_inner(Op& op) void init_inner(Op& op)
{ {
const char *suf = op.xi_a ? "Fp" : "Fr";
op_ = &op; op_ = &op;
L(pL_); L(pL_);
p_ = reinterpret_cast<const uint64_t*>(getCurr()); p_ = reinterpret_cast<const uint64_t*>(getCurr());
for (size_t i = 0; i < op.N; i++) { for (size_t i = 0; i < op.N; i++) {
dq(op.p[i]); dq(op.p[i]);
} }
#ifdef MCL_DUMP_JIT
prof_.dumpData(p_, getCurr());
#endif
rp_ = fp::getMontgomeryCoeff(p_[0]); rp_ = fp::getMontgomeryCoeff(p_[0]);
pn_ = (int)op.N; pn_ = (int)op.N;
FpByte_ = int(op.maxN * sizeof(uint64_t)); FpByte_ = int(op.maxN * sizeof(uint64_t));
isFullBit_ = op.isFullBit; isFullBit_ = op.isFullBit;
// printf("p=%p, pn_=%d, isFullBit_=%d\n", p_, pn_, isFullBit_); // printf("p=%p, pn_=%d, isFullBit_=%d\n", p_, pn_, isFullBit_);
#ifdef MCL_USE_PROF #ifdef MCL_USE_PROF
static char suf[] = "_0";
int profMode = 0; int profMode = 0;
#ifdef XBYAK_USE_VTUNE #ifdef XBYAK_USE_VTUNE
profMode = 2; profMode = 2;
@ -281,94 +342,116 @@ private:
if (profMode) { if (profMode) {
prof_.init(profMode); prof_.init(profMode);
prof_.setStartAddr(getCurr()); prof_.setStartAddr(getCurr());
prof_.setNameSuffix(suf);
suf[1]++;
} }
#else
(void)suf;
#endif #endif
align(16);
op.fp_addPre = gen_addSubPre(true, pn_); op.fp_addPre = gen_addSubPre(true, pn_);
prof_.set("Fp_addPre", getCurr()); setFuncInfo(prof_, suf, "_addPre", op.fp_addPre, getCurr());
align(16);
op.fp_subPre = gen_addSubPre(false, pn_); op.fp_subPre = gen_addSubPre(false, pn_);
prof_.set("Fp_subPre", getCurr()); setFuncInfo(prof_, suf, "_subPre", op.fp_subPre, getCurr());
align(16);
op.fp_addA_ = gen_fp_add(); op.fp_addA_ = gen_fp_add();
prof_.set("Fp_add", getCurr()); setFuncInfo(prof_, suf, "_add", op.fp_addA_, getCurr());
align(16);
op.fp_subA_ = gen_fp_sub(); op.fp_subA_ = gen_fp_sub();
prof_.set("Fp_sub", getCurr()); setFuncInfo(prof_, suf, "_sub", op.fp_subA_, getCurr());
align(16);
op.fp_shr1 = gen_shr1(); op.fp_shr1 = gen_shr1();
prof_.set("Fp_shr1", getCurr()); setFuncInfo(prof_, suf, "_shr1", op.fp_shr1, getCurr());
align(16);
op.fp_negA_ = gen_fp_neg(); op.fp_negA_ = gen_fp_neg();
prof_.set("Fp_neg", getCurr()); setFuncInfo(prof_, suf, "_neg", op.fp_negA_, getCurr());
align(16);
op.fpDbl_addA_ = gen_fpDbl_add();
prof_.set("FpDbl_add", getCurr());
op.fpDbl_subA_ = gen_fpDbl_sub();
prof_.set("FpDbl_sub", getCurr());
op.fpDbl_addPre = gen_addSubPre(true, pn_ * 2);
prof_.set("FpDbl_addPre", getCurr());
op.fpDbl_subPre = gen_addSubPre(false, pn_ * 2);
prof_.set("FpDbl_subPre", getCurr());
op.fpDbl_mulPreA_ = gen_fpDbl_mulPre();
prof_.set("FpDbl_mulPre", getCurr());
op.fpDbl_sqrPreA_ = gen_fpDbl_sqrPre();
prof_.set("FpDbl_sqrPre", getCurr());
op.fpDbl_modA_ = gen_fpDbl_mod(op); op.fpDbl_modA_ = gen_fpDbl_mod(op);
prof_.set("FpDbl_mod", getCurr()); setFuncInfo(prof_, suf, "Dbl_mod", op.fpDbl_modA_, getCurr());
align(16);
op.fp_mulA_ = gen_mul(); op.fp_mulA_ = gen_mul();
prof_.set("Fp_mul", getCurr()); setFuncInfo(prof_, suf, "_mul", op.fp_mulA_, getCurr());
if (op.fp_mulA_) { if (op.fp_mulA_) {
op.fp_mul = fp::func_ptr_cast<void4u>(op.fp_mulA_); // used in toMont/fromMont op.fp_mul = fp::func_ptr_cast<void4u>(op.fp_mulA_); // used in toMont/fromMont
} }
align(16);
op.fp_sqrA_ = gen_sqr(); op.fp_sqrA_ = gen_sqr();
prof_.set("Fp_sqr", getCurr()); setFuncInfo(prof_, suf, "_sqr", op.fp_sqrA_, getCurr());
if (op.primeMode != PM_NIST_P192 && op.N <= 4) { // support general op.N but not fast for op.N > 4 if (op.primeMode != PM_NIST_P192 && op.N <= 4) { // support general op.N but not fast for op.N > 4
align(16); align(16);
op.fp_preInv = getCurr<int2u>(); op.fp_preInv = getCurr<int2u>();
gen_preInv(); gen_preInv();
prof_.set("preInv", getCurr()); setFuncInfo(prof_, suf, "_preInv", op.fp_preInv, getCurr());
} }
if (op.xi_a == 0) return; // Fp2 is not used if (op.xi_a == 0) return; // Fp2 is not used
align(16);
op.fpDbl_addA_ = gen_fpDbl_add();
setFuncInfo(prof_, suf, "Dbl_add", op.fpDbl_addA_, getCurr());
align(16);
op.fpDbl_subA_ = gen_fpDbl_sub();
setFuncInfo(prof_, suf, "Dbl_sub", op.fpDbl_subA_, getCurr());
align(16);
op.fpDbl_addPre = gen_addSubPre(true, pn_ * 2);
setFuncInfo(prof_, suf, "Dbl_addPre", op.fpDbl_addPre, getCurr());
align(16);
op.fpDbl_subPre = gen_addSubPre(false, pn_ * 2);
setFuncInfo(prof_, suf, "Dbl_subPre", op.fpDbl_subPre, getCurr());
align(16);
op.fpDbl_mulPreA_ = gen_fpDbl_mulPre();
setFuncInfo(prof_, suf, "Dbl_mulPre", op.fpDbl_mulPreA_, getCurr());
align(16);
op.fpDbl_sqrPreA_ = gen_fpDbl_sqrPre();
setFuncInfo(prof_, suf, "Dbl_sqrPre", op.fpDbl_sqrPreA_, getCurr());
align(16);
op.fp2_addA_ = gen_fp2_add(); op.fp2_addA_ = gen_fp2_add();
prof_.set("Fp2_add", getCurr()); setFuncInfo(prof_, suf, "2_add", op.fp2_addA_, getCurr());
align(16);
op.fp2_subA_ = gen_fp2_sub(); op.fp2_subA_ = gen_fp2_sub();
prof_.set("Fp2_sub", getCurr()); setFuncInfo(prof_, suf, "2_sub", op.fp2_subA_, getCurr());
align(16);
op.fp2_negA_ = gen_fp2_neg(); op.fp2_negA_ = gen_fp2_neg();
prof_.set("Fp2_neg", getCurr()); setFuncInfo(prof_, suf, "2_neg", op.fp2_negA_, getCurr());
op.fp2_mulNF = 0; op.fp2_mulNF = 0;
align(16);
op.fp2Dbl_mulPreA_ = gen_fp2Dbl_mulPre(); op.fp2Dbl_mulPreA_ = gen_fp2Dbl_mulPre();
prof_.set("Fp2Dbl_mulPre", getCurr()); if (op.fp2Dbl_mulPreA_) setFuncInfo(prof_, suf, "2Dbl_mulPre", op.fp2Dbl_mulPreA_, getCurr());
align(16);
op.fp2Dbl_sqrPreA_ = gen_fp2Dbl_sqrPre(); op.fp2Dbl_sqrPreA_ = gen_fp2Dbl_sqrPre();
prof_.set("Fp2Dbl_sqrPre", getCurr()); if (op.fp2Dbl_sqrPreA_) setFuncInfo(prof_, suf, "2Dbl_sqrPre", op.fp2Dbl_sqrPreA_, getCurr());
align(16);
op.fp2_mulA_ = gen_fp2_mul(); op.fp2_mulA_ = gen_fp2_mul();
prof_.set("Fp2_mul", getCurr()); setFuncInfo(prof_, suf, "2_mul", op.fp2_mulA_, getCurr());
align(16);
op.fp2_sqrA_ = gen_fp2_sqr(); op.fp2_sqrA_ = gen_fp2_sqr();
prof_.set("Fp2_sqr", getCurr()); setFuncInfo(prof_, suf, "2_sqr", op.fp2_sqrA_, getCurr());
align(16);
op.fp2_mul_xiA_ = gen_fp2_mul_xi(); op.fp2_mul_xiA_ = gen_fp2_mul_xi();
prof_.set("Fp2_mul_xi", getCurr()); setFuncInfo(prof_, suf, "2_mul_xi", op.fp2_mul_xiA_, getCurr());
} }
u3u gen_addSubPre(bool isAdd, int n) u3u gen_addSubPre(bool isAdd, int n)
{ {
// if (isFullBit_) return 0; // if (isFullBit_) return 0;
align(16);
u3u func = getCurr<u3u>(); u3u func = getCurr<u3u>();
StackFrame sf(this, 3); StackFrame sf(this, 3);
if (isAdd) { if (isAdd) {
@ -429,7 +512,7 @@ private:
} }
jmp(exit); jmp(exit);
L(nonZero); L(nonZero);
mov(rax, pL_); lea(rax, ptr[rip+pL_]);
for (size_t i = 0; i < t.size(); i++) { for (size_t i = 0; i < t.size(); i++) {
mov(rdx, ptr [rax + i * 8]); mov(rdx, ptr [rax + i * 8]);
if (i == 0) { if (i == 0) {
@ -557,7 +640,7 @@ private:
mov(*fullReg, 0); mov(*fullReg, 0);
adc(*fullReg, 0); adc(*fullReg, 0);
} }
mov(rax, pL_); lea(rax, ptr[rip+pL_]);
sub_rm(p1, rax); sub_rm(p1, rax);
if (fullReg) { if (fullReg) {
sbb(*fullReg, 0); sbb(*fullReg, 0);
@ -577,7 +660,7 @@ private:
const Pack& p1 = t.sub(pn_, pn_); const Pack& p1 = t.sub(pn_, pn_);
load_rm(p0, px); load_rm(p0, px);
sub_rm(p0, py, withCarry); sub_rm(p0, py, withCarry);
mov(rax, pL_); lea(rax, ptr[rip+pL_]);
load_rm(p1, rax); load_rm(p1, rax);
sbb(rax, rax); // rax = (x > y) ? 0 : -1 sbb(rax, rax); // rax = (x > y) ? 0 : -1
for (size_t i = 0; i < p1.size(); i++) { for (size_t i = 0; i < p1.size(); i++) {
@ -618,7 +701,7 @@ private:
Label exit; Label exit;
if (isFullBit_) { if (isFullBit_) {
jnc("@f"); jnc("@f");
mov(t2[0], pL_); // t2 is not used lea(t2[0], ptr[rip+pL_]); // t2[0] is not used
sub_rm(t1, t2[0]); sub_rm(t1, t2[0]);
jmp(exit); jmp(exit);
L("@@"); L("@@");
@ -648,7 +731,6 @@ private:
} }
void3u gen_fp_add() void3u gen_fp_add()
{ {
align(16);
void3u func = getCurr<void3u>(); void3u func = getCurr<void3u>();
if (pn_ <= 4) { if (pn_ <= 4) {
gen_fp_add_le4(); gen_fp_add_le4();
@ -666,7 +748,7 @@ private:
inLocalLabel(); inLocalLabel();
gen_raw_add(pz, px, py, rax, pn_); gen_raw_add(pz, px, py, rax, pn_);
mov(px, pL_); // destroy px lea(px, ptr[rip+pL_]);
if (isFullBit_) { if (isFullBit_) {
jc(".over", jmpMode); jc(".over", jmpMode);
} }
@ -696,7 +778,6 @@ private:
} }
void3u gen_fpDbl_add() void3u gen_fpDbl_add()
{ {
align(16);
void3u func = getCurr<void3u>(); void3u func = getCurr<void3u>();
if (pn_ <= 4) { if (pn_ <= 4) {
int tn = pn_ * 2 + (isFullBit_ ? 1 : 0); int tn = pn_ * 2 + (isFullBit_ ? 1 : 0);
@ -724,7 +805,6 @@ private:
} }
void3u gen_fpDbl_sub() void3u gen_fpDbl_sub()
{ {
align(16);
void3u func = getCurr<void3u>(); void3u func = getCurr<void3u>();
if (pn_ <= 4) { if (pn_ <= 4) {
int tn = pn_ * 2; int tn = pn_ * 2;
@ -774,7 +854,6 @@ private:
} }
void3u gen_fp_sub() void3u gen_fp_sub()
{ {
align(16);
void3u func = getCurr<void3u>(); void3u func = getCurr<void3u>();
if (pn_ <= 4) { if (pn_ <= 4) {
gen_fp_sub_le4(); gen_fp_sub_le4();
@ -792,14 +871,13 @@ private:
Label exit; Label exit;
gen_raw_sub(pz, px, py, rax, pn_); gen_raw_sub(pz, px, py, rax, pn_);
jnc(exit, jmpMode); jnc(exit, jmpMode);
mov(px, pL_); lea(px, ptr[rip+pL_]);
gen_raw_add(pz, pz, px, rax, pn_); gen_raw_add(pz, pz, px, rax, pn_);
L(exit); L(exit);
return func; return func;
} }
void2u gen_fp_neg() void2u gen_fp_neg()
{ {
align(16);
void2u func = getCurr<void2u>(); void2u func = getCurr<void2u>();
StackFrame sf(this, 2, UseRDX | pn_); StackFrame sf(this, 2, UseRDX | pn_);
gen_raw_neg(sf.p[0], sf.p[1], sf.t); gen_raw_neg(sf.p[0], sf.p[1], sf.t);
@ -807,7 +885,6 @@ private:
} }
void2u gen_shr1() void2u gen_shr1()
{ {
align(16);
void2u func = getCurr<void2u>(); void2u func = getCurr<void2u>();
const int c = 1; const int c = 1;
StackFrame sf(this, 2, 1); StackFrame sf(this, 2, 1);
@ -828,7 +905,6 @@ private:
} }
void3u gen_mul() void3u gen_mul()
{ {
align(16);
void3u func = getCurr<void3u>(); void3u func = getCurr<void3u>();
if (op_->primeMode == PM_NIST_P192) { if (op_->primeMode == PM_NIST_P192) {
StackFrame sf(this, 3, 10 | UseRDX, 8 * 6); StackFrame sf(this, 3, 10 | UseRDX, 8 * 6);
@ -901,7 +977,7 @@ private:
mov(a, rp_); mov(a, rp_);
mul(t6); mul(t6);
mov(t0, pL_); lea(t0, ptr[rip+pL_]);
mov(t7, a); // q mov(t7, a); // q
// [d:t7:t1] = p * q // [d:t7:t1] = p * q
@ -970,7 +1046,7 @@ private:
mov(a, rp_); mov(a, rp_);
mul(t10); mul(t10);
mov(t0, pL_); lea(t0, ptr[rip+pL_]);
mov(t7, a); // q mov(t7, a); // q
// [d:t7:t2:t1] = p * q // [d:t7:t2:t1] = p * q
@ -1050,7 +1126,7 @@ private:
mov(a, rp_); mov(a, rp_);
mul(z); mul(z);
mov(t0, pL_); lea(t0, ptr[rip+pL_]);
mov(t7, a); // q mov(t7, a); // q
// [d:t7:t3:t2:t1] = p * q // [d:t7:t3:t2:t1] = p * q
@ -1141,7 +1217,6 @@ private:
} }
void2u gen_fpDbl_mod(const fp::Op& op) void2u gen_fpDbl_mod(const fp::Op& op)
{ {
align(16);
void2u func = getCurr<void2u>(); void2u func = getCurr<void2u>();
if (op.primeMode == PM_NIST_P192) { if (op.primeMode == PM_NIST_P192) {
StackFrame sf(this, 2, 6 | UseRDX); StackFrame sf(this, 2, 6 | UseRDX);
@ -1187,7 +1262,6 @@ private:
} }
void2u gen_sqr() void2u gen_sqr()
{ {
align(16);
void2u func = getCurr<void2u>(); void2u func = getCurr<void2u>();
if (op_->primeMode == PM_NIST_P192) { if (op_->primeMode == PM_NIST_P192) {
StackFrame sf(this, 3, 10 | UseRDX, 6 * 8); StackFrame sf(this, 3, 10 | UseRDX, 6 * 8);
@ -1308,7 +1382,7 @@ private:
L(fp_mulL); L(fp_mulL);
vmovq(xm0, p0); // save p0 vmovq(xm0, p0); // save p0
mov(p0, pL_); lea(p0, ptr[rip+pL_]);
vmovq(xm1, p2); vmovq(xm1, p2);
mov(p2, ptr [p2]); mov(p2, ptr [p2]);
montgomery4_1(rp_, t0, t7, t3, t2, t1, p1, p2, p0, t4, t5, t6, t8, t9, true, xm2); montgomery4_1(rp_, t0, t7, t3, t2, t1, p1, p2, p0, t4, t5, t6, t8, t9, true, xm2);
@ -1404,7 +1478,7 @@ private:
mov(a, rp_); mov(a, rp_);
mul(c[0]); // q = a mul(c[0]); // q = a
mov(d, a); mov(d, a);
mov(t1, pL_); lea(t1, ptr[rip+pL_]);
// c += p * q // c += p * q
mulAdd(c, 6, t1); mulAdd(c, 6, t1);
} }
@ -1450,7 +1524,7 @@ private:
const Pack z = Pack(t3, t2, t1, t0, t7, t6); const Pack z = Pack(t3, t2, t1, t0, t7, t6);
const Pack keep = Pack(rdx, rax, px, py, t8, t9); const Pack keep = Pack(rdx, rax, px, py, t8, t9);
mov_rr(keep, z); mov_rr(keep, z);
mov(t5, pL_); lea(t5, ptr[rip+pL_]);
sub_rm(z, t5); sub_rm(z, t5);
cmovc_rr(z, keep); cmovc_rr(z, keep);
store_mr(pz, z); store_mr(pz, z);
@ -1480,7 +1554,7 @@ private:
const Reg64& t9 = sf.t[9]; const Reg64& t9 = sf.t[9];
vmovq(xm0, p0); // save p0 vmovq(xm0, p0); // save p0
mov(t7, pL_); lea(t7, ptr[rip+pL_]);
mov(t9, ptr [p2]); mov(t9, ptr [p2]);
// c3, c2, c1, c0, px, y, p, // c3, c2, c1, c0, px, y, p,
montgomery3_1(rp_, t0, t3, t2, t1, p1, t9, t7, t4, t5, t6, t8, p0, true); montgomery3_1(rp_, t0, t3, t2, t1, p1, t9, t7, t4, t5, t6, t8, p0, true);
@ -1526,7 +1600,7 @@ private:
const Reg64& t9 = sf.t[9]; const Reg64& t9 = sf.t[9];
vmovq(xm0, pz); // save pz vmovq(xm0, pz); // save pz
mov(t7, pL_); lea(t7, ptr[rip+pL_]);
mov(t9, ptr [px]); mov(t9, ptr [px]);
mul3x1_sqr1(px, t9, t3, t2, t1, t0); mul3x1_sqr1(px, t9, t3, t2, t1, t0);
mov(t0, rdx); mov(t0, rdx);
@ -2291,7 +2365,6 @@ private:
} }
void2u gen_fpDbl_sqrPre() void2u gen_fpDbl_sqrPre()
{ {
align(16);
void2u func = getCurr<void2u>(); void2u func = getCurr<void2u>();
if (pn_ == 2 && useMulx_) { if (pn_ == 2 && useMulx_) {
StackFrame sf(this, 2, 7 | UseRDX); StackFrame sf(this, 2, 7 | UseRDX);
@ -2332,7 +2405,6 @@ private:
} }
void3u gen_fpDbl_mulPre() void3u gen_fpDbl_mulPre()
{ {
align(16);
void3u func = getCurr<void3u>(); void3u func = getCurr<void3u>();
if (pn_ == 2 && useMulx_) { if (pn_ == 2 && useMulx_) {
StackFrame sf(this, 3, 5 | UseRDX); StackFrame sf(this, 3, 5 | UseRDX);
@ -2630,7 +2702,7 @@ private:
mov(rax, px); mov(rax, px);
// px is free frome here // px is free frome here
load_mp(vv, rax, t); // v = x load_mp(vv, rax, t); // v = x
mov(rax, pL_); lea(rax, ptr[rip+pL_]);
load_mp(uu, rax, t); // u = p_ load_mp(uu, rax, t); // u = p_
// k = 0 // k = 0
xor_(rax, rax); xor_(rax, rax);
@ -2708,7 +2780,7 @@ private:
const Reg64& t2 = ss.getReg(0); const Reg64& t2 = ss.getReg(0);
const Reg64& t3 = rdx; const Reg64& t3 = rdx;
mov(t2, pL_); lea(t2, ptr[rip+pL_]);
if (isFullBit_) { if (isFullBit_) {
mov(t, ptr [rTop]); mov(t, ptr [rTop]);
test(t, t); test(t, t);
@ -3373,7 +3445,6 @@ private:
// if (pn_ != 4 && !(pn_ == 6 && useMulx_ && useAdx_)) return 0; // if (pn_ != 4 && !(pn_ == 6 && useMulx_ && useAdx_)) return 0;
// almost same for pn_ == 6 // almost same for pn_ == 6
if (pn_ != 4) return 0; if (pn_ != 4) return 0;
align(16);
void3u func = getCurr<void3u>(); void3u func = getCurr<void3u>();
const RegExp z = rsp + 0 * 8; const RegExp z = rsp + 0 * 8;
@ -3438,7 +3509,6 @@ private:
// if (pn_ != 4 && !(pn_ == 6 && useMulx_ && useAdx_)) return 0; // if (pn_ != 4 && !(pn_ == 6 && useMulx_ && useAdx_)) return 0;
// almost same for pn_ == 6 // almost same for pn_ == 6
if (pn_ != 4) return 0; if (pn_ != 4) return 0;
align(16);
void2u func = getCurr<void2u>(); void2u func = getCurr<void2u>();
// almost same for pn_ == 6 // almost same for pn_ == 6
if (pn_ != 4) return 0; if (pn_ != 4) return 0;
@ -3524,7 +3594,6 @@ private:
} }
void3u gen_fp2_add() void3u gen_fp2_add()
{ {
align(16);
void3u func = getCurr<void3u>(); void3u func = getCurr<void3u>();
if (pn_ == 4 && !isFullBit_) { if (pn_ == 4 && !isFullBit_) {
gen_fp2_add4(); gen_fp2_add4();
@ -3538,7 +3607,6 @@ private:
} }
void3u gen_fp2_sub() void3u gen_fp2_sub()
{ {
align(16);
void3u func = getCurr<void3u>(); void3u func = getCurr<void3u>();
if (pn_ == 4 && !isFullBit_) { if (pn_ == 4 && !isFullBit_) {
gen_fp2_sub4(); gen_fp2_sub4();
@ -3584,7 +3652,7 @@ private:
} }
} }
sub_rr(a, b); sub_rr(a, b);
mov(rax, pL_); lea(rax, ptr[rip+pL_]);
load_rm(b, rax); load_rm(b, rax);
sbb(rax, rax); sbb(rax, rax);
for (int i = 0; i < pn_; i++) { for (int i = 0; i < pn_; i++) {
@ -3592,7 +3660,7 @@ private:
} }
add_rr(a, b); add_rr(a, b);
store_mr(py, a); store_mr(py, a);
mov(rax, pL_); lea(rax, ptr[rip+pL_]);
mov_rr(a, t); mov_rr(a, t);
sub_rm(t, rax); sub_rm(t, rax);
cmovc_rr(t, a); cmovc_rr(t, a);
@ -3610,7 +3678,7 @@ private:
mov_rr(b, a); mov_rr(b, a);
add_rm(b, px + FpByte_); add_rm(b, px + FpByte_);
sub_rm(a, px + FpByte_); sub_rm(a, px + FpByte_);
mov(rax, pL_); lea(rax, ptr[rip+pL_]);
jnc("@f"); jnc("@f");
add_rm(a, rax); add_rm(a, rax);
L("@@"); L("@@");
@ -3624,7 +3692,6 @@ private:
{ {
if (isFullBit_) return 0; if (isFullBit_) return 0;
if (op_->xi_a != 1) return 0; if (op_->xi_a != 1) return 0;
align(16);
void2u func = getCurr<void2u>(); void2u func = getCurr<void2u>();
if (pn_ == 4) { if (pn_ == 4) {
gen_fp2_mul_xi4(); gen_fp2_mul_xi4();
@ -3638,7 +3705,6 @@ private:
} }
void2u gen_fp2_neg() void2u gen_fp2_neg()
{ {
align(16);
void2u func = getCurr<void2u>(); void2u func = getCurr<void2u>();
if (pn_ <= 6) { if (pn_ <= 6) {
StackFrame sf(this, 2, UseRDX | pn_); StackFrame sf(this, 2, UseRDX | pn_);
@ -3652,7 +3718,6 @@ private:
{ {
if (isFullBit_) return 0; if (isFullBit_) return 0;
if (pn_ != 4 && !(pn_ == 6 && useMulx_ && useAdx_)) return 0; if (pn_ != 4 && !(pn_ == 6 && useMulx_ && useAdx_)) return 0;
align(16);
void3u func = getCurr<void3u>(); void3u func = getCurr<void3u>();
bool embedded = pn_ == 4; bool embedded = pn_ == 4;
@ -3729,7 +3794,6 @@ private:
{ {
if (isFullBit_) return 0; if (isFullBit_) return 0;
if (pn_ != 4 && !(pn_ == 6 && useMulx_ && useAdx_)) return 0; if (pn_ != 4 && !(pn_ == 6 && useMulx_ && useAdx_)) return 0;
align(16);
void2u func = getCurr<void2u>(); void2u func = getCurr<void2u>();
const RegExp y = rsp + 0 * 8; const RegExp y = rsp + 0 * 8;
@ -3789,7 +3853,7 @@ private:
mov(ptr [(RegExp)t2 + i * 8], rax); mov(ptr [(RegExp)t2 + i * 8], rax);
} }
// t3 = a + p - b // t3 = a + p - b
mov(rax, pL_); lea(rax, ptr[rip+pL_]);
add_rm(a, rax); add_rm(a, rax);
sub_rr(a, b); sub_rr(a, b);
store_mr(t3, a); store_mr(t3, a);

@ -0,0 +1,92 @@
#pragma once
/**
@file
@brief declarations of the pre-assembled (static) Fp/Fr functions and setStaticCode(), which binds them to Op
@author MITSUNARI Shigeo(@herumi)
@license modified new BSD license
http://opensource.org/licenses/BSD-3-Clause
*/
#ifndef MCL_STATIC_CODE
#error "define MCL_STATIC_CODE"
#endif
namespace mcl { namespace fp {
/*
	Functions provided by the statically assembled object file.
	The names follow the "mclx_" + suf + name scheme used by setFuncInfo() in
	fp_generator.hpp, where suf is "Fp" or "Fr"; they are declared with C linkage
	so that they match the unmangled `global` labels written by the code dumper.
*/
extern "C" {
// Fp functions (bound by setStaticCode when op.xi_a is nonzero)
Unit mclx_Fp_addPre(Unit*, const Unit*, const Unit*);
Unit mclx_Fp_subPre(Unit*, const Unit*, const Unit*);
void mclx_Fp_add(Unit*, const Unit*, const Unit*);
void mclx_Fp_sub(Unit*, const Unit*, const Unit*);
void mclx_Fp_shr1(Unit*, const Unit*);
void mclx_Fp_neg(Unit*, const Unit*);
void mclx_FpDbl_mod(Unit*, const Unit*);
void mclx_Fp_mul(Unit*, const Unit*, const Unit*);
void mclx_Fp_sqr(Unit*, const Unit*);
// FpDbl functions
void mclx_FpDbl_add(Unit*, const Unit*, const Unit*);
void mclx_FpDbl_sub(Unit*, const Unit*, const Unit*);
Unit mclx_FpDbl_addPre(Unit*, const Unit*, const Unit*);
Unit mclx_FpDbl_subPre(Unit*, const Unit*, const Unit*);
void mclx_FpDbl_mulPre(Unit*, const Unit*, const Unit*);
void mclx_FpDbl_sqrPre(Unit*, const Unit*);
// Fp2 functions
void mclx_Fp2_add(Unit*, const Unit*, const Unit*);
void mclx_Fp2_sub(Unit*, const Unit*, const Unit*);
void mclx_Fp2_neg(Unit*, const Unit*);
void mclx_Fp2_mul(Unit*, const Unit*, const Unit*);
void mclx_Fp2_sqr(Unit*, const Unit*);
void mclx_Fp2_mul_xi(Unit*, const Unit*);
// Fr functions (bound by setStaticCode when op.xi_a is zero)
Unit mclx_Fr_addPre(Unit*, const Unit*, const Unit*);
Unit mclx_Fr_subPre(Unit*, const Unit*, const Unit*);
void mclx_Fr_add(Unit*, const Unit*, const Unit*);
void mclx_Fr_sub(Unit*, const Unit*, const Unit*);
void mclx_Fr_shr1(Unit*, const Unit*);
void mclx_Fr_neg(Unit*, const Unit*);
void mclx_Fr_mul(Unit*, const Unit*, const Unit*);
void mclx_Fr_sqr(Unit*, const Unit*);
int mclx_Fr_preInv(Unit*, const Unit*);
} // extern "C"
/*
	bind the statically assembled mclx_* functions to the function pointers of op
	@param op [inout] ; op.xi_a selects the set: nonzero installs the Fp, FpDbl
	and Fp2 functions, zero installs the Fr functions. Pointers that are not
	assigned in the selected branch are left untouched.
	@note declared inline because this definition lives in a header; without it,
	including the header from a second translation unit is a multiple-definition
	link error.
*/
inline void setStaticCode(mcl::fp::Op& op)
{
	if (op.xi_a) {
		// Fp, sizeof(Fp) = 48, supports Fp2
		op.fp_addPre = mclx_Fp_addPre;
		op.fp_subPre = mclx_Fp_subPre;
		op.fp_addA_ = mclx_Fp_add;
		op.fp_subA_ = mclx_Fp_sub;
		op.fp_shr1 = mclx_Fp_shr1;
		op.fp_negA_ = mclx_Fp_neg;
		op.fpDbl_modA_ = mclx_FpDbl_mod;
		op.fp_mulA_ = mclx_Fp_mul;
		op.fp_sqrA_ = mclx_Fp_sqr;
		op.fpDbl_addA_ = mclx_FpDbl_add;
		op.fpDbl_subA_ = mclx_FpDbl_sub;
		op.fpDbl_addPre = mclx_FpDbl_addPre;
		op.fpDbl_subPre = mclx_FpDbl_subPre;
		op.fpDbl_mulPreA_ = mclx_FpDbl_mulPre;
		op.fpDbl_sqrPreA_ = mclx_FpDbl_sqrPre;
		op.fp2_addA_ = mclx_Fp2_add;
		op.fp2_subA_ = mclx_Fp2_sub;
		op.fp2_negA_ = mclx_Fp2_neg;
		op.fp2_mulNF = 0;
		op.fp2_mulA_ = mclx_Fp2_mul;
		op.fp2_sqrA_ = mclx_Fp2_sqr;
		op.fp2_mul_xiA_ = mclx_Fp2_mul_xi;
	} else {
		// Fr, sizeof(Fr) = 32
		op.fp_addPre = mclx_Fr_addPre;
		op.fp_subPre = mclx_Fr_subPre;
		op.fp_addA_ = mclx_Fr_add;
		op.fp_subA_ = mclx_Fr_sub;
		op.fp_shr1 = mclx_Fr_shr1;
		op.fp_negA_ = mclx_Fr_neg;
		op.fp_mulA_ = mclx_Fr_mul;
		op.fp_sqrA_ = mclx_Fr_sqr;
		op.fp_preInv = mclx_Fr_preInv;
	}
	// fp_mul is the same routine as fp_mulA_, viewed through the void4u signature
	op.fp_mul = fp::func_ptr_cast<void4u>(op.fp_mulA_);
}
} } // mcl::fp

@ -16,8 +16,10 @@
#endif #endif
#ifndef MCL_LLVM_BMI2 #ifndef MCL_LLVM_BMI2
#if (CYBOZU_HOST == CYBOZU_HOST_INTEL) && defined(MCL_USE_XBYAK) && !defined(MCL_USE_VINT) #if (CYBOZU_HOST == CYBOZU_HOST_INTEL) && !defined(MCL_STATIC_CODE) && !defined(MCL_USE_VINT)
#define MCL_LLVM_BMI2 1 #define MCL_LLVM_BMI2 1
#else
#define MCL_LLVM_BMI2 0
#endif #endif
#endif #endif

@ -100,6 +100,19 @@ void testBench(const G1& P, const G2& Q)
CYBOZU_BENCH_C("Fp::mul ", C3, Fp::mul, x, x, y); CYBOZU_BENCH_C("Fp::mul ", C3, Fp::mul, x, x, y);
CYBOZU_BENCH_C("Fp::sqr ", C3, Fp::sqr, x, x); CYBOZU_BENCH_C("Fp::sqr ", C3, Fp::sqr, x, x);
CYBOZU_BENCH_C("Fp::inv ", C3, Fp::inv, x, x); CYBOZU_BENCH_C("Fp::inv ", C3, Fp::inv, x, x);
CYBOZU_BENCH_C("Fp::pow ", C3, Fp::pow, x, x, y);
{
Fr a, b, c;
a.setHashOf("abc", 3);
b.setHashOf("123", 3);
CYBOZU_BENCH_C("Fr::add ", C3, Fr::add, a, a, b);
CYBOZU_BENCH_C("Fr::sub ", C3, Fr::sub, a, a, b);
CYBOZU_BENCH_C("Fr::neg ", C3, Fr::neg, a, a);
CYBOZU_BENCH_C("Fr::mul ", C3, Fr::mul, a, a, b);
CYBOZU_BENCH_C("Fr::sqr ", C3, Fr::sqr, a, a);
CYBOZU_BENCH_C("Fr::inv ", C3, Fr::inv, a, a);
CYBOZU_BENCH_C("Fr::pow ", C3, Fr::pow, a, a, b);
}
Fp2 xx, yy; Fp2 xx, yy;
xx.a = x; xx.a = x;
xx.b = 3; xx.b = 3;

@ -688,6 +688,8 @@ CYBOZU_TEST_AUTO(multi)
G1 P; G1 P;
G2 Q; G2 Q;
int i; int i;
#ifndef MCL_STATIC_CODE
puts("BN254"); puts("BN254");
testCurve(mcl::BN254); testCurve(mcl::BN254);
i = 1; i = 1;
@ -695,6 +697,7 @@ CYBOZU_TEST_AUTO(multi)
CYBOZU_BENCH_C("naiveG2", 100, (BN::param.mapTo.naiveMapTo<G1, Fp>), P, i++); CYBOZU_BENCH_C("naiveG2", 100, (BN::param.mapTo.naiveMapTo<G1, Fp>), P, i++);
CYBOZU_BENCH_C("calcBN2", 100, (BN::param.mapTo.calcBN<G2, Fp2>), Q, i++); CYBOZU_BENCH_C("calcBN2", 100, (BN::param.mapTo.calcBN<G2, Fp2>), Q, i++);
CYBOZU_BENCH_C("naiveG2", 100, (BN::param.mapTo.naiveMapTo<G2, Fp2>), Q, i++); CYBOZU_BENCH_C("naiveG2", 100, (BN::param.mapTo.naiveMapTo<G2, Fp2>), Q, i++);
#endif
puts("BLS12_381"); puts("BLS12_381");
testCurve(mcl::BLS12_381); testCurve(mcl::BLS12_381);
i = 1; i = 1;
@ -861,7 +864,11 @@ int main(int argc, char *argv[])
return 1; return 1;
} }
g_mode = mcl::fp::StrToMode(mode); g_mode = mcl::fp::StrToMode(mode);
#ifdef MCL_STATIC_CODE
printf("static code for BLS12-381\n");
#else
printf("JIT %d\n", mcl::fp::isEnableJIT()); printf("JIT %d\n", mcl::fp::isEnableJIT());
#endif
#if 0 #if 0
initPairing(mcl::BLS12_381); initPairing(mcl::BLS12_381);
cybozu::XorShift rg; cybozu::XorShift rg;

@ -602,7 +602,7 @@ void test_sub(const mcl::EcParam *para, size_t paraNum)
test_sub_sub(para[i], mcl::fp::FP_LLVM); test_sub_sub(para[i], mcl::fp::FP_LLVM);
test_sub_sub(para[i], mcl::fp::FP_LLVM_MONT); test_sub_sub(para[i], mcl::fp::FP_LLVM_MONT);
#endif #endif
#ifdef MCL_USE_XBYAK #ifdef MCL_X64_ASM
test_sub_sub(para[i], mcl::fp::FP_XBYAK); test_sub_sub(para[i], mcl::fp::FP_XBYAK);
#endif #endif
mulVec(para[i]); mulVec(para[i]);

@ -876,7 +876,7 @@ void modpTest()
} }
#include <iostream> #include <iostream>
#if (defined(MCL_USE_LLVM) || defined(MCL_USE_XBYAK)) && (MCL_MAX_BIT_SIZE >= 521) #if (defined(MCL_USE_LLVM) || defined(MCL_X64_ASM)) && (MCL_MAX_BIT_SIZE >= 521)
CYBOZU_TEST_AUTO(mod_NIST_P521) CYBOZU_TEST_AUTO(mod_NIST_P521)
{ {
const size_t len = 521; const size_t len = 521;
@ -908,7 +908,7 @@ CYBOZU_TEST_AUTO(mod_NIST_P521)
mcl_fpDbl_mod_NIST_P521L(ex, in, Fp::getOp().p); mcl_fpDbl_mod_NIST_P521L(ex, in, Fp::getOp().p);
CYBOZU_TEST_EQUAL_ARRAY(ex, ok, N + 1); CYBOZU_TEST_EQUAL_ARRAY(ex, ok, N + 1);
#endif #endif
#ifdef MCL_USE_XBYAK #ifdef MCL_X64_ASM
const mcl::fp::Op& op = Fp::getOp(); const mcl::fp::Op& op = Fp::getOp();
if (!op.isMont) { if (!op.isMont) {
op.fpDbl_mod(ex, in, op.p); op.fpDbl_mod(ex, in, op.p);
@ -1014,7 +1014,7 @@ CYBOZU_TEST_AUTO(main)
sub(mcl::fp::FP_LLVM_MONT); sub(mcl::fp::FP_LLVM_MONT);
} }
#endif #endif
#ifdef MCL_USE_XBYAK #ifdef MCL_X64_ASM
if (g_mode.empty() || g_mode == "xbyak") { if (g_mode.empty() || g_mode == "xbyak") {
sub(mcl::fp::FP_XBYAK); sub(mcl::fp::FP_XBYAK);
} }

@ -465,7 +465,7 @@ void testAll()
test(p, mcl::fp::FP_LLVM); test(p, mcl::fp::FP_LLVM);
test(p, mcl::fp::FP_LLVM_MONT); test(p, mcl::fp::FP_LLVM_MONT);
#endif #endif
#ifdef MCL_USE_XBYAK #ifdef MCL_X64_ASM
test(p, mcl::fp::FP_XBYAK); test(p, mcl::fp::FP_XBYAK);
#endif #endif
} }

@ -0,0 +1,70 @@
#include <cybozu/test.hpp>
#include <mcl/bls12_381.hpp>
using namespace mcl::bn;
// Initialize the pairing library for the BLS12-381 curve.
// NOTE(review): the remaining test cases presumably rely on this running first
// (declaration order) — confirm against cybozu/test.hpp.
CYBOZU_TEST_AUTO(init)
{
initPairing(mcl::BLS12_381);
}
// Smoke test of Fr addition, subtraction and multiplication on small integers.
CYBOZU_TEST_AUTO(Fr)
{
	Fr x;
	x = 3;
	Fr y;
	y = 5;
	// results are compared against plain integer literals
	CYBOZU_TEST_EQUAL(x + y, 8);
	CYBOZU_TEST_EQUAL(x - y, -2);
	CYBOZU_TEST_EQUAL(x * y, 15);
}
// Smoke test of Fp addition, subtraction and multiplication on small integers.
CYBOZU_TEST_AUTO(Fp)
{
	Fp x;
	x = 3;
	Fp y;
	y = 5;
	// results are compared against plain integer literals
	CYBOZU_TEST_EQUAL(x + y, 8);
	CYBOZU_TEST_EQUAL(x - y, -2);
	CYBOZU_TEST_EQUAL(x * y, 15);
}
// Smoke test of Fp2 addition, subtraction and multiplication.
CYBOZU_TEST_AUTO(Fp2)
{
	// x = 3 + 2i, y = 1 + 4i (first argument is .a, second is .b)
	Fp2 x(3, 2);
	Fp2 y(1, 4);
	CYBOZU_TEST_EQUAL(x + y, Fp2(4, 6));
	CYBOZU_TEST_EQUAL(x - y, Fp2(2, -2));
	/*
		(3+2i)(1+4i) = (3*1 - 2*4) + (3*4 + 2*1)i = -5 + 14i
	*/
	CYBOZU_TEST_EQUAL(x * y, Fp2(-5, 14));
}
// Check on G1 that multiplying by -r gives the same point as negating r * P.
CYBOZU_TEST_AUTO(G1)
{
	G1 P;
	hashAndMapToG1(P, "abc", 3);
	// r1 is a scalar derived from a hash; r2 is its negation
	Fr r1;
	r1.setHashOf("abc", 3);
	Fr r2;
	r2 = -r1;
	// Q = -(r1 * P)
	G1 Q;
	G1::mul(Q, P, r1);
	Q = -Q;
	// P = r2 * P
	P *= r2;
	CYBOZU_TEST_EQUAL(P, Q);
}
// Check on G2 that multiplying by -r gives the same point as negating r * P.
CYBOZU_TEST_AUTO(G2)
{
	G2 P;
	hashAndMapToG2(P, "abc", 3);
	// r1 is a scalar derived from a hash; r2 is its negation
	Fr r1;
	r1.setHashOf("abc", 3);
	Fr r2;
	r2 = -r1;
	// Q = -(r1 * P)
	G2 Q;
	G2::mul(Q, P, r1);
	Q = -Q;
	// P = r2 * P
	P *= r2;
	CYBOZU_TEST_EQUAL(P, Q);
}
Loading…
Cancel
Save