Merge branch 'dev'

update-fork
MITSUNARI Shigeo 4 years ago
commit b390d6d4ff
  1. 27
      Makefile
  2. 2
      common.mk
  3. 6
      include/mcl/bn.hpp
  4. 7
      include/mcl/op.hpp
  5. 4
      sample/bench.cpp
  6. 2
      sample/rawbench.cpp
  7. 7
      src/dump_code.cpp
  8. 25
      src/fp.cpp
  9. 228
      src/fp_generator.hpp
  10. 92
      src/fp_static_code.hpp
  11. 4
      src/low_func.hpp
  12. 13
      test/bench.hpp
  13. 7
      test/bls12_test.cpp
  14. 2
      test/ec_test.cpp
  15. 6
      test/fp_test.cpp
  16. 2
      test/fp_tower_test.cpp
  17. 70
      test/static_code_test.cpp

@ -11,6 +11,13 @@ TEST_SRC+=bls12_test.cpp
TEST_SRC+=mapto_wb19_test.cpp
TEST_SRC+=ecdsa_c_test.cpp
TEST_SRC+=modp_test.cpp
# Static-code configuration (make MCL_STATIC_CODE=1):
# turns MCL_USE_XBYAK off, fixes the maximum bit size to 384 and passes
# -DMCL_STATIC_CODE to the compiler.
ifeq ($(MCL_STATIC_CODE),1)
MCL_USE_XBYAK=0
MCL_MAX_BIT_SIZE=384
CFLAGS+=-DMCL_STATIC_CODE
# Plain '=' on purpose: both lists are reset here, so anything assigned to
# LIB_OBJ / TEST_SRC before this point is discarded in static-code mode.
LIB_OBJ=obj/static_code.o
TEST_SRC=bls12_test.cpp
endif
ifeq ($(CPU),x86-64)
MCL_USE_XBYAK?=1
TEST_SRC+=mont_fp_test.cpp sq_test.cpp
@ -86,7 +93,7 @@ ifneq ($(CPU),)
ASM_SRC=$(ASM_SRC_PATH_NAME).s
endif
ASM_OBJ=$(OBJ_DIR)/$(CPU).o
LIB_OBJ=$(OBJ_DIR)/fp.o
LIB_OBJ+=$(OBJ_DIR)/fp.o
BN256_OBJ=$(OBJ_DIR)/bn_c256.o
BN384_OBJ=$(OBJ_DIR)/bn_c384.o
BN384_256_OBJ=$(OBJ_DIR)/bn_c384_256.o
@ -106,7 +113,9 @@ ifeq ($(MCL_USE_LLVM),1)
LIB_OBJ+=$(ASM_OBJ)
# special case for intel with bmi2
ifeq ($(INTEL),1)
LIB_OBJ+=$(OBJ_DIR)/$(CPU).bmi2.o
ifneq ($(MCL_STATIC_CODE),1)
LIB_OBJ+=$(OBJ_DIR)/$(CPU).bmi2.o
endif
endif
endif
LLVM_SRC=src/base$(BIT).ll
@ -237,6 +246,18 @@ endif
$(GEN_EXE): src/gen.cpp src/llvm_gen.hpp
$(CXX) -o $@ $< $(CFLAGS)
# Build the code-dump tool: dump_code.cpp and fp.cpp compiled with -DMCL_DUMP_JIT
# (fixed 384-bit / 64-bit-unit / VINT configuration, no OpenSSL).
src/dump_code: src/dump_code.cpp src/fp.cpp src/fp_generator.hpp
$(CXX) -o $@ src/dump_code.cpp src/fp.cpp -g -I include -DMCL_DUMP_JIT -DMCL_MAX_BIT_SIZE=384 -DMCL_DONT_USE_OPENSSL -DMCL_USE_VINT -DMCL_SIZEOF_UNIT=8 -DMCL_VINT_FIXED_BUFFER
# Generate the NASM source by capturing the standard output of the dump tool.
# The output goes to a temporary file and is renamed only on success, so a
# failing or interrupted dump_code run can never leave a truncated
# src/static_code.asm that looks up to date to later builds.
src/static_code.asm: src/dump_code
	$< > $@.tmp || { $(RM) $@.tmp; exit 1; }
	mv -f $@.tmp $@
# Assemble the generated source with NASM into a 64-bit ELF object (-felf64).
obj/static_code.o: src/static_code.asm
nasm -felf64 -o $@ $<
# Stand-alone test binary: the test source, fp.cpp and the assembled static code
# are compiled and linked in one step ($^), with -DMCL_DONT_USE_XBYAK and
# -DMCL_STATIC_CODE so that the pre-assembled routines are used.
bin/static_code_test.exe: test/static_code_test.cpp src/fp.cpp obj/static_code.o
$(CXX) -o $@ -O3 $^ -g -DMCL_DONT_USE_XBYAK -DMCL_STATIC_CODE -DMCL_MAX_BIT_SIZE=384 -DMCL_DONT_USE_OPENSSL -DMCL_USE_VINT -DMCL_SIZEOF_UNIT=8 -DMCL_VINT_FIXED_BUFFER -I include -Wall -Wextra
asm: $(LLVM_SRC)
$(LLVM_OPT) -O3 -o - $(LLVM_SRC) | $(LLVM_LLC) -O3 $(LLVM_FLAGS) -x86-asm-syntax=intel
@ -388,7 +409,7 @@ update_cybozulib:
cp -a $(addprefix ../cybozulib/,$(wildcard include/cybozu/*.hpp)) include/cybozu/
clean:
$(RM) $(LIB_DIR)/*.a $(LIB_DIR)/*.$(LIB_SUF) $(OBJ_DIR)/*.o $(OBJ_DIR)/*.obj $(OBJ_DIR)/*.d $(EXE_DIR)/*.exe $(GEN_EXE) $(ASM_OBJ) $(LIB_OBJ) $(BN256_OBJ) $(BN384_OBJ) $(BN512_OBJ) $(FUNC_LIST) src/*.ll lib/*.a
$(RM) $(LIB_DIR)/*.a $(LIB_DIR)/*.$(LIB_SUF) $(OBJ_DIR)/*.o $(OBJ_DIR)/*.obj $(OBJ_DIR)/*.d $(EXE_DIR)/*.exe $(GEN_EXE) $(ASM_OBJ) $(LIB_OBJ) $(BN256_OBJ) $(BN384_OBJ) $(BN512_OBJ) $(FUNC_LIST) src/*.ll lib/*.a src/static_code.asm src/dump_code
ALL_SRC=$(SRC_SRC) $(TEST_SRC) $(SAMPLE_SRC)
DEPEND_FILE=$(addprefix $(OBJ_DIR)/, $(addsuffix .d,$(basename $(ALL_SRC))))

@ -91,7 +91,7 @@ else
CFLAGS_OPT+=$(MARCH)
endif
endif
CFLAGS_WARN=-Wall -Wextra -Wformat=2 -Wcast-qual -Wcast-align -Wwrite-strings -Wfloat-equal -Wpointer-arith
CFLAGS_WARN=-Wall -Wextra -Wformat=2 -Wcast-qual -Wcast-align -Wwrite-strings -Wfloat-equal -Wpointer-arith -Wundef
CFLAGS+=-g3
INC_OPT=-I include -I test
CFLAGS+=$(CFLAGS_WARN) $(BIT_OPT) $(INC_OPT)

@ -854,6 +854,12 @@ struct Param {
{
this->cp = cp;
isBLS12 = cp.curveType == MCL_BLS12_381;
#ifdef MCL_STATIC_CODE
if (!isBLS12) {
*pb = false;
return;
}
#endif
gmp::setStr(pb, z, cp.z);
if (!*pb) return;
isNegative = z < 0;

@ -16,6 +16,9 @@
#endif
#if !defined(MCL_DONT_USE_XBYAK) && (defined(_WIN64) || defined(__x86_64__)) && (MCL_SIZEOF_UNIT == 8)
#define MCL_USE_XBYAK
#endif
#if defined(MCL_USE_XBYAK) || defined(MCL_STATIC_CODE)
#define MCL_X64_ASM
#define MCL_XBYAK_DIRECT_CALL
#endif
@ -202,6 +205,8 @@ struct Op {
Unit R3[maxUnitSize];
#ifdef MCL_USE_XBYAK
FpGenerator *fg;
#endif
#ifdef MCL_X64_ASM
mcl::Array<Unit> invTbl;
#endif
void3u fp_addA_;
@ -288,7 +293,7 @@ struct Op {
memset(one, 0, sizeof(one));
memset(R2, 0, sizeof(R2));
memset(R3, 0, sizeof(R3));
#ifdef MCL_USE_XBYAK
#ifdef MCL_X64_ASM
invTbl.clear();
#endif
fp_addA_ = 0;

@ -68,7 +68,7 @@ void benchFp(size_t bitSize, int mode)
if (mode & 4) benchFpSub(tbl[i].p, tbl[i].x, tbl[i].y, mcl::fp::FP_LLVM);
if (mode & 8) benchFpSub(tbl[i].p, tbl[i].x, tbl[i].y, mcl::fp::FP_LLVM_MONT);
#endif
#ifdef MCL_USE_XBYAK
#ifdef MCL_X64_ASM
if (mode & 16) benchFpSub(tbl[i].p, tbl[i].x, tbl[i].y, mcl::fp::FP_XBYAK);
#endif
}
@ -122,7 +122,7 @@ void benchEc(size_t bitSize, int mode, mcl::ec::Mode ecMode)
if (mode & 4) benchEcSub(tbl[i], mcl::fp::FP_LLVM, ecMode);
if (mode & 8) benchEcSub(tbl[i], mcl::fp::FP_LLVM_MONT, ecMode);
#endif
#ifdef MCL_USE_XBYAK
#ifdef MCL_X64_ASM
if (mode & 16) benchEcSub(tbl[i], mcl::fp::FP_XBYAK, ecMode);
#endif
}

@ -168,7 +168,7 @@ int main(int argc, char *argv[])
benchRaw(tbl[i], mcl::fp::FP_LLVM);
benchRaw(tbl[i], mcl::fp::FP_LLVM_MONT);
#endif
#ifdef MCL_USE_XBYAK
#ifdef MCL_X64_ASM
if (bitSize <= 384) {
benchRaw(tbl[i], mcl::fp::FP_XBYAK);
}

@ -0,0 +1,7 @@
#include <mcl/bls12_381.hpp>
/*
	Entry point of the dump_code tool.
	It only initializes the BLS12-381 pairing; presumably, when built with
	MCL_DUMP_JIT, that initialization is what makes the code generator print
	the generated routines to stdout (confirm against FpGenerator).
*/
int main()
{
	mcl::bn::initPairing(mcl::BLS12_381);
	return 0;
}

@ -3,12 +3,14 @@
#include <cybozu/sha2.hpp>
#include <cybozu/endian.hpp>
#include <mcl/conversion.hpp>
#ifdef MCL_STATIC_CODE
#include "fp_static_code.hpp"
#endif
#ifdef MCL_USE_XBYAK
#include "fp_generator.hpp"
#else
#define XBYAK_ONLY_CLASS_CPU
#include "xbyak/xbyak_util.h"
//#include "detect_cpu.hpp"
#endif
#include "low_func.hpp"
#ifdef MCL_USE_LLVM
@ -315,7 +317,7 @@ void setOp(Op& op, Mode mode)
#endif
}
#ifdef MCL_USE_XBYAK
#ifdef MCL_X64_ASM
inline void invOpForMontC(Unit *y, const Unit *x, const Op& op)
{
Unit r[maxUnitSize];
@ -372,6 +374,12 @@ static bool initForMont(Op& op, const Unit *p, Mode mode)
op.fp_invOp = &invOpForMontC;
initInvTbl(op);
}
#elif defined(MCL_STATIC_CODE)
fp::setStaticCode(op);
if (op.isMont && N <= 4) {
op.fp_invOp = &invOpForMontC;
initInvTbl(op);
}
#endif
return true;
}
@ -403,14 +411,25 @@ bool Op::init(const mpz_class& _p, size_t maxBitSize, int _xi_a, Mode mode, size
priority : MCL_USE_XBYAK > MCL_USE_LLVM > none
Xbyak > llvm_mont > llvm > gmp_mont > gmp
*/
#ifdef MCL_USE_XBYAK
#ifdef MCL_X64_ASM
if (mode == FP_AUTO) mode = FP_XBYAK;
if (mode == FP_XBYAK && bitSize > 384) {
mode = FP_AUTO;
}
#ifdef MCL_USE_XBYAK
	if (!isEnableJIT()) {
		mode = FP_AUTO;
	}
#elif defined(MCL_STATIC_CODE)
	// defined() is used (not the bare macro value) so that this branch is also
	// taken when MCL_STATIC_CODE is defined without a value, matching the other
	// MCL_STATIC_CODE checks.
	{
		// the static code uses avx, mulx (BMI2), adox and adcx (ADX);
		// fall back to automatic selection when the CPU lacks any of them
		using namespace Xbyak::util;
		Cpu cpu;
		if (!(cpu.has(Cpu::tAVX) && cpu.has(Cpu::tBMI2) && cpu.has(Cpu::tADX))) {
			mode = FP_AUTO;
		}
	}
#endif
#else
if (mode == FP_XBYAK) mode = FP_AUTO;
#endif

@ -7,7 +7,6 @@
http://opensource.org/licenses/BSD-3-Clause
*/
#if CYBOZU_HOST == CYBOZU_HOST_INTEL
#define XBYAK_NO_OP_NAMES
#define XBYAK_DISABLE_AVX512
#include "xbyak/xbyak_util.h"
@ -25,6 +24,61 @@
namespace mcl {
#ifdef MCL_DUMP_JIT
/*
	Writes generated machine code and data as NASM source text.
	Output goes to fp_, which the constructor points at stdout.
*/
struct DumpCode {
	FILE *fp_;
	DumpCode()
		: fp_(stdout)
	{
	}
	/*
		Emit size bytes starting at begin as a global, 16-byte aligned label
		called name, using "db" lines of at most 16 bytes each.
	*/
	void set(const std::string& name, const uint8_t *begin, const size_t size)
	{
		const char *label = name.c_str();
		fprintf(fp_, "segment .text\nglobal %s\nalign 16\n%s:\n", label, label);
		const size_t bytesPerLine = 16;
		for (size_t done = 0; done < size; ) {
			const size_t left = size - done;
			const size_t chunk = left < bytesPerLine ? left : bytesPerLine;
			fprintf(fp_, "db ");
			for (size_t i = 0; i < chunk; i++) {
				fprintf(fp_, "0x%02x,", begin[done + i]);
			}
			fprintf(fp_, "\n");
			done += chunk;
		}
	}
	/*
		Emit the 64-bit words in [begin, end) as one 16-byte aligned "dq" line.
	*/
	void dumpData(const void *begin, const void *end)
	{
		const uint64_t *words = (const uint64_t*)begin;
		const size_t count = (const uint64_t*)end - words;
		fprintf(fp_, "align 16\ndq ");
		for (size_t i = 0; i != count; i++) {
			fprintf(fp_, "0x%016llx,", (unsigned long long)words[i]);
		}
		fprintf(fp_, "\n");
	}
};
/*
	Dump one generated function under the symbol "mclx_" + suf + name.
	begin is the start of the function and end is the address just after it;
	a null suf is treated as an empty string.
*/
template<class T>
void setFuncInfo(DumpCode& prof, const char *suf, const char *name, const T& begin, const uint8_t* end)
{
	const uint8_t *const top = (const uint8_t*)begin;
	std::string symbol("mclx_");
	if (suf != 0) symbol += suf;
	symbol += name;
	prof.set(symbol, top, end - top);
}
#else
/*
	Register one generated function with the profiler under the name
	"mclx_" + suf + name.
	begin is the start of the function and end is the address just after it;
	a null suf is treated as an empty string.
*/
template<class T>
void setFuncInfo(Xbyak::util::Profiler& prof, const char *suf, const char *name, const T& begin, const uint8_t* end)
{
	const uint8_t *const top = (const uint8_t*)begin;
	std::string symbol("mclx_");
	if (suf != 0) symbol += suf;
	symbol += name;
	prof.set(symbol.c_str(), top, end - top);
}
#endif
namespace fp_gen_local {
class MemReg {
@ -203,7 +257,11 @@ struct FpGenerator : Xbyak::CodeGenerator {
int pn_;
int FpByte_;
bool isFullBit_;
#ifdef MCL_DUMP_JIT
DumpCode prof_;
#else
Xbyak::util::Profiler prof_;
#endif
/*
@param op [in] ; use op.p, op.N, op.isFullBit
@ -257,19 +315,22 @@ struct FpGenerator : Xbyak::CodeGenerator {
private:
void init_inner(Op& op)
{
const char *suf = op.xi_a ? "Fp" : "Fr";
op_ = &op;
L(pL_);
p_ = reinterpret_cast<const uint64_t*>(getCurr());
for (size_t i = 0; i < op.N; i++) {
dq(op.p[i]);
}
#ifdef MCL_DUMP_JIT
prof_.dumpData(p_, getCurr());
#endif
rp_ = fp::getMontgomeryCoeff(p_[0]);
pn_ = (int)op.N;
FpByte_ = int(op.maxN * sizeof(uint64_t));
isFullBit_ = op.isFullBit;
// printf("p=%p, pn_=%d, isFullBit_=%d\n", p_, pn_, isFullBit_);
#ifdef MCL_USE_PROF
static char suf[] = "_0";
int profMode = 0;
#ifdef XBYAK_USE_VTUNE
profMode = 2;
@ -281,94 +342,116 @@ private:
if (profMode) {
prof_.init(profMode);
prof_.setStartAddr(getCurr());
prof_.setNameSuffix(suf);
suf[1]++;
}
#else
(void)suf;
#endif
align(16);
op.fp_addPre = gen_addSubPre(true, pn_);
prof_.set("Fp_addPre", getCurr());
setFuncInfo(prof_, suf, "_addPre", op.fp_addPre, getCurr());
align(16);
op.fp_subPre = gen_addSubPre(false, pn_);
prof_.set("Fp_subPre", getCurr());
setFuncInfo(prof_, suf, "_subPre", op.fp_subPre, getCurr());
align(16);
op.fp_addA_ = gen_fp_add();
prof_.set("Fp_add", getCurr());
setFuncInfo(prof_, suf, "_add", op.fp_addA_, getCurr());
align(16);
op.fp_subA_ = gen_fp_sub();
prof_.set("Fp_sub", getCurr());
setFuncInfo(prof_, suf, "_sub", op.fp_subA_, getCurr());
align(16);
op.fp_shr1 = gen_shr1();
prof_.set("Fp_shr1", getCurr());
setFuncInfo(prof_, suf, "_shr1", op.fp_shr1, getCurr());
align(16);
op.fp_negA_ = gen_fp_neg();
prof_.set("Fp_neg", getCurr());
op.fpDbl_addA_ = gen_fpDbl_add();
prof_.set("FpDbl_add", getCurr());
op.fpDbl_subA_ = gen_fpDbl_sub();
prof_.set("FpDbl_sub", getCurr());
op.fpDbl_addPre = gen_addSubPre(true, pn_ * 2);
prof_.set("FpDbl_addPre", getCurr());
op.fpDbl_subPre = gen_addSubPre(false, pn_ * 2);
prof_.set("FpDbl_subPre", getCurr());
op.fpDbl_mulPreA_ = gen_fpDbl_mulPre();
prof_.set("FpDbl_mulPre", getCurr());
op.fpDbl_sqrPreA_ = gen_fpDbl_sqrPre();
prof_.set("FpDbl_sqrPre", getCurr());
setFuncInfo(prof_, suf, "_neg", op.fp_negA_, getCurr());
align(16);
op.fpDbl_modA_ = gen_fpDbl_mod(op);
prof_.set("FpDbl_mod", getCurr());
setFuncInfo(prof_, suf, "Dbl_mod", op.fpDbl_modA_, getCurr());
align(16);
op.fp_mulA_ = gen_mul();
prof_.set("Fp_mul", getCurr());
setFuncInfo(prof_, suf, "_mul", op.fp_mulA_, getCurr());
if (op.fp_mulA_) {
op.fp_mul = fp::func_ptr_cast<void4u>(op.fp_mulA_); // used in toMont/fromMont
}
align(16);
op.fp_sqrA_ = gen_sqr();
prof_.set("Fp_sqr", getCurr());
setFuncInfo(prof_, suf, "_sqr", op.fp_sqrA_, getCurr());
if (op.primeMode != PM_NIST_P192 && op.N <= 4) { // support general op.N but not fast for op.N > 4
align(16);
op.fp_preInv = getCurr<int2u>();
gen_preInv();
prof_.set("preInv", getCurr());
setFuncInfo(prof_, suf, "_preInv", op.fp_preInv, getCurr());
}
if (op.xi_a == 0) return; // Fp2 is not used
align(16);
op.fpDbl_addA_ = gen_fpDbl_add();
setFuncInfo(prof_, suf, "Dbl_add", op.fpDbl_addA_, getCurr());
align(16);
op.fpDbl_subA_ = gen_fpDbl_sub();
setFuncInfo(prof_, suf, "Dbl_sub", op.fpDbl_subA_, getCurr());
align(16);
op.fpDbl_addPre = gen_addSubPre(true, pn_ * 2);
setFuncInfo(prof_, suf, "Dbl_addPre", op.fpDbl_addPre, getCurr());
align(16);
op.fpDbl_subPre = gen_addSubPre(false, pn_ * 2);
setFuncInfo(prof_, suf, "Dbl_subPre", op.fpDbl_subPre, getCurr());
align(16);
op.fpDbl_mulPreA_ = gen_fpDbl_mulPre();
setFuncInfo(prof_, suf, "Dbl_mulPre", op.fpDbl_mulPreA_, getCurr());
align(16);
op.fpDbl_sqrPreA_ = gen_fpDbl_sqrPre();
setFuncInfo(prof_, suf, "Dbl_sqrPre", op.fpDbl_sqrPreA_, getCurr());
align(16);
op.fp2_addA_ = gen_fp2_add();
prof_.set("Fp2_add", getCurr());
setFuncInfo(prof_, suf, "2_add", op.fp2_addA_, getCurr());
align(16);
op.fp2_subA_ = gen_fp2_sub();
prof_.set("Fp2_sub", getCurr());
setFuncInfo(prof_, suf, "2_sub", op.fp2_subA_, getCurr());
align(16);
op.fp2_negA_ = gen_fp2_neg();
prof_.set("Fp2_neg", getCurr());
setFuncInfo(prof_, suf, "2_neg", op.fp2_negA_, getCurr());
op.fp2_mulNF = 0;
align(16);
op.fp2Dbl_mulPreA_ = gen_fp2Dbl_mulPre();
prof_.set("Fp2Dbl_mulPre", getCurr());
if (op.fp2Dbl_mulPreA_) setFuncInfo(prof_, suf, "2Dbl_mulPre", op.fp2Dbl_mulPreA_, getCurr());
align(16);
op.fp2Dbl_sqrPreA_ = gen_fp2Dbl_sqrPre();
prof_.set("Fp2Dbl_sqrPre", getCurr());
if (op.fp2Dbl_sqrPreA_) setFuncInfo(prof_, suf, "2Dbl_sqrPre", op.fp2Dbl_sqrPreA_, getCurr());
align(16);
op.fp2_mulA_ = gen_fp2_mul();
prof_.set("Fp2_mul", getCurr());
setFuncInfo(prof_, suf, "2_mul", op.fp2_mulA_, getCurr());
align(16);
op.fp2_sqrA_ = gen_fp2_sqr();
prof_.set("Fp2_sqr", getCurr());
setFuncInfo(prof_, suf, "2_sqr", op.fp2_sqrA_, getCurr());
align(16);
op.fp2_mul_xiA_ = gen_fp2_mul_xi();
prof_.set("Fp2_mul_xi", getCurr());
setFuncInfo(prof_, suf, "2_mul_xi", op.fp2_mul_xiA_, getCurr());
}
u3u gen_addSubPre(bool isAdd, int n)
{
// if (isFullBit_) return 0;
align(16);
u3u func = getCurr<u3u>();
StackFrame sf(this, 3);
if (isAdd) {
@ -429,7 +512,7 @@ private:
}
jmp(exit);
L(nonZero);
mov(rax, pL_);
lea(rax, ptr[rip+pL_]);
for (size_t i = 0; i < t.size(); i++) {
mov(rdx, ptr [rax + i * 8]);
if (i == 0) {
@ -557,7 +640,7 @@ private:
mov(*fullReg, 0);
adc(*fullReg, 0);
}
mov(rax, pL_);
lea(rax, ptr[rip+pL_]);
sub_rm(p1, rax);
if (fullReg) {
sbb(*fullReg, 0);
@ -577,7 +660,7 @@ private:
const Pack& p1 = t.sub(pn_, pn_);
load_rm(p0, px);
sub_rm(p0, py, withCarry);
mov(rax, pL_);
lea(rax, ptr[rip+pL_]);
load_rm(p1, rax);
sbb(rax, rax); // rax = (x > y) ? 0 : -1
for (size_t i = 0; i < p1.size(); i++) {
@ -618,7 +701,7 @@ private:
Label exit;
if (isFullBit_) {
jnc("@f");
mov(t2[0], pL_); // t2 is not used
lea(t2[0], ptr[rip+pL_]); // t2[0] is not used
sub_rm(t1, t2[0]);
jmp(exit);
L("@@");
@ -648,7 +731,6 @@ private:
}
void3u gen_fp_add()
{
align(16);
void3u func = getCurr<void3u>();
if (pn_ <= 4) {
gen_fp_add_le4();
@ -666,7 +748,7 @@ private:
inLocalLabel();
gen_raw_add(pz, px, py, rax, pn_);
mov(px, pL_); // destroy px
lea(px, ptr[rip+pL_]);
if (isFullBit_) {
jc(".over", jmpMode);
}
@ -696,7 +778,6 @@ private:
}
void3u gen_fpDbl_add()
{
align(16);
void3u func = getCurr<void3u>();
if (pn_ <= 4) {
int tn = pn_ * 2 + (isFullBit_ ? 1 : 0);
@ -724,7 +805,6 @@ private:
}
void3u gen_fpDbl_sub()
{
align(16);
void3u func = getCurr<void3u>();
if (pn_ <= 4) {
int tn = pn_ * 2;
@ -774,7 +854,6 @@ private:
}
void3u gen_fp_sub()
{
align(16);
void3u func = getCurr<void3u>();
if (pn_ <= 4) {
gen_fp_sub_le4();
@ -792,14 +871,13 @@ private:
Label exit;
gen_raw_sub(pz, px, py, rax, pn_);
jnc(exit, jmpMode);
mov(px, pL_);
lea(px, ptr[rip+pL_]);
gen_raw_add(pz, pz, px, rax, pn_);
L(exit);
return func;
}
void2u gen_fp_neg()
{
align(16);
void2u func = getCurr<void2u>();
StackFrame sf(this, 2, UseRDX | pn_);
gen_raw_neg(sf.p[0], sf.p[1], sf.t);
@ -807,7 +885,6 @@ private:
}
void2u gen_shr1()
{
align(16);
void2u func = getCurr<void2u>();
const int c = 1;
StackFrame sf(this, 2, 1);
@ -828,7 +905,6 @@ private:
}
void3u gen_mul()
{
align(16);
void3u func = getCurr<void3u>();
if (op_->primeMode == PM_NIST_P192) {
StackFrame sf(this, 3, 10 | UseRDX, 8 * 6);
@ -901,7 +977,7 @@ private:
mov(a, rp_);
mul(t6);
mov(t0, pL_);
lea(t0, ptr[rip+pL_]);
mov(t7, a); // q
// [d:t7:t1] = p * q
@ -970,7 +1046,7 @@ private:
mov(a, rp_);
mul(t10);
mov(t0, pL_);
lea(t0, ptr[rip+pL_]);
mov(t7, a); // q
// [d:t7:t2:t1] = p * q
@ -1050,7 +1126,7 @@ private:
mov(a, rp_);
mul(z);
mov(t0, pL_);
lea(t0, ptr[rip+pL_]);
mov(t7, a); // q
// [d:t7:t3:t2:t1] = p * q
@ -1141,7 +1217,6 @@ private:
}
void2u gen_fpDbl_mod(const fp::Op& op)
{
align(16);
void2u func = getCurr<void2u>();
if (op.primeMode == PM_NIST_P192) {
StackFrame sf(this, 2, 6 | UseRDX);
@ -1187,7 +1262,6 @@ private:
}
void2u gen_sqr()
{
align(16);
void2u func = getCurr<void2u>();
if (op_->primeMode == PM_NIST_P192) {
StackFrame sf(this, 3, 10 | UseRDX, 6 * 8);
@ -1308,7 +1382,7 @@ private:
L(fp_mulL);
vmovq(xm0, p0); // save p0
mov(p0, pL_);
lea(p0, ptr[rip+pL_]);
vmovq(xm1, p2);
mov(p2, ptr [p2]);
montgomery4_1(rp_, t0, t7, t3, t2, t1, p1, p2, p0, t4, t5, t6, t8, t9, true, xm2);
@ -1404,7 +1478,7 @@ private:
mov(a, rp_);
mul(c[0]); // q = a
mov(d, a);
mov(t1, pL_);
lea(t1, ptr[rip+pL_]);
// c += p * q
mulAdd(c, 6, t1);
}
@ -1450,7 +1524,7 @@ private:
const Pack z = Pack(t3, t2, t1, t0, t7, t6);
const Pack keep = Pack(rdx, rax, px, py, t8, t9);
mov_rr(keep, z);
mov(t5, pL_);
lea(t5, ptr[rip+pL_]);
sub_rm(z, t5);
cmovc_rr(z, keep);
store_mr(pz, z);
@ -1480,7 +1554,7 @@ private:
const Reg64& t9 = sf.t[9];
vmovq(xm0, p0); // save p0
mov(t7, pL_);
lea(t7, ptr[rip+pL_]);
mov(t9, ptr [p2]);
// c3, c2, c1, c0, px, y, p,
montgomery3_1(rp_, t0, t3, t2, t1, p1, t9, t7, t4, t5, t6, t8, p0, true);
@ -1526,7 +1600,7 @@ private:
const Reg64& t9 = sf.t[9];
vmovq(xm0, pz); // save pz
mov(t7, pL_);
lea(t7, ptr[rip+pL_]);
mov(t9, ptr [px]);
mul3x1_sqr1(px, t9, t3, t2, t1, t0);
mov(t0, rdx);
@ -2291,7 +2365,6 @@ private:
}
void2u gen_fpDbl_sqrPre()
{
align(16);
void2u func = getCurr<void2u>();
if (pn_ == 2 && useMulx_) {
StackFrame sf(this, 2, 7 | UseRDX);
@ -2332,7 +2405,6 @@ private:
}
void3u gen_fpDbl_mulPre()
{
align(16);
void3u func = getCurr<void3u>();
if (pn_ == 2 && useMulx_) {
StackFrame sf(this, 3, 5 | UseRDX);
@ -2630,7 +2702,7 @@ private:
mov(rax, px);
// px is free from here
load_mp(vv, rax, t); // v = x
mov(rax, pL_);
lea(rax, ptr[rip+pL_]);
load_mp(uu, rax, t); // u = p_
// k = 0
xor_(rax, rax);
@ -2708,7 +2780,7 @@ private:
const Reg64& t2 = ss.getReg(0);
const Reg64& t3 = rdx;
mov(t2, pL_);
lea(t2, ptr[rip+pL_]);
if (isFullBit_) {
mov(t, ptr [rTop]);
test(t, t);
@ -3373,7 +3445,6 @@ private:
// if (pn_ != 4 && !(pn_ == 6 && useMulx_ && useAdx_)) return 0;
// almost same for pn_ == 6
if (pn_ != 4) return 0;
align(16);
void3u func = getCurr<void3u>();
const RegExp z = rsp + 0 * 8;
@ -3438,7 +3509,6 @@ private:
// if (pn_ != 4 && !(pn_ == 6 && useMulx_ && useAdx_)) return 0;
// almost same for pn_ == 6
if (pn_ != 4) return 0;
align(16);
void2u func = getCurr<void2u>();
// almost same for pn_ == 6
if (pn_ != 4) return 0;
@ -3524,7 +3594,6 @@ private:
}
void3u gen_fp2_add()
{
align(16);
void3u func = getCurr<void3u>();
if (pn_ == 4 && !isFullBit_) {
gen_fp2_add4();
@ -3538,7 +3607,6 @@ private:
}
void3u gen_fp2_sub()
{
align(16);
void3u func = getCurr<void3u>();
if (pn_ == 4 && !isFullBit_) {
gen_fp2_sub4();
@ -3584,7 +3652,7 @@ private:
}
}
sub_rr(a, b);
mov(rax, pL_);
lea(rax, ptr[rip+pL_]);
load_rm(b, rax);
sbb(rax, rax);
for (int i = 0; i < pn_; i++) {
@ -3592,7 +3660,7 @@ private:
}
add_rr(a, b);
store_mr(py, a);
mov(rax, pL_);
lea(rax, ptr[rip+pL_]);
mov_rr(a, t);
sub_rm(t, rax);
cmovc_rr(t, a);
@ -3610,7 +3678,7 @@ private:
mov_rr(b, a);
add_rm(b, px + FpByte_);
sub_rm(a, px + FpByte_);
mov(rax, pL_);
lea(rax, ptr[rip+pL_]);
jnc("@f");
add_rm(a, rax);
L("@@");
@ -3624,7 +3692,6 @@ private:
{
if (isFullBit_) return 0;
if (op_->xi_a != 1) return 0;
align(16);
void2u func = getCurr<void2u>();
if (pn_ == 4) {
gen_fp2_mul_xi4();
@ -3638,7 +3705,6 @@ private:
}
void2u gen_fp2_neg()
{
align(16);
void2u func = getCurr<void2u>();
if (pn_ <= 6) {
StackFrame sf(this, 2, UseRDX | pn_);
@ -3652,7 +3718,6 @@ private:
{
if (isFullBit_) return 0;
if (pn_ != 4 && !(pn_ == 6 && useMulx_ && useAdx_)) return 0;
align(16);
void3u func = getCurr<void3u>();
bool embedded = pn_ == 4;
@ -3729,7 +3794,6 @@ private:
{
if (isFullBit_) return 0;
if (pn_ != 4 && !(pn_ == 6 && useMulx_ && useAdx_)) return 0;
align(16);
void2u func = getCurr<void2u>();
const RegExp y = rsp + 0 * 8;
@ -3789,7 +3853,7 @@ private:
mov(ptr [(RegExp)t2 + i * 8], rax);
}
// t3 = a + p - b
mov(rax, pL_);
lea(rax, ptr[rip+pL_]);
add_rm(a, rax);
sub_rr(a, b);
store_mr(t3, a);

@ -0,0 +1,92 @@
#pragma once
/**
@file
@brief bindings of pre-generated (static) Fp/Fr asm code to fp::Op
@author MITSUNARI Shigeo(@herumi)
@license modified new BSD license
http://opensource.org/licenses/BSD-3-Clause
*/
#ifndef MCL_STATIC_CODE
#error "define MCL_STATIC_CODE"
#endif
namespace mcl { namespace fp {
extern "C" {
Unit mclx_Fp_addPre(Unit*, const Unit*, const Unit*);
Unit mclx_Fp_subPre(Unit*, const Unit*, const Unit*);
void mclx_Fp_add(Unit*, const Unit*, const Unit*);
void mclx_Fp_sub(Unit*, const Unit*, const Unit*);
void mclx_Fp_shr1(Unit*, const Unit*);
void mclx_Fp_neg(Unit*, const Unit*);
void mclx_FpDbl_mod(Unit*, const Unit*);
void mclx_Fp_mul(Unit*, const Unit*, const Unit*);
void mclx_Fp_sqr(Unit*, const Unit*);
void mclx_FpDbl_add(Unit*, const Unit*, const Unit*);
void mclx_FpDbl_sub(Unit*, const Unit*, const Unit*);
Unit mclx_FpDbl_addPre(Unit*, const Unit*, const Unit*);
Unit mclx_FpDbl_subPre(Unit*, const Unit*, const Unit*);
void mclx_FpDbl_mulPre(Unit*, const Unit*, const Unit*);
void mclx_FpDbl_sqrPre(Unit*, const Unit*);
void mclx_Fp2_add(Unit*, const Unit*, const Unit*);
void mclx_Fp2_sub(Unit*, const Unit*, const Unit*);
void mclx_Fp2_neg(Unit*, const Unit*);
void mclx_Fp2_mul(Unit*, const Unit*, const Unit*);
void mclx_Fp2_sqr(Unit*, const Unit*);
void mclx_Fp2_mul_xi(Unit*, const Unit*);
Unit mclx_Fr_addPre(Unit*, const Unit*, const Unit*);
Unit mclx_Fr_subPre(Unit*, const Unit*, const Unit*);
void mclx_Fr_add(Unit*, const Unit*, const Unit*);
void mclx_Fr_sub(Unit*, const Unit*, const Unit*);
void mclx_Fr_shr1(Unit*, const Unit*);
void mclx_Fr_neg(Unit*, const Unit*);
void mclx_Fr_mul(Unit*, const Unit*, const Unit*);
void mclx_Fr_sqr(Unit*, const Unit*);
int mclx_Fr_preInv(Unit*, const Unit*);
} // extern "C"
/*
	Bind the pre-assembled (static) routines to op.
	@param op [inout] ; op.xi_a != 0 selects the Fp routines (including FpDbl and Fp2),
	op.xi_a == 0 selects the Fr routines
	@note declared inline because the definition lives in a header; without it,
	including this file from a second translation unit causes multiple-definition link errors
*/
inline void setStaticCode(mcl::fp::Op& op)
{
	if (op.xi_a) {
		// Fp, sizeof(Fp) = 48, supports Fp2
		op.fp_addPre = mclx_Fp_addPre;
		op.fp_subPre = mclx_Fp_subPre;
		op.fp_addA_ = mclx_Fp_add;
		op.fp_subA_ = mclx_Fp_sub;
		op.fp_shr1 = mclx_Fp_shr1;
		op.fp_negA_ = mclx_Fp_neg;
		op.fpDbl_modA_ = mclx_FpDbl_mod;
		op.fp_mulA_ = mclx_Fp_mul;
		op.fp_sqrA_ = mclx_Fp_sqr;
		op.fpDbl_addA_ = mclx_FpDbl_add;
		op.fpDbl_subA_ = mclx_FpDbl_sub;
		op.fpDbl_addPre = mclx_FpDbl_addPre;
		op.fpDbl_subPre = mclx_FpDbl_subPre;
		op.fpDbl_mulPreA_ = mclx_FpDbl_mulPre;
		op.fpDbl_sqrPreA_ = mclx_FpDbl_sqrPre;
		op.fp2_addA_ = mclx_Fp2_add;
		op.fp2_subA_ = mclx_Fp2_sub;
		op.fp2_negA_ = mclx_Fp2_neg;
		op.fp2_mulNF = 0;
		op.fp2_mulA_ = mclx_Fp2_mul;
		op.fp2_sqrA_ = mclx_Fp2_sqr;
		op.fp2_mul_xiA_ = mclx_Fp2_mul_xi;
	} else {
		// Fr, sizeof(Fr) = 32
		op.fp_addPre = mclx_Fr_addPre;
		op.fp_subPre = mclx_Fr_subPre;
		op.fp_addA_ = mclx_Fr_add;
		op.fp_subA_ = mclx_Fr_sub;
		op.fp_shr1 = mclx_Fr_shr1;
		op.fp_negA_ = mclx_Fr_neg;
		op.fp_mulA_ = mclx_Fr_mul;
		op.fp_sqrA_ = mclx_Fr_sqr;
		op.fp_preInv = mclx_Fr_preInv;
	}
	// fp_mul must point at the same routine as fp_mulA_, for both Fp and Fr
	op.fp_mul = fp::func_ptr_cast<void4u>(op.fp_mulA_);
}
} } // mcl::fp

@ -16,8 +16,10 @@
#endif
#ifndef MCL_LLVM_BMI2
#if (CYBOZU_HOST == CYBOZU_HOST_INTEL) && defined(MCL_USE_XBYAK) && !defined(MCL_USE_VINT)
#if (CYBOZU_HOST == CYBOZU_HOST_INTEL) && !defined(MCL_STATIC_CODE) && !defined(MCL_USE_VINT)
#define MCL_LLVM_BMI2 1
#else
#define MCL_LLVM_BMI2 0
#endif
#endif

@ -100,6 +100,19 @@ void testBench(const G1& P, const G2& Q)
CYBOZU_BENCH_C("Fp::mul ", C3, Fp::mul, x, x, y);
CYBOZU_BENCH_C("Fp::sqr ", C3, Fp::sqr, x, x);
CYBOZU_BENCH_C("Fp::inv ", C3, Fp::inv, x, x);
CYBOZU_BENCH_C("Fp::pow ", C3, Fp::pow, x, x, y);
{
Fr a, b, c;
a.setHashOf("abc", 3);
b.setHashOf("123", 3);
CYBOZU_BENCH_C("Fr::add ", C3, Fr::add, a, a, b);
CYBOZU_BENCH_C("Fr::sub ", C3, Fr::sub, a, a, b);
CYBOZU_BENCH_C("Fr::neg ", C3, Fr::neg, a, a);
CYBOZU_BENCH_C("Fr::mul ", C3, Fr::mul, a, a, b);
CYBOZU_BENCH_C("Fr::sqr ", C3, Fr::sqr, a, a);
CYBOZU_BENCH_C("Fr::inv ", C3, Fr::inv, a, a);
CYBOZU_BENCH_C("Fr::pow ", C3, Fr::pow, a, a, b);
}
Fp2 xx, yy;
xx.a = x;
xx.b = 3;

@ -688,6 +688,8 @@ CYBOZU_TEST_AUTO(multi)
G1 P;
G2 Q;
int i;
#ifndef MCL_STATIC_CODE
puts("BN254");
testCurve(mcl::BN254);
i = 1;
@ -695,6 +697,7 @@ CYBOZU_TEST_AUTO(multi)
CYBOZU_BENCH_C("naiveG2", 100, (BN::param.mapTo.naiveMapTo<G1, Fp>), P, i++);
CYBOZU_BENCH_C("calcBN2", 100, (BN::param.mapTo.calcBN<G2, Fp2>), Q, i++);
CYBOZU_BENCH_C("naiveG2", 100, (BN::param.mapTo.naiveMapTo<G2, Fp2>), Q, i++);
#endif
puts("BLS12_381");
testCurve(mcl::BLS12_381);
i = 1;
@ -861,7 +864,11 @@ int main(int argc, char *argv[])
return 1;
}
g_mode = mcl::fp::StrToMode(mode);
#ifdef MCL_STATIC_CODE
printf("static code for BLS12-381\n");
#else
printf("JIT %d\n", mcl::fp::isEnableJIT());
#endif
#if 0
initPairing(mcl::BLS12_381);
cybozu::XorShift rg;

@ -602,7 +602,7 @@ void test_sub(const mcl::EcParam *para, size_t paraNum)
test_sub_sub(para[i], mcl::fp::FP_LLVM);
test_sub_sub(para[i], mcl::fp::FP_LLVM_MONT);
#endif
#ifdef MCL_USE_XBYAK
#ifdef MCL_X64_ASM
test_sub_sub(para[i], mcl::fp::FP_XBYAK);
#endif
mulVec(para[i]);

@ -876,7 +876,7 @@ void modpTest()
}
#include <iostream>
#if (defined(MCL_USE_LLVM) || defined(MCL_USE_XBYAK)) && (MCL_MAX_BIT_SIZE >= 521)
#if (defined(MCL_USE_LLVM) || defined(MCL_X64_ASM)) && (MCL_MAX_BIT_SIZE >= 521)
CYBOZU_TEST_AUTO(mod_NIST_P521)
{
const size_t len = 521;
@ -908,7 +908,7 @@ CYBOZU_TEST_AUTO(mod_NIST_P521)
mcl_fpDbl_mod_NIST_P521L(ex, in, Fp::getOp().p);
CYBOZU_TEST_EQUAL_ARRAY(ex, ok, N + 1);
#endif
#ifdef MCL_USE_XBYAK
#ifdef MCL_X64_ASM
const mcl::fp::Op& op = Fp::getOp();
if (!op.isMont) {
op.fpDbl_mod(ex, in, op.p);
@ -1014,7 +1014,7 @@ CYBOZU_TEST_AUTO(main)
sub(mcl::fp::FP_LLVM_MONT);
}
#endif
#ifdef MCL_USE_XBYAK
#ifdef MCL_X64_ASM
if (g_mode.empty() || g_mode == "xbyak") {
sub(mcl::fp::FP_XBYAK);
}

@ -465,7 +465,7 @@ void testAll()
test(p, mcl::fp::FP_LLVM);
test(p, mcl::fp::FP_LLVM_MONT);
#endif
#ifdef MCL_USE_XBYAK
#ifdef MCL_X64_ASM
test(p, mcl::fp::FP_XBYAK);
#endif
}

@ -0,0 +1,70 @@
#include <cybozu/test.hpp>
#include <mcl/bls12_381.hpp>
using namespace mcl::bn;
// Set up the BLS12-381 pairing.
// NOTE(review): the other test cases in this file presumably rely on this having run first - confirm test ordering.
CYBOZU_TEST_AUTO(init)
{
initPairing(mcl::BLS12_381);
}
// Small-integer check of Fr addition, subtraction and multiplication (3 and 5).
CYBOZU_TEST_AUTO(Fr)
{
Fr x, y;
x = 3;
y = 5;
CYBOZU_TEST_EQUAL(x + y, 8);
// 3 - 5 is compared against -2
CYBOZU_TEST_EQUAL(x - y, -2);
CYBOZU_TEST_EQUAL(x * y, 15);
}
// Small-integer check of Fp addition, subtraction and multiplication (3 and 5).
CYBOZU_TEST_AUTO(Fp)
{
Fp x, y;
x = 3;
y = 5;
CYBOZU_TEST_EQUAL(x + y, 8);
// 3 - 5 is compared against -2
CYBOZU_TEST_EQUAL(x - y, -2);
CYBOZU_TEST_EQUAL(x * y, 15);
}
// Check of Fp2 addition, subtraction and multiplication with x = (3, 2) and y = (1, 4).
CYBOZU_TEST_AUTO(Fp2)
{
Fp2 x, y;
x.a = 3;
x.b = 2;
y.a = 1;
y.b = 4;
/*
(3+2i)(1+4i)=3-8+(12+2)i
i.e. the expected product is (-5, 14)
*/
CYBOZU_TEST_EQUAL(x + y, Fp2(4, 6));
CYBOZU_TEST_EQUAL(x - y, Fp2(2, -2));
CYBOZU_TEST_EQUAL(x * y, Fp2(-5, 14));
}
// G1 scalar multiplication: for P and r1 both derived from "abc",
// checks that -(P * r1) equals P * (-r1).
CYBOZU_TEST_AUTO(G1)
{
G1 P, Q;
hashAndMapToG1(P, "abc", 3);
Fr r1, r2;
r1.setHashOf("abc", 3);
r2 = -r1;
// Q = -(P * r1)
G1::mul(Q, P, r1);
Q = -Q;
// P = P * (-r1)
P *= r2;
CYBOZU_TEST_EQUAL(P, Q);
}
// G2 scalar multiplication: for P and r1 both derived from "abc",
// checks that -(P * r1) equals P * (-r1).
CYBOZU_TEST_AUTO(G2)
{
G2 P, Q;
hashAndMapToG2(P, "abc", 3);
Fr r1, r2;
r1.setHashOf("abc", 3);
r2 = -r1;
// Q = -(P * r1)
G2::mul(Q, P, r1);
Q = -Q;
// P = P * (-r1)
P *= r2;
CYBOZU_TEST_EQUAL(P, Q);
}
Loading…
Cancel
Save