fix pic code

update-fork
MITSUNARI Shigeo 4 years ago
parent a522fd532d
commit eaabb2337b
  1. 2
      Makefile
  2. 7
      include/mcl/op.hpp
  3. 4
      sample/bench.cpp
  4. 2
      sample/rawbench.cpp
  5. 25
      src/fp.cpp
  6. 84
      src/fp_generator.hpp
  7. 87
      src/fp_static_code.hpp
  8. 2
      src/low_func.hpp
  9. 2
      test/ec_test.cpp
  10. 6
      test/fp_test.cpp
  11. 2
      test/fp_tower_test.cpp

@ -247,7 +247,7 @@ obj/static_code.o: src/static_code.asm
nasm -felf64 -o $@ $<
bin/static_code_test.exe: test/static_code_test.cpp src/fp.cpp obj/static_code.o
$(CXX) -o $@ -O3 $^ -DMCL_STATIC_JIT -DMCL_MAX_BIT_SIZE=384 -DMCL_DONT_USE_OPENSSL -DMCL_USE_VINT -DMCL_SIZEOF_UNIT=8 -DMCL_VINT_FIXED_BUFFER -I include -Wall -Wextra
$(CXX) -o $@ -O3 $^ -g -DMCL_DONT_USE_XBYAK -DMCL_STATIC_CODE -DMCL_MAX_BIT_SIZE=384 -DMCL_DONT_USE_OPENSSL -DMCL_USE_VINT -DMCL_SIZEOF_UNIT=8 -DMCL_VINT_FIXED_BUFFER -I include -Wall -Wextra
asm: $(LLVM_SRC)
$(LLVM_OPT) -O3 -o - $(LLVM_SRC) | $(LLVM_LLC) -O3 $(LLVM_FLAGS) -x86-asm-syntax=intel

@ -16,6 +16,9 @@
#endif
#if !defined(MCL_DONT_USE_XBYAK) && (defined(_WIN64) || defined(__x86_64__)) && (MCL_SIZEOF_UNIT == 8)
#define MCL_USE_XBYAK
#endif
#if defined(MCL_USE_XBYAK) || defined(MCL_STATIC_CODE)
#define MCL_X64_ASM
#define MCL_XBYAK_DIRECT_CALL
#endif
@ -202,6 +205,8 @@ struct Op {
Unit R3[maxUnitSize];
#ifdef MCL_USE_XBYAK
FpGenerator *fg;
#endif
#ifdef MCL_X64_ASM
mcl::Array<Unit> invTbl;
#endif
void3u fp_addA_;
@ -288,7 +293,7 @@ struct Op {
memset(one, 0, sizeof(one));
memset(R2, 0, sizeof(R2));
memset(R3, 0, sizeof(R3));
#ifdef MCL_USE_XBYAK
#ifdef MCL_X64_ASM
invTbl.clear();
#endif
fp_addA_ = 0;

@ -68,7 +68,7 @@ void benchFp(size_t bitSize, int mode)
if (mode & 4) benchFpSub(tbl[i].p, tbl[i].x, tbl[i].y, mcl::fp::FP_LLVM);
if (mode & 8) benchFpSub(tbl[i].p, tbl[i].x, tbl[i].y, mcl::fp::FP_LLVM_MONT);
#endif
#ifdef MCL_USE_XBYAK
#ifdef MCL_X64_ASM
if (mode & 16) benchFpSub(tbl[i].p, tbl[i].x, tbl[i].y, mcl::fp::FP_XBYAK);
#endif
}
@ -122,7 +122,7 @@ void benchEc(size_t bitSize, int mode, mcl::ec::Mode ecMode)
if (mode & 4) benchEcSub(tbl[i], mcl::fp::FP_LLVM, ecMode);
if (mode & 8) benchEcSub(tbl[i], mcl::fp::FP_LLVM_MONT, ecMode);
#endif
#ifdef MCL_USE_XBYAK
#ifdef MCL_X64_ASM
if (mode & 16) benchEcSub(tbl[i], mcl::fp::FP_XBYAK, ecMode);
#endif
}

@ -168,7 +168,7 @@ int main(int argc, char *argv[])
benchRaw(tbl[i], mcl::fp::FP_LLVM);
benchRaw(tbl[i], mcl::fp::FP_LLVM_MONT);
#endif
#ifdef MCL_USE_XBYAK
#ifdef MCL_X64_ASM
if (bitSize <= 384) {
benchRaw(tbl[i], mcl::fp::FP_XBYAK);
}

@ -3,12 +3,14 @@
#include <cybozu/sha2.hpp>
#include <cybozu/endian.hpp>
#include <mcl/conversion.hpp>
#ifdef MCL_STATIC_CODE
#include "fp_static_code.hpp"
#endif
#ifdef MCL_USE_XBYAK
#include "fp_generator.hpp"
#else
#define XBYAK_ONLY_CLASS_CPU
#include "xbyak/xbyak_util.h"
//#include "detect_cpu.hpp"
#endif
#include "low_func.hpp"
#ifdef MCL_USE_LLVM
@ -315,7 +317,7 @@ void setOp(Op& op, Mode mode)
#endif
}
#ifdef MCL_USE_XBYAK
#ifdef MCL_X64_ASM
inline void invOpForMontC(Unit *y, const Unit *x, const Op& op)
{
Unit r[maxUnitSize];
@ -372,6 +374,12 @@ static bool initForMont(Op& op, const Unit *p, Mode mode)
op.fp_invOp = &invOpForMontC;
initInvTbl(op);
}
#elif defined(MCL_STATIC_CODE)
fp::setStaticCode(op);
if (op.isMont && N <= 4) {
op.fp_invOp = &invOpForMontC;
initInvTbl(op);
}
#endif
return true;
}
@ -403,14 +411,25 @@ bool Op::init(const mpz_class& _p, size_t maxBitSize, int _xi_a, Mode mode, size
priority : MCL_USE_XBYAK > MCL_USE_LLVM > none
Xbyak > llvm_mont > llvm > gmp_mont > gmp
*/
#ifdef MCL_USE_XBYAK
#ifdef MCL_X64_ASM
if (mode == FP_AUTO) mode = FP_XBYAK;
if (mode == FP_XBYAK && bitSize > 384) {
mode = FP_AUTO;
}
#ifdef MCL_USE_XBYAK
	// JIT backend: go back to automatic selection when isEnableJIT() reports false.
	if (!isEnableJIT()) {
		mode = FP_AUTO;
	}
#elif defined(MCL_STATIC_CODE)
	// Use defined() so that a value-less "#define MCL_STATIC_CODE" is accepted,
	// matching the other MCL_STATIC_CODE checks.
	{
		// static jit code uses avx, mulx, adox, adcx
		using namespace Xbyak::util;
		Cpu cpu;
		// Every listed CPU feature must be present; otherwise leave the choice to FP_AUTO.
		if (!(cpu.has(Cpu::tAVX) && cpu.has(Cpu::tBMI2) && cpu.has(Cpu::tADX))) {
			mode = FP_AUTO;
		}
	}
#endif
#else
if (mode == FP_XBYAK) mode = FP_AUTO;
#endif

@ -7,7 +7,6 @@
http://opensource.org/licenses/BSD-3-Clause
*/
#if CYBOZU_HOST == CYBOZU_HOST_INTEL
#define XBYAK_NO_OP_NAMES
#define XBYAK_DISABLE_AVX512
#include "xbyak/xbyak_util.h"
@ -25,45 +24,6 @@
namespace mcl {
#ifdef MCL_STATIC_JIT
typedef fp::Unit Unit;
extern "C" {
Unit mclx_Fp_addPre(Unit*, const Unit*, const Unit*);
Unit mclx_Fp_subPre(Unit*, const Unit*, const Unit*);
void mclx_Fp_add(Unit*, const Unit*, const Unit*);
void mclx_Fp_sub(Unit*, const Unit*, const Unit*);
void mclx_Fp_shr1(Unit*, const Unit*);
void mclx_Fp_neg(Unit*, const Unit*);
void mclx_Fp_mul(Unit*, const Unit*, const Unit*);
void mclx_Fp_sqr(Unit*, const Unit*);
void mclx_FpDbl_add(Unit*, const Unit*, const Unit*);
void mclx_FpDbl_sub(Unit*, const Unit*, const Unit*);
void mclx_FpDbl_add(Unit*, const Unit*, const Unit*);
void mclx_FpDbl_sub(Unit*, const Unit*, const Unit*);
Unit mclx_FpDbl_addPre(Unit*, const Unit*, const Unit*);
Unit mclx_FpDbl_subPre(Unit*, const Unit*, const Unit*);
void mclx_FpDbl_mulPre(Unit*, const Unit*, const Unit*);
void mclx_FpDbl_sqrPre(Unit*, const Unit*);
void mclx_FpDbl_mod(Unit*, const Unit*);
void mclx_Fp2_add(Unit*, const Unit*, const Unit*);
void mclx_Fp2_sub(Unit*, const Unit*, const Unit*);
void mclx_Fp2_neg(Unit*, const Unit*);
void mclx_Fp2_mul(Unit*, const Unit*, const Unit*);
void mclx_Fp2_sqr(Unit*, const Unit*);
void mclx_Fp2_mul_xi(Unit*, const Unit*);
Unit mclx_Fr_addPre(Unit*, const Unit*, const Unit*);
Unit mclx_Fr_subPre(Unit*, const Unit*, const Unit*);
void mclx_Fr_add(Unit*, const Unit*, const Unit*);
void mclx_Fr_sub(Unit*, const Unit*, const Unit*);
void mclx_Fr_shr1(Unit*, const Unit*);
void mclx_Fr_neg(Unit*, const Unit*);
void mclx_Fr_mul(Unit*, const Unit*, const Unit*);
void mclx_Fr_sqr(Unit*, const Unit*);
int mclx_Fr_preInv(Unit*, const Unit*);
}
#endif
#ifdef MCL_DUMP_JIT
struct DumpCode {
FILE *fp_;
@ -488,38 +448,6 @@ private:
align(16);
op.fp2_mul_xiA_ = gen_fp2_mul_xi();
setFuncInfo(prof_, suf, "2_mul_xi", op.fp2_mul_xiA_, getCurr());
#ifdef MCL_STATIC_JIT
if (op.xi_a) {
// Fp, sizeof(Fp) = 48, supports Fp2
op.fp_addPre = mclx_Fp_addPre;
op.fp_subPre = mclx_Fp_subPre;
op.fp_addA_ = mclx_Fp_add;
op.fp_subA_ = mclx_Fp_sub;
op.fp_shr1 = mclx_Fp_shr1;
op.fp_negA_ = mclx_Fp_neg;
op.fpDbl_addA_ = mclx_FpDbl_add;
op.fpDbl_subA_ = mclx_FpDbl_sub;
op.fpDbl_addPre = mclx_FpDbl_addPre;
op.fpDbl_subPre = mclx_FpDbl_subPre;
op.fpDbl_mulPreA_ = mclx_FpDbl_mulPre;
op.fpDbl_sqrPreA_ = mclx_FpDbl_sqrPre;
op.fpDbl_modA_ = mclx_FpDbl_mod;
op.fp_mulA_ = mclx_Fp_mul;
op.fp_sqrA_ = mclx_Fp_sqr;
} else {
// Fr, sizeof(Fr) = 32
op.fp_addPre = mclx_Fr_addPre;
op.fp_subPre = mclx_Fr_subPre;
op.fp_addA_ = mclx_Fr_add;
op.fp_subA_ = mclx_Fr_sub;
op.fp_shr1 = mclx_Fr_shr1;
op.fp_negA_ = mclx_Fr_neg;
op.fp_mulA_ = mclx_Fr_mul;
op.fp_sqrA_ = mclx_Fr_sqr;
op.fp_preInv = mclx_Fr_preInv;
}
#endif
}
u3u gen_addSubPre(bool isAdd, int n)
{
@ -2774,7 +2702,7 @@ private:
mov(rax, px);
// px is free from here
load_mp(vv, rax, t); // v = x
mov(rax, pL_);
lea(rax, ptr[rip+pL_]);
load_mp(uu, rax, t); // u = p_
// k = 0
xor_(rax, rax);
@ -2852,7 +2780,7 @@ private:
const Reg64& t2 = ss.getReg(0);
const Reg64& t3 = rdx;
mov(t2, pL_);
lea(t2, ptr[rip+pL_]);
if (isFullBit_) {
mov(t, ptr [rTop]);
test(t, t);
@ -3724,7 +3652,7 @@ private:
}
}
sub_rr(a, b);
mov(rax, pL_);
lea(rax, ptr[rip+pL_]);
load_rm(b, rax);
sbb(rax, rax);
for (int i = 0; i < pn_; i++) {
@ -3732,7 +3660,7 @@ private:
}
add_rr(a, b);
store_mr(py, a);
mov(rax, pL_);
lea(rax, ptr[rip+pL_]);
mov_rr(a, t);
sub_rm(t, rax);
cmovc_rr(t, a);
@ -3750,7 +3678,7 @@ private:
mov_rr(b, a);
add_rm(b, px + FpByte_);
sub_rm(a, px + FpByte_);
mov(rax, pL_);
lea(rax, ptr[rip+pL_]);
jnc("@f");
add_rm(a, rax);
L("@@");
@ -3925,7 +3853,7 @@ private:
mov(ptr [(RegExp)t2 + i * 8], rax);
}
// t3 = a + p - b
mov(rax, pL_);
lea(rax, ptr[rip+pL_]);
add_rm(a, rax);
sub_rr(a, b);
store_mr(t3, a);

@ -0,0 +1,87 @@
#pragma once
/**
@file
@brief bind the static code (mclx_* functions) to mcl::fp::Op
@author MITSUNARI Shigeo(@herumi)
@license modified new BSD license
http://opensource.org/licenses/BSD-3-Clause
*/
#ifndef MCL_STATIC_CODE
#error "define MCL_STATIC_CODE"
#endif
namespace mcl { namespace fp {
extern "C" {
Unit mclx_Fp_addPre(Unit*, const Unit*, const Unit*);
Unit mclx_Fp_subPre(Unit*, const Unit*, const Unit*);
void mclx_Fp_add(Unit*, const Unit*, const Unit*);
void mclx_Fp_sub(Unit*, const Unit*, const Unit*);
void mclx_Fp_shr1(Unit*, const Unit*);
void mclx_Fp_neg(Unit*, const Unit*);
void mclx_Fp_mul(Unit*, const Unit*, const Unit*);
void mclx_Fp_sqr(Unit*, const Unit*);
void mclx_FpDbl_add(Unit*, const Unit*, const Unit*);
void mclx_FpDbl_sub(Unit*, const Unit*, const Unit*);
void mclx_FpDbl_add(Unit*, const Unit*, const Unit*);
void mclx_FpDbl_sub(Unit*, const Unit*, const Unit*);
Unit mclx_FpDbl_addPre(Unit*, const Unit*, const Unit*);
Unit mclx_FpDbl_subPre(Unit*, const Unit*, const Unit*);
void mclx_FpDbl_mulPre(Unit*, const Unit*, const Unit*);
void mclx_FpDbl_sqrPre(Unit*, const Unit*);
void mclx_FpDbl_mod(Unit*, const Unit*);
void mclx_Fp2_add(Unit*, const Unit*, const Unit*);
void mclx_Fp2_sub(Unit*, const Unit*, const Unit*);
void mclx_Fp2_neg(Unit*, const Unit*);
void mclx_Fp2_mul(Unit*, const Unit*, const Unit*);
void mclx_Fp2_sqr(Unit*, const Unit*);
void mclx_Fp2_mul_xi(Unit*, const Unit*);
Unit mclx_Fr_addPre(Unit*, const Unit*, const Unit*);
Unit mclx_Fr_subPre(Unit*, const Unit*, const Unit*);
void mclx_Fr_add(Unit*, const Unit*, const Unit*);
void mclx_Fr_sub(Unit*, const Unit*, const Unit*);
void mclx_Fr_shr1(Unit*, const Unit*);
void mclx_Fr_neg(Unit*, const Unit*);
void mclx_Fr_mul(Unit*, const Unit*, const Unit*);
void mclx_Fr_sqr(Unit*, const Unit*);
int mclx_Fr_preInv(Unit*, const Unit*);
} // extern "C"
/*
	Install the static code routines (mclx_*) into op.

	If op.xi_a is non-zero, the mclx_Fp_* and mclx_FpDbl_* routines are installed;
	otherwise the mclx_Fr_* routines (including fp_preInv) are installed.
	In both cases op.fp_mul is then set from op.fp_mulA_.

	Declared inline because the definition lives in a header; without it,
	including this header from a second translation unit would produce
	multiple-definition link errors.

	@param op [inout] operation table whose function pointers are overwritten
*/
inline void setStaticCode(mcl::fp::Op& op)
{
	if (op.xi_a) {
		// Fp, sizeof(Fp) = 48, supports Fp2
		op.fp_addPre = mclx_Fp_addPre;
		op.fp_subPre = mclx_Fp_subPre;
		op.fp_addA_ = mclx_Fp_add;
		op.fp_subA_ = mclx_Fp_sub;
		op.fp_shr1 = mclx_Fp_shr1;
		op.fp_negA_ = mclx_Fp_neg;
		op.fpDbl_addA_ = mclx_FpDbl_add;
		op.fpDbl_subA_ = mclx_FpDbl_sub;
		op.fpDbl_addPre = mclx_FpDbl_addPre;
		op.fpDbl_subPre = mclx_FpDbl_subPre;
		op.fpDbl_mulPreA_ = mclx_FpDbl_mulPre;
		op.fpDbl_sqrPreA_ = mclx_FpDbl_sqrPre;
		op.fpDbl_modA_ = mclx_FpDbl_mod;
		op.fp_mulA_ = mclx_Fp_mul;
		op.fp_sqrA_ = mclx_Fp_sqr;
	} else {
		// Fr, sizeof(Fr) = 32
		op.fp_addPre = mclx_Fr_addPre;
		op.fp_subPre = mclx_Fr_subPre;
		op.fp_addA_ = mclx_Fr_add;
		op.fp_subA_ = mclx_Fr_sub;
		op.fp_shr1 = mclx_Fr_shr1;
		op.fp_negA_ = mclx_Fr_neg;
		op.fp_mulA_ = mclx_Fr_mul;
		op.fp_sqrA_ = mclx_Fr_sqr;
		op.fp_preInv = mclx_Fr_preInv;
	}
	// fp_mul must refer to the routine chosen above, so derive it from fp_mulA_.
	op.fp_mul = fp::func_ptr_cast<void4u>(op.fp_mulA_);
}
} } // mcl::fp

@ -16,7 +16,7 @@
#endif
#ifndef MCL_LLVM_BMI2
#if (CYBOZU_HOST == CYBOZU_HOST_INTEL) && defined(MCL_USE_XBYAK) && !defined(MCL_USE_VINT)
#if (CYBOZU_HOST == CYBOZU_HOST_INTEL) && defined(MCL_X64_ASM) && !defined(MCL_USE_VINT)
#define MCL_LLVM_BMI2 1
#endif
#endif

@ -602,7 +602,7 @@ void test_sub(const mcl::EcParam *para, size_t paraNum)
test_sub_sub(para[i], mcl::fp::FP_LLVM);
test_sub_sub(para[i], mcl::fp::FP_LLVM_MONT);
#endif
#ifdef MCL_USE_XBYAK
#ifdef MCL_X64_ASM
test_sub_sub(para[i], mcl::fp::FP_XBYAK);
#endif
mulVec(para[i]);

@ -876,7 +876,7 @@ void modpTest()
}
#include <iostream>
#if (defined(MCL_USE_LLVM) || defined(MCL_USE_XBYAK)) && (MCL_MAX_BIT_SIZE >= 521)
#if (defined(MCL_USE_LLVM) || defined(MCL_X64_ASM)) && (MCL_MAX_BIT_SIZE >= 521)
CYBOZU_TEST_AUTO(mod_NIST_P521)
{
const size_t len = 521;
@ -908,7 +908,7 @@ CYBOZU_TEST_AUTO(mod_NIST_P521)
mcl_fpDbl_mod_NIST_P521L(ex, in, Fp::getOp().p);
CYBOZU_TEST_EQUAL_ARRAY(ex, ok, N + 1);
#endif
#ifdef MCL_USE_XBYAK
#ifdef MCL_X64_ASM
const mcl::fp::Op& op = Fp::getOp();
if (!op.isMont) {
op.fpDbl_mod(ex, in, op.p);
@ -1014,7 +1014,7 @@ CYBOZU_TEST_AUTO(main)
sub(mcl::fp::FP_LLVM_MONT);
}
#endif
#ifdef MCL_USE_XBYAK
#ifdef MCL_X64_ASM
if (g_mode.empty() || g_mode == "xbyak") {
sub(mcl::fp::FP_XBYAK);
}

@ -465,7 +465,7 @@ void testAll()
test(p, mcl::fp::FP_LLVM);
test(p, mcl::fp::FP_LLVM_MONT);
#endif
#ifdef MCL_USE_XBYAK
#ifdef MCL_X64_ASM
test(p, mcl::fp::FP_XBYAK);
#endif
}

Loading…
Cancel
Save