Make mcl_fp_addNC96 a little faster

dev
MITSUNARI Shigeo 9 years ago
parent e22bf8eaf6
commit d8aea4d347
  1. 39
      src/low_armv7.s
  2. 6
      test/low_test.cpp

@ -1,7 +1,7 @@
.arch armv7-a
.global mcl_fp_addNC64
.global mcl_fp_addNC96_1
.global mcl_fp_addNC96
.global mcl_fp_addNC96_2
.align 2
@ -15,33 +15,28 @@ mcl_fp_addNC64:
.align 2
# NOTE(review): this listing appears to be a rendered diff whose +/- markers
# were stripped, so two versions of the routine are interleaved under two
# labels. Read as a single routine it is not coherent: there is one push but
# two pops, and the second ldm pair uses r1/r2 as base registers after they
# were overwritten with data. Confirm against the repository before
# assembling.
#
# Prototype declared by the test code:
#   void mcl_fp_addNC96(uint32_t *z, const uint32_t *x, const uint32_t *y);
# presumably r0 = z, r1 = x, r2 = y under the ARM procedure call standard
# (TODO confirm). Each version adds three 32-bit words with carry propagated
# between words and stores three words at [r0]. The final add is adc (not
# adcs), so the carry out of the top word is not recorded anywhere.
mcl_fp_addNC96_1:
mcl_fp_addNC96:
# Save r4 and lr; both are overwritten as scratch registers below.
push {r4, lr}
# --- first version (presumably the pre-change body; verify) ---
# r3, r12, lr = the three consecutive words starting at [r1].
ldm r1, {r3, r12, lr}
# r1, r4 = the two consecutive words starting at [r2]; the pointer that was
# in r1 is overwritten here.
ldm r2, {r1, r4}
# r2 = the word at [r2 + 8]; the pointer that was in r2 is overwritten.
ldr r2, [r2, #8]
# Low word: r1 = r1 + r3, carry flag updated.
adds r1, r1, r3
# Middle word: r3 = r4 + r12 + carry, carry flag updated.
adcs r3, r4, r12
# Top word: r2 = r2 + lr + carry; flags are left unchanged.
adc r2, r2, lr
# Store the low and middle words at [r0] and [r0 + 4].
stm r0, {r1, r3}
# Store the top word at [r0 + 8].
str r2, [r0, #8]
# Restore r4 and lr (matches the push above).
pop {r4, lr}
# --- second version (presumably the post-change body; verify) ---
# r1, r3, r12 = the three consecutive words starting at [r1]; the base
# register is in the list, so r1 ends up holding the first loaded word.
ldm r1, {r1, r3, r12}
# r2, r4, lr = the three consecutive words starting at [r2].
ldm r2, {r2, r4, lr}
# Low word: r1 = r1 + r2, carry flag updated.
adds r1, r1, r2
# Middle word: r3 = r3 + r4 + carry, carry flag updated.
adcs r3, r3, r4
# Top word: r12 = r12 + lr + carry; flags are left unchanged.
adc r12, r12, lr
# Store all three result words at [r0], [r0 + 4], [r0 + 8] in one instruction.
stm r0, {r1, r3, r12}
# Restore r4 and lr. NOTE(review): in this interleaved listing this is a
# second pop for a single push.
pop {r4, lr}
# Return to the caller.
bx lr
# slower variant of the 96-bit add, kept alongside mcl_fp_addNC96 (the test code benchmarks both)
.align 2
# NOTE(review): this listing appears to be a rendered diff whose +/- markers
# were stripped, so two versions of the body are interleaved: one using
# fixed offsets (#4, #8) from unchanged pointers, and one using post-indexed
# accesses followed by ldm/stm. Read as a single routine it is not coherent
# (for example, the fixed-offset accesses run after the pointers were already
# advanced). Confirm against the repository before assembling.
#
# Prototype declared by the test code:
#   void mcl_fp_addNC96_2(uint32_t *z, const uint32_t *x, const uint32_t *y);
# presumably r0 = z, r1 = x, r2 = y under the ARM procedure call standard
# (TODO confirm). Only r0-r3 and r12 are written, and nothing is pushed.
mcl_fp_addNC96_2:
# Fixed-offset form: r3 = word at [r1], r12 = word at [r2].
ldr r3, [r1]
ldr r12, [r2]
# Post-indexed form: same loads, then r1 and r2 are each advanced by 4.
ldr r3, [r1], #4
ldr r12, [r2], #4
# Low word: r3 = r3 + r12, carry flag updated.
adds r3, r3, r12
# Fixed-offset form: store the low word at [r0].
str r3, [r0]
# Fixed-offset form: load the words at [r1 + 4] and [r2 + 4].
ldr r3, [r1, #4]
ldr r12, [r2, #4]
# r3 = r3 + r12 + carry, carry flag updated.
adcs r3, r3, r12
# Fixed-offset form: store r3 at [r0 + 4].
str r3, [r0, #4]
# Post-indexed form: store r3 at [r0], then advance r0 by 4.
str r3, [r0], #4
# Fixed-offset form: load the words at [r1 + 8] and [r2 + 8].
ldr r3, [r1, #8]
ldr r12, [r2, #8]
# ldm form: r1, r3 = two consecutive words starting at [r1];
# r2, r12 = two consecutive words starting at [r2]. Both base registers are
# overwritten with the first loaded word.
ldm r1, {r1, r3}
ldm r2, {r2, r12}
# r1 = r1 + r2 + carry, carry flag updated.
adcs r1, r1, r2
# r3 = r3 + r12 + carry. This is adcs, so the flags are updated by the final
# add as well.
adcs r3, r3, r12
# Fixed-offset form: store r3 at [r0 + 8].
str r3, [r0, #8]
# stm form: store r1 and r3 at [r0] and [r0 + 4].
stm r0, {r1, r3}
# Return to the caller.
bx lr

@ -8,7 +8,7 @@
cybozu::XorShift rg;
extern "C" void mcl_fp_addNC64(uint32_t *z, const uint32_t *x, const uint32_t *y);
extern "C" void mcl_fp_addNC96_1(uint32_t *z, const uint32_t *x, const uint32_t *y);
extern "C" void mcl_fp_addNC96(uint32_t *z, const uint32_t *x, const uint32_t *y);
extern "C" void mcl_fp_addNC96_2(uint32_t *z, const uint32_t *x, const uint32_t *y);
CYBOZU_TEST_AUTO(addNC64)
@ -38,12 +38,12 @@ CYBOZU_TEST_AUTO(addNC)
rg.read(x, N);
rg.read(y, N);
low_add<N>(z, x, y);
mcl_fp_addNC96_1(w, x, y);
mcl_fp_addNC96(w, x, y);
CYBOZU_TEST_EQUAL_ARRAY(z, w, N);
mcl_fp_addNC96_2(w, x, y);
CYBOZU_TEST_EQUAL_ARRAY(z, w, N);
}
CYBOZU_BENCH("add96_1", mcl_fp_addNC96_1, x, x, y);
CYBOZU_BENCH("add96", mcl_fp_addNC96, x, x, y);
CYBOZU_BENCH("add96_2", mcl_fp_addNC96_2, x, x, y);
}

Loading…
Cancel
Save