diff --git a/src/low_armv7.s b/src/low_armv7.s index b0c836f..b219e33 100644 --- a/src/low_armv7.s +++ b/src/low_armv7.s @@ -1,7 +1,7 @@ .arch armv7-a .global mcl_fp_addNC64 - .global mcl_fp_addNC96_1 + .global mcl_fp_addNC96 .global mcl_fp_addNC96_2 .align 2 @@ -15,33 +15,28 @@ mcl_fp_addNC64: .align 2 -mcl_fp_addNC96_1: +mcl_fp_addNC96: push {r4, lr} - ldm r1, {r3, r12, lr} - ldm r2, {r1, r4} - ldr r2, [r2, #8] - adds r1, r1, r3 - adcs r3, r4, r12 - adc r2, r2, lr - stm r0, {r1, r3} - str r2, [r0, #8] - pop {r4, lr} + ldm r1, {r1, r3, r12} + ldm r2, {r2, r4, lr} + adds r1, r1, r2 + adcs r3, r3, r4 + adc r12, r12, lr + stm r0, {r1, r3, r12} + pop {r4, lr} bx lr +# slower .align 2 mcl_fp_addNC96_2: - ldr r3, [r1] - ldr r12, [r2] + ldr r3, [r1], #4 + ldr r12, [r2], #4 adds r3, r3, r12 - str r3, [r0] - - ldr r3, [r1, #4] - ldr r12, [r2, #4] - adcs r3, r3, r12 - str r3, [r0, #4] + str r3, [r0], #4 - ldr r3, [r1, #8] - ldr r12, [r2, #8] + ldm r1, {r1, r3} + ldm r2, {r2, r12} + adcs r1, r1, r2 adcs r3, r3, r12 - str r3, [r0, #8] + stm r0, {r1, r3} bx lr diff --git a/test/low_test.cpp b/test/low_test.cpp index 9965b85..95be622 100644 --- a/test/low_test.cpp +++ b/test/low_test.cpp @@ -8,7 +8,7 @@ cybozu::XorShift rg; extern "C" void mcl_fp_addNC64(uint32_t *z, const uint32_t *x, const uint32_t *y); -extern "C" void mcl_fp_addNC96_1(uint32_t *z, const uint32_t *x, const uint32_t *y); +extern "C" void mcl_fp_addNC96(uint32_t *z, const uint32_t *x, const uint32_t *y); extern "C" void mcl_fp_addNC96_2(uint32_t *z, const uint32_t *x, const uint32_t *y); CYBOZU_TEST_AUTO(addNC64) @@ -38,12 +38,12 @@ CYBOZU_TEST_AUTO(addNC) rg.read(x, N); rg.read(y, N); low_add(z, x, y); - mcl_fp_addNC96_1(w, x, y); + mcl_fp_addNC96(w, x, y); CYBOZU_TEST_EQUAL_ARRAY(z, w, N); mcl_fp_addNC96_2(w, x, y); CYBOZU_TEST_EQUAL_ARRAY(z, w, N); } - CYBOZU_BENCH("add96_1", mcl_fp_addNC96_1, x, x, y); + CYBOZU_BENCH("add96", mcl_fp_addNC96, x, x, y); CYBOZU_BENCH("add96_2", mcl_fp_addNC96_2, x, x, y); }