diff --git a/src/low_armv7.s b/src/low_armv7.s index 6df0dc3..a655520 100644 --- a/src/low_armv7.s +++ b/src/low_armv7.s @@ -72,6 +72,65 @@ cl_fp_addNC128_2: pop {r4, r5, r6, lr} bx lr + .globl mcl_fp_addNC160 + .align 2 +mcl_fp_addNC160: + push {r4, lr} + ldm r1!, {r3, r4} + ldm r2!, {r12, lr} + adds r3, r3, r12 + adcs r4, r4, lr + stm r0!, {r3, r4} + ldm r1, {r1, r3, r4} + ldm r2, {r2, r12, lr} + adcs r1, r1, r2 + adcs r3, r3, r12 + adcs r4, r4, lr + stm r0, {r1, r3, r4} + pop {r4, lr} + bx lr + + .globl mcl_fp_addNC192 + .align 2 +mcl_fp_addNC192: + push {r4, r5, r6, lr} + ldm r1!, {r3, r4, r5} + ldm r2!, {r6, r12, lr} + adds r3, r3, r6 + adcs r4, r4, r12 + adcs r5, r5, lr + stm r0!, {r3, r4, r5} + + ldm r1, {r3, r4, r5} + ldm r2, {r6, r12, lr} + adcs r3, r3, r6 + adcs r4, r4, r12 + adcs r5, r5, lr + stm r0, {r3, r4, r5} + pop {r4, r5, r6, lr} + bx lr + + .globl mcl_fp_addNC224 + .align 2 +mcl_fp_addNC224: + push {r4, r5, r6, lr} + ldm r1!, {r3, r4, r5} + ldm r2!, {r6, r12, lr} + adds r3, r3, r6 + adcs r4, r4, r12 + adcs r5, r5, lr + stm r0!, {r3, r4, r5} + + ldm r1, {r1, r3, r4, r5} + ldm r2, {r2, r6, r12, lr} + adcs r1, r1, r2 + adcs r3, r3, r6 + adcs r4, r4, r12 + adcs r5, r5, lr + stm r0, {r1, r3, r4, r5} + pop {r4, r5, r6, lr} + bx lr + .globl mcl_fp_addNC256 .align 2 mcl_fp_addNC256: @@ -84,12 +143,12 @@ mcl_fp_addNC256: adcs r6, r6, lr stm r0!, {r3, r4, r5, r6} - ldm r1!, {r3, r4, r5, r6} - ldm r2!, {r7, r8, r12, lr} + ldm r1, {r3, r4, r5, r6} + ldm r2, {r7, r8, r12, lr} adcs r3, r3, r7 adcs r4, r4, r8 adcs r5, r5, r12 adcs r6, r6, lr - stm r0!, {r3, r4, r5, r6} + stm r0, {r3, r4, r5, r6} pop {r4, r5, r6, r7, r8, lr} bx lr diff --git a/test/low_test.cpp b/test/low_test.cpp index dc1b48a..f73f14c 100644 --- a/test/low_test.cpp +++ b/test/low_test.cpp @@ -11,6 +11,9 @@ cybozu::XorShift rg; extern "C" void mcl_fp_addNC64(uint32_t *z, const uint32_t *x, const uint32_t *y); extern "C" void mcl_fp_addNC96(uint32_t *z, const uint32_t *x, const uint32_t *y); extern "C" void mcl_fp_addNC128(uint32_t *z, const uint32_t *x, const uint32_t *y); +extern "C" void mcl_fp_addNC160(uint32_t *z, const uint32_t *x, const uint32_t *y); +extern "C" void mcl_fp_addNC192(uint32_t *z, const uint32_t *x, const uint32_t *y); +extern "C" void mcl_fp_addNC224(uint32_t *z, const uint32_t *x, const uint32_t *y); extern "C" void mcl_fp_addNC256(uint32_t *z, const uint32_t *x, const uint32_t *y); extern "C" void add_test(uint32_t *z, const uint32_t *x, const uint32_t *y); @@ -22,6 +25,9 @@ void addNC(uint32_t *z, const uint32_t *x, const uint32_t *y); DEF_ADD(64) DEF_ADD(96) DEF_ADD(128) +DEF_ADD(160) +DEF_ADD(192) +DEF_ADD(224) DEF_ADD(256) #define CAT(S, BIT) "S##BIT" @@ -48,8 +54,11 @@ void benchAdd() CYBOZU_TEST_AUTO(addNC64) { benchAdd<64>(); } CYBOZU_TEST_AUTO(addNC96) { benchAdd<96>(); } CYBOZU_TEST_AUTO(addNC128) { benchAdd<128>(); } +CYBOZU_TEST_AUTO(addNC160) { benchAdd<160>(); } +CYBOZU_TEST_AUTO(addNC192) { benchAdd<192>(); } +CYBOZU_TEST_AUTO(addNC224) { benchAdd<224>(); } CYBOZU_TEST_AUTO(addNC256) { benchAdd<256>(); } -#if 1 +#if 0 CYBOZU_TEST_AUTO(addNC) { using namespace mcl::fp;