author     Haibo Huang <hhb@google.com>                             2018-05-30 15:39:04 -0700
committer  android-build-merger <android-build-merger@google.com>  2018-05-30 15:39:04 -0700
commit     1bb5a5aeaea1d714c129eeccbbc89eb37660e2e5 (patch)
tree       cf8c23cc877981ede77e8d3d49ffdb5b2311dd72
parent     8b8add1e608176033d2a982cf8c609e5de3c6b6a (diff)
parent     d0c330b5d852d9c397fe86845893c46103ecf09e (diff)
Merge "Use cortex-a53/bionic/memmove.S by default for arm64"
am: d0c330b5d8
Change-Id: I70f94eb95130e6f66f48ff9ee90d247455744ede
-rw-r--r--  libc/Android.bp                              |  34
-rw-r--r--  libc/arch-arm64/cortex-a53/bionic/memmove.S  | 153
-rw-r--r--  libc/arch-arm64/denver64/bionic/memmove.S    | 329
-rw-r--r--  libc/arch-arm64/generic/bionic/memmove.S     | 386
4 files changed, 436 insertions, 466 deletions
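For orientation while reading the diff below: this change makes the memmove.S previously built only for cortex-a53/a55/a73/a75 the generic arm64 implementation, keeping the older bidirectional version only for denver64. The comments in the new generic file describe handing everything except large forward overlapping copies to memcpy. The following is a minimal C sketch of that dispatch, not part of the change; memmove_sketch, fast_memcpy and the byte loop are illustrative stand-ins (memmove is used inside fast_memcpy so the sketch stays well-defined, whereas the assembly relies on bionic's memcpy tolerating the overlaps forwarded to it).

```c
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Stand-in for bionic's optimized memcpy, which the assembly comments
 * say copes with the overlap cases forwarded to it; plain ISO C memcpy
 * does not, so memmove keeps the sketch well-defined. */
static void *fast_memcpy(void *dst, const void *src, size_t n) {
    return memmove(dst, src, n);
}

void *memmove_sketch(void *dstin, const void *src, size_t count) {
    uintptr_t diff = (uintptr_t)dstin - (uintptr_t)src;

    if (diff == 0) {
        return dstin;  /* "cbz tmp1, 3f": nothing to do when dst == src */
    }
    /* "b.hs memcpy": copies of at most 96 bytes, backward copies
     * (dst below src) and non-overlapping forward copies all go to
     * memcpy; the unsigned comparison folds the last two cases. */
    if (count <= 96 || diff >= count) {
        return fast_memcpy(dstin, src, count);
    }
    /* Remaining case: a large forward copy whose destination overlaps
     * the tail of the source.  The assembly aligns dstend to 16 bytes
     * and copies 64 bytes per iteration from the end backwards; a byte
     * loop stands in for that unrolled ldp/stp loop here. */
    unsigned char *d = (unsigned char *)dstin + count;
    const unsigned char *s = (const unsigned char *)src + count;
    while (count--) {
        *--d = *--s;
    }
    return dstin;
}
```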
diff --git a/libc/Android.bp b/libc/Android.bp
index 00686acc3..f5e2285c0 100644
--- a/libc/Android.bp
+++ b/libc/Android.bp
@@ -1042,43 +1042,13 @@ cc_library_static {
             denver64: {
                 srcs: [
                     "arch-arm64/denver64/bionic/memcpy.S",
+                    "arch-arm64/denver64/bionic/memmove.S",
                     "arch-arm64/denver64/bionic/memset.S",
                 ],
                 exclude_srcs: [
                     "arch-arm64/generic/bionic/memcpy.S",
-                    "arch-arm64/generic/bionic/memset.S",
-                ],
-            },
-            cortex_a53: {
-                srcs: [
-                    "arch-arm64/cortex-a53/bionic/memmove.S",
-                ],
-                exclude_srcs: [
-                    "arch-arm64/generic/bionic/memmove.S",
-                ],
-            },
-            cortex_a55: {
-                srcs: [
-                    "arch-arm64/cortex-a53/bionic/memmove.S",
-                ],
-                exclude_srcs: [
-                    "arch-arm64/generic/bionic/memmove.S",
-                ],
-            },
-            cortex_a73: {
-                srcs: [
-                    "arch-arm64/cortex-a53/bionic/memmove.S",
-                ],
-                exclude_srcs: [
-                    "arch-arm64/generic/bionic/memmove.S",
-                ],
-            },
-            cortex_a75: {
-                srcs: [
-                    "arch-arm64/cortex-a53/bionic/memmove.S",
-                ],
-                exclude_srcs: [
                     "arch-arm64/generic/bionic/memmove.S",
+                    "arch-arm64/generic/bionic/memset.S",
                 ],
             },
         },
diff --git a/libc/arch-arm64/cortex-a53/bionic/memmove.S b/libc/arch-arm64/cortex-a53/bionic/memmove.S
deleted file mode 100644
index c50112d94..000000000
--- a/libc/arch-arm64/cortex-a53/bionic/memmove.S
+++ /dev/null
@@ -1,153 +0,0 @@
-/* Copyright (c) 2013, Linaro Limited
-   All rights reserved.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are met:
-       * Redistributions of source code must retain the above copyright
-         notice, this list of conditions and the following disclaimer.
-       * Redistributions in binary form must reproduce the above copyright
-         notice, this list of conditions and the following disclaimer in the
-         documentation and/or other materials provided with the distribution.
-       * Neither the name of the Linaro nor the
-         names of its contributors may be used to endorse or promote products
-         derived from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
-
-/*
- * Copyright (c) 2015 ARM Ltd
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- * 3. The name of the company may not be used to endorse or promote
- *    products derived from this software without specific prior written
- *    permission.
- * - * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. - * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED - * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -/* Assumptions: - * - * ARMv8-a, AArch64, unaligned accesses, wchar_t is 4 bytes - */ - -#include <private/bionic_asm.h> - -/* Parameters and result. */ -#define dstin x0 -#define src x1 -#define count x2 -#define srcend x3 -#define dstend x4 -#define tmp1 x5 -#define A_l x6 -#define A_h x7 -#define B_l x8 -#define B_h x9 -#define C_l x10 -#define C_h x11 -#define D_l x12 -#define D_h x13 -#define E_l count -#define E_h tmp1 - -/* All memmoves up to 96 bytes are done by memcpy as it supports overlaps. - Larger backwards copies are also handled by memcpy. The only remaining - case is forward large copies. The destination is aligned, and an - unrolled loop processes 64 bytes per iteration. -*/ - -#if defined(WMEMMOVE) -ENTRY(wmemmove) - lsl count, count, #2 -#else -ENTRY(memmove) -#endif - sub tmp1, dstin, src - cmp count, 96 - ccmp tmp1, count, 2, hi - b.hs memcpy - - cbz tmp1, 3f - add dstend, dstin, count - add srcend, src, count - - /* Align dstend to 16 byte alignment so that we don't cross cache line - boundaries on both loads and stores. There are at least 96 bytes - to copy, so copy 16 bytes unaligned and then align. The loop - copies 64 bytes per iteration and prefetches one iteration ahead. */ - - and tmp1, dstend, 15 - ldp D_l, D_h, [srcend, -16] - sub srcend, srcend, tmp1 - sub count, count, tmp1 - ldp A_l, A_h, [srcend, -16] - stp D_l, D_h, [dstend, -16] - ldp B_l, B_h, [srcend, -32] - ldp C_l, C_h, [srcend, -48] - ldp D_l, D_h, [srcend, -64]! - sub dstend, dstend, tmp1 - subs count, count, 128 - b.ls 2f - nop -1: - stp A_l, A_h, [dstend, -16] - ldp A_l, A_h, [srcend, -16] - stp B_l, B_h, [dstend, -32] - ldp B_l, B_h, [srcend, -32] - stp C_l, C_h, [dstend, -48] - ldp C_l, C_h, [srcend, -48] - stp D_l, D_h, [dstend, -64]! - ldp D_l, D_h, [srcend, -64]! - subs count, count, 64 - b.hi 1b - - /* Write the last full set of 64 bytes. The remainder is at most 64 - bytes, so it is safe to always copy 64 bytes from the start even if - there is just 1 byte left. */ -2: - ldp E_l, E_h, [src, 48] - stp A_l, A_h, [dstend, -16] - ldp A_l, A_h, [src, 32] - stp B_l, B_h, [dstend, -32] - ldp B_l, B_h, [src, 16] - stp C_l, C_h, [dstend, -48] - ldp C_l, C_h, [src] - stp D_l, D_h, [dstend, -64] - stp E_l, E_h, [dstin, 48] - stp A_l, A_h, [dstin, 32] - stp B_l, B_h, [dstin, 16] - stp C_l, C_h, [dstin] -3: ret - -#if defined(WMEMMOVE) -END(wmemmove) -#else -END(memmove) -#endif diff --git a/libc/arch-arm64/denver64/bionic/memmove.S b/libc/arch-arm64/denver64/bionic/memmove.S new file mode 100644 index 000000000..739ce4982 --- /dev/null +++ b/libc/arch-arm64/denver64/bionic/memmove.S @@ -0,0 +1,329 @@ +/* Copyright (c) 2014, Linaro Limited + All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the Linaro nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* Assumptions: + * + * ARMv8-a, AArch64 + * Unaligned accesses + * wchar_t is 4 bytes + */ + +#include <private/bionic_asm.h> + +/* Parameters and result. */ +#define dstin x0 +#define src x1 +#define count x2 +#define tmp1 x3 +#define tmp1w w3 +#define tmp2 x4 +#define tmp2w w4 +#define tmp3 x5 +#define tmp3w w5 +#define dst x6 + +#define A_l x7 +#define A_h x8 +#define B_l x9 +#define B_h x10 +#define C_l x11 +#define C_h x12 +#define D_l x13 +#define D_h x14 + +#if defined(WMEMMOVE) +ENTRY(wmemmove) + lsl count, count, #2 +#else +ENTRY(memmove) +#endif + cmp dstin, src + b.lo .Ldownwards + add tmp1, src, count + cmp dstin, tmp1 + b.hs memcpy /* No overlap. */ + + /* Upwards move with potential overlap. + * Need to move from the tail backwards. SRC and DST point one + * byte beyond the remaining data to move. */ + add dst, dstin, count + add src, src, count + cmp count, #64 + b.ge .Lmov_not_short_up + + /* Deal with small moves quickly by dropping straight into the + * exit block. */ +.Ltail63up: + /* Move up to 48 bytes of data. At this point we only need the + * bottom 6 bits of count to be accurate. */ + ands tmp1, count, #0x30 + b.eq .Ltail15up + sub dst, dst, tmp1 + sub src, src, tmp1 + cmp tmp1w, #0x20 + b.eq 1f + b.lt 2f + ldp A_l, A_h, [src, #32] + stp A_l, A_h, [dst, #32] +1: + ldp A_l, A_h, [src, #16] + stp A_l, A_h, [dst, #16] +2: + ldp A_l, A_h, [src] + stp A_l, A_h, [dst] +.Ltail15up: + /* Move up to 15 bytes of data. Does not assume additional data + * being moved. */ + tbz count, #3, 1f + ldr tmp1, [src, #-8]! + str tmp1, [dst, #-8]! +1: + tbz count, #2, 1f + ldr tmp1w, [src, #-4]! + str tmp1w, [dst, #-4]! +1: + tbz count, #1, 1f + ldrh tmp1w, [src, #-2]! + strh tmp1w, [dst, #-2]! +1: + tbz count, #0, 1f + ldrb tmp1w, [src, #-1] + strb tmp1w, [dst, #-1] +1: + ret + +.Lmov_not_short_up: + /* We don't much care about the alignment of DST, but we want SRC + * to be 128-bit (16 byte) aligned so that we don't cross cache line + * boundaries on both loads and stores. */ + ands tmp2, src, #15 /* Bytes to reach alignment. 
*/ + b.eq 2f + sub count, count, tmp2 + /* Move enough data to reach alignment; unlike memcpy, we have to + * be aware of the overlap, which means we can't move data twice. */ + tbz tmp2, #3, 1f + ldr tmp1, [src, #-8]! + str tmp1, [dst, #-8]! +1: + tbz tmp2, #2, 1f + ldr tmp1w, [src, #-4]! + str tmp1w, [dst, #-4]! +1: + tbz tmp2, #1, 1f + ldrh tmp1w, [src, #-2]! + strh tmp1w, [dst, #-2]! +1: + tbz tmp2, #0, 1f + ldrb tmp1w, [src, #-1]! + strb tmp1w, [dst, #-1]! +1: + + /* There may be less than 63 bytes to go now. */ + cmp count, #63 + b.le .Ltail63up +2: + subs count, count, #128 + b.ge .Lmov_body_large_up + /* Less than 128 bytes to move, so handle 64 here and then jump + * to the tail. */ + ldp A_l, A_h, [src, #-64]! + ldp B_l, B_h, [src, #16] + ldp C_l, C_h, [src, #32] + ldp D_l, D_h, [src, #48] + stp A_l, A_h, [dst, #-64]! + stp B_l, B_h, [dst, #16] + stp C_l, C_h, [dst, #32] + stp D_l, D_h, [dst, #48] + tst count, #0x3f + b.ne .Ltail63up + ret + + /* Critical loop. Start at a new Icache line boundary. Assuming + * 64 bytes per line this ensures the entire loop is in one line. */ + .p2align 6 +.Lmov_body_large_up: + /* There are at least 128 bytes to move. */ + ldp A_l, A_h, [src, #-16] + ldp B_l, B_h, [src, #-32] + ldp C_l, C_h, [src, #-48] + ldp D_l, D_h, [src, #-64]! +1: + stp A_l, A_h, [dst, #-16] + ldp A_l, A_h, [src, #-16] + stp B_l, B_h, [dst, #-32] + ldp B_l, B_h, [src, #-32] + stp C_l, C_h, [dst, #-48] + ldp C_l, C_h, [src, #-48] + stp D_l, D_h, [dst, #-64]! + ldp D_l, D_h, [src, #-64]! + subs count, count, #64 + b.ge 1b + stp A_l, A_h, [dst, #-16] + stp B_l, B_h, [dst, #-32] + stp C_l, C_h, [dst, #-48] + stp D_l, D_h, [dst, #-64]! + tst count, #0x3f + b.ne .Ltail63up + ret + + +.Ldownwards: + /* For a downwards move we can safely use memcpy provided that + * DST is more than 16 bytes away from SRC. */ + sub tmp1, src, #16 + cmp dstin, tmp1 + b.ls memcpy /* May overlap, but not critically. */ + + mov dst, dstin /* Preserve DSTIN for return value. */ + cmp count, #64 + b.ge .Lmov_not_short_down + + /* Deal with small moves quickly by dropping straight into the + * exit block. */ +.Ltail63down: + /* Move up to 48 bytes of data. At this point we only need the + * bottom 6 bits of count to be accurate. */ + ands tmp1, count, #0x30 + b.eq .Ltail15down + add dst, dst, tmp1 + add src, src, tmp1 + cmp tmp1w, #0x20 + b.eq 1f + b.lt 2f + ldp A_l, A_h, [src, #-48] + stp A_l, A_h, [dst, #-48] +1: + ldp A_l, A_h, [src, #-32] + stp A_l, A_h, [dst, #-32] +2: + ldp A_l, A_h, [src, #-16] + stp A_l, A_h, [dst, #-16] +.Ltail15down: + /* Move up to 15 bytes of data. Does not assume additional data + being moved. */ + tbz count, #3, 1f + ldr tmp1, [src], #8 + str tmp1, [dst], #8 +1: + tbz count, #2, 1f + ldr tmp1w, [src], #4 + str tmp1w, [dst], #4 +1: + tbz count, #1, 1f + ldrh tmp1w, [src], #2 + strh tmp1w, [dst], #2 +1: + tbz count, #0, 1f + ldrb tmp1w, [src] + strb tmp1w, [dst] +1: + ret + +.Lmov_not_short_down: + /* We don't much care about the alignment of DST, but we want SRC + * to be 128-bit (16 byte) aligned so that we don't cross cache line + * boundaries on both loads and stores. */ + neg tmp2, src + ands tmp2, tmp2, #15 /* Bytes to reach alignment. */ + b.eq 2f + sub count, count, tmp2 + /* Move enough data to reach alignment; unlike memcpy, we have to + * be aware of the overlap, which means we can't move data twice. 
*/ + tbz tmp2, #3, 1f + ldr tmp1, [src], #8 + str tmp1, [dst], #8 +1: + tbz tmp2, #2, 1f + ldr tmp1w, [src], #4 + str tmp1w, [dst], #4 +1: + tbz tmp2, #1, 1f + ldrh tmp1w, [src], #2 + strh tmp1w, [dst], #2 +1: + tbz tmp2, #0, 1f + ldrb tmp1w, [src], #1 + strb tmp1w, [dst], #1 +1: + + /* There may be less than 63 bytes to go now. */ + cmp count, #63 + b.le .Ltail63down +2: + subs count, count, #128 + b.ge .Lmov_body_large_down + /* Less than 128 bytes to move, so handle 64 here and then jump + * to the tail. */ + ldp A_l, A_h, [src] + ldp B_l, B_h, [src, #16] + ldp C_l, C_h, [src, #32] + ldp D_l, D_h, [src, #48] + stp A_l, A_h, [dst] + stp B_l, B_h, [dst, #16] + stp C_l, C_h, [dst, #32] + stp D_l, D_h, [dst, #48] + tst count, #0x3f + add src, src, #64 + add dst, dst, #64 + b.ne .Ltail63down + ret + + /* Critical loop. Start at a new cache line boundary. Assuming + * 64 bytes per line this ensures the entire loop is in one line. */ + .p2align 6 +.Lmov_body_large_down: + /* There are at least 128 bytes to move. */ + ldp A_l, A_h, [src, #0] + sub dst, dst, #16 /* Pre-bias. */ + ldp B_l, B_h, [src, #16] + ldp C_l, C_h, [src, #32] + ldp D_l, D_h, [src, #48]! /* src += 64 - Pre-bias. */ +1: + stp A_l, A_h, [dst, #16] + ldp A_l, A_h, [src, #16] + stp B_l, B_h, [dst, #32] + ldp B_l, B_h, [src, #32] + stp C_l, C_h, [dst, #48] + ldp C_l, C_h, [src, #48] + stp D_l, D_h, [dst, #64]! + ldp D_l, D_h, [src, #64]! + subs count, count, #64 + b.ge 1b + stp A_l, A_h, [dst, #16] + stp B_l, B_h, [dst, #32] + stp C_l, C_h, [dst, #48] + stp D_l, D_h, [dst, #64] + add src, src, #16 + add dst, dst, #64 + 16 + tst count, #0x3f + b.ne .Ltail63down + ret +#if defined(WMEMMOVE) +END(wmemmove) +#else +END(memmove) +#endif diff --git a/libc/arch-arm64/generic/bionic/memmove.S b/libc/arch-arm64/generic/bionic/memmove.S index 739ce4982..c50112d94 100644 --- a/libc/arch-arm64/generic/bionic/memmove.S +++ b/libc/arch-arm64/generic/bionic/memmove.S @@ -1,4 +1,4 @@ -/* Copyright (c) 2014, Linaro Limited +/* Copyright (c) 2013, Linaro Limited All rights reserved. Redistribution and use in source and binary forms, with or without @@ -22,14 +22,39 @@ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*/ + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ + +/* + * Copyright (c) 2015 ARM Ltd + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the company may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED + * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ /* Assumptions: * - * ARMv8-a, AArch64 - * Unaligned accesses - * wchar_t is 4 bytes + * ARMv8-a, AArch64, unaligned accesses, wchar_t is 4 bytes */ #include <private/bionic_asm.h> @@ -38,22 +63,25 @@ #define dstin x0 #define src x1 #define count x2 -#define tmp1 x3 -#define tmp1w w3 -#define tmp2 x4 -#define tmp2w w4 -#define tmp3 x5 -#define tmp3w w5 -#define dst x6 - -#define A_l x7 -#define A_h x8 -#define B_l x9 -#define B_h x10 -#define C_l x11 -#define C_h x12 -#define D_l x13 -#define D_h x14 +#define srcend x3 +#define dstend x4 +#define tmp1 x5 +#define A_l x6 +#define A_h x7 +#define B_l x8 +#define B_h x9 +#define C_l x10 +#define C_h x11 +#define D_l x12 +#define D_h x13 +#define E_l count +#define E_h tmp1 + +/* All memmoves up to 96 bytes are done by memcpy as it supports overlaps. + Larger backwards copies are also handled by memcpy. The only remaining + case is forward large copies. The destination is aligned, and an + unrolled loop processes 64 bytes per iteration. +*/ #if defined(WMEMMOVE) ENTRY(wmemmove) @@ -61,267 +89,63 @@ ENTRY(wmemmove) #else ENTRY(memmove) #endif - cmp dstin, src - b.lo .Ldownwards - add tmp1, src, count - cmp dstin, tmp1 - b.hs memcpy /* No overlap. */ - - /* Upwards move with potential overlap. - * Need to move from the tail backwards. SRC and DST point one - * byte beyond the remaining data to move. */ - add dst, dstin, count - add src, src, count - cmp count, #64 - b.ge .Lmov_not_short_up - - /* Deal with small moves quickly by dropping straight into the - * exit block. */ -.Ltail63up: - /* Move up to 48 bytes of data. At this point we only need the - * bottom 6 bits of count to be accurate. */ - ands tmp1, count, #0x30 - b.eq .Ltail15up - sub dst, dst, tmp1 - sub src, src, tmp1 - cmp tmp1w, #0x20 - b.eq 1f - b.lt 2f - ldp A_l, A_h, [src, #32] - stp A_l, A_h, [dst, #32] -1: - ldp A_l, A_h, [src, #16] - stp A_l, A_h, [dst, #16] -2: - ldp A_l, A_h, [src] - stp A_l, A_h, [dst] -.Ltail15up: - /* Move up to 15 bytes of data. Does not assume additional data - * being moved. */ - tbz count, #3, 1f - ldr tmp1, [src, #-8]! - str tmp1, [dst, #-8]! -1: - tbz count, #2, 1f - ldr tmp1w, [src, #-4]! - str tmp1w, [dst, #-4]! -1: - tbz count, #1, 1f - ldrh tmp1w, [src, #-2]! - strh tmp1w, [dst, #-2]! -1: - tbz count, #0, 1f - ldrb tmp1w, [src, #-1] - strb tmp1w, [dst, #-1] -1: - ret - -.Lmov_not_short_up: - /* We don't much care about the alignment of DST, but we want SRC - * to be 128-bit (16 byte) aligned so that we don't cross cache line - * boundaries on both loads and stores. */ - ands tmp2, src, #15 /* Bytes to reach alignment. */ - b.eq 2f - sub count, count, tmp2 - /* Move enough data to reach alignment; unlike memcpy, we have to - * be aware of the overlap, which means we can't move data twice. */ - tbz tmp2, #3, 1f - ldr tmp1, [src, #-8]! - str tmp1, [dst, #-8]! -1: - tbz tmp2, #2, 1f - ldr tmp1w, [src, #-4]! - str tmp1w, [dst, #-4]! -1: - tbz tmp2, #1, 1f - ldrh tmp1w, [src, #-2]! - strh tmp1w, [dst, #-2]! 
-1: - tbz tmp2, #0, 1f - ldrb tmp1w, [src, #-1]! - strb tmp1w, [dst, #-1]! -1: - - /* There may be less than 63 bytes to go now. */ - cmp count, #63 - b.le .Ltail63up + sub tmp1, dstin, src + cmp count, 96 + ccmp tmp1, count, 2, hi + b.hs memcpy + + cbz tmp1, 3f + add dstend, dstin, count + add srcend, src, count + + /* Align dstend to 16 byte alignment so that we don't cross cache line + boundaries on both loads and stores. There are at least 96 bytes + to copy, so copy 16 bytes unaligned and then align. The loop + copies 64 bytes per iteration and prefetches one iteration ahead. */ + + and tmp1, dstend, 15 + ldp D_l, D_h, [srcend, -16] + sub srcend, srcend, tmp1 + sub count, count, tmp1 + ldp A_l, A_h, [srcend, -16] + stp D_l, D_h, [dstend, -16] + ldp B_l, B_h, [srcend, -32] + ldp C_l, C_h, [srcend, -48] + ldp D_l, D_h, [srcend, -64]! + sub dstend, dstend, tmp1 + subs count, count, 128 + b.ls 2f + nop +1: + stp A_l, A_h, [dstend, -16] + ldp A_l, A_h, [srcend, -16] + stp B_l, B_h, [dstend, -32] + ldp B_l, B_h, [srcend, -32] + stp C_l, C_h, [dstend, -48] + ldp C_l, C_h, [srcend, -48] + stp D_l, D_h, [dstend, -64]! + ldp D_l, D_h, [srcend, -64]! + subs count, count, 64 + b.hi 1b + + /* Write the last full set of 64 bytes. The remainder is at most 64 + bytes, so it is safe to always copy 64 bytes from the start even if + there is just 1 byte left. */ 2: - subs count, count, #128 - b.ge .Lmov_body_large_up - /* Less than 128 bytes to move, so handle 64 here and then jump - * to the tail. */ - ldp A_l, A_h, [src, #-64]! - ldp B_l, B_h, [src, #16] - ldp C_l, C_h, [src, #32] - ldp D_l, D_h, [src, #48] - stp A_l, A_h, [dst, #-64]! - stp B_l, B_h, [dst, #16] - stp C_l, C_h, [dst, #32] - stp D_l, D_h, [dst, #48] - tst count, #0x3f - b.ne .Ltail63up - ret + ldp E_l, E_h, [src, 48] + stp A_l, A_h, [dstend, -16] + ldp A_l, A_h, [src, 32] + stp B_l, B_h, [dstend, -32] + ldp B_l, B_h, [src, 16] + stp C_l, C_h, [dstend, -48] + ldp C_l, C_h, [src] + stp D_l, D_h, [dstend, -64] + stp E_l, E_h, [dstin, 48] + stp A_l, A_h, [dstin, 32] + stp B_l, B_h, [dstin, 16] + stp C_l, C_h, [dstin] +3: ret - /* Critical loop. Start at a new Icache line boundary. Assuming - * 64 bytes per line this ensures the entire loop is in one line. */ - .p2align 6 -.Lmov_body_large_up: - /* There are at least 128 bytes to move. */ - ldp A_l, A_h, [src, #-16] - ldp B_l, B_h, [src, #-32] - ldp C_l, C_h, [src, #-48] - ldp D_l, D_h, [src, #-64]! -1: - stp A_l, A_h, [dst, #-16] - ldp A_l, A_h, [src, #-16] - stp B_l, B_h, [dst, #-32] - ldp B_l, B_h, [src, #-32] - stp C_l, C_h, [dst, #-48] - ldp C_l, C_h, [src, #-48] - stp D_l, D_h, [dst, #-64]! - ldp D_l, D_h, [src, #-64]! - subs count, count, #64 - b.ge 1b - stp A_l, A_h, [dst, #-16] - stp B_l, B_h, [dst, #-32] - stp C_l, C_h, [dst, #-48] - stp D_l, D_h, [dst, #-64]! - tst count, #0x3f - b.ne .Ltail63up - ret - - -.Ldownwards: - /* For a downwards move we can safely use memcpy provided that - * DST is more than 16 bytes away from SRC. */ - sub tmp1, src, #16 - cmp dstin, tmp1 - b.ls memcpy /* May overlap, but not critically. */ - - mov dst, dstin /* Preserve DSTIN for return value. */ - cmp count, #64 - b.ge .Lmov_not_short_down - - /* Deal with small moves quickly by dropping straight into the - * exit block. */ -.Ltail63down: - /* Move up to 48 bytes of data. At this point we only need the - * bottom 6 bits of count to be accurate. 
*/ - ands tmp1, count, #0x30 - b.eq .Ltail15down - add dst, dst, tmp1 - add src, src, tmp1 - cmp tmp1w, #0x20 - b.eq 1f - b.lt 2f - ldp A_l, A_h, [src, #-48] - stp A_l, A_h, [dst, #-48] -1: - ldp A_l, A_h, [src, #-32] - stp A_l, A_h, [dst, #-32] -2: - ldp A_l, A_h, [src, #-16] - stp A_l, A_h, [dst, #-16] -.Ltail15down: - /* Move up to 15 bytes of data. Does not assume additional data - being moved. */ - tbz count, #3, 1f - ldr tmp1, [src], #8 - str tmp1, [dst], #8 -1: - tbz count, #2, 1f - ldr tmp1w, [src], #4 - str tmp1w, [dst], #4 -1: - tbz count, #1, 1f - ldrh tmp1w, [src], #2 - strh tmp1w, [dst], #2 -1: - tbz count, #0, 1f - ldrb tmp1w, [src] - strb tmp1w, [dst] -1: - ret - -.Lmov_not_short_down: - /* We don't much care about the alignment of DST, but we want SRC - * to be 128-bit (16 byte) aligned so that we don't cross cache line - * boundaries on both loads and stores. */ - neg tmp2, src - ands tmp2, tmp2, #15 /* Bytes to reach alignment. */ - b.eq 2f - sub count, count, tmp2 - /* Move enough data to reach alignment; unlike memcpy, we have to - * be aware of the overlap, which means we can't move data twice. */ - tbz tmp2, #3, 1f - ldr tmp1, [src], #8 - str tmp1, [dst], #8 -1: - tbz tmp2, #2, 1f - ldr tmp1w, [src], #4 - str tmp1w, [dst], #4 -1: - tbz tmp2, #1, 1f - ldrh tmp1w, [src], #2 - strh tmp1w, [dst], #2 -1: - tbz tmp2, #0, 1f - ldrb tmp1w, [src], #1 - strb tmp1w, [dst], #1 -1: - - /* There may be less than 63 bytes to go now. */ - cmp count, #63 - b.le .Ltail63down -2: - subs count, count, #128 - b.ge .Lmov_body_large_down - /* Less than 128 bytes to move, so handle 64 here and then jump - * to the tail. */ - ldp A_l, A_h, [src] - ldp B_l, B_h, [src, #16] - ldp C_l, C_h, [src, #32] - ldp D_l, D_h, [src, #48] - stp A_l, A_h, [dst] - stp B_l, B_h, [dst, #16] - stp C_l, C_h, [dst, #32] - stp D_l, D_h, [dst, #48] - tst count, #0x3f - add src, src, #64 - add dst, dst, #64 - b.ne .Ltail63down - ret - - /* Critical loop. Start at a new cache line boundary. Assuming - * 64 bytes per line this ensures the entire loop is in one line. */ - .p2align 6 -.Lmov_body_large_down: - /* There are at least 128 bytes to move. */ - ldp A_l, A_h, [src, #0] - sub dst, dst, #16 /* Pre-bias. */ - ldp B_l, B_h, [src, #16] - ldp C_l, C_h, [src, #32] - ldp D_l, D_h, [src, #48]! /* src += 64 - Pre-bias. */ -1: - stp A_l, A_h, [dst, #16] - ldp A_l, A_h, [src, #16] - stp B_l, B_h, [dst, #32] - ldp B_l, B_h, [src, #32] - stp C_l, C_h, [dst, #48] - ldp C_l, C_h, [src, #48] - stp D_l, D_h, [dst, #64]! - ldp D_l, D_h, [src, #64]! - subs count, count, #64 - b.ge 1b - stp A_l, A_h, [dst, #16] - stp B_l, B_h, [dst, #32] - stp C_l, C_h, [dst, #48] - stp D_l, D_h, [dst, #64] - add src, src, #16 - add dst, dst, #64 + 16 - tst count, #0x3f - b.ne .Ltail63down - ret #if defined(WMEMMOVE) END(wmemmove) #else |
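A detail of the denver64 implementation retained above that is easy to miss in the assembly: the .Ltail15up/.Ltail15down blocks finish a move by testing individual bits of count (the tbz instructions) rather than looping, since at most 15 bytes remain once the bulk loop is done. Below is a rough C equivalent of the forward (.Ltail15down) variant; the helper names are made up for illustration, and copy_chunk loads into a temporary before storing, the C analogue of the ldr/str register pairs, so overlapping chunks stay well-defined.

```c
#include <stddef.h>
#include <string.h>

/* Copy n <= 8 bytes via a temporary, mirroring a single ldr/str pair. */
static void copy_chunk(unsigned char *dst, const unsigned char *src, size_t n) {
    unsigned char tmp[8];
    memcpy(tmp, src, n);
    memcpy(dst, tmp, n);
}

/* Forward tail copy: the low four bits of count describe the remaining
 * 0..15 bytes exactly, so each bit is tested once and the matching
 * 8-, 4-, 2- or 1-byte chunk is copied. */
static void copy_tail15_forward(unsigned char *dst, const unsigned char *src,
                                size_t count) {
    if (count & 8) { copy_chunk(dst, src, 8); dst += 8; src += 8; }  /* tbz count, #3 */
    if (count & 4) { copy_chunk(dst, src, 4); dst += 4; src += 4; }  /* tbz count, #2 */
    if (count & 2) { copy_chunk(dst, src, 2); dst += 2; src += 2; }  /* tbz count, #1 */
    if (count & 1) { *dst = *src; }                                  /* tbz count, #0 */
}
```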