author | Danny Lin <danny@kdrag0n.dev> | 2021-03-21 01:51:00 -0700
---|---|---
committer | alk3pInjection <webmaster@raspii.tech> | 2021-09-27 21:17:05 +0800
commit | 03559b84468a98cd72150a66092ec463035bb666 (patch) |
tree | bb321caca05a1413985d29560130bde2550b5fc7 |
parent | ba602b8990ff1b695ea580a537a07a07224da937 (diff) |
[ProtonAOSP] Switch to arm-optimized-routines for arm64 (lineage-18.1)
This replaces the following string functions with implementations from
arm-optimized-routines [1] (naive reference sketches for the less common
of these follow the list):
- memchr
- strchr
- strcmp
- strlen
- strncmp
- strnlen
- memcmp
- memcpy (NEON variant)
- memmove
- memrchr
- memset
- stpcpy
- strchrnul
- strcpy
- strrchr
- wmemmove
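For readers less familiar with the newer additions (memrchr, strchrnul, stpcpy), the sketch below shows their expected semantics as naive C references. This is illustrative only and not part of the patch; the ref_* names are made up for the example.

```c
#include <stddef.h>

/* Naive reference for memrchr(): last occurrence of c in the first n bytes,
   or NULL if it does not occur. */
static void *ref_memrchr(const void *s, int c, size_t n) {
    const unsigned char *p = (const unsigned char *)s;
    while (n--) {
        if (p[n] == (unsigned char)c) return (void *)(p + n);
    }
    return NULL;
}

/* Naive reference for strchrnul(): like strchr(), but returns a pointer to
   the terminating NUL instead of NULL when c is not found. */
static char *ref_strchrnul(const char *s, int c) {
    while (*s != (char)c && *s != '\0') s++;
    return (char *)s;
}

/* Naive reference for stpcpy(): copies src (including the NUL) to dst and
   returns a pointer to the NUL written into dst, not to the start of dst. */
static char *ref_stpcpy(char *dst, const char *src) {
    while ((*dst = *src) != '\0') { dst++; src++; }
    return dst;
}
```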
All string functions supported by arm-optimized-routines were added as
of upstream commit e823e3abf5f89ecba58a10fc0fd82c13d9984b6b. The NEON
variant of memcpy was used with the assumption that all of our arm64
targets support NEON.
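That assumption can be spot-checked on a device with the kernel's hardware capability bits. A minimal sketch, assuming a Linux/bionic AArch64 userspace where <sys/auxv.h> provides getauxval()/AT_HWCAP and <asm/hwcap.h> defines HWCAP_ASIMD:

```c
#include <stdio.h>
#if defined(__aarch64__)
#include <sys/auxv.h>   /* getauxval(), AT_HWCAP */
#include <asm/hwcap.h>  /* HWCAP_ASIMD */
#endif

int main(void) {
#if defined(__aarch64__)
    /* Advanced SIMD ("NEON") support is reported by the kernel via HWCAP_ASIMD. */
    int has_asimd = (getauxval(AT_HWCAP) & HWCAP_ASIMD) != 0;
    printf("Advanced SIMD (NEON): %s\n", has_asimd ? "supported" : "not supported");
#else
    printf("not an AArch64 build\n");
#endif
    return 0;
}
```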
[1] https://github.com/ARM-software/optimized-routines
Test: builds and boots
Change-Id: Id43fcf54c459b7edab306030c668d1c1ce3d95cd
20 files changed, 1322 insertions, 1358 deletions
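Several of the routines in the diff below (strlen, strncmp, strnlen) detect a NUL byte a word at a time using the identity described in their comments: (X - 1) & ~X & 0x80..80 is non-zero iff the word contains a zero byte. A short C illustration of that trick (an editorial sketch assuming a GCC/Clang builtin, not part of the patch):

```c
#include <stdint.h>

#define REP8_01 0x0101010101010101ULL
#define REP8_80 0x8080808080808080ULL

/* Non-zero iff some byte of x is zero. Bits can also be set for bytes above
   the first zero (e.g. a 0x01 byte that absorbs a borrow), but a non-zero
   result always means the word contains a zero byte. */
static inline uint64_t has_zero_byte(uint64_t x) {
    return (x - REP8_01) & ~x & REP8_80;
}

/* On a little-endian load, bytes below the first zero are unaffected by
   borrows, so the trailing-zero count of the syndrome locates it exactly.
   (The assembly uses rbit + clz, which computes the same thing.)
   Only meaningful when has_zero_byte(x) != 0. */
static inline unsigned first_zero_byte(uint64_t x) {
    return (unsigned)(__builtin_ctzll(has_zero_byte(x)) >> 3);
}
```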
diff --git a/libc/Android.bp b/libc/Android.bp index 08fce3261..a422c26c5 100644 --- a/libc/Android.bp +++ b/libc/Android.bp @@ -619,6 +619,7 @@ cc_library_static { arm64: { exclude_srcs: [ "upstream-openbsd/lib/libc/string/memchr.c", + "upstream-openbsd/lib/libc/string/memrchr.c", "upstream-openbsd/lib/libc/string/stpcpy.c", "upstream-openbsd/lib/libc/string/strcpy.c", "upstream-openbsd/lib/libc/string/strncmp.c", @@ -851,9 +852,12 @@ cc_library_static { "arch-arm64/generic/bionic/memcmp.S", "arch-arm64/generic/bionic/memcpy.S", "arch-arm64/generic/bionic/memmove.S", + "arch-arm64/generic/bionic/memrchr.S", "arch-arm64/generic/bionic/memset.S", "arch-arm64/generic/bionic/stpcpy.S", + "arch-arm64/generic/bionic/strchrnul.S", "arch-arm64/generic/bionic/strcpy.S", + "arch-arm64/generic/bionic/strrchr.S", "arch-arm64/generic/bionic/wmemmove.S", "arch-arm64/default/bionic/memchr.S", @@ -880,6 +884,7 @@ cc_library_static { "bionic/__memcpy_chk.cpp", "bionic/strchr.cpp", "bionic/strnlen.c", + "bionic/strrchr.cpp", ], }, @@ -1178,6 +1183,14 @@ cc_library_static { "bionic/icu_static.cpp", ], + arch: { + arm64: { + exclude_srcs: [ + "bionic/strchrnul.cpp", + ], + }, + }, + multilib: { lib32: { // LP32 cruft diff --git a/libc/arch-arm64/default/bionic/memchr.S b/libc/arch-arm64/default/bionic/memchr.S index 7fbcc8fd4..f5599bf2d 100644 --- a/libc/arch-arm64/default/bionic/memchr.S +++ b/libc/arch-arm64/default/bionic/memchr.S @@ -1,32 +1,9 @@ /* + * memchr - find a character in a memory zone * - Copyright (c) 2014, ARM Limited - All rights Reserved. - Copyright (c) 2014, Linaro Ltd. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the company nor the names of its contributors - may be used to endorse or promote products derived from this - software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*/ + * Copyright (c) 2014-2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ /* Assumptions: * @@ -69,12 +46,15 @@ * identify exactly which byte has matched. */ -ENTRY(memchr_default) +ENTRY (memchr_default) + PTR_ARG (0) + SIZE_ARG (2) + /* Do not dereference srcin if no bytes to compare. */ + cbz cntin, L(zero_length) /* * Magic constant 0x40100401 allows us to identify which lane matches * the requested byte. 
*/ - cbz cntin, .Lzero_length mov wtmp2, #0x0401 movk wtmp2, #0x4010, lsl #16 dup vrepchr.16b, chrin @@ -83,7 +63,7 @@ ENTRY(memchr_default) dup vrepmask.4s, wtmp2 ands soff, srcin, #31 and cntrem, cntin, #31 - b.eq .Lloop + b.eq L(loop) /* * Input string is not 32-byte aligned. We calculate the syndrome @@ -106,25 +86,25 @@ ENTRY(memchr_default) lsr synd, synd, tmp lsl synd, synd, tmp /* The first block can also be the last */ - b.ls .Lmasklast + b.ls L(masklast) /* Have we found something already? */ - cbnz synd, .Ltail + cbnz synd, L(tail) -.Lloop: +L(loop): ld1 {vdata1.16b, vdata2.16b}, [src], #32 subs cntin, cntin, #32 cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b /* If we're out of data we finish regardless of the result */ - b.ls .Lend + b.ls L(end) /* Use a fast check for the termination condition */ orr vend.16b, vhas_chr1.16b, vhas_chr2.16b addp vend.2d, vend.2d, vend.2d mov synd, vend.d[0] /* We're not out of data, loop if we haven't found the character */ - cbz synd, .Lloop + cbz synd, L(loop) -.Lend: +L(end): /* Termination condition found, let's calculate the syndrome value */ and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b @@ -132,9 +112,9 @@ ENTRY(memchr_default) addp vend.16b, vend.16b, vend.16b /* 128->64 */ mov synd, vend.d[0] /* Only do the clear for the last possible block */ - b.hi .Ltail + b.hs L(tail) -.Lmasklast: +L(masklast): /* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */ add tmp, cntrem, soff and tmp, tmp, #31 @@ -143,7 +123,7 @@ ENTRY(memchr_default) lsl synd, synd, tmp lsr synd, synd, tmp -.Ltail: +L(tail): /* Count the trailing zeros using bit reversing */ rbit synd, synd /* Compensate the last post-increment */ @@ -158,7 +138,9 @@ ENTRY(memchr_default) csel result, xzr, result, eq ret -.Lzero_length: - mov result, xzr +L(zero_length): + mov result, #0 ret -END(memchr_default) + +END (memchr_default) + diff --git a/libc/arch-arm64/default/bionic/strchr.S b/libc/arch-arm64/default/bionic/strchr.S index f8cb724cc..ae522e81d 100644 --- a/libc/arch-arm64/default/bionic/strchr.S +++ b/libc/arch-arm64/default/bionic/strchr.S @@ -1,32 +1,9 @@ /* + * strchr - find a character in a string * - Copyright (c) 2014, ARM Limited - All rights Reserved. - Copyright (c) 2014, Linaro Ltd. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the company nor the names of its contributors - may be used to endorse or promote products derived from this - software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*/ + * Copyright (c) 2014-2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ /* Assumptions: * @@ -73,18 +50,19 @@ /* Locals and temporaries. */ -ENTRY(strchr_default) - /* Magic constant 0x40100401 to allow us to identify which lane - matches the requested byte. Magic constant 0x80200802 used - similarly for NUL termination. */ - mov wtmp2, #0x0401 - movk wtmp2, #0x4010, lsl #16 +ENTRY (strchr_default) + PTR_ARG (0) + /* Magic constant 0xc0300c03 to allow us to identify which lane + matches the requested byte. Even bits are set if the character + matches, odd bits if either the char is NUL or matches. */ + mov wtmp2, 0x0c03 + movk wtmp2, 0xc030, lsl 16 dup vrepchr.16b, chrin bic src, srcin, #31 /* Work with aligned 32-byte hunks. */ dup vrepmask_c.4s, wtmp2 ands tmp1, srcin, #31 add vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */ - b.eq .Lloop + b.eq L(loop) /* Input string is not 32-byte aligned. Rather than forcing the padding bytes to a safe value, we calculate the syndrome @@ -96,12 +74,10 @@ ENTRY(strchr_default) cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b cmeq vhas_nul2.16b, vdata2.16b, #0 cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b - and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b - and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b - and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b - and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b - orr vend1.16b, vhas_nul1.16b, vhas_chr1.16b - orr vend2.16b, vhas_nul2.16b, vhas_chr2.16b + bif vhas_nul1.16b, vhas_chr1.16b, vrepmask_0.16b + bif vhas_nul2.16b, vhas_chr2.16b, vrepmask_0.16b + and vend1.16b, vhas_nul1.16b, vrepmask_c.16b + and vend2.16b, vhas_nul2.16b, vrepmask_c.16b lsl tmp1, tmp1, #1 addp vend1.16b, vend1.16b, vend2.16b // 256->128 mov tmp3, #~0 @@ -110,35 +86,30 @@ ENTRY(strchr_default) mov tmp3, vend1.d[0] bic tmp1, tmp3, tmp1 // Mask padding bits. - cbnz tmp1, .Ltail + cbnz tmp1, L(tail) -.Lloop: + .p2align 4 +L(loop): ld1 {vdata1.16b, vdata2.16b}, [src], #32 - cmeq vhas_nul1.16b, vdata1.16b, #0 cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b - cmeq vhas_nul2.16b, vdata2.16b, #0 cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b - /* Use a fast check for the termination condition. */ - orr vend1.16b, vhas_nul1.16b, vhas_chr1.16b - orr vend2.16b, vhas_nul2.16b, vhas_chr2.16b - orr vend1.16b, vend1.16b, vend2.16b - addp vend1.2d, vend1.2d, vend1.2d + cmhs vhas_nul1.16b, vhas_chr1.16b, vdata1.16b + cmhs vhas_nul2.16b, vhas_chr2.16b, vdata2.16b + orr vend1.16b, vhas_nul1.16b, vhas_nul2.16b + umaxp vend1.16b, vend1.16b, vend1.16b mov tmp1, vend1.d[0] - cbz tmp1, .Lloop + cbz tmp1, L(loop) /* Termination condition found. Now need to establish exactly why we terminated. 
*/ - and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b - and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b - and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b - and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b - orr vend1.16b, vhas_nul1.16b, vhas_chr1.16b - orr vend2.16b, vhas_nul2.16b, vhas_chr2.16b + bif vhas_nul1.16b, vhas_chr1.16b, vrepmask_0.16b + bif vhas_nul2.16b, vhas_chr2.16b, vrepmask_0.16b + and vend1.16b, vhas_nul1.16b, vrepmask_c.16b + and vend2.16b, vhas_nul2.16b, vrepmask_c.16b addp vend1.16b, vend1.16b, vend2.16b // 256->128 addp vend1.16b, vend1.16b, vend2.16b // 128->64 - mov tmp1, vend1.d[0] -.Ltail: +L(tail): /* Count the trailing zeros, by bit reversing... */ rbit tmp1, tmp1 /* Re-bias source. */ @@ -150,4 +121,6 @@ ENTRY(strchr_default) add result, src, tmp1, lsr #1 csel result, result, xzr, eq ret -END(strchr_default) + +END (strchr_default) + diff --git a/libc/arch-arm64/default/bionic/strcmp.S b/libc/arch-arm64/default/bionic/strcmp.S index dfac7c42c..418b4c894 100644 --- a/libc/arch-arm64/default/bionic/strcmp.S +++ b/libc/arch-arm64/default/bionic/strcmp.S @@ -1,29 +1,9 @@ -/* Copyright (c) 2012, Linaro Limited - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the Linaro nor the - names of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*/ +/* + * strcmp - compare two strings + * + * Copyright (c) 2012-2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ /* Assumptions: * @@ -32,8 +12,6 @@ #include <private/bionic_asm.h> -#define L(label) .L ## label - #define REP8_01 0x0101010101010101 #define REP8_7f 0x7f7f7f7f7f7f7f7f #define REP8_80 0x8080808080808080 @@ -58,8 +36,9 @@ #define pos x11 /* Start of performance-critical section -- one 64B cache line. 
*/ -ENTRY(strcmp_default) -.p2align 6 +ENTRY (strcmp_default) + PTR_ARG (0) + PTR_ARG (1) eor tmp1, src1, src2 mov zeroones, #REP8_01 tst tmp1, #7 @@ -189,4 +168,6 @@ L(loop_misaligned): L(done): sub result, data1, data2 ret -END(strcmp_default) + +END (strcmp_default) + diff --git a/libc/arch-arm64/default/bionic/strlen.S b/libc/arch-arm64/default/bionic/strlen.S index 07c5294c2..d102dec2d 100644 --- a/libc/arch-arm64/default/bionic/strlen.S +++ b/libc/arch-arm64/default/bionic/strlen.S @@ -1,105 +1,88 @@ -/* Copyright (c) 2013-2015, Linaro Limited - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the Linaro nor the - names of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +/* + * strlen - calculate the length of a string. + * + * Copyright (c) 2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ /* Assumptions: * - * ARMv8-a, AArch64, unaligned accesses, min page size 4k. + * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses. + * Not MTE compatible. */ #include <private/bionic_asm.h> -/* To test the page crossing code path more thoroughly, compile with - -DTEST_PAGE_CROSS - this will force all calls through the slower - entry path. This option is not intended for production use. */ - -/* Arguments and results. */ -#define srcin x0 -#define len x0 - -/* Locals and temporaries. */ -#define src x1 -#define data1 x2 -#define data2 x3 -#define has_nul1 x4 -#define has_nul2 x5 -#define tmp1 x4 -#define tmp2 x5 -#define tmp3 x6 -#define tmp4 x7 -#define zeroones x8 - -#define L(l) .L ## l - - /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 - (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and - can be done in parallel across the entire word. A faster check - (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives - false hits for characters 129..255. 
*/ +#define srcin x0 +#define len x0 + +#define src x1 +#define data1 x2 +#define data2 x3 +#define has_nul1 x4 +#define has_nul2 x5 +#define tmp1 x4 +#define tmp2 x5 +#define tmp3 x6 +#define tmp4 x7 +#define zeroones x8 + +#define maskv v0 +#define maskd d0 +#define dataq1 q1 +#define dataq2 q2 +#define datav1 v1 +#define datav2 v2 +#define tmp x2 +#define tmpw w2 +#define synd x3 +#define shift x4 + +/* For the first 32 bytes, NUL detection works on the principle that + (X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero if a + byte is zero, and can be done in parallel across the entire word. */ #define REP8_01 0x0101010101010101 #define REP8_7f 0x7f7f7f7f7f7f7f7f -#define REP8_80 0x8080808080808080 + +/* To test the page crossing code path more thoroughly, compile with + -DTEST_PAGE_CROSS - this will force all calls through the slower + entry path. This option is not intended for production use. */ #ifdef TEST_PAGE_CROSS -# define MIN_PAGE_SIZE 15 +# define MIN_PAGE_SIZE 32 #else # define MIN_PAGE_SIZE 4096 #endif - /* Since strings are short on average, we check the first 16 bytes - of the string for a NUL character. In order to do an unaligned ldp - safely we have to do a page cross check first. If there is a NUL - byte we calculate the length from the 2 8-byte words using - conditional select to reduce branch mispredictions (it is unlikely - strlen will be repeatedly called on strings with the same length). - - If the string is longer than 16 bytes, we align src so don't need - further page cross checks, and process 32 bytes per iteration - using the fast NUL check. If we encounter non-ASCII characters, - fallback to a second loop using the full NUL check. - - If the page cross check fails, we read 16 bytes from an aligned - address, remove any characters before the string, and continue - in the main loop using aligned loads. Since strings crossing a - page in the first 16 bytes are rare (probability of - 16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized. - - AArch64 systems have a minimum page size of 4k. We don't bother - checking for larger page sizes - the cost of setting up the correct - page size is just not worth the extra gain from a small reduction in - the cases taking the slow path. Note that we only care about - whether the first fetch, which may be misaligned, crosses a page - boundary. */ - -ENTRY(strlen_default) +/* Core algorithm: + + Since strings are short on average, we check the first 32 bytes of the + string for a NUL character without aligning the string. In order to use + unaligned loads safely we must do a page cross check first. + + If there is a NUL byte we calculate the length from the 2 8-byte words + using conditional select to reduce branch mispredictions (it is unlikely + strlen will be repeatedly called on strings with the same length). + + If the string is longer than 32 bytes, align src so we don't need further + page cross checks, and process 32 bytes per iteration using a fast SIMD + loop. + + If the page cross check fails, we read 32 bytes from an aligned address, + and ignore any characters before the string. If it contains a NUL + character, return the length, if not, continue in the main loop. */ + +ENTRY (strlen_default) + PTR_ARG (0) and tmp1, srcin, MIN_PAGE_SIZE - 1 - mov zeroones, REP8_01 - cmp tmp1, MIN_PAGE_SIZE - 16 - b.gt L(page_cross) + cmp tmp1, MIN_PAGE_SIZE - 32 + b.hi L(page_cross) + + /* Look for a NUL byte in the first 16 bytes. 
*/ ldp data1, data2, [srcin] + mov zeroones, REP8_01 + #ifdef __AARCH64EB__ /* For big-endian, carry propagation (if the final byte in the string is 0x01) means we cannot use has_nul1/2 directly. @@ -115,113 +98,103 @@ ENTRY(strlen_default) bics has_nul1, tmp1, tmp2 bic has_nul2, tmp3, tmp4 ccmp has_nul2, 0, 0, eq - beq L(main_loop_entry) + b.eq L(bytes16_31) - /* Enter with C = has_nul1 == 0. */ + /* Find the exact offset of the first NUL byte in the first 16 bytes + from the string start. Enter with C = has_nul1 == 0. */ csel has_nul1, has_nul1, has_nul2, cc mov len, 8 rev has_nul1, has_nul1 - clz tmp1, has_nul1 csel len, xzr, len, cc + clz tmp1, has_nul1 add len, len, tmp1, lsr 3 ret - /* The inner loop processes 32 bytes per iteration and uses the fast - NUL check. If we encounter non-ASCII characters, use a second - loop with the accurate NUL check. */ - .p2align 4 -L(main_loop_entry): - bic src, srcin, 15 - sub src, src, 16 -L(main_loop): - ldp data1, data2, [src, 32]! -.Lpage_cross_entry: - sub tmp1, data1, zeroones - sub tmp3, data2, zeroones - orr tmp2, tmp1, tmp3 - tst tmp2, zeroones, lsl 7 - bne 1f - ldp data1, data2, [src, 16] + .p2align 3 + /* Look for a NUL byte at offset 16..31 in the string. */ +L(bytes16_31): + ldp data1, data2, [srcin, 16] +#ifdef __AARCH64EB__ + rev data1, data1 + rev data2, data2 +#endif sub tmp1, data1, zeroones - sub tmp3, data2, zeroones - orr tmp2, tmp1, tmp3 - tst tmp2, zeroones, lsl 7 - beq L(main_loop) - add src, src, 16 -1: - /* The fast check failed, so do the slower, accurate NUL check. */ orr tmp2, data1, REP8_7f + sub tmp3, data2, zeroones orr tmp4, data2, REP8_7f bics has_nul1, tmp1, tmp2 bic has_nul2, tmp3, tmp4 ccmp has_nul2, 0, 0, eq - beq L(nonascii_loop) + b.eq L(loop_entry) - /* Enter with C = has_nul1 == 0. */ -L(tail): -#ifdef __AARCH64EB__ - /* For big-endian, carry propagation (if the final byte in the - string is 0x01) means we cannot use has_nul1/2 directly. The - easiest way to get the correct byte is to byte-swap the data - and calculate the syndrome a second time. */ - csel data1, data1, data2, cc - rev data1, data1 - sub tmp1, data1, zeroones - orr tmp2, data1, REP8_7f - bic has_nul1, tmp1, tmp2 -#else + /* Find the exact offset of the first NUL byte at offset 16..31 from + the string start. Enter with C = has_nul1 == 0. */ csel has_nul1, has_nul1, has_nul2, cc -#endif - sub len, src, srcin + mov len, 24 rev has_nul1, has_nul1 - add tmp2, len, 8 + mov tmp3, 16 clz tmp1, has_nul1 - csel len, len, tmp2, cc + csel len, tmp3, len, cc add len, len, tmp1, lsr 3 ret -L(nonascii_loop): - ldp data1, data2, [src, 16]! - sub tmp1, data1, zeroones - orr tmp2, data1, REP8_7f - sub tmp3, data2, zeroones - orr tmp4, data2, REP8_7f - bics has_nul1, tmp1, tmp2 - bic has_nul2, tmp3, tmp4 - ccmp has_nul2, 0, 0, eq - bne L(tail) - ldp data1, data2, [src, 16]! - sub tmp1, data1, zeroones - orr tmp2, data1, REP8_7f - sub tmp3, data2, zeroones - orr tmp4, data2, REP8_7f - bics has_nul1, tmp1, tmp2 - bic has_nul2, tmp3, tmp4 - ccmp has_nul2, 0, 0, eq - beq L(nonascii_loop) - b L(tail) +L(loop_entry): + bic src, srcin, 31 - /* Load 16 bytes from [srcin & ~15] and force the bytes that precede - srcin to 0x7f, so we ignore any NUL bytes before the string. - Then continue in the aligned loop. */ -L(page_cross): - bic src, srcin, 15 - ldp data1, data2, [src] - lsl tmp1, srcin, 3 - mov tmp4, -1 + .p2align 5 +L(loop): + ldp dataq1, dataq2, [src, 32]! 
+ uminp maskv.16b, datav1.16b, datav2.16b + uminp maskv.16b, maskv.16b, maskv.16b + cmeq maskv.8b, maskv.8b, 0 + fmov synd, maskd + cbz synd, L(loop) + + /* Low 32 bits of synd are non-zero if a NUL was found in datav1. */ + cmeq maskv.16b, datav1.16b, 0 + sub len, src, srcin + tst synd, 0xffffffff + b.ne 1f + cmeq maskv.16b, datav2.16b, 0 + add len, len, 16 +1: + /* Generate a bitmask and compute correct byte offset. */ #ifdef __AARCH64EB__ - /* Big-endian. Early bytes are at MSB. */ - lsr tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */ + bic maskv.8h, 0xf0 #else - /* Little-endian. Early bytes are at LSB. */ - lsl tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */ + bic maskv.8h, 0x0f, lsl 8 #endif - orr tmp1, tmp1, REP8_80 - orn data1, data1, tmp1 - orn tmp2, data2, tmp1 - tst srcin, 8 - csel data1, data1, tmp4, eq - csel data2, data2, tmp2, eq - b L(page_cross_entry) - -END(strlen_default) + umaxp maskv.16b, maskv.16b, maskv.16b + fmov synd, maskd +#ifndef __AARCH64EB__ + rbit synd, synd +#endif + clz tmp, synd + add len, len, tmp, lsr 2 + ret + + .p2align 4 + +L(page_cross): + bic src, srcin, 31 + mov tmpw, 0x0c03 + movk tmpw, 0xc030, lsl 16 + ld1 {datav1.16b, datav2.16b}, [src] + dup maskv.4s, tmpw + cmeq datav1.16b, datav1.16b, 0 + cmeq datav2.16b, datav2.16b, 0 + and datav1.16b, datav1.16b, maskv.16b + and datav2.16b, datav2.16b, maskv.16b + addp maskv.16b, datav1.16b, datav2.16b + addp maskv.16b, maskv.16b, maskv.16b + fmov synd, maskd + lsl shift, srcin, 1 + lsr synd, synd, shift + cbz synd, L(loop) + + rbit synd, synd + clz len, synd + lsr len, len, 1 + ret + +END (strlen_default) diff --git a/libc/arch-arm64/default/bionic/strncmp.S b/libc/arch-arm64/default/bionic/strncmp.S index 5432b738e..68c6e6526 100644 --- a/libc/arch-arm64/default/bionic/strncmp.S +++ b/libc/arch-arm64/default/bionic/strncmp.S @@ -1,29 +1,9 @@ -/* Copyright (c) 2014, Linaro Limited - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the Linaro nor the - names of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*/ +/* + * strncmp - compare two strings + * + * Copyright (c) 2013-2021, Arm Limited. 
+ * SPDX-License-Identifier: MIT + */ /* Assumptions: * @@ -60,19 +40,17 @@ #define endloop x15 #define count mask - .text - .p2align 6 - .rep 7 - nop /* Pad so that the loop below fits a cache line. */ - .endr -ENTRY(strncmp_default) - cbz limit, .Lret0 +ENTRY (strncmp_default) + PTR_ARG (0) + PTR_ARG (1) + SIZE_ARG (2) + cbz limit, L(ret0) eor tmp1, src1, src2 mov zeroones, #REP8_01 tst tmp1, #7 and count, src1, #7 - b.ne .Lmisaligned8 - cbnz count, .Lmutual_align + b.ne L(misaligned8) + cbnz count, L(mutual_align) /* Calculate the number of full and partial words -1. */ sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */ @@ -80,11 +58,11 @@ ENTRY(strncmp_default) /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and can be done in parallel across the entire word. */ - /* Start of performance-critical section -- one 64B cache line. */ -.Lloop_aligned: + .p2align 4 +L(loop_aligned): ldr data1, [src1], #8 ldr data2, [src2], #8 -.Lstart_realigned: +L(start_realigned): subs limit_wd, limit_wd, #1 sub tmp1, data1, zeroones orr tmp2, data1, #REP8_7f @@ -92,15 +70,15 @@ ENTRY(strncmp_default) csinv endloop, diff, xzr, pl /* Last Dword or differences. */ bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ ccmp endloop, #0, #0, eq - b.eq .Lloop_aligned - /* End of performance-critical section -- one 64B cache line. */ + b.eq L(loop_aligned) + /* End of main loop */ /* Not reached the limit, must have found the end or a diff. */ - tbz limit_wd, #63, .Lnot_limit + tbz limit_wd, #63, L(not_limit) /* Limit % 8 == 0 => all bytes significant. */ ands limit, limit, #7 - b.eq .Lnot_limit + b.eq L(not_limit) lsl limit, limit, #3 /* Bits -> bytes. */ mov mask, #~0 @@ -115,7 +93,7 @@ ENTRY(strncmp_default) /* Make sure that the NUL byte is marked in the syndrome. */ orr has_nul, has_nul, mask -.Lnot_limit: +L(not_limit): orr syndrome, diff, has_nul #ifndef __AARCH64EB__ @@ -168,7 +146,7 @@ ENTRY(strncmp_default) ret #endif -.Lmutual_align: +L(mutual_align): /* Sources are mutually aligned, but are not currently at an alignment boundary. Round down the addresses and then mask off the bytes that precede the start point. @@ -196,56 +174,56 @@ ENTRY(strncmp_default) orr data1, data1, tmp2 orr data2, data2, tmp2 add limit_wd, limit_wd, tmp3, lsr #3 - b .Lstart_realigned + b L(start_realigned) - .p2align 6 + .p2align 4 /* Don't bother with dwords for up to 16 bytes. */ -.Lmisaligned8: +L(misaligned8): cmp limit, #16 - b.hs .Ltry_misaligned_words + b.hs L(try_misaligned_words) -.Lbyte_loop: +L(byte_loop): /* Perhaps we can do better than this. */ ldrb data1w, [src1], #1 ldrb data2w, [src2], #1 subs limit, limit, #1 ccmp data1w, #1, #0, hi /* NZCV = 0b0000. */ ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ - b.eq .Lbyte_loop -.Ldone: + b.eq L(byte_loop) +L(done): sub result, data1, data2 ret /* Align the SRC1 to a dword by doing a bytewise compare and then do the dword loop. */ -.Ltry_misaligned_words: +L(try_misaligned_words): lsr limit_wd, limit, #3 - cbz count, .Ldo_misaligned + cbz count, L(do_misaligned) neg count, count and count, count, #7 sub limit, limit, count lsr limit_wd, limit, #3 -.Lpage_end_loop: +L(page_end_loop): ldrb data1w, [src1], #1 ldrb data2w, [src2], #1 cmp data1w, #1 ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. 
*/ - b.ne .Ldone + b.ne L(done) subs count, count, #1 - b.hi .Lpage_end_loop + b.hi L(page_end_loop) -.Ldo_misaligned: +L(do_misaligned): /* Prepare ourselves for the next page crossing. Unlike the aligned loop, we fetch 1 less dword because we risk crossing bounds on SRC2. */ mov count, #8 subs limit_wd, limit_wd, #1 - b.lo .Ldone_loop -.Lloop_misaligned: + b.lo L(done_loop) +L(loop_misaligned): and tmp2, src2, #0xff8 eor tmp2, tmp2, #0xff8 - cbz tmp2, .Lpage_end_loop + cbz tmp2, L(page_end_loop) ldr data1, [src1], #8 ldr data2, [src2], #8 @@ -254,14 +232,14 @@ ENTRY(strncmp_default) eor diff, data1, data2 /* Non-zero if differences found. */ bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ ccmp diff, #0, #0, eq - b.ne .Lnot_limit + b.ne L(not_limit) subs limit_wd, limit_wd, #1 - b.pl .Lloop_misaligned + b.pl L(loop_misaligned) -.Ldone_loop: +L(done_loop): /* We found a difference or a NULL before the limit was reached. */ and limit, limit, #7 - cbz limit, .Lnot_limit + cbz limit, L(not_limit) /* Read the last word. */ sub src1, src1, 8 sub src2, src2, 8 @@ -272,9 +250,11 @@ ENTRY(strncmp_default) eor diff, data1, data2 /* Non-zero if differences found. */ bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ ccmp diff, #0, #0, eq - b.ne .Lnot_limit + b.ne L(not_limit) -.Lret0: +L(ret0): mov result, #0 ret -END(strncmp_default) + +END ( strncmp_default) + diff --git a/libc/arch-arm64/default/bionic/strnlen.S b/libc/arch-arm64/default/bionic/strnlen.S index 169453207..445661d48 100644 --- a/libc/arch-arm64/default/bionic/strnlen.S +++ b/libc/arch-arm64/default/bionic/strnlen.S @@ -1,174 +1,112 @@ -/* Copyright (c) 2014, Linaro Limited - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the Linaro nor the - names of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*/ +/* + * strnlen - calculate the length of a string with limit. + * + * Copyright (c) 2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ /* Assumptions: * - * ARMv8-a, AArch64 + * ARMv8-a, AArch64, Advanced SIMD. + * MTE compatible. */ #include <private/bionic_asm.h> -/* Arguments and results. 
*/ #define srcin x0 -#define len x0 -#define limit x1 +#define cntin x1 +#define result x0 -/* Locals and temporaries. */ #define src x2 -#define data1 x3 -#define data2 x4 -#define data2a x5 -#define has_nul1 x6 -#define has_nul2 x7 -#define tmp1 x8 -#define tmp2 x9 -#define tmp3 x10 -#define tmp4 x11 -#define zeroones x12 -#define pos x13 -#define limit_wd x14 - -#define REP8_01 0x0101010101010101 -#define REP8_7f 0x7f7f7f7f7f7f7f7f -#define REP8_80 0x8080808080808080 - - .text - .p2align 6 -.Lstart: - /* Pre-pad to ensure critical loop begins an icache line. */ - .rep 7 - nop - .endr - /* Put this code here to avoid wasting more space with pre-padding. */ -.Lhit_limit: - mov len, limit +#define synd x3 +#define shift x4 +#define wtmp w4 +#define tmp x4 +#define cntrem x5 + +#define qdata q0 +#define vdata v0 +#define vhas_chr v1 +#define vrepmask v2 +#define vend v3 +#define dend d3 + +/* + Core algorithm: + + For each 16-byte chunk we calculate a 64-bit syndrome value with four bits + per byte. For even bytes, bits 0-3 are set if the relevant byte matched the + requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are + set likewise for odd bytes so that adjacent bytes can be merged. Since the + bits in the syndrome reflect the order in which things occur in the original + string, counting trailing zeros identifies exactly which byte matched. */ + +ENTRY (strnlen_default) + PTR_ARG (0) + SIZE_ARG (1) + bic src, srcin, 15 + mov wtmp, 0xf00f + cbz cntin, L(nomatch) + ld1 {vdata.16b}, [src], 16 + dup vrepmask.8h, wtmp + cmeq vhas_chr.16b, vdata.16b, 0 + lsl shift, srcin, 2 + and vhas_chr.16b, vhas_chr.16b, vrepmask.16b + addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + fmov synd, dend + lsr synd, synd, shift + cbz synd, L(start_loop) +L(finish): + rbit synd, synd + clz synd, synd + lsr result, synd, 2 + cmp cntin, result + csel result, cntin, result, ls ret -ENTRY(strnlen_default) - cbz limit, .Lhit_limit - mov zeroones, #REP8_01 - bic src, srcin, #15 - ands tmp1, srcin, #15 - b.ne .Lmisaligned - /* Calculate the number of full and partial words -1. */ - sub limit_wd, limit, #1 /* Limit != 0, so no underflow. */ - lsr limit_wd, limit_wd, #4 /* Convert to Qwords. */ - - /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 - (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and - can be done in parallel across the entire word. */ - /* The inner loop deals with two Dwords at a time. This has a - slightly higher start-up cost, but we should win quite quickly, - especially on cores with a high number of issue slots per - cycle, as we get much better parallelism out of the operations. */ - - /* Start of critial section -- keep to one 64Byte cache line. */ -.Lloop: - ldp data1, data2, [src], #16 -.Lrealigned: - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - sub tmp3, data2, zeroones - orr tmp4, data2, #REP8_7f - bic has_nul1, tmp1, tmp2 - bic has_nul2, tmp3, tmp4 - subs limit_wd, limit_wd, #1 - orr tmp1, has_nul1, has_nul2 - ccmp tmp1, #0, #0, pl /* NZCV = 0000 */ - b.eq .Lloop - /* End of critical section -- keep to one 64Byte cache line. */ - - orr tmp1, has_nul1, has_nul2 - cbz tmp1, .Lhit_limit /* No null in final Qword. */ - - /* We know there's a null in the final Qword. The easiest thing - to do now is work out the length of the string and return - MIN (len, limit). 
*/ - - sub len, src, srcin - cbz has_nul1, .Lnul_in_data2 -#ifdef __AARCH64EB__ - mov data2, data1 +L(start_loop): + sub tmp, src, srcin + subs cntrem, cntin, tmp + b.ls L(nomatch) + + /* Make sure that it won't overread by a 16-byte chunk */ + add tmp, cntrem, 15 + tbnz tmp, 4, L(loop32_2) + + .p2align 5 +L(loop32): + ldr qdata, [src], 16 + cmeq vhas_chr.16b, vdata.16b, 0 + umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + fmov synd, dend + cbnz synd, L(end) +L(loop32_2): + ldr qdata, [src], 16 + subs cntrem, cntrem, 32 + cmeq vhas_chr.16b, vdata.16b, 0 + b.ls L(end) + umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + fmov synd, dend + cbz synd, L(loop32) + +L(end): + and vhas_chr.16b, vhas_chr.16b, vrepmask.16b + addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + sub src, src, 16 + mov synd, vend.d[0] + sub result, src, srcin +#ifndef __AARCH64EB__ + rbit synd, synd #endif - sub len, len, #8 - mov has_nul2, has_nul1 -.Lnul_in_data2: -#ifdef __AARCH64EB__ - /* For big-endian, carry propagation (if the final byte in the - string is 0x01) means we cannot use has_nul directly. The - easiest way to get the correct byte is to byte-swap the data - and calculate the syndrome a second time. */ - rev data2, data2 - sub tmp1, data2, zeroones - orr tmp2, data2, #REP8_7f - bic has_nul2, tmp1, tmp2 -#endif - sub len, len, #8 - rev has_nul2, has_nul2 - clz pos, has_nul2 - add len, len, pos, lsr #3 /* Bits to bytes. */ - cmp len, limit - csel len, len, limit, ls /* Return the lower value. */ + clz synd, synd + add result, result, synd, lsr 2 + cmp cntin, result + csel result, cntin, result, ls ret -.Lmisaligned: - /* Deal with a partial first word. - We're doing two things in parallel here; - 1) Calculate the number of words (but avoiding overflow if - limit is near ULONG_MAX) - to do this we need to work out - limit + tmp1 - 1 as a 65-bit value before shifting it; - 2) Load and mask the initial data words - we force the bytes - before the ones we are interested in to 0xff - this ensures - early bytes will not hit any zero detection. */ - sub limit_wd, limit, #1 - neg tmp4, tmp1 - cmp tmp1, #8 - - and tmp3, limit_wd, #15 - lsr limit_wd, limit_wd, #4 - mov tmp2, #~0 - - ldp data1, data2, [src], #16 - lsl tmp4, tmp4, #3 /* Bytes beyond alignment -> bits. */ - add tmp3, tmp3, tmp1 - -#ifdef __AARCH64EB__ - /* Big-endian. Early bytes are at MSB. */ - lsl tmp2, tmp2, tmp4 /* Shift (tmp1 & 63). */ -#else - /* Little-endian. Early bytes are at LSB. */ - lsr tmp2, tmp2, tmp4 /* Shift (tmp1 & 63). */ -#endif - add limit_wd, limit_wd, tmp3, lsr #4 +L(nomatch): + mov result, cntin + ret - orr data1, data1, tmp2 - orr data2a, data2, tmp2 +END (strnlen_default) - csinv data1, data1, xzr, le - csel data2, data2, data2a, le - b .Lrealigned -END(strnlen_default) diff --git a/libc/arch-arm64/generic/bionic/memcmp.S b/libc/arch-arm64/generic/bionic/memcmp.S index bff54aea3..fae0d4226 100644 --- a/libc/arch-arm64/generic/bionic/memcmp.S +++ b/libc/arch-arm64/generic/bionic/memcmp.S @@ -1,29 +1,7 @@ -/* - * Copyright (c) 2017 ARM Ltd - * All rights reserved. +/* memcmp - compare memory * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. 
Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. The name of the company may not be used to endorse or promote - * products derived from this software without specific prior written - * permission. - * - * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. - * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED - * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * Copyright (c) 2013-2020, Arm Limited. + * SPDX-License-Identifier: MIT */ /* Assumptions: @@ -33,8 +11,6 @@ #include <private/bionic_asm.h> -#define L(l) .L ## l - /* Parameters and result. */ #define src1 x0 #define src2 x1 @@ -51,20 +27,13 @@ #define tmp1 x7 #define tmp2 x8 -/* Small inputs of less than 8 bytes are handled separately. This allows the - main code to be speed up using unaligned loads since there are now at least - 8 bytes to be compared. If the first 8 bytes are equal, align src1. - This ensures each iteration does at most one unaligned access even if both - src1 and src2 are unaligned, and mutually aligned inputs behave as if - aligned. After the main loop, process the last 16 bytes using unaligned - accesses. */ - -ENTRY(memcmp) -.p2align 6 +ENTRY (memcmp) + PTR_ARG (0) + PTR_ARG (1) + SIZE_ARG (2) subs limit, limit, 8 b.lo L(less8) - /* Limit >= 8, so check first 8 bytes using unaligned loads. */ ldr data1, [src1], 8 ldr data2, [src2], 8 cmp data1, data2 @@ -164,4 +133,5 @@ L(byte_loop): sub result, data1w, data2w ret -END(memcmp) +END (memcmp) + diff --git a/libc/arch-arm64/generic/bionic/memcpy.S b/libc/arch-arm64/generic/bionic/memcpy.S index baadb9204..fc487d3a4 100644 --- a/libc/arch-arm64/generic/bionic/memcpy.S +++ b/libc/arch-arm64/generic/bionic/memcpy.S @@ -30,6 +30,6 @@ #include <private/bionic_asm.h> -ENTRY(__memcpy) +ENTRY(memcpy) #include "memcpy_base.S" -END(__memcpy) +END(memcpy) diff --git a/libc/arch-arm64/generic/bionic/memcpy_base.S b/libc/arch-arm64/generic/bionic/memcpy_base.S index f85062408..f77e4dd35 100644 --- a/libc/arch-arm64/generic/bionic/memcpy_base.S +++ b/libc/arch-arm64/generic/bionic/memcpy_base.S @@ -1,60 +1,13 @@ -/* Copyright (c) 2012-2013, Linaro Limited - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the Linaro nor the - names of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. 
- - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ - /* - * Copyright (c) 2015 ARM Ltd - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. The name of the company may not be used to endorse or promote - * products derived from this software without specific prior written - * permission. + * memcpy - copy memory area * - * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. - * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED - * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * Copyright (c) 2019-2020, Arm Limited. + * SPDX-License-Identifier: MIT */ /* Assumptions: * - * ARMv8-a, AArch64, unaligned accesses. + * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses. * */ @@ -69,149 +22,181 @@ #define A_l x6 #define A_lw w6 #define A_h x7 -#define A_hw w7 #define B_l x8 -#define B_lw w8 +#define B_lw w8 #define B_h x9 -#define C_l x10 -#define C_h x11 -#define D_l x12 -#define D_h x13 -#define E_l src -#define E_h count -#define F_l srcend -#define F_h dst -#define tmp1 x9 - -#define L(l) .L ## l - -/* Copies are split into 3 main cases: small copies of up to 16 bytes, - medium copies of 17..96 bytes which are fully unrolled. Large copies - of more than 96 bytes align the destination and use an unrolled loop - processing 64 bytes per iteration. - Small and medium copies read all data before writing, allowing any - kind of overlap, and memmove tailcalls memcpy for these cases as - well as non-overlapping copies. +#define C_lw w10 +#define tmp1 x14 + +#define A_q q0 +#define B_q q1 +#define C_q q2 +#define D_q q3 +#define E_q q4 +#define F_q q5 +#define G_q q6 +#define H_q q7 + +/* This implementation handles overlaps and supports both memcpy and memmove + from a single entry point. 
It uses unaligned accesses and branchless + sequences to keep the code small, simple and improve performance. + + Copies are split into 3 main cases: small copies of up to 32 bytes, medium + copies of up to 128 bytes, and large copies. The overhead of the overlap + check is negligible since it is only required for large copies. + + Large copies use a software pipelined loop processing 64 bytes per iteration. + The source pointer is 16-byte aligned to minimize unaligned accesses. + The loop tail is handled by always copying 64 bytes from the end. */ - prfm PLDL1KEEP, [src] + PTR_ARG (0) + PTR_ARG (1) + SIZE_ARG (2) add srcend, src, count add dstend, dstin, count - cmp count, 16 - b.ls L(copy16) - cmp count, 96 + cmp count, 128 b.hi L(copy_long) - - /* Medium copies: 17..96 bytes. */ - sub tmp1, count, 1 - ldp A_l, A_h, [src] - tbnz tmp1, 6, L(copy96) - ldp D_l, D_h, [srcend, -16] - tbz tmp1, 5, 1f - ldp B_l, B_h, [src, 16] - ldp C_l, C_h, [srcend, -32] - stp B_l, B_h, [dstin, 16] - stp C_l, C_h, [dstend, -32] -1: - stp A_l, A_h, [dstin] - stp D_l, D_h, [dstend, -16] + cmp count, 32 + b.hi L(copy32_128) + + /* Small copies: 0..32 bytes. */ + cmp count, 16 + b.lo L(copy16) + ldr A_q, [src] + ldr B_q, [srcend, -16] + str A_q, [dstin] + str B_q, [dstend, -16] ret - .p2align 4 - - /* Small copies: 0..16 bytes. */ + /* Copy 8-15 bytes. */ L(copy16): - cmp count, 8 - b.lo 1f + tbz count, 3, L(copy8) ldr A_l, [src] ldr A_h, [srcend, -8] str A_l, [dstin] str A_h, [dstend, -8] ret - .p2align 4 -1: - tbz count, 2, 1f + + .p2align 3 + /* Copy 4-7 bytes. */ +L(copy8): + tbz count, 2, L(copy4) ldr A_lw, [src] - ldr A_hw, [srcend, -4] + ldr B_lw, [srcend, -4] str A_lw, [dstin] - str A_hw, [dstend, -4] + str B_lw, [dstend, -4] ret - /* Copy 0..3 bytes. Use a branchless sequence that copies the same - byte 3 times if count==1, or the 2nd byte twice if count==2. */ -1: - cbz count, 2f + /* Copy 0..3 bytes using a branchless sequence. */ +L(copy4): + cbz count, L(copy0) lsr tmp1, count, 1 ldrb A_lw, [src] - ldrb A_hw, [srcend, -1] + ldrb C_lw, [srcend, -1] ldrb B_lw, [src, tmp1] strb A_lw, [dstin] strb B_lw, [dstin, tmp1] - strb A_hw, [dstend, -1] -2: ret + strb C_lw, [dstend, -1] +L(copy0): + ret .p2align 4 - /* Copy 64..96 bytes. Copy 64 bytes from the start and - 32 bytes from the end. */ -L(copy96): - ldp B_l, B_h, [src, 16] - ldp C_l, C_h, [src, 32] - ldp D_l, D_h, [src, 48] - ldp E_l, E_h, [srcend, -32] - ldp F_l, F_h, [srcend, -16] - stp A_l, A_h, [dstin] - stp B_l, B_h, [dstin, 16] - stp C_l, C_h, [dstin, 32] - stp D_l, D_h, [dstin, 48] - stp E_l, E_h, [dstend, -32] - stp F_l, F_h, [dstend, -16] + /* Medium copies: 33..128 bytes. */ +L(copy32_128): + ldp A_q, B_q, [src] + ldp C_q, D_q, [srcend, -32] + cmp count, 64 + b.hi L(copy128) + stp A_q, B_q, [dstin] + stp C_q, D_q, [dstend, -32] ret - /* Align DST to 16 byte alignment so that we don't cross cache line - boundaries on both loads and stores. There are at least 96 bytes - to copy, so copy 16 bytes unaligned and then align. The loop - copies 64 bytes per iteration and prefetches one iteration ahead. */ - .p2align 4 + /* Copy 65..128 bytes. */ +L(copy128): + ldp E_q, F_q, [src, 32] + cmp count, 96 + b.ls L(copy96) + ldp G_q, H_q, [srcend, -64] + stp G_q, H_q, [dstend, -64] +L(copy96): + stp A_q, B_q, [dstin] + stp E_q, F_q, [dstin, 32] + stp C_q, D_q, [dstend, -32] + ret + + /* Copy more than 128 bytes. 
*/ L(copy_long): - and tmp1, dstin, 15 - bic dst, dstin, 15 - ldp D_l, D_h, [src] - sub src, src, tmp1 + /* Use backwards copy if there is an overlap. */ + sub tmp1, dstin, src + cmp tmp1, count + b.lo L(copy_long_backwards) + + /* Copy 16 bytes and then align src to 16-byte alignment. */ + ldr D_q, [src] + and tmp1, src, 15 + bic src, src, 15 + sub dst, dstin, tmp1 add count, count, tmp1 /* Count is now 16 too large. */ - ldp A_l, A_h, [src, 16] - stp D_l, D_h, [dstin] - ldp B_l, B_h, [src, 32] - ldp C_l, C_h, [src, 48] - ldp D_l, D_h, [src, 64]! + ldp A_q, B_q, [src, 16] + str D_q, [dstin] + ldp C_q, D_q, [src, 48] subs count, count, 128 + 16 /* Test and readjust count. */ - b.ls 2f -1: - stp A_l, A_h, [dst, 16] - ldp A_l, A_h, [src, 16] - stp B_l, B_h, [dst, 32] - ldp B_l, B_h, [src, 32] - stp C_l, C_h, [dst, 48] - ldp C_l, C_h, [src, 48] - stp D_l, D_h, [dst, 64]! - ldp D_l, D_h, [src, 64]! + b.ls L(copy64_from_end) +L(loop64): + stp A_q, B_q, [dst, 16] + ldp A_q, B_q, [src, 80] + stp C_q, D_q, [dst, 48] + ldp C_q, D_q, [src, 112] + add src, src, 64 + add dst, dst, 64 subs count, count, 64 - b.hi 1b - - /* Write the last full set of 64 bytes. The remainder is at most 64 - bytes, so it is safe to always copy 64 bytes from the end even if - there is just 1 byte left. */ -2: - ldp E_l, E_h, [srcend, -64] - stp A_l, A_h, [dst, 16] - ldp A_l, A_h, [srcend, -48] - stp B_l, B_h, [dst, 32] - ldp B_l, B_h, [srcend, -32] - stp C_l, C_h, [dst, 48] - ldp C_l, C_h, [srcend, -16] - stp D_l, D_h, [dst, 64] - stp E_l, E_h, [dstend, -64] - stp A_l, A_h, [dstend, -48] - stp B_l, B_h, [dstend, -32] - stp C_l, C_h, [dstend, -16] + b.hi L(loop64) + + /* Write the last iteration and copy 64 bytes from the end. */ +L(copy64_from_end): + ldp E_q, F_q, [srcend, -64] + stp A_q, B_q, [dst, 16] + ldp A_q, B_q, [srcend, -32] + stp C_q, D_q, [dst, 48] + stp E_q, F_q, [dstend, -64] + stp A_q, B_q, [dstend, -32] ret + + /* Large backwards copy for overlapping copies. + Copy 16 bytes and then align srcend to 16-byte alignment. */ +L(copy_long_backwards): + cbz tmp1, L(copy0) + ldr D_q, [srcend, -16] + and tmp1, srcend, 15 + bic srcend, srcend, 15 + sub count, count, tmp1 + ldp A_q, B_q, [srcend, -32] + str D_q, [dstend, -16] + ldp C_q, D_q, [srcend, -64] + sub dstend, dstend, tmp1 + subs count, count, 128 + b.ls L(copy64_from_start) + +L(loop64_backwards): + str B_q, [dstend, -16] + str A_q, [dstend, -32] + ldp A_q, B_q, [srcend, -96] + str D_q, [dstend, -48] + str C_q, [dstend, -64]! + ldp C_q, D_q, [srcend, -128] + sub srcend, srcend, 64 + subs count, count, 64 + b.hi L(loop64_backwards) + + /* Write the last iteration and copy 64 bytes from the start. */ +L(copy64_from_start): + ldp E_q, F_q, [src, 32] + stp A_q, B_q, [dstend, -32] + ldp A_q, B_q, [src] + stp C_q, D_q, [dstend, -64] + stp E_q, F_q, [dstin, 32] + stp A_q, B_q, [dstin] + ret + diff --git a/libc/arch-arm64/generic/bionic/memmove.S b/libc/arch-arm64/generic/bionic/memmove.S index 335b7d6ce..ad4f31f25 100644 --- a/libc/arch-arm64/generic/bionic/memmove.S +++ b/libc/arch-arm64/generic/bionic/memmove.S @@ -1,155 +1,33 @@ -/* Copyright (c) 2013, Linaro Limited - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. 
- * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the Linaro nor the - names of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ - /* - * Copyright (c) 2015 ARM Ltd + * Copyright (C) 2008 The Android Open Source Project * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: - * 1. Redistributions of source code must retain the above copyright + * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. The name of the company may not be used to endorse or promote - * products derived from this software without specific prior written - * permission. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. * - * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. - * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED - * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -/* Assumptions: - * - * ARMv8-a, AArch64, unaligned accesses, wchar_t is 4 bytes + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. */ #include <private/bionic_asm.h> -/* Parameters and result. */ -#define dstin x0 -#define src x1 -#define count x2 -#define srcend x3 -#define dstend x4 -#define tmp1 x5 -#define A_l x6 -#define A_h x7 -#define B_l x8 -#define B_h x9 -#define C_l x10 -#define C_h x11 -#define D_l x12 -#define D_h x13 -#define E_l count -#define E_h tmp1 - -/* All memmoves up to 96 bytes are done by memcpy as it supports overlaps. - Larger backwards copies are also handled by memcpy. The only remaining - case is forward large copies. The destination is aligned, and an - unrolled loop processes 64 bytes per iteration. -*/ - -#if defined(WMEMMOVE) -ENTRY(wmemmove) - lsl count, count, #2 -#else ENTRY(memmove) -#endif - sub tmp1, dstin, src - cmp count, 96 - ccmp tmp1, count, 2, hi - b.hs __memcpy - - cbz tmp1, 3f - add dstend, dstin, count - add srcend, src, count - - /* Align dstend to 16 byte alignment so that we don't cross cache line - boundaries on both loads and stores. There are at least 96 bytes - to copy, so copy 16 bytes unaligned and then align. The loop - copies 64 bytes per iteration and prefetches one iteration ahead. */ - - and tmp1, dstend, 15 - ldp D_l, D_h, [srcend, -16] - sub srcend, srcend, tmp1 - sub count, count, tmp1 - ldp A_l, A_h, [srcend, -16] - stp D_l, D_h, [dstend, -16] - ldp B_l, B_h, [srcend, -32] - ldp C_l, C_h, [srcend, -48] - ldp D_l, D_h, [srcend, -64]! - sub dstend, dstend, tmp1 - subs count, count, 128 - b.ls 2f - nop -1: - stp A_l, A_h, [dstend, -16] - ldp A_l, A_h, [srcend, -16] - stp B_l, B_h, [dstend, -32] - ldp B_l, B_h, [srcend, -32] - stp C_l, C_h, [dstend, -48] - ldp C_l, C_h, [srcend, -48] - stp D_l, D_h, [dstend, -64]! - ldp D_l, D_h, [srcend, -64]! - subs count, count, 64 - b.hi 1b - - /* Write the last full set of 64 bytes. The remainder is at most 64 - bytes, so it is safe to always copy 64 bytes from the start even if - there is just 1 byte left. */ -2: - ldp E_l, E_h, [src, 48] - stp A_l, A_h, [dstend, -16] - ldp A_l, A_h, [src, 32] - stp B_l, B_h, [dstend, -32] - ldp B_l, B_h, [src, 16] - stp C_l, C_h, [dstend, -48] - ldp C_l, C_h, [src] - stp D_l, D_h, [dstend, -64] - stp E_l, E_h, [dstin, 48] - stp A_l, A_h, [dstin, 32] - stp B_l, B_h, [dstin, 16] - stp C_l, C_h, [dstin] -3: ret - -#if defined(WMEMMOVE) -END(wmemmove) -#else + #include "memcpy_base.S" END(memmove) - -ALIAS_SYMBOL(memcpy, memmove) -#endif diff --git a/libc/arch-arm64/generic/bionic/memrchr.S b/libc/arch-arm64/generic/bionic/memrchr.S new file mode 100644 index 000000000..9ab232775 --- /dev/null +++ b/libc/arch-arm64/generic/bionic/memrchr.S @@ -0,0 +1,117 @@ +/* + * memrchr - find last character in a memory zone. + * + * Copyright (c) 2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +/* Assumptions: + * + * ARMv8-a, AArch64, Advanced SIMD. + * MTE compatible. 
+ */ + +#include <private/bionic_asm.h> + +#define srcin x0 +#define chrin w1 +#define cntin x2 +#define result x0 + +#define src x3 +#define cntrem x4 +#define synd x5 +#define shift x6 +#define tmp x7 +#define wtmp w7 +#define end x8 +#define endm1 x9 + +#define vrepchr v0 +#define qdata q1 +#define vdata v1 +#define vhas_chr v2 +#define vrepmask v3 +#define vend v4 +#define dend d4 + +/* + Core algorithm: + + For each 16-byte chunk we calculate a 64-bit syndrome value with four bits + per byte. For even bytes, bits 0-3 are set if the relevant byte matched the + requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are + set likewise for odd bytes so that adjacent bytes can be merged. Since the + bits in the syndrome reflect the order in which things occur in the original + string, counting trailing zeros identifies exactly which byte matched. */ + +ENTRY (memrchr) + PTR_ARG (0) + add end, srcin, cntin + sub endm1, end, 1 + bic src, endm1, 15 + cbz cntin, L(nomatch) + ld1 {vdata.16b}, [src] + dup vrepchr.16b, chrin + mov wtmp, 0xf00f + dup vrepmask.8h, wtmp + cmeq vhas_chr.16b, vdata.16b, vrepchr.16b + neg shift, end, lsl 2 + and vhas_chr.16b, vhas_chr.16b, vrepmask.16b + addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + fmov synd, dend + lsl synd, synd, shift + cbz synd, L(start_loop) + + clz synd, synd + sub result, endm1, synd, lsr 2 + cmp cntin, synd, lsr 2 + csel result, result, xzr, hi + ret + +L(start_loop): + sub tmp, end, src + subs cntrem, cntin, tmp + b.ls L(nomatch) + + /* Make sure that it won't overread by a 16-byte chunk */ + add tmp, cntrem, 15 + tbnz tmp, 4, L(loop32_2) + + .p2align 4 +L(loop32): + ldr qdata, [src, -16]! + cmeq vhas_chr.16b, vdata.16b, vrepchr.16b + umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + fmov synd, dend + cbnz synd, L(end) + +L(loop32_2): + ldr qdata, [src, -16]! + subs cntrem, cntrem, 32 + cmeq vhas_chr.16b, vdata.16b, vrepchr.16b + b.ls L(end) + umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + fmov synd, dend + cbz synd, L(loop32) +L(end): + and vhas_chr.16b, vhas_chr.16b, vrepmask.16b + addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + fmov synd, dend + + add tmp, src, 15 +#ifdef __AARCH64EB__ + rbit synd, synd +#endif + clz synd, synd + sub tmp, tmp, synd, lsr 2 + cmp tmp, srcin + csel result, tmp, xzr, hs + ret + +L(nomatch): + mov result, 0 + ret + +END (memrchr) + diff --git a/libc/arch-arm64/generic/bionic/memset.S b/libc/arch-arm64/generic/bionic/memset.S index 12fc09db8..0f54f9d79 100644 --- a/libc/arch-arm64/generic/bionic/memset.S +++ b/libc/arch-arm64/generic/bionic/memset.S @@ -1,89 +1,25 @@ -/* Copyright (c) 2012-2013, Linaro Limited - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the Linaro nor the - names of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. 
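A scalar model of the syndrome scheme described in the memrchr comment above (illustrative only; the helper name is made up and __builtin_clzll assumes a GCC/Clang-style compiler):

#include <stdint.h>

/* Model of the 4-bit-per-byte syndrome: 16 bytes x 4 bits = one 64-bit word,
 * which is what the vrepmask/addp sequence packs the NEON compare result
 * into. The highest set nibble marks the last matching byte, so counting
 * leading zeros recovers its index, mirroring the clz in the code above. */
static int last_match_in_chunk(const unsigned char chunk[16], unsigned char c)
{
    uint64_t syndrome = 0;
    for (int i = 0; i < 16; i++)
        if (chunk[i] == c)
            syndrome |= 0xfull << (4 * i);
    if (syndrome == 0)
        return -1;                          /* no match in this chunk */
    return (63 - __builtin_clzll(syndrome)) / 4;
}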
- - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ - /* - * Copyright (c) 2015 ARM Ltd - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. The name of the company may not be used to endorse or promote - * products derived from this software without specific prior written - * permission. + * memset - fill memory with a constant byte * - * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. - * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED - * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * Copyright (c) 2012-2021, Arm Limited. + * SPDX-License-Identifier: MIT */ /* Assumptions: * - * ARMv8-a, AArch64, unaligned accesses + * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses. * */ #include <private/bionic_asm.h> -/* By default we assume that the DC instruction can be used to zero - data blocks more efficiently. In some circumstances this might be - unsafe, for example in an asymmetric multiprocessor environment with - different DC clear lengths (neither the upper nor lower lengths are - safe to use). - - If code may be run in a virtualized environment, then define - MAYBE_VIRT. This will cause the code to cache the system register - values rather than re-reading them each call. 
*/ - -#define dstin x0 -#define val x1 -#define valw w1 -#define count x2 -#define dst x3 -#define dstend x4 -#define tmp1 x5 -#define tmp1w w5 -#define tmp2 x6 -#define tmp2w w6 -#define zva_len x7 -#define zva_lenw w7 - -#define L(l) .L ## l +#define dstin x0 +#define val x1 +#define valw w1 +#define count x2 +#define dst x3 +#define dstend x4 +#define zva_val x5 ENTRY(__memset_chk) cmp count, dst @@ -98,7 +34,9 @@ ENTRY(__memset_chk) bl __memset_chk_fail END(__memset_chk) -ENTRY(memset) +ENTRY (memset) + PTR_ARG (0) + SIZE_ARG (2) dup v0.16B, valw add dstend, dstin, count @@ -114,7 +52,7 @@ ENTRY(memset) str val, [dstin] str val, [dstend, -8] ret - nop + .p2align 4 1: tbz count, 2, 2f str valw, [dstin] str valw, [dstend, -4] @@ -144,108 +82,49 @@ L(set96): stp q0, q0, [dstend, -32] ret - .p2align 3 - nop + .p2align 4 L(set_long): and valw, valw, 255 bic dst, dstin, 15 str q0, [dstin] - cmp count, 256 - ccmp valw, 0, 0, cs - b.eq L(try_zva) -L(no_zva): - sub count, dstend, dst /* Count is 16 too large. */ - add dst, dst, 16 - sub count, count, 64 + 16 /* Adjust count and bias for loop. */ -1: stp q0, q0, [dst], 64 - stp q0, q0, [dst, -32] -L(tail64): - subs count, count, 64 - b.hi 1b -2: stp q0, q0, [dstend, -64] - stp q0, q0, [dstend, -32] - ret - - .p2align 3 -L(try_zva): - mrs tmp1, dczid_el0 - tbnz tmp1w, 4, L(no_zva) - and tmp1w, tmp1w, 15 - cmp tmp1w, 4 /* ZVA size is 64 bytes. */ - b.ne L(zva_128) - - /* Write the first and last 64 byte aligned block using stp rather - than using DC ZVA. This is faster on some cores. - */ -L(zva_64): + cmp count, 160 + ccmp valw, 0, 0, hs + b.ne L(no_zva) + +#ifndef SKIP_ZVA_CHECK + mrs zva_val, dczid_el0 + and zva_val, zva_val, 31 + cmp zva_val, 4 /* ZVA size is 64 bytes. */ + b.ne L(no_zva) +#endif str q0, [dst, 16] stp q0, q0, [dst, 32] bic dst, dst, 63 - stp q0, q0, [dst, 64] - stp q0, q0, [dst, 96] - sub count, dstend, dst /* Count is now 128 too large. */ - sub count, count, 128+64+64 /* Adjust count and bias for loop. */ - add dst, dst, 128 - nop -1: dc zva, dst + sub count, dstend, dst /* Count is now 64 too large. */ + sub count, count, 128 /* Adjust count and bias for loop. */ + + .p2align 4 +L(zva_loop): add dst, dst, 64 + dc zva, dst subs count, count, 64 - b.hi 1b - stp q0, q0, [dst, 0] - stp q0, q0, [dst, 32] + b.hi L(zva_loop) stp q0, q0, [dstend, -64] stp q0, q0, [dstend, -32] ret - .p2align 3 -L(zva_128): - cmp tmp1w, 5 /* ZVA size is 128 bytes. */ - b.ne L(zva_other) - - str q0, [dst, 16] +L(no_zva): + sub count, dstend, dst /* Count is 16 too large. */ + sub dst, dst, 16 /* Dst is biased by -32. */ + sub count, count, 64 + 16 /* Adjust count and bias for loop. */ +L(no_zva_loop): stp q0, q0, [dst, 32] - stp q0, q0, [dst, 64] - stp q0, q0, [dst, 96] - bic dst, dst, 127 - sub count, dstend, dst /* Count is now 128 too large. */ - sub count, count, 128+128 /* Adjust count and bias for loop. */ - add dst, dst, 128 -1: dc zva, dst - add dst, dst, 128 - subs count, count, 128 - b.hi 1b - stp q0, q0, [dstend, -128] - stp q0, q0, [dstend, -96] + stp q0, q0, [dst, 64]! + subs count, count, 64 + b.hi L(no_zva_loop) stp q0, q0, [dstend, -64] stp q0, q0, [dstend, -32] ret -L(zva_other): - mov tmp2w, 4 - lsl zva_lenw, tmp2w, tmp1w - add tmp1, zva_len, 64 /* Max alignment bytes written. */ - cmp count, tmp1 - blo L(no_zva) - - sub tmp2, zva_len, 1 - add tmp1, dst, zva_len - add dst, dst, 16 - subs count, tmp1, dst /* Actual alignment bytes to write. */ - bic tmp1, tmp1, tmp2 /* Aligned dc zva start address. 
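The new zero-fill path above only fires when DC ZVA is permitted and its block size is 64 bytes, which is what the "and ... 31" / "cmp ... 4" pair checks. A C equivalent of that guard (illustrative only, AArch64-specific inline asm, helper name made up):

#include <stdint.h>

/* DCZID_EL0 bits [3:0] hold log2 of the ZVA block size in 4-byte words and
 * bit 4 (DZP) is set when DC ZVA is prohibited, so (dczid & 31) == 4 means
 * "DC ZVA allowed, block size 4 << 4 = 64 bytes" - exactly the case the
 * zva_loop above is written for. */
static int dc_zva_64_usable(void)
{
    uint64_t dczid;
    __asm__ volatile("mrs %0, dczid_el0" : "=r"(dczid));
    return (dczid & 31) == 4;
}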
*/ - beq 2f -1: stp q0, q0, [dst], 64 - stp q0, q0, [dst, -32] - subs count, count, 64 - b.hi 1b -2: mov dst, tmp1 - sub count, dstend, tmp1 /* Remaining bytes to write. */ - subs count, count, zva_len - b.lo 4f -3: dc zva, dst - add dst, dst, zva_len - subs count, count, zva_len - b.hs 3b -4: add count, count, zva_len - b L(tail64) - -END(memset) +END (memset) + diff --git a/libc/arch-arm64/generic/bionic/stpcpy.S b/libc/arch-arm64/generic/bionic/stpcpy.S index e4a799387..4f62aa462 100644 --- a/libc/arch-arm64/generic/bionic/stpcpy.S +++ b/libc/arch-arm64/generic/bionic/stpcpy.S @@ -1,29 +1,10 @@ /* - * Copyright (C) 2014 The Android Open Source Project - * All rights reserved. + * stpcpy - copy a string returning pointer to end. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS - * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE - * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS - * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED - * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT - * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. + * Copyright (c) 2020, Arm Limited. + * SPDX-License-Identifier: MIT */ -#define STPCPY -#include "string_copy.S" + +#define BUILD_STPCPY 1 + +#include "strcpy.S" diff --git a/libc/arch-arm64/generic/bionic/strchrnul.S b/libc/arch-arm64/generic/bionic/strchrnul.S new file mode 100644 index 000000000..3fd909c15 --- /dev/null +++ b/libc/arch-arm64/generic/bionic/strchrnul.S @@ -0,0 +1,114 @@ +/* + * strchrnul - find a character or nul in a string + * + * Copyright (c) 2014-2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +/* Assumptions: + * + * ARMv8-a, AArch64 + * Neon Available. + */ + +#include <private/bionic_asm.h> + +/* Arguments and results. */ +#define srcin x0 +#define chrin w1 + +#define result x0 + +#define src x2 +#define tmp1 x3 +#define wtmp2 w4 +#define tmp3 x5 + +#define vrepchr v0 +#define vdata1 v1 +#define vdata2 v2 +#define vhas_nul1 v3 +#define vhas_nul2 v4 +#define vhas_chr1 v5 +#define vhas_chr2 v6 +#define vrepmask v7 +#define vend1 v16 + +/* Core algorithm. + + For each 32-byte hunk we calculate a 64-bit syndrome value, with + two bits per byte (LSB is always in bits 0 and 1, for both big + and little-endian systems). For each tuple, bit 0 is set iff + the relevant byte matched the requested character or nul. Since the + bits in the syndrome reflect exactly the order in which things occur + in the original string a count_trailing_zeros() operation will + identify exactly which byte is causing the termination. 
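A scalar model of the two-bit-per-byte syndrome that this strchrnul comment describes (illustrative only; the helper name is made up and __builtin_ctzll assumes GCC/Clang):

#include <stdint.h>

/* One bit pair per byte of the 32-byte hunk: a bit is set when that byte is
 * the requested character or NUL, so ctz(syndrome) / 2 is the index of the
 * terminating byte. The 0x40100401 repmask and the two addp reductions in
 * the NEON code are what squeeze the 256 bits of compare results into this
 * 64-bit shape. */
static int term_index_in_hunk(const unsigned char hunk[32], unsigned char c)
{
    uint64_t syndrome = 0;
    for (int i = 0; i < 32; i++)
        if (hunk[i] == c || hunk[i] == 0)
            syndrome |= 1ull << (2 * i);
    if (syndrome == 0)
        return -1;                          /* neither c nor NUL in this hunk */
    return __builtin_ctzll(syndrome) / 2;
}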
*/ + +/* Locals and temporaries. */ + +ENTRY (strchrnul) + PTR_ARG (0) + /* Magic constant 0x40100401 to allow us to identify which lane + matches the termination condition. */ + mov wtmp2, #0x0401 + movk wtmp2, #0x4010, lsl #16 + dup vrepchr.16b, chrin + bic src, srcin, #31 /* Work with aligned 32-byte hunks. */ + dup vrepmask.4s, wtmp2 + ands tmp1, srcin, #31 + b.eq L(loop) + + /* Input string is not 32-byte aligned. Rather than forcing + the padding bytes to a safe value, we calculate the syndrome + for all the bytes, but then mask off those bits of the + syndrome that are related to the padding. */ + ld1 {vdata1.16b, vdata2.16b}, [src], #32 + neg tmp1, tmp1 + cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b + cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b + cmhs vhas_nul1.16b, vhas_chr1.16b, vdata1.16b + cmhs vhas_nul2.16b, vhas_chr2.16b, vdata2.16b + and vhas_chr1.16b, vhas_nul1.16b, vrepmask.16b + and vhas_chr2.16b, vhas_nul2.16b, vrepmask.16b + lsl tmp1, tmp1, #1 + addp vend1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128 + mov tmp3, #~0 + addp vend1.16b, vend1.16b, vend1.16b // 128->64 + lsr tmp1, tmp3, tmp1 + + mov tmp3, vend1.d[0] + bic tmp1, tmp3, tmp1 // Mask padding bits. + cbnz tmp1, L(tail) + + .p2align 4 +L(loop): + ld1 {vdata1.16b, vdata2.16b}, [src], #32 + cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b + cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b + cmhs vhas_nul1.16b, vhas_chr1.16b, vdata1.16b + cmhs vhas_nul2.16b, vhas_chr2.16b, vdata2.16b + orr vend1.16b, vhas_nul1.16b, vhas_nul2.16b + umaxp vend1.16b, vend1.16b, vend1.16b + mov tmp1, vend1.d[0] + cbz tmp1, L(loop) + + /* Termination condition found. Now need to establish exactly why + we terminated. */ + and vhas_chr1.16b, vhas_nul1.16b, vrepmask.16b + and vhas_chr2.16b, vhas_nul2.16b, vrepmask.16b + addp vend1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128 + addp vend1.16b, vend1.16b, vend1.16b // 128->64 + + mov tmp1, vend1.d[0] +L(tail): + /* Count the trailing zeros, by bit reversing... */ + rbit tmp1, tmp1 + /* Re-bias source. */ + sub src, src, #32 + clz tmp1, tmp1 /* ... and counting the leading zeros. */ + /* tmp1 is twice the offset into the fragment. */ + add result, src, tmp1, lsr #1 + ret + +END (strchrnul) + diff --git a/libc/arch-arm64/generic/bionic/strcpy.S b/libc/arch-arm64/generic/bionic/strcpy.S index 260c32138..54da47c0b 100644 --- a/libc/arch-arm64/generic/bionic/strcpy.S +++ b/libc/arch-arm64/generic/bionic/strcpy.S @@ -1,29 +1,311 @@ /* - * Copyright (C) 2014 The Android Open Source Project - * All rights reserved. + * strcpy/stpcpy - copy a string returning pointer to start/end. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. + * Copyright (c) 2013-2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +/* Assumptions: * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS - * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE - * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS - * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED - * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT - * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. + * ARMv8-a, AArch64, unaligned accesses, min page size 4k. */ -#define STRCPY -#include "string_copy.S" + +#include <private/bionic_asm.h> + +/* To build as stpcpy, define BUILD_STPCPY before compiling this file. + + To test the page crossing code path more thoroughly, compile with + -DSTRCPY_TEST_PAGE_CROSS - this will force all copies through the slower + entry path. This option is not intended for production use. */ + +/* Arguments and results. */ +#define dstin x0 +#define srcin x1 + +/* Locals and temporaries. */ +#define src x2 +#define dst x3 +#define data1 x4 +#define data1w w4 +#define data2 x5 +#define data2w w5 +#define has_nul1 x6 +#define has_nul2 x7 +#define tmp1 x8 +#define tmp2 x9 +#define tmp3 x10 +#define tmp4 x11 +#define zeroones x12 +#define data1a x13 +#define data2a x14 +#define pos x15 +#define len x16 +#define to_align x17 + +#ifdef BUILD_STPCPY +#define STRCPY stpcpy +#else +#define STRCPY strcpy +#endif + + /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 + (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and + can be done in parallel across the entire word. */ + +#define REP8_01 0x0101010101010101 +#define REP8_7f 0x7f7f7f7f7f7f7f7f +#define REP8_80 0x8080808080808080 + + /* AArch64 systems have a minimum page size of 4k. We can do a quick + page size check for crossing this boundary on entry and if we + do not, then we can short-circuit much of the entry code. We + expect early page-crossing strings to be rare (probability of + 16/MIN_PAGE_SIZE ~= 0.4%), so the branch should be quite + predictable, even with random strings. + + We don't bother checking for larger page sizes, the cost of setting + up the correct page size is just not worth the extra gain from + a small reduction in the cases taking the slow path. Note that + we only care about whether the first fetch, which may be + misaligned, crosses a page boundary - after that we move to aligned + fetches for the remainder of the string. */ + +#ifdef STRCPY_TEST_PAGE_CROSS + /* Make everything that isn't Qword aligned look like a page cross. */ +#define MIN_PAGE_P2 4 +#else +#define MIN_PAGE_P2 12 +#endif + +#define MIN_PAGE_SIZE (1 << MIN_PAGE_P2) + +ENTRY (STRCPY) + PTR_ARG (0) + PTR_ARG (1) + /* For moderately short strings, the fastest way to do the copy is to + calculate the length of the string in the same way as strlen, then + essentially do a memcpy of the result. This avoids the need for + multiple byte copies and further means that by the time we + reach the bulk copy loop we know we can always use DWord + accesses. We expect __strcpy_aarch64 to rarely be called repeatedly + with the same source string, so branch prediction is likely to + always be difficult - we mitigate against this by preferring + conditional select operations over branches whenever this is + feasible. 
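The NUL-detection identity quoted above is easy to check in scalar C. A minimal sketch (not part of the patch; the helper name is made up):

#include <stdint.h>

/* (X - REP8_01) & ~(X | REP8_7f) is non-zero iff X contains a zero byte.
 * Borrows can spill flags into bytes above the first zero, but the lowest
 * set bit always lands in the first zero byte, so on little-endian
 * ctz / 8 yields its index; the big-endian path in the code instead
 * byte-reverses the data and recomputes the syndrome, as its comments
 * explain. */
static int first_nul_index(uint64_t x)
{
    const uint64_t rep01 = 0x0101010101010101ull;
    const uint64_t rep7f = 0x7f7f7f7f7f7f7f7full;
    uint64_t has_nul = (x - rep01) & ~(x | rep7f);

    return has_nul ? __builtin_ctzll(has_nul) / 8 : -1;    /* -1: no NUL */
}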
*/ + and tmp2, srcin, #(MIN_PAGE_SIZE - 1) + mov zeroones, #REP8_01 + and to_align, srcin, #15 + cmp tmp2, #(MIN_PAGE_SIZE - 16) + neg tmp1, to_align + /* The first fetch will straddle a (possible) page boundary iff + srcin + 15 causes bit[MIN_PAGE_P2] to change value. A 16-byte + aligned string will never fail the page align check, so will + always take the fast path. */ + b.gt L(page_cross) + +L(page_cross_ok): + ldp data1, data2, [srcin] +#ifdef __AARCH64EB__ + /* Because we expect the end to be found within 16 characters + (profiling shows this is the most common case), it's worth + swapping the bytes now to save having to recalculate the + termination syndrome later. We preserve data1 and data2 + so that we can re-use the values later on. */ + rev tmp2, data1 + sub tmp1, tmp2, zeroones + orr tmp2, tmp2, #REP8_7f + bics has_nul1, tmp1, tmp2 + b.ne L(fp_le8) + rev tmp4, data2 + sub tmp3, tmp4, zeroones + orr tmp4, tmp4, #REP8_7f +#else + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + bics has_nul1, tmp1, tmp2 + b.ne L(fp_le8) + sub tmp3, data2, zeroones + orr tmp4, data2, #REP8_7f +#endif + bics has_nul2, tmp3, tmp4 + b.eq L(bulk_entry) + + /* The string is short (<=16 bytes). We don't know exactly how + short though, yet. Work out the exact length so that we can + quickly select the optimal copy strategy. */ +L(fp_gt8): + rev has_nul2, has_nul2 + clz pos, has_nul2 + mov tmp2, #56 + add dst, dstin, pos, lsr #3 /* Bits to bytes. */ + sub pos, tmp2, pos +#ifdef __AARCH64EB__ + lsr data2, data2, pos +#else + lsl data2, data2, pos +#endif + str data2, [dst, #1] + str data1, [dstin] +#ifdef BUILD_STPCPY + add dstin, dst, #8 +#endif + ret + +L(fp_le8): + rev has_nul1, has_nul1 + clz pos, has_nul1 + add dst, dstin, pos, lsr #3 /* Bits to bytes. */ + subs tmp2, pos, #24 /* Pos in bits. */ + b.lt L(fp_lt4) +#ifdef __AARCH64EB__ + mov tmp2, #56 + sub pos, tmp2, pos + lsr data2, data1, pos + lsr data1, data1, #32 +#else + lsr data2, data1, tmp2 +#endif + /* 4->7 bytes to copy. */ + str data2w, [dst, #-3] + str data1w, [dstin] +#ifdef BUILD_STPCPY + mov dstin, dst +#endif + ret +L(fp_lt4): + cbz pos, L(fp_lt2) + /* 2->3 bytes to copy. */ +#ifdef __AARCH64EB__ + lsr data1, data1, #48 +#endif + strh data1w, [dstin] + /* Fall-through, one byte (max) to go. */ +L(fp_lt2): + /* Null-terminated string. Last character must be zero! */ + strb wzr, [dst] +#ifdef BUILD_STPCPY + mov dstin, dst +#endif + ret + + .p2align 6 + /* Aligning here ensures that the entry code and main loop all lies + within one 64-byte cache line. */ +L(bulk_entry): + sub to_align, to_align, #16 + stp data1, data2, [dstin] + sub src, srcin, to_align + sub dst, dstin, to_align + b L(entry_no_page_cross) + + /* The inner loop deals with two Dwords at a time. This has a + slightly higher start-up cost, but we should win quite quickly, + especially on cores with a high number of issue slots per + cycle, as we get much better parallelism out of the operations. */ +L(main_loop): + stp data1, data2, [dst], #16 +L(entry_no_page_cross): + ldp data1, data2, [src], #16 + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + sub tmp3, data2, zeroones + orr tmp4, data2, #REP8_7f + bic has_nul1, tmp1, tmp2 + bics has_nul2, tmp3, tmp4 + ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */ + b.eq L(main_loop) + + /* Since we know we are copying at least 16 bytes, the fastest way + to deal with the tail is to determine the location of the + trailing NUL, then (re)copy the 16 bytes leading up to that. 
*/ + cmp has_nul1, #0 +#ifdef __AARCH64EB__ + /* For big-endian, carry propagation (if the final byte in the + string is 0x01) means we cannot use has_nul directly. The + easiest way to get the correct byte is to byte-swap the data + and calculate the syndrome a second time. */ + csel data1, data1, data2, ne + rev data1, data1 + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + bic has_nul1, tmp1, tmp2 +#else + csel has_nul1, has_nul1, has_nul2, ne +#endif + rev has_nul1, has_nul1 + clz pos, has_nul1 + add tmp1, pos, #72 + add pos, pos, #8 + csel pos, pos, tmp1, ne + add src, src, pos, lsr #3 + add dst, dst, pos, lsr #3 + ldp data1, data2, [src, #-32] + stp data1, data2, [dst, #-16] +#ifdef BUILD_STPCPY + sub dstin, dst, #1 +#endif + ret + +L(page_cross): + bic src, srcin, #15 + /* Start by loading two words at [srcin & ~15], then forcing the + bytes that precede srcin to 0xff. This means they never look + like termination bytes. */ + ldp data1, data2, [src] + lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */ + tst to_align, #7 + csetm tmp2, ne +#ifdef __AARCH64EB__ + lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ +#else + lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ +#endif + orr data1, data1, tmp2 + orr data2a, data2, tmp2 + cmp to_align, #8 + csinv data1, data1, xzr, lt + csel data2, data2, data2a, lt + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + sub tmp3, data2, zeroones + orr tmp4, data2, #REP8_7f + bic has_nul1, tmp1, tmp2 + bics has_nul2, tmp3, tmp4 + ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */ + b.eq L(page_cross_ok) + /* We now need to make data1 and data2 look like they've been + loaded directly from srcin. Do a rotate on the 128-bit value. */ + lsl tmp1, to_align, #3 /* Bytes->bits. */ + neg tmp2, to_align, lsl #3 +#ifdef __AARCH64EB__ + lsl data1a, data1, tmp1 + lsr tmp4, data2, tmp2 + lsl data2, data2, tmp1 + orr tmp4, tmp4, data1a + cmp to_align, #8 + csel data1, tmp4, data2, lt + rev tmp2, data1 + rev tmp4, data2 + sub tmp1, tmp2, zeroones + orr tmp2, tmp2, #REP8_7f + sub tmp3, tmp4, zeroones + orr tmp4, tmp4, #REP8_7f +#else + lsr data1a, data1, tmp1 + lsl tmp4, data2, tmp2 + lsr data2, data2, tmp1 + orr tmp4, tmp4, data1a + cmp to_align, #8 + csel data1, tmp4, data2, lt + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + sub tmp3, data2, zeroones + orr tmp4, data2, #REP8_7f +#endif + bic has_nul1, tmp1, tmp2 + cbnz has_nul1, L(fp_le8) + bic has_nul2, tmp3, tmp4 + b L(fp_gt8) + +END (STRCPY) + diff --git a/libc/arch-arm64/generic/bionic/string_copy.S b/libc/arch-arm64/generic/bionic/string_copy.S deleted file mode 100644 index 2bf969d60..000000000 --- a/libc/arch-arm64/generic/bionic/string_copy.S +++ /dev/null @@ -1,245 +0,0 @@ -/* - * Copyright (C) 2014 The Android Open Source Project - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS - * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE - * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS - * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED - * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT - * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -/* - Copyright (c) 2014, Linaro Limited - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the Linaro nor the - names of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*/ - -/* Assumptions: - * - * ARMv8-a, AArch64 - */ - -#if !defined(STPCPY) && !defined(STRCPY) -#error "Either STPCPY or STRCPY must be defined." -#endif - -#include <private/bionic_asm.h> - -/* Arguments and results. */ -#if defined(STPCPY) -#define dst x0 -#elif defined(STRCPY) -#define dstin x0 -#endif -#define src x1 - -/* Locals and temporaries. */ -#if defined(STRCPY) -#define dst x2 -#endif -#define data1 x3 -#define data1_w w3 -#define data2 x4 -#define data2_w w4 -#define has_nul1 x5 -#define has_nul1_w w5 -#define has_nul2 x6 -#define tmp1 x7 -#define tmp2 x8 -#define tmp3 x9 -#define tmp4 x10 -#define zeroones x11 -#define zeroones_w w11 -#define pos x12 - -#define REP8_01 0x0101010101010101 -#define REP8_7f 0x7f7f7f7f7f7f7f7f -#define REP8_80 0x8080808080808080 - -#if defined(STPCPY) -ENTRY(stpcpy) -#elif defined(STRCPY) -ENTRY(strcpy) -#endif - mov zeroones, #REP8_01 -#if defined(STRCPY) - mov dst, dstin -#endif - ands tmp1, src, #15 - b.ne .Lmisaligned - // NUL detection works on the principle that (X - 1) & (~X) & 0x80 - // (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and - // can be done in parallel across the entire word. 
- // The inner loop deals with two Dwords at a time. This has a - // slightly higher start-up cost, but we should win quite quickly, - // especially on cores with a high number of issue slots per - // cycle, as we get much better parallelism out of the operations. -.Lloop: - ldp data1, data2, [src], #16 - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - bic has_nul1, tmp1, tmp2 - cbnz has_nul1, .Lnul_in_data1 - sub tmp3, data2, zeroones - orr tmp4, data2, #REP8_7f - bic has_nul2, tmp3, tmp4 - cbnz has_nul2, .Lnul_in_data2 - // No NUL in either register, copy it in a single instruction. - stp data1, data2, [dst], #16 - b .Lloop - -.Lnul_in_data1: - rev has_nul1, has_nul1 - clz pos, has_nul1 - add tmp1, pos, #0x8 - - tbz tmp1, #6, 1f -#if defined(STPCPY) - str data1, [dst], #7 -#elif defined(STRCPY) - str data1, [dst] -#endif - ret -1: - tbz tmp1, #5, 1f - str data1_w, [dst], #4 - lsr data1, data1, #32 -1: - tbz tmp1, #4, 1f - strh data1_w, [dst], #2 - lsr data1, data1, #16 -1: - tbz tmp1, #3, 1f - strb data1_w, [dst] -#if defined(STPCPY) - ret -#endif -1: -#if defined(STPCPY) - // Back up one so that dst points to the '\0' string terminator. - sub dst, dst, #1 -#endif - ret - -.Lnul_in_data2: - str data1, [dst], #8 - rev has_nul2, has_nul2 - clz pos, has_nul2 - add tmp1, pos, #0x8 - - tbz tmp1, #6, 1f -#if defined(STPCPY) - str data2, [dst], #7 -#elif defined(STRCPY) - str data2, [dst] -#endif - ret -1: - tbz tmp1, #5, 1f - str data2_w, [dst], #4 - lsr data2, data2, #32 -1: - tbz tmp1, #4, 1f - strh data2_w, [dst], #2 - lsr data2, data2, #16 -1: - tbz tmp1, #3, 1f - strb data2_w, [dst] -#if defined(STPCPY) - ret -#endif -1: -#if defined(STPCPY) - // Back up one so that dst points to the '\0' string terminator. - sub dst, dst, #1 -#endif - ret - -.Lmisaligned: - tbz src, #0, 1f - ldrb data1_w, [src], #1 - strb data1_w, [dst], #1 - cbnz data1_w, 1f -#if defined(STPCPY) - // Back up one so that dst points to the '\0' string terminator. - sub dst, dst, #1 -#endif - ret -1: - tbz src, #1, 1f - ldrb data1_w, [src], #1 - strb data1_w, [dst], #1 - cbz data1_w, .Ldone - ldrb data2_w, [src], #1 - strb data2_w, [dst], #1 - cbnz data2_w, 1f -.Ldone: -#if defined(STPCPY) - // Back up one so that dst points to the '\0' string terminator. - sub dst, dst, #1 -#endif - ret -1: - tbz src, #2, 1f - ldr data1_w, [src], #4 - // Check for a zero. - sub has_nul1_w, data1_w, zeroones_w - bic has_nul1_w, has_nul1_w, data1_w - ands has_nul1_w, has_nul1_w, #0x80808080 - b.ne .Lnul_in_data1 - str data1_w, [dst], #4 -1: - tbz src, #3, .Lloop - ldr data1, [src], #8 - // Check for a zero. - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - bics has_nul1, tmp1, tmp2 - b.ne .Lnul_in_data1 - str data1, [dst], #8 - b .Lloop -#if defined(STPCPY) -END(stpcpy) -#elif defined(STRCPY) -END(strcpy) -#endif diff --git a/libc/arch-arm64/generic/bionic/strrchr.S b/libc/arch-arm64/generic/bionic/strrchr.S new file mode 100644 index 000000000..274d19746 --- /dev/null +++ b/libc/arch-arm64/generic/bionic/strrchr.S @@ -0,0 +1,149 @@ +/* + * strrchr - find last position of a character in a string. + * + * Copyright (c) 2014-2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +/* Assumptions: + * + * ARMv8-a, AArch64 + * Neon Available. + */ + +#include <private/bionic_asm.h> + +/* Arguments and results. 
*/ +#define srcin x0 +#define chrin w1 + +#define result x0 + +#define src x2 +#define tmp1 x3 +#define wtmp2 w4 +#define tmp3 x5 +#define src_match x6 +#define src_offset x7 +#define const_m1 x8 +#define tmp4 x9 +#define nul_match x10 +#define chr_match x11 + +#define vrepchr v0 +#define vdata1 v1 +#define vdata2 v2 +#define vhas_nul1 v3 +#define vhas_nul2 v4 +#define vhas_chr1 v5 +#define vhas_chr2 v6 +#define vrepmask_0 v7 +#define vrepmask_c v16 +#define vend1 v17 +#define vend2 v18 + +/* Core algorithm. + + For each 32-byte hunk we calculate a 64-bit syndrome value, with + two bits per byte (LSB is always in bits 0 and 1, for both big + and little-endian systems). For each tuple, bit 0 is set iff + the relevant byte matched the requested character; bit 1 is set + iff the relevant byte matched the NUL end of string (we trigger + off bit0 for the special case of looking for NUL). Since the bits + in the syndrome reflect exactly the order in which things occur + in the original string a count_trailing_zeros() operation will + identify exactly which byte is causing the termination, and why. */ + +ENTRY (strrchr) + PTR_ARG (0) + /* Magic constant 0x40100401 to allow us to identify which lane + matches the requested byte. Magic constant 0x80200802 used + similarly for NUL termination. */ + mov wtmp2, #0x0401 + movk wtmp2, #0x4010, lsl #16 + dup vrepchr.16b, chrin + bic src, srcin, #31 /* Work with aligned 32-byte hunks. */ + dup vrepmask_c.4s, wtmp2 + mov src_offset, #0 + ands tmp1, srcin, #31 + add vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */ + b.eq L(aligned) + + /* Input string is not 32-byte aligned. Rather than forcing + the padding bytes to a safe value, we calculate the syndrome + for all the bytes, but then mask off those bits of the + syndrome that are related to the padding. */ + ld1 {vdata1.16b, vdata2.16b}, [src], #32 + neg tmp1, tmp1 + cmeq vhas_nul1.16b, vdata1.16b, #0 + cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b + cmeq vhas_nul2.16b, vdata2.16b, #0 + cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b + and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b + and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b + and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b + and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b + addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b // 256->128 + addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128 + addp vend1.16b, vhas_nul1.16b, vhas_chr1.16b // 128->64 + mov nul_match, vend1.d[0] + lsl tmp1, tmp1, #1 + mov const_m1, #~0 + lsr tmp3, const_m1, tmp1 + mov chr_match, vend1.d[1] + + bic nul_match, nul_match, tmp3 // Mask padding bits. + bic chr_match, chr_match, tmp3 // Mask padding bits. 
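Later in this function, the L(tail) sequence keeps only the character matches that occur no later than the terminating NUL. A scalar sketch of that masking trick (illustrative only; the helper name is made up):

#include <stdint.h>

/* nul_match and chr_match are syndromes with bits in string order.
 * (nul - 1) ^ nul sets every bit up to and including the lowest set bit of
 * nul_match (the first NUL), so the AND below discards character matches
 * that lie beyond the end of the string - the same effect as the
 * sub/eor/ands sequence in L(tail). */
static uint64_t matches_before_nul(uint64_t chr_match, uint64_t nul_match)
{
    uint64_t up_to_nul = (nul_match - 1) ^ nul_match;
    return chr_match & up_to_nul;
}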
+ cbnz nul_match, L(tail) + + .p2align 4 +L(loop): + cmp chr_match, #0 + csel src_match, src, src_match, ne + csel src_offset, chr_match, src_offset, ne +L(aligned): + ld1 {vdata1.16b, vdata2.16b}, [src], #32 + cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b + cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b + uminp vend1.16b, vdata1.16b, vdata2.16b + and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b + and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b + cmeq vend1.16b, vend1.16b, 0 + addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128 + addp vend1.16b, vend1.16b, vhas_chr1.16b // 128->64 + mov nul_match, vend1.d[0] + mov chr_match, vend1.d[1] + cbz nul_match, L(loop) + + cmeq vhas_nul1.16b, vdata1.16b, #0 + cmeq vhas_nul2.16b, vdata2.16b, #0 + and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b + and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b + addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b + addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul1.16b + mov nul_match, vhas_nul1.d[0] + +L(tail): + /* Work out exactly where the string ends. */ + sub tmp4, nul_match, #1 + eor tmp4, tmp4, nul_match + ands chr_match, chr_match, tmp4 + /* And pick the values corresponding to the last match. */ + csel src_match, src, src_match, ne + csel src_offset, chr_match, src_offset, ne + + /* Count down from the top of the syndrome to find the last match. */ + clz tmp3, src_offset + /* Src_match points beyond the word containing the match, so we can + simply subtract half the bit-offset into the syndrome. Because + we are counting down, we need to go back one more character. */ + add tmp3, tmp3, #2 + sub result, src_match, tmp3, lsr #1 + /* But if the syndrome shows no match was found, then return NULL. */ + cmp src_offset, #0 + csel result, result, xzr, ne + + ret + +END (strrchr) + diff --git a/libc/arch-arm64/generic/bionic/wmemmove.S b/libc/arch-arm64/generic/bionic/wmemmove.S index e4f67f759..dc03f4c92 100644 --- a/libc/arch-arm64/generic/bionic/wmemmove.S +++ b/libc/arch-arm64/generic/bionic/wmemmove.S @@ -1,30 +1,33 @@ -/* Copyright (c) 2014, Linaro Limited - All rights reserved. +/* + * Copyright (C) 2008 The Android Open Source Project + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the Linaro nor the - names of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. +#include <private/bionic_asm.h> - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*/ - -#define WMEMMOVE -#include "memmove.S" -#undef WMEMMOVE +ENTRY(wmemmove) + #include "memcpy_base.S" +END(wmemmove) diff --git a/libc/private/bionic_asm.h b/libc/private/bionic_asm.h index 6409563f2..d18f0fc42 100644 --- a/libc/private/bionic_asm.h +++ b/libc/private/bionic_asm.h @@ -83,4 +83,15 @@ .globl alias; \ .equ alias, original +/* For arm-optimized-routines */ +#define L(l) .L ## l +#ifdef __ILP32__ + /* Sanitize padding bits of pointer arguments as per aapcs64 */ +#define PTR_ARG(n) mov w##n, w##n +#define SIZE_ARG(n) mov w##n, w##n +#else +#define PTR_ARG(n) +#define SIZE_ARG(n) +#endif + #endif |
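The PTR_ARG/SIZE_ARG macros added at the end only matter for ILP32, where the upper half of a 64-bit register holding a 32-bit pointer or size is not guaranteed to be zero. A trivial C illustration of why "mov wN, wN" is sufficient (illustrative only; the helper name is made up):

#include <stdint.h>

/* Writing the 32-bit view of a register zeroes its upper 32 bits, which is
 * all "mov w0, w0" (the __ILP32__ expansion of PTR_ARG(0)) does: it turns a
 * possibly-dirty x0 into a cleanly zero-extended value before the 64-bit
 * addressing arithmetic in the routines above runs. */
static uint64_t sanitize_reg(uint64_t x0)
{
    return (uint32_t)x0;        /* keep the low 32 bits, zero the rest */
}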