author     Danny Lin <danny@kdrag0n.dev>           2021-03-21 01:51:00 -0700
committer  alk3pInjection <webmaster@raspii.tech>  2021-09-27 21:17:05 +0800
commit     03559b84468a98cd72150a66092ec463035bb666 (patch)
tree       bb321caca05a1413985d29560130bde2550b5fc7
parent     ba602b8990ff1b695ea580a537a07a07224da937 (diff)

[ProtonAOSP] Switch to arm-optimized-routines for arm64 (lineage-18.1)
This replaces the following string functions with implementations from
arm-optimized-routines [1]:

- memchr
- strchr
- strcmp
- strlen
- strncmp
- strnlen
- memcmp
- memcpy (NEON variant)
- memmove
- memrchr
- memset
- stpcpy
- strchrnul
- strcpy
- strrchr
- wmemmove

All string functions supported by arm-optimized-routines were added as
of upstream commit e823e3abf5f89ecba58a10fc0fd82c13d9984b6b. The NEON
variant of memcpy was used with the assumption that all of our arm64
targets support NEON.

[1] https://github.com/ARM-software/optimized-routines

Test: builds and boots
Change-Id: Id43fcf54c459b7edab306030c668d1c1ce3d95cd
-rw-r--r--  libc/Android.bp                              |   13
-rw-r--r--  libc/arch-arm64/default/bionic/memchr.S      |   66
-rw-r--r--  libc/arch-arm64/default/bionic/strchr.S      |   91
-rw-r--r--  libc/arch-arm64/default/bionic/strcmp.S      |   43
-rw-r--r--  libc/arch-arm64/default/bionic/strlen.S      |  311
-rw-r--r--  libc/arch-arm64/default/bionic/strncmp.S     |  114
-rw-r--r--  libc/arch-arm64/default/bionic/strnlen.S     |  252
-rw-r--r--  libc/arch-arm64/generic/bionic/memcmp.S      |   48
-rw-r--r--  libc/arch-arm64/generic/bionic/memcpy.S      |    4
-rw-r--r--  libc/arch-arm64/generic/bionic/memcpy_base.S |  311
-rw-r--r--  libc/arch-arm64/generic/bionic/memmove.S     |  160
-rw-r--r--  libc/arch-arm64/generic/bionic/memrchr.S     |  117
-rw-r--r--  libc/arch-arm64/generic/bionic/memset.S      |  207
-rw-r--r--  libc/arch-arm64/generic/bionic/stpcpy.S      |   33
-rw-r--r--  libc/arch-arm64/generic/bionic/strchrnul.S   |  114
-rw-r--r--  libc/arch-arm64/generic/bionic/strcpy.S      |  332
-rw-r--r--  libc/arch-arm64/generic/bionic/string_copy.S |  245
-rw-r--r--  libc/arch-arm64/generic/bionic/strrchr.S     |  149
-rw-r--r--  libc/arch-arm64/generic/bionic/wmemmove.S    |   59
-rw-r--r--  libc/private/bionic_asm.h                    |   11
20 files changed, 1322 insertions(+), 1358 deletions(-)
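
[Editor's note: the commit is validated only by "builds and boots". A
minimal C smoke test exercising the replaced routines could look like the
sketch below -- a hypothetical test relying only on standard libc
semantics, not part of the commit.]

#include <assert.h>
#include <string.h>

int main(void) {
    char buf[32] = "hello, bionic";

    assert(memchr(buf, 'b', sizeof(buf)) == buf + 7);
    assert(strchr(buf, ',') == buf + 5);
    assert(strlen(buf) == 13);
    assert(strnlen(buf, 5) == 5);
    assert(strcmp("abc", "abd") < 0);
    assert(strncmp("abc", "abd", 2) == 0);
    assert(memcmp("abc", "abd", 2) == 0);

    char dst[32];
    memset(dst, 0, sizeof(dst));
    memcpy(dst, buf, 14);              /* 13 chars + NUL terminator */
    assert(strcmp(dst, buf) == 0);
    memmove(dst + 1, dst, 14);         /* overlapping forward move */
    assert(strcmp(dst + 1, "hello, bionic") == 0);
    return 0;
}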
diff --git a/libc/Android.bp b/libc/Android.bp
index 08fce3261..a422c26c5 100644
--- a/libc/Android.bp
+++ b/libc/Android.bp
@@ -619,6 +619,7 @@ cc_library_static {
arm64: {
exclude_srcs: [
"upstream-openbsd/lib/libc/string/memchr.c",
+ "upstream-openbsd/lib/libc/string/memrchr.c",
"upstream-openbsd/lib/libc/string/stpcpy.c",
"upstream-openbsd/lib/libc/string/strcpy.c",
"upstream-openbsd/lib/libc/string/strncmp.c",
@@ -851,9 +852,12 @@ cc_library_static {
"arch-arm64/generic/bionic/memcmp.S",
"arch-arm64/generic/bionic/memcpy.S",
"arch-arm64/generic/bionic/memmove.S",
+ "arch-arm64/generic/bionic/memrchr.S",
"arch-arm64/generic/bionic/memset.S",
"arch-arm64/generic/bionic/stpcpy.S",
+ "arch-arm64/generic/bionic/strchrnul.S",
"arch-arm64/generic/bionic/strcpy.S",
+ "arch-arm64/generic/bionic/strrchr.S",
"arch-arm64/generic/bionic/wmemmove.S",
"arch-arm64/default/bionic/memchr.S",
@@ -880,6 +884,7 @@ cc_library_static {
"bionic/__memcpy_chk.cpp",
"bionic/strchr.cpp",
"bionic/strnlen.c",
+ "bionic/strrchr.cpp",
],
},
@@ -1178,6 +1183,14 @@ cc_library_static {
"bionic/icu_static.cpp",
],
+ arch: {
+ arm64: {
+ exclude_srcs: [
+ "bionic/strchrnul.cpp",
+ ],
+ },
+ },
+
multilib: {
lib32: {
// LP32 cruft
diff --git a/libc/arch-arm64/default/bionic/memchr.S b/libc/arch-arm64/default/bionic/memchr.S
index 7fbcc8fd4..f5599bf2d 100644
--- a/libc/arch-arm64/default/bionic/memchr.S
+++ b/libc/arch-arm64/default/bionic/memchr.S
@@ -1,32 +1,9 @@
/*
+ * memchr - find a character in a memory zone
*
- Copyright (c) 2014, ARM Limited
- All rights Reserved.
- Copyright (c) 2014, Linaro Ltd.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- * Neither the name of the company nor the names of its contributors
- may be used to endorse or promote products derived from this
- software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
+ * Copyright (c) 2014-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
/* Assumptions:
*
@@ -69,12 +46,15 @@
* identify exactly which byte has matched.
*/
-ENTRY(memchr_default)
+ENTRY (memchr_default)
+ PTR_ARG (0)
+ SIZE_ARG (2)
+ /* Do not dereference srcin if no bytes to compare. */
+ cbz cntin, L(zero_length)
/*
* Magic constant 0x40100401 allows us to identify which lane matches
* the requested byte.
*/
- cbz cntin, .Lzero_length
mov wtmp2, #0x0401
movk wtmp2, #0x4010, lsl #16
dup vrepchr.16b, chrin
@@ -83,7 +63,7 @@ ENTRY(memchr_default)
dup vrepmask.4s, wtmp2
ands soff, srcin, #31
and cntrem, cntin, #31
- b.eq .Lloop
+ b.eq L(loop)
/*
* Input string is not 32-byte aligned. We calculate the syndrome
@@ -106,25 +86,25 @@ ENTRY(memchr_default)
lsr synd, synd, tmp
lsl synd, synd, tmp
/* The first block can also be the last */
- b.ls .Lmasklast
+ b.ls L(masklast)
/* Have we found something already? */
- cbnz synd, .Ltail
+ cbnz synd, L(tail)
-.Lloop:
+L(loop):
ld1 {vdata1.16b, vdata2.16b}, [src], #32
subs cntin, cntin, #32
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
/* If we're out of data we finish regardless of the result */
- b.ls .Lend
+ b.ls L(end)
/* Use a fast check for the termination condition */
orr vend.16b, vhas_chr1.16b, vhas_chr2.16b
addp vend.2d, vend.2d, vend.2d
mov synd, vend.d[0]
/* We're not out of data, loop if we haven't found the character */
- cbz synd, .Lloop
+ cbz synd, L(loop)
-.Lend:
+L(end):
/* Termination condition found, let's calculate the syndrome value */
and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
@@ -132,9 +112,9 @@ ENTRY(memchr_default)
addp vend.16b, vend.16b, vend.16b /* 128->64 */
mov synd, vend.d[0]
/* Only do the clear for the last possible block */
- b.hi .Ltail
+ b.hs L(tail)
-.Lmasklast:
+L(masklast):
/* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */
add tmp, cntrem, soff
and tmp, tmp, #31
@@ -143,7 +123,7 @@ ENTRY(memchr_default)
lsl synd, synd, tmp
lsr synd, synd, tmp
-.Ltail:
+L(tail):
/* Count the trailing zeros using bit reversing */
rbit synd, synd
/* Compensate the last post-increment */
@@ -158,7 +138,9 @@ ENTRY(memchr_default)
csel result, xzr, result, eq
ret
-.Lzero_length:
- mov result, xzr
+L(zero_length):
+ mov result, #0
ret
-END(memchr_default)
+
+END (memchr_default)
+
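
[Editor's note: the "syndrome" technique the memchr comments describe has
a scalar analogue that is easier to follow in C: XOR each word with the
repeated search byte so matching bytes become 0x00, flag zero bytes with
the (x - 0x01..01) & ~x & 0x80..80 identity, and count trailing zeros to
locate the first match. A little-endian sketch with a hypothetical name,
not the NEON routine itself:]

#include <stddef.h>
#include <stdint.h>
#include <string.h>

void *memchr_sketch(const void *s, int c, size_t n) {
    const unsigned char *p = s;
    uint64_t rep = 0x0101010101010101ULL * (unsigned char)c;

    while (n >= 8) {
        uint64_t x;
        memcpy(&x, p, 8);                     /* unaligned-safe load */
        x ^= rep;                             /* matching bytes -> 0x00 */
        uint64_t synd = (x - 0x0101010101010101ULL) & ~x
                        & 0x8080808080808080ULL;
        if (synd)                             /* lowest set bit = first match */
            return (void *)(p + (__builtin_ctzll(synd) >> 3));
        p += 8;
        n -= 8;
    }
    for (; n; n--, p++)
        if (*p == (unsigned char)c)
            return (void *)p;
    return NULL;
}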
diff --git a/libc/arch-arm64/default/bionic/strchr.S b/libc/arch-arm64/default/bionic/strchr.S
index f8cb724cc..ae522e81d 100644
--- a/libc/arch-arm64/default/bionic/strchr.S
+++ b/libc/arch-arm64/default/bionic/strchr.S
@@ -1,32 +1,9 @@
/*
+ * strchr - find a character in a string
*
- Copyright (c) 2014, ARM Limited
- All rights Reserved.
- Copyright (c) 2014, Linaro Ltd.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- * Neither the name of the company nor the names of its contributors
- may be used to endorse or promote products derived from this
- software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
+ * Copyright (c) 2014-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
/* Assumptions:
*
@@ -73,18 +50,19 @@
/* Locals and temporaries. */
-ENTRY(strchr_default)
- /* Magic constant 0x40100401 to allow us to identify which lane
- matches the requested byte. Magic constant 0x80200802 used
- similarly for NUL termination. */
- mov wtmp2, #0x0401
- movk wtmp2, #0x4010, lsl #16
+ENTRY (strchr_default)
+ PTR_ARG (0)
+ /* Magic constant 0xc0300c03 to allow us to identify which lane
+ matches the requested byte. Even bits are set if the character
+ matches, odd bits if either the char is NUL or matches. */
+ mov wtmp2, 0x0c03
+ movk wtmp2, 0xc030, lsl 16
dup vrepchr.16b, chrin
bic src, srcin, #31 /* Work with aligned 32-byte hunks. */
dup vrepmask_c.4s, wtmp2
ands tmp1, srcin, #31
add vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */
- b.eq .Lloop
+ b.eq L(loop)
/* Input string is not 32-byte aligned. Rather than forcing
the padding bytes to a safe value, we calculate the syndrome
@@ -96,12 +74,10 @@ ENTRY(strchr_default)
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
cmeq vhas_nul2.16b, vdata2.16b, #0
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
- and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
- and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
- and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
- and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
- orr vend1.16b, vhas_nul1.16b, vhas_chr1.16b
- orr vend2.16b, vhas_nul2.16b, vhas_chr2.16b
+ bif vhas_nul1.16b, vhas_chr1.16b, vrepmask_0.16b
+ bif vhas_nul2.16b, vhas_chr2.16b, vrepmask_0.16b
+ and vend1.16b, vhas_nul1.16b, vrepmask_c.16b
+ and vend2.16b, vhas_nul2.16b, vrepmask_c.16b
lsl tmp1, tmp1, #1
addp vend1.16b, vend1.16b, vend2.16b // 256->128
mov tmp3, #~0
@@ -110,35 +86,30 @@ ENTRY(strchr_default)
mov tmp3, vend1.d[0]
bic tmp1, tmp3, tmp1 // Mask padding bits.
- cbnz tmp1, .Ltail
+ cbnz tmp1, L(tail)
-.Lloop:
+ .p2align 4
+L(loop):
ld1 {vdata1.16b, vdata2.16b}, [src], #32
- cmeq vhas_nul1.16b, vdata1.16b, #0
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
- cmeq vhas_nul2.16b, vdata2.16b, #0
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
- /* Use a fast check for the termination condition. */
- orr vend1.16b, vhas_nul1.16b, vhas_chr1.16b
- orr vend2.16b, vhas_nul2.16b, vhas_chr2.16b
- orr vend1.16b, vend1.16b, vend2.16b
- addp vend1.2d, vend1.2d, vend1.2d
+ cmhs vhas_nul1.16b, vhas_chr1.16b, vdata1.16b
+ cmhs vhas_nul2.16b, vhas_chr2.16b, vdata2.16b
+ orr vend1.16b, vhas_nul1.16b, vhas_nul2.16b
+ umaxp vend1.16b, vend1.16b, vend1.16b
mov tmp1, vend1.d[0]
- cbz tmp1, .Lloop
+ cbz tmp1, L(loop)
/* Termination condition found. Now need to establish exactly why
we terminated. */
- and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
- and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
- and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
- and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
- orr vend1.16b, vhas_nul1.16b, vhas_chr1.16b
- orr vend2.16b, vhas_nul2.16b, vhas_chr2.16b
+ bif vhas_nul1.16b, vhas_chr1.16b, vrepmask_0.16b
+ bif vhas_nul2.16b, vhas_chr2.16b, vrepmask_0.16b
+ and vend1.16b, vhas_nul1.16b, vrepmask_c.16b
+ and vend2.16b, vhas_nul2.16b, vrepmask_c.16b
addp vend1.16b, vend1.16b, vend2.16b // 256->128
addp vend1.16b, vend1.16b, vend2.16b // 128->64
-
mov tmp1, vend1.d[0]
-.Ltail:
+L(tail):
/* Count the trailing zeros, by bit reversing... */
rbit tmp1, tmp1
/* Re-bias source. */
@@ -150,4 +121,6 @@ ENTRY(strchr_default)
add result, src, tmp1, lsr #1
csel result, result, xzr, eq
ret
-END(strchr_default)
+
+END (strchr_default)
+
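
[Editor's note: the even/odd bit split in the new 0xc0300c03 mask exists
to distinguish "found the character" from "hit the terminator first".
The contract the assembly must preserve is easiest to state as a C
reference model (hypothetical name), including the edge case where the
search character is '\0' itself:]

#include <stddef.h>

char *strchr_sketch(const char *s, int c) {
    for (;; s++) {
        if (*s == (char)c)
            return (char *)s;    /* match; also covers c == '\0' */
        if (*s == '\0')
            return NULL;         /* terminator reached, no match */
    }
}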
diff --git a/libc/arch-arm64/default/bionic/strcmp.S b/libc/arch-arm64/default/bionic/strcmp.S
index dfac7c42c..418b4c894 100644
--- a/libc/arch-arm64/default/bionic/strcmp.S
+++ b/libc/arch-arm64/default/bionic/strcmp.S
@@ -1,29 +1,9 @@
-/* Copyright (c) 2012, Linaro Limited
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- * Neither the name of the Linaro nor the
- names of its contributors may be used to endorse or promote products
- derived from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
+/*
+ * strcmp - compare two strings
+ *
+ * Copyright (c) 2012-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
/* Assumptions:
*
@@ -32,8 +12,6 @@
#include <private/bionic_asm.h>
-#define L(label) .L ## label
-
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
#define REP8_80 0x8080808080808080
@@ -58,8 +36,9 @@
#define pos x11
/* Start of performance-critical section -- one 64B cache line. */
-ENTRY(strcmp_default)
-.p2align 6
+ENTRY (strcmp_default)
+ PTR_ARG (0)
+ PTR_ARG (1)
eor tmp1, src1, src2
mov zeroones, #REP8_01
tst tmp1, #7
@@ -189,4 +168,6 @@ L(loop_misaligned):
L(done):
sub result, data1, data2
ret
-END(strcmp_default)
+
+END (strcmp_default)
+
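
[Editor's note: the REP8 constants implement the word-at-a-time NUL test
used throughout these routines: (X - REP8_01) & ~(X | REP8_7f) is
non-zero iff X contains a zero byte. The sub/orr/bics triple in the
aligned loop evaluates exactly this expression; as a C sketch:]

#include <stdint.h>

#define REP8_01 0x0101010101010101ULL
#define REP8_7f 0x7f7f7f7f7f7f7f7fULL

/* Non-zero iff x contains a 0x00 byte. */
static inline uint64_t has_nul(uint64_t x) {
    return (x - REP8_01) & ~(x | REP8_7f);
}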
diff --git a/libc/arch-arm64/default/bionic/strlen.S b/libc/arch-arm64/default/bionic/strlen.S
index 07c5294c2..d102dec2d 100644
--- a/libc/arch-arm64/default/bionic/strlen.S
+++ b/libc/arch-arm64/default/bionic/strlen.S
@@ -1,105 +1,88 @@
-/* Copyright (c) 2013-2015, Linaro Limited
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- * Neither the name of the Linaro nor the
- names of its contributors may be used to endorse or promote products
- derived from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
+/*
+ * strlen - calculate the length of a string.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
/* Assumptions:
*
- * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
+ * Not MTE compatible.
*/
#include <private/bionic_asm.h>
-/* To test the page crossing code path more thoroughly, compile with
- -DTEST_PAGE_CROSS - this will force all calls through the slower
- entry path. This option is not intended for production use. */
-
-/* Arguments and results. */
-#define srcin x0
-#define len x0
-
-/* Locals and temporaries. */
-#define src x1
-#define data1 x2
-#define data2 x3
-#define has_nul1 x4
-#define has_nul2 x5
-#define tmp1 x4
-#define tmp2 x5
-#define tmp3 x6
-#define tmp4 x7
-#define zeroones x8
-
-#define L(l) .L ## l
-
- /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
- (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
- can be done in parallel across the entire word. A faster check
- (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives
- false hits for characters 129..255. */
+#define srcin x0
+#define len x0
+
+#define src x1
+#define data1 x2
+#define data2 x3
+#define has_nul1 x4
+#define has_nul2 x5
+#define tmp1 x4
+#define tmp2 x5
+#define tmp3 x6
+#define tmp4 x7
+#define zeroones x8
+
+#define maskv v0
+#define maskd d0
+#define dataq1 q1
+#define dataq2 q2
+#define datav1 v1
+#define datav2 v2
+#define tmp x2
+#define tmpw w2
+#define synd x3
+#define shift x4
+
+/* For the first 32 bytes, NUL detection works on the principle that
+ (X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero if a
+ byte is zero, and can be done in parallel across the entire word. */
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
-#define REP8_80 0x8080808080808080
+
+/* To test the page crossing code path more thoroughly, compile with
+ -DTEST_PAGE_CROSS - this will force all calls through the slower
+ entry path. This option is not intended for production use. */
#ifdef TEST_PAGE_CROSS
-# define MIN_PAGE_SIZE 15
+# define MIN_PAGE_SIZE 32
#else
# define MIN_PAGE_SIZE 4096
#endif
- /* Since strings are short on average, we check the first 16 bytes
- of the string for a NUL character. In order to do an unaligned ldp
- safely we have to do a page cross check first. If there is a NUL
- byte we calculate the length from the 2 8-byte words using
- conditional select to reduce branch mispredictions (it is unlikely
- strlen will be repeatedly called on strings with the same length).
-
- If the string is longer than 16 bytes, we align src so don't need
- further page cross checks, and process 32 bytes per iteration
- using the fast NUL check. If we encounter non-ASCII characters,
- fallback to a second loop using the full NUL check.
-
- If the page cross check fails, we read 16 bytes from an aligned
- address, remove any characters before the string, and continue
- in the main loop using aligned loads. Since strings crossing a
- page in the first 16 bytes are rare (probability of
- 16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized.
-
- AArch64 systems have a minimum page size of 4k. We don't bother
- checking for larger page sizes - the cost of setting up the correct
- page size is just not worth the extra gain from a small reduction in
- the cases taking the slow path. Note that we only care about
- whether the first fetch, which may be misaligned, crosses a page
- boundary. */
-
-ENTRY(strlen_default)
+/* Core algorithm:
+
+ Since strings are short on average, we check the first 32 bytes of the
+ string for a NUL character without aligning the string. In order to use
+ unaligned loads safely we must do a page cross check first.
+
+ If there is a NUL byte we calculate the length from the 2 8-byte words
+ using conditional select to reduce branch mispredictions (it is unlikely
+ strlen will be repeatedly called on strings with the same length).
+
+ If the string is longer than 32 bytes, align src so we don't need further
+ page cross checks, and process 32 bytes per iteration using a fast SIMD
+ loop.
+
+ If the page cross check fails, we read 32 bytes from an aligned address,
+ and ignore any characters before the string. If it contains a NUL
+ character, return the length, if not, continue in the main loop. */
+
+ENTRY (strlen_default)
+ PTR_ARG (0)
and tmp1, srcin, MIN_PAGE_SIZE - 1
- mov zeroones, REP8_01
- cmp tmp1, MIN_PAGE_SIZE - 16
- b.gt L(page_cross)
+ cmp tmp1, MIN_PAGE_SIZE - 32
+ b.hi L(page_cross)
+
+ /* Look for a NUL byte in the first 16 bytes. */
ldp data1, data2, [srcin]
+ mov zeroones, REP8_01
+
#ifdef __AARCH64EB__
/* For big-endian, carry propagation (if the final byte in the
string is 0x01) means we cannot use has_nul1/2 directly.
@@ -115,113 +98,103 @@ ENTRY(strlen_default)
bics has_nul1, tmp1, tmp2
bic has_nul2, tmp3, tmp4
ccmp has_nul2, 0, 0, eq
- beq L(main_loop_entry)
+ b.eq L(bytes16_31)
- /* Enter with C = has_nul1 == 0. */
+ /* Find the exact offset of the first NUL byte in the first 16 bytes
+ from the string start. Enter with C = has_nul1 == 0. */
csel has_nul1, has_nul1, has_nul2, cc
mov len, 8
rev has_nul1, has_nul1
- clz tmp1, has_nul1
csel len, xzr, len, cc
+ clz tmp1, has_nul1
add len, len, tmp1, lsr 3
ret
- /* The inner loop processes 32 bytes per iteration and uses the fast
- NUL check. If we encounter non-ASCII characters, use a second
- loop with the accurate NUL check. */
- .p2align 4
-L(main_loop_entry):
- bic src, srcin, 15
- sub src, src, 16
-L(main_loop):
- ldp data1, data2, [src, 32]!
-.Lpage_cross_entry:
- sub tmp1, data1, zeroones
- sub tmp3, data2, zeroones
- orr tmp2, tmp1, tmp3
- tst tmp2, zeroones, lsl 7
- bne 1f
- ldp data1, data2, [src, 16]
+ .p2align 3
+ /* Look for a NUL byte at offset 16..31 in the string. */
+L(bytes16_31):
+ ldp data1, data2, [srcin, 16]
+#ifdef __AARCH64EB__
+ rev data1, data1
+ rev data2, data2
+#endif
sub tmp1, data1, zeroones
- sub tmp3, data2, zeroones
- orr tmp2, tmp1, tmp3
- tst tmp2, zeroones, lsl 7
- beq L(main_loop)
- add src, src, 16
-1:
- /* The fast check failed, so do the slower, accurate NUL check. */
orr tmp2, data1, REP8_7f
+ sub tmp3, data2, zeroones
orr tmp4, data2, REP8_7f
bics has_nul1, tmp1, tmp2
bic has_nul2, tmp3, tmp4
ccmp has_nul2, 0, 0, eq
- beq L(nonascii_loop)
+ b.eq L(loop_entry)
- /* Enter with C = has_nul1 == 0. */
-L(tail):
-#ifdef __AARCH64EB__
- /* For big-endian, carry propagation (if the final byte in the
- string is 0x01) means we cannot use has_nul1/2 directly. The
- easiest way to get the correct byte is to byte-swap the data
- and calculate the syndrome a second time. */
- csel data1, data1, data2, cc
- rev data1, data1
- sub tmp1, data1, zeroones
- orr tmp2, data1, REP8_7f
- bic has_nul1, tmp1, tmp2
-#else
+ /* Find the exact offset of the first NUL byte at offset 16..31 from
+ the string start. Enter with C = has_nul1 == 0. */
csel has_nul1, has_nul1, has_nul2, cc
-#endif
- sub len, src, srcin
+ mov len, 24
rev has_nul1, has_nul1
- add tmp2, len, 8
+ mov tmp3, 16
clz tmp1, has_nul1
- csel len, len, tmp2, cc
+ csel len, tmp3, len, cc
add len, len, tmp1, lsr 3
ret
-L(nonascii_loop):
- ldp data1, data2, [src, 16]!
- sub tmp1, data1, zeroones
- orr tmp2, data1, REP8_7f
- sub tmp3, data2, zeroones
- orr tmp4, data2, REP8_7f
- bics has_nul1, tmp1, tmp2
- bic has_nul2, tmp3, tmp4
- ccmp has_nul2, 0, 0, eq
- bne L(tail)
- ldp data1, data2, [src, 16]!
- sub tmp1, data1, zeroones
- orr tmp2, data1, REP8_7f
- sub tmp3, data2, zeroones
- orr tmp4, data2, REP8_7f
- bics has_nul1, tmp1, tmp2
- bic has_nul2, tmp3, tmp4
- ccmp has_nul2, 0, 0, eq
- beq L(nonascii_loop)
- b L(tail)
+L(loop_entry):
+ bic src, srcin, 31
- /* Load 16 bytes from [srcin & ~15] and force the bytes that precede
- srcin to 0x7f, so we ignore any NUL bytes before the string.
- Then continue in the aligned loop. */
-L(page_cross):
- bic src, srcin, 15
- ldp data1, data2, [src]
- lsl tmp1, srcin, 3
- mov tmp4, -1
+ .p2align 5
+L(loop):
+ ldp dataq1, dataq2, [src, 32]!
+ uminp maskv.16b, datav1.16b, datav2.16b
+ uminp maskv.16b, maskv.16b, maskv.16b
+ cmeq maskv.8b, maskv.8b, 0
+ fmov synd, maskd
+ cbz synd, L(loop)
+
+ /* Low 32 bits of synd are non-zero if a NUL was found in datav1. */
+ cmeq maskv.16b, datav1.16b, 0
+ sub len, src, srcin
+ tst synd, 0xffffffff
+ b.ne 1f
+ cmeq maskv.16b, datav2.16b, 0
+ add len, len, 16
+1:
+ /* Generate a bitmask and compute correct byte offset. */
#ifdef __AARCH64EB__
- /* Big-endian. Early bytes are at MSB. */
- lsr tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */
+ bic maskv.8h, 0xf0
#else
- /* Little-endian. Early bytes are at LSB. */
- lsl tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */
+ bic maskv.8h, 0x0f, lsl 8
#endif
- orr tmp1, tmp1, REP8_80
- orn data1, data1, tmp1
- orn tmp2, data2, tmp1
- tst srcin, 8
- csel data1, data1, tmp4, eq
- csel data2, data2, tmp2, eq
- b L(page_cross_entry)
-
-END(strlen_default)
+ umaxp maskv.16b, maskv.16b, maskv.16b
+ fmov synd, maskd
+#ifndef __AARCH64EB__
+ rbit synd, synd
+#endif
+ clz tmp, synd
+ add len, len, tmp, lsr 2
+ ret
+
+ .p2align 4
+
+L(page_cross):
+ bic src, srcin, 31
+ mov tmpw, 0x0c03
+ movk tmpw, 0xc030, lsl 16
+ ld1 {datav1.16b, datav2.16b}, [src]
+ dup maskv.4s, tmpw
+ cmeq datav1.16b, datav1.16b, 0
+ cmeq datav2.16b, datav2.16b, 0
+ and datav1.16b, datav1.16b, maskv.16b
+ and datav2.16b, datav2.16b, maskv.16b
+ addp maskv.16b, datav1.16b, datav2.16b
+ addp maskv.16b, maskv.16b, maskv.16b
+ fmov synd, maskd
+ lsl shift, srcin, 1
+ lsr synd, synd, shift
+ cbz synd, L(loop)
+
+ rbit synd, synd
+ clz len, synd
+ lsr len, len, 1
+ ret
+
+END (strlen_default)
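
[Editor's note: the page-cross guard at the entry point checks whether a
32-byte unaligned load starting at srcin could cross a minimum-size
page. The and/cmp/b.hi sequence corresponds to this C condition (sketch
under the same MIN_PAGE_SIZE assumption):]

#include <stdint.h>

#define MIN_PAGE_SIZE 4096

/* Non-zero iff a 32-byte load starting at s may cross a page boundary. */
static inline int load32_crosses_page(const char *s) {
    return ((uintptr_t)s & (MIN_PAGE_SIZE - 1)) > (MIN_PAGE_SIZE - 32);
}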
diff --git a/libc/arch-arm64/default/bionic/strncmp.S b/libc/arch-arm64/default/bionic/strncmp.S
index 5432b738e..68c6e6526 100644
--- a/libc/arch-arm64/default/bionic/strncmp.S
+++ b/libc/arch-arm64/default/bionic/strncmp.S
@@ -1,29 +1,9 @@
-/* Copyright (c) 2014, Linaro Limited
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- * Neither the name of the Linaro nor the
- names of its contributors may be used to endorse or promote products
- derived from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
+/*
+ * strncmp - compare two strings
+ *
+ * Copyright (c) 2013-2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
/* Assumptions:
*
@@ -60,19 +40,17 @@
#define endloop x15
#define count mask
- .text
- .p2align 6
- .rep 7
- nop /* Pad so that the loop below fits a cache line. */
- .endr
-ENTRY(strncmp_default)
- cbz limit, .Lret0
+ENTRY (strncmp_default)
+ PTR_ARG (0)
+ PTR_ARG (1)
+ SIZE_ARG (2)
+ cbz limit, L(ret0)
eor tmp1, src1, src2
mov zeroones, #REP8_01
tst tmp1, #7
and count, src1, #7
- b.ne .Lmisaligned8
- cbnz count, .Lmutual_align
+ b.ne L(misaligned8)
+ cbnz count, L(mutual_align)
/* Calculate the number of full and partial words -1. */
sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */
@@ -80,11 +58,11 @@ ENTRY(strncmp_default)
/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
(=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
can be done in parallel across the entire word. */
- /* Start of performance-critical section -- one 64B cache line. */
-.Lloop_aligned:
+ .p2align 4
+L(loop_aligned):
ldr data1, [src1], #8
ldr data2, [src2], #8
-.Lstart_realigned:
+L(start_realigned):
subs limit_wd, limit_wd, #1
sub tmp1, data1, zeroones
orr tmp2, data1, #REP8_7f
@@ -92,15 +70,15 @@ ENTRY(strncmp_default)
csinv endloop, diff, xzr, pl /* Last Dword or differences. */
bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
ccmp endloop, #0, #0, eq
- b.eq .Lloop_aligned
- /* End of performance-critical section -- one 64B cache line. */
+ b.eq L(loop_aligned)
+ /* End of main loop */
/* Not reached the limit, must have found the end or a diff. */
- tbz limit_wd, #63, .Lnot_limit
+ tbz limit_wd, #63, L(not_limit)
/* Limit % 8 == 0 => all bytes significant. */
ands limit, limit, #7
- b.eq .Lnot_limit
+ b.eq L(not_limit)
lsl limit, limit, #3 /* Bits -> bytes. */
mov mask, #~0
@@ -115,7 +93,7 @@ ENTRY(strncmp_default)
/* Make sure that the NUL byte is marked in the syndrome. */
orr has_nul, has_nul, mask
-.Lnot_limit:
+L(not_limit):
orr syndrome, diff, has_nul
#ifndef __AARCH64EB__
@@ -168,7 +146,7 @@ ENTRY(strncmp_default)
ret
#endif
-.Lmutual_align:
+L(mutual_align):
/* Sources are mutually aligned, but are not currently at an
alignment boundary. Round down the addresses and then mask off
the bytes that precede the start point.
@@ -196,56 +174,56 @@ ENTRY(strncmp_default)
orr data1, data1, tmp2
orr data2, data2, tmp2
add limit_wd, limit_wd, tmp3, lsr #3
- b .Lstart_realigned
+ b L(start_realigned)
- .p2align 6
+ .p2align 4
/* Don't bother with dwords for up to 16 bytes. */
-.Lmisaligned8:
+L(misaligned8):
cmp limit, #16
- b.hs .Ltry_misaligned_words
+ b.hs L(try_misaligned_words)
-.Lbyte_loop:
+L(byte_loop):
/* Perhaps we can do better than this. */
ldrb data1w, [src1], #1
ldrb data2w, [src2], #1
subs limit, limit, #1
ccmp data1w, #1, #0, hi /* NZCV = 0b0000. */
ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
- b.eq .Lbyte_loop
-.Ldone:
+ b.eq L(byte_loop)
+L(done):
sub result, data1, data2
ret
/* Align the SRC1 to a dword by doing a bytewise compare and then do
the dword loop. */
-.Ltry_misaligned_words:
+L(try_misaligned_words):
lsr limit_wd, limit, #3
- cbz count, .Ldo_misaligned
+ cbz count, L(do_misaligned)
neg count, count
and count, count, #7
sub limit, limit, count
lsr limit_wd, limit, #3
-.Lpage_end_loop:
+L(page_end_loop):
ldrb data1w, [src1], #1
ldrb data2w, [src2], #1
cmp data1w, #1
ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
- b.ne .Ldone
+ b.ne L(done)
subs count, count, #1
- b.hi .Lpage_end_loop
+ b.hi L(page_end_loop)
-.Ldo_misaligned:
+L(do_misaligned):
/* Prepare ourselves for the next page crossing. Unlike the aligned
loop, we fetch 1 less dword because we risk crossing bounds on
SRC2. */
mov count, #8
subs limit_wd, limit_wd, #1
- b.lo .Ldone_loop
-.Lloop_misaligned:
+ b.lo L(done_loop)
+L(loop_misaligned):
and tmp2, src2, #0xff8
eor tmp2, tmp2, #0xff8
- cbz tmp2, .Lpage_end_loop
+ cbz tmp2, L(page_end_loop)
ldr data1, [src1], #8
ldr data2, [src2], #8
@@ -254,14 +232,14 @@ ENTRY(strncmp_default)
eor diff, data1, data2 /* Non-zero if differences found. */
bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
ccmp diff, #0, #0, eq
- b.ne .Lnot_limit
+ b.ne L(not_limit)
subs limit_wd, limit_wd, #1
- b.pl .Lloop_misaligned
+ b.pl L(loop_misaligned)
-.Ldone_loop:
+L(done_loop):
/* We found a difference or a NULL before the limit was reached. */
and limit, limit, #7
- cbz limit, .Lnot_limit
+ cbz limit, L(not_limit)
/* Read the last word. */
sub src1, src1, 8
sub src2, src2, 8
@@ -272,9 +250,11 @@ ENTRY(strncmp_default)
eor diff, data1, data2 /* Non-zero if differences found. */
bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
ccmp diff, #0, #0, eq
- b.ne .Lnot_limit
+ b.ne L(not_limit)
-.Lret0:
+L(ret0):
mov result, #0
ret
-END(strncmp_default)
+
+END (strncmp_default)
+
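
[Editor's note: the dword loops, mutual-alignment fixup and page-end
handling above are optimizations over a simple byte loop; they must
preserve exactly this contract, stated as a C reference model with a
hypothetical name:]

#include <stddef.h>

/* Compare at most n bytes; stop at the first difference or NUL and
   return the difference of the mismatching bytes as unsigned chars. */
int strncmp_sketch(const char *s1, const char *s2, size_t n) {
    for (; n; n--, s1++, s2++) {
        unsigned char c1 = *s1, c2 = *s2;
        if (c1 != c2)
            return (int)c1 - (int)c2;
        if (c1 == '\0')
            break;
    }
    return 0;
}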
diff --git a/libc/arch-arm64/default/bionic/strnlen.S b/libc/arch-arm64/default/bionic/strnlen.S
index 169453207..445661d48 100644
--- a/libc/arch-arm64/default/bionic/strnlen.S
+++ b/libc/arch-arm64/default/bionic/strnlen.S
@@ -1,174 +1,112 @@
-/* Copyright (c) 2014, Linaro Limited
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- * Neither the name of the Linaro nor the
- names of its contributors may be used to endorse or promote products
- derived from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
+/*
+ * strnlen - calculate the length of a string with limit.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
/* Assumptions:
*
- * ARMv8-a, AArch64
+ * ARMv8-a, AArch64, Advanced SIMD.
+ * MTE compatible.
*/
#include <private/bionic_asm.h>
-/* Arguments and results. */
#define srcin x0
-#define len x0
-#define limit x1
+#define cntin x1
+#define result x0
-/* Locals and temporaries. */
#define src x2
-#define data1 x3
-#define data2 x4
-#define data2a x5
-#define has_nul1 x6
-#define has_nul2 x7
-#define tmp1 x8
-#define tmp2 x9
-#define tmp3 x10
-#define tmp4 x11
-#define zeroones x12
-#define pos x13
-#define limit_wd x14
-
-#define REP8_01 0x0101010101010101
-#define REP8_7f 0x7f7f7f7f7f7f7f7f
-#define REP8_80 0x8080808080808080
-
- .text
- .p2align 6
-.Lstart:
- /* Pre-pad to ensure critical loop begins an icache line. */
- .rep 7
- nop
- .endr
- /* Put this code here to avoid wasting more space with pre-padding. */
-.Lhit_limit:
- mov len, limit
+#define synd x3
+#define shift x4
+#define wtmp w4
+#define tmp x4
+#define cntrem x5
+
+#define qdata q0
+#define vdata v0
+#define vhas_chr v1
+#define vrepmask v2
+#define vend v3
+#define dend d3
+
+/*
+ Core algorithm:
+
+ For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
+ per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
+ requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
+ set likewise for odd bytes so that adjacent bytes can be merged. Since the
+ bits in the syndrome reflect the order in which things occur in the original
+ string, counting trailing zeros identifies exactly which byte matched. */
+
+ENTRY (strnlen_default)
+ PTR_ARG (0)
+ SIZE_ARG (1)
+ bic src, srcin, 15
+ mov wtmp, 0xf00f
+ cbz cntin, L(nomatch)
+ ld1 {vdata.16b}, [src], 16
+ dup vrepmask.8h, wtmp
+ cmeq vhas_chr.16b, vdata.16b, 0
+ lsl shift, srcin, 2
+ and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
+ addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ fmov synd, dend
+ lsr synd, synd, shift
+ cbz synd, L(start_loop)
+L(finish):
+ rbit synd, synd
+ clz synd, synd
+ lsr result, synd, 2
+ cmp cntin, result
+ csel result, cntin, result, ls
ret
-ENTRY(strnlen_default)
- cbz limit, .Lhit_limit
- mov zeroones, #REP8_01
- bic src, srcin, #15
- ands tmp1, srcin, #15
- b.ne .Lmisaligned
- /* Calculate the number of full and partial words -1. */
- sub limit_wd, limit, #1 /* Limit != 0, so no underflow. */
- lsr limit_wd, limit_wd, #4 /* Convert to Qwords. */
-
- /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
- (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
- can be done in parallel across the entire word. */
- /* The inner loop deals with two Dwords at a time. This has a
- slightly higher start-up cost, but we should win quite quickly,
- especially on cores with a high number of issue slots per
- cycle, as we get much better parallelism out of the operations. */
-
- /* Start of critial section -- keep to one 64Byte cache line. */
-.Lloop:
- ldp data1, data2, [src], #16
-.Lrealigned:
- sub tmp1, data1, zeroones
- orr tmp2, data1, #REP8_7f
- sub tmp3, data2, zeroones
- orr tmp4, data2, #REP8_7f
- bic has_nul1, tmp1, tmp2
- bic has_nul2, tmp3, tmp4
- subs limit_wd, limit_wd, #1
- orr tmp1, has_nul1, has_nul2
- ccmp tmp1, #0, #0, pl /* NZCV = 0000 */
- b.eq .Lloop
- /* End of critical section -- keep to one 64Byte cache line. */
-
- orr tmp1, has_nul1, has_nul2
- cbz tmp1, .Lhit_limit /* No null in final Qword. */
-
- /* We know there's a null in the final Qword. The easiest thing
- to do now is work out the length of the string and return
- MIN (len, limit). */
-
- sub len, src, srcin
- cbz has_nul1, .Lnul_in_data2
-#ifdef __AARCH64EB__
- mov data2, data1
+L(start_loop):
+ sub tmp, src, srcin
+ subs cntrem, cntin, tmp
+ b.ls L(nomatch)
+
+ /* Make sure that it won't overread by a 16-byte chunk */
+ add tmp, cntrem, 15
+ tbnz tmp, 4, L(loop32_2)
+
+ .p2align 5
+L(loop32):
+ ldr qdata, [src], 16
+ cmeq vhas_chr.16b, vdata.16b, 0
+ umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ fmov synd, dend
+ cbnz synd, L(end)
+L(loop32_2):
+ ldr qdata, [src], 16
+ subs cntrem, cntrem, 32
+ cmeq vhas_chr.16b, vdata.16b, 0
+ b.ls L(end)
+ umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ fmov synd, dend
+ cbz synd, L(loop32)
+
+L(end):
+ and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
+ addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ sub src, src, 16
+ mov synd, vend.d[0]
+ sub result, src, srcin
+#ifndef __AARCH64EB__
+ rbit synd, synd
#endif
- sub len, len, #8
- mov has_nul2, has_nul1
-.Lnul_in_data2:
-#ifdef __AARCH64EB__
- /* For big-endian, carry propagation (if the final byte in the
- string is 0x01) means we cannot use has_nul directly. The
- easiest way to get the correct byte is to byte-swap the data
- and calculate the syndrome a second time. */
- rev data2, data2
- sub tmp1, data2, zeroones
- orr tmp2, data2, #REP8_7f
- bic has_nul2, tmp1, tmp2
-#endif
- sub len, len, #8
- rev has_nul2, has_nul2
- clz pos, has_nul2
- add len, len, pos, lsr #3 /* Bits to bytes. */
- cmp len, limit
- csel len, len, limit, ls /* Return the lower value. */
+ clz synd, synd
+ add result, result, synd, lsr 2
+ cmp cntin, result
+ csel result, cntin, result, ls
ret
-.Lmisaligned:
- /* Deal with a partial first word.
- We're doing two things in parallel here;
- 1) Calculate the number of words (but avoiding overflow if
- limit is near ULONG_MAX) - to do this we need to work out
- limit + tmp1 - 1 as a 65-bit value before shifting it;
- 2) Load and mask the initial data words - we force the bytes
- before the ones we are interested in to 0xff - this ensures
- early bytes will not hit any zero detection. */
- sub limit_wd, limit, #1
- neg tmp4, tmp1
- cmp tmp1, #8
-
- and tmp3, limit_wd, #15
- lsr limit_wd, limit_wd, #4
- mov tmp2, #~0
-
- ldp data1, data2, [src], #16
- lsl tmp4, tmp4, #3 /* Bytes beyond alignment -> bits. */
- add tmp3, tmp3, tmp1
-
-#ifdef __AARCH64EB__
- /* Big-endian. Early bytes are at MSB. */
- lsl tmp2, tmp2, tmp4 /* Shift (tmp1 & 63). */
-#else
- /* Little-endian. Early bytes are at LSB. */
- lsr tmp2, tmp2, tmp4 /* Shift (tmp1 & 63). */
-#endif
- add limit_wd, limit_wd, tmp3, lsr #4
+L(nomatch):
+ mov result, cntin
+ ret
- orr data1, data1, tmp2
- orr data2a, data2, tmp2
+END (strnlen_default)
- csinv data1, data1, xzr, le
- csel data2, data2, data2a, le
- b .Lrealigned
-END(strnlen_default)
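
[Editor's note: whichever path the new strnlen exits through, the
cmp cntin / csel pair clamps the computed length to the limit, so the
observable behavior is min(scan length, maxlen). Reference model in C
(hypothetical name):]

#include <stddef.h>

size_t strnlen_sketch(const char *s, size_t maxlen) {
    size_t i = 0;
    while (i < maxlen && s[i] != '\0')
        i++;
    return i;            /* never exceeds maxlen */
}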
diff --git a/libc/arch-arm64/generic/bionic/memcmp.S b/libc/arch-arm64/generic/bionic/memcmp.S
index bff54aea3..fae0d4226 100644
--- a/libc/arch-arm64/generic/bionic/memcmp.S
+++ b/libc/arch-arm64/generic/bionic/memcmp.S
@@ -1,29 +1,7 @@
-/*
- * Copyright (c) 2017 ARM Ltd
- * All rights reserved.
+/* memcmp - compare memory
*
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. The name of the company may not be used to endorse or promote
- * products derived from this software without specific prior written
- * permission.
- *
- * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
- * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
- * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * Copyright (c) 2013-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
*/
/* Assumptions:
@@ -33,8 +11,6 @@
#include <private/bionic_asm.h>
-#define L(l) .L ## l
-
/* Parameters and result. */
#define src1 x0
#define src2 x1
@@ -51,20 +27,13 @@
#define tmp1 x7
#define tmp2 x8
-/* Small inputs of less than 8 bytes are handled separately. This allows the
- main code to be speed up using unaligned loads since there are now at least
- 8 bytes to be compared. If the first 8 bytes are equal, align src1.
- This ensures each iteration does at most one unaligned access even if both
- src1 and src2 are unaligned, and mutually aligned inputs behave as if
- aligned. After the main loop, process the last 16 bytes using unaligned
- accesses. */
-
-ENTRY(memcmp)
-.p2align 6
+ENTRY (memcmp)
+ PTR_ARG (0)
+ PTR_ARG (1)
+ SIZE_ARG (2)
subs limit, limit, 8
b.lo L(less8)
- /* Limit >= 8, so check first 8 bytes using unaligned loads. */
ldr data1, [src1], 8
ldr data2, [src2], 8
cmp data1, data2
@@ -164,4 +133,5 @@ L(byte_loop):
sub result, data1w, data2w
ret
-END(memcmp)
+END (memcmp)
+
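
[Editor's note: once the word loop finds data1 != data2, the result is
derived from the first differing byte. On little-endian that can be
modelled in C as below -- XOR isolates the difference, a trailing-zero
count rounded down to a byte boundary gives the offset of the first
mismatching byte. Hypothetical helper, not the routine's exact tail:]

#include <stdint.h>

static inline int cmp_words_le(uint64_t data1, uint64_t data2) {
    uint64_t diff = data1 ^ data2;
    if (diff == 0)
        return 0;
    unsigned shift = __builtin_ctzll(diff) & ~7u;  /* bit offset of byte */
    unsigned char b1 = (unsigned char)(data1 >> shift);
    unsigned char b2 = (unsigned char)(data2 >> shift);
    return (int)b1 - (int)b2;
}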
diff --git a/libc/arch-arm64/generic/bionic/memcpy.S b/libc/arch-arm64/generic/bionic/memcpy.S
index baadb9204..fc487d3a4 100644
--- a/libc/arch-arm64/generic/bionic/memcpy.S
+++ b/libc/arch-arm64/generic/bionic/memcpy.S
@@ -30,6 +30,6 @@
#include <private/bionic_asm.h>
-ENTRY(__memcpy)
+ENTRY(memcpy)
#include "memcpy_base.S"
-END(__memcpy)
+END(memcpy)
diff --git a/libc/arch-arm64/generic/bionic/memcpy_base.S b/libc/arch-arm64/generic/bionic/memcpy_base.S
index f85062408..f77e4dd35 100644
--- a/libc/arch-arm64/generic/bionic/memcpy_base.S
+++ b/libc/arch-arm64/generic/bionic/memcpy_base.S
@@ -1,60 +1,13 @@
-/* Copyright (c) 2012-2013, Linaro Limited
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- * Neither the name of the Linaro nor the
- names of its contributors may be used to endorse or promote products
- derived from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
-
/*
- * Copyright (c) 2015 ARM Ltd
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. The name of the company may not be used to endorse or promote
- * products derived from this software without specific prior written
- * permission.
+ * memcpy - copy memory area
*
- * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
- * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
- * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * Copyright (c) 2019-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
*/
/* Assumptions:
*
- * ARMv8-a, AArch64, unaligned accesses.
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
*
*/
@@ -69,149 +22,181 @@
#define A_l x6
#define A_lw w6
#define A_h x7
-#define A_hw w7
#define B_l x8
-#define B_lw w8
+#define B_lw w8
#define B_h x9
-#define C_l x10
-#define C_h x11
-#define D_l x12
-#define D_h x13
-#define E_l src
-#define E_h count
-#define F_l srcend
-#define F_h dst
-#define tmp1 x9
-
-#define L(l) .L ## l
-
-/* Copies are split into 3 main cases: small copies of up to 16 bytes,
- medium copies of 17..96 bytes which are fully unrolled. Large copies
- of more than 96 bytes align the destination and use an unrolled loop
- processing 64 bytes per iteration.
- Small and medium copies read all data before writing, allowing any
- kind of overlap, and memmove tailcalls memcpy for these cases as
- well as non-overlapping copies.
+#define C_lw w10
+#define tmp1 x14
+
+#define A_q q0
+#define B_q q1
+#define C_q q2
+#define D_q q3
+#define E_q q4
+#define F_q q5
+#define G_q q6
+#define H_q q7
+
+/* This implementation handles overlaps and supports both memcpy and memmove
+ from a single entry point. It uses unaligned accesses and branchless
+ sequences to keep the code small, simple and improve performance.
+
+ Copies are split into 3 main cases: small copies of up to 32 bytes, medium
+ copies of up to 128 bytes, and large copies. The overhead of the overlap
+ check is negligible since it is only required for large copies.
+
+ Large copies use a software pipelined loop processing 64 bytes per iteration.
+ The source pointer is 16-byte aligned to minimize unaligned accesses.
+ The loop tail is handled by always copying 64 bytes from the end.
*/
- prfm PLDL1KEEP, [src]
+ PTR_ARG (0)
+ PTR_ARG (1)
+ SIZE_ARG (2)
add srcend, src, count
add dstend, dstin, count
- cmp count, 16
- b.ls L(copy16)
- cmp count, 96
+ cmp count, 128
b.hi L(copy_long)
-
- /* Medium copies: 17..96 bytes. */
- sub tmp1, count, 1
- ldp A_l, A_h, [src]
- tbnz tmp1, 6, L(copy96)
- ldp D_l, D_h, [srcend, -16]
- tbz tmp1, 5, 1f
- ldp B_l, B_h, [src, 16]
- ldp C_l, C_h, [srcend, -32]
- stp B_l, B_h, [dstin, 16]
- stp C_l, C_h, [dstend, -32]
-1:
- stp A_l, A_h, [dstin]
- stp D_l, D_h, [dstend, -16]
+ cmp count, 32
+ b.hi L(copy32_128)
+
+ /* Small copies: 0..32 bytes. */
+ cmp count, 16
+ b.lo L(copy16)
+ ldr A_q, [src]
+ ldr B_q, [srcend, -16]
+ str A_q, [dstin]
+ str B_q, [dstend, -16]
ret
- .p2align 4
-
- /* Small copies: 0..16 bytes. */
+ /* Copy 8-15 bytes. */
L(copy16):
- cmp count, 8
- b.lo 1f
+ tbz count, 3, L(copy8)
ldr A_l, [src]
ldr A_h, [srcend, -8]
str A_l, [dstin]
str A_h, [dstend, -8]
ret
- .p2align 4
-1:
- tbz count, 2, 1f
+
+ .p2align 3
+ /* Copy 4-7 bytes. */
+L(copy8):
+ tbz count, 2, L(copy4)
ldr A_lw, [src]
- ldr A_hw, [srcend, -4]
+ ldr B_lw, [srcend, -4]
str A_lw, [dstin]
- str A_hw, [dstend, -4]
+ str B_lw, [dstend, -4]
ret
- /* Copy 0..3 bytes. Use a branchless sequence that copies the same
- byte 3 times if count==1, or the 2nd byte twice if count==2. */
-1:
- cbz count, 2f
+ /* Copy 0..3 bytes using a branchless sequence. */
+L(copy4):
+ cbz count, L(copy0)
lsr tmp1, count, 1
ldrb A_lw, [src]
- ldrb A_hw, [srcend, -1]
+ ldrb C_lw, [srcend, -1]
ldrb B_lw, [src, tmp1]
strb A_lw, [dstin]
strb B_lw, [dstin, tmp1]
- strb A_hw, [dstend, -1]
-2: ret
+ strb C_lw, [dstend, -1]
+L(copy0):
+ ret
.p2align 4
- /* Copy 64..96 bytes. Copy 64 bytes from the start and
- 32 bytes from the end. */
-L(copy96):
- ldp B_l, B_h, [src, 16]
- ldp C_l, C_h, [src, 32]
- ldp D_l, D_h, [src, 48]
- ldp E_l, E_h, [srcend, -32]
- ldp F_l, F_h, [srcend, -16]
- stp A_l, A_h, [dstin]
- stp B_l, B_h, [dstin, 16]
- stp C_l, C_h, [dstin, 32]
- stp D_l, D_h, [dstin, 48]
- stp E_l, E_h, [dstend, -32]
- stp F_l, F_h, [dstend, -16]
+ /* Medium copies: 33..128 bytes. */
+L(copy32_128):
+ ldp A_q, B_q, [src]
+ ldp C_q, D_q, [srcend, -32]
+ cmp count, 64
+ b.hi L(copy128)
+ stp A_q, B_q, [dstin]
+ stp C_q, D_q, [dstend, -32]
ret
- /* Align DST to 16 byte alignment so that we don't cross cache line
- boundaries on both loads and stores. There are at least 96 bytes
- to copy, so copy 16 bytes unaligned and then align. The loop
- copies 64 bytes per iteration and prefetches one iteration ahead. */
-
.p2align 4
+ /* Copy 65..128 bytes. */
+L(copy128):
+ ldp E_q, F_q, [src, 32]
+ cmp count, 96
+ b.ls L(copy96)
+ ldp G_q, H_q, [srcend, -64]
+ stp G_q, H_q, [dstend, -64]
+L(copy96):
+ stp A_q, B_q, [dstin]
+ stp E_q, F_q, [dstin, 32]
+ stp C_q, D_q, [dstend, -32]
+ ret
+
+ /* Copy more than 128 bytes. */
L(copy_long):
- and tmp1, dstin, 15
- bic dst, dstin, 15
- ldp D_l, D_h, [src]
- sub src, src, tmp1
+ /* Use backwards copy if there is an overlap. */
+ sub tmp1, dstin, src
+ cmp tmp1, count
+ b.lo L(copy_long_backwards)
+
+ /* Copy 16 bytes and then align src to 16-byte alignment. */
+ ldr D_q, [src]
+ and tmp1, src, 15
+ bic src, src, 15
+ sub dst, dstin, tmp1
add count, count, tmp1 /* Count is now 16 too large. */
- ldp A_l, A_h, [src, 16]
- stp D_l, D_h, [dstin]
- ldp B_l, B_h, [src, 32]
- ldp C_l, C_h, [src, 48]
- ldp D_l, D_h, [src, 64]!
+ ldp A_q, B_q, [src, 16]
+ str D_q, [dstin]
+ ldp C_q, D_q, [src, 48]
subs count, count, 128 + 16 /* Test and readjust count. */
- b.ls 2f
-1:
- stp A_l, A_h, [dst, 16]
- ldp A_l, A_h, [src, 16]
- stp B_l, B_h, [dst, 32]
- ldp B_l, B_h, [src, 32]
- stp C_l, C_h, [dst, 48]
- ldp C_l, C_h, [src, 48]
- stp D_l, D_h, [dst, 64]!
- ldp D_l, D_h, [src, 64]!
+ b.ls L(copy64_from_end)
+L(loop64):
+ stp A_q, B_q, [dst, 16]
+ ldp A_q, B_q, [src, 80]
+ stp C_q, D_q, [dst, 48]
+ ldp C_q, D_q, [src, 112]
+ add src, src, 64
+ add dst, dst, 64
subs count, count, 64
- b.hi 1b
-
- /* Write the last full set of 64 bytes. The remainder is at most 64
- bytes, so it is safe to always copy 64 bytes from the end even if
- there is just 1 byte left. */
-2:
- ldp E_l, E_h, [srcend, -64]
- stp A_l, A_h, [dst, 16]
- ldp A_l, A_h, [srcend, -48]
- stp B_l, B_h, [dst, 32]
- ldp B_l, B_h, [srcend, -32]
- stp C_l, C_h, [dst, 48]
- ldp C_l, C_h, [srcend, -16]
- stp D_l, D_h, [dst, 64]
- stp E_l, E_h, [dstend, -64]
- stp A_l, A_h, [dstend, -48]
- stp B_l, B_h, [dstend, -32]
- stp C_l, C_h, [dstend, -16]
+ b.hi L(loop64)
+
+ /* Write the last iteration and copy 64 bytes from the end. */
+L(copy64_from_end):
+ ldp E_q, F_q, [srcend, -64]
+ stp A_q, B_q, [dst, 16]
+ ldp A_q, B_q, [srcend, -32]
+ stp C_q, D_q, [dst, 48]
+ stp E_q, F_q, [dstend, -64]
+ stp A_q, B_q, [dstend, -32]
ret
+
+ /* Large backwards copy for overlapping copies.
+ Copy 16 bytes and then align srcend to 16-byte alignment. */
+L(copy_long_backwards):
+ cbz tmp1, L(copy0)
+ ldr D_q, [srcend, -16]
+ and tmp1, srcend, 15
+ bic srcend, srcend, 15
+ sub count, count, tmp1
+ ldp A_q, B_q, [srcend, -32]
+ str D_q, [dstend, -16]
+ ldp C_q, D_q, [srcend, -64]
+ sub dstend, dstend, tmp1
+ subs count, count, 128
+ b.ls L(copy64_from_start)
+
+L(loop64_backwards):
+ str B_q, [dstend, -16]
+ str A_q, [dstend, -32]
+ ldp A_q, B_q, [srcend, -96]
+ str D_q, [dstend, -48]
+ str C_q, [dstend, -64]!
+ ldp C_q, D_q, [srcend, -128]
+ sub srcend, srcend, 64
+ subs count, count, 64
+ b.hi L(loop64_backwards)
+
+ /* Write the last iteration and copy 64 bytes from the start. */
+L(copy64_from_start):
+ ldp E_q, F_q, [src, 32]
+ stp A_q, B_q, [dstend, -32]
+ ldp A_q, B_q, [src]
+ stp C_q, D_q, [dstend, -64]
+ stp E_q, F_q, [dstin, 32]
+ stp A_q, B_q, [dstin]
+ ret
+
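
[Editor's note: the new memcpy_base serves memcpy and memmove from one
entry point; for small and medium sizes that works because every load is
issued before any store, reading from both ends of the buffer. A C
sketch of the 16..32-byte case (hypothetical helper, mirrors the
ldr A_q / ldr B_q / str A_q / str B_q pattern):]

#include <stddef.h>
#include <string.h>

/* Overlap-safe copy for 16 <= count <= 32: read both 16-byte halves
   before writing either. */
static void copy16_32(char *dst, const char *src, size_t count) {
    char a[16], b[16];
    memcpy(a, src, 16);
    memcpy(b, src + count - 16, 16);
    memcpy(dst, a, 16);
    memcpy(dst + count - 16, b, 16);
}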
diff --git a/libc/arch-arm64/generic/bionic/memmove.S b/libc/arch-arm64/generic/bionic/memmove.S
index 335b7d6ce..ad4f31f25 100644
--- a/libc/arch-arm64/generic/bionic/memmove.S
+++ b/libc/arch-arm64/generic/bionic/memmove.S
@@ -1,155 +1,33 @@
-/* Copyright (c) 2013, Linaro Limited
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- * Neither the name of the Linaro nor the
- names of its contributors may be used to endorse or promote products
- derived from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
-
/*
- * Copyright (c) 2015 ARM Ltd
+ * Copyright (C) 2008 The Android Open Source Project
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
- * 1. Redistributions of source code must retain the above copyright
+ * * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. The name of the company may not be used to endorse or promote
- * products derived from this software without specific prior written
- * permission.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
*
- * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
- * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
- * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-/* Assumptions:
- *
- * ARMv8-a, AArch64, unaligned accesses, wchar_t is 4 bytes
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
*/
#include <private/bionic_asm.h>
-/* Parameters and result. */
-#define dstin x0
-#define src x1
-#define count x2
-#define srcend x3
-#define dstend x4
-#define tmp1 x5
-#define A_l x6
-#define A_h x7
-#define B_l x8
-#define B_h x9
-#define C_l x10
-#define C_h x11
-#define D_l x12
-#define D_h x13
-#define E_l count
-#define E_h tmp1
-
-/* All memmoves up to 96 bytes are done by memcpy as it supports overlaps.
- Larger backwards copies are also handled by memcpy. The only remaining
- case is forward large copies. The destination is aligned, and an
- unrolled loop processes 64 bytes per iteration.
-*/
-
-#if defined(WMEMMOVE)
-ENTRY(wmemmove)
- lsl count, count, #2
-#else
ENTRY(memmove)
-#endif
- sub tmp1, dstin, src
- cmp count, 96
- ccmp tmp1, count, 2, hi
- b.hs __memcpy
-
- cbz tmp1, 3f
- add dstend, dstin, count
- add srcend, src, count
-
- /* Align dstend to 16 byte alignment so that we don't cross cache line
- boundaries on both loads and stores. There are at least 96 bytes
- to copy, so copy 16 bytes unaligned and then align. The loop
- copies 64 bytes per iteration and prefetches one iteration ahead. */
-
- and tmp1, dstend, 15
- ldp D_l, D_h, [srcend, -16]
- sub srcend, srcend, tmp1
- sub count, count, tmp1
- ldp A_l, A_h, [srcend, -16]
- stp D_l, D_h, [dstend, -16]
- ldp B_l, B_h, [srcend, -32]
- ldp C_l, C_h, [srcend, -48]
- ldp D_l, D_h, [srcend, -64]!
- sub dstend, dstend, tmp1
- subs count, count, 128
- b.ls 2f
- nop
-1:
- stp A_l, A_h, [dstend, -16]
- ldp A_l, A_h, [srcend, -16]
- stp B_l, B_h, [dstend, -32]
- ldp B_l, B_h, [srcend, -32]
- stp C_l, C_h, [dstend, -48]
- ldp C_l, C_h, [srcend, -48]
- stp D_l, D_h, [dstend, -64]!
- ldp D_l, D_h, [srcend, -64]!
- subs count, count, 64
- b.hi 1b
-
- /* Write the last full set of 64 bytes. The remainder is at most 64
- bytes, so it is safe to always copy 64 bytes from the start even if
- there is just 1 byte left. */
-2:
- ldp E_l, E_h, [src, 48]
- stp A_l, A_h, [dstend, -16]
- ldp A_l, A_h, [src, 32]
- stp B_l, B_h, [dstend, -32]
- ldp B_l, B_h, [src, 16]
- stp C_l, C_h, [dstend, -48]
- ldp C_l, C_h, [src]
- stp D_l, D_h, [dstend, -64]
- stp E_l, E_h, [dstin, 48]
- stp A_l, A_h, [dstin, 32]
- stp B_l, B_h, [dstin, 16]
- stp C_l, C_h, [dstin]
-3: ret
-
-#if defined(WMEMMOVE)
-END(wmemmove)
-#else
+ #include "memcpy_base.S"
END(memmove)
-
-ALIAS_SYMBOL(memcpy, memmove)
-#endif
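With the old forward-only memmove body deleted, memmove is now just another entry point over memcpy_base.S, which performs the overlap check itself (see L(copy_long_backwards) above). A C sketch of that dispatch (illustrative names, assuming the same unsigned-difference trick as the assembly's tmp1 = dstin - src):

    #include <stddef.h>
    #include <stdint.h>

    static void *move_sketch(void *dstin, const void *src, size_t count)
    {
        unsigned char *d = dstin;
        const unsigned char *s = src;
        /* dstin - src, taken as an unsigned value, is < count exactly
           when dst lies inside [src, src + count): a forward copy would
           trample unread source bytes, so copy backwards instead. */
        if ((uintptr_t)d - (uintptr_t)s < count) {
            while (count--)
                d[count] = s[count];
        } else {
            for (size_t i = 0; i < count; i++)
                d[i] = s[i];
        }
        return dstin;
    }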
diff --git a/libc/arch-arm64/generic/bionic/memrchr.S b/libc/arch-arm64/generic/bionic/memrchr.S
new file mode 100644
index 000000000..9ab232775
--- /dev/null
+++ b/libc/arch-arm64/generic/bionic/memrchr.S
@@ -0,0 +1,117 @@
+/*
+ * memrchr - find last character in a memory zone.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD.
+ * MTE compatible.
+ */
+
+#include <private/bionic_asm.h>
+
+#define srcin x0
+#define chrin w1
+#define cntin x2
+#define result x0
+
+#define src x3
+#define cntrem x4
+#define synd x5
+#define shift x6
+#define tmp x7
+#define wtmp w7
+#define end x8
+#define endm1 x9
+
+#define vrepchr v0
+#define qdata q1
+#define vdata v1
+#define vhas_chr v2
+#define vrepmask v3
+#define vend v4
+#define dend d4
+
+/*
+ Core algorithm:
+
+ For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
+ per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
+ requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
+ set likewise for odd bytes so that adjacent bytes can be merged. Since the
+ bits in the syndrome reflect the order in which things occur in the original
+ string, counting trailing zeros identifies exactly which byte matched. */
+
+ENTRY (memrchr)
+ PTR_ARG (0)
+ add end, srcin, cntin
+ sub endm1, end, 1
+ bic src, endm1, 15
+ cbz cntin, L(nomatch)
+ ld1 {vdata.16b}, [src]
+ dup vrepchr.16b, chrin
+ mov wtmp, 0xf00f
+ dup vrepmask.8h, wtmp
+ cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
+ neg shift, end, lsl 2
+ and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
+ addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ fmov synd, dend
+ lsl synd, synd, shift
+ cbz synd, L(start_loop)
+
+ clz synd, synd
+ sub result, endm1, synd, lsr 2
+ cmp cntin, synd, lsr 2
+ csel result, result, xzr, hi
+ ret
+
+L(start_loop):
+ sub tmp, end, src
+ subs cntrem, cntin, tmp
+ b.ls L(nomatch)
+
+ /* Make sure that it won't overread by a 16-byte chunk */
+ add tmp, cntrem, 15
+ tbnz tmp, 4, L(loop32_2)
+
+ .p2align 4
+L(loop32):
+ ldr qdata, [src, -16]!
+ cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
+ umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ fmov synd, dend
+ cbnz synd, L(end)
+
+L(loop32_2):
+ ldr qdata, [src, -16]!
+ subs cntrem, cntrem, 32
+ cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
+ b.ls L(end)
+ umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ fmov synd, dend
+ cbz synd, L(loop32)
+L(end):
+ and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
+ addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ fmov synd, dend
+
+ add tmp, src, 15
+#ifdef __AARCH64EB__
+ rbit synd, synd
+#endif
+ clz synd, synd
+ sub tmp, tmp, synd, lsr 2
+ cmp tmp, srcin
+ csel result, tmp, xzr, hs
+ ret
+
+L(nomatch):
+ mov result, 0
+ ret
+
+END (memrchr)
+
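A note on the syndrome here: despite the boilerplate wording in the comment above ("or the byte is NUL"), the single cmeq only marks bytes equal to the requested character, and because memrchr scans backwards the routine counts leading rather than trailing zeros. A scalar C model of the 4-bits-per-byte syndrome (illustrative only; the real code builds it with cmeq/and/addp):

    #include <stdint.h>

    /* Nibble i of the syndrome is 0xf iff byte i of the chunk matches. */
    static uint64_t syndrome16(const unsigned char *chunk, unsigned char c)
    {
        uint64_t synd = 0;
        for (int i = 0; i < 16; i++)
            if (chunk[i] == c)
                synd |= 0xfull << (4 * i);
        return synd;
    }

    /* Index of the last match in the chunk, or -1: count leading zeros
       and divide by four, mirroring "clz synd" followed by
       "sub result, endm1, synd, lsr 2". */
    static int last_match16(const unsigned char *chunk, unsigned char c)
    {
        uint64_t synd = syndrome16(chunk, c);
        if (synd == 0)
            return -1;
        return 15 - (__builtin_clzll(synd) >> 2);
    }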
diff --git a/libc/arch-arm64/generic/bionic/memset.S b/libc/arch-arm64/generic/bionic/memset.S
index 12fc09db8..0f54f9d79 100644
--- a/libc/arch-arm64/generic/bionic/memset.S
+++ b/libc/arch-arm64/generic/bionic/memset.S
@@ -1,89 +1,25 @@
-/* Copyright (c) 2012-2013, Linaro Limited
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- * Neither the name of the Linaro nor the
- names of its contributors may be used to endorse or promote products
- derived from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
-
/*
- * Copyright (c) 2015 ARM Ltd
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. The name of the company may not be used to endorse or promote
- * products derived from this software without specific prior written
- * permission.
+ * memset - fill memory with a constant byte
*
- * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
- * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
- * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * Copyright (c) 2012-2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
*/
/* Assumptions:
*
- * ARMv8-a, AArch64, unaligned accesses
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
*
*/
#include <private/bionic_asm.h>
-/* By default we assume that the DC instruction can be used to zero
- data blocks more efficiently. In some circumstances this might be
- unsafe, for example in an asymmetric multiprocessor environment with
- different DC clear lengths (neither the upper nor lower lengths are
- safe to use).
-
- If code may be run in a virtualized environment, then define
- MAYBE_VIRT. This will cause the code to cache the system register
- values rather than re-reading them each call. */
-
-#define dstin x0
-#define val x1
-#define valw w1
-#define count x2
-#define dst x3
-#define dstend x4
-#define tmp1 x5
-#define tmp1w w5
-#define tmp2 x6
-#define tmp2w w6
-#define zva_len x7
-#define zva_lenw w7
-
-#define L(l) .L ## l
+#define dstin x0
+#define val x1
+#define valw w1
+#define count x2
+#define dst x3
+#define dstend x4
+#define zva_val x5
ENTRY(__memset_chk)
cmp count, dst
@@ -98,7 +34,9 @@ ENTRY(__memset_chk)
bl __memset_chk_fail
END(__memset_chk)
-ENTRY(memset)
+ENTRY (memset)
+ PTR_ARG (0)
+ SIZE_ARG (2)
dup v0.16B, valw
add dstend, dstin, count
@@ -114,7 +52,7 @@ ENTRY(memset)
str val, [dstin]
str val, [dstend, -8]
ret
- nop
+ .p2align 4
1: tbz count, 2, 2f
str valw, [dstin]
str valw, [dstend, -4]
@@ -144,108 +82,49 @@ L(set96):
stp q0, q0, [dstend, -32]
ret
- .p2align 3
- nop
+ .p2align 4
L(set_long):
and valw, valw, 255
bic dst, dstin, 15
str q0, [dstin]
- cmp count, 256
- ccmp valw, 0, 0, cs
- b.eq L(try_zva)
-L(no_zva):
- sub count, dstend, dst /* Count is 16 too large. */
- add dst, dst, 16
- sub count, count, 64 + 16 /* Adjust count and bias for loop. */
-1: stp q0, q0, [dst], 64
- stp q0, q0, [dst, -32]
-L(tail64):
- subs count, count, 64
- b.hi 1b
-2: stp q0, q0, [dstend, -64]
- stp q0, q0, [dstend, -32]
- ret
-
- .p2align 3
-L(try_zva):
- mrs tmp1, dczid_el0
- tbnz tmp1w, 4, L(no_zva)
- and tmp1w, tmp1w, 15
- cmp tmp1w, 4 /* ZVA size is 64 bytes. */
- b.ne L(zva_128)
-
- /* Write the first and last 64 byte aligned block using stp rather
- than using DC ZVA. This is faster on some cores.
- */
-L(zva_64):
+ cmp count, 160
+ ccmp valw, 0, 0, hs
+ b.ne L(no_zva)
+
+#ifndef SKIP_ZVA_CHECK
+ mrs zva_val, dczid_el0
+ and zva_val, zva_val, 31
+ cmp zva_val, 4 /* ZVA size is 64 bytes. */
+ b.ne L(no_zva)
+#endif
str q0, [dst, 16]
stp q0, q0, [dst, 32]
bic dst, dst, 63
- stp q0, q0, [dst, 64]
- stp q0, q0, [dst, 96]
- sub count, dstend, dst /* Count is now 128 too large. */
- sub count, count, 128+64+64 /* Adjust count and bias for loop. */
- add dst, dst, 128
- nop
-1: dc zva, dst
+ sub count, dstend, dst /* Count is now 64 too large. */
+ sub count, count, 128 /* Adjust count and bias for loop. */
+
+ .p2align 4
+L(zva_loop):
add dst, dst, 64
+ dc zva, dst
subs count, count, 64
- b.hi 1b
- stp q0, q0, [dst, 0]
- stp q0, q0, [dst, 32]
+ b.hi L(zva_loop)
stp q0, q0, [dstend, -64]
stp q0, q0, [dstend, -32]
ret
- .p2align 3
-L(zva_128):
- cmp tmp1w, 5 /* ZVA size is 128 bytes. */
- b.ne L(zva_other)
-
- str q0, [dst, 16]
+L(no_zva):
+ sub count, dstend, dst /* Count is 16 too large. */
+ sub dst, dst, 16 /* Dst is biased by -32. */
+ sub count, count, 64 + 16 /* Adjust count and bias for loop. */
+L(no_zva_loop):
stp q0, q0, [dst, 32]
- stp q0, q0, [dst, 64]
- stp q0, q0, [dst, 96]
- bic dst, dst, 127
- sub count, dstend, dst /* Count is now 128 too large. */
- sub count, count, 128+128 /* Adjust count and bias for loop. */
- add dst, dst, 128
-1: dc zva, dst
- add dst, dst, 128
- subs count, count, 128
- b.hi 1b
- stp q0, q0, [dstend, -128]
- stp q0, q0, [dstend, -96]
+ stp q0, q0, [dst, 64]!
+ subs count, count, 64
+ b.hi L(no_zva_loop)
stp q0, q0, [dstend, -64]
stp q0, q0, [dstend, -32]
ret
-L(zva_other):
- mov tmp2w, 4
- lsl zva_lenw, tmp2w, tmp1w
- add tmp1, zva_len, 64 /* Max alignment bytes written. */
- cmp count, tmp1
- blo L(no_zva)
-
- sub tmp2, zva_len, 1
- add tmp1, dst, zva_len
- add dst, dst, 16
- subs count, tmp1, dst /* Actual alignment bytes to write. */
- bic tmp1, tmp1, tmp2 /* Aligned dc zva start address. */
- beq 2f
-1: stp q0, q0, [dst], 64
- stp q0, q0, [dst, -32]
- subs count, count, 64
- b.hi 1b
-2: mov dst, tmp1
- sub count, dstend, tmp1 /* Remaining bytes to write. */
- subs count, count, zva_len
- b.lo 4f
-3: dc zva, dst
- add dst, dst, zva_len
- subs count, count, zva_len
- b.hs 3b
-4: add count, count, zva_len
- b L(tail64)
-
-END(memset)
+END (memset)
+
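The rewritten memset drops the old zva_128/zva_other fallbacks: DC ZVA is now attempted only when zeroing at least 160 bytes (cmp count, 160 / ccmp valw, 0), and only if DCZID_EL0 reports an enabled 64-byte zero line. A sketch of that probe (AArch64-only, illustrative; the assembly folds both tests into one compare against bits 4:0):

    #include <stdint.h>

    static int zva_is_64_bytes(void)
    {
        uint64_t dczid;
        __asm__("mrs %0, dczid_el0" : "=r"(dczid));
        if (dczid & 16)            /* DZP bit set: DC ZVA prohibited */
            return 0;
        /* Bits 3:0 give log2 of the block size in 4-byte words,
           so the value 4 means 2^4 * 4 = 64 bytes. */
        return (dczid & 15) == 4;
    }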
diff --git a/libc/arch-arm64/generic/bionic/stpcpy.S b/libc/arch-arm64/generic/bionic/stpcpy.S
index e4a799387..4f62aa462 100644
--- a/libc/arch-arm64/generic/bionic/stpcpy.S
+++ b/libc/arch-arm64/generic/bionic/stpcpy.S
@@ -1,29 +1,10 @@
/*
- * Copyright (C) 2014 The Android Open Source Project
- * All rights reserved.
+ * stpcpy - copy a string returning pointer to end.
*
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
- * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
- * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
*/
-#define STPCPY
-#include "string_copy.S"
+
+#define BUILD_STPCPY 1
+
+#include "strcpy.S"
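The old scheme assembled string_copy.S twice under STPCPY/STRCPY; the new one builds strcpy.S with BUILD_STPCPY defined to get the second entry point. The same pattern in C terms (hypothetical names, illustration only):

    /* One body, two entry points differing only in the return value. */
    #ifdef BUILD_STPCPY
    #define NAME my_stpcpy
    #else
    #define NAME my_strcpy
    #endif

    char *NAME(char *dst, const char *src)
    {
        char *d = dst;
        while ((*d = *src++) != '\0')
            d++;
    #ifdef BUILD_STPCPY
        return d;      /* stpcpy: pointer to the terminating NUL */
    #else
        return dst;    /* strcpy: pointer to the start */
    #endif
    }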
diff --git a/libc/arch-arm64/generic/bionic/strchrnul.S b/libc/arch-arm64/generic/bionic/strchrnul.S
new file mode 100644
index 000000000..3fd909c15
--- /dev/null
+++ b/libc/arch-arm64/generic/bionic/strchrnul.S
@@ -0,0 +1,114 @@
+/*
+ * strchrnul - find a character or nul in a string
+ *
+ * Copyright (c) 2014-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * Neon Available.
+ */
+
+#include <private/bionic_asm.h>
+
+/* Arguments and results. */
+#define srcin x0
+#define chrin w1
+
+#define result x0
+
+#define src x2
+#define tmp1 x3
+#define wtmp2 w4
+#define tmp3 x5
+
+#define vrepchr v0
+#define vdata1 v1
+#define vdata2 v2
+#define vhas_nul1 v3
+#define vhas_nul2 v4
+#define vhas_chr1 v5
+#define vhas_chr2 v6
+#define vrepmask v7
+#define vend1 v16
+
+/* Core algorithm.
+
+ For each 32-byte hunk we calculate a 64-bit syndrome value, with
+ two bits per byte (LSB is always in bits 0 and 1, for both big
+ and little-endian systems). For each tuple, bit 0 is set iff
+ the relevant byte matched the requested character or nul. Since the
+ bits in the syndrome reflect exactly the order in which things occur
+ in the original string a count_trailing_zeros() operation will
+ identify exactly which byte is causing the termination. */
+
+/* Locals and temporaries. */
+
+ENTRY (strchrnul)
+ PTR_ARG (0)
+ /* Magic constant 0x40100401 to allow us to identify which lane
+ matches the termination condition. */
+ mov wtmp2, #0x0401
+ movk wtmp2, #0x4010, lsl #16
+ dup vrepchr.16b, chrin
+ bic src, srcin, #31 /* Work with aligned 32-byte hunks. */
+ dup vrepmask.4s, wtmp2
+ ands tmp1, srcin, #31
+ b.eq L(loop)
+
+ /* Input string is not 32-byte aligned. Rather than forcing
+ the padding bytes to a safe value, we calculate the syndrome
+ for all the bytes, but then mask off those bits of the
+ syndrome that are related to the padding. */
+ ld1 {vdata1.16b, vdata2.16b}, [src], #32
+ neg tmp1, tmp1
+ cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
+ cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
+ cmhs vhas_nul1.16b, vhas_chr1.16b, vdata1.16b
+ cmhs vhas_nul2.16b, vhas_chr2.16b, vdata2.16b
+ and vhas_chr1.16b, vhas_nul1.16b, vrepmask.16b
+ and vhas_chr2.16b, vhas_nul2.16b, vrepmask.16b
+ lsl tmp1, tmp1, #1
+ addp vend1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
+ mov tmp3, #~0
+ addp vend1.16b, vend1.16b, vend1.16b // 128->64
+ lsr tmp1, tmp3, tmp1
+
+ mov tmp3, vend1.d[0]
+ bic tmp1, tmp3, tmp1 // Mask padding bits.
+ cbnz tmp1, L(tail)
+
+ .p2align 4
+L(loop):
+ ld1 {vdata1.16b, vdata2.16b}, [src], #32
+ cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
+ cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
+ cmhs vhas_nul1.16b, vhas_chr1.16b, vdata1.16b
+ cmhs vhas_nul2.16b, vhas_chr2.16b, vdata2.16b
+ orr vend1.16b, vhas_nul1.16b, vhas_nul2.16b
+ umaxp vend1.16b, vend1.16b, vend1.16b
+ mov tmp1, vend1.d[0]
+ cbz tmp1, L(loop)
+
+ /* Termination condition found. Now need to establish exactly why
+ we terminated. */
+ and vhas_chr1.16b, vhas_nul1.16b, vrepmask.16b
+ and vhas_chr2.16b, vhas_nul2.16b, vrepmask.16b
+ addp vend1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
+ addp vend1.16b, vend1.16b, vend1.16b // 128->64
+
+ mov tmp1, vend1.d[0]
+L(tail):
+ /* Count the trailing zeros, by bit reversing... */
+ rbit tmp1, tmp1
+ /* Re-bias source. */
+ sub src, src, #32
+ clz tmp1, tmp1 /* ... and counting the leading zeros. */
+ /* tmp1 is twice the offset into the fragment. */
+ add result, src, tmp1, lsr #1
+ ret
+
+END (strchrnul)
+
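For a misaligned start, strchrnul computes the syndrome for the whole aligned 32-byte hunk and then clears the bits belonging to bytes before the string, rather than patching the input bytes. A scalar model of the 2-bits-per-byte syndrome and the padding mask (illustrative; the real code uses rbit+clz where ctz appears below):

    #include <stdint.h>

    /* Bit 2*i is set iff byte i matches the target or is NUL. */
    static uint64_t syndrome32(const unsigned char *hunk, unsigned char c,
                               unsigned skip /* bytes before the string */)
    {
        uint64_t synd = 0;
        for (int i = 0; i < 32; i++)
            if (hunk[i] == c || hunk[i] == 0)
                synd |= 1ull << (2 * i);
        if (skip)  /* mask padding bits, as in "bic tmp1, tmp3, tmp1" */
            synd &= ~(~0ull >> (64 - 2 * skip));
        return synd;
    }

    /* Offset of the first match within the hunk, or -1. */
    static int first_match32(uint64_t synd)
    {
        return synd ? __builtin_ctzll(synd) >> 1 : -1;
    }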
diff --git a/libc/arch-arm64/generic/bionic/strcpy.S b/libc/arch-arm64/generic/bionic/strcpy.S
index 260c32138..54da47c0b 100644
--- a/libc/arch-arm64/generic/bionic/strcpy.S
+++ b/libc/arch-arm64/generic/bionic/strcpy.S
@@ -1,29 +1,311 @@
/*
- * Copyright (C) 2014 The Android Open Source Project
- * All rights reserved.
+ * strcpy/stpcpy - copy a string returning pointer to start/end.
*
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
+ * Copyright (c) 2013-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
*
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
- * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
- * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
+ * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
*/
-#define STRCPY
-#include "string_copy.S"
+
+#include <private/bionic_asm.h>
+
+/* To build as stpcpy, define BUILD_STPCPY before compiling this file.
+
+ To test the page crossing code path more thoroughly, compile with
+ -DSTRCPY_TEST_PAGE_CROSS - this will force all copies through the slower
+ entry path. This option is not intended for production use. */
+
+/* Arguments and results. */
+#define dstin x0
+#define srcin x1
+
+/* Locals and temporaries. */
+#define src x2
+#define dst x3
+#define data1 x4
+#define data1w w4
+#define data2 x5
+#define data2w w5
+#define has_nul1 x6
+#define has_nul2 x7
+#define tmp1 x8
+#define tmp2 x9
+#define tmp3 x10
+#define tmp4 x11
+#define zeroones x12
+#define data1a x13
+#define data2a x14
+#define pos x15
+#define len x16
+#define to_align x17
+
+#ifdef BUILD_STPCPY
+#define STRCPY stpcpy
+#else
+#define STRCPY strcpy
+#endif
+
+ /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+ (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+ can be done in parallel across the entire word. */
+
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+#define REP8_80 0x8080808080808080
+
+ /* AArch64 systems have a minimum page size of 4k. We can do a quick
+ page size check for crossing this boundary on entry and if we
+ do not, then we can short-circuit much of the entry code. We
+ expect early page-crossing strings to be rare (probability of
+ 16/MIN_PAGE_SIZE ~= 0.4%), so the branch should be quite
+ predictable, even with random strings.
+
+ We don't bother checking for larger page sizes, the cost of setting
+ up the correct page size is just not worth the extra gain from
+ a small reduction in the cases taking the slow path. Note that
+ we only care about whether the first fetch, which may be
+ misaligned, crosses a page boundary - after that we move to aligned
+ fetches for the remainder of the string. */
+
+#ifdef STRCPY_TEST_PAGE_CROSS
+ /* Make everything that isn't Qword aligned look like a page cross. */
+#define MIN_PAGE_P2 4
+#else
+#define MIN_PAGE_P2 12
+#endif
+
+#define MIN_PAGE_SIZE (1 << MIN_PAGE_P2)
+
+ENTRY (STRCPY)
+ PTR_ARG (0)
+ PTR_ARG (1)
+ /* For moderately short strings, the fastest way to do the copy is to
+ calculate the length of the string in the same way as strlen, then
+ essentially do a memcpy of the result. This avoids the need for
+ multiple byte copies and further means that by the time we
+ reach the bulk copy loop we know we can always use DWord
+ accesses. We expect __strcpy_aarch64 to rarely be called repeatedly
+ with the same source string, so branch prediction is likely to
+ always be difficult - we mitigate against this by preferring
+ conditional select operations over branches whenever this is
+ feasible. */
+ and tmp2, srcin, #(MIN_PAGE_SIZE - 1)
+ mov zeroones, #REP8_01
+ and to_align, srcin, #15
+ cmp tmp2, #(MIN_PAGE_SIZE - 16)
+ neg tmp1, to_align
+ /* The first fetch will straddle a (possible) page boundary iff
+ srcin + 15 causes bit[MIN_PAGE_P2] to change value. A 16-byte
+ aligned string will never fail the page align check, so will
+ always take the fast path. */
+ b.gt L(page_cross)
+
+L(page_cross_ok):
+ ldp data1, data2, [srcin]
+#ifdef __AARCH64EB__
+ /* Because we expect the end to be found within 16 characters
+ (profiling shows this is the most common case), it's worth
+ swapping the bytes now to save having to recalculate the
+ termination syndrome later. We preserve data1 and data2
+ so that we can re-use the values later on. */
+ rev tmp2, data1
+ sub tmp1, tmp2, zeroones
+ orr tmp2, tmp2, #REP8_7f
+ bics has_nul1, tmp1, tmp2
+ b.ne L(fp_le8)
+ rev tmp4, data2
+ sub tmp3, tmp4, zeroones
+ orr tmp4, tmp4, #REP8_7f
+#else
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ bics has_nul1, tmp1, tmp2
+ b.ne L(fp_le8)
+ sub tmp3, data2, zeroones
+ orr tmp4, data2, #REP8_7f
+#endif
+ bics has_nul2, tmp3, tmp4
+ b.eq L(bulk_entry)
+
+ /* The string is short (<=16 bytes). We don't know exactly how
+ short though, yet. Work out the exact length so that we can
+ quickly select the optimal copy strategy. */
+L(fp_gt8):
+ rev has_nul2, has_nul2
+ clz pos, has_nul2
+ mov tmp2, #56
+ add dst, dstin, pos, lsr #3 /* Bits to bytes. */
+ sub pos, tmp2, pos
+#ifdef __AARCH64EB__
+ lsr data2, data2, pos
+#else
+ lsl data2, data2, pos
+#endif
+ str data2, [dst, #1]
+ str data1, [dstin]
+#ifdef BUILD_STPCPY
+ add dstin, dst, #8
+#endif
+ ret
+
+L(fp_le8):
+ rev has_nul1, has_nul1
+ clz pos, has_nul1
+ add dst, dstin, pos, lsr #3 /* Bits to bytes. */
+ subs tmp2, pos, #24 /* Pos in bits. */
+ b.lt L(fp_lt4)
+#ifdef __AARCH64EB__
+ mov tmp2, #56
+ sub pos, tmp2, pos
+ lsr data2, data1, pos
+ lsr data1, data1, #32
+#else
+ lsr data2, data1, tmp2
+#endif
+ /* 4->7 bytes to copy. */
+ str data2w, [dst, #-3]
+ str data1w, [dstin]
+#ifdef BUILD_STPCPY
+ mov dstin, dst
+#endif
+ ret
+L(fp_lt4):
+ cbz pos, L(fp_lt2)
+ /* 2->3 bytes to copy. */
+#ifdef __AARCH64EB__
+ lsr data1, data1, #48
+#endif
+ strh data1w, [dstin]
+ /* Fall-through, one byte (max) to go. */
+L(fp_lt2):
+ /* Null-terminated string. Last character must be zero! */
+ strb wzr, [dst]
+#ifdef BUILD_STPCPY
+ mov dstin, dst
+#endif
+ ret
+
+ .p2align 6
+ /* Aligning here ensures that the entry code and main loop all lie
+ within one 64-byte cache line. */
+L(bulk_entry):
+ sub to_align, to_align, #16
+ stp data1, data2, [dstin]
+ sub src, srcin, to_align
+ sub dst, dstin, to_align
+ b L(entry_no_page_cross)
+
+ /* The inner loop deals with two Dwords at a time. This has a
+ slightly higher start-up cost, but we should win quite quickly,
+ especially on cores with a high number of issue slots per
+ cycle, as we get much better parallelism out of the operations. */
+L(main_loop):
+ stp data1, data2, [dst], #16
+L(entry_no_page_cross):
+ ldp data1, data2, [src], #16
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ sub tmp3, data2, zeroones
+ orr tmp4, data2, #REP8_7f
+ bic has_nul1, tmp1, tmp2
+ bics has_nul2, tmp3, tmp4
+ ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */
+ b.eq L(main_loop)
+
+ /* Since we know we are copying at least 16 bytes, the fastest way
+ to deal with the tail is to determine the location of the
+ trailing NUL, then (re)copy the 16 bytes leading up to that. */
+ cmp has_nul1, #0
+#ifdef __AARCH64EB__
+ /* For big-endian, carry propagation (if the final byte in the
+ string is 0x01) means we cannot use has_nul directly. The
+ easiest way to get the correct byte is to byte-swap the data
+ and calculate the syndrome a second time. */
+ csel data1, data1, data2, ne
+ rev data1, data1
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ bic has_nul1, tmp1, tmp2
+#else
+ csel has_nul1, has_nul1, has_nul2, ne
+#endif
+ rev has_nul1, has_nul1
+ clz pos, has_nul1
+ add tmp1, pos, #72
+ add pos, pos, #8
+ csel pos, pos, tmp1, ne
+ add src, src, pos, lsr #3
+ add dst, dst, pos, lsr #3
+ ldp data1, data2, [src, #-32]
+ stp data1, data2, [dst, #-16]
+#ifdef BUILD_STPCPY
+ sub dstin, dst, #1
+#endif
+ ret
+
+L(page_cross):
+ bic src, srcin, #15
+ /* Start by loading two words at [srcin & ~15], then forcing the
+ bytes that precede srcin to 0xff. This means they never look
+ like termination bytes. */
+ ldp data1, data2, [src]
+ lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
+ tst to_align, #7
+ csetm tmp2, ne
+#ifdef __AARCH64EB__
+ lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
+#else
+ lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
+#endif
+ orr data1, data1, tmp2
+ orr data2a, data2, tmp2
+ cmp to_align, #8
+ csinv data1, data1, xzr, lt
+ csel data2, data2, data2a, lt
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ sub tmp3, data2, zeroones
+ orr tmp4, data2, #REP8_7f
+ bic has_nul1, tmp1, tmp2
+ bics has_nul2, tmp3, tmp4
+ ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */
+ b.eq L(page_cross_ok)
+ /* We now need to make data1 and data2 look like they've been
+ loaded directly from srcin. Do a rotate on the 128-bit value. */
+ lsl tmp1, to_align, #3 /* Bytes->bits. */
+ neg tmp2, to_align, lsl #3
+#ifdef __AARCH64EB__
+ lsl data1a, data1, tmp1
+ lsr tmp4, data2, tmp2
+ lsl data2, data2, tmp1
+ orr tmp4, tmp4, data1a
+ cmp to_align, #8
+ csel data1, tmp4, data2, lt
+ rev tmp2, data1
+ rev tmp4, data2
+ sub tmp1, tmp2, zeroones
+ orr tmp2, tmp2, #REP8_7f
+ sub tmp3, tmp4, zeroones
+ orr tmp4, tmp4, #REP8_7f
+#else
+ lsr data1a, data1, tmp1
+ lsl tmp4, data2, tmp2
+ lsr data2, data2, tmp1
+ orr tmp4, tmp4, data1a
+ cmp to_align, #8
+ csel data1, tmp4, data2, lt
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ sub tmp3, data2, zeroones
+ orr tmp4, data2, #REP8_7f
+#endif
+ bic has_nul1, tmp1, tmp2
+ cbnz has_nul1, L(fp_le8)
+ bic has_nul2, tmp3, tmp4
+ b L(fp_gt8)
+
+END (STRCPY)
+
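The NUL-detection identity quoted in the new strcpy.S comment is the usual word-at-a-time zero-byte test. In C (sketch; the assembly uses rev+clz instead of ctz so the same sequence can serve big-endian):

    #include <stdint.h>

    #define REP8_01 0x0101010101010101ull
    #define REP8_7f 0x7f7f7f7f7f7f7f7full

    /* Non-zero iff some byte of x is zero; matches "sub tmp1, data1,
       zeroones; orr tmp2, data1, #REP8_7f; bics has_nul1, tmp1, tmp2". */
    static uint64_t has_nul(uint64_t x)
    {
        return (x - REP8_01) & ~(x | REP8_7f);
    }

    /* Byte index of the first NUL in a little-endian word: the surviving
       bits sit at bit 7 of each byte, so ctz/8 is the index.  Assumes
       the word is known to contain a NUL byte. */
    static unsigned first_nul_index(uint64_t word_le)
    {
        return (unsigned)(__builtin_ctzll(has_nul(word_le)) >> 3);
    }

The MIN_PAGE_SIZE entry check relies on the same layout: a 16-byte first fetch can only fault if (srcin & 4095) > 4096 - 16, which is what "cmp tmp2, #(MIN_PAGE_SIZE - 16); b.gt L(page_cross)" tests.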
diff --git a/libc/arch-arm64/generic/bionic/string_copy.S b/libc/arch-arm64/generic/bionic/string_copy.S
deleted file mode 100644
index 2bf969d60..000000000
--- a/libc/arch-arm64/generic/bionic/string_copy.S
+++ /dev/null
@@ -1,245 +0,0 @@
-/*
- * Copyright (C) 2014 The Android Open Source Project
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
- * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
- * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-/*
- Copyright (c) 2014, Linaro Limited
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- * Neither the name of the Linaro nor the
- names of its contributors may be used to endorse or promote products
- derived from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-/* Assumptions:
- *
- * ARMv8-a, AArch64
- */
-
-#if !defined(STPCPY) && !defined(STRCPY)
-#error "Either STPCPY or STRCPY must be defined."
-#endif
-
-#include <private/bionic_asm.h>
-
-/* Arguments and results. */
-#if defined(STPCPY)
-#define dst x0
-#elif defined(STRCPY)
-#define dstin x0
-#endif
-#define src x1
-
-/* Locals and temporaries. */
-#if defined(STRCPY)
-#define dst x2
-#endif
-#define data1 x3
-#define data1_w w3
-#define data2 x4
-#define data2_w w4
-#define has_nul1 x5
-#define has_nul1_w w5
-#define has_nul2 x6
-#define tmp1 x7
-#define tmp2 x8
-#define tmp3 x9
-#define tmp4 x10
-#define zeroones x11
-#define zeroones_w w11
-#define pos x12
-
-#define REP8_01 0x0101010101010101
-#define REP8_7f 0x7f7f7f7f7f7f7f7f
-#define REP8_80 0x8080808080808080
-
-#if defined(STPCPY)
-ENTRY(stpcpy)
-#elif defined(STRCPY)
-ENTRY(strcpy)
-#endif
- mov zeroones, #REP8_01
-#if defined(STRCPY)
- mov dst, dstin
-#endif
- ands tmp1, src, #15
- b.ne .Lmisaligned
- // NUL detection works on the principle that (X - 1) & (~X) & 0x80
- // (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
- // can be done in parallel across the entire word.
- // The inner loop deals with two Dwords at a time. This has a
- // slightly higher start-up cost, but we should win quite quickly,
- // especially on cores with a high number of issue slots per
- // cycle, as we get much better parallelism out of the operations.
-.Lloop:
- ldp data1, data2, [src], #16
- sub tmp1, data1, zeroones
- orr tmp2, data1, #REP8_7f
- bic has_nul1, tmp1, tmp2
- cbnz has_nul1, .Lnul_in_data1
- sub tmp3, data2, zeroones
- orr tmp4, data2, #REP8_7f
- bic has_nul2, tmp3, tmp4
- cbnz has_nul2, .Lnul_in_data2
- // No NUL in either register, copy it in a single instruction.
- stp data1, data2, [dst], #16
- b .Lloop
-
-.Lnul_in_data1:
- rev has_nul1, has_nul1
- clz pos, has_nul1
- add tmp1, pos, #0x8
-
- tbz tmp1, #6, 1f
-#if defined(STPCPY)
- str data1, [dst], #7
-#elif defined(STRCPY)
- str data1, [dst]
-#endif
- ret
-1:
- tbz tmp1, #5, 1f
- str data1_w, [dst], #4
- lsr data1, data1, #32
-1:
- tbz tmp1, #4, 1f
- strh data1_w, [dst], #2
- lsr data1, data1, #16
-1:
- tbz tmp1, #3, 1f
- strb data1_w, [dst]
-#if defined(STPCPY)
- ret
-#endif
-1:
-#if defined(STPCPY)
- // Back up one so that dst points to the '\0' string terminator.
- sub dst, dst, #1
-#endif
- ret
-
-.Lnul_in_data2:
- str data1, [dst], #8
- rev has_nul2, has_nul2
- clz pos, has_nul2
- add tmp1, pos, #0x8
-
- tbz tmp1, #6, 1f
-#if defined(STPCPY)
- str data2, [dst], #7
-#elif defined(STRCPY)
- str data2, [dst]
-#endif
- ret
-1:
- tbz tmp1, #5, 1f
- str data2_w, [dst], #4
- lsr data2, data2, #32
-1:
- tbz tmp1, #4, 1f
- strh data2_w, [dst], #2
- lsr data2, data2, #16
-1:
- tbz tmp1, #3, 1f
- strb data2_w, [dst]
-#if defined(STPCPY)
- ret
-#endif
-1:
-#if defined(STPCPY)
- // Back up one so that dst points to the '\0' string terminator.
- sub dst, dst, #1
-#endif
- ret
-
-.Lmisaligned:
- tbz src, #0, 1f
- ldrb data1_w, [src], #1
- strb data1_w, [dst], #1
- cbnz data1_w, 1f
-#if defined(STPCPY)
- // Back up one so that dst points to the '\0' string terminator.
- sub dst, dst, #1
-#endif
- ret
-1:
- tbz src, #1, 1f
- ldrb data1_w, [src], #1
- strb data1_w, [dst], #1
- cbz data1_w, .Ldone
- ldrb data2_w, [src], #1
- strb data2_w, [dst], #1
- cbnz data2_w, 1f
-.Ldone:
-#if defined(STPCPY)
- // Back up one so that dst points to the '\0' string terminator.
- sub dst, dst, #1
-#endif
- ret
-1:
- tbz src, #2, 1f
- ldr data1_w, [src], #4
- // Check for a zero.
- sub has_nul1_w, data1_w, zeroones_w
- bic has_nul1_w, has_nul1_w, data1_w
- ands has_nul1_w, has_nul1_w, #0x80808080
- b.ne .Lnul_in_data1
- str data1_w, [dst], #4
-1:
- tbz src, #3, .Lloop
- ldr data1, [src], #8
- // Check for a zero.
- sub tmp1, data1, zeroones
- orr tmp2, data1, #REP8_7f
- bics has_nul1, tmp1, tmp2
- b.ne .Lnul_in_data1
- str data1, [dst], #8
- b .Lloop
-#if defined(STPCPY)
-END(stpcpy)
-#elif defined(STRCPY)
-END(strcpy)
-#endif
diff --git a/libc/arch-arm64/generic/bionic/strrchr.S b/libc/arch-arm64/generic/bionic/strrchr.S
new file mode 100644
index 000000000..274d19746
--- /dev/null
+++ b/libc/arch-arm64/generic/bionic/strrchr.S
@@ -0,0 +1,149 @@
+/*
+ * strrchr - find last position of a character in a string.
+ *
+ * Copyright (c) 2014-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * Neon Available.
+ */
+
+#include <private/bionic_asm.h>
+
+/* Arguments and results. */
+#define srcin x0
+#define chrin w1
+
+#define result x0
+
+#define src x2
+#define tmp1 x3
+#define wtmp2 w4
+#define tmp3 x5
+#define src_match x6
+#define src_offset x7
+#define const_m1 x8
+#define tmp4 x9
+#define nul_match x10
+#define chr_match x11
+
+#define vrepchr v0
+#define vdata1 v1
+#define vdata2 v2
+#define vhas_nul1 v3
+#define vhas_nul2 v4
+#define vhas_chr1 v5
+#define vhas_chr2 v6
+#define vrepmask_0 v7
+#define vrepmask_c v16
+#define vend1 v17
+#define vend2 v18
+
+/* Core algorithm.
+
+ For each 32-byte hunk we calculate a 64-bit syndrome value, with
+ two bits per byte (LSB is always in bits 0 and 1, for both big
+ and little-endian systems). For each tuple, bit 0 is set iff
+ the relevant byte matched the requested character; bit 1 is set
+ iff the relevant byte matched the NUL end of string (we trigger
+ off bit0 for the special case of looking for NUL). Since the bits
+ in the syndrome reflect exactly the order in which things occur
+ in the original string a count_trailing_zeros() operation will
+ identify exactly which byte is causing the termination, and why. */
+
+ENTRY (strrchr)
+ PTR_ARG (0)
+ /* Magic constant 0x40100401 to allow us to identify which lane
+ matches the requested byte. Magic constant 0x80200802 used
+ similarly for NUL termination. */
+ mov wtmp2, #0x0401
+ movk wtmp2, #0x4010, lsl #16
+ dup vrepchr.16b, chrin
+ bic src, srcin, #31 /* Work with aligned 32-byte hunks. */
+ dup vrepmask_c.4s, wtmp2
+ mov src_offset, #0
+ ands tmp1, srcin, #31
+ add vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */
+ b.eq L(aligned)
+
+ /* Input string is not 32-byte aligned. Rather than forcing
+ the padding bytes to a safe value, we calculate the syndrome
+ for all the bytes, but then mask off those bits of the
+ syndrome that are related to the padding. */
+ ld1 {vdata1.16b, vdata2.16b}, [src], #32
+ neg tmp1, tmp1
+ cmeq vhas_nul1.16b, vdata1.16b, #0
+ cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
+ cmeq vhas_nul2.16b, vdata2.16b, #0
+ cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
+ and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
+ and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
+ and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
+ and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
+ addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b // 256->128
+ addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
+ addp vend1.16b, vhas_nul1.16b, vhas_chr1.16b // 128->64
+ mov nul_match, vend1.d[0]
+ lsl tmp1, tmp1, #1
+ mov const_m1, #~0
+ lsr tmp3, const_m1, tmp1
+ mov chr_match, vend1.d[1]
+
+ bic nul_match, nul_match, tmp3 // Mask padding bits.
+ bic chr_match, chr_match, tmp3 // Mask padding bits.
+ cbnz nul_match, L(tail)
+
+ .p2align 4
+L(loop):
+ cmp chr_match, #0
+ csel src_match, src, src_match, ne
+ csel src_offset, chr_match, src_offset, ne
+L(aligned):
+ ld1 {vdata1.16b, vdata2.16b}, [src], #32
+ cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
+ cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
+ uminp vend1.16b, vdata1.16b, vdata2.16b
+ and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
+ and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
+ cmeq vend1.16b, vend1.16b, 0
+ addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
+ addp vend1.16b, vend1.16b, vhas_chr1.16b // 128->64
+ mov nul_match, vend1.d[0]
+ mov chr_match, vend1.d[1]
+ cbz nul_match, L(loop)
+
+ cmeq vhas_nul1.16b, vdata1.16b, #0
+ cmeq vhas_nul2.16b, vdata2.16b, #0
+ and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
+ and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
+ addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b
+ addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul1.16b
+ mov nul_match, vhas_nul1.d[0]
+
+L(tail):
+ /* Work out exactly where the string ends. */
+ sub tmp4, nul_match, #1
+ eor tmp4, tmp4, nul_match
+ ands chr_match, chr_match, tmp4
+ /* And pick the values corresponding to the last match. */
+ csel src_match, src, src_match, ne
+ csel src_offset, chr_match, src_offset, ne
+
+ /* Count down from the top of the syndrome to find the last match. */
+ clz tmp3, src_offset
+ /* Src_match points beyond the word containing the match, so we can
+ simply subtract half the bit-offset into the syndrome. Because
+ we are counting down, we need to go back one more character. */
+ add tmp3, tmp3, #2
+ sub result, src_match, tmp3, lsr #1
+ /* But if the syndrome shows no match was found, then return NULL. */
+ cmp src_offset, #0
+ csel result, result, xzr, ne
+
+ ret
+
+END (strrchr)
+
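The interesting part of strrchr's tail is how it discards character matches that fall after the terminating NUL: (x - 1) ^ x yields a mask of every bit up to and including the lowest set bit of x. A scalar sketch (illustrative only):

    #include <stdint.h>

    /* chr_synd has bit 2*i set for a character match at byte i;
       nul_synd has bit 2*j + 1 set for a NUL at byte j. */
    static int last_match_before_nul(uint64_t chr_synd, uint64_t nul_synd)
    {
        if (nul_synd)   /* keep matches at or before the NUL */
            chr_synd &= (nul_synd - 1) ^ nul_synd;
        if (chr_synd == 0)
            return -1;                     /* no match: strrchr -> NULL */
        /* Highest surviving bit, halved: two syndrome bits per byte. */
        return (63 - __builtin_clzll(chr_synd)) >> 1;
    }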
diff --git a/libc/arch-arm64/generic/bionic/wmemmove.S b/libc/arch-arm64/generic/bionic/wmemmove.S
index e4f67f759..dc03f4c92 100644
--- a/libc/arch-arm64/generic/bionic/wmemmove.S
+++ b/libc/arch-arm64/generic/bionic/wmemmove.S
@@ -1,30 +1,33 @@
-/* Copyright (c) 2014, Linaro Limited
- All rights reserved.
+/*
+ * Copyright (C) 2008 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- * Neither the name of the Linaro nor the
- names of its contributors may be used to endorse or promote products
- derived from this software without specific prior written permission.
+#include <private/bionic_asm.h>
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-#define WMEMMOVE
-#include "memmove.S"
-#undef WMEMMOVE
+ENTRY(wmemmove)
+ #include "memcpy_base.S"
+END(wmemmove)
diff --git a/libc/private/bionic_asm.h b/libc/private/bionic_asm.h
index 6409563f2..d18f0fc42 100644
--- a/libc/private/bionic_asm.h
+++ b/libc/private/bionic_asm.h
@@ -83,4 +83,15 @@
.globl alias; \
.equ alias, original
+/* For arm-optimized-routines */
+#define L(l) .L ## l
+#ifdef __ILP32__
+ /* Sanitize padding bits of pointer arguments as per aapcs64 */
+#define PTR_ARG(n) mov w##n, w##n
+#define SIZE_ARG(n) mov w##n, w##n
+#else
+#define PTR_ARG(n)
+#define SIZE_ARG(n)
+#endif
+
#endif
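A note on PTR_ARG/SIZE_ARG: under ILP32, pointers and size_t arrive in W registers and the upper 32 bits of the corresponding X register are unspecified, so the routines' 64-bit addressing needs the self-move, which zero-extends; under LP64 the macros expand to nothing. A small demonstration of the effect (AArch64 inline asm, illustrative):

    #include <stdint.h>

    static uint64_t sanitize_sketch(uint64_t reg_with_junk_top)
    {
        /* "mov wN, wN" writes the 32-bit view, clearing bits 63:32. */
        __asm__("mov %w0, %w0" : "+r"(reg_with_junk_top));
        return reg_with_junk_top;
    }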