author     Danny Lin <danny@kdrag0n.dev>           2021-03-21 01:51:00 -0700
committer  alk3pInjection <webmaster@raspii.tech>  2021-09-27 21:17:05 +0800
commit     03559b84468a98cd72150a66092ec463035bb666 (patch)
tree       bb321caca05a1413985d29560130bde2550b5fc7
parent     ba602b8990ff1b695ea580a537a07a07224da937 (diff)

[ProtonAOSP] Switch to arm-optimized-routines for arm64 (lineage-18.1)
This replaces the following string functions with implementations from
arm-optimized-routines [1]:

- memchr
- strchr
- strcmp
- strlen
- strncmp
- strnlen
- memcmp
- memcpy (NEON variant)
- memmove
- memrchr
- memset
- stpcpy
- strchrnul
- strcpy
- strrchr
- wmemmove

All string functions supported by arm-optimized-routines were added as
of upstream commit e823e3abf5f89ecba58a10fc0fd82c13d9984b6b. The NEON
variant of memcpy was used with the assumption that all of our arm64
targets support NEON.

[1] https://github.com/ARM-software/optimized-routines

Test: builds and boots
Change-Id: Id43fcf54c459b7edab306030c668d1c1ce3d95cd
-rw-r--r--  libc/Android.bp                              |   13
-rw-r--r--  libc/arch-arm64/default/bionic/memchr.S      |   66
-rw-r--r--  libc/arch-arm64/default/bionic/strchr.S      |   91
-rw-r--r--  libc/arch-arm64/default/bionic/strcmp.S      |   43
-rw-r--r--  libc/arch-arm64/default/bionic/strlen.S      |  311
-rw-r--r--  libc/arch-arm64/default/bionic/strncmp.S     |  114
-rw-r--r--  libc/arch-arm64/default/bionic/strnlen.S     |  252
-rw-r--r--  libc/arch-arm64/generic/bionic/memcmp.S      |   48
-rw-r--r--  libc/arch-arm64/generic/bionic/memcpy.S      |    4
-rw-r--r--  libc/arch-arm64/generic/bionic/memcpy_base.S |  311
-rw-r--r--  libc/arch-arm64/generic/bionic/memmove.S     |  160
-rw-r--r--  libc/arch-arm64/generic/bionic/memrchr.S     |  117
-rw-r--r--  libc/arch-arm64/generic/bionic/memset.S      |  207
-rw-r--r--  libc/arch-arm64/generic/bionic/stpcpy.S      |   33
-rw-r--r--  libc/arch-arm64/generic/bionic/strchrnul.S   |  114
-rw-r--r--  libc/arch-arm64/generic/bionic/strcpy.S      |  332
-rw-r--r--  libc/arch-arm64/generic/bionic/string_copy.S |  245
-rw-r--r--  libc/arch-arm64/generic/bionic/strrchr.S     |  149
-rw-r--r--  libc/arch-arm64/generic/bionic/wmemmove.S    |   59
-rw-r--r--  libc/private/bionic_asm.h                    |   11
20 files changed, 1322 insertions(+), 1358 deletions(-)
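
[Editor's note: the commit is validated only by "builds and boots". A
minimal C smoke test exercising the replaced routines could look like the
sketch below -- a hypothetical test relying only on standard libc
semantics, not part of the commit.]

#include <assert.h>
#include <string.h>

int main(void) {
    char buf[32] = "hello, bionic";

    assert(memchr(buf, 'b', sizeof(buf)) == buf + 7);
    assert(strchr(buf, ',') == buf + 5);
    assert(strlen(buf) == 13);
    assert(strnlen(buf, 5) == 5);
    assert(strcmp("abc", "abd") < 0);
    assert(strncmp("abc", "abd", 2) == 0);
    assert(memcmp("abc", "abd", 2) == 0);

    char dst[32];
    memset(dst, 0, sizeof(dst));
    memcpy(dst, buf, 14);              /* 13 chars + NUL terminator */
    assert(strcmp(dst, buf) == 0);
    memmove(dst + 1, dst, 14);         /* overlapping forward move */
    assert(strcmp(dst + 1, "hello, bionic") == 0);
    return 0;
}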
diff --git a/libc/Android.bp b/libc/Android.bp
index 08fce3261..a422c26c5 100644
--- a/libc/Android.bp
+++ b/libc/Android.bp
@@ -619,6 +619,7 @@ cc_library_static {
arm64: {
exclude_srcs: [
"upstream-openbsd/lib/libc/string/memchr.c",
+ "upstream-openbsd/lib/libc/string/memrchr.c",
"upstream-openbsd/lib/libc/string/stpcpy.c",
"upstream-openbsd/lib/libc/string/strcpy.c",
"upstream-openbsd/lib/libc/string/strncmp.c",
@@ -851,9 +852,12 @@ cc_library_static {
"arch-arm64/generic/bionic/memcmp.S",
"arch-arm64/generic/bionic/memcpy.S",
"arch-arm64/generic/bionic/memmove.S",
+ "arch-arm64/generic/bionic/memrchr.S",
"arch-arm64/generic/bionic/memset.S",
"arch-arm64/generic/bionic/stpcpy.S",
+ "arch-arm64/generic/bionic/strchrnul.S",
"arch-arm64/generic/bionic/strcpy.S",
+ "arch-arm64/generic/bionic/strrchr.S",
"arch-arm64/generic/bionic/wmemmove.S",
"arch-arm64/default/bionic/memchr.S",
@@ -880,6 +884,7 @@ cc_library_static {
"bionic/__memcpy_chk.cpp",
"bionic/strchr.cpp",
"bionic/strnlen.c",
+ "bionic/strrchr.cpp",
],
},
@@ -1178,6 +1183,14 @@ cc_library_static {
"bionic/icu_static.cpp",
],
+ arch: {
+ arm64: {
+ exclude_srcs: [
+ "bionic/strchrnul.cpp",
+ ],
+ },
+ },
+
multilib: {
lib32: {
// LP32 cruft
diff --git a/libc/arch-arm64/default/bionic/memchr.S b/libc/arch-arm64/default/bionic/memchr.S
index 7fbcc8fd4..f5599bf2d 100644
--- a/libc/arch-arm64/default/bionic/memchr.S
+++ b/libc/arch-arm64/default/bionic/memchr.S
@@ -1,32 +1,9 @@
/*
+ * memchr - find a character in a memory zone
*
- Copyright (c) 2014, ARM Limited
- All rights Reserved.
- Copyright (c) 2014, Linaro Ltd.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- * Neither the name of the company nor the names of its contributors
- may be used to endorse or promote products derived from this
- software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
+ * Copyright (c) 2014-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
/* Assumptions:
*
@@ -69,12 +46,15 @@
* identify exactly which byte has matched.
*/
-ENTRY(memchr_default)
+ENTRY (memchr_default)
+ PTR_ARG (0)
+ SIZE_ARG (2)
+ /* Do not dereference srcin if no bytes to compare. */
+ cbz cntin, L(zero_length)
/*
* Magic constant 0x40100401 allows us to identify which lane matches
* the requested byte.
*/
- cbz cntin, .Lzero_length
mov wtmp2, #0x0401
movk wtmp2, #0x4010, lsl #16
dup vrepchr.16b, chrin
@@ -83,7 +63,7 @@ ENTRY(memchr_default)
dup vrepmask.4s, wtmp2
ands soff, srcin, #31
and cntrem, cntin, #31
- b.eq .Lloop
+ b.eq L(loop)
/*
* Input string is not 32-byte aligned. We calculate the syndrome
@@ -106,25 +86,25 @@ ENTRY(memchr_default)
lsr synd, synd, tmp
lsl synd, synd, tmp
/* The first block can also be the last */
- b.ls .Lmasklast
+ b.ls L(masklast)
/* Have we found something already? */
- cbnz synd, .Ltail
+ cbnz synd, L(tail)
-.Lloop:
+L(loop):
ld1 {vdata1.16b, vdata2.16b}, [src], #32
subs cntin, cntin, #32
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
/* If we're out of data we finish regardless of the result */
- b.ls .Lend
+ b.ls L(end)
/* Use a fast check for the termination condition */
orr vend.16b, vhas_chr1.16b, vhas_chr2.16b
addp vend.2d, vend.2d, vend.2d
mov synd, vend.d[0]
/* We're not out of data, loop if we haven't found the character */
- cbz synd, .Lloop
+ cbz synd, L(loop)
-.Lend:
+L(end):
/* Termination condition found, let's calculate the syndrome value */
and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
@@ -132,9 +112,9 @@ ENTRY(memchr_default)
addp vend.16b, vend.16b, vend.16b /* 128->64 */
mov synd, vend.d[0]
/* Only do the clear for the last possible block */
- b.hi .Ltail
+ b.hs L(tail)
-.Lmasklast:
+L(masklast):
/* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */
add tmp, cntrem, soff
and tmp, tmp, #31
@@ -143,7 +123,7 @@ ENTRY(memchr_default)
lsl synd, synd, tmp
lsr synd, synd, tmp
-.Ltail:
+L(tail):
/* Count the trailing zeros using bit reversing */
rbit synd, synd
/* Compensate the last post-increment */
@@ -158,7 +138,9 @@ ENTRY(memchr_default)
csel result, xzr, result, eq
ret
-.Lzero_length:
- mov result, xzr
+L(zero_length):
+ mov result, #0
ret
-END(memchr_default)
+
+END (memchr_default)
+
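
[Editor's note: the "syndrome" technique the memchr comments describe has
a scalar analogue that is easier to follow in C: XOR each word with the
repeated search byte so matching bytes become 0x00, flag zero bytes with
the (x - 0x01..01) & ~x & 0x80..80 identity, and count trailing zeros to
locate the first match. A little-endian sketch with a hypothetical name,
not the NEON routine itself:]

#include <stddef.h>
#include <stdint.h>
#include <string.h>

void *memchr_sketch(const void *s, int c, size_t n) {
    const unsigned char *p = s;
    uint64_t rep = 0x0101010101010101ULL * (unsigned char)c;

    while (n >= 8) {
        uint64_t x;
        memcpy(&x, p, 8);                     /* unaligned-safe load */
        x ^= rep;                             /* matching bytes -> 0x00 */
        uint64_t synd = (x - 0x0101010101010101ULL) & ~x
                        & 0x8080808080808080ULL;
        if (synd)                             /* lowest set bit = first match */
            return (void *)(p + (__builtin_ctzll(synd) >> 3));
        p += 8;
        n -= 8;
    }
    for (; n; n--, p++)
        if (*p == (unsigned char)c)
            return (void *)p;
    return NULL;
}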
diff --git a/libc/arch-arm64/default/bionic/strchr.S b/libc/arch-arm64/default/bionic/strchr.S
index f8cb724cc..ae522e81d 100644
--- a/libc/arch-arm64/default/bionic/strchr.S
+++ b/libc/arch-arm64/default/bionic/strchr.S
@@ -1,32 +1,9 @@
/*
+ * strchr - find a character in a string
*
- Copyright (c) 2014, ARM Limited
- All rights Reserved.
- Copyright (c) 2014, Linaro Ltd.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- * Neither the name of the company nor the names of its contributors
- may be used to endorse or promote products derived from this
- software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
+ * Copyright (c) 2014-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
/* Assumptions:
*
@@ -73,18 +50,19 @@
/* Locals and temporaries. */
-ENTRY(strchr_default)
- /* Magic constant 0x40100401 to allow us to identify which lane
- matches the requested byte. Magic constant 0x80200802 used
- similarly for NUL termination. */
- mov wtmp2, #0x0401
- movk wtmp2, #0x4010, lsl #16
+ENTRY (strchr_default)
+ PTR_ARG (0)
+ /* Magic constant 0xc0300c03 to allow us to identify which lane
+ matches the requested byte. Even bits are set if the character
+ matches, odd bits if either the char is NUL or matches. */
+ mov wtmp2, 0x0c03
+ movk wtmp2, 0xc030, lsl 16
dup vrepchr.16b, chrin
bic src, srcin, #31 /* Work with aligned 32-byte hunks. */
dup vrepmask_c.4s, wtmp2
ands tmp1, srcin, #31
add vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */
- b.eq .Lloop
+ b.eq L(loop)
/* Input string is not 32-byte aligned. Rather than forcing
the padding bytes to a safe value, we calculate the syndrome
@@ -96,12 +74,10 @@ ENTRY(strchr_default)
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
cmeq vhas_nul2.16b, vdata2.16b, #0
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
- and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
- and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
- and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
- and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
- orr vend1.16b, vhas_nul1.16b, vhas_chr1.16b
- orr vend2.16b, vhas_nul2.16b, vhas_chr2.16b
+ bif vhas_nul1.16b, vhas_chr1.16b, vrepmask_0.16b
+ bif vhas_nul2.16b, vhas_chr2.16b, vrepmask_0.16b
+ and vend1.16b, vhas_nul1.16b, vrepmask_c.16b
+ and vend2.16b, vhas_nul2.16b, vrepmask_c.16b
lsl tmp1, tmp1, #1
addp vend1.16b, vend1.16b, vend2.16b // 256->128
mov tmp3, #~0
@@ -110,35 +86,30 @@ ENTRY(strchr_default)
mov tmp3, vend1.d[0]
bic tmp1, tmp3, tmp1 // Mask padding bits.
- cbnz tmp1, .Ltail
+ cbnz tmp1, L(tail)
-.Lloop:
+ .p2align 4
+L(loop):
ld1 {vdata1.16b, vdata2.16b}, [src], #32
- cmeq vhas_nul1.16b, vdata1.16b, #0
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
- cmeq vhas_nul2.16b, vdata2.16b, #0
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
- /* Use a fast check for the termination condition. */
- orr vend1.16b, vhas_nul1.16b, vhas_chr1.16b
- orr vend2.16b, vhas_nul2.16b, vhas_chr2.16b
- orr vend1.16b, vend1.16b, vend2.16b
- addp vend1.2d, vend1.2d, vend1.2d
+ cmhs vhas_nul1.16b, vhas_chr1.16b, vdata1.16b
+ cmhs vhas_nul2.16b, vhas_chr2.16b, vdata2.16b
+ orr vend1.16b, vhas_nul1.16b, vhas_nul2.16b
+ umaxp vend1.16b, vend1.16b, vend1.16b
mov tmp1, vend1.d[0]
- cbz tmp1, .Lloop
+ cbz tmp1, L(loop)
/* Termination condition found. Now need to establish exactly why
we terminated. */
- and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
- and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
- and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
- and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
- orr vend1.16b, vhas_nul1.16b, vhas_chr1.16b
- orr vend2.16b, vhas_nul2.16b, vhas_chr2.16b
+ bif vhas_nul1.16b, vhas_chr1.16b, vrepmask_0.16b
+ bif vhas_nul2.16b, vhas_chr2.16b, vrepmask_0.16b
+ and vend1.16b, vhas_nul1.16b, vrepmask_c.16b
+ and vend2.16b, vhas_nul2.16b, vrepmask_c.16b
addp vend1.16b, vend1.16b, vend2.16b // 256->128
addp vend1.16b, vend1.16b, vend2.16b // 128->64
-
mov tmp1, vend1.d[0]
-.Ltail:
+L(tail):
/* Count the trailing zeros, by bit reversing... */
rbit tmp1, tmp1
/* Re-bias source. */
@@ -150,4 +121,6 @@ ENTRY(strchr_default)
add result, src, tmp1, lsr #1
csel result, result, xzr, eq
ret
-END(strchr_default)
+
+END (strchr_default)
+
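
[Editor's note: the even/odd bit split in the new 0xc0300c03 mask exists
to distinguish "found the character" from "hit the terminator first".
The contract the assembly must preserve is easiest to state as a C
reference model (hypothetical name), including the edge case where the
search character is '\0' itself:]

#include <stddef.h>

char *strchr_sketch(const char *s, int c) {
    for (;; s++) {
        if (*s == (char)c)
            return (char *)s;    /* match; also covers c == '\0' */
        if (*s == '\0')
            return NULL;         /* terminator reached, no match */
    }
}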
diff --git a/libc/arch-arm64/default/bionic/strcmp.S b/libc/arch-arm64/default/bionic/strcmp.S
index dfac7c42c..418b4c894 100644
--- a/libc/arch-arm64/default/bionic/strcmp.S
+++ b/libc/arch-arm64/default/bionic/strcmp.S
@@ -1,29 +1,9 @@
-/* Copyright (c) 2012, Linaro Limited
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- * Neither the name of the Linaro nor the
- names of its contributors may be used to endorse or promote products
- derived from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
+/*
+ * strcmp - compare two strings
+ *
+ * Copyright (c) 2012-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
/* Assumptions:
*
@@ -32,8 +12,6 @@
#include <private/bionic_asm.h>
-#define L(label) .L ## label
-
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
#define REP8_80 0x8080808080808080
@@ -58,8 +36,9 @@
#define pos x11
/* Start of performance-critical section -- one 64B cache line. */
-ENTRY(strcmp_default)
-.p2align 6
+ENTRY (strcmp_default)
+ PTR_ARG (0)
+ PTR_ARG (1)
eor tmp1, src1, src2
mov zeroones, #REP8_01
tst tmp1, #7
@@ -189,4 +168,6 @@ L(loop_misaligned):
L(done):
sub result, data1, data2
ret
-END(strcmp_default)
+
+END (strcmp_default)
+
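
[Editor's note: the REP8 constants implement the word-at-a-time NUL test
used throughout these routines: (X - REP8_01) & ~(X | REP8_7f) is
non-zero iff X contains a zero byte. The sub/orr/bics triple in the
aligned loop evaluates exactly this expression; as a C sketch:]

#include <stdint.h>

#define REP8_01 0x0101010101010101ULL
#define REP8_7f 0x7f7f7f7f7f7f7f7fULL

/* Non-zero iff x contains a 0x00 byte. */
static inline uint64_t has_nul(uint64_t x) {
    return (x - REP8_01) & ~(x | REP8_7f);
}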
diff --git a/libc/arch-arm64/default/bionic/strlen.S b/libc/arch-arm64/default/bionic/strlen.S
index 07c5294c2..d102dec2d 100644
--- a/libc/arch-arm64/default/bionic/strlen.S
+++ b/libc/arch-arm64/default/bionic/strlen.S
@@ -1,105 +1,88 @@
-/* Copyright (c) 2013-2015, Linaro Limited
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- * Neither the name of the Linaro nor the
- names of its contributors may be used to endorse or promote products
- derived from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
+/*
+ * strlen - calculate the length of a string.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
/* Assumptions:
*
- * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
+ * Not MTE compatible.
*/
#include <private/bionic_asm.h>
-/* To test the page crossing code path more thoroughly, compile with
- -DTEST_PAGE_CROSS - this will force all calls through the slower
- entry path. This option is not intended for production use. */
-
-/* Arguments and results. */
-#define srcin x0
-#define len x0
-
-/* Locals and temporaries. */
-#define src x1
-#define data1 x2
-#define data2 x3
-#define has_nul1 x4
-#define has_nul2 x5
-#define tmp1 x4
-#define tmp2 x5
-#define tmp3 x6
-#define tmp4 x7
-#define zeroones x8
-
-#define L(l) .L ## l
-
- /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
- (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
- can be done in parallel across the entire word. A faster check
- (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives
- false hits for characters 129..255. */
+#define srcin x0
+#define len x0
+
+#define src x1
+#define data1 x2
+#define data2 x3
+#define has_nul1 x4
+#define has_nul2 x5
+#define tmp1 x4
+#define tmp2 x5
+#define tmp3 x6
+#define tmp4 x7
+#define zeroones x8
+
+#define maskv v0
+#define maskd d0
+#define dataq1 q1
+#define dataq2 q2
+#define datav1 v1
+#define datav2 v2
+#define tmp x2
+#define tmpw w2
+#define synd x3
+#define shift x4
+
+/* For the first 32 bytes, NUL detection works on the principle that
+ (X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero if a
+ byte is zero, and can be done in parallel across the entire word. */
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
-#define REP8_80 0x8080808080808080
+
+/* To test the page crossing code path more thoroughly, compile with
+ -DTEST_PAGE_CROSS - this will force all calls through the slower
+ entry path. This option is not intended for production use. */
#ifdef TEST_PAGE_CROSS
-# define MIN_PAGE_SIZE 15
+# define MIN_PAGE_SIZE 32
#else
# define MIN_PAGE_SIZE 4096
#endif
- /* Since strings are short on average, we check the first 16 bytes
- of the string for a NUL character. In order to do an unaligned ldp
- safely we have to do a page cross check first. If there is a NUL
- byte we calculate the length from the 2 8-byte words using
- conditional select to reduce branch mispredictions (it is unlikely
- strlen will be repeatedly called on strings with the same length).
-
- If the string is longer than 16 bytes, we align src so don't need
- further page cross checks, and process 32 bytes per iteration
- using the fast NUL check. If we encounter non-ASCII characters,
- fallback to a second loop using the full NUL check.
-
- If the page cross check fails, we read 16 bytes from an aligned
- address, remove any characters before the string, and continue
- in the main loop using aligned loads. Since strings crossing a
- page in the first 16 bytes are rare (probability of
- 16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized.
-
- AArch64 systems have a minimum page size of 4k. We don't bother
- checking for larger page sizes - the cost of setting up the correct
- page size is just not worth the extra gain from a small reduction in
- the cases taking the slow path. Note that we only care about
- whether the first fetch, which may be misaligned, crosses a page
- boundary. */
-
-ENTRY(strlen_default)
+/* Core algorithm:
+
+ Since strings are short on average, we check the first 32 bytes of the
+ string for a NUL character without aligning the string. In order to use
+ unaligned loads safely we must do a page cross check first.
+
+ If there is a NUL byte we calculate the length from the 2 8-byte words
+ using conditional select to reduce branch mispredictions (it is unlikely
+ strlen will be repeatedly called on strings with the same length).
+
+ If the string is longer than 32 bytes, align src so we don't need further
+ page cross checks, and process 32 bytes per iteration using a fast SIMD
+ loop.
+
+ If the page cross check fails, we read 32 bytes from an aligned address,
+ and ignore any characters before the string. If it contains a NUL
+ character, return the length, if not, continue in the main loop. */
+
+ENTRY (strlen_default)
+ PTR_ARG (0)
and tmp1, srcin, MIN_PAGE_SIZE - 1
- mov zeroones, REP8_01
- cmp tmp1, MIN_PAGE_SIZE - 16
- b.gt L(page_cross)
+ cmp tmp1, MIN_PAGE_SIZE - 32
+ b.hi L(page_cross)
+
+ /* Look for a NUL byte in the first 16 bytes. */
ldp data1, data2, [srcin]
+ mov zeroones, REP8_01
+
#ifdef __AARCH64EB__
/* For big-endian, carry propagation (if the final byte in the
string is 0x01) means we cannot use has_nul1/2 directly.
@@ -115,113 +98,103 @@ ENTRY(strlen_default)
bics has_nul1, tmp1, tmp2
bic has_nul2, tmp3, tmp4
ccmp has_nul2, 0, 0, eq
- beq L(main_loop_entry)
+ b.eq L(bytes16_31)
- /* Enter with C = has_nul1 == 0. */
+ /* Find the exact offset of the first NUL byte in the first 16 bytes
+ from the string start. Enter with C = has_nul1 == 0. */
csel has_nul1, has_nul1, has_nul2, cc
mov len, 8
rev has_nul1, has_nul1
- clz tmp1, has_nul1
csel len, xzr, len, cc
+ clz tmp1, has_nul1
add len, len, tmp1, lsr 3
ret
- /* The inner loop processes 32 bytes per iteration and uses the fast
- NUL check. If we encounter non-ASCII characters, use a second
- loop with the accurate NUL check. */
- .p2align 4
-L(main_loop_entry):
- bic src, srcin, 15
- sub src, src, 16
-L(main_loop):
- ldp data1, data2, [src, 32]!
-.Lpage_cross_entry:
- sub tmp1, data1, zeroones
- sub tmp3, data2, zeroones
- orr tmp2, tmp1, tmp3
- tst tmp2, zeroones, lsl 7
- bne 1f
- ldp data1, data2, [src, 16]
+ .p2align 3
+ /* Look for a NUL byte at offset 16..31 in the string. */
+L(bytes16_31):
+ ldp data1, data2, [srcin, 16]
+#ifdef __AARCH64EB__
+ rev data1, data1
+ rev data2, data2
+#endif
sub tmp1, data1, zeroones
- sub tmp3, data2, zeroones
- orr tmp2, tmp1, tmp3
- tst tmp2, zeroones, lsl 7
- beq L(main_loop)
- add src, src, 16
-1:
- /* The fast check failed, so do the slower, accurate NUL check. */
orr tmp2, data1, REP8_7f
+ sub tmp3, data2, zeroones
orr tmp4, data2, REP8_7f
bics has_nul1, tmp1, tmp2
bic has_nul2, tmp3, tmp4
ccmp has_nul2, 0, 0, eq
- beq L(nonascii_loop)
+ b.eq L(loop_entry)
- /* Enter with C = has_nul1 == 0. */
-L(tail):
-#ifdef __AARCH64EB__
- /* For big-endian, carry propagation (if the final byte in the
- string is 0x01) means we cannot use has_nul1/2 directly. The
- easiest way to get the correct byte is to byte-swap the data
- and calculate the syndrome a second time. */
- csel data1, data1, data2, cc
- rev data1, data1
- sub tmp1, data1, zeroones
- orr tmp2, data1, REP8_7f
- bic has_nul1, tmp1, tmp2
-#else
+ /* Find the exact offset of the first NUL byte at offset 16..31 from
+ the string start. Enter with C = has_nul1 == 0. */
csel has_nul1, has_nul1, has_nul2, cc
-#endif
- sub len, src, srcin
+ mov len, 24
rev has_nul1, has_nul1
- add tmp2, len, 8
+ mov tmp3, 16
clz tmp1, has_nul1
- csel len, len, tmp2, cc
+ csel len, tmp3, len, cc
add len, len, tmp1, lsr 3
ret
-L(nonascii_loop):
- ldp data1, data2, [src, 16]!
- sub tmp1, data1, zeroones
- orr tmp2, data1, REP8_7f
- sub tmp3, data2, zeroones
- orr tmp4, data2, REP8_7f
- bics has_nul1, tmp1, tmp2
- bic has_nul2, tmp3, tmp4
- ccmp has_nul2, 0, 0, eq
- bne L(tail)
- ldp data1, data2, [src, 16]!
- sub tmp1, data1, zeroones
- orr tmp2, data1, REP8_7f
- sub tmp3, data2, zeroones
- orr tmp4, data2, REP8_7f
- bics has_nul1, tmp1, tmp2
- bic has_nul2, tmp3, tmp4
- ccmp has_nul2, 0, 0, eq
- beq L(nonascii_loop)
- b L(tail)
+L(loop_entry):
+ bic src, srcin, 31
- /* Load 16 bytes from [srcin & ~15] and force the bytes that precede
- srcin to 0x7f, so we ignore any NUL bytes before the string.
- Then continue in the aligned loop. */
-L(page_cross):
- bic src, srcin, 15
- ldp data1, data2, [src]
- lsl tmp1, srcin, 3
- mov tmp4, -1
+ .p2align 5
+L(loop):
+ ldp dataq1, dataq2, [src, 32]!
+ uminp maskv.16b, datav1.16b, datav2.16b
+ uminp maskv.16b, maskv.16b, maskv.16b
+ cmeq maskv.8b, maskv.8b, 0
+ fmov synd, maskd
+ cbz synd, L(loop)
+
+ /* Low 32 bits of synd are non-zero if a NUL was found in datav1. */
+ cmeq maskv.16b, datav1.16b, 0
+ sub len, src, srcin
+ tst synd, 0xffffffff
+ b.ne 1f
+ cmeq maskv.16b, datav2.16b, 0
+ add len, len, 16
+1:
+ /* Generate a bitmask and compute correct byte offset. */
#ifdef __AARCH64EB__
- /* Big-endian. Early bytes are at MSB. */
- lsr tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */
+ bic maskv.8h, 0xf0
#else
- /* Little-endian. Early bytes are at LSB. */
- lsl tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */
+ bic maskv.8h, 0x0f, lsl 8
#endif
- orr tmp1, tmp1, REP8_80
- orn data1, data1, tmp1
- orn tmp2, data2, tmp1
- tst srcin, 8
- csel data1, data1, tmp4, eq
- csel data2, data2, tmp2, eq
- b L(page_cross_entry)
-
-END(strlen_default)
+ umaxp maskv.16b, maskv.16b, maskv.16b
+ fmov synd, maskd
+#ifndef __AARCH64EB__
+ rbit synd, synd
+#endif
+ clz tmp, synd
+ add len, len, tmp, lsr 2
+ ret
+
+ .p2align 4
+
+L(page_cross):
+ bic src, srcin, 31
+ mov tmpw, 0x0c03
+ movk tmpw, 0xc030, lsl 16
+ ld1 {datav1.16b, datav2.16b}, [src]
+ dup maskv.4s, tmpw
+ cmeq datav1.16b, datav1.16b, 0
+ cmeq datav2.16b, datav2.16b, 0
+ and datav1.16b, datav1.16b, maskv.16b
+ and datav2.16b, datav2.16b, maskv.16b
+ addp maskv.16b, datav1.16b, datav2.16b
+ addp maskv.16b, maskv.16b, maskv.16b
+ fmov synd, maskd
+ lsl shift, srcin, 1
+ lsr synd, synd, shift
+ cbz synd, L(loop)
+
+ rbit synd, synd
+ clz len, synd
+ lsr len, len, 1
+ ret
+
+END (strlen_default)
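
[Editor's note: the page-cross guard at the entry point checks whether a
32-byte unaligned load starting at srcin could cross a minimum-size
page. The and/cmp/b.hi sequence corresponds to this C condition (sketch
under the same MIN_PAGE_SIZE assumption):]

#include <stdint.h>

#define MIN_PAGE_SIZE 4096

/* Non-zero iff a 32-byte load starting at s may cross a page boundary. */
static inline int load32_crosses_page(const char *s) {
    return ((uintptr_t)s & (MIN_PAGE_SIZE - 1)) > (MIN_PAGE_SIZE - 32);
}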
diff --git a/libc/arch-arm64/default/bionic/strncmp.S b/libc/arch-arm64/default/bionic/strncmp.S
index 5432b738e..68c6e6526 100644
--- a/libc/arch-arm64/default/bionic/strncmp.S
+++ b/libc/arch-arm64/default/bionic/strncmp.S
@@ -1,29 +1,9 @@
-/* Copyright (c) 2014, Linaro Limited
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- * Neither the name of the Linaro nor the
- names of its contributors may be used to endorse or promote products
- derived from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
+/*
+ * strncmp - compare two strings
+ *
+ * Copyright (c) 2013-2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
/* Assumptions:
*
@@ -60,19 +40,17 @@
#define endloop x15
#define count mask
- .text
- .p2align 6
- .rep 7
- nop /* Pad so that the loop below fits a cache line. */
- .endr
-ENTRY(strncmp_default)
- cbz limit, .Lret0
+ENTRY (strncmp_default)
+ PTR_ARG (0)
+ PTR_ARG (1)
+ SIZE_ARG (2)
+ cbz limit, L(ret0)
eor tmp1, src1, src2
mov zeroones, #REP8_01
tst tmp1, #7
and count, src1, #7
- b.ne .Lmisaligned8
- cbnz count, .Lmutual_align
+ b.ne L(misaligned8)
+ cbnz count, L(mutual_align)
/* Calculate the number of full and partial words -1. */
sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */
@@ -80,11 +58,11 @@ ENTRY(strncmp_default)
/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
(=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
can be done in parallel across the entire word. */
- /* Start of performance-critical section -- one 64B cache line. */
-.Lloop_aligned:
+ .p2align 4
+L(loop_aligned):
ldr data1, [src1], #8
ldr data2, [src2], #8
-.Lstart_realigned:
+L(start_realigned):
subs limit_wd, limit_wd, #1
sub tmp1, data1, zeroones
orr tmp2, data1, #REP8_7f
@@ -92,15 +70,15 @@ ENTRY(strncmp_default)
csinv endloop, diff, xzr, pl /* Last Dword or differences. */
bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
ccmp endloop, #0, #0, eq
- b.eq .Lloop_aligned
- /* End of performance-critical section -- one 64B cache line. */
+ b.eq L(loop_aligned)
+ /* End of main loop */
/* Not reached the limit, must have found the end or a diff. */
- tbz limit_wd, #63, .Lnot_limit
+ tbz limit_wd, #63, L(not_limit)
/* Limit % 8 == 0 => all bytes significant. */
ands limit, limit, #7
- b.eq .Lnot_limit
+ b.eq L(not_limit)
lsl limit, limit, #3 /* Bits -> bytes. */
mov mask, #~0
@@ -115,7 +93,7 @@ ENTRY(strncmp_default)
/* Make sure that the NUL byte is marked in the syndrome. */
orr has_nul, has_nul, mask
-.Lnot_limit:
+L(not_limit):
orr syndrome, diff, has_nul
#ifndef __AARCH64EB__
@@ -168,7 +146,7 @@ ENTRY(strncmp_default)
ret
#endif
-.Lmutual_align:
+L(mutual_align):
/* Sources are mutually aligned, but are not currently at an
alignment boundary. Round down the addresses and then mask off
the bytes that precede the start point.
@@ -196,56 +174,56 @@ ENTRY(strncmp_default)
orr data1, data1, tmp2
orr data2, data2, tmp2
add limit_wd, limit_wd, tmp3, lsr #3
- b .Lstart_realigned
+ b L(start_realigned)
- .p2align 6
+ .p2align 4
/* Don't bother with dwords for up to 16 bytes. */
-.Lmisaligned8:
+L(misaligned8):
cmp limit, #16
- b.hs .Ltry_misaligned_words
+ b.hs L(try_misaligned_words)
-.Lbyte_loop:
+L(byte_loop):
/* Perhaps we can do better than this. */
ldrb data1w, [src1], #1
ldrb data2w, [src2], #1
subs limit, limit, #1
ccmp data1w, #1, #0, hi /* NZCV = 0b0000. */
ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
- b.eq .Lbyte_loop
-.Ldone:
+ b.eq L(byte_loop)
+L(done):
sub result, data1, data2
ret
/* Align the SRC1 to a dword by doing a bytewise compare and then do
the dword loop. */
-.Ltry_misaligned_words:
+L(try_misaligned_words):
lsr limit_wd, limit, #3
- cbz count, .Ldo_misaligned
+ cbz count, L(do_misaligned)
neg count, count
and count, count, #7
sub limit, limit, count
lsr limit_wd, limit, #3
-.Lpage_end_loop:
+L(page_end_loop):
ldrb data1w, [src1], #1
ldrb data2w, [src2], #1
cmp data1w, #1
ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
- b.ne .Ldone
+ b.ne L(done)
subs count, count, #1
- b.hi .Lpage_end_loop
+ b.hi L(page_end_loop)
-.Ldo_misaligned:
+L(do_misaligned):
/* Prepare ourselves for the next page crossing. Unlike the aligned
loop, we fetch 1 less dword because we risk crossing bounds on
SRC2. */
mov count, #8
subs limit_wd, limit_wd, #1
- b.lo .Ldone_loop
-.Lloop_misaligned:
+ b.lo L(done_loop)
+L(loop_misaligned):
and tmp2, src2, #0xff8
eor tmp2, tmp2, #0xff8
- cbz tmp2, .Lpage_end_loop
+ cbz tmp2, L(page_end_loop)
ldr data1, [src1], #8
ldr data2, [src2], #8
@@ -254,14 +232,14 @@ ENTRY(strncmp_default)
eor diff, data1, data2 /* Non-zero if differences found. */
bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
ccmp diff, #0, #0, eq
- b.ne .Lnot_limit
+ b.ne L(not_limit)
subs limit_wd, limit_wd, #1
- b.pl .Lloop_misaligned
+ b.pl L(loop_misaligned)
-.Ldone_loop:
+L(done_loop):
/* We found a difference or a NULL before the limit was reached. */
and limit, limit, #7
- cbz limit, .Lnot_limit
+ cbz limit, L(not_limit)
/* Read the last word. */
sub src1, src1, 8
sub src2, src2, 8
@@ -272,9 +250,11 @@ ENTRY(strncmp_default)
eor diff, data1, data2 /* Non-zero if differences found. */
bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
ccmp diff, #0, #0, eq
- b.ne .Lnot_limit
+ b.ne L(not_limit)
-.Lret0:
+L(ret0):
mov result, #0
ret
-END(strncmp_default)
+
+END (strncmp_default)
+
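
[Editor's note: the dword loops, mutual-alignment fixup and page-end
handling above are optimizations over a simple byte loop; they must
preserve exactly this contract, stated as a C reference model with a
hypothetical name:]

#include <stddef.h>

/* Compare at most n bytes; stop at the first difference or NUL and
   return the difference of the mismatching bytes as unsigned chars. */
int strncmp_sketch(const char *s1, const char *s2, size_t n) {
    for (; n; n--, s1++, s2++) {
        unsigned char c1 = *s1, c2 = *s2;
        if (c1 != c2)
            return (int)c1 - (int)c2;
        if (c1 == '\0')
            break;
    }
    return 0;
}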
diff --git a/libc/arch-arm64/default/bionic/strnlen.S b/libc/arch-arm64/default/bionic/strnlen.S
index 169453207..445661d48 100644
--- a/libc/arch-arm64/default/bionic/strnlen.S
+++ b/libc/arch-arm64/default/bionic/strnlen.S
@@ -1,174 +1,112 @@
-/* Copyright (c) 2014, Linaro Limited
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- * Neither the name of the Linaro nor the
- names of its contributors may be used to endorse or promote products
- derived from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
+/*
+ * strnlen - calculate the length of a string with limit.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
/* Assumptions:
*
- * ARMv8-a, AArch64
+ * ARMv8-a, AArch64, Advanced SIMD.
+ * MTE compatible.
*/
#include <private/bionic_asm.h>
-/* Arguments and results. */
#define srcin x0
-#define len x0
-#define limit x1
+#define cntin x1
+#define result x0
-/* Locals and temporaries. */
#define src x2
-#define data1 x3
-#define data2 x4
-#define data2a x5
-#define has_nul1 x6
-#define has_nul2 x7
-#define tmp1 x8
-#define tmp2 x9
-#define tmp3 x10
-#define tmp4 x11
-#define zeroones x12
-#define pos x13
-#define limit_wd x14
-
-#define REP8_01 0x0101010101010101
-#define REP8_7f 0x7f7f7f7f7f7f7f7f
-#define REP8_80 0x8080808080808080
-
- .text
- .p2align 6
-.Lstart:
- /* Pre-pad to ensure critical loop begins an icache line. */
- .rep 7
- nop
- .endr
- /* Put this code here to avoid wasting more space with pre-padding. */
-.Lhit_limit:
- mov len, limit
+#define synd x3
+#define shift x4
+#define wtmp w4
+#define tmp x4
+#define cntrem x5
+
+#define qdata q0
+#define vdata v0
+#define vhas_chr v1
+#define vrepmask v2
+#define vend v3
+#define dend d3
+
+/*
+ Core algorithm:
+
+ For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
+ per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
+ requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
+ set likewise for odd bytes so that adjacent bytes can be merged. Since the
+ bits in the syndrome reflect the order in which things occur in the original
+ string, counting trailing zeros identifies exactly which byte matched. */
+
+ENTRY (strnlen_default)
+ PTR_ARG (0)
+ SIZE_ARG (1)
+ bic src, srcin, 15
+ mov wtmp, 0xf00f
+ cbz cntin, L(nomatch)
+ ld1 {vdata.16b}, [src], 16
+ dup vrepmask.8h, wtmp
+ cmeq vhas_chr.16b, vdata.16b, 0
+ lsl shift, srcin, 2
+ and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
+ addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ fmov synd, dend
+ lsr synd, synd, shift
+ cbz synd, L(start_loop)
+L(finish):
+ rbit synd, synd
+ clz synd, synd
+ lsr result, synd, 2
+ cmp cntin, result
+ csel result, cntin, result, ls
ret
-ENTRY(strnlen_default)
- cbz limit, .Lhit_limit
- mov zeroones, #REP8_01
- bic src, srcin, #15
- ands tmp1, srcin, #15
- b.ne .Lmisaligned
- /* Calculate the number of full and partial words -1. */
- sub limit_wd, limit, #1 /* Limit != 0, so no underflow. */
- lsr limit_wd, limit_wd, #4 /* Convert to Qwords. */
-
- /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
- (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
- can be done in parallel across the entire word. */
- /* The inner loop deals with two Dwords at a time. This has a
- slightly higher start-up cost, but we should win quite quickly,
- especially on cores with a high number of issue slots per
- cycle, as we get much better parallelism out of the operations. */
-
- /* Start of critial section -- keep to one 64Byte cache line. */
-.Lloop:
- ldp data1, data2, [src], #16
-.Lrealigned:
- sub tmp1, data1, zeroones
- orr tmp2, data1, #REP8_7f
- sub tmp3, data2, zeroones
- orr tmp4, data2, #REP8_7f
- bic has_nul1, tmp1, tmp2
- bic has_nul2, tmp3, tmp4
- subs limit_wd, limit_wd, #1
- orr tmp1, has_nul1, has_nul2
- ccmp tmp1, #0, #0, pl /* NZCV = 0000 */
- b.eq .Lloop
- /* End of critical section -- keep to one 64Byte cache line. */
-
- orr tmp1, has_nul1, has_nul2
- cbz tmp1, .Lhit_limit /* No null in final Qword. */
-
- /* We know there's a null in the final Qword. The easiest thing
- to do now is work out the length of the string and return
- MIN (len, limit). */
-
- sub len, src, srcin
- cbz has_nul1, .Lnul_in_data2
-#ifdef __AARCH64EB__
- mov data2, data1
+L(start_loop):
+ sub tmp, src, srcin
+ subs cntrem, cntin, tmp
+ b.ls L(nomatch)
+
+ /* Make sure that it won't overread by a 16-byte chunk */
+ add tmp, cntrem, 15
+ tbnz tmp, 4, L(loop32_2)
+
+ .p2align 5
+L(loop32):
+ ldr qdata, [src], 16
+ cmeq vhas_chr.16b, vdata.16b, 0
+ umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ fmov synd, dend
+ cbnz synd, L(end)
+L(loop32_2):
+ ldr qdata, [src], 16
+ subs cntrem, cntrem, 32
+ cmeq vhas_chr.16b, vdata.16b, 0
+ b.ls L(end)
+ umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ fmov synd, dend
+ cbz synd, L(loop32)
+
+L(end):
+ and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
+ addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ sub src, src, 16
+ mov synd, vend.d[0]
+ sub result, src, srcin
+#ifndef __AARCH64EB__
+ rbit synd, synd
#endif
- sub len, len, #8
- mov has_nul2, has_nul1
-.Lnul_in_data2:
-#ifdef __AARCH64EB__
- /* For big-endian, carry propagation (if the final byte in the
- string is 0x01) means we cannot use has_nul directly. The
- easiest way to get the correct byte is to byte-swap the data
- and calculate the syndrome a second time. */
- rev data2, data2
- sub tmp1, data2, zeroones
- orr tmp2, data2, #REP8_7f
- bic has_nul2, tmp1, tmp2
-#endif
- sub len, len, #8
- rev has_nul2, has_nul2
- clz pos, has_nul2
- add len, len, pos, lsr #3 /* Bits to bytes. */
- cmp len, limit
- csel len, len, limit, ls /* Return the lower value. */
+ clz synd, synd
+ add result, result, synd, lsr 2
+ cmp cntin, result
+ csel result, cntin, result, ls
ret
-.Lmisaligned:
- /* Deal with a partial first word.
- We're doing two things in parallel here;
- 1) Calculate the number of words (but avoiding overflow if
- limit is near ULONG_MAX) - to do this we need to work out
- limit + tmp1 - 1 as a 65-bit value before shifting it;
- 2) Load and mask the initial data words - we force the bytes
- before the ones we are interested in to 0xff - this ensures
- early bytes will not hit any zero detection. */
- sub limit_wd, limit, #1
- neg tmp4, tmp1
- cmp tmp1, #8
-
- and tmp3, limit_wd, #15
- lsr limit_wd, limit_wd, #4
- mov tmp2, #~0
-
- ldp data1, data2, [src], #16
- lsl tmp4, tmp4, #3 /* Bytes beyond alignment -> bits. */
- add tmp3, tmp3, tmp1
-
-#ifdef __AARCH64EB__
- /* Big-endian. Early bytes are at MSB. */
- lsl tmp2, tmp2, tmp4 /* Shift (tmp1 & 63). */
-#else
- /* Little-endian. Early bytes are at LSB. */
- lsr tmp2, tmp2, tmp4 /* Shift (tmp1 & 63). */
-#endif
- add limit_wd, limit_wd, tmp3, lsr #4
+L(nomatch):
+ mov result, cntin
+ ret
- orr data1, data1, tmp2
- orr data2a, data2, tmp2
+END (strnlen_default)
- csinv data1, data1, xzr, le
- csel data2, data2, data2a, le
- b .Lrealigned
-END(strnlen_default)
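
[Editor's note: whichever path the new strnlen exits through, the
cmp cntin / csel pair clamps the computed length to the limit, so the
observable behavior is min(scan length, maxlen). Reference model in C
(hypothetical name):]

#include <stddef.h>

size_t strnlen_sketch(const char *s, size_t maxlen) {
    size_t i = 0;
    while (i < maxlen && s[i] != '\0')
        i++;
    return i;            /* never exceeds maxlen */
}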
diff --git a/libc/arch-arm64/generic/bionic/memcmp.S b/libc/arch-arm64/generic/bionic/memcmp.S
index bff54aea3..fae0d4226 100644
--- a/libc/arch-arm64/generic/bionic/memcmp.S
+++ b/libc/arch-arm64/generic/bionic/memcmp.S
@@ -1,29 +1,7 @@
-/*
- * Copyright (c) 2017 ARM Ltd
- * All rights reserved.
+/* memcmp - compare memory
*
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. The name of the company may not be used to endorse or promote
- * products derived from this software without specific prior written
- * permission.
- *
- * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
- * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
- * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * Copyright (c) 2013-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
*/
/* Assumptions:
@@ -33,8 +11,6 @@
#include <private/bionic_asm.h>
-#define L(l) .L ## l
-
/* Parameters and result. */
#define src1 x0
#define src2 x1
@@ -51,20 +27,13 @@
#define tmp1 x7
#define tmp2 x8
-/* Small inputs of less than 8 bytes are handled separately. This allows the
- main code to be speed up using unaligned loads since there are now at least
- 8 bytes to be compared. If the first 8 bytes are equal, align src1.
- This ensures each iteration does at most one unaligned access even if both
- src1 and src2 are unaligned, and mutually aligned inputs behave as if
- aligned. After the main loop, process the last 16 bytes using unaligned
- accesses. */
-
-ENTRY(memcmp)
-.p2align 6
+ENTRY (memcmp)
+ PTR_ARG (0)
+ PTR_ARG (1)
+ SIZE_ARG (2)
subs limit, limit, 8
b.lo L(less8)
- /* Limit >= 8, so check first 8 bytes using unaligned loads. */
ldr data1, [src1], 8
ldr data2, [src2], 8
cmp data1, data2
@@ -164,4 +133,5 @@ L(byte_loop):
sub result, data1w, data2w
ret
-END(memcmp)
+END (memcmp)
+
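
[Editor's note: once the word loop finds data1 != data2, the result is
derived from the first differing byte. On little-endian that can be
modelled in C as below -- XOR isolates the difference, a trailing-zero
count rounded down to a byte boundary gives the offset of the first
mismatching byte. Hypothetical helper, not the routine's exact tail:]

#include <stdint.h>

static inline int cmp_words_le(uint64_t data1, uint64_t data2) {
    uint64_t diff = data1 ^ data2;
    if (diff == 0)
        return 0;
    unsigned shift = __builtin_ctzll(diff) & ~7u;  /* bit offset of byte */
    unsigned char b1 = (unsigned char)(data1 >> shift);
    unsigned char b2 = (unsigned char)(data2 >> shift);
    return (int)b1 - (int)b2;
}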
diff --git a/libc/arch-arm64/generic/bionic/memcpy.S b/libc/arch-arm64/generic/bionic/memcpy.S
index baadb9204..fc487d3a4 100644
--- a/libc/arch-arm64/generic/bionic/memcpy.S
+++ b/libc/arch-arm64/generic/bionic/memcpy.S
@@ -30,6 +30,6 @@
#include <private/bionic_asm.h>
-ENTRY(__memcpy)
+ENTRY(memcpy)
#include "memcpy_base.S"
-END(__memcpy)
+END(memcpy)
diff --git a/libc/arch-arm64/generic/bionic/memcpy_base.S b/libc/arch-arm64/generic/bionic/memcpy_base.S
index f85062408..f77e4dd35 100644
--- a/libc/arch-arm64/generic/bionic/memcpy_base.S
+++ b/libc/arch-arm64/generic/bionic/memcpy_base.S
@@ -1,60 +1,13 @@
-/* Copyright (c) 2012-2013, Linaro Limited
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- * Neither the name of the Linaro nor the
- names of its contributors may be used to endorse or promote products
- derived from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
-
/*
- * Copyright (c) 2015 ARM Ltd
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. The name of the company may not be used to endorse or promote
- * products derived from this software without specific prior written
- * permission.
+ * memcpy - copy memory area
*
- * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
- * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
- * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * Copyright (c) 2019-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
*/
/* Assumptions:
*
- * ARMv8-a, AArch64, unaligned accesses.
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
*
*/
@@ -69,149 +22,181 @@
#define A_l x6
#define A_lw w6
#define A_h x7
-#define A_hw w7
#define B_l x8
-#define B_lw w8
+#define B_lw w8
#define B_h x9
-#define C_l x10
-#define C_h x11
-#define D_l x12
-#define D_h x13
-#define E_l src
-#define E_h count
-#define F_l srcend
-#define F_h dst
-#define tmp1 x9
-
-#define L(l) .L ## l
-
-/* Copies are split into 3 main cases: small copies of up to 16 bytes,
- medium copies of 17..96 bytes which are fully unrolled. Large copies
- of more than 96 bytes align the destination and use an unrolled loop
- processing 64 bytes per iteration.
- Small and medium copies read all data before writing, allowing any
- kind of overlap, and memmove tailcalls memcpy for these cases as
- well as non-overlapping copies.
+#define C_lw w10
+#define tmp1 x14
+
+#define A_q q0
+#define B_q q1
+#define C_q q2
+#define D_q q3
+#define E_q q4
+#define F_q q5
+#define G_q q6
+#define H_q q7
+
+/* This implementation handles overlaps and supports both memcpy and memmove
+ from a single entry point. It uses unaligned accesses and branchless
+ sequences to keep the code small, simple and improve performance.
+
+ Copies are split into 3 main cases: small copies of up to 32 bytes, medium
+ copies of up to 128 bytes, and large copies. The overhead of the overlap
+ check is negligible since it is only required for large copies.
+
+ Large copies use a software pipelined loop processing 64 bytes per iteration.
+ The source pointer is 16-byte aligned to minimize unaligned accesses.
+ The loop tail is handled by always copying 64 bytes from the end.
*/
- prfm PLDL1KEEP, [src]
+ PTR_ARG (0)
+ PTR_ARG (1)
+ SIZE_ARG (2)
add srcend, src, count
add dstend, dstin, count
- cmp count, 16
- b.ls L(copy16)
- cmp count, 96
+ cmp count, 128
b.hi L(copy_long)
-
- /* Medium copies: 17..96 bytes. */
- sub tmp1, count, 1
- ldp A_l, A_h, [src]
- tbnz tmp1, 6, L(copy96)
- ldp D_l, D_h, [srcend, -16]
- tbz tmp1, 5, 1f
- ldp B_l, B_h, [src, 16]
- ldp C_l, C_h, [srcend, -32]
- stp B_l, B_h, [dstin, 16]
- stp C_l, C_h, [dstend, -32]
-1:
- stp A_l, A_h, [dstin]
- stp D_l, D_h, [dstend, -16]
+ cmp count, 32
+ b.hi L(copy32_128)
+
+ /* Small copies: 0..32 bytes. */
+ cmp count, 16
+ b.lo L(copy16)
+ ldr A_q, [src]
+ ldr B_q, [srcend, -16]
+ str A_q, [dstin]
+ str B_q, [dstend, -16]
ret
- .p2align 4
-
- /* Small copies: 0..16 bytes. */
+ /* Copy 8-15 bytes. */
L(copy16):
- cmp count, 8
- b.lo 1f
+ tbz count, 3, L(copy8)
ldr A_l, [src]
ldr A_h, [srcend, -8]
str A_l, [dstin]
str A_h, [dstend, -8]
ret
- .p2align 4
-1:
- tbz count, 2, 1f
+
+ .p2align 3
+ /* Copy 4-7 bytes. */
+L(copy8):
+ tbz count, 2, L(copy4)
ldr A_lw, [src]
- ldr A_hw, [srcend, -4]
+ ldr B_lw, [srcend, -4]
str A_lw, [dstin]
- str A_hw, [dstend, -4]
+ str B_lw, [dstend, -4]
ret
- /* Copy 0..3 bytes. Use a branchless sequence that copies the same
- byte 3 times if count==1, or the 2nd byte twice if count==2. */
-1:
- cbz count, 2f
+ /* Copy 0..3 bytes using a branchless sequence. */
+L(copy4):
+ cbz count, L(copy0)
lsr tmp1, count, 1
ldrb A_lw, [src]
- ldrb A_hw, [srcend, -1]
+ ldrb C_lw, [srcend, -1]
ldrb B_lw, [src, tmp1]
strb A_lw, [dstin]
strb B_lw, [dstin, tmp1]
- strb A_hw, [dstend, -1]
-2: ret
+ strb C_lw, [dstend, -1]
+L(copy0):
+ ret
.p2align 4
- /* Copy 64..96 bytes. Copy 64 bytes from the start and
- 32 bytes from the end. */
-L(copy96):
- ldp B_l, B_h, [src, 16]
- ldp C_l, C_h, [src, 32]
- ldp D_l, D_h, [src, 48]
- ldp E_l, E_h, [srcend, -32]
- ldp F_l, F_h, [srcend, -16]
- stp A_l, A_h, [dstin]
- stp B_l, B_h, [dstin, 16]
- stp C_l, C_h, [dstin, 32]
- stp D_l, D_h, [dstin, 48]
- stp E_l, E_h, [dstend, -32]
- stp F_l, F_h, [dstend, -16]
+ /* Medium copies: 33..128 bytes. */
+L(copy32_128):
+ ldp A_q, B_q, [src]
+ ldp C_q, D_q, [srcend, -32]
+ cmp count, 64
+ b.hi L(copy128)
+ stp A_q, B_q, [dstin]
+ stp C_q, D_q, [dstend, -32]
ret
- /* Align DST to 16 byte alignment so that we don't cross cache line
- boundaries on both loads and stores. There are at least 96 bytes
- to copy, so copy 16 bytes unaligned and then align. The loop
- copies 64 bytes per iteration and prefetches one iteration ahead. */
-
.p2align 4
+ /* Copy 65..128 bytes. */
+L(copy128):
+ ldp E_q, F_q, [src, 32]
+ cmp count, 96
+ b.ls L(copy96)
+ ldp G_q, H_q, [srcend, -64]
+ stp G_q, H_q, [dstend, -64]
+L(copy96):
+ stp A_q, B_q, [dstin]
+ stp E_q, F_q, [dstin, 32]
+ stp C_q, D_q, [dstend, -32]
+ ret
+
+ /* Copy more than 128 bytes. */
L(copy_long):
- and tmp1, dstin, 15
- bic dst, dstin, 15
- ldp D_l, D_h, [src]
- sub src, src, tmp1
+ /* Use backwards copy if there is an overlap. */
+ sub tmp1, dstin, src
+ cmp tmp1, count
+ b.lo L(copy_long_backwards)
+
+ /* Copy 16 bytes and then align src to 16-byte alignment. */
+ ldr D_q, [src]
+ and tmp1, src, 15
+ bic src, src, 15
+ sub dst, dstin, tmp1
add count, count, tmp1 /* Count is now 16 too large. */
- ldp A_l, A_h, [src, 16]
- stp D_l, D_h, [dstin]
- ldp B_l, B_h, [src, 32]
- ldp C_l, C_h, [src, 48]
- ldp D_l, D_h, [src, 64]!
+ ldp A_q, B_q, [src, 16]
+ str D_q, [dstin]
+ ldp C_q, D_q, [src, 48]
subs count, count, 128 + 16 /* Test and readjust count. */
- b.ls 2f
-1:
- stp A_l, A_h, [dst, 16]
- ldp A_l, A_h, [src, 16]
- stp B_l, B_h, [dst, 32]
- ldp B_l, B_h, [src, 32]
- stp C_l, C_h, [dst, 48]
- ldp C_l, C_h, [src, 48]
- stp D_l, D_h, [dst, 64]!
- ldp D_l, D_h, [src, 64]!
+ b.ls L(copy64_from_end)
+L(loop64):
+ stp A_q, B_q, [dst, 16]
+ ldp A_q, B_q, [src, 80]
+ stp C_q, D_q, [dst, 48]
+ ldp C_q, D_q, [src, 112]
+ add src, src, 64
+ add dst, dst, 64
subs count, count, 64
- b.hi 1b
-
- /* Write the last full set of 64 bytes. The remainder is at most 64
- bytes, so it is safe to always copy 64 bytes from the end even if
- there is just 1 byte left. */
-2:
- ldp E_l, E_h, [srcend, -64]
- stp A_l, A_h, [dst, 16]
- ldp A_l, A_h, [srcend, -48]
- stp B_l, B_h, [dst, 32]
- ldp B_l, B_h, [srcend, -32]
- stp C_l, C_h, [dst, 48]
- ldp C_l, C_h, [srcend, -16]
- stp D_l, D_h, [dst, 64]
- stp E_l, E_h, [dstend, -64]
- stp A_l, A_h, [dstend, -48]
- stp B_l, B_h, [dstend, -32]
- stp C_l, C_h, [dstend, -16]
+ b.hi L(loop64)
+
+ /* Write the last iteration and copy 64 bytes from the end. */
+L(copy64_from_end):
+ ldp E_q, F_q, [srcend, -64]
+ stp A_q, B_q, [dst, 16]
+ ldp A_q, B_q, [srcend, -32]
+ stp C_q, D_q, [dst, 48]
+ stp E_q, F_q, [dstend, -64]
+ stp A_q, B_q, [dstend, -32]
ret
+
+ /* Large backwards copy for overlapping copies.
+ Copy 16 bytes and then align srcend to 16-byte alignment. */
+L(copy_long_backwards):
+ cbz tmp1, L(copy0)
+ ldr D_q, [srcend, -16]
+ and tmp1, srcend, 15
+ bic srcend, srcend, 15
+ sub count, count, tmp1
+ ldp A_q, B_q, [srcend, -32]
+ str D_q, [dstend, -16]
+ ldp C_q, D_q, [srcend, -64]
+ sub dstend, dstend, tmp1
+ subs count, count, 128
+ b.ls L(copy64_from_start)
+
+L(loop64_backwards):
+ str B_q, [dstend, -16]
+ str A_q, [dstend, -32]
+ ldp A_q, B_q, [srcend, -96]
+ str D_q, [dstend, -48]
+ str C_q, [dstend, -64]!
+ ldp C_q, D_q, [srcend, -128]
+ sub srcend, srcend, 64
+ subs count, count, 64
+ b.hi L(loop64_backwards)
+
+ /* Write the last iteration and copy 64 bytes from the start. */
+L(copy64_from_start):
+ ldp E_q, F_q, [src, 32]
+ stp A_q, B_q, [dstend, -32]
+ ldp A_q, B_q, [src]
+ stp C_q, D_q, [dstend, -64]
+ stp E_q, F_q, [dstin, 32]
+ stp A_q, B_q, [dstin]
+ ret
+
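
[Editor's note: the new memcpy_base serves memcpy and memmove from one
entry point; for small and medium sizes that works because every load is
issued before any store, reading from both ends of the buffer. A C
sketch of the 16..32-byte case (hypothetical helper, mirrors the
ldr A_q / ldr B_q / str A_q / str B_q pattern):]

#include <stddef.h>
#include <string.h>

/* Overlap-safe copy for 16 <= count <= 32: read both 16-byte halves
   before writing either. */
static void copy16_32(char *dst, const char *src, size_t count) {
    char a[16], b[16];
    memcpy(a, src, 16);
    memcpy(b, src + count - 16, 16);
    memcpy(dst, a, 16);
    memcpy(dst + count - 16, b, 16);
}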
diff --git a/libc/arch-arm64/generic/bionic/memmove.S b/libc/arch-arm64/generic/bionic/memmove.S
index 335b7d6ce..ad4f31f25 100644
--- a/libc/arch-arm64/generic/bionic/memmove.S
+++ b/libc/arch-arm64/generic/bionic/memmove.S
@@ -1,155 +1,33 @@
-/* Copyright (c) 2013, Linaro Limited
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- * Neither the name of the Linaro nor the
- names of its contributors may be used to endorse or promote products
- derived from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
-
/*
- * Copyright (c) 2015 ARM Ltd
+ * Copyright (C) 2008 The Android Open Source Project
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
- * 1. Redistributions of source code must retain the above copyright
+ * * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. The name of the company may not be used to endorse or promote
- * products derived from this software without specific prior written
- * permission.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
*
- * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
- * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
- * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-/* Assumptions:
- *
- * ARMv8-a, AArch64, unaligned accesses, wchar_t is 4 bytes
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
*/
#include <private/bionic_asm.h>
-/* Parameters and result. */
-#define dstin x0
-#define src x1
-#define count x2
-#define srcend x3
-#define dstend x4
-#define tmp1 x5
-#define A_l x6
-#define A_h x7
-#define B_l x8
-#define B_h x9
-#define C_l x10
-#define C_h x11
-#define D_l x12
-#define D_h x13
-#define E_l count
-#define E_h tmp1
-
-/* All memmoves up to 96 bytes are done by memcpy as it supports overlaps.
- Larger backwards copies are also handled by memcpy. The only remaining
- case is forward large copies. The destination is aligned, and an
- unrolled loop processes 64 bytes per iteration.
-*/
-
-#if defined(WMEMMOVE)
-ENTRY(wmemmove)
- lsl count, count, #2
-#else
ENTRY(memmove)
-#endif
- sub tmp1, dstin, src
- cmp count, 96
- ccmp tmp1, count, 2, hi
- b.hs __memcpy
-
- cbz tmp1, 3f
- add dstend, dstin, count
- add srcend, src, count
-
- /* Align dstend to 16 byte alignment so that we don't cross cache line
- boundaries on both loads and stores. There are at least 96 bytes
- to copy, so copy 16 bytes unaligned and then align. The loop
- copies 64 bytes per iteration and prefetches one iteration ahead. */
-
- and tmp1, dstend, 15
- ldp D_l, D_h, [srcend, -16]
- sub srcend, srcend, tmp1
- sub count, count, tmp1
- ldp A_l, A_h, [srcend, -16]
- stp D_l, D_h, [dstend, -16]
- ldp B_l, B_h, [srcend, -32]
- ldp C_l, C_h, [srcend, -48]
- ldp D_l, D_h, [srcend, -64]!
- sub dstend, dstend, tmp1
- subs count, count, 128
- b.ls 2f
- nop
-1:
- stp A_l, A_h, [dstend, -16]
- ldp A_l, A_h, [srcend, -16]
- stp B_l, B_h, [dstend, -32]
- ldp B_l, B_h, [srcend, -32]
- stp C_l, C_h, [dstend, -48]
- ldp C_l, C_h, [srcend, -48]
- stp D_l, D_h, [dstend, -64]!
- ldp D_l, D_h, [srcend, -64]!
- subs count, count, 64
- b.hi 1b
-
- /* Write the last full set of 64 bytes. The remainder is at most 64
- bytes, so it is safe to always copy 64 bytes from the start even if
- there is just 1 byte left. */
-2:
- ldp E_l, E_h, [src, 48]
- stp A_l, A_h, [dstend, -16]
- ldp A_l, A_h, [src, 32]
- stp B_l, B_h, [dstend, -32]
- ldp B_l, B_h, [src, 16]
- stp C_l, C_h, [dstend, -48]
- ldp C_l, C_h, [src]
- stp D_l, D_h, [dstend, -64]
- stp E_l, E_h, [dstin, 48]
- stp A_l, A_h, [dstin, 32]
- stp B_l, B_h, [dstin, 16]
- stp C_l, C_h, [dstin]
-3: ret
-
-#if defined(WMEMMOVE)
-END(wmemmove)
-#else
+ #include "memcpy_base.S"
END(memmove)
-
-ALIAS_SYMBOL(memcpy, memmove)
-#endif
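With the old forward-only memmove body deleted, memmove is now just another entry point over memcpy_base.S, which performs the overlap check itself (see L(copy_long_backwards) above). A C sketch of that dispatch (illustrative names, assuming the same unsigned-difference trick as the assembly's tmp1 = dstin - src):

    #include <stddef.h>
    #include <stdint.h>

    static void *move_sketch(void *dstin, const void *src, size_t count)
    {
        unsigned char *d = dstin;
        const unsigned char *s = src;
        /* dstin - src, taken as an unsigned value, is < count exactly
           when dst lies inside [src, src + count): a forward copy would
           trample unread source bytes, so copy backwards instead. */
        if ((uintptr_t)d - (uintptr_t)s < count) {
            while (count--)
                d[count] = s[count];
        } else {
            for (size_t i = 0; i < count; i++)
                d[i] = s[i];
        }
        return dstin;
    }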
diff --git a/libc/arch-arm64/generic/bionic/memrchr.S b/libc/arch-arm64/generic/bionic/memrchr.S
new file mode 100644
index 000000000..9ab232775
--- /dev/null
+++ b/libc/arch-arm64/generic/bionic/memrchr.S
@@ -0,0 +1,117 @@
+/*
+ * memrchr - find last character in a memory zone.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD.
+ * MTE compatible.
+ */
+
+#include <private/bionic_asm.h>
+
+#define srcin x0
+#define chrin w1
+#define cntin x2
+#define result x0
+
+#define src x3
+#define cntrem x4
+#define synd x5
+#define shift x6
+#define tmp x7
+#define wtmp w7
+#define end x8
+#define endm1 x9
+
+#define vrepchr v0
+#define qdata q1
+#define vdata v1
+#define vhas_chr v2
+#define vrepmask v3
+#define vend v4
+#define dend d4
+
+/*
+ Core algorithm:
+
+ For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
+ per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
+ requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
+ set likewise for odd bytes so that adjacent bytes can be merged. Since the
+ bits in the syndrome reflect the order in which things occur in the original
+ string, counting trailing zeros identifies exactly which byte matched. */
+
+ENTRY (memrchr)
+ PTR_ARG (0)
+ add end, srcin, cntin
+ sub endm1, end, 1
+ bic src, endm1, 15
+ cbz cntin, L(nomatch)
+ ld1 {vdata.16b}, [src]
+ dup vrepchr.16b, chrin
+ mov wtmp, 0xf00f
+ dup vrepmask.8h, wtmp
+ cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
+ neg shift, end, lsl 2
+ and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
+ addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ fmov synd, dend
+ lsl synd, synd, shift
+ cbz synd, L(start_loop)
+
+ clz synd, synd
+ sub result, endm1, synd, lsr 2
+ cmp cntin, synd, lsr 2
+ csel result, result, xzr, hi
+ ret
+
+L(start_loop):
+ sub tmp, end, src
+ subs cntrem, cntin, tmp
+ b.ls L(nomatch)
+
+ /* Make sure that it won't overread by a 16-byte chunk */
+ add tmp, cntrem, 15
+ tbnz tmp, 4, L(loop32_2)
+
+ .p2align 4
+L(loop32):
+ ldr qdata, [src, -16]!
+ cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
+ umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ fmov synd, dend
+ cbnz synd, L(end)
+
+L(loop32_2):
+ ldr qdata, [src, -16]!
+ subs cntrem, cntrem, 32
+ cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
+ b.ls L(end)
+ umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ fmov synd, dend
+ cbz synd, L(loop32)
+L(end):
+ and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
+ addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ fmov synd, dend
+
+ add tmp, src, 15
+#ifdef __AARCH64EB__
+ rbit synd, synd
+#endif
+ clz synd, synd
+ sub tmp, tmp, synd, lsr 2
+ cmp tmp, srcin
+ csel result, tmp, xzr, hs
+ ret
+
+L(nomatch):
+ mov result, 0
+ ret
+
+END (memrchr)
+
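A note on the syndrome here: despite the boilerplate wording in the comment above ("or the byte is NUL"), the single cmeq only marks bytes equal to the requested character, and because memrchr scans backwards the routine counts leading rather than trailing zeros. A scalar C model of the 4-bits-per-byte syndrome (illustrative only; the real code builds it with cmeq/and/addp):

    #include <stdint.h>

    /* Nibble i of the syndrome is 0xf iff byte i of the chunk matches. */
    static uint64_t syndrome16(const unsigned char *chunk, unsigned char c)
    {
        uint64_t synd = 0;
        for (int i = 0; i < 16; i++)
            if (chunk[i] == c)
                synd |= 0xfull << (4 * i);
        return synd;
    }

    /* Index of the last match in the chunk, or -1: count leading zeros
       and divide by four, mirroring "clz synd" followed by
       "sub result, endm1, synd, lsr 2". */
    static int last_match16(const unsigned char *chunk, unsigned char c)
    {
        uint64_t synd = syndrome16(chunk, c);
        if (synd == 0)
            return -1;
        return 15 - (__builtin_clzll(synd) >> 2);
    }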
diff --git a/libc/arch-arm64/generic/bionic/memset.S b/libc/arch-arm64/generic/bionic/memset.S
index 12fc09db8..0f54f9d79 100644
--- a/libc/arch-arm64/generic/bionic/memset.S
+++ b/libc/arch-arm64/generic/bionic/memset.S
@@ -1,89 +1,25 @@
-/* Copyright (c) 2012-2013, Linaro Limited
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- * Neither the name of the Linaro nor the
- names of its contributors may be used to endorse or promote products
- derived from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
-
/*
- * Copyright (c) 2015 ARM Ltd
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. The name of the company may not be used to endorse or promote
- * products derived from this software without specific prior written
- * permission.
+ * memset - fill memory with a constant byte
*
- * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
- * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
- * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * Copyright (c) 2012-2021, Arm Limited.
+ * SPDX-License-Identifier: MIT
*/
/* Assumptions:
*
- * ARMv8-a, AArch64, unaligned accesses
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
*
*/
#include <private/bionic_asm.h>
-/* By default we assume that the DC instruction can be used to zero
- data blocks more efficiently. In some circumstances this might be
- unsafe, for example in an asymmetric multiprocessor environment with
- different DC clear lengths (neither the upper nor lower lengths are
- safe to use).
-
- If code may be run in a virtualized environment, then define
- MAYBE_VIRT. This will cause the code to cache the system register
- values rather than re-reading them each call. */
-
-#define dstin x0
-#define val x1
-#define valw w1
-#define count x2
-#define dst x3
-#define dstend x4
-#define tmp1 x5
-#define tmp1w w5
-#define tmp2 x6
-#define tmp2w w6
-#define zva_len x7
-#define zva_lenw w7
-
-#define L(l) .L ## l
+#define dstin x0
+#define val x1
+#define valw w1
+#define count x2
+#define dst x3
+#define dstend x4
+#define zva_val x5
ENTRY(__memset_chk)
cmp count, dst
@@ -98,7 +34,9 @@ ENTRY(__memset_chk)
bl __memset_chk_fail
END(__memset_chk)
-ENTRY(memset)
+ENTRY (memset)
+ PTR_ARG (0)
+ SIZE_ARG (2)
dup v0.16B, valw
add dstend, dstin, count
@@ -114,7 +52,7 @@ ENTRY(memset)
str val, [dstin]
str val, [dstend, -8]
ret
- nop
+ .p2align 4
1: tbz count, 2, 2f
str valw, [dstin]
str valw, [dstend, -4]
@@ -144,108 +82,49 @@ L(set96):
stp q0, q0, [dstend, -32]
ret
- .p2align 3
- nop
+ .p2align 4
L(set_long):
and valw, valw, 255
bic dst, dstin, 15
str q0, [dstin]
- cmp count, 256
- ccmp valw, 0, 0, cs
- b.eq L(try_zva)
-L(no_zva):
- sub count, dstend, dst /* Count is 16 too large. */
- add dst, dst, 16
- sub count, count, 64 + 16 /* Adjust count and bias for loop. */
-1: stp q0, q0, [dst], 64
- stp q0, q0, [dst, -32]
-L(tail64):
- subs count, count, 64
- b.hi 1b
-2: stp q0, q0, [dstend, -64]
- stp q0, q0, [dstend, -32]
- ret
-
- .p2align 3
-L(try_zva):
- mrs tmp1, dczid_el0
- tbnz tmp1w, 4, L(no_zva)
- and tmp1w, tmp1w, 15
- cmp tmp1w, 4 /* ZVA size is 64 bytes. */
- b.ne L(zva_128)
-
- /* Write the first and last 64 byte aligned block using stp rather
- than using DC ZVA. This is faster on some cores.
- */
-L(zva_64):
+ cmp count, 160
+ ccmp valw, 0, 0, hs
+ b.ne L(no_zva)
+
+#ifndef SKIP_ZVA_CHECK
+ mrs zva_val, dczid_el0
+ and zva_val, zva_val, 31
+ cmp zva_val, 4 /* ZVA size is 64 bytes. */
+ b.ne L(no_zva)
+#endif
str q0, [dst, 16]
stp q0, q0, [dst, 32]
bic dst, dst, 63
- stp q0, q0, [dst, 64]
- stp q0, q0, [dst, 96]
- sub count, dstend, dst /* Count is now 128 too large. */
- sub count, count, 128+64+64 /* Adjust count and bias for loop. */
- add dst, dst, 128
- nop
-1: dc zva, dst
+ sub count, dstend, dst /* Count is now 64 too large. */
+ sub count, count, 128 /* Adjust count and bias for loop. */
+
+ .p2align 4
+L(zva_loop):
add dst, dst, 64
+ dc zva, dst
subs count, count, 64
- b.hi 1b
- stp q0, q0, [dst, 0]
- stp q0, q0, [dst, 32]
+ b.hi L(zva_loop)
stp q0, q0, [dstend, -64]
stp q0, q0, [dstend, -32]
ret
- .p2align 3
-L(zva_128):
- cmp tmp1w, 5 /* ZVA size is 128 bytes. */
- b.ne L(zva_other)
-
- str q0, [dst, 16]
+L(no_zva):
+ sub count, dstend, dst /* Count is 16 too large. */
+ sub dst, dst, 16 /* Dst is biased by -32. */
+ sub count, count, 64 + 16 /* Adjust count and bias for loop. */
+L(no_zva_loop):
stp q0, q0, [dst, 32]
- stp q0, q0, [dst, 64]
- stp q0, q0, [dst, 96]
- bic dst, dst, 127
- sub count, dstend, dst /* Count is now 128 too large. */
- sub count, count, 128+128 /* Adjust count and bias for loop. */
- add dst, dst, 128
-1: dc zva, dst
- add dst, dst, 128
- subs count, count, 128
- b.hi 1b
- stp q0, q0, [dstend, -128]
- stp q0, q0, [dstend, -96]
+ stp q0, q0, [dst, 64]!
+ subs count, count, 64
+ b.hi L(no_zva_loop)
stp q0, q0, [dstend, -64]
stp q0, q0, [dstend, -32]
ret
-L(zva_other):
- mov tmp2w, 4
- lsl zva_lenw, tmp2w, tmp1w
- add tmp1, zva_len, 64 /* Max alignment bytes written. */
- cmp count, tmp1
- blo L(no_zva)
-
- sub tmp2, zva_len, 1
- add tmp1, dst, zva_len
- add dst, dst, 16
- subs count, tmp1, dst /* Actual alignment bytes to write. */
- bic tmp1, tmp1, tmp2 /* Aligned dc zva start address. */
- beq 2f
-1: stp q0, q0, [dst], 64
- stp q0, q0, [dst, -32]
- subs count, count, 64
- b.hi 1b
-2: mov dst, tmp1
- sub count, dstend, tmp1 /* Remaining bytes to write. */
- subs count, count, zva_len
- b.lo 4f
-3: dc zva, dst
- add dst, dst, zva_len
- subs count, count, zva_len
- b.hs 3b
-4: add count, count, zva_len
- b L(tail64)
-
-END(memset)
+END (memset)
+
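The rewritten memset drops the old zva_128/zva_other fallbacks: DC ZVA is now attempted only when zeroing at least 160 bytes (cmp count, 160 / ccmp valw, 0), and only if DCZID_EL0 reports an enabled 64-byte zero line. A sketch of that probe (AArch64-only, illustrative; the assembly folds both tests into one compare against bits 4:0):

    #include <stdint.h>

    static int zva_is_64_bytes(void)
    {
        uint64_t dczid;
        __asm__("mrs %0, dczid_el0" : "=r"(dczid));
        if (dczid & 16)            /* DZP bit set: DC ZVA prohibited */
            return 0;
        /* Bits 3:0 give log2 of the block size in 4-byte words,
           so the value 4 means 2^4 * 4 = 64 bytes. */
        return (dczid & 15) == 4;
    }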
diff --git a/libc/arch-arm64/generic/bionic/stpcpy.S b/libc/arch-arm64/generic/bionic/stpcpy.S
index e4a799387..4f62aa462 100644
--- a/libc/arch-arm64/generic/bionic/stpcpy.S
+++ b/libc/arch-arm64/generic/bionic/stpcpy.S
@@ -1,29 +1,10 @@
/*
- * Copyright (C) 2014 The Android Open Source Project
- * All rights reserved.
+ * stpcpy - copy a string returning pointer to end.
*
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
- * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
- * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
*/
-#define STPCPY
-#include "string_copy.S"
+
+#define BUILD_STPCPY 1
+
+#include "strcpy.S"
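The old scheme assembled string_copy.S twice under STPCPY/STRCPY; the new one builds strcpy.S with BUILD_STPCPY defined to get the second entry point. The same pattern in C terms (hypothetical names, illustration only):

    /* One body, two entry points differing only in the return value. */
    #ifdef BUILD_STPCPY
    #define NAME my_stpcpy
    #else
    #define NAME my_strcpy
    #endif

    char *NAME(char *dst, const char *src)
    {
        char *d = dst;
        while ((*d = *src++) != '\0')
            d++;
    #ifdef BUILD_STPCPY
        return d;      /* stpcpy: pointer to the terminating NUL */
    #else
        return dst;    /* strcpy: pointer to the start */
    #endif
    }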
diff --git a/libc/arch-arm64/generic/bionic/strchrnul.S b/libc/arch-arm64/generic/bionic/strchrnul.S
new file mode 100644
index 000000000..3fd909c15
--- /dev/null
+++ b/libc/arch-arm64/generic/bionic/strchrnul.S
@@ -0,0 +1,114 @@
+/*
+ * strchrnul - find a character or nul in a string
+ *
+ * Copyright (c) 2014-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * Neon Available.
+ */
+
+#include <private/bionic_asm.h>
+
+/* Arguments and results. */
+#define srcin x0
+#define chrin w1
+
+#define result x0
+
+#define src x2
+#define tmp1 x3
+#define wtmp2 w4
+#define tmp3 x5
+
+#define vrepchr v0
+#define vdata1 v1
+#define vdata2 v2
+#define vhas_nul1 v3
+#define vhas_nul2 v4
+#define vhas_chr1 v5
+#define vhas_chr2 v6
+#define vrepmask v7
+#define vend1 v16
+
+/* Core algorithm.
+
+ For each 32-byte hunk we calculate a 64-bit syndrome value, with
+ two bits per byte (LSB is always in bits 0 and 1, for both big
+ and little-endian systems). For each tuple, bit 0 is set iff
+ the relevant byte matched the requested character or nul. Since the
+ bits in the syndrome reflect exactly the order in which things occur
+ in the original string a count_trailing_zeros() operation will
+ identify exactly which byte is causing the termination. */
+
+/* Locals and temporaries. */
+
+ENTRY (strchrnul)
+ PTR_ARG (0)
+ /* Magic constant 0x40100401 to allow us to identify which lane
+ matches the termination condition. */
+ mov wtmp2, #0x0401
+ movk wtmp2, #0x4010, lsl #16
+ dup vrepchr.16b, chrin
+ bic src, srcin, #31 /* Work with aligned 32-byte hunks. */
+ dup vrepmask.4s, wtmp2
+ ands tmp1, srcin, #31
+ b.eq L(loop)
+
+ /* Input string is not 32-byte aligned. Rather than forcing
+ the padding bytes to a safe value, we calculate the syndrome
+ for all the bytes, but then mask off those bits of the
+ syndrome that are related to the padding. */
+ ld1 {vdata1.16b, vdata2.16b}, [src], #32
+ neg tmp1, tmp1
+ cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
+ cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
+ cmhs vhas_nul1.16b, vhas_chr1.16b, vdata1.16b
+ cmhs vhas_nul2.16b, vhas_chr2.16b, vdata2.16b
+ and vhas_chr1.16b, vhas_nul1.16b, vrepmask.16b
+ and vhas_chr2.16b, vhas_nul2.16b, vrepmask.16b
+ lsl tmp1, tmp1, #1
+ addp vend1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
+ mov tmp3, #~0
+ addp vend1.16b, vend1.16b, vend1.16b // 128->64
+ lsr tmp1, tmp3, tmp1
+
+ mov tmp3, vend1.d[0]
+ bic tmp1, tmp3, tmp1 // Mask padding bits.
+ cbnz tmp1, L(tail)
+
+ .p2align 4
+L(loop):
+ ld1 {vdata1.16b, vdata2.16b}, [src], #32
+ cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
+ cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
+ cmhs vhas_nul1.16b, vhas_chr1.16b, vdata1.16b
+ cmhs vhas_nul2.16b, vhas_chr2.16b, vdata2.16b
+ orr vend1.16b, vhas_nul1.16b, vhas_nul2.16b
+ umaxp vend1.16b, vend1.16b, vend1.16b
+ mov tmp1, vend1.d[0]
+ cbz tmp1, L(loop)
+
+ /* Termination condition found. Now need to establish exactly why
+ we terminated. */
+ and vhas_chr1.16b, vhas_nul1.16b, vrepmask.16b
+ and vhas_chr2.16b, vhas_nul2.16b, vrepmask.16b
+ addp vend1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
+ addp vend1.16b, vend1.16b, vend1.16b // 128->64
+
+ mov tmp1, vend1.d[0]
+L(tail):
+ /* Count the trailing zeros, by bit reversing... */
+ rbit tmp1, tmp1
+ /* Re-bias source. */
+ sub src, src, #32
+ clz tmp1, tmp1 /* ... and counting the leading zeros. */
+ /* tmp1 is twice the offset into the fragment. */
+ add result, src, tmp1, lsr #1
+ ret
+
+END (strchrnul)
+
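For a misaligned start, strchrnul computes the syndrome for the whole aligned 32-byte hunk and then clears the bits belonging to bytes before the string, rather than patching the input bytes. A scalar model of the 2-bits-per-byte syndrome and the padding mask (illustrative; the real code uses rbit+clz where ctz appears below):

    #include <stdint.h>

    /* Bit 2*i is set iff byte i matches the target or is NUL. */
    static uint64_t syndrome32(const unsigned char *hunk, unsigned char c,
                               unsigned skip /* bytes before the string */)
    {
        uint64_t synd = 0;
        for (int i = 0; i < 32; i++)
            if (hunk[i] == c || hunk[i] == 0)
                synd |= 1ull << (2 * i);
        if (skip)  /* mask padding bits, as in "bic tmp1, tmp3, tmp1" */
            synd &= ~(~0ull >> (64 - 2 * skip));
        return synd;
    }

    /* Offset of the first match within the hunk, or -1. */
    static int first_match32(uint64_t synd)
    {
        return synd ? __builtin_ctzll(synd) >> 1 : -1;
    }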
diff --git a/libc/arch-arm64/generic/bionic/strcpy.S b/libc/arch-arm64/generic/bionic/strcpy.S
index 260c32138..54da47c0b 100644
--- a/libc/arch-arm64/generic/bionic/strcpy.S
+++ b/libc/arch-arm64/generic/bionic/strcpy.S
@@ -1,29 +1,311 @@
/*
- * Copyright (C) 2014 The Android Open Source Project
- * All rights reserved.
+ * strcpy/stpcpy - copy a string returning pointer to start/end.
*
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
+ * Copyright (c) 2013-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
*
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
- * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
- * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
+ * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
*/
-#define STRCPY
-#include "string_copy.S"
+
+#include <private/bionic_asm.h>
+
+/* To build as stpcpy, define BUILD_STPCPY before compiling this file.
+
+ To test the page crossing code path more thoroughly, compile with
+ -DSTRCPY_TEST_PAGE_CROSS - this will force all copies through the slower
+ entry path. This option is not intended for production use. */
+
+/* Arguments and results. */
+#define dstin x0
+#define srcin x1
+
+/* Locals and temporaries. */
+#define src x2
+#define dst x3
+#define data1 x4
+#define data1w w4
+#define data2 x5
+#define data2w w5
+#define has_nul1 x6
+#define has_nul2 x7
+#define tmp1 x8
+#define tmp2 x9
+#define tmp3 x10
+#define tmp4 x11
+#define zeroones x12
+#define data1a x13
+#define data2a x14
+#define pos x15
+#define len x16
+#define to_align x17
+
+#ifdef BUILD_STPCPY
+#define STRCPY stpcpy
+#else
+#define STRCPY strcpy
+#endif
+
+ /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+ (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+ can be done in parallel across the entire word. */
+
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+#define REP8_80 0x8080808080808080
+
+ /* AArch64 systems have a minimum page size of 4k. We can do a quick
+ page size check for crossing this boundary on entry and if we
+ do not, then we can short-circuit much of the entry code. We
+ expect early page-crossing strings to be rare (probability of
+ 16/MIN_PAGE_SIZE ~= 0.4%), so the branch should be quite
+ predictable, even with random strings.
+
+ We don't bother checking for larger page sizes, the cost of setting
+ up the correct page size is just not worth the extra gain from
+ a small reduction in the cases taking the slow path. Note that
+ we only care about whether the first fetch, which may be
+ misaligned, crosses a page boundary - after that we move to aligned
+ fetches for the remainder of the string. */
+
+#ifdef STRCPY_TEST_PAGE_CROSS
+ /* Make everything that isn't Qword aligned look like a page cross. */
+#define MIN_PAGE_P2 4
+#else
+#define MIN_PAGE_P2 12
+#endif
+
+#define MIN_PAGE_SIZE (1 << MIN_PAGE_P2)
+
+ENTRY (STRCPY)
+ PTR_ARG (0)
+ PTR_ARG (1)
+ /* For moderately short strings, the fastest way to do the copy is to
+ calculate the length of the string in the same way as strlen, then
+ essentially do a memcpy of the result. This avoids the need for
+ multiple byte copies and further means that by the time we
+ reach the bulk copy loop we know we can always use DWord
+ accesses. We expect __strcpy_aarch64 to rarely be called repeatedly
+ with the same source string, so branch prediction is likely to
+ always be difficult - we mitigate against this by preferring
+ conditional select operations over branches whenever this is
+ feasible. */
+ and tmp2, srcin, #(MIN_PAGE_SIZE - 1)
+ mov zeroones, #REP8_01
+ and to_align, srcin, #15
+ cmp tmp2, #(MIN_PAGE_SIZE - 16)
+ neg tmp1, to_align
+ /* The first fetch will straddle a (possible) page boundary iff
+ srcin + 15 causes bit[MIN_PAGE_P2] to change value. A 16-byte
+ aligned string will never fail the page align check, so will
+ always take the fast path. */
+ b.gt L(page_cross)
+
+L(page_cross_ok):
+ ldp data1, data2, [srcin]
+#ifdef __AARCH64EB__
+ /* Because we expect the end to be found within 16 characters
+ (profiling shows this is the most common case), it's worth
+ swapping the bytes now to save having to recalculate the
+ termination syndrome later. We preserve data1 and data2
+ so that we can re-use the values later on. */
+ rev tmp2, data1
+ sub tmp1, tmp2, zeroones
+ orr tmp2, tmp2, #REP8_7f
+ bics has_nul1, tmp1, tmp2
+ b.ne L(fp_le8)
+ rev tmp4, data2
+ sub tmp3, tmp4, zeroones
+ orr tmp4, tmp4, #REP8_7f
+#else
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ bics has_nul1, tmp1, tmp2
+ b.ne L(fp_le8)
+ sub tmp3, data2, zeroones
+ orr tmp4, data2, #REP8_7f
+#endif
+ bics has_nul2, tmp3, tmp4
+ b.eq L(bulk_entry)
+
+ /* The string is short (<=16 bytes). We don't know exactly how
+ short though, yet. Work out the exact length so that we can
+ quickly select the optimal copy strategy. */
+L(fp_gt8):
+ rev has_nul2, has_nul2
+ clz pos, has_nul2
+ mov tmp2, #56
+ add dst, dstin, pos, lsr #3 /* Bits to bytes. */
+ sub pos, tmp2, pos
+#ifdef __AARCH64EB__
+ lsr data2, data2, pos
+#else
+ lsl data2, data2, pos
+#endif
+ str data2, [dst, #1]
+ str data1, [dstin]
+#ifdef BUILD_STPCPY
+ add dstin, dst, #8
+#endif
+ ret
+
+L(fp_le8):
+ rev has_nul1, has_nul1
+ clz pos, has_nul1
+ add dst, dstin, pos, lsr #3 /* Bits to bytes. */
+ subs tmp2, pos, #24 /* Pos in bits. */
+ b.lt L(fp_lt4)
+#ifdef __AARCH64EB__
+ mov tmp2, #56
+ sub pos, tmp2, pos
+ lsr data2, data1, pos
+ lsr data1, data1, #32
+#else
+ lsr data2, data1, tmp2
+#endif
+ /* 4->7 bytes to copy. */
+ str data2w, [dst, #-3]
+ str data1w, [dstin]
+#ifdef BUILD_STPCPY
+ mov dstin, dst
+#endif
+ ret
+L(fp_lt4):
+ cbz pos, L(fp_lt2)
+ /* 2->3 bytes to copy. */
+#ifdef __AARCH64EB__
+ lsr data1, data1, #48
+#endif
+ strh data1w, [dstin]
+ /* Fall-through, one byte (max) to go. */
+L(fp_lt2):
+ /* Null-terminated string. Last character must be zero! */
+ strb wzr, [dst]
+#ifdef BUILD_STPCPY
+ mov dstin, dst
+#endif
+ ret
+
+ .p2align 6
+ /* Aligning here ensures that the entry code and main loop all lie
+ within one 64-byte cache line. */
+L(bulk_entry):
+ sub to_align, to_align, #16
+ stp data1, data2, [dstin]
+ sub src, srcin, to_align
+ sub dst, dstin, to_align
+ b L(entry_no_page_cross)
+
+ /* The inner loop deals with two Dwords at a time. This has a
+ slightly higher start-up cost, but we should win quite quickly,
+ especially on cores with a high number of issue slots per
+ cycle, as we get much better parallelism out of the operations. */
+L(main_loop):
+ stp data1, data2, [dst], #16
+L(entry_no_page_cross):
+ ldp data1, data2, [src], #16
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ sub tmp3, data2, zeroones
+ orr tmp4, data2, #REP8_7f
+ bic has_nul1, tmp1, tmp2
+ bics has_nul2, tmp3, tmp4
+ ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */
+ b.eq L(main_loop)
+
+ /* Since we know we are copying at least 16 bytes, the fastest way
+ to deal with the tail is to determine the location of the
+ trailing NUL, then (re)copy the 16 bytes leading up to that. */
+ cmp has_nul1, #0
+#ifdef __AARCH64EB__
+ /* For big-endian, carry propagation (if the final byte in the
+ string is 0x01) means we cannot use has_nul directly. The
+ easiest way to get the correct byte is to byte-swap the data
+ and calculate the syndrome a second time. */
+ csel data1, data1, data2, ne
+ rev data1, data1
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ bic has_nul1, tmp1, tmp2
+#else
+ csel has_nul1, has_nul1, has_nul2, ne
+#endif
+ rev has_nul1, has_nul1
+ clz pos, has_nul1
+ add tmp1, pos, #72
+ add pos, pos, #8
+ csel pos, pos, tmp1, ne
+ add src, src, pos, lsr #3
+ add dst, dst, pos, lsr #3
+ ldp data1, data2, [src, #-32]
+ stp data1, data2, [dst, #-16]
+#ifdef BUILD_STPCPY
+ sub dstin, dst, #1
+#endif
+ ret
+
+L(page_cross):
+ bic src, srcin, #15
+ /* Start by loading two words at [srcin & ~15], then forcing the
+ bytes that precede srcin to 0xff. This means they never look
+ like termination bytes. */
+ ldp data1, data2, [src]
+ lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
+ tst to_align, #7
+ csetm tmp2, ne
+#ifdef __AARCH64EB__
+ lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
+#else
+ lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
+#endif
+ orr data1, data1, tmp2
+ orr data2a, data2, tmp2
+ cmp to_align, #8
+ csinv data1, data1, xzr, lt
+ csel data2, data2, data2a, lt
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ sub tmp3, data2, zeroones
+ orr tmp4, data2, #REP8_7f
+ bic has_nul1, tmp1, tmp2
+ bics has_nul2, tmp3, tmp4
+ ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */
+ b.eq L(page_cross_ok)
+ /* We now need to make data1 and data2 look like they've been
+ loaded directly from srcin. Do a rotate on the 128-bit value. */
+ lsl tmp1, to_align, #3 /* Bytes->bits. */
+ neg tmp2, to_align, lsl #3
+#ifdef __AARCH64EB__
+ lsl data1a, data1, tmp1
+ lsr tmp4, data2, tmp2
+ lsl data2, data2, tmp1
+ orr tmp4, tmp4, data1a
+ cmp to_align, #8
+ csel data1, tmp4, data2, lt
+ rev tmp2, data1
+ rev tmp4, data2
+ sub tmp1, tmp2, zeroones
+ orr tmp2, tmp2, #REP8_7f
+ sub tmp3, tmp4, zeroones
+ orr tmp4, tmp4, #REP8_7f
+#else
+ lsr data1a, data1, tmp1
+ lsl tmp4, data2, tmp2
+ lsr data2, data2, tmp1
+ orr tmp4, tmp4, data1a
+ cmp to_align, #8
+ csel data1, tmp4, data2, lt
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ sub tmp3, data2, zeroones
+ orr tmp4, data2, #REP8_7f
+#endif
+ bic has_nul1, tmp1, tmp2
+ cbnz has_nul1, L(fp_le8)
+ bic has_nul2, tmp3, tmp4
+ b L(fp_gt8)
+
+END (STRCPY)
+
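The NUL-detection identity quoted in the new strcpy.S comment is the usual word-at-a-time zero-byte test. In C (sketch; the assembly uses rev+clz instead of ctz so the same sequence can serve big-endian):

    #include <stdint.h>

    #define REP8_01 0x0101010101010101ull
    #define REP8_7f 0x7f7f7f7f7f7f7f7full

    /* Non-zero iff some byte of x is zero; matches "sub tmp1, data1,
       zeroones; orr tmp2, data1, #REP8_7f; bics has_nul1, tmp1, tmp2". */
    static uint64_t has_nul(uint64_t x)
    {
        return (x - REP8_01) & ~(x | REP8_7f);
    }

    /* Byte index of the first NUL in a little-endian word: the surviving
       bits sit at bit 7 of each byte, so ctz/8 is the index.  Assumes
       the word is known to contain a NUL byte. */
    static unsigned first_nul_index(uint64_t word_le)
    {
        return (unsigned)(__builtin_ctzll(has_nul(word_le)) >> 3);
    }

The MIN_PAGE_SIZE entry check relies on the same layout: a 16-byte first fetch can only fault if (srcin & 4095) > 4096 - 16, which is what "cmp tmp2, #(MIN_PAGE_SIZE - 16); b.gt L(page_cross)" tests.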
diff --git a/libc/arch-arm64/generic/bionic/string_copy.S b/libc/arch-arm64/generic/bionic/string_copy.S
deleted file mode 100644
index 2bf969d60..000000000
--- a/libc/arch-arm64/generic/bionic/string_copy.S
+++ /dev/null
@@ -1,245 +0,0 @@
-/*
- * Copyright (C) 2014 The Android Open Source Project
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
- * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
- * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-/*
- Copyright (c) 2014, Linaro Limited
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- * Neither the name of the Linaro nor the
- names of its contributors may be used to endorse or promote products
- derived from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-/* Assumptions:
- *
- * ARMv8-a, AArch64
- */
-
-#if !defined(STPCPY) && !defined(STRCPY)
-#error "Either STPCPY or STRCPY must be defined."
-#endif
-
-#include <private/bionic_asm.h>
-
-/* Arguments and results. */
-#if defined(STPCPY)
-#define dst x0
-#elif defined(STRCPY)
-#define dstin x0
-#endif
-#define src x1
-
-/* Locals and temporaries. */
-#if defined(STRCPY)
-#define dst x2
-#endif
-#define data1 x3
-#define data1_w w3
-#define data2 x4
-#define data2_w w4
-#define has_nul1 x5
-#define has_nul1_w w5
-#define has_nul2 x6
-#define tmp1 x7
-#define tmp2 x8
-#define tmp3 x9
-#define tmp4 x10
-#define zeroones x11
-#define zeroones_w w11
-#define pos x12
-
-#define REP8_01 0x0101010101010101
-#define REP8_7f 0x7f7f7f7f7f7f7f7f
-#define REP8_80 0x8080808080808080
-
-#if defined(STPCPY)
-ENTRY(stpcpy)
-#elif defined(STRCPY)
-ENTRY(strcpy)
-#endif
- mov zeroones, #REP8_01
-#if defined(STRCPY)
- mov dst, dstin
-#endif
- ands tmp1, src, #15
- b.ne .Lmisaligned
- // NUL detection works on the principle that (X - 1) & (~X) & 0x80
- // (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
- // can be done in parallel across the entire word.
- // The inner loop deals with two Dwords at a time. This has a
- // slightly higher start-up cost, but we should win quite quickly,
- // especially on cores with a high number of issue slots per
- // cycle, as we get much better parallelism out of the operations.
-.Lloop:
- ldp data1, data2, [src], #16
- sub tmp1, data1, zeroones
- orr tmp2, data1, #REP8_7f
- bic has_nul1, tmp1, tmp2
- cbnz has_nul1, .Lnul_in_data1
- sub tmp3, data2, zeroones
- orr tmp4, data2, #REP8_7f
- bic has_nul2, tmp3, tmp4
- cbnz has_nul2, .Lnul_in_data2
- // No NUL in either register, copy it in a single instruction.
- stp data1, data2, [dst], #16
- b .Lloop
-
-.Lnul_in_data1:
- rev has_nul1, has_nul1
- clz pos, has_nul1
- add tmp1, pos, #0x8
-
- tbz tmp1, #6, 1f
-#if defined(STPCPY)
- str data1, [dst], #7
-#elif defined(STRCPY)
- str data1, [dst]
-#endif
- ret
-1:
- tbz tmp1, #5, 1f
- str data1_w, [dst], #4
- lsr data1, data1, #32
-1:
- tbz tmp1, #4, 1f
- strh data1_w, [dst], #2
- lsr data1, data1, #16
-1:
- tbz tmp1, #3, 1f
- strb data1_w, [dst]
-#if defined(STPCPY)
- ret
-#endif
-1:
-#if defined(STPCPY)
- // Back up one so that dst points to the '\0' string terminator.
- sub dst, dst, #1
-#endif
- ret
-
-.Lnul_in_data2:
- str data1, [dst], #8
- rev has_nul2, has_nul2
- clz pos, has_nul2
- add tmp1, pos, #0x8
-
- tbz tmp1, #6, 1f
-#if defined(STPCPY)
- str data2, [dst], #7
-#elif defined(STRCPY)
- str data2, [dst]
-#endif
- ret
-1:
- tbz tmp1, #5, 1f
- str data2_w, [dst], #4
- lsr data2, data2, #32
-1:
- tbz tmp1, #4, 1f
- strh data2_w, [dst], #2
- lsr data2, data2, #16
-1:
- tbz tmp1, #3, 1f
- strb data2_w, [dst]
-#if defined(STPCPY)
- ret
-#endif
-1:
-#if defined(STPCPY)
- // Back up one so that dst points to the '\0' string terminator.
- sub dst, dst, #1
-#endif
- ret
-
-.Lmisaligned:
- tbz src, #0, 1f
- ldrb data1_w, [src], #1
- strb data1_w, [dst], #1
- cbnz data1_w, 1f
-#if defined(STPCPY)
- // Back up one so that dst points to the '\0' string terminator.
- sub dst, dst, #1
-#endif
- ret
-1:
- tbz src, #1, 1f
- ldrb data1_w, [src], #1
- strb data1_w, [dst], #1
- cbz data1_w, .Ldone
- ldrb data2_w, [src], #1
- strb data2_w, [dst], #1
- cbnz data2_w, 1f
-.Ldone:
-#if defined(STPCPY)
- // Back up one so that dst points to the '\0' string terminator.
- sub dst, dst, #1
-#endif
- ret
-1:
- tbz src, #2, 1f
- ldr data1_w, [src], #4
- // Check for a zero.
- sub has_nul1_w, data1_w, zeroones_w
- bic has_nul1_w, has_nul1_w, data1_w
- ands has_nul1_w, has_nul1_w, #0x80808080
- b.ne .Lnul_in_data1
- str data1_w, [dst], #4
-1:
- tbz src, #3, .Lloop
- ldr data1, [src], #8
- // Check for a zero.
- sub tmp1, data1, zeroones
- orr tmp2, data1, #REP8_7f
- bics has_nul1, tmp1, tmp2
- b.ne .Lnul_in_data1
- str data1, [dst], #8
- b .Lloop
-#if defined(STPCPY)
-END(stpcpy)
-#elif defined(STRCPY)
-END(strcpy)
-#endif
diff --git a/libc/arch-arm64/generic/bionic/strrchr.S b/libc/arch-arm64/generic/bionic/strrchr.S
new file mode 100644
index 000000000..274d19746
--- /dev/null
+++ b/libc/arch-arm64/generic/bionic/strrchr.S
@@ -0,0 +1,149 @@
+/*
+ * strrchr - find last position of a character in a string.
+ *
+ * Copyright (c) 2014-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * Neon Available.
+ */
+
+#include <private/bionic_asm.h>
+
+/* Arguments and results. */
+#define srcin x0
+#define chrin w1
+
+#define result x0
+
+#define src x2
+#define tmp1 x3
+#define wtmp2 w4
+#define tmp3 x5
+#define src_match x6
+#define src_offset x7
+#define const_m1 x8
+#define tmp4 x9
+#define nul_match x10
+#define chr_match x11
+
+#define vrepchr v0
+#define vdata1 v1
+#define vdata2 v2
+#define vhas_nul1 v3
+#define vhas_nul2 v4
+#define vhas_chr1 v5
+#define vhas_chr2 v6
+#define vrepmask_0 v7
+#define vrepmask_c v16
+#define vend1 v17
+#define vend2 v18
+
+/* Core algorithm.
+
+ For each 32-byte hunk we calculate a 64-bit syndrome value, with
+ two bits per byte (LSB is always in bits 0 and 1, for both big
+ and little-endian systems). For each tuple, bit 0 is set iff
+ the relevant byte matched the requested character; bit 1 is set
+ iff the relevant byte matched the NUL end of string (we trigger
+ off bit0 for the special case of looking for NUL). Since the bits
+ in the syndrome reflect exactly the order in which things occur
+ in the original string a count_trailing_zeros() operation will
+ identify exactly which byte is causing the termination, and why. */
+
+ENTRY (strrchr)
+ PTR_ARG (0)
+ /* Magic constant 0x40100401 to allow us to identify which lane
+ matches the requested byte. Magic constant 0x80200802 used
+ similarly for NUL termination. */
+ mov wtmp2, #0x0401
+ movk wtmp2, #0x4010, lsl #16
+ dup vrepchr.16b, chrin
+ bic src, srcin, #31 /* Work with aligned 32-byte hunks. */
+ dup vrepmask_c.4s, wtmp2
+ mov src_offset, #0
+ ands tmp1, srcin, #31
+ add vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */
+ b.eq L(aligned)
+
+ /* Input string is not 32-byte aligned. Rather than forcing
+ the padding bytes to a safe value, we calculate the syndrome
+ for all the bytes, but then mask off those bits of the
+ syndrome that are related to the padding. */
+ ld1 {vdata1.16b, vdata2.16b}, [src], #32
+ neg tmp1, tmp1
+ cmeq vhas_nul1.16b, vdata1.16b, #0
+ cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
+ cmeq vhas_nul2.16b, vdata2.16b, #0
+ cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
+ and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
+ and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
+ and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
+ and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
+ addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b // 256->128
+ addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
+ addp vend1.16b, vhas_nul1.16b, vhas_chr1.16b // 128->64
+ mov nul_match, vend1.d[0]
+ lsl tmp1, tmp1, #1
+ mov const_m1, #~0
+ lsr tmp3, const_m1, tmp1
+ mov chr_match, vend1.d[1]
+
+ bic nul_match, nul_match, tmp3 // Mask padding bits.
+ bic chr_match, chr_match, tmp3 // Mask padding bits.
+ cbnz nul_match, L(tail)
+
+ .p2align 4
+L(loop):
+ cmp chr_match, #0
+ csel src_match, src, src_match, ne
+ csel src_offset, chr_match, src_offset, ne
+L(aligned):
+ ld1 {vdata1.16b, vdata2.16b}, [src], #32
+ cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
+ cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
+ uminp vend1.16b, vdata1.16b, vdata2.16b
+ and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
+ and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
+ cmeq vend1.16b, vend1.16b, 0
+ addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
+ addp vend1.16b, vend1.16b, vhas_chr1.16b // 128->64
+ mov nul_match, vend1.d[0]
+ mov chr_match, vend1.d[1]
+ cbz nul_match, L(loop)
+
+ cmeq vhas_nul1.16b, vdata1.16b, #0
+ cmeq vhas_nul2.16b, vdata2.16b, #0
+ and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
+ and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
+ addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b
+ addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul1.16b
+ mov nul_match, vhas_nul1.d[0]
+
+L(tail):
+ /* Work out exactly where the string ends. */
+ sub tmp4, nul_match, #1
+ eor tmp4, tmp4, nul_match
+ ands chr_match, chr_match, tmp4
+ /* And pick the values corresponding to the last match. */
+ csel src_match, src, src_match, ne
+ csel src_offset, chr_match, src_offset, ne
+
+ /* Count down from the top of the syndrome to find the last match. */
+ clz tmp3, src_offset
+ /* Src_match points beyond the word containing the match, so we can
+ simply subtract half the bit-offset into the syndrome. Because
+ we are counting down, we need to go back one more character. */
+ add tmp3, tmp3, #2
+ sub result, src_match, tmp3, lsr #1
+ /* But if the syndrome shows no match was found, then return NULL. */
+ cmp src_offset, #0
+ csel result, result, xzr, ne
+
+ ret
+
+END (strrchr)
+
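The interesting part of strrchr's tail is how it discards character matches that fall after the terminating NUL: (x - 1) ^ x yields a mask of every bit up to and including the lowest set bit of x. A scalar sketch (illustrative only):

    #include <stdint.h>

    /* chr_synd has bit 2*i set for a character match at byte i;
       nul_synd has bit 2*j + 1 set for a NUL at byte j. */
    static int last_match_before_nul(uint64_t chr_synd, uint64_t nul_synd)
    {
        if (nul_synd)   /* keep matches at or before the NUL */
            chr_synd &= (nul_synd - 1) ^ nul_synd;
        if (chr_synd == 0)
            return -1;                     /* no match: strrchr -> NULL */
        /* Highest surviving bit, halved: two syndrome bits per byte. */
        return (63 - __builtin_clzll(chr_synd)) >> 1;
    }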
diff --git a/libc/arch-arm64/generic/bionic/wmemmove.S b/libc/arch-arm64/generic/bionic/wmemmove.S
index e4f67f759..dc03f4c92 100644
--- a/libc/arch-arm64/generic/bionic/wmemmove.S
+++ b/libc/arch-arm64/generic/bionic/wmemmove.S
@@ -1,30 +1,33 @@
-/* Copyright (c) 2014, Linaro Limited
- All rights reserved.
+/*
+ * Copyright (C) 2008 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- * Neither the name of the Linaro nor the
- names of its contributors may be used to endorse or promote products
- derived from this software without specific prior written permission.
+#include <private/bionic_asm.h>
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-#define WMEMMOVE
-#include "memmove.S"
-#undef WMEMMOVE
+ENTRY(wmemmove)
+ #include "memcpy_base.S"
+END(wmemmove)
diff --git a/libc/private/bionic_asm.h b/libc/private/bionic_asm.h
index 6409563f2..d18f0fc42 100644
--- a/libc/private/bionic_asm.h
+++ b/libc/private/bionic_asm.h
@@ -83,4 +83,15 @@
.globl alias; \
.equ alias, original
+/* For arm-optimized-routines */
+#define L(l) .L ## l
+#ifdef __ILP32__
+ /* Sanitize padding bits of pointer arguments as per aapcs64 */
+#define PTR_ARG(n) mov w##n, w##n
+#define SIZE_ARG(n) mov w##n, w##n
+#else
+#define PTR_ARG(n)
+#define SIZE_ARG(n)
+#endif
+
#endif
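A note on PTR_ARG/SIZE_ARG: under ILP32, pointers and size_t arrive in W registers and the upper 32 bits of the corresponding X register are unspecified, so the routines' 64-bit addressing needs the self-move, which zero-extends; under LP64 the macros expand to nothing. A small demonstration of the effect (AArch64 inline asm, illustrative):

    #include <stdint.h>

    static uint64_t sanitize_sketch(uint64_t reg_with_junk_top)
    {
        /* "mov wN, wN" writes the 32-bit view, clearing bits 63:32. */
        __asm__("mov %w0, %w0" : "+r"(reg_with_junk_top));
        return reg_with_junk_top;
    }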