author     Haibo Huang <hhb@google.com>    2018-05-24 20:39:18 -0700
committer  Haibo Huang <hhb@google.com>    2018-06-11 18:12:45 +0000
commit     8a0f0ed5e7c2bc5665583db646876808cc3c2bc7
tree       e37fb7e4c3706931a9bfde2d865b4d90589610f6 /libc
parent     42596b7bf0652e44edff0370f75e1d5387c5cc7b
Make memcpy memmove
Bug: http://b/63992911
Test: Change BoardConfig.mk and compile for each variant
Change-Id: Ia0cc68d8e90e3316ddb2e9ff1555a009b6a0c5be
Diffstat (limited to 'libc')
21 files changed, 58 insertions(+), 1047 deletions(-)
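
This change drops the standalone per-architecture memcpy implementations and routes everything through memmove: the remaining assembly memcpy entry points are renamed to __memcpy (used by memmove as its non-overlapping fast path), and the public memcpy symbol is made an alias of memmove via ALIAS_SYMBOL. A minimal C sketch (not part of this change) of why the aliasing is always safe: memmove's contract is a strict superset of memcpy's, so every valid memcpy call is also a valid memmove call.

    /*
     * Minimal sketch, not bionic code: memmove produces the same result as
     * memcpy for disjoint buffers and is additionally defined for overlapping
     * ones, so pointing the memcpy symbol at memmove cannot break a valid
     * caller.
     */
    #include <assert.h>
    #include <string.h>

    int main(void) {
        char src[8] = "abcdefg";
        char dst[8];

        memmove(dst, src, 8);              /* disjoint: identical to memcpy */
        assert(memcmp(dst, src, 8) == 0);

        memmove(src + 1, src, 6);          /* overlapping: defined for memmove, */
        assert(src[1] == 'a');             /* undefined for a true memcpy       */
        return 0;
    }
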
diff --git a/libc/Android.bp b/libc/Android.bp index a6df75709..c6991d8b9 100644 --- a/libc/Android.bp +++ b/libc/Android.bp @@ -855,7 +855,6 @@ cc_library_static { arm: { srcs: [ "arch-arm/generic/bionic/memcmp.S", - "arch-arm/generic/bionic/memcpy.S", "arch-arm/generic/bionic/memmove.S", "arch-arm/generic/bionic/memset.S", "arch-arm/generic/bionic/strcmp.S", @@ -1125,7 +1124,6 @@ cc_library_static { "arch-x86/atom/string/sse2-wcsrchr-atom.S", "arch-x86/atom/string/sse2-wcslen-atom.S", "arch-x86/atom/string/sse2-wcscmp-atom.S", - "arch-x86/silvermont/string/sse2-memcpy-slm.S", "arch-x86/silvermont/string/sse2-memmove-slm.S", "arch-x86/silvermont/string/sse2-memset-slm.S", "arch-x86/silvermont/string/sse2-stpcpy-slm.S", @@ -1154,14 +1152,12 @@ cc_library_static { "arch-x86/atom/string/sse2-strlen-atom.S", "arch-x86/atom/string/ssse3-memcmp-atom.S", "arch-x86/atom/string/ssse3-memcpy-atom.S", - "arch-x86/atom/string/ssse3-memmove-atom.S", "arch-x86/atom/string/ssse3-strcpy-atom.S", "arch-x86/atom/string/ssse3-strncpy-atom.S", "arch-x86/atom/string/ssse3-wmemcmp-atom.S", ], exclude_srcs: [ "arch-x86/generic/string/memcmp.S", - "arch-x86/silvermont/string/sse2-memcpy-slm.S", "arch-x86/silvermont/string/sse2-memmove-slm.S", "arch-x86/silvermont/string/sse2-memset-slm.S", "arch-x86/silvermont/string/sse2-strcpy-slm.S", @@ -1198,7 +1194,6 @@ cc_library_static { }, x86_64: { srcs: [ - "arch-x86_64/string/sse2-memcpy-slm.S", "arch-x86_64/string/sse2-memmove-slm.S", "arch-x86_64/string/sse2-memset-slm.S", "arch-x86_64/string/sse2-stpcpy-slm.S", diff --git a/libc/arch-arm/cortex-a15/bionic/memcpy.S b/libc/arch-arm/cortex-a15/bionic/memcpy.S index 4297cd6c1..0bab6ee7d 100644 --- a/libc/arch-arm/cortex-a15/bionic/memcpy.S +++ b/libc/arch-arm/cortex-a15/bionic/memcpy.S @@ -64,7 +64,7 @@ .arch armv7-a // Prototype: void *memcpy (void *dst, const void *src, size_t count). -ENTRY(memcpy) +ENTRY(__memcpy) pld [r1, #64] push {r0, lr} .cfi_def_cfa_offset 8 @@ -72,4 +72,4 @@ ENTRY(memcpy) .cfi_rel_offset lr, 4 #include "memcpy_base.S" -END(memcpy) +END(__memcpy) diff --git a/libc/arch-arm/cortex-a53/bionic/memcpy.S b/libc/arch-arm/cortex-a53/bionic/memcpy.S index 4297cd6c1..0bab6ee7d 100644 --- a/libc/arch-arm/cortex-a53/bionic/memcpy.S +++ b/libc/arch-arm/cortex-a53/bionic/memcpy.S @@ -64,7 +64,7 @@ .arch armv7-a // Prototype: void *memcpy (void *dst, const void *src, size_t count). -ENTRY(memcpy) +ENTRY(__memcpy) pld [r1, #64] push {r0, lr} .cfi_def_cfa_offset 8 @@ -72,4 +72,4 @@ ENTRY(memcpy) .cfi_rel_offset lr, 4 #include "memcpy_base.S" -END(memcpy) +END(__memcpy) diff --git a/libc/arch-arm/cortex-a7/bionic/memcpy.S b/libc/arch-arm/cortex-a7/bionic/memcpy.S index 4297cd6c1..0bab6ee7d 100644 --- a/libc/arch-arm/cortex-a7/bionic/memcpy.S +++ b/libc/arch-arm/cortex-a7/bionic/memcpy.S @@ -64,7 +64,7 @@ .arch armv7-a // Prototype: void *memcpy (void *dst, const void *src, size_t count). 
-ENTRY(memcpy) +ENTRY(__memcpy) pld [r1, #64] push {r0, lr} .cfi_def_cfa_offset 8 @@ -72,4 +72,4 @@ ENTRY(memcpy) .cfi_rel_offset lr, 4 #include "memcpy_base.S" -END(memcpy) +END(__memcpy) diff --git a/libc/arch-arm/cortex-a9/bionic/memcpy.S b/libc/arch-arm/cortex-a9/bionic/memcpy.S index 5a986d175..a71c9d21c 100644 --- a/libc/arch-arm/cortex-a9/bionic/memcpy.S +++ b/libc/arch-arm/cortex-a9/bionic/memcpy.S @@ -39,14 +39,14 @@ .thumb .thumb_func -ENTRY(memcpy) +ENTRY(__memcpy) pld [r1, #0] stmfd sp!, {r0, lr} .cfi_def_cfa_offset 8 .cfi_rel_offset r0, 0 .cfi_rel_offset lr, 4 pld [r1, #64] -END(memcpy) +END(__memcpy) #define MEMCPY_BASE __memcpy_base #define MEMCPY_BASE_ALIGNED __memcpy_base_aligned diff --git a/libc/arch-arm/denver/bionic/memcpy.S b/libc/arch-arm/denver/bionic/memcpy.S index 8528f2851..f0825421e 100644 --- a/libc/arch-arm/denver/bionic/memcpy.S +++ b/libc/arch-arm/denver/bionic/memcpy.S @@ -65,13 +65,13 @@ // arch. The code generated is exactly the same. .arch armv7-a -ENTRY(memcpy) +ENTRY(__memcpy) pld [r1, #64] push {r0, lr} .cfi_def_cfa_offset 8 .cfi_rel_offset r0, 0 .cfi_rel_offset lr, 4 -END(memcpy) +END(__memcpy) #define MEMCPY_BASE __memcpy_base #define MEMCPY_BASE_ALIGNED __memcpy_base_aligned diff --git a/libc/arch-arm/denver/bionic/memmove.S b/libc/arch-arm/denver/bionic/memmove.S index 94302f3fd..74d2b31bc 100644 --- a/libc/arch-arm/denver/bionic/memmove.S +++ b/libc/arch-arm/denver/bionic/memmove.S @@ -50,7 +50,7 @@ ENTRY(memmove) bhi .L_reversed_memcpy .L_jump_to_memcpy: - b memcpy + b __memcpy .L_reversed_memcpy: push {r0, lr} @@ -278,3 +278,5 @@ ENTRY(memmove) pop {r0, pc} END(memmove) + +ALIAS_SYMBOL(memcpy, memmove) diff --git a/libc/arch-arm/generic/bionic/memcpy.S b/libc/arch-arm/generic/bionic/memcpy.S deleted file mode 100644 index d1e4372e1..000000000 --- a/libc/arch-arm/generic/bionic/memcpy.S +++ /dev/null @@ -1,379 +0,0 @@ -/* - * Copyright (C) 2008 The Android Open Source Project - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS - * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE - * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS - * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED - * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT - * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#include <private/bionic_asm.h> - - /* - * Optimized memcpy() for ARM. - * - * note that memcpy() always returns the destination pointer, - * so we have to preserve R0. 
- */ - - .syntax unified - -ENTRY(memcpy) - /* The stack must always be 64-bits aligned to be compliant with the - * ARM ABI. Since we have to save R0, we might as well save R4 - * which we can use for better pipelining of the reads below - */ - stmfd sp!, {r0, r4, lr} - .cfi_def_cfa_offset 12 - .cfi_rel_offset r0, 0 - .cfi_rel_offset r4, 4 - .cfi_rel_offset lr, 8 - /* Making room for r5-r11 which will be spilled later */ - sub sp, sp, #28 - .cfi_adjust_cfa_offset 28 - - // preload the destination because we'll align it to a cache line - // with small writes. Also start the source "pump". - pld [r0, #0] - pld [r1, #0] - pld [r1, #32] - - /* it simplifies things to take care of len<4 early */ - cmp r2, #4 - blo .Lcopy_last_3_and_return - - /* compute the offset to align the source - * offset = (4-(src&3))&3 = -src & 3 - */ - rsb r3, r1, #0 - ands r3, r3, #3 - beq .Lsrc_aligned - - /* align source to 32 bits. We need to insert 2 instructions between - * a ldr[b|h] and str[b|h] because byte and half-word instructions - * stall 2 cycles. - */ - movs r12, r3, lsl #31 - sub r2, r2, r3 /* we know that r3 <= r2 because r2 >= 4 */ - ldrbmi r3, [r1], #1 - ldrbcs r4, [r1], #1 - ldrbcs r12,[r1], #1 - strbmi r3, [r0], #1 - strbcs r4, [r0], #1 - strbcs r12,[r0], #1 - -.Lsrc_aligned: - - /* see if src and dst are aligned together (congruent) */ - eor r12, r0, r1 - tst r12, #3 - bne .Lnon_congruent - - /* Use post-incriment mode for stm to spill r5-r11 to reserved stack - * frame. Don't update sp. - */ - stmea sp, {r5-r11} - - /* align the destination to a cache-line */ - rsb r3, r0, #0 - ands r3, r3, #0x1C - beq .Lcongruent_aligned32 - cmp r3, r2 - andhi r3, r2, #0x1C - - /* conditionally copies 0 to 7 words (length in r3) */ - movs r12, r3, lsl #28 - ldmcs r1!, {r4, r5, r6, r7} /* 16 bytes */ - ldmmi r1!, {r8, r9} /* 8 bytes */ - stmcs r0!, {r4, r5, r6, r7} - stmmi r0!, {r8, r9} - tst r3, #0x4 - ldrne r10,[r1], #4 /* 4 bytes */ - strne r10,[r0], #4 - sub r2, r2, r3 - -.Lcongruent_aligned32: - /* - * here source is aligned to 32 bytes. - */ - -.Lcached_aligned32: - subs r2, r2, #32 - blo .Lless_than_32_left - - /* - * We preload a cache-line up to 64 bytes ahead. On the 926, this will - * stall only until the requested world is fetched, but the linefill - * continues in the the background. - * While the linefill is going, we write our previous cache-line - * into the write-buffer (which should have some free space). - * When the linefill is done, the writebuffer will - * start dumping its content into memory - * - * While all this is going, we then load a full cache line into - * 8 registers, this cache line should be in the cache by now - * (or partly in the cache). - * - * This code should work well regardless of the source/dest alignment. - * - */ - - // Align the preload register to a cache-line because the cpu does - // "critical word first" (the first word requested is loaded first). - bic r12, r1, #0x1F - add r12, r12, #64 - -1: ldmia r1!, { r4-r11 } - pld [r12, #64] - subs r2, r2, #32 - - // NOTE: if r12 is more than 64 ahead of r1, the following ldrhi - // for ARM9 preload will not be safely guarded by the preceding subs. - // When it is safely guarded the only possibility to have SIGSEGV here - // is because the caller overstates the length. 
- ldrhi r3, [r12], #32 /* cheap ARM9 preload */ - stmia r0!, { r4-r11 } - bhs 1b - - add r2, r2, #32 - -.Lless_than_32_left: - /* - * less than 32 bytes left at this point (length in r2) - */ - - /* skip all this if there is nothing to do, which should - * be a common case (if not executed the code below takes - * about 16 cycles) - */ - tst r2, #0x1F - beq 1f - - /* conditionnaly copies 0 to 31 bytes */ - movs r12, r2, lsl #28 - ldmcs r1!, {r4, r5, r6, r7} /* 16 bytes */ - ldmmi r1!, {r8, r9} /* 8 bytes */ - stmcs r0!, {r4, r5, r6, r7} - stmmi r0!, {r8, r9} - movs r12, r2, lsl #30 - ldrcs r3, [r1], #4 /* 4 bytes */ - ldrhmi r4, [r1], #2 /* 2 bytes */ - strcs r3, [r0], #4 - strhmi r4, [r0], #2 - tst r2, #0x1 - ldrbne r3, [r1] /* last byte */ - strbne r3, [r0] - - /* we're done! restore everything and return */ -1: ldmfd sp!, {r5-r11} - ldmfd sp!, {r0, r4, pc} - - /********************************************************************/ - -.Lnon_congruent: - /* - * here source is aligned to 4 bytes - * but destination is not. - * - * in the code below r2 is the number of bytes read - * (the number of bytes written is always smaller, because we have - * partial words in the shift queue) - */ - cmp r2, #4 - blo .Lcopy_last_3_and_return - - /* Use post-increment mode for stm to spill r5-r11 to reserved stack - * frame. Don't update sp. - */ - stmea sp, {r5-r11} - - /* compute shifts needed to align src to dest */ - rsb r5, r0, #0 - and r5, r5, #3 /* r5 = # bytes in partial words */ - mov r12, r5, lsl #3 /* r12 = right */ - rsb lr, r12, #32 /* lr = left */ - - /* read the first word */ - ldr r3, [r1], #4 - sub r2, r2, #4 - - /* write a partial word (0 to 3 bytes), such that destination - * becomes aligned to 32 bits (r5 = nb of words to copy for alignment) - */ - movs r5, r5, lsl #31 - strbmi r3, [r0], #1 - movmi r3, r3, lsr #8 - strbcs r3, [r0], #1 - movcs r3, r3, lsr #8 - strbcs r3, [r0], #1 - movcs r3, r3, lsr #8 - - cmp r2, #4 - blo .Lpartial_word_tail - - /* Align destination to 32 bytes (cache line boundary) */ -1: tst r0, #0x1c - beq 2f - ldr r5, [r1], #4 - sub r2, r2, #4 - orr r4, r3, r5, lsl lr - mov r3, r5, lsr r12 - str r4, [r0], #4 - cmp r2, #4 - bhs 1b - blo .Lpartial_word_tail - - /* copy 32 bytes at a time */ -2: subs r2, r2, #32 - blo .Lless_than_thirtytwo - - /* Use immediate mode for the shifts, because there is an extra cycle - * for register shifts, which could account for up to 50% of - * performance hit. 
- */ - - cmp r12, #24 - beq .Lloop24 - cmp r12, #8 - beq .Lloop8 - -.Lloop16: - ldr r12, [r1], #4 -1: mov r4, r12 - ldmia r1!, { r5,r6,r7, r8,r9,r10,r11} - pld [r1, #64] - subs r2, r2, #32 - ldrhs r12, [r1], #4 - orr r3, r3, r4, lsl #16 - mov r4, r4, lsr #16 - orr r4, r4, r5, lsl #16 - mov r5, r5, lsr #16 - orr r5, r5, r6, lsl #16 - mov r6, r6, lsr #16 - orr r6, r6, r7, lsl #16 - mov r7, r7, lsr #16 - orr r7, r7, r8, lsl #16 - mov r8, r8, lsr #16 - orr r8, r8, r9, lsl #16 - mov r9, r9, lsr #16 - orr r9, r9, r10, lsl #16 - mov r10, r10, lsr #16 - orr r10, r10, r11, lsl #16 - stmia r0!, {r3,r4,r5,r6, r7,r8,r9,r10} - mov r3, r11, lsr #16 - bhs 1b - b .Lless_than_thirtytwo - -.Lloop8: - ldr r12, [r1], #4 -1: mov r4, r12 - ldmia r1!, { r5,r6,r7, r8,r9,r10,r11} - pld [r1, #64] - subs r2, r2, #32 - ldrhs r12, [r1], #4 - orr r3, r3, r4, lsl #24 - mov r4, r4, lsr #8 - orr r4, r4, r5, lsl #24 - mov r5, r5, lsr #8 - orr r5, r5, r6, lsl #24 - mov r6, r6, lsr #8 - orr r6, r6, r7, lsl #24 - mov r7, r7, lsr #8 - orr r7, r7, r8, lsl #24 - mov r8, r8, lsr #8 - orr r8, r8, r9, lsl #24 - mov r9, r9, lsr #8 - orr r9, r9, r10, lsl #24 - mov r10, r10, lsr #8 - orr r10, r10, r11, lsl #24 - stmia r0!, {r3,r4,r5,r6, r7,r8,r9,r10} - mov r3, r11, lsr #8 - bhs 1b - b .Lless_than_thirtytwo - -.Lloop24: - ldr r12, [r1], #4 -1: mov r4, r12 - ldmia r1!, { r5,r6,r7, r8,r9,r10,r11} - pld [r1, #64] - subs r2, r2, #32 - ldrhs r12, [r1], #4 - orr r3, r3, r4, lsl #8 - mov r4, r4, lsr #24 - orr r4, r4, r5, lsl #8 - mov r5, r5, lsr #24 - orr r5, r5, r6, lsl #8 - mov r6, r6, lsr #24 - orr r6, r6, r7, lsl #8 - mov r7, r7, lsr #24 - orr r7, r7, r8, lsl #8 - mov r8, r8, lsr #24 - orr r8, r8, r9, lsl #8 - mov r9, r9, lsr #24 - orr r9, r9, r10, lsl #8 - mov r10, r10, lsr #24 - orr r10, r10, r11, lsl #8 - stmia r0!, {r3,r4,r5,r6, r7,r8,r9,r10} - mov r3, r11, lsr #24 - bhs 1b - - -.Lless_than_thirtytwo: - /* copy the last 0 to 31 bytes of the source */ - rsb r12, lr, #32 /* we corrupted r12, recompute it */ - add r2, r2, #32 - cmp r2, #4 - blo .Lpartial_word_tail - -1: ldr r5, [r1], #4 - sub r2, r2, #4 - orr r4, r3, r5, lsl lr - mov r3, r5, lsr r12 - str r4, [r0], #4 - cmp r2, #4 - bhs 1b - -.Lpartial_word_tail: - /* we have a partial word in the input buffer */ - movs r5, lr, lsl #(31-3) - strbmi r3, [r0], #1 - movmi r3, r3, lsr #8 - strbcs r3, [r0], #1 - movcs r3, r3, lsr #8 - strbcs r3, [r0], #1 - - /* Refill spilled registers from the stack. Don't update sp. */ - ldmfd sp, {r5-r11} - -.Lcopy_last_3_and_return: - movs r2, r2, lsl #31 /* copy remaining 0, 1, 2 or 3 bytes */ - ldrbmi r2, [r1], #1 - ldrbcs r3, [r1], #1 - ldrbcs r12,[r1] - strbmi r2, [r0], #1 - strbcs r3, [r0], #1 - strbcs r12,[r0] - - /* we're done! 
restore sp and spilled registers and return */ - add sp, sp, #28 - ldmfd sp!, {r0, r4, pc} -END(memcpy) diff --git a/libc/arch-arm/generic/bionic/memmove.S b/libc/arch-arm/generic/bionic/memmove.S index d5ee99dd4..c52e17ed5 100644 --- a/libc/arch-arm/generic/bionic/memmove.S +++ b/libc/arch-arm/generic/bionic/memmove.S @@ -469,3 +469,5 @@ ENTRY(memmove) bl bsd_safe_memcpy ldmfd sp!, {r0, pc} END(memmove) + +ALIAS_SYMBOL(memcpy, memmove) diff --git a/libc/arch-arm/krait/bionic/memcpy.S b/libc/arch-arm/krait/bionic/memcpy.S index f22c063ba..49fd04069 100644 --- a/libc/arch-arm/krait/bionic/memcpy.S +++ b/libc/arch-arm/krait/bionic/memcpy.S @@ -42,7 +42,7 @@ .thumb .thumb_func -ENTRY(memcpy) +ENTRY(__memcpy) pld [r1, #64] stmfd sp!, {r0, lr} .cfi_adjust_cfa_offset 8 @@ -50,4 +50,4 @@ ENTRY(memcpy) .cfi_rel_offset lr, 4 #include "memcpy_base.S" -END(memcpy) +END(__memcpy) diff --git a/libc/arch-arm/kryo/bionic/memcpy.S b/libc/arch-arm/kryo/bionic/memcpy.S index e9ee2ac2f..74036ef3a 100644 --- a/libc/arch-arm/kryo/bionic/memcpy.S +++ b/libc/arch-arm/kryo/bionic/memcpy.S @@ -34,7 +34,7 @@ #define PLDSIZE (128) /* L2 cache line size */ .code 32 -ENTRY(memcpy) +ENTRY(__memcpy) push {r0} .cfi_def_cfa_offset 4 .cfi_rel_offset r0, 0 @@ -123,4 +123,4 @@ ENTRY(memcpy) pop {r0} bx lr -END(memcpy) +END(__memcpy) diff --git a/libc/arch-arm64/denver64/bionic/memcpy.S b/libc/arch-arm64/denver64/bionic/memcpy.S index fc487d3a4..baadb9204 100644 --- a/libc/arch-arm64/denver64/bionic/memcpy.S +++ b/libc/arch-arm64/denver64/bionic/memcpy.S @@ -30,6 +30,6 @@ #include <private/bionic_asm.h> -ENTRY(memcpy) +ENTRY(__memcpy) #include "memcpy_base.S" -END(memcpy) +END(__memcpy) diff --git a/libc/arch-arm64/denver64/bionic/memmove.S b/libc/arch-arm64/denver64/bionic/memmove.S index 739ce4982..42271dc23 100644 --- a/libc/arch-arm64/denver64/bionic/memmove.S +++ b/libc/arch-arm64/denver64/bionic/memmove.S @@ -65,7 +65,7 @@ ENTRY(memmove) b.lo .Ldownwards add tmp1, src, count cmp dstin, tmp1 - b.hs memcpy /* No overlap. */ + b.hs __memcpy /* No overlap. */ /* Upwards move with potential overlap. * Need to move from the tail backwards. SRC and DST point one @@ -196,7 +196,7 @@ ENTRY(memmove) * DST is more than 16 bytes away from SRC. */ sub tmp1, src, #16 cmp dstin, tmp1 - b.ls memcpy /* May overlap, but not critically. */ + b.ls __memcpy /* May overlap, but not critically. */ mov dst, dstin /* Preserve DSTIN for return value. 
*/ cmp count, #64 @@ -326,4 +326,6 @@ ENTRY(memmove) END(wmemmove) #else END(memmove) + +ALIAS_SYMBOL(memcpy, memmove) #endif diff --git a/libc/arch-arm64/generic/bionic/memcpy.S b/libc/arch-arm64/generic/bionic/memcpy.S index fc487d3a4..baadb9204 100644 --- a/libc/arch-arm64/generic/bionic/memcpy.S +++ b/libc/arch-arm64/generic/bionic/memcpy.S @@ -30,6 +30,6 @@ #include <private/bionic_asm.h> -ENTRY(memcpy) +ENTRY(__memcpy) #include "memcpy_base.S" -END(memcpy) +END(__memcpy) diff --git a/libc/arch-arm64/generic/bionic/memmove.S b/libc/arch-arm64/generic/bionic/memmove.S index c50112d94..335b7d6ce 100644 --- a/libc/arch-arm64/generic/bionic/memmove.S +++ b/libc/arch-arm64/generic/bionic/memmove.S @@ -92,7 +92,7 @@ ENTRY(memmove) sub tmp1, dstin, src cmp count, 96 ccmp tmp1, count, 2, hi - b.hs memcpy + b.hs __memcpy cbz tmp1, 3f add dstend, dstin, count @@ -150,4 +150,6 @@ ENTRY(memmove) END(wmemmove) #else END(memmove) + +ALIAS_SYMBOL(memcpy, memmove) #endif diff --git a/libc/arch-x86/atom/string/ssse3-memcpy-atom.S b/libc/arch-x86/atom/string/ssse3-memcpy-atom.S index 4b2fb8e8f..2b3b7a53d 100644 --- a/libc/arch-x86/atom/string/ssse3-memcpy-atom.S +++ b/libc/arch-x86/atom/string/ssse3-memcpy-atom.S @@ -34,6 +34,10 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # define MEMCPY memcpy #endif +#ifndef USE_AS_MEMMOVE +# define USE_AS_MEMMOVE +#endif + #ifndef L # define L(label) .L##label #endif @@ -67,6 +71,12 @@ name: \ cfi_startproc #endif +#ifndef ALIAS_SYMBOL +# define ALIAS_SYMBOL(alias, original) \ + .globl alias; \ + .equ alias, original +#endif + #ifndef END # define END(name) \ cfi_endproc; \ @@ -3122,3 +3132,5 @@ L(bk_ssse3_cpy): #endif END (MEMCPY) + +ALIAS_SYMBOL(memmove, MEMCPY) diff --git a/libc/arch-x86/atom/string/ssse3-memmove-atom.S b/libc/arch-x86/atom/string/ssse3-memmove-atom.S deleted file mode 100644 index be8559660..000000000 --- a/libc/arch-x86/atom/string/ssse3-memmove-atom.S +++ /dev/null @@ -1,34 +0,0 @@ -/* -Copyright (c) 2010, Intel Corporation -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - - * Neither the name of Intel Corporation nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON -ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*/ - - -#define MEMCPY memmove -#define USE_AS_MEMMOVE -#include "ssse3-memcpy-atom.S" diff --git a/libc/arch-x86/silvermont/string/sse2-memcpy-slm.S b/libc/arch-x86/silvermont/string/sse2-memcpy-slm.S deleted file mode 100644 index 1b305c790..000000000 --- a/libc/arch-x86/silvermont/string/sse2-memcpy-slm.S +++ /dev/null @@ -1,308 +0,0 @@ -/* -Copyright (c) 2014, Intel Corporation -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - - * Neither the name of Intel Corporation nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON -ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*/ - -#include "cache.h" - -#ifndef MEMCPY -# define MEMCPY memcpy -#endif - -#ifndef L -# define L(label) .L##label -#endif - -#ifndef cfi_startproc -# define cfi_startproc .cfi_startproc -#endif - -#ifndef cfi_endproc -# define cfi_endproc .cfi_endproc -#endif - -#ifndef cfi_rel_offset -# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off -#endif - -#ifndef cfi_restore -# define cfi_restore(reg) .cfi_restore reg -#endif - -#ifndef cfi_adjust_cfa_offset -# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off -#endif - -#ifndef ENTRY -# define ENTRY(name) \ - .type name, @function; \ - .globl name; \ - .p2align 4; \ -name: \ - cfi_startproc -#endif - -#ifndef END -# define END(name) \ - cfi_endproc; \ - .size name, .-name -#endif - -#define DEST PARMS -#define SRC DEST+4 -#define LEN SRC+4 - -#define CFI_PUSH(REG) \ - cfi_adjust_cfa_offset (4); \ - cfi_rel_offset (REG, 0) - -#define CFI_POP(REG) \ - cfi_adjust_cfa_offset (-4); \ - cfi_restore (REG) - -#define PUSH(REG) pushl REG; CFI_PUSH (REG) -#define POP(REG) popl REG; CFI_POP (REG) - -#define PARMS 8 /* Preserve EBX. 
*/ -#define ENTRANCE PUSH (%ebx); -#define RETURN_END POP (%ebx); ret -#define RETURN RETURN_END; CFI_PUSH (%ebx) - - .section .text.sse2,"ax",@progbits -ENTRY (MEMCPY) - ENTRANCE - movl LEN(%esp), %ecx - movl SRC(%esp), %eax - movl DEST(%esp), %edx - - cmp %eax, %edx - je L(return) - - cmp $16, %ecx - jbe L(len_0_16_bytes) - - cmp $SHARED_CACHE_SIZE_HALF, %ecx - jae L(large_page) - - movdqu (%eax), %xmm0 - movdqu -16(%eax, %ecx), %xmm1 - cmpl $32, %ecx - movdqu %xmm0, (%edx) - movdqu %xmm1, -16(%edx, %ecx) - jbe L(return) - - movdqu 16(%eax), %xmm0 - movdqu -32(%eax, %ecx), %xmm1 - cmpl $64, %ecx - movdqu %xmm0, 16(%edx) - movdqu %xmm1, -32(%edx, %ecx) - jbe L(return) - - movdqu 32(%eax), %xmm0 - movdqu 48(%eax), %xmm1 - movdqu -48(%eax, %ecx), %xmm2 - movdqu -64(%eax, %ecx), %xmm3 - cmpl $128, %ecx - movdqu %xmm0, 32(%edx) - movdqu %xmm1, 48(%edx) - movdqu %xmm2, -48(%edx, %ecx) - movdqu %xmm3, -64(%edx, %ecx) - jbe L(return) - -/* Now the main loop: we align the address of the destination. */ - leal 64(%edx), %ebx - andl $-64, %ebx - - addl %edx, %ecx - andl $-64, %ecx - - subl %edx, %eax - -/* We should stop two iterations before the termination - (in order not to misprefetch). */ - subl $64, %ecx - cmpl %ebx, %ecx - je L(main_loop_just_one_iteration) - - subl $64, %ecx - cmpl %ebx, %ecx - je L(main_loop_last_two_iterations) - - - .p2align 4 -L(main_loop_cache): - - prefetcht0 128(%ebx, %eax) - - movdqu (%ebx, %eax), %xmm0 - movdqu 16(%ebx, %eax), %xmm1 - movdqu 32(%ebx, %eax), %xmm2 - movdqu 48(%ebx, %eax), %xmm3 - movdqa %xmm0, (%ebx) - movdqa %xmm1, 16(%ebx) - movdqa %xmm2, 32(%ebx) - movdqa %xmm3, 48(%ebx) - lea 64(%ebx), %ebx - cmpl %ebx, %ecx - jne L(main_loop_cache) - -L(main_loop_last_two_iterations): - movdqu (%ebx, %eax), %xmm0 - movdqu 16(%ebx, %eax), %xmm1 - movdqu 32(%ebx, %eax), %xmm2 - movdqu 48(%ebx, %eax), %xmm3 - movdqu 64(%ebx, %eax), %xmm4 - movdqu 80(%ebx, %eax), %xmm5 - movdqu 96(%ebx, %eax), %xmm6 - movdqu 112(%ebx, %eax), %xmm7 - movdqa %xmm0, (%ebx) - movdqa %xmm1, 16(%ebx) - movdqa %xmm2, 32(%ebx) - movdqa %xmm3, 48(%ebx) - movdqa %xmm4, 64(%ebx) - movdqa %xmm5, 80(%ebx) - movdqa %xmm6, 96(%ebx) - movdqa %xmm7, 112(%ebx) - jmp L(return) - -L(main_loop_just_one_iteration): - movdqu (%ebx, %eax), %xmm0 - movdqu 16(%ebx, %eax), %xmm1 - movdqu 32(%ebx, %eax), %xmm2 - movdqu 48(%ebx, %eax), %xmm3 - movdqa %xmm0, (%ebx) - movdqa %xmm1, 16(%ebx) - movdqa %xmm2, 32(%ebx) - movdqa %xmm3, 48(%ebx) - jmp L(return) - -L(large_page): - movdqu (%eax), %xmm0 - movdqu 16(%eax), %xmm1 - movdqu 32(%eax), %xmm2 - movdqu 48(%eax), %xmm3 - movdqu -64(%eax, %ecx), %xmm4 - movdqu -48(%eax, %ecx), %xmm5 - movdqu -32(%eax, %ecx), %xmm6 - movdqu -16(%eax, %ecx), %xmm7 - movdqu %xmm0, (%edx) - movdqu %xmm1, 16(%edx) - movdqu %xmm2, 32(%edx) - movdqu %xmm3, 48(%edx) - movdqu %xmm4, -64(%edx, %ecx) - movdqu %xmm5, -48(%edx, %ecx) - movdqu %xmm6, -32(%edx, %ecx) - movdqu %xmm7, -16(%edx, %ecx) - - movdqu 64(%eax), %xmm0 - movdqu 80(%eax), %xmm1 - movdqu 96(%eax), %xmm2 - movdqu 112(%eax), %xmm3 - movdqu -128(%eax, %ecx), %xmm4 - movdqu -112(%eax, %ecx), %xmm5 - movdqu -96(%eax, %ecx), %xmm6 - movdqu -80(%eax, %ecx), %xmm7 - movdqu %xmm0, 64(%edx) - movdqu %xmm1, 80(%edx) - movdqu %xmm2, 96(%edx) - movdqu %xmm3, 112(%edx) - movdqu %xmm4, -128(%edx, %ecx) - movdqu %xmm5, -112(%edx, %ecx) - movdqu %xmm6, -96(%edx, %ecx) - movdqu %xmm7, -80(%edx, %ecx) - -/* Now the main loop with non temporal stores. We align - the address of the destination. 
*/ - leal 128(%edx), %ebx - andl $-128, %ebx - - addl %edx, %ecx - andl $-128, %ecx - - subl %edx, %eax - - .p2align 4 -L(main_loop_large_page): - movdqu (%ebx, %eax), %xmm0 - movdqu 16(%ebx, %eax), %xmm1 - movdqu 32(%ebx, %eax), %xmm2 - movdqu 48(%ebx, %eax), %xmm3 - movdqu 64(%ebx, %eax), %xmm4 - movdqu 80(%ebx, %eax), %xmm5 - movdqu 96(%ebx, %eax), %xmm6 - movdqu 112(%ebx, %eax), %xmm7 - movntdq %xmm0, (%ebx) - movntdq %xmm1, 16(%ebx) - movntdq %xmm2, 32(%ebx) - movntdq %xmm3, 48(%ebx) - movntdq %xmm4, 64(%ebx) - movntdq %xmm5, 80(%ebx) - movntdq %xmm6, 96(%ebx) - movntdq %xmm7, 112(%ebx) - lea 128(%ebx), %ebx - cmpl %ebx, %ecx - jne L(main_loop_large_page) - sfence - jmp L(return) - -L(len_0_16_bytes): - testb $24, %cl - jne L(len_9_16_bytes) - testb $4, %cl - .p2align 4,,5 - jne L(len_5_8_bytes) - testl %ecx, %ecx - .p2align 4,,2 - je L(return) - movzbl (%eax), %ebx - testb $2, %cl - movb %bl, (%edx) - je L(return) - movzwl -2(%eax,%ecx), %ebx - movw %bx, -2(%edx,%ecx) - jmp L(return) - -L(len_9_16_bytes): - movq (%eax), %xmm0 - movq -8(%eax, %ecx), %xmm1 - movq %xmm0, (%edx) - movq %xmm1, -8(%edx, %ecx) - jmp L(return) - -L(len_5_8_bytes): - movl (%eax), %ebx - movl %ebx, (%edx) - movl -4(%eax,%ecx), %ebx - movl %ebx, -4(%edx,%ecx) - jmp L(return) - -L(return): - movl %edx, %eax - RETURN - -END (MEMCPY) diff --git a/libc/arch-x86/silvermont/string/sse2-memmove-slm.S b/libc/arch-x86/silvermont/string/sse2-memmove-slm.S index bf9f85dd9..ceada1b7e 100644 --- a/libc/arch-x86/silvermont/string/sse2-memmove-slm.S +++ b/libc/arch-x86/silvermont/string/sse2-memmove-slm.S @@ -67,6 +67,12 @@ name: \ cfi_startproc #endif +#ifndef ALIAS_SYMBOL +# define ALIAS_SYMBOL(alias, original) \ + .globl alias; \ + .equ alias, original +#endif + #ifndef END # define END(name) \ cfi_endproc; \ @@ -537,3 +543,5 @@ L(mm_large_page_loop_backward): jmp L(mm_recalc_len) END (MEMMOVE) + +ALIAS_SYMBOL(memcpy, MEMMOVE) diff --git a/libc/arch-x86_64/string/sse2-memcpy-slm.S b/libc/arch-x86_64/string/sse2-memcpy-slm.S deleted file mode 100644 index 4c30fb62e..000000000 --- a/libc/arch-x86_64/string/sse2-memcpy-slm.S +++ /dev/null @@ -1,299 +0,0 @@ -/* -Copyright (c) 2014, Intel Corporation -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - - * Neither the name of Intel Corporation nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON -ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*/ - -#include "cache.h" - -#ifndef MEMCPY -# define MEMCPY memcpy -#endif - -#ifndef L -# define L(label) .L##label -#endif - -#ifndef cfi_startproc -# define cfi_startproc .cfi_startproc -#endif - -#ifndef cfi_endproc -# define cfi_endproc .cfi_endproc -#endif - -#ifndef cfi_rel_offset -# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off -#endif - -#ifndef cfi_restore -# define cfi_restore(reg) .cfi_restore reg -#endif - -#ifndef cfi_adjust_cfa_offset -# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off -#endif - -#ifndef ENTRY -# define ENTRY(name) \ - .type name, @function; \ - .globl name; \ - .p2align 4; \ -name: \ - cfi_startproc -#endif - -#ifndef END -# define END(name) \ - cfi_endproc; \ - .size name, .-name -#endif - -#define CFI_PUSH(REG) \ - cfi_adjust_cfa_offset (4); \ - cfi_rel_offset (REG, 0) - -#define CFI_POP(REG) \ - cfi_adjust_cfa_offset (-4); \ - cfi_restore (REG) - -#define PUSH(REG) push REG; -#define POP(REG) pop REG; - -#define ENTRANCE PUSH (%rbx); -#define RETURN_END POP (%rbx); ret -#define RETURN RETURN_END; - - .section .text.sse2,"ax",@progbits -ENTRY (MEMCPY) - ENTRANCE - cmp %rsi, %rdi - je L(return) - - cmp $16, %rdx - jbe L(len_0_16_bytes) - - cmp $SHARED_CACHE_SIZE_HALF, %rdx - jae L(large_page) - - movdqu (%rsi), %xmm0 - movdqu -16(%rsi, %rdx), %xmm1 - cmp $32, %rdx - movdqu %xmm0, (%rdi) - movdqu %xmm1, -16(%rdi, %rdx) - jbe L(return) - - movdqu 16(%rsi), %xmm0 - movdqu -32(%rsi, %rdx), %xmm1 - cmp $64, %rdx - movdqu %xmm0, 16(%rdi) - movdqu %xmm1, -32(%rdi, %rdx) - jbe L(return) - - movdqu 32(%rsi), %xmm0 - movdqu 48(%rsi), %xmm1 - movdqu -48(%rsi, %rdx), %xmm2 - movdqu -64(%rsi, %rdx), %xmm3 - cmp $128, %rdx - movdqu %xmm0, 32(%rdi) - movdqu %xmm1, 48(%rdi) - movdqu %xmm2, -48(%rdi, %rdx) - movdqu %xmm3, -64(%rdi, %rdx) - jbe L(return) - -/* Now the main loop: we align the address of the destination. */ - lea 64(%rdi), %r8 - and $-64, %r8 - - add %rdi, %rdx - and $-64, %rdx - - sub %rdi, %rsi - -/* We should stop two iterations before the termination - (in order not to misprefetch). 
*/ - sub $64, %rdx - cmp %r8, %rdx - je L(main_loop_just_one_iteration) - - sub $64, %rdx - cmp %r8, %rdx - je L(main_loop_last_two_iterations) - - - .p2align 4 -L(main_loop_cache): - - prefetcht0 128(%r8, %rsi) - - movdqu (%r8, %rsi), %xmm0 - movdqu 16(%r8, %rsi), %xmm1 - movdqu 32(%r8, %rsi), %xmm2 - movdqu 48(%r8, %rsi), %xmm3 - movdqa %xmm0, (%r8) - movdqa %xmm1, 16(%r8) - movdqa %xmm2, 32(%r8) - movdqa %xmm3, 48(%r8) - lea 64(%r8), %r8 - cmp %r8, %rdx - jne L(main_loop_cache) - -L(main_loop_last_two_iterations): - movdqu (%r8, %rsi), %xmm0 - movdqu 16(%r8, %rsi), %xmm1 - movdqu 32(%r8, %rsi), %xmm2 - movdqu 48(%r8, %rsi), %xmm3 - movdqu 64(%r8, %rsi), %xmm4 - movdqu 80(%r8, %rsi), %xmm5 - movdqu 96(%r8, %rsi), %xmm6 - movdqu 112(%r8, %rsi), %xmm7 - movdqa %xmm0, (%r8) - movdqa %xmm1, 16(%r8) - movdqa %xmm2, 32(%r8) - movdqa %xmm3, 48(%r8) - movdqa %xmm4, 64(%r8) - movdqa %xmm5, 80(%r8) - movdqa %xmm6, 96(%r8) - movdqa %xmm7, 112(%r8) - jmp L(return) - -L(main_loop_just_one_iteration): - movdqu (%r8, %rsi), %xmm0 - movdqu 16(%r8, %rsi), %xmm1 - movdqu 32(%r8, %rsi), %xmm2 - movdqu 48(%r8, %rsi), %xmm3 - movdqa %xmm0, (%r8) - movdqa %xmm1, 16(%r8) - movdqa %xmm2, 32(%r8) - movdqa %xmm3, 48(%r8) - jmp L(return) - -L(large_page): - movdqu (%rsi), %xmm0 - movdqu 16(%rsi), %xmm1 - movdqu 32(%rsi), %xmm2 - movdqu 48(%rsi), %xmm3 - movdqu -64(%rsi, %rdx), %xmm4 - movdqu -48(%rsi, %rdx), %xmm5 - movdqu -32(%rsi, %rdx), %xmm6 - movdqu -16(%rsi, %rdx), %xmm7 - movdqu %xmm0, (%rdi) - movdqu %xmm1, 16(%rdi) - movdqu %xmm2, 32(%rdi) - movdqu %xmm3, 48(%rdi) - movdqu %xmm4, -64(%rdi, %rdx) - movdqu %xmm5, -48(%rdi, %rdx) - movdqu %xmm6, -32(%rdi, %rdx) - movdqu %xmm7, -16(%rdi, %rdx) - - movdqu 64(%rsi), %xmm0 - movdqu 80(%rsi), %xmm1 - movdqu 96(%rsi), %xmm2 - movdqu 112(%rsi), %xmm3 - movdqu -128(%rsi, %rdx), %xmm4 - movdqu -112(%rsi, %rdx), %xmm5 - movdqu -96(%rsi, %rdx), %xmm6 - movdqu -80(%rsi, %rdx), %xmm7 - movdqu %xmm0, 64(%rdi) - movdqu %xmm1, 80(%rdi) - movdqu %xmm2, 96(%rdi) - movdqu %xmm3, 112(%rdi) - movdqu %xmm4, -128(%rdi, %rdx) - movdqu %xmm5, -112(%rdi, %rdx) - movdqu %xmm6, -96(%rdi, %rdx) - movdqu %xmm7, -80(%rdi, %rdx) - -/* Now the main loop with non temporal stores. We align - the address of the destination. 
*/ - lea 128(%rdi), %r8 - and $-128, %r8 - - add %rdi, %rdx - and $-128, %rdx - - sub %rdi, %rsi - - .p2align 4 -L(main_loop_large_page): - movdqu (%r8, %rsi), %xmm0 - movdqu 16(%r8, %rsi), %xmm1 - movdqu 32(%r8, %rsi), %xmm2 - movdqu 48(%r8, %rsi), %xmm3 - movdqu 64(%r8, %rsi), %xmm4 - movdqu 80(%r8, %rsi), %xmm5 - movdqu 96(%r8, %rsi), %xmm6 - movdqu 112(%r8, %rsi), %xmm7 - movntdq %xmm0, (%r8) - movntdq %xmm1, 16(%r8) - movntdq %xmm2, 32(%r8) - movntdq %xmm3, 48(%r8) - movntdq %xmm4, 64(%r8) - movntdq %xmm5, 80(%r8) - movntdq %xmm6, 96(%r8) - movntdq %xmm7, 112(%r8) - lea 128(%r8), %r8 - cmp %r8, %rdx - jne L(main_loop_large_page) - sfence - jmp L(return) - -L(len_0_16_bytes): - testb $24, %dl - jne L(len_9_16_bytes) - testb $4, %dl - .p2align 4,,5 - jne L(len_5_8_bytes) - test %rdx, %rdx - .p2align 4,,2 - je L(return) - movzbl (%rsi), %ebx - testb $2, %dl - movb %bl, (%rdi) - je L(return) - movzwl -2(%rsi,%rdx), %ebx - movw %bx, -2(%rdi,%rdx) - jmp L(return) - -L(len_9_16_bytes): - movq (%rsi), %xmm0 - movq -8(%rsi, %rdx), %xmm1 - movq %xmm0, (%rdi) - movq %xmm1, -8(%rdi, %rdx) - jmp L(return) - -L(len_5_8_bytes): - movl (%rsi), %ebx - movl %ebx, (%rdi) - movl -4(%rsi,%rdx), %ebx - movl %ebx, -4(%rdi,%rdx) - jmp L(return) - -L(return): - mov %rdi, %rax - RETURN - -END (MEMCPY) diff --git a/libc/arch-x86_64/string/sse2-memmove-slm.S b/libc/arch-x86_64/string/sse2-memmove-slm.S index 6a5afd610..739502888 100644 --- a/libc/arch-x86_64/string/sse2-memmove-slm.S +++ b/libc/arch-x86_64/string/sse2-memmove-slm.S @@ -67,6 +67,12 @@ name: \ cfi_startproc #endif +#ifndef ALIAS_SYMBOL +# define ALIAS_SYMBOL(alias, original) \ + .globl alias; \ + .equ alias, original +#endif + #ifndef END # define END(name) \ cfi_endproc; \ @@ -508,3 +514,5 @@ L(mm_large_page_loop_backward): jmp L(mm_recalc_len) END (MEMMOVE) + +ALIAS_SYMBOL(memcpy, MEMMOVE) |
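
The alias itself is created in the assembly sources with the ALIAS_SYMBOL macro added by this change (.globl alias; .equ alias, original), so memcpy becomes a second global name for memmove's entry point. A rough C-level analogue, for illustration only, uses the GCC/Clang alias attribute (a GNU extension); my_memmove and my_memcpy are hypothetical names, not bionic symbols.

    /*
     * Hypothetical illustration of symbol aliasing (GNU C extension, ELF
     * targets): my_memcpy resolves to the same code address as my_memmove,
     * which is what ALIAS_SYMBOL(memcpy, memmove) achieves in assembly.
     */
    #include <assert.h>
    #include <stddef.h>
    #include <string.h>

    void *my_memmove(void *dst, const void *src, size_t n) {
        return memmove(dst, src, n);
    }

    /* Declaration only; the alias attribute binds it to my_memmove's body. */
    void *my_memcpy(void *dst, const void *src, size_t n)
            __attribute__((alias("my_memmove")));

    int main(void) {
        char a[4] = "abc", b[4];
        assert(my_memcpy == my_memmove);   /* both names, one code address */
        my_memcpy(b, a, 4);
        assert(b[2] == 'c');
        return 0;
    }
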