diff options
author | Mika Lindqvist <postmaster@raasu.org> | 2020-09-14 18:40:35 +0300 |
---|---|---|
committer | Hans Kristian Rosbach <hk-github@circlestorm.org> | 2020-09-19 09:52:01 +0200 |
commit | 6575fbffea85fcfabc8644bdf21043e1fb408acb (patch) | |
tree | 085dff84506f3cff36c7fa68d359c668b219a66e /arch | |
parent | 6539b769e677bc1e6f4954d4457aba0cd138468b (diff) |
Remove chunkmemset_3 and chunkmemset_6 on ARM/AArch64 as they need 3 chunks...
* Don't unroll distances smaller than chunk size.
Diffstat (limited to 'arch')
-rw-r--r-- | arch/arm/chunkset_neon.c | 72 |
1 file changed, 0 insertions, 72 deletions
diff --git a/arch/arm/chunkset_neon.c b/arch/arm/chunkset_neon.c index f2da73a..e9cbcb1 100644 --- a/arch/arm/chunkset_neon.c +++ b/arch/arm/chunkset_neon.c @@ -15,7 +15,6 @@ typedef uint8x16_t chunk_t; #define HAVE_CHUNKMEMSET_1 #define HAVE_CHUNKMEMSET_2 -#define HAVE_CHUNKMEMSET_3 #define HAVE_CHUNKMEMSET_4 #define HAVE_CHUNKMEMSET_8 @@ -42,77 +41,6 @@ static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) { #define CHUNKMEMSET chunkmemset_neon #define CHUNKMEMSET_SAFE chunkmemset_safe_neon -uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len); -uint8_t* CHUNKUNROLL(uint8_t *out, unsigned *dist, unsigned *len); - -static inline uint8_t *chunkmemset_3(uint8_t *out, uint8_t *from, unsigned dist, unsigned len) { - uint8x8x3_t chunks; - unsigned sz = sizeof(chunks); - if (len < sz) { - out = CHUNKUNROLL(out, &dist, &len); - return CHUNKCOPY(out, out - dist, len); - } - - /* Load 3 bytes 'a,b,c' from FROM and duplicate across all lanes: - chunks[0] = {a,a,a,a,a,a,a,a} - chunks[1] = {b,b,b,b,b,b,b,b} - chunks[2] = {c,c,c,c,c,c,c,c}. */ - chunks = vld3_dup_u8(from); - - unsigned rem = len % sz; - len -= rem; - while (len) { - /* Store "a,b,c, ..., a,b,c". */ - vst3_u8(out, chunks); - out += sz; - len -= sz; - } - - if (!rem) - return out; - - /* Last, deal with the case when LEN is not a multiple of SZ. */ - out = CHUNKUNROLL(out, &dist, &rem); - return CHUNKCOPY(out, out - dist, rem); -} - -#if defined(__aarch64__) || defined(_M_ARM64) - -#define HAVE_CHUNKMEMSET_6 - -static inline uint8_t *chunkmemset_6(uint8_t *out, uint8_t *from, unsigned dist, unsigned len) { - uint16x8x3_t chunks; - unsigned sz = sizeof(chunks); - if (len < sz) { - out = CHUNKUNROLL(out, &dist, &len); - return CHUNKCOPY(out, out - dist, len); - } - - /* Load 6 bytes 'ab,cd,ef' from FROM and duplicate across all lanes: - chunks[0] = {ab,ab,ab,ab,ab,ab,ab,ab} - chunks[1] = {cd,cd,cd,cd,cd,cd,cd,cd} - chunks[2] = {ef,ef,ef,ef,ef,ef,ef,ef}. 
*/ - chunks = vld3q_dup_u16((unsigned short *)from); - - unsigned rem = len % sz; - len -= rem; - while (len) { - /* Store "ab,cd,ef, ..., ab,cd,ef". */ - vst3q_u16((unsigned short *)out, chunks); - out += sz; - len -= sz; - } - - if (!rem) - return out; - - /* Last, deal with the case when LEN is not a multiple of SZ. */ - out = CHUNKUNROLL(out, &dist, &rem); - return CHUNKCOPY(out, out - dist, rem); -} - -#endif - static inline void loadchunk(uint8_t const *s, chunk_t *chunk) { *chunk = vld1q_u8(s); } |