summary | refs | log | tree | commit | diff
path: root/arch
diff options
context:
space:
mode:
author Mika Lindqvist <postmaster@raasu.org> 2020-09-14 18:40:35 +0300
committer Hans Kristian Rosbach <hk-github@circlestorm.org> 2020-09-19 09:52:01 +0200
commit 6575fbffea85fcfabc8644bdf21043e1fb408acb (patch)
tree 085dff84506f3cff36c7fa68d359c668b219a66e /arch
parent 6539b769e677bc1e6f4954d4457aba0cd138468b (diff)
Remove chunkmemset_3 and chunkmemset_6 on ARM/AArch64 as they need 3 chunks...
* Don't unroll distances smaller than chunk size.
Diffstat (limited to 'arch')
-rw-r--r-- arch/arm/chunkset_neon.c 72
1 file changed, 0 insertions, 72 deletions
diff --git a/arch/arm/chunkset_neon.c b/arch/arm/chunkset_neon.c
index f2da73a..e9cbcb1 100644
--- a/arch/arm/chunkset_neon.c
+++ b/arch/arm/chunkset_neon.c
@@ -15,7 +15,6 @@ typedef uint8x16_t chunk_t;
#define HAVE_CHUNKMEMSET_1
#define HAVE_CHUNKMEMSET_2
-#define HAVE_CHUNKMEMSET_3
#define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8
@@ -42,77 +41,6 @@ static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
#define CHUNKMEMSET chunkmemset_neon
#define CHUNKMEMSET_SAFE chunkmemset_safe_neon
-uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len);
-uint8_t* CHUNKUNROLL(uint8_t *out, unsigned *dist, unsigned *len);
-
-static inline uint8_t *chunkmemset_3(uint8_t *out, uint8_t *from, unsigned dist, unsigned len) {
- uint8x8x3_t chunks;
- unsigned sz = sizeof(chunks);
- if (len < sz) {
- out = CHUNKUNROLL(out, &dist, &len);
- return CHUNKCOPY(out, out - dist, len);
- }
-
- /* Load 3 bytes 'a,b,c' from FROM and duplicate across all lanes:
- chunks[0] = {a,a,a,a,a,a,a,a}
- chunks[1] = {b,b,b,b,b,b,b,b}
- chunks[2] = {c,c,c,c,c,c,c,c}. */
- chunks = vld3_dup_u8(from);
-
- unsigned rem = len % sz;
- len -= rem;
- while (len) {
- /* Store "a,b,c, ..., a,b,c". */
- vst3_u8(out, chunks);
- out += sz;
- len -= sz;
- }
-
- if (!rem)
- return out;
-
- /* Last, deal with the case when LEN is not a multiple of SZ. */
- out = CHUNKUNROLL(out, &dist, &rem);
- return CHUNKCOPY(out, out - dist, rem);
-}
-
-#if defined(__aarch64__) || defined(_M_ARM64)
-
-#define HAVE_CHUNKMEMSET_6
-
-static inline uint8_t *chunkmemset_6(uint8_t *out, uint8_t *from, unsigned dist, unsigned len) {
- uint16x8x3_t chunks;
- unsigned sz = sizeof(chunks);
- if (len < sz) {
- out = CHUNKUNROLL(out, &dist, &len);
- return CHUNKCOPY(out, out - dist, len);
- }
-
- /* Load 6 bytes 'ab,cd,ef' from FROM and duplicate across all lanes:
- chunks[0] = {ab,ab,ab,ab,ab,ab,ab,ab}
- chunks[1] = {cd,cd,cd,cd,cd,cd,cd,cd}
- chunks[2] = {ef,ef,ef,ef,ef,ef,ef,ef}. */
- chunks = vld3q_dup_u16((unsigned short *)from);
-
- unsigned rem = len % sz;
- len -= rem;
- while (len) {
- /* Store "ab,cd,ef, ..., ab,cd,ef". */
- vst3q_u16((unsigned short *)out, chunks);
- out += sz;
- len -= sz;
- }
-
- if (!rem)
- return out;
-
- /* Last, deal with the case when LEN is not a multiple of SZ. */
- out = CHUNKUNROLL(out, &dist, &rem);
- return CHUNKCOPY(out, out - dist, rem);
-}
-
-#endif
-
/* Load one chunk from memory into *chunk.
 *
 * s:     source address to read from (read-only).
 * chunk: destination; receives the vector returned by vld1q_u8(s).
 *
 * chunk_t is uint8x16_t in this file, so a single call fills one
 * 16-lane vector of bytes starting at s. */
static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
*chunk = vld1q_u8(s);
}