summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Mika Lindqvist <postmaster@raasu.org> 2021-06-19 00:10:44 +0300
committer Hans Kristian Rosbach <hk-git@circlestorm.org> 2021-06-21 11:34:28 +0200
commit 089b2e7065b73cebb1f75099997d8f0c2194759d (patch)
tree e2578868fbcc8072e63842972ad96affec55ad4f
parent 61a5776eb6538efb08c50a74a2e723f678980d99 (diff)
[chunkset_neon] Use vdupq_n_u64.
* Using vdupq_n_u64 duplicates the unsigned 64-bit integer into two consecutive aligned memory locations on the stack, so the compiler can use wider load instructions. All the different-sized general-purpose registers overlay one another on ARM/AArch64, so any vector cast is a no-op in assembly.
-rw-r--r-- arch/arm/chunkset_neon.c 4
1 files changed, 3 insertions, 1 deletions
diff --git a/arch/arm/chunkset_neon.c b/arch/arm/chunkset_neon.c
index b1fcb24..e0ad3e0 100644
--- a/arch/arm/chunkset_neon.c
+++ b/arch/arm/chunkset_neon.c
@@ -37,7 +37,9 @@ static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
}
static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
- *chunk = vcombine_u8(vld1_u8(from), vld1_u8(from));
+ uint64_t tmp;
+ memcpy(&tmp, from, 8);
+ *chunk = vreinterpretq_u8_u64(vdupq_n_u64(tmp));
}
#define CHUNKSIZE chunksize_neon