diff options
author | Mika Lindqvist <postmaster@raasu.org> | 2021-06-19 00:10:44 +0300 |
---|---|---|
committer | Hans Kristian Rosbach <hk-git@circlestorm.org> | 2021-06-21 11:34:28 +0200 |
commit | 089b2e7065b73cebb1f75099997d8f0c2194759d (patch) | |
tree | e2578868fbcc8072e63842972ad96affec55ad4f | |
parent | 61a5776eb6538efb08c50a74a2e723f678980d99 (diff) |
[chunkset_neon] Use vdupq_n_u64.
* Using vdupq_n_u64 duplicates the unsigned 64-bit integer to two consecutive aligned memory locations on the stack, so the compiler can use wider load instructions.
All the different-sized general-purpose registers overlay each other on ARM/AArch64, so any vector cast is a no-op in assembly.
-rw-r--r-- | arch/arm/chunkset_neon.c | 4 |
1 files changed, 3 insertions, 1 deletions
diff --git a/arch/arm/chunkset_neon.c b/arch/arm/chunkset_neon.c
index b1fcb24..e0ad3e0 100644
--- a/arch/arm/chunkset_neon.c
+++ b/arch/arm/chunkset_neon.c
@@ -37,7 +37,9 @@ static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
 }
 
 static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
-    *chunk = vcombine_u8(vld1_u8(from), vld1_u8(from));
+    uint64_t tmp;
+    memcpy(&tmp, from, 8);
+    *chunk = vreinterpretq_u8_u64(vdupq_n_u64(tmp));
 }
 
 #define CHUNKSIZE chunksize_neon