summary | refs | log | tree | commit | diff
path: root/adler32.c
diff options
context:
space:
mode:
author: Adenilson Cavalcanti <cavalcantii@gmail.com> 2017-04-22 02:41:47 -0700
committer: Hans Kristian Rosbach <hk-github@circlestorm.org> 2017-04-22 11:41:47 +0200
commit: ec02ecf104e1d3f1836a908a359f20aa93494df5 (patch)
tree: f4981f964863a2037cdce45243b99976b8a7971e /adler32.c
parent: 9487690991eff131112153813e557572100ae92d (diff)
Implementing NEON-ized Adler32 checksum (#102)
The checksum is calculated over the uncompressed PNG data and can be made much faster by using SIMD. Tests on ARMv8 yielded an improvement of about 3x (e.g. wall time dropped from 350 ms to 125 ms for 4096x4096 bytes of data, executed 30 times). This yields an improvement in image decoding in Chromium of around 18% (see https://bugs.chromium.org/p/chromium/issues/detail?id=688601).
Diffstat (limited to 'adler32.c')
-rw-r--r-- adler32.c 8
1 files changed, 8 insertions, 0 deletions
diff --git a/adler32.c b/adler32.c
index 0da5dee..75c7233 100644
--- a/adler32.c
+++ b/adler32.c
@@ -7,6 +7,10 @@
#include "zutil.h"
+#if (defined(__ARM_NEON__) || defined(__ARM_NEON))
+extern uint32_t adler32_neon(uint32_t adler, const unsigned char *buf, size_t len);
+#endif
+
static uint32_t adler32_combine_(uint32_t adler1, uint32_t adler2, z_off64_t len2);
#define BASE 65521U /* largest prime smaller than 65536 */
@@ -61,6 +65,10 @@ static uint32_t adler32_combine_(uint32_t adler1, uint32_t adler2, z_off64_t len
/* ========================================================================= */
uint32_t ZEXPORT adler32_z(uint32_t adler, const unsigned char *buf, size_t len) {
+#if (defined(__ARM_NEON__) || defined(__ARM_NEON))
+ return adler32_neon(adler, buf, len);
+#endif
+
uint32_t sum2;
unsigned n;