diff options
author | Nathan Moinvaziri <nathan@nathanm.com> | 2020-02-09 19:59:01 -0800 |
---|---|---|
committer | Hans Kristian Rosbach <hk-github@circlestorm.org> | 2020-05-01 00:21:18 +0200 |
commit | e09d131b5abbccf97532afb17c3da92cd0fc6f00 (patch) | |
tree | b9dd1cb744c931c743080ef9f2b20dfc02f489d1 | |
parent | 343596fc98a0d7a6283dbe5d25abefca18439184 (diff) |
Standardize fill_window implementations and abstract out slide_hash_neon for ARM.
-rw-r--r-- | CMakeLists.txt | 16 | ||||
-rw-r--r-- | arch/arm/Makefile.in | 10 | ||||
-rw-r--r-- | arch/arm/fill_window_arm.c | 167 | ||||
-rw-r--r-- | arch/arm/slide_neon.c | 48 | ||||
-rw-r--r-- | arch/x86/Makefile.in | 8 | ||||
-rw-r--r-- | arch/x86/README.md | 2 | ||||
-rw-r--r-- | arch/x86/deflate_quick.c | 3 | ||||
-rw-r--r-- | arch/x86/fill_window_sse.c | 155 | ||||
-rwxr-xr-x | configure | 49 | ||||
-rw-r--r-- | crc32.c | 1 | ||||
-rw-r--r-- | deflate.c | 30 | ||||
-rw-r--r-- | deflate.h | 2 | ||||
-rw-r--r-- | deflate_fast.c | 2 | ||||
-rw-r--r-- | deflate_medium.c | 2 | ||||
-rw-r--r-- | deflate_slow.c | 2 | ||||
-rw-r--r-- | fallback_builtins.h | 1 | ||||
-rw-r--r-- | functable.c | 61 | ||||
-rw-r--r-- | functable.h | 1 | ||||
-rw-r--r-- | inflate.c | 2 | ||||
-rw-r--r-- | win32/Makefile.a64 | 4 | ||||
-rw-r--r-- | win32/Makefile.arm | 4 | ||||
-rw-r--r-- | win32/Makefile.msc | 2 | ||||
-rw-r--r-- | zutil.h | 2 |
23 files changed, 150 insertions, 424 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index 6fb57e1..32eb662 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -624,16 +624,19 @@ endif() if(WITH_OPTIM) if(BASEARCH_ARM_FOUND) - add_definitions(-DARM_GETAUXVAL) - list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/armfeature.c ${ARCHDIR}/fill_window_arm.c) + add_definitions(-DARM_CPUID) + list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/arm.h) + list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/armfeature.c) if(WITH_NEON) - list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/adler32_neon.c) - add_definitions(-DARM_NEON_ADLER32) + list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/adler32_neon.h) + list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/adler32_neon.c ${ARCHDIR}/slide_neon.c) + add_definitions(-DARM_NEON_ADLER32 -DARM_NEON_SLIDEHASH) add_intrinsics_option("${NEONFLAG}") if(MSVC) add_definitions(-D__ARM_NEON__) endif() - add_feature_info(NEON_FILLWINDOW 1 "Support NEON instructions in fill_window_arm, using \"${NEONFLAG}\"") + add_feature_info(NEON_ALDER32 1 "Support NEON instructions in adler32, using \"${NEONFLAG}\"") + add_feature_info(NEON_SLIDEHASH 1 "Support NEON instructions in slide_hash, using \"${NEONFLAG}\"") endif() if(WITH_ACLE) list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/crc32_acle.c ${ARCHDIR}/insert_string_acle.c) @@ -659,6 +662,7 @@ if(WITH_OPTIM) endif() elseif(BASEARCH_X86_FOUND) add_definitions(-DX86_CPUID) + list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/x86.h) list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/x86.c) if(MSVC) list(APPEND ZLIB_ARCH_HDRS fallback_builtins.h) @@ -685,7 +689,7 @@ if(WITH_OPTIM) endif() if(WITH_SSE2 AND HAVE_SSE2_INTRIN) add_definitions(-DX86_SSE2) - list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/fill_window_sse.c ${ARCHDIR}/slide_sse.c) + list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/slide_sse.c) if(NOT ${ARCH} MATCHES "x86_64") add_intrinsics_option("${SSE2FLAG}") add_feature_info(FORCE_SSE2 FORCE_SSE2 "Assume CPU is SSE2 capable") diff --git a/arch/arm/Makefile.in b/arch/arm/Makefile.in index a64d591..9a25482 100644 --- a/arch/arm/Makefile.in +++ b/arch/arm/Makefile.in @@ -12,7 +12,7 @@ SRCDIR=. SRCTOP=../.. TOPDIR=$(SRCTOP) -all: adler32_neon.o adler32_neon.lo armfeature.o armfeature.lo crc32_acle.o crc32_acle.lo fill_window_arm.o fill_window_arm.lo insert_string_acle.o insert_string_acle.lo +all: adler32_neon.o adler32_neon.lo armfeature.o armfeature.lo crc32_acle.o crc32_acle.lo slide_neon.o slide_neon.lo insert_string_acle.o insert_string_acle.lo adler32_neon.o: $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_neon.c @@ -32,11 +32,11 @@ crc32_acle.o: crc32_acle.lo: $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_acle.c -fill_window_arm.o: - $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/fill_window_arm.c +slide_neon.o: + $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_neon.c -fill_window_arm.lo: - $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/fill_window_arm.c +slide_neon.lo: + $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_neon.c insert_string_acle.o: $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_acle.c diff --git a/arch/arm/fill_window_arm.c b/arch/arm/fill_window_arm.c deleted file mode 100644 index 4367451..0000000 --- a/arch/arm/fill_window_arm.c +++ /dev/null @@ -1,167 +0,0 @@ -/* fill_window_arm.c -- Optimized hash table shifting for ARM with support for NEON instructions - * Copyright (C) 2017 Mika T. Lindqvist - * - * Authors: - * Mika T. Lindqvist <postmaster@raasu.org> - * Jun He <jun.he@arm.com> - * - * For conditions of distribution and use, see copyright notice in zlib.h - */ - -#include "../../zbuild.h" -#include "../../deflate.h" -#include "../../deflate_p.h" -#include "../../functable.h" - -extern ZLIB_INTERNAL int read_buf(PREFIX3(stream) *strm, unsigned char *buf, unsigned size); - -#if defined(__ARM_NEON__) || defined(__ARM_NEON) -#include <arm_neon.h> - -/* SIMD version of hash_chain rebase */ -static inline void slide_hash_chain(Pos *table, unsigned int entries, uint16_t window_size) { - register uint16x8_t v, *p; - register size_t n; - - size_t size = entries*sizeof(table[0]); - Assert((size % sizeof(uint16x8_t) * 8 == 0), "hash table size err"); - - Assert(sizeof(Pos) == 2, "Wrong Pos size"); - v = vdupq_n_u16(window_size); - - p = (uint16x8_t *)table; - n = size / (sizeof(uint16x8_t) * 8); - do { - p[0] = vqsubq_u16(p[0], v); - p[1] = vqsubq_u16(p[1], v); - p[2] = vqsubq_u16(p[2], v); - p[3] = vqsubq_u16(p[3], v); - p[4] = vqsubq_u16(p[4], v); - p[5] = vqsubq_u16(p[5], v); - p[6] = vqsubq_u16(p[6], v); - p[7] = vqsubq_u16(p[7], v); - p += 8; - } while (--n); -} -#else -/* generic version for hash rebase */ -static inline void slide_hash_chain(Pos *table, unsigned int entries, uint16_t window_size) { - unsigned int i; - for (i = 0; i < entries; i++) { - table[i] = (table[i] >= window_size) ? (table[i] - window_size) : NIL; - } -} -#endif - -void fill_window_arm(deflate_state *s) { - register unsigned n; - unsigned long more; /* Amount of free space at the end of the window. */ - unsigned int wsize = s->w_size; - - Assert(s->lookahead < MIN_LOOKAHEAD, "already enough lookahead"); - - do { - more = s->window_size - s->lookahead - s->strstart; - - /* If the window is almost full and there is insufficient lookahead, - * move the upper half to the lower one to make room in the upper half. - */ - if (s->strstart >= wsize+MAX_DIST(s)) { - memcpy(s->window, s->window+wsize, wsize); - s->match_start -= wsize; - s->strstart -= wsize; /* we now have strstart >= MAX_DIST */ - s->block_start -= wsize; - - /* Slide the hash table (could be avoided with 32 bit values - at the expense of memory usage). We slide even when level == 0 - to keep the hash table consistent if we switch back to level > 0 - later. (Using level 0 permanently is not an optimal usage of - zlib, so we don't care about this pathological case.) - */ - - slide_hash_chain(s->head, s->hash_size, wsize); - slide_hash_chain(s->prev, wsize, wsize); - more += wsize; - } - if (s->strm->avail_in == 0) - break; - - /* If there was no sliding: - * strstart <= WSIZE+MAX_DIST-1 && lookahead <= MIN_LOOKAHEAD - 1 && - * more == window_size - lookahead - strstart - * => more >= window_size - (MIN_LOOKAHEAD-1 + WSIZE + MAX_DIST-1) - * => more >= window_size - 2*WSIZE + 2 - * In the BIG_MEM or MMAP case (not yet supported), - * window_size == input_size + MIN_LOOKAHEAD && - * strstart + s->lookahead <= input_size => more >= MIN_LOOKAHEAD. - * Otherwise, window_size == 2*WSIZE so more >= 2. - * If there was sliding, more >= WSIZE. So in all cases, more >= 2. - */ - Assert(more >= 2, "more < 2"); - - n = read_buf(s->strm, s->window + s->strstart + s->lookahead, more); - s->lookahead += n; - - /* Initialize the hash value now that we have some input: */ - if (s->lookahead + s->insert >= MIN_MATCH) { - unsigned int str = s->strstart - s->insert; - unsigned int insert_cnt = s->insert; - unsigned int slen; - - s->ins_h = s->window[str]; - - if (UNLIKELY(s->lookahead < MIN_MATCH)) - insert_cnt += s->lookahead - MIN_MATCH; - slen = insert_cnt; - if (str >= (MIN_MATCH - 2)) - { - str += 2 - MIN_MATCH; - insert_cnt += MIN_MATCH - 2; - } - if (insert_cnt > 0) - { - functable.insert_string(s, str, insert_cnt); - s->insert -= slen; - } - } - /* If the whole input has less than MIN_MATCH bytes, ins_h is garbage, - * but this is not important since only literal bytes will be emitted. - */ - } while (s->lookahead < MIN_LOOKAHEAD && s->strm->avail_in != 0); - - /* If the WIN_INIT bytes after the end of the current data have never been - * written, then zero those bytes in order to avoid memory check reports of - * the use of uninitialized (or uninitialised as Julian writes) bytes by - * the longest match routines. Update the high water mark for the next - * time through here. WIN_INIT is set to MAX_MATCH since the longest match - * routines allow scanning to strstart + MAX_MATCH, ignoring lookahead. - */ - if (s->high_water < s->window_size) { - unsigned long curr = s->strstart + (unsigned long)s->lookahead; - unsigned long init; - - if (s->high_water < curr) { - /* Previous high water mark below current data -- zero WIN_INIT - * bytes or up to end of window, whichever is less. - */ - init = s->window_size - curr; - if (init > WIN_INIT) - init = WIN_INIT; - memset(s->window + curr, 0, init); - s->high_water = curr + init; - } else if (s->high_water < curr + WIN_INIT) { - /* High water mark at or above current data, but below current data - * plus WIN_INIT -- zero out to current data plus WIN_INIT, or up - * to end of window, whichever is less. - */ - init = curr + WIN_INIT; - if (init > s->window_size) - init = s->window_size; - init -= s->high_water; - memset(s->window + s->high_water, 0, init); - s->high_water += init; - } - } - - Assert((unsigned long)s->strstart <= s->window_size - MIN_LOOKAHEAD, "not enough room for search"); -} diff --git a/arch/arm/slide_neon.c b/arch/arm/slide_neon.c new file mode 100644 index 0000000..352d5a6 --- /dev/null +++ b/arch/arm/slide_neon.c @@ -0,0 +1,48 @@ +/* slide_neon.c -- Optimized hash table shifting for ARM with support for NEON instructions + * Copyright (C) 2017 Mika T. Lindqvist + * + * Authors: + * Mika T. Lindqvist <postmaster@raasu.org> + * Jun He <jun.he@arm.com> + * + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#if defined(ARM_NEON_SLIDEHASH) +#include <arm_neon.h> +#include "../../zbuild.h" +#include "../../deflate.h" + +/* SIMD version of hash_chain rebase */ +static inline void slide_hash_chain(Pos *table, unsigned int entries, uint16_t window_size) { + register uint16x8_t v, *p; + register size_t n; + + size_t size = entries*sizeof(table[0]); + Assert((size % sizeof(uint16x8_t) * 8 == 0), "hash table size err"); + + Assert(sizeof(Pos) == 2, "Wrong Pos size"); + v = vdupq_n_u16(window_size); + + p = (uint16x8_t *)table; + n = size / (sizeof(uint16x8_t) * 8); + do { + p[0] = vqsubq_u16(p[0], v); + p[1] = vqsubq_u16(p[1], v); + p[2] = vqsubq_u16(p[2], v); + p[3] = vqsubq_u16(p[3], v); + p[4] = vqsubq_u16(p[4], v); + p[5] = vqsubq_u16(p[5], v); + p[6] = vqsubq_u16(p[6], v); + p[7] = vqsubq_u16(p[7], v); + p += 8; + } while (--n); +} + +ZLIB_INTERNAL void slide_hash_neon(deflate_state *s) { + unsigned int wsize = s->w_size; + + slide_hash_chain(s->head, s->hash_size, wsize); + slide_hash_chain(s->prev, wsize, wsize); +} +#endif diff --git a/arch/x86/Makefile.in b/arch/x86/Makefile.in index 187d06f..8da40bf 100644 --- a/arch/x86/Makefile.in +++ b/arch/x86/Makefile.in @@ -17,7 +17,7 @@ SRCDIR=. SRCTOP=../.. TOPDIR=$(SRCTOP) -all: x86.o x86.lo fill_window_sse.o fill_window_sse.lo deflate_quick.o deflate_quick.lo insert_string_sse.o insert_string_sse.lo crc_folding.o crc_folding.lo slide_avx.o slide_avx.lo slide_sse.o slide_sse.lo +all: x86.o x86.lo deflate_quick.o deflate_quick.lo insert_string_sse.o insert_string_sse.lo crc_folding.o crc_folding.lo slide_avx.o slide_avx.lo slide_sse.o slide_sse.lo x86.o: $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/x86.c @@ -25,12 +25,6 @@ x86.o: x86.lo: $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/x86.c -fill_window_sse.o: - $(CC) $(CFLAGS) $(SSE2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/fill_window_sse.c - -fill_window_sse.lo: - $(CC) $(SFLAGS) $(SSE2FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/fill_window_sse.c - deflate_quick.o: $(CC) $(CFLAGS) $(SSE4FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/deflate_quick.c diff --git a/arch/x86/README.md b/arch/x86/README.md index 6d23945..8bf6d08 100644 --- a/arch/x86/README.md +++ b/arch/x86/README.md @@ -3,6 +3,6 @@ Contents |Name|Description| |:-|:-| -|fill_window_sse.c|SSE2 optimized fill_window| |deflate_quick.c|SSE4 optimized deflate strategy for use as level 1| |crc_folding.c|SSE4 + PCLMULQDQ optimized CRC folding implementation| +|slide_sse2.c|SSE2 optimized slide_hash| diff --git a/arch/x86/deflate_quick.c b/arch/x86/deflate_quick.c index 5cbc653..809c081 100644 --- a/arch/x86/deflate_quick.c +++ b/arch/x86/deflate_quick.c @@ -30,7 +30,6 @@ # include <ctype.h> #endif -extern void fill_window_sse(deflate_state *s); extern void flush_pending(PREFIX3(stream) *strm); static inline long compare258(const unsigned char *const src0, const unsigned char *const src1) { @@ -209,7 +208,7 @@ ZLIB_INTERNAL block_state deflate_quick(deflate_state *s, int flush) { } if (s->lookahead < MIN_LOOKAHEAD) { - fill_window_sse(s); + fill_window(s); if (s->lookahead < MIN_LOOKAHEAD && flush == Z_NO_FLUSH) { static_emit_end_block(s, 0); return need_more; diff --git a/arch/x86/fill_window_sse.c b/arch/x86/fill_window_sse.c deleted file mode 100644 index be35f9f..0000000 --- a/arch/x86/fill_window_sse.c +++ /dev/null @@ -1,155 +0,0 @@ -/* - * Fill Window with SSE2-optimized hash shifting - * - * Copyright (C) 2013 Intel Corporation - * Authors: - * Arjan van de Ven <arjan@linux.intel.com> - * Jim Kukunas <james.t.kukunas@linux.intel.com> - * - * For conditions of distribution and use, see copyright notice in zlib.h - */ -#ifdef X86_SSE2 - -#include "../../zbuild.h" -#include <immintrin.h> -#include "../../deflate.h" -#include "../../deflate_p.h" -#include "../../functable.h" - -extern int read_buf(PREFIX3(stream) *strm, unsigned char *buf, unsigned size); -void slide_hash_sse2(deflate_state *s); -#ifdef X86_AVX2 -void slide_hash_avx2(deflate_state *s); -#endif - -ZLIB_INTERNAL void fill_window_sse(deflate_state *s) { - register unsigned n; - unsigned more; /* Amount of free space at the end of the window. */ - unsigned int wsize = s->w_size; - - Assert(s->lookahead < MIN_LOOKAHEAD, "already enough lookahead"); - - do { - more = (unsigned)(s->window_size -(unsigned long)s->lookahead -(unsigned long)s->strstart); - - /* Deal with !@#$% 64K limit: */ - if (sizeof(int) <= 2) { - if (more == 0 && s->strstart == 0 && s->lookahead == 0) { - more = wsize; - - } else if (more == (unsigned)(-1)) { - /* Very unlikely, but possible on 16 bit machine if - * strstart == 0 && lookahead == 1 (input done a byte at time) - */ - more--; - } - } - - /* If the window is almost full and there is insufficient lookahead, - * move the upper half to the lower one to make room in the upper half. - */ - if (s->strstart >= wsize+MAX_DIST(s)) { - memcpy(s->window, s->window+wsize, (unsigned)wsize); - s->match_start = (s->match_start >= wsize) ? s->match_start - wsize : 0; - s->strstart -= wsize; /* we now have strstart >= MAX_DIST */ - s->block_start -= (long) wsize; - - /* Slide the hash table (could be avoided with 32 bit values - at the expense of memory usage). We slide even when level == 0 - to keep the hash table consistent if we switch back to level > 0 - later. (Using level 0 permanently is not an optimal usage of - zlib, so we don't care about this pathological case.) - */ -#ifdef X86_AVX2 - if (x86_cpu_has_avx2) { - slide_hash_avx2(s); - } else -#endif - slide_hash_sse2(s); - more += wsize; - } - if (s->strm->avail_in == 0) break; - - /* If there was no sliding: - * strstart <= WSIZE+MAX_DIST-1 && lookahead <= MIN_LOOKAHEAD - 1 && - * more == window_size - lookahead - strstart - * => more >= window_size - (MIN_LOOKAHEAD-1 + WSIZE + MAX_DIST-1) - * => more >= window_size - 2*WSIZE + 2 - * In the BIG_MEM or MMAP case (not yet supported), - * window_size == input_size + MIN_LOOKAHEAD && - * strstart + s->lookahead <= input_size => more >= MIN_LOOKAHEAD. - * Otherwise, window_size == 2*WSIZE so more >= 2. - * If there was sliding, more >= WSIZE. So in all cases, more >= 2. - */ - Assert(more >= 2, "more < 2"); - - n = read_buf(s->strm, s->window + s->strstart + s->lookahead, more); - s->lookahead += n; - - /* Initialize the hash value now that we have some input: */ - if (s->lookahead + s->insert >= MIN_MATCH) { - unsigned int str = s->strstart - s->insert; - s->ins_h = s->window[str]; - if (str >= 1) - functable.quick_insert_string(s, str + 2 - MIN_MATCH); -#if MIN_MATCH != 3 -#error Call insert_string() MIN_MATCH-3 more times - while (s->insert) { - functable.quick_insert_string(s, str); - str++; - s->insert--; - if (s->lookahead + s->insert < MIN_MATCH) - break; - } -#else - unsigned int count; - if (UNLIKELY(s->lookahead == 1)) { - count = s->insert - 1; - } else { - count = s->insert; - } - functable.insert_string(s, str, count); - s->insert -= count; -#endif - } - /* If the whole input has less than MIN_MATCH bytes, ins_h is garbage, - * but this is not important since only literal bytes will be emitted. - */ - } while (s->lookahead < MIN_LOOKAHEAD && s->strm->avail_in != 0); - - /* If the WIN_INIT bytes after the end of the current data have never been - * written, then zero those bytes in order to avoid memory check reports of - * the use of uninitialized (or uninitialised as Julian writes) bytes by - * the longest match routines. Update the high water mark for the next - * time through here. WIN_INIT is set to MAX_MATCH since the longest match - * routines allow scanning to strstart + MAX_MATCH, ignoring lookahead. - */ - if (s->high_water < s->window_size) { - unsigned long curr = s->strstart + (unsigned long)(s->lookahead); - unsigned long init; - - if (s->high_water < curr) { - /* Previous high water mark below current data -- zero WIN_INIT - * bytes or up to end of window, whichever is less. - */ - init = s->window_size - curr; - if (init > WIN_INIT) - init = WIN_INIT; - memset(s->window + curr, 0, (unsigned)init); - s->high_water = curr + init; - } else if (s->high_water < (unsigned long)curr + WIN_INIT) { - /* High water mark at or above current data, but below current data - * plus WIN_INIT -- zero out to current data plus WIN_INIT, or up - * to end of window, whichever is less. - */ - init = (unsigned long)curr + WIN_INIT - s->high_water; - if (init > s->window_size - s->high_water) - init = s->window_size - s->high_water; - memset(s->window + s->high_water, 0, (unsigned)init); - s->high_water += init; - } - } - - Assert((unsigned long)s->strstart <= s->window_size - MIN_LOOKAHEAD, "not enough room for search"); -} -#endif @@ -1019,8 +1019,8 @@ case "${ARCH}" in if test ${HAVE_SSE2_INTRIN} -eq 1; then CFLAGS="${CFLAGS} -DX86_SSE2" SFLAGS="${SFLAGS} -DX86_SSE2" - ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} fill_window_sse.o slide_sse.o" - ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} fill_window_sse.lo slide_sse.lo" + ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} slide_sse.o" + ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} slide_sse.lo" if test $forcesse2 -eq 1; then CFLAGS="${CFLAGS} -DX86_NOCHECK_SSE2" @@ -1028,7 +1028,6 @@ case "${ARCH}" in fi # Enable deflate_quick at level 1? - # requires SSE2: code uses fill_window_sse if test $without_new_strategies -eq 0; then CFLAGS="${CFLAGS} -DX86_QUICK_STRATEGY" SFLAGS="${SFLAGS} -DX86_QUICK_STRATEGY" @@ -1077,8 +1076,8 @@ case "${ARCH}" in CFLAGS="${CFLAGS} -DX86_CPUID -DX86_SSE2 -DX86_SSE42_CRC_HASH" SFLAGS="${SFLAGS} -DX86_CPUID -DX86_SSE2 -DX86_SSE42_CRC_HASH" - ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} x86.o fill_window_sse.o insert_string_sse.o slide_sse.o" - ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} x86.lo fill_window_sse.lo insert_string_sse.lo slide_sse.lo" + ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} x86.o insert_string_sse.o slide_sse.o" + ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} x86.lo insert_string_sse.lo slide_sse.lo" if test ${HAVE_SSE42CRC_INTRIN} -eq 1; then CFLAGS="${CFLAGS} -DX86_SSE42_CRC_INTRIN" @@ -1116,10 +1115,10 @@ case "${ARCH}" in ARCHDIR=arch/arm if test $without_optimizations -eq 0; then - CFLAGS="${CFLAGS} -DARM_GETAUXVAL" - SFLAGS="${SFLAGS} -DARM_GETAUXVAL" - ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} armfeature.o fill_window_arm.o" - ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} armfeature.lo fill_window_arm.lo" + CFLAGS="${CFLAGS} -DARM_CPUID" + SFLAGS="${SFLAGS} -DARM_CPUID" + ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} armfeature.o" + ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} armfeature.lo" fi @@ -1169,11 +1168,11 @@ case "${ARCH}" in fi if test $buildneon -eq 1; then - CFLAGS="${CFLAGS} -mfpu=neon -DARM_NEON_ADLER32" - SFLAGS="${SFLAGS} -mfpu=neon -DARM_NEON_ADLER32" + CFLAGS="${CFLAGS} -mfpu=neon -DARM_NEON_ADLER32 -DARM_NEON_SLIDEHASH" + SFLAGS="${SFLAGS} -mfpu=neon -DARM_NEON_ADLER32 -DARM_NEON_SLIDEHASH" - ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_neon.o" - ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_neon.lo" + ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_neon.o slide_neon.o" + ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_neon.lo slide_neon.lo" fi fi ;; @@ -1192,11 +1191,11 @@ case "${ARCH}" in SFLAGS="${SFLAGS} -mfpu=neon" fi - CFLAGS="${CFLAGS} -DARM_NEON_ADLER32" - SFLAGS="${SFLAGS} -DARM_NEON_ADLER32" + CFLAGS="${CFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_SLIDEHASH" + SFLAGS="${SFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_SLIDEHASH" - ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_neon.o" - ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_neon.lo" + ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_neon.o slide_neon.o" + ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_neon.lo slide_neon.lo" fi fi ;; @@ -1216,11 +1215,11 @@ case "${ARCH}" in SFLAGS="${SFLAGS} -mfpu=neon" fi - CFLAGS="${CFLAGS} -DARM_NEON_ADLER32" - SFLAGS="${SFLAGS} -DARM_NEON_ADLER32" + CFLAGS="${CFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_SLIDEHASH" + SFLAGS="${SFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_SLIDEHASH" - ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_neon.o" - ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_neon.lo" + ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_neon.o slide_neon.o" + ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_neon.lo slide_neon.lo" fi fi ;; @@ -1239,10 +1238,10 @@ case "${ARCH}" in fi if test $without_optimizations -eq 0; then - CFLAGS="${CFLAGS} -DARM_GETAUXVAL" - SFLAGS="${SFLAGS} -DARM_GETAUXVAL" - ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} armfeature.o fill_window_arm.o" - ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} armfeature.lo fill_window_arm.lo" + CFLAGS="${CFLAGS} -DARM_CPUID" + SFLAGS="${SFLAGS} -DARM_CPUID" + ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} armfeature.o" + ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} armfeature.lo" if test $buildacle -eq 1; then if test $native -eq 0; then @@ -201,6 +201,7 @@ ZLIB_INTERNAL void crc_finalize(deflate_state *const s) { ZLIB_INTERNAL void crc_reset(deflate_state *const s) { #ifdef X86_PCLMULQDQ_CRC + x86_check_features(); if (x86_cpu_has_pclmulqdq) { crc_fold_init(s); return; @@ -265,7 +265,7 @@ int ZEXPORT PREFIX(deflateInit2_)(PREFIX3(stream) *strm, int level, int method, #if defined(X86_CPUID) x86_check_features(); -#elif defined(ARM_GETAUXVAL) +#elif defined(ARM_CPUID) arm_check_features(); #endif @@ -473,14 +473,14 @@ int ZEXPORT PREFIX(deflateSetDictionary)(PREFIX3(stream) *strm, const unsigned c next = strm->next_in; strm->avail_in = dictLength; strm->next_in = (const unsigned char *)dictionary; - functable.fill_window(s); + fill_window(s); while (s->lookahead >= MIN_MATCH) { str = s->strstart; n = s->lookahead - (MIN_MATCH-1); functable.insert_string(s, str, n); s->strstart = str + n; s->lookahead = MIN_MATCH-1; - functable.fill_window(s); + fill_window(s); } s->strstart += s->lookahead; s->block_start = (long)s->strstart; @@ -1246,22 +1246,22 @@ void check_match(deflate_state *s, IPos start, IPos match, int length) { * option -- not supported here). */ -void ZLIB_INTERNAL fill_window_c(deflate_state *s) { +void ZLIB_INTERNAL fill_window(deflate_state *s) { unsigned n; - unsigned more; /* Amount of free space at the end of the window. */ + unsigned long more; /* Amount of free space at the end of the window. */ unsigned int wsize = s->w_size; Assert(s->lookahead < MIN_LOOKAHEAD, "already enough lookahead"); do { - more = (unsigned)(s->window_size -(unsigned long)s->lookahead -(unsigned long)s->strstart); + more = s->window_size - s->lookahead - s->strstart; /* If the window is almost full and there is insufficient lookahead, * move the upper half to the lower one to make room in the upper half. */ if (s->strstart >= wsize+MAX_DIST(s)) { - memcpy(s->window, s->window+wsize, (unsigned)wsize - more); - s->match_start -= wsize; + memcpy(s->window, s->window+wsize, (unsigned)wsize); + s->match_start = (s->match_start >= wsize) ? s->match_start - wsize : 0; s->strstart -= wsize; /* we now have strstart >= MAX_DIST */ s->block_start -= (long) wsize; if (s->insert > s->strstart) @@ -1310,7 +1310,7 @@ void ZLIB_INTERNAL fill_window_c(deflate_state *s) { } else { count = s->insert; } - functable.insert_string(s,str,count); + functable.insert_string(s, str, count); s->insert -= count; #endif } @@ -1327,7 +1327,7 @@ void ZLIB_INTERNAL fill_window_c(deflate_state *s) { * routines allow scanning to strstart + MAX_MATCH, ignoring lookahead. */ if (s->high_water < s->window_size) { - unsigned long curr = s->strstart + (unsigned long)(s->lookahead); + unsigned long curr = s->strstart + (unsigned long)s->lookahead; unsigned long init; if (s->high_water < curr) { @@ -1337,9 +1337,9 @@ void ZLIB_INTERNAL fill_window_c(deflate_state *s) { init = s->window_size - curr; if (init > WIN_INIT) init = WIN_INIT; - memset(s->window + curr, 0, (unsigned)init); + memset(s->window + curr, 0, init); s->high_water = curr + init; - } else if (s->high_water < (unsigned long)curr + WIN_INIT) { + } else if (s->high_water < curr + WIN_INIT) { /* High water mark at or above current data, but below current data * plus WIN_INIT -- zero out to current data plus WIN_INIT, or up * to end of window, whichever is less. @@ -1347,7 +1347,7 @@ void ZLIB_INTERNAL fill_window_c(deflate_state *s) { init = (unsigned long)curr + WIN_INIT - s->high_water; if (init > s->window_size - s->high_water) init = s->window_size - s->high_water; - memset(s->window + s->high_water, 0, (unsigned)init); + memset(s->window + s->high_water, 0, init); s->high_water += init; } } @@ -1562,7 +1562,7 @@ static block_state deflate_rle(deflate_state *s, int flush) { * for the longest run, plus one for the unrolled loop. */ if (s->lookahead <= MAX_MATCH) { - functable.fill_window(s); + fill_window(s); if (s->lookahead <= MAX_MATCH && flush == Z_NO_FLUSH) { return need_more; } @@ -1629,7 +1629,7 @@ static block_state deflate_huff(deflate_state *s, int flush) { for (;;) { /* Make sure that we have a literal to write. */ if (s->lookahead == 0) { - functable.fill_window(s); + fill_window(s); if (s->lookahead == 0) { if (flush == Z_NO_FLUSH) return need_more; @@ -372,7 +372,7 @@ static inline void put_uint32_msb(deflate_state *s, uint32_t dw) { memory checker errors from longest match routines */ -void ZLIB_INTERNAL fill_window_c(deflate_state *s); +void ZLIB_INTERNAL fill_window(deflate_state *s); void ZLIB_INTERNAL slide_hash_c(deflate_state *s); /* in trees.c */ diff --git a/deflate_fast.c b/deflate_fast.c index 5659926..9efda47 100644 --- a/deflate_fast.c +++ b/deflate_fast.c @@ -28,7 +28,7 @@ ZLIB_INTERNAL block_state deflate_fast(deflate_state *s, int flush) { * string following the next match. */ if (s->lookahead < MIN_LOOKAHEAD) { - functable.fill_window(s); + fill_window(s); if (s->lookahead < MIN_LOOKAHEAD && flush == Z_NO_FLUSH) { return need_more; } diff --git a/deflate_medium.c b/deflate_medium.c index 31f8372..4e5d094 100644 --- a/deflate_medium.c +++ b/deflate_medium.c @@ -207,7 +207,7 @@ ZLIB_INTERNAL block_state deflate_medium(deflate_state *s, int flush) { * string following the next current_match. */ if (s->lookahead < MIN_LOOKAHEAD) { - functable.fill_window(s); + fill_window(s); if (s->lookahead < MIN_LOOKAHEAD && flush == Z_NO_FLUSH) { return need_more; } diff --git a/deflate_slow.c b/deflate_slow.c index fb217cc..b8b1598 100644 --- a/deflate_slow.c +++ b/deflate_slow.c @@ -36,7 +36,7 @@ ZLIB_INTERNAL block_state deflate_slow(deflate_state *s, int flush) { * string following the next match. */ if (s->lookahead < MIN_LOOKAHEAD) { - functable.fill_window(s); + fill_window(s); if (s->lookahead < MIN_LOOKAHEAD && flush == Z_NO_FLUSH) { return need_more; } diff --git a/fallback_builtins.h b/fallback_builtins.h index 3598b7c..8bd16ed 100644 --- a/fallback_builtins.h +++ b/fallback_builtins.h @@ -12,6 +12,7 @@ */ static __forceinline unsigned long __builtin_ctzl(unsigned long value) { #ifdef X86_CPUID + x86_check_features(); if (x86_cpu_has_tzcnt) return _tzcnt_u32(value); #endif diff --git a/functable.c b/functable.c index c462a92..8f63450 100644 --- a/functable.c +++ b/functable.c @@ -24,17 +24,14 @@ extern Pos quick_insert_string_sse4(deflate_state *const s, const Pos str); #elif defined(ARM_ACLE_CRC_HASH) extern Pos quick_insert_string_acle(deflate_state *const s, const Pos str); #endif - -/* fill_window */ -#if defined(X86_SSE2) -extern void fill_window_sse(deflate_state *s); -#elif defined(ARM_GETAUXVAL) -extern void fill_window_arm(deflate_state *s); -#endif - /* slide_hash */ #ifdef X86_SSE2 void slide_hash_sse2(deflate_state *s); +#elif defined(ARM_NEON_SLIDEHASH) +void slide_hash_neon(deflate_state *s); +#endif +#ifdef X86_AVX2 +void slide_hash_avx2(deflate_state *s); #endif /* adler32 */ @@ -65,14 +62,12 @@ extern uint32_t crc32_big(uint32_t, const unsigned char *, uint64_t); /* stub definitions */ ZLIB_INTERNAL Pos insert_string_stub(deflate_state *const s, const Pos str, unsigned int count); ZLIB_INTERNAL Pos quick_insert_string_stub(deflate_state *const s, const Pos str); -ZLIB_INTERNAL void fill_window_stub(deflate_state *s); ZLIB_INTERNAL uint32_t adler32_stub(uint32_t adler, const unsigned char *buf, size_t len); ZLIB_INTERNAL uint32_t crc32_stub(uint32_t crc, const unsigned char *buf, uint64_t len); ZLIB_INTERNAL void slide_hash_stub(deflate_state *s); /* functable init */ ZLIB_INTERNAL __thread struct functable_s functable = { - fill_window_stub, insert_string_stub, quick_insert_string_stub, adler32_stub, @@ -80,11 +75,25 @@ ZLIB_INTERNAL __thread struct functable_s functable = { slide_hash_stub }; +ZLIB_INTERNAL void cpu_check_features(void) +{ + static int features_checked = 0; + if (features_checked) + return; +#ifdef X86_CPUID + x86_check_features(); +#elif ARM_CPUID + arm_check_features(); +#endif + features_checked = 1; +} /* stub functions */ ZLIB_INTERNAL Pos insert_string_stub(deflate_state *const s, const Pos str, unsigned int count) { // Initialize default + functable.insert_string = &insert_string_c; + cpu_check_features(); #ifdef X86_SSE42_CRC_HASH if (x86_cpu_has_sse42) @@ -111,31 +120,23 @@ ZLIB_INTERNAL Pos quick_insert_string_stub(deflate_state *const s, const Pos str return functable.quick_insert_string(s, str); } -ZLIB_INTERNAL void fill_window_stub(deflate_state *s) { - // Initialize default - functable.fill_window = &fill_window_c; - -#if defined(X86_SSE2) -# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2) - if (x86_cpu_has_sse2) -# endif - functable.fill_window = &fill_window_sse; -#elif defined(ARM_GETAUXVAL) - functable.fill_window = &fill_window_arm; -#endif - - functable.fill_window(s); -} - ZLIB_INTERNAL void slide_hash_stub(deflate_state *s) { - // Initialize default + functable.slide_hash = &slide_hash_c; + cpu_check_features(); #ifdef X86_SSE2 -# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2) +# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2) if (x86_cpu_has_sse2) -# endif +# endif functable.slide_hash = &slide_hash_sse2; +#elif defined(ARM_NEON_SLIDEHASH) + if (arm_cpu_has_neon) + functable.slide_hash = &slide_hash_neon; +#endif +#ifdef X86_AVX2 + if (x86_cpu_has_avx2) + functable.slide_hash = &slide_hash_avx2; #endif functable.slide_hash(s); @@ -144,6 +145,7 @@ ZLIB_INTERNAL void slide_hash_stub(deflate_state *s) { ZLIB_INTERNAL uint32_t adler32_stub(uint32_t adler, const unsigned char *buf, size_t len) { // Initialize default functable.adler32 = &adler32_c; + cpu_check_features(); #if (defined(__ARM_NEON__) || defined(__ARM_NEON)) && defined(ARM_NEON_ADLER32) if (arm_cpu_has_neon) @@ -163,6 +165,7 @@ ZLIB_INTERNAL uint32_t crc32_stub(uint32_t crc, const unsigned char *buf, uint64 if (crc_table_empty) make_crc_table(); #endif /* DYNAMIC_CRC_TABLE */ + cpu_check_features(); if (sizeof(void *) == sizeof(ptrdiff_t)) { #if BYTE_ORDER == LITTLE_ENDIAN diff --git a/functable.h b/functable.h index 98e068a..a03c1e4 100644 --- a/functable.h +++ b/functable.h @@ -9,7 +9,6 @@ #include "deflate.h" struct functable_s { - void (* fill_window) (deflate_state *s); Pos (* insert_string) (deflate_state *const s, const Pos str, unsigned int count); Pos (* quick_insert_string)(deflate_state *const s, const Pos str); uint32_t (* adler32) (uint32_t adler, const unsigned char *buf, size_t len); @@ -133,7 +133,7 @@ int ZEXPORT PREFIX(inflateInit2_)(PREFIX3(stream) *strm, int windowBits, const c #if defined(X86_CPUID) x86_check_features(); -#elif defined(ARM_GETAUXVAL) +#elif defined(ARM_CPUID) arm_check_features(); #endif diff --git a/win32/Makefile.a64 b/win32/Makefile.a64 index 28a41d8..94e7f57 100644 --- a/win32/Makefile.a64 +++ b/win32/Makefile.a64 @@ -50,8 +50,8 @@ RCFILE = zlib-ng1.rc RESFILE = zlib-ng1.res SUFFIX = -ng !endif -WFLAGS = $(WFLAGS) -DARM_ACLE_CRC_HASH -D__ARM_NEON__=1 -DARM_NEON_ADLER32 -DARM_NOCHECK_NEON -OBJS = $(OBJS) crc32_acle.obj insert_string_acle.obj adler32_neon.obj +WFLAGS = $(WFLAGS) -DARM_ACLE_CRC_HASH -D__ARM_NEON__=1 -DARM_NEON_ADLER32 -DARM_NEON_SLIDEHASH -DARM_NOCHECK_NEON +OBJS = $(OBJS) crc32_acle.obj insert_string_acle.obj slide_neon.obj .adler32_neon.obj # targets all: $(STATICLIB) $(SHAREDLIB) $(IMPLIB) \ diff --git a/win32/Makefile.arm b/win32/Makefile.arm index bd04f0f..2c3a82d 100644 --- a/win32/Makefile.arm +++ b/win32/Makefile.arm @@ -63,8 +63,8 @@ NEON_ARCH = /arch:VFPv3 !endif !if "$(WITH_NEON)" != "" CFLAGS = $(CFLAGS) $(NEON_ARCH) -WFLAGS = $(WFLAGS) -D__ARM_NEON__=1 -DARM_NEON_ADLER32 -DARM_NOCHECK_NEON -OBJS = $(OBJS) adler32_neon.obj +WFLAGS = $(WFLAGS) -D__ARM_NEON__=1 -DARM_NEON_ADLER32 -DARM_NEON_SLIDEHASH -DARM_NOCHECK_NEON +OBJS = $(OBJS) adler32_neon.obj slide_neon.obj !endif # targets diff --git a/win32/Makefile.msc b/win32/Makefile.msc index 687b083..9f9cd31 100644 --- a/win32/Makefile.msc +++ b/win32/Makefile.msc @@ -38,7 +38,7 @@ OBJS = adler32.obj compress.obj crc32.obj deflate.obj deflate_fast.obj deflate_q deflate_medium.obj \ functable.obj infback.obj inflate.obj inftrees.obj inffast.obj insert_string.obj \ slide_avx.obj slide_sse.obj trees.obj uncompr.obj zutil.obj \ - x86.obj fill_window_sse.obj insert_string_sse.obj crc_folding.obj + x86.obj insert_string_sse.obj crc_folding.obj !if "$(ZLIB_COMPAT)" != "" WITH_GZFILEOP = yes WFLAGS = $(WFLAGS) -DZLIB_COMPAT @@ -246,7 +246,7 @@ void ZLIB_INTERNAL zng_cfree(void *opaque, void *ptr); #if defined(X86_CPUID) # include "arch/x86/x86.h" -#elif defined(ARM_GETAUXVAL) +#elif defined(ARM_CPUID) # include "arch/arm/arm.h" #endif |