-rw-r--r--  CMakeLists.txt              16
-rw-r--r--  arch/arm/Makefile.in        10
-rw-r--r--  arch/arm/fill_window_arm.c  167
-rw-r--r--  arch/arm/slide_neon.c       48
-rw-r--r--  arch/x86/Makefile.in        8
-rw-r--r--  arch/x86/README.md          2
-rw-r--r--  arch/x86/deflate_quick.c    3
-rw-r--r--  arch/x86/fill_window_sse.c  155
-rwxr-xr-x  configure                   49
-rw-r--r--  crc32.c                     1
-rw-r--r--  deflate.c                   30
-rw-r--r--  deflate.h                   2
-rw-r--r--  deflate_fast.c              2
-rw-r--r--  deflate_medium.c            2
-rw-r--r--  deflate_slow.c              2
-rw-r--r--  fallback_builtins.h         1
-rw-r--r--  functable.c                 61
-rw-r--r--  functable.h                 1
-rw-r--r--  inflate.c                   2
-rw-r--r--  win32/Makefile.a64          4
-rw-r--r--  win32/Makefile.arm          4
-rw-r--r--  win32/Makefile.msc          2
-rw-r--r--  zutil.h                     2
23 files changed, 150 insertions(+), 424 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6fb57e1..32eb662 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -624,16 +624,19 @@ endif()
if(WITH_OPTIM)
if(BASEARCH_ARM_FOUND)
- add_definitions(-DARM_GETAUXVAL)
- list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/armfeature.c ${ARCHDIR}/fill_window_arm.c)
+ add_definitions(-DARM_CPUID)
+ list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/arm.h)
+ list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/armfeature.c)
if(WITH_NEON)
- list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/adler32_neon.c)
- add_definitions(-DARM_NEON_ADLER32)
+ list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/adler32_neon.h)
+ list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/adler32_neon.c ${ARCHDIR}/slide_neon.c)
+ add_definitions(-DARM_NEON_ADLER32 -DARM_NEON_SLIDEHASH)
add_intrinsics_option("${NEONFLAG}")
if(MSVC)
add_definitions(-D__ARM_NEON__)
endif()
- add_feature_info(NEON_FILLWINDOW 1 "Support NEON instructions in fill_window_arm, using \"${NEONFLAG}\"")
+ add_feature_info(NEON_ADLER32 1 "Support NEON instructions in adler32, using \"${NEONFLAG}\"")
+ add_feature_info(NEON_SLIDEHASH 1 "Support NEON instructions in slide_hash, using \"${NEONFLAG}\"")
endif()
if(WITH_ACLE)
list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/crc32_acle.c ${ARCHDIR}/insert_string_acle.c)
@@ -659,6 +662,7 @@ if(WITH_OPTIM)
endif()
elseif(BASEARCH_X86_FOUND)
add_definitions(-DX86_CPUID)
+ list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/x86.h)
list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/x86.c)
if(MSVC)
list(APPEND ZLIB_ARCH_HDRS fallback_builtins.h)
@@ -685,7 +689,7 @@ if(WITH_OPTIM)
endif()
if(WITH_SSE2 AND HAVE_SSE2_INTRIN)
add_definitions(-DX86_SSE2)
- list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/fill_window_sse.c ${ARCHDIR}/slide_sse.c)
+ list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/slide_sse.c)
if(NOT ${ARCH} MATCHES "x86_64")
add_intrinsics_option("${SSE2FLAG}")
add_feature_info(FORCE_SSE2 FORCE_SSE2 "Assume CPU is SSE2 capable")
diff --git a/arch/arm/Makefile.in b/arch/arm/Makefile.in
index a64d591..9a25482 100644
--- a/arch/arm/Makefile.in
+++ b/arch/arm/Makefile.in
@@ -12,7 +12,7 @@ SRCDIR=.
SRCTOP=../..
TOPDIR=$(SRCTOP)
-all: adler32_neon.o adler32_neon.lo armfeature.o armfeature.lo crc32_acle.o crc32_acle.lo fill_window_arm.o fill_window_arm.lo insert_string_acle.o insert_string_acle.lo
+all: adler32_neon.o adler32_neon.lo armfeature.o armfeature.lo crc32_acle.o crc32_acle.lo slide_neon.o slide_neon.lo insert_string_acle.o insert_string_acle.lo
adler32_neon.o:
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_neon.c
@@ -32,11 +32,11 @@ crc32_acle.o:
crc32_acle.lo:
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_acle.c
-fill_window_arm.o:
- $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/fill_window_arm.c
+slide_neon.o:
+ $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_neon.c
-fill_window_arm.lo:
- $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/fill_window_arm.c
+slide_neon.lo:
+ $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_neon.c
insert_string_acle.o:
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_acle.c
diff --git a/arch/arm/fill_window_arm.c b/arch/arm/fill_window_arm.c
deleted file mode 100644
index 4367451..0000000
--- a/arch/arm/fill_window_arm.c
+++ /dev/null
@@ -1,167 +0,0 @@
-/* fill_window_arm.c -- Optimized hash table shifting for ARM with support for NEON instructions
- * Copyright (C) 2017 Mika T. Lindqvist
- *
- * Authors:
- * Mika T. Lindqvist <postmaster@raasu.org>
- * Jun He <jun.he@arm.com>
- *
- * For conditions of distribution and use, see copyright notice in zlib.h
- */
-
-#include "../../zbuild.h"
-#include "../../deflate.h"
-#include "../../deflate_p.h"
-#include "../../functable.h"
-
-extern ZLIB_INTERNAL int read_buf(PREFIX3(stream) *strm, unsigned char *buf, unsigned size);
-
-#if defined(__ARM_NEON__) || defined(__ARM_NEON)
-#include <arm_neon.h>
-
-/* SIMD version of hash_chain rebase */
-static inline void slide_hash_chain(Pos *table, unsigned int entries, uint16_t window_size) {
- register uint16x8_t v, *p;
- register size_t n;
-
- size_t size = entries*sizeof(table[0]);
- Assert((size % sizeof(uint16x8_t) * 8 == 0), "hash table size err");
-
- Assert(sizeof(Pos) == 2, "Wrong Pos size");
- v = vdupq_n_u16(window_size);
-
- p = (uint16x8_t *)table;
- n = size / (sizeof(uint16x8_t) * 8);
- do {
- p[0] = vqsubq_u16(p[0], v);
- p[1] = vqsubq_u16(p[1], v);
- p[2] = vqsubq_u16(p[2], v);
- p[3] = vqsubq_u16(p[3], v);
- p[4] = vqsubq_u16(p[4], v);
- p[5] = vqsubq_u16(p[5], v);
- p[6] = vqsubq_u16(p[6], v);
- p[7] = vqsubq_u16(p[7], v);
- p += 8;
- } while (--n);
-}
-#else
-/* generic version for hash rebase */
-static inline void slide_hash_chain(Pos *table, unsigned int entries, uint16_t window_size) {
- unsigned int i;
- for (i = 0; i < entries; i++) {
- table[i] = (table[i] >= window_size) ? (table[i] - window_size) : NIL;
- }
-}
-#endif
-
-void fill_window_arm(deflate_state *s) {
- register unsigned n;
- unsigned long more; /* Amount of free space at the end of the window. */
- unsigned int wsize = s->w_size;
-
- Assert(s->lookahead < MIN_LOOKAHEAD, "already enough lookahead");
-
- do {
- more = s->window_size - s->lookahead - s->strstart;
-
- /* If the window is almost full and there is insufficient lookahead,
- * move the upper half to the lower one to make room in the upper half.
- */
- if (s->strstart >= wsize+MAX_DIST(s)) {
- memcpy(s->window, s->window+wsize, wsize);
- s->match_start -= wsize;
- s->strstart -= wsize; /* we now have strstart >= MAX_DIST */
- s->block_start -= wsize;
-
- /* Slide the hash table (could be avoided with 32 bit values
- at the expense of memory usage). We slide even when level == 0
- to keep the hash table consistent if we switch back to level > 0
- later. (Using level 0 permanently is not an optimal usage of
- zlib, so we don't care about this pathological case.)
- */
-
- slide_hash_chain(s->head, s->hash_size, wsize);
- slide_hash_chain(s->prev, wsize, wsize);
- more += wsize;
- }
- if (s->strm->avail_in == 0)
- break;
-
- /* If there was no sliding:
- * strstart <= WSIZE+MAX_DIST-1 && lookahead <= MIN_LOOKAHEAD - 1 &&
- * more == window_size - lookahead - strstart
- * => more >= window_size - (MIN_LOOKAHEAD-1 + WSIZE + MAX_DIST-1)
- * => more >= window_size - 2*WSIZE + 2
- * In the BIG_MEM or MMAP case (not yet supported),
- * window_size == input_size + MIN_LOOKAHEAD &&
- * strstart + s->lookahead <= input_size => more >= MIN_LOOKAHEAD.
- * Otherwise, window_size == 2*WSIZE so more >= 2.
- * If there was sliding, more >= WSIZE. So in all cases, more >= 2.
- */
- Assert(more >= 2, "more < 2");
-
- n = read_buf(s->strm, s->window + s->strstart + s->lookahead, more);
- s->lookahead += n;
-
- /* Initialize the hash value now that we have some input: */
- if (s->lookahead + s->insert >= MIN_MATCH) {
- unsigned int str = s->strstart - s->insert;
- unsigned int insert_cnt = s->insert;
- unsigned int slen;
-
- s->ins_h = s->window[str];
-
- if (UNLIKELY(s->lookahead < MIN_MATCH))
- insert_cnt += s->lookahead - MIN_MATCH;
- slen = insert_cnt;
- if (str >= (MIN_MATCH - 2))
- {
- str += 2 - MIN_MATCH;
- insert_cnt += MIN_MATCH - 2;
- }
- if (insert_cnt > 0)
- {
- functable.insert_string(s, str, insert_cnt);
- s->insert -= slen;
- }
- }
- /* If the whole input has less than MIN_MATCH bytes, ins_h is garbage,
- * but this is not important since only literal bytes will be emitted.
- */
- } while (s->lookahead < MIN_LOOKAHEAD && s->strm->avail_in != 0);
-
- /* If the WIN_INIT bytes after the end of the current data have never been
- * written, then zero those bytes in order to avoid memory check reports of
- * the use of uninitialized (or uninitialised as Julian writes) bytes by
- * the longest match routines. Update the high water mark for the next
- * time through here. WIN_INIT is set to MAX_MATCH since the longest match
- * routines allow scanning to strstart + MAX_MATCH, ignoring lookahead.
- */
- if (s->high_water < s->window_size) {
- unsigned long curr = s->strstart + (unsigned long)s->lookahead;
- unsigned long init;
-
- if (s->high_water < curr) {
- /* Previous high water mark below current data -- zero WIN_INIT
- * bytes or up to end of window, whichever is less.
- */
- init = s->window_size - curr;
- if (init > WIN_INIT)
- init = WIN_INIT;
- memset(s->window + curr, 0, init);
- s->high_water = curr + init;
- } else if (s->high_water < curr + WIN_INIT) {
- /* High water mark at or above current data, but below current data
- * plus WIN_INIT -- zero out to current data plus WIN_INIT, or up
- * to end of window, whichever is less.
- */
- init = curr + WIN_INIT;
- if (init > s->window_size)
- init = s->window_size;
- init -= s->high_water;
- memset(s->window + s->high_water, 0, init);
- s->high_water += init;
- }
- }
-
- Assert((unsigned long)s->strstart <= s->window_size - MIN_LOOKAHEAD, "not enough room for search");
-}
diff --git a/arch/arm/slide_neon.c b/arch/arm/slide_neon.c
new file mode 100644
index 0000000..352d5a6
--- /dev/null
+++ b/arch/arm/slide_neon.c
@@ -0,0 +1,48 @@
+/* slide_neon.c -- Optimized hash table shifting for ARM with support for NEON instructions
+ * Copyright (C) 2017 Mika T. Lindqvist
+ *
+ * Authors:
+ * Mika T. Lindqvist <postmaster@raasu.org>
+ * Jun He <jun.he@arm.com>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#if defined(ARM_NEON_SLIDEHASH)
+#include <arm_neon.h>
+#include "../../zbuild.h"
+#include "../../deflate.h"
+
+/* SIMD version of hash_chain rebase */
+static inline void slide_hash_chain(Pos *table, unsigned int entries, uint16_t window_size) {
+ register uint16x8_t v, *p;
+ register size_t n;
+
+ size_t size = entries*sizeof(table[0]);
+ Assert((size % sizeof(uint16x8_t) * 8 == 0), "hash table size err");
+
+ Assert(sizeof(Pos) == 2, "Wrong Pos size");
+ v = vdupq_n_u16(window_size);
+
+ p = (uint16x8_t *)table;
+ n = size / (sizeof(uint16x8_t) * 8);
+ do {
+ p[0] = vqsubq_u16(p[0], v);
+ p[1] = vqsubq_u16(p[1], v);
+ p[2] = vqsubq_u16(p[2], v);
+ p[3] = vqsubq_u16(p[3], v);
+ p[4] = vqsubq_u16(p[4], v);
+ p[5] = vqsubq_u16(p[5], v);
+ p[6] = vqsubq_u16(p[6], v);
+ p[7] = vqsubq_u16(p[7], v);
+ p += 8;
+ } while (--n);
+}
+
+ZLIB_INTERNAL void slide_hash_neon(deflate_state *s) {
+ unsigned int wsize = s->w_size;
+
+ slide_hash_chain(s->head, s->hash_size, wsize);
+ slide_hash_chain(s->prev, wsize, wsize);
+}
+#endif
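The NEON routine relies on vqsubq_u16, a saturating subtract: entries smaller than wsize clamp to 0 (NIL) instead of wrapping, so the explicit compare used by the generic slide_hash_chain removed from fill_window_arm.c above is unnecessary. A minimal scalar sketch of the same rebase, for illustration only (not the repository's slide_hash_c):

#include <stdint.h>

/* Illustrative scalar equivalent of the NEON rebase above: every hash entry
 * moves down by wsize, and entries that pointed into the discarded half of
 * the window collapse to 0 (NIL), which is what vqsubq_u16 does eight lanes
 * at a time. */
static void slide_hash_scalar(uint16_t *table, unsigned int entries, uint16_t wsize) {
    unsigned int i;
    for (i = 0; i < entries; i++)
        table[i] = (table[i] >= wsize) ? (uint16_t)(table[i] - wsize) : 0;
}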
diff --git a/arch/x86/Makefile.in b/arch/x86/Makefile.in
index 187d06f..8da40bf 100644
--- a/arch/x86/Makefile.in
+++ b/arch/x86/Makefile.in
@@ -17,7 +17,7 @@ SRCDIR=.
SRCTOP=../..
TOPDIR=$(SRCTOP)
-all: x86.o x86.lo fill_window_sse.o fill_window_sse.lo deflate_quick.o deflate_quick.lo insert_string_sse.o insert_string_sse.lo crc_folding.o crc_folding.lo slide_avx.o slide_avx.lo slide_sse.o slide_sse.lo
+all: x86.o x86.lo deflate_quick.o deflate_quick.lo insert_string_sse.o insert_string_sse.lo crc_folding.o crc_folding.lo slide_avx.o slide_avx.lo slide_sse.o slide_sse.lo
x86.o:
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/x86.c
@@ -25,12 +25,6 @@ x86.o:
x86.lo:
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/x86.c
-fill_window_sse.o:
- $(CC) $(CFLAGS) $(SSE2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/fill_window_sse.c
-
-fill_window_sse.lo:
- $(CC) $(SFLAGS) $(SSE2FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/fill_window_sse.c
-
deflate_quick.o:
$(CC) $(CFLAGS) $(SSE4FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/deflate_quick.c
diff --git a/arch/x86/README.md b/arch/x86/README.md
index 6d23945..8bf6d08 100644
--- a/arch/x86/README.md
+++ b/arch/x86/README.md
@@ -3,6 +3,6 @@ Contents
|Name|Description|
|:-|:-|
-|fill_window_sse.c|SSE2 optimized fill_window|
|deflate_quick.c|SSE4 optimized deflate strategy for use as level 1|
|crc_folding.c|SSE4 + PCLMULQDQ optimized CRC folding implementation|
+|slide_sse.c|SSE2 optimized slide_hash|
diff --git a/arch/x86/deflate_quick.c b/arch/x86/deflate_quick.c
index 5cbc653..809c081 100644
--- a/arch/x86/deflate_quick.c
+++ b/arch/x86/deflate_quick.c
@@ -30,7 +30,6 @@
# include <ctype.h>
#endif
-extern void fill_window_sse(deflate_state *s);
extern void flush_pending(PREFIX3(stream) *strm);
static inline long compare258(const unsigned char *const src0, const unsigned char *const src1) {
@@ -209,7 +208,7 @@ ZLIB_INTERNAL block_state deflate_quick(deflate_state *s, int flush) {
}
if (s->lookahead < MIN_LOOKAHEAD) {
- fill_window_sse(s);
+ fill_window(s);
if (s->lookahead < MIN_LOOKAHEAD && flush == Z_NO_FLUSH) {
static_emit_end_block(s, 0);
return need_more;
diff --git a/arch/x86/fill_window_sse.c b/arch/x86/fill_window_sse.c
deleted file mode 100644
index be35f9f..0000000
--- a/arch/x86/fill_window_sse.c
+++ /dev/null
@@ -1,155 +0,0 @@
-/*
- * Fill Window with SSE2-optimized hash shifting
- *
- * Copyright (C) 2013 Intel Corporation
- * Authors:
- * Arjan van de Ven <arjan@linux.intel.com>
- * Jim Kukunas <james.t.kukunas@linux.intel.com>
- *
- * For conditions of distribution and use, see copyright notice in zlib.h
- */
-#ifdef X86_SSE2
-
-#include "../../zbuild.h"
-#include <immintrin.h>
-#include "../../deflate.h"
-#include "../../deflate_p.h"
-#include "../../functable.h"
-
-extern int read_buf(PREFIX3(stream) *strm, unsigned char *buf, unsigned size);
-void slide_hash_sse2(deflate_state *s);
-#ifdef X86_AVX2
-void slide_hash_avx2(deflate_state *s);
-#endif
-
-ZLIB_INTERNAL void fill_window_sse(deflate_state *s) {
- register unsigned n;
- unsigned more; /* Amount of free space at the end of the window. */
- unsigned int wsize = s->w_size;
-
- Assert(s->lookahead < MIN_LOOKAHEAD, "already enough lookahead");
-
- do {
- more = (unsigned)(s->window_size -(unsigned long)s->lookahead -(unsigned long)s->strstart);
-
- /* Deal with !@#$% 64K limit: */
- if (sizeof(int) <= 2) {
- if (more == 0 && s->strstart == 0 && s->lookahead == 0) {
- more = wsize;
-
- } else if (more == (unsigned)(-1)) {
- /* Very unlikely, but possible on 16 bit machine if
- * strstart == 0 && lookahead == 1 (input done a byte at time)
- */
- more--;
- }
- }
-
- /* If the window is almost full and there is insufficient lookahead,
- * move the upper half to the lower one to make room in the upper half.
- */
- if (s->strstart >= wsize+MAX_DIST(s)) {
- memcpy(s->window, s->window+wsize, (unsigned)wsize);
- s->match_start = (s->match_start >= wsize) ? s->match_start - wsize : 0;
- s->strstart -= wsize; /* we now have strstart >= MAX_DIST */
- s->block_start -= (long) wsize;
-
- /* Slide the hash table (could be avoided with 32 bit values
- at the expense of memory usage). We slide even when level == 0
- to keep the hash table consistent if we switch back to level > 0
- later. (Using level 0 permanently is not an optimal usage of
- zlib, so we don't care about this pathological case.)
- */
-#ifdef X86_AVX2
- if (x86_cpu_has_avx2) {
- slide_hash_avx2(s);
- } else
-#endif
- slide_hash_sse2(s);
- more += wsize;
- }
- if (s->strm->avail_in == 0) break;
-
- /* If there was no sliding:
- * strstart <= WSIZE+MAX_DIST-1 && lookahead <= MIN_LOOKAHEAD - 1 &&
- * more == window_size - lookahead - strstart
- * => more >= window_size - (MIN_LOOKAHEAD-1 + WSIZE + MAX_DIST-1)
- * => more >= window_size - 2*WSIZE + 2
- * In the BIG_MEM or MMAP case (not yet supported),
- * window_size == input_size + MIN_LOOKAHEAD &&
- * strstart + s->lookahead <= input_size => more >= MIN_LOOKAHEAD.
- * Otherwise, window_size == 2*WSIZE so more >= 2.
- * If there was sliding, more >= WSIZE. So in all cases, more >= 2.
- */
- Assert(more >= 2, "more < 2");
-
- n = read_buf(s->strm, s->window + s->strstart + s->lookahead, more);
- s->lookahead += n;
-
- /* Initialize the hash value now that we have some input: */
- if (s->lookahead + s->insert >= MIN_MATCH) {
- unsigned int str = s->strstart - s->insert;
- s->ins_h = s->window[str];
- if (str >= 1)
- functable.quick_insert_string(s, str + 2 - MIN_MATCH);
-#if MIN_MATCH != 3
-#error Call insert_string() MIN_MATCH-3 more times
- while (s->insert) {
- functable.quick_insert_string(s, str);
- str++;
- s->insert--;
- if (s->lookahead + s->insert < MIN_MATCH)
- break;
- }
-#else
- unsigned int count;
- if (UNLIKELY(s->lookahead == 1)) {
- count = s->insert - 1;
- } else {
- count = s->insert;
- }
- functable.insert_string(s, str, count);
- s->insert -= count;
-#endif
- }
- /* If the whole input has less than MIN_MATCH bytes, ins_h is garbage,
- * but this is not important since only literal bytes will be emitted.
- */
- } while (s->lookahead < MIN_LOOKAHEAD && s->strm->avail_in != 0);
-
- /* If the WIN_INIT bytes after the end of the current data have never been
- * written, then zero those bytes in order to avoid memory check reports of
- * the use of uninitialized (or uninitialised as Julian writes) bytes by
- * the longest match routines. Update the high water mark for the next
- * time through here. WIN_INIT is set to MAX_MATCH since the longest match
- * routines allow scanning to strstart + MAX_MATCH, ignoring lookahead.
- */
- if (s->high_water < s->window_size) {
- unsigned long curr = s->strstart + (unsigned long)(s->lookahead);
- unsigned long init;
-
- if (s->high_water < curr) {
- /* Previous high water mark below current data -- zero WIN_INIT
- * bytes or up to end of window, whichever is less.
- */
- init = s->window_size - curr;
- if (init > WIN_INIT)
- init = WIN_INIT;
- memset(s->window + curr, 0, (unsigned)init);
- s->high_water = curr + init;
- } else if (s->high_water < (unsigned long)curr + WIN_INIT) {
- /* High water mark at or above current data, but below current data
- * plus WIN_INIT -- zero out to current data plus WIN_INIT, or up
- * to end of window, whichever is less.
- */
- init = (unsigned long)curr + WIN_INIT - s->high_water;
- if (init > s->window_size - s->high_water)
- init = s->window_size - s->high_water;
- memset(s->window + s->high_water, 0, (unsigned)init);
- s->high_water += init;
- }
- }
-
- Assert((unsigned long)s->strstart <= s->window_size - MIN_LOOKAHEAD, "not enough room for search");
-}
-#endif
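With both architecture-specific fill_window variants deleted, the only copy left is the one in deflate.c (renamed from fill_window_c in the hunks below); the part that remains architecture-specific is the hash rebase, dispatched through functable.slide_hash. A hypothetical, simplified sketch of that slide step, assembled from the field updates visible in the deflate.c hunk further down (the helper name and structure are illustrative, not the exact routine):

#include <string.h>
#include "deflate.h"
#include "functable.h"

/* Sketch only: the shape of the window-slide step inside fill_window().
 * functable.slide_hash resolves to slide_hash_sse2, slide_hash_avx2,
 * slide_hash_neon or slide_hash_c, depending on the detected CPU. */
static void slide_window_sketch(deflate_state *s) {
    unsigned int wsize = s->w_size;
    memcpy(s->window, s->window + wsize, wsize);      /* move upper half down */
    s->match_start = (s->match_start >= wsize) ? s->match_start - wsize : 0;
    s->strstart -= wsize;                             /* now strstart >= MAX_DIST */
    s->block_start -= (long)wsize;
    functable.slide_hash(s);                          /* rebase head[] and prev[] */
}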
diff --git a/configure b/configure
index b865684..5177d17 100755
--- a/configure
+++ b/configure
@@ -1019,8 +1019,8 @@ case "${ARCH}" in
if test ${HAVE_SSE2_INTRIN} -eq 1; then
CFLAGS="${CFLAGS} -DX86_SSE2"
SFLAGS="${SFLAGS} -DX86_SSE2"
- ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} fill_window_sse.o slide_sse.o"
- ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} fill_window_sse.lo slide_sse.lo"
+ ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} slide_sse.o"
+ ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} slide_sse.lo"
if test $forcesse2 -eq 1; then
CFLAGS="${CFLAGS} -DX86_NOCHECK_SSE2"
@@ -1028,7 +1028,6 @@ case "${ARCH}" in
fi
# Enable deflate_quick at level 1?
- # requires SSE2: code uses fill_window_sse
if test $without_new_strategies -eq 0; then
CFLAGS="${CFLAGS} -DX86_QUICK_STRATEGY"
SFLAGS="${SFLAGS} -DX86_QUICK_STRATEGY"
@@ -1077,8 +1076,8 @@ case "${ARCH}" in
CFLAGS="${CFLAGS} -DX86_CPUID -DX86_SSE2 -DX86_SSE42_CRC_HASH"
SFLAGS="${SFLAGS} -DX86_CPUID -DX86_SSE2 -DX86_SSE42_CRC_HASH"
- ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} x86.o fill_window_sse.o insert_string_sse.o slide_sse.o"
- ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} x86.lo fill_window_sse.lo insert_string_sse.lo slide_sse.lo"
+ ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} x86.o insert_string_sse.o slide_sse.o"
+ ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} x86.lo insert_string_sse.lo slide_sse.lo"
if test ${HAVE_SSE42CRC_INTRIN} -eq 1; then
CFLAGS="${CFLAGS} -DX86_SSE42_CRC_INTRIN"
@@ -1116,10 +1115,10 @@ case "${ARCH}" in
ARCHDIR=arch/arm
if test $without_optimizations -eq 0; then
- CFLAGS="${CFLAGS} -DARM_GETAUXVAL"
- SFLAGS="${SFLAGS} -DARM_GETAUXVAL"
- ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} armfeature.o fill_window_arm.o"
- ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} armfeature.lo fill_window_arm.lo"
+ CFLAGS="${CFLAGS} -DARM_CPUID"
+ SFLAGS="${SFLAGS} -DARM_CPUID"
+ ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} armfeature.o"
+ ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} armfeature.lo"
fi
@@ -1169,11 +1168,11 @@ case "${ARCH}" in
fi
if test $buildneon -eq 1; then
- CFLAGS="${CFLAGS} -mfpu=neon -DARM_NEON_ADLER32"
- SFLAGS="${SFLAGS} -mfpu=neon -DARM_NEON_ADLER32"
+ CFLAGS="${CFLAGS} -mfpu=neon -DARM_NEON_ADLER32 -DARM_NEON_SLIDEHASH"
+ SFLAGS="${SFLAGS} -mfpu=neon -DARM_NEON_ADLER32 -DARM_NEON_SLIDEHASH"
- ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_neon.o"
- ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_neon.lo"
+ ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_neon.o slide_neon.o"
+ ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_neon.lo slide_neon.lo"
fi
fi
;;
@@ -1192,11 +1191,11 @@ case "${ARCH}" in
SFLAGS="${SFLAGS} -mfpu=neon"
fi
- CFLAGS="${CFLAGS} -DARM_NEON_ADLER32"
- SFLAGS="${SFLAGS} -DARM_NEON_ADLER32"
+ CFLAGS="${CFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_SLIDEHASH"
+ SFLAGS="${SFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_SLIDEHASH"
- ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_neon.o"
- ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_neon.lo"
+ ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_neon.o slide_neon.o"
+ ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_neon.lo slide_neon.lo"
fi
fi
;;
@@ -1216,11 +1215,11 @@ case "${ARCH}" in
SFLAGS="${SFLAGS} -mfpu=neon"
fi
- CFLAGS="${CFLAGS} -DARM_NEON_ADLER32"
- SFLAGS="${SFLAGS} -DARM_NEON_ADLER32"
+ CFLAGS="${CFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_SLIDEHASH"
+ SFLAGS="${SFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_SLIDEHASH"
- ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_neon.o"
- ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_neon.lo"
+ ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_neon.o slide_neon.o"
+ ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_neon.lo slide_neon.lo"
fi
fi
;;
@@ -1239,10 +1238,10 @@ case "${ARCH}" in
fi
if test $without_optimizations -eq 0; then
- CFLAGS="${CFLAGS} -DARM_GETAUXVAL"
- SFLAGS="${SFLAGS} -DARM_GETAUXVAL"
- ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} armfeature.o fill_window_arm.o"
- ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} armfeature.lo fill_window_arm.lo"
+ CFLAGS="${CFLAGS} -DARM_CPUID"
+ SFLAGS="${SFLAGS} -DARM_CPUID"
+ ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} armfeature.o"
+ ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} armfeature.lo"
if test $buildacle -eq 1; then
if test $native -eq 0; then
diff --git a/crc32.c b/crc32.c
index 6d9d6a6..7939d49 100644
--- a/crc32.c
+++ b/crc32.c
@@ -201,6 +201,7 @@ ZLIB_INTERNAL void crc_finalize(deflate_state *const s) {
ZLIB_INTERNAL void crc_reset(deflate_state *const s) {
#ifdef X86_PCLMULQDQ_CRC
+ x86_check_features();
if (x86_cpu_has_pclmulqdq) {
crc_fold_init(s);
return;
diff --git a/deflate.c b/deflate.c
index 00e2b56..4099782 100644
--- a/deflate.c
+++ b/deflate.c
@@ -265,7 +265,7 @@ int ZEXPORT PREFIX(deflateInit2_)(PREFIX3(stream) *strm, int level, int method,
#if defined(X86_CPUID)
x86_check_features();
-#elif defined(ARM_GETAUXVAL)
+#elif defined(ARM_CPUID)
arm_check_features();
#endif
@@ -473,14 +473,14 @@ int ZEXPORT PREFIX(deflateSetDictionary)(PREFIX3(stream) *strm, const unsigned c
next = strm->next_in;
strm->avail_in = dictLength;
strm->next_in = (const unsigned char *)dictionary;
- functable.fill_window(s);
+ fill_window(s);
while (s->lookahead >= MIN_MATCH) {
str = s->strstart;
n = s->lookahead - (MIN_MATCH-1);
functable.insert_string(s, str, n);
s->strstart = str + n;
s->lookahead = MIN_MATCH-1;
- functable.fill_window(s);
+ fill_window(s);
}
s->strstart += s->lookahead;
s->block_start = (long)s->strstart;
@@ -1246,22 +1246,22 @@ void check_match(deflate_state *s, IPos start, IPos match, int length) {
* option -- not supported here).
*/
-void ZLIB_INTERNAL fill_window_c(deflate_state *s) {
+void ZLIB_INTERNAL fill_window(deflate_state *s) {
unsigned n;
- unsigned more; /* Amount of free space at the end of the window. */
+ unsigned long more; /* Amount of free space at the end of the window. */
unsigned int wsize = s->w_size;
Assert(s->lookahead < MIN_LOOKAHEAD, "already enough lookahead");
do {
- more = (unsigned)(s->window_size -(unsigned long)s->lookahead -(unsigned long)s->strstart);
+ more = s->window_size - s->lookahead - s->strstart;
/* If the window is almost full and there is insufficient lookahead,
* move the upper half to the lower one to make room in the upper half.
*/
if (s->strstart >= wsize+MAX_DIST(s)) {
- memcpy(s->window, s->window+wsize, (unsigned)wsize - more);
- s->match_start -= wsize;
+ memcpy(s->window, s->window+wsize, (unsigned)wsize);
+ s->match_start = (s->match_start >= wsize) ? s->match_start - wsize : 0;
s->strstart -= wsize; /* we now have strstart >= MAX_DIST */
s->block_start -= (long) wsize;
if (s->insert > s->strstart)
@@ -1310,7 +1310,7 @@ void ZLIB_INTERNAL fill_window_c(deflate_state *s) {
} else {
count = s->insert;
}
- functable.insert_string(s,str,count);
+ functable.insert_string(s, str, count);
s->insert -= count;
#endif
}
@@ -1327,7 +1327,7 @@ void ZLIB_INTERNAL fill_window_c(deflate_state *s) {
* routines allow scanning to strstart + MAX_MATCH, ignoring lookahead.
*/
if (s->high_water < s->window_size) {
- unsigned long curr = s->strstart + (unsigned long)(s->lookahead);
+ unsigned long curr = s->strstart + (unsigned long)s->lookahead;
unsigned long init;
if (s->high_water < curr) {
@@ -1337,9 +1337,9 @@ void ZLIB_INTERNAL fill_window_c(deflate_state *s) {
init = s->window_size - curr;
if (init > WIN_INIT)
init = WIN_INIT;
- memset(s->window + curr, 0, (unsigned)init);
+ memset(s->window + curr, 0, init);
s->high_water = curr + init;
- } else if (s->high_water < (unsigned long)curr + WIN_INIT) {
+ } else if (s->high_water < curr + WIN_INIT) {
/* High water mark at or above current data, but below current data
* plus WIN_INIT -- zero out to current data plus WIN_INIT, or up
* to end of window, whichever is less.
@@ -1347,7 +1347,7 @@ void ZLIB_INTERNAL fill_window_c(deflate_state *s) {
init = (unsigned long)curr + WIN_INIT - s->high_water;
if (init > s->window_size - s->high_water)
init = s->window_size - s->high_water;
- memset(s->window + s->high_water, 0, (unsigned)init);
+ memset(s->window + s->high_water, 0, init);
s->high_water += init;
}
}
@@ -1562,7 +1562,7 @@ static block_state deflate_rle(deflate_state *s, int flush) {
* for the longest run, plus one for the unrolled loop.
*/
if (s->lookahead <= MAX_MATCH) {
- functable.fill_window(s);
+ fill_window(s);
if (s->lookahead <= MAX_MATCH && flush == Z_NO_FLUSH) {
return need_more;
}
@@ -1629,7 +1629,7 @@ static block_state deflate_huff(deflate_state *s, int flush) {
for (;;) {
/* Make sure that we have a literal to write. */
if (s->lookahead == 0) {
- functable.fill_window(s);
+ fill_window(s);
if (s->lookahead == 0) {
if (flush == Z_NO_FLUSH)
return need_more;
diff --git a/deflate.h b/deflate.h
index 07c2587..70f9812 100644
--- a/deflate.h
+++ b/deflate.h
@@ -372,7 +372,7 @@ static inline void put_uint32_msb(deflate_state *s, uint32_t dw) {
memory checker errors from longest match routines */
-void ZLIB_INTERNAL fill_window_c(deflate_state *s);
+void ZLIB_INTERNAL fill_window(deflate_state *s);
void ZLIB_INTERNAL slide_hash_c(deflate_state *s);
/* in trees.c */
diff --git a/deflate_fast.c b/deflate_fast.c
index 5659926..9efda47 100644
--- a/deflate_fast.c
+++ b/deflate_fast.c
@@ -28,7 +28,7 @@ ZLIB_INTERNAL block_state deflate_fast(deflate_state *s, int flush) {
* string following the next match.
*/
if (s->lookahead < MIN_LOOKAHEAD) {
- functable.fill_window(s);
+ fill_window(s);
if (s->lookahead < MIN_LOOKAHEAD && flush == Z_NO_FLUSH) {
return need_more;
}
diff --git a/deflate_medium.c b/deflate_medium.c
index 31f8372..4e5d094 100644
--- a/deflate_medium.c
+++ b/deflate_medium.c
@@ -207,7 +207,7 @@ ZLIB_INTERNAL block_state deflate_medium(deflate_state *s, int flush) {
* string following the next current_match.
*/
if (s->lookahead < MIN_LOOKAHEAD) {
- functable.fill_window(s);
+ fill_window(s);
if (s->lookahead < MIN_LOOKAHEAD && flush == Z_NO_FLUSH) {
return need_more;
}
diff --git a/deflate_slow.c b/deflate_slow.c
index fb217cc..b8b1598 100644
--- a/deflate_slow.c
+++ b/deflate_slow.c
@@ -36,7 +36,7 @@ ZLIB_INTERNAL block_state deflate_slow(deflate_state *s, int flush) {
* string following the next match.
*/
if (s->lookahead < MIN_LOOKAHEAD) {
- functable.fill_window(s);
+ fill_window(s);
if (s->lookahead < MIN_LOOKAHEAD && flush == Z_NO_FLUSH) {
return need_more;
}
diff --git a/fallback_builtins.h b/fallback_builtins.h
index 3598b7c..8bd16ed 100644
--- a/fallback_builtins.h
+++ b/fallback_builtins.h
@@ -12,6 +12,7 @@
*/
static __forceinline unsigned long __builtin_ctzl(unsigned long value) {
#ifdef X86_CPUID
+ x86_check_features();
if (x86_cpu_has_tzcnt)
return _tzcnt_u32(value);
#endif
diff --git a/functable.c b/functable.c
index c462a92..8f63450 100644
--- a/functable.c
+++ b/functable.c
@@ -24,17 +24,14 @@ extern Pos quick_insert_string_sse4(deflate_state *const s, const Pos str);
#elif defined(ARM_ACLE_CRC_HASH)
extern Pos quick_insert_string_acle(deflate_state *const s, const Pos str);
#endif
-
-/* fill_window */
-#if defined(X86_SSE2)
-extern void fill_window_sse(deflate_state *s);
-#elif defined(ARM_GETAUXVAL)
-extern void fill_window_arm(deflate_state *s);
-#endif
-
/* slide_hash */
#ifdef X86_SSE2
void slide_hash_sse2(deflate_state *s);
+#elif defined(ARM_NEON_SLIDEHASH)
+void slide_hash_neon(deflate_state *s);
+#endif
+#ifdef X86_AVX2
+void slide_hash_avx2(deflate_state *s);
#endif
/* adler32 */
@@ -65,14 +62,12 @@ extern uint32_t crc32_big(uint32_t, const unsigned char *, uint64_t);
/* stub definitions */
ZLIB_INTERNAL Pos insert_string_stub(deflate_state *const s, const Pos str, unsigned int count);
ZLIB_INTERNAL Pos quick_insert_string_stub(deflate_state *const s, const Pos str);
-ZLIB_INTERNAL void fill_window_stub(deflate_state *s);
ZLIB_INTERNAL uint32_t adler32_stub(uint32_t adler, const unsigned char *buf, size_t len);
ZLIB_INTERNAL uint32_t crc32_stub(uint32_t crc, const unsigned char *buf, uint64_t len);
ZLIB_INTERNAL void slide_hash_stub(deflate_state *s);
/* functable init */
ZLIB_INTERNAL __thread struct functable_s functable = {
- fill_window_stub,
insert_string_stub,
quick_insert_string_stub,
adler32_stub,
@@ -80,11 +75,25 @@ ZLIB_INTERNAL __thread struct functable_s functable = {
slide_hash_stub
};
+ZLIB_INTERNAL void cpu_check_features(void)
+{
+ static int features_checked = 0;
+ if (features_checked)
+ return;
+#ifdef X86_CPUID
+ x86_check_features();
+#elif ARM_CPUID
+ arm_check_features();
+#endif
+ features_checked = 1;
+}
/* stub functions */
ZLIB_INTERNAL Pos insert_string_stub(deflate_state *const s, const Pos str, unsigned int count) {
// Initialize default
+
functable.insert_string = &insert_string_c;
+ cpu_check_features();
#ifdef X86_SSE42_CRC_HASH
if (x86_cpu_has_sse42)
@@ -111,31 +120,23 @@ ZLIB_INTERNAL Pos quick_insert_string_stub(deflate_state *const s, const Pos str
return functable.quick_insert_string(s, str);
}
-ZLIB_INTERNAL void fill_window_stub(deflate_state *s) {
- // Initialize default
- functable.fill_window = &fill_window_c;
-
-#if defined(X86_SSE2)
-# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
- if (x86_cpu_has_sse2)
-# endif
- functable.fill_window = &fill_window_sse;
-#elif defined(ARM_GETAUXVAL)
- functable.fill_window = &fill_window_arm;
-#endif
-
- functable.fill_window(s);
-}
-
ZLIB_INTERNAL void slide_hash_stub(deflate_state *s) {
- // Initialize default
+
functable.slide_hash = &slide_hash_c;
+ cpu_check_features();
#ifdef X86_SSE2
-# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
+# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
if (x86_cpu_has_sse2)
-# endif
+# endif
functable.slide_hash = &slide_hash_sse2;
+#elif defined(ARM_NEON_SLIDEHASH)
+ if (arm_cpu_has_neon)
+ functable.slide_hash = &slide_hash_neon;
+#endif
+#ifdef X86_AVX2
+ if (x86_cpu_has_avx2)
+ functable.slide_hash = &slide_hash_avx2;
#endif
functable.slide_hash(s);
@@ -144,6 +145,7 @@ ZLIB_INTERNAL void slide_hash_stub(deflate_state *s) {
ZLIB_INTERNAL uint32_t adler32_stub(uint32_t adler, const unsigned char *buf, size_t len) {
// Initialize default
functable.adler32 = &adler32_c;
+ cpu_check_features();
#if (defined(__ARM_NEON__) || defined(__ARM_NEON)) && defined(ARM_NEON_ADLER32)
if (arm_cpu_has_neon)
@@ -163,6 +165,7 @@ ZLIB_INTERNAL uint32_t crc32_stub(uint32_t crc, const unsigned char *buf, uint64
if (crc_table_empty)
make_crc_table();
#endif /* DYNAMIC_CRC_TABLE */
+ cpu_check_features();
if (sizeof(void *) == sizeof(ptrdiff_t)) {
#if BYTE_ORDER == LITTLE_ENDIAN
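The functable entries keep the existing stub pattern: the first call lands in a *_stub that now runs cpu_check_features() (a memoized wrapper around x86_check_features()/arm_check_features()), rewrites the table entry to the best available implementation, and forwards the call, so every later call goes straight through the patched pointer. A self-contained sketch of that pattern with made-up names, for illustration only:

#include <stddef.h>
#include <stdint.h>

typedef uint32_t (*checksum_fn)(uint32_t sum, const unsigned char *buf, size_t len);

static uint32_t checksum_generic(uint32_t sum, const unsigned char *buf, size_t len);
static uint32_t checksum_stub(uint32_t sum, const unsigned char *buf, size_t len);

/* Table starts out pointing at the stub, like functable in functable.c. */
static struct { checksum_fn checksum; } table = { checksum_stub };

static void check_features_once(void) {
    static int checked;
    if (checked)
        return;
    /* real code would run CPUID / getauxval here and record feature flags */
    checked = 1;
}

static uint32_t checksum_stub(uint32_t sum, const unsigned char *buf, size_t len) {
    check_features_once();
    table.checksum = checksum_generic;   /* pick a SIMD variant here when available */
    return table.checksum(sum, buf, len);
}

static uint32_t checksum_generic(uint32_t sum, const unsigned char *buf, size_t len) {
    while (len--)
        sum += *buf++;
    return sum;
}

int main(void) {
    unsigned char data[4] = {1, 2, 3, 4};
    return (int)table.checksum(0, data, sizeof(data));  /* first call patches the table */
}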
diff --git a/functable.h b/functable.h
index 98e068a..a03c1e4 100644
--- a/functable.h
+++ b/functable.h
@@ -9,7 +9,6 @@
#include "deflate.h"
struct functable_s {
- void (* fill_window) (deflate_state *s);
Pos (* insert_string) (deflate_state *const s, const Pos str, unsigned int count);
Pos (* quick_insert_string)(deflate_state *const s, const Pos str);
uint32_t (* adler32) (uint32_t adler, const unsigned char *buf, size_t len);
diff --git a/inflate.c b/inflate.c
index bfaf85f..9ccb0de 100644
--- a/inflate.c
+++ b/inflate.c
@@ -133,7 +133,7 @@ int ZEXPORT PREFIX(inflateInit2_)(PREFIX3(stream) *strm, int windowBits, const c
#if defined(X86_CPUID)
x86_check_features();
-#elif defined(ARM_GETAUXVAL)
+#elif defined(ARM_CPUID)
arm_check_features();
#endif
diff --git a/win32/Makefile.a64 b/win32/Makefile.a64
index 28a41d8..94e7f57 100644
--- a/win32/Makefile.a64
+++ b/win32/Makefile.a64
@@ -50,8 +50,8 @@ RCFILE = zlib-ng1.rc
RESFILE = zlib-ng1.res
SUFFIX = -ng
!endif
-WFLAGS = $(WFLAGS) -DARM_ACLE_CRC_HASH -D__ARM_NEON__=1 -DARM_NEON_ADLER32 -DARM_NOCHECK_NEON
-OBJS = $(OBJS) crc32_acle.obj insert_string_acle.obj adler32_neon.obj
+WFLAGS = $(WFLAGS) -DARM_ACLE_CRC_HASH -D__ARM_NEON__=1 -DARM_NEON_ADLER32 -DARM_NEON_SLIDEHASH -DARM_NOCHECK_NEON
+OBJS = $(OBJS) crc32_acle.obj insert_string_acle.obj slide_neon.obj adler32_neon.obj
# targets
all: $(STATICLIB) $(SHAREDLIB) $(IMPLIB) \
diff --git a/win32/Makefile.arm b/win32/Makefile.arm
index bd04f0f..2c3a82d 100644
--- a/win32/Makefile.arm
+++ b/win32/Makefile.arm
@@ -63,8 +63,8 @@ NEON_ARCH = /arch:VFPv3
!endif
!if "$(WITH_NEON)" != ""
CFLAGS = $(CFLAGS) $(NEON_ARCH)
-WFLAGS = $(WFLAGS) -D__ARM_NEON__=1 -DARM_NEON_ADLER32 -DARM_NOCHECK_NEON
-OBJS = $(OBJS) adler32_neon.obj
+WFLAGS = $(WFLAGS) -D__ARM_NEON__=1 -DARM_NEON_ADLER32 -DARM_NEON_SLIDEHASH -DARM_NOCHECK_NEON
+OBJS = $(OBJS) adler32_neon.obj slide_neon.obj
!endif
# targets
diff --git a/win32/Makefile.msc b/win32/Makefile.msc
index 687b083..9f9cd31 100644
--- a/win32/Makefile.msc
+++ b/win32/Makefile.msc
@@ -38,7 +38,7 @@ OBJS = adler32.obj compress.obj crc32.obj deflate.obj deflate_fast.obj deflate_q
deflate_medium.obj \
functable.obj infback.obj inflate.obj inftrees.obj inffast.obj insert_string.obj \
slide_avx.obj slide_sse.obj trees.obj uncompr.obj zutil.obj \
- x86.obj fill_window_sse.obj insert_string_sse.obj crc_folding.obj
+ x86.obj insert_string_sse.obj crc_folding.obj
!if "$(ZLIB_COMPAT)" != ""
WITH_GZFILEOP = yes
WFLAGS = $(WFLAGS) -DZLIB_COMPAT
diff --git a/zutil.h b/zutil.h
index 8e9d056..9da6652 100644
--- a/zutil.h
+++ b/zutil.h
@@ -246,7 +246,7 @@ void ZLIB_INTERNAL zng_cfree(void *opaque, void *ptr);
#if defined(X86_CPUID)
# include "arch/x86/x86.h"
-#elif defined(ARM_GETAUXVAL)
+#elif defined(ARM_CPUID)
# include "arch/arm/arm.h"
#endif