9 files changed, 152 insertions, 126 deletions
diff --git a/arch/arm/armfeature.c b/arch/arm/armfeature.c
index bef9b29..978c987 100644
--- a/arch/arm/armfeature.c
+++ b/arch/arm/armfeature.c
@@ -11,6 +11,9 @@
 #    define ID_AA64ISAR0_CRC32_VAL ID_AA64ISAR0_CRC32
 #  endif
 #elif defined(__APPLE__)
+#  if !defined(_DARWIN_C_SOURCE)
+#    define _DARWIN_C_SOURCE /* enable types aliases (eg u_int) */
+#  endif
 #  include <sys/sysctl.h>
 #elif defined(_WIN32)
 #  include <winapifamily.h>
diff --git a/arch/arm/chunkset_neon.c b/arch/arm/chunkset_neon.c
index e0ad3e0..51dcf09 100644
--- a/arch/arm/chunkset_neon.c
+++ b/arch/arm/chunkset_neon.c
@@ -15,30 +15,25 @@ typedef uint8x16_t chunk_t;
 
 #define CHUNK_SIZE 16
 
-#define HAVE_CHUNKMEMSET_1
 #define HAVE_CHUNKMEMSET_2
 #define HAVE_CHUNKMEMSET_4
 #define HAVE_CHUNKMEMSET_8
 
-static inline void chunkmemset_1(uint8_t *from, chunk_t *chunk) {
-    *chunk = vld1q_dup_u8(from);
-}
-
 static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
     uint16_t tmp;
-    memcpy(&tmp, from, 2);
+    memcpy(&tmp, from, sizeof(tmp));
     *chunk = vreinterpretq_u8_u16(vdupq_n_u16(tmp));
 }
 
 static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
     uint32_t tmp;
-    memcpy(&tmp, from, 4);
+    memcpy(&tmp, from, sizeof(tmp));
     *chunk = vreinterpretq_u8_u32(vdupq_n_u32(tmp));
 }
 
 static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
     uint64_t tmp;
-    memcpy(&tmp, from, 8);
+    memcpy(&tmp, from, sizeof(tmp));
     *chunk = vreinterpretq_u8_u64(vdupq_n_u64(tmp));
 }
 
diff --git a/arch/arm/crc32_acle.c b/arch/arm/crc32_acle.c
index 99013e1..0bcd3cf 100644
--- a/arch/arm/crc32_acle.c
+++ b/arch/arm/crc32_acle.c
@@ -62,7 +62,7 @@ uint32_t crc32_acle(uint32_t crc, const unsigned char *buf, uint64_t len) {
         len--;
     }
 
-    if ((len > sizeof(uint16_t)) && ((ptrdiff_t)buf & sizeof(uint16_t))) {
+    if ((len >= sizeof(uint16_t)) && ((ptrdiff_t)buf & sizeof(uint16_t))) {
         buf2 = (const uint16_t *) buf;
         c = __crc32h(c, *buf2++);
         len -= sizeof(uint16_t);
@@ -72,22 +72,17 @@ uint32_t crc32_acle(uint32_t crc, const unsigned char *buf, uint64_t len) {
     }
 
 #if defined(__aarch64__)
-    if ((len > sizeof(uint32_t)) && ((ptrdiff_t)buf & sizeof(uint32_t))) {
+    if ((len >= sizeof(uint32_t)) && ((ptrdiff_t)buf & sizeof(uint32_t))) {
         c = __crc32w(c, *buf4++);
         len -= sizeof(uint32_t);
     }
 
-    const uint64_t *buf8 = (const uint64_t *) buf4;
-
-#ifdef UNROLL_MORE
-    while (len >= 4 * sizeof(uint64_t)) {
-        c = __crc32d(c, *buf8++);
-        c = __crc32d(c, *buf8++);
-        c = __crc32d(c, *buf8++);
-        c = __crc32d(c, *buf8++);
-        len -= 4 * sizeof(uint64_t);
+    if (len == 0) {
+        c = ~c;
+        return c;
     }
-#endif
+
+    const uint64_t *buf8 = (const uint64_t *) buf4;
 
     while (len >= sizeof(uint64_t)) {
         c = __crc32d(c, *buf8++);
@@ -111,19 +106,10 @@ uint32_t crc32_acle(uint32_t crc, const unsigned char *buf, uint64_t len) {
     buf = (const unsigned char *) buf2;
 #else /* __aarch64__ */
 
-#  ifdef UNROLL_MORE
-    while (len >= 8 * sizeof(uint32_t)) {
-        c = __crc32w(c, *buf4++);
-        c = __crc32w(c, *buf4++);
-        c = __crc32w(c, *buf4++);
-        c = __crc32w(c, *buf4++);
-        c = __crc32w(c, *buf4++);
-        c = __crc32w(c, *buf4++);
-        c = __crc32w(c, *buf4++);
-        c = __crc32w(c, *buf4++);
-        len -= 8 * sizeof(uint32_t);
+    if (len == 0) {
+        c = ~c;
+        return c;
     }
-#  endif
 
     while (len >= sizeof(uint32_t)) {
         c = __crc32w(c, *buf4++);
diff --git a/arch/s390/dfltcc_deflate.c b/arch/s390/dfltcc_deflate.c
index e3b53ee..9ecc6ba 100644
--- a/arch/s390/dfltcc_deflate.c
+++ b/arch/s390/dfltcc_deflate.c
@@ -210,7 +210,10 @@ again:
         *strm->next_out = (unsigned char)state->bi_buf;
     /* Honor history and check value */
     param->nt = 0;
-    param->cv = state->wrap == 2 ? ZSWAP32(strm->adler) : strm->adler;
+    if (state->wrap == 1)
+        param->cv = strm->adler;
+    else if (state->wrap == 2)
+        param->cv = ZSWAP32(strm->adler);
 
     /* When opening a block, choose a Huffman-Table Type */
     if (!param->bcf) {
@@ -241,7 +244,10 @@ again:
         state->bi_buf = 0; /* Avoid accessing next_out */
     else
         state->bi_buf = *strm->next_out & ((1 << state->bi_valid) - 1);
-    strm->adler = state->wrap == 2 ? ZSWAP32(param->cv) : param->cv;
+    if (state->wrap == 1)
+        strm->adler = param->cv;
+    else if (state->wrap == 2)
+        strm->adler = ZSWAP32(param->cv);
 
     /* Unmask the input data */
     strm->avail_in += masked_avail_in;
diff --git a/arch/s390/dfltcc_detail.h b/arch/s390/dfltcc_detail.h
index 4ec03f8..411e9f6 100644
--- a/arch/s390/dfltcc_detail.h
+++ b/arch/s390/dfltcc_detail.h
@@ -26,74 +26,6 @@
 #endif
 
 /*
-   C wrapper for the DEFLATE CONVERSION CALL instruction.
- */
-typedef enum {
-    DFLTCC_CC_OK = 0,
-    DFLTCC_CC_OP1_TOO_SHORT = 1,
-    DFLTCC_CC_OP2_TOO_SHORT = 2,
-    DFLTCC_CC_OP2_CORRUPT = 2,
-    DFLTCC_CC_AGAIN = 3,
-} dfltcc_cc;
-
-#define DFLTCC_QAF 0
-#define DFLTCC_GDHT 1
-#define DFLTCC_CMPR 2
-#define DFLTCC_XPND 4
-#define HBT_CIRCULAR (1 << 7)
-#define HB_BITS 15
-#define HB_SIZE (1 << HB_BITS)
-#define DFLTCC_FACILITY 151
-
-static inline dfltcc_cc dfltcc(int fn, void *param,
-                               unsigned char **op1, size_t *len1, z_const unsigned char **op2, size_t *len2, void *hist) {
-    unsigned char *t2 = op1 ? *op1 : NULL;
-    size_t t3 = len1 ? *len1 : 0;
-    z_const unsigned char *t4 = op2 ? *op2 : NULL;
-    size_t t5 = len2 ? *len2 : 0;
-    Z_REGISTER int r0 __asm__("r0") = fn;
-    Z_REGISTER void *r1 __asm__("r1") = param;
-    Z_REGISTER unsigned char *r2 __asm__("r2") = t2;
-    Z_REGISTER size_t r3 __asm__("r3") = t3;
-    Z_REGISTER z_const unsigned char *r4 __asm__("r4") = t4;
-    Z_REGISTER size_t r5 __asm__("r5") = t5;
-    int cc;
-
-    __asm__ volatile(
-#ifdef HAVE_SYS_SDT_H
-                     STAP_PROBE_ASM(zlib, dfltcc_entry, STAP_PROBE_ASM_TEMPLATE(5))
-#endif
-                     ".insn rrf,0xb9390000,%[r2],%[r4],%[hist],0\n"
-#ifdef HAVE_SYS_SDT_H
-                     STAP_PROBE_ASM(zlib, dfltcc_exit, STAP_PROBE_ASM_TEMPLATE(5))
-#endif
-                     "ipm %[cc]\n"
-                     : [r2] "+r" (r2)
-                     , [r3] "+r" (r3)
-                     , [r4] "+r" (r4)
-                     , [r5] "+r" (r5)
-                     , [cc] "=r" (cc)
-                     : [r0] "r" (r0)
-                     , [r1] "r" (r1)
-                     , [hist] "r" (hist)
-#ifdef HAVE_SYS_SDT_H
-                     , STAP_PROBE_ASM_OPERANDS(5, r2, r3, r4, r5, hist)
-#endif
-                     : "cc", "memory");
-    t2 = r2; t3 = r3; t4 = r4; t5 = r5;
-
-    if (op1)
-        *op1 = t2;
-    if (len1)
-        *len1 = t3;
-    if (op2)
-        *op2 = t4;
-    if (len2)
-        *len2 = t5;
-    return (cc >> 28) & 3;
-}
-
-/*
    Parameter Block for Query Available Functions.
  */
 #define static_assert(c, msg) __attribute__((unused)) static char static_assert_failed_ ## msg[c ? 1 : -1]
@@ -105,7 +37,8 @@ struct dfltcc_qaf_param {
     char reserved2[6];
 };
 
-static_assert(sizeof(struct dfltcc_qaf_param) == 32, sizeof_struct_dfltcc_qaf_param_is_32);
+#define DFLTCC_SIZEOF_QAF 32
+static_assert(sizeof(struct dfltcc_qaf_param) == DFLTCC_SIZEOF_QAF, qaf);
 
 static inline int is_bit_set(const char *bits, int n) {
     return bits[n / 8] & (1 << (7 - (n % 8)));
@@ -115,6 +48,8 @@ static inline void clear_bit(char *bits, int n) {
     bits[n / 8] &= ~(1 << (7 - (n % 8)));
 }
 
+#define DFLTCC_FACILITY 151
+
 #define DFLTCC_FMT0 0
 
 /*
@@ -165,12 +100,16 @@ struct dfltcc_param_v0 {
     uint16_t cdhtl : 12;               /* Compressed-Dynamic-Huffman Table
                                           Length */
     uint8_t reserved464[6];
-    uint8_t cdht[288];
-    uint8_t reserved[32];
-    uint8_t csb[1152];
+    uint8_t cdht[288];                 /* Compressed-Dynamic-Huffman Table */
+    uint8_t reserved[24];
+    uint8_t ribm2[8];                  /* Reserved for IBM use */
+    uint8_t csb[1152];                 /* Continuation-State Buffer */
 };
 
-static_assert(sizeof(struct dfltcc_param_v0) == 1536, sizeof_struct_dfltcc_param_v0_is_1536);
+#define DFLTCC_SIZEOF_GDHT_V0 384
+#define DFLTCC_SIZEOF_CMPR_XPND_V0 1536
+static_assert(offsetof(struct dfltcc_param_v0, csb) == DFLTCC_SIZEOF_GDHT_V0, gdht_v0);
+static_assert(sizeof(struct dfltcc_param_v0) == DFLTCC_SIZEOF_CMPR_XPND_V0, cmpr_xpnd_v0);
 
 static inline z_const char *oesc_msg(char *buf, int oesc) {
     if (oesc == 0x00)
@@ -182,6 +121,97 @@ static inline z_const char *oesc_msg(char *buf, int oesc) {
 }
 
 /*
+   C wrapper for the DEFLATE CONVERSION CALL instruction.
+ */
+typedef enum {
+    DFLTCC_CC_OK = 0,
+    DFLTCC_CC_OP1_TOO_SHORT = 1,
+    DFLTCC_CC_OP2_TOO_SHORT = 2,
+    DFLTCC_CC_OP2_CORRUPT = 2,
+    DFLTCC_CC_AGAIN = 3,
+} dfltcc_cc;
+
+#define DFLTCC_QAF 0
+#define DFLTCC_GDHT 1
+#define DFLTCC_CMPR 2
+#define DFLTCC_XPND 4
+#define HBT_CIRCULAR (1 << 7)
+#define DFLTCC_FN_MASK ((1 << 7) - 1)
+#define HB_BITS 15
+#define HB_SIZE (1 << HB_BITS)
+
+static inline dfltcc_cc dfltcc(int fn, void *param,
+                               unsigned char **op1, size_t *len1,
+                               z_const unsigned char **op2, size_t *len2, void *hist) {
+    unsigned char *t2 = op1 ? *op1 : NULL;
+#ifdef Z_MEMORY_SANITIZER
+    unsigned char *orig_t2 = t2;
+#endif
+    size_t t3 = len1 ? *len1 : 0;
+    z_const unsigned char *t4 = op2 ? *op2 : NULL;
+    size_t t5 = len2 ? *len2 : 0;
+    Z_REGISTER int r0 __asm__("r0") = fn;
+    Z_REGISTER void *r1 __asm__("r1") = param;
+    Z_REGISTER unsigned char *r2 __asm__("r2") = t2;
+    Z_REGISTER size_t r3 __asm__("r3") = t3;
+    Z_REGISTER z_const unsigned char *r4 __asm__("r4") = t4;
+    Z_REGISTER size_t r5 __asm__("r5") = t5;
+    int cc;
+
+    __asm__ volatile(
+#ifdef HAVE_SYS_SDT_H
+                     STAP_PROBE_ASM(zlib, dfltcc_entry, STAP_PROBE_ASM_TEMPLATE(5))
+#endif
+                     ".insn rrf,0xb9390000,%[r2],%[r4],%[hist],0\n"
+#ifdef HAVE_SYS_SDT_H
+                     STAP_PROBE_ASM(zlib, dfltcc_exit, STAP_PROBE_ASM_TEMPLATE(5))
+#endif
+                     "ipm %[cc]\n"
+                     : [r2] "+r" (r2)
+                     , [r3] "+r" (r3)
+                     , [r4] "+r" (r4)
+                     , [r5] "+r" (r5)
+                     , [cc] "=r" (cc)
+                     : [r0] "r" (r0)
+                     , [r1] "r" (r1)
+                     , [hist] "r" (hist)
+#ifdef HAVE_SYS_SDT_H
+                     , STAP_PROBE_ASM_OPERANDS(5, r2, r3, r4, r5, hist)
+#endif
+                     : "cc", "memory");
+    t2 = r2; t3 = r3; t4 = r4; t5 = r5;
+
+#ifdef Z_MEMORY_SANITIZER
+    switch (fn & DFLTCC_FN_MASK) {
+    case DFLTCC_QAF:
+        __msan_unpoison(param, DFLTCC_SIZEOF_QAF);
+        break;
+    case DFLTCC_GDHT:
+        __msan_unpoison(param, DFLTCC_SIZEOF_GDHT_V0);
+        break;
+    case DFLTCC_CMPR:
+        __msan_unpoison(param, DFLTCC_SIZEOF_CMPR_XPND_V0);
+        __msan_unpoison(orig_t2, t2 - orig_t2 + (((struct dfltcc_param_v0 *)param)->sbb == 0 ? 0 : 1));
+        break;
+    case DFLTCC_XPND:
+        __msan_unpoison(param, DFLTCC_SIZEOF_CMPR_XPND_V0);
+        __msan_unpoison(orig_t2, t2 - orig_t2);
+        break;
+    }
+#endif
+
+    if (op1)
+        *op1 = t2;
+    if (len1)
+        *len1 = t3;
+    if (op2)
+        *op2 = t4;
+    if (len2)
+        *len2 = t5;
+    return (cc >> 28) & 3;
+}
+
+/*
    Extension of inflate_state and deflate_state. Must be doubleword-aligned.
 */
 struct dfltcc_state {
diff --git a/arch/s390/dfltcc_inflate.c b/arch/s390/dfltcc_inflate.c
index 2535064..801e547 100644
--- a/arch/s390/dfltcc_inflate.c
+++ b/arch/s390/dfltcc_inflate.c
@@ -81,13 +81,14 @@ dfltcc_inflate_action Z_INTERNAL dfltcc_inflate(PREFIX3(streamp) strm, int flush
     }
 
     /* Translate stream to parameter block */
-    param->cvt = state->flags ? CVT_CRC32 : CVT_ADLER32;
+    param->cvt = ((state->wrap & 4) && state->flags) ? CVT_CRC32 : CVT_ADLER32;
     param->sbb = state->bits;
     param->hl = state->whave; /* Software and hardware history formats match */
     param->ho = (state->wnext - state->whave) & ((1 << HB_BITS) - 1);
     if (param->hl)
         param->nt = 0; /* Honor history for the first block */
-    param->cv = state->flags ? ZSWAP32(state->check) : state->check;
+    if (state->wrap & 4)
+        param->cv = state->flags ? ZSWAP32(state->check) : state->check;
 
     /* Inflate */
     do {
@@ -100,7 +101,8 @@ dfltcc_inflate_action Z_INTERNAL dfltcc_inflate(PREFIX3(streamp) strm, int flush
     state->bits = param->sbb;
     state->whave = param->hl;
     state->wnext = (param->ho + param->hl) & ((1 << HB_BITS) - 1);
-    state->check = state->flags ? ZSWAP32(param->cv) : param->cv;
+    if (state->wrap & 4)
+        strm->adler = state->check = state->flags ? ZSWAP32(param->cv) : param->cv;
     if (cc == DFLTCC_CC_OP2_CORRUPT && param->oesc != 0) {
         /* Report an error if stream is corrupted */
         state->mode = BAD;
diff --git a/arch/s390/self-hosted-builder/actions-runner.Dockerfile b/arch/s390/self-hosted-builder/actions-runner.Dockerfile
index a4bb774..a55a74d 100644
--- a/arch/s390/self-hosted-builder/actions-runner.Dockerfile
+++ b/arch/s390/self-hosted-builder/actions-runner.Dockerfile
@@ -11,11 +11,13 @@ FROM s390x/ubuntu:20.04
 # Packages for zlib-ng testing.
 ENV DEBIAN_FRONTEND=noninteractive
 RUN apt-get update && apt-get -y install \
+        clang-11 \
         cmake \
         curl \
         gcc \
         git \
         jq \
+        llvm-11-tools \
         ninja-build \
         python-is-python3 \
         python3 \
diff --git a/arch/x86/chunkset_avx.c b/arch/x86/chunkset_avx.c
index 7a9a56a..398d192 100644
--- a/arch/x86/chunkset_avx.c
+++ b/arch/x86/chunkset_avx.c
@@ -11,25 +11,26 @@ typedef __m256i chunk_t;
 
 #define CHUNK_SIZE 32
 
-#define HAVE_CHUNKMEMSET_1
 #define HAVE_CHUNKMEMSET_2
 #define HAVE_CHUNKMEMSET_4
 #define HAVE_CHUNKMEMSET_8
 
-static inline void chunkmemset_1(uint8_t *from, chunk_t *chunk) {
-    *chunk = _mm256_set1_epi8(*(int8_t *)from);
-}
-
 static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
-    *chunk = _mm256_set1_epi16(*(int16_t *)from);
+    int16_t tmp;
+    memcpy(&tmp, from, sizeof(tmp));
+    *chunk = _mm256_set1_epi16(tmp);
 }
 
 static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
-    *chunk = _mm256_set1_epi32(*(int32_t *)from);
+    int32_t tmp;
+    memcpy(&tmp, from, sizeof(tmp));
+    *chunk = _mm256_set1_epi32(tmp);
 }
 
 static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
-    *chunk = _mm256_set1_epi64x(*(int64_t *)from);
+    int64_t tmp;
+    memcpy(&tmp, from, sizeof(tmp));
+    *chunk = _mm256_set1_epi64x(tmp);
 }
 
 static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
diff --git a/arch/x86/chunkset_sse.c b/arch/x86/chunkset_sse.c
index d38e99d..6b43d4a 100644
--- a/arch/x86/chunkset_sse.c
+++ b/arch/x86/chunkset_sse.c
@@ -12,25 +12,26 @@ typedef __m128i chunk_t;
 
 #define CHUNK_SIZE 16
 
-#define HAVE_CHUNKMEMSET_1
 #define HAVE_CHUNKMEMSET_2
 #define HAVE_CHUNKMEMSET_4
 #define HAVE_CHUNKMEMSET_8
 
-static inline void chunkmemset_1(uint8_t *from, chunk_t *chunk) {
-    *chunk = _mm_set1_epi8(*(int8_t *)from);
-}
-
 static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
-    *chunk = _mm_set1_epi16(*(int16_t *)from);
+    int16_t tmp;
+    memcpy(&tmp, from, sizeof(tmp));
+    *chunk = _mm_set1_epi16(tmp);
 }
 
 static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
-    *chunk = _mm_set1_epi32(*(int32_t *)from);
+    int32_t tmp;
+    memcpy(&tmp, from, sizeof(tmp));
+    *chunk = _mm_set1_epi32(tmp);
 }
 
 static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
-    *chunk = _mm_set1_epi64x(*(int64_t *)from);
+    int64_t tmp;
+    memcpy(&tmp, from, sizeof(tmp));
+    *chunk = _mm_set1_epi64x(tmp);
 }
 
 static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {