diff options
Diffstat (limited to 'inffast.c')
-rw-r--r-- | inffast.c | 86 |
1 files changed, 65 insertions, 21 deletions
@@ -113,8 +113,8 @@ static inline unsigned char* chunkunroll(unsigned char *out, unsigned *dist, uns Entry assumptions: state->mode == LEN - strm->avail_in >= 6 - strm->avail_out >= 258 + strm->avail_in >= INFLATE_FAST_MIN_HAVE + strm->avail_out >= INFLATE_FAST_MIN_LEFT start >= strm->avail_out state->bits < 8 @@ -132,6 +132,10 @@ static inline unsigned char* chunkunroll(unsigned char *out, unsigned *dist, uns Therefore if strm->avail_in >= 6, then there is enough input to avoid checking for available input while decoding. + - On some architectures, it can be significantly faster (e.g. up to 1.2x + faster on x86_64) to load from strm->next_in 64 bits, or 8 bytes, at a + time, so INFLATE_FAST_MIN_HAVE == 8. + - The maximum bytes that a single length/distance pair can output is 258 bytes, which is the maximum length that can be coded. inflate_fast() requires strm->avail_out >= 258 for each loop to avoid checking for @@ -155,7 +159,45 @@ void ZLIB_INTERNAL inflate_fast(PREFIX3(stream) *strm, unsigned long start) { unsigned whave; /* valid bytes in the window */ unsigned wnext; /* window write index */ unsigned char *window; /* allocated sliding window, if wsize != 0 */ - uint32_t hold; /* local strm->hold */ + + /* hold is a local copy of strm->hold. By default, hold satisfies the same + invariants that strm->hold does, namely that (hold >> bits) == 0. This + invariant is kept by loading bits into hold one byte at a time, like: + + hold |= next_byte_of_input << bits; in++; bits += 8; + + If we need to ensure that bits >= 15 then this code snippet is simply + repeated. Over one iteration of the outermost do/while loop, this + happens up to six times (48 bits of input), as described in the NOTES + above. + + However, on some little endian architectures, it can be significantly + faster to load 64 bits once instead of 8 bits six times: + + if (bits <= 16) { + hold |= next_8_bytes_of_input << bits; in += 6; bits += 48; + } + + Unlike the simpler one byte load, shifting the next_8_bytes_of_input + by bits will overflow and lose those high bits, up to 2 bytes' worth. + The conservative estimate is therefore that we have read only 6 bytes + (48 bits). Again, as per the NOTES above, 48 bits is sufficient for the + rest of the iteration, and we will not need to load another 8 bytes. + + Inside this function, we no longer satisfy (hold >> bits) == 0, but + this is not problematic, even if that overflow does not land on an 8 bit + byte boundary. Those excess bits will eventually shift down lower as the + Huffman decoder consumes input, and when new input bits need to be loaded + into the bits variable, the same input bits will be or'ed over those + existing bits. A bitwise or is idempotent: (a | b | b) equals (a | b). + Note that we therefore write that load operation as "hold |= etc" and not + "hold += etc". + + Outside that loop, at the end of the function, hold is bitwise and'ed + with (1<<bits)-1 to drop those excess bits so that, on function exit, we + keep the invariant that (state->hold >> state->bits) == 0. + */ + uint64_t hold; /* local strm->hold */ unsigned bits; /* local strm->bits */ code const *lcode; /* local strm->lencode */ code const *dcode; /* local strm->distcode */ @@ -171,10 +213,10 @@ void ZLIB_INTERNAL inflate_fast(PREFIX3(stream) *strm, unsigned long start) { /* copy state to local variables */ state = (struct inflate_state *)strm->state; in = strm->next_in; - last = in + (strm->avail_in - 5); + last = in + (strm->avail_in - (INFLATE_FAST_MIN_HAVE - 1)); out = strm->next_out; beg = out - (start - strm->avail_out); - end = out + (strm->avail_out - 257); + end = out + (strm->avail_out - (INFLATE_FAST_MIN_LEFT - 1)); #ifdef INFFAST_CHUNKSIZE safe = out + (strm->avail_out - INFFAST_CHUNKSIZE); @@ -197,9 +239,9 @@ void ZLIB_INTERNAL inflate_fast(PREFIX3(stream) *strm, unsigned long start) { input data or output space */ do { if (bits < 15) { - hold += load_short(in, bits); - in += 2; - bits += 16; + hold |= load_64_bits(in, bits); + in += 6; + bits += 48; } here = lcode[hold & lmask]; dolen: @@ -215,17 +257,18 @@ void ZLIB_INTERNAL inflate_fast(PREFIX3(stream) *strm, unsigned long start) { op &= 15; /* number of extra bits */ if (op) { if (bits < op) { - hold += (uint32_t)(*in++) << bits; - bits += 8; + hold |= load_64_bits(in, bits); + in += 6; + bits += 48; } len += BITS(op); DROPBITS(op); } Tracevv((stderr, "inflate: length %u\n", len)); if (bits < 15) { - hold += load_short(in, bits); - in += 2; - bits += 16; + hold |= load_64_bits(in, bits); + in += 6; + bits += 48; } here = dcode[hold & dmask]; dodist: @@ -235,12 +278,9 @@ void ZLIB_INTERNAL inflate_fast(PREFIX3(stream) *strm, unsigned long start) { dist = here.val; op &= 15; /* number of extra bits */ if (bits < op) { - hold += (uint32_t)(*in++) << bits; - bits += 8; - if (bits < op) { - hold += (uint32_t)(*in++) << bits; - bits += 8; - } + hold |= load_64_bits(in, bits); + in += 6; + bits += 48; } dist += BITS(op); #ifdef INFLATE_STRICT @@ -412,8 +452,12 @@ void ZLIB_INTERNAL inflate_fast(PREFIX3(stream) *strm, unsigned long start) { /* update state and return */ strm->next_in = in; strm->next_out = out; - strm->avail_in = (unsigned)(in < last ? 5 + (last - in) : 5 - (in - last)); - strm->avail_out = (unsigned)(out < end ? 257 + (end - out) : 257 - (out - end)); + strm->avail_in = + (unsigned)(in < last ? (INFLATE_FAST_MIN_HAVE - 1) + (last - in) + : (INFLATE_FAST_MIN_HAVE - 1) - (in - last)); + strm->avail_out = + (unsigned)(out < end ? (INFLATE_FAST_MIN_LEFT - 1) + (end - out) + : (INFLATE_FAST_MIN_LEFT - 1) - (out - end)); state->hold = hold; state->bits = bits; return; |