Update libjpeg-turbo to v2.0.5

Update Chromium's copy of libjpeg-turbo to the latest stable upstream release (v2.0.5) and reapply our local changes documented in README.chromium. This update addresses three CVEs - CVE-2018-19664, CVE-2018-20330, CVE-2018-20330 - that do not affect Chromium. The fixes do, however, satisfy UBSan - allowing Chromium's libjpeg-turbo to be used in AOSP. Cherry-pick one additional change[1] from upstream to prevent AArch64 Windows builds from failing. [1] https://github.com/libjpeg-turbo/libjpeg-turbo/commit/6ee5d5f568fda1a7c6a49dd8995f2d89866ee42d Bug: 922430 Bug: https://issuetracker.google.com/135180511 Change-Id: I146fe82f7a016ce393eb0d37aebe0b7c2492a9da
author: Jonathan Wright <jonathan.wright@arm.com> 2020-08-05 11:42:22 +0100
committer: Jonathan Wright <jonathan.wright@arm.com> 2020-08-07 17:04:34 +0100
commit: db870dfef8ab97950b4bdf22c66dd6c18326460b (patch)
tree: 9862c1aa45d014815ec798992ecb43eff1bc8d42
parent: 341272d909285da90e44015ca41f956fd00b9dd8 (diff)
131 files changed, 1475 insertions, 1390 deletions
diff --git a/BUILDING.md b/BUILDING.md
index afc1841..a4ae1e0 100644
--- a/BUILDING.md
+++ b/BUILDING.md
@@ -18,12 +18,15 @@ Build Requirements
     when building macho64 objects.)
   * If using YASM, 1.2.0 or later is required.
   * If building on macOS, NASM or YASM can be obtained from
-    [MacPorts](http://www.macports.org/) or [Homebrew](http://brew.sh).
+    [MacPorts](http://www.macports.org/) or [Homebrew](http://brew.sh/).
      - NOTE: Currently, if it is desirable to hide the SIMD function symbols in
        Mac executables or shared libraries that statically link with
        libjpeg-turbo, then NASM 2.14 or later or YASM must be used when
        building libjpeg-turbo.
   * If building on Windows, **nasm.exe**/**yasm.exe** should be in your `PATH`.
+  * NASM and YASM are located in the CRB (Code Ready Builder) repository on
+    Red Hat Enterprise Linux 8 and in the PowerTools repository on CentOS 8,
+    which is not enabled by default.
 
   The binary RPMs released by the NASM project do not work on older Linux
   systems, such as Red Hat Enterprise Linux 5.  On such systems, you can easily
@@ -527,7 +530,7 @@ a universal library.
 Building libjpeg-turbo for Android
 ----------------------------------
 
-Building libjpeg-turbo for Android platforms requires the
+Building libjpeg-turbo for Android platforms requires v13b or later of the
 [Android NDK](https://developer.android.com/tools/sdk/ndk).
 
 
@@ -537,35 +540,21 @@ The following is a general recipe script that can be modified for your specific
 needs.
 
     # Set these variables to suit your needs
-    NDK_PATH={full path to the "ndk" directory-- for example, /opt/android/sdk/ndk-bundle}
-    BUILD_PLATFORM={the platform name for the NDK package you installed--
-      for example, "windows-x86" or "linux-x86_64" or "darwin-x86_64"}
-    TOOLCHAIN_VERSION={"4.8", "4.9", "clang3.5", etc.  This corresponds to a
-      toolchain directory under ${NDK_PATH}/toolchains/.}
-    ANDROID_VERSION={The minimum version of Android to support-- for example,
+    NDK_PATH={full path to the NDK directory-- for example,
+      /opt/android/android-ndk-r16b}
+    TOOLCHAIN={"gcc" or "clang"-- "gcc" must be used with NDK r16b and earlier,
+      and "clang" must be used with NDK r17c and later}
+    ANDROID_VERSION={the minimum version of Android to support-- for example,
       "16", "19", etc.}
 
-    # It should not be necessary to modify the rest
-    HOST=arm-linux-androideabi
-    SYSROOT=${NDK_PATH}/platforms/android-${ANDROID_VERSION}/arch-arm
-    export CFLAGS="-march=armv7-a -mfloat-abi=softfp -fprefetch-loop-arrays \
-      -D__ANDROID_API__=${ANDROID_VERSION} --sysroot=${SYSROOT} \
-      -isystem ${NDK_PATH}/sysroot/usr/include \
-      -isystem ${NDK_PATH}/sysroot/usr/include/${HOST}"
-    export LDFLAGS=-pie
-    TOOLCHAIN=${NDK_PATH}/toolchains/${HOST}-${TOOLCHAIN_VERSION}/prebuilt/${BUILD_PLATFORM}
-
     cd {build_directory}
-
-    cat <<EOF >toolchain.cmake
-    set(CMAKE_SYSTEM_NAME Linux)
-    set(CMAKE_SYSTEM_PROCESSOR arm)
-    set(CMAKE_C_COMPILER ${TOOLCHAIN}/bin/${HOST}-gcc)
-    set(CMAKE_FIND_ROOT_PATH ${TOOLCHAIN}/${HOST})
-    EOF
-
-    cmake -G"Unix Makefiles" -DCMAKE_TOOLCHAIN_FILE=toolchain.cmake \
-      -DCMAKE_POSITION_INDEPENDENT_CODE=1 \
+    cmake -G"Unix Makefiles" \
+      -DANDROID_ABI=armeabi-v7a \
+      -DANDROID_ARM_MODE=arm \
+      -DANDROID_PLATFORM=android-${ANDROID_VERSION} \
+      -DANDROID_TOOLCHAIN=${TOOLCHAIN} \
+      -DCMAKE_ASM_FLAGS="--target=arm-linux-androideabi${ANDROID_VERSION}" \
+      -DCMAKE_TOOLCHAIN_FILE=${NDK_PATH}/build/cmake/android.toolchain.cmake \
       [additional CMake flags] {source_directory}
     make
 
@@ -576,34 +565,21 @@ The following is a general recipe script that can be modified for your specific
 needs.
 
     # Set these variables to suit your needs
-    NDK_PATH={full path to the "ndk" directory-- for example, /opt/android/sdk/ndk-bundle}
-    BUILD_PLATFORM={the platform name for the NDK package you installed--
-      for example, "windows-x86" or "linux-x86_64" or "darwin-x86_64"}
-    TOOLCHAIN_VERSION={"4.8", "4.9", "clang3.5", etc.  This corresponds to a
-      toolchain directory under ${NDK_PATH}/toolchains/.}
-    ANDROID_VERSION={The minimum version of Android to support.  "21" or later
+    NDK_PATH={full path to the NDK directory-- for example,
+      /opt/android/android-ndk-r16b}
+    TOOLCHAIN={"gcc" or "clang"-- "gcc" must be used with NDK r14b and earlier,
+      and "clang" must be used with NDK r17c and later}
+    ANDROID_VERSION={the minimum version of Android to support.  "21" or later
       is required for a 64-bit build.}
 
-    # It should not be necessary to modify the rest
-    HOST=aarch64-linux-android
-    SYSROOT=${NDK_PATH}/platforms/android-${ANDROID_VERSION}/arch-arm64
-    export CFLAGS="-D__ANDROID_API__=${ANDROID_VERSION} --sysroot=${SYSROOT} \
-      -isystem ${NDK_PATH}/sysroot/usr/include \
-      -isystem ${NDK_PATH}/sysroot/usr/include/${HOST}"
-    export LDFLAGS=-pie
-    TOOLCHAIN=${NDK_PATH}/toolchains/${HOST}-${TOOLCHAIN_VERSION}/prebuilt/${BUILD_PLATFORM}
-
     cd {build_directory}
-
-    cat <<EOF >toolchain.cmake
-    set(CMAKE_SYSTEM_NAME Linux)
-    set(CMAKE_SYSTEM_PROCESSOR aarch64)
-    set(CMAKE_C_COMPILER ${TOOLCHAIN}/bin/${HOST}-gcc)
-    set(CMAKE_FIND_ROOT_PATH ${TOOLCHAIN}/${HOST})
-    EOF
-
-    cmake -G"Unix Makefiles" -DCMAKE_TOOLCHAIN_FILE=toolchain.cmake \
-      -DCMAKE_POSITION_INDEPENDENT_CODE=1 \
+    cmake -G"Unix Makefiles" \
+      -DANDROID_ABI=arm64-v8a \
+      -DANDROID_ARM_MODE=arm \
+      -DANDROID_PLATFORM=android-${ANDROID_VERSION} \
+      -DANDROID_TOOLCHAIN=${TOOLCHAIN} \
+      -DCMAKE_ASM_FLAGS="--target=aarch64-linux-android${ANDROID_VERSION}" \
+      -DCMAKE_TOOLCHAIN_FILE=${NDK_PATH}/build/cmake/android.toolchain.cmake \
       [additional CMake flags] {source_directory}
     make
 
@@ -614,34 +590,19 @@ The following is a general recipe script that can be modified for your specific
 needs.
 
     # Set these variables to suit your needs
-    NDK_PATH={full path to the "ndk" directory-- for example, /opt/android/sdk/ndk-bundle}
-    BUILD_PLATFORM={the platform name for the NDK package you installed--
-      for example, "windows-x86" or "linux-x86_64" or "darwin-x86_64"}
-    TOOLCHAIN_VERSION={"4.8", "4.9", "clang3.5", etc.  This corresponds to a
-      toolchain directory under ${NDK_PATH}/toolchains/.}
+    NDK_PATH={full path to the NDK directory-- for example,
+      /opt/android/android-ndk-r16b}
+    TOOLCHAIN={"gcc" or "clang"-- "gcc" must be used with NDK r14b and earlier,
+      and "clang" must be used with NDK r17c and later}
     ANDROID_VERSION={The minimum version of Android to support-- for example,
       "16", "19", etc.}
 
-    # It should not be necessary to modify the rest
-    HOST=i686-linux-android
-    SYSROOT=${NDK_PATH}/platforms/android-${ANDROID_VERSION}/arch-x86
-    export CFLAGS="-D__ANDROID_API__=${ANDROID_VERSION} --sysroot=${SYSROOT} \
-      -isystem ${NDK_PATH}/sysroot/usr/include \
-      -isystem ${NDK_PATH}/sysroot/usr/include/${HOST}"
-    export LDFLAGS=-pie
-    TOOLCHAIN=${NDK_PATH}/toolchains/x86-${TOOLCHAIN_VERSION}/prebuilt/${BUILD_PLATFORM}
-
     cd {build_directory}
-
-    cat <<EOF >toolchain.cmake
-    set(CMAKE_SYSTEM_NAME Linux)
-    set(CMAKE_SYSTEM_PROCESSOR i386)
-    set(CMAKE_C_COMPILER ${TOOLCHAIN}/bin/${HOST}-gcc)
-    set(CMAKE_FIND_ROOT_PATH ${TOOLCHAIN}/${HOST})
-    EOF
-
-    cmake -G"Unix Makefiles" -DCMAKE_TOOLCHAIN_FILE=toolchain.cmake \
-      -DCMAKE_POSITION_INDEPENDENT_CODE=1 \
+    cmake -G"Unix Makefiles" \
+      -DANDROID_ABI=x86 \
+      -DANDROID_PLATFORM=android-${ANDROID_VERSION} \
+      -DANDROID_TOOLCHAIN=${TOOLCHAIN} \
+      -DCMAKE_TOOLCHAIN_FILE=${NDK_PATH}/build/cmake/android.toolchain.cmake \
       [additional CMake flags] {source_directory}
     make
 
@@ -652,45 +613,23 @@ The following is a general recipe script that can be modified for your specific
 needs.
 
     # Set these variables to suit your needs
-    NDK_PATH={full path to the "ndk" directory-- for example, /opt/android/sdk/ndk-bundle}
-    BUILD_PLATFORM={the platform name for the NDK package you installed--
-      for example, "windows-x86" or "linux-x86_64" or "darwin-x86_64"}
-    TOOLCHAIN_VERSION={"4.8", "4.9", "clang3.5", etc.  This corresponds to a
-      toolchain directory under ${NDK_PATH}/toolchains/.}
-    ANDROID_VERSION={The minimum version of Android to support.  "21" or later
+    NDK_PATH={full path to the NDK directory-- for example,
+      /opt/android/android-ndk-r16b}
+    TOOLCHAIN={"gcc" or "clang"-- "gcc" must be used with NDK r14b and earlier,
+      and "clang" must be used with NDK r17c and later}
+    ANDROID_VERSION={the minimum version of Android to support.  "21" or later
       is required for a 64-bit build.}
 
-    # It should not be necessary to modify the rest
-    HOST=x86_64-linux-android
-    SYSROOT=${NDK_PATH}/platforms/android-${ANDROID_VERSION}/arch-x86_64
-    export CFLAGS="-D__ANDROID_API__=${ANDROID_VERSION} --sysroot=${SYSROOT} \
-      -isystem ${NDK_PATH}/sysroot/usr/include \
-      -isystem ${NDK_PATH}/sysroot/usr/include/${HOST}"
-    export LDFLAGS=-pie
-    TOOLCHAIN=${NDK_PATH}/toolchains/x86_64-${TOOLCHAIN_VERSION}/prebuilt/${BUILD_PLATFORM}
-
     cd {build_directory}
-
-    cat <<EOF >toolchain.cmake
-    set(CMAKE_SYSTEM_NAME Linux)
-    set(CMAKE_SYSTEM_PROCESSOR x86_64)
-    set(CMAKE_C_COMPILER ${TOOLCHAIN}/bin/${HOST}-gcc)
-    set(CMAKE_FIND_ROOT_PATH ${TOOLCHAIN}/${HOST})
-    EOF
-
-    cmake -G"Unix Makefiles" -DCMAKE_TOOLCHAIN_FILE=toolchain.cmake \
-      -DCMAKE_POSITION_INDEPENDENT_CODE=1 \
+    cmake -G"Unix Makefiles" \
+      -DANDROID_ABI=x86_64 \
+      -DANDROID_PLATFORM=android-${ANDROID_VERSION} \
+      -DANDROID_TOOLCHAIN=${TOOLCHAIN} \
+      -DCMAKE_TOOLCHAIN_FILE=${NDK_PATH}/build/cmake/android.toolchain.cmake \
       [additional CMake flags] {source_directory}
     make
 
 
-If building for Android 4.0.x (API level < 16) or earlier, remove
-`-DCMAKE_POSITION_INDEPENDENT_CODE=1` from the CMake arguments and `-pie` from
-`LDFLAGS`.
-
-If building on Windows, add `.exe` to the end of `CMAKE_C_COMPILER`.
-
-
 Advanced CMake Options
 ----------------------
 
diff --git a/ChangeLog.md b/ChangeLog.md
index 8c529ec..59fb2de 100644
--- a/ChangeLog.md
+++ b/ChangeLog.md
@@ -1,3 +1,152 @@
+2.0.5
+=====
+
+### Significant changes relative to 2.0.4:
+
+1. Worked around issues in the MIPS DSPr2 SIMD extensions that caused failures
+in the libjpeg-turbo regression tests.  Specifically, the
+`jsimd_h2v1_downsample_dspr2()` and `jsimd_h2v2_downsample_dspr2()` functions
+in the MIPS DSPr2 SIMD extensions are now disabled until/unless they can be
+fixed, and other functions that are incompatible with big endian MIPS CPUs are
+disabled when building libjpeg-turbo for such CPUs.
+
+2. Fixed an oversight in the `TJCompressor.compress(int)` method in the
+TurboJPEG Java API that caused an error ("java.lang.IllegalStateException: No
+source image is associated with this instance") when attempting to use that
+method to compress a YUV image.
+
+3. Fixed an issue (CVE-2020-13790) in the PPM reader that caused a buffer
+overrun in cjpeg, TJBench, or the `tjLoadImage()` function if one of the values
+in a binary PPM/PGM input file exceeded the maximum value defined in the file's
+header and that maximum value was less than 255.  libjpeg-turbo 1.5.0 already
+included a similar fix for binary PPM/PGM files with maximum values greater
+than 255.
+
+4. The TurboJPEG API library's global error handler, which is used in functions
+such as `tjBufSize()` and `tjLoadImage()` that do not require a TurboJPEG
+instance handle, is now thread-safe on platforms that support thread-local
+storage.
+
+
+2.0.4
+=====
+
+### Significant changes relative to 2.0.3:
+
+1. Fixed a regression in the Windows packaging system (introduced by
+2.0 beta1[2]) whereby, if both the 64-bit libjpeg-turbo SDK for GCC and the
+64-bit libjpeg-turbo SDK for Visual C++ were installed on the same system, only
+one of them could be uninstalled.
+
+2. Fixed a signed integer overflow and subsequent segfault that occurred when
+attempting to decompress images with more than 715827882 pixels using the
+64-bit C version of TJBench.
+
+3. Fixed out-of-bounds write in `tjDecompressToYUV2()` and
+`tjDecompressToYUVPlanes()` (sometimes manifesting as a double free) that
+occurred when attempting to decompress grayscale JPEG images that were
+compressed with a sampling factor other than 1 (for instance, with
+`cjpeg -grayscale -sample 2x2`).
+
+4. Fixed a regression introduced by 2.0.2[5] that caused the TurboJPEG API to
+incorrectly identify some JPEG images with unusual sampling factors as 4:4:4
+JPEG images.  This was known to cause a buffer overflow when attempting to
+decompress some such images using `tjDecompressToYUV2()` or
+`tjDecompressToYUVPlanes()`.
+
+5. Fixed an issue, detected by ASan, whereby attempting to losslessly transform
+a specially-crafted malformed JPEG image containing an extremely-high-frequency
+coefficient block (junk image data that could never be generated by a
+legitimate JPEG compressor) could cause the Huffman encoder's local buffer to
+be overrun. (Refer to 1.4.0[9] and 1.4beta1[15].)  Given that the buffer
+overrun was fully contained within the stack and did not cause a segfault or
+other user-visible errant behavior, and given that the lossless transformer
+(unlike the decompressor) is not generally exposed to arbitrary data exploits,
+this issue did not likely pose a security risk.
+
+6. The ARM 64-bit (ARMv8) NEON SIMD assembly code now stores constants in a
+separate read-only data section rather than in the text section, to support
+execute-only memory layouts.
+
+
+2.0.3
+=====
+
+### Significant changes relative to 2.0.2:
+
+1. Fixed "using JNI after critical get" errors that occurred on Android
+platforms when passing invalid arguments to certain methods in the TurboJPEG
+Java API.
+
+2. Fixed a regression in the SIMD feature detection code, introduced by
+the AVX2 SIMD extensions (2.0 beta1[1]), that was known to cause an illegal
+instruction exception, in rare cases, on CPUs that lack support for CPUID leaf
+07H (or on which the maximum CPUID leaf has been limited by way of a BIOS
+setting.)
+
+3. The 4:4:0 (h1v2) fancy (smooth) chroma upsampling algorithm in the
+decompressor now uses a similar bias pattern to that of the 4:2:2 (h2v1) fancy
+chroma upsampling algorithm, rounding up or down the upsampled result for
+alternate pixels rather than always rounding down.  This ensures that,
+regardless of whether a 4:2:2 JPEG image is rotated or transposed prior to
+decompression (in the frequency domain) or after decompression (in the spatial
+domain), the final image will be similar.
+
+4. Fixed an integer overflow and subsequent segfault that occurred when
+attempting to compress or decompress images with more than 1 billion pixels
+using the TurboJPEG API.
+
+5. Fixed a regression introduced by 2.0 beta1[15] whereby attempting to
+generate a progressive JPEG image on an SSE2-capable CPU using a scan script
+containing one or more scans with lengths divisible by 16 would result in an
+error ("Missing Huffman code table entry") and an invalid JPEG image.
+
+6. Fixed an issue whereby `tjDecodeYUV()` and `tjDecodeYUVPlanes()` would throw
+an error ("Invalid progressive parameters") or a warning ("Inconsistent
+progression sequence") if passed a TurboJPEG instance that was previously used
+to decompress a progressive JPEG image.
+
+
+2.0.2
+=====
+
+### Significant changes relative to 2.0.1:
+
+1. Fixed a regression introduced by 2.0.1[5] that prevented a runtime search
+path (rpath) from being embedded in the libjpeg-turbo shared libraries and
+executables for macOS and iOS.  This caused a fatal error of the form
+"dyld: Library not loaded" when attempting to use one of the executables,
+unless `DYLD_LIBRARY_PATH` was explicitly set to the location of the
+libjpeg-turbo shared libraries.
+
+2. Fixed an integer overflow and subsequent segfault (CVE-2018-20330) that
+occurred when attempting to load a BMP file with more than 1 billion pixels
+using the `tjLoadImage()` function.
+
+3. Fixed a buffer overrun (CVE-2018-19664) that occurred when attempting to
+decompress a specially-crafted malformed JPEG image to a 256-color BMP using
+djpeg.
+
+4. Fixed a floating point exception that occurred when attempting to
+decompress a specially-crafted malformed JPEG image with a specified image
+width or height of 0 using the C version of TJBench.
+
+5. The TurboJPEG API will now decompress 4:4:4 JPEG images with 2x1, 1x2, 3x1,
+or 1x3 luminance and chrominance sampling factors.  This is a non-standard way
+of specifying 1x subsampling (normally 4:4:4 JPEGs have 1x1 luminance and
+chrominance sampling factors), but the JPEG format and the libjpeg API both
+allow it.
+
+6. Fixed a regression introduced by 2.0 beta1[7] that caused djpeg to generate
+incorrect PPM images when used with the `-colors` option.
+
+7. Fixed an issue whereby a static build of libjpeg-turbo (a build in which
+`ENABLE_SHARED` is `0`) could not be installed using the Visual Studio IDE.
+
+8. Fixed a severe performance issue in the Loongson MMI SIMD extensions that
+occurred when compressing RGB images whose image rows were not 64-bit-aligned.
+
+
 2.0.1
 =====
 
@@ -60,10 +209,11 @@ would produce a "Bogus message code" error message if the underlying bitmap and
 PPM readers/writers threw an error that was specific to the readers/writers
 (as opposed to a general libjpeg API error.)
 
-4. Fixed an issue whereby a specially-crafted malformed BMP file, one in which
-the header specified an image width of 1073741824 pixels, would trigger a
-floating point exception (division by zero) in the `tjLoadImage()` function
-when attempting to load the BMP file into a 4-component image buffer.
+4. Fixed an issue (CVE-2018-1152) whereby a specially-crafted malformed BMP
+file, one in which the header specified an image width of 1073741824 pixels,
+would trigger a floating point exception (division by zero) in the
+`tjLoadImage()` function when attempting to load the BMP file into a
+4-component image buffer.
 
 5. Fixed an issue whereby certain combinations of calls to
 `jpeg_skip_scanlines()` and `jpeg_read_scanlines()` could trigger an infinite
@@ -77,10 +227,10 @@ a 4:2:2 or 4:2:0 JPEG image using the merged (non-fancy) upsampling algorithms
 7. The new CMake-based build system will now disable the MIPS DSPr2 SIMD
 extensions if it detects that the compiler does not support DSPr2 instructions.
 
-8. Fixed out-of-bounds read in cjpeg that occurred when attempting to compress
-a specially-crafted malformed color-index (8-bit-per-sample) BMP file in which
-some of the samples (color indices) exceeded the bounds of the BMP file's color
-table.
+8. Fixed out-of-bounds read in cjpeg (CVE-2018-14498) that occurred when
+attempting to compress a specially-crafted malformed color-index
+(8-bit-per-sample) BMP file in which some of the samples (color indices)
+exceeded the bounds of the BMP file's color table.
 
 9. Fixed a signed integer overflow in the progressive Huffman decoder, detected
 by the Clang and GCC undefined behavior sanitizers, that could be triggered by
@@ -240,8 +390,8 @@ write scanlines in bottom-up order.)  djpeg will now exit gracefully if an
 output format other than PPM/PGM, GIF, or Targa is selected along with the
 `-crop` option.
 
-4. Fixed an issue whereby `jpeg_skip_scanlines()` would segfault if color
-quantization was enabled.
+4. Fixed an issue (CVE-2017-15232) whereby `jpeg_skip_scanlines()` would
+segfault if color quantization was enabled.
 
 5. TJBench (both C and Java versions) will now display usage information if any
 command-line argument is unrecognized.  This prevents the program from silently
@@ -442,10 +592,10 @@ application was linked against.
 
 3. Fixed a couple of issues in the PPM reader that would cause buffer overruns
 in cjpeg if one of the values in a binary PPM/PGM input file exceeded the
-maximum value defined in the file's header.  libjpeg-turbo 1.4.2 already
-included a similar fix for ASCII PPM/PGM files.  Note that these issues were
-not security bugs, since they were confined to the cjpeg program and did not
-affect any of the libjpeg-turbo libraries.
+maximum value defined in the file's header and that maximum value was greater
+than 255.  libjpeg-turbo 1.4.2 already included a similar fix for ASCII PPM/PGM
+files.  Note that these issues were not security bugs, since they were confined
+to the cjpeg program and did not affect any of the libjpeg-turbo libraries.
 
 4. Fixed an issue whereby attempting to decompress a JPEG file with a corrupt
 header using the `tjDecompressToYUV2()` function would cause the function to
@@ -868,13 +1018,13 @@ and IDCT algorithms (both are used during JPEG decompression.)  For unknown
 reasons (probably related to clang), this code cannot currently be compiled for
 iOS.
 
-15. Fixed an extremely rare bug that could cause the Huffman encoder's local
-buffer to overrun when a very high-frequency MCU is compressed using quality
-100 and no subsampling, and when the JPEG output buffer is being dynamically
-resized by the destination manager.  This issue was so rare that, even with a
-test program specifically designed to make the bug occur (by injecting random
-high-frequency YUV data into the compressor), it was reproducible only once in
-about every 25 million iterations.
+15. Fixed an extremely rare bug (CVE-2014-9092) that could cause the Huffman
+encoder's local buffer to overrun when a very high-frequency MCU is compressed
+using quality 100 and no subsampling, and when the JPEG output buffer is being
+dynamically resized by the destination manager.  This issue was so rare that,
+even with a test program specifically designed to make the bug occur (by
+injecting random high-frequency YUV data into the compressor), it was
+reproducible only once in about every 25 million iterations.
 
 16. Fixed an oversight in the TurboJPEG C wrapper:  if any of the JPEG
 compression functions was called repeatedly with the same
@@ -909,8 +1059,9 @@ entropy coding (by passing arguments of `-progressive -arithmetic` to cjpeg or
 jpegtran, for instance) would result in an error, `Requested feature was
 omitted at compile time`.
 
-4. Fixed a couple of issues whereby malformed JPEG images would cause
-libjpeg-turbo to use uninitialized memory during decompression.
+4. Fixed a couple of issues (CVE-2013-6629 and CVE-2013-6630) whereby malformed
+JPEG images would cause libjpeg-turbo to use uninitialized memory during
+decompression.
 
 5. Fixed an error (`Buffer passed to JPEG library is too small`) that occurred
 when calling the TurboJPEG YUV encoding function with a very small (< 5x5)
@@ -1049,9 +1200,9 @@ correct behavior of the colorspace extensions when merged upsampling is used.
 upper 64 bits of xmm6 and xmm7 on Win64 platforms, which violated the Win64
 calling conventions.
 
-4. Fixed a regression caused by 1.2.0[6] whereby decompressing corrupt JPEG
-images (specifically, images in which the component count was erroneously set
-to a large value) would cause libjpeg-turbo to segfault.
+4. Fixed a regression (CVE-2012-2806) caused by 1.2.0[6] whereby decompressing
+corrupt JPEG images (specifically, images in which the component count was
+erroneously set to a large value) would cause libjpeg-turbo to segfault.
 
 5. Worked around a severe performance issue with "Bobcat" (AMD Embedded APU)
 processors.  The `MASKMOVDQU` instruction, which was used by the libjpeg-turbo
diff --git a/LICENSE.md b/LICENSE.md
index 0f6ec4b..99c9aad 100644
--- a/LICENSE.md
+++ b/LICENSE.md
@@ -14,7 +14,7 @@ libjpeg-turbo is covered by three compatible BSD-style open source licenses:
   This license covers the TurboJPEG API library and associated programs, as
   well as the build system.
 
-- The zlib License, which is listed below
+- The [zlib License](https://opensource.org/licenses/Zlib)
 
   This license is a subset of the other two, and it covers the libjpeg-turbo
   SIMD extensions.
@@ -66,7 +66,7 @@ best of our understanding.
 
     2.  If your binary distribution includes or uses the TurboJPEG API, then
         your product documentation must include the text of the Modified BSD
-        License.
+        License (see below.)
 
         **Origin**
         - Clause 2 of the Modified BSD License
@@ -91,7 +91,8 @@ best of our understanding.
 The Modified (3-clause) BSD License
 ===================================
 
-Copyright (C)\<YEAR\> \<AUTHOR\>.  All Rights Reserved.
+Copyright (C)2009-2020 D. R. Commander.  All Rights Reserved.
+Copyright (C)2015 Viktor Szathmáry.  All Rights Reserved.
 
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:
@@ -118,28 +119,6 @@ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 POSSIBILITY OF SUCH DAMAGE.
 
 
-The zlib License
-================
-
-Copyright (C) \<YEAR\>, \<AUTHOR\>.
-
-This software is provided 'as-is', without any express or implied
-warranty.  In no event will the authors be held liable for any damages
-arising from the use of this software.
-
-Permission is granted to anyone to use this software for any purpose,
-including commercial applications, and to alter it and redistribute it
-freely, subject to the following restrictions:
-
-1. The origin of this software must not be misrepresented; you must not
-   claim that you wrote the original software. If you use this software
-   in a product, an acknowledgment in the product documentation would be
-   appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be
-   misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-
-
 Why Three Licenses?
 ===================
 
diff --git a/README.chromium b/README.chromium
index c0992dd..f70cb89 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libjpeg-turbo
 URL: https://github.com/libjpeg-turbo/libjpeg-turbo/
-Version: 2.0.1
+Version: 2.0.5
 License: Custom license
 License File: LICENSE.md
 Security Critical: yes
@@ -8,19 +8,17 @@ License Android Compatible: yes
 
 Description:
 This consists of the components:
-* libjpeg-turbo 2.0.1
+* libjpeg-turbo 2.0.5
 * This file (README.chromium)
 * A build file (BUILD.gn)
 * An OWNERS file
 * A codereview.settings file
 * Patched header files used by Chromium
-* Cherry picked an unused variable/function warning from upstream master:
-  https://github.com/libjpeg-turbo/libjpeg-turbo/commit/b46af82cc159bbd05312417d003cfab67c340156
-* Cherry picked checking the maximum supported CPUID leaf:
-  https://github.com/libjpeg-turbo/libjpeg-turbo/commit/aa9db616774e24af7ab2fbcddd5711057b8a901e
+* Cherry picked a fix from upstream master to enable AArch64 Windows builds:
+  https://github.com/libjpeg-turbo/libjpeg-turbo/commit/6ee5d5f568fda1a7c6a49dd8995f2d89866ee42d
 * Deleted unused directories: ci, cmakescripts, doc, java, release, sharedlib,
   simd/loongson, simd/mips, simd/powerpc, and win
-* Deleted unused files: appveyor.yml, Brewfile, CMakeLists.txt, doxygen.config,
+* Deleted unused files: appveyor.yml, CMakeLists.txt, doxygen.config,
   doxygen-extra.css, .gitattributes, tjexample.c, tjexampletest.java.in,
   .travis.yml
 
diff --git a/README.md b/README.md
index a769259..e7ff743 100644
--- a/README.md
+++ b/README.md
@@ -1,14 +1,14 @@
 Background
 ==========
 
-libjpeg-turbo is a JPEG image codec that uses SIMD instructions (MMX, SSE2,
-AVX2, NEON, AltiVec) to accelerate baseline JPEG compression and decompression
-on x86, x86-64, ARM, and PowerPC systems, as well as progressive JPEG
-compression on x86 and x86-64 systems.  On such systems, libjpeg-turbo is
-generally 2-6x as fast as libjpeg, all else being equal.  On other types of
-systems, libjpeg-turbo can still outperform libjpeg by a significant amount, by
-virtue of its highly-optimized Huffman coding routines.  In many cases, the
-performance of libjpeg-turbo rivals that of proprietary high-speed JPEG codecs.
+libjpeg-turbo is a JPEG image codec that uses SIMD instructions to accelerate
+baseline JPEG compression and decompression on x86, x86-64, ARM, PowerPC, and
+MIPS systems, as well as progressive JPEG compression on x86 and x86-64
+systems.  On such systems, libjpeg-turbo is generally 2-6x as fast as libjpeg,
+all else being equal.  On other types of systems, libjpeg-turbo can still
+outperform libjpeg by a significant amount, by virtue of its highly-optimized
+Huffman coding routines.  In many cases, the performance of libjpeg-turbo
+rivals that of proprietary high-speed JPEG codecs.
 
 libjpeg-turbo implements both the traditional libjpeg API as well as the less
 powerful but more straightforward TurboJPEG API.  libjpeg-turbo also features
@@ -135,25 +135,24 @@ without recompiling.  libjpeg-turbo does not claim to support all of the
 libjpeg v7+ features, nor to produce identical output to libjpeg v7+ in all
 cases (see below.)
 
-By passing an argument of `--with-jpeg7` or `--with-jpeg8` to `configure`, or
-an argument of `-DWITH_JPEG7=1` or `-DWITH_JPEG8=1` to `cmake`, you can build a
-version of libjpeg-turbo that emulates the libjpeg v7 or v8 ABI, so that
-programs that are built against libjpeg v7 or v8 can be run with libjpeg-turbo.
-The following section describes which libjpeg v7+ features are supported and
-which aren't.
+By passing an argument of `-DWITH_JPEG7=1` or `-DWITH_JPEG8=1` to `cmake`, you
+can build a version of libjpeg-turbo that emulates the libjpeg v7 or v8 ABI, so
+that programs that are built against libjpeg v7 or v8 can be run with
+libjpeg-turbo.  The following section describes which libjpeg v7+ features are
+supported and which aren't.
 
 ### Support for libjpeg v7 and v8 Features
 
 #### Fully supported
 
-- **libjpeg: IDCT scaling extensions in decompressor**<br>
+- **libjpeg API: IDCT scaling extensions in decompressor**<br>
   libjpeg-turbo supports IDCT scaling with scaling factors of 1/8, 1/4, 3/8,
   1/2, 5/8, 3/4, 7/8, 9/8, 5/4, 11/8, 3/2, 13/8, 7/4, 15/8, and 2/1 (only 1/4
   and 1/2 are SIMD-accelerated.)
 
-- **libjpeg: Arithmetic coding**
+- **libjpeg API: Arithmetic coding**
 
-- **libjpeg: In-memory source and destination managers**<br>
+- **libjpeg API: In-memory source and destination managers**<br>
   See notes below.
 
 - **cjpeg: Separate quality settings for luminance and chrominance**<br>
@@ -185,14 +184,14 @@ means of quality improvement.  The reader is invited to peruse the research at
 but it is the general belief of our project that these features have not
 demonstrated sufficient usefulness to justify inclusion in libjpeg-turbo.
 
-- **libjpeg: DCT scaling in compressor**<br>
+- **libjpeg API: DCT scaling in compressor**<br>
   `cinfo.scale_num` and `cinfo.scale_denom` are silently ignored.
   There is no technical reason why DCT scaling could not be supported when
   emulating the libjpeg v7+ API/ABI, but without the SmartScale extension (see
   below), only scaling factors of 1/2, 8/15, 4/7, 8/13, 2/3, 8/11, 4/5, and
   8/9 would be available, which is of limited usefulness.
 
-- **libjpeg: SmartScale**<br>
+- **libjpeg API: SmartScale**<br>
   `cinfo.block_size` is silently ignored.
   SmartScale is an extension to the JPEG format that allows for DCT block
   sizes other than 8x8.  Providing support for this new format would be
@@ -205,7 +204,7 @@ demonstrated sufficient usefulness to justify inclusion in libjpeg-turbo.
   interest in providing this feature would be as a means of supporting
   additional DCT scaling factors.
 
-- **libjpeg: Fancy downsampling in compressor**<br>
+- **libjpeg API: Fancy downsampling in compressor**<br>
   `cinfo.do_fancy_downsampling` is silently ignored.
   This requires the DCT scaling feature, which is not supported.
 
@@ -247,15 +246,14 @@ don't, and it allows those functions to be provided in the "official"
 libjpeg-turbo binaries.
 
 Those who are concerned about maintaining strict conformance with the libjpeg
-v6b or v7 API can pass an argument of `--without-mem-srcdst` to `configure` or
-an argument of `-DWITH_MEM_SRCDST=0` to `cmake` prior to building
-libjpeg-turbo.  This will restore the pre-1.3 behavior, in which
+v6b or v7 API can pass an argument of `-DWITH_MEM_SRCDST=0` to `cmake` prior to
+building libjpeg-turbo.  This will restore the pre-1.3 behavior, in which
 `jpeg_mem_src()` and `jpeg_mem_dest()` are only included when emulating the
 libjpeg v8 API/ABI.
 
 On Un*x systems, including the in-memory source/destination managers changes
-the dynamic library version from 62.1.0 to 62.2.0 if using libjpeg v6b API/ABI
-emulation and from 7.1.0 to 7.2.0 if using libjpeg v7 API/ABI emulation.
+the dynamic library version from 62.2.0 to 62.3.0 if using libjpeg v6b API/ABI
+emulation and from 7.2.0 to 7.3.0 if using libjpeg v7 API/ABI emulation.
 
 Note that, on most Un*x systems, the dynamic linker will not look for a
 function in a library until that function is actually used.  Thus, if a program
@@ -331,7 +329,7 @@ in a way that makes the rest of the libjpeg infrastructure happy, so it is
 necessary to use the slow Huffman decoder when decompressing a JPEG image that
 has restart markers.  This can cause the decompression performance to drop by
 as much as 20%, but the performance will still be much greater than that of
-libjpeg.  Many consumer packages, such as PhotoShop, use restart markers when
+libjpeg.  Many consumer packages, such as Photoshop, use restart markers when
 generating JPEG images, so images generated by those programs will experience
 this issue.
 
@@ -344,3 +342,15 @@ quality of 98-100.  Thus, libjpeg-turbo must use the non-SIMD quantization
 function in those cases.  This causes performance to drop by as much as 40%.
 It is therefore strongly advised that you use the slow integer forward DCT
 whenever encoding images with a JPEG quality of 98 or higher.
+
+
+Memory Debugger Pitfalls
+========================
+
+Valgrind and Memory Sanitizer (MSan) can generate false positives
+(specifically, incorrect reports of uninitialized memory accesses) when used
+with libjpeg-turbo's SIMD extensions.  It is generally recommended that the
+SIMD extensions be disabled, either by passing an argument of `-DWITH_SIMD=0`
+to `cmake` when configuring the build or by setting the environment variable
+`JSIMD_FORCENONE` to `1` at run time, when testing libjpeg-turbo with Valgrind,
+MSan, or other memory debuggers.
diff --git a/cjpeg.c b/cjpeg.c
index e807510..e80d16b 100644
--- a/cjpeg.c
+++ b/cjpeg.c
@@ -686,12 +686,10 @@ main(int argc, char **argv)
 
   if (memdst) {
     fprintf(stderr, "Compressed size:  %lu bytes\n", outsize);
-    if (outbuffer != NULL)
-      free(outbuffer);
+    free(outbuffer);
   }
 
-  if (icc_profile != NULL)
-    free(icc_profile);
+  free(icc_profile);
 
   /* All done. */
   return (jerr.num_warnings ? EXIT_WARNING : EXIT_SUCCESS);
diff --git a/djpeg.c b/djpeg.c
index acacfcb..2489ccc 100644
--- a/djpeg.c
+++ b/djpeg.c
@@ -520,7 +520,9 @@ main(int argc, char **argv)
   FILE *input_file;
   FILE *output_file;
   unsigned char *inbuffer = NULL;
+#if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED)
   unsigned long insize = 0;
+#endif
   JDIMENSION num_scanlines;
 
   /* On Mac, fetch a command line. */
@@ -815,7 +817,7 @@ main(int argc, char **argv)
   end_progress_monitor((j_common_ptr)&cinfo);
 #endif
 
-  if (memsrc && inbuffer != NULL)
+  if (memsrc)
     free(inbuffer);
 
   /* All done. */
diff --git a/jchuff.c b/jchuff.c
index 939b3e7..cb05055 100644
--- a/jchuff.c
+++ b/jchuff.c
@@ -4,7 +4,7 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2009-2011, 2014-2016, 2018, D. R. Commander.
+ * Copyright (C) 2009-2011, 2014-2016, 2018-2019, D. R. Commander.
  * Copyright (C) 2015, Matthieu Darbois.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
@@ -43,8 +43,8 @@
  */
 
 /* NOTE: Both GCC and Clang define __GNUC__ */
-#if defined __GNUC__ && (defined __arm__ || defined __aarch64__)
-#if !defined __thumb__ || defined __thumb2__
+#if defined(__GNUC__) && (defined(__arm__) || defined(__aarch64__))
+#if !defined(__thumb__) || defined(__thumb2__)
 #define USE_CLZ_INTRINSIC
 #endif
 #endif
@@ -356,6 +356,8 @@ dump_buffer(working_state *state)
   put_buffer = (put_buffer << size) | code; \
 }
 
+#if SIZEOF_SIZE_T != 8 && !defined(_WIN64)
+
 #define CHECKBUF15() { \
   if (put_bits > 15) { \
     EMIT_BYTE() \
@@ -363,6 +365,8 @@ dump_buffer(working_state *state)
   } \
 }
 
+#endif
+
 #define CHECKBUF31() { \
   if (put_bits > 31) { \
     EMIT_BYTE() \
@@ -428,7 +432,7 @@ dump_buffer(working_state *state)
  * scanning order-- 1, 8, 16, etc.), then this will produce an encoded block
  * larger than 200 bytes.
  */
-#define BUFSIZE  (DCTSIZE2 * 4)
+#define BUFSIZE  (DCTSIZE2 * 8)
 
 #define LOAD_BUFFER() { \
   if (state->free_in_buffer < BUFSIZE) { \
diff --git a/jcmaster.c b/jcmaster.c
index 93b3de6..998dc40 100644
--- a/jcmaster.c
+++ b/jcmaster.c
@@ -492,8 +492,8 @@ prepare_for_pass(j_compress_ptr cinfo)
      */
     master->pass_type = output_pass;
     master->pass_number++;
-    /*FALLTHROUGH*/
 #endif
+    /*FALLTHROUGH*/
   case output_pass:
     /* Do a data-output pass. */
     /* We need not repeat per-scan setup if prior optimization pass did it. */
diff --git a/jconfigint.h b/jconfigint.h
index fc6ed05..10bcac7 100644
--- a/jconfigint.h
+++ b/jconfigint.h
@@ -15,11 +15,18 @@
 #endif
 #endif
 
+/* How to obtain thread-local storage */
+#if defined(_MSC_VER) && (defined(_WIN32) || defined(_WIN64))
+#define THREAD_LOCAL  __declspec(thread)
+#else
+#define THREAD_LOCAL __thread
+#endif
+
 /* Define to the full name of this package. */
 #define PACKAGE_NAME "libjpeg-turbo"
 
 /* Version number of package */
-#define VERSION "2.0.1"
+#define VERSION "2.0.5"
 
 /* The size of `size_t', as computed by sizeof. */
 #if __WORDSIZE==64 || defined(_WIN64)
diff --git a/jconfigint.h.in b/jconfigint.h.in
index 55df053..68cbc2a 100644
--- a/jconfigint.h.in
+++ b/jconfigint.h.in
@@ -7,6 +7,9 @@
 /* How to obtain function inlining. */
 #define INLINE  @INLINE@
 
+/* How to obtain thread-local storage */
+#define THREAD_LOCAL  @THREAD_LOCAL@
+
 /* Define to the full name of this package. */
 #define PACKAGE_NAME  "@CMAKE_PROJECT_NAME@"
 
diff --git a/jcphuff.c b/jcphuff.c
index 024d3af..8c4efaf 100644
--- a/jcphuff.c
+++ b/jcphuff.c
@@ -52,8 +52,8 @@
  */
 
 /* NOTE: Both GCC and Clang define __GNUC__ */
-#if defined __GNUC__ && (defined __arm__ || defined __aarch64__)
-#if !defined __thumb__ || defined __thumb2__
+#if defined(__GNUC__) && (defined(__arm__) || defined(__aarch64__))
+#if !defined(__thumb__) || defined(__thumb2__)
 #define USE_CLZ_INTRINSIC
 #endif
 #endif
diff --git a/jdatadst-tj.c b/jdatadst-tj.c
index 0bd961b..fdaa2de 100644
--- a/jdatadst-tj.c
+++ b/jdatadst-tj.c
@@ -5,7 +5,7 @@
  * Copyright (C) 1994-1996, Thomas G. Lane.
  * Modified 2009-2012 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2011, 2014, 2016, D. R. Commander.
+ * Copyright (C) 2011, 2014, 2016, 2019, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -27,6 +27,8 @@
 extern void *malloc(size_t size);
 extern void free(void *ptr);
 #endif
+void jpeg_mem_dest_tj(j_compress_ptr cinfo, unsigned char **outbuffer,
+                      unsigned long *outsize, boolean alloc);
 
 
 #define OUTPUT_BUF_SIZE  4096   /* choose an efficiently fwrite'able size */
@@ -101,8 +103,7 @@ empty_mem_output_buffer(j_compress_ptr cinfo)
 
   MEMCOPY(nextbuffer, dest->buffer, dest->bufsize);
 
-  if (dest->newbuffer != NULL)
-    free(dest->newbuffer);
+  free(dest->newbuffer);
 
   dest->newbuffer = nextbuffer;
 
diff --git a/jdatadst.c b/jdatadst.c
index 3168b96..246fffb 100644
--- a/jdatadst.c
+++ b/jdatadst.c
@@ -143,8 +143,7 @@ empty_mem_output_buffer(j_compress_ptr cinfo)
 
   MEMCOPY(nextbuffer, dest->buffer, dest->bufsize);
 
-  if (dest->newbuffer != NULL)
-    free(dest->newbuffer);
+  free(dest->newbuffer);
 
   dest->newbuffer = nextbuffer;
 
diff --git a/jdatasrc-tj.c b/jdatasrc-tj.c
index 1c71307..69fb5ea 100644
--- a/jdatasrc-tj.c
+++ b/jdatasrc-tj.c
@@ -5,7 +5,7 @@
  * Copyright (C) 1994-1996, Thomas G. Lane.
  * Modified 2009-2011 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2011, 2016, D. R. Commander.
+ * Copyright (C) 2011, 2016, 2019, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -23,6 +23,9 @@
 #include "jpeglib.h"
 #include "jerror.h"
 
+void jpeg_mem_src_tj(j_decompress_ptr cinfo, const unsigned char *inbuffer,
+                     unsigned long insize);
+
 
 /*
  * Initialize source --- called by jpeg_read_header
diff --git a/jdhuff.c b/jdhuff.c
index 87dff97..3880890 100644
--- a/jdhuff.c
+++ b/jdhuff.c
@@ -4,7 +4,7 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2009-2011, 2016, 2018, D. R. Commander.
+ * Copyright (C) 2009-2011, 2016, 2018-2019, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -589,7 +589,11 @@ decode_mcu_slow(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
     if (entropy->dc_needed[blkn]) {
       /* Convert DC difference to actual value, update last_dc_val */
       int ci = cinfo->MCU_membership[blkn];
-      s += state.last_dc_val[ci];
+      /* This is really just
+       *   s += state.last_dc_val[ci];
+       * It is written this way in order to shut up UBSan.
+       */
+      s = (int)((unsigned int)s + (unsigned int)state.last_dc_val[ci]);
       state.last_dc_val[ci] = s;
       if (block) {
         /* Output the DC coefficient (assumes jpeg_natural_order[0] = 0) */
@@ -684,7 +688,7 @@ decode_mcu_fast(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
 
     if (entropy->dc_needed[blkn]) {
       int ci = cinfo->MCU_membership[blkn];
-      s += state.last_dc_val[ci];
+      s = (int)((unsigned int)s + (unsigned int)state.last_dc_val[ci]);
       state.last_dc_val[ci] = s;
       if (block)
         (*block)[0] = (JCOEF)s;
diff --git a/jdmerge.c b/jdmerge.c
index b3fec04..dff5a35 100644
--- a/jdmerge.c
+++ b/jdmerge.c
@@ -429,8 +429,6 @@ h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
 #define PACK_TWO_PIXELS_LE(l, r)    ((r << 16) | l)
 #define PACK_TWO_PIXELS_BE(l, r)    ((l << 16) | r)
 
-#define PACK_NEED_ALIGNMENT(ptr)    (((size_t)(ptr)) & 3)
-
 #define WRITE_TWO_PIXELS_LE(addr, pixels) { \
   ((INT16 *)(addr))[0] = (INT16)(pixels); \
   ((INT16 *)(addr))[1] = (INT16)((pixels) >> 16); \
diff --git a/jdsample.c b/jdsample.c
index 59257e1..49016ce 100644
--- a/jdsample.c
+++ b/jdsample.c
@@ -8,6 +8,7 @@
  * Copyright (C) 2010, 2015-2016, D. R. Commander.
  * Copyright (C) 2014, MIPS Technologies, Inc., California.
  * Copyright (C) 2015, Google, Inc.
+ * Copyright (C) 2019, Arm Limited.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
diff --git a/jfdctint.c b/jfdctint.c
index c0391a9..b47c306 100644
--- a/jfdctint.c
+++ b/jfdctint.c
@@ -1,7 +1,7 @@
 /*
  * jfdctint.c
  *
- * This file was part of the Independent JPEG Group's software.
+ * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1996, Thomas G. Lane.
  * libjpeg-turbo Modifications:
  * Copyright (C) 2015, D. R. Commander.
diff --git a/jidctint.c b/jidctint.c
index 5557342..98425d5 100644
--- a/jidctint.c
+++ b/jidctint.c
@@ -1,7 +1,7 @@
 /*
  * jidctint.c
  *
- * This file was part of the Independent JPEG Group's software.
+ * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1998, Thomas G. Lane.
  * Modification developed 2002-2009 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
diff --git a/jidctred.c b/jidctred.c
index 1ff352f..1dd65a9 100644
--- a/jidctred.c
+++ b/jidctred.c
@@ -1,7 +1,7 @@
 /*
  * jidctred.c
  *
- * This file was part of the Independent JPEG Group's software.
+ * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1994-1998, Thomas G. Lane.
  * libjpeg-turbo Modifications:
  * Copyright (C) 2015, D. R. Commander.
diff --git a/jpegtran.c b/jpegtran.c
index 55bdb4a..1f7e521 100644
--- a/jpegtran.c
+++ b/jpegtran.c
@@ -595,8 +595,7 @@ main(int argc, char **argv)
   end_progress_monitor((j_common_ptr)&dstinfo);
 #endif
 
-  if (icc_profile != NULL)
-    free(icc_profile);
+  free(icc_profile);
 
   /* All done. */
   return (jsrcerr.num_warnings + jdsterr.num_warnings ?
diff --git a/jversion.h b/jversion.h
index 2039f44..ab4a2c5 100644
--- a/jversion.h
+++ b/jversion.h
@@ -4,7 +4,7 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-2012, Thomas G. Lane, Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2010, 2012-2018, D. R. Commander.
+ * Copyright (C) 2010, 2012-2020, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -36,7 +36,7 @@
  */
 
 #define JCOPYRIGHT \
-  "Copyright (C) 2009-2018 D. R. Commander\n" \
+  "Copyright (C) 2009-2020 D. R. Commander\n" \
   "Copyright (C) 2011-2016 Siarhei Siamashka\n" \
   "Copyright (C) 2015-2016, 2018 Matthieu Darbois\n" \
   "Copyright (C) 2015 Intel Corporation\n" \
@@ -49,4 +49,4 @@
   "Copyright (C) 1991-2016 Thomas G. Lane, Guido Vollbeding"
 
 #define JCOPYRIGHT_SHORT \
-  "Copyright (C) 1991-2018 The libjpeg-turbo Project and many others"
+  "Copyright (C) 1991-2020 The libjpeg-turbo Project and many others"
diff --git a/rdjpgcom.c b/rdjpgcom.c
index e9f31d2..620270e 100644
--- a/rdjpgcom.c
+++ b/rdjpgcom.c
@@ -118,7 +118,6 @@ read_2_bytes(void)
 #define M_SOI    0xD8           /* Start Of Image (beginning of datastream) */
 #define M_EOI    0xD9           /* End Of Image (end of datastream) */
 #define M_SOS    0xDA           /* Start Of Scan (begins compressed data) */
-#define M_APP0   0xE0           /* Application-specific marker, type N */
 #define M_APP12  0xEC           /* (we don't bother to list all 16 APPn's) */
 #define M_COM    0xFE           /* COMment */
 
diff --git a/rdppm.c b/rdppm.c
index 87bc330..a8507b9 100644
--- a/rdppm.c
+++ b/rdppm.c
@@ -5,7 +5,7 @@
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * Modified 2009 by Bill Allombert, Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2015-2017, D. R. Commander.
+ * Copyright (C) 2015-2017, 2020, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -720,7 +720,7 @@ start_input_ppm(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
     /* On 16-bit-int machines we have to be careful of maxval = 65535 */
     source->rescale = (JSAMPLE *)
       (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
-                                  (size_t)(((long)maxval + 1L) *
+                                  (size_t)(((long)MAX(maxval, 255) + 1L) *
                                            sizeof(JSAMPLE)));
     half_maxval = maxval / 2;
     for (val = 0; val <= (long)maxval; val++) {
diff --git a/rdswitch.c b/rdswitch.c
index c50c33e..886fec3 100644
--- a/rdswitch.c
+++ b/rdswitch.c
@@ -338,8 +338,8 @@ set_quality_ratings(j_compress_ptr cinfo, char *arg, boolean force_baseline)
 #else
       q_scale_factor[tblno] = jpeg_quality_scaling(val);
 #endif
-      while (*arg && *arg++ != ',') /* advance to next segment of arg string */
-        ;
+      while (*arg && *arg++ != ','); /* advance to next segment of arg
+                                        string */
     } else {
       /* reached end of parameter, set remaining factors to last value */
 #if JPEG_LIB_VERSION >= 70
@@ -378,8 +378,8 @@ set_quant_slots(j_compress_ptr cinfo, char *arg)
         return FALSE;
       }
       cinfo->comp_info[ci].quant_tbl_no = val;
-      while (*arg && *arg++ != ',') /* advance to next segment of arg string */
-        ;
+      while (*arg && *arg++ != ','); /* advance to next segment of arg
+                                        string */
     } else {
       /* reached end of parameter, set remaining components to last table */
       cinfo->comp_info[ci].quant_tbl_no = val;
@@ -412,8 +412,8 @@ set_sample_factors(j_compress_ptr cinfo, char *arg)
       }
       cinfo->comp_info[ci].h_samp_factor = val1;
       cinfo->comp_info[ci].v_samp_factor = val2;
-      while (*arg && *arg++ != ',') /* advance to next segment of arg string */
-        ;
+      while (*arg && *arg++ != ',');  /* advance to next segment of arg
+                                         string */
     } else {
       /* reached end of parameter, set remaining components to 1x1 sampling */
       cinfo->comp_info[ci].h_samp_factor = 1;
diff --git a/simd/CMakeLists.txt b/simd/CMakeLists.txt
index 8dbd7f1..5c8009a 100644
--- a/simd/CMakeLists.txt
+++ b/simd/CMakeLists.txt
@@ -38,6 +38,14 @@ elseif(CPU_TYPE STREQUAL "i386")
   endif()
 endif()
 
+if(NOT REQUIRE_SIMD)
+  include(CheckLanguage)
+  check_language(ASM_NASM)
+  if(NOT CMAKE_ASM_NASM_COMPILER)
+    simd_fail("SIMD extensions disabled: could not find NASM compiler")
+    return()
+  endif()
+endif()
 enable_language(ASM_NASM)
 message(STATUS "CMAKE_ASM_NASM_COMPILER = ${CMAKE_ASM_NASM_COMPILER}")
 
@@ -135,6 +143,9 @@ endif()
 if(MSVC_IDE)
   set(OBJDIR "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_CFG_INTDIR}")
   string(REGEX REPLACE " " ";" CMAKE_ASM_NASM_FLAGS "${CMAKE_ASM_NASM_FLAGS}")
+elseif(XCODE)
+  set(OBJDIR "${CMAKE_CURRENT_BINARY_DIR}")
+  string(REGEX REPLACE " " ";" CMAKE_ASM_NASM_FLAGS "${CMAKE_ASM_NASM_FLAGS}")
 endif()
 
 file(GLOB INC_FILES nasm/*.inc)
@@ -162,13 +173,13 @@ foreach(file ${SIMD_SOURCES})
       ${CMAKE_CURRENT_SOURCE_DIR}/${DEPFILE})
   endif()
   set(OBJECT_DEPENDS ${OBJECT_DEPENDS} ${INC_FILES})
-  if(MSVC_IDE)
+  if(MSVC_IDE OR XCODE)
     # The CMake Visual Studio generators do not work properly with the ASM_NASM
     # language, so we have to go rogue here and use a custom command like we
     # did in prior versions of libjpeg-turbo.  (This is why we can't have nice
     # things.)
     string(REGEX REPLACE "${CPU_TYPE}/" "" filename ${file})
-    set(SIMD_OBJ ${OBJDIR}/${filename}.obj)
+    set(SIMD_OBJ ${OBJDIR}/${filename}${CMAKE_C_OUTPUT_EXTENSION})
     add_custom_command(OUTPUT ${SIMD_OBJ} DEPENDS ${file} ${OBJECT_DEPENDS}
       COMMAND ${CMAKE_ASM_NASM_COMPILER} -f${CMAKE_ASM_NASM_OBJECT_FORMAT}
         ${CMAKE_ASM_NASM_FLAGS} ${CMAKE_CURRENT_SOURCE_DIR}/${file}
@@ -180,7 +191,7 @@ foreach(file ${SIMD_SOURCES})
   endif()
 endforeach()
 
-if(MSVC_IDE)
+if(MSVC_IDE OR XCODE)
   set(SIMD_OBJS ${SIMD_OBJS} PARENT_SCOPE)
   add_library(simd OBJECT ${CPU_TYPE}/jsimd.c)
   add_custom_target(simd-objs DEPENDS ${SIMD_OBJS})
diff --git a/simd/arm/arm/jsimd.c b/simd/arm/arm/jsimd.c
index 5476c36..ad176dc 100644
--- a/simd/arm/arm/jsimd.c
+++ b/simd/arm/arm/jsimd.c
@@ -5,7 +5,7 @@
  * Copyright (C) 2011, Nokia Corporation and/or its subsidiary(-ies).
  * Copyright (C) 2009-2011, 2013-2014, 2016, 2018, D. R. Commander.
  * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
- * Copyright 2019 Google LLC.
+ * Copyright (C) 2019, Google LLC.
  *
  * Based on the x86 SIMD extension for IJG JPEG library,
  * Copyright (C) 1999-2006, MIYASAKA Masaru.
diff --git a/simd/arm/arm64/jsimd_neon.S b/simd/arm/arm64/jsimd_neon.S
index 2671148..020353f 100644
--- a/simd/arm/arm64/jsimd_neon.S
+++ b/simd/arm/arm64/jsimd_neon.S
@@ -31,6 +31,147 @@
 .section .note.GNU-stack, "", %progbits  /* mark stack as non-executable */
 #endif
 
+#if defined(__APPLE__)
+.section __DATA, __const
+#elif defined(_WIN32)
+.section .rdata
+#else
+.section .rodata, "a", %progbits
+#endif
+
+/* Constants for jsimd_*_ycc_neon() */
+
+.balign 16
+Ljsimd_rgb_ycc_neon_consts:
+  .short 19595, 38470, 7471, 11059
+  .short 21709, 32768, 27439, 5329
+  .short 32767, 128, 32767, 128
+  .short 32767, 128, 32767, 128
+
+/* Constants for jsimd_fdct_islow_neon() */
+
+#define F_0_298   2446  /* FIX(0.298631336) */
+#define F_0_390   3196  /* FIX(0.390180644) */
+#define F_0_541   4433  /* FIX(0.541196100) */
+#define F_0_765   6270  /* FIX(0.765366865) */
+#define F_0_899   7373  /* FIX(0.899976223) */
+#define F_1_175   9633  /* FIX(1.175875602) */
+#define F_1_501  12299  /* FIX(1.501321110) */
+#define F_1_847  15137  /* FIX(1.847759065) */
+#define F_1_961  16069  /* FIX(1.961570560) */
+#define F_2_053  16819  /* FIX(2.053119869) */
+#define F_2_562  20995  /* FIX(2.562915447) */
+#define F_3_072  25172  /* FIX(3.072711026) */
+
+.balign 16
+Ljsimd_fdct_islow_neon_consts:
+  .short F_0_298
+  .short -F_0_390
+  .short F_0_541
+  .short F_0_765
+  .short - F_0_899
+  .short F_1_175
+  .short F_1_501
+  .short - F_1_847
+  .short - F_1_961
+  .short F_2_053
+  .short - F_2_562
+  .short F_3_072
+  .short 0          /* padding */
+  .short 0
+  .short 0
+  .short 0
+
+#undef F_0_298
+#undef F_0_390
+#undef F_0_541
+#undef F_0_765
+#undef F_0_899
+#undef F_1_175
+#undef F_1_501
+#undef F_1_847
+#undef F_1_961
+#undef F_2_053
+#undef F_2_562
+#undef F_3_072
+
+/* Constants for jsimd_fdct_ifast_neon() */
+
+.balign 16
+Ljsimd_fdct_ifast_neon_consts:
+  .short (98 * 128)               /* XFIX_0_382683433 */
+  .short (139 * 128)              /* XFIX_0_541196100 */
+  .short (181 * 128)              /* XFIX_0_707106781 */
+  .short (334 * 128 - 256 * 128)  /* XFIX_1_306562965 */
+
+/* Constants for jsimd_h2*_downsample_neon() */
+
+.balign 16
+Ljsimd_h2_downsample_neon_consts:
+  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
+        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F  /* diff 0 */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
+        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0E  /* diff 1 */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
+        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0D, 0x0D  /* diff 2 */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
+        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0C, 0x0C, 0x0C  /* diff 3 */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
+        0x08, 0x09, 0x0A, 0x0B, 0x0B, 0x0B, 0x0B, 0x0B  /* diff 4 */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
+        0x08, 0x09, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A  /* diff 5 */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
+        0x08, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09  /* diff 6 */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
+        0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08  /* diff 7 */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
+        0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07  /* diff 8 */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06, \
+        0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06  /* diff 9 */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0x05, \
+        0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05  /* diff 10 */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x04, 0x04, \
+        0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04  /* diff 11 */
+  .byte 0x00, 0x01, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, \
+        0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03  /* diff 12 */
+  .byte 0x00, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, \
+        0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02  /* diff 13 */
+  .byte 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, \
+        0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01  /* diff 14 */
+  .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00  /* diff 15 */
+
+/* Constants for jsimd_huff_encode_one_block_neon() */
+
+.balign 16
+Ljsimd_huff_encode_one_block_neon_consts:
+    .byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \
+          0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
+    .byte    0,   1,   2,   3,  16,  17,  32,  33, \
+            18,  19,   4,   5,   6,   7,  20,  21  /* L0 => L3 : 4 lines OK */
+    .byte   34,  35,  48,  49, 255, 255,  50,  51, \
+            36,  37,  22,  23,   8,   9,  10,  11  /* L0 => L3 : 4 lines OK */
+    .byte    8,   9,  22,  23,  36,  37,  50,  51, \
+           255, 255, 255, 255, 255, 255,  52,  53  /* L1 => L4 : 4 lines OK */
+    .byte   54,  55,  40,  41,  26,  27,  12,  13, \
+            14,  15,  28,  29,  42,  43,  56,  57  /* L0 => L3 : 4 lines OK */
+    .byte    6,   7,  20,  21,  34,  35,  48,  49, \
+            50,  51,  36,  37,  22,  23,   8,   9  /* L4 => L7 : 4 lines OK */
+    .byte   42,  43,  28,  29,  14,  15,  30,  31, \
+            44,  45,  58,  59, 255, 255, 255, 255  /* L1 => L4 : 4 lines OK */
+    .byte  255, 255, 255, 255,  56,  57,  42,  43, \
+            28,  29,  14,  15,  30,  31,  44,  45  /* L3 => L6 : 4 lines OK */
+    .byte   26,  27,  40,  41,  42,  43,  28,  29, \
+            14,  15,  30,  31,  44,  45,  46,  47  /* L5 => L7 : 3 lines OK */
+    .byte  255, 255, 255, 255,   0,   1, 255, 255, \
+           255, 255, 255, 255, 255, 255, 255, 255  /* L4 : 1 lines OK */
+    .byte  255, 255, 255, 255, 255, 255, 255, 255, \
+             0,   1,  16,  17,   2,   3, 255, 255  /* L5 => L6 : 2 lines OK */
+    .byte  255, 255, 255, 255, 255, 255, 255, 255, \
+           255, 255, 255, 255,   8,   9,  22,  23  /* L5 => L6 : 2 lines OK */
+    .byte    4,   5,   6,   7, 255, 255, 255, 255, \
+           255, 255, 255, 255, 255, 255, 255, 255  /* L7 : 1 line OK */
+
 .text
 
 
@@ -55,6 +196,17 @@ _\fname:
 #endif
 .endm
 
+/* Get symbol location */
+.macro get_symbol_loc reg, symbol
+#ifdef __APPLE__
+    adrp            \reg, \symbol@PAGE
+    add             \reg, \reg, \symbol@PAGEOFF
+#else
+    adrp            \reg, \symbol
+    add             \reg, \reg, :lo12:\symbol
+#endif
+.endm
+
 .macro transpose_8x8 l0, l1, l2, l3, l4, l5, l6, l7, t0, t1, t2, t3
     trn1            \t0\().8h, \l0\().8h, \l1\().8h
     trn1            \t1\().8h, \l2\().8h, \l3\().8h
@@ -271,17 +423,6 @@ _\fname:
     do_rgb_to_yuv_stage1
 .endm
 
-.balign 16
-.if \fast_ld3 == 1
-Ljsimd_\colorid\()_ycc_neon_consts:
-.else
-Ljsimd_\colorid\()_ycc_neon_slowld3_consts:
-.endif
-  .short 19595, 38470, 7471, 11059
-  .short 21709, 32768, 27439, 5329
-  .short 32767, 128, 32767, 128
-  .short 32767, 128, 32767, 128
-
 .if \fast_ld3 == 1
 asm_function jsimd_\colorid\()_ycc_convert_neon
 .else
@@ -304,11 +445,7 @@ asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3
     N               .req w12
 
     /* Load constants to d0, d1, d2, d3 */
-    .if \fast_ld3 == 1
-      adr           x13, Ljsimd_\colorid\()_ycc_neon_consts
-    .else
-      adr           x13, Ljsimd_\colorid\()_ycc_neon_slowld3_consts
-    .endif
+    get_symbol_loc  x13, Ljsimd_rgb_ycc_neon_consts
     ld1             {v0.8h, v1.8h}, [x13]
 
     ldr             OUTPUT_BUF0, [OUTPUT_BUF]
@@ -508,50 +645,6 @@ asm_function jsimd_convsamp_neon
 #define DESCALE_P1  (CONST_BITS - PASS1_BITS)
 #define DESCALE_P2  (CONST_BITS + PASS1_BITS)
 
-#define F_0_298   2446  /* FIX(0.298631336) */
-#define F_0_390   3196  /* FIX(0.390180644) */
-#define F_0_541   4433  /* FIX(0.541196100) */
-#define F_0_765   6270  /* FIX(0.765366865) */
-#define F_0_899   7373  /* FIX(0.899976223) */
-#define F_1_175   9633  /* FIX(1.175875602) */
-#define F_1_501  12299  /* FIX(1.501321110) */
-#define F_1_847  15137  /* FIX(1.847759065) */
-#define F_1_961  16069  /* FIX(1.961570560) */
-#define F_2_053  16819  /* FIX(2.053119869) */
-#define F_2_562  20995  /* FIX(2.562915447) */
-#define F_3_072  25172  /* FIX(3.072711026) */
-
-.balign 16
-Ljsimd_fdct_islow_neon_consts:
-  .short F_0_298
-  .short -F_0_390
-  .short F_0_541
-  .short F_0_765
-  .short - F_0_899
-  .short F_1_175
-  .short F_1_501
-  .short - F_1_847
-  .short - F_1_961
-  .short F_2_053
-  .short - F_2_562
-  .short F_3_072
-  .short 0          /* padding */
-  .short 0
-  .short 0
-  .short 0
-
-#undef F_0_298
-#undef F_0_390
-#undef F_0_541
-#undef F_0_765
-#undef F_0_899
-#undef F_1_175
-#undef F_1_501
-#undef F_1_847
-#undef F_1_961
-#undef F_2_053
-#undef F_2_562
-#undef F_3_072
 #define XFIX_P_0_298  v0.h[0]
 #define XFIX_N_0_390  v0.h[1]
 #define XFIX_P_0_541  v0.h[2]
@@ -571,7 +664,7 @@ asm_function jsimd_fdct_islow_neon
     TMP             .req x9
 
     /* Load constants */
-    adr             TMP, Ljsimd_fdct_islow_neon_consts
+    get_symbol_loc  TMP, Ljsimd_fdct_islow_neon_consts
     ld1             {v0.8h, v1.8h}, [TMP]
 
     /* Save NEON registers */
@@ -850,20 +943,13 @@ asm_function jsimd_fdct_islow_neon
 #define XFIX_0_707106781  v0.h[2]
 #define XFIX_1_306562965  v0.h[3]
 
-.balign 16
-Ljsimd_fdct_ifast_neon_consts:
-  .short (98 * 128)               /* XFIX_0_382683433 */
-  .short (139 * 128)              /* XFIX_0_541196100 */
-  .short (181 * 128)              /* XFIX_0_707106781 */
-  .short (334 * 128 - 256 * 128)  /* XFIX_1_306562965 */
-
 asm_function jsimd_fdct_ifast_neon
 
     DATA            .req x0
     TMP             .req x9
 
     /* Load constants */
-    adr             TMP, Ljsimd_fdct_ifast_neon_consts
+    get_symbol_loc  TMP, Ljsimd_fdct_ifast_neon_consts
     ld1             {v0.4h}, [TMP]
 
     /* Load all DATA into NEON registers with the following allocation:
@@ -1042,41 +1128,6 @@ asm_function jsimd_quantize_neon
  *                            JSAMPARRAY input_data, JSAMPARRAY output_data);
  */
 
-.balign 16
-Ljsimd_h2_downsample_neon_consts:
-  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
-        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F  /* diff 0 */
-  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
-        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0E  /* diff 1 */
-  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
-        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0D, 0x0D  /* diff 2 */
-  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
-        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0C, 0x0C, 0x0C  /* diff 3 */
-  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
-        0x08, 0x09, 0x0A, 0x0B, 0x0B, 0x0B, 0x0B, 0x0B  /* diff 4 */
-  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
-        0x08, 0x09, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A  /* diff 5 */
-  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
-        0x08, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09  /* diff 6 */
-  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
-        0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08  /* diff 7 */
-  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
-        0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07  /* diff 8 */
-  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06, \
-        0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06  /* diff 9 */
-  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0x05, \
-        0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05  /* diff 10 */
-  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x04, 0x04, \
-        0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04  /* diff 11 */
-  .byte 0x00, 0x01, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, \
-        0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03  /* diff 12 */
-  .byte 0x00, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, \
-        0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02  /* diff 13 */
-  .byte 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, \
-        0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01  /* diff 14 */
-  .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \
-        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00  /* diff 15 */
-
 asm_function jsimd_h2v1_downsample_neon
     IMAGE_WIDTH     .req x0
     MAX_V_SAMP      .req x1
@@ -1094,7 +1145,7 @@ asm_function jsimd_h2v1_downsample_neon
     mov             TMPDUP, #0x10000
     lsl             TMP2, BLOCK_WIDTH, #4
     sub             TMP2, TMP2, IMAGE_WIDTH
-    adr             TMP3, Ljsimd_h2_downsample_neon_consts
+    get_symbol_loc  TMP3, Ljsimd_h2_downsample_neon_consts
     add             TMP3, TMP3, TMP2, lsl #4
     dup             v16.4s, TMPDUP
     ld1             {v18.16b}, [TMP3]
@@ -1173,7 +1224,7 @@ asm_function jsimd_h2v2_downsample_neon
     lsl             TMP2, BLOCK_WIDTH, #4
     lsl             TMPDUP, TMPDUP, #17
     sub             TMP2, TMP2, IMAGE_WIDTH
-    adr             TMP3, Ljsimd_h2_downsample_neon_consts
+    get_symbol_loc  TMP3, Ljsimd_h2_downsample_neon_consts
     orr             TMPDUP, TMPDUP, #1
     add             TMP3, TMP3, TMP2, lsl #4
     dup             v16.4s, TMPDUP
@@ -1279,41 +1330,6 @@ asm_function jsimd_h2v2_downsample_neon
 
 .macro generate_jsimd_huff_encode_one_block fast_tbl
 
-.balign 16
-.if \fast_tbl == 1
-Ljsimd_huff_encode_one_block_neon_consts:
-.else
-Ljsimd_huff_encode_one_block_neon_slowtbl_consts:
-.endif
-    .byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \
-          0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
-.if \fast_tbl == 1
-    .byte    0,   1,   2,   3,  16,  17,  32,  33, \
-            18,  19,   4,   5,   6,   7,  20,  21  /* L0 => L3 : 4 lines OK */
-    .byte   34,  35,  48,  49, 255, 255,  50,  51, \
-            36,  37,  22,  23,   8,   9,  10,  11  /* L0 => L3 : 4 lines OK */
-    .byte    8,   9,  22,  23,  36,  37,  50,  51, \
-           255, 255, 255, 255, 255, 255,  52,  53  /* L1 => L4 : 4 lines OK */
-    .byte   54,  55,  40,  41,  26,  27,  12,  13, \
-            14,  15,  28,  29,  42,  43,  56,  57  /* L0 => L3 : 4 lines OK */
-    .byte    6,   7,  20,  21,  34,  35,  48,  49, \
-            50,  51,  36,  37,  22,  23,   8,   9  /* L4 => L7 : 4 lines OK */
-    .byte   42,  43,  28,  29,  14,  15,  30,  31, \
-            44,  45,  58,  59, 255, 255, 255, 255  /* L1 => L4 : 4 lines OK */
-    .byte  255, 255, 255, 255,  56,  57,  42,  43, \
-            28,  29,  14,  15,  30,  31,  44,  45  /* L3 => L6 : 4 lines OK */
-    .byte   26,  27,  40,  41,  42,  43,  28,  29, \
-            14,  15,  30,  31,  44,  45,  46,  47  /* L5 => L7 : 3 lines OK */
-    .byte  255, 255, 255, 255,   0,   1, 255, 255, \
-           255, 255, 255, 255, 255, 255, 255, 255  /* L4 : 1 lines OK */
-    .byte  255, 255, 255, 255, 255, 255, 255, 255, \
-             0,   1,  16,  17,   2,   3, 255, 255  /* L5 => L6 : 2 lines OK */
-    .byte  255, 255, 255, 255, 255, 255, 255, 255, \
-           255, 255, 255, 255,   8,   9,  22,  23  /* L5 => L6 : 2 lines OK */
-    .byte    4,   5,   6,   7, 255, 255, 255, 255, \
-           255, 255, 255, 255, 255, 255, 255, 255  /* L7 : 1 line OK */
-.endif
-
 .if \fast_tbl == 1
 asm_function jsimd_huff_encode_one_block_neon
 .else
@@ -1323,11 +1339,7 @@ asm_function jsimd_huff_encode_one_block_neon_slowtbl
     sub             BUFFER, BUFFER, #0x1    /* BUFFER=buffer-- */
     /* Save ARM registers */
     stp             x19, x20, [sp]
-.if \fast_tbl == 1
-    adr             x15, Ljsimd_huff_encode_one_block_neon_consts
-.else
-    adr             x15, Ljsimd_huff_encode_one_block_neon_slowtbl_consts
-.endif
+    get_symbol_loc  x15, Ljsimd_huff_encode_one_block_neon_consts
     ldr             PUT_BUFFER, [x0, #0x10]
     ldr             PUT_BITSw, [x0, #0x18]
     ldrsh           w12, [x2]               /* load DC coeff in w12 */
diff --git a/simd/i386/jccolext-avx2.asm b/simd/i386/jccolext-avx2.asm
index 7a8d784..c46d684 100644
--- a/simd/i386/jccolext-avx2.asm
+++ b/simd/i386/jccolext-avx2.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jcolsamp.inc"
 
@@ -110,12 +108,12 @@ EXTN(jsimd_rgb_ycc_convert_avx2):
     test        cl, SIZEOF_BYTE
     jz          short .column_ld2
     sub         ecx, byte SIZEOF_BYTE
-    movzx       eax, BYTE [esi+ecx]
+    movzx       eax, byte [esi+ecx]
 .column_ld2:
     test        cl, SIZEOF_WORD
     jz          short .column_ld4
     sub         ecx, byte SIZEOF_WORD
-    movzx       edx, WORD [esi+ecx]
+    movzx       edx, word [esi+ecx]
     shl         eax, WORD_BIT
     or          eax, edx
 .column_ld4:
diff --git a/simd/i386/jccolext-mmx.asm b/simd/i386/jccolext-mmx.asm
index 9a2c30e..6357a42 100644
--- a/simd/i386/jccolext-mmx.asm
+++ b/simd/i386/jccolext-mmx.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jcolsamp.inc"
 
@@ -111,13 +109,13 @@ EXTN(jsimd_rgb_ycc_convert_mmx):
     jz          short .column_ld2
     sub         ecx, byte SIZEOF_BYTE
     xor         eax, eax
-    mov         al, BYTE [esi+ecx]
+    mov         al, byte [esi+ecx]
 .column_ld2:
     test        cl, SIZEOF_WORD
     jz          short .column_ld4
     sub         ecx, byte SIZEOF_WORD
     xor         edx, edx
-    mov         dx, WORD [esi+ecx]
+    mov         dx, word [esi+ecx]
     shl         eax, WORD_BIT
     or          eax, edx
 .column_ld4:
@@ -127,7 +125,7 @@ EXTN(jsimd_rgb_ycc_convert_mmx):
     test        cl, SIZEOF_DWORD
     jz          short .column_ld8
     sub         ecx, byte SIZEOF_DWORD
-    movd        mmG, DWORD [esi+ecx]
+    movd        mmG, dword [esi+ecx]
     psllq       mmA, DWORD_BIT
     por         mmA, mmG
 .column_ld8:
@@ -197,7 +195,7 @@ EXTN(jsimd_rgb_ycc_convert_mmx):
     test        cl, SIZEOF_MMWORD/8
     jz          short .column_ld2
     sub         ecx, byte SIZEOF_MMWORD/8
-    movd        mmA, DWORD [esi+ecx*RGB_PIXELSIZE]
+    movd        mmA, dword [esi+ecx*RGB_PIXELSIZE]
 .column_ld2:
     test        cl, SIZEOF_MMWORD/4
     jz          short .column_ld4
diff --git a/simd/i386/jccolext-sse2.asm b/simd/i386/jccolext-sse2.asm
index e830562..c6c8085 100644
--- a/simd/i386/jccolext-sse2.asm
+++ b/simd/i386/jccolext-sse2.asm
@@ -12,8 +12,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jcolsamp.inc"
 
@@ -109,12 +107,12 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
     test        cl, SIZEOF_BYTE
     jz          short .column_ld2
     sub         ecx, byte SIZEOF_BYTE
-    movzx       eax, BYTE [esi+ecx]
+    movzx       eax, byte [esi+ecx]
 .column_ld2:
     test        cl, SIZEOF_WORD
     jz          short .column_ld4
     sub         ecx, byte SIZEOF_WORD
-    movzx       edx, WORD [esi+ecx]
+    movzx       edx, word [esi+ecx]
     shl         eax, WORD_BIT
     or          eax, edx
 .column_ld4:
diff --git a/simd/i386/jccolor-avx2.asm b/simd/i386/jccolor-avx2.asm
index 958517f..14944e9 100644
--- a/simd/i386/jccolor-avx2.asm
+++ b/simd/i386/jccolor-avx2.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 
diff --git a/simd/i386/jccolor-mmx.asm b/simd/i386/jccolor-mmx.asm
index 47be9e1..8cb399b 100644
--- a/simd/i386/jccolor-mmx.asm
+++ b/simd/i386/jccolor-mmx.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 
diff --git a/simd/i386/jccolor-sse2.asm b/simd/i386/jccolor-sse2.asm
index c0d5d45..686d222 100644
--- a/simd/i386/jccolor-sse2.asm
+++ b/simd/i386/jccolor-sse2.asm
@@ -12,8 +12,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 
diff --git a/simd/i386/jcgray-avx2.asm b/simd/i386/jcgray-avx2.asm
index 4d66242..560ee0c 100644
--- a/simd/i386/jcgray-avx2.asm
+++ b/simd/i386/jcgray-avx2.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 
diff --git a/simd/i386/jcgray-mmx.asm b/simd/i386/jcgray-mmx.asm
index 07c7ea6..79fdf08 100644
--- a/simd/i386/jcgray-mmx.asm
+++ b/simd/i386/jcgray-mmx.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 
diff --git a/simd/i386/jcgray-sse2.asm b/simd/i386/jcgray-sse2.asm
index 4b8c797..cb4b28e 100644
--- a/simd/i386/jcgray-sse2.asm
+++ b/simd/i386/jcgray-sse2.asm
@@ -12,8 +12,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 
diff --git a/simd/i386/jcgryext-avx2.asm b/simd/i386/jcgryext-avx2.asm
index 52e99a8..3fa7973 100644
--- a/simd/i386/jcgryext-avx2.asm
+++ b/simd/i386/jcgryext-avx2.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jcolsamp.inc"
 
@@ -102,12 +100,12 @@ EXTN(jsimd_rgb_gray_convert_avx2):
     test        cl, SIZEOF_BYTE
     jz          short .column_ld2
     sub         ecx, byte SIZEOF_BYTE
-    movzx       eax, BYTE [esi+ecx]
+    movzx       eax, byte [esi+ecx]
 .column_ld2:
     test        cl, SIZEOF_WORD
     jz          short .column_ld4
     sub         ecx, byte SIZEOF_WORD
-    movzx       edx, WORD [esi+ecx]
+    movzx       edx, word [esi+ecx]
     shl         eax, WORD_BIT
     or          eax, edx
 .column_ld4:
diff --git a/simd/i386/jcgryext-mmx.asm b/simd/i386/jcgryext-mmx.asm
index 4a9ab0d..8af42e5 100644
--- a/simd/i386/jcgryext-mmx.asm
+++ b/simd/i386/jcgryext-mmx.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jcolsamp.inc"
 
@@ -103,13 +101,13 @@ EXTN(jsimd_rgb_gray_convert_mmx):
     jz          short .column_ld2
     sub         ecx, byte SIZEOF_BYTE
     xor         eax, eax
-    mov         al, BYTE [esi+ecx]
+    mov         al, byte [esi+ecx]
 .column_ld2:
     test        cl, SIZEOF_WORD
     jz          short .column_ld4
     sub         ecx, byte SIZEOF_WORD
     xor         edx, edx
-    mov         dx, WORD [esi+ecx]
+    mov         dx, word [esi+ecx]
     shl         eax, WORD_BIT
     or          eax, edx
 .column_ld4:
@@ -119,7 +117,7 @@ EXTN(jsimd_rgb_gray_convert_mmx):
     test        cl, SIZEOF_DWORD
     jz          short .column_ld8
     sub         ecx, byte SIZEOF_DWORD
-    movd        mmG, DWORD [esi+ecx]
+    movd        mmG, dword [esi+ecx]
     psllq       mmA, DWORD_BIT
     por         mmA, mmG
 .column_ld8:
@@ -189,7 +187,7 @@ EXTN(jsimd_rgb_gray_convert_mmx):
     test        cl, SIZEOF_MMWORD/8
     jz          short .column_ld2
     sub         ecx, byte SIZEOF_MMWORD/8
-    movd        mmA, DWORD [esi+ecx*RGB_PIXELSIZE]
+    movd        mmA, dword [esi+ecx*RGB_PIXELSIZE]
 .column_ld2:
     test        cl, SIZEOF_MMWORD/4
     jz          short .column_ld4
diff --git a/simd/i386/jcgryext-sse2.asm b/simd/i386/jcgryext-sse2.asm
index 04d891c..c9d6ff1 100644
--- a/simd/i386/jcgryext-sse2.asm
+++ b/simd/i386/jcgryext-sse2.asm
@@ -12,8 +12,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jcolsamp.inc"
 
@@ -101,12 +99,12 @@ EXTN(jsimd_rgb_gray_convert_sse2):
     test        cl, SIZEOF_BYTE
     jz          short .column_ld2
     sub         ecx, byte SIZEOF_BYTE
-    movzx       eax, BYTE [esi+ecx]
+    movzx       eax, byte [esi+ecx]
 .column_ld2:
     test        cl, SIZEOF_WORD
     jz          short .column_ld4
     sub         ecx, byte SIZEOF_WORD
-    movzx       edx, WORD [esi+ecx]
+    movzx       edx, word [esi+ecx]
     shl         eax, WORD_BIT
     or          eax, edx
 .column_ld4:
diff --git a/simd/i386/jchuff-sse2.asm b/simd/i386/jchuff-sse2.asm
index fea4de3..d0112e6 100644
--- a/simd/i386/jchuff-sse2.asm
+++ b/simd/i386/jchuff-sse2.asm
@@ -17,8 +17,6 @@
 ; This file contains an SSE2 implementation for Huffman coding of one block.
 ; The following code is based directly on jchuff.c; see jchuff.c for more
 ; details.
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 
@@ -196,8 +194,8 @@ EXTN(jsimd_huff_encode_one_block_sse2):
     push        ebp
 
     mov         esi, POINTER [eax+8]       ; (working_state *state)
-    mov         put_buffer, DWORD [esi+8]  ; put_buffer = state->cur.put_buffer;
-    mov         put_bits, DWORD [esi+12]   ; put_bits = state->cur.put_bits;
+    mov         put_buffer, dword [esi+8]  ; put_buffer = state->cur.put_buffer;
+    mov         put_bits, dword [esi+12]   ; put_bits = state->cur.put_bits;
     push        esi                        ; esi is now scratch
 
     get_GOT     edx                        ; get GOT address
@@ -213,7 +211,7 @@ EXTN(jsimd_huff_encode_one_block_sse2):
     ; Encode the DC coefficient difference per section F.1.2.1
     mov         esi, POINTER [esp+block]  ; block
     movsx       ecx, word [esi]           ; temp = temp2 = block[0] - last_dc_val;
-    sub         ecx, DWORD [eax+20]
+    sub         ecx, dword [eax+20]
     mov         esi, ecx
 
     ; This is a well-known technique for obtaining the absolute value
@@ -228,12 +226,12 @@ EXTN(jsimd_huff_encode_one_block_sse2):
     ; For a negative input, want temp2 = bitwise complement of abs(input)
     ; This code assumes we are on a two's complement machine
     add         esi, edx                ; temp2 += temp3;
-    mov         DWORD [esp+temp], esi   ; backup temp2 in temp
+    mov         dword [esp+temp], esi   ; backup temp2 in temp
 
     ; Find the number of bits needed for the magnitude of the coefficient
     movpic      ebp, POINTER [esp+gotptr]                        ; load GOT address (ebp)
     movzx       edx, byte [GOTOFF(ebp, EXTN(jpeg_nbits_table) + ecx)]  ; nbits = JPEG_NBITS(temp);
-    mov         DWORD [esp+temp2], edx                           ; backup nbits in temp2
+    mov         dword [esp+temp2], edx                           ; backup nbits in temp2
 
     ; Emit the Huffman-coded symbol for the number of bits
     mov         ebp, POINTER [eax+24]         ; After this point, arguments are not accessible anymore
@@ -241,13 +239,13 @@ EXTN(jsimd_huff_encode_one_block_sse2):
     movzx       ecx, byte [ebp + edx + 1024]  ; size = dctbl->ehufsi[nbits];
     EMIT_BITS   eax                           ; EMIT_BITS(code, size)
 
-    mov         ecx, DWORD [esp+temp2]        ; restore nbits
+    mov         ecx, dword [esp+temp2]        ; restore nbits
 
     ; Mask off any extra bits in code
     mov         eax, 1
     shl         eax, cl
     dec         eax
-    and         eax, DWORD [esp+temp]   ; temp2 &= (((JLONG)1)<<nbits) - 1;
+    and         eax, dword [esp+temp]   ; temp2 &= (((JLONG)1)<<nbits) - 1;
 
     ; Emit that number of bits of the value, if positive,
     ; or the complement of its magnitude, if negative.
@@ -290,22 +288,22 @@ EXTN(jsimd_huff_encode_one_block_sse2):
     jz          near .ELOOP
     lea         esi, [esi+ecx*2]        ; k += r;
     shr         edx, cl                 ; index >>= r;
-    mov         DWORD [esp+temp3], edx
+    mov         dword [esp+temp3], edx
 .BRLOOP:
     cmp         ecx, 16                       ; while (r > 15) {
     jl          near .ERLOOP
     sub         ecx, 16                       ; r -= 16;
-    mov         DWORD [esp+temp], ecx
+    mov         dword [esp+temp], ecx
     mov         eax, INT [ebp + 240 * 4]      ; code_0xf0 = actbl->ehufco[0xf0];
     movzx       ecx, byte [ebp + 1024 + 240]  ; size_0xf0 = actbl->ehufsi[0xf0];
     EMIT_BITS   eax                           ; EMIT_BITS(code_0xf0, size_0xf0)
-    mov         ecx, DWORD [esp+temp]
+    mov         ecx, dword [esp+temp]
     jmp         .BRLOOP
 .ERLOOP:
     movsx       eax, word [esi]                                  ; temp = t1[k];
     movpic      edx, POINTER [esp+gotptr]                        ; load GOT address (edx)
     movzx       eax, byte [GOTOFF(edx, EXTN(jpeg_nbits_table) + eax)]  ; nbits = JPEG_NBITS(temp);
-    mov         DWORD [esp+temp2], eax
+    mov         dword [esp+temp2], eax
     ; Emit Huffman symbol for run length / number of bits
     shl         ecx, 4                        ; temp3 = (r << 4) + nbits;
     add         ecx, eax
@@ -315,13 +313,13 @@ EXTN(jsimd_huff_encode_one_block_sse2):
 
     movsx       edx, word [esi+DCTSIZE2*2]    ; temp2 = t2[k];
     ; Mask off any extra bits in code
-    mov         ecx, DWORD [esp+temp2]
+    mov         ecx, dword [esp+temp2]
     mov         eax, 1
     shl         eax, cl
     dec         eax
     and         eax, edx                ; temp2 &= (((JLONG)1)<<nbits) - 1;
     EMIT_BITS   eax                     ; PUT_BITS(temp2, nbits)
-    mov         edx, DWORD [esp+temp3]
+    mov         edx, dword [esp+temp3]
     add         esi, 2                  ; ++k;
     shr         edx, 1                  ; index >>= 1;
 
@@ -351,29 +349,29 @@ EXTN(jsimd_huff_encode_one_block_sse2):
     shr         edx, cl                 ; index >>= r;
     add         ecx, eax
     lea         esi, [esi+ecx*2]        ; k += r;
-    mov         DWORD [esp+temp3], edx
+    mov         dword [esp+temp3], edx
     jmp         .BRLOOP2
 .BLOOP2:
     bsf         ecx, edx                ; r = __builtin_ctzl(index);
     jz          near .ELOOP2
     lea         esi, [esi+ecx*2]        ; k += r;
     shr         edx, cl                 ; index >>= r;
-    mov         DWORD [esp+temp3], edx
+    mov         dword [esp+temp3], edx
 .BRLOOP2:
     cmp         ecx, 16                       ; while (r > 15) {
     jl          near .ERLOOP2
     sub         ecx, 16                       ; r -= 16;
-    mov         DWORD [esp+temp], ecx
+    mov         dword [esp+temp], ecx
     mov         eax, INT [ebp + 240 * 4]      ; code_0xf0 = actbl->ehufco[0xf0];
     movzx       ecx, byte [ebp + 1024 + 240]  ; size_0xf0 = actbl->ehufsi[0xf0];
     EMIT_BITS   eax                           ; EMIT_BITS(code_0xf0, size_0xf0)
-    mov         ecx, DWORD [esp+temp]
+    mov         ecx, dword [esp+temp]
     jmp         .BRLOOP2
 .ERLOOP2:
     movsx       eax, word [esi]         ; temp = t1[k];
     bsr         eax, eax                ; nbits = 32 - __builtin_clz(temp);
     inc         eax
-    mov         DWORD [esp+temp2], eax
+    mov         dword [esp+temp2], eax
     ; Emit Huffman symbol for run length / number of bits
     shl         ecx, 4                        ; temp3 = (r << 4) + nbits;
     add         ecx, eax
@@ -383,13 +381,13 @@ EXTN(jsimd_huff_encode_one_block_sse2):
 
     movsx       edx, word [esi+DCTSIZE2*2]    ; temp2 = t2[k];
     ; Mask off any extra bits in code
-    mov         ecx, DWORD [esp+temp2]
+    mov         ecx, dword [esp+temp2]
     mov         eax, 1
     shl         eax, cl
     dec         eax
     and         eax, edx                ; temp2 &= (((JLONG)1)<<nbits) - 1;
     EMIT_BITS   eax                     ; PUT_BITS(temp2, nbits)
-    mov         edx, DWORD [esp+temp3]
+    mov         edx, dword [esp+temp3]
     add         esi, 2                  ; ++k;
     shr         edx, 1                  ; index >>= 1;
 
@@ -406,8 +404,8 @@ EXTN(jsimd_huff_encode_one_block_sse2):
     mov         eax, [esp+buffer]
     pop         esi
     ; Save put_buffer & put_bits
-    mov         DWORD [esi+8], put_buffer  ; state->cur.put_buffer = put_buffer;
-    mov         DWORD [esi+12], put_bits   ; state->cur.put_bits = put_bits;
+    mov         dword [esi+8], put_buffer  ; state->cur.put_buffer = put_buffer;
+    mov         dword [esi+12], put_bits   ; state->cur.put_bits = put_bits;
 
     pop         ebp
     pop         edi
diff --git a/simd/i386/jcphuff-sse2.asm b/simd/i386/jcphuff-sse2.asm
index 25c63c7..8b73178 100644
--- a/simd/i386/jcphuff-sse2.asm
+++ b/simd/i386/jcphuff-sse2.asm
@@ -15,8 +15,6 @@
 ;
 ; This file contains an SSE2 implementation of data preparation for progressive
 ; Huffman encoding.  See jcphuff.c for more details.
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 
@@ -329,6 +327,8 @@ EXTN(jsimd_encode_mcu_AC_first_prepare_sse2):
     add         LUT, 16*SIZEOF_INT
     dec         K
     jnz         .BLOOP16
+    test        LEN, 15
+    je          .PADDING
 .ELOOP16:
     mov         LENEND, LEN
     and         LENEND, 7
diff --git a/simd/i386/jcsample-avx2.asm b/simd/i386/jcsample-avx2.asm
index 5bcdefd..0a20802 100644
--- a/simd/i386/jcsample-avx2.asm
+++ b/simd/i386/jcsample-avx2.asm
@@ -14,8 +14,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 
diff --git a/simd/i386/jcsample-mmx.asm b/simd/i386/jcsample-mmx.asm
index faf4234..2c223ee 100644
--- a/simd/i386/jcsample-mmx.asm
+++ b/simd/i386/jcsample-mmx.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 
diff --git a/simd/i386/jcsample-sse2.asm b/simd/i386/jcsample-sse2.asm
index b10fa83..4fea60d 100644
--- a/simd/i386/jcsample-sse2.asm
+++ b/simd/i386/jcsample-sse2.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 
diff --git a/simd/i386/jdcolext-avx2.asm b/simd/i386/jdcolext-avx2.asm
index 46de9b9..015be04 100644
--- a/simd/i386/jdcolext-avx2.asm
+++ b/simd/i386/jdcolext-avx2.asm
@@ -14,8 +14,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jcolsamp.inc"
 
@@ -348,7 +346,7 @@ EXTN(jsimd_ycc_rgb_convert_avx2):
     vmovd       eax, xmmA
     cmp         ecx, byte SIZEOF_WORD
     jb          short .column_st1
-    mov         WORD [edi], ax
+    mov         word [edi], ax
     add         edi, byte SIZEOF_WORD
     sub         ecx, byte SIZEOF_WORD
     shr         eax, 16
@@ -357,7 +355,7 @@ EXTN(jsimd_ycc_rgb_convert_avx2):
     ; space.
     test        ecx, ecx
     jz          short .nextrow
-    mov         BYTE [edi], al
+    mov         byte [edi], al
 
 %else  ; RGB_PIXELSIZE == 4 ; -----------
 
diff --git a/simd/i386/jdcolext-mmx.asm b/simd/i386/jdcolext-mmx.asm
index cd2cb3f..5813cfc 100644
--- a/simd/i386/jdcolext-mmx.asm
+++ b/simd/i386/jdcolext-mmx.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jcolsamp.inc"
 
@@ -280,7 +278,7 @@ EXTN(jsimd_ycc_rgb_convert_mmx):
     movd        eax, mmA
     cmp         ecx, byte SIZEOF_DWORD
     jb          short .column_st2
-    mov         DWORD [edi+0*SIZEOF_DWORD], eax
+    mov         dword [edi+0*SIZEOF_DWORD], eax
     psrlq       mmA, DWORD_BIT
     movd        eax, mmA
     sub         ecx, byte SIZEOF_DWORD
@@ -288,14 +286,14 @@ EXTN(jsimd_ycc_rgb_convert_mmx):
 .column_st2:
     cmp         ecx, byte SIZEOF_WORD
     jb          short .column_st1
-    mov         WORD [edi+0*SIZEOF_WORD], ax
+    mov         word [edi+0*SIZEOF_WORD], ax
     shr         eax, WORD_BIT
     sub         ecx, byte SIZEOF_WORD
     add         edi, byte SIZEOF_WORD
 .column_st1:
     cmp         ecx, byte SIZEOF_BYTE
     jb          short .nextrow
-    mov         BYTE [edi+0*SIZEOF_BYTE], al
+    mov         byte [edi+0*SIZEOF_BYTE], al
 
 %else  ; RGB_PIXELSIZE == 4 ; -----------
 
@@ -367,7 +365,7 @@ EXTN(jsimd_ycc_rgb_convert_mmx):
 .column_st4:
     cmp         ecx, byte SIZEOF_MMWORD/8
     jb          short .nextrow
-    movd        DWORD [edi+0*SIZEOF_DWORD], mmA
+    movd        dword [edi+0*SIZEOF_DWORD], mmA
 
 %endif  ; RGB_PIXELSIZE ; ---------------
 
diff --git a/simd/i386/jdcolext-sse2.asm b/simd/i386/jdcolext-sse2.asm
index 0fcb006..d5572b3 100644
--- a/simd/i386/jdcolext-sse2.asm
+++ b/simd/i386/jdcolext-sse2.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jcolsamp.inc"
 
@@ -320,7 +318,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
     movd        eax, xmmA
     cmp         ecx, byte SIZEOF_WORD
     jb          short .column_st1
-    mov         WORD [edi], ax
+    mov         word [edi], ax
     add         edi, byte SIZEOF_WORD
     sub         ecx, byte SIZEOF_WORD
     shr         eax, 16
@@ -329,7 +327,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
     ; space.
     test        ecx, ecx
     jz          short .nextrow
-    mov         BYTE [edi], al
+    mov         byte [edi], al
 
 %else  ; RGB_PIXELSIZE == 4 ; -----------
 
diff --git a/simd/i386/jdcolor-avx2.asm b/simd/i386/jdcolor-avx2.asm
index d2f86e6..e05b60d 100644
--- a/simd/i386/jdcolor-avx2.asm
+++ b/simd/i386/jdcolor-avx2.asm
@@ -14,8 +14,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 
diff --git a/simd/i386/jdcolor-mmx.asm b/simd/i386/jdcolor-mmx.asm
index 8f5a3b3..fb7e7bc 100644
--- a/simd/i386/jdcolor-mmx.asm
+++ b/simd/i386/jdcolor-mmx.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 
diff --git a/simd/i386/jdcolor-sse2.asm b/simd/i386/jdcolor-sse2.asm
index ae553db..b736255 100644
--- a/simd/i386/jdcolor-sse2.asm
+++ b/simd/i386/jdcolor-sse2.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 
diff --git a/simd/i386/jdmerge-avx2.asm b/simd/i386/jdmerge-avx2.asm
index 1731844..711e679 100644
--- a/simd/i386/jdmerge-avx2.asm
+++ b/simd/i386/jdmerge-avx2.asm
@@ -14,8 +14,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 
diff --git a/simd/i386/jdmerge-mmx.asm b/simd/i386/jdmerge-mmx.asm
index 607bf39..6e8311d 100644
--- a/simd/i386/jdmerge-mmx.asm
+++ b/simd/i386/jdmerge-mmx.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 
diff --git a/simd/i386/jdmerge-sse2.asm b/simd/i386/jdmerge-sse2.asm
index ddb1d5e..e32f90a 100644
--- a/simd/i386/jdmerge-sse2.asm
+++ b/simd/i386/jdmerge-sse2.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 
diff --git a/simd/i386/jdmrgext-avx2.asm b/simd/i386/jdmrgext-avx2.asm
index cde4865..e35f728 100644
--- a/simd/i386/jdmrgext-avx2.asm
+++ b/simd/i386/jdmrgext-avx2.asm
@@ -14,8 +14,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jcolsamp.inc"
 
@@ -354,7 +352,7 @@ EXTN(jsimd_h2v1_merged_upsample_avx2):
     vmovd       eax, xmmA
     cmp         ecx, byte SIZEOF_WORD
     jb          short .column_st1
-    mov         WORD [edi], ax
+    mov         word [edi], ax
     add         edi, byte SIZEOF_WORD
     sub         ecx, byte SIZEOF_WORD
     shr         eax, 16
@@ -363,7 +361,7 @@ EXTN(jsimd_h2v1_merged_upsample_avx2):
     ; space.
     test        ecx, ecx
     jz          short .endcolumn
-    mov         BYTE [edi], al
+    mov         byte [edi], al
 
 %else  ; RGB_PIXELSIZE == 4 ; -----------
 
diff --git a/simd/i386/jdmrgext-mmx.asm b/simd/i386/jdmrgext-mmx.asm
index 4b9e35d..eb3e36b 100644
--- a/simd/i386/jdmrgext-mmx.asm
+++ b/simd/i386/jdmrgext-mmx.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jcolsamp.inc"
 
@@ -283,7 +281,7 @@ EXTN(jsimd_h2v1_merged_upsample_mmx):
     movd        eax, mmA
     cmp         ecx, byte SIZEOF_DWORD
     jb          short .column_st2
-    mov         DWORD [edi+0*SIZEOF_DWORD], eax
+    mov         dword [edi+0*SIZEOF_DWORD], eax
     psrlq       mmA, DWORD_BIT
     movd        eax, mmA
     sub         ecx, byte SIZEOF_DWORD
@@ -291,14 +289,14 @@ EXTN(jsimd_h2v1_merged_upsample_mmx):
 .column_st2:
     cmp         ecx, byte SIZEOF_WORD
     jb          short .column_st1
-    mov         WORD [edi+0*SIZEOF_WORD], ax
+    mov         word [edi+0*SIZEOF_WORD], ax
     shr         eax, WORD_BIT
     sub         ecx, byte SIZEOF_WORD
     add         edi, byte SIZEOF_WORD
 .column_st1:
     cmp         ecx, byte SIZEOF_BYTE
     jb          short .endcolumn
-    mov         BYTE [edi+0*SIZEOF_BYTE], al
+    mov         byte [edi+0*SIZEOF_BYTE], al
 
 %else  ; RGB_PIXELSIZE == 4 ; -----------
 
@@ -373,7 +371,7 @@ EXTN(jsimd_h2v1_merged_upsample_mmx):
 .column_st4:
     cmp         ecx, byte SIZEOF_MMWORD/8
     jb          short .endcolumn
-    movd        DWORD [edi+0*SIZEOF_DWORD], mmA
+    movd        dword [edi+0*SIZEOF_DWORD], mmA
 
 %endif  ; RGB_PIXELSIZE ; ---------------
 
diff --git a/simd/i386/jdmrgext-sse2.asm b/simd/i386/jdmrgext-sse2.asm
index ac4697e..c113dc4 100644
--- a/simd/i386/jdmrgext-sse2.asm
+++ b/simd/i386/jdmrgext-sse2.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jcolsamp.inc"
 
@@ -325,7 +323,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
     movd        eax, xmmA
     cmp         ecx, byte SIZEOF_WORD
     jb          short .column_st1
-    mov         WORD [edi], ax
+    mov         word [edi], ax
     add         edi, byte SIZEOF_WORD
     sub         ecx, byte SIZEOF_WORD
     shr         eax, 16
@@ -334,7 +332,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
     ; space.
     test        ecx, ecx
     jz          short .endcolumn
-    mov         BYTE [edi], al
+    mov         byte [edi], al
 
 %else  ; RGB_PIXELSIZE == 4 ; -----------
 
diff --git a/simd/i386/jdsample-avx2.asm b/simd/i386/jdsample-avx2.asm
index 61ce511..a800c35 100644
--- a/simd/i386/jdsample-avx2.asm
+++ b/simd/i386/jdsample-avx2.asm
@@ -14,8 +14,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 
diff --git a/simd/i386/jdsample-mmx.asm b/simd/i386/jdsample-mmx.asm
index 1f810fa..12c49f0 100644
--- a/simd/i386/jdsample-mmx.asm
+++ b/simd/i386/jdsample-mmx.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 
diff --git a/simd/i386/jdsample-sse2.asm b/simd/i386/jdsample-sse2.asm
index f0da626..4e28d2f 100644
--- a/simd/i386/jdsample-sse2.asm
+++ b/simd/i386/jdsample-sse2.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 
diff --git a/simd/i386/jfdctflt-3dn.asm b/simd/i386/jfdctflt-3dn.asm
index 1d45865..322ab16 100644
--- a/simd/i386/jfdctflt-3dn.asm
+++ b/simd/i386/jfdctflt-3dn.asm
@@ -17,8 +17,6 @@
 ; This file contains a floating-point implementation of the forward DCT
 ; (Discrete Cosine Transform). The following code is based directly on
 ; the IJG's original jfdctflt.c; see the jfdctflt.c for more details.
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 %include "jdct.inc"
diff --git a/simd/i386/jfdctflt-sse.asm b/simd/i386/jfdctflt-sse.asm
index 1faf835..86952c6 100644
--- a/simd/i386/jfdctflt-sse.asm
+++ b/simd/i386/jfdctflt-sse.asm
@@ -17,8 +17,6 @@
 ; This file contains a floating-point implementation of the forward DCT
 ; (Discrete Cosine Transform). The following code is based directly on
 ; the IJG's original jfdctflt.c; see the jfdctflt.c for more details.
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 %include "jdct.inc"
diff --git a/simd/i386/jfdctfst-mmx.asm b/simd/i386/jfdctfst-mmx.asm
index 0271901..80645a5 100644
--- a/simd/i386/jfdctfst-mmx.asm
+++ b/simd/i386/jfdctfst-mmx.asm
@@ -18,8 +18,6 @@
 ; the forward DCT (Discrete Cosine Transform). The following code is
 ; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c
 ; for more details.
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 %include "jdct.inc"
diff --git a/simd/i386/jfdctfst-sse2.asm b/simd/i386/jfdctfst-sse2.asm
index f09dadd..446fa7a 100644
--- a/simd/i386/jfdctfst-sse2.asm
+++ b/simd/i386/jfdctfst-sse2.asm
@@ -18,8 +18,6 @@
 ; the forward DCT (Discrete Cosine Transform). The following code is
 ; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c
 ; for more details.
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 %include "jdct.inc"
diff --git a/simd/i386/jfdctint-avx2.asm b/simd/i386/jfdctint-avx2.asm
index ae258ee..97de230 100644
--- a/simd/i386/jfdctint-avx2.asm
+++ b/simd/i386/jfdctint-avx2.asm
@@ -18,8 +18,6 @@
 ; forward DCT (Discrete Cosine Transform). The following code is based
 ; directly on the IJG's original jfdctint.c; see the jfdctint.c for
 ; more details.
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 %include "jdct.inc"
diff --git a/simd/i386/jfdctint-mmx.asm b/simd/i386/jfdctint-mmx.asm
index c6bd959..3ade9d4 100644
--- a/simd/i386/jfdctint-mmx.asm
+++ b/simd/i386/jfdctint-mmx.asm
@@ -18,8 +18,6 @@
 ; forward DCT (Discrete Cosine Transform). The following code is based
 ; directly on the IJG's original jfdctint.c; see the jfdctint.c for
 ; more details.
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 %include "jdct.inc"
diff --git a/simd/i386/jfdctint-sse2.asm b/simd/i386/jfdctint-sse2.asm
index d67dcc1..71b684c 100644
--- a/simd/i386/jfdctint-sse2.asm
+++ b/simd/i386/jfdctint-sse2.asm
@@ -18,8 +18,6 @@
 ; forward DCT (Discrete Cosine Transform). The following code is based
 ; directly on the IJG's original jfdctint.c; see the jfdctint.c for
 ; more details.
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 %include "jdct.inc"
diff --git a/simd/i386/jidctflt-3dn.asm b/simd/i386/jidctflt-3dn.asm
index 73aa18d..8795191 100644
--- a/simd/i386/jidctflt-3dn.asm
+++ b/simd/i386/jidctflt-3dn.asm
@@ -17,8 +17,6 @@
 ; This file contains a floating-point implementation of the inverse DCT
 ; (Discrete Cosine Transform). The following code is based directly on
 ; the IJG's original jidctflt.c; see the jidctflt.c for more details.
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 %include "jdct.inc"
@@ -92,23 +90,23 @@ EXTN(jsimd_idct_float_3dnow):
     alignx      16, 7
 .columnloop:
 %ifndef NO_ZERO_COLUMN_TEST_FLOAT_3DNOW
-    mov         eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
-    or          eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
     jnz         short .columnDCT
 
     pushpic     ebx                     ; save GOT address
-    mov         ebx, DWORD [DWBLOCK(3,0,esi,SIZEOF_JCOEF)]
-    mov         eax, DWORD [DWBLOCK(4,0,esi,SIZEOF_JCOEF)]
-    or          ebx, DWORD [DWBLOCK(5,0,esi,SIZEOF_JCOEF)]
-    or          eax, DWORD [DWBLOCK(6,0,esi,SIZEOF_JCOEF)]
-    or          ebx, DWORD [DWBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    mov         ebx, dword [DWBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    mov         eax, dword [DWBLOCK(4,0,esi,SIZEOF_JCOEF)]
+    or          ebx, dword [DWBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(6,0,esi,SIZEOF_JCOEF)]
+    or          ebx, dword [DWBLOCK(7,0,esi,SIZEOF_JCOEF)]
     or          eax, ebx
     poppic      ebx                     ; restore GOT address
     jnz         short .columnDCT
 
     ; -- AC terms all zero
 
-    movd        mm0, DWORD [DWBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    movd        mm0, dword [DWBLOCK(0,0,esi,SIZEOF_JCOEF)]
 
     punpcklwd   mm0, mm0
     psrad       mm0, (DWORD_BIT-WORD_BIT)
@@ -135,10 +133,10 @@ EXTN(jsimd_idct_float_3dnow):
 
     ; -- Even part
 
-    movd        mm0, DWORD [DWBLOCK(0,0,esi,SIZEOF_JCOEF)]
-    movd        mm1, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
-    movd        mm2, DWORD [DWBLOCK(4,0,esi,SIZEOF_JCOEF)]
-    movd        mm3, DWORD [DWBLOCK(6,0,esi,SIZEOF_JCOEF)]
+    movd        mm0, dword [DWBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    movd        mm1, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    movd        mm2, dword [DWBLOCK(4,0,esi,SIZEOF_JCOEF)]
+    movd        mm3, dword [DWBLOCK(6,0,esi,SIZEOF_JCOEF)]
 
     punpcklwd   mm0, mm0
     punpcklwd   mm1, mm1
@@ -182,10 +180,10 @@ EXTN(jsimd_idct_float_3dnow):
 
     ; -- Odd part
 
-    movd        mm2, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
-    movd        mm3, DWORD [DWBLOCK(3,0,esi,SIZEOF_JCOEF)]
-    movd        mm5, DWORD [DWBLOCK(5,0,esi,SIZEOF_JCOEF)]
-    movd        mm1, DWORD [DWBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    movd        mm2, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movd        mm3, dword [DWBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    movd        mm5, dword [DWBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    movd        mm1, dword [DWBLOCK(7,0,esi,SIZEOF_JCOEF)]
 
     punpcklwd   mm2, mm2
     punpcklwd   mm3, mm3
diff --git a/simd/i386/jidctflt-sse.asm b/simd/i386/jidctflt-sse.asm
index 386650f..b27ecfd 100644
--- a/simd/i386/jidctflt-sse.asm
+++ b/simd/i386/jidctflt-sse.asm
@@ -17,8 +17,6 @@
 ; This file contains a floating-point implementation of the inverse DCT
 ; (Discrete Cosine Transform). The following code is based directly on
 ; the IJG's original jidctflt.c; see the jidctflt.c for more details.
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 %include "jdct.inc"
@@ -102,8 +100,8 @@ EXTN(jsimd_idct_float_sse):
     alignx      16, 7
 .columnloop:
 %ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
-    mov         eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
-    or          eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
     jnz         near .columnDCT
 
     movq        mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
diff --git a/simd/i386/jidctflt-sse2.asm b/simd/i386/jidctflt-sse2.asm
index 9de7139..c646eae 100644
--- a/simd/i386/jidctflt-sse2.asm
+++ b/simd/i386/jidctflt-sse2.asm
@@ -17,8 +17,6 @@
 ; This file contains a floating-point implementation of the inverse DCT
 ; (Discrete Cosine Transform). The following code is based directly on
 ; the IJG's original jidctflt.c; see the jidctflt.c for more details.
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 %include "jdct.inc"
@@ -102,8 +100,8 @@ EXTN(jsimd_idct_float_sse2):
     alignx      16, 7
 .columnloop:
 %ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
-    mov         eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
-    or          eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
     jnz         near .columnDCT
 
     movq        xmm1, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
diff --git a/simd/i386/jidctfst-mmx.asm b/simd/i386/jidctfst-mmx.asm
index d3e8a5d..24622d4 100644
--- a/simd/i386/jidctfst-mmx.asm
+++ b/simd/i386/jidctfst-mmx.asm
@@ -18,8 +18,6 @@
 ; the inverse DCT (Discrete Cosine Transform). The following code is
 ; based directly on the IJG's original jidctfst.c; see the jidctfst.c
 ; for more details.
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 %include "jdct.inc"
@@ -123,8 +121,8 @@ EXTN(jsimd_idct_ifast_mmx):
     alignx      16, 7
 .columnloop:
 %ifndef NO_ZERO_COLUMN_TEST_IFAST_MMX
-    mov         eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
-    or          eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
     jnz         short .columnDCT
 
     movq        mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
diff --git a/simd/i386/jidctfst-sse2.asm b/simd/i386/jidctfst-sse2.asm
index 83bc414..19704ff 100644
--- a/simd/i386/jidctfst-sse2.asm
+++ b/simd/i386/jidctfst-sse2.asm
@@ -18,8 +18,6 @@
 ; the inverse DCT (Discrete Cosine Transform). The following code is
 ; based directly on the IJG's original jidctfst.c; see the jidctfst.c
 ; for more details.
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 %include "jdct.inc"
@@ -118,8 +116,8 @@ EXTN(jsimd_idct_ifast_sse2):
     mov         esi, JCOEFPTR [coef_block(eax)]  ; inptr
 
 %ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2
-    mov         eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
-    or          eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
     jnz         near .columnDCT
 
     movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
diff --git a/simd/i386/jidctint-avx2.asm b/simd/i386/jidctint-avx2.asm
index b3b7b14..c371985 100644
--- a/simd/i386/jidctint-avx2.asm
+++ b/simd/i386/jidctint-avx2.asm
@@ -18,8 +18,6 @@
 ; inverse DCT (Discrete Cosine Transform). The following code is based
 ; directly on the IJG's original jidctint.c; see the jidctint.c for
 ; more details.
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 %include "jdct.inc"
@@ -320,8 +318,8 @@ EXTN(jsimd_idct_islow_avx2):
     mov         esi, JCOEFPTR [coef_block(eax)]  ; inptr
 
 %ifndef NO_ZERO_COLUMN_TEST_ISLOW_AVX2
-    mov         eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
-    or          eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
     jnz         near .columnDCT
 
     movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
diff --git a/simd/i386/jidctint-mmx.asm b/simd/i386/jidctint-mmx.asm
index 6ca6d06..4f07f56 100644
--- a/simd/i386/jidctint-mmx.asm
+++ b/simd/i386/jidctint-mmx.asm
@@ -18,8 +18,6 @@
 ; inverse DCT (Discrete Cosine Transform). The following code is based
 ; directly on the IJG's original jidctint.c; see the jidctint.c for
 ; more details.
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 %include "jdct.inc"
@@ -136,8 +134,8 @@ EXTN(jsimd_idct_islow_mmx):
     alignx      16, 7
 .columnloop:
 %ifndef NO_ZERO_COLUMN_TEST_ISLOW_MMX
-    mov         eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
-    or          eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
     jnz         short .columnDCT
 
     movq        mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
diff --git a/simd/i386/jidctint-sse2.asm b/simd/i386/jidctint-sse2.asm
index a6bd00a..e442fdd 100644
--- a/simd/i386/jidctint-sse2.asm
+++ b/simd/i386/jidctint-sse2.asm
@@ -18,8 +18,6 @@
 ; inverse DCT (Discrete Cosine Transform). The following code is based
 ; directly on the IJG's original jidctint.c; see the jidctint.c for
 ; more details.
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 %include "jdct.inc"
@@ -131,8 +129,8 @@ EXTN(jsimd_idct_islow_sse2):
     mov         esi, JCOEFPTR [coef_block(eax)]  ; inptr
 
 %ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2
-    mov         eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
-    or          eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
     jnz         near .columnDCT
 
     movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
diff --git a/simd/i386/jidctred-mmx.asm b/simd/i386/jidctred-mmx.asm
index 336ee3b..e2307e1 100644
--- a/simd/i386/jidctred-mmx.asm
+++ b/simd/i386/jidctred-mmx.asm
@@ -18,8 +18,6 @@
 ; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
 ; The following code is based directly on the IJG's original jidctred.c;
 ; see the jidctred.c for more details.
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 %include "jdct.inc"
@@ -144,8 +142,8 @@ EXTN(jsimd_idct_4x4_mmx):
     alignx      16, 7
 .columnloop:
 %ifndef NO_ZERO_COLUMN_TEST_4X4_MMX
-    mov         eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
-    or          eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
     jnz         short .columnDCT
 
     movq        mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
@@ -464,16 +462,16 @@ EXTN(jsimd_idct_4x4_mmx):
 
     mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
     mov         esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
-    movd        DWORD [edx+eax*SIZEOF_JSAMPLE], mm1
-    movd        DWORD [esi+eax*SIZEOF_JSAMPLE], mm0
+    movd        dword [edx+eax*SIZEOF_JSAMPLE], mm1
+    movd        dword [esi+eax*SIZEOF_JSAMPLE], mm0
 
     psrlq       mm1, 4*BYTE_BIT
     psrlq       mm0, 4*BYTE_BIT
 
     mov         edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
     mov         esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
-    movd        DWORD [edx+eax*SIZEOF_JSAMPLE], mm1
-    movd        DWORD [esi+eax*SIZEOF_JSAMPLE], mm0
+    movd        dword [edx+eax*SIZEOF_JSAMPLE], mm1
+    movd        dword [esi+eax*SIZEOF_JSAMPLE], mm0
 
     emms                                ; empty MMX state
 
@@ -688,8 +686,8 @@ EXTN(jsimd_idct_2x2_mmx):
 
     mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
     mov         esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
-    mov         WORD [edx+eax*SIZEOF_JSAMPLE], bx
-    mov         WORD [esi+eax*SIZEOF_JSAMPLE], cx
+    mov         word [edx+eax*SIZEOF_JSAMPLE], bx
+    mov         word [esi+eax*SIZEOF_JSAMPLE], cx
 
     emms                                ; empty MMX state
 
diff --git a/simd/i386/jidctred-sse2.asm b/simd/i386/jidctred-sse2.asm
index 97838ba..6e56494 100644
--- a/simd/i386/jidctred-sse2.asm
+++ b/simd/i386/jidctred-sse2.asm
@@ -18,8 +18,6 @@
 ; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
 ; The following code is based directly on the IJG's original jidctred.c;
 ; see the jidctred.c for more details.
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 %include "jdct.inc"
@@ -139,8 +137,8 @@ EXTN(jsimd_idct_4x4_sse2):
     mov         esi, JCOEFPTR [coef_block(eax)]  ; inptr
 
 %ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2
-    mov         eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
-    or          eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
     jnz         short .columnDCT
 
     movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
@@ -578,8 +576,8 @@ EXTN(jsimd_idct_2x2_sse2):
 
     mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
     mov         esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
-    mov         WORD [edx+eax*SIZEOF_JSAMPLE], bx
-    mov         WORD [esi+eax*SIZEOF_JSAMPLE], cx
+    mov         word [edx+eax*SIZEOF_JSAMPLE], bx
+    mov         word [esi+eax*SIZEOF_JSAMPLE], cx
 
     pop         edi
     pop         esi
diff --git a/simd/i386/jquant-3dn.asm b/simd/i386/jquant-3dn.asm
index 1767f44..5cb60ca 100644
--- a/simd/i386/jquant-3dn.asm
+++ b/simd/i386/jquant-3dn.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 %include "jdct.inc"
diff --git a/simd/i386/jquant-mmx.asm b/simd/i386/jquant-mmx.asm
index 98932db..61305c6 100644
--- a/simd/i386/jquant-mmx.asm
+++ b/simd/i386/jquant-mmx.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 %include "jdct.inc"
diff --git a/simd/i386/jquant-sse.asm b/simd/i386/jquant-sse.asm
index cc244c4..218adc9 100644
--- a/simd/i386/jquant-sse.asm
+++ b/simd/i386/jquant-sse.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 %include "jdct.inc"
diff --git a/simd/i386/jquantf-sse2.asm b/simd/i386/jquantf-sse2.asm
index 8d1201c..a881ab5 100644
--- a/simd/i386/jquantf-sse2.asm
+++ b/simd/i386/jquantf-sse2.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 %include "jdct.inc"
diff --git a/simd/i386/jquanti-avx2.asm b/simd/i386/jquanti-avx2.asm
index ea8e1a1..5ed6bec 100644
--- a/simd/i386/jquanti-avx2.asm
+++ b/simd/i386/jquanti-avx2.asm
@@ -14,8 +14,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 %include "jdct.inc"
diff --git a/simd/i386/jquanti-sse2.asm b/simd/i386/jquanti-sse2.asm
index 2a69494..0a50940 100644
--- a/simd/i386/jquanti-sse2.asm
+++ b/simd/i386/jquanti-sse2.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 %include "jdct.inc"
diff --git a/simd/i386/jsimdcpu.asm b/simd/i386/jsimdcpu.asm
index 0af4eec..ddcafa9 100644
--- a/simd/i386/jsimdcpu.asm
+++ b/simd/i386/jsimdcpu.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 
diff --git a/simd/nasm/jcolsamp.inc b/simd/nasm/jcolsamp.inc
index a2d5b49..6f6d7f2 100644
--- a/simd/nasm/jcolsamp.inc
+++ b/simd/nasm/jcolsamp.inc
@@ -7,8 +7,6 @@
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; [TAB8]
 
 ; --------------------------------------------------------------------------
 
diff --git a/simd/nasm/jdct.inc b/simd/nasm/jdct.inc
index 79d5146..9192f66 100644
--- a/simd/nasm/jdct.inc
+++ b/simd/nasm/jdct.inc
@@ -7,8 +7,6 @@
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; [TAB8]
 
 ; Each IDCT routine is responsible for range-limiting its results and
 ; converting them to unsigned form (0..MAXJSAMPLE).  The raw outputs could
diff --git a/simd/nasm/jsimdext.inc b/simd/nasm/jsimdext.inc
index fe9f026..9930d80 100644
--- a/simd/nasm/jsimdext.inc
+++ b/simd/nasm/jsimdext.inc
@@ -24,8 +24,6 @@
 ; 2. Altered source versions must be plainly marked as such, and must not be
 ;    misrepresented as being the original software.
 ; 3. This notice may not be removed or altered from any source distribution.
-;
-; [TAB8]
 
 ; ==========================================================================
 ;  System-dependent configurations
@@ -167,19 +165,19 @@ section .note.GNU-stack noalloc noexec nowrite progbits
 %define XMM_DWORD
 %define XMM_MMWORD
 
-%define SIZEOF_BYTE   1                 ; sizeof(BYTE)
-%define SIZEOF_WORD   2                 ; sizeof(WORD)
-%define SIZEOF_DWORD  4                 ; sizeof(DWORD)
-%define SIZEOF_QWORD  8                 ; sizeof(QWORD)
-%define SIZEOF_OWORD  16                ; sizeof(OWORD)
-%define SIZEOF_YWORD  32                ; sizeof(YWORD)
+%define SIZEOF_BYTE   1                 ; sizeof(byte)
+%define SIZEOF_WORD   2                 ; sizeof(word)
+%define SIZEOF_DWORD  4                 ; sizeof(dword)
+%define SIZEOF_QWORD  8                 ; sizeof(qword)
+%define SIZEOF_OWORD  16                ; sizeof(oword)
+%define SIZEOF_YWORD  32                ; sizeof(yword)
 
 %define BYTE_BIT      8                 ; CHAR_BIT in C
-%define WORD_BIT      16                ; sizeof(WORD)*BYTE_BIT
-%define DWORD_BIT     32                ; sizeof(DWORD)*BYTE_BIT
-%define QWORD_BIT     64                ; sizeof(QWORD)*BYTE_BIT
-%define OWORD_BIT     128               ; sizeof(OWORD)*BYTE_BIT
-%define YWORD_BIT     256               ; sizeof(YWORD)*BYTE_BIT
+%define WORD_BIT      16                ; sizeof(word)*BYTE_BIT
+%define DWORD_BIT     32                ; sizeof(dword)*BYTE_BIT
+%define QWORD_BIT     64                ; sizeof(qword)*BYTE_BIT
+%define OWORD_BIT     128               ; sizeof(oword)*BYTE_BIT
+%define YWORD_BIT     256               ; sizeof(yword)*BYTE_BIT
 
 ; --------------------------------------------------------------------------
 ;  External Symbol Name
diff --git a/simd/x86_64/jccolext-avx2.asm b/simd/x86_64/jccolext-avx2.asm
index 5fa3848..10d2834 100644
--- a/simd/x86_64/jccolext-avx2.asm
+++ b/simd/x86_64/jccolext-avx2.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jcolsamp.inc"
 
@@ -96,12 +94,12 @@ EXTN(jsimd_rgb_ycc_convert_avx2):
     test        cl, SIZEOF_BYTE
     jz          short .column_ld2
     sub         rcx, byte SIZEOF_BYTE
-    movzx       rax, BYTE [rsi+rcx]
+    movzx       rax, byte [rsi+rcx]
 .column_ld2:
     test        cl, SIZEOF_WORD
     jz          short .column_ld4
     sub         rcx, byte SIZEOF_WORD
-    movzx       rdx, WORD [rsi+rcx]
+    movzx       rdx, word [rsi+rcx]
     shl         rax, WORD_BIT
     or          rax, rdx
 .column_ld4:
diff --git a/simd/x86_64/jccolext-sse2.asm b/simd/x86_64/jccolext-sse2.asm
index b1486c0..2c914d3 100644
--- a/simd/x86_64/jccolext-sse2.asm
+++ b/simd/x86_64/jccolext-sse2.asm
@@ -12,8 +12,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jcolsamp.inc"
 
@@ -95,12 +93,12 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
     test        cl, SIZEOF_BYTE
     jz          short .column_ld2
     sub         rcx, byte SIZEOF_BYTE
-    movzx       rax, BYTE [rsi+rcx]
+    movzx       rax, byte [rsi+rcx]
 .column_ld2:
     test        cl, SIZEOF_WORD
     jz          short .column_ld4
     sub         rcx, byte SIZEOF_WORD
-    movzx       rdx, WORD [rsi+rcx]
+    movzx       rdx, word [rsi+rcx]
     shl         rax, WORD_BIT
     or          rax, rdx
 .column_ld4:
diff --git a/simd/x86_64/jccolor-avx2.asm b/simd/x86_64/jccolor-avx2.asm
index f9f4be0..16b7829 100644
--- a/simd/x86_64/jccolor-avx2.asm
+++ b/simd/x86_64/jccolor-avx2.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 
diff --git a/simd/x86_64/jccolor-sse2.asm b/simd/x86_64/jccolor-sse2.asm
index 3e46601..e2955c2 100644
--- a/simd/x86_64/jccolor-sse2.asm
+++ b/simd/x86_64/jccolor-sse2.asm
@@ -12,8 +12,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 
diff --git a/simd/x86_64/jcgray-avx2.asm b/simd/x86_64/jcgray-avx2.asm
index 0ec2410..591255b 100644
--- a/simd/x86_64/jcgray-avx2.asm
+++ b/simd/x86_64/jcgray-avx2.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 
diff --git a/simd/x86_64/jcgray-sse2.asm b/simd/x86_64/jcgray-sse2.asm
index edf9222..e389904 100644
--- a/simd/x86_64/jcgray-sse2.asm
+++ b/simd/x86_64/jcgray-sse2.asm
@@ -12,8 +12,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 
diff --git a/simd/x86_64/jcgryext-avx2.asm b/simd/x86_64/jcgryext-avx2.asm
index 79e2aa0..175b60d 100644
--- a/simd/x86_64/jcgryext-avx2.asm
+++ b/simd/x86_64/jcgryext-avx2.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jcolsamp.inc"
 
@@ -88,12 +86,12 @@ EXTN(jsimd_rgb_gray_convert_avx2):
     test        cl, SIZEOF_BYTE
     jz          short .column_ld2
     sub         rcx, byte SIZEOF_BYTE
-    movzx       rax, BYTE [rsi+rcx]
+    movzx       rax, byte [rsi+rcx]
 .column_ld2:
     test        cl, SIZEOF_WORD
     jz          short .column_ld4
     sub         rcx, byte SIZEOF_WORD
-    movzx       rdx, WORD [rsi+rcx]
+    movzx       rdx, word [rsi+rcx]
     shl         rax, WORD_BIT
     or          rax, rdx
 .column_ld4:
diff --git a/simd/x86_64/jcgryext-sse2.asm b/simd/x86_64/jcgryext-sse2.asm
index 9c3ae5e..873be80 100644
--- a/simd/x86_64/jcgryext-sse2.asm
+++ b/simd/x86_64/jcgryext-sse2.asm
@@ -12,8 +12,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jcolsamp.inc"
 
@@ -87,12 +85,12 @@ EXTN(jsimd_rgb_gray_convert_sse2):
     test        cl, SIZEOF_BYTE
     jz          short .column_ld2
     sub         rcx, byte SIZEOF_BYTE
-    movzx       rax, BYTE [rsi+rcx]
+    movzx       rax, byte [rsi+rcx]
 .column_ld2:
     test        cl, SIZEOF_WORD
     jz          short .column_ld4
     sub         rcx, byte SIZEOF_WORD
-    movzx       rdx, WORD [rsi+rcx]
+    movzx       rdx, word [rsi+rcx]
     shl         rax, WORD_BIT
     or          rax, rdx
 .column_ld4:
diff --git a/simd/x86_64/jchuff-sse2.asm b/simd/x86_64/jchuff-sse2.asm
index 5ec8b1a..7deab58 100644
--- a/simd/x86_64/jchuff-sse2.asm
+++ b/simd/x86_64/jchuff-sse2.asm
@@ -17,8 +17,6 @@
 ; This file contains an SSE2 implementation for Huffman coding of one block.
 ; The following code is based directly on jchuff.c; see jchuff.c for more
 ; details.
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 
@@ -199,7 +197,7 @@ EXTN(jsimd_huff_encode_one_block_sse2):
     mov         buffer, r11                  ; r11 is now sratch
 
     mov         put_buffer, MMWORD [r10+16]  ; put_buffer = state->cur.put_buffer;
-    mov         put_bits,    DWORD [r10+24]  ; put_bits = state->cur.put_bits;
+    mov         put_bits,    dword [r10+24]  ; put_bits = state->cur.put_bits;
     push        r10                          ; r10 is now scratch
 
     ; Encode the DC coefficient difference per section F.1.2.1
@@ -332,7 +330,7 @@ EXTN(jsimd_huff_encode_one_block_sse2):
     pop         r10
     ; Save put_buffer & put_bits
     mov         MMWORD [r10+16], put_buffer  ; state->cur.put_buffer = put_buffer;
-    mov         DWORD  [r10+24], put_bits    ; state->cur.put_bits = put_bits;
+    mov         dword  [r10+24], put_bits    ; state->cur.put_bits = put_bits;
 
     pop         rbx
     uncollect_args 6
diff --git a/simd/x86_64/jcphuff-sse2.asm b/simd/x86_64/jcphuff-sse2.asm
index b17488a..8ed4472 100644
--- a/simd/x86_64/jcphuff-sse2.asm
+++ b/simd/x86_64/jcphuff-sse2.asm
@@ -16,8 +16,6 @@
 ;
 ; This file contains an SSE2 implementation of data preparation for progressive
 ; Huffman encoding.  See jcphuff.c for more details.
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 
@@ -322,6 +320,8 @@ EXTN(jsimd_encode_mcu_AC_first_prepare_sse2):
     add         LUT, 16*SIZEOF_INT
     dec         K
     jnz         .BLOOP16
+    test        LEN, 15
+    je          .PADDING
 .ELOOP16:
     test        LEN, 8
     jz          .TRY7
diff --git a/simd/x86_64/jcsample-avx2.asm b/simd/x86_64/jcsample-avx2.asm
index 9d5a861..d9922bb 100644
--- a/simd/x86_64/jcsample-avx2.asm
+++ b/simd/x86_64/jcsample-avx2.asm
@@ -14,8 +14,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 
diff --git a/simd/x86_64/jcsample-sse2.asm b/simd/x86_64/jcsample-sse2.asm
index 1b31536..0f107e9 100644
--- a/simd/x86_64/jcsample-sse2.asm
+++ b/simd/x86_64/jcsample-sse2.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 
diff --git a/simd/x86_64/jdcolext-avx2.asm b/simd/x86_64/jdcolext-avx2.asm
index e2b96c7..677b8ed 100644
--- a/simd/x86_64/jdcolext-avx2.asm
+++ b/simd/x86_64/jdcolext-avx2.asm
@@ -14,8 +14,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jcolsamp.inc"
 
@@ -334,7 +332,7 @@ EXTN(jsimd_ycc_rgb_convert_avx2):
     vmovd       eax, xmmA
     cmp         rcx, byte SIZEOF_WORD
     jb          short .column_st1
-    mov         WORD [rdi], ax
+    mov         word [rdi], ax
     add         rdi, byte SIZEOF_WORD
     sub         rcx, byte SIZEOF_WORD
     shr         rax, 16
@@ -343,7 +341,7 @@ EXTN(jsimd_ycc_rgb_convert_avx2):
     ; space.
     test        rcx, rcx
     jz          short .nextrow
-    mov         BYTE [rdi], al
+    mov         byte [rdi], al
 
 %else  ; RGB_PIXELSIZE == 4 ; -----------
 
diff --git a/simd/x86_64/jdcolext-sse2.asm b/simd/x86_64/jdcolext-sse2.asm
index a94954b..071aa62 100644
--- a/simd/x86_64/jdcolext-sse2.asm
+++ b/simd/x86_64/jdcolext-sse2.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jcolsamp.inc"
 
@@ -306,7 +304,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
     movd        eax, xmmA
     cmp         rcx, byte SIZEOF_WORD
     jb          short .column_st1
-    mov         WORD [rdi], ax
+    mov         word [rdi], ax
     add         rdi, byte SIZEOF_WORD
     sub         rcx, byte SIZEOF_WORD
     shr         rax, 16
@@ -315,7 +313,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
     ; space.
     test        rcx, rcx
     jz          short .nextrow
-    mov         BYTE [rdi], al
+    mov         byte [rdi], al
 
 %else  ; RGB_PIXELSIZE == 4 ; -----------
 
diff --git a/simd/x86_64/jdcolor-avx2.asm b/simd/x86_64/jdcolor-avx2.asm
index abad176..43de9db 100644
--- a/simd/x86_64/jdcolor-avx2.asm
+++ b/simd/x86_64/jdcolor-avx2.asm
@@ -14,8 +14,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 
diff --git a/simd/x86_64/jdcolor-sse2.asm b/simd/x86_64/jdcolor-sse2.asm
index e7079f6..b3f1fec 100644
--- a/simd/x86_64/jdcolor-sse2.asm
+++ b/simd/x86_64/jdcolor-sse2.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 
diff --git a/simd/x86_64/jdmerge-avx2.asm b/simd/x86_64/jdmerge-avx2.asm
index ca3f063..9515a17 100644
--- a/simd/x86_64/jdmerge-avx2.asm
+++ b/simd/x86_64/jdmerge-avx2.asm
@@ -14,8 +14,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 
diff --git a/simd/x86_64/jdmerge-sse2.asm b/simd/x86_64/jdmerge-sse2.asm
index f3e09fa..aedccc2 100644
--- a/simd/x86_64/jdmerge-sse2.asm
+++ b/simd/x86_64/jdmerge-sse2.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 
diff --git a/simd/x86_64/jdmrgext-avx2.asm b/simd/x86_64/jdmrgext-avx2.asm
index 04e8a94..bb733c5 100644
--- a/simd/x86_64/jdmrgext-avx2.asm
+++ b/simd/x86_64/jdmrgext-avx2.asm
@@ -14,8 +14,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jcolsamp.inc"
 
@@ -339,7 +337,7 @@ EXTN(jsimd_h2v1_merged_upsample_avx2):
     vmovd       eax, xmmA
     cmp         rcx, byte SIZEOF_WORD
     jb          short .column_st1
-    mov         WORD [rdi], ax
+    mov         word [rdi], ax
     add         rdi, byte SIZEOF_WORD
     sub         rcx, byte SIZEOF_WORD
     shr         rax, 16
@@ -348,7 +346,7 @@ EXTN(jsimd_h2v1_merged_upsample_avx2):
     ; space.
     test        rcx, rcx
     jz          short .endcolumn
-    mov         BYTE [rdi], al
+    mov         byte [rdi], al
 
 %else  ; RGB_PIXELSIZE == 4 ; -----------
 
diff --git a/simd/x86_64/jdmrgext-sse2.asm b/simd/x86_64/jdmrgext-sse2.asm
index 1cc3345..b176a4c 100644
--- a/simd/x86_64/jdmrgext-sse2.asm
+++ b/simd/x86_64/jdmrgext-sse2.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jcolsamp.inc"
 
@@ -310,7 +308,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
     movd        eax, xmmA
     cmp         rcx, byte SIZEOF_WORD
     jb          short .column_st1
-    mov         WORD [rdi], ax
+    mov         word [rdi], ax
     add         rdi, byte SIZEOF_WORD
     sub         rcx, byte SIZEOF_WORD
     shr         rax, 16
@@ -319,7 +317,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
     ; space.
     test        rcx, rcx
     jz          short .endcolumn
-    mov         BYTE [rdi], al
+    mov         byte [rdi], al
 
 %else  ; RGB_PIXELSIZE == 4 ; -----------
 
diff --git a/simd/x86_64/jdsample-avx2.asm b/simd/x86_64/jdsample-avx2.asm
index 10fa5c4..fc274a9 100644
--- a/simd/x86_64/jdsample-avx2.asm
+++ b/simd/x86_64/jdsample-avx2.asm
@@ -14,8 +14,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 
diff --git a/simd/x86_64/jdsample-sse2.asm b/simd/x86_64/jdsample-sse2.asm
index d8ccda9..20e0767 100644
--- a/simd/x86_64/jdsample-sse2.asm
+++ b/simd/x86_64/jdsample-sse2.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 
diff --git a/simd/x86_64/jfdctflt-sse.asm b/simd/x86_64/jfdctflt-sse.asm
index 26f9fb6..ef27966 100644
--- a/simd/x86_64/jfdctflt-sse.asm
+++ b/simd/x86_64/jfdctflt-sse.asm
@@ -17,8 +17,6 @@
 ; This file contains a floating-point implementation of the forward DCT
 ; (Discrete Cosine Transform). The following code is based directly on
 ; the IJG's original jfdctflt.c; see the jfdctflt.c for more details.
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 %include "jdct.inc"
diff --git a/simd/x86_64/jfdctfst-sse2.asm b/simd/x86_64/jfdctfst-sse2.asm
index aaf8b9e..2e1bfe6 100644
--- a/simd/x86_64/jfdctfst-sse2.asm
+++ b/simd/x86_64/jfdctfst-sse2.asm
@@ -18,8 +18,6 @@
 ; the forward DCT (Discrete Cosine Transform). The following code is
 ; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c
 ; for more details.
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 %include "jdct.inc"
diff --git a/simd/x86_64/jfdctint-avx2.asm b/simd/x86_64/jfdctint-avx2.asm
index 448f47d..6ad4cf0 100644
--- a/simd/x86_64/jfdctint-avx2.asm
+++ b/simd/x86_64/jfdctint-avx2.asm
@@ -18,8 +18,6 @@
 ; forward DCT (Discrete Cosine Transform). The following code is based
 ; directly on the IJG's original jfdctint.c; see the jfdctint.c for
 ; more details.
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 %include "jdct.inc"
diff --git a/simd/x86_64/jfdctint-sse2.asm b/simd/x86_64/jfdctint-sse2.asm
index ef16a52..5d0de3c 100644
--- a/simd/x86_64/jfdctint-sse2.asm
+++ b/simd/x86_64/jfdctint-sse2.asm
@@ -18,8 +18,6 @@
 ; forward DCT (Discrete Cosine Transform). The following code is based
 ; directly on the IJG's original jfdctint.c; see the jfdctint.c for
 ; more details.
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 %include "jdct.inc"
diff --git a/simd/x86_64/jidctflt-sse2.asm b/simd/x86_64/jidctflt-sse2.asm
index b676ef3..ab95e1a 100644
--- a/simd/x86_64/jidctflt-sse2.asm
+++ b/simd/x86_64/jidctflt-sse2.asm
@@ -17,8 +17,6 @@
 ; This file contains a floating-point implementation of the inverse DCT
 ; (Discrete Cosine Transform). The following code is based directly on
 ; the IJG's original jidctflt.c; see the jidctflt.c for more details.
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 %include "jdct.inc"
@@ -95,8 +93,8 @@ EXTN(jsimd_idct_float_sse2):
     mov         rcx, DCTSIZE/4          ; ctr
 .columnloop:
 %ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
-    mov         eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
-    or          eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+    mov         eax, dword [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
     jnz         near .columnDCT
 
     movq        xmm1, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
diff --git a/simd/x86_64/jidctfst-sse2.asm b/simd/x86_64/jidctfst-sse2.asm
index c6c42f9..a66a681 100644
--- a/simd/x86_64/jidctfst-sse2.asm
+++ b/simd/x86_64/jidctfst-sse2.asm
@@ -18,8 +18,6 @@
 ; the inverse DCT (Discrete Cosine Transform). The following code is
 ; based directly on the IJG's original jidctfst.c; see the jidctfst.c
 ; for more details.
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 %include "jdct.inc"
@@ -111,8 +109,8 @@ EXTN(jsimd_idct_ifast_sse2):
     mov         rsi, r11                ; inptr
 
 %ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2
-    mov         eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
-    or          eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+    mov         eax, dword [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
     jnz         near .columnDCT
 
     movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
diff --git a/simd/x86_64/jidctint-avx2.asm b/simd/x86_64/jidctint-avx2.asm
index b60b44f..50270f4 100644
--- a/simd/x86_64/jidctint-avx2.asm
+++ b/simd/x86_64/jidctint-avx2.asm
@@ -18,8 +18,6 @@
 ; inverse DCT (Discrete Cosine Transform). The following code is based
 ; directly on the IJG's original jidctint.c; see the jidctint.c for
 ; more details.
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 %include "jdct.inc"
@@ -292,8 +290,8 @@ EXTN(jsimd_idct_islow_avx2):
     ; ---- Pass 1: process columns.
 
 %ifndef NO_ZERO_COLUMN_TEST_ISLOW_AVX2
-    mov         eax, DWORD [DWBLOCK(1,0,r11,SIZEOF_JCOEF)]
-    or          eax, DWORD [DWBLOCK(2,0,r11,SIZEOF_JCOEF)]
+    mov         eax, dword [DWBLOCK(1,0,r11,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,r11,SIZEOF_JCOEF)]
     jnz         near .columnDCT
 
     movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,r11,SIZEOF_JCOEF)]
diff --git a/simd/x86_64/jidctint-sse2.asm b/simd/x86_64/jidctint-sse2.asm
index 83fc344..034530c 100644
--- a/simd/x86_64/jidctint-sse2.asm
+++ b/simd/x86_64/jidctint-sse2.asm
@@ -18,8 +18,6 @@
 ; inverse DCT (Discrete Cosine Transform). The following code is based
 ; directly on the IJG's original jidctint.c; see the jidctint.c for
 ; more details.
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 %include "jdct.inc"
@@ -124,8 +122,8 @@ EXTN(jsimd_idct_islow_sse2):
     mov         rsi, r11                ; inptr
 
 %ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2
-    mov         eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
-    or          eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+    mov         eax, dword [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
     jnz         near .columnDCT
 
     movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
diff --git a/simd/x86_64/jidctred-sse2.asm b/simd/x86_64/jidctred-sse2.asm
index af64fdc..7fbfcc5 100644
--- a/simd/x86_64/jidctred-sse2.asm
+++ b/simd/x86_64/jidctred-sse2.asm
@@ -18,8 +18,6 @@
 ; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
 ; The following code is based directly on the IJG's original jidctred.c;
 ; see the jidctred.c for more details.
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 %include "jdct.inc"
@@ -132,8 +130,8 @@ EXTN(jsimd_idct_4x4_sse2):
     mov         rsi, r11                ; inptr
 
 %ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2
-    mov         eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
-    or          eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+    mov         eax, dword [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
     jnz         short .columnDCT
 
     movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
@@ -562,8 +560,8 @@ EXTN(jsimd_idct_2x2_sse2):
 
     mov         rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
     mov         rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
-    mov         WORD [rdx+rax*SIZEOF_JSAMPLE], bx
-    mov         WORD [rsi+rax*SIZEOF_JSAMPLE], cx
+    mov         word [rdx+rax*SIZEOF_JSAMPLE], bx
+    mov         word [rsi+rax*SIZEOF_JSAMPLE], cx
 
     pop         rbx
     uncollect_args 4
diff --git a/simd/x86_64/jquantf-sse2.asm b/simd/x86_64/jquantf-sse2.asm
index 4600eec..83596a9 100644
--- a/simd/x86_64/jquantf-sse2.asm
+++ b/simd/x86_64/jquantf-sse2.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 %include "jdct.inc"
diff --git a/simd/x86_64/jquanti-avx2.asm b/simd/x86_64/jquanti-avx2.asm
index b7243e4..5f04d22 100644
--- a/simd/x86_64/jquanti-avx2.asm
+++ b/simd/x86_64/jquanti-avx2.asm
@@ -14,8 +14,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 %include "jdct.inc"
diff --git a/simd/x86_64/jquanti-sse2.asm b/simd/x86_64/jquanti-sse2.asm
index 7ff7275..bb6fa69 100644
--- a/simd/x86_64/jquanti-sse2.asm
+++ b/simd/x86_64/jquanti-sse2.asm
@@ -13,8 +13,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 %include "jdct.inc"
diff --git a/simd/x86_64/jsimdcpu.asm b/simd/x86_64/jsimdcpu.asm
index a905282..705f813 100644
--- a/simd/x86_64/jsimdcpu.asm
+++ b/simd/x86_64/jsimdcpu.asm
@@ -14,8 +14,6 @@
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
 
 %include "jsimdext.inc"
 
diff --git a/tjbench.c b/tjbench.c
index f555fae..e098f3b 100644
--- a/tjbench.c
+++ b/tjbench.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C)2009-2018 D. R. Commander.  All Rights Reserved.
+ * Copyright (C)2009-2019 D. R. Commander.  All Rights Reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -32,28 +32,29 @@
 #include <ctype.h>
 #include <math.h>
 #include <errno.h>
+#include <limits.h>
 #include <cdjpeg.h>
 #include "./tjutil.h"
 #include "./turbojpeg.h"
 
 
-#define _throw(op, err) { \
+#define THROW(op, err) { \
   fprintf(stderr, "ERROR in line %d while %s:\n%s\n", __LINE__, op, err); \
   retval = -1;  goto bailout; \
 }
-#define _throwunix(m)  _throw(m, strerror(errno))
+#define THROW_UNIX(m)  THROW(m, strerror(errno))
 
 static char tjErrorStr[JMSG_LENGTH_MAX] = "\0",
             tjErrorMsg[JMSG_LENGTH_MAX] = "\0";
 static int tjErrorLine = -1, tjErrorCode = -1;
 
-#define _throwtjg(m) { \
+#define THROW_TJG(m) { \
   fprintf(stderr, "ERROR in line %d while %s:\n%s\n", __LINE__, m, \
           tjGetErrorStr2(NULL)); \
   retval = -1;  goto bailout; \
 }
 
-#define _throwtj(m) { \
+#define THROW_TJ(m) { \
   int _tjErrorCode = tjGetErrorCode(handle); \
   char *_tjErrorStr = tjGetErrorStr2(handle); \
   \
@@ -143,7 +144,7 @@ static int decomp(unsigned char *srcBuf, unsigned char **jpegBuf,
                   int subsamp, int jpegQual, char *fileName, int tilew,
                   int tileh)
 {
-  char tempStr[1024], sizeStr[20] = "\0", qualStr[6] = "\0", *ptr;
+  char tempStr[1024], sizeStr[24] = "\0", qualStr[13] = "\0", *ptr;
   FILE *file = NULL;
   tjhandle handle = NULL;
   int row, col, iter = 0, dstBufAlloc = 0, retval = 0;
@@ -156,29 +157,34 @@ static int decomp(unsigned char *srcBuf, unsigned char **jpegBuf,
   unsigned char *dstPtr, *dstPtr2, *yuvBuf = NULL;
 
   if (jpegQual > 0) {
-    snprintf(qualStr, 6, "_Q%d", jpegQual);
-    qualStr[5] = 0;
+    snprintf(qualStr, 13, "_Q%d", jpegQual);
+    qualStr[12] = 0;
   }
 
   if ((handle = tjInitDecompress()) == NULL)
-    _throwtj("executing tjInitDecompress()");
+    THROW_TJ("executing tjInitDecompress()");
 
   if (dstBuf == NULL) {
-    if ((dstBuf = (unsigned char *)malloc(pitch * scaledh)) == NULL)
-      _throwunix("allocating destination buffer");
+    if ((unsigned long long)pitch * (unsigned long long)scaledh >
+        (unsigned long long)((size_t)-1))
+      THROW("allocating destination buffer", "Image is too large");
+    if ((dstBuf = (unsigned char *)malloc((size_t)pitch * scaledh)) == NULL)
+      THROW_UNIX("allocating destination buffer");
     dstBufAlloc = 1;
   }
   /* Set the destination buffer to gray so we know whether the decompressor
      attempted to write to it */
-  memset(dstBuf, 127, pitch * scaledh);
+  memset(dstBuf, 127, (size_t)pitch * scaledh);
 
   if (doYUV) {
     int width = doTile ? tilew : scaledw;
     int height = doTile ? tileh : scaledh;
-    int yuvSize = tjBufSizeYUV2(width, yuvPad, height, subsamp);
+    unsigned long yuvSize = tjBufSizeYUV2(width, yuvPad, height, subsamp);
 
+    if (yuvSize == (unsigned long)-1)
+      THROW_TJ("allocating YUV buffer");
     if ((yuvBuf = (unsigned char *)malloc(yuvSize)) == NULL)
-      _throwunix("allocating YUV buffer");
+      THROW_UNIX("allocating YUV buffer");
     memset(yuvBuf, 127, yuvSize);
   }
 
@@ -190,7 +196,7 @@ static int decomp(unsigned char *srcBuf, unsigned char **jpegBuf,
     double start = getTime();
 
     for (row = 0, dstPtr = dstBuf; row < ntilesh;
-         row++, dstPtr += pitch * tileh) {
+         row++, dstPtr += (size_t)pitch * tileh) {
       for (col = 0, dstPtr2 = dstPtr; col < ntilesw;
            col++, tile++, dstPtr2 += ps * tilew) {
         int width = doTile ? min(tilew, w - col * tilew) : scaledw;
@@ -201,16 +207,16 @@ static int decomp(unsigned char *srcBuf, unsigned char **jpegBuf,
 
           if (tjDecompressToYUV2(handle, jpegBuf[tile], jpegSize[tile], yuvBuf,
                                  width, yuvPad, height, flags) == -1)
-            _throwtj("executing tjDecompressToYUV2()");
+            THROW_TJ("executing tjDecompressToYUV2()");
           startDecode = getTime();
           if (tjDecodeYUV(handle, yuvBuf, yuvPad, subsamp, dstPtr2, width,
                           pitch, height, pf, flags) == -1)
-            _throwtj("executing tjDecodeYUV()");
+            THROW_TJ("executing tjDecodeYUV()");
           if (iter >= 0) elapsedDecode += getTime() - startDecode;
         } else if (tjDecompress2(handle, jpegBuf[tile], jpegSize[tile],
                                  dstPtr2, width, pitch, height, pf,
                                  flags) == -1)
-          _throwtj("executing tjDecompress2()");
+          THROW_TJ("executing tjDecompress2()");
       }
     }
     elapsed += getTime() - start;
@@ -224,14 +230,14 @@ static int decomp(unsigned char *srcBuf, unsigned char **jpegBuf,
   }
   if (doYUV) elapsed -= elapsedDecode;
 
-  if (tjDestroy(handle) == -1) _throwtj("executing tjDestroy()");
+  if (tjDestroy(handle) == -1) THROW_TJ("executing tjDestroy()");
   handle = NULL;
 
   if (quiet) {
     fprintf(stderr, "%-6s%s",
             sigfig((double)(w * h) / 1000000. * (double)iter / elapsed, 4,
                    tempStr, 1024),
-            quiet == 2 ? "\n" : "  ");
+           quiet == 2 ? "\n" : "  ");
     if (doYUV)
       fprintf(stderr, "%s\n",
               sigfig((double)(w * h) / 1000000. * (double)iter / elapsedDecode,
@@ -248,17 +254,17 @@ static int decomp(unsigned char *srcBuf, unsigned char **jpegBuf,
               (double)iter / elapsedDecode);
       fprintf(stderr,
               "                  Throughput:         %f Megapixels/sec\n",
-             (double)(w * h) / 1000000. * (double)iter / elapsedDecode);
+              (double)(w * h) / 1000000. * (double)iter / elapsedDecode);
     }
   }
 
   if (!doWrite) goto bailout;
 
   if (sf.num != 1 || sf.denom != 1)
-    snprintf(sizeStr, 20, "%d_%d", sf.num, sf.denom);
+    snprintf(sizeStr, 24, "%d_%d", sf.num, sf.denom);
   else if (tilew != w || tileh != h)
-    snprintf(sizeStr, 20, "%dx%d", tilew, tileh);
-  else snprintf(sizeStr, 20, "full");
+    snprintf(sizeStr, 24, "%dx%d", tilew, tileh);
+  else snprintf(sizeStr, 24, "full");
   if (decompOnly)
     snprintf(tempStr, 1024, "%s_%s.%s", fileName, sizeStr, ext);
   else
@@ -266,19 +272,19 @@ static int decomp(unsigned char *srcBuf, unsigned char **jpegBuf,
              qualStr, sizeStr, ext);
 
   if (tjSaveImage(tempStr, dstBuf, scaledw, 0, scaledh, pf, flags) == -1)
-    _throwtjg("saving bitmap");
+    THROW_TJG("saving bitmap");
   ptr = strrchr(tempStr, '.');
   snprintf(ptr, 1024 - (ptr - tempStr), "-err.%s", ext);
   if (srcBuf && sf.num == 1 && sf.denom == 1) {
     if (!quiet) fprintf(stderr, "Compression error written to %s.\n", tempStr);
     if (subsamp == TJ_GRAYSCALE) {
-      int index, index2;
+      unsigned long index, index2;
 
       for (row = 0, index = 0; row < h; row++, index += pitch) {
         for (col = 0, index2 = index; col < w; col++, index2 += ps) {
-          int rindex = index2 + tjRedOffset[pf];
-          int gindex = index2 + tjGreenOffset[pf];
-          int bindex = index2 + tjBlueOffset[pf];
+          unsigned long rindex = index2 + tjRedOffset[pf];
+          unsigned long gindex = index2 + tjGreenOffset[pf];
+          unsigned long bindex = index2 + tjBlueOffset[pf];
           int y = (int)((double)srcBuf[rindex] * 0.299 +
                         (double)srcBuf[gindex] * 0.587 +
                         (double)srcBuf[bindex] * 0.114 + 0.5);
@@ -297,14 +303,14 @@ static int decomp(unsigned char *srcBuf, unsigned char **jpegBuf,
             abs(dstBuf[pitch * row + col] - srcBuf[pitch * row + col]);
     }
     if (tjSaveImage(tempStr, dstBuf, w, 0, h, pf, flags) == -1)
-      _throwtjg("saving bitmap");
+      THROW_TJG("saving bitmap");
   }
 
 bailout:
   if (file) fclose(file);
   if (handle) tjDestroy(handle);
-  if (dstBuf && dstBufAlloc) free(dstBuf);
-  if (yuvBuf) free(yuvBuf);
+  if (dstBufAlloc) free(dstBuf);
+  free(yuvBuf);
   return retval;
 }
 
@@ -319,14 +325,17 @@ static int fullTest(unsigned char *srcBuf, int w, int h, int subsamp,
     *srcPtr2;
   double start, elapsed, elapsedEncode;
   int totalJpegSize = 0, row, col, i, tilew = w, tileh = h, retval = 0;
-  int iter, yuvSize = 0;
-  unsigned long *jpegSize = NULL;
+  int iter;
+  unsigned long *jpegSize = NULL, yuvSize = 0;
   int ps = tjPixelSize[pf];
   int ntilesw = 1, ntilesh = 1, pitch = w * ps;
   const char *pfStr = pixFormatStr[pf];
 
-  if ((tmpBuf = (unsigned char *)malloc(pitch * h)) == NULL)
-    _throwunix("allocating temporary image buffer");
+  if ((unsigned long long)pitch * (unsigned long long)h >
+      (unsigned long long)((size_t)-1))
+    THROW("allocating temporary image buffer", "Image is too large");
+  if ((tmpBuf = (unsigned char *)malloc((size_t)pitch * h)) == NULL)
+    THROW_UNIX("allocating temporary image buffer");
 
   if (!quiet)
     fprintf(stderr, ">>>>>  %s (%s) <--> JPEG %s Q%d  <<<<<\n", pfStr,
@@ -342,18 +351,20 @@ static int fullTest(unsigned char *srcBuf, int w, int h, int subsamp,
 
     if ((jpegBuf = (unsigned char **)malloc(sizeof(unsigned char *) *
                                             ntilesw * ntilesh)) == NULL)
-      _throwunix("allocating JPEG tile array");
+      THROW_UNIX("allocating JPEG tile array");
     memset(jpegBuf, 0, sizeof(unsigned char *) * ntilesw * ntilesh);
     if ((jpegSize = (unsigned long *)malloc(sizeof(unsigned long) *
                                             ntilesw * ntilesh)) == NULL)
-      _throwunix("allocating JPEG size array");
+      THROW_UNIX("allocating JPEG size array");
     memset(jpegSize, 0, sizeof(unsigned long) * ntilesw * ntilesh);
 
     if ((flags & TJFLAG_NOREALLOC) != 0)
       for (i = 0; i < ntilesw * ntilesh; i++) {
+        if (tjBufSize(tilew, tileh, subsamp) > (unsigned long)INT_MAX)
+          THROW("getting buffer size", "Image is too large");
         if ((jpegBuf[i] = (unsigned char *)
                           tjAlloc(tjBufSize(tilew, tileh, subsamp))) == NULL)
-          _throwunix("allocating JPEG tiles");
+          THROW_UNIX("allocating JPEG tiles");
       }
 
     /* Compression test */
@@ -364,12 +375,14 @@ static int fullTest(unsigned char *srcBuf, int w, int h, int subsamp,
     for (i = 0; i < h; i++)
       memcpy(&tmpBuf[pitch * i], &srcBuf[w * ps * i], w * ps);
     if ((handle = tjInitCompress()) == NULL)
-      _throwtj("executing tjInitCompress()");
+      THROW_TJ("executing tjInitCompress()");
 
     if (doYUV) {
       yuvSize = tjBufSizeYUV2(tilew, yuvPad, tileh, subsamp);
+      if (yuvSize == (unsigned long)-1)
+        THROW_TJ("allocating YUV buffer");
       if ((yuvBuf = (unsigned char *)malloc(yuvSize)) == NULL)
-        _throwunix("allocating YUV buffer");
+        THROW_UNIX("allocating YUV buffer");
       memset(yuvBuf, 127, yuvSize);
     }
 
@@ -393,17 +406,17 @@ static int fullTest(unsigned char *srcBuf, int w, int h, int subsamp,
 
             if (tjEncodeYUV3(handle, srcPtr2, width, pitch, height, pf, yuvBuf,
                              yuvPad, subsamp, flags) == -1)
-              _throwtj("executing tjEncodeYUV3()");
+              THROW_TJ("executing tjEncodeYUV3()");
             if (iter >= 0) elapsedEncode += getTime() - startEncode;
             if (tjCompressFromYUV(handle, yuvBuf, width, yuvPad, height,
                                   subsamp, &jpegBuf[tile], &jpegSize[tile],
                                   jpegQual, flags) == -1)
-              _throwtj("executing tjCompressFromYUV()");
+              THROW_TJ("executing tjCompressFromYUV()");
           } else {
             if (tjCompress2(handle, srcPtr2, width, pitch, height, pf,
                             &jpegBuf[tile], &jpegSize[tile], subsamp, jpegQual,
                             flags) == -1)
-              _throwtj("executing tjCompress2()");
+              THROW_TJ("executing tjCompress2()");
           }
           totalJpegSize += jpegSize[tile];
         }
@@ -419,7 +432,7 @@ static int fullTest(unsigned char *srcBuf, int w, int h, int subsamp,
     }
     if (doYUV) elapsed -= elapsedEncode;
 
-    if (tjDestroy(handle) == -1) _throwtj("executing tjDestroy()");
+    if (tjDestroy(handle) == -1) THROW_TJ("executing tjDestroy()");
     handle = NULL;
 
     if (quiet == 1) fprintf(stderr, "%-5d  %-5d   ", tilew, tileh);
@@ -430,9 +443,9 @@ static int fullTest(unsigned char *srcBuf, int w, int h, int subsamp,
                        (double)iter / elapsedEncode, 4, tempStr, 1024),
                 quiet == 2 ? "\n" : "  ");
       fprintf(stderr, "%-6s%s",
-             sigfig((double)(w * h) / 1000000. * (double)iter / elapsed, 4,
-                    tempStr, 1024),
-             quiet == 2 ? "\n" : "  ");
+              sigfig((double)(w * h) / 1000000. * (double)iter / elapsed, 4,
+                     tempStr, 1024),
+              quiet == 2 ? "\n" : "  ");
       fprintf(stderr, "%-6s%s",
               sigfig((double)(w * h * ps) / (double)totalJpegSize, 4, tempStr2,
                      80),
@@ -443,7 +456,7 @@ static int fullTest(unsigned char *srcBuf, int w, int h, int subsamp,
       if (doYUV) {
         fprintf(stderr, "Encode YUV    --> Frame rate:         %f fps\n",
                 (double)iter / elapsedEncode);
-        fprintf(stderr, "                  Output image size:  %d bytes\n",
+        fprintf(stderr, "                  Output image size:  %lu bytes\n",
                 yuvSize);
         fprintf(stderr, "                  Compression ratio:  %f:1\n",
                 (double)(w * h * ps) / (double)yuvSize);
@@ -452,8 +465,7 @@ static int fullTest(unsigned char *srcBuf, int w, int h, int subsamp,
                 (double)(w * h) / 1000000. * (double)iter / elapsedEncode);
         fprintf(stderr,
                 "                  Output bit stream:  %f Megabits/sec\n",
-                (double)yuvSize * 8. / 1000000. * (double)iter / elapsedEncode
-               );
+                (double)yuvSize * 8. / 1000000. * (double)iter / elapsedEncode);
       }
       fprintf(stderr, "%s --> Frame rate:         %f fps\n",
               doYUV ? "Comp from YUV" : "Compress     ",
@@ -473,9 +485,9 @@ static int fullTest(unsigned char *srcBuf, int w, int h, int subsamp,
       snprintf(tempStr, 1024, "%s_%s_Q%d.jpg", fileName, subName[subsamp],
                jpegQual);
       if ((file = fopen(tempStr, "wb")) == NULL)
-        _throwunix("opening reference image");
+        THROW_UNIX("opening reference image");
       if (fwrite(jpegBuf[0], jpegSize[0], 1, file) != 1)
-        _throwunix("writing reference image");
+        THROW_UNIX("writing reference image");
       fclose(file);  file = NULL;
       if (!quiet) fprintf(stderr, "Reference image written to %s\n", tempStr);
     }
@@ -485,10 +497,10 @@ static int fullTest(unsigned char *srcBuf, int w, int h, int subsamp,
       if (decomp(srcBuf, jpegBuf, jpegSize, tmpBuf, w, h, subsamp, jpegQual,
                  fileName, tilew, tileh) == -1)
         goto bailout;
-    }
+    } else if (quiet == 1) fprintf(stderr, "N/A\n");
 
     for (i = 0; i < ntilesw * ntilesh; i++) {
-      if (jpegBuf[i]) tjFree(jpegBuf[i]);
+      tjFree(jpegBuf[i]);
       jpegBuf[i] = NULL;
     }
     free(jpegBuf);  jpegBuf = NULL;
@@ -501,18 +513,16 @@ static int fullTest(unsigned char *srcBuf, int w, int h, int subsamp,
   }
 
 bailout:
-  if (file) { fclose(file);  file = NULL; }
+  if (file) fclose(file);
   if (jpegBuf) {
-    for (i = 0; i < ntilesw * ntilesh; i++) {
-      if (jpegBuf[i]) tjFree(jpegBuf[i]);
-      jpegBuf[i] = NULL;
-    }
-    free(jpegBuf);  jpegBuf = NULL;
+    for (i = 0; i < ntilesw * ntilesh; i++)
+      tjFree(jpegBuf[i]);
   }
-  if (yuvBuf) { free(yuvBuf);  yuvBuf = NULL; }
-  if (jpegSize) { free(jpegSize);  jpegSize = NULL; }
-  if (tmpBuf) { free(tmpBuf);  tmpBuf = NULL; }
-  if (handle) { tjDestroy(handle);  handle = NULL; }
+  free(jpegBuf);
+  free(yuvBuf);
+  free(jpegSize);
+  free(tmpBuf);
+  if (handle) tjDestroy(handle);
   return retval;
 }
 
@@ -534,26 +544,28 @@ static int decompTest(char *fileName)
   int tw, th, ttilew, ttileh, tntilesw, tntilesh, tsubsamp;
 
   if ((file = fopen(fileName, "rb")) == NULL)
-    _throwunix("opening file");
+    THROW_UNIX("opening file");
   if (fseek(file, 0, SEEK_END) < 0 ||
       (srcSize = ftell(file)) == (unsigned long)-1)
-    _throwunix("determining file size");
+    THROW_UNIX("determining file size");
   if ((srcBuf = (unsigned char *)malloc(srcSize)) == NULL)
-    _throwunix("allocating memory");
+    THROW_UNIX("allocating memory");
   if (fseek(file, 0, SEEK_SET) < 0)
-    _throwunix("setting file position");
+    THROW_UNIX("setting file position");
   if (fread(srcBuf, srcSize, 1, file) < 1)
-    _throwunix("reading JPEG data");
+    THROW_UNIX("reading JPEG data");
   fclose(file);  file = NULL;
 
   temp = strrchr(fileName, '.');
   if (temp != NULL) *temp = '\0';
 
   if ((handle = tjInitTransform()) == NULL)
-    _throwtj("executing tjInitTransform()");
+    THROW_TJ("executing tjInitTransform()");
   if (tjDecompressHeader3(handle, srcBuf, srcSize, &w, &h, &subsamp,
                           &cs) == -1)
-    _throwtj("executing tjDecompressHeader3()");
+    THROW_TJ("executing tjDecompressHeader3()");
+  if (w < 1 || h < 1)
+    THROW("reading JPEG header", "Invalid image dimensions");
   if (cs == TJCS_YCCK || cs == TJCS_CMYK) {
     pf = TJPF_CMYK;  ps = tjPixelSize[pf];
   }
@@ -583,18 +595,21 @@ static int decompTest(char *fileName)
 
     if ((jpegBuf = (unsigned char **)malloc(sizeof(unsigned char *) *
                                             ntilesw * ntilesh)) == NULL)
-      _throwunix("allocating JPEG tile array");
+      THROW_UNIX("allocating JPEG tile array");
     memset(jpegBuf, 0, sizeof(unsigned char *) * ntilesw * ntilesh);
     if ((jpegSize = (unsigned long *)malloc(sizeof(unsigned long) *
                                             ntilesw * ntilesh)) == NULL)
-      _throwunix("allocating JPEG size array");
+      THROW_UNIX("allocating JPEG size array");
     memset(jpegSize, 0, sizeof(unsigned long) * ntilesw * ntilesh);
 
-    if ((flags & TJFLAG_NOREALLOC) != 0 || !doTile)
+    if ((flags & TJFLAG_NOREALLOC) != 0 &&
+        (doTile || xformOp != TJXOP_NONE || xformOpt != 0 || customFilter))
       for (i = 0; i < ntilesw * ntilesh; i++) {
+        if (tjBufSize(tilew, tileh, subsamp) > (unsigned long)INT_MAX)
+          THROW("getting buffer size", "Image is too large");
         if ((jpegBuf[i] = (unsigned char *)
                           tjAlloc(tjBufSize(tilew, tileh, subsamp))) == NULL)
-          _throwunix("allocating JPEG tiles");
+          THROW_UNIX("allocating JPEG tiles");
       }
 
     tw = w;  th = h;  ttilew = tilew;  ttileh = tileh;
@@ -615,7 +630,7 @@ static int decompTest(char *fileName)
     if (doTile || xformOp != TJXOP_NONE || xformOpt != 0 || customFilter) {
       if ((t = (tjtransform *)malloc(sizeof(tjtransform) * ntilesw *
                                      ntilesh)) == NULL)
-        _throwunix("allocating image transform array");
+        THROW_UNIX("allocating image transform array");
 
       if (xformOp == TJXOP_TRANSPOSE || xformOp == TJXOP_TRANSVERSE ||
           xformOp == TJXOP_ROT90 || xformOp == TJXOP_ROT270) {
@@ -661,7 +676,7 @@ static int decompTest(char *fileName)
         start = getTime();
         if (tjTransform(handle, srcBuf, srcSize, tntilesw * tntilesh, jpegBuf,
                         jpegSize, t, flags) == -1)
-          _throwtj("executing tjTransform()");
+          THROW_TJ("executing tjTransform()");
         elapsed += getTime() - start;
         if (iter >= 0) {
           iter++;
@@ -715,26 +730,25 @@ static int decompTest(char *fileName)
     } else if (quiet == 1) fprintf(stderr, "N/A\n");
 
     for (i = 0; i < ntilesw * ntilesh; i++) {
-      tjFree(jpegBuf[i]);  jpegBuf[i] = NULL;
+      tjFree(jpegBuf[i]);
+      jpegBuf[i] = NULL;
     }
     free(jpegBuf);  jpegBuf = NULL;
-    if (jpegSize) { free(jpegSize);  jpegSize = NULL; }
+    free(jpegSize);  jpegSize = NULL;
 
     if (tilew == w && tileh == h) break;
   }
 
 bailout:
-  if (file) { fclose(file);  file = NULL; }
+  if (file) fclose(file);
   if (jpegBuf) {
-    for (i = 0; i < ntilesw * ntilesh; i++) {
-      if (jpegBuf[i]) tjFree(jpegBuf[i]);
-      jpegBuf[i] = NULL;
-    }
-    free(jpegBuf);  jpegBuf = NULL;
+    for (i = 0; i < ntilesw * ntilesh; i++)
+      tjFree(jpegBuf[i]);
   }
-  if (jpegSize) { free(jpegSize);  jpegSize = NULL; }
-  if (srcBuf) { free(srcBuf);  srcBuf = NULL; }
-  if (t) { free(t);  t = NULL; }
+  free(jpegBuf);
+  free(jpegSize);
+  free(srcBuf);
+  free(t);
   if (handle) { tjDestroy(handle);  handle = NULL; }
   return retval;
 }
@@ -810,7 +824,6 @@ static void usage(char *progName)
   exit(1);
 }
 
-
 #ifndef GTEST
 int main(int argc, char *argv[])
 #else
@@ -823,7 +836,7 @@ int tjbench(int argc, char *argv[])
   int minArg = 2, retval = 0, subsamp = -1;
 
   if ((scalingFactors = tjGetScalingFactors(&nsf)) == NULL || nsf == 0)
-    _throw("executing tjGetScalingFactors()", tjGetErrorStr());
+    THROW("executing tjGetScalingFactors()", tjGetErrorStr());
 
   if (argc < minArg) usage(argv[0]);
 
@@ -922,14 +935,14 @@ int tjbench(int argc, char *argv[])
       else if (!strcasecmp(argv[i], "-copynone"))
         xformOpt |= TJXOPT_COPYNONE;
       else if (!strcasecmp(argv[i], "-benchtime") && i < argc - 1) {
-        double temp = atof(argv[++i]);
+        double tempd = atof(argv[++i]);
 
-        if (temp > 0.0) benchTime = temp;
+        if (tempd > 0.0) benchTime = tempd;
         else usage(argv[0]);
       } else if (!strcasecmp(argv[i], "-warmup") && i < argc - 1) {
-        double temp = atof(argv[++i]);
+        double tempd = atof(argv[++i]);
 
-        if (temp >= 0.0) warmup = temp;
+        if (tempd >= 0.0) warmup = tempd;
         else usage(argv[0]);
         fprintf(stderr, "Warmup time = %.1f seconds\n\n", warmup);
       } else if (!strcasecmp(argv[i], "-alloc"))
@@ -940,16 +953,16 @@ int tjbench(int argc, char *argv[])
         fprintf(stderr, "Testing YUV planar encoding/decoding\n\n");
         doYUV = 1;
       } else if (!strcasecmp(argv[i], "-yuvpad") && i < argc - 1) {
-        int temp = atoi(argv[++i]);
+        int tempi = atoi(argv[++i]);
 
-        if (temp >= 1) yuvPad = temp;
+        if (tempi >= 1) yuvPad = tempi;
       } else if (!strcasecmp(argv[i], "-subsamp") && i < argc - 1) {
         i++;
         if (toupper(argv[i][0]) == 'G') subsamp = TJSAMP_GRAY;
         else {
-          int temp = atoi(argv[i]);
+          int tempi = atoi(argv[i]);
 
-          switch (temp) {
+          switch (tempi) {
           case 444:  subsamp = TJSAMP_444;  break;
           case 422:  subsamp = TJSAMP_422;  break;
           case 440:  subsamp = TJSAMP_440;  break;
@@ -981,7 +994,7 @@ int tjbench(int argc, char *argv[])
 
   if (!decompOnly) {
     if ((srcBuf = tjLoadImage(argv[1], &w, 1, &h, &pf, flags)) == NULL)
-      _throwtjg("loading bitmap");
+      THROW_TJG("loading bitmap");
     temp = strrchr(argv[1], '.');
     if (temp != NULL) *temp = '\0';
   }
@@ -1028,6 +1041,6 @@ int tjbench(int argc, char *argv[])
   }
 
 bailout:
-  if (srcBuf) tjFree(srcBuf);
+  tjFree(srcBuf);
   return retval;
 }
diff --git a/tjunittest.c b/tjunittest.c
index 6235b6f..af409a5 100644
--- a/tjunittest.c
+++ b/tjunittest.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C)2009-2014, 2017-2018 D. R. Commander.  All Rights Reserved.
+ * Copyright (C)2009-2014, 2017-2019 D. R. Commander.  All Rights Reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -61,16 +61,16 @@ static void usage(char *progName)
 #endif
 
 
-#define _throwtj() { \
+#define THROW_TJ() { \
   fprintf(stderr, "TurboJPEG ERROR:\n%s\n", tjGetErrorStr()); \
-  bailout() \
+  BAILOUT() \
 }
-#define _tj(f) { if ((f) == -1) _throwtj(); }
-#define _throw(m) { fprintf(stderr, "ERROR: %s\n", m);  bailout() }
-#define _throwmd5(filename, md5sum, ref) { \
+#define TRY_TJ(f) { if ((f) == -1) THROW_TJ(); }
+#define THROW(m) { printf("ERROR: %s\n", m);  BAILOUT() }
+#define THROW_MD5(filename, md5sum, ref) { \
   fprintf(stderr, "\n%s has an MD5 sum of %s.\n   Should be %s.\n", filename, \
           md5sum, ref); \
-  bailout() \
+  BAILOUT() \
 }
 
 static const char *subNameLong[TJ_NUMSAMP] = {
@@ -80,7 +80,7 @@ static const char *subName[TJ_NUMSAMP] = {
   "444", "422", "420", "GRAY", "440", "411"
 };
 
-const char *pixFormatStr[TJ_NUMPF] = {
+static const char *pixFormatStr[TJ_NUMPF] = {
   "RGB", "BGR", "RGBX", "BGRX", "XBGR", "XRGB", "Grayscale",
   "RGBA", "BGRA", "ABGR", "ARGB", "CMYK"
 };
@@ -95,7 +95,7 @@ static const int _onlyRGB[] = { TJPF_RGB };
 static int doYUV = 0, alloc = 0, pad = 4;
 
 static int exitStatus = 0;
-#define bailout() { exitStatus = -1;  goto bailout; }
+#define BAILOUT() { exitStatus = -1;  goto bailout; }
 
 static const size_t filePathSize = 1024;
 
@@ -155,7 +155,7 @@ static void initBuf(unsigned char *buf, int w, int h, int pf, int flags)
 }
 
 
-#define checkval(v, cv) { \
+#define CHECKVAL(v, cv) { \
   if (v < cv - 1 || v > cv + 1) { \
     fprintf(stderr, "\nComp. %s at %d,%d should be %d, not %d\n", #v, row, \
             col, cv, v); \
@@ -163,7 +163,7 @@ static void initBuf(unsigned char *buf, int w, int h, int pf, int flags)
   } \
 }
 
-#define checkval0(v) { \
+#define CHECKVAL0(v) { \
   if (v > 1) { \
     fprintf(stderr, "\nComp. %s at %d,%d should be 0, not %d\n", #v, row, \
             col, v); \
@@ -171,7 +171,7 @@ static void initBuf(unsigned char *buf, int w, int h, int pf, int flags)
   } \
 }
 
-#define checkval255(v) { \
+#define CHECKVAL255(v) { \
   if (v < 254) { \
     fprintf(stderr, "\nComp. %s at %d,%d should be 255, not %d\n", #v, row, \
             col, v); \
@@ -206,13 +206,13 @@ static int checkBuf(unsigned char *buf, int w, int h, int pf, int subsamp,
         y = buf[index * ps + 2];
         k = buf[index * ps + 3];
         if (((row / blocksize) + (col / blocksize)) % 2 == 0) {
-          checkval255(c);  checkval255(m);  checkval255(y);
-          if (row < halfway) checkval255(k)
-          else checkval0(k)
+          CHECKVAL255(c);  CHECKVAL255(m);  CHECKVAL255(y);
+          if (row < halfway) CHECKVAL255(k)
+          else CHECKVAL0(k)
         } else {
-          checkval255(c);  checkval0(y);  checkval255(k);
-          if (row < halfway) checkval0(m)
-          else checkval255(m)
+          CHECKVAL255(c);  CHECKVAL0(y);  CHECKVAL255(k);
+          if (row < halfway) CHECKVAL0(m)
+          else CHECKVAL255(m)
         }
       }
     }
@@ -231,26 +231,26 @@ static int checkBuf(unsigned char *buf, int w, int h, int pf, int subsamp,
       a = aoffset >= 0 ? buf[index * ps + aoffset] : 0xFF;
       if (((row / blocksize) + (col / blocksize)) % 2 == 0) {
         if (row < halfway) {
-          checkval255(r);  checkval255(g);  checkval255(b);
+          CHECKVAL255(r);  CHECKVAL255(g);  CHECKVAL255(b);
         } else {
-          checkval0(r);  checkval0(g);  checkval0(b);
+          CHECKVAL0(r);  CHECKVAL0(g);  CHECKVAL0(b);
         }
       } else {
         if (subsamp == TJSAMP_GRAY) {
           if (row < halfway) {
-            checkval(r, 76);  checkval(g, 76);  checkval(b, 76);
+            CHECKVAL(r, 76);  CHECKVAL(g, 76);  CHECKVAL(b, 76);
           } else {
-            checkval(r, 226);  checkval(g, 226);  checkval(b, 226);
+            CHECKVAL(r, 226);  CHECKVAL(g, 226);  CHECKVAL(b, 226);
           }
         } else {
           if (row < halfway) {
-            checkval255(r);  checkval0(g);  checkval0(b);
+            CHECKVAL255(r);  CHECKVAL0(g);  CHECKVAL0(b);
           } else {
-            checkval255(r);  checkval255(g);  checkval0(b);
+            CHECKVAL255(r);  CHECKVAL255(g);  CHECKVAL0(b);
           }
         }
       }
-      checkval255(a);
+      CHECKVAL255(a);
     }
   }
 
@@ -263,7 +263,8 @@ bailout:
                   buf[(row * w + col) * ps + 1], buf[(row * w + col) * ps + 2],
                   buf[(row * w + col) * ps + 3]);
         else
-          fprintf(stderr, "%.3d/%.3d/%.3d ", buf[(row * w + col) * ps + roffset],
+          fprintf(stderr, "%.3d/%.3d/%.3d ",
+                  buf[(row * w + col) * ps + roffset],
                   buf[(row * w + col) * ps + goffset],
                   buf[(row * w + col) * ps + boffset]);
       }
@@ -293,16 +294,16 @@ static int checkBufYUV(unsigned char *buf, int w, int h, int subsamp,
       unsigned char y = buf[ypitch * row + col];
 
       if (((row / blocksize) + (col / blocksize)) % 2 == 0) {
-        if (row < halfway) checkval255(y)
-        else checkval0(y);
+        if (row < halfway) CHECKVAL255(y)
+        else CHECKVAL0(y);
       } else {
-        if (row < halfway) checkval(y, 76)
-        else checkval(y, 226);
+        if (row < halfway) CHECKVAL(y, 76)
+        else CHECKVAL(y, 226);
       }
     }
   }
   if (subsamp != TJSAMP_GRAY) {
-    int halfway = 16 / vsf * sf.num / sf.denom;
+    halfway = 16 / vsf * sf.num / sf.denom;
 
     for (row = 0; row < ch; row++) {
       for (col = 0; col < cw; col++) {
@@ -310,12 +311,12 @@ static int checkBufYUV(unsigned char *buf, int w, int h, int subsamp,
           v = buf[ypitch * ph + uvpitch * ch + (uvpitch * row + col)];
 
         if (((row * vsf / blocksize) + (col * hsf / blocksize)) % 2 == 0) {
-          checkval(u, 128);  checkval(v, 128);
+          CHECKVAL(u, 128);  CHECKVAL(v, 128);
         } else {
           if (row < halfway) {
-            checkval(u, 85);  checkval255(v);
+            CHECKVAL(u, 85);  CHECKVAL255(v);
           } else {
-            checkval0(u);  checkval(v, 149);
+            CHECKVAL0(u);  CHECKVAL(v, 149);
           }
         }
       }
@@ -361,7 +362,7 @@ static void writeJPEG(unsigned char *jpegBuf, unsigned long jpegSize,
   if (!file || fwrite(jpegBuf, jpegSize, 1, file) != 1) {
     fprintf(stderr, "ERROR: Could not write to %s.\n%s\n", filename,
             strerror(errno));
-    bailout()
+    BAILOUT()
   }
 
 bailout:
@@ -381,7 +382,7 @@ static void compTest(tjhandle handle, unsigned char **dstBuf,
   const char *buStr = (flags & TJFLAG_BOTTOMUP) ? "BU" : "TD";
 
   if ((srcBuf = (unsigned char *)malloc(w * h * tjPixelSize[pf])) == NULL)
-    _throw("Memory allocation failure");
+    THROW("Memory allocation failure");
   initBuf(srcBuf, w, h, pf, flags);
 
   if (*dstBuf && *dstSize > 0) memset(*dstBuf, 0, *dstSize);
@@ -392,39 +393,39 @@ static void compTest(tjhandle handle, unsigned char **dstBuf,
     tjscalingfactor sf = { 1, 1 };
     tjhandle handle2 = tjInitCompress();
 
-    if (!handle2) _throwtj();
+    if (!handle2) THROW_TJ();
 
     if ((yuvBuf = (unsigned char *)malloc(yuvSize)) == NULL)
-      _throw("Memory allocation failure");
+      THROW("Memory allocation failure");
     memset(yuvBuf, 0, yuvSize);
 
     fprintf(stderr, "%s %s -> YUV %s ... ", pfStr, buStrLong,
             subNameLong[subsamp]);
-    _tj(tjEncodeYUV3(handle2, srcBuf, w, 0, h, pf, yuvBuf, pad, subsamp,
-                     flags));
+    TRY_TJ(tjEncodeYUV3(handle2, srcBuf, w, 0, h, pf, yuvBuf, pad, subsamp,
+                        flags));
     tjDestroy(handle2);
     if (checkBufYUV(yuvBuf, w, h, subsamp, sf)) fprintf(stderr, "Passed.\n");
     else fprintf(stderr, "FAILED!\n");
 
-    fprintf(stderr, "YUV %s %s -> JPEG Q%d ... ", subNameLong[subsamp], buStrLong,
-            jpegQual);
-    _tj(tjCompressFromYUV(handle, yuvBuf, w, pad, h, subsamp, dstBuf, dstSize,
-                          jpegQual, flags));
+    fprintf(stderr, "YUV %s %s -> JPEG Q%d ... ", subNameLong[subsamp],
+            buStrLong, jpegQual);
+    TRY_TJ(tjCompressFromYUV(handle, yuvBuf, w, pad, h, subsamp, dstBuf,
+                             dstSize, jpegQual, flags));
   } else {
     fprintf(stderr, "%s %s -> %s Q%d ... ", pfStr, buStrLong,
             subNameLong[subsamp], jpegQual);
-    _tj(tjCompress2(handle, srcBuf, w, 0, h, pf, dstBuf, dstSize, subsamp,
-                    jpegQual, flags));
+    TRY_TJ(tjCompress2(handle, srcBuf, w, 0, h, pf, dstBuf, dstSize, subsamp,
+                       jpegQual, flags));
   }
 
-  snprintf(tempStr, filePathSize, "%s_enc_%s_%s_%s_Q%d.jpg", basename, pfStr, buStr,
-           subName[subsamp], jpegQual);
+  snprintf(tempStr, filePathSize, "%s_enc_%s_%s_%s_Q%d.jpg", basename, pfStr,
+           buStr, subName[subsamp], jpegQual);
   writeJPEG(*dstBuf, *dstSize, tempStr);
   fprintf(stderr, "Done.\n  Result in %s\n", tempStr);
 
 bailout:
-  if (yuvBuf) free(yuvBuf);
-  if (srcBuf) free(srcBuf);
+  free(yuvBuf);
+  free(srcBuf);
 }
 
 
@@ -439,14 +440,14 @@ static void _decompTest(tjhandle handle, unsigned char *jpegBuf,
   int scaledHeight = TJSCALED(h, sf);
   unsigned long dstSize = 0;
 
-  _tj(tjDecompressHeader2(handle, jpegBuf, jpegSize, &_hdrw, &_hdrh,
-                          &_hdrsubsamp));
+  TRY_TJ(tjDecompressHeader2(handle, jpegBuf, jpegSize, &_hdrw, &_hdrh,
+                             &_hdrsubsamp));
   if (_hdrw != w || _hdrh != h || _hdrsubsamp != subsamp)
-    _throw("Incorrect JPEG header");
+    THROW("Incorrect JPEG header");
 
   dstSize = scaledWidth * scaledHeight * tjPixelSize[pf];
   if ((dstBuf = (unsigned char *)malloc(dstSize)) == NULL)
-    _throw("Memory allocation failure");
+    THROW("Memory allocation failure");
   memset(dstBuf, 0, dstSize);
 
   if (doYUV) {
@@ -454,18 +455,18 @@ static void _decompTest(tjhandle handle, unsigned char *jpegBuf,
                                           subsamp);
     tjhandle handle2 = tjInitDecompress();
 
-    if (!handle2) _throwtj();
+    if (!handle2) THROW_TJ();
 
     if ((yuvBuf = (unsigned char *)malloc(yuvSize)) == NULL)
-      _throw("Memory allocation failure");
+      THROW("Memory allocation failure");
     memset(yuvBuf, 0, yuvSize);
 
     fprintf(stderr, "JPEG -> YUV %s ", subNameLong[subsamp]);
     if (sf.num != 1 || sf.denom != 1)
       fprintf(stderr, "%d/%d ... ", sf.num, sf.denom);
     else fprintf(stderr, "... ");
-    _tj(tjDecompressToYUV2(handle, jpegBuf, jpegSize, yuvBuf, scaledWidth, pad,
-                           scaledHeight, flags));
+    TRY_TJ(tjDecompressToYUV2(handle, jpegBuf, jpegSize, yuvBuf, scaledWidth,
+                              pad, scaledHeight, flags));
     if (checkBufYUV(yuvBuf, scaledWidth, scaledHeight, subsamp, sf))
       fprintf(stderr, "Passed.\n");
     else fprintf(stderr, "FAILED!\n");
@@ -473,8 +474,8 @@ static void _decompTest(tjhandle handle, unsigned char *jpegBuf,
     fprintf(stderr, "YUV %s -> %s %s ... ", subNameLong[subsamp],
             pixFormatStr[pf],
             (flags & TJFLAG_BOTTOMUP) ? "Bottom-Up" : "Top-Down ");
-    _tj(tjDecodeYUV(handle2, yuvBuf, pad, subsamp, dstBuf, scaledWidth, 0,
-                    scaledHeight, pf, flags));
+    TRY_TJ(tjDecodeYUV(handle2, yuvBuf, pad, subsamp, dstBuf, scaledWidth, 0,
+                       scaledHeight, pf, flags));
     tjDestroy(handle2);
   } else {
     fprintf(stderr, "JPEG -> %s %s ", pixFormatStr[pf],
@@ -482,8 +483,8 @@ static void _decompTest(tjhandle handle, unsigned char *jpegBuf,
     if (sf.num != 1 || sf.denom != 1)
       fprintf(stderr, "%d/%d ... ", sf.num, sf.denom);
     else fprintf(stderr, "... ");
-    _tj(tjDecompress2(handle, jpegBuf, jpegSize, dstBuf, scaledWidth, 0,
-                      scaledHeight, pf, flags));
+    TRY_TJ(tjDecompress2(handle, jpegBuf, jpegSize, dstBuf, scaledWidth, 0,
+                         scaledHeight, pf, flags));
   }
 
   if (checkBuf(dstBuf, scaledWidth, scaledHeight, pf, subsamp, sf, flags))
@@ -492,8 +493,8 @@ static void _decompTest(tjhandle handle, unsigned char *jpegBuf,
   fprintf(stderr, "\n");
 
 bailout:
-  if (yuvBuf) free(yuvBuf);
-  if (dstBuf) free(dstBuf);
+  free(yuvBuf);
+  free(dstBuf);
 }
 
 
@@ -504,7 +505,7 @@ static void decompTest(tjhandle handle, unsigned char *jpegBuf,
   int i, n = 0;
   tjscalingfactor *sf = tjGetScalingFactors(&n);
 
-  if (!sf || !n) _throwtj();
+  if (!sf || !n) THROW_TJ();
 
   for (i = 0; i < n; i++) {
     if (subsamp == TJSAMP_444 || subsamp == TJSAMP_GRAY ||
@@ -533,11 +534,11 @@ static void doTest(int w, int h, const int *formats, int nformats, int subsamp,
     size = tjBufSize(w, h, subsamp);
   if (size != 0)
     if ((dstBuf = (unsigned char *)tjAlloc(size)) == NULL)
-      _throw("Memory allocation failure.");
+      THROW("Memory allocation failure.");
 
   if ((chandle = tjInitCompress()) == NULL ||
       (dhandle = tjInitDecompress()) == NULL)
-    _throwtj();
+    THROW_TJ();
 
   for (pfi = 0; pfi < nformats; pfi++) {
     for (i = 0; i < 2; i++) {
@@ -564,10 +565,48 @@ static void doTest(int w, int h, const int *formats, int nformats, int subsamp,
 bailout:
   if (chandle) tjDestroy(chandle);
   if (dhandle) tjDestroy(dhandle);
-  if (dstBuf) tjFree(dstBuf);
+  tjFree(dstBuf);
 }
 
 
+#if SIZEOF_SIZE_T == 8
+#define CHECKSIZE(function) { \
+  if ((unsigned long long)size < (unsigned long long)0xFFFFFFFF) \
+    THROW(#function " overflow"); \
+}
+#else
+#define CHECKSIZE(function) { \
+  if (size != (unsigned long)(-1) || \
+      !strcmp(tjGetErrorStr2(NULL), "No error")) \
+    THROW(#function " overflow"); \
+}
+#endif
+
+#ifndef GTEST
+static void overflowTest(void)
+{
+  /* Ensure that the various buffer size functions don't overflow */
+  unsigned long size;
+
+  size = tjBufSize(26755, 26755, TJSAMP_444);
+  CHECKSIZE(tjBufSize());
+  size = TJBUFSIZE(26755, 26755);
+  CHECKSIZE(TJBUFSIZE());
+  size = tjBufSizeYUV2(37838, 1, 37838, TJSAMP_444);
+  CHECKSIZE(tjBufSizeYUV2());
+  size = TJBUFSIZEYUV(37838, 37838, TJSAMP_444);
+  CHECKSIZE(TJBUFSIZEYUV());
+  size = tjBufSizeYUV(37838, 37838, TJSAMP_444);
+  CHECKSIZE(tjBufSizeYUV());
+  size = tjPlaneSizeYUV(0, 65536, 0, 65536, TJSAMP_444);
+  CHECKSIZE(tjPlaneSizeYUV());
+
+bailout:
+  return;
+}
+#endif
+
+
 static void bufSizeTest(void)
 {
   int w, h, i, subsamp;
@@ -575,7 +614,7 @@ static void bufSizeTest(void)
   tjhandle handle = NULL;
   unsigned long dstSize = 0;
 
-  if ((handle = tjInitCompress()) == NULL) _throwtj();
+  if ((handle = tjInitCompress()) == NULL) THROW_TJ();
 
   fprintf(stderr, "Buffer size regression test\n");
   for (subsamp = 0; subsamp < TJ_NUMSAMP; subsamp++) {
@@ -586,12 +625,12 @@ static void bufSizeTest(void)
         if (h % 100 == 0)
           fprintf(stderr, "%.4d x %.4d\b\b\b\b\b\b\b\b\b\b\b", w, h);
         if ((srcBuf = (unsigned char *)malloc(w * h * 4)) == NULL)
-          _throw("Memory allocation failure");
+          THROW("Memory allocation failure");
         if (!alloc || doYUV) {
           if (doYUV) dstSize = tjBufSizeYUV2(w, pad, h, subsamp);
           else dstSize = tjBufSize(w, h, subsamp);
           if ((dstBuf = (unsigned char *)tjAlloc(dstSize)) == NULL)
-            _throw("Memory allocation failure");
+            THROW("Memory allocation failure");
         }
 
         for (i = 0; i < w * h * 4; i++) {
@@ -600,12 +639,12 @@ static void bufSizeTest(void)
         }
 
         if (doYUV) {
-          _tj(tjEncodeYUV3(handle, srcBuf, w, 0, h, TJPF_BGRX, dstBuf, pad,
-                           subsamp, 0));
+          TRY_TJ(tjEncodeYUV3(handle, srcBuf, w, 0, h, TJPF_BGRX, dstBuf, pad,
+                              subsamp, 0));
         } else {
-          _tj(tjCompress2(handle, srcBuf, w, 0, h, TJPF_BGRX, &dstBuf,
-                          &dstSize, subsamp, 100,
-                          alloc ? 0 : TJFLAG_NOREALLOC));
+          TRY_TJ(tjCompress2(handle, srcBuf, w, 0, h, TJPF_BGRX, &dstBuf,
+                             &dstSize, subsamp, 100,
+                             alloc ? 0 : TJFLAG_NOREALLOC));
         }
         free(srcBuf);  srcBuf = NULL;
         if (!alloc || doYUV) {
@@ -613,12 +652,12 @@ static void bufSizeTest(void)
         }
 
         if ((srcBuf = (unsigned char *)malloc(h * w * 4)) == NULL)
-          _throw("Memory allocation failure");
+          THROW("Memory allocation failure");
         if (!alloc || doYUV) {
           if (doYUV) dstSize = tjBufSizeYUV2(h, pad, w, subsamp);
           else dstSize = tjBufSize(h, w, subsamp);
           if ((dstBuf = (unsigned char *)tjAlloc(dstSize)) == NULL)
-            _throw("Memory allocation failure");
+            THROW("Memory allocation failure");
         }
 
         for (i = 0; i < h * w * 4; i++) {
@@ -627,12 +666,12 @@ static void bufSizeTest(void)
         }
 
         if (doYUV) {
-          _tj(tjEncodeYUV3(handle, srcBuf, h, 0, w, TJPF_BGRX, dstBuf, pad,
-                           subsamp, 0));
+          TRY_TJ(tjEncodeYUV3(handle, srcBuf, h, 0, w, TJPF_BGRX, dstBuf, pad,
+                              subsamp, 0));
         } else {
-          _tj(tjCompress2(handle, srcBuf, h, 0, w, TJPF_BGRX, &dstBuf,
-                          &dstSize, subsamp, 100,
-                          alloc ? 0 : TJFLAG_NOREALLOC));
+          TRY_TJ(tjCompress2(handle, srcBuf, h, 0, w, TJPF_BGRX, &dstBuf,
+                             &dstSize, subsamp, 100,
+                             alloc ? 0 : TJFLAG_NOREALLOC));
         }
         free(srcBuf);  srcBuf = NULL;
         if (!alloc || doYUV) {
@@ -644,8 +683,8 @@ static void bufSizeTest(void)
   fprintf(stderr, "Done.      \n");
 
 bailout:
-  if (srcBuf) free(srcBuf);
-  if (dstBuf) tjFree(dstBuf);
+  free(srcBuf);
+  tjFree(dstBuf);
   if (handle) tjDestroy(handle);
 }
 
@@ -754,25 +793,26 @@ static int doBmpTest(const char *ext, int width, int align, int height, int pf,
   }
 
   if ((buf = (unsigned char *)tjAlloc(pitch * height)) == NULL)
-    _throw("Could not allocate memory");
+    THROW("Could not allocate memory");
   initBitmap(buf, width, pitch, height, pf, flags);
 
 #if defined(ANDROID) && defined(GTEST)
-  snprintf(filename, filenameSize, "/sdcard/test_bmp_%s_%d_%s.%s", pixFormatStr[pf],
-           align, (flags & TJFLAG_BOTTOMUP) ? "bu" : "td", ext);
+  snprintf(filename, filenameSize, "/sdcard/test_bmp_%s_%d_%s.%s",
+           pixFormatStr[pf], align, (flags & TJFLAG_BOTTOMUP) ? "bu" : "td",
+           ext);
 #else
-  snprintf(filename, filenameSize, "test_bmp_%s_%d_%s.%s", pixFormatStr[pf], align,
-           (flags & TJFLAG_BOTTOMUP) ? "bu" : "td", ext);
+  snprintf(filename, filenameSize, "test_bmp_%s_%d_%s.%s", pixFormatStr[pf],
+           align, (flags & TJFLAG_BOTTOMUP) ? "bu" : "td", ext);
 #endif
-  _tj(tjSaveImage(filename, buf, width, pitch, height, pf, flags));
+  TRY_TJ(tjSaveImage(filename, buf, width, pitch, height, pf, flags));
   md5sum = MD5File(filename, md5buf);
   if (strcasecmp(md5sum, md5ref))
-    _throwmd5(filename, md5sum, md5ref);
+    THROW_MD5(filename, md5sum, md5ref);
 
   tjFree(buf);  buf = NULL;
   if ((buf = tjLoadImage(filename, &loadWidth, align, &loadHeight, &pf,
                          flags)) == NULL)
-    _throwtj();
+    THROW_TJ();
   if (width != loadWidth || height != loadHeight) {
     fprintf(stderr, "\n   Image dimensions of %s are bogus\n", filename);
     retval = -1;  goto bailout;
@@ -786,7 +826,7 @@ static int doBmpTest(const char *ext, int width, int align, int height, int pf,
     pf = TJPF_XBGR;
     if ((buf = tjLoadImage(filename, &loadWidth, align, &loadHeight, &pf,
                            flags)) == NULL)
-      _throwtj();
+      THROW_TJ();
     pitch = PAD(width * tjPixelSize[pf], align);
     if (!cmpBitmap(buf, width, pitch, height, pf, flags, 1)) {
       fprintf(stderr, "\n   Converting %s to RGB failed\n", filename);
@@ -797,7 +837,7 @@ static int doBmpTest(const char *ext, int width, int align, int height, int pf,
     pf = TJPF_CMYK;
     if ((buf = tjLoadImage(filename, &loadWidth, align, &loadHeight, &pf,
                            flags)) == NULL)
-      _throwtj();
+      THROW_TJ();
     pitch = PAD(width * tjPixelSize[pf], align);
     if (!cmpBitmap(buf, width, pitch, height, pf, flags, 1)) {
       fprintf(stderr, "\n   Converting %s to CMYK failed\n", filename);
@@ -811,7 +851,7 @@ static int doBmpTest(const char *ext, int width, int align, int height, int pf,
   pixelFormat = TJPF_UNKNOWN;
   if ((buf = tjLoadImage(filename, &loadWidth, align, &loadHeight,
                          &pixelFormat, flags)) == NULL)
-    _throwtj();
+    THROW_TJ();
   if ((pf == TJPF_GRAY && pixelFormat != TJPF_GRAY) ||
       (pf != TJPF_GRAY && !strcasecmp(ext, "bmp") &&
        pixelFormat != TJPF_BGR) ||
@@ -825,7 +865,7 @@ static int doBmpTest(const char *ext, int width, int align, int height, int pf,
   unlink(filename);
 
 bailout:
-  if (buf) tjFree(buf);
+  tjFree(buf);
   if (exitStatus < 0) return exitStatus;
   return retval;
 }
@@ -868,6 +908,7 @@ static int bmpTest(void)
   return 0;
 }
 
+
 #ifdef GTEST
 static void initTJUnitTest(int yuv, int noyuvpad, int autoalloc)
 {
@@ -1100,6 +1141,7 @@ int main(int argc, char *argv[])
   }
   if (alloc) printf("Testing automatic buffer allocation\n");
   if (doYUV) num4bf = 4;
+  overflowTest();
   doTest(35, 39, _3byteFormats, 2, TJSAMP_444, "test");
   doTest(39, 41, _4byteFormats, num4bf, TJSAMP_444, "test");
   doTest(41, 35, _3byteFormats, 2, TJSAMP_422, "test");
diff --git a/tjutil.c b/tjutil.c
index b44086d..2018160 100644
--- a/tjutil.c
+++ b/tjutil.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C)2011 D. R. Commander.  All Rights Reserved.
+ * Copyright (C)2011, 2019 D. R. Commander.  All Rights Reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -29,6 +29,7 @@
 #ifdef _WIN32
 
 #include <windows.h>
+#include "tjutil.h"
 
 static double getFreq(void)
 {
@@ -56,6 +57,7 @@ double getTime(void)
 
 #include <stdlib.h>
 #include <sys/time.h>
+#include "tjutil.h"
 
 double getTime(void)
 {
diff --git a/turbojpeg-jni.c b/turbojpeg-jni.c
index d0a0935..9363450 100644
--- a/turbojpeg-jni.c
+++ b/turbojpeg-jni.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C)2011-2018 D. R. Commander.  All Rights Reserved.
+ * Copyright (C)2011-2019 D. R. Commander.  All Rights Reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -35,59 +35,58 @@
 #include <jni.h>
 #include "java/org_libjpegturbo_turbojpeg_TJCompressor.h"
 #include "java/org_libjpegturbo_turbojpeg_TJDecompressor.h"
+#include "java/org_libjpegturbo_turbojpeg_TJTransformer.h"
 #include "java/org_libjpegturbo_turbojpeg_TJ.h"
 
-#define PAD(v, p)  ((v + (p) - 1) & (~((p) - 1)))
-
-#define bailif0(f) { \
+#define BAILIF0(f) { \
   if (!(f) || (*env)->ExceptionCheck(env)) { \
     goto bailout; \
   } \
 }
 
-#define _throw(msg, exceptionClass) { \
+#define THROW(msg, exceptionClass) { \
   jclass _exccls = (*env)->FindClass(env, exceptionClass); \
   \
-  bailif0(_exccls); \
+  BAILIF0(_exccls); \
   (*env)->ThrowNew(env, _exccls, msg); \
   goto bailout; \
 }
 
-#define _throwtj() { \
+#define THROW_TJ() { \
   jclass _exccls; \
   jmethodID _excid; \
   jobject _excobj; \
   jstring _errstr; \
   \
-  bailif0(_errstr = (*env)->NewStringUTF(env, tjGetErrorStr2(handle))); \
-  bailif0(_exccls = (*env)->FindClass(env, \
+  BAILIF0(_errstr = (*env)->NewStringUTF(env, tjGetErrorStr2(handle))); \
+  BAILIF0(_exccls = (*env)->FindClass(env, \
     "org/libjpegturbo/turbojpeg/TJException")); \
-  bailif0(_excid = (*env)->GetMethodID(env, _exccls, "<init>", \
+  BAILIF0(_excid = (*env)->GetMethodID(env, _exccls, "<init>", \
                                        "(Ljava/lang/String;I)V")); \
-  bailif0(_excobj = (*env)->NewObject(env, _exccls, _excid, _errstr, \
+  BAILIF0(_excobj = (*env)->NewObject(env, _exccls, _excid, _errstr, \
                                       tjGetErrorCode(handle))); \
   (*env)->Throw(env, _excobj); \
   goto bailout; \
 }
 
-#define _throwarg(msg)  _throw(msg, "java/lang/IllegalArgumentException")
+#define THROW_ARG(msg)  THROW(msg, "java/lang/IllegalArgumentException")
 
-#define _throwmem() \
-  _throw("Memory allocation failure", "java/lang/OutOfMemoryError");
+#define THROW_MEM() \
+  THROW("Memory allocation failure", "java/lang/OutOfMemoryError");
 
-#define gethandle() \
+#define GET_HANDLE() \
   jclass _cls = (*env)->GetObjectClass(env, obj); \
   jfieldID _fid; \
   \
-  bailif0(_cls); \
-  bailif0(_fid = (*env)->GetFieldID(env, _cls, "handle", "J")); \
+  BAILIF0(_cls); \
+  BAILIF0(_fid = (*env)->GetFieldID(env, _cls, "handle", "J")); \
   handle = (tjhandle)(size_t)(*env)->GetLongField(env, obj, _fid);
 
 #ifdef _WIN32
 #define setenv(envvar, value, dummy)  _putenv_s(envvar, value)
 #endif
 
-#define prop2env(property, envvar) { \
+#define PROP2ENV(property, envvar) { \
   if ((jName = (*env)->NewStringUTF(env, property)) != NULL && \
       (jValue = (*env)->CallStaticObjectMethod(env, cls, mid, \
                                                jName)) != NULL) { \
@@ -98,21 +97,27 @@
   } \
 }
 
-int ProcessSystemProperties(JNIEnv *env)
+#define SAFE_RELEASE(javaArray, cArray) { \
+  if (javaArray && cArray) \
+    (*env)->ReleasePrimitiveArrayCritical(env, javaArray, (void *)cArray, 0); \
+  cArray = NULL; \
+}
+
+static int ProcessSystemProperties(JNIEnv *env)
 {
   jclass cls;
   jmethodID mid;
   jstring jName, jValue;
   const char *value;
 
-  bailif0(cls = (*env)->FindClass(env, "java/lang/System"));
-  bailif0(mid = (*env)->GetStaticMethodID(env, cls, "getProperty",
+  BAILIF0(cls = (*env)->FindClass(env, "java/lang/System"));
+  BAILIF0(mid = (*env)->GetStaticMethodID(env, cls, "getProperty",
     "(Ljava/lang/String;)Ljava/lang/String;"));
 
-  prop2env("turbojpeg.optimize", "TJ_OPTIMIZE");
-  prop2env("turbojpeg.arithmetic", "TJ_ARITHMETIC");
-  prop2env("turbojpeg.restart", "TJ_RESTART");
-  prop2env("turbojpeg.progressive", "TJ_PROGRESSIVE");
+  PROP2ENV("turbojpeg.optimize", "TJ_OPTIMIZE");
+  PROP2ENV("turbojpeg.arithmetic", "TJ_ARITHMETIC");
+  PROP2ENV("turbojpeg.restart", "TJ_RESTART");
+  PROP2ENV("turbojpeg.progressive", "TJ_PROGRESSIVE");
   return 0;
 
 bailout:
@@ -125,7 +130,7 @@ JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJ_bufSize
 {
   jint retval = (jint)tjBufSize(width, height, jpegSubsamp);
 
-  if (retval == -1) _throwarg(tjGetErrorStr());
+  if (retval == -1) THROW_ARG(tjGetErrorStr());
 
 bailout:
   return retval;
@@ -137,7 +142,7 @@ JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJ_bufSizeYUV__IIII
 {
   jint retval = (jint)tjBufSizeYUV2(width, pad, height, subsamp);
 
-  if (retval == -1) _throwarg(tjGetErrorStr());
+  if (retval == -1) THROW_ARG(tjGetErrorStr());
 
 bailout:
   return retval;
@@ -160,7 +165,7 @@ JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJ_planeSizeYUV__IIIII
   jint retval = (jint)tjPlaneSizeYUV(componentID, width, stride, height,
                                      subsamp);
 
-  if (retval == -1) _throwarg(tjGetErrorStr());
+  if (retval == -1) THROW_ARG(tjGetErrorStr());
 
 bailout:
   return retval;
@@ -172,7 +177,7 @@ JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJ_planeWidth__III
 {
   jint retval = (jint)tjPlaneWidth(componentID, width, subsamp);
 
-  if (retval == -1) _throwarg(tjGetErrorStr());
+  if (retval == -1) THROW_ARG(tjGetErrorStr());
 
 bailout:
   return retval;
@@ -184,7 +189,7 @@ JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJ_planeHeight__III
 {
   jint retval = (jint)tjPlaneHeight(componentID, height, subsamp);
 
-  if (retval == -1) _throwarg(tjGetErrorStr());
+  if (retval == -1) THROW_ARG(tjGetErrorStr());
 
 bailout:
   return retval;
@@ -199,10 +204,10 @@ JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_init
   tjhandle handle;
 
   if ((handle = tjInitCompress()) == NULL)
-    _throw(tjGetErrorStr(), "org/libjpegturbo/turbojpeg/TJException");
+    THROW(tjGetErrorStr(), "org/libjpegturbo/turbojpeg/TJException");
 
-  bailif0(cls = (*env)->GetObjectClass(env, obj));
-  bailif0(fid = (*env)->GetFieldID(env, cls, "handle", "J"));
+  BAILIF0(cls = (*env)->GetObjectClass(env, obj));
+  BAILIF0(fid = (*env)->GetFieldID(env, cls, "handle", "J"));
   (*env)->SetLongField(env, obj, fid, (size_t)handle);
 
 bailout:
@@ -219,35 +224,38 @@ static jint TJCompressor_compress
   jsize arraySize = 0, actualPitch;
   unsigned char *srcBuf = NULL, *jpegBuf = NULL;
 
-  gethandle();
+  GET_HANDLE();
 
   if (pf < 0 || pf >= org_libjpegturbo_turbojpeg_TJ_NUMPF || width < 1 ||
       height < 1 || pitch < 0)
-    _throwarg("Invalid argument in compress()");
+    THROW_ARG("Invalid argument in compress()");
   if (org_libjpegturbo_turbojpeg_TJ_NUMPF != TJ_NUMPF)
-    _throwarg("Mismatch between Java and C API");
+    THROW_ARG("Mismatch between Java and C API");
 
   actualPitch = (pitch == 0) ? width * tjPixelSize[pf] : pitch;
   arraySize = (y + height - 1) * actualPitch + (x + width) * tjPixelSize[pf];
   if ((*env)->GetArrayLength(env, src) * srcElementSize < arraySize)
-    _throwarg("Source buffer is not large enough");
+    THROW_ARG("Source buffer is not large enough");
   jpegSize = tjBufSize(width, height, jpegSubsamp);
   if ((*env)->GetArrayLength(env, dst) < (jsize)jpegSize)
-    _throwarg("Destination buffer is not large enough");
+    THROW_ARG("Destination buffer is not large enough");
 
   if (ProcessSystemProperties(env) < 0) goto bailout;
 
-  bailif0(srcBuf = (*env)->GetPrimitiveArrayCritical(env, src, 0));
-  bailif0(jpegBuf = (*env)->GetPrimitiveArrayCritical(env, dst, 0));
+  BAILIF0(srcBuf = (*env)->GetPrimitiveArrayCritical(env, src, 0));
+  BAILIF0(jpegBuf = (*env)->GetPrimitiveArrayCritical(env, dst, 0));
 
   if (tjCompress2(handle, &srcBuf[y * actualPitch + x * tjPixelSize[pf]],
                   width, pitch, height, pf, &jpegBuf, &jpegSize, jpegSubsamp,
-                  jpegQual, flags | TJFLAG_NOREALLOC) == -1)
-    _throwtj();
+                  jpegQual, flags | TJFLAG_NOREALLOC) == -1) {
+    SAFE_RELEASE(dst, jpegBuf);
+    SAFE_RELEASE(src, srcBuf);
+    THROW_TJ();
+  }
 
 bailout:
-  if (jpegBuf) (*env)->ReleasePrimitiveArrayCritical(env, dst, jpegBuf, 0);
-  if (srcBuf) (*env)->ReleasePrimitiveArrayCritical(env, src, srcBuf, 0);
+  SAFE_RELEASE(dst, jpegBuf);
+  SAFE_RELEASE(src, srcBuf);
   return (jint)jpegSize;
 }
 
@@ -278,9 +286,9 @@ JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_compress___3
    jint jpegQual, jint flags)
 {
   if (pf < 0 || pf >= org_libjpegturbo_turbojpeg_TJ_NUMPF)
-    _throwarg("Invalid argument in compress()");
+    THROW_ARG("Invalid argument in compress()");
   if (tjPixelSize[pf] != sizeof(jint))
-    _throwarg("Pixel format must be 32-bit when compressing from an integer buffer.");
+    THROW_ARG("Pixel format must be 32-bit when compressing from an integer buffer.");
 
   return TJCompressor_compress(env, obj, src, sizeof(jint), x, y, width,
                                stride * sizeof(jint), height, pf, dst,
@@ -297,9 +305,9 @@ JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_compress___3
    jint flags)
 {
   if (pf < 0 || pf >= org_libjpegturbo_turbojpeg_TJ_NUMPF)
-    _throwarg("Invalid argument in compress()");
+    THROW_ARG("Invalid argument in compress()");
   if (tjPixelSize[pf] != sizeof(jint))
-    _throwarg("Pixel format must be 32-bit when compressing from an integer buffer.");
+    THROW_ARG("Pixel format must be 32-bit when compressing from an integer buffer.");
 
   return TJCompressor_compress(env, obj, src, sizeof(jint), 0, 0, width,
                                stride * sizeof(jint), height, pf, dst,
@@ -323,66 +331,76 @@ JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_compressFrom
   int *srcOffsets = NULL, *srcStrides = NULL;
   int nc = (subsamp == org_libjpegturbo_turbojpeg_TJ_SAMP_GRAY ? 1 : 3), i;
 
-  gethandle();
+  GET_HANDLE();
 
   if (subsamp < 0 || subsamp >= org_libjpegturbo_turbojpeg_TJ_NUMSAMP)
-    _throwarg("Invalid argument in compressFromYUV()");
+    THROW_ARG("Invalid argument in compressFromYUV()");
   if (org_libjpegturbo_turbojpeg_TJ_NUMSAMP != TJ_NUMSAMP)
-    _throwarg("Mismatch between Java and C API");
+    THROW_ARG("Mismatch between Java and C API");
 
   if ((*env)->GetArrayLength(env, srcobjs) < nc)
-    _throwarg("Planes array is too small for the subsampling type");
+    THROW_ARG("Planes array is too small for the subsampling type");
   if ((*env)->GetArrayLength(env, jSrcOffsets) < nc)
-    _throwarg("Offsets array is too small for the subsampling type");
+    THROW_ARG("Offsets array is too small for the subsampling type");
   if ((*env)->GetArrayLength(env, jSrcStrides) < nc)
-    _throwarg("Strides array is too small for the subsampling type");
+    THROW_ARG("Strides array is too small for the subsampling type");
 
   jpegSize = tjBufSize(width, height, subsamp);
   if ((*env)->GetArrayLength(env, dst) < (jsize)jpegSize)
-    _throwarg("Destination buffer is not large enough");
+    THROW_ARG("Destination buffer is not large enough");
 
   if (ProcessSystemProperties(env) < 0) goto bailout;
 
-  bailif0(srcOffsets = (*env)->GetPrimitiveArrayCritical(env, jSrcOffsets, 0));
-  bailif0(srcStrides = (*env)->GetPrimitiveArrayCritical(env, jSrcStrides, 0));
+#define RELEASE_ARRAYS_COMPRESSFROMYUV() { \
+  SAFE_RELEASE(dst, jpegBuf); \
+  for (i = 0; i < nc; i++) \
+    SAFE_RELEASE(jSrcPlanes[i], srcPlanes[i]); \
+  SAFE_RELEASE(jSrcStrides, srcStrides); \
+  SAFE_RELEASE(jSrcOffsets, srcOffsets); \
+}
+
+  BAILIF0(srcOffsets = (*env)->GetPrimitiveArrayCritical(env, jSrcOffsets, 0));
+  BAILIF0(srcStrides = (*env)->GetPrimitiveArrayCritical(env, jSrcStrides, 0));
   for (i = 0; i < nc; i++) {
     int planeSize = tjPlaneSizeYUV(i, width, srcStrides[i], height, subsamp);
     int pw = tjPlaneWidth(i, width, subsamp);
 
-    if (planeSize < 0 || pw < 0)
-      _throwarg(tjGetErrorStr());
+    if (planeSize < 0 || pw < 0) {
+      RELEASE_ARRAYS_COMPRESSFROMYUV();
+      THROW_ARG(tjGetErrorStr());
+    }
 
-    if (srcOffsets[i] < 0)
-      _throwarg("Invalid argument in compressFromYUV()");
-    if (srcStrides[i] < 0 && srcOffsets[i] - planeSize + pw < 0)
-      _throwarg("Negative plane stride would cause memory to be accessed below plane boundary");
+    if (srcOffsets[i] < 0) {
+      RELEASE_ARRAYS_COMPRESSFROMYUV();
+      THROW_ARG("Invalid argument in compressFromYUV()");
+    }
+    if (srcStrides[i] < 0 && srcOffsets[i] - planeSize + pw < 0) {
+      RELEASE_ARRAYS_COMPRESSFROMYUV();
+      THROW_ARG("Negative plane stride would cause memory to be accessed below plane boundary");
+    }
 
-    bailif0(jSrcPlanes[i] = (*env)->GetObjectArrayElement(env, srcobjs, i));
-    if ((*env)->GetArrayLength(env, jSrcPlanes[i]) < srcOffsets[i] + planeSize)
-      _throwarg("Source plane is not large enough");
+    BAILIF0(jSrcPlanes[i] = (*env)->GetObjectArrayElement(env, srcobjs, i));
+    if ((*env)->GetArrayLength(env, jSrcPlanes[i]) <
+        srcOffsets[i] + planeSize) {
+      RELEASE_ARRAYS_COMPRESSFROMYUV();
+      THROW_ARG("Source plane is not large enough");
+    }
 
-    bailif0(srcPlanes[i] =
+    BAILIF0(srcPlanes[i] =
             (*env)->GetPrimitiveArrayCritical(env, jSrcPlanes[i], 0));
     srcPlanes[i] = &srcPlanes[i][srcOffsets[i]];
   }
-  bailif0(jpegBuf = (*env)->GetPrimitiveArrayCritical(env, dst, 0));
+  BAILIF0(jpegBuf = (*env)->GetPrimitiveArrayCritical(env, dst, 0));
 
   if (tjCompressFromYUVPlanes(handle, srcPlanes, width, srcStrides, height,
                               subsamp, &jpegBuf, &jpegSize, jpegQual,
-                              flags | TJFLAG_NOREALLOC) == -1)
-    _throwtj();
+                              flags | TJFLAG_NOREALLOC) == -1) {
+    RELEASE_ARRAYS_COMPRESSFROMYUV();
+    THROW_TJ();
+  }
 
 bailout:
-  if (jpegBuf) (*env)->ReleasePrimitiveArrayCritical(env, dst, jpegBuf, 0);
-  for (i = 0; i < nc; i++) {
-    if (srcPlanes[i] && jSrcPlanes[i])
-      (*env)->ReleasePrimitiveArrayCritical(env, jSrcPlanes[i],
-                                            (unsigned char *)srcPlanes[i], 0);
-  }
-  if (srcStrides)
-    (*env)->ReleasePrimitiveArrayCritical(env, jSrcStrides, srcStrides, 0);
-  if (srcOffsets)
-    (*env)->ReleasePrimitiveArrayCritical(env, jSrcOffsets, srcOffsets, 0);
+  RELEASE_ARRAYS_COMPRESSFROMYUV();
   return (jint)jpegSize;
 }
 
@@ -398,68 +416,78 @@ static void TJCompressor_encodeYUV
   int *dstOffsets = NULL, *dstStrides = NULL;
   int nc = (subsamp == org_libjpegturbo_turbojpeg_TJ_SAMP_GRAY ? 1 : 3), i;
 
-  gethandle();
+  GET_HANDLE();
 
   if (pf < 0 || pf >= org_libjpegturbo_turbojpeg_TJ_NUMPF || width < 1 ||
       height < 1 || pitch < 0 || subsamp < 0 ||
       subsamp >= org_libjpegturbo_turbojpeg_TJ_NUMSAMP)
-    _throwarg("Invalid argument in encodeYUV()");
+    THROW_ARG("Invalid argument in encodeYUV()");
   if (org_libjpegturbo_turbojpeg_TJ_NUMPF != TJ_NUMPF ||
       org_libjpegturbo_turbojpeg_TJ_NUMSAMP != TJ_NUMSAMP)
-    _throwarg("Mismatch between Java and C API");
+    THROW_ARG("Mismatch between Java and C API");
 
   if ((*env)->GetArrayLength(env, dstobjs) < nc)
-    _throwarg("Planes array is too small for the subsampling type");
+    THROW_ARG("Planes array is too small for the subsampling type");
   if ((*env)->GetArrayLength(env, jDstOffsets) < nc)
-    _throwarg("Offsets array is too small for the subsampling type");
+    THROW_ARG("Offsets array is too small for the subsampling type");
   if ((*env)->GetArrayLength(env, jDstStrides) < nc)
-    _throwarg("Strides array is too small for the subsampling type");
+    THROW_ARG("Strides array is too small for the subsampling type");
 
   actualPitch = (pitch == 0) ? width * tjPixelSize[pf] : pitch;
   arraySize = (y + height - 1) * actualPitch + (x + width) * tjPixelSize[pf];
   if ((*env)->GetArrayLength(env, src) * srcElementSize < arraySize)
-    _throwarg("Source buffer is not large enough");
+    THROW_ARG("Source buffer is not large enough");
+
+#define RELEASE_ARRAYS_ENCODEYUV() { \
+  SAFE_RELEASE(src, srcBuf); \
+  for (i = 0; i < nc; i++) \
+    SAFE_RELEASE(jDstPlanes[i], dstPlanes[i]); \
+  SAFE_RELEASE(jDstStrides, dstStrides); \
+  SAFE_RELEASE(jDstOffsets, dstOffsets); \
+}
 
-  bailif0(dstOffsets = (*env)->GetPrimitiveArrayCritical(env, jDstOffsets, 0));
-  bailif0(dstStrides = (*env)->GetPrimitiveArrayCritical(env, jDstStrides, 0));
+  BAILIF0(dstOffsets = (*env)->GetPrimitiveArrayCritical(env, jDstOffsets, 0));
+  BAILIF0(dstStrides = (*env)->GetPrimitiveArrayCritical(env, jDstStrides, 0));
   for (i = 0; i < nc; i++) {
     int planeSize = tjPlaneSizeYUV(i, width, dstStrides[i], height, subsamp);
     int pw = tjPlaneWidth(i, width, subsamp);
 
-    if (planeSize < 0 || pw < 0)
-      _throwarg(tjGetErrorStr());
+    if (planeSize < 0 || pw < 0) {
+      RELEASE_ARRAYS_ENCODEYUV();
+      THROW_ARG(tjGetErrorStr());
+    }
 
-    if (dstOffsets[i] < 0)
-      _throwarg("Invalid argument in encodeYUV()");
-    if (dstStrides[i] < 0 && dstOffsets[i] - planeSize + pw < 0)
-      _throwarg("Negative plane stride would cause memory to be accessed below plane boundary");
+    if (dstOffsets[i] < 0) {
+      RELEASE_ARRAYS_ENCODEYUV();
+      THROW_ARG("Invalid argument in encodeYUV()");
+    }
+    if (dstStrides[i] < 0 && dstOffsets[i] - planeSize + pw < 0) {
+      RELEASE_ARRAYS_ENCODEYUV();
+      THROW_ARG("Negative plane stride would cause memory to be accessed below plane boundary");
+    }
 
-    bailif0(jDstPlanes[i] = (*env)->GetObjectArrayElement(env, dstobjs, i));
-    if ((*env)->GetArrayLength(env, jDstPlanes[i]) < dstOffsets[i] + planeSize)
-      _throwarg("Destination plane is not large enough");
+    BAILIF0(jDstPlanes[i] = (*env)->GetObjectArrayElement(env, dstobjs, i));
+    if ((*env)->GetArrayLength(env, jDstPlanes[i]) <
+        dstOffsets[i] + planeSize) {
+      RELEASE_ARRAYS_ENCODEYUV();
+      THROW_ARG("Destination plane is not large enough");
+    }
 
-    bailif0(dstPlanes[i] =
+    BAILIF0(dstPlanes[i] =
             (*env)->GetPrimitiveArrayCritical(env, jDstPlanes[i], 0));
     dstPlanes[i] = &dstPlanes[i][dstOffsets[i]];
   }
-  bailif0(srcBuf = (*env)->GetPrimitiveArrayCritical(env, src, 0));
+  BAILIF0(srcBuf = (*env)->GetPrimitiveArrayCritical(env, src, 0));
 
   if (tjEncodeYUVPlanes(handle, &srcBuf[y * actualPitch + x * tjPixelSize[pf]],
                         width, pitch, height, pf, dstPlanes, dstStrides,
-                        subsamp, flags) == -1)
-    _throwtj();
+                        subsamp, flags) == -1) {
+    RELEASE_ARRAYS_ENCODEYUV();
+    THROW_TJ();
+  }
 
 bailout:
-  if (srcBuf) (*env)->ReleasePrimitiveArrayCritical(env, src, srcBuf, 0);
-  for (i = 0; i < nc; i++) {
-    if (dstPlanes[i] && jDstPlanes[i])
-      (*env)->ReleasePrimitiveArrayCritical(env, jDstPlanes[i], dstPlanes[i],
-                                            0);
-  }
-  if (dstStrides)
-    (*env)->ReleasePrimitiveArrayCritical(env, jDstStrides, dstStrides, 0);
-  if (dstOffsets)
-    (*env)->ReleasePrimitiveArrayCritical(env, jDstOffsets, dstOffsets, 0);
+  RELEASE_ARRAYS_ENCODEYUV();
 }
 
 /* TurboJPEG 1.4.x: TJCompressor::encodeYUV() byte source */
@@ -479,9 +507,9 @@ JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_encodeYUV___
    jintArray jDstOffsets, jintArray jDstStrides, jint subsamp, jint flags)
 {
   if (pf < 0 || pf >= org_libjpegturbo_turbojpeg_TJ_NUMPF)
-    _throwarg("Invalid argument in encodeYUV()");
+    THROW_ARG("Invalid argument in encodeYUV()");
   if (tjPixelSize[pf] != sizeof(jint))
-    _throwarg("Pixel format must be 32-bit when encoding from an integer buffer.");
+    THROW_ARG("Pixel format must be 32-bit when encoding from an integer buffer.");
 
   TJCompressor_encodeYUV(env, obj, src, sizeof(jint), x, y, width,
                          stride * sizeof(jint), height, pf, dstobjs,
@@ -491,7 +519,7 @@ bailout:
   return;
 }
 
-JNIEXPORT void JNICALL TJCompressor_encodeYUV_12
+static void JNICALL TJCompressor_encodeYUV_12
   (JNIEnv *env, jobject obj, jarray src, jint srcElementSize, jint width,
    jint pitch, jint height, jint pf, jbyteArray dst, jint subsamp, jint flags)
 {
@@ -499,31 +527,34 @@ JNIEXPORT void JNICALL TJCompressor_encodeYUV_12
   jsize arraySize = 0;
   unsigned char *srcBuf = NULL, *dstBuf = NULL;
 
-  gethandle();
+  GET_HANDLE();
 
   if (pf < 0 || pf >= org_libjpegturbo_turbojpeg_TJ_NUMPF || width < 1 ||
       height < 1 || pitch < 0)
-    _throwarg("Invalid argument in encodeYUV()");
+    THROW_ARG("Invalid argument in encodeYUV()");
   if (org_libjpegturbo_turbojpeg_TJ_NUMPF != TJ_NUMPF)
-    _throwarg("Mismatch between Java and C API");
+    THROW_ARG("Mismatch between Java and C API");
 
   arraySize = (pitch == 0) ? width * tjPixelSize[pf] * height : pitch * height;
   if ((*env)->GetArrayLength(env, src) * srcElementSize < arraySize)
-    _throwarg("Source buffer is not large enough");
+    THROW_ARG("Source buffer is not large enough");
   if ((*env)->GetArrayLength(env, dst) <
       (jsize)tjBufSizeYUV(width, height, subsamp))
-    _throwarg("Destination buffer is not large enough");
+    THROW_ARG("Destination buffer is not large enough");
 
-  bailif0(srcBuf = (*env)->GetPrimitiveArrayCritical(env, src, 0));
-  bailif0(dstBuf = (*env)->GetPrimitiveArrayCritical(env, dst, 0));
+  BAILIF0(srcBuf = (*env)->GetPrimitiveArrayCritical(env, src, 0));
+  BAILIF0(dstBuf = (*env)->GetPrimitiveArrayCritical(env, dst, 0));
 
   if (tjEncodeYUV2(handle, srcBuf, width, pitch, height, pf, dstBuf, subsamp,
-                   flags) == -1)
-    _throwtj();
+                   flags) == -1) {
+    SAFE_RELEASE(dst, dstBuf);
+    SAFE_RELEASE(src, srcBuf);
+    THROW_TJ();
+  }
 
 bailout:
-  if (dstBuf) (*env)->ReleasePrimitiveArrayCritical(env, dst, dstBuf, 0);
-  if (srcBuf) (*env)->ReleasePrimitiveArrayCritical(env, src, srcBuf, 0);
+  SAFE_RELEASE(dst, dstBuf);
+  SAFE_RELEASE(src, srcBuf);
 }
 
 /* TurboJPEG 1.2.x: TJCompressor::encodeYUV() byte source */
@@ -541,9 +572,9 @@ JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_encodeYUV___
    jint height, jint pf, jbyteArray dst, jint subsamp, jint flags)
 {
   if (pf < 0 || pf >= org_libjpegturbo_turbojpeg_TJ_NUMPF)
-    _throwarg("Invalid argument in encodeYUV()");
+    THROW_ARG("Invalid argument in encodeYUV()");
   if (tjPixelSize[pf] != sizeof(jint))
-    _throwarg("Pixel format must be 32-bit when encoding from an integer buffer.");
+    THROW_ARG("Pixel format must be 32-bit when encoding from an integer buffer.");
 
   TJCompressor_encodeYUV_12(env, obj, src, sizeof(jint), width,
                             stride * sizeof(jint), height, pf, dst, subsamp,
@@ -559,9 +590,9 @@ JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_destroy
 {
   tjhandle handle = 0;
 
-  gethandle();
+  GET_HANDLE();
 
-  if (tjDestroy(handle) == -1) _throwtj();
+  if (tjDestroy(handle) == -1) THROW_TJ();
   (*env)->SetLongField(env, obj, _fid, 0);
 
 bailout:
@@ -577,10 +608,10 @@ JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_init
   tjhandle handle;
 
   if ((handle = tjInitDecompress()) == NULL)
-    _throw(tjGetErrorStr(), "org/libjpegturbo/turbojpeg/TJException");
+    THROW(tjGetErrorStr(), "org/libjpegturbo/turbojpeg/TJException");
 
-  bailif0(cls = (*env)->GetObjectClass(env, obj));
-  bailif0(fid = (*env)->GetFieldID(env, cls, "handle", "J"));
+  BAILIF0(cls = (*env)->GetObjectClass(env, obj));
+  BAILIF0(fid = (*env)->GetFieldID(env, cls, "handle", "J"));
   (*env)->SetLongField(env, obj, fid, (size_t)handle);
 
 bailout:
@@ -599,17 +630,17 @@ JNIEXPORT jobjectArray JNICALL Java_org_libjpegturbo_turbojpeg_TJ_getScalingFact
   jobjectArray sfjava = NULL;
 
   if ((sf = tjGetScalingFactors(&n)) == NULL || n == 0)
-    _throwarg(tjGetErrorStr());
+    THROW_ARG(tjGetErrorStr());
 
-  bailif0(sfcls = (*env)->FindClass(env,
+  BAILIF0(sfcls = (*env)->FindClass(env,
     "org/libjpegturbo/turbojpeg/TJScalingFactor"));
-  bailif0(sfjava = (jobjectArray)(*env)->NewObjectArray(env, n, sfcls, 0));
+  BAILIF0(sfjava = (jobjectArray)(*env)->NewObjectArray(env, n, sfcls, 0));
 
   for (i = 0; i < n; i++) {
-    bailif0(sfobj = (*env)->AllocObject(env, sfcls));
-    bailif0(fid = (*env)->GetFieldID(env, sfcls, "num", "I"));
+    BAILIF0(sfobj = (*env)->AllocObject(env, sfcls));
+    BAILIF0(fid = (*env)->GetFieldID(env, sfcls, "num", "I"));
     (*env)->SetIntField(env, sfobj, fid, sf[i].num);
-    bailif0(fid = (*env)->GetFieldID(env, sfcls, "denom", "I"));
+    BAILIF0(fid = (*env)->GetFieldID(env, sfcls, "denom", "I"));
     (*env)->SetIntField(env, sfobj, fid, sf[i].denom);
     (*env)->SetObjectArrayElement(env, sfjava, i, sfobj);
   }
@@ -626,33 +657,34 @@ JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompress
   unsigned char *jpegBuf = NULL;
   int width = 0, height = 0, jpegSubsamp = -1, jpegColorspace = -1;
 
-  gethandle();
+  GET_HANDLE();
 
   if ((*env)->GetArrayLength(env, src) < jpegSize)
-    _throwarg("Source buffer is not large enough");
+    THROW_ARG("Source buffer is not large enough");
 
-  bailif0(jpegBuf = (*env)->GetPrimitiveArrayCritical(env, src, 0));
+  BAILIF0(jpegBuf = (*env)->GetPrimitiveArrayCritical(env, src, 0));
 
   if (tjDecompressHeader3(handle, jpegBuf, (unsigned long)jpegSize, &width,
-                          &height, &jpegSubsamp, &jpegColorspace) == -1)
-    _throwtj();
+                          &height, &jpegSubsamp, &jpegColorspace) == -1) {
+    SAFE_RELEASE(src, jpegBuf);
+    THROW_TJ();
+  }
 
-  (*env)->ReleasePrimitiveArrayCritical(env, src, jpegBuf, 0);
-  jpegBuf = NULL;
+  SAFE_RELEASE(src, jpegBuf);
 
-  bailif0(_fid = (*env)->GetFieldID(env, _cls, "jpegSubsamp", "I"));
+  BAILIF0(_fid = (*env)->GetFieldID(env, _cls, "jpegSubsamp", "I"));
   (*env)->SetIntField(env, obj, _fid, jpegSubsamp);
   if ((_fid = (*env)->GetFieldID(env, _cls, "jpegColorspace", "I")) == 0)
     (*env)->ExceptionClear(env);
   else
     (*env)->SetIntField(env, obj, _fid, jpegColorspace);
-  bailif0(_fid = (*env)->GetFieldID(env, _cls, "jpegWidth", "I"));
+  BAILIF0(_fid = (*env)->GetFieldID(env, _cls, "jpegWidth", "I"));
   (*env)->SetIntField(env, obj, _fid, width);
-  bailif0(_fid = (*env)->GetFieldID(env, _cls, "jpegHeight", "I"));
+  BAILIF0(_fid = (*env)->GetFieldID(env, _cls, "jpegHeight", "I"));
   (*env)->SetIntField(env, obj, _fid, height);
 
 bailout:
-  if (jpegBuf) (*env)->ReleasePrimitiveArrayCritical(env, src, jpegBuf, 0);
+  SAFE_RELEASE(src, jpegBuf);
 }
 
 static void TJDecompressor_decompress
@@ -664,31 +696,34 @@ static void TJDecompressor_decompress
   jsize arraySize = 0, actualPitch;
   unsigned char *jpegBuf = NULL, *dstBuf = NULL;
 
-  gethandle();
+  GET_HANDLE();
 
   if (pf < 0 || pf >= org_libjpegturbo_turbojpeg_TJ_NUMPF)
-    _throwarg("Invalid argument in decompress()");
+    THROW_ARG("Invalid argument in decompress()");
   if (org_libjpegturbo_turbojpeg_TJ_NUMPF != TJ_NUMPF)
-    _throwarg("Mismatch between Java and C API");
+    THROW_ARG("Mismatch between Java and C API");
 
   if ((*env)->GetArrayLength(env, src) < jpegSize)
-    _throwarg("Source buffer is not large enough");
+    THROW_ARG("Source buffer is not large enough");
   actualPitch = (pitch == 0) ? width * tjPixelSize[pf] : pitch;
   arraySize = (y + height - 1) * actualPitch + (x + width) * tjPixelSize[pf];
   if ((*env)->GetArrayLength(env, dst) * dstElementSize < arraySize)
-    _throwarg("Destination buffer is not large enough");
+    THROW_ARG("Destination buffer is not large enough");
 
-  bailif0(jpegBuf = (*env)->GetPrimitiveArrayCritical(env, src, 0));
-  bailif0(dstBuf = (*env)->GetPrimitiveArrayCritical(env, dst, 0));
+  BAILIF0(jpegBuf = (*env)->GetPrimitiveArrayCritical(env, src, 0));
+  BAILIF0(dstBuf = (*env)->GetPrimitiveArrayCritical(env, dst, 0));
 
   if (tjDecompress2(handle, jpegBuf, (unsigned long)jpegSize,
                     &dstBuf[y * actualPitch + x * tjPixelSize[pf]], width,
-                    pitch, height, pf, flags) == -1)
-    _throwtj();
+                    pitch, height, pf, flags) == -1) {
+    SAFE_RELEASE(dst, dstBuf);
+    SAFE_RELEASE(src, jpegBuf);
+    THROW_TJ();
+  }
 
 bailout:
-  if (dstBuf) (*env)->ReleasePrimitiveArrayCritical(env, dst, dstBuf, 0);
-  if (jpegBuf) (*env)->ReleasePrimitiveArrayCritical(env, src, jpegBuf, 0);
+  SAFE_RELEASE(dst, dstBuf);
+  SAFE_RELEASE(src, jpegBuf);
 }
 
 /* TurboJPEG 1.3.x: TJDecompressor::decompress() byte destination */
@@ -715,9 +750,9 @@ JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompress
    jint x, jint y, jint width, jint stride, jint height, jint pf, jint flags)
 {
   if (pf < 0 || pf >= org_libjpegturbo_turbojpeg_TJ_NUMPF)
-    _throwarg("Invalid argument in decompress()");
+    THROW_ARG("Invalid argument in decompress()");
   if (tjPixelSize[pf] != sizeof(jint))
-    _throwarg("Pixel format must be 32-bit when decompressing to an integer buffer.");
+    THROW_ARG("Pixel format must be 32-bit when decompressing to an integer buffer.");
 
   TJDecompressor_decompress(env, obj, src, jpegSize, dst, sizeof(jint), x, y,
                             width, stride * sizeof(jint), height, pf, flags);
@@ -732,9 +767,9 @@ JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompress
    jint width, jint stride, jint height, jint pf, jint flags)
 {
   if (pf < 0 || pf >= org_libjpegturbo_turbojpeg_TJ_NUMPF)
-    _throwarg("Invalid argument in decompress()");
+    THROW_ARG("Invalid argument in decompress()");
   if (tjPixelSize[pf] != sizeof(jint))
-    _throwarg("Pixel format must be 32-bit when decompressing to an integer buffer.");
+    THROW_ARG("Pixel format must be 32-bit when decompressing to an integer buffer.");
 
   TJDecompressor_decompress(env, obj, src, jpegSize, dst, sizeof(jint), 0, 0,
                             width, stride * sizeof(jint), height, pf, flags);
@@ -757,15 +792,15 @@ JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompress
   int nc = 0, i, width, height, scaledWidth, scaledHeight, nsf = 0;
   tjscalingfactor *sf;
 
-  gethandle();
+  GET_HANDLE();
 
   if ((*env)->GetArrayLength(env, src) < jpegSize)
-    _throwarg("Source buffer is not large enough");
-  bailif0(_fid = (*env)->GetFieldID(env, _cls, "jpegSubsamp", "I"));
+    THROW_ARG("Source buffer is not large enough");
+  BAILIF0(_fid = (*env)->GetFieldID(env, _cls, "jpegSubsamp", "I"));
   jpegSubsamp = (int)(*env)->GetIntField(env, obj, _fid);
-  bailif0(_fid = (*env)->GetFieldID(env, _cls, "jpegWidth", "I"));
+  BAILIF0(_fid = (*env)->GetFieldID(env, _cls, "jpegWidth", "I"));
   jpegWidth = (int)(*env)->GetIntField(env, obj, _fid);
-  bailif0(_fid = (*env)->GetFieldID(env, _cls, "jpegHeight", "I"));
+  BAILIF0(_fid = (*env)->GetFieldID(env, _cls, "jpegHeight", "I"));
   jpegHeight = (int)(*env)->GetIntField(env, obj, _fid);
 
   nc = (jpegSubsamp == org_libjpegturbo_turbojpeg_TJ_SAMP_GRAY ? 1 : 3);
@@ -776,7 +811,7 @@ JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompress
   if (height == 0) height = jpegHeight;
   sf = tjGetScalingFactors(&nsf);
   if (!sf || nsf < 1)
-    _throwarg(tjGetErrorStr());
+    THROW_ARG(tjGetErrorStr());
   for (i = 0; i < nsf; i++) {
     scaledWidth = TJSCALED(jpegWidth, sf[i]);
     scaledHeight = TJSCALED(jpegHeight, sf[i]);
@@ -784,49 +819,59 @@ JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompress
       break;
   }
   if (i >= nsf)
-    _throwarg("Could not scale down to desired image dimensions");
+    THROW_ARG("Could not scale down to desired image dimensions");
+
+#define RELEASE_ARRAYS_DECOMPRESSTOYUV() { \
+  SAFE_RELEASE(src, jpegBuf); \
+  for (i = 0; i < nc; i++) \
+    SAFE_RELEASE(jDstPlanes[i], dstPlanes[i]); \
+  SAFE_RELEASE(jDstStrides, dstStrides); \
+  SAFE_RELEASE(jDstOffsets, dstOffsets); \
+}
 
-  bailif0(dstOffsets = (*env)->GetPrimitiveArrayCritical(env, jDstOffsets, 0));
-  bailif0(dstStrides = (*env)->GetPrimitiveArrayCritical(env, jDstStrides, 0));
+  BAILIF0(dstOffsets = (*env)->GetPrimitiveArrayCritical(env, jDstOffsets, 0));
+  BAILIF0(dstStrides = (*env)->GetPrimitiveArrayCritical(env, jDstStrides, 0));
   for (i = 0; i < nc; i++) {
     int planeSize = tjPlaneSizeYUV(i, scaledWidth, dstStrides[i], scaledHeight,
                                    jpegSubsamp);
     int pw = tjPlaneWidth(i, scaledWidth, jpegSubsamp);
 
-    if (planeSize < 0 || pw < 0)
-      _throwarg(tjGetErrorStr());
+    if (planeSize < 0 || pw < 0) {
+      RELEASE_ARRAYS_DECOMPRESSTOYUV();
+      THROW_ARG(tjGetErrorStr());
+    }
 
-    if (dstOffsets[i] < 0)
-      _throwarg("Invalid argument in decompressToYUV()");
-    if (dstStrides[i] < 0 && dstOffsets[i] - planeSize + pw < 0)
-      _throwarg("Negative plane stride would cause memory to be accessed below plane boundary");
+    if (dstOffsets[i] < 0) {
+      RELEASE_ARRAYS_DECOMPRESSTOYUV();
+      THROW_ARG("Invalid argument in decompressToYUV()");
+    }
+    if (dstStrides[i] < 0 && dstOffsets[i] - planeSize + pw < 0) {
+      RELEASE_ARRAYS_DECOMPRESSTOYUV();
+      THROW_ARG("Negative plane stride would cause memory to be accessed below plane boundary");
+    }
 
-    bailif0(jDstPlanes[i] = (*env)->GetObjectArrayElement(env, dstobjs, i));
-    if ((*env)->GetArrayLength(env, jDstPlanes[i]) < dstOffsets[i] + planeSize)
-      _throwarg("Destination plane is not large enough");
+    BAILIF0(jDstPlanes[i] = (*env)->GetObjectArrayElement(env, dstobjs, i));
+    if ((*env)->GetArrayLength(env, jDstPlanes[i]) <
+        dstOffsets[i] + planeSize) {
+      RELEASE_ARRAYS_DECOMPRESSTOYUV();
+      THROW_ARG("Destination plane is not large enough");
+    }
 
-    bailif0(dstPlanes[i] =
+    BAILIF0(dstPlanes[i] =
             (*env)->GetPrimitiveArrayCritical(env, jDstPlanes[i], 0));
     dstPlanes[i] = &dstPlanes[i][dstOffsets[i]];
   }
-  bailif0(jpegBuf = (*env)->GetPrimitiveArrayCritical(env, src, 0));
+  BAILIF0(jpegBuf = (*env)->GetPrimitiveArrayCritical(env, src, 0));
 
   if (tjDecompressToYUVPlanes(handle, jpegBuf, (unsigned long)jpegSize,
                               dstPlanes, desiredWidth, dstStrides,
-                              desiredHeight, flags) == -1)
-    _throwtj();
+                              desiredHeight, flags) == -1) {
+    RELEASE_ARRAYS_DECOMPRESSTOYUV();
+    THROW_TJ();
+  }
 
 bailout:
-  if (jpegBuf) (*env)->ReleasePrimitiveArrayCritical(env, src, jpegBuf, 0);
-  for (i = 0; i < nc; i++) {
-    if (dstPlanes[i] && jDstPlanes[i])
-      (*env)->ReleasePrimitiveArrayCritical(env, jDstPlanes[i], dstPlanes[i],
-                                            0);
-  }
-  if (dstStrides)
-    (*env)->ReleasePrimitiveArrayCritical(env, jDstStrides, dstStrides, 0);
-  if (dstOffsets)
-    (*env)->ReleasePrimitiveArrayCritical(env, jDstOffsets, dstOffsets, 0);
+  RELEASE_ARRAYS_DECOMPRESSTOYUV();
 }
 
 /* TurboJPEG 1.2.x: TJDecompressor::decompressToYUV() */
@@ -838,30 +883,33 @@ JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompress
   unsigned char *jpegBuf = NULL, *dstBuf = NULL;
   int jpegSubsamp = -1, jpegWidth = 0, jpegHeight = 0;
 
-  gethandle();
+  GET_HANDLE();
 
   if ((*env)->GetArrayLength(env, src) < jpegSize)
-    _throwarg("Source buffer is not large enough");
-  bailif0(_fid = (*env)->GetFieldID(env, _cls, "jpegSubsamp", "I"));
+    THROW_ARG("Source buffer is not large enough");
+  BAILIF0(_fid = (*env)->GetFieldID(env, _cls, "jpegSubsamp", "I"));
   jpegSubsamp = (int)(*env)->GetIntField(env, obj, _fid);
-  bailif0(_fid = (*env)->GetFieldID(env, _cls, "jpegWidth", "I"));
+  BAILIF0(_fid = (*env)->GetFieldID(env, _cls, "jpegWidth", "I"));
   jpegWidth = (int)(*env)->GetIntField(env, obj, _fid);
-  bailif0(_fid = (*env)->GetFieldID(env, _cls, "jpegHeight", "I"));
+  BAILIF0(_fid = (*env)->GetFieldID(env, _cls, "jpegHeight", "I"));
   jpegHeight = (int)(*env)->GetIntField(env, obj, _fid);
   if ((*env)->GetArrayLength(env, dst) <
       (jsize)tjBufSizeYUV(jpegWidth, jpegHeight, jpegSubsamp))
-    _throwarg("Destination buffer is not large enough");
+    THROW_ARG("Destination buffer is not large enough");
 
-  bailif0(jpegBuf = (*env)->GetPrimitiveArrayCritical(env, src, 0));
-  bailif0(dstBuf = (*env)->GetPrimitiveArrayCritical(env, dst, 0));
+  BAILIF0(jpegBuf = (*env)->GetPrimitiveArrayCritical(env, src, 0));
+  BAILIF0(dstBuf = (*env)->GetPrimitiveArrayCritical(env, dst, 0));
 
   if (tjDecompressToYUV(handle, jpegBuf, (unsigned long)jpegSize, dstBuf,
-                        flags) == -1)
-    _throwtj();
+                        flags) == -1) {
+    SAFE_RELEASE(dst, dstBuf);
+    SAFE_RELEASE(src, jpegBuf);
+    THROW_TJ();
+  }
 
 bailout:
-  if (dstBuf) (*env)->ReleasePrimitiveArrayCritical(env, dst, dstBuf, 0);
-  if (jpegBuf) (*env)->ReleasePrimitiveArrayCritical(env, src, jpegBuf, 0);
+  SAFE_RELEASE(dst, dstBuf);
+  SAFE_RELEASE(src, jpegBuf);
 }
 
 static void TJDecompressor_decodeYUV
@@ -877,67 +925,77 @@ static void TJDecompressor_decodeYUV
   int *srcOffsets = NULL, *srcStrides = NULL;
   int nc = (subsamp == org_libjpegturbo_turbojpeg_TJ_SAMP_GRAY ? 1 : 3), i;
 
-  gethandle();
+  GET_HANDLE();
 
   if (pf < 0 || pf >= org_libjpegturbo_turbojpeg_TJ_NUMPF || subsamp < 0 ||
       subsamp >= org_libjpegturbo_turbojpeg_TJ_NUMSAMP)
-    _throwarg("Invalid argument in decodeYUV()");
+    THROW_ARG("Invalid argument in decodeYUV()");
   if (org_libjpegturbo_turbojpeg_TJ_NUMPF != TJ_NUMPF ||
       org_libjpegturbo_turbojpeg_TJ_NUMSAMP != TJ_NUMSAMP)
-    _throwarg("Mismatch between Java and C API");
+    THROW_ARG("Mismatch between Java and C API");
 
   if ((*env)->GetArrayLength(env, srcobjs) < nc)
-    _throwarg("Planes array is too small for the subsampling type");
+    THROW_ARG("Planes array is too small for the subsampling type");
   if ((*env)->GetArrayLength(env, jSrcOffsets) < nc)
-    _throwarg("Offsets array is too small for the subsampling type");
+    THROW_ARG("Offsets array is too small for the subsampling type");
   if ((*env)->GetArrayLength(env, jSrcStrides) < nc)
-    _throwarg("Strides array is too small for the subsampling type");
+    THROW_ARG("Strides array is too small for the subsampling type");
 
   actualPitch = (pitch == 0) ? width * tjPixelSize[pf] : pitch;
   arraySize = (y + height - 1) * actualPitch + (x + width) * tjPixelSize[pf];
   if ((*env)->GetArrayLength(env, dst) * dstElementSize < arraySize)
-    _throwarg("Destination buffer is not large enough");
+    THROW_ARG("Destination buffer is not large enough");
+
+#define RELEASE_ARRAYS_DECODEYUV() { \
+  SAFE_RELEASE(dst, dstBuf); \
+  for (i = 0; i < nc; i++) \
+    SAFE_RELEASE(jSrcPlanes[i], srcPlanes[i]); \
+  SAFE_RELEASE(jSrcStrides, srcStrides); \
+  SAFE_RELEASE(jSrcOffsets, srcOffsets); \
+}
 
-  bailif0(srcOffsets = (*env)->GetPrimitiveArrayCritical(env, jSrcOffsets, 0));
-  bailif0(srcStrides = (*env)->GetPrimitiveArrayCritical(env, jSrcStrides, 0));
+  BAILIF0(srcOffsets = (*env)->GetPrimitiveArrayCritical(env, jSrcOffsets, 0));
+  BAILIF0(srcStrides = (*env)->GetPrimitiveArrayCritical(env, jSrcStrides, 0));
   for (i = 0; i < nc; i++) {
     int planeSize = tjPlaneSizeYUV(i, width, srcStrides[i], height, subsamp);
     int pw = tjPlaneWidth(i, width, subsamp);
 
-    if (planeSize < 0 || pw < 0)
-      _throwarg(tjGetErrorStr());
+    if (planeSize < 0 || pw < 0) {
+      RELEASE_ARRAYS_DECODEYUV();
+      THROW_ARG(tjGetErrorStr());
+    }
 
-    if (srcOffsets[i] < 0)
-      _throwarg("Invalid argument in decodeYUV()");
-    if (srcStrides[i] < 0 && srcOffsets[i] - planeSize + pw < 0)
-      _throwarg("Negative plane stride would cause memory to be accessed below plane boundary");
+    if (srcOffsets[i] < 0) {
+      RELEASE_ARRAYS_DECODEYUV();
+      THROW_ARG("Invalid argument in decodeYUV()");
+    }
+    if (srcStrides[i] < 0 && srcOffsets[i] - planeSize + pw < 0) {
+      RELEASE_ARRAYS_DECODEYUV();
+      THROW_ARG("Negative plane stride would cause memory to be accessed below plane boundary");
+    }
 
-    bailif0(jSrcPlanes[i] = (*env)->GetObjectArrayElement(env, srcobjs, i));
-    if ((*env)->GetArrayLength(env, jSrcPlanes[i]) < srcOffsets[i] + planeSize)
-      _throwarg("Source plane is not large enough");
+    BAILIF0(jSrcPlanes[i] = (*env)->GetObjectArrayElement(env, srcobjs, i));
+    if ((*env)->GetArrayLength(env, jSrcPlanes[i]) <
+        srcOffsets[i] + planeSize) {
+      RELEASE_ARRAYS_DECODEYUV();
+      THROW_ARG("Source plane is not large enough");
+    }
 
-    bailif0(srcPlanes[i] =
+    BAILIF0(srcPlanes[i] =
             (*env)->GetPrimitiveArrayCritical(env, jSrcPlanes[i], 0));
     srcPlanes[i] = &srcPlanes[i][srcOffsets[i]];
   }
-  bailif0(dstBuf = (*env)->GetPrimitiveArrayCritical(env, dst, 0));
+  BAILIF0(dstBuf = (*env)->GetPrimitiveArrayCritical(env, dst, 0));
 
   if (tjDecodeYUVPlanes(handle, srcPlanes, srcStrides, subsamp,
                         &dstBuf[y * actualPitch + x * tjPixelSize[pf]], width,
-                        pitch, height, pf, flags) == -1)
-    _throwtj();
+                        pitch, height, pf, flags) == -1) {
+    RELEASE_ARRAYS_DECODEYUV();
+    THROW_TJ();
+  }
 
 bailout:
-  if (dstBuf) (*env)->ReleasePrimitiveArrayCritical(env, dst, dstBuf, 0);
-  for (i = 0; i < nc; i++) {
-    if (srcPlanes[i] && jSrcPlanes[i])
-      (*env)->ReleasePrimitiveArrayCritical(env, jSrcPlanes[i],
-                                            (unsigned char *)srcPlanes[i], 0);
-  }
-  if (srcStrides)
-    (*env)->ReleasePrimitiveArrayCritical(env, jSrcStrides, srcStrides, 0);
-  if (srcOffsets)
-    (*env)->ReleasePrimitiveArrayCritical(env, jSrcOffsets, srcOffsets, 0);
+  RELEASE_ARRAYS_DECODEYUV();
 }
 
 /* TurboJPEG 1.4.x: TJDecompressor::decodeYUV() byte destination */
@@ -958,9 +1016,9 @@ JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decodeYUV_
    jint width, jint stride, jint height, jint pf, jint flags)
 {
   if (pf < 0 || pf >= org_libjpegturbo_turbojpeg_TJ_NUMPF)
-    _throwarg("Invalid argument in decodeYUV()");
+    THROW_ARG("Invalid argument in decodeYUV()");
   if (tjPixelSize[pf] != sizeof(jint))
-    _throwarg("Pixel format must be 32-bit when decoding to an integer buffer.");
+    THROW_ARG("Pixel format must be 32-bit when decoding to an integer buffer.");
 
   TJDecompressor_decodeYUV(env, obj, srcobjs, jSrcOffsets, jSrcStrides,
                            subsamp, dst, sizeof(jint), x, y, width,
@@ -979,10 +1037,10 @@ JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJTransformer_init
   tjhandle handle;
 
   if ((handle = tjInitTransform()) == NULL)
-    _throw(tjGetErrorStr(), "org/libjpegturbo/turbojpeg/TJException");
+    THROW(tjGetErrorStr(), "org/libjpegturbo/turbojpeg/TJException");
 
-  bailif0(cls = (*env)->GetObjectClass(env, obj));
-  bailif0(fid = (*env)->GetFieldID(env, cls, "handle", "J"));
+  BAILIF0(cls = (*env)->GetObjectClass(env, obj));
+  BAILIF0(fid = (*env)->GetFieldID(env, cls, "handle", "J"));
   (*env)->SetLongField(env, obj, fid, (size_t)handle);
 
 bailout:
@@ -1007,43 +1065,43 @@ static int JNICustomFilter(short *coeffs, tjregion arrayRegion,
   jmethodID mid;
   jfieldID fid;
 
-  bailif0(bufobj = (*env)->NewDirectByteBuffer(env, coeffs,
+  BAILIF0(bufobj = (*env)->NewDirectByteBuffer(env, coeffs,
     sizeof(short) * arrayRegion.w * arrayRegion.h));
-  bailif0(cls = (*env)->FindClass(env, "java/nio/ByteOrder"));
-  bailif0(mid = (*env)->GetStaticMethodID(env, cls, "nativeOrder",
+  BAILIF0(cls = (*env)->FindClass(env, "java/nio/ByteOrder"));
+  BAILIF0(mid = (*env)->GetStaticMethodID(env, cls, "nativeOrder",
                                           "()Ljava/nio/ByteOrder;"));
-  bailif0(borobj = (*env)->CallStaticObjectMethod(env, cls, mid));
-  bailif0(cls = (*env)->GetObjectClass(env, bufobj));
-  bailif0(mid = (*env)->GetMethodID(env, cls, "order",
+  BAILIF0(borobj = (*env)->CallStaticObjectMethod(env, cls, mid));
+  BAILIF0(cls = (*env)->GetObjectClass(env, bufobj));
+  BAILIF0(mid = (*env)->GetMethodID(env, cls, "order",
     "(Ljava/nio/ByteOrder;)Ljava/nio/ByteBuffer;"));
   (*env)->CallObjectMethod(env, bufobj, mid, borobj);
-  bailif0(mid = (*env)->GetMethodID(env, cls, "asShortBuffer",
+  BAILIF0(mid = (*env)->GetMethodID(env, cls, "asShortBuffer",
                                     "()Ljava/nio/ShortBuffer;"));
-  bailif0(bufobj = (*env)->CallObjectMethod(env, bufobj, mid));
+  BAILIF0(bufobj = (*env)->CallObjectMethod(env, bufobj, mid));
 
-  bailif0(cls = (*env)->FindClass(env, "java/awt/Rectangle"));
-  bailif0(arrayRegionObj = (*env)->AllocObject(env, cls));
-  bailif0(fid = (*env)->GetFieldID(env, cls, "x", "I"));
+  BAILIF0(cls = (*env)->FindClass(env, "java/awt/Rectangle"));
+  BAILIF0(arrayRegionObj = (*env)->AllocObject(env, cls));
+  BAILIF0(fid = (*env)->GetFieldID(env, cls, "x", "I"));
   (*env)->SetIntField(env, arrayRegionObj, fid, arrayRegion.x);
-  bailif0(fid = (*env)->GetFieldID(env, cls, "y", "I"));
+  BAILIF0(fid = (*env)->GetFieldID(env, cls, "y", "I"));
   (*env)->SetIntField(env, arrayRegionObj, fid, arrayRegion.y);
-  bailif0(fid = (*env)->GetFieldID(env, cls, "width", "I"));
+  BAILIF0(fid = (*env)->GetFieldID(env, cls, "width", "I"));
   (*env)->SetIntField(env, arrayRegionObj, fid, arrayRegion.w);
-  bailif0(fid = (*env)->GetFieldID(env, cls, "height", "I"));
+  BAILIF0(fid = (*env)->GetFieldID(env, cls, "height", "I"));
   (*env)->SetIntField(env, arrayRegionObj, fid, arrayRegion.h);
 
-  bailif0(planeRegionObj = (*env)->AllocObject(env, cls));
-  bailif0(fid = (*env)->GetFieldID(env, cls, "x", "I"));
+  BAILIF0(planeRegionObj = (*env)->AllocObject(env, cls));
+  BAILIF0(fid = (*env)->GetFieldID(env, cls, "x", "I"));
   (*env)->SetIntField(env, planeRegionObj, fid, planeRegion.x);
-  bailif0(fid = (*env)->GetFieldID(env, cls, "y", "I"));
+  BAILIF0(fid = (*env)->GetFieldID(env, cls, "y", "I"));
   (*env)->SetIntField(env, planeRegionObj, fid, planeRegion.y);
-  bailif0(fid = (*env)->GetFieldID(env, cls, "width", "I"));
+  BAILIF0(fid = (*env)->GetFieldID(env, cls, "width", "I"));
   (*env)->SetIntField(env, planeRegionObj, fid, planeRegion.w);
-  bailif0(fid = (*env)->GetFieldID(env, cls, "height", "I"));
+  BAILIF0(fid = (*env)->GetFieldID(env, cls, "height", "I"));
   (*env)->SetIntField(env, planeRegionObj, fid, planeRegion.h);
 
-  bailif0(cls = (*env)->GetObjectClass(env, cfobj));
-  bailif0(mid = (*env)->GetMethodID(env, cls, "customFilter",
+  BAILIF0(cls = (*env)->GetObjectClass(env, cfobj));
+  BAILIF0(mid = (*env)->GetMethodID(env, cls, "customFilter",
     "(Ljava/nio/ShortBuffer;Ljava/awt/Rectangle;Ljava/awt/Rectangle;IILorg/libjpegturbo/turbojpeg/TJTransform;)V"));
   (*env)->CallVoidMethod(env, cfobj, mid, bufobj, arrayRegionObj,
                          planeRegionObj, componentIndex, transformIndex, tobj);
@@ -1070,33 +1128,33 @@ JNIEXPORT jintArray JNICALL Java_org_libjpegturbo_turbojpeg_TJTransformer_transf
   jint *dstSizesi = NULL;
   JNICustomFilterParams *params = NULL;
 
-  gethandle();
+  GET_HANDLE();
 
   if ((*env)->GetArrayLength(env, jsrcBuf) < jpegSize)
-    _throwarg("Source buffer is not large enough");
-  bailif0(_fid = (*env)->GetFieldID(env, _cls, "jpegWidth", "I"));
+    THROW_ARG("Source buffer is not large enough");
+  BAILIF0(_fid = (*env)->GetFieldID(env, _cls, "jpegWidth", "I"));
   jpegWidth = (int)(*env)->GetIntField(env, obj, _fid);
-  bailif0(_fid = (*env)->GetFieldID(env, _cls, "jpegHeight", "I"));
+  BAILIF0(_fid = (*env)->GetFieldID(env, _cls, "jpegHeight", "I"));
   jpegHeight = (int)(*env)->GetIntField(env, obj, _fid);
-  bailif0(_fid = (*env)->GetFieldID(env, _cls, "jpegSubsamp", "I"));
+  BAILIF0(_fid = (*env)->GetFieldID(env, _cls, "jpegSubsamp", "I"));
   jpegSubsamp = (int)(*env)->GetIntField(env, obj, _fid);
 
   n = (*env)->GetArrayLength(env, dstobjs);
   if (n != (*env)->GetArrayLength(env, tobjs))
-    _throwarg("Mismatch between size of transforms array and destination buffers array");
+    THROW_ARG("Mismatch between size of transforms array and destination buffers array");
 
   if ((dstBufs =
        (unsigned char **)malloc(sizeof(unsigned char *) * n)) == NULL)
-    _throwmem();
+    THROW_MEM();
   if ((jdstBufs = (jbyteArray *)malloc(sizeof(jbyteArray) * n)) == NULL)
-    _throwmem();
+    THROW_MEM();
   if ((dstSizes = (unsigned long *)malloc(sizeof(unsigned long) * n)) == NULL)
-    _throwmem();
+    THROW_MEM();
   if ((t = (tjtransform *)malloc(sizeof(tjtransform) * n)) == NULL)
-    _throwmem();
+    THROW_MEM();
   if ((params = (JNICustomFilterParams *)malloc(sizeof(JNICustomFilterParams) *
                                                 n)) == NULL)
-    _throwmem();
+    THROW_MEM();
   for (i = 0; i < n; i++) {
     dstBufs[i] = NULL;  jdstBufs[i] = NULL;  dstSizes[i] = 0;
     memset(&t[i], 0, sizeof(tjtransform));
@@ -1106,22 +1164,22 @@ JNIEXPORT jintArray JNICALL Java_org_libjpegturbo_turbojpeg_TJTransformer_transf
   for (i = 0; i < n; i++) {
     jobject tobj, cfobj;
 
-    bailif0(tobj = (*env)->GetObjectArrayElement(env, tobjs, i));
-    bailif0(_cls = (*env)->GetObjectClass(env, tobj));
-    bailif0(_fid = (*env)->GetFieldID(env, _cls, "op", "I"));
+    BAILIF0(tobj = (*env)->GetObjectArrayElement(env, tobjs, i));
+    BAILIF0(_cls = (*env)->GetObjectClass(env, tobj));
+    BAILIF0(_fid = (*env)->GetFieldID(env, _cls, "op", "I"));
     t[i].op = (*env)->GetIntField(env, tobj, _fid);
-    bailif0(_fid = (*env)->GetFieldID(env, _cls, "options", "I"));
+    BAILIF0(_fid = (*env)->GetFieldID(env, _cls, "options", "I"));
     t[i].options = (*env)->GetIntField(env, tobj, _fid);
-    bailif0(_fid = (*env)->GetFieldID(env, _cls, "x", "I"));
+    BAILIF0(_fid = (*env)->GetFieldID(env, _cls, "x", "I"));
     t[i].r.x = (*env)->GetIntField(env, tobj, _fid);
-    bailif0(_fid = (*env)->GetFieldID(env, _cls, "y", "I"));
+    BAILIF0(_fid = (*env)->GetFieldID(env, _cls, "y", "I"));
     t[i].r.y = (*env)->GetIntField(env, tobj, _fid);
-    bailif0(_fid = (*env)->GetFieldID(env, _cls, "width", "I"));
+    BAILIF0(_fid = (*env)->GetFieldID(env, _cls, "width", "I"));
     t[i].r.w = (*env)->GetIntField(env, tobj, _fid);
-    bailif0(_fid = (*env)->GetFieldID(env, _cls, "height", "I"));
+    BAILIF0(_fid = (*env)->GetFieldID(env, _cls, "height", "I"));
     t[i].r.h = (*env)->GetIntField(env, tobj, _fid);
 
-    bailif0(_fid = (*env)->GetFieldID(env, _cls, "cf",
+    BAILIF0(_fid = (*env)->GetFieldID(env, _cls, "cf",
       "Lorg/libjpegturbo/turbojpeg/TJCustomFilter;"));
     cfobj = (*env)->GetObjectField(env, tobj, _fid);
     if (cfobj) {
@@ -1138,29 +1196,30 @@ JNIEXPORT jintArray JNICALL Java_org_libjpegturbo_turbojpeg_TJTransformer_transf
 
     if (t[i].r.w != 0) w = t[i].r.w;
     if (t[i].r.h != 0) h = t[i].r.h;
-    bailif0(jdstBufs[i] = (*env)->GetObjectArrayElement(env, dstobjs, i));
+    BAILIF0(jdstBufs[i] = (*env)->GetObjectArrayElement(env, dstobjs, i));
     if ((unsigned long)(*env)->GetArrayLength(env, jdstBufs[i]) <
         tjBufSize(w, h, jpegSubsamp))
-      _throwarg("Destination buffer is not large enough");
+      THROW_ARG("Destination buffer is not large enough");
   }
-  bailif0(jpegBuf = (*env)->GetPrimitiveArrayCritical(env, jsrcBuf, 0));
+  BAILIF0(jpegBuf = (*env)->GetPrimitiveArrayCritical(env, jsrcBuf, 0));
   for (i = 0; i < n; i++)
-    bailif0(dstBufs[i] =
+    BAILIF0(dstBufs[i] =
             (*env)->GetPrimitiveArrayCritical(env, jdstBufs[i], 0));
 
   if (tjTransform(handle, jpegBuf, jpegSize, n, dstBufs, dstSizes, t,
-                  flags | TJFLAG_NOREALLOC) == -1)
-    _throwtj();
-
-  for (i = 0; i < n; i++) {
-    (*env)->ReleasePrimitiveArrayCritical(env, jdstBufs[i], dstBufs[i], 0);
-    dstBufs[i] = NULL;
+                  flags | TJFLAG_NOREALLOC) == -1) {
+    for (i = 0; i < n; i++)
+      SAFE_RELEASE(jdstBufs[i], dstBufs[i]);
+    SAFE_RELEASE(jsrcBuf, jpegBuf);
+    THROW_TJ();
   }
-  (*env)->ReleasePrimitiveArrayCritical(env, jsrcBuf, jpegBuf, 0);
-  jpegBuf = NULL;
+
+  for (i = 0; i < n; i++)
+    SAFE_RELEASE(jdstBufs[i], dstBufs[i]);
+  SAFE_RELEASE(jsrcBuf, jpegBuf);
 
   jdstSizes = (*env)->NewIntArray(env, n);
-  bailif0(dstSizesi = (*env)->GetIntArrayElements(env, jdstSizes, 0));
+  BAILIF0(dstSizesi = (*env)->GetIntArrayElements(env, jdstSizes, 0));
   for (i = 0; i < n; i++) dstSizesi[i] = (int)dstSizes[i];
 
 bailout:
@@ -1172,10 +1231,10 @@ bailout:
     }
     free(dstBufs);
   }
-  if (jpegBuf) (*env)->ReleasePrimitiveArrayCritical(env, jsrcBuf, jpegBuf, 0);
-  if (jdstBufs) free(jdstBufs);
-  if (dstSizes) free(dstSizes);
-  if (t) free(t);
+  SAFE_RELEASE(jsrcBuf, jpegBuf);
+  free(jdstBufs);
+  free(dstSizes);
+  free(t);
   return jdstSizes;
 }
 
diff --git a/turbojpeg.c b/turbojpeg.c
index 90a9ce6..e63d113 100644
--- a/turbojpeg.c
+++ b/turbojpeg.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C)2009-2018 D. R. Commander.  All Rights Reserved.
+ * Copyright (C)2009-2020 D. R. Commander.  All Rights Reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -43,6 +43,7 @@
 #include "transupp.h"
 #include "./jpegcomp.h"
 #include "./cdjpeg.h"
+#include "jconfigint.h"
 
 extern void jpeg_mem_dest_tj(j_compress_ptr, unsigned char **, unsigned long *,
                              boolean);
@@ -50,12 +51,12 @@ extern void jpeg_mem_src_tj(j_decompress_ptr, const unsigned char *,
                             unsigned long);
 
 #define PAD(v, p)  ((v + (p) - 1) & (~((p) - 1)))
-#define isPow2(x)  (((x) & (x - 1)) == 0)
+#define IS_POW2(x)  (((x) & (x - 1)) == 0)
 
 
 /* Error handling (based on example in example.txt) */
 
-static char errStr[JMSG_LENGTH_MAX] = "No error";
+static THREAD_LOCAL char errStr[JMSG_LENGTH_MAX] = "No error";
 
 struct my_error_mgr {
   struct jpeg_error_mgr pub;
@@ -164,20 +165,20 @@ static int cs2pf[JPEG_NUMCS] = {
   TJPF_UNKNOWN
 };
 
-#define _throwg(m) { \
+#define THROWG(m) { \
   snprintf(errStr, JMSG_LENGTH_MAX, "%s", m); \
   retval = -1;  goto bailout; \
 }
-#define _throwunix(m) { \
+#define THROW_UNIX(m) { \
   snprintf(errStr, JMSG_LENGTH_MAX, "%s\n%s", m, strerror(errno)); \
   retval = -1;  goto bailout; \
 }
-#define _throw(m) { \
+#define THROW(m) { \
   snprintf(this->errStr, JMSG_LENGTH_MAX, "%s", m); \
-  this->isInstanceError = TRUE;  _throwg(m) \
+  this->isInstanceError = TRUE;  THROWG(m) \
 }
 
-#define getinstance(handle) \
+#define GET_INSTANCE(handle) \
   tjinstance *this = (tjinstance *)handle; \
   j_compress_ptr cinfo = NULL; \
   j_decompress_ptr dinfo = NULL; \
@@ -190,7 +191,7 @@ static int cs2pf[JPEG_NUMCS] = {
   this->jerr.warning = FALSE; \
   this->isInstanceError = FALSE;
 
-#define getcinstance(handle) \
+#define GET_CINSTANCE(handle) \
   tjinstance *this = (tjinstance *)handle; \
   j_compress_ptr cinfo = NULL; \
   \
@@ -202,7 +203,7 @@ static int cs2pf[JPEG_NUMCS] = {
   this->jerr.warning = FALSE; \
   this->isInstanceError = FALSE;
 
-#define getdinstance(handle) \
+#define GET_DINSTANCE(handle) \
   tjinstance *this = (tjinstance *)handle; \
   j_decompress_ptr dinfo = NULL; \
   \
@@ -237,7 +238,9 @@ static int setCompDefaults(struct jpeg_compress_struct *cinfo, int pixelFormat,
                            int subsamp, int jpegQual, int flags)
 {
   int retval = 0;
+#ifndef NO_GETENV
   char *env = NULL;
+#endif
 
   cinfo->in_color_space = pf2cs[pixelFormat];
   cinfo->input_components = tjPixelSize[pixelFormat];
@@ -359,6 +362,23 @@ static int getSubsamp(j_decompress_ptr dinfo)
           retval = i;  break;
         }
       }
+      /* Handle 4:4:4 images whose sampling factors are specified in
+         non-standard ways. */
+      if (dinfo->comp_info[0].h_samp_factor *
+          dinfo->comp_info[0].v_samp_factor <=
+          D_MAX_BLOCKS_IN_MCU / pixelsize[i] && i == TJSAMP_444) {
+        int match = 0;
+        for (k = 1; k < dinfo->num_components; k++) {
+          if (dinfo->comp_info[k].h_samp_factor ==
+              dinfo->comp_info[0].h_samp_factor &&
+              dinfo->comp_info[k].v_samp_factor ==
+              dinfo->comp_info[0].v_samp_factor)
+            match++;
+          if (match == dinfo->num_components - 1) {
+            retval = i;  break;
+          }
+        }
+      }
     }
   }
   return retval;
@@ -396,7 +416,7 @@ DLLEXPORT int tjGetErrorCode(tjhandle handle)
 
 DLLEXPORT int tjDestroy(tjhandle handle)
 {
-  getinstance(handle);
+  GET_INSTANCE(handle);
 
   if (setjmp(this->jerr.setjmp_buffer)) return -1;
   if (this->init & COMPRESS) jpeg_destroy_compress(cinfo);
@@ -413,7 +433,7 @@ DLLEXPORT int tjDestroy(tjhandle handle)
 
 DLLEXPORT void tjFree(unsigned char *buf)
 {
-  if (buf) free(buf);
+  free(buf);
 }
 
 
@@ -443,7 +463,7 @@ static tjhandle _tjInitCompress(tjinstance *this)
 
   if (setjmp(this->jerr.setjmp_buffer)) {
     /* If we get here, the JPEG code has signaled an error. */
-    if (this) free(this);
+    free(this);
     return NULL;
   }
 
@@ -472,11 +492,11 @@ DLLEXPORT tjhandle tjInitCompress(void)
 
 DLLEXPORT unsigned long tjBufSize(int width, int height, int jpegSubsamp)
 {
-  unsigned long retval = 0;
+  unsigned long long retval = 0;
   int mcuw, mcuh, chromasf;
 
   if (width < 1 || height < 1 || jpegSubsamp < 0 || jpegSubsamp >= NUMSUBOPT)
-    _throwg("tjBufSize(): Invalid argument");
+    THROWG("tjBufSize(): Invalid argument");
 
   /* This allows for rare corner cases in which a JPEG image can actually be
      larger than the uncompressed input (we wouldn't mention it if it hadn't
@@ -484,36 +504,41 @@ DLLEXPORT unsigned long tjBufSize(int width, int height, int jpegSubsamp)
   mcuw = tjMCUWidth[jpegSubsamp];
   mcuh = tjMCUHeight[jpegSubsamp];
   chromasf = jpegSubsamp == TJSAMP_GRAY ? 0 : 4 * 64 / (mcuw * mcuh);
-  retval = PAD(width, mcuw) * PAD(height, mcuh) * (2 + chromasf) + 2048;
+  retval = PAD(width, mcuw) * PAD(height, mcuh) * (2ULL + chromasf) + 2048ULL;
+  if (retval > (unsigned long long)((unsigned long)-1))
+    THROWG("tjBufSize(): Image is too large");
 
 bailout:
-  return retval;
+  return (unsigned long)retval;
 }
 
 DLLEXPORT unsigned long TJBUFSIZE(int width, int height)
 {
-  unsigned long retval = 0;
+  unsigned long long retval = 0;
 
   if (width < 1 || height < 1)
-    _throwg("TJBUFSIZE(): Invalid argument");
+    THROWG("TJBUFSIZE(): Invalid argument");
 
   /* This allows for rare corner cases in which a JPEG image can actually be
      larger than the uncompressed input (we wouldn't mention it if it hadn't
      happened before.) */
-  retval = PAD(width, 16) * PAD(height, 16) * 6 + 2048;
+  retval = PAD(width, 16) * PAD(height, 16) * 6ULL + 2048ULL;
+  if (retval > (unsigned long long)((unsigned long)-1))
+    THROWG("TJBUFSIZE(): Image is too large");
 
 bailout:
-  return retval;
+  return (unsigned long)retval;
 }
 
 
 DLLEXPORT unsigned long tjBufSizeYUV2(int width, int pad, int height,
                                       int subsamp)
 {
-  int retval = 0, nc, i;
+  unsigned long long retval = 0;
+  int nc, i;
 
   if (subsamp < 0 || subsamp >= NUMSUBOPT)
-    _throwg("tjBufSizeYUV2(): Invalid argument");
+    THROWG("tjBufSizeYUV2(): Invalid argument");
 
   nc = (subsamp == TJSAMP_GRAY ? 1 : 3);
   for (i = 0; i < nc; i++) {
@@ -522,11 +547,13 @@ DLLEXPORT unsigned long tjBufSizeYUV2(int width, int pad, int height,
     int ph = tjPlaneHeight(i, height, subsamp);
 
     if (pw < 0 || ph < 0) return -1;
-    else retval += stride * ph;
+    else retval += (unsigned long long)stride * ph;
   }
+  if (retval > (unsigned long long)((unsigned long)-1))
+    THROWG("tjBufSizeYUV2(): Image is too large");
 
 bailout:
-  return retval;
+  return (unsigned long)retval;
 }
 
 DLLEXPORT unsigned long tjBufSizeYUV(int width, int height, int subsamp)
@@ -545,10 +572,10 @@ DLLEXPORT int tjPlaneWidth(int componentID, int width, int subsamp)
   int pw, nc, retval = 0;
 
   if (width < 1 || subsamp < 0 || subsamp >= TJ_NUMSAMP)
-    _throwg("tjPlaneWidth(): Invalid argument");
+    THROWG("tjPlaneWidth(): Invalid argument");
   nc = (subsamp == TJSAMP_GRAY ? 1 : 3);
   if (componentID < 0 || componentID >= nc)
-    _throwg("tjPlaneWidth(): Invalid argument");
+    THROWG("tjPlaneWidth(): Invalid argument");
 
   pw = PAD(width, tjMCUWidth[subsamp] / 8);
   if (componentID == 0)
@@ -566,10 +593,10 @@ DLLEXPORT int tjPlaneHeight(int componentID, int height, int subsamp)
   int ph, nc, retval = 0;
 
   if (height < 1 || subsamp < 0 || subsamp >= TJ_NUMSAMP)
-    _throwg("tjPlaneHeight(): Invalid argument");
+    THROWG("tjPlaneHeight(): Invalid argument");
   nc = (subsamp == TJSAMP_GRAY ? 1 : 3);
   if (componentID < 0 || componentID >= nc)
-    _throwg("tjPlaneHeight(): Invalid argument");
+    THROWG("tjPlaneHeight(): Invalid argument");
 
   ph = PAD(height, tjMCUHeight[subsamp] / 8);
   if (componentID == 0)
@@ -585,11 +612,11 @@ bailout:
 DLLEXPORT unsigned long tjPlaneSizeYUV(int componentID, int width, int stride,
                                        int height, int subsamp)
 {
-  unsigned long retval = 0;
+  unsigned long long retval = 0;
   int pw, ph;
 
   if (width < 1 || height < 1 || subsamp < 0 || subsamp >= NUMSUBOPT)
-    _throwg("tjPlaneSizeYUV(): Invalid argument");
+    THROWG("tjPlaneSizeYUV(): Invalid argument");
 
   pw = tjPlaneWidth(componentID, width, subsamp);
   ph = tjPlaneHeight(componentID, height, subsamp);
@@ -598,10 +625,12 @@ DLLEXPORT unsigned long tjPlaneSizeYUV(int componentID, int width, int stride,
   if (stride == 0) stride = pw;
   else stride = abs(stride);
 
-  retval = stride * (ph - 1) + pw;
+  retval = (unsigned long long)stride * (ph - 1) + pw;
+  if (retval > (unsigned long long)((unsigned long)-1))
+    THROWG("tjPlaneSizeYUV(): Image is too large");
 
 bailout:
-  return retval;
+  return (unsigned long)retval;
 }
 
 
@@ -613,21 +642,21 @@ DLLEXPORT int tjCompress2(tjhandle handle, const unsigned char *srcBuf,
   int i, retval = 0, alloc = 1;
   JSAMPROW *row_pointer = NULL;
 
-  getcinstance(handle)
+  GET_CINSTANCE(handle)
   this->jerr.stopOnWarning = (flags & TJFLAG_STOPONWARNING) ? TRUE : FALSE;
   if ((this->init & COMPRESS) == 0)
-    _throw("tjCompress2(): Instance has not been initialized for compression");
+    THROW("tjCompress2(): Instance has not been initialized for compression");
 
   if (srcBuf == NULL || width <= 0 || pitch < 0 || height <= 0 ||
       pixelFormat < 0 || pixelFormat >= TJ_NUMPF || jpegBuf == NULL ||
       jpegSize == NULL || jpegSubsamp < 0 || jpegSubsamp >= NUMSUBOPT ||
       jpegQual < 0 || jpegQual > 100)
-    _throw("tjCompress2(): Invalid argument");
+    THROW("tjCompress2(): Invalid argument");
 
   if (pitch == 0) pitch = width * tjPixelSize[pixelFormat];
 
   if ((row_pointer = (JSAMPROW *)malloc(sizeof(JSAMPROW) * height)) == NULL)
-    _throw("tjCompress2(): Memory allocation failure");
+    THROW("tjCompress2(): Memory allocation failure");
 
   if (setjmp(this->jerr.setjmp_buffer)) {
     /* If we get here, the JPEG code has signaled an error. */
@@ -653,9 +682,9 @@ DLLEXPORT int tjCompress2(tjhandle handle, const unsigned char *srcBuf,
   jpeg_start_compress(cinfo, TRUE);
   for (i = 0; i < height; i++) {
     if (flags & TJFLAG_BOTTOMUP)
-      row_pointer[i] = (JSAMPROW)&srcBuf[(height - i - 1) * pitch];
+      row_pointer[i] = (JSAMPROW)&srcBuf[(height - i - 1) * (size_t)pitch];
     else
-      row_pointer[i] = (JSAMPROW)&srcBuf[i * pitch];
+      row_pointer[i] = (JSAMPROW)&srcBuf[i * (size_t)pitch];
   }
   while (cinfo->next_scanline < cinfo->image_height)
     jpeg_write_scanlines(cinfo, &row_pointer[cinfo->next_scanline],
@@ -664,7 +693,7 @@ DLLEXPORT int tjCompress2(tjhandle handle, const unsigned char *srcBuf,
 
 bailout:
   if (cinfo->global_state > CSTATE_START) jpeg_abort_compress(cinfo);
-  if (row_pointer) free(row_pointer);
+  free(row_pointer);
   if (this->jerr.warning) retval = -1;
   this->jerr.stopOnWarning = FALSE;
   return retval;
@@ -706,7 +735,7 @@ DLLEXPORT int tjEncodeYUVPlanes(tjhandle handle, const unsigned char *srcBuf,
   JSAMPLE *ptr;
   jpeg_component_info *compptr;
 
-  getcinstance(handle);
+  GET_CINSTANCE(handle);
   this->jerr.stopOnWarning = (flags & TJFLAG_STOPONWARNING) ? TRUE : FALSE;
 
   for (i = 0; i < MAX_COMPONENTS; i++) {
@@ -715,17 +744,17 @@ DLLEXPORT int tjEncodeYUVPlanes(tjhandle handle, const unsigned char *srcBuf,
   }
 
   if ((this->init & COMPRESS) == 0)
-    _throw("tjEncodeYUVPlanes(): Instance has not been initialized for compression");
+    THROW("tjEncodeYUVPlanes(): Instance has not been initialized for compression");
 
   if (srcBuf == NULL || width <= 0 || pitch < 0 || height <= 0 ||
       pixelFormat < 0 || pixelFormat >= TJ_NUMPF || !dstPlanes ||
       !dstPlanes[0] || subsamp < 0 || subsamp >= NUMSUBOPT)
-    _throw("tjEncodeYUVPlanes(): Invalid argument");
+    THROW("tjEncodeYUVPlanes(): Invalid argument");
   if (subsamp != TJSAMP_GRAY && (!dstPlanes[1] || !dstPlanes[2]))
-    _throw("tjEncodeYUVPlanes(): Invalid argument");
+    THROW("tjEncodeYUVPlanes(): Invalid argument");
 
   if (pixelFormat == TJPF_CMYK)
-    _throw("tjEncodeYUVPlanes(): Cannot generate YUV images from CMYK pixels");
+    THROW("tjEncodeYUVPlanes(): Cannot generate YUV images from CMYK pixels");
 
   if (pitch == 0) pitch = width * tjPixelSize[pixelFormat];
 
@@ -750,7 +779,7 @@ DLLEXPORT int tjEncodeYUVPlanes(tjhandle handle, const unsigned char *srcBuf,
      to write the file headers, which could overflow the output buffer if the
      YUV image were very small. */
   if (cinfo->global_state != CSTATE_START)
-    _throw("tjEncodeYUVPlanes(): libjpeg API is in the wrong state");
+    THROW("tjEncodeYUVPlanes(): libjpeg API is in the wrong state");
   (*cinfo->err->reset_error_mgr) ((j_common_ptr)cinfo);
   jinit_c_master_control(cinfo, FALSE);
   jinit_color_converter(cinfo);
@@ -761,12 +790,12 @@ DLLEXPORT int tjEncodeYUVPlanes(tjhandle handle, const unsigned char *srcBuf,
   ph0 = PAD(height, cinfo->max_v_samp_factor);
 
   if ((row_pointer = (JSAMPROW *)malloc(sizeof(JSAMPROW) * ph0)) == NULL)
-    _throw("tjEncodeYUVPlanes(): Memory allocation failure");
+    THROW("tjEncodeYUVPlanes(): Memory allocation failure");
   for (i = 0; i < height; i++) {
     if (flags & TJFLAG_BOTTOMUP)
-      row_pointer[i] = (JSAMPROW)&srcBuf[(height - i - 1) * pitch];
+      row_pointer[i] = (JSAMPROW)&srcBuf[(height - i - 1) * (size_t)pitch];
     else
-      row_pointer[i] = (JSAMPROW)&srcBuf[i * pitch];
+      row_pointer[i] = (JSAMPROW)&srcBuf[i * (size_t)pitch];
   }
   if (height < ph0)
     for (i = height; i < ph0; i++) row_pointer[i] = row_pointer[height - 1];
@@ -778,11 +807,11 @@ DLLEXPORT int tjEncodeYUVPlanes(tjhandle handle, const unsigned char *srcBuf,
           compptr->h_samp_factor, 32) *
       cinfo->max_v_samp_factor + 32);
     if (!_tmpbuf[i])
-      _throw("tjEncodeYUVPlanes(): Memory allocation failure");
+      THROW("tjEncodeYUVPlanes(): Memory allocation failure");
     tmpbuf[i] =
       (JSAMPROW *)malloc(sizeof(JSAMPROW) * cinfo->max_v_samp_factor);
     if (!tmpbuf[i])
-      _throw("tjEncodeYUVPlanes(): Memory allocation failure");
+      THROW("tjEncodeYUVPlanes(): Memory allocation failure");
     for (row = 0; row < cinfo->max_v_samp_factor; row++) {
       unsigned char *_tmpbuf_aligned =
         (unsigned char *)PAD((size_t)_tmpbuf[i], 32);
@@ -795,10 +824,10 @@ DLLEXPORT int tjEncodeYUVPlanes(tjhandle handle, const unsigned char *srcBuf,
       (JSAMPLE *)malloc(PAD(compptr->width_in_blocks * DCTSIZE, 32) *
                         compptr->v_samp_factor + 32);
     if (!_tmpbuf2[i])
-      _throw("tjEncodeYUVPlanes(): Memory allocation failure");
+      THROW("tjEncodeYUVPlanes(): Memory allocation failure");
     tmpbuf2[i] = (JSAMPROW *)malloc(sizeof(JSAMPROW) * compptr->v_samp_factor);
     if (!tmpbuf2[i])
-      _throw("tjEncodeYUVPlanes(): Memory allocation failure");
+      THROW("tjEncodeYUVPlanes(): Memory allocation failure");
     for (row = 0; row < compptr->v_samp_factor; row++) {
       unsigned char *_tmpbuf2_aligned =
         (unsigned char *)PAD((size_t)_tmpbuf2[i], 32);
@@ -810,7 +839,7 @@ DLLEXPORT int tjEncodeYUVPlanes(tjhandle handle, const unsigned char *srcBuf,
     ph[i] = ph0 * compptr->v_samp_factor / cinfo->max_v_samp_factor;
     outbuf[i] = (JSAMPROW *)malloc(sizeof(JSAMPROW) * ph[i]);
     if (!outbuf[i])
-      _throw("tjEncodeYUVPlanes(): Memory allocation failure");
+      THROW("tjEncodeYUVPlanes(): Memory allocation failure");
     ptr = dstPlanes[i];
     for (row = 0; row < ph[i]; row++) {
       outbuf[i][row] = ptr;
@@ -838,13 +867,13 @@ DLLEXPORT int tjEncodeYUVPlanes(tjhandle handle, const unsigned char *srcBuf,
 
 bailout:
   if (cinfo->global_state > CSTATE_START) jpeg_abort_compress(cinfo);
-  if (row_pointer) free(row_pointer);
+  free(row_pointer);
   for (i = 0; i < MAX_COMPONENTS; i++) {
-    if (tmpbuf[i] != NULL) free(tmpbuf[i]);
-    if (_tmpbuf[i] != NULL) free(_tmpbuf[i]);
-    if (tmpbuf2[i] != NULL) free(tmpbuf2[i]);
-    if (_tmpbuf2[i] != NULL) free(_tmpbuf2[i]);
-    if (outbuf[i] != NULL) free(outbuf[i]);
+    free(tmpbuf[i]);
+    free(_tmpbuf[i]);
+    free(tmpbuf2[i]);
+    free(_tmpbuf2[i]);
+    free(outbuf[i]);
   }
   if (this->jerr.warning) retval = -1;
   this->jerr.stopOnWarning = FALSE;
@@ -860,12 +889,12 @@ DLLEXPORT int tjEncodeYUV3(tjhandle handle, const unsigned char *srcBuf,
   int pw0, ph0, strides[3], retval = -1;
   tjinstance *this = (tjinstance *)handle;
 
-  if (!this) _throwg("tjEncodeYUV3(): Invalid handle");
+  if (!this) THROWG("tjEncodeYUV3(): Invalid handle");
   this->isInstanceError = FALSE;
 
-  if (width <= 0 || height <= 0 || dstBuf == NULL || pad < 0 || !isPow2(pad) ||
-      subsamp < 0 || subsamp >= NUMSUBOPT)
-    _throw("tjEncodeYUV3(): Invalid argument");
+  if (width <= 0 || height <= 0 || dstBuf == NULL || pad < 0 ||
+      !IS_POW2(pad) || subsamp < 0 || subsamp >= NUMSUBOPT)
+    THROW("tjEncodeYUV3(): Invalid argument");
 
   pw0 = tjPlaneWidth(0, width, subsamp);
   ph0 = tjPlaneHeight(0, height, subsamp);
@@ -922,7 +951,7 @@ DLLEXPORT int tjCompressFromYUVPlanes(tjhandle handle,
   JSAMPLE *_tmpbuf = NULL, *ptr;
   JSAMPROW *inbuf[MAX_COMPONENTS], *tmpbuf[MAX_COMPONENTS];
 
-  getcinstance(handle)
+  GET_CINSTANCE(handle)
   this->jerr.stopOnWarning = (flags & TJFLAG_STOPONWARNING) ? TRUE : FALSE;
 
   for (i = 0; i < MAX_COMPONENTS; i++) {
@@ -930,14 +959,14 @@ DLLEXPORT int tjCompressFromYUVPlanes(tjhandle handle,
   }
 
   if ((this->init & COMPRESS) == 0)
-    _throw("tjCompressFromYUVPlanes(): Instance has not been initialized for compression");
+    THROW("tjCompressFromYUVPlanes(): Instance has not been initialized for compression");
 
   if (!srcPlanes || !srcPlanes[0] || width <= 0 || height <= 0 ||
       subsamp < 0 || subsamp >= NUMSUBOPT || jpegBuf == NULL ||
       jpegSize == NULL || jpegQual < 0 || jpegQual > 100)
-    _throw("tjCompressFromYUVPlanes(): Invalid argument");
+    THROW("tjCompressFromYUVPlanes(): Invalid argument");
   if (subsamp != TJSAMP_GRAY && (!srcPlanes[1] || !srcPlanes[2]))
-    _throw("tjCompressFromYUVPlanes(): Invalid argument");
+    THROW("tjCompressFromYUVPlanes(): Invalid argument");
 
   if (setjmp(this->jerr.setjmp_buffer)) {
     /* If we get here, the JPEG code has signaled an error. */
@@ -976,7 +1005,7 @@ DLLEXPORT int tjCompressFromYUVPlanes(tjhandle handle,
     th[i] = compptr->v_samp_factor * DCTSIZE;
     tmpbufsize += iw[i] * th[i];
     if ((inbuf[i] = (JSAMPROW *)malloc(sizeof(JSAMPROW) * ph[i])) == NULL)
-      _throw("tjCompressFromYUVPlanes(): Memory allocation failure");
+      THROW("tjCompressFromYUVPlanes(): Memory allocation failure");
     ptr = (JSAMPLE *)srcPlanes[i];
     for (row = 0; row < ph[i]; row++) {
       inbuf[i][row] = ptr;
@@ -985,11 +1014,11 @@ DLLEXPORT int tjCompressFromYUVPlanes(tjhandle handle,
   }
   if (usetmpbuf) {
     if ((_tmpbuf = (JSAMPLE *)malloc(sizeof(JSAMPLE) * tmpbufsize)) == NULL)
-      _throw("tjCompressFromYUVPlanes(): Memory allocation failure");
+      THROW("tjCompressFromYUVPlanes(): Memory allocation failure");
     ptr = _tmpbuf;
     for (i = 0; i < cinfo->num_components; i++) {
       if ((tmpbuf[i] = (JSAMPROW *)malloc(sizeof(JSAMPROW) * th[i])) == NULL)
-        _throw("tjCompressFromYUVPlanes(): Memory allocation failure");
+        THROW("tjCompressFromYUVPlanes(): Memory allocation failure");
       for (row = 0; row < th[i]; row++) {
         tmpbuf[i][row] = ptr;
         ptr += iw[i];
@@ -1034,10 +1063,10 @@ DLLEXPORT int tjCompressFromYUVPlanes(tjhandle handle,
 bailout:
   if (cinfo->global_state > CSTATE_START) jpeg_abort_compress(cinfo);
   for (i = 0; i < MAX_COMPONENTS; i++) {
-    if (tmpbuf[i]) free(tmpbuf[i]);
-    if (inbuf[i]) free(inbuf[i]);
+    free(tmpbuf[i]);
+    free(inbuf[i]);
   }
-  if (_tmpbuf) free(_tmpbuf);
+  free(_tmpbuf);
   if (this->jerr.warning) retval = -1;
   this->jerr.stopOnWarning = FALSE;
   return retval;
@@ -1053,12 +1082,12 @@ DLLEXPORT int tjCompressFromYUV(tjhandle handle, const unsigned char *srcBuf,
   int pw0, ph0, strides[3], retval = -1;
   tjinstance *this = (tjinstance *)handle;
 
-  if (!this) _throwg("tjCompressFromYUV(): Invalid handle");
+  if (!this) THROWG("tjCompressFromYUV(): Invalid handle");
   this->isInstanceError = FALSE;
 
   if (srcBuf == NULL || width <= 0 || pad < 1 || height <= 0 || subsamp < 0 ||
       subsamp >= NUMSUBOPT)
-    _throw("tjCompressFromYUV(): Invalid argument");
+    THROW("tjCompressFromYUV(): Invalid argument");
 
   pw0 = tjPlaneWidth(0, width, subsamp);
   ph0 = tjPlaneHeight(0, height, subsamp);
@@ -1102,7 +1131,7 @@ static tjhandle _tjInitDecompress(tjinstance *this)
 
   if (setjmp(this->jerr.setjmp_buffer)) {
     /* If we get here, the JPEG code has signaled an error. */
-    if (this) free(this);
+    free(this);
     return NULL;
   }
 
@@ -1137,13 +1166,13 @@ DLLEXPORT int tjDecompressHeader3(tjhandle handle,
 {
   int retval = 0;
 
-  getdinstance(handle);
+  GET_DINSTANCE(handle);
   if ((this->init & DECOMPRESS) == 0)
-    _throw("tjDecompressHeader3(): Instance has not been initialized for decompression");
+    THROW("tjDecompressHeader3(): Instance has not been initialized for decompression");
 
   if (jpegBuf == NULL || jpegSize <= 0 || width == NULL || height == NULL ||
       jpegSubsamp == NULL || jpegColorspace == NULL)
-    _throw("tjDecompressHeader3(): Invalid argument");
+    THROW("tjDecompressHeader3(): Invalid argument");
 
   if (setjmp(this->jerr.setjmp_buffer)) {
     /* If we get here, the JPEG code has signaled an error. */
@@ -1168,11 +1197,11 @@ DLLEXPORT int tjDecompressHeader3(tjhandle handle,
   jpeg_abort_decompress(dinfo);
 
   if (*jpegSubsamp < 0)
-    _throw("tjDecompressHeader3(): Could not determine subsampling type for JPEG image");
+    THROW("tjDecompressHeader3(): Could not determine subsampling type for JPEG image");
   if (*jpegColorspace < 0)
-    _throw("tjDecompressHeader3(): Could not determine colorspace of JPEG image");
+    THROW("tjDecompressHeader3(): Could not determine colorspace of JPEG image");
   if (*width < 1 || *height < 1)
-    _throw("tjDecompressHeader3(): Invalid data returned in header");
+    THROW("tjDecompressHeader3(): Invalid data returned in header");
 
 bailout:
   if (this->jerr.warning) retval = -1;
@@ -1221,14 +1250,14 @@ DLLEXPORT int tjDecompress2(tjhandle handle, const unsigned char *jpegBuf,
   JSAMPROW *row_pointer = NULL;
   int i, retval = 0, jpegwidth, jpegheight, scaledw, scaledh;
 
-  getdinstance(handle);
+  GET_DINSTANCE(handle);
   this->jerr.stopOnWarning = (flags & TJFLAG_STOPONWARNING) ? TRUE : FALSE;
   if ((this->init & DECOMPRESS) == 0)
-    _throw("tjDecompress2(): Instance has not been initialized for decompression");
+    THROW("tjDecompress2(): Instance has not been initialized for decompression");
 
   if (jpegBuf == NULL || jpegSize <= 0 || dstBuf == NULL || width < 0 ||
       pitch < 0 || height < 0 || pixelFormat < 0 || pixelFormat >= TJ_NUMPF)
-    _throw("tjDecompress2(): Invalid argument");
+    THROW("tjDecompress2(): Invalid argument");
 
 #ifndef NO_PUTENV
   if (flags & TJFLAG_FORCEMMX) putenv("JSIMD_FORCEMMX=1");
@@ -1257,7 +1286,7 @@ DLLEXPORT int tjDecompress2(tjhandle handle, const unsigned char *jpegBuf,
       break;
   }
   if (i >= NUMSF)
-    _throw("tjDecompress2(): Could not scale down to desired image dimensions");
+    THROW("tjDecompress2(): Could not scale down to desired image dimensions");
   width = scaledw;  height = scaledh;
   dinfo->scale_num = sf[i].num;
   dinfo->scale_denom = sf[i].denom;
@@ -1267,16 +1296,16 @@ DLLEXPORT int tjDecompress2(tjhandle handle, const unsigned char *jpegBuf,
 
   if ((row_pointer =
        (JSAMPROW *)malloc(sizeof(JSAMPROW) * dinfo->output_height)) == NULL)
-    _throw("tjDecompress2(): Memory allocation failure");
+    THROW("tjDecompress2(): Memory allocation failure");
   if (setjmp(this->jerr.setjmp_buffer)) {
     /* If we get here, the JPEG code has signaled an error. */
     retval = -1;  goto bailout;
   }
   for (i = 0; i < (int)dinfo->output_height; i++) {
     if (flags & TJFLAG_BOTTOMUP)
-      row_pointer[i] = &dstBuf[(dinfo->output_height - i - 1) * pitch];
+      row_pointer[i] = &dstBuf[(dinfo->output_height - i - 1) * (size_t)pitch];
     else
-      row_pointer[i] = &dstBuf[i * pitch];
+      row_pointer[i] = &dstBuf[i * (size_t)pitch];
   }
   while (dinfo->output_scanline < dinfo->output_height)
     jpeg_read_scanlines(dinfo, &row_pointer[dinfo->output_scanline],
@@ -1285,7 +1314,7 @@ DLLEXPORT int tjDecompress2(tjhandle handle, const unsigned char *jpegBuf,
 
 bailout:
   if (dinfo->global_state > DSTATE_START) jpeg_abort_decompress(dinfo);
-  if (row_pointer) free(row_pointer);
+  free(row_pointer);
   if (this->jerr.warning) retval = -1;
   this->jerr.stopOnWarning = FALSE;
   return retval;
@@ -1345,12 +1374,12 @@ static int setDecodeDefaults(struct jpeg_decompress_struct *dinfo,
 }
 
 
-int my_read_markers(j_decompress_ptr dinfo)
+static int my_read_markers(j_decompress_ptr dinfo)
 {
   return JPEG_REACHED_SOS;
 }
 
-void my_reset_marker_reader(j_decompress_ptr dinfo)
+static void my_reset_marker_reader(j_decompress_ptr dinfo)
 {
 }
 
@@ -1369,7 +1398,7 @@ DLLEXPORT int tjDecodeYUVPlanes(tjhandle handle,
   int (*old_read_markers) (j_decompress_ptr);
   void (*old_reset_marker_reader) (j_decompress_ptr);
 
-  getdinstance(handle);
+  GET_DINSTANCE(handle);
   this->jerr.stopOnWarning = (flags & TJFLAG_STOPONWARNING) ? TRUE : FALSE;
 
   for (i = 0; i < MAX_COMPONENTS; i++) {
@@ -1377,14 +1406,14 @@ DLLEXPORT int tjDecodeYUVPlanes(tjhandle handle,
   }
 
   if ((this->init & DECOMPRESS) == 0)
-    _throw("tjDecodeYUVPlanes(): Instance has not been initialized for decompression");
+    THROW("tjDecodeYUVPlanes(): Instance has not been initialized for decompression");
 
   if (!srcPlanes || !srcPlanes[0] || subsamp < 0 || subsamp >= NUMSUBOPT ||
       dstBuf == NULL || width <= 0 || pitch < 0 || height <= 0 ||
       pixelFormat < 0 || pixelFormat >= TJ_NUMPF)
-    _throw("tjDecodeYUVPlanes(): Invalid argument");
+    THROW("tjDecodeYUVPlanes(): Invalid argument");
   if (subsamp != TJSAMP_GRAY && (!srcPlanes[1] || !srcPlanes[2]))
-    _throw("tjDecodeYUVPlanes(): Invalid argument");
+    THROW("tjDecodeYUVPlanes(): Invalid argument");
 
   if (setjmp(this->jerr.setjmp_buffer)) {
     /* If we get here, the JPEG code has signaled an error. */
@@ -1392,7 +1421,7 @@ DLLEXPORT int tjDecodeYUVPlanes(tjhandle handle,
   }
 
   if (pixelFormat == TJPF_CMYK)
-    _throw("tjDecodeYUVPlanes(): Cannot decode YUV images into CMYK pixels.");
+    THROW("tjDecodeYUVPlanes(): Cannot decode YUV images into CMYK pixels.");
 
   if (pitch == 0) pitch = width * tjPixelSize[pixelFormat];
   dinfo->image_width = width;
@@ -1404,6 +1433,9 @@ DLLEXPORT int tjDecodeYUVPlanes(tjhandle handle,
   else if (flags & TJFLAG_FORCESSE2) putenv("JSIMD_FORCESSE2=1");
 #endif
 
+  dinfo->progressive_mode = dinfo->inputctl->has_multiple_scans = FALSE;
+  dinfo->Ss = dinfo->Ah = dinfo->Al = 0;
+  dinfo->Se = DCTSIZE2 - 1;
   if (setDecodeDefaults(dinfo, pixelFormat, subsamp, flags) == -1) {
     retval = -1;  goto bailout;
   }
@@ -1428,12 +1460,12 @@ DLLEXPORT int tjDecodeYUVPlanes(tjhandle handle,
   if (pitch == 0) pitch = dinfo->output_width * tjPixelSize[pixelFormat];
 
   if ((row_pointer = (JSAMPROW *)malloc(sizeof(JSAMPROW) * ph0)) == NULL)
-    _throw("tjDecodeYUVPlanes(): Memory allocation failure");
+    THROW("tjDecodeYUVPlanes(): Memory allocation failure");
   for (i = 0; i < height; i++) {
     if (flags & TJFLAG_BOTTOMUP)
-      row_pointer[i] = &dstBuf[(height - i - 1) * pitch];
+      row_pointer[i] = &dstBuf[(height - i - 1) * (size_t)pitch];
     else
-      row_pointer[i] = &dstBuf[i * pitch];
+      row_pointer[i] = &dstBuf[i * (size_t)pitch];
   }
   if (height < ph0)
     for (i = height; i < ph0; i++) row_pointer[i] = row_pointer[height - 1];
@@ -1444,10 +1476,10 @@ DLLEXPORT int tjDecodeYUVPlanes(tjhandle handle,
       (JSAMPLE *)malloc(PAD(compptr->width_in_blocks * DCTSIZE, 32) *
                         compptr->v_samp_factor + 32);
     if (!_tmpbuf[i])
-      _throw("tjDecodeYUVPlanes(): Memory allocation failure");
+      THROW("tjDecodeYUVPlanes(): Memory allocation failure");
     tmpbuf[i] = (JSAMPROW *)malloc(sizeof(JSAMPROW) * compptr->v_samp_factor);
     if (!tmpbuf[i])
-      _throw("tjDecodeYUVPlanes(): Memory allocation failure");
+      THROW("tjDecodeYUVPlanes(): Memory allocation failure");
     for (row = 0; row < compptr->v_samp_factor; row++) {
       unsigned char *_tmpbuf_aligned =
         (unsigned char *)PAD((size_t)_tmpbuf[i], 32);
@@ -1459,7 +1491,7 @@ DLLEXPORT int tjDecodeYUVPlanes(tjhandle handle,
     ph[i] = ph0 * compptr->v_samp_factor / dinfo->max_v_samp_factor;
     inbuf[i] = (JSAMPROW *)malloc(sizeof(JSAMPROW) * ph[i]);
     if (!inbuf[i])
-      _throw("tjDecodeYUVPlanes(): Memory allocation failure");
+      THROW("tjDecodeYUVPlanes(): Memory allocation failure");
     ptr = (JSAMPLE *)srcPlanes[i];
     for (row = 0; row < ph[i]; row++) {
       inbuf[i][row] = ptr;
@@ -1488,11 +1520,11 @@ DLLEXPORT int tjDecodeYUVPlanes(tjhandle handle,
 
 bailout:
   if (dinfo->global_state > DSTATE_START) jpeg_abort_decompress(dinfo);
-  if (row_pointer) free(row_pointer);
+  free(row_pointer);
   for (i = 0; i < MAX_COMPONENTS; i++) {
-    if (tmpbuf[i] != NULL) free(tmpbuf[i]);
-    if (_tmpbuf[i] != NULL) free(_tmpbuf[i]);
-    if (inbuf[i] != NULL) free(inbuf[i]);
+    free(tmpbuf[i]);
+    free(_tmpbuf[i]);
+    free(inbuf[i]);
   }
   if (this->jerr.warning) retval = -1;
   this->jerr.stopOnWarning = FALSE;
@@ -1508,12 +1540,12 @@ DLLEXPORT int tjDecodeYUV(tjhandle handle, const unsigned char *srcBuf,
   int pw0, ph0, strides[3], retval = -1;
   tjinstance *this = (tjinstance *)handle;
 
-  if (!this) _throwg("tjDecodeYUV(): Invalid handle");
+  if (!this) THROWG("tjDecodeYUV(): Invalid handle");
   this->isInstanceError = FALSE;
 
-  if (srcBuf == NULL || pad < 0 || !isPow2(pad) || subsamp < 0 ||
+  if (srcBuf == NULL || pad < 0 || !IS_POW2(pad) || subsamp < 0 ||
       subsamp >= NUMSUBOPT || width <= 0 || height <= 0)
-    _throw("tjDecodeYUV(): Invalid argument");
+    THROW("tjDecodeYUV(): Invalid argument");
 
   pw0 = tjPlaneWidth(0, width, subsamp);
   ph0 = tjPlaneHeight(0, height, subsamp);
@@ -1552,7 +1584,7 @@ DLLEXPORT int tjDecompressToYUVPlanes(tjhandle handle,
   JSAMPROW *outbuf[MAX_COMPONENTS], *tmpbuf[MAX_COMPONENTS];
   int dctsize;
 
-  getdinstance(handle);
+  GET_DINSTANCE(handle);
   this->jerr.stopOnWarning = (flags & TJFLAG_STOPONWARNING) ? TRUE : FALSE;
 
   for (i = 0; i < MAX_COMPONENTS; i++) {
@@ -1560,11 +1592,11 @@ DLLEXPORT int tjDecompressToYUVPlanes(tjhandle handle,
   }
 
   if ((this->init & DECOMPRESS) == 0)
-    _throw("tjDecompressToYUVPlanes(): Instance has not been initialized for decompression");
+    THROW("tjDecompressToYUVPlanes(): Instance has not been initialized for decompression");
 
   if (jpegBuf == NULL || jpegSize <= 0 || !dstPlanes || !dstPlanes[0] ||
       width < 0 || height < 0)
-    _throw("tjDecompressToYUVPlanes(): Invalid argument");
+    THROW("tjDecompressToYUVPlanes(): Invalid argument");
 
 #ifndef NO_PUTENV
   if (flags & TJFLAG_FORCEMMX) putenv("JSIMD_FORCEMMX=1");
@@ -1584,10 +1616,10 @@ DLLEXPORT int tjDecompressToYUVPlanes(tjhandle handle,
   this->headerRead = 0;
   jpegSubsamp = getSubsamp(dinfo);
   if (jpegSubsamp < 0)
-    _throw("tjDecompressToYUVPlanes(): Could not determine subsampling type for JPEG image");
+    THROW("tjDecompressToYUVPlanes(): Could not determine subsampling type for JPEG image");
 
   if (jpegSubsamp != TJSAMP_GRAY && (!dstPlanes[1] || !dstPlanes[2]))
-    _throw("tjDecompressToYUVPlanes(): Invalid argument");
+    THROW("tjDecompressToYUVPlanes(): Invalid argument");
 
   jpegwidth = dinfo->image_width;  jpegheight = dinfo->image_height;
   if (width == 0) width = jpegwidth;
@@ -1599,9 +1631,9 @@ DLLEXPORT int tjDecompressToYUVPlanes(tjhandle handle,
       break;
   }
   if (i >= NUMSF)
-    _throw("tjDecompressToYUVPlanes(): Could not scale down to desired image dimensions");
+    THROW("tjDecompressToYUVPlanes(): Could not scale down to desired image dimensions");
   if (dinfo->num_components > 3)
-    _throw("tjDecompressToYUVPlanes(): JPEG image must have 3 or fewer components");
+    THROW("tjDecompressToYUVPlanes(): JPEG image must have 3 or fewer components");
 
   width = scaledw;  height = scaledh;
   dinfo->scale_num = sf[i].num;
@@ -1617,15 +1649,13 @@ DLLEXPORT int tjDecompressToYUVPlanes(tjhandle handle,
 
     iw[i] = compptr->width_in_blocks * dctsize;
     ih = compptr->height_in_blocks * dctsize;
-    pw[i] = PAD(dinfo->output_width, dinfo->max_h_samp_factor) *
-            compptr->h_samp_factor / dinfo->max_h_samp_factor;
-    ph[i] = PAD(dinfo->output_height, dinfo->max_v_samp_factor) *
-            compptr->v_samp_factor / dinfo->max_v_samp_factor;
+    pw[i] = tjPlaneWidth(i, dinfo->output_width, jpegSubsamp);
+    ph[i] = tjPlaneHeight(i, dinfo->output_height, jpegSubsamp);
     if (iw[i] != pw[i] || ih != ph[i]) usetmpbuf = 1;
     th[i] = compptr->v_samp_factor * dctsize;
     tmpbufsize += iw[i] * th[i];
     if ((outbuf[i] = (JSAMPROW *)malloc(sizeof(JSAMPROW) * ph[i])) == NULL)
-      _throw("tjDecompressToYUVPlanes(): Memory allocation failure");
+      THROW("tjDecompressToYUVPlanes(): Memory allocation failure");
     ptr = dstPlanes[i];
     for (row = 0; row < ph[i]; row++) {
       outbuf[i][row] = ptr;
@@ -1634,11 +1664,11 @@ DLLEXPORT int tjDecompressToYUVPlanes(tjhandle handle,
   }
   if (usetmpbuf) {
     if ((_tmpbuf = (JSAMPLE *)malloc(sizeof(JSAMPLE) * tmpbufsize)) == NULL)
-      _throw("tjDecompressToYUVPlanes(): Memory allocation failure");
+      THROW("tjDecompressToYUVPlanes(): Memory allocation failure");
     ptr = _tmpbuf;
     for (i = 0; i < dinfo->num_components; i++) {
       if ((tmpbuf[i] = (JSAMPROW *)malloc(sizeof(JSAMPROW) * th[i])) == NULL)
-        _throw("tjDecompressToYUVPlanes(): Memory allocation failure");
+        THROW("tjDecompressToYUVPlanes(): Memory allocation failure");
       for (row = 0; row < th[i]; row++) {
         tmpbuf[i][row] = ptr;
         ptr += iw[i];
@@ -1702,10 +1732,10 @@ DLLEXPORT int tjDecompressToYUVPlanes(tjhandle handle,
 bailout:
   if (dinfo->global_state > DSTATE_START) jpeg_abort_decompress(dinfo);
   for (i = 0; i < MAX_COMPONENTS; i++) {
-    if (tmpbuf[i]) free(tmpbuf[i]);
-    if (outbuf[i]) free(outbuf[i]);
+    free(tmpbuf[i]);
+    free(outbuf[i]);
   }
-  if (_tmpbuf) free(_tmpbuf);
+  free(_tmpbuf);
   if (this->jerr.warning) retval = -1;
   this->jerr.stopOnWarning = FALSE;
   return retval;
@@ -1719,12 +1749,12 @@ DLLEXPORT int tjDecompressToYUV2(tjhandle handle, const unsigned char *jpegBuf,
   int pw0, ph0, strides[3], retval = -1, jpegSubsamp = -1;
   int i, jpegwidth, jpegheight, scaledw, scaledh;
 
-  getdinstance(handle);
+  GET_DINSTANCE(handle);
   this->jerr.stopOnWarning = (flags & TJFLAG_STOPONWARNING) ? TRUE : FALSE;
 
   if (jpegBuf == NULL || jpegSize <= 0 || dstBuf == NULL || width < 0 ||
-      pad < 1 || !isPow2(pad) || height < 0)
-    _throw("tjDecompressToYUV2(): Invalid argument");
+      pad < 1 || !IS_POW2(pad) || height < 0)
+    THROW("tjDecompressToYUV2(): Invalid argument");
 
   if (setjmp(this->jerr.setjmp_buffer)) {
     /* If we get here, the JPEG code has signaled an error. */
@@ -1735,7 +1765,7 @@ DLLEXPORT int tjDecompressToYUV2(tjhandle handle, const unsigned char *jpegBuf,
   jpeg_read_header(dinfo, TRUE);
   jpegSubsamp = getSubsamp(dinfo);
   if (jpegSubsamp < 0)
-    _throw("tjDecompressToYUV2(): Could not determine subsampling type for JPEG image");
+    THROW("tjDecompressToYUV2(): Could not determine subsampling type for JPEG image");
 
   jpegwidth = dinfo->image_width;  jpegheight = dinfo->image_height;
   if (width == 0) width = jpegwidth;
@@ -1748,7 +1778,7 @@ DLLEXPORT int tjDecompressToYUV2(tjhandle handle, const unsigned char *jpegBuf,
       break;
   }
   if (i >= NUMSF)
-    _throw("tjDecompressToYUV2(): Could not scale down to desired image dimensions");
+    THROW("tjDecompressToYUV2(): Could not scale down to desired image dimensions");
 
   pw0 = tjPlaneWidth(0, width, jpegSubsamp);
   ph0 = tjPlaneHeight(0, height, jpegSubsamp);
@@ -1813,14 +1843,14 @@ DLLEXPORT int tjTransform(tjhandle handle, const unsigned char *jpegBuf,
   jvirt_barray_ptr *srccoefs, *dstcoefs;
   int retval = 0, i, jpegSubsamp, saveMarkers = 0;
 
-  getinstance(handle);
+  GET_INSTANCE(handle);
   this->jerr.stopOnWarning = (flags & TJFLAG_STOPONWARNING) ? TRUE : FALSE;
   if ((this->init & COMPRESS) == 0 || (this->init & DECOMPRESS) == 0)
-    _throw("tjTransform(): Instance has not been initialized for transformation");
+    THROW("tjTransform(): Instance has not been initialized for transformation");
 
   if (jpegBuf == NULL || jpegSize <= 0 || n < 1 || dstBufs == NULL ||
       dstSizes == NULL || t == NULL || flags < 0)
-    _throw("tjTransform(): Invalid argument");
+    THROW("tjTransform(): Invalid argument");
 
 #ifndef NO_PUTENV
   if (flags & TJFLAG_FORCEMMX) putenv("JSIMD_FORCEMMX=1");
@@ -1830,7 +1860,7 @@ DLLEXPORT int tjTransform(tjhandle handle, const unsigned char *jpegBuf,
 
   if ((xinfo =
        (jpeg_transform_info *)malloc(sizeof(jpeg_transform_info) * n)) == NULL)
-    _throw("tjTransform(): Memory allocation failure");
+    THROW("tjTransform(): Memory allocation failure");
   MEMZERO(xinfo, sizeof(jpeg_transform_info) * n);
 
   if (setjmp(this->jerr.setjmp_buffer)) {
@@ -1868,19 +1898,20 @@ DLLEXPORT int tjTransform(tjhandle handle, const unsigned char *jpegBuf,
   jpeg_read_header(dinfo, TRUE);
   jpegSubsamp = getSubsamp(dinfo);
   if (jpegSubsamp < 0)
-    _throw("tjTransform(): Could not determine subsampling type for JPEG image");
+    THROW("tjTransform(): Could not determine subsampling type for JPEG image");
 
   for (i = 0; i < n; i++) {
     if (!jtransform_request_workspace(dinfo, &xinfo[i]))
-      _throw("tjTransform(): Transform is not perfect");
+      THROW("tjTransform(): Transform is not perfect");
 
     if (xinfo[i].crop) {
       if ((t[i].r.x % xinfo[i].iMCU_sample_width) != 0 ||
           (t[i].r.y % xinfo[i].iMCU_sample_height) != 0) {
-        snprintf(errStr, JMSG_LENGTH_MAX,
+        snprintf(this->errStr, JMSG_LENGTH_MAX,
                  "To crop this JPEG image, x must be a multiple of %d\n"
                  "and y must be a multiple of %d.\n",
                  xinfo[i].iMCU_sample_width, xinfo[i].iMCU_sample_height);
+        this->isInstanceError = TRUE;
         retval = -1;  goto bailout;
       }
     }
@@ -1935,7 +1966,7 @@ DLLEXPORT int tjTransform(tjhandle handle, const unsigned char *jpegBuf,
           for (y = 0; y < compptr->v_samp_factor; y++) {
             if (t[i].customFilter(barray[y][0], arrayRegion, planeRegion, ci,
                                   i, &t[i]) == -1)
-              _throw("tjTransform(): Error in custom filter");
+              THROW("tjTransform(): Error in custom filter");
             arrayRegion.y += DCTSIZE;
           }
         }
@@ -1949,7 +1980,7 @@ DLLEXPORT int tjTransform(tjhandle handle, const unsigned char *jpegBuf,
 bailout:
   if (cinfo->global_state > CSTATE_START) jpeg_abort_compress(cinfo);
   if (dinfo->global_state > DSTATE_START) jpeg_abort_decompress(dinfo);
-  if (xinfo) free(xinfo);
+  free(xinfo);
   if (this->jerr.warning) retval = -1;
   this->jerr.stopOnWarning = FALSE;
   return retval;
@@ -1960,7 +1991,8 @@ DLLEXPORT unsigned char *tjLoadImage(const char *filename, int *width,
                                      int align, int *height, int *pixelFormat,
                                      int flags)
 {
-  int retval = 0, tempc, pitch;
+  int retval = 0, tempc;
+  size_t pitch;
   tjhandle handle = NULL;
   tjinstance *this;
   j_compress_ptr cinfo = NULL;
@@ -1971,21 +2003,21 @@ DLLEXPORT unsigned char *tjLoadImage(const char *filename, int *width,
 
   if (!filename || !width || align < 1 || !height || !pixelFormat ||
       *pixelFormat < TJPF_UNKNOWN || *pixelFormat >= TJ_NUMPF)
-    _throwg("tjLoadImage(): Invalid argument");
+    THROWG("tjLoadImage(): Invalid argument");
   if ((align & (align - 1)) != 0)
-    _throwg("tjLoadImage(): Alignment must be a power of 2");
+    THROWG("tjLoadImage(): Alignment must be a power of 2");
 
   if ((handle = tjInitCompress()) == NULL) return NULL;
   this = (tjinstance *)handle;
   cinfo = &this->cinfo;
 
   if ((file = fopen(filename, "rb")) == NULL)
-    _throwunix("tjLoadImage(): Cannot open input file");
+    THROW_UNIX("tjLoadImage(): Cannot open input file");
 
   if ((tempc = getc(file)) < 0 || ungetc(tempc, file) == EOF)
-    _throwunix("tjLoadImage(): Could not read input file")
+    THROW_UNIX("tjLoadImage(): Could not read input file")
   else if (tempc == EOF)
-    _throwg("tjLoadImage(): Input file contains no data");
+    THROWG("tjLoadImage(): Input file contains no data");
 
   if (setjmp(this->jerr.setjmp_buffer)) {
     /* If we get here, the JPEG code has signaled an error. */
@@ -1996,14 +2028,14 @@ DLLEXPORT unsigned char *tjLoadImage(const char *filename, int *width,
   else cinfo->in_color_space = pf2cs[*pixelFormat];
   if (tempc == 'B') {
     if ((src = jinit_read_bmp(cinfo, FALSE)) == NULL)
-      _throwg("tjLoadImage(): Could not initialize bitmap loader");
+      THROWG("tjLoadImage(): Could not initialize bitmap loader");
     invert = (flags & TJFLAG_BOTTOMUP) == 0;
   } else if (tempc == 'P') {
     if ((src = jinit_read_ppm(cinfo)) == NULL)
-      _throwg("tjLoadImage(): Could not initialize bitmap loader");
+      THROWG("tjLoadImage(): Could not initialize bitmap loader");
     invert = (flags & TJFLAG_BOTTOMUP) != 0;
   } else
-    _throwg("tjLoadImage(): Unsupported file type");
+    THROWG("tjLoadImage(): Unsupported file type");
 
   src->input_file = file;
   (*src->start_input) (cinfo, src);
@@ -2013,8 +2045,10 @@ DLLEXPORT unsigned char *tjLoadImage(const char *filename, int *width,
   *pixelFormat = cs2pf[cinfo->in_color_space];
 
   pitch = PAD((*width) * tjPixelSize[*pixelFormat], align);
-  if ((dstBuf = (unsigned char *)malloc(pitch * (*height))) == NULL)
-    _throwg("tjLoadImage(): Memory allocation failure");
+  if ((unsigned long long)pitch * (unsigned long long)(*height) >
+      (unsigned long long)((size_t)-1) ||
+      (dstBuf = (unsigned char *)malloc(pitch * (*height))) == NULL)
+    THROWG("tjLoadImage(): Memory allocation failure");
 
   if (setjmp(this->jerr.setjmp_buffer)) {
     /* If we get here, the JPEG code has signaled an error. */
@@ -2041,7 +2075,7 @@ DLLEXPORT unsigned char *tjLoadImage(const char *filename, int *width,
 bailout:
   if (handle) tjDestroy(handle);
   if (file) fclose(file);
-  if (retval < 0 && dstBuf) { free(dstBuf);  dstBuf = NULL; }
+  if (retval < 0) { free(dstBuf);  dstBuf = NULL; }
   return dstBuf;
 }
 
@@ -2061,7 +2095,7 @@ DLLEXPORT int tjSaveImage(const char *filename, unsigned char *buffer,
 
   if (!filename || !buffer || width < 1 || pitch < 0 || height < 1 ||
       pixelFormat < 0 || pixelFormat >= TJ_NUMPF)
-    _throwg("tjSaveImage(): Invalid argument");
+    THROWG("tjSaveImage(): Invalid argument");
 
   if ((handle = tjInitDecompress()) == NULL)
     return -1;
@@ -2069,7 +2103,7 @@ DLLEXPORT int tjSaveImage(const char *filename, unsigned char *buffer,
   dinfo = &this->dinfo;
 
   if ((file = fopen(filename, "wb")) == NULL)
-    _throwunix("tjSaveImage(): Cannot open output file");
+    THROW_UNIX("tjSaveImage(): Cannot open output file");
 
   if (setjmp(this->jerr.setjmp_buffer)) {
     /* If we get here, the JPEG code has signaled an error. */
@@ -2084,11 +2118,11 @@ DLLEXPORT int tjSaveImage(const char *filename, unsigned char *buffer,
   ptr = strrchr(filename, '.');
   if (ptr && !strcasecmp(ptr, ".bmp")) {
     if ((dst = jinit_write_bmp(dinfo, FALSE, FALSE)) == NULL)
-      _throwg("tjSaveImage(): Could not initialize bitmap writer");
+      THROWG("tjSaveImage(): Could not initialize bitmap writer");
     invert = (flags & TJFLAG_BOTTOMUP) == 0;
   } else {
     if ((dst = jinit_write_ppm(dinfo)) == NULL)
-      _throwg("tjSaveImage(): Could not initialize PPM writer");
+      THROWG("tjSaveImage(): Could not initialize PPM writer");
     invert = (flags & TJFLAG_BOTTOMUP) != 0;
   }
 
diff --git a/turbojpeg.h b/turbojpeg.h
index 9c0a371..f3209dd 100644
--- a/turbojpeg.h
+++ b/turbojpeg.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (C)2009-2015, 2017 D. R. Commander.  All Rights Reserved.
+ * Copyright (C)2009-2015, 2017, 2020 D. R. Commander.  All Rights Reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -1636,7 +1636,8 @@ DLLEXPORT int tjSaveImage(const char *filename, unsigned char *buffer,
  * (re)allocated by the compression and transform functions or that were
  * manually allocated using #tjAlloc().
  *
- * @param buffer address of the buffer to free
+ * @param buffer address of the buffer to free.  If the address is NULL, then
+ * this function has no effect.
  *
  * @sa tjAlloc()
  */
@@ -1649,7 +1650,7 @@ DLLEXPORT void tjFree(unsigned char *buffer);
  * @param handle a handle to a TurboJPEG compressor, decompressor, or
  * transformer instance, or NULL if the error was generated by a global
  * function (but note that retrieving the error message for a global function
- * is not thread-safe.)
+ * is thread-safe only on platforms that support thread-local storage.)
  *
  * @return a descriptive error message explaining why the last command failed.
  */
diff --git a/wrbmp.c b/wrbmp.c
index 38a64e8..239f64e 100644
--- a/wrbmp.c
+++ b/wrbmp.c
@@ -5,7 +5,7 @@
  * Copyright (C) 1994-1996, Thomas G. Lane.
  * libjpeg-turbo Modifications:
  * Copyright (C) 2013, Linaro Limited.
- * Copyright (C) 2014-2015, 2017, D. R. Commander.
+ * Copyright (C) 2014-2015, 2017, 2019, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -303,9 +303,7 @@ write_os2_header(j_decompress_ptr cinfo, bmp_dest_ptr dest)
   int bits_per_pixel, cmap_entries;
 
   /* Compute colormap size and total file size */
-  if (cinfo->out_color_space == JCS_RGB ||
-      (cinfo->out_color_space >= JCS_EXT_RGB &&
-       cinfo->out_color_space <= JCS_EXT_ARGB)) {
+  if (IsExtRGB(cinfo->out_color_space)) {
     if (cinfo->quantize_colors) {
       /* Colormapped RGB */
       bits_per_pixel = 8;
@@ -499,15 +497,14 @@ jinit_write_bmp(j_decompress_ptr cinfo, boolean is_os2,
 
   if (cinfo->out_color_space == JCS_GRAYSCALE) {
     dest->pub.put_pixel_rows = put_gray_rows;
-  } else if (cinfo->out_color_space == JCS_RGB ||
-             (cinfo->out_color_space >= JCS_EXT_RGB &&
-              cinfo->out_color_space <= JCS_EXT_ARGB)) {
+  } else if (IsExtRGB(cinfo->out_color_space)) {
     if (cinfo->quantize_colors)
       dest->pub.put_pixel_rows = put_gray_rows;
     else
       dest->pub.put_pixel_rows = put_pixel_rows;
-  } else if (cinfo->out_color_space == JCS_RGB565 ||
-             cinfo->out_color_space == JCS_CMYK) {
+  } else if (!cinfo->quantize_colors &&
+             (cinfo->out_color_space == JCS_RGB565 ||
+              cinfo->out_color_space == JCS_CMYK)) {
     dest->pub.put_pixel_rows = put_pixel_rows;
   } else {
     ERREXIT(cinfo, JERR_BMP_COLORSPACE);
diff --git a/wrppm.c b/wrppm.c
index 819a0a7..69f91e8 100644
--- a/wrppm.c
+++ b/wrppm.c
@@ -5,7 +5,7 @@
  * Copyright (C) 1991-1996, Thomas G. Lane.
  * Modified 2009 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2017, D. R. Commander.
+ * Copyright (C) 2017, 2019, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -256,6 +256,8 @@ start_output_ppm(j_decompress_ptr cinfo, djpeg_dest_ptr dinfo)
   case JCS_EXT_ABGR:
   case JCS_EXT_ARGB:
   case JCS_CMYK:
+    if (!IsExtRGB(cinfo->out_color_space) && cinfo->quantize_colors)
+      ERREXIT(cinfo, JERR_PPM_COLORSPACE);
     /* emit header for raw PPM format */
     fprintf(dest->pub.output_file, "P6\n%ld %ld\n%d\n",
             (long)cinfo->output_width, (long)cinfo->output_height, PPM_MAXVAL);
@@ -337,13 +339,14 @@ jinit_write_ppm(j_decompress_ptr cinfo)
       ((j_common_ptr)cinfo, JPOOL_IMAGE,
        cinfo->output_width * cinfo->output_components, (JDIMENSION)1);
     dest->pub.buffer_height = 1;
-    if (IsExtRGB(cinfo->out_color_space))
-      dest->pub.put_pixel_rows = put_rgb;
-    else if (cinfo->out_color_space == JCS_CMYK)
-      dest->pub.put_pixel_rows = put_cmyk;
-    else if (!cinfo->quantize_colors)
-      dest->pub.put_pixel_rows = copy_pixel_rows;
-    else if (cinfo->out_color_space == JCS_GRAYSCALE)
+    if (!cinfo->quantize_colors) {
+      if (IsExtRGB(cinfo->out_color_space))
+        dest->pub.put_pixel_rows = put_rgb;
+      else if (cinfo->out_color_space == JCS_CMYK)
+        dest->pub.put_pixel_rows = put_cmyk;
+      else
+        dest->pub.put_pixel_rows = copy_pixel_rows;
+    } else if (cinfo->out_color_space == JCS_GRAYSCALE)
       dest->pub.put_pixel_rows = put_demapped_gray;
     else
       dest->pub.put_pixel_rows = put_demapped_rgb;
author	Jonathan Wright <jonathan.wright@arm.com>	2020-08-05 11:42:22 +0100
committer	Jonathan Wright <jonathan.wright@arm.com>	2020-08-07 17:04:34 +0100
commit	db870dfef8ab97950b4bdf22c66dd6c18326460b (patch)
tree	9862c1aa45d014815ec798992ecb43eff1bc8d42
parent	341272d909285da90e44015ca41f956fd00b9dd8 (diff)