1 files changed, 0 insertions, 557 deletions
diff --git a/simd/arm/common/jdsample-neon.c b/simd/arm/common/jdsample-neon.c
deleted file mode 100644
index e4f5129..0000000
--- a/simd/arm/common/jdsample-neon.c
+++ /dev/null
@@ -1,557 +0,0 @@
-/*
- * jdsample-neon.c - upsampling (Arm NEON)
- *
- * Copyright 2019 The Chromium Authors. All Rights Reserved.
- *
- * This software is provided 'as-is', without any express or implied
- * warranty.  In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- *    claim that you wrote the original software. If you use this software
- *    in a product, an acknowledgment in the product documentation would be
- *    appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- *    misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-#define JPEG_INTERNALS
-#include "../../../jinclude.h"
-#include "../../../jpeglib.h"
-#include "../../../jsimd.h"
-#include "../../../jdct.h"
-#include "../../../jsimddct.h"
-#include "../../jsimd.h"
-
-#include <arm_neon.h>
-
-/*
- * The diagram below shows a row of samples (luma or chroma) produced by h2v1
- * downsampling.
- *
- *                 s0        s1        s2
- *            +---------+---------+---------+
- *            |         |         |         |
- *            | p0   p1 | p2   p3 | p4   p5 |
- *            |         |         |         |
- *            +---------+---------+---------+
- *
- * Each sample contains two of the original pixel channel values. These pixel
- * channel values are centred at positions p0, p1, p2, p3, p4 and p5 above. To
- * compute the channel values of the original image, we proportionally blend
- * the adjacent samples in each row.
- *
- * There are three cases to consider:
- *
- * 1) The first pixel in the original image.
- *    Pixel channel value p0 contains only a component from sample s0, so we
- *    set p0 = s0.
- * 2) The last pixel in the original image.
- *    Pixel channel value p5 contains only a component from sample s2, so we
- *    set p5 = s2.
- * 3) General case (all other pixels in the row).
- *    Apart from the first and last pixels, every other pixel channel value is
- *    computed by blending the containing sample and the nearest neigbouring
- *    sample in the ratio 3:1.
- *    For example, the pixel channel value centred at p1 would be computed as
- *    follows:
- *        3/4 * s0 + 1/4 * s1
- *    while the pixel channel value centred at p2 would be:
- *        3/4 * s1 + 1/4 * s0
- */
-
-void jsimd_h2v1_fancy_upsample_neon(int max_v_samp_factor,
-                                    JDIMENSION downsampled_width,
-                                    JSAMPARRAY input_data,
-                                    JSAMPARRAY *output_data_ptr)
-{
-  JSAMPARRAY output_data = *output_data_ptr;
-  JSAMPROW inptr, outptr;
-  /* Setup constants. */
-  const uint16x8_t one_u16 = vdupq_n_u16(1);
-  const uint8x8_t three_u8 = vdup_n_u8(3);
-
-  for (int inrow = 0; inrow < max_v_samp_factor; inrow++) {
-    inptr = input_data[inrow];
-    outptr = output_data[inrow];
-    /* Case 1: first pixel channel value in this row of the original image. */
-    *outptr = (JSAMPLE)GETJSAMPLE(*inptr);
-
-    /* General case: */
-    /*    3/4 * containing sample + 1/4 * nearest neighbouring sample */
-    /* For p1: containing sample = s0, nearest neighbouring sample = s1. */
-    /* For p2: containing sample = s1, nearest neighbouring sample = s0. */
-    uint8x16_t s0 = vld1q_u8(inptr);
-    uint8x16_t s1 = vld1q_u8(inptr + 1);
-    /* Multiplication makes vectors twice as wide: '_l' and '_h' suffixes */
-    /* denote low half and high half respectively. */
-    uint16x8_t s1_add_3s0_l = vmlal_u8(vmovl_u8(vget_low_u8(s1)),
-                                       vget_low_u8(s0), three_u8);
-    uint16x8_t s1_add_3s0_h = vmlal_u8(vmovl_u8(vget_high_u8(s1)),
-                                       vget_high_u8(s0), three_u8);
-    uint16x8_t s0_add_3s1_l = vmlal_u8(vmovl_u8(vget_low_u8(s0)),
-                                       vget_low_u8(s1), three_u8);
-    uint16x8_t s0_add_3s1_h = vmlal_u8(vmovl_u8(vget_high_u8(s0)),
-                                       vget_high_u8(s1), three_u8);
-    /* Add ordered dithering bias to odd pixel values. */
-    s0_add_3s1_l = vaddq_u16(s0_add_3s1_l, one_u16);
-    s0_add_3s1_h = vaddq_u16(s0_add_3s1_h, one_u16);
-
-    /* Initially 1 - due to having already stored the first pixel of the */
-    /* image. However, in subsequent iterations of the SIMD loop this offset */
-    /* is (2 * colctr - 1) to stay within the bounds of the sample buffers */
-    /* without having to resort to a slow scalar tail case for the last */
-    /* (downsampled_width % 16) samples. See "Creation of 2-D sample arrays" */
-    /* in jmemmgr.c for details. */
-    unsigned outptr_offset = 1;
-    uint8x16x2_t output_pixels;
-
-#if defined(__aarch64__) && defined(__clang__) && !defined(__OPTIMIZE_SIZE__)
-    /* Unrolling by four is beneficial on AArch64 as there are 16 additional */
-    /* 128-bit SIMD registers to accommodate the extra data in flight. */
-    #pragma clang loop unroll_count(4)
-#endif
-    /* We use software pipelining to maximise performance. The code indented */
-    /* an extra 6 spaces begins the next iteration of the loop. */
-    for (unsigned colctr = 16; colctr < downsampled_width; colctr += 16) {
-            s0 = vld1q_u8(inptr + colctr - 1);
-            s1 = vld1q_u8(inptr + colctr);
-      /* Right-shift by 2 (divide by 4), narrow to 8-bit and combine. */
-      output_pixels.val[0] = vcombine_u8(vrshrn_n_u16(s1_add_3s0_l, 2),
-                                         vrshrn_n_u16(s1_add_3s0_h, 2));
-      output_pixels.val[1] = vcombine_u8(vshrn_n_u16(s0_add_3s1_l, 2),
-                                         vshrn_n_u16(s0_add_3s1_h, 2));
-            /* Multiplication makes vectors twice as wide: '_l' and '_h' */
-            /* suffixes denote low half and high half respectively. */
-            s1_add_3s0_l = vmlal_u8(vmovl_u8(vget_low_u8(s1)),
-                                             vget_low_u8(s0), three_u8);
-            s1_add_3s0_h = vmlal_u8(vmovl_u8(vget_high_u8(s1)),
-                                             vget_high_u8(s0), three_u8);
-            s0_add_3s1_l = vmlal_u8(vmovl_u8(vget_low_u8(s0)),
-                                             vget_low_u8(s1), three_u8);
-            s0_add_3s1_h = vmlal_u8(vmovl_u8(vget_high_u8(s0)),
-                                             vget_high_u8(s1), three_u8);
-            /* Add ordered dithering bias to odd pixel values. */
-            s0_add_3s1_l = vaddq_u16(s0_add_3s1_l, one_u16);
-            s0_add_3s1_h = vaddq_u16(s0_add_3s1_h, one_u16);
-      /* Store pixel channel values to memory. */
-      vst2q_u8(outptr + outptr_offset, output_pixels);
-      outptr_offset = 2 * colctr - 1;
-    }
-
-    /* Complete the last iteration of the loop. */
-    /* Right-shift by 2 (divide by 4), narrow to 8-bit and combine. */
-    output_pixels.val[0] = vcombine_u8(vrshrn_n_u16(s1_add_3s0_l, 2),
-                                       vrshrn_n_u16(s1_add_3s0_h, 2));
-    output_pixels.val[1] = vcombine_u8(vshrn_n_u16(s0_add_3s1_l, 2),
-                                       vshrn_n_u16(s0_add_3s1_h, 2));
-    /* Store pixel channel values to memory. */
-    vst2q_u8(outptr + outptr_offset, output_pixels);
-
-    /* Case 2: last pixel channel value in this row of the original image. */
-    outptr[2 * downsampled_width - 1] =
-          GETJSAMPLE(inptr[downsampled_width - 1]);
-  }
-}
-
-
-/*
- * The diagram below shows a grid-window of samples (luma or chroma) produced
- * by h2v2 downsampling.
- *
- *                  s0        s1
- *             +---------+---------+
- *             | p0   p1 | p2   p3 |
- *     r0      |         |         |
- *             | p4   p5 | p6   p7 |
- *             +---------+---------+
- *             | p8   p9 | p10  p11|
- *     r1      |         |         |
- *             | p12  p13| p14  p15|
- *             +---------+---------+
- *             | p16  p17| p18  p19|
- *     r2      |         |         |
- *             | p20  p21| p22  p23|
- *             +---------+---------+
- *
- * Every sample contains four of the original pixel channel values. The pixels'
- * channel values are centred at positions p0, p1, p2,..., p23 above. For a
- * given grid-window position, r1 is always used to denote the row of samples
- * containing the pixel channel values we are computing. For the top row of
- * pixel channel values in r1 (p8-p11), the nearest neighbouring samples are in
- * the row above - denoted by r0. Likewise, for the bottom row of pixels in r1
- * (p12-p15), the nearest neighbouring samples are in the row below - denoted
- * by r2.
- *
- * To compute the pixel channel values of the original image, we proportionally
- * blend the sample containing the pixel centre with the nearest neighbouring
- * samples in each row, column and diagonal.
- *
- * There are three cases to consider:
- *
- * 1) The first pixel in this row of the original image.
- *    Pixel channel value p8 only contains components from sample column s0.
- *    Its value is computed by blending samples s0r1 and s0r0 in the ratio 3:1.
- * 2) The last pixel in this row of the original image.
- *    Pixel channel value p11 only contains components from sample column s1.
- *    Its value is computed by blending samples s1r1 and s1r0 in the ratio 3:1.
- * 3) General case (all other pixels in the row).
- *    Apart from the first and last pixels, every other pixel channel value in
- *    the row contains components from samples in adjacent columns.
- *
- *    For example, the pixel centred at p9 would be computed as follows:
- *        (9/16 * s0r1) + (3/16 * s0r0) + (3/16 * s1r1) + (1/16 * s1r0)
- *
- *    This can be broken down into two steps:
- *    1) Blend samples vertically in columns s0 and s1 in the ratio 3:1:
- *        s0colsum = 3/4 * s0r1 + 1/4 * s0r0
- *        s1colsum = 3/4 * s1r1 + 1/4 * s1r0
- *    2) Blend the already-blended columns in the ratio 3:1:
- *        p9 = 3/4 * s0colsum + 1/4 * s1colsum
- *
- * The bottom row of pixel channel values in row r1 can be computed in the same
- * way for each of the three cases, only using samples in row r2 instead of row
- * r0 - as r2 is the nearest neighbouring row.
- */
-
-void jsimd_h2v2_fancy_upsample_neon(int max_v_samp_factor,
-                                    JDIMENSION downsampled_width,
-                                    JSAMPARRAY input_data,
-                                    JSAMPARRAY *output_data_ptr)
-{
-  JSAMPARRAY output_data = *output_data_ptr;
-  JSAMPROW inptr0, inptr1, inptr2, outptr0, outptr1;
-  int inrow, outrow;
-  /* Setup constants. */
-  const uint16x8_t seven_u16 = vdupq_n_u16(7);
-  const uint8x8_t three_u8 = vdup_n_u8(3);
-  const uint16x8_t three_u16 = vdupq_n_u16(3);
-
-  inrow = outrow = 0;
-  while (outrow < max_v_samp_factor) {
-    inptr0 = input_data[inrow - 1];
-    inptr1 = input_data[inrow];
-    inptr2 = input_data[inrow + 1];
-    /* Suffixes 0 and 1 denote the top and bottom rows of output pixels */
-    /* respectively. */
-    outptr0 = output_data[outrow++];
-    outptr1 = output_data[outrow++];
-
-    /* Case 1: first pixel channel value in this row of original image. */
-    int s0colsum0 = GETJSAMPLE(*inptr1) * 3 + GETJSAMPLE(*inptr0);
-    *outptr0 = (JSAMPLE)((s0colsum0 * 4 + 8) >> 4);
-    int s0colsum1 = GETJSAMPLE(*inptr1) * 3 + GETJSAMPLE(*inptr2);
-    *outptr1 = (JSAMPLE)((s0colsum1 * 4 + 8) >> 4);
-
-    /* General case as described above. */
-    /* Step 1: Blend samples vertically in columns s0 and s1. */
-    /* Leave the divide by 4 to the end when it can be done for both */
-    /* dimensions at once, right-shifting by 4. */
-
-    /* Load and compute s0colsum0 and s0colsum1. */
-    uint8x16_t s0r0 = vld1q_u8(inptr0);
-    uint8x16_t s0r1 = vld1q_u8(inptr1);
-    uint8x16_t s0r2 = vld1q_u8(inptr2);
-    /* Multiplication makes vectors twice as wide: '_l' and '_h' suffixes */
-    /* denote low half and high half respectively. */
-    uint16x8_t s0colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s0r0)),
-                                      vget_low_u8(s0r1), three_u8);
-    uint16x8_t s0colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s0r0)),
-                                      vget_high_u8(s0r1), three_u8);
-    uint16x8_t s0colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s0r2)),
-                                      vget_low_u8(s0r1), three_u8);
-    uint16x8_t s0colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s0r2)),
-                                      vget_high_u8(s0r1), three_u8);
-    /* Load and compute s1colsum0 and s1colsum1. */
-    uint8x16_t s1r0 = vld1q_u8(inptr0 + 1);
-    uint8x16_t s1r1 = vld1q_u8(inptr1 + 1);
-    uint8x16_t s1r2 = vld1q_u8(inptr2 + 1);
-    uint16x8_t s1colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s1r0)),
-                                      vget_low_u8(s1r1), three_u8);
-    uint16x8_t s1colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s1r0)),
-                                      vget_high_u8(s1r1), three_u8);
-    uint16x8_t s1colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s1r2)),
-                                      vget_low_u8(s1r1), three_u8);
-    uint16x8_t s1colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s1r2)),
-                                      vget_high_u8(s1r1), three_u8);
-    /* Step 2: Blend the already-blended columns. */
-    uint16x8_t output0_p1_l = vmlaq_u16(s1colsum0_l, s0colsum0_l, three_u16);
-    uint16x8_t output0_p1_h = vmlaq_u16(s1colsum0_h, s0colsum0_h, three_u16);
-    uint16x8_t output0_p2_l = vmlaq_u16(s0colsum0_l, s1colsum0_l, three_u16);
-    uint16x8_t output0_p2_h = vmlaq_u16(s0colsum0_h, s1colsum0_h, three_u16);
-    uint16x8_t output1_p1_l = vmlaq_u16(s1colsum1_l, s0colsum1_l, three_u16);
-    uint16x8_t output1_p1_h = vmlaq_u16(s1colsum1_h, s0colsum1_h, three_u16);
-    uint16x8_t output1_p2_l = vmlaq_u16(s0colsum1_l, s1colsum1_l, three_u16);
-    uint16x8_t output1_p2_h = vmlaq_u16(s0colsum1_h, s1colsum1_h, three_u16);
-    /* Add ordered dithering bias to odd pixel values. */
-    output0_p1_l = vaddq_u16(output0_p1_l, seven_u16);
-    output0_p1_h = vaddq_u16(output0_p1_h, seven_u16);
-    output1_p1_l = vaddq_u16(output1_p1_l, seven_u16);
-    output1_p1_h = vaddq_u16(output1_p1_h, seven_u16);
-    /* Right-shift by 4 (divide by 16), narrow to 8-bit and combine. */
-    uint8x16x2_t output_pixels0 = { vcombine_u8(vshrn_n_u16(output0_p1_l, 4),
-                                                vshrn_n_u16(output0_p1_h, 4)),
-                                    vcombine_u8(vrshrn_n_u16(output0_p2_l, 4),
-                                                vrshrn_n_u16(output0_p2_h, 4))
-                                  };
-    uint8x16x2_t output_pixels1 = { vcombine_u8(vshrn_n_u16(output1_p1_l, 4),
-                                                vshrn_n_u16(output1_p1_h, 4)),
-                                    vcombine_u8(vrshrn_n_u16(output1_p2_l, 4),
-                                                vrshrn_n_u16(output1_p2_h, 4))
-                                  };
-    /* Store pixel channel values to memory. */
-    /* The minimum size of the output buffer for each row is 64 bytes => no */
-    /* need to worry about buffer overflow here. See "Creation of 2-D sample */
-    /* arrays" in jmemmgr.c for details. */
-    vst2q_u8(outptr0 + 1, output_pixels0);
-    vst2q_u8(outptr1 + 1, output_pixels1);
-
-    /* The first pixel of the image shifted our loads and stores by one */
-    /* byte. We have to re-align on a 32-byte boundary at some point before */
-    /* the end of the row (we do it now on the 32/33 pixel boundary) to stay */
-    /* within the bounds of the sample buffers without having to resort to a */
-    /* slow scalar tail case for the last (downsampled_width % 16) samples. */
-    /* See "Creation of 2-D sample arrays" in jmemmgr.c for details.*/
-    for (unsigned colctr = 16; colctr < downsampled_width; colctr += 16) {
-      /* Step 1: Blend samples vertically in columns s0 and s1. */
-      /* Load and compute s0colsum0 and s0colsum1. */
-      s0r0 = vld1q_u8(inptr0 + colctr - 1);
-      s0r1 = vld1q_u8(inptr1 + colctr - 1);
-      s0r2 = vld1q_u8(inptr2 + colctr - 1);
-      s0colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s0r0)),
-                             vget_low_u8(s0r1), three_u8);
-      s0colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s0r0)),
-                             vget_high_u8(s0r1), three_u8);
-      s0colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s0r2)),
-                             vget_low_u8(s0r1), three_u8);
-      s0colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s0r2)),
-                             vget_high_u8(s0r1), three_u8);
-      /* Load and compute s1colsum0 and s1colsum1. */
-      s1r0 = vld1q_u8(inptr0 + colctr);
-      s1r1 = vld1q_u8(inptr1 + colctr);
-      s1r2 = vld1q_u8(inptr2 + colctr);
-      s1colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s1r0)),
-                             vget_low_u8(s1r1), three_u8);
-      s1colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s1r0)),
-                             vget_high_u8(s1r1), three_u8);
-      s1colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s1r2)),
-                             vget_low_u8(s1r1), three_u8);
-      s1colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s1r2)),
-                             vget_high_u8(s1r1), three_u8);
-      /* Step 2: Blend the already-blended columns. */
-      output0_p1_l = vmlaq_u16(s1colsum0_l, s0colsum0_l, three_u16);
-      output0_p1_h = vmlaq_u16(s1colsum0_h, s0colsum0_h, three_u16);
-      output0_p2_l = vmlaq_u16(s0colsum0_l, s1colsum0_l, three_u16);
-      output0_p2_h = vmlaq_u16(s0colsum0_h, s1colsum0_h, three_u16);
-      output1_p1_l = vmlaq_u16(s1colsum1_l, s0colsum1_l, three_u16);
-      output1_p1_h = vmlaq_u16(s1colsum1_h, s0colsum1_h, three_u16);
-      output1_p2_l = vmlaq_u16(s0colsum1_l, s1colsum1_l, three_u16);
-      output1_p2_h = vmlaq_u16(s0colsum1_h, s1colsum1_h, three_u16);
-      /* Add ordered dithering bias to odd pixel values. */
-      output0_p1_l = vaddq_u16(output0_p1_l, seven_u16);
-      output0_p1_h = vaddq_u16(output0_p1_h, seven_u16);
-      output1_p1_l = vaddq_u16(output1_p1_l, seven_u16);
-      output1_p1_h = vaddq_u16(output1_p1_h, seven_u16);
-      /* Right-shift by 4 (divide by 16), narrow to 8-bit and combine. */
-      output_pixels0.val[0] = vcombine_u8(vshrn_n_u16(output0_p1_l, 4),
-                                          vshrn_n_u16(output0_p1_h, 4));
-      output_pixels0.val[1] = vcombine_u8(vrshrn_n_u16(output0_p2_l, 4),
-                                          vrshrn_n_u16(output0_p2_h, 4));
-      output_pixels1.val[0] = vcombine_u8(vshrn_n_u16(output1_p1_l, 4),
-                                          vshrn_n_u16(output1_p1_h, 4));
-      output_pixels1.val[1] = vcombine_u8(vrshrn_n_u16(output1_p2_l, 4),
-                                          vrshrn_n_u16(output1_p2_h, 4));
-      /* Store pixel channel values to memory. */
-      vst2q_u8(outptr0 + 2 * colctr - 1, output_pixels0);
-      vst2q_u8(outptr1 + 2 * colctr - 1, output_pixels1);
-    }
-
-    /* Case 2: last pixel channel value in this row of the original image. */
-    int s1colsum0 = GETJSAMPLE(inptr1[downsampled_width - 1]) * 3 +
-                    GETJSAMPLE(inptr0[downsampled_width - 1]);
-    outptr0[2 * downsampled_width - 1] = (JSAMPLE)((s1colsum0 * 4 + 7) >> 4);
-    int s1colsum1 = GETJSAMPLE(inptr1[downsampled_width - 1]) * 3 +
-                    GETJSAMPLE(inptr2[downsampled_width - 1]);
-    outptr1[2 * downsampled_width - 1] = (JSAMPLE)((s1colsum1 * 4 + 7) >> 4);
-    inrow++;
-  }
-}
-
-
-/*
- * The diagram below shows a grid-window of samples (luma or chroma) produced
- * by h2v1 downsampling; which has been subsequently rotated 90 degrees. (The
- * usual use of h1v2 upsampling is upsampling rotated or transposed h2v1
- * downsampled images.)
- *
- *                  s0        s1
- *             +---------+---------+
- *             |    p0   |    p1   |
- *     r0      |         |         |
- *             |    p2   |    p3   |
- *             +---------+---------+
- *             |    p4   |    p5   |
- *     r1      |         |         |
- *             |    p6   |    p7   |
- *             +---------+---------+
- *             |    p8   |    p9   |
- *     r2      |         |         |
- *             |    p10  |    p11  |
- *             +---------+---------+
- *
- * Every sample contains two of the original pixel channel values. The pixels'
- * channel values are centred at positions p0, p1, p2,..., p11 above. For a
- * given grid-window position, r1 is always used to denote the row of samples
- * containing the pixel channel values we are computing. For the top row of
- * pixel channel values in r1 (p4 and p5), the nearest neighbouring samples are
- * in the row above - denoted by r0. Likewise, for the bottom row of pixels in
- * r1 (p6 and p7), the nearest neighbouring samples are in the row below -
- * denoted by r2.
- *
- * To compute the pixel channel values of the original image, we proportionally
- * blend the adjacent samples in each column.
- *
- * For example, the pixel channel value centred at p4 would be computed as
- * follows:
- *     3/4 * s0r1 + 1/4 * s0r0
- * while the pixel channel value centred at p6 would be:
- *     3/4 * s0r1 + 1/4 * s0r2
- */
-
-void jsimd_h1v2_fancy_upsample_neon(int max_v_samp_factor,
-                                    JDIMENSION downsampled_width,
-                                    JSAMPARRAY input_data,
-                                    JSAMPARRAY *output_data_ptr)
-{
-  JSAMPARRAY output_data = *output_data_ptr;
-  JSAMPROW inptr0, inptr1, inptr2, outptr0, outptr1;
-  int inrow, outrow;
-  /* Setup constants. */
-  const uint16x8_t one_u16 = vdupq_n_u16(1);
-  const uint8x8_t three_u8 = vdup_n_u8(3);
-
-  inrow = outrow = 0;
-  while (outrow < max_v_samp_factor) {
-    inptr0 = input_data[inrow - 1];
-    inptr1 = input_data[inrow];
-    inptr2 = input_data[inrow + 1];
-    /* Suffixes 0 and 1 denote the top and bottom rows of output pixels */
-    /* respectively. */
-    outptr0 = output_data[outrow++];
-    outptr1 = output_data[outrow++];
-    inrow++;
-
-    /* The size of the input and output buffers is always a multiple of 32 */
-    /* bytes => no need to worry about buffer overflow when reading/writing */
-    /* memory. See "Creation of 2-D sample arrays" in jmemmgr.c for details. */
-    for (unsigned colctr = 0; colctr < downsampled_width; colctr += 16) {
-      /* Load samples. */
-      uint8x16_t r0 = vld1q_u8(inptr0 + colctr);
-      uint8x16_t r1 = vld1q_u8(inptr1 + colctr);
-      uint8x16_t r2 = vld1q_u8(inptr2 + colctr);
-      /* Blend samples vertically. */
-      uint16x8_t colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(r0)),
-                                      vget_low_u8(r1), three_u8);
-      uint16x8_t colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(r0)),
-                                      vget_high_u8(r1), three_u8);
-      uint16x8_t colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(r2)),
-                                      vget_low_u8(r1), three_u8);
-      uint16x8_t colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(r2)),
-                                      vget_high_u8(r1), three_u8);
-      /* Add ordered dithering bias to pixel values in even output rows. */
-      colsum0_l = vaddq_u16(colsum0_l, one_u16);
-      colsum0_h = vaddq_u16(colsum0_h, one_u16);
-      /* Right-shift by 2 (divide by 4), narrow to 8-bit and combine. */
-      uint8x16_t output_pixels0 = vcombine_u8(vshrn_n_u16(colsum0_l, 2),
-                                              vshrn_n_u16(colsum0_h, 2));
-      uint8x16_t output_pixels1 = vcombine_u8(vrshrn_n_u16(colsum1_l, 2),
-                                              vrshrn_n_u16(colsum1_h, 2));
-      /* Store pixel channel values to memory. */
-      vst1q_u8(outptr0 + colctr, output_pixels0);
-      vst1q_u8(outptr1 + colctr, output_pixels1);
-    }
-  }
-}
-
-
-/*
- * The diagram below shows the operation of h2v1 (simple) upsampling. Each
- * sample in the row is duplicated to form two output pixel channel values.
- *
- *                          p0   p1   p2   p3
- *     +----+----+        +----+----+----+----+
- *     | s0 | s1 |   ->   | s0 | s0 | s1 | s1 |
- *     +----+----+        +----+----+----+----+
- */
-
-void jsimd_h2v1_upsample_neon(int max_v_samp_factor,
-                              JDIMENSION output_width,
-                              JSAMPARRAY input_data,
-                              JSAMPARRAY *output_data_ptr)
-{
-  JSAMPARRAY output_data = *output_data_ptr;
-  JSAMPROW inptr, outptr;
-
-  for (int inrow = 0; inrow < max_v_samp_factor; inrow++) {
-    inptr = input_data[inrow];
-    outptr = output_data[inrow];
-    for (unsigned colctr = 0; 2 * colctr < output_width; colctr += 16) {
-      uint8x16_t samples = vld1q_u8(inptr + colctr);
-      /* Duplicate the samples - the store interleaves them to produce the */
-      /* pattern in the diagram above. */
-      uint8x16x2_t output_pixels = { samples, samples };
-      /* Store pixel values to memory. */
-      /* Due to the way sample buffers are allocated, we don't need to worry */
-      /* about tail cases when output_width is not a multiple of 32. */
-      /* See "Creation of 2-D sample arrays" in jmemmgr.c for details. */
-      vst2q_u8(outptr + 2 * colctr, output_pixels);
-    }
-  }
-}
-
-
-/*
- * The diagram below shows the operation of h2v2 (simple) upsampling. Each
- * sample in the row is duplicated to form two output pixel channel values.
- * This horizontally-upsampled row is then also duplicated.
- *
- *                                  p0    p1    p2    p3
- *       +-----+-----+           +-----+-----+-----+-----+
- *       |  s0 |  s1 |    ->     |  s0 |  s0 |  s1 |  s1 |
- *       +-----+-----+           +-----+-----+-----+-----+
- *                               |  s0 |  s0 |  s1 |  s1 |
- *                               +-----+-----+-----+-----+
- */
-
-void jsimd_h2v2_upsample_neon(int max_v_samp_factor,
-                              JDIMENSION output_width,
-                              JSAMPARRAY input_data,
-                              JSAMPARRAY *output_data_ptr)
-{
-  JSAMPARRAY output_data = *output_data_ptr;
-  JSAMPROW inptr, outptr0, outptr1;
-
-  for (int inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {
-    inptr = input_data[inrow];
-    outptr0 = output_data[outrow++];
-    outptr1 = output_data[outrow++];
-
-    for (unsigned colctr = 0; 2 * colctr < output_width; colctr += 16) {
-      uint8x16_t samples = vld1q_u8(inptr + colctr);
-      /* Duplicate the samples - the store interleaves them to produce the */
-      /* pattern in the diagram above. */
-      uint8x16x2_t output_pixels = { samples, samples };
-      /* Store pixel values to memory for both output rows. */
-      /* Due to the way sample buffers are allocated, we don't need to worry */
-      /* about tail cases when output_width is not a multiple of 32. */
-      /* See "Creation of 2-D sample arrays" in jmemmgr.c for details. */
-      vst2q_u8(outptr0 + 2 * colctr, output_pixels);
-      vst2q_u8(outptr1 + 2 * colctr, output_pixels);
-    }
-  }
-}