Add Arm NEON implementation of h2v2_fancy_upsample

Adds an Arm NEON intrinsics implementation of h2v2_fancy_usample. There was no previous NEON assembly implementation for either AArch32 or AArch64. Bug: 922430 Change-Id: I479dd075cdeea945c8e0c4c9c60d1b55e74cb5ab
author: Jonathan Wright <jonathan.wright@arm.com> 2018-06-21 17:55:51 +0100
committer: Jonathan Wright <jonathan.wright@arm.com> 2019-05-31 13:47:12 +0100
commit: 2a34770be9715cfc1badff10fceba52dd393b094 (patch)
tree: b5dbfe547e13866be6a6192868080ebd5119da0c /simd
parent: 0aabb3f90fc8f4e477609a2c15d18804f14fb218 (diff)
6 files changed, 294 insertions, 10 deletions
diff --git a/simd/arm/jsimd.c b/simd/arm/arm/jsimd.c
index ed70ead..3a31eb7 100644
--- a/simd/arm/jsimd.c
+++ b/simd/arm/arm/jsimd.c
@@ -17,12 +17,12 @@
  */
 
 #define JPEG_INTERNALS
-#include "../../jinclude.h"
-#include "../../jpeglib.h"
+#include "../../../jinclude.h"
+#include "../../../jpeglib.h"
+#include "../../../jsimd.h"
+#include "../../../jdct.h"
+#include "../../../jsimddct.h"
 #include "../../jsimd.h"
-#include "../../jdct.h"
-#include "../../jsimddct.h"
-#include "../jsimd.h"
 
 #include <stdio.h>
 #include <string.h>
@@ -346,6 +346,17 @@ jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
 GLOBAL(int)
 jsimd_can_h2v2_fancy_upsample(void)
 {
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
   return 0;
 }
 
@@ -370,6 +381,9 @@ GLOBAL(void)
 jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                           JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
+  jsimd_h2v2_fancy_upsample_neon(cinfo->max_v_samp_factor,
+                                 compptr->downsampled_width, input_data,
+                                 output_data_ptr);
 }
 
 GLOBAL(void)
diff --git a/simd/arm/jsimd_neon.S b/simd/arm/arm/jsimd_neon.S
index af929fe..af929fe 100644
--- a/simd/arm/jsimd_neon.S
+++ b/simd/arm/arm/jsimd_neon.S
diff --git a/simd/arm64/jsimd.c b/simd/arm/arm64/jsimd.c
index 0e6c7b9..17ca247 100644
--- a/simd/arm64/jsimd.c
+++ b/simd/arm/arm64/jsimd.c
@@ -16,12 +16,12 @@
  */
 
 #define JPEG_INTERNALS
-#include "../../jinclude.h"
-#include "../../jpeglib.h"
+#include "../../../jinclude.h"
+#include "../../../jpeglib.h"
+#include "../../../jsimd.h"
+#include "../../../jdct.h"
+#include "../../../jsimddct.h"
 #include "../../jsimd.h"
-#include "../../jdct.h"
-#include "../../jsimddct.h"
-#include "../jsimd.h"
 
 #include <stdio.h>
 #include <string.h>
@@ -421,6 +421,17 @@ jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
 GLOBAL(int)
 jsimd_can_h2v2_fancy_upsample(void)
 {
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
   return 0;
 }
 
@@ -434,6 +445,9 @@ GLOBAL(void)
 jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                           JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
+  jsimd_h2v2_fancy_upsample_neon(cinfo->max_v_samp_factor,
+                                 compptr->downsampled_width, input_data,
+                                 output_data_ptr);
 }
 
 GLOBAL(void)
diff --git a/simd/arm64/jsimd_neon.S b/simd/arm/arm64/jsimd_neon.S
index 93472ef..93472ef 100644
--- a/simd/arm64/jsimd_neon.S
+++ b/simd/arm/arm64/jsimd_neon.S
diff --git a/simd/arm/common/jdsample-neon.c b/simd/arm/common/jdsample-neon.c
new file mode 100644
index 0000000..c6bdac1
--- /dev/null
+++ b/simd/arm/common/jdsample-neon.c
@@ -0,0 +1,253 @@
+/*
+ * jdsample-neon.c - upsampling (Arm NEON)
+ *
+ * Copyright 2019 The Chromium Authors. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../../jinclude.h"
+#include "../../../jpeglib.h"
+#include "../../../jsimd.h"
+#include "../../../jdct.h"
+#include "../../../jsimddct.h"
+#include "../../jsimd.h"
+
+#include <arm_neon.h>
+
+/*
+ * The diagram below shows a grid-window of samples (luma or chroma) produced
+ * by h2v2 downsampling.
+ *
+ *                  s0        s1
+ *             +---------+---------+
+ *             | p0   p1 | p2   p3 |
+ *     r0      |         |         |
+ *             | p4   p5 | p6   p7 |
+ *             +---------+---------+
+ *             | p8   p9 | p10  p11|
+ *     r1      |         |         |
+ *             | p12  p13| p14  p15|
+ *             +---------+---------+
+ *             | p16  p17| p18  p19|
+ *     r2      |         |         |
+ *             | p20  p21| p22  p23|
+ *             +---------+---------+
+ *
+ * Every sample contains four of the original pixel channel values. The pixels'
+ * channel values are centred at positions p0, p1, p2,..., p23 above. For a
+ * given grid-window position, r1 is always used to denote the row of samples
+ * containing the pixel channel values we are computing. For the top row of
+ * pixel channel values in r1 (p8-p11), the nearest neighbouring samples are in
+ * the row above - denoted by r0. Likewise, for the bottom row of pixels in r1
+ * (p12-p15), the nearest neighbouring samples are in the row below - denoted
+ * by r2.
+ *
+ * To compute the pixel channel values of the original image, we proportionally
+ * blend the sample containing the pixel centre with the nearest neighbouring
+ * samples in each row, column and diagonal.
+ *
+ * There are three cases to consider:
+ *
+ * 1) The first pixel in this row of the original image.
+ *    Pixel channel value p8 only contains components from sample column s0.
+ *    Its value is computed by blending samples s0r1 and s0r0 in the ratio 3:1.
+ * 2) The last pixel in this row of the original image.
+ *    Pixel channel value p11 only contains components from sample column s1.
+ *    Its value is computed by blending samples s1r1 and s1r0 in the ratio 3:1.
+ * 3) General case (all other pixels in the row).
+ *    Apart from the first and last pixels, every other pixel channel value in
+ *    the row contains components from samples in adjacent columns.
+ *
+ *    For example, the pixel centred at p9 would be computed as follows:
+ *        (9/16 * s0r1) + (3/16 * s0r0) + (3/16 * s1r1) + (1/16 * s1r0)
+ *
+ *    This can be broken down into two steps:
+ *    1) Blend samples vertically in columns s0 and s1 in the ratio 3:1:
+ *        s0colsum = 3/4 * s0r1 + 1/4 * s0r0
+ *        s1colsum = 3/4 * s1r1 + 1/4 * s1r0
+ *    2) Blend the already-blended columns in the ratio 3:1:
+ *        p9 = 3/4 * s0colsum + 1/4 * s1colsum
+ *
+ * The bottom row of pixel channel values in row r1 can be computed in the same
+ * way for each of the three cases, only using samples in row r2 instead of row
+ * r0 - as r2 is the nearest neighbouring row.
+ */
+
+void jsimd_h2v2_fancy_upsample_neon(int max_v_samp_factor,
+                                    JDIMENSION downsampled_width,
+                                    JSAMPARRAY input_data,
+                                    JSAMPARRAY *output_data_ptr)
+{
+  JSAMPARRAY output_data = *output_data_ptr;
+  JSAMPROW inptr0, inptr1, inptr2, outptr0, outptr1;
+  int inrow, outrow;
+  /* Setup constants. */
+  const uint16x8_t seven_u16 = vdupq_n_u16(7);
+  const uint8x8_t three_u8 = vdup_n_u8(3);
+  const uint16x8_t three_u16 = vdupq_n_u16(3);
+
+  inrow = outrow = 0;
+  while (outrow < max_v_samp_factor) {
+    inptr0 = input_data[inrow - 1];
+    inptr1 = input_data[inrow];
+    inptr2 = input_data[inrow + 1];
+    /* Suffixes 0 and 1 denote the top and bottom rows of output pixels */
+    /* respectively. */
+    outptr0 = output_data[outrow++];
+    outptr1 = output_data[outrow++];
+
+    /* Case 1: first pixel channel value in this row of original image. */
+    int s0colsum0 = GETJSAMPLE(*inptr1) * 3 + GETJSAMPLE(*inptr0);
+    *outptr0 = (JSAMPLE)((s0colsum0 * 4 + 8) >> 4);
+    int s0colsum1 = GETJSAMPLE(*inptr1) * 3 + GETJSAMPLE(*inptr2);
+    *outptr1 = (JSAMPLE)((s0colsum1 * 4 + 8) >> 4);
+
+    /* General case as described above. */
+    /* Step 1: Blend samples vertically in columns s0 and s1. */
+    /* Leave the divide by 4 to the end when it can be done for both */
+    /* dimensions at once, right-shifting by 4. */
+
+    /* Load and compute s0colsum0 and s0colsum1. */
+    uint8x16_t s0r0 = vld1q_u8(inptr0);
+    uint8x16_t s0r1 = vld1q_u8(inptr1);
+    uint8x16_t s0r2 = vld1q_u8(inptr2);
+    /* Multiplication makes vectors twice as wide: '_l' and '_h' suffixes */
+    /* denote low half and high half respectively. */
+    uint16x8_t s0colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s0r0)),
+                                      vget_low_u8(s0r1), three_u8);
+    uint16x8_t s0colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s0r0)),
+                                      vget_high_u8(s0r1), three_u8);
+    uint16x8_t s0colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s0r2)),
+                                      vget_low_u8(s0r1), three_u8);
+    uint16x8_t s0colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s0r2)),
+                                      vget_high_u8(s0r1), three_u8);
+    /* Load and compute s1colsum0 and s1colsum1. */
+    uint8x16_t s1r0 = vld1q_u8(inptr0 + 1);
+    uint8x16_t s1r1 = vld1q_u8(inptr1 + 1);
+    uint8x16_t s1r2 = vld1q_u8(inptr2 + 1);
+    uint16x8_t s1colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s1r0)),
+                                      vget_low_u8(s1r1), three_u8);
+    uint16x8_t s1colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s1r0)),
+                                      vget_high_u8(s1r1), three_u8);
+    uint16x8_t s1colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s1r2)),
+                                      vget_low_u8(s1r1), three_u8);
+    uint16x8_t s1colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s1r2)),
+                                      vget_high_u8(s1r1), three_u8);
+    /* Step 2: Blend the already-blended columns. */
+    uint16x8_t output0_p1_l = vmlaq_u16(s1colsum0_l, s0colsum0_l, three_u16);
+    uint16x8_t output0_p1_h = vmlaq_u16(s1colsum0_h, s0colsum0_h, three_u16);
+    uint16x8_t output0_p2_l = vmlaq_u16(s0colsum0_l, s1colsum0_l, three_u16);
+    uint16x8_t output0_p2_h = vmlaq_u16(s0colsum0_h, s1colsum0_h, three_u16);
+    uint16x8_t output1_p1_l = vmlaq_u16(s1colsum1_l, s0colsum1_l, three_u16);
+    uint16x8_t output1_p1_h = vmlaq_u16(s1colsum1_h, s0colsum1_h, three_u16);
+    uint16x8_t output1_p2_l = vmlaq_u16(s0colsum1_l, s1colsum1_l, three_u16);
+    uint16x8_t output1_p2_h = vmlaq_u16(s0colsum1_h, s1colsum1_h, three_u16);
+    /* Add ordered dithering bias to odd pixel values. */
+    output0_p1_l = vaddq_u16(output0_p1_l, seven_u16);
+    output0_p1_h = vaddq_u16(output0_p1_h, seven_u16);
+    output1_p1_l = vaddq_u16(output1_p1_l, seven_u16);
+    output1_p1_h = vaddq_u16(output1_p1_h, seven_u16);
+    /* Right-shift by 4 (divide by 16), narrow to 8-bit and combine. */
+    uint8x16x2_t output_pixels0 = { vcombine_u8(vshrn_n_u16(output0_p1_l, 4),
+                                                vshrn_n_u16(output0_p1_h, 4)),
+                                    vcombine_u8(vrshrn_n_u16(output0_p2_l, 4),
+                                                vrshrn_n_u16(output0_p2_h, 4))
+                                  };
+    uint8x16x2_t output_pixels1 = { vcombine_u8(vshrn_n_u16(output1_p1_l, 4),
+                                                vshrn_n_u16(output1_p1_h, 4)),
+                                    vcombine_u8(vrshrn_n_u16(output1_p2_l, 4),
+                                                vrshrn_n_u16(output1_p2_h, 4))
+                                  };
+    /* Store pixel channel values to memory. */
+    /* The minimum size of the output buffer for each row is 64 bytes => no */
+    /* need to worry about buffer overflow here. See "Creation of 2-D sample */
+    /* arrays" in jmemmgr.c for details. */
+    vst2q_u8(outptr0 + 1, output_pixels0);
+    vst2q_u8(outptr1 + 1, output_pixels1);
+
+    /* The first pixel of the image shifted our loads and stores by one */
+    /* byte. We have to re-align on a 32-byte boundary at some point before */
+    /* the end of the row (we do it now on the 32/33 pixel boundary) to stay */
+    /* within the bounds of the sample buffers without having to resort to a */
+    /* slow scalar tail case for the last (downsampled_width % 16) samples. */
+    /* See "Creation of 2-D sample arrays" in jmemmgr.c for details.*/
+    for (unsigned colctr = 16; colctr < downsampled_width; colctr += 16) {
+      /* Step 1: Blend samples vertically in columns s0 and s1. */
+      /* Load and compute s0colsum0 and s0colsum1. */
+      s0r0 = vld1q_u8(inptr0 + colctr - 1);
+      s0r1 = vld1q_u8(inptr1 + colctr - 1);
+      s0r2 = vld1q_u8(inptr2 + colctr - 1);
+      s0colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s0r0)),
+                             vget_low_u8(s0r1), three_u8);
+      s0colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s0r0)),
+                             vget_high_u8(s0r1), three_u8);
+      s0colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s0r2)),
+                             vget_low_u8(s0r1), three_u8);
+      s0colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s0r2)),
+                             vget_high_u8(s0r1), three_u8);
+      /* Load and compute s1colsum0 and s1colsum1. */
+      s1r0 = vld1q_u8(inptr0 + colctr);
+      s1r1 = vld1q_u8(inptr1 + colctr);
+      s1r2 = vld1q_u8(inptr2 + colctr);
+      s1colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s1r0)),
+                             vget_low_u8(s1r1), three_u8);
+      s1colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s1r0)),
+                             vget_high_u8(s1r1), three_u8);
+      s1colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s1r2)),
+                             vget_low_u8(s1r1), three_u8);
+      s1colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s1r2)),
+                             vget_high_u8(s1r1), three_u8);
+      /* Step 2: Blend the already-blended columns. */
+      output0_p1_l = vmlaq_u16(s1colsum0_l, s0colsum0_l, three_u16);
+      output0_p1_h = vmlaq_u16(s1colsum0_h, s0colsum0_h, three_u16);
+      output0_p2_l = vmlaq_u16(s0colsum0_l, s1colsum0_l, three_u16);
+      output0_p2_h = vmlaq_u16(s0colsum0_h, s1colsum0_h, three_u16);
+      output1_p1_l = vmlaq_u16(s1colsum1_l, s0colsum1_l, three_u16);
+      output1_p1_h = vmlaq_u16(s1colsum1_h, s0colsum1_h, three_u16);
+      output1_p2_l = vmlaq_u16(s0colsum1_l, s1colsum1_l, three_u16);
+      output1_p2_h = vmlaq_u16(s0colsum1_h, s1colsum1_h, three_u16);
+      /* Add ordered dithering bias to odd pixel values. */
+      output0_p1_l = vaddq_u16(output0_p1_l, seven_u16);
+      output0_p1_h = vaddq_u16(output0_p1_h, seven_u16);
+      output1_p1_l = vaddq_u16(output1_p1_l, seven_u16);
+      output1_p1_h = vaddq_u16(output1_p1_h, seven_u16);
+      /* Right-shift by 4 (divide by 16), narrow to 8-bit and combine. */
+      output_pixels0.val[0] = vcombine_u8(vshrn_n_u16(output0_p1_l, 4),
+                                          vshrn_n_u16(output0_p1_h, 4));
+      output_pixels0.val[1] = vcombine_u8(vrshrn_n_u16(output0_p2_l, 4),
+                                          vrshrn_n_u16(output0_p2_h, 4));
+      output_pixels1.val[0] = vcombine_u8(vshrn_n_u16(output1_p1_l, 4),
+                                          vshrn_n_u16(output1_p1_h, 4));
+      output_pixels1.val[1] = vcombine_u8(vrshrn_n_u16(output1_p2_l, 4),
+                                          vrshrn_n_u16(output1_p2_h, 4));
+      /* Store pixel channel values to memory. */
+      vst2q_u8(outptr0 + 2 * colctr - 1, output_pixels0);
+      vst2q_u8(outptr1 + 2 * colctr - 1, output_pixels1);
+    }
+
+    /* Case 2: last pixel channel value in this row of the original image. */
+    int s1colsum0 = GETJSAMPLE(inptr1[downsampled_width - 1]) * 3 +
+                    GETJSAMPLE(inptr0[downsampled_width - 1]);
+    outptr0[2 * downsampled_width - 1] = (JSAMPLE)((s1colsum0 * 4 + 7) >> 4);
+    int s1colsum1 = GETJSAMPLE(inptr1[downsampled_width - 1]) * 3 +
+                    GETJSAMPLE(inptr2[downsampled_width - 1]);
+    outptr1[2 * downsampled_width - 1] = (JSAMPLE)((s1colsum1 * 4 + 7) >> 4);
+    inrow++;
+  }
+}
diff --git a/simd/jsimd.h b/simd/jsimd.h
index a9fc812..7c1e9e7 100644
--- a/simd/jsimd.h
+++ b/simd/jsimd.h
@@ -608,6 +608,9 @@ EXTERN(void) jsimd_h2v2_fancy_upsample_avx2
 EXTERN(void) jsimd_h2v1_fancy_upsample_neon
   (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
    JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_fancy_upsample_neon
+  (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
 
 EXTERN(void) jsimd_h2v1_fancy_upsample_dspr2
   (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
author	Jonathan Wright <jonathan.wright@arm.com>	2018-06-21 17:55:51 +0100
committer	Jonathan Wright <jonathan.wright@arm.com>	2019-05-31 13:47:12 +0100
commit	2a34770be9715cfc1badff10fceba52dd393b094 (patch)
tree	b5dbfe547e13866be6a6192868080ebd5119da0c /simd
parent	0aabb3f90fc8f4e477609a2c15d18804f14fb218 (diff)