diff options
author | Jonathan Wright <jonathan.wright@arm.com> | 2018-06-21 17:55:51 +0100 |
---|---|---|
committer | Jonathan Wright <jonathan.wright@arm.com> | 2019-05-31 13:47:12 +0100 |
commit | 2a34770be9715cfc1badff10fceba52dd393b094 (patch) | |
tree | b5dbfe547e13866be6a6192868080ebd5119da0c /simd | |
parent | 0aabb3f90fc8f4e477609a2c15d18804f14fb218 (diff) |
Add Arm NEON implementation of h2v2_fancy_upsample
Adds an Arm NEON intrinsics implementation of h2v2_fancy_usample.
There was no previous NEON assembly implementation for either AArch32
or AArch64.
Bug: 922430
Change-Id: I479dd075cdeea945c8e0c4c9c60d1b55e74cb5ab
Diffstat (limited to 'simd')
-rw-r--r-- | simd/arm/arm/jsimd.c (renamed from simd/arm/jsimd.c) | 24 | ||||
-rw-r--r-- | simd/arm/arm/jsimd_neon.S (renamed from simd/arm/jsimd_neon.S) | 0 | ||||
-rw-r--r-- | simd/arm/arm64/jsimd.c (renamed from simd/arm64/jsimd.c) | 24 | ||||
-rw-r--r-- | simd/arm/arm64/jsimd_neon.S (renamed from simd/arm64/jsimd_neon.S) | 0 | ||||
-rw-r--r-- | simd/arm/common/jdsample-neon.c | 253 | ||||
-rw-r--r-- | simd/jsimd.h | 3 |
6 files changed, 294 insertions, 10 deletions
diff --git a/simd/arm/jsimd.c b/simd/arm/arm/jsimd.c index ed70ead..3a31eb7 100644 --- a/simd/arm/jsimd.c +++ b/simd/arm/arm/jsimd.c @@ -17,12 +17,12 @@ */ #define JPEG_INTERNALS -#include "../../jinclude.h" -#include "../../jpeglib.h" +#include "../../../jinclude.h" +#include "../../../jpeglib.h" +#include "../../../jsimd.h" +#include "../../../jdct.h" +#include "../../../jsimddct.h" #include "../../jsimd.h" -#include "../../jdct.h" -#include "../../jsimddct.h" -#include "../jsimd.h" #include <stdio.h> #include <string.h> @@ -346,6 +346,17 @@ jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, GLOBAL(int) jsimd_can_h2v2_fancy_upsample(void) { + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + return 0; } @@ -370,6 +381,9 @@ GLOBAL(void) jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) { + jsimd_h2v2_fancy_upsample_neon(cinfo->max_v_samp_factor, + compptr->downsampled_width, input_data, + output_data_ptr); } GLOBAL(void) diff --git a/simd/arm/jsimd_neon.S b/simd/arm/arm/jsimd_neon.S index af929fe..af929fe 100644 --- a/simd/arm/jsimd_neon.S +++ b/simd/arm/arm/jsimd_neon.S diff --git a/simd/arm64/jsimd.c b/simd/arm/arm64/jsimd.c index 0e6c7b9..17ca247 100644 --- a/simd/arm64/jsimd.c +++ b/simd/arm/arm64/jsimd.c @@ -16,12 +16,12 @@ */ #define JPEG_INTERNALS -#include "../../jinclude.h" -#include "../../jpeglib.h" +#include "../../../jinclude.h" +#include "../../../jpeglib.h" +#include "../../../jsimd.h" +#include "../../../jdct.h" +#include "../../../jsimddct.h" #include "../../jsimd.h" -#include "../../jdct.h" -#include "../../jsimddct.h" -#include "../jsimd.h" #include <stdio.h> #include <string.h> @@ -421,6 +421,17 @@ jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, GLOBAL(int) jsimd_can_h2v2_fancy_upsample(void) { + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + return 0; } @@ -434,6 +445,9 @@ GLOBAL(void) jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) { + jsimd_h2v2_fancy_upsample_neon(cinfo->max_v_samp_factor, + compptr->downsampled_width, input_data, + output_data_ptr); } GLOBAL(void) diff --git a/simd/arm64/jsimd_neon.S b/simd/arm/arm64/jsimd_neon.S index 93472ef..93472ef 100644 --- a/simd/arm64/jsimd_neon.S +++ b/simd/arm/arm64/jsimd_neon.S diff --git a/simd/arm/common/jdsample-neon.c b/simd/arm/common/jdsample-neon.c new file mode 100644 index 0000000..c6bdac1 --- /dev/null +++ b/simd/arm/common/jdsample-neon.c @@ -0,0 +1,253 @@ +/* + * jdsample-neon.c - upsampling (Arm NEON) + * + * Copyright 2019 The Chromium Authors. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +#define JPEG_INTERNALS +#include "../../../jinclude.h" +#include "../../../jpeglib.h" +#include "../../../jsimd.h" +#include "../../../jdct.h" +#include "../../../jsimddct.h" +#include "../../jsimd.h" + +#include <arm_neon.h> + +/* + * The diagram below shows a grid-window of samples (luma or chroma) produced + * by h2v2 downsampling. + * + * s0 s1 + * +---------+---------+ + * | p0 p1 | p2 p3 | + * r0 | | | + * | p4 p5 | p6 p7 | + * +---------+---------+ + * | p8 p9 | p10 p11| + * r1 | | | + * | p12 p13| p14 p15| + * +---------+---------+ + * | p16 p17| p18 p19| + * r2 | | | + * | p20 p21| p22 p23| + * +---------+---------+ + * + * Every sample contains four of the original pixel channel values. The pixels' + * channel values are centred at positions p0, p1, p2,..., p23 above. For a + * given grid-window position, r1 is always used to denote the row of samples + * containing the pixel channel values we are computing. For the top row of + * pixel channel values in r1 (p8-p11), the nearest neighbouring samples are in + * the row above - denoted by r0. Likewise, for the bottom row of pixels in r1 + * (p12-p15), the nearest neighbouring samples are in the row below - denoted + * by r2. + * + * To compute the pixel channel values of the original image, we proportionally + * blend the sample containing the pixel centre with the nearest neighbouring + * samples in each row, column and diagonal. + * + * There are three cases to consider: + * + * 1) The first pixel in this row of the original image. + * Pixel channel value p8 only contains components from sample column s0. + * Its value is computed by blending samples s0r1 and s0r0 in the ratio 3:1. + * 2) The last pixel in this row of the original image. + * Pixel channel value p11 only contains components from sample column s1. + * Its value is computed by blending samples s1r1 and s1r0 in the ratio 3:1. + * 3) General case (all other pixels in the row). + * Apart from the first and last pixels, every other pixel channel value in + * the row contains components from samples in adjacent columns. + * + * For example, the pixel centred at p9 would be computed as follows: + * (9/16 * s0r1) + (3/16 * s0r0) + (3/16 * s1r1) + (1/16 * s1r0) + * + * This can be broken down into two steps: + * 1) Blend samples vertically in columns s0 and s1 in the ratio 3:1: + * s0colsum = 3/4 * s0r1 + 1/4 * s0r0 + * s1colsum = 3/4 * s1r1 + 1/4 * s1r0 + * 2) Blend the already-blended columns in the ratio 3:1: + * p9 = 3/4 * s0colsum + 1/4 * s1colsum + * + * The bottom row of pixel channel values in row r1 can be computed in the same + * way for each of the three cases, only using samples in row r2 instead of row + * r0 - as r2 is the nearest neighbouring row. + */ + +void jsimd_h2v2_fancy_upsample_neon(int max_v_samp_factor, + JDIMENSION downsampled_width, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr) +{ + JSAMPARRAY output_data = *output_data_ptr; + JSAMPROW inptr0, inptr1, inptr2, outptr0, outptr1; + int inrow, outrow; + /* Setup constants. */ + const uint16x8_t seven_u16 = vdupq_n_u16(7); + const uint8x8_t three_u8 = vdup_n_u8(3); + const uint16x8_t three_u16 = vdupq_n_u16(3); + + inrow = outrow = 0; + while (outrow < max_v_samp_factor) { + inptr0 = input_data[inrow - 1]; + inptr1 = input_data[inrow]; + inptr2 = input_data[inrow + 1]; + /* Suffixes 0 and 1 denote the top and bottom rows of output pixels */ + /* respectively. */ + outptr0 = output_data[outrow++]; + outptr1 = output_data[outrow++]; + + /* Case 1: first pixel channel value in this row of original image. */ + int s0colsum0 = GETJSAMPLE(*inptr1) * 3 + GETJSAMPLE(*inptr0); + *outptr0 = (JSAMPLE)((s0colsum0 * 4 + 8) >> 4); + int s0colsum1 = GETJSAMPLE(*inptr1) * 3 + GETJSAMPLE(*inptr2); + *outptr1 = (JSAMPLE)((s0colsum1 * 4 + 8) >> 4); + + /* General case as described above. */ + /* Step 1: Blend samples vertically in columns s0 and s1. */ + /* Leave the divide by 4 to the end when it can be done for both */ + /* dimensions at once, right-shifting by 4. */ + + /* Load and compute s0colsum0 and s0colsum1. */ + uint8x16_t s0r0 = vld1q_u8(inptr0); + uint8x16_t s0r1 = vld1q_u8(inptr1); + uint8x16_t s0r2 = vld1q_u8(inptr2); + /* Multiplication makes vectors twice as wide: '_l' and '_h' suffixes */ + /* denote low half and high half respectively. */ + uint16x8_t s0colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s0r0)), + vget_low_u8(s0r1), three_u8); + uint16x8_t s0colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s0r0)), + vget_high_u8(s0r1), three_u8); + uint16x8_t s0colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s0r2)), + vget_low_u8(s0r1), three_u8); + uint16x8_t s0colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s0r2)), + vget_high_u8(s0r1), three_u8); + /* Load and compute s1colsum0 and s1colsum1. */ + uint8x16_t s1r0 = vld1q_u8(inptr0 + 1); + uint8x16_t s1r1 = vld1q_u8(inptr1 + 1); + uint8x16_t s1r2 = vld1q_u8(inptr2 + 1); + uint16x8_t s1colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s1r0)), + vget_low_u8(s1r1), three_u8); + uint16x8_t s1colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s1r0)), + vget_high_u8(s1r1), three_u8); + uint16x8_t s1colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s1r2)), + vget_low_u8(s1r1), three_u8); + uint16x8_t s1colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s1r2)), + vget_high_u8(s1r1), three_u8); + /* Step 2: Blend the already-blended columns. */ + uint16x8_t output0_p1_l = vmlaq_u16(s1colsum0_l, s0colsum0_l, three_u16); + uint16x8_t output0_p1_h = vmlaq_u16(s1colsum0_h, s0colsum0_h, three_u16); + uint16x8_t output0_p2_l = vmlaq_u16(s0colsum0_l, s1colsum0_l, three_u16); + uint16x8_t output0_p2_h = vmlaq_u16(s0colsum0_h, s1colsum0_h, three_u16); + uint16x8_t output1_p1_l = vmlaq_u16(s1colsum1_l, s0colsum1_l, three_u16); + uint16x8_t output1_p1_h = vmlaq_u16(s1colsum1_h, s0colsum1_h, three_u16); + uint16x8_t output1_p2_l = vmlaq_u16(s0colsum1_l, s1colsum1_l, three_u16); + uint16x8_t output1_p2_h = vmlaq_u16(s0colsum1_h, s1colsum1_h, three_u16); + /* Add ordered dithering bias to odd pixel values. */ + output0_p1_l = vaddq_u16(output0_p1_l, seven_u16); + output0_p1_h = vaddq_u16(output0_p1_h, seven_u16); + output1_p1_l = vaddq_u16(output1_p1_l, seven_u16); + output1_p1_h = vaddq_u16(output1_p1_h, seven_u16); + /* Right-shift by 4 (divide by 16), narrow to 8-bit and combine. */ + uint8x16x2_t output_pixels0 = { vcombine_u8(vshrn_n_u16(output0_p1_l, 4), + vshrn_n_u16(output0_p1_h, 4)), + vcombine_u8(vrshrn_n_u16(output0_p2_l, 4), + vrshrn_n_u16(output0_p2_h, 4)) + }; + uint8x16x2_t output_pixels1 = { vcombine_u8(vshrn_n_u16(output1_p1_l, 4), + vshrn_n_u16(output1_p1_h, 4)), + vcombine_u8(vrshrn_n_u16(output1_p2_l, 4), + vrshrn_n_u16(output1_p2_h, 4)) + }; + /* Store pixel channel values to memory. */ + /* The minimum size of the output buffer for each row is 64 bytes => no */ + /* need to worry about buffer overflow here. See "Creation of 2-D sample */ + /* arrays" in jmemmgr.c for details. */ + vst2q_u8(outptr0 + 1, output_pixels0); + vst2q_u8(outptr1 + 1, output_pixels1); + + /* The first pixel of the image shifted our loads and stores by one */ + /* byte. We have to re-align on a 32-byte boundary at some point before */ + /* the end of the row (we do it now on the 32/33 pixel boundary) to stay */ + /* within the bounds of the sample buffers without having to resort to a */ + /* slow scalar tail case for the last (downsampled_width % 16) samples. */ + /* See "Creation of 2-D sample arrays" in jmemmgr.c for details.*/ + for (unsigned colctr = 16; colctr < downsampled_width; colctr += 16) { + /* Step 1: Blend samples vertically in columns s0 and s1. */ + /* Load and compute s0colsum0 and s0colsum1. */ + s0r0 = vld1q_u8(inptr0 + colctr - 1); + s0r1 = vld1q_u8(inptr1 + colctr - 1); + s0r2 = vld1q_u8(inptr2 + colctr - 1); + s0colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s0r0)), + vget_low_u8(s0r1), three_u8); + s0colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s0r0)), + vget_high_u8(s0r1), three_u8); + s0colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s0r2)), + vget_low_u8(s0r1), three_u8); + s0colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s0r2)), + vget_high_u8(s0r1), three_u8); + /* Load and compute s1colsum0 and s1colsum1. */ + s1r0 = vld1q_u8(inptr0 + colctr); + s1r1 = vld1q_u8(inptr1 + colctr); + s1r2 = vld1q_u8(inptr2 + colctr); + s1colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s1r0)), + vget_low_u8(s1r1), three_u8); + s1colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s1r0)), + vget_high_u8(s1r1), three_u8); + s1colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s1r2)), + vget_low_u8(s1r1), three_u8); + s1colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s1r2)), + vget_high_u8(s1r1), three_u8); + /* Step 2: Blend the already-blended columns. */ + output0_p1_l = vmlaq_u16(s1colsum0_l, s0colsum0_l, three_u16); + output0_p1_h = vmlaq_u16(s1colsum0_h, s0colsum0_h, three_u16); + output0_p2_l = vmlaq_u16(s0colsum0_l, s1colsum0_l, three_u16); + output0_p2_h = vmlaq_u16(s0colsum0_h, s1colsum0_h, three_u16); + output1_p1_l = vmlaq_u16(s1colsum1_l, s0colsum1_l, three_u16); + output1_p1_h = vmlaq_u16(s1colsum1_h, s0colsum1_h, three_u16); + output1_p2_l = vmlaq_u16(s0colsum1_l, s1colsum1_l, three_u16); + output1_p2_h = vmlaq_u16(s0colsum1_h, s1colsum1_h, three_u16); + /* Add ordered dithering bias to odd pixel values. */ + output0_p1_l = vaddq_u16(output0_p1_l, seven_u16); + output0_p1_h = vaddq_u16(output0_p1_h, seven_u16); + output1_p1_l = vaddq_u16(output1_p1_l, seven_u16); + output1_p1_h = vaddq_u16(output1_p1_h, seven_u16); + /* Right-shift by 4 (divide by 16), narrow to 8-bit and combine. */ + output_pixels0.val[0] = vcombine_u8(vshrn_n_u16(output0_p1_l, 4), + vshrn_n_u16(output0_p1_h, 4)); + output_pixels0.val[1] = vcombine_u8(vrshrn_n_u16(output0_p2_l, 4), + vrshrn_n_u16(output0_p2_h, 4)); + output_pixels1.val[0] = vcombine_u8(vshrn_n_u16(output1_p1_l, 4), + vshrn_n_u16(output1_p1_h, 4)); + output_pixels1.val[1] = vcombine_u8(vrshrn_n_u16(output1_p2_l, 4), + vrshrn_n_u16(output1_p2_h, 4)); + /* Store pixel channel values to memory. */ + vst2q_u8(outptr0 + 2 * colctr - 1, output_pixels0); + vst2q_u8(outptr1 + 2 * colctr - 1, output_pixels1); + } + + /* Case 2: last pixel channel value in this row of the original image. */ + int s1colsum0 = GETJSAMPLE(inptr1[downsampled_width - 1]) * 3 + + GETJSAMPLE(inptr0[downsampled_width - 1]); + outptr0[2 * downsampled_width - 1] = (JSAMPLE)((s1colsum0 * 4 + 7) >> 4); + int s1colsum1 = GETJSAMPLE(inptr1[downsampled_width - 1]) * 3 + + GETJSAMPLE(inptr2[downsampled_width - 1]); + outptr1[2 * downsampled_width - 1] = (JSAMPLE)((s1colsum1 * 4 + 7) >> 4); + inrow++; + } +} diff --git a/simd/jsimd.h b/simd/jsimd.h index a9fc812..7c1e9e7 100644 --- a/simd/jsimd.h +++ b/simd/jsimd.h @@ -608,6 +608,9 @@ EXTERN(void) jsimd_h2v2_fancy_upsample_avx2 EXTERN(void) jsimd_h2v1_fancy_upsample_neon (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr); +EXTERN(void) jsimd_h2v2_fancy_upsample_neon + (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); EXTERN(void) jsimd_h2v1_fancy_upsample_dspr2 (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data, |