diff options
author | Jonathan Wright <jonathan.wright@arm.com> | 2019-05-08 18:18:27 +0100 |
---|---|---|
committer | Jonathan Wright <jonathan.wright@arm.com> | 2019-06-13 12:59:30 +0100 |
commit | e1669e3707c6448a01c8a0dc3e4b20976a4dacf3 (patch) | |
tree | 2b38312bf724c4245a2a62994bb8f52d46b70f33 /simd | |
parent | 0927aa3f57d4a90d02576155e6ce4380aee98bcb (diff) |
Add Arm NEON implementation of h1v2_fancy_upsample
Adds an Arm NEON intrinsics implementation of h1v2_fancy_upsample.
There was no previous NEON assembly implementation for either AArch32
or AArch64.
Bug: 922430
Change-Id: Ifd10626083a48bd59a048a8de3ec8132b91d4bd5
Diffstat (limited to 'simd')
-rw-r--r-- | simd/arm/arm/jsimd.c | 14 | ||||
-rw-r--r-- | simd/arm/arm64/jsimd.c | 14 | ||||
-rw-r--r-- | simd/arm/common/jdsample-neon.c | 96 | ||||
-rw-r--r-- | simd/jsimd.h | 3 |
4 files changed, 127 insertions, 0 deletions
diff --git a/simd/arm/arm/jsimd.c b/simd/arm/arm/jsimd.c index 28012c1..264876e 100644 --- a/simd/arm/arm/jsimd.c +++ b/simd/arm/arm/jsimd.c @@ -380,6 +380,17 @@ jsimd_can_h2v1_fancy_upsample(void) GLOBAL(int) jsimd_can_h1v2_fancy_upsample(void) { + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + return 0; } @@ -405,6 +416,9 @@ GLOBAL(void) jsimd_h1v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) { + jsimd_h1v2_fancy_upsample_neon(cinfo->max_v_samp_factor, + compptr->downsampled_width, input_data, + output_data_ptr); } GLOBAL(int) diff --git a/simd/arm/arm64/jsimd.c b/simd/arm/arm64/jsimd.c index 7a472fc..a7a2547 100644 --- a/simd/arm/arm64/jsimd.c +++ b/simd/arm/arm64/jsimd.c @@ -444,6 +444,17 @@ jsimd_can_h2v1_fancy_upsample(void) GLOBAL(int) jsimd_can_h1v2_fancy_upsample(void) { + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + return 0; } @@ -466,6 +477,9 @@ GLOBAL(void) jsimd_h1v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) { + jsimd_h1v2_fancy_upsample_neon(cinfo->max_v_samp_factor, + compptr->downsampled_width, input_data, + output_data_ptr); } GLOBAL(int) diff --git a/simd/arm/common/jdsample-neon.c b/simd/arm/common/jdsample-neon.c index c6bdac1..d5a953c 100644 --- a/simd/arm/common/jdsample-neon.c +++ b/simd/arm/common/jdsample-neon.c @@ -251,3 +251,99 @@ void jsimd_h2v2_fancy_upsample_neon(int max_v_samp_factor, inrow++; } } + + +/* + * The diagram below shows a grid-window of samples (luma or chroma) produced + * by h2v1 downsampling; which has been subsequently rotated 90 degrees. (The + * usual use of h1v2 upsampling is upsampling rotated or transposed h2v1 + * downsampled images.) + * + * s0 s1 + * +---------+---------+ + * | p0 | p1 | + * r0 | | | + * | p2 | p3 | + * +---------+---------+ + * | p4 | p5 | + * r1 | | | + * | p6 | p7 | + * +---------+---------+ + * | p8 | p9 | + * r2 | | | + * | p10 | p11 | + * +---------+---------+ + * + * Every sample contains two of the original pixel channel values. The pixels' + * channel values are centred at positions p0, p1, p2,..., p11 above. For a + * given grid-window position, r1 is always used to denote the row of samples + * containing the pixel channel values we are computing. For the top row of + * pixel channel values in r1 (p4 and p5), the nearest neighbouring samples are + * in the row above - denoted by r0. Likewise, for the bottom row of pixels in + * r1 (p6 and p7), the nearest neighbouring samples are in the row below - + * denoted by r2. + * + * To compute the pixel channel values of the original image, we proportionally + * blend the adjacent samples in each column. + * + * For example, the pixel channel value centred at p4 would be computed as + * follows: + * 3/4 * s0r1 + 1/4 * s0r0 + * while the pixel channel value centred at p6 would be: + * 3/4 * s0r1 + 1/4 * s0r2 + */ + +void jsimd_h1v2_fancy_upsample_neon(int max_v_samp_factor, + JDIMENSION downsampled_width, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr) +{ + JSAMPARRAY output_data = *output_data_ptr; + JSAMPROW inptr0, inptr1, inptr2, outptr0, outptr1; + int inrow, outrow; + /* Setup constants. */ + const uint16x8_t one_u16 = vdupq_n_u16(1); + const uint8x8_t three_u8 = vdup_n_u8(3); + + inrow = outrow = 0; + while (outrow < max_v_samp_factor) { + inptr0 = input_data[inrow - 1]; + inptr1 = input_data[inrow]; + inptr2 = input_data[inrow + 1]; + /* Suffixes 0 and 1 denote the top and bottom rows of output pixels */ + /* respectively. */ + outptr0 = output_data[outrow++]; + outptr1 = output_data[outrow++]; + inrow++; + + /* The size of the input and output buffers is always a multiple of 32 */ + /* bytes => no need to worry about buffer overflow when reading/writing */ + /* memory. See "Creation of 2-D sample arrays" in jmemmgr.c for details. */ + for (unsigned colctr = 0; colctr < downsampled_width; colctr += 16) { + /* Load samples. */ + uint8x16_t r0 = vld1q_u8(inptr0 + colctr); + uint8x16_t r1 = vld1q_u8(inptr1 + colctr); + uint8x16_t r2 = vld1q_u8(inptr2 + colctr); + /* Blend samples vertically. */ + uint16x8_t colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(r0)), + vget_low_u8(r1), three_u8); + uint16x8_t colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(r0)), + vget_high_u8(r1), three_u8); + uint16x8_t colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(r2)), + vget_low_u8(r1), three_u8); + uint16x8_t colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(r2)), + vget_high_u8(r1), three_u8); + /* Add ordered dithering bias to pixel values in even output rows. */ + colsum0_l = vaddq_u16(colsum0_l, one_u16); + colsum0_h = vaddq_u16(colsum0_h, one_u16); + /* Right-shift by 2 (divide by 4), narrow to 8-bit and combine. */ + uint8x16_t output_pixels0 = vcombine_u8(vshrn_n_u16(colsum0_l, 2), + vshrn_n_u16(colsum0_h, 2)); + uint8x16_t output_pixels1 = vcombine_u8(vrshrn_n_u16(colsum1_l, 2), + vrshrn_n_u16(colsum1_h, 2)); + /* Store pixel channel values to memory. */ + vst1q_u8(outptr0 + colctr, output_pixels0); + vst1q_u8(outptr1 + colctr, output_pixels1); + } + } +} diff --git a/simd/jsimd.h b/simd/jsimd.h index 7c1e9e7..d20084b 100644 --- a/simd/jsimd.h +++ b/simd/jsimd.h @@ -611,6 +611,9 @@ EXTERN(void) jsimd_h2v1_fancy_upsample_neon EXTERN(void) jsimd_h2v2_fancy_upsample_neon (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr); +EXTERN(void) jsimd_h1v2_fancy_upsample_neon + (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); EXTERN(void) jsimd_h2v1_fancy_upsample_dspr2 (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data, |