diff options
author | Jonathan Wright <jonathan.wright@arm.com> | 2020-06-30 15:27:11 +0100 |
---|---|---|
committer | Jonathan Wright <jonathan.wright@arm.com> | 2020-08-10 14:51:20 +0100 |
commit | 64fc43d52351ed52143208ce6a656c03db56462b (patch) | |
tree | 497cf54f4ff2619c413fcaca1b5dc978e90bc1f9 | |
parent | 9d4f8005bc6c888e66b00fd00188531ee9bd3344 (diff) |
Add Arm NEON implementation of h2v1_downsample
Adds an Arm NEON intrinsics implementation of h2v1_downsample. This
intrinsics implementation is used to generate code for AArch32 and
AArch64; previously there was only an AArch64 NEON assembly
implementation.
Removes the AArch64 NEON assembly implementation from jsimd_neon.S.
Bug: 922430
Change-Id: I58a83635959fa1dd4923cecdef64792ebb57b54f
-rw-r--r-- | BUILD.gn | 2 | ||||
-rw-r--r-- | README.chromium | 1 | ||||
-rw-r--r-- | simd/arm/arm/jsimd.c | 16 | ||||
-rw-r--r-- | simd/arm/arm64/jsimd_neon.S | 76 | ||||
-rw-r--r-- | simd/arm/common/jcsample-neon.c | 122 |
5 files changed, 141 insertions, 76 deletions
@@ -155,6 +155,7 @@ static_library("simd") { sources = [ "simd/arm/arm/jsimd.c", "simd/arm/arm/jsimd_neon.S", + "simd/arm/common/jcsample-neon.c", "simd/arm/common/jdcolor-neon.c", "simd/arm/common/jdmerge-neon.c", "simd/arm/common/jdsample-neon.c", @@ -168,6 +169,7 @@ static_library("simd") { sources = [ "simd/arm/arm64/jsimd.c", "simd/arm/arm64/jsimd_neon.S", + "simd/arm/common/jcsample-neon.c", "simd/arm/common/jdcolor-neon.c", "simd/arm/common/jdmerge-neon.c", "simd/arm/common/jdsample-neon.c", diff --git a/README.chromium b/README.chromium index f70cb89..38a63b9 100644 --- a/README.chromium +++ b/README.chromium @@ -71,6 +71,7 @@ following changes which are not merged to upstream: - Implement slow IDCT using Arm NEON intrinsics - Precompute DCT block output pointers in IDCT functions - Implement fast IDCT using Arm NEON intrinsics + - Add Arm NEON implementation of h2v1_downsample * Patches to enable running the upstream unit tests through gtest. The upstream unit tests are defined here under the section 'TESTS': https://github.com/libjpeg-turbo/libjpeg-turbo/blob/master/CMakeLists.txt diff --git a/simd/arm/arm/jsimd.c b/simd/arm/arm/jsimd.c index ad176dc..2c5f4b9 100644 --- a/simd/arm/arm/jsimd.c +++ b/simd/arm/arm/jsimd.c @@ -304,6 +304,19 @@ jsimd_can_h2v2_downsample(void) GLOBAL(int) jsimd_can_h2v1_downsample(void) { + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (DCTSIZE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + return 0; } @@ -317,6 +330,9 @@ GLOBAL(void) jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr, JSAMPARRAY input_data, JSAMPARRAY output_data) { + jsimd_h2v1_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor, + compptr->v_samp_factor, compptr->width_in_blocks, + input_data, output_data); } GLOBAL(int) diff --git a/simd/arm/arm64/jsimd_neon.S b/simd/arm/arm64/jsimd_neon.S index 020353f..0adea9d 100644 --- a/simd/arm/arm64/jsimd_neon.S +++ b/simd/arm/arm64/jsimd_neon.S @@ -1118,82 +1118,6 @@ asm_function jsimd_quantize_neon /* * Downsample pixel values of a single component. - * This version handles the common case of 2:1 horizontal and 1:1 vertical, - * without smoothing. - * - * GLOBAL(void) - * jsimd_h2v1_downsample_neon(JDIMENSION image_width, int max_v_samp_factor, - * JDIMENSION v_samp_factor, - * JDIMENSION width_in_blocks, - * JSAMPARRAY input_data, JSAMPARRAY output_data); - */ - -asm_function jsimd_h2v1_downsample_neon - IMAGE_WIDTH .req x0 - MAX_V_SAMP .req x1 - V_SAMP .req x2 - BLOCK_WIDTH .req x3 - INPUT_DATA .req x4 - OUTPUT_DATA .req x5 - OUTPTR .req x9 - INPTR .req x10 - TMP1 .req x11 - TMP2 .req x12 - TMP3 .req x13 - TMPDUP .req w15 - - mov TMPDUP, #0x10000 - lsl TMP2, BLOCK_WIDTH, #4 - sub TMP2, TMP2, IMAGE_WIDTH - get_symbol_loc TMP3, Ljsimd_h2_downsample_neon_consts - add TMP3, TMP3, TMP2, lsl #4 - dup v16.4s, TMPDUP - ld1 {v18.16b}, [TMP3] - -1: /* row loop */ - ldr INPTR, [INPUT_DATA], #8 - ldr OUTPTR, [OUTPUT_DATA], #8 - subs TMP1, BLOCK_WIDTH, #1 - b.eq 3f -2: /* columns */ - ld1 {v0.16b}, [INPTR], #16 - mov v4.16b, v16.16b - subs TMP1, TMP1, #1 - uadalp v4.8h, v0.16b - shrn v6.8b, v4.8h, #1 - st1 {v6.8b}, [OUTPTR], #8 - b.ne 2b -3: /* last columns */ - ld1 {v0.16b}, [INPTR] - mov v4.16b, v16.16b - subs V_SAMP, V_SAMP, #1 - /* expand right */ - tbl v2.16b, {v0.16b}, v18.16b - uadalp v4.8h, v2.16b - shrn v6.8b, v4.8h, #1 - st1 {v6.8b}, [OUTPTR], #8 - b.ne 1b - - br x30 - - .unreq IMAGE_WIDTH - .unreq MAX_V_SAMP - .unreq V_SAMP - .unreq BLOCK_WIDTH - .unreq INPUT_DATA - .unreq OUTPUT_DATA - .unreq OUTPTR - .unreq INPTR - .unreq TMP1 - .unreq TMP2 - .unreq TMP3 - .unreq TMPDUP - - -/*****************************************************************************/ - -/* - * Downsample pixel values of a single component. * This version handles the common case of 2:1 horizontal and 2:1 vertical, * without smoothing. * diff --git a/simd/arm/common/jcsample-neon.c b/simd/arm/common/jcsample-neon.c new file mode 100644 index 0000000..2b110ea --- /dev/null +++ b/simd/arm/common/jcsample-neon.c @@ -0,0 +1,122 @@ +/* + * jcsample-neon.c - downsampling (Arm NEON) + * + * Copyright 2020 The Chromium Authors. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +#define JPEG_INTERNALS +#include "../../../jinclude.h" +#include "../../../jpeglib.h" +#include "../../../jsimd.h" +#include "../../../jdct.h" +#include "../../../jsimddct.h" +#include "../../jsimd.h" + +#include <arm_neon.h> + + +static const uint8_t jsimd_h2_downsample_consts[] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 0 */ + 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 1 */ + 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0E, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 2 */ + 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0D, 0x0D, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 3 */ + 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0C, 0x0C, 0x0C, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 4 */ + 0x08, 0x09, 0x0A, 0x0B, 0x0B, 0x0B, 0x0B, 0x0B, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 5 */ + 0x08, 0x09, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 6 */ + 0x08, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 7 */ + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 8 */ + 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06, /* Pad 9 */ + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0x05, /* Pad 10 */ + 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x04, 0x04, /* Pad 11 */ + 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, + 0x00, 0x01, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, /* Pad 12 */ + 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, + 0x00, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, /* Pad 13 */ + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, /* Pad 14 */ + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Pad 15 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +}; + + +/* + * Downsample pixel values of a single chroma component i.e. Cb, Cr. + * This version handles the common case of 2:1 horizontal and 1:1 vertical, + * without smoothing. + */ + +void jsimd_h2v1_downsample_neon(JDIMENSION image_width, + int max_v_samp_factor, + JDIMENSION v_samp_factor, + JDIMENSION width_in_blocks, + JSAMPARRAY input_data, + JSAMPARRAY output_data) +{ + JSAMPROW inptr, outptr; + /* Load expansion mask to pad remaining elements of last DCT block. */ + const int mask_offset = 16 * ((width_in_blocks * 2 * DCTSIZE) - image_width); + const uint8x16_t expand_mask = vld1q_u8( + &jsimd_h2_downsample_consts[mask_offset]); + /* Load bias pattern alternating every pixel. */ + const uint16x8_t bias = { 0, 1, 0, 1, 0, 1, 0, 1 }; + + for (unsigned outrow = 0; outrow < v_samp_factor; outrow++) { + outptr = output_data[outrow]; + inptr = input_data[outrow]; + + /* Downsample all but the last DCT block of pixels. */ + for (unsigned i = 0; i < width_in_blocks - 1; i++) { + uint8x16_t pixels = vld1q_u8(inptr + i * 2 * DCTSIZE); + /* Add adjacent pixel values, widen to 16-bit and add bias. */ + uint16x8_t samples_u16 = vpadalq_u8(bias, pixels); + /* Divide total by 2 and narrow to 8-bit. */ + uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 1); + /* Store samples to memory. */ + vst1_u8(outptr + i * DCTSIZE, samples_u8); + } + + /* Load pixels in last DCT block into a table. */ + uint8x16_t pixels = vld1q_u8(inptr + (width_in_blocks - 1) * 2 * DCTSIZE); +#if defined(__aarch64__) + /* Pad the empty elements with the value of the last pixel. */ + pixels = vqtbl1q_u8(pixels, expand_mask); +#else + uint8x8x2_t table = { vget_low_u8(pixels), vget_high_u8(pixels) }; + pixels = vcombine_u8(vtbl2_u8(table, vget_low_u8(expand_mask)), + vtbl2_u8(table, vget_high_u8(expand_mask))); +#endif + /* Add adjacent pixel values, widen to 16-bit and add bias. */ + uint16x8_t samples_u16 = vpadalq_u8(bias, pixels); + /* Divide total by 2, narrow to 8-bit and store. */ + uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 1); + vst1_u8(outptr + (width_in_blocks - 1) * DCTSIZE, samples_u8); + } +} |