diff options
author | Jonathan Wright <jonathan.wright@arm.com> | 2020-06-30 16:37:55 +0100 |
---|---|---|
committer | Jonathan Wright <jonathan.wright@arm.com> | 2020-08-10 14:51:20 +0100 |
commit | c7fb4c5dcfb72be3edac5941ee4632c8a69cf8a4 (patch) | |
tree | 8c42d6a1336f80a61c148e2db07d602574a7ec28 /simd | |
parent | 64fc43d52351ed52143208ce6a656c03db56462b (diff) |
Add Arm NEON implementation of h2v2_downsample
Adds an Arm NEON intrinsics implementation of h2v2_downsample. This
intrinsics implementation is used to generate code for AArch32 and
AArch64; previously there was only an AArch64 NEON assembly
implementation.
Removes the AArch64 NEON assembly implementation from jsimd_neon.S.
Bug: 922430
Change-Id: Ic529dfb1533ffdd56f9d92eb521f6a4299345adc
Diffstat (limited to 'simd')
-rw-r--r-- | simd/arm/arm/jsimd.c | 16 | ||||
-rw-r--r-- | simd/arm/arm64/jsimd_neon.S | 124 | ||||
-rw-r--r-- | simd/arm/common/jcsample-neon.c | 68 |
3 files changed, 84 insertions, 124 deletions
diff --git a/simd/arm/arm/jsimd.c b/simd/arm/arm/jsimd.c index 2c5f4b9..0ae1b57 100644 --- a/simd/arm/arm/jsimd.c +++ b/simd/arm/arm/jsimd.c @@ -298,6 +298,19 @@ jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, GLOBAL(int) jsimd_can_h2v2_downsample(void) { + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (DCTSIZE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + return 0; } @@ -324,6 +337,9 @@ GLOBAL(void) jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr, JSAMPARRAY input_data, JSAMPARRAY output_data) { + jsimd_h2v2_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor, + compptr->v_samp_factor, compptr->width_in_blocks, + input_data, output_data); } GLOBAL(void) diff --git a/simd/arm/arm64/jsimd_neon.S b/simd/arm/arm64/jsimd_neon.S index 0adea9d..16614d1 100644 --- a/simd/arm/arm64/jsimd_neon.S +++ b/simd/arm/arm64/jsimd_neon.S @@ -104,43 +104,6 @@ Ljsimd_fdct_ifast_neon_consts: .short (181 * 128) /* XFIX_0_707106781 */ .short (334 * 128 - 256 * 128) /* XFIX_1_306562965 */ -/* Constants for jsimd_h2*_downsample_neon() */ - -.balign 16 -Ljsimd_h2_downsample_neon_consts: - .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \ - 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F /* diff 0 */ - .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \ - 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0E /* diff 1 */ - .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \ - 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0D, 0x0D /* diff 2 */ - .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \ - 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0C, 0x0C, 0x0C /* diff 3 */ - .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \ - 0x08, 0x09, 0x0A, 0x0B, 0x0B, 0x0B, 0x0B, 0x0B /* diff 4 */ - .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \ - 0x08, 0x09, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A /* diff 5 */ - .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \ - 0x08, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09 /* diff 6 */ - .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \ - 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08 /* diff 7 */ - .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \ - 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07 /* diff 8 */ - .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06, \ - 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06 /* diff 9 */ - .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0x05, \ - 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05 /* diff 10 */ - .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x04, 0x04, \ - 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04 /* diff 11 */ - .byte 0x00, 0x01, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, \ - 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 /* diff 12 */ - .byte 0x00, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, \ - 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02 /* diff 13 */ - .byte 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, \ - 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 /* diff 14 */ - .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \ - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 /* diff 15 */ - /* Constants for jsimd_huff_encode_one_block_neon() */ .balign 16 @@ -1117,93 +1080,6 @@ asm_function jsimd_quantize_neon /*****************************************************************************/ /* - * Downsample pixel values of a single component. - * This version handles the common case of 2:1 horizontal and 2:1 vertical, - * without smoothing. - * - * GLOBAL(void) - * jsimd_h2v2_downsample_neon(JDIMENSION image_width, int max_v_samp_factor, - * JDIMENSION v_samp_factor, - * JDIMENSION width_in_blocks, - * JSAMPARRAY input_data, JSAMPARRAY output_data); - */ - -.balign 16 -asm_function jsimd_h2v2_downsample_neon - IMAGE_WIDTH .req x0 - MAX_V_SAMP .req x1 - V_SAMP .req x2 - BLOCK_WIDTH .req x3 - INPUT_DATA .req x4 - OUTPUT_DATA .req x5 - OUTPTR .req x9 - INPTR0 .req x10 - INPTR1 .req x14 - TMP1 .req x11 - TMP2 .req x12 - TMP3 .req x13 - TMPDUP .req w15 - - mov TMPDUP, #1 - lsl TMP2, BLOCK_WIDTH, #4 - lsl TMPDUP, TMPDUP, #17 - sub TMP2, TMP2, IMAGE_WIDTH - get_symbol_loc TMP3, Ljsimd_h2_downsample_neon_consts - orr TMPDUP, TMPDUP, #1 - add TMP3, TMP3, TMP2, lsl #4 - dup v16.4s, TMPDUP - ld1 {v18.16b}, [TMP3] - -1: /* row loop */ - ldr INPTR0, [INPUT_DATA], #8 - ldr OUTPTR, [OUTPUT_DATA], #8 - ldr INPTR1, [INPUT_DATA], #8 - subs TMP1, BLOCK_WIDTH, #1 - b.eq 3f -2: /* columns */ - ld1 {v0.16b}, [INPTR0], #16 - ld1 {v1.16b}, [INPTR1], #16 - mov v4.16b, v16.16b - subs TMP1, TMP1, #1 - uadalp v4.8h, v0.16b - uadalp v4.8h, v1.16b - shrn v6.8b, v4.8h, #2 - st1 {v6.8b}, [OUTPTR], #8 - b.ne 2b -3: /* last columns */ - ld1 {v0.16b}, [INPTR0], #16 - ld1 {v1.16b}, [INPTR1], #16 - mov v4.16b, v16.16b - subs V_SAMP, V_SAMP, #1 - /* expand right */ - tbl v2.16b, {v0.16b}, v18.16b - tbl v3.16b, {v1.16b}, v18.16b - uadalp v4.8h, v2.16b - uadalp v4.8h, v3.16b - shrn v6.8b, v4.8h, #2 - st1 {v6.8b}, [OUTPTR], #8 - b.ne 1b - - br x30 - - .unreq IMAGE_WIDTH - .unreq MAX_V_SAMP - .unreq V_SAMP - .unreq BLOCK_WIDTH - .unreq INPUT_DATA - .unreq OUTPUT_DATA - .unreq OUTPTR - .unreq INPTR0 - .unreq INPTR1 - .unreq TMP1 - .unreq TMP2 - .unreq TMP3 - .unreq TMPDUP - - -/*****************************************************************************/ - -/* * GLOBAL(JOCTET *) * jsimd_huff_encode_one_block(working_state *state, JOCTET *buffer, * JCOEFPTR block, int last_dc_val, diff --git a/simd/arm/common/jcsample-neon.c b/simd/arm/common/jcsample-neon.c index 2b110ea..ff989dc 100644 --- a/simd/arm/common/jcsample-neon.c +++ b/simd/arm/common/jcsample-neon.c @@ -120,3 +120,71 @@ void jsimd_h2v1_downsample_neon(JDIMENSION image_width, vst1_u8(outptr + (width_in_blocks - 1) * DCTSIZE, samples_u8); } } + + +/* + * Downsample pixel values of a single chroma component i.e. Cb, Cr. + * This version handles the standard case of 2:1 horizontal and 2:1 vertical, + * without smoothing. + */ + +void jsimd_h2v2_downsample_neon(JDIMENSION image_width, + int max_v_samp_factor, + JDIMENSION v_samp_factor, + JDIMENSION width_in_blocks, + JSAMPARRAY input_data, + JSAMPARRAY output_data) +{ + JSAMPROW inptr0, inptr1, outptr; + /* Load expansion mask to pad remaining elements of last DCT block. */ + const int mask_offset = 16 * ((width_in_blocks * 2 * DCTSIZE) - image_width); + const uint8x16_t expand_mask = vld1q_u8( + &jsimd_h2_downsample_consts[mask_offset]); + /* Load bias pattern alternating every pixel. */ + const uint16x8_t bias = { 1, 2, 1, 2, 1, 2, 1, 2 }; + + for (unsigned outrow = 0; outrow < v_samp_factor; outrow++) { + outptr = output_data[outrow]; + inptr0 = input_data[outrow]; + inptr1 = input_data[outrow + 1]; + + /* Downsample all but the last DCT block of pixels. */ + for (unsigned i = 0; i < width_in_blocks - 1; i++) { + uint8x16_t pixels_r0 = vld1q_u8(inptr0 + i * 2 * DCTSIZE); + uint8x16_t pixels_r1 = vld1q_u8(inptr1 + i * 2 * DCTSIZE); + /* Add adjacent pixel values in row 0, widen to 16-bit and add bias. */ + uint16x8_t samples_u16 = vpadalq_u8(bias, pixels_r0); + /* Add adjacent pixel values in row 1, widen to 16-bit and accumulate. */ + samples_u16 = vpadalq_u8(samples_u16, pixels_r1); + /* Divide total by 4 and narrow to 8-bit. */ + uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 2); + /* Store samples to memory and increment pointers. */ + vst1_u8(outptr + i * DCTSIZE, samples_u8); + } + + /* Load pixels in last DCT block into a table. */ + uint8x16_t pixels_r0 = vld1q_u8( + inptr0 + (width_in_blocks - 1) * 2 * DCTSIZE); + uint8x16_t pixels_r1 = vld1q_u8( + inptr1 + (width_in_blocks - 1) * 2 * DCTSIZE); +#if defined(__aarch64__) + /* Pad the empty elements with the value of the last pixel. */ + pixels_r0 = vqtbl1q_u8(pixels_r0, expand_mask); + pixels_r1 = vqtbl1q_u8(pixels_r1, expand_mask); +#else + uint8x8x2_t table_r0 = { vget_low_u8(pixels_r0), vget_high_u8(pixels_r0) }; + uint8x8x2_t table_r1 = { vget_low_u8(pixels_r1), vget_high_u8(pixels_r1) }; + pixels_r0 = vcombine_u8(vtbl2_u8(table_r0, vget_low_u8(expand_mask)), + vtbl2_u8(table_r0, vget_high_u8(expand_mask))); + pixels_r1 = vcombine_u8(vtbl2_u8(table_r1, vget_low_u8(expand_mask)), + vtbl2_u8(table_r1, vget_high_u8(expand_mask))); +#endif + /* Add adjacent pixel values in row 0, widen to 16-bit and add bias. */ + uint16x8_t samples_u16 = vpadalq_u8(bias, pixels_r0); + /* Add adjacent pixel values in row 1, widen to 16-bit and accumulate. */ + samples_u16 = vpadalq_u8(samples_u16, pixels_r1); + /* Divide total by 4, narrow to 8-bit and store. */ + uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 2); + vst1_u8(outptr + (width_in_blocks - 1) * DCTSIZE, samples_u8); + } +} |