summaryrefslogtreecommitdiff
path: root/simd
diff options
context:
space:
mode:
authorJonathan Wright <jonathan.wright@arm.com>2020-06-30 16:37:55 +0100
committerJonathan Wright <jonathan.wright@arm.com>2020-08-10 14:51:20 +0100
commitc7fb4c5dcfb72be3edac5941ee4632c8a69cf8a4 (patch)
tree8c42d6a1336f80a61c148e2db07d602574a7ec28 /simd
parent64fc43d52351ed52143208ce6a656c03db56462b (diff)
Add Arm NEON implementation of h2v2_downsample
Adds an Arm NEON intrinsics implementation of h2v2_downsample. This intrinsics implementation is used to generate code for AArch32 and AArch64; previously there was only an AArch64 NEON assembly implementation. Removes the AArch64 NEON assembly implementation from jsimd_neon.S. Bug: 922430 Change-Id: Ic529dfb1533ffdd56f9d92eb521f6a4299345adc
Diffstat (limited to 'simd')
-rw-r--r--simd/arm/arm/jsimd.c16
-rw-r--r--simd/arm/arm64/jsimd_neon.S124
-rw-r--r--simd/arm/common/jcsample-neon.c68
3 files changed, 84 insertions, 124 deletions
diff --git a/simd/arm/arm/jsimd.c b/simd/arm/arm/jsimd.c
index 2c5f4b9..0ae1b57 100644
--- a/simd/arm/arm/jsimd.c
+++ b/simd/arm/arm/jsimd.c
@@ -298,6 +298,19 @@ jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
GLOBAL(int)
jsimd_can_h2v2_downsample(void)
{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
return 0;
}
@@ -324,6 +337,9 @@ GLOBAL(void)
jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
JSAMPARRAY input_data, JSAMPARRAY output_data)
{
+ jsimd_h2v2_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor,
+ compptr->v_samp_factor, compptr->width_in_blocks,
+ input_data, output_data);
}
GLOBAL(void)
diff --git a/simd/arm/arm64/jsimd_neon.S b/simd/arm/arm64/jsimd_neon.S
index 0adea9d..16614d1 100644
--- a/simd/arm/arm64/jsimd_neon.S
+++ b/simd/arm/arm64/jsimd_neon.S
@@ -104,43 +104,6 @@ Ljsimd_fdct_ifast_neon_consts:
.short (181 * 128) /* XFIX_0_707106781 */
.short (334 * 128 - 256 * 128) /* XFIX_1_306562965 */
-/* Constants for jsimd_h2*_downsample_neon() */
-
-.balign 16
-Ljsimd_h2_downsample_neon_consts:
- .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
- 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F /* diff 0 */
- .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
- 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0E /* diff 1 */
- .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
- 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0D, 0x0D /* diff 2 */
- .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
- 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0C, 0x0C, 0x0C /* diff 3 */
- .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
- 0x08, 0x09, 0x0A, 0x0B, 0x0B, 0x0B, 0x0B, 0x0B /* diff 4 */
- .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
- 0x08, 0x09, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A /* diff 5 */
- .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
- 0x08, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09 /* diff 6 */
- .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
- 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08 /* diff 7 */
- .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
- 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07 /* diff 8 */
- .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06, \
- 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06 /* diff 9 */
- .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0x05, \
- 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05 /* diff 10 */
- .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x04, 0x04, \
- 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04 /* diff 11 */
- .byte 0x00, 0x01, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, \
- 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 /* diff 12 */
- .byte 0x00, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, \
- 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02 /* diff 13 */
- .byte 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, \
- 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 /* diff 14 */
- .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 /* diff 15 */
-
/* Constants for jsimd_huff_encode_one_block_neon() */
.balign 16
@@ -1117,93 +1080,6 @@ asm_function jsimd_quantize_neon
/*****************************************************************************/
/*
- * Downsample pixel values of a single component.
- * This version handles the common case of 2:1 horizontal and 2:1 vertical,
- * without smoothing.
- *
- * GLOBAL(void)
- * jsimd_h2v2_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
- * JDIMENSION v_samp_factor,
- * JDIMENSION width_in_blocks,
- * JSAMPARRAY input_data, JSAMPARRAY output_data);
- */
-
-.balign 16
-asm_function jsimd_h2v2_downsample_neon
- IMAGE_WIDTH .req x0
- MAX_V_SAMP .req x1
- V_SAMP .req x2
- BLOCK_WIDTH .req x3
- INPUT_DATA .req x4
- OUTPUT_DATA .req x5
- OUTPTR .req x9
- INPTR0 .req x10
- INPTR1 .req x14
- TMP1 .req x11
- TMP2 .req x12
- TMP3 .req x13
- TMPDUP .req w15
-
- mov TMPDUP, #1
- lsl TMP2, BLOCK_WIDTH, #4
- lsl TMPDUP, TMPDUP, #17
- sub TMP2, TMP2, IMAGE_WIDTH
- get_symbol_loc TMP3, Ljsimd_h2_downsample_neon_consts
- orr TMPDUP, TMPDUP, #1
- add TMP3, TMP3, TMP2, lsl #4
- dup v16.4s, TMPDUP
- ld1 {v18.16b}, [TMP3]
-
-1: /* row loop */
- ldr INPTR0, [INPUT_DATA], #8
- ldr OUTPTR, [OUTPUT_DATA], #8
- ldr INPTR1, [INPUT_DATA], #8
- subs TMP1, BLOCK_WIDTH, #1
- b.eq 3f
-2: /* columns */
- ld1 {v0.16b}, [INPTR0], #16
- ld1 {v1.16b}, [INPTR1], #16
- mov v4.16b, v16.16b
- subs TMP1, TMP1, #1
- uadalp v4.8h, v0.16b
- uadalp v4.8h, v1.16b
- shrn v6.8b, v4.8h, #2
- st1 {v6.8b}, [OUTPTR], #8
- b.ne 2b
-3: /* last columns */
- ld1 {v0.16b}, [INPTR0], #16
- ld1 {v1.16b}, [INPTR1], #16
- mov v4.16b, v16.16b
- subs V_SAMP, V_SAMP, #1
- /* expand right */
- tbl v2.16b, {v0.16b}, v18.16b
- tbl v3.16b, {v1.16b}, v18.16b
- uadalp v4.8h, v2.16b
- uadalp v4.8h, v3.16b
- shrn v6.8b, v4.8h, #2
- st1 {v6.8b}, [OUTPTR], #8
- b.ne 1b
-
- br x30
-
- .unreq IMAGE_WIDTH
- .unreq MAX_V_SAMP
- .unreq V_SAMP
- .unreq BLOCK_WIDTH
- .unreq INPUT_DATA
- .unreq OUTPUT_DATA
- .unreq OUTPTR
- .unreq INPTR0
- .unreq INPTR1
- .unreq TMP1
- .unreq TMP2
- .unreq TMP3
- .unreq TMPDUP
-
-
-/*****************************************************************************/
-
-/*
* GLOBAL(JOCTET *)
* jsimd_huff_encode_one_block(working_state *state, JOCTET *buffer,
* JCOEFPTR block, int last_dc_val,
diff --git a/simd/arm/common/jcsample-neon.c b/simd/arm/common/jcsample-neon.c
index 2b110ea..ff989dc 100644
--- a/simd/arm/common/jcsample-neon.c
+++ b/simd/arm/common/jcsample-neon.c
@@ -120,3 +120,71 @@ void jsimd_h2v1_downsample_neon(JDIMENSION image_width,
vst1_u8(outptr + (width_in_blocks - 1) * DCTSIZE, samples_u8);
}
}
+
+
+/*
+ * Downsample pixel values of a single chroma component i.e. Cb, Cr.
+ * This version handles the standard case of 2:1 horizontal and 2:1 vertical,
+ * without smoothing.
+ */
+
+void jsimd_h2v2_downsample_neon(JDIMENSION image_width,
+ int max_v_samp_factor,
+ JDIMENSION v_samp_factor,
+ JDIMENSION width_in_blocks,
+ JSAMPARRAY input_data,
+ JSAMPARRAY output_data)
+{
+ JSAMPROW inptr0, inptr1, outptr;
+ /* Load expansion mask to pad remaining elements of last DCT block. */
+ const int mask_offset = 16 * ((width_in_blocks * 2 * DCTSIZE) - image_width);
+ const uint8x16_t expand_mask = vld1q_u8(
+ &jsimd_h2_downsample_consts[mask_offset]);
+ /* Load bias pattern alternating every pixel. */
+ const uint16x8_t bias = { 1, 2, 1, 2, 1, 2, 1, 2 };
+
+ for (unsigned outrow = 0; outrow < v_samp_factor; outrow++) {
+ outptr = output_data[outrow];
+ inptr0 = input_data[outrow];
+ inptr1 = input_data[outrow + 1];
+
+ /* Downsample all but the last DCT block of pixels. */
+ for (unsigned i = 0; i < width_in_blocks - 1; i++) {
+ uint8x16_t pixels_r0 = vld1q_u8(inptr0 + i * 2 * DCTSIZE);
+ uint8x16_t pixels_r1 = vld1q_u8(inptr1 + i * 2 * DCTSIZE);
+ /* Add adjacent pixel values in row 0, widen to 16-bit and add bias. */
+ uint16x8_t samples_u16 = vpadalq_u8(bias, pixels_r0);
+ /* Add adjacent pixel values in row 1, widen to 16-bit and accumulate. */
+ samples_u16 = vpadalq_u8(samples_u16, pixels_r1);
+ /* Divide total by 4 and narrow to 8-bit. */
+ uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 2);
+ /* Store samples to memory and increment pointers. */
+ vst1_u8(outptr + i * DCTSIZE, samples_u8);
+ }
+
+ /* Load pixels in last DCT block into a table. */
+ uint8x16_t pixels_r0 = vld1q_u8(
+ inptr0 + (width_in_blocks - 1) * 2 * DCTSIZE);
+ uint8x16_t pixels_r1 = vld1q_u8(
+ inptr1 + (width_in_blocks - 1) * 2 * DCTSIZE);
+#if defined(__aarch64__)
+ /* Pad the empty elements with the value of the last pixel. */
+ pixels_r0 = vqtbl1q_u8(pixels_r0, expand_mask);
+ pixels_r1 = vqtbl1q_u8(pixels_r1, expand_mask);
+#else
+ uint8x8x2_t table_r0 = { vget_low_u8(pixels_r0), vget_high_u8(pixels_r0) };
+ uint8x8x2_t table_r1 = { vget_low_u8(pixels_r1), vget_high_u8(pixels_r1) };
+ pixels_r0 = vcombine_u8(vtbl2_u8(table_r0, vget_low_u8(expand_mask)),
+ vtbl2_u8(table_r0, vget_high_u8(expand_mask)));
+ pixels_r1 = vcombine_u8(vtbl2_u8(table_r1, vget_low_u8(expand_mask)),
+ vtbl2_u8(table_r1, vget_high_u8(expand_mask)));
+#endif
+ /* Add adjacent pixel values in row 0, widen to 16-bit and add bias. */
+ uint16x8_t samples_u16 = vpadalq_u8(bias, pixels_r0);
+ /* Add adjacent pixel values in row 1, widen to 16-bit and accumulate. */
+ samples_u16 = vpadalq_u8(samples_u16, pixels_r1);
+ /* Divide total by 4, narrow to 8-bit and store. */
+ uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 2);
+ vst1_u8(outptr + (width_in_blocks - 1) * DCTSIZE, samples_u8);
+ }
+}