diff options
author | Martyn Jacques <martyn.jacques@arm.com> | 2018-09-18 18:28:31 +0000 |
---|---|---|
committer | Jonathan Wright <jonathan.wright@arm.com> | 2019-09-11 13:12:39 +0100 |
commit | 4591b71c9ff20680fcde089a37eeacb95d2d58d7 (patch) | |
tree | df1e9e67a1cc0d566f9c8bc49b24642156a8614c /simd | |
parent | baa5dc24258bf9af873b9105e8988c558c425a17 (diff) |
Implement 2x2 IDCT using Arm NEON intrinsics
Adds an Arm NEON intrinsics implementation of the 2x2 reduced-size
Inverse Discrete Cosine Transform (IDCT).
Removes the NEON assembly implementations for both AArch32 and
AArch64.
Bug: 922430
Change-Id: I8e0d4518d742f33957252d77e5f3f0ad3f80f234
Diffstat (limited to 'simd')
-rw-r--r-- | simd/arm/arm/jsimd_neon.S | 153 | ||||
-rw-r--r-- | simd/arm/arm64/jsimd_neon.S | 179 | ||||
-rw-r--r-- | simd/arm/common/jidctred-neon.c | 145 |
3 files changed, 145 insertions, 332 deletions
diff --git a/simd/arm/arm/jsimd_neon.S b/simd/arm/arm/jsimd_neon.S index 0e05819..2c44f88 100644 --- a/simd/arm/arm/jsimd_neon.S +++ b/simd/arm/arm/jsimd_neon.S @@ -1117,159 +1117,6 @@ asm_function jsimd_idct_4x4_neon /*****************************************************************************/ /* - * jsimd_idct_2x2_neon - * - * This function contains inverse-DCT code for getting reduced-size - * 2x2 pixels output from an 8x8 DCT block. It uses the same calculations - * and produces exactly the same output as IJG's original 'jpeg_idct_2x2' - * function from jpeg-6b (jidctred.c). - * - * NOTE: jpeg-8 has an improved implementation of 2x2 inverse-DCT, which - * requires much less arithmetic operations and hence should be faster. - * The primary purpose of this particular NEON optimized function is - * bit exact compatibility with jpeg-6b. - */ - -.balign 8 -jsimd_idct_2x2_neon_consts: - .short -FIX_0_720959822 /* d0[0] */ - .short FIX_0_850430095 /* d0[1] */ - .short -FIX_1_272758580 /* d0[2] */ - .short FIX_3_624509785 /* d0[3] */ - -.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27 - vshll.s16 q14, \x4, #15 - vmull.s16 q13, \x6, d0[3] - vmlal.s16 q13, \x10, d0[2] - vmlal.s16 q13, \x12, d0[1] - vmlal.s16 q13, \x16, d0[0] - - vadd.s32 q10, q14, q13 - vsub.s32 q14, q14, q13 - - .if \shift > 16 - vrshr.s32 q10, q10, #\shift - vrshr.s32 q14, q14, #\shift - vmovn.s32 \y26, q10 - vmovn.s32 \y27, q14 - .else - vrshrn.s32 \y26, q10, #\shift - vrshrn.s32 \y27, q14, #\shift - .endif -.endm - -asm_function jsimd_idct_2x2_neon - - DCT_TABLE .req r0 - COEF_BLOCK .req r1 - OUTPUT_BUF .req r2 - OUTPUT_COL .req r3 - TMP1 .req r0 - TMP2 .req ip - - vpush {d8-d15} - - /* Load constants */ - adr TMP2, jsimd_idct_2x2_neon_consts - vld1.16 {d0}, [TMP2, :64] - - /* Load all COEF_BLOCK into NEON registers with the following allocation: - * 0 1 2 3 | 4 5 6 7 - * ---------+-------- - * 0 | d4 | d5 - * 1 | d6 | d7 - * 2 | - | - - * 3 | d10 | d11 - * 4 | - | - - * 5 | d12 | d13 - * 6 | - | - - * 7 | d16 | d17 - */ - vld1.16 {d4, d5, d6, d7}, [COEF_BLOCK, :128]! - add COEF_BLOCK, COEF_BLOCK, #16 - vld1.16 {d10, d11}, [COEF_BLOCK, :128]! - add COEF_BLOCK, COEF_BLOCK, #16 - vld1.16 {d12, d13}, [COEF_BLOCK, :128]! - add COEF_BLOCK, COEF_BLOCK, #16 - vld1.16 {d16, d17}, [COEF_BLOCK, :128]! - /* Dequantize */ - vld1.16 {d18, d19, d20, d21}, [DCT_TABLE, :128]! - vmul.s16 q2, q2, q9 - vmul.s16 q3, q3, q10 - add DCT_TABLE, DCT_TABLE, #16 - vld1.16 {d24, d25}, [DCT_TABLE, :128]! - vmul.s16 q5, q5, q12 - add DCT_TABLE, DCT_TABLE, #16 - vld1.16 {d26, d27}, [DCT_TABLE, :128]! - vmul.s16 q6, q6, q13 - add DCT_TABLE, DCT_TABLE, #16 - vld1.16 {d30, d31}, [DCT_TABLE, :128]! - vmul.s16 q8, q8, q15 - - /* Pass 1 */ -#if 0 - idct_helper d4, d6, d10, d12, d16, 13, d4, d6 - transpose_4x4 d4, d6, d8, d10 - idct_helper d5, d7, d11, d13, d17, 13, d5, d7 - transpose_4x4 d5, d7, d9, d11 -#else - vmull.s16 q13, d6, d0[3] - vmlal.s16 q13, d10, d0[2] - vmlal.s16 q13, d12, d0[1] - vmlal.s16 q13, d16, d0[0] - vmull.s16 q12, d7, d0[3] - vmlal.s16 q12, d11, d0[2] - vmlal.s16 q12, d13, d0[1] - vmlal.s16 q12, d17, d0[0] - vshll.s16 q14, d4, #15 - vshll.s16 q15, d5, #15 - vadd.s32 q10, q14, q13 - vsub.s32 q14, q14, q13 - vrshrn.s32 d4, q10, #13 - vrshrn.s32 d6, q14, #13 - vadd.s32 q10, q15, q12 - vsub.s32 q14, q15, q12 - vrshrn.s32 d5, q10, #13 - vrshrn.s32 d7, q14, #13 - vtrn.16 q2, q3 - vtrn.32 q3, q5 -#endif - - /* Pass 2 */ - idct_helper d4, d6, d10, d7, d11, 20, d26, d27 - - /* Range limit */ - vmov.u16 q15, #0x80 - vadd.s16 q13, q13, q15 - vqmovun.s16 d26, q13 - vqmovun.s16 d27, q13 - - /* Store results to the output buffer */ - ldmia OUTPUT_BUF, {TMP1, TMP2} - add TMP1, TMP1, OUTPUT_COL - add TMP2, TMP2, OUTPUT_COL - - vst1.8 {d26[0]}, [TMP1]! - vst1.8 {d27[4]}, [TMP1]! - vst1.8 {d26[1]}, [TMP2]! - vst1.8 {d27[5]}, [TMP2]! - - vpop {d8-d15} - bx lr - - .unreq DCT_TABLE - .unreq COEF_BLOCK - .unreq OUTPUT_BUF - .unreq OUTPUT_COL - .unreq TMP1 - .unreq TMP2 - -.purgem idct_helper - - -/*****************************************************************************/ - -/* * jsimd_extrgb_ycc_convert_neon * jsimd_extbgr_ycc_convert_neon * jsimd_extrgbx_ycc_convert_neon diff --git a/simd/arm/arm64/jsimd_neon.S b/simd/arm/arm64/jsimd_neon.S index 3a1d1ef..94f9b11 100644 --- a/simd/arm/arm64/jsimd_neon.S +++ b/simd/arm/arm64/jsimd_neon.S @@ -1251,185 +1251,6 @@ asm_function jsimd_idct_4x4_neon /*****************************************************************************/ /* - * jsimd_idct_2x2_neon - * - * This function contains inverse-DCT code for getting reduced-size - * 2x2 pixels output from an 8x8 DCT block. It uses the same calculations - * and produces exactly the same output as IJG's original 'jpeg_idct_2x2' - * function from jpeg-6b (jidctred.c). - * - * NOTE: jpeg-8 has an improved implementation of 2x2 inverse-DCT, which - * requires much less arithmetic operations and hence should be faster. - * The primary purpose of this particular NEON optimized function is - * bit exact compatibility with jpeg-6b. - */ - -.balign 8 -Ljsimd_idct_2x2_neon_consts: - .short -FIX_0_720959822 /* v14[0] */ - .short FIX_0_850430095 /* v14[1] */ - .short -FIX_1_272758580 /* v14[2] */ - .short FIX_3_624509785 /* v14[3] */ - -.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27 - sshll v15.4s, \x4, #15 - smull v26.4s, \x6, v14.h[3] - smlal v26.4s, \x10, v14.h[2] - smlal v26.4s, \x12, v14.h[1] - smlal v26.4s, \x16, v14.h[0] - - add v20.4s, v15.4s, v26.4s - sub v15.4s, v15.4s, v26.4s - - .if \shift > 16 - srshr v20.4s, v20.4s, #\shift - srshr v15.4s, v15.4s, #\shift - xtn \y26, v20.4s - xtn \y27, v15.4s - .else - rshrn \y26, v20.4s, #\shift - rshrn \y27, v15.4s, #\shift - .endif -.endm - -asm_function jsimd_idct_2x2_neon - - DCT_TABLE .req x0 - COEF_BLOCK .req x1 - OUTPUT_BUF .req x2 - OUTPUT_COL .req x3 - TMP1 .req x0 - TMP2 .req x15 - - /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't - guarantee that the upper (unused) 32 bits of x3 are valid. This - instruction ensures that those bits are set to zero. */ - uxtw x3, w3 - - /* vpush {v8.4h - v15.4h} ; not available */ - sub sp, sp, 64 - mov x9, sp - - /* Load constants */ - adr TMP2, Ljsimd_idct_2x2_neon_consts - st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32 - st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32 - ld1 {v14.4h}, [TMP2] - - /* Load all COEF_BLOCK into NEON registers with the following allocation: - * 0 1 2 3 | 4 5 6 7 - * ---------+-------- - * 0 | v4.4h | v5.4h - * 1 | v6.4h | v7.4h - * 2 | - | - - * 3 | v10.4h | v11.4h - * 4 | - | - - * 5 | v12.4h | v13.4h - * 6 | - | - - * 7 | v16.4h | v17.4h - */ - ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32 - add COEF_BLOCK, COEF_BLOCK, #16 - ld1 {v10.4h, v11.4h}, [COEF_BLOCK], 16 - add COEF_BLOCK, COEF_BLOCK, #16 - ld1 {v12.4h, v13.4h}, [COEF_BLOCK], 16 - add COEF_BLOCK, COEF_BLOCK, #16 - ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16 - /* Dequantize */ - ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32 - mul v4.4h, v4.4h, v18.4h - mul v5.4h, v5.4h, v19.4h - ins v4.d[1], v5.d[0] - mul v6.4h, v6.4h, v20.4h - mul v7.4h, v7.4h, v21.4h - ins v6.d[1], v7.d[0] - add DCT_TABLE, DCT_TABLE, #16 - ld1 {v24.4h, v25.4h}, [DCT_TABLE], 16 - mul v10.4h, v10.4h, v24.4h - mul v11.4h, v11.4h, v25.4h - ins v10.d[1], v11.d[0] - add DCT_TABLE, DCT_TABLE, #16 - ld1 {v26.4h, v27.4h}, [DCT_TABLE], 16 - mul v12.4h, v12.4h, v26.4h - mul v13.4h, v13.4h, v27.4h - ins v12.d[1], v13.d[0] - add DCT_TABLE, DCT_TABLE, #16 - ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16 - mul v16.4h, v16.4h, v30.4h - mul v17.4h, v17.4h, v31.4h - ins v16.d[1], v17.d[0] - - /* Pass 1 */ -#if 0 - idct_helper v4.4h, v6.4h, v10.4h, v12.4h, v16.4h, 13, v4.4h, v6.4h - transpose_4x4 v4.4h, v6.4h, v8.4h, v10.4h - idct_helper v5.4h, v7.4h, v11.4h, v13.4h, v17.4h, 13, v5.4h, v7.4h - transpose_4x4 v5.4h, v7.4h, v9.4h, v11.4h -#else - smull v26.4s, v6.4h, v14.h[3] - smlal v26.4s, v10.4h, v14.h[2] - smlal v26.4s, v12.4h, v14.h[1] - smlal v26.4s, v16.4h, v14.h[0] - smull v24.4s, v7.4h, v14.h[3] - smlal v24.4s, v11.4h, v14.h[2] - smlal v24.4s, v13.4h, v14.h[1] - smlal v24.4s, v17.4h, v14.h[0] - sshll v15.4s, v4.4h, #15 - sshll v30.4s, v5.4h, #15 - add v20.4s, v15.4s, v26.4s - sub v15.4s, v15.4s, v26.4s - rshrn v4.4h, v20.4s, #13 - rshrn v6.4h, v15.4s, #13 - add v20.4s, v30.4s, v24.4s - sub v15.4s, v30.4s, v24.4s - rshrn v5.4h, v20.4s, #13 - rshrn v7.4h, v15.4s, #13 - ins v4.d[1], v5.d[0] - ins v6.d[1], v7.d[0] - transpose v4, v6, v3, .16b, .8h - transpose v6, v10, v3, .16b, .4s - ins v11.d[0], v10.d[1] - ins v7.d[0], v6.d[1] -#endif - - /* Pass 2 */ - idct_helper v4.4h, v6.4h, v10.4h, v7.4h, v11.4h, 20, v26.4h, v27.4h - - /* Range limit */ - movi v30.8h, #0x80 - ins v26.d[1], v27.d[0] - add v26.8h, v26.8h, v30.8h - sqxtun v30.8b, v26.8h - ins v26.d[0], v30.d[0] - sqxtun v27.8b, v26.8h - - /* Store results to the output buffer */ - ldp TMP1, TMP2, [OUTPUT_BUF] - add TMP1, TMP1, OUTPUT_COL - add TMP2, TMP2, OUTPUT_COL - - st1 {v26.b}[0], [TMP1], 1 - st1 {v27.b}[4], [TMP1], 1 - st1 {v26.b}[1], [TMP2], 1 - st1 {v27.b}[5], [TMP2], 1 - - ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 - ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 - blr x30 - - .unreq DCT_TABLE - .unreq COEF_BLOCK - .unreq OUTPUT_BUF - .unreq OUTPUT_COL - .unreq TMP1 - .unreq TMP2 - -.purgem idct_helper - - -/*****************************************************************************/ - -/* * jsimd_extrgb_ycc_convert_neon * jsimd_extbgr_ycc_convert_neon * jsimd_extrgbx_ycc_convert_neon diff --git a/simd/arm/common/jidctred-neon.c b/simd/arm/common/jidctred-neon.c new file mode 100644 index 0000000..1c1e24e --- /dev/null +++ b/simd/arm/common/jidctred-neon.c @@ -0,0 +1,145 @@ +/* + * jidctred-neon.c - reduced-size IDCT (Arm NEON) + * + * Copyright 2019 The Chromium Authors. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +#define JPEG_INTERNALS +#include "../../../jinclude.h" +#include "../../../jpeglib.h" +#include "../../../jsimd.h" +#include "../../../jdct.h" +#include "../../../jsimddct.h" +#include "../../jsimd.h" + +#include <arm_neon.h> + +#define CONST_BITS 13 +#define PASS1_BITS 2 + +#define F_0_720 5906 +#define F_0_850 6967 +#define F_1_272 10426 +#define F_3_624 29692 + +/* + * 'jsimd_idct_2x2_neon' is an inverse-DCT function for getting reduced-size + * 2x2 pixels output from an 8x8 DCT block. It uses the same calculations and + * produces exactly the same output as IJG's original 'jpeg_idct_2x2' function + * from jpeg-6b, which can be found in jidctred.c. + * + * Scaled integer constants are used to avoid floating-point arithmetic: + * 0.720959822 = 5906 * 2^-13 + * 0.850430095 = 6967 * 2^-13 + * 1.272758580 = 10426 * 2^-13 + * 3.624509785 = 29692 * 2^-13 + * + * See jidctred.c for further details of the 2x2 reduced IDCT algorithm. Where + * possible, the variable names and comments here in 'jsimd_idct_2x2_neon' + * match up with those in 'jpeg_idct_2x2'. + * + * NOTE: jpeg-8 has an improved implementation of the 2x2 inverse-DCT which + * requires fewer arithmetic operations and hence should be faster. The + * primary purpose of this particular NEON optimized function is bit + * exact compatibility with jpeg-6b. + */ + +void jsimd_idct_2x2_neon(void *dct_table, + JCOEFPTR coef_block, + JSAMPARRAY restrict output_buf, + JDIMENSION output_col) +{ + ISLOW_MULT_TYPE *quantptr = dct_table; + + /* Load DCT coefficients. */ + int16x8_t row0 = vld1q_s16(coef_block + 0 * DCTSIZE); + int16x8_t row1 = vld1q_s16(coef_block + 1 * DCTSIZE); + int16x8_t row3 = vld1q_s16(coef_block + 3 * DCTSIZE); + int16x8_t row5 = vld1q_s16(coef_block + 5 * DCTSIZE); + int16x8_t row7 = vld1q_s16(coef_block + 7 * DCTSIZE); + + /* Load DCT quantization table. */ + int16x8_t quant_row0 = vld1q_s16(quantptr + 0 * DCTSIZE); + int16x8_t quant_row1 = vld1q_s16(quantptr + 1 * DCTSIZE); + int16x8_t quant_row3 = vld1q_s16(quantptr + 3 * DCTSIZE); + int16x8_t quant_row5 = vld1q_s16(quantptr + 5 * DCTSIZE); + int16x8_t quant_row7 = vld1q_s16(quantptr + 7 * DCTSIZE); + + /* Dequantize DCT coefficients. */ + row0 = vmulq_s16(row0, quant_row0); + row1 = vmulq_s16(row1, quant_row1); + row3 = vmulq_s16(row3, quant_row3); + row5 = vmulq_s16(row5, quant_row5); + row7 = vmulq_s16(row7, quant_row7); + + /* Pass 1: process input columns; put results in vectors row0 and row1. */ + /* Even part. */ + int32x4_t tmp10_l = vshll_n_s16(vget_low_s16(row0), CONST_BITS + 2); + int32x4_t tmp10_h = vshll_n_s16(vget_high_s16(row0), CONST_BITS + 2); + + /* Odd part. */ + int32x4_t tmp0_l = vmull_n_s16(vget_low_s16(row1), F_3_624); + tmp0_l = vmlal_n_s16(tmp0_l, vget_low_s16(row3), -F_1_272); + tmp0_l = vmlal_n_s16(tmp0_l, vget_low_s16(row5), F_0_850); + tmp0_l = vmlal_n_s16(tmp0_l, vget_low_s16(row7), -F_0_720); + int32x4_t tmp0_h = vmull_n_s16(vget_high_s16(row1), F_3_624); + tmp0_h = vmlal_n_s16(tmp0_h, vget_high_s16(row3), -F_1_272); + tmp0_h = vmlal_n_s16(tmp0_h, vget_high_s16(row5), F_0_850); + tmp0_h = vmlal_n_s16(tmp0_h, vget_high_s16(row7), -F_0_720); + + /* Final output stage: descale and narrow to 16-bit. */ + row0 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp10_l, tmp0_l), CONST_BITS), + vrshrn_n_s32(vaddq_s32(tmp10_h, tmp0_h), CONST_BITS)); + row1 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp10_l, tmp0_l), CONST_BITS), + vrshrn_n_s32(vsubq_s32(tmp10_h, tmp0_h), CONST_BITS)); + + /* Transpose two rows ready for second pass. */ + int16x8x2_t cols_0246_1357 = vtrnq_s16(row0, row1); + int16x8_t cols_0246 = cols_0246_1357.val[0]; + int16x8_t cols_1357 = cols_0246_1357.val[1]; + /* Duplicate columns such that each is accessible in its own vector. */ + int32x4x2_t cols_1155_3377 = vtrnq_s32(vreinterpretq_s32_s16(cols_1357), + vreinterpretq_s32_s16(cols_1357)); + int16x8_t cols_1155 = vreinterpretq_s16_s32(cols_1155_3377.val[0]); + int16x8_t cols_3377 = vreinterpretq_s16_s32(cols_1155_3377.val[1]); + + /* Pass 2: process 2 rows, store to output array. */ + /* Even part: only interested in col0; top half of tmp10 is "don't care". */ + int32x4_t tmp10 = vshll_n_s16(vget_low_s16(cols_0246), CONST_BITS + 2); + + /* Odd part. Only interested in bottom half of tmp0. */ + int32x4_t tmp0 = vmull_n_s16(vget_low_s16(cols_1155), F_3_624); + tmp0 = vmlal_n_s16(tmp0, vget_low_s16(cols_3377), -F_1_272); + tmp0 = vmlal_n_s16(tmp0, vget_high_s16(cols_1155), F_0_850); + tmp0 = vmlal_n_s16(tmp0, vget_high_s16(cols_3377), -F_0_720); + + /* Final output stage: descale and clamp to range [0-255]. */ + int16x8_t output_s16 = vcombine_s16(vaddhn_s32(tmp10, tmp0), + vsubhn_s32(tmp10, tmp0)); + output_s16 = vrsraq_n_s16(vdupq_n_s16(CENTERJSAMPLE), output_s16, + CONST_BITS + PASS1_BITS + 3 + 2 - 16); + /* Narrow to 8-bit and convert to unsigned. */ + uint8x8_t output_u8 = vqmovun_s16(output_s16); + + /* Store 2x2 block to memory. */ + vst1_lane_u8(output_buf[0] + output_col, output_u8, 0); + vst1_lane_u8(output_buf[1] + output_col, output_u8, 1); + vst1_lane_u8(output_buf[0] + output_col + 1, output_u8, 4); + vst1_lane_u8(output_buf[1] + output_col + 1, output_u8, 5); +} |