diff options
author | Jonathan Wright <jonathan.wright@arm.com> | 2020-07-01 17:24:54 +0100 |
---|---|---|
committer | Jonathan Wright <jonathan.wright@arm.com> | 2020-09-03 13:55:00 +0100 |
commit | 628b6a47cc8f16610d1ada02fc5c907652aa735d (patch) | |
tree | 608941267e5fb03be45a7122cc2c53d556a91529 | |
parent | a2a9d5e790805e88e560e7c84732b4e393597c36 (diff) |
Implement sample conversion using Arm NEON intrinsics
Adds an Arm NEON intrinsics implementation of 'convsamp'.
Removes the NEON assembly implementations for both AArch32 and
AArch64.
Bug: 922430
Change-Id: I77a705c6ebb1eb6be5ec9b73fc440046b84df76e
-rw-r--r-- | BUILD.gn | 2 | ||||
-rw-r--r-- | README.chromium | 1 | ||||
-rw-r--r-- | simd/arm/arm/jsimd_neon.S | 63 | ||||
-rw-r--r-- | simd/arm/arm64/jsimd_neon.S | 76 | ||||
-rw-r--r-- | simd/arm/common/jquanti-neon.c | 82 |
5 files changed, 85 insertions, 139 deletions
@@ -164,6 +164,7 @@ static_library("simd") { "simd/arm/common/jidctfst-neon.c", "simd/arm/common/jidctint-neon.c", "simd/arm/common/jidctred-neon.c", + "simd/arm/common/jquanti-neon.c", ] configs -= [ "//build/config/compiler:default_optimization" ] configs += [ "//build/config/compiler:optimize_speed" ] @@ -180,6 +181,7 @@ static_library("simd") { "simd/arm/common/jidctfst-neon.c", "simd/arm/common/jidctint-neon.c", "simd/arm/common/jidctred-neon.c", + "simd/arm/common/jquanti-neon.c", ] configs -= [ "//build/config/compiler:default_optimization" ] configs += [ "//build/config/compiler:optimize_speed" ] diff --git a/README.chromium b/README.chromium index 1ae3e1a..6db7a50 100644 --- a/README.chromium +++ b/README.chromium @@ -76,6 +76,7 @@ following changes which are not merged to upstream: - Implement RGB->YCbCr using Arm NEON intrinsics - Add Arm NEON implementation of RGB->Grayscale - Add compiler-independent alignment macro + - Implement sample conversion using Arm NEON intrinsics * Patches to enable running the upstream unit tests through gtest. The upstream unit tests are defined here under the section 'TESTS': https://github.com/libjpeg-turbo/libjpeg-turbo/blob/master/CMakeLists.txt diff --git a/simd/arm/arm/jsimd_neon.S b/simd/arm/arm/jsimd_neon.S index 2aac28b..6565a0d 100644 --- a/simd/arm/arm/jsimd_neon.S +++ b/simd/arm/arm/jsimd_neon.S @@ -65,69 +65,6 @@ _\fname: /*****************************************************************************/ /* - * Load data into workspace, applying unsigned->signed conversion - * - * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get - * rid of VST1.16 instructions - */ - -asm_function jsimd_convsamp_neon - SAMPLE_DATA .req r0 - START_COL .req r1 - WORKSPACE .req r2 - TMP1 .req r3 - TMP2 .req r4 - TMP3 .req r5 - TMP4 .req ip - - push {r4, r5} - vmov.u8 d0, #128 - - ldmia SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4} - add TMP1, TMP1, START_COL - add TMP2, TMP2, START_COL - add TMP3, TMP3, START_COL - add TMP4, TMP4, START_COL - vld1.8 {d16}, [TMP1] - vsubl.u8 q8, d16, d0 - vld1.8 {d18}, [TMP2] - vsubl.u8 q9, d18, d0 - vld1.8 {d20}, [TMP3] - vsubl.u8 q10, d20, d0 - vld1.8 {d22}, [TMP4] - ldmia SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4} - vsubl.u8 q11, d22, d0 - vst1.16 {d16, d17, d18, d19}, [WORKSPACE, :128]! - add TMP1, TMP1, START_COL - add TMP2, TMP2, START_COL - vst1.16 {d20, d21, d22, d23}, [WORKSPACE, :128]! - add TMP3, TMP3, START_COL - add TMP4, TMP4, START_COL - vld1.8 {d24}, [TMP1] - vsubl.u8 q12, d24, d0 - vld1.8 {d26}, [TMP2] - vsubl.u8 q13, d26, d0 - vld1.8 {d28}, [TMP3] - vsubl.u8 q14, d28, d0 - vld1.8 {d30}, [TMP4] - vsubl.u8 q15, d30, d0 - vst1.16 {d24, d25, d26, d27}, [WORKSPACE, :128]! - vst1.16 {d28, d29, d30, d31}, [WORKSPACE, :128]! - pop {r4, r5} - bx lr - - .unreq SAMPLE_DATA - .unreq START_COL - .unreq WORKSPACE - .unreq TMP1 - .unreq TMP2 - .unreq TMP3 - .unreq TMP4 - - -/*****************************************************************************/ - -/* * jsimd_fdct_ifast_neon * * This function contains a fast, not so accurate integer implementation of diff --git a/simd/arm/arm64/jsimd_neon.S b/simd/arm/arm64/jsimd_neon.S index 7c13445..fc60ad4 100644 --- a/simd/arm/arm64/jsimd_neon.S +++ b/simd/arm/arm64/jsimd_neon.S @@ -196,82 +196,6 @@ _\fname: /*****************************************************************************/ /* - * Load data into workspace, applying unsigned->signed conversion - * - * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get - * rid of VST1.16 instructions - */ - -asm_function jsimd_convsamp_neon - SAMPLE_DATA .req x0 - START_COL .req x1 - WORKSPACE .req x2 - TMP1 .req x9 - TMP2 .req x10 - TMP3 .req x11 - TMP4 .req x12 - TMP5 .req x13 - TMP6 .req x14 - TMP7 .req x15 - TMP8 .req x4 - TMPDUP .req w3 - - /* START_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't - guarantee that the upper (unused) 32 bits of x1 are valid. This - instruction ensures that those bits are set to zero. */ - uxtw x1, w1 - - mov TMPDUP, #128 - ldp TMP1, TMP2, [SAMPLE_DATA], 16 - ldp TMP3, TMP4, [SAMPLE_DATA], 16 - dup v0.8b, TMPDUP - add TMP1, TMP1, START_COL - add TMP2, TMP2, START_COL - ldp TMP5, TMP6, [SAMPLE_DATA], 16 - add TMP3, TMP3, START_COL - add TMP4, TMP4, START_COL - ldp TMP7, TMP8, [SAMPLE_DATA], 16 - add TMP5, TMP5, START_COL - add TMP6, TMP6, START_COL - ld1 {v16.8b}, [TMP1] - add TMP7, TMP7, START_COL - add TMP8, TMP8, START_COL - ld1 {v17.8b}, [TMP2] - usubl v16.8h, v16.8b, v0.8b - ld1 {v18.8b}, [TMP3] - usubl v17.8h, v17.8b, v0.8b - ld1 {v19.8b}, [TMP4] - usubl v18.8h, v18.8b, v0.8b - ld1 {v20.8b}, [TMP5] - usubl v19.8h, v19.8b, v0.8b - ld1 {v21.8b}, [TMP6] - st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [WORKSPACE], 64 - usubl v20.8h, v20.8b, v0.8b - ld1 {v22.8b}, [TMP7] - usubl v21.8h, v21.8b, v0.8b - ld1 {v23.8b}, [TMP8] - usubl v22.8h, v22.8b, v0.8b - usubl v23.8h, v23.8b, v0.8b - st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [WORKSPACE], 64 - - br x30 - - .unreq SAMPLE_DATA - .unreq START_COL - .unreq WORKSPACE - .unreq TMP1 - .unreq TMP2 - .unreq TMP3 - .unreq TMP4 - .unreq TMP5 - .unreq TMP6 - .unreq TMP7 - .unreq TMP8 - .unreq TMPDUP - -/*****************************************************************************/ - -/* * jsimd_fdct_islow_neon * * This file contains a slow-but-accurate integer implementation of the diff --git a/simd/arm/common/jquanti-neon.c b/simd/arm/common/jquanti-neon.c new file mode 100644 index 0000000..ed0c1b3 --- /dev/null +++ b/simd/arm/common/jquanti-neon.c @@ -0,0 +1,82 @@ +/* + * jquanti-neon.c - sample quantization (Arm NEON) + * + * Copyright 2020 The Chromium Authors. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +#define JPEG_INTERNALS +#include "../../../jinclude.h" +#include "../../../jpeglib.h" +#include "../../../jsimd.h" +#include "../../../jdct.h" +#include "../../../jsimddct.h" +#include "../../jsimd.h" + +#include <arm_neon.h> + +/* + * Pixel channel sample values have range [0,255]. The Discrete Cosine + * Transform (DCT) operates on values centered around 0. + * + * To prepare sample values for the DCT, load samples into a DCT workspace, + * subtracting CENTREJSAMPLE (128). The samples, now in range [-128, 127], + * are also widened from 8- to 16-bit. + * + * The equivalent scalar C function 'convsamp' can be found in jcdctmgr.c. + */ + +void jsimd_convsamp_neon(JSAMPARRAY sample_data, + JDIMENSION start_col, + DCTELEM *workspace) +{ + uint8x8_t samp_row0 = vld1_u8(sample_data[0] + start_col); + uint8x8_t samp_row1 = vld1_u8(sample_data[1] + start_col); + uint8x8_t samp_row2 = vld1_u8(sample_data[2] + start_col); + uint8x8_t samp_row3 = vld1_u8(sample_data[3] + start_col); + uint8x8_t samp_row4 = vld1_u8(sample_data[4] + start_col); + uint8x8_t samp_row5 = vld1_u8(sample_data[5] + start_col); + uint8x8_t samp_row6 = vld1_u8(sample_data[6] + start_col); + uint8x8_t samp_row7 = vld1_u8(sample_data[7] + start_col); + + int16x8_t row0 = vreinterpretq_s16_u16(vsubl_u8(samp_row0, + vdup_n_u8(CENTERJSAMPLE))); + int16x8_t row1 = vreinterpretq_s16_u16(vsubl_u8(samp_row1, + vdup_n_u8(CENTERJSAMPLE))); + int16x8_t row2 = vreinterpretq_s16_u16(vsubl_u8(samp_row2, + vdup_n_u8(CENTERJSAMPLE))); + int16x8_t row3 = vreinterpretq_s16_u16(vsubl_u8(samp_row3, + vdup_n_u8(CENTERJSAMPLE))); + int16x8_t row4 = vreinterpretq_s16_u16(vsubl_u8(samp_row4, + vdup_n_u8(CENTERJSAMPLE))); + int16x8_t row5 = vreinterpretq_s16_u16(vsubl_u8(samp_row5, + vdup_n_u8(CENTERJSAMPLE))); + int16x8_t row6 = vreinterpretq_s16_u16(vsubl_u8(samp_row6, + vdup_n_u8(CENTERJSAMPLE))); + int16x8_t row7 = vreinterpretq_s16_u16(vsubl_u8(samp_row7, + vdup_n_u8(CENTERJSAMPLE))); + + vst1q_s16(workspace + 0 * DCTSIZE, row0); + vst1q_s16(workspace + 1 * DCTSIZE, row1); + vst1q_s16(workspace + 2 * DCTSIZE, row2); + vst1q_s16(workspace + 3 * DCTSIZE, row3); + vst1q_s16(workspace + 4 * DCTSIZE, row4); + vst1q_s16(workspace + 5 * DCTSIZE, row5); + vst1q_s16(workspace + 6 * DCTSIZE, row6); + vst1q_s16(workspace + 7 * DCTSIZE, row7); +} |