summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJonathan Wright <jonathan.wright@arm.com>2020-07-01 17:24:54 +0100
committerJonathan Wright <jonathan.wright@arm.com>2020-09-03 13:55:00 +0100
commit628b6a47cc8f16610d1ada02fc5c907652aa735d (patch)
tree608941267e5fb03be45a7122cc2c53d556a91529
parenta2a9d5e790805e88e560e7c84732b4e393597c36 (diff)
Implement sample conversion using Arm NEON intrinsics
Adds an Arm NEON intrinsics implementation of 'convsamp'. Removes the NEON assembly implementations for both AArch32 and AArch64. Bug: 922430 Change-Id: I77a705c6ebb1eb6be5ec9b73fc440046b84df76e
-rw-r--r--BUILD.gn2
-rw-r--r--README.chromium1
-rw-r--r--simd/arm/arm/jsimd_neon.S63
-rw-r--r--simd/arm/arm64/jsimd_neon.S76
-rw-r--r--simd/arm/common/jquanti-neon.c82
5 files changed, 85 insertions, 139 deletions
diff --git a/BUILD.gn b/BUILD.gn
index 901956a..942c490 100644
--- a/BUILD.gn
+++ b/BUILD.gn
@@ -164,6 +164,7 @@ static_library("simd") {
"simd/arm/common/jidctfst-neon.c",
"simd/arm/common/jidctint-neon.c",
"simd/arm/common/jidctred-neon.c",
+ "simd/arm/common/jquanti-neon.c",
]
configs -= [ "//build/config/compiler:default_optimization" ]
configs += [ "//build/config/compiler:optimize_speed" ]
@@ -180,6 +181,7 @@ static_library("simd") {
"simd/arm/common/jidctfst-neon.c",
"simd/arm/common/jidctint-neon.c",
"simd/arm/common/jidctred-neon.c",
+ "simd/arm/common/jquanti-neon.c",
]
configs -= [ "//build/config/compiler:default_optimization" ]
configs += [ "//build/config/compiler:optimize_speed" ]
diff --git a/README.chromium b/README.chromium
index 1ae3e1a..6db7a50 100644
--- a/README.chromium
+++ b/README.chromium
@@ -76,6 +76,7 @@ following changes which are not merged to upstream:
- Implement RGB->YCbCr using Arm NEON intrinsics
- Add Arm NEON implementation of RGB->Grayscale
- Add compiler-independent alignment macro
+ - Implement sample conversion using Arm NEON intrinsics
* Patches to enable running the upstream unit tests through gtest.
The upstream unit tests are defined here under the section 'TESTS':
https://github.com/libjpeg-turbo/libjpeg-turbo/blob/master/CMakeLists.txt
diff --git a/simd/arm/arm/jsimd_neon.S b/simd/arm/arm/jsimd_neon.S
index 2aac28b..6565a0d 100644
--- a/simd/arm/arm/jsimd_neon.S
+++ b/simd/arm/arm/jsimd_neon.S
@@ -65,69 +65,6 @@ _\fname:
/*****************************************************************************/
/*
- * Load data into workspace, applying unsigned->signed conversion
- *
- * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get
- * rid of VST1.16 instructions
- */
-
-asm_function jsimd_convsamp_neon
- SAMPLE_DATA .req r0
- START_COL .req r1
- WORKSPACE .req r2
- TMP1 .req r3
- TMP2 .req r4
- TMP3 .req r5
- TMP4 .req ip
-
- push {r4, r5}
- vmov.u8 d0, #128
-
- ldmia SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}
- add TMP1, TMP1, START_COL
- add TMP2, TMP2, START_COL
- add TMP3, TMP3, START_COL
- add TMP4, TMP4, START_COL
- vld1.8 {d16}, [TMP1]
- vsubl.u8 q8, d16, d0
- vld1.8 {d18}, [TMP2]
- vsubl.u8 q9, d18, d0
- vld1.8 {d20}, [TMP3]
- vsubl.u8 q10, d20, d0
- vld1.8 {d22}, [TMP4]
- ldmia SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}
- vsubl.u8 q11, d22, d0
- vst1.16 {d16, d17, d18, d19}, [WORKSPACE, :128]!
- add TMP1, TMP1, START_COL
- add TMP2, TMP2, START_COL
- vst1.16 {d20, d21, d22, d23}, [WORKSPACE, :128]!
- add TMP3, TMP3, START_COL
- add TMP4, TMP4, START_COL
- vld1.8 {d24}, [TMP1]
- vsubl.u8 q12, d24, d0
- vld1.8 {d26}, [TMP2]
- vsubl.u8 q13, d26, d0
- vld1.8 {d28}, [TMP3]
- vsubl.u8 q14, d28, d0
- vld1.8 {d30}, [TMP4]
- vsubl.u8 q15, d30, d0
- vst1.16 {d24, d25, d26, d27}, [WORKSPACE, :128]!
- vst1.16 {d28, d29, d30, d31}, [WORKSPACE, :128]!
- pop {r4, r5}
- bx lr
-
- .unreq SAMPLE_DATA
- .unreq START_COL
- .unreq WORKSPACE
- .unreq TMP1
- .unreq TMP2
- .unreq TMP3
- .unreq TMP4
-
-
-/*****************************************************************************/
-
-/*
* jsimd_fdct_ifast_neon
*
* This function contains a fast, not so accurate integer implementation of
diff --git a/simd/arm/arm64/jsimd_neon.S b/simd/arm/arm64/jsimd_neon.S
index 7c13445..fc60ad4 100644
--- a/simd/arm/arm64/jsimd_neon.S
+++ b/simd/arm/arm64/jsimd_neon.S
@@ -196,82 +196,6 @@ _\fname:
/*****************************************************************************/
/*
- * Load data into workspace, applying unsigned->signed conversion
- *
- * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get
- * rid of VST1.16 instructions
- */
-
-asm_function jsimd_convsamp_neon
- SAMPLE_DATA .req x0
- START_COL .req x1
- WORKSPACE .req x2
- TMP1 .req x9
- TMP2 .req x10
- TMP3 .req x11
- TMP4 .req x12
- TMP5 .req x13
- TMP6 .req x14
- TMP7 .req x15
- TMP8 .req x4
- TMPDUP .req w3
-
- /* START_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
- guarantee that the upper (unused) 32 bits of x1 are valid. This
- instruction ensures that those bits are set to zero. */
- uxtw x1, w1
-
- mov TMPDUP, #128
- ldp TMP1, TMP2, [SAMPLE_DATA], 16
- ldp TMP3, TMP4, [SAMPLE_DATA], 16
- dup v0.8b, TMPDUP
- add TMP1, TMP1, START_COL
- add TMP2, TMP2, START_COL
- ldp TMP5, TMP6, [SAMPLE_DATA], 16
- add TMP3, TMP3, START_COL
- add TMP4, TMP4, START_COL
- ldp TMP7, TMP8, [SAMPLE_DATA], 16
- add TMP5, TMP5, START_COL
- add TMP6, TMP6, START_COL
- ld1 {v16.8b}, [TMP1]
- add TMP7, TMP7, START_COL
- add TMP8, TMP8, START_COL
- ld1 {v17.8b}, [TMP2]
- usubl v16.8h, v16.8b, v0.8b
- ld1 {v18.8b}, [TMP3]
- usubl v17.8h, v17.8b, v0.8b
- ld1 {v19.8b}, [TMP4]
- usubl v18.8h, v18.8b, v0.8b
- ld1 {v20.8b}, [TMP5]
- usubl v19.8h, v19.8b, v0.8b
- ld1 {v21.8b}, [TMP6]
- st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [WORKSPACE], 64
- usubl v20.8h, v20.8b, v0.8b
- ld1 {v22.8b}, [TMP7]
- usubl v21.8h, v21.8b, v0.8b
- ld1 {v23.8b}, [TMP8]
- usubl v22.8h, v22.8b, v0.8b
- usubl v23.8h, v23.8b, v0.8b
- st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [WORKSPACE], 64
-
- br x30
-
- .unreq SAMPLE_DATA
- .unreq START_COL
- .unreq WORKSPACE
- .unreq TMP1
- .unreq TMP2
- .unreq TMP3
- .unreq TMP4
- .unreq TMP5
- .unreq TMP6
- .unreq TMP7
- .unreq TMP8
- .unreq TMPDUP
-
-/*****************************************************************************/
-
-/*
* jsimd_fdct_islow_neon
*
* This file contains a slow-but-accurate integer implementation of the
diff --git a/simd/arm/common/jquanti-neon.c b/simd/arm/common/jquanti-neon.c
new file mode 100644
index 0000000..ed0c1b3
--- /dev/null
+++ b/simd/arm/common/jquanti-neon.c
@@ -0,0 +1,82 @@
+/*
+ * jquanti-neon.c - sample quantization (Arm NEON)
+ *
+ * Copyright 2020 The Chromium Authors. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../../jinclude.h"
+#include "../../../jpeglib.h"
+#include "../../../jsimd.h"
+#include "../../../jdct.h"
+#include "../../../jsimddct.h"
+#include "../../jsimd.h"
+
+#include <arm_neon.h>
+
+/*
+ * Pixel channel sample values have range [0,255]. The Discrete Cosine
+ * Transform (DCT) operates on values centered around 0.
+ *
+ * To prepare sample values for the DCT, load samples into a DCT workspace,
+ * subtracting CENTREJSAMPLE (128). The samples, now in range [-128, 127],
+ * are also widened from 8- to 16-bit.
+ *
+ * The equivalent scalar C function 'convsamp' can be found in jcdctmgr.c.
+ */
+
+void jsimd_convsamp_neon(JSAMPARRAY sample_data,
+ JDIMENSION start_col,
+ DCTELEM *workspace)
+{
+ uint8x8_t samp_row0 = vld1_u8(sample_data[0] + start_col);
+ uint8x8_t samp_row1 = vld1_u8(sample_data[1] + start_col);
+ uint8x8_t samp_row2 = vld1_u8(sample_data[2] + start_col);
+ uint8x8_t samp_row3 = vld1_u8(sample_data[3] + start_col);
+ uint8x8_t samp_row4 = vld1_u8(sample_data[4] + start_col);
+ uint8x8_t samp_row5 = vld1_u8(sample_data[5] + start_col);
+ uint8x8_t samp_row6 = vld1_u8(sample_data[6] + start_col);
+ uint8x8_t samp_row7 = vld1_u8(sample_data[7] + start_col);
+
+ int16x8_t row0 = vreinterpretq_s16_u16(vsubl_u8(samp_row0,
+ vdup_n_u8(CENTERJSAMPLE)));
+ int16x8_t row1 = vreinterpretq_s16_u16(vsubl_u8(samp_row1,
+ vdup_n_u8(CENTERJSAMPLE)));
+ int16x8_t row2 = vreinterpretq_s16_u16(vsubl_u8(samp_row2,
+ vdup_n_u8(CENTERJSAMPLE)));
+ int16x8_t row3 = vreinterpretq_s16_u16(vsubl_u8(samp_row3,
+ vdup_n_u8(CENTERJSAMPLE)));
+ int16x8_t row4 = vreinterpretq_s16_u16(vsubl_u8(samp_row4,
+ vdup_n_u8(CENTERJSAMPLE)));
+ int16x8_t row5 = vreinterpretq_s16_u16(vsubl_u8(samp_row5,
+ vdup_n_u8(CENTERJSAMPLE)));
+ int16x8_t row6 = vreinterpretq_s16_u16(vsubl_u8(samp_row6,
+ vdup_n_u8(CENTERJSAMPLE)));
+ int16x8_t row7 = vreinterpretq_s16_u16(vsubl_u8(samp_row7,
+ vdup_n_u8(CENTERJSAMPLE)));
+
+ vst1q_s16(workspace + 0 * DCTSIZE, row0);
+ vst1q_s16(workspace + 1 * DCTSIZE, row1);
+ vst1q_s16(workspace + 2 * DCTSIZE, row2);
+ vst1q_s16(workspace + 3 * DCTSIZE, row3);
+ vst1q_s16(workspace + 4 * DCTSIZE, row4);
+ vst1q_s16(workspace + 5 * DCTSIZE, row5);
+ vst1q_s16(workspace + 6 * DCTSIZE, row6);
+ vst1q_s16(workspace + 7 * DCTSIZE, row7);
+}