summaryrefslogtreecommitdiff
path: root/simd
diff options
context:
space:
mode:
authorJonathan Wright <jonathan.wright@arm.com>2020-07-01 17:24:54 +0100
committerJonathan Wright <jonathan.wright@arm.com>2020-09-03 13:55:00 +0100
commit628b6a47cc8f16610d1ada02fc5c907652aa735d (patch)
tree608941267e5fb03be45a7122cc2c53d556a91529 /simd
parenta2a9d5e790805e88e560e7c84732b4e393597c36 (diff)
Implement sample conversion using Arm NEON intrinsics
Adds an Arm NEON intrinsics implementation of 'convsamp'. Removes the NEON assembly implementations for both AArch32 and AArch64. Bug: 922430 Change-Id: I77a705c6ebb1eb6be5ec9b73fc440046b84df76e
Diffstat (limited to 'simd')
-rw-r--r--simd/arm/arm/jsimd_neon.S63
-rw-r--r--simd/arm/arm64/jsimd_neon.S76
-rw-r--r--simd/arm/common/jquanti-neon.c82
3 files changed, 82 insertions, 139 deletions
diff --git a/simd/arm/arm/jsimd_neon.S b/simd/arm/arm/jsimd_neon.S
index 2aac28b..6565a0d 100644
--- a/simd/arm/arm/jsimd_neon.S
+++ b/simd/arm/arm/jsimd_neon.S
@@ -65,69 +65,6 @@ _\fname:
/*****************************************************************************/
/*
- * Load data into workspace, applying unsigned->signed conversion
- *
- * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get
- * rid of VST1.16 instructions
- */
-
-asm_function jsimd_convsamp_neon
- SAMPLE_DATA .req r0
- START_COL .req r1
- WORKSPACE .req r2
- TMP1 .req r3
- TMP2 .req r4
- TMP3 .req r5
- TMP4 .req ip
-
- push {r4, r5}
- vmov.u8 d0, #128
-
- ldmia SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}
- add TMP1, TMP1, START_COL
- add TMP2, TMP2, START_COL
- add TMP3, TMP3, START_COL
- add TMP4, TMP4, START_COL
- vld1.8 {d16}, [TMP1]
- vsubl.u8 q8, d16, d0
- vld1.8 {d18}, [TMP2]
- vsubl.u8 q9, d18, d0
- vld1.8 {d20}, [TMP3]
- vsubl.u8 q10, d20, d0
- vld1.8 {d22}, [TMP4]
- ldmia SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}
- vsubl.u8 q11, d22, d0
- vst1.16 {d16, d17, d18, d19}, [WORKSPACE, :128]!
- add TMP1, TMP1, START_COL
- add TMP2, TMP2, START_COL
- vst1.16 {d20, d21, d22, d23}, [WORKSPACE, :128]!
- add TMP3, TMP3, START_COL
- add TMP4, TMP4, START_COL
- vld1.8 {d24}, [TMP1]
- vsubl.u8 q12, d24, d0
- vld1.8 {d26}, [TMP2]
- vsubl.u8 q13, d26, d0
- vld1.8 {d28}, [TMP3]
- vsubl.u8 q14, d28, d0
- vld1.8 {d30}, [TMP4]
- vsubl.u8 q15, d30, d0
- vst1.16 {d24, d25, d26, d27}, [WORKSPACE, :128]!
- vst1.16 {d28, d29, d30, d31}, [WORKSPACE, :128]!
- pop {r4, r5}
- bx lr
-
- .unreq SAMPLE_DATA
- .unreq START_COL
- .unreq WORKSPACE
- .unreq TMP1
- .unreq TMP2
- .unreq TMP3
- .unreq TMP4
-
-
-/*****************************************************************************/
-
-/*
* jsimd_fdct_ifast_neon
*
* This function contains a fast, not so accurate integer implementation of
diff --git a/simd/arm/arm64/jsimd_neon.S b/simd/arm/arm64/jsimd_neon.S
index 7c13445..fc60ad4 100644
--- a/simd/arm/arm64/jsimd_neon.S
+++ b/simd/arm/arm64/jsimd_neon.S
@@ -196,82 +196,6 @@ _\fname:
/*****************************************************************************/
/*
- * Load data into workspace, applying unsigned->signed conversion
- *
- * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get
- * rid of VST1.16 instructions
- */
-
-asm_function jsimd_convsamp_neon
- SAMPLE_DATA .req x0
- START_COL .req x1
- WORKSPACE .req x2
- TMP1 .req x9
- TMP2 .req x10
- TMP3 .req x11
- TMP4 .req x12
- TMP5 .req x13
- TMP6 .req x14
- TMP7 .req x15
- TMP8 .req x4
- TMPDUP .req w3
-
- /* START_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
- guarantee that the upper (unused) 32 bits of x1 are valid. This
- instruction ensures that those bits are set to zero. */
- uxtw x1, w1
-
- mov TMPDUP, #128
- ldp TMP1, TMP2, [SAMPLE_DATA], 16
- ldp TMP3, TMP4, [SAMPLE_DATA], 16
- dup v0.8b, TMPDUP
- add TMP1, TMP1, START_COL
- add TMP2, TMP2, START_COL
- ldp TMP5, TMP6, [SAMPLE_DATA], 16
- add TMP3, TMP3, START_COL
- add TMP4, TMP4, START_COL
- ldp TMP7, TMP8, [SAMPLE_DATA], 16
- add TMP5, TMP5, START_COL
- add TMP6, TMP6, START_COL
- ld1 {v16.8b}, [TMP1]
- add TMP7, TMP7, START_COL
- add TMP8, TMP8, START_COL
- ld1 {v17.8b}, [TMP2]
- usubl v16.8h, v16.8b, v0.8b
- ld1 {v18.8b}, [TMP3]
- usubl v17.8h, v17.8b, v0.8b
- ld1 {v19.8b}, [TMP4]
- usubl v18.8h, v18.8b, v0.8b
- ld1 {v20.8b}, [TMP5]
- usubl v19.8h, v19.8b, v0.8b
- ld1 {v21.8b}, [TMP6]
- st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [WORKSPACE], 64
- usubl v20.8h, v20.8b, v0.8b
- ld1 {v22.8b}, [TMP7]
- usubl v21.8h, v21.8b, v0.8b
- ld1 {v23.8b}, [TMP8]
- usubl v22.8h, v22.8b, v0.8b
- usubl v23.8h, v23.8b, v0.8b
- st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [WORKSPACE], 64
-
- br x30
-
- .unreq SAMPLE_DATA
- .unreq START_COL
- .unreq WORKSPACE
- .unreq TMP1
- .unreq TMP2
- .unreq TMP3
- .unreq TMP4
- .unreq TMP5
- .unreq TMP6
- .unreq TMP7
- .unreq TMP8
- .unreq TMPDUP
-
-/*****************************************************************************/
-
-/*
* jsimd_fdct_islow_neon
*
* This file contains a slow-but-accurate integer implementation of the
diff --git a/simd/arm/common/jquanti-neon.c b/simd/arm/common/jquanti-neon.c
new file mode 100644
index 0000000..ed0c1b3
--- /dev/null
+++ b/simd/arm/common/jquanti-neon.c
@@ -0,0 +1,82 @@
+/*
+ * jquanti-neon.c - sample quantization (Arm NEON)
+ *
+ * Copyright 2020 The Chromium Authors. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../../jinclude.h"
+#include "../../../jpeglib.h"
+#include "../../../jsimd.h"
+#include "../../../jdct.h"
+#include "../../../jsimddct.h"
+#include "../../jsimd.h"
+
+#include <arm_neon.h>
+
+/*
+ * Pixel channel sample values have range [0,255]. The Discrete Cosine
+ * Transform (DCT) operates on values centered around 0.
+ *
+ * To prepare sample values for the DCT, load samples into a DCT workspace,
+ * subtracting CENTREJSAMPLE (128). The samples, now in range [-128, 127],
+ * are also widened from 8- to 16-bit.
+ *
+ * The equivalent scalar C function 'convsamp' can be found in jcdctmgr.c.
+ */
+
+void jsimd_convsamp_neon(JSAMPARRAY sample_data,
+ JDIMENSION start_col,
+ DCTELEM *workspace)
+{
+ uint8x8_t samp_row0 = vld1_u8(sample_data[0] + start_col);
+ uint8x8_t samp_row1 = vld1_u8(sample_data[1] + start_col);
+ uint8x8_t samp_row2 = vld1_u8(sample_data[2] + start_col);
+ uint8x8_t samp_row3 = vld1_u8(sample_data[3] + start_col);
+ uint8x8_t samp_row4 = vld1_u8(sample_data[4] + start_col);
+ uint8x8_t samp_row5 = vld1_u8(sample_data[5] + start_col);
+ uint8x8_t samp_row6 = vld1_u8(sample_data[6] + start_col);
+ uint8x8_t samp_row7 = vld1_u8(sample_data[7] + start_col);
+
+ int16x8_t row0 = vreinterpretq_s16_u16(vsubl_u8(samp_row0,
+ vdup_n_u8(CENTERJSAMPLE)));
+ int16x8_t row1 = vreinterpretq_s16_u16(vsubl_u8(samp_row1,
+ vdup_n_u8(CENTERJSAMPLE)));
+ int16x8_t row2 = vreinterpretq_s16_u16(vsubl_u8(samp_row2,
+ vdup_n_u8(CENTERJSAMPLE)));
+ int16x8_t row3 = vreinterpretq_s16_u16(vsubl_u8(samp_row3,
+ vdup_n_u8(CENTERJSAMPLE)));
+ int16x8_t row4 = vreinterpretq_s16_u16(vsubl_u8(samp_row4,
+ vdup_n_u8(CENTERJSAMPLE)));
+ int16x8_t row5 = vreinterpretq_s16_u16(vsubl_u8(samp_row5,
+ vdup_n_u8(CENTERJSAMPLE)));
+ int16x8_t row6 = vreinterpretq_s16_u16(vsubl_u8(samp_row6,
+ vdup_n_u8(CENTERJSAMPLE)));
+ int16x8_t row7 = vreinterpretq_s16_u16(vsubl_u8(samp_row7,
+ vdup_n_u8(CENTERJSAMPLE)));
+
+ vst1q_s16(workspace + 0 * DCTSIZE, row0);
+ vst1q_s16(workspace + 1 * DCTSIZE, row1);
+ vst1q_s16(workspace + 2 * DCTSIZE, row2);
+ vst1q_s16(workspace + 3 * DCTSIZE, row3);
+ vst1q_s16(workspace + 4 * DCTSIZE, row4);
+ vst1q_s16(workspace + 5 * DCTSIZE, row5);
+ vst1q_s16(workspace + 6 * DCTSIZE, row6);
+ vst1q_s16(workspace + 7 * DCTSIZE, row7);
+}