Implement sample conversion using Arm NEON intrinsics

Adds an Arm NEON intrinsics implementation of 'convsamp'. Removes the NEON assembly implementations for both AArch32 and AArch64. Bug: 922430 Change-Id: I77a705c6ebb1eb6be5ec9b73fc440046b84df76e
author: Jonathan Wright <jonathan.wright@arm.com> 2020-07-01 17:24:54 +0100
committer: Jonathan Wright <jonathan.wright@arm.com> 2020-09-03 13:55:00 +0100
commit: 628b6a47cc8f16610d1ada02fc5c907652aa735d (patch)
tree: 608941267e5fb03be45a7122cc2c53d556a91529
parent: a2a9d5e790805e88e560e7c84732b4e393597c36 (diff)
5 files changed, 85 insertions, 139 deletions
diff --git a/BUILD.gn b/BUILD.gn
index 901956a..942c490 100644
--- a/BUILD.gn
+++ b/BUILD.gn
@@ -164,6 +164,7 @@ static_library("simd") {
       "simd/arm/common/jidctfst-neon.c",
       "simd/arm/common/jidctint-neon.c",
       "simd/arm/common/jidctred-neon.c",
+      "simd/arm/common/jquanti-neon.c",
     ]
     configs -= [ "//build/config/compiler:default_optimization" ]
     configs += [ "//build/config/compiler:optimize_speed" ]
@@ -180,6 +181,7 @@ static_library("simd") {
       "simd/arm/common/jidctfst-neon.c",
       "simd/arm/common/jidctint-neon.c",
       "simd/arm/common/jidctred-neon.c",
+      "simd/arm/common/jquanti-neon.c",
     ]
     configs -= [ "//build/config/compiler:default_optimization" ]
     configs += [ "//build/config/compiler:optimize_speed" ]
diff --git a/README.chromium b/README.chromium
index 1ae3e1a..6db7a50 100644
--- a/README.chromium
+++ b/README.chromium
@@ -76,6 +76,7 @@ following changes which are not merged to upstream:
   - Implement RGB->YCbCr using Arm NEON intrinsics
   - Add Arm NEON implementation of RGB->Grayscale
   - Add compiler-independent alignment macro
+  - Implement sample conversion using Arm NEON intrinsics
 * Patches to enable running the upstream unit tests through gtest.
   The upstream unit tests are defined here under the section 'TESTS':
   https://github.com/libjpeg-turbo/libjpeg-turbo/blob/master/CMakeLists.txt
diff --git a/simd/arm/arm/jsimd_neon.S b/simd/arm/arm/jsimd_neon.S
index 2aac28b..6565a0d 100644
--- a/simd/arm/arm/jsimd_neon.S
+++ b/simd/arm/arm/jsimd_neon.S
@@ -65,69 +65,6 @@ _\fname:
 /*****************************************************************************/
 
 /*
- * Load data into workspace, applying unsigned->signed conversion
- *
- * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get
- *       rid of VST1.16 instructions
- */
-
-asm_function jsimd_convsamp_neon
-    SAMPLE_DATA     .req r0
-    START_COL       .req r1
-    WORKSPACE       .req r2
-    TMP1            .req r3
-    TMP2            .req r4
-    TMP3            .req r5
-    TMP4            .req ip
-
-    push            {r4, r5}
-    vmov.u8         d0, #128
-
-    ldmia           SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}
-    add             TMP1, TMP1, START_COL
-    add             TMP2, TMP2, START_COL
-    add             TMP3, TMP3, START_COL
-    add             TMP4, TMP4, START_COL
-    vld1.8          {d16}, [TMP1]
-    vsubl.u8        q8, d16, d0
-    vld1.8          {d18}, [TMP2]
-    vsubl.u8        q9, d18, d0
-    vld1.8          {d20}, [TMP3]
-    vsubl.u8        q10, d20, d0
-    vld1.8          {d22}, [TMP4]
-    ldmia           SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}
-    vsubl.u8        q11, d22, d0
-    vst1.16         {d16, d17, d18, d19}, [WORKSPACE, :128]!
-    add             TMP1, TMP1, START_COL
-    add             TMP2, TMP2, START_COL
-    vst1.16         {d20, d21, d22, d23}, [WORKSPACE, :128]!
-    add             TMP3, TMP3, START_COL
-    add             TMP4, TMP4, START_COL
-    vld1.8          {d24}, [TMP1]
-    vsubl.u8        q12, d24, d0
-    vld1.8          {d26}, [TMP2]
-    vsubl.u8        q13, d26, d0
-    vld1.8          {d28}, [TMP3]
-    vsubl.u8        q14, d28, d0
-    vld1.8          {d30}, [TMP4]
-    vsubl.u8        q15, d30, d0
-    vst1.16         {d24, d25, d26, d27}, [WORKSPACE, :128]!
-    vst1.16         {d28, d29, d30, d31}, [WORKSPACE, :128]!
-    pop             {r4, r5}
-    bx              lr
-
-    .unreq          SAMPLE_DATA
-    .unreq          START_COL
-    .unreq          WORKSPACE
-    .unreq          TMP1
-    .unreq          TMP2
-    .unreq          TMP3
-    .unreq          TMP4
-
-
-/*****************************************************************************/
-
-/*
  * jsimd_fdct_ifast_neon
  *
  * This function contains a fast, not so accurate integer implementation of
diff --git a/simd/arm/arm64/jsimd_neon.S b/simd/arm/arm64/jsimd_neon.S
index 7c13445..fc60ad4 100644
--- a/simd/arm/arm64/jsimd_neon.S
+++ b/simd/arm/arm64/jsimd_neon.S
@@ -196,82 +196,6 @@ _\fname:
 /*****************************************************************************/
 
 /*
- * Load data into workspace, applying unsigned->signed conversion
- *
- * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get
- *       rid of VST1.16 instructions
- */
-
-asm_function jsimd_convsamp_neon
-    SAMPLE_DATA     .req x0
-    START_COL       .req x1
-    WORKSPACE       .req x2
-    TMP1            .req x9
-    TMP2            .req x10
-    TMP3            .req x11
-    TMP4            .req x12
-    TMP5            .req x13
-    TMP6            .req x14
-    TMP7            .req x15
-    TMP8            .req x4
-    TMPDUP          .req w3
-
-    /* START_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
-       guarantee that the upper (unused) 32 bits of x1 are valid.  This
-       instruction ensures that those bits are set to zero. */
-    uxtw x1, w1
-
-    mov             TMPDUP, #128
-    ldp             TMP1, TMP2, [SAMPLE_DATA], 16
-    ldp             TMP3, TMP4, [SAMPLE_DATA], 16
-    dup             v0.8b, TMPDUP
-    add             TMP1, TMP1, START_COL
-    add             TMP2, TMP2, START_COL
-    ldp             TMP5, TMP6, [SAMPLE_DATA], 16
-    add             TMP3, TMP3, START_COL
-    add             TMP4, TMP4, START_COL
-    ldp             TMP7, TMP8, [SAMPLE_DATA], 16
-    add             TMP5, TMP5, START_COL
-    add             TMP6, TMP6, START_COL
-    ld1             {v16.8b}, [TMP1]
-    add             TMP7, TMP7, START_COL
-    add             TMP8, TMP8, START_COL
-    ld1             {v17.8b}, [TMP2]
-    usubl           v16.8h, v16.8b, v0.8b
-    ld1             {v18.8b}, [TMP3]
-    usubl           v17.8h, v17.8b, v0.8b
-    ld1             {v19.8b}, [TMP4]
-    usubl           v18.8h, v18.8b, v0.8b
-    ld1             {v20.8b}, [TMP5]
-    usubl           v19.8h, v19.8b, v0.8b
-    ld1             {v21.8b}, [TMP6]
-    st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [WORKSPACE], 64
-    usubl           v20.8h, v20.8b, v0.8b
-    ld1             {v22.8b}, [TMP7]
-    usubl           v21.8h, v21.8b, v0.8b
-    ld1             {v23.8b}, [TMP8]
-    usubl           v22.8h, v22.8b, v0.8b
-    usubl           v23.8h, v23.8b, v0.8b
-    st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [WORKSPACE], 64
-
-    br              x30
-
-    .unreq          SAMPLE_DATA
-    .unreq          START_COL
-    .unreq          WORKSPACE
-    .unreq          TMP1
-    .unreq          TMP2
-    .unreq          TMP3
-    .unreq          TMP4
-    .unreq          TMP5
-    .unreq          TMP6
-    .unreq          TMP7
-    .unreq          TMP8
-    .unreq          TMPDUP
-
-/*****************************************************************************/
-
-/*
  * jsimd_fdct_islow_neon
  *
  * This file contains a slow-but-accurate integer implementation of the
diff --git a/simd/arm/common/jquanti-neon.c b/simd/arm/common/jquanti-neon.c
new file mode 100644
index 0000000..ed0c1b3
--- /dev/null
+++ b/simd/arm/common/jquanti-neon.c
@@ -0,0 +1,82 @@
+/*
+ * jquanti-neon.c - sample quantization (Arm NEON)
+ *
+ * Copyright 2020 The Chromium Authors. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../../jinclude.h"
+#include "../../../jpeglib.h"
+#include "../../../jsimd.h"
+#include "../../../jdct.h"
+#include "../../../jsimddct.h"
+#include "../../jsimd.h"
+
+#include <arm_neon.h>
+
+/*
+ * Pixel channel sample values have range [0,255]. The Discrete Cosine
+ * Transform (DCT) operates on values centered around 0.
+ *
+ * To prepare sample values for the DCT, load samples into a DCT workspace,
+ * subtracting CENTREJSAMPLE (128). The samples, now in range [-128, 127],
+ * are also widened from 8- to 16-bit.
+ *
+ * The equivalent scalar C function 'convsamp' can be found in jcdctmgr.c.
+ */
+
+void jsimd_convsamp_neon(JSAMPARRAY sample_data,
+                         JDIMENSION start_col,
+                         DCTELEM *workspace)
+{
+  uint8x8_t samp_row0 = vld1_u8(sample_data[0] + start_col);
+  uint8x8_t samp_row1 = vld1_u8(sample_data[1] + start_col);
+  uint8x8_t samp_row2 = vld1_u8(sample_data[2] + start_col);
+  uint8x8_t samp_row3 = vld1_u8(sample_data[3] + start_col);
+  uint8x8_t samp_row4 = vld1_u8(sample_data[4] + start_col);
+  uint8x8_t samp_row5 = vld1_u8(sample_data[5] + start_col);
+  uint8x8_t samp_row6 = vld1_u8(sample_data[6] + start_col);
+  uint8x8_t samp_row7 = vld1_u8(sample_data[7] + start_col);
+
+  int16x8_t row0 = vreinterpretq_s16_u16(vsubl_u8(samp_row0,
+                                                  vdup_n_u8(CENTERJSAMPLE)));
+  int16x8_t row1 = vreinterpretq_s16_u16(vsubl_u8(samp_row1,
+                                                  vdup_n_u8(CENTERJSAMPLE)));
+  int16x8_t row2 = vreinterpretq_s16_u16(vsubl_u8(samp_row2,
+                                                  vdup_n_u8(CENTERJSAMPLE)));
+  int16x8_t row3 = vreinterpretq_s16_u16(vsubl_u8(samp_row3,
+                                                  vdup_n_u8(CENTERJSAMPLE)));
+  int16x8_t row4 = vreinterpretq_s16_u16(vsubl_u8(samp_row4,
+                                                  vdup_n_u8(CENTERJSAMPLE)));
+  int16x8_t row5 = vreinterpretq_s16_u16(vsubl_u8(samp_row5,
+                                                  vdup_n_u8(CENTERJSAMPLE)));
+  int16x8_t row6 = vreinterpretq_s16_u16(vsubl_u8(samp_row6,
+                                                  vdup_n_u8(CENTERJSAMPLE)));
+  int16x8_t row7 = vreinterpretq_s16_u16(vsubl_u8(samp_row7,
+                                                  vdup_n_u8(CENTERJSAMPLE)));
+
+  vst1q_s16(workspace + 0 * DCTSIZE, row0);
+  vst1q_s16(workspace + 1 * DCTSIZE, row1);
+  vst1q_s16(workspace + 2 * DCTSIZE, row2);
+  vst1q_s16(workspace + 3 * DCTSIZE, row3);
+  vst1q_s16(workspace + 4 * DCTSIZE, row4);
+  vst1q_s16(workspace + 5 * DCTSIZE, row5);
+  vst1q_s16(workspace + 6 * DCTSIZE, row6);
+  vst1q_s16(workspace + 7 * DCTSIZE, row7);
+}
author	Jonathan Wright <jonathan.wright@arm.com>	2020-07-01 17:24:54 +0100
committer	Jonathan Wright <jonathan.wright@arm.com>	2020-09-03 13:55:00 +0100
commit	628b6a47cc8f16610d1ada02fc5c907652aa735d (patch)
tree	608941267e5fb03be45a7122cc2c53d556a91529
parent	a2a9d5e790805e88e560e7c84732b4e393597c36 (diff)