summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r--  BUILD.gn                        |   2
-rw-r--r--  README.chromium                 |   1
-rw-r--r--  simd/arm/arm/jsimd_neon.S       | 153
-rw-r--r--  simd/arm/arm64/jsimd_neon.S     | 179
-rw-r--r--  simd/arm/common/jidctred-neon.c | 145
5 files changed, 148 insertions(+), 332 deletions(-)
diff --git a/BUILD.gn b/BUILD.gn
index 36df0fc..ff70cf0 100644
--- a/BUILD.gn
+++ b/BUILD.gn
@@ -153,6 +153,7 @@ static_library("simd") {
sources = [
"simd/arm/arm/jsimd.c",
"simd/arm/arm/jsimd_neon.S",
+ "simd/arm/common/jidctred-neon.c",
"simd/arm/common/jdcolor-neon.c",
"simd/arm/common/jdmerge-neon.c",
"simd/arm/common/jdsample-neon.c",
@@ -163,6 +164,7 @@ static_library("simd") {
sources = [
"simd/arm/arm64/jsimd.c",
"simd/arm/arm64/jsimd_neon.S",
+ "simd/arm/common/jidctred-neon.c",
"simd/arm/common/jdcolor-neon.c",
"simd/arm/common/jdmerge-neon.c",
"simd/arm/common/jdsample-neon.c",
diff --git a/README.chromium b/README.chromium
index 3be11df..7a8aa13 100644
--- a/README.chromium
+++ b/README.chromium
@@ -64,6 +64,7 @@ following changes which are not merged to upstream:
- Implement YCbCr->RGB565 using Arm NEON intrinsics
- Add Arm NEON implementation of h2v1_merged_upsample
- Add Arm NEON implementation of h2v2_merged_upsample
+ - Implement 2x2 IDCT using Arm NEON intrinsics
Refer to working-with-nested-repos [1] for details of how to setup your git
svn client to update the code (for making local changes, cherry picking from
diff --git a/simd/arm/arm/jsimd_neon.S b/simd/arm/arm/jsimd_neon.S
index 0e05819..2c44f88 100644
--- a/simd/arm/arm/jsimd_neon.S
+++ b/simd/arm/arm/jsimd_neon.S
@@ -1117,159 +1117,6 @@ asm_function jsimd_idct_4x4_neon
/*****************************************************************************/
/*
- * jsimd_idct_2x2_neon
- *
- * This function contains inverse-DCT code for getting reduced-size
- * 2x2 pixels output from an 8x8 DCT block. It uses the same calculations
- * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
- * function from jpeg-6b (jidctred.c).
- *
- * NOTE: jpeg-8 has an improved implementation of 2x2 inverse-DCT, which
- * requires much less arithmetic operations and hence should be faster.
- * The primary purpose of this particular NEON optimized function is
- * bit exact compatibility with jpeg-6b.
- */
-
-.balign 8
-jsimd_idct_2x2_neon_consts:
- .short -FIX_0_720959822 /* d0[0] */
- .short FIX_0_850430095 /* d0[1] */
- .short -FIX_1_272758580 /* d0[2] */
- .short FIX_3_624509785 /* d0[3] */
-
-.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
- vshll.s16 q14, \x4, #15
- vmull.s16 q13, \x6, d0[3]
- vmlal.s16 q13, \x10, d0[2]
- vmlal.s16 q13, \x12, d0[1]
- vmlal.s16 q13, \x16, d0[0]
-
- vadd.s32 q10, q14, q13
- vsub.s32 q14, q14, q13
-
- .if \shift > 16
- vrshr.s32 q10, q10, #\shift
- vrshr.s32 q14, q14, #\shift
- vmovn.s32 \y26, q10
- vmovn.s32 \y27, q14
- .else
- vrshrn.s32 \y26, q10, #\shift
- vrshrn.s32 \y27, q14, #\shift
- .endif
-.endm
-
-asm_function jsimd_idct_2x2_neon
-
- DCT_TABLE .req r0
- COEF_BLOCK .req r1
- OUTPUT_BUF .req r2
- OUTPUT_COL .req r3
- TMP1 .req r0
- TMP2 .req ip
-
- vpush {d8-d15}
-
- /* Load constants */
- adr TMP2, jsimd_idct_2x2_neon_consts
- vld1.16 {d0}, [TMP2, :64]
-
- /* Load all COEF_BLOCK into NEON registers with the following allocation:
- * 0 1 2 3 | 4 5 6 7
- * ---------+--------
- * 0 | d4 | d5
- * 1 | d6 | d7
- * 2 | - | -
- * 3 | d10 | d11
- * 4 | - | -
- * 5 | d12 | d13
- * 6 | - | -
- * 7 | d16 | d17
- */
- vld1.16 {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
- add COEF_BLOCK, COEF_BLOCK, #16
- vld1.16 {d10, d11}, [COEF_BLOCK, :128]!
- add COEF_BLOCK, COEF_BLOCK, #16
- vld1.16 {d12, d13}, [COEF_BLOCK, :128]!
- add COEF_BLOCK, COEF_BLOCK, #16
- vld1.16 {d16, d17}, [COEF_BLOCK, :128]!
- /* Dequantize */
- vld1.16 {d18, d19, d20, d21}, [DCT_TABLE, :128]!
- vmul.s16 q2, q2, q9
- vmul.s16 q3, q3, q10
- add DCT_TABLE, DCT_TABLE, #16
- vld1.16 {d24, d25}, [DCT_TABLE, :128]!
- vmul.s16 q5, q5, q12
- add DCT_TABLE, DCT_TABLE, #16
- vld1.16 {d26, d27}, [DCT_TABLE, :128]!
- vmul.s16 q6, q6, q13
- add DCT_TABLE, DCT_TABLE, #16
- vld1.16 {d30, d31}, [DCT_TABLE, :128]!
- vmul.s16 q8, q8, q15
-
- /* Pass 1 */
-#if 0
- idct_helper d4, d6, d10, d12, d16, 13, d4, d6
- transpose_4x4 d4, d6, d8, d10
- idct_helper d5, d7, d11, d13, d17, 13, d5, d7
- transpose_4x4 d5, d7, d9, d11
-#else
- vmull.s16 q13, d6, d0[3]
- vmlal.s16 q13, d10, d0[2]
- vmlal.s16 q13, d12, d0[1]
- vmlal.s16 q13, d16, d0[0]
- vmull.s16 q12, d7, d0[3]
- vmlal.s16 q12, d11, d0[2]
- vmlal.s16 q12, d13, d0[1]
- vmlal.s16 q12, d17, d0[0]
- vshll.s16 q14, d4, #15
- vshll.s16 q15, d5, #15
- vadd.s32 q10, q14, q13
- vsub.s32 q14, q14, q13
- vrshrn.s32 d4, q10, #13
- vrshrn.s32 d6, q14, #13
- vadd.s32 q10, q15, q12
- vsub.s32 q14, q15, q12
- vrshrn.s32 d5, q10, #13
- vrshrn.s32 d7, q14, #13
- vtrn.16 q2, q3
- vtrn.32 q3, q5
-#endif
-
- /* Pass 2 */
- idct_helper d4, d6, d10, d7, d11, 20, d26, d27
-
- /* Range limit */
- vmov.u16 q15, #0x80
- vadd.s16 q13, q13, q15
- vqmovun.s16 d26, q13
- vqmovun.s16 d27, q13
-
- /* Store results to the output buffer */
- ldmia OUTPUT_BUF, {TMP1, TMP2}
- add TMP1, TMP1, OUTPUT_COL
- add TMP2, TMP2, OUTPUT_COL
-
- vst1.8 {d26[0]}, [TMP1]!
- vst1.8 {d27[4]}, [TMP1]!
- vst1.8 {d26[1]}, [TMP2]!
- vst1.8 {d27[5]}, [TMP2]!
-
- vpop {d8-d15}
- bx lr
-
- .unreq DCT_TABLE
- .unreq COEF_BLOCK
- .unreq OUTPUT_BUF
- .unreq OUTPUT_COL
- .unreq TMP1
- .unreq TMP2
-
-.purgem idct_helper
-
-
-/*****************************************************************************/
-
-/*
* jsimd_extrgb_ycc_convert_neon
* jsimd_extbgr_ycc_convert_neon
* jsimd_extrgbx_ycc_convert_neon
diff --git a/simd/arm/arm64/jsimd_neon.S b/simd/arm/arm64/jsimd_neon.S
index 3a1d1ef..94f9b11 100644
--- a/simd/arm/arm64/jsimd_neon.S
+++ b/simd/arm/arm64/jsimd_neon.S
@@ -1251,185 +1251,6 @@ asm_function jsimd_idct_4x4_neon
/*****************************************************************************/
/*
- * jsimd_idct_2x2_neon
- *
- * This function contains inverse-DCT code for getting reduced-size
- * 2x2 pixels output from an 8x8 DCT block. It uses the same calculations
- * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
- * function from jpeg-6b (jidctred.c).
- *
- * NOTE: jpeg-8 has an improved implementation of 2x2 inverse-DCT, which
- * requires much less arithmetic operations and hence should be faster.
- * The primary purpose of this particular NEON optimized function is
- * bit exact compatibility with jpeg-6b.
- */
-
-.balign 8
-Ljsimd_idct_2x2_neon_consts:
- .short -FIX_0_720959822 /* v14[0] */
- .short FIX_0_850430095 /* v14[1] */
- .short -FIX_1_272758580 /* v14[2] */
- .short FIX_3_624509785 /* v14[3] */
-
-.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
- sshll v15.4s, \x4, #15
- smull v26.4s, \x6, v14.h[3]
- smlal v26.4s, \x10, v14.h[2]
- smlal v26.4s, \x12, v14.h[1]
- smlal v26.4s, \x16, v14.h[0]
-
- add v20.4s, v15.4s, v26.4s
- sub v15.4s, v15.4s, v26.4s
-
- .if \shift > 16
- srshr v20.4s, v20.4s, #\shift
- srshr v15.4s, v15.4s, #\shift
- xtn \y26, v20.4s
- xtn \y27, v15.4s
- .else
- rshrn \y26, v20.4s, #\shift
- rshrn \y27, v15.4s, #\shift
- .endif
-.endm
-
-asm_function jsimd_idct_2x2_neon
-
- DCT_TABLE .req x0
- COEF_BLOCK .req x1
- OUTPUT_BUF .req x2
- OUTPUT_COL .req x3
- TMP1 .req x0
- TMP2 .req x15
-
- /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
- guarantee that the upper (unused) 32 bits of x3 are valid. This
- instruction ensures that those bits are set to zero. */
- uxtw x3, w3
-
- /* vpush {v8.4h - v15.4h} ; not available */
- sub sp, sp, 64
- mov x9, sp
-
- /* Load constants */
- adr TMP2, Ljsimd_idct_2x2_neon_consts
- st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
- st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
- ld1 {v14.4h}, [TMP2]
-
- /* Load all COEF_BLOCK into NEON registers with the following allocation:
- * 0 1 2 3 | 4 5 6 7
- * ---------+--------
- * 0 | v4.4h | v5.4h
- * 1 | v6.4h | v7.4h
- * 2 | - | -
- * 3 | v10.4h | v11.4h
- * 4 | - | -
- * 5 | v12.4h | v13.4h
- * 6 | - | -
- * 7 | v16.4h | v17.4h
- */
- ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32
- add COEF_BLOCK, COEF_BLOCK, #16
- ld1 {v10.4h, v11.4h}, [COEF_BLOCK], 16
- add COEF_BLOCK, COEF_BLOCK, #16
- ld1 {v12.4h, v13.4h}, [COEF_BLOCK], 16
- add COEF_BLOCK, COEF_BLOCK, #16
- ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16
- /* Dequantize */
- ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
- mul v4.4h, v4.4h, v18.4h
- mul v5.4h, v5.4h, v19.4h
- ins v4.d[1], v5.d[0]
- mul v6.4h, v6.4h, v20.4h
- mul v7.4h, v7.4h, v21.4h
- ins v6.d[1], v7.d[0]
- add DCT_TABLE, DCT_TABLE, #16
- ld1 {v24.4h, v25.4h}, [DCT_TABLE], 16
- mul v10.4h, v10.4h, v24.4h
- mul v11.4h, v11.4h, v25.4h
- ins v10.d[1], v11.d[0]
- add DCT_TABLE, DCT_TABLE, #16
- ld1 {v26.4h, v27.4h}, [DCT_TABLE], 16
- mul v12.4h, v12.4h, v26.4h
- mul v13.4h, v13.4h, v27.4h
- ins v12.d[1], v13.d[0]
- add DCT_TABLE, DCT_TABLE, #16
- ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16
- mul v16.4h, v16.4h, v30.4h
- mul v17.4h, v17.4h, v31.4h
- ins v16.d[1], v17.d[0]
-
- /* Pass 1 */
-#if 0
- idct_helper v4.4h, v6.4h, v10.4h, v12.4h, v16.4h, 13, v4.4h, v6.4h
- transpose_4x4 v4.4h, v6.4h, v8.4h, v10.4h
- idct_helper v5.4h, v7.4h, v11.4h, v13.4h, v17.4h, 13, v5.4h, v7.4h
- transpose_4x4 v5.4h, v7.4h, v9.4h, v11.4h
-#else
- smull v26.4s, v6.4h, v14.h[3]
- smlal v26.4s, v10.4h, v14.h[2]
- smlal v26.4s, v12.4h, v14.h[1]
- smlal v26.4s, v16.4h, v14.h[0]
- smull v24.4s, v7.4h, v14.h[3]
- smlal v24.4s, v11.4h, v14.h[2]
- smlal v24.4s, v13.4h, v14.h[1]
- smlal v24.4s, v17.4h, v14.h[0]
- sshll v15.4s, v4.4h, #15
- sshll v30.4s, v5.4h, #15
- add v20.4s, v15.4s, v26.4s
- sub v15.4s, v15.4s, v26.4s
- rshrn v4.4h, v20.4s, #13
- rshrn v6.4h, v15.4s, #13
- add v20.4s, v30.4s, v24.4s
- sub v15.4s, v30.4s, v24.4s
- rshrn v5.4h, v20.4s, #13
- rshrn v7.4h, v15.4s, #13
- ins v4.d[1], v5.d[0]
- ins v6.d[1], v7.d[0]
- transpose v4, v6, v3, .16b, .8h
- transpose v6, v10, v3, .16b, .4s
- ins v11.d[0], v10.d[1]
- ins v7.d[0], v6.d[1]
-#endif
-
- /* Pass 2 */
- idct_helper v4.4h, v6.4h, v10.4h, v7.4h, v11.4h, 20, v26.4h, v27.4h
-
- /* Range limit */
- movi v30.8h, #0x80
- ins v26.d[1], v27.d[0]
- add v26.8h, v26.8h, v30.8h
- sqxtun v30.8b, v26.8h
- ins v26.d[0], v30.d[0]
- sqxtun v27.8b, v26.8h
-
- /* Store results to the output buffer */
- ldp TMP1, TMP2, [OUTPUT_BUF]
- add TMP1, TMP1, OUTPUT_COL
- add TMP2, TMP2, OUTPUT_COL
-
- st1 {v26.b}[0], [TMP1], 1
- st1 {v27.b}[4], [TMP1], 1
- st1 {v26.b}[1], [TMP2], 1
- st1 {v27.b}[5], [TMP2], 1
-
- ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
- ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
- blr x30
-
- .unreq DCT_TABLE
- .unreq COEF_BLOCK
- .unreq OUTPUT_BUF
- .unreq OUTPUT_COL
- .unreq TMP1
- .unreq TMP2
-
-.purgem idct_helper
-
-
-/*****************************************************************************/
-
-/*
* jsimd_extrgb_ycc_convert_neon
* jsimd_extbgr_ycc_convert_neon
* jsimd_extrgbx_ycc_convert_neon
diff --git a/simd/arm/common/jidctred-neon.c b/simd/arm/common/jidctred-neon.c
new file mode 100644
index 0000000..1c1e24e
--- /dev/null
+++ b/simd/arm/common/jidctred-neon.c
@@ -0,0 +1,145 @@
+/*
+ * jidctred-neon.c - reduced-size IDCT (Arm NEON)
+ *
+ * Copyright 2019 The Chromium Authors. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../../jinclude.h"
+#include "../../../jpeglib.h"
+#include "../../../jsimd.h"
+#include "../../../jdct.h"
+#include "../../../jsimddct.h"
+#include "../../jsimd.h"
+
+#include <arm_neon.h>
+
+#define CONST_BITS 13
+#define PASS1_BITS 2
+
+#define F_0_720 5906
+#define F_0_850 6967
+#define F_1_272 10426
+#define F_3_624 29692
+
+/*
+ * 'jsimd_idct_2x2_neon' is an inverse-DCT function for getting reduced-size
+ * 2x2 pixels output from an 8x8 DCT block. It uses the same calculations and
+ * produces exactly the same output as IJG's original 'jpeg_idct_2x2' function
+ * from jpeg-6b, which can be found in jidctred.c.
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ * 0.720959822 = 5906 * 2^-13
+ * 0.850430095 = 6967 * 2^-13
+ * 1.272758580 = 10426 * 2^-13
+ * 3.624509785 = 29692 * 2^-13
+ *
+ * See jidctred.c for further details of the 2x2 reduced IDCT algorithm. Where
+ * possible, the variable names and comments here in 'jsimd_idct_2x2_neon'
+ * match up with those in 'jpeg_idct_2x2'.
+ *
+ * NOTE: jpeg-8 has an improved implementation of the 2x2 inverse-DCT which
+ * requires fewer arithmetic operations and hence should be faster. The
+ * primary purpose of this particular NEON optimized function is bit
+ * exact compatibility with jpeg-6b.
+ */
+
+void jsimd_idct_2x2_neon(void *dct_table,
+ JCOEFPTR coef_block,
+ JSAMPARRAY restrict output_buf,
+ JDIMENSION output_col)
+{
+ ISLOW_MULT_TYPE *quantptr = dct_table;
+
+ /* Load DCT coefficients. */
+ int16x8_t row0 = vld1q_s16(coef_block + 0 * DCTSIZE);
+ int16x8_t row1 = vld1q_s16(coef_block + 1 * DCTSIZE);
+ int16x8_t row3 = vld1q_s16(coef_block + 3 * DCTSIZE);
+ int16x8_t row5 = vld1q_s16(coef_block + 5 * DCTSIZE);
+ int16x8_t row7 = vld1q_s16(coef_block + 7 * DCTSIZE);
+
+ /* Load DCT quantization table. */
+ int16x8_t quant_row0 = vld1q_s16(quantptr + 0 * DCTSIZE);
+ int16x8_t quant_row1 = vld1q_s16(quantptr + 1 * DCTSIZE);
+ int16x8_t quant_row3 = vld1q_s16(quantptr + 3 * DCTSIZE);
+ int16x8_t quant_row5 = vld1q_s16(quantptr + 5 * DCTSIZE);
+ int16x8_t quant_row7 = vld1q_s16(quantptr + 7 * DCTSIZE);
+
+ /* Dequantize DCT coefficients. */
+ row0 = vmulq_s16(row0, quant_row0);
+ row1 = vmulq_s16(row1, quant_row1);
+ row3 = vmulq_s16(row3, quant_row3);
+ row5 = vmulq_s16(row5, quant_row5);
+ row7 = vmulq_s16(row7, quant_row7);
+
+ /* Pass 1: process input columns; put results in vectors row0 and row1. */
+ /* Even part. */
+ int32x4_t tmp10_l = vshll_n_s16(vget_low_s16(row0), CONST_BITS + 2);
+ int32x4_t tmp10_h = vshll_n_s16(vget_high_s16(row0), CONST_BITS + 2);
+
+ /* Odd part. */
+ int32x4_t tmp0_l = vmull_n_s16(vget_low_s16(row1), F_3_624);
+ tmp0_l = vmlal_n_s16(tmp0_l, vget_low_s16(row3), -F_1_272);
+ tmp0_l = vmlal_n_s16(tmp0_l, vget_low_s16(row5), F_0_850);
+ tmp0_l = vmlal_n_s16(tmp0_l, vget_low_s16(row7), -F_0_720);
+ int32x4_t tmp0_h = vmull_n_s16(vget_high_s16(row1), F_3_624);
+ tmp0_h = vmlal_n_s16(tmp0_h, vget_high_s16(row3), -F_1_272);
+ tmp0_h = vmlal_n_s16(tmp0_h, vget_high_s16(row5), F_0_850);
+ tmp0_h = vmlal_n_s16(tmp0_h, vget_high_s16(row7), -F_0_720);
+
+ /* Final output stage: descale and narrow to 16-bit. */
+ row0 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp10_l, tmp0_l), CONST_BITS),
+ vrshrn_n_s32(vaddq_s32(tmp10_h, tmp0_h), CONST_BITS));
+ row1 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp10_l, tmp0_l), CONST_BITS),
+ vrshrn_n_s32(vsubq_s32(tmp10_h, tmp0_h), CONST_BITS));
+
+ /* Transpose two rows ready for second pass. */
+ int16x8x2_t cols_0246_1357 = vtrnq_s16(row0, row1);
+ int16x8_t cols_0246 = cols_0246_1357.val[0];
+ int16x8_t cols_1357 = cols_0246_1357.val[1];
+ /* Duplicate columns such that each is accessible in its own vector. */
+ int32x4x2_t cols_1155_3377 = vtrnq_s32(vreinterpretq_s32_s16(cols_1357),
+ vreinterpretq_s32_s16(cols_1357));
+ int16x8_t cols_1155 = vreinterpretq_s16_s32(cols_1155_3377.val[0]);
+ int16x8_t cols_3377 = vreinterpretq_s16_s32(cols_1155_3377.val[1]);
+
+ /* Pass 2: process 2 rows, store to output array. */
+ /* Even part: only interested in col0; top half of tmp10 is "don't care". */
+ int32x4_t tmp10 = vshll_n_s16(vget_low_s16(cols_0246), CONST_BITS + 2);
+
+ /* Odd part. Only interested in bottom half of tmp0. */
+ int32x4_t tmp0 = vmull_n_s16(vget_low_s16(cols_1155), F_3_624);
+ tmp0 = vmlal_n_s16(tmp0, vget_low_s16(cols_3377), -F_1_272);
+ tmp0 = vmlal_n_s16(tmp0, vget_high_s16(cols_1155), F_0_850);
+ tmp0 = vmlal_n_s16(tmp0, vget_high_s16(cols_3377), -F_0_720);
+
+ /* Final output stage: descale and clamp to range [0-255]. */
+ int16x8_t output_s16 = vcombine_s16(vaddhn_s32(tmp10, tmp0),
+ vsubhn_s32(tmp10, tmp0));
+ output_s16 = vrsraq_n_s16(vdupq_n_s16(CENTERJSAMPLE), output_s16,
+ CONST_BITS + PASS1_BITS + 3 + 2 - 16);
+ /* Narrow to 8-bit and convert to unsigned. */
+ uint8x8_t output_u8 = vqmovun_s16(output_s16);
+
+ /* Store 2x2 block to memory. */
+ vst1_lane_u8(output_buf[0] + output_col, output_u8, 0);
+ vst1_lane_u8(output_buf[1] + output_col, output_u8, 1);
+ vst1_lane_u8(output_buf[0] + output_col + 1, output_u8, 4);
+ vst1_lane_u8(output_buf[1] + output_col + 1, output_u8, 5);
+}