diff options
| | | |
|---|---|---|
| author | Jonathan Wright <jonathan.wright@arm.com> | 2019-12-10 11:20:15 +0000 |
| committer | Jonathan Wright <jonathan.wright@arm.com> | 2019-12-11 11:22:40 +0000 |
| commit | 86afb9fe2842c76f39d266bded98c61956d2226a (patch) | |
| tree | 2f4f75bcfe9ee133e532594e3cd4bef1dbbb0145 | |
| parent | bc13578529255ec75005ffc98aae151666122892 (diff) | |

Precompute DCT block output pointers in IDCT functions

Computes the output pointer values for storing DCT blocks ahead of
time so the compiler knows that the storage locations for each row do
not overlap and are not changed by storing each successive row of
coefficients.

The base pointer can be loaded once, rather than being reloaded after
storing each row of the DCT block - breaking a memory dependency
chain and increasing overall decode performance.

Bug: 922430
Change-Id: I93030fd48f048a28e1eb57d234054f3f85b1bf88

-rw-r--r-- | README.chromium | 1 | ||||
-rw-r--r-- | simd/arm/common/jidctint-neon.c | 34 | ||||
-rw-r--r-- | simd/arm/common/jidctred-neon.c | 12 |
3 files changed, 27 insertions, 20 deletions
```diff
diff --git a/README.chromium b/README.chromium
index cef4f03..5761c9c 100644
--- a/README.chromium
+++ b/README.chromium
@@ -67,6 +67,7 @@ following changes which are not merged to upstream:
 - Implement 2x2 IDCT using Arm NEON intrinsics
 - Implement 4x4 IDCT using Arm NEON intrinsics
 - Implement slow IDCT using Arm NEON intrinsics
+- Precompute DCT block output pointers in IDCT functions

 Refer to working-with-nested-repos [1] for details of how to setup your git svn
 client to update the code (for making local changes, cherry picking from
diff --git a/simd/arm/common/jidctint-neon.c b/simd/arm/common/jidctint-neon.c
index 7fb683b..11076a0 100644
--- a/simd/arm/common/jidctint-neon.c
+++ b/simd/arm/common/jidctint-neon.c
@@ -644,15 +644,16 @@ static inline void jsimd_idct_islow_pass2_regular(int16_t *workspace,
     vreinterpret_u16_u8(cols_45_67.val[0]),
     vreinterpret_u16_u8(cols_45_67.val[1])
   };
+
+  JSAMPROW outptr0 = output_buf[buf_offset + 0] + output_col;
+  JSAMPROW outptr1 = output_buf[buf_offset + 1] + output_col;
+  JSAMPROW outptr2 = output_buf[buf_offset + 2] + output_col;
+  JSAMPROW outptr3 = output_buf[buf_offset + 3] + output_col;
   /* VST4 of 16-bit elements completes the transpose. */
-  vst4_lane_u16((uint16_t *)(output_buf[buf_offset + 0] + output_col),
-                cols_01_23_45_67, 0);
-  vst4_lane_u16((uint16_t *)(output_buf[buf_offset + 1] + output_col),
-                cols_01_23_45_67, 1);
-  vst4_lane_u16((uint16_t *)(output_buf[buf_offset + 2] + output_col),
-                cols_01_23_45_67, 2);
-  vst4_lane_u16((uint16_t *)(output_buf[buf_offset + 3] + output_col),
-                cols_01_23_45_67, 3);
+  vst4_lane_u16((uint16_t *)outptr0, cols_01_23_45_67, 0);
+  vst4_lane_u16((uint16_t *)outptr1, cols_01_23_45_67, 1);
+  vst4_lane_u16((uint16_t *)outptr2, cols_01_23_45_67, 2);
+  vst4_lane_u16((uint16_t *)outptr3, cols_01_23_45_67, 3);
 }


@@ -735,13 +736,14 @@ static inline void jsimd_idct_islow_pass2_sparse(int16_t *workspace,
     vreinterpret_u16_u8(cols_45_67.val[0]),
     vreinterpret_u16_u8(cols_45_67.val[1])
   };
+
+  JSAMPROW outptr0 = output_buf[buf_offset + 0] + output_col;
+  JSAMPROW outptr1 = output_buf[buf_offset + 1] + output_col;
+  JSAMPROW outptr2 = output_buf[buf_offset + 2] + output_col;
+  JSAMPROW outptr3 = output_buf[buf_offset + 3] + output_col;
   /* VST4 of 16-bit elements completes the transpose. */
-  vst4_lane_u16((uint16_t *)(output_buf[buf_offset + 0] + output_col),
-                cols_01_23_45_67, 0);
-  vst4_lane_u16((uint16_t *)(output_buf[buf_offset + 1] + output_col),
-                cols_01_23_45_67, 1);
-  vst4_lane_u16((uint16_t *)(output_buf[buf_offset + 2] + output_col),
-                cols_01_23_45_67, 2);
-  vst4_lane_u16((uint16_t *)(output_buf[buf_offset + 3] + output_col),
-                cols_01_23_45_67, 3);
+  vst4_lane_u16((uint16_t *)outptr0, cols_01_23_45_67, 0);
+  vst4_lane_u16((uint16_t *)outptr1, cols_01_23_45_67, 1);
+  vst4_lane_u16((uint16_t *)outptr2, cols_01_23_45_67, 2);
+  vst4_lane_u16((uint16_t *)outptr3, cols_01_23_45_67, 3);
 }
diff --git a/simd/arm/common/jidctred-neon.c b/simd/arm/common/jidctred-neon.c
index aa10799..7e95bf3 100644
--- a/simd/arm/common/jidctred-neon.c
+++ b/simd/arm/common/jidctred-neon.c
@@ -455,8 +455,12 @@ void jsimd_idct_4x4_neon(void *dct_table,
   uint16x4x2_t output_01_23 = { output_0123.val[0], output_0123.val[1] };

   /* Store 4x4 block to memory. */
-  vst2_lane_u16((uint16_t *)(output_buf[0] + output_col), output_01_23, 0);
-  vst2_lane_u16((uint16_t *)(output_buf[1] + output_col), output_01_23, 1);
-  vst2_lane_u16((uint16_t *)(output_buf[2] + output_col), output_01_23, 2);
-  vst2_lane_u16((uint16_t *)(output_buf[3] + output_col), output_01_23, 3);
+  JSAMPROW outptr0 = output_buf[0] + output_col;
+  JSAMPROW outptr1 = output_buf[1] + output_col;
+  JSAMPROW outptr2 = output_buf[2] + output_col;
+  JSAMPROW outptr3 = output_buf[3] + output_col;
+  vst2_lane_u16((uint16_t *)outptr0, output_01_23, 0);
+  vst2_lane_u16((uint16_t *)outptr1, output_01_23, 1);
+  vst2_lane_u16((uint16_t *)outptr2, output_01_23, 2);
+  vst2_lane_u16((uint16_t *)outptr3, output_01_23, 3);
 }
```