Precompute DCT block output pointers in IDCT functions

Compute the output pointer values used for storing DCT blocks ahead of time, so that the compiler knows that the storage locations for each row do not overlap and are not changed by storing each successive row of coefficients. The base pointer can then be loaded once, rather than being reloaded after storing each row of the DCT block. This breaks a memory dependency chain and increases overall decode performance.

Bug: 922430
Change-Id: I93030fd48f048a28e1eb57d234054f3f85b1bf88
author: Jonathan Wright <jonathan.wright@arm.com> 2019-12-10 11:20:15 +0000
committer: Jonathan Wright <jonathan.wright@arm.com> 2019-12-11 11:22:40 +0000
commit: 86afb9fe2842c76f39d266bded98c61956d2226a (patch)
tree: 2f4f75bcfe9ee133e532594e3cd4bef1dbbb0145
parent: bc13578529255ec75005ffc98aae151666122892 (diff)
3 files changed, 27 insertions, 20 deletions
diff --git a/README.chromium b/README.chromium
index cef4f03..5761c9c 100644
--- a/README.chromium
+++ b/README.chromium
@@ -67,6 +67,7 @@ following changes which are not merged to upstream:
   - Implement 2x2 IDCT using Arm NEON intrinsics
   - Implement 4x4 IDCT using Arm NEON intrinsics
   - Implement slow IDCT using Arm NEON intrinsics
+  - Precompute DCT block output pointers in IDCT functions
 
 Refer to working-with-nested-repos [1] for details of how to setup your git
 svn client to update the code (for making local changes, cherry picking from
diff --git a/simd/arm/common/jidctint-neon.c b/simd/arm/common/jidctint-neon.c
index 7fb683b..11076a0 100644
--- a/simd/arm/common/jidctint-neon.c
+++ b/simd/arm/common/jidctint-neon.c
@@ -644,15 +644,16 @@ static inline void jsimd_idct_islow_pass2_regular(int16_t *workspace,
                                     vreinterpret_u16_u8(cols_45_67.val[0]),
                                     vreinterpret_u16_u8(cols_45_67.val[1])
                                   };
+
+  JSAMPROW outptr0 = output_buf[buf_offset + 0] + output_col;
+  JSAMPROW outptr1 = output_buf[buf_offset + 1] + output_col;
+  JSAMPROW outptr2 = output_buf[buf_offset + 2] + output_col;
+  JSAMPROW outptr3 = output_buf[buf_offset + 3] + output_col;
   /* VST4 of 16-bit elements completes the transpose. */
-  vst4_lane_u16((uint16_t *)(output_buf[buf_offset + 0] + output_col),
-                cols_01_23_45_67, 0);
-  vst4_lane_u16((uint16_t *)(output_buf[buf_offset + 1] + output_col),
-                cols_01_23_45_67, 1);
-  vst4_lane_u16((uint16_t *)(output_buf[buf_offset + 2] + output_col),
-                cols_01_23_45_67, 2);
-  vst4_lane_u16((uint16_t *)(output_buf[buf_offset + 3] + output_col),
-                cols_01_23_45_67, 3);
+  vst4_lane_u16((uint16_t *)outptr0, cols_01_23_45_67, 0);
+  vst4_lane_u16((uint16_t *)outptr1, cols_01_23_45_67, 1);
+  vst4_lane_u16((uint16_t *)outptr2, cols_01_23_45_67, 2);
+  vst4_lane_u16((uint16_t *)outptr3, cols_01_23_45_67, 3);
 }
 
 
@@ -735,13 +736,14 @@ static inline void jsimd_idct_islow_pass2_sparse(int16_t *workspace,
                                     vreinterpret_u16_u8(cols_45_67.val[0]),
                                     vreinterpret_u16_u8(cols_45_67.val[1])
                                   };
+
+  JSAMPROW outptr0 = output_buf[buf_offset + 0] + output_col;
+  JSAMPROW outptr1 = output_buf[buf_offset + 1] + output_col;
+  JSAMPROW outptr2 = output_buf[buf_offset + 2] + output_col;
+  JSAMPROW outptr3 = output_buf[buf_offset + 3] + output_col;
   /* VST4 of 16-bit elements completes the transpose. */
-  vst4_lane_u16((uint16_t *)(output_buf[buf_offset + 0] + output_col),
-                cols_01_23_45_67, 0);
-  vst4_lane_u16((uint16_t *)(output_buf[buf_offset + 1] + output_col),
-                cols_01_23_45_67, 1);
-  vst4_lane_u16((uint16_t *)(output_buf[buf_offset + 2] + output_col),
-                cols_01_23_45_67, 2);
-  vst4_lane_u16((uint16_t *)(output_buf[buf_offset + 3] + output_col),
-                cols_01_23_45_67, 3);
+  vst4_lane_u16((uint16_t *)outptr0, cols_01_23_45_67, 0);
+  vst4_lane_u16((uint16_t *)outptr1, cols_01_23_45_67, 1);
+  vst4_lane_u16((uint16_t *)outptr2, cols_01_23_45_67, 2);
+  vst4_lane_u16((uint16_t *)outptr3, cols_01_23_45_67, 3);
 }
diff --git a/simd/arm/common/jidctred-neon.c b/simd/arm/common/jidctred-neon.c
index aa10799..7e95bf3 100644
--- a/simd/arm/common/jidctred-neon.c
+++ b/simd/arm/common/jidctred-neon.c
@@ -455,8 +455,12 @@ void jsimd_idct_4x4_neon(void *dct_table,
   uint16x4x2_t output_01_23 = { output_0123.val[0], output_0123.val[1] };
 
   /* Store 4x4 block to memory. */
-  vst2_lane_u16((uint16_t *)(output_buf[0] + output_col), output_01_23, 0);
-  vst2_lane_u16((uint16_t *)(output_buf[1] + output_col), output_01_23, 1);
-  vst2_lane_u16((uint16_t *)(output_buf[2] + output_col), output_01_23, 2);
-  vst2_lane_u16((uint16_t *)(output_buf[3] + output_col), output_01_23, 3);
+  JSAMPROW outptr0 = output_buf[0] + output_col;
+  JSAMPROW outptr1 = output_buf[1] + output_col;
+  JSAMPROW outptr2 = output_buf[2] + output_col;
+  JSAMPROW outptr3 = output_buf[3] + output_col;
+  vst2_lane_u16((uint16_t *)outptr0, output_01_23, 0);
+  vst2_lane_u16((uint16_t *)outptr1, output_01_23, 1);
+  vst2_lane_u16((uint16_t *)outptr2, output_01_23, 2);
+  vst2_lane_u16((uint16_t *)outptr3, output_01_23, 3);
 }
author	Jonathan Wright <jonathan.wright@arm.com>	2019-12-10 11:20:15 +0000
committer	Jonathan Wright <jonathan.wright@arm.com>	2019-12-11 11:22:40 +0000
commit	86afb9fe2842c76f39d266bded98c61956d2226a (patch)
tree	2f4f75bcfe9ee133e532594e3cd4bef1dbbb0145
parent	bc13578529255ec75005ffc98aae151666122892 (diff)