summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Jonathan Wright <jonathan.wright@arm.com> 2019-12-10 11:20:15 +0000
committer: Jonathan Wright <jonathan.wright@arm.com> 2019-12-11 11:22:40 +0000
commit: 86afb9fe2842c76f39d266bded98c61956d2226a (patch)
tree: 2f4f75bcfe9ee133e532594e3cd4bef1dbbb0145
parent: bc13578529255ec75005ffc98aae151666122892 (diff)
Precompute DCT block output pointers in IDCT functions
Computes the output pointer values for storing DCT blocks ahead of time, so the compiler knows that the storage locations for each row do not overlap and are not changed by storing each successive row of coefficients. The base pointer can then be loaded once, rather than being reloaded after storing each row of the DCT block. This breaks a memory dependency chain and increases overall decode performance.

Bug: 922430
Change-Id: I93030fd48f048a28e1eb57d234054f3f85b1bf88
-rw-r--r-- README.chromium 1
-rw-r--r-- simd/arm/common/jidctint-neon.c 34
-rw-r--r-- simd/arm/common/jidctred-neon.c 12
3 files changed, 27 insertions, 20 deletions
diff --git a/README.chromium b/README.chromium
index cef4f03..5761c9c 100644
--- a/README.chromium
+++ b/README.chromium
@@ -67,6 +67,7 @@ following changes which are not merged to upstream:
- Implement 2x2 IDCT using Arm NEON intrinsics
- Implement 4x4 IDCT using Arm NEON intrinsics
- Implement slow IDCT using Arm NEON intrinsics
+ - Precompute DCT block output pointers in IDCT functions
Refer to working-with-nested-repos [1] for details of how to setup your git
svn client to update the code (for making local changes, cherry picking from
diff --git a/simd/arm/common/jidctint-neon.c b/simd/arm/common/jidctint-neon.c
index 7fb683b..11076a0 100644
--- a/simd/arm/common/jidctint-neon.c
+++ b/simd/arm/common/jidctint-neon.c
@@ -644,15 +644,16 @@ static inline void jsimd_idct_islow_pass2_regular(int16_t *workspace,
vreinterpret_u16_u8(cols_45_67.val[0]),
vreinterpret_u16_u8(cols_45_67.val[1])
};
+
+ JSAMPROW outptr0 = output_buf[buf_offset + 0] + output_col;
+ JSAMPROW outptr1 = output_buf[buf_offset + 1] + output_col;
+ JSAMPROW outptr2 = output_buf[buf_offset + 2] + output_col;
+ JSAMPROW outptr3 = output_buf[buf_offset + 3] + output_col;
/* VST4 of 16-bit elements completes the transpose. */
- vst4_lane_u16((uint16_t *)(output_buf[buf_offset + 0] + output_col),
- cols_01_23_45_67, 0);
- vst4_lane_u16((uint16_t *)(output_buf[buf_offset + 1] + output_col),
- cols_01_23_45_67, 1);
- vst4_lane_u16((uint16_t *)(output_buf[buf_offset + 2] + output_col),
- cols_01_23_45_67, 2);
- vst4_lane_u16((uint16_t *)(output_buf[buf_offset + 3] + output_col),
- cols_01_23_45_67, 3);
+ vst4_lane_u16((uint16_t *)outptr0, cols_01_23_45_67, 0);
+ vst4_lane_u16((uint16_t *)outptr1, cols_01_23_45_67, 1);
+ vst4_lane_u16((uint16_t *)outptr2, cols_01_23_45_67, 2);
+ vst4_lane_u16((uint16_t *)outptr3, cols_01_23_45_67, 3);
}
@@ -735,13 +736,14 @@ static inline void jsimd_idct_islow_pass2_sparse(int16_t *workspace,
vreinterpret_u16_u8(cols_45_67.val[0]),
vreinterpret_u16_u8(cols_45_67.val[1])
};
+
+ JSAMPROW outptr0 = output_buf[buf_offset + 0] + output_col;
+ JSAMPROW outptr1 = output_buf[buf_offset + 1] + output_col;
+ JSAMPROW outptr2 = output_buf[buf_offset + 2] + output_col;
+ JSAMPROW outptr3 = output_buf[buf_offset + 3] + output_col;
/* VST4 of 16-bit elements completes the transpose. */
- vst4_lane_u16((uint16_t *)(output_buf[buf_offset + 0] + output_col),
- cols_01_23_45_67, 0);
- vst4_lane_u16((uint16_t *)(output_buf[buf_offset + 1] + output_col),
- cols_01_23_45_67, 1);
- vst4_lane_u16((uint16_t *)(output_buf[buf_offset + 2] + output_col),
- cols_01_23_45_67, 2);
- vst4_lane_u16((uint16_t *)(output_buf[buf_offset + 3] + output_col),
- cols_01_23_45_67, 3);
+ vst4_lane_u16((uint16_t *)outptr0, cols_01_23_45_67, 0);
+ vst4_lane_u16((uint16_t *)outptr1, cols_01_23_45_67, 1);
+ vst4_lane_u16((uint16_t *)outptr2, cols_01_23_45_67, 2);
+ vst4_lane_u16((uint16_t *)outptr3, cols_01_23_45_67, 3);
}
diff --git a/simd/arm/common/jidctred-neon.c b/simd/arm/common/jidctred-neon.c
index aa10799..7e95bf3 100644
--- a/simd/arm/common/jidctred-neon.c
+++ b/simd/arm/common/jidctred-neon.c
@@ -455,8 +455,12 @@ void jsimd_idct_4x4_neon(void *dct_table,
uint16x4x2_t output_01_23 = { output_0123.val[0], output_0123.val[1] };
/* Store 4x4 block to memory. */
- vst2_lane_u16((uint16_t *)(output_buf[0] + output_col), output_01_23, 0);
- vst2_lane_u16((uint16_t *)(output_buf[1] + output_col), output_01_23, 1);
- vst2_lane_u16((uint16_t *)(output_buf[2] + output_col), output_01_23, 2);
- vst2_lane_u16((uint16_t *)(output_buf[3] + output_col), output_01_23, 3);
+ JSAMPROW outptr0 = output_buf[0] + output_col;
+ JSAMPROW outptr1 = output_buf[1] + output_col;
+ JSAMPROW outptr2 = output_buf[2] + output_col;
+ JSAMPROW outptr3 = output_buf[3] + output_col;
+ vst2_lane_u16((uint16_t *)outptr0, output_01_23, 0);
+ vst2_lane_u16((uint16_t *)outptr1, output_01_23, 1);
+ vst2_lane_u16((uint16_t *)outptr2, output_01_23, 2);
+ vst2_lane_u16((uint16_t *)outptr3, output_01_23, 3);
}