diff options
author | Artem Serov <artem.serov@linaro.org> | 2017-09-01 10:59:03 +0100 |
---|---|---|
committer | Artem Serov <artem.serov@linaro.org> | 2017-09-07 21:29:41 +0100 |
commit | f26bb6c74a973fde3d2783ac35324d5ce8def814 (patch) | |
tree | 70149908a20503dfaf1276d04d561024f3441c6f /compiler/optimizing/loop_optimization.cc | |
parent | 66e3af9ce5b3aaa43e5ce3bce8233235af139072 (diff) |
ARM64: Tune SIMD loop unrolling factor heuristic.
Improve SIMD loop unrolling factor heuristic for ARM64 by
accounting for max desired loop size, trip_count, etc. The
following example shows 21% perf increase:
for (int i = 0; i < LENGTH; i++) {
  bc[i] = ba[i];  // Byte arrays
}
Test: test-art-host, test-art-target.
Change-Id: Ic587759c51aa4354df621ffb1c7ce4ebd798dfc1
Diffstat (limited to 'compiler/optimizing/loop_optimization.cc')
-rw-r--r-- | compiler/optimizing/loop_optimization.cc | 34 |
1 files changed, 23 insertions, 11 deletions
```diff
diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc
index a249cacc93..e150b65628 100644
--- a/compiler/optimizing/loop_optimization.cc
+++ b/compiler/optimizing/loop_optimization.cc
@@ -1761,21 +1761,33 @@ void HLoopOptimization::SetPeelingCandidate(const ArrayReference* candidate,
   vector_peeling_candidate_ = candidate;
 }
 
+static constexpr uint32_t ARM64_SIMD_MAXIMUM_UNROLL_FACTOR = 8;
+static constexpr uint32_t ARM64_SIMD_HEURISTIC_MAX_BODY_SIZE = 50;
+
 uint32_t HLoopOptimization::GetUnrollingFactor(HBasicBlock* block, int64_t trip_count) {
-  // Current heuristic: unroll by 2 on ARM64/X86 for large known trip
-  // counts and small loop bodies.
-  // TODO: refine with operation count, remaining iterations, etc.
-  //       Artem had some really cool ideas for this already.
   switch (compiler_driver_->GetInstructionSet()) {
-    case kArm64:
-    case kX86:
-    case kX86_64: {
-      size_t num_instructions = block->GetInstructions().CountSize();
-      if (num_instructions <= 10 && trip_count >= 4 * vector_length_) {
-        return 2;
+    case kArm64: {
+      DCHECK_NE(vector_length_, 0u);
+      // TODO: Unroll loops with unknown trip count.
+      if (trip_count < 2 * vector_length_) {
+        return 1;
       }
-      return 1;
+
+      uint32_t instruction_count = block->GetInstructions().CountSize();
+
+      // Find a beneficial unroll factor with the following restrictions:
+      //  - At least one iteration of the transformed loop should be executed.
+      //  - The loop body shouldn't be "too big" (heuristic).
+      uint32_t uf1 = ARM64_SIMD_HEURISTIC_MAX_BODY_SIZE / instruction_count;
+      uint32_t uf2 = trip_count / vector_length_;
+      uint32_t unroll_factor =
+          TruncToPowerOfTwo(std::min({uf1, uf2, ARM64_SIMD_MAXIMUM_UNROLL_FACTOR}));
+      DCHECK_GE(unroll_factor, 1u);
+
+      return unroll_factor;
     }
+    case kX86:
+    case kX86_64:
     default:
       return 1;
   }
```