diff options
author | Aart Bik <ajcbik@google.com> | 2018-03-26 23:31:19 +0000 |
---|---|---|
committer | Gerrit Code Review <noreply-gerritcodereview@google.com> | 2018-03-26 23:31:19 +0000 |
commit | 2a93c7644d4300a3b0fc532c72bc9aa935dd3551 (patch) | |
tree | f6752bf7b43996bcd7f04671d849f681c5fb9eb5 /compiler/optimizing/loop_optimization.cc | |
parent | f069d359c31f91abd7a74cce6627decd54fad4d1 (diff) | |
parent | 121f2038e9c8afe12f8f4096b7c84a167e7adea5 (diff) |
Merge "ART: Implement scalar loop unrolling."
Diffstat (limited to 'compiler/optimizing/loop_optimization.cc')
-rw-r--r-- | compiler/optimizing/loop_optimization.cc | 127 |
1 file changed, 87 insertions, 40 deletions
diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc index 758aca2d0c..1d83815e1f 100644 --- a/compiler/optimizing/loop_optimization.cc +++ b/compiler/optimizing/loop_optimization.cc @@ -33,8 +33,8 @@ namespace art { // Enables vectorization (SIMDization) in the loop optimizer. static constexpr bool kEnableVectorization = true; -// No loop unrolling factor (just one copy of the loop-body). -static constexpr uint32_t kNoUnrollingFactor = 1; +// Enables scalar loop unrolling in the loop optimizer. +static constexpr bool kEnableScalarUnrolling = false; // // Static helpers. @@ -532,7 +532,11 @@ HLoopOptimization::HLoopOptimization(HGraph* graph, vector_preheader_(nullptr), vector_header_(nullptr), vector_body_(nullptr), - vector_index_(nullptr) { + vector_index_(nullptr), + arch_loop_helper_(ArchDefaultLoopHelper::Create(compiler_driver_ != nullptr + ? compiler_driver_->GetInstructionSet() + : InstructionSet::kNone, + global_allocator_)) { } void HLoopOptimization::Run() { @@ -743,7 +747,7 @@ void HLoopOptimization::SimplifyBlocks(LoopNode* node) { } } -bool HLoopOptimization::OptimizeInnerLoop(LoopNode* node) { +bool HLoopOptimization::TryOptimizeInnerLoopFinite(LoopNode* node) { HBasicBlock* header = node->loop_info->GetHeader(); HBasicBlock* preheader = node->loop_info->GetPreHeader(); // Ensure loop header logic is finite. @@ -813,6 +817,83 @@ bool HLoopOptimization::OptimizeInnerLoop(LoopNode* node) { return false; } +bool HLoopOptimization::OptimizeInnerLoop(LoopNode* node) { + return TryOptimizeInnerLoopFinite(node) || + TryUnrollingForBranchPenaltyReduction(node); +} + +void HLoopOptimization::PeelOrUnrollOnce(LoopNode* loop_node, + bool do_unrolling, + SuperblockCloner::HBasicBlockMap* bb_map, + SuperblockCloner::HInstructionMap* hir_map) { + // TODO: peel loop nests. + DCHECK(loop_node->inner == nullptr); + + // Check that loop info is up-to-date. 
+ HLoopInformation* loop_info = loop_node->loop_info; + HBasicBlock* header = loop_info->GetHeader(); + DCHECK(loop_info == header->GetLoopInformation()); + + PeelUnrollHelper helper(loop_info, bb_map, hir_map); + DCHECK(helper.IsLoopClonable()); + HBasicBlock* new_header = do_unrolling ? helper.DoUnrolling() : helper.DoPeeling(); + DCHECK(header == new_header); + DCHECK(loop_info == new_header->GetLoopInformation()); +} + +// +// Loop unrolling: generic part methods. +// + +bool HLoopOptimization::TryUnrollingForBranchPenaltyReduction(LoopNode* loop_node) { + // Don't run peeling/unrolling if compiler_driver_ is nullptr (i.e., running under tests) + // as InstructionSet is needed. + if (!kEnableScalarUnrolling || compiler_driver_ == nullptr) { + return false; + } + + HLoopInformation* loop_info = loop_node->loop_info; + int64_t trip_count = 0; + // Only unroll loops with a known tripcount. + if (!induction_range_.HasKnownTripCount(loop_info, &trip_count)) { + return false; + } + + uint32_t unrolling_factor = arch_loop_helper_->GetScalarUnrollingFactor(loop_info, trip_count); + if (unrolling_factor == kNoUnrollingFactor) { + return false; + } + + LoopAnalysisInfo loop_analysis_info(loop_info); + LoopAnalysis::CalculateLoopBasicProperties(loop_info, &loop_analysis_info); + + // Check "IsLoopClonable" last as it can be time-consuming. + if (arch_loop_helper_->IsLoopTooBigForScalarUnrolling(&loop_analysis_info) || + (loop_analysis_info.GetNumberOfExits() > 1) || + loop_analysis_info.HasInstructionsPreventingScalarUnrolling() || + !PeelUnrollHelper::IsLoopClonable(loop_info)) { + return false; + } + + // TODO: support other unrolling factors. + DCHECK_EQ(unrolling_factor, 2u); + + // Perform unrolling. 
+ ArenaAllocator* arena = loop_info->GetHeader()->GetGraph()->GetAllocator(); + SuperblockCloner::HBasicBlockMap bb_map( + std::less<HBasicBlock*>(), arena->Adapter(kArenaAllocSuperblockCloner)); + SuperblockCloner::HInstructionMap hir_map( + std::less<HInstruction*>(), arena->Adapter(kArenaAllocSuperblockCloner)); + PeelOrUnrollOnce(loop_node, /* unrolling */ true, &bb_map, &hir_map); + + // Remove the redundant loop check after unrolling. + HIf* copy_hif = bb_map.Get(loop_info->GetHeader())->GetLastInstruction()->AsIf(); + int32_t constant = loop_info->Contains(*copy_hif->IfTrueSuccessor()) ? 1 : 0; + copy_hif->ReplaceInput(graph_->GetIntConstant(constant), 0u); + + return true; +} + // // Loop vectorization. The implementation is based on the book by Aart J.C. Bik: // "The Software Vectorization Handbook. Applying Multimedia Extensions for Maximum Performance." @@ -943,7 +1024,8 @@ void HLoopOptimization::Vectorize(LoopNode* node, HBasicBlock* preheader = node->loop_info->GetPreHeader(); // Pick a loop unrolling factor for the vector loop. - uint32_t unroll = GetUnrollingFactor(block, trip_count); + uint32_t unroll = arch_loop_helper_->GetSIMDUnrollingFactor( + block, trip_count, MaxNumberPeeled(), vector_length_); uint32_t chunk = vector_length_ * unroll; DCHECK(trip_count == 0 || (trip_count >= MaxNumberPeeled() + chunk)); @@ -2226,41 +2308,6 @@ bool HLoopOptimization::IsVectorizationProfitable(int64_t trip_count) { return true; } -static constexpr uint32_t ARM64_SIMD_MAXIMUM_UNROLL_FACTOR = 8; -static constexpr uint32_t ARM64_SIMD_HEURISTIC_MAX_BODY_SIZE = 50; - -uint32_t HLoopOptimization::GetUnrollingFactor(HBasicBlock* block, int64_t trip_count) { - uint32_t max_peel = MaxNumberPeeled(); - switch (compiler_driver_->GetInstructionSet()) { - case InstructionSet::kArm64: { - // Don't unroll with insufficient iterations. - // TODO: Unroll loops with unknown trip count. 
- DCHECK_NE(vector_length_, 0u); - if (trip_count < (2 * vector_length_ + max_peel)) { - return kNoUnrollingFactor; - } - // Don't unroll for large loop body size. - uint32_t instruction_count = block->GetInstructions().CountSize(); - if (instruction_count >= ARM64_SIMD_HEURISTIC_MAX_BODY_SIZE) { - return kNoUnrollingFactor; - } - // Find a beneficial unroll factor with the following restrictions: - // - At least one iteration of the transformed loop should be executed. - // - The loop body shouldn't be "too big" (heuristic). - uint32_t uf1 = ARM64_SIMD_HEURISTIC_MAX_BODY_SIZE / instruction_count; - uint32_t uf2 = (trip_count - max_peel) / vector_length_; - uint32_t unroll_factor = - TruncToPowerOfTwo(std::min({uf1, uf2, ARM64_SIMD_MAXIMUM_UNROLL_FACTOR})); - DCHECK_GE(unroll_factor, 1u); - return unroll_factor; - } - case InstructionSet::kX86: - case InstructionSet::kX86_64: - default: - return kNoUnrollingFactor; - } -} - // // Helpers. // |