author     Artem Serov <artem.serov@linaro.org>  2017-10-23 19:19:06 +0100
committer  Artem Serov <artem.serov@linaro.org>  2018-03-26 19:46:23 +0100
commit     121f2038e9c8afe12f8f4096b7c84a167e7adea5 (patch)
tree       655e2bba77ac34208c54b290286104b124003e59 /compiler/optimizing/loop_optimization.cc
parent     f9635aab3f2db9b1b13184e8146530a53246b82c (diff)
ART: Implement scalar loop unrolling.
Implement scalar loop unrolling for small loops (on arm64) with a
known trip count, to reduce the loop check and branch penalty and to
provide more opportunities for instruction scheduling.

Note: this functionality is turned off by default for now.

Test: cloner_test.cc
Test: test-art-target, test-art-host
Change-Id: Ic27fd8fb0bc0d7b69251252da37b8b510bc30acc
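For illustration, a minimal plain-C++ sketch (not ART's HIR; IncAll, a
and n are hypothetical) of what unrolling by a factor of 2 does to a
small counted loop with a known, even trip count:

    // Before: one loop check and backward branch per body iteration.
    void IncAll(int* a, int n) {
      for (int i = 0; i < n; ++i) {
        a[i] += 1;
      }
    }

    // After unrolling by 2: the body is cloned once and the clone's
    // redundant loop check is folded away (see the ReplaceInput call in
    // the diff below), so one check and branch covers two iterations and
    // the scheduler can interleave the two bodies.
    void IncAllUnrolled(int* a, int n) {
      for (int i = 0; i < n; i += 2) {
        a[i] += 1;
        a[i + 1] += 1;
      }
    }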
Diffstat (limited to 'compiler/optimizing/loop_optimization.cc')
-rw-r--r--  compiler/optimizing/loop_optimization.cc  |  127
1 file changed, 87 insertions(+), 40 deletions(-)
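One non-obvious step in TryUnrollingForBranchPenaltyReduction below is
how the cloned loop check becomes removable: the copy's header still
ends in an HIf, and its condition is pinned to a constant so that later
passes fold the branch. A toy sketch of just that idea, using hypothetical
stand-in types rather than ART's IR classes:

    // Hypothetical stand-ins for the IR pieces involved.
    struct Block;
    struct If {
      int condition;          // pinned to 0 or 1 after folding
      Block* true_successor;
      Block* false_successor;
    };
    struct Block {
      If* last_instruction;
      bool in_loop;           // models HLoopInformation::Contains()
    };

    // With an even, known trip count and an unroll factor of 2, the
    // mid-body check always takes the in-loop edge; make that explicit
    // and let dead-branch elimination delete the other edge.
    void FoldRedundantLoopCheck(Block* copy_header) {
      If* hif = copy_header->last_instruction;
      hif->condition = hif->true_successor->in_loop ? 1 : 0;
    }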
diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc
index e1fb7ac17e..69080340e4 100644
--- a/compiler/optimizing/loop_optimization.cc
+++ b/compiler/optimizing/loop_optimization.cc
@@ -33,8 +33,8 @@ namespace art {
 // Enables vectorization (SIMDization) in the loop optimizer.
 static constexpr bool kEnableVectorization = true;
 
-// No loop unrolling factor (just one copy of the loop-body).
-static constexpr uint32_t kNoUnrollingFactor = 1;
+// Enables scalar loop unrolling in the loop optimizer.
+static constexpr bool kEnableScalarUnrolling = false;
 
 //
 // Static helpers.
@@ -480,7 +480,11 @@ HLoopOptimization::HLoopOptimization(HGraph* graph,
       vector_preheader_(nullptr),
       vector_header_(nullptr),
       vector_body_(nullptr),
-      vector_index_(nullptr) {
+      vector_index_(nullptr),
+      arch_loop_helper_(ArchDefaultLoopHelper::Create(compiler_driver_ != nullptr
+                            ? compiler_driver_->GetInstructionSet()
+                            : InstructionSet::kNone,
+                        global_allocator_)) {
 }
 
 void HLoopOptimization::Run() {
@@ -691,7 +695,7 @@ void HLoopOptimization::SimplifyBlocks(LoopNode* node) {
   }
 }
 
-bool HLoopOptimization::OptimizeInnerLoop(LoopNode* node) {
+bool HLoopOptimization::TryOptimizeInnerLoopFinite(LoopNode* node) {
   HBasicBlock* header = node->loop_info->GetHeader();
   HBasicBlock* preheader = node->loop_info->GetPreHeader();
   // Ensure loop header logic is finite.
@@ -761,6 +765,83 @@ bool HLoopOptimization::OptimizeInnerLoop(LoopNode* node) {
   return false;
 }
 
+bool HLoopOptimization::OptimizeInnerLoop(LoopNode* node) {
+  return TryOptimizeInnerLoopFinite(node) ||
+         TryUnrollingForBranchPenaltyReduction(node);
+}
+
+void HLoopOptimization::PeelOrUnrollOnce(LoopNode* loop_node,
+                                         bool do_unrolling,
+                                         SuperblockCloner::HBasicBlockMap* bb_map,
+                                         SuperblockCloner::HInstructionMap* hir_map) {
+  // TODO: peel loop nests.
+  DCHECK(loop_node->inner == nullptr);
+
+  // Check that loop info is up-to-date.
+  HLoopInformation* loop_info = loop_node->loop_info;
+  HBasicBlock* header = loop_info->GetHeader();
+  DCHECK(loop_info == header->GetLoopInformation());
+
+  PeelUnrollHelper helper(loop_info, bb_map, hir_map);
+  DCHECK(helper.IsLoopClonable());
+  HBasicBlock* new_header = do_unrolling ? helper.DoUnrolling() : helper.DoPeeling();
+  DCHECK(header == new_header);
+  DCHECK(loop_info == new_header->GetLoopInformation());
+}
+
+//
+// Loop unrolling: generic part methods.
+//
+
+bool HLoopOptimization::TryUnrollingForBranchPenaltyReduction(LoopNode* loop_node) {
+  // Don't run peeling/unrolling if compiler_driver_ is nullptr (i.e., running under tests)
+  // as InstructionSet is needed.
+  if (!kEnableScalarUnrolling || compiler_driver_ == nullptr) {
+    return false;
+  }
+
+  HLoopInformation* loop_info = loop_node->loop_info;
+  int64_t trip_count = 0;
+  // Only unroll loops with a known trip count.
+  if (!induction_range_.HasKnownTripCount(loop_info, &trip_count)) {
+    return false;
+  }
+
+  uint32_t unrolling_factor = arch_loop_helper_->GetScalarUnrollingFactor(loop_info, trip_count);
+  if (unrolling_factor == kNoUnrollingFactor) {
+    return false;
+  }
+
+  LoopAnalysisInfo loop_analysis_info(loop_info);
+  LoopAnalysis::CalculateLoopBasicProperties(loop_info, &loop_analysis_info);
+
+  // Check "IsLoopClonable" last as it can be time-consuming.
+  if (arch_loop_helper_->IsLoopTooBigForScalarUnrolling(&loop_analysis_info) ||
+      (loop_analysis_info.GetNumberOfExits() > 1) ||
+      loop_analysis_info.HasInstructionsPreventingScalarUnrolling() ||
+      !PeelUnrollHelper::IsLoopClonable(loop_info)) {
+    return false;
+  }
+
+  // TODO: support other unrolling factors.
+  DCHECK_EQ(unrolling_factor, 2u);
+
+  // Perform unrolling.
+  ArenaAllocator* arena = loop_info->GetHeader()->GetGraph()->GetAllocator();
+  SuperblockCloner::HBasicBlockMap bb_map(
+      std::less<HBasicBlock*>(), arena->Adapter(kArenaAllocSuperblockCloner));
+  SuperblockCloner::HInstructionMap hir_map(
+      std::less<HInstruction*>(), arena->Adapter(kArenaAllocSuperblockCloner));
+  PeelOrUnrollOnce(loop_node, /* unrolling */ true, &bb_map, &hir_map);
+
+  // Remove the redundant loop check after unrolling.
+  HIf* copy_hif = bb_map.Get(loop_info->GetHeader())->GetLastInstruction()->AsIf();
+  int32_t constant = loop_info->Contains(*copy_hif->IfTrueSuccessor()) ? 1 : 0;
+  copy_hif->ReplaceInput(graph_->GetIntConstant(constant), 0u);
+
+  return true;
+}
+
 //
 // Loop vectorization. The implementation is based on the book by Aart J.C. Bik:
 // "The Software Vectorization Handbook. Applying Multimedia Extensions for Maximum Performance."
@@ -891,7 +972,8 @@ void HLoopOptimization::Vectorize(LoopNode* node,
   HBasicBlock* preheader = node->loop_info->GetPreHeader();
 
   // Pick a loop unrolling factor for the vector loop.
-  uint32_t unroll = GetUnrollingFactor(block, trip_count);
+  uint32_t unroll = arch_loop_helper_->GetSIMDUnrollingFactor(
+      block, trip_count, MaxNumberPeeled(), vector_length_);
   uint32_t chunk = vector_length_ * unroll;
 
 DCHECK(trip_count == 0 || (trip_count >= MaxNumberPeeled() + chunk));
@@ -2174,41 +2256,6 @@ bool HLoopOptimization::IsVectorizationProfitable(int64_t trip_count) {
   return true;
 }
 
-static constexpr uint32_t ARM64_SIMD_MAXIMUM_UNROLL_FACTOR = 8;
-static constexpr uint32_t ARM64_SIMD_HEURISTIC_MAX_BODY_SIZE = 50;
-
-uint32_t HLoopOptimization::GetUnrollingFactor(HBasicBlock* block, int64_t trip_count) {
-  uint32_t max_peel = MaxNumberPeeled();
-  switch (compiler_driver_->GetInstructionSet()) {
-    case InstructionSet::kArm64: {
-      // Don't unroll with insufficient iterations.
-      // TODO: Unroll loops with unknown trip count.
-      DCHECK_NE(vector_length_, 0u);
-      if (trip_count < (2 * vector_length_ + max_peel)) {
-        return kNoUnrollingFactor;
-      }
-      // Don't unroll for large loop body size.
-      uint32_t instruction_count = block->GetInstructions().CountSize();
-      if (instruction_count >= ARM64_SIMD_HEURISTIC_MAX_BODY_SIZE) {
-        return kNoUnrollingFactor;
-      }
-      // Find a beneficial unroll factor with the following restrictions:
-      //  - At least one iteration of the transformed loop should be executed.
-      //  - The loop body shouldn't be "too big" (heuristic).
-      uint32_t uf1 = ARM64_SIMD_HEURISTIC_MAX_BODY_SIZE / instruction_count;
-      uint32_t uf2 = (trip_count - max_peel) / vector_length_;
-      uint32_t unroll_factor =
-          TruncToPowerOfTwo(std::min({uf1, uf2, ARM64_SIMD_MAXIMUM_UNROLL_FACTOR}));
-      DCHECK_GE(unroll_factor, 1u);
-      return unroll_factor;
-    }
-    case InstructionSet::kX86:
-    case InstructionSet::kX86_64:
-    default:
-      return kNoUnrollingFactor;
-  }
-}
-
 //
 // Helpers.
 //
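The arm64 heuristic deleted above is not gone: per the Vectorize hunk
earlier, it now sits behind arch_loop_helper_->GetSIMDUnrollingFactor.
A self-contained sketch of the same computation, with the constants
copied from the removed code and a simple local stand-in for ART's
TruncToPowerOfTwo (the free-function form and its name are mine):

    #include <algorithm>
    #include <cstdint>

    constexpr uint32_t kNoUnrollingFactor = 1;
    constexpr uint32_t kMaxUnrollFactor = 8;  // ARM64_SIMD_MAXIMUM_UNROLL_FACTOR
    constexpr uint32_t kMaxBodySize = 50;     // ARM64_SIMD_HEURISTIC_MAX_BODY_SIZE

    // Round down to a power of two.
    uint32_t TruncToPowerOfTwo(uint32_t x) {
      uint32_t p = 1;
      while (p * 2 <= x) p *= 2;
      return p;
    }

    uint32_t SimdUnrollingFactor(uint32_t instruction_count,
                                 int64_t trip_count,
                                 uint32_t vector_length,
                                 uint32_t max_peel) {
      // Don't unroll with insufficient iterations or a large loop body.
      if (trip_count < 2 * vector_length + max_peel) return kNoUnrollingFactor;
      if (instruction_count >= kMaxBodySize) return kNoUnrollingFactor;
      // At least one iteration of the transformed loop must execute, and
      // the unrolled body must stay under the size heuristic.
      uint32_t uf1 = kMaxBodySize / instruction_count;
      uint32_t uf2 = static_cast<uint32_t>((trip_count - max_peel) / vector_length);
      return TruncToPowerOfTwo(std::min({uf1, uf2, kMaxUnrollFactor}));
    }

For example, a 10-instruction body with trip_count = 100, vector_length = 4
and max_peel = 0 gives uf1 = 5 and uf2 = 25, so min(5, 25, 8) = 5 truncates
to an unrolling factor of 4.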