path: root/compiler/optimizing/loop_optimization.cc
Diffstat (limited to 'compiler/optimizing/loop_optimization.cc')
-rw-r--r--  compiler/optimizing/loop_optimization.cc | 265 ++++++++++++-------
1 file changed, 192 insertions(+), 73 deletions(-)
diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc
index d2493137fe..d39bc16ed2 100644
--- a/compiler/optimizing/loop_optimization.cc
+++ b/compiler/optimizing/loop_optimization.cc
@@ -31,6 +31,9 @@ namespace art {
// Enables vectorization (SIMDization) in the loop optimizer.
static constexpr bool kEnableVectorization = true;
+// All current SIMD targets want 16-byte alignment.
+static constexpr size_t kAlignedBase = 16;
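+// (e.g. ARM NEON and x86 SSE both use 128-bit vector registers, so
+// 16-byte alignment keeps vector loads and stores fully aligned.)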
+
// Remove the instruction from the graph. A bit more elaborate than the usual
// instruction removal, since there may be a cycle in the use structure.
static void RemoveFromCycle(HInstruction* instruction) {
@@ -283,6 +286,9 @@ HLoopOptimization::HLoopOptimization(HGraph* graph,
simplified_(false),
vector_length_(0),
vector_refs_(nullptr),
+ vector_peeling_candidate_(nullptr),
+ vector_runtime_test_a_(nullptr),
+ vector_runtime_test_b_(nullptr),
vector_map_(nullptr) {
}
@@ -422,23 +428,6 @@ void HLoopOptimization::TraverseLoopsInnerToOuter(LoopNode* node) {
// Optimization.
//
-bool HLoopOptimization::CanRemoveCycle() {
- for (HInstruction* i : *iset_) {
- // We can never remove instructions that have environment
- // uses when we compile 'debuggable'.
- if (i->HasEnvironmentUses() && graph_->IsDebuggable()) {
- return false;
- }
- // A deoptimization should never have an environment input removed.
- for (const HUseListNode<HEnvironment*>& use : i->GetEnvUses()) {
- if (use.GetUser()->GetHolder()->IsDeoptimize()) {
- return false;
- }
- }
- }
- return true;
-}
-
void HLoopOptimization::SimplifyInduction(LoopNode* node) {
HBasicBlock* header = node->loop_info->GetHeader();
HBasicBlock* preheader = node->loop_info->GetPreHeader();
@@ -565,7 +554,7 @@ void HLoopOptimization::OptimizeInnerLoop(LoopNode* node) {
if (kEnableVectorization) {
iset_->clear(); // prepare phi induction
if (TrySetSimpleLoopHeader(header) &&
- CanVectorize(node, body, trip_count) &&
+ ShouldVectorize(node, body, trip_count) &&
TryAssignLastValue(node->loop_info, phi, preheader, /*collect_loop_uses*/ true)) {
Vectorize(node, body, exit, trip_count);
graph_->SetHasSIMD(true); // flag SIMD usage
@@ -580,10 +569,11 @@ void HLoopOptimization::OptimizeInnerLoop(LoopNode* node) {
// Intel Press, June, 2004 (http://www.aartbik.com/).
//
-bool HLoopOptimization::CanVectorize(LoopNode* node, HBasicBlock* block, int64_t trip_count) {
+bool HLoopOptimization::ShouldVectorize(LoopNode* node, HBasicBlock* block, int64_t trip_count) {
// Reset vector bookkeeping.
vector_length_ = 0;
vector_refs_->clear();
+ vector_peeling_candidate_ = nullptr;
vector_runtime_test_a_ =
vector_runtime_test_b_ = nullptr;
@@ -600,12 +590,9 @@ bool HLoopOptimization::CanVectorize(LoopNode* node, HBasicBlock* block, int64_t
}
}
- // Heuristics. Does vectorization seem profitable?
- // TODO: refine
- if (vector_length_ == 0) {
- return false; // nothing found
- } else if (0 < trip_count && trip_count < vector_length_) {
- return false; // insufficient iterations
+ // Does vectorization seem profitable?
+ if (!IsVectorizationProfitable(trip_count)) {
+ return false;
}
// Data dependence analysis. Find each pair of references with same type, where
@@ -645,6 +632,9 @@ bool HLoopOptimization::CanVectorize(LoopNode* node, HBasicBlock* block, int64_t
}
}
+ // Consider dynamic loop peeling for alignment.
+ SetPeelingCandidate(trip_count);
+
// Success!
return true;
}
@@ -657,28 +647,52 @@ void HLoopOptimization::Vectorize(LoopNode* node,
HBasicBlock* header = node->loop_info->GetHeader();
HBasicBlock* preheader = node->loop_info->GetPreHeader();
- // A cleanup is needed for any unknown trip count or for a known trip count
- // with remainder iterations after vectorization.
- bool needs_cleanup = trip_count == 0 || (trip_count % vector_length_) != 0;
+ // Pick a loop unrolling factor for the vector loop.
+ uint32_t unroll = GetUnrollingFactor(block, trip_count);
+ uint32_t chunk = vector_length_ * unroll;
+
+ // A cleanup loop is needed, at least, for any unknown trip count or
+ // for a known trip count with remainder iterations after vectorization.
+ bool needs_cleanup = trip_count == 0 || (trip_count % chunk) != 0;
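+ // (e.g. with vector_length_ == 4 and unroll == 2, chunk == 8; a known
+ // trip count of 100 leaves 100 % 8 == 4 iterations for the cleanup loop.)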
// Adjust vector bookkeeping.
iset_->clear(); // prepare phi induction
bool is_simple_loop_header = TrySetSimpleLoopHeader(header); // fills iset_
DCHECK(is_simple_loop_header);
+ vector_header_ = header;
+ vector_body_ = block;
+
+ // Generate dynamic loop peeling trip count, if needed:
+ // ptc = <peeling-needed-for-candidate>
+ HInstruction* ptc = nullptr;
+ if (vector_peeling_candidate_ != nullptr) {
+ DCHECK_LT(vector_length_, trip_count) << "dynamic peeling currently requires known trip count";
+ //
+ // TODO: Implement this. Compute address of first access memory location and
+ // compute peeling factor to obtain kAlignedBase alignment.
+ //
+ needs_cleanup = true;
+ }
- // Generate preheader:
+ // Generate loop control:
// stc = <trip-count>;
- // vtc = stc - stc % VL;
+ // vtc = stc - (stc - ptc) % chunk;
+ // i = 0;
HInstruction* stc = induction_range_.GenerateTripCount(node->loop_info, graph_, preheader);
HInstruction* vtc = stc;
if (needs_cleanup) {
- DCHECK(IsPowerOfTwo(vector_length_));
+ DCHECK(IsPowerOfTwo(chunk));
+ HInstruction* diff = stc;
+ if (ptc != nullptr) {
+ diff = Insert(preheader, new (global_allocator_) HSub(induc_type, stc, ptc));
+ }
HInstruction* rem = Insert(
preheader, new (global_allocator_) HAnd(induc_type,
- stc,
- graph_->GetIntConstant(vector_length_ - 1)));
+ diff,
+ graph_->GetIntConstant(chunk - 1)));
vtc = Insert(preheader, new (global_allocator_) HSub(induc_type, stc, rem));
}
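+ // (e.g. stc == 100 with ptc == 3 and chunk == 8: diff == 97, and
+ // rem == 97 & 7 == 1, so vtc == 99 and the vector loop will cover
+ // [3, 99) in steps of 8.)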
+ vector_index_ = graph_->GetIntConstant(0);
// Generate runtime disambiguation test:
// vtc = a != b ? vtc : 0;
@@ -691,16 +705,31 @@ void HLoopOptimization::Vectorize(LoopNode* node,
needs_cleanup = true;
}
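+ // (e.g. for a loop writing b[i] from a[i], the vector loop only runs when
+ // the runtime test shows a and b are distinct; otherwise vtc == 0 and all
+ // iterations execute in the sequential cleanup loop.)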
- // Generate vector loop:
- // for (i = 0; i < vtc; i += VL)
+ // Generate dynamic peeling loop for alignment, if needed:
+ // for ( ; i < ptc; i += 1)
+ // <loop-body>
+ if (ptc != nullptr) {
+ vector_mode_ = kSequential;
+ GenerateNewLoop(node,
+ block,
+ graph_->TransformLoopForVectorization(vector_header_, vector_body_, exit),
+ vector_index_,
+ ptc,
+ graph_->GetIntConstant(1),
+ /*unroll*/ 1);
+ }
+
+ // Generate vector loop, possibly further unrolled:
+ // for ( ; i < vtc; i += chunk)
// <vectorized-loop-body>
vector_mode_ = kVector;
GenerateNewLoop(node,
block,
- graph_->TransformLoopForVectorization(header, block, exit),
- graph_->GetIntConstant(0),
+ graph_->TransformLoopForVectorization(vector_header_, vector_body_, exit),
+ vector_index_,
vtc,
- graph_->GetIntConstant(vector_length_));
+ graph_->GetIntConstant(vector_length_), // increment per unroll
+ unroll);
HLoopInformation* vloop = vector_header_->GetLoopInformation();
// Generate cleanup loop, if needed:
@@ -711,9 +740,10 @@ void HLoopOptimization::Vectorize(LoopNode* node,
GenerateNewLoop(node,
block,
graph_->TransformLoopForVectorization(vector_header_, vector_body_, exit),
- vector_phi_,
+ vector_index_,
stc,
- graph_->GetIntConstant(1));
+ graph_->GetIntConstant(1),
+ /*unroll*/ 1);
}
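+ // The original loop has now been rewritten as, schematically:
+ // for ( ; i < ptc; i += 1) <loop-body> // peeling for alignment
+ // for ( ; i < vtc; i += chunk) <vectorized-loop-body> // vector loop
+ // for ( ; i < stc; i += 1) <loop-body> // cleanup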
// Remove the original loop by disconnecting the body block
@@ -722,8 +752,9 @@ void HLoopOptimization::Vectorize(LoopNode* node,
while (!header->GetFirstInstruction()->IsGoto()) {
header->RemoveInstruction(header->GetFirstInstruction());
}
- // Update loop hierarchy: the old header now resides in the
- // same outer loop as the old preheader.
+ // Update loop hierarchy: the old header now resides in the same outer loop
+ // as the old preheader. Note that we don't bother putting sequential
+ // loops back in the hierarchy at this point.
header->SetLoopInformation(preheader->GetLoopInformation()); // outward
node->loop_info = vloop;
}
@@ -733,44 +764,64 @@ void HLoopOptimization::GenerateNewLoop(LoopNode* node,
HBasicBlock* new_preheader,
HInstruction* lo,
HInstruction* hi,
- HInstruction* step) {
+ HInstruction* step,
+ uint32_t unroll) {
+ DCHECK(unroll == 1 || vector_mode_ == kVector);
Primitive::Type induc_type = Primitive::kPrimInt;
// Prepare new loop.
- vector_map_->clear();
vector_preheader_ = new_preheader;
vector_header_ = vector_preheader_->GetSingleSuccessor();
vector_body_ = vector_header_->GetSuccessors()[1];
- vector_phi_ = new (global_allocator_) HPhi(global_allocator_,
- kNoRegNumber,
- 0,
- HPhi::ToPhiType(induc_type));
+ HPhi* phi = new (global_allocator_) HPhi(global_allocator_,
+ kNoRegNumber,
+ 0,
+ HPhi::ToPhiType(induc_type));
// Generate header and prepare body.
// for (i = lo; i < hi; i += step)
// <loop-body>
- HInstruction* cond = new (global_allocator_) HAboveOrEqual(vector_phi_, hi);
- vector_header_->AddPhi(vector_phi_);
+ HInstruction* cond = new (global_allocator_) HAboveOrEqual(phi, hi);
+ vector_header_->AddPhi(phi);
vector_header_->AddInstruction(cond);
vector_header_->AddInstruction(new (global_allocator_) HIf(cond));
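+ // Note that HAboveOrEqual performs an unsigned comparison; with the index
+ // starting at lo >= 0 and counting up, it acts as the exit test i >= hi.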
- for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) {
- bool vectorized_def = VectorizeDef(node, it.Current(), /*generate_code*/ true);
- DCHECK(vectorized_def);
- }
- // Generate body from the instruction map, but in original program order.
- HEnvironment* env = vector_header_->GetFirstInstruction()->GetEnvironment();
- for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) {
- auto i = vector_map_->find(it.Current());
- if (i != vector_map_->end() && !i->second->IsInBlock()) {
- Insert(vector_body_, i->second);
- // Deal with instructions that need an environment, such as the scalar intrinsics.
- if (i->second->NeedsEnvironment()) {
- i->second->CopyEnvironmentFromWithLoopPhiAdjustment(env, vector_header_);
+ vector_index_ = phi;
+ for (uint32_t u = 0; u < unroll; u++) {
+ // On the first unrolled copy, clear the instruction map; on later
+ // copies, keep the loop-invariant replications set up by earlier copies.
+ if (u == 0) {
+ vector_map_->clear();
+ } else {
+ for (auto i = vector_map_->begin(); i != vector_map_->end(); ) {
+ if (i->second->IsVecReplicateScalar()) {
+ DCHECK(node->loop_info->IsDefinedOutOfTheLoop(i->first));
+ ++i;
+ } else {
+ i = vector_map_->erase(i);
+ }
}
}
+ // Generate instruction map.
+ for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) {
+ bool vectorized_def = VectorizeDef(node, it.Current(), /*generate_code*/ true);
+ DCHECK(vectorized_def);
+ }
+ // Generate body from the instruction map, but in original program order.
+ HEnvironment* env = vector_header_->GetFirstInstruction()->GetEnvironment();
+ for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) {
+ auto i = vector_map_->find(it.Current());
+ if (i != vector_map_->end() && !i->second->IsInBlock()) {
+ Insert(vector_body_, i->second);
+ // Deal with instructions that need an environment, such as the scalar intrinsics.
+ if (i->second->NeedsEnvironment()) {
+ i->second->CopyEnvironmentFromWithLoopPhiAdjustment(env, vector_header_);
+ }
+ }
+ }
+ vector_index_ = new (global_allocator_) HAdd(induc_type, vector_index_, step);
+ Insert(vector_body_, vector_index_);
}
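+ // (e.g. with unroll == 2 the vector loop body becomes, schematically,
+ // <vectorized-loop-body> i += VL; <vectorized-loop-body> i += VL;
+ // where the last add feeds the backedge input of the loop phi below.)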
- // Finalize increment and phi.
- HInstruction* inc = new (global_allocator_) HAdd(induc_type, vector_phi_, step);
- vector_phi_->AddInput(lo);
- vector_phi_->AddInput(Insert(vector_body_, inc));
+ // Finalize phi for the loop index.
+ phi->AddInput(lo);
+ phi->AddInput(vector_index_);
+ vector_index_ = phi;
}
// TODO: accept reductions at left-hand-side, mixed-type store idioms, etc.
@@ -795,7 +846,7 @@ bool HLoopOptimization::VectorizeDef(LoopNode* node,
VectorizeUse(node, value, generate_code, type, restrictions)) {
if (generate_code) {
GenerateVecSub(index, offset);
- GenerateVecMem(instruction, vector_map_->Get(index), vector_map_->Get(value), type);
+ GenerateVecMem(instruction, vector_map_->Get(index), vector_map_->Get(value), offset, type);
} else {
vector_refs_->insert(ArrayReference(base, offset, type, /*lhs*/ true));
}
@@ -852,7 +903,7 @@ bool HLoopOptimization::VectorizeUse(LoopNode* node,
induction_range_.IsUnitStride(instruction, index, &offset)) {
if (generate_code) {
GenerateVecSub(index, offset);
- GenerateVecMem(instruction, vector_map_->Get(index), nullptr, type);
+ GenerateVecMem(instruction, vector_map_->Get(index), nullptr, offset, type);
} else {
vector_refs_->insert(ArrayReference(base, offset, type, /*lhs*/ false));
}
@@ -1164,7 +1215,7 @@ void HLoopOptimization::GenerateVecInv(HInstruction* org, Primitive::Type type)
void HLoopOptimization::GenerateVecSub(HInstruction* org, HInstruction* offset) {
if (vector_map_->find(org) == vector_map_->end()) {
- HInstruction* subscript = vector_phi_;
+ HInstruction* subscript = vector_index_;
if (offset != nullptr) {
subscript = new (global_allocator_) HAdd(Primitive::kPrimInt, subscript, offset);
if (org->IsPhi()) {
@@ -1178,17 +1229,27 @@ void HLoopOptimization::GenerateVecSub(HInstruction* org, HInstruction* offset)
void HLoopOptimization::GenerateVecMem(HInstruction* org,
HInstruction* opa,
HInstruction* opb,
+ HInstruction* offset,
Primitive::Type type) {
HInstruction* vector = nullptr;
if (vector_mode_ == kVector) {
// Vector store or load.
+ HInstruction* base = org->InputAt(0);
if (opb != nullptr) {
vector = new (global_allocator_) HVecStore(
- global_allocator_, org->InputAt(0), opa, opb, type, vector_length_);
+ global_allocator_, base, opa, opb, type, vector_length_);
} else {
bool is_string_char_at = org->AsArrayGet()->IsStringCharAt();
vector = new (global_allocator_) HVecLoad(
- global_allocator_, org->InputAt(0), opa, type, vector_length_, is_string_char_at);
+ global_allocator_, base, opa, type, vector_length_, is_string_char_at);
+ }
+ // Known dynamically enforced alignment?
+ // TODO: detect offset + constant differences.
+ // TODO: long run, static alignment analysis?
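+ // (e.g. once peeling guarantees that the first vector access through
+ // 'base' at 'offset' starts on a 16-byte boundary, the load/store built
+ // above can safely be marked with Alignment(kAlignedBase, 0).)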
+ if (vector_peeling_candidate_ != nullptr &&
+ vector_peeling_candidate_->base == base &&
+ vector_peeling_candidate_->offset == offset) {
+ vector->AsVecMemoryOperation()->SetAlignment(Alignment(kAlignedBase, 0));
}
} else {
// Scalar store or load.
@@ -1444,6 +1505,47 @@ bool HLoopOptimization::VectorizeHalvingAddIdiom(LoopNode* node,
}
//
+// Vectorization heuristics.
+//
+
+bool HLoopOptimization::IsVectorizationProfitable(int64_t trip_count) {
+ // Current heuristic: non-empty body with sufficient number
+ // of iterations (if known).
+ // TODO: refine by looking at e.g. operation count, alignment, etc.
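+ // (e.g. a loop with known trip_count == 3 and vector_length_ == 4 cannot
+ // fill a single vector and is rejected below.)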
+ if (vector_length_ == 0) {
+ return false; // nothing found
+ } else if (0 < trip_count && trip_count < vector_length_) {
+ return false; // insufficient iterations
+ }
+ return true;
+}
+
+void HLoopOptimization::SetPeelingCandidate(int64_t trip_count ATTRIBUTE_UNUSED) {
+ // Current heuristic: none.
+ // TODO: implement
+}
+
+uint32_t HLoopOptimization::GetUnrollingFactor(HBasicBlock* block, int64_t trip_count) {
+ // Current heuristic: unroll by 2 on ARM64/X86 for large known trip
+ // counts and small loop bodies.
+ // TODO: refine with operation count, remaining iterations, etc.
+ // Further heuristic ideas have been suggested but not yet implemented.
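+ // (e.g. with vector_length_ == 4, a body of at most 10 HIR instructions
+ // and a known trip count of at least 16 yields an unrolling factor of 2.)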
+ switch (compiler_driver_->GetInstructionSet()) {
+ case kArm64:
+ case kX86:
+ case kX86_64: {
+ size_t num_instructions = block->GetInstructions().CountSize();
+ if (num_instructions <= 10 && trip_count >= 4 * vector_length_) {
+ return 2;
+ }
+ return 1;
+ }
+ default:
+ return 1;
+ }
+}
+
+//
// Helpers.
//
@@ -1576,8 +1678,8 @@ bool HLoopOptimization::TryReplaceWithLastValue(HLoopInformation* loop_info,
size_t index = it->GetIndex();
++it; // increment before replacing
if (iset_->find(user->GetHolder()) == iset_->end()) { // not excluded?
- HLoopInformation* other_loop_info = user->GetHolder()->GetBlock()->GetLoopInformation();
// Only update environment uses after the loop.
+ HLoopInformation* other_loop_info = user->GetHolder()->GetBlock()->GetLoopInformation();
if (other_loop_info == nullptr || !other_loop_info->IsIn(*loop_info)) {
user->RemoveAsUserOfInput(index);
user->SetRawEnvAt(index, replacement);
@@ -1614,4 +1716,21 @@ void HLoopOptimization::RemoveDeadInstructions(const HInstructionList& list) {
}
}
+bool HLoopOptimization::CanRemoveCycle() {
+ for (HInstruction* i : *iset_) {
+ // We can never remove instructions that have environment
+ // uses when we compile 'debuggable'.
+ if (i->HasEnvironmentUses() && graph_->IsDebuggable()) {
+ return false;
+ }
+ // A deoptimization should never have an environment input removed.
+ for (const HUseListNode<HEnvironment*>& use : i->GetEnvUses()) {
+ if (use.GetUser()->GetHolder()->IsDeoptimize()) {
+ return false;
+ }
+ }
+ }
+ return true;
+}
+
} // namespace art