Diffstat (limited to 'compiler/optimizing/loop_optimization.cc')
-rw-r--r-- | compiler/optimizing/loop_optimization.cc | 291
1 file changed, 210 insertions, 81 deletions
diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc
index d2493137fe..b61d7b80d1 100644
--- a/compiler/optimizing/loop_optimization.cc
+++ b/compiler/optimizing/loop_optimization.cc
@@ -31,6 +31,9 @@ namespace art {
 // Enables vectorization (SIMDization) in the loop optimizer.
 static constexpr bool kEnableVectorization = true;
 
+// All current SIMD targets want 16-byte alignment.
+static constexpr size_t kAlignedBase = 16;
+
 // Remove the instruction from the graph. A bit more elaborate than the usual
 // instruction removal, since there may be a cycle in the use structure.
 static void RemoveFromCycle(HInstruction* instruction) {
@@ -283,6 +286,9 @@ HLoopOptimization::HLoopOptimization(HGraph* graph,
       simplified_(false),
       vector_length_(0),
       vector_refs_(nullptr),
+      vector_peeling_candidate_(nullptr),
+      vector_runtime_test_a_(nullptr),
+      vector_runtime_test_b_(nullptr),
       vector_map_(nullptr) {
 }
 
@@ -422,23 +428,6 @@ void HLoopOptimization::TraverseLoopsInnerToOuter(LoopNode* node) {
 // Optimization.
 //
 
-bool HLoopOptimization::CanRemoveCycle() {
-  for (HInstruction* i : *iset_) {
-    // We can never remove instructions that have environment
-    // uses when we compile 'debuggable'.
-    if (i->HasEnvironmentUses() && graph_->IsDebuggable()) {
-      return false;
-    }
-    // A deoptimization should never have an environment input removed.
-    for (const HUseListNode<HEnvironment*>& use : i->GetEnvUses()) {
-      if (use.GetUser()->GetHolder()->IsDeoptimize()) {
-        return false;
-      }
-    }
-  }
-  return true;
-}
-
 void HLoopOptimization::SimplifyInduction(LoopNode* node) {
   HBasicBlock* header = node->loop_info->GetHeader();
   HBasicBlock* preheader = node->loop_info->GetPreHeader();
@@ -565,7 +554,7 @@ void HLoopOptimization::OptimizeInnerLoop(LoopNode* node) {
   if (kEnableVectorization) {
     iset_->clear();  // prepare phi induction
     if (TrySetSimpleLoopHeader(header) &&
-        CanVectorize(node, body, trip_count) &&
+        ShouldVectorize(node, body, trip_count) &&
         TryAssignLastValue(node->loop_info, phi, preheader, /*collect_loop_uses*/ true)) {
       Vectorize(node, body, exit, trip_count);
       graph_->SetHasSIMD(true);  // flag SIMD usage
@@ -580,10 +569,11 @@ void HLoopOptimization::OptimizeInnerLoop(LoopNode* node) {
 //   Intel Press, June, 2004 (http://www.aartbik.com/).
 //
 
-bool HLoopOptimization::CanVectorize(LoopNode* node, HBasicBlock* block, int64_t trip_count) {
+bool HLoopOptimization::ShouldVectorize(LoopNode* node, HBasicBlock* block, int64_t trip_count) {
   // Reset vector bookkeeping.
   vector_length_ = 0;
   vector_refs_->clear();
+  vector_peeling_candidate_ = nullptr;
   vector_runtime_test_a_ =
   vector_runtime_test_b_= nullptr;
 
@@ -600,12 +590,9 @@ bool HLoopOptimization::CanVectorize(LoopNode* node, HBasicBlock* block, int64_t
     }
   }
 
-  // Heuristics. Does vectorization seem profitable?
-  // TODO: refine
-  if (vector_length_ == 0) {
-    return false;  // nothing found
-  } else if (0 < trip_count && trip_count < vector_length_) {
-    return false;  // insufficient iterations
+  // Does vectorization seem profitable?
+  if (!IsVectorizationProfitable(trip_count)) {
+    return false;
   }
 
   // Data dependence analysis. Find each pair of references with same type, where
@@ -633,18 +620,24 @@ bool HLoopOptimization::CanVectorize(LoopNode* node, HBasicBlock* block, int64_t
           // Conservatively assume a potential loop-carried data dependence otherwise, avoided by
           // generating an explicit a != b disambiguation runtime test on the two references.
           if (x != y) {
-            // For now, we reject after one test to avoid excessive overhead.
-            if (vector_runtime_test_a_ != nullptr) {
-              return false;
+            // To avoid excessive overhead, we only accept one a != b test.
+            if (vector_runtime_test_a_ == nullptr) {
+              // First test found.
+              vector_runtime_test_a_ = a;
+              vector_runtime_test_b_ = b;
+            } else if ((vector_runtime_test_a_ != a || vector_runtime_test_b_ != b) &&
+                       (vector_runtime_test_a_ != b || vector_runtime_test_b_ != a)) {
+              return false;  // second test would be needed
             }
-            vector_runtime_test_a_ = a;
-            vector_runtime_test_b_ = b;
           }
         }
       }
     }
   }
 
+  // Consider dynamic loop peeling for alignment.
+  SetPeelingCandidate(trip_count);
+
   // Success!
   return true;
 }
@@ -657,28 +650,52 @@ void HLoopOptimization::Vectorize(LoopNode* node,
   HBasicBlock* header = node->loop_info->GetHeader();
   HBasicBlock* preheader = node->loop_info->GetPreHeader();
 
-  // A cleanup is needed for any unknown trip count or for a known trip count
-  // with remainder iterations after vectorization.
-  bool needs_cleanup = trip_count == 0 || (trip_count % vector_length_) != 0;
+  // Pick a loop unrolling factor for the vector loop.
+  uint32_t unroll = GetUnrollingFactor(block, trip_count);
+  uint32_t chunk = vector_length_ * unroll;
+
+  // A cleanup loop is needed, at least, for any unknown trip count or
+  // for a known trip count with remainder iterations after vectorization.
+  bool needs_cleanup = trip_count == 0 || (trip_count % chunk) != 0;
 
   // Adjust vector bookkeeping.
   iset_->clear();  // prepare phi induction
   bool is_simple_loop_header = TrySetSimpleLoopHeader(header);  // fills iset_
   DCHECK(is_simple_loop_header);
+  vector_header_ = header;
+  vector_body_ = block;
+
+  // Generate dynamic loop peeling trip count, if needed:
+  // ptc = <peeling-needed-for-candidate>
+  HInstruction* ptc = nullptr;
+  if (vector_peeling_candidate_ != nullptr) {
+    DCHECK_LT(vector_length_, trip_count) << "dynamic peeling currently requires known trip count";
+    //
+    // TODO: Implement this. Compute address of first access memory location and
+    //       compute peeling factor to obtain kAlignedBase alignment.
+    //
+    needs_cleanup = true;
+  }
 
-  // Generate preheader:
+  // Generate loop control:
   // stc = <trip-count>;
-  // vtc = stc - stc % VL;
+  // vtc = stc - (stc - ptc) % chunk;
+  // i = 0;
   HInstruction* stc = induction_range_.GenerateTripCount(node->loop_info, graph_, preheader);
   HInstruction* vtc = stc;
   if (needs_cleanup) {
-    DCHECK(IsPowerOfTwo(vector_length_));
+    DCHECK(IsPowerOfTwo(chunk));
+    HInstruction* diff = stc;
+    if (ptc != nullptr) {
+      diff = Insert(preheader, new (global_allocator_) HSub(induc_type, stc, ptc));
+    }
     HInstruction* rem = Insert(
         preheader,
         new (global_allocator_) HAnd(induc_type,
-                                     stc,
-                                     graph_->GetIntConstant(vector_length_ - 1)));
+                                     diff,
+                                     graph_->GetIntConstant(chunk - 1)));
     vtc = Insert(preheader, new (global_allocator_) HSub(induc_type, stc, rem));
   }
+  vector_index_ = graph_->GetIntConstant(0);
 
   // Generate runtime disambiguation test:
   // vtc = a != b ? vtc : 0;
@@ -691,16 +708,31 @@ void HLoopOptimization::Vectorize(LoopNode* node,
     needs_cleanup = true;
   }
 
-  // Generate vector loop:
-  // for (i = 0; i < vtc; i += VL)
+  // Generate dynamic peeling loop for alignment, if needed:
+  // for ( ; i < ptc; i += 1)
+  //    <loop-body>
+  if (ptc != nullptr) {
+    vector_mode_ = kSequential;
+    GenerateNewLoop(node,
+                    block,
+                    graph_->TransformLoopForVectorization(vector_header_, vector_body_, exit),
+                    vector_index_,
+                    ptc,
+                    graph_->GetIntConstant(1),
+                    /*unroll*/ 1);
+  }
+
+  // Generate vector loop, possibly further unrolled:
+  // for ( ; i < vtc; i += chunk)
   //    <vectorized-loop-body>
   vector_mode_ = kVector;
   GenerateNewLoop(node,
                   block,
-                  graph_->TransformLoopForVectorization(header, block, exit),
-                  graph_->GetIntConstant(0),
+                  graph_->TransformLoopForVectorization(vector_header_, vector_body_, exit),
+                  vector_index_,
                   vtc,
-                  graph_->GetIntConstant(vector_length_));
+                  graph_->GetIntConstant(vector_length_),  // increment per unroll
+                  unroll);
   HLoopInformation* vloop = vector_header_->GetLoopInformation();
 
   // Generate cleanup loop, if needed:
@@ -711,9 +743,10 @@ void HLoopOptimization::Vectorize(LoopNode* node,
     GenerateNewLoop(node,
                     block,
                     graph_->TransformLoopForVectorization(vector_header_, vector_body_, exit),
-                    vector_phi_,
+                    vector_index_,
                     stc,
-                    graph_->GetIntConstant(1));
+                    graph_->GetIntConstant(1),
+                    /*unroll*/ 1);
   }
 
   // Remove the original loop by disconnecting the body block
@@ -722,8 +755,9 @@ void HLoopOptimization::Vectorize(LoopNode* node,
   while (!header->GetFirstInstruction()->IsGoto()) {
     header->RemoveInstruction(header->GetFirstInstruction());
   }
-  // Update loop hierarchy: the old header now resides in the
-  // same outer loop as the old preheader.
+  // Update loop hierarchy: the old header now resides in the same outer loop
+  // as the old preheader. Note that we don't bother putting sequential
+  // loops back in the hierarchy at this point.
   header->SetLoopInformation(preheader->GetLoopInformation());  // outward
   node->loop_info = vloop;
 }
@@ -733,44 +767,64 @@ void HLoopOptimization::GenerateNewLoop(LoopNode* node,
                                         HBasicBlock* new_preheader,
                                         HInstruction* lo,
                                         HInstruction* hi,
-                                        HInstruction* step) {
+                                        HInstruction* step,
+                                        uint32_t unroll) {
+  DCHECK(unroll == 1 || vector_mode_ == kVector);
   Primitive::Type induc_type = Primitive::kPrimInt;
   // Prepare new loop.
-  vector_map_->clear();
   vector_preheader_ = new_preheader,
   vector_header_ = vector_preheader_->GetSingleSuccessor();
   vector_body_ = vector_header_->GetSuccessors()[1];
-  vector_phi_ = new (global_allocator_) HPhi(global_allocator_,
-                                             kNoRegNumber,
-                                             0,
-                                             HPhi::ToPhiType(induc_type));
+  HPhi* phi = new (global_allocator_) HPhi(global_allocator_,
+                                           kNoRegNumber,
+                                           0,
+                                           HPhi::ToPhiType(induc_type));
   // Generate header and prepare body.
   // for (i = lo; i < hi; i += step)
   //    <loop-body>
-  HInstruction* cond = new (global_allocator_) HAboveOrEqual(vector_phi_, hi);
-  vector_header_->AddPhi(vector_phi_);
+  HInstruction* cond = new (global_allocator_) HAboveOrEqual(phi, hi);
+  vector_header_->AddPhi(phi);
   vector_header_->AddInstruction(cond);
   vector_header_->AddInstruction(new (global_allocator_) HIf(cond));
-  for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) {
-    bool vectorized_def = VectorizeDef(node, it.Current(), /*generate_code*/ true);
-    DCHECK(vectorized_def);
-  }
-  // Generate body from the instruction map, but in original program order.
-  HEnvironment* env = vector_header_->GetFirstInstruction()->GetEnvironment();
-  for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) {
-    auto i = vector_map_->find(it.Current());
-    if (i != vector_map_->end() && !i->second->IsInBlock()) {
-      Insert(vector_body_, i->second);
-      // Deal with instructions that need an environment, such as the scalar intrinsics.
-      if (i->second->NeedsEnvironment()) {
-        i->second->CopyEnvironmentFromWithLoopPhiAdjustment(env, vector_header_);
+  vector_index_ = phi;
+  for (uint32_t u = 0; u < unroll; u++) {
+    // Clear map, leaving loop invariants setup during unrolling.
+    if (u == 0) {
+      vector_map_->clear();
+    } else {
+      for (auto i = vector_map_->begin(); i != vector_map_->end(); ) {
+        if (i->second->IsVecReplicateScalar()) {
+          DCHECK(node->loop_info->IsDefinedOutOfTheLoop(i->first));
+          ++i;
+        } else {
+          i = vector_map_->erase(i);
+        }
+      }
+    }
+    // Generate instruction map.
+    for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) {
+      bool vectorized_def = VectorizeDef(node, it.Current(), /*generate_code*/ true);
+      DCHECK(vectorized_def);
+    }
+    // Generate body from the instruction map, but in original program order.
+    HEnvironment* env = vector_header_->GetFirstInstruction()->GetEnvironment();
+    for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) {
+      auto i = vector_map_->find(it.Current());
+      if (i != vector_map_->end() && !i->second->IsInBlock()) {
+        Insert(vector_body_, i->second);
+        // Deal with instructions that need an environment, such as the scalar intrinsics.
+        if (i->second->NeedsEnvironment()) {
+          i->second->CopyEnvironmentFromWithLoopPhiAdjustment(env, vector_header_);
+        }
       }
     }
+    vector_index_ = new (global_allocator_) HAdd(induc_type, vector_index_, step);
+    Insert(vector_body_, vector_index_);
   }
-  // Finalize increment and phi.
-  HInstruction* inc = new (global_allocator_) HAdd(induc_type, vector_phi_, step);
-  vector_phi_->AddInput(lo);
-  vector_phi_->AddInput(Insert(vector_body_, inc));
+  // Finalize phi for the loop index.
+  phi->AddInput(lo);
+  phi->AddInput(vector_index_);
+  vector_index_ = phi;
 }
 
 // TODO: accept reductions at left-hand-side, mixed-type store idioms, etc.
@@ -791,11 +845,11 @@ bool HLoopOptimization::VectorizeDef(LoopNode* node,
     HInstruction* offset = nullptr;
     if (TrySetVectorType(type, &restrictions) &&
         node->loop_info->IsDefinedOutOfTheLoop(base) &&
-        induction_range_.IsUnitStride(instruction, index, &offset) &&
+        induction_range_.IsUnitStride(instruction, index, graph_, &offset) &&
         VectorizeUse(node, value, generate_code, type, restrictions)) {
       if (generate_code) {
         GenerateVecSub(index, offset);
-        GenerateVecMem(instruction, vector_map_->Get(index), vector_map_->Get(value), type);
+        GenerateVecMem(instruction, vector_map_->Get(index), vector_map_->Get(value), offset, type);
       } else {
         vector_refs_->insert(ArrayReference(base, offset, type, /*lhs*/ true));
       }
@@ -849,10 +903,10 @@ bool HLoopOptimization::VectorizeUse(LoopNode* node,
     HInstruction* offset = nullptr;
     if (type == instruction->GetType() &&
         node->loop_info->IsDefinedOutOfTheLoop(base) &&
-        induction_range_.IsUnitStride(instruction, index, &offset)) {
+        induction_range_.IsUnitStride(instruction, index, graph_, &offset)) {
       if (generate_code) {
         GenerateVecSub(index, offset);
-        GenerateVecMem(instruction, vector_map_->Get(index), nullptr, type);
+        GenerateVecMem(instruction, vector_map_->Get(index), nullptr, offset, type);
       } else {
         vector_refs_->insert(ArrayReference(base, offset, type, /*lhs*/ false));
       }
@@ -1164,8 +1218,9 @@ void HLoopOptimization::GenerateVecInv(HInstruction* org, Primitive::Type type)
 
 void HLoopOptimization::GenerateVecSub(HInstruction* org, HInstruction* offset) {
   if (vector_map_->find(org) == vector_map_->end()) {
-    HInstruction* subscript = vector_phi_;
-    if (offset != nullptr) {
+    HInstruction* subscript = vector_index_;
+    int64_t value = 0;
+    if (!IsInt64AndGet(offset, &value) || value != 0) {
       subscript = new (global_allocator_) HAdd(Primitive::kPrimInt, subscript, offset);
       if (org->IsPhi()) {
         Insert(vector_body_, subscript);  // lacks layout placeholder
@@ -1178,17 +1233,27 @@ void HLoopOptimization::GenerateVecSub(HInstruction* org, HInstruction* offset)
 void HLoopOptimization::GenerateVecMem(HInstruction* org,
                                        HInstruction* opa,
                                        HInstruction* opb,
+                                       HInstruction* offset,
                                        Primitive::Type type) {
   HInstruction* vector = nullptr;
   if (vector_mode_ == kVector) {
     // Vector store or load.
+    HInstruction* base = org->InputAt(0);
     if (opb != nullptr) {
       vector = new (global_allocator_) HVecStore(
-          global_allocator_, org->InputAt(0), opa, opb, type, vector_length_);
+          global_allocator_, base, opa, opb, type, vector_length_);
     } else {
       bool is_string_char_at = org->AsArrayGet()->IsStringCharAt();
       vector = new (global_allocator_) HVecLoad(
-          global_allocator_, org->InputAt(0), opa, type, vector_length_, is_string_char_at);
+          global_allocator_, base, opa, type, vector_length_, is_string_char_at);
+    }
+    // Known dynamically enforced alignment?
+    // TODO: detect offset + constant differences.
+    // TODO: long run, static alignment analysis?
+    if (vector_peeling_candidate_ != nullptr &&
+        vector_peeling_candidate_->base == base &&
+        vector_peeling_candidate_->offset == offset) {
+      vector->AsVecMemoryOperation()->SetAlignment(Alignment(kAlignedBase, 0));
     }
   } else {
     // Scalar store or load.
@@ -1444,10 +1509,57 @@ bool HLoopOptimization::VectorizeHalvingAddIdiom(LoopNode* node,
 }
 
 //
+// Vectorization heuristics.
+//
+
+bool HLoopOptimization::IsVectorizationProfitable(int64_t trip_count) {
+  // Current heuristic: non-empty body with sufficient number
+  // of iterations (if known).
+  // TODO: refine by looking at e.g. operation count, alignment, etc.
+  if (vector_length_ == 0) {
+    return false;  // nothing found
+  } else if (0 < trip_count && trip_count < vector_length_) {
+    return false;  // insufficient iterations
+  }
+  return true;
+}
+
+void HLoopOptimization::SetPeelingCandidate(int64_t trip_count ATTRIBUTE_UNUSED) {
+  // Current heuristic: none.
+  // TODO: implement
+}
+
+uint32_t HLoopOptimization::GetUnrollingFactor(HBasicBlock* block, int64_t trip_count) {
+  // Current heuristic: unroll by 2 on ARM64/X86 for large known trip
+  // counts and small loop bodies.
+  // TODO: refine with operation count, remaining iterations, etc.
+  switch (compiler_driver_->GetInstructionSet()) {
+    case kArm64:
+    case kX86:
+    case kX86_64: {
+      size_t num_instructions = block->GetInstructions().CountSize();
+      if (num_instructions <= 10 && trip_count >= 4 * vector_length_) {
+        return 2;
+      }
+      return 1;
+    }
+    default:
+      return 1;
+  }
+}
+
+//
 // Helpers.
 //
 
 bool HLoopOptimization::TrySetPhiInduction(HPhi* phi, bool restrict_uses) {
+  // Special case Phis that have an equivalent in a debuggable setup. Our graph checker isn't
+  // smart enough to follow strongly connected components (and it's probably not worth
+  // it to make it so). See b/33775412.
+  if (graph_->IsDebuggable() && phi->HasEquivalentPhi()) {
+    return false;
+  }
   DCHECK(iset_->empty());
   ArenaSet<HInstruction*>* set = induction_range_.LookupCycle(phi);
   if (set != nullptr) {
@@ -1576,8 +1688,8 @@ bool HLoopOptimization::TryReplaceWithLastValue(HLoopInformation* loop_info,
       size_t index = it->GetIndex();
       ++it;  // increment before replacing
       if (iset_->find(user->GetHolder()) == iset_->end()) {  // not excluded?
-        HLoopInformation* other_loop_info = user->GetHolder()->GetBlock()->GetLoopInformation();
         // Only update environment uses after the loop.
+        HLoopInformation* other_loop_info = user->GetHolder()->GetBlock()->GetLoopInformation();
         if (other_loop_info == nullptr || !other_loop_info->IsIn(*loop_info)) {
           user->RemoveAsUserOfInput(index);
           user->SetRawEnvAt(index, replacement);
@@ -1614,4 +1726,21 @@ void HLoopOptimization::RemoveDeadInstructions(const HInstructionList& list) {
   }
 }
 
+bool HLoopOptimization::CanRemoveCycle() {
+  for (HInstruction* i : *iset_) {
+    // We can never remove instructions that have environment
+    // uses when we compile 'debuggable'.
+    if (i->HasEnvironmentUses() && graph_->IsDebuggable()) {
+      return false;
+    }
+    // A deoptimization should never have an environment input removed.
+    for (const HUseListNode<HEnvironment*>& use : i->GetEnvUses()) {
+      if (use.GetUser()->GetHolder()->IsDeoptimize()) {
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
 }  // namespace art
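The control flow that Vectorize() now emits is easiest to see as the loop nest it produces. The sketch below models that shape in plain C++, reusing the names stc, ptc, vtc, and chunk from the comments in the change; the function, its parameters, and the placeholder bodies are illustrative assumptions, not the HIR the compiler actually generates.

    #include <cstdint>

    // Illustrative model of the generated loop nest (not actual emitted HIR).
    // stc = scalar trip count, ptc = peeling trip count (0 when no peeling),
    // vl = vector length, chunk = vl * unroll (a power of two by construction).
    void GeneratedLoopShape(int32_t stc, int32_t ptc, int32_t vl, int32_t unroll,
                            bool may_alias) {
      const int32_t chunk = vl * unroll;
      // vtc = stc - (stc - ptc) % chunk, done with HAnd since chunk is a power of two.
      int32_t vtc = stc - ((stc - ptc) & (chunk - 1));
      if (may_alias) {
        vtc = 0;  // runtime disambiguation: vtc = a != b ? vtc : 0
      }
      int32_t i = 0;
      for (; i < ptc; i += 1) {
        // sequential peeling iteration, aligns the peeling candidate
      }
      for (; i < vtc; i += chunk) {
        // vectorized body, unrolled 'unroll' times with a step of vl each
      }
      for (; i < stc; i += 1) {
        // sequential cleanup iteration; runs all remaining work when vtc == 0
      }
    }

Note how vtc is chosen so that vtc - ptc is an exact multiple of chunk, which is why the cleanup loop alone must absorb both the remainder iterations and the fully sequential fallback when the runtime disambiguation test fails.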
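The peeling trip count ptc itself is left as a TODO in Vectorize(). Assuming peeling advances the first accessed element one element per iteration, it could be derived along the lines below; ComputePeelingTripCount and its parameters are hypothetical names for illustration, not part of this change.

    #include <cstddef>
    #include <cstdint>

    static constexpr size_t kAlignedBase = 16;  // mirrors the constant added above

    // Hypothetical sketch for the TODO: how many scalar iterations to peel so
    // that the first accessed element becomes kAlignedBase-aligned. Returns 0
    // when already aligned, or when whole-element peeling can never reach the
    // required alignment.
    size_t ComputePeelingTripCount(uintptr_t first_element_address, size_t element_size) {
      size_t misalignment = first_element_address % kAlignedBase;
      if (misalignment == 0) {
        return 0;  // already aligned; no peeling needed
      }
      if (kAlignedBase % element_size != 0 || misalignment % element_size != 0) {
        return 0;  // alignment unreachable by stepping whole elements
      }
      return (kAlignedBase - misalignment) / element_size;
    }

A reference peeled to alignment this way is exactly what GenerateVecMem() rewards: its loads and stores receive SetAlignment(Alignment(kAlignedBase, 0)) and can thus be compiled to aligned SIMD memory operations.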