diff options
Diffstat (limited to 'compiler/optimizing/loop_optimization.cc')
-rw-r--r-- | compiler/optimizing/loop_optimization.cc | 187 |
1 file changed, 148 insertions, 39 deletions
diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc index 4c9b01c97e..1210dbe67b 100644 --- a/compiler/optimizing/loop_optimization.cc +++ b/compiler/optimizing/loop_optimization.cc @@ -473,6 +473,7 @@ HLoopOptimization::HLoopOptimization(HGraph* graph, iset_(nullptr), reductions_(nullptr), simplified_(false), + predicated_vectorization_mode_(codegen.SupportsPredicatedSIMD()), vector_length_(0), vector_refs_(nullptr), vector_static_peeling_factor_(0), @@ -486,10 +487,7 @@ HLoopOptimization::HLoopOptimization(HGraph* graph, vector_header_(nullptr), vector_body_(nullptr), vector_index_(nullptr), - arch_loop_helper_(ArchNoOptsLoopHelper::Create(compiler_options_ != nullptr - ? compiler_options_->GetInstructionSet() - : InstructionSet::kNone, - global_allocator_)) { + arch_loop_helper_(ArchNoOptsLoopHelper::Create(codegen, global_allocator_)) { } bool HLoopOptimization::Run() { @@ -1024,8 +1022,10 @@ bool HLoopOptimization::ShouldVectorize(LoopNode* node, HBasicBlock* block, int6 } } // for i - // Find a suitable alignment strategy. - SetAlignmentStrategy(peeling_votes, peeling_candidate); + if (!IsInPredicatedVectorizationMode()) { + // Find a suitable alignment strategy. + SetAlignmentStrategy(peeling_votes, peeling_candidate); + } // Does vectorization seem profitable? if (!IsVectorizationProfitable(trip_count)) { @@ -1052,8 +1052,8 @@ void HLoopOptimization::Vectorize(LoopNode* node, // A cleanup loop is needed, at least, for any unknown trip count or // for a known trip count with remainder iterations after vectorization. - bool needs_cleanup = trip_count == 0 || - ((trip_count - vector_static_peeling_factor_) % chunk) != 0; + bool needs_cleanup = !IsInPredicatedVectorizationMode() && + (trip_count == 0 || ((trip_count - vector_static_peeling_factor_) % chunk) != 0); // Adjust vector bookkeeping. 
HPhi* main_phi = nullptr; @@ -1071,11 +1071,13 @@ void HLoopOptimization::Vectorize(LoopNode* node, // ptc = <peeling factor>; HInstruction* ptc = nullptr; if (vector_static_peeling_factor_ != 0) { + DCHECK(!IsInPredicatedVectorizationMode()); // Static loop peeling for SIMD alignment (using the most suitable // fixed peeling factor found during prior alignment analysis). DCHECK(vector_dynamic_peeling_candidate_ == nullptr); ptc = graph_->GetConstant(induc_type, vector_static_peeling_factor_); } else if (vector_dynamic_peeling_candidate_ != nullptr) { + DCHECK(!IsInPredicatedVectorizationMode()); // Dynamic loop peeling for SIMD alignment (using the most suitable // candidate found during prior alignment analysis): // rem = offset % ALIGN; // adjusted as #elements @@ -1106,6 +1108,7 @@ void HLoopOptimization::Vectorize(LoopNode* node, HInstruction* stc = induction_range_.GenerateTripCount(node->loop_info, graph_, preheader); HInstruction* vtc = stc; if (needs_cleanup) { + DCHECK(!IsInPredicatedVectorizationMode()); DCHECK(IsPowerOfTwo(chunk)); HInstruction* diff = stc; if (ptc != nullptr) { @@ -1143,6 +1146,7 @@ void HLoopOptimization::Vectorize(LoopNode* node, // moved around during suspend checks, since all analysis was based on // nothing more than the Android runtime alignment conventions. if (ptc != nullptr) { + DCHECK(!IsInPredicatedVectorizationMode()); vector_mode_ = kSequential; GenerateNewLoop(node, block, @@ -1170,6 +1174,7 @@ void HLoopOptimization::Vectorize(LoopNode* node, // for ( ; i < stc; i += 1) // <loop-body> if (needs_cleanup) { + DCHECK(!IsInPredicatedVectorizationMode() || vector_runtime_test_a_ != nullptr); vector_mode_ = kSequential; GenerateNewLoop(node, block, @@ -1227,9 +1232,35 @@ void HLoopOptimization::GenerateNewLoop(LoopNode* node, // Generate header and prepare body. 
// for (i = lo; i < hi; i += step) // <loop-body> - HInstruction* cond = new (global_allocator_) HAboveOrEqual(phi, hi); - vector_header_->AddPhi(phi); - vector_header_->AddInstruction(cond); + HInstruction* cond = nullptr; + HInstruction* set_pred = nullptr; + if (IsInPredicatedVectorizationMode()) { + HVecPredWhile* pred_while = + new (global_allocator_) HVecPredWhile(global_allocator_, + phi, + hi, + HVecPredWhile::CondKind::kLO, + DataType::Type::kInt32, + vector_length_, + 0u); + + cond = new (global_allocator_) HVecPredCondition(global_allocator_, + pred_while, + HVecPredCondition::PCondKind::kNFirst, + DataType::Type::kInt32, + vector_length_, + 0u); + + vector_header_->AddPhi(phi); + vector_header_->AddInstruction(pred_while); + vector_header_->AddInstruction(cond); + set_pred = pred_while; + } else { + cond = new (global_allocator_) HAboveOrEqual(phi, hi); + vector_header_->AddPhi(phi); + vector_header_->AddInstruction(cond); + } + vector_header_->AddInstruction(new (global_allocator_) HIf(cond)); vector_index_ = phi; vector_permanent_map_->clear(); // preserved over unrolling @@ -1246,6 +1277,10 @@ void HLoopOptimization::GenerateNewLoop(LoopNode* node, auto i = vector_map_->find(it.Current()); if (i != vector_map_->end() && !i->second->IsInBlock()) { Insert(vector_body_, i->second); + if (IsInPredicatedVectorizationMode() && i->second->IsVecOperation()) { + HVecOperation* op = i->second->AsVecOperation(); + op->SetMergingGoverningPredicate(set_pred); + } // Deal with instructions that need an environment, such as the scalar intrinsics. if (i->second->NeedsEnvironment()) { i->second->CopyEnvironmentFromWithLoopPhiAdjustment(env, vector_header_); @@ -1360,7 +1395,10 @@ bool HLoopOptimization::VectorizeUse(LoopNode* node, } else if (instruction->IsArrayGet()) { // Deal with vector restrictions. 
bool is_string_char_at = instruction->AsArrayGet()->IsStringCharAt(); - if (is_string_char_at && HasVectorRestrictions(restrictions, kNoStringCharAt)) { + + if (is_string_char_at && (HasVectorRestrictions(restrictions, kNoStringCharAt) || + IsInPredicatedVectorizationMode())) { + // TODO: Support CharAt for predicated mode. return false; } // Accept a right-hand-side array base[index] for @@ -1575,32 +1613,73 @@ bool HLoopOptimization::TrySetVectorType(DataType::Type type, uint64_t* restrict } return false; case InstructionSet::kArm64: - // Allow vectorization for all ARM devices, because Android assumes that - // ARMv8 AArch64 always supports advanced SIMD (128-bit SIMD). - switch (type) { - case DataType::Type::kBool: - case DataType::Type::kUint8: - case DataType::Type::kInt8: - *restrictions |= kNoDiv; - return TrySetVectorLength(type, 16); - case DataType::Type::kUint16: - case DataType::Type::kInt16: - *restrictions |= kNoDiv; - return TrySetVectorLength(type, 8); - case DataType::Type::kInt32: - *restrictions |= kNoDiv; - return TrySetVectorLength(type, 4); - case DataType::Type::kInt64: - *restrictions |= kNoDiv | kNoMul; - return TrySetVectorLength(type, 2); - case DataType::Type::kFloat32: - *restrictions |= kNoReduction; - return TrySetVectorLength(type, 4); - case DataType::Type::kFloat64: - *restrictions |= kNoReduction; - return TrySetVectorLength(type, 2); - default: - return false; + if (IsInPredicatedVectorizationMode()) { + // SVE vectorization. 
+ CHECK(features->AsArm64InstructionSetFeatures()->HasSVE()); + switch (type) { + case DataType::Type::kBool: + case DataType::Type::kUint8: + case DataType::Type::kInt8: + *restrictions |= kNoDiv | + kNoSignedHAdd | + kNoUnsignedHAdd | + kNoUnroundedHAdd | + kNoSAD; + return TrySetVectorLength(type, 16); + case DataType::Type::kUint16: + case DataType::Type::kInt16: + *restrictions |= kNoDiv | + kNoSignedHAdd | + kNoUnsignedHAdd | + kNoUnroundedHAdd | + kNoSAD | + kNoDotProd; + return TrySetVectorLength(type, 8); + case DataType::Type::kInt32: + *restrictions |= kNoDiv | kNoSAD; + return TrySetVectorLength(type, 4); + case DataType::Type::kInt64: + *restrictions |= kNoDiv | kNoSAD; + return TrySetVectorLength(type, 2); + case DataType::Type::kFloat32: + *restrictions |= kNoReduction; + return TrySetVectorLength(type, 4); + case DataType::Type::kFloat64: + *restrictions |= kNoReduction; + return TrySetVectorLength(type, 2); + default: + break; + } + return false; + } else { + // Allow vectorization for all ARM devices, because Android assumes that + // ARMv8 AArch64 always supports advanced SIMD (128-bit SIMD). 
+ switch (type) { + case DataType::Type::kBool: + case DataType::Type::kUint8: + case DataType::Type::kInt8: + *restrictions |= kNoDiv; + return TrySetVectorLength(type, 16); + case DataType::Type::kUint16: + case DataType::Type::kInt16: + *restrictions |= kNoDiv; + return TrySetVectorLength(type, 8); + case DataType::Type::kInt32: + *restrictions |= kNoDiv; + return TrySetVectorLength(type, 4); + case DataType::Type::kInt64: + *restrictions |= kNoDiv | kNoMul; + return TrySetVectorLength(type, 2); + case DataType::Type::kFloat32: + *restrictions |= kNoReduction; + return TrySetVectorLength(type, 4); + case DataType::Type::kFloat64: + *restrictions |= kNoReduction; + return TrySetVectorLength(type, 2); + default: + break; + } + return false; } case InstructionSet::kX86: case InstructionSet::kX86_64: @@ -1693,6 +1772,15 @@ void HLoopOptimization::GenerateVecInv(HInstruction* org, DataType::Type type) { vector = new (global_allocator_) HVecReplicateScalar(global_allocator_, input, type, vector_length_, kNoDexPc); vector_permanent_map_->Put(org, Insert(vector_preheader_, vector)); + if (IsInPredicatedVectorizationMode()) { + HVecPredSetAll* set_pred = new (global_allocator_) HVecPredSetAll(global_allocator_, + graph_->GetIntConstant(1), + type, + vector_length_, + 0u); + vector_preheader_->InsertInstructionBefore(set_pred, vector); + vector->AsVecOperation()->SetMergingGoverningPredicate(set_pred); + } } vector_map_->Put(org, vector); } @@ -1821,6 +1909,15 @@ void HLoopOptimization::GenerateVecReductionPhiInputs(HPhi* phi, HInstruction* r vector_length, kNoDexPc)); } + if (IsInPredicatedVectorizationMode()) { + HVecPredSetAll* set_pred = new (global_allocator_) HVecPredSetAll(global_allocator_, + graph_->GetIntConstant(1), + type, + vector_length, + 0u); + vector_preheader_->InsertInstructionBefore(set_pred, new_init); + new_init->AsVecOperation()->SetMergingGoverningPredicate(set_pred); + } } else { new_init = ReduceAndExtractIfNeeded(new_init); } @@ -1852,6 +1949,17 
@@ HInstruction* HLoopOptimization::ReduceAndExtractIfNeeded(HInstruction* instruct instruction = new (global_allocator_) HVecExtractScalar( global_allocator_, reduce, type, vector_length, 0, kNoDexPc); exit->InsertInstructionAfter(instruction, reduce); + + if (IsInPredicatedVectorizationMode()) { + HVecPredSetAll* set_pred = new (global_allocator_) HVecPredSetAll(global_allocator_, + graph_->GetIntConstant(1), + type, + vector_length, + 0u); + exit->InsertInstructionBefore(set_pred, reduce); + reduce->AsVecOperation()->SetMergingGoverningPredicate(set_pred); + instruction->AsVecOperation()->SetMergingGoverningPredicate(set_pred); + } } } return instruction; @@ -1991,7 +2099,8 @@ bool HLoopOptimization::VectorizeHalvingAddIdiom(LoopNode* node, return false; } // Deal with vector restrictions. - if ((!is_unsigned && HasVectorRestrictions(restrictions, kNoSignedHAdd)) || + if ((is_unsigned && HasVectorRestrictions(restrictions, kNoUnsignedHAdd)) || + (!is_unsigned && HasVectorRestrictions(restrictions, kNoSignedHAdd)) || (!is_rounded && HasVectorRestrictions(restrictions, kNoUnroundedHAdd))) { return false; } |