Diffstat (limited to 'compiler/optimizing/loop_optimization.cc')
-rw-r--r--  compiler/optimizing/loop_optimization.cc  315
1 file changed, 215 insertions(+), 100 deletions(-)
diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc
index 5784707d0e..02ee4ec057 100644
--- a/compiler/optimizing/loop_optimization.cc
+++ b/compiler/optimizing/loop_optimization.cc
@@ -21,6 +21,7 @@
#include "arch/instruction_set.h"
#include "arch/x86/instruction_set_features_x86.h"
#include "arch/x86_64/instruction_set_features_x86_64.h"
+#include "code_generator.h"
#include "driver/compiler_options.h"
#include "linear_order.h"
#include "mirror/array-inl.h"
@@ -305,7 +306,8 @@ static bool IsAddConst2(HGraph* graph,
/*out*/ HInstruction** a,
/*out*/ HInstruction** b,
/*out*/ int64_t* c) {
- if (IsAddConst(instruction, a, b, c) && *a != nullptr) {
+ // We want an actual add/sub and not the trivial case where {b: 0, c: 0}.
+ if (IsAddOrSub(instruction) && IsAddConst(instruction, a, b, c) && *a != nullptr) {
if (*b == nullptr) {
// Constant is usually already present, unless accumulated.
*b = graph->GetConstant(instruction->GetType(), (*c));
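Context for the new guard: IsAddConst alone can match any instruction x as the degenerate sum x + 0 (a = x, b = nullptr, c = 0), which is exactly the {b: 0, c: 0} case the comment mentions. A minimal illustrative model of the guarded match, using simplified stand-in types rather than ART's HInstruction API:

  #include <cstdint>
  // Simplified node; the real code matches HInstruction structurally.
  struct Node { enum Kind { kAdd, kSub, kOther } kind; Node* lhs; int64_t rhs_const; };
  bool IsAddOrSub(const Node* n) { return n->kind == Node::kAdd || n->kind == Node::kSub; }
  // Matches n as "a + c" only for a real add/sub, rejecting the trivial x + 0 form.
  bool MatchAddConst(const Node* n, const Node** a, int64_t* c) {
    if (!IsAddOrSub(n)) return false;
    *a = n->lhs;
    *c = (n->kind == Node::kSub) ? -n->rhs_const : n->rhs_const;
    return true;
  }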
@@ -429,7 +431,7 @@ static void PeelByCount(HLoopInformation* loop_info,
InductionVarRange* induction_range) {
for (int i = 0; i < count; i++) {
// Perform peeling.
- PeelUnrollSimpleHelper helper(loop_info, induction_range);
+ LoopClonerSimpleHelper helper(loop_info, induction_range);
helper.DoPeeling();
}
}
@@ -456,12 +458,13 @@ static DataType::Type GetNarrowerType(HInstruction* a, HInstruction* b) {
//
HLoopOptimization::HLoopOptimization(HGraph* graph,
- const CompilerOptions* compiler_options,
+ const CodeGenerator& codegen,
HInductionVarAnalysis* induction_analysis,
OptimizingCompilerStats* stats,
const char* name)
: HOptimization(graph, name, stats),
- compiler_options_(compiler_options),
+ compiler_options_(&codegen.GetCompilerOptions()),
+ simd_register_size_(codegen.GetSIMDRegisterWidth()),
induction_range_(induction_analysis),
loop_allocator_(nullptr),
global_allocator_(graph_->GetAllocator()),
@@ -470,6 +473,7 @@ HLoopOptimization::HLoopOptimization(HGraph* graph,
iset_(nullptr),
reductions_(nullptr),
simplified_(false),
+ predicated_vectorization_mode_(codegen.SupportsPredicatedSIMD()),
vector_length_(0),
vector_refs_(nullptr),
vector_static_peeling_factor_(0),
@@ -483,10 +487,7 @@ HLoopOptimization::HLoopOptimization(HGraph* graph,
vector_header_(nullptr),
vector_body_(nullptr),
vector_index_(nullptr),
- arch_loop_helper_(ArchNoOptsLoopHelper::Create(compiler_options_ != nullptr
- ? compiler_options_->GetInstructionSet()
- : InstructionSet::kNone,
- global_allocator_)) {
+ arch_loop_helper_(ArchNoOptsLoopHelper::Create(codegen, global_allocator_)) {
}
bool HLoopOptimization::Run() {
@@ -804,7 +805,7 @@ bool HLoopOptimization::TryUnrollingForBranchPenaltyReduction(LoopAnalysisInfo*
// Perform unrolling.
HLoopInformation* loop_info = analysis_info->GetLoopInfo();
- PeelUnrollSimpleHelper helper(loop_info, &induction_range_);
+ LoopClonerSimpleHelper helper(loop_info, &induction_range_);
helper.DoUnrolling();
// Remove the redundant loop check after unrolling.
@@ -829,7 +830,7 @@ bool HLoopOptimization::TryPeelingForLoopInvariantExitsElimination(LoopAnalysisI
if (generate_code) {
// Perform peeling.
- PeelUnrollSimpleHelper helper(loop_info, &induction_range_);
+ LoopClonerSimpleHelper helper(loop_info, &induction_range_);
helper.DoPeeling();
// Statically evaluate loop check after peeling for loop invariant condition.
@@ -885,12 +886,6 @@ bool HLoopOptimization::TryFullUnrolling(LoopAnalysisInfo* analysis_info, bool g
}
bool HLoopOptimization::TryPeelingAndUnrolling(LoopNode* node) {
- // Don't run peeling/unrolling if compiler_options_ is nullptr (i.e., running under tests)
- // as InstructionSet is needed.
- if (compiler_options_ == nullptr) {
- return false;
- }
-
HLoopInformation* loop_info = node->loop_info;
int64_t trip_count = LoopAnalysis::GetLoopTripCount(loop_info, &induction_range_);
LoopAnalysisInfo analysis_info(loop_info);
@@ -908,7 +903,7 @@ bool HLoopOptimization::TryPeelingAndUnrolling(LoopNode* node) {
}
// Run 'IsLoopClonable' last as it might be time-consuming.
- if (!PeelUnrollHelper::IsLoopClonable(loop_info)) {
+ if (!LoopClonerHelper::IsLoopClonable(loop_info)) {
return false;
}
@@ -951,9 +946,10 @@ bool HLoopOptimization::ShouldVectorize(LoopNode* node, HBasicBlock* block, int6
// make one particular reference aligned), never to exceed (1).
// (3) variable to record how many references share same alignment.
// (4) variable to record suitable candidate for dynamic loop peeling.
- uint32_t desired_alignment = GetVectorSizeInBytes();
- DCHECK_LE(desired_alignment, 16u);
- uint32_t peeling_votes[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+ size_t desired_alignment = GetVectorSizeInBytes();
+ ScopedArenaVector<uint32_t> peeling_votes(desired_alignment, 0u,
+ loop_allocator_->Adapter(kArenaAllocLoopOptimization));
+
uint32_t max_num_same_alignment = 0;
const ArrayReference* peeling_candidate = nullptr;
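Sizing peeling_votes by desired_alignment instead of a fixed 16 removes the implicit 128-bit assumption, which matters once SVE registers can be wider. Votes are indexed by candidate peeling factor; the selection performed later in SetAlignmentStrategy is essentially an argmax, sketched stand-alone here (the real heuristic additionally weighs a dynamic peeling candidate):

  #include <cstddef>
  #include <cstdint>
  #include <vector>
  // Picks the peeling factor (index) with the most votes; 0 when nothing wins.
  size_t PickStaticPeelingFactor(const std::vector<uint32_t>& votes) {
    uint32_t max_vote = 0;
    size_t factor = 0;
    for (size_t i = 0; i < votes.size(); ++i) {
      if (votes[i] > max_vote) {
        max_vote = votes[i];
        factor = i;
      }
    }
    return factor;
  }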
@@ -1027,8 +1023,10 @@ bool HLoopOptimization::ShouldVectorize(LoopNode* node, HBasicBlock* block, int6
}
} // for i
- // Find a suitable alignment strategy.
- SetAlignmentStrategy(peeling_votes, peeling_candidate);
+ if (!IsInPredicatedVectorizationMode()) {
+ // Find a suitable alignment strategy.
+ SetAlignmentStrategy(peeling_votes, peeling_candidate);
+ }
// Does vectorization seem profitable?
if (!IsVectorizationProfitable(trip_count)) {
@@ -1055,8 +1053,8 @@ void HLoopOptimization::Vectorize(LoopNode* node,
// A cleanup loop is needed, at least, for any unknown trip count or
// for a known trip count with remainder iterations after vectorization.
- bool needs_cleanup = trip_count == 0 ||
- ((trip_count - vector_static_peeling_factor_) % chunk) != 0;
+ bool needs_cleanup = !IsInPredicatedVectorizationMode() &&
+ (trip_count == 0 || ((trip_count - vector_static_peeling_factor_) % chunk) != 0);
// Adjust vector bookkeeping.
HPhi* main_phi = nullptr;
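Why predicated mode can skip the cleanup loop: the governing predicate deactivates the surplus lanes on the last vector iteration, so the remainder never needs scalar execution. A worked sketch, assuming chunk = vector length * unroll factor as in the surrounding code:

  #include <cstddef>
  // trip_count = 100, peeling = 0, chunk = 16 -> 4 leftover iterations:
  // traditional SIMD runs them in a scalar cleanup loop; predicated SIMD
  // runs one more vector iteration with only 4 of 16 lanes active.
  size_t CleanupIterations(size_t trip_count, size_t peeling, size_t chunk) {
    return (trip_count - peeling) % chunk;  // 0 means no cleanup loop needed
  }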
@@ -1074,11 +1072,13 @@ void HLoopOptimization::Vectorize(LoopNode* node,
// ptc = <peeling factor>;
HInstruction* ptc = nullptr;
if (vector_static_peeling_factor_ != 0) {
+ DCHECK(!IsInPredicatedVectorizationMode());
// Static loop peeling for SIMD alignment (using the most suitable
// fixed peeling factor found during prior alignment analysis).
DCHECK(vector_dynamic_peeling_candidate_ == nullptr);
ptc = graph_->GetConstant(induc_type, vector_static_peeling_factor_);
} else if (vector_dynamic_peeling_candidate_ != nullptr) {
+ DCHECK(!IsInPredicatedVectorizationMode());
// Dynamic loop peeling for SIMD alignment (using the most suitable
// candidate found during prior alignment analysis):
// rem = offset % ALIGN; // adjusted as #elements
@@ -1109,6 +1109,7 @@ void HLoopOptimization::Vectorize(LoopNode* node,
HInstruction* stc = induction_range_.GenerateTripCount(node->loop_info, graph_, preheader);
HInstruction* vtc = stc;
if (needs_cleanup) {
+ DCHECK(!IsInPredicatedVectorizationMode());
DCHECK(IsPowerOfTwo(chunk));
HInstruction* diff = stc;
if (ptc != nullptr) {
@@ -1146,6 +1147,7 @@ void HLoopOptimization::Vectorize(LoopNode* node,
// moved around during suspend checks, since all analysis was based on
// nothing more than the Android runtime alignment conventions.
if (ptc != nullptr) {
+ DCHECK(!IsInPredicatedVectorizationMode());
vector_mode_ = kSequential;
GenerateNewLoop(node,
block,
@@ -1173,6 +1175,7 @@ void HLoopOptimization::Vectorize(LoopNode* node,
// for ( ; i < stc; i += 1)
// <loop-body>
if (needs_cleanup) {
+ DCHECK(!IsInPredicatedVectorizationMode() || vector_runtime_test_a_ != nullptr);
vector_mode_ = kSequential;
GenerateNewLoop(node,
block,
@@ -1230,9 +1233,35 @@ void HLoopOptimization::GenerateNewLoop(LoopNode* node,
// Generate header and prepare body.
// for (i = lo; i < hi; i += step)
// <loop-body>
- HInstruction* cond = new (global_allocator_) HAboveOrEqual(phi, hi);
- vector_header_->AddPhi(phi);
- vector_header_->AddInstruction(cond);
+ HInstruction* cond = nullptr;
+ HInstruction* set_pred = nullptr;
+ if (IsInPredicatedVectorizationMode()) {
+ HVecPredWhile* pred_while =
+ new (global_allocator_) HVecPredWhile(global_allocator_,
+ phi,
+ hi,
+ HVecPredWhile::CondKind::kLO,
+ DataType::Type::kInt32,
+ vector_length_,
+ 0u);
+
+ cond = new (global_allocator_) HVecPredCondition(global_allocator_,
+ pred_while,
+ HVecPredCondition::PCondKind::kNFirst,
+ DataType::Type::kInt32,
+ vector_length_,
+ 0u);
+
+ vector_header_->AddPhi(phi);
+ vector_header_->AddInstruction(pred_while);
+ vector_header_->AddInstruction(cond);
+ set_pred = pred_while;
+ } else {
+ cond = new (global_allocator_) HAboveOrEqual(phi, hi);
+ vector_header_->AddPhi(phi);
+ vector_header_->AddInstruction(cond);
+ }
+
vector_header_->AddInstruction(new (global_allocator_) HIf(cond));
vector_index_ = phi;
vector_permanent_map_->clear(); // preserved over unrolling
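The two new nodes map onto SVE's while-form loop control: HVecPredWhile with kLO computes a per-lane predicate lane_active = (i + lane) < hi (SVE whilelo), and HVecPredCondition with kNFirst is the exit test, true when the first lane is inactive, the predicated analogue of the HAboveOrEqual(phi, hi) exit in the else branch. A runnable scalar emulation, with kVL a hypothetical lane count:

  #include <array>
  #include <cstddef>
  constexpr size_t kVL = 4;               // hypothetical lane count
  using Pred = std::array<bool, kVL>;
  Pred WhileLo(size_t i, size_t hi) {     // models HVecPredWhile, kLO
    Pred p;
    for (size_t lane = 0; lane < kVL; ++lane) p[lane] = (i + lane) < hi;
    return p;
  }
  void PredicatedLoop(size_t hi) {
    for (size_t i = 0; /* exit below */; i += kVL) {
      Pred p = WhileLo(i, hi);
      if (!p[0]) break;                   // models HVecPredCondition, kNFirst
      // <vector body executes under governing predicate p>
    }
  }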
@@ -1249,6 +1278,10 @@ void HLoopOptimization::GenerateNewLoop(LoopNode* node,
auto i = vector_map_->find(it.Current());
if (i != vector_map_->end() && !i->second->IsInBlock()) {
Insert(vector_body_, i->second);
+ if (IsInPredicatedVectorizationMode() && i->second->IsVecOperation()) {
+ HVecOperation* op = i->second->AsVecOperation();
+ op->SetMergingGoverningPredicate(set_pred);
+ }
// Deal with instructions that need an environment, such as the scalar intrinsics.
if (i->second->NeedsEnvironment()) {
i->second->CopyEnvironmentFromWithLoopPhiAdjustment(env, vector_header_);
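Every vector operation cloned into the body is now governed by the loop predicate. With merging semantics, lanes the predicate leaves inactive retain the destination's previous contents; a per-lane sketch, reusing Pred and kVL from the emulation above:

  // Merging predication: active lanes take the new result, inactive lanes
  // keep whatever dst already held.
  void GovernedAdd(const Pred& p, int* dst, const int* a, const int* b) {
    for (size_t lane = 0; lane < kVL; ++lane) {
      if (p[lane]) dst[lane] = a[lane] + b[lane];
    }
  }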
@@ -1363,7 +1396,10 @@ bool HLoopOptimization::VectorizeUse(LoopNode* node,
} else if (instruction->IsArrayGet()) {
// Deal with vector restrictions.
bool is_string_char_at = instruction->AsArrayGet()->IsStringCharAt();
- if (is_string_char_at && HasVectorRestrictions(restrictions, kNoStringCharAt)) {
+
+ if (is_string_char_at && (HasVectorRestrictions(restrictions, kNoStringCharAt) ||
+ IsInPredicatedVectorizationMode())) {
+ // TODO: Support CharAt for predicated mode.
return false;
}
// Accept a right-hand-side array base[index] for
@@ -1542,13 +1578,7 @@ bool HLoopOptimization::VectorizeUse(LoopNode* node,
}
uint32_t HLoopOptimization::GetVectorSizeInBytes() {
- switch (compiler_options_->GetInstructionSet()) {
- case InstructionSet::kArm:
- case InstructionSet::kThumb2:
- return 8; // 64-bit SIMD
- default:
- return 16; // 128-bit SIMD
- }
+ return simd_register_size_;
}
bool HLoopOptimization::TrySetVectorType(DataType::Type type, uint64_t* restrictions) {
@@ -1563,45 +1593,88 @@ bool HLoopOptimization::TrySetVectorType(DataType::Type type, uint64_t* restrict
case DataType::Type::kUint8:
case DataType::Type::kInt8:
*restrictions |= kNoDiv | kNoReduction | kNoDotProd;
- return TrySetVectorLength(8);
+ return TrySetVectorLength(type, 8);
case DataType::Type::kUint16:
case DataType::Type::kInt16:
*restrictions |= kNoDiv | kNoStringCharAt | kNoReduction | kNoDotProd;
- return TrySetVectorLength(4);
+ return TrySetVectorLength(type, 4);
case DataType::Type::kInt32:
*restrictions |= kNoDiv | kNoWideSAD;
- return TrySetVectorLength(2);
+ return TrySetVectorLength(type, 2);
default:
break;
}
return false;
case InstructionSet::kArm64:
- // Allow vectorization for all ARM devices, because Android assumes that
- // ARMv8 AArch64 always supports advanced SIMD (128-bit SIMD).
- switch (type) {
- case DataType::Type::kBool:
- case DataType::Type::kUint8:
- case DataType::Type::kInt8:
- *restrictions |= kNoDiv;
- return TrySetVectorLength(16);
- case DataType::Type::kUint16:
- case DataType::Type::kInt16:
- *restrictions |= kNoDiv;
- return TrySetVectorLength(8);
- case DataType::Type::kInt32:
- *restrictions |= kNoDiv;
- return TrySetVectorLength(4);
- case DataType::Type::kInt64:
- *restrictions |= kNoDiv | kNoMul;
- return TrySetVectorLength(2);
- case DataType::Type::kFloat32:
- *restrictions |= kNoReduction;
- return TrySetVectorLength(4);
- case DataType::Type::kFloat64:
- *restrictions |= kNoReduction;
- return TrySetVectorLength(2);
- default:
- return false;
+ if (IsInPredicatedVectorizationMode()) {
+ // SVE vectorization.
+ CHECK(features->AsArm64InstructionSetFeatures()->HasSVE());
+ size_t vector_length = simd_register_size_ / DataType::Size(type);
+ DCHECK_EQ(simd_register_size_ % DataType::Size(type), 0u);
+ switch (type) {
+ case DataType::Type::kBool:
+ case DataType::Type::kUint8:
+ case DataType::Type::kInt8:
+ *restrictions |= kNoDiv |
+ kNoSignedHAdd |
+ kNoUnsignedHAdd |
+ kNoUnroundedHAdd |
+ kNoSAD;
+ return TrySetVectorLength(type, vector_length);
+ case DataType::Type::kUint16:
+ case DataType::Type::kInt16:
+ *restrictions |= kNoDiv |
+ kNoSignedHAdd |
+ kNoUnsignedHAdd |
+ kNoUnroundedHAdd |
+ kNoSAD |
+ kNoDotProd;
+ return TrySetVectorLength(type, vector_length);
+ case DataType::Type::kInt32:
+ *restrictions |= kNoDiv | kNoSAD;
+ return TrySetVectorLength(type, vector_length);
+ case DataType::Type::kInt64:
+ *restrictions |= kNoDiv | kNoSAD;
+ return TrySetVectorLength(type, vector_length);
+ case DataType::Type::kFloat32:
+ *restrictions |= kNoReduction;
+ return TrySetVectorLength(type, vector_length);
+ case DataType::Type::kFloat64:
+ *restrictions |= kNoReduction;
+ return TrySetVectorLength(type, vector_length);
+ default:
+ break;
+ }
+ return false;
+ } else {
+ // Allow vectorization for all ARM devices, because Android assumes that
+ // ARMv8 AArch64 always supports advanced SIMD (128-bit SIMD).
+ switch (type) {
+ case DataType::Type::kBool:
+ case DataType::Type::kUint8:
+ case DataType::Type::kInt8:
+ *restrictions |= kNoDiv;
+ return TrySetVectorLength(type, 16);
+ case DataType::Type::kUint16:
+ case DataType::Type::kInt16:
+ *restrictions |= kNoDiv;
+ return TrySetVectorLength(type, 8);
+ case DataType::Type::kInt32:
+ *restrictions |= kNoDiv;
+ return TrySetVectorLength(type, 4);
+ case DataType::Type::kInt64:
+ *restrictions |= kNoDiv | kNoMul;
+ return TrySetVectorLength(type, 2);
+ case DataType::Type::kFloat32:
+ *restrictions |= kNoReduction;
+ return TrySetVectorLength(type, 4);
+ case DataType::Type::kFloat64:
+ *restrictions |= kNoReduction;
+ return TrySetVectorLength(type, 2);
+ default:
+ break;
+ }
+ return false;
}
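For SVE the lane count is derived from the code generator's reported register width rather than hard-coded per type; the DCHECK above guarantees the width divides evenly. For example, a 128-bit (16-byte) implementation yields 16 / 2 = 8 lanes of kInt16 and 16 / 8 = 2 lanes of kInt64, while a 256-bit implementation doubles both:

  #include <cstddef>
  size_t VectorLength(size_t simd_register_size, size_t element_size) {
    return simd_register_size / element_size;  // e.g. 32 / 4 = 8 lanes of Int32
  }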
case InstructionSet::kX86:
case InstructionSet::kX86_64:
@@ -1619,7 +1692,7 @@ bool HLoopOptimization::TrySetVectorType(DataType::Type type, uint64_t* restrict
kNoUnroundedHAdd |
kNoSAD |
kNoDotProd;
- return TrySetVectorLength(16);
+ return TrySetVectorLength(type, 16);
case DataType::Type::kUint16:
*restrictions |= kNoDiv |
kNoAbs |
@@ -1627,26 +1700,26 @@ bool HLoopOptimization::TrySetVectorType(DataType::Type type, uint64_t* restrict
kNoUnroundedHAdd |
kNoSAD |
kNoDotProd;
- return TrySetVectorLength(8);
+ return TrySetVectorLength(type, 8);
case DataType::Type::kInt16:
*restrictions |= kNoDiv |
kNoAbs |
kNoSignedHAdd |
kNoUnroundedHAdd |
kNoSAD;
- return TrySetVectorLength(8);
+ return TrySetVectorLength(type, 8);
case DataType::Type::kInt32:
*restrictions |= kNoDiv | kNoSAD;
- return TrySetVectorLength(4);
+ return TrySetVectorLength(type, 4);
case DataType::Type::kInt64:
*restrictions |= kNoMul | kNoDiv | kNoShr | kNoAbs | kNoSAD;
- return TrySetVectorLength(2);
+ return TrySetVectorLength(type, 2);
case DataType::Type::kFloat32:
*restrictions |= kNoReduction;
- return TrySetVectorLength(4);
+ return TrySetVectorLength(type, 4);
case DataType::Type::kFloat64:
*restrictions |= kNoReduction;
- return TrySetVectorLength(2);
+ return TrySetVectorLength(type, 2);
default:
break;
} // switch type
@@ -1657,7 +1730,7 @@ bool HLoopOptimization::TrySetVectorType(DataType::Type type, uint64_t* restrict
} // switch instruction set
}
-bool HLoopOptimization::TrySetVectorLength(uint32_t length) {
+bool HLoopOptimization::TrySetVectorLengthImpl(uint32_t length) {
DCHECK(IsPowerOfTwo(length) && length >= 2u);
// First time set?
if (vector_length_ == 0) {
@@ -1694,6 +1767,15 @@ void HLoopOptimization::GenerateVecInv(HInstruction* org, DataType::Type type) {
vector = new (global_allocator_)
HVecReplicateScalar(global_allocator_, input, type, vector_length_, kNoDexPc);
vector_permanent_map_->Put(org, Insert(vector_preheader_, vector));
+ if (IsInPredicatedVectorizationMode()) {
+ HVecPredSetAll* set_pred = new (global_allocator_) HVecPredSetAll(global_allocator_,
+ graph_->GetIntConstant(1),
+ type,
+ vector_length_,
+ 0u);
+ vector_preheader_->InsertInstructionBefore(set_pred, vector);
+ vector->AsVecOperation()->SetMergingGoverningPredicate(set_pred);
+ }
}
vector_map_->Put(org, vector);
}
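Invariant replication happens in the preheader, outside the reach of the loop's whilelo predicate, so the replicated operation is governed by an all-true predicate instead: HVecPredSetAll over the constant 1 makes it behave as if unpredicated. In terms of the scalar emulation above, this is simply:

  Pred AllTrue() {   // models HVecPredSetAll(graph_->GetIntConstant(1), ...)
    Pred p;
    p.fill(true);
    return p;
  }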
@@ -1822,6 +1904,15 @@ void HLoopOptimization::GenerateVecReductionPhiInputs(HPhi* phi, HInstruction* r
vector_length,
kNoDexPc));
}
+ if (IsInPredicatedVectorizationMode()) {
+ HVecPredSetAll* set_pred = new (global_allocator_) HVecPredSetAll(global_allocator_,
+ graph_->GetIntConstant(1),
+ type,
+ vector_length,
+ 0u);
+ vector_preheader_->InsertInstructionBefore(set_pred, new_init);
+ new_init->AsVecOperation()->SetMergingGoverningPredicate(set_pred);
+ }
} else {
new_init = ReduceAndExtractIfNeeded(new_init);
}
@@ -1853,6 +1944,17 @@ HInstruction* HLoopOptimization::ReduceAndExtractIfNeeded(HInstruction* instruct
instruction = new (global_allocator_) HVecExtractScalar(
global_allocator_, reduce, type, vector_length, 0, kNoDexPc);
exit->InsertInstructionAfter(instruction, reduce);
+
+ if (IsInPredicatedVectorizationMode()) {
+ HVecPredSetAll* set_pred = new (global_allocator_) HVecPredSetAll(global_allocator_,
+ graph_->GetIntConstant(1),
+ type,
+ vector_length,
+ 0u);
+ exit->InsertInstructionBefore(set_pred, reduce);
+ reduce->AsVecOperation()->SetMergingGoverningPredicate(set_pred);
+ instruction->AsVecOperation()->SetMergingGoverningPredicate(set_pred);
+ }
}
}
return instruction;
@@ -1992,7 +2094,8 @@ bool HLoopOptimization::VectorizeHalvingAddIdiom(LoopNode* node,
return false;
}
// Deal with vector restrictions.
- if ((!is_unsigned && HasVectorRestrictions(restrictions, kNoSignedHAdd)) ||
+ if ((is_unsigned && HasVectorRestrictions(restrictions, kNoUnsignedHAdd)) ||
+ (!is_unsigned && HasVectorRestrictions(restrictions, kNoSignedHAdd)) ||
(!is_rounded && HasVectorRestrictions(restrictions, kNoUnroundedHAdd))) {
return false;
}
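This also closes a gap in the restriction check: the unsigned variant was previously not gated at all, while the signed and unrounded forms were. For reference, the halving-add forms the idiom recognizes, computed in a wider type so the intermediate sum cannot overflow:

  #include <cstdint>
  uint8_t HAddU8(uint8_t a, uint8_t b, bool rounded) {
    uint32_t sum = static_cast<uint32_t>(a) + b + (rounded ? 1 : 0);
    return static_cast<uint8_t>(sum >> 1);  // rounded: (a+b+1)>>1, else (a+b)>>1
  }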
@@ -2044,13 +2147,13 @@ bool HLoopOptimization::VectorizeSADIdiom(LoopNode* node,
(reduction_type != DataType::Type::kInt32 && reduction_type != DataType::Type::kInt64)) {
return false;
}
- HInstruction* q = instruction->InputAt(0);
- HInstruction* v = instruction->InputAt(1);
+ HInstruction* acc = instruction->InputAt(0);
+ HInstruction* abs = instruction->InputAt(1);
HInstruction* a = nullptr;
HInstruction* b = nullptr;
- if (v->IsAbs() &&
- v->GetType() == reduction_type &&
- IsSubConst2(graph_, v->InputAt(0), /*out*/ &a, /*out*/ &b)) {
+ if (abs->IsAbs() &&
+ abs->GetType() == reduction_type &&
+ IsSubConst2(graph_, abs->InputAt(0), /*out*/ &a, /*out*/ &b)) {
DCHECK(a != nullptr && b != nullptr);
} else {
return false;
@@ -2076,16 +2179,16 @@ bool HLoopOptimization::VectorizeSADIdiom(LoopNode* node,
// idiomatic operation. Sequential code uses the original scalar expressions.
DCHECK(r != nullptr && s != nullptr);
if (generate_code && vector_mode_ != kVector) { // de-idiom
- r = s = v->InputAt(0);
+ r = s = abs->InputAt(0);
}
- if (VectorizeUse(node, q, generate_code, sub_type, restrictions) &&
+ if (VectorizeUse(node, acc, generate_code, sub_type, restrictions) &&
VectorizeUse(node, r, generate_code, sub_type, restrictions) &&
VectorizeUse(node, s, generate_code, sub_type, restrictions)) {
if (generate_code) {
if (vector_mode_ == kVector) {
vector_map_->Put(instruction, new (global_allocator_) HVecSADAccumulate(
global_allocator_,
- vector_map_->Get(q),
+ vector_map_->Get(acc),
vector_map_->Get(r),
vector_map_->Get(s),
HVecOperation::ToProperType(reduction_type, is_unsigned),
@@ -2093,8 +2196,14 @@ bool HLoopOptimization::VectorizeSADIdiom(LoopNode* node,
kNoDexPc));
MaybeRecordStat(stats_, MethodCompilationStat::kLoopVectorizedIdiom);
} else {
- GenerateVecOp(v, vector_map_->Get(r), nullptr, reduction_type);
- GenerateVecOp(instruction, vector_map_->Get(q), vector_map_->Get(v), reduction_type);
+ // "GenerateVecOp()" must not be called more than once for each original loop body
+ // instruction. As the SAD idiom processes both the "current" instruction ("instruction")
+ // and its ABS input in one go, we must check, for the scalar case, that the ABS
+ // instruction has not yet been processed.
+ if (vector_map_->find(abs) == vector_map_->end()) {
+ GenerateVecOp(abs, vector_map_->Get(r), nullptr, reduction_type);
+ }
+ GenerateVecOp(instruction, vector_map_->Get(acc), vector_map_->Get(abs), reduction_type);
}
}
return true;
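The renames (q/v to acc/abs) make the matched shape explicit, and the vector_map_ lookup keeps GenerateVecOp from cloning the ABS a second time when the sequential (cleanup) path revisits it. The scalar shape of the SAD idiom being matched, for reference:

  #include <cstddef>
  #include <cstdint>
  // Sum of absolute differences: acc = acc + |a[i] - b[i]|
  int64_t Sad(const int32_t* a, const int32_t* b, size_t n) {
    int64_t acc = 0;
    for (size_t i = 0; i < n; ++i) {
      int64_t diff = static_cast<int64_t>(a[i]) - b[i];
      acc += (diff < 0) ? -diff : diff;
    }
    return acc;
  }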
@@ -2116,20 +2225,20 @@ bool HLoopOptimization::VectorizeDotProdIdiom(LoopNode* node,
return false;
}
- HInstruction* q = instruction->InputAt(0);
- HInstruction* v = instruction->InputAt(1);
- if (!v->IsMul() || v->GetType() != reduction_type) {
+ HInstruction* const acc = instruction->InputAt(0);
+ HInstruction* const mul = instruction->InputAt(1);
+ if (!mul->IsMul() || mul->GetType() != reduction_type) {
return false;
}
- HInstruction* a = v->InputAt(0);
- HInstruction* b = v->InputAt(1);
- HInstruction* r = a;
- HInstruction* s = b;
- DataType::Type op_type = GetNarrowerType(a, b);
+ HInstruction* const mul_left = mul->InputAt(0);
+ HInstruction* const mul_right = mul->InputAt(1);
+ HInstruction* r = mul_left;
+ HInstruction* s = mul_right;
+ DataType::Type op_type = GetNarrowerType(mul_left, mul_right);
bool is_unsigned = false;
- if (!IsNarrowerOperands(a, b, op_type, &r, &s, &is_unsigned)) {
+ if (!IsNarrowerOperands(mul_left, mul_right, op_type, &r, &s, &is_unsigned)) {
return false;
}
op_type = HVecOperation::ToProperType(op_type, is_unsigned);
@@ -2143,17 +2252,17 @@ bool HLoopOptimization::VectorizeDotProdIdiom(LoopNode* node,
// Accept dot product idiom for vectorizable operands. Vectorized code uses the shorthand
// idiomatic operation. Sequential code uses the original scalar expressions.
if (generate_code && vector_mode_ != kVector) { // de-idiom
- r = a;
- s = b;
+ r = mul_left;
+ s = mul_right;
}
- if (VectorizeUse(node, q, generate_code, op_type, restrictions) &&
+ if (VectorizeUse(node, acc, generate_code, op_type, restrictions) &&
VectorizeUse(node, r, generate_code, op_type, restrictions) &&
VectorizeUse(node, s, generate_code, op_type, restrictions)) {
if (generate_code) {
if (vector_mode_ == kVector) {
vector_map_->Put(instruction, new (global_allocator_) HVecDotProd(
global_allocator_,
- vector_map_->Get(q),
+ vector_map_->Get(acc),
vector_map_->Get(r),
vector_map_->Get(s),
reduction_type,
@@ -2162,8 +2271,14 @@ bool HLoopOptimization::VectorizeDotProdIdiom(LoopNode* node,
kNoDexPc));
MaybeRecordStat(stats_, MethodCompilationStat::kLoopVectorizedIdiom);
} else {
- GenerateVecOp(v, vector_map_->Get(r), vector_map_->Get(s), reduction_type);
- GenerateVecOp(instruction, vector_map_->Get(q), vector_map_->Get(v), reduction_type);
+ // "GenerateVecOp()" must not be called more than once for each original loop body
+ // instruction. As the DotProd idiom processes both the "current" instruction ("instruction")
+ // and its MUL input in one go, we must check, for the scalar case, that the MUL
+ // instruction has not yet been processed.
+ if (vector_map_->find(mul) == vector_map_->end()) {
+ GenerateVecOp(mul, vector_map_->Get(r), vector_map_->Get(s), reduction_type);
+ }
+ GenerateVecOp(instruction, vector_map_->Get(acc), vector_map_->Get(mul), reduction_type);
}
}
return true;
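Same treatment for the dot product idiom: acc accumulates a[i] * b[i] over operands narrower than the accumulator, and the vector_map_ guard ensures the MUL is de-idiomized at most once. Scalar shape for reference:

  #include <cstddef>
  #include <cstdint>
  // Dot product reduction over narrower operands: acc = acc + a[i] * b[i]
  int32_t DotProd(const int8_t* a, const int8_t* b, size_t n) {
    int32_t acc = 0;
    for (size_t i = 0; i < n; ++i) {
      acc += static_cast<int32_t>(a[i]) * b[i];
    }
    return acc;
  }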
@@ -2191,12 +2306,12 @@ Alignment HLoopOptimization::ComputeAlignment(HInstruction* offset,
return Alignment(DataType::Size(type), 0);
}
-void HLoopOptimization::SetAlignmentStrategy(uint32_t peeling_votes[],
+void HLoopOptimization::SetAlignmentStrategy(const ScopedArenaVector<uint32_t>& peeling_votes,
const ArrayReference* peeling_candidate) {
// Current heuristic: pick the best static loop peeling factor, if any,
// or otherwise use dynamic loop peeling on suggested peeling candidate.
uint32_t max_vote = 0;
- for (int32_t i = 0; i < 16; i++) {
+ for (size_t i = 0; i < peeling_votes.size(); i++) {
if (peeling_votes[i] > max_vote) {
max_vote = peeling_votes[i];
vector_static_peeling_factor_ = i;