summaryrefslogtreecommitdiff
path: root/compiler/optimizing/loop_optimization.cc
diff options
context:
space:
mode:
Diffstat (limited to 'compiler/optimizing/loop_optimization.cc')
-rw-r--r--compiler/optimizing/loop_optimization.cc765
1 files changed, 496 insertions, 269 deletions
diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc
index 899496328eb..1462404932e 100644
--- a/compiler/optimizing/loop_optimization.cc
+++ b/compiler/optimizing/loop_optimization.cc
@@ -33,8 +33,8 @@ namespace art {
// Enables vectorization (SIMDization) in the loop optimizer.
static constexpr bool kEnableVectorization = true;
-// No loop unrolling factor (just one copy of the loop-body).
-static constexpr uint32_t kNoUnrollingFactor = 1;
+// Enables scalar loop unrolling in the loop optimizer.
+static constexpr bool kEnableScalarPeelingUnrolling = false;
//
// Static helpers.
@@ -153,6 +153,18 @@ static bool IsSignExtensionAndGet(HInstruction* instruction,
return false;
}
}
+ // A MIN-MAX on narrower operands qualifies as well
+ // (returning the operator itself).
+ if (instruction->IsMin() || instruction->IsMax()) {
+ HBinaryOperation* min_max = instruction->AsBinaryOperation();
+ DCHECK(min_max->GetType() == DataType::Type::kInt32 ||
+ min_max->GetType() == DataType::Type::kInt64);
+ if (IsSignExtensionAndGet(min_max->InputAt(0), type, operand) &&
+ IsSignExtensionAndGet(min_max->InputAt(1), type, operand)) {
+ *operand = min_max;
+ return true;
+ }
+ }
return false;
}
@@ -216,6 +228,18 @@ static bool IsZeroExtensionAndGet(HInstruction* instruction,
return false;
}
}
+ // A MIN-MAX on narrower operands qualifies as well
+ // (returning the operator itself).
+ if (instruction->IsMin() || instruction->IsMax()) {
+ HBinaryOperation* min_max = instruction->AsBinaryOperation();
+ DCHECK(min_max->GetType() == DataType::Type::kInt32 ||
+ min_max->GetType() == DataType::Type::kInt64);
+ if (IsZeroExtensionAndGet(min_max->InputAt(0), type, operand) &&
+ IsZeroExtensionAndGet(min_max->InputAt(1), type, operand)) {
+ *operand = min_max;
+ return true;
+ }
+ }
return false;
}
@@ -227,6 +251,7 @@ static bool IsNarrowerOperands(HInstruction* a,
/*out*/ HInstruction** r,
/*out*/ HInstruction** s,
/*out*/ bool* is_unsigned) {
+ DCHECK(a != nullptr && b != nullptr);
// Look for a matching sign extension.
DataType::Type stype = HVecOperation::ToSignedType(type);
if (IsSignExtensionAndGet(a, stype, r) && IsSignExtensionAndGet(b, stype, s)) {
@@ -247,6 +272,7 @@ static bool IsNarrowerOperand(HInstruction* a,
DataType::Type type,
/*out*/ HInstruction** r,
/*out*/ bool* is_unsigned) {
+ DCHECK(a != nullptr);
// Look for a matching sign extension.
DataType::Type stype = HVecOperation::ToSignedType(type);
if (IsSignExtensionAndGet(a, stype, r)) {
@@ -270,20 +296,28 @@ static uint32_t GetOtherVL(DataType::Type other_type, DataType::Type vector_type
return vl >> (DataType::SizeShift(other_type) - DataType::SizeShift(vector_type));
}
-// Detect up to two instructions a and b, and an acccumulated constant c.
-static bool IsAddConstHelper(HInstruction* instruction,
- /*out*/ HInstruction** a,
- /*out*/ HInstruction** b,
- /*out*/ int64_t* c,
- int32_t depth) {
- static constexpr int32_t kMaxDepth = 8; // don't search too deep
+// Detect up to two added operands a and b and an acccumulated constant c.
+static bool IsAddConst(HInstruction* instruction,
+ /*out*/ HInstruction** a,
+ /*out*/ HInstruction** b,
+ /*out*/ int64_t* c,
+ int32_t depth = 8) { // don't search too deep
int64_t value = 0;
+ // Enter add/sub while still within reasonable depth.
+ if (depth > 0) {
+ if (instruction->IsAdd()) {
+ return IsAddConst(instruction->InputAt(0), a, b, c, depth - 1) &&
+ IsAddConst(instruction->InputAt(1), a, b, c, depth - 1);
+ } else if (instruction->IsSub() &&
+ IsInt64AndGet(instruction->InputAt(1), &value)) {
+ *c -= value;
+ return IsAddConst(instruction->InputAt(0), a, b, c, depth - 1);
+ }
+ }
+ // Otherwise, deal with leaf nodes.
if (IsInt64AndGet(instruction, &value)) {
*c += value;
return true;
- } else if (instruction->IsAdd() && depth <= kMaxDepth) {
- return IsAddConstHelper(instruction->InputAt(0), a, b, c, depth + 1) &&
- IsAddConstHelper(instruction->InputAt(1), a, b, c, depth + 1);
} else if (*a == nullptr) {
*a = instruction;
return true;
@@ -291,72 +325,170 @@ static bool IsAddConstHelper(HInstruction* instruction,
*b = instruction;
return true;
}
- return false; // too many non-const operands
+ return false; // too many operands
}
-// Detect a + b + c for an optional constant c.
-static bool IsAddConst(HInstruction* instruction,
- /*out*/ HInstruction** a,
- /*out*/ HInstruction** b,
- /*out*/ int64_t* c) {
- if (instruction->IsAdd()) {
- // Try to find a + b and accumulated c.
- if (IsAddConstHelper(instruction->InputAt(0), a, b, c, /*depth*/ 0) &&
- IsAddConstHelper(instruction->InputAt(1), a, b, c, /*depth*/ 0) &&
- *b != nullptr) {
- return true;
+// Detect a + b + c with optional constant c.
+static bool IsAddConst2(HGraph* graph,
+ HInstruction* instruction,
+ /*out*/ HInstruction** a,
+ /*out*/ HInstruction** b,
+ /*out*/ int64_t* c) {
+ if (IsAddConst(instruction, a, b, c) && *a != nullptr) {
+ if (*b == nullptr) {
+ // Constant is usually already present, unless accumulated.
+ *b = graph->GetConstant(instruction->GetType(), (*c));
+ *c = 0;
}
- // Found a + b.
+ return true;
+ }
+ return false;
+}
+
+// Detect a direct a - b or a hidden a - (-c).
+static bool IsSubConst2(HGraph* graph,
+ HInstruction* instruction,
+ /*out*/ HInstruction** a,
+ /*out*/ HInstruction** b) {
+ int64_t c = 0;
+ if (instruction->IsSub()) {
*a = instruction->InputAt(0);
*b = instruction->InputAt(1);
- *c = 0;
+ return true;
+ } else if (IsAddConst(instruction, a, b, &c) && *a != nullptr && *b == nullptr) {
+ // Constant for the hidden subtraction.
+ *b = graph->GetConstant(instruction->GetType(), -c);
return true;
}
return false;
}
-// Detect a + c for constant c.
-static bool IsAddConst(HInstruction* instruction,
- /*out*/ HInstruction** a,
- /*out*/ int64_t* c) {
- if (instruction->IsAdd()) {
- if (IsInt64AndGet(instruction->InputAt(0), c)) {
- *a = instruction->InputAt(1);
- return true;
- } else if (IsInt64AndGet(instruction->InputAt(1), c)) {
- *a = instruction->InputAt(0);
- return true;
+// Detect clipped [lo, hi] range for nested MIN-MAX operations on a clippee,
+// such as MIN(hi, MAX(lo, clippee)) for an arbitrary clippee expression.
+// Example: MIN(10, MIN(20, MAX(0, x))) yields [0, 10] with clippee x.
+static HInstruction* FindClippee(HInstruction* instruction,
+ /*out*/ int64_t* lo,
+ /*out*/ int64_t* hi) {
+ // Iterate into MIN(.., c)-MAX(.., c) expressions and 'tighten' the range [lo, hi].
+ while (instruction->IsMin() || instruction->IsMax()) {
+ HBinaryOperation* min_max = instruction->AsBinaryOperation();
+ DCHECK(min_max->GetType() == DataType::Type::kInt32 ||
+ min_max->GetType() == DataType::Type::kInt64);
+ // Process the constant.
+ HConstant* right = min_max->GetConstantRight();
+ if (right == nullptr) {
+ break;
+ } else if (instruction->IsMin()) {
+ *hi = std::min(*hi, Int64FromConstant(right));
+ } else {
+ *lo = std::max(*lo, Int64FromConstant(right));
}
+ instruction = min_max->GetLeastConstantLeft();
+ }
+ // Iteration ends in any other expression (possibly MIN/MAX without constant).
+ // This leaf expression is the clippee with range [lo, hi].
+ return instruction;
+}
+
+// Set value range for type (or fail).
+static bool CanSetRange(DataType::Type type,
+ /*out*/ int64_t* uhi,
+ /*out*/ int64_t* slo,
+ /*out*/ int64_t* shi) {
+ if (DataType::Size(type) == 1) {
+ *uhi = std::numeric_limits<uint8_t>::max();
+ *slo = std::numeric_limits<int8_t>::min();
+ *shi = std::numeric_limits<int8_t>::max();
+ return true;
+ } else if (DataType::Size(type) == 2) {
+ *uhi = std::numeric_limits<uint16_t>::max();
+ *slo = std::numeric_limits<int16_t>::min();
+ *shi = std::numeric_limits<int16_t>::max();
+ return true;
}
return false;
}
+// Accept various saturated addition forms.
+static bool IsSaturatedAdd(HInstruction* a,
+ HInstruction* b,
+ DataType::Type type,
+ int64_t lo,
+ int64_t hi,
+ bool is_unsigned) {
+ int64_t ulo = 0, uhi = 0, slo = 0, shi = 0;
+ if (!CanSetRange(type, &uhi, &slo, &shi)) {
+ return false;
+ }
+ // Tighten the range for signed single clipping on constant.
+ if (!is_unsigned) {
+ int64_t c = 0;
+ if (IsInt64AndGet(a, &c) || IsInt64AndGet(b, &c)) {
+ // For c in proper range and narrower operand r:
+ // MIN(r + c, 127) c > 0
+ // or MAX(r + c, -128) c < 0 (and possibly redundant bound).
+ if (0 < c && c <= shi && hi == shi) {
+ if (lo <= (slo + c)) {
+ return true;
+ }
+ } else if (slo <= c && c < 0 && lo == slo) {
+ if (hi >= (shi + c)) {
+ return true;
+ }
+ }
+ }
+ }
+ // Detect for narrower operands r and s:
+ // MIN(r + s, 255) => SAT_ADD_unsigned
+ // MAX(MIN(r + s, 127), -128) => SAT_ADD_signed.
+ return is_unsigned ? (lo <= ulo && hi == uhi) : (lo == slo && hi == shi);
+}
+
+// Accept various saturated subtraction forms.
+static bool IsSaturatedSub(HInstruction* a,
+ DataType::Type type,
+ int64_t lo,
+ int64_t hi,
+ bool is_unsigned) {
+ int64_t ulo = 0, uhi = 0, slo = 0, shi = 0;
+ if (!CanSetRange(type, &uhi, &slo, &shi)) {
+ return false;
+ }
+ // Tighten the range for signed single clipping on constant.
+ if (!is_unsigned) {
+ int64_t c = 0;
+ if (IsInt64AndGet(a, /*out*/ &c)) {
+ // For c in proper range and narrower operand r:
+ // MIN(c - r, 127) c > 0
+ // or MAX(c - r, -128) c < 0 (and possibly redundant bound).
+ if (0 < c && c <= shi && hi == shi) {
+ if (lo <= (c - shi)) {
+ return true;
+ }
+ } else if (slo <= c && c < 0 && lo == slo) {
+ if (hi >= (c - slo)) {
+ return true;
+ }
+ }
+ }
+ }
+ // Detect for narrower operands r and s:
+ // MAX(r - s, 0) => SAT_SUB_unsigned
+ // MIN(MAX(r - s, -128), 127) => SAT_ADD_signed.
+ return is_unsigned ? (lo == ulo && hi >= uhi) : (lo == slo && hi == shi);
+}
+
// Detect reductions of the following forms,
// x = x_phi + ..
// x = x_phi - ..
-// x = max(x_phi, ..)
// x = min(x_phi, ..)
+// x = max(x_phi, ..)
static bool HasReductionFormat(HInstruction* reduction, HInstruction* phi) {
- if (reduction->IsAdd()) {
+ if (reduction->IsAdd() || reduction->IsMin() || reduction->IsMax()) {
return (reduction->InputAt(0) == phi && reduction->InputAt(1) != phi) ||
(reduction->InputAt(0) != phi && reduction->InputAt(1) == phi);
} else if (reduction->IsSub()) {
return (reduction->InputAt(0) == phi && reduction->InputAt(1) != phi);
- } else if (reduction->IsInvokeStaticOrDirect()) {
- switch (reduction->AsInvokeStaticOrDirect()->GetIntrinsic()) {
- case Intrinsics::kMathMinIntInt:
- case Intrinsics::kMathMinLongLong:
- case Intrinsics::kMathMinFloatFloat:
- case Intrinsics::kMathMinDoubleDouble:
- case Intrinsics::kMathMaxIntInt:
- case Intrinsics::kMathMaxLongLong:
- case Intrinsics::kMathMaxFloatFloat:
- case Intrinsics::kMathMaxDoubleDouble:
- return (reduction->InputAt(0) == phi && reduction->InputAt(1) != phi) ||
- (reduction->InputAt(0) != phi && reduction->InputAt(1) == phi);
- default:
- return false;
- }
}
return false;
}
@@ -401,6 +533,43 @@ static bool CheckInductionSetFullyRemoved(ScopedArenaSet<HInstruction*>* iset) {
return true;
}
+// Tries to statically evaluate condition of the specified "HIf" for other condition checks.
+static void TryToEvaluateIfCondition(HIf* instruction, HGraph* graph) {
+ HInstruction* cond = instruction->InputAt(0);
+
+ // If a condition 'cond' is evaluated in an HIf instruction then in the successors of the
+ // IF_BLOCK we statically know the value of the condition 'cond' (TRUE in TRUE_SUCC, FALSE in
+ // FALSE_SUCC). Using that we can replace another evaluation (use) EVAL of the same 'cond'
+ // with TRUE value (FALSE value) if every path from the ENTRY_BLOCK to EVAL_BLOCK contains the
+ // edge HIF_BLOCK->TRUE_SUCC (HIF_BLOCK->FALSE_SUCC).
+ // if (cond) { if(cond) {
+ // if (cond) {} if (1) {}
+ // } else { =======> } else {
+ // if (cond) {} if (0) {}
+ // } }
+ if (!cond->IsConstant()) {
+ HBasicBlock* true_succ = instruction->IfTrueSuccessor();
+ HBasicBlock* false_succ = instruction->IfFalseSuccessor();
+
+ DCHECK_EQ(true_succ->GetPredecessors().size(), 1u);
+ DCHECK_EQ(false_succ->GetPredecessors().size(), 1u);
+
+ const HUseList<HInstruction*>& uses = cond->GetUses();
+ for (auto it = uses.begin(), end = uses.end(); it != end; /* ++it below */) {
+ HInstruction* user = it->GetUser();
+ size_t index = it->GetIndex();
+ HBasicBlock* user_block = user->GetBlock();
+ // Increment `it` now because `*it` may disappear thanks to user->ReplaceInput().
+ ++it;
+ if (true_succ->Dominates(user_block)) {
+ user->ReplaceInput(graph->GetIntConstant(1), index);
+ } else if (false_succ->Dominates(user_block)) {
+ user->ReplaceInput(graph->GetIntConstant(0), index);
+ }
+ }
+ }
+}
+
//
// Public methods.
//
@@ -432,7 +601,11 @@ HLoopOptimization::HLoopOptimization(HGraph* graph,
vector_preheader_(nullptr),
vector_header_(nullptr),
vector_body_(nullptr),
- vector_index_(nullptr) {
+ vector_index_(nullptr),
+ arch_loop_helper_(ArchDefaultLoopHelper::Create(compiler_driver_ != nullptr
+ ? compiler_driver_->GetInstructionSet()
+ : InstructionSet::kNone,
+ global_allocator_)) {
}
void HLoopOptimization::Run() {
@@ -643,7 +816,7 @@ void HLoopOptimization::SimplifyBlocks(LoopNode* node) {
}
}
-bool HLoopOptimization::OptimizeInnerLoop(LoopNode* node) {
+bool HLoopOptimization::TryOptimizeInnerLoopFinite(LoopNode* node) {
HBasicBlock* header = node->loop_info->GetHeader();
HBasicBlock* preheader = node->loop_info->GetPreHeader();
// Ensure loop header logic is finite.
@@ -713,6 +886,103 @@ bool HLoopOptimization::OptimizeInnerLoop(LoopNode* node) {
return false;
}
+bool HLoopOptimization::OptimizeInnerLoop(LoopNode* node) {
+ return TryOptimizeInnerLoopFinite(node) ||
+ TryPeelingForLoopInvariantExitsElimination(node) ||
+ TryUnrollingForBranchPenaltyReduction(node);
+}
+
+
+
+//
+// Loop unrolling: generic part methods.
+//
+
+bool HLoopOptimization::TryUnrollingForBranchPenaltyReduction(LoopNode* node) {
+ // Don't run peeling/unrolling if compiler_driver_ is nullptr (i.e., running under tests)
+ // as InstructionSet is needed.
+ if (!kEnableScalarPeelingUnrolling || compiler_driver_ == nullptr) {
+ return false;
+ }
+
+ HLoopInformation* loop_info = node->loop_info;
+ int64_t trip_count = 0;
+ // Only unroll loops with a known tripcount.
+ if (!induction_range_.HasKnownTripCount(loop_info, &trip_count)) {
+ return false;
+ }
+
+ uint32_t unrolling_factor = arch_loop_helper_->GetScalarUnrollingFactor(loop_info, trip_count);
+ if (unrolling_factor == kNoUnrollingFactor) {
+ return false;
+ }
+
+ LoopAnalysisInfo loop_analysis_info(loop_info);
+ LoopAnalysis::CalculateLoopBasicProperties(loop_info, &loop_analysis_info);
+
+ // Check "IsLoopClonable" last as it can be time-consuming.
+ if (arch_loop_helper_->IsLoopTooBigForScalarPeelingUnrolling(&loop_analysis_info) ||
+ (loop_analysis_info.GetNumberOfExits() > 1) ||
+ loop_analysis_info.HasInstructionsPreventingScalarUnrolling() ||
+ !PeelUnrollHelper::IsLoopClonable(loop_info)) {
+ return false;
+ }
+
+ // TODO: support other unrolling factors.
+ DCHECK_EQ(unrolling_factor, 2u);
+
+ // Perform unrolling.
+ PeelUnrollSimpleHelper helper(loop_info);
+ helper.DoUnrolling();
+
+ // Remove the redundant loop check after unrolling.
+ HIf* copy_hif =
+ helper.GetBasicBlockMap()->Get(loop_info->GetHeader())->GetLastInstruction()->AsIf();
+ int32_t constant = loop_info->Contains(*copy_hif->IfTrueSuccessor()) ? 1 : 0;
+ copy_hif->ReplaceInput(graph_->GetIntConstant(constant), 0u);
+
+ return true;
+}
+
+bool HLoopOptimization::TryPeelingForLoopInvariantExitsElimination(LoopNode* node) {
+ // Don't run peeling/unrolling if compiler_driver_ is nullptr (i.e., running under tests)
+ // as InstructionSet is needed.
+ if (!kEnableScalarPeelingUnrolling || compiler_driver_ == nullptr) {
+ return false;
+ }
+
+ HLoopInformation* loop_info = node->loop_info;
+ // Check 'IsLoopClonable' the last as it might be time-consuming.
+ if (!arch_loop_helper_->IsLoopPeelingEnabled()) {
+ return false;
+ }
+
+ LoopAnalysisInfo loop_analysis_info(loop_info);
+ LoopAnalysis::CalculateLoopBasicProperties(loop_info, &loop_analysis_info);
+
+ // Check "IsLoopClonable" last as it can be time-consuming.
+ if (arch_loop_helper_->IsLoopTooBigForScalarPeelingUnrolling(&loop_analysis_info) ||
+ loop_analysis_info.HasInstructionsPreventingScalarPeeling() ||
+ !LoopAnalysis::HasLoopAtLeastOneInvariantExit(loop_info) ||
+ !PeelUnrollHelper::IsLoopClonable(loop_info)) {
+ return false;
+ }
+
+ // Perform peeling.
+ PeelUnrollSimpleHelper helper(loop_info);
+ helper.DoPeeling();
+
+ const SuperblockCloner::HInstructionMap* hir_map = helper.GetInstructionMap();
+ for (auto entry : *hir_map) {
+ HInstruction* copy = entry.second;
+ if (copy->IsIf()) {
+ TryToEvaluateIfCondition(copy->AsIf(), graph_);
+ }
+ }
+
+ return true;
+}
+
//
// Loop vectorization. The implementation is based on the book by Aart J.C. Bik:
// "The Software Vectorization Handbook. Applying Multimedia Extensions for Maximum Performance."
@@ -843,7 +1113,8 @@ void HLoopOptimization::Vectorize(LoopNode* node,
HBasicBlock* preheader = node->loop_info->GetPreHeader();
// Pick a loop unrolling factor for the vector loop.
- uint32_t unroll = GetUnrollingFactor(block, trip_count);
+ uint32_t unroll = arch_loop_helper_->GetSIMDUnrollingFactor(
+ block, trip_count, MaxNumberPeeled(), vector_length_);
uint32_t chunk = vector_length_ * unroll;
DCHECK(trip_count == 0 || (trip_count >= MaxNumberPeeled() + chunk));
@@ -1082,6 +1353,11 @@ bool HLoopOptimization::VectorizeDef(LoopNode* node,
HInstruction* index = instruction->InputAt(1);
HInstruction* value = instruction->InputAt(2);
HInstruction* offset = nullptr;
+ // For narrow types, explicit type conversion may have been
+ // optimized way, so set the no hi bits restriction here.
+ if (DataType::Size(type) <= 2) {
+ restrictions |= kNoHiBits;
+ }
if (TrySetVectorType(type, &restrictions) &&
node->loop_info->IsDefinedOutOfTheLoop(base) &&
induction_range_.IsUnitStride(instruction, index, graph_, &offset) &&
@@ -1124,7 +1400,6 @@ bool HLoopOptimization::VectorizeDef(LoopNode* node,
return !IsUsedOutsideLoop(node->loop_info, instruction) && !instruction->DoesAnyWrite();
}
-// TODO: saturation arithmetic.
bool HLoopOptimization::VectorizeUse(LoopNode* node,
HInstruction* instruction,
bool generate_code,
@@ -1297,80 +1572,62 @@ bool HLoopOptimization::VectorizeUse(LoopNode* node,
return true;
}
}
- } else if (instruction->IsInvokeStaticOrDirect()) {
- // Accept particular intrinsics.
- HInvokeStaticOrDirect* invoke = instruction->AsInvokeStaticOrDirect();
- switch (invoke->GetIntrinsic()) {
- case Intrinsics::kMathAbsInt:
- case Intrinsics::kMathAbsLong:
- case Intrinsics::kMathAbsFloat:
- case Intrinsics::kMathAbsDouble: {
- // Deal with vector restrictions.
- HInstruction* opa = instruction->InputAt(0);
- HInstruction* r = opa;
- bool is_unsigned = false;
- if (HasVectorRestrictions(restrictions, kNoAbs)) {
- return false;
- } else if (HasVectorRestrictions(restrictions, kNoHiBits) &&
- (!IsNarrowerOperand(opa, type, &r, &is_unsigned) || is_unsigned)) {
- return false; // reject, unless operand is sign-extension narrower
- }
- // Accept ABS(x) for vectorizable operand.
- DCHECK(r != nullptr);
- if (generate_code && vector_mode_ != kVector) { // de-idiom
- r = opa;
- }
- if (VectorizeUse(node, r, generate_code, type, restrictions)) {
- if (generate_code) {
- GenerateVecOp(instruction,
- vector_map_->Get(r),
- nullptr,
- HVecOperation::ToProperType(type, is_unsigned));
- }
- return true;
- }
- return false;
+ } else if (instruction->IsAbs()) {
+ // Deal with vector restrictions.
+ HInstruction* opa = instruction->InputAt(0);
+ HInstruction* r = opa;
+ bool is_unsigned = false;
+ if (HasVectorRestrictions(restrictions, kNoAbs)) {
+ return false;
+ } else if (HasVectorRestrictions(restrictions, kNoHiBits) &&
+ (!IsNarrowerOperand(opa, type, &r, &is_unsigned) || is_unsigned)) {
+ return false; // reject, unless operand is sign-extension narrower
+ }
+ // Accept ABS(x) for vectorizable operand.
+ DCHECK(r != nullptr);
+ if (generate_code && vector_mode_ != kVector) { // de-idiom
+ r = opa;
+ }
+ if (VectorizeUse(node, r, generate_code, type, restrictions)) {
+ if (generate_code) {
+ GenerateVecOp(instruction,
+ vector_map_->Get(r),
+ nullptr,
+ HVecOperation::ToProperType(type, is_unsigned));
}
- case Intrinsics::kMathMinIntInt:
- case Intrinsics::kMathMinLongLong:
- case Intrinsics::kMathMinFloatFloat:
- case Intrinsics::kMathMinDoubleDouble:
- case Intrinsics::kMathMaxIntInt:
- case Intrinsics::kMathMaxLongLong:
- case Intrinsics::kMathMaxFloatFloat:
- case Intrinsics::kMathMaxDoubleDouble: {
- // Deal with vector restrictions.
- HInstruction* opa = instruction->InputAt(0);
- HInstruction* opb = instruction->InputAt(1);
- HInstruction* r = opa;
- HInstruction* s = opb;
- bool is_unsigned = false;
- if (HasVectorRestrictions(restrictions, kNoMinMax)) {
- return false;
- } else if (HasVectorRestrictions(restrictions, kNoHiBits) &&
- !IsNarrowerOperands(opa, opb, type, &r, &s, &is_unsigned)) {
- return false; // reject, unless all operands are same-extension narrower
- }
- // Accept MIN/MAX(x, y) for vectorizable operands.
- DCHECK(r != nullptr);
- DCHECK(s != nullptr);
- if (generate_code && vector_mode_ != kVector) { // de-idiom
- r = opa;
- s = opb;
- }
- if (VectorizeUse(node, r, generate_code, type, restrictions) &&
- VectorizeUse(node, s, generate_code, type, restrictions)) {
- if (generate_code) {
- GenerateVecOp(
- instruction, vector_map_->Get(r), vector_map_->Get(s), type, is_unsigned);
- }
- return true;
- }
- return false;
+ return true;
+ }
+ } else if (instruction->IsMin() || instruction->IsMax()) {
+ // Recognize saturation arithmetic.
+ if (VectorizeSaturationIdiom(node, instruction, generate_code, type, restrictions)) {
+ return true;
+ }
+ // Deal with vector restrictions.
+ HInstruction* opa = instruction->InputAt(0);
+ HInstruction* opb = instruction->InputAt(1);
+ HInstruction* r = opa;
+ HInstruction* s = opb;
+ bool is_unsigned = false;
+ if (HasVectorRestrictions(restrictions, kNoMinMax)) {
+ return false;
+ } else if (HasVectorRestrictions(restrictions, kNoHiBits) &&
+ !IsNarrowerOperands(opa, opb, type, &r, &s, &is_unsigned)) {
+ return false; // reject, unless all operands are same-extension narrower
+ }
+ // Accept MIN/MAX(x, y) for vectorizable operands.
+ DCHECK(r != nullptr && s != nullptr);
+ if (generate_code && vector_mode_ != kVector) { // de-idiom
+ r = opa;
+ s = opb;
+ }
+ if (VectorizeUse(node, r, generate_code, type, restrictions) &&
+ VectorizeUse(node, s, generate_code, type, restrictions)) {
+ if (generate_code) {
+ GenerateVecOp(
+ instruction, vector_map_->Get(r), vector_map_->Get(s), type, is_unsigned);
}
- default:
- return false;
- } // switch
+ return true;
+ }
}
return false;
}
@@ -1475,11 +1732,11 @@ bool HLoopOptimization::TrySetVectorType(DataType::Type type, uint64_t* restrict
case DataType::Type::kBool:
case DataType::Type::kUint8:
case DataType::Type::kInt8:
- *restrictions |= kNoDiv;
+ *restrictions |= kNoDiv | kNoSaturation;
return TrySetVectorLength(16);
case DataType::Type::kUint16:
case DataType::Type::kInt16:
- *restrictions |= kNoDiv | kNoStringCharAt;
+ *restrictions |= kNoDiv | kNoSaturation | kNoStringCharAt;
return TrySetVectorLength(8);
case DataType::Type::kInt32:
*restrictions |= kNoDiv;
@@ -1504,11 +1761,11 @@ bool HLoopOptimization::TrySetVectorType(DataType::Type type, uint64_t* restrict
case DataType::Type::kBool:
case DataType::Type::kUint8:
case DataType::Type::kInt8:
- *restrictions |= kNoDiv;
+ *restrictions |= kNoDiv | kNoSaturation;
return TrySetVectorLength(16);
case DataType::Type::kUint16:
case DataType::Type::kInt16:
- *restrictions |= kNoDiv | kNoStringCharAt;
+ *restrictions |= kNoDiv | kNoSaturation | kNoStringCharAt;
return TrySetVectorLength(8);
case DataType::Type::kInt32:
*restrictions |= kNoDiv;
@@ -1811,83 +2068,29 @@ void HLoopOptimization::GenerateVecOp(HInstruction* org,
GENERATE_VEC(
new (global_allocator_) HVecUShr(global_allocator_, opa, opb, type, vector_length_, dex_pc),
new (global_allocator_) HUShr(org_type, opa, opb, dex_pc));
- case HInstruction::kInvokeStaticOrDirect: {
- HInvokeStaticOrDirect* invoke = org->AsInvokeStaticOrDirect();
- if (vector_mode_ == kVector) {
- switch (invoke->GetIntrinsic()) {
- case Intrinsics::kMathAbsInt:
- case Intrinsics::kMathAbsLong:
- case Intrinsics::kMathAbsFloat:
- case Intrinsics::kMathAbsDouble:
- DCHECK(opb == nullptr);
- vector = new (global_allocator_)
- HVecAbs(global_allocator_, opa, type, vector_length_, dex_pc);
- break;
- case Intrinsics::kMathMinIntInt:
- case Intrinsics::kMathMinLongLong:
- case Intrinsics::kMathMinFloatFloat:
- case Intrinsics::kMathMinDoubleDouble: {
- vector = new (global_allocator_)
- HVecMin(global_allocator_,
- opa,
- opb,
- HVecOperation::ToProperType(type, is_unsigned),
- vector_length_,
- dex_pc);
- break;
- }
- case Intrinsics::kMathMaxIntInt:
- case Intrinsics::kMathMaxLongLong:
- case Intrinsics::kMathMaxFloatFloat:
- case Intrinsics::kMathMaxDoubleDouble: {
- vector = new (global_allocator_)
- HVecMax(global_allocator_,
- opa,
- opb,
- HVecOperation::ToProperType(type, is_unsigned),
- vector_length_,
- dex_pc);
- break;
- }
- default:
- LOG(FATAL) << "Unsupported SIMD intrinsic " << org->GetId();
- UNREACHABLE();
- } // switch invoke
- } else {
- // In scalar code, simply clone the method invoke, and replace its operands with the
- // corresponding new scalar instructions in the loop. The instruction will get an
- // environment while being inserted from the instruction map in original program order.
- DCHECK(vector_mode_ == kSequential);
- size_t num_args = invoke->GetNumberOfArguments();
- HInvokeStaticOrDirect* new_invoke = new (global_allocator_) HInvokeStaticOrDirect(
- global_allocator_,
- num_args,
- invoke->GetType(),
- invoke->GetDexPc(),
- invoke->GetDexMethodIndex(),
- invoke->GetResolvedMethod(),
- invoke->GetDispatchInfo(),
- invoke->GetInvokeType(),
- invoke->GetTargetMethod(),
- invoke->GetClinitCheckRequirement());
- HInputsRef inputs = invoke->GetInputs();
- size_t num_inputs = inputs.size();
- DCHECK_LE(num_args, num_inputs);
- DCHECK_EQ(num_inputs, new_invoke->GetInputs().size()); // both invokes agree
- for (size_t index = 0; index < num_inputs; ++index) {
- HInstruction* new_input = index < num_args
- ? vector_map_->Get(inputs[index])
- : inputs[index]; // beyond arguments: just pass through
- new_invoke->SetArgumentAt(index, new_input);
- }
- new_invoke->SetIntrinsic(invoke->GetIntrinsic(),
- kNeedsEnvironmentOrCache,
- kNoSideEffects,
- kNoThrow);
- vector = new_invoke;
- }
- break;
- }
+ case HInstruction::kMin:
+ GENERATE_VEC(
+ new (global_allocator_) HVecMin(global_allocator_,
+ opa,
+ opb,
+ HVecOperation::ToProperType(type, is_unsigned),
+ vector_length_,
+ dex_pc),
+ new (global_allocator_) HMin(org_type, opa, opb, dex_pc));
+ case HInstruction::kMax:
+ GENERATE_VEC(
+ new (global_allocator_) HVecMax(global_allocator_,
+ opa,
+ opb,
+ HVecOperation::ToProperType(type, is_unsigned),
+ vector_length_,
+ dex_pc),
+ new (global_allocator_) HMax(org_type, opa, opb, dex_pc));
+ case HInstruction::kAbs:
+ DCHECK(opb == nullptr);
+ GENERATE_VEC(
+ new (global_allocator_) HVecAbs(global_allocator_, opa, type, vector_length_, dex_pc),
+ new (global_allocator_) HAbs(org_type, opa, dex_pc));
default:
break;
} // switch
@@ -1901,6 +2104,79 @@ void HLoopOptimization::GenerateVecOp(HInstruction* org,
// Vectorization idioms.
//
+// Method recognizes single and double clipping saturation arithmetic.
+bool HLoopOptimization::VectorizeSaturationIdiom(LoopNode* node,
+ HInstruction* instruction,
+ bool generate_code,
+ DataType::Type type,
+ uint64_t restrictions) {
+ // Deal with vector restrictions.
+ if (HasVectorRestrictions(restrictions, kNoSaturation)) {
+ return false;
+ }
+ // Restrict type (generalize if one day we generalize allowed MIN/MAX integral types).
+ if (instruction->GetType() != DataType::Type::kInt32 &&
+ instruction->GetType() != DataType::Type::kInt64) {
+ return false;
+ }
+ // Clipped addition or subtraction on narrower operands? We will try both
+ // formats since, e.g., x+c can be interpreted as x+c and x-(-c), depending
+ // on what clipping values are used, to get most benefits.
+ int64_t lo = std::numeric_limits<int64_t>::min();
+ int64_t hi = std::numeric_limits<int64_t>::max();
+ HInstruction* clippee = FindClippee(instruction, &lo, &hi);
+ HInstruction* a = nullptr;
+ HInstruction* b = nullptr;
+ HInstruction* r = nullptr;
+ HInstruction* s = nullptr;
+ bool is_unsigned = false;
+ bool is_add = true;
+ int64_t c = 0;
+ // First try for saturated addition.
+ if (IsAddConst2(graph_, clippee, /*out*/ &a, /*out*/ &b, /*out*/ &c) && c == 0 &&
+ IsNarrowerOperands(a, b, type, &r, &s, &is_unsigned) &&
+ IsSaturatedAdd(r, s, type, lo, hi, is_unsigned)) {
+ is_add = true;
+ } else {
+ // Then try again for saturated subtraction.
+ a = b = r = s = nullptr;
+ if (IsSubConst2(graph_, clippee, /*out*/ &a, /*out*/ &b) &&
+ IsNarrowerOperands(a, b, type, &r, &s, &is_unsigned) &&
+ IsSaturatedSub(r, type, lo, hi, is_unsigned)) {
+ is_add = false;
+ } else {
+ return false;
+ }
+ }
+ // Accept saturation idiom for vectorizable operands.
+ DCHECK(r != nullptr && s != nullptr);
+ if (generate_code && vector_mode_ != kVector) { // de-idiom
+ r = instruction->InputAt(0);
+ s = instruction->InputAt(1);
+ restrictions &= ~(kNoHiBits | kNoMinMax); // allow narrow MIN/MAX in seq
+ }
+ if (VectorizeUse(node, r, generate_code, type, restrictions) &&
+ VectorizeUse(node, s, generate_code, type, restrictions)) {
+ if (generate_code) {
+ if (vector_mode_ == kVector) {
+ DataType::Type vtype = HVecOperation::ToProperType(type, is_unsigned);
+ HInstruction* op1 = vector_map_->Get(r);
+ HInstruction* op2 = vector_map_->Get(s);
+ vector_map_->Put(instruction, is_add
+ ? reinterpret_cast<HInstruction*>(new (global_allocator_) HVecSaturationAdd(
+ global_allocator_, op1, op2, vtype, vector_length_, kNoDexPc))
+ : reinterpret_cast<HInstruction*>(new (global_allocator_) HVecSaturationSub(
+ global_allocator_, op1, op2, vtype, vector_length_, kNoDexPc)));
+ MaybeRecordStat(stats_, MethodCompilationStat::kLoopVectorizedIdiom);
+ } else {
+ GenerateVecOp(instruction, vector_map_->Get(r), vector_map_->Get(s), type);
+ }
+ }
+ return true;
+ }
+ return false;
+}
+
// Method recognizes the following idioms:
// rounding halving add (a + b + 1) >> 1 for unsigned/signed operands a, b
// truncated halving add (a + b) >> 1 for unsigned/signed operands a, b
@@ -1924,8 +2200,7 @@ bool HLoopOptimization::VectorizeHalvingAddIdiom(LoopNode* node,
HInstruction* a = nullptr;
HInstruction* b = nullptr;
int64_t c = 0;
- if (IsAddConst(instruction->InputAt(0), /*out*/ &a, /*out*/ &b, /*out*/ &c)) {
- DCHECK(a != nullptr && b != nullptr);
+ if (IsAddConst2(graph_, instruction->InputAt(0), /*out*/ &a, /*out*/ &b, /*out*/ &c)) {
// Accept c == 1 (rounded) or c == 0 (not rounded).
bool is_rounded = false;
if (c == 1) {
@@ -1947,8 +2222,7 @@ bool HLoopOptimization::VectorizeHalvingAddIdiom(LoopNode* node,
}
// Accept recognized halving add for vectorizable operands. Vectorized code uses the
// shorthand idiomatic operation. Sequential code uses the original scalar expressions.
- DCHECK(r != nullptr);
- DCHECK(s != nullptr);
+ DCHECK(r != nullptr && s != nullptr);
if (generate_code && vector_mode_ != kVector) { // de-idiom
r = instruction->InputAt(0);
s = instruction->InputAt(1);
@@ -1998,21 +2272,11 @@ bool HLoopOptimization::VectorizeSADIdiom(LoopNode* node,
HInstruction* v = instruction->InputAt(1);
HInstruction* a = nullptr;
HInstruction* b = nullptr;
- if (v->IsInvokeStaticOrDirect() &&
- (v->AsInvokeStaticOrDirect()->GetIntrinsic() == Intrinsics::kMathAbsInt ||
- v->AsInvokeStaticOrDirect()->GetIntrinsic() == Intrinsics::kMathAbsLong)) {
- HInstruction* x = v->InputAt(0);
- if (x->GetType() == reduction_type) {
- int64_t c = 0;
- if (x->IsSub()) {
- a = x->InputAt(0);
- b = x->InputAt(1);
- } else if (IsAddConst(x, /*out*/ &a, /*out*/ &c)) {
- b = graph_->GetConstant(reduction_type, -c); // hidden SUB!
- }
- }
- }
- if (a == nullptr || b == nullptr) {
+ if (v->IsAbs() &&
+ v->GetType() == reduction_type &&
+ IsSubConst2(graph_, v->InputAt(0), /*out*/ &a, /*out*/ &b)) {
+ DCHECK(a != nullptr && b != nullptr);
+ } else {
return false;
}
// Accept same-type or consistent sign extension for narrower-type on operands a and b.
@@ -2045,8 +2309,7 @@ bool HLoopOptimization::VectorizeSADIdiom(LoopNode* node,
}
// Accept SAD idiom for vectorizable operands. Vectorized code uses the shorthand
// idiomatic operation. Sequential code uses the original scalar expressions.
- DCHECK(r != nullptr);
- DCHECK(s != nullptr);
+ DCHECK(r != nullptr && s != nullptr);
if (generate_code && vector_mode_ != kVector) { // de-idiom
r = s = v->InputAt(0);
}
@@ -2054,14 +2317,13 @@ bool HLoopOptimization::VectorizeSADIdiom(LoopNode* node,
VectorizeUse(node, r, generate_code, sub_type, restrictions) &&
VectorizeUse(node, s, generate_code, sub_type, restrictions)) {
if (generate_code) {
- reduction_type = HVecOperation::ToProperType(reduction_type, is_unsigned);
if (vector_mode_ == kVector) {
vector_map_->Put(instruction, new (global_allocator_) HVecSADAccumulate(
global_allocator_,
vector_map_->Get(q),
vector_map_->Get(r),
vector_map_->Get(s),
- reduction_type,
+ HVecOperation::ToProperType(reduction_type, is_unsigned),
GetOtherVL(reduction_type, sub_type, vector_length_),
kNoDexPc));
MaybeRecordStat(stats_, MethodCompilationStat::kLoopVectorizedIdiom);
@@ -2134,41 +2396,6 @@ bool HLoopOptimization::IsVectorizationProfitable(int64_t trip_count) {
return true;
}
-static constexpr uint32_t ARM64_SIMD_MAXIMUM_UNROLL_FACTOR = 8;
-static constexpr uint32_t ARM64_SIMD_HEURISTIC_MAX_BODY_SIZE = 50;
-
-uint32_t HLoopOptimization::GetUnrollingFactor(HBasicBlock* block, int64_t trip_count) {
- uint32_t max_peel = MaxNumberPeeled();
- switch (compiler_driver_->GetInstructionSet()) {
- case InstructionSet::kArm64: {
- // Don't unroll with insufficient iterations.
- // TODO: Unroll loops with unknown trip count.
- DCHECK_NE(vector_length_, 0u);
- if (trip_count < (2 * vector_length_ + max_peel)) {
- return kNoUnrollingFactor;
- }
- // Don't unroll for large loop body size.
- uint32_t instruction_count = block->GetInstructions().CountSize();
- if (instruction_count >= ARM64_SIMD_HEURISTIC_MAX_BODY_SIZE) {
- return kNoUnrollingFactor;
- }
- // Find a beneficial unroll factor with the following restrictions:
- // - At least one iteration of the transformed loop should be executed.
- // - The loop body shouldn't be "too big" (heuristic).
- uint32_t uf1 = ARM64_SIMD_HEURISTIC_MAX_BODY_SIZE / instruction_count;
- uint32_t uf2 = (trip_count - max_peel) / vector_length_;
- uint32_t unroll_factor =
- TruncToPowerOfTwo(std::min({uf1, uf2, ARM64_SIMD_MAXIMUM_UNROLL_FACTOR}));
- DCHECK_GE(unroll_factor, 1u);
- return unroll_factor;
- }
- case InstructionSet::kX86:
- case InstructionSet::kX86_64:
- default:
- return kNoUnrollingFactor;
- }
-}
-
//
// Helpers.
//