Diffstat (limited to 'compiler/optimizing/loop_optimization.cc')
| -rw-r--r-- | compiler/optimizing/loop_optimization.cc | 765 |
1 file changed, 496 insertions, 269 deletions
diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc
index 899496328eb..1462404932e 100644
--- a/compiler/optimizing/loop_optimization.cc
+++ b/compiler/optimizing/loop_optimization.cc
@@ -33,8 +33,8 @@ namespace art {
 // Enables vectorization (SIMDization) in the loop optimizer.
 static constexpr bool kEnableVectorization = true;
 
-// No loop unrolling factor (just one copy of the loop-body).
-static constexpr uint32_t kNoUnrollingFactor = 1;
+// Enables scalar loop peeling and unrolling in the loop optimizer.
+static constexpr bool kEnableScalarPeelingUnrolling = false;
 
 //
 // Static helpers.
@@ -153,6 +153,18 @@ static bool IsSignExtensionAndGet(HInstruction* instruction,
       return false;
     }
   }
+  // A MIN-MAX on narrower operands qualifies as well
+  // (returning the operator itself).
+  if (instruction->IsMin() || instruction->IsMax()) {
+    HBinaryOperation* min_max = instruction->AsBinaryOperation();
+    DCHECK(min_max->GetType() == DataType::Type::kInt32 ||
+           min_max->GetType() == DataType::Type::kInt64);
+    if (IsSignExtensionAndGet(min_max->InputAt(0), type, operand) &&
+        IsSignExtensionAndGet(min_max->InputAt(1), type, operand)) {
+      *operand = min_max;
+      return true;
+    }
+  }
   return false;
 }
 
@@ -216,6 +228,18 @@ static bool IsZeroExtensionAndGet(HInstruction* instruction,
       return false;
     }
   }
+  // A MIN-MAX on narrower operands qualifies as well
+  // (returning the operator itself).
+  if (instruction->IsMin() || instruction->IsMax()) {
+    HBinaryOperation* min_max = instruction->AsBinaryOperation();
+    DCHECK(min_max->GetType() == DataType::Type::kInt32 ||
+           min_max->GetType() == DataType::Type::kInt64);
+    if (IsZeroExtensionAndGet(min_max->InputAt(0), type, operand) &&
+        IsZeroExtensionAndGet(min_max->InputAt(1), type, operand)) {
+      *operand = min_max;
+      return true;
+    }
+  }
   return false;
 }
 
@@ -227,6 +251,7 @@ static bool IsNarrowerOperands(HInstruction* a,
                                /*out*/ HInstruction** r,
                                /*out*/ HInstruction** s,
                                /*out*/ bool* is_unsigned) {
+  DCHECK(a != nullptr && b != nullptr);
   // Look for a matching sign extension.
   DataType::Type stype = HVecOperation::ToSignedType(type);
   if (IsSignExtensionAndGet(a, stype, r) && IsSignExtensionAndGet(b, stype, s)) {
@@ -247,6 +272,7 @@ static bool IsNarrowerOperand(HInstruction* a,
                               DataType::Type type,
                               /*out*/ HInstruction** r,
                               /*out*/ bool* is_unsigned) {
+  DCHECK(a != nullptr);
  // Look for a matching sign extension.
  DataType::Type stype = HVecOperation::ToSignedType(type);
  if (IsSignExtensionAndGet(a, stype, r)) {
@@ -270,20 +296,28 @@ static uint32_t GetOtherVL(DataType::Type other_type, DataType::Type vector_type
   return vl >> (DataType::SizeShift(other_type) - DataType::SizeShift(vector_type));
 }
 
-// Detect up to two instructions a and b, and an acccumulated constant c.
-static bool IsAddConstHelper(HInstruction* instruction,
-                             /*out*/ HInstruction** a,
-                             /*out*/ HInstruction** b,
-                             /*out*/ int64_t* c,
-                             int32_t depth) {
-  static constexpr int32_t kMaxDepth = 8;  // don't search too deep
+// Detect up to two added operands a and b and an accumulated constant c.
+static bool IsAddConst(HInstruction* instruction,
+                       /*out*/ HInstruction** a,
+                       /*out*/ HInstruction** b,
+                       /*out*/ int64_t* c,
+                       int32_t depth = 8) {  // don't search too deep
   int64_t value = 0;
+  // Enter add/sub while still within reasonable depth.
+  if (depth > 0) {
+    if (instruction->IsAdd()) {
+      return IsAddConst(instruction->InputAt(0), a, b, c, depth - 1) &&
+             IsAddConst(instruction->InputAt(1), a, b, c, depth - 1);
+    } else if (instruction->IsSub() &&
+               IsInt64AndGet(instruction->InputAt(1), &value)) {
+      *c -= value;
+      return IsAddConst(instruction->InputAt(0), a, b, c, depth - 1);
+    }
+  }
+  // Otherwise, deal with leaf nodes.
   if (IsInt64AndGet(instruction, &value)) {
     *c += value;
     return true;
-  } else if (instruction->IsAdd() && depth <= kMaxDepth) {
-    return IsAddConstHelper(instruction->InputAt(0), a, b, c, depth + 1) &&
-           IsAddConstHelper(instruction->InputAt(1), a, b, c, depth + 1);
   } else if (*a == nullptr) {
     *a = instruction;
     return true;
@@ -291,72 +325,170 @@ static bool IsAddConstHelper(HInstruction* instruction,
     *b = instruction;
     return true;
   }
-  return false;  // too many non-const operands
+  return false;  // too many operands
 }
 
-// Detect a + b + c for an optional constant c.
-static bool IsAddConst(HInstruction* instruction,
-                       /*out*/ HInstruction** a,
-                       /*out*/ HInstruction** b,
-                       /*out*/ int64_t* c) {
-  if (instruction->IsAdd()) {
-    // Try to find a + b and accumulated c.
-    if (IsAddConstHelper(instruction->InputAt(0), a, b, c, /*depth*/ 0) &&
-        IsAddConstHelper(instruction->InputAt(1), a, b, c, /*depth*/ 0) &&
-        *b != nullptr) {
-      return true;
+// Detect a + b + c with optional constant c.
+static bool IsAddConst2(HGraph* graph,
+                        HInstruction* instruction,
+                        /*out*/ HInstruction** a,
+                        /*out*/ HInstruction** b,
+                        /*out*/ int64_t* c) {
+  if (IsAddConst(instruction, a, b, c) && *a != nullptr) {
+    if (*b == nullptr) {
+      // Constant is usually already present, unless accumulated.
+      *b = graph->GetConstant(instruction->GetType(), (*c));
+      *c = 0;
     }
-    // Found a + b.
+    return true;
+  }
+  return false;
+}
+
+// Detect a direct a - b or a hidden a - (-c).
+static bool IsSubConst2(HGraph* graph,
+                        HInstruction* instruction,
+                        /*out*/ HInstruction** a,
+                        /*out*/ HInstruction** b) {
+  int64_t c = 0;
+  if (instruction->IsSub()) {
     *a = instruction->InputAt(0);
     *b = instruction->InputAt(1);
-    *c = 0;
+    return true;
+  } else if (IsAddConst(instruction, a, b, &c) && *a != nullptr && *b == nullptr) {
+    // Constant for the hidden subtraction.
+    *b = graph->GetConstant(instruction->GetType(), -c);
     return true;
   }
   return false;
 }
 
-// Detect a + c for constant c.
-static bool IsAddConst(HInstruction* instruction,
-                       /*out*/ HInstruction** a,
-                       /*out*/ int64_t* c) {
-  if (instruction->IsAdd()) {
-    if (IsInt64AndGet(instruction->InputAt(0), c)) {
-      *a = instruction->InputAt(1);
-      return true;
-    } else if (IsInt64AndGet(instruction->InputAt(1), c)) {
-      *a = instruction->InputAt(0);
-      return true;
+// Detect clipped [lo, hi] range for nested MIN-MAX operations on a clippee,
+// such as MIN(hi, MAX(lo, clippee)) for an arbitrary clippee expression.
+// Example: MIN(10, MIN(20, MAX(0, x))) yields [0, 10] with clippee x.
+static HInstruction* FindClippee(HInstruction* instruction,
+                                 /*out*/ int64_t* lo,
+                                 /*out*/ int64_t* hi) {
+  // Iterate into MIN(.., c)-MAX(.., c) expressions and 'tighten' the range [lo, hi].
+  while (instruction->IsMin() || instruction->IsMax()) {
+    HBinaryOperation* min_max = instruction->AsBinaryOperation();
+    DCHECK(min_max->GetType() == DataType::Type::kInt32 ||
+           min_max->GetType() == DataType::Type::kInt64);
+    // Process the constant.
+    HConstant* right = min_max->GetConstantRight();
+    if (right == nullptr) {
+      break;
+    } else if (instruction->IsMin()) {
+      *hi = std::min(*hi, Int64FromConstant(right));
+    } else {
+      *lo = std::max(*lo, Int64FromConstant(right));
     }
+    instruction = min_max->GetLeastConstantLeft();
+  }
+  // Iteration ends in any other expression (possibly MIN/MAX without constant).
+  // This leaf expression is the clippee with range [lo, hi].
+  return instruction;
+}
+
+// Set value range for type (or fail).
+static bool CanSetRange(DataType::Type type,
+                        /*out*/ int64_t* uhi,
+                        /*out*/ int64_t* slo,
+                        /*out*/ int64_t* shi) {
+  if (DataType::Size(type) == 1) {
+    *uhi = std::numeric_limits<uint8_t>::max();
+    *slo = std::numeric_limits<int8_t>::min();
+    *shi = std::numeric_limits<int8_t>::max();
+    return true;
+  } else if (DataType::Size(type) == 2) {
+    *uhi = std::numeric_limits<uint16_t>::max();
+    *slo = std::numeric_limits<int16_t>::min();
+    *shi = std::numeric_limits<int16_t>::max();
+    return true;
   }
   return false;
 }
 
+// Accept various saturated addition forms.
+static bool IsSaturatedAdd(HInstruction* a,
+                           HInstruction* b,
+                           DataType::Type type,
+                           int64_t lo,
+                           int64_t hi,
+                           bool is_unsigned) {
+  int64_t ulo = 0, uhi = 0, slo = 0, shi = 0;
+  if (!CanSetRange(type, &uhi, &slo, &shi)) {
+    return false;
+  }
+  // Tighten the range for signed single clipping on constant.
+  if (!is_unsigned) {
+    int64_t c = 0;
+    if (IsInt64AndGet(a, &c) || IsInt64AndGet(b, &c)) {
+      // For c in proper range and narrower operand r:
+      //    MIN(r + c,  127) c > 0
+      // or MAX(r + c, -128) c < 0 (and possibly redundant bound).
+      if (0 < c && c <= shi && hi == shi) {
+        if (lo <= (slo + c)) {
+          return true;
+        }
+      } else if (slo <= c && c < 0 && lo == slo) {
+        if (hi >= (shi + c)) {
+          return true;
+        }
+      }
+    }
+  }
+  // Detect for narrower operands r and s:
+  //     MIN(r + s, 255)        => SAT_ADD_unsigned
+  // MAX(MIN(r + s, 127), -128) => SAT_ADD_signed.
+  return is_unsigned ? (lo <= ulo && hi == uhi) : (lo == slo && hi == shi);
+}
+
+// Accept various saturated subtraction forms.
+static bool IsSaturatedSub(HInstruction* a,
+                           DataType::Type type,
+                           int64_t lo,
+                           int64_t hi,
+                           bool is_unsigned) {
+  int64_t ulo = 0, uhi = 0, slo = 0, shi = 0;
+  if (!CanSetRange(type, &uhi, &slo, &shi)) {
+    return false;
+  }
+  // Tighten the range for signed single clipping on constant.
+  if (!is_unsigned) {
+    int64_t c = 0;
+    if (IsInt64AndGet(a, /*out*/ &c)) {
+      // For c in proper range and narrower operand r:
+      //    MIN(c - r,  127) c > 0
+      // or MAX(c - r, -128) c < 0 (and possibly redundant bound).
+      if (0 < c && c <= shi && hi == shi) {
+        if (lo <= (c - shi)) {
+          return true;
+        }
+      } else if (slo <= c && c < 0 && lo == slo) {
+        if (hi >= (c - slo)) {
+          return true;
+        }
+      }
+    }
+  }
+  // Detect for narrower operands r and s:
+  //     MAX(r - s, 0)          => SAT_SUB_unsigned
+  // MIN(MAX(r - s, -128), 127) => SAT_SUB_signed.
+  return is_unsigned ? (lo == ulo && hi >= uhi) : (lo == slo && hi == shi);
+}
+
 // Detect reductions of the following forms,
 // x = x_phi + ..
 // x = x_phi - ..
-// x = max(x_phi, ..)
 // x = min(x_phi, ..)
+// x = max(x_phi, ..)
 static bool HasReductionFormat(HInstruction* reduction, HInstruction* phi) {
-  if (reduction->IsAdd()) {
+  if (reduction->IsAdd() || reduction->IsMin() || reduction->IsMax()) {
     return (reduction->InputAt(0) == phi && reduction->InputAt(1) != phi) ||
            (reduction->InputAt(0) != phi && reduction->InputAt(1) == phi);
   } else if (reduction->IsSub()) {
     return (reduction->InputAt(0) == phi && reduction->InputAt(1) != phi);
-  } else if (reduction->IsInvokeStaticOrDirect()) {
-    switch (reduction->AsInvokeStaticOrDirect()->GetIntrinsic()) {
-      case Intrinsics::kMathMinIntInt:
-      case Intrinsics::kMathMinLongLong:
-      case Intrinsics::kMathMinFloatFloat:
-      case Intrinsics::kMathMinDoubleDouble:
-      case Intrinsics::kMathMaxIntInt:
-      case Intrinsics::kMathMaxLongLong:
-      case Intrinsics::kMathMaxFloatFloat:
-      case Intrinsics::kMathMaxDoubleDouble:
-        return (reduction->InputAt(0) == phi && reduction->InputAt(1) != phi) ||
-               (reduction->InputAt(0) != phi && reduction->InputAt(1) == phi);
-      default:
-        return false;
-    }
   }
   return false;
 }
@@ -401,6 +533,43 @@ static bool CheckInductionSetFullyRemoved(ScopedArenaSet<HInstruction*>* iset) {
   return true;
 }
 
+// Tries to statically evaluate condition of the specified "HIf" for other condition checks.
+static void TryToEvaluateIfCondition(HIf* instruction, HGraph* graph) {
+  HInstruction* cond = instruction->InputAt(0);
+
+  // If a condition 'cond' is evaluated in an HIf instruction then in the successors of the
+  // IF_BLOCK we statically know the value of the condition 'cond' (TRUE in TRUE_SUCC, FALSE in
+  // FALSE_SUCC). Using that we can replace another evaluation (use) EVAL of the same 'cond'
+  // with TRUE value (FALSE value) if every path from the ENTRY_BLOCK to EVAL_BLOCK contains the
+  // edge HIF_BLOCK->TRUE_SUCC (HIF_BLOCK->FALSE_SUCC).
+  //     if (cond) {               if (cond) {
+  //       if (cond) {}              if (1) {}
+  //     } else {        =======>  } else {
+  //       if (cond) {}              if (0) {}
+  //     }                         }
+  if (!cond->IsConstant()) {
+    HBasicBlock* true_succ = instruction->IfTrueSuccessor();
+    HBasicBlock* false_succ = instruction->IfFalseSuccessor();
+
+    DCHECK_EQ(true_succ->GetPredecessors().size(), 1u);
+    DCHECK_EQ(false_succ->GetPredecessors().size(), 1u);
+
+    const HUseList<HInstruction*>& uses = cond->GetUses();
+    for (auto it = uses.begin(), end = uses.end(); it != end; /* ++it below */) {
+      HInstruction* user = it->GetUser();
+      size_t index = it->GetIndex();
+      HBasicBlock* user_block = user->GetBlock();
+      // Increment `it` now because `*it` may disappear thanks to user->ReplaceInput().
+      ++it;
+      if (true_succ->Dominates(user_block)) {
+        user->ReplaceInput(graph->GetIntConstant(1), index);
+      } else if (false_succ->Dominates(user_block)) {
+        user->ReplaceInput(graph->GetIntConstant(0), index);
+      }
+    }
+  }
+}
+
 //
 // Public methods.
 //
@@ -432,7 +601,11 @@ HLoopOptimization::HLoopOptimization(HGraph* graph,
       vector_preheader_(nullptr),
       vector_header_(nullptr),
       vector_body_(nullptr),
-      vector_index_(nullptr) {
+      vector_index_(nullptr),
+      arch_loop_helper_(ArchDefaultLoopHelper::Create(compiler_driver_ != nullptr
+                                                          ? compiler_driver_->GetInstructionSet()
+                                                          : InstructionSet::kNone,
+                                                      global_allocator_)) {
 }
 
 void HLoopOptimization::Run() {
@@ -643,7 +816,7 @@ void HLoopOptimization::SimplifyBlocks(LoopNode* node) {
   }
 }
 
-bool HLoopOptimization::OptimizeInnerLoop(LoopNode* node) {
+bool HLoopOptimization::TryOptimizeInnerLoopFinite(LoopNode* node) {
   HBasicBlock* header = node->loop_info->GetHeader();
   HBasicBlock* preheader = node->loop_info->GetPreHeader();
   // Ensure loop header logic is finite.
@@ -713,6 +886,103 @@ bool HLoopOptimization::OptimizeInnerLoop(LoopNode* node) {
   return false;
 }
 
+bool HLoopOptimization::OptimizeInnerLoop(LoopNode* node) {
+  return TryOptimizeInnerLoopFinite(node) ||
+         TryPeelingForLoopInvariantExitsElimination(node) ||
+         TryUnrollingForBranchPenaltyReduction(node);
+}
+
+//
+// Loop unrolling: generic part methods.
+//
+
+bool HLoopOptimization::TryUnrollingForBranchPenaltyReduction(LoopNode* node) {
+  // Don't run peeling/unrolling if compiler_driver_ is nullptr (i.e., running under tests)
+  // as InstructionSet is needed.
+  if (!kEnableScalarPeelingUnrolling || compiler_driver_ == nullptr) {
+    return false;
+  }
+
+  HLoopInformation* loop_info = node->loop_info;
+  int64_t trip_count = 0;
+  // Only unroll loops with a known trip count.
+  if (!induction_range_.HasKnownTripCount(loop_info, &trip_count)) {
+    return false;
+  }
+
+  uint32_t unrolling_factor = arch_loop_helper_->GetScalarUnrollingFactor(loop_info, trip_count);
+  if (unrolling_factor == kNoUnrollingFactor) {
+    return false;
+  }
+
+  LoopAnalysisInfo loop_analysis_info(loop_info);
+  LoopAnalysis::CalculateLoopBasicProperties(loop_info, &loop_analysis_info);
+
+  // Check "IsLoopClonable" last as it can be time-consuming.
+  if (arch_loop_helper_->IsLoopTooBigForScalarPeelingUnrolling(&loop_analysis_info) ||
+      (loop_analysis_info.GetNumberOfExits() > 1) ||
+      loop_analysis_info.HasInstructionsPreventingScalarUnrolling() ||
+      !PeelUnrollHelper::IsLoopClonable(loop_info)) {
+    return false;
+  }
+
+  // TODO: support other unrolling factors.
+  DCHECK_EQ(unrolling_factor, 2u);
+
+  // Perform unrolling.
+  PeelUnrollSimpleHelper helper(loop_info);
+  helper.DoUnrolling();
+
+  // Remove the redundant loop check after unrolling.
+  HIf* copy_hif =
+      helper.GetBasicBlockMap()->Get(loop_info->GetHeader())->GetLastInstruction()->AsIf();
+  int32_t constant = loop_info->Contains(*copy_hif->IfTrueSuccessor()) ? 1 : 0;
+  copy_hif->ReplaceInput(graph_->GetIntConstant(constant), 0u);
+
+  return true;
+}
+
+bool HLoopOptimization::TryPeelingForLoopInvariantExitsElimination(LoopNode* node) {
+  // Don't run peeling/unrolling if compiler_driver_ is nullptr (i.e., running under tests)
+  // as InstructionSet is needed.
+  if (!kEnableScalarPeelingUnrolling || compiler_driver_ == nullptr) {
+    return false;
+  }
+
+  HLoopInformation* loop_info = node->loop_info;
+  if (!arch_loop_helper_->IsLoopPeelingEnabled()) {
+    return false;
+  }
+
+  LoopAnalysisInfo loop_analysis_info(loop_info);
+  LoopAnalysis::CalculateLoopBasicProperties(loop_info, &loop_analysis_info);
+
+  // Check "IsLoopClonable" last as it can be time-consuming.
+  if (arch_loop_helper_->IsLoopTooBigForScalarPeelingUnrolling(&loop_analysis_info) ||
+      loop_analysis_info.HasInstructionsPreventingScalarPeeling() ||
+      !LoopAnalysis::HasLoopAtLeastOneInvariantExit(loop_info) ||
+      !PeelUnrollHelper::IsLoopClonable(loop_info)) {
+    return false;
+  }
+
+  // Perform peeling.
+  PeelUnrollSimpleHelper helper(loop_info);
+  helper.DoPeeling();
+
+  const SuperblockCloner::HInstructionMap* hir_map = helper.GetInstructionMap();
+  for (auto entry : *hir_map) {
+    HInstruction* copy = entry.second;
+    if (copy->IsIf()) {
+      TryToEvaluateIfCondition(copy->AsIf(), graph_);
+    }
+  }
+
+  return true;
+}
+
 //
 // Loop vectorization. The implementation is based on the book by Aart J.C. Bik:
 // "The Software Vectorization Handbook. Applying Multimedia Extensions for Maximum Performance."
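For intuition, the following is a source-level sketch of what TryUnrollingForBranchPenaltyReduction above does to a loop. The function, its names, and the even known trip count are illustrative assumptions, not code from this patch; the pass itself operates on HIR, unrolls by the factor 2 asserted above, and folds the cloned header check by feeding its HIf a constant.

#include <cstddef>

// Before: one body copy; the header check branches on every iteration.
void Scale(int* a, size_t n) {  // assume n is known and even, so no epilogue is needed
  for (size_t i = 0; i < n; ++i) {
    a[i] *= 2;
  }
}

// After unrolling by 2: the duplicated trip check was replaced by a constant
// (copy_hif->ReplaceInput above) and folds away, so one back-edge branch now
// covers two body copies.
void ScaleUnrolledBy2(int* a, size_t n) {
  for (size_t i = 0; i < n; ) {
    a[i] *= 2;
    ++i;
    a[i] *= 2;  // second copy; its guarding check became a constant
    ++i;
  }
}

The loop body is otherwise unchanged; the benefit targeted here is purely the halved branch count on the back edge.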
@@ -843,7 +1113,8 @@ void HLoopOptimization::Vectorize(LoopNode* node,
   HBasicBlock* preheader = node->loop_info->GetPreHeader();
 
   // Pick a loop unrolling factor for the vector loop.
-  uint32_t unroll = GetUnrollingFactor(block, trip_count);
+  uint32_t unroll = arch_loop_helper_->GetSIMDUnrollingFactor(
+      block, trip_count, MaxNumberPeeled(), vector_length_);
   uint32_t chunk = vector_length_ * unroll;
 
   DCHECK(trip_count == 0 || (trip_count >= MaxNumberPeeled() + chunk));
@@ -1082,6 +1353,11 @@ bool HLoopOptimization::VectorizeDef(LoopNode* node,
     HInstruction* index = instruction->InputAt(1);
     HInstruction* value = instruction->InputAt(2);
     HInstruction* offset = nullptr;
+    // For narrow types, explicit type conversion may have been
+    // optimized away, so set the no hi bits restriction here.
+    if (DataType::Size(type) <= 2) {
+      restrictions |= kNoHiBits;
+    }
     if (TrySetVectorType(type, &restrictions) &&
         node->loop_info->IsDefinedOutOfTheLoop(base) &&
         induction_range_.IsUnitStride(instruction, index, graph_, &offset) &&
@@ -1124,7 +1400,6 @@ bool HLoopOptimization::VectorizeDef(LoopNode* node,
   return !IsUsedOutsideLoop(node->loop_info, instruction) && !instruction->DoesAnyWrite();
 }
 
-// TODO: saturation arithmetic.
 bool HLoopOptimization::VectorizeUse(LoopNode* node,
                                      HInstruction* instruction,
                                      bool generate_code,
@@ -1297,80 +1572,62 @@ bool HLoopOptimization::VectorizeUse(LoopNode* node,
         return true;
       }
     }
-  } else if (instruction->IsInvokeStaticOrDirect()) {
-    // Accept particular intrinsics.
-    HInvokeStaticOrDirect* invoke = instruction->AsInvokeStaticOrDirect();
-    switch (invoke->GetIntrinsic()) {
-      case Intrinsics::kMathAbsInt:
-      case Intrinsics::kMathAbsLong:
-      case Intrinsics::kMathAbsFloat:
-      case Intrinsics::kMathAbsDouble: {
-        // Deal with vector restrictions.
-        HInstruction* opa = instruction->InputAt(0);
-        HInstruction* r = opa;
-        bool is_unsigned = false;
-        if (HasVectorRestrictions(restrictions, kNoAbs)) {
-          return false;
-        } else if (HasVectorRestrictions(restrictions, kNoHiBits) &&
-                   (!IsNarrowerOperand(opa, type, &r, &is_unsigned) || is_unsigned)) {
-          return false;  // reject, unless operand is sign-extension narrower
-        }
-        // Accept ABS(x) for vectorizable operand.
-        DCHECK(r != nullptr);
-        if (generate_code && vector_mode_ != kVector) {  // de-idiom
-          r = opa;
-        }
-        if (VectorizeUse(node, r, generate_code, type, restrictions)) {
-          if (generate_code) {
-            GenerateVecOp(instruction,
-                          vector_map_->Get(r),
-                          nullptr,
-                          HVecOperation::ToProperType(type, is_unsigned));
-          }
-          return true;
-        }
-        return false;
+  } else if (instruction->IsAbs()) {
+    // Deal with vector restrictions.
+    HInstruction* opa = instruction->InputAt(0);
+    HInstruction* r = opa;
+    bool is_unsigned = false;
+    if (HasVectorRestrictions(restrictions, kNoAbs)) {
+      return false;
+    } else if (HasVectorRestrictions(restrictions, kNoHiBits) &&
+               (!IsNarrowerOperand(opa, type, &r, &is_unsigned) || is_unsigned)) {
+      return false;  // reject, unless operand is sign-extension narrower
+    }
+    // Accept ABS(x) for vectorizable operand.
+    DCHECK(r != nullptr);
+    if (generate_code && vector_mode_ != kVector) {  // de-idiom
+      r = opa;
+    }
+    if (VectorizeUse(node, r, generate_code, type, restrictions)) {
+      if (generate_code) {
+        GenerateVecOp(instruction,
+                      vector_map_->Get(r),
+                      nullptr,
+                      HVecOperation::ToProperType(type, is_unsigned));
       }
-      case Intrinsics::kMathMinIntInt:
-      case Intrinsics::kMathMinLongLong:
-      case Intrinsics::kMathMinFloatFloat:
-      case Intrinsics::kMathMinDoubleDouble:
-      case Intrinsics::kMathMaxIntInt:
-      case Intrinsics::kMathMaxLongLong:
-      case Intrinsics::kMathMaxFloatFloat:
-      case Intrinsics::kMathMaxDoubleDouble: {
-        // Deal with vector restrictions.
-        HInstruction* opa = instruction->InputAt(0);
-        HInstruction* opb = instruction->InputAt(1);
-        HInstruction* r = opa;
-        HInstruction* s = opb;
-        bool is_unsigned = false;
-        if (HasVectorRestrictions(restrictions, kNoMinMax)) {
-          return false;
-        } else if (HasVectorRestrictions(restrictions, kNoHiBits) &&
-                   !IsNarrowerOperands(opa, opb, type, &r, &s, &is_unsigned)) {
-          return false;  // reject, unless all operands are same-extension narrower
-        }
-        // Accept MIN/MAX(x, y) for vectorizable operands.
-        DCHECK(r != nullptr);
-        DCHECK(s != nullptr);
-        if (generate_code && vector_mode_ != kVector) {  // de-idiom
-          r = opa;
-          s = opb;
-        }
-        if (VectorizeUse(node, r, generate_code, type, restrictions) &&
-            VectorizeUse(node, s, generate_code, type, restrictions)) {
-          if (generate_code) {
-            GenerateVecOp(
-                instruction, vector_map_->Get(r), vector_map_->Get(s), type, is_unsigned);
-          }
-          return true;
-        }
-        return false;
+      return true;
+    }
+  } else if (instruction->IsMin() || instruction->IsMax()) {
+    // Recognize saturation arithmetic.
+    if (VectorizeSaturationIdiom(node, instruction, generate_code, type, restrictions)) {
+      return true;
+    }
+    // Deal with vector restrictions.
+    HInstruction* opa = instruction->InputAt(0);
+    HInstruction* opb = instruction->InputAt(1);
+    HInstruction* r = opa;
+    HInstruction* s = opb;
+    bool is_unsigned = false;
+    if (HasVectorRestrictions(restrictions, kNoMinMax)) {
+      return false;
+    } else if (HasVectorRestrictions(restrictions, kNoHiBits) &&
+               !IsNarrowerOperands(opa, opb, type, &r, &s, &is_unsigned)) {
+      return false;  // reject, unless all operands are same-extension narrower
+    }
+    // Accept MIN/MAX(x, y) for vectorizable operands.
+    DCHECK(r != nullptr && s != nullptr);
+    if (generate_code && vector_mode_ != kVector) {  // de-idiom
+      r = opa;
+      s = opb;
+    }
+    if (VectorizeUse(node, r, generate_code, type, restrictions) &&
+        VectorizeUse(node, s, generate_code, type, restrictions)) {
+      if (generate_code) {
+        GenerateVecOp(
+            instruction, vector_map_->Get(r), vector_map_->Get(s), type, is_unsigned);
       }
-      default:
-        return false;
-    }  // switch
+      return true;
+    }
   }
   return false;
 }
@@ -1475,11 +1732,11 @@ bool HLoopOptimization::TrySetVectorType(DataType::Type type, uint64_t* restrict
       case DataType::Type::kBool:
       case DataType::Type::kUint8:
      case DataType::Type::kInt8:
-        *restrictions |= kNoDiv;
+        *restrictions |= kNoDiv | kNoSaturation;
         return TrySetVectorLength(16);
       case DataType::Type::kUint16:
       case DataType::Type::kInt16:
-        *restrictions |= kNoDiv | kNoStringCharAt;
+        *restrictions |= kNoDiv | kNoSaturation | kNoStringCharAt;
         return TrySetVectorLength(8);
       case DataType::Type::kInt32:
         *restrictions |= kNoDiv;
@@ -1504,11 +1761,11 @@ bool HLoopOptimization::TrySetVectorType(DataType::Type type, uint64_t* restrict
       case DataType::Type::kBool:
       case DataType::Type::kUint8:
       case DataType::Type::kInt8:
-        *restrictions |= kNoDiv;
+        *restrictions |= kNoDiv | kNoSaturation;
         return TrySetVectorLength(16);
       case DataType::Type::kUint16:
       case DataType::Type::kInt16:
-        *restrictions |= kNoDiv | kNoStringCharAt;
+        *restrictions |= kNoDiv | kNoSaturation | kNoStringCharAt;
         return TrySetVectorLength(8);
       case DataType::Type::kInt32:
         *restrictions |= kNoDiv;
@@ -1811,83 +2068,29 @@ void HLoopOptimization::GenerateVecOp(HInstruction* org,
       GENERATE_VEC(
         new (global_allocator_) HVecUShr(global_allocator_, opa, opb, type, vector_length_, dex_pc),
         new (global_allocator_) HUShr(org_type, opa, opb, dex_pc));
-    case HInstruction::kInvokeStaticOrDirect: {
-      HInvokeStaticOrDirect* invoke = org->AsInvokeStaticOrDirect();
-      if (vector_mode_ == kVector) {
-        switch (invoke->GetIntrinsic()) {
-          case Intrinsics::kMathAbsInt:
-          case Intrinsics::kMathAbsLong:
-          case Intrinsics::kMathAbsFloat:
-          case Intrinsics::kMathAbsDouble:
-            DCHECK(opb == nullptr);
-            vector = new (global_allocator_)
-                HVecAbs(global_allocator_, opa, type, vector_length_, dex_pc);
-            break;
-          case Intrinsics::kMathMinIntInt:
-          case Intrinsics::kMathMinLongLong:
-          case Intrinsics::kMathMinFloatFloat:
-          case Intrinsics::kMathMinDoubleDouble: {
-            vector = new (global_allocator_)
-                HVecMin(global_allocator_,
-                        opa,
-                        opb,
-                        HVecOperation::ToProperType(type, is_unsigned),
-                        vector_length_,
-                        dex_pc);
-            break;
-          }
-          case Intrinsics::kMathMaxIntInt:
-          case Intrinsics::kMathMaxLongLong:
-          case Intrinsics::kMathMaxFloatFloat:
-          case Intrinsics::kMathMaxDoubleDouble: {
-            vector = new (global_allocator_)
-                HVecMax(global_allocator_,
-                        opa,
-                        opb,
-                        HVecOperation::ToProperType(type, is_unsigned),
-                        vector_length_,
-                        dex_pc);
-            break;
-          }
-          default:
-            LOG(FATAL) << "Unsupported SIMD intrinsic " << org->GetId();
-            UNREACHABLE();
-        }  // switch invoke
-      } else {
-        // In scalar code, simply clone the method invoke, and replace its operands with the
-        // corresponding new scalar instructions in the loop. The instruction will get an
-        // environment while being inserted from the instruction map in original program order.
-        DCHECK(vector_mode_ == kSequential);
-        size_t num_args = invoke->GetNumberOfArguments();
-        HInvokeStaticOrDirect* new_invoke = new (global_allocator_) HInvokeStaticOrDirect(
-            global_allocator_,
-            num_args,
-            invoke->GetType(),
-            invoke->GetDexPc(),
-            invoke->GetDexMethodIndex(),
-            invoke->GetResolvedMethod(),
-            invoke->GetDispatchInfo(),
-            invoke->GetInvokeType(),
-            invoke->GetTargetMethod(),
-            invoke->GetClinitCheckRequirement());
-        HInputsRef inputs = invoke->GetInputs();
-        size_t num_inputs = inputs.size();
-        DCHECK_LE(num_args, num_inputs);
-        DCHECK_EQ(num_inputs, new_invoke->GetInputs().size());  // both invokes agree
-        for (size_t index = 0; index < num_inputs; ++index) {
-          HInstruction* new_input = index < num_args
-              ? vector_map_->Get(inputs[index])
-              : inputs[index];  // beyond arguments: just pass through
-          new_invoke->SetArgumentAt(index, new_input);
-        }
-        new_invoke->SetIntrinsic(invoke->GetIntrinsic(),
-                                 kNeedsEnvironmentOrCache,
-                                 kNoSideEffects,
-                                 kNoThrow);
-        vector = new_invoke;
-      }
-      break;
-    }
+    case HInstruction::kMin:
+      GENERATE_VEC(
+        new (global_allocator_) HVecMin(global_allocator_,
+                                        opa,
+                                        opb,
+                                        HVecOperation::ToProperType(type, is_unsigned),
+                                        vector_length_,
+                                        dex_pc),
+        new (global_allocator_) HMin(org_type, opa, opb, dex_pc));
+    case HInstruction::kMax:
+      GENERATE_VEC(
+        new (global_allocator_) HVecMax(global_allocator_,
+                                        opa,
+                                        opb,
+                                        HVecOperation::ToProperType(type, is_unsigned),
+                                        vector_length_,
+                                        dex_pc),
+        new (global_allocator_) HMax(org_type, opa, opb, dex_pc));
+    case HInstruction::kAbs:
+      DCHECK(opb == nullptr);
+      GENERATE_VEC(
+        new (global_allocator_) HVecAbs(global_allocator_, opa, type, vector_length_, dex_pc),
+        new (global_allocator_) HAbs(org_type, opa, dex_pc));
     default:
       break;
   }  // switch
@@ -1901,6 +2104,79 @@ void HLoopOptimization::GenerateVecOp(HInstruction* org,
 //
 // Vectorization idioms.
 //
 
+// Method recognizes single and double clipping saturation arithmetic.
+bool HLoopOptimization::VectorizeSaturationIdiom(LoopNode* node,
+                                                 HInstruction* instruction,
+                                                 bool generate_code,
+                                                 DataType::Type type,
+                                                 uint64_t restrictions) {
+  // Deal with vector restrictions.
+  if (HasVectorRestrictions(restrictions, kNoSaturation)) {
+    return false;
+  }
+  // Restrict type (generalize if one day we generalize allowed MIN/MAX integral types).
+  if (instruction->GetType() != DataType::Type::kInt32 &&
+      instruction->GetType() != DataType::Type::kInt64) {
+    return false;
+  }
+  // Clipped addition or subtraction on narrower operands? We will try both
+  // formats since, e.g., x+c can be interpreted as x+c and x-(-c), depending
+  // on what clipping values are used, to get the most benefit.
+  int64_t lo = std::numeric_limits<int64_t>::min();
+  int64_t hi = std::numeric_limits<int64_t>::max();
+  HInstruction* clippee = FindClippee(instruction, &lo, &hi);
+  HInstruction* a = nullptr;
+  HInstruction* b = nullptr;
+  HInstruction* r = nullptr;
+  HInstruction* s = nullptr;
+  bool is_unsigned = false;
+  bool is_add = true;
+  int64_t c = 0;
+  // First try for saturated addition.
+  if (IsAddConst2(graph_, clippee, /*out*/ &a, /*out*/ &b, /*out*/ &c) && c == 0 &&
+      IsNarrowerOperands(a, b, type, &r, &s, &is_unsigned) &&
+      IsSaturatedAdd(r, s, type, lo, hi, is_unsigned)) {
+    is_add = true;
+  } else {
+    // Then try again for saturated subtraction.
+    a = b = r = s = nullptr;
+    if (IsSubConst2(graph_, clippee, /*out*/ &a, /*out*/ &b) &&
+        IsNarrowerOperands(a, b, type, &r, &s, &is_unsigned) &&
+        IsSaturatedSub(r, type, lo, hi, is_unsigned)) {
+      is_add = false;
+    } else {
+      return false;
+    }
+  }
+  // Accept saturation idiom for vectorizable operands.
+  DCHECK(r != nullptr && s != nullptr);
+  if (generate_code && vector_mode_ != kVector) {  // de-idiom
+    r = instruction->InputAt(0);
+    s = instruction->InputAt(1);
+    restrictions &= ~(kNoHiBits | kNoMinMax);  // allow narrow MIN/MAX in seq
+  }
+  if (VectorizeUse(node, r, generate_code, type, restrictions) &&
+      VectorizeUse(node, s, generate_code, type, restrictions)) {
+    if (generate_code) {
+      if (vector_mode_ == kVector) {
+        DataType::Type vtype = HVecOperation::ToProperType(type, is_unsigned);
+        HInstruction* op1 = vector_map_->Get(r);
+        HInstruction* op2 = vector_map_->Get(s);
+        vector_map_->Put(instruction, is_add
+            ? reinterpret_cast<HInstruction*>(new (global_allocator_) HVecSaturationAdd(
+                  global_allocator_, op1, op2, vtype, vector_length_, kNoDexPc))
+            : reinterpret_cast<HInstruction*>(new (global_allocator_) HVecSaturationSub(
+                  global_allocator_, op1, op2, vtype, vector_length_, kNoDexPc)));
+        MaybeRecordStat(stats_, MethodCompilationStat::kLoopVectorizedIdiom);
+      } else {
+        GenerateVecOp(instruction, vector_map_->Get(r), vector_map_->Get(s), type);
+      }
+    }
+    return true;
+  }
+  return false;
+}
+
 // Method recognizes the following idioms:
 //   rounding halving add (a + b + 1) >> 1 for unsigned/signed operands a, b
 //   truncated halving add (a + b)     >> 1 for unsigned/signed operands a, b
@@ -1924,8 +2200,7 @@ bool HLoopOptimization::VectorizeHalvingAddIdiom(LoopNode* node,
   HInstruction* a = nullptr;
   HInstruction* b = nullptr;
   int64_t c = 0;
-  if (IsAddConst(instruction->InputAt(0), /*out*/ &a, /*out*/ &b, /*out*/ &c)) {
-    DCHECK(a != nullptr && b != nullptr);
+  if (IsAddConst2(graph_, instruction->InputAt(0), /*out*/ &a, /*out*/ &b, /*out*/ &c)) {
     // Accept c == 1 (rounded) or c == 0 (not rounded).
     bool is_rounded = false;
     if (c == 1) {
@@ -1947,8 +2222,7 @@ bool HLoopOptimization::VectorizeHalvingAddIdiom(LoopNode* node,
   }
   // Accept recognized halving add for vectorizable operands. Vectorized code uses the
   // shorthand idiomatic operation. Sequential code uses the original scalar expressions.
-  DCHECK(r != nullptr);
-  DCHECK(s != nullptr);
+  DCHECK(r != nullptr && s != nullptr);
   if (generate_code && vector_mode_ != kVector) {  // de-idiom
     r = instruction->InputAt(0);
     s = instruction->InputAt(1);
@@ -1998,21 +2272,11 @@ bool HLoopOptimization::VectorizeSADIdiom(LoopNode* node,
   HInstruction* v = instruction->InputAt(1);
   HInstruction* a = nullptr;
   HInstruction* b = nullptr;
-  if (v->IsInvokeStaticOrDirect() &&
-      (v->AsInvokeStaticOrDirect()->GetIntrinsic() == Intrinsics::kMathAbsInt ||
-       v->AsInvokeStaticOrDirect()->GetIntrinsic() == Intrinsics::kMathAbsLong)) {
-    HInstruction* x = v->InputAt(0);
-    if (x->GetType() == reduction_type) {
-      int64_t c = 0;
-      if (x->IsSub()) {
-        a = x->InputAt(0);
-        b = x->InputAt(1);
-      } else if (IsAddConst(x, /*out*/ &a, /*out*/ &c)) {
-        b = graph_->GetConstant(reduction_type, -c);  // hidden SUB!
-      }
-    }
-  }
-  if (a == nullptr || b == nullptr) {
+  if (v->IsAbs() &&
+      v->GetType() == reduction_type &&
+      IsSubConst2(graph_, v->InputAt(0), /*out*/ &a, /*out*/ &b)) {
+    DCHECK(a != nullptr && b != nullptr);
+  } else {
     return false;
   }
   // Accept same-type or consistent sign extension for narrower-type on operands a and b.
@@ -2045,8 +2309,7 @@ bool HLoopOptimization::VectorizeSADIdiom(LoopNode* node,
   }
   // Accept SAD idiom for vectorizable operands. Vectorized code uses the shorthand
   // idiomatic operation. Sequential code uses the original scalar expressions.
-  DCHECK(r != nullptr);
-  DCHECK(s != nullptr);
+  DCHECK(r != nullptr && s != nullptr);
   if (generate_code && vector_mode_ != kVector) {  // de-idiom
     r = s = v->InputAt(0);
   }
@@ -2054,14 +2317,13 @@ bool HLoopOptimization::VectorizeSADIdiom(LoopNode* node,
       VectorizeUse(node, r, generate_code, sub_type, restrictions) &&
       VectorizeUse(node, s, generate_code, sub_type, restrictions)) {
     if (generate_code) {
-      reduction_type = HVecOperation::ToProperType(reduction_type, is_unsigned);
       if (vector_mode_ == kVector) {
         vector_map_->Put(instruction, new (global_allocator_) HVecSADAccumulate(
             global_allocator_,
             vector_map_->Get(q),
            vector_map_->Get(r),
            vector_map_->Get(s),
-            reduction_type,
+            HVecOperation::ToProperType(reduction_type, is_unsigned),
            GetOtherVL(reduction_type, sub_type, vector_length_),
            kNoDexPc));
        MaybeRecordStat(stats_, MethodCompilationStat::kLoopVectorizedIdiom);
@@ -2134,41 +2396,6 @@ bool HLoopOptimization::IsVectorizationProfitable(int64_t trip_count) {
   return true;
 }
 
-static constexpr uint32_t ARM64_SIMD_MAXIMUM_UNROLL_FACTOR = 8;
-static constexpr uint32_t ARM64_SIMD_HEURISTIC_MAX_BODY_SIZE = 50;
-
-uint32_t HLoopOptimization::GetUnrollingFactor(HBasicBlock* block, int64_t trip_count) {
-  uint32_t max_peel = MaxNumberPeeled();
-  switch (compiler_driver_->GetInstructionSet()) {
-    case InstructionSet::kArm64: {
-      // Don't unroll with insufficient iterations.
-      // TODO: Unroll loops with unknown trip count.
-      DCHECK_NE(vector_length_, 0u);
-      if (trip_count < (2 * vector_length_ + max_peel)) {
-        return kNoUnrollingFactor;
-      }
-      // Don't unroll for large loop body size.
-      uint32_t instruction_count = block->GetInstructions().CountSize();
-      if (instruction_count >= ARM64_SIMD_HEURISTIC_MAX_BODY_SIZE) {
-        return kNoUnrollingFactor;
-      }
-      // Find a beneficial unroll factor with the following restrictions:
-      //  - At least one iteration of the transformed loop should be executed.
-      //  - The loop body shouldn't be "too big" (heuristic).
-      uint32_t uf1 = ARM64_SIMD_HEURISTIC_MAX_BODY_SIZE / instruction_count;
-      uint32_t uf2 = (trip_count - max_peel) / vector_length_;
-      uint32_t unroll_factor =
-          TruncToPowerOfTwo(std::min({uf1, uf2, ARM64_SIMD_MAXIMUM_UNROLL_FACTOR}));
-      DCHECK_GE(unroll_factor, 1u);
-      return unroll_factor;
-    }
-    case InstructionSet::kX86:
-    case InstructionSet::kX86_64:
-    default:
-      return kNoUnrollingFactor;
-  }
-}
-
 //
 // Helpers.
 //
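As a reading aid for the idiom recognizers above, here is a minimal scalar sketch of the unsigned single-clipping forms named in the IsSaturatedAdd and IsSaturatedSub comments, MIN(r + s, 255) => SAT_ADD_unsigned and MAX(r - s, 0) => SAT_SUB_unsigned. The C++ functions and names are illustrative assumptions, not code from this patch; the compiler matches the equivalent MIN/MAX HIR after the byte operands have been widened to int.

#include <algorithm>
#include <cstdint>

// MIN(r + s, 255) on zero-extended bytes => SAT_ADD_unsigned.
void SatAddU8(uint8_t* out, const uint8_t* a, const uint8_t* b, int n) {
  for (int i = 0; i < n; ++i) {
    out[i] = static_cast<uint8_t>(std::min(a[i] + b[i], 255));  // clips at uhi
  }
}

// MAX(r - s, 0) on zero-extended bytes => SAT_SUB_unsigned.
void SatSubU8(uint8_t* out, const uint8_t* a, const uint8_t* b, int n) {
  for (int i = 0; i < n; ++i) {
    out[i] = static_cast<uint8_t>(std::max(a[i] - b[i], 0));  // clips at ulo
  }
}

With kNoSaturation unset, VectorizeSaturationIdiom maps loops of this shape to a single HVecSaturationAdd or HVecSaturationSub per vector chunk instead of a widen/add/clip/narrow sequence.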

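Similarly, a self-contained sketch of the range-tightening walk in FindClippee, with a toy node type standing in for HInstruction (all types and fields below are hypothetical stand-ins, assumed only for illustration). It reproduces the worked example from the comment: MIN(10, MIN(20, MAX(0, x))) yields [0, 10] with clippee x.

#include <algorithm>
#include <cstdint>
#include <limits>

// Toy stand-in for the HIR: a MIN/MAX node with one constant operand, or a leaf.
enum class Kind { kMin, kMax, kLeaf };
struct Node {
  Kind kind;
  int64_t constant;   // the constant operand (GetConstantRight analogue)
  const Node* other;  // the non-constant operand (GetLeastConstantLeft analogue)
};

const Node* FindClippee(const Node* n, int64_t* lo, int64_t* hi) {
  while (n->kind != Kind::kLeaf) {
    if (n->kind == Kind::kMin) {
      *hi = std::min(*hi, n->constant);  // MIN against c lowers the upper bound
    } else {
      *lo = std::max(*lo, n->constant);  // MAX against c raises the lower bound
    }
    n = n->other;
  }
  return n;  // the leaf is the clippee, clipped to [*lo, *hi]
}

// MIN(10, MIN(20, MAX(0, x))):
//   Node x{Kind::kLeaf, 0, nullptr};
//   Node m0{Kind::kMax, 0, &x};
//   Node m1{Kind::kMin, 20, &m0};
//   Node m2{Kind::kMin, 10, &m1};
//   int64_t lo = std::numeric_limits<int64_t>::min();
//   int64_t hi = std::numeric_limits<int64_t>::max();
//   FindClippee(&m2, &lo, &hi);  // returns &x with lo == 0, hi == 10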