Diffstat (limited to 'compiler/optimizing/loop_optimization.cc')
| -rw-r--r-- | compiler/optimizing/loop_optimization.cc | 765 |
1 file changed, 496 insertions, 269 deletions
diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc
index 899496328eb..1462404932e 100644
--- a/compiler/optimizing/loop_optimization.cc
+++ b/compiler/optimizing/loop_optimization.cc
@@ -33,8 +33,8 @@ namespace art {
 // Enables vectorization (SIMDization) in the loop optimizer.
 static constexpr bool kEnableVectorization = true;
 
-// No loop unrolling factor (just one copy of the loop-body).
-static constexpr uint32_t kNoUnrollingFactor = 1;
+// Enables scalar loop peeling and unrolling in the loop optimizer.
+static constexpr bool kEnableScalarPeelingUnrolling = false;
 
 //
 // Static helpers.
@@ -153,6 +153,18 @@ static bool IsSignExtensionAndGet(HInstruction* instruction,
       return false;
     }
   }
+  // A MIN-MAX on narrower operands qualifies as well
+  // (returning the operator itself).
+  if (instruction->IsMin() || instruction->IsMax()) {
+    HBinaryOperation* min_max = instruction->AsBinaryOperation();
+    DCHECK(min_max->GetType() == DataType::Type::kInt32 ||
+           min_max->GetType() == DataType::Type::kInt64);
+    if (IsSignExtensionAndGet(min_max->InputAt(0), type, operand) &&
+        IsSignExtensionAndGet(min_max->InputAt(1), type, operand)) {
+      *operand = min_max;
+      return true;
+    }
+  }
   return false;
 }
 
@@ -216,6 +228,18 @@ static bool IsZeroExtensionAndGet(HInstruction* instruction,
       return false;
     }
   }
+  // A MIN-MAX on narrower operands qualifies as well
+  // (returning the operator itself).
+  if (instruction->IsMin() || instruction->IsMax()) {
+    HBinaryOperation* min_max = instruction->AsBinaryOperation();
+    DCHECK(min_max->GetType() == DataType::Type::kInt32 ||
+           min_max->GetType() == DataType::Type::kInt64);
+    if (IsZeroExtensionAndGet(min_max->InputAt(0), type, operand) &&
+        IsZeroExtensionAndGet(min_max->InputAt(1), type, operand)) {
+      *operand = min_max;
+      return true;
+    }
+  }
   return false;
 }
 
@@ -227,6 +251,7 @@ static bool IsNarrowerOperands(HInstruction* a,
                                /*out*/ HInstruction** r,
                                /*out*/ HInstruction** s,
                                /*out*/ bool* is_unsigned) {
+  DCHECK(a != nullptr && b != nullptr);
   // Look for a matching sign extension.
   DataType::Type stype = HVecOperation::ToSignedType(type);
   if (IsSignExtensionAndGet(a, stype, r) && IsSignExtensionAndGet(b, stype, s)) {
@@ -247,6 +272,7 @@ static bool IsNarrowerOperand(HInstruction* a,
                               DataType::Type type,
                               /*out*/ HInstruction** r,
                               /*out*/ bool* is_unsigned) {
+  DCHECK(a != nullptr);
  // Look for a matching sign extension.
  DataType::Type stype = HVecOperation::ToSignedType(type);
  if (IsSignExtensionAndGet(a, stype, r)) {
@@ -270,20 +296,28 @@ static uint32_t GetOtherVL(DataType::Type other_type, DataType::Type vector_type
   return vl >> (DataType::SizeShift(other_type) - DataType::SizeShift(vector_type));
 }
 
-// Detect up to two instructions a and b, and an acccumulated constant c.
-static bool IsAddConstHelper(HInstruction* instruction,
-                             /*out*/ HInstruction** a,
-                             /*out*/ HInstruction** b,
-                             /*out*/ int64_t* c,
-                             int32_t depth) {
-  static constexpr int32_t kMaxDepth = 8;  // don't search too deep
+// Detect up to two added operands a and b and an accumulated constant c.
+static bool IsAddConst(HInstruction* instruction,
+                       /*out*/ HInstruction** a,
+                       /*out*/ HInstruction** b,
+                       /*out*/ int64_t* c,
+                       int32_t depth = 8) {  // don't search too deep
   int64_t value = 0;
+  // Enter add/sub while still within reasonable depth.
+  if (depth > 0) {
+    if (instruction->IsAdd()) {
+      return IsAddConst(instruction->InputAt(0), a, b, c, depth - 1) &&
+             IsAddConst(instruction->InputAt(1), a, b, c, depth - 1);
+    } else if (instruction->IsSub() &&
+               IsInt64AndGet(instruction->InputAt(1), &value)) {
+      *c -= value;
+      return IsAddConst(instruction->InputAt(0), a, b, c, depth - 1);
+    }
+  }
+  // Otherwise, deal with leaf nodes.
   if (IsInt64AndGet(instruction, &value)) {
     *c += value;
     return true;
-  } else if (instruction->IsAdd() && depth <= kMaxDepth) {
-    return IsAddConstHelper(instruction->InputAt(0), a, b, c, depth + 1) &&
-           IsAddConstHelper(instruction->InputAt(1), a, b, c, depth + 1);
   } else if (*a == nullptr) {
     *a = instruction;
     return true;
@@ -291,72 +325,170 @@ static bool IsAddConstHelper(HInstruction* instruction,
     *b = instruction;
     return true;
   }
-  return false;  // too many non-const operands
+  return false;  // too many operands
 }
 
-// Detect a + b + c for an optional constant c.
-static bool IsAddConst(HInstruction* instruction,
-                       /*out*/ HInstruction** a,
-                       /*out*/ HInstruction** b,
-                       /*out*/ int64_t* c) {
-  if (instruction->IsAdd()) {
-    // Try to find a + b and accumulated c.
-    if (IsAddConstHelper(instruction->InputAt(0), a, b, c, /*depth*/ 0) &&
-        IsAddConstHelper(instruction->InputAt(1), a, b, c, /*depth*/ 0) &&
-        *b != nullptr) {
-      return true;
+// Detect a + b + c with optional constant c.
+static bool IsAddConst2(HGraph* graph,
+                        HInstruction* instruction,
+                        /*out*/ HInstruction** a,
+                        /*out*/ HInstruction** b,
+                        /*out*/ int64_t* c) {
+  if (IsAddConst(instruction, a, b, c) && *a != nullptr) {
+    if (*b == nullptr) {
+      // Constant is usually already present, unless accumulated.
+      *b = graph->GetConstant(instruction->GetType(), (*c));
+      *c = 0;
     }
-    // Found a + b.
+    return true;
+  }
+  return false;
+}
+
+// Detect a direct a - b or a hidden a - (-c).
+static bool IsSubConst2(HGraph* graph,
+                        HInstruction* instruction,
+                        /*out*/ HInstruction** a,
+                        /*out*/ HInstruction** b) {
+  int64_t c = 0;
+  if (instruction->IsSub()) {
     *a = instruction->InputAt(0);
     *b = instruction->InputAt(1);
-    *c = 0;
+    return true;
+  } else if (IsAddConst(instruction, a, b, &c) && *a != nullptr && *b == nullptr) {
+    // Constant for the hidden subtraction.
+    *b = graph->GetConstant(instruction->GetType(), -c);
     return true;
   }
   return false;
 }
 
-// Detect a + c for constant c.
-static bool IsAddConst(HInstruction* instruction,
-                       /*out*/ HInstruction** a,
-                       /*out*/ int64_t* c) {
-  if (instruction->IsAdd()) {
-    if (IsInt64AndGet(instruction->InputAt(0), c)) {
-      *a = instruction->InputAt(1);
-      return true;
-    } else if (IsInt64AndGet(instruction->InputAt(1), c)) {
-      *a = instruction->InputAt(0);
-      return true;
+// Detect clipped [lo, hi] range for nested MIN-MAX operations on a clippee,
+// such as MIN(hi, MAX(lo, clippee)) for an arbitrary clippee expression.
+// Example: MIN(10, MIN(20, MAX(0, x))) yields [0, 10] with clippee x.
+static HInstruction* FindClippee(HInstruction* instruction,
+                                 /*out*/ int64_t* lo,
+                                 /*out*/ int64_t* hi) {
+  // Iterate into MIN(.., c)-MAX(.., c) expressions and 'tighten' the range [lo, hi].
+  while (instruction->IsMin() || instruction->IsMax()) {
+    HBinaryOperation* min_max = instruction->AsBinaryOperation();
+    DCHECK(min_max->GetType() == DataType::Type::kInt32 ||
+           min_max->GetType() == DataType::Type::kInt64);
+    // Process the constant.
+    HConstant* right = min_max->GetConstantRight();
+    if (right == nullptr) {
+      break;
+    } else if (instruction->IsMin()) {
+      *hi = std::min(*hi, Int64FromConstant(right));
+    } else {
+      *lo = std::max(*lo, Int64FromConstant(right));
     }
+    instruction = min_max->GetLeastConstantLeft();
+  }
+  // Iteration ends in any other expression (possibly MIN/MAX without constant).
+  // This leaf expression is the clippee with range [lo, hi].
+  return instruction;
+}
+
+// Set value range for type (or fail).
+static bool CanSetRange(DataType::Type type,
+                        /*out*/ int64_t* uhi,
+                        /*out*/ int64_t* slo,
+                        /*out*/ int64_t* shi) {
+  if (DataType::Size(type) == 1) {
+    *uhi = std::numeric_limits<uint8_t>::max();
+    *slo = std::numeric_limits<int8_t>::min();
+    *shi = std::numeric_limits<int8_t>::max();
+    return true;
+  } else if (DataType::Size(type) == 2) {
+    *uhi = std::numeric_limits<uint16_t>::max();
+    *slo = std::numeric_limits<int16_t>::min();
+    *shi = std::numeric_limits<int16_t>::max();
+    return true;
   }
   return false;
 }
 
+// Accept various saturated addition forms.
+static bool IsSaturatedAdd(HInstruction* a,
+                           HInstruction* b,
+                           DataType::Type type,
+                           int64_t lo,
+                           int64_t hi,
+                           bool is_unsigned) {
+  int64_t ulo = 0, uhi = 0, slo = 0, shi = 0;
+  if (!CanSetRange(type, &uhi, &slo, &shi)) {
+    return false;
+  }
+  // Tighten the range for signed single clipping on constant.
+  if (!is_unsigned) {
+    int64_t c = 0;
+    if (IsInt64AndGet(a, &c) || IsInt64AndGet(b, &c)) {
+      // For c in proper range and narrower operand r:
+      //    MIN(r + c,  127) c > 0
+      // or MAX(r + c, -128) c < 0 (and possibly redundant bound).
+      if (0 < c && c <= shi && hi == shi) {
+        if (lo <= (slo + c)) {
+          return true;
+        }
+      } else if (slo <= c && c < 0 && lo == slo) {
+        if (hi >= (shi + c)) {
+          return true;
+        }
+      }
+    }
+  }
+  // Detect for narrower operands r and s:
+  //     MIN(r + s, 255)        => SAT_ADD_unsigned
+  // MAX(MIN(r + s, 127), -128) => SAT_ADD_signed.
+  return is_unsigned ? (lo <= ulo && hi == uhi) : (lo == slo && hi == shi);
+}
+
+// Accept various saturated subtraction forms.
+static bool IsSaturatedSub(HInstruction* a,
+                           DataType::Type type,
+                           int64_t lo,
+                           int64_t hi,
+                           bool is_unsigned) {
+  int64_t ulo = 0, uhi = 0, slo = 0, shi = 0;
+  if (!CanSetRange(type, &uhi, &slo, &shi)) {
+    return false;
+  }
+  // Tighten the range for signed single clipping on constant.
+  if (!is_unsigned) {
+    int64_t c = 0;
+    if (IsInt64AndGet(a, /*out*/ &c)) {
+      // For c in proper range and narrower operand r:
+      //    MIN(c - r,  127) c > 0
+      // or MAX(c - r, -128) c < 0 (and possibly redundant bound).
+      if (0 < c && c <= shi && hi == shi) {
+        if (lo <= (c - shi)) {
+          return true;
+        }
+      } else if (slo <= c && c < 0 && lo == slo) {
+        if (hi >= (c - slo)) {
+          return true;
+        }
+      }
+    }
+  }
+  // Detect for narrower operands r and s:
+  //     MAX(r - s, 0)          => SAT_SUB_unsigned
+  // MIN(MAX(r - s, -128), 127) => SAT_SUB_signed.
+  return is_unsigned ? (lo == ulo && hi >= uhi) : (lo == slo && hi == shi);
+}
+
 // Detect reductions of the following forms,
 // x = x_phi + ..
 // x = x_phi - ..
-// x = max(x_phi, ..)
 // x = min(x_phi, ..)
+// x = max(x_phi, ..)
 static bool HasReductionFormat(HInstruction* reduction, HInstruction* phi) {
-  if (reduction->IsAdd()) {
+  if (reduction->IsAdd() || reduction->IsMin() || reduction->IsMax()) {
     return (reduction->InputAt(0) == phi && reduction->InputAt(1) != phi) ||
            (reduction->InputAt(0) != phi && reduction->InputAt(1) == phi);
   } else if (reduction->IsSub()) {
     return (reduction->InputAt(0) == phi && reduction->InputAt(1) != phi);
-  } else if (reduction->IsInvokeStaticOrDirect()) {
-    switch (reduction->AsInvokeStaticOrDirect()->GetIntrinsic()) {
-      case Intrinsics::kMathMinIntInt:
-      case Intrinsics::kMathMinLongLong:
-      case Intrinsics::kMathMinFloatFloat:
-      case Intrinsics::kMathMinDoubleDouble:
-      case Intrinsics::kMathMaxIntInt:
-      case Intrinsics::kMathMaxLongLong:
-      case Intrinsics::kMathMaxFloatFloat:
-      case Intrinsics::kMathMaxDoubleDouble:
-        return (reduction->InputAt(0) == phi && reduction->InputAt(1) != phi) ||
-               (reduction->InputAt(0) != phi && reduction->InputAt(1) == phi);
-      default:
-        return false;
-    }
   }
   return false;
 }
@@ -401,6 +533,43 @@ static bool CheckInductionSetFullyRemoved(ScopedArenaSet<HInstruction*>* iset) {
   return true;
 }
 
+// Tries to statically evaluate condition of the specified "HIf" for other condition checks.
+static void TryToEvaluateIfCondition(HIf* instruction, HGraph* graph) {
+  HInstruction* cond = instruction->InputAt(0);
+
+  // If a condition 'cond' is evaluated in an HIf instruction then in the successors of the
+  // IF_BLOCK we statically know the value of the condition 'cond' (TRUE in TRUE_SUCC, FALSE in
+  // FALSE_SUCC). Using that we can replace another evaluation (use) EVAL of the same 'cond'
+  // with TRUE value (FALSE value) if every path from the ENTRY_BLOCK to EVAL_BLOCK contains the
+  // edge HIF_BLOCK->TRUE_SUCC (HIF_BLOCK->FALSE_SUCC).
+  //     if (cond) {               if (cond) {
+  //       if (cond) {}              if (1) {}
+  //     } else {        =======>  } else {
+  //       if (cond) {}              if (0) {}
+  //     }                         }
+  if (!cond->IsConstant()) {
+    HBasicBlock* true_succ = instruction->IfTrueSuccessor();
+    HBasicBlock* false_succ = instruction->IfFalseSuccessor();
+
+    DCHECK_EQ(true_succ->GetPredecessors().size(), 1u);
+    DCHECK_EQ(false_succ->GetPredecessors().size(), 1u);
+
+    const HUseList<HInstruction*>& uses = cond->GetUses();
+    for (auto it = uses.begin(), end = uses.end(); it != end; /* ++it below */) {
+      HInstruction* user = it->GetUser();
+      size_t index = it->GetIndex();
+      HBasicBlock* user_block = user->GetBlock();
+      // Increment `it` now because `*it` may disappear thanks to user->ReplaceInput().
+      ++it;
+      if (true_succ->Dominates(user_block)) {
+        user->ReplaceInput(graph->GetIntConstant(1), index);
+      } else if (false_succ->Dominates(user_block)) {
+        user->ReplaceInput(graph->GetIntConstant(0), index);
+      }
+    }
+  }
+}
+
 //
 // Public methods.
 //
@@ -432,7 +601,11 @@ HLoopOptimization::HLoopOptimization(HGraph* graph,
       vector_preheader_(nullptr),
       vector_header_(nullptr),
       vector_body_(nullptr),
-      vector_index_(nullptr) {
+      vector_index_(nullptr),
+      arch_loop_helper_(ArchDefaultLoopHelper::Create(compiler_driver_ != nullptr
+                                                          ? compiler_driver_->GetInstructionSet()
+                                                          : InstructionSet::kNone,
+                                                      global_allocator_)) {
 }
 
 void HLoopOptimization::Run() {
@@ -643,7 +816,7 @@ void HLoopOptimization::SimplifyBlocks(LoopNode* node) {
   }
 }
 
-bool HLoopOptimization::OptimizeInnerLoop(LoopNode* node) {
+bool HLoopOptimization::TryOptimizeInnerLoopFinite(LoopNode* node) {
   HBasicBlock* header = node->loop_info->GetHeader();
   HBasicBlock* preheader = node->loop_info->GetPreHeader();
   // Ensure loop header logic is finite.
@@ -713,6 +886,103 @@ bool HLoopOptimization::OptimizeInnerLoop(LoopNode* node) {
   return false;
 }
 
+bool HLoopOptimization::OptimizeInnerLoop(LoopNode* node) {
+  return TryOptimizeInnerLoopFinite(node) ||
+         TryPeelingForLoopInvariantExitsElimination(node) ||
+         TryUnrollingForBranchPenaltyReduction(node);
+}
+
+//
+// Loop unrolling: generic part methods.
+//
+
+bool HLoopOptimization::TryUnrollingForBranchPenaltyReduction(LoopNode* node) {
+  // Don't run peeling/unrolling if compiler_driver_ is nullptr (i.e., running under tests)
+  // as InstructionSet is needed.
+  if (!kEnableScalarPeelingUnrolling || compiler_driver_ == nullptr) {
+    return false;
+  }
+
+  HLoopInformation* loop_info = node->loop_info;
+  int64_t trip_count = 0;
+  // Only unroll loops with a known trip count.
+  if (!induction_range_.HasKnownTripCount(loop_info, &trip_count)) {
+    return false;
+  }
+
+  uint32_t unrolling_factor = arch_loop_helper_->GetScalarUnrollingFactor(loop_info, trip_count);
+  if (unrolling_factor == kNoUnrollingFactor) {
+    return false;
+  }
+
+  LoopAnalysisInfo loop_analysis_info(loop_info);
+  LoopAnalysis::CalculateLoopBasicProperties(loop_info, &loop_analysis_info);
+
+  // Check "IsLoopClonable" last as it can be time-consuming.
+  if (arch_loop_helper_->IsLoopTooBigForScalarPeelingUnrolling(&loop_analysis_info) ||
+      (loop_analysis_info.GetNumberOfExits() > 1) ||
+      loop_analysis_info.HasInstructionsPreventingScalarUnrolling() ||
+      !PeelUnrollHelper::IsLoopClonable(loop_info)) {
+    return false;
+  }
+
+  // TODO: support other unrolling factors.
+  DCHECK_EQ(unrolling_factor, 2u);
+
+  // Perform unrolling.
+  PeelUnrollSimpleHelper helper(loop_info);
+  helper.DoUnrolling();
+
+  // Remove the redundant loop check after unrolling.
+  HIf* copy_hif =
+      helper.GetBasicBlockMap()->Get(loop_info->GetHeader())->GetLastInstruction()->AsIf();
+  int32_t constant = loop_info->Contains(*copy_hif->IfTrueSuccessor()) ? 1 : 0;
+  copy_hif->ReplaceInput(graph_->GetIntConstant(constant), 0u);
+
+  return true;
+}
+
+bool HLoopOptimization::TryPeelingForLoopInvariantExitsElimination(LoopNode* node) {
+  // Don't run peeling/unrolling if compiler_driver_ is nullptr (i.e., running under tests)
+  // as InstructionSet is needed.
+  if (!kEnableScalarPeelingUnrolling || compiler_driver_ == nullptr) {
+    return false;
+  }
+
+  HLoopInformation* loop_info = node->loop_info;
+  if (!arch_loop_helper_->IsLoopPeelingEnabled()) {
+    return false;
+  }
+
+  LoopAnalysisInfo loop_analysis_info(loop_info);
+  LoopAnalysis::CalculateLoopBasicProperties(loop_info, &loop_analysis_info);
+
+  // Check "IsLoopClonable" last as it can be time-consuming.
+  if (arch_loop_helper_->IsLoopTooBigForScalarPeelingUnrolling(&loop_analysis_info) ||
+      loop_analysis_info.HasInstructionsPreventingScalarPeeling() ||
+      !LoopAnalysis::HasLoopAtLeastOneInvariantExit(loop_info) ||
+      !PeelUnrollHelper::IsLoopClonable(loop_info)) {
+    return false;
+  }
+
+  // Perform peeling.
+  PeelUnrollSimpleHelper helper(loop_info);
+  helper.DoPeeling();
+
+  const SuperblockCloner::HInstructionMap* hir_map = helper.GetInstructionMap();
+  for (auto entry : *hir_map) {
+    HInstruction* copy = entry.second;
+    if (copy->IsIf()) {
+      TryToEvaluateIfCondition(copy->AsIf(), graph_);
+    }
+  }
+
+  return true;
+}
+
 //
 // Loop vectorization. The implementation is based on the book by Aart J.C. Bik:
 // "The Software Vectorization Handbook. Applying Multimedia Extensions for Maximum Performance."
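For intuition, the following is a source-level sketch of what TryUnrollingForBranchPenaltyReduction above does to a loop. The function, its names, and the even known trip count are illustrative assumptions, not code from this patch; the pass itself operates on HIR, unrolls by the factor 2 asserted above, and folds the cloned header check by feeding its HIf a constant.

#include <cstddef>

// Before: one body copy; the header check branches on every iteration.
void Scale(int* a, size_t n) {  // assume n is known and even, so no epilogue is needed
  for (size_t i = 0; i < n; ++i) {
    a[i] *= 2;
  }
}

// After unrolling by 2: the duplicated trip check was replaced by a constant
// (copy_hif->ReplaceInput above) and folds away, so one back-edge branch now
// covers two body copies.
void ScaleUnrolledBy2(int* a, size_t n) {
  for (size_t i = 0; i < n; ) {
    a[i] *= 2;
    ++i;
    a[i] *= 2;  // second copy; its guarding check became a constant
    ++i;
  }
}

The loop body is otherwise unchanged; the benefit targeted here is purely the halved branch count on the back edge.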
@@ -843,7 +1113,8 @@ void HLoopOptimization::Vectorize(LoopNode* node,
   HBasicBlock* preheader = node->loop_info->GetPreHeader();
 
   // Pick a loop unrolling factor for the vector loop.
-  uint32_t unroll = GetUnrollingFactor(block, trip_count);
+  uint32_t unroll = arch_loop_helper_->GetSIMDUnrollingFactor(
+      block, trip_count, MaxNumberPeeled(), vector_length_);
   uint32_t chunk = vector_length_ * unroll;
 
   DCHECK(trip_count == 0 || (trip_count >= MaxNumberPeeled() + chunk));
@@ -1082,6 +1353,11 @@ bool HLoopOptimization::VectorizeDef(LoopNode* node,
     HInstruction* index = instruction->InputAt(1);
     HInstruction* value = instruction->InputAt(2);
     HInstruction* offset = nullptr;
+    // For narrow types, explicit type conversion may have been
+    // optimized away, so set the no hi bits restriction here.
+    if (DataType::Size(type) <= 2) {
+      restrictions |= kNoHiBits;
+    }
     if (TrySetVectorType(type, &restrictions) &&
         node->loop_info->IsDefinedOutOfTheLoop(base) &&
         induction_range_.IsUnitStride(instruction, index, graph_, &offset) &&
@@ -1124,7 +1400,6 @@ bool HLoopOptimization::VectorizeDef(LoopNode* node,
   return !IsUsedOutsideLoop(node->loop_info, instruction) && !instruction->DoesAnyWrite();
 }
 
-// TODO: saturation arithmetic.
 bool HLoopOptimization::VectorizeUse(LoopNode* node,
                                      HInstruction* instruction,
                                      bool generate_code,
@@ -1297,80 +1572,62 @@ bool HLoopOptimization::VectorizeUse(LoopNode* node,
         return true;
       }
     }
-  } else if (instruction->IsInvokeStaticOrDirect()) {
-    // Accept particular intrinsics.
-    HInvokeStaticOrDirect* invoke = instruction->AsInvokeStaticOrDirect();
-    switch (invoke->GetIntrinsic()) {
-      case Intrinsics::kMathAbsInt:
-      case Intrinsics::kMathAbsLong:
-      case Intrinsics::kMathAbsFloat:
-      case Intrinsics::kMathAbsDouble: {
-        // Deal with vector restrictions.
-        HInstruction* opa = instruction->InputAt(0);
-        HInstruction* r = opa;
-        bool is_unsigned = false;
-        if (HasVectorRestrictions(restrictions, kNoAbs)) {
-          return false;
-        } else if (HasVectorRestrictions(restrictions, kNoHiBits) &&
-                   (!IsNarrowerOperand(opa, type, &r, &is_unsigned) || is_unsigned)) {
-          return false;  // reject, unless operand is sign-extension narrower
-        }
-        // Accept ABS(x) for vectorizable operand.
-        DCHECK(r != nullptr);
-        if (generate_code && vector_mode_ != kVector) {  // de-idiom
-          r = opa;
-        }
-        if (VectorizeUse(node, r, generate_code, type, restrictions)) {
-          if (generate_code) {
-            GenerateVecOp(instruction,
-                          vector_map_->Get(r),
-                          nullptr,
-                          HVecOperation::ToProperType(type, is_unsigned));
-          }
-          return true;
-        }
-        return false;
+  } else if (instruction->IsAbs()) {
+    // Deal with vector restrictions.
+    HInstruction* opa = instruction->InputAt(0);
+    HInstruction* r = opa;
+    bool is_unsigned = false;
+    if (HasVectorRestrictions(restrictions, kNoAbs)) {
+      return false;
+    } else if (HasVectorRestrictions(restrictions, kNoHiBits) &&
+               (!IsNarrowerOperand(opa, type, &r, &is_unsigned) || is_unsigned)) {
+      return false;  // reject, unless operand is sign-extension narrower
+    }
+    // Accept ABS(x) for vectorizable operand.
+    DCHECK(r != nullptr);
+    if (generate_code && vector_mode_ != kVector) {  // de-idiom
+      r = opa;
+    }
+    if (VectorizeUse(node, r, generate_code, type, restrictions)) {
+      if (generate_code) {
+        GenerateVecOp(instruction,
+                      vector_map_->Get(r),
+                      nullptr,
+                      HVecOperation::ToProperType(type, is_unsigned));
       }
-      case Intrinsics::kMathMinIntInt:
-      case Intrinsics::kMathMinLongLong:
-      case Intrinsics::kMathMinFloatFloat:
-      case Intrinsics::kMathMinDoubleDouble:
-      case Intrinsics::kMathMaxIntInt:
-      case Intrinsics::kMathMaxLongLong:
-      case Intrinsics::kMathMaxFloatFloat:
-      case Intrinsics::kMathMaxDoubleDouble: {
-        // Deal with vector restrictions.
-        HInstruction* opa = instruction->InputAt(0);
-        HInstruction* opb = instruction->InputAt(1);
-        HInstruction* r = opa;
-        HInstruction* s = opb;
-        bool is_unsigned = false;
-        if (HasVectorRestrictions(restrictions, kNoMinMax)) {
-          return false;
-        } else if (HasVectorRestrictions(restrictions, kNoHiBits) &&
-                   !IsNarrowerOperands(opa, opb, type, &r, &s, &is_unsigned)) {
-          return false;  // reject, unless all operands are same-extension narrower
-        }
-        // Accept MIN/MAX(x, y) for vectorizable operands.
-        DCHECK(r != nullptr);
-        DCHECK(s != nullptr);
-        if (generate_code && vector_mode_ != kVector) {  // de-idiom
-          r = opa;
-          s = opb;
-        }
-        if (VectorizeUse(node, r, generate_code, type, restrictions) &&
-            VectorizeUse(node, s, generate_code, type, restrictions)) {
-          if (generate_code) {
-            GenerateVecOp(
-                instruction, vector_map_->Get(r), vector_map_->Get(s), type, is_unsigned);
-          }
-          return true;
-        }
-        return false;
+      return true;
+    }
+  } else if (instruction->IsMin() || instruction->IsMax()) {
+    // Recognize saturation arithmetic.
+    if (VectorizeSaturationIdiom(node, instruction, generate_code, type, restrictions)) {
+      return true;
+    }
+    // Deal with vector restrictions.
+    HInstruction* opa = instruction->InputAt(0);
+    HInstruction* opb = instruction->InputAt(1);
+    HInstruction* r = opa;
+    HInstruction* s = opb;
+    bool is_unsigned = false;
+    if (HasVectorRestrictions(restrictions, kNoMinMax)) {
+      return false;
+    } else if (HasVectorRestrictions(restrictions, kNoHiBits) &&
+               !IsNarrowerOperands(opa, opb, type, &r, &s, &is_unsigned)) {
+      return false;  // reject, unless all operands are same-extension narrower
+    }
+    // Accept MIN/MAX(x, y) for vectorizable operands.
+    DCHECK(r != nullptr && s != nullptr);
+    if (generate_code && vector_mode_ != kVector) {  // de-idiom
+      r = opa;
+      s = opb;
+    }
+    if (VectorizeUse(node, r, generate_code, type, restrictions) &&
+        VectorizeUse(node, s, generate_code, type, restrictions)) {
+      if (generate_code) {
+        GenerateVecOp(
+            instruction, vector_map_->Get(r), vector_map_->Get(s), type, is_unsigned);
       }
-      default:
-        return false;
-    }  // switch
+      return true;
+    }
   }
   return false;
 }
@@ -1475,11 +1732,11 @@ bool HLoopOptimization::TrySetVectorType(DataType::Type type, uint64_t* restrict
       case DataType::Type::kBool:
       case DataType::Type::kUint8:
      case DataType::Type::kInt8:
-        *restrictions |= kNoDiv;
+        *restrictions |= kNoDiv | kNoSaturation;
         return TrySetVectorLength(16);
       case DataType::Type::kUint16:
       case DataType::Type::kInt16:
-        *restrictions |= kNoDiv | kNoStringCharAt;
+        *restrictions |= kNoDiv | kNoSaturation | kNoStringCharAt;
         return TrySetVectorLength(8);
       case DataType::Type::kInt32:
         *restrictions |= kNoDiv;
@@ -1504,11 +1761,11 @@ bool HLoopOptimization::TrySetVectorType(DataType::Type type, uint64_t* restrict
       case DataType::Type::kBool:
       case DataType::Type::kUint8:
       case DataType::Type::kInt8:
-        *restrictions |= kNoDiv;
+        *restrictions |= kNoDiv | kNoSaturation;
         return TrySetVectorLength(16);
       case DataType::Type::kUint16:
       case DataType::Type::kInt16:
-        *restrictions |= kNoDiv | kNoStringCharAt;
+        *restrictions |= kNoDiv | kNoSaturation | kNoStringCharAt;
         return TrySetVectorLength(8);
       case DataType::Type::kInt32:
         *restrictions |= kNoDiv;
@@ -1811,83 +2068,29 @@ void HLoopOptimization::GenerateVecOp(HInstruction* org,
       GENERATE_VEC(
         new (global_allocator_) HVecUShr(global_allocator_, opa, opb, type, vector_length_, dex_pc),
         new (global_allocator_) HUShr(org_type, opa, opb, dex_pc));
-    case HInstruction::kInvokeStaticOrDirect: {
-      HInvokeStaticOrDirect* invoke = org->AsInvokeStaticOrDirect();
-      if (vector_mode_ == kVector) {
-        switch (invoke->GetIntrinsic()) {
-          case Intrinsics::kMathAbsInt:
-          case Intrinsics::kMathAbsLong:
-          case Intrinsics::kMathAbsFloat:
-          case Intrinsics::kMathAbsDouble:
-            DCHECK(opb == nullptr);
-            vector = new (global_allocator_)
-                HVecAbs(global_allocator_, opa, type, vector_length_, dex_pc);
-            break;
-          case Intrinsics::kMathMinIntInt:
-          case Intrinsics::kMathMinLongLong:
-          case Intrinsics::kMathMinFloatFloat:
-          case Intrinsics::kMathMinDoubleDouble: {
-            vector = new (global_allocator_)
-                HVecMin(global_allocator_,
-                        opa,
-                        opb,
-                        HVecOperation::ToProperType(type, is_unsigned),
-                        vector_length_,
-                        dex_pc);
-            break;
-          }
-          case Intrinsics::kMathMaxIntInt:
-          case Intrinsics::kMathMaxLongLong:
-          case Intrinsics::kMathMaxFloatFloat:
-          case Intrinsics::kMathMaxDoubleDouble: {
-            vector = new (global_allocator_)
-                HVecMax(global_allocator_,
-                        opa,
-                        opb,
-                        HVecOperation::ToProperType(type, is_unsigned),
-                        vector_length_,
-                        dex_pc);
-            break;
-          }
-          default:
-            LOG(FATAL) << "Unsupported SIMD intrinsic " << org->GetId();
-            UNREACHABLE();
-        }  // switch invoke
-      } else {
-        // In scalar code, simply clone the method invoke, and replace its operands with the
-        // corresponding new scalar instructions in the loop. The instruction will get an
-        // environment while being inserted from the instruction map in original program order.
-        DCHECK(vector_mode_ == kSequential);
-        size_t num_args = invoke->GetNumberOfArguments();
-        HInvokeStaticOrDirect* new_invoke = new (global_allocator_) HInvokeStaticOrDirect(
-            global_allocator_,
-            num_args,
-            invoke->GetType(),
-            invoke->GetDexPc(),
-            invoke->GetDexMethodIndex(),
-            invoke->GetResolvedMethod(),
-            invoke->GetDispatchInfo(),
-            invoke->GetInvokeType(),
-            invoke->GetTargetMethod(),
-            invoke->GetClinitCheckRequirement());
-        HInputsRef inputs = invoke->GetInputs();
-        size_t num_inputs = inputs.size();
-        DCHECK_LE(num_args, num_inputs);
-        DCHECK_EQ(num_inputs, new_invoke->GetInputs().size());  // both invokes agree
-        for (size_t index = 0; index < num_inputs; ++index) {
-          HInstruction* new_input = index < num_args
-              ? vector_map_->Get(inputs[index])
-              : inputs[index];  // beyond arguments: just pass through
-          new_invoke->SetArgumentAt(index, new_input);
-        }
-        new_invoke->SetIntrinsic(invoke->GetIntrinsic(),
-                                 kNeedsEnvironmentOrCache,
-                                 kNoSideEffects,
-                                 kNoThrow);
-        vector = new_invoke;
-      }
-      break;
-    }
+    case HInstruction::kMin:
+      GENERATE_VEC(
+        new (global_allocator_) HVecMin(global_allocator_,
+                                        opa,
+                                        opb,
+                                        HVecOperation::ToProperType(type, is_unsigned),
+                                        vector_length_,
+                                        dex_pc),
+        new (global_allocator_) HMin(org_type, opa, opb, dex_pc));
+    case HInstruction::kMax:
+      GENERATE_VEC(
+        new (global_allocator_) HVecMax(global_allocator_,
+                                        opa,
+                                        opb,
+                                        HVecOperation::ToProperType(type, is_unsigned),
+                                        vector_length_,
+                                        dex_pc),
+        new (global_allocator_) HMax(org_type, opa, opb, dex_pc));
+    case HInstruction::kAbs:
+      DCHECK(opb == nullptr);
+      GENERATE_VEC(
+        new (global_allocator_) HVecAbs(global_allocator_, opa, type, vector_length_, dex_pc),
+        new (global_allocator_) HAbs(org_type, opa, dex_pc));
     default:
       break;
   }  // switch
@@ -1901,6 +2104,79 @@ void HLoopOptimization::GenerateVecOp(HInstruction* org,
 //
 // Vectorization idioms.
 //
 
+// Method recognizes single and double clipping saturation arithmetic.
+bool HLoopOptimization::VectorizeSaturationIdiom(LoopNode* node,
+                                                 HInstruction* instruction,
+                                                 bool generate_code,
+                                                 DataType::Type type,
+                                                 uint64_t restrictions) {
+  // Deal with vector restrictions.
+  if (HasVectorRestrictions(restrictions, kNoSaturation)) {
+    return false;
+  }
+  // Restrict type (generalize if one day we generalize allowed MIN/MAX integral types).
+  if (instruction->GetType() != DataType::Type::kInt32 &&
+      instruction->GetType() != DataType::Type::kInt64) {
+    return false;
+  }
+  // Clipped addition or subtraction on narrower operands? We will try both
+  // formats since, e.g., x+c can be interpreted as x+c and x-(-c), depending
+  // on what clipping values are used, to get the most benefit.
+  int64_t lo = std::numeric_limits<int64_t>::min();
+  int64_t hi = std::numeric_limits<int64_t>::max();
+  HInstruction* clippee = FindClippee(instruction, &lo, &hi);
+  HInstruction* a = nullptr;
+  HInstruction* b = nullptr;
+  HInstruction* r = nullptr;
+  HInstruction* s = nullptr;
+  bool is_unsigned = false;
+  bool is_add = true;
+  int64_t c = 0;
+  // First try for saturated addition.
+  if (IsAddConst2(graph_, clippee, /*out*/ &a, /*out*/ &b, /*out*/ &c) && c == 0 &&
+      IsNarrowerOperands(a, b, type, &r, &s, &is_unsigned) &&
+      IsSaturatedAdd(r, s, type, lo, hi, is_unsigned)) {
+    is_add = true;
+  } else {
+    // Then try again for saturated subtraction.
+    a = b = r = s = nullptr;
+    if (IsSubConst2(graph_, clippee, /*out*/ &a, /*out*/ &b) &&
+        IsNarrowerOperands(a, b, type, &r, &s, &is_unsigned) &&
+        IsSaturatedSub(r, type, lo, hi, is_unsigned)) {
+      is_add = false;
+    } else {
+      return false;
+    }
+  }
+  // Accept saturation idiom for vectorizable operands.
+  DCHECK(r != nullptr && s != nullptr);
+  if (generate_code && vector_mode_ != kVector) {  // de-idiom
+    r = instruction->InputAt(0);
+    s = instruction->InputAt(1);
+    restrictions &= ~(kNoHiBits | kNoMinMax);  // allow narrow MIN/MAX in seq
+  }
+  if (VectorizeUse(node, r, generate_code, type, restrictions) &&
+      VectorizeUse(node, s, generate_code, type, restrictions)) {
+    if (generate_code) {
+      if (vector_mode_ == kVector) {
+        DataType::Type vtype = HVecOperation::ToProperType(type, is_unsigned);
+        HInstruction* op1 = vector_map_->Get(r);
+        HInstruction* op2 = vector_map_->Get(s);
+        vector_map_->Put(instruction, is_add
+            ? reinterpret_cast<HInstruction*>(new (global_allocator_) HVecSaturationAdd(
+                  global_allocator_, op1, op2, vtype, vector_length_, kNoDexPc))
+            : reinterpret_cast<HInstruction*>(new (global_allocator_) HVecSaturationSub(
+                  global_allocator_, op1, op2, vtype, vector_length_, kNoDexPc)));
+        MaybeRecordStat(stats_, MethodCompilationStat::kLoopVectorizedIdiom);
+      } else {
+        GenerateVecOp(instruction, vector_map_->Get(r), vector_map_->Get(s), type);
+      }
+    }
+    return true;
+  }
+  return false;
+}
+
 // Method recognizes the following idioms:
 //   rounding halving add (a + b + 1) >> 1 for unsigned/signed operands a, b
 //   truncated halving add (a + b)     >> 1 for unsigned/signed operands a, b
@@ -1924,8 +2200,7 @@ bool HLoopOptimization::VectorizeHalvingAddIdiom(LoopNode* node,
   HInstruction* a = nullptr;
   HInstruction* b = nullptr;
   int64_t c = 0;
-  if (IsAddConst(instruction->InputAt(0), /*out*/ &a, /*out*/ &b, /*out*/ &c)) {
-    DCHECK(a != nullptr && b != nullptr);
+  if (IsAddConst2(graph_, instruction->InputAt(0), /*out*/ &a, /*out*/ &b, /*out*/ &c)) {
     // Accept c == 1 (rounded) or c == 0 (not rounded).
     bool is_rounded = false;
     if (c == 1) {
@@ -1947,8 +2222,7 @@ bool HLoopOptimization::VectorizeHalvingAddIdiom(LoopNode* node,
   }
   // Accept recognized halving add for vectorizable operands. Vectorized code uses the
   // shorthand idiomatic operation. Sequential code uses the original scalar expressions.
-  DCHECK(r != nullptr);
-  DCHECK(s != nullptr);
+  DCHECK(r != nullptr && s != nullptr);
   if (generate_code && vector_mode_ != kVector) {  // de-idiom
     r = instruction->InputAt(0);
     s = instruction->InputAt(1);
@@ -1998,21 +2272,11 @@ bool HLoopOptimization::VectorizeSADIdiom(LoopNode* node,
   HInstruction* v = instruction->InputAt(1);
   HInstruction* a = nullptr;
   HInstruction* b = nullptr;
-  if (v->IsInvokeStaticOrDirect() &&
-      (v->AsInvokeStaticOrDirect()->GetIntrinsic() == Intrinsics::kMathAbsInt ||
-       v->AsInvokeStaticOrDirect()->GetIntrinsic() == Intrinsics::kMathAbsLong)) {
-    HInstruction* x = v->InputAt(0);
-    if (x->GetType() == reduction_type) {
-      int64_t c = 0;
-      if (x->IsSub()) {
-        a = x->InputAt(0);
-        b = x->InputAt(1);
-      } else if (IsAddConst(x, /*out*/ &a, /*out*/ &c)) {
-        b = graph_->GetConstant(reduction_type, -c);  // hidden SUB!
-      }
-    }
-  }
-  if (a == nullptr || b == nullptr) {
+  if (v->IsAbs() &&
+      v->GetType() == reduction_type &&
+      IsSubConst2(graph_, v->InputAt(0), /*out*/ &a, /*out*/ &b)) {
+    DCHECK(a != nullptr && b != nullptr);
+  } else {
     return false;
   }
   // Accept same-type or consistent sign extension for narrower-type on operands a and b.
@@ -2045,8 +2309,7 @@ bool HLoopOptimization::VectorizeSADIdiom(LoopNode* node,
   }
   // Accept SAD idiom for vectorizable operands. Vectorized code uses the shorthand
   // idiomatic operation. Sequential code uses the original scalar expressions.
-  DCHECK(r != nullptr);
-  DCHECK(s != nullptr);
+  DCHECK(r != nullptr && s != nullptr);
   if (generate_code && vector_mode_ != kVector) {  // de-idiom
     r = s = v->InputAt(0);
   }
@@ -2054,14 +2317,13 @@ bool HLoopOptimization::VectorizeSADIdiom(LoopNode* node,
       VectorizeUse(node, r, generate_code, sub_type, restrictions) &&
       VectorizeUse(node, s, generate_code, sub_type, restrictions)) {
     if (generate_code) {
-      reduction_type = HVecOperation::ToProperType(reduction_type, is_unsigned);
       if (vector_mode_ == kVector) {
         vector_map_->Put(instruction, new (global_allocator_) HVecSADAccumulate(
             global_allocator_,
             vector_map_->Get(q),
            vector_map_->Get(r),
            vector_map_->Get(s),
-            reduction_type,
+            HVecOperation::ToProperType(reduction_type, is_unsigned),
            GetOtherVL(reduction_type, sub_type, vector_length_),
            kNoDexPc));
        MaybeRecordStat(stats_, MethodCompilationStat::kLoopVectorizedIdiom);
@@ -2134,41 +2396,6 @@ bool HLoopOptimization::IsVectorizationProfitable(int64_t trip_count) {
   return true;
 }
 
-static constexpr uint32_t ARM64_SIMD_MAXIMUM_UNROLL_FACTOR = 8;
-static constexpr uint32_t ARM64_SIMD_HEURISTIC_MAX_BODY_SIZE = 50;
-
-uint32_t HLoopOptimization::GetUnrollingFactor(HBasicBlock* block, int64_t trip_count) {
-  uint32_t max_peel = MaxNumberPeeled();
-  switch (compiler_driver_->GetInstructionSet()) {
-    case InstructionSet::kArm64: {
-      // Don't unroll with insufficient iterations.
-      // TODO: Unroll loops with unknown trip count.
-      DCHECK_NE(vector_length_, 0u);
-      if (trip_count < (2 * vector_length_ + max_peel)) {
-        return kNoUnrollingFactor;
-      }
-      // Don't unroll for large loop body size.
-      uint32_t instruction_count = block->GetInstructions().CountSize();
-      if (instruction_count >= ARM64_SIMD_HEURISTIC_MAX_BODY_SIZE) {
-        return kNoUnrollingFactor;
-      }
-      // Find a beneficial unroll factor with the following restrictions:
-      //  - At least one iteration of the transformed loop should be executed.
-      //  - The loop body shouldn't be "too big" (heuristic).
-      uint32_t uf1 = ARM64_SIMD_HEURISTIC_MAX_BODY_SIZE / instruction_count;
-      uint32_t uf2 = (trip_count - max_peel) / vector_length_;
-      uint32_t unroll_factor =
-          TruncToPowerOfTwo(std::min({uf1, uf2, ARM64_SIMD_MAXIMUM_UNROLL_FACTOR}));
-      DCHECK_GE(unroll_factor, 1u);
-      return unroll_factor;
-    }
-    case InstructionSet::kX86:
-    case InstructionSet::kX86_64:
-    default:
-      return kNoUnrollingFactor;
-  }
-}
-
 //
 // Helpers.
 //
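As a reading aid for the idiom recognizers above, here is a minimal scalar sketch of the unsigned single-clipping forms named in the IsSaturatedAdd and IsSaturatedSub comments, MIN(r + s, 255) => SAT_ADD_unsigned and MAX(r - s, 0) => SAT_SUB_unsigned. The C++ functions and names are illustrative assumptions, not code from this patch; the compiler matches the equivalent MIN/MAX HIR after the byte operands have been widened to int.

#include <algorithm>
#include <cstdint>

// MIN(r + s, 255) on zero-extended bytes => SAT_ADD_unsigned.
void SatAddU8(uint8_t* out, const uint8_t* a, const uint8_t* b, int n) {
  for (int i = 0; i < n; ++i) {
    out[i] = static_cast<uint8_t>(std::min(a[i] + b[i], 255));  // clips at uhi
  }
}

// MAX(r - s, 0) on zero-extended bytes => SAT_SUB_unsigned.
void SatSubU8(uint8_t* out, const uint8_t* a, const uint8_t* b, int n) {
  for (int i = 0; i < n; ++i) {
    out[i] = static_cast<uint8_t>(std::max(a[i] - b[i], 0));  // clips at ulo
  }
}

With kNoSaturation unset, VectorizeSaturationIdiom maps loops of this shape to a single HVecSaturationAdd or HVecSaturationSub per vector chunk instead of a widen/add/clip/narrow sequence.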

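Similarly, a self-contained sketch of the range-tightening walk in FindClippee, with a toy node type standing in for HInstruction (all types and fields below are hypothetical stand-ins, assumed only for illustration). It reproduces the worked example from the comment: MIN(10, MIN(20, MAX(0, x))) yields [0, 10] with clippee x.

#include <algorithm>
#include <cstdint>
#include <limits>

// Toy stand-in for the HIR: a MIN/MAX node with one constant operand, or a leaf.
enum class Kind { kMin, kMax, kLeaf };
struct Node {
  Kind kind;
  int64_t constant;   // the constant operand (GetConstantRight analogue)
  const Node* other;  // the non-constant operand (GetLeastConstantLeft analogue)
};

const Node* FindClippee(const Node* n, int64_t* lo, int64_t* hi) {
  while (n->kind != Kind::kLeaf) {
    if (n->kind == Kind::kMin) {
      *hi = std::min(*hi, n->constant);  // MIN against c lowers the upper bound
    } else {
      *lo = std::max(*lo, n->constant);  // MAX against c raises the lower bound
    }
    n = n->other;
  }
  return n;  // the leaf is the clippee, clipped to [*lo, *hi]
}

// MIN(10, MIN(20, MAX(0, x))):
//   Node x{Kind::kLeaf, 0, nullptr};
//   Node m0{Kind::kMax, 0, &x};
//   Node m1{Kind::kMin, 20, &m0};
//   Node m2{Kind::kMin, 10, &m1};
//   int64_t lo = std::numeric_limits<int64_t>::min();
//   int64_t hi = std::numeric_limits<int64_t>::max();
//   FindClippee(&m2, &lo, &hi);  // returns &x with lo == 0, hi == 10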