Diffstat (limited to 'compiler/optimizing/loop_optimization.cc')
-rw-r--r-- | compiler/optimizing/loop_optimization.cc | 315
1 file changed, 215 insertions(+), 100 deletions(-)
diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc
index 5784707d0e..02ee4ec057 100644
--- a/compiler/optimizing/loop_optimization.cc
+++ b/compiler/optimizing/loop_optimization.cc
@@ -21,6 +21,7 @@
 #include "arch/instruction_set.h"
 #include "arch/x86/instruction_set_features_x86.h"
 #include "arch/x86_64/instruction_set_features_x86_64.h"
+#include "code_generator.h"
 #include "driver/compiler_options.h"
 #include "linear_order.h"
 #include "mirror/array-inl.h"
@@ -305,7 +306,8 @@ static bool IsAddConst2(HGraph* graph,
                         /*out*/ HInstruction** a,
                         /*out*/ HInstruction** b,
                         /*out*/ int64_t* c) {
-  if (IsAddConst(instruction, a, b, c) && *a != nullptr) {
+  // We want an actual add/sub and not the trivial case where {b: 0, c: 0}.
+  if (IsAddOrSub(instruction) && IsAddConst(instruction, a, b, c) && *a != nullptr) {
     if (*b == nullptr) {
       // Constant is usually already present, unless accumulated.
       *b = graph->GetConstant(instruction->GetType(), (*c));
@@ -429,7 +431,7 @@ static void PeelByCount(HLoopInformation* loop_info,
                         InductionVarRange* induction_range) {
   for (int i = 0; i < count; i++) {
     // Perform peeling.
-    PeelUnrollSimpleHelper helper(loop_info, induction_range);
+    LoopClonerSimpleHelper helper(loop_info, induction_range);
     helper.DoPeeling();
   }
 }
@@ -456,12 +458,13 @@ static DataType::Type GetNarrowerType(HInstruction* a, HInstruction* b) {
 //
 
 HLoopOptimization::HLoopOptimization(HGraph* graph,
-                                     const CompilerOptions* compiler_options,
+                                     const CodeGenerator& codegen,
                                      HInductionVarAnalysis* induction_analysis,
                                      OptimizingCompilerStats* stats,
                                      const char* name)
     : HOptimization(graph, name, stats),
-      compiler_options_(compiler_options),
+      compiler_options_(&codegen.GetCompilerOptions()),
+      simd_register_size_(codegen.GetSIMDRegisterWidth()),
       induction_range_(induction_analysis),
       loop_allocator_(nullptr),
       global_allocator_(graph_->GetAllocator()),
@@ -470,6 +473,7 @@ HLoopOptimization::HLoopOptimization(HGraph* graph,
       iset_(nullptr),
       reductions_(nullptr),
       simplified_(false),
+      predicated_vectorization_mode_(codegen.SupportsPredicatedSIMD()),
       vector_length_(0),
       vector_refs_(nullptr),
       vector_static_peeling_factor_(0),
@@ -483,10 +487,7 @@ HLoopOptimization::HLoopOptimization(HGraph* graph,
       vector_header_(nullptr),
       vector_body_(nullptr),
       vector_index_(nullptr),
-      arch_loop_helper_(ArchNoOptsLoopHelper::Create(compiler_options_ != nullptr
-                                                         ? compiler_options_->GetInstructionSet()
-                                                         : InstructionSet::kNone,
-                                                     global_allocator_)) {
+      arch_loop_helper_(ArchNoOptsLoopHelper::Create(codegen, global_allocator_)) {
 }
 
 bool HLoopOptimization::Run() {
@@ -804,7 +805,7 @@ bool HLoopOptimization::TryUnrollingForBranchPenaltyReduction(LoopAnalysisInfo*
 
   // Perform unrolling.
   HLoopInformation* loop_info = analysis_info->GetLoopInfo();
-  PeelUnrollSimpleHelper helper(loop_info, &induction_range_);
+  LoopClonerSimpleHelper helper(loop_info, &induction_range_);
   helper.DoUnrolling();
 
   // Remove the redundant loop check after unrolling.
@@ -829,7 +830,7 @@ bool HLoopOptimization::TryPeelingForLoopInvariantExitsElimination(LoopAnalysisI
 
   if (generate_code) {
     // Perform peeling.
-    PeelUnrollSimpleHelper helper(loop_info, &induction_range_);
+    LoopClonerSimpleHelper helper(loop_info, &induction_range_);
     helper.DoPeeling();
 
     // Statically evaluate loop check after peeling for loop invariant condition.
@@ -885,12 +886,6 @@ bool HLoopOptimization::TryFullUnrolling(LoopAnalysisInfo* analysis_info, bool g
 }
 
 bool HLoopOptimization::TryPeelingAndUnrolling(LoopNode* node) {
-  // Don't run peeling/unrolling if compiler_options_ is nullptr (i.e., running under tests)
-  // as InstructionSet is needed.
-  if (compiler_options_ == nullptr) {
-    return false;
-  }
-
   HLoopInformation* loop_info = node->loop_info;
   int64_t trip_count = LoopAnalysis::GetLoopTripCount(loop_info, &induction_range_);
   LoopAnalysisInfo analysis_info(loop_info);
@@ -908,7 +903,7 @@ bool HLoopOptimization::TryPeelingAndUnrolling(LoopNode* node) {
   }
 
   // Run 'IsLoopClonable' the last as it might be time-consuming.
-  if (!PeelUnrollHelper::IsLoopClonable(loop_info)) {
+  if (!LoopClonerHelper::IsLoopClonable(loop_info)) {
     return false;
   }
 
@@ -951,9 +946,10 @@ bool HLoopOptimization::ShouldVectorize(LoopNode* node, HBasicBlock* block, int6
   //     make one particular reference aligned), never to exceed (1).
   // (3) variable to record how many references share same alignment.
   // (4) variable to record suitable candidate for dynamic loop peeling.
-  uint32_t desired_alignment = GetVectorSizeInBytes();
-  DCHECK_LE(desired_alignment, 16u);
-  uint32_t peeling_votes[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+  size_t desired_alignment = GetVectorSizeInBytes();
+  ScopedArenaVector<uint32_t> peeling_votes(desired_alignment, 0u,
+      loop_allocator_->Adapter(kArenaAllocLoopOptimization));
+
   uint32_t max_num_same_alignment = 0;
   const ArrayReference* peeling_candidate = nullptr;
@@ -1027,8 +1023,10 @@ bool HLoopOptimization::ShouldVectorize(LoopNode* node, HBasicBlock* block, int6
     }
   }  // for i
 
-  // Find a suitable alignment strategy.
-  SetAlignmentStrategy(peeling_votes, peeling_candidate);
+  if (!IsInPredicatedVectorizationMode()) {
+    // Find a suitable alignment strategy.
+    SetAlignmentStrategy(peeling_votes, peeling_candidate);
+  }
 
   // Does vectorization seem profitable?
   if (!IsVectorizationProfitable(trip_count)) {
@@ -1055,8 +1053,8 @@ void HLoopOptimization::Vectorize(LoopNode* node,
 
   // A cleanup loop is needed, at least, for any unknown trip count or
   // for a known trip count with remainder iterations after vectorization.
-  bool needs_cleanup = trip_count == 0 ||
-      ((trip_count - vector_static_peeling_factor_) % chunk) != 0;
+  bool needs_cleanup = !IsInPredicatedVectorizationMode() &&
+      (trip_count == 0 || ((trip_count - vector_static_peeling_factor_) % chunk) != 0);
 
   // Adjust vector bookkeeping.
   HPhi* main_phi = nullptr;
@@ -1074,11 +1072,13 @@ void HLoopOptimization::Vectorize(LoopNode* node,
   //    ptc = <peeling factor>;
   HInstruction* ptc = nullptr;
   if (vector_static_peeling_factor_ != 0) {
+    DCHECK(!IsInPredicatedVectorizationMode());
     // Static loop peeling for SIMD alignment (using the most suitable
    // fixed peeling factor found during prior alignment analysis).
     DCHECK(vector_dynamic_peeling_candidate_ == nullptr);
     ptc = graph_->GetConstant(induc_type, vector_static_peeling_factor_);
   } else if (vector_dynamic_peeling_candidate_ != nullptr) {
+    DCHECK(!IsInPredicatedVectorizationMode());
     // Dynamic loop peeling for SIMD alignment (using the most suitable
     // candidate found during prior alignment analysis):
     //    rem = offset % ALIGN;    // adjusted as #elements
@@ -1109,6 +1109,7 @@ void HLoopOptimization::Vectorize(LoopNode* node,
   HInstruction* stc = induction_range_.GenerateTripCount(node->loop_info, graph_, preheader);
   HInstruction* vtc = stc;
   if (needs_cleanup) {
+    DCHECK(!IsInPredicatedVectorizationMode());
     DCHECK(IsPowerOfTwo(chunk));
     HInstruction* diff = stc;
     if (ptc != nullptr) {
@@ -1146,6 +1147,7 @@ void HLoopOptimization::Vectorize(LoopNode* node,
   // moved around during suspend checks, since all analysis was based on
   // nothing more than the Android runtime alignment conventions.
   if (ptc != nullptr) {
+    DCHECK(!IsInPredicatedVectorizationMode());
     vector_mode_ = kSequential;
     GenerateNewLoop(node,
                     block,
@@ -1173,6 +1175,7 @@ void HLoopOptimization::Vectorize(LoopNode* node,
   //    for ( ; i < stc; i += 1)
   //      <loop-body>
   if (needs_cleanup) {
+    DCHECK(!IsInPredicatedVectorizationMode() || vector_runtime_test_a_ != nullptr);
     vector_mode_ = kSequential;
     GenerateNewLoop(node,
                     block,
@@ -1230,9 +1233,35 @@ void HLoopOptimization::GenerateNewLoop(LoopNode* node,
   // Generate header and prepare body.
   //    for (i = lo; i < hi; i += step)
   //      <loop-body>
-  HInstruction* cond = new (global_allocator_) HAboveOrEqual(phi, hi);
-  vector_header_->AddPhi(phi);
-  vector_header_->AddInstruction(cond);
+  HInstruction* cond = nullptr;
+  HInstruction* set_pred = nullptr;
+  if (IsInPredicatedVectorizationMode()) {
+    HVecPredWhile* pred_while =
+        new (global_allocator_) HVecPredWhile(global_allocator_,
+                                              phi,
+                                              hi,
+                                              HVecPredWhile::CondKind::kLO,
+                                              DataType::Type::kInt32,
+                                              vector_length_,
+                                              0u);
+
+    cond = new (global_allocator_) HVecPredCondition(global_allocator_,
+                                                     pred_while,
+                                                     HVecPredCondition::PCondKind::kNFirst,
+                                                     DataType::Type::kInt32,
+                                                     vector_length_,
+                                                     0u);
+
+    vector_header_->AddPhi(phi);
+    vector_header_->AddInstruction(pred_while);
+    vector_header_->AddInstruction(cond);
+    set_pred = pred_while;
+  } else {
+    cond = new (global_allocator_) HAboveOrEqual(phi, hi);
+    vector_header_->AddPhi(phi);
+    vector_header_->AddInstruction(cond);
+  }
+
   vector_header_->AddInstruction(new (global_allocator_) HIf(cond));
   vector_index_ = phi;
   vector_permanent_map_->clear();  // preserved over unrolling
@@ -1249,6 +1278,10 @@ void HLoopOptimization::GenerateNewLoop(LoopNode* node,
     auto i = vector_map_->find(it.Current());
     if (i != vector_map_->end() && !i->second->IsInBlock()) {
       Insert(vector_body_, i->second);
+      if (IsInPredicatedVectorizationMode() && i->second->IsVecOperation()) {
+        HVecOperation* op = i->second->AsVecOperation();
+        op->SetMergingGoverningPredicate(set_pred);
+      }
       // Deal with instructions that need an environment, such as the scalar intrinsics.
       if (i->second->NeedsEnvironment()) {
         i->second->CopyEnvironmentFromWithLoopPhiAdjustment(env, vector_header_);
@@ -1363,7 +1396,10 @@ bool HLoopOptimization::VectorizeUse(LoopNode* node,
   } else if (instruction->IsArrayGet()) {
     // Deal with vector restrictions.
     bool is_string_char_at = instruction->AsArrayGet()->IsStringCharAt();
-    if (is_string_char_at && HasVectorRestrictions(restrictions, kNoStringCharAt)) {
+
+    if (is_string_char_at && (HasVectorRestrictions(restrictions, kNoStringCharAt) ||
+                              IsInPredicatedVectorizationMode())) {
+      // TODO: Support CharAt for predicated mode.
       return false;
     }
     // Accept a right-hand-side array base[index] for
@@ -1542,13 +1578,7 @@ bool HLoopOptimization::VectorizeUse(LoopNode* node,
 }
 
 uint32_t HLoopOptimization::GetVectorSizeInBytes() {
-  switch (compiler_options_->GetInstructionSet()) {
-    case InstructionSet::kArm:
-    case InstructionSet::kThumb2:
-      return 8;  // 64-bit SIMD
-    default:
-      return 16;  // 128-bit SIMD
-  }
+  return simd_register_size_;
 }
 
 bool HLoopOptimization::TrySetVectorType(DataType::Type type, uint64_t* restrictions) {
@@ -1563,45 +1593,88 @@ bool HLoopOptimization::TrySetVectorType(DataType::Type type, uint64_t* restrict
        case DataType::Type::kUint8:
        case DataType::Type::kInt8:
          *restrictions |= kNoDiv | kNoReduction | kNoDotProd;
-          return TrySetVectorLength(8);
+          return TrySetVectorLength(type, 8);
        case DataType::Type::kUint16:
        case DataType::Type::kInt16:
          *restrictions |= kNoDiv | kNoStringCharAt | kNoReduction | kNoDotProd;
-          return TrySetVectorLength(4);
+          return TrySetVectorLength(type, 4);
        case DataType::Type::kInt32:
          *restrictions |= kNoDiv | kNoWideSAD;
-          return TrySetVectorLength(2);
+          return TrySetVectorLength(type, 2);
        default:
          break;
      }
      return false;
    case InstructionSet::kArm64:
-      // Allow vectorization for all ARM devices, because Android assumes that
-      // ARMv8 AArch64 always supports advanced SIMD (128-bit SIMD).
-      switch (type) {
-        case DataType::Type::kBool:
-        case DataType::Type::kUint8:
-        case DataType::Type::kInt8:
-          *restrictions |= kNoDiv;
-          return TrySetVectorLength(16);
-        case DataType::Type::kUint16:
-        case DataType::Type::kInt16:
-          *restrictions |= kNoDiv;
-          return TrySetVectorLength(8);
-        case DataType::Type::kInt32:
-          *restrictions |= kNoDiv;
-          return TrySetVectorLength(4);
-        case DataType::Type::kInt64:
-          *restrictions |= kNoDiv | kNoMul;
-          return TrySetVectorLength(2);
-        case DataType::Type::kFloat32:
-          *restrictions |= kNoReduction;
-          return TrySetVectorLength(4);
-        case DataType::Type::kFloat64:
-          *restrictions |= kNoReduction;
-          return TrySetVectorLength(2);
-        default:
-          return false;
+      if (IsInPredicatedVectorizationMode()) {
+        // SVE vectorization.
+        CHECK(features->AsArm64InstructionSetFeatures()->HasSVE());
+        size_t vector_length = simd_register_size_ / DataType::Size(type);
+        DCHECK_EQ(simd_register_size_ % DataType::Size(type), 0u);
+        switch (type) {
+          case DataType::Type::kBool:
+          case DataType::Type::kUint8:
+          case DataType::Type::kInt8:
+            *restrictions |= kNoDiv |
+                             kNoSignedHAdd |
+                             kNoUnsignedHAdd |
+                             kNoUnroundedHAdd |
+                             kNoSAD;
+            return TrySetVectorLength(type, vector_length);
+          case DataType::Type::kUint16:
+          case DataType::Type::kInt16:
+            *restrictions |= kNoDiv |
+                             kNoSignedHAdd |
+                             kNoUnsignedHAdd |
+                             kNoUnroundedHAdd |
+                             kNoSAD |
+                             kNoDotProd;
+            return TrySetVectorLength(type, vector_length);
+          case DataType::Type::kInt32:
+            *restrictions |= kNoDiv | kNoSAD;
+            return TrySetVectorLength(type, vector_length);
+          case DataType::Type::kInt64:
+            *restrictions |= kNoDiv | kNoSAD;
+            return TrySetVectorLength(type, vector_length);
+          case DataType::Type::kFloat32:
+            *restrictions |= kNoReduction;
+            return TrySetVectorLength(type, vector_length);
+          case DataType::Type::kFloat64:
+            *restrictions |= kNoReduction;
+            return TrySetVectorLength(type, vector_length);
+          default:
+            break;
+        }
+        return false;
+      } else {
+        // Allow vectorization for all ARM devices, because Android assumes that
+        // ARMv8 AArch64 always supports advanced SIMD (128-bit SIMD).
+        switch (type) {
+          case DataType::Type::kBool:
+          case DataType::Type::kUint8:
+          case DataType::Type::kInt8:
+            *restrictions |= kNoDiv;
+            return TrySetVectorLength(type, 16);
+          case DataType::Type::kUint16:
+          case DataType::Type::kInt16:
+            *restrictions |= kNoDiv;
+            return TrySetVectorLength(type, 8);
+          case DataType::Type::kInt32:
+            *restrictions |= kNoDiv;
+            return TrySetVectorLength(type, 4);
+          case DataType::Type::kInt64:
+            *restrictions |= kNoDiv | kNoMul;
+            return TrySetVectorLength(type, 2);
+          case DataType::Type::kFloat32:
+            *restrictions |= kNoReduction;
+            return TrySetVectorLength(type, 4);
+          case DataType::Type::kFloat64:
+            *restrictions |= kNoReduction;
+            return TrySetVectorLength(type, 2);
+          default:
+            break;
+        }
+        return false;
      }
    case InstructionSet::kX86:
    case InstructionSet::kX86_64:
@@ -1619,7 +1692,7 @@ bool HLoopOptimization::TrySetVectorType(DataType::Type type, uint64_t* restrict
                             kNoUnroundedHAdd |
                             kNoSAD |
                             kNoDotProd;
-            return TrySetVectorLength(16);
+            return TrySetVectorLength(type, 16);
          case DataType::Type::kUint16:
            *restrictions |= kNoDiv |
                             kNoAbs |
@@ -1627,26 +1700,26 @@ bool HLoopOptimization::TrySetVectorType(DataType::Type type, uint64_t* restrict
                             kNoUnroundedHAdd |
                             kNoSAD |
                             kNoDotProd;
-            return TrySetVectorLength(8);
+            return TrySetVectorLength(type, 8);
          case DataType::Type::kInt16:
            *restrictions |= kNoDiv |
                             kNoAbs |
                             kNoSignedHAdd |
                             kNoUnroundedHAdd |
                             kNoSAD;
-            return TrySetVectorLength(8);
+            return TrySetVectorLength(type, 8);
          case DataType::Type::kInt32:
            *restrictions |= kNoDiv | kNoSAD;
-            return TrySetVectorLength(4);
+            return TrySetVectorLength(type, 4);
          case DataType::Type::kInt64:
            *restrictions |= kNoMul | kNoDiv | kNoShr | kNoAbs | kNoSAD;
-            return TrySetVectorLength(2);
+            return TrySetVectorLength(type, 2);
          case DataType::Type::kFloat32:
            *restrictions |= kNoReduction;
-            return TrySetVectorLength(4);
+            return TrySetVectorLength(type, 4);
          case DataType::Type::kFloat64:
            *restrictions |= kNoReduction;
-            return TrySetVectorLength(2);
+            return TrySetVectorLength(type, 2);
          default:
            break;
        }  // switch type
@@ -1657,7 +1730,7 @@ bool HLoopOptimization::TrySetVectorType(DataType::Type type, uint64_t* restrict
  }  // switch instruction set
 }
 
-bool HLoopOptimization::TrySetVectorLength(uint32_t length) {
+bool HLoopOptimization::TrySetVectorLengthImpl(uint32_t length) {
  DCHECK(IsPowerOfTwo(length) && length >= 2u);
  // First time set?
  if (vector_length_ == 0) {
@@ -1694,6 +1767,15 @@ void HLoopOptimization::GenerateVecInv(HInstruction* org, DataType::Type type) {
      vector = new (global_allocator_)
          HVecReplicateScalar(global_allocator_, input, type, vector_length_, kNoDexPc);
      vector_permanent_map_->Put(org, Insert(vector_preheader_, vector));
+      if (IsInPredicatedVectorizationMode()) {
+        HVecPredSetAll* set_pred = new (global_allocator_) HVecPredSetAll(global_allocator_,
+                                                                          graph_->GetIntConstant(1),
+                                                                          type,
+                                                                          vector_length_,
+                                                                          0u);
+        vector_preheader_->InsertInstructionBefore(set_pred, vector);
+        vector->AsVecOperation()->SetMergingGoverningPredicate(set_pred);
+      }
    }
    vector_map_->Put(org, vector);
  }
@@ -1822,6 +1904,15 @@ void HLoopOptimization::GenerateVecReductionPhiInputs(HPhi* phi, HInstruction* r
                                                           vector_length,
                                                           kNoDexPc));
    }
+    if (IsInPredicatedVectorizationMode()) {
+      HVecPredSetAll* set_pred = new (global_allocator_) HVecPredSetAll(global_allocator_,
+                                                                        graph_->GetIntConstant(1),
+                                                                        type,
+                                                                        vector_length,
+                                                                        0u);
+      vector_preheader_->InsertInstructionBefore(set_pred, new_init);
+      new_init->AsVecOperation()->SetMergingGoverningPredicate(set_pred);
+    }
  } else {
    new_init = ReduceAndExtractIfNeeded(new_init);
  }
@@ -1853,6 +1944,17 @@ HInstruction* HLoopOptimization::ReduceAndExtractIfNeeded(HInstruction* instruct
      instruction = new (global_allocator_) HVecExtractScalar(
          global_allocator_, reduce, type, vector_length, 0, kNoDexPc);
      exit->InsertInstructionAfter(instruction, reduce);
+
+      if (IsInPredicatedVectorizationMode()) {
+        HVecPredSetAll* set_pred = new (global_allocator_) HVecPredSetAll(global_allocator_,
+                                                                          graph_->GetIntConstant(1),
+                                                                          type,
+                                                                          vector_length,
+                                                                          0u);
+        exit->InsertInstructionBefore(set_pred, reduce);
+        reduce->AsVecOperation()->SetMergingGoverningPredicate(set_pred);
+        instruction->AsVecOperation()->SetMergingGoverningPredicate(set_pred);
+      }
    }
  }
  return instruction;
@@ -1992,7 +2094,8 @@ bool HLoopOptimization::VectorizeHalvingAddIdiom(LoopNode* node,
    return false;
  }
  // Deal with vector restrictions.
-  if ((!is_unsigned && HasVectorRestrictions(restrictions, kNoSignedHAdd)) ||
+  if ((is_unsigned && HasVectorRestrictions(restrictions, kNoUnsignedHAdd)) ||
+      (!is_unsigned && HasVectorRestrictions(restrictions, kNoSignedHAdd)) ||
      (!is_rounded && HasVectorRestrictions(restrictions, kNoUnroundedHAdd))) {
    return false;
  }
@@ -2044,13 +2147,13 @@ bool HLoopOptimization::VectorizeSADIdiom(LoopNode* node,
      (reduction_type != DataType::Type::kInt32 &&
       reduction_type != DataType::Type::kInt64)) {
    return false;
  }
-  HInstruction* q = instruction->InputAt(0);
-  HInstruction* v = instruction->InputAt(1);
+  HInstruction* acc = instruction->InputAt(0);
+  HInstruction* abs = instruction->InputAt(1);
  HInstruction* a = nullptr;
  HInstruction* b = nullptr;
-  if (v->IsAbs() &&
-      v->GetType() == reduction_type &&
-      IsSubConst2(graph_, v->InputAt(0), /*out*/ &a, /*out*/ &b)) {
+  if (abs->IsAbs() &&
+      abs->GetType() == reduction_type &&
+      IsSubConst2(graph_, abs->InputAt(0), /*out*/ &a, /*out*/ &b)) {
    DCHECK(a != nullptr && b != nullptr);
  } else {
    return false;
  }
@@ -2076,16 +2179,16 @@ bool HLoopOptimization::VectorizeSADIdiom(LoopNode* node,
  // idiomatic operation. Sequential code uses the original scalar expressions.
  DCHECK(r != nullptr && s != nullptr);
  if (generate_code && vector_mode_ != kVector) {  // de-idiom
-    r = s = v->InputAt(0);
+    r = s = abs->InputAt(0);
  }
-  if (VectorizeUse(node, q, generate_code, sub_type, restrictions) &&
+  if (VectorizeUse(node, acc, generate_code, sub_type, restrictions) &&
      VectorizeUse(node, r, generate_code, sub_type, restrictions) &&
      VectorizeUse(node, s, generate_code, sub_type, restrictions)) {
    if (generate_code) {
      if (vector_mode_ == kVector) {
        vector_map_->Put(instruction, new (global_allocator_) HVecSADAccumulate(
            global_allocator_,
-            vector_map_->Get(q),
+            vector_map_->Get(acc),
            vector_map_->Get(r),
            vector_map_->Get(s),
            HVecOperation::ToProperType(reduction_type, is_unsigned),
@@ -2093,8 +2196,14 @@ bool HLoopOptimization::VectorizeSADIdiom(LoopNode* node,
            kNoDexPc));
        MaybeRecordStat(stats_, MethodCompilationStat::kLoopVectorizedIdiom);
      } else {
-        GenerateVecOp(v, vector_map_->Get(r), nullptr, reduction_type);
-        GenerateVecOp(instruction, vector_map_->Get(q), vector_map_->Get(v), reduction_type);
+        // "GenerateVecOp()" must not be called more than once for each original loop body
+        // instruction. As the SAD idiom processes both "current" instruction ("instruction")
+        // and its ABS input in one go, we must check that for the scalar case the ABS instruction
+        // has not yet been processed.
+        if (vector_map_->find(abs) == vector_map_->end()) {
+          GenerateVecOp(abs, vector_map_->Get(r), nullptr, reduction_type);
+        }
+        GenerateVecOp(instruction, vector_map_->Get(acc), vector_map_->Get(abs), reduction_type);
      }
    }
    return true;
@@ -2116,20 +2225,20 @@ bool HLoopOptimization::VectorizeDotProdIdiom(LoopNode* node,
    return false;
  }
 
-  HInstruction* q = instruction->InputAt(0);
-  HInstruction* v = instruction->InputAt(1);
-  if (!v->IsMul() || v->GetType() != reduction_type) {
+  HInstruction* const acc = instruction->InputAt(0);
+  HInstruction* const mul = instruction->InputAt(1);
+  if (!mul->IsMul() || mul->GetType() != reduction_type) {
    return false;
  }
 
-  HInstruction* a = v->InputAt(0);
-  HInstruction* b = v->InputAt(1);
-  HInstruction* r = a;
-  HInstruction* s = b;
-  DataType::Type op_type = GetNarrowerType(a, b);
+  HInstruction* const mul_left = mul->InputAt(0);
+  HInstruction* const mul_right = mul->InputAt(1);
+  HInstruction* r = mul_left;
+  HInstruction* s = mul_right;
+  DataType::Type op_type = GetNarrowerType(mul_left, mul_right);
  bool is_unsigned = false;
-  if (!IsNarrowerOperands(a, b, op_type, &r, &s, &is_unsigned)) {
+  if (!IsNarrowerOperands(mul_left, mul_right, op_type, &r, &s, &is_unsigned)) {
    return false;
  }
  op_type = HVecOperation::ToProperType(op_type, is_unsigned);
@@ -2143,17 +2252,17 @@ bool HLoopOptimization::VectorizeDotProdIdiom(LoopNode* node,
  // Accept dot product idiom for vectorizable operands. Vectorized code uses the shorthand
  // idiomatic operation. Sequential code uses the original scalar expressions.
  if (generate_code && vector_mode_ != kVector) {  // de-idiom
-    r = a;
-    s = b;
+    r = mul_left;
+    s = mul_right;
  }
-  if (VectorizeUse(node, q, generate_code, op_type, restrictions) &&
+  if (VectorizeUse(node, acc, generate_code, op_type, restrictions) &&
      VectorizeUse(node, r, generate_code, op_type, restrictions) &&
      VectorizeUse(node, s, generate_code, op_type, restrictions)) {
    if (generate_code) {
      if (vector_mode_ == kVector) {
        vector_map_->Put(instruction, new (global_allocator_) HVecDotProd(
            global_allocator_,
-            vector_map_->Get(q),
+            vector_map_->Get(acc),
            vector_map_->Get(r),
            vector_map_->Get(s),
            reduction_type,
@@ -2162,8 +2271,14 @@ bool HLoopOptimization::VectorizeDotProdIdiom(LoopNode* node,
            kNoDexPc));
        MaybeRecordStat(stats_, MethodCompilationStat::kLoopVectorizedIdiom);
      } else {
-        GenerateVecOp(v, vector_map_->Get(r), vector_map_->Get(s), reduction_type);
-        GenerateVecOp(instruction, vector_map_->Get(q), vector_map_->Get(v), reduction_type);
+        // "GenerateVecOp()" must not be called more than once for each original loop body
+        // instruction. As the DotProd idiom processes both "current" instruction ("instruction")
+        // and its MUL input in one go, we must check that for the scalar case the MUL instruction
+        // has not yet been processed.
+        if (vector_map_->find(mul) == vector_map_->end()) {
+          GenerateVecOp(mul, vector_map_->Get(r), vector_map_->Get(s), reduction_type);
+        }
+        GenerateVecOp(instruction, vector_map_->Get(acc), vector_map_->Get(mul), reduction_type);
      }
    }
    return true;
@@ -2191,12 +2306,12 @@ Alignment HLoopOptimization::ComputeAlignment(HInstruction* offset,
  return Alignment(DataType::Size(type), 0);
 }
 
-void HLoopOptimization::SetAlignmentStrategy(uint32_t peeling_votes[],
+void HLoopOptimization::SetAlignmentStrategy(const ScopedArenaVector<uint32_t>& peeling_votes,
                                             const ArrayReference* peeling_candidate) {
  // Current heuristic: pick the best static loop peeling factor, if any,
  // or otherwise use dynamic loop peeling on suggested peeling candidate.
  uint32_t max_vote = 0;
-  for (int32_t i = 0; i < 16; i++) {
+  for (size_t i = 0; i < peeling_votes.size(); i++) {
    if (peeling_votes[i] > max_vote) {
      max_vote = peeling_votes[i];
      vector_static_peeling_factor_ = i;