Unrolling and dynamic loop peeling framework in vectorizer.

Rationale: This CL introduces the basic framework for dynamically peeling (to obtain aligned access) and unrolling the vector loop (to reduce looping overhead and allow more target specific optimizations on e.g. SIMD loads and stores). NOTE: The current heuristics are "bogus" and merely meant to exercise the new framework. This CL focuses on introducing correct code for the vectorizer. Heuristics and the memory computations for alignment are to be implemented later. Test: test-art-target, test-art-host Change-Id: I010af1475f42f92fd1daa6a967d7a85922beace8
author: Aart Bik <ajcbik@google.com> 2017-06-08 14:06:58 -0700
committer: Aart Bik <ajcbik@google.com> 2017-06-27 11:29:07 -0700
commit: 14a68b4aa9620e4fd58907255b049fb5c18bd1ec (patch)
tree: 692319b6a9344d84a2e8916c388be954d8878c41 /compiler/optimizing/loop_optimization.cc
parent: afdcd847498abc0f4e295bece443afabf8aaf868 (diff)
1 files changed, 192 insertions, 73 deletions
diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc
index d2493137fe..d39bc16ed2 100644
--- a/compiler/optimizing/loop_optimization.cc
+++ b/compiler/optimizing/loop_optimization.cc
@@ -31,6 +31,9 @@ namespace art {
 // Enables vectorization (SIMDization) in the loop optimizer.
 static constexpr bool kEnableVectorization = true;
 
+// All current SIMD targets want 16-byte alignment.
+static constexpr size_t kAlignedBase = 16;
+
 // Remove the instruction from the graph. A bit more elaborate than the usual
 // instruction removal, since there may be a cycle in the use structure.
 static void RemoveFromCycle(HInstruction* instruction) {
@@ -283,6 +286,9 @@ HLoopOptimization::HLoopOptimization(HGraph* graph,
       simplified_(false),
       vector_length_(0),
       vector_refs_(nullptr),
+      vector_peeling_candidate_(nullptr),
+      vector_runtime_test_a_(nullptr),
+      vector_runtime_test_b_(nullptr),
       vector_map_(nullptr) {
 }
 
@@ -422,23 +428,6 @@ void HLoopOptimization::TraverseLoopsInnerToOuter(LoopNode* node) {
 // Optimization.
 //
 
-bool HLoopOptimization::CanRemoveCycle() {
-  for (HInstruction* i : *iset_) {
-    // We can never remove instructions that have environment
-    // uses when we compile 'debuggable'.
-    if (i->HasEnvironmentUses() && graph_->IsDebuggable()) {
-      return false;
-    }
-    // A deoptimization should never have an environment input removed.
-    for (const HUseListNode<HEnvironment*>& use : i->GetEnvUses()) {
-      if (use.GetUser()->GetHolder()->IsDeoptimize()) {
-        return false;
-      }
-    }
-  }
-  return true;
-}
-
 void HLoopOptimization::SimplifyInduction(LoopNode* node) {
   HBasicBlock* header = node->loop_info->GetHeader();
   HBasicBlock* preheader = node->loop_info->GetPreHeader();
@@ -565,7 +554,7 @@ void HLoopOptimization::OptimizeInnerLoop(LoopNode* node) {
   if (kEnableVectorization) {
     iset_->clear();  // prepare phi induction
     if (TrySetSimpleLoopHeader(header) &&
-        CanVectorize(node, body, trip_count) &&
+        ShouldVectorize(node, body, trip_count) &&
         TryAssignLastValue(node->loop_info, phi, preheader, /*collect_loop_uses*/ true)) {
       Vectorize(node, body, exit, trip_count);
       graph_->SetHasSIMD(true);  // flag SIMD usage
@@ -580,10 +569,11 @@ void HLoopOptimization::OptimizeInnerLoop(LoopNode* node) {
 // Intel Press, June, 2004 (http://www.aartbik.com/).
 //
 
-bool HLoopOptimization::CanVectorize(LoopNode* node, HBasicBlock* block, int64_t trip_count) {
+bool HLoopOptimization::ShouldVectorize(LoopNode* node, HBasicBlock* block, int64_t trip_count) {
   // Reset vector bookkeeping.
   vector_length_ = 0;
   vector_refs_->clear();
+  vector_peeling_candidate_ = nullptr;
   vector_runtime_test_a_ =
   vector_runtime_test_b_= nullptr;
 
@@ -600,12 +590,9 @@ bool HLoopOptimization::CanVectorize(LoopNode* node, HBasicBlock* block, int64_t
     }
   }
 
-  // Heuristics. Does vectorization seem profitable?
-  // TODO: refine
-  if (vector_length_ == 0) {
-    return false;  // nothing found
-  } else if (0 < trip_count && trip_count < vector_length_) {
-    return false;  // insufficient iterations
+  // Does vectorization seem profitable?
+  if (!IsVectorizationProfitable(trip_count)) {
+    return false;
   }
 
   // Data dependence analysis. Find each pair of references with same type, where
@@ -645,6 +632,9 @@ bool HLoopOptimization::CanVectorize(LoopNode* node, HBasicBlock* block, int64_t
     }
   }
 
+  // Consider dynamic loop peeling for alignment.
+  SetPeelingCandidate(trip_count);
+
   // Success!
   return true;
 }
@@ -657,28 +647,52 @@ void HLoopOptimization::Vectorize(LoopNode* node,
   HBasicBlock* header = node->loop_info->GetHeader();
   HBasicBlock* preheader = node->loop_info->GetPreHeader();
 
-  // A cleanup is needed for any unknown trip count or for a known trip count
-  // with remainder iterations after vectorization.
-  bool needs_cleanup = trip_count == 0 || (trip_count % vector_length_) != 0;
+  // Pick a loop unrolling factor for the vector loop.
+  uint32_t unroll = GetUnrollingFactor(block, trip_count);
+  uint32_t chunk = vector_length_ * unroll;
+
+  // A cleanup loop is needed, at least, for any unknown trip count or
+  // for a known trip count with remainder iterations after vectorization.
+  bool needs_cleanup = trip_count == 0 || (trip_count % chunk) != 0;
 
   // Adjust vector bookkeeping.
   iset_->clear();  // prepare phi induction
   bool is_simple_loop_header = TrySetSimpleLoopHeader(header);  // fills iset_
   DCHECK(is_simple_loop_header);
+  vector_header_ = header;
+  vector_body_ = block;
+
+  // Generate dynamic loop peeling trip count, if needed:
+  // ptc = <peeling-needed-for-candidate>
+  HInstruction* ptc = nullptr;
+  if (vector_peeling_candidate_ != nullptr) {
+    DCHECK_LT(vector_length_, trip_count) << "dynamic peeling currently requires known trip count";
+    //
+    // TODO: Implement this. Compute address of first access memory location and
+    //       compute peeling factor to obtain kAlignedBase alignment.
+    //
+    needs_cleanup = true;
+  }
 
-  // Generate preheader:
+  // Generate loop control:
   // stc = <trip-count>;
-  // vtc = stc - stc % VL;
+  // vtc = stc - (stc - ptc) % chunk;
+  // i = 0;
   HInstruction* stc = induction_range_.GenerateTripCount(node->loop_info, graph_, preheader);
   HInstruction* vtc = stc;
   if (needs_cleanup) {
-    DCHECK(IsPowerOfTwo(vector_length_));
+    DCHECK(IsPowerOfTwo(chunk));
+    HInstruction* diff = stc;
+    if (ptc != nullptr) {
+      diff = Insert(preheader, new (global_allocator_) HSub(induc_type, stc, ptc));
+    }
     HInstruction* rem = Insert(
         preheader, new (global_allocator_) HAnd(induc_type,
-                                                stc,
-                                                graph_->GetIntConstant(vector_length_ - 1)));
+                                                diff,
+                                                graph_->GetIntConstant(chunk - 1)));
     vtc = Insert(preheader, new (global_allocator_) HSub(induc_type, stc, rem));
   }
+  vector_index_ = graph_->GetIntConstant(0);
 
   // Generate runtime disambiguation test:
   // vtc = a != b ? vtc : 0;
@@ -691,16 +705,31 @@ void HLoopOptimization::Vectorize(LoopNode* node,
     needs_cleanup = true;
   }
 
-  // Generate vector loop:
-  // for (i = 0; i < vtc; i += VL)
+  // Generate dynamic peeling loop for alignment, if needed:
+  // for ( ; i < ptc; i += 1)
+  //    <loop-body>
+  if (ptc != nullptr) {
+    vector_mode_ = kSequential;
+    GenerateNewLoop(node,
+                    block,
+                    graph_->TransformLoopForVectorization(vector_header_, vector_body_, exit),
+                    vector_index_,
+                    ptc,
+                    graph_->GetIntConstant(1),
+                    /*unroll*/ 1);
+  }
+
+  // Generate vector loop, possibly further unrolled:
+  // for ( ; i < vtc; i += chunk)
   //    <vectorized-loop-body>
   vector_mode_ = kVector;
   GenerateNewLoop(node,
                   block,
-                  graph_->TransformLoopForVectorization(header, block, exit),
-                  graph_->GetIntConstant(0),
+                  graph_->TransformLoopForVectorization(vector_header_, vector_body_, exit),
+                  vector_index_,
                   vtc,
-                  graph_->GetIntConstant(vector_length_));
+                  graph_->GetIntConstant(vector_length_),  // increment per unroll
+                  unroll);
   HLoopInformation* vloop = vector_header_->GetLoopInformation();
 
   // Generate cleanup loop, if needed:
@@ -711,9 +740,10 @@ void HLoopOptimization::Vectorize(LoopNode* node,
     GenerateNewLoop(node,
                     block,
                     graph_->TransformLoopForVectorization(vector_header_, vector_body_, exit),
-                    vector_phi_,
+                    vector_index_,
                     stc,
-                    graph_->GetIntConstant(1));
+                    graph_->GetIntConstant(1),
+                    /*unroll*/ 1);
   }
 
   // Remove the original loop by disconnecting the body block
@@ -722,8 +752,9 @@ void HLoopOptimization::Vectorize(LoopNode* node,
   while (!header->GetFirstInstruction()->IsGoto()) {
     header->RemoveInstruction(header->GetFirstInstruction());
   }
-  // Update loop hierarchy: the old header now resides in the
-  // same outer loop as the old preheader.
+  // Update loop hierarchy: the old header now resides in the same outer loop
+  // as the old preheader. Note that we don't bother putting sequential
+  // loops back in the hierarchy at this point.
   header->SetLoopInformation(preheader->GetLoopInformation());  // outward
   node->loop_info = vloop;
 }
@@ -733,44 +764,64 @@ void HLoopOptimization::GenerateNewLoop(LoopNode* node,
                                         HBasicBlock* new_preheader,
                                         HInstruction* lo,
                                         HInstruction* hi,
-                                        HInstruction* step) {
+                                        HInstruction* step,
+                                        uint32_t unroll) {
+  DCHECK(unroll == 1 || vector_mode_ == kVector);
   Primitive::Type induc_type = Primitive::kPrimInt;
   // Prepare new loop.
-  vector_map_->clear();
   vector_preheader_ = new_preheader,
   vector_header_ = vector_preheader_->GetSingleSuccessor();
   vector_body_ = vector_header_->GetSuccessors()[1];
-  vector_phi_ = new (global_allocator_) HPhi(global_allocator_,
-                                             kNoRegNumber,
-                                             0,
-                                             HPhi::ToPhiType(induc_type));
+  HPhi* phi = new (global_allocator_) HPhi(global_allocator_,
+                                           kNoRegNumber,
+                                           0,
+                                           HPhi::ToPhiType(induc_type));
   // Generate header and prepare body.
   // for (i = lo; i < hi; i += step)
   //    <loop-body>
-  HInstruction* cond = new (global_allocator_) HAboveOrEqual(vector_phi_, hi);
-  vector_header_->AddPhi(vector_phi_);
+  HInstruction* cond = new (global_allocator_) HAboveOrEqual(phi, hi);
+  vector_header_->AddPhi(phi);
   vector_header_->AddInstruction(cond);
   vector_header_->AddInstruction(new (global_allocator_) HIf(cond));
-  for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) {
-    bool vectorized_def = VectorizeDef(node, it.Current(), /*generate_code*/ true);
-    DCHECK(vectorized_def);
-  }
-  // Generate body from the instruction map, but in original program order.
-  HEnvironment* env = vector_header_->GetFirstInstruction()->GetEnvironment();
-  for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) {
-    auto i = vector_map_->find(it.Current());
-    if (i != vector_map_->end() && !i->second->IsInBlock()) {
-      Insert(vector_body_, i->second);
-      // Deal with instructions that need an environment, such as the scalar intrinsics.
-      if (i->second->NeedsEnvironment()) {
-        i->second->CopyEnvironmentFromWithLoopPhiAdjustment(env, vector_header_);
+  vector_index_ = phi;
+  for (uint32_t u = 0; u < unroll; u++) {
+    // Clear map, leaving loop invariants setup during unrolling.
+    if (u == 0) {
+      vector_map_->clear();
+    } else {
+      for (auto i = vector_map_->begin(); i != vector_map_->end(); ) {
+        if (i->second->IsVecReplicateScalar()) {
+          DCHECK(node->loop_info->IsDefinedOutOfTheLoop(i->first));
+          ++i;
+        } else {
+          i = vector_map_->erase(i);
+        }
       }
     }
+    // Generate instruction map.
+    for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) {
+      bool vectorized_def = VectorizeDef(node, it.Current(), /*generate_code*/ true);
+      DCHECK(vectorized_def);
+    }
+    // Generate body from the instruction map, but in original program order.
+    HEnvironment* env = vector_header_->GetFirstInstruction()->GetEnvironment();
+    for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) {
+      auto i = vector_map_->find(it.Current());
+      if (i != vector_map_->end() && !i->second->IsInBlock()) {
+        Insert(vector_body_, i->second);
+        // Deal with instructions that need an environment, such as the scalar intrinsics.
+        if (i->second->NeedsEnvironment()) {
+          i->second->CopyEnvironmentFromWithLoopPhiAdjustment(env, vector_header_);
+        }
+      }
+    }
+    vector_index_ = new (global_allocator_) HAdd(induc_type, vector_index_, step);
+    Insert(vector_body_, vector_index_);
   }
-  // Finalize increment and phi.
-  HInstruction* inc = new (global_allocator_) HAdd(induc_type, vector_phi_, step);
-  vector_phi_->AddInput(lo);
-  vector_phi_->AddInput(Insert(vector_body_, inc));
+  // Finalize phi for the loop index.
+  phi->AddInput(lo);
+  phi->AddInput(vector_index_);
+  vector_index_ = phi;
 }
 
 // TODO: accept reductions at left-hand-side, mixed-type store idioms, etc.
@@ -795,7 +846,7 @@ bool HLoopOptimization::VectorizeDef(LoopNode* node,
         VectorizeUse(node, value, generate_code, type, restrictions)) {
       if (generate_code) {
         GenerateVecSub(index, offset);
-        GenerateVecMem(instruction, vector_map_->Get(index), vector_map_->Get(value), type);
+        GenerateVecMem(instruction, vector_map_->Get(index), vector_map_->Get(value), offset, type);
       } else {
         vector_refs_->insert(ArrayReference(base, offset, type, /*lhs*/ true));
       }
@@ -852,7 +903,7 @@ bool HLoopOptimization::VectorizeUse(LoopNode* node,
         induction_range_.IsUnitStride(instruction, index, &offset)) {
       if (generate_code) {
         GenerateVecSub(index, offset);
-        GenerateVecMem(instruction, vector_map_->Get(index), nullptr, type);
+        GenerateVecMem(instruction, vector_map_->Get(index), nullptr, offset, type);
       } else {
         vector_refs_->insert(ArrayReference(base, offset, type, /*lhs*/ false));
       }
@@ -1164,7 +1215,7 @@ void HLoopOptimization::GenerateVecInv(HInstruction* org, Primitive::Type type)
 
 void HLoopOptimization::GenerateVecSub(HInstruction* org, HInstruction* offset) {
   if (vector_map_->find(org) == vector_map_->end()) {
-    HInstruction* subscript = vector_phi_;
+    HInstruction* subscript = vector_index_;
     if (offset != nullptr) {
       subscript = new (global_allocator_) HAdd(Primitive::kPrimInt, subscript, offset);
       if (org->IsPhi()) {
@@ -1178,17 +1229,27 @@ void HLoopOptimization::GenerateVecSub(HInstruction* org, HInstruction* offset)
 void HLoopOptimization::GenerateVecMem(HInstruction* org,
                                        HInstruction* opa,
                                        HInstruction* opb,
+                                       HInstruction* offset,
                                        Primitive::Type type) {
   HInstruction* vector = nullptr;
   if (vector_mode_ == kVector) {
     // Vector store or load.
+    HInstruction* base = org->InputAt(0);
     if (opb != nullptr) {
       vector = new (global_allocator_) HVecStore(
-          global_allocator_, org->InputAt(0), opa, opb, type, vector_length_);
+          global_allocator_, base, opa, opb, type, vector_length_);
     } else  {
       bool is_string_char_at = org->AsArrayGet()->IsStringCharAt();
       vector = new (global_allocator_) HVecLoad(
-          global_allocator_, org->InputAt(0), opa, type, vector_length_, is_string_char_at);
+          global_allocator_, base, opa, type, vector_length_, is_string_char_at);
+    }
+    // Known dynamically enforced alignment?
+    // TODO: detect offset + constant differences.
+    // TODO: long run, static alignment analysis?
+    if (vector_peeling_candidate_ != nullptr &&
+        vector_peeling_candidate_->base == base &&
+        vector_peeling_candidate_->offset == offset) {
+      vector->AsVecMemoryOperation()->SetAlignment(Alignment(kAlignedBase, 0));
     }
   } else {
     // Scalar store or load.
@@ -1444,6 +1505,47 @@ bool HLoopOptimization::VectorizeHalvingAddIdiom(LoopNode* node,
 }
 
 //
+// Vectorization heuristics.
+//
+
+bool HLoopOptimization::IsVectorizationProfitable(int64_t trip_count) {
+  // Current heuristic: non-empty body with sufficient number
+  // of iterations (if known).
+  // TODO: refine by looking at e.g. operation count, alignment, etc.
+  if (vector_length_ == 0) {
+    return false;  // nothing found
+  } else if (0 < trip_count && trip_count < vector_length_) {
+    return false;  // insufficient iterations
+  }
+  return true;
+}
+
+void HLoopOptimization::SetPeelingCandidate(int64_t trip_count ATTRIBUTE_UNUSED) {
+  // Current heuristic: none.
+  // TODO: implement
+}
+
+uint32_t HLoopOptimization::GetUnrollingFactor(HBasicBlock* block, int64_t trip_count) {
+  // Current heuristic: unroll by 2 on ARM64/X86 for large known trip
+  // counts and small loop bodies.
+  // TODO: refine with operation count, remaining iterations, etc.
+  //       Artem had some really cool ideas for this already.
+  switch (compiler_driver_->GetInstructionSet()) {
+    case kArm64:
+    case kX86:
+    case kX86_64: {
+      size_t num_instructions = block->GetInstructions().CountSize();
+      if (num_instructions <= 10 && trip_count >= 4 * vector_length_) {
+        return 2;
+      }
+      return 1;
+    }
+    default:
+      return 1;
+  }
+}
+
+//
 // Helpers.
 //
 
@@ -1576,8 +1678,8 @@ bool HLoopOptimization::TryReplaceWithLastValue(HLoopInformation* loop_info,
       size_t index = it->GetIndex();
       ++it;  // increment before replacing
       if (iset_->find(user->GetHolder()) == iset_->end()) {  // not excluded?
-        HLoopInformation* other_loop_info = user->GetHolder()->GetBlock()->GetLoopInformation();
         // Only update environment uses after the loop.
+        HLoopInformation* other_loop_info = user->GetHolder()->GetBlock()->GetLoopInformation();
         if (other_loop_info == nullptr || !other_loop_info->IsIn(*loop_info)) {
           user->RemoveAsUserOfInput(index);
           user->SetRawEnvAt(index, replacement);
@@ -1614,4 +1716,21 @@ void HLoopOptimization::RemoveDeadInstructions(const HInstructionList& list) {
   }
 }
 
+bool HLoopOptimization::CanRemoveCycle() {
+  for (HInstruction* i : *iset_) {
+    // We can never remove instructions that have environment
+    // uses when we compile 'debuggable'.
+    if (i->HasEnvironmentUses() && graph_->IsDebuggable()) {
+      return false;
+    }
+    // A deoptimization should never have an environment input removed.
+    for (const HUseListNode<HEnvironment*>& use : i->GetEnvUses()) {
+      if (use.GetUser()->GetHolder()->IsDeoptimize()) {
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
 }  // namespace art
author	Aart Bik <ajcbik@google.com>	2017-06-08 14:06:58 -0700
committer	Aart Bik <ajcbik@google.com>	2017-06-27 11:29:07 -0700
commit	14a68b4aa9620e4fd58907255b049fb5c18bd1ec (patch)
tree	692319b6a9344d84a2e8916c388be954d8878c41 /compiler/optimizing/loop_optimization.cc
parent	afdcd847498abc0f4e295bece443afabf8aaf868 (diff)