Partial loop unrolling during auto-vectorization for x86_64 architectures.

Test: ./test.py --host --64 Change-Id: I5186b025a7343f5b1190c1eb4de1610090d113c8 Signed-off-by: Neeraj Solanki <neeraj.solanki@intel.com>
author: Neeraj Solanki <neeraj.solanki@intel.com> 2019-10-15 14:04:15 +0530
committer: Treehugger Robot <treehugger-gerrit@google.com> 2019-11-19 14:04:33 +0000
commit: 26f6330d72e90b3162f8ead17a774a78effc82fc (patch)
tree: 55897a392a4eaf43e664cf9be090c8bbcdb1ad92 /compiler/optimizing/loop_analysis.cc
parent: a6d7f50d36365e2dc8d19d7b25328b0aff7fd5ce (diff)
1 files changed, 220 insertions, 0 deletions
diff --git a/compiler/optimizing/loop_analysis.cc b/compiler/optimizing/loop_analysis.cc
index 2ae3683ffa..78505171cb 100644
--- a/compiler/optimizing/loop_analysis.cc
+++ b/compiler/optimizing/loop_analysis.cc
@@ -178,12 +178,232 @@ class Arm64LoopHelper : public ArchDefaultLoopHelper {
   }
 };
 
+// Custom implementation of loop helper for X86_64 target. Enables heuristics for scalar loop
+// peeling and unrolling and supports SIMD loop unrolling.
+class X86_64LoopHelper : public ArchDefaultLoopHelper {
+  // mapping of machine instruction count for most used IR instructions
+  // Few IRs generate different number of instructions based on input and result type.
+  // We checked top java apps, benchmarks and used the most generated instruction count.
+  uint32_t GetMachineInstructionCount(HInstruction* inst) const {
+    switch (inst->GetKind()) {
+      case HInstruction::InstructionKind::kAbs:
+        return 3;
+      case HInstruction::InstructionKind::kAdd:
+        return 1;
+      case HInstruction::InstructionKind::kAnd:
+        return 1;
+      case HInstruction::InstructionKind::kArrayLength:
+        return 1;
+      case HInstruction::InstructionKind::kArrayGet:
+        return 1;
+      case HInstruction::InstructionKind::kArraySet:
+        return 1;
+      case HInstruction::InstructionKind::kBoundsCheck:
+        return 2;
+      case HInstruction::InstructionKind::kCheckCast:
+        return 9;
+      case HInstruction::InstructionKind::kDiv:
+        return 8;
+      case HInstruction::InstructionKind::kDivZeroCheck:
+        return 2;
+      case HInstruction::InstructionKind::kEqual:
+        return 3;
+      case HInstruction::InstructionKind::kGreaterThan:
+        return 3;
+      case HInstruction::InstructionKind::kGreaterThanOrEqual:
+        return 3;
+      case HInstruction::InstructionKind::kIf:
+        return 2;
+      case HInstruction::InstructionKind::kInstanceFieldGet:
+        return 2;
+      case HInstruction::InstructionKind::kInstanceFieldSet:
+        return 1;
+      case HInstruction::InstructionKind::kLessThan:
+        return 3;
+      case HInstruction::InstructionKind::kLessThanOrEqual:
+        return 3;
+      case HInstruction::InstructionKind::kMax:
+        return 2;
+      case HInstruction::InstructionKind::kMin:
+        return 2;
+      case HInstruction::InstructionKind::kMul:
+        return 1;
+      case HInstruction::InstructionKind::kNotEqual:
+        return 3;
+      case HInstruction::InstructionKind::kOr:
+        return 1;
+      case HInstruction::InstructionKind::kRem:
+        return 11;
+      case HInstruction::InstructionKind::kSelect:
+        return 2;
+      case HInstruction::InstructionKind::kShl:
+        return 1;
+      case HInstruction::InstructionKind::kShr:
+        return 1;
+      case HInstruction::InstructionKind::kSub:
+        return 1;
+      case HInstruction::InstructionKind::kTypeConversion:
+        return 1;
+      case HInstruction::InstructionKind::kUShr:
+        return 1;
+      case HInstruction::InstructionKind::kVecReplicateScalar:
+        return 2;
+      case HInstruction::InstructionKind::kVecExtractScalar:
+       return 1;
+      case HInstruction::InstructionKind::kVecReduce:
+        return 4;
+      case HInstruction::InstructionKind::kVecNeg:
+        return 2;
+      case HInstruction::InstructionKind::kVecAbs:
+        return 4;
+      case HInstruction::InstructionKind::kVecNot:
+        return 3;
+      case HInstruction::InstructionKind::kVecAdd:
+        return 1;
+      case HInstruction::InstructionKind::kVecSub:
+        return 1;
+      case HInstruction::InstructionKind::kVecMul:
+        return 1;
+      case HInstruction::InstructionKind::kVecDiv:
+        return 1;
+      case HInstruction::InstructionKind::kVecMax:
+        return 1;
+      case HInstruction::InstructionKind::kVecMin:
+        return 1;
+      case HInstruction::InstructionKind::kVecOr:
+        return 1;
+      case HInstruction::InstructionKind::kVecXor:
+        return 1;
+      case HInstruction::InstructionKind::kVecShl:
+        return 1;
+      case HInstruction::InstructionKind::kVecShr:
+        return 1;
+      case HInstruction::InstructionKind::kVecLoad:
+        return 1;
+      case HInstruction::InstructionKind::kVecStore:
+        return 1;
+      case HInstruction::InstructionKind::kXor:
+        return 1;
+      default:
+        return 1;
+    }
+  }
+
+  // Maximum possible unrolling factor.
+  static constexpr uint32_t kX86_64MaxUnrollFactor = 2;  // pow(2,2) = 4
+
+  // According to Intel® 64 and IA-32 Architectures Optimization Reference Manual,
+  // avoid excessive loop unrolling to ensure LSD (loop stream decoder) is operating efficiently.
+  // This variable takes care that unrolled loop instructions should not exceed LSD size.
+  // For Intel Atom processors (silvermont & goldmont), LSD size is 28
+  // TODO - identify architecture and LSD size at runtime
+  static constexpr uint32_t kX86_64UnrolledMaxBodySizeInstr = 28;
+
+  // Loop's maximum basic block count. Loops with higher count will not be partial
+  // unrolled (unknown iterations).
+  static constexpr uint32_t kX86_64UnknownIterMaxBodySizeBlocks = 2;
+
+  uint32_t GetUnrollingFactor(HLoopInformation* loop_info, HBasicBlock* header) const;
+
+ public:
+  uint32_t GetSIMDUnrollingFactor(HBasicBlock* block,
+                                  int64_t trip_count,
+                                  uint32_t max_peel,
+                                  uint32_t vector_length) const override {
+    DCHECK_NE(vector_length, 0u);
+    HLoopInformation* loop_info = block->GetLoopInformation();
+    DCHECK(loop_info);
+    HBasicBlock* header = loop_info->GetHeader();
+    DCHECK(header);
+    uint32_t unroll_factor = 0;
+
+    if ((trip_count == 0) || (trip_count == LoopAnalysisInfo::kUnknownTripCount)) {
+      // Don't unroll for large loop body size.
+      unroll_factor = GetUnrollingFactor(loop_info, header);
+      if (unroll_factor <= 1) {
+        return LoopAnalysisInfo::kNoUnrollingFactor;
+      }
+    } else {
+      // Don't unroll with insufficient iterations.
+      if (trip_count < (2 * vector_length + max_peel)) {
+        return LoopAnalysisInfo::kNoUnrollingFactor;
+      }
+
+      // Don't unroll for large loop body size.
+      uint32_t unroll_cnt = GetUnrollingFactor(loop_info, header);
+      if (unroll_cnt <= 1) {
+        return LoopAnalysisInfo::kNoUnrollingFactor;
+      }
+
+      // Find a beneficial unroll factor with the following restrictions:
+      //  - At least one iteration of the transformed loop should be executed.
+      //  - The loop body shouldn't be "too big" (heuristic).
+      uint32_t uf2 = (trip_count - max_peel) / vector_length;
+      unroll_factor = TruncToPowerOfTwo(std::min(uf2, unroll_cnt));
+      DCHECK_GE(unroll_factor, 1u);
+    }
+
+    return unroll_factor;
+  }
+};
+
+uint32_t X86_64LoopHelper::GetUnrollingFactor(HLoopInformation* loop_info,
+                                              HBasicBlock* header) const {
+  uint32_t num_inst = 0, num_inst_header = 0, num_inst_loop_body = 0;
+  for (HBlocksInLoopIterator it(*loop_info); !it.Done(); it.Advance()) {
+    HBasicBlock* block = it.Current();
+    DCHECK(block);
+    num_inst = 0;
+
+    for (HInstructionIterator it1(block->GetInstructions()); !it1.Done(); it1.Advance()) {
+      HInstruction* inst = it1.Current();
+      DCHECK(inst);
+
+      // SuspendCheck inside loop is handled with Goto.
+      // Ignoring SuspendCheck & Goto as partially unrolled loop body will have only one Goto.
+      // Instruction count for Goto is being handled during unroll factor calculation below.
+      if (inst->IsSuspendCheck() || inst->IsGoto()) {
+        continue;
+      }
+
+      num_inst += GetMachineInstructionCount(inst);
+    }
+
+    if (block == header) {
+      num_inst_header = num_inst;
+    } else {
+      num_inst_loop_body += num_inst;
+    }
+  }
+
+  // Calculate actual unroll factor.
+  uint32_t unrolling_factor = kX86_64MaxUnrollFactor;
+  uint32_t unrolling_inst = kX86_64UnrolledMaxBodySizeInstr;
+  // "-3" for one Goto instruction.
+  uint32_t desired_size = unrolling_inst - num_inst_header - 3;
+  if (desired_size < (2 * num_inst_loop_body)) {
+    return 1;
+  }
+
+  while (unrolling_factor > 0) {
+    if ((desired_size >> unrolling_factor) >= num_inst_loop_body) {
+      break;
+    }
+    unrolling_factor--;
+  }
+
+  return (1 << unrolling_factor);
+}
+
 ArchNoOptsLoopHelper* ArchNoOptsLoopHelper::Create(InstructionSet isa,
                                                    ArenaAllocator* allocator) {
   switch (isa) {
     case InstructionSet::kArm64: {
       return new (allocator) Arm64LoopHelper;
     }
+    case InstructionSet::kX86_64: {
+      return new (allocator) X86_64LoopHelper;
+    }
     default: {
       return new (allocator) ArchDefaultLoopHelper;
     }
author	Neeraj Solanki <neeraj.solanki@intel.com>	2019-10-15 14:04:15 +0530
committer	Treehugger Robot <treehugger-gerrit@google.com>	2019-11-19 14:04:33 +0000
commit	26f6330d72e90b3162f8ead17a774a78effc82fc (patch)
tree	55897a392a4eaf43e664cf9be090c8bbcdb1ad92 /compiler/optimizing/loop_analysis.cc
parent	a6d7f50d36365e2dc8d19d7b25328b0aff7fd5ce (diff)