diff options
author | Aart Bik <ajcbik@google.com> | 2017-02-06 15:35:29 -0800 |
---|---|---|
committer | Aart Bik <ajcbik@google.com> | 2017-03-31 10:58:11 -0700 |
commit | f8f5a16ed7bad1e18179e38453e59c96a944de10 (patch) | |
tree | 53369083a97103563467cc5910a439a1864dd0b1 /compiler/optimizing/loop_optimization.h | |
parent | 7298b1ae3e9af5fdb46d168302a26cfbf5d475f5 (diff) |
ART vectorizer.
Rationale:
Make SIMD great again with a retargetable and easily extendable vectorizer.
Provides a full x86/x86_64 and a proof-of-concept ARM implementation. Sample
improvement (without any perf tuning yet) for Linpack on x86 is about 20% to 50%.
Test: test-art-host, test-art-target (angler)
Bug: 34083438, 30933338
Change-Id: Ifb77a0f25f690a87cd65bf3d5e9f6be7ea71d6c1
Diffstat (limited to 'compiler/optimizing/loop_optimization.h')
-rw-r--r-- | compiler/optimizing/loop_optimization.h | 115 |
1 files changed, 106 insertions, 9 deletions
diff --git a/compiler/optimizing/loop_optimization.h b/compiler/optimizing/loop_optimization.h index 0b798fc7a9..16f7691af2 100644 --- a/compiler/optimizing/loop_optimization.h +++ b/compiler/optimizing/loop_optimization.h @@ -27,7 +27,8 @@ class CompilerDriver; /** * Loop optimizations. Builds a loop hierarchy and applies optimizations to - * the detected nested loops, such as removal of dead induction and empty loops. + * the detected nested loops, such as removal of dead induction and empty loops + * and inner loop vectorization. */ class HLoopOptimization : public HOptimization { public: @@ -50,34 +51,105 @@ class HLoopOptimization : public HOptimization { inner(nullptr), previous(nullptr), next(nullptr) {} - HLoopInformation* const loop_info; + HLoopInformation* loop_info; LoopNode* outer; LoopNode* inner; LoopNode* previous; LoopNode* next; }; - void LocalRun(); + /* + * Vectorization restrictions (bit mask). + */ + enum VectorRestrictions { + kNone = 0, // no restrictions + kNoMul = 1, // no multiplication + kNoDiv = 2, // no division + kNoShift = 4, // no shift + kNoShr = 8, // no arithmetic shift right + kNoHiBits = 16, // "wider" operations cannot bring in higher order bits + }; + + /* + * Vectorization mode during synthesis + * (sequential peeling/cleanup loop or vector loop). + */ + enum VectorMode { + kSequential, + kVector + }; + + /* + * Representation of a unit-stride array reference. + */ + struct ArrayReference { + ArrayReference(HInstruction* b, HInstruction* o, Primitive::Type t, bool l) + : base(b), offset(o), type(t), lhs(l) { } + bool operator<(const ArrayReference& other) const { + return + (base < other.base) || + (base == other.base && + (offset < other.offset || (offset == other.offset && + (type < other.type || + (type == other.type && lhs < other.lhs))))); + } + HInstruction* base; // base address + HInstruction* offset; // offset + i + Primitive::Type type; // component type + bool lhs; // def/use + }; + // Loop setup and traversal. + void LocalRun(); void AddLoop(HLoopInformation* loop_info); void RemoveLoop(LoopNode* node); - void TraverseLoopsInnerToOuter(LoopNode* node); - // Simplification. + // Optimization. void SimplifyInduction(LoopNode* node); void SimplifyBlocks(LoopNode* node); - bool SimplifyInnerLoop(LoopNode* node); + void OptimizeInnerLoop(LoopNode* node); + + // Vectorization analysis and synthesis. + bool CanVectorize(LoopNode* node, HBasicBlock* block, int64_t trip_count); + void Vectorize(LoopNode* node, HBasicBlock* block, HBasicBlock* exit, int64_t trip_count); + void GenerateNewLoop(LoopNode* node, + HBasicBlock* block, + HBasicBlock* new_preheader, + HInstruction* lo, + HInstruction* hi, + HInstruction* step); + bool VectorizeDef(LoopNode* node, HInstruction* instruction, bool generate_code); + bool VectorizeUse(LoopNode* node, + HInstruction* instruction, + bool generate_code, + Primitive::Type type, + uint64_t restrictions); + bool TrySetVectorType(Primitive::Type type, /*out*/ uint64_t* restrictions); + bool TrySetVectorLength(uint32_t length); + void GenerateVecInv(HInstruction* org, Primitive::Type type); + void GenerateVecSub(HInstruction* org, HInstruction* off); + void GenerateVecMem(HInstruction* org, + HInstruction* opa, + HInstruction* opb, + Primitive::Type type); + void GenerateVecOp(HInstruction* org, HInstruction* opa, HInstruction* opb, Primitive::Type type); // Helpers. - bool IsPhiInduction(HPhi* phi); - bool IsEmptyHeader(HBasicBlock* block); + bool TrySetPhiInduction(HPhi* phi, bool restrict_uses); + bool TrySetSimpleLoopHeader(HBasicBlock* block); bool IsEmptyBody(HBasicBlock* block); bool IsOnlyUsedAfterLoop(HLoopInformation* loop_info, HInstruction* instruction, bool collect_loop_uses, /*out*/ int32_t* use_count); + bool IsUsedOutsideLoop(HLoopInformation* loop_info, + HInstruction* instruction); bool TryReplaceWithLastValue(HInstruction* instruction, HBasicBlock* block); + bool TryAssignLastValue(HLoopInformation* loop_info, + HInstruction* instruction, + HBasicBlock* block, + bool collect_loop_uses); void RemoveDeadInstructions(const HInstructionList& list); // Compiler driver (to query ISA features). @@ -90,6 +162,9 @@ class HLoopOptimization : public HOptimization { // through this allocator is immediately released when the loop optimizer is done. ArenaAllocator* loop_allocator_; + // Global heap memory allocator. Used to build HIR. + ArenaAllocator* global_allocator_; + // Entries into the loop hierarchy representation. The hierarchy resides // in phase-local heap memory. LoopNode* top_loop_; @@ -102,11 +177,33 @@ class HLoopOptimization : public HOptimization { // Counter that tracks how many induction cycles have been simplified. Useful // to trigger incremental updates of induction variable analysis of outer loops // when the induction of inner loops has changed. - int32_t induction_simplication_count_; + uint32_t induction_simplication_count_; // Flag that tracks if any simplifications have occurred. bool simplified_; + // Number of "lanes" for selected packed type. + uint32_t vector_length_; + + // Set of array references in the vector loop. + // Contents reside in phase-local heap memory. + ArenaSet<ArrayReference>* vector_refs_; + + // Mapping used during vectorization synthesis for both the scalar peeling/cleanup + // loop (simd_ is false) and the actual vector loop (simd_ is true). The data + // structure maps original instructions into the new instructions. + // Contents reside in phase-local heap memory. + ArenaSafeMap<HInstruction*, HInstruction*>* vector_map_; + + // Temporary vectorization bookkeeping. + HBasicBlock* vector_preheader_; // preheader of the new loop + HBasicBlock* vector_header_; // header of the new loop + HBasicBlock* vector_body_; // body of the new loop + HInstruction* vector_runtime_test_a_; + HInstruction* vector_runtime_test_b_; // defines a != b runtime test + HPhi* vector_phi_; // the Phi representing the normalized loop index + VectorMode vector_mode_; // selects synthesis mode + friend class LoopOptimizationTest; DISALLOW_COPY_AND_ASSIGN(HLoopOptimization); |