Diffstat (limited to 'compiler/optimizing/load_store_elimination.cc')
-rw-r--r-- | compiler/optimizing/load_store_elimination.cc | 1321 |
1 file changed, 1290 insertions(+), 31 deletions(-)
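As orientation before the diff itself: full LSE (phases 1-4 in the file comment below) is a forward pass that tracks the last known value of every heap location, replacing loads whose value is known and dropping stores that are overwritten before they can be observed. The following is a minimal standalone sketch of that core bookkeeping, with invented types and a deliberately simplified keep/eliminate rule; it is not ART code.

#include <cstdio>
#include <map>
#include <vector>

struct Op {
  enum Kind { kLoad, kStore } kind;
  int location;  // stands in for a (reference, field/index) heap location
  int value;     // for stores: the value written
};

int main() {
  // store 1; load; store 2; load -- both loads see a known value, and a
  // store is dead once it is overwritten before being observed.
  std::vector<Op> ops = {
      {Op::kStore, 0, 1}, {Op::kLoad, 0, 0}, {Op::kStore, 0, 2}, {Op::kLoad, 0, 0}};
  std::map<int, int> heap_values;    // location -> last known value
  std::map<int, size_t> unobserved;  // location -> store with no reader yet
  std::vector<bool> keep(ops.size(), true);
  for (size_t i = 0; i < ops.size(); ++i) {
    if (ops[i].kind == Op::kStore) {
      auto it = unobserved.find(ops[i].location);
      if (it != unobserved.end()) {
        keep[it->second] = false;  // overwritten before any load: dead store
      }
      heap_values[ops[i].location] = ops[i].value;
      unobserved[ops[i].location] = i;
    } else {
      if (heap_values.count(ops[i].location) != 0) {
        keep[i] = false;  // value already known: redundant load
      }
      // This sketch conservatively keeps a store once a load observes it;
      // the real pass is far more precise.
      unobserved.erase(ops[i].location);
    }
  }
  for (size_t i = 0; i < ops.size(); ++i) {
    std::printf("op %zu: %s\n", i, keep[i] ? "keep" : "eliminate");
  }
  return 0;
}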
diff --git a/compiler/optimizing/load_store_elimination.cc b/compiler/optimizing/load_store_elimination.cc
index 2e0f2b12d5..17ce694750 100644
--- a/compiler/optimizing/load_store_elimination.cc
+++ b/compiler/optimizing/load_store_elimination.cc
@@ -27,16 +27,23 @@
 #include "base/bit_vector-inl.h"
 #include "base/bit_vector.h"
 #include "base/globals.h"
+#include "base/indenter.h"
 #include "base/iteration_range.h"
 #include "base/scoped_arena_allocator.h"
 #include "base/scoped_arena_containers.h"
+#include "base/transform_iterator.h"
 #include "escape.h"
 #include "execution_subgraph.h"
+#include "handle.h"
 #include "load_store_analysis.h"
+#include "mirror/class_loader.h"
+#include "mirror/dex_cache.h"
 #include "nodes.h"
+#include "optimizing/execution_subgraph.h"
 #include "optimizing_compiler_stats.h"
 #include "reference_type_propagation.h"
 #include "side_effects_analysis.h"
+#include "stack_map.h"
 
 /**
  * The general algorithm of load-store elimination (LSE).
@@ -57,6 +64,9 @@
  * - In phase 4, we commit the changes, replacing loads marked for elimination
  *   in previous processing and removing stores not marked for keeping. We also
  *   remove allocations that are no longer needed.
+ * - In phase 5, we move allocations which only escape along some executions
+ *   closer to their escape points and fix up the non-escaping paths with their
+ *   actual values, creating PHIs when needed.
  *
  * 1. Walk over blocks and their instructions.
  *
@@ -82,7 +92,9 @@
  *   to maintain the validity of all heap locations during the optimization
  *   phase, we only record substitutes at this phase and the real elimination
  *   is delayed till the end of LSE. Loads that require a loop Phi placeholder
- *   replacement are recorded for processing later.
+ *   replacement are recorded for processing later. We also keep track of the
+ *   heap value at the load so that partial LSE can later predicate the
+ *   load.
  * - If the instruction is a store, it updates the heap value for the heap
  *   location with the stored value and records the store itself so that we can
  *   mark it for keeping if the value becomes observable. Heap values are
@@ -228,7 +240,80 @@
  * The time complexity of this phase is
  *    O(instructions + instruction_uses) .
  *
- * FIXME: The time complexity described above assumes that the
+ * 5. Partial LSE
+ *
+ * Move allocations closer to their escapes and remove/predicate loads and
+ * stores as required.
+ *
+ * Partial singletons are objects which only escape from the function or have
+ * multiple names along certain execution paths. In cases where we recognize
+ * these partial singletons we can move the allocation and initialization
+ * closer to the actual escape(s). We can then perform a simplified version of
+ * LSE step 2 to determine the unescaped value of any reads performed after the
+ * object may have escaped. These values are used to replace the reads with
+ * 'predicated-read' instructions, where the value is only read if the object
+ * has actually escaped. We use the existence of the object itself as the
+ * marker of whether escape has occurred.
+ *
+ * There are several steps in this sub-pass.
+ *
+ * 5.1 Group references
+ *
+ * Since all heap-locations for a single reference escape at the same time, we
+ * need to group the heap-locations by reference and process them at the same
+ * time.
+ *
+ *    O(heap_locations).
+ *
+ * FIXME: The time complexity above assumes we can bucket the heap-locations in
+ * O(1), which is not true since we just perform a linear scan of the heap-ref
+ * list. Since there are generally only a small number of heap-references which
+ * are partial-singletons this is fine and has lower real overhead than a hash
+ * map.
+ *
+ * 5.2 Generate materializations
+ *
+ * Once we have the references we add new 'materialization blocks' on the edges
+ * where escape becomes inevitable. This information is calculated from the
+ * execution-subgraphs created during load-store-analysis. We create new
+ * 'materialization' instructions in these blocks and initialize them with the
+ * value of each heap-location, ignoring side effects (since the object hasn't
+ * escaped yet). In the worst case this has the same time-complexity as step 3,
+ * since we may need to materialize phis.
+ *
+ *    O(heap_locations^2 * materialization_edges)
+ *
+ * 5.3 Propagate materializations
+ *
+ * Since we use the materialization as the marker for escape, we need to
+ * propagate it throughout the graph. Since the subgraph analysis considers any
+ * lifetime that escapes a loop (and hence would require a loop-phi) to be
+ * escaping at the loop-header, we do not need to create any loop-phis to do
+ * this.
+ *
+ *    O(edges)
+ *
+ * NB: Currently the subgraph analysis considers all objects to have their
+ * lifetimes start at the entry block. This simplifies the analysis enormously
+ * but means that we cannot distinguish between an escape in a loop where the
+ * lifetime does not escape the loop (in which case this pass could optimize)
+ * and one where it does escape the loop (in which case the whole loop is
+ * escaping). This is a shortcoming that would be good to fix at some point.
+ *
+ * 5.4 Propagate partial values
+ *
+ * We need to replace loads and stores to the partial reference with predicated
+ * ones that have default non-escaping values. Again this is the same as step 3.
+ *
+ *    O(heap_locations^2 * edges)
+ *
+ * 5.5 Final fixup
+ *
+ * Now all we need to do is replace and remove uses of the old reference with
+ * the appropriate materialization.
+ *
+ *    O(instructions + uses)
+ *
+ * FIXME: The time complexities described above assume that the
  * HeapLocationCollector finds a heap location for an instruction in O(1)
  * time but it is currently O(heap_locations); this can be fixed by adding
  * a hash map to the HeapLocationCollector.
@@ -236,11 +321,18 @@
 
 namespace art {
 
+#define LSE_VLOG \
+  if (::art::LoadStoreElimination::kVerboseLoggingMode && VLOG_IS_ON(compiler)) LOG(INFO)
+
+class PartialLoadStoreEliminationHelper;
+class HeapRefHolder;
+
 // Use HGraphDelegateVisitor for which all VisitInvokeXXX() delegate to VisitInvoke().
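To make the phase-5 description concrete before the LSEVisitor class below, here is the source-level effect partial LSE is after, on a hypothetical method (invented example, not taken from this commit): the allocation sinks to its only escaping edge, and the final read becomes predicated on whether the object was ever materialized.

struct Point { int x; };
Point* g_sink = nullptr;
void Escape(Point* p) { g_sink = p; }  // stand-in for any escape point

int Before(bool cond) {
  Point* p = new Point{1};  // allocated even when it never escapes
  if (cond) Escape(p);
  return p->x;  // real load: p may have escaped and been written elsewhere
}

int After(bool cond) {
  Point* p = nullptr;  // null doubles as the "has not escaped" marker
  if (cond) {
    p = new Point{1};  // materialized only on the escaping edge
    Escape(p);
  }
  // Predicated read: consult the object only if it materialized; otherwise
  // use the value tracked for the non-escaping path (here, 1).
  return (p != nullptr) ? p->x : 1;
}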
class LSEVisitor final : private HGraphDelegateVisitor { public: LSEVisitor(HGraph* graph, const HeapLocationCollector& heap_location_collector, + bool perform_partial_lse, OptimizingCompilerStats* stats); void Run(); @@ -278,6 +370,45 @@ class LSEVisitor final : private HGraphDelegateVisitor { uint32_t heap_location_; }; + struct Marker {}; + + class Value; + + class PriorValueHolder { + public: + constexpr explicit PriorValueHolder(Value prior); + + constexpr bool IsInstruction() const { + return std::holds_alternative<HInstruction*>(value_); + } + constexpr bool IsPhi() const { + return std::holds_alternative<PhiPlaceholder>(value_); + } + constexpr bool IsDefault() const { + return std::holds_alternative<Marker>(value_); + } + constexpr PhiPlaceholder GetPhiPlaceholder() const { + DCHECK(IsPhi()); + return std::get<PhiPlaceholder>(value_); + } + constexpr HInstruction* GetInstruction() const { + DCHECK(IsInstruction()); + return std::get<HInstruction*>(value_); + } + + Value ToValue() const; + void Dump(std::ostream& oss) const; + + constexpr bool Equals(PriorValueHolder other) const { + return value_ == other.value_; + } + + private: + std::variant<Marker, HInstruction*, PhiPlaceholder> value_; + }; + + friend constexpr bool operator==(const Marker&, const Marker&); + friend constexpr bool operator==(const PriorValueHolder& p1, const PriorValueHolder& p2); friend constexpr bool operator==(const PhiPlaceholder& p1, const PhiPlaceholder& p2); friend std::ostream& operator<<(std::ostream& oss, const PhiPlaceholder& p2); @@ -310,6 +441,14 @@ class LSEVisitor final : private HGraphDelegateVisitor { return Value(ValuelessType::kPureUnknown); } + static constexpr Value PartialUnknown(Value old_value) { + if (old_value.IsInvalid() || old_value.IsPureUnknown()) { + return PureUnknown(); + } else { + return Value(PriorValueHolder(old_value)); + } + } + static constexpr Value MergedUnknown(PhiPlaceholder phi_placeholder) { return Value(MergedUnknownMarker{phi_placeholder}); } @@ -346,6 +485,10 @@ class LSEVisitor final : private HGraphDelegateVisitor { GetValuelessType() == ValuelessType::kInvalid; } + bool IsPartialUnknown() const { + return std::holds_alternative<PriorValueHolder>(value_); + } + bool IsMergedUnknown() const { return std::holds_alternative<MergedUnknownMarker>(value_); } @@ -356,7 +499,7 @@ class LSEVisitor final : private HGraphDelegateVisitor { } bool IsUnknown() const { - return IsPureUnknown() || IsMergedUnknown(); + return IsPureUnknown() || IsMergedUnknown() || IsPartialUnknown(); } bool IsDefault() const { @@ -381,10 +524,15 @@ class LSEVisitor final : private HGraphDelegateVisitor { } HInstruction* GetInstruction() const { - DCHECK(IsInstruction()); + DCHECK(IsInstruction()) << *this; return std::get<HInstruction*>(value_); } + PriorValueHolder GetPriorValue() const { + DCHECK(IsPartialUnknown()); + return std::get<PriorValueHolder>(value_); + } + PhiPlaceholder GetPhiPlaceholder() const { DCHECK(NeedsPhi() || IsMergedUnknown()); if (NeedsNonLoopPhi()) { @@ -402,7 +550,7 @@ class LSEVisitor final : private HGraphDelegateVisitor { } HBasicBlock* GetMergeBlock(const HGraph* graph) const { - DCHECK(IsMergedUnknown()) << this; + DCHECK(IsMergedUnknown()) << *this; return graph->GetBlocks()[GetMergeBlockId()]; } @@ -411,6 +559,8 @@ class LSEVisitor final : private HGraphDelegateVisitor { return GetPhiPlaceholder().GetHeapLocation(); } + constexpr bool ExactEquals(Value other) const; + constexpr bool Equals(Value other) const; constexpr bool Equals(HInstruction* instruction) 
const { @@ -427,7 +577,8 @@ class LSEVisitor final : private HGraphDelegateVisitor { HInstruction*, MergedUnknownMarker, NeedsNonLoopPhiMarker, - NeedsLoopPhiMarker>; + NeedsLoopPhiMarker, + PriorValueHolder>; constexpr ValuelessType GetValuelessType() const { return std::get<ValuelessType>(value_); } @@ -493,7 +644,9 @@ class LSEVisitor final : private HGraphDelegateVisitor { } Value Replacement(Value value) const { - DCHECK(value.NeedsPhi()); + DCHECK(value.NeedsPhi() || + (current_phase_ == Phase::kPartialElimination && value.IsMergedUnknown())) + << value << " phase: " << current_phase_; Value replacement = phi_placeholder_replacements_[PhiPlaceholderIndex(value)]; DCHECK(replacement.IsUnknown() || replacement.IsInstruction()); DCHECK(replacement.IsUnknown() || @@ -502,6 +655,16 @@ class LSEVisitor final : private HGraphDelegateVisitor { } Value ReplacementOrValue(Value value) const { + if (current_phase_ == Phase::kPartialElimination) { + if (value.IsPartialUnknown()) { + value = value.GetPriorValue().ToValue(); + } + if (value.IsMergedUnknown()) { + return phi_placeholder_replacements_[PhiPlaceholderIndex(value)].IsValid() + ? Replacement(value) + : Value::ForLoopPhiPlaceholder(value.GetPhiPlaceholder()); + } + } if (value.NeedsPhi() && phi_placeholder_replacements_[PhiPlaceholderIndex(value)].IsValid()) { return Replacement(value); } else { @@ -598,6 +761,7 @@ class LSEVisitor final : private HGraphDelegateVisitor { static bool IsLoad(HInstruction* instruction) { // Unresolved load is not treated as a load. return instruction->IsInstanceFieldGet() || + instruction->IsPredicatedInstanceFieldGet() || instruction->IsStaticFieldGet() || instruction->IsVecLoad() || instruction->IsArrayGet(); @@ -623,7 +787,7 @@ class LSEVisitor final : private HGraphDelegateVisitor { // Keep the store referenced by the instruction, or all stores that feed a Phi placeholder. // This is necessary if the stored heap value can be observed. 
void KeepStores(Value value) { - if (value.IsPureUnknown()) { + if (value.IsPureUnknown() || value.IsPartialUnknown()) { return; } if (value.IsMergedUnknown()) { @@ -743,12 +907,16 @@ class LSEVisitor final : private HGraphDelegateVisitor { void VisitGetLocation(HInstruction* instruction, size_t idx); void VisitSetLocation(HInstruction* instruction, size_t idx, HInstruction* value); + void RecordFieldInfo(const FieldInfo* info, size_t heap_loc) { + field_infos_[heap_loc] = info; + } void VisitBasicBlock(HBasicBlock* block) override; enum class Phase { kLoadElimination, - kStoreElimination + kStoreElimination, + kPartialElimination, }; bool TryReplacingLoopPhiPlaceholderWithDefault( @@ -765,8 +933,10 @@ class LSEVisitor final : private HGraphDelegateVisitor { bool can_use_default_or_phi); bool MaterializeLoopPhis(const ScopedArenaVector<size_t>& phi_placeholder_indexes, DataType::Type type); + bool MaterializeLoopPhis(ArrayRef<const size_t> phi_placeholder_indexes, DataType::Type type); bool MaterializeLoopPhis(const ArenaBitVector& phi_placeholders_to_materialize, DataType::Type type); + bool FullyMaterializePhi(PhiPlaceholder phi_placeholder, DataType::Type type); std::optional<PhiPlaceholder> TryToMaterializeLoopPhis(PhiPlaceholder phi_placeholder, HInstruction* load); void ProcessLoopPhiWithUnknownInput(PhiPlaceholder loop_phi_with_unknown_input); @@ -776,6 +946,22 @@ class LSEVisitor final : private HGraphDelegateVisitor { void UpdateValueRecordForStoreElimination(/*inout*/ValueRecord* value_record); void FindOldValueForPhiPlaceholder(PhiPlaceholder phi_placeholder, DataType::Type type); void FindStoresWritingOldValues(); + void FinishFullLSE(); + void PrepareForPartialPhiComputation(); + // Create materialization block and materialization object for the given predecessor of entry. + HInstruction* SetupPartialMaterialization(PartialLoadStoreEliminationHelper& helper, + HeapRefHolder&& holder, + size_t pred_idx, + HBasicBlock* blk); + // Returns the value that would be read by the 'read' instruction on + // 'orig_new_inst' if 'orig_new_inst' has not escaped. + HInstruction* GetPartialValueAt(HNewInstance* orig_new_inst, HInstruction* read); + void MovePartialEscapes(); + + void VisitPredicatedInstanceFieldGet(HPredicatedInstanceFieldGet* instruction) override { + LOG(FATAL) << "Visited instruction " << instruction->DumpWithoutArgs() + << " but LSE should be the only source of predicated-ifield-gets!"; + } void VisitInstanceFieldGet(HInstanceFieldGet* instruction) override { HInstruction* object = instruction->InputAt(0); @@ -914,7 +1100,7 @@ class LSEVisitor final : private HGraphDelegateVisitor { } if (side_effects.DoesAnyWrite()) { // The value may be clobbered. - heap_values[i].value = Value::PureUnknown(); + heap_values[i].value = Value::PartialUnknown(heap_values[i].value); } } } @@ -1010,6 +1196,12 @@ class LSEVisitor final : private HGraphDelegateVisitor { } } + bool ShouldPerformPartialLSE() const { + return perform_partial_lse_ && !GetGraph()->IsCompilingOsr(); + } + + bool perform_partial_lse_; + const HeapLocationCollector& heap_location_collector_; // Use local allocator for allocating memory. @@ -1035,6 +1227,12 @@ class LSEVisitor final : private HGraphDelegateVisitor { // in the end. These are indexed by the load's id. ScopedArenaVector<HInstruction*> substitute_instructions_for_loads_; + // Value at the start of the given instruction for instructions which directly + // read from a heap-location (i.e. FieldGet). 
The mapping to heap-location is + // implicit through the fact that each instruction can only directly refer to + // a single heap-location. + ScopedArenaHashMap<HInstruction*, Value> intermediate_values_; + // Record stores to keep in a bit vector indexed by instruction ID. ArenaBitVector kept_stores_; // When we need to keep all stores that feed a Phi placeholder, we just record the @@ -1063,23 +1261,70 @@ class LSEVisitor final : private HGraphDelegateVisitor { ScopedArenaVector<HInstruction*> singleton_new_instances_; + // The field infos for each heap location (if relevant). + ScopedArenaVector<const FieldInfo*> field_infos_; + Phase current_phase_; + friend class PartialLoadStoreEliminationHelper; + friend struct ScopedRestoreHeapValues; + friend std::ostream& operator<<(std::ostream& os, const Value& v); - friend std::ostream& operator<<(std::ostream& os, const Phase& phase); + friend std::ostream& operator<<(std::ostream& os, const PriorValueHolder& v); + friend std::ostream& operator<<(std::ostream& oss, const LSEVisitor::Phase& phase); DISALLOW_COPY_AND_ASSIGN(LSEVisitor); }; +std::ostream& operator<<(std::ostream& oss, const LSEVisitor::PriorValueHolder& p) { + p.Dump(oss); + return oss; +} + std::ostream& operator<<(std::ostream& oss, const LSEVisitor::Phase& phase) { switch (phase) { case LSEVisitor::Phase::kLoadElimination: return oss << "kLoadElimination"; case LSEVisitor::Phase::kStoreElimination: return oss << "kStoreElimination"; + case LSEVisitor::Phase::kPartialElimination: + return oss << "kPartialElimination"; } } +void LSEVisitor::PriorValueHolder::Dump(std::ostream& oss) const { + if (IsDefault()) { + oss << "Default"; + } else if (IsPhi()) { + oss << "Phi: " << GetPhiPlaceholder(); + } else { + oss << "Instruction: " << *GetInstruction(); + } +} + +constexpr LSEVisitor::PriorValueHolder::PriorValueHolder(Value val) + : value_(Marker{}) { + DCHECK(!val.IsInvalid() && !val.IsPureUnknown()); + if (val.IsPartialUnknown()) { + value_ = val.GetPriorValue().value_; + } else if (val.IsMergedUnknown() || val.NeedsPhi()) { + value_ = val.GetPhiPlaceholder(); + } else if (val.IsInstruction()) { + value_ = val.GetInstruction(); + } else { + DCHECK(val.IsDefault()); + } +} + +constexpr bool operator==(const LSEVisitor::Marker&, const LSEVisitor::Marker&) { + return true; +} + +constexpr bool operator==(const LSEVisitor::PriorValueHolder& p1, + const LSEVisitor::PriorValueHolder& p2) { + return p1.Equals(p2); +} + constexpr bool operator==(const LSEVisitor::PhiPlaceholder& p1, const LSEVisitor::PhiPlaceholder& p2) { return p1.Equals(p2); @@ -1105,6 +1350,20 @@ std::ostream& operator<<(std::ostream& oss, const LSEVisitor::PhiPlaceholder& p) return oss; } +LSEVisitor::Value LSEVisitor::PriorValueHolder::ToValue() const { + if (IsDefault()) { + return Value::Default(); + } else if (IsPhi()) { + return Value::ForLoopPhiPlaceholder(GetPhiPlaceholder()); + } else { + return Value::ForInstruction(GetInstruction()); + } +} + +constexpr bool LSEVisitor::Value::ExactEquals(LSEVisitor::Value other) const { + return value_ == other.value_; +} + constexpr bool LSEVisitor::Value::Equals(LSEVisitor::Value other) const { // Only valid values can be compared. 
DCHECK(IsValid()); @@ -1129,6 +1388,8 @@ std::ostream& LSEVisitor::Value::Dump(std::ostream& os) const { case ValuelessType::kInvalid: return os << "Invalid"; } + } else if (IsPartialUnknown()) { + return os << "PartialUnknown[" << GetPriorValue() << "]"; } else if (IsInstruction()) { return os << "Instruction[id: " << GetInstruction()->GetId() << ", block: " << GetInstruction()->GetBlock()->GetBlockId() << "]"; @@ -1151,8 +1412,10 @@ std::ostream& operator<<(std::ostream& os, const LSEVisitor::Value& v) { LSEVisitor::LSEVisitor(HGraph* graph, const HeapLocationCollector& heap_location_collector, + bool perform_partial_lse, OptimizingCompilerStats* stats) : HGraphDelegateVisitor(graph, stats), + perform_partial_lse_(perform_partial_lse), heap_location_collector_(heap_location_collector), allocator_(graph->GetArenaStack()), num_phi_placeholders_(GetGraph()->GetBlocks().size() * @@ -1166,13 +1429,14 @@ LSEVisitor::LSEVisitor(HGraph* graph, substitute_instructions_for_loads_(graph->GetCurrentInstructionId(), nullptr, allocator_.Adapter(kArenaAllocLSE)), + intermediate_values_(allocator_.Adapter(kArenaAllocLSE)), kept_stores_(&allocator_, - /*start_bits=*/ graph->GetCurrentInstructionId(), - /*expandable=*/ false, + /*start_bits=*/graph->GetCurrentInstructionId(), + /*expandable=*/false, kArenaAllocLSE), phi_placeholders_to_search_for_kept_stores_(&allocator_, num_phi_placeholders_, - /*expandable=*/ false, + /*expandable=*/false, kArenaAllocLSE), loads_requiring_loop_phi_(allocator_.Adapter(kArenaAllocLSE)), store_records_(allocator_.Adapter(kArenaAllocLSE)), @@ -1180,10 +1444,12 @@ LSEVisitor::LSEVisitor(HGraph* graph, Value::Invalid(), allocator_.Adapter(kArenaAllocLSE)), kept_merged_unknowns_(&allocator_, - /*start_bits=*/ num_phi_placeholders_, - /*expandable=*/ false, + /*start_bits=*/num_phi_placeholders_, + /*expandable=*/false, kArenaAllocLSE), singleton_new_instances_(allocator_.Adapter(kArenaAllocLSE)), + field_infos_(heap_location_collector_.GetNumberOfHeapLocations(), + allocator_.Adapter(kArenaAllocLSE)), current_phase_(Phase::kLoadElimination) { // Clear bit vectors. phi_placeholders_to_search_for_kept_stores_.ClearAllBits(); @@ -1249,9 +1515,13 @@ void LSEVisitor::PrepareLoopRecords(HBasicBlock* block) { // Don't eliminate loads in irreducible loops. if (block->GetLoopInformation()->IsIrreducible()) { heap_values.resize(num_heap_locations, - {/*value=*/Value::PureUnknown(), /*stored_by=*/Value::PureUnknown()}); + {/*value=*/Value::Invalid(), /*stored_by=*/Value::PureUnknown()}); // Also keep the stores before the loop header, including in blocks that were not visited yet. + bool is_osr = GetGraph()->IsCompilingOsr(); for (size_t idx = 0u; idx != num_heap_locations; ++idx) { + heap_values[idx].value = + is_osr ? 
Value::PureUnknown() + : Value::MergedUnknown(GetPhiPlaceholder(block->GetBlockId(), idx)); KeepStores(Value::ForLoopPhiPlaceholder(GetPhiPlaceholder(block->GetBlockId(), idx))); } return; @@ -1410,9 +1680,10 @@ void LSEVisitor::MaterializeNonLoopPhis(PhiPlaceholder phi_placeholder, DataType phi_inputs.clear(); for (HBasicBlock* predecessor : current_block->GetPredecessors()) { Value pred_value = ReplacementOrValue(heap_values_for_[predecessor->GetBlockId()][idx].value); - DCHECK(!pred_value.IsUnknown()) - << "block " << current_block->GetBlockId() << " pred: " << predecessor->GetBlockId(); - if (pred_value.NeedsNonLoopPhi()) { + DCHECK(!pred_value.IsPureUnknown()) << pred_value << " block " << current_block->GetBlockId() + << " pred: " << predecessor->GetBlockId(); + if (pred_value.NeedsNonLoopPhi() || + (current_phase_ == Phase::kPartialElimination && pred_value.IsMergedUnknown())) { // We need to process the Phi placeholder first. work_queue.push_back(pred_value.GetPhiPlaceholder()); } else if (pred_value.IsDefault()) { @@ -1439,7 +1710,17 @@ void LSEVisitor::VisitGetLocation(HInstruction* instruction, size_t idx) { uint32_t block_id = instruction->GetBlock()->GetBlockId(); ScopedArenaVector<ValueRecord>& heap_values = heap_values_for_[block_id]; ValueRecord& record = heap_values[idx]; + if (instruction->IsFieldAccess()) { + RecordFieldInfo(&instruction->GetFieldInfo(), idx); + } DCHECK(record.value.IsUnknown() || record.value.Equals(ReplacementOrValue(record.value))); + // If we are unknown, we either come from somewhere untracked or we can reconstruct the partial + // value. + DCHECK(!record.value.IsPureUnknown() || + heap_location_collector_.GetHeapLocation(idx)->GetReferenceInfo() == nullptr || + !heap_location_collector_.GetHeapLocation(idx)->GetReferenceInfo()->IsPartialSingleton()) + << "In " << GetGraph()->PrettyMethod() << ": " << record.value << " for " << *instruction; + intermediate_values_.insert({instruction, record.value}); loads_and_stores_.push_back({ instruction, idx }); if ((record.value.IsDefault() || record.value.NeedsNonLoopPhi()) && !IsDefaultOrPhiAllowedForLoad(instruction)) { @@ -1475,6 +1756,9 @@ void LSEVisitor::VisitGetLocation(HInstruction* instruction, size_t idx) { void LSEVisitor::VisitSetLocation(HInstruction* instruction, size_t idx, HInstruction* value) { DCHECK_NE(idx, HeapLocationCollector::kHeapLocationNotFound); DCHECK(!IsStore(value)) << value->DebugName(); + if (instruction->IsFieldAccess()) { + RecordFieldInfo(&instruction->GetFieldInfo(), idx); + } // value may already have a substitute. value = FindSubstitute(value); HBasicBlock* block = instruction->GetBlock(); @@ -1533,7 +1817,7 @@ void LSEVisitor::VisitSetLocation(HInstruction* instruction, size_t idx, HInstru // Kill heap locations that may alias and keep previous stores to these locations. 
KeepStores(heap_values[i].stored_by); heap_values[i].stored_by = Value::PureUnknown(); - heap_values[i].value = Value::PureUnknown(); + heap_values[i].value = Value::PartialUnknown(heap_values[i].value); } } @@ -1753,6 +2037,8 @@ std::optional<LSEVisitor::PhiPlaceholder> LSEVisitor::FindLoopPhisToMaterialize( if (!phi_placeholders_to_materialize->IsBitSet(PhiPlaceholderIndex(value))) { phi_placeholders_to_materialize->SetBit(PhiPlaceholderIndex(value)); work_queue.push_back(value.GetPhiPlaceholder()); + LSE_VLOG << "For materialization of " << phi_placeholder + << " we need to materialize " << value; } } } @@ -1764,6 +2050,11 @@ std::optional<LSEVisitor::PhiPlaceholder> LSEVisitor::FindLoopPhisToMaterialize( bool LSEVisitor::MaterializeLoopPhis(const ScopedArenaVector<size_t>& phi_placeholder_indexes, DataType::Type type) { + return MaterializeLoopPhis(ArrayRef<const size_t>(phi_placeholder_indexes), type); +} + +bool LSEVisitor::MaterializeLoopPhis(ArrayRef<const size_t> phi_placeholder_indexes, + DataType::Type type) { // Materialize all predecessors that do not need a loop Phi and determine if all inputs // other than loop Phis are the same. const ArenaVector<HBasicBlock*>& blocks = GetGraph()->GetBlocks(); @@ -1775,8 +2066,11 @@ bool LSEVisitor::MaterializeLoopPhis(const ScopedArenaVector<size_t>& phi_placeh size_t idx = phi_placeholder.GetHeapLocation(); for (HBasicBlock* predecessor : block->GetPredecessors()) { Value value = ReplacementOrValue(heap_values_for_[predecessor->GetBlockId()][idx].value); - if (value.NeedsNonLoopPhi()) { - DCHECK(current_phase_ == Phase::kLoadElimination); + if (value.NeedsNonLoopPhi() || + (current_phase_ == Phase::kPartialElimination && value.IsMergedUnknown())) { + DCHECK(current_phase_ == Phase::kLoadElimination || + current_phase_ == Phase::kPartialElimination) + << current_phase_; MaterializeNonLoopPhis(value.GetPhiPlaceholder(), type); value = Replacement(value); } @@ -2001,6 +2295,15 @@ bool LSEVisitor::MaterializeLoopPhis(const ArenaBitVector& phi_placeholders_to_m return true; } +bool LSEVisitor::FullyMaterializePhi(PhiPlaceholder phi_placeholder, DataType::Type type) { + ScopedArenaAllocator saa(GetGraph()->GetArenaStack()); + ArenaBitVector abv(&saa, num_phi_placeholders_, false, ArenaAllocKind::kArenaAllocLSE); + auto res = + FindLoopPhisToMaterialize(phi_placeholder, &abv, type, /* can_use_default_or_phi=*/true); + CHECK(!res.has_value()) << *res; + return MaterializeLoopPhis(abv, type); +} + std::optional<LSEVisitor::PhiPlaceholder> LSEVisitor::TryToMaterializeLoopPhis( PhiPlaceholder phi_placeholder, HInstruction* load) { DCHECK(phi_placeholder_replacements_[PhiPlaceholderIndex(phi_placeholder)].IsInvalid()); @@ -2144,7 +2447,7 @@ void LSEVisitor::ProcessLoopPhiWithUnknownInput(PhiPlaceholder loop_phi_with_unk // propagated as a value to this load) and store the load as the new heap value. found_unreplaceable_load = true; KeepStores(record.value); - record.value = Value::PureUnknown(); + record.value = Value::MergedUnknown(record.value.GetPhiPlaceholder()); local_heap_values[idx] = Value::ForInstruction(load_or_store); } else if (local_heap_values[idx].NeedsLoopPhi()) { // The load may still be replaced with a Phi later. 
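The next hunk introduces ScopedRestoreHeapValues: store elimination mutates the per-block heap-value records, but partial LSE later needs the original values back, so every change is recorded and undone on scope exit. A minimal sketch of that RAII pattern, with invented names:

#include <cstdio>
#include <utility>
#include <vector>

class ScopedRestore {
 public:
  explicit ScopedRestore(std::vector<int>& values) : values_(values) {}
  ~ScopedRestore() {
    for (auto& [idx, old_value] : saved_) {
      values_[idx] = old_value;  // undo every recorded mutation
    }
  }
  template <typename Fn>
  void ForEach(Fn fn) {
    for (size_t i = 0; i < values_.size(); ++i) {
      int before = values_[i];
      fn(&values_[i]);
      if (values_[i] != before) {
        saved_.push_back({i, before});  // only changed entries are recorded
      }
    }
  }

 private:
  std::vector<int>& values_;
  std::vector<std::pair<size_t, int>> saved_;
};

int main() {
  std::vector<int> vals = {1, 2, 3};
  {
    ScopedRestore restore(vals);
    restore.ForEach([](int* v) { if (*v == 2) *v = 42; });
    std::printf("inside scope: %d\n", vals[1]);  // 42
  }
  std::printf("after scope: %d\n", vals[1]);  // 2 again
  return 0;
}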
@@ -2386,7 +2689,57 @@ void LSEVisitor::FindOldValueForPhiPlaceholder(PhiPlaceholder phi_placeholder, !success); } +struct ScopedRestoreHeapValues { + public: + ScopedRestoreHeapValues(ArenaStack* alloc, + size_t num_heap_locs, + ScopedArenaVector<ScopedArenaVector<LSEVisitor::ValueRecord>>& to_restore) + : alloc_(alloc), + updated_values_(alloc_.Adapter(kArenaAllocLSE)), + to_restore_(to_restore) { + updated_values_.reserve(num_heap_locs * to_restore_.size()); + } + + ~ScopedRestoreHeapValues() { + for (const auto& rec : updated_values_) { + to_restore_[rec.blk_id][rec.heap_loc].value = rec.val_; + } + } + + template<typename Func> + void ForEachRecord(Func func) { + for (size_t blk_id : Range(to_restore_.size())) { + for (size_t heap_loc : Range(to_restore_[blk_id].size())) { + LSEVisitor::ValueRecord* vr = &to_restore_[blk_id][heap_loc]; + LSEVisitor::Value initial = vr->value; + func(vr); + if (!vr->value.ExactEquals(initial)) { + updated_values_.push_back({blk_id, heap_loc, initial}); + } + } + } + } + + private: + struct UpdateRecord { + size_t blk_id; + size_t heap_loc; + LSEVisitor::Value val_; + }; + ScopedArenaAllocator alloc_; + ScopedArenaVector<UpdateRecord> updated_values_; + ScopedArenaVector<ScopedArenaVector<LSEVisitor::ValueRecord>>& to_restore_; + + DISALLOW_COPY_AND_ASSIGN(ScopedRestoreHeapValues); +}; + void LSEVisitor::FindStoresWritingOldValues() { + // Partial LSE relies on knowing the real heap-values not the + // store-replacement versions so we need to restore the map after removing + // stores. + ScopedRestoreHeapValues heap_vals(allocator_.GetArenaStack(), + heap_location_collector_.GetNumberOfHeapLocations(), + heap_values_for_); // The Phi placeholder replacements have so far been used for eliminating loads, // tracking values that would be stored if all stores were kept. As we want to // compare actual old values after removing unmarked stores, prune the Phi @@ -2401,10 +2754,14 @@ void LSEVisitor::FindStoresWritingOldValues() { } // Update heap values at end of blocks. - for (HBasicBlock* block : GetGraph()->GetReversePostOrder()) { - for (ValueRecord& value_record : heap_values_for_[block->GetBlockId()]) { - UpdateValueRecordForStoreElimination(&value_record); - } + heap_vals.ForEachRecord([&](ValueRecord* rec) { + UpdateValueRecordForStoreElimination(rec); + }); + + if (kIsDebugBuild) { + heap_vals.ForEachRecord([](ValueRecord* rec) { + DCHECK(!rec->value.NeedsNonLoopPhi()) << rec->value; + }); } // Use local allocator to reduce peak memory usage. @@ -2458,7 +2815,903 @@ void LSEVisitor::Run() { FindStoresWritingOldValues(); // 4. Replace loads and remove unnecessary stores and singleton allocations. + FinishFullLSE(); + + // 5. Move partial escapes down and fixup with PHIs. + current_phase_ = Phase::kPartialElimination; + MovePartialEscapes(); +} + +// Clear unknown loop-phi results. Here we'll be able to use partial-unknowns so we need to +// retry all of them with more information about where they come from. 
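PrepareForPartialPhiComputation, defined immediately below, is a single std::replace_if over the placeholder-replacement table: results that degraded to "pure unknown" during full LSE are reset to invalid so the partial phase recomputes them with the extra escape information it now has. The idiom in isolation (illustrative values only, not ART types):

#include <algorithm>
#include <optional>
#include <vector>

int main() {
  // nullopt = never computed; -1 stands in for "computed, but pure unknown".
  std::vector<std::optional<int>> replacements = {5, -1, std::nullopt, 7, -1};
  std::replace_if(
      replacements.begin(), replacements.end(),
      [](const std::optional<int>& v) { return v.has_value() && *v == -1; },
      std::optional<int>());  // back to invalid, to be recomputed later
  return 0;
}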
+void LSEVisitor::PrepareForPartialPhiComputation() { + std::replace_if( + phi_placeholder_replacements_.begin(), + phi_placeholder_replacements_.end(), + [](const Value& val) { return val.IsPureUnknown(); }, + Value::Invalid()); +} + +class PartialLoadStoreEliminationHelper { + public: + PartialLoadStoreEliminationHelper(LSEVisitor* lse, ScopedArenaAllocator* alloc) + : lse_(lse), + alloc_(alloc), + new_ref_phis_(alloc_->Adapter(kArenaAllocLSE)), + heap_refs_(alloc_->Adapter(kArenaAllocLSE)), + max_preds_per_block_((*std::max_element(GetGraph()->GetActiveBlocks().begin(), + GetGraph()->GetActiveBlocks().end(), + [](HBasicBlock* a, HBasicBlock* b) { + return a->GetNumberOfPredecessors() < + b->GetNumberOfPredecessors(); + })) + ->GetNumberOfPredecessors()), + materialization_blocks_(GetGraph()->GetBlocks().size() * max_preds_per_block_, + nullptr, + alloc_->Adapter(kArenaAllocLSE)), + first_materialization_block_id_(GetGraph()->GetBlocks().size()) { + heap_refs_.reserve(lse_->heap_location_collector_.GetNumberOfReferenceInfos()); + new_ref_phis_.reserve(lse_->heap_location_collector_.GetNumberOfReferenceInfos() * + GetGraph()->GetBlocks().size()); + CollectInterestingHeapRefs(); + } + + ~PartialLoadStoreEliminationHelper() { + if (heap_refs_.empty()) { + return; + } + ReferenceTypePropagation rtp_fixup(GetGraph(), + Handle<mirror::ClassLoader>(), + Handle<mirror::DexCache>(), + /* is_first_run= */ false); + rtp_fixup.Visit(ArrayRef<HInstruction* const>(new_ref_phis_)); + GetGraph()->ClearLoopInformation(); + GetGraph()->ClearDominanceInformation(); + GetGraph()->ClearReachabilityInformation(); + GetGraph()->BuildDominatorTree(); + GetGraph()->ComputeReachabilityInformation(); + } + class IdxToHeapLoc { + public: + explicit IdxToHeapLoc(const HeapLocationCollector* hlc) : collector_(hlc) {} + HeapLocation* operator()(size_t idx) const { + return collector_->GetHeapLocation(idx); + } + + private: + const HeapLocationCollector* collector_; + }; + + + class HeapReferenceData { + public: + using LocIterator = IterationRange<TransformIterator<BitVector::IndexIterator, IdxToHeapLoc>>; + HeapReferenceData(PartialLoadStoreEliminationHelper* helper, + HNewInstance* new_inst, + const ExecutionSubgraph* subgraph, + ScopedArenaAllocator* alloc) + : new_instance_(new_inst), + helper_(helper), + heap_locs_(alloc, + helper->lse_->heap_location_collector_.GetNumberOfHeapLocations(), + /* expandable= */ false, + kArenaAllocLSE), + materializations_( + // We generally won't need to create too many materialization blocks and we can expand + // this as needed so just start off with 2x. 
+ 2 * helper->lse_->GetGraph()->GetBlocks().size(), + nullptr, + alloc->Adapter(kArenaAllocLSE)), + collector_(helper->lse_->heap_location_collector_), + subgraph_(subgraph) {} + + LocIterator IterateLocations() { + auto idxs = heap_locs_.Indexes(); + return MakeTransformRange(idxs, IdxToHeapLoc(&collector_)); + } + + void AddHeapLocation(size_t idx) { + heap_locs_.SetBit(idx); + } + + const ExecutionSubgraph* GetNoEscapeSubgraph() const { + return subgraph_; + } + + bool IsPostEscape(HBasicBlock* blk) { + return std::any_of( + subgraph_->GetExcludedCohorts().cbegin(), + subgraph_->GetExcludedCohorts().cend(), + [&](const ExecutionSubgraph::ExcludedCohort& ec) { return ec.PrecedesBlock(blk); }); + } + + bool InEscapeCohort(HBasicBlock* blk) { + return std::any_of( + subgraph_->GetExcludedCohorts().cbegin(), + subgraph_->GetExcludedCohorts().cend(), + [&](const ExecutionSubgraph::ExcludedCohort& ec) { return ec.ContainsBlock(blk); }); + } + + bool BeforeAllEscapes(HBasicBlock* b) { + return std::none_of(subgraph_->GetExcludedCohorts().cbegin(), + subgraph_->GetExcludedCohorts().cend(), + [&](const ExecutionSubgraph::ExcludedCohort& ec) { + return ec.PrecedesBlock(b) || ec.ContainsBlock(b); + }); + } + + HNewInstance* OriginalNewInstance() const { + return new_instance_; + } + + // Collect and replace all uses. We need to perform this twice since we will + // generate PHIs and additional uses as we create the default-values for + // pred-gets. These values might be other references that are also being + // partially eliminated. By running just the replacement part again we are + // able to avoid having to keep another whole in-progress partial map + // around. Since we will have already handled all the other uses in the + // first pass the second one will be quite fast. + void FixupUses(bool first_pass) { + ScopedArenaAllocator saa(GetGraph()->GetArenaStack()); + // Replace uses with materialized values. + ScopedArenaVector<InstructionUse<HInstruction>> to_replace(saa.Adapter(kArenaAllocLSE)); + ScopedArenaVector<HInstruction*> to_remove(saa.Adapter(kArenaAllocLSE)); + // Do we need to add a constructor-fence. + ScopedArenaVector<InstructionUse<HConstructorFence>> constructor_fences( + saa.Adapter(kArenaAllocLSE)); + ScopedArenaVector<InstructionUse<HInstruction>> to_predicate(saa.Adapter(kArenaAllocLSE)); + + CollectReplacements(to_replace, to_remove, constructor_fences, to_predicate); + + if (!first_pass) { + // If another partial creates new references they can only be in Phis or pred-get defaults + // so they must be in the to_replace group. + DCHECK(to_predicate.empty()); + DCHECK(constructor_fences.empty()); + DCHECK(to_remove.empty()); + } + + ReplaceInput(to_replace); + RemoveInput(to_remove); + CreateConstructorFences(constructor_fences); + PredicateInstructions(to_predicate); + + CHECK(OriginalNewInstance()->GetUses().empty()) + << OriginalNewInstance()->GetUses() << ", " << OriginalNewInstance()->GetEnvUses(); + } + + void AddMaterialization(HBasicBlock* blk, HInstruction* ins) { + if (blk->GetBlockId() >= materializations_.size()) { + // Make sure the materialization array is large enough, try to avoid + // re-sizing too many times by giving extra space. 
+ materializations_.resize(blk->GetBlockId() * 2, nullptr); + } + DCHECK(materializations_[blk->GetBlockId()] == nullptr) + << "Already have a materialization in block " << blk->GetBlockId() << ": " + << *materializations_[blk->GetBlockId()] << " when trying to set materialization to " + << *ins; + materializations_[blk->GetBlockId()] = ins; + LSE_VLOG << "In block " << blk->GetBlockId() << " materialization is " << *ins; + helper_->NotifyNewMaterialization(ins); + } + + bool HasMaterialization(HBasicBlock* blk) const { + return blk->GetBlockId() < materializations_.size() && + materializations_[blk->GetBlockId()] != nullptr; + } + + HInstruction* GetMaterialization(HBasicBlock* blk) const { + if (materializations_.size() <= blk->GetBlockId() || + materializations_[blk->GetBlockId()] == nullptr) { + // This must be a materialization block added after the partial LSE of + // the current reference finished. Since every edge can only have at + // most one materialization block added to it we can just check the + // blocks predecessor. + DCHECK(helper_->IsMaterializationBlock(blk)); + blk = helper_->FindDominatingNonMaterializationBlock(blk); + DCHECK(!helper_->IsMaterializationBlock(blk)); + } + DCHECK_GT(materializations_.size(), blk->GetBlockId()); + DCHECK(materializations_[blk->GetBlockId()] != nullptr); + return materializations_[blk->GetBlockId()]; + } + + void GenerateMaterializationValueFromPredecessors(HBasicBlock* blk) { + DCHECK(std::none_of(GetNoEscapeSubgraph()->GetExcludedCohorts().begin(), + GetNoEscapeSubgraph()->GetExcludedCohorts().end(), + [&](const ExecutionSubgraph::ExcludedCohort& cohort) { + return cohort.IsEntryBlock(blk); + })); + DCHECK(!HasMaterialization(blk)); + if (blk->IsExitBlock()) { + return; + } else if (blk->IsLoopHeader()) { + // See comment in execution_subgraph.h. Currently we act as though every + // allocation for partial elimination takes place in the entry block. + // This simplifies the analysis by making it so any escape cohort + // expands to contain any loops it is a part of. This is something that + // we should rectify at some point. In either case however we can still + // special case the loop-header since (1) currently the loop can't have + // any merges between different cohort entries since the pre-header will + // be the earliest place entry can happen and (2) even if the analysis + // is improved to consider lifetime of the object WRT loops any values + // which would require loop-phis would have to make the whole loop + // escape anyway. + // This all means we can always use value from the pre-header when the + // block is the loop-header and we didn't already create a + // materialization block. (NB when we do improve the analysis we will + // need to modify the materialization creation code to deal with this + // correctly.) 
+        HInstruction* pre_header_val =
+            GetMaterialization(blk->GetLoopInformation()->GetPreHeader());
+        AddMaterialization(blk, pre_header_val);
+        return;
+      }
+      ScopedArenaAllocator saa(GetGraph()->GetArenaStack());
+      ScopedArenaVector<HInstruction*> pred_vals(saa.Adapter(kArenaAllocLSE));
+      pred_vals.reserve(blk->GetNumberOfPredecessors());
+      for (HBasicBlock* pred : blk->GetPredecessors()) {
+        DCHECK(HasMaterialization(pred));
+        pred_vals.push_back(GetMaterialization(pred));
+      }
+      GenerateMaterializationValueFromPredecessorsDirect(blk, pred_vals);
+    }
+
+    void GenerateMaterializationValueFromPredecessorsForEntry(
+        HBasicBlock* entry, const ScopedArenaVector<HInstruction*>& pred_vals) {
+      DCHECK(std::any_of(GetNoEscapeSubgraph()->GetExcludedCohorts().begin(),
+                         GetNoEscapeSubgraph()->GetExcludedCohorts().end(),
+                         [&](const ExecutionSubgraph::ExcludedCohort& cohort) {
+                           return cohort.IsEntryBlock(entry);
+                         }));
+      GenerateMaterializationValueFromPredecessorsDirect(entry, pred_vals);
+    }
+
+   private:
+    template <typename InstructionType>
+    struct InstructionUse {
+      InstructionType* instruction_;
+      size_t index_;
+    };
+
+    void ReplaceInput(const ScopedArenaVector<InstructionUse<HInstruction>>& to_replace) {
+      for (auto& [ins, idx] : to_replace) {
+        HInstruction* merged_inst = GetMaterialization(ins->GetBlock());
+        if (ins->IsPhi() && merged_inst->IsPhi() && ins->GetBlock() == merged_inst->GetBlock()) {
+          // For Phis we just pass through the appropriate inputs.
+          ins->ReplaceInput(merged_inst->InputAt(idx), idx);
+        } else {
+          ins->ReplaceInput(merged_inst, idx);
+        }
+      }
+    }
+
+    void RemoveInput(const ScopedArenaVector<HInstruction*>& to_remove) {
+      for (HInstruction* ins : to_remove) {
+        if (ins->GetBlock() == nullptr) {
+          // Already dealt with.
+          continue;
+        }
+        DCHECK(BeforeAllEscapes(ins->GetBlock())) << *ins;
+        if (ins->IsInstanceFieldGet() || ins->IsInstanceFieldSet()) {
+          ins->GetBlock()->RemoveInstruction(ins);
+        } else {
+          // Can only be obj == other, obj != other, obj == obj (!?), or obj != obj (!?).
+          // Since PHIs are escapes as far as LSE is concerned and we are before
+          // any escapes, these are the only 4 options.
+          DCHECK(ins->IsEqual() || ins->IsNotEqual()) << *ins;
+          HInstruction* replacement;
+          if (UNLIKELY(ins->InputAt(0) == ins->InputAt(1))) {
+            replacement = ins->IsEqual() ? GetGraph()->GetIntConstant(1)
+                                         : GetGraph()->GetIntConstant(0);
+          } else {
+            replacement = ins->IsEqual() ? GetGraph()->GetIntConstant(0)
+                                         : GetGraph()->GetIntConstant(1);
+          }
+          ins->ReplaceWith(replacement);
+          ins->GetBlock()->RemoveInstruction(ins);
+        }
+      }
+    }
+
+    void CreateConstructorFences(
+        const ScopedArenaVector<InstructionUse<HConstructorFence>>& constructor_fences) {
+      if (!constructor_fences.empty()) {
+        uint32_t pc = constructor_fences.front().instruction_->GetDexPc();
+        for (auto& [cf, idx] : constructor_fences) {
+          if (cf->GetInputs().size() == 1) {
+            cf->GetBlock()->RemoveInstruction(cf);
+          } else {
+            cf->RemoveInputAt(idx);
+          }
+        }
+        for (const ExecutionSubgraph::ExcludedCohort& ec :
+             GetNoEscapeSubgraph()->GetExcludedCohorts()) {
+          for (HBasicBlock* blk : ec.EntryBlocks()) {
+            for (HBasicBlock* materializer :
+                 Filter(MakeIterationRange(blk->GetPredecessors()),
+                        [&](HBasicBlock* blk) { return helper_->IsMaterializationBlock(blk); })) {
+              HInstruction* new_cf = new (GetGraph()->GetAllocator()) HConstructorFence(
+                  GetMaterialization(materializer), pc, GetGraph()->GetAllocator());
+              materializer->InsertInstructionBefore(new_cf, materializer->GetLastInstruction());
+            }
+          }
+        }
+      }
+    }
+
+    void PredicateInstructions(
+        const ScopedArenaVector<InstructionUse<HInstruction>>& to_predicate) {
+      for (auto& [ins, idx] : to_predicate) {
+        if (UNLIKELY(ins->GetBlock() == nullptr)) {
+          // Already handled due to obj == obj.
+          continue;
+        } else if (ins->IsInstanceFieldGet()) {
+          // IFieldGet[obj] => PredicatedIFieldGet[PartialValue, obj]
+          HInstruction* new_fget = new (GetGraph()->GetAllocator()) HPredicatedInstanceFieldGet(
+              ins->AsInstanceFieldGet(),
+              GetMaterialization(ins->GetBlock()),
+              helper_->lse_->GetPartialValueAt(OriginalNewInstance(), ins));
+          MaybeRecordStat(helper_->lse_->stats_, MethodCompilationStat::kPredicatedLoadAdded);
+          ins->GetBlock()->InsertInstructionBefore(new_fget, ins);
+          if (ins->GetType() == DataType::Type::kReference) {
+            // Reference info is the same.
+            new_fget->SetReferenceTypeInfo(ins->GetReferenceTypeInfo());
+          }
+          ins->ReplaceWith(new_fget);
+          ins->ReplaceEnvUsesDominatedBy(ins, new_fget);
+          CHECK(ins->GetEnvUses().empty() && ins->GetUses().empty())
+              << "Instruction: " << *ins << " uses: " << ins->GetUses()
+              << ", env: " << ins->GetEnvUses();
+          ins->GetBlock()->RemoveInstruction(ins);
+        } else if (ins->IsInstanceFieldSet()) {
+          // Any predicated sets shouldn't require movement.
+          ins->AsInstanceFieldSet()->SetIsPredicatedSet();
+          MaybeRecordStat(helper_->lse_->stats_, MethodCompilationStat::kPredicatedStoreAdded);
+          HInstruction* merged_inst = GetMaterialization(ins->GetBlock());
+          ins->ReplaceInput(merged_inst, idx);
+        } else {
+          // Comparisons need to be split into 2.
+          DCHECK(ins->IsEqual() || ins->IsNotEqual()) << "bad instruction " << *ins;
+          bool this_is_first = idx == 0;
+          if (ins->InputAt(0) == ins->InputAt(1)) {
+            // This is an obj == obj or obj != obj.
+            // No idea why anyone would do this but whatever.
+            ins->ReplaceWith(GetGraph()->GetIntConstant(ins->IsEqual() ? 1 : 0));
+            ins->GetBlock()->RemoveInstruction(ins);
+            continue;
+          } else {
+            HInstruction* is_escaped = new (GetGraph()->GetAllocator())
+                HNotEqual(GetMaterialization(ins->GetBlock()), GetGraph()->GetNullConstant());
+            HInstruction* combine_inst =
+                ins->IsEqual() ? static_cast<HInstruction*>(new (GetGraph()->GetAllocator()) HAnd(
+                                     DataType::Type::kBool, is_escaped, ins))
+                               : static_cast<HInstruction*>(new (GetGraph()->GetAllocator()) HOr(
+                                     DataType::Type::kBool, is_escaped, ins));
+            ins->ReplaceInput(GetMaterialization(ins->GetBlock()), this_is_first ? 0 : 1);
+            ins->GetBlock()->InsertInstructionBefore(is_escaped, ins);
+            ins->GetBlock()->InsertInstructionAfter(combine_inst, ins);
+            ins->ReplaceWith(combine_inst);
+            combine_inst->ReplaceInput(ins, 1);
+          }
+        }
+      }
+    }
+
+    // Figure out all the instructions we need to
+    // fixup/replace/remove/duplicate. Since this requires an iteration of an
+    // intrusive linked list we want to do it only once and collect all the data
+    // here.
+    void CollectReplacements(
+        ScopedArenaVector<InstructionUse<HInstruction>>& to_replace,
+        ScopedArenaVector<HInstruction*>& to_remove,
+        ScopedArenaVector<InstructionUse<HConstructorFence>>& constructor_fences,
+        ScopedArenaVector<InstructionUse<HInstruction>>& to_predicate) {
+      size_t size = new_instance_->GetUses().SizeSlow();
+      to_replace.reserve(size);
+      to_remove.reserve(size);
+      constructor_fences.reserve(size);
+      to_predicate.reserve(size);
+      for (auto& use : new_instance_->GetUses()) {
+        HBasicBlock* blk =
+            helper_->FindDominatingNonMaterializationBlock(use.GetUser()->GetBlock());
+        if (InEscapeCohort(blk)) {
+          LSE_VLOG << "Replacing " << *new_instance_ << " use in " << *use.GetUser() << " with "
+                   << *GetMaterialization(blk);
+          to_replace.push_back({use.GetUser(), use.GetIndex()});
+        } else if (IsPostEscape(blk)) {
+          LSE_VLOG << "User " << *use.GetUser() << " after escapes!";
+          // The fields + cmp are normal uses. Phi can only be here if it was
+          // generated by full LSE so whatever store+load that created the phi
+          // is the escape.
+          if (use.GetUser()->IsPhi()) {
+            to_replace.push_back({use.GetUser(), use.GetIndex()});
+          } else {
+            DCHECK(use.GetUser()->IsFieldAccess() ||
+                   use.GetUser()->IsEqual() ||
+                   use.GetUser()->IsNotEqual())
+                << *use.GetUser() << "@" << use.GetIndex();
+            to_predicate.push_back({use.GetUser(), use.GetIndex()});
+          }
+        } else if (use.GetUser()->IsConstructorFence()) {
+          LSE_VLOG << "User " << *use.GetUser() << " being moved to materialization!";
+          constructor_fences.push_back({use.GetUser()->AsConstructorFence(), use.GetIndex()});
+        } else {
+          LSE_VLOG << "User " << *use.GetUser() << " not contained in cohort!";
+          to_remove.push_back(use.GetUser());
+        }
+      }
+      DCHECK_EQ(
+          to_replace.size() + to_remove.size() + constructor_fences.size() + to_predicate.size(),
+          size);
+    }
+
+    void GenerateMaterializationValueFromPredecessorsDirect(
+        HBasicBlock* blk, const ScopedArenaVector<HInstruction*>& pred_vals) {
+      DCHECK(!pred_vals.empty());
+      bool all_equal = std::all_of(pred_vals.begin() + 1, pred_vals.end(), [&](HInstruction* val) {
+        return val == pred_vals.front();
+      });
+      if (LIKELY(all_equal)) {
+        AddMaterialization(blk, pred_vals.front());
+      } else {
+        // Make a PHI for the predecessors.
+ HPhi* phi = new (GetGraph()->GetAllocator()) HPhi( + GetGraph()->GetAllocator(), kNoRegNumber, pred_vals.size(), DataType::Type::kReference); + for (const auto& [ins, off] : ZipCount(MakeIterationRange(pred_vals))) { + phi->SetRawInputAt(off, ins); + } + blk->AddPhi(phi); + AddMaterialization(blk, phi); + } + } + + HGraph* GetGraph() const { + return helper_->GetGraph(); + } + + HNewInstance* new_instance_; + PartialLoadStoreEliminationHelper* helper_; + ArenaBitVector heap_locs_; + ScopedArenaVector<HInstruction*> materializations_; + const HeapLocationCollector& collector_; + const ExecutionSubgraph* subgraph_; + }; + + ArrayRef<HeapReferenceData> GetHeapRefs() { + return ArrayRef<HeapReferenceData>(heap_refs_); + } + + bool IsMaterializationBlock(HBasicBlock* blk) const { + return blk->GetBlockId() >= first_materialization_block_id_; + } + + HBasicBlock* GetOrCreateMaterializationBlock(HBasicBlock* entry, size_t pred_num) { + size_t idx = GetMaterializationBlockIndex(entry, pred_num); + HBasicBlock* blk = materialization_blocks_[idx]; + if (blk == nullptr) { + blk = new (GetGraph()->GetAllocator()) HBasicBlock(GetGraph()); + GetGraph()->AddBlock(blk); + LSE_VLOG << "creating materialization block " << blk->GetBlockId() << " on edge " + << entry->GetPredecessors()[pred_num]->GetBlockId() << "->" << entry->GetBlockId(); + blk->AddInstruction(new (GetGraph()->GetAllocator()) HGoto()); + materialization_blocks_[idx] = blk; + } + return blk; + } + + HBasicBlock* GetMaterializationBlock(HBasicBlock* entry, size_t pred_num) { + HBasicBlock* out = materialization_blocks_[GetMaterializationBlockIndex(entry, pred_num)]; + DCHECK(out != nullptr) << "No materialization block for edge " << entry->GetBlockId() << "->" + << entry->GetPredecessors()[pred_num]->GetBlockId(); + return out; + } + + IterationRange<ArenaVector<HBasicBlock*>::const_iterator> IterateMaterializationBlocks() { + return MakeIterationRange(GetGraph()->GetBlocks().begin() + first_materialization_block_id_, + GetGraph()->GetBlocks().end()); + } + + void FixupPartialObjectUsers() { + for (PartialLoadStoreEliminationHelper::HeapReferenceData& ref_data : GetHeapRefs()) { + // Use the materialized instances to replace original instance + ref_data.FixupUses(/*first_pass=*/true); + CHECK(ref_data.OriginalNewInstance()->GetUses().empty()) + << ref_data.OriginalNewInstance()->GetUses() << ", " + << ref_data.OriginalNewInstance()->GetEnvUses(); + } + // This can cause new uses to be created due to the creation of phis/pred-get defaults + for (PartialLoadStoreEliminationHelper::HeapReferenceData& ref_data : GetHeapRefs()) { + // Only need to handle new phis/pred-get defaults. DCHECK that's all we find. + ref_data.FixupUses(/*first_pass=*/false); + CHECK(ref_data.OriginalNewInstance()->GetUses().empty()) + << ref_data.OriginalNewInstance()->GetUses() << ", " + << ref_data.OriginalNewInstance()->GetEnvUses(); + } + } + + // Finds the first block which either is or dominates the given block which is + // not a materialization block + HBasicBlock* FindDominatingNonMaterializationBlock(HBasicBlock* blk) { + if (LIKELY(!IsMaterializationBlock(blk))) { + // Not a materialization block so itself. + return blk; + } else if (blk->GetNumberOfPredecessors() != 0) { + // We're far enough along that the materialization blocks have been + // inserted into the graph so no need to go searching. + return blk->GetSinglePredecessor(); + } + // Search through the materialization blocks to find where it will be + // inserted. 
+    for (auto [mat, idx] : ZipCount(MakeIterationRange(materialization_blocks_))) {
+      if (mat == blk) {
+        size_t cur_pred_idx = idx % max_preds_per_block_;
+        HBasicBlock* entry = GetGraph()->GetBlocks()[idx / max_preds_per_block_];
+        return entry->GetPredecessors()[cur_pred_idx];
+      }
+    }
+    LOG(FATAL) << "Unable to find materialization block position for " << blk->GetBlockId() << "!";
+    return nullptr;
+  }
+
+  void InsertMaterializationBlocks() {
+    for (auto [mat, idx] : ZipCount(MakeIterationRange(materialization_blocks_))) {
+      if (mat == nullptr) {
+        continue;
+      }
+      size_t cur_pred_idx = idx % max_preds_per_block_;
+      HBasicBlock* entry = GetGraph()->GetBlocks()[idx / max_preds_per_block_];
+      HBasicBlock* pred = entry->GetPredecessors()[cur_pred_idx];
+      mat->InsertBetween(pred, entry);
+      LSE_VLOG << "Adding materialization block " << mat->GetBlockId() << " on edge "
+               << pred->GetBlockId() << "->" << entry->GetBlockId();
+    }
+  }
+
+  // Replace any env-uses remaining of the partial singletons with the
+  // appropriate phis and remove the instructions.
+  void RemoveReplacedInstructions() {
+    for (HeapReferenceData& ref_data : GetHeapRefs()) {
+      CHECK(ref_data.OriginalNewInstance()->GetUses().empty())
+          << ref_data.OriginalNewInstance()->GetUses() << ", "
+          << ref_data.OriginalNewInstance()->GetEnvUses()
+          << " inst is: " << ref_data.OriginalNewInstance();
+      const auto& env_uses = ref_data.OriginalNewInstance()->GetEnvUses();
+      while (!env_uses.empty()) {
+        const HUseListNode<HEnvironment*>& use = env_uses.front();
+        HInstruction* merged_inst =
+            ref_data.GetMaterialization(use.GetUser()->GetHolder()->GetBlock());
+        LSE_VLOG << "Replacing env use of " << *use.GetUser()->GetHolder() << "@" << use.GetIndex()
+                 << " with " << *merged_inst;
+        use.GetUser()->ReplaceInput(merged_inst, use.GetIndex());
+      }
+      ref_data.OriginalNewInstance()->GetBlock()->RemoveInstruction(ref_data.OriginalNewInstance());
+    }
+  }
+
+  // We need to make sure any allocations dominate their environment uses.
+  // Technically we could probably remove the env-uses and be fine but this is easy.
+  void ReorderMaterializationsForEnvDominance() {
+    for (HBasicBlock* blk : IterateMaterializationBlocks()) {
+      ScopedArenaAllocator alloc(alloc_->GetArenaStack());
+      ArenaBitVector still_unsorted(
+          &alloc, GetGraph()->GetCurrentInstructionId(), false, kArenaAllocLSE);
+      // This is guaranteed to be very short (since we will abandon LSE if there
+      // are >= kMaxNumberOfHeapLocations (32) heap locations so that is the
+      // absolute maximum size this list can be) so doing a selection sort is
+      // fine. This avoids the need to do a complicated recursive check to
+      // ensure transitivity for std::sort.
+      ScopedArenaVector<HNewInstance*> materializations(alloc.Adapter(kArenaAllocLSE));
+      materializations.reserve(GetHeapRefs().size());
+      for (HInstruction* ins :
+           MakeSTLInstructionIteratorRange(HInstructionIterator(blk->GetInstructions()))) {
+        if (ins->IsNewInstance()) {
+          materializations.push_back(ins->AsNewInstance());
+          still_unsorted.SetBit(ins->GetId());
+        }
+      }
+      using Iter = ScopedArenaVector<HNewInstance*>::iterator;
+      Iter unsorted_start = materializations.begin();
+      Iter unsorted_end = materializations.end();
+      // Selection sort. Required since the only check we can easily perform is
+      // an is-before-all-unsorted check.
+      while (unsorted_start != unsorted_end) {
+        bool found_instruction = false;
+        for (Iter candidate = unsorted_start; candidate != unsorted_end; ++candidate) {
+          HNewInstance* ni = *candidate;
+          if (std::none_of(ni->GetAllEnvironments().cbegin(),
+                           ni->GetAllEnvironments().cend(),
+                           [&](const HEnvironment* env) {
+                             return std::any_of(
+                                 env->GetEnvInputs().cbegin(),
+                                 env->GetEnvInputs().cend(),
+                                 [&](const HInstruction* env_element) {
+                                   return env_element != nullptr &&
+                                          still_unsorted.IsBitSet(env_element->GetId());
+                                 });
+                           })) {
+            still_unsorted.ClearBit(ni->GetId());
+            std::swap(*unsorted_start, *candidate);
+            ++unsorted_start;
+            found_instruction = true;
+            break;
+          }
+        }
+        CHECK(found_instruction) << "Unable to select next materialization instruction."
+                                 << " Environments have a dependency loop!";
+      }
+      // Reverse so that as we prepend them we end up with the correct order.
+      auto reverse_iter = MakeIterationRange(materializations.rbegin(), materializations.rend());
+      for (HNewInstance* ins : reverse_iter) {
+        if (blk->GetFirstInstruction() != ins) {
+          // Don't do checks since that makes sure the move is safe WRT
+          // ins->CanBeMoved which for NewInstance is false.
+          ins->MoveBefore(blk->GetFirstInstruction(), /*do_checks=*/false);
+        }
+      }
+    }
+  }
+
+ private:
+  void CollectInterestingHeapRefs() {
+    // Get all the partials we need to move around.
+    for (size_t i = 0; i < lse_->heap_location_collector_.GetNumberOfHeapLocations(); ++i) {
+      ReferenceInfo* ri = lse_->heap_location_collector_.GetHeapLocation(i)->GetReferenceInfo();
+      if (ri->IsPartialSingleton() &&
+          ri->GetReference()->GetBlock() != nullptr &&
+          ri->GetNoEscapeSubgraph()->ContainsBlock(ri->GetReference()->GetBlock())) {
+        RecordHeapRefField(ri->GetReference()->AsNewInstance(), i);
+      }
+    }
+  }
+
+  void RecordHeapRefField(HNewInstance* ni, size_t loc) {
+    DCHECK(ni != nullptr);
+    // This is likely to be very short so just do a linear search.
+    auto it = std::find_if(heap_refs_.begin(), heap_refs_.end(), [&](HeapReferenceData& data) {
+      return data.OriginalNewInstance() == ni;
+    });
+    HeapReferenceData& cur_ref =
+        (it == heap_refs_.end())
+            ? heap_refs_.emplace_back(this,
+                                      ni,
+                                      lse_->heap_location_collector_.GetHeapLocation(loc)
+                                          ->GetReferenceInfo()
+                                          ->GetNoEscapeSubgraph(),
+                                      alloc_)
+            : *it;
+    cur_ref.AddHeapLocation(loc);
+  }
+
+  void NotifyNewMaterialization(HInstruction* ins) {
+    if (ins->IsPhi()) {
+      new_ref_phis_.push_back(ins->AsPhi());
+    }
+  }
+
+  size_t GetMaterializationBlockIndex(HBasicBlock* blk, size_t pred_num) const {
+    DCHECK_LT(blk->GetBlockId(), first_materialization_block_id_)
+        << "block is a materialization block!";
+    DCHECK_LT(pred_num, max_preds_per_block_);
+    return blk->GetBlockId() * max_preds_per_block_ + pred_num;
+  }
+
+  HGraph* GetGraph() const {
+    return lse_->GetGraph();
+  }
+
+  LSEVisitor* lse_;
+  ScopedArenaAllocator* alloc_;
+  ScopedArenaVector<HInstruction*> new_ref_phis_;
+  ScopedArenaVector<HeapReferenceData> heap_refs_;
+  size_t max_preds_per_block_;
+  // An array of (# of non-materialization blocks) * max_preds_per_block
+  // arranged in block-id major order. Since we can only have at most one
+  // materialization block on each edge this is the maximum possible number of
+  // materialization blocks.
+  ScopedArenaVector<HBasicBlock*> materialization_blocks_;
+  size_t first_materialization_block_id_;
+
+  friend void LSEVisitor::MovePartialEscapes();
+  friend class HeapReferenceData;
+};
+
+// Work around C++ type-checking annoyances with not being able to
+// forward-declare inner types.
+class HeapRefHolder
+    : public std::reference_wrapper<PartialLoadStoreEliminationHelper::HeapReferenceData> {};
+
+HInstruction* LSEVisitor::SetupPartialMaterialization(PartialLoadStoreEliminationHelper& helper,
+                                                      HeapRefHolder&& holder,
+                                                      size_t pred_idx,
+                                                      HBasicBlock* entry) {
+  PartialLoadStoreEliminationHelper::HeapReferenceData& ref_data = holder.get();
+  HBasicBlock* old_pred = entry->GetPredecessors()[pred_idx];
+  HInstruction* new_inst = ref_data.OriginalNewInstance();
+  if (UNLIKELY(!new_inst->GetBlock()->Dominates(entry))) {
+    LSE_VLOG << "Initial materialization in non-dominating block " << entry->GetBlockId()
+             << " is null!";
+    return GetGraph()->GetNullConstant();
+  }
+  HBasicBlock* bb = helper.GetOrCreateMaterializationBlock(entry, pred_idx);
+  CHECK(bb != nullptr) << "entry " << entry->GetBlockId() << " -> " << old_pred->GetBlockId();
+  HNewInstance* repl_create = new_inst->Clone(GetGraph()->GetAllocator())->AsNewInstance();
+  repl_create->SetPartialMaterialization();
+  bb->InsertInstructionBefore(repl_create, bb->GetLastInstruction());
+  repl_create->CopyEnvironmentFrom(new_inst->GetEnvironment());
+  MaybeRecordStat(stats_, MethodCompilationStat::kPartialAllocationMoved);
+  LSE_VLOG << "In blk " << bb->GetBlockId() << " initial materialization is " << *repl_create;
+  ref_data.AddMaterialization(bb, repl_create);
+  const FieldInfo* info = nullptr;
+  for (const HeapLocation* loc : ref_data.IterateLocations()) {
+    size_t loc_off = heap_location_collector_.GetHeapLocationIndex(loc);
+    info = field_infos_[loc_off];
+    DCHECK(loc->GetIndex() == nullptr);
+    Value value = ReplacementOrValue(heap_values_for_[old_pred->GetBlockId()][loc_off].value);
+    if (value.NeedsLoopPhi()) {
+      Value repl = phi_placeholder_replacements_[PhiPlaceholderIndex(value.GetPhiPlaceholder())];
+      DCHECK(!repl.IsUnknown());
+      DCHECK(repl.IsDefault() || repl.IsInvalid() || repl.IsInstruction())
+          << repl << " from " << value << " pred is " << old_pred->GetBlockId();
+      if (!repl.IsInvalid()) {
+        value = repl;
+      } else {
+        FullyMaterializePhi(value.GetPhiPlaceholder(), info->GetFieldType());
+        value = phi_placeholder_replacements_[PhiPlaceholderIndex(value.GetPhiPlaceholder())];
+      }
+    } else if (value.NeedsNonLoopPhi()) {
+      Value repl = phi_placeholder_replacements_[PhiPlaceholderIndex(value.GetPhiPlaceholder())];
+      DCHECK(repl.IsDefault() || repl.IsInvalid() || repl.IsInstruction())
+          << repl << " from " << value << " pred is " << old_pred->GetBlockId();
+      if (!repl.IsInvalid()) {
+        value = repl;
+      } else {
+        MaterializeNonLoopPhis(value.GetPhiPlaceholder(), info->GetFieldType());
+        value = phi_placeholder_replacements_[PhiPlaceholderIndex(value.GetPhiPlaceholder())];
+      }
+    }
+    DCHECK(value.IsDefault() || value.IsInstruction())
+        << GetGraph()->PrettyMethod() << ": " << value;
+
+    if (!value.IsDefault() &&
+        // shadow$_klass_ doesn't need to be manually initialized.
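+        // (The class pointer is written by the HNewInstance allocation
+        // itself, so no explicit field store is rematerialized for it.)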
+        MemberOffset(loc->GetOffset()) != mirror::Object::ClassOffset()) {
+      CHECK(info != nullptr);
+      HInstruction* set_value =
+          new (GetGraph()->GetAllocator()) HInstanceFieldSet(repl_create,
+                                                             value.GetInstruction(),
+                                                             field_infos_[loc_off]->GetField(),
+                                                             loc->GetType(),
+                                                             MemberOffset(loc->GetOffset()),
+                                                             /*is_volatile=*/false,
+                                                             field_infos_[loc_off]->GetFieldIndex(),
+                                                             loc->GetDeclaringClassDefIndex(),
+                                                             field_infos_[loc_off]->GetDexFile(),
+                                                             /*dex_pc=*/0u);
+      bb->InsertInstructionAfter(set_value, repl_create);
+      LSE_VLOG << "Adding " << *set_value << " for materialization setup!";
+    }
+  }
+  return repl_create;
+}
+
+HInstruction* LSEVisitor::GetPartialValueAt(HNewInstance* orig_new_inst, HInstruction* read) {
+  size_t loc = heap_location_collector_.GetFieldHeapLocation(orig_new_inst, &read->GetFieldInfo());
+  Value pred = ReplacementOrValue(intermediate_values_.find(read)->second);
+  LSE_VLOG << "using " << pred << " as default value for " << *read;
+  if (pred.IsInstruction()) {
+    return pred.GetInstruction();
+  } else if (pred.IsMergedUnknown() || pred.NeedsPhi()) {
+    FullyMaterializePhi(pred.GetPhiPlaceholder(),
+                        heap_location_collector_.GetHeapLocation(loc)->GetType());
+    HInstruction* res = Replacement(pred).GetInstruction();
+    LSE_VLOG << pred << " materialized to " << res->DumpWithArgs();
+    return res;
+  }
+  LOG(FATAL) << "Unable to find unescaped value at " << read->DumpWithArgs()
+             << "! This should be impossible!";
+  UNREACHABLE();
+}
+
+void LSEVisitor::MovePartialEscapes() {
+  if (!ShouldPerformPartialLSE()) {
+    return;
+  }
+
+  ScopedArenaAllocator saa(allocator_.GetArenaStack());
+  PartialLoadStoreEliminationHelper helper(this, &saa);
+
+  // Since we now have more information for PHIs (we know the object hasn't
+  // escaped), we need to clear the old phi-replacements where we weren't
+  // able to find the value.
+  PrepareForPartialPhiComputation();
+
+  for (PartialLoadStoreEliminationHelper::HeapReferenceData& ref_data : helper.GetHeapRefs()) {
+    LSE_VLOG << "Creating materializations for " << *ref_data.OriginalNewInstance();
+    // Set up entry and exit blocks.
+    for (const auto& excluded_cohort : ref_data.GetNoEscapeSubgraph()->GetExcludedCohorts()) {
+      // Set up materialization blocks.
+      for (HBasicBlock* entry : excluded_cohort.EntryBlocksReversePostOrder()) {
+        // Set up entries.
+        // TODO: Assuming we correctly break critical edges, every entry block
+        // must have only a single predecessor, so we could just put all this
+        // stuff in there. OTOH the simplifier can do it for us, and this is
+        // simpler to implement, giving a clean separation between the
+        // original graph and the materialization blocks, so for now we might
+        // as well have these new blocks.
+        ScopedArenaAllocator pred_alloc(saa.GetArenaStack());
+        ScopedArenaVector<HInstruction*> pred_vals(pred_alloc.Adapter(kArenaAllocLSE));
+        pred_vals.reserve(entry->GetNumberOfPredecessors());
+        for (const auto& [pred, pred_idx] :
+             ZipCount(MakeIterationRange(entry->GetPredecessors()))) {
+          DCHECK(!helper.IsMaterializationBlock(pred));
+          if (excluded_cohort.IsEntryBlock(pred)) {
+            pred_vals.push_back(ref_data.GetMaterialization(pred));
+            continue;
+          } else {
+            pred_vals.push_back(SetupPartialMaterialization(helper, {ref_data}, pred_idx, entry));
+          }
+        }
+        ref_data.GenerateMaterializationValueFromPredecessorsForEntry(entry, pred_vals);
+      }
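+      // For example (hypothetical CFG): for a cohort entry E with
+      // predecessors P0 (itself a cohort entry, so its materialization is
+      // reused) and P1 (outside the cohort), the loop above creates a
+      // materialization block on the P1->E edge holding a cloned
+      // HNewInstance plus HInstanceFieldSets for the tracked heap values,
+      // then merges the predecessor values at E with a phi.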
+
+      // Set up exit block heap-values for later phi-generation.
+      for (HBasicBlock* exit : excluded_cohort.ExitBlocks()) {
+        // Mark every exit of the cohorts as having a value so we can easily
+        // materialize the PHIs.
+        // TODO: By setting this we can easily use the normal
+        // MaterializeLoopPhis (via FullyMaterializePhis) in order to generate
+        // the default values for predicated-gets. This has the unfortunate
+        // side effect of creating somewhat more phis than are really needed
+        // (in some cases). Eventually we should learn to lower these PHIs to
+        // just the non-escaping value where possible. Currently this is done
+        // to some extent in the instruction_simplifier, but we have more
+        // information here to do the right thing.
+        for (const HeapLocation* loc : ref_data.IterateLocations()) {
+          size_t loc_off = heap_location_collector_.GetHeapLocationIndex(loc);
+          // This Value::Default() is only used to fill in PHIs used as the
+          // default value for PredicatedInstanceFieldGets. The actual value
+          // stored there is meaningless since the predicated-iget will use
+          // the actual field value instead on these paths.
+          heap_values_for_[exit->GetBlockId()][loc_off].value = Value::Default();
+        }
+      }
+    }
+
+    // String the materialization through the graph: visit RPO to PHI the
+    // materialized object through the cohort.
+    for (HBasicBlock* blk : GetGraph()->GetReversePostOrder()) {
+      // NB This doesn't include materialization blocks.
+      DCHECK(!helper.IsMaterializationBlock(blk))
+          << "Materialization blocks should not be in RPO yet.";
+      if (ref_data.HasMaterialization(blk)) {
+        continue;
+      } else if (ref_data.BeforeAllEscapes(blk)) {
+        ref_data.AddMaterialization(blk, GetGraph()->GetNullConstant());
+        continue;
+      } else {
+        ref_data.GenerateMaterializationValueFromPredecessors(blk);
+      }
+    }
+  }
+
+  // Once we've generated all the materializations we can update the users.
+  helper.FixupPartialObjectUsers();
+
+  // Actually put the materialization blocks into the graph.
+  helper.InsertMaterializationBlocks();
+
+  // Get rid of the original instructions.
+  helper.RemoveReplacedInstructions();
+
+  // Ensure everything is ordered correctly in the materialization blocks.
+  // This involves moving every NewInstance to the top and ordering them so
+  // that any required env-uses are correctly ordered.
+  helper.ReorderMaterializationsForEnvDominance();
+}
+
+void LSEVisitor::FinishFullLSE() {
   // Remove recorded load instructions that should be eliminated.
   for (const LoadStoreRecord& record : loads_and_stores_) {
     size_t id = dchecked_integral_cast<size_t>(record.load_or_store->GetId());
@@ -2505,7 +3758,7 @@ void LSEVisitor::Run() {
   }
 }
 
-bool LoadStoreElimination::Run() {
+bool LoadStoreElimination::Run(bool enable_partial_lse) {
   if (graph_->IsDebuggable() || graph_->HasTryCatch()) {
     // Debugger may set heap values or trigger deoptimization of callers.
     // Try/catch support not implemented yet.
@@ -2519,7 +3772,11 @@ bool LoadStoreElimination::Run() {
   // O(1) though.
   graph_->ComputeReachabilityInformation();
   ScopedArenaAllocator allocator(graph_->GetArenaStack());
-  LoadStoreAnalysis lsa(graph_, stats_, &allocator, /*for_elimination=*/true);
+  LoadStoreAnalysis lsa(graph_,
+                        stats_,
+                        &allocator,
+                        enable_partial_lse ? LoadStoreAnalysisType::kFull
+                                           : LoadStoreAnalysisType::kNoPredicatedInstructions);
   lsa.Run();
   const HeapLocationCollector& heap_location_collector = lsa.GetHeapLocationCollector();
   if (heap_location_collector.GetNumberOfHeapLocations() == 0) {
@@ -2527,9 +3784,11 @@ bool LoadStoreElimination::Run() {
     return false;
   }
 
-  LSEVisitor lse_visitor(graph_, heap_location_collector, stats_);
+  LSEVisitor lse_visitor(graph_, heap_location_collector, enable_partial_lse, stats_);
   lse_visitor.Run();
   return true;
 }
 
+#undef LSE_VLOG
+
 }  // namespace art
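A minimal usage sketch of the new entry point (a hedged illustration: the
header name, constructor shape, and wrapper function below are assumptions;
only the Run(bool enable_partial_lse) signature comes from this diff):

    #include "load_store_elimination.h"  // Assumed header for the pass.

    namespace art {

    // Hypothetical driver, illustrative only. Passing true selects
    // LoadStoreAnalysisType::kFull, which permits predicated instructions
    // and therefore enables the partial-escape phase described above.
    bool RunLseWithPartialEscapes(HGraph* graph, OptimizingCompilerStats* stats) {
      LoadStoreElimination lse(graph, stats);  // Assumed constructor shape.
      return lse.Run(/*enable_partial_lse=*/true);
    }

    }  // namespace art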