author    | Aart Bik <ajcbik@google.com> | 2017-08-31 09:08:13 -0700
committer | Aart Bik <ajcbik@google.com> | 2017-09-01 10:32:50 -0700
commit    | cfa59b49cde265dc5329a7e6956445f9f7a75f15 (patch)
tree      | eed953f62e796f7e64252520a40d7e77d1f117af /compiler/optimizing
parent    | 82a63734d3067ea0c96f8ba15bc40caaf798c625 (diff)
Basic SIMD reduction support.
Rationale:
Enables vectorization of x += .... for very basic (simple, same-type)
constructs. Paves the way for more complex (narrower and/or mixed-type)
constructs, which will be handled by the next CL.
This is a revert^2 of I7880c135aee3ed0a39da9ae5b468cbf80e613766
and thus a revert of I1c1c87b6323e01442e8fbd94869ddc9e760ea1fc
PS1-2 show what needed to change, with regression tests.
Test: test-art-host test-art-target
Bug: 64091002, 65212948
Change-Id: I2454778dd0ef1da915c178c7274e1cf33e271d0f
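
For context, the construct this enables is an accumulation whose type matches its inputs. A minimal, illustrative example (not taken from this CL; the method name is made up):

```java
// Illustrative only: the "x += ..." same-type reduction this CL targets.
// The int accumulator matches the int array elements, so no narrowing or
// mixed-type handling (deferred to the next CL) is needed.
static int sumInts(int[] a) {
  int x = 0;
  for (int i = 0; i < a.length; i++) {
    x += a[i];  // reduction cycle: phi(x) -> x + a[i] -> back into phi(x)
  }
  return x;
}
```

Conceptually, the vectorized loop seeds a SIMD accumulator as [x, 0, .., 0] (HVecSetScalars), accumulates lane-wise inside the loop body, and at the loop exit folds the lanes together (HVecReduce) and extracts lane 0 back into a scalar register (HVecExtractScalar).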
Diffstat (limited to 'compiler/optimizing')
-rw-r--r-- | compiler/optimizing/code_generator_vector_arm64.cc    | 172
-rw-r--r-- | compiler/optimizing/code_generator_vector_arm_vixl.cc |  28
-rw-r--r-- | compiler/optimizing/code_generator_vector_mips.cc     |  28
-rw-r--r-- | compiler/optimizing/code_generator_vector_mips64.cc   |  28
-rw-r--r-- | compiler/optimizing/code_generator_vector_x86.cc      | 279
-rw-r--r-- | compiler/optimizing/code_generator_vector_x86_64.cc   | 252
-rw-r--r-- | compiler/optimizing/loop_optimization.cc              | 216
-rw-r--r-- | compiler/optimizing/loop_optimization.h               |  30
-rw-r--r-- | compiler/optimizing/nodes.h                           |  14
-rw-r--r-- | compiler/optimizing/nodes_vector.h                    |  97
-rw-r--r-- | compiler/optimizing/nodes_vector_test.cc              |  28
-rw-r--r-- | compiler/optimizing/scheduler_arm64.cc                |  16
-rw-r--r-- | compiler/optimizing/scheduler_arm64.h                 |   5
13 files changed, 1006 insertions, 187 deletions
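
The loop-optimizer changes below detect such reduction phis and synthesize the new HIR nodes; the backends then lower them (ARM64 with ADDV/SMINV/SMAXV/ADDP, x86/x86-64 with phaddd, pminsd/pmaxsd and paddq plus shuffles, while the ARM VIXL, MIPS and MIPS64 backends still leave reductions unimplemented). As a rough scalar sketch of the transformed shape, assuming a 4-lane int sum (hypothetical code, not compiler output):

```java
// Hypothetical sketch of the vectorized shape for a 4-lane int sum. The lane
// array stands in for one 128-bit SIMD register; the real transformation
// operates on HIR nodes (HVecSetScalars, HVecAdd, HVecReduce, HVecExtractScalar).
static int sumIntsVectorShape(int[] a) {
  int n = a.length & ~3;                      // main trip count, multiple of 4
  int[] acc = {0, 0, 0, 0};                   // HVecSetScalars: [initial, 0, 0, 0]
  for (int i = 0; i < n; i += 4) {
    for (int lane = 0; lane < 4; lane++) {
      acc[lane] += a[i + lane];               // HVecAdd on the packed accumulator
    }
  }
  int x = acc[0] + acc[1] + acc[2] + acc[3];  // HVecReduce(kSum) + HVecExtractScalar(0)
  for (int i = n; i < a.length; i++) {
    x += a[i];                                // scalar cleanup loop
  }
  return x;
}
```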
diff --git a/compiler/optimizing/code_generator_vector_arm64.cc b/compiler/optimizing/code_generator_vector_arm64.cc index 9095ecdf16..18a55c8b09 100644 --- a/compiler/optimizing/code_generator_vector_arm64.cc +++ b/compiler/optimizing/code_generator_vector_arm64.cc @@ -27,12 +27,13 @@ namespace arm64 { using helpers::ARM64EncodableConstantOrRegister; using helpers::Arm64CanEncodeConstantAsImmediate; using helpers::DRegisterFrom; -using helpers::VRegisterFrom; using helpers::HeapOperand; using helpers::InputRegisterAt; using helpers::Int64ConstantFrom; -using helpers::XRegisterFrom; +using helpers::OutputRegister; +using helpers::VRegisterFrom; using helpers::WRegisterFrom; +using helpers::XRegisterFrom; #define __ GetVIXLAssembler()-> @@ -127,20 +128,51 @@ void InstructionCodeGeneratorARM64::VisitVecReplicateScalar(HVecReplicateScalar* } } -void LocationsBuilderARM64::VisitVecSetScalars(HVecSetScalars* instruction) { - LOG(FATAL) << "No SIMD for " << instruction->GetId(); -} - -void InstructionCodeGeneratorARM64::VisitVecSetScalars(HVecSetScalars* instruction) { - LOG(FATAL) << "No SIMD for " << instruction->GetId(); -} - -void LocationsBuilderARM64::VisitVecSumReduce(HVecSumReduce* instruction) { - LOG(FATAL) << "No SIMD for " << instruction->GetId(); +void LocationsBuilderARM64::VisitVecExtractScalar(HVecExtractScalar* instruction) { + LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimLong: + locations->SetInAt(0, Location::RequiresFpuRegister()); + locations->SetOut(Location::RequiresRegister()); + break; + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + locations->SetInAt(0, Location::RequiresFpuRegister()); + locations->SetOut(Location::SameAsFirstInput()); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } } -void InstructionCodeGeneratorARM64::VisitVecSumReduce(HVecSumReduce* instruction) { - LOG(FATAL) << "No SIMD for " << instruction->GetId(); +void InstructionCodeGeneratorARM64::VisitVecExtractScalar(HVecExtractScalar* instruction) { + LocationSummary* locations = instruction->GetLocations(); + VRegister src = VRegisterFrom(locations->InAt(0)); + switch (instruction->GetPackedType()) { + case Primitive::kPrimInt: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ Umov(OutputRegister(instruction), src.V4S(), 0); + break; + case Primitive::kPrimLong: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ Umov(OutputRegister(instruction), src.V2D(), 0); + break; + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + DCHECK_LE(2u, instruction->GetVectorLength()); + DCHECK_LE(instruction->GetVectorLength(), 4u); + DCHECK(locations->InAt(0).Equals(locations->Out())); // no code required + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } } // Helper to set up locations for vector unary operations. 
@@ -169,6 +201,46 @@ static void CreateVecUnOpLocations(ArenaAllocator* arena, HVecUnaryOperation* in } } +void LocationsBuilderARM64::VisitVecReduce(HVecReduce* instruction) { + CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARM64::VisitVecReduce(HVecReduce* instruction) { + LocationSummary* locations = instruction->GetLocations(); + VRegister src = VRegisterFrom(locations->InAt(0)); + VRegister dst = DRegisterFrom(locations->Out()); + switch (instruction->GetPackedType()) { + case Primitive::kPrimInt: + DCHECK_EQ(4u, instruction->GetVectorLength()); + switch (instruction->GetKind()) { + case HVecReduce::kSum: + __ Addv(dst.S(), src.V4S()); + break; + case HVecReduce::kMin: + __ Sminv(dst.S(), src.V4S()); + break; + case HVecReduce::kMax: + __ Smaxv(dst.S(), src.V4S()); + break; + } + break; + case Primitive::kPrimLong: + DCHECK_EQ(2u, instruction->GetVectorLength()); + switch (instruction->GetKind()) { + case HVecReduce::kSum: + __ Addp(dst.D(), src.V2D()); + break; + default: + LOG(FATAL) << "Unsupported SIMD min/max"; + UNREACHABLE(); + } + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + void LocationsBuilderARM64::VisitVecCnv(HVecCnv* instruction) { CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); } @@ -263,6 +335,7 @@ void InstructionCodeGeneratorARM64::VisitVecAbs(HVecAbs* instruction) { break; default: LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); } } @@ -805,6 +878,77 @@ void InstructionCodeGeneratorARM64::VisitVecUShr(HVecUShr* instruction) { } } +void LocationsBuilderARM64::VisitVecSetScalars(HVecSetScalars* instruction) { + LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction); + + DCHECK_EQ(1u, instruction->InputCount()); // only one input currently implemented + + HInstruction* input = instruction->InputAt(0); + bool is_zero = IsZeroBitPattern(input); + + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimLong: + locations->SetInAt(0, is_zero ? Location::ConstantLocation(input->AsConstant()) + : Location::RequiresRegister()); + locations->SetOut(Location::RequiresFpuRegister()); + break; + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + locations->SetInAt(0, is_zero ? Location::ConstantLocation(input->AsConstant()) + : Location::RequiresFpuRegister()); + locations->SetOut(Location::RequiresFpuRegister()); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void InstructionCodeGeneratorARM64::VisitVecSetScalars(HVecSetScalars* instruction) { + LocationSummary* locations = instruction->GetLocations(); + VRegister dst = VRegisterFrom(locations->Out()); + + DCHECK_EQ(1u, instruction->InputCount()); // only one input currently implemented + + // Zero out all other elements first. + __ Movi(dst.V16B(), 0); + + // Shorthand for any type of zero. + if (IsZeroBitPattern(instruction->InputAt(0))) { + return; + } + + // Set required elements. 
+ switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + DCHECK_EQ(16u, instruction->GetVectorLength()); + __ Mov(dst.V16B(), 0, InputRegisterAt(instruction, 0)); + break; + case Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ Mov(dst.V8H(), 0, InputRegisterAt(instruction, 0)); + break; + case Primitive::kPrimInt: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ Mov(dst.V4S(), 0, InputRegisterAt(instruction, 0)); + break; + case Primitive::kPrimLong: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ Mov(dst.V2D(), 0, InputRegisterAt(instruction, 0)); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + void LocationsBuilderARM64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) { LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instr); switch (instr->GetPackedType()) { diff --git a/compiler/optimizing/code_generator_vector_arm_vixl.cc b/compiler/optimizing/code_generator_vector_arm_vixl.cc index 527691d9d9..7a11dff41e 100644 --- a/compiler/optimizing/code_generator_vector_arm_vixl.cc +++ b/compiler/optimizing/code_generator_vector_arm_vixl.cc @@ -73,19 +73,11 @@ void InstructionCodeGeneratorARMVIXL::VisitVecReplicateScalar(HVecReplicateScala } } -void LocationsBuilderARMVIXL::VisitVecSetScalars(HVecSetScalars* instruction) { - LOG(FATAL) << "No SIMD for " << instruction->GetId(); -} - -void InstructionCodeGeneratorARMVIXL::VisitVecSetScalars(HVecSetScalars* instruction) { +void LocationsBuilderARMVIXL::VisitVecExtractScalar(HVecExtractScalar* instruction) { LOG(FATAL) << "No SIMD for " << instruction->GetId(); } -void LocationsBuilderARMVIXL::VisitVecSumReduce(HVecSumReduce* instruction) { - LOG(FATAL) << "No SIMD for " << instruction->GetId(); -} - -void InstructionCodeGeneratorARMVIXL::VisitVecSumReduce(HVecSumReduce* instruction) { +void InstructionCodeGeneratorARMVIXL::VisitVecExtractScalar(HVecExtractScalar* instruction) { LOG(FATAL) << "No SIMD for " << instruction->GetId(); } @@ -112,6 +104,14 @@ static void CreateVecUnOpLocations(ArenaAllocator* arena, HVecUnaryOperation* in } } +void LocationsBuilderARMVIXL::VisitVecReduce(HVecReduce* instruction) { + CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARMVIXL::VisitVecReduce(HVecReduce* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + void LocationsBuilderARMVIXL::VisitVecCnv(HVecCnv* instruction) { CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); } @@ -621,6 +621,14 @@ void InstructionCodeGeneratorARMVIXL::VisitVecUShr(HVecUShr* instruction) { } } +void LocationsBuilderARMVIXL::VisitVecSetScalars(HVecSetScalars* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void InstructionCodeGeneratorARMVIXL::VisitVecSetScalars(HVecSetScalars* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + void LocationsBuilderARMVIXL::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) { LOG(FATAL) << "No SIMD for " << instr->GetId(); } diff --git a/compiler/optimizing/code_generator_vector_mips.cc b/compiler/optimizing/code_generator_vector_mips.cc index 6bf28ab1a3..c2fbf7f04b 100644 --- a/compiler/optimizing/code_generator_vector_mips.cc +++ b/compiler/optimizing/code_generator_vector_mips.cc @@ -88,19 +88,11 @@ void InstructionCodeGeneratorMIPS::VisitVecReplicateScalar(HVecReplicateScalar* } } -void 
LocationsBuilderMIPS::VisitVecSetScalars(HVecSetScalars* instruction) { - LOG(FATAL) << "No SIMD for " << instruction->GetId(); -} - -void InstructionCodeGeneratorMIPS::VisitVecSetScalars(HVecSetScalars* instruction) { +void LocationsBuilderMIPS::VisitVecExtractScalar(HVecExtractScalar* instruction) { LOG(FATAL) << "No SIMD for " << instruction->GetId(); } -void LocationsBuilderMIPS::VisitVecSumReduce(HVecSumReduce* instruction) { - LOG(FATAL) << "No SIMD for " << instruction->GetId(); -} - -void InstructionCodeGeneratorMIPS::VisitVecSumReduce(HVecSumReduce* instruction) { +void InstructionCodeGeneratorMIPS::VisitVecExtractScalar(HVecExtractScalar* instruction) { LOG(FATAL) << "No SIMD for " << instruction->GetId(); } @@ -133,6 +125,14 @@ static void CreateVecUnOpLocations(ArenaAllocator* arena, HVecUnaryOperation* in } } +void LocationsBuilderMIPS::VisitVecReduce(HVecReduce* instruction) { + CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorMIPS::VisitVecReduce(HVecReduce* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + void LocationsBuilderMIPS::VisitVecCnv(HVecCnv* instruction) { CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); } @@ -818,6 +818,14 @@ void InstructionCodeGeneratorMIPS::VisitVecUShr(HVecUShr* instruction) { } } +void LocationsBuilderMIPS::VisitVecSetScalars(HVecSetScalars* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void InstructionCodeGeneratorMIPS::VisitVecSetScalars(HVecSetScalars* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + void LocationsBuilderMIPS::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) { LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instr); switch (instr->GetPackedType()) { diff --git a/compiler/optimizing/code_generator_vector_mips64.cc b/compiler/optimizing/code_generator_vector_mips64.cc index 75bf7a7cbb..9d3a777c13 100644 --- a/compiler/optimizing/code_generator_vector_mips64.cc +++ b/compiler/optimizing/code_generator_vector_mips64.cc @@ -91,19 +91,11 @@ void InstructionCodeGeneratorMIPS64::VisitVecReplicateScalar(HVecReplicateScalar } } -void LocationsBuilderMIPS64::VisitVecSetScalars(HVecSetScalars* instruction) { - LOG(FATAL) << "No SIMD for " << instruction->GetId(); -} - -void InstructionCodeGeneratorMIPS64::VisitVecSetScalars(HVecSetScalars* instruction) { +void LocationsBuilderMIPS64::VisitVecExtractScalar(HVecExtractScalar* instruction) { LOG(FATAL) << "No SIMD for " << instruction->GetId(); } -void LocationsBuilderMIPS64::VisitVecSumReduce(HVecSumReduce* instruction) { - LOG(FATAL) << "No SIMD for " << instruction->GetId(); -} - -void InstructionCodeGeneratorMIPS64::VisitVecSumReduce(HVecSumReduce* instruction) { +void InstructionCodeGeneratorMIPS64::VisitVecExtractScalar(HVecExtractScalar* instruction) { LOG(FATAL) << "No SIMD for " << instruction->GetId(); } @@ -136,6 +128,14 @@ static void CreateVecUnOpLocations(ArenaAllocator* arena, HVecUnaryOperation* in } } +void LocationsBuilderMIPS64::VisitVecReduce(HVecReduce* instruction) { + CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorMIPS64::VisitVecReduce(HVecReduce* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + void LocationsBuilderMIPS64::VisitVecCnv(HVecCnv* instruction) { CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); } @@ -822,6 +822,14 @@ void InstructionCodeGeneratorMIPS64::VisitVecUShr(HVecUShr* instruction) 
{ } } +void LocationsBuilderMIPS64::VisitVecSetScalars(HVecSetScalars* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void InstructionCodeGeneratorMIPS64::VisitVecSetScalars(HVecSetScalars* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + void LocationsBuilderMIPS64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) { LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instr); switch (instr->GetPackedType()) { diff --git a/compiler/optimizing/code_generator_vector_x86.cc b/compiler/optimizing/code_generator_vector_x86.cc index e7aec76aff..37190f8363 100644 --- a/compiler/optimizing/code_generator_vector_x86.cc +++ b/compiler/optimizing/code_generator_vector_x86.cc @@ -27,23 +27,31 @@ namespace x86 { void LocationsBuilderX86::VisitVecReplicateScalar(HVecReplicateScalar* instruction) { LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction); + HInstruction* input = instruction->InputAt(0); + bool is_zero = IsZeroBitPattern(input); switch (instruction->GetPackedType()) { case Primitive::kPrimLong: - // Long needs extra temporary to load the register pair. - locations->AddTemp(Location::RequiresFpuRegister()); + // Long needs extra temporary to load from the register pair. + if (!is_zero) { + locations->AddTemp(Location::RequiresFpuRegister()); + } FALLTHROUGH_INTENDED; case Primitive::kPrimBoolean: case Primitive::kPrimByte: case Primitive::kPrimChar: case Primitive::kPrimShort: case Primitive::kPrimInt: - locations->SetInAt(0, Location::RequiresRegister()); + locations->SetInAt(0, is_zero ? Location::ConstantLocation(input->AsConstant()) + : Location::RequiresRegister()); locations->SetOut(Location::RequiresFpuRegister()); break; case Primitive::kPrimFloat: case Primitive::kPrimDouble: - locations->SetInAt(0, Location::RequiresFpuRegister()); - locations->SetOut(Location::SameAsFirstInput()); + locations->SetInAt(0, is_zero ? Location::ConstantLocation(input->AsConstant()) + : Location::RequiresFpuRegister()); + locations->SetOut(is_zero ? Location::RequiresFpuRegister() + : Location::SameAsFirstInput()); + break; default: LOG(FATAL) << "Unsupported SIMD type"; @@ -53,46 +61,53 @@ void LocationsBuilderX86::VisitVecReplicateScalar(HVecReplicateScalar* instructi void InstructionCodeGeneratorX86::VisitVecReplicateScalar(HVecReplicateScalar* instruction) { LocationSummary* locations = instruction->GetLocations(); - XmmRegister reg = locations->Out().AsFpuRegister<XmmRegister>(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + + // Shorthand for any type of zero. 
+ if (IsZeroBitPattern(instruction->InputAt(0))) { + __ xorps(dst, dst); + return; + } + switch (instruction->GetPackedType()) { case Primitive::kPrimBoolean: case Primitive::kPrimByte: DCHECK_EQ(16u, instruction->GetVectorLength()); - __ movd(reg, locations->InAt(0).AsRegister<Register>()); - __ punpcklbw(reg, reg); - __ punpcklwd(reg, reg); - __ pshufd(reg, reg, Immediate(0)); + __ movd(dst, locations->InAt(0).AsRegister<Register>()); + __ punpcklbw(dst, dst); + __ punpcklwd(dst, dst); + __ pshufd(dst, dst, Immediate(0)); break; case Primitive::kPrimChar: case Primitive::kPrimShort: DCHECK_EQ(8u, instruction->GetVectorLength()); - __ movd(reg, locations->InAt(0).AsRegister<Register>()); - __ punpcklwd(reg, reg); - __ pshufd(reg, reg, Immediate(0)); + __ movd(dst, locations->InAt(0).AsRegister<Register>()); + __ punpcklwd(dst, dst); + __ pshufd(dst, dst, Immediate(0)); break; case Primitive::kPrimInt: DCHECK_EQ(4u, instruction->GetVectorLength()); - __ movd(reg, locations->InAt(0).AsRegister<Register>()); - __ pshufd(reg, reg, Immediate(0)); + __ movd(dst, locations->InAt(0).AsRegister<Register>()); + __ pshufd(dst, dst, Immediate(0)); break; case Primitive::kPrimLong: { XmmRegister tmp = locations->GetTemp(0).AsFpuRegister<XmmRegister>(); DCHECK_EQ(2u, instruction->GetVectorLength()); - __ movd(reg, locations->InAt(0).AsRegisterPairLow<Register>()); + __ movd(dst, locations->InAt(0).AsRegisterPairLow<Register>()); __ movd(tmp, locations->InAt(0).AsRegisterPairHigh<Register>()); - __ punpckldq(reg, tmp); - __ punpcklqdq(reg, reg); + __ punpckldq(dst, tmp); + __ punpcklqdq(dst, dst); break; } case Primitive::kPrimFloat: DCHECK(locations->InAt(0).Equals(locations->Out())); DCHECK_EQ(4u, instruction->GetVectorLength()); - __ shufps(reg, reg, Immediate(0)); + __ shufps(dst, dst, Immediate(0)); break; case Primitive::kPrimDouble: DCHECK(locations->InAt(0).Equals(locations->Out())); DCHECK_EQ(2u, instruction->GetVectorLength()); - __ shufpd(reg, reg, Immediate(0)); + __ shufpd(dst, dst, Immediate(0)); break; default: LOG(FATAL) << "Unsupported SIMD type"; @@ -100,20 +115,65 @@ void InstructionCodeGeneratorX86::VisitVecReplicateScalar(HVecReplicateScalar* i } } -void LocationsBuilderX86::VisitVecSetScalars(HVecSetScalars* instruction) { - LOG(FATAL) << "No SIMD for " << instruction->GetId(); -} - -void InstructionCodeGeneratorX86::VisitVecSetScalars(HVecSetScalars* instruction) { - LOG(FATAL) << "No SIMD for " << instruction->GetId(); -} - -void LocationsBuilderX86::VisitVecSumReduce(HVecSumReduce* instruction) { - LOG(FATAL) << "No SIMD for " << instruction->GetId(); +void LocationsBuilderX86::VisitVecExtractScalar(HVecExtractScalar* instruction) { + LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction); + switch (instruction->GetPackedType()) { + case Primitive::kPrimLong: + // Long needs extra temporary to store into the register pair. 
+ locations->AddTemp(Location::RequiresFpuRegister()); + FALLTHROUGH_INTENDED; + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + locations->SetInAt(0, Location::RequiresFpuRegister()); + locations->SetOut(Location::RequiresRegister()); + break; + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + locations->SetInAt(0, Location::RequiresFpuRegister()); + locations->SetOut(Location::SameAsFirstInput()); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } } -void InstructionCodeGeneratorX86::VisitVecSumReduce(HVecSumReduce* instruction) { - LOG(FATAL) << "No SIMD for " << instruction->GetId(); +void InstructionCodeGeneratorX86::VisitVecExtractScalar(HVecExtractScalar* instruction) { + LocationSummary* locations = instruction->GetLocations(); + XmmRegister src = locations->InAt(0).AsFpuRegister<XmmRegister>(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: // TODO: up to here, and? + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + case Primitive::kPrimInt: + DCHECK_LE(4u, instruction->GetVectorLength()); + DCHECK_LE(instruction->GetVectorLength(), 16u); + __ movd(locations->Out().AsRegister<Register>(), src); + break; + case Primitive::kPrimLong: { + XmmRegister tmp = locations->GetTemp(0).AsFpuRegister<XmmRegister>(); + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ movd(locations->Out().AsRegisterPairLow<Register>(), src); + __ pshufd(tmp, src, Immediate(1)); + __ movd(locations->Out().AsRegisterPairHigh<Register>(), tmp); + break; + } + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + DCHECK_LE(2u, instruction->GetVectorLength()); + DCHECK_LE(instruction->GetVectorLength(), 4u); + DCHECK(locations->InAt(0).Equals(locations->Out())); // no code required + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } } // Helper to set up locations for vector unary operations. @@ -137,6 +197,73 @@ static void CreateVecUnOpLocations(ArenaAllocator* arena, HVecUnaryOperation* in } } +void LocationsBuilderX86::VisitVecReduce(HVecReduce* instruction) { + CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); + // Long reduction or min/max require a temporary. 
+ if (instruction->GetPackedType() == Primitive::kPrimLong || + instruction->GetKind() == HVecReduce::kMin || + instruction->GetKind() == HVecReduce::kMax) { + instruction->GetLocations()->AddTemp(Location::RequiresFpuRegister()); + } +} + +void InstructionCodeGeneratorX86::VisitVecReduce(HVecReduce* instruction) { + LocationSummary* locations = instruction->GetLocations(); + XmmRegister src = locations->InAt(0).AsFpuRegister<XmmRegister>(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimInt: + DCHECK_EQ(4u, instruction->GetVectorLength()); + switch (instruction->GetKind()) { + case HVecReduce::kSum: + __ movaps(dst, src); + __ phaddd(dst, dst); + __ phaddd(dst, dst); + break; + case HVecReduce::kMin: { + XmmRegister tmp = locations->GetTemp(0).AsFpuRegister<XmmRegister>(); + __ movaps(tmp, src); + __ movaps(dst, src); + __ psrldq(tmp, Immediate(8)); + __ pminsd(dst, tmp); + __ psrldq(tmp, Immediate(4)); + __ pminsd(dst, tmp); + break; + } + case HVecReduce::kMax: { + XmmRegister tmp = locations->GetTemp(0).AsFpuRegister<XmmRegister>(); + __ movaps(tmp, src); + __ movaps(dst, src); + __ psrldq(tmp, Immediate(8)); + __ pmaxsd(dst, tmp); + __ psrldq(tmp, Immediate(4)); + __ pmaxsd(dst, tmp); + break; + } + } + break; + case Primitive::kPrimLong: { + DCHECK_EQ(2u, instruction->GetVectorLength()); + XmmRegister tmp = locations->GetTemp(0).AsFpuRegister<XmmRegister>(); + switch (instruction->GetKind()) { + case HVecReduce::kSum: + __ movaps(tmp, src); + __ movaps(dst, src); + __ punpckhqdq(tmp, tmp); + __ paddq(dst, tmp); + break; + case HVecReduce::kMin: + case HVecReduce::kMax: + LOG(FATAL) << "Unsupported SIMD type"; + } + break; + } + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + void LocationsBuilderX86::VisitVecCnv(HVecCnv* instruction) { CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); } @@ -821,6 +948,91 @@ void InstructionCodeGeneratorX86::VisitVecUShr(HVecUShr* instruction) { } } +void LocationsBuilderX86::VisitVecSetScalars(HVecSetScalars* instruction) { + LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction); + + DCHECK_EQ(1u, instruction->InputCount()); // only one input currently implemented + + HInstruction* input = instruction->InputAt(0); + bool is_zero = IsZeroBitPattern(input); + + switch (instruction->GetPackedType()) { + case Primitive::kPrimLong: + // Long needs extra temporary to load from register pairs. + if (!is_zero) { + locations->AddTemp(Location::RequiresFpuRegister()); + } + FALLTHROUGH_INTENDED; + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + locations->SetInAt(0, is_zero ? Location::ConstantLocation(input->AsConstant()) + : Location::RequiresRegister()); + locations->SetOut(Location::RequiresFpuRegister()); + break; + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + locations->SetInAt(0, is_zero ? 
Location::ConstantLocation(input->AsConstant()) + : Location::RequiresFpuRegister()); + locations->SetOut(Location::RequiresFpuRegister()); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void InstructionCodeGeneratorX86::VisitVecSetScalars(HVecSetScalars* instruction) { + LocationSummary* locations = instruction->GetLocations(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + + DCHECK_EQ(1u, instruction->InputCount()); // only one input currently implemented + + // Zero out all other elements first. + __ xorps(dst, dst); + + // Shorthand for any type of zero. + if (IsZeroBitPattern(instruction->InputAt(0))) { + return; + } + + // Set required elements. + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: // TODO: up to here, and? + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + case Primitive::kPrimInt: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ movd(dst, locations->InAt(0).AsRegister<Register>()); + break; + case Primitive::kPrimLong: { + XmmRegister tmp = locations->GetTemp(0).AsFpuRegister<XmmRegister>(); + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ xorps(tmp, tmp); + __ movd(dst, locations->InAt(0).AsRegisterPairLow<Register>()); + __ movd(tmp, locations->InAt(0).AsRegisterPairHigh<Register>()); + __ punpckldq(dst, tmp); + break; + } + case Primitive::kPrimFloat: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ movss(dst, locations->InAt(1).AsFpuRegister<XmmRegister>()); + break; + case Primitive::kPrimDouble: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ movsd(dst, locations->InAt(1).AsFpuRegister<XmmRegister>()); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + void LocationsBuilderX86::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) { LOG(FATAL) << "No SIMD for " << instr->GetId(); } @@ -868,6 +1080,7 @@ static Address VecAddress(LocationSummary* locations, size_t size, bool is_strin case 8: scale = TIMES_8; break; default: break; } + // Incorporate the string or array offset in the address computation. uint32_t offset = is_string_char_at ? mirror::String::ValueOffset().Uint32Value() : mirror::Array::DataOffset(size).Uint32Value(); @@ -902,7 +1115,7 @@ void InstructionCodeGeneratorX86::VisitVecLoad(HVecLoad* instruction) { __ testb(Address(locations->InAt(0).AsRegister<Register>(), count_offset), Immediate(1)); __ j(kNotZero, ¬_compressed); // Zero extend 8 compressed bytes into 8 chars. 
- __ movsd(reg, VecAddress(locations, 1, /*is_string_char_at*/ true)); + __ movsd(reg, VecAddress(locations, 1, instruction->IsStringCharAt())); __ pxor(tmp, tmp); __ punpcklbw(reg, tmp); __ jmp(&done); diff --git a/compiler/optimizing/code_generator_vector_x86_64.cc b/compiler/optimizing/code_generator_vector_x86_64.cc index c7ee81c60d..7051ba041f 100644 --- a/compiler/optimizing/code_generator_vector_x86_64.cc +++ b/compiler/optimizing/code_generator_vector_x86_64.cc @@ -27,6 +27,8 @@ namespace x86_64 { void LocationsBuilderX86_64::VisitVecReplicateScalar(HVecReplicateScalar* instruction) { LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction); + HInstruction* input = instruction->InputAt(0); + bool is_zero = IsZeroBitPattern(input); switch (instruction->GetPackedType()) { case Primitive::kPrimBoolean: case Primitive::kPrimByte: @@ -34,13 +36,16 @@ void LocationsBuilderX86_64::VisitVecReplicateScalar(HVecReplicateScalar* instru case Primitive::kPrimShort: case Primitive::kPrimInt: case Primitive::kPrimLong: - locations->SetInAt(0, Location::RequiresRegister()); + locations->SetInAt(0, is_zero ? Location::ConstantLocation(input->AsConstant()) + : Location::RequiresRegister()); locations->SetOut(Location::RequiresFpuRegister()); break; case Primitive::kPrimFloat: case Primitive::kPrimDouble: - locations->SetInAt(0, Location::RequiresFpuRegister()); - locations->SetOut(Location::SameAsFirstInput()); + locations->SetInAt(0, is_zero ? Location::ConstantLocation(input->AsConstant()) + : Location::RequiresFpuRegister()); + locations->SetOut(is_zero ? Location::RequiresFpuRegister() + : Location::SameAsFirstInput()); break; default: LOG(FATAL) << "Unsupported SIMD type"; @@ -50,42 +55,49 @@ void LocationsBuilderX86_64::VisitVecReplicateScalar(HVecReplicateScalar* instru void InstructionCodeGeneratorX86_64::VisitVecReplicateScalar(HVecReplicateScalar* instruction) { LocationSummary* locations = instruction->GetLocations(); - XmmRegister reg = locations->Out().AsFpuRegister<XmmRegister>(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + + // Shorthand for any type of zero. 
+ if (IsZeroBitPattern(instruction->InputAt(0))) { + __ xorps(dst, dst); + return; + } + switch (instruction->GetPackedType()) { case Primitive::kPrimBoolean: case Primitive::kPrimByte: DCHECK_EQ(16u, instruction->GetVectorLength()); - __ movd(reg, locations->InAt(0).AsRegister<CpuRegister>()); - __ punpcklbw(reg, reg); - __ punpcklwd(reg, reg); - __ pshufd(reg, reg, Immediate(0)); + __ movd(dst, locations->InAt(0).AsRegister<CpuRegister>()); + __ punpcklbw(dst, dst); + __ punpcklwd(dst, dst); + __ pshufd(dst, dst, Immediate(0)); break; case Primitive::kPrimChar: case Primitive::kPrimShort: DCHECK_EQ(8u, instruction->GetVectorLength()); - __ movd(reg, locations->InAt(0).AsRegister<CpuRegister>()); - __ punpcklwd(reg, reg); - __ pshufd(reg, reg, Immediate(0)); + __ movd(dst, locations->InAt(0).AsRegister<CpuRegister>()); + __ punpcklwd(dst, dst); + __ pshufd(dst, dst, Immediate(0)); break; case Primitive::kPrimInt: DCHECK_EQ(4u, instruction->GetVectorLength()); - __ movd(reg, locations->InAt(0).AsRegister<CpuRegister>()); - __ pshufd(reg, reg, Immediate(0)); + __ movd(dst, locations->InAt(0).AsRegister<CpuRegister>()); + __ pshufd(dst, dst, Immediate(0)); break; case Primitive::kPrimLong: DCHECK_EQ(2u, instruction->GetVectorLength()); - __ movd(reg, locations->InAt(0).AsRegister<CpuRegister>()); // is 64-bit - __ punpcklqdq(reg, reg); + __ movd(dst, locations->InAt(0).AsRegister<CpuRegister>()); // is 64-bit + __ punpcklqdq(dst, dst); break; case Primitive::kPrimFloat: DCHECK(locations->InAt(0).Equals(locations->Out())); DCHECK_EQ(4u, instruction->GetVectorLength()); - __ shufps(reg, reg, Immediate(0)); + __ shufps(dst, dst, Immediate(0)); break; case Primitive::kPrimDouble: DCHECK(locations->InAt(0).Equals(locations->Out())); DCHECK_EQ(2u, instruction->GetVectorLength()); - __ shufpd(reg, reg, Immediate(0)); + __ shufpd(dst, dst, Immediate(0)); break; default: LOG(FATAL) << "Unsupported SIMD type"; @@ -93,20 +105,57 @@ void InstructionCodeGeneratorX86_64::VisitVecReplicateScalar(HVecReplicateScalar } } -void LocationsBuilderX86_64::VisitVecSetScalars(HVecSetScalars* instruction) { - LOG(FATAL) << "No SIMD for " << instruction->GetId(); -} - -void InstructionCodeGeneratorX86_64::VisitVecSetScalars(HVecSetScalars* instruction) { - LOG(FATAL) << "No SIMD for " << instruction->GetId(); -} - -void LocationsBuilderX86_64::VisitVecSumReduce(HVecSumReduce* instruction) { - LOG(FATAL) << "No SIMD for " << instruction->GetId(); +void LocationsBuilderX86_64::VisitVecExtractScalar(HVecExtractScalar* instruction) { + LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimLong: + locations->SetInAt(0, Location::RequiresFpuRegister()); + locations->SetOut(Location::RequiresRegister()); + break; + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + locations->SetInAt(0, Location::RequiresFpuRegister()); + locations->SetOut(Location::SameAsFirstInput()); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } } -void InstructionCodeGeneratorX86_64::VisitVecSumReduce(HVecSumReduce* instruction) { - LOG(FATAL) << "No SIMD for " << instruction->GetId(); +void InstructionCodeGeneratorX86_64::VisitVecExtractScalar(HVecExtractScalar* instruction) { + LocationSummary* locations = instruction->GetLocations(); + XmmRegister src = 
locations->InAt(0).AsFpuRegister<XmmRegister>(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: // TODO: up to here, and? + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + case Primitive::kPrimInt: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ movd(locations->Out().AsRegister<CpuRegister>(), src); + break; + case Primitive::kPrimLong: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ movd(locations->Out().AsRegister<CpuRegister>(), src); // is 64-bit + break; + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + DCHECK_LE(2u, instruction->GetVectorLength()); + DCHECK_LE(instruction->GetVectorLength(), 4u); + DCHECK(locations->InAt(0).Equals(locations->Out())); // no code required + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } } // Helper to set up locations for vector unary operations. @@ -130,6 +179,73 @@ static void CreateVecUnOpLocations(ArenaAllocator* arena, HVecUnaryOperation* in } } +void LocationsBuilderX86_64::VisitVecReduce(HVecReduce* instruction) { + CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); + // Long reduction or min/max require a temporary. + if (instruction->GetPackedType() == Primitive::kPrimLong || + instruction->GetKind() == HVecReduce::kMin || + instruction->GetKind() == HVecReduce::kMax) { + instruction->GetLocations()->AddTemp(Location::RequiresFpuRegister()); + } +} + +void InstructionCodeGeneratorX86_64::VisitVecReduce(HVecReduce* instruction) { + LocationSummary* locations = instruction->GetLocations(); + XmmRegister src = locations->InAt(0).AsFpuRegister<XmmRegister>(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimInt: + DCHECK_EQ(4u, instruction->GetVectorLength()); + switch (instruction->GetKind()) { + case HVecReduce::kSum: + __ movaps(dst, src); + __ phaddd(dst, dst); + __ phaddd(dst, dst); + break; + case HVecReduce::kMin: { + XmmRegister tmp = locations->GetTemp(0).AsFpuRegister<XmmRegister>(); + __ movaps(tmp, src); + __ movaps(dst, src); + __ psrldq(tmp, Immediate(8)); + __ pminsd(dst, tmp); + __ psrldq(tmp, Immediate(4)); + __ pminsd(dst, tmp); + break; + } + case HVecReduce::kMax: { + XmmRegister tmp = locations->GetTemp(0).AsFpuRegister<XmmRegister>(); + __ movaps(tmp, src); + __ movaps(dst, src); + __ psrldq(tmp, Immediate(8)); + __ pmaxsd(dst, tmp); + __ psrldq(tmp, Immediate(4)); + __ pmaxsd(dst, tmp); + break; + } + } + break; + case Primitive::kPrimLong: { + DCHECK_EQ(2u, instruction->GetVectorLength()); + XmmRegister tmp = locations->GetTemp(0).AsFpuRegister<XmmRegister>(); + switch (instruction->GetKind()) { + case HVecReduce::kSum: + __ movaps(tmp, src); + __ movaps(dst, src); + __ punpckhqdq(tmp, tmp); + __ paddq(dst, tmp); + break; + case HVecReduce::kMin: + case HVecReduce::kMax: + LOG(FATAL) << "Unsupported SIMD type"; + } + break; + } + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + void LocationsBuilderX86_64::VisitVecCnv(HVecCnv* instruction) { CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); } @@ -814,6 +930,81 @@ void InstructionCodeGeneratorX86_64::VisitVecUShr(HVecUShr* instruction) { } } +void LocationsBuilderX86_64::VisitVecSetScalars(HVecSetScalars* instruction) { + LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction); + + DCHECK_EQ(1u, instruction->InputCount()); // only one 
input currently implemented + + HInstruction* input = instruction->InputAt(0); + bool is_zero = IsZeroBitPattern(input); + + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimLong: + locations->SetInAt(0, is_zero ? Location::ConstantLocation(input->AsConstant()) + : Location::RequiresRegister()); + locations->SetOut(Location::RequiresFpuRegister()); + break; + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + locations->SetInAt(0, is_zero ? Location::ConstantLocation(input->AsConstant()) + : Location::RequiresFpuRegister()); + locations->SetOut(Location::RequiresFpuRegister()); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void InstructionCodeGeneratorX86_64::VisitVecSetScalars(HVecSetScalars* instruction) { + LocationSummary* locations = instruction->GetLocations(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + + DCHECK_EQ(1u, instruction->InputCount()); // only one input currently implemented + + // Zero out all other elements first. + __ xorps(dst, dst); + + // Shorthand for any type of zero. + if (IsZeroBitPattern(instruction->InputAt(0))) { + return; + } + + // Set required elements. + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: // TODO: up to here, and? + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + case Primitive::kPrimInt: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ movd(dst, locations->InAt(0).AsRegister<CpuRegister>()); + break; + case Primitive::kPrimLong: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ movd(dst, locations->InAt(0).AsRegister<CpuRegister>()); // is 64-bit + break; + case Primitive::kPrimFloat: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ movss(dst, locations->InAt(0).AsFpuRegister<XmmRegister>()); + break; + case Primitive::kPrimDouble: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ movsd(dst, locations->InAt(0).AsFpuRegister<XmmRegister>()); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + void LocationsBuilderX86_64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) { LOG(FATAL) << "No SIMD for " << instr->GetId(); } @@ -861,6 +1052,7 @@ static Address VecAddress(LocationSummary* locations, size_t size, bool is_strin case 8: scale = TIMES_8; break; default: break; } + // Incorporate the string or array offset in the address computation. uint32_t offset = is_string_char_at ? mirror::String::ValueOffset().Uint32Value() : mirror::Array::DataOffset(size).Uint32Value(); @@ -895,7 +1087,7 @@ void InstructionCodeGeneratorX86_64::VisitVecLoad(HVecLoad* instruction) { __ testb(Address(locations->InAt(0).AsRegister<CpuRegister>(), count_offset), Immediate(1)); __ j(kNotZero, ¬_compressed); // Zero extend 8 compressed bytes into 8 chars. 
- __ movsd(reg, VecAddress(locations, 1, /*is_string_char_at*/ true)); + __ movsd(reg, VecAddress(locations, 1, instruction->IsStringCharAt())); __ pxor(tmp, tmp); __ punpcklbw(reg, tmp); __ jmp(&done); diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc index 027ba7741c..f8f4eb2ae3 100644 --- a/compiler/optimizing/loop_optimization.cc +++ b/compiler/optimizing/loop_optimization.cc @@ -285,6 +285,19 @@ static bool HasReductionFormat(HInstruction* reduction, HInstruction* phi) { return false; } +// Translates operation to reduction kind. +static HVecReduce::ReductionKind GetReductionKind(HInstruction* reduction) { + if (reduction->IsVecAdd() || reduction->IsVecSub()) { + return HVecReduce::kSum; + } else if (reduction->IsVecMin()) { + return HVecReduce::kMin; + } else if (reduction->IsVecMax()) { + return HVecReduce::kMax; + } + LOG(FATAL) << "Unsupported SIMD reduction"; + UNREACHABLE(); +} + // Test vector restrictions. static bool HasVectorRestrictions(uint64_t restrictions, uint64_t tested) { return (restrictions & tested) != 0; @@ -334,7 +347,8 @@ HLoopOptimization::HLoopOptimization(HGraph* graph, vector_peeling_candidate_(nullptr), vector_runtime_test_a_(nullptr), vector_runtime_test_b_(nullptr), - vector_map_(nullptr) { + vector_map_(nullptr), + vector_permanent_map_(nullptr) { } void HLoopOptimization::Run() { @@ -388,11 +402,14 @@ void HLoopOptimization::LocalRun() { ArenaSet<ArrayReference> refs(loop_allocator_->Adapter(kArenaAllocLoopOptimization)); ArenaSafeMap<HInstruction*, HInstruction*> map( std::less<HInstruction*>(), loop_allocator_->Adapter(kArenaAllocLoopOptimization)); + ArenaSafeMap<HInstruction*, HInstruction*> perm( + std::less<HInstruction*>(), loop_allocator_->Adapter(kArenaAllocLoopOptimization)); // Attach. iset_ = &iset; reductions_ = &reds; vector_refs_ = &refs; vector_map_ = ↦ + vector_permanent_map_ = &perm; // Traverse. TraverseLoopsInnerToOuter(top_loop_); // Detach. @@ -400,6 +417,7 @@ void HLoopOptimization::LocalRun() { reductions_ = nullptr; vector_refs_ = nullptr; vector_map_ = nullptr; + vector_permanent_map_ = nullptr; } } @@ -603,7 +621,6 @@ bool HLoopOptimization::OptimizeInnerLoop(LoopNode* node) { // Vectorize loop, if possible and valid. if (kEnableVectorization && TrySetSimpleLoopHeader(header, &main_phi) && - reductions_->empty() && // TODO: possible with some effort ShouldVectorize(node, body, trip_count) && TryAssignLastValue(node->loop_info, main_phi, preheader, /*collect_loop_uses*/ true)) { Vectorize(node, body, exit, trip_count); @@ -802,6 +819,13 @@ void HLoopOptimization::Vectorize(LoopNode* node, /*unroll*/ 1); } + // Link reductions to their final uses. + for (auto i = reductions_->begin(); i != reductions_->end(); ++i) { + if (i->first->IsPhi()) { + i->first->ReplaceWith(ReduceAndExtractIfNeeded(i->second)); + } + } + // Remove the original loop by disconnecting the body block // and removing all instructions from the header. block->DisconnectAndDelete(); @@ -841,21 +865,10 @@ void HLoopOptimization::GenerateNewLoop(LoopNode* node, vector_header_->AddInstruction(cond); vector_header_->AddInstruction(new (global_allocator_) HIf(cond)); vector_index_ = phi; + vector_permanent_map_->clear(); // preserved over unrolling for (uint32_t u = 0; u < unroll; u++) { - // Clear map, leaving loop invariants setup during unrolling. 
- if (u == 0) { - vector_map_->clear(); - } else { - for (auto i = vector_map_->begin(); i != vector_map_->end(); ) { - if (i->second->IsVecReplicateScalar()) { - DCHECK(node->loop_info->IsDefinedOutOfTheLoop(i->first)); - ++i; - } else { - i = vector_map_->erase(i); - } - } - } // Generate instruction map. + vector_map_->clear(); for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) { bool vectorized_def = VectorizeDef(node, it.Current(), /*generate_code*/ true); DCHECK(vectorized_def); @@ -872,9 +885,17 @@ void HLoopOptimization::GenerateNewLoop(LoopNode* node, } } } + // Generate the induction. vector_index_ = new (global_allocator_) HAdd(induc_type, vector_index_, step); Insert(vector_body_, vector_index_); } + // Finalize phi inputs for the reductions (if any). + for (auto i = reductions_->begin(); i != reductions_->end(); ++i) { + if (!i->first->IsPhi()) { + DCHECK(i->second->IsPhi()); + GenerateVecReductionPhiInputs(i->second->AsPhi(), i->first); + } + } // Finalize phi inputs for the loop index. phi->AddInput(lo); phi->AddInput(vector_index_); @@ -910,6 +931,23 @@ bool HLoopOptimization::VectorizeDef(LoopNode* node, } return false; } + // Accept a left-hand-side reduction for + // (1) supported vector type, + // (2) vectorizable right-hand-side value. + auto redit = reductions_->find(instruction); + if (redit != reductions_->end()) { + Primitive::Type type = instruction->GetType(); + if (TrySetVectorType(type, &restrictions) && + VectorizeUse(node, instruction, generate_code, type, restrictions)) { + if (generate_code) { + HInstruction* new_red = vector_map_->Get(instruction); + vector_permanent_map_->Put(new_red, vector_map_->Get(redit->second)); + vector_permanent_map_->Overwrite(redit->second, new_red); + } + return true; + } + return false; + } // Branch back okay. if (instruction->IsGoto()) { return true; @@ -965,6 +1003,21 @@ bool HLoopOptimization::VectorizeUse(LoopNode* node, } return true; } + } else if (instruction->IsPhi()) { + // Accept particular phi operations. + if (reductions_->find(instruction) != reductions_->end()) { + // Deal with vector restrictions. + if (HasVectorRestrictions(restrictions, kNoReduction)) { + return false; + } + // Accept a reduction. + if (generate_code) { + GenerateVecReductionPhi(instruction->AsPhi()); + } + return true; + } + // TODO: accept right-hand-side induction? + return false; } else if (instruction->IsTypeConversion()) { // Accept particular type conversions. 
HTypeConversion* conversion = instruction->AsTypeConversion(); @@ -1155,14 +1208,14 @@ bool HLoopOptimization::TrySetVectorType(Primitive::Type type, uint64_t* restric switch (type) { case Primitive::kPrimBoolean: case Primitive::kPrimByte: - *restrictions |= kNoDiv; + *restrictions |= kNoDiv | kNoReduction; return TrySetVectorLength(8); case Primitive::kPrimChar: case Primitive::kPrimShort: - *restrictions |= kNoDiv | kNoStringCharAt; + *restrictions |= kNoDiv | kNoStringCharAt | kNoReduction; return TrySetVectorLength(4); case Primitive::kPrimInt: - *restrictions |= kNoDiv; + *restrictions |= kNoDiv | kNoReduction; return TrySetVectorLength(2); default: break; @@ -1174,11 +1227,11 @@ bool HLoopOptimization::TrySetVectorType(Primitive::Type type, uint64_t* restric switch (type) { case Primitive::kPrimBoolean: case Primitive::kPrimByte: - *restrictions |= kNoDiv; + *restrictions |= kNoDiv | kNoReduction; return TrySetVectorLength(16); case Primitive::kPrimChar: case Primitive::kPrimShort: - *restrictions |= kNoDiv; + *restrictions |= kNoDiv | kNoReduction; return TrySetVectorLength(8); case Primitive::kPrimInt: *restrictions |= kNoDiv; @@ -1187,8 +1240,10 @@ bool HLoopOptimization::TrySetVectorType(Primitive::Type type, uint64_t* restric *restrictions |= kNoDiv | kNoMul | kNoMinMax; return TrySetVectorLength(2); case Primitive::kPrimFloat: + *restrictions |= kNoReduction; return TrySetVectorLength(4); case Primitive::kPrimDouble: + *restrictions |= kNoReduction; return TrySetVectorLength(2); default: return false; @@ -1200,11 +1255,12 @@ bool HLoopOptimization::TrySetVectorType(Primitive::Type type, uint64_t* restric switch (type) { case Primitive::kPrimBoolean: case Primitive::kPrimByte: - *restrictions |= kNoMul | kNoDiv | kNoShift | kNoAbs | kNoSignedHAdd | kNoUnroundedHAdd; + *restrictions |= + kNoMul | kNoDiv | kNoShift | kNoAbs | kNoSignedHAdd | kNoUnroundedHAdd | kNoReduction; return TrySetVectorLength(16); case Primitive::kPrimChar: case Primitive::kPrimShort: - *restrictions |= kNoDiv | kNoAbs | kNoSignedHAdd | kNoUnroundedHAdd; + *restrictions |= kNoDiv | kNoAbs | kNoSignedHAdd | kNoUnroundedHAdd | kNoReduction; return TrySetVectorLength(8); case Primitive::kPrimInt: *restrictions |= kNoDiv; @@ -1213,10 +1269,10 @@ bool HLoopOptimization::TrySetVectorType(Primitive::Type type, uint64_t* restric *restrictions |= kNoMul | kNoDiv | kNoShr | kNoAbs | kNoMinMax; return TrySetVectorLength(2); case Primitive::kPrimFloat: - *restrictions |= kNoMinMax; // -0.0 vs +0.0 + *restrictions |= kNoMinMax | kNoReduction; // minmax: -0.0 vs +0.0 return TrySetVectorLength(4); case Primitive::kPrimDouble: - *restrictions |= kNoMinMax; // -0.0 vs +0.0 + *restrictions |= kNoMinMax | kNoReduction; // minmax: -0.0 vs +0.0 return TrySetVectorLength(2); default: break; @@ -1228,23 +1284,23 @@ bool HLoopOptimization::TrySetVectorType(Primitive::Type type, uint64_t* restric switch (type) { case Primitive::kPrimBoolean: case Primitive::kPrimByte: - *restrictions |= kNoDiv; + *restrictions |= kNoDiv | kNoReduction; return TrySetVectorLength(16); case Primitive::kPrimChar: case Primitive::kPrimShort: - *restrictions |= kNoDiv | kNoStringCharAt; + *restrictions |= kNoDiv | kNoStringCharAt | kNoReduction; return TrySetVectorLength(8); case Primitive::kPrimInt: - *restrictions |= kNoDiv; + *restrictions |= kNoDiv | kNoReduction; return TrySetVectorLength(4); case Primitive::kPrimLong: - *restrictions |= kNoDiv; + *restrictions |= kNoDiv | kNoReduction; return TrySetVectorLength(2); case Primitive::kPrimFloat: - 
*restrictions |= kNoMinMax; // min/max(x, NaN) + *restrictions |= kNoMinMax | kNoReduction; // min/max(x, NaN) return TrySetVectorLength(4); case Primitive::kPrimDouble: - *restrictions |= kNoMinMax; // min/max(x, NaN) + *restrictions |= kNoMinMax | kNoReduction; // min/max(x, NaN) return TrySetVectorLength(2); default: break; @@ -1256,23 +1312,23 @@ bool HLoopOptimization::TrySetVectorType(Primitive::Type type, uint64_t* restric switch (type) { case Primitive::kPrimBoolean: case Primitive::kPrimByte: - *restrictions |= kNoDiv; + *restrictions |= kNoDiv | kNoReduction; return TrySetVectorLength(16); case Primitive::kPrimChar: case Primitive::kPrimShort: - *restrictions |= kNoDiv | kNoStringCharAt; + *restrictions |= kNoDiv | kNoStringCharAt | kNoReduction; return TrySetVectorLength(8); case Primitive::kPrimInt: - *restrictions |= kNoDiv; + *restrictions |= kNoDiv | kNoReduction; return TrySetVectorLength(4); case Primitive::kPrimLong: - *restrictions |= kNoDiv; + *restrictions |= kNoDiv | kNoReduction; return TrySetVectorLength(2); case Primitive::kPrimFloat: - *restrictions |= kNoMinMax; // min/max(x, NaN) + *restrictions |= kNoMinMax | kNoReduction; // min/max(x, NaN) return TrySetVectorLength(4); case Primitive::kPrimDouble: - *restrictions |= kNoMinMax; // min/max(x, NaN) + *restrictions |= kNoMinMax | kNoReduction; // min/max(x, NaN) return TrySetVectorLength(2); default: break; @@ -1305,9 +1361,16 @@ void HLoopOptimization::GenerateVecInv(HInstruction* org, Primitive::Type type) return; } // In vector code, explicit scalar expansion is needed. - HInstruction* vector = new (global_allocator_) HVecReplicateScalar( - global_allocator_, org, type, vector_length_); - vector_map_->Put(org, Insert(vector_preheader_, vector)); + HInstruction* vector = nullptr; + auto it = vector_permanent_map_->find(org); + if (it != vector_permanent_map_->end()) { + vector = it->second; // reuse during unrolling + } else { + vector = new (global_allocator_) HVecReplicateScalar( + global_allocator_, org, type, vector_length_); + vector_permanent_map_->Put(org, Insert(vector_preheader_, vector)); + } + vector_map_->Put(org, vector); } } @@ -1362,6 +1425,78 @@ void HLoopOptimization::GenerateVecMem(HInstruction* org, vector_map_->Put(org, vector); } +void HLoopOptimization::GenerateVecReductionPhi(HPhi* phi) { + DCHECK(reductions_->find(phi) != reductions_->end()); + DCHECK(reductions_->Get(phi->InputAt(1)) == phi); + HInstruction* vector = nullptr; + if (vector_mode_ == kSequential) { + HPhi* new_phi = new (global_allocator_) HPhi( + global_allocator_, kNoRegNumber, 0, phi->GetType()); + vector_header_->AddPhi(new_phi); + vector = new_phi; + } else { + // Link vector reduction back to prior unrolled update, or a first phi. + auto it = vector_permanent_map_->find(phi); + if (it != vector_permanent_map_->end()) { + vector = it->second; + } else { + HPhi* new_phi = new (global_allocator_) HPhi( + global_allocator_, kNoRegNumber, 0, HVecOperation::kSIMDType); + vector_header_->AddPhi(new_phi); + vector = new_phi; + } + } + vector_map_->Put(phi, vector); +} + +void HLoopOptimization::GenerateVecReductionPhiInputs(HPhi* phi, HInstruction* reduction) { + HInstruction* new_phi = vector_map_->Get(phi); + HInstruction* new_init = reductions_->Get(phi); + HInstruction* new_red = vector_map_->Get(reduction); + // Link unrolled vector loop back to new phi. + for (; !new_phi->IsPhi(); new_phi = vector_permanent_map_->Get(new_phi)) { + DCHECK(new_phi->IsVecOperation()); + } + // Prepare the new initialization. 
+ if (vector_mode_ == kVector) { + // Generate a [initial, 0, .., 0] vector. + new_init = Insert( + vector_preheader_, + new (global_allocator_) HVecSetScalars( + global_allocator_, &new_init, phi->GetType(), vector_length_, 1)); + } else { + new_init = ReduceAndExtractIfNeeded(new_init); + } + // Set the phi inputs. + DCHECK(new_phi->IsPhi()); + new_phi->AsPhi()->AddInput(new_init); + new_phi->AsPhi()->AddInput(new_red); + // New feed value for next phi (safe mutation in iteration). + reductions_->find(phi)->second = new_phi; +} + +HInstruction* HLoopOptimization::ReduceAndExtractIfNeeded(HInstruction* instruction) { + if (instruction->IsPhi()) { + HInstruction* input = instruction->InputAt(1); + if (input->IsVecOperation()) { + Primitive::Type type = input->AsVecOperation()->GetPackedType(); + HBasicBlock* exit = instruction->GetBlock()->GetSuccessors()[0]; + // Generate a vector reduction and scalar extract + // x = REDUCE( [x_1, .., x_n] ) + // y = x_1 + // along the exit of the defining loop. + HVecReduce::ReductionKind kind = GetReductionKind(input); + HInstruction* reduce = new (global_allocator_) HVecReduce( + global_allocator_, instruction, type, vector_length_, kind); + exit->InsertInstructionBefore(reduce, exit->GetFirstInstruction()); + instruction = new (global_allocator_) HVecExtractScalar( + global_allocator_, reduce, type, vector_length_, 0); + exit->InsertInstructionAfter(instruction, reduce); + } + } + return instruction; +} + #define GENERATE_VEC(x, y) \ if (vector_mode_ == kVector) { \ vector = (x); \ @@ -1542,10 +1677,9 @@ bool HLoopOptimization::VectorizeHalvingAddIdiom(LoopNode* node, // Test for top level arithmetic shift right x >> 1 or logical shift right x >>> 1 // (note whether the sign bit in wider precision is shifted in has no effect // on the narrow precision computed by the idiom). - int64_t distance = 0; if ((instruction->IsShr() || instruction->IsUShr()) && - IsInt64AndGet(instruction->InputAt(1), /*out*/ &distance) && distance == 1) { + IsInt64Value(instruction->InputAt(1), 1)) { // Test for (a + b + c) >> 1 for optional constant c. HInstruction* a = nullptr; HInstruction* b = nullptr; diff --git a/compiler/optimizing/loop_optimization.h b/compiler/optimizing/loop_optimization.h index 49be8a3fb4..ba9126c5f6 100644 --- a/compiler/optimizing/loop_optimization.h +++ b/compiler/optimizing/loop_optimization.h @@ -62,17 +62,18 @@ class HLoopOptimization : public HOptimization { * Vectorization restrictions (bit mask). 
*/ enum VectorRestrictions { - kNone = 0, // no restrictions - kNoMul = 1, // no multiplication - kNoDiv = 2, // no division - kNoShift = 4, // no shift - kNoShr = 8, // no arithmetic shift right - kNoHiBits = 16, // "wider" operations cannot bring in higher order bits - kNoSignedHAdd = 32, // no signed halving add - kNoUnroundedHAdd = 64, // no unrounded halving add - kNoAbs = 128, // no absolute value - kNoMinMax = 256, // no min/max - kNoStringCharAt = 512, // no StringCharAt + kNone = 0, // no restrictions + kNoMul = 1 << 0, // no multiplication + kNoDiv = 1 << 1, // no division + kNoShift = 1 << 2, // no shift + kNoShr = 1 << 3, // no arithmetic shift right + kNoHiBits = 1 << 4, // "wider" operations cannot bring in higher order bits + kNoSignedHAdd = 1 << 5, // no signed halving add + kNoUnroundedHAdd = 1 << 6, // no unrounded halving add + kNoAbs = 1 << 7, // no absolute value + kNoMinMax = 1 << 8, // no min/max + kNoStringCharAt = 1 << 9, // no StringCharAt + kNoReduction = 1 << 10, // no reduction }; /* @@ -155,6 +156,9 @@ class HLoopOptimization : public HOptimization { HInstruction* opb, HInstruction* offset, Primitive::Type type); + void GenerateVecReductionPhi(HPhi* phi); + void GenerateVecReductionPhiInputs(HPhi* phi, HInstruction* reduction); + HInstruction* ReduceAndExtractIfNeeded(HInstruction* instruction); void GenerateVecOp(HInstruction* org, HInstruction* opa, HInstruction* opb, @@ -253,6 +257,10 @@ class HLoopOptimization : public HOptimization { // Contents reside in phase-local heap memory. ArenaSafeMap<HInstruction*, HInstruction*>* vector_map_; + // Permanent mapping used during vectorization synthesis. + // Contents reside in phase-local heap memory. + ArenaSafeMap<HInstruction*, HInstruction*>* vector_permanent_map_; + // Temporary vectorization bookkeeping. VectorMode vector_mode_; // synthesis mode HBasicBlock* vector_preheader_; // preheader of the new loop diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h index f60d532c37..869fdd4182 100644 --- a/compiler/optimizing/nodes.h +++ b/compiler/optimizing/nodes.h @@ -1374,7 +1374,8 @@ class HLoopInformationOutwardIterator : public ValueObject { M(UShr, BinaryOperation) \ M(Xor, BinaryOperation) \ M(VecReplicateScalar, VecUnaryOperation) \ - M(VecSumReduce, VecUnaryOperation) \ + M(VecExtractScalar, VecUnaryOperation) \ + M(VecReduce, VecUnaryOperation) \ M(VecCnv, VecUnaryOperation) \ M(VecNeg, VecUnaryOperation) \ M(VecAbs, VecUnaryOperation) \ @@ -7030,6 +7031,17 @@ inline bool IsInt64AndGet(HInstruction* instruction, /*out*/ int64_t* value) { return false; } +// Returns true iff instruction is the given integral constant. +inline bool IsInt64Value(HInstruction* instruction, int64_t value) { + int64_t val = 0; + return IsInt64AndGet(instruction, &val) && val == value; +} + +// Returns true iff instruction is a zero bit pattern. +inline bool IsZeroBitPattern(HInstruction* instruction) { + return instruction->IsConstant() && instruction->AsConstant()->IsZeroBitPattern(); +} + #define INSTRUCTION_TYPE_CHECK(type, super) \ inline bool HInstruction::Is##type() const { return GetKind() == k##type; } \ inline const H##type* HInstruction::As##type() const { \ diff --git a/compiler/optimizing/nodes_vector.h b/compiler/optimizing/nodes_vector.h index 6261171a00..886d75e5c7 100644 --- a/compiler/optimizing/nodes_vector.h +++ b/compiler/optimizing/nodes_vector.h @@ -63,6 +63,10 @@ class Alignment { // GetVectorLength() x GetPackedType() operations simultaneously. 
 class HVecOperation : public HVariableInputSizeInstruction {
  public:
+  // A SIMD operation looks like a FPU location.
+  // TODO: we could introduce SIMD types in HIR.
+  static constexpr Primitive::Type kSIMDType = Primitive::kPrimDouble;
+
   HVecOperation(ArenaAllocator* arena,
                 Primitive::Type packed_type,
                 SideEffects side_effects,
@@ -89,10 +93,9 @@ class HVecOperation : public HVariableInputSizeInstruction {
     return vector_length_ * Primitive::ComponentSize(GetPackedType());
   }
 
-  // Returns the type of the vector operation: a SIMD operation looks like a FPU location.
-  // TODO: we could introduce SIMD types in HIR.
+  // Returns the type of the vector operation.
   Primitive::Type GetType() const OVERRIDE {
-    return Primitive::kPrimDouble;
+    return kSIMDType;
   }
 
   // Returns the true component type packed in a vector.
@@ -220,8 +223,11 @@ class HVecMemoryOperation : public HVecOperation {
   DISALLOW_COPY_AND_ASSIGN(HVecMemoryOperation);
 };
 
-// Packed type consistency checker (same vector length integral types may mix freely).
+// Packed type consistency checker ("same vector length" integral types may mix freely).
 inline static bool HasConsistentPackedTypes(HInstruction* input, Primitive::Type type) {
+  if (input->IsPhi()) {
+    return input->GetType() == HVecOperation::kSIMDType;  // carries SIMD
+  }
   DCHECK(input->IsVecOperation());
   Primitive::Type input_type = input->AsVecOperation()->GetPackedType();
   switch (input_type) {
@@ -265,27 +271,77 @@ class HVecReplicateScalar FINAL : public HVecUnaryOperation {
   DISALLOW_COPY_AND_ASSIGN(HVecReplicateScalar);
 };
 
-// Sum-reduces the given vector into a shorter vector (m < n) or scalar (m = 1),
-// viz. sum-reduce[ x1, .. , xn ] = [ y1, .., ym ], where yi = sum_j x_j.
-class HVecSumReduce FINAL : public HVecUnaryOperation {
-  HVecSumReduce(ArenaAllocator* arena,
-                HInstruction* input,
-                Primitive::Type packed_type,
-                size_t vector_length,
-                uint32_t dex_pc = kNoDexPc)
+// Extracts a particular scalar from the given vector,
+// viz. extract[ x1, .. , xn ] = x_i.
+//
+// TODO: for now only i == 1 case supported.
+class HVecExtractScalar FINAL : public HVecUnaryOperation {
+ public:
+  HVecExtractScalar(ArenaAllocator* arena,
+                    HInstruction* input,
+                    Primitive::Type packed_type,
+                    size_t vector_length,
+                    size_t index,
+                    uint32_t dex_pc = kNoDexPc)
       : HVecUnaryOperation(arena, input, packed_type, vector_length, dex_pc) {
     DCHECK(HasConsistentPackedTypes(input, packed_type));
+    DCHECK_LT(index, vector_length);
+    DCHECK_EQ(index, 0u);
+  }
+
+  // Yields a single component in the vector.
+  Primitive::Type GetType() const OVERRIDE {
+    return GetPackedType();
+  }
+
+  // An extract needs to stay in place, since SIMD registers are not
+  // kept alive across vector loop boundaries (yet).
+  bool CanBeMoved() const OVERRIDE { return false; }
+
+  DECLARE_INSTRUCTION(VecExtractScalar);
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(HVecExtractScalar);
+};
+
+// Reduces the given vector into the first element as sum/min/max,
+// viz. sum-reduce[ x1, .. , xn ] = [ y, ---- ], where y = sum xi
+// and the "-" denotes "don't care" (implementation dependent).
+class HVecReduce FINAL : public HVecUnaryOperation {
+ public:
+  enum ReductionKind {
+    kSum = 1,
+    kMin = 2,
+    kMax = 3
+  };
+
+  HVecReduce(ArenaAllocator* arena,
+             HInstruction* input,
+             Primitive::Type packed_type,
+             size_t vector_length,
+             ReductionKind kind,
+             uint32_t dex_pc = kNoDexPc)
+      : HVecUnaryOperation(arena, input, packed_type, vector_length, dex_pc),
+        kind_(kind) {
+    DCHECK(HasConsistentPackedTypes(input, packed_type));
   }
 
-  // TODO: probably integral promotion
-  Primitive::Type GetType() const OVERRIDE { return GetPackedType(); }
+  ReductionKind GetKind() const { return kind_; }
 
   bool CanBeMoved() const OVERRIDE { return true; }
 
-  DECLARE_INSTRUCTION(VecSumReduce);
+  bool InstructionDataEquals(const HInstruction* other) const OVERRIDE {
+    DCHECK(other->IsVecReduce());
+    const HVecReduce* o = other->AsVecReduce();
+    return HVecOperation::InstructionDataEquals(o) && GetKind() == o->GetKind();
+  }
+
+  DECLARE_INSTRUCTION(VecReduce);
 
  private:
-  DISALLOW_COPY_AND_ASSIGN(HVecSumReduce);
+  const ReductionKind kind_;
+
+  DISALLOW_COPY_AND_ASSIGN(HVecReduce);
 };
 
 // Converts every component in the vector,
@@ -754,20 +810,23 @@ class HVecUShr FINAL : public HVecBinaryOperation {
 
 //
 // Assigns the given scalar elements to a vector,
-// viz. set( array(x1, .., xn) ) = [ x1, .. , xn ].
+// viz. set( array(x1, .., xn) ) = [ x1, .. , xn ] if n == m,
+//      set( array(x1, .., xm) ) = [ x1, .. , xm, 0, .., 0 ] if m < n.
 class HVecSetScalars FINAL : public HVecOperation {
+ public:
   HVecSetScalars(ArenaAllocator* arena,
                  HInstruction** scalars,  // array
                  Primitive::Type packed_type,
                  size_t vector_length,
+                 size_t number_of_scalars,
                  uint32_t dex_pc = kNoDexPc)
       : HVecOperation(arena,
                       packed_type,
                       SideEffects::None(),
-                      /* number_of_inputs */ vector_length,
+                      number_of_scalars,
                       vector_length,
                       dex_pc) {
-    for (size_t i = 0; i < vector_length; i++) {
+    for (size_t i = 0; i < number_of_scalars; i++) {
       DCHECK(!scalars[i]->IsVecOperation());
       SetRawInputAt(0, scalars[i]);
     }
diff --git a/compiler/optimizing/nodes_vector_test.cc b/compiler/optimizing/nodes_vector_test.cc
index 0238ea4602..5a56a2c210 100644
--- a/compiler/optimizing/nodes_vector_test.cc
+++ b/compiler/optimizing/nodes_vector_test.cc
@@ -332,4 +332,32 @@ TEST_F(NodesVectorTest, VectorOperationMattersOnMultiplyAccumulate) {
   EXPECT_FALSE(v1->Equals(v3));  // different vector lengths
 }
 
+TEST_F(NodesVectorTest, VectorKindMattersOnReduce) {
+  HVecOperation* v0 = new (&allocator_)
+      HVecReplicateScalar(&allocator_, parameter_, Primitive::kPrimInt, 4);
+
+  HVecReduce* v1 = new (&allocator_) HVecReduce(
+      &allocator_, v0, Primitive::kPrimInt, 4, HVecReduce::kSum);
+  HVecReduce* v2 = new (&allocator_) HVecReduce(
+      &allocator_, v0, Primitive::kPrimInt, 4, HVecReduce::kMin);
+  HVecReduce* v3 = new (&allocator_) HVecReduce(
+      &allocator_, v0, Primitive::kPrimInt, 4, HVecReduce::kMax);
+
+  EXPECT_FALSE(v0->CanBeMoved());
+  EXPECT_TRUE(v1->CanBeMoved());
+  EXPECT_TRUE(v2->CanBeMoved());
+  EXPECT_TRUE(v3->CanBeMoved());
+
+  EXPECT_EQ(HVecReduce::kSum, v1->GetKind());
+  EXPECT_EQ(HVecReduce::kMin, v2->GetKind());
+  EXPECT_EQ(HVecReduce::kMax, v3->GetKind());
+
+  EXPECT_TRUE(v1->Equals(v1));
+  EXPECT_TRUE(v2->Equals(v2));
+  EXPECT_TRUE(v3->Equals(v3));
+
+  EXPECT_FALSE(v1->Equals(v2));  // different kinds
+  EXPECT_FALSE(v1->Equals(v3));
+}
+
 }  // namespace art
diff --git a/compiler/optimizing/scheduler_arm64.cc b/compiler/optimizing/scheduler_arm64.cc
index 510619faf9..1d9d28ab24 100644
--- a/compiler/optimizing/scheduler_arm64.cc
+++ b/compiler/optimizing/scheduler_arm64.cc
@@ -215,12 +215,12 @@ void SchedulingLatencyVisitorARM64::VisitVecReplicateScalar(
   last_visited_latency_ = kArm64SIMDReplicateOpLatency;
 }
 
-void SchedulingLatencyVisitorARM64::VisitVecSetScalars(HVecSetScalars* instr) {
-  LOG(FATAL) << "Unsupported SIMD instruction " << instr->GetId();
+void SchedulingLatencyVisitorARM64::VisitVecExtractScalar(HVecExtractScalar* instr) {
+  HandleSimpleArithmeticSIMD(instr);
 }
 
-void SchedulingLatencyVisitorARM64::VisitVecSumReduce(HVecSumReduce* instr) {
-  LOG(FATAL) << "Unsupported SIMD instruction " << instr->GetId();
+void SchedulingLatencyVisitorARM64::VisitVecReduce(HVecReduce* instr) {
+  HandleSimpleArithmeticSIMD(instr);
 }
 
 void SchedulingLatencyVisitorARM64::VisitVecCnv(HVecCnv* instr ATTRIBUTE_UNUSED) {
@@ -283,8 +283,8 @@ void SchedulingLatencyVisitorARM64::VisitVecAnd(HVecAnd* instr ATTRIBUTE_UNUSED)
   last_visited_latency_ = kArm64SIMDIntegerOpLatency;
 }
 
-void SchedulingLatencyVisitorARM64::VisitVecAndNot(HVecAndNot* instr) {
-  LOG(FATAL) << "Unsupported SIMD instruction " << instr->GetId();
+void SchedulingLatencyVisitorARM64::VisitVecAndNot(HVecAndNot* instr ATTRIBUTE_UNUSED) {
+  last_visited_latency_ = kArm64SIMDIntegerOpLatency;
 }
 
 void SchedulingLatencyVisitorARM64::VisitVecOr(HVecOr* instr ATTRIBUTE_UNUSED) {
@@ -307,6 +307,10 @@ void SchedulingLatencyVisitorARM64::VisitVecUShr(HVecUShr* instr) {
   HandleSimpleArithmeticSIMD(instr);
 }
 
+void SchedulingLatencyVisitorARM64::VisitVecSetScalars(HVecSetScalars* instr) {
+  HandleSimpleArithmeticSIMD(instr);
+}
+
 void SchedulingLatencyVisitorARM64::VisitVecMultiplyAccumulate(
     HVecMultiplyAccumulate* instr ATTRIBUTE_UNUSED) {
   last_visited_latency_ = kArm64SIMDMulIntegerLatency;
diff --git a/compiler/optimizing/scheduler_arm64.h b/compiler/optimizing/scheduler_arm64.h
index 63d5b7d6b6..e1a80ec6fb 100644
--- a/compiler/optimizing/scheduler_arm64.h
+++ b/compiler/optimizing/scheduler_arm64.h
@@ -83,8 +83,8 @@ class SchedulingLatencyVisitorARM64 : public SchedulingLatencyVisitor {
   M(SuspendCheck       , unused)                   \
   M(TypeConversion     , unused)                   \
   M(VecReplicateScalar , unused)                   \
-  M(VecSetScalars      , unused)                   \
-  M(VecSumReduce       , unused)                   \
+  M(VecExtractScalar   , unused)                   \
+  M(VecReduce          , unused)                   \
   M(VecCnv             , unused)                   \
   M(VecNeg             , unused)                   \
   M(VecAbs             , unused)                   \
@@ -103,6 +103,7 @@ class SchedulingLatencyVisitorARM64 : public SchedulingLatencyVisitor {
   M(VecShl             , unused)                   \
   M(VecShr             , unused)                   \
   M(VecUShr            , unused)                   \
+  M(VecSetScalars      , unused)                   \
   M(VecMultiplyAccumulate, unused)                 \
   M(VecLoad            , unused)                   \
   M(VecStore           , unused)
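
Illustrative note (not part of the CL): the snippet below sketches how the new nodes introduced above are meant to compose for a simple integer sum reduction such as for (int i = 0; i < n; ++i) s += a[i];, using the constructor signatures from nodes_vector.h. The names allocator, init and vacc_phi, and the vector length of 4, are hypothetical placeholders for state the vectorizer already tracks; the actual synthesis is done by GenerateVecReductionPhiInputs and ReduceAndExtractIfNeeded shown earlier.

// Sketch only: composing the new HIR nodes for an int sum reduction (4 lanes).
// 'allocator', 'init' (scalar initial value) and 'vacc_phi' (loop-carried
// vector accumulator phi) are assumed to be provided by the vectorizer.

// Preheader: seed the vector accumulator with [init, 0, 0, 0];
// 'seed' becomes the preheader input of 'vacc_phi'.
HInstruction* inputs[1] = { init };
HVecSetScalars* seed = new (allocator) HVecSetScalars(
    allocator, inputs, Primitive::kPrimInt, /*vector_length*/ 4, /*number_of_scalars*/ 1);

// Loop body (not shown): vacc_phi accumulates a packed add of a[i .. i+3].

// Loop exit: fold all lanes into lane 0, then extract lane 0 as a plain int.
HVecReduce* reduce = new (allocator) HVecReduce(
    allocator, vacc_phi, Primitive::kPrimInt, /*vector_length*/ 4, HVecReduce::kSum);
HVecExtractScalar* sum = new (allocator) HVecExtractScalar(
    allocator, reduce, Primitive::kPrimInt, /*vector_length*/ 4, /*index*/ 0);

Because SIMD registers are not kept alive across vector loop boundaries yet, the reduce/extract pair is placed in the loop exit block and HVecExtractScalar reports CanBeMoved() as false, as seen in the node definitions above.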