diff options
author | Aart Bik <ajcbik@google.com> | 2017-08-31 09:08:13 -0700 |
---|---|---|
committer | Aart Bik <ajcbik@google.com> | 2017-09-01 10:32:50 -0700 |
commit | cfa59b49cde265dc5329a7e6956445f9f7a75f15 (patch) | |
tree | eed953f62e796f7e64252520a40d7e77d1f117af /compiler/optimizing/code_generator_vector_x86.cc | |
parent | 82a63734d3067ea0c96f8ba15bc40caaf798c625 (diff) |
Basic SIMD reduction support.
Rationale:
Enables vectorization of reductions of the form x += ... for very basic
(simple, same-type) constructs. Paves the way for more complex (narrower
and/or mixed-type) constructs, which will be handled by the next CL.
This is a revert^2 (i.e. a re-land) of I7880c135aee3ed0a39da9ae5b468cbf80e613766,
and thus a revert of I1c1c87b6323e01442e8fbd94869ddc9e760ea1fc.
PS1-2 (the delta between patch sets 1 and 2) shows what needed to change, together with regression tests.
Test: test-art-host test-art-target
Bug: 64091002, 65212948
Change-Id: I2454778dd0ef1da915c178c7274e1cf33e271d0f
Diffstat (limited to 'compiler/optimizing/code_generator_vector_x86.cc')
-rw-r--r-- | compiler/optimizing/code_generator_vector_x86.cc | 279 |
1 files changed, 246 insertions, 33 deletions
diff --git a/compiler/optimizing/code_generator_vector_x86.cc b/compiler/optimizing/code_generator_vector_x86.cc index e7aec76aff..37190f8363 100644 --- a/compiler/optimizing/code_generator_vector_x86.cc +++ b/compiler/optimizing/code_generator_vector_x86.cc @@ -27,23 +27,31 @@ namespace x86 { void LocationsBuilderX86::VisitVecReplicateScalar(HVecReplicateScalar* instruction) { LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction); + HInstruction* input = instruction->InputAt(0); + bool is_zero = IsZeroBitPattern(input); switch (instruction->GetPackedType()) { case Primitive::kPrimLong: - // Long needs extra temporary to load the register pair. - locations->AddTemp(Location::RequiresFpuRegister()); + // Long needs extra temporary to load from the register pair. + if (!is_zero) { + locations->AddTemp(Location::RequiresFpuRegister()); + } FALLTHROUGH_INTENDED; case Primitive::kPrimBoolean: case Primitive::kPrimByte: case Primitive::kPrimChar: case Primitive::kPrimShort: case Primitive::kPrimInt: - locations->SetInAt(0, Location::RequiresRegister()); + locations->SetInAt(0, is_zero ? Location::ConstantLocation(input->AsConstant()) + : Location::RequiresRegister()); locations->SetOut(Location::RequiresFpuRegister()); break; case Primitive::kPrimFloat: case Primitive::kPrimDouble: - locations->SetInAt(0, Location::RequiresFpuRegister()); - locations->SetOut(Location::SameAsFirstInput()); + locations->SetInAt(0, is_zero ? Location::ConstantLocation(input->AsConstant()) + : Location::RequiresFpuRegister()); + locations->SetOut(is_zero ? 
Location::RequiresFpuRegister() + : Location::SameAsFirstInput()); + break; default: LOG(FATAL) << "Unsupported SIMD type"; @@ -53,46 +61,53 @@ void LocationsBuilderX86::VisitVecReplicateScalar(HVecReplicateScalar* instructi void InstructionCodeGeneratorX86::VisitVecReplicateScalar(HVecReplicateScalar* instruction) { LocationSummary* locations = instruction->GetLocations(); - XmmRegister reg = locations->Out().AsFpuRegister<XmmRegister>(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + + // Shorthand for any type of zero. + if (IsZeroBitPattern(instruction->InputAt(0))) { + __ xorps(dst, dst); + return; + } + switch (instruction->GetPackedType()) { case Primitive::kPrimBoolean: case Primitive::kPrimByte: DCHECK_EQ(16u, instruction->GetVectorLength()); - __ movd(reg, locations->InAt(0).AsRegister<Register>()); - __ punpcklbw(reg, reg); - __ punpcklwd(reg, reg); - __ pshufd(reg, reg, Immediate(0)); + __ movd(dst, locations->InAt(0).AsRegister<Register>()); + __ punpcklbw(dst, dst); + __ punpcklwd(dst, dst); + __ pshufd(dst, dst, Immediate(0)); break; case Primitive::kPrimChar: case Primitive::kPrimShort: DCHECK_EQ(8u, instruction->GetVectorLength()); - __ movd(reg, locations->InAt(0).AsRegister<Register>()); - __ punpcklwd(reg, reg); - __ pshufd(reg, reg, Immediate(0)); + __ movd(dst, locations->InAt(0).AsRegister<Register>()); + __ punpcklwd(dst, dst); + __ pshufd(dst, dst, Immediate(0)); break; case Primitive::kPrimInt: DCHECK_EQ(4u, instruction->GetVectorLength()); - __ movd(reg, locations->InAt(0).AsRegister<Register>()); - __ pshufd(reg, reg, Immediate(0)); + __ movd(dst, locations->InAt(0).AsRegister<Register>()); + __ pshufd(dst, dst, Immediate(0)); break; case Primitive::kPrimLong: { XmmRegister tmp = locations->GetTemp(0).AsFpuRegister<XmmRegister>(); DCHECK_EQ(2u, instruction->GetVectorLength()); - __ movd(reg, locations->InAt(0).AsRegisterPairLow<Register>()); + __ movd(dst, locations->InAt(0).AsRegisterPairLow<Register>()); __ 
movd(tmp, locations->InAt(0).AsRegisterPairHigh<Register>()); - __ punpckldq(reg, tmp); - __ punpcklqdq(reg, reg); + __ punpckldq(dst, tmp); + __ punpcklqdq(dst, dst); break; } case Primitive::kPrimFloat: DCHECK(locations->InAt(0).Equals(locations->Out())); DCHECK_EQ(4u, instruction->GetVectorLength()); - __ shufps(reg, reg, Immediate(0)); + __ shufps(dst, dst, Immediate(0)); break; case Primitive::kPrimDouble: DCHECK(locations->InAt(0).Equals(locations->Out())); DCHECK_EQ(2u, instruction->GetVectorLength()); - __ shufpd(reg, reg, Immediate(0)); + __ shufpd(dst, dst, Immediate(0)); break; default: LOG(FATAL) << "Unsupported SIMD type"; @@ -100,20 +115,65 @@ void InstructionCodeGeneratorX86::VisitVecReplicateScalar(HVecReplicateScalar* i } } -void LocationsBuilderX86::VisitVecSetScalars(HVecSetScalars* instruction) { - LOG(FATAL) << "No SIMD for " << instruction->GetId(); -} - -void InstructionCodeGeneratorX86::VisitVecSetScalars(HVecSetScalars* instruction) { - LOG(FATAL) << "No SIMD for " << instruction->GetId(); -} - -void LocationsBuilderX86::VisitVecSumReduce(HVecSumReduce* instruction) { - LOG(FATAL) << "No SIMD for " << instruction->GetId(); +void LocationsBuilderX86::VisitVecExtractScalar(HVecExtractScalar* instruction) { + LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction); + switch (instruction->GetPackedType()) { + case Primitive::kPrimLong: + // Long needs extra temporary to store into the register pair. 
+ locations->AddTemp(Location::RequiresFpuRegister()); + FALLTHROUGH_INTENDED; + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + locations->SetInAt(0, Location::RequiresFpuRegister()); + locations->SetOut(Location::RequiresRegister()); + break; + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + locations->SetInAt(0, Location::RequiresFpuRegister()); + locations->SetOut(Location::SameAsFirstInput()); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } } -void InstructionCodeGeneratorX86::VisitVecSumReduce(HVecSumReduce* instruction) { - LOG(FATAL) << "No SIMD for " << instruction->GetId(); +void InstructionCodeGeneratorX86::VisitVecExtractScalar(HVecExtractScalar* instruction) { + LocationSummary* locations = instruction->GetLocations(); + XmmRegister src = locations->InAt(0).AsFpuRegister<XmmRegister>(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: // TODO: up to here, and? 
+ LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + case Primitive::kPrimInt: + DCHECK_LE(4u, instruction->GetVectorLength()); + DCHECK_LE(instruction->GetVectorLength(), 16u); + __ movd(locations->Out().AsRegister<Register>(), src); + break; + case Primitive::kPrimLong: { + XmmRegister tmp = locations->GetTemp(0).AsFpuRegister<XmmRegister>(); + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ movd(locations->Out().AsRegisterPairLow<Register>(), src); + __ pshufd(tmp, src, Immediate(1)); + __ movd(locations->Out().AsRegisterPairHigh<Register>(), tmp); + break; + } + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + DCHECK_LE(2u, instruction->GetVectorLength()); + DCHECK_LE(instruction->GetVectorLength(), 4u); + DCHECK(locations->InAt(0).Equals(locations->Out())); // no code required + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } } // Helper to set up locations for vector unary operations. @@ -137,6 +197,73 @@ static void CreateVecUnOpLocations(ArenaAllocator* arena, HVecUnaryOperation* in } } +void LocationsBuilderX86::VisitVecReduce(HVecReduce* instruction) { + CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); + // Long reduction or min/max require a temporary. 
+ if (instruction->GetPackedType() == Primitive::kPrimLong || + instruction->GetKind() == HVecReduce::kMin || + instruction->GetKind() == HVecReduce::kMax) { + instruction->GetLocations()->AddTemp(Location::RequiresFpuRegister()); + } +} + +void InstructionCodeGeneratorX86::VisitVecReduce(HVecReduce* instruction) { + LocationSummary* locations = instruction->GetLocations(); + XmmRegister src = locations->InAt(0).AsFpuRegister<XmmRegister>(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimInt: + DCHECK_EQ(4u, instruction->GetVectorLength()); + switch (instruction->GetKind()) { + case HVecReduce::kSum: + __ movaps(dst, src); + __ phaddd(dst, dst); + __ phaddd(dst, dst); + break; + case HVecReduce::kMin: { + XmmRegister tmp = locations->GetTemp(0).AsFpuRegister<XmmRegister>(); + __ movaps(tmp, src); + __ movaps(dst, src); + __ psrldq(tmp, Immediate(8)); + __ pminsd(dst, tmp); + __ psrldq(tmp, Immediate(4)); + __ pminsd(dst, tmp); + break; + } + case HVecReduce::kMax: { + XmmRegister tmp = locations->GetTemp(0).AsFpuRegister<XmmRegister>(); + __ movaps(tmp, src); + __ movaps(dst, src); + __ psrldq(tmp, Immediate(8)); + __ pmaxsd(dst, tmp); + __ psrldq(tmp, Immediate(4)); + __ pmaxsd(dst, tmp); + break; + } + } + break; + case Primitive::kPrimLong: { + DCHECK_EQ(2u, instruction->GetVectorLength()); + XmmRegister tmp = locations->GetTemp(0).AsFpuRegister<XmmRegister>(); + switch (instruction->GetKind()) { + case HVecReduce::kSum: + __ movaps(tmp, src); + __ movaps(dst, src); + __ punpckhqdq(tmp, tmp); + __ paddq(dst, tmp); + break; + case HVecReduce::kMin: + case HVecReduce::kMax: + LOG(FATAL) << "Unsupported SIMD type"; + } + break; + } + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + void LocationsBuilderX86::VisitVecCnv(HVecCnv* instruction) { CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); } @@ -821,6 +948,91 @@ void 
InstructionCodeGeneratorX86::VisitVecUShr(HVecUShr* instruction) { } } +void LocationsBuilderX86::VisitVecSetScalars(HVecSetScalars* instruction) { + LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction); + + DCHECK_EQ(1u, instruction->InputCount()); // only one input currently implemented + + HInstruction* input = instruction->InputAt(0); + bool is_zero = IsZeroBitPattern(input); + + switch (instruction->GetPackedType()) { + case Primitive::kPrimLong: + // Long needs extra temporary to load from register pairs. + if (!is_zero) { + locations->AddTemp(Location::RequiresFpuRegister()); + } + FALLTHROUGH_INTENDED; + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + locations->SetInAt(0, is_zero ? Location::ConstantLocation(input->AsConstant()) + : Location::RequiresRegister()); + locations->SetOut(Location::RequiresFpuRegister()); + break; + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + locations->SetInAt(0, is_zero ? Location::ConstantLocation(input->AsConstant()) + : Location::RequiresFpuRegister()); + locations->SetOut(Location::RequiresFpuRegister()); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void InstructionCodeGeneratorX86::VisitVecSetScalars(HVecSetScalars* instruction) { + LocationSummary* locations = instruction->GetLocations(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + + DCHECK_EQ(1u, instruction->InputCount()); // only one input currently implemented + + // Zero out all other elements first. + __ xorps(dst, dst); + + // Shorthand for any type of zero. + if (IsZeroBitPattern(instruction->InputAt(0))) { + return; + } + + // Set required elements. + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: // TODO: up to here, and? 
+ LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + case Primitive::kPrimInt: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ movd(dst, locations->InAt(0).AsRegister<Register>()); + break; + case Primitive::kPrimLong: { + XmmRegister tmp = locations->GetTemp(0).AsFpuRegister<XmmRegister>(); + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ xorps(tmp, tmp); + __ movd(dst, locations->InAt(0).AsRegisterPairLow<Register>()); + __ movd(tmp, locations->InAt(0).AsRegisterPairHigh<Register>()); + __ punpckldq(dst, tmp); + break; + } + case Primitive::kPrimFloat: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ movss(dst, locations->InAt(1).AsFpuRegister<XmmRegister>()); + break; + case Primitive::kPrimDouble: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ movsd(dst, locations->InAt(1).AsFpuRegister<XmmRegister>()); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + void LocationsBuilderX86::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) { LOG(FATAL) << "No SIMD for " << instr->GetId(); } @@ -868,6 +1080,7 @@ static Address VecAddress(LocationSummary* locations, size_t size, bool is_strin case 8: scale = TIMES_8; break; default: break; } + // Incorporate the string or array offset in the address computation. uint32_t offset = is_string_char_at ? mirror::String::ValueOffset().Uint32Value() : mirror::Array::DataOffset(size).Uint32Value(); @@ -902,7 +1115,7 @@ void InstructionCodeGeneratorX86::VisitVecLoad(HVecLoad* instruction) { __ testb(Address(locations->InAt(0).AsRegister<Register>(), count_offset), Immediate(1)); __ j(kNotZero, &not_compressed); // Zero extend 8 compressed bytes into 8 chars. - __ movsd(reg, VecAddress(locations, 1, /*is_string_char_at*/ true)); + __ movsd(reg, VecAddress(locations, 1, instruction->IsStringCharAt())); __ pxor(tmp, tmp); __ punpcklbw(reg, tmp); __ jmp(&done); |