author    | Aart Bik <ajcbik@google.com> | 2017-08-31 09:08:13 -0700
committer | Aart Bik <ajcbik@google.com> | 2017-09-01 10:32:50 -0700
commit    | cfa59b49cde265dc5329a7e6956445f9f7a75f15 (patch)
tree      | eed953f62e796f7e64252520a40d7e77d1f117af /compiler/optimizing
parent    | 82a63734d3067ea0c96f8ba15bc40caaf798c625 (diff)
Basic SIMD reduction support.
Rationale:
Enables vectorization of x += .... for very basic (simple, same-type)
constructs. Paves the way for more complex (narrower and/or mixed-type)
constructs, which will be handled by the next CL.
This is a revert^2 of I7880c135aee3ed0a39da9ae5b468cbf80e613766
and thus a revert of I1c1c87b6323e01442e8fbd94869ddc9e760ea1fc
PS1-2 show what needed to change, with regression tests.
Test: test-art-host test-art-target
Bug: 64091002, 65212948
Change-Id: I2454778dd0ef1da915c178c7274e1cf33e271d0f
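
For context, the construct this enables is an accumulation whose type matches its inputs. A minimal, illustrative example (not taken from this CL; the method name is made up):

```java
// Illustrative only: the "x += ..." same-type reduction this CL targets.
// The int accumulator matches the int array elements, so no narrowing or
// mixed-type handling (deferred to the next CL) is needed.
static int sumInts(int[] a) {
  int x = 0;
  for (int i = 0; i < a.length; i++) {
    x += a[i];  // reduction cycle: phi(x) -> x + a[i] -> back into phi(x)
  }
  return x;
}
```

Conceptually, the vectorized loop seeds a SIMD accumulator as [x, 0, .., 0] (HVecSetScalars), accumulates lane-wise inside the loop body, and at the loop exit folds the lanes together (HVecReduce) and extracts lane 0 back into a scalar register (HVecExtractScalar).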
Diffstat (limited to 'compiler/optimizing')
-rw-r--r-- | compiler/optimizing/code_generator_vector_arm64.cc    | 172
-rw-r--r-- | compiler/optimizing/code_generator_vector_arm_vixl.cc |  28
-rw-r--r-- | compiler/optimizing/code_generator_vector_mips.cc     |  28
-rw-r--r-- | compiler/optimizing/code_generator_vector_mips64.cc   |  28
-rw-r--r-- | compiler/optimizing/code_generator_vector_x86.cc      | 279
-rw-r--r-- | compiler/optimizing/code_generator_vector_x86_64.cc   | 252
-rw-r--r-- | compiler/optimizing/loop_optimization.cc              | 216
-rw-r--r-- | compiler/optimizing/loop_optimization.h               |  30
-rw-r--r-- | compiler/optimizing/nodes.h                           |  14
-rw-r--r-- | compiler/optimizing/nodes_vector.h                    |  97
-rw-r--r-- | compiler/optimizing/nodes_vector_test.cc              |  28
-rw-r--r-- | compiler/optimizing/scheduler_arm64.cc                |  16
-rw-r--r-- | compiler/optimizing/scheduler_arm64.h                 |   5
13 files changed, 1006 insertions, 187 deletions
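
The loop-optimizer changes below detect such reduction phis and synthesize the new HIR nodes; the backends then lower them (ARM64 with ADDV/SMINV/SMAXV/ADDP, x86/x86-64 with phaddd, pminsd/pmaxsd and paddq plus shuffles, while the ARM VIXL, MIPS and MIPS64 backends still leave reductions unimplemented). As a rough scalar sketch of the transformed shape, assuming a 4-lane int sum (hypothetical code, not compiler output):

```java
// Hypothetical sketch of the vectorized shape for a 4-lane int sum. The lane
// array stands in for one 128-bit SIMD register; the real transformation
// operates on HIR nodes (HVecSetScalars, HVecAdd, HVecReduce, HVecExtractScalar).
static int sumIntsVectorShape(int[] a) {
  int n = a.length & ~3;                      // main trip count, multiple of 4
  int[] acc = {0, 0, 0, 0};                   // HVecSetScalars: [initial, 0, 0, 0]
  for (int i = 0; i < n; i += 4) {
    for (int lane = 0; lane < 4; lane++) {
      acc[lane] += a[i + lane];               // HVecAdd on the packed accumulator
    }
  }
  int x = acc[0] + acc[1] + acc[2] + acc[3];  // HVecReduce(kSum) + HVecExtractScalar(0)
  for (int i = n; i < a.length; i++) {
    x += a[i];                                // scalar cleanup loop
  }
  return x;
}
```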
diff --git a/compiler/optimizing/code_generator_vector_arm64.cc b/compiler/optimizing/code_generator_vector_arm64.cc index 9095ecdf16..18a55c8b09 100644 --- a/compiler/optimizing/code_generator_vector_arm64.cc +++ b/compiler/optimizing/code_generator_vector_arm64.cc @@ -27,12 +27,13 @@ namespace arm64 { using helpers::ARM64EncodableConstantOrRegister; using helpers::Arm64CanEncodeConstantAsImmediate; using helpers::DRegisterFrom; -using helpers::VRegisterFrom; using helpers::HeapOperand; using helpers::InputRegisterAt; using helpers::Int64ConstantFrom; -using helpers::XRegisterFrom; +using helpers::OutputRegister; +using helpers::VRegisterFrom; using helpers::WRegisterFrom; +using helpers::XRegisterFrom; #define __ GetVIXLAssembler()-> @@ -127,20 +128,51 @@ void InstructionCodeGeneratorARM64::VisitVecReplicateScalar(HVecReplicateScalar* } } -void LocationsBuilderARM64::VisitVecSetScalars(HVecSetScalars* instruction) { - LOG(FATAL) << "No SIMD for " << instruction->GetId(); -} - -void InstructionCodeGeneratorARM64::VisitVecSetScalars(HVecSetScalars* instruction) { - LOG(FATAL) << "No SIMD for " << instruction->GetId(); -} - -void LocationsBuilderARM64::VisitVecSumReduce(HVecSumReduce* instruction) { - LOG(FATAL) << "No SIMD for " << instruction->GetId(); +void LocationsBuilderARM64::VisitVecExtractScalar(HVecExtractScalar* instruction) { + LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimLong: + locations->SetInAt(0, Location::RequiresFpuRegister()); + locations->SetOut(Location::RequiresRegister()); + break; + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + locations->SetInAt(0, Location::RequiresFpuRegister()); + locations->SetOut(Location::SameAsFirstInput()); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } } -void InstructionCodeGeneratorARM64::VisitVecSumReduce(HVecSumReduce* instruction) { - LOG(FATAL) << "No SIMD for " << instruction->GetId(); +void InstructionCodeGeneratorARM64::VisitVecExtractScalar(HVecExtractScalar* instruction) { + LocationSummary* locations = instruction->GetLocations(); + VRegister src = VRegisterFrom(locations->InAt(0)); + switch (instruction->GetPackedType()) { + case Primitive::kPrimInt: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ Umov(OutputRegister(instruction), src.V4S(), 0); + break; + case Primitive::kPrimLong: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ Umov(OutputRegister(instruction), src.V2D(), 0); + break; + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + DCHECK_LE(2u, instruction->GetVectorLength()); + DCHECK_LE(instruction->GetVectorLength(), 4u); + DCHECK(locations->InAt(0).Equals(locations->Out())); // no code required + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } } // Helper to set up locations for vector unary operations. 
@@ -169,6 +201,46 @@ static void CreateVecUnOpLocations(ArenaAllocator* arena, HVecUnaryOperation* in } } +void LocationsBuilderARM64::VisitVecReduce(HVecReduce* instruction) { + CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARM64::VisitVecReduce(HVecReduce* instruction) { + LocationSummary* locations = instruction->GetLocations(); + VRegister src = VRegisterFrom(locations->InAt(0)); + VRegister dst = DRegisterFrom(locations->Out()); + switch (instruction->GetPackedType()) { + case Primitive::kPrimInt: + DCHECK_EQ(4u, instruction->GetVectorLength()); + switch (instruction->GetKind()) { + case HVecReduce::kSum: + __ Addv(dst.S(), src.V4S()); + break; + case HVecReduce::kMin: + __ Sminv(dst.S(), src.V4S()); + break; + case HVecReduce::kMax: + __ Smaxv(dst.S(), src.V4S()); + break; + } + break; + case Primitive::kPrimLong: + DCHECK_EQ(2u, instruction->GetVectorLength()); + switch (instruction->GetKind()) { + case HVecReduce::kSum: + __ Addp(dst.D(), src.V2D()); + break; + default: + LOG(FATAL) << "Unsupported SIMD min/max"; + UNREACHABLE(); + } + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + void LocationsBuilderARM64::VisitVecCnv(HVecCnv* instruction) { CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); } @@ -263,6 +335,7 @@ void InstructionCodeGeneratorARM64::VisitVecAbs(HVecAbs* instruction) { break; default: LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); } } @@ -805,6 +878,77 @@ void InstructionCodeGeneratorARM64::VisitVecUShr(HVecUShr* instruction) { } } +void LocationsBuilderARM64::VisitVecSetScalars(HVecSetScalars* instruction) { + LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction); + + DCHECK_EQ(1u, instruction->InputCount()); // only one input currently implemented + + HInstruction* input = instruction->InputAt(0); + bool is_zero = IsZeroBitPattern(input); + + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimLong: + locations->SetInAt(0, is_zero ? Location::ConstantLocation(input->AsConstant()) + : Location::RequiresRegister()); + locations->SetOut(Location::RequiresFpuRegister()); + break; + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + locations->SetInAt(0, is_zero ? Location::ConstantLocation(input->AsConstant()) + : Location::RequiresFpuRegister()); + locations->SetOut(Location::RequiresFpuRegister()); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void InstructionCodeGeneratorARM64::VisitVecSetScalars(HVecSetScalars* instruction) { + LocationSummary* locations = instruction->GetLocations(); + VRegister dst = VRegisterFrom(locations->Out()); + + DCHECK_EQ(1u, instruction->InputCount()); // only one input currently implemented + + // Zero out all other elements first. + __ Movi(dst.V16B(), 0); + + // Shorthand for any type of zero. + if (IsZeroBitPattern(instruction->InputAt(0))) { + return; + } + + // Set required elements. 
+ switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + DCHECK_EQ(16u, instruction->GetVectorLength()); + __ Mov(dst.V16B(), 0, InputRegisterAt(instruction, 0)); + break; + case Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ Mov(dst.V8H(), 0, InputRegisterAt(instruction, 0)); + break; + case Primitive::kPrimInt: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ Mov(dst.V4S(), 0, InputRegisterAt(instruction, 0)); + break; + case Primitive::kPrimLong: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ Mov(dst.V2D(), 0, InputRegisterAt(instruction, 0)); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + void LocationsBuilderARM64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) { LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instr); switch (instr->GetPackedType()) { diff --git a/compiler/optimizing/code_generator_vector_arm_vixl.cc b/compiler/optimizing/code_generator_vector_arm_vixl.cc index 527691d9d9..7a11dff41e 100644 --- a/compiler/optimizing/code_generator_vector_arm_vixl.cc +++ b/compiler/optimizing/code_generator_vector_arm_vixl.cc @@ -73,19 +73,11 @@ void InstructionCodeGeneratorARMVIXL::VisitVecReplicateScalar(HVecReplicateScala } } -void LocationsBuilderARMVIXL::VisitVecSetScalars(HVecSetScalars* instruction) { - LOG(FATAL) << "No SIMD for " << instruction->GetId(); -} - -void InstructionCodeGeneratorARMVIXL::VisitVecSetScalars(HVecSetScalars* instruction) { +void LocationsBuilderARMVIXL::VisitVecExtractScalar(HVecExtractScalar* instruction) { LOG(FATAL) << "No SIMD for " << instruction->GetId(); } -void LocationsBuilderARMVIXL::VisitVecSumReduce(HVecSumReduce* instruction) { - LOG(FATAL) << "No SIMD for " << instruction->GetId(); -} - -void InstructionCodeGeneratorARMVIXL::VisitVecSumReduce(HVecSumReduce* instruction) { +void InstructionCodeGeneratorARMVIXL::VisitVecExtractScalar(HVecExtractScalar* instruction) { LOG(FATAL) << "No SIMD for " << instruction->GetId(); } @@ -112,6 +104,14 @@ static void CreateVecUnOpLocations(ArenaAllocator* arena, HVecUnaryOperation* in } } +void LocationsBuilderARMVIXL::VisitVecReduce(HVecReduce* instruction) { + CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARMVIXL::VisitVecReduce(HVecReduce* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + void LocationsBuilderARMVIXL::VisitVecCnv(HVecCnv* instruction) { CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); } @@ -621,6 +621,14 @@ void InstructionCodeGeneratorARMVIXL::VisitVecUShr(HVecUShr* instruction) { } } +void LocationsBuilderARMVIXL::VisitVecSetScalars(HVecSetScalars* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void InstructionCodeGeneratorARMVIXL::VisitVecSetScalars(HVecSetScalars* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + void LocationsBuilderARMVIXL::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) { LOG(FATAL) << "No SIMD for " << instr->GetId(); } diff --git a/compiler/optimizing/code_generator_vector_mips.cc b/compiler/optimizing/code_generator_vector_mips.cc index 6bf28ab1a3..c2fbf7f04b 100644 --- a/compiler/optimizing/code_generator_vector_mips.cc +++ b/compiler/optimizing/code_generator_vector_mips.cc @@ -88,19 +88,11 @@ void InstructionCodeGeneratorMIPS::VisitVecReplicateScalar(HVecReplicateScalar* } } -void 
LocationsBuilderMIPS::VisitVecSetScalars(HVecSetScalars* instruction) { - LOG(FATAL) << "No SIMD for " << instruction->GetId(); -} - -void InstructionCodeGeneratorMIPS::VisitVecSetScalars(HVecSetScalars* instruction) { +void LocationsBuilderMIPS::VisitVecExtractScalar(HVecExtractScalar* instruction) { LOG(FATAL) << "No SIMD for " << instruction->GetId(); } -void LocationsBuilderMIPS::VisitVecSumReduce(HVecSumReduce* instruction) { - LOG(FATAL) << "No SIMD for " << instruction->GetId(); -} - -void InstructionCodeGeneratorMIPS::VisitVecSumReduce(HVecSumReduce* instruction) { +void InstructionCodeGeneratorMIPS::VisitVecExtractScalar(HVecExtractScalar* instruction) { LOG(FATAL) << "No SIMD for " << instruction->GetId(); } @@ -133,6 +125,14 @@ static void CreateVecUnOpLocations(ArenaAllocator* arena, HVecUnaryOperation* in } } +void LocationsBuilderMIPS::VisitVecReduce(HVecReduce* instruction) { + CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorMIPS::VisitVecReduce(HVecReduce* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + void LocationsBuilderMIPS::VisitVecCnv(HVecCnv* instruction) { CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); } @@ -818,6 +818,14 @@ void InstructionCodeGeneratorMIPS::VisitVecUShr(HVecUShr* instruction) { } } +void LocationsBuilderMIPS::VisitVecSetScalars(HVecSetScalars* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void InstructionCodeGeneratorMIPS::VisitVecSetScalars(HVecSetScalars* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + void LocationsBuilderMIPS::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) { LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instr); switch (instr->GetPackedType()) { diff --git a/compiler/optimizing/code_generator_vector_mips64.cc b/compiler/optimizing/code_generator_vector_mips64.cc index 75bf7a7cbb..9d3a777c13 100644 --- a/compiler/optimizing/code_generator_vector_mips64.cc +++ b/compiler/optimizing/code_generator_vector_mips64.cc @@ -91,19 +91,11 @@ void InstructionCodeGeneratorMIPS64::VisitVecReplicateScalar(HVecReplicateScalar } } -void LocationsBuilderMIPS64::VisitVecSetScalars(HVecSetScalars* instruction) { - LOG(FATAL) << "No SIMD for " << instruction->GetId(); -} - -void InstructionCodeGeneratorMIPS64::VisitVecSetScalars(HVecSetScalars* instruction) { +void LocationsBuilderMIPS64::VisitVecExtractScalar(HVecExtractScalar* instruction) { LOG(FATAL) << "No SIMD for " << instruction->GetId(); } -void LocationsBuilderMIPS64::VisitVecSumReduce(HVecSumReduce* instruction) { - LOG(FATAL) << "No SIMD for " << instruction->GetId(); -} - -void InstructionCodeGeneratorMIPS64::VisitVecSumReduce(HVecSumReduce* instruction) { +void InstructionCodeGeneratorMIPS64::VisitVecExtractScalar(HVecExtractScalar* instruction) { LOG(FATAL) << "No SIMD for " << instruction->GetId(); } @@ -136,6 +128,14 @@ static void CreateVecUnOpLocations(ArenaAllocator* arena, HVecUnaryOperation* in } } +void LocationsBuilderMIPS64::VisitVecReduce(HVecReduce* instruction) { + CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorMIPS64::VisitVecReduce(HVecReduce* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + void LocationsBuilderMIPS64::VisitVecCnv(HVecCnv* instruction) { CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); } @@ -822,6 +822,14 @@ void InstructionCodeGeneratorMIPS64::VisitVecUShr(HVecUShr* instruction) 
{ } } +void LocationsBuilderMIPS64::VisitVecSetScalars(HVecSetScalars* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void InstructionCodeGeneratorMIPS64::VisitVecSetScalars(HVecSetScalars* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + void LocationsBuilderMIPS64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) { LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instr); switch (instr->GetPackedType()) { diff --git a/compiler/optimizing/code_generator_vector_x86.cc b/compiler/optimizing/code_generator_vector_x86.cc index e7aec76aff..37190f8363 100644 --- a/compiler/optimizing/code_generator_vector_x86.cc +++ b/compiler/optimizing/code_generator_vector_x86.cc @@ -27,23 +27,31 @@ namespace x86 { void LocationsBuilderX86::VisitVecReplicateScalar(HVecReplicateScalar* instruction) { LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction); + HInstruction* input = instruction->InputAt(0); + bool is_zero = IsZeroBitPattern(input); switch (instruction->GetPackedType()) { case Primitive::kPrimLong: - // Long needs extra temporary to load the register pair. - locations->AddTemp(Location::RequiresFpuRegister()); + // Long needs extra temporary to load from the register pair. + if (!is_zero) { + locations->AddTemp(Location::RequiresFpuRegister()); + } FALLTHROUGH_INTENDED; case Primitive::kPrimBoolean: case Primitive::kPrimByte: case Primitive::kPrimChar: case Primitive::kPrimShort: case Primitive::kPrimInt: - locations->SetInAt(0, Location::RequiresRegister()); + locations->SetInAt(0, is_zero ? Location::ConstantLocation(input->AsConstant()) + : Location::RequiresRegister()); locations->SetOut(Location::RequiresFpuRegister()); break; case Primitive::kPrimFloat: case Primitive::kPrimDouble: - locations->SetInAt(0, Location::RequiresFpuRegister()); - locations->SetOut(Location::SameAsFirstInput()); + locations->SetInAt(0, is_zero ? Location::ConstantLocation(input->AsConstant()) + : Location::RequiresFpuRegister()); + locations->SetOut(is_zero ? Location::RequiresFpuRegister() + : Location::SameAsFirstInput()); + break; default: LOG(FATAL) << "Unsupported SIMD type"; @@ -53,46 +61,53 @@ void LocationsBuilderX86::VisitVecReplicateScalar(HVecReplicateScalar* instructi void InstructionCodeGeneratorX86::VisitVecReplicateScalar(HVecReplicateScalar* instruction) { LocationSummary* locations = instruction->GetLocations(); - XmmRegister reg = locations->Out().AsFpuRegister<XmmRegister>(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + + // Shorthand for any type of zero. 
+ if (IsZeroBitPattern(instruction->InputAt(0))) { + __ xorps(dst, dst); + return; + } + switch (instruction->GetPackedType()) { case Primitive::kPrimBoolean: case Primitive::kPrimByte: DCHECK_EQ(16u, instruction->GetVectorLength()); - __ movd(reg, locations->InAt(0).AsRegister<Register>()); - __ punpcklbw(reg, reg); - __ punpcklwd(reg, reg); - __ pshufd(reg, reg, Immediate(0)); + __ movd(dst, locations->InAt(0).AsRegister<Register>()); + __ punpcklbw(dst, dst); + __ punpcklwd(dst, dst); + __ pshufd(dst, dst, Immediate(0)); break; case Primitive::kPrimChar: case Primitive::kPrimShort: DCHECK_EQ(8u, instruction->GetVectorLength()); - __ movd(reg, locations->InAt(0).AsRegister<Register>()); - __ punpcklwd(reg, reg); - __ pshufd(reg, reg, Immediate(0)); + __ movd(dst, locations->InAt(0).AsRegister<Register>()); + __ punpcklwd(dst, dst); + __ pshufd(dst, dst, Immediate(0)); break; case Primitive::kPrimInt: DCHECK_EQ(4u, instruction->GetVectorLength()); - __ movd(reg, locations->InAt(0).AsRegister<Register>()); - __ pshufd(reg, reg, Immediate(0)); + __ movd(dst, locations->InAt(0).AsRegister<Register>()); + __ pshufd(dst, dst, Immediate(0)); break; case Primitive::kPrimLong: { XmmRegister tmp = locations->GetTemp(0).AsFpuRegister<XmmRegister>(); DCHECK_EQ(2u, instruction->GetVectorLength()); - __ movd(reg, locations->InAt(0).AsRegisterPairLow<Register>()); + __ movd(dst, locations->InAt(0).AsRegisterPairLow<Register>()); __ movd(tmp, locations->InAt(0).AsRegisterPairHigh<Register>()); - __ punpckldq(reg, tmp); - __ punpcklqdq(reg, reg); + __ punpckldq(dst, tmp); + __ punpcklqdq(dst, dst); break; } case Primitive::kPrimFloat: DCHECK(locations->InAt(0).Equals(locations->Out())); DCHECK_EQ(4u, instruction->GetVectorLength()); - __ shufps(reg, reg, Immediate(0)); + __ shufps(dst, dst, Immediate(0)); break; case Primitive::kPrimDouble: DCHECK(locations->InAt(0).Equals(locations->Out())); DCHECK_EQ(2u, instruction->GetVectorLength()); - __ shufpd(reg, reg, Immediate(0)); + __ shufpd(dst, dst, Immediate(0)); break; default: LOG(FATAL) << "Unsupported SIMD type"; @@ -100,20 +115,65 @@ void InstructionCodeGeneratorX86::VisitVecReplicateScalar(HVecReplicateScalar* i } } -void LocationsBuilderX86::VisitVecSetScalars(HVecSetScalars* instruction) { - LOG(FATAL) << "No SIMD for " << instruction->GetId(); -} - -void InstructionCodeGeneratorX86::VisitVecSetScalars(HVecSetScalars* instruction) { - LOG(FATAL) << "No SIMD for " << instruction->GetId(); -} - -void LocationsBuilderX86::VisitVecSumReduce(HVecSumReduce* instruction) { - LOG(FATAL) << "No SIMD for " << instruction->GetId(); +void LocationsBuilderX86::VisitVecExtractScalar(HVecExtractScalar* instruction) { + LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction); + switch (instruction->GetPackedType()) { + case Primitive::kPrimLong: + // Long needs extra temporary to store into the register pair. 
+ locations->AddTemp(Location::RequiresFpuRegister()); + FALLTHROUGH_INTENDED; + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + locations->SetInAt(0, Location::RequiresFpuRegister()); + locations->SetOut(Location::RequiresRegister()); + break; + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + locations->SetInAt(0, Location::RequiresFpuRegister()); + locations->SetOut(Location::SameAsFirstInput()); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } } -void InstructionCodeGeneratorX86::VisitVecSumReduce(HVecSumReduce* instruction) { - LOG(FATAL) << "No SIMD for " << instruction->GetId(); +void InstructionCodeGeneratorX86::VisitVecExtractScalar(HVecExtractScalar* instruction) { + LocationSummary* locations = instruction->GetLocations(); + XmmRegister src = locations->InAt(0).AsFpuRegister<XmmRegister>(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: // TODO: up to here, and? + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + case Primitive::kPrimInt: + DCHECK_LE(4u, instruction->GetVectorLength()); + DCHECK_LE(instruction->GetVectorLength(), 16u); + __ movd(locations->Out().AsRegister<Register>(), src); + break; + case Primitive::kPrimLong: { + XmmRegister tmp = locations->GetTemp(0).AsFpuRegister<XmmRegister>(); + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ movd(locations->Out().AsRegisterPairLow<Register>(), src); + __ pshufd(tmp, src, Immediate(1)); + __ movd(locations->Out().AsRegisterPairHigh<Register>(), tmp); + break; + } + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + DCHECK_LE(2u, instruction->GetVectorLength()); + DCHECK_LE(instruction->GetVectorLength(), 4u); + DCHECK(locations->InAt(0).Equals(locations->Out())); // no code required + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } } // Helper to set up locations for vector unary operations. @@ -137,6 +197,73 @@ static void CreateVecUnOpLocations(ArenaAllocator* arena, HVecUnaryOperation* in } } +void LocationsBuilderX86::VisitVecReduce(HVecReduce* instruction) { + CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); + // Long reduction or min/max require a temporary. 
+ if (instruction->GetPackedType() == Primitive::kPrimLong || + instruction->GetKind() == HVecReduce::kMin || + instruction->GetKind() == HVecReduce::kMax) { + instruction->GetLocations()->AddTemp(Location::RequiresFpuRegister()); + } +} + +void InstructionCodeGeneratorX86::VisitVecReduce(HVecReduce* instruction) { + LocationSummary* locations = instruction->GetLocations(); + XmmRegister src = locations->InAt(0).AsFpuRegister<XmmRegister>(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimInt: + DCHECK_EQ(4u, instruction->GetVectorLength()); + switch (instruction->GetKind()) { + case HVecReduce::kSum: + __ movaps(dst, src); + __ phaddd(dst, dst); + __ phaddd(dst, dst); + break; + case HVecReduce::kMin: { + XmmRegister tmp = locations->GetTemp(0).AsFpuRegister<XmmRegister>(); + __ movaps(tmp, src); + __ movaps(dst, src); + __ psrldq(tmp, Immediate(8)); + __ pminsd(dst, tmp); + __ psrldq(tmp, Immediate(4)); + __ pminsd(dst, tmp); + break; + } + case HVecReduce::kMax: { + XmmRegister tmp = locations->GetTemp(0).AsFpuRegister<XmmRegister>(); + __ movaps(tmp, src); + __ movaps(dst, src); + __ psrldq(tmp, Immediate(8)); + __ pmaxsd(dst, tmp); + __ psrldq(tmp, Immediate(4)); + __ pmaxsd(dst, tmp); + break; + } + } + break; + case Primitive::kPrimLong: { + DCHECK_EQ(2u, instruction->GetVectorLength()); + XmmRegister tmp = locations->GetTemp(0).AsFpuRegister<XmmRegister>(); + switch (instruction->GetKind()) { + case HVecReduce::kSum: + __ movaps(tmp, src); + __ movaps(dst, src); + __ punpckhqdq(tmp, tmp); + __ paddq(dst, tmp); + break; + case HVecReduce::kMin: + case HVecReduce::kMax: + LOG(FATAL) << "Unsupported SIMD type"; + } + break; + } + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + void LocationsBuilderX86::VisitVecCnv(HVecCnv* instruction) { CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); } @@ -821,6 +948,91 @@ void InstructionCodeGeneratorX86::VisitVecUShr(HVecUShr* instruction) { } } +void LocationsBuilderX86::VisitVecSetScalars(HVecSetScalars* instruction) { + LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction); + + DCHECK_EQ(1u, instruction->InputCount()); // only one input currently implemented + + HInstruction* input = instruction->InputAt(0); + bool is_zero = IsZeroBitPattern(input); + + switch (instruction->GetPackedType()) { + case Primitive::kPrimLong: + // Long needs extra temporary to load from register pairs. + if (!is_zero) { + locations->AddTemp(Location::RequiresFpuRegister()); + } + FALLTHROUGH_INTENDED; + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + locations->SetInAt(0, is_zero ? Location::ConstantLocation(input->AsConstant()) + : Location::RequiresRegister()); + locations->SetOut(Location::RequiresFpuRegister()); + break; + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + locations->SetInAt(0, is_zero ? 
Location::ConstantLocation(input->AsConstant()) + : Location::RequiresFpuRegister()); + locations->SetOut(Location::RequiresFpuRegister()); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void InstructionCodeGeneratorX86::VisitVecSetScalars(HVecSetScalars* instruction) { + LocationSummary* locations = instruction->GetLocations(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + + DCHECK_EQ(1u, instruction->InputCount()); // only one input currently implemented + + // Zero out all other elements first. + __ xorps(dst, dst); + + // Shorthand for any type of zero. + if (IsZeroBitPattern(instruction->InputAt(0))) { + return; + } + + // Set required elements. + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: // TODO: up to here, and? + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + case Primitive::kPrimInt: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ movd(dst, locations->InAt(0).AsRegister<Register>()); + break; + case Primitive::kPrimLong: { + XmmRegister tmp = locations->GetTemp(0).AsFpuRegister<XmmRegister>(); + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ xorps(tmp, tmp); + __ movd(dst, locations->InAt(0).AsRegisterPairLow<Register>()); + __ movd(tmp, locations->InAt(0).AsRegisterPairHigh<Register>()); + __ punpckldq(dst, tmp); + break; + } + case Primitive::kPrimFloat: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ movss(dst, locations->InAt(1).AsFpuRegister<XmmRegister>()); + break; + case Primitive::kPrimDouble: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ movsd(dst, locations->InAt(1).AsFpuRegister<XmmRegister>()); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + void LocationsBuilderX86::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) { LOG(FATAL) << "No SIMD for " << instr->GetId(); } @@ -868,6 +1080,7 @@ static Address VecAddress(LocationSummary* locations, size_t size, bool is_strin case 8: scale = TIMES_8; break; default: break; } + // Incorporate the string or array offset in the address computation. uint32_t offset = is_string_char_at ? mirror::String::ValueOffset().Uint32Value() : mirror::Array::DataOffset(size).Uint32Value(); @@ -902,7 +1115,7 @@ void InstructionCodeGeneratorX86::VisitVecLoad(HVecLoad* instruction) { __ testb(Address(locations->InAt(0).AsRegister<Register>(), count_offset), Immediate(1)); __ j(kNotZero, ¬_compressed); // Zero extend 8 compressed bytes into 8 chars. 
- __ movsd(reg, VecAddress(locations, 1, /*is_string_char_at*/ true)); + __ movsd(reg, VecAddress(locations, 1, instruction->IsStringCharAt())); __ pxor(tmp, tmp); __ punpcklbw(reg, tmp); __ jmp(&done); diff --git a/compiler/optimizing/code_generator_vector_x86_64.cc b/compiler/optimizing/code_generator_vector_x86_64.cc index c7ee81c60d..7051ba041f 100644 --- a/compiler/optimizing/code_generator_vector_x86_64.cc +++ b/compiler/optimizing/code_generator_vector_x86_64.cc @@ -27,6 +27,8 @@ namespace x86_64 { void LocationsBuilderX86_64::VisitVecReplicateScalar(HVecReplicateScalar* instruction) { LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction); + HInstruction* input = instruction->InputAt(0); + bool is_zero = IsZeroBitPattern(input); switch (instruction->GetPackedType()) { case Primitive::kPrimBoolean: case Primitive::kPrimByte: @@ -34,13 +36,16 @@ void LocationsBuilderX86_64::VisitVecReplicateScalar(HVecReplicateScalar* instru case Primitive::kPrimShort: case Primitive::kPrimInt: case Primitive::kPrimLong: - locations->SetInAt(0, Location::RequiresRegister()); + locations->SetInAt(0, is_zero ? Location::ConstantLocation(input->AsConstant()) + : Location::RequiresRegister()); locations->SetOut(Location::RequiresFpuRegister()); break; case Primitive::kPrimFloat: case Primitive::kPrimDouble: - locations->SetInAt(0, Location::RequiresFpuRegister()); - locations->SetOut(Location::SameAsFirstInput()); + locations->SetInAt(0, is_zero ? Location::ConstantLocation(input->AsConstant()) + : Location::RequiresFpuRegister()); + locations->SetOut(is_zero ? Location::RequiresFpuRegister() + : Location::SameAsFirstInput()); break; default: LOG(FATAL) << "Unsupported SIMD type"; @@ -50,42 +55,49 @@ void LocationsBuilderX86_64::VisitVecReplicateScalar(HVecReplicateScalar* instru void InstructionCodeGeneratorX86_64::VisitVecReplicateScalar(HVecReplicateScalar* instruction) { LocationSummary* locations = instruction->GetLocations(); - XmmRegister reg = locations->Out().AsFpuRegister<XmmRegister>(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + + // Shorthand for any type of zero. 
+ if (IsZeroBitPattern(instruction->InputAt(0))) { + __ xorps(dst, dst); + return; + } + switch (instruction->GetPackedType()) { case Primitive::kPrimBoolean: case Primitive::kPrimByte: DCHECK_EQ(16u, instruction->GetVectorLength()); - __ movd(reg, locations->InAt(0).AsRegister<CpuRegister>()); - __ punpcklbw(reg, reg); - __ punpcklwd(reg, reg); - __ pshufd(reg, reg, Immediate(0)); + __ movd(dst, locations->InAt(0).AsRegister<CpuRegister>()); + __ punpcklbw(dst, dst); + __ punpcklwd(dst, dst); + __ pshufd(dst, dst, Immediate(0)); break; case Primitive::kPrimChar: case Primitive::kPrimShort: DCHECK_EQ(8u, instruction->GetVectorLength()); - __ movd(reg, locations->InAt(0).AsRegister<CpuRegister>()); - __ punpcklwd(reg, reg); - __ pshufd(reg, reg, Immediate(0)); + __ movd(dst, locations->InAt(0).AsRegister<CpuRegister>()); + __ punpcklwd(dst, dst); + __ pshufd(dst, dst, Immediate(0)); break; case Primitive::kPrimInt: DCHECK_EQ(4u, instruction->GetVectorLength()); - __ movd(reg, locations->InAt(0).AsRegister<CpuRegister>()); - __ pshufd(reg, reg, Immediate(0)); + __ movd(dst, locations->InAt(0).AsRegister<CpuRegister>()); + __ pshufd(dst, dst, Immediate(0)); break; case Primitive::kPrimLong: DCHECK_EQ(2u, instruction->GetVectorLength()); - __ movd(reg, locations->InAt(0).AsRegister<CpuRegister>()); // is 64-bit - __ punpcklqdq(reg, reg); + __ movd(dst, locations->InAt(0).AsRegister<CpuRegister>()); // is 64-bit + __ punpcklqdq(dst, dst); break; case Primitive::kPrimFloat: DCHECK(locations->InAt(0).Equals(locations->Out())); DCHECK_EQ(4u, instruction->GetVectorLength()); - __ shufps(reg, reg, Immediate(0)); + __ shufps(dst, dst, Immediate(0)); break; case Primitive::kPrimDouble: DCHECK(locations->InAt(0).Equals(locations->Out())); DCHECK_EQ(2u, instruction->GetVectorLength()); - __ shufpd(reg, reg, Immediate(0)); + __ shufpd(dst, dst, Immediate(0)); break; default: LOG(FATAL) << "Unsupported SIMD type"; @@ -93,20 +105,57 @@ void InstructionCodeGeneratorX86_64::VisitVecReplicateScalar(HVecReplicateScalar } } -void LocationsBuilderX86_64::VisitVecSetScalars(HVecSetScalars* instruction) { - LOG(FATAL) << "No SIMD for " << instruction->GetId(); -} - -void InstructionCodeGeneratorX86_64::VisitVecSetScalars(HVecSetScalars* instruction) { - LOG(FATAL) << "No SIMD for " << instruction->GetId(); -} - -void LocationsBuilderX86_64::VisitVecSumReduce(HVecSumReduce* instruction) { - LOG(FATAL) << "No SIMD for " << instruction->GetId(); +void LocationsBuilderX86_64::VisitVecExtractScalar(HVecExtractScalar* instruction) { + LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimLong: + locations->SetInAt(0, Location::RequiresFpuRegister()); + locations->SetOut(Location::RequiresRegister()); + break; + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + locations->SetInAt(0, Location::RequiresFpuRegister()); + locations->SetOut(Location::SameAsFirstInput()); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } } -void InstructionCodeGeneratorX86_64::VisitVecSumReduce(HVecSumReduce* instruction) { - LOG(FATAL) << "No SIMD for " << instruction->GetId(); +void InstructionCodeGeneratorX86_64::VisitVecExtractScalar(HVecExtractScalar* instruction) { + LocationSummary* locations = instruction->GetLocations(); + XmmRegister src = 
locations->InAt(0).AsFpuRegister<XmmRegister>(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: // TODO: up to here, and? + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + case Primitive::kPrimInt: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ movd(locations->Out().AsRegister<CpuRegister>(), src); + break; + case Primitive::kPrimLong: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ movd(locations->Out().AsRegister<CpuRegister>(), src); // is 64-bit + break; + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + DCHECK_LE(2u, instruction->GetVectorLength()); + DCHECK_LE(instruction->GetVectorLength(), 4u); + DCHECK(locations->InAt(0).Equals(locations->Out())); // no code required + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } } // Helper to set up locations for vector unary operations. @@ -130,6 +179,73 @@ static void CreateVecUnOpLocations(ArenaAllocator* arena, HVecUnaryOperation* in } } +void LocationsBuilderX86_64::VisitVecReduce(HVecReduce* instruction) { + CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); + // Long reduction or min/max require a temporary. + if (instruction->GetPackedType() == Primitive::kPrimLong || + instruction->GetKind() == HVecReduce::kMin || + instruction->GetKind() == HVecReduce::kMax) { + instruction->GetLocations()->AddTemp(Location::RequiresFpuRegister()); + } +} + +void InstructionCodeGeneratorX86_64::VisitVecReduce(HVecReduce* instruction) { + LocationSummary* locations = instruction->GetLocations(); + XmmRegister src = locations->InAt(0).AsFpuRegister<XmmRegister>(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimInt: + DCHECK_EQ(4u, instruction->GetVectorLength()); + switch (instruction->GetKind()) { + case HVecReduce::kSum: + __ movaps(dst, src); + __ phaddd(dst, dst); + __ phaddd(dst, dst); + break; + case HVecReduce::kMin: { + XmmRegister tmp = locations->GetTemp(0).AsFpuRegister<XmmRegister>(); + __ movaps(tmp, src); + __ movaps(dst, src); + __ psrldq(tmp, Immediate(8)); + __ pminsd(dst, tmp); + __ psrldq(tmp, Immediate(4)); + __ pminsd(dst, tmp); + break; + } + case HVecReduce::kMax: { + XmmRegister tmp = locations->GetTemp(0).AsFpuRegister<XmmRegister>(); + __ movaps(tmp, src); + __ movaps(dst, src); + __ psrldq(tmp, Immediate(8)); + __ pmaxsd(dst, tmp); + __ psrldq(tmp, Immediate(4)); + __ pmaxsd(dst, tmp); + break; + } + } + break; + case Primitive::kPrimLong: { + DCHECK_EQ(2u, instruction->GetVectorLength()); + XmmRegister tmp = locations->GetTemp(0).AsFpuRegister<XmmRegister>(); + switch (instruction->GetKind()) { + case HVecReduce::kSum: + __ movaps(tmp, src); + __ movaps(dst, src); + __ punpckhqdq(tmp, tmp); + __ paddq(dst, tmp); + break; + case HVecReduce::kMin: + case HVecReduce::kMax: + LOG(FATAL) << "Unsupported SIMD type"; + } + break; + } + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + void LocationsBuilderX86_64::VisitVecCnv(HVecCnv* instruction) { CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); } @@ -814,6 +930,81 @@ void InstructionCodeGeneratorX86_64::VisitVecUShr(HVecUShr* instruction) { } } +void LocationsBuilderX86_64::VisitVecSetScalars(HVecSetScalars* instruction) { + LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction); + + DCHECK_EQ(1u, instruction->InputCount()); // only one 
input currently implemented + + HInstruction* input = instruction->InputAt(0); + bool is_zero = IsZeroBitPattern(input); + + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimLong: + locations->SetInAt(0, is_zero ? Location::ConstantLocation(input->AsConstant()) + : Location::RequiresRegister()); + locations->SetOut(Location::RequiresFpuRegister()); + break; + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + locations->SetInAt(0, is_zero ? Location::ConstantLocation(input->AsConstant()) + : Location::RequiresFpuRegister()); + locations->SetOut(Location::RequiresFpuRegister()); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void InstructionCodeGeneratorX86_64::VisitVecSetScalars(HVecSetScalars* instruction) { + LocationSummary* locations = instruction->GetLocations(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + + DCHECK_EQ(1u, instruction->InputCount()); // only one input currently implemented + + // Zero out all other elements first. + __ xorps(dst, dst); + + // Shorthand for any type of zero. + if (IsZeroBitPattern(instruction->InputAt(0))) { + return; + } + + // Set required elements. + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: // TODO: up to here, and? + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + case Primitive::kPrimInt: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ movd(dst, locations->InAt(0).AsRegister<CpuRegister>()); + break; + case Primitive::kPrimLong: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ movd(dst, locations->InAt(0).AsRegister<CpuRegister>()); // is 64-bit + break; + case Primitive::kPrimFloat: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ movss(dst, locations->InAt(0).AsFpuRegister<XmmRegister>()); + break; + case Primitive::kPrimDouble: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ movsd(dst, locations->InAt(0).AsFpuRegister<XmmRegister>()); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + void LocationsBuilderX86_64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) { LOG(FATAL) << "No SIMD for " << instr->GetId(); } @@ -861,6 +1052,7 @@ static Address VecAddress(LocationSummary* locations, size_t size, bool is_strin case 8: scale = TIMES_8; break; default: break; } + // Incorporate the string or array offset in the address computation. uint32_t offset = is_string_char_at ? mirror::String::ValueOffset().Uint32Value() : mirror::Array::DataOffset(size).Uint32Value(); @@ -895,7 +1087,7 @@ void InstructionCodeGeneratorX86_64::VisitVecLoad(HVecLoad* instruction) { __ testb(Address(locations->InAt(0).AsRegister<CpuRegister>(), count_offset), Immediate(1)); __ j(kNotZero, ¬_compressed); // Zero extend 8 compressed bytes into 8 chars. 
- __ movsd(reg, VecAddress(locations, 1, /*is_string_char_at*/ true)); + __ movsd(reg, VecAddress(locations, 1, instruction->IsStringCharAt())); __ pxor(tmp, tmp); __ punpcklbw(reg, tmp); __ jmp(&done); diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc index 027ba7741c..f8f4eb2ae3 100644 --- a/compiler/optimizing/loop_optimization.cc +++ b/compiler/optimizing/loop_optimization.cc @@ -285,6 +285,19 @@ static bool HasReductionFormat(HInstruction* reduction, HInstruction* phi) { return false; } +// Translates operation to reduction kind. +static HVecReduce::ReductionKind GetReductionKind(HInstruction* reduction) { + if (reduction->IsVecAdd() || reduction->IsVecSub()) { + return HVecReduce::kSum; + } else if (reduction->IsVecMin()) { + return HVecReduce::kMin; + } else if (reduction->IsVecMax()) { + return HVecReduce::kMax; + } + LOG(FATAL) << "Unsupported SIMD reduction"; + UNREACHABLE(); +} + // Test vector restrictions. static bool HasVectorRestrictions(uint64_t restrictions, uint64_t tested) { return (restrictions & tested) != 0; @@ -334,7 +347,8 @@ HLoopOptimization::HLoopOptimization(HGraph* graph, vector_peeling_candidate_(nullptr), vector_runtime_test_a_(nullptr), vector_runtime_test_b_(nullptr), - vector_map_(nullptr) { + vector_map_(nullptr), + vector_permanent_map_(nullptr) { } void HLoopOptimization::Run() { @@ -388,11 +402,14 @@ void HLoopOptimization::LocalRun() { ArenaSet<ArrayReference> refs(loop_allocator_->Adapter(kArenaAllocLoopOptimization)); ArenaSafeMap<HInstruction*, HInstruction*> map( std::less<HInstruction*>(), loop_allocator_->Adapter(kArenaAllocLoopOptimization)); + ArenaSafeMap<HInstruction*, HInstruction*> perm( + std::less<HInstruction*>(), loop_allocator_->Adapter(kArenaAllocLoopOptimization)); // Attach. iset_ = &iset; reductions_ = &reds; vector_refs_ = &refs; vector_map_ = ↦ + vector_permanent_map_ = &perm; // Traverse. TraverseLoopsInnerToOuter(top_loop_); // Detach. @@ -400,6 +417,7 @@ void HLoopOptimization::LocalRun() { reductions_ = nullptr; vector_refs_ = nullptr; vector_map_ = nullptr; + vector_permanent_map_ = nullptr; } } @@ -603,7 +621,6 @@ bool HLoopOptimization::OptimizeInnerLoop(LoopNode* node) { // Vectorize loop, if possible and valid. if (kEnableVectorization && TrySetSimpleLoopHeader(header, &main_phi) && - reductions_->empty() && // TODO: possible with some effort ShouldVectorize(node, body, trip_count) && TryAssignLastValue(node->loop_info, main_phi, preheader, /*collect_loop_uses*/ true)) { Vectorize(node, body, exit, trip_count); @@ -802,6 +819,13 @@ void HLoopOptimization::Vectorize(LoopNode* node, /*unroll*/ 1); } + // Link reductions to their final uses. + for (auto i = reductions_->begin(); i != reductions_->end(); ++i) { + if (i->first->IsPhi()) { + i->first->ReplaceWith(ReduceAndExtractIfNeeded(i->second)); + } + } + // Remove the original loop by disconnecting the body block // and removing all instructions from the header. block->DisconnectAndDelete(); @@ -841,21 +865,10 @@ void HLoopOptimization::GenerateNewLoop(LoopNode* node, vector_header_->AddInstruction(cond); vector_header_->AddInstruction(new (global_allocator_) HIf(cond)); vector_index_ = phi; + vector_permanent_map_->clear(); // preserved over unrolling for (uint32_t u = 0; u < unroll; u++) { - // Clear map, leaving loop invariants setup during unrolling. 
- if (u == 0) { - vector_map_->clear(); - } else { - for (auto i = vector_map_->begin(); i != vector_map_->end(); ) { - if (i->second->IsVecReplicateScalar()) { - DCHECK(node->loop_info->IsDefinedOutOfTheLoop(i->first)); - ++i; - } else { - i = vector_map_->erase(i); - } - } - } // Generate instruction map. + vector_map_->clear(); for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) { bool vectorized_def = VectorizeDef(node, it.Current(), /*generate_code*/ true); DCHECK(vectorized_def); @@ -872,9 +885,17 @@ void HLoopOptimization::GenerateNewLoop(LoopNode* node, } } } + // Generate the induction. vector_index_ = new (global_allocator_) HAdd(induc_type, vector_index_, step); Insert(vector_body_, vector_index_); } + // Finalize phi inputs for the reductions (if any). + for (auto i = reductions_->begin(); i != reductions_->end(); ++i) { + if (!i->first->IsPhi()) { + DCHECK(i->second->IsPhi()); + GenerateVecReductionPhiInputs(i->second->AsPhi(), i->first); + } + } // Finalize phi inputs for the loop index. phi->AddInput(lo); phi->AddInput(vector_index_); @@ -910,6 +931,23 @@ bool HLoopOptimization::VectorizeDef(LoopNode* node, } return false; } + // Accept a left-hand-side reduction for + // (1) supported vector type, + // (2) vectorizable right-hand-side value. + auto redit = reductions_->find(instruction); + if (redit != reductions_->end()) { + Primitive::Type type = instruction->GetType(); + if (TrySetVectorType(type, &restrictions) && + VectorizeUse(node, instruction, generate_code, type, restrictions)) { + if (generate_code) { + HInstruction* new_red = vector_map_->Get(instruction); + vector_permanent_map_->Put(new_red, vector_map_->Get(redit->second)); + vector_permanent_map_->Overwrite(redit->second, new_red); + } + return true; + } + return false; + } // Branch back okay. if (instruction->IsGoto()) { return true; @@ -965,6 +1003,21 @@ bool HLoopOptimization::VectorizeUse(LoopNode* node, } return true; } + } else if (instruction->IsPhi()) { + // Accept particular phi operations. + if (reductions_->find(instruction) != reductions_->end()) { + // Deal with vector restrictions. + if (HasVectorRestrictions(restrictions, kNoReduction)) { + return false; + } + // Accept a reduction. + if (generate_code) { + GenerateVecReductionPhi(instruction->AsPhi()); + } + return true; + } + // TODO: accept right-hand-side induction? + return false; } else if (instruction->IsTypeConversion()) { // Accept particular type conversions. 
HTypeConversion* conversion = instruction->AsTypeConversion(); @@ -1155,14 +1208,14 @@ bool HLoopOptimization::TrySetVectorType(Primitive::Type type, uint64_t* restric switch (type) { case Primitive::kPrimBoolean: case Primitive::kPrimByte: - *restrictions |= kNoDiv; + *restrictions |= kNoDiv | kNoReduction; return TrySetVectorLength(8); case Primitive::kPrimChar: case Primitive::kPrimShort: - *restrictions |= kNoDiv | kNoStringCharAt; + *restrictions |= kNoDiv | kNoStringCharAt | kNoReduction; return TrySetVectorLength(4); case Primitive::kPrimInt: - *restrictions |= kNoDiv; + *restrictions |= kNoDiv | kNoReduction; return TrySetVectorLength(2); default: break; @@ -1174,11 +1227,11 @@ bool HLoopOptimization::TrySetVectorType(Primitive::Type type, uint64_t* restric switch (type) { case Primitive::kPrimBoolean: case Primitive::kPrimByte: - *restrictions |= kNoDiv; + *restrictions |= kNoDiv | kNoReduction; return TrySetVectorLength(16); case Primitive::kPrimChar: case Primitive::kPrimShort: - *restrictions |= kNoDiv; + *restrictions |= kNoDiv | kNoReduction; return TrySetVectorLength(8); case Primitive::kPrimInt: *restrictions |= kNoDiv; @@ -1187,8 +1240,10 @@ bool HLoopOptimization::TrySetVectorType(Primitive::Type type, uint64_t* restric *restrictions |= kNoDiv | kNoMul | kNoMinMax; return TrySetVectorLength(2); case Primitive::kPrimFloat: + *restrictions |= kNoReduction; return TrySetVectorLength(4); case Primitive::kPrimDouble: + *restrictions |= kNoReduction; return TrySetVectorLength(2); default: return false; @@ -1200,11 +1255,12 @@ bool HLoopOptimization::TrySetVectorType(Primitive::Type type, uint64_t* restric switch (type) { case Primitive::kPrimBoolean: case Primitive::kPrimByte: - *restrictions |= kNoMul | kNoDiv | kNoShift | kNoAbs | kNoSignedHAdd | kNoUnroundedHAdd; + *restrictions |= + kNoMul | kNoDiv | kNoShift | kNoAbs | kNoSignedHAdd | kNoUnroundedHAdd | kNoReduction; return TrySetVectorLength(16); case Primitive::kPrimChar: case Primitive::kPrimShort: - *restrictions |= kNoDiv | kNoAbs | kNoSignedHAdd | kNoUnroundedHAdd; + *restrictions |= kNoDiv | kNoAbs | kNoSignedHAdd | kNoUnroundedHAdd | kNoReduction; return TrySetVectorLength(8); case Primitive::kPrimInt: *restrictions |= kNoDiv; @@ -1213,10 +1269,10 @@ bool HLoopOptimization::TrySetVectorType(Primitive::Type type, uint64_t* restric *restrictions |= kNoMul | kNoDiv | kNoShr | kNoAbs | kNoMinMax; return TrySetVectorLength(2); case Primitive::kPrimFloat: - *restrictions |= kNoMinMax; // -0.0 vs +0.0 + *restrictions |= kNoMinMax | kNoReduction; // minmax: -0.0 vs +0.0 return TrySetVectorLength(4); case Primitive::kPrimDouble: - *restrictions |= kNoMinMax; // -0.0 vs +0.0 + *restrictions |= kNoMinMax | kNoReduction; // minmax: -0.0 vs +0.0 return TrySetVectorLength(2); default: break; @@ -1228,23 +1284,23 @@ bool HLoopOptimization::TrySetVectorType(Primitive::Type type, uint64_t* restric switch (type) { case Primitive::kPrimBoolean: case Primitive::kPrimByte: - *restrictions |= kNoDiv; + *restrictions |= kNoDiv | kNoReduction; return TrySetVectorLength(16); case Primitive::kPrimChar: case Primitive::kPrimShort: - *restrictions |= kNoDiv | kNoStringCharAt; + *restrictions |= kNoDiv | kNoStringCharAt | kNoReduction; return TrySetVectorLength(8); case Primitive::kPrimInt: - *restrictions |= kNoDiv; + *restrictions |= kNoDiv | kNoReduction; return TrySetVectorLength(4); case Primitive::kPrimLong: - *restrictions |= kNoDiv; + *restrictions |= kNoDiv | kNoReduction; return TrySetVectorLength(2); case Primitive::kPrimFloat: - 
*restrictions |= kNoMinMax; // min/max(x, NaN) + *restrictions |= kNoMinMax | kNoReduction; // min/max(x, NaN) return TrySetVectorLength(4); case Primitive::kPrimDouble: - *restrictions |= kNoMinMax; // min/max(x, NaN) + *restrictions |= kNoMinMax | kNoReduction; // min/max(x, NaN) return TrySetVectorLength(2); default: break; @@ -1256,23 +1312,23 @@ bool HLoopOptimization::TrySetVectorType(Primitive::Type type, uint64_t* restric switch (type) { case Primitive::kPrimBoolean: case Primitive::kPrimByte: - *restrictions |= kNoDiv; + *restrictions |= kNoDiv | kNoReduction; return TrySetVectorLength(16); case Primitive::kPrimChar: case Primitive::kPrimShort: - *restrictions |= kNoDiv | kNoStringCharAt; + *restrictions |= kNoDiv | kNoStringCharAt | kNoReduction; return TrySetVectorLength(8); case Primitive::kPrimInt: - *restrictions |= kNoDiv; + *restrictions |= kNoDiv | kNoReduction; return TrySetVectorLength(4); case Primitive::kPrimLong: - *restrictions |= kNoDiv; + *restrictions |= kNoDiv | kNoReduction; return TrySetVectorLength(2); case Primitive::kPrimFloat: - *restrictions |= kNoMinMax; // min/max(x, NaN) + *restrictions |= kNoMinMax | kNoReduction; // min/max(x, NaN) return TrySetVectorLength(4); case Primitive::kPrimDouble: - *restrictions |= kNoMinMax; // min/max(x, NaN) + *restrictions |= kNoMinMax | kNoReduction; // min/max(x, NaN) return TrySetVectorLength(2); default: break; @@ -1305,9 +1361,16 @@ void HLoopOptimization::GenerateVecInv(HInstruction* org, Primitive::Type type) return; } // In vector code, explicit scalar expansion is needed. - HInstruction* vector = new (global_allocator_) HVecReplicateScalar( - global_allocator_, org, type, vector_length_); - vector_map_->Put(org, Insert(vector_preheader_, vector)); + HInstruction* vector = nullptr; + auto it = vector_permanent_map_->find(org); + if (it != vector_permanent_map_->end()) { + vector = it->second; // reuse during unrolling + } else { + vector = new (global_allocator_) HVecReplicateScalar( + global_allocator_, org, type, vector_length_); + vector_permanent_map_->Put(org, Insert(vector_preheader_, vector)); + } + vector_map_->Put(org, vector); } } @@ -1362,6 +1425,78 @@ void HLoopOptimization::GenerateVecMem(HInstruction* org, vector_map_->Put(org, vector); } +void HLoopOptimization::GenerateVecReductionPhi(HPhi* phi) { + DCHECK(reductions_->find(phi) != reductions_->end()); + DCHECK(reductions_->Get(phi->InputAt(1)) == phi); + HInstruction* vector = nullptr; + if (vector_mode_ == kSequential) { + HPhi* new_phi = new (global_allocator_) HPhi( + global_allocator_, kNoRegNumber, 0, phi->GetType()); + vector_header_->AddPhi(new_phi); + vector = new_phi; + } else { + // Link vector reduction back to prior unrolled update, or a first phi. + auto it = vector_permanent_map_->find(phi); + if (it != vector_permanent_map_->end()) { + vector = it->second; + } else { + HPhi* new_phi = new (global_allocator_) HPhi( + global_allocator_, kNoRegNumber, 0, HVecOperation::kSIMDType); + vector_header_->AddPhi(new_phi); + vector = new_phi; + } + } + vector_map_->Put(phi, vector); +} + +void HLoopOptimization::GenerateVecReductionPhiInputs(HPhi* phi, HInstruction* reduction) { + HInstruction* new_phi = vector_map_->Get(phi); + HInstruction* new_init = reductions_->Get(phi); + HInstruction* new_red = vector_map_->Get(reduction); + // Link unrolled vector loop back to new phi. + for (; !new_phi->IsPhi(); new_phi = vector_permanent_map_->Get(new_phi)) { + DCHECK(new_phi->IsVecOperation()); + } + // Prepare the new initialization. 
+ if (vector_mode_ == kVector) { + // Generate a [initial, 0, .., 0] vector. + new_init = Insert( + vector_preheader_, + new (global_allocator_) HVecSetScalars( + global_allocator_, &new_init, phi->GetType(), vector_length_, 1)); + } else { + new_init = ReduceAndExtractIfNeeded(new_init); + } + // Set the phi inputs. + DCHECK(new_phi->IsPhi()); + new_phi->AsPhi()->AddInput(new_init); + new_phi->AsPhi()->AddInput(new_red); + // New feed value for next phi (safe mutation in iteration). + reductions_->find(phi)->second = new_phi; +} + +HInstruction* HLoopOptimization::ReduceAndExtractIfNeeded(HInstruction* instruction) { + if (instruction->IsPhi()) { + HInstruction* input = instruction->InputAt(1); + if (input->IsVecOperation()) { + Primitive::Type type = input->AsVecOperation()->GetPackedType(); + HBasicBlock* exit = instruction->GetBlock()->GetSuccessors()[0]; + // Generate a vector reduction and scalar extract + // x = REDUCE( [x_1, .., x_n] ) + // y = x_1 + // along the exit of the defining loop. + HVecReduce::ReductionKind kind = GetReductionKind(input); + HInstruction* reduce = new (global_allocator_) HVecReduce( + global_allocator_, instruction, type, vector_length_, kind); + exit->InsertInstructionBefore(reduce, exit->GetFirstInstruction()); + instruction = new (global_allocator_) HVecExtractScalar( + global_allocator_, reduce, type, vector_length_, 0); + exit->InsertInstructionAfter(instruction, reduce); + } + } + return instruction; +} + #define GENERATE_VEC(x, y) \ if (vector_mode_ == kVector) { \ vector = (x); \ @@ -1542,10 +1677,9 @@ bool HLoopOptimization::VectorizeHalvingAddIdiom(LoopNode* node, // Test for top level arithmetic shift right x >> 1 or logical shift right x >>> 1 // (note whether the sign bit in wider precision is shifted in has no effect // on the narrow precision computed by the idiom). - int64_t distance = 0; if ((instruction->IsShr() || instruction->IsUShr()) && - IsInt64AndGet(instruction->InputAt(1), /*out*/ &distance) && distance == 1) { + IsInt64Value(instruction->InputAt(1), 1)) { // Test for (a + b + c) >> 1 for optional constant c. HInstruction* a = nullptr; HInstruction* b = nullptr; diff --git a/compiler/optimizing/loop_optimization.h b/compiler/optimizing/loop_optimization.h index 49be8a3fb4..ba9126c5f6 100644 --- a/compiler/optimizing/loop_optimization.h +++ b/compiler/optimizing/loop_optimization.h @@ -62,17 +62,18 @@ class HLoopOptimization : public HOptimization { * Vectorization restrictions (bit mask). 
*/ enum VectorRestrictions { - kNone = 0, // no restrictions - kNoMul = 1, // no multiplication - kNoDiv = 2, // no division - kNoShift = 4, // no shift - kNoShr = 8, // no arithmetic shift right - kNoHiBits = 16, // "wider" operations cannot bring in higher order bits - kNoSignedHAdd = 32, // no signed halving add - kNoUnroundedHAdd = 64, // no unrounded halving add - kNoAbs = 128, // no absolute value - kNoMinMax = 256, // no min/max - kNoStringCharAt = 512, // no StringCharAt + kNone = 0, // no restrictions + kNoMul = 1 << 0, // no multiplication + kNoDiv = 1 << 1, // no division + kNoShift = 1 << 2, // no shift + kNoShr = 1 << 3, // no arithmetic shift right + kNoHiBits = 1 << 4, // "wider" operations cannot bring in higher order bits + kNoSignedHAdd = 1 << 5, // no signed halving add + kNoUnroundedHAdd = 1 << 6, // no unrounded halving add + kNoAbs = 1 << 7, // no absolute value + kNoMinMax = 1 << 8, // no min/max + kNoStringCharAt = 1 << 9, // no StringCharAt + kNoReduction = 1 << 10, // no reduction }; /* @@ -155,6 +156,9 @@ class HLoopOptimization : public HOptimization { HInstruction* opb, HInstruction* offset, Primitive::Type type); + void GenerateVecReductionPhi(HPhi* phi); + void GenerateVecReductionPhiInputs(HPhi* phi, HInstruction* reduction); + HInstruction* ReduceAndExtractIfNeeded(HInstruction* instruction); void GenerateVecOp(HInstruction* org, HInstruction* opa, HInstruction* opb, @@ -253,6 +257,10 @@ class HLoopOptimization : public HOptimization { // Contents reside in phase-local heap memory. ArenaSafeMap<HInstruction*, HInstruction*>* vector_map_; + // Permanent mapping used during vectorization synthesis. + // Contents reside in phase-local heap memory. + ArenaSafeMap<HInstruction*, HInstruction*>* vector_permanent_map_; + // Temporary vectorization bookkeeping. VectorMode vector_mode_; // synthesis mode HBasicBlock* vector_preheader_; // preheader of the new loop diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h index f60d532c37..869fdd4182 100644 --- a/compiler/optimizing/nodes.h +++ b/compiler/optimizing/nodes.h @@ -1374,7 +1374,8 @@ class HLoopInformationOutwardIterator : public ValueObject { M(UShr, BinaryOperation) \ M(Xor, BinaryOperation) \ M(VecReplicateScalar, VecUnaryOperation) \ - M(VecSumReduce, VecUnaryOperation) \ + M(VecExtractScalar, VecUnaryOperation) \ + M(VecReduce, VecUnaryOperation) \ M(VecCnv, VecUnaryOperation) \ M(VecNeg, VecUnaryOperation) \ M(VecAbs, VecUnaryOperation) \ @@ -7030,6 +7031,17 @@ inline bool IsInt64AndGet(HInstruction* instruction, /*out*/ int64_t* value) { return false; } +// Returns true iff instruction is the given integral constant. +inline bool IsInt64Value(HInstruction* instruction, int64_t value) { + int64_t val = 0; + return IsInt64AndGet(instruction, &val) && val == value; +} + +// Returns true iff instruction is a zero bit pattern. +inline bool IsZeroBitPattern(HInstruction* instruction) { + return instruction->IsConstant() && instruction->AsConstant()->IsZeroBitPattern(); +} + #define INSTRUCTION_TYPE_CHECK(type, super) \ inline bool HInstruction::Is##type() const { return GetKind() == k##type; } \ inline const H##type* HInstruction::As##type() const { \ diff --git a/compiler/optimizing/nodes_vector.h b/compiler/optimizing/nodes_vector.h index 6261171a00..886d75e5c7 100644 --- a/compiler/optimizing/nodes_vector.h +++ b/compiler/optimizing/nodes_vector.h @@ -63,6 +63,10 @@ class Alignment { // GetVectorLength() x GetPackedType() operations simultaneously. 
 class HVecOperation : public HVariableInputSizeInstruction {
  public:
+  // A SIMD operation looks like a FPU location.
+  // TODO: we could introduce SIMD types in HIR.
+  static constexpr Primitive::Type kSIMDType = Primitive::kPrimDouble;
+
   HVecOperation(ArenaAllocator* arena,
                 Primitive::Type packed_type,
                 SideEffects side_effects,
@@ -89,10 +93,9 @@ class HVecOperation : public HVariableInputSizeInstruction {
     return vector_length_ * Primitive::ComponentSize(GetPackedType());
   }
 
-  // Returns the type of the vector operation: a SIMD operation looks like a FPU location.
-  // TODO: we could introduce SIMD types in HIR.
+  // Returns the type of the vector operation.
   Primitive::Type GetType() const OVERRIDE {
-    return Primitive::kPrimDouble;
+    return kSIMDType;
   }
 
   // Returns the true component type packed in a vector.
@@ -220,8 +223,11 @@ class HVecMemoryOperation : public HVecOperation {
   DISALLOW_COPY_AND_ASSIGN(HVecMemoryOperation);
 };
 
-// Packed type consistency checker (same vector length integral types may mix freely).
+// Packed type consistency checker ("same vector length" integral types may mix freely).
 inline static bool HasConsistentPackedTypes(HInstruction* input, Primitive::Type type) {
+  if (input->IsPhi()) {
+    return input->GetType() == HVecOperation::kSIMDType;  // carries SIMD
+  }
   DCHECK(input->IsVecOperation());
   Primitive::Type input_type = input->AsVecOperation()->GetPackedType();
   switch (input_type) {
@@ -265,27 +271,77 @@ class HVecReplicateScalar FINAL : public HVecUnaryOperation {
   DISALLOW_COPY_AND_ASSIGN(HVecReplicateScalar);
 };
 
-// Sum-reduces the given vector into a shorter vector (m < n) or scalar (m = 1),
-// viz. sum-reduce[ x1, .. , xn ] = [ y1, .., ym ], where yi = sum_j x_j.
-class HVecSumReduce FINAL : public HVecUnaryOperation {
-  HVecSumReduce(ArenaAllocator* arena,
-                HInstruction* input,
-                Primitive::Type packed_type,
-                size_t vector_length,
-                uint32_t dex_pc = kNoDexPc)
+// Extracts a particular scalar from the given vector,
+// viz. extract[ x1, .. , xn ] = x_i.
+//
+// TODO: for now only i == 1 case supported.
+class HVecExtractScalar FINAL : public HVecUnaryOperation {
+ public:
+  HVecExtractScalar(ArenaAllocator* arena,
+                    HInstruction* input,
+                    Primitive::Type packed_type,
+                    size_t vector_length,
+                    size_t index,
+                    uint32_t dex_pc = kNoDexPc)
       : HVecUnaryOperation(arena, input, packed_type, vector_length, dex_pc) {
     DCHECK(HasConsistentPackedTypes(input, packed_type));
+    DCHECK_LT(index, vector_length);
+    DCHECK_EQ(index, 0u);
+  }
+
+  // Yields a single component in the vector.
+  Primitive::Type GetType() const OVERRIDE {
+    return GetPackedType();
+  }
+
+  // An extract needs to stay in place, since SIMD registers are not
+  // kept alive across vector loop boundaries (yet).
+  bool CanBeMoved() const OVERRIDE { return false; }
+
+  DECLARE_INSTRUCTION(VecExtractScalar);
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(HVecExtractScalar);
+};
+
+// Reduces the given vector into the first element as sum/min/max,
+// viz. sum-reduce[ x1, .. , xn ] = [ y, ---- ], where y = sum xi
+// and the "-" denotes "don't care" (implementation dependent).
+class HVecReduce FINAL : public HVecUnaryOperation {
+ public:
+  enum ReductionKind {
+    kSum = 1,
+    kMin = 2,
+    kMax = 3
+  };
+
+  HVecReduce(ArenaAllocator* arena,
+             HInstruction* input,
+             Primitive::Type packed_type,
+             size_t vector_length,
+             ReductionKind kind,
+             uint32_t dex_pc = kNoDexPc)
+      : HVecUnaryOperation(arena, input, packed_type, vector_length, dex_pc),
+        kind_(kind) {
+    DCHECK(HasConsistentPackedTypes(input, packed_type));
   }
 
-  // TODO: probably integral promotion
-  Primitive::Type GetType() const OVERRIDE { return GetPackedType(); }
+  ReductionKind GetKind() const { return kind_; }
 
   bool CanBeMoved() const OVERRIDE { return true; }
 
-  DECLARE_INSTRUCTION(VecSumReduce);
+  bool InstructionDataEquals(const HInstruction* other) const OVERRIDE {
+    DCHECK(other->IsVecReduce());
+    const HVecReduce* o = other->AsVecReduce();
+    return HVecOperation::InstructionDataEquals(o) && GetKind() == o->GetKind();
+  }
+
+  DECLARE_INSTRUCTION(VecReduce);
 
  private:
-  DISALLOW_COPY_AND_ASSIGN(HVecSumReduce);
+  const ReductionKind kind_;
+
+  DISALLOW_COPY_AND_ASSIGN(HVecReduce);
 };
 
 // Converts every component in the vector,
@@ -754,20 +810,23 @@ class HVecUShr FINAL : public HVecBinaryOperation {
 
 //
 // Assigns the given scalar elements to a vector,
-// viz. set( array(x1, .., xn) ) = [ x1, .. , xn ].
+// viz. set( array(x1, .., xn) ) = [ x1, .. , xn ] if n == m,
+//      set( array(x1, .., xm) ) = [ x1, .. , xm, 0, .., 0 ] if m < n.
 class HVecSetScalars FINAL : public HVecOperation {
+ public:
   HVecSetScalars(ArenaAllocator* arena,
                  HInstruction** scalars,  // array
                  Primitive::Type packed_type,
                  size_t vector_length,
+                 size_t number_of_scalars,
                  uint32_t dex_pc = kNoDexPc)
       : HVecOperation(arena,
                       packed_type,
                       SideEffects::None(),
-                      /* number_of_inputs */ vector_length,
+                      number_of_scalars,
                       vector_length,
                       dex_pc) {
-    for (size_t i = 0; i < vector_length; i++) {
+    for (size_t i = 0; i < number_of_scalars; i++) {
       DCHECK(!scalars[i]->IsVecOperation());
       SetRawInputAt(0, scalars[i]);
     }
diff --git a/compiler/optimizing/nodes_vector_test.cc b/compiler/optimizing/nodes_vector_test.cc
index 0238ea4602..5a56a2c210 100644
--- a/compiler/optimizing/nodes_vector_test.cc
+++ b/compiler/optimizing/nodes_vector_test.cc
@@ -332,4 +332,32 @@ TEST_F(NodesVectorTest, VectorOperationMattersOnMultiplyAccumulate) {
   EXPECT_FALSE(v1->Equals(v3));  // different vector lengths
 }
 
+TEST_F(NodesVectorTest, VectorKindMattersOnReduce) {
+  HVecOperation* v0 = new (&allocator_)
+      HVecReplicateScalar(&allocator_, parameter_, Primitive::kPrimInt, 4);
+
+  HVecReduce* v1 = new (&allocator_) HVecReduce(
+      &allocator_, v0, Primitive::kPrimInt, 4, HVecReduce::kSum);
+  HVecReduce* v2 = new (&allocator_) HVecReduce(
+      &allocator_, v0, Primitive::kPrimInt, 4, HVecReduce::kMin);
+  HVecReduce* v3 = new (&allocator_) HVecReduce(
+      &allocator_, v0, Primitive::kPrimInt, 4, HVecReduce::kMax);
+
+  EXPECT_FALSE(v0->CanBeMoved());
+  EXPECT_TRUE(v1->CanBeMoved());
+  EXPECT_TRUE(v2->CanBeMoved());
+  EXPECT_TRUE(v3->CanBeMoved());
+
+  EXPECT_EQ(HVecReduce::kSum, v1->GetKind());
+  EXPECT_EQ(HVecReduce::kMin, v2->GetKind());
+  EXPECT_EQ(HVecReduce::kMax, v3->GetKind());
+
+  EXPECT_TRUE(v1->Equals(v1));
+  EXPECT_TRUE(v2->Equals(v2));
+  EXPECT_TRUE(v3->Equals(v3));
+
+  EXPECT_FALSE(v1->Equals(v2));  // different kinds
+  EXPECT_FALSE(v1->Equals(v3));
+}
+
 }  // namespace art
diff --git a/compiler/optimizing/scheduler_arm64.cc b/compiler/optimizing/scheduler_arm64.cc
index 510619faf9..1d9d28ab24 100644
--- a/compiler/optimizing/scheduler_arm64.cc
+++ b/compiler/optimizing/scheduler_arm64.cc
@@ -215,12 +215,12 @@ void SchedulingLatencyVisitorARM64::VisitVecReplicateScalar(
   last_visited_latency_ = kArm64SIMDReplicateOpLatency;
 }
 
-void SchedulingLatencyVisitorARM64::VisitVecSetScalars(HVecSetScalars* instr) {
-  LOG(FATAL) << "Unsupported SIMD instruction " << instr->GetId();
+void SchedulingLatencyVisitorARM64::VisitVecExtractScalar(HVecExtractScalar* instr) {
+  HandleSimpleArithmeticSIMD(instr);
 }
 
-void SchedulingLatencyVisitorARM64::VisitVecSumReduce(HVecSumReduce* instr) {
-  LOG(FATAL) << "Unsupported SIMD instruction " << instr->GetId();
+void SchedulingLatencyVisitorARM64::VisitVecReduce(HVecReduce* instr) {
+  HandleSimpleArithmeticSIMD(instr);
 }
 
 void SchedulingLatencyVisitorARM64::VisitVecCnv(HVecCnv* instr ATTRIBUTE_UNUSED) {
@@ -283,8 +283,8 @@ void SchedulingLatencyVisitorARM64::VisitVecAnd(HVecAnd* instr ATTRIBUTE_UNUSED)
   last_visited_latency_ = kArm64SIMDIntegerOpLatency;
 }
 
-void SchedulingLatencyVisitorARM64::VisitVecAndNot(HVecAndNot* instr) {
-  LOG(FATAL) << "Unsupported SIMD instruction " << instr->GetId();
+void SchedulingLatencyVisitorARM64::VisitVecAndNot(HVecAndNot* instr ATTRIBUTE_UNUSED) {
+  last_visited_latency_ = kArm64SIMDIntegerOpLatency;
 }
 
 void SchedulingLatencyVisitorARM64::VisitVecOr(HVecOr* instr ATTRIBUTE_UNUSED) {
@@ -307,6 +307,10 @@ void SchedulingLatencyVisitorARM64::VisitVecUShr(HVecUShr* instr) {
   HandleSimpleArithmeticSIMD(instr);
 }
 
+void SchedulingLatencyVisitorARM64::VisitVecSetScalars(HVecSetScalars* instr) {
+  HandleSimpleArithmeticSIMD(instr);
+}
+
 void SchedulingLatencyVisitorARM64::VisitVecMultiplyAccumulate(
     HVecMultiplyAccumulate* instr ATTRIBUTE_UNUSED) {
   last_visited_latency_ = kArm64SIMDMulIntegerLatency;
diff --git a/compiler/optimizing/scheduler_arm64.h b/compiler/optimizing/scheduler_arm64.h
index 63d5b7d6b6..e1a80ec6fb 100644
--- a/compiler/optimizing/scheduler_arm64.h
+++ b/compiler/optimizing/scheduler_arm64.h
@@ -83,8 +83,8 @@ class SchedulingLatencyVisitorARM64 : public SchedulingLatencyVisitor {
   M(SuspendCheck       , unused)                   \
   M(TypeConversion     , unused)                   \
   M(VecReplicateScalar , unused)                   \
-  M(VecSetScalars      , unused)                   \
-  M(VecSumReduce       , unused)                   \
+  M(VecExtractScalar   , unused)                   \
+  M(VecReduce          , unused)                   \
   M(VecCnv             , unused)                   \
   M(VecNeg             , unused)                   \
   M(VecAbs             , unused)                   \
@@ -103,6 +103,7 @@ class SchedulingLatencyVisitorARM64 : public SchedulingLatencyVisitor {
   M(VecShl             , unused)                   \
   M(VecShr             , unused)                   \
   M(VecUShr            , unused)                   \
+  M(VecSetScalars      , unused)                   \
   M(VecMultiplyAccumulate, unused)                 \
   M(VecLoad            , unused)                   \
   M(VecStore           , unused)
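
Illustrative note (not part of the CL): the snippet below sketches how the new nodes introduced above are meant to compose for a simple integer sum reduction such as for (int i = 0; i < n; ++i) s += a[i];, using the constructor signatures from nodes_vector.h. The names allocator, init and vacc_phi, and the vector length of 4, are hypothetical placeholders for state the vectorizer already tracks; the actual synthesis is done by GenerateVecReductionPhiInputs and ReduceAndExtractIfNeeded shown earlier.

// Sketch only: composing the new HIR nodes for an int sum reduction (4 lanes).
// 'allocator', 'init' (scalar initial value) and 'vacc_phi' (loop-carried
// vector accumulator phi) are assumed to be provided by the vectorizer.

// Preheader: seed the vector accumulator with [init, 0, 0, 0];
// 'seed' becomes the preheader input of 'vacc_phi'.
HInstruction* inputs[1] = { init };
HVecSetScalars* seed = new (allocator) HVecSetScalars(
    allocator, inputs, Primitive::kPrimInt, /*vector_length*/ 4, /*number_of_scalars*/ 1);

// Loop body (not shown): vacc_phi accumulates a packed add of a[i .. i+3].

// Loop exit: fold all lanes into lane 0, then extract lane 0 as a plain int.
HVecReduce* reduce = new (allocator) HVecReduce(
    allocator, vacc_phi, Primitive::kPrimInt, /*vector_length*/ 4, HVecReduce::kSum);
HVecExtractScalar* sum = new (allocator) HVecExtractScalar(
    allocator, reduce, Primitive::kPrimInt, /*vector_length*/ 4, /*index*/ 0);

Because SIMD registers are not kept alive across vector loop boundaries yet, the reduce/extract pair is placed in the loop exit block and HVecExtractScalar reports CanBeMoved() as false, as seen in the node definitions above.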