summaryrefslogtreecommitdiff
path: root/compiler/optimizing/codegen_test_utils.h
diff options
context:
space:
mode:
author Usama Arif <usama.arif@linaro.org> 2019-11-11 15:29:59 +0000
committer Vladimir Marko <vmarko@google.com> 2019-11-28 09:51:05 +0000
commit 457e9fa3833ef11530056d010f247ad087fd2184 (patch)
tree 54b8a9dcf44646c3e43a9085d581660c5d9a0132 /compiler/optimizing/codegen_test_utils.h
parent 17a39babb7f42cbe108d6fab2760cbdc68b821a2 (diff)
ARM64: FP16 greater/less/greaterEquals/lessEquals intrinsics for ARMv8
This CL implements intrinsics for greater, greaterEquals, less, lessEquals methods with ARMv8.2 FP16 instructions. This requires the ARMv8.2 AArch64 asimd half precision extension. The time required in milliseconds to execute the below code for the four intrinsics on Pixel3 is (The code below is for FP16.less but is similar for the rest of the intrinsics): - Java implementation libcore.util.FP16.less(): - big cluster only: 19876 - little cluster only: 47525 - arm64 Intrinsic implementation for less: - big cluster only: 14526 (~27% faster) - little cluster only: 45815 (~4% faster) - Java implementation libcore.util.FP16.lessEquals(): - big cluster only: 19856 - little cluster only: 47419 - arm64 Intrinsic implementation for lessEquals: - big cluster only: 14469 (~27% faster) - little cluster only: 45762 (~4% faster) - Java implementation libcore.util.FP16.greater(): - big cluster only: 19854 - little cluster only: 47623 - arm64 Intrinsic implementation for greater: - big cluster only: 14519 (~27% faster) - little cluster only: 45722 (~4% faster) - Java implementation libcore.util.FP16.greaterEquals(): - big cluster only: 19865 - little cluster only: 47216 - arm64 Intrinsic implementation for greaterEquals: - big cluster only: 14485 (~27% faster) - little cluster only: 45729 (~4% faster) public static boolean benchmarkComparison(){ boolean ret = false; long before = 0; long after = 0; before = System.currentTimeMillis(); for(long i = 0; i < 1e9; i++){ // FP16.toHalf(12.3) = 0x4a26, FP16.toHalf(12.4) = 0x4a33 // FP16.toHalf(-12.3) = 0xca26, FP16.toHalf(-12.4) = 0xca33 ret |= FP16.less((short) 0x4a26,(short) 0x4a33); ret |= FP16.less((short) 0x4a33,(short) 0x4a26); ret |= FP16.less((short) 0xca26,(short) 0xca33); ret |= FP16.less((short) 0xca33,(short) 0xca26); } after = System.currentTimeMillis(); System.out.println("Time of FP16.less (ms): " + (after - before)); System.out.println(ret); return ret; } Test: 580-fp16 Test: 
art/test/testrunner/run_build_test_target.py -j80 art-test-javac Change-Id: Id1a2c3e7328c82c798fcaf1fa74f5908a822cd0b
Diffstat (limited to 'compiler/optimizing/codegen_test_utils.h')
0 files changed, 0 insertions, 0 deletions