summaryrefslogtreecommitdiff
path: root/libart/src
diff options
context:
space:
mode:
authorRock.Yeh <rock.yeh@mediatek.com>2020-12-29 16:49:33 +0800
committeralk3pInjection <webmaster@raspii.tech>2021-09-27 21:17:05 +0800
commitfb154ee319480b6eb7f930243a6e0113c2581277 (patch)
treedda8e72e1eea0a0f5173a123611c9e4677fa93f2 /libart/src
parenteef9acb4ef362082cd7d4139011e1d8836343897 (diff)
Add newStringFromUtf8Bytes native implementation.
Port the UTF-8 part of StringFactory.newStringFromBytes from libcore to a native implementation. This slightly improves the UX score of Antutu v8. Only the UX score was measured; each configuration was run 10 times and the total scores were averaged. Benchmark results - before (Java implementation): 10 times avg.: 13133.9 - after (Native implementation): 10 times avg.: 13324.2 Diff.: +1.4% Bug: 176514597 Test: cts Change-Id: I6b601c09663b21fdacde7f14b0db1ac4f0a94c0f
Diffstat (limited to 'libart/src')
-rw-r--r--libart/src/main/java/java/lang/StringFactory.java153
1 files changed, 4 insertions, 149 deletions
diff --git a/libart/src/main/java/java/lang/StringFactory.java b/libart/src/main/java/java/lang/StringFactory.java
index 6ef664b150..ea80b9ff52 100644
--- a/libart/src/main/java/java/lang/StringFactory.java
+++ b/libart/src/main/java/java/lang/StringFactory.java
@@ -65,14 +65,6 @@ public final class StringFactory {
return newStringFromBytes(data, 0, data.length, Charset.forNameUEE(charsetName));
}
- private static final int[] TABLE_UTF8_NEEDED = new int[] {
- // 0 1 2 3 4 5 6 7 8 9 a b c d e f
- 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xc0 - 0xcf
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xd0 - 0xdf
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xe0 - 0xef
- 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xf0 - 0xff
- };
-
// TODO: Implement this method natively.
public static String newStringFromBytes(byte[] data, int offset, int byteCount, Charset charset) {
if ((offset | byteCount) < 0 || byteCount > data.length - offset) {
@@ -85,147 +77,7 @@ public final class StringFactory {
// We inline UTF-8, ISO-8859-1, and US-ASCII decoders for speed.
String canonicalCharsetName = charset.name();
if (canonicalCharsetName.equals("UTF-8")) {
- /*
- This code converts a UTF-8 byte sequence to a Java String (UTF-16).
- It implements the W3C recommended UTF-8 decoder.
- https://www.w3.org/TR/encoding/#utf-8-decoder
-
- Unicode 3.2 Well-Formed UTF-8 Byte Sequences
- Code Points First Second Third Fourth
- U+0000..U+007F 00..7F
- U+0080..U+07FF C2..DF 80..BF
- U+0800..U+0FFF E0 A0..BF 80..BF
- U+1000..U+CFFF E1..EC 80..BF 80..BF
- U+D000..U+D7FF ED 80..9F 80..BF
- U+E000..U+FFFF EE..EF 80..BF 80..BF
- U+10000..U+3FFFF F0 90..BF 80..BF 80..BF
- U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
- U+100000..U+10FFFF F4 80..8F 80..BF 80..BF
-
- Please refer to Unicode as the authority.
- p.126 Table 3-7 in http://www.unicode.org/versions/Unicode10.0.0/ch03.pdf
-
- Handling Malformed Input
- The maximal subpart should be replaced by a single U+FFFD. Maximal subpart is
- the longest code unit subsequence starting at an unconvertible offset that is either
- 1) the initial subsequence of a well-formed code unit sequence, or
- 2) a subsequence of length one:
- One U+FFFD should be emitted for every sequence of bytes that is an incomplete prefix
- of a valid sequence, and with the conversion to restart after the incomplete sequence.
-
- For example, in byte sequence "41 C0 AF 41 F4 80 80 41", the maximal subparts are
- "C0", "AF", and "F4 80 80". "F4 80 80" can be the initial subsequence of "F4 80 80 80",
- but "C0" can't be the initial subsequence of any well-formed code unit sequence.
- Thus, the output should be "A\ufffd\ufffdA\ufffdA".
-
- Please refer to section "Best Practices for Using U+FFFD." in
- http://www.unicode.org/versions/Unicode10.0.0/ch03.pdf
- */
- byte[] d = data;
- char[] v = new char[byteCount];
-
- int idx = offset;
- int last = offset + byteCount;
- int s = 0;
-
- int codePoint = 0;
- int utf8BytesSeen = 0;
- int utf8BytesNeeded = 0;
- int lowerBound = 0x80;
- int upperBound = 0xbf;
-
- while (idx < last) {
- int b = d[idx++] & 0xff;
- if (utf8BytesNeeded == 0) {
- if ((b & 0x80) == 0) { // ASCII char. 0xxxxxxx
- v[s++] = (char) b;
- continue;
- }
-
- if ((b & 0x40) == 0) { // 10xxxxxx is illegal as first byte
- v[s++] = REPLACEMENT_CHAR;
- continue;
- }
-
- // 11xxxxxx
- int tableLookupIndex = b & 0x3f;
- utf8BytesNeeded = TABLE_UTF8_NEEDED[tableLookupIndex];
- if (utf8BytesNeeded == 0) {
- v[s++] = REPLACEMENT_CHAR;
- continue;
- }
-
- // utf8BytesNeeded
- // 1: b & 0x1f
- // 2: b & 0x0f
- // 3: b & 0x07
- codePoint = b & (0x3f >> utf8BytesNeeded);
- if (b == 0xe0) {
- lowerBound = 0xa0;
- } else if (b == 0xed) {
- upperBound = 0x9f;
- } else if (b == 0xf0) {
- lowerBound = 0x90;
- } else if (b == 0xf4) {
- upperBound = 0x8f;
- }
- } else {
- if (b < lowerBound || b > upperBound) {
- // The bytes seen are ill-formed. Substitute them with U+FFFD
- v[s++] = REPLACEMENT_CHAR;
- codePoint = 0;
- utf8BytesNeeded = 0;
- utf8BytesSeen = 0;
- lowerBound = 0x80;
- upperBound = 0xbf;
- /*
- * According to the Unicode Standard,
- * "a UTF-8 conversion process is required to never consume well-formed
- * subsequences as part of its error handling for ill-formed subsequences"
- * The current byte could be part of well-formed subsequences. Reduce the
- * index by 1 to parse it in next loop.
- */
- idx--;
- continue;
- }
-
- lowerBound = 0x80;
- upperBound = 0xbf;
- codePoint = (codePoint << 6) | (b & 0x3f);
- utf8BytesSeen++;
- if (utf8BytesNeeded != utf8BytesSeen) {
- continue;
- }
-
- // Encode chars from U+10000 up as surrogate pairs
- if (codePoint < 0x10000) {
- v[s++] = (char) codePoint;
- } else {
- v[s++] = (char) ((codePoint >> 10) + 0xd7c0);
- v[s++] = (char) ((codePoint & 0x3ff) + 0xdc00);
- }
-
- utf8BytesSeen = 0;
- utf8BytesNeeded = 0;
- codePoint = 0;
- }
- }
-
- // The bytes seen are ill-formed. Substitute them by U+FFFD
- if (utf8BytesNeeded != 0) {
- v[s++] = REPLACEMENT_CHAR;
- }
-
- if (s == byteCount) {
- // We guessed right, so we can use our temporary array as-is.
- value = v;
- length = s;
- } else {
- // Our temporary array was too big, so reallocate and copy.
- value = new char[s];
- length = s;
- System.arraycopy(v, 0, value, 0, s);
- }
+ return newStringFromUtf8Bytes(data, offset, byteCount);
} else if (canonicalCharsetName.equals("ISO-8859-1")) {
value = new char[byteCount];
length = byteCount;
@@ -267,6 +119,9 @@ public final class StringFactory {
@FastNative
public static native String newStringFromString(String toCopy);
+ @FastNative
+ public static native String newStringFromUtf8Bytes(byte[] data, int offset, int byteCount);
+
public static String newStringFromStringBuffer(StringBuffer stringBuffer) {
synchronized (stringBuffer) {
return newStringFromChars(stringBuffer.getValue(), 0, stringBuffer.length());