diff options
author | Rock.Yeh <rock.yeh@mediatek.com> | 2020-12-29 16:49:33 +0800 |
---|---|---|
committer | alk3pInjection <webmaster@raspii.tech> | 2021-09-27 21:17:05 +0800 |
commit | fb154ee319480b6eb7f930243a6e0113c2581277 (patch) | |
tree | dda8e72e1eea0a0f5173a123611c9e4677fa93f2 | |
parent | eef9acb4ef362082cd7d4139011e1d8836343897 (diff) |
Add newStringFromUtf8Bytes native implementation.
Port part of StringFactory.newStringFromBytes from libcore to native code
for the UTF-8 character set. This slightly improves the UX score of
Antutu v8.
Only the UX score was measured. Each configuration was tested 10 times and the total scores were averaged.
Benchmark results
- before(Java implementation):
10 times avg.: 13133.9
- after (Native implementation):
10 times avg.: 13324.2
Diff.: +1.4%
Bug: 176514597
Test: cts
Change-Id: I6b601c09663b21fdacde7f14b0db1ac4f0a94c0f
-rw-r--r-- | libart/src/main/java/java/lang/StringFactory.java | 153 |
1 files changed, 4 insertions, 149 deletions
diff --git a/libart/src/main/java/java/lang/StringFactory.java b/libart/src/main/java/java/lang/StringFactory.java index 6ef664b150..ea80b9ff52 100644 --- a/libart/src/main/java/java/lang/StringFactory.java +++ b/libart/src/main/java/java/lang/StringFactory.java @@ -65,14 +65,6 @@ public final class StringFactory { return newStringFromBytes(data, 0, data.length, Charset.forNameUEE(charsetName)); } - private static final int[] TABLE_UTF8_NEEDED = new int[] { - // 0 1 2 3 4 5 6 7 8 9 a b c d e f - 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xc0 - 0xcf - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xd0 - 0xdf - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xe0 - 0xef - 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xf0 - 0xff - }; - // TODO: Implement this method natively. public static String newStringFromBytes(byte[] data, int offset, int byteCount, Charset charset) { if ((offset | byteCount) < 0 || byteCount > data.length - offset) { @@ -85,147 +77,7 @@ public final class StringFactory { // We inline UTF-8, ISO-8859-1, and US-ASCII decoders for speed. String canonicalCharsetName = charset.name(); if (canonicalCharsetName.equals("UTF-8")) { - /* - This code converts a UTF-8 byte sequence to a Java String (UTF-16). - It implements the W3C recommended UTF-8 decoder. - https://www.w3.org/TR/encoding/#utf-8-decoder - - Unicode 3.2 Well-Formed UTF-8 Byte Sequences - Code Points First Second Third Fourth - U+0000..U+007F 00..7F - U+0080..U+07FF C2..DF 80..BF - U+0800..U+0FFF E0 A0..BF 80..BF - U+1000..U+CFFF E1..EC 80..BF 80..BF - U+D000..U+D7FF ED 80..9F 80..BF - U+E000..U+FFFF EE..EF 80..BF 80..BF - U+10000..U+3FFFF F0 90..BF 80..BF 80..BF - U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF - U+100000..U+10FFFF F4 80..8F 80..BF 80..BF - - Please refer to Unicode as the authority. - p.126 Table 3-7 in http://www.unicode.org/versions/Unicode10.0.0/ch03.pdf - - Handling Malformed Input - The maximal subpart should be replaced by a single U+FFFD. 
Maximal subpart is - the longest code unit subsequence starting at an unconvertible offset that is either - 1) the initial subsequence of a well-formed code unit sequence, or - 2) a subsequence of length one: - One U+FFFD should be emitted for every sequence of bytes that is an incomplete prefix - of a valid sequence, and with the conversion to restart after the incomplete sequence. - - For example, in byte sequence "41 C0 AF 41 F4 80 80 41", the maximal subparts are - "C0", "AF", and "F4 80 80". "F4 80 80" can be the initial subsequence of "F4 80 80 80", - but "C0" can't be the initial subsequence of any well-formed code unit sequence. - Thus, the output should be "A\ufffd\ufffdA\ufffdA". - - Please refer to section "Best Practices for Using U+FFFD." in - http://www.unicode.org/versions/Unicode10.0.0/ch03.pdf - */ - byte[] d = data; - char[] v = new char[byteCount]; - - int idx = offset; - int last = offset + byteCount; - int s = 0; - - int codePoint = 0; - int utf8BytesSeen = 0; - int utf8BytesNeeded = 0; - int lowerBound = 0x80; - int upperBound = 0xbf; - - while (idx < last) { - int b = d[idx++] & 0xff; - if (utf8BytesNeeded == 0) { - if ((b & 0x80) == 0) { // ASCII char. 0xxxxxxx - v[s++] = (char) b; - continue; - } - - if ((b & 0x40) == 0) { // 10xxxxxx is illegal as first byte - v[s++] = REPLACEMENT_CHAR; - continue; - } - - // 11xxxxxx - int tableLookupIndex = b & 0x3f; - utf8BytesNeeded = TABLE_UTF8_NEEDED[tableLookupIndex]; - if (utf8BytesNeeded == 0) { - v[s++] = REPLACEMENT_CHAR; - continue; - } - - // utf8BytesNeeded - // 1: b & 0x1f - // 2: b & 0x0f - // 3: b & 0x07 - codePoint = b & (0x3f >> utf8BytesNeeded); - if (b == 0xe0) { - lowerBound = 0xa0; - } else if (b == 0xed) { - upperBound = 0x9f; - } else if (b == 0xf0) { - lowerBound = 0x90; - } else if (b == 0xf4) { - upperBound = 0x8f; - } - } else { - if (b < lowerBound || b > upperBound) { - // The bytes seen are ill-formed. 
Substitute them with U+FFFD - v[s++] = REPLACEMENT_CHAR; - codePoint = 0; - utf8BytesNeeded = 0; - utf8BytesSeen = 0; - lowerBound = 0x80; - upperBound = 0xbf; - /* - * According to the Unicode Standard, - * "a UTF-8 conversion process is required to never consume well-formed - * subsequences as part of its error handling for ill-formed subsequences" - * The current byte could be part of well-formed subsequences. Reduce the - * index by 1 to parse it in next loop. - */ - idx--; - continue; - } - - lowerBound = 0x80; - upperBound = 0xbf; - codePoint = (codePoint << 6) | (b & 0x3f); - utf8BytesSeen++; - if (utf8BytesNeeded != utf8BytesSeen) { - continue; - } - - // Encode chars from U+10000 up as surrogate pairs - if (codePoint < 0x10000) { - v[s++] = (char) codePoint; - } else { - v[s++] = (char) ((codePoint >> 10) + 0xd7c0); - v[s++] = (char) ((codePoint & 0x3ff) + 0xdc00); - } - - utf8BytesSeen = 0; - utf8BytesNeeded = 0; - codePoint = 0; - } - } - - // The bytes seen are ill-formed. Substitute them by U+FFFD - if (utf8BytesNeeded != 0) { - v[s++] = REPLACEMENT_CHAR; - } - - if (s == byteCount) { - // We guessed right, so we can use our temporary array as-is. - value = v; - length = s; - } else { - // Our temporary array was too big, so reallocate and copy. - value = new char[s]; - length = s; - System.arraycopy(v, 0, value, 0, s); - } + return newStringFromUtf8Bytes(data, offset, byteCount); } else if (canonicalCharsetName.equals("ISO-8859-1")) { value = new char[byteCount]; length = byteCount; @@ -267,6 +119,9 @@ public final class StringFactory { @FastNative public static native String newStringFromString(String toCopy); + @FastNative + public static native String newStringFromUtf8Bytes(byte[] data, int offset, int byteCount); + public static String newStringFromStringBuffer(StringBuffer stringBuffer) { synchronized (stringBuffer) { return newStringFromChars(stringBuffer.getValue(), 0, stringBuffer.length()); |