diff options
Diffstat (limited to 'runtime/native/java_lang_StringFactory.cc')
-rw-r--r-- | runtime/native/java_lang_StringFactory.cc | 176 |
1 files changed, 176 insertions, 0 deletions
diff --git a/runtime/native/java_lang_StringFactory.cc b/runtime/native/java_lang_StringFactory.cc index 178d5dabbd..9086ee932d 100644 --- a/runtime/native/java_lang_StringFactory.cc +++ b/runtime/native/java_lang_StringFactory.cc @@ -89,10 +89,186 @@ static jstring StringFactory_newStringFromString(JNIEnv* env, jclass, jstring to return soa.AddLocalReference<jstring>(result); } +static jstring StringFactory_newStringFromUtf8Bytes(JNIEnv* env, jclass, jbyteArray java_data, + jint offset, jint byte_count) { + // Local Define in here + static const jchar kReplacementChar = 0xfffd; + static const int kDefaultBufferSize = 256; + static const int kTableUtf8Needed[] = { + // 0 1 2 3 4 5 6 7 8 9 a b c d e f + 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xc0 - 0xcf + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xd0 - 0xdf + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xe0 - 0xef + 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xf0 - 0xff + }; + + ScopedFastNativeObjectAccess soa(env); + if (UNLIKELY(java_data == nullptr)) { + ThrowNullPointerException("data == null"); + return nullptr; + } + + StackHandleScope<1> hs(soa.Self()); + Handle<mirror::ByteArray> byte_array(hs.NewHandle(soa.Decode<mirror::ByteArray>(java_data))); + int32_t data_size = byte_array->GetLength(); + if ((offset | byte_count) < 0 || byte_count > data_size - offset) { + soa.Self()->ThrowNewExceptionF("Ljava/lang/StringIndexOutOfBoundsException;", + "length=%d; regionStart=%d; regionLength=%d", data_size, + offset, byte_count); + return nullptr; + } + + /* + * This code converts a UTF-8 byte sequence to a Java String (UTF-16). + * It implements the W3C recommended UTF-8 decoder. + * https://www.w3.org/TR/encoding/#utf-8-decoder + * + * Unicode 3.2 Well-Formed UTF-8 Byte Sequences + * Code Points First Second Third Fourth + * U+0000..U+007F 00..7F + * U+0080..U+07FF C2..DF 80..BF + * U+0800..U+0FFF E0 A0..BF 80..BF + * U+1000..U+CFFF E1..EC 80..BF 80..BF + * U+D000..U+D7FF ED 80..9F 80..BF + * U+E000..U+FFFF EE..EF 80..BF 80..BF + * U+10000..U+3FFFF F0 90..BF 80..BF 80..BF + * U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF + * U+100000..U+10FFFF F4 80..8F 80..BF 80..BF + * + * Please refer to Unicode as the authority. + * p.126 Table 3-7 in http://www.unicode.org/versions/Unicode10.0.0/ch03.pdf + * + * Handling Malformed Input + * The maximal subpart should be replaced by a single U+FFFD. Maximal subpart is + * the longest code unit subsequence starting at an unconvertible offset that is either + * 1) the initial subsequence of a well-formed code unit sequence, or + * 2) a subsequence of length one: + * One U+FFFD should be emitted for every sequence of bytes that is an incomplete prefix + * of a valid sequence, and with the conversion to restart after the incomplete sequence. + * + * For example, in byte sequence "41 C0 AF 41 F4 80 80 41", the maximal subparts are + * "C0", "AF", and "F4 80 80". "F4 80 80" can be the initial subsequence of "F4 80 80 80", + * but "C0" can't be the initial subsequence of any well-formed code unit sequence. + * Thus, the output should be "A\ufffd\ufffdA\ufffdA". + * + * Please refer to section "Best Practices for Using U+FFFD." in + * http://www.unicode.org/versions/Unicode10.0.0/ch03.pdf + */ + + // Initial value + jchar stack_buffer[kDefaultBufferSize]; + std::unique_ptr<jchar[]> allocated_buffer; + jchar* v; + if (byte_count <= kDefaultBufferSize) { + v = stack_buffer; + } else { + allocated_buffer.reset(new jchar[byte_count]); + v = allocated_buffer.get(); + } + + jbyte* d = byte_array->GetData(); + DCHECK(d != nullptr); + + int idx = offset; + int last = offset + byte_count; + int s = 0; + + int code_point = 0; + int utf8_bytes_seen = 0; + int utf8_bytes_needed = 0; + int lower_bound = 0x80; + int upper_bound = 0xbf; + while (idx < last) { + int b = d[idx++] & 0xff; + if (utf8_bytes_needed == 0) { + if ((b & 0x80) == 0) { // ASCII char. 0xxxxxxx + v[s++] = (jchar) b; + continue; + } + + if ((b & 0x40) == 0) { // 10xxxxxx is illegal as first byte + v[s++] = kReplacementChar; + continue; + } + + // 11xxxxxx + int tableLookupIndex = b & 0x3f; + utf8_bytes_needed = kTableUtf8Needed[tableLookupIndex]; + if (utf8_bytes_needed == 0) { + v[s++] = kReplacementChar; + continue; + } + + // utf8_bytes_needed + // 1: b & 0x1f + // 2: b & 0x0f + // 3: b & 0x07 + code_point = b & (0x3f >> utf8_bytes_needed); + if (b == 0xe0) { + lower_bound = 0xa0; + } else if (b == 0xed) { + upper_bound = 0x9f; + } else if (b == 0xf0) { + lower_bound = 0x90; + } else if (b == 0xf4) { + upper_bound = 0x8f; + } + } else { + if (b < lower_bound || b > upper_bound) { + // The bytes seen are ill-formed. Substitute them with U+FFFD + v[s++] = kReplacementChar; + code_point = 0; + utf8_bytes_needed = 0; + utf8_bytes_seen = 0; + lower_bound = 0x80; + upper_bound = 0xbf; + /* + * According to the Unicode Standard, + * "a UTF-8 conversion process is required to never consume well-formed + * subsequences as part of its error handling for ill-formed subsequences" + * The current byte could be part of well-formed subsequences. Reduce the + * index by 1 to parse it in next loop. + */ + idx--; + continue; + } + + lower_bound = 0x80; + upper_bound = 0xbf; + code_point = (code_point << 6) | (b & 0x3f); + utf8_bytes_seen++; + if (utf8_bytes_needed != utf8_bytes_seen) { + continue; + } + + // Encode chars from U+10000 up as surrogate pairs + if (code_point < 0x10000) { + v[s++] = (jchar) code_point; + } else { + v[s++] = (jchar) ((code_point >> 10) + 0xd7c0); + v[s++] = (jchar) ((code_point & 0x3ff) + 0xdc00); + } + + utf8_bytes_seen = 0; + utf8_bytes_needed = 0; + code_point = 0; + } + } + + // The bytes seen are ill-formed. Substitute them by U+FFFD + if (utf8_bytes_needed != 0) { + v[s++] = kReplacementChar; + } + + ObjPtr<mirror::String> result = mirror::String::AllocFromUtf16(soa.Self(), s, v); + return soa.AddLocalReference<jstring>(result); +} + static JNINativeMethod gMethods[] = { FAST_NATIVE_METHOD(StringFactory, newStringFromBytes, "([BIII)Ljava/lang/String;"), FAST_NATIVE_METHOD(StringFactory, newStringFromChars, "(II[C)Ljava/lang/String;"), FAST_NATIVE_METHOD(StringFactory, newStringFromString, "(Ljava/lang/String;)Ljava/lang/String;"), + FAST_NATIVE_METHOD(StringFactory, newStringFromUtf8Bytes, "([BII)Ljava/lang/String;"), }; void register_java_lang_StringFactory(JNIEnv* env) { |