diff options
-rw-r--r-- | android/Android.mk | 4 | ||||
-rw-r--r-- | android/PhoneticStringUtils.cpp | 243 | ||||
-rw-r--r-- | android/PhoneticStringUtils.h | 9 | ||||
-rw-r--r-- | android/PhoneticStringUtilsTest.cpp | 85 | ||||
-rw-r--r-- | dist/Android.mk | 3 |
5 files changed, 93 insertions, 251 deletions
diff --git a/android/Android.mk b/android/Android.mk index a9f68da..44d77b6 100644 --- a/android/Android.mk +++ b/android/Android.mk @@ -11,7 +11,6 @@ LOCAL_C_INCLUDES := \ external/icu4c/i18n \ external/icu4c/common - LOCAL_MODULE:= libsqlite3_android include $(BUILD_STATIC_LIBRARY) @@ -29,6 +28,9 @@ LOCAL_SRC_FILES := \ LOCAL_MODULE_TAGS := optional +LOCAL_SHARED_LIBRARIES := \ + libutils + include $(BUILD_EXECUTABLE) # Test for PhoneNumberUtils diff --git a/android/PhoneticStringUtils.cpp b/android/PhoneticStringUtils.cpp index 24b1647..fa32d38 100644 --- a/android/PhoneticStringUtils.cpp +++ b/android/PhoneticStringUtils.cpp @@ -18,6 +18,7 @@ #include <stdlib.h> #include "PhoneticStringUtils.h" +#include <utils/String8.h> // We'd like 0 length string last of sorted list. So when input string is NULL // or 0 length string, we use these instead. @@ -30,60 +31,9 @@ namespace android { -int GetCodePointFromUtf8(const char *src, size_t len, size_t index, int *next) { - if (src == NULL || len <= index) { - return -1; - } - - if ((src[index] >> 7) == 0) { - if (next != NULL) { - *next = index + 1; - } - return src[index]; - } - if ((src[index] & 64) == 0) { - return -1; - } - int mask; - size_t num_to_read; - for (num_to_read = 1, mask = 64; // 01000000 - num_to_read < 7 && (src[index] & mask) == mask; - num_to_read++, mask >>= 1) { - } - if (num_to_read == 7) { - return -1; - } - - if (num_to_read + index > len) { - return -1; - } - - { - size_t i; - for (i = 0, mask = 0; i < (7 - num_to_read); i++) { - mask = (mask << 1) + 1; - } - } - - int codepoint = mask & src[index]; - - for (size_t i = 1; i < num_to_read; i++) { - if ((src[i + index] & 192) != 128) { // must be 10xxxxxx - return -1; - } - codepoint = (codepoint << 6) + (src[i + index] & 63); - } - - if (next != NULL) { - *next = index + num_to_read; - } - - return codepoint; -} - // Get hiragana from halfwidth katakana. -static int GetHiraganaFromHalfwidthKatakana(int codepoint, - int next_codepoint, +static int GetHiraganaFromHalfwidthKatakana(char32_t codepoint, + char32_t next_codepoint, bool *next_is_consumed) { if (codepoint < 0xFF66 || 0xFF9F < codepoint) { return codepoint; @@ -214,8 +164,8 @@ static int GetNormalizedHiragana(int codepoint) { } } -static int GetNormalizedKana(int codepoint, - int next_codepoint, +static int GetNormalizedKana(char32_t codepoint, + char32_t next_codepoint, bool *next_is_consumed) { // First, convert fullwidth katakana and halfwidth katakana to hiragana. if (0x30A1 <= codepoint && codepoint <= 0x30F6) { @@ -231,8 +181,8 @@ static int GetNormalizedKana(int codepoint, return GetNormalizedHiragana(codepoint); } -int GetPhoneticallySortableCodePoint(int codepoint, - int next_codepoint, +int GetPhoneticallySortableCodePoint(char32_t codepoint, + char32_t next_codepoint, bool *next_is_consumed) { if (next_is_consumed != NULL) { *next_is_consumed = false; @@ -302,8 +252,8 @@ int GetPhoneticallySortableCodePoint(int codepoint, return GetNormalizedKana(codepoint, next_codepoint, next_is_consumed); } -int GetNormalizedCodePoint(int codepoint, - int next_codepoint, +int GetNormalizedCodePoint(char32_t codepoint, + char32_t next_codepoint, bool *next_is_consumed) { if (next_is_consumed != NULL) { *next_is_consumed = false; @@ -331,73 +281,10 @@ int GetNormalizedCodePoint(int codepoint, return GetNormalizedKana(codepoint, next_codepoint, next_is_consumed); } - -bool GetUtf8FromCodePoint(int codepoint, char *dst, size_t len, size_t *index) { - if (codepoint < 128) { // 1 << 7 - if (*index >= len) { - return false; - } - // 0xxxxxxx - dst[*index] = static_cast<char>(codepoint); - (*index)++; - } else if (codepoint < 2048) { // 1 << (6 + 5) - if (*index + 1 >= len) { - return false; - } - // 110xxxxx - dst[(*index)++] = static_cast<char>(192 | (codepoint >> 6)); - // 10xxxxxx - dst[(*index)++] = static_cast<char>(128 | (codepoint & 63)); - } else if (codepoint < 65536) { // 1 << (6 * 2 + 4) - if (*index + 2 >= len) { - return false; - } - // 1110xxxx - dst[(*index)++] = static_cast<char>(224 | (codepoint >> 12)); - // 10xxxxxx - dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 6) & 63)); - dst[(*index)++] = static_cast<char>(128 | (codepoint & 63)); - } else if (codepoint < 2097152) { // 1 << (6 * 3 + 3) - if (*index + 3 >= len) { - return false; - } - // 11110xxx - dst[(*index)++] = static_cast<char>(240 | (codepoint >> 18)); - // 10xxxxxx - dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 12) & 63)); - dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 6) & 63)); - dst[(*index)++] = static_cast<char>(128 | (codepoint & 63)); - } else if (codepoint < 67108864) { // 1 << (6 * 2 + 2) - if (*index + 4 >= len) { - return false; - } - // 111110xx - dst[(*index)++] = static_cast<char>(248 | (codepoint >> 24)); - // 10xxxxxx - dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 18) & 63)); - dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 12) & 63)); - dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 6) & 63)); - dst[(*index)++] = static_cast<char>(128 | (codepoint & 63)); - } else { - if (*index + 5 >= len) { - return false; - } - // 1111110x - dst[(*index)++] = static_cast<char>(252 | (codepoint >> 30)); - // 10xxxxxx - dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 24) & 63)); - dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 18) & 63)); - dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 12) & 63)); - dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 6) & 63)); - dst[(*index)++] = static_cast<char>(128 | (codepoint & 63)); - } - return true; -} - static bool GetExpectedString( - const char *src, char **dst, size_t *len, - int (*get_codepoint_function)(int, int, bool*)) { - if (dst == NULL || len == NULL) { + const char *src, char **dst, size_t *dst_len, + int (*get_codepoint_function)(char32_t, char32_t, bool*)) { + if (dst == NULL || dst_len == NULL) { return false; } @@ -405,99 +292,55 @@ static bool GetExpectedString( src = STR_FOR_NULL_STR; } - size_t src_len = strlen(src); - int codepoints[MAX_CODEPOINTS]; - size_t new_len = 0; - - size_t codepoint_index; - { - int i, next; - for (codepoint_index = 0, i = 0, next = 0; - static_cast<size_t>(i) < src_len && - codepoint_index < MAX_CODEPOINTS; - i = next) { - int codepoint = GetCodePointFromUtf8(src, src_len, i, &next); - if (codepoint <= 0) { - return false; - } - int tmp_next; - int next_codepoint = GetCodePointFromUtf8(src, src_len, - next, &tmp_next); - bool next_is_consumed = false; - - // It is ok even if next_codepoint is negative. - codepoints[codepoint_index] = - get_codepoint_function(codepoint, - next_codepoint, - &next_is_consumed); - // dakuten (voiced mark) or han-dakuten (half-voiced mark) existed. - if (next_is_consumed) { - next = tmp_next; - } - - if (codepoints[codepoint_index] < 0) { - // Do not increment codepoint_index. - continue; - } - - if (codepoints[codepoint_index] < 128) { // 1 << 7 - new_len++; - } else if (codepoints[codepoint_index] < 2048) { - // 1 << (6 + 5) - new_len += 2; - } else if (codepoints[codepoint_index] < 65536) { - // 1 << (6 * 2 + 4) - new_len += 3; - } else if (codepoints[codepoint_index] < 2097152) { - // 1 << (6 * 3 + 3) - new_len += 4; - } else if (codepoints[codepoint_index] < 67108864) { - // 1 << (6 * 2 + 2) - new_len += 5; - } else { - new_len += 6; - } + char32_t codepoints[MAX_CODEPOINTS]; - codepoint_index++; + size_t src_len = GetUtf8LengthOrZero(src); + if (src_len == 0) { + return false; + } + bool next_is_consumed; + size_t j = 0; + for (size_t i = 0; i < src_len;) { + int32_t ret = GetUtf32AtFromUtf8(src, src_len, i, &i); + if (ret < 0) { + // failed to parse UTF-8 + return false; + } + ret = get_codepoint_function( + static_cast<char32_t>(ret), + i + 1 < src_len ? codepoints[i + 1] : 0, + &next_is_consumed); + if (ret > 0) { + codepoints[j] = static_cast<char32_t>(ret); + j++; + } + if (next_is_consumed) { + i++; } } + size_t length = j; - if (codepoint_index == 0) { + if (length == 0) { // If all of codepoints are invalid, we place the string at the end of // the list. codepoints[0] = 0x10000 + CODEPOINT_FOR_NULL_STR; - codepoint_index = 1; - new_len = 4; + length = 1; } - new_len += 1; // For '\0'. - - *dst = static_cast<char *>(malloc(sizeof(char) * new_len)); + size_t new_len = GetUtf8LengthFromUtf32(codepoints, length); + *dst = static_cast<char *>(malloc(new_len + 1)); if (*dst == NULL) { return false; } - size_t ch_index; - { - size_t i; - for (i = 0, ch_index = 0; i < codepoint_index; i++) { - if (!GetUtf8FromCodePoint(codepoints[i], *dst, - new_len, &ch_index)) { - free(*dst); - *dst = NULL; - return false; - } - } - } - - if (ch_index != new_len - 1) { + printf("new_len: %u\n", new_len); + if (GetUtf8FromUtf32(codepoints, length, *dst, new_len + 1) != new_len) { free(*dst); *dst = NULL; return false; } - (*dst)[new_len - 1] = '\0'; - *len = new_len; + *dst_len = new_len; return true; } diff --git a/android/PhoneticStringUtils.h b/android/PhoneticStringUtils.h index 68a4928..9da7d29 100644 --- a/android/PhoneticStringUtils.h +++ b/android/PhoneticStringUtils.h @@ -18,6 +18,7 @@ #define _ANDROID_PHONETIC_STRING_UTILS_H #include <string.h> // For size_t. +#include <utils/String8.h> namespace android { @@ -31,8 +32,8 @@ int GetCodePointFromUtf8(const char *src, size_t len, size_t index, int *next); // is "consumed" (e.g. Japanese halfwidth katakana's voiced mark is consumed // when previous "codepoint" is appropriate). If the codepoint should not be // considered when sorting (e.g. whitespaces), -1 is returned. -int GetPhoneticallySortableCodePoint(int codepoint, - int next_codepoint, +int GetPhoneticallySortableCodePoint(char32_t codepoint, + char32_t next_codepoint, bool *next_is_consumed); // Returns codepoint which is "normalized", whose definition depends on each @@ -44,8 +45,8 @@ int GetPhoneticallySortableCodePoint(int codepoint, // // In Japanese, "normalized" means that half-width and full-width katakana is // appropriately converted to hiragana. -int GetNormalizedCodePoint(int codepoint, - int next_codepoint, +int GetNormalizedCodePoint(char32_t codepoint, + char32_t next_codepoint, bool *next_is_consumed); // Pushes Utf8 expression of "codepoint" to "dst". Returns true when successful. diff --git a/android/PhoneticStringUtilsTest.cpp b/android/PhoneticStringUtilsTest.cpp index e74f67f..06a7ba8 100644 --- a/android/PhoneticStringUtilsTest.cpp +++ b/android/PhoneticStringUtilsTest.cpp @@ -20,6 +20,8 @@ #include <stdlib.h> #include <string.h> +#include <utils/String8.h> + using namespace android; class TestExecutor { @@ -29,12 +31,12 @@ class TestExecutor { private: void DoOneTest(void (TestExecutor::*test)()); - void testGetCodePointFromUtf8(); + void testGetUtf32At(); void testGetPhoneticallySortableCodePointAscii(); void testGetPhoneticallySortableCodePointKana(); void testGetPhoneticallySortableCodePointWhitespaceOnly(); void testGetPhoneticallySortableCodePointSimpleCompare(); - void testGetUtf8FromCodePoint(); + void testGetUtf8FromUtf32(); void testGetPhoneticallySortableString(); void testGetNormalizedString(); @@ -65,12 +67,12 @@ class TestExecutor { bool TestExecutor::DoAllTests() { - DoOneTest(&TestExecutor::testGetCodePointFromUtf8); + DoOneTest(&TestExecutor::testGetUtf32At); DoOneTest(&TestExecutor::testGetPhoneticallySortableCodePointAscii); DoOneTest(&TestExecutor::testGetPhoneticallySortableCodePointKana); DoOneTest(&TestExecutor::testGetPhoneticallySortableCodePointWhitespaceOnly); DoOneTest(&TestExecutor::testGetPhoneticallySortableCodePointSimpleCompare); - DoOneTest(&TestExecutor::testGetUtf8FromCodePoint); + DoOneTest(&TestExecutor::testGetUtf8FromUtf32); DoOneTest(&TestExecutor::testGetPhoneticallySortableString); DoOneTest(&TestExecutor::testGetNormalizedString); @@ -92,26 +94,35 @@ void TestExecutor::DoOneTest(void (TestExecutor::*test)()) { m_success_count += m_success ? 1 : 0; } -void TestExecutor::testGetCodePointFromUtf8() { - printf("testGetCodePointFromUtf8()\n"); - int next; +#define TEST_GET_UTF32AT(src, index, expected_next, expected_value) \ + ({ \ + size_t next; \ + String8 string8(src); \ + int32_t ret = string8.getUtf32At((index), &next); \ + if (ret < 0) { \ + printf("getUtf32At() returned negative value (src: %s, index: %d)\n", \ + (src), (index)); \ + m_success = false; \ + } else if (next != (expected_next)) { \ + printf("next is unexpected value (src: %s, actual: %u, expected: %u)\n", \ + (src), next, (expected_next)); \ + } else { \ + EXPECT_EQ_VALUE(ret, (expected_value)); \ + } \ + }) + +void TestExecutor::testGetUtf32At() { + printf("testGetUtf32At()\n"); - EXPECT_EQ_VALUE(GetCodePointFromUtf8("a", 1, 0, &next), 97); - EXPECT_EQ_VALUE(next, 1); + TEST_GET_UTF32AT("a", 0, 1, 97); // Japanese hiragana "a" - EXPECT_EQ_VALUE(GetCodePointFromUtf8("\xE3\x81\x82", 3, 0, &next), 0x3042); - EXPECT_EQ_VALUE(next, 3); + TEST_GET_UTF32AT("\xE3\x81\x82", 0, 3, 0x3042); // Japanese fullwidth katakana "a" with ascii a - EXPECT_EQ_VALUE(GetCodePointFromUtf8("a\xE3\x82\xA2", 4, 1, &next), 0x30A2); - EXPECT_EQ_VALUE(next, 4); + TEST_GET_UTF32AT("a\xE3\x82\xA2", 1, 4, 0x30A2); // 2 PUA - ASSERT_EQ_VALUE(GetCodePointFromUtf8("\xF3\xBE\x80\x80\xF3\xBE\x80\x88", - 8, 0, &next), 0xFE000); - ASSERT_EQ_VALUE(next, 4); - ASSERT_EQ_VALUE(GetCodePointFromUtf8("\xF3\xBE\x80\x80\xF3\xBE\x80\x88", - 8, next, &next), 0xFE008); - ASSERT_EQ_VALUE(next, 8); + TEST_GET_UTF32AT("\xF3\xBE\x80\x80\xF3\xBE\x80\x88", 0, 4, 0xFE000); + TEST_GET_UTF32AT("\xF3\xBE\x80\x80\xF3\xBE\x80\x88", 4, 8, 0xFE008); } void TestExecutor::testGetPhoneticallySortableCodePointAscii() { @@ -282,20 +293,18 @@ void TestExecutor::testGetPhoneticallySortableCodePointSimpleCompare() { } } -#define EXPECT_EQ_CODEPOINT_UTF8_WITH_INDEX(codepoint, expected, i) \ +#define EXPECT_EQ_CODEPOINT_UTF8(codepoint, expected) \ ({ \ - index = i; \ - if (!GetUtf8FromCodePoint(codepoint, dst, 10, &index)) { \ + char32_t codepoints[1] = {codepoint}; \ + status_t ret = string8.setTo(codepoints, 1); \ + if (ret != NO_ERROR) { \ printf("GetUtf8FromCodePoint() returned false at 0x%04X\n", codepoint); \ m_success = false; \ - } else if (index >= 10) { \ - printf("index (%d) >= 10\n", index); \ - m_success = false; \ } else { \ - dst[index] = '\0'; \ - if (strcmp(dst + i, expected) != 0) { \ + const char* string = string8.string(); \ + if (strcmp(string, expected) != 0) { \ printf("Failed at codepoint 0x%04X\n", codepoint); \ - for (const char *ch = dst; *ch != '\0'; ++ch) { \ + for (const char *ch = string; *ch != '\0'; ++ch) { \ printf("0x%X ", *ch); \ } \ printf("!= "); \ @@ -308,14 +317,9 @@ void TestExecutor::testGetPhoneticallySortableCodePointSimpleCompare() { } \ }) -#define EXPECT_EQ_CODEPOINT_UTF8(codepoint, expected) \ - EXPECT_EQ_CODEPOINT_UTF8_WITH_INDEX(codepoint, expected, 0) - - -void TestExecutor::testGetUtf8FromCodePoint() { - printf("testGetUtf8FromCodePoint()\n"); - size_t index = 0; - char dst[10]; +void TestExecutor::testGetUtf8FromUtf32() { + printf("testGetUtf8FromUtf32()\n"); + String8 string8; EXPECT_EQ_CODEPOINT_UTF8('a', "\x61"); // Armenian capital letter AYB (2 bytes in UTF8) @@ -327,15 +331,6 @@ void TestExecutor::testGetUtf8FromCodePoint() { // PUA (4 byets in UTF8) EXPECT_EQ_CODEPOINT_UTF8(0xFE016, "\xF3\xBE\x80\x96"); EXPECT_EQ_CODEPOINT_UTF8(0xFE972, "\xF3\xBE\xA5\xB2"); - - EXPECT_EQ_CODEPOINT_UTF8_WITH_INDEX(0x058F, "\xD6\x8F", 3); - - index = 0; - if (GetUtf8FromCodePoint(0x3043, dst, 2, &index)) { - printf("GetUtf8FromCodePont() returned true even when destination length" - "is not enough\n"); - m_success = false; - } } #define EXPECT_EQ_UTF8_UTF8(src, expected) \ diff --git a/dist/Android.mk b/dist/Android.mk index 1d8e1eb..b7aad30 100644 --- a/dist/Android.mk +++ b/dist/Android.mk @@ -29,7 +29,8 @@ LOCAL_MODULE:= libsqlite LOCAL_C_INCLUDES += $(call include-path-for, system-core)/cutils LOCAL_SHARED_LIBRARIES += liblog \ libicuuc \ - libicui18n + libicui18n \ + libutils # include android specific methods LOCAL_WHOLE_STATIC_LIBRARIES := libsqlite3_android |