summary | refs | log | tree | commit | diff
path: root/android/PhoneticStringUtils.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'android/PhoneticStringUtils.cpp')
-rw-r--r-- android/PhoneticStringUtils.cpp 243
1 files changed, 43 insertions, 200 deletions
diff --git a/android/PhoneticStringUtils.cpp b/android/PhoneticStringUtils.cpp
index 24b1647..fa32d38 100644
--- a/android/PhoneticStringUtils.cpp
+++ b/android/PhoneticStringUtils.cpp
@@ -18,6 +18,7 @@
#include <stdlib.h>
#include "PhoneticStringUtils.h"
+#include <utils/String8.h>
// We'd like 0 length string last of sorted list. So when input string is NULL
// or 0 length string, we use these instead.
@@ -30,60 +31,9 @@
namespace android {
-int GetCodePointFromUtf8(const char *src, size_t len, size_t index, int *next) {
- if (src == NULL || len <= index) {
- return -1;
- }
-
- if ((src[index] >> 7) == 0) {
- if (next != NULL) {
- *next = index + 1;
- }
- return src[index];
- }
- if ((src[index] & 64) == 0) {
- return -1;
- }
- int mask;
- size_t num_to_read;
- for (num_to_read = 1, mask = 64; // 01000000
- num_to_read < 7 && (src[index] & mask) == mask;
- num_to_read++, mask >>= 1) {
- }
- if (num_to_read == 7) {
- return -1;
- }
-
- if (num_to_read + index > len) {
- return -1;
- }
-
- {
- size_t i;
- for (i = 0, mask = 0; i < (7 - num_to_read); i++) {
- mask = (mask << 1) + 1;
- }
- }
-
- int codepoint = mask & src[index];
-
- for (size_t i = 1; i < num_to_read; i++) {
- if ((src[i + index] & 192) != 128) { // must be 10xxxxxx
- return -1;
- }
- codepoint = (codepoint << 6) + (src[i + index] & 63);
- }
-
- if (next != NULL) {
- *next = index + num_to_read;
- }
-
- return codepoint;
-}
-
// Get hiragana from halfwidth katakana.
-static int GetHiraganaFromHalfwidthKatakana(int codepoint,
- int next_codepoint,
+static int GetHiraganaFromHalfwidthKatakana(char32_t codepoint,
+ char32_t next_codepoint,
bool *next_is_consumed) {
if (codepoint < 0xFF66 || 0xFF9F < codepoint) {
return codepoint;
@@ -214,8 +164,8 @@ static int GetNormalizedHiragana(int codepoint) {
}
}
-static int GetNormalizedKana(int codepoint,
- int next_codepoint,
+static int GetNormalizedKana(char32_t codepoint,
+ char32_t next_codepoint,
bool *next_is_consumed) {
// First, convert fullwidth katakana and halfwidth katakana to hiragana.
if (0x30A1 <= codepoint && codepoint <= 0x30F6) {
@@ -231,8 +181,8 @@ static int GetNormalizedKana(int codepoint,
return GetNormalizedHiragana(codepoint);
}
-int GetPhoneticallySortableCodePoint(int codepoint,
- int next_codepoint,
+int GetPhoneticallySortableCodePoint(char32_t codepoint,
+ char32_t next_codepoint,
bool *next_is_consumed) {
if (next_is_consumed != NULL) {
*next_is_consumed = false;
@@ -302,8 +252,8 @@ int GetPhoneticallySortableCodePoint(int codepoint,
return GetNormalizedKana(codepoint, next_codepoint, next_is_consumed);
}
-int GetNormalizedCodePoint(int codepoint,
- int next_codepoint,
+int GetNormalizedCodePoint(char32_t codepoint,
+ char32_t next_codepoint,
bool *next_is_consumed) {
if (next_is_consumed != NULL) {
*next_is_consumed = false;
@@ -331,73 +281,10 @@ int GetNormalizedCodePoint(int codepoint,
return GetNormalizedKana(codepoint, next_codepoint, next_is_consumed);
}
-
-bool GetUtf8FromCodePoint(int codepoint, char *dst, size_t len, size_t *index) {
- if (codepoint < 128) { // 1 << 7
- if (*index >= len) {
- return false;
- }
- // 0xxxxxxx
- dst[*index] = static_cast<char>(codepoint);
- (*index)++;
- } else if (codepoint < 2048) { // 1 << (6 + 5)
- if (*index + 1 >= len) {
- return false;
- }
- // 110xxxxx
- dst[(*index)++] = static_cast<char>(192 | (codepoint >> 6));
- // 10xxxxxx
- dst[(*index)++] = static_cast<char>(128 | (codepoint & 63));
- } else if (codepoint < 65536) { // 1 << (6 * 2 + 4)
- if (*index + 2 >= len) {
- return false;
- }
- // 1110xxxx
- dst[(*index)++] = static_cast<char>(224 | (codepoint >> 12));
- // 10xxxxxx
- dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 6) & 63));
- dst[(*index)++] = static_cast<char>(128 | (codepoint & 63));
- } else if (codepoint < 2097152) { // 1 << (6 * 3 + 3)
- if (*index + 3 >= len) {
- return false;
- }
- // 11110xxx
- dst[(*index)++] = static_cast<char>(240 | (codepoint >> 18));
- // 10xxxxxx
- dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 12) & 63));
- dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 6) & 63));
- dst[(*index)++] = static_cast<char>(128 | (codepoint & 63));
- } else if (codepoint < 67108864) { // 1 << (6 * 2 + 2)
- if (*index + 4 >= len) {
- return false;
- }
- // 111110xx
- dst[(*index)++] = static_cast<char>(248 | (codepoint >> 24));
- // 10xxxxxx
- dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 18) & 63));
- dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 12) & 63));
- dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 6) & 63));
- dst[(*index)++] = static_cast<char>(128 | (codepoint & 63));
- } else {
- if (*index + 5 >= len) {
- return false;
- }
- // 1111110x
- dst[(*index)++] = static_cast<char>(252 | (codepoint >> 30));
- // 10xxxxxx
- dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 24) & 63));
- dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 18) & 63));
- dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 12) & 63));
- dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 6) & 63));
- dst[(*index)++] = static_cast<char>(128 | (codepoint & 63));
- }
- return true;
-}
-
static bool GetExpectedString(
- const char *src, char **dst, size_t *len,
- int (*get_codepoint_function)(int, int, bool*)) {
- if (dst == NULL || len == NULL) {
+ const char *src, char **dst, size_t *dst_len,
+ int (*get_codepoint_function)(char32_t, char32_t, bool*)) {
+ if (dst == NULL || dst_len == NULL) {
return false;
}
@@ -405,99 +292,55 @@ static bool GetExpectedString(
src = STR_FOR_NULL_STR;
}
- size_t src_len = strlen(src);
- int codepoints[MAX_CODEPOINTS];
- size_t new_len = 0;
-
- size_t codepoint_index;
- {
- int i, next;
- for (codepoint_index = 0, i = 0, next = 0;
- static_cast<size_t>(i) < src_len &&
- codepoint_index < MAX_CODEPOINTS;
- i = next) {
- int codepoint = GetCodePointFromUtf8(src, src_len, i, &next);
- if (codepoint <= 0) {
- return false;
- }
- int tmp_next;
- int next_codepoint = GetCodePointFromUtf8(src, src_len,
- next, &tmp_next);
- bool next_is_consumed = false;
-
- // It is ok even if next_codepoint is negative.
- codepoints[codepoint_index] =
- get_codepoint_function(codepoint,
- next_codepoint,
- &next_is_consumed);
- // dakuten (voiced mark) or han-dakuten (half-voiced mark) existed.
- if (next_is_consumed) {
- next = tmp_next;
- }
-
- if (codepoints[codepoint_index] < 0) {
- // Do not increment codepoint_index.
- continue;
- }
-
- if (codepoints[codepoint_index] < 128) { // 1 << 7
- new_len++;
- } else if (codepoints[codepoint_index] < 2048) {
- // 1 << (6 + 5)
- new_len += 2;
- } else if (codepoints[codepoint_index] < 65536) {
- // 1 << (6 * 2 + 4)
- new_len += 3;
- } else if (codepoints[codepoint_index] < 2097152) {
- // 1 << (6 * 3 + 3)
- new_len += 4;
- } else if (codepoints[codepoint_index] < 67108864) {
- // 1 << (6 * 2 + 2)
- new_len += 5;
- } else {
- new_len += 6;
- }
+ char32_t codepoints[MAX_CODEPOINTS];
- codepoint_index++;
+ size_t src_len = GetUtf8LengthOrZero(src);
+ if (src_len == 0) {
+ return false;
+ }
+ bool next_is_consumed;
+ size_t j = 0;
+ for (size_t i = 0; i < src_len;) {
+ int32_t ret = GetUtf32AtFromUtf8(src, src_len, i, &i);
+ if (ret < 0) {
+ // failed to parse UTF-8
+ return false;
+ }
+ ret = get_codepoint_function(
+ static_cast<char32_t>(ret),
+ i + 1 < src_len ? codepoints[i + 1] : 0,
+ &next_is_consumed);
+ if (ret > 0) {
+ codepoints[j] = static_cast<char32_t>(ret);
+ j++;
+ }
+ if (next_is_consumed) {
+ i++;
}
}
+ size_t length = j;
- if (codepoint_index == 0) {
+ if (length == 0) {
// If all of codepoints are invalid, we place the string at the end of
// the list.
codepoints[0] = 0x10000 + CODEPOINT_FOR_NULL_STR;
- codepoint_index = 1;
- new_len = 4;
+ length = 1;
}
- new_len += 1; // For '\0'.
-
- *dst = static_cast<char *>(malloc(sizeof(char) * new_len));
+ size_t new_len = GetUtf8LengthFromUtf32(codepoints, length);
+ *dst = static_cast<char *>(malloc(new_len + 1));
if (*dst == NULL) {
return false;
}
- size_t ch_index;
- {
- size_t i;
- for (i = 0, ch_index = 0; i < codepoint_index; i++) {
- if (!GetUtf8FromCodePoint(codepoints[i], *dst,
- new_len, &ch_index)) {
- free(*dst);
- *dst = NULL;
- return false;
- }
- }
- }
-
- if (ch_index != new_len - 1) {
+ printf("new_len: %u\n", new_len);
+ if (GetUtf8FromUtf32(codepoints, length, *dst, new_len + 1) != new_len) {
free(*dst);
*dst = NULL;
return false;
}
- (*dst)[new_len - 1] = '\0';
- *len = new_len;
+ *dst_len = new_len;
return true;
}