summaryrefslogtreecommitdiff
path: root/android
diff options
context:
space:
mode:
authorJay Shrauner <shrauner@google.com>2012-12-17 11:12:30 -0800
committerJay Shrauner <shrauner@google.com>2013-02-05 16:14:59 -0800
commitdb8a386d111b11463c877b3a14ab62aec761a3f5 (patch)
tree1a76447eebd49c0b3a5e605a64a1ec74ec942dee /android
parent7dbe9638c4c7cdb01a71528d8cb1f0065120d7b8 (diff)
Fix contacts index labels for i18n
Switch ContactsProvider to using ICU for generation of index labels, and remove custom KO and JA code. Add i18n test cases. Bug:7351596 Change-Id: I7ac25add8b29ff2c6c395f04a83b279b541e4125
Diffstat (limited to 'android')
-rw-r--r--android/Android.mk47
-rw-r--r--android/PhonebookIndex.cpp317
-rw-r--r--android/PhonebookIndexTest.cpp243
-rw-r--r--android/PhoneticStringUtils.cpp319
-rw-r--r--android/PhoneticStringUtils.h59
-rw-r--r--android/PhoneticStringUtilsTest.cpp217
-rw-r--r--android/sqlite3_android.cpp8
7 files changed, 430 insertions, 780 deletions
diff --git a/android/Android.mk b/android/Android.mk
index 151a5cb..0bb78d3 100644
--- a/android/Android.mk
+++ b/android/Android.mk
@@ -2,7 +2,6 @@ LOCAL_PATH:= $(call my-dir)
libsqlite3_android_local_src_files := \
PhoneNumberUtils.cpp \
- PhoneticStringUtils.cpp \
OldPhoneNumberUtils.cpp \
PhonebookIndex.cpp \
sqlite3_android.cpp
@@ -10,7 +9,8 @@ libsqlite3_android_local_src_files := \
libsqlite3_android_c_includes := \
external/sqlite/dist \
external/icu4c/i18n \
- external/icu4c/common
+ external/icu4c/common \
+ frameworks/native/include
include $(CLEAR_VARS)
LOCAL_SRC_FILES:= $(libsqlite3_android_local_src_files)
@@ -26,24 +26,6 @@ ifeq ($(WITH_HOST_DALVIK),true)
include $(BUILD_HOST_STATIC_LIBRARY)
endif
-# Test for PhoneticStringUtils
-include $(CLEAR_VARS)
-
-LOCAL_MODULE:= libsqlite3_phonetic_string_utils_test
-
-LOCAL_CFLAGS += -Wall -Werror
-
-LOCAL_SRC_FILES := \
- PhoneticStringUtils.cpp \
- PhoneticStringUtilsTest.cpp
-
-LOCAL_MODULE_TAGS := optional
-
-LOCAL_SHARED_LIBRARIES := \
- libutils
-
-include $(BUILD_EXECUTABLE)
-
# Test for PhoneNumberUtils
#
# You can also test this in Unix, like this:
@@ -71,3 +53,28 @@ LOCAL_SRC_FILES := \
LOCAL_MODULE_TAGS := optional
include $(BUILD_EXECUTABLE)
+
+ifeq ($(WITH_HOST_DALVIK),true)
+ include $(CLEAR_VARS)
+
+ LOCAL_MODULE:= libsqlite3_phone_book_index_test
+
+ LOCAL_SRC_FILES := \
+ PhonebookIndex.cpp \
+ PhonebookIndexTest.cpp
+
+ LOCAL_C_INCLUDES := \
+ external/icu4c/i18n \
+ external/icu4c/common \
+ frameworks/native/include
+
+ LOCAL_MODULE_TAGS := optional
+
+ LOCAL_SHARED_LIBRARIES := \
+ libicui18n libicuuc
+
+ LOCAL_STATIC_LIBRARIES := \
+ libutils libcutils
+
+ include $(BUILD_HOST_EXECUTABLE)
+endif
diff --git a/android/PhonebookIndex.cpp b/android/PhonebookIndex.cpp
index 5cc26e5..68674f4 100644
--- a/android/PhonebookIndex.cpp
+++ b/android/PhonebookIndex.cpp
@@ -14,192 +14,193 @@
* limitations under the License.
*/
+#include <stdlib.h>
#include <ctype.h>
#include <string.h>
+#include <stdio.h>
+#include <unicode/alphaindex.h>
#include <unicode/ucol.h>
#include <unicode/uiter.h>
#include <unicode/ustring.h>
#include <unicode/utypes.h>
+#include <unicode/uloc.h>
+#include <utils/Mutex.h>
+#include <utils/RefBase.h>
#include "PhonebookIndex.h"
-#include "PhoneticStringUtils.h"
#define MIN_OUTPUT_SIZE 6 // Minimum required size for the output buffer (in bytes)
namespace android {
-// IMPORTANT! Keep the codes below SORTED. We are doing a binary search on the array
-static UChar DEFAULT_CHAR_MAP[] = {
- 0x00C6, 'A', // AE
- 0x00DF, 'S', // Etzett
- 0x1100, 0x3131, // HANGUL LETTER KIYEOK
- 0x1101, 0x3132, // HANGUL LETTER SSANGKIYEOK
- 0x1102, 0x3134, // HANGUL LETTER NIEUN
- 0x1103, 0x3137, // HANGUL LETTER TIKEUT
- 0x1104, 0x3138, // HANGUL LETTER SSANGTIKEUT
- 0x1105, 0x3139, // HANGUL LETTER RIEUL
- 0x1106, 0x3141, // HANGUL LETTER MIEUM
- 0x1107, 0x3142, // HANGUL LETTER PIEUP
- 0x1108, 0x3143, // HANGUL LETTER SSANGPIEUP
- 0x1109, 0x3145, // HANGUL LETTER SIOS
- 0x110A, 0x3146, // HANGUL LETTER SSANGSIOS
- 0x110B, 0x3147, // HANGUL LETTER IEUNG
- 0x110C, 0x3148, // HANGUL LETTER CIEUC
- 0x110D, 0x3149, // HANGUL LETTER SSANGCIEUC
- 0x110E, 0x314A, // HANGUL LETTER CHIEUCH
- 0x110F, 0x314B, // HANGUL LETTER KHIEUKH
- 0x1110, 0x314C, // HANGUL LETTER THIEUTH
- 0x1111, 0x314D, // HANGUL LETTER PHIEUPH
- 0x1112, 0x314E, // HANGUL LETTER HIEUH
- 0x111A, 0x3140, // HANGUL LETTER RIEUL-HIEUH
- 0x1121, 0x3144, // HANGUL LETTER PIEUP-SIOS
- 0x1161, 0x314F, // HANGUL LETTER A
- 0x1162, 0x3150, // HANGUL LETTER AE
- 0x1163, 0x3151, // HANGUL LETTER YA
- 0x1164, 0x3152, // HANGUL LETTER YAE
- 0x1165, 0x3153, // HANGUL LETTER EO
- 0x1166, 0x3154, // HANGUL LETTER E
- 0x1167, 0x3155, // HANGUL LETTER YEO
- 0x1168, 0x3156, // HANGUL LETTER YE
- 0x1169, 0x3157, // HANGUL LETTER O
- 0x116A, 0x3158, // HANGUL LETTER WA
- 0x116B, 0x3159, // HANGUL LETTER WAE
- 0x116C, 0x315A, // HANGUL LETTER OE
- 0x116D, 0x315B, // HANGUL LETTER YO
- 0x116E, 0x315C, // HANGUL LETTER U
- 0x116F, 0x315D, // HANGUL LETTER WEO
- 0x1170, 0x315E, // HANGUL LETTER WE
- 0x1171, 0x315F, // HANGUL LETTER WI
- 0x1172, 0x3160, // HANGUL LETTER YU
- 0x1173, 0x3161, // HANGUL LETTER EU
- 0x1174, 0x3162, // HANGUL LETTER YI
- 0x1175, 0x3163, // HANGUL LETTER I
- 0x11AA, 0x3133, // HANGUL LETTER KIYEOK-SIOS
- 0x11AC, 0x3135, // HANGUL LETTER NIEUN-CIEUC
- 0x11AD, 0x3136, // HANGUL LETTER NIEUN-HIEUH
- 0x11B0, 0x313A, // HANGUL LETTER RIEUL-KIYEOK
- 0x11B1, 0x313B, // HANGUL LETTER RIEUL-MIEUM
- 0x11B3, 0x313D, // HANGUL LETTER RIEUL-SIOS
- 0x11B4, 0x313E, // HANGUL LETTER RIEUL-THIEUTH
- 0x11B5, 0x313F, // HANGUL LETTER RIEUL-PHIEUPH
+// Wrapper class to enable using libutil SmartPointers with AlphabeticIndex.
+// Alongside the ICU index it keeps a copy of the locale string the index was
+// created for and a flag telling whether that locale's language is Japanese.
+class AlphabeticIndexRef : public RefBase {
+public:
+    // Creates the index for |locale|. Failure is reported through |status|:
+    // an index construction error, U_MEMORY_ALLOCATION_ERROR if the locale
+    // string cannot be copied, or an error from uloc_getLanguage().
+    AlphabeticIndexRef(const char *locale, UErrorCode &status) :
+        m_index(locale, status), m_locale(NULL), m_isJapanese(false) {
+        if (U_FAILURE(status)) {
+            return;
+        }
+        m_locale = strdup(locale);
+        if (m_locale == NULL) {
+            status = U_MEMORY_ALLOCATION_ERROR;
+            return;
+        }
+        // Size the buffer with ICU's ULOC_LANG_CAPACITY and keep the last
+        // byte back for the terminator: when a language exactly fills the
+        // capacity it is given, uloc_getLanguage() only warns
+        // (U_STRING_NOT_TERMINATED_WARNING) and leaves the buffer without a
+        // NUL, which must not be handed to strcmp() below.
+        char language[ULOC_LANG_CAPACITY];
+        int32_t languageLen = uloc_getLanguage(locale, language,
+                                               sizeof(language) - 1, &status);
+        if (U_FAILURE(status)) {
+            return;
+        }
+        language[languageLen] = '\0';
+        m_isJapanese = (strcmp(language, ULOC_JAPANESE) == 0);
+    }
+    // free(NULL) is a no-op, so this is safe when construction failed early.
+    virtual ~AlphabeticIndexRef() { free(m_locale); }
+
+    // Give direct access to the wrapped ICU index.
+    AlphabeticIndex& operator*() { return m_index; }
+    AlphabeticIndex* operator->() { return &m_index; }
+
+    // True when |locale| is non-NULL and equals the stored locale string
+    // (exact, case-sensitive comparison).
+    bool isLocale(const char *locale) const {
+        return (locale != NULL && m_locale != NULL &&
+                strcmp(m_locale, locale) == 0);
+    }
+    // True when the language of the construction locale is ULOC_JAPANESE.
+    bool isJapanese() const { return m_isJapanese; }
+    // Copies the label of bucket |bucketIndex| into |labelBuf|; defined below.
+    int32_t getLabel(int32_t bucketIndex, UChar *labelBuf, int32_t labelBufSize);
+
+private:
+    AlphabeticIndex m_index;   // ICU index built for the locale
+    char *m_locale;            // strdup()'d locale string, freed in the destructor
+    bool m_isJapanese;         // cached result of the language check
};
-/**
- * Binary search to map an individual character to the corresponding phone book index.
- */
-static UChar map_character(UChar c, UChar * char_map, int32_t length) {
- int from = 0, to = length;
- while (from < to) {
- int m = ((to + from) >> 1) & ~0x1; // Only consider even positions
- UChar cm = char_map[m];
- if (cm == c) {
- return char_map[m + 1];
- } else if (cm < c) {
- from = m + 2;
+int32_t AlphabeticIndexRef::getLabel(int32_t bucketIndex, UChar *labelBuf,
+ int32_t labelBufSize) {
+ UErrorCode status = U_ZERO_ERROR;
+ m_index.resetBucketIterator(status);
+ if (U_FAILURE(status)) {
+ return -1;
+ }
+ for(int i = 0; i <= bucketIndex; ++i) {
+ if (!m_index.nextBucket(status) || U_FAILURE(status)) {
+ return -1;
+ }
+ }
+
+ int32_t len;
+ if (m_index.getBucketLabelType() == U_ALPHAINDEX_NORMAL) {
+ len = m_index.getBucketLabel().extract(labelBuf, labelBufSize, status);
+ if (U_FAILURE(status)) {
+ return -1;
+ }
} else {
- to = m;
+ // Use no label for underflow/inflow/overflow buckets
+ labelBuf[0] = '\0';
+ len = 0;
}
- }
- return 0;
+ return len;
}
+static Mutex gIndexMutex;
+static sp<AlphabeticIndexRef> gIndex;
+
/**
 * Returns TRUE if the character is used in the Han, Hiragana or Katakana script
*/
-static bool is_CJK(UChar c) {
- return
- (0x4e00 <= c && c <= 0x9fff) // CJK_UNIFIED_IDEOGRAPHS
- || (0x3400 <= c && c <= 0x4dbf) // CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
- || (0x3000 <= c && c <= 0x303f) // CJK_SYMBOLS_AND_PUNCTUATION
- || (0x2e80 <= c && c <= 0x2eff) // CJK_RADICALS_SUPPLEMENT
- || (0x3300 <= c && c <= 0x33ff) // CJK_COMPATIBILITY
- || (0xfe30 <= c && c <= 0xfe4f) // CJK_COMPATIBILITY_FORMS
- || (0xf900 <= c && c <= 0xfaff); // CJK_COMPATIBILITY_IDEOGRAPHS
+static bool is_CJ(UChar32 c) {
+ return (uscript_hasScript(c, USCRIPT_HAN) ||
+ uscript_hasScript(c, USCRIPT_HIRAGANA) ||
+ uscript_hasScript(c, USCRIPT_KATAKANA));
+}
+
+static bool initIndexForLocale(const char *locale) {
+ if (locale == NULL) {
+ return false;
+ }
+
+ if (gIndex != NULL && gIndex->isLocale(locale)) {
+ return true;
+ }
+
+ UErrorCode status = U_ZERO_ERROR;
+ sp<AlphabeticIndexRef> newIndex(new AlphabeticIndexRef(locale, status));
+ if (newIndex == NULL || U_FAILURE(status)) {
+ return false;
+ }
+ // Always create labels for Latin characters if not present in native set
+ (*newIndex)->addLabels("en", status);
+ if (U_FAILURE(status)) {
+ return false;
+ }
+ if ((*newIndex)->getBucketCount(status) <= 0 || U_FAILURE(status)) {
+ return false;
+ }
+
+ gIndex = newIndex;
+ return true;
}
-int32_t GetPhonebookIndex(UCharIterator * iter, const char * locale, UChar * out, int32_t size,
- UBool * isError)
+int32_t GetPhonebookIndex(UCharIterator *iter, const char *locale,
+ UChar *out, int32_t size, UBool *isError)
{
- if (size < MIN_OUTPUT_SIZE) {
- *isError = TRUE;
- return 0;
- }
-
- *isError = FALSE;
-
- // Normalize the first character to remove accents using the NFD normalization
- UErrorCode errorCode = U_ZERO_ERROR;
- int32_t len = unorm_next(iter, out, size, UNORM_NFD,
- 0 /* options */, TRUE /* normalize */, NULL, &errorCode);
- if (U_FAILURE(errorCode)) {
- *isError = TRUE;
- return 0;
- }
-
- if (len == 0) { // Empty input string
- return 0;
- }
-
- UChar c = out[0];
-
- if (!u_isalpha(c)) {
- // Digits go into a # section. Everything else goes into the empty section
- // The unicode function u_isdigit would also identify other characters as digits (arabic),
- // but if we caught them here we'd risk having the same section before and after alpha-letters
- // which might break the assumption that each section exists only once
- if (c >= '0' && c <= '9') {
- out[0] = '#';
- return 1;
- }
- return 0;
- }
-
- c = u_toupper(c);
-
- // Check for explicitly mapped characters
- UChar c_mapped = map_character(c, DEFAULT_CHAR_MAP, sizeof(DEFAULT_CHAR_MAP) / sizeof(UChar));
- if (c_mapped != 0) {
- out[0] = c_mapped;
- return 1;
- }
-
- // Convert Kanas to Hiragana
- UChar next = len > 2 ? out[1] : 0;
- c = android::GetNormalizedCodePoint(c, next, NULL);
-
- // Traditional grouping of Hiragana characters
- if (0x3041 <= c && c <= 0x309F) {
- if (c < 0x304B) c = 0x3042; // a
- else if (c < 0x3055) c = 0x304B; // ka
- else if (c < 0x305F) c = 0x3055; // sa
- else if (c < 0x306A) c = 0x305F; // ta
- else if (c < 0x306F) c = 0x306A; // na
- else if (c < 0x307E) c = 0x306F; // ha
- else if (c < 0x3083) c = 0x307E; // ma
- else if (c < 0x3089) c = 0x3084; // ya
- else if (c < 0x308E) c = 0x3089; // ra
- else if (c < 0x3094) c = 0x308F; // wa
- else return 0; // Others are not readable
- out[0] = c;
- return 1;
- } else if (0x30A0 <= c && c <= 0x30FF) {
- // Dot, onbiki, iteration marks are not readable
- return 0;
- }
-
- if (is_CJK(c)) {
- if (strncmp(locale, "ja", 2) == 0) {
- // Japanese word meaning "misc" or "other"
- out[0] = 0x4ED6;
- return 1;
- } else {
- return 0;
+ if (size < MIN_OUTPUT_SIZE) {
+ *isError = TRUE;
+ return 0;
+ }
+
+ *isError = FALSE;
+ out[0] = '\0';
+ iter->move(iter, 0, UITER_ZERO);
+ if (!iter->hasNext(iter)) { // Empty input string
+ return 0;
+ }
+ UnicodeString ustr;
+ bool prefixIsNonNumeric = false;
+ bool prefixIsNumeric = false;
+ while (iter->hasNext(iter)) {
+ UChar32 ch = uiter_next32(iter);
+ // Ignore standard phone number separators and identify any string
+ // that otherwise starts with a number.
+ if (!prefixIsNumeric && !prefixIsNonNumeric) {
+ if (u_isdigit(ch)) {
+ prefixIsNumeric = true;
+ } else if (!u_isspace(ch) && ch != '+' && ch != '(' &&
+ ch != ')' && ch != '.' && ch != '-' && ch != '#') {
+ prefixIsNonNumeric = true;
+ }
+ }
+ ustr.append(ch);
+ }
+ if (prefixIsNumeric) {
+ out[0] = '#';
+ return 1;
+ }
+
+ Mutex::Autolock autolock(gIndexMutex);
+ if (!initIndexForLocale(locale)) {
+ *isError = TRUE;
+ return 0;
+ }
+
+ UErrorCode status = U_ZERO_ERROR;
+ int32_t bucketIndex = (*gIndex)->getBucketIndex(ustr, status);
+ if (U_FAILURE(status)) {
+ *isError = TRUE;
+ return 0;
+ }
+
+ int32_t len = gIndex->getLabel(bucketIndex, out, size);
+ if (len < 0) {
+ *isError = TRUE;
+ return 0;
+ }
+
+ // For Japanese, label unclassified CJK ideographs with
+ // Japanese word meaning "misc" or "other"
+ if (gIndex->isJapanese() && len == 0 && is_CJ(ustr.char32At(0))) {
+ out[0] = 0x4ED6;
+ len = 1;
}
- }
- out[0] = c;
- return 1;
+ return len;
}
} // namespace android
diff --git a/android/PhonebookIndexTest.cpp b/android/PhonebookIndexTest.cpp
new file mode 100644
index 0000000..2f11dbe
--- /dev/null
+++ b/android/PhonebookIndexTest.cpp
@@ -0,0 +1,243 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "PhonebookIndex.h"
+
+#include <unicode/unistr.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+using namespace android;
+
+class TestExecutor {
+public:
+ TestExecutor() : m_total_count(0), m_success_count(0), m_success(true) {}
+ bool DoAllTests();
+private:
+ void DoOneTest(void (TestExecutor::*test)());
+
+ void testGetIndex(const char *src, const char *locale,
+ int32_t expected_len, UChar *expected_value);
+ void testEnglish();
+
+ // Note: When adding a test, do not forget to add it to DoAllTests().
+
+ int m_total_count;
+ int m_success_count;
+
+ bool m_success;
+};
+
+
+bool TestExecutor::DoAllTests() {
+ DoOneTest(&TestExecutor::testEnglish);
+
+ printf("Test total: %d\nSuccess: %d\nFailure: %d\n",
+ m_total_count, m_success_count, m_total_count - m_success_count);
+
+ bool success = m_total_count == m_success_count;
+ printf("\n%s\n", success ? "Success" : "Failure");
+
+ return success;
+}
+
+void TestExecutor::DoOneTest(void (TestExecutor::*test)()) {
+ m_success = true;
+
+ (this->*test)();
+
+ ++m_total_count;
+ m_success_count += m_success ? 1 : 0;
+}
+
+#define BUFFER_SIZE 10
+
+static void printUTF8Str(const char *utf8_str) {
+ printf("%s (", utf8_str);
+ for(; *utf8_str != '\0'; ++utf8_str) {
+ printf("\\x%02hhX", *utf8_str);
+ }
+ printf(")");
+}
+
+static void printUChars(const UChar *uc_str, int32_t len) {
+ std::string utf8_str;
+ UnicodeString(uc_str, len).toUTF8String(utf8_str);
+ printf("%s (", utf8_str.c_str());
+ for(int i=0; i<len; ++i) {
+ printf("0x%02hx%s", uc_str[i], i < (len - 1) ? " " : "");
+ }
+ printf(")");
+}
+
+// Runs GetPhonebookIndex() on the UTF-8 string |src| for |locale| and checks
+// that the result is exactly the |expected_len| UChars in |expected_value|.
+// Prints a diagnostic and clears m_success when the call reports an error or
+// when the length or contents differ.
+void TestExecutor::testGetIndex(
+        const char *src, const char *locale,
+        int32_t expected_len, UChar *expected_value) {
+    UBool isError;
+
+    UCharIterator iter;
+    uiter_setUTF8(&iter, src, -1);
+
+    UChar outBuf[BUFFER_SIZE];
+
+    // Pass the capacity in UChars rather than sizeof(outBuf):
+    // GetPhonebookIndex() hands this size to UnicodeString::extract() as the
+    // destination capacity, which ICU counts in UChars, so a byte count
+    // would claim twice the room outBuf really has.
+    int32_t len = GetPhonebookIndex(&iter, locale, outBuf, BUFFER_SIZE, &isError);
+    if (isError) {
+        printf("GetPhonebookIndex returned error (%s:%s)\n", locale, src);
+        m_success = false;
+    } else if (len != expected_len) {
+        // Lengths are signed int32_t values, hence %d.
+        printf("len is unexpected value (src: [%s] %s, ", locale, src);
+        printf("actual: %d (", len);
+        printUChars(outBuf, len);
+        printf("), expected: %d (", expected_len);
+        printUChars(expected_value, expected_len);
+        printf("))\n");
+        m_success = false;
+    } else {
+        printf("[%s] %s: ", locale, src);
+        printUChars(outBuf, len);
+
+        // Same length: compare the label contents code unit by code unit.
+        if (memcmp(outBuf, expected_value, sizeof(UChar)*expected_len) != 0) {
+            printf(", expected ");
+            printUChars(expected_value, expected_len);
+            m_success = false;
+        }
+        printf("\n");
+    }
+}
+
+#define TEST_GET_UTF8STR_INDEX(src, locale, ...) \
+ ({ \
+ UChar uc_expected[] = {__VA_ARGS__}; \
+ int32_t len = sizeof(uc_expected)/sizeof(UChar); \
+ testGetIndex((src), (locale), len, uc_expected); \
+ })
+
+#define TEST_GET_UCHAR_INDEX(src, locale, ...) \
+ ({ \
+ std::string utf8_str; \
+ UnicodeString((UChar) (src)).toUTF8String(utf8_str); \
+ TEST_GET_UTF8STR_INDEX(utf8_str.c_str(), (locale), __VA_ARGS__); \
+ })
+
+void TestExecutor::testEnglish() {
+ printf("testEnglish()\n");
+
+ // English [A-Z]
+ TEST_GET_UTF8STR_INDEX("Allen", "en", 'A');
+ TEST_GET_UTF8STR_INDEX("allen", "en", 'A');
+ TEST_GET_UTF8STR_INDEX("123456", "en", '#');
+ TEST_GET_UTF8STR_INDEX("+1 (123) 456-7890", "en", '#');
+ TEST_GET_UTF8STR_INDEX("(33) 44.55.66.08", "en", '#');
+ TEST_GET_UTF8STR_INDEX("123 Jump", "en", '#');
+ // Arabic numbers
+ TEST_GET_UTF8STR_INDEX("\u0662\u0663\u0664\u0665\u0666", "en", '#');
+
+ // Japanese
+ // sorts hiragana/katakana, Kanji/Chinese, English, other
+ // …, あ, か, さ, た, な, は, ま, や, ら, わ, …
+ // hiragana "a"
+ TEST_GET_UCHAR_INDEX(0x3041, "ja", 0x3042);
+ // katakana "a"
+ TEST_GET_UCHAR_INDEX(0x30A1, "ja", 0x3042);
+
+ // Kanji (sorts to inflow section)
+ TEST_GET_UCHAR_INDEX(0x65E5, "ja", 0x4ed6);
+ // English
+ TEST_GET_UTF8STR_INDEX("Smith", "ja", 'S');
+ TEST_GET_UTF8STR_INDEX("234567", "ja", '#');
+ // Chinese (sorts to inflow section)
+ TEST_GET_UCHAR_INDEX(0x6c88 /* Shen/Chen */, "ja", 0x4ed6);
+ // Korean Hangul (sorts to overflow section)
+ TEST_GET_UCHAR_INDEX(0x1100, "ja", /* null */ );
+
+ // Korean (sorts Korean, then English)
+ // …, ᄀ, ᄂ, ᄃ, ᄅ, ᄆ, ᄇ, ᄉ, ᄋ, ᄌ, ᄎ, ᄏ, ᄐ, ᄑ, ᄒ, …
+ TEST_GET_UCHAR_INDEX(0x1100, "ko", 0x1100);
+ TEST_GET_UCHAR_INDEX(0x3131, "ko", 0x1100);
+ TEST_GET_UCHAR_INDEX(0x1101, "ko", 0x1100);
+ TEST_GET_UCHAR_INDEX(0x1161, "ko", 0x1112);
+
+ // Czech
+ // …, [A-C], Č,[D-H], CH, [I-R], Ř, S, Š, [T-Z], Ž, …
+ TEST_GET_UTF8STR_INDEX("Cena", "cs", 'C');
+ TEST_GET_UTF8STR_INDEX("Čáp", "cs", 0x010c);
+ TEST_GET_UTF8STR_INDEX("Ruda", "cs", 'R');
+ TEST_GET_UTF8STR_INDEX("Řada", "cs", 0x0158);
+ TEST_GET_UTF8STR_INDEX("Selka", "cs", 'S');
+ TEST_GET_UTF8STR_INDEX("Šála", "cs", 0x0160);
+ TEST_GET_UTF8STR_INDEX("Zebra", "cs", 'Z');
+ TEST_GET_UTF8STR_INDEX("Žába", "cs", 0x017d);
+ TEST_GET_UTF8STR_INDEX("Chata", "cs", 'C', 'H');
+
+ // French: [A-Z] (no accented chars)
+ TEST_GET_UTF8STR_INDEX("Øfer", "fr", 'O');
+ TEST_GET_UTF8STR_INDEX("Œster", "fr", 'O');
+
+ // Danish: [A-Z], Æ, Ø, Å
+ TEST_GET_UTF8STR_INDEX("Ænes", "da", 0xc6);
+ TEST_GET_UTF8STR_INDEX("Øfer", "da", 0xd8);
+ TEST_GET_UTF8STR_INDEX("Œster", "da", 0xd8);
+ TEST_GET_UTF8STR_INDEX("Ågård", "da", 0xc5);
+
+ // German: [A-Z] (no ß or umlauted characters in standard alphabet)
+ TEST_GET_UTF8STR_INDEX("ßind", "de", 'S');
+
+ // Simplified Chinese (default collator Pinyin): [A-Z]
+ // Shen/Chen (simplified): should be, usually, 'S' for name collator and 'C' for apps/other
+ TEST_GET_UCHAR_INDEX(0x6c88 /* Shen/Chen */, "zh_CN", 'C');
+ // Shen/Chen (traditional)
+ TEST_GET_UCHAR_INDEX(0x700b, "zh_CN", 'S');
+ // Jia/Gu: should be, usually, 'J' for name collator and 'G' for apps/other
+ TEST_GET_UCHAR_INDEX(0x8d3e /* Jia/Gu */, "zh_CN", 'J');
+
+ // Traditional Chinese
+ // …, 一, 丁, 丈, 不, 且, 丞, 串, 並, 亭, 乘, 乾, 傀, 亂, 僎, 僵, 儐, 償, 叢, 儳, 嚴, 儷, 儻, 囌, 囑, 廳, …
+ TEST_GET_UCHAR_INDEX(0x6c88 /* Shen/Chen */, "zh_TW", 0x5080);
+ TEST_GET_UCHAR_INDEX(0x700b /* Shen/Chen */, "zh_TW", 0x53e2);
+ TEST_GET_UCHAR_INDEX(0x8d3e /* Jia/Gu */, "zh_TW", 0x5080);
+
+ // Thai (sorts English then Thai)
+ // …, ก, ข, ฃ, ค, ฅ, ฆ, ง, จ, ฉ, ช, ซ, ฌ, ญ, ฎ, ฏ, ฐ, ฑ, ฒ, ณ, ด, ต, ถ, ท, ธ, น, บ, ป, ผ, ฝ, พ, ฟ, ภ, ม, ย, ร, ฤ, ล, ฦ, ว, ศ, ษ, ส, ห, ฬ, อ, ฮ, …,
+
+ TEST_GET_UTF8STR_INDEX("\u0e2d\u0e07\u0e04\u0e4c\u0e40\u0e25\u0e47\u0e01",
+ "th", 0xe2d);
+ TEST_GET_UTF8STR_INDEX("\u0e2a\u0e34\u0e07\u0e2b\u0e40\u0e2a\u0e19\u0e35",
+ "th", 0xe2a);
+ // Thai numbers ((02) 432-0281)
+ TEST_GET_UTF8STR_INDEX("(\u0e50\u0e52) \u0e54\u0e53\u0e52-"
+ "\u0e50\u0e52\u0e58\u0e51", "th", '#');
+
+ // Arabic (sorts English then Arabic)
+ // …, ا, ب, ت, ث, ج, ح, خ, د, ذ, ر, ز, س, ش, ص, ض, ط, ظ, ع, غ, ف, ق, ك, ل, م, ن, ه, و, ي, …
+ TEST_GET_UTF8STR_INDEX("\u0646\u0648\u0631" /* Noor */, "ar", 0x646);
+ // Arabic numbers (34567)
+ TEST_GET_UTF8STR_INDEX("\u0662\u0663\u0664\u0665\u0666", "ar", '#');
+
+ // Hebrew (sorts English then Hebrew)
+ // …, א, ב, ג, ד, ה, ו, ז, ח, ט, י, כ, ל, מ, נ, ס, ע, פ, צ, ק, ר, ש, ת, …
+ TEST_GET_UTF8STR_INDEX("\u05e4\u05e8\u05d9\u05d3\u05de\u05df", "he", 0x5e4);
+}
+
+int main() {
+ TestExecutor executor;
+ if(executor.DoAllTests()) {
+ return 0;
+ } else {
+ return 1;
+ }
+}
diff --git a/android/PhoneticStringUtils.cpp b/android/PhoneticStringUtils.cpp
deleted file mode 100644
index 796eaa2..0000000
--- a/android/PhoneticStringUtils.cpp
+++ /dev/null
@@ -1,319 +0,0 @@
-/*
- * Copyright (C) 2009 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-
-#include "PhoneticStringUtils.h"
-#include <utils/Unicode.h>
-
-// We'd like 0 length string last of sorted list. So when input string is NULL
-// or 0 length string, we use these instead.
-#define CODEPOINT_FOR_NULL_STR 0xFFFD
-#define STR_FOR_NULL_STR "\xEF\xBF\xBD"
-
-// We assume that users will not notice strings not sorted properly when the
-// first 128 characters are the same.
-#define MAX_CODEPOINTS 128
-
-namespace android {
-
-// Get hiragana from halfwidth katakana.
-static int GetHiraganaFromHalfwidthKatakana(char32_t codepoint,
- char32_t next_codepoint,
- bool *next_is_consumed) {
- if (codepoint < 0xFF66 || 0xFF9F < codepoint) {
- return codepoint;
- }
-
- switch (codepoint) {
- case 0xFF66: // wo
- return 0x3092;
- case 0xFF67: // xa
- return 0x3041;
- case 0xFF68: // xi
- return 0x3043;
- case 0xFF69: // xu
- return 0x3045;
- case 0xFF6A: // xe
- return 0x3047;
- case 0xFF6B: // xo
- return 0x3049;
- case 0xFF6C: // xya
- return 0x3083;
- case 0xFF6D: // xyu
- return 0x3085;
- case 0xFF6E: // xyo
- return 0x3087;
- case 0xFF6F: // xtsu
- return 0x3063;
- case 0xFF70: // -
- return 0x30FC;
- case 0xFF9C: // wa
- return 0x308F;
- case 0xFF9D: // n
- return 0x3093;
- break;
- default: {
- if (0xFF71 <= codepoint && codepoint <= 0xFF75) {
- // a, i, u, e, o
- if (codepoint == 0xFF73 && next_codepoint == 0xFF9E) {
- if (next_is_consumed != NULL) {
- *next_is_consumed = true;
- }
- return 0x3094; // vu
- } else {
- return 0x3042 + (codepoint - 0xFF71) * 2;
- }
- } else if (0xFF76 <= codepoint && codepoint <= 0xFF81) {
- // ka - chi
- if (next_codepoint == 0xFF9E) {
- // "dakuten" (voiced mark)
- if (next_is_consumed != NULL) {
- *next_is_consumed = true;
- }
- return 0x304B + (codepoint - 0xFF76) * 2 + 1;
- } else {
- return 0x304B + (codepoint - 0xFF76) * 2;
- }
- } else if (0xFF82 <= codepoint && codepoint <= 0xFF84) {
- // tsu, te, to (skip xtsu)
- if (next_codepoint == 0xFF9E) {
- // "dakuten" (voiced mark)
- if (next_is_consumed != NULL) {
- *next_is_consumed = true;
- }
- return 0x3064 + (codepoint - 0xFF82) * 2 + 1;
- } else {
- return 0x3064 + (codepoint - 0xFF82) * 2;
- }
- } else if (0xFF85 <= codepoint && codepoint <= 0xFF89) {
- // na, ni, nu, ne, no
- return 0x306A + (codepoint - 0xFF85);
- } else if (0xFF8A <= codepoint && codepoint <= 0xFF8E) {
- // ha, hi, hu, he, ho
- if (next_codepoint == 0xFF9E) {
- // "dakuten" (voiced mark)
- if (next_is_consumed != NULL) {
- *next_is_consumed = true;
- }
- return 0x306F + (codepoint - 0xFF8A) * 3 + 1;
- } else if (next_codepoint == 0xFF9F) {
- // "han-dakuten" (half voiced mark)
- if (next_is_consumed != NULL) {
- *next_is_consumed = true;
- }
- return 0x306F + (codepoint - 0xFF8A) * 3 + 2;
- } else {
- return 0x306F + (codepoint - 0xFF8A) * 3;
- }
- } else if (0xFF8F <= codepoint && codepoint <= 0xFF93) {
- // ma, mi, mu, me, mo
- return 0x307E + (codepoint - 0xFF8F);
- } else if (0xFF94 <= codepoint && codepoint <= 0xFF96) {
- // ya, yu, yo
- return 0x3084 + (codepoint - 0xFF94) * 2;
- } else if (0xFF97 <= codepoint && codepoint <= 0xFF9B) {
- // ra, ri, ru, re, ro
- return 0x3089 + (codepoint - 0xFF97);
- }
- // Note: 0xFF9C, 0xFF9D are handled above
- } // end of default
- }
-
- return codepoint;
-}
-
-// Assuming input is hiragana, convert the hiragana to "normalized" hiragana.
-static int GetNormalizedHiragana(int codepoint) {
- if (codepoint < 0x3040 || 0x309F < codepoint) {
- return codepoint;
- }
-
- // TODO: should care (semi-)voiced mark (0x3099, 0x309A).
-
- // Trivial kana conversions.
- // e.g. xa => a
- switch (codepoint) {
- case 0x3041:
- case 0x3043:
- case 0x3045:
- case 0x3047:
- case 0x3049:
- case 0x3063:
- case 0x3083:
- case 0x3085:
- case 0x3087:
- case 0x308E: // xwa
- return codepoint + 1;
- case 0x3095: // xka
- return 0x304B;
- case 0x3096: // xke
- return 0x3051;
- case 0x31F0: // xku
- return 0x304F;
- case 0x31F1: // xsi
- return 0x3057;
- case 0x31F2: // xsu
- return 0x3059;
- case 0x31F3: // xto
- return 0x3068;
- case 0x31F4: // xnu
- return 0x306C;
- case 0x31F5: // xha
- return 0x306F;
- case 0x31F6: // xhi
- return 0x3072;
- case 0x31F7: // xhu
- return 0x3075;
- case 0x31F8: // xhe
- return 0x3078;
- case 0x31F9: // xho
- return 0x307B;
- case 0x31FA: // xmu
- return 0x3080;
- case 0x31FB: // xra
- case 0x31FC: // xri
- case 0x31FD: // xru
- case 0x31FE: // xre
- case 0x31FF: // xro
- // ra: 0x3089
- return 0x3089 + (codepoint - 0x31FB);
- default:
- return codepoint;
- }
-}
-
-static int GetNormalizedKana(char32_t codepoint,
- char32_t next_codepoint,
- bool *next_is_consumed) {
- // First, convert fullwidth katakana and halfwidth katakana to hiragana.
- if (0x30A1 <= codepoint && codepoint <= 0x30F6) {
- // Make fullwidth katakana same as hiragana.
- // 96 == 0x30A1 - 0x3041c
- codepoint = codepoint - 96;
- } else if (codepoint == 0x309F) {
- // Digraph YORI; Yo
- codepoint = 0x3088;
- } else if (codepoint == 0x30FF) {
- // Digraph KOTO; Ko
- codepoint = 0x3053;
- } else {
- codepoint = GetHiraganaFromHalfwidthKatakana(
- codepoint, next_codepoint, next_is_consumed);
- }
-
- // Normalize Hiragana.
- return GetNormalizedHiragana(codepoint);
-}
-
-int GetNormalizedCodePoint(char32_t codepoint,
- char32_t next_codepoint,
- bool *next_is_consumed) {
- if (next_is_consumed != NULL) {
- *next_is_consumed = false;
- }
-
- if (codepoint <= 0x0020 || codepoint == 0x3000) {
- // Whitespaces. Keep it as is.
- return codepoint;
- } else if ((0x0021 <= codepoint && codepoint <= 0x007E) ||
- (0xFF01 <= codepoint && codepoint <= 0xFF5E)) {
- // Ascii and fullwidth ascii. Keep it as is
- return codepoint;
- } else if (codepoint == 0x02DC || codepoint == 0x223C) {
- // tilde
- return 0xFF5E;
- } else if (codepoint <= 0x3040 ||
- (0x3100 <= codepoint && codepoint < 0xFF00) ||
- codepoint == CODEPOINT_FOR_NULL_STR) {
- // Keep it as is.
- return codepoint;
- }
-
- // Below is Kana-related handling.
-
- return GetNormalizedKana(codepoint, next_codepoint, next_is_consumed);
-}
-
-static bool GetExpectedString(
- const char *src, char **dst, size_t *dst_len,
- int (*get_codepoint_function)(char32_t, char32_t, bool*)) {
- if (dst == NULL || dst_len == NULL) {
- return false;
- }
-
- if (src == NULL || *src == '\0') {
- src = STR_FOR_NULL_STR;
- }
-
- char32_t codepoints[MAX_CODEPOINTS]; // if array size is changed the for loop needs to be changed
-
- ssize_t src_len = utf8_length(src);
- if (src_len <= 0) {
- return false;
- }
-
- bool next_is_consumed;
- size_t j = 0;
- for (size_t i = 0; i < (size_t)src_len && j < MAX_CODEPOINTS;) {
- int32_t ret = utf32_from_utf8_at(src, src_len, i, &i);
- if (ret < 0) {
- // failed to parse UTF-8
- return false;
- }
- ret = get_codepoint_function(
- static_cast<char32_t>(ret),
- i + 1 < (size_t)src_len ? src[i + 1] : 0,
- &next_is_consumed);
- if (ret > 0) {
- codepoints[j] = static_cast<char32_t>(ret);
- j++;
- }
- if (next_is_consumed) {
- i++;
- }
- }
- size_t length = j;
-
- if (length == 0) {
- // If all of codepoints are invalid, we place the string at the end of
- // the list.
- codepoints[0] = 0x10000 + CODEPOINT_FOR_NULL_STR;
- length = 1;
- }
-
- ssize_t new_len = utf32_to_utf8_length(codepoints, length);
- if (new_len < 0) {
- return false;
- }
-
- *dst = static_cast<char *>(malloc(new_len + 1));
- if (*dst == NULL) {
- return false;
- }
-
- utf32_to_utf8(codepoints, length, *dst);
-
- *dst_len = new_len;
- return true;
-}
-
-bool GetNormalizedString(const char *src, char **dst, size_t *len) {
- return GetExpectedString(src, dst, len, GetNormalizedCodePoint);
-}
-
-} // namespace android
diff --git a/android/PhoneticStringUtils.h b/android/PhoneticStringUtils.h
deleted file mode 100644
index a567a27..0000000
--- a/android/PhoneticStringUtils.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright (C) 2009 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef _ANDROID_PHONETIC_STRING_UTILS_H
-#define _ANDROID_PHONETIC_STRING_UTILS_H
-
-#include <string.h> // For size_t.
-#include <utils/String8.h>
-
-namespace android {
-
-// Returns codepoint which is "normalized", whose definition depends on each
-// Locale. Note that currently this function normalizes only Japanese; the
-// other characters are remained as is.
-// The variable "next_is_consumed" is set to true if "next_codepoint"
-// is "consumed" (e.g. Japanese halfwidth katakana's voiced mark is consumed
-// when previous "codepoint" is appropriate, like half-width "ka").
-//
-// In Japanese, "normalized" means that half-width and full-width katakana is
-// appropriately converted to hiragana.
-int GetNormalizedCodePoint(char32_t codepoint,
- char32_t next_codepoint,
- bool *next_is_consumed);
-
-// Pushes Utf8 expression of "codepoint" to "dst". Returns true when successful.
-// If input is invalid or the length of the destination is not enough,
-// returns false.
-bool GetUtf8FromCodePoint(int codepoint, char *dst, size_t len, size_t *index);
-
-// Creates a "phonetically sortable" Utf8 string and push it into "dst".
-// *dst must be freed after being used outside.
-// If "src" is NULL or its length is 0, "dst" is set to \uFFFF.
-//
-// Note that currently this function considers only Japanese.
-bool GetPhoneticallySortableString(const char *src, char **dst, size_t *len);
-
-// Creates a "normalized" Utf8 string and push it into "dst". *dst must be
-// freed after being used outside.
-// If "src" is NULL or its length is 0, "dst" is set to \uFFFF.
-//
-// Note that currently this function considers only Japanese.
-bool GetNormalizedString(const char *src, char **dst, size_t *len);
-
-} // namespace android
-
-#endif
diff --git a/android/PhoneticStringUtilsTest.cpp b/android/PhoneticStringUtilsTest.cpp
deleted file mode 100644
index 9885823..0000000
--- a/android/PhoneticStringUtilsTest.cpp
+++ /dev/null
@@ -1,217 +0,0 @@
-/*
- * Copyright (C) 2009 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "PhoneticStringUtils.h"
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include <utils/String8.h>
-
-using namespace android;
-
-class TestExecutor {
- public:
- TestExecutor() : m_total_count(0), m_success_count(0), m_success(true) {}
- bool DoAllTests();
- private:
- void DoOneTest(void (TestExecutor::*test)());
-
- void testUtf32At();
- void testGetUtf8FromUtf32();
- void testGetNormalizedString();
- void testLongString();
-
- // Note: When adding a test, do not forget to add it to DoOneTest().
-
- int m_total_count;
- int m_success_count;
-
- bool m_success;
-};
-
-#define ASSERT_EQ_VALUE(input, expected) \
- ({ \
- if ((expected) != (input)) { \
- printf("0x%X(result) != 0x%X(expected)\n", input, expected); \
- m_success = false; \
- return; \
- } \
- })
-
-#define EXPECT_EQ_VALUE(input, expected) \
- ({ \
- if ((expected) != (input)) { \
- printf("0x%X(result) != 0x%X(expected)\n", input, expected); \
- m_success = false; \
- } \
- })
-
-
-bool TestExecutor::DoAllTests() {
- DoOneTest(&TestExecutor::testUtf32At);
- DoOneTest(&TestExecutor::testGetUtf8FromUtf32);
- DoOneTest(&TestExecutor::testGetNormalizedString);
- DoOneTest(&TestExecutor::testLongString);
-
- printf("Test total: %d\nSuccess: %d\nFailure: %d\n",
- m_total_count, m_success_count, m_total_count - m_success_count);
-
- bool success = m_total_count == m_success_count;
- printf("\n%s\n", success ? "Success" : "Failure");
-
- return success;
-}
-
-void TestExecutor::DoOneTest(void (TestExecutor::*test)()) {
- m_success = true;
-
- (this->*test)();
-
- ++m_total_count;
- m_success_count += m_success ? 1 : 0;
-}
-
-#define TEST_GET_UTF32AT(src, index, expected_next, expected_value) \
- ({ \
- size_t next; \
- int32_t ret = utf32_from_utf8_at(src, strlen(src), index, &next); \
- if (ret < 0) { \
- printf("getUtf32At() returned negative value (src: %s, index: %d)\n", \
- (src), (index)); \
- m_success = false; \
- } else if (next != (expected_next)) { \
- printf("next is unexpected value (src: %s, actual: %u, expected: %u)\n", \
- (src), next, (expected_next)); \
- } else { \
- EXPECT_EQ_VALUE(ret, (expected_value)); \
- } \
- })
-
-void TestExecutor::testUtf32At() {
- printf("testUtf32At()\n");
-
- TEST_GET_UTF32AT("a", 0, 1, 97);
- // Japanese hiragana "a"
- TEST_GET_UTF32AT("\xE3\x81\x82", 0, 3, 0x3042);
- // Japanese fullwidth katakana "a" with ascii a
- TEST_GET_UTF32AT("a\xE3\x82\xA2", 1, 4, 0x30A2);
-
- // 2 PUA
- TEST_GET_UTF32AT("\xF3\xBE\x80\x80\xF3\xBE\x80\x88", 0, 4, 0xFE000);
- TEST_GET_UTF32AT("\xF3\xBE\x80\x80\xF3\xBE\x80\x88", 4, 8, 0xFE008);
-}
-
-
-#define EXPECT_EQ_CODEPOINT_UTF8(codepoint, expected) \
- ({ \
- char32_t codepoints[1] = {codepoint}; \
- status_t ret = string8.setTo(codepoints, 1); \
- if (ret != NO_ERROR) { \
- printf("GetUtf8FromCodePoint() returned false at 0x%04X\n", codepoint); \
- m_success = false; \
- } else { \
- const char* string = string8.string(); \
- if (strcmp(string, expected) != 0) { \
- printf("Failed at codepoint 0x%04X\n", codepoint); \
- for (const char *ch = string; *ch != '\0'; ++ch) { \
- printf("0x%X ", *ch); \
- } \
- printf("!= "); \
- for (const char *ch = expected; *ch != '\0'; ++ch) { \
- printf("0x%X ", *ch); \
- } \
- printf("\n"); \
- m_success = false; \
- } \
- } \
- })
-
-void TestExecutor::testGetUtf8FromUtf32() {
- printf("testGetUtf8FromUtf32()\n");
- String8 string8;
-
- EXPECT_EQ_CODEPOINT_UTF8('a', "\x61");
- // Armenian capital letter AYB (2 bytes in UTF8)
- EXPECT_EQ_CODEPOINT_UTF8(0x0530, "\xD4\xB0");
- // Japanese 'a' (3 bytes in UTF8)
- EXPECT_EQ_CODEPOINT_UTF8(0x3042, "\xE3\x81\x82");
- // Kanji
- EXPECT_EQ_CODEPOINT_UTF8(0x65E5, "\xE6\x97\xA5");
- // PUA (4 byets in UTF8)
- EXPECT_EQ_CODEPOINT_UTF8(0xFE016, "\xF3\xBE\x80\x96");
- EXPECT_EQ_CODEPOINT_UTF8(0xFE972, "\xF3\xBE\xA5\xB2");
-}
-
-#define EXPECT_EQ_UTF8_UTF8(src, expected) \
- ({ \
- if (!GetNormalizedString(src, &dst, &len)) { \
- printf("GetNormalizedSortableString() returned false.\n"); \
- m_success = false; \
- } else { \
- if (strcmp(dst, expected) != 0) { \
- for (const char *ch = dst; *ch != '\0'; ++ch) { \
- printf("0x%X ", *ch); \
- } \
- printf("!= "); \
- for (const char *ch = expected; *ch != '\0'; ++ch) { \
- printf("0x%X ", *ch); \
- } \
- printf("\n"); \
- m_success = false; \
- } \
- free(dst); \
- } \
- })
-
-void TestExecutor::testGetNormalizedString() {
- printf("testGetNormalizedString()\n");
- char *dst;
- size_t len;
-
- // halfwidth alphabets/symbols -> keep it as is.
- EXPECT_EQ_UTF8_UTF8("ABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%^&'()",
- "ABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%^&'()");
- EXPECT_EQ_UTF8_UTF8("abcdefghijklmnopqrstuvwxyz[]{}\\@/",
- "abcdefghijklmnopqrstuvwxyz[]{}\\@/");
-
- // halfwidth/fullwidth-katakana -> hiragana
- EXPECT_EQ_UTF8_UTF8(
- "\xE3\x81\x82\xE3\x82\xA4\xE3\x81\x86\xEF\xBD\xB4\xE3\x82\xAA",
- "\xE3\x81\x82\xE3\x81\x84\xE3\x81\x86\xE3\x81\x88\xE3\x81\x8A");
-
- // whitespace -> keep it as is.
- EXPECT_EQ_UTF8_UTF8(" \t", " \t");
-}
-
-void TestExecutor::testLongString() {
- printf("testLongString()\n");
- char * dst;
- size_t len;
- EXPECT_EQ_UTF8_UTF8("Qqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqtttttttttttttttttttttttttttttttttttttttttttttttttgggggggggggggggggggggggggggggggggggggggbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb",
- "Qqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqtttttttttttttttttttttttttttttttttttttttttttttttttggggggggggggggggggggggggggggggggggg");
-}
-
-
-int main() {
- TestExecutor executor;
- if(executor.DoAllTests()) {
- return 0;
- } else {
- return 1;
- }
-}
diff --git a/android/sqlite3_android.cpp b/android/sqlite3_android.cpp
index 5daf15e..fe826fd 100644
--- a/android/sqlite3_android.cpp
+++ b/android/sqlite3_android.cpp
@@ -509,14 +509,8 @@ extern "C" int register_localized_collators(sqlite3* handle, const char* systemL
//// PHONEBOOK_COLLATOR
- // The collator may be removed in the near future. Do not depend on it.
- // TODO: it might be better to have another function for registering phonebook collator.
status = U_ZERO_ERROR;
- if (strcmp(systemLocale, "ja") == 0 || strcmp(systemLocale, "ja_JP") == 0) {
- collator = ucol_open("ja@collation=phonebook", &status);
- } else {
- collator = ucol_open(systemLocale, &status);
- }
+ collator = ucol_open(systemLocale, &status);
if (U_FAILURE(status)) {
return -1;
}