diff options
Diffstat (limited to 'android/PhonebookIndex.cpp')
-rw-r--r-- | android/PhonebookIndex.cpp | 317 |
1 files changed, 159 insertions, 158 deletions
diff --git a/android/PhonebookIndex.cpp b/android/PhonebookIndex.cpp index 5cc26e5..68674f4 100644 --- a/android/PhonebookIndex.cpp +++ b/android/PhonebookIndex.cpp @@ -14,192 +14,193 @@ * limitations under the License. */ +#include <stdlib.h> #include <ctype.h> #include <string.h> +#include <stdio.h> +#include <unicode/alphaindex.h> #include <unicode/ucol.h> #include <unicode/uiter.h> #include <unicode/ustring.h> #include <unicode/utypes.h> +#include <unicode/uloc.h> +#include <utils/Mutex.h> +#include <utils/RefBase.h> #include "PhonebookIndex.h" -#include "PhoneticStringUtils.h" #define MIN_OUTPUT_SIZE 6 // Minimum required size for the output buffer (in bytes) namespace android { -// IMPORTANT! Keep the codes below SORTED. We are doing a binary search on the array -static UChar DEFAULT_CHAR_MAP[] = { - 0x00C6, 'A', // AE - 0x00DF, 'S', // Etzett - 0x1100, 0x3131, // HANGUL LETTER KIYEOK - 0x1101, 0x3132, // HANGUL LETTER SSANGKIYEOK - 0x1102, 0x3134, // HANGUL LETTER NIEUN - 0x1103, 0x3137, // HANGUL LETTER TIKEUT - 0x1104, 0x3138, // HANGUL LETTER SSANGTIKEUT - 0x1105, 0x3139, // HANGUL LETTER RIEUL - 0x1106, 0x3141, // HANGUL LETTER MIEUM - 0x1107, 0x3142, // HANGUL LETTER PIEUP - 0x1108, 0x3143, // HANGUL LETTER SSANGPIEUP - 0x1109, 0x3145, // HANGUL LETTER SIOS - 0x110A, 0x3146, // HANGUL LETTER SSANGSIOS - 0x110B, 0x3147, // HANGUL LETTER IEUNG - 0x110C, 0x3148, // HANGUL LETTER CIEUC - 0x110D, 0x3149, // HANGUL LETTER SSANGCIEUC - 0x110E, 0x314A, // HANGUL LETTER CHIEUCH - 0x110F, 0x314B, // HANGUL LETTER KHIEUKH - 0x1110, 0x314C, // HANGUL LETTER THIEUTH - 0x1111, 0x314D, // HANGUL LETTER PHIEUPH - 0x1112, 0x314E, // HANGUL LETTER HIEUH - 0x111A, 0x3140, // HANGUL LETTER RIEUL-HIEUH - 0x1121, 0x3144, // HANGUL LETTER PIEUP-SIOS - 0x1161, 0x314F, // HANGUL LETTER A - 0x1162, 0x3150, // HANGUL LETTER AE - 0x1163, 0x3151, // HANGUL LETTER YA - 0x1164, 0x3152, // HANGUL LETTER YAE - 0x1165, 0x3153, // HANGUL LETTER EO - 0x1166, 0x3154, // HANGUL LETTER E - 0x1167, 0x3155, // HANGUL LETTER YEO - 0x1168, 0x3156, // HANGUL LETTER YE - 0x1169, 0x3157, // HANGUL LETTER O - 0x116A, 0x3158, // HANGUL LETTER WA - 0x116B, 0x3159, // HANGUL LETTER WAE - 0x116C, 0x315A, // HANGUL LETTER OE - 0x116D, 0x315B, // HANGUL LETTER YO - 0x116E, 0x315C, // HANGUL LETTER U - 0x116F, 0x315D, // HANGUL LETTER WEO - 0x1170, 0x315E, // HANGUL LETTER WE - 0x1171, 0x315F, // HANGUL LETTER WI - 0x1172, 0x3160, // HANGUL LETTER YU - 0x1173, 0x3161, // HANGUL LETTER EU - 0x1174, 0x3162, // HANGUL LETTER YI - 0x1175, 0x3163, // HANGUL LETTER I - 0x11AA, 0x3133, // HANGUL LETTER KIYEOK-SIOS - 0x11AC, 0x3135, // HANGUL LETTER NIEUN-CIEUC - 0x11AD, 0x3136, // HANGUL LETTER NIEUN-HIEUH - 0x11B0, 0x313A, // HANGUL LETTER RIEUL-KIYEOK - 0x11B1, 0x313B, // HANGUL LETTER RIEUL-MIEUM - 0x11B3, 0x313D, // HANGUL LETTER RIEUL-SIOS - 0x11B4, 0x313E, // HANGUL LETTER RIEUL-THIEUTH - 0x11B5, 0x313F, // HANGUL LETTER RIEUL-PHIEUPH +// Wrapper class to enable using libutil SmartPointers with AlphabeticIndex. +class AlphabeticIndexRef : public RefBase { +public: + AlphabeticIndexRef(const char *locale, UErrorCode &status) : + m_index(locale, status), m_locale(NULL), m_isJapanese(false) { + if (U_FAILURE(status)) { + return; + } + m_locale = strdup(locale); + if (m_locale == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + return; + } + char language[4]; + uloc_getLanguage(locale, language, sizeof(language), &status); + if (U_FAILURE(status)) { + return; + } + m_isJapanese = (strcmp(language, ULOC_JAPANESE) == 0); + } + virtual ~AlphabeticIndexRef() { free(m_locale); } + + AlphabeticIndex& operator*() { return m_index; } + AlphabeticIndex* operator->() { return &m_index; } + + bool isLocale(const char *locale) const { + return (locale != NULL && m_locale != NULL && + strcmp(m_locale, locale) == 0); + } + bool isJapanese() const { return m_isJapanese; } + int32_t getLabel(int32_t bucketIndex, UChar *labelBuf, int32_t labelBufSize); + +private: + AlphabeticIndex m_index; + char *m_locale; + bool m_isJapanese; }; -/** - * Binary search to map an individual character to the corresponding phone book index. - */ -static UChar map_character(UChar c, UChar * char_map, int32_t length) { - int from = 0, to = length; - while (from < to) { - int m = ((to + from) >> 1) & ~0x1; // Only consider even positions - UChar cm = char_map[m]; - if (cm == c) { - return char_map[m + 1]; - } else if (cm < c) { - from = m + 2; +int32_t AlphabeticIndexRef::getLabel(int32_t bucketIndex, UChar *labelBuf, + int32_t labelBufSize) { + UErrorCode status = U_ZERO_ERROR; + m_index.resetBucketIterator(status); + if (U_FAILURE(status)) { + return -1; + } + for(int i = 0; i <= bucketIndex; ++i) { + if (!m_index.nextBucket(status) || U_FAILURE(status)) { + return -1; + } + } + + int32_t len; + if (m_index.getBucketLabelType() == U_ALPHAINDEX_NORMAL) { + len = m_index.getBucketLabel().extract(labelBuf, labelBufSize, status); + if (U_FAILURE(status)) { + return -1; + } } else { - to = m; + // Use no label for underflow/inflow/overflow buckets + labelBuf[0] = '\0'; + len = 0; } - } - return 0; + return len; } +static Mutex gIndexMutex; +static sp<AlphabeticIndexRef> gIndex; + /** * Returns TRUE if the character belongs to a Hanzi unicode block */ -static bool is_CJK(UChar c) { - return - (0x4e00 <= c && c <= 0x9fff) // CJK_UNIFIED_IDEOGRAPHS - || (0x3400 <= c && c <= 0x4dbf) // CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A - || (0x3000 <= c && c <= 0x303f) // CJK_SYMBOLS_AND_PUNCTUATION - || (0x2e80 <= c && c <= 0x2eff) // CJK_RADICALS_SUPPLEMENT - || (0x3300 <= c && c <= 0x33ff) // CJK_COMPATIBILITY - || (0xfe30 <= c && c <= 0xfe4f) // CJK_COMPATIBILITY_FORMS - || (0xf900 <= c && c <= 0xfaff); // CJK_COMPATIBILITY_IDEOGRAPHS +static bool is_CJ(UChar32 c) { + return (uscript_hasScript(c, USCRIPT_HAN) || + uscript_hasScript(c, USCRIPT_HIRAGANA) || + uscript_hasScript(c, USCRIPT_KATAKANA)); +} + +static bool initIndexForLocale(const char *locale) { + if (locale == NULL) { + return false; + } + + if (gIndex != NULL && gIndex->isLocale(locale)) { + return true; + } + + UErrorCode status = U_ZERO_ERROR; + sp<AlphabeticIndexRef> newIndex(new AlphabeticIndexRef(locale, status)); + if (newIndex == NULL || U_FAILURE(status)) { + return false; + } + // Always create labels for Latin characters if not present in native set + (*newIndex)->addLabels("en", status); + if (U_FAILURE(status)) { + return false; + } + if ((*newIndex)->getBucketCount(status) <= 0 || U_FAILURE(status)) { + return false; + } + + gIndex = newIndex; + return true; } -int32_t GetPhonebookIndex(UCharIterator * iter, const char * locale, UChar * out, int32_t size, - UBool * isError) +int32_t GetPhonebookIndex(UCharIterator *iter, const char *locale, + UChar *out, int32_t size, UBool *isError) { - if (size < MIN_OUTPUT_SIZE) { - *isError = TRUE; - return 0; - } - - *isError = FALSE; - - // Normalize the first character to remove accents using the NFD normalization - UErrorCode errorCode = U_ZERO_ERROR; - int32_t len = unorm_next(iter, out, size, UNORM_NFD, - 0 /* options */, TRUE /* normalize */, NULL, &errorCode); - if (U_FAILURE(errorCode)) { - *isError = TRUE; - return 0; - } - - if (len == 0) { // Empty input string - return 0; - } - - UChar c = out[0]; - - if (!u_isalpha(c)) { - // Digits go into a # section. Everything else goes into the empty section - // The unicode function u_isdigit would also identify other characters as digits (arabic), - // but if we caught them here we'd risk having the same section before and after alpha-letters - // which might break the assumption that each section exists only once - if (c >= '0' && c <= '9') { - out[0] = '#'; - return 1; - } - return 0; - } - - c = u_toupper(c); - - // Check for explicitly mapped characters - UChar c_mapped = map_character(c, DEFAULT_CHAR_MAP, sizeof(DEFAULT_CHAR_MAP) / sizeof(UChar)); - if (c_mapped != 0) { - out[0] = c_mapped; - return 1; - } - - // Convert Kanas to Hiragana - UChar next = len > 2 ? out[1] : 0; - c = android::GetNormalizedCodePoint(c, next, NULL); - - // Traditional grouping of Hiragana characters - if (0x3041 <= c && c <= 0x309F) { - if (c < 0x304B) c = 0x3042; // a - else if (c < 0x3055) c = 0x304B; // ka - else if (c < 0x305F) c = 0x3055; // sa - else if (c < 0x306A) c = 0x305F; // ta - else if (c < 0x306F) c = 0x306A; // na - else if (c < 0x307E) c = 0x306F; // ha - else if (c < 0x3083) c = 0x307E; // ma - else if (c < 0x3089) c = 0x3084; // ya - else if (c < 0x308E) c = 0x3089; // ra - else if (c < 0x3094) c = 0x308F; // wa - else return 0; // Others are not readable - out[0] = c; - return 1; - } else if (0x30A0 <= c && c <= 0x30FF) { - // Dot, onbiki, iteration marks are not readable - return 0; - } - - if (is_CJK(c)) { - if (strncmp(locale, "ja", 2) == 0) { - // Japanese word meaning "misc" or "other" - out[0] = 0x4ED6; - return 1; - } else { - return 0; + if (size < MIN_OUTPUT_SIZE) { + *isError = TRUE; + return 0; + } + + *isError = FALSE; + out[0] = '\0'; + iter->move(iter, 0, UITER_ZERO); + if (!iter->hasNext(iter)) { // Empty input string + return 0; + } + UnicodeString ustr; + bool prefixIsNonNumeric = false; + bool prefixIsNumeric = false; + while (iter->hasNext(iter)) { + UChar32 ch = uiter_next32(iter); + // Ignore standard phone number separators and identify any string + // that otherwise starts with a number. + if (!prefixIsNumeric && !prefixIsNonNumeric) { + if (u_isdigit(ch)) { + prefixIsNumeric = true; + } else if (!u_isspace(ch) && ch != '+' && ch != '(' && + ch != ')' && ch != '.' && ch != '-' && ch != '#') { + prefixIsNonNumeric = true; + } + } + ustr.append(ch); + } + if (prefixIsNumeric) { + out[0] = '#'; + return 1; + } + + Mutex::Autolock autolock(gIndexMutex); + if (!initIndexForLocale(locale)) { + *isError = TRUE; + return 0; + } + + UErrorCode status = U_ZERO_ERROR; + int32_t bucketIndex = (*gIndex)->getBucketIndex(ustr, status); + if (U_FAILURE(status)) { + *isError = TRUE; + return 0; + } + + int32_t len = gIndex->getLabel(bucketIndex, out, size); + if (len < 0) { + *isError = TRUE; + return 0; + } + + // For Japanese, label unclassified CJK ideographs with + // Japanese word meaning "misc" or "other" + if (gIndex->isJapanese() && len == 0 && is_CJ(ustr.char32At(0))) { + out[0] = 0x4ED6; + len = 1; } - } - out[0] = c; - return 1; + return len; } } // namespace android |