1 files changed, 159 insertions, 158 deletions
diff --git a/android/PhonebookIndex.cpp b/android/PhonebookIndex.cpp
index 5cc26e5..68674f4 100644
--- a/android/PhonebookIndex.cpp
+++ b/android/PhonebookIndex.cpp
@@ -14,192 +14,193 @@
  * limitations under the License.
  */
 
+#include <stdlib.h>
 #include <ctype.h>
 #include <string.h>
+#include <stdio.h>
 
+#include <unicode/alphaindex.h>
 #include <unicode/ucol.h>
 #include <unicode/uiter.h>
 #include <unicode/ustring.h>
 #include <unicode/utypes.h>
+#include <unicode/uloc.h>
+#include <utils/Mutex.h>
+#include <utils/RefBase.h>
 
 #include "PhonebookIndex.h"
-#include "PhoneticStringUtils.h"
 
 #define MIN_OUTPUT_SIZE 6       // Minimum required size for the output buffer (in bytes)
 
 namespace android {
 
-// IMPORTANT!  Keep the codes below SORTED. We are doing a binary search on the array
-static UChar DEFAULT_CHAR_MAP[] = {
-    0x00C6,    'A',       // AE
-    0x00DF,    'S',       // Etzett
-    0x1100, 0x3131,       // HANGUL LETTER KIYEOK
-    0x1101, 0x3132,       // HANGUL LETTER SSANGKIYEOK
-    0x1102, 0x3134,       // HANGUL LETTER NIEUN
-    0x1103, 0x3137,       // HANGUL LETTER TIKEUT
-    0x1104, 0x3138,       // HANGUL LETTER SSANGTIKEUT
-    0x1105, 0x3139,       // HANGUL LETTER RIEUL
-    0x1106, 0x3141,       // HANGUL LETTER MIEUM
-    0x1107, 0x3142,       // HANGUL LETTER PIEUP
-    0x1108, 0x3143,       // HANGUL LETTER SSANGPIEUP
-    0x1109, 0x3145,       // HANGUL LETTER SIOS
-    0x110A, 0x3146,       // HANGUL LETTER SSANGSIOS
-    0x110B, 0x3147,       // HANGUL LETTER IEUNG
-    0x110C, 0x3148,       // HANGUL LETTER CIEUC
-    0x110D, 0x3149,       // HANGUL LETTER SSANGCIEUC
-    0x110E, 0x314A,       // HANGUL LETTER CHIEUCH
-    0x110F, 0x314B,       // HANGUL LETTER KHIEUKH
-    0x1110, 0x314C,       // HANGUL LETTER THIEUTH
-    0x1111, 0x314D,       // HANGUL LETTER PHIEUPH
-    0x1112, 0x314E,       // HANGUL LETTER HIEUH
-    0x111A, 0x3140,       // HANGUL LETTER RIEUL-HIEUH
-    0x1121, 0x3144,       // HANGUL LETTER PIEUP-SIOS
-    0x1161, 0x314F,       // HANGUL LETTER A
-    0x1162, 0x3150,       // HANGUL LETTER AE
-    0x1163, 0x3151,       // HANGUL LETTER YA
-    0x1164, 0x3152,       // HANGUL LETTER YAE
-    0x1165, 0x3153,       // HANGUL LETTER EO
-    0x1166, 0x3154,       // HANGUL LETTER E
-    0x1167, 0x3155,       // HANGUL LETTER YEO
-    0x1168, 0x3156,       // HANGUL LETTER YE
-    0x1169, 0x3157,       // HANGUL LETTER O
-    0x116A, 0x3158,       // HANGUL LETTER WA
-    0x116B, 0x3159,       // HANGUL LETTER WAE
-    0x116C, 0x315A,       // HANGUL LETTER OE
-    0x116D, 0x315B,       // HANGUL LETTER YO
-    0x116E, 0x315C,       // HANGUL LETTER U
-    0x116F, 0x315D,       // HANGUL LETTER WEO
-    0x1170, 0x315E,       // HANGUL LETTER WE
-    0x1171, 0x315F,       // HANGUL LETTER WI
-    0x1172, 0x3160,       // HANGUL LETTER YU
-    0x1173, 0x3161,       // HANGUL LETTER EU
-    0x1174, 0x3162,       // HANGUL LETTER YI
-    0x1175, 0x3163,       // HANGUL LETTER I
-    0x11AA, 0x3133,       // HANGUL LETTER KIYEOK-SIOS
-    0x11AC, 0x3135,       // HANGUL LETTER NIEUN-CIEUC
-    0x11AD, 0x3136,       // HANGUL LETTER NIEUN-HIEUH
-    0x11B0, 0x313A,       // HANGUL LETTER RIEUL-KIYEOK
-    0x11B1, 0x313B,       // HANGUL LETTER RIEUL-MIEUM
-    0x11B3, 0x313D,       // HANGUL LETTER RIEUL-SIOS
-    0x11B4, 0x313E,       // HANGUL LETTER RIEUL-THIEUTH
-    0x11B5, 0x313F,       // HANGUL LETTER RIEUL-PHIEUPH
+// RefBase wrapper that lets an ICU AlphabeticIndex be held in an sp<> and remembers the locale it was built for.
+class AlphabeticIndexRef : public RefBase {
+public:
+    AlphabeticIndexRef(const char *locale, UErrorCode &status) :  // every failure is reported through `status`
+        m_index(locale, status), m_locale(NULL), m_isJapanese(false) {
+        if (U_FAILURE(status)) {  // nothing more to set up if m_index reported an error
+            return;
+        }
+        m_locale = strdup(locale);  // private copy; released in the destructor
+        if (m_locale == NULL) {
+            status = U_MEMORY_ALLOCATION_ERROR;
+            return;
+        }
+        char language[4];  // fits a 2- or 3-letter language code plus the terminating NUL
+        uloc_getLanguage(locale, language, sizeof(language), &status);
+        if (U_FAILURE(status)) {
+            return;
+        }
+        m_isJapanese = (strcmp(language, ULOC_JAPANESE) == 0);  // only the language part is compared
+    }
+    virtual ~AlphabeticIndexRef() { free(m_locale); }  // m_locale may still be NULL here
+
+    AlphabeticIndex& operator*() { return m_index; }  // direct access to the wrapped index
+    AlphabeticIndex* operator->() { return &m_index; }  // forwards member calls to the wrapped index
+
+    bool isLocale(const char *locale) const {  // true iff both names are non-NULL and strcmp-equal
+        return (locale != NULL && m_locale != NULL &&
+                strcmp(m_locale, locale) == 0);
+    }
+    bool isJapanese() const { return m_isJapanese; }  // value computed once in the constructor
+    int32_t getLabel(int32_t bucketIndex, UChar *labelBuf, int32_t labelBufSize);  // label length, or -1 on error
+
+private:
+    AlphabeticIndex m_index;  // the wrapped ICU index
+    char *m_locale;  // strdup'd locale name; NULL if construction stopped before the copy
+    bool m_isJapanese;  // true when the locale's language equals ULOC_JAPANESE
 };
 
-/**
- * Binary search to map an individual character to the corresponding phone book index.
- */
-static UChar map_character(UChar c, UChar * char_map, int32_t length) {
-  int from = 0, to = length;
-  while (from < to) {
-    int m = ((to + from) >> 1) & ~0x1;    // Only consider even positions
-    UChar cm = char_map[m];
-    if (cm == c) {
-      return char_map[m + 1];
-    } else if (cm < c) {
-      from = m + 2;
+int32_t AlphabeticIndexRef::getLabel(int32_t bucketIndex, UChar *labelBuf,  // copies bucket #bucketIndex's label
+                                     int32_t labelBufSize) {  // returns the label length, or -1 on failure
+    UErrorCode status = U_ZERO_ERROR;
+    m_index.resetBucketIterator(status);  // rewind so the loop below counts from the first bucket
+    if (U_FAILURE(status)) {
+        return -1;
+    }
+    for(int i = 0; i <= bucketIndex; ++i) {  // bucketIndex + 1 steps to reach the requested bucket
+        if (!m_index.nextBucket(status) || U_FAILURE(status)) {  // no such bucket, or ICU reported an error
+            return -1;
+        }
+    }
+
+    int32_t len;
+    if (m_index.getBucketLabelType() == U_ALPHAINDEX_NORMAL) {
+        len = m_index.getBucketLabel().extract(labelBuf, labelBufSize, status);  // copy into the caller's buffer
+        if (U_FAILURE(status)) {
+            return -1;
+        }
     } else {
-      to = m;
+        // Use no label for underflow/inflow/overflow buckets
+        labelBuf[0] = '\0';  // NOTE(review): assumes labelBufSize >= 1; not checked in this function
+        len = 0;
     }
-  }
-  return 0;
+    return len;
 }
 
+static Mutex gIndexMutex;  // locked by GetPhonebookIndex before it initializes or reads gIndex
+static sp<AlphabeticIndexRef> gIndex;  // index for the last successfully initialized locale; replaced on locale change
+
 /**
- * Returns TRUE if the character belongs to a Hanzi unicode block
+ * Returns true if the code point has the Han, Hiragana or Katakana script
  */
-static bool is_CJK(UChar c) {
-  return
-       (0x4e00 <= c && c <= 0x9fff)     // CJK_UNIFIED_IDEOGRAPHS
-    || (0x3400 <= c && c <= 0x4dbf)     // CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
-    || (0x3000 <= c && c <= 0x303f)     // CJK_SYMBOLS_AND_PUNCTUATION
-    || (0x2e80 <= c && c <= 0x2eff)     // CJK_RADICALS_SUPPLEMENT
-    || (0x3300 <= c && c <= 0x33ff)     // CJK_COMPATIBILITY
-    || (0xfe30 <= c && c <= 0xfe4f)     // CJK_COMPATIBILITY_FORMS
-    || (0xf900 <= c && c <= 0xfaff);    // CJK_COMPATIBILITY_IDEOGRAPHS
+static bool is_CJ(UChar32 c) {  // takes a full code point (UChar32); Hangul is not tested here
+    return (uscript_hasScript(c, USCRIPT_HAN) ||
+            uscript_hasScript(c, USCRIPT_HIRAGANA) ||
+            uscript_hasScript(c, USCRIPT_KATAKANA));
+}
+
+static bool initIndexForLocale(const char *locale) {  // makes gIndex match `locale`; its caller holds gIndexMutex
+    if (locale == NULL) {
+        return false;
+    }
+
+    if (gIndex != NULL && gIndex->isLocale(locale)) {  // existing index was built for this locale: reuse it
+        return true;
+    }
+
+    UErrorCode status = U_ZERO_ERROR;
+    sp<AlphabeticIndexRef> newIndex(new AlphabeticIndexRef(locale, status));
+    if (newIndex == NULL || U_FAILURE(status)) {  // on every failure path gIndex is left as it was
+        return false;
+    }
+    // Always create labels for Latin characters if not present in native set
+    (*newIndex)->addLabels("en", status);
+    if (U_FAILURE(status)) {
+        return false;
+    }
+    if ((*newIndex)->getBucketCount(status) <= 0 || U_FAILURE(status)) {  // reject an index without buckets
+        return false;
+    }
+
+    gIndex = newIndex;  // replace the global only after every step above succeeded
+    return true;
 }
 
-int32_t GetPhonebookIndex(UCharIterator * iter, const char * locale, UChar * out, int32_t size,
-        UBool * isError)
+int32_t GetPhonebookIndex(UCharIterator *iter, const char *locale,  // writes the index label for iter's text to out
+                          UChar *out, int32_t size, UBool *isError)  // returns label length; *isError set on failure
 {
-  if (size < MIN_OUTPUT_SIZE) {
-    *isError = TRUE;
-    return 0;
-  }
-
-  *isError = FALSE;
-
-  // Normalize the first character to remove accents using the NFD normalization
-  UErrorCode errorCode = U_ZERO_ERROR;
-  int32_t len = unorm_next(iter, out, size, UNORM_NFD,
-          0 /* options */, TRUE /* normalize */, NULL, &errorCode);
-  if (U_FAILURE(errorCode)) {
-    *isError = TRUE;
-    return 0;
-  }
-
-  if (len == 0) {   // Empty input string
-    return 0;
-  }
-
-  UChar c = out[0];
-
-  if (!u_isalpha(c)) {
-    // Digits go into a # section. Everything else goes into the empty section
-    // The unicode function u_isdigit would also identify other characters as digits (arabic),
-    // but if we caught them here we'd risk having the same section before and after alpha-letters
-    // which might break the assumption that each section exists only once
-    if (c >= '0' && c <= '9') {
-      out[0] = '#';
-      return 1;
-    }
-    return 0;
-  }
-
-  c = u_toupper(c);
-
-  // Check for explicitly mapped characters
-  UChar c_mapped = map_character(c, DEFAULT_CHAR_MAP, sizeof(DEFAULT_CHAR_MAP) / sizeof(UChar));
-  if (c_mapped != 0) {
-    out[0] = c_mapped;
-    return 1;
-  }
-
-  // Convert Kanas to Hiragana
-  UChar next = len > 2 ? out[1] : 0;
-  c = android::GetNormalizedCodePoint(c, next, NULL);
-
-  // Traditional grouping of Hiragana characters
-  if (0x3041 <= c && c <= 0x309F) {
-    if (c < 0x304B) c = 0x3042;         // a
-    else if (c < 0x3055) c = 0x304B;    // ka
-    else if (c < 0x305F) c = 0x3055;    // sa
-    else if (c < 0x306A) c = 0x305F;    // ta
-    else if (c < 0x306F) c = 0x306A;    // na
-    else if (c < 0x307E) c = 0x306F;    // ha
-    else if (c < 0x3083) c = 0x307E;    // ma
-    else if (c < 0x3089) c = 0x3084;    // ya
-    else if (c < 0x308E) c = 0x3089;    // ra
-    else if (c < 0x3094) c = 0x308F;    // wa
-    else return 0;                      // Others are not readable
-    out[0] = c;
-    return 1;
-  } else if (0x30A0 <= c && c <= 0x30FF) {
-    // Dot, onbiki, iteration marks are not readable
-    return 0;
-  }
-
-  if (is_CJK(c)) {
-    if (strncmp(locale, "ja", 2) == 0) {
-      // Japanese word meaning "misc" or "other"
-      out[0] = 0x4ED6;
-      return 1;
-    } else {
-      return 0;
+    if (size < MIN_OUTPUT_SIZE) {  // output buffer smaller than the required minimum
+        *isError = TRUE;
+        return 0;
+    }
+
+    *isError = FALSE;
+    out[0] = '\0';
+    iter->move(iter, 0, UITER_ZERO);  // rewind so the whole string is read from its start
+    if (!iter->hasNext(iter)) {   // Empty input string
+        return 0;
+    }
+    UnicodeString ustr;  // copy of the full input; later passed to getBucketIndex
+    bool prefixIsNonNumeric = false;
+    bool prefixIsNumeric = false;
+    while (iter->hasNext(iter)) {
+        UChar32 ch = uiter_next32(iter);
+        // Ignore standard phone number separators and identify any string
+        // that otherwise starts with a number.
+        if (!prefixIsNumeric && !prefixIsNonNumeric) {  // undecided: only separators seen so far
+            if (u_isdigit(ch)) {
+                prefixIsNumeric = true;
+            } else if (!u_isspace(ch) && ch != '+' && ch != '(' &&
+                       ch != ')' && ch != '.' && ch != '-' && ch != '#') {
+                prefixIsNonNumeric = true;
+            }
+        }
+        ustr.append(ch);
+    }
+    if (prefixIsNumeric) {  // numeric strings get the "#" label without consulting the index
+        out[0] = '#';
+        return 1;
+    }
+
+    Mutex::Autolock autolock(gIndexMutex);  // held for the rest of the function while gIndex is used
+    if (!initIndexForLocale(locale)) {
+        *isError = TRUE;
+        return 0;
+    }
+
+    UErrorCode status = U_ZERO_ERROR;
+    int32_t bucketIndex = (*gIndex)->getBucketIndex(ustr, status);
+    if (U_FAILURE(status)) {
+        *isError = TRUE;
+        return 0;
+    }
+
+    int32_t len = gIndex->getLabel(bucketIndex, out, size);  // negative result means failure
+    if (len < 0) {
+        *isError = TRUE;
+        return 0;
+    }
+
+    // For Japanese, label unclassified CJK ideographs with
+    // Japanese word meaning "misc" or "other"
+    if (gIndex->isJapanese() && len == 0 && is_CJ(ustr.char32At(0))) {  // len == 0: bucket produced no label
+        out[0] = 0x4ED6;
+        len = 1;
     }
-  }
 
-  out[0] = c;
-  return 1;
+    return len;
 }
 
 }  // namespace android