summaryrefslogtreecommitdiff
path: root/android/PhonebookIndex.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'android/PhonebookIndex.cpp')
-rw-r--r--android/PhonebookIndex.cpp317
1 files changed, 159 insertions, 158 deletions
diff --git a/android/PhonebookIndex.cpp b/android/PhonebookIndex.cpp
index 5cc26e5..68674f4 100644
--- a/android/PhonebookIndex.cpp
+++ b/android/PhonebookIndex.cpp
@@ -14,192 +14,193 @@
* limitations under the License.
*/
+#include <stdlib.h>
#include <ctype.h>
#include <string.h>
+#include <stdio.h>
+#include <unicode/alphaindex.h>
#include <unicode/ucol.h>
#include <unicode/uiter.h>
#include <unicode/ustring.h>
#include <unicode/utypes.h>
+#include <unicode/uloc.h>
+#include <utils/Mutex.h>
+#include <utils/RefBase.h>
#include "PhonebookIndex.h"
-#include "PhoneticStringUtils.h"
#define MIN_OUTPUT_SIZE 6 // Minimum required size for the output buffer (in bytes)
namespace android {
-// IMPORTANT! Keep the codes below SORTED. We are doing a binary search on the array
-static UChar DEFAULT_CHAR_MAP[] = {
- 0x00C6, 'A', // AE
- 0x00DF, 'S', // Etzett
- 0x1100, 0x3131, // HANGUL LETTER KIYEOK
- 0x1101, 0x3132, // HANGUL LETTER SSANGKIYEOK
- 0x1102, 0x3134, // HANGUL LETTER NIEUN
- 0x1103, 0x3137, // HANGUL LETTER TIKEUT
- 0x1104, 0x3138, // HANGUL LETTER SSANGTIKEUT
- 0x1105, 0x3139, // HANGUL LETTER RIEUL
- 0x1106, 0x3141, // HANGUL LETTER MIEUM
- 0x1107, 0x3142, // HANGUL LETTER PIEUP
- 0x1108, 0x3143, // HANGUL LETTER SSANGPIEUP
- 0x1109, 0x3145, // HANGUL LETTER SIOS
- 0x110A, 0x3146, // HANGUL LETTER SSANGSIOS
- 0x110B, 0x3147, // HANGUL LETTER IEUNG
- 0x110C, 0x3148, // HANGUL LETTER CIEUC
- 0x110D, 0x3149, // HANGUL LETTER SSANGCIEUC
- 0x110E, 0x314A, // HANGUL LETTER CHIEUCH
- 0x110F, 0x314B, // HANGUL LETTER KHIEUKH
- 0x1110, 0x314C, // HANGUL LETTER THIEUTH
- 0x1111, 0x314D, // HANGUL LETTER PHIEUPH
- 0x1112, 0x314E, // HANGUL LETTER HIEUH
- 0x111A, 0x3140, // HANGUL LETTER RIEUL-HIEUH
- 0x1121, 0x3144, // HANGUL LETTER PIEUP-SIOS
- 0x1161, 0x314F, // HANGUL LETTER A
- 0x1162, 0x3150, // HANGUL LETTER AE
- 0x1163, 0x3151, // HANGUL LETTER YA
- 0x1164, 0x3152, // HANGUL LETTER YAE
- 0x1165, 0x3153, // HANGUL LETTER EO
- 0x1166, 0x3154, // HANGUL LETTER E
- 0x1167, 0x3155, // HANGUL LETTER YEO
- 0x1168, 0x3156, // HANGUL LETTER YE
- 0x1169, 0x3157, // HANGUL LETTER O
- 0x116A, 0x3158, // HANGUL LETTER WA
- 0x116B, 0x3159, // HANGUL LETTER WAE
- 0x116C, 0x315A, // HANGUL LETTER OE
- 0x116D, 0x315B, // HANGUL LETTER YO
- 0x116E, 0x315C, // HANGUL LETTER U
- 0x116F, 0x315D, // HANGUL LETTER WEO
- 0x1170, 0x315E, // HANGUL LETTER WE
- 0x1171, 0x315F, // HANGUL LETTER WI
- 0x1172, 0x3160, // HANGUL LETTER YU
- 0x1173, 0x3161, // HANGUL LETTER EU
- 0x1174, 0x3162, // HANGUL LETTER YI
- 0x1175, 0x3163, // HANGUL LETTER I
- 0x11AA, 0x3133, // HANGUL LETTER KIYEOK-SIOS
- 0x11AC, 0x3135, // HANGUL LETTER NIEUN-CIEUC
- 0x11AD, 0x3136, // HANGUL LETTER NIEUN-HIEUH
- 0x11B0, 0x313A, // HANGUL LETTER RIEUL-KIYEOK
- 0x11B1, 0x313B, // HANGUL LETTER RIEUL-MIEUM
- 0x11B3, 0x313D, // HANGUL LETTER RIEUL-SIOS
- 0x11B4, 0x313E, // HANGUL LETTER RIEUL-THIEUTH
- 0x11B5, 0x313F, // HANGUL LETTER RIEUL-PHIEUPH
+// Ref-counted (RefBase) wrapper so an AlphabeticIndex can be held in a libutils sp<>.
+class AlphabeticIndexRef : public RefBase {
+public:
+ AlphabeticIndexRef(const char *locale, UErrorCode &status) :
+ m_index(locale, status), m_locale(NULL), m_isJapanese(false) {
+ if (U_FAILURE(status)) {
+ return;
+ }
+ m_locale = strdup(locale);
+ if (m_locale == NULL) {
+ status = U_MEMORY_ALLOCATION_ERROR;
+ return;
+ }
+ char language[4];
+ uloc_getLanguage(locale, language, sizeof(language), &status);
+ if (U_FAILURE(status)) {
+ return;
+ }
+ m_isJapanese = (strcmp(language, ULOC_JAPANESE) == 0);
+ }
+ virtual ~AlphabeticIndexRef() { free(m_locale); }
+
+ AlphabeticIndex& operator*() { return m_index; }
+ AlphabeticIndex* operator->() { return &m_index; }
+
+ bool isLocale(const char *locale) const {
+ return (locale != NULL && m_locale != NULL &&
+ strcmp(m_locale, locale) == 0);
+ }
+ bool isJapanese() const { return m_isJapanese; }
+ int32_t getLabel(int32_t bucketIndex, UChar *labelBuf, int32_t labelBufSize);
+
+private:
+ AlphabeticIndex m_index;
+ char *m_locale;
+ bool m_isJapanese;
};
-/**
- * Binary search to map an individual character to the corresponding phone book index.
- */
-static UChar map_character(UChar c, UChar * char_map, int32_t length) {
- int from = 0, to = length;
- while (from < to) {
- int m = ((to + from) >> 1) & ~0x1; // Only consider even positions
- UChar cm = char_map[m];
- if (cm == c) {
- return char_map[m + 1];
- } else if (cm < c) {
- from = m + 2;
+int32_t AlphabeticIndexRef::getLabel(int32_t bucketIndex, UChar *labelBuf,
+ int32_t labelBufSize) {
+ UErrorCode status = U_ZERO_ERROR;
+ m_index.resetBucketIterator(status);
+ if (U_FAILURE(status)) {
+ return -1;
+ }
+ for(int i = 0; i <= bucketIndex; ++i) {
+ if (!m_index.nextBucket(status) || U_FAILURE(status)) {
+ return -1;
+ }
+ }
+
+ int32_t len;
+ if (m_index.getBucketLabelType() == U_ALPHAINDEX_NORMAL) {
+ len = m_index.getBucketLabel().extract(labelBuf, labelBufSize, status);
+ if (U_FAILURE(status)) {
+ return -1;
+ }
} else {
- to = m;
+ // Use no label for underflow/inflow/overflow buckets
+ labelBuf[0] = '\0';
+ len = 0;
}
- }
- return 0;
+ return len;
}
+static Mutex gIndexMutex;
+static sp<AlphabeticIndexRef> gIndex;
+
/**
* Returns true if the character belongs to the Han, Hiragana or Katakana script
*/
-static bool is_CJK(UChar c) {
- return
- (0x4e00 <= c && c <= 0x9fff) // CJK_UNIFIED_IDEOGRAPHS
- || (0x3400 <= c && c <= 0x4dbf) // CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
- || (0x3000 <= c && c <= 0x303f) // CJK_SYMBOLS_AND_PUNCTUATION
- || (0x2e80 <= c && c <= 0x2eff) // CJK_RADICALS_SUPPLEMENT
- || (0x3300 <= c && c <= 0x33ff) // CJK_COMPATIBILITY
- || (0xfe30 <= c && c <= 0xfe4f) // CJK_COMPATIBILITY_FORMS
- || (0xf900 <= c && c <= 0xfaff); // CJK_COMPATIBILITY_IDEOGRAPHS
+static bool is_CJ(UChar32 c) {
+ return (uscript_hasScript(c, USCRIPT_HAN) ||
+ uscript_hasScript(c, USCRIPT_HIRAGANA) ||
+ uscript_hasScript(c, USCRIPT_KATAKANA));
+}
+
+static bool initIndexForLocale(const char *locale) {
+ if (locale == NULL) {
+ return false;
+ }
+
+ if (gIndex != NULL && gIndex->isLocale(locale)) {
+ return true;
+ }
+
+ UErrorCode status = U_ZERO_ERROR;
+ sp<AlphabeticIndexRef> newIndex(new AlphabeticIndexRef(locale, status));
+ if (newIndex == NULL || U_FAILURE(status)) {
+ return false;
+ }
+ // Always create labels for Latin characters if not present in native set
+ (*newIndex)->addLabels("en", status);
+ if (U_FAILURE(status)) {
+ return false;
+ }
+ if ((*newIndex)->getBucketCount(status) <= 0 || U_FAILURE(status)) {
+ return false;
+ }
+
+ gIndex = newIndex;
+ return true;
}
-int32_t GetPhonebookIndex(UCharIterator * iter, const char * locale, UChar * out, int32_t size,
- UBool * isError)
+int32_t GetPhonebookIndex(UCharIterator *iter, const char *locale,
+ UChar *out, int32_t size, UBool *isError)
{
- if (size < MIN_OUTPUT_SIZE) {
- *isError = TRUE;
- return 0;
- }
-
- *isError = FALSE;
-
- // Normalize the first character to remove accents using the NFD normalization
- UErrorCode errorCode = U_ZERO_ERROR;
- int32_t len = unorm_next(iter, out, size, UNORM_NFD,
- 0 /* options */, TRUE /* normalize */, NULL, &errorCode);
- if (U_FAILURE(errorCode)) {
- *isError = TRUE;
- return 0;
- }
-
- if (len == 0) { // Empty input string
- return 0;
- }
-
- UChar c = out[0];
-
- if (!u_isalpha(c)) {
- // Digits go into a # section. Everything else goes into the empty section
- // The unicode function u_isdigit would also identify other characters as digits (arabic),
- // but if we caught them here we'd risk having the same section before and after alpha-letters
- // which might break the assumption that each section exists only once
- if (c >= '0' && c <= '9') {
- out[0] = '#';
- return 1;
- }
- return 0;
- }
-
- c = u_toupper(c);
-
- // Check for explicitly mapped characters
- UChar c_mapped = map_character(c, DEFAULT_CHAR_MAP, sizeof(DEFAULT_CHAR_MAP) / sizeof(UChar));
- if (c_mapped != 0) {
- out[0] = c_mapped;
- return 1;
- }
-
- // Convert Kanas to Hiragana
- UChar next = len > 2 ? out[1] : 0;
- c = android::GetNormalizedCodePoint(c, next, NULL);
-
- // Traditional grouping of Hiragana characters
- if (0x3041 <= c && c <= 0x309F) {
- if (c < 0x304B) c = 0x3042; // a
- else if (c < 0x3055) c = 0x304B; // ka
- else if (c < 0x305F) c = 0x3055; // sa
- else if (c < 0x306A) c = 0x305F; // ta
- else if (c < 0x306F) c = 0x306A; // na
- else if (c < 0x307E) c = 0x306F; // ha
- else if (c < 0x3083) c = 0x307E; // ma
- else if (c < 0x3089) c = 0x3084; // ya
- else if (c < 0x308E) c = 0x3089; // ra
- else if (c < 0x3094) c = 0x308F; // wa
- else return 0; // Others are not readable
- out[0] = c;
- return 1;
- } else if (0x30A0 <= c && c <= 0x30FF) {
- // Dot, onbiki, iteration marks are not readable
- return 0;
- }
-
- if (is_CJK(c)) {
- if (strncmp(locale, "ja", 2) == 0) {
- // Japanese word meaning "misc" or "other"
- out[0] = 0x4ED6;
- return 1;
- } else {
- return 0;
+ if (size < MIN_OUTPUT_SIZE) {
+ *isError = TRUE;
+ return 0;
+ }
+
+ *isError = FALSE;
+ out[0] = '\0';
+ iter->move(iter, 0, UITER_ZERO);
+ if (!iter->hasNext(iter)) { // Empty input string
+ return 0;
+ }
+ UnicodeString ustr;
+ bool prefixIsNonNumeric = false;
+ bool prefixIsNumeric = false;
+ while (iter->hasNext(iter)) {
+ UChar32 ch = uiter_next32(iter);
+ // Ignore standard phone number separators and identify any string
+ // that otherwise starts with a number.
+ if (!prefixIsNumeric && !prefixIsNonNumeric) {
+ if (u_isdigit(ch)) {
+ prefixIsNumeric = true;
+ } else if (!u_isspace(ch) && ch != '+' && ch != '(' &&
+ ch != ')' && ch != '.' && ch != '-' && ch != '#') {
+ prefixIsNonNumeric = true;
+ }
+ }
+ ustr.append(ch);
+ }
+ if (prefixIsNumeric) {
+ out[0] = '#';
+ return 1;
+ }
+
+ Mutex::Autolock autolock(gIndexMutex);
+ if (!initIndexForLocale(locale)) {
+ *isError = TRUE;
+ return 0;
+ }
+
+ UErrorCode status = U_ZERO_ERROR;
+ int32_t bucketIndex = (*gIndex)->getBucketIndex(ustr, status);
+ if (U_FAILURE(status)) {
+ *isError = TRUE;
+ return 0;
+ }
+
+ int32_t len = gIndex->getLabel(bucketIndex, out, size);
+ if (len < 0) {
+ *isError = TRUE;
+ return 0;
+ }
+
+ // For Japanese, when no label was produced and the string starts with a Han,
+ // Hiragana or Katakana character, use the Japanese word meaning "misc" or "other"
+ if (gIndex->isJapanese() && len == 0 && is_CJ(ustr.char32At(0))) {
+ out[0] = 0x4ED6;
+ len = 1;
}
- }
- out[0] = c;
- return 1;
+ return len;
}
} // namespace android