summaryrefslogtreecommitdiff
path: root/tools/localedata
diff options
context:
space:
mode:
authorRoozbeh Pournader <roozbeh@google.com>2016-01-15 11:23:42 -0800
committerRoozbeh Pournader <roozbeh@google.com>2016-01-21 13:47:22 -0800
commitb927c559e1ef8530b08712507f320502627db298 (patch)
treef0d2fd051cb4486239e6f6187e0da53e416fdaeb /tools/localedata
parentac3e599069e1b87ea190f008aef60a506c8561c7 (diff)
Implement smarter locale resource selection
* Add support for determining script from language and region. * Add support for determining special parents of locales. * Add support for smart comparison of locales with only a difference in region, using the locale parentage tree. * Fix LocaleData.matchScore() to not fallback to old locale matching behavior if we can't determine a script. * Allow four-character variant codes. (Previously, only five- to eight-character variant codes were allowed.) Bug: 7296673 Bug: 26589793 Change-Id: Ibde0a48c0564ff383b41068095a5cbacfe7b94bc
Diffstat (limited to 'tools/localedata')
-rwxr-xr-xtools/localedata/extract_icu_data.py286
1 files changed, 286 insertions, 0 deletions
diff --git a/tools/localedata/extract_icu_data.py b/tools/localedata/extract_icu_data.py
new file mode 100755
index 000000000000..b071093a5615
--- /dev/null
+++ b/tools/localedata/extract_icu_data.py
@@ -0,0 +1,286 @@
+#!/usr/bin/env python
+#
+# Copyright 2016 The Android Open Source Project. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+"""Generate a C++ data table containing locale data."""
+
+import collections
+import glob
+import os.path
+import sys
+
+
+def get_locale_parts(locale):
+ """Split a locale into three parts, for langauge, script, and region."""
+ parts = locale.split('_')
+ if len(parts) == 1:
+ return (parts[0], None, None)
+ elif len(parts) == 2:
+ if len(parts[1]) == 4: # parts[1] is a script
+ return (parts[0], parts[1], None)
+ else:
+ return (parts[0], None, parts[1])
+ else:
+ assert len(parts) == 3
+ return tuple(parts)
+
+
+def read_likely_subtags(input_file_name):
+ """Read and parse ICU's likelySubtags.txt."""
+ with open(input_file_name) as input_file:
+ likely_script_dict = {
+ # Android's additions for pseudo-locales. These internal codes make
+ # sure that the pseudo-locales would not match other English or
+ # Arabic locales. (We can't use private-use ISO 15924 codes, since
+ # they may be used by apps for other purposes.)
+ "en_XA": "~~~A",
+ "ar_XB": "~~~B",
+ }
+ representative_locales = {
+ # Android's additions
+ "en_Latn_GB", # representative for en_Latn_001
+ "es_Latn_MX", # representative for es_Latn_419
+ "es_Latn_US", # representative for es_Latn_419 (not the best idea,
+ # but Android has been shipping with it for quite a
+ # while. Fortunately, MX < US, so if both exist, MX
+ # would be chosen.)
+ }
+ for line in input_file:
+ line = unicode(line, 'UTF-8').strip(u' \n\uFEFF').encode('UTF-8')
+ if line.startswith('//'):
+ continue
+ if '{' in line and '}' in line:
+ from_locale = line[:line.index('{')]
+ to_locale = line[line.index('"')+1:line.rindex('"')]
+ from_lang, from_scr, from_region = get_locale_parts(from_locale)
+ _, to_scr, to_region = get_locale_parts(to_locale)
+ if from_lang == 'und':
+ continue # not very useful for our purposes
+ if from_region is None and to_region != '001':
+ representative_locales.add(to_locale)
+ if from_scr is None:
+ likely_script_dict[from_locale] = to_scr
+ return likely_script_dict, frozenset(representative_locales)
+
+
+# From packLanguageOrRegion() in ResourceTypes.cpp
+def pack_language_or_region(inp, base):
+ """Pack langauge or region in a two-byte tuple."""
+ if inp is None:
+ return (0, 0)
+ elif len(inp) == 2:
+ return ord(inp[0]), ord(inp[1])
+ else:
+ assert len(inp) == 3
+ base = ord(base)
+ first = ord(inp[0]) - base
+ second = ord(inp[1]) - base
+ third = ord(inp[2]) - base
+
+ return (0x80 | (third << 2) | (second >>3),
+ ((second << 5) | first) & 0xFF)
+
+
+# From packLanguage() in ResourceTypes.cpp
+def pack_language(language):
+ """Pack language in a two-byte tuple."""
+ return pack_language_or_region(language, 'a')
+
+
+# From packRegion() in ResourceTypes.cpp
+def pack_region(region):
+ """Pack region in a two-byte tuple."""
+ return pack_language_or_region(region, '0')
+
+
+def pack_to_uint32(locale):
+ """Pack language+region of locale into a 32-bit unsigned integer."""
+ lang, _, region = get_locale_parts(locale)
+ plang = pack_language(lang)
+ pregion = pack_region(region)
+ return (plang[0] << 24) | (plang[1] << 16) | (pregion[0] << 8) | pregion[1]
+
+
+def dump_script_codes(all_scripts):
+ """Dump the SCRIPT_CODES table."""
+ print 'const char SCRIPT_CODES[][4] = {'
+ for index, script in enumerate(all_scripts):
+ print " /* %-2d */ {'%c', '%c', '%c', '%c'}," % (
+ index, script[0], script[1], script[2], script[3])
+ print '};'
+ print
+
+
+def dump_script_data(likely_script_dict, all_scripts):
+ """Dump the script data."""
+ print
+ print 'const std::unordered_map<uint32_t, uint8_t> LIKELY_SCRIPTS({'
+ for locale in sorted(likely_script_dict.keys()):
+ script = likely_script_dict[locale]
+ print ' {0x%08Xu, %2du}, // %s -> %s' % (
+ pack_to_uint32(locale),
+ all_scripts.index(script),
+ locale.replace('_', '-'),
+ script)
+ print '});'
+
+
+def pack_to_uint64(locale):
+ """Pack a full locale into a 64-bit unsigned integer."""
+ _, script, _ = get_locale_parts(locale)
+ return ((pack_to_uint32(locale) << 32) |
+ (ord(script[0]) << 24) |
+ (ord(script[1]) << 16) |
+ (ord(script[2]) << 8) |
+ ord(script[3]))
+
+
+def dump_representative_locales(representative_locales):
+ """Dump the set of representative locales."""
+ print
+ print 'std::unordered_set<uint64_t> REPRESENTATIVE_LOCALES({'
+ for locale in sorted(representative_locales):
+ print ' 0x%08Xllu, // %s' % (
+ pack_to_uint64(locale),
+ locale)
+ print '});'
+
+
+def read_and_dump_likely_data(icu_data_dir):
+ """Read and dump the likely-script data."""
+ likely_subtags_txt = os.path.join(icu_data_dir, 'misc', 'likelySubtags.txt')
+ likely_script_dict, representative_locales = read_likely_subtags(
+ likely_subtags_txt)
+
+ all_scripts = list(set(likely_script_dict.values()))
+ assert len(all_scripts) <= 256
+ all_scripts.sort()
+
+ dump_script_codes(all_scripts)
+ dump_script_data(likely_script_dict, all_scripts)
+ dump_representative_locales(representative_locales)
+ return likely_script_dict
+
+
+def read_parent_data(icu_data_dir):
+ """Read locale parent data from ICU data files."""
+ all_icu_data_files = glob.glob(os.path.join(icu_data_dir, '*', '*.txt'))
+ parent_dict = {}
+ for data_file in all_icu_data_files:
+ locale = os.path.splitext(os.path.basename(data_file))[0]
+ with open(data_file) as input_file:
+ for line in input_file:
+ if '%%Parent' in line:
+ parent = line[line.index('"')+1:line.rindex('"')]
+ if locale in parent_dict:
+ # Different files shouldn't have different parent info
+ assert parent_dict[locale] == parent
+ else:
+ parent_dict[locale] = parent
+ elif locale.startswith('ar_') and 'default{"latn"}' in line:
+ # Arabic parent overrides for ASCII digits. Since
+ # Unicode extensions are not supported in ResourceTypes,
+ # we will use ar-015 (Arabic, Northern Africa) instead
+ # of the more correct ar-u-nu-latn.
+ parent_dict[locale] = 'ar_015'
+ return parent_dict
+
+
+def get_likely_script(locale, likely_script_dict):
+ """Find the likely script for a locale, given the likely-script dictionary.
+ """
+ if locale.count('_') == 2:
+ # it already has a script
+ return locale.split('_')[1]
+ elif locale in likely_script_dict:
+ return likely_script_dict[locale]
+ else:
+ language = locale.split('_')[0]
+ return likely_script_dict[language]
+
+
+def dump_parent_data(script_organized_dict):
+ """Dump information for parents of locales."""
+ sorted_scripts = sorted(script_organized_dict.keys())
+ print
+ for script in sorted_scripts:
+ parent_dict = script_organized_dict[script]
+ print ('const std::unordered_map<uint32_t, uint32_t> %s_PARENTS({'
+ % script.upper())
+ for locale in sorted(parent_dict.keys()):
+ parent = parent_dict[locale]
+ print ' {0x%08Xu, 0x%08Xu}, // %s -> %s' % (
+ pack_to_uint32(locale),
+ pack_to_uint32(parent),
+ locale.replace('_', '-'),
+ parent.replace('_', '-'))
+ print '});'
+ print
+
+ print 'const struct {'
+ print ' const char script[4];'
+ print ' const std::unordered_map<uint32_t, uint32_t>* map;'
+ print '} SCRIPT_PARENTS[] = {'
+ for script in sorted_scripts:
+ print " {{'%c', '%c', '%c', '%c'}, &%s_PARENTS}," % (
+ script[0], script[1], script[2], script[3],
+ script.upper())
+ print '};'
+
+
+def dump_parent_tree_depth(parent_dict):
+ """Find and dump the depth of the parent tree."""
+ max_depth = 1
+ for locale, _ in parent_dict.items():
+ depth = 1
+ while locale in parent_dict:
+ locale = parent_dict[locale]
+ depth += 1
+ max_depth = max(max_depth, depth)
+ assert max_depth < 5 # Our algorithms assume small max_depth
+ print
+ print 'const size_t MAX_PARENT_DEPTH = %d;' % max_depth
+
+
+def read_and_dump_parent_data(icu_data_dir, likely_script_dict):
+ """Read parent data from ICU and dump it."""
+ parent_dict = read_parent_data(icu_data_dir)
+ script_organized_dict = collections.defaultdict(dict)
+ for locale in parent_dict:
+ parent = parent_dict[locale]
+ if parent == 'root':
+ continue
+ script = get_likely_script(locale, likely_script_dict)
+ script_organized_dict[script][locale] = parent_dict[locale]
+ dump_parent_data(script_organized_dict)
+ dump_parent_tree_depth(parent_dict)
+
+
+def main():
+ """Read the data files from ICU and dump the output to a C++ file."""
+ source_root = sys.argv[1]
+ icu_data_dir = os.path.join(
+ source_root,
+ 'external', 'icu', 'icu4c', 'source', 'data')
+
+ print '// Auto-generated by %s' % sys.argv[0]
+ print
+ likely_script_dict = read_and_dump_likely_data(icu_data_dir)
+ read_and_dump_parent_data(icu_data_dir, likely_script_dict)
+
+
+if __name__ == '__main__':
+ main()