[WIP] locale python scripts for cldr updates

Message ID	20160413235624.GV6588@vapier.lan
State	RFC, archived
Delegated to:	Mike Frysinger
Headers	Mailing-List: contact libc-alpha-help@sourceware.org; run by ezmlm Precedence: bulk Sender: libc-alpha-owner@sourceware.org Date: Wed, 13 Apr 2016 19:56:24 -0400 From: Mike Frysinger <vapier@gentoo.org> To: libc-alpha@sourceware.org Subject: [WIP] locale python scripts for cldr updates Message-ID: <20160413235624.GV6588@vapier.lan> Mail-Followup-To: libc-alpha@sourceware.org MIME-Version: 1.0 Content-Type: multipart/signed; micalg=pgp-sha256; protocol="application/pgp-signature"; boundary="gAzlVcfGMNCdruOw" Content-Disposition: inline

# --- /dev/null +++ locales.py @@ -0,0 +1,568 @@
# -*- coding: utf-8 -*-
# Written by Mike Frysinger <vapier@gentoo.org> for much great glory.
#
# Copyright (C) 2016 Free Software Foundation, Inc.
# This file is part of the GNU C Library.
#
# The GNU C Library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# The GNU C Library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with the GNU C Library; if not, see
# <http://www.gnu.org/licenses/>.

"""Helper library for working with locale datafiles."""

from __future__ import print_function

import collections
import os
import re
import sys


# Whether we should clean up newlines/comments.
REWRITE_STYLE = False

# Comment block that should be at the top of all files.
FILE_HEADER = """\
% This file is part of the GNU C Library and contains locale data.
% The Free Software Foundation does not claim any copyright interest
% in the locale data contained in this file. The foregoing does not
% affect the license of the GNU C Library as a whole. It does not
% exempt you from the conditions of the license if your use would
% otherwise be governed by that license.

"""

# The order of content in the data files.
CATEGORY_ORDER = (
    'LC_IDENTIFICATION',
    'LC_CTYPE',
    'LC_COLLATE',
    'LC_MONETARY',
    'LC_NUMERIC',
    'LC_TIME',
    'LC_MESSAGES',
    'LC_PAPER',
    'LC_NAME',
    'LC_ADDRESS',
    'LC_TELEPHONE',
    'LC_MEASUREMENT',
)


def u_encode(text):
    """Convert unicode |text| to <U####> format."""
    return ''.join('<U%04X>' % ord(x) for x in text)


_U_MATCH = re.compile(r'<U([0-9A-Fa-f]+)>')


def u_decode(text):
    """Convert <U####> format in |text|."""
    return _U_MATCH.sub(lambda m: chr(int(m.group(1), 16)), text)


def dequote(text):
    """Remove leading/trailing quotes.

    If |text| starts with a double quote, the first and last characters are
    dropped; otherwise |text| is returned unchanged.  An empty string is
    returned as-is instead of raising IndexError.
    """
    if text[:1] == '"':
        return text[1:-1]
    return text


class LocaleError(Exception):
    """Error w/Locale objects"""


class LocaleName(object):
    """Locale name object."""

    # We support the POSIX format: <lang>_<territory>@<alt>
    _POSIX_RE = re.compile(
        r'(?P<lang>[^_]*)'
        r'(_(?P<territory>[^@]*)'
        r'(@(?P<alt>.*))?)?')

    # Script mappings between POSIX & CLDR.
    _SCRIPT_MAP_PC = {
        'cyrillic': 'Cyrl',
        'latin': 'Latn',
    }
    # Handle languages that default to a specific script.
    _SCRIPT_MAP_LANG_PC = {
        # XXX: Is this correct for zh_HK ?  What about Hant ?
        # What about the cmn language (e.g. cmn_TW) ?
        'zh': 'Hans',
    }
    #_SCRIPT_MAP_CP = dict((v, k) for k, v in _SCRIPT_MAP_PC)

    def __init__(self, name):
        """A new locale name in POSIX format."""
        self.name = name
        m = self._POSIX_RE.match(name)
        self.lang = m.group('lang')
        self.territory = m.group('territory')
        self.alt = m.group('alt')

    def __str__(self):
        return self.posix

    @property
    def posix(self):
        """Name of locale as POSIX uses it."""
        ret = self.lang
        if self.territory:
            ret += '_' + self.territory
        if self.alt:
            ret += '@' + self.alt
        return ret

    @property
    def cldr_lang(self):
        """Name of language as CLDR uses it."""
        ret = self.lang

        # An explicit @modifier wins over the per-language default script.
        script = self._SCRIPT_MAP_PC.get(self.alt)
        if not script:
            script = self._SCRIPT_MAP_LANG_PC.get(self.lang)
        if script:
            ret += '_' + script

        return ret

    @property
    def cldr(self):
        """Name of locale as CLDR uses it."""
        # First deal with ugly variants.
        if (self.lang, self.territory, self.alt) == ('ca', 'ES', 'valencia'):
            return 'ca_ES_VALENCIA'

        ret = self.cldr_lang
        if self.territory:
            ret += '_' + self.territory
        return ret


class LocaleCategory(object):
    """Content for a single locale category.

    Args:
      name: Category name (e.g. 'LC_PAPER'); stored lowercased.
      content: Sequence of the (already joined) lines inside the category.
      header: Sequence of comment/blank lines that preceded the category.
      comment_char: Comment character used to strip trailing comments from
          values; None/empty disables stripping.
      copies: Mapping of locale name -> loaded Locale, used to resolve
          `copy` directives.  Missing or non-Locale entries are ignored.
    """

    FIELDS = ()

    def __init__(self, name='', content=(), header=(), comment_char=None,
                 copies=None):
        self.name = name.lower()
        self.content = content
        self.header = header
        self.fields = dict((k, None) for k in self.FIELDS)

        self._merge_content(content, comment_char,
                            {} if copies is None else copies)

    @staticmethod
    def _clean_value(value, comment_char):
        """Strip a trailing comment, the quotes and <U####> escapes."""
        if comment_char:
            # NOTE(review): this cuts at the last comment_char even when it
            # sits inside the quoted value; the data files presumably spell
            # such characters as <U####> -- confirm.
            value = value.rsplit(comment_char, 1)[0].rstrip()
        return u_decode(dequote(value))

    def _merge_content(self, content, comment_char, copies):
        """Fill self.fields from |content|, following `copy` directives."""
        for line in content:
            parts = line.split(None, 1)
            if len(parts) != 2:
                continue
            k, v = parts
            if k in self.FIELDS:
                self.fields[k] = self._clean_value(v, comment_char)
            if k == 'copy':
                # The source may be unknown (never loaded) or still flagged
                # as 'loading'; only merge from fully constructed Locales.
                copy = copies.get(self._clean_value(v, comment_char))
                if (isinstance(copy, Locale) and
                        getattr(self, 'NAME', None) in copy.categories):
                    self._merge_content(getattr(copy, self.name).content,
                                        copy.comment_char, copies)

    def __str__(self):
        padding = '\n' if REWRITE_STYLE else ''
        ret = ''
        if self.header:
            ret += padding + '\n'.join(self.header) + '\n'
        lc_name = self.name.upper()
        # |content| may be a tuple (the default), so normalize to a list.
        body = [lc_name] + list(self.content) + ['END %s' % lc_name]
        ret += padding + '\n'.join(body) + '\n'
        return ret


class LCIdentification(LocaleCategory):
    """LC_IDENTIFICATION object."""

    NAME = 'LC_IDENTIFICATION'
    FIELDS = (
        'title',
        'source',
        'address',
        'contact',
        'email',
        'tel',
        'fax',
        'language',
        'territory',
        'audience',
        'application',
        'abbreviation',
        'revision',
        'date',
        'category',
    )


class LCCtype(LocaleCategory):
    """LC_CTYPE object."""

    NAME = 'LC_CTYPE'
    FIELDS = (
    )


class LCCollate(LocaleCategory):
    """LC_COLLATE object."""

    NAME = 'LC_COLLATE'
    FIELDS = (
    )


class LCMonetary(LocaleCategory):
    """LC_MONETARY object."""

    NAME = 'LC_MONETARY'
    FIELDS = (
        'int_curr_symbol',
        'currency_symbol',
        'mon_decimal_point',
        'mon_thousands_sep',
        'mon_grouping',
        'positive_sign',
        'negative_sign',
        'int_frac_digits',
        'frac_digits',
        'p_cs_precedes',
        'p_sep_by_space',
        'n_cs_precedes',
        'n_sep_by_space',
        'p_sign_posn',
        'n_sign_posn',
        'int_p_cs_precedes',
        'int_n_cs_precedes',
        'int_p_sep_by_space',
        'int_n_sep_by_space',
        'int_p_sign_posn',
        'int_n_sign_posn',
    )


class LCNumeric(LocaleCategory):
    """LC_NUMERIC object."""

    NAME = 'LC_NUMERIC'
    FIELDS = (
        'decimal_point',
        'thousands_sep',
        'grouping',
    )


class LCTime(LocaleCategory):
    """LC_TIME object."""

    NAME = 'LC_TIME'
    FIELDS = (
        'abday',
        'day',
        'abmon',
        'mon',
        'am_pm',
        'd_t_fmt',
        'd_fmt',
        't_fmt',
        't_fmt_ampm',
        'era',
        'era_year',
        'era_d_fmt',
        'alt_digits',
        'era_d_t_fmt',
        'era_t_fmt',
        'week',
        'first_weekday',
        'first_workday',
        'cal_direction',
        'date_fmt',
    )


class LCMessages(LocaleCategory):
    """LC_MESSAGES object."""

    NAME = 'LC_MESSAGES'
    FIELDS = (
        'yesexpr',
        'noexpr',
        'yesstr',
        'nostr',
    )


class LCPaper(LocaleCategory):
    """LC_PAPER object."""

    NAME = 'LC_PAPER'
    FIELDS = (
        'height',
        'width',
    )


class LCName(LocaleCategory):
    """LC_NAME object."""

    NAME = 'LC_NAME'
    FIELDS = (
        'name_fmt',
        'name_gen',
        'name_mr',
        'name_mrs',
        'name_miss',
        'name_ms',
    )


class LCAddress(LocaleCategory):
    """LC_ADDRESS object."""

    NAME = 'LC_ADDRESS'
    FIELDS = (
        'postal_fmt',
        'country_name',
        'country_post',
        'country_ab2',
        'country_ab3',
        'country_car',
        'country_num',
        'country_isbn',
        'lang_name',
        'lang_ab',
        'lang_term',
        'lang_lib',
    )


class LCTelephone(LocaleCategory):
    """LC_TELEPHONE object."""

    NAME = 'LC_TELEPHONE'
    FIELDS = (
        'tel_int_fmt',
        'tel_dom_fmt',
        'int_select',
        'int_prefix',
    )


class LCMeasurement(LocaleCategory):
    """LC_MEASUREMENT object."""

    NAME = 'LC_MEASUREMENT'
    FIELDS = (
        'measurement',
    )


class Locale(object):
    """Content for a locale file itself.

    Args:
      name: Locale name in POSIX format.  When omitted, the basename of
          |path| is used (or the empty string if there is no path either).
      path: Optional path of the locale file; when given it is read
          immediately, and `copy` sources are looked up next to it.
    """

    # Locale name -> Locale object (or the string 'loading' while a copy
    # source is being constructed).  Shared by every instance.
    _COPY_CACHE = {}

    def __init__(self, name=None, path=None):
        if name is None:
            name = os.path.basename(path) if path is not None else ''
        self.name = name
        self.path = path
        self.locale = LocaleName(name)
        self.header = []  #FILE_HEADER.splitlines()
        for cat in CATEGORY_ORDER:
            setattr(self, cat.lower(), None)
        self.categories = []
        self.cldr = None
        self.escape_char = '\\'
        self.comment_char = '#'

        if path is not None:
            self.read(path)

    @staticmethod
    def _trim_extra_lines(lines, leading=True, trailing=True,
                          consecutive=True, comments=False):
        """Helper to clean up the style of the data files.

        Modifies |lines| in place (and returns it).  Does nothing unless
        REWRITE_STYLE is enabled.
        """
        if not REWRITE_STYLE:
            return lines

        # Clear leading blank lines.
        if leading:
            while lines and not lines[0]:
                lines.pop(0)

        # Clear trailing blank lines.
        if trailing:
            while lines and not lines[-1]:
                lines.pop(-1)

        # Clear consecutive blank lines.
        if consecutive:
            i = 0
            while i < len(lines) - 1:
                if not lines[i] and not lines[i + 1]:
                    lines.pop(i)
                else:
                    i += 1

        # Trim blank comment lines that start/end a section.
        if comments:
            i = 0
            while i < len(lines):
                if (lines[i] == '%' and
                        (i == 0 or not lines[i - 1] or
                         lines[i - 1][0] != '%')):
                    lines.pop(i)
                elif (lines[i] == '%' and
                      (i == len(lines) - 1 or not lines[i + 1] or
                       lines[i + 1][0] != '%')):
                    lines.pop(i)
                else:
                    i += 1

        return lines

    def _check_preamble(self, lines):
        """Warn when the first two lines are not the expected directives."""
        expected = (
            ('first', 'comment_char %'),
            ('second', 'escape_char /'),
        )
        for idx, (ordinal, want) in enumerate(expected):
            # Short/empty files are reported rather than raising IndexError.
            got = lines[idx] if idx < len(lines) else ''
            if got != want:
                print('%s: warning: %s line should be: "%s", not "%s"' %
                      (self.locale, ordinal, want, got))

    def _read_header(self, lines):
        """Pop and return the comment/directive lines before a category.

        |lines| is a deque; consumed lines are removed from its left end.

        Raises:
          LocaleError: comment_char/escape_char is not the expected value.
        """
        header = []
        while lines:
            line = lines[0]
            if line.startswith('LC_'):
                break
            elif not line or line[0] == self.comment_char:
                pass
            elif line.startswith('comment_char'):
                parts = line.split()
                self.comment_char = parts[1] if len(parts) > 1 else ''
                if self.comment_char != '%':
                    raise LocaleError('%s: bad comment_char: %s' %
                                      (self.locale, line))
            elif line.startswith('escape_char'):
                parts = line.split()
                self.escape_char = parts[1] if len(parts) > 1 else ''
                if self.escape_char != '/':
                    raise LocaleError('%s: bad escape_char: %s' %
                                      (self.locale, line))
            else:
                # Unknown content; the caller reports it.
                break
            header.append(line)
            lines.popleft()
        self._trim_extra_lines(header)
        return header

    def _read_category(self, lines, cat):
        """Pop and return the lines of category |cat| up to its END line.

        Lines ending in the escape char are joined with the next line.
        `copy` directives trigger loading of the referenced locale.
        """
        cat_lines = []
        full_line = ''
        while lines:
            # Accumulate multilines.
            line = lines.popleft()
            if line.endswith(self.escape_char):
                full_line += line[:-1]
                continue
            elif full_line:
                line = full_line + line.lstrip()
                full_line = ''

            # Halt when we get to the end of this category.
            if line.split()[0:2] == ['END', cat]:
                break
            cat_lines.append(line)

            # Deal with loading other locales.
            if line.startswith('copy '):
                self._load_copy(u_decode(dequote(line.split()[1])))

        self._trim_extra_lines(cat_lines)
        return cat_lines

    def readfp(self, fp):
        """Load the locale content from |fp|

        Raises:
          LocaleError: the file has bad directives, stray content between
              categories, or an unknown category name.
        """
        # Key by the plain name so `copy` lookups (which use strings) can
        # find this object.
        Locale._COPY_CACHE[self.name] = self

        lines = [x.rstrip() for x in fp.readlines()]
        self._trim_extra_lines(lines)

        # Process the leading few lines.
        self._check_preamble(lines)

        # A deque keeps popping from the front cheap on large files.
        pending = collections.deque(lines)

        # Now walk each locale category.
        while pending:
            # Extract any leading comments.
            header = self._read_header(pending)

            if not pending:
                if header:
                    print('%s: throwing away trailing lines: %r' %
                          (self.name, header), file=sys.stderr)
                return

            line = pending.popleft()
            if line[0:3] != 'LC_':
                raise LocaleError('%s: bad line state: %s' % (self.name, line))

            cat = line.split()[0]
            if cat not in CATEGORY_ORDER:
                raise LocaleError('%s: unknown category: %s' %
                                  (self.name, cat))

            cat_lines = self._read_category(pending, cat)

            # LC_PAPER -> LCPaper, etc...
            lc_cls = globals()['LC%s%s' % (cat[3], cat[4:].lower())]
            lc = lc_cls(name=cat, content=cat_lines, header=header,
                        comment_char=self.comment_char,
                        copies=self._COPY_CACHE)
            setattr(self, cat.lower(), lc)
            self.categories.append(cat)

    def read(self, path):
        """Load the locale file from |path|"""
        with open(path) as fp:
            self.readfp(fp)

    def _load_copy(self, copy):
        """Load the locale named by |copy|

        Does nothing when this locale has no path (nowhere to look) or when
        |copy| is already cached/being loaded.
        """
        if not self.path:
            return
        if copy in Locale._COPY_CACHE:
            return
        # Flag it as in progress to avoid loops.
        path = os.path.join(os.path.dirname(self.path), copy)
        Locale._COPY_CACHE[copy] = 'loading'
        Locale._COPY_CACHE[copy] = Locale(name=copy, path=path)

    def writefp(self, fp):
        """Write the locale content to |fp|"""
        if REWRITE_STYLE:
            header = ['comment_char %', 'escape_char /']
        else:
            header = self.header
        if header:
            fp.write('\n'.join(header) + '\n')

        for category in self.categories:
            fp.write(str(getattr(self, category.lower())))

    def write(self, path):
        """Write the locale content to |path|"""
        with open(path, 'w') as fp:
            self.writefp(fp)


# The remainder of this line in the patch is the start of the next file:
# --- /dev/null +++ locale_lint.py @@ -0,0 +1,446 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# Written by Mike Frysinger <vapier@gentoo.org> for much great glory.
#
# Copyright (C) 2016 Free Software Foundation, Inc.
# This file is part of the GNU C Library.
#
# The GNU C Library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# The GNU C Library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with the GNU C Library; if not, see
# <http://www.gnu.org/licenses/>.

"""Linting tool for locale datafiles."""

# TODO: Validate set of locale data files and SUPPORTED file.

from __future__ import print_function

import argparse
import os
import re
import subprocess
import sys

import locales


def get_parser():
    """Return an argument parser for this module."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('locales', nargs='*', help='Locales to lint')
    return parser


class Check(object):
    """Check class for locale problems.

    Subclasses implement run(); every failed assertion prints an error and
    sets self.failed.
    """

    def __init__(self, locale, lc):
        self.locale = locale
        self.lc = lc
        self.failed = False

    def fail(self, key, msg):
        """Set state to failed and show |msg| for |key|."""
        self.failed = True
        print('ERROR: %s: %s.%s: %s' %
              (self.locale.name, self.lc.NAME, key, msg))

    def assertTrue(self, key, value, msg):
        """Verify |value| is a boolean True value."""
        if not value:
            self.fail(key, msg)

    def assertIn(self, key, value, exp_set, msg):
        """Verify |value| is in |exp_set|."""
        self.assertTrue(key, value in exp_set, msg)

    def assertEqual(self, key, value, exp_value, msg):
        """Verify |value| is equal to |exp_value|."""
        self.assertTrue(key, value == exp_value, msg)

    def assertNotEqual(self, key, value, exp_value, msg):
        """Verify |value| is not equal to |exp_value|."""
        self.assertTrue(key, value != exp_value, msg)

    def assertDefined(self, key, value):
        """Verify |value| is set to something (allows empty string)."""
        if value is None:
            self.fail(key, 'missing definition')

    def assertNonEmpty(self, key, value):
        """Verify |value| is set to a non-empty value."""
        if not value:
            self.fail(key, 'missing value')

    def assertEmpty(self, key, value):
        """Verify |value| is set to an empty value."""
        if value:
            self.fail(key, 'value should be left empty')

    def assertFormat(self, key, value, exp_formats, msg):
        """Verify each %-directive in |value| uses a char in |exp_formats|.

        The whole string is scanned (not just its start).  The character
        right after each '%' is consumed together with it, so '%%' counts as
        one directive whose character is '%'.  A trailing lone '%' fails.
        Reports at most one failure per call.
        """
        for directive in re.findall(r'%(.?)', value, flags=re.S):
            # `'' in exp_formats` is always true, so test length explicitly.
            if len(directive) != 1 or directive not in exp_formats:
                self.fail(key, msg)
                return


class CheckLCIdentification(Check):
    """Check LC_IDENTIFICATION object for problems."""

    def run(self):
        lc = self.lc

        self.assertNonEmpty('email', lc.fields['email'])

        self.assertEmpty('tel', lc.fields['tel'])
        self.assertEmpty('fax', lc.fields['fax'])

        # TODO: Check language & territory.
        # TODO: Check category fields are one of:
        # i18n:2002 posix:1993


class CheckLCCtype(Check):
    """Check LC_CTYPE object for problems."""

    def run(self):
        """No checks implemented yet."""


class CheckLCCollate(Check):
    """Check LC_COLLATE object for problems."""

    def run(self):
        """No checks implemented yet."""


class CheckLCMonetary(Check):
    """Check LC_MONETARY object for problems."""

    def run(self):
        lc = self.lc

        k = 'int_curr_symbol'
        v = lc.fields[k]
        self.assertDefined(k, v)
        if v:
            self.assertEqual(k, len(v), 4,
                             'symbol should be 4 characters, not %s' % (v,))
            # TODO: We can validate the value against ISO 4217.
            # Only look at the 4th char when there is one; the length error
            # above already covers shorter values.
            if len(v) >= 4:
                self.assertEqual(
                    k, v[3], ' ',
                    'symbol must end with a space, not %s' % (v[3],))

        for k in ('currency_symbol', 'mon_decimal_point', 'mon_thousands_sep',
                  'positive_sign', 'negative_sign', 'mon_grouping',
                  'int_frac_digits', 'frac_digits'):
            self.assertDefined(k, lc.fields[k])

        # XXX: The value of -1 is permitted for the POSIX locale.

        valid_values = (None, '0', '1')
        for k in ('p_cs_precedes', 'n_cs_precedes', 'int_p_cs_precedes',
                  'int_n_cs_precedes'):
            v = lc.fields[k]
            self.assertIn(k, v, valid_values,
                          'should be 0 or 1, not %s' % (v,))

        valid_values = (None, '0', '1', '2')
        for k in ('p_sep_by_space', 'n_sep_by_space', 'int_p_sep_by_space',
                  'int_n_sep_by_space'):
            v = lc.fields[k]
            self.assertIn(k, v, valid_values,
                          'should be between [0, 2], not %s' % (v,))

        valid_values = (None, '0', '1', '2', '3', '4')
        for k in ('p_sign_posn', 'n_sign_posn', 'int_p_sign_posn',
                  'int_n_sign_posn'):
            v = lc.fields[k]
            self.assertIn(k, v, valid_values,
                          'should be between [0, 4], not %s' % (v,))


class CheckLCNumeric(Check):
    """Check LC_NUMERIC object for problems."""

    def run(self):
        lc = self.lc

        # TODO: grouping: Verify it's a list of positive ints (and -1).
        k = 'grouping'
        self.assertDefined(k, lc.fields[k])

        k = 'decimal_point'
        self.assertNonEmpty(k, lc.fields[k])


class CheckLCTime(Check):
    """Check LC_TIME object for problems."""

    def run(self):
        lc = self.lc

        valid_len = 7
        for k in ('abday', 'day'):
            v = lc.fields[k]
            if v:
                v = v.split(';')
                self.assertEqual(k, len(v), valid_len,
                                 'need %s elements: %s' % (valid_len, v))

        valid_len = 12
        for k in ('abmon', 'mon'):
            v = lc.fields[k]
            if v:
                v = v.split(';')
                self.assertEqual(k, len(v), valid_len,
                                 'need %s elements: %s' % (valid_len, v))

        k = 'date_fmt'
        default_value = '%a %b %e %H:%M:%S %Z %Y'
        v = lc.fields[k]
        self.assertNotEqual(
            k, v, default_value,
            'value (%s) is same as the default; delete it' % (v,))

        # Should we filter out date/time fields rather than allow each one full
        # access to the strftime api?
        valid_values = '-aAbBcCdDeEFgGhHIjklmMnOpPrRsStTuUVwWxXyYzZ'
        for k in ('d_t_fmt', 'd_fmt', 't_fmt'):
            v = lc.fields[k]
            if v:
                self.assertFormat(
                    k, v, valid_values,
                    'only %s formats are accepted, not %s' % (valid_values, v))

        # TODO: am_pm: Verify it has 2 entries.

        k = 'week'
        v = lc.fields[k]
        if v:
            default_week = '7;19971130;4'
            if v == default_week:
                self.fail(k, 'value (%s) is same as the default; delete it' %
                          (default_week,))

                # NOTE(review): nesting reconstructed from a patch whose
                # indentation was lost -- confirm these belong in this branch.
                for sub_k, sub_default in (('first_weekday', '1'),
                                           ('first_workday', '2')):
                    sub_v = lc.fields[sub_k]
                    # Report the field's own value, not the week value.
                    self.assertNotEqual(
                        sub_k, sub_v, sub_default,
                        'value (%s) is same as the default; delete it' %
                        (sub_v,))
            else:
                va = v.split(';')
                if len(va) != 3:
                    self.fail(k, 'value should have 3 fields, not %s' % (v,))
                else:
                    default_start = default_week.split(';')[1]
                    self.assertEqual(
                        k, va[1], default_start,
                        'should be %s, not %s (remember to adjust other '
                        'fields too)' % (default_start, va[1]))

        k = 'first_weekday'
        v = lc.fields[k]
        valid_values = (None, '1', '2')
        self.assertIn(k, v, valid_values,
                      'should be 1 or 2, not %s' % (v,))

        k = 'first_workday'
        v = lc.fields[k]
        valid_values = (None, '1', '2')
        self.assertIn(k, v, valid_values,
                      'should be 1 or 2, not %s' % (v,))

        k = 'cal_direction'
        v = lc.fields[k]
        valid_values = (None, '1', '2', '3')
        self.assertIn(k, v, valid_values,
                      'should be between [1, 3], not %s' % (v,))


class CheckLCMessages(Check):
    """Check LC_MESSAGES object for problems."""

    def run(self):
        lc = self.lc

        for k in ('yesexpr', 'noexpr'):
            v = lc.fields[k]
            if v:
                try:
                    re.compile(v)
                except re.error:
                    self.fail(k, 'invalid regular expression: %s' % (v,))


class CheckLCPaper(Check):
    """Check LC_PAPER object for problems."""

    def run(self):
        lc = self.lc

        paper = (lc.fields['height'], lc.fields['width'])
        valid_values = (
            ('279', '216'),  # US-Letter.
            ('297', '210'),  # A4.
            # XXX: Drop this?  Need to implement copy directives.
            (None, None),  # Not set.
        )
        self.assertIn('(height, width)', paper, valid_values,
                      '%r' % (paper,))


class CheckLCName(Check):
    """Check LC_NAME object for problems."""

    def run(self):
        lc = self.lc

        k = 'name_fmt'
        v = lc.fields[k]
        self.assertNonEmpty(k, v)
        # Same value as ld-name.c.
        valid_values = 'dfFgGlomMpsSt'
        if v:
            self.assertFormat(
                k, v, valid_values,
                'only %s formats are accepted, not %s' % (valid_values, v))

        for k in ('name_gen', 'name_mr', 'name_mrs', 'name_miss', 'name_ms'):
            self.assertDefined(k, lc.fields[k])


class CheckLCAddress(Check):
    """Check LC_ADDRESS object for problems."""

    def run(self):
        lc = self.lc

        k = 'postal_fmt'
        v = lc.fields[k]
        self.assertNonEmpty(k, v)
        # Same value as ld-address.c.
        valid_values = 'afdbshNtreCzTSc%'
        if v:
            self.assertFormat(
                k, v, valid_values,
                'only %s formats are accepted, not %s' % (valid_values, v))

        k = 'country_ab2'
        v = lc.fields[k]
        self.assertDefined(k, v)
        if v:
            self.assertEqual(k, len(v), 2, 'must be 2 letters, not %s' % (v,))

        # XXX: We can validate lang_ab more.
        k = 'lang_ab'
        v = lc.fields[k]
        if len(self.locale.locale.lang) == 2:
            self.assertDefined(k, v)
        # NOTE(review): nesting reconstructed from a patch whose indentation
        # was lost; this assumes the value checks apply to any language.
        if v:
            self.assertEqual(k, len(v), 2, 'must be 2 letters, not %s' % (v,))
            self.assertEqual(k, v, v.lower(),
                             'must be lowercase, not %s' % (v,))

        for k in ('country_ab3', 'lang_term', 'lang_lib'):
            v = lc.fields[k]
            self.assertDefined(k, v)
            if v:
                self.assertEqual(k, len(v), 3,
                                 'must be 3 letters, not %s' % (v,))

        # TODO: We can validate country_post, country_car, country_isbn.
        for k in ('country_name', 'country_post', 'country_car',
                  'country_isbn', 'lang_name'):
            self.assertDefined(k, lc.fields[k])

        # TODO: We can validate this value more.
        k = 'country_num'
        v = lc.fields[k]
        self.assertNonEmpty(k, v)
        if v:
            if isinstance(v, int):
                v = '%03i' % v
            self.assertEqual(k, '', re.sub(r'[0-9]', '', v),
                             'must be 3 numbers, not %s' % (v,))
            self.assertEqual(k, len(v), 3, 'must be 3 numbers, not %s' % (v,))


class CheckLCTelephone(Check):
    """Check LC_TELEPHONE object for problems."""

    def run(self):
        lc = self.lc

        # XXX: ld-telephone.c is more restrictive.
        valid_values = 'aAcCelt'
        for k in ('tel_int_fmt', 'tel_dom_fmt'):
            v = lc.fields[k]
            self.assertNonEmpty(k, v)
            if v:
                self.assertFormat(
                    k, v, valid_values,
                    'only %s formats are accepted, not %s' % (valid_values, v))

        for k in ('int_select', 'int_prefix'):
            self.assertDefined(k, lc.fields[k])


class CheckLCMeasurement(Check):
    """Check LC_MEASUREMENT object for problems."""

    def run(self):
        lc = self.lc

        k = 'measurement'
        v = lc.fields[k]
        valid_values = (
            '1',  # Imperial units.
            '2',  # Metric units.
            # XXX: Drop this?  Need to implement copy directives.
            None,  # Not set.
        )
        self.assertIn(k, v, valid_values, 'should be 1 or 2, not %s' % (v,))


def check(loc):
    """Check locale |loc| object for problems.

    Runs the CheckLC* class matching each category present in |loc|.

    Returns:
      True if every checker passed, False otherwise.
    """
    ret = True
    module = sys.modules[__name__]
    for cat in locales.CATEGORY_ORDER:
        # TODO: We should throw an error if |cat| is missing.
        if cat not in loc.categories:
            continue

        # LC_PAPER -> CheckLCPaper, etc...
        checker_name = 'CheckLC%s%s' % (cat[3], cat[4:].lower())
        checker_cls = getattr(module, checker_name, None)
        if checker_cls is None:
            print('ERROR: %s: %s: no checker named %s' %
                  (loc.name, cat, checker_name))
            ret = False
            continue

        checker = checker_cls(loc, getattr(loc, cat.lower()))
        checker.run()
        if checker.failed:
            ret = False
    return ret


def main(argv):
    """The main entry point.

    Returns:
      0 when every locale loaded and passed its checks, 1 otherwise.
    """
    parser = get_parser()
    opts = parser.parse_args(argv)

    # These are not "real" locales, so skip them.
    SKIP_LOCALES = ()  #'i18n', 'iso14651', 'translit', 'C', 'POSIX')

    # Process all the locales the user told us to.
    ret = 0
    for locale in opts.locales:
        name = os.path.basename(locale)
        if name.split('_', 1)[0] in SKIP_LOCALES:
            continue

        try:
            loc = locales.Locale(name=name, path=locale)
        except UnicodeDecodeError:
            print('%s: bad encodings' % (locale,))
            # Purely informational; don't die if `file` is unavailable.
            try:
                subprocess.call(['file', locale])
            except OSError:
                pass
            # There is no Locale object to check, so move on.
            ret = 1
            continue
        except locales.LocaleError as e:
            print('%s: %s' % (name, e))
            ret = 1
            continue

        if not check(loc):
            #print('%s: please correct issues' % name)
            ret = 1
    return ret


if __name__ == '__main__':
    sys.exit(main(sys.argv[1:]))


# The remainder of this line in the patch is the start of the next file:
# --- /dev/null +++ cldr.py @@ -0,0 +1,1204 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# Written by Mike Frysinger <vapier@gentoo.org> for much great glory.
#
# Copyright (C) 2016 Free Software Foundation, Inc.
# This file is part of the GNU C Library.
#
# The GNU C Library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# The GNU C Library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with the GNU C Library; if not, see
# <http://www.gnu.org/licenses/>.
#
# """Helper tool for importing current CLDR data.
#
# See http://cldr.unicode.org/ for more details."""
#
# TODO: Need to handle copy directives better so we can see when a value
# has changed for a specific locale, but it's copying the (wrong) values
# from others.
# TODO: Add missing fields.
# TODO: Add support for updating locale/iso-3166.def via supplementalData.xml.
# TODO: Add support for updating locale/iso-4217.def.
+# TODO: In cases where a locale & lang do not exist in the CLDR, we should +# still be able to update English names in the description and aspects that +# are territory specific (and lang independent). +# TODO: To address the previous case, we should split CldrLocale up into a +# base class and CldrLanguage and CldrTerritory children. Then the CldrLocale +# object would take care of blending those into its own results. +# TODO: Add ISBN support: https://www.isbn-international.org/range_file_generation + +from __future__ import print_function + +import argparse +import datetime +import errno +import logging +import os +import re +import subprocess +import sys +import time +from xml.etree import ElementTree + +import locales +u_encode = locales.u_encode +u_decode = locales.u_decode + + +# Where to store CLDR/etc... data files we fetch. +DEFAULT_WORKING_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), + 'cldr-%(version)s') + + +def get_parser(): + """Return an argument parser for this module.""" + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument('--working-dir', default=DEFAULT_WORKING_DIR, + help='Where to download files (default: %(default)s)') + parser.add_argument('-v', '--version', default=Cldr.CURR_VERSION, + help='Version of CLDR to use (default: %(default)s)') + parser.add_argument('locales', nargs='*', help='Locales to generate') + return parser + + +def logging_init(debug=False): + """Set up the logging module.""" + fmt = '%(asctime)s: %(levelname)-7s: ' + fmt += '%(message)s' + # 'Sat, 05 Oct 2013 18:58:50 -0400 (EST)' + tzname = time.strftime('%Z', time.localtime()) + datefmt = '%a, %d %b %Y %H:%M:%S ' + tzname + level = logging.DEBUG if debug else logging.INFO + handler = logging.StreamHandler(stream=sys.stdout) + formatter = logging.Formatter(fmt, datefmt) + + handler.setFormatter(formatter) + + logger = logging.getLogger() + logger.addHandler(handler) + logger.setLevel(level) + + +class cached_property(object): 
# pylint: disable=invalid-name + """Like @property but cached""" + + def __init__(self, func): + self.func = func + + def __get__(self, instance, _owner): + if instance is None: + return self + value = instance.__dict__[self.func.__name__] = self.func(instance) + return value + + +class Iso639(object): + """Content for the ISO-639 database.""" + + # Link to upstream ISO-639-2 database. + ISO639_2_URI = 'http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt' + + # Path to our local copy of the ISO-639 database. + PATH = os.path.join(os.path.dirname(os.path.dirname( + os.path.realpath(__file__))), 'locale', 'iso-639.def') + + # Regex to process our local ISO-639 def file. + _LINE_MATCH = re.compile( + r'^(DEFINE_LANGUAGE_CODE $"([^"]*)", ([^,]*), ([^,]*), ([^,]*)$' + r'|DEFINE_LANGUAGE_CODE3 $"([^"]*)", ([^,]*), ([^,]*)$)$') + + def __init__(self): + self.db = {} + with open(self.PATH) as fp: + for line in fp: + m = self._LINE_MATCH.match(line) + if m: + if m.group(1) is None: + # DEFINE_LANGUAGE_CODE3 form. + self.db[m.group(6)] = (m.group(5), m.group(7)) + else: + # DEFINE_LANGUAGE_CODE form. + self.db[m.group(2)] = (m.group(1), m.group(3), + m.group(4)) + + def get_term(self, lang): + """Return the ISO 639-2/T (Terminology) code.""" + entry = self.db.get(lang, ()) + if len(entry) == 3: + return entry[1] + + def get_bib(self, lang): + """Return the ISO 639-2/B (Bibliographic) code.""" + entry = self.db.get(lang, ()) + if len(entry) == 3: + return entry[2] + + def _download_uri(self, path): + """Download the ISO-639-2 db.""" + iso639 = os.path.join(path, os.path.basename(self.ISO639_2_URI)) + if not os.path.exists(iso639): + subprocess.check_call(['wget', '-O', iso639, self.ISO639_2_URI]) + self._load_iso639(iso639) + + @staticmethod + def _load_iso639(db): + """Load ISO-639-2 database. 
+ + http://www.loc.gov/standards/iso639-2/ascii_8bits.html + + An alpha-3 (bibliographic) code, + an alpha-3 (terminologic) code (when given), + an alpha-2 code (when given), + an English name, and + a French name of a language are all separated by pipe (|) characters. + """ + db = {} + with open(db) as fp: + for line in fp: + bcode, tcode, code, _en, _fr = line.rstrip().split('|') + if code: + db[code] = (bcode, tcode) + return db + + +class CarDatabase(object): + """Content for international licence plate country code.""" + + # Path to our local copy of the database. + PATH = os.path.join(os.path.dirname(os.path.dirname( + os.path.realpath(__file__))), 'locale', 'car.def') + + def __init__(self): + lines = [x.strip() for x in open(self.PATH).readlines() if '|' in x] + self.db = dict(x.split('|') for x in lines) + + def get(self, territory): + return self.db.get(territory) + + +class CldrLocale(object): + """Content for a single locale in the cldr database.""" + + _DAY_KEYS = ('sun', 'mon', 'tue', 'wed', 'thu', 'fri', 'sat') + + def __init__(self, cldr, locale, iso639, cardb): + self._lang = None + self._territory = None + + self.cldr = cldr + self.locale = locale + self.iso639 = iso639 + self.cardb = cardb + + # Try a few variations to try and find a suitable data source. + + # Try the original locale name. + try: + self.locale_root = cldr.load_lang(locale.cldr) + except OSError as e: + if e.errno != errno.ENOENT: + raise + + # See if there is a "world" locale for this lang. + try: + self.locale_root = cldr.load_lang(locale.lang + '_001') + # Override the territory though so it isn't "world". + self._territory = locale.territory + except OSError as e: + if e.errno != errno.ENOENT: + raise + + # Generate the locale ourselves. + self.locale_root = self.generate_locale(locale.lang, locale.territory) + + # We might have languages that are not in CLDR. + + # Try the language w/script name details. 
+ try: + self.lang_root = cldr.load_lang(locale.cldr_lang) + except OSError as e: + if e.errno != errno.ENOENT: + raise + + # Try the plain language then. + try: + self.lang_root = cldr.load_lang(locale.cldr_lang) + except OSError as e: + if e.errno != errno.ENOENT: + raise + + # Stub out the lang. + #self._lang = locale.lang + self.lang_root = None + + @staticmethod + def generate_locale(lang, territory): + """Generate a simple locale XML for this lang/territory. + + Used when we have a locale that isn't in CLDR, but CLDR does have + the lang and we know the territory. + """ + return ElementTree.fromstring( + '<ldml><identity>' + + ('<language type="%s"/>' % lang) + + ('<territory type="%s"/>' % territory) + + '</identity></ldml>' + ) + + @cached_property + def lang(self): + """The locale's short language code.""" + root = self.locale_root.find('identity/language') + return root.get('type') + + @cached_property + def territory(self): + """The locale's short territory code.""" + if self._territory: + return self._territory + root = self.locale_root.find('identity/territory') + return root.get('type') + + @cached_property + def en_lang(self): + """The name of the language in English.""" + root = self.cldr.load_lang('en') + names = root.find('localeDisplayNames') + # First see if the locale has a name before we fall back to the lang. + langs_root = names.find('languages') + lang_root = langs_root.find('language[@type="%s"]' % self.locale) + if lang_root is None: + lang_root = langs_root.find('language[@type="%s"]' % self.lang) + # The CLDR is missing some languages. 
+ if lang_root is None: + logging.warning('%s: en_lang: CLDR is missing english name for ' + 'the language', self.locale) + return None + return lang_root.findtext('.') + + @cached_property + def en_territory(self): + """The name of the territory in English.""" + root = self.cldr.load_lang('en') + names = root.find('localeDisplayNames') + return names.find('territories/territory[@type="%s"]' % + self.territory).findtext('.') + + @cached_property + def country_ab2(self): + """Two-letter ISO-3166 country code.""" + # TODO: Implement this. + + @cached_property + def country_ab3(self): + """Three-letter ISO-3166 country code.""" + # TODO: Implement this. + + @cached_property + def lang_name(self): + """The localized name for the language.""" + for root in (self.locale_root, self.lang_root): + if root is None: + continue + + names = root.find('localeDisplayNames') + if names is not None: + langs_root = names.find('languages') + if langs_root is not None: + lang_root = langs_root.find('language[@type="%s"]' % + self.lang) + if lang_root is not None: + return lang_root.findtext('.') + + @cached_property + def unicode_language_subtag(self): + """Two-letter ISO 639-1 code""" + #root = self.cldr.load_supp('supplementalMetadata') + #alias = root.find('metadata/alias/languageAlias[@type="%s"]' % self.lang) + return self.lang if len(self.lang) == 2 else '' + + @cached_property + def lang_term(self): + """Three-letter ISO 639-2/T (Terminology) code""" + return self.iso639.get_term(self.lang) + + @cached_property + def lang_lib(self): + """Three-letter ISO 639-2/B (Bibliographic) code""" + return self.iso639.get_bib(self.lang) + + @cached_property + def country_name(self): + """The localiezd name for the territory.""" + for root in (self.locale_root, self.lang_root): + if root is None: + continue + + names = root.find('localeDisplayNames') + if names is not None: + name = names.find('territories/territory[@type="%s"]' % + self.territory) + if name is not None: + return 
name.findtext('.') + + @cached_property + def country_num(self): + """ISO 3166-1 numeric code.""" + root = self.cldr.load_supp('supplementalData') + codes = root.find('codeMappings/territoryCodes[@type="%s"]' % + self.territory) + return int(codes.get('numeric')) + + @cached_property + def country_car(self): + """International licence plate country code.""" + return self.cardb.get(self.territory) + + @cached_property + def country_term(self): + """ISO 3166-1 alpha-3 code""" + root = self.cldr.load_supp('supplementalData') + codes = root.find('codeMappings/territoryCodes[@type="%s"]' % + self.territory) + return codes.get('alpha3').lower() + + @cached_property + def tel_int_fmt(self): + """Telephone format for international calling.""" + # TODO: Implement this. + + @cached_property + def tel_dom_fmt(self): + """Telephone format for domestic calling.""" + # TODO: Implement this. + + @cached_property + def int_select(self): + """Telephone prefix for calling international numbers.""" + # TODO: Implement this. + + @cached_property + def int_prefix(self): + """Telephone international country code prefix.""" + root = self.cldr.load_supp('telephoneCodeData') + code = root.find('telephoneCodeData/codesByTerritory[@territory="%s"]' + '/telephoneCountryCode' % self.territory) + # The CLDR is missing some territories. + if code is None: + logging.warning('%s: int_prefix: CLDR is missing country code; ' + 'try https://countrycode.org/%s', + self.locale, self.territory) + return None + return code.get('code') + + @cached_property + def int_curr_symbol(self): + """Need to rectify w/locale/iso-4217.def.""" + # The xmlpath support in python is not complete, so we need to search + # for the currency w/missing @to attribute ourselves. 
+ root = self.cldr.load_supp('supplementalData') + currencies = root.find('currencyData/region[@iso3166="%s"]' % + self.territory) + for currency in currencies.getchildren(): + if 'to' not in currency.keys(): + return currency.get('iso4217') + + raise ValueError('Could not find a currency for %s' % (self.territory,)) + + @cached_property + def currency_symbol(self): + """Need to rectify w/locale/iso-4217.def.""" + def filter_markers(sym): + """Strip out some content we don't care about like the RTL marker.""" + return sym.replace(u'\u200f', '') + + # First search the locale, then the lang dbs. + for root in (self.locale_root, self.lang_root): + if root is None: + continue + + numbers_root = root.find('numbers') + if numbers_root is None: + continue + symbol_ele = numbers_root.find('currencies/currency[@type="%s"]' + '/symbol' % self.int_curr_symbol) + if symbol_ele is not None: + return filter_markers(symbol_ele.findtext('.')) + + # Try the common currency database. + chars_root = self.cldr.load_supp('characters') + for symbol_ele in chars_root.find('characters' + '/character-fallback').getchildren(): + if symbol_ele.findtext('substitute') == self.int_curr_symbol: + return filter_markers(symbol_ele.get('value')) + + # A few symbols have no translation. + return None #self.int_curr_symbol + + @cached_property + def number_system(self): + """Get the active number system for this locale.""" + for root in (self.locale_root, self.lang_root): + if root is None: + continue + + numbers_root = root.find('numbers') + if numbers_root is None: + continue + + # If there's a default labeled, use it. Otherwise just go with + # the first one found. It should be the only one. 
+ num_sys_ele = root.find('defaultNumberingSystem') + if num_sys_ele is None: + return numbers_root.find('symbols') + else: + return numbers_root.find('symbols[@numberSystem="%s"]' % + num_sys_ele.findtext('.')) + + @cached_property + def decimal_point(self): + """The symbol used to denote decimal points.""" + num_symbols_root = self.number_system + try: + return num_symbols_root.find('decimal').findtext('.') + except AttributeError: + return None + + @cached_property + def thousands_sep(self): + """The symbol used to group thousands digits.""" + num_symbols_root = self.number_system + try: + return num_symbols_root.find('group').findtext('.') + except AttributeError: + return None + + @cached_property + def grouping(self): + # TODO: Implement this. + pass + + def _lookup_day_mon(self, cal_field, cal_type, cal_idxs): + """Look up various calendar fields.""" + for root in (self.locale_root, self.lang_root): + if root is None: + continue + + dates_root = root.find('dates') + if dates_root is None: + continue + calendars_root = dates_root.find('calendars') + if calendars_root is None: + continue + # XXX: Look up type in calendarPreference ? 
+ calendar_root = calendars_root.find('calendar[@type="gregorian"]') + if calendar_root is None: + continue + + dm_root = None + for key in ('stand-alone', 'format', 'narrow'): + ctx_root = calendar_root.find('%ss/%sContext[@type="%s"]' % + (cal_field, cal_field, key)) + if ctx_root is None: + continue + dm_root = ctx_root.find('%sWidth[@type="%s"]' % + (cal_field, cal_type)) + if dm_root is None: + continue + + ret = [dm_root.find('%s[@type="%s"]' % (cal_field, x)) + for x in cal_idxs] + if None not in ret: + return [x.findtext('.') for x in ret] + + def _lookup_day(self, width_type): + """Internal helper for abday/day lookups.""" + return self._lookup_day_mon('day', width_type, self._DAY_KEYS) + + def _lookup_mon(self, width_type): + """Internal helper for abmon/mon lookups.""" + return self._lookup_day_mon('month', width_type, range(1, 13)) + + @cached_property + def abday(self): + """Abbreviated localized names for the days of the week.""" + return self._lookup_day('abbreviated') + + @cached_property + def day(self): + """Full localized names for the days of the week.""" + return self._lookup_day('wide') + + @cached_property + def abmon(self): + """Abbreviated localized names for the months.""" + return self._lookup_mon('abbreviated') + + @cached_property + def mon(self): + """Full localized names for the months.""" + return self._lookup_mon('wide') + + # http://www.unicode.org/reports/tr35/tr35-dates.html#Date_Format_Patterns + _CLDR_TO_POSIX_FMT = { + # year + 'y': '%%-y', + 'yy': '%%y', + 'yyy': '%%-Y', + 'yyyy': '%%Y', + # month + 'M': '%%-m', + 'MM': '%%m', + 'MMM': '%%b', + 'MMMM': '%%B', + # day + 'd': '%%-d', + 'dd': '%%d', + # period + 'a': '%%p', + # hour + 'h': '%%-I', + 'hh': '%%I', + 'H': '%%-H', + 'HH': '%%H', + # minute + 'm': '%%-M', + 'mm': '%%M', + # second + 's': '%%-S', + 'ss': '%%S', + } + + @classmethod + def _to_posix_fmt(cls, fmt): + """Convert the CLDR notation to what POSIX uses.""" + lookup = lambda m: 
cls._CLDR_TO_POSIX_FMT[m.group(1)] + return re.sub(r'\b(' + '|'.join(cls._CLDR_TO_POSIX_FMT.keys()) + r')\b', + lookup, fmt) + + @cached_property + def hours_format(self): + """Return 24 or 12 depending on preferred %H or %h format""" + root = self.cldr.load_supp('supplementalData') + datasets = root.find('timeData') + pref = None + for dataset in datasets.findall('hours'): + territories = dataset.get('regions') + value = dataset.get('preferred') + + # TODO: Make this walk logic more robust/common. + territories = territories.split() + if '001' in territories: + if pref is None: + # The allowed field makes this tricky. + #pref = value + pass + if self.territory in territories: + pref = value + + if pref == 'H': + return '24' + elif pref == 'h': + return '12' + elif pref is None: + return None + else: + raise ValueError('Unknown hour value: %s' % pref) + + @cached_property + def am_pm(self): + """Localized AM/PM time fields when 12 hour clocks are used.""" + if self.hours_format == '24': + return ['', ''] + elif self.hours_format is None: + return None + + return self._lookup_day_mon('dayPeriod', 'abbreviated', ('am', 'pm')) + + def _lookup_d_t_fmt(self, dt, dt_type='medium'): + """Internal helper for various fmt lookups.""" + for root in (self.locale_root, self.lang_root): + if root is None: + continue + + dates_root = root.find('dates') + if dates_root is None: + continue + calendars_root = dates_root.find('calendars') + # XXX: Look up type in calendarPreference ? 
+ calendar_root = calendars_root.find('calendar[@type="gregorian"]') + + fmts = calendar_root.find('%sFormats/%sFormatLength[@type="%s"]' + '/%sFormat/pattern' % + (dt, dt, dt_type, dt)) + if fmts is not None: + return fmts.findtext('.') + + @cached_property + def d_t_fmt(self): + """Appropriate date and time representation (%c) + + Example: + $ date +'%a %d %b %Y %r %Z' + Tue 09 Feb 2016 06:39:48 PM EST + """ + return self._to_posix_fmt( + self._lookup_d_t_fmt('dateTime').replace( + '{0}', self._t_fmt).replace( + '{1}', self._d_fmt)) + + @cached_property + def _d_fmt(self): + """Internal helper for the raw d_fmt field.""" + return self._lookup_d_t_fmt('date') + + @cached_property + def d_fmt(self): + """Appropriate date representation (%x) + + Example: + $ date +'%m/%d/%Y' + 02/09/2016 + """ + return self._to_posix_fmt(self._d_fmt) + + @cached_property + def _t_fmt(self): + """Internal helper for the raw t_fmt field.""" + return self._lookup_d_t_fmt('time') + + @cached_property + def t_fmt(self): + """Appropriate time representation (%X) + + Example: + $ date +%r + 06:41:21 PM + """ + return self._to_posix_fmt(self._t_fmt) + + @cached_property + def t_fmt_ampm(self): + """Appropriate AM/PM time representation (%r) + + Example: + $ date +'%I:%M:%S %p' + 06:41:21 PM + """ + if self.hours_format == '24': + return '' + elif self.hours_format is None: + return None + + return None + + @cached_property + def date_fmt(self): + """Appropriate date representation (date(1)) + + $ date +'%a %b %e %H:%M:%S %Z %Y' + Tue Feb 9 06:39:48 EST 2016 + """ + pass + + @cached_property + def week(self): + """DAYSINWEEK;WEEKSTARTDATE;MINWEEKLEN field""" + root = self.cldr.load_supp('supplementalData') + data = root.find('weekData') + ret = None + for start in data.findall('minDays'): + territories = start.get('territories') + value = start.get('count') + + # TODO: Make this walk logic more robust/common. 
+ territories = territories.split() + if '001' in territories: + if ret is None: + ret = value + if self.territory in territories: + ret = value + + # Just hardcode this as no one changes it. + daysinweek = 7 + + # Hardcode this as well as there's no advantage to it otherwise. + # It's also what CLDR bases things on. + weekstartdate = 19971130 + + minweeklen = int(ret) + + return (daysinweek, weekstartdate, minweeklen) + + @cached_property + def first_weekday(self): + """Number of day in the week for the first column in the calendar. + + Sunday = 1, Monday = 2, ... + """ + root = self.cldr.load_supp('supplementalData') + data = root.find('weekData') + first = None + for start in data.findall('firstDay'): + territories = start.get('territories') + day = start.get('day') + + # Throw out ones we don't care about. + if start.get('alt') is not None: + continue + + # TODO: Make this walk logic more robust/common. + territories = territories.split() + if '001' in territories: + if first is None: + first = day + if self.territory in territories: + first = day + + # We add +1 for index->day-of-week adjustment, + return self._DAY_KEYS.index(first) + 1 + + @cached_property + def first_workday(self): + """Number of day in the week for the first working day. + + Sunday = 1, Monday = 2, ... + """ + root = self.cldr.load_supp('supplementalData') + data = root.find('weekData') + first = None + for start in data.findall('weekendEnd'): + territories = start.get('territories') + day = start.get('day') + + # TODO: Make this walk logic more robust/common. + territories = territories.split() + if '001' in territories: + if first is None: + first = day + if self.territory in territories: + first = day + + # We add +1 for index->day-of-week adjustment, + # and we add +1 for weekendEnd->workdayStart. + # We do the % to handle sat->sun wrapping. 
+ return ((self._DAY_KEYS.index(first) + 1) % 7) + 1 + + @cached_property + def measurement(self): + """Return 1 for metric and 2 for imperial""" + root = self.cldr.load_supp('supplementalData') + measurement = None + for system in root.findall('measurementData/measurementSystem'): + territories = system.get('territories') + stype = system.get('type') + + # Throw out ones we don't care about. + if system.get('category') == 'temperature' or stype == 'UK': + continue + + # TODO: Make this walk logic more robust/common. + territories = territories.split() + if '001' in territories: + if measurement is None: + measurement = stype + if self.territory in territories: + measurement = stype + + # We don't use imperial settings for Myanmar even though CLDR does. + # https://en.wikipedia.org/wiki/Myanmar_units_of_measurement + if self.territory == 'MM': + if measurement == 'US': + measurement = 'metric' + else: + raise ValueError('CLDR is updated; drop this hack') + + if measurement == 'metric': + return 1 + elif measurement == 'US': + return 2 + else: + raise ValueError('Do not understand type %s' % measurement) + + @cached_property + def measurement_copy(self): + """We copy other locales for most""" + if self.locale in ('en_US', 'i18n'): + return None + elif self.measurement == 1: + return 'i18n' + elif self.measurement == 2: + return 'en_US' + else: + raise ValueError('Unknown measurement %s' % self.measurement) + + @cached_property + def paper(self): + """Return the paper type""" + root = self.cldr.load_supp('supplementalData') + paper = None + for system in root.findall('measurementData/paperSize'): + territories = system.get('territories') + stype = system.get('type') + + # TODO: Make this walk logic more robust/common. 
class Cldr(object):
    """Handle on a local copy of the CLDR release data.

    Owns the working directory, the download/unpack step, and one cache of
    parsed XML roots per data subdirectory ("main" and "supplemental").
    """

    # The CLDR release we currently track.
    CURR_VERSION = '29'

    # Location of the release archive; %(version)s is substituted.
    URI = 'http://unicode.org/Public/cldr/%(version)s/core.zip'

    def __init__(self, path, version):
        """Expand |path| and the URI for |version|; create the work dir."""
        subst = {'version': version}
        self.dir = path % subst
        self.uri = self.URI % subst
        self.version = version
        # Filled in by download() from the archive's mtime.
        self.date = None
        self.main_dbs, self.supp_dbs = {}, {}
        self.iso639 = Iso639()
        self.cardb = CarDatabase()

        # Make sure the working dir exists.
        if not os.path.exists(self.dir):
            os.makedirs(self.dir)

    def download(self):
        """Fetch core.zip into the working dir and unpack it, as needed."""
        zip_path = os.path.join(self.dir, 'core.zip')
        have_archive = os.path.exists(zip_path)
        if not have_archive:
            subprocess.check_call(['wget', '-O', zip_path, self.uri])
        mtime = os.path.getmtime(zip_path)
        self.date = datetime.datetime.fromtimestamp(mtime)

        # An existing common/ dir means the archive was already unpacked.
        if os.path.exists(os.path.join(self.dir, 'common')):
            return
        subprocess.check_call(['unzip', '-u', 'core.zip'], cwd=self.dir)

    def _load_db(self, db, subdir, cache):
        """Return the XML root of |db| under |subdir|, memoized in |cache|."""
        try:
            return cache[db]
        except KeyError:
            pass
        xml_path = os.path.join(self.dir, 'common', subdir, '%s.xml' % db)
        xml_root = ElementTree.parse(xml_path).getroot()
        cache[db] = xml_root
        return xml_root

    def _load_main(self, db):
        """Return the root of |db| from the main repo."""
        return self._load_db(db, 'main', self.main_dbs)

    def load_lang(self, lang):
        """Return the root of the |lang| language database."""
        return self._load_main(lang)

    def load_supp(self, db):
        """Return the root of |db| from the supplemental repo."""
        return self._load_db(db, 'supplemental', self.supp_dbs)

    def locale(self, locale):
        """Return a CldrLocale object for the given |locale|."""
        return CldrLocale(self, locale, self.iso639, self.cardb)
+ cldr_values = { + 'generator': os.path.basename(__file__), + 'english_territory_name': cldr_locale.en_territory, + 'source_name': 'Unicode Common Locale Data Repository (CLDR)', + 'source_version': cldr.version, + 'source_uri': cldr.uri.replace('/', '//'), + 'source_date': cldr.date.strftime('%Y-%m-%d'), + 'lang': cldr_locale.lang, + 'territory': cldr_locale.territory, + 'locale': cldr_locale.locale, + } + if cldr_locale.en_lang: + cldr_values.update({ + 'english_lang_name': u_decode(cldr_locale.en_lang), + }) + + all_values = {} + all_values['LC_IDENTIFICATION'] = { + #'source': 'Based on %(source_name)s', + #'address': '%(source_uri)s', + #'contact': 'http:////cldr.unicode.org//index//process', + #'email': 'bug-glibc-locales@gnu.org', + 'tel': '', + 'fax': '', + 'territory': '%(english_territory_name)s', + #'revision': '%(source_version)s', + #'date': '%(source_date)s', + } + if cldr_locale.en_lang: + all_values['LC_IDENTIFICATION'].update({ + 'title': ('%(english_lang_name)s language locale for ' + '%(english_territory_name)s'), + 'language': '%(english_lang_name)s', + }) + + # These are based on the charset, not the locale. 
+ all_values['LC_CTYPE'] = {} + all_values['LC_COLLATE'] = {} + all_values['LC_TIME'] = { + #'abday': cldr_locale.abday, + #'day': cldr_locale.day, + #'abmon': cldr_locale.abmon, + #'mon': cldr_locale.mon, + #'am_pm': cldr_locale.am_pm, + #'d_t_fmt': cldr_locale.d_t_fmt, + #'d_fmt': cldr_locale.d_fmt, + #'t_fmt': cldr_locale.t_fmt, + #'t_fmt_ampm': cldr_locale.t_fmt_ampm, + #'date_fmt': cldr_locale.date_fmt, + #'week': cldr_locale.week, + #'first_weekday': cldr_locale.first_weekday, + #'first_workday': cldr_locale.first_workday, + } + all_values['LC_NUMERIC'] = { + #'decimal_point': cldr_locale.decimal_point, + #'thousands_sep': cldr_locale.thousands_sep, + #'grouping': cldr_locale.grouping, + } + all_values['LC_MONETARY'] = { + 'int_curr_symbol': cldr_locale.int_curr_symbol + ' ', + 'currency_symbol': cldr_locale.currency_symbol, + } + # See lang/posix/messages/{yes,no}str. + all_values['LC_MESSAGES'] = { + #'yesexpr': cldr_locale.yesexpr, + #'noexpr': cldr_locale.noexpr, + } + all_values['LC_PAPER'] = { + 'height': cldr_locale.paper_height, + 'width': cldr_locale.paper_width, + #'copy': cldr_locale.paper_copy, + } + # XXX: Need a data source for this. 
+ all_values['LC_NAME'] = { + } + all_values['LC_ADDRESS'] = { + #'postal_fmt': + 'country_name': cldr_locale.country_name, + #'country_post': + 'country_ab2': cldr_locale.country_ab2, + 'country_ab3': cldr_locale.country_ab3, + 'country_num': cldr_locale.country_num, + 'country_car': cldr_locale.country_car, + #'country_isbn': + 'lang_name': cldr_locale.lang_name, + 'lang_ab': cldr_locale.unicode_language_subtag, + 'lang_term': cldr_locale.lang_term, + 'lang_lib': cldr_locale.lang_lib, + } + all_values['LC_TELEPHONE'] = { + #'tel_int_fmt': cldr_locale.tel_int_fmt, + #'tel_dom_fmt': cldr_locale.tel_dom_fmt, + #'int_select': cldr_locale.int_select, + } + if cldr_locale.int_prefix: + all_values['LC_TELEPHONE'].update({ + 'int_prefix': cldr_locale.int_prefix, + }) + all_values['LC_MEASUREMENT'] = { + 'measurement': cldr_locale.measurement, + #'copy': cldr_locale.measurement_copy, + } + + a = str(cldr_locale.lang) + b = self.lc_address.fields['lang_ab'] + if b and a != b: + print('%s: mismatch: %s %s' % (self.name, a, b)) + + # Walk all the categories. + for category in self.categories: + lc = getattr(self, category.lower()) + values = all_values[category] + if not values: + continue + + # Walk each line in this locale category. + start_of_line = None + full_line = '' + i = 0 + seen_keys = set() + while i < len(lc.content): + line = lc.content[i] + if not line: + i += 1 + continue + + # If the line ends with an escape it is wrapped, so unwrap it + # before we check for updates to the value. + if (not line.startswith(self.comment_char) and + line.endswith(self.escape_char)): + if not full_line: + start_of_line = i + full_line += line[:-1].lstrip() + i += 1 + continue + elif full_line: + line = full_line + line.lstrip() + full_line = '' + else: + start_of_line = None + + # Process this line. 
+ key = line.split()[0] + new_value = values.get(key) + seen_keys.add(key) + if new_value is not None: + is_int = isinstance(new_value, int) + is_list = isinstance(new_value, (tuple, list, set)) + if not is_int and is_list: + is_int = isinstance(new_value[0], int) + if is_int: + if is_list: + new_value = ';'.join(str(x) for x in new_value) + else: + new_value = str(new_value) + m = re.match(r'\s*(.*?)\s+([0-9;]+)$', line) + else: + if is_list: + new_value = '";"'.join(u_encode(x % cldr_values) + for x in new_value) + elif key != 'copy': + new_value %= cldr_values + if category != 'LC_IDENTIFICATION': + new_value = u_encode(new_value) + m = re.match(r'\s*([^"]*)"(.*)"$', line) + + # We should standardize case at some point. + if m and new_value.lower() != m.group(2).lower(): + disp_key = ('%s:%s' % (category.upper(), key) + if key == 'copy' else key) + logging.info('%s: %s: changing {%s} to {%s}', + self.name, disp_key, + u_decode(m.group(2)), + u_decode(new_value)) + leading_line = m.group(1) + + # This is tricky as we have to delete most of the + # multiline, then update the one remaining. + if start_of_line is not None: + #for _ in range(start_of_line, i): + # lc.content.pop(start_of_line) + del lc.content[start_of_line:i] + i = start_of_line + if '";"' in new_value: + leading_line = leading_line.rstrip() + '\t' + num_tabs = (len(leading_line) // 8) + 1 + new_value = new_value.replace( + '";"', + '";/\n' + ('\t' * num_tabs) + '"') + + # Finally deploy the updated line. + fmt = '%s %s' if is_int else '%s"%s"' + lc.content[i] = fmt % (leading_line, new_value) + + i += 1 + + missing_keys = set(values.keys()) - seen_keys + for key in missing_keys: + # TODO: Merge with the logic above. 
def main(argv):
    """Update each locale file named in |argv| from the CLDR database."""
    opts = get_parser().parse_args(argv)
    logging_init(opts)

    # Get a handle to the cldr database.
    cldr = Cldr(opts.working_dir, opts.version)
    cldr.download()

    # These are not "real" locales, so skip them.
    pseudo_locales = ('i18n', 'iso14651', 'translit', 'C', 'POSIX')

    # Process all the locales the user told us to.
    for locale in opts.locales:
        name = os.path.basename(locale)
        prefix = name.split('_', 1)[0]
        if prefix in pseudo_locales:
            continue

        logging.info('Updating %s', locale)
        try:
            loc = Locale(name=name, path=locale)
            try:
                loc.update_cldr(cldr)
            except Exception:
                # A failed update is only logged; the file is still
                # rewritten below.
                logging.error('%s: updating failed', locale, exc_info=True)
            # Write to a side file first, then move it over the original.
            staged = locale + '.new'
            loc.write(staged)
            os.rename(staged, locale)
        except UnicodeDecodeError:
            logging.error('%s: bad encodings', locale, exc_info=True)
            subprocess.check_call(['file', locale])
        except (IndexError, locales.LocaleError):
            logging.error('%s: loading failed', locale, exc_info=True)
#
# The GNU C Library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with the GNU C Library; if not, see
# <http://www.gnu.org/licenses/>.

"""Simple script to quickly make locale files readable.

Example: %(prog)s locales/en_US | less
"""

from __future__ import print_function

import argparse
import sys

import locales


def process(_opts, fp):
    """Print every line of |fp| to stdout after locales.u_decode().

    A line that fails to decode is reported with a "FILTER ERROR" line and
    then printed unchanged.
    """
    for line in fp:
        try:
            line = locales.u_decode(line)
        except ValueError as e:
            # Python's chr() does not support the full UTF-8 codepoint
            # range.  Just use the line as-is if it fails.
            print('FILTER ERROR: %s' % e)
        print(line, end='')


def process_path(opts, path):
    """Run process() over the file at |path|."""
    # Close the file once it has been processed instead of leaking the
    # handle for every path given on the command line.
    with open(path) as fp:
        return process(opts, fp)


def get_parser():
    """Return the command line parser; the module docstring is the help."""
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('files', nargs='*')
    return parser


def main(argv):
    """Filter each file named in |argv|, or stdin when none are given."""
    parser = get_parser()
    opts = parser.parse_args(argv)
    if not opts.files:
        process(opts, sys.stdin)
    else:
        for f in opts.files:
            process_path(opts, f)


if __name__ == '__main__':
    # sys.exit is always available; the bare exit() helper is only
    # installed by the site module.
    sys.exit(main(sys.argv[1:]))

[WIP] locale python scripts for cldr updates

Commit Message

Patch