From patchwork Fri Aug 18 11:04:29 2017 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 8bit X-Patchwork-Submitter: Mike FABIAN X-Patchwork-Id: 22211 X-Patchwork-Delegate: mfabian@redhat.com Received: (qmail 84695 invoked by alias); 18 Aug 2017 11:04:53 -0000 Mailing-List: contact libc-alpha-help@sourceware.org; run by ezmlm Precedence: bulk List-Id: List-Unsubscribe: List-Subscribe: List-Archive: List-Post: List-Help: , Sender: libc-alpha-owner@sourceware.org Delivered-To: mailing list libc-alpha@sourceware.org Received: (qmail 51697 invoked by uid 89); 18 Aug 2017 11:04:34 -0000 Authentication-Results: sourceware.org; auth=none X-Virus-Found: No X-Spam-SWARE-Status: No, score=-25.9 required=5.0 tests=BAYES_00, GIT_PATCH_0, GIT_PATCH_1, GIT_PATCH_2, GIT_PATCH_3, KAM_ASCII_DIVIDERS, KAM_LOTSOFHASH, RP_MATCHES_RCVD, SPF_HELO_PASS autolearn=ham version=3.3.2 spammy=Attached X-Spam-User: qpsmtpd, 2 recipients X-HELO: mx1.redhat.com DMARC-Filter: OpenDMARC Filter v1.3.2 mx1.redhat.com 9EBED81DF0 Authentication-Results: ext-mx01.extmail.prod.ext.phx2.redhat.com; dmarc=none (p=none dis=none) header.from=redhat.com Authentication-Results: ext-mx01.extmail.prod.ext.phx2.redhat.com; spf=fail smtp.mailfrom=mfabian@redhat.com From: Mike FABIAN To: Florian Weimer Cc: Joseph Myers , GNU C Library , libc-locales@sourceware.org Subject: Re: localedef during tests suddenly needs a lot of memory References: <8d69bb17-b684-dfa4-2248-36a7fd65457d@redhat.com> <19bb4e0c-7452-142c-94c4-e8ad543dda3d@redhat.com> <365430de-a824-dcbf-6513-5eec3f18a97a@redhat.com> <47bfc62d-217a-51b3-4178-0e3dc48a256d@redhat.com> Date: Fri, 18 Aug 2017 13:04:29 +0200 In-Reply-To: (Mike FABIAN's message of "Fri, 18 Aug 2017 12:09:09 +0200") Message-ID: User-Agent: Gnus/5.13 (Gnus v5.13) Emacs/25.1.50 (gnu/linux) MIME-Version: 1.0 Mike FABIAN wrote: > Florian Weimer wrote: > >> This started with: >> >> commit 7a79e321c6f85b204036c33d85f6b2aa794e7c76 >> Author: Thorsten 
Glaser >> Date: Fri Jul 14 14:02:50 2017 +0200 >> >> Refresh generated charmap data and ChangeLog >> >> [BZ #21750] >> * charmaps/UTF-8: Refresh. >> >> Mike is looking at re-adding the range generation support to the Python >> script, hopefully that should reduce the memory requirements again. >> >> Florian > > Attached is my patch to use ranges instead of single code points in the width > data of charmaps/UTF-8 wherever possible. To do this, I rewrote most of the code Thorsten Glaser added. ---------------------------------------------------------------------- Here is the diff of my patched utf8_gen.py against Thorsten Glaser’s version: $ git diff a3fe6a20bf81ef6a97a761dac9050517e7fd7a1f..4f737628ef23033b8d78e0acead37b2722419822 localedata/unicode-gen/utf8_gen.py diff --git a/localedata/unicode-gen/utf8_gen.py b/localedata/unicode-gen/utf8_gen.py index 1563aa11d2..7efae08461 100755 --- a/localedata/unicode-gen/utf8_gen.py +++ b/localedata/unicode-gen/utf8_gen.py @@ -229,27 +229,45 @@ def process_width(outfile, ulines, elines): code_points = fields[0].split("..") for key in range(int(code_points[0], 16), int(code_points[1], 16)+1): - width_dict[key] = unicode_utils.ucs_symbol(key) + '\t2' + width_dict[key] = 2 for line in ulines: fields = line.split(";") if fields[4] == "NSM" or fields[2] in ("Cf", "Me", "Mn"): - width_dict[int(fields[0], 16)] = unicode_utils.ucs_symbol( - int(fields[0], 16)) + '\t0' + width_dict[int(fields[0], 16)] = 0 # handle special cases for compatibility - for key in list(range(0x1160, 0x1200)) + list(range(0x3248, 0x3250)) + \ - list(range(0x4DC0, 0x4E00)) + list((0x00AD,)): + for key in list((0x00AD,)): if key in width_dict: del width_dict[key] - width_dict[0x1160] = '{:s}...{:s}\t0'.format( - unicode_utils.ucs_symbol(0x1160), unicode_utils.ucs_symbol(0x11FF)) - width_dict[0x3248] = '{:s}...{:s}\t2'.format( - unicode_utils.ucs_symbol(0x3248), unicode_utils.ucs_symbol(0x324F)) - width_dict[0x4DC0] = '{:s}...{:s}\t2'.format( - 
unicode_utils.ucs_symbol(0x4DC0), unicode_utils.ucs_symbol(0x4DFF)) + for key in list(range(0x1160, 0x1200)): + width_dict[key] = 0 + for key in list(range(0x3248, 0x3250)) + list(range(0x4DC0, 0x4E00)): + width_dict[key] = 2 + same_width_lists = [] + current_width_list = [] for key in sorted(width_dict): - outfile.write(width_dict[key]+'\n') + if not current_width_list: + current_width_list = [key] + elif (key == current_width_list[-1] + 1 + and width_dict[key] == width_dict[current_width_list[0]]): + current_width_list.append(key) + else: + same_width_lists.append(current_width_list) + current_width_list = [key] + if current_width_list: + same_width_lists.append(current_width_list) + + for same_width_list in same_width_lists: + if len(same_width_list) == 1: + outfile.write('{:s}\t{:d}\n'.format( + unicode_utils.ucs_symbol(same_width_list[0]), + width_dict[same_width_list[0]])) + else: + outfile.write('{:s}...{:s}\t{:d}\n'.format( + unicode_utils.ucs_symbol(same_width_list[0]), + unicode_utils.ucs_symbol(same_width_list[-1]), + width_dict[same_width_list[0]])) if __name__ == "__main__": if len(sys.argv) < 3: ---------------------------------------------------------------------- Here is the diff of my patched utf8_gen.py against a version before Thorsten Glaser’s patches: $ git diff bfff8b1becd7d01c074177df7196ab327cd8c844..4f737628ef23033b8d78e0acead37b2722419822 localedata/unicode-gen/utf8_gen.py diff --git a/localedata/unicode-gen/utf8_gen.py b/localedata/unicode-gen/utf8_gen.py index ab03e750a6..7efae08461 100755 --- a/localedata/unicode-gen/utf8_gen.py +++ b/localedata/unicode-gen/utf8_gen.py @@ -221,31 +221,53 @@ def process_width(outfile, ulines, elines): ''' width_dict = {} - for line in ulines: - fields = line.split(";") - if fields[4] == "NSM" or fields[2] == "Cf": - width_dict[int(fields[0], 16)] = unicode_utils.ucs_symbol( - int(fields[0], 16)) + '\t0' - for line in elines: - # If an entry in EastAsianWidth.txt is found, it overrides entries in - # 
UnicodeData.txt: fields = line.split(";") if not '..' in fields[0]: - width_dict[int(fields[0], 16)] = unicode_utils.ucs_symbol( - int(fields[0], 16)) + '\t2' + code_points = (fields[0], fields[0]) else: code_points = fields[0].split("..") - for key in range(int(code_points[0], 16), - int(code_points[1], 16)+1): - if key in width_dict: - del width_dict[key] - width_dict[int(code_points[0], 16)] = '{:s}...{:s}\t2'.format( - unicode_utils.ucs_symbol(int(code_points[0], 16)), - unicode_utils.ucs_symbol(int(code_points[1], 16))) + for key in range(int(code_points[0], 16), + int(code_points[1], 16)+1): + width_dict[key] = 2 + for line in ulines: + fields = line.split(";") + if fields[4] == "NSM" or fields[2] in ("Cf", "Me", "Mn"): + width_dict[int(fields[0], 16)] = 0 + # handle special cases for compatibility + for key in list((0x00AD,)): + if key in width_dict: + del width_dict[key] + for key in list(range(0x1160, 0x1200)): + width_dict[key] = 0 + for key in list(range(0x3248, 0x3250)) + list(range(0x4DC0, 0x4E00)): + width_dict[key] = 2 + + same_width_lists = [] + current_width_list = [] for key in sorted(width_dict): - outfile.write(width_dict[key]+'\n') + if not current_width_list: + current_width_list = [key] + elif (key == current_width_list[-1] + 1 + and width_dict[key] == width_dict[current_width_list[0]]): + current_width_list.append(key) + else: + same_width_lists.append(current_width_list) + current_width_list = [key] + if current_width_list: + same_width_lists.append(current_width_list) + + for same_width_list in same_width_lists: + if len(same_width_list) == 1: + outfile.write('{:s}\t{:d}\n'.format( + unicode_utils.ucs_symbol(same_width_list[0]), + width_dict[same_width_list[0]])) + else: + outfile.write('{:s}...{:s}\t{:d}\n'.format( + unicode_utils.ucs_symbol(same_width_list[0]), + unicode_utils.ucs_symbol(same_width_list[-1]), + width_dict[same_width_list[0]])) if __name__ == "__main__": if len(sys.argv) < 3: