From patchwork Fri Dec 8 09:19:24 2017 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Arnold Robbins X-Patchwork-Id: 24802 Received: (qmail 97522 invoked by alias); 8 Dec 2017 09:21:21 -0000 Mailing-List: contact libc-alpha-help@sourceware.org; run by ezmlm Precedence: bulk List-Id: List-Unsubscribe: List-Subscribe: List-Archive: List-Post: List-Help: , Sender: libc-alpha-owner@sourceware.org Delivered-To: mailing list libc-alpha@sourceware.org Received: (qmail 96846 invoked by uid 89); 8 Dec 2017 09:20:30 -0000 Authentication-Results: sourceware.org; auth=none X-Virus-Found: No X-Spam-SWARE-Status: No, score=-26.5 required=5.0 tests=AWL, BAYES_00, GIT_PATCH_0, GIT_PATCH_1, GIT_PATCH_2, GIT_PATCH_3, KAM_LAZY_DOMAIN_SECURITY, KAM_SHORT, MANY_HDRS_LCASE, RCVD_IN_DNSWL_LOW autolearn=ham version=3.3.2 spammy=HContent-type:text, Hx-spam-relays-external:ESMTPA X-HELO: mxout4.netvision.net.il MIME-version: 1.0 Content-transfer-encoding: 7BIT Content-type: text/plain; CHARSET=US-ASCII From: Arnold Robbins Message-id: <201712080919.vB89JO1Q005653@skeeve.com> Date: Fri, 08 Dec 2017 11:19:24 +0200 To: carlos@redhat.com, libc-alpha@sourceware.org Subject: [PATCH 17/17] Regex: Implement Rational Range Interpretation. User-Agent: Heirloom mailx 12.5 6/20/10 This patch implements Rational Range Interpretation. For some history and discussion, see the gawk manual: https://www.gnu.org/software/gawk/manual/html_node/Ranges-and-Locales.html. 2017-11-30 Arnold D. Robbins Implement Rational Range Interpretation. * posix/regcomp.c (build_range_exp): Pass in the syntax bits. Use it to check RE_NO_EMPTY_RANGES, and check wide char values. Remove use of wcscoll to determine range start and end. (parse_bracket_exp): Pass the syntax bits to build_range_exp. * posix/regexec.c (check_node_accept_bytes): Don't use wcscoll to check ranges, but rather wide character values. 
diff --git a/posix/regcomp.c b/posix/regcomp.c index e63c258..0005fe7 100644 --- a/posix/regcomp.c +++ b/posix/regcomp.c @@ -2654,11 +2654,12 @@ parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa, static reg_errcode_t # ifdef RE_ENABLE_I18N -build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc, - bracket_elem_t *start_elem, bracket_elem_t *end_elem) -# else /* not RE_ENABLE_I18N */ -build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem, +build_range_exp (reg_syntax_t syntax, bitset_t sbcset, re_charset_t *mbcset, + int *range_alloc, bracket_elem_t *start_elem, bracket_elem_t *end_elem) +# else /* not RE_ENABLE_I18N */ +build_range_exp (reg_syntax_t syntax, bitset_t sbcset, + bracket_elem_t *start_elem, bracket_elem_t *end_elem) # endif /* not RE_ENABLE_I18N */ { unsigned int start_ch, end_ch; @@ -2681,7 +2682,6 @@ build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem, wchar_t wc; wint_t start_wc; wint_t end_wc; - wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'}; start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0] @@ -2695,9 +2695,7 @@ build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem, ? __btowc (end_ch) : end_elem->opr.wch); if (start_wc == WEOF || end_wc == WEOF) return REG_ECOLLATE; - cmp_buf[0] = start_wc; - cmp_buf[4] = end_wc; - if (__wcscoll (cmp_buf, cmp_buf + 4) > 0) + else if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_wc > end_wc, 0)) return REG_ERANGE; /* Got valid collation sequence values, add them as a new entry. @@ -2745,9 +2743,7 @@ build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem, /* Build the table for single byte characters. 
*/ for (wc = 0; wc < SBC_MAX; ++wc) { - cmp_buf[2] = wc; - if (__wcscoll (cmp_buf, cmp_buf + 2) <= 0 - && __wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0) + if (start_wc <= wc && wc <= end_wc) bitset_set (sbcset, wc); } } @@ -3190,15 +3186,15 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, token_len = peek_token_bracket (token, regexp, syntax); #ifdef _LIBC - *err = build_range_exp (sbcset, mbcset, &range_alloc, + *err = build_range_exp (syntax, sbcset, mbcset, &range_alloc, &start_elem, &end_elem); #else # ifdef RE_ENABLE_I18N - *err = build_range_exp (sbcset, + *err = build_range_exp (syntax, sbcset, dfa->mb_cur_max > 1 ? mbcset : NULL, &range_alloc, &start_elem, &end_elem); # else - *err = build_range_exp (sbcset, &start_elem, &end_elem); + *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem); # endif #endif /* RE_ENABLE_I18N */ if (BE (*err != REG_NOERROR, 0)) diff --git a/posix/regexec.c b/posix/regexec.c index dcdd33b..a9ed91f 100644 --- a/posix/regexec.c +++ b/posix/regexec.c @@ -3885,18 +3885,10 @@ check_node_accept_bytes (const re_dfa_t *dfa, int node_idx, # endif /* _LIBC */ { /* match with range expression? */ -#if __GNUC__ >= 2 - wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'}; -#else - wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'}; - cmp_buf[2] = wc; -#endif for (i = 0; i < cset->nranges; ++i) { - cmp_buf[0] = cset->range_starts[i]; - cmp_buf[4] = cset->range_ends[i]; - if (__wcscoll (cmp_buf, cmp_buf + 2) <= 0 - && __wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0) + if (cset->range_starts[i] <= wc + && wc <= cset->range_ends[i]) { match_len = char_len; goto check_node_accept_bytes_match;