[17/17] Regex: Implement Rational Range Interpretation.

Message ID 201712080919.vB89JO1Q005653@skeeve.com
State New, archived
Headers

Commit Message

Arnold Robbins Dec. 8, 2017, 9:19 a.m. UTC
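This patch implements Rational Range Interpretation: the endpoints of a
bracket range such as [a-z] are compared by wide character value rather
than by the locale's collation order.  For some history and discussion,
see the gawk manual:
https://www.gnu.org/software/gawk/manual/html_node/Ranges-and-Locales.html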

2017-11-30         Arnold D. Robbins     <arnold@skeeve.com>

	Implement Rational Range Interpretation.

	* posix/regcomp.c (build_range_exp): Pass in the syntax bits.
	Use them to check RE_NO_EMPTY_RANGES, and compare wide char values.
	Remove use of wcscoll to determine range start and end.
	(parse_bracket_exp): Pass the syntax bits to build_range_exp.
	* posix/regexec.c (check_node_accept_bytes): Don't use wcscoll
	to check ranges, but rather wide character values.
  

Patch

diff --git a/posix/regcomp.c b/posix/regcomp.c
index e63c258..0005fe7 100644
--- a/posix/regcomp.c
+++ b/posix/regcomp.c
@@ -2654,11 +2654,12 @@  parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
 
 static reg_errcode_t
 # ifdef RE_ENABLE_I18N
-build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
-		 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
-# else /* not RE_ENABLE_I18N */
-build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
+build_range_exp (reg_syntax_t syntax, bitset_t sbcset, re_charset_t *mbcset,
+		int *range_alloc, bracket_elem_t *start_elem,
 		 bracket_elem_t *end_elem)
+# else /* not RE_ENABLE_I18N */
+build_range_exp (reg_syntax_t syntax, bitset_t sbcset,
+		bracket_elem_t *start_elem, bracket_elem_t *end_elem)
 # endif /* not RE_ENABLE_I18N */
 {
   unsigned int start_ch, end_ch;
@@ -2681,7 +2682,6 @@  build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
     wchar_t wc;
     wint_t start_wc;
     wint_t end_wc;
-    wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
 
     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
 		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
@@ -2695,9 +2695,7 @@  build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
 	      ? __btowc (end_ch) : end_elem->opr.wch);
     if (start_wc == WEOF || end_wc == WEOF)
       return REG_ECOLLATE;
-    cmp_buf[0] = start_wc;
-    cmp_buf[4] = end_wc;
-    if (__wcscoll (cmp_buf, cmp_buf + 4) > 0)
+    else if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_wc > end_wc, 0))
       return REG_ERANGE;
 
     /* Got valid collation sequence values, add them as a new entry.
@@ -2745,9 +2743,7 @@  build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
     /* Build the table for single byte characters.  */
     for (wc = 0; wc < SBC_MAX; ++wc)
       {
-	cmp_buf[2] = wc;
-	if (__wcscoll (cmp_buf, cmp_buf + 2) <= 0
-	    && __wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
+         if (start_wc <= wc && wc <= end_wc)
 	  bitset_set (sbcset, wc);
       }
   }
@@ -3190,15 +3186,15 @@  parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
 	  token_len = peek_token_bracket (token, regexp, syntax);
 
 #ifdef _LIBC
-	  *err = build_range_exp (sbcset, mbcset, &range_alloc,
+	  *err = build_range_exp (syntax, sbcset, mbcset, &range_alloc,
 				  &start_elem, &end_elem);
 #else
 # ifdef RE_ENABLE_I18N
-	  *err = build_range_exp (sbcset,
+	  *err = build_range_exp (syntax, sbcset,
 				  dfa->mb_cur_max > 1 ? mbcset : NULL,
 				  &range_alloc, &start_elem, &end_elem);
 # else
-	  *err = build_range_exp (sbcset, &start_elem, &end_elem);
+	  *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
 # endif
 #endif /* RE_ENABLE_I18N */
 	  if (BE (*err != REG_NOERROR, 0))
diff --git a/posix/regexec.c b/posix/regexec.c
index dcdd33b..a9ed91f 100644
--- a/posix/regexec.c
+++ b/posix/regexec.c
@@ -3885,18 +3885,10 @@  check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
 # endif /* _LIBC */
 	{
 	  /* match with range expression?  */
-#if __GNUC__ >= 2
-	  wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
-#else
-	  wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
-	  cmp_buf[2] = wc;
-#endif
 	  for (i = 0; i < cset->nranges; ++i)
 	    {
-	      cmp_buf[0] = cset->range_starts[i];
-	      cmp_buf[4] = cset->range_ends[i];
-	      if (__wcscoll (cmp_buf, cmp_buf + 2) <= 0
-		  && __wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
+              if (cset->range_starts[i] <= wc
+                  && wc <= cset->range_ends[i])
 		{
 		  match_len = char_len;
 		  goto check_node_accept_bytes_match;