[v4,1/3] Add support for locales with zero collation rules.
Checks
Context |
Check |
Description |
dj/TryBot-apply_patch |
success
|
Patch applied to master at the time it was sent
|
Commit Message
While there is code to handle 'nrules == 0' in various locations
within posix/fnmatch_loop.c, posix/regcomp.c and posix/regexec.c,
these conditionals do not work. The only collation with zero
rules in effect today is the builtin C/POSIX locale which is
built by hand, and despite have zero rules it has a collseqmb
and collseqwc tables stored in the locale data. These tables are
simple identity tables which are not actually required and could
be removed at a later date after this change. The changes are in
order to prepare for C.UTF-8 which has zero rules and has no
collation sequence tables (multibyte or widechar).
No regressions on x86_64 or i686.
---
posix/fnmatch_loop.c | 95 +++++++++++++++++++++++++++-----------------
posix/regcomp.c | 12 +++---
posix/regexec.c | 85 ++++++++++++++++++---------------------
3 files changed, 104 insertions(+), 88 deletions(-)
Comments
* Carlos O'Donell via Libc-alpha:
> @@ -600,42 +597,51 @@ FCT (const CHAR *pattern, const CHAR *string, const CHAR *string_end,
> if (c == L_('-') && *p != L_(']'))
> {
> #if _LIBC
> - /* We have to find the collation sequence
> - value for C. Collation sequence is nothing
> - we can regularly access. The sequence
> - value is defined by the order in which the
> - definitions of the collation values for the
> - various characters appear in the source
> - file. A strange concept, nowhere
> - documented. */
> - uint32_t fcollseq;
> - uint32_t lcollseq;
> + /* We must find the collation sequence values for
> + the low part of the range, the high part of the
> + range and the searched value FN. We do this by
> + using the POSIX concept of Collation Element
> + Ordering, which is the defined order of elements
> + in the source locale. FCOLLSEQ is the searched
> + element in the range, while LCOLLSEQ is the low
> + element in the range. If we have no collation
> + rules (nrules == 0) then we must fall back to a
> + basic code point value for the collation
> + sequence value (which is correct for ASCII and
> + UTF-8). We must never use collseq if nrules ==
> + 0 since none of the tables we need will be
> + present in the compiled binary locale. We start
> + with fcollseq and lcollseq at unknown collation
> + sequences. We only compute hcollseq, the high
> + part of the range if required. */
> + uint32_t fcollseq = ~((uint32_t) 0);
> + uint32_t lcollseq = ~((uint32_t) 0);
> UCHAR cend = *p++;
Looks like ~((uint32_t) 0) needs to be added as a macro/constant to
__collseq_table_lookup.
>
> + if (nrules != 0)
> + {
> # if WIDE_CHAR_VERSION
> + /* Search the collation data for the character. */
> + fcollseq = __collseq_table_lookup (collseq, fn);
> + if (fcollseq == ~((uint32_t) 0))
> + /* We don't know anything about the character
> + we are supposed to match. This means we are
> + failing. */
> + goto range_not_matched;
> +
> + if (is_seqval)
> + lcollseq = cold;
> + else
> + lcollseq = __collseq_table_lookup (collseq, cold);
> # else
> + fcollseq = collseq[fn];
> + lcollseq = is_seqval ? cold : collseq[(UCHAR) cold];
> # endif
> + }
>
> is_seqval = false;
> if (cend == L_('[') && *p == L_('.'))
> {
> const CHAR *startp = p;
> size_t c1 = 0;
>
> @@ -752,14 +758,20 @@ FCT (const CHAR *pattern, const CHAR *string, const CHAR *string_end,
> cend = FOLD (cend);
> }
>
> + /* If we have rules, and the low sequence is lower than
> + the value of the searched sequence then we must
> + lookup the high collation sequence value and
> + determine if the fcollseq falls within the range.
> + If hcollseq is unknown then we could still match
> + fcollseq on the low end of the range. If lcollseq
> + if unknown (0xffffffff) we will still fail to
> + match, but in the future we might consider matching
> + the high end of the range on an exact match. */
> + if (nrules != 0 && (
> # if WIDE_CHAR_VERSION
> lcollseq == 0xffffffff ||
This should use the same constant as the initialization. The #if is now
unnecessary and can be removed.
> # endif
> + lcollseq <= fcollseq))
> {
> /* We have to look at the upper bound. */
> uint32_t hcollseq;
> @@ -789,6 +801,17 @@ FCT (const CHAR *pattern, const CHAR *string, const CHAR *string_end,
> if (lcollseq <= hcollseq && fcollseq <= hcollseq)
> goto matched;
> }
> +
> + /* No rules, but we have a range. */
> + if (nrules == 0)
> + {
> + if (cend == L_('\0'))
> + return FNM_NOMATCH;
> +
> + /* Compare that fn is within the range. */
> + if ((UCHAR) cold <= fn && fn <= cend)
> + goto matched;
> +
This part looks okay to me.
> diff --git a/posix/regcomp.c b/posix/regcomp.c
> index d93698ae78..f55d20cbfd 100644
> --- a/posix/regcomp.c
> +++ b/posix/regcomp.c
> @@ -2889,7 +2889,7 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
> if (MB_CUR_MAX == 1)
> */
> if (nrules == 0)
> - return collseqmb[br_elem->opr.ch];
> + return br_elem->opr.ch;
> else
> {
> wint_t wc = __btowc (br_elem->opr.ch);
> @@ -2900,6 +2900,8 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
> {
> if (nrules != 0)
> return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
> + else
> + return br_elem->opr.wch;
> }
> else if (br_elem->type == COLL_SYM)
> {
> @@ -2935,7 +2937,7 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
> }
> }
> else if (sym_name_len == 1)
> - return collseqmb[br_elem->opr.name[0]];
> + return br_elem->opr.name[0];
> }
> return UINT_MAX;
> }
> @@ -3017,7 +3019,7 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
> if (MB_CUR_MAX == 1)
> */
> if (nrules == 0)
> - ch_collseq = collseqmb[ch];
> + ch_collseq = ch;
> else
> ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
> if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
> @@ -3103,11 +3105,11 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
> int token_len;
> bool first_round = true;
> #ifdef _LIBC
> - collseqmb = (const unsigned char *)
> - _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
> nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
> if (nrules)
> {
> + collseqmb = (const unsigned char *)
> + _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
> /*
> if (MB_CUR_MAX > 1)
> */
These changes look good.
> diff --git a/posix/regexec.c b/posix/regexec.c
> index f7b4f9cfc3..6cc23831aa 100644
> --- a/posix/regexec.c
> +++ b/posix/regexec.c
> @@ -3858,62 +3858,53 @@ check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
> }
>
> # ifdef _LIBC
> +#include <assert.h>
This really should got to the start of the file.
> +
> static unsigned int
> find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
> {
> + int32_t idx;
> + const unsigned char *extra = (const unsigned char *)
> _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
> + int32_t extrasize = (const unsigned char *)
> _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
> + uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
> +
> + /* Only called from within 'if (nrules != 0)'. */
Trailing whitespace on the last line. Most of this is just
reindentation, so okay.
> + assert (nrules != 0);
Right, the single caller is under nrules != 0.
strcoll and strxfrm are already correct.
Thanks,
Florian
On 7/29/21 2:44 AM, Florian Weimer via Libc-alpha wrote:
> Looks like ~((uint32_t) 0) needs to be added as a macro/constant to
> __collseq_table_lookup.
A small point: UINT32_MAX would be simpler and cleaner definiens for
that macro, as UINT32_MAX can be used in #if and is more portable to
potential future platforms where int is wider than 32 bits.
@@ -51,6 +51,7 @@ FCT (const CHAR *pattern, const CHAR *string, const CHAR *string_end,
_NL_CURRENT(LC_COLLATE, _NL_COLLATE_COLLSEQMB);
# endif
#endif
+ uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
while ((c = *p++) != L_('\0'))
{
@@ -324,8 +325,6 @@ FCT (const CHAR *pattern, const CHAR *string, const CHAR *string_end,
diagnose a "used initialized" in a dead branch in the
findidx function. */
UCHAR str;
- uint32_t nrules =
- _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
const CHAR *startp = p;
c = *++p;
@@ -437,8 +436,6 @@ FCT (const CHAR *pattern, const CHAR *string, const CHAR *string_end,
if (c == L_('[') && *p == L_('.'))
{
- uint32_t nrules =
- _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
const CHAR *startp = p;
size_t c1 = 0;
@@ -600,42 +597,51 @@ FCT (const CHAR *pattern, const CHAR *string, const CHAR *string_end,
if (c == L_('-') && *p != L_(']'))
{
#if _LIBC
- /* We have to find the collation sequence
- value for C. Collation sequence is nothing
- we can regularly access. The sequence
- value is defined by the order in which the
- definitions of the collation values for the
- various characters appear in the source
- file. A strange concept, nowhere
- documented. */
- uint32_t fcollseq;
- uint32_t lcollseq;
+ /* We must find the collation sequence values for
+ the low part of the range, the high part of the
+ range and the searched value FN. We do this by
+ using the POSIX concept of Collation Element
+ Ordering, which is the defined order of elements
+ in the source locale. FCOLLSEQ is the searched
+ element in the range, while LCOLLSEQ is the low
+ element in the range. If we have no collation
+ rules (nrules == 0) then we must fall back to a
+ basic code point value for the collation
+ sequence value (which is correct for ASCII and
+ UTF-8). We must never use collseq if nrules ==
+ 0 since none of the tables we need will be
+ present in the compiled binary locale. We start
+ with fcollseq and lcollseq at unknown collation
+ sequences. We only compute hcollseq, the high
+ part of the range if required. */
+ uint32_t fcollseq = ~((uint32_t) 0);
+ uint32_t lcollseq = ~((uint32_t) 0);
UCHAR cend = *p++;
+ if (nrules != 0)
+ {
# if WIDE_CHAR_VERSION
- /* Search in the 'names' array for the characters. */
- fcollseq = __collseq_table_lookup (collseq, fn);
- if (fcollseq == ~((uint32_t) 0))
- /* XXX We don't know anything about the character
- we are supposed to match. This means we are
- failing. */
- goto range_not_matched;
-
- if (is_seqval)
- lcollseq = cold;
- else
- lcollseq = __collseq_table_lookup (collseq, cold);
+ /* Search the collation data for the character. */
+ fcollseq = __collseq_table_lookup (collseq, fn);
+ if (fcollseq == ~((uint32_t) 0))
+ /* We don't know anything about the character
+ we are supposed to match. This means we are
+ failing. */
+ goto range_not_matched;
+
+ if (is_seqval)
+ lcollseq = cold;
+ else
+ lcollseq = __collseq_table_lookup (collseq, cold);
# else
- fcollseq = collseq[fn];
- lcollseq = is_seqval ? cold : collseq[(UCHAR) cold];
+ fcollseq = collseq[fn];
+ lcollseq = is_seqval ? cold : collseq[(UCHAR) cold];
# endif
+ }
is_seqval = false;
if (cend == L_('[') && *p == L_('.'))
{
- uint32_t nrules =
- _NL_CURRENT_WORD (LC_COLLATE,
- _NL_COLLATE_NRULES);
const CHAR *startp = p;
size_t c1 = 0;
@@ -752,14 +758,20 @@ FCT (const CHAR *pattern, const CHAR *string, const CHAR *string_end,
cend = FOLD (cend);
}
- /* XXX It is not entirely clear to me how to handle
- characters which are not mentioned in the
- collation specification. */
- if (
+ /* If we have rules, and the low sequence is lower than
+ the value of the searched sequence then we must
+ lookup the high collation sequence value and
+ determine if the fcollseq falls within the range.
+ If hcollseq is unknown then we could still match
+ fcollseq on the low end of the range. If lcollseq
+ if unknown (0xffffffff) we will still fail to
+ match, but in the future we might consider matching
+ the high end of the range on an exact match. */
+ if (nrules != 0 && (
# if WIDE_CHAR_VERSION
lcollseq == 0xffffffff ||
# endif
- lcollseq <= fcollseq)
+ lcollseq <= fcollseq))
{
/* We have to look at the upper bound. */
uint32_t hcollseq;
@@ -789,6 +801,17 @@ FCT (const CHAR *pattern, const CHAR *string, const CHAR *string_end,
if (lcollseq <= hcollseq && fcollseq <= hcollseq)
goto matched;
}
+
+ /* No rules, but we have a range. */
+ if (nrules == 0)
+ {
+ if (cend == L_('\0'))
+ return FNM_NOMATCH;
+
+ /* Compare that fn is within the range. */
+ if ((UCHAR) cold <= fn && fn <= cend)
+ goto matched;
+ }
# if WIDE_CHAR_VERSION
range_not_matched:
# endif
@@ -2889,7 +2889,7 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
if (MB_CUR_MAX == 1)
*/
if (nrules == 0)
- return collseqmb[br_elem->opr.ch];
+ return br_elem->opr.ch;
else
{
wint_t wc = __btowc (br_elem->opr.ch);
@@ -2900,6 +2900,8 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
{
if (nrules != 0)
return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
+ else
+ return br_elem->opr.wch;
}
else if (br_elem->type == COLL_SYM)
{
@@ -2935,7 +2937,7 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
}
}
else if (sym_name_len == 1)
- return collseqmb[br_elem->opr.name[0]];
+ return br_elem->opr.name[0];
}
return UINT_MAX;
}
@@ -3017,7 +3019,7 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
if (MB_CUR_MAX == 1)
*/
if (nrules == 0)
- ch_collseq = collseqmb[ch];
+ ch_collseq = ch;
else
ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
@@ -3103,11 +3105,11 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
int token_len;
bool first_round = true;
#ifdef _LIBC
- collseqmb = (const unsigned char *)
- _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
if (nrules)
{
+ collseqmb = (const unsigned char *)
+ _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
/*
if (MB_CUR_MAX > 1)
*/
@@ -3858,62 +3858,53 @@ check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
}
# ifdef _LIBC
+#include <assert.h>
+
static unsigned int
find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
{
- uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
- if (nrules == 0)
- {
- if (mbs_len == 1)
- {
- /* No valid character. Match it as a single byte character. */
- const unsigned char *collseq = (const unsigned char *)
- _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
- return collseq[mbs[0]];
- }
- return UINT_MAX;
- }
- else
- {
- int32_t idx;
- const unsigned char *extra = (const unsigned char *)
+ int32_t idx;
+ const unsigned char *extra = (const unsigned char *)
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
- int32_t extrasize = (const unsigned char *)
+ int32_t extrasize = (const unsigned char *)
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
+ uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
+
+ /* Only called from within 'if (nrules != 0)'. */
+ assert (nrules != 0);
- for (idx = 0; idx < extrasize;)
+ for (idx = 0; idx < extrasize;)
+ {
+ int mbs_cnt;
+ bool found = false;
+ int32_t elem_mbs_len;
+ /* Skip the name of collating element name. */
+ idx = idx + extra[idx] + 1;
+ elem_mbs_len = extra[idx++];
+ if (mbs_len == elem_mbs_len)
{
- int mbs_cnt;
- bool found = false;
- int32_t elem_mbs_len;
- /* Skip the name of collating element name. */
- idx = idx + extra[idx] + 1;
- elem_mbs_len = extra[idx++];
- if (mbs_len == elem_mbs_len)
- {
- for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
- if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
- break;
- if (mbs_cnt == elem_mbs_len)
- /* Found the entry. */
- found = true;
- }
- /* Skip the byte sequence of the collating element. */
- idx += elem_mbs_len;
- /* Adjust for the alignment. */
- idx = (idx + 3) & ~3;
- /* Skip the collation sequence value. */
- idx += sizeof (uint32_t);
- /* Skip the wide char sequence of the collating element. */
- idx = idx + sizeof (uint32_t) * (*(int32_t *) (extra + idx) + 1);
- /* If we found the entry, return the sequence value. */
- if (found)
- return *(uint32_t *) (extra + idx);
- /* Skip the collation sequence value. */
- idx += sizeof (uint32_t);
+ for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
+ if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
+ break;
+ if (mbs_cnt == elem_mbs_len)
+ /* Found the entry. */
+ found = true;
}
- return UINT_MAX;
+ /* Skip the byte sequence of the collating element. */
+ idx += elem_mbs_len;
+ /* Adjust for the alignment. */
+ idx = (idx + 3) & ~3;
+ /* Skip the collation sequence value. */
+ idx += sizeof (uint32_t);
+ /* Skip the wide char sequence of the collating element. */
+ idx = idx + sizeof (uint32_t) * (*(int32_t *) (extra + idx) + 1);
+ /* If we found the entry, return the sequence value. */
+ if (found)
+ return *(uint32_t *) (extra + idx);
+ /* Skip the collation sequence value. */
+ idx += sizeof (uint32_t);
}
+ return UINT_MAX;
}
# endif /* _LIBC */
#endif /* RE_ENABLE_I18N */