libstdc++: Allow std::ctype_base masks to contain multiple set bits
Commit Message
The C++ standard does not require that each std::ctype_base mask value
have only one bit set; in fact the expository values presented in the
text explicitly include two multi-bit values:
// numeric values are for exposition only.
static const mask space = 1 << 0;
static const mask print = 1 << 1;
static const mask cntrl = 1 << 2;
static const mask upper = 1 << 3;
static const mask lower = 1 << 4;
static const mask alpha = 1 << 5;
static const mask digit = 1 << 6;
static const mask punct = 1 << 7;
static const mask xdigit = 1 << 8;
static const mask blank = 1 << 9;
static const mask alnum = alpha | digit;
static const mask graph = alnum | punct;
Adapt the code to handle this possibility in three places.
1. In _M_initialize_ctype use explicit assignments from the
std::ctype_base mask values into _M_bit array instead of
assuming that iterating over values from 1<<0 to 1<<15 will
match all of the possible std::ctype_base mask values.
2. In do_is(mask, char_type) change the condition testing __m against
_M_bit[__bitcur] to require *all* bits in _M_bit[__bitcur] to be
present in __m before calling iswctype with the matching _M_wmask
value.
For a C library where
alpha = (upper|lower)
when
__m is upper
the previous version would have done
iswctype(*__lo, wctype("alpha"))
iswctype(*__lo, wctype("lower"))
the new version will do
iswctype(*__lo, wctype("upper"))
3. In do_is(const wchar_t *, const wchar_t *, mask *), replace the
code setting _M_bit values when iswctype returns true to code
which clears _M_bit values when iswctype returns false.
For a C library where
alpha = (upper|lower), when
when
iswctype(wctype("alpha")) returns true
iswctype(wctype("upper")) returns false
iswctype(wctype("lower")) returns true
the old version would have done
__m = 0
__m |= alpha
__m |= lower
return value = (upper|lower)
the new version will do
__m = upper|lower
__m &= ~upper
return value = (lower)
These changes should not introduce significant additional complexity
into these computations nor affect results with C libraries which use
independent values for the std::ctype_base masks. These changes do
reduce the range of mask values used in iterations from 16 to 12, so
there may actually be a tiny performance improvement.
Signed-off-by: Keith Packard <keithp@keithp.com>
---
.../config/locale/generic/ctype_members.cc | 81 +++++++++++++++----
1 file changed, 64 insertions(+), 17 deletions(-)
Comments
On Thu, 8 Jan 2026 at 18:10, Keith Packard <keithp@keithp.com> wrote:
>
> The C++ standard does not require that each std::ctype_base mask value
> have only one bit set; in fact the expository values presented in the
> text explicitly include two multi-bit values:
>
> // numeric values are for exposition only.
> static const mask space = 1 << 0;
> static const mask print = 1 << 1;
> static const mask cntrl = 1 << 2;
> static const mask upper = 1 << 3;
> static const mask lower = 1 << 4;
> static const mask alpha = 1 << 5;
> static const mask digit = 1 << 6;
> static const mask punct = 1 << 7;
> static const mask xdigit = 1 << 8;
> static const mask blank = 1 << 9;
> static const mask alnum = alpha | digit;
> static const mask graph = alnum | punct;
>
> Adapt the code to handle this possibility in three places.
This will fix at least part of https://gcc.gnu.org/PR51772 (maybe all
of it, I need to review it properly and refresh my memory of that
bug).
>
> 1. In _M_initialize_ctype use explicit assignments from the
> std::ctype_base mask values into _M_bit array instead of
> assuming that iterating over values from 1<<0 to 1<<15 will
> match all of the possible std::ctype_base mask values.
>
> 2. In do_is(mask, char_type) change the condition testing __m against
> _M_bit[__bitcur] to require *all* bits in _M_bit[__bitcur] to be
> present in __m before calling iswctype with the matching _M_wmask
> value.
>
> For a C library where
> alpha = (upper|lower)
>
> when
> __m is upper
>
> the previous version would have done
> iswctype(*__lo, wctype("alpha"))
> iswctype(*__lo, wctype("lower"))
>
> the new version will do
> iswctype(*__lo, wctype("upper"))
>
> 3. In do_is(const wchar_t *, const wchar_t *, mask *), replace the
> code setting _M_bit values when iswctype returns true to code
> which clears _M_bit values when iswctype returns false.
>
> For a C library where
> alpha = (upper|lower), when
>
> when
> iswctype(wctype("alpha")) returns true
> iswctype(wctype("upper")) returns false
> iswctype(wctype("lower")) returns true
>
> the old version would have done
> __m = 0
> __m |= alpha
> __m |= lower
>
> return value = (upper|lower)
>
> the new version will do
> __m = upper|lower
> __m &= ~upper
>
> return value = (lower)
>
> These changes should not introduce significant additional complexity
> into these computations nor affect results with C libraries which use
> independent values for the std::ctype_base masks. These changes do
> reduce the range of mask values used in iterations from 16 to 12, so
> there may actually be a tiny performance improvement.
>
> Signed-off-by: Keith Packard <keithp@keithp.com>
> ---
> .../config/locale/generic/ctype_members.cc | 81 +++++++++++++++----
> 1 file changed, 64 insertions(+), 17 deletions(-)
>
> diff --git a/libstdc++-v3/config/locale/generic/ctype_members.cc b/libstdc++-v3/config/locale/generic/ctype_members.cc
> index 2d61347feb1..21c25a68915 100644
> --- a/libstdc++-v3/config/locale/generic/ctype_members.cc
> +++ b/libstdc++-v3/config/locale/generic/ctype_members.cc
> @@ -53,6 +53,15 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
> { }
>
> #ifdef _GLIBCXX_USE_WCHAR_T
> +
> + // Values which index into _M_bit and _M_wmask for each classifier
> + enum {
> + _id_alnum, _id_alpha, _id_blank, _id_cntrl,
> + _id_digit, _id_graph, _id_lower, _id_print,
> + _id_punct, _id_space, _id_upper, _id_xdigit,
> + _id_num,
> + };
> +
> ctype<wchar_t>::__wmask_type
> ctype<wchar_t>::_M_convert_to_wmask(const mask __m) const throw()
> {
> @@ -138,11 +147,20 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
> do_is(mask __m, char_type __c) const
> {
> bool __ret = false;
> - // Generically, 15 (instead of 11) since we don't know the numerical
> - // encoding of the various categories in /usr/include/ctype.h.
> - const size_t __bitmasksize = 15;
> - for (size_t __bitcur = 0; __bitcur <= __bitmasksize; ++__bitcur)
> - if (__m & _M_bit[__bitcur]
> +
> + // Each _M_bit may have more than one bit set. Only check tests for which
> + // all of the matching _M_bit values are set. This skips tests with some
> + // bits which are not set. For example:
> + //
> + // alpha = lower|upper
> + // __c = 'a'
> + // __m = upper
> + //
> + // We evaluate iswupper (which returns false), but we skip iswalpha (which
> + // would return true) because __m is missing the lower bit, so the return
> + // value is false.
> + for (size_t __bitcur = 0; __bitcur <= _id_num; ++__bitcur)
> + if ((__m & _M_bit[__bitcur]) == _M_bit[__bitcur]
> && iswctype(__c, _M_wmask[__bitcur]))
> {
> __ret = true;
> @@ -157,13 +175,29 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
> {
> for (;__lo < __hi; ++__vec, ++__lo)
> {
> - // Generically, 15 (instead of 11) since we don't know the numerical
> - // encoding of the various categories in /usr/include/ctype.h.
> - const size_t __bitmasksize = 15;
> - mask __m = 0;
> - for (size_t __bitcur = 0; __bitcur <= __bitmasksize; ++__bitcur)
> - if (iswctype(*__lo, _M_wmask[__bitcur]))
> - __m |= _M_bit[__bitcur];
> +
> + // Start with all bits set. Iterate over all wctypes, clearing bits for
> + // non-matching values. For example:
> + //
> + // alpha == lower|upper
> + // *__lo == 'a'
> + //
> + // __m = <all bits>
> + // iswalpha and iswlower match
> + // __m left alone
> + // iswupper does not match
> + // __m &= ~upper
> + //
> + // The return value will satisfy the following conditions:
> + //
> + // (__m & alpha) != 0
> + // (__m & lower) != 0
> + // (__m & upper) == 0
> + mask __m = (alnum|alpha|blank|cntrl|digit|graph|
> + lower|print|punct|space|upper|xdigit);
> + for (size_t __bitcur = 0; __bitcur < _id_num; ++__bitcur)
> + if (!iswctype(*__lo, _M_wmask[__bitcur]))
> + __m &= ~_M_bit[__bitcur];
> *__vec = __m;
> }
> return __hi;
> @@ -264,11 +298,24 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
> __i < sizeof(_M_widen) / sizeof(wint_t); ++__i)
> _M_widen[__i] = btowc(__i);
>
> - for (size_t __i = 0; __i <= 15; ++__i)
> - {
> - _M_bit[__i] = static_cast<mask>(1 << __i);
> - _M_wmask[__i] = _M_convert_to_wmask(_M_bit[__i]);
> - }
> + // Set these directly from the OS provided values.
> + // We can't simply iterate over bit values because
> + // they may have more than one bit set
> + _M_bit[_id_alnum] = alnum;
> + _M_bit[_id_alpha] = alpha;
> + _M_bit[_id_blank] = blank;
> + _M_bit[_id_cntrl] = cntrl;
> + _M_bit[_id_digit] = digit;
> + _M_bit[_id_graph] = graph;
> + _M_bit[_id_lower] = lower;
> + _M_bit[_id_print] = print;
> + _M_bit[_id_punct] = punct;
> + _M_bit[_id_space] = space;
> + _M_bit[_id_upper] = upper;
> + _M_bit[_id_xdigit] = xdigit;
> +
> + for (size_t __i = 0; __i <= _id_num; __i++)
> + _M_wmask[__i] = _M_convert_to_wmask(_M_bit[__i]);
> }
> #endif // _GLIBCXX_USE_WCHAR_T
>
> --
> 2.51.0
>
> From: Jonathan Wakely <jwakely@redhat.com>
> Date: Fri, 09 Jan 2026 11:32:58 +0000
>
> This will fix at least part of https://gcc.gnu.org/PR51772 (maybe all
> of it, I need to review it properly and refresh my memory of that
> bug).
It fixes the xdigit issue, resolving
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104587 and it removes the
assumption that 'mask' can hold at least 16 bits, so the result should
resolve https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115524
There doesn't seem to be any mention of problems with is(char *, char *,
mask *) though; that causes problems even for cases where a compound
mask value is composed of published mask values, e.g. alnum = alpha |
digit. Compute a mask value for 'a':
iswalnum('a') -> true
iswalpha('a') -> true
iswdigit('a') -> false
mask = alnum | alpha = digit | alpha
This incorrectly classifies 'a' as a digit.
I've built a test case for this function which covers the C locale
(verifying only ASCII). Would it be reasonable to include that? Should
I expand it to check all of the ctype functions?
@@ -53,6 +53,15 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
{ }
#ifdef _GLIBCXX_USE_WCHAR_T
+
+ // Values which index into _M_bit and _M_wmask for each classifier
+ enum {
+ _id_alnum, _id_alpha, _id_blank, _id_cntrl,
+ _id_digit, _id_graph, _id_lower, _id_print,
+ _id_punct, _id_space, _id_upper, _id_xdigit,
+ _id_num,
+ };
+
ctype<wchar_t>::__wmask_type
ctype<wchar_t>::_M_convert_to_wmask(const mask __m) const throw()
{
@@ -138,11 +147,20 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
do_is(mask __m, char_type __c) const
{
bool __ret = false;
- // Generically, 15 (instead of 11) since we don't know the numerical
- // encoding of the various categories in /usr/include/ctype.h.
- const size_t __bitmasksize = 15;
- for (size_t __bitcur = 0; __bitcur <= __bitmasksize; ++__bitcur)
- if (__m & _M_bit[__bitcur]
+
+ // Each _M_bit may have more than one bit set. Only check tests for which
+ // all of the matching _M_bit values are set. This skips tests with some
+ // bits which are not set. For example:
+ //
+ // alpha = lower|upper
+ // __c = 'a'
+ // __m = upper
+ //
+ // We evaluate iswupper (which returns false), but we skip iswalpha (which
+ // would return true) because __m is missing the lower bit, so the return
+ // value is false.
+ for (size_t __bitcur = 0; __bitcur <= _id_num; ++__bitcur)
+ if ((__m & _M_bit[__bitcur]) == _M_bit[__bitcur]
&& iswctype(__c, _M_wmask[__bitcur]))
{
__ret = true;
@@ -157,13 +175,29 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
{
for (;__lo < __hi; ++__vec, ++__lo)
{
- // Generically, 15 (instead of 11) since we don't know the numerical
- // encoding of the various categories in /usr/include/ctype.h.
- const size_t __bitmasksize = 15;
- mask __m = 0;
- for (size_t __bitcur = 0; __bitcur <= __bitmasksize; ++__bitcur)
- if (iswctype(*__lo, _M_wmask[__bitcur]))
- __m |= _M_bit[__bitcur];
+
+ // Start with all bits set. Iterate over all wctypes, clearing bits for
+ // non-matching values. For example:
+ //
+ // alpha == lower|upper
+ // *__lo == 'a'
+ //
+ // __m = <all bits>
+ // iswalpha and iswlower match
+ // __m left alone
+ // iswupper does not match
+ // __m &= ~upper
+ //
+ // The return value will satisfy the following conditions:
+ //
+ // (__m & alpha) != 0
+ // (__m & lower) != 0
+ // (__m & upper) == 0
+ mask __m = (alnum|alpha|blank|cntrl|digit|graph|
+ lower|print|punct|space|upper|xdigit);
+ for (size_t __bitcur = 0; __bitcur < _id_num; ++__bitcur)
+ if (!iswctype(*__lo, _M_wmask[__bitcur]))
+ __m &= ~_M_bit[__bitcur];
*__vec = __m;
}
return __hi;
@@ -264,11 +298,24 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
__i < sizeof(_M_widen) / sizeof(wint_t); ++__i)
_M_widen[__i] = btowc(__i);
- for (size_t __i = 0; __i <= 15; ++__i)
- {
- _M_bit[__i] = static_cast<mask>(1 << __i);
- _M_wmask[__i] = _M_convert_to_wmask(_M_bit[__i]);
- }
+ // Set these directly from the OS provided values.
+ // We can't simply iterate over bit values because
+ // they may have more than one bit set
+ _M_bit[_id_alnum] = alnum;
+ _M_bit[_id_alpha] = alpha;
+ _M_bit[_id_blank] = blank;
+ _M_bit[_id_cntrl] = cntrl;
+ _M_bit[_id_digit] = digit;
+ _M_bit[_id_graph] = graph;
+ _M_bit[_id_lower] = lower;
+ _M_bit[_id_print] = print;
+ _M_bit[_id_punct] = punct;
+ _M_bit[_id_space] = space;
+ _M_bit[_id_upper] = upper;
+ _M_bit[_id_xdigit] = xdigit;
+
+ for (size_t __i = 0; __i <= _id_num; __i++)
+ _M_wmask[__i] = _M_convert_to_wmask(_M_bit[__i]);
}
#endif // _GLIBCXX_USE_WCHAR_T