libstdc++: Allow std::ctype_base masks to contain multiple set bits

Message ID 20260108180946.1298387-1-keithp@keithp.com
State New
Headers
Series libstdc++: Allow std::ctype_base masks to contain multiple set bits |

Commit Message

Keith Packard Jan. 8, 2026, 6:09 p.m. UTC
  The C++ standard does not require that each std::ctype_base mask value
have only one bit set; in fact the expository values presented in the
text explicitly include two multi-bit values:

	// numeric values are for exposition only.
	static const mask space = 1 << 0;
	static const mask print = 1 << 1;
	static const mask cntrl = 1 << 2;
	static const mask upper = 1 << 3;
	static const mask lower = 1 << 4;
	static const mask alpha = 1 << 5;
	static const mask digit = 1 << 6;
	static const mask punct = 1 << 7;
	static const mask xdigit = 1 << 8;
	static const mask blank = 1 << 9;
	static const mask alnum = alpha | digit;
	static const mask graph = alnum | punct;

Adapt the code to handle this possibility in three places.

 1. In _M_initialize_ctype use explicit assignments from the
    std::ctype_base mask values into _M_bit array instead of
    assuming that iterating over values from 1<<0 to 1<<15 will
    match all of the possible std::ctype_base mask values.

 2. In do_is(mask, char_type) change the condition testing __m against
    _M_bit[__bitcur] to require *all* bits in _M_bit[__bitcur] to be
    present in __m before calling iswctype with the matching _M_wmask
    value.

    For a C library where
	alpha = (upper|lower)

    when
	__m is upper

    the previous version would have done
	iswctype(*__lo, wctype("alpha"))
	iswctype(*__lo, wctype("lower"))

    the new version will do
	iswctype(*__lo, wctype("upper"))

 3. In do_is(const wchar_t *, const wchar_t *, mask *), replace the
    code setting _M_bit values when iswctype returns true to code
    which clears _M_bit values when iswctype returns false.

    For a C library where
	alpha = (upper|lower), when

    when
	iswctype(wctype("alpha")) returns true
	iswctype(wctype("upper")) returns false
	iswctype(wctype("lower")) returns true

    the old version would have done
	__m = 0
	__m |= alpha
	__m |= lower

	return value = (upper|lower)

    the new version will do
	__m = upper|lower
	__m &= ~upper

	return value = (lower)

These changes should not introduce significant additional complexity
into these computations nor affect results with C libraries which use
independent values for the std::ctype_base masks. These changes do
reduce the range of mask values used in iterations from 16 to 12, so
there may actually be a tiny performance improvement.

Signed-off-by: Keith Packard <keithp@keithp.com>
---
 .../config/locale/generic/ctype_members.cc    | 81 +++++++++++++++----
 1 file changed, 64 insertions(+), 17 deletions(-)
  

Comments

Jonathan Wakely Jan. 9, 2026, 11:32 a.m. UTC | #1
On Thu, 8 Jan 2026 at 18:10, Keith Packard <keithp@keithp.com> wrote:
>
> The C++ standard does not require that each std::ctype_base mask value
> have only one bit set; in fact the expository values presented in the
> text explicitly include two multi-bit values:
>
>         // numeric values are for exposition only.
>         static const mask space = 1 << 0;
>         static const mask print = 1 << 1;
>         static const mask cntrl = 1 << 2;
>         static const mask upper = 1 << 3;
>         static const mask lower = 1 << 4;
>         static const mask alpha = 1 << 5;
>         static const mask digit = 1 << 6;
>         static const mask punct = 1 << 7;
>         static const mask xdigit = 1 << 8;
>         static const mask blank = 1 << 9;
>         static const mask alnum = alpha | digit;
>         static const mask graph = alnum | punct;
>
> Adapt the code to handle this possibility in three places.

This will fix at least part of https://gcc.gnu.org/PR51772 (maybe all
of it, I need to review it properly and refresh my memory of that
bug).


>
>  1. In _M_initialize_ctype use explicit assignments from the
>     std::ctype_base mask values into _M_bit array instead of
>     assuming that iterating over values from 1<<0 to 1<<15 will
>     match all of the possible std::ctype_base mask values.
>
>  2. In do_is(mask, char_type) change the condition testing __m against
>     _M_bit[__bitcur] to require *all* bits in _M_bit[__bitcur] to be
>     present in __m before calling iswctype with the matching _M_wmask
>     value.
>
>     For a C library where
>         alpha = (upper|lower)
>
>     when
>         __m is upper
>
>     the previous version would have done
>         iswctype(*__lo, wctype("alpha"))
>         iswctype(*__lo, wctype("lower"))
>
>     the new version will do
>         iswctype(*__lo, wctype("upper"))
>
>  3. In do_is(const wchar_t *, const wchar_t *, mask *), replace the
>     code setting _M_bit values when iswctype returns true to code
>     which clears _M_bit values when iswctype returns false.
>
>     For a C library where
>         alpha = (upper|lower), when
>
>     when
>         iswctype(wctype("alpha")) returns true
>         iswctype(wctype("upper")) returns false
>         iswctype(wctype("lower")) returns true
>
>     the old version would have done
>         __m = 0
>         __m |= alpha
>         __m |= lower
>
>         return value = (upper|lower)
>
>     the new version will do
>         __m = upper|lower
>         __m &= ~upper
>
>         return value = (lower)
>
> These changes should not introduce significant additional complexity
> into these computations nor affect results with C libraries which use
> independent values for the std::ctype_base masks. These changes do
> reduce the range of mask values used in iterations from 16 to 12, so
> there may actually be a tiny performance improvement.
>
> Signed-off-by: Keith Packard <keithp@keithp.com>
> ---
>  .../config/locale/generic/ctype_members.cc    | 81 +++++++++++++++----
>  1 file changed, 64 insertions(+), 17 deletions(-)
>
> diff --git a/libstdc++-v3/config/locale/generic/ctype_members.cc b/libstdc++-v3/config/locale/generic/ctype_members.cc
> index 2d61347feb1..21c25a68915 100644
> --- a/libstdc++-v3/config/locale/generic/ctype_members.cc
> +++ b/libstdc++-v3/config/locale/generic/ctype_members.cc
> @@ -53,6 +53,15 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
>    { }
>
>  #ifdef _GLIBCXX_USE_WCHAR_T
> +
> +  // Values which index into _M_bit and _M_wmask for each classifier
> +  enum {
> +    _id_alnum, _id_alpha, _id_blank, _id_cntrl,
> +    _id_digit, _id_graph, _id_lower, _id_print,
> +    _id_punct, _id_space, _id_upper, _id_xdigit,
> +    _id_num,
> +  };
> +
>    ctype<wchar_t>::__wmask_type
>    ctype<wchar_t>::_M_convert_to_wmask(const mask __m) const throw()
>    {
> @@ -138,11 +147,20 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
>    do_is(mask __m, char_type __c) const
>    {
>      bool __ret = false;
> -    // Generically, 15 (instead of 11) since we don't know the numerical
> -    // encoding of the various categories in /usr/include/ctype.h.
> -    const size_t __bitmasksize = 15;
> -    for (size_t __bitcur = 0; __bitcur <= __bitmasksize; ++__bitcur)
> -      if (__m & _M_bit[__bitcur]
> +
> +    // Each _M_bit may have more than one bit set. Only check tests for which
> +    // all of the matching _M_bit values are set. This skips tests with some
> +    // bits which are not set. For example:
> +    //
> +    //  alpha = lower|upper
> +    //  __c = 'a'
> +    //  __m = upper
> +    //
> +    // We evaluate iswupper (which returns false), but we skip iswalpha (which
> +    // would return true) because __m is missing the lower bit, so the return
> +    // value is false.
> +    for (size_t __bitcur = 0; __bitcur <= _id_num; ++__bitcur)
> +      if ((__m & _M_bit[__bitcur]) == _M_bit[__bitcur]
>           && iswctype(__c, _M_wmask[__bitcur]))
>         {
>           __ret = true;
> @@ -157,13 +175,29 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
>    {
>      for (;__lo < __hi; ++__vec, ++__lo)
>        {
> -       // Generically, 15 (instead of 11) since we don't know the numerical
> -       // encoding of the various categories in /usr/include/ctype.h.
> -       const size_t __bitmasksize = 15;
> -       mask __m = 0;
> -       for (size_t __bitcur = 0; __bitcur <= __bitmasksize; ++__bitcur)
> -         if (iswctype(*__lo, _M_wmask[__bitcur]))
> -           __m |= _M_bit[__bitcur];
> +
> +       // Start with all bits set. Iterate over all wctypes, clearing bits for
> +       // non-matching values. For example:
> +       //
> +       //  alpha == lower|upper
> +       //  *__lo == 'a'
> +       //
> +       //  __m = <all bits>
> +       //  iswalpha and iswlower match
> +       //    __m left alone
> +       //  iswupper does not match
> +       //    __m &= ~upper
> +       //
> +       // The return value will satisfy the following conditions:
> +       //
> +       //  (__m & alpha) != 0
> +       //  (__m & lower) != 0
> +       //  (__m & upper) == 0
> +       mask __m = (alnum|alpha|blank|cntrl|digit|graph|
> +                   lower|print|punct|space|upper|xdigit);
> +       for (size_t __bitcur = 0; __bitcur < _id_num; ++__bitcur)
> +         if (!iswctype(*__lo, _M_wmask[__bitcur]))
> +           __m &= ~_M_bit[__bitcur];
>         *__vec = __m;
>        }
>      return __hi;
> @@ -264,11 +298,24 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
>          __i < sizeof(_M_widen) / sizeof(wint_t); ++__i)
>        _M_widen[__i] = btowc(__i);
>
> -    for (size_t __i = 0; __i <= 15; ++__i)
> -      {
> -       _M_bit[__i] = static_cast<mask>(1 << __i);
> -       _M_wmask[__i] = _M_convert_to_wmask(_M_bit[__i]);
> -      }
> +    // Set these directly from the OS provided values.
> +    // We can't simply iterate over bit values because
> +    // they may have more than one bit set
> +    _M_bit[_id_alnum] = alnum;
> +    _M_bit[_id_alpha] = alpha;
> +    _M_bit[_id_blank] = blank;
> +    _M_bit[_id_cntrl] = cntrl;
> +    _M_bit[_id_digit] = digit;
> +    _M_bit[_id_graph] = graph;
> +    _M_bit[_id_lower] = lower;
> +    _M_bit[_id_print] = print;
> +    _M_bit[_id_punct] = punct;
> +    _M_bit[_id_space] = space;
> +    _M_bit[_id_upper] = upper;
> +    _M_bit[_id_xdigit] = xdigit;
> +
> +    for (size_t __i = 0; __i <= _id_num; __i++)
> +      _M_wmask[__i] = _M_convert_to_wmask(_M_bit[__i]);
>    }
>  #endif //  _GLIBCXX_USE_WCHAR_T
>
> --
> 2.51.0
>
  
Keith Packard Jan. 9, 2026, 5:18 p.m. UTC | #2
> From: Jonathan Wakely <jwakely@redhat.com>
> Date: Fri, 09 Jan 2026 11:32:58 +0000
> 
> This will fix at least part of https://gcc.gnu.org/PR51772 (maybe all
> of it, I need to review it properly and refresh my memory of that
> bug).

It fixes the xdigit issue, resolving
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104587 and it removes the
assumption that 'mask' can hold at least 16 bits, so the result should
resolve https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115524

There doesn't seem to be any mention of problems with is(char *, char *,
mask *) though; that causes problems even for cases where a compound
mask value is composed of published mask values, e.g. alnum = alpha |
digit. Compute a mask value for 'a':

        iswalnum('a') -> true
        iswalpha('a') -> true
        iswdigit('a') -> false

        mask = alnum | alpha = digit | alpha

This incorrectly classifies 'a' as a digit.

I've built a test case for this function which covers the C locale
(verifying only ASCII). Would it be reasonable to include that? Should
I expand it to check all of the ctype functions?
  

Patch

diff --git a/libstdc++-v3/config/locale/generic/ctype_members.cc b/libstdc++-v3/config/locale/generic/ctype_members.cc
index 2d61347feb1..21c25a68915 100644
--- a/libstdc++-v3/config/locale/generic/ctype_members.cc
+++ b/libstdc++-v3/config/locale/generic/ctype_members.cc
@@ -53,6 +53,15 @@  _GLIBCXX_BEGIN_NAMESPACE_VERSION
   { }
 
 #ifdef _GLIBCXX_USE_WCHAR_T
+
+  // Values which index into _M_bit and _M_wmask for each classifier
+  enum {
+    _id_alnum, _id_alpha, _id_blank, _id_cntrl,
+    _id_digit, _id_graph, _id_lower, _id_print,
+    _id_punct, _id_space, _id_upper, _id_xdigit,
+    _id_num,
+  };
+
   ctype<wchar_t>::__wmask_type
   ctype<wchar_t>::_M_convert_to_wmask(const mask __m) const throw()
   {
@@ -138,11 +147,20 @@  _GLIBCXX_BEGIN_NAMESPACE_VERSION
   do_is(mask __m, char_type __c) const
   {
     bool __ret = false;
-    // Generically, 15 (instead of 11) since we don't know the numerical
-    // encoding of the various categories in /usr/include/ctype.h.
-    const size_t __bitmasksize = 15;
-    for (size_t __bitcur = 0; __bitcur <= __bitmasksize; ++__bitcur)
-      if (__m & _M_bit[__bitcur]
+
+    // Each _M_bit may have more than one bit set. Only check tests for which
+    // all of the matching _M_bit values are set. This skips tests with some
+    // bits which are not set. For example:
+    //
+    //  alpha = lower|upper
+    //  __c = 'a'
+    //  __m = upper
+    //
+    // We evaluate iswupper (which returns false), but we skip iswalpha (which
+    // would return true) because __m is missing the lower bit, so the return
+    // value is false.
+    for (size_t __bitcur = 0; __bitcur <= _id_num; ++__bitcur)
+      if ((__m & _M_bit[__bitcur]) == _M_bit[__bitcur]
 	  && iswctype(__c, _M_wmask[__bitcur]))
 	{
 	  __ret = true;
@@ -157,13 +175,29 @@  _GLIBCXX_BEGIN_NAMESPACE_VERSION
   {
     for (;__lo < __hi; ++__vec, ++__lo)
       {
-	// Generically, 15 (instead of 11) since we don't know the numerical
-	// encoding of the various categories in /usr/include/ctype.h.
-	const size_t __bitmasksize = 15;
-	mask __m = 0;
-	for (size_t __bitcur = 0; __bitcur <= __bitmasksize; ++__bitcur)
-	  if (iswctype(*__lo, _M_wmask[__bitcur]))
-	    __m |= _M_bit[__bitcur];
+
+	// Start with all bits set. Iterate over all wctypes, clearing bits for
+	// non-matching values. For example:
+	//
+	//  alpha == lower|upper
+	//  *__lo == 'a'
+	//
+	//  __m = <all bits>
+	//  iswalpha and iswlower match
+	//    __m left alone
+	//  iswupper does not match
+	//    __m &= ~upper
+	//
+	// The return value will satisfy the following conditions:
+	//
+	//  (__m & alpha) != 0
+	//  (__m & lower) != 0
+	//  (__m & upper) == 0
+	mask __m = (alnum|alpha|blank|cntrl|digit|graph|
+		    lower|print|punct|space|upper|xdigit);
+	for (size_t __bitcur = 0; __bitcur < _id_num; ++__bitcur)
+	  if (!iswctype(*__lo, _M_wmask[__bitcur]))
+	    __m &= ~_M_bit[__bitcur];
 	*__vec = __m;
       }
     return __hi;
@@ -264,11 +298,24 @@  _GLIBCXX_BEGIN_NAMESPACE_VERSION
 	 __i < sizeof(_M_widen) / sizeof(wint_t); ++__i)
       _M_widen[__i] = btowc(__i);
 
-    for (size_t __i = 0; __i <= 15; ++__i)
-      {
-	_M_bit[__i] = static_cast<mask>(1 << __i);
-	_M_wmask[__i] = _M_convert_to_wmask(_M_bit[__i]);
-      }
+    // Set these directly from the OS provided values.
+    // We can't simply iterate over bit values because
+    // they may have more than one bit set
+    _M_bit[_id_alnum] = alnum;
+    _M_bit[_id_alpha] = alpha;
+    _M_bit[_id_blank] = blank;
+    _M_bit[_id_cntrl] = cntrl;
+    _M_bit[_id_digit] = digit;
+    _M_bit[_id_graph] = graph;
+    _M_bit[_id_lower] = lower;
+    _M_bit[_id_print] = print;
+    _M_bit[_id_punct] = punct;
+    _M_bit[_id_space] = space;
+    _M_bit[_id_upper] = upper;
+    _M_bit[_id_xdigit] = xdigit;
+
+    for (size_t __i = 0; __i <= _id_num; __i++)
+      _M_wmask[__i] = _M_convert_to_wmask(_M_bit[__i]);
   }
 #endif //  _GLIBCXX_USE_WCHAR_T