[1/2] : C N2653 char8_t: Language support

Message ID 5799be74-78c5-7b75-a5c8-5b27c33ea7fd@honermann.net
State New
Headers
Series : C N2653 char8_t implementation |

Commit Message

Tom Honermann Jan. 8, 2022, 12:42 a.m. UTC
  This patch implements the core language and compiler-dependent library
changes proposed in WG14 N2653 [1] for C2x. The changes include:
- Change of type for UTF-8 string literals from array of char to array
   of char8_t (unsigned char) when targeting C2x.
- A new atomic_char8_t typedef.
- A new ATOMIC_CHAR8_T_LOCK_FREE macro defined in terms of the existing
   __GCC_ATOMIC_CHAR8_T_LOCK_FREE predefined macro.

Tested on Linux x86_64.

gcc/ChangeLog:

2022-01-07  Tom Honermann  <tom@honermann.net>

	* ginclude/stdatomic.h (atomic_char8_t,
	ATOMIC_CHAR8_T_LOCK_FREE): New typedef and macro.

gcc/c/ChangeLog:

2022-01-07  Tom Honermann  <tom@honermann.net>

	* c-parser.c (c_parser_string_literal): Use char8_t as the type
	of CPP_UTF8STRING when char8_t support is enabled.
	* c-typeck.c (digest_init): Allow initialization of an array
	of character type by a string literal with type array of
	char8_t.

gcc/c-family/ChangeLog:

2022-01-07  Tom Honermann  <tom@honermann.net>

	* c-lex.c (lex_string, lex_charconst): Use char8_t as the type
	of CPP_UTF8CHAR and CPP_UTF8STRING when char8_t support is
	enabled.
	* c-opts.c (c_common_post_options): Set flag_char8_t if
	targeting C2x.

Tom.

[1]: WG14 N2653
      "char8_t: A type for UTF-8 characters and strings (Revision 1)"
      http://www.open-std.org/jtc1/sc22/wg14/www/docs/n2653.htm
  

Patch

commit c041cce5d262908349be3f1f2e361c824db15845
Author: Tom Honermann <tom@honermann.net>
Date:   Sat Jan 1 18:10:41 2022 -0500

    N2653 char8_t for C: Language support
    
    This patch implements the core language and compiler-dependent library
    changes proposed in WG14 N2653 for C2x.  The changes include:
    - Change of type for UTF-8 string literals from array of char to array
      of char8_t (unsigned char) when targeting C2x.
    - A new atomic_char8_t typedef.
    - A new ATOMIC_CHAR8_T_LOCK_FREE macro defined in terms of the existing
      __GCC_ATOMIC_CHAR8_T_LOCK_FREE predefined macro.

diff --git a/gcc/c-family/c-lex.c b/gcc/c-family/c-lex.c
index 2651331e683..0b3debbb9bd 100644
--- a/gcc/c-family/c-lex.c
+++ b/gcc/c-family/c-lex.c
@@ -1352,7 +1352,14 @@  lex_string (const cpp_token *tok, tree *valp, bool objc_string, bool translate)
 	default:
 	case CPP_STRING:
 	case CPP_UTF8STRING:
-	  value = build_string (1, "");
+	  if (type == CPP_UTF8STRING && flag_char8_t)
+	    {
+	      value = build_string (TYPE_PRECISION (char8_type_node)
+				    / TYPE_PRECISION (char_type_node),
+				    "");  /* char8_t is 8 bits */
+	    }
+	  else
+	    value = build_string (1, "");
 	  break;
 	case CPP_STRING16:
 	  value = build_string (TYPE_PRECISION (char16_type_node)
@@ -1425,10 +1432,10 @@  lex_charconst (const cpp_token *token)
     type = char16_type_node;
   else if (token->type == CPP_UTF8CHAR)
     {
-      if (!c_dialect_cxx ())
-	type = unsigned_char_type_node;
-      else if (flag_char8_t)
+      if (flag_char8_t)
         type = char8_type_node;
+      else if (!c_dialect_cxx ())
+	type = unsigned_char_type_node;
       else
         type = char_type_node;
     }
diff --git a/gcc/c-family/c-opts.c b/gcc/c-family/c-opts.c
index 4c20e44f5b5..bd96e1319ad 100644
--- a/gcc/c-family/c-opts.c
+++ b/gcc/c-family/c-opts.c
@@ -1060,9 +1060,9 @@  c_common_post_options (const char **pfilename)
   if (flag_sized_deallocation == -1)
     flag_sized_deallocation = (cxx_dialect >= cxx14);
 
-  /* char8_t support is new in C++20.  */
+  /* char8_t support is implicitly enabled in C++20 and C2x.  */
   if (flag_char8_t == -1)
-    flag_char8_t = (cxx_dialect >= cxx20);
+    flag_char8_t = (cxx_dialect >= cxx20) || flag_isoc2x;
 
   if (flag_extern_tls_init)
     {
diff --git a/gcc/c/c-parser.c b/gcc/c/c-parser.c
index b09ad307acd..4239633e295 100644
--- a/gcc/c/c-parser.c
+++ b/gcc/c/c-parser.c
@@ -7439,7 +7439,14 @@  c_parser_string_literal (c_parser *parser, bool translate, bool wide_ok)
 	default:
 	case CPP_STRING:
 	case CPP_UTF8STRING:
-	  value = build_string (1, "");
+	  if (type == CPP_UTF8STRING && flag_char8_t)
+	    {
+	      value = build_string (TYPE_PRECISION (char8_type_node)
+				    / TYPE_PRECISION (char_type_node),
+				    "");  /* char8_t is 8 bits */
+	    }
+	  else
+	    value = build_string (1, "");
 	  break;
 	case CPP_STRING16:
 	  value = build_string (TYPE_PRECISION (char16_type_node)
@@ -7464,9 +7471,14 @@  c_parser_string_literal (c_parser *parser, bool translate, bool wide_ok)
     {
     default:
     case CPP_STRING:
-    case CPP_UTF8STRING:
       TREE_TYPE (value) = char_array_type_node;
       break;
+    case CPP_UTF8STRING:
+      if (flag_char8_t)
+	TREE_TYPE (value) = char8_array_type_node;
+      else
+	TREE_TYPE (value) = char_array_type_node;
+      break;
     case CPP_STRING16:
       TREE_TYPE (value) = char16_array_type_node;
       break;
diff --git a/gcc/c/c-typeck.c b/gcc/c/c-typeck.c
index 78a6c68aaa6..b4eeea545a9 100644
--- a/gcc/c/c-typeck.c
+++ b/gcc/c/c-typeck.c
@@ -8028,7 +8028,7 @@  digest_init (location_t init_loc, tree type, tree init, tree origtype,
 
 	  if (char_array)
 	    {
-	      if (typ2 != char_type_node)
+	      if (typ2 != char_type_node && typ2 != char8_type_node)
 		incompat_string_cst = true;
 	    }
 	  else if (!comptypes (typ1, typ2))
diff --git a/gcc/ginclude/stdatomic.h b/gcc/ginclude/stdatomic.h
index 23c07be2a48..b36703b2dc2 100644
--- a/gcc/ginclude/stdatomic.h
+++ b/gcc/ginclude/stdatomic.h
@@ -49,6 +49,10 @@  typedef _Atomic long atomic_long;
 typedef _Atomic unsigned long atomic_ulong;
 typedef _Atomic long long atomic_llong;
 typedef _Atomic unsigned long long atomic_ullong;
+#if (defined(__CHAR8_TYPE__) \
+     && (defined(_GNU_SOURCE) || defined(_ISOC2X_SOURCE)))
+typedef _Atomic __CHAR8_TYPE__ atomic_char8_t;
+#endif
 typedef _Atomic __CHAR16_TYPE__ atomic_char16_t;
 typedef _Atomic __CHAR32_TYPE__ atomic_char32_t;
 typedef _Atomic __WCHAR_TYPE__ atomic_wchar_t;
@@ -97,6 +101,10 @@  extern void atomic_signal_fence (memory_order);
 
 #define ATOMIC_BOOL_LOCK_FREE		__GCC_ATOMIC_BOOL_LOCK_FREE
 #define ATOMIC_CHAR_LOCK_FREE		__GCC_ATOMIC_CHAR_LOCK_FREE
+#if (defined(__GCC_ATOMIC_CHAR8_T_LOCK_FREE) \
+     && (defined(_GNU_SOURCE) || defined(_ISOC2X_SOURCE)))
+#define ATOMIC_CHAR8_T_LOCK_FREE	__GCC_ATOMIC_CHAR8_T_LOCK_FREE
+#endif
 #define ATOMIC_CHAR16_T_LOCK_FREE	__GCC_ATOMIC_CHAR16_T_LOCK_FREE
 #define ATOMIC_CHAR32_T_LOCK_FREE	__GCC_ATOMIC_CHAR32_T_LOCK_FREE
 #define ATOMIC_WCHAR_T_LOCK_FREE	__GCC_ATOMIC_WCHAR_T_LOCK_FREE