support C/C++ identifiers named with non-ASCII characters

Message ID 22de68be-6266-7957-5957-615ccb5660ba@redhat.com
State New, archived
Headers

Commit Message

Pedro Alves May 22, 2018, 4:42 p.m. UTC
  On 05/22/2018 04:48 PM, 張俊芝 wrote:

> I read through your code. If I understand it correctly, you keep all the valid ASCII characters, and treat all non-ASCII characters "as valid".
> 
> This is pretty much the same thing as I did. The only difference is that I blacklist invalid ASCII characters and you whitelist valid ASCII characters. But we both "validate" all the non-ASCII characters.

Right.  Non-7-bit/base ASCII characters must either be part of the identifier,
or invalid, but we don't need to be pedantic here, as you've also expressed
elsewhere in the thread, I believe.

> 
> But I think your code seems better than mine because it updates and reuses some common code. So I think I can abondon my patch and the unfinished test case.

Alright, I'm pushing this in then, as below.  I've added your name
to the ChangeLog too.  And fixed the Copyright years to include 2018.
Sorry that I didn't say I had a patch mostly written in the PR!

Note I've filed gdb/23211 for the completion issue.  I'm not working
on it right now.  Let me know if you'd like to take a look at that one.

From b1b60145aedb8adcb0b9dcf43a5ae735c2f03b51 Mon Sep 17 00:00:00 2001
From: Pedro Alves <palves@redhat.com>
Date: Tue, 22 May 2018 17:35:38 +0100
Subject: [PATCH] Support UTF-8 identifiers in C/C++ expressions (PR gdb/22973)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Factor out cp_ident_is_alpha/cp_ident_is_alnum out of
gdb/cp-name-parser.y and use it in the C/C++ expression parser too.

New test included.

gdb/ChangeLog:
2018-05-22  Pedro Alves  <palves@redhat.com>
	    張俊芝  <zjz@zjz.name>

	PR gdb/22973
	* c-exp.y: Include "c-support.h".
	(parse_number, c_parse_escape, lex_one_token): Use TOLOWER instead
	of tolower.  Use c_ident_is_alpha to scan names.
	* c-lang.c: Include "c-support.h".
	(convert_ucn, convert_octal, convert_hex, convert_escape): Use
	ISXDIGIT instead of isxdigit and ISDIGIT instead of isdigit.
	* c-support.h: New file, with bits factored out from ...
	* cp-name-parser.y: ... this file.
	Include "c-support.h".
	(cp_ident_is_alpha, cp_ident_is_alnum): Deleted, moved to
	c-support.h and renamed.
	(symbol_end, yylex): Adjust.

gdb/testsuite/ChangeLog:
2018-05-22  Pedro Alves  <palves@redhat.com>

	PR gdb/22973
	* gdb.base/utf8-identifiers.c: New file.
	* gdb.base/utf8-identifiers.exp: New file.
---
 gdb/ChangeLog                               | 17 +++++++
 gdb/testsuite/ChangeLog                     |  6 +++
 gdb/c-exp.y                                 | 27 +++++-----
 gdb/c-lang.c                                | 11 +++--
 gdb/c-support.h                             | 46 +++++++++++++++++
 gdb/cp-name-parser.y                        | 29 ++---------
 gdb/testsuite/gdb.base/utf8-identifiers.c   | 71 ++++++++++++++++++++++++++
 gdb/testsuite/gdb.base/utf8-identifiers.exp | 77 +++++++++++++++++++++++++++++
 8 files changed, 240 insertions(+), 44 deletions(-)
 create mode 100644 gdb/c-support.h
 create mode 100644 gdb/testsuite/gdb.base/utf8-identifiers.c
 create mode 100644 gdb/testsuite/gdb.base/utf8-identifiers.exp
  

Patch

diff --git a/gdb/ChangeLog b/gdb/ChangeLog
index ff5e0a2cbe..b34fa7b74e 100644
--- a/gdb/ChangeLog
+++ b/gdb/ChangeLog
@@ -1,3 +1,20 @@ 
+2018-05-22  Pedro Alves  <palves@redhat.com>
+	    張俊芝  <zjz@zjz.name>
+
+	PR gdb/22973
+	* c-exp.y: Include "c-support.h".
+	(parse_number, c_parse_escape, lex_one_token): Use TOLOWER instead
+	of tolower.  Use c_ident_is_alpha to scan names.
+	* c-lang.c: Include "c-support.h".
+	(convert_ucn, convert_octal, convert_hex, convert_escape): Use
+	ISXDIGIT instead of isxdigit and ISDIGIT instead of isdigit.
+	* c-support.h: New file, with bits factored out from ...
+	* cp-name-parser.y: ... this file.
+	Include "c-support.h".
+	(cp_ident_is_alpha, cp_ident_is_alnum): Deleted, moved to
+	c-support.h and renamed.
+	(symbol_end, yylex): Adjust.
+
 2018-05-22  Pedro Franco de Carvalho  <pedromfc@linux.vnet.ibm.com>
 
 	* arch/ppc-linux-common.c (ppc_linux_has_isa205): Change the
diff --git a/gdb/testsuite/ChangeLog b/gdb/testsuite/ChangeLog
index 208939bf82..393ab8884a 100644
--- a/gdb/testsuite/ChangeLog
+++ b/gdb/testsuite/ChangeLog
@@ -1,3 +1,9 @@ 
+2018-05-22  Pedro Alves  <palves@redhat.com>
+
+	PR gdb/22973
+	* gdb.base/utf8-identifiers.c: New file.
+	* gdb.base/utf8-identifiers.exp: New file.
+
 2018-05-22  Pedro Franco de Carvalho  <pedromfc@linux.vnet.ibm.com>
 
 	* gdb.arch/powerpc-fpscr-gcore.exp: New file.
diff --git a/gdb/c-exp.y b/gdb/c-exp.y
index 5e10d2a3b4..ae31af52df 100644
--- a/gdb/c-exp.y
+++ b/gdb/c-exp.y
@@ -42,6 +42,7 @@ 
 #include "parser-defs.h"
 #include "language.h"
 #include "c-lang.h"
+#include "c-support.h"
 #include "bfd.h" /* Required by objfiles.h.  */
 #include "symfile.h" /* Required by objfiles.h.  */
 #include "objfiles.h" /* For have_full_symbols and have_partial_symbols */
@@ -1806,13 +1807,13 @@  parse_number (struct parser_state *par_state,
 	  len -= 2;
 	}
       /* Handle suffixes: 'f' for float, 'l' for long double.  */
-      else if (len >= 1 && tolower (p[len - 1]) == 'f')
+      else if (len >= 1 && TOLOWER (p[len - 1]) == 'f')
 	{
 	  putithere->typed_val_float.type
 	    = parse_type (par_state)->builtin_float;
 	  len -= 1;
 	}
-      else if (len >= 1 && tolower (p[len - 1]) == 'l')
+      else if (len >= 1 && TOLOWER (p[len - 1]) == 'l')
 	{
 	  putithere->typed_val_float.type
 	    = parse_type (par_state)->builtin_long_double;
@@ -2023,9 +2024,9 @@  c_parse_escape (const char **ptr, struct obstack *output)
       if (output)
 	obstack_grow_str (output, "\\x");
       ++tokptr;
-      if (!isxdigit (*tokptr))
+      if (!ISXDIGIT (*tokptr))
 	error (_("\\x escape without a following hex digit"));
-      while (isxdigit (*tokptr))
+      while (ISXDIGIT (*tokptr))
 	{
 	  if (output)
 	    obstack_1grow (output, *tokptr);
@@ -2048,7 +2049,7 @@  c_parse_escape (const char **ptr, struct obstack *output)
 	if (output)
 	  obstack_grow_str (output, "\\");
 	for (i = 0;
-	     i < 3 && isdigit (*tokptr) && *tokptr != '8' && *tokptr != '9';
+	     i < 3 && ISDIGIT (*tokptr) && *tokptr != '8' && *tokptr != '9';
 	     ++i)
 	  {
 	    if (output)
@@ -2073,9 +2074,9 @@  c_parse_escape (const char **ptr, struct obstack *output)
 	    obstack_1grow (output, *tokptr);
 	  }
 	++tokptr;
-	if (!isxdigit (*tokptr))
+	if (!ISXDIGIT (*tokptr))
 	  error (_("\\%c escape without a following hex digit"), c);
-	for (i = 0; i < len && isxdigit (*tokptr); ++i)
+	for (i = 0; i < len && ISXDIGIT (*tokptr); ++i)
 	  {
 	    if (output)
 	      obstack_1grow (output, *tokptr);
@@ -2668,7 +2669,7 @@  lex_one_token (struct parser_state *par_state, bool *is_quoted_name)
 	    size_t len = strlen ("selector");
 
 	    if (strncmp (p, "selector", len) == 0
-		&& (p[len] == '\0' || isspace (p[len])))
+		&& (p[len] == '\0' || ISSPACE (p[len])))
 	      {
 		lexptr = p + len;
 		return SELECTOR;
@@ -2677,9 +2678,9 @@  lex_one_token (struct parser_state *par_state, bool *is_quoted_name)
 	      goto parse_string;
 	  }
 
-	while (isspace (*p))
+	while (ISSPACE (*p))
 	  p++;
-	if (strncmp (p, "entry", len) == 0 && !isalnum (p[len])
+	if (strncmp (p, "entry", len) == 0 && !c_ident_is_alnum (p[len])
 	    && p[len] != '_')
 	  {
 	    lexptr = &p[len];
@@ -2741,16 +2742,14 @@  lex_one_token (struct parser_state *par_state, bool *is_quoted_name)
       }
     }
 
-  if (!(c == '_' || c == '$'
-	|| (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')))
+  if (!(c == '_' || c == '$' || c_ident_is_alpha (c)))
     /* We must have come across a bad character (e.g. ';').  */
     error (_("Invalid character '%c' in expression."), c);
 
   /* It's a name.  See how long it is.  */
   namelen = 0;
   for (c = tokstart[namelen];
-       (c == '_' || c == '$' || (c >= '0' && c <= '9')
-	|| (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '<');)
+       (c == '_' || c == '$' || c_ident_is_alnum (c) || c == '<');)
     {
       /* Template parameter lists are part of the name.
 	 FIXME: This mishandles `print $a<4&&$a>3'.  */
diff --git a/gdb/c-lang.c b/gdb/c-lang.c
index 15e633f8c8..6bbb470957 100644
--- a/gdb/c-lang.c
+++ b/gdb/c-lang.c
@@ -25,6 +25,7 @@ 
 #include "language.h"
 #include "varobj.h"
 #include "c-lang.h"
+#include "c-support.h"
 #include "valprint.h"
 #include "macroscope.h"
 #include "charset.h"
@@ -382,7 +383,7 @@  convert_ucn (char *p, char *limit, const char *dest_charset,
   gdb_byte data[4];
   int i;
 
-  for (i = 0; i < length && p < limit && isxdigit (*p); ++i, ++p)
+  for (i = 0; i < length && p < limit && ISXDIGIT (*p); ++i, ++p)
     result = (result << 4) + host_hex_value (*p);
 
   for (i = 3; i >= 0; --i)
@@ -424,7 +425,7 @@  convert_octal (struct type *type, char *p,
   unsigned long value = 0;
 
   for (i = 0;
-       i < 3 && p < limit && isdigit (*p) && *p != '8' && *p != '9';
+       i < 3 && p < limit && ISDIGIT (*p) && *p != '8' && *p != '9';
        ++i)
     {
       value = 8 * value + host_hex_value (*p);
@@ -447,7 +448,7 @@  convert_hex (struct type *type, char *p,
 {
   unsigned long value = 0;
 
-  while (p < limit && isxdigit (*p))
+  while (p < limit && ISXDIGIT (*p))
     {
       value = 16 * value + host_hex_value (*p);
       ++p;
@@ -488,7 +489,7 @@  convert_escape (struct type *type, const char *dest_charset,
 
     case 'x':
       ADVANCE;
-      if (!isxdigit (*p))
+      if (!ISXDIGIT (*p))
 	error (_("\\x used with no following hex digits."));
       p = convert_hex (type, p, limit, output);
       break;
@@ -510,7 +511,7 @@  convert_escape (struct type *type, const char *dest_charset,
 	int length = *p == 'u' ? 4 : 8;
 
 	ADVANCE;
-	if (!isxdigit (*p))
+	if (!ISXDIGIT (*p))
 	  error (_("\\u used with no following hex digits"));
 	p = convert_ucn (p, limit, dest_charset, output, length);
       }
diff --git a/gdb/c-support.h b/gdb/c-support.h
new file mode 100644
index 0000000000..3641d6f534
--- /dev/null
+++ b/gdb/c-support.h
@@ -0,0 +1,46 @@ 
+/* Helper routines for C support in GDB.
+   Copyright (C) 2017-2018 Free Software Foundation, Inc.
+
+   This file is part of GDB.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+#ifndef C_SUPPORT_H
+#define C_SUPPORT_H
+
+#include "safe-ctype.h"
+
+/* Like ISALPHA, but also returns true for the union of all UTF-8
+   multi-byte sequence bytes and non-ASCII characters in
+   extended-ASCII charsets (e.g., Latin1).  I.e., returns true if the
+   high bit is set.  Note that not all UTF-8 ranges are allowed in C++
+   identifiers, but we don't need to be pedantic so for simplicity we
+   ignore that here.  Plus this avoids the complication of actually
+   knowing what was the right encoding.  */
+
+static inline bool
+c_ident_is_alpha (unsigned char ch)
+{
+  return ISALPHA (ch) || ch >= 0x80;
+}
+
+/* Similarly, but Like ISALNUM.  */
+
+static inline bool
+c_ident_is_alnum (unsigned char ch)
+{
+  return ISALNUM (ch) || ch >= 0x80;
+}
+
+#endif /* C_SUPPORT_H */
diff --git a/gdb/cp-name-parser.y b/gdb/cp-name-parser.y
index f522e46419..ebae56261b 100644
--- a/gdb/cp-name-parser.y
+++ b/gdb/cp-name-parser.y
@@ -35,6 +35,7 @@ 
 #include "safe-ctype.h"
 #include "demangle.h"
 #include "cp-support.h"
+#include "c-support.h"
 
 /* Bison does not make it easy to create a parser without global
    state, unfortunately.  Here are all the global variables used
@@ -1304,28 +1305,6 @@  d_binary (const char *name, struct demangle_component *lhs, struct demangle_comp
 		      fill_comp (DEMANGLE_COMPONENT_BINARY_ARGS, lhs, rhs));
 }
 
-/* Like ISALPHA, but also returns true for the union of all UTF-8
-   multi-byte sequence bytes and non-ASCII characters in
-   extended-ASCII charsets (e.g., Latin1).  I.e., returns true if the
-   high bit is set.  Note that not all UTF-8 ranges are allowed in C++
-   identifiers, but we don't need to be pedantic so for simplicity we
-   ignore that here.  Plus this avoids the complication of actually
-   knowing what was the right encoding.  */
-
-static inline bool
-cp_ident_is_alpha (unsigned char ch)
-{
-  return ISALPHA (ch) || ch >= 0x80;
-}
-
-/* Similarly, but Like ISALNUM.  */
-
-static inline bool
-cp_ident_is_alnum (unsigned char ch)
-{
-  return ISALNUM (ch) || ch >= 0x80;
-}
-
 /* Find the end of a symbol name starting at LEXPTR.  */
 
 static const char *
@@ -1333,7 +1312,7 @@  symbol_end (const char *lexptr)
 {
   const char *p = lexptr;
 
-  while (*p && (cp_ident_is_alnum (*p) || *p == '_' || *p == '$' || *p == '.'))
+  while (*p && (c_ident_is_alnum (*p) || *p == '_' || *p == '$' || *p == '.'))
     p++;
 
   return p;
@@ -1813,7 +1792,7 @@  yylex (void)
       return ERROR;
     }
 
-  if (!(c == '_' || c == '$' || cp_ident_is_alpha (c)))
+  if (!(c == '_' || c == '$' || c_ident_is_alpha (c)))
     {
       /* We must have come across a bad character (e.g. ';').  */
       yyerror (_("invalid character"));
@@ -1824,7 +1803,7 @@  yylex (void)
   namelen = 0;
   do
     c = tokstart[++namelen];
-  while (cp_ident_is_alnum (c) || c == '_' || c == '$');
+  while (c_ident_is_alnum (c) || c == '_' || c == '$');
 
   lexptr += namelen;
 
diff --git a/gdb/testsuite/gdb.base/utf8-identifiers.c b/gdb/testsuite/gdb.base/utf8-identifiers.c
new file mode 100644
index 0000000000..c80b42a03d
--- /dev/null
+++ b/gdb/testsuite/gdb.base/utf8-identifiers.c
@@ -0,0 +1,71 @@ 
+/* -*- coding: utf-8 -*- */
+
+/* This testcase is part of GDB, the GNU debugger.
+
+   Copyright 2017-2018 Free Software Foundation, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/* UTF-8 "função1".  */
+#define FUNCAO1 fun\u00e7\u00e3o1
+
+/* UTF-8 "função2".  */
+#define FUNCAO2 fun\u00e7\u00e3o2
+
+/* UTF-8 "my_função".  */
+#define MY_FUNCAO my_fun\u00e7\u00e3o
+
+/* UTF-8 "num_€".  */
+#define NUM_EUROS num_\u20ac
+
+struct S
+{
+  int NUM_EUROS;
+} g_s;
+
+void
+FUNCAO1 (void)
+{
+  g_s.NUM_EUROS = 1000;
+}
+
+void
+FUNCAO2 (void)
+{
+  g_s.NUM_EUROS = 1000;
+}
+
+void
+MY_FUNCAO (void)
+{
+}
+
+int NUM_EUROS = 2000;
+
+static void
+done ()
+{
+}
+
+int
+main ()
+{
+  FUNCAO1 ();
+  done ();
+  FUNCAO2 ();
+  MY_FUNCAO ();
+
+  return 0;
+}
diff --git a/gdb/testsuite/gdb.base/utf8-identifiers.exp b/gdb/testsuite/gdb.base/utf8-identifiers.exp
new file mode 100644
index 0000000000..12fe3768e2
--- /dev/null
+++ b/gdb/testsuite/gdb.base/utf8-identifiers.exp
@@ -0,0 +1,77 @@ 
+# -*- coding: utf-8 -*- */
+
+# This testcase is part of GDB, the GNU debugger.
+
+# Copyright 2017-2018 Free Software Foundation, Inc.
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+# Test GDB's support for UTF-8 C/C++ identifiers.
+
+load_lib completion-support.exp
+
+standard_testfile
+
+# Enable basic use of UTF-8.  LC_ALL gets reset for each testfile.
+setenv LC_ALL C.UTF-8
+
+if { [prepare_for_testing "failed to prepare" ${testfile} [list $srcfile]] } {
+    return -1
+}
+
+if ![runto done] {
+    fail "couldn't run to done"
+    return
+}
+
+# Test expressions.
+gdb_test "print g_s.num_€" " = 1000"
+gdb_test "print num_€" " = 2000"
+
+# Test linespecs/breakpoints.
+gdb_test "break função2" "Breakpoint $decimal at .*$srcfile.*"
+
+set test "info breakpoints"
+gdb_test_multiple $test $test {
+    -re "in função2 at .*$srcfile.*$gdb_prompt $" {
+	pass $test
+    }
+}
+
+gdb_test "continue" \
+    "Breakpoint $decimal, função2 \\(\\) at .*$srcfile.*"
+
+# Unload symbols from shared libraries to avoid random symbol and file
+# names getting in the way of completion.
+gdb_test_no_output "nosharedlibrary"
+
+# Test linespec completion.
+
+# A unique completion.
+test_gdb_complete_unique "break my_fun" "break my_função"
+
+# A multiple-matches completion:
+
+# kfailed because gdb/readline display the completion match list like
+# this, with no separating space:
+#
+#  (gdb) break função[TAB]
+#  função1função2
+#
+# ... which is bogus.
+setup_kfail "gdb/23211" "*-*-*"
+test_gdb_complete_multiple "break " "fun" "ção" {"função1" "função2"}
+
+# Test expression completion.
+test_gdb_complete_unique "print g_s.num" "print g_s.num_€"