gconv: Replace norm_add_slashes with __gconv_norm_add_slashes

Message ID 20170619161703.C2225402AEC3C@oldenburg.str.redhat.com
State New, archived
Headers

Commit Message

Florian Weimer June 19, 2017, 4:17 p.m. UTC
  2017-06-19  Florian Weimer  <fweimer@redhat.com>

	* iconv/Makefile (routine): Add norm_add_slashes.
	* iconv/norm_add_slashes.c: New file, extracted from
	iconv/gconv_int.h.
	* iconv/gconv_int.h (norm_add_slashes): Remove.
	(__gconv_norm_add_slashes): Declare.
	* wcsmbs/wcsmbsload.c (__wcsmbs_load_conv): Use
	__gconv_norm_add_slashes.
	* intl/dcigettext.c (_nl_find_msg): Likewise.  Simplify !_LIBC
	case.
  

Comments

Adhemerval Zanella June 19, 2017, 9:38 p.m. UTC | #1
On 19/06/2017 13:17, Florian Weimer wrote:
> 2017-06-19  Florian Weimer  <fweimer@redhat.com>

> diff --git a/iconv/norm_add_slashes.c b/iconv/norm_add_slashes.c
> new file mode 100644
> index 0000000..f380524
> --- /dev/null
> +++ b/iconv/norm_add_slashes.c
> @@ -0,0 +1,54 @@
> +/* Normalize the charset name and add a suffix with slashes.
> +   Copyright (C) 1997-2017 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#include <gconv_int.h>
> +
> +char *
> +__gconv_norm_add_slashes (const char *name, size_t name_len,
> +                          const char *suffix)
> +{
> +  const char *name_end  = name + name_len;
> +  const char *cp = name;
> +  char *result;
> +  char *tmp;
> +  size_t cnt = 0;
> +  const size_t suffix_len = strlen (suffix);
> +
> +  while (cp != name_end)
> +    if (*cp++ == '/')
> +      ++cnt;
> +
> +  tmp = result = malloc (name_len + 3 + suffix_len);
> +  if (result == NULL)
> +    return NULL;
> +  cp = name;
> +  while (cp != name_end)
> +    *tmp++ = __toupper_l (*cp++, _nl_C_locobj_ptr);
> +  if (cnt < 2)
> +    {
> +      *tmp++ = '/';
> +      if (cnt < 1)
> +        {
> +          *tmp++ = '/';
> +          if (suffix_len != 0)
> +            tmp = __mempcpy (tmp, suffix, suffix_len);
> +        }
> +    }
> +  *tmp = '\0';
> +  return result;
> +}

I think this is a good candidate for char_array usage [1]:

#define CHAR_ARRAY_INITIAL_SIZE 0
#include <malloc/char_array-skeleton.c>

char *
__gconv_norm_add_slashes (const char *name, size_t name_len,
                          const char *suffix)
{ 
  size_t cnt = 0; 
  for (size_t i = 0; i < name_len; i++)
    if (name[i] == '/')
      cnt++;
  
  struct char_array result;
  if (!char_array_init_str_size (&result, name, name_len))
    return NULL;
  
  for (size_t i = 0; i < char_array_length (&result); i++)
    *char_array_at (&result, i) = __toupper_l (name[i], _nl_C_locobj_ptr);
  
  if (cnt < 2)
    { 
      if (!char_array_append_str (&result, "/"))
        return NULL;
      if (cnt < 1)
        { 
          if (!char_array_append_str (&result, "/")
              | !char_array_append_str (&result, suffix))
            return NULL;
        }
    }
  
  return char_array_finalize (&result, NULL);
}

It has the advantages of index check of dynarray plus the malloc overflow
check.

[1] https://sourceware.org/ml/libc-alpha/2017-06/msg00325.html


> diff --git a/intl/dcigettext.c b/intl/dcigettext.c
> index d97746c..49127d0 100644
> --- a/intl/dcigettext.c
> +++ b/intl/dcigettext.c
> @@ -1123,25 +1123,24 @@ _nl_find_msg (struct loaded_l10nfile *domain_file,
>  		    {
>  		      size_t len;
>  		      char *charset;
> -		      const char *outcharset;
> +		      char *outcharset;
>  
>  		      charsetstr += strlen ("charset=");
>  		      len = strcspn (charsetstr, " \t\n");
>  
> -		      charset = (char *) alloca (len + 1);
> -# if defined _LIBC || HAVE_MEMPCPY
> -		      *((char *) mempcpy (charset, charsetstr, len)) = '\0';
> -# else
> -		      memcpy (charset, charsetstr, len);
> -		      charset[len] = '\0';
> -# endif
> -
> -		      outcharset = encoding;
> -
>  # ifdef _LIBC
>  		      /* We always want to use transliteration.  */
> -		      outcharset = norm_add_slashes (outcharset, "TRANSLIT");
> -		      charset = norm_add_slashes (charset, "");
> +		      charset = __gconv_norm_add_slashes (charsetstr, len, "");
> +		      outcharset = __gconv_norm_add_slashes
> +			(encoding, strlen (encoding),  "TRANSLIT");
> +		      if (charset == NULL || outcharset == NULL)
> +			{
> +			  free ((char *) encoding);

No need to cast on free.

> +			  free (outcharset);
> +			  free (charset);
> +			  goto unlock_fail;
> +			}
> +
>  		      int r = __gconv_open (outcharset, charset, &convd->conv,
>  					    GCONV_AVOID_NOCONV);
>  		      if (__builtin_expect (r != __GCONV_OK, 0))
> @@ -1153,6 +1152,8 @@ _nl_find_msg (struct loaded_l10nfile *domain_file,
>  			    {
>  			      gl_rwlock_unlock (domain->conversions_lock);
>  			      free ((char *) encoding);
> +			      free (outcharset);
> +			      free (charset);
>  			      return NULL;
>  			    }
>  
> @@ -1165,27 +1166,31 @@ _nl_find_msg (struct loaded_l10nfile *domain_file,
>  #   if (((__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2) \
>  	&& !defined __UCLIBC__) \
>         || _LIBICONV_VERSION >= 0x0105
> +		      charset = strndup (charsetstr, len);
>  		      if (strchr (outcharset, '/') == NULL)
>  			{
> -			  char *tmp;
> -
> -			  len = strlen (outcharset);
> -			  tmp = (char *) alloca (len + 10 + 1);
> -			  memcpy (tmp, outcharset, len);
> -			  memcpy (tmp + len, "//TRANSLIT", 10 + 1);
> -			  outcharset = tmp;
> -
> -			  convd->conv = iconv_open (outcharset, charset);
> -
> -			  freea (outcharset);
> +			  if (asprintf (&outcharset, "%s//TRANSLIT",
> +					encoding) < 0)
> +			    outcharset = NULL;
>  			}
>  		      else
> +			  outcharset = strdup (encoding);
> +		      if (charset == NULL || outcharset == NULL)
> +			{
> +			  gl_rwlock_unlock (domain->conversions_lock);
> +			  free (outcharset);
> +			  free (charset);
> +			  free ((char *) encoding);
> +			  return NULL;
> +			}
>  #   endif
> -			convd->conv = iconv_open (outcharset, charset);
> +		      convd->conv = iconv_open (outcharset, charset);
>  #  endif
>  # endif
> -
> -		      freea (charset);
> +		      free (outcharset);
> +		      free (charset);
> +		      /* Do not free encoding here because
> +                         convd->encoding takes ownership.  */
>  		    }
>  		}
>  	    }

In fact these seems another place where a char_array could find some use
to avoid all the boilerplate of managing buffers size to appendages and
buffer management (and we could get some speed up by using the stack as
well).

> diff --git a/wcsmbs/wcsmbsload.c b/wcsmbs/wcsmbsload.c
> index 656cc0a..cf7f815 100644
> --- a/wcsmbs/wcsmbsload.c
> +++ b/wcsmbs/wcsmbsload.c
> @@ -160,7 +160,6 @@ __wcsmbs_load_conv (struct __locale_data *new_category)
>      {
>        /* We must find the real functions.  */
>        const char *charset_name;
> -      const char *complete_name;
>        struct gconv_fcts *new_fcts;
>        int use_translit;
>  
> @@ -177,8 +176,11 @@ __wcsmbs_load_conv (struct __locale_data *new_category)
>  
>        /* Normalize the name and add the slashes necessary for a
>  	 complete lookup.  */
> -      complete_name = norm_add_slashes (charset_name,
> -					use_translit ? "TRANSLIT" : "");
> +      char *complete_name = __gconv_norm_add_slashes
> +	(charset_name, strlen (charset_name),
> +	 use_translit ? "TRANSLIT" : "");
> +      if (complete_name ==NULL)
> +	goto failed;
>  
>        /* It is not necessary to use transliteration in this direction
>  	 since the internal character set is supposed to be able to
> @@ -188,6 +190,7 @@ __wcsmbs_load_conv (struct __locale_data *new_category)
>        if (new_fcts->towc != NULL)
>  	new_fcts->tomb = __wcsmbs_getfct (complete_name, "INTERNAL",
>  					  &new_fcts->tomb_nsteps);
> +      free (complete_name);
>  
>        /* If any of the conversion functions is not available we don't
>  	 use any since this would mean we cannot convert back and
>
  
Florian Weimer June 19, 2017, 10:17 p.m. UTC | #2
On 06/19/2017 11:38 PM, Adhemerval Zanella wrote:

> I think this is a good candidate for char_array usage [1]:
> 
> #define CHAR_ARRAY_INITIAL_SIZE 0
> #include <malloc/char_array-skeleton.c>
> […]

Hmm, I'll review your latest char_array patch, then.  But if we have
multiple users, we should think about out-lining more of the code.

I believe Arjun may also need a variant of this function
(norm_add_slashes) for his iconv -c fix.

>> +		      if (charset == NULL || outcharset == NULL)
>> +			{
>> +			  free ((char *) encoding);
> 
> No need to cast on free.

encoding is a const char *.

> In fact these seems another place where a char_array could find some use
> to avoid all the boilerplate of managing buffers size to appendages and
> buffer management (and we could get some speed up by using the stack as
> well).

But we can realistically make such far-reaching changes only after
eliminating the conditional compilation, and that's not something I want
to do in this patch.

Thanks,
Florian
  
Adhemerval Zanella June 20, 2017, 12:18 a.m. UTC | #3
On 19/06/2017 19:17, Florian Weimer wrote:
> On 06/19/2017 11:38 PM, Adhemerval Zanella wrote:
> 
>> I think this is a good candidate for char_array usage [1]:
>>
>> #define CHAR_ARRAY_INITIAL_SIZE 0
>> #include <malloc/char_array-skeleton.c>
>> […]
> 
> Hmm, I'll review your latest char_array patch, then.  But if we have
> multiple users, we should think about out-lining more of the code.

Yes, I will resend a new version with some out-lining code.

> 
> I believe Arjun may also need a variant of this function
> (norm_add_slashes) for his iconv -c fix.
> 
>>> +		      if (charset == NULL || outcharset == NULL)
>>> +			{
>>> +			  free ((char *) encoding);
>>
>> No need to cast on free.
> 
> encoding is a const char *.

Indeed... I think we can live with it for now, but this is really a bad idiom
and we should cleanup this later.

> 
>> In fact these seems another place where a char_array could find some use
>> to avoid all the boilerplate of managing buffers size to appendages and
>> buffer management (and we could get some speed up by using the stack as
>> well).
> 
> But we can realistically make such far-reaching changes only after
> eliminating the conditional compilation, and that's not something I want
> to do in this patch.

Fair enough, I also think we should focus on alloca elimination as well.
  

Patch

diff --git a/iconv/Makefile b/iconv/Makefile
index b2fead0..df82b0d 100644
--- a/iconv/Makefile
+++ b/iconv/Makefile
@@ -25,7 +25,8 @@  include ../Makeconfig
 headers		= iconv.h gconv.h
 routines	= iconv_open iconv iconv_close \
 		  gconv_open gconv gconv_close gconv_db gconv_conf \
-		  gconv_builtin gconv_simple gconv_trans gconv_cache
+		  gconv_builtin gconv_simple gconv_trans gconv_cache \
+		  norm_add_slashes
 routines	+= gconv_dl
 
 vpath %.c ../locale/programs ../intl
diff --git a/iconv/gconv_int.h b/iconv/gconv_int.h
index 85a67ad..7a54e0f 100644
--- a/iconv/gconv_int.h
+++ b/iconv/gconv_int.h
@@ -121,38 +121,13 @@  extern const char *__gconv_path_envvar attribute_hidden;
 __libc_lock_define (extern, __gconv_lock attribute_hidden)
 
 
-/* The gconv functions expects the name to be in upper case and complete,
-   including the trailing slashes if necessary.  */
-#define norm_add_slashes(str,suffix) \
-  ({									      \
-    const char *cp = (str);						      \
-    char *result;							      \
-    char *tmp;								      \
-    size_t cnt = 0;							      \
-    const size_t suffix_len = strlen (suffix);				      \
-									      \
-    while (*cp != '\0')							      \
-      if (*cp++ == '/')							      \
-	++cnt;								      \
-									      \
-    tmp = result = __alloca (cp - (str) + 3 + suffix_len);		      \
-    cp = (str);								      \
-    while (*cp != '\0')							      \
-      *tmp++ = __toupper_l (*cp++, _nl_C_locobj_ptr);			      \
-    if (cnt < 2)							      \
-      {									      \
-	*tmp++ = '/';							      \
-	if (cnt < 1)							      \
-	  {								      \
-	    *tmp++ = '/';						      \
-	    if (suffix_len != 0)					      \
-	      tmp = __mempcpy (tmp, suffix, suffix_len);		      \
-	  }								      \
-      }									      \
-    *tmp = '\0';							      \
-    result;								      \
-  })
-
+/* Convert NAME of NAME_LEN bytes to the form expected by the gconv
+   functions, including the trailing slashes if necessary.  The caller
+   has to free the returned string.  Return NULL on allocation
+   failure.  */
+char *__gconv_norm_add_slashes (const char *name, size_t name_len,
+				const char *suffix)
+  attribute_hidden;
 
 /* Return in *HANDLE decriptor for transformation from FROMSET to TOSET.  */
 extern int __gconv_open (const char *toset, const char *fromset,
diff --git a/iconv/norm_add_slashes.c b/iconv/norm_add_slashes.c
new file mode 100644
index 0000000..f380524
--- /dev/null
+++ b/iconv/norm_add_slashes.c
@@ -0,0 +1,54 @@ 
+/* Normalize the charset name and add a suffix with slashes.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <gconv_int.h>
+
+char *
+__gconv_norm_add_slashes (const char *name, size_t name_len,
+                          const char *suffix)
+{
+  const char *name_end  = name + name_len;
+  const char *cp = name;
+  char *result;
+  char *tmp;
+  size_t cnt = 0;
+  const size_t suffix_len = strlen (suffix);
+
+  while (cp != name_end)
+    if (*cp++ == '/')
+      ++cnt;
+
+  tmp = result = malloc (name_len + 3 + suffix_len);
+  if (result == NULL)
+    return NULL;
+  cp = name;
+  while (cp != name_end)
+    *tmp++ = __toupper_l (*cp++, _nl_C_locobj_ptr);
+  if (cnt < 2)
+    {
+      *tmp++ = '/';
+      if (cnt < 1)
+        {
+          *tmp++ = '/';
+          if (suffix_len != 0)
+            tmp = __mempcpy (tmp, suffix, suffix_len);
+        }
+    }
+  *tmp = '\0';
+  return result;
+}
diff --git a/intl/dcigettext.c b/intl/dcigettext.c
index d97746c..49127d0 100644
--- a/intl/dcigettext.c
+++ b/intl/dcigettext.c
@@ -1123,25 +1123,24 @@  _nl_find_msg (struct loaded_l10nfile *domain_file,
 		    {
 		      size_t len;
 		      char *charset;
-		      const char *outcharset;
+		      char *outcharset;
 
 		      charsetstr += strlen ("charset=");
 		      len = strcspn (charsetstr, " \t\n");
 
-		      charset = (char *) alloca (len + 1);
-# if defined _LIBC || HAVE_MEMPCPY
-		      *((char *) mempcpy (charset, charsetstr, len)) = '\0';
-# else
-		      memcpy (charset, charsetstr, len);
-		      charset[len] = '\0';
-# endif
-
-		      outcharset = encoding;
-
 # ifdef _LIBC
 		      /* We always want to use transliteration.  */
-		      outcharset = norm_add_slashes (outcharset, "TRANSLIT");
-		      charset = norm_add_slashes (charset, "");
+		      charset = __gconv_norm_add_slashes (charsetstr, len, "");
+		      outcharset = __gconv_norm_add_slashes
+			(encoding, strlen (encoding),  "TRANSLIT");
+		      if (charset == NULL || outcharset == NULL)
+			{
+			  free ((char *) encoding);
+			  free (outcharset);
+			  free (charset);
+			  goto unlock_fail;
+			}
+
 		      int r = __gconv_open (outcharset, charset, &convd->conv,
 					    GCONV_AVOID_NOCONV);
 		      if (__builtin_expect (r != __GCONV_OK, 0))
@@ -1153,6 +1152,8 @@  _nl_find_msg (struct loaded_l10nfile *domain_file,
 			    {
 			      gl_rwlock_unlock (domain->conversions_lock);
 			      free ((char *) encoding);
+			      free (outcharset);
+			      free (charset);
 			      return NULL;
 			    }
 
@@ -1165,27 +1166,31 @@  _nl_find_msg (struct loaded_l10nfile *domain_file,
 #   if (((__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2) \
 	&& !defined __UCLIBC__) \
        || _LIBICONV_VERSION >= 0x0105
+		      charset = strndup (charsetstr, len);
 		      if (strchr (outcharset, '/') == NULL)
 			{
-			  char *tmp;
-
-			  len = strlen (outcharset);
-			  tmp = (char *) alloca (len + 10 + 1);
-			  memcpy (tmp, outcharset, len);
-			  memcpy (tmp + len, "//TRANSLIT", 10 + 1);
-			  outcharset = tmp;
-
-			  convd->conv = iconv_open (outcharset, charset);
-
-			  freea (outcharset);
+			  if (asprintf (&outcharset, "%s//TRANSLIT",
+					encoding) < 0)
+			    outcharset = NULL;
 			}
 		      else
+			  outcharset = strdup (encoding);
+		      if (charset == NULL || outcharset == NULL)
+			{
+			  gl_rwlock_unlock (domain->conversions_lock);
+			  free (outcharset);
+			  free (charset);
+			  free ((char *) encoding);
+			  return NULL;
+			}
 #   endif
-			convd->conv = iconv_open (outcharset, charset);
+		      convd->conv = iconv_open (outcharset, charset);
 #  endif
 # endif
-
-		      freea (charset);
+		      free (outcharset);
+		      free (charset);
+		      /* Do not free encoding here because
+                         convd->encoding takes ownership.  */
 		    }
 		}
 	    }
diff --git a/wcsmbs/wcsmbsload.c b/wcsmbs/wcsmbsload.c
index 656cc0a..cf7f815 100644
--- a/wcsmbs/wcsmbsload.c
+++ b/wcsmbs/wcsmbsload.c
@@ -160,7 +160,6 @@  __wcsmbs_load_conv (struct __locale_data *new_category)
     {
       /* We must find the real functions.  */
       const char *charset_name;
-      const char *complete_name;
       struct gconv_fcts *new_fcts;
       int use_translit;
 
@@ -177,8 +176,11 @@  __wcsmbs_load_conv (struct __locale_data *new_category)
 
       /* Normalize the name and add the slashes necessary for a
 	 complete lookup.  */
-      complete_name = norm_add_slashes (charset_name,
-					use_translit ? "TRANSLIT" : "");
+      char *complete_name = __gconv_norm_add_slashes
+	(charset_name, strlen (charset_name),
+	 use_translit ? "TRANSLIT" : "");
+      if (complete_name ==NULL)
+	goto failed;
 
       /* It is not necessary to use transliteration in this direction
 	 since the internal character set is supposed to be able to
@@ -188,6 +190,7 @@  __wcsmbs_load_conv (struct __locale_data *new_category)
       if (new_fcts->towc != NULL)
 	new_fcts->tomb = __wcsmbs_getfct (complete_name, "INTERNAL",
 					  &new_fcts->tomb_nsteps);
+      free (complete_name);
 
       /* If any of the conversion functions is not available we don't
 	 use any since this would mean we cannot convert back and