[v6] Rewrite iconv option parsing [BZ #19519]
Commit Message
From: Arjun Shankar <arjun@redhat.com>
This commit replaces string manipulation during `iconv_open' and iconv_prog
option parsing with a structured, flag based conversion specification. In
doing so, it alters the internal `__gconv_open' interface and accordingly
adjusts its uses.
This change fixes several hangs in the iconv program and therefore includes
a new test to exercise iconv_prog options that originally led to these hangs.
It also includes a new regression test for option handling in the iconv
function.
Reviewed-by: Florian Weimer <fweimer@redhat.com>
Reviewed-by: Siddhesh Poyarekar <siddhesh@sourceware.org>
Reviewed-by: Carlos O'Donell <carlos@redhat.com>
---
v5 was here:
https://sourceware.org/pipermail/libc-alpha/2020-February/110761.html
Last major review was for v4, here:
https://sourceware.org/pipermail/libc-alpha/2020-January/110094.html
Changes since v5:
1. Use Carlos' comment for comment in __gconv_create_spec about ignoring
suffixes.
2. Avoid spawning a subshell unnecessarily in tst-iconv_prog.sh, as per
Andreas' suggestion.
Rebased and re-tested against master.
---
iconv/Makefile | 17 +-
iconv/Versions | 1 +
iconv/gconv_charset.c | 218 +++++++++++++++++++++++++
iconv/gconv_charset.h | 61 ++++++-
iconv/gconv_int.h | 19 ++-
iconv/gconv_open.c | 64 ++------
iconv/iconv_open.c | 46 +-----
iconv/iconv_prog.c | 63 +++-----
iconv/tst-iconv-opt.c | 347 ++++++++++++++++++++++++++++++++++++++++
iconv/tst-iconv_prog.sh | 280 ++++++++++++++++++++++++++++++++
intl/dcigettext.c | 15 +-
11 files changed, 988 insertions(+), 143 deletions(-)
create mode 100644 iconv/gconv_charset.c
create mode 100644 iconv/tst-iconv-opt.c
create mode 100644 iconv/tst-iconv_prog.sh
Comments
On Jul 07 2020, Arjun Shankar wrote:
> +for hangcommand in "${hangarray[@]}"; do
> + twobyte="$(echo "$hangcommand" | cut -d";" -f 1)"
> + c="$(echo "$hangcommand" | cut -d";" -f 2)"
> + from="$(echo "$hangcommand" | cut -d";" -f 3)"
> + to="$(echo "$hangcommand" | cut -d";" -f 4)"
That could be replaced by doing an IFS split with IFS=';'.
IFS=';' args=($hangcommand)
and then referring to ${args[0]} etc. (or similar).
Andreas.
On 7/7/20 11:05 AM, Arjun Shankar wrote:
> From: Arjun Shankar <arjun@redhat.com>
>
> This commit replaces string manipulation during `iconv_open' and iconv_prog
> option parsing with a structured, flag based conversion specification. In
> doing so, it alters the internal `__gconv_open' interface and accordingly
> adjusts its uses.
>
> This change fixes several hangs in the iconv program and therefore includes
> a new test to exercise iconv_prog options that originally led to these hangs.
> It also includes a new regression test for option handling in the iconv
> function.
OK for master.
Plese push this so we can include the fixes in 2.32.
I see Andreas has some suggestions for improving the shell code, but we can
always clean that up with further changes. I know you have some ideas for
improving the test coverage and we can tidy this up later.
> Reviewed-by: Florian Weimer <fweimer@redhat.com>
> Reviewed-by: Siddhesh Poyarekar <siddhesh@sourceware.org>
> Reviewed-by: Carlos O'Donell <carlos@redhat.com>
> ---
> v5 was here:
> https://sourceware.org/pipermail/libc-alpha/2020-February/110761.html
> Last major review was for v4, here:
> https://sourceware.org/pipermail/libc-alpha/2020-January/110094.html
> Changes since v5:
> 1. Use Carlos' comment for comment in __gconv_create_spec about ignoring
> suffixes.
> 2. Avoid spawning a subshell unnecessarily in tst-iconv_prog.sh, as per
> Andreas' suggestion.
>
> Rebased and re-tested against master.
> ---
> iconv/Makefile | 17 +-
> iconv/Versions | 1 +
> iconv/gconv_charset.c | 218 +++++++++++++++++++++++++
> iconv/gconv_charset.h | 61 ++++++-
> iconv/gconv_int.h | 19 ++-
> iconv/gconv_open.c | 64 ++------
> iconv/iconv_open.c | 46 +-----
> iconv/iconv_prog.c | 63 +++-----
> iconv/tst-iconv-opt.c | 347 ++++++++++++++++++++++++++++++++++++++++
> iconv/tst-iconv_prog.sh | 280 ++++++++++++++++++++++++++++++++
> intl/dcigettext.c | 15 +-
> 11 files changed, 988 insertions(+), 143 deletions(-)
> create mode 100644 iconv/gconv_charset.c
> create mode 100644 iconv/tst-iconv-opt.c
> create mode 100644 iconv/tst-iconv_prog.sh
>
> diff --git a/iconv/Makefile b/iconv/Makefile
> index b8fe8c47df..3349dfeccd 100644
> --- a/iconv/Makefile
> +++ b/iconv/Makefile
> @@ -26,7 +26,7 @@ headers = iconv.h gconv.h
> routines = iconv_open iconv iconv_close \
> gconv_open gconv gconv_close gconv_db gconv_conf \
> gconv_builtin gconv_simple gconv_trans gconv_cache
> -routines += gconv_dl
> +routines += gconv_dl gconv_charset
OK.
>
> vpath %.c ../locale/programs ../intl
>
> @@ -44,7 +44,7 @@ CFLAGS-linereader.c += -DNO_TRANSLITERATION
> CFLAGS-simple-hash.c += -I../locale
>
> tests = tst-iconv1 tst-iconv2 tst-iconv3 tst-iconv4 tst-iconv5 tst-iconv6 \
> - tst-iconv7 tst-iconv-mt
> + tst-iconv7 tst-iconv-mt tst-iconv-opt
OK.
>
> others = iconv_prog iconvconfig
> install-others-programs = $(inst_bindir)/iconv
> @@ -61,6 +61,7 @@ include $(patsubst %,$(..)libof-iterator.mk,$(cpp-srcs-left))
>
> ifeq ($(run-built-tests),yes)
> xtests-special += $(objpfx)test-iconvconfig.out
> +tests-special += $(objpfx)tst-iconv_prog.out
OK.
> endif
>
> # Make a copy of the file because gconv module names are constructed
> @@ -81,6 +82,13 @@ endif
>
> include ../Rules
>
> +ifeq ($(run-built-tests),yes)
> +LOCALES := en_US.UTF-8
> +include ../gen-locales.mk
> +
> +$(objpfx)tst-iconv-opt.out: $(gen-locales)
OK.
> +endif
> +
> $(inst_bindir)/iconv: $(objpfx)iconv_prog $(+force)
> $(do-install-program)
>
> @@ -95,3 +103,8 @@ $(objpfx)test-iconvconfig.out: /dev/null $(objpfx)iconvconfig
> cmp $$tmp $(inst_gconvdir)/gconv-modules.cache; \
> rm -f $$tmp) > $@; \
> $(evaluate-test)
> +
> +$(objpfx)tst-iconv_prog.out: tst-iconv_prog.sh $(objpfx)iconv_prog
> + $(SHELL) $< $(common-objdir) '$(test-wrapper-env)' \
> + '$(run-program-env)' > $@; \
> + $(evaluate-test)
OK.
> diff --git a/iconv/Versions b/iconv/Versions
> index 60ab10a277..8a5f4cf780 100644
> --- a/iconv/Versions
> +++ b/iconv/Versions
> @@ -6,6 +6,7 @@ libc {
> GLIBC_PRIVATE {
> # functions shared with iconv program
> __gconv_get_alias_db; __gconv_get_cache; __gconv_get_modules_db;
> + __gconv_open; __gconv_create_spec;
OK.
>
> # function used by the gconv modules
> __gconv_transliterate;
> diff --git a/iconv/gconv_charset.c b/iconv/gconv_charset.c
> new file mode 100644
> index 0000000000..6ccd0773cc
> --- /dev/null
> +++ b/iconv/gconv_charset.c
> @@ -0,0 +1,218 @@
> +/* Charset name normalization.
> + Copyright (C) 2020 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <http://www.gnu.org/licenses/>. */
> +
> +
> +#include <stdlib.h>
> +#include <ctype.h>
> +#include <locale.h>
> +#include <stdbool.h>
> +#include <string.h>
> +#include <sys/stat.h>
> +#include "gconv_int.h"
> +#include "gconv_charset.h"
> +
> +
> +/* This function returns a pointer to the last suffix in a conversion code
> + string. Valid suffixes matched by this function are of the form: '/' or ','
> + followed by arbitrary text that doesn't contain '/' or ','. It does not
> + edit the string in any way. The caller is expected to parse the suffix and
> + remove it (by e.g. truncating the string) before the next call. */
> +static char *
> +find_suffix (char *s)
> +{
> + /* The conversion code is in the form of a triplet, separated by '/' chars.
> + The third component of the triplet contains suffixes. If we don't have two
> + slashes, we don't have a suffix. */
> +
> + int slash_count = 0;
> + char *suffix_term = NULL;
> +
> + for (int i = 0; s[i] != '\0'; i++)
> + switch (s[i])
> + {
> + case '/':
> + slash_count++;
> + /* Fallthrough */
> + case ',':
> + suffix_term = &s[i];
> + }
> +
> + if (slash_count >= 2)
> + return suffix_term;
> +
> + return NULL;
> +}
OK.
> +
> +
> +struct gconv_parsed_code
> +{
> + char *code;
> + bool translit;
> + bool ignore;
> +};
> +
> +
> +/* This function parses an iconv_open encoding PC.CODE, strips any suffixes
> + (such as TRANSLIT or IGNORE) from it and sets corresponding flags in it. */
> +static void
> +gconv_parse_code (struct gconv_parsed_code *pc)
> +{
> + pc->translit = false;
> + pc->ignore = false;
> +
> + while (1)
> + {
> + /* First drop any trailing whitespaces and separators. */
> + size_t len = strlen (pc->code);
> + while ((len > 0)
> + && (isspace (pc->code[len - 1])
> + || pc->code[len - 1] == ','
> + || pc->code[len - 1] == '/'))
> + len--;
> +
> + pc->code[len] = '\0';
> +
> + if (len == 0)
> + return;
> +
> + char * suffix = find_suffix (pc->code);
> + if (suffix == NULL)
> + {
> + /* At this point, we have processed and removed all suffixes from the
> + code and what remains of the code is suffix free. */
> + return;
> + }
> + else
> + {
> + /* A suffix is processed from the end of the code array going
> + backwards, one suffix at a time. The suffix is an index into the
> + code character array and points to: one past the end of the code
> + and any unprocessed suffixes, and to the beginning of the suffix
> + currently being processed during this iteration. We must process
> + this suffix and then drop it from the code by terminating the
> + preceding text with NULL.
> +
> + We want to allow and recognize suffixes such as:
> +
> + "/TRANSLIT" i.e. single suffix
> + "//TRANSLIT" i.e. single suffix and multiple separators
> + "//TRANSLIT/IGNORE" i.e. suffixes separated by "/"
> + "/TRANSLIT//IGNORE" i.e. suffixes separated by "//"
> + "//IGNORE,TRANSLIT" i.e. suffixes separated by ","
> + "//IGNORE," i.e. trailing ","
> + "//TRANSLIT/" i.e. trailing "/"
> + "//TRANSLIT//" i.e. trailing "//"
> + "/" i.e. empty suffix.
> +
> + Unknown suffixes are silently discarded and ignored. */
> +
> + if ((__strcasecmp_l (suffix,
> + GCONV_TRIPLE_SEPARATOR
> + GCONV_TRANSLIT_SUFFIX,
> + _nl_C_locobj_ptr) == 0)
> + || (__strcasecmp_l (suffix,
> + GCONV_SUFFIX_SEPARATOR
> + GCONV_TRANSLIT_SUFFIX,
> + _nl_C_locobj_ptr) == 0))
> + pc->translit = true;
> +
> + if ((__strcasecmp_l (suffix,
> + GCONV_TRIPLE_SEPARATOR
> + GCONV_IGNORE_ERRORS_SUFFIX,
> + _nl_C_locobj_ptr) == 0)
> + || (__strcasecmp_l (suffix,
> + GCONV_SUFFIX_SEPARATOR
> + GCONV_IGNORE_ERRORS_SUFFIX,
> + _nl_C_locobj_ptr) == 0))
> + pc->ignore = true;
> +
> + /* We just processed this suffix. We can now drop it from the
> + code string by truncating it at the suffix's position. */
> + suffix[0] = '\0';
> + }
> + }
> +}
OK.
> +
> +
> +/* This function accepts the charset names of the source and destination of the
> + conversion and populates *conv_spec with an equivalent conversion
> + specification that may later be used by __gconv_open. The charset names
> + might contain options in the form of suffixes that alter the conversion,
> + e.g. "ISO-10646/UTF-8/TRANSLIT". It processes the charset names, ignoring
> + and truncating any suffix options in fromcode, and processing and truncating
> + any suffix options in tocode. Supported suffix options ("TRANSLIT" or
> + "IGNORE") when found in tocode lead to the corresponding flag in *conv_spec
> + to be set to true. Unrecognized suffix options are silently discarded. If
> + the function succeeds, it returns conv_spec back to the caller. It returns
> + NULL upon failure. conv_spec must be allocated and freed by the caller. */
> +struct gconv_spec *
> +__gconv_create_spec (struct gconv_spec *conv_spec, const char *fromcode,
> + const char *tocode)
> +{
> + struct gconv_parsed_code pfc, ptc;
> + struct gconv_spec *ret = NULL;
> +
> + pfc.code = __strdup (fromcode);
> + ptc.code = __strdup (tocode);
> +
> + if ((pfc.code == NULL)
> + || (ptc.code == NULL))
> + goto out;
> +
> + gconv_parse_code (&pfc);
> + gconv_parse_code (&ptc);
> +
> + /* We ignore suffixes in the fromcode because that is how the current
> + implementation has always handled them. Only suffixes in the tocode are
> + processed and handled. The reality is that invalid input in the input
> + character set should only be ignored if the fromcode specifies IGNORE.
> + The current implementation ignores invalid intput in the input character
> + set if the tocode contains IGNORE. We preserve this behavior for
> + backwards compatibility. In the future we may split the handling of
> + IGNORE to allow a finer grained specification of ignorning invalid input
> + and/or ignoring invalid output. */
OK.
> + conv_spec->translit = ptc.translit;
> + conv_spec->ignore = ptc.ignore;
> +
> + /* 3 extra bytes because 1 extra for '\0', and 2 extra so strip might
> + be able to add one or two trailing '/' characters if necessary. */
> + conv_spec->fromcode = malloc (strlen (fromcode) + 3);
> + if (conv_spec->fromcode == NULL)
> + goto out;
> +
> + conv_spec->tocode = malloc (strlen (tocode) + 3);
> + if (conv_spec->tocode == NULL)
> + {
> + free (conv_spec->fromcode);
> + conv_spec->fromcode = NULL;
> + goto out;
> + }
> +
> + /* Strip unrecognized characters and ensure that the code has two '/'
> + characters as per conversion code triplet specification. */
> + strip (conv_spec->fromcode, pfc.code);
> + strip (conv_spec->tocode, ptc.code);
> + ret = conv_spec;
> +
> +out:
> + free (pfc.code);
> + free (ptc.code);
> +
> + return ret;
> +}
> +libc_hidden_def (__gconv_create_spec)
> diff --git a/iconv/gconv_charset.h b/iconv/gconv_charset.h
> index 348acc089b..b39b09aea1 100644
> --- a/iconv/gconv_charset.h
> +++ b/iconv/gconv_charset.h
> @@ -19,9 +19,68 @@
>
> #include <ctype.h>
> #include <locale.h>
> +#include <stdbool.h>
> +#include <string.h>
> +#include <sys/stat.h>
> +#include <stdlib.h>
> +#include "gconv_int.h"
>
>
> -static void
> +/* An iconv encoding is in the form of a triplet, with parts separated by
> + a '/' character. The first part is the standard name, the second part is
> + the character set, and the third part is the error handler. If the first
> + part is sufficient to identify both the standard and the character set
> + then the second part can be empty e.g. UTF-8//. If the first part is not
> + sufficient to identify both the standard and the character set then the
> + second part is required e.g. ISO-10646/UTF8/. If neither the first or
> + second parts are provided e.g. //, then the current locale is used.
> + The actual values used in the first and second parts are not entirely
> + relevant to the implementation. The values themselves are used in a hash
> + table to lookup modules and so the naming convention of the first two parts
> + is somewhat arbitrary and only helps locate the entries in the cache.
> + The third part is the error handler and is comprised of a ',' or '/'
> + separated list of suffixes. Currently, we support "TRANSLIT" for
> + transliteration and "IGNORE" for ignoring conversion errors due to
> + unrecognized input characters. */
> +#define GCONV_TRIPLE_SEPARATOR "/"
> +#define GCONV_SUFFIX_SEPARATOR ","
> +#define GCONV_TRANSLIT_SUFFIX "TRANSLIT"
> +#define GCONV_IGNORE_ERRORS_SUFFIX "IGNORE"
> +
> +
> +/* This function accepts the charset names of the source and destination of the
> + conversion and populates *conv_spec with an equivalent conversion
> + specification that may later be used by __gconv_open. The charset names
> + might contain options in the form of suffixes that alter the conversion,
> + e.g. "ISO-10646/UTF-8/TRANSLIT". It processes the charset names, ignoring
> + and truncating any suffix options in fromcode, and processing and truncating
> + any suffix options in tocode. Supported suffix options ("TRANSLIT" or
> + "IGNORE") when found in tocode lead to the corresponding flag in *conv_spec
> + to be set to true. Unrecognized suffix options are silently discarded. If
> + the function succeeds, it returns conv_spec back to the caller. It returns
> + NULL upon failure. */
> +struct gconv_spec *
> +__gconv_create_spec (struct gconv_spec *conv_spec, const char *fromcode,
> + const char *tocode);
> +libc_hidden_proto (__gconv_create_spec)
> +
> +
> +/* This function frees all heap memory allocated by __gconv_create_spec. */
> +static void __attribute__ ((unused))
> +gconv_destroy_spec (struct gconv_spec *conv_spec)
> +{
> + free (conv_spec->fromcode);
> + free (conv_spec->tocode);
> + return;
> +}
> +
> +
> +/* This function copies in-order, characters from the source 's' that are
> + either alpha-numeric or one in one of these: "_-.,:/" - into the destination
> + 'wp' while dropping all other characters. In the process, it converts all
> + alphabetical characters to upper case. It then appends up to two '/'
> + characters so that the total number of '/'es in the destination is 2. */
> +static inline void __attribute__ ((unused, always_inline))
> strip (char *wp, const char *s)
> {
> int slash_count = 0;
> diff --git a/iconv/gconv_int.h b/iconv/gconv_int.h
> index fbaf12cee2..e86938dae7 100644
> --- a/iconv/gconv_int.h
> +++ b/iconv/gconv_int.h
> @@ -75,6 +75,15 @@ struct gconv_module
> };
>
>
> +/* The specification of the conversion that needs to be performed. */
> +struct gconv_spec
> +{
> + char *fromcode;
> + char *tocode;
> + bool translit;
> + bool ignore;
> +};
> +
> /* Flags for `gconv_open'. */
> enum
> {
> @@ -136,10 +145,12 @@ __libc_lock_define (extern, __gconv_lock attribute_hidden)
> })
>
>
> -/* Return in *HANDLE decriptor for transformation from FROMSET to TOSET. */
> -extern int __gconv_open (const char *toset, const char *fromset,
> - __gconv_t *handle, int flags)
> - attribute_hidden;
> +/* Return in *HANDLE, a decriptor for the transformation. The function expects
> + the specification of the transformation in the structure pointed to by
> + CONV_SPEC. It only reads *CONV_SPEC and does not take ownership of it. */
> +extern int __gconv_open (struct gconv_spec *conv_spec,
> + __gconv_t *handle, int flags);
> +libc_hidden_proto (__gconv_open)
>
> /* Free resources associated with transformation descriptor CD. */
> extern int __gconv_close (__gconv_t cd)
> diff --git a/iconv/gconv_open.c b/iconv/gconv_open.c
> index b39626d252..2878620957 100644
> --- a/iconv/gconv_open.c
> +++ b/iconv/gconv_open.c
> @@ -31,7 +31,7 @@
>
>
> int
> -__gconv_open (const char *toset, const char *fromset, __gconv_t *handle,
> +__gconv_open (struct gconv_spec *conv_spec, __gconv_t *handle,
> int flags)
> {
> struct __gconv_step *steps;
> @@ -40,77 +40,38 @@ __gconv_open (const char *toset, const char *fromset, __gconv_t *handle,
> size_t cnt = 0;
> int res;
> int conv_flags = 0;
> - const char *errhand;
> - const char *ignore;
> bool translit = false;
> + char *tocode, *fromcode;
>
> /* Find out whether any error handling method is specified. */
> - errhand = strchr (toset, '/');
> - if (errhand != NULL)
> - errhand = strchr (errhand + 1, '/');
> - if (__glibc_likely (errhand != NULL))
> - {
> - if (*++errhand == '\0')
> - errhand = NULL;
> - else
> - {
> - /* Make copy without the error handling description. */
> - char *newtoset = (char *) alloca (errhand - toset + 1);
> - char *tok;
> - char *ptr = NULL /* Work around a bogus warning */;
> -
> - newtoset[errhand - toset] = '\0';
> - toset = memcpy (newtoset, toset, errhand - toset);
> + translit = conv_spec->translit;
>
> - /* Find the appropriate transliteration handlers. */
> - tok = strdupa (errhand);
> + if (conv_spec->ignore)
> + conv_flags |= __GCONV_IGNORE_ERRORS;
>
> - tok = __strtok_r (tok, ",", &ptr);
> - while (tok != NULL)
> - {
> - if (__strcasecmp_l (tok, "TRANSLIT", _nl_C_locobj_ptr) == 0)
> - translit = true;
> - else if (__strcasecmp_l (tok, "IGNORE", _nl_C_locobj_ptr) == 0)
> - /* Set the flag to ignore all errors. */
> - conv_flags |= __GCONV_IGNORE_ERRORS;
> -
> - tok = __strtok_r (NULL, ",", &ptr);
> - }
> - }
> - }
> -
> - /* For the source character set we ignore the error handler specification.
> - XXX Is this really always the best? */
> - ignore = strchr (fromset, '/');
> - if (ignore != NULL && (ignore = strchr (ignore + 1, '/')) != NULL
> - && *++ignore != '\0')
> - {
> - char *newfromset = (char *) alloca (ignore - fromset + 1);
> -
> - newfromset[ignore - fromset] = '\0';
> - fromset = memcpy (newfromset, fromset, ignore - fromset);
> - }
> + tocode = conv_spec->tocode;
> + fromcode = conv_spec->fromcode;
>
> /* If the string is empty define this to mean the charset of the
> currently selected locale. */
> - if (strcmp (toset, "//") == 0)
> + if (strcmp (tocode, "//") == 0)
> {
> const char *codeset = _NL_CURRENT (LC_CTYPE, CODESET);
> size_t len = strlen (codeset);
> char *dest;
> - toset = dest = (char *) alloca (len + 3);
> + tocode = dest = (char *) alloca (len + 3);
> memcpy (__mempcpy (dest, codeset, len), "//", 3);
> }
> - if (strcmp (fromset, "//") == 0)
> + if (strcmp (fromcode, "//") == 0)
> {
> const char *codeset = _NL_CURRENT (LC_CTYPE, CODESET);
> size_t len = strlen (codeset);
> char *dest;
> - fromset = dest = (char *) alloca (len + 3);
> + fromcode = dest = (char *) alloca (len + 3);
> memcpy (__mempcpy (dest, codeset, len), "//", 3);
> }
>
> - res = __gconv_find_transform (toset, fromset, &steps, &nsteps, flags);
> + res = __gconv_find_transform (tocode, fromcode, &steps, &nsteps, flags);
> if (res == __GCONV_OK)
> {
> /* Allocate room for handle. */
> @@ -209,3 +170,4 @@ __gconv_open (const char *toset, const char *fromset, __gconv_t *handle,
> *handle = result;
> return res;
> }
> +libc_hidden_def (__gconv_open)
> diff --git a/iconv/iconv_open.c b/iconv/iconv_open.c
> index 687067070a..dd54bc12e0 100644
> --- a/iconv/iconv_open.c
> +++ b/iconv/iconv_open.c
> @@ -31,49 +31,15 @@
> iconv_t
> iconv_open (const char *tocode, const char *fromcode)
> {
> - /* Normalize the name. We remove all characters beside alpha-numeric,
> - '_', '-', '/', '.', and ':'. */
> - size_t tocode_len = strlen (tocode) + 3;
> - char *tocode_conv;
> - bool tocode_usealloca = __libc_use_alloca (tocode_len);
> - if (tocode_usealloca)
> - tocode_conv = (char *) alloca (tocode_len);
> - else
> - {
> - tocode_conv = (char *) malloc (tocode_len);
> - if (tocode_conv == NULL)
> - return (iconv_t) -1;
> - }
> - strip (tocode_conv, tocode);
> - tocode = (tocode_conv[2] == '\0' && tocode[0] != '\0'
> - ? upstr (tocode_conv, tocode) : tocode_conv);
> + __gconv_t cd;
> + struct gconv_spec conv_spec;
>
> - size_t fromcode_len = strlen (fromcode) + 3;
> - char *fromcode_conv;
> - bool fromcode_usealloca = __libc_use_alloca (fromcode_len);
> - if (fromcode_usealloca)
> - fromcode_conv = (char *) alloca (fromcode_len);
> - else
> - {
> - fromcode_conv = (char *) malloc (fromcode_len);
> - if (fromcode_conv == NULL)
> - {
> - if (! tocode_usealloca)
> - free (tocode_conv);
> - return (iconv_t) -1;
> - }
> - }
> - strip (fromcode_conv, fromcode);
> - fromcode = (fromcode_conv[2] == '\0' && fromcode[0] != '\0'
> - ? upstr (fromcode_conv, fromcode) : fromcode_conv);
> + if (__gconv_create_spec (&conv_spec, fromcode, tocode) == NULL)
> + return (iconv_t) -1;
>
> - __gconv_t cd;
> - int res = __gconv_open (tocode, fromcode, &cd, 0);
> + int res = __gconv_open (&conv_spec, &cd, 0);
>
> - if (! fromcode_usealloca)
> - free (fromcode_conv);
> - if (! tocode_usealloca)
> - free (tocode_conv);
> + gconv_destroy_spec (&conv_spec);
>
> if (__builtin_expect (res, __GCONV_OK) != __GCONV_OK)
> {
> diff --git a/iconv/iconv_prog.c b/iconv/iconv_prog.c
> index 9709e4a701..b4334faa57 100644
> --- a/iconv/iconv_prog.c
> +++ b/iconv/iconv_prog.c
> @@ -39,6 +39,7 @@
> #include <gconv_int.h>
> #include "iconv_prog.h"
> #include "iconvconfig.h"
> +#include "gconv_charset.h"
>
> /* Get libc version number. */
> #include "../version.h"
> @@ -118,8 +119,7 @@ main (int argc, char *argv[])
> {
> int status = EXIT_SUCCESS;
> int remaining;
> - iconv_t cd;
> - const char *orig_to_code;
> + __gconv_t cd;
> struct charmap_t *from_charmap = NULL;
> struct charmap_t *to_charmap = NULL;
>
> @@ -139,39 +139,6 @@ main (int argc, char *argv[])
> exit (EXIT_SUCCESS);
> }
>
> - /* If we have to ignore errors make sure we use the appropriate name for
> - the to-character-set. */
> - orig_to_code = to_code;
> - if (omit_invalid)
> - {
> - const char *errhand = strchrnul (to_code, '/');
> - int nslash = 2;
> - char *newp;
> - char *cp;
> -
> - if (*errhand == '/')
> - {
> - --nslash;
> - errhand = strchrnul (errhand + 1, '/');
> -
> - if (*errhand == '/')
> - {
> - --nslash;
> - errhand = strchr (errhand, '\0');
> - }
> - }
> -
> - newp = (char *) alloca (errhand - to_code + nslash + 7 + 1);
> - cp = mempcpy (newp, to_code, errhand - to_code);
> - while (nslash-- > 0)
> - *cp++ = '/';
> - if (cp[-1] != '/')
> - *cp++ = ',';
> - memcpy (cp, "IGNORE", sizeof ("IGNORE"));
> -
> - to_code = newp;
> - }
> -
> /* POSIX 1003.2b introduces a silly thing: the arguments to -t anf -f
> can be file names of charmaps. In this case iconv will have to read
> those charmaps and use them to do the conversion. But there are
> @@ -184,10 +151,10 @@ main (int argc, char *argv[])
> file. */
> from_charmap = charmap_read (from_code, /*0, 1*/1, 0, 0, 0);
>
> - if (strchr (orig_to_code, '/') != NULL)
> + if (strchr (to_code, '/') != NULL)
> /* The to-name might be a charmap file name. Try reading the
> file. */
> - to_charmap = charmap_read (orig_to_code, /*0, 1,*/1, 0, 0, 0);
> + to_charmap = charmap_read (to_code, /*0, 1,*/1, 0, 0, 0);
>
>
> /* At this point we have to handle two cases. The first one is
> @@ -201,9 +168,25 @@ main (int argc, char *argv[])
> argc, remaining, argv, output_file);
> else
> {
> + struct gconv_spec conv_spec;
> + int res;
> +
> + if (__gconv_create_spec (&conv_spec, from_code, to_code) == NULL)
> + {
> + error (EXIT_FAILURE, errno,
> + _("failed to start conversion processing"));
> + exit (1);
> + }
> +
> + if (omit_invalid)
> + conv_spec.ignore = true;
> +
> /* Let's see whether we have these coded character sets. */
> - cd = iconv_open (to_code, from_code);
> - if (cd == (iconv_t) -1)
> + res = __gconv_open (&conv_spec, &cd, 0);
> +
> + gconv_destroy_spec (&conv_spec);
> +
> + if (res != __GCONV_OK)
> {
> if (errno == EINVAL)
> {
> @@ -221,7 +204,7 @@ main (int argc, char *argv[])
> const char *from_pretty =
> (from_code[0] ? from_code : nl_langinfo (CODESET));
> const char *to_pretty =
> - (orig_to_code[0] ? orig_to_code : nl_langinfo (CODESET));
> + (to_code[0] ? to_code : nl_langinfo (CODESET));
>
> if (from_wrong)
> {
> diff --git a/iconv/tst-iconv-opt.c b/iconv/tst-iconv-opt.c
> new file mode 100644
> index 0000000000..669d812a6a
> --- /dev/null
> +++ b/iconv/tst-iconv-opt.c
> @@ -0,0 +1,347 @@
> +/* Test iconv's TRANSLIT and IGNORE option handling
> +
> + Copyright (C) 2020 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +
> +#include <iconv.h>
> +#include <locale.h>
> +#include <errno.h>
> +#include <string.h>
> +#include <support/support.h>
> +#include <support/check.h>
> +
> +
> +/* Run one iconv test. Arguments:
> + to: destination character set and options
> + from: source character set
> + input: input string to be converted
> + exp_in: expected number of bytes consumed
> + exp_ret: expected return value (error or number of irreversible conversions)
> + exp_out: expected output string
> + exp_err: expected value of `errno' after iconv returns. */
> +static void
> +test_iconv (const char *to, const char *from, char *input, size_t exp_in,
> + size_t exp_ret, const char *exp_out, int exp_err)
> +{
> + iconv_t cd;
> + char outbuf[500];
> + size_t inlen, outlen;
> + char *inptr, *outptr;
> + size_t n;
> +
> + cd = iconv_open (to, from);
> + TEST_VERIFY (cd != (iconv_t) -1);
> +
> + inlen = strlen (input);
> + outlen = sizeof (outbuf);
> + inptr = input;
> + outptr = outbuf;
> +
> + errno = 0;
> + n = iconv (cd, &inptr, &inlen, &outptr, &outlen);
> +
> + TEST_COMPARE (n, exp_ret);
> + TEST_VERIFY (inptr == input + exp_in);
> + TEST_COMPARE (errno, exp_err);
> + TEST_COMPARE_BLOB (outbuf, outptr - outbuf, exp_out, strlen (exp_out));
> + TEST_VERIFY (iconv_close (cd) == 0);
> +}
> +
> +
> +/* We test option parsing by converting UTF-8 inputs to ASCII under various
> + option combinations. The UTF-8 inputs fall into three categories:
> + - ASCII-only,
> + - non-ASCII,
> + - non-ASCII with invalid UTF-8 characters. */
> +
> +/* 1. */
> +char ascii[] = "Just some ASCII text";
> +
> +/* 2. Valid UTF-8 input and some corresponding expected outputs with various
> + options. The two non-ASCII characters below are accented alphabets:
> + an `a' then an `o'. */
> +char utf8[] = "UTF-8 text with \u00E1 couple \u00F3f non-ASCII characters";
> +char u2a[] = "UTF-8 text with ";
> +char u2a_translit[] = "UTF-8 text with a couple of non-ASCII characters";
> +char u2a_ignore[] = "UTF-8 text with couple f non-ASCII characters";
> +
> +/* 3. Invalid UTF-8 input and some corresponding expected outputs. \xff is
> + invalid UTF-8. It's followed by some valid but non-ASCII UTF-8. */
> +char iutf8[] = "Invalid UTF-8 \xff\u27E6text\u27E7";
> +char iu2a[] = "Invalid UTF-8 ";
> +char iu2a_ignore[] = "Invalid UTF-8 text";
> +char iu2a_both[] = "Invalid UTF-8 [|text|]";
> +
> +/* 4. Another invalid UTF-8 input and corresponding expected outputs. This time
> + the valid non-ASCII UTF-8 characters appear before the invalid \xff. */
> +char jutf8[] = "Invalid \u27E6UTF-8\u27E7 \xfftext";
> +char ju2a[] = "Invalid ";
> +char ju2a_translit[] = "Invalid [|UTF-8|] ";
> +char ju2a_ignore[] = "Invalid UTF-8 text";
> +char ju2a_both[] = "Invalid [|UTF-8|] text";
> +
> +/* We also test option handling for character set names that have the form
> + "A/B". In this test, we test conversions "ISO-10646/UTF-8", and either
> + ISO-8859-1 or ASCII. */
> +
> +/* 5. Accented 'A' and 'a' characters in ISO-8859-1 and UTF-8, and an
> + equivalent ASCII transliteration. */
> +char iso8859_1_a[] = {0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, /* Accented A's. */
> + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, /* Accented a's. */
> + 0x00};
> +char utf8_a[] = "\u00C0\u00C1\u00C2\u00C3\u00C4\u00C5"
> + "\u00E0\u00E1\u00E2\u00E3\u00E4\u00E5";
> +char ascii_a[] = "AAAAAAaaaaaa";
> +
> +/* 6. An invalid ASCII string where [0] is invalid and [1] is '~'. */
> +char iascii [] = {0x80, '~', '\0'};
> +char empty[] = "";
> +char ia2u_ignore[] = "~";
> +
> +static int
> +do_test (void)
> +{
> + xsetlocale (LC_ALL, "en_US.UTF-8");
> +
> +
> + /* 0. iconv_open should gracefully fail for invalid character sets. */
> +
> + TEST_VERIFY (iconv_open ("INVALID", "UTF-8") == (iconv_t) -1);
> + TEST_VERIFY (iconv_open ("UTF-8", "INVALID") == (iconv_t) -1);
> + TEST_VERIFY (iconv_open ("INVALID", "INVALID") == (iconv_t) -1);
> +
> +
> + /* 1. ASCII-only UTF-8 input should convert to ASCII with no changes: */
> +
> + test_iconv ("ASCII", "UTF-8", ascii, strlen (ascii), 0, ascii, 0);
> + test_iconv ("ASCII//", "UTF-8", ascii, strlen (ascii), 0, ascii, 0);
> + test_iconv ("ASCII//TRANSLIT", "UTF-8", ascii, strlen (ascii), 0, ascii, 0);
> + test_iconv ("ASCII//TRANSLIT//", "UTF-8", ascii, strlen (ascii), 0, ascii,
> + 0);
> + test_iconv ("ASCII//IGNORE", "UTF-8", ascii, strlen (ascii), 0, ascii, 0);
> + test_iconv ("ASCII//IGNORE//", "UTF-8", ascii, strlen (ascii), 0, ascii, 0);
> +
> +
> + /* 2. Valid UTF-8 input with non-ASCII characters: */
> +
> + /* EILSEQ when converted to ASCII. */
> + test_iconv ("ASCII", "UTF-8", utf8, strlen (u2a), (size_t) -1, u2a, EILSEQ);
> +
> + /* Converted without error with TRANSLIT enabled. */
> + test_iconv ("ASCII//TRANSLIT", "UTF-8", utf8, strlen (utf8), 2, u2a_translit,
> + 0);
> +
> + /* EILSEQ with IGNORE enabled. Non-ASCII chars dropped from output. */
> + test_iconv ("ASCII//IGNORE", "UTF-8", utf8, strlen (utf8), (size_t) -1,
> + u2a_ignore, EILSEQ);
> +
> + /* With TRANSLIT and IGNORE enabled, transliterated without error. We test
> + four combinations. */
> +
> + test_iconv ("ASCII//TRANSLIT,IGNORE", "UTF-8", utf8, strlen (utf8), 2,
> + u2a_translit, 0);
> + test_iconv ("ASCII//TRANSLIT//IGNORE", "UTF-8", utf8, strlen (utf8), 2,
> + u2a_translit, 0);
> + test_iconv ("ASCII//IGNORE,TRANSLIT", "UTF-8", utf8, strlen (utf8), 2,
> + u2a_translit, 0);
> + /* Due to bug 19519, iconv was ignoring TRANSLIT for the following input. */
> + test_iconv ("ASCII//IGNORE//TRANSLIT", "UTF-8", utf8, strlen (utf8), 2,
> + u2a_translit, 0);
> +
> + /* Misspellings of TRANSLIT and IGNORE are ignored, but conversion still
> + works while respecting any other correctly spelled options. */
> +
> + test_iconv ("ASCII//T", "UTF-8", utf8, strlen (u2a), (size_t) -1, u2a,
> + EILSEQ);
> + test_iconv ("ASCII//TRANSLITERATE", "UTF-8", utf8, strlen (u2a), (size_t) -1,
> + u2a, EILSEQ);
> + test_iconv ("ASCII//I", "UTF-8", utf8, strlen (u2a), (size_t) -1, u2a,
> + EILSEQ);
> + test_iconv ("ASCII//IGNORED", "UTF-8", utf8, strlen (u2a), (size_t) -1, u2a,
> + EILSEQ);
> + test_iconv ("ASCII//TRANSLITERATE//IGNORED", "UTF-8", utf8, strlen (u2a),
> + (size_t) -1, u2a, EILSEQ);
> + test_iconv ("ASCII//IGNORED,TRANSLITERATE", "UTF-8", utf8, strlen (u2a),
> + (size_t) -1, u2a, EILSEQ);
> + test_iconv ("ASCII//T//I", "UTF-8", utf8, strlen (u2a), (size_t) -1, u2a,
> + EILSEQ);
> +
> + test_iconv ("ASCII//TRANSLIT//I", "UTF-8", utf8, strlen (utf8), 2,
> + u2a_translit, 0);
> + /* Due to bug 19519, iconv was ignoring TRANSLIT for the following input. */
> + test_iconv ("ASCII//I//TRANSLIT", "UTF-8", utf8, strlen (utf8), 2,
> + u2a_translit, 0);
> + test_iconv ("ASCII//IGNORED,TRANSLIT", "UTF-8", utf8, strlen (utf8), 2,
> + u2a_translit, 0);
> + test_iconv ("ASCII//TRANSLIT,IGNORED", "UTF-8", utf8, strlen (utf8), 2,
> + u2a_translit, 0);
> +
> + test_iconv ("ASCII//IGNORE,T", "UTF-8", utf8, strlen (utf8), (size_t) -1,
> + u2a_ignore, EILSEQ);
> + test_iconv ("ASCII//T,IGNORE", "UTF-8", utf8, strlen (utf8), (size_t) -1,
> + u2a_ignore, EILSEQ);
> + /* Due to bug 19519, iconv was ignoring IGNORE for the following input. */
> + test_iconv ("ASCII//TRANSLITERATE//IGNORE", "UTF-8", utf8, strlen (utf8),
> + (size_t) -1, u2a_ignore, EILSEQ);
> + test_iconv ("ASCII//IGNORE//TRANSLITERATE", "UTF-8", utf8, strlen (utf8),
> + (size_t) -1, u2a_ignore, EILSEQ);
> +
> +
> + /* 3. Invalid UTF-8 followed by some valid non-ASCII UTF-8 characters: */
> +
> + /* EILSEQ; output is truncated at the first invalid UTF-8 character. */
> + test_iconv ("ASCII", "UTF-8", iutf8, strlen (iu2a), (size_t) -1, iu2a,
> + EILSEQ);
> +
> + /* With TRANSLIT enabled: EILSEQ; output still truncated at the first invalid
> + UTF-8 character. */
> + test_iconv ("ASCII//TRANSLIT", "UTF-8", iutf8, strlen (iu2a), (size_t) -1,
> + iu2a, EILSEQ);
> +
> + /* With IGNORE enabled: EILSEQ; output omits invalid UTF-8 characters and
> + valid UTF-8 non-ASCII characters. */
> + test_iconv ("ASCII//IGNORE", "UTF-8", iutf8, strlen (iutf8), (size_t) -1,
> + iu2a_ignore, EILSEQ);
> +
> + /* With TRANSLIT and IGNORE enabled, output omits only invalid UTF-8
> + characters and transliterates valid non-ASCII UTF-8 characters. We test
> + four combinations. */
> +
> + test_iconv ("ASCII//TRANSLIT,IGNORE", "UTF-8", iutf8, strlen (iutf8), 2,
> + iu2a_both, 0);
> + /* Due to bug 19519, iconv was ignoring IGNORE for the following input. */
> + test_iconv ("ASCII//TRANSLIT//IGNORE", "UTF-8", iutf8, strlen (iutf8), 2,
> + iu2a_both, 0);
> + test_iconv ("ASCII//IGNORE,TRANSLIT", "UTF-8", iutf8, strlen (iutf8), 2,
> + iu2a_both, 0);
> + /* Due to bug 19519, iconv was ignoring TRANSLIT for the following input. */
> + test_iconv ("ASCII//IGNORE//TRANSLIT", "UTF-8", iutf8, strlen (iutf8), 2,
> + iu2a_both, 0);
> +
> +
> + /* 4. Invalid UTF-8 with valid non-ASCII UTF-8 chars appearing first: */
> +
> + /* EILSEQ; output is truncated at the first non-ASCII character. */
> + test_iconv ("ASCII", "UTF-8", jutf8, strlen (ju2a), (size_t) -1, ju2a,
> + EILSEQ);
> +
> + /* With TRANSLIT enabled: EILSEQ; output now truncated at the first invalid
> + UTF-8 character. */
> + test_iconv ("ASCII//TRANSLIT", "UTF-8", jutf8, strlen (jutf8) - 5,
> + (size_t) -1, ju2a_translit, EILSEQ);
> + test_iconv ("ASCII//translit", "UTF-8", jutf8, strlen (jutf8) - 5,
> + (size_t) -1, ju2a_translit, EILSEQ);
> +
> + /* With IGNORE enabled: EILSEQ; output omits invalid UTF-8 characters and
> + valid UTF-8 non-ASCII characters. */
> + test_iconv ("ASCII//IGNORE", "UTF-8", jutf8, strlen (jutf8), (size_t) -1,
> + ju2a_ignore, EILSEQ);
> + test_iconv ("ASCII//ignore", "UTF-8", jutf8, strlen (jutf8), (size_t) -1,
> + ju2a_ignore, EILSEQ);
> +
> + /* With TRANSLIT and IGNORE enabled, output omits only invalid UTF-8
> + characters and transliterates valid non-ASCII UTF-8 characters. We test
> + several combinations. */
> +
> + test_iconv ("ASCII//TRANSLIT,IGNORE", "UTF-8", jutf8, strlen (jutf8), 2,
> + ju2a_both, 0);
> + /* Due to bug 19519, iconv was ignoring IGNORE for the following input. */
> + test_iconv ("ASCII//TRANSLIT//IGNORE", "UTF-8", jutf8, strlen (jutf8), 2,
> + ju2a_both, 0);
> + test_iconv ("ASCII//IGNORE,TRANSLIT", "UTF-8", jutf8, strlen (jutf8), 2,
> + ju2a_both, 0);
> + /* Due to bug 19519, iconv was ignoring TRANSLIT for the following input. */
> + test_iconv ("ASCII//IGNORE//TRANSLIT", "UTF-8", jutf8, strlen (jutf8), 2,
> + ju2a_both, 0);
> + test_iconv ("ASCII//translit,ignore", "UTF-8", jutf8, strlen (jutf8), 2,
> + ju2a_both, 0);
> + /* Trailing whitespace and separators should be ignored. */
> + test_iconv ("ASCII//IGNORE,TRANSLIT ", "UTF-8", jutf8, strlen (jutf8), 2,
> + ju2a_both, 0);
> + test_iconv ("ASCII//IGNORE,TRANSLIT/", "UTF-8", jutf8, strlen (jutf8), 2,
> + ju2a_both, 0);
> + test_iconv ("ASCII//IGNORE,TRANSLIT//", "UTF-8", jutf8, strlen (jutf8), 2,
> + ju2a_both, 0);
> + test_iconv ("ASCII//IGNORE,TRANSLIT,", "UTF-8", jutf8, strlen (jutf8), 2,
> + ju2a_both, 0);
> + test_iconv ("ASCII//IGNORE,TRANSLIT,,", "UTF-8", jutf8, strlen (jutf8), 2,
> + ju2a_both, 0);
> + test_iconv ("ASCII//IGNORE,TRANSLIT /,", "UTF-8", jutf8, strlen (jutf8), 2,
> + ju2a_both, 0);
> +
> + /* TRANSLIT or IGNORE suffixes in fromcode should be ignored. */
> + test_iconv ("ASCII", "UTF-8//TRANSLIT", jutf8, strlen (ju2a), (size_t) -1,
> + ju2a, EILSEQ);
> + test_iconv ("ASCII", "UTF-8//IGNORE", jutf8, strlen (ju2a), (size_t) -1,
> + ju2a, EILSEQ);
> + test_iconv ("ASCII", "UTF-8//TRANSLIT,IGNORE", jutf8, strlen (ju2a),
> + (size_t) -1, ju2a, EILSEQ);
> +
> +
> + /* 5. Charset names of the form "A/B/": */
> +
> + /* ISO-8859-1 is converted to UTF-8 without needing transliteration. */
> + test_iconv ("ISO-10646/UTF-8", "ISO-8859-1", iso8859_1_a,
> + strlen (iso8859_1_a), 0, utf8_a, 0);
> + test_iconv ("ISO-10646/UTF-8/", "ISO-8859-1", iso8859_1_a,
> + strlen (iso8859_1_a), 0, utf8_a, 0);
> + test_iconv ("ISO-10646/UTF-8/IGNORE", "ISO-8859-1", iso8859_1_a,
> + strlen (iso8859_1_a), 0, utf8_a, 0);
> + test_iconv ("ISO-10646/UTF-8//IGNORE", "ISO-8859-1", iso8859_1_a,
> + strlen (iso8859_1_a), 0, utf8_a, 0);
> + test_iconv ("ISO-10646/UTF-8/TRANSLIT", "ISO-8859-1", iso8859_1_a,
> + strlen (iso8859_1_a), 0, utf8_a, 0);
> + test_iconv ("ISO-10646/UTF-8//TRANSLIT", "ISO-8859-1", iso8859_1_a,
> + strlen (iso8859_1_a), 0, utf8_a, 0);
> + test_iconv ("ISO-10646/UTF-8//TRANSLIT/IGNORE", "ISO-8859-1", iso8859_1_a,
> + strlen (iso8859_1_a), 0, utf8_a, 0);
> + test_iconv ("ISO-10646/UTF-8//TRANSLIT//IGNORE", "ISO-8859-1", iso8859_1_a,
> + strlen (iso8859_1_a), 0, utf8_a, 0);
> + test_iconv ("ISO-10646/UTF-8/TRANSLIT,IGNORE", "ISO-8859-1", iso8859_1_a,
> + strlen (iso8859_1_a), 0, utf8_a, 0);
> +
> + /* UTF-8 with accented A's is converted to ASCII with transliteration. */
> + test_iconv ("ASCII", "ISO-10646/UTF-8", utf8_a,
> + 0, (size_t) -1, empty, EILSEQ);
> + test_iconv ("ASCII//IGNORE", "ISO-10646/UTF-8", utf8_a,
> + strlen (utf8_a), (size_t) -1, empty, EILSEQ);
> + test_iconv ("ASCII//TRANSLIT", "ISO-10646/UTF-8", utf8_a,
> + strlen (utf8_a), 12, ascii_a, 0);
> +
> + /* Invalid ASCII is converted to UTF-8 only with IGNORE. */
> + test_iconv ("ISO-10646/UTF-8", "ASCII", iascii, strlen (empty), (size_t) -1,
> + empty, EILSEQ);
> + test_iconv ("ISO-10646/UTF-8/TRANSLIT", "ASCII", iascii, strlen (empty),
> + (size_t) -1, empty, EILSEQ);
> + test_iconv ("ISO-10646/UTF-8/IGNORE", "ASCII", iascii, strlen (iascii),
> + (size_t) -1, ia2u_ignore, EILSEQ);
> + test_iconv ("ISO-10646/UTF-8/TRANSLIT,IGNORE", "ASCII", iascii,
> + strlen (iascii), (size_t) -1, ia2u_ignore, EILSEQ);
> + /* Due to bug 19519, iconv was ignoring IGNORE for the following three
> + inputs: */
> + test_iconv ("ISO-10646/UTF-8/TRANSLIT/IGNORE", "ASCII", iascii,
> + strlen (iascii), (size_t) -1, ia2u_ignore, EILSEQ);
> + test_iconv ("ISO-10646/UTF-8//TRANSLIT,IGNORE", "ASCII", iascii,
> + strlen (iascii), (size_t) -1, ia2u_ignore, EILSEQ);
> + test_iconv ("ISO-10646/UTF-8//TRANSLIT//IGNORE", "ASCII", iascii,
> + strlen (iascii), (size_t) -1, ia2u_ignore, EILSEQ);
> +
> + return 0;
> +}
> +
> +#include <support/test-driver.c>
> diff --git a/iconv/tst-iconv_prog.sh b/iconv/tst-iconv_prog.sh
> new file mode 100644
> index 0000000000..8298136b7f
> --- /dev/null
> +++ b/iconv/tst-iconv_prog.sh
> @@ -0,0 +1,280 @@
> +#!/bin/bash
> +# Test for some known iconv(1) hangs from bug 19519, and miscellaneous
> +# iconv(1) program error conditions.
> +# Copyright (C) 2020 Free Software Foundation, Inc.
> +# This file is part of the GNU C Library.
> +
> +# The GNU C Library is free software; you can redistribute it and/or
> +# modify it under the terms of the GNU Lesser General Public
> +# License as published by the Free Software Foundation; either
> +# version 2.1 of the License, or (at your option) any later version.
> +
> +# The GNU C Library is distributed in the hope that it will be useful,
> +# but WITHOUT ANY WARRANTY; without even the implied warranty of
> +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> +# Lesser General Public License for more details.
> +
> +# You should have received a copy of the GNU Lesser General Public
> +# License along with the GNU C Library; if not, see
> +# <https://www.gnu.org/licenses/>.
> +
> +codir=$1
> +test_wrapper_env="$2"
> +run_program_env="$3"
> +
> +# We have to have some directories in the library path.
> +LIBPATH=$codir:$codir/iconvdata
> +
> +# How the start the iconv(1) program. $from is not defined/expanded yet.
> +ICONV='
> +$codir/elf/ld.so --library-path $LIBPATH --inhibit-rpath ${from}.so
> +$codir/iconv/iconv_prog
> +'
> +ICONV="$test_wrapper_env $run_program_env $ICONV"
> +
> +# List of known hangs;
> +# Gathered by running an exhaustive 2 byte input search against glibc-2.28
> +hangarray=(
> +"\x00\x23;-c;ANSI_X3.110;UTF-8//TRANSLIT//IGNORE"
> +"\x00\xa1;-c;ARMSCII-8;UTF-8//TRANSLIT//IGNORE"
> +"\x00\xa1;-c;ASMO_449;UTF-8//TRANSLIT//IGNORE"
> +"\x00\x81;-c;BIG5;UTF-8//TRANSLIT//IGNORE"
> +"\x00\xff;-c;BIG5HKSCS;UTF-8//TRANSLIT//IGNORE"
> +"\x00\xff;-c;BRF;UTF-8//TRANSLIT//IGNORE"
> +"\x00\xff;-c;BS_4730;UTF-8//TRANSLIT//IGNORE"
> +"\x00\x81;-c;CP1250;UTF-8//TRANSLIT//IGNORE"
> +"\x00\x98;-c;CP1251;UTF-8//TRANSLIT//IGNORE"
> +"\x00\x81;-c;CP1252;UTF-8//TRANSLIT//IGNORE"
> +"\x00\x81;-c;CP1253;UTF-8//TRANSLIT//IGNORE"
> +"\x00\x81;-c;CP1254;UTF-8//TRANSLIT//IGNORE"
> +"\x00\x81;-c;CP1255;UTF-8//TRANSLIT//IGNORE"
> +"\x00\x81;-c;CP1257;UTF-8//TRANSLIT//IGNORE"
> +"\x00\x81;-c;CP1258;UTF-8//TRANSLIT//IGNORE"
> +"\xff\xff;-c;CP932;UTF-8//TRANSLIT//IGNORE"
> +"\xff\xff;-c;CSA_Z243.4-1985-1;UTF-8//TRANSLIT//IGNORE"
> +"\xff\xff;-c;CSA_Z243.4-1985-2;UTF-8//TRANSLIT//IGNORE"
> +"\xff\xff;-c;DEC-MCS;UTF-8//TRANSLIT//IGNORE"
> +"\xff\xff;-c;DIN_66003;UTF-8//TRANSLIT//IGNORE"
> +"\xff\xff;-c;DS_2089;UTF-8//TRANSLIT//IGNORE"
> +"\x00\x41;-c;EBCDIC-AT-DE;UTF-8//TRANSLIT//IGNORE"
> +"\x00\x41;-c;EBCDIC-AT-DE-A;UTF-8//TRANSLIT//IGNORE"
> +"\x00\x41;-c;EBCDIC-CA-FR;UTF-8//TRANSLIT//IGNORE"
> +"\x00\x41;-c;EBCDIC-DK-NO;UTF-8//TRANSLIT//IGNORE"
> +"\x00\x41;-c;EBCDIC-DK-NO-A;UTF-8//TRANSLIT//IGNORE"
> +"\x00\x41;-c;EBCDIC-ES;UTF-8//TRANSLIT//IGNORE"
> +"\x00\x41;-c;EBCDIC-ES-A;UTF-8//TRANSLIT//IGNORE"
> +"\x00\x41;-c;EBCDIC-ES-S;UTF-8//TRANSLIT//IGNORE"
> +"\x00\x41;-c;EBCDIC-FI-SE;UTF-8//TRANSLIT//IGNORE"
> +"\x00\x41;-c;EBCDIC-FI-SE-A;UTF-8//TRANSLIT//IGNORE"
> +"\x00\x41;-c;EBCDIC-FR;UTF-8//TRANSLIT//IGNORE"
> +"\x00\x41;-c;EBCDIC-IS-FRISS;UTF-8//TRANSLIT//IGNORE"
> +"\x00\x41;-c;EBCDIC-IT;UTF-8//TRANSLIT//IGNORE"
> +"\x00\x41;-c;EBCDIC-PT;UTF-8//TRANSLIT//IGNORE"
> +"\x00\x41;-c;EBCDIC-UK;UTF-8//TRANSLIT//IGNORE"
> +"\x00\x41;-c;EBCDIC-US;UTF-8//TRANSLIT//IGNORE"
> +"\xff\xff;-c;ES;UTF-8//TRANSLIT//IGNORE"
> +"\xff\xff;-c;ES2;UTF-8//TRANSLIT//IGNORE"
> +"\xff\xff;-c;EUC-CN;UTF-8//TRANSLIT//IGNORE"
> +"\xff\xff;-c;EUC-JISX0213;UTF-8//TRANSLIT//IGNORE"
> +"\xff\xff;-c;EUC-JP;UTF-8//TRANSLIT//IGNORE"
> +"\xff\xff;-c;EUC-JP-MS;UTF-8//TRANSLIT//IGNORE"
> +"\xff\xff;-c;EUC-KR;UTF-8//TRANSLIT//IGNORE"
> +"\xff\xff;-c;EUC-TW;UTF-8//TRANSLIT//IGNORE"
> +"\xff\xff;-c;GB18030;UTF-8//TRANSLIT//IGNORE"
> +"\xff\xff;-c;GB_1988-80;UTF-8//TRANSLIT//IGNORE"
> +"\xff\xff;-c;GBK;UTF-8//TRANSLIT//IGNORE"
> +"\xff\xff;-c;GOST_19768-74;UTF-8//TRANSLIT//IGNORE"
> +"\xff\xff;-c;GREEK7;UTF-8//TRANSLIT//IGNORE"
> +"\xff\xff;-c;GREEK7-OLD;UTF-8//TRANSLIT//IGNORE"
> +"\xff\xff;-c;GREEK-CCITT;UTF-8//TRANSLIT//IGNORE"
> +"\xff\xff;-c;HP-GREEK8;UTF-8//TRANSLIT//IGNORE"
> +"\xff\xff;-c;HP-ROMAN8;UTF-8//TRANSLIT//IGNORE"
> +"\xff\xff;-c;HP-ROMAN9;UTF-8//TRANSLIT//IGNORE"
> +"\xff\xff;-c;HP-THAI8;UTF-8//TRANSLIT//IGNORE"
> +"\xff\xff;-c;HP-TURKISH8;UTF-8//TRANSLIT//IGNORE"
> +"\x00\x41;-c;IBM038;UTF-8//TRANSLIT//IGNORE"
> +"\x00\x80;-c;IBM1004;UTF-8//TRANSLIT//IGNORE"
> +"\x00\xff;-c;IBM1008;UTF-8//TRANSLIT//IGNORE"
> +"\xff\xff;-c;IBM1046;UTF-8//TRANSLIT//IGNORE"
> +"\x00\x51;-c;IBM1132;UTF-8//TRANSLIT//IGNORE"
> +"\x00\xa0;-c;IBM1133;UTF-8//TRANSLIT//IGNORE"
> +"\x00\xce;-c;IBM1137;UTF-8//TRANSLIT//IGNORE"
> +"\x00\x80;-c;IBM1161;UTF-8//TRANSLIT//IGNORE"
> +"\x00\xdb;-c;IBM1162;UTF-8//TRANSLIT//IGNORE"
> +"\x00\x70;-c;IBM12712;UTF-8//TRANSLIT//IGNORE"
> +# These are known hangs that are yet to be fixed:
> +# "\x00\x0f;-c;IBM1364;UTF-8"
> +# "\x00\x0f;-c;IBM1371;UTF-8"
> +# "\x00\x0f;-c;IBM1388;UTF-8"
> +# "\x00\x0f;-c;IBM1390;UTF-8"
> +# "\x00\x0f;-c;IBM1399;UTF-8"
> +"\x00\x53;-c;IBM16804;UTF-8//TRANSLIT//IGNORE"
> +"\x00\x41;-c;IBM274;UTF-8//TRANSLIT//IGNORE"
> +"\x00\x41;-c;IBM275;UTF-8//TRANSLIT//IGNORE"
> +"\x00\x41;-c;IBM281;UTF-8//TRANSLIT//IGNORE"
> +"\x00\x57;-c;IBM290;UTF-8//TRANSLIT//IGNORE"
> +"\x00\x45;-c;IBM420;UTF-8//TRANSLIT//IGNORE"
> +"\x00\x68;-c;IBM423;UTF-8//TRANSLIT//IGNORE"
> +"\x00\x70;-c;IBM424;UTF-8//TRANSLIT//IGNORE"
> +"\x00\x53;-c;IBM4517;UTF-8//TRANSLIT//IGNORE"
> +"\x00\x53;-c;IBM4899;UTF-8//TRANSLIT//IGNORE"
> +"\x00\xa5;-c;IBM4909;UTF-8//TRANSLIT//IGNORE"
> +"\x00\xdc;-c;IBM4971;UTF-8//TRANSLIT//IGNORE"
> +"\x00\x41;-c;IBM803;UTF-8//TRANSLIT//IGNORE"
> +"\x00\x91;-c;IBM851;UTF-8//TRANSLIT//IGNORE"
> +"\x00\x9b;-c;IBM856;UTF-8//TRANSLIT//IGNORE"
> +"\x00\xd5;-c;IBM857;UTF-8//TRANSLIT//IGNORE"
> +"\xff\xff;-c;IBM864;UTF-8//TRANSLIT//IGNORE"
> +"\x00\x94;-c;IBM868;UTF-8//TRANSLIT//IGNORE"
> +"\x00\x94;-c;IBM869;UTF-8//TRANSLIT//IGNORE"
> +"\xff\xff;-c;IBM874;UTF-8//TRANSLIT//IGNORE"
> +"\x00\x6a;-c;IBM875;UTF-8//TRANSLIT//IGNORE"
> +"\x00\x41;-c;IBM880;UTF-8//TRANSLIT//IGNORE"
> +"\x00\x80;-c;IBM891;UTF-8//TRANSLIT//IGNORE"
> +"\xff\xff;-c;IBM903;UTF-8//TRANSLIT//IGNORE"
> +"\xff\xff;-c;IBM904;UTF-8//TRANSLIT//IGNORE"
> +"\x00\x41;-c;IBM905;UTF-8//TRANSLIT//IGNORE"
> +"\x00\x80;-c;IBM9066;UTF-8//TRANSLIT//IGNORE"
> +"\x00\x48;-c;IBM918;UTF-8//TRANSLIT//IGNORE"
> +"\x00\x57;-c;IBM930;UTF-8//TRANSLIT//IGNORE"
> +"\x00\x80;-c;IBM932;UTF-8//TRANSLIT//IGNORE"
> +"\x00\x41;-c;IBM933;UTF-8//TRANSLIT//IGNORE"
> +"\x00\x41;-c;IBM935;UTF-8//TRANSLIT//IGNORE"
> +"\x00\x41;-c;IBM937;UTF-8//TRANSLIT//IGNORE"
> +"\x00\x41;-c;IBM939;UTF-8//TRANSLIT//IGNORE"
> +"\x00\x80;-c;IBM943;UTF-8//TRANSLIT//IGNORE"
> +"\xff\xff;-c;INIS;UTF-8//TRANSLIT//IGNORE"
> +"\xff\xff;-c;INIS-8;UTF-8//TRANSLIT//IGNORE"
> +"\xff\xff;-c;INIS-CYRILLIC;UTF-8//TRANSLIT//IGNORE"
> +"\x00\xec;-c;ISIRI-3342;UTF-8//TRANSLIT//IGNORE"
> +"\x00\xec;-c;ISO_10367-BOX;UTF-8//TRANSLIT//IGNORE"
> +"\xff\xff;-c;ISO-2022-CN;UTF-8//TRANSLIT//IGNORE"
> +"\xff\xff;-c;ISO-2022-CN-EXT;UTF-8//TRANSLIT//IGNORE"
> +"\xff\xff;-c;ISO-2022-JP;UTF-8//TRANSLIT//IGNORE"
> +"\xff\xff;-c;ISO-2022-JP-2;UTF-8//TRANSLIT//IGNORE"
> +"\xff\xff;-c;ISO-2022-JP-3;UTF-8//TRANSLIT//IGNORE"
> +"\xff\xff;-c;ISO-2022-KR;UTF-8//TRANSLIT//IGNORE"
> +"\xff\xff;-c;ISO_2033;UTF-8//TRANSLIT//IGNORE"
> +"\xff\xff;-c;ISO_5427;UTF-8//TRANSLIT//IGNORE"
> +"\xff\xff;-c;ISO_5427-EXT;UTF-8//TRANSLIT//IGNORE"
> +"\xff\xff;-c;ISO_5428;UTF-8//TRANSLIT//IGNORE"
> +"\x00\xa4;-c;ISO_6937;UTF-8//TRANSLIT//IGNORE"
> +"\x00\xa0;-c;ISO_6937-2;UTF-8//TRANSLIT//IGNORE"
> +"\xff\xff;-c;ISO-8859-11;UTF-8//TRANSLIT//IGNORE"
> +"\x00\xa5;-c;ISO-8859-3;UTF-8//TRANSLIT//IGNORE"
> +"\xff\xff;-c;ISO-8859-6;UTF-8//TRANSLIT//IGNORE"
> +"\xff\xff;-c;ISO-8859-7;UTF-8//TRANSLIT//IGNORE"
> +"\xff\xff;-c;ISO-8859-8;UTF-8//TRANSLIT//IGNORE"
> +"\x00\x80;-c;ISO-IR-197;UTF-8//TRANSLIT//IGNORE"
> +"\x00\x80;-c;ISO-IR-209;UTF-8//TRANSLIT//IGNORE"
> +"\x00\x80;-c;IT;UTF-8//TRANSLIT//IGNORE"
> +"\x00\x80;-c;JIS_C6220-1969-RO;UTF-8//TRANSLIT//IGNORE"
> +"\x00\x80;-c;JIS_C6229-1984-B;UTF-8//TRANSLIT//IGNORE"
> +"\x00\x80;-c;JOHAB;UTF-8//TRANSLIT//IGNORE"
> +"\x00\x80;-c;JUS_I.B1.002;UTF-8//TRANSLIT//IGNORE"
> +"\x00\x80;-c;KOI-8;UTF-8//TRANSLIT//IGNORE"
> +"\x00\x88;-c;KOI8-T;UTF-8//TRANSLIT//IGNORE"
> +"\xff\xff;-c;KSC5636;UTF-8//TRANSLIT//IGNORE"
> +"\xff\xff;-c;LATIN-GREEK;UTF-8//TRANSLIT//IGNORE"
> +"\xff\xff;-c;LATIN-GREEK-1;UTF-8//TRANSLIT//IGNORE"
> +"\x00\xf6;-c;MAC-IS;UTF-8//TRANSLIT//IGNORE"
> +"\xff\xff;-c;MSZ_7795.3;UTF-8//TRANSLIT//IGNORE"
> +"\xff\xff;-c;NATS-DANO;UTF-8//TRANSLIT//IGNORE"
> +"\xff\xff;-c;NATS-SEFI;UTF-8//TRANSLIT//IGNORE"
> +"\xff\xff;-c;NC_NC00-10;UTF-8//TRANSLIT//IGNORE"
> +"\xff\xff;-c;NF_Z_62-010;UTF-8//TRANSLIT//IGNORE"
> +"\xff\xff;-c;NF_Z_62-010_1973;UTF-8//TRANSLIT//IGNORE"
> +"\xff\xff;-c;NS_4551-1;UTF-8//TRANSLIT//IGNORE"
> +"\xff\xff;-c;NS_4551-2;UTF-8//TRANSLIT//IGNORE"
> +"\xff\xff;-c;PT;UTF-8//TRANSLIT//IGNORE"
> +"\xff\xff;-c;PT2;UTF-8//TRANSLIT//IGNORE"
> +"\x00\x98;-c;RK1048;UTF-8//TRANSLIT//IGNORE"
> +"\x00\x98;-c;SEN_850200_B;UTF-8//TRANSLIT//IGNORE"
> +"\x00\x98;-c;SEN_850200_C;UTF-8//TRANSLIT//IGNORE"
> +"\x00\x80;-c;Shift_JISX0213;UTF-8//TRANSLIT//IGNORE"
> +"\x00\x80;-c;SJIS;UTF-8//TRANSLIT//IGNORE"
> +"\x00\x23;-c;T.61-8BIT;UTF-8//TRANSLIT//IGNORE"
> +"\xff\xff;-c;TIS-620;UTF-8//TRANSLIT//IGNORE"
> +"\xff\xff;-c;TSCII;UTF-8//TRANSLIT//IGNORE"
> +"\xff\xff;-c;UHC;UTF-8//TRANSLIT//IGNORE"
> +"\x00\xd8;-c;UNICODE;UTF-8//TRANSLIT//IGNORE"
> +"\x00\xdc;-c;UTF-16;UTF-8//TRANSLIT//IGNORE"
> +"\xdc\x00;-c;UTF-16BE;UTF-8//TRANSLIT//IGNORE"
> +"\x00\xdc;-c;UTF-16LE;UTF-8//TRANSLIT//IGNORE"
> +"\xff\xff;-c;UTF-7;UTF-8//TRANSLIT//IGNORE"
> +"\x00\x81;-c;WIN-SAMI-2;UTF-8//TRANSLIT//IGNORE"
> +)
> +
> +# List of option combinations that *should* lead to an error
> +errorarray=(
> +# Converting from/to invalid character sets should cause error
> +"\x00\x00;;INVALID;INVALID"
> +"\x00\x00;;INVALID;UTF-8"
> +"\x00\x00;;UTF-8;INVALID"
> +)
> +
> +# Requires $twobyte input, $c flag, $from, and $to to be set; sets $ret
> +execute_test ()
> +{
> + eval PROG=\"$ICONV\"
> + echo -en "$twobyte" \
> + | timeout -k 4 3 $PROG $c -f $from -t "$to" &>/dev/null
> + ret=$?
> +}
> +
> +check_hangtest_result ()
> +{
> + if [ "$ret" -eq "124" ] || [ "$ret" -eq "137" ]; then # timeout/hang
> + result="HANG"
> + else
> + if [ "$ret" -eq "139" ]; then # segfault
> + result="SEGFAULT"
> + else
> + if [ "$ret" -gt "127" ]; then # unexpected error
> + result="UNEXPECTED"
> + else
> + result="OK"
> + fi
> + fi
> + fi
> +
> + echo -n "$result: from: \"$from\", to: \"$to\","
> + echo " input \"$twobyte\", flags \"$c\""
> +
> + if [ "$result" != "OK" ]; then
> + exit 1
> + fi
> +}
> +
> +for hangcommand in "${hangarray[@]}"; do
> + twobyte="$(echo "$hangcommand" | cut -d";" -f 1)"
> + c="$(echo "$hangcommand" | cut -d";" -f 2)"
> + from="$(echo "$hangcommand" | cut -d";" -f 3)"
> + to="$(echo "$hangcommand" | cut -d";" -f 4)"
> + execute_test
> + check_hangtest_result
> +done
> +
> +check_errtest_result ()
> +{
> + if [ "$ret" -eq "1" ]; then # we errored out as expected
> + result="PASS"
> + else
> + result="FAIL"
> + fi
> + echo -n "$result: from: \"$from\", to: \"$to\","
> + echo " input \"$twobyte\", flags \"$c\", return code $ret"
> +
> + if [ "$result" != "PASS" ]; then
> + exit 1
> + fi
> +}
> +
> +for errorcommand in "${errorarray[@]}"; do
> + twobyte="$(echo "$errorcommand" | cut -d";" -f 1)"
> + c="$(echo "$errorcommand" | cut -d";" -f 2)"
> + from="$(echo "$errorcommand" | cut -d";" -f 3)"
> + to="$(echo "$errorcommand" | cut -d";" -f 4)"
> + execute_test
> + check_errtest_result
> +done
> diff --git a/intl/dcigettext.c b/intl/dcigettext.c
> index 465c8df34c..2e7c662bc7 100644
> --- a/intl/dcigettext.c
> +++ b/intl/dcigettext.c
> @@ -1119,11 +1119,16 @@ _nl_find_msg (struct loaded_l10nfile *domain_file,
> outcharset = encoding;
>
> # ifdef _LIBC
> - /* We always want to use transliteration. */
> - outcharset = norm_add_slashes (outcharset, "TRANSLIT");
> - charset = norm_add_slashes (charset, "");
> - int r = __gconv_open (outcharset, charset, &convd->conv,
> - GCONV_AVOID_NOCONV);
> +
> + struct gconv_spec conv_spec
> + = { .fromcode = norm_add_slashes (charset, ""),
> + .tocode = norm_add_slashes (outcharset, ""),
> + /* We always want to use transliteration. */
> + .translit = true,
> + .ignore = false
> + };
> + int r = __gconv_open (&conv_spec, &convd->conv,
> + GCONV_AVOID_NOCONV);
> if (__builtin_expect (r != __GCONV_OK, 0))
> {
> /* If the output encoding is the same there is
>
On Tue, 7 Jul 2020, Arjun Shankar wrote:
> +$(objpfx)tst-iconv_prog.out: tst-iconv_prog.sh $(objpfx)iconv_prog
> + $(SHELL) $< $(common-objdir) '$(test-wrapper-env)' \
> + '$(run-program-env)' > $@; \
> + $(evaluate-test)
This test is using bash-specific features, so should be run with $(BASH)
not $(SHELL).
On Tue, Jul 07, 2020 at 03:58:01PM +0000, Joseph Myers wrote:
> On Tue, 7 Jul 2020, Arjun Shankar wrote:
>
> > +$(objpfx)tst-iconv_prog.out: tst-iconv_prog.sh $(objpfx)iconv_prog
> > + $(SHELL) $< $(common-objdir) '$(test-wrapper-env)' \
> > + '$(run-program-env)' > $@; \
> > + $(evaluate-test)
>
> This test is using bash-specific features, so should be run with $(BASH)
> not $(SHELL).
Thanks, Joseph. I edited this into my commit and pushed it.
On Tue, Jul 07, 2020 at 05:33:37PM +0200, Andreas Schwab wrote:
> On Jul 07 2020, Arjun Shankar wrote:
>
> > +for hangcommand in "${hangarray[@]}"; do
> > + twobyte="$(echo "$hangcommand" | cut -d";" -f 1)"
> > + c="$(echo "$hangcommand" | cut -d";" -f 2)"
> > + from="$(echo "$hangcommand" | cut -d";" -f 3)"
> > + to="$(echo "$hangcommand" | cut -d";" -f 4)"
>
> That could be replaced by doing an IFS split with IFS=';'.
>
> IFS=';' args=($hangcommand)
>
> and then referring to ${args[0]} etc. (or similar).
Thanks, Andreas. I'll do a follow-up with this and possibly other
improvements to the test, and include you in CC when I post it.
On Tue, Jul 07, 2020 at 11:54:24AM -0400, Carlos O'Donell wrote:
> OK for master.
>
> Plese push this so we can include the fixes in 2.32.
>
> I see Andreas has some suggestions for improving the shell code, but we can
> always clean that up with further changes. I know you have some ideas for
> improving the test coverage and we can tidy this up later.
Thanks! Pushed to master.
I'll address that in a follow up with changes to the test soon enough.
Joseph noted that I was using Bash features, and I took care of that in my
patch before pushing it.
On 7/7/20 2:46 PM, Arjun Shankar wrote:
> On Tue, Jul 07, 2020 at 11:54:24AM -0400, Carlos O'Donell wrote:
>> OK for master.
>>
>> Plese push this so we can include the fixes in 2.32.
>>
>> I see Andreas has some suggestions for improving the shell code, but we can
>> always clean that up with further changes. I know you have some ideas for
>> improving the test coverage and we can tidy this up later.
>
> Thanks! Pushed to master.
>
> I'll address that in a follow up with changes to the test soon enough.
> Joseph noted that I was using Bash features, and I took care of that in my
> patch before pushing it.
Thanks. Please don't forget to close out any related bugs in bugzilla.
@@ -26,7 +26,7 @@ headers = iconv.h gconv.h
routines = iconv_open iconv iconv_close \
gconv_open gconv gconv_close gconv_db gconv_conf \
gconv_builtin gconv_simple gconv_trans gconv_cache
-routines += gconv_dl
+routines += gconv_dl gconv_charset
vpath %.c ../locale/programs ../intl
@@ -44,7 +44,7 @@ CFLAGS-linereader.c += -DNO_TRANSLITERATION
CFLAGS-simple-hash.c += -I../locale
tests = tst-iconv1 tst-iconv2 tst-iconv3 tst-iconv4 tst-iconv5 tst-iconv6 \
- tst-iconv7 tst-iconv-mt
+ tst-iconv7 tst-iconv-mt tst-iconv-opt
others = iconv_prog iconvconfig
install-others-programs = $(inst_bindir)/iconv
@@ -61,6 +61,7 @@ include $(patsubst %,$(..)libof-iterator.mk,$(cpp-srcs-left))
ifeq ($(run-built-tests),yes)
xtests-special += $(objpfx)test-iconvconfig.out
+tests-special += $(objpfx)tst-iconv_prog.out
endif
# Make a copy of the file because gconv module names are constructed
@@ -81,6 +82,13 @@ endif
include ../Rules
+ifeq ($(run-built-tests),yes)
+LOCALES := en_US.UTF-8
+include ../gen-locales.mk
+
+$(objpfx)tst-iconv-opt.out: $(gen-locales)
+endif
+
$(inst_bindir)/iconv: $(objpfx)iconv_prog $(+force)
$(do-install-program)
@@ -95,3 +103,8 @@ $(objpfx)test-iconvconfig.out: /dev/null $(objpfx)iconvconfig
cmp $$tmp $(inst_gconvdir)/gconv-modules.cache; \
rm -f $$tmp) > $@; \
$(evaluate-test)
+
+$(objpfx)tst-iconv_prog.out: tst-iconv_prog.sh $(objpfx)iconv_prog
+ $(SHELL) $< $(common-objdir) '$(test-wrapper-env)' \
+ '$(run-program-env)' > $@; \
+ $(evaluate-test)
@@ -6,6 +6,7 @@ libc {
GLIBC_PRIVATE {
# functions shared with iconv program
__gconv_get_alias_db; __gconv_get_cache; __gconv_get_modules_db;
+ __gconv_open; __gconv_create_spec;
# function used by the gconv modules
__gconv_transliterate;
new file mode 100644
@@ -0,0 +1,218 @@
+/* Charset name normalization.
+ Copyright (C) 2020 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+
+#include <stdlib.h>
+#include <ctype.h>
+#include <locale.h>
+#include <stdbool.h>
+#include <string.h>
+#include <sys/stat.h>
+#include "gconv_int.h"
+#include "gconv_charset.h"
+
+
+/* This function returns a pointer to the last suffix in a conversion code
+ string. Valid suffixes matched by this function are of the form: '/' or ','
+ followed by arbitrary text that doesn't contain '/' or ','. It does not
+ edit the string in any way. The caller is expected to parse the suffix and
+ remove it (by e.g. truncating the string) before the next call. */
+static char *
+find_suffix (char *s)
+{
+ /* The conversion code is in the form of a triplet, separated by '/' chars.
+ The third component of the triplet contains suffixes. If we don't have two
+ slashes, we don't have a suffix. */
+
+ int slash_count = 0;
+ char *suffix_term = NULL;
+
+ for (int i = 0; s[i] != '\0'; i++)
+ switch (s[i])
+ {
+ case '/':
+ slash_count++;
+ /* Fallthrough */
+ case ',':
+ suffix_term = &s[i];
+ }
+
+ if (slash_count >= 2)
+ return suffix_term;
+
+ return NULL;
+}
+
+
+struct gconv_parsed_code
+{
+ char *code;
+ bool translit;
+ bool ignore;
+};
+
+
+/* This function parses an iconv_open encoding PC.CODE, strips any suffixes
+ (such as TRANSLIT or IGNORE) from it and sets corresponding flags in it. */
+static void
+gconv_parse_code (struct gconv_parsed_code *pc)
+{
+ pc->translit = false;
+ pc->ignore = false;
+
+ while (1)
+ {
+ /* First drop any trailing whitespaces and separators. */
+ size_t len = strlen (pc->code);
+ while ((len > 0)
+ && (isspace (pc->code[len - 1])
+ || pc->code[len - 1] == ','
+ || pc->code[len - 1] == '/'))
+ len--;
+
+ pc->code[len] = '\0';
+
+ if (len == 0)
+ return;
+
+ char * suffix = find_suffix (pc->code);
+ if (suffix == NULL)
+ {
+ /* At this point, we have processed and removed all suffixes from the
+ code and what remains of the code is suffix free. */
+ return;
+ }
+ else
+ {
+ /* A suffix is processed from the end of the code array going
+ backwards, one suffix at a time. The suffix is an index into the
+ code character array and points to: one past the end of the code
+ and any unprocessed suffixes, and to the beginning of the suffix
+ currently being processed during this iteration. We must process
+ this suffix and then drop it from the code by terminating the
+ preceding text with NULL.
+
+ We want to allow and recognize suffixes such as:
+
+ "/TRANSLIT" i.e. single suffix
+ "//TRANSLIT" i.e. single suffix and multiple separators
+ "//TRANSLIT/IGNORE" i.e. suffixes separated by "/"
+ "/TRANSLIT//IGNORE" i.e. suffixes separated by "//"
+ "//IGNORE,TRANSLIT" i.e. suffixes separated by ","
+ "//IGNORE," i.e. trailing ","
+ "//TRANSLIT/" i.e. trailing "/"
+ "//TRANSLIT//" i.e. trailing "//"
+ "/" i.e. empty suffix.
+
+ Unknown suffixes are silently discarded and ignored. */
+
+ if ((__strcasecmp_l (suffix,
+ GCONV_TRIPLE_SEPARATOR
+ GCONV_TRANSLIT_SUFFIX,
+ _nl_C_locobj_ptr) == 0)
+ || (__strcasecmp_l (suffix,
+ GCONV_SUFFIX_SEPARATOR
+ GCONV_TRANSLIT_SUFFIX,
+ _nl_C_locobj_ptr) == 0))
+ pc->translit = true;
+
+ if ((__strcasecmp_l (suffix,
+ GCONV_TRIPLE_SEPARATOR
+ GCONV_IGNORE_ERRORS_SUFFIX,
+ _nl_C_locobj_ptr) == 0)
+ || (__strcasecmp_l (suffix,
+ GCONV_SUFFIX_SEPARATOR
+ GCONV_IGNORE_ERRORS_SUFFIX,
+ _nl_C_locobj_ptr) == 0))
+ pc->ignore = true;
+
+ /* We just processed this suffix. We can now drop it from the
+ code string by truncating it at the suffix's position. */
+ suffix[0] = '\0';
+ }
+ }
+}
+
+
+/* This function accepts the charset names of the source and destination of the
+ conversion and populates *conv_spec with an equivalent conversion
+ specification that may later be used by __gconv_open. The charset names
+ might contain options in the form of suffixes that alter the conversion,
+ e.g. "ISO-10646/UTF-8/TRANSLIT". It processes the charset names, ignoring
+ and truncating any suffix options in fromcode, and processing and truncating
+ any suffix options in tocode. Supported suffix options ("TRANSLIT" or
+ "IGNORE") when found in tocode lead to the corresponding flag in *conv_spec
+ to be set to true. Unrecognized suffix options are silently discarded. If
+ the function succeeds, it returns conv_spec back to the caller. It returns
+ NULL upon failure. conv_spec must be allocated and freed by the caller. */
+struct gconv_spec *
+__gconv_create_spec (struct gconv_spec *conv_spec, const char *fromcode,
+ const char *tocode)
+{
+ struct gconv_parsed_code pfc, ptc;
+ struct gconv_spec *ret = NULL;
+
+ pfc.code = __strdup (fromcode);
+ ptc.code = __strdup (tocode);
+
+ if ((pfc.code == NULL)
+ || (ptc.code == NULL))
+ goto out;
+
+ gconv_parse_code (&pfc);
+ gconv_parse_code (&ptc);
+
+ /* We ignore suffixes in the fromcode because that is how the current
+ implementation has always handled them. Only suffixes in the tocode are
+ processed and handled. The reality is that invalid input in the input
+ character set should only be ignored if the fromcode specifies IGNORE.
+ The current implementation ignores invalid intput in the input character
+ set if the tocode contains IGNORE. We preserve this behavior for
+ backwards compatibility. In the future we may split the handling of
+ IGNORE to allow a finer grained specification of ignorning invalid input
+ and/or ignoring invalid output. */
+ conv_spec->translit = ptc.translit;
+ conv_spec->ignore = ptc.ignore;
+
+ /* 3 extra bytes because 1 extra for '\0', and 2 extra so strip might
+ be able to add one or two trailing '/' characters if necessary. */
+ conv_spec->fromcode = malloc (strlen (fromcode) + 3);
+ if (conv_spec->fromcode == NULL)
+ goto out;
+
+ conv_spec->tocode = malloc (strlen (tocode) + 3);
+ if (conv_spec->tocode == NULL)
+ {
+ free (conv_spec->fromcode);
+ conv_spec->fromcode = NULL;
+ goto out;
+ }
+
+ /* Strip unrecognized characters and ensure that the code has two '/'
+ characters as per conversion code triplet specification. */
+ strip (conv_spec->fromcode, pfc.code);
+ strip (conv_spec->tocode, ptc.code);
+ ret = conv_spec;
+
+out:
+ free (pfc.code);
+ free (ptc.code);
+
+ return ret;
+}
+libc_hidden_def (__gconv_create_spec)
@@ -19,9 +19,68 @@
#include <ctype.h>
#include <locale.h>
+#include <stdbool.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <stdlib.h>
+#include "gconv_int.h"
-static void
+/* An iconv encoding is in the form of a triplet, with parts separated by
+ a '/' character. The first part is the standard name, the second part is
+ the character set, and the third part is the error handler. If the first
+ part is sufficient to identify both the standard and the character set
+ then the second part can be empty e.g. UTF-8//. If the first part is not
+ sufficient to identify both the standard and the character set then the
+ second part is required e.g. ISO-10646/UTF8/. If neither the first or
+ second parts are provided e.g. //, then the current locale is used.
+ The actual values used in the first and second parts are not entirely
+ relevant to the implementation. The values themselves are used in a hash
+ table to lookup modules and so the naming convention of the first two parts
+ is somewhat arbitrary and only helps locate the entries in the cache.
+ The third part is the error handler and is comprised of a ',' or '/'
+ separated list of suffixes. Currently, we support "TRANSLIT" for
+ transliteration and "IGNORE" for ignoring conversion errors due to
+ unrecognized input characters. */
+#define GCONV_TRIPLE_SEPARATOR "/"
+#define GCONV_SUFFIX_SEPARATOR ","
+#define GCONV_TRANSLIT_SUFFIX "TRANSLIT"
+#define GCONV_IGNORE_ERRORS_SUFFIX "IGNORE"
+
+
+/* This function accepts the charset names of the source and destination of the
+ conversion and populates *conv_spec with an equivalent conversion
+ specification that may later be used by __gconv_open. The charset names
+ might contain options in the form of suffixes that alter the conversion,
+ e.g. "ISO-10646/UTF-8/TRANSLIT". It processes the charset names, ignoring
+ and truncating any suffix options in fromcode, and processing and truncating
+ any suffix options in tocode. Supported suffix options ("TRANSLIT" or
+ "IGNORE") when found in tocode lead to the corresponding flag in *conv_spec
+ to be set to true. Unrecognized suffix options are silently discarded. If
+ the function succeeds, it returns conv_spec back to the caller. It returns
+ NULL upon failure. */
+struct gconv_spec *
+__gconv_create_spec (struct gconv_spec *conv_spec, const char *fromcode,
+ const char *tocode);
+libc_hidden_proto (__gconv_create_spec)
+
+
+/* This function frees all heap memory allocated by __gconv_create_spec. */
+static void __attribute__ ((unused))
+gconv_destroy_spec (struct gconv_spec *conv_spec)
+{
+ free (conv_spec->fromcode);
+ free (conv_spec->tocode);
+ return;
+}
+
+
+/* This function copies in-order, characters from the source 's' that are
+ either alpha-numeric or one in one of these: "_-.,:/" - into the destination
+ 'wp' while dropping all other characters. In the process, it converts all
+ alphabetical characters to upper case. It then appends up to two '/'
+ characters so that the total number of '/'es in the destination is 2. */
+static inline void __attribute__ ((unused, always_inline))
strip (char *wp, const char *s)
{
int slash_count = 0;
@@ -75,6 +75,15 @@ struct gconv_module
};
+/* The specification of the conversion that needs to be performed. */
+struct gconv_spec
+{
+ char *fromcode;
+ char *tocode;
+ bool translit;
+ bool ignore;
+};
+
/* Flags for `gconv_open'. */
enum
{
@@ -136,10 +145,12 @@ __libc_lock_define (extern, __gconv_lock attribute_hidden)
})
-/* Return in *HANDLE decriptor for transformation from FROMSET to TOSET. */
-extern int __gconv_open (const char *toset, const char *fromset,
- __gconv_t *handle, int flags)
- attribute_hidden;
+/* Return in *HANDLE, a decriptor for the transformation. The function expects
+ the specification of the transformation in the structure pointed to by
+ CONV_SPEC. It only reads *CONV_SPEC and does not take ownership of it. */
+extern int __gconv_open (struct gconv_spec *conv_spec,
+ __gconv_t *handle, int flags);
+libc_hidden_proto (__gconv_open)
/* Free resources associated with transformation descriptor CD. */
extern int __gconv_close (__gconv_t cd)
@@ -31,7 +31,7 @@
int
-__gconv_open (const char *toset, const char *fromset, __gconv_t *handle,
+__gconv_open (struct gconv_spec *conv_spec, __gconv_t *handle,
int flags)
{
struct __gconv_step *steps;
@@ -40,77 +40,38 @@ __gconv_open (const char *toset, const char *fromset, __gconv_t *handle,
size_t cnt = 0;
int res;
int conv_flags = 0;
- const char *errhand;
- const char *ignore;
bool translit = false;
+ char *tocode, *fromcode;
/* Find out whether any error handling method is specified. */
- errhand = strchr (toset, '/');
- if (errhand != NULL)
- errhand = strchr (errhand + 1, '/');
- if (__glibc_likely (errhand != NULL))
- {
- if (*++errhand == '\0')
- errhand = NULL;
- else
- {
- /* Make copy without the error handling description. */
- char *newtoset = (char *) alloca (errhand - toset + 1);
- char *tok;
- char *ptr = NULL /* Work around a bogus warning */;
-
- newtoset[errhand - toset] = '\0';
- toset = memcpy (newtoset, toset, errhand - toset);
+ translit = conv_spec->translit;
- /* Find the appropriate transliteration handlers. */
- tok = strdupa (errhand);
+ if (conv_spec->ignore)
+ conv_flags |= __GCONV_IGNORE_ERRORS;
- tok = __strtok_r (tok, ",", &ptr);
- while (tok != NULL)
- {
- if (__strcasecmp_l (tok, "TRANSLIT", _nl_C_locobj_ptr) == 0)
- translit = true;
- else if (__strcasecmp_l (tok, "IGNORE", _nl_C_locobj_ptr) == 0)
- /* Set the flag to ignore all errors. */
- conv_flags |= __GCONV_IGNORE_ERRORS;
-
- tok = __strtok_r (NULL, ",", &ptr);
- }
- }
- }
-
- /* For the source character set we ignore the error handler specification.
- XXX Is this really always the best? */
- ignore = strchr (fromset, '/');
- if (ignore != NULL && (ignore = strchr (ignore + 1, '/')) != NULL
- && *++ignore != '\0')
- {
- char *newfromset = (char *) alloca (ignore - fromset + 1);
-
- newfromset[ignore - fromset] = '\0';
- fromset = memcpy (newfromset, fromset, ignore - fromset);
- }
+ tocode = conv_spec->tocode;
+ fromcode = conv_spec->fromcode;
/* If the string is empty define this to mean the charset of the
currently selected locale. */
- if (strcmp (toset, "//") == 0)
+ if (strcmp (tocode, "//") == 0)
{
const char *codeset = _NL_CURRENT (LC_CTYPE, CODESET);
size_t len = strlen (codeset);
char *dest;
- toset = dest = (char *) alloca (len + 3);
+ tocode = dest = (char *) alloca (len + 3);
memcpy (__mempcpy (dest, codeset, len), "//", 3);
}
- if (strcmp (fromset, "//") == 0)
+ if (strcmp (fromcode, "//") == 0)
{
const char *codeset = _NL_CURRENT (LC_CTYPE, CODESET);
size_t len = strlen (codeset);
char *dest;
- fromset = dest = (char *) alloca (len + 3);
+ fromcode = dest = (char *) alloca (len + 3);
memcpy (__mempcpy (dest, codeset, len), "//", 3);
}
- res = __gconv_find_transform (toset, fromset, &steps, &nsteps, flags);
+ res = __gconv_find_transform (tocode, fromcode, &steps, &nsteps, flags);
if (res == __GCONV_OK)
{
/* Allocate room for handle. */
@@ -209,3 +170,4 @@ __gconv_open (const char *toset, const char *fromset, __gconv_t *handle,
*handle = result;
return res;
}
+libc_hidden_def (__gconv_open)
@@ -31,49 +31,15 @@
iconv_t
iconv_open (const char *tocode, const char *fromcode)
{
- /* Normalize the name. We remove all characters beside alpha-numeric,
- '_', '-', '/', '.', and ':'. */
- size_t tocode_len = strlen (tocode) + 3;
- char *tocode_conv;
- bool tocode_usealloca = __libc_use_alloca (tocode_len);
- if (tocode_usealloca)
- tocode_conv = (char *) alloca (tocode_len);
- else
- {
- tocode_conv = (char *) malloc (tocode_len);
- if (tocode_conv == NULL)
- return (iconv_t) -1;
- }
- strip (tocode_conv, tocode);
- tocode = (tocode_conv[2] == '\0' && tocode[0] != '\0'
- ? upstr (tocode_conv, tocode) : tocode_conv);
+ __gconv_t cd;
+ struct gconv_spec conv_spec;
- size_t fromcode_len = strlen (fromcode) + 3;
- char *fromcode_conv;
- bool fromcode_usealloca = __libc_use_alloca (fromcode_len);
- if (fromcode_usealloca)
- fromcode_conv = (char *) alloca (fromcode_len);
- else
- {
- fromcode_conv = (char *) malloc (fromcode_len);
- if (fromcode_conv == NULL)
- {
- if (! tocode_usealloca)
- free (tocode_conv);
- return (iconv_t) -1;
- }
- }
- strip (fromcode_conv, fromcode);
- fromcode = (fromcode_conv[2] == '\0' && fromcode[0] != '\0'
- ? upstr (fromcode_conv, fromcode) : fromcode_conv);
+ if (__gconv_create_spec (&conv_spec, fromcode, tocode) == NULL)
+ return (iconv_t) -1;
- __gconv_t cd;
- int res = __gconv_open (tocode, fromcode, &cd, 0);
+ int res = __gconv_open (&conv_spec, &cd, 0);
- if (! fromcode_usealloca)
- free (fromcode_conv);
- if (! tocode_usealloca)
- free (tocode_conv);
+ gconv_destroy_spec (&conv_spec);
if (__builtin_expect (res, __GCONV_OK) != __GCONV_OK)
{
@@ -39,6 +39,7 @@
#include <gconv_int.h>
#include "iconv_prog.h"
#include "iconvconfig.h"
+#include "gconv_charset.h"
/* Get libc version number. */
#include "../version.h"
@@ -118,8 +119,7 @@ main (int argc, char *argv[])
{
int status = EXIT_SUCCESS;
int remaining;
- iconv_t cd;
- const char *orig_to_code;
+ __gconv_t cd;
struct charmap_t *from_charmap = NULL;
struct charmap_t *to_charmap = NULL;
@@ -139,39 +139,6 @@ main (int argc, char *argv[])
exit (EXIT_SUCCESS);
}
- /* If we have to ignore errors make sure we use the appropriate name for
- the to-character-set. */
- orig_to_code = to_code;
- if (omit_invalid)
- {
- const char *errhand = strchrnul (to_code, '/');
- int nslash = 2;
- char *newp;
- char *cp;
-
- if (*errhand == '/')
- {
- --nslash;
- errhand = strchrnul (errhand + 1, '/');
-
- if (*errhand == '/')
- {
- --nslash;
- errhand = strchr (errhand, '\0');
- }
- }
-
- newp = (char *) alloca (errhand - to_code + nslash + 7 + 1);
- cp = mempcpy (newp, to_code, errhand - to_code);
- while (nslash-- > 0)
- *cp++ = '/';
- if (cp[-1] != '/')
- *cp++ = ',';
- memcpy (cp, "IGNORE", sizeof ("IGNORE"));
-
- to_code = newp;
- }
-
/* POSIX 1003.2b introduces a silly thing: the arguments to -t anf -f
can be file names of charmaps. In this case iconv will have to read
those charmaps and use them to do the conversion. But there are
@@ -184,10 +151,10 @@ main (int argc, char *argv[])
file. */
from_charmap = charmap_read (from_code, /*0, 1*/1, 0, 0, 0);
- if (strchr (orig_to_code, '/') != NULL)
+ if (strchr (to_code, '/') != NULL)
/* The to-name might be a charmap file name. Try reading the
file. */
- to_charmap = charmap_read (orig_to_code, /*0, 1,*/1, 0, 0, 0);
+ to_charmap = charmap_read (to_code, /*0, 1,*/1, 0, 0, 0);
/* At this point we have to handle two cases. The first one is
@@ -201,9 +168,25 @@ main (int argc, char *argv[])
argc, remaining, argv, output_file);
else
{
+ struct gconv_spec conv_spec;
+ int res;
+
+ if (__gconv_create_spec (&conv_spec, from_code, to_code) == NULL)
+ {
+ error (EXIT_FAILURE, errno,
+ _("failed to start conversion processing"));
+ exit (1);
+ }
+
+ if (omit_invalid)
+ conv_spec.ignore = true;
+
/* Let's see whether we have these coded character sets. */
- cd = iconv_open (to_code, from_code);
- if (cd == (iconv_t) -1)
+ res = __gconv_open (&conv_spec, &cd, 0);
+
+ gconv_destroy_spec (&conv_spec);
+
+ if (res != __GCONV_OK)
{
if (errno == EINVAL)
{
@@ -221,7 +204,7 @@ main (int argc, char *argv[])
const char *from_pretty =
(from_code[0] ? from_code : nl_langinfo (CODESET));
const char *to_pretty =
- (orig_to_code[0] ? orig_to_code : nl_langinfo (CODESET));
+ (to_code[0] ? to_code : nl_langinfo (CODESET));
if (from_wrong)
{
new file mode 100644
@@ -0,0 +1,347 @@
+/* Test iconv's TRANSLIT and IGNORE option handling
+
+ Copyright (C) 2020 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+
+#include <iconv.h>
+#include <locale.h>
+#include <errno.h>
+#include <string.h>
+#include <support/support.h>
+#include <support/check.h>
+
+
+/* Run one iconv test. Arguments:
+ to: destination character set and options
+ from: source character set
+ input: input string to be converted
+ exp_in: expected number of bytes consumed
+ exp_ret: expected return value (error or number of irreversible conversions)
+ exp_out: expected output string
+ exp_err: expected value of `errno' after iconv returns. */
+static void
+test_iconv (const char *to, const char *from, char *input, size_t exp_in,
+ size_t exp_ret, const char *exp_out, int exp_err)
+{
+ iconv_t cd;
+ char outbuf[500];
+ size_t inlen, outlen;
+ char *inptr, *outptr;
+ size_t n;
+
+ cd = iconv_open (to, from);
+ TEST_VERIFY (cd != (iconv_t) -1);
+
+ inlen = strlen (input);
+ outlen = sizeof (outbuf);
+ inptr = input;
+ outptr = outbuf;
+
+ errno = 0;
+ n = iconv (cd, &inptr, &inlen, &outptr, &outlen);
+
+ TEST_COMPARE (n, exp_ret);
+ TEST_VERIFY (inptr == input + exp_in);
+ TEST_COMPARE (errno, exp_err);
+ TEST_COMPARE_BLOB (outbuf, outptr - outbuf, exp_out, strlen (exp_out));
+ TEST_VERIFY (iconv_close (cd) == 0);
+}
+
+
+/* We test option parsing by converting UTF-8 inputs to ASCII under various
+ option combinations. The UTF-8 inputs fall into three categories:
+ - ASCII-only,
+ - non-ASCII,
+ - non-ASCII with invalid UTF-8 characters. */
+
+/* 1. */
+char ascii[] = "Just some ASCII text";
+
+/* 2. Valid UTF-8 input and some corresponding expected outputs with various
+ options. The two non-ASCII characters below are accented alphabets:
+ an `a' then an `o'. */
+char utf8[] = "UTF-8 text with \u00E1 couple \u00F3f non-ASCII characters";
+char u2a[] = "UTF-8 text with ";
+char u2a_translit[] = "UTF-8 text with a couple of non-ASCII characters";
+char u2a_ignore[] = "UTF-8 text with couple f non-ASCII characters";
+
+/* 3. Invalid UTF-8 input and some corresponding expected outputs. \xff is
+ invalid UTF-8. It's followed by some valid but non-ASCII UTF-8. */
+char iutf8[] = "Invalid UTF-8 \xff\u27E6text\u27E7";
+char iu2a[] = "Invalid UTF-8 ";
+char iu2a_ignore[] = "Invalid UTF-8 text";
+char iu2a_both[] = "Invalid UTF-8 [|text|]";
+
+/* 4. Another invalid UTF-8 input and corresponding expected outputs. This time
+ the valid non-ASCII UTF-8 characters appear before the invalid \xff. */
+char jutf8[] = "Invalid \u27E6UTF-8\u27E7 \xfftext";
+char ju2a[] = "Invalid ";
+char ju2a_translit[] = "Invalid [|UTF-8|] ";
+char ju2a_ignore[] = "Invalid UTF-8 text";
+char ju2a_both[] = "Invalid [|UTF-8|] text";
+
+/* We also test option handling for character set names that have the form
+ "A/B". In this test, we test conversions "ISO-10646/UTF-8", and either
+ ISO-8859-1 or ASCII. */
+
+/* 5. Accented 'A' and 'a' characters in ISO-8859-1 and UTF-8, and an
+ equivalent ASCII transliteration. */
+char iso8859_1_a[] = {0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, /* Accented A's. */
+ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, /* Accented a's. */
+ 0x00};
+char utf8_a[] = "\u00C0\u00C1\u00C2\u00C3\u00C4\u00C5"
+ "\u00E0\u00E1\u00E2\u00E3\u00E4\u00E5";
+char ascii_a[] = "AAAAAAaaaaaa";
+
+/* 6. An invalid ASCII string where [0] is invalid and [1] is '~'. */
+char iascii [] = {0x80, '~', '\0'};
+char empty[] = "";
+char ia2u_ignore[] = "~";
+
+static int
+do_test (void)
+{
+ xsetlocale (LC_ALL, "en_US.UTF-8");
+
+
+ /* 0. iconv_open should gracefully fail for invalid character sets. */
+
+ TEST_VERIFY (iconv_open ("INVALID", "UTF-8") == (iconv_t) -1);
+ TEST_VERIFY (iconv_open ("UTF-8", "INVALID") == (iconv_t) -1);
+ TEST_VERIFY (iconv_open ("INVALID", "INVALID") == (iconv_t) -1);
+
+
+ /* 1. ASCII-only UTF-8 input should convert to ASCII with no changes: */
+
+ test_iconv ("ASCII", "UTF-8", ascii, strlen (ascii), 0, ascii, 0);
+ test_iconv ("ASCII//", "UTF-8", ascii, strlen (ascii), 0, ascii, 0);
+ test_iconv ("ASCII//TRANSLIT", "UTF-8", ascii, strlen (ascii), 0, ascii, 0);
+ test_iconv ("ASCII//TRANSLIT//", "UTF-8", ascii, strlen (ascii), 0, ascii,
+ 0);
+ test_iconv ("ASCII//IGNORE", "UTF-8", ascii, strlen (ascii), 0, ascii, 0);
+ test_iconv ("ASCII//IGNORE//", "UTF-8", ascii, strlen (ascii), 0, ascii, 0);
+
+
+ /* 2. Valid UTF-8 input with non-ASCII characters: */
+
+ /* EILSEQ when converted to ASCII. */
+ test_iconv ("ASCII", "UTF-8", utf8, strlen (u2a), (size_t) -1, u2a, EILSEQ);
+
+ /* Converted without error with TRANSLIT enabled. */
+ test_iconv ("ASCII//TRANSLIT", "UTF-8", utf8, strlen (utf8), 2, u2a_translit,
+ 0);
+
+ /* EILSEQ with IGNORE enabled. Non-ASCII chars dropped from output. */
+ test_iconv ("ASCII//IGNORE", "UTF-8", utf8, strlen (utf8), (size_t) -1,
+ u2a_ignore, EILSEQ);
+
+ /* With TRANSLIT and IGNORE enabled, transliterated without error. We test
+ four combinations. */
+
+ test_iconv ("ASCII//TRANSLIT,IGNORE", "UTF-8", utf8, strlen (utf8), 2,
+ u2a_translit, 0);
+ test_iconv ("ASCII//TRANSLIT//IGNORE", "UTF-8", utf8, strlen (utf8), 2,
+ u2a_translit, 0);
+ test_iconv ("ASCII//IGNORE,TRANSLIT", "UTF-8", utf8, strlen (utf8), 2,
+ u2a_translit, 0);
+ /* Due to bug 19519, iconv was ignoring TRANSLIT for the following input. */
+ test_iconv ("ASCII//IGNORE//TRANSLIT", "UTF-8", utf8, strlen (utf8), 2,
+ u2a_translit, 0);
+
+ /* Misspellings of TRANSLIT and IGNORE are ignored, but conversion still
+ works while respecting any other correctly spelled options. */
+
+ test_iconv ("ASCII//T", "UTF-8", utf8, strlen (u2a), (size_t) -1, u2a,
+ EILSEQ);
+ test_iconv ("ASCII//TRANSLITERATE", "UTF-8", utf8, strlen (u2a), (size_t) -1,
+ u2a, EILSEQ);
+ test_iconv ("ASCII//I", "UTF-8", utf8, strlen (u2a), (size_t) -1, u2a,
+ EILSEQ);
+ test_iconv ("ASCII//IGNORED", "UTF-8", utf8, strlen (u2a), (size_t) -1, u2a,
+ EILSEQ);
+ test_iconv ("ASCII//TRANSLITERATE//IGNORED", "UTF-8", utf8, strlen (u2a),
+ (size_t) -1, u2a, EILSEQ);
+ test_iconv ("ASCII//IGNORED,TRANSLITERATE", "UTF-8", utf8, strlen (u2a),
+ (size_t) -1, u2a, EILSEQ);
+ test_iconv ("ASCII//T//I", "UTF-8", utf8, strlen (u2a), (size_t) -1, u2a,
+ EILSEQ);
+
+ test_iconv ("ASCII//TRANSLIT//I", "UTF-8", utf8, strlen (utf8), 2,
+ u2a_translit, 0);
+ /* Due to bug 19519, iconv was ignoring TRANSLIT for the following input. */
+ test_iconv ("ASCII//I//TRANSLIT", "UTF-8", utf8, strlen (utf8), 2,
+ u2a_translit, 0);
+ test_iconv ("ASCII//IGNORED,TRANSLIT", "UTF-8", utf8, strlen (utf8), 2,
+ u2a_translit, 0);
+ test_iconv ("ASCII//TRANSLIT,IGNORED", "UTF-8", utf8, strlen (utf8), 2,
+ u2a_translit, 0);
+
+ test_iconv ("ASCII//IGNORE,T", "UTF-8", utf8, strlen (utf8), (size_t) -1,
+ u2a_ignore, EILSEQ);
+ test_iconv ("ASCII//T,IGNORE", "UTF-8", utf8, strlen (utf8), (size_t) -1,
+ u2a_ignore, EILSEQ);
+ /* Due to bug 19519, iconv was ignoring IGNORE for the following input. */
+ test_iconv ("ASCII//TRANSLITERATE//IGNORE", "UTF-8", utf8, strlen (utf8),
+ (size_t) -1, u2a_ignore, EILSEQ);
+ test_iconv ("ASCII//IGNORE//TRANSLITERATE", "UTF-8", utf8, strlen (utf8),
+ (size_t) -1, u2a_ignore, EILSEQ);
+
+
+ /* 3. Invalid UTF-8 followed by some valid non-ASCII UTF-8 characters: */
+
+ /* EILSEQ; output is truncated at the first invalid UTF-8 character. */
+ test_iconv ("ASCII", "UTF-8", iutf8, strlen (iu2a), (size_t) -1, iu2a,
+ EILSEQ);
+
+ /* With TRANSLIT enabled: EILSEQ; output still truncated at the first invalid
+ UTF-8 character. */
+ test_iconv ("ASCII//TRANSLIT", "UTF-8", iutf8, strlen (iu2a), (size_t) -1,
+ iu2a, EILSEQ);
+
+ /* With IGNORE enabled: EILSEQ; output omits invalid UTF-8 characters and
+ valid UTF-8 non-ASCII characters. */
+ test_iconv ("ASCII//IGNORE", "UTF-8", iutf8, strlen (iutf8), (size_t) -1,
+ iu2a_ignore, EILSEQ);
+
+ /* With TRANSLIT and IGNORE enabled, output omits only invalid UTF-8
+ characters and transliterates valid non-ASCII UTF-8 characters. We test
+ four combinations. */
+
+ test_iconv ("ASCII//TRANSLIT,IGNORE", "UTF-8", iutf8, strlen (iutf8), 2,
+ iu2a_both, 0);
+ /* Due to bug 19519, iconv was ignoring IGNORE for the following input. */
+ test_iconv ("ASCII//TRANSLIT//IGNORE", "UTF-8", iutf8, strlen (iutf8), 2,
+ iu2a_both, 0);
+ test_iconv ("ASCII//IGNORE,TRANSLIT", "UTF-8", iutf8, strlen (iutf8), 2,
+ iu2a_both, 0);
+ /* Due to bug 19519, iconv was ignoring TRANSLIT for the following input. */
+ test_iconv ("ASCII//IGNORE//TRANSLIT", "UTF-8", iutf8, strlen (iutf8), 2,
+ iu2a_both, 0);
+
+
+ /* 4. Invalid UTF-8 with valid non-ASCII UTF-8 chars appearing first: */
+
+ /* EILSEQ; output is truncated at the first non-ASCII character. */
+ test_iconv ("ASCII", "UTF-8", jutf8, strlen (ju2a), (size_t) -1, ju2a,
+ EILSEQ);
+
+ /* With TRANSLIT enabled: EILSEQ; output now truncated at the first invalid
+ UTF-8 character. */
+ test_iconv ("ASCII//TRANSLIT", "UTF-8", jutf8, strlen (jutf8) - 5,
+ (size_t) -1, ju2a_translit, EILSEQ);
+ test_iconv ("ASCII//translit", "UTF-8", jutf8, strlen (jutf8) - 5,
+ (size_t) -1, ju2a_translit, EILSEQ);
+
+ /* With IGNORE enabled: EILSEQ; output omits invalid UTF-8 characters and
+ valid UTF-8 non-ASCII characters. */
+ test_iconv ("ASCII//IGNORE", "UTF-8", jutf8, strlen (jutf8), (size_t) -1,
+ ju2a_ignore, EILSEQ);
+ test_iconv ("ASCII//ignore", "UTF-8", jutf8, strlen (jutf8), (size_t) -1,
+ ju2a_ignore, EILSEQ);
+
+ /* With TRANSLIT and IGNORE enabled, output omits only invalid UTF-8
+ characters and transliterates valid non-ASCII UTF-8 characters. We test
+ several combinations. */
+
+ test_iconv ("ASCII//TRANSLIT,IGNORE", "UTF-8", jutf8, strlen (jutf8), 2,
+ ju2a_both, 0);
+ /* Due to bug 19519, iconv was ignoring IGNORE for the following input. */
+ test_iconv ("ASCII//TRANSLIT//IGNORE", "UTF-8", jutf8, strlen (jutf8), 2,
+ ju2a_both, 0);
+ test_iconv ("ASCII//IGNORE,TRANSLIT", "UTF-8", jutf8, strlen (jutf8), 2,
+ ju2a_both, 0);
+ /* Due to bug 19519, iconv was ignoring TRANSLIT for the following input. */
+ test_iconv ("ASCII//IGNORE//TRANSLIT", "UTF-8", jutf8, strlen (jutf8), 2,
+ ju2a_both, 0);
+ test_iconv ("ASCII//translit,ignore", "UTF-8", jutf8, strlen (jutf8), 2,
+ ju2a_both, 0);
+ /* Trailing whitespace and separators should be ignored. */
+ test_iconv ("ASCII//IGNORE,TRANSLIT ", "UTF-8", jutf8, strlen (jutf8), 2,
+ ju2a_both, 0);
+ test_iconv ("ASCII//IGNORE,TRANSLIT/", "UTF-8", jutf8, strlen (jutf8), 2,
+ ju2a_both, 0);
+ test_iconv ("ASCII//IGNORE,TRANSLIT//", "UTF-8", jutf8, strlen (jutf8), 2,
+ ju2a_both, 0);
+ test_iconv ("ASCII//IGNORE,TRANSLIT,", "UTF-8", jutf8, strlen (jutf8), 2,
+ ju2a_both, 0);
+ test_iconv ("ASCII//IGNORE,TRANSLIT,,", "UTF-8", jutf8, strlen (jutf8), 2,
+ ju2a_both, 0);
+ test_iconv ("ASCII//IGNORE,TRANSLIT /,", "UTF-8", jutf8, strlen (jutf8), 2,
+ ju2a_both, 0);
+
+ /* TRANSLIT or IGNORE suffixes in fromcode should be ignored. */
+ test_iconv ("ASCII", "UTF-8//TRANSLIT", jutf8, strlen (ju2a), (size_t) -1,
+ ju2a, EILSEQ);
+ test_iconv ("ASCII", "UTF-8//IGNORE", jutf8, strlen (ju2a), (size_t) -1,
+ ju2a, EILSEQ);
+ test_iconv ("ASCII", "UTF-8//TRANSLIT,IGNORE", jutf8, strlen (ju2a),
+ (size_t) -1, ju2a, EILSEQ);
+
+
+ /* 5. Charset names of the form "A/B/": */
+
+ /* ISO-8859-1 is converted to UTF-8 without needing transliteration. */
+ test_iconv ("ISO-10646/UTF-8", "ISO-8859-1", iso8859_1_a,
+ strlen (iso8859_1_a), 0, utf8_a, 0);
+ test_iconv ("ISO-10646/UTF-8/", "ISO-8859-1", iso8859_1_a,
+ strlen (iso8859_1_a), 0, utf8_a, 0);
+ test_iconv ("ISO-10646/UTF-8/IGNORE", "ISO-8859-1", iso8859_1_a,
+ strlen (iso8859_1_a), 0, utf8_a, 0);
+ test_iconv ("ISO-10646/UTF-8//IGNORE", "ISO-8859-1", iso8859_1_a,
+ strlen (iso8859_1_a), 0, utf8_a, 0);
+ test_iconv ("ISO-10646/UTF-8/TRANSLIT", "ISO-8859-1", iso8859_1_a,
+ strlen (iso8859_1_a), 0, utf8_a, 0);
+ test_iconv ("ISO-10646/UTF-8//TRANSLIT", "ISO-8859-1", iso8859_1_a,
+ strlen (iso8859_1_a), 0, utf8_a, 0);
+ test_iconv ("ISO-10646/UTF-8//TRANSLIT/IGNORE", "ISO-8859-1", iso8859_1_a,
+ strlen (iso8859_1_a), 0, utf8_a, 0);
+ test_iconv ("ISO-10646/UTF-8//TRANSLIT//IGNORE", "ISO-8859-1", iso8859_1_a,
+ strlen (iso8859_1_a), 0, utf8_a, 0);
+ test_iconv ("ISO-10646/UTF-8/TRANSLIT,IGNORE", "ISO-8859-1", iso8859_1_a,
+ strlen (iso8859_1_a), 0, utf8_a, 0);
+
+ /* UTF-8 with accented A's is converted to ASCII with transliteration. */
+ test_iconv ("ASCII", "ISO-10646/UTF-8", utf8_a,
+ 0, (size_t) -1, empty, EILSEQ);
+ test_iconv ("ASCII//IGNORE", "ISO-10646/UTF-8", utf8_a,
+ strlen (utf8_a), (size_t) -1, empty, EILSEQ);
+ test_iconv ("ASCII//TRANSLIT", "ISO-10646/UTF-8", utf8_a,
+ strlen (utf8_a), 12, ascii_a, 0);
+
+ /* Invalid ASCII is converted to UTF-8 only with IGNORE. */
+ test_iconv ("ISO-10646/UTF-8", "ASCII", iascii, strlen (empty), (size_t) -1,
+ empty, EILSEQ);
+ test_iconv ("ISO-10646/UTF-8/TRANSLIT", "ASCII", iascii, strlen (empty),
+ (size_t) -1, empty, EILSEQ);
+ test_iconv ("ISO-10646/UTF-8/IGNORE", "ASCII", iascii, strlen (iascii),
+ (size_t) -1, ia2u_ignore, EILSEQ);
+ test_iconv ("ISO-10646/UTF-8/TRANSLIT,IGNORE", "ASCII", iascii,
+ strlen (iascii), (size_t) -1, ia2u_ignore, EILSEQ);
+ /* Due to bug 19519, iconv was ignoring IGNORE for the following three
+ inputs: */
+ test_iconv ("ISO-10646/UTF-8/TRANSLIT/IGNORE", "ASCII", iascii,
+ strlen (iascii), (size_t) -1, ia2u_ignore, EILSEQ);
+ test_iconv ("ISO-10646/UTF-8//TRANSLIT,IGNORE", "ASCII", iascii,
+ strlen (iascii), (size_t) -1, ia2u_ignore, EILSEQ);
+ test_iconv ("ISO-10646/UTF-8//TRANSLIT//IGNORE", "ASCII", iascii,
+ strlen (iascii), (size_t) -1, ia2u_ignore, EILSEQ);
+
+ return 0;
+}
+
+#include <support/test-driver.c>
new file mode 100644
@@ -0,0 +1,280 @@
+#!/bin/bash
+# Test for some known iconv(1) hangs from bug 19519, and miscellaneous
+# iconv(1) program error conditions.
+# Copyright (C) 2020 Free Software Foundation, Inc.
+# This file is part of the GNU C Library.
+
+# The GNU C Library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+
+# The GNU C Library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+
+# You should have received a copy of the GNU Lesser General Public
+# License along with the GNU C Library; if not, see
+# <https://www.gnu.org/licenses/>.
+
+codir=$1
+test_wrapper_env="$2"
+run_program_env="$3"
+
+# We have to have some directories in the library path.
+LIBPATH=$codir:$codir/iconvdata
+
+# How the start the iconv(1) program. $from is not defined/expanded yet.
+ICONV='
+$codir/elf/ld.so --library-path $LIBPATH --inhibit-rpath ${from}.so
+$codir/iconv/iconv_prog
+'
+ICONV="$test_wrapper_env $run_program_env $ICONV"
+
+# List of known hangs;
+# Gathered by running an exhaustive 2 byte input search against glibc-2.28
+hangarray=(
+"\x00\x23;-c;ANSI_X3.110;UTF-8//TRANSLIT//IGNORE"
+"\x00\xa1;-c;ARMSCII-8;UTF-8//TRANSLIT//IGNORE"
+"\x00\xa1;-c;ASMO_449;UTF-8//TRANSLIT//IGNORE"
+"\x00\x81;-c;BIG5;UTF-8//TRANSLIT//IGNORE"
+"\x00\xff;-c;BIG5HKSCS;UTF-8//TRANSLIT//IGNORE"
+"\x00\xff;-c;BRF;UTF-8//TRANSLIT//IGNORE"
+"\x00\xff;-c;BS_4730;UTF-8//TRANSLIT//IGNORE"
+"\x00\x81;-c;CP1250;UTF-8//TRANSLIT//IGNORE"
+"\x00\x98;-c;CP1251;UTF-8//TRANSLIT//IGNORE"
+"\x00\x81;-c;CP1252;UTF-8//TRANSLIT//IGNORE"
+"\x00\x81;-c;CP1253;UTF-8//TRANSLIT//IGNORE"
+"\x00\x81;-c;CP1254;UTF-8//TRANSLIT//IGNORE"
+"\x00\x81;-c;CP1255;UTF-8//TRANSLIT//IGNORE"
+"\x00\x81;-c;CP1257;UTF-8//TRANSLIT//IGNORE"
+"\x00\x81;-c;CP1258;UTF-8//TRANSLIT//IGNORE"
+"\xff\xff;-c;CP932;UTF-8//TRANSLIT//IGNORE"
+"\xff\xff;-c;CSA_Z243.4-1985-1;UTF-8//TRANSLIT//IGNORE"
+"\xff\xff;-c;CSA_Z243.4-1985-2;UTF-8//TRANSLIT//IGNORE"
+"\xff\xff;-c;DEC-MCS;UTF-8//TRANSLIT//IGNORE"
+"\xff\xff;-c;DIN_66003;UTF-8//TRANSLIT//IGNORE"
+"\xff\xff;-c;DS_2089;UTF-8//TRANSLIT//IGNORE"
+"\x00\x41;-c;EBCDIC-AT-DE;UTF-8//TRANSLIT//IGNORE"
+"\x00\x41;-c;EBCDIC-AT-DE-A;UTF-8//TRANSLIT//IGNORE"
+"\x00\x41;-c;EBCDIC-CA-FR;UTF-8//TRANSLIT//IGNORE"
+"\x00\x41;-c;EBCDIC-DK-NO;UTF-8//TRANSLIT//IGNORE"
+"\x00\x41;-c;EBCDIC-DK-NO-A;UTF-8//TRANSLIT//IGNORE"
+"\x00\x41;-c;EBCDIC-ES;UTF-8//TRANSLIT//IGNORE"
+"\x00\x41;-c;EBCDIC-ES-A;UTF-8//TRANSLIT//IGNORE"
+"\x00\x41;-c;EBCDIC-ES-S;UTF-8//TRANSLIT//IGNORE"
+"\x00\x41;-c;EBCDIC-FI-SE;UTF-8//TRANSLIT//IGNORE"
+"\x00\x41;-c;EBCDIC-FI-SE-A;UTF-8//TRANSLIT//IGNORE"
+"\x00\x41;-c;EBCDIC-FR;UTF-8//TRANSLIT//IGNORE"
+"\x00\x41;-c;EBCDIC-IS-FRISS;UTF-8//TRANSLIT//IGNORE"
+"\x00\x41;-c;EBCDIC-IT;UTF-8//TRANSLIT//IGNORE"
+"\x00\x41;-c;EBCDIC-PT;UTF-8//TRANSLIT//IGNORE"
+"\x00\x41;-c;EBCDIC-UK;UTF-8//TRANSLIT//IGNORE"
+"\x00\x41;-c;EBCDIC-US;UTF-8//TRANSLIT//IGNORE"
+"\xff\xff;-c;ES;UTF-8//TRANSLIT//IGNORE"
+"\xff\xff;-c;ES2;UTF-8//TRANSLIT//IGNORE"
+"\xff\xff;-c;EUC-CN;UTF-8//TRANSLIT//IGNORE"
+"\xff\xff;-c;EUC-JISX0213;UTF-8//TRANSLIT//IGNORE"
+"\xff\xff;-c;EUC-JP;UTF-8//TRANSLIT//IGNORE"
+"\xff\xff;-c;EUC-JP-MS;UTF-8//TRANSLIT//IGNORE"
+"\xff\xff;-c;EUC-KR;UTF-8//TRANSLIT//IGNORE"
+"\xff\xff;-c;EUC-TW;UTF-8//TRANSLIT//IGNORE"
+"\xff\xff;-c;GB18030;UTF-8//TRANSLIT//IGNORE"
+"\xff\xff;-c;GB_1988-80;UTF-8//TRANSLIT//IGNORE"
+"\xff\xff;-c;GBK;UTF-8//TRANSLIT//IGNORE"
+"\xff\xff;-c;GOST_19768-74;UTF-8//TRANSLIT//IGNORE"
+"\xff\xff;-c;GREEK7;UTF-8//TRANSLIT//IGNORE"
+"\xff\xff;-c;GREEK7-OLD;UTF-8//TRANSLIT//IGNORE"
+"\xff\xff;-c;GREEK-CCITT;UTF-8//TRANSLIT//IGNORE"
+"\xff\xff;-c;HP-GREEK8;UTF-8//TRANSLIT//IGNORE"
+"\xff\xff;-c;HP-ROMAN8;UTF-8//TRANSLIT//IGNORE"
+"\xff\xff;-c;HP-ROMAN9;UTF-8//TRANSLIT//IGNORE"
+"\xff\xff;-c;HP-THAI8;UTF-8//TRANSLIT//IGNORE"
+"\xff\xff;-c;HP-TURKISH8;UTF-8//TRANSLIT//IGNORE"
+"\x00\x41;-c;IBM038;UTF-8//TRANSLIT//IGNORE"
+"\x00\x80;-c;IBM1004;UTF-8//TRANSLIT//IGNORE"
+"\x00\xff;-c;IBM1008;UTF-8//TRANSLIT//IGNORE"
+"\xff\xff;-c;IBM1046;UTF-8//TRANSLIT//IGNORE"
+"\x00\x51;-c;IBM1132;UTF-8//TRANSLIT//IGNORE"
+"\x00\xa0;-c;IBM1133;UTF-8//TRANSLIT//IGNORE"
+"\x00\xce;-c;IBM1137;UTF-8//TRANSLIT//IGNORE"
+"\x00\x80;-c;IBM1161;UTF-8//TRANSLIT//IGNORE"
+"\x00\xdb;-c;IBM1162;UTF-8//TRANSLIT//IGNORE"
+"\x00\x70;-c;IBM12712;UTF-8//TRANSLIT//IGNORE"
+# These are known hangs that are yet to be fixed:
+# "\x00\x0f;-c;IBM1364;UTF-8"
+# "\x00\x0f;-c;IBM1371;UTF-8"
+# "\x00\x0f;-c;IBM1388;UTF-8"
+# "\x00\x0f;-c;IBM1390;UTF-8"
+# "\x00\x0f;-c;IBM1399;UTF-8"
+"\x00\x53;-c;IBM16804;UTF-8//TRANSLIT//IGNORE"
+"\x00\x41;-c;IBM274;UTF-8//TRANSLIT//IGNORE"
+"\x00\x41;-c;IBM275;UTF-8//TRANSLIT//IGNORE"
+"\x00\x41;-c;IBM281;UTF-8//TRANSLIT//IGNORE"
+"\x00\x57;-c;IBM290;UTF-8//TRANSLIT//IGNORE"
+"\x00\x45;-c;IBM420;UTF-8//TRANSLIT//IGNORE"
+"\x00\x68;-c;IBM423;UTF-8//TRANSLIT//IGNORE"
+"\x00\x70;-c;IBM424;UTF-8//TRANSLIT//IGNORE"
+"\x00\x53;-c;IBM4517;UTF-8//TRANSLIT//IGNORE"
+"\x00\x53;-c;IBM4899;UTF-8//TRANSLIT//IGNORE"
+"\x00\xa5;-c;IBM4909;UTF-8//TRANSLIT//IGNORE"
+"\x00\xdc;-c;IBM4971;UTF-8//TRANSLIT//IGNORE"
+"\x00\x41;-c;IBM803;UTF-8//TRANSLIT//IGNORE"
+"\x00\x91;-c;IBM851;UTF-8//TRANSLIT//IGNORE"
+"\x00\x9b;-c;IBM856;UTF-8//TRANSLIT//IGNORE"
+"\x00\xd5;-c;IBM857;UTF-8//TRANSLIT//IGNORE"
+"\xff\xff;-c;IBM864;UTF-8//TRANSLIT//IGNORE"
+"\x00\x94;-c;IBM868;UTF-8//TRANSLIT//IGNORE"
+"\x00\x94;-c;IBM869;UTF-8//TRANSLIT//IGNORE"
+"\xff\xff;-c;IBM874;UTF-8//TRANSLIT//IGNORE"
+"\x00\x6a;-c;IBM875;UTF-8//TRANSLIT//IGNORE"
+"\x00\x41;-c;IBM880;UTF-8//TRANSLIT//IGNORE"
+"\x00\x80;-c;IBM891;UTF-8//TRANSLIT//IGNORE"
+"\xff\xff;-c;IBM903;UTF-8//TRANSLIT//IGNORE"
+"\xff\xff;-c;IBM904;UTF-8//TRANSLIT//IGNORE"
+"\x00\x41;-c;IBM905;UTF-8//TRANSLIT//IGNORE"
+"\x00\x80;-c;IBM9066;UTF-8//TRANSLIT//IGNORE"
+"\x00\x48;-c;IBM918;UTF-8//TRANSLIT//IGNORE"
+"\x00\x57;-c;IBM930;UTF-8//TRANSLIT//IGNORE"
+"\x00\x80;-c;IBM932;UTF-8//TRANSLIT//IGNORE"
+"\x00\x41;-c;IBM933;UTF-8//TRANSLIT//IGNORE"
+"\x00\x41;-c;IBM935;UTF-8//TRANSLIT//IGNORE"
+"\x00\x41;-c;IBM937;UTF-8//TRANSLIT//IGNORE"
+"\x00\x41;-c;IBM939;UTF-8//TRANSLIT//IGNORE"
+"\x00\x80;-c;IBM943;UTF-8//TRANSLIT//IGNORE"
+"\xff\xff;-c;INIS;UTF-8//TRANSLIT//IGNORE"
+"\xff\xff;-c;INIS-8;UTF-8//TRANSLIT//IGNORE"
+"\xff\xff;-c;INIS-CYRILLIC;UTF-8//TRANSLIT//IGNORE"
+"\x00\xec;-c;ISIRI-3342;UTF-8//TRANSLIT//IGNORE"
+"\x00\xec;-c;ISO_10367-BOX;UTF-8//TRANSLIT//IGNORE"
+"\xff\xff;-c;ISO-2022-CN;UTF-8//TRANSLIT//IGNORE"
+"\xff\xff;-c;ISO-2022-CN-EXT;UTF-8//TRANSLIT//IGNORE"
+"\xff\xff;-c;ISO-2022-JP;UTF-8//TRANSLIT//IGNORE"
+"\xff\xff;-c;ISO-2022-JP-2;UTF-8//TRANSLIT//IGNORE"
+"\xff\xff;-c;ISO-2022-JP-3;UTF-8//TRANSLIT//IGNORE"
+"\xff\xff;-c;ISO-2022-KR;UTF-8//TRANSLIT//IGNORE"
+"\xff\xff;-c;ISO_2033;UTF-8//TRANSLIT//IGNORE"
+"\xff\xff;-c;ISO_5427;UTF-8//TRANSLIT//IGNORE"
+"\xff\xff;-c;ISO_5427-EXT;UTF-8//TRANSLIT//IGNORE"
+"\xff\xff;-c;ISO_5428;UTF-8//TRANSLIT//IGNORE"
+"\x00\xa4;-c;ISO_6937;UTF-8//TRANSLIT//IGNORE"
+"\x00\xa0;-c;ISO_6937-2;UTF-8//TRANSLIT//IGNORE"
+"\xff\xff;-c;ISO-8859-11;UTF-8//TRANSLIT//IGNORE"
+"\x00\xa5;-c;ISO-8859-3;UTF-8//TRANSLIT//IGNORE"
+"\xff\xff;-c;ISO-8859-6;UTF-8//TRANSLIT//IGNORE"
+"\xff\xff;-c;ISO-8859-7;UTF-8//TRANSLIT//IGNORE"
+"\xff\xff;-c;ISO-8859-8;UTF-8//TRANSLIT//IGNORE"
+"\x00\x80;-c;ISO-IR-197;UTF-8//TRANSLIT//IGNORE"
+"\x00\x80;-c;ISO-IR-209;UTF-8//TRANSLIT//IGNORE"
+"\x00\x80;-c;IT;UTF-8//TRANSLIT//IGNORE"
+"\x00\x80;-c;JIS_C6220-1969-RO;UTF-8//TRANSLIT//IGNORE"
+"\x00\x80;-c;JIS_C6229-1984-B;UTF-8//TRANSLIT//IGNORE"
+"\x00\x80;-c;JOHAB;UTF-8//TRANSLIT//IGNORE"
+"\x00\x80;-c;JUS_I.B1.002;UTF-8//TRANSLIT//IGNORE"
+"\x00\x80;-c;KOI-8;UTF-8//TRANSLIT//IGNORE"
+"\x00\x88;-c;KOI8-T;UTF-8//TRANSLIT//IGNORE"
+"\xff\xff;-c;KSC5636;UTF-8//TRANSLIT//IGNORE"
+"\xff\xff;-c;LATIN-GREEK;UTF-8//TRANSLIT//IGNORE"
+"\xff\xff;-c;LATIN-GREEK-1;UTF-8//TRANSLIT//IGNORE"
+"\x00\xf6;-c;MAC-IS;UTF-8//TRANSLIT//IGNORE"
+"\xff\xff;-c;MSZ_7795.3;UTF-8//TRANSLIT//IGNORE"
+"\xff\xff;-c;NATS-DANO;UTF-8//TRANSLIT//IGNORE"
+"\xff\xff;-c;NATS-SEFI;UTF-8//TRANSLIT//IGNORE"
+"\xff\xff;-c;NC_NC00-10;UTF-8//TRANSLIT//IGNORE"
+"\xff\xff;-c;NF_Z_62-010;UTF-8//TRANSLIT//IGNORE"
+"\xff\xff;-c;NF_Z_62-010_1973;UTF-8//TRANSLIT//IGNORE"
+"\xff\xff;-c;NS_4551-1;UTF-8//TRANSLIT//IGNORE"
+"\xff\xff;-c;NS_4551-2;UTF-8//TRANSLIT//IGNORE"
+"\xff\xff;-c;PT;UTF-8//TRANSLIT//IGNORE"
+"\xff\xff;-c;PT2;UTF-8//TRANSLIT//IGNORE"
+"\x00\x98;-c;RK1048;UTF-8//TRANSLIT//IGNORE"
+"\x00\x98;-c;SEN_850200_B;UTF-8//TRANSLIT//IGNORE"
+"\x00\x98;-c;SEN_850200_C;UTF-8//TRANSLIT//IGNORE"
+"\x00\x80;-c;Shift_JISX0213;UTF-8//TRANSLIT//IGNORE"
+"\x00\x80;-c;SJIS;UTF-8//TRANSLIT//IGNORE"
+"\x00\x23;-c;T.61-8BIT;UTF-8//TRANSLIT//IGNORE"
+"\xff\xff;-c;TIS-620;UTF-8//TRANSLIT//IGNORE"
+"\xff\xff;-c;TSCII;UTF-8//TRANSLIT//IGNORE"
+"\xff\xff;-c;UHC;UTF-8//TRANSLIT//IGNORE"
+"\x00\xd8;-c;UNICODE;UTF-8//TRANSLIT//IGNORE"
+"\x00\xdc;-c;UTF-16;UTF-8//TRANSLIT//IGNORE"
+"\xdc\x00;-c;UTF-16BE;UTF-8//TRANSLIT//IGNORE"
+"\x00\xdc;-c;UTF-16LE;UTF-8//TRANSLIT//IGNORE"
+"\xff\xff;-c;UTF-7;UTF-8//TRANSLIT//IGNORE"
+"\x00\x81;-c;WIN-SAMI-2;UTF-8//TRANSLIT//IGNORE"
+)
+
+# List of option combinations that *should* lead to an error
+errorarray=(
+# Converting from/to invalid character sets should cause error
+"\x00\x00;;INVALID;INVALID"
+"\x00\x00;;INVALID;UTF-8"
+"\x00\x00;;UTF-8;INVALID"
+)
+
+# Requires $twobyte input, $c flag, $from, and $to to be set; sets $ret
+execute_test ()
+{
+ eval PROG=\"$ICONV\"
+ echo -en "$twobyte" \
+ | timeout -k 4 3 $PROG $c -f $from -t "$to" &>/dev/null
+ ret=$?
+}
+
+check_hangtest_result ()
+{
+ if [ "$ret" -eq "124" ] || [ "$ret" -eq "137" ]; then # timeout/hang
+ result="HANG"
+ else
+ if [ "$ret" -eq "139" ]; then # segfault
+ result="SEGFAULT"
+ else
+ if [ "$ret" -gt "127" ]; then # unexpected error
+ result="UNEXPECTED"
+ else
+ result="OK"
+ fi
+ fi
+ fi
+
+ echo -n "$result: from: \"$from\", to: \"$to\","
+ echo " input \"$twobyte\", flags \"$c\""
+
+ if [ "$result" != "OK" ]; then
+ exit 1
+ fi
+}
+
+for hangcommand in "${hangarray[@]}"; do
+ twobyte="$(echo "$hangcommand" | cut -d";" -f 1)"
+ c="$(echo "$hangcommand" | cut -d";" -f 2)"
+ from="$(echo "$hangcommand" | cut -d";" -f 3)"
+ to="$(echo "$hangcommand" | cut -d";" -f 4)"
+ execute_test
+ check_hangtest_result
+done
+
+check_errtest_result ()
+{
+ if [ "$ret" -eq "1" ]; then # we errored out as expected
+ result="PASS"
+ else
+ result="FAIL"
+ fi
+ echo -n "$result: from: \"$from\", to: \"$to\","
+ echo " input \"$twobyte\", flags \"$c\", return code $ret"
+
+ if [ "$result" != "PASS" ]; then
+ exit 1
+ fi
+}
+
+for errorcommand in "${errorarray[@]}"; do
+ twobyte="$(echo "$errorcommand" | cut -d";" -f 1)"
+ c="$(echo "$errorcommand" | cut -d";" -f 2)"
+ from="$(echo "$errorcommand" | cut -d";" -f 3)"
+ to="$(echo "$errorcommand" | cut -d";" -f 4)"
+ execute_test
+ check_errtest_result
+done
@@ -1119,11 +1119,16 @@ _nl_find_msg (struct loaded_l10nfile *domain_file,
outcharset = encoding;
# ifdef _LIBC
- /* We always want to use transliteration. */
- outcharset = norm_add_slashes (outcharset, "TRANSLIT");
- charset = norm_add_slashes (charset, "");
- int r = __gconv_open (outcharset, charset, &convd->conv,
- GCONV_AVOID_NOCONV);
+
+ struct gconv_spec conv_spec
+ = { .fromcode = norm_add_slashes (charset, ""),
+ .tocode = norm_add_slashes (outcharset, ""),
+ /* We always want to use transliteration. */
+ .translit = true,
+ .ignore = false
+ };
+ int r = __gconv_open (&conv_spec, &convd->conv,
+ GCONV_AVOID_NOCONV);
if (__builtin_expect (r != __GCONV_OK, 0))
{
/* If the output encoding is the same there is