diff mbox

[v2,1/2] iconv_prog: Don't slurp whole input at once [BZ #6050]

Message ID 1439332066-13664-2-git-send-email-ben@0x539.de
State New, archived
Headers show

Commit Message

Benjamin Herr Aug. 11, 2015, 10:27 p.m. UTC
As described in the bug report comments, contrary to the comment in
'process_fd' , iconv can deal with being passed invalid multibyte
sequences, as it leaves the input pointer at the start of the sequence
and we can just call it again once more bytes are availabe.

Therefore we do not need to read the whole input at once when reading
from an fd, and can process it in chunks instead. This enables the iconv
program to be used in pipelines sensibly, and to not choke on very large
input files.

To still support reporting the position of invalid sequences in the
input, we count consumed input bytes explicitly. As overflow of that
counter is now possible, we use saturating addition and just mention in
the error message when (size_t) -1 has been reached.
---
2015-08-08  Benjamin Herr  <ben@0x539.de>

	[BZ #6050]
	* iconv/iconv_prog.c: Don't slurp the whole input at once, instead
	pass chunks to iconv and carry over incomplete multibyte sequences.


 iconv/iconv_prog.c | 247 +++++++++++++++++++++++++++++------------------------
 1 file changed, 135 insertions(+), 112 deletions(-)

Comments

Ondrej Bilka Aug. 11, 2015, 10:53 p.m. UTC | #1
On Wed, Aug 12, 2015 at 12:27:45AM +0200, Benjamin Herr wrote:
> As described in the bug report comments, contrary to the comment in
> 'process_fd' , iconv can deal with being passed invalid multibyte
> sequences, as it leaves the input pointer at the start of the sequence
> and we can just call it again once more bytes are availabe.
> 
> Therefore we do not need to read the whole input at once when reading
> from an fd, and can process it in chunks instead. This enables the iconv
> program to be used in pipelines sensibly, and to not choke on very large
> input files.
> 
> To still support reporting the position of invalid sequences in the
> input, we count consumed input bytes explicitly. As overflow of that
> counter is now possible, we use saturating addition and just mention in
> the error message when (size_t) -1 has been reached.
> ---
> 2015-08-08  Benjamin Herr  <ben@0x539.de>
> 
> 	[BZ #6050]
> 	* iconv/iconv_prog.c: Don't slurp the whole input at once, instead
> 	pass chunks to iconv and carry over incomplete multibyte sequences.
>
Could you next time split patches like that to first do refactoring to
shuffle lines around and then actual patch? It simplifies review. 

I read this patch and it look reasonable but is quite big so I would
prefer second set of eyes to also check that.
> 
>  iconv/iconv_prog.c | 247 +++++++++++++++++++++++++++++------------------------
>  1 file changed, 135 insertions(+), 112 deletions(-)
> 
> diff --git a/iconv/iconv_prog.c b/iconv/iconv_prog.c
> index e249bce..63b8e33 100644
> --- a/iconv/iconv_prog.c
> +++ b/iconv/iconv_prog.c
> @@ -465,29 +465,93 @@ conversion stopped due to problem in writing the output"));
>    return 0;
>  }
>  
> +static void
> +report_iconv_error (size_t position)
> +{
> +  switch (errno)
> +    {
> +    case EILSEQ:
> +      if (position < (size_t) -1)
> +	error (0, 0,
> +	       _("illegal input sequence at position %zu"), position);
> +      else
> +	error (0, 0,
> +	       _("illegal input sequence past position %zu"),
> +	       position - 1);
> +      break;
> +    case EINVAL:
> +      error (0, 0, _("\
> +incomplete character or shift sequence at end of buffer"));
> +      break;
> +    case EBADF:
> +      error (0, 0, _("internal error (illegal descriptor)"));
> +      break;
> +    default:
> +      error (0, 0, _("unknown iconv() error %d"), errno);
> +      break;
> +    }
> +}
>  
ok, as its moved from below.

> +#define BUF_SIZE	32768
>  static int
> -process_block (iconv_t cd, char *addr, size_t len, FILE **output,
> -	       const char *output_file)
> +flush_state (iconv_t cd, FILE **output, const char *output_file, size_t offset)
>  {
> -#define OUTBUF_SIZE	32768
> -  const char *start = addr;
> -  char outbuf[OUTBUF_SIZE];
> +  char outbuf[BUF_SIZE];
> +  char *outptr;
> +  size_t outlen;
> +  size_t n;
> +
> +  /* All the input test is processed.  For state-dependent
> +     character sets we have to flush the state now.  */
> +  outptr = outbuf;
> +  outlen = BUF_SIZE;
> +  n = iconv (cd, NULL, NULL, &outptr, &outlen);
> +
> +  if (outptr != outbuf)
> +    {
> +      if (write_output (outbuf, outptr, output, output_file) == -1)
> +	return -1;
> +    }
> +
> +  if (n == (size_t) -1)
> +    {
> +      report_iconv_error (offset);
> +      return 1;
> +    }
> +
> +  return 0;
> +}
> +
ok
> +static size_t
> +saturating_add (size_t a, size_t b)
> +{
> +  if ((size_t) -1 - a > b)
> +    return a + b;
> +  else
> +    return -1;
> +}
> +
ok
> +static int
> +process_part (iconv_t cd, char **addr, size_t *len, size_t offset,
> +	      FILE **output, const char *output_file)
> +{
> +  const char *start = *addr;
> +  char outbuf[BUF_SIZE];
>    char *outptr;
>    size_t outlen;
>    size_t n;
>    int ret = 0;
>  
> -  while (len > 0)
> +  while (*len > 0)
>      {
>        outptr = outbuf;
> -      outlen = OUTBUF_SIZE;
> -      n = iconv (cd, &addr, &len, &outptr, &outlen);
> +      outlen = BUF_SIZE;
> +      n = iconv (cd, addr, len, &outptr, &outlen);
>  
>        if (n == (size_t) -1 && omit_invalid && errno == EILSEQ)
>  	{
>  	  ret = 1;
> -	  if (len == 0)
> +	  if (*len == 0)
>  	    n = 0;
>  	  else
>  	    errno = E2BIG;
> @@ -500,52 +564,18 @@ process_block (iconv_t cd, char *addr, size_t len, FILE **output,
>  	    break;
>  	}
>  
> -      if (n != (size_t) -1)
> +      /* Incomplete multibyte characters might be completed by the next
> +         chunk, so do not treat them as an error here. */
> +      if (n != (size_t) -1 || errno == EINVAL)
>  	{
> -	  /* All the input test is processed.  For state-dependent
> -	     character sets we have to flush the state now.  */
> -	  outptr = outbuf;
> -	  outlen = OUTBUF_SIZE;
> -	  n = iconv (cd, NULL, NULL, &outptr, &outlen);
> -
> -	  if (outptr != outbuf)
> -	    {
> -	      ret = write_output (outbuf, outptr, output, output_file);
> -	      if (ret != 0)
> -		break;
> -	    }
> -
> -	  if (n != (size_t) -1)
> -	    break;
> -
> -	  if (omit_invalid && errno == EILSEQ)
> -	    {
> -	      ret = 1;
> -	      break;
> -	    }
> +	  ret = 0;
> +	  break;
>  	}
>  
>        if (errno != E2BIG)
>  	{
>  	  /* iconv() ran into a problem.  */
> -	  switch (errno)
> -	    {
> -	    case EILSEQ:
> -	      if (! omit_invalid)
> -		error (0, 0, _("illegal input sequence at position %ld"),
> -		       (long int) (addr - start));
> -	      break;
> -	    case EINVAL:
> -	      error (0, 0, _("\
> -incomplete character or shift sequence at end of buffer"));
> -	      break;
> -	    case EBADF:
> -	      error (0, 0, _("internal error (illegal descriptor)"));
> -	      break;
> -	    default:
> -	      error (0, 0, _("unknown iconv() error %d"), errno);
> -	      break;
> -	    }
> +          report_iconv_error (saturating_add (offset, (*addr - start)));
>  
>  	  return -1;
>  	}
> @@ -554,83 +584,76 @@ incomplete character or shift sequence at end of buffer"));
>    return ret;
>  }
>  
ok
> -
>  static int
> -process_fd (iconv_t cd, int fd, FILE **output, const char *output_file)
> +process_block (iconv_t cd, char *addr, size_t len, FILE **output,
> +	       const char *output_file)
>  {
> -  /* we have a problem with reading from a desriptor since we must not
> -     provide the iconv() function an incomplete character or shift
> -     sequence at the end of the buffer.  Since we have to deal with
> -     arbitrary encodings we must read the whole text in a buffer and
> -     process it in one step.  */
> -  static char *inbuf = NULL;
> -  static size_t maxlen = 0;
> -  char *inptr = NULL;
> -  size_t actlen = 0;
> -
> -  while (actlen < maxlen)
> -    {
> -      ssize_t n = read (fd, inptr, maxlen - actlen);
> +  char *start = addr;
>  
> -      if (n == 0)
> -	/* No more text to read.  */
> -	break;
> +  /* Process everything in one go.  */
> +  int ret = process_part (cd, &addr, &len, 0, output, output_file);
>  
> -      if (n == -1)
> -	{
> -	  /* Error while reading.  */
> -	  error (0, errno, _("error while reading the input"));
> -	  return -1;
> -	}
> +  if (ret != 0)
> +    return ret;
>  
> -      inptr += n;
> -      actlen += n;
> +  /* If there is input left over, there is an incomplete multibyte
> +     sequence at the end.  */
> +  if (len > 0)
> +    {
> +      errno = EINVAL;
> +      report_iconv_error (addr - start);
> +      return -1;
>      }
>  
> -  if (actlen == maxlen)
> -    while (1)
> -      {
> -	ssize_t n;
> -	char *new_inbuf;
> +  return flush_state (cd, output, output_file, addr - start);
> +}
>  
> -	/* Increase the buffer.  */
> -	new_inbuf = (char *) realloc (inbuf, maxlen + 32768);
> -	if (new_inbuf == NULL)
> -	  {
> -	    error (0, errno, _("unable to allocate buffer for input"));
> -	    return -1;
> -	  }
> -	inbuf = new_inbuf;
> -	maxlen += 32768;
> -	inptr = inbuf + actlen;
>  
> -	do
> -	  {
> -	    n = read (fd, inptr, maxlen - actlen);
> +static int
> +process_fd (iconv_t cd, int fd, FILE **output, const char *output_file)
> +{
> +  char inbuf[BUF_SIZE];
> +  size_t len = 0;
> +  size_t offset = 0;
> +  ssize_t n;
>  
> -	    if (n == 0)
> -	      /* No more text to read.  */
> -	      break;
> +  /* Read into the buffer past unconsumed bytes from the last iteration.  */
> +  while ((n = read (fd, inbuf + len, BUF_SIZE - len)) > 0)
> +    {
> +      int ret;
> +      char *inptr;
>  
> -	    if (n == -1)
> -	      {
> -		/* Error while reading.  */
> -		error (0, errno, _("error while reading the input"));
> -		return -1;
> -	      }
> +      len += n;
>  
> -	    inptr += n;
> -	    actlen += n;
> -	  }
> -	while (actlen < maxlen);
> +      inptr = inbuf;
> +      /* Process what we have read.  */
> +      ret = process_part (cd, &inptr, &len, offset, output, output_file);
> +      if (ret != 0)
> +	return ret;
> +      /* Keep track of overall position in the input for error reporting.  */
> +      offset = saturating_add (offset, inptr - inbuf);
>  
> -	if (n == 0)
> -	  /* Break again so we leave both loops.  */
> -	  break;
> -      }
> +      /* Keep incomplete multibyte characters, if any.  */
> +      memmove (inbuf, inptr, len);
> +    }
> +
> +  if (n == -1)
> +    {
> +      /* Error while reading.  */
> +      error (0, errno, _("error while reading the input"));
> +      return -1;
> +    }
> +
> +  /* No more input available, so incomplete multibyte sequences are
> +     not going to get completed at this point. */
> +  if (len > 0)
> +    {
> +      errno = EINVAL;
> +      report_iconv_error (offset);
> +      return -1;
> +    }
>  
> -  /* Now we have all the input in the buffer.  Process it in one run.  */
> -  return process_block (cd, inbuf, actlen, output, output_file);
> +  return flush_state (cd, output, output_file, offset);
>  }
>  
>
diff mbox

Patch

diff --git a/iconv/iconv_prog.c b/iconv/iconv_prog.c
index e249bce..63b8e33 100644
--- a/iconv/iconv_prog.c
+++ b/iconv/iconv_prog.c
@@ -465,29 +465,93 @@  conversion stopped due to problem in writing the output"));
   return 0;
 }
 
+static void
+report_iconv_error (size_t position)
+{
+  switch (errno)
+    {
+    case EILSEQ:
+      if (position < (size_t) -1)
+	error (0, 0,
+	       _("illegal input sequence at position %zu"), position);
+      else
+	error (0, 0,
+	       _("illegal input sequence past position %zu"),
+	       position - 1);
+      break;
+    case EINVAL:
+      error (0, 0, _("\
+incomplete character or shift sequence at end of buffer"));
+      break;
+    case EBADF:
+      error (0, 0, _("internal error (illegal descriptor)"));
+      break;
+    default:
+      error (0, 0, _("unknown iconv() error %d"), errno);
+      break;
+    }
+}
 
+#define BUF_SIZE	32768
 static int
-process_block (iconv_t cd, char *addr, size_t len, FILE **output,
-	       const char *output_file)
+flush_state (iconv_t cd, FILE **output, const char *output_file, size_t offset)
 {
-#define OUTBUF_SIZE	32768
-  const char *start = addr;
-  char outbuf[OUTBUF_SIZE];
+  char outbuf[BUF_SIZE];
+  char *outptr;
+  size_t outlen;
+  size_t n;
+
+  /* All the input test is processed.  For state-dependent
+     character sets we have to flush the state now.  */
+  outptr = outbuf;
+  outlen = BUF_SIZE;
+  n = iconv (cd, NULL, NULL, &outptr, &outlen);
+
+  if (outptr != outbuf)
+    {
+      if (write_output (outbuf, outptr, output, output_file) == -1)
+	return -1;
+    }
+
+  if (n == (size_t) -1)
+    {
+      report_iconv_error (offset);
+      return 1;
+    }
+
+  return 0;
+}
+
+static size_t
+saturating_add (size_t a, size_t b)
+{
+  if ((size_t) -1 - a > b)
+    return a + b;
+  else
+    return -1;
+}
+
+static int
+process_part (iconv_t cd, char **addr, size_t *len, size_t offset,
+	      FILE **output, const char *output_file)
+{
+  const char *start = *addr;
+  char outbuf[BUF_SIZE];
   char *outptr;
   size_t outlen;
   size_t n;
   int ret = 0;
 
-  while (len > 0)
+  while (*len > 0)
     {
       outptr = outbuf;
-      outlen = OUTBUF_SIZE;
-      n = iconv (cd, &addr, &len, &outptr, &outlen);
+      outlen = BUF_SIZE;
+      n = iconv (cd, addr, len, &outptr, &outlen);
 
       if (n == (size_t) -1 && omit_invalid && errno == EILSEQ)
 	{
 	  ret = 1;
-	  if (len == 0)
+	  if (*len == 0)
 	    n = 0;
 	  else
 	    errno = E2BIG;
@@ -500,52 +564,18 @@  process_block (iconv_t cd, char *addr, size_t len, FILE **output,
 	    break;
 	}
 
-      if (n != (size_t) -1)
+      /* Incomplete multibyte characters might be completed by the next
+         chunk, so do not treat them as an error here. */
+      if (n != (size_t) -1 || errno == EINVAL)
 	{
-	  /* All the input test is processed.  For state-dependent
-	     character sets we have to flush the state now.  */
-	  outptr = outbuf;
-	  outlen = OUTBUF_SIZE;
-	  n = iconv (cd, NULL, NULL, &outptr, &outlen);
-
-	  if (outptr != outbuf)
-	    {
-	      ret = write_output (outbuf, outptr, output, output_file);
-	      if (ret != 0)
-		break;
-	    }
-
-	  if (n != (size_t) -1)
-	    break;
-
-	  if (omit_invalid && errno == EILSEQ)
-	    {
-	      ret = 1;
-	      break;
-	    }
+	  ret = 0;
+	  break;
 	}
 
       if (errno != E2BIG)
 	{
 	  /* iconv() ran into a problem.  */
-	  switch (errno)
-	    {
-	    case EILSEQ:
-	      if (! omit_invalid)
-		error (0, 0, _("illegal input sequence at position %ld"),
-		       (long int) (addr - start));
-	      break;
-	    case EINVAL:
-	      error (0, 0, _("\
-incomplete character or shift sequence at end of buffer"));
-	      break;
-	    case EBADF:
-	      error (0, 0, _("internal error (illegal descriptor)"));
-	      break;
-	    default:
-	      error (0, 0, _("unknown iconv() error %d"), errno);
-	      break;
-	    }
+          report_iconv_error (saturating_add (offset, (*addr - start)));
 
 	  return -1;
 	}
@@ -554,83 +584,76 @@  incomplete character or shift sequence at end of buffer"));
   return ret;
 }
 
-
 static int
-process_fd (iconv_t cd, int fd, FILE **output, const char *output_file)
+process_block (iconv_t cd, char *addr, size_t len, FILE **output,
+	       const char *output_file)
 {
-  /* we have a problem with reading from a desriptor since we must not
-     provide the iconv() function an incomplete character or shift
-     sequence at the end of the buffer.  Since we have to deal with
-     arbitrary encodings we must read the whole text in a buffer and
-     process it in one step.  */
-  static char *inbuf = NULL;
-  static size_t maxlen = 0;
-  char *inptr = NULL;
-  size_t actlen = 0;
-
-  while (actlen < maxlen)
-    {
-      ssize_t n = read (fd, inptr, maxlen - actlen);
+  char *start = addr;
 
-      if (n == 0)
-	/* No more text to read.  */
-	break;
+  /* Process everything in one go.  */
+  int ret = process_part (cd, &addr, &len, 0, output, output_file);
 
-      if (n == -1)
-	{
-	  /* Error while reading.  */
-	  error (0, errno, _("error while reading the input"));
-	  return -1;
-	}
+  if (ret != 0)
+    return ret;
 
-      inptr += n;
-      actlen += n;
+  /* If there is input left over, there is an incomplete multibyte
+     sequence at the end.  */
+  if (len > 0)
+    {
+      errno = EINVAL;
+      report_iconv_error (addr - start);
+      return -1;
     }
 
-  if (actlen == maxlen)
-    while (1)
-      {
-	ssize_t n;
-	char *new_inbuf;
+  return flush_state (cd, output, output_file, addr - start);
+}
 
-	/* Increase the buffer.  */
-	new_inbuf = (char *) realloc (inbuf, maxlen + 32768);
-	if (new_inbuf == NULL)
-	  {
-	    error (0, errno, _("unable to allocate buffer for input"));
-	    return -1;
-	  }
-	inbuf = new_inbuf;
-	maxlen += 32768;
-	inptr = inbuf + actlen;
 
-	do
-	  {
-	    n = read (fd, inptr, maxlen - actlen);
+static int
+process_fd (iconv_t cd, int fd, FILE **output, const char *output_file)
+{
+  char inbuf[BUF_SIZE];
+  size_t len = 0;
+  size_t offset = 0;
+  ssize_t n;
 
-	    if (n == 0)
-	      /* No more text to read.  */
-	      break;
+  /* Read into the buffer past unconsumed bytes from the last iteration.  */
+  while ((n = read (fd, inbuf + len, BUF_SIZE - len)) > 0)
+    {
+      int ret;
+      char *inptr;
 
-	    if (n == -1)
-	      {
-		/* Error while reading.  */
-		error (0, errno, _("error while reading the input"));
-		return -1;
-	      }
+      len += n;
 
-	    inptr += n;
-	    actlen += n;
-	  }
-	while (actlen < maxlen);
+      inptr = inbuf;
+      /* Process what we have read.  */
+      ret = process_part (cd, &inptr, &len, offset, output, output_file);
+      if (ret != 0)
+	return ret;
+      /* Keep track of overall position in the input for error reporting.  */
+      offset = saturating_add (offset, inptr - inbuf);
 
-	if (n == 0)
-	  /* Break again so we leave both loops.  */
-	  break;
-      }
+      /* Keep incomplete multibyte characters, if any.  */
+      memmove (inbuf, inptr, len);
+    }
+
+  if (n == -1)
+    {
+      /* Error while reading.  */
+      error (0, errno, _("error while reading the input"));
+      return -1;
+    }
+
+  /* No more input available, so incomplete multibyte sequences are
+     not going to get completed at this point. */
+  if (len > 0)
+    {
+      errno = EINVAL;
+      report_iconv_error (offset);
+      return -1;
+    }
 
-  /* Now we have all the input in the buffer.  Process it in one run.  */
-  return process_block (cd, inbuf, actlen, output, output_file);
+  return flush_state (cd, output, output_file, offset);
 }