From patchwork Tue Aug 11 22:27:45 2015 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Benjamin Herr X-Patchwork-Id: 8141 Received: (qmail 16986 invoked by alias); 11 Aug 2015 22:29:06 -0000 Mailing-List: contact libc-alpha-help@sourceware.org; run by ezmlm Precedence: bulk List-Id: List-Unsubscribe: List-Subscribe: List-Archive: List-Post: List-Help: , Sender: libc-alpha-owner@sourceware.org Delivered-To: mailing list libc-alpha@sourceware.org Received: (qmail 16947 invoked by uid 89); 11 Aug 2015 22:29:06 -0000 Authentication-Results: sourceware.org; auth=none X-Virus-Found: No X-Spam-SWARE-Status: No, score=1.6 required=5.0 tests=AWL, BAYES_50, KAM_LAZY_DOMAIN_SECURITY, RCVD_IN_DNSWL_NONE, RP_MATCHES_RCVD autolearn=no version=3.3.2 X-HELO: stormwind.0x539.de From: Benjamin Herr To: libc-alpha@sourceware.org Subject: [PATCH v2 1/2] iconv_prog: Don't slurp whole input at once [BZ #6050] Date: Wed, 12 Aug 2015 00:27:45 +0200 Message-Id: <1439332066-13664-2-git-send-email-ben@0x539.de> In-Reply-To: <1439332066-13664-1-git-send-email-ben@0x539.de> References: <1439332066-13664-1-git-send-email-ben@0x539.de> As described in the bug report comments, contrary to the comment in 'process_fd' , iconv can deal with being passed invalid multibyte sequences, as it leaves the input pointer at the start of the sequence and we can just call it again once more bytes are availabe. Therefore we do not need to read the whole input at once when reading from an fd, and can process it in chunks instead. This enables the iconv program to be used in pipelines sensibly, and to not choke on very large input files. To still support reporting the position of invalid sequences in the input, we count consumed input bytes explicitly. As overflow of that counter is now possible, we use saturating addition and just mention in the error message when (size_t) -1 has been reached. --- 2015-08-08 Benjamin Herr [BZ #6050] * iconv/iconv_prog.c: Don't slurp the whole input at once, instead pass chunks to iconv and carry over incomplete multibyte sequences. iconv/iconv_prog.c | 247 +++++++++++++++++++++++++++++------------------------ 1 file changed, 135 insertions(+), 112 deletions(-) diff --git a/iconv/iconv_prog.c b/iconv/iconv_prog.c index e249bce..63b8e33 100644 --- a/iconv/iconv_prog.c +++ b/iconv/iconv_prog.c @@ -465,29 +465,93 @@ conversion stopped due to problem in writing the output")); return 0; } +static void +report_iconv_error (size_t position) +{ + switch (errno) + { + case EILSEQ: + if (position < (size_t) -1) + error (0, 0, + _("illegal input sequence at position %zu"), position); + else + error (0, 0, + _("illegal input sequence past position %zu"), + position - 1); + break; + case EINVAL: + error (0, 0, _("\ +incomplete character or shift sequence at end of buffer")); + break; + case EBADF: + error (0, 0, _("internal error (illegal descriptor)")); + break; + default: + error (0, 0, _("unknown iconv() error %d"), errno); + break; + } +} +#define BUF_SIZE 32768 static int -process_block (iconv_t cd, char *addr, size_t len, FILE **output, - const char *output_file) +flush_state (iconv_t cd, FILE **output, const char *output_file, size_t offset) { -#define OUTBUF_SIZE 32768 - const char *start = addr; - char outbuf[OUTBUF_SIZE]; + char outbuf[BUF_SIZE]; + char *outptr; + size_t outlen; + size_t n; + + /* All the input test is processed. For state-dependent + character sets we have to flush the state now. */ + outptr = outbuf; + outlen = BUF_SIZE; + n = iconv (cd, NULL, NULL, &outptr, &outlen); + + if (outptr != outbuf) + { + if (write_output (outbuf, outptr, output, output_file) == -1) + return -1; + } + + if (n == (size_t) -1) + { + report_iconv_error (offset); + return 1; + } + + return 0; +} + +static size_t +saturating_add (size_t a, size_t b) +{ + if ((size_t) -1 - a > b) + return a + b; + else + return -1; +} + +static int +process_part (iconv_t cd, char **addr, size_t *len, size_t offset, + FILE **output, const char *output_file) +{ + const char *start = *addr; + char outbuf[BUF_SIZE]; char *outptr; size_t outlen; size_t n; int ret = 0; - while (len > 0) + while (*len > 0) { outptr = outbuf; - outlen = OUTBUF_SIZE; - n = iconv (cd, &addr, &len, &outptr, &outlen); + outlen = BUF_SIZE; + n = iconv (cd, addr, len, &outptr, &outlen); if (n == (size_t) -1 && omit_invalid && errno == EILSEQ) { ret = 1; - if (len == 0) + if (*len == 0) n = 0; else errno = E2BIG; @@ -500,52 +564,18 @@ process_block (iconv_t cd, char *addr, size_t len, FILE **output, break; } - if (n != (size_t) -1) + /* Incomplete multibyte characters might be completed by the next + chunk, so do not treat them as an error here. */ + if (n != (size_t) -1 || errno == EINVAL) { - /* All the input test is processed. For state-dependent - character sets we have to flush the state now. */ - outptr = outbuf; - outlen = OUTBUF_SIZE; - n = iconv (cd, NULL, NULL, &outptr, &outlen); - - if (outptr != outbuf) - { - ret = write_output (outbuf, outptr, output, output_file); - if (ret != 0) - break; - } - - if (n != (size_t) -1) - break; - - if (omit_invalid && errno == EILSEQ) - { - ret = 1; - break; - } + ret = 0; + break; } if (errno != E2BIG) { /* iconv() ran into a problem. */ - switch (errno) - { - case EILSEQ: - if (! omit_invalid) - error (0, 0, _("illegal input sequence at position %ld"), - (long int) (addr - start)); - break; - case EINVAL: - error (0, 0, _("\ -incomplete character or shift sequence at end of buffer")); - break; - case EBADF: - error (0, 0, _("internal error (illegal descriptor)")); - break; - default: - error (0, 0, _("unknown iconv() error %d"), errno); - break; - } + report_iconv_error (saturating_add (offset, (*addr - start))); return -1; } @@ -554,83 +584,76 @@ incomplete character or shift sequence at end of buffer")); return ret; } - static int -process_fd (iconv_t cd, int fd, FILE **output, const char *output_file) +process_block (iconv_t cd, char *addr, size_t len, FILE **output, + const char *output_file) { - /* we have a problem with reading from a desriptor since we must not - provide the iconv() function an incomplete character or shift - sequence at the end of the buffer. Since we have to deal with - arbitrary encodings we must read the whole text in a buffer and - process it in one step. */ - static char *inbuf = NULL; - static size_t maxlen = 0; - char *inptr = NULL; - size_t actlen = 0; - - while (actlen < maxlen) - { - ssize_t n = read (fd, inptr, maxlen - actlen); + char *start = addr; - if (n == 0) - /* No more text to read. */ - break; + /* Process everything in one go. */ + int ret = process_part (cd, &addr, &len, 0, output, output_file); - if (n == -1) - { - /* Error while reading. */ - error (0, errno, _("error while reading the input")); - return -1; - } + if (ret != 0) + return ret; - inptr += n; - actlen += n; + /* If there is input left over, there is an incomplete multibyte + sequence at the end. */ + if (len > 0) + { + errno = EINVAL; + report_iconv_error (addr - start); + return -1; } - if (actlen == maxlen) - while (1) - { - ssize_t n; - char *new_inbuf; + return flush_state (cd, output, output_file, addr - start); +} - /* Increase the buffer. */ - new_inbuf = (char *) realloc (inbuf, maxlen + 32768); - if (new_inbuf == NULL) - { - error (0, errno, _("unable to allocate buffer for input")); - return -1; - } - inbuf = new_inbuf; - maxlen += 32768; - inptr = inbuf + actlen; - do - { - n = read (fd, inptr, maxlen - actlen); +static int +process_fd (iconv_t cd, int fd, FILE **output, const char *output_file) +{ + char inbuf[BUF_SIZE]; + size_t len = 0; + size_t offset = 0; + ssize_t n; - if (n == 0) - /* No more text to read. */ - break; + /* Read into the buffer past unconsumed bytes from the last iteration. */ + while ((n = read (fd, inbuf + len, BUF_SIZE - len)) > 0) + { + int ret; + char *inptr; - if (n == -1) - { - /* Error while reading. */ - error (0, errno, _("error while reading the input")); - return -1; - } + len += n; - inptr += n; - actlen += n; - } - while (actlen < maxlen); + inptr = inbuf; + /* Process what we have read. */ + ret = process_part (cd, &inptr, &len, offset, output, output_file); + if (ret != 0) + return ret; + /* Keep track of overall position in the input for error reporting. */ + offset = saturating_add (offset, inptr - inbuf); - if (n == 0) - /* Break again so we leave both loops. */ - break; - } + /* Keep incomplete multibyte characters, if any. */ + memmove (inbuf, inptr, len); + } + + if (n == -1) + { + /* Error while reading. */ + error (0, errno, _("error while reading the input")); + return -1; + } + + /* No more input available, so incomplete multibyte sequences are + not going to get completed at this point. */ + if (len > 0) + { + errno = EINVAL; + report_iconv_error (offset); + return -1; + } - /* Now we have all the input in the buffer. Process it in one run. */ - return process_block (cd, inbuf, actlen, output, output_file); + return flush_state (cd, output, output_file, offset); }