stdio: fix vfscanf with matches longer than INT_MAX (bug 27650)

Message ID 20210325140102.31043-1-hi@alyssa.is
State Committed
Commit b03e4d7bd25b1ac485f858f0a857ba6085e8c9b0
Delegated to: Florian Weimer
Headers
Series stdio: fix vfscanf with matches longer than INT_MAX (bug 27650) |

Commit Message

Alyssa Ross March 25, 2021, 2:01 p.m. UTC
  Patterns like %*[ can safely be used to match a great many characters,
and it's quite realisitic to use them for more than INT_MAX characters
from an IO stream.

With the previous approach, after INT_MAX characters (v)fscanf would
return successfully, indicating an end to the match, even though there
wasn't one.
---

I have not done a copyright assignment yet, but I think this change
should be small enough to be exempt?

 stdio-common/vfscanf-internal.c | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)
  

Comments

Florian Weimer March 25, 2021, 5:25 p.m. UTC | #1
* Alyssa Ross:

> I have not done a copyright assignment yet, but I think this change
> should be small enough to be exempt?

Yes, I think it's small enough.

The test case wouldn't be, though.  I think the one on the bug needs
some large (infinite) input on the stdin, though.  A real test case
for glibc should probably involve pipe, fork, and fdopen.  fopencookie
could work, too.

>  stdio-common/vfscanf-internal.c | 13 ++++---------
>  1 file changed, 4 insertions(+), 9 deletions(-)
>
> diff --git a/stdio-common/vfscanf-internal.c b/stdio-common/vfscanf-internal.c
> index 38e74776a5..1d81e16f4e 100644
> --- a/stdio-common/vfscanf-internal.c
> +++ b/stdio-common/vfscanf-internal.c
> @@ -2479,11 +2479,6 @@ __vfscanf_internal (FILE *s, const char *format, va_list argptr,
>  	  else
>  	    not_in = 0;
>  
> -	  if (width < 0)
> -	    /* There is no width given so there is also no limit on the
> -	       number of characters we read.  Therefore we set width to
> -	       a very high value to make the algorithm easier.  */
> -	    width = INT_MAX;
>  
>  #ifdef COMPILE_WSCANF
>  	  /* Find the beginning and the end of the scanlist.  We are not
> @@ -2647,7 +2642,7 @@ __vfscanf_internal (FILE *s, const char *format, va_list argptr,
>  			}
>  		    }
>  		}
> -	      while (--width > 0 && inchar () != WEOF);
> +	      while ((width < 0 || --width > 0) && inchar () != WEOF);
>  	    out:
>  #else
>  	      char buf[MB_LEN_MAX];
> @@ -2732,7 +2727,7 @@ __vfscanf_internal (FILE *s, const char *format, va_list argptr,
>  			}
>  		    }
>  
> -		  if (--width <= 0)
> +		  if (width >= 0 && --width <= 0)
>  		    break;
>  		}
>  	      while (inchar () != EOF);
> @@ -2884,7 +2879,7 @@ __vfscanf_internal (FILE *s, const char *format, va_list argptr,
>  		  assert (n <= MB_LEN_MAX);
>  		  str += n;
>  		}
> -	      while (--width > 0 && inchar () != WEOF);
> +	      while ((width < 0 || --width > 0) && inchar () != WEOF);
>  	    out2:
>  #else
>  	      do
> @@ -2938,7 +2933,7 @@ __vfscanf_internal (FILE *s, const char *format, va_list argptr,
>  			}
>  		    }
>  		}
> -	      while (--width > 0 && inchar () != EOF);
> +	      while ((width < 0 || --width > 0) && inchar () != EOF);
>  #endif
>  
>  	      if (__glibc_unlikely (now == read_in))

So I tried to review this.  -U100 helped.  I was worried about width
starting out as positive and going negative.  But as far as I can
tell, processing stops once width == 0, so the issue cannot happen.

Do you want to work on the test case?  Will the copyright assignment
be an obstacle?
  
Alyssa Ross March 25, 2021, 8:28 p.m. UTC | #2
>> I have not done a copyright assignment yet, but I think this change
>> should be small enough to be exempt?
>
> Yes, I think it's small enough.
>
> The test case wouldn't be, though.  I think the one on the bug needs
> some large (infinite) input on the stdin, though.  A real test case
> for glibc should probably involve pipe, fork, and fdopen.  fopencookie
> could work, too.

Oh, thanks for telling me about fopencookie!  I'd never have known about
that otherwise.  I've started having a go at a test case using it and it
seems like it'll work well.

>>  stdio-common/vfscanf-internal.c | 13 ++++---------
>>  1 file changed, 4 insertions(+), 9 deletions(-)
>>
>> diff --git a/stdio-common/vfscanf-internal.c b/stdio-common/vfscanf-internal.c
>> index 38e74776a5..1d81e16f4e 100644
>> --- a/stdio-common/vfscanf-internal.c
>> +++ b/stdio-common/vfscanf-internal.c
>> @@ -2479,11 +2479,6 @@ __vfscanf_internal (FILE *s, const char *format, va_list argptr,
>>  	  else
>>  	    not_in = 0;
>>  
>> -	  if (width < 0)
>> -	    /* There is no width given so there is also no limit on the
>> -	       number of characters we read.  Therefore we set width to
>> -	       a very high value to make the algorithm easier.  */
>> -	    width = INT_MAX;
>>  
>>  #ifdef COMPILE_WSCANF
>>  	  /* Find the beginning and the end of the scanlist.  We are not
>> @@ -2647,7 +2642,7 @@ __vfscanf_internal (FILE *s, const char *format, va_list argptr,
>>  			}
>>  		    }
>>  		}
>> -	      while (--width > 0 && inchar () != WEOF);
>> +	      while ((width < 0 || --width > 0) && inchar () != WEOF);
>>  	    out:
>>  #else
>>  	      char buf[MB_LEN_MAX];
>> @@ -2732,7 +2727,7 @@ __vfscanf_internal (FILE *s, const char *format, va_list argptr,
>>  			}
>>  		    }
>>  
>> -		  if (--width <= 0)
>> +		  if (width >= 0 && --width <= 0)
>>  		    break;
>>  		}
>>  	      while (inchar () != EOF);
>> @@ -2884,7 +2879,7 @@ __vfscanf_internal (FILE *s, const char *format, va_list argptr,
>>  		  assert (n <= MB_LEN_MAX);
>>  		  str += n;
>>  		}
>> -	      while (--width > 0 && inchar () != WEOF);
>> +	      while ((width < 0 || --width > 0) && inchar () != WEOF);
>>  	    out2:
>>  #else
>>  	      do
>> @@ -2938,7 +2933,7 @@ __vfscanf_internal (FILE *s, const char *format, va_list argptr,
>>  			}
>>  		    }
>>  		}
>> -	      while (--width > 0 && inchar () != EOF);
>> +	      while ((width < 0 || --width > 0) && inchar () != EOF);
>>  #endif
>>  
>>  	      if (__glibc_unlikely (now == read_in))
>
> So I tried to review this.  -U100 helped.  I was worried about width
> starting out as positive and going negative.  But as far as I can
> tell, processing stops once width == 0, so the issue cannot happen.

That's my understanding too.

> Do you want to work on the test case?  Will the copyright assignment
> be an obstacle?

I'm happy to have a go at a test case.  I think I have most of one
already.  Copyright assignment will only be a problem in that it'll slow
things down a bit!  I just sent one in for Emacs and I've asked about
doing one for glibc as well.

One question about the test: fscanf-ing through INT_MAX characters on a
trivial memcpy-based fopencookie stream takes 20 seconds on my
(admittedly fairly old) machine.  How slow is too slow for a test?
  
Florian Weimer March 25, 2021, 9:24 p.m. UTC | #3
* Alyssa Ross:

>>> I have not done a copyright assignment yet, but I think this change
>>> should be small enough to be exempt?
>>
>> Yes, I think it's small enough.
>>
>> The test case wouldn't be, though.  I think the one on the bug needs
>> some large (infinite) input on the stdin, though.  A real test case
>> for glibc should probably involve pipe, fork, and fdopen.  fopencookie
>> could work, too.
>
> Oh, thanks for telling me about fopencookie!  I'd never have known about
> that otherwise.  I've started having a go at a test case using it and it
> seems like it'll work well.

Or you could test sscanf via <support/blob_repeat.h> on 64-bit
architectures.  It would avoid the repeated memcpy calls, at the cost
of an initial strlen on the entire buffer.

> One question about the test: fscanf-ing through INT_MAX characters on a
> trivial memcpy-based fopencookie stream takes 20 seconds on my
> (admittedly fairly old) machine.  How slow is too slow for a test?

Opinions on that vary.  Twenty seconds is stretching things as far as
I'm concerned.  I guess it depends what you mean by “fairly old”.

We have a second category of tests, xtests, that only run with “make
xcheck”.  We could put the test there if it takes too long to run
otherwise.
  
Alyssa Ross March 26, 2021, noon UTC | #4
Florian Weimer <fw@deneb.enyo.de> writes:

>> Oh, thanks for telling me about fopencookie!  I'd never have known about
>> that otherwise.  I've started having a go at a test case using it and it
>> seems like it'll work well.
>
> Or you could test sscanf via <support/blob_repeat.h> on 64-bit
> architectures.  It would avoid the repeated memcpy calls, at the cost
> of an initial strlen on the entire buffer.

Would that be a problem on 32-bit?  We'd only need to map
INT_MAX bytes + 1 page, so the other half of the address space would be
enough for everything else, wouldn't it?

>> One question about the test: fscanf-ing through INT_MAX characters on a
>> trivial memcpy-based fopencookie stream takes 20 seconds on my
>> (admittedly fairly old) machine.  How slow is too slow for a test?
>
> Opinions on that vary.  Twenty seconds is stretching things as far as
> I'm concerned.  I guess it depends what you mean by “fairly old”.
>
> We have a second category of tests, xtests, that only run with “make
> xcheck”.  We could put the test there if it takes too long to run
> otherwise.

That machine is a laptop from 2012.  On my other laptop, from 2017, it
takes <5 seconds, and the fopencookie and blob_repeat versions aren't
really distinguishable from each other in terms of time taken.  I would
like to go with the blob_repeat version if it'll work everywhere,
though, because not having to write a custom stream implementation makes
the test a lot shorter and easier to understand.
  
Florian Weimer March 26, 2021, 12:17 p.m. UTC | #5
* Alyssa Ross:

> Florian Weimer <fw@deneb.enyo.de> writes:
>
>>> Oh, thanks for telling me about fopencookie!  I'd never have known about
>>> that otherwise.  I've started having a go at a test case using it and it
>>> seems like it'll work well.
>>
>> Or you could test sscanf via <support/blob_repeat.h> on 64-bit
>> architectures.  It would avoid the repeated memcpy calls, at the cost
>> of an initial strlen on the entire buffer.
>
> Would that be a problem on 32-bit?  We'd only need to map
> INT_MAX bytes + 1 page, so the other half of the address space would be
> enough for everything else, wouldn't it?

True, it would work on most 32-bit targets, particularly when running
on 64-bit kernels, where more address space is available to userspace.
s390 (without the x) has just 31-bit addresses, so it won't be able to
run the test, but that's fine, given that it's an architecture-independent
issue.

(blob_repeat sets up page aliasing behind the scenes, so the actual
memory requirements are quite limited, it's just address space that
could be problematic.)

>>> One question about the test: fscanf-ing through INT_MAX characters on a
>>> trivial memcpy-based fopencookie stream takes 20 seconds on my
>>> (admittedly fairly old) machine.  How slow is too slow for a test?
>>
>> Opinions on that vary.  Twenty seconds is stretching things as far as
>> I'm concerned.  I guess it depends what you mean by “fairly old”.
>>
>> We have a second category of tests, xtests, that only run with “make
>> xcheck”.  We could put the test there if it takes too long to run
>> otherwise.
>
> That machine is a laptop from 2012.  On my other laptop, from 2017, it
> takes <5 seconds, and the fopencookie and blob_repeat versions aren't
> really distinguishable from each other in terms of time taken.  I would
> like to go with the blob_repeat version if it'll work everywhere,
> though, because not having to write a custom stream implementation makes
> the test a lot shorter and easier to understand.

Sounds good to me.  Less than five seconds is fine, I think.
  
Alyssa Ross March 29, 2021, 12:01 p.m. UTC | #6
On Thu, Mar 25, 2021 at 06:25:17PM +0100, Florian Weimer wrote:
> Do you want to work on the test case?  Will the copyright assignment
> be an obstacle?

Okay, I've written a test case (thanks for all your help).  Do I need to
seperately test swscanf and friends, since their implementation is
seperate?  And should I post the revised patch with the test case as
soon as it's ready, or should I wait until I have my copyright paperwork
sorted?
  
Florian Weimer March 29, 2021, 1:34 p.m. UTC | #7
* Alyssa Ross:

> On Thu, Mar 25, 2021 at 06:25:17PM +0100, Florian Weimer wrote:
>> Do you want to work on the test case?  Will the copyright assignment
>> be an obstacle?
>
> Okay, I've written a test case (thanks for all your help).  Do I need to
> seperately test swscanf and friends, since their implementation is
> seperate?  And should I post the revised patch with the test case as
> soon as it's ready, or should I wait until I have my copyright paperwork
> sorted?

I think we can apply the fix now and the test case later.  You can
post the test case now, we can mark it as FSF Assignment Missing in
Patchwork.
  
Florian Weimer May 3, 2021, 8:57 a.m. UTC | #8
* Alyssa Ross:

> Patterns like %*[ can safely be used to match a great many characters,
> and it's quite realisitic to use them for more than INT_MAX characters
> from an IO stream.
>
> With the previous approach, after INT_MAX characters (v)fscanf would
> return successfully, indicating an end to the match, even though there
> wasn't one.

I've pushed to this commit, thanks.

We can integrate the test once your copyright assignment is on file.

Thanks,
Florian
  
Alyssa Ross May 9, 2021, 4:32 p.m. UTC | #9
On Mon, May 03, 2021 at 10:57:12AM +0200, Florian Weimer wrote:
> * Alyssa Ross:
>
> > Patterns like %*[ can safely be used to match a great many characters,
> > and it's quite realisitic to use them for more than INT_MAX characters
> > from an IO stream.
> >
> > With the previous approach, after INT_MAX characters (v)fscanf would
> > return successfully, indicating an end to the match, even though there
> > wasn't one.
>
> I've pushed to this commit, thanks.
>
> We can integrate the test once your copyright assignment is on file.

My copyright assignment should be sorted now (as of May 6).
  

Patch

diff --git a/stdio-common/vfscanf-internal.c b/stdio-common/vfscanf-internal.c
index 38e74776a5..1d81e16f4e 100644
--- a/stdio-common/vfscanf-internal.c
+++ b/stdio-common/vfscanf-internal.c
@@ -2479,11 +2479,6 @@  __vfscanf_internal (FILE *s, const char *format, va_list argptr,
 	  else
 	    not_in = 0;
 
-	  if (width < 0)
-	    /* There is no width given so there is also no limit on the
-	       number of characters we read.  Therefore we set width to
-	       a very high value to make the algorithm easier.  */
-	    width = INT_MAX;
 
 #ifdef COMPILE_WSCANF
 	  /* Find the beginning and the end of the scanlist.  We are not
@@ -2647,7 +2642,7 @@  __vfscanf_internal (FILE *s, const char *format, va_list argptr,
 			}
 		    }
 		}
-	      while (--width > 0 && inchar () != WEOF);
+	      while ((width < 0 || --width > 0) && inchar () != WEOF);
 	    out:
 #else
 	      char buf[MB_LEN_MAX];
@@ -2732,7 +2727,7 @@  __vfscanf_internal (FILE *s, const char *format, va_list argptr,
 			}
 		    }
 
-		  if (--width <= 0)
+		  if (width >= 0 && --width <= 0)
 		    break;
 		}
 	      while (inchar () != EOF);
@@ -2884,7 +2879,7 @@  __vfscanf_internal (FILE *s, const char *format, va_list argptr,
 		  assert (n <= MB_LEN_MAX);
 		  str += n;
 		}
-	      while (--width > 0 && inchar () != WEOF);
+	      while ((width < 0 || --width > 0) && inchar () != WEOF);
 	    out2:
 #else
 	      do
@@ -2938,7 +2933,7 @@  __vfscanf_internal (FILE *s, const char *format, va_list argptr,
 			}
 		    }
 		}
-	      while (--width > 0 && inchar () != EOF);
+	      while ((width < 0 || --width > 0) && inchar () != EOF);
 #endif
 
 	      if (__glibc_unlikely (now == read_in))