posix: if glob has a trailing slash match directories only.
Commit Message
If pattern has a trailing slash match directories only.
This patch fixes [BZ #22513].
+2017-11-28 Dmitry Goncharov <dgoncharov@users.sf.net>
+
+ [BZ #22513]
+ * posix/glob.c (glob_in_dir): Make glob with a trailing slash match
+ directores only.
+ * posix/globtest.sh: Add tests.
Tested on x86-64 linux.
Comments
(+cc: bug-gnulib@, since they share this code)
Dmitry Goncharov wrote:
> If pattern has a trailing slash match directories only.
> This patch fixes [BZ #22513].
>
> +2017-11-28 Dmitry Goncharov <dgoncharov@users.sf.net>
> +
> + [BZ #22513]
> + * posix/glob.c (glob_in_dir): Make glob with a trailing slash match
> + directores only.
> + * posix/globtest.sh: Add tests.
>
> Tested on x86-64 linux.
Thanks. Looks like a reasonable thing to do.
> diff --git a/posix/glob.c b/posix/glob.c
> index cb39779d07..78873d83c6 100644
> --- a/posix/glob.c
> +++ b/posix/glob.c
> @@ -1342,7 +1342,33 @@ glob_in_dir (const char *pattern, const char *directory, int flags,
> if (flags & GLOB_ONLYDIR)
> switch (readdir_result_type (d))
> {
> - case DT_DIR: case DT_LNK: case DT_UNKNOWN: break;
> + case DT_DIR: case DT_LNK: break;
> + case DT_UNKNOWN:
> + {
> + int dir;
> + size_t namlen = strlen (d.name);
> + size_t fullsize;
> + bool alloca_fullname
> + = (! size_add_wrapv (dirlen + 1, namlen + 1, &fullsize)
> + && glob_use_alloca (alloca_used, fullsize));
> + char *fullname;
> + if (alloca_fullname)
> + fullname = alloca_account (fullsize, alloca_used);
> + else
> + {
> + fullname = malloc (fullsize);
> + if (fullname == NULL)
> + return GLOB_NOSPACE;
> + }
> + mempcpy (mempcpy (mempcpy (fullname, directory, dirlen),
> + "/", 1),
> + d.name, namlen + 1);
> + dir = is_dir (fullname, flags, pglob);
> + if (__glibc_unlikely (!alloca_fullname))
> + free (fullname);
> + if (dir)
> + break;
> + }
> default: continue;
> }
>
If I understand correctly, this treats DT_UNKNOWN like DT_DIR when stat
indicates that it is a directory.
What should happen in the DT_LNK case? Should the same logic trip for
it as well so we can distinguish between a symlink to a directory and
other symlinks?
> diff --git a/posix/globtest.sh b/posix/globtest.sh
> index 73f7ae31cc..4f0c03715a 100755
> --- a/posix/globtest.sh
> +++ b/posix/globtest.sh
> @@ -43,13 +43,19 @@ export LC_ALL
>
> # Create the arena
> testdir=${common_objpfx}posix/globtest-dir
> +testdir2=${common_objpfx}posix/globtest-dir2
> testout=${common_objpfx}posix/globtest-out
> rm -rf $testdir $testout
> mkdir $testdir
> +mkdir $testdir2
> +mkdir $testdir2/hello1d
> +mkdir $testdir2/hello2d
> +echo 1 > $testdir2/hello1f
> +echo 2 > $testdir2/hello2f
>
> cleanup() {
> chmod 777 $testdir/noread
> - rm -fr $testdir $testout
> + rm -fr $testdir $testdir2 $testout
> }
>
> trap cleanup 0 HUP INT QUIT TERM
> @@ -815,6 +821,41 @@ if test $failed -ne 0; then
> result=1
> fi
>
> +# Test that if the specified glob ends with a slash then only directories are
> +# matched.
> +# First run with the glob that has no slash to demonstrate presence of a slash
> +# makes a difference.
> +failed=0
> +${test_program_prefix} \
> +${common_objpfx}posix/globtest "$testdir2" "hello*" |
> +sort > $testout
> +cat <<"EOF" | $CMP - $testout >> $logfile || failed=1
> +`hello1d'
> +`hello1f'
> +`hello2d'
> +`hello2f'
> +EOF
> +
> +if test $failed -ne 0; then
> + echo "pattern+meta test failed" >> $logfile
> + result=1
> +fi
> +
> +# The same pattern+meta test with a slash this time.
> +failed=0
> +${test_program_prefix} \
> +${common_objpfx}posix/globtest "$testdir2" "hello*/" |
> +sort > $testout
> +cat <<"EOF" | $CMP - $testout >> $logfile || failed=1
> +`hello1d/'
> +`hello2d/'
> +EOF
> +
> +if test $failed -ne 0; then
> + echo "pattern+meta+slash test failed" >> $logfile
> + result=1
> +fi
> +
> if test $result -eq 0; then
> echo "All OK." > $logfile
> fi
Thanks for the test. It does a good job of motivating the change.
Thanks and hope that helps,
Jonathan
On 11/28/2017 01:08 PM, Dmitry Goncharov wrote:
> If pattern has a trailing slash match directories only.
> This patch fixes [BZ #22513].
I do not observe the bug on Fedora 27, which uses glibc 2.26-16, so
presumably this bug was introduced in the recent merges from Gnulib.
However, I not observe the bug with Gnulib master lib/glob.c either.
Could you explain what introduced the bug, e.g., by isolating the Glibc
commit that introduced it? Perhaps we botched the merge, and if so we
should fix the botch rather than work around the bug with some extra code.
On Tue, Nov 28, 2017 at 05:58:12PM -0800, Paul Eggert wrote:
> On 11/28/2017 01:08 PM, Dmitry Goncharov wrote:
> > If pattern has a trailing slash match directories only.
> > This patch fixes [BZ #22513].
>
> I do not observe the bug on Fedora 27, which uses glibc 2.26-16, so
> presumably this bug was introduced in the recent merges from Gnulib.
> However, I not observe the bug with Gnulib master lib/glob.c either.
> Could you explain what introduced the bug, e.g., by isolating the Glibc
> commit that introduced it? Perhaps we botched the merge, and if so we
> should fix the botch rather than work around the bug with some extra code.
>
commit 1cab5444231a4a1fab9c3abb107d22af4eb09327 (tag: cvs/libc-ud-971031)
Author: Ulrich Drepper <drepper@redhat.com>
Date: Fri Oct 31 22:55:02 1997 +0000
introduced the following piece of code among others
+#ifdef HAVE_D_TYPE
+ /* If we shall match only directories use the information
+ provided by the dirent if possible. */
+ if ((flags & GLOB_ONLYDIR)
+ && d->d_type != DT_UNKNOWN && d->d_type != DT_DIR)
+ continue;
+#endif
+
if (fnmatch (pattern, name,
(!(flags & GLOB_PERIOD) ? FNM_PERIOD : 0) |
((flags & GLOB_NOESCAPE) ? FNM_NOESCAPE : 0)
As long as your filesystem initializes d->d_type you won't see the issue.
When d->d_type is DT_UNKNOWN then d may not be a directory.
regards, Dmitry
Dmitry Goncharov wrote:
> Author: Ulrich Drepper<drepper@redhat.com>
> Date: Fri Oct 31 22:55:02 1997 +0000
Ah, so this is a longstanding bug in glob, and was not introduced by the recent
merge from Gnulib. That's a relief.
> As long as your filesystem initializes d->d_type you won't see the issue.
> When d->d_type is DT_UNKNOWN then d may not be a directory.
Thanks for clarifying. Now that I understand it better, though, I still see a
problem. As noted in the Glibc manual here:
https://www.gnu.org/software/libc/manual/html_node/More-Flags-for-Globbing.html#index-GLOB_005fONLYDIR
GLOB_ONLYDIR is merely an efficiency flag: it means "do not return entries that
can easily be shown to be non-directories". Your patch would change the
semantics of GLOB_ONLYDIR so that it means "return only entries that are known
to be directories", which differs from the Glibc documentation.
The bug here is not in the implementation of GLOB_ONLYDIR; it is in the part of
glob that calls itself recursively with GLOB_ONLYDIR, and which expects only
directories (or symlinks to directories) to be matched.
This issue came up in September, here:
https://sourceware.org/ml/libc-alpha/2017-09/msg00888.html
On Wed, Nov 29, 2017 at 1:35 AM, Paul Eggert <eggert@cs.ucla.edu> wrote:
>>
>> Thanks for clarifying. Now that I understand it better, though, I still see a problem. As noted in the Glibc manual here:
>>
>> https://www.gnu.org/software/libc/manual/html_node/More-Flags-for-Globbing.html#index-GLOB_005fONLYDIR
>>
>> GLOB_ONLYDIR is merely an efficiency flag: it means "do not return entries that can easily be shown to be non-directories". Your patch would change the semantics of GLOB_ONLYDIR so that it means "return only entries that are known to be directories", which differs from the Glibc documentation.
>>
>>
>>
> You are right. There is indeed this semantics.
The patch is compatible with "do not return entries that can easily be
shown to be non-directories".
One option is we can decide the patch is compatible and apply it.
If the existing semantics has to be preserved then we can introduce
another flag and use the new flag instead of GLOB_ONLYDIR in
if (pattern[0] && pattern[strlen (pattern) - 1] == '/')
flags |= GLOB_ONLYDIR;
and everywhere else where required.
regards, Dmitry
On Tue, Nov 28, 2017 at 5:04 PM, Jonathan Nieder <jrnieder@gmail.com> wrote:
> (+cc: bug-gnulib@, since they share this code)
> If I understand correctly, this treats DT_UNKNOWN like DT_DIR when stat
> indicates that it is a directory.
The patch checks if an entry is really a dir if type is DT_UNKNOWN.
>
> What should happen in the DT_LNK case? Should the same logic trip for
> it as well so we can distinguish between a symlink to a directory and
> other symlinks?
You are right. i'll add a test and post another patch.
regards, Dmitry
On 11/29/2017 01:50 PM, Dmitry Goncharov wrote:
> One option is we can decide the patch is compatible and apply it.
It's compatible only if we ignore performance. But I don't think that
would be right: I expect that some callers expect GLOB_ONLYDIR to be
fast (because that's what the Glibc documentation says), and that they
will be disappointed by this performance regression.
> If the existing semantics has to be preserved then we can introduce
> another flag and use the new flag instead of GLOB_ONLYDIR in
>
> if (pattern[0] && pattern[strlen (pattern) - 1] == '/')
> flags |= GLOB_ONLYDIR;
That would be a better approach. Another possibility might be to change
the last assignment to:
flags |= GLOB_ONLYDIR | GLOB_MARK;
and then at the end, filter out all matches that aren't marked with
trailing '/'. This would avoid creating a new GLOB_XXX option and would
probably be easier to implement.
On Wed, Nov 29, 2017 at 03:01:46PM -0800, Paul Eggert wrote:
> On 11/29/2017 01:50 PM, Dmitry Goncharov wrote:
> > One option is we can decide the patch is compatible and apply it.
>
> It's compatible only if we ignore performance. But I don't think that
> would be right: I expect that some callers expect GLOB_ONLYDIR to be
> fast (because that's what the Glibc documentation says), and that they
> will be disappointed by this performance regression.
Paul, i am missing where you see the performance hit.
The current contract makes the caller check after the library by calling stat
on each entry. A stricter contract like
"if GLOB_ONLDIR is set then glob returns only directories"
is more efficient on the filesystems which set the type.
On the filesystems which don't set the type stat'd be called once for each
entry by the library rather than by the caller which keeps the performance
intact.
>
> > If the existing semantics has to be preserved then we can introduce
> > another flag and use the new flag instead of GLOB_ONLYDIR in
> >
> > if (pattern[0] && pattern[strlen (pattern) - 1] == '/')
> > flags |= GLOB_ONLYDIR;
>
> That would be a better approach. Another possibility might be to change
> the last assignment to:
>
> flags |= GLOB_ONLYDIR | GLOB_MARK;
>
> and then at the end, filter out all matches that aren't marked with
> trailing '/'. This would avoid creating a new GLOB_XXX option and would
> probably be easier to implement.
Thansk for the pointer. Will have a look.
regards, Dmitry
Dmitry Goncharov wrote:
> Paul, i am missing where you see the performance hit.
> The current contract makes the caller check after the library by calling stat
> on each entry.
No, these callers need not call stat. glob itself doesn't call stat, when it
calls itself recursively with this flag. Instead, glob simply charges ahead and
gets an ENOTDIR (or whatever( in the next level of recursion. For this sort of
usage, it is a win for glob to filter out directories if that is easy (because
there is then no need even to issue the syscalls that result in ENOTDIR), but it
can be a loss for glob to filter out directories via stat calls.
@@ -1342,7 +1342,33 @@ glob_in_dir (const char *pattern, const char *directory, int flags,
if (flags & GLOB_ONLYDIR)
switch (readdir_result_type (d))
{
- case DT_DIR: case DT_LNK: case DT_UNKNOWN: break;
+ case DT_DIR: case DT_LNK: break;
+ case DT_UNKNOWN:
+ {
+ int dir;
+ size_t namlen = strlen (d.name);
+ size_t fullsize;
+ bool alloca_fullname
+ = (! size_add_wrapv (dirlen + 1, namlen + 1, &fullsize)
+ && glob_use_alloca (alloca_used, fullsize));
+ char *fullname;
+ if (alloca_fullname)
+ fullname = alloca_account (fullsize, alloca_used);
+ else
+ {
+ fullname = malloc (fullsize);
+ if (fullname == NULL)
+ return GLOB_NOSPACE;
+ }
+ mempcpy (mempcpy (mempcpy (fullname, directory, dirlen),
+ "/", 1),
+ d.name, namlen + 1);
+ dir = is_dir (fullname, flags, pglob);
+ if (__glibc_unlikely (!alloca_fullname))
+ free (fullname);
+ if (dir)
+ break;
+ }
default: continue;
}
@@ -43,13 +43,19 @@ export LC_ALL
# Create the arena
testdir=${common_objpfx}posix/globtest-dir
+testdir2=${common_objpfx}posix/globtest-dir2
testout=${common_objpfx}posix/globtest-out
rm -rf $testdir $testout
mkdir $testdir
+mkdir $testdir2
+mkdir $testdir2/hello1d
+mkdir $testdir2/hello2d
+echo 1 > $testdir2/hello1f
+echo 2 > $testdir2/hello2f
cleanup() {
chmod 777 $testdir/noread
- rm -fr $testdir $testout
+ rm -fr $testdir $testdir2 $testout
}
trap cleanup 0 HUP INT QUIT TERM
@@ -815,6 +821,41 @@ if test $failed -ne 0; then
result=1
fi
+# Test that if the specified glob ends with a slash then only directories are
+# matched.
+# First run with the glob that has no slash to demonstrate presence of a slash
+# makes a difference.
+failed=0
+${test_program_prefix} \
+${common_objpfx}posix/globtest "$testdir2" "hello*" |
+sort > $testout
+cat <<"EOF" | $CMP - $testout >> $logfile || failed=1
+`hello1d'
+`hello1f'
+`hello2d'
+`hello2f'
+EOF
+
+if test $failed -ne 0; then
+ echo "pattern+meta test failed" >> $logfile
+ result=1
+fi
+
+# The same pattern+meta test with a slash this time.
+failed=0
+${test_program_prefix} \
+${common_objpfx}posix/globtest "$testdir2" "hello*/" |
+sort > $testout
+cat <<"EOF" | $CMP - $testout >> $logfile || failed=1
+`hello1d/'
+`hello2d/'
+EOF
+
+if test $failed -ne 0; then
+ echo "pattern+meta+slash test failed" >> $logfile
+ result=1
+fi
+
if test $result -eq 0; then
echo "All OK." > $logfile
fi