[v4,3/3] posix: regexec(): fix REG_STARTEND, pmatch->rm_so != 0 w/^ anchor
Checks
Context |
Check |
Description |
dj/TryBot-apply_patch |
success
|
Patch applied to master at the time it was sent
|
dj/TryBot-32bit |
success
|
Build for i686
|
Commit Message
re_search_internal () starts with
/* If initial states with non-begbuf contexts have no elements,
the regex must be anchored. If preg->newline_anchor is set,
we'll never use init_state_nl, so do not check it. */
if (dfa->init_state->nodes.nelem == 0
&& dfa->init_state_word->nodes.nelem == 0
&& (dfa->init_state_nl->nodes.nelem == 0
|| !preg->newline_anchor))
{
if (start != 0 && last_start != 0)
return REG_NOMATCH;
start = last_start = 0;
}
and heretofor start and last_start (for example when "abc", {1, 2},
so matching just the "b") were != 0, and the return was taken for a "^b"
regex, which is erroneous.
Fix this by giving re_search_internal (string+rm_so, start=0),
then fixing up the returned matches in an after-pass.
This brings us to compatibility with the BSD spec and implementations.
Signed-off-by: Ahelenia Ziemiańska <nabijaczleweli@nabijaczleweli.xyz>
---
posix/regexec.c | 41 ++++++++++++++++++++++++++++-------------
1 file changed, 28 insertions(+), 13 deletions(-)
Comments
Hey Paul,
Could you take a look if this change make sense? We usually tend just sync
with gnulib, and you and other gnulib developers seems way more active
working on this code than glibc community.
On 07/05/23 19:56, наб via Libc-alpha wrote:
> re_search_internal () starts with
> /* If initial states with non-begbuf contexts have no elements,
> the regex must be anchored. If preg->newline_anchor is set,
> we'll never use init_state_nl, so do not check it. */
> if (dfa->init_state->nodes.nelem == 0
> && dfa->init_state_word->nodes.nelem == 0
> && (dfa->init_state_nl->nodes.nelem == 0
> || !preg->newline_anchor))
> {
> if (start != 0 && last_start != 0)
> return REG_NOMATCH;
> start = last_start = 0;
> }
> and heretofor start and last_start (for example when "abc", {1, 2},
> so matching just the "b") were != 0, and the return was taken for a "^b"
> regex, which is erroneous.
>
> Fix this by giving re_search_internal (string+rm_so, start=0),
> then fixing up the returned matches in an after-pass.
>
> This brings us to compatibility with the BSD spec and implementations.
>
> Signed-off-by: Ahelenia Ziemiańska <nabijaczleweli@nabijaczleweli.xyz>
> ---
> posix/regexec.c | 41 ++++++++++++++++++++++++++++-------------
> 1 file changed, 28 insertions(+), 13 deletions(-)
>
> diff --git a/posix/regexec.c b/posix/regexec.c
> index bd0cd412d0..2ef868e1f6 100644
> --- a/posix/regexec.c
> +++ b/posix/regexec.c
> @@ -187,38 +187,53 @@ static reg_errcode_t extend_buffers (re_match_context_t *mctx, int min_len);
> string; if REG_NOTEOL is set, then $ does not match at the end.
>
> Return 0 if a match is found, REG_NOMATCH if not, REG_BADPAT if
> - EFLAGS is invalid. */
> + EFLAGS is invalid.
> +
> + If REG_STARTEND, the bounds are
> + [STRING + PMATCH->rm_so, STRING + PMATCH->rm_eo)
> + instead of the usual
> + [STRING, STRING + strlen(STRING)),
> + but returned matches are still referenced to STRING,
> + and matching is unaffected (i.e. "abc", {1, 2} matches regex "^b$").
> + re_search_internal () has a built-in assumption of
> + (start != 0) <=> (^ doesn't match), so give it a truncated view
> + and fix up the matches afterward. */
>
> int
> regexec (const regex_t *__restrict preg, const char *__restrict string,
> size_t nmatch, regmatch_t pmatch[_REGEX_NELTS (nmatch)], int eflags)
> {
> reg_errcode_t err;
> - Idx start, length;
> + Idx startoff = 0, length;
> re_dfa_t *dfa = preg->buffer;
> + size_t i = 0;
>
> if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
> return REG_BADPAT;
>
> if (eflags & REG_STARTEND)
> {
> - start = pmatch[0].rm_so;
> - length = pmatch[0].rm_eo;
> + startoff = pmatch[0].rm_so;
> + string += startoff;
> + length = pmatch[0].rm_eo - startoff;
> }
> else
> - {
> - start = 0;
> - length = strlen (string);
> - }
> + length = strlen (string);
>
> lock_lock (dfa->lock);
> if (preg->no_sub)
> - err = re_search_internal (preg, string, length, start, length,
> - length, 0, NULL, eflags);
> - else
> - err = re_search_internal (preg, string, length, start, length,
> - length, nmatch, pmatch, eflags);
> + nmatch = 0;
> + err = re_search_internal (preg, string, length, 0, length,
> + length, nmatch, pmatch, eflags);
> lock_unlock (dfa->lock);
> +
> + if (err == REG_NOERROR && startoff)
> + for (i = 0; i < nmatch; ++i)
> + if (pmatch[i].rm_so != -1)
> + {
> + pmatch[i].rm_so += startoff;
> + pmatch[i].rm_eo += startoff;
> + }
> return err != REG_NOERROR;
> }
>
@@ -187,38 +187,53 @@ static reg_errcode_t extend_buffers (re_match_context_t *mctx, int min_len);
string; if REG_NOTEOL is set, then $ does not match at the end.
Return 0 if a match is found, REG_NOMATCH if not, REG_BADPAT if
- EFLAGS is invalid. */
+ EFLAGS is invalid.
+
+ If REG_STARTEND, the bounds are
+ [STRING + PMATCH->rm_so, STRING + PMATCH->rm_eo)
+ instead of the usual
+ [STRING, STRING + strlen(STRING)),
+ but returned matches are still referenced to STRING,
+ and matching is unaffected (i.e. "abc", {1, 2} matches regex "^b$").
+ re_search_internal () has a built-in assumption of
+ (start != 0) <=> (^ doesn't match), so give it a truncated view
+ and fix up the matches afterward. */
int
regexec (const regex_t *__restrict preg, const char *__restrict string,
size_t nmatch, regmatch_t pmatch[_REGEX_NELTS (nmatch)], int eflags)
{
reg_errcode_t err;
- Idx start, length;
+ Idx startoff = 0, length;
re_dfa_t *dfa = preg->buffer;
+ size_t i = 0;
if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
return REG_BADPAT;
if (eflags & REG_STARTEND)
{
- start = pmatch[0].rm_so;
- length = pmatch[0].rm_eo;
+ startoff = pmatch[0].rm_so;
+ string += startoff;
+ length = pmatch[0].rm_eo - startoff;
}
else
- {
- start = 0;
- length = strlen (string);
- }
+ length = strlen (string);
lock_lock (dfa->lock);
if (preg->no_sub)
- err = re_search_internal (preg, string, length, start, length,
- length, 0, NULL, eflags);
- else
- err = re_search_internal (preg, string, length, start, length,
- length, nmatch, pmatch, eflags);
+ nmatch = 0;
+ err = re_search_internal (preg, string, length, 0, length,
+ length, nmatch, pmatch, eflags);
lock_unlock (dfa->lock);
+
+ if (err == REG_NOERROR && startoff)
+ for (i = 0; i < nmatch; ++i)
+ if (pmatch[i].rm_so != -1)
+ {
+ pmatch[i].rm_so += startoff;
+ pmatch[i].rm_eo += startoff;
+ }
return err != REG_NOERROR;
}