From patchwork Wed May 13 12:49:45 2015 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Ondrej Bilka X-Patchwork-Id: 6694 Received: (qmail 101989 invoked by alias); 13 May 2015 12:50:13 -0000 Mailing-List: contact libc-alpha-help@sourceware.org; run by ezmlm Precedence: bulk List-Id: List-Unsubscribe: List-Subscribe: List-Archive: List-Post: List-Help: , Sender: libc-alpha-owner@sourceware.org Delivered-To: mailing list libc-alpha@sourceware.org Received: (qmail 101979 invoked by uid 89); 13 May 2015 12:50:12 -0000 Authentication-Results: sourceware.org; auth=none X-Virus-Found: No X-Spam-SWARE-Status: No, score=0.7 required=5.0 tests=AWL, BAYES_50, FREEMAIL_FROM, SPF_NEUTRAL autolearn=no version=3.3.2 X-HELO: popelka.ms.mff.cuni.cz Date: Wed, 13 May 2015 14:49:45 +0200 From: =?utf-8?B?T25kxZllaiBCw61sa2E=?= To: libc-alpha@sourceware.org Subject: [RFC] Improve wordexp performance. Message-ID: <20150513124945.GA4067@domone> MIME-Version: 1.0 Content-Disposition: inline User-Agent: Mutt/1.5.20 (2009-06-14) Hi, as I read Paul's wordexp patch I found that you use inefficient idiom. Checking character membership with strchr is slow due to startup cost. Much better is just use table lookup. This patch does just that. You could generalize this to strchr in bits/string2.h with constant haystack but code would be ugly It is related to improvement on my todo list namely precompute strpbrk skip table which is done unless you have sse42 thats faster than lookup table approach. * posix/wordexp.c: Precompute character tables. diff --git a/posix/wordexp.c b/posix/wordexp.c index e711d43..8939ca9 100644 --- a/posix/wordexp.c +++ b/posix/wordexp.c @@ -45,6 +45,35 @@ #include #include <_itoa.h> + +static int +in_charset (char c, uint32_t *charset) +{ + return charset[c / 32] & (1 << (c % 32)); +} + +static void +make_charset (char *a, uint32_t *b) +{ + int i; + for (i = 0; i < 8; i++) + b[i] = 0; + while (*a) + { + b[*a / 32] |= 1 << (*a % 32); + a++; + } +} + +struct ifss { + char *ifs; + char *ifs_white; + uint32_t ifs_table[8]; + uint32_t ifs_white_table[8]; +}; + +struct ifss nulifs; + /* Undefine the following line for the production version. */ /* #define NDEBUG 1 */ #include @@ -63,18 +92,16 @@ extern char **__libc_argv attribute_hidden; /* Some forward declarations */ static int parse_dollars (char **word, size_t *word_length, size_t *max_length, const char *words, size_t *offset, int flags, - wordexp_t *pwordexp, const char *ifs, - const char *ifs_white, int quoted) + wordexp_t *pwordexp, struct ifss *ifsdata, int quoted) internal_function; static int parse_backtick (char **word, size_t *word_length, size_t *max_length, const char *words, size_t *offset, int flags, wordexp_t *pwordexp, - const char *ifs, const char *ifs_white) + struct ifss *ifsdata) internal_function; static int parse_dquote (char **word, size_t *word_length, size_t *max_length, const char *words, size_t *offset, int flags, - wordexp_t *pwordexp, const char *ifs, - const char *ifs_white) + wordexp_t *pwordexp, struct ifss *ifsdata) internal_function; static int eval_expr (char *expr, long int *result) internal_function; @@ -381,8 +408,7 @@ parse_tilde (char **word, size_t *word_length, size_t *max_length, static int internal_function do_parse_glob (const char *glob_word, char **word, size_t *word_length, - size_t *max_length, wordexp_t *pwordexp, const char *ifs, - const char *ifs_white) + size_t *max_length, wordexp_t *pwordexp, struct ifss *ifsdata) { int error; unsigned int match; @@ -397,7 +423,7 @@ do_parse_glob (const char *glob_word, char **word, size_t *word_length, return WRDE_NOSPACE; } - if (ifs && !*ifs) + if (ifsdata->ifs && !*(ifsdata->ifs)) { /* No field splitting allowed. */ assert (globbuf.gl_pathv[0] != NULL); @@ -414,7 +440,7 @@ do_parse_glob (const char *glob_word, char **word, size_t *word_length, return *word ? 0 : WRDE_NOSPACE; } - assert (ifs == NULL || *ifs != '\0'); + assert (ifsdata->ifs == NULL || *(ifsdata->ifs) != '\0'); if (*word != NULL) { free (*word); @@ -439,7 +465,7 @@ static int internal_function parse_glob (char **word, size_t *word_length, size_t *max_length, const char *words, size_t *offset, int flags, - wordexp_t *pwordexp, const char *ifs, const char *ifs_white) + wordexp_t *pwordexp, struct ifss *ifsdata) { /* We are poised just after a '*', a '[' or a '?'. */ int error = WRDE_NOSPACE; @@ -452,7 +478,7 @@ parse_glob (char **word, size_t *word_length, size_t *max_length, glob_list.we_offs = 0; for (; words[*offset] != '\0'; ++*offset) { - if (strchr (ifs, words[*offset]) != NULL) + if (in_charset (words[*offset], ifsdata->ifs_table)) /* Reached IFS */ break; @@ -488,7 +514,7 @@ parse_glob (char **word, size_t *word_length, size_t *max_length, if (quoted != 1 && words[*offset] == '$') { error = parse_dollars (word, word_length, max_length, words, - offset, flags, &glob_list, ifs, ifs_white, + offset, flags, &glob_list, ifsdata, quoted == 2); if (error) goto tidy_up; @@ -523,7 +549,7 @@ parse_glob (char **word, size_t *word_length, size_t *max_length, *word = w_newword (word_length, max_length); for (i = 0; error == 0 && i < glob_list.we_wordc; i++) error = do_parse_glob (glob_list.we_wordv[i], word, word_length, - max_length, pwordexp, ifs, ifs_white); + max_length, pwordexp, ifsdata); /* Now tidy up */ tidy_up: @@ -685,7 +711,7 @@ parse_arith (char **word, size_t *word_length, size_t *max_length, { case '$': error = parse_dollars (&expr, &expr_length, &expr_maxlen, - words, offset, flags, NULL, NULL, NULL, 1); + words, offset, flags, NULL, &nulifs, 1); /* The ``1'' here is to tell parse_dollars not to * split the fields. */ @@ -699,7 +725,7 @@ parse_arith (char **word, size_t *word_length, size_t *max_length, case '`': (*offset)++; error = parse_backtick (&expr, &expr_length, &expr_maxlen, - words, offset, flags, NULL, NULL, NULL); + words, offset, flags, NULL, &nulifs); /* The first NULL here is to tell parse_backtick not to * split the fields. */ @@ -884,8 +910,7 @@ exec_comm_child (char *comm, int *fildes, int showerr, int noexec) static int internal_function exec_comm (char *comm, char **word, size_t *word_length, size_t *max_length, - int flags, wordexp_t *pwordexp, const char *ifs, - const char *ifs_white) + int flags, wordexp_t *pwordexp, struct ifss *ifsdata) { int fildes[2]; #define bufsize 128 @@ -1010,10 +1035,10 @@ exec_comm (char *comm, char **word, size_t *word_length, size_t *max_length, for (i = 0; i < buflen; ++i) { - if (strchr (ifs, buffer[i]) != NULL) + if (in_charset (buffer[i], ifsdata->ifs_table)) { /* Current character is IFS */ - if (strchr (ifs_white, buffer[i]) == NULL) + if (!in_charset (buffer[i], ifsdata->ifs_white_table)) { /* Current character is IFS but not whitespace */ if (copying == 2) @@ -1142,7 +1167,7 @@ static int internal_function parse_comm (char **word, size_t *word_length, size_t *max_length, const char *words, size_t *offset, int flags, wordexp_t *pwordexp, - const char *ifs, const char *ifs_white) + struct ifss *ifsdata) { /* We are poised just after "$(" */ int paren_depth = 1; @@ -1191,7 +1216,7 @@ parse_comm (char **word, size_t *word_length, size_t *max_length, #endif error = exec_comm (comm, word, word_length, max_length, - flags, pwordexp, ifs, ifs_white); + flags, pwordexp, ifsdata); #ifdef __libc_ptf_call __libc_ptf_call (pthread_setcancelstate, (state, NULL), 0); @@ -1222,14 +1247,12 @@ parse_comm (char **word, size_t *word_length, size_t *max_length, return WRDE_SYNTAX; } -#define CHAR_IN_SET(ch, char_set) \ - (memchr (char_set "", ch, sizeof (char_set) - 1) != NULL) static int internal_function parse_param (char **word, size_t *word_length, size_t *max_length, const char *words, size_t *offset, int flags, wordexp_t *pwordexp, - const char *ifs, const char *ifs_white, int quoted) + struct ifss *ifsdata, int quoted) { /* We are poised just after "$" */ enum action @@ -1269,6 +1292,8 @@ parse_param (char **word, size_t *word_length, size_t *max_length, if (brace) ++*offset; + uint32_t set1[8] = {0x0, 0x410, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0}; /* *@$ */ + /* First collect the parameter name. */ if (words[*offset] == '#') @@ -1306,7 +1331,7 @@ parse_param (char **word, size_t *word_length, size_t *max_length, } while (isdigit(words[++*offset])); } - else if (CHAR_IN_SET (words[*offset], "*@$")) + else if (in_charset (words[*offset], set1)) { /* Special parameter. */ special = 1; @@ -1349,8 +1374,9 @@ parse_param (char **word, size_t *word_length, size_t *max_length, } break; - case ':': - if (!CHAR_IN_SET (words[1 + *offset], "-=?+")) + case ':':; + uint32_t set2[8] = {0x0, 0xa0002800, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}; /* -=?+ */ + if (!in_charset (words[1 + *offset], set2)) goto syntax; colon_seen = 1; @@ -1648,7 +1674,7 @@ envsubst: case '$': offset = 0; error = parse_dollars (&expanded, &exp_len, &exp_maxl, p, - &offset, flags, NULL, NULL, NULL, 1); + &offset, flags, NULL, &nulifs, 1); if (error) { if (free_value) @@ -1991,22 +2017,22 @@ envsubst: } /* Skip IFS whitespace before the field */ - field_begin += strspn (field_begin, ifs_white); + field_begin += strspn (field_begin, ifsdata->ifs_white); if (!seen_nonws_ifs && *field_begin == 0) /* Nothing but whitespace */ break; /* Search for the end of the field */ - field_end = field_begin + strcspn (field_begin, ifs); + field_end = field_begin + strcspn (field_begin, ifsdata->ifs); /* Set up pointer to the character after end of field and skip whitespace IFS after it. */ - next_field = field_end + strspn (field_end, ifs_white); + next_field = field_end + strspn (field_end, ifsdata->ifs_white); /* Skip at most one non-whitespace IFS character after the field */ seen_nonws_ifs = 0; - if (*next_field && strchr (ifs, *next_field)) + if (*next_field && in_charset (*next_field, ifsdata->ifs_table)) { seen_nonws_ifs = 1; next_field++; @@ -2052,13 +2078,12 @@ do_error: return error; } -#undef CHAR_IN_SET static int internal_function parse_dollars (char **word, size_t *word_length, size_t *max_length, const char *words, size_t *offset, int flags, - wordexp_t *pwordexp, const char *ifs, const char *ifs_white, + wordexp_t *pwordexp, struct ifss *ifsdata, int quoted) { /* We are poised _at_ "$" */ @@ -2097,7 +2122,7 @@ parse_dollars (char **word, size_t *word_length, size_t *max_length, (*offset) += 2; return parse_comm (word, word_length, max_length, words, offset, flags, - quoted? NULL : pwordexp, ifs, ifs_white); + quoted? NULL : pwordexp, ifsdata); case '[': (*offset) += 2; @@ -2109,7 +2134,7 @@ parse_dollars (char **word, size_t *word_length, size_t *max_length, default: ++(*offset); /* parse_param needs to know if "{" is there */ return parse_param (word, word_length, max_length, words, offset, flags, - pwordexp, ifs, ifs_white, quoted); + pwordexp, ifsdata, quoted); } } @@ -2117,7 +2142,7 @@ static int internal_function parse_backtick (char **word, size_t *word_length, size_t *max_length, const char *words, size_t *offset, int flags, - wordexp_t *pwordexp, const char *ifs, const char *ifs_white) + wordexp_t *pwordexp, struct ifss *ifsdata) { /* We are poised just after "`" */ int error; @@ -2133,7 +2158,7 @@ parse_backtick (char **word, size_t *word_length, size_t *max_length, case '`': /* Go -- give the script to the shell */ error = exec_comm (comm, word, word_length, max_length, flags, - pwordexp, ifs, ifs_white); + pwordexp, ifsdata); free (comm); return error; @@ -2181,7 +2206,7 @@ static int internal_function parse_dquote (char **word, size_t *word_length, size_t *max_length, const char *words, size_t *offset, int flags, - wordexp_t *pwordexp, const char * ifs, const char * ifs_white) + wordexp_t *pwordexp, struct ifss *ifsdata) { /* We are poised just after a double-quote */ int error; @@ -2195,7 +2220,7 @@ parse_dquote (char **word, size_t *word_length, size_t *max_length, case '$': error = parse_dollars (word, word_length, max_length, words, offset, - flags, pwordexp, ifs, ifs_white, 1); + flags, pwordexp, ifsdata, 1); /* The ``1'' here is to tell parse_dollars not to * split the fields. It may need to, however ("$@"). */ @@ -2207,7 +2232,7 @@ parse_dquote (char **word, size_t *word_length, size_t *max_length, case '`': ++(*offset); error = parse_backtick (word, word_length, max_length, words, - offset, flags, NULL, NULL, NULL); + offset, flags, NULL, &nulifs); /* The first NULL here is to tell parse_backtick not to * split the fields. */ @@ -2340,6 +2365,13 @@ wordexp (const char *words, wordexp_t *pwordexp, int flags) *whch = '\0'; } + struct ifss ifsdata; + ifsdata.ifs = ifs; + ifsdata.ifs_white = ifs_white; + make_charset (ifs, ifsdata.ifs_table); + make_charset (ifs_white, ifsdata.ifs_white_table); + + for (words_offset = 0 ; words[words_offset] ; ++words_offset) switch (words[words_offset]) { @@ -2354,7 +2386,7 @@ wordexp (const char *words, wordexp_t *pwordexp, int flags) case '$': error = parse_dollars (&word, &word_length, &max_length, words, - &words_offset, flags, pwordexp, ifs, ifs_white, + &words_offset, flags, pwordexp, &ifsdata, 0); if (error) @@ -2365,8 +2397,7 @@ wordexp (const char *words, wordexp_t *pwordexp, int flags) case '`': ++words_offset; error = parse_backtick (&word, &word_length, &max_length, words, - &words_offset, flags, pwordexp, ifs, - ifs_white); + &words_offset, flags, pwordexp, &ifsdata); if (error) goto do_error; @@ -2376,7 +2407,7 @@ wordexp (const char *words, wordexp_t *pwordexp, int flags) case '"': ++words_offset; error = parse_dquote (&word, &word_length, &max_length, words, - &words_offset, flags, pwordexp, ifs, ifs_white); + &words_offset, flags, pwordexp, &ifsdata); if (error) goto do_error; @@ -2422,21 +2453,24 @@ wordexp (const char *words, wordexp_t *pwordexp, int flags) case '[': case '?': error = parse_glob (&word, &word_length, &max_length, words, - &words_offset, flags, pwordexp, ifs, ifs_white); + &words_offset, flags, pwordexp, &ifsdata); if (error) goto do_error; break; - default: + default:; /* Is it a word separator? */ - if (strchr (" \t", words[words_offset]) == NULL) - { - char ch = words[words_offset]; + char ch = words[words_offset]; + uint32_t set3[8] = {0x200, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}; /* \t */ + if (!in_charset (ch, set3)) + { /* Not a word separator -- but is it a valid word char? */ - if (strchr ("\n|&;<>(){}", ch)) + uint32_t set4[8] = {0x400, 0x58000340, 0x0, 0x38000000, 0x0, 0x0, + 0x0, 0x0}; /* \n|&;<>(){} */ + if (in_charset (ch, set4)) { /* Fail */ error = WRDE_BADCHAR;