[RFC,v2] Improve wordexp performance.

Message ID 20150513172657.GA21390@domone
State New, archived
Headers

Commit Message

Ondrej Bilka May 13, 2015, 5:26 p.m. UTC
  On Wed, May 13, 2015 at 03:59:20PM +0000, Joseph Myers wrote:
> On Wed, 13 May 2015, Ondřej Bílka wrote:
> Anything like this using possibly signed char seems suspect; I'd expect it 
> to use unsigned char unless there is a clearly documented reason why 
> negative values aren't possible.
> 
Sure, was bit sleepy and forgot about that. Here is v2 which intended
cast to unsigned.
  

Patch

diff --git a/posix/wordexp.c b/posix/wordexp.c
index e711d43..40021c5 100644
--- a/posix/wordexp.c
+++ b/posix/wordexp.c
@@ -45,6 +45,37 @@ 
 #include <bits/libc-lock.h>
 #include <_itoa.h>
 
+
+static int
+in_charset (char _c, uint32_t *charset)
+{
+  unsigned char c = _c;
+  return charset[c / 32] & (1 << (c % 32));
+}
+
+static void 
+make_charset (char *_a, uint32_t *b)
+{
+  unsigned char *a = (unsigned char *) _a;
+  int i;
+  for (i = 0; i < 8; i++)
+    b[i] = 0;   
+  while (*a)
+   {
+     b[*a / 32] |= 1 << (*a % 32);
+     a++;
+   }
+}
+
+struct ifss {
+  char *ifs;
+  char *ifs_white;
+  uint32_t ifs_table[8];
+  uint32_t ifs_white_table[8];
+};
+
+struct ifss nulifs;
+
 /* Undefine the following line for the production version.  */
 /* #define NDEBUG 1 */
 #include <assert.h>
@@ -63,18 +94,16 @@  extern char **__libc_argv attribute_hidden;
 /* Some forward declarations */
 static int parse_dollars (char **word, size_t *word_length, size_t *max_length,
 			  const char *words, size_t *offset, int flags,
-			  wordexp_t *pwordexp, const char *ifs,
-			  const char *ifs_white, int quoted)
+			  wordexp_t *pwordexp, struct ifss *ifsdata, int quoted)
      internal_function;
 static int parse_backtick (char **word, size_t *word_length,
 			   size_t *max_length, const char *words,
 			   size_t *offset, int flags, wordexp_t *pwordexp,
-			   const char *ifs, const char *ifs_white)
+			   struct ifss *ifsdata)
      internal_function;
 static int parse_dquote (char **word, size_t *word_length, size_t *max_length,
 			 const char *words, size_t *offset, int flags,
-			 wordexp_t *pwordexp, const char *ifs,
-			 const char *ifs_white)
+			 wordexp_t *pwordexp, struct ifss *ifsdata)
      internal_function;
 static int eval_expr (char *expr, long int *result) internal_function;
 
@@ -381,8 +410,7 @@  parse_tilde (char **word, size_t *word_length, size_t *max_length,
 static int
 internal_function
 do_parse_glob (const char *glob_word, char **word, size_t *word_length,
-	       size_t *max_length, wordexp_t *pwordexp, const char *ifs,
-	       const char *ifs_white)
+	       size_t *max_length, wordexp_t *pwordexp, struct ifss *ifsdata)
 {
   int error;
   unsigned int match;
@@ -397,7 +425,7 @@  do_parse_glob (const char *glob_word, char **word, size_t *word_length,
       return WRDE_NOSPACE;
     }
 
-  if (ifs && !*ifs)
+  if (ifsdata->ifs && !*(ifsdata->ifs))
     {
       /* No field splitting allowed. */
       assert (globbuf.gl_pathv[0] != NULL);
@@ -414,7 +442,7 @@  do_parse_glob (const char *glob_word, char **word, size_t *word_length,
       return *word ? 0 : WRDE_NOSPACE;
     }
 
-  assert (ifs == NULL || *ifs != '\0');
+  assert (ifsdata->ifs == NULL || *(ifsdata->ifs) != '\0');
   if (*word != NULL)
     {
       free (*word);
@@ -439,7 +467,7 @@  static int
 internal_function
 parse_glob (char **word, size_t *word_length, size_t *max_length,
 	    const char *words, size_t *offset, int flags,
-	    wordexp_t *pwordexp, const char *ifs, const char *ifs_white)
+	    wordexp_t *pwordexp, struct ifss *ifsdata)
 {
   /* We are poised just after a '*', a '[' or a '?'. */
   int error = WRDE_NOSPACE;
@@ -452,7 +480,7 @@  parse_glob (char **word, size_t *word_length, size_t *max_length,
   glob_list.we_offs = 0;
   for (; words[*offset] != '\0'; ++*offset)
     {
-      if (strchr (ifs, words[*offset]) != NULL)
+      if (in_charset (words[*offset], ifsdata->ifs_table))
 	/* Reached IFS */
 	break;
 
@@ -488,7 +516,7 @@  parse_glob (char **word, size_t *word_length, size_t *max_length,
       if (quoted != 1 && words[*offset] == '$')
 	{
 	  error = parse_dollars (word, word_length, max_length, words,
-				 offset, flags, &glob_list, ifs, ifs_white,
+				 offset, flags, &glob_list, ifsdata,
 				 quoted == 2);
 	  if (error)
 	    goto tidy_up;
@@ -523,7 +551,7 @@  parse_glob (char **word, size_t *word_length, size_t *max_length,
   *word = w_newword (word_length, max_length);
   for (i = 0; error == 0 && i < glob_list.we_wordc; i++)
     error = do_parse_glob (glob_list.we_wordv[i], word, word_length,
-			   max_length, pwordexp, ifs, ifs_white);
+			   max_length, pwordexp, ifsdata);
 
   /* Now tidy up */
 tidy_up:
@@ -685,7 +713,7 @@  parse_arith (char **word, size_t *word_length, size_t *max_length,
 	{
 	case '$':
 	  error = parse_dollars (&expr, &expr_length, &expr_maxlen,
-				 words, offset, flags, NULL, NULL, NULL, 1);
+				 words, offset, flags, NULL, &nulifs, 1);
 	  /* The ``1'' here is to tell parse_dollars not to
 	   * split the fields.
 	   */
@@ -699,7 +727,7 @@  parse_arith (char **word, size_t *word_length, size_t *max_length,
 	case '`':
 	  (*offset)++;
 	  error = parse_backtick (&expr, &expr_length, &expr_maxlen,
-				  words, offset, flags, NULL, NULL, NULL);
+				  words, offset, flags, NULL, &nulifs);
 	  /* The first NULL here is to tell parse_backtick not to
 	   * split the fields.
 	   */
@@ -884,8 +912,7 @@  exec_comm_child (char *comm, int *fildes, int showerr, int noexec)
 static int
 internal_function
 exec_comm (char *comm, char **word, size_t *word_length, size_t *max_length,
-	   int flags, wordexp_t *pwordexp, const char *ifs,
-	   const char *ifs_white)
+	   int flags, wordexp_t *pwordexp, struct ifss *ifsdata)
 {
   int fildes[2];
 #define bufsize 128
@@ -1010,10 +1037,10 @@  exec_comm (char *comm, char **word, size_t *word_length, size_t *max_length,
 
 	  for (i = 0; i < buflen; ++i)
 	    {
-	      if (strchr (ifs, buffer[i]) != NULL)
+	      if (in_charset (buffer[i], ifsdata->ifs_table))
 		{
 		  /* Current character is IFS */
-		  if (strchr (ifs_white, buffer[i]) == NULL)
+		  if (!in_charset (buffer[i], ifsdata->ifs_white_table))
 		    {
 		      /* Current character is IFS but not whitespace */
 		      if (copying == 2)
@@ -1142,7 +1169,7 @@  static int
 internal_function
 parse_comm (char **word, size_t *word_length, size_t *max_length,
 	    const char *words, size_t *offset, int flags, wordexp_t *pwordexp,
-	    const char *ifs, const char *ifs_white)
+	    struct ifss *ifsdata)
 {
   /* We are poised just after "$(" */
   int paren_depth = 1;
@@ -1191,7 +1218,7 @@  parse_comm (char **word, size_t *word_length, size_t *max_length,
 #endif
 
 		  error = exec_comm (comm, word, word_length, max_length,
-				     flags, pwordexp, ifs, ifs_white);
+				     flags, pwordexp, ifsdata);
 
 #ifdef __libc_ptf_call
 		  __libc_ptf_call (pthread_setcancelstate, (state, NULL), 0);
@@ -1222,14 +1249,12 @@  parse_comm (char **word, size_t *word_length, size_t *max_length,
   return WRDE_SYNTAX;
 }
 
-#define CHAR_IN_SET(ch, char_set) \
-  (memchr (char_set "", ch, sizeof (char_set) - 1) != NULL)
 
 static int
 internal_function
 parse_param (char **word, size_t *word_length, size_t *max_length,
 	     const char *words, size_t *offset, int flags, wordexp_t *pwordexp,
-	     const char *ifs, const char *ifs_white, int quoted)
+	     struct ifss *ifsdata, int quoted)
 {
   /* We are poised just after "$" */
   enum action
@@ -1269,6 +1294,8 @@  parse_param (char **word, size_t *word_length, size_t *max_length,
   if (brace)
     ++*offset;
 
+  uint32_t set1[8] = {0x0, 0x410, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0}; /* *@$ */
+
   /* First collect the parameter name. */
 
   if (words[*offset] == '#')
@@ -1306,7 +1333,7 @@  parse_param (char **word, size_t *word_length, size_t *max_length,
 	}
       while (isdigit(words[++*offset]));
     }
-  else if (CHAR_IN_SET (words[*offset], "*@$"))
+  else if (in_charset (words[*offset], set1))
     {
       /* Special parameter. */
       special = 1;
@@ -1349,8 +1376,9 @@  parse_param (char **word, size_t *word_length, size_t *max_length,
 	    }
 	  break;
 
-	case ':':
-	  if (!CHAR_IN_SET (words[1 + *offset], "-=?+"))
+	case ':':;
+          uint32_t set2[8] = {0x0, 0xa0002800, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}; /* -=?+ */
+	  if (!in_charset (words[1 + *offset], set2))
 	    goto syntax;
 
 	  colon_seen = 1;
@@ -1648,7 +1676,7 @@  envsubst:
 		case '$':
 		  offset = 0;
 		  error = parse_dollars (&expanded, &exp_len, &exp_maxl, p,
-					 &offset, flags, NULL, NULL, NULL, 1);
+					 &offset, flags, NULL, &nulifs, 1);
 		  if (error)
 		    {
 		      if (free_value)
@@ -1991,22 +2019,22 @@  envsubst:
 	    }
 
 	  /* Skip IFS whitespace before the field */
-	  field_begin += strspn (field_begin, ifs_white);
+	  field_begin += strspn (field_begin, ifsdata->ifs_white);
 
 	  if (!seen_nonws_ifs && *field_begin == 0)
 	    /* Nothing but whitespace */
 	    break;
 
 	  /* Search for the end of the field */
-	  field_end = field_begin + strcspn (field_begin, ifs);
+	  field_end = field_begin + strcspn (field_begin, ifsdata->ifs);
 
 	  /* Set up pointer to the character after end of field and
 	     skip whitespace IFS after it. */
-	  next_field = field_end + strspn (field_end, ifs_white);
+	  next_field = field_end + strspn (field_end, ifsdata->ifs_white);
 
 	  /* Skip at most one non-whitespace IFS character after the field */
 	  seen_nonws_ifs = 0;
-	  if (*next_field && strchr (ifs, *next_field))
+	  if (*next_field && in_charset (*next_field, ifsdata->ifs_table))
 	    {
 	      seen_nonws_ifs = 1;
 	      next_field++;
@@ -2052,13 +2080,12 @@  do_error:
   return error;
 }
 
-#undef CHAR_IN_SET
 
 static int
 internal_function
 parse_dollars (char **word, size_t *word_length, size_t *max_length,
 	       const char *words, size_t *offset, int flags,
-	       wordexp_t *pwordexp, const char *ifs, const char *ifs_white,
+	       wordexp_t *pwordexp, struct ifss *ifsdata,
 	       int quoted)
 {
   /* We are poised _at_ "$" */
@@ -2097,7 +2124,7 @@  parse_dollars (char **word, size_t *word_length, size_t *max_length,
 
       (*offset) += 2;
       return parse_comm (word, word_length, max_length, words, offset, flags,
-			 quoted? NULL : pwordexp, ifs, ifs_white);
+			 quoted? NULL : pwordexp, ifsdata);
 
     case '[':
       (*offset) += 2;
@@ -2109,7 +2136,7 @@  parse_dollars (char **word, size_t *word_length, size_t *max_length,
     default:
       ++(*offset);	/* parse_param needs to know if "{" is there */
       return parse_param (word, word_length, max_length, words, offset, flags,
-			   pwordexp, ifs, ifs_white, quoted);
+			   pwordexp, ifsdata, quoted);
     }
 }
 
@@ -2117,7 +2144,7 @@  static int
 internal_function
 parse_backtick (char **word, size_t *word_length, size_t *max_length,
 		const char *words, size_t *offset, int flags,
-		wordexp_t *pwordexp, const char *ifs, const char *ifs_white)
+		wordexp_t *pwordexp, struct ifss *ifsdata)
 {
   /* We are poised just after "`" */
   int error;
@@ -2133,7 +2160,7 @@  parse_backtick (char **word, size_t *word_length, size_t *max_length,
 	case '`':
 	  /* Go -- give the script to the shell */
 	  error = exec_comm (comm, word, word_length, max_length, flags,
-			     pwordexp, ifs, ifs_white);
+			     pwordexp, ifsdata);
 	  free (comm);
 	  return error;
 
@@ -2181,7 +2208,7 @@  static int
 internal_function
 parse_dquote (char **word, size_t *word_length, size_t *max_length,
 	      const char *words, size_t *offset, int flags,
-	      wordexp_t *pwordexp, const char * ifs, const char * ifs_white)
+	      wordexp_t *pwordexp, struct ifss *ifsdata)
 {
   /* We are poised just after a double-quote */
   int error;
@@ -2195,7 +2222,7 @@  parse_dquote (char **word, size_t *word_length, size_t *max_length,
 
 	case '$':
 	  error = parse_dollars (word, word_length, max_length, words, offset,
-				 flags, pwordexp, ifs, ifs_white, 1);
+				 flags, pwordexp, ifsdata, 1);
 	  /* The ``1'' here is to tell parse_dollars not to
 	   * split the fields.  It may need to, however ("$@").
 	   */
@@ -2207,7 +2234,7 @@  parse_dquote (char **word, size_t *word_length, size_t *max_length,
 	case '`':
 	  ++(*offset);
 	  error = parse_backtick (word, word_length, max_length, words,
-				  offset, flags, NULL, NULL, NULL);
+				  offset, flags, NULL, &nulifs);
 	  /* The first NULL here is to tell parse_backtick not to
 	   * split the fields.
 	   */
@@ -2340,6 +2367,13 @@  wordexp (const char *words, wordexp_t *pwordexp, int flags)
       *whch = '\0';
     }
 
+  struct ifss ifsdata;
+  ifsdata.ifs = ifs;
+  ifsdata.ifs_white = ifs_white;
+  make_charset (ifs, ifsdata.ifs_table);
+  make_charset (ifs_white, ifsdata.ifs_white_table);
+
+
   for (words_offset = 0 ; words[words_offset] ; ++words_offset)
     switch (words[words_offset])
       {
@@ -2354,7 +2388,7 @@  wordexp (const char *words, wordexp_t *pwordexp, int flags)
 
       case '$':
 	error = parse_dollars (&word, &word_length, &max_length, words,
-			       &words_offset, flags, pwordexp, ifs, ifs_white,
+			       &words_offset, flags, pwordexp, &ifsdata,
 			       0);
 
 	if (error)
@@ -2365,8 +2399,7 @@  wordexp (const char *words, wordexp_t *pwordexp, int flags)
       case '`':
 	++words_offset;
 	error = parse_backtick (&word, &word_length, &max_length, words,
-				&words_offset, flags, pwordexp, ifs,
-				ifs_white);
+				&words_offset, flags, pwordexp, &ifsdata);
 
 	if (error)
 	  goto do_error;
@@ -2376,7 +2409,7 @@  wordexp (const char *words, wordexp_t *pwordexp, int flags)
       case '"':
 	++words_offset;
 	error = parse_dquote (&word, &word_length, &max_length, words,
-			      &words_offset, flags, pwordexp, ifs, ifs_white);
+			      &words_offset, flags, pwordexp, &ifsdata);
 
 	if (error)
 	  goto do_error;
@@ -2422,21 +2455,24 @@  wordexp (const char *words, wordexp_t *pwordexp, int flags)
       case '[':
       case '?':
 	error = parse_glob (&word, &word_length, &max_length, words,
-			    &words_offset, flags, pwordexp, ifs, ifs_white);
+			    &words_offset, flags, pwordexp, &ifsdata);
 
 	if (error)
 	  goto do_error;
 
 	break;
 
-      default:
+      default:;
 	/* Is it a word separator? */
-	if (strchr (" \t", words[words_offset]) == NULL)
-	  {
-	    char ch = words[words_offset];
+	char ch = words[words_offset];
 
+        uint32_t set3[8] = {0x200, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}; /* \t */
+	if (!in_charset (ch, set3))
+	  {
 	    /* Not a word separator -- but is it a valid word char? */
-	    if (strchr ("\n|&;<>(){}", ch))
+            uint32_t set4[8] = {0x400, 0x58000340, 0x0, 0x38000000, 0x0, 0x0, 
+                                0x0, 0x0}; /* \n|&;<>(){} */
+	    if (in_charset (ch, set4))
 	      {
 		/* Fail */
 		error = WRDE_BADCHAR;