[BZ,#16009] fix memory handling in strxfrm_l

Message ID 54797C90.7030504@web.de
State Superseded
Headers

Commit Message

Leonhard Holz Nov. 29, 2014, 7:58 a.m. UTC
  Hello,

this patch solves bug #16009 by implementing an additional path in 
strxfrm that does not depend on caching the weight and rule indices.

In detail the following changed:

* The old main loop was factored out of strxfrm_l into the function 
do_xfrm_cached to be able to alternativly use the non-caching version 
do_xfrm.

* strxfrm_l allocates a a fixed size array on the stack. If this is not 
sufficiant to store the weight and rule indices, the non-caching path is 
taken. As the cache size is not dependent on the input there can be no 
problems with integer overflows or stack allocations greater than 
__MAX_ALLOCA_CUTOFF. Note that malloc-ing is not possible because the 
definition of strxfrm does not allow an oom errorhandling.

* The uncached path determines the weight and rule index for every char 
and for every pass again. Handling of backward sequences needs a special 
threatment, I found no way to implement it without allocation. This is 
now done by pushing the backward sequence on the stack within a single 
linked list that can later easily be traversed backwards (but not 
free'd) and avoids the problem of stack allocations beyond 
__MAX_ALLOCA_CUTOFF. Here again, malloc is not possible.

* Passing all the locale data array by array resulted in very long 
parameter lists, so I introduced a structure that holds them.

* Checking for zero src string has been moved a bit upwards, it is 
before the locale data initialization now.

* To verify that the non-caching path works correct I added a test run 
to localedata/sort-test.sh & localedata/xfrm-test.c where all strings 
are patched up with spaces so that they are too large for the caching path.

make tests && make xcheck report no additional errors. Unfortunately the 
diff of strxfrm_l.c is a bit messy, it looks factoring out the main loop 
was too much for git. :|

Leonhard

2014-11-29  Leonhard Holz <leonhard.holz@web.de>

         [BZ #16009]
	* string/strxfrm_l.c (STRXFRM): Allocate fixed size cache
	for weights and rules. Use do_xfrm_cached if data fits in
	cache, do_xfrm otherwise. Moved former main loop to...
	* (do_xfrm_cached): New function.
	* (do_xfrm): Non-caching version of do_xfrm_cached. Uses
	find_idx, find_position and stack_push.
	* (find_idx): New function.
	* (find_position): Likewise.
	* (stack_push): New macro.
	* localedata/sort-test.sh: Added test run for do_xfrm.
	* localedata/xfrm-test.c (main): Added command line
	option -nocache to run the test with strings that are
	too large for the STRXFRM cache.

diff --git a/localedata/sort-test.sh b/localedata/sort-test.sh
index e37129a..c464b05 100644
--- a/localedata/sort-test.sh
+++ b/localedata/sort-test.sh
@@ -53,11 +53,18 @@ for l in $lang; do
    ${common_objpfx}localedata/xfrm-test $id < $cns.in \
    > ${common_objpfx}localedata/$cns.xout || here=1
   cmp -s $cns.in ${common_objpfx}localedata/$cns.xout || here=1
+  ${test_program_prefix_before_env} \
+   ${run_program_env} \
+   LC_ALL=$l ${test_program_prefix_after_env} \
+   ${common_objpfx}localedata/xfrm-test $id -nocache < $cns.in \
+   > ${common_objpfx}localedata/$cns.xoutl || here=1
+  cmp -s $cns.in ${common_objpfx}localedata/$cns.xoutl || here=1
   if test $here -eq 0; then
     echo "$l xfrm-test OK"
   else
     echo "$l xfrm-test FAIL"
     diff -u $cns.in ${common_objpfx}localedata/$cns.xout | sed 's/^/  /'
+    diff -u $cns.in ${common_objpfx}localedata/$cns.xoutl | sed 's/^/  /'
     status=1
   fi
 done
diff --git a/localedata/xfrm-test.c b/localedata/xfrm-test.c
index d2aba7d..9ac57bf 100644
--- a/localedata/xfrm-test.c
+++ b/localedata/xfrm-test.c
@@ -24,6 +24,8 @@
 #include <stdlib.h>
 #include <string.h>
 
+/* Keep in sync with string/strxfrm_l.c.  */
+#define CACHE_SIZE 4095
 
 struct lines
 {
@@ -36,7 +38,7 @@ static int xstrcmp (const void *, const void *);
 int
 main (int argc, char *argv[])
 {
-  int result = 0;
+  int result = 0, nocache = 0;
   size_t nstrings, nstrings_max;
   struct lines *strings;
   char *line = NULL;
@@ -44,7 +46,16 @@ main (int argc, char *argv[])
   size_t n;
 
   if (argc < 2)
-    error (1, 0, "usage: %s <random seed>", argv[0]);
+    error (1, 0, "usage: %s <random seed> [-nocache]", argv[0]);
+
+  if (argc == 3)
+    if (strcmp (argv[2], "-nocache") == 0)
+      nocache = 1;
+    else
+      {
+	printf ("Unknown option %s!\n", argv[2]);
+	exit (1);
+      }
 
   setlocale (LC_ALL, "");
 
@@ -59,9 +70,9 @@ main (int argc, char *argv[])
 
   while (1)
     {
-      char saved, *newp;
-      int needed;
-      int l;
+      char saved, *word, *newp;
+      size_t l, line_len, needed;
+
       if (getline (&line, &len, stdin) < 0)
 	break;
 
@@ -83,10 +94,35 @@ main (int argc, char *argv[])
 
       saved = line[l];
       line[l] = '\0';
-      needed = strxfrm (NULL, line, 0);
+
+      if (nocache)
+	{
+	  line_len = strlen (line);
+	  word = malloc (line_len + CACHE_SIZE + 1);
+	  if (word == NULL)
+	    {
+	      perror (argv[0]);
+	      exit (1);
+	    }
+	  memset (word, ' ', CACHE_SIZE);
+	  memcpy (word + CACHE_SIZE, line, line_len);
+	  word[line_len + CACHE_SIZE] = '\0';
+	}
+      else
+        word = line;
+
+      needed = strxfrm (NULL, word, 0);
       newp = malloc (needed + 1);
-      strxfrm (newp, line, needed + 1);
+      if (newp == NULL)
+	{
+	  perror (argv[0]);
+	  exit (1);
+	}
+      strxfrm (newp, word, needed + 1);
       strings[nstrings].xfrm = newp;
+
+      if (nocache)
+	free (word);
       line[l] = saved;
       ++nstrings;
     }
  

Comments

Leonhard Holz Dec. 8, 2014, 3:26 p.m. UTC | #1
This patch needs a review: 
https://sourceware.org/ml/libc-alpha/2014-11/msg00903.html
  
Leonhard Holz Dec. 15, 2014, 10:58 a.m. UTC | #2
This patch is waiting for review: 
https://sourceware.org/ml/libc-alpha/2014-11/msg00903.html
  
Siddhesh Poyarekar Dec. 16, 2014, 7:51 a.m. UTC | #3
On Sat, Nov 29, 2014 at 08:58:08AM +0100, Leonhard Holz wrote:
> Hello,
> 
> this patch solves bug #16009 by implementing an additional path in strxfrm
> that does not depend on caching the weight and rule indices.

Here's a first pass review.  Sorry it took me a while to get to it.
Also in future, please inline your patches instead of attaching them
as separate files; that is easier to review.

> diff --git a/string/strxfrm_l.c b/string/strxfrm_l.c
> index 2d3f1bd..cb45edc 100644
> --- a/string/strxfrm_l.c
> +++ b/string/strxfrm_l.c
> @@ -29,7 +29,6 @@
>  # define STRING_TYPE char
>  # define USTRING_TYPE unsigned char
>  # define STRXFRM __strxfrm_l
> -# define STRCMP strcmp
>  # define STRLEN strlen
>  # define STPNCPY __stpncpy
>  # define WEIGHT_H "../locale/weight.h"

This is an unrelated and trivial change.  Post a separate patch for it
and I'll include it right away.

> @@ -40,9 +39,41 @@
>  #define CONCAT(a,b) CONCAT1(a,b)
>  #define CONCAT1(a,b) a##b
>  
> +/* The maximum number of chars for that indices are cached.  Size it so that
> +   all strings not considered "large" fit in.  CACHE_SIZE * 4  has to be lower
> +   than __MAX_ALLOCA_CUTOFF.  Keep localedata/xfrm-test.c in sync.  */
> +#define CACHE_SIZE 4095
> +
>  #include "../locale/localeinfo.h"
>  #include WEIGHT_H
>  
> +/* Single linked list for handling backward sequences.  */
> +typedef struct idx_stack
> +{
> +  struct idx_stack* prev;
> +  int32_t idx;
> +  size_t len;
> +} idx_stack_t;
> +
> +#define stack_push(_stack, _idx, _len)\
> +{\
> +  idx_stack_t *prev = _stack;\
> +  _stack = alloca (sizeof (idx_stack_t));\
> +  _stack->prev = prev;\
> +  _stack->idx = _idx;\
> +  _stack->len = _len;\
> +}

The formatting is wrong.  See other macro definitions in the glibc
sources for formatting examples.

> +
> +/* Group locale data for shorter parameter lists.  */
> +typedef struct
> +{
> +  uint_fast32_t nrules;
> +  unsigned char *rulesets;
> +  USTRING_TYPE *weights;
> +  int32_t *table;
> +  USTRING_TYPE *extra;
> +  int32_t *indirect;
> +} locale_data_t;
>  
>  #ifndef WIDE_CHAR_VERSION
>  
> @@ -81,117 +112,270 @@ utf8_encode (char *buf, int val)
>  }
>  #endif
>  
> +/* Find next weight and rule index.  Inlined since called for every char.  */
> +static __always_inline size_t
> +find_idx (const USTRING_TYPE **us, int32_t *weight_idx,
> +	  unsigned char *rule_idx, const locale_data_t *l_data, const int pass)
> +{
> +  int32_t tmp = findidx (l_data->table, l_data->indirect, l_data->extra, us,
> +			 -1);
> +  *rule_idx = tmp >> 24;
> +  int32_t idx = tmp & 0xffffff;
> +  size_t len = l_data->weights[idx++];
> +
> +  /* Skip over indices of previous levels.  */
> +  for (int i = 0; i < pass; i++)
> +    {
> +      idx += len;
> +      len = l_data->weights[idx++];
> +    }
>  
> -size_t
> -STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
> +  *weight_idx = idx;
> +  return len;
> +}
> +
> +static int
> +find_position (const USTRING_TYPE *us, const locale_data_t *l_data,
> +	       const int pass)
>  {
> -  struct __locale_data *current = l->__locales[LC_COLLATE];
> -  uint_fast32_t nrules = current->values[_NL_ITEM_INDEX (_NL_COLLATE_NRULES)].word;
> -  /* We don't assign the following values right away since it might be
> -     unnecessary in case there are no rules.  */
> -  const unsigned char *rulesets;
> -  const int32_t *table;
> -  const USTRING_TYPE *weights;
> -  const USTRING_TYPE *extra;
> -  const int32_t *indirect;
> +  int32_t weight_idx;
> +  unsigned char rule_idx;
> +  const USTRING_TYPE *usrc = us;
> +
> +  find_idx (&usrc, &weight_idx, &rule_idx, l_data, pass);
> +  return l_data->rulesets[rule_idx * l_data->nrules + pass] & sort_position;
> +}
> +
> +/* Do the transformation.  */
> +static size_t
> +do_xfrm (const USTRING_TYPE *usrc, STRING_TYPE *dest, size_t n,
> +	 const locale_data_t *l_data)
> +{
> +  int32_t weight_idx;
> +  unsigned char rule_idx;
> +  size_t needed = 0;
>    uint_fast32_t pass;
> -  size_t needed;
>    size_t last_needed;
> -  const USTRING_TYPE *usrc;
> -  size_t srclen = STRLEN (src);
> -  int32_t *idxarr;
> -  unsigned char *rulearr;
> -  size_t idxmax;
> -  size_t idxcnt;
> -  int use_malloc;
>  
> -  if (nrules == 0)
> +  /* Now the passes over the weights.  */
> +  for (pass = 0; pass < l_data->nrules; ++pass)
>      {
> -      if (n != 0)
> -	STPNCPY (dest, src, MIN (srclen + 1, n));
> +      last_needed = needed;
> +      idx_stack_t *backw = NULL;
> +      const USTRING_TYPE *cur = usrc;
>  
> -      return srclen;
> -    }
> +       /* We assume that if a rule has defined `position' in one section
> +         this is true for all of them.  */
> +      int position = find_position (cur, l_data, pass);
>  
> -  rulesets = (const unsigned char *)
> -    current->values[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS)].string;
> -  table = (const int32_t *)
> -    current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_TABLE,SUFFIX))].string;
> -  weights = (const USTRING_TYPE *)
> -    current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_WEIGHT,SUFFIX))].string;
> -  extra = (const USTRING_TYPE *)
> -    current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_EXTRA,SUFFIX))].string;
> -  indirect = (const int32_t *)
> -    current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_INDIRECT,SUFFIX))].string;
> -  use_malloc = 0;
> +      if (position == 0)
> +	{
> +	  while (*cur != L('\0'))
> +	    {
> +	      size_t len = find_idx (&cur, &weight_idx, &rule_idx, l_data,
> +				     pass);
> +	      int rule = l_data->rulesets[rule_idx * l_data->nrules + pass];
>  
> -  assert (((uintptr_t) table) % __alignof__ (table[0]) == 0);
> -  assert (((uintptr_t) weights) % __alignof__ (weights[0]) == 0);
> -  assert (((uintptr_t) extra) % __alignof__ (extra[0]) == 0);
> -  assert (((uintptr_t) indirect) % __alignof__ (indirect[0]) == 0);
> +	      if ((rule & sort_forward) != 0)
> +		{
> +		  /* Handle the pushed backward sequence.  */
> +		  while (backw != NULL)
> +		    {
> +		      if (needed + backw->len < n)
> +			while (backw->len-- > 0)
> +			  dest[needed++] = l_data->weights[backw->idx++];
> +		      else
> +			/* No more characters fit into the buffer.  */
> +			needed += backw->len;
> +
> +		      backw = backw->prev;
> +		    }
>  
> -  /* Handle an empty string as a special case.  */
> -  if (srclen == 0)
> -    {
> -      if (n != 0)
> -	*dest = L('\0');
> -      return 0;
> -    }
> +		  /* Now handle the forward element.  */
> +		  if (needed + len < n)
> +		    while (len-- > 0)
> +		      dest[needed++] = l_data->weights[weight_idx++];
> +		  else
> +		    /* No more characters fit into the buffer.  */
> +		    needed += len;
> +		}
> +	      else
> +		/* Remember weight index and length of backward sequence.  */
> +		stack_push (backw, weight_idx, len);

This looks dangerous; you could overflow the stack with it if you have
enough backward sequences.  You'll be better off doing what the
strcoll uncached algorithm does, i.e. traverse all backward sequences
and then from back to front, get the weights of each sequence again.
It is slower, but won't need the additional space.

> +	    }
>  
> -  /* We need the elements of the string as unsigned values since they
> -     are used as indeces.  */
> -  usrc = (const USTRING_TYPE *) src;
> -
> -  /* Perform the first pass over the string and while doing this find
> -     and store the weights for each character.  Since we want this to
> -     be as fast as possible we are using `alloca' to store the temporary
> -     values.  But since there is no limit on the length of the string
> -     we have to use `malloc' if the string is too long.  We should be
> -     very conservative here.  */
> -  if (! __libc_use_alloca ((srclen + 1) * (sizeof (int32_t) + 1)))
> -    {
> -      idxarr = (int32_t *) malloc ((srclen + 1) * (sizeof (int32_t) + 1));
> -      rulearr = (unsigned char *) &idxarr[srclen];
> -
> -      if (idxarr == NULL)
> -	/* No memory.  Well, go with the stack then.
> -
> -	   XXX Once this implementation is stable we will handle this
> -	   differently.  Instead of precomputing the indeces we will
> -	   do this in time.  This means, though, that this happens for
> -	   every pass again.  */
> -	goto try_stack;
> -      use_malloc = 1;
> -    }
> -  else
> -    {
> -    try_stack:
> -      idxarr = (int32_t *) alloca (srclen * sizeof (int32_t));
> -      rulearr = (unsigned char *) alloca (srclen + 1);
> +
> +	  /* Handle the pushed backward sequence.  */
> +	  while (backw != NULL)
> +	    {
> +	      if (needed + backw->len < n)
> +		while (backw->len-- > 0)
> +		  dest[needed++] = l_data->weights[backw->idx++];
> +	      else
> +		/* No more characters fit into the buffer.  */
> +		needed += backw->len;
> +
> +	      backw = backw->prev;
> +	    }
> +	}
> +      else
> +	{
> +	  int val = 1;
> +#ifndef WIDE_CHAR_VERSION
> +	  char buf[7];
> +	  size_t buflen;
> +#endif
> +	  size_t i;
> +
> +	  while (*cur != L('\0'))
> +	    {
> +	      size_t len = find_idx (&cur, &weight_idx, &rule_idx, l_data,
> +				     pass);
> +	      int rule = l_data->rulesets[rule_idx * l_data->nrules + pass];
> +
> +	      if ((rule & sort_forward) != 0)
> +		{
> +		  /* Handle the pushed backward sequence.  */
> +		  while (backw != NULL)
> +		    {
> +		      if (backw->len != 0)
> +			{
> +#ifdef WIDE_CHAR_VERSION
> +			  if (needed + 1 + backw->len < n)
> +			    {
> +			      dest[needed] = val;
> +			      for (i = 0; i < backw->len; ++i)
> +				dest[needed + 1 + i] =
> +				  l_data->weights[backw->idx + i];
> +			    }
> +			  needed += 1 + backw->len;
> +#else
> +			  buflen = utf8_encode (buf, val);
> +			  if (needed + buflen + backw->len < n)
> +			    {
> +			      for (i = 0; i < buflen; ++i)
> +				dest[needed + i] = buf[i];
> +			      for (i = 0; i < backw->len; ++i)
> +				dest[needed + buflen + i] =
> +				  l_data->weights[backw->idx + i];
> +			    }
> +			  needed += buflen + backw->len;
> +#endif
> +			  val = 1;
> +			}
> +		      else
> +			++val;
> +
> +		      backw = backw->prev;
> +		    }
> +
> +		  /* Now handle the forward element.  */
> +		  if (len != 0)
> +		    {
> +#ifdef WIDE_CHAR_VERSION
> +		      if (needed + 1 + len < n)
> +			{
> +			  dest[needed] = val;
> +			  for (i = 0; i < len; ++i)
> +			    dest[needed + 1 + i] =
> +			      l_data->weights[weight_idx + i];
> +			}
> +		      needed += 1 + len;
> +#else
> +		      buflen = utf8_encode (buf, val);
> +		      if (needed + buflen + len < n)
> +			{
> +			  for (i = 0; i < buflen; ++i)
> +			    dest[needed + i] = buf[i];
> +			  for (i = 0; i < len; ++i)
> +			    dest[needed + buflen + i] =
> +			      l_data->weights[weight_idx + i];
> +			}
> +		      needed += buflen + len;
> +#endif
> +		      val = 1;
> +		    }
> +		  else
> +		    ++val;
> +		}
> +	      else
> +		/* Remember weight index and length of backward sequence.  */
> +		stack_push (backw, weight_idx, len);
> +	    }
> +
> +	  /* Handle the pushed backward sequence.  */
> +	  while (backw != NULL)
> +	    {
> +	      if (backw->len != 0)
> +		{
> +#ifdef WIDE_CHAR_VERSION
> +		  if (needed + 1 + backw->len < n)
> +		    {
> +		      dest[needed] = val;
> +		      for (i = 0; i < backw->len; ++i)
> +			dest[needed + 1 + i] =
> +			  l_data->weights[backw->idx + i];
> +		    }
> +		  needed += 1 + backw->len;
> +#else
> +		  buflen = utf8_encode (buf, val);
> +		  if (needed + buflen + backw->len < n)
> +		    {
> +		      for (i = 0; i < buflen; ++i)
> +			dest[needed + i] = buf[i];
> +		      for (i = 0; i < backw->len; ++i)
> +			dest[needed + buflen + i] =
> +			  l_data->weights[backw->idx + i];
> +		    }
> +		  needed += buflen + backw->len;
> +#endif
> +		  val = 1;
> +		}
> +	      else
> +		++val;
> +
> +	      backw = backw->prev;
> +	    }
> +	}
> +
> +      /* Finally store the byte to separate the passes or terminate
> +	 the string.  */
> +      if (needed < n)
> +	dest[needed] = pass + 1 < l_data->nrules ? L('\1') : L('\0');
> +      ++needed;
>      }
>  
> -  idxmax = 0;
> -  do
> +  /* This is a little optimization: many collation specifications have
> +     a `position' rule at the end and if no non-ignored character
> +     is found the last \1 byte is immediately followed by a \0 byte
> +     signalling this.  We can avoid the \1 byte(s).  */

Split out this optimization into a separate patch.

> +  if (needed > 2 && needed == last_needed + 1)
>      {
> -      int32_t tmp = findidx (table, indirect, extra, &usrc, -1);
> -      rulearr[idxmax] = tmp >> 24;
> -      idxarr[idxmax] = tmp & 0xffffff;
> -
> -      ++idxmax;
> +      /* Remove the \1 byte.  */
> +      if (--needed <= n)
> +	dest[needed - 1] = L('\0');
>      }
> -  while (*usrc != L('\0'));
>  
> -  /* This element is only read, the value never used but to determine
> -     another value which then is ignored.  */
> -  rulearr[idxmax] = '\0';
> +  /* Return the number of bytes/words we need, but don't count the NUL
> +     byte/word at the end.  */
> +  return needed - 1;
> +}
>  
> -  /* Now the passes over the weights.  We now use the indeces we found
> -     before.  */
> -  needed = 0;
> -  for (pass = 0; pass < nrules; ++pass)
> +/* Do the transformation using weight-index and rule cache.  */
> +static size_t
> +do_xfrm_cached (STRING_TYPE *dest, size_t n, const locale_data_t *l_data,
> +		size_t idxmax, int32_t *idxarr, const unsigned char *rulearr)
> +{
> +  size_t needed = 0;
> +  uint_fast32_t pass;
> +  size_t last_needed;
> +  size_t idxcnt;
> +
> +  /* Now the passes over the weights.  */
> +  for (pass = 0; pass < l_data->nrules; ++pass)
>      {
>        size_t backw_stop = ~0ul;
> -      int rule = rulesets[rulearr[0] * nrules + pass];
> +      int rule = l_data->rulesets[rulearr[0] * l_data->nrules + pass];

Declare local variables rulesets, weights, etc. so that you avoid most
of the changes below.

>        /* We assume that if a rule has defined `position' in one section
>  	 this is true for all of them.  */
>        int position = rule & sort_position;
> @@ -213,11 +397,12 @@ STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
>  		      for (backw = idxcnt; backw > backw_stop; )
>  			{
>  			  --backw;
> -			  len = weights[idxarr[backw]++];
> +			  len = l_data->weights[idxarr[backw]++];
>  
>  			  if (needed + len < n)
>  			    while (len-- > 0)
> -			      dest[needed++] = weights[idxarr[backw]++];
> +			      dest[needed++] = l_data->weights[
> +						 idxarr[backw]++];
>  			  else
>  			    {
>  				/* No more characters fit into the buffer.  */
> @@ -230,10 +415,10 @@ STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
>  		    }
>  
>  		  /* Now handle the forward element.  */
> -		  len = weights[idxarr[idxcnt]++];
> +		  len = l_data->weights[idxarr[idxcnt]++];
>  		  if (needed + len < n)
>  		    while (len-- > 0)
> -		      dest[needed++] = weights[idxarr[idxcnt]++];
> +		      dest[needed++] = l_data->weights[idxarr[idxcnt]++];
>  		  else
>  		    {
>  		      /* No more characters fit into the buffer.  */
> @@ -248,7 +433,8 @@ STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
>  		    backw_stop = idxcnt;
>  		}
>  
> -	      rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
> +	      rule = l_data->rulesets[rulearr[idxcnt + 1] * l_data->nrules
> +				      + pass];
>  	    }
>  
>  
> @@ -260,11 +446,11 @@ STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
>  	      backw = idxcnt;
>  	      while (backw > backw_stop)
>  		{
> -		  size_t len = weights[idxarr[--backw]++];
> +		  size_t len = l_data->weights[idxarr[--backw]++];
>  
>  		  if (needed + len < n)
>  		    while (len-- > 0)
> -		      dest[needed++] = weights[idxarr[backw]++];
> +		      dest[needed++] = l_data->weights[idxarr[backw]++];
>  		  else
>  		    {
>  		      /* No more characters fit into the buffer.  */
> @@ -297,7 +483,7 @@ STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
>  		      for (backw = idxcnt; backw > backw_stop; )
>  			{
>  			  --backw;
> -			  len = weights[idxarr[backw]++];
> +			  len = l_data->weights[idxarr[backw]++];
>  			  if (len != 0)
>  			    {
>  #ifdef WIDE_CHAR_VERSION
> @@ -306,7 +492,7 @@ STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
>  				  dest[needed] = val;
>  				  for (i = 0; i < len; ++i)
>  				    dest[needed + 1 + i] =
> -				      weights[idxarr[backw] + i];
> +				      l_data->weights[idxarr[backw] + i];
>  				}
>  			      needed += 1 + len;
>  #else
> @@ -317,7 +503,7 @@ STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
>  				    dest[needed + i] = buf[i];
>  				  for (i = 0; i < len; ++i)
>  				    dest[needed + buflen + i] =
> -				      weights[idxarr[backw] + i];
> +				      l_data->weights[idxarr[backw] + i];
>  				}
>  			      needed += buflen + len;
>  #endif
> @@ -332,7 +518,7 @@ STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
>  		    }
>  
>  		  /* Now handle the forward element.  */
> -		  len = weights[idxarr[idxcnt]++];
> +		  len = l_data->weights[idxarr[idxcnt]++];
>  		  if (len != 0)
>  		    {
>  #ifdef WIDE_CHAR_VERSION
> @@ -341,7 +527,7 @@ STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
>  			  dest[needed] = val;
>  			  for (i = 0; i < len; ++i)
>  			    dest[needed + 1 + i] =
> -			      weights[idxarr[idxcnt] + i];
> +			      l_data->weights[idxarr[idxcnt] + i];
>  			}
>  		      needed += 1 + len;
>  #else
> @@ -352,7 +538,7 @@ STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
>  			    dest[needed + i] = buf[i];
>  			  for (i = 0; i < len; ++i)
>  			    dest[needed + buflen + i] =
> -			      weights[idxarr[idxcnt] + i];
> +			      l_data->weights[idxarr[idxcnt] + i];
>  			}
>  		      needed += buflen + len;
>  #endif
> @@ -371,7 +557,8 @@ STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
>  		    backw_stop = idxcnt;
>  		}
>  
> -	      rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
> +	      rule = l_data->rulesets[rulearr[idxcnt + 1] * l_data->nrules
> +				      + pass];
>  	    }
>  
>  	  if (backw_stop != ~0ul)
> @@ -382,7 +569,7 @@ STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
>  	      backw = idxmax - 1;
>  	      while (backw > backw_stop)
>  		{
> -		  size_t len = weights[idxarr[--backw]++];
> +		  size_t len = l_data->weights[idxarr[--backw]++];
>  		  if (len != 0)
>  		    {
>  #ifdef WIDE_CHAR_VERSION
> @@ -391,7 +578,7 @@ STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
>  			  dest[needed] = val;
>  			  for (i = 0; i < len; ++i)
>  			    dest[needed + 1 + i] =
> -			      weights[idxarr[backw] + i];
> +			      l_data->weights[idxarr[backw] + i];
>  			}
>  		      needed += 1 + len;
>  #else
> @@ -402,7 +589,7 @@ STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
>  			    dest[needed + i] = buf[i];
>  			  for (i = 0; i < len; ++i)
>  			    dest[needed + buflen + i] =
> -			      weights[idxarr[backw] + i];
> +			      l_data->weights[idxarr[backw] + i];
>  			}
>  		      needed += buflen + len;
>  #endif
> @@ -418,7 +605,7 @@ STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
>        /* Finally store the byte to separate the passes or terminate
>  	 the string.  */
>        if (needed < n)
> -	dest[needed] = pass + 1 < nrules ? L('\1') : L('\0');
> +	dest[needed] = pass + 1 < l_data->nrules ? L('\1') : L('\0');
>        ++needed;
>      }
>  
> @@ -433,14 +620,87 @@ STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
>  	dest[needed - 1] = L('\0');
>      }
>  
> -  /* Free the memory if needed.  */
> -  if (use_malloc)
> -    free (idxarr);
> -
>    /* Return the number of bytes/words we need, but don't count the NUL
>       byte/word at the end.  */
>    return needed - 1;
>  }
> +
> +size_t
> +STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
> +{
> +  locale_data_t l_data;
> +  struct __locale_data *current = l->__locales[LC_COLLATE];
> +  l_data.nrules = current->values[_NL_ITEM_INDEX (_NL_COLLATE_NRULES)].word;
> +
> +  /* Handle byte comparison case.  */
> +  if (l_data.nrules == 0)
> +    {
> +      size_t srclen = STRLEN (src);
> +
> +      if (n != 0)
> +	STPNCPY (dest, src, MIN (srclen + 1, n));
> +
> +      return srclen;
> +    }
> +
> +  /* Handle an empty string, code hereafter relies on strlen (src) > 0.  */
> +  if (*src == L('\0'))
> +    {
> +      if (n != 0)
> +	*dest = L('\0');
> +      return 0;
> +    }
> +
> +  /* Get the locale data.  */
> +  l_data.rulesets = (unsigned char *)
> +    current->values[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS)].string;
> +  l_data.table = (int32_t *)
> +    current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_TABLE,SUFFIX))].string;
> +  l_data.weights = (USTRING_TYPE *)
> +    current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_WEIGHT,SUFFIX))].string;
> +  l_data.extra = (USTRING_TYPE *)
> +    current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_EXTRA,SUFFIX))].string;
> +  l_data.indirect = (int32_t *)
> +    current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_INDIRECT,SUFFIX))].string;
> +
> +  assert (((uintptr_t) l_data.table) % __alignof__ (l_data.table[0]) == 0);
> +  assert (((uintptr_t) l_data.weights) % __alignof__ (l_data.weights[0]) == 0);
> +  assert (((uintptr_t) l_data.extra) % __alignof__ (l_data.extra[0]) == 0);
> +  assert (((uintptr_t) l_data.indirect) % __alignof__ (l_data.indirect[0]) == 0);
> +
> +  /* We need the elements of the string as unsigned values since they
> +     are used as indeces.  */
> +  const USTRING_TYPE *usrc = (const USTRING_TYPE *) src;
> +
> +  /* Allocate cache for strings up to 4 KByte on the stack and fill it with
> +     weight and rule indices.  If the cache size is not sufficient, continue
> +     with the uncached xfrm version.  */
> +  size_t idxmax = 0;
> +  const USTRING_TYPE *cur = usrc;
> +  int32_t *idxarr = alloca (CACHE_SIZE * sizeof (int32_t));
> +  unsigned char *rulearr = alloca (CACHE_SIZE + 1);
> +
> +  do
> +    {
> +      int32_t tmp = findidx (l_data.table, l_data.indirect, l_data.extra, &cur,
> +			     -1);
> +      rulearr[idxmax] = tmp >> 24;
> +      idxarr[idxmax] = tmp & 0xffffff;
> +
> +      ++idxmax;
> +    }
> +  while (*cur != L('\0') && idxmax < CACHE_SIZE);

If you're going the route of defining a fixed size, you'll have to go
through the pain of proving that the size you've chosen is optimal.
Instead, just use malloc to allocate the required size and if it
fails, fall back to the non-cached version.  In that case, your test
case changes are no longer valid and you'll need to verify by manually
disabling the cached path to test the non-cached paths.

Siddhesh
  
Leonhard Holz Dec. 31, 2014, 9:36 a.m. UTC | #4
>> +		/* Remember weight index and length of backward sequence.  */
>> +		stack_push (backw, weight_idx, len);
>
> This looks dangerous; you could overflow the stack with it if you have
> enough backward sequences.  You'll be better off doing what the
> strcoll uncached algorithm does, i.e. traverse all backward sequences
> and then from back to front, get the weights of each sequence again.
> It is slower, but won't need the additional space.

Ok. This needs some calculations for offsets that will not make the code 
easier to read but maybe we can refactor it later.

>> +  /* This is a little optimization: many collation specifications have
>> +     a `position' rule at the end and if no non-ignored character
>> +     is found the last \1 byte is immediately followed by a \0 byte
>> +     signalling this.  We can avoid the \1 byte(s).  */
>
> Split out this optimization into a separate patch.

This is copied from the cached path so it needs to be there to reproduce 
the functionality.

> If you're going the route of defining a fixed size, you'll have to go
> through the pain of proving that the size you've chosen is optimal.
> Instead, just use malloc to allocate the required size and if it
> fails, fall back to the non-cached version.  In that case, your test
> case changes are no longer valid and you'll need to verify by manually
> disabling the cached path to test the non-cached paths.
>

I want to object here. The needed cache size can be in a region where 
the malloc will not fail but hurt the system performance considerable 
(swapping), especially as the malloced size is five times the string 
size. I think that low level functions like strxfrm should not carry 
such risks.

Leonhard
  
Siddhesh Poyarekar Dec. 31, 2014, 9:47 a.m. UTC | #5
On Wed, Dec 31, 2014 at 10:36:11AM +0100, Leonhard Holz wrote:
> I want to object here. The needed cache size can be in a region where the
> malloc will not fail but hurt the system performance considerable
> (swapping), especially as the malloced size is five times the string size. I
> think that low level functions like strxfrm should not carry such risks.

Fair enough.  In that case just mention in a comment that the size
you've chosen is arbitrary and only ensures that it is small enough to
be contained on the stack.  If someone wants to prove that the size is
optimal (or another size is optimal) then they can do that on top of
your change.

Siddhesh
  

Patch

diff --git a/string/strxfrm_l.c b/string/strxfrm_l.c
index 2d3f1bd..cb45edc 100644
--- a/string/strxfrm_l.c
+++ b/string/strxfrm_l.c
@@ -29,7 +29,6 @@ 
 # define STRING_TYPE char
 # define USTRING_TYPE unsigned char
 # define STRXFRM __strxfrm_l
-# define STRCMP strcmp
 # define STRLEN strlen
 # define STPNCPY __stpncpy
 # define WEIGHT_H "../locale/weight.h"
@@ -40,9 +39,41 @@ 
 #define CONCAT(a,b) CONCAT1(a,b)
 #define CONCAT1(a,b) a##b
 
+/* The maximum number of chars for that indices are cached.  Size it so that
+   all strings not considered "large" fit in.  CACHE_SIZE * 4  has to be lower
+   than __MAX_ALLOCA_CUTOFF.  Keep localedata/xfrm-test.c in sync.  */
+#define CACHE_SIZE 4095
+
 #include "../locale/localeinfo.h"
 #include WEIGHT_H
 
+/* Single linked list for handling backward sequences.  */
+typedef struct idx_stack
+{
+  struct idx_stack* prev;
+  int32_t idx;
+  size_t len;
+} idx_stack_t;
+
+#define stack_push(_stack, _idx, _len)\
+{\
+  idx_stack_t *prev = _stack;\
+  _stack = alloca (sizeof (idx_stack_t));\
+  _stack->prev = prev;\
+  _stack->idx = _idx;\
+  _stack->len = _len;\
+}
+
+/* Group locale data for shorter parameter lists.  */
+typedef struct
+{
+  uint_fast32_t nrules;
+  unsigned char *rulesets;
+  USTRING_TYPE *weights;
+  int32_t *table;
+  USTRING_TYPE *extra;
+  int32_t *indirect;
+} locale_data_t;
 
 #ifndef WIDE_CHAR_VERSION
 
@@ -81,117 +112,270 @@  utf8_encode (char *buf, int val)
 }
 #endif
 
+/* Find next weight and rule index.  Inlined since called for every char.  */
+static __always_inline size_t
+find_idx (const USTRING_TYPE **us, int32_t *weight_idx,
+	  unsigned char *rule_idx, const locale_data_t *l_data, const int pass)
+{
+  int32_t tmp = findidx (l_data->table, l_data->indirect, l_data->extra, us,
+			 -1);
+  *rule_idx = tmp >> 24;
+  int32_t idx = tmp & 0xffffff;
+  size_t len = l_data->weights[idx++];
+
+  /* Skip over indices of previous levels.  */
+  for (int i = 0; i < pass; i++)
+    {
+      idx += len;
+      len = l_data->weights[idx++];
+    }
 
-size_t
-STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
+  *weight_idx = idx;
+  return len;
+}
+
+static int
+find_position (const USTRING_TYPE *us, const locale_data_t *l_data,
+	       const int pass)
 {
-  struct __locale_data *current = l->__locales[LC_COLLATE];
-  uint_fast32_t nrules = current->values[_NL_ITEM_INDEX (_NL_COLLATE_NRULES)].word;
-  /* We don't assign the following values right away since it might be
-     unnecessary in case there are no rules.  */
-  const unsigned char *rulesets;
-  const int32_t *table;
-  const USTRING_TYPE *weights;
-  const USTRING_TYPE *extra;
-  const int32_t *indirect;
+  int32_t weight_idx;
+  unsigned char rule_idx;
+  const USTRING_TYPE *usrc = us;
+
+  find_idx (&usrc, &weight_idx, &rule_idx, l_data, pass);
+  return l_data->rulesets[rule_idx * l_data->nrules + pass] & sort_position;
+}
+
+/* Do the transformation.  */
+static size_t
+do_xfrm (const USTRING_TYPE *usrc, STRING_TYPE *dest, size_t n,
+	 const locale_data_t *l_data)
+{
+  int32_t weight_idx;
+  unsigned char rule_idx;
+  size_t needed = 0;
   uint_fast32_t pass;
-  size_t needed;
   size_t last_needed;
-  const USTRING_TYPE *usrc;
-  size_t srclen = STRLEN (src);
-  int32_t *idxarr;
-  unsigned char *rulearr;
-  size_t idxmax;
-  size_t idxcnt;
-  int use_malloc;
 
-  if (nrules == 0)
+  /* Now the passes over the weights.  */
+  for (pass = 0; pass < l_data->nrules; ++pass)
     {
-      if (n != 0)
-	STPNCPY (dest, src, MIN (srclen + 1, n));
+      last_needed = needed;
+      idx_stack_t *backw = NULL;
+      const USTRING_TYPE *cur = usrc;
 
-      return srclen;
-    }
+       /* We assume that if a rule has defined `position' in one section
+         this is true for all of them.  */
+      int position = find_position (cur, l_data, pass);
 
-  rulesets = (const unsigned char *)
-    current->values[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS)].string;
-  table = (const int32_t *)
-    current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_TABLE,SUFFIX))].string;
-  weights = (const USTRING_TYPE *)
-    current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_WEIGHT,SUFFIX))].string;
-  extra = (const USTRING_TYPE *)
-    current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_EXTRA,SUFFIX))].string;
-  indirect = (const int32_t *)
-    current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_INDIRECT,SUFFIX))].string;
-  use_malloc = 0;
+      if (position == 0)
+	{
+	  while (*cur != L('\0'))
+	    {
+	      size_t len = find_idx (&cur, &weight_idx, &rule_idx, l_data,
+				     pass);
+	      int rule = l_data->rulesets[rule_idx * l_data->nrules + pass];
 
-  assert (((uintptr_t) table) % __alignof__ (table[0]) == 0);
-  assert (((uintptr_t) weights) % __alignof__ (weights[0]) == 0);
-  assert (((uintptr_t) extra) % __alignof__ (extra[0]) == 0);
-  assert (((uintptr_t) indirect) % __alignof__ (indirect[0]) == 0);
+	      if ((rule & sort_forward) != 0)
+		{
+		  /* Handle the pushed backward sequence.  */
+		  while (backw != NULL)
+		    {
+		      if (needed + backw->len < n)
+			while (backw->len-- > 0)
+			  dest[needed++] = l_data->weights[backw->idx++];
+		      else
+			/* No more characters fit into the buffer.  */
+			needed += backw->len;
+
+		      backw = backw->prev;
+		    }
 
-  /* Handle an empty string as a special case.  */
-  if (srclen == 0)
-    {
-      if (n != 0)
-	*dest = L('\0');
-      return 0;
-    }
+		  /* Now handle the forward element.  */
+		  if (needed + len < n)
+		    while (len-- > 0)
+		      dest[needed++] = l_data->weights[weight_idx++];
+		  else
+		    /* No more characters fit into the buffer.  */
+		    needed += len;
+		}
+	      else
+		/* Remember weight index and length of backward sequence.  */
+		stack_push (backw, weight_idx, len);
+	    }
 
-  /* We need the elements of the string as unsigned values since they
-     are used as indeces.  */
-  usrc = (const USTRING_TYPE *) src;
-
-  /* Perform the first pass over the string and while doing this find
-     and store the weights for each character.  Since we want this to
-     be as fast as possible we are using `alloca' to store the temporary
-     values.  But since there is no limit on the length of the string
-     we have to use `malloc' if the string is too long.  We should be
-     very conservative here.  */
-  if (! __libc_use_alloca ((srclen + 1) * (sizeof (int32_t) + 1)))
-    {
-      idxarr = (int32_t *) malloc ((srclen + 1) * (sizeof (int32_t) + 1));
-      rulearr = (unsigned char *) &idxarr[srclen];
-
-      if (idxarr == NULL)
-	/* No memory.  Well, go with the stack then.
-
-	   XXX Once this implementation is stable we will handle this
-	   differently.  Instead of precomputing the indeces we will
-	   do this in time.  This means, though, that this happens for
-	   every pass again.  */
-	goto try_stack;
-      use_malloc = 1;
-    }
-  else
-    {
-    try_stack:
-      idxarr = (int32_t *) alloca (srclen * sizeof (int32_t));
-      rulearr = (unsigned char *) alloca (srclen + 1);
+
+	  /* Handle the pushed backward sequence.  */
+	  while (backw != NULL)
+	    {
+	      if (needed + backw->len < n)
+		while (backw->len-- > 0)
+		  dest[needed++] = l_data->weights[backw->idx++];
+	      else
+		/* No more characters fit into the buffer.  */
+		needed += backw->len;
+
+	      backw = backw->prev;
+	    }
+	}
+      else
+	{
+	  int val = 1;
+#ifndef WIDE_CHAR_VERSION
+	  char buf[7];
+	  size_t buflen;
+#endif
+	  size_t i;
+
+	  while (*cur != L('\0'))
+	    {
+	      size_t len = find_idx (&cur, &weight_idx, &rule_idx, l_data,
+				     pass);
+	      int rule = l_data->rulesets[rule_idx * l_data->nrules + pass];
+
+	      if ((rule & sort_forward) != 0)
+		{
+		  /* Handle the pushed backward sequence.  */
+		  while (backw != NULL)
+		    {
+		      if (backw->len != 0)
+			{
+#ifdef WIDE_CHAR_VERSION
+			  if (needed + 1 + backw->len < n)
+			    {
+			      dest[needed] = val;
+			      for (i = 0; i < backw->len; ++i)
+				dest[needed + 1 + i] =
+				  l_data->weights[backw->idx + i];
+			    }
+			  needed += 1 + backw->len;
+#else
+			  buflen = utf8_encode (buf, val);
+			  if (needed + buflen + backw->len < n)
+			    {
+			      for (i = 0; i < buflen; ++i)
+				dest[needed + i] = buf[i];
+			      for (i = 0; i < backw->len; ++i)
+				dest[needed + buflen + i] =
+				  l_data->weights[backw->idx + i];
+			    }
+			  needed += buflen + backw->len;
+#endif
+			  val = 1;
+			}
+		      else
+			++val;
+
+		      backw = backw->prev;
+		    }
+
+		  /* Now handle the forward element.  */
+		  if (len != 0)
+		    {
+#ifdef WIDE_CHAR_VERSION
+		      if (needed + 1 + len < n)
+			{
+			  dest[needed] = val;
+			  for (i = 0; i < len; ++i)
+			    dest[needed + 1 + i] =
+			      l_data->weights[weight_idx + i];
+			}
+		      needed += 1 + len;
+#else
+		      buflen = utf8_encode (buf, val);
+		      if (needed + buflen + len < n)
+			{
+			  for (i = 0; i < buflen; ++i)
+			    dest[needed + i] = buf[i];
+			  for (i = 0; i < len; ++i)
+			    dest[needed + buflen + i] =
+			      l_data->weights[weight_idx + i];
+			}
+		      needed += buflen + len;
+#endif
+		      val = 1;
+		    }
+		  else
+		    ++val;
+		}
+	      else
+		/* Remember weight index and length of backward sequence.  */
+		stack_push (backw, weight_idx, len);
+	    }
+
+	  /* Handle the pushed backward sequence.  */
+	  while (backw != NULL)
+	    {
+	      if (backw->len != 0)
+		{
+#ifdef WIDE_CHAR_VERSION
+		  if (needed + 1 + backw->len < n)
+		    {
+		      dest[needed] = val;
+		      for (i = 0; i < backw->len; ++i)
+			dest[needed + 1 + i] =
+			  l_data->weights[backw->idx + i];
+		    }
+		  needed += 1 + backw->len;
+#else
+		  buflen = utf8_encode (buf, val);
+		  if (needed + buflen + backw->len < n)
+		    {
+		      for (i = 0; i < buflen; ++i)
+			dest[needed + i] = buf[i];
+		      for (i = 0; i < backw->len; ++i)
+			dest[needed + buflen + i] =
+			  l_data->weights[backw->idx + i];
+		    }
+		  needed += buflen + backw->len;
+#endif
+		  val = 1;
+		}
+	      else
+		++val;
+
+	      backw = backw->prev;
+	    }
+	}
+
+      /* Finally store the byte to separate the passes or terminate
+	 the string.  */
+      if (needed < n)
+	dest[needed] = pass + 1 < l_data->nrules ? L('\1') : L('\0');
+      ++needed;
     }
 
-  idxmax = 0;
-  do
+  /* This is a little optimization: many collation specifications have
+     a `position' rule at the end and if no non-ignored character
+     is found the last \1 byte is immediately followed by a \0 byte
+     signalling this.  We can avoid the \1 byte(s).  */
+  if (needed > 2 && needed == last_needed + 1)
     {
-      int32_t tmp = findidx (table, indirect, extra, &usrc, -1);
-      rulearr[idxmax] = tmp >> 24;
-      idxarr[idxmax] = tmp & 0xffffff;
-
-      ++idxmax;
+      /* Remove the \1 byte.  */
+      if (--needed <= n)
+	dest[needed - 1] = L('\0');
     }
-  while (*usrc != L('\0'));
 
-  /* This element is only read, the value never used but to determine
-     another value which then is ignored.  */
-  rulearr[idxmax] = '\0';
+  /* Return the number of bytes/words we need, but don't count the NUL
+     byte/word at the end.  */
+  return needed - 1;
+}
 
-  /* Now the passes over the weights.  We now use the indeces we found
-     before.  */
-  needed = 0;
-  for (pass = 0; pass < nrules; ++pass)
+/* Do the transformation using weight-index and rule cache.  */
+static size_t
+do_xfrm_cached (STRING_TYPE *dest, size_t n, const locale_data_t *l_data,
+		size_t idxmax, int32_t *idxarr, const unsigned char *rulearr)
+{
+  size_t needed = 0;
+  uint_fast32_t pass;
+  size_t last_needed;
+  size_t idxcnt;
+
+  /* Now the passes over the weights.  */
+  for (pass = 0; pass < l_data->nrules; ++pass)
     {
       size_t backw_stop = ~0ul;
-      int rule = rulesets[rulearr[0] * nrules + pass];
+      int rule = l_data->rulesets[rulearr[0] * l_data->nrules + pass];
       /* We assume that if a rule has defined `position' in one section
 	 this is true for all of them.  */
       int position = rule & sort_position;
@@ -213,11 +397,12 @@  STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
 		      for (backw = idxcnt; backw > backw_stop; )
 			{
 			  --backw;
-			  len = weights[idxarr[backw]++];
+			  len = l_data->weights[idxarr[backw]++];
 
 			  if (needed + len < n)
 			    while (len-- > 0)
-			      dest[needed++] = weights[idxarr[backw]++];
+			      dest[needed++] = l_data->weights[
+						 idxarr[backw]++];
 			  else
 			    {
 				/* No more characters fit into the buffer.  */
@@ -230,10 +415,10 @@  STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
 		    }
 
 		  /* Now handle the forward element.  */
-		  len = weights[idxarr[idxcnt]++];
+		  len = l_data->weights[idxarr[idxcnt]++];
 		  if (needed + len < n)
 		    while (len-- > 0)
-		      dest[needed++] = weights[idxarr[idxcnt]++];
+		      dest[needed++] = l_data->weights[idxarr[idxcnt]++];
 		  else
 		    {
 		      /* No more characters fit into the buffer.  */
@@ -248,7 +433,8 @@  STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
 		    backw_stop = idxcnt;
 		}
 
-	      rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
+	      rule = l_data->rulesets[rulearr[idxcnt + 1] * l_data->nrules
+				      + pass];
 	    }
 
 
@@ -260,11 +446,11 @@  STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
 	      backw = idxcnt;
 	      while (backw > backw_stop)
 		{
-		  size_t len = weights[idxarr[--backw]++];
+		  size_t len = l_data->weights[idxarr[--backw]++];
 
 		  if (needed + len < n)
 		    while (len-- > 0)
-		      dest[needed++] = weights[idxarr[backw]++];
+		      dest[needed++] = l_data->weights[idxarr[backw]++];
 		  else
 		    {
 		      /* No more characters fit into the buffer.  */
@@ -297,7 +483,7 @@  STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
 		      for (backw = idxcnt; backw > backw_stop; )
 			{
 			  --backw;
-			  len = weights[idxarr[backw]++];
+			  len = l_data->weights[idxarr[backw]++];
 			  if (len != 0)
 			    {
 #ifdef WIDE_CHAR_VERSION
@@ -306,7 +492,7 @@  STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
 				  dest[needed] = val;
 				  for (i = 0; i < len; ++i)
 				    dest[needed + 1 + i] =
-				      weights[idxarr[backw] + i];
+				      l_data->weights[idxarr[backw] + i];
 				}
 			      needed += 1 + len;
 #else
@@ -317,7 +503,7 @@  STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
 				    dest[needed + i] = buf[i];
 				  for (i = 0; i < len; ++i)
 				    dest[needed + buflen + i] =
-				      weights[idxarr[backw] + i];
+				      l_data->weights[idxarr[backw] + i];
 				}
 			      needed += buflen + len;
 #endif
@@ -332,7 +518,7 @@  STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
 		    }
 
 		  /* Now handle the forward element.  */
-		  len = weights[idxarr[idxcnt]++];
+		  len = l_data->weights[idxarr[idxcnt]++];
 		  if (len != 0)
 		    {
 #ifdef WIDE_CHAR_VERSION
@@ -341,7 +527,7 @@  STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
 			  dest[needed] = val;
 			  for (i = 0; i < len; ++i)
 			    dest[needed + 1 + i] =
-			      weights[idxarr[idxcnt] + i];
+			      l_data->weights[idxarr[idxcnt] + i];
 			}
 		      needed += 1 + len;
 #else
@@ -352,7 +538,7 @@  STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
 			    dest[needed + i] = buf[i];
 			  for (i = 0; i < len; ++i)
 			    dest[needed + buflen + i] =
-			      weights[idxarr[idxcnt] + i];
+			      l_data->weights[idxarr[idxcnt] + i];
 			}
 		      needed += buflen + len;
 #endif
@@ -371,7 +557,8 @@  STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
 		    backw_stop = idxcnt;
 		}
 
-	      rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
+	      rule = l_data->rulesets[rulearr[idxcnt + 1] * l_data->nrules
+				      + pass];
 	    }
 
 	  if (backw_stop != ~0ul)
@@ -382,7 +569,7 @@  STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
 	      backw = idxmax - 1;
 	      while (backw > backw_stop)
 		{
-		  size_t len = weights[idxarr[--backw]++];
+		  size_t len = l_data->weights[idxarr[--backw]++];
 		  if (len != 0)
 		    {
 #ifdef WIDE_CHAR_VERSION
@@ -391,7 +578,7 @@  STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
 			  dest[needed] = val;
 			  for (i = 0; i < len; ++i)
 			    dest[needed + 1 + i] =
-			      weights[idxarr[backw] + i];
+			      l_data->weights[idxarr[backw] + i];
 			}
 		      needed += 1 + len;
 #else
@@ -402,7 +589,7 @@  STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
 			    dest[needed + i] = buf[i];
 			  for (i = 0; i < len; ++i)
 			    dest[needed + buflen + i] =
-			      weights[idxarr[backw] + i];
+			      l_data->weights[idxarr[backw] + i];
 			}
 		      needed += buflen + len;
 #endif
@@ -418,7 +605,7 @@  STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
       /* Finally store the byte to separate the passes or terminate
 	 the string.  */
       if (needed < n)
-	dest[needed] = pass + 1 < nrules ? L('\1') : L('\0');
+	dest[needed] = pass + 1 < l_data->nrules ? L('\1') : L('\0');
       ++needed;
     }
 
@@ -433,14 +620,87 @@  STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
 	dest[needed - 1] = L('\0');
     }
 
-  /* Free the memory if needed.  */
-  if (use_malloc)
-    free (idxarr);
-
   /* Return the number of bytes/words we need, but don't count the NUL
      byte/word at the end.  */
   return needed - 1;
 }
+
+size_t
+STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
+{
+  locale_data_t l_data;
+  struct __locale_data *current = l->__locales[LC_COLLATE];
+  l_data.nrules = current->values[_NL_ITEM_INDEX (_NL_COLLATE_NRULES)].word;
+
+  /* Handle byte comparison case.  */
+  if (l_data.nrules == 0)
+    {
+      size_t srclen = STRLEN (src);
+
+      if (n != 0)
+	STPNCPY (dest, src, MIN (srclen + 1, n));
+
+      return srclen;
+    }
+
+  /* Handle an empty string, code hereafter relies on strlen (src) > 0.  */
+  if (*src == L('\0'))
+    {
+      if (n != 0)
+	*dest = L('\0');
+      return 0;
+    }
+
+  /* Get the locale data.  */
+  l_data.rulesets = (unsigned char *)
+    current->values[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS)].string;
+  l_data.table = (int32_t *)
+    current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_TABLE,SUFFIX))].string;
+  l_data.weights = (USTRING_TYPE *)
+    current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_WEIGHT,SUFFIX))].string;
+  l_data.extra = (USTRING_TYPE *)
+    current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_EXTRA,SUFFIX))].string;
+  l_data.indirect = (int32_t *)
+    current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_INDIRECT,SUFFIX))].string;
+
+  assert (((uintptr_t) l_data.table) % __alignof__ (l_data.table[0]) == 0);
+  assert (((uintptr_t) l_data.weights) % __alignof__ (l_data.weights[0]) == 0);
+  assert (((uintptr_t) l_data.extra) % __alignof__ (l_data.extra[0]) == 0);
+  assert (((uintptr_t) l_data.indirect) % __alignof__ (l_data.indirect[0]) == 0);
+
+  /* We need the elements of the string as unsigned values since they
+     are used as indeces.  */
+  const USTRING_TYPE *usrc = (const USTRING_TYPE *) src;
+
+  /* Allocate cache for strings up to 4 KByte on the stack and fill it with
+     weight and rule indices.  If the cache size is not sufficient, continue
+     with the uncached xfrm version.  */
+  size_t idxmax = 0;
+  const USTRING_TYPE *cur = usrc;
+  int32_t *idxarr = alloca (CACHE_SIZE * sizeof (int32_t));
+  unsigned char *rulearr = alloca (CACHE_SIZE + 1);
+
+  do
+    {
+      int32_t tmp = findidx (l_data.table, l_data.indirect, l_data.extra, &cur,
+			     -1);
+      rulearr[idxmax] = tmp >> 24;
+      idxarr[idxmax] = tmp & 0xffffff;
+
+      ++idxmax;
+    }
+  while (*cur != L('\0') && idxmax < CACHE_SIZE);
+
+  /* This element is only read, the value never used but to determine
+     another value which then is ignored.  */
+  rulearr[idxmax] = '\0';
+
+  /* Do the transformation.  */
+  if (*cur == L('\0'))
+    return do_xfrm_cached (dest, n, &l_data, idxmax, idxarr, rulearr);
+  else
+    return do_xfrm (usrc, dest, n, &l_data);
+}
 libc_hidden_def (STRXFRM)
 
 #ifndef WIDE_CHAR_VERSION