[BZ,#16009] fix memory handling in strxfrm_l

Message ID 54A42278.1050500@web.de
State Superseded
Headers

Commit Message

Leonhard Holz Dec. 31, 2014, 4:21 p.m. UTC
  Here are the updated diffs. Tests look ok but do not complete because of 
an (hopefully unrelated) compile error in string/tester.c.

Am 31.12.2014 um 10:47 schrieb Siddhesh Poyarekar:
> On Wed, Dec 31, 2014 at 10:36:11AM +0100, Leonhard Holz wrote:
>> I want to object here. The needed cache size can be in a region where the
>> malloc will not fail but hurt the system performance considerable
>> (swapping), especially as the malloced size is five times the string size. I
>> think that low level functions like strxfrm should not carry such risks.
>
> Fair enough.  In that case just mention in a comment that the size
> you've chosen is arbitrary and only ensures that it is small enough to
> be contained on the stack.  If someone wants to prove that the size is
> optimal (or another size is optimal) then they can do that on top of
> your change.
>
> Siddhesh
>

diff --git a/string/strxfrm_l.c b/string/strxfrm_l.c
index 2d3f1bd..7ba6bac 100644
--- a/string/strxfrm_l.c
+++ b/string/strxfrm_l.c
@@ -40,9 +40,24 @@
 #define CONCAT(a,b) CONCAT1(a,b)
 #define CONCAT1(a,b) a##b
 
+/* Maximum string size that is calculated with cached indices.  Right now this
+   is an arbitrary value open to optimizations.  SMALL_STR_SIZE * 4 has to be
+   lower than __MAX_ALLOCA_CUTOFF.  Keep localedata/xfrm-test.c in sync.  */
+#define SMALL_STR_SIZE 4095
+
 #include "../locale/localeinfo.h"
 #include WEIGHT_H
 
+/* Group locale data for shorter parameter lists.  */
+typedef struct
+{
+  uint_fast32_t nrules;
+  unsigned char *rulesets;
+  USTRING_TYPE *weights;
+  int32_t *table;
+  USTRING_TYPE *extra;
+  int32_t *indirect;
+} locale_data_t;
 
 #ifndef WIDE_CHAR_VERSION
 
@@ -81,113 +96,325 @@ utf8_encode (char *buf, int val)
 }
 #endif
 
+/* Find next weight and rule index.  Inlined since called for every char.  */
+static __always_inline size_t
+find_idx (const USTRING_TYPE **us, int32_t *weight_idx,
+	  unsigned char *rule_idx, const locale_data_t *l_data, const int pass)
+{
+  int32_t tmp = findidx (l_data->table, l_data->indirect, l_data->extra, us,
+			 -1);
+  *rule_idx = tmp >> 24;
+  int32_t idx = tmp & 0xffffff;
+  size_t len = l_data->weights[idx++];
+
+  /* Skip over indices of previous levels.  */
+  for (int i = 0; i < pass; i++)
+    {
+      idx += len;
+      len = l_data->weights[idx++];
+    }
 
-size_t
-STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
+  *weight_idx = idx;
+  return len;
+}
+
+static int
+find_position (const USTRING_TYPE *us, const locale_data_t *l_data,
+	       const int pass)
 {
-  struct __locale_data *current = l->__locales[LC_COLLATE];
-  uint_fast32_t nrules = current->values[_NL_ITEM_INDEX (_NL_COLLATE_NRULES)].word;
-  /* We don't assign the following values right away since it might be
-     unnecessary in case there are no rules.  */
-  const unsigned char *rulesets;
-  const int32_t *table;
-  const USTRING_TYPE *weights;
-  const USTRING_TYPE *extra;
-  const int32_t *indirect;
+  int32_t weight_idx;
+  unsigned char rule_idx;
+  const USTRING_TYPE *usrc = us;
+
+  find_idx (&usrc, &weight_idx, &rule_idx, l_data, pass);
+  return l_data->rulesets[rule_idx * l_data->nrules + pass] & sort_position;
+}
+
+/* Do the transformation.  */
+static size_t
+do_xfrm (const USTRING_TYPE *usrc, STRING_TYPE *dest, size_t n,
+	 const locale_data_t *l_data)
+{
+  int32_t weight_idx;
+  unsigned char rule_idx;
   uint_fast32_t pass;
-  size_t needed;
+  size_t needed = 0;
   size_t last_needed;
-  const USTRING_TYPE *usrc;
-  size_t srclen = STRLEN (src);
-  int32_t *idxarr;
-  unsigned char *rulearr;
-  size_t idxmax;
-  size_t idxcnt;
-  int use_malloc;
 
-  if (nrules == 0)
+  /* Now the passes over the weights.  */
+  for (pass = 0; pass < l_data->nrules; ++pass)
     {
-      if (n != 0)
-	STPNCPY (dest, src, MIN (srclen + 1, n));
+      size_t backw_len = 0;
+      last_needed = needed;
+      const USTRING_TYPE *cur = usrc;
+      const USTRING_TYPE *backw_start = NULL;
 
-      return srclen;
-    }
+       /* We assume that if a rule has defined `position' in one section
+         this is true for all of them.  */
+      int position = find_position (cur, l_data, pass);
 
-  rulesets = (const unsigned char *)
-    current->values[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS)].string;
-  table = (const int32_t *)
-    current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_TABLE,SUFFIX))].string;
-  weights = (const USTRING_TYPE *)
-    current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_WEIGHT,SUFFIX))].string;
-  extra = (const USTRING_TYPE *)
-    current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_EXTRA,SUFFIX))].string;
-  indirect = (const int32_t *)
-    current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_INDIRECT,SUFFIX))].string;
-  use_malloc = 0;
+      if (position == 0)
+	{
+	  while (*cur != L('\0'))
+	    {
+	      const USTRING_TYPE *pos = cur;
+	      size_t len = find_idx (&cur, &weight_idx, &rule_idx, l_data,
+				     pass);
+	      int rule = l_data->rulesets[rule_idx * l_data->nrules + pass];
 
-  assert (((uintptr_t) table) % __alignof__ (table[0]) == 0);
-  assert (((uintptr_t) weights) % __alignof__ (weights[0]) == 0);
-  assert (((uintptr_t) extra) % __alignof__ (extra[0]) == 0);
-  assert (((uintptr_t) indirect) % __alignof__ (indirect[0]) == 0);
+	      if ((rule & sort_forward) != 0)
+		{
+		  /* Handle the pushed backward sequence.  */
+		  if (backw_start != NULL)
+		    { 
+		      for (size_t i = backw_len; i > 0; )
+			{
+			  int32_t weight_idx;
+			  unsigned char rule_idx;
+			  size_t len = find_idx (&backw_start, &weight_idx,
+						 &rule_idx, l_data, pass);
+			  if (needed + i < n)
+			    for (size_t j = len; j > 0; j--)
+			      dest[needed + i - j] =
+				l_data->weights[weight_idx++];
+
+			  i -= len;
+			}
 
-  /* Handle an empty string as a special case.  */
-  if (srclen == 0)
-    {
-      if (n != 0)
-	*dest = L('\0');
-      return 0;
-    }
+		      needed += backw_len;
+		      backw_start = NULL;
+		      backw_len = 0;
+		    }
 
-  /* We need the elements of the string as unsigned values since they
-     are used as indeces.  */
-  usrc = (const USTRING_TYPE *) src;
-
-  /* Perform the first pass over the string and while doing this find
-     and store the weights for each character.  Since we want this to
-     be as fast as possible we are using `alloca' to store the temporary
-     values.  But since there is no limit on the length of the string
-     we have to use `malloc' if the string is too long.  We should be
-     very conservative here.  */
-  if (! __libc_use_alloca ((srclen + 1) * (sizeof (int32_t) + 1)))
-    {
-      idxarr = (int32_t *) malloc ((srclen + 1) * (sizeof (int32_t) + 1));
-      rulearr = (unsigned char *) &idxarr[srclen];
-
-      if (idxarr == NULL)
-	/* No memory.  Well, go with the stack then.
-
-	   XXX Once this implementation is stable we will handle this
-	   differently.  Instead of precomputing the indeces we will
-	   do this in time.  This means, though, that this happens for
-	   every pass again.  */
-	goto try_stack;
-      use_malloc = 1;
-    }
-  else
-    {
-    try_stack:
-      idxarr = (int32_t *) alloca (srclen * sizeof (int32_t));
-      rulearr = (unsigned char *) alloca (srclen + 1);
+		  /* Now handle the forward element.  */
+		  if (needed + len < n)
+		    while (len-- > 0)
+		      dest[needed++] = l_data->weights[weight_idx++];
+		  else
+		    /* No more characters fit into the buffer.  */
+		    needed += len;
+		}
+	      else
+		{
+		  /* Remember start of the backward sequence & track length.  */
+		  if (backw_start == NULL)
+		    backw_start = pos;
+		  backw_len += len;
+		}
+	    }
+
+
+	  /* Handle the pushed backward sequence.  */
+	  if (backw_start != NULL)
+	    { 
+	      for (size_t i = backw_len; i > 0; )
+		{
+		  size_t len = find_idx (&backw_start, &weight_idx, &rule_idx,
+					 l_data, pass);
+		  if (needed + i < n)
+		    for (size_t j = len; j > 0; j--)
+		      dest[needed + i - j] =
+			l_data->weights[weight_idx++];
+
+		  i -= len;
+		}
+
+	      needed += backw_len;
+	    }
+	}
+      else
+	{
+	  int val = 1;
+#ifndef WIDE_CHAR_VERSION
+	  char buf[7];
+	  size_t buflen;
+#endif
+	  size_t i;
+
+	  while (*cur != L('\0'))
+	    {
+	      const USTRING_TYPE *pos = cur;
+	      size_t len = find_idx (&cur, &weight_idx, &rule_idx, l_data,
+				     pass);
+	      int rule = l_data->rulesets[rule_idx * l_data->nrules + pass];
+
+	      if ((rule & sort_forward) != 0)
+		{
+		  /* Handle the pushed backward sequence.  */
+		  if (backw_start != NULL)
+		    {
+		      for (size_t p = backw_len; p > 0; p--)
+			{
+			  size_t len;
+			  int32_t weight_idx;
+			  unsigned char rule_idx;
+			  const USTRING_TYPE *backw_cur = backw_start;
+
+			  /* To prevent a warning init the used vars.  */
+			  len = find_idx (&backw_cur, &weight_idx,
+					  &rule_idx, l_data, pass);
+
+			  for (i = 1; i < p; i++)
+			    len = find_idx (&backw_cur, &weight_idx,
+					    &rule_idx, l_data, pass);
+
+			  if (len != 0)
+			    {
+#ifdef WIDE_CHAR_VERSION
+			      if (needed + 1 + len < n)
+				{
+				  dest[needed] = val;
+				  for (i = 0; i < len; ++i)
+				    dest[needed + 1 + i] =
+				      l_data->weights[weight_idx + i];
+				}
+			      needed += 1 + len;
+#else
+			      buflen = utf8_encode (buf, val);
+			      if (needed + buflen + len < n)
+				{
+				  for (i = 0; i < buflen; ++i)
+				    dest[needed + i] = buf[i];
+				  for (i = 0; i < len; ++i)
+				    dest[needed + buflen + i] =
+				      l_data->weights[weight_idx + i];
+				}
+			      needed += buflen + len;
+#endif
+			      val = 1;
+			    }
+			  else
+			    ++val;
+			}
+
+		      backw_start = NULL;
+		      backw_len = 0;
+		    }
+
+		  /* Now handle the forward element.  */
+		  if (len != 0)
+		    {
+#ifdef WIDE_CHAR_VERSION
+		      if (needed + 1 + len < n)
+			{
+			  dest[needed] = val;
+			  for (i = 0; i < len; ++i)
+			    dest[needed + 1 + i] =
+			      l_data->weights[weight_idx + i];
+			}
+		      needed += 1 + len;
+#else
+		      buflen = utf8_encode (buf, val);
+		      if (needed + buflen + len < n)
+			{
+			  for (i = 0; i < buflen; ++i)
+			    dest[needed + i] = buf[i];
+			  for (i = 0; i < len; ++i)
+			    dest[needed + buflen + i] =
+			      l_data->weights[weight_idx + i];
+			}
+		      needed += buflen + len;
+#endif
+		      val = 1;
+		    }
+		  else
+		    ++val;
+		}
+	      else
+		{
+		  /* Remember start of the backward sequence & track length.  */
+		  if (backw_start == NULL)
+		    backw_start = pos;
+		  backw_len++;
+		}
+	    }
+
+	  /* Handle the pushed backward sequence.  */
+	  if (backw_start != NULL)
+	    {
+	      for (size_t p = backw_len; p > 0; p--)
+		{
+		  size_t len;
+		  int32_t weight_idx;
+		  unsigned char rule_idx;
+		  const USTRING_TYPE *backw_cur = backw_start;
+
+		  /* To prevent a warning init the used vars.  */
+		  len = find_idx (&backw_cur, &weight_idx,
+				  &rule_idx, l_data, pass);
+
+		  for (i = 1; i < p; i++)
+		    len = find_idx (&backw_cur, &weight_idx,
+				    &rule_idx, l_data, pass);
+
+		  if (len != 0)
+		    {
+#ifdef WIDE_CHAR_VERSION
+		      if (needed + 1 + len < n)
+			{
+			  dest[needed] = val;
+			  for (i = 0; i < len; ++i)
+			    dest[needed + 1 + i] =
+			      l_data->weights[weight_idx + i];
+			}
+		      needed += 1 + len;
+#else
+		      buflen = utf8_encode (buf, val);
+		      if (needed + buflen + len < n)
+			{
+			  for (i = 0; i < buflen; ++i)
+			    dest[needed + i] = buf[i];
+			  for (i = 0; i < len; ++i)
+			    dest[needed + buflen + i] =
+			      l_data->weights[weight_idx + i];
+			}
+		      needed += buflen + len;
+#endif
+		      val = 1;
+		    }
+		  else
+		    ++val;
+		}
+	    }
+	}
+
+      /* Finally store the byte to separate the passes or terminate
+	 the string.  */
+      if (needed < n)
+	dest[needed] = pass + 1 < l_data->nrules ? L('\1') : L('\0');
+      ++needed;
     }
 
-  idxmax = 0;
-  do
+  /* This is a little optimization: many collation specifications have
+     a `position' rule at the end and if no non-ignored character
+     is found the last \1 byte is immediately followed by a \0 byte
+     signalling this.  We can avoid the \1 byte(s).  */
+  if (needed > 2 && needed == last_needed + 1)
     {
-      int32_t tmp = findidx (table, indirect, extra, &usrc, -1);
-      rulearr[idxmax] = tmp >> 24;
-      idxarr[idxmax] = tmp & 0xffffff;
-
-      ++idxmax;
+      /* Remove the \1 byte.  */
+      if (--needed <= n)
+	dest[needed - 1] = L('\0');
     }
-  while (*usrc != L('\0'));
 
-  /* This element is only read, the value never used but to determine
-     another value which then is ignored.  */
-  rulearr[idxmax] = '\0';
+  /* Return the number of bytes/words we need, but don't count the NUL
+     byte/word at the end.  */
+  return needed - 1;
+}
+
+/* Do the transformation using weight-index and rule cache.  */
+static size_t
+do_xfrm_cached (STRING_TYPE *dest, size_t n, const locale_data_t *l_data,
+		size_t idxmax, int32_t *idxarr, const unsigned char *rulearr)
+{
+  uint_fast32_t nrules = l_data->nrules;
+  unsigned char *rulesets = l_data->rulesets;
+  USTRING_TYPE *weights = l_data->weights;
+  uint_fast32_t pass;
+  size_t needed = 0;
+  size_t last_needed;
+  size_t idxcnt;
 
-  /* Now the passes over the weights.  We now use the indeces we found
-     before.  */
-  needed = 0;
+  /* Now the passes over the weights.  */
   for (pass = 0; pass < nrules; ++pass)
     {
       size_t backw_stop = ~0ul;
@@ -433,14 +660,87 @@ STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
 	dest[needed - 1] = L('\0');
     }
 
-  /* Free the memory if needed.  */
-  if (use_malloc)
-    free (idxarr);
-
   /* Return the number of bytes/words we need, but don't count the NUL
      byte/word at the end.  */
   return needed - 1;
 }
+
+size_t
+STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
+{
+  locale_data_t l_data;
+  struct __locale_data *current = l->__locales[LC_COLLATE];
+  l_data.nrules = current->values[_NL_ITEM_INDEX (_NL_COLLATE_NRULES)].word;
+
+  /* Handle byte comparison case.  */
+  if (l_data.nrules == 0)
+    {
+      size_t srclen = STRLEN (src);
+
+      if (n != 0)
+	STPNCPY (dest, src, MIN (srclen + 1, n));
+
+      return srclen;
+    }
+
+  /* Handle an empty string, code hereafter relies on strlen (src) > 0.  */
+  if (*src == L('\0'))
+    {
+      if (n != 0)
+	*dest = L('\0');
+      return 0;
+    }
+
+  /* Get the locale data.  */
+  l_data.rulesets = (unsigned char *)
+    current->values[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS)].string;
+  l_data.table = (int32_t *)
+    current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_TABLE,SUFFIX))].string;
+  l_data.weights = (USTRING_TYPE *)
+    current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_WEIGHT,SUFFIX))].string;
+  l_data.extra = (USTRING_TYPE *)
+    current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_EXTRA,SUFFIX))].string;
+  l_data.indirect = (int32_t *)
+    current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_INDIRECT,SUFFIX))].string;
+
+  assert (((uintptr_t) l_data.table) % __alignof__ (l_data.table[0]) == 0);
+  assert (((uintptr_t) l_data.weights) % __alignof__ (l_data.weights[0]) == 0);
+  assert (((uintptr_t) l_data.extra) % __alignof__ (l_data.extra[0]) == 0);
+  assert (((uintptr_t) l_data.indirect) % __alignof__ (l_data.indirect[0]) == 0);
+
+  /* We need the elements of the string as unsigned values since they
+     are used as indeces.  */
+  const USTRING_TYPE *usrc = (const USTRING_TYPE *) src;
+
+  /* Allocate cache for small strings on the stack and fill it with weight and
+     rule indices.  If the cache size is not sufficient, continue with the
+     uncached xfrm version.  */
+  size_t idxmax = 0;
+  const USTRING_TYPE *cur = usrc;
+  int32_t *idxarr = alloca (SMALL_STR_SIZE * sizeof (int32_t));
+  unsigned char *rulearr = alloca (SMALL_STR_SIZE + 1);
+
+  do
+    {
+      int32_t tmp = findidx (l_data.table, l_data.indirect, l_data.extra, &cur,
+			     -1);
+      rulearr[idxmax] = tmp >> 24;
+      idxarr[idxmax] = tmp & 0xffffff;
+
+      ++idxmax;
+    }
+  while (*cur != L('\0') && idxmax < SMALL_STR_SIZE);
+
+  /* This element is only read, the value never used but to determine
+     another value which then is ignored.  */
+  rulearr[idxmax] = '\0';
+
+  /* Do the transformation.  */
+  if (*cur == L('\0'))
+    return do_xfrm_cached (dest, n, &l_data, idxmax, idxarr, rulearr);
+  else
+    return do_xfrm (usrc, dest, n, &l_data);
+}
 libc_hidden_def (STRXFRM)
 
 #ifndef WIDE_CHAR_VERSION
diff --git a/localedata/xfrm-test.c b/localedata/xfrm-test.c
index d2aba7d..c40a7e6 100644
--- a/localedata/xfrm-test.c
+++ b/localedata/xfrm-test.c
@@ -24,6 +24,8 @@
 #include <stdlib.h>
 #include <string.h>
 
+/* Keep in sync with string/strxfrm_l.c.  */
+#define SMALL_STR_SIZE 4095
 
 struct lines
 {
@@ -36,7 +38,7 @@ static int xstrcmp (const void *, const void *);
 int
 main (int argc, char *argv[])
 {
-  int result = 0;
+  int result = 0, nocache = 0;
   size_t nstrings, nstrings_max;
   struct lines *strings;
   char *line = NULL;
@@ -44,7 +46,18 @@ main (int argc, char *argv[])
   size_t n;
 
   if (argc < 2)
-    error (1, 0, "usage: %s <random seed>", argv[0]);
+    error (1, 0, "usage: %s <random seed> [-nocache]", argv[0]);
+
+  if (argc == 3)
+    {
+      if (strcmp (argv[2], "-nocache") == 0)
+	nocache = 1;
+      else
+	{
+	  printf ("Unknown option %s!\n", argv[2]);
+	  exit (1);
+	}
+    }
 
   setlocale (LC_ALL, "");
 
@@ -59,9 +72,9 @@ main (int argc, char *argv[])
 
   while (1)
     {
-      char saved, *newp;
-      int needed;
-      int l;
+      char saved, *word, *newp;
+      size_t l, line_len, needed;
+
       if (getline (&line, &len, stdin) < 0)
 	break;
 
@@ -83,10 +96,35 @@ main (int argc, char *argv[])
 
       saved = line[l];
       line[l] = '\0';
-      needed = strxfrm (NULL, line, 0);
+
+      if (nocache)
+	{
+	  line_len = strlen (line);
+	  word = malloc (line_len + SMALL_STR_SIZE + 1);
+	  if (word == NULL)
+	    {
+	      perror (argv[0]);
+	      exit (1);
+	    }
+	  memset (word, ' ', SMALL_STR_SIZE);
+	  memcpy (word + SMALL_STR_SIZE, line, line_len);
+	  word[line_len + SMALL_STR_SIZE] = '\0';
+	}
+      else
+        word = line;
+
+      needed = strxfrm (NULL, word, 0);
       newp = malloc (needed + 1);
-      strxfrm (newp, line, needed + 1);
+      if (newp == NULL)
+	{
+	  perror (argv[0]);
+	  exit (1);
+	}
+      strxfrm (newp, word, needed + 1);
       strings[nstrings].xfrm = newp;
+
+      if (nocache)
+	free (word);
       line[l] = saved;
       ++nstrings;
     }
  

Comments

Leonhard Holz Jan. 8, 2015, 7:15 a.m. UTC | #1
Any chance to make it into the upcoming release?

https://sourceware.org/ml/libc-alpha/2014-12/msg01001.html
  
Siddhesh Poyarekar Jan. 8, 2015, 8:45 a.m. UTC | #2
On Thu, Jan 08, 2015 at 08:15:54AM +0100, Leonhard Holz wrote:
> Any chance to make it into the upcoming release?
> 
> https://sourceware.org/ml/libc-alpha/2014-12/msg01001.html

I hope so.  I'll try to get to it tomorrow.

Siddhesh
  
Siddhesh Poyarekar Jan. 9, 2015, 9:31 a.m. UTC | #3
On Wed, Dec 31, 2014 at 05:21:12PM +0100, Leonhard Holz wrote:
> Here are the updated diffs. Tests look ok but do not complete because of an
> (hopefully unrelated) compile error in string/tester.c.

Mostly minor nits below.  Please fix them up and repost with a
ChangeLog and I'll commit it.

The first nit is: please avoid putting diffs of different individual
files as attachments and more importantly, please avoid 'attaching'
the diffs and inline them instead.  It makes it difficult to respond
with an inline review of the patch.  The easiest way to send the patch
inline is to use git-send-email: all you have to do is form your email
(along with the ChangeLog) as the git commit comment and then do
format-patch + send-email.

And now we move on to the actual patch :)

Thanks,
Siddhesh

> diff --git a/localedata/sort-test.sh b/localedata/sort-test.sh
> index e37129a..c464b05 100644
> --- a/localedata/sort-test.sh
> +++ b/localedata/sort-test.sh
> @@ -53,11 +53,18 @@ for l in $lang; do
>     ${common_objpfx}localedata/xfrm-test $id < $cns.in \
>     > ${common_objpfx}localedata/$cns.xout || here=1
>    cmp -s $cns.in ${common_objpfx}localedata/$cns.xout || here=1
> +  ${test_program_prefix_before_env} \
> +   ${run_program_env} \
> +   LC_ALL=$l ${test_program_prefix_after_env} \
> +   ${common_objpfx}localedata/xfrm-test $id -nocache < $cns.in \
> +   > ${common_objpfx}localedata/$cns.xoutl || here=1
> +  cmp -s $cns.in ${common_objpfx}localedata/$cns.xoutl || here=1

Call this $cns.nocache.xout.

>    if test $here -eq 0; then
>      echo "$l xfrm-test OK"
>    else
>      echo "$l xfrm-test FAIL"
>      diff -u $cns.in ${common_objpfx}localedata/$cns.xout | sed 's/^/  /'
> +    diff -u $cns.in ${common_objpfx}localedata/$cns.xoutl | sed 's/^/  /'
>      status=1
>    fi
>  done

> diff --git a/string/strxfrm_l.c b/string/strxfrm_l.c
> index 2d3f1bd..7ba6bac 100644
> --- a/string/strxfrm_l.c
> +++ b/string/strxfrm_l.c
> @@ -40,9 +40,24 @@
>  #define CONCAT(a,b) CONCAT1(a,b)
>  #define CONCAT1(a,b) a##b
>  
> +/* Maximum string size that is calculated with cached indices.  Right now this
> +   is an arbitrary value open to optimizations.  SMALL_STR_SIZE * 4 has to be
> +   lower than __MAX_ALLOCA_CUTOFF.  Keep localedata/xfrm-test.c in sync.  */
> +#define SMALL_STR_SIZE 4095
> +
>  #include "../locale/localeinfo.h"
>  #include WEIGHT_H
>  
> +/* Group locale data for shorter parameter lists.  */
> +typedef struct
> +{
> +  uint_fast32_t nrules;
> +  unsigned char *rulesets;
> +  USTRING_TYPE *weights;
> +  int32_t *table;
> +  USTRING_TYPE *extra;
> +  int32_t *indirect;
> +} locale_data_t;
>  
>  #ifndef WIDE_CHAR_VERSION
>  
> @@ -81,113 +96,325 @@ utf8_encode (char *buf, int val)
>  }
>  #endif
>  
> +/* Find next weight and rule index.  Inlined since called for every char.  */
> +static __always_inline size_t
> +find_idx (const USTRING_TYPE **us, int32_t *weight_idx,
> +	  unsigned char *rule_idx, const locale_data_t *l_data, const int pass)
> +{
> +  int32_t tmp = findidx (l_data->table, l_data->indirect, l_data->extra, us,
> +			 -1);
> +  *rule_idx = tmp >> 24;
> +  int32_t idx = tmp & 0xffffff;
> +  size_t len = l_data->weights[idx++];
> +
> +  /* Skip over indices of previous levels.  */
> +  for (int i = 0; i < pass; i++)
> +    {
> +      idx += len;
> +      len = l_data->weights[idx++];
> +    }
>  
> -size_t
> -STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
> +  *weight_idx = idx;
> +  return len;
> +}
> +
> +static int
> +find_position (const USTRING_TYPE *us, const locale_data_t *l_data,
> +	       const int pass)
>  {
> -  struct __locale_data *current = l->__locales[LC_COLLATE];
> -  uint_fast32_t nrules = current->values[_NL_ITEM_INDEX (_NL_COLLATE_NRULES)].word;
> -  /* We don't assign the following values right away since it might be
> -     unnecessary in case there are no rules.  */
> -  const unsigned char *rulesets;
> -  const int32_t *table;
> -  const USTRING_TYPE *weights;
> -  const USTRING_TYPE *extra;
> -  const int32_t *indirect;
> +  int32_t weight_idx;
> +  unsigned char rule_idx;
> +  const USTRING_TYPE *usrc = us;
> +
> +  find_idx (&usrc, &weight_idx, &rule_idx, l_data, pass);
> +  return l_data->rulesets[rule_idx * l_data->nrules + pass] & sort_position;
> +}
> +
> +/* Do the transformation.  */
> +static size_t
> +do_xfrm (const USTRING_TYPE *usrc, STRING_TYPE *dest, size_t n,
> +	 const locale_data_t *l_data)
> +{
> +  int32_t weight_idx;
> +  unsigned char rule_idx;
>    uint_fast32_t pass;
> -  size_t needed;
> +  size_t needed = 0;
>    size_t last_needed;
> -  const USTRING_TYPE *usrc;
> -  size_t srclen = STRLEN (src);
> -  int32_t *idxarr;
> -  unsigned char *rulearr;
> -  size_t idxmax;
> -  size_t idxcnt;
> -  int use_malloc;
>  
> -  if (nrules == 0)
> +  /* Now the passes over the weights.  */
> +  for (pass = 0; pass < l_data->nrules; ++pass)
>      {
> -      if (n != 0)
> -	STPNCPY (dest, src, MIN (srclen + 1, n));
> +      size_t backw_len = 0;
> +      last_needed = needed;
> +      const USTRING_TYPE *cur = usrc;
> +      const USTRING_TYPE *backw_start = NULL;
>  
> -      return srclen;
> -    }
> +       /* We assume that if a rule has defined `position' in one section
> +         this is true for all of them.  */
> +      int position = find_position (cur, l_data, pass);
>  
> -  rulesets = (const unsigned char *)
> -    current->values[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS)].string;
> -  table = (const int32_t *)
> -    current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_TABLE,SUFFIX))].string;
> -  weights = (const USTRING_TYPE *)
> -    current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_WEIGHT,SUFFIX))].string;
> -  extra = (const USTRING_TYPE *)
> -    current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_EXTRA,SUFFIX))].string;
> -  indirect = (const int32_t *)
> -    current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_INDIRECT,SUFFIX))].string;
> -  use_malloc = 0;
> +      if (position == 0)
> +	{
> +	  while (*cur != L('\0'))
> +	    {
> +	      const USTRING_TYPE *pos = cur;
> +	      size_t len = find_idx (&cur, &weight_idx, &rule_idx, l_data,
> +				     pass);
> +	      int rule = l_data->rulesets[rule_idx * l_data->nrules + pass];
>  
> -  assert (((uintptr_t) table) % __alignof__ (table[0]) == 0);
> -  assert (((uintptr_t) weights) % __alignof__ (weights[0]) == 0);
> -  assert (((uintptr_t) extra) % __alignof__ (extra[0]) == 0);
> -  assert (((uintptr_t) indirect) % __alignof__ (indirect[0]) == 0);
> +	      if ((rule & sort_forward) != 0)
> +		{
> +		  /* Handle the pushed backward sequence.  */
> +		  if (backw_start != NULL)
> +		    { 

Remove the extra space at the end.

> +		      for (size_t i = backw_len; i > 0; )
> +			{
> +			  int32_t weight_idx;
> +			  unsigned char rule_idx;
> +			  size_t len = find_idx (&backw_start, &weight_idx,
> +						 &rule_idx, l_data, pass);
> +			  if (needed + i < n)
> +			    for (size_t j = len; j > 0; j--)
> +			      dest[needed + i - j] =
> +				l_data->weights[weight_idx++];
> +
> +			  i -= len;
> +			}
>  
> -  /* Handle an empty string as a special case.  */
> -  if (srclen == 0)
> -    {
> -      if (n != 0)
> -	*dest = L('\0');
> -      return 0;
> -    }
> +		      needed += backw_len;
> +		      backw_start = NULL;
> +		      backw_len = 0;
> +		    }
>  
> -  /* We need the elements of the string as unsigned values since they
> -     are used as indeces.  */
> -  usrc = (const USTRING_TYPE *) src;
> -
> -  /* Perform the first pass over the string and while doing this find
> -     and store the weights for each character.  Since we want this to
> -     be as fast as possible we are using `alloca' to store the temporary
> -     values.  But since there is no limit on the length of the string
> -     we have to use `malloc' if the string is too long.  We should be
> -     very conservative here.  */
> -  if (! __libc_use_alloca ((srclen + 1) * (sizeof (int32_t) + 1)))
> -    {
> -      idxarr = (int32_t *) malloc ((srclen + 1) * (sizeof (int32_t) + 1));
> -      rulearr = (unsigned char *) &idxarr[srclen];
> -
> -      if (idxarr == NULL)
> -	/* No memory.  Well, go with the stack then.
> -
> -	   XXX Once this implementation is stable we will handle this
> -	   differently.  Instead of precomputing the indeces we will
> -	   do this in time.  This means, though, that this happens for
> -	   every pass again.  */
> -	goto try_stack;
> -      use_malloc = 1;
> -    }
> -  else
> -    {
> -    try_stack:
> -      idxarr = (int32_t *) alloca (srclen * sizeof (int32_t));
> -      rulearr = (unsigned char *) alloca (srclen + 1);
> +		  /* Now handle the forward element.  */
> +		  if (needed + len < n)
> +		    while (len-- > 0)
> +		      dest[needed++] = l_data->weights[weight_idx++];
> +		  else
> +		    /* No more characters fit into the buffer.  */
> +		    needed += len;
> +		}
> +	      else
> +		{
> +		  /* Remember start of the backward sequence & track length.  */
> +		  if (backw_start == NULL)
> +		    backw_start = pos;
> +		  backw_len += len;
> +		}
> +	    }
> +
> +
> +	  /* Handle the pushed backward sequence.  */
> +	  if (backw_start != NULL)
> +	    { 

remove extra space at the end.

> +	      for (size_t i = backw_len; i > 0; )
> +		{
> +		  size_t len = find_idx (&backw_start, &weight_idx, &rule_idx,
> +					 l_data, pass);
> +		  if (needed + i < n)
> +		    for (size_t j = len; j > 0; j--)
> +		      dest[needed + i - j] =
> +			l_data->weights[weight_idx++];
> +
> +		  i -= len;
> +		}
> +
> +	      needed += backw_len;
> +	    }
> +	}
> +      else
> +	{
> +	  int val = 1;
> +#ifndef WIDE_CHAR_VERSION
> +	  char buf[7];
> +	  size_t buflen;
> +#endif
> +	  size_t i;
> +
> +	  while (*cur != L('\0'))
> +	    {
> +	      const USTRING_TYPE *pos = cur;
> +	      size_t len = find_idx (&cur, &weight_idx, &rule_idx, l_data,
> +				     pass);
> +	      int rule = l_data->rulesets[rule_idx * l_data->nrules + pass];
> +
> +	      if ((rule & sort_forward) != 0)
> +		{
> +		  /* Handle the pushed backward sequence.  */
> +		  if (backw_start != NULL)
> +		    {
> +		      for (size_t p = backw_len; p > 0; p--)
> +			{
> +			  size_t len;
> +			  int32_t weight_idx;
> +			  unsigned char rule_idx;
> +			  const USTRING_TYPE *backw_cur = backw_start;
> +
> +			  /* To prevent a warning init the used vars.  */
> +			  len = find_idx (&backw_cur, &weight_idx,
> +					  &rule_idx, l_data, pass);
> +
> +			  for (i = 1; i < p; i++)
> +			    len = find_idx (&backw_cur, &weight_idx,
> +					    &rule_idx, l_data, pass);
> +
> +			  if (len != 0)
> +			    {
> +#ifdef WIDE_CHAR_VERSION
> +			      if (needed + 1 + len < n)
> +				{
> +				  dest[needed] = val;
> +				  for (i = 0; i < len; ++i)
> +				    dest[needed + 1 + i] =
> +				      l_data->weights[weight_idx + i];
> +				}
> +			      needed += 1 + len;
> +#else
> +			      buflen = utf8_encode (buf, val);
> +			      if (needed + buflen + len < n)
> +				{
> +				  for (i = 0; i < buflen; ++i)
> +				    dest[needed + i] = buf[i];
> +				  for (i = 0; i < len; ++i)
> +				    dest[needed + buflen + i] =
> +				      l_data->weights[weight_idx + i];
> +				}
> +			      needed += buflen + len;
> +#endif
> +			      val = 1;
> +			    }
> +			  else
> +			    ++val;
> +			}
> +
> +		      backw_start = NULL;
> +		      backw_len = 0;
> +		    }
> +
> +		  /* Now handle the forward element.  */
> +		  if (len != 0)
> +		    {
> +#ifdef WIDE_CHAR_VERSION
> +		      if (needed + 1 + len < n)
> +			{
> +			  dest[needed] = val;
> +			  for (i = 0; i < len; ++i)
> +			    dest[needed + 1 + i] =
> +			      l_data->weights[weight_idx + i];
> +			}
> +		      needed += 1 + len;
> +#else
> +		      buflen = utf8_encode (buf, val);
> +		      if (needed + buflen + len < n)
> +			{
> +			  for (i = 0; i < buflen; ++i)
> +			    dest[needed + i] = buf[i];
> +			  for (i = 0; i < len; ++i)
> +			    dest[needed + buflen + i] =
> +			      l_data->weights[weight_idx + i];
> +			}
> +		      needed += buflen + len;
> +#endif
> +		      val = 1;
> +		    }
> +		  else
> +		    ++val;
> +		}
> +	      else
> +		{
> +		  /* Remember start of the backward sequence & track length.  */
> +		  if (backw_start == NULL)
> +		    backw_start = pos;
> +		  backw_len++;
> +		}
> +	    }
> +
> +	  /* Handle the pushed backward sequence.  */
> +	  if (backw_start != NULL)
> +	    {
> +	      for (size_t p = backw_len; p > 0; p--)
> +		{
> +		  size_t len;
> +		  int32_t weight_idx;
> +		  unsigned char rule_idx;
> +		  const USTRING_TYPE *backw_cur = backw_start;
> +
> +		  /* To prevent a warning init the used vars.  */
> +		  len = find_idx (&backw_cur, &weight_idx,
> +				  &rule_idx, l_data, pass);
> +
> +		  for (i = 1; i < p; i++)
> +		    len = find_idx (&backw_cur, &weight_idx,
> +				    &rule_idx, l_data, pass);
> +
> +		  if (len != 0)
> +		    {
> +#ifdef WIDE_CHAR_VERSION
> +		      if (needed + 1 + len < n)
> +			{
> +			  dest[needed] = val;
> +			  for (i = 0; i < len; ++i)
> +			    dest[needed + 1 + i] =
> +			      l_data->weights[weight_idx + i];
> +			}
> +		      needed += 1 + len;
> +#else
> +		      buflen = utf8_encode (buf, val);
> +		      if (needed + buflen + len < n)
> +			{
> +			  for (i = 0; i < buflen; ++i)
> +			    dest[needed + i] = buf[i];
> +			  for (i = 0; i < len; ++i)
> +			    dest[needed + buflen + i] =
> +			      l_data->weights[weight_idx + i];
> +			}
> +		      needed += buflen + len;
> +#endif
> +		      val = 1;
> +		    }
> +		  else
> +		    ++val;
> +		}
> +	    }
> +	}
> +
> +      /* Finally store the byte to separate the passes or terminate
> +	 the string.  */
> +      if (needed < n)
> +	dest[needed] = pass + 1 < l_data->nrules ? L('\1') : L('\0');
> +      ++needed;
>      }
>  
> -  idxmax = 0;
> -  do
> +  /* This is a little optimization: many collation specifications have
> +     a `position' rule at the end and if no non-ignored character
> +     is found the last \1 byte is immediately followed by a \0 byte
> +     signalling this.  We can avoid the \1 byte(s).  */
> +  if (needed > 2 && needed == last_needed + 1)
>      {
> -      int32_t tmp = findidx (table, indirect, extra, &usrc, -1);
> -      rulearr[idxmax] = tmp >> 24;
> -      idxarr[idxmax] = tmp & 0xffffff;
> -
> -      ++idxmax;
> +      /* Remove the \1 byte.  */
> +      if (--needed <= n)
> +	dest[needed - 1] = L('\0');
>      }
> -  while (*usrc != L('\0'));
>  
> -  /* This element is only read, the value never used but to determine
> -     another value which then is ignored.  */
> -  rulearr[idxmax] = '\0';
> +  /* Return the number of bytes/words we need, but don't count the NUL
> +     byte/word at the end.  */
> +  return needed - 1;
> +}
> +
> +/* Do the transformation using weight-index and rule cache.  */
> +static size_t
> +do_xfrm_cached (STRING_TYPE *dest, size_t n, const locale_data_t *l_data,
> +		size_t idxmax, int32_t *idxarr, const unsigned char *rulearr)
> +{
> +  uint_fast32_t nrules = l_data->nrules;
> +  unsigned char *rulesets = l_data->rulesets;
> +  USTRING_TYPE *weights = l_data->weights;
> +  uint_fast32_t pass;
> +  size_t needed = 0;
> +  size_t last_needed;
> +  size_t idxcnt;
>  
> -  /* Now the passes over the weights.  We now use the indeces we found
> -     before.  */
> -  needed = 0;
> +  /* Now the passes over the weights.  */
>    for (pass = 0; pass < nrules; ++pass)
>      {
>        size_t backw_stop = ~0ul;
> @@ -433,14 +660,87 @@ STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
>  	dest[needed - 1] = L('\0');
>      }
>  
> -  /* Free the memory if needed.  */
> -  if (use_malloc)
> -    free (idxarr);
> -
>    /* Return the number of bytes/words we need, but don't count the NUL
>       byte/word at the end.  */
>    return needed - 1;
>  }
> +
> +size_t
> +STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
> +{
> +  locale_data_t l_data;
> +  struct __locale_data *current = l->__locales[LC_COLLATE];
> +  l_data.nrules = current->values[_NL_ITEM_INDEX (_NL_COLLATE_NRULES)].word;
> +
> +  /* Handle byte comparison case.  */
> +  if (l_data.nrules == 0)
> +    {
> +      size_t srclen = STRLEN (src);
> +
> +      if (n != 0)
> +	STPNCPY (dest, src, MIN (srclen + 1, n));
> +
> +      return srclen;
> +    }
> +
> +  /* Handle an empty string, code hereafter relies on strlen (src) > 0.  */
> +  if (*src == L('\0'))
> +    {
> +      if (n != 0)
> +	*dest = L('\0');
> +      return 0;
> +    }
> +
> +  /* Get the locale data.  */
> +  l_data.rulesets = (unsigned char *)
> +    current->values[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS)].string;
> +  l_data.table = (int32_t *)
> +    current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_TABLE,SUFFIX))].string;
> +  l_data.weights = (USTRING_TYPE *)
> +    current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_WEIGHT,SUFFIX))].string;
> +  l_data.extra = (USTRING_TYPE *)
> +    current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_EXTRA,SUFFIX))].string;
> +  l_data.indirect = (int32_t *)
> +    current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_INDIRECT,SUFFIX))].string;
> +
> +  assert (((uintptr_t) l_data.table) % __alignof__ (l_data.table[0]) == 0);
> +  assert (((uintptr_t) l_data.weights) % __alignof__ (l_data.weights[0]) == 0);
> +  assert (((uintptr_t) l_data.extra) % __alignof__ (l_data.extra[0]) == 0);
> +  assert (((uintptr_t) l_data.indirect) % __alignof__ (l_data.indirect[0]) == 0);
> +
> +  /* We need the elements of the string as unsigned values since they
> +     are used as indeces.  */
> +  const USTRING_TYPE *usrc = (const USTRING_TYPE *) src;
> +
> +  /* Allocate cache for small strings on the stack and fill it with weight and
> +     rule indices.  If the cache size is not sufficient, continue with the
> +     uncached xfrm version.  */
> +  size_t idxmax = 0;
> +  const USTRING_TYPE *cur = usrc;
> +  int32_t *idxarr = alloca (SMALL_STR_SIZE * sizeof (int32_t));
> +  unsigned char *rulearr = alloca (SMALL_STR_SIZE + 1);
> +
> +  do
> +    {
> +      int32_t tmp = findidx (l_data.table, l_data.indirect, l_data.extra, &cur,
> +			     -1);
> +      rulearr[idxmax] = tmp >> 24;
> +      idxarr[idxmax] = tmp & 0xffffff;
> +
> +      ++idxmax;
> +    }
> +  while (*cur != L('\0') && idxmax < SMALL_STR_SIZE);
> +
> +  /* This element is only read, the value never used but to determine
> +     another value which then is ignored.  */
> +  rulearr[idxmax] = '\0';
> +
> +  /* Do the transformation.  */
> +  if (*cur == L('\0'))
> +    return do_xfrm_cached (dest, n, &l_data, idxmax, idxarr, rulearr);
> +  else
> +    return do_xfrm (usrc, dest, n, &l_data);
> +}
>  libc_hidden_def (STRXFRM)
>  
>  #ifndef WIDE_CHAR_VERSION

> diff --git a/localedata/xfrm-test.c b/localedata/xfrm-test.c
> index d2aba7d..c40a7e6 100644
> --- a/localedata/xfrm-test.c
> +++ b/localedata/xfrm-test.c
> @@ -24,6 +24,8 @@
>  #include <stdlib.h>
>  #include <string.h>
>  
> +/* Keep in sync with string/strxfrm_l.c.  */
> +#define SMALL_STR_SIZE 4095
>  
>  struct lines
>  {
> @@ -36,7 +38,7 @@ static int xstrcmp (const void *, const void *);
>  int
>  main (int argc, char *argv[])
>  {
> -  int result = 0;
> +  int result = 0, nocache = 0;

Make nocache bool.

>    size_t nstrings, nstrings_max;
>    struct lines *strings;
>    char *line = NULL;
> @@ -44,7 +46,18 @@ main (int argc, char *argv[])
>    size_t n;
>  
>    if (argc < 2)
> -    error (1, 0, "usage: %s <random seed>", argv[0]);
> +    error (1, 0, "usage: %s <random seed> [-nocache]", argv[0]);
> +
> +  if (argc == 3)
> +    {
> +      if (strcmp (argv[2], "-nocache") == 0)
> +	nocache = 1;
> +      else
> +	{
> +	  printf ("Unknown option %s!\n", argv[2]);
> +	  exit (1);
> +	}
> +    }
>  
>    setlocale (LC_ALL, "");
>  
> @@ -59,9 +72,9 @@ main (int argc, char *argv[])
>  
>    while (1)
>      {
> -      char saved, *newp;
> -      int needed;
> -      int l;
> +      char saved, *word, *newp;
> +      size_t l, line_len, needed;
> +
>        if (getline (&line, &len, stdin) < 0)
>  	break;
>  
> @@ -83,10 +96,35 @@ main (int argc, char *argv[])
>  
>        saved = line[l];
>        line[l] = '\0';
> -      needed = strxfrm (NULL, line, 0);
> +
> +      if (nocache)
> +	{
> +	  line_len = strlen (line);
> +	  word = malloc (line_len + SMALL_STR_SIZE + 1);
> +	  if (word == NULL)
> +	    {
> +	      perror (argv[0]);

Don't use perror in the test; it prints to stderr, which is then not
captured in the test.out file.  Use %m instead.

> +	      exit (1);
> +	    }
> +	  memset (word, ' ', SMALL_STR_SIZE);
> +	  memcpy (word + SMALL_STR_SIZE, line, line_len);
> +	  word[line_len + SMALL_STR_SIZE] = '\0';
> +	}
> +      else
> +        word = line;
> +
> +      needed = strxfrm (NULL, word, 0);
>        newp = malloc (needed + 1);
> -      strxfrm (newp, line, needed + 1);
> +      if (newp == NULL)
> +	{
> +	  perror (argv[0]);

Likewise.

> +	  exit (1);
> +	}
> +      strxfrm (newp, word, needed + 1);
>        strings[nstrings].xfrm = newp;
> +
> +      if (nocache)
> +	free (word);
>        line[l] = saved;
>        ++nstrings;
>      }
  

Patch

diff --git a/localedata/sort-test.sh b/localedata/sort-test.sh
index e37129a..c464b05 100644
--- a/localedata/sort-test.sh
+++ b/localedata/sort-test.sh
@@ -53,11 +53,18 @@  for l in $lang; do
    ${common_objpfx}localedata/xfrm-test $id < $cns.in \
    > ${common_objpfx}localedata/$cns.xout || here=1
   cmp -s $cns.in ${common_objpfx}localedata/$cns.xout || here=1
+  ${test_program_prefix_before_env} \
+   ${run_program_env} \
+   LC_ALL=$l ${test_program_prefix_after_env} \
+   ${common_objpfx}localedata/xfrm-test $id -nocache < $cns.in \
+   > ${common_objpfx}localedata/$cns.xoutl || here=1
+  cmp -s $cns.in ${common_objpfx}localedata/$cns.xoutl || here=1
   if test $here -eq 0; then
     echo "$l xfrm-test OK"
   else
     echo "$l xfrm-test FAIL"
     diff -u $cns.in ${common_objpfx}localedata/$cns.xout | sed 's/^/  /'
+    diff -u $cns.in ${common_objpfx}localedata/$cns.xoutl | sed 's/^/  /'
     status=1
   fi
 done