stdlib: Simplify arc4random_uniform

Message ID 20220728124528.39169-1-adhemerval.zanella@linaro.org
State Superseded
Headers
Series stdlib: Simplify arc4random_uniform |

Checks

Context Check Description
dj/TryBot-apply_patch success Patch applied to master at the time it was sent
dj/TryBot-32bit success Build for i686

Commit Message

Adhemerval Zanella July 28, 2022, 12:45 p.m. UTC
  It uses the bitmask with rejection [1], which calculates a mask
being the lowest power of two bounding the request upper bound,
successively queries new random values, and rejects values
outside the requested range.

Performance-wise, there is no much gain in trying to converse
bits since arc4random is wrapper on getrandom syscall.  It should
be cheaper to just query a uint32_t value.  The algorithm also
avoids mudulo and divide operations, which might be costly
depending of the architecture.

[1] https://www.pcg-random.org/posts/bounded-rands.html
---
 stdlib/arc4random_uniform.c | 131 +++++++++---------------------------
 1 file changed, 32 insertions(+), 99 deletions(-)
  

Comments

Yann Droneaud July 28, 2022, 2:46 p.m. UTC | #1
Hi,

Le 28/07/2022 à 14:45, Adhemerval Zanella a écrit :
> It uses the bitmask with rejection [1], which calculates a mask
> being the lowest power of two bounding the request upper bound,
> successively queries new random values, and rejects values
> outside the requested range.
>
> Performance-wise, there is no much gain in trying to converse
> bits since arc4random is wrapper on getrandom syscall.  It should
> be cheaper to just query a uint32_t value.  The algorithm also
> avoids mudulo and divide operations, which might be costly
> depending of the architecture.
>
> [1] https://www.pcg-random.org/posts/bounded-rands.html
> ---
>   stdlib/arc4random_uniform.c | 131 +++++++++---------------------------
>   1 file changed, 32 insertions(+), 99 deletions(-)
>
> diff --git a/stdlib/arc4random_uniform.c b/stdlib/arc4random_uniform.c
> index 1326dfa593..425282cd15 100644
> --- a/stdlib/arc4random_uniform.c
> +++ b/stdlib/arc4random_uniform.c
> @@ -17,38 +17,19 @@
>      License along with the GNU C Library; if not, see
>      <https://www.gnu.org/licenses/>.  */
>   
> -#include <endian.h>
> -#include <libc-lock.h>
>   #include <stdlib.h>
>   #include <sys/param.h>
>   
> -/* Return the number of bytes which cover values up to the limit.  */
> -__attribute__ ((const))
> -static uint32_t
> -byte_count (uint32_t n)
> -{
> -  if (n < (1U << 8))
> -    return 1;
> -  else if (n < (1U << 16))
> -    return 2;
> -  else if (n < (1U << 24))
> -    return 3;
> -  else
> -    return 4;
> -}
> +/* Return a uniformly distributed random number less than N.  The algorithm
> +   calculates a mask being the lowest power of two bounding the upper bound
> +   N, successively queries new random values, and rejects values outside of
> +   the request range.
>   
> -/* Fill the lower bits of the result with randomness, according to the
> -   number of bytes requested.  */
> -static void
> -random_bytes (uint32_t *result, uint32_t byte_count)
> -{
> -  *result = 0;
> -  unsigned char *ptr = (unsigned char *) result;
> -  if (__BYTE_ORDER == __BIG_ENDIAN)
> -    ptr += 4 - byte_count;
> -  __arc4random_buf (ptr, byte_count);
> -}
> +   For reject values, it also tries if the remaining entropy could fit on
> +   the asked range after range adjustment.
>   
> +   The algorithm avoids modulo and divide operations, which might be costly
> +   depending on the architecture.  */
>   uint32_t
>   __arc4random_uniform (uint32_t n)
>   {
> @@ -57,83 +38,35 @@ __arc4random_uniform (uint32_t n)
>          only possible result for limit 1.  */
>       return 0;
>   
> -  /* The bits variable serves as a source for bits.  Prefetch the
> -     minimum number of bytes needed.  */
> -  uint32_t count = byte_count (n);
> -  uint32_t bits_length = count * CHAR_BIT;
> -  uint32_t bits;
> -  random_bytes (&bits, count);
> -
>     /* Powers of two are easy.  */
>     if (powerof2 (n))
> -    return bits & (n - 1);
> -
> -  /* The general case.  This algorithm follows Jérémie Lumbroso,
> -     Optimal Discrete Uniform Generation from Coin Flips, and
> -     Applications (2013), who credits Donald E. Knuth and Andrew
> -     C. Yao, The complexity of nonuniform random number generation
> -     (1976), for solving the general case.
> +    return __arc4random () & (n - 1);
>   
> -     The implementation below unrolls the initialization stage of the
> -     loop, where v is less than n.  */
> +  /* mask is the smallest power of 2 minus 1 number larger than n.  */
> +  int z = __builtin_clz (n);
> +  uint32_t mask = ~UINT32_C(0) >> z;
> +  int bits = CHAR_BIT * sizeof (uint32_t) - z;
>   
> -  /* Use 64-bit variables even though the intermediate results are
> -     never larger than 33 bits.  This ensures the code is easier to
> -     compile on 64-bit architectures.  */
> -  uint64_t v;
> -  uint64_t c;
> -
> -  /* Initialize v and c.  v is the smallest power of 2 which is larger
> -     than n.*/
> -  {
> -    uint32_t log2p1 = 32 - __builtin_clz (n);
> -    v = 1ULL << log2p1;
> -    c = bits & (v - 1);
> -    bits >>= log2p1;
> -    bits_length -= log2p1;
> -  }
> -
> -  /* At the start of the loop, c is uniformly distributed within the
> -     half-open interval [0, v), and v < 2n < 2**33.  */
> -  while (true)
> +  while (1)
>       {
> -      if (v >= n)
> -        {
> -          /* If the candidate is less than n, accept it.  */
> -          if (c < n)
> -            /* c is uniformly distributed on [0, n).  */
> -            return c;
> -          else
> -            {
> -              /* c is uniformly distributed on [n, v).  */
> -              v -= n;
> -              c -= n;
> -              /* The distribution was shifted, so c is uniformly
> -                 distributed on [0, v) again.  */
> -            }
> -        }
> -      /* v < n here.  */
> -
> -      /* Replenish the bit source if necessary.  */
> -      if (bits_length == 0)
> -        {
> -          /* Overwrite the least significant byte.  */
> -	  random_bytes (&bits, 1);
> -	  bits_length = CHAR_BIT;
> -        }
> -
> -      /* Double the range.  No overflow because v < n < 2**32.  */
> -      v *= 2;
> -      /* v < 2n here.  */
> -
> -      /* Extract a bit and append it to c.  c remains less than v and
> -         thus 2**33.  */
> -      c = (c << 1) | (bits & 1);
> -      bits >>= 1;
> -      --bits_length;
> -
> -      /* At this point, c is uniformly distributed on [0, v) again,
> -         and v < 2n < 2**33.  */
> +      uint32_t value = __arc4random ();
> +
> +      /* Return if the lower power of 2 minus 1 satisfy the condition.  */
> +      uint32_t r = value & mask;
> +      if (r < n)
> +	return r;
> +
> +      /* Otherwise check if remaining bits of entropy provides fits in the
> +	 bound.  */
> +      int bits_left = z;
> +      while (bits_left >= bits)
> +	{
> +	  value >>= bits;
> +	  r = value & mask;
> +	  if (r < n)
> +	    return r;
> +	  bits_left -= bits;
> +	}
>       }
>   }
>   libc_hidden_def (__arc4random_uniform)


It's the algorithm I suggestedin 
<9b6f8e87-9226-7828-3569-cff0e3575f9a@opteya.com>
https://sourceware.org/pipermail/libc-alpha/2022-April/138070.html


LGTM.


Reviewed-by: Yann Droneaud <ydroneaud@opteya.com>


Thanks.
  
Adhemerval Zanella July 28, 2022, 2:58 p.m. UTC | #2
On 28/07/22 11:46, Yann Droneaud wrote:
> Hi,
> 
> Le 28/07/2022 à 14:45, Adhemerval Zanella a écrit :
>> It uses the bitmask with rejection [1], which calculates a mask
>> being the lowest power of two bounding the request upper bound,
>> successively queries new random values, and rejects values
>> outside the requested range.
>>
>> Performance-wise, there is no much gain in trying to converse
>> bits since arc4random is wrapper on getrandom syscall.  It should
>> be cheaper to just query a uint32_t value.  The algorithm also
>> avoids mudulo and divide operations, which might be costly
>> depending of the architecture.
>>
>> [1] https://www.pcg-random.org/posts/bounded-rands.html
>> ---
>>   stdlib/arc4random_uniform.c | 131 +++++++++---------------------------
>>   1 file changed, 32 insertions(+), 99 deletions(-)
>>
>> diff --git a/stdlib/arc4random_uniform.c b/stdlib/arc4random_uniform.c
>> index 1326dfa593..425282cd15 100644
>> --- a/stdlib/arc4random_uniform.c
>> +++ b/stdlib/arc4random_uniform.c
>> @@ -17,38 +17,19 @@
>>      License along with the GNU C Library; if not, see
>>      <https://www.gnu.org/licenses/>.  */
>>   -#include <endian.h>
>> -#include <libc-lock.h>
>>   #include <stdlib.h>
>>   #include <sys/param.h>
>>   -/* Return the number of bytes which cover values up to the limit.  */
>> -__attribute__ ((const))
>> -static uint32_t
>> -byte_count (uint32_t n)
>> -{
>> -  if (n < (1U << 8))
>> -    return 1;
>> -  else if (n < (1U << 16))
>> -    return 2;
>> -  else if (n < (1U << 24))
>> -    return 3;
>> -  else
>> -    return 4;
>> -}
>> +/* Return a uniformly distributed random number less than N.  The algorithm
>> +   calculates a mask being the lowest power of two bounding the upper bound
>> +   N, successively queries new random values, and rejects values outside of
>> +   the request range.
>>   -/* Fill the lower bits of the result with randomness, according to the
>> -   number of bytes requested.  */
>> -static void
>> -random_bytes (uint32_t *result, uint32_t byte_count)
>> -{
>> -  *result = 0;
>> -  unsigned char *ptr = (unsigned char *) result;
>> -  if (__BYTE_ORDER == __BIG_ENDIAN)
>> -    ptr += 4 - byte_count;
>> -  __arc4random_buf (ptr, byte_count);
>> -}
>> +   For reject values, it also tries if the remaining entropy could fit on
>> +   the asked range after range adjustment.
>>   +   The algorithm avoids modulo and divide operations, which might be costly
>> +   depending on the architecture.  */
>>   uint32_t
>>   __arc4random_uniform (uint32_t n)
>>   {
>> @@ -57,83 +38,35 @@ __arc4random_uniform (uint32_t n)
>>          only possible result for limit 1.  */
>>       return 0;
>>   -  /* The bits variable serves as a source for bits.  Prefetch the
>> -     minimum number of bytes needed.  */
>> -  uint32_t count = byte_count (n);
>> -  uint32_t bits_length = count * CHAR_BIT;
>> -  uint32_t bits;
>> -  random_bytes (&bits, count);
>> -
>>     /* Powers of two are easy.  */
>>     if (powerof2 (n))
>> -    return bits & (n - 1);
>> -
>> -  /* The general case.  This algorithm follows Jérémie Lumbroso,
>> -     Optimal Discrete Uniform Generation from Coin Flips, and
>> -     Applications (2013), who credits Donald E. Knuth and Andrew
>> -     C. Yao, The complexity of nonuniform random number generation
>> -     (1976), for solving the general case.
>> +    return __arc4random () & (n - 1);
>>   -     The implementation below unrolls the initialization stage of the
>> -     loop, where v is less than n.  */
>> +  /* mask is the smallest power of 2 minus 1 number larger than n.  */
>> +  int z = __builtin_clz (n);
>> +  uint32_t mask = ~UINT32_C(0) >> z;
>> +  int bits = CHAR_BIT * sizeof (uint32_t) - z;
>>   -  /* Use 64-bit variables even though the intermediate results are
>> -     never larger than 33 bits.  This ensures the code is easier to
>> -     compile on 64-bit architectures.  */
>> -  uint64_t v;
>> -  uint64_t c;
>> -
>> -  /* Initialize v and c.  v is the smallest power of 2 which is larger
>> -     than n.*/
>> -  {
>> -    uint32_t log2p1 = 32 - __builtin_clz (n);
>> -    v = 1ULL << log2p1;
>> -    c = bits & (v - 1);
>> -    bits >>= log2p1;
>> -    bits_length -= log2p1;
>> -  }
>> -
>> -  /* At the start of the loop, c is uniformly distributed within the
>> -     half-open interval [0, v), and v < 2n < 2**33.  */
>> -  while (true)
>> +  while (1)
>>       {
>> -      if (v >= n)
>> -        {
>> -          /* If the candidate is less than n, accept it.  */
>> -          if (c < n)
>> -            /* c is uniformly distributed on [0, n).  */
>> -            return c;
>> -          else
>> -            {
>> -              /* c is uniformly distributed on [n, v).  */
>> -              v -= n;
>> -              c -= n;
>> -              /* The distribution was shifted, so c is uniformly
>> -                 distributed on [0, v) again.  */
>> -            }
>> -        }
>> -      /* v < n here.  */
>> -
>> -      /* Replenish the bit source if necessary.  */
>> -      if (bits_length == 0)
>> -        {
>> -          /* Overwrite the least significant byte.  */
>> -      random_bytes (&bits, 1);
>> -      bits_length = CHAR_BIT;
>> -        }
>> -
>> -      /* Double the range.  No overflow because v < n < 2**32.  */
>> -      v *= 2;
>> -      /* v < 2n here.  */
>> -
>> -      /* Extract a bit and append it to c.  c remains less than v and
>> -         thus 2**33.  */
>> -      c = (c << 1) | (bits & 1);
>> -      bits >>= 1;
>> -      --bits_length;
>> -
>> -      /* At this point, c is uniformly distributed on [0, v) again,
>> -         and v < 2n < 2**33.  */
>> +      uint32_t value = __arc4random ();
>> +
>> +      /* Return if the lower power of 2 minus 1 satisfy the condition.  */
>> +      uint32_t r = value & mask;
>> +      if (r < n)
>> +    return r;
>> +
>> +      /* Otherwise check if remaining bits of entropy provides fits in the
>> +     bound.  */
>> +      int bits_left = z;
>> +      while (bits_left >= bits)
>> +    {
>> +      value >>= bits;
>> +      r = value & mask;
>> +      if (r < n)
>> +        return r;
>> +      bits_left -= bits;
>> +    }
>>       }
>>   }
>>   libc_hidden_def (__arc4random_uniform)
> 
> 
> It's the algorithm I suggestedin <9b6f8e87-9226-7828-3569-cff0e3575f9a@opteya.com>
> https://sourceware.org/pipermail/libc-alpha/2022-April/138070.html

Indeed I recall you pointed out this method, with current arc4random
being a getrandom wrapper it does make more sense to use a simpler
solution.

> 
> 
> LGTM.
> 
> 
> Reviewed-by: Yann Droneaud <ydroneaud@opteya.com>
> 

Thanks.
  
Paul Eggert July 28, 2022, 8:35 p.m. UTC | #3
On 7/28/22 05:45, Adhemerval Zanella via Libc-alpha wrote:
> It uses the bitmask with rejection [1], which calculates a mask
> being the lowest power of two bounding the request upper bound,
> successively queries new random values, and rejects values
> outside the requested range.
> 
> Performance-wise, there is no much gain in trying to converse

converse -> conserve

> bits since arc4random is wrapper on getrandom syscall.  It should
> be cheaper to just query a uint32_t value.  The algorithm also
> avoids mudulo and divide operations, which might be costly

mudulo -> modulo


> +      int bits_left = z;
> +      while (bits_left >= bits)
> +	{
...
> +	  bits_left -= bits;
> +	}

Clearer would be "for (int bits_left = z; bits_left >= bits; bits_left 
-= bits)".

Looks good otherwise; thanks.
  

Patch

diff --git a/stdlib/arc4random_uniform.c b/stdlib/arc4random_uniform.c
index 1326dfa593..425282cd15 100644
--- a/stdlib/arc4random_uniform.c
+++ b/stdlib/arc4random_uniform.c
@@ -17,38 +17,19 @@ 
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-#include <endian.h>
-#include <libc-lock.h>
 #include <stdlib.h>
 #include <sys/param.h>
 
-/* Return the number of bytes which cover values up to the limit.  */
-__attribute__ ((const))
-static uint32_t
-byte_count (uint32_t n)
-{
-  if (n < (1U << 8))
-    return 1;
-  else if (n < (1U << 16))
-    return 2;
-  else if (n < (1U << 24))
-    return 3;
-  else
-    return 4;
-}
+/* Return a uniformly distributed random number less than N.  The algorithm
+   calculates a mask being the lowest power of two bounding the upper bound
+   N, successively queries new random values, and rejects values outside of
+   the request range.
 
-/* Fill the lower bits of the result with randomness, according to the
-   number of bytes requested.  */
-static void
-random_bytes (uint32_t *result, uint32_t byte_count)
-{
-  *result = 0;
-  unsigned char *ptr = (unsigned char *) result;
-  if (__BYTE_ORDER == __BIG_ENDIAN)
-    ptr += 4 - byte_count;
-  __arc4random_buf (ptr, byte_count);
-}
+   For reject values, it also tries if the remaining entropy could fit on
+   the asked range after range adjustment.
 
+   The algorithm avoids modulo and divide operations, which might be costly
+   depending on the architecture.  */
 uint32_t
 __arc4random_uniform (uint32_t n)
 {
@@ -57,83 +38,35 @@  __arc4random_uniform (uint32_t n)
        only possible result for limit 1.  */
     return 0;
 
-  /* The bits variable serves as a source for bits.  Prefetch the
-     minimum number of bytes needed.  */
-  uint32_t count = byte_count (n);
-  uint32_t bits_length = count * CHAR_BIT;
-  uint32_t bits;
-  random_bytes (&bits, count);
-
   /* Powers of two are easy.  */
   if (powerof2 (n))
-    return bits & (n - 1);
-
-  /* The general case.  This algorithm follows Jérémie Lumbroso,
-     Optimal Discrete Uniform Generation from Coin Flips, and
-     Applications (2013), who credits Donald E. Knuth and Andrew
-     C. Yao, The complexity of nonuniform random number generation
-     (1976), for solving the general case.
+    return __arc4random () & (n - 1);
 
-     The implementation below unrolls the initialization stage of the
-     loop, where v is less than n.  */
+  /* mask is the smallest power of 2 minus 1 number larger than n.  */
+  int z = __builtin_clz (n);
+  uint32_t mask = ~UINT32_C(0) >> z;
+  int bits = CHAR_BIT * sizeof (uint32_t) - z;
 
-  /* Use 64-bit variables even though the intermediate results are
-     never larger than 33 bits.  This ensures the code is easier to
-     compile on 64-bit architectures.  */
-  uint64_t v;
-  uint64_t c;
-
-  /* Initialize v and c.  v is the smallest power of 2 which is larger
-     than n.*/
-  {
-    uint32_t log2p1 = 32 - __builtin_clz (n);
-    v = 1ULL << log2p1;
-    c = bits & (v - 1);
-    bits >>= log2p1;
-    bits_length -= log2p1;
-  }
-
-  /* At the start of the loop, c is uniformly distributed within the
-     half-open interval [0, v), and v < 2n < 2**33.  */
-  while (true)
+  while (1)
     {
-      if (v >= n)
-        {
-          /* If the candidate is less than n, accept it.  */
-          if (c < n)
-            /* c is uniformly distributed on [0, n).  */
-            return c;
-          else
-            {
-              /* c is uniformly distributed on [n, v).  */
-              v -= n;
-              c -= n;
-              /* The distribution was shifted, so c is uniformly
-                 distributed on [0, v) again.  */
-            }
-        }
-      /* v < n here.  */
-
-      /* Replenish the bit source if necessary.  */
-      if (bits_length == 0)
-        {
-          /* Overwrite the least significant byte.  */
-	  random_bytes (&bits, 1);
-	  bits_length = CHAR_BIT;
-        }
-
-      /* Double the range.  No overflow because v < n < 2**32.  */
-      v *= 2;
-      /* v < 2n here.  */
-
-      /* Extract a bit and append it to c.  c remains less than v and
-         thus 2**33.  */
-      c = (c << 1) | (bits & 1);
-      bits >>= 1;
-      --bits_length;
-
-      /* At this point, c is uniformly distributed on [0, v) again,
-         and v < 2n < 2**33.  */
+      uint32_t value = __arc4random ();
+
+      /* Return if the lower power of 2 minus 1 satisfy the condition.  */
+      uint32_t r = value & mask;
+      if (r < n)
+	return r;
+
+      /* Otherwise check if remaining bits of entropy provides fits in the
+	 bound.  */
+      int bits_left = z;
+      while (bits_left >= bits)
+	{
+	  value >>= bits;
+	  r = value & mask;
+	  if (r < n)
+	    return r;
+	  bits_left -= bits;
+	}
     }
 }
 libc_hidden_def (__arc4random_uniform)