Remove atomic operations from malloc.c

Message ID 54DB130F.9070300@web.de
State New, archived

Commit Message

Leonhard Holz Feb. 11, 2015, 8:30 a.m. UTC
  The fastbin path of malloc/free features atomic operations, which were probably an
attempt at lock-free code. But there is an ABA problem in malloc's

while ((pp = catomic_compare_and_exchange_val_acq (fb, victim->fd, victim))
         != victim);

because even if *fb == victim, victim->fd might have changed in the meantime.
There is no easy way to fix this; an old comment mentions the need for a CAS2
operation with tagged pointers. Consequently, malloc is right now wrapped by a lock
to avoid problems.
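
For illustration, here is a hypothetical interleaving (with a made-up bin content
A -> B -> C) showing how the CAS could succeed on stale data if removals ever ran
without the arena lock:

/* Thread 1 (_int_malloc)              Thread 2 (other malloc/free calls)
   victim = *fb;            // A
   next = victim->fd;       // B
                                       allocates A, allocates B,
                                       then frees A again;
                                       the bin is now A -> C
   CAS (fb, next, victim)   // *fb == A again, so the CAS succeeds and
                            // installs B, although B is still in use.  */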

Anyhow, while the atomic ops avoid a lock in free, they are costly too. Especially
under high contention the compare_and_exchange may fail multiple times (see
http://blog.memsql.com/common-pitfalls-in-writing-lock-free-algorithms/ for a
discussion). So I measured how removing the atomics and installing a lock around free
affects performance, and it turns out (at least in my case, with 2 cores) that it
has no effect:

Current implementation
threads time per iteration
1       116.709
8       705.080
16      1394.74
32      2923.03

Without atomics
threads time per iteration
1       112.541
8       715.897
16      1403.67
32      2881.30

So I propose to remove the atomic ops, because doing so simplifies the code and
reactivates a security check (the size of the top fastbin chunk is checked in free).


	Remove atomic operations from malloc's fastbin path.
	* malloc/malloc.c (_int_malloc): Remove compare_and_exchange.
	(_int_free): Remove compare_and_exchange and lock arena if needed.
	(malloc_consolidate): Remove atomic_exchange.
  

Comments

Will Newton Feb. 11, 2015, 8:37 a.m. UTC | #1
On 11 February 2015 at 08:30, Leonhard Holz <leonhard.holz@web.de> wrote:
> The fastbin path of malloc/free features atomic operations, which were
> probably an attempt for lock-free code. But there is an ABA problem in
> malloc's
>
> while ((pp = catomic_compare_and_exchange_val_acq (fb, victim->fd, victim))
>         != victim);
>
> because even if *fd == victim, victim->fd might have changed in the
> meantime. There is no easy way to fix this, an old comment mentions the need
> for a CAS2 operation with tagged pointers. Consequently malloc is right now
> wrapped by a lock to avoid problems.
>
> Anyhow, while the atomic ops avoid a lock in free they are costly too.
> Especially at high contention the compare_and_exchange may fail multiple
> times (see
> http://blog.memsql.com/common-pitfalls-in-writing-lock-free-algorithms/ for
> a discussion). So I measured how removing atomics and installing a lock
> around free affects performance and it turns out (at least in my case with 2
> cores), that it has no effect:
>
> Current implementation
> threads time per iteration
> 1       116.709
> 8       705.080
> 16      1394.74
> 32      2923.03
>
> Without atomics
> threads time per iteration
> 1       112.541
> 8       715.897
> 16      1403.67
> 32      2881.30

Did you try running the malloc benchtest?

> So I propose to remove the atomic ops because it simplifies the code and
> reactivates a security check (size of top fastbin chunk in free).
>
>
>         Remove atomic operations from mallocs fastbin path.
>         * malloc/malloc.c (_int_malloc): Remove compare_and_exchange.
>         (_int_free): Remove compare_and_exchange and lock arena if needed.
>         (malloc_consolidate): Remove atomic_exchange.
>
>
> diff --git a/malloc/malloc.c b/malloc/malloc.c
> index aa7edbf..f43c14c 100644
> --- a/malloc/malloc.c
> +++ b/malloc/malloc.c
> @@ -219,7 +219,6 @@
>  #include <malloc-machine.h>
>  #include <malloc-sysdep.h>
>
> -#include <atomic.h>
>  #include <_itoa.h>
>  #include <bits/wordsize.h>
>  #include <sys/sysinfo.h>
> @@ -3332,15 +3331,7 @@ _int_malloc (mstate av, size_t bytes)
>      {
>        idx = fastbin_index (nb);
>        mfastbinptr *fb = &fastbin (av, idx);
> -      mchunkptr pp = *fb;
> -      do
> -        {
> -          victim = pp;
> -          if (victim == NULL)
> -            break;
> -        }
> -      while ((pp = catomic_compare_and_exchange_val_acq (fb, victim->fd, victim))
> -             != victim);
> +      victim = *fb;
>        if (victim != 0)
>          {
>            if (__builtin_expect (fastbin_index (chunksize (victim)) != idx, 0))
> @@ -3350,6 +3341,7 @@ _int_malloc (mstate av, size_t bytes)
>                malloc_printerr (check_action, errstr, chunk2mem (victim));
>                return NULL;
>              }
> +         *fb = victim->fd;
>            check_remalloced_chunk (av, victim, nb);
>            void *p = chunk2mem (victim);
>            alloc_perturb (p, bytes);
> @@ -3857,29 +3849,18 @@ _int_free (mstate av, mchunkptr p, int have_lock)
>  #endif
>        ) {
>
> +    if (!have_lock)
> +      {
> +       (void) mutex_lock (&av->mutex);
> +       locked = 1;
> +      }
> +
>      if (__builtin_expect (chunk_at_offset (p, size)->size <= 2 * SIZE_SZ, 0)
>         || __builtin_expect (chunksize (chunk_at_offset (p, size))
>                              >= av->system_mem, 0))
>        {
> -       /* We might not have a lock at this point and concurrent modifications
> -          of system_mem might have let to a false positive.  Redo the test
> -          after getting the lock.  */
> -       if (have_lock
> -           || ({ assert (locked == 0);
> -                 mutex_lock(&av->mutex);
> -                 locked = 1;
> -                 chunk_at_offset (p, size)->size <= 2 * SIZE_SZ
> -                   || chunksize (chunk_at_offset (p, size)) >= av->system_mem;
> -             }))
> -         {
> -           errstr = "free(): invalid next size (fast)";
> -           goto errout;
> -         }
> -       if (! have_lock)
> -         {
> -           (void)mutex_unlock(&av->mutex);
> -           locked = 0;
> -         }
> +       errstr = "free(): invalid next size (fast)";
> +       goto errout;
>        }
>
>      free_perturb (chunk2mem(p), size - 2 * SIZE_SZ);
> @@ -3887,35 +3868,34 @@ _int_free (mstate av, mchunkptr p, int have_lock)
>      set_fastchunks(av);
>      unsigned int idx = fastbin_index(size);
>      fb = &fastbin (av, idx);
> +    mchunkptr old = *fb;
>
> -    /* Atomically link P to its fastbin: P->FD = *FB; *FB = P;  */
> -    mchunkptr old = *fb, old2;
> -    unsigned int old_idx = ~0u;
> -    do
> +    /* Check that the top of the bin is not the record we are going to add
> +       (i.e., double free).  */
> +    if (__builtin_expect (old == p, 0))
>        {
> -       /* Check that the top of the bin is not the record we are going to add
> -          (i.e., double free).  */
> -       if (__builtin_expect (old == p, 0))
> -         {
> -           errstr = "double free or corruption (fasttop)";
> -           goto errout;
> -         }
> -       /* Check that size of fastbin chunk at the top is the same as
> -          size of the chunk that we are adding.  We can dereference OLD
> -          only if we have the lock, otherwise it might have already been
> -          deallocated.  See use of OLD_IDX below for the actual check.  */
> -       if (have_lock && old != NULL)
> -         old_idx = fastbin_index(chunksize(old));
> -       p->fd = old2 = old;
> +       errstr = "double free or corruption (fasttop)";
> +       goto errout;
>        }
> -    while ((old = catomic_compare_and_exchange_val_rel (fb, p, old2)) != old2);
>
> -    if (have_lock && old != NULL && __builtin_expect (old_idx != idx, 0))
> +    /* Check that size of fastbin chunk at the top is the same as
> +       size of the chunk that we are adding.  */
> +    if (old != NULL && __builtin_expect (fastbin_index(chunksize(old)) != idx, 0))
>        {
>         errstr = "invalid fastbin entry (free)";
>         goto errout;
>        }
> -  }
> +
> +    /* Link P to its fastbin: P->FD = *FB; *FB = P;  */
> +    p->fd = old;
> +    *fb = p;
> +
> +    if (!have_lock)
> +      {
> +       (void) mutex_unlock(&av->mutex);
> +       locked = 0;
> +      }
> +   }
>
>    /*
>      Consolidate other non-mmapped chunks as they arrive.
> @@ -4121,7 +4101,8 @@ static void malloc_consolidate(mstate av)
>      maxfb = &fastbin (av, NFASTBINS - 1);
>      fb = &fastbin (av, 0);
>      do {
> -      p = atomic_exchange_acq (fb, 0);
> +      p = *fb;
> +      *fb = NULL;
>        if (p != 0) {
>         do {
>           check_inuse_chunk(av, p);
  
Leonhard Holz Feb. 11, 2015, 9:03 a.m. UTC | #2
>> Current implementation
>> threads time per iteration
>> 1       116.709
>> 8       705.080
>> 16      1394.74
>> 32      2923.03
>>
>> Without atomics
>> threads time per iteration
>> 1       112.541
>> 8       715.897
>> 16      1403.67
>> 32      2881.30
>
> Did you try running the malloc benchtest?
>

Yes, these are the numbers from bench-malloc-thread.
  
Will Newton Feb. 11, 2015, 9:25 a.m. UTC | #3
On 11 February 2015 at 09:03, Leonhard Holz <leonhard.holz@web.de> wrote:
>>> Current implementation
>>> threads time per iteration
>>> 1       116.709
>>> 8       705.080
>>> 16      1394.74
>>> 32      2923.03
>>>
>>> Without atomics
>>> threads time per iteration
>>> 1       112.541
>>> 8       715.897
>>> 16      1403.67
>>> 32      2881.30
>>
>>
>> Did you try running the malloc benchtest?
>>
>
> Yes, these are the numbers from bench-malloc-thread.

Good to know it has been useful to someone. ;-)

Benchmarking malloc is notoriously hard due to the wide variety of
workloads, so it would be good to also run some other, more system-level
benchmark, e.g. sysbench, in order to convince people that their real-world
workloads won't regress.
  
Torvald Riegel Feb. 11, 2015, 11:01 a.m. UTC | #4
On Wed, 2015-02-11 at 09:30 +0100, Leonhard Holz wrote:
> The fastbin path of malloc/free features atomic operations, which were probably an 
> attempt for lock-free code. But there is an ABA problem in malloc's
> 
> while ((pp = catomic_compare_and_exchange_val_acq (fb, victim->fd, victim))
>          != victim);
> 
> because even if *fd == victim, victim->fd might have changed in the meantime. 
> There is no easy way to fix this, an old comment mentions the need for a CAS2 
> operation with tagged pointers. Consequently malloc is right now wrapped by a lock 
> to avoid problems.

I'll have a look at the correctness side of this soon.

> Anyhow, while the atomic ops avoid a lock in free they are costly too. Especially 
> at high contention the compare_and_exchange may fail multiple times (see 
> http://blog.memsql.com/common-pitfalls-in-writing-lock-free-algorithms/ for a 
> discussion). So I measured how removing atomics and installing a lock around free 
> affects performance and it turns out (at least in my case with 2 cores), that it 
> has no effect:
> 
> Current implementation
> threads time per iteration
> 1       116.709
> 8       705.080
> 16      1394.74
> 32      2923.03
> 
> Without atomics
> threads time per iteration
> 1       112.541
> 8       715.897
> 16      1403.67
> 32      2881.30

If your machine has just two cores, then at the very least you should
measure with just two threads too; a bigger number of threads does not
put more contention on any of the synchronization bits, there is just a
higher likelihood of having to wait for a thread that isn't running.

Also, to really assess performance, this has to be benchmarked on a
machine with more cores.  Additionally, you could argue why it should
not make a difference, and if that's a compelling argument, we could
follow it instead of the benchmark (which, as Will mentions, is hard to
make representative of real-world workloads).
  
Adhemerval Zanella Netto Feb. 11, 2015, 12:50 p.m. UTC | #5
On 11-02-2015 09:01, Torvald Riegel wrote:
> On Wed, 2015-02-11 at 09:30 +0100, Leonhard Holz wrote:
>> The fastbin path of malloc/free features atomic operations, which were probably an 
>> attempt for lock-free code. But there is an ABA problem in malloc's
>>
>> while ((pp = catomic_compare_and_exchange_val_acq (fb, victim->fd, victim))
>>          != victim);
>>
>> because even if *fd == victim, victim->fd might have changed in the meantime. 
>> There is no easy way to fix this, an old comment mentions the need for a CAS2 
>> operation with tagged pointers. Consequently malloc is right now wrapped by a lock 
>> to avoid problems.
> I'll have a look at the correctness side of this soon.
>
>> Anyhow, while the atomic ops avoid a lock in free they are costly too. Especially 
>> at high contention the compare_and_exchange may fail multiple times (see 
>> http://blog.memsql.com/common-pitfalls-in-writing-lock-free-algorithms/ for a 
>> discussion). So I measured how removing atomics and installing a lock around free 
>> affects performance and it turns out (at least in my case with 2 cores), that it 
>> has no effect:
>>
>> Current implementation
>> threads time per iteration
>> 1       116.709
>> 8       705.080
>> 16      1394.74
>> 32      2923.03
>>
>> Without atomics
>> threads time per iteration
>> 1       112.541
>> 8       715.897
>> 16      1403.67
>> 32      2881.30
> If your machine has just two cores, then at the very least you should
> measure for just two threads too; a bigger number of threads is not
> putting more contention on any of the synchronization bits, there's just
> some more likelihood to having to wait for a thread that isn't running.
>
> Also, to really assess performance, this has to be benchmarked on a
> machine with more cores.  Additionally, you could argue why it should
> not make a difference, and if that's a compelling argument, we could
> follow it instead of the benchmark (which, as Will mentions, is hard to
> make representative of real-world workloads).
>
I did get into the changes themselves, but at least for powerpc (POWER8/16c/128T)
I am not seeing improvements with the patch.  In fact it seems to increase
contention:

           time per iteration
nths       master     patch
1          51.422    75.046
8          53.077    78.507
16         57.430    89.385
32         71.206   108.359
64        114.370   172.115
128       251.731   330.924
  
Leonhard Holz Feb. 11, 2015, 1:29 p.m. UTC | #6
> I did get into the changes itself, but at least for powerpc (POWER8/16c/128T)
> I am not seeing improvements with the patch.  In fact it seems to increase
> contention:
>
>             time per iteration
> nths       master     patch
> 1    	   51.422    75.046
> 8          53.077    78.507
> 16         57.430    89.385
> 32         71.206   108.359
> 64        114.370   172.115
> 128       251.731   330.924
>

Thank you for testing! Maybe the costs of a mutex_lock are higher on PowerPC than 
on i686? Anyway it looks like I have to take a different approach...
  
Adhemerval Zanella Netto Feb. 11, 2015, 1:47 p.m. UTC | #7
On 11-02-2015 11:29, Leonhard Holz wrote:
>> I did get into the changes itself, but at least for powerpc (POWER8/16c/128T)
>> I am not seeing improvements with the patch.  In fact it seems to increase
>> contention:
>>
>>             time per iteration
>> nths       master     patch
>> 1           51.422    75.046
>> 8          53.077    78.507
>> 16         57.430    89.385
>> 32         71.206   108.359
>> 64        114.370   172.115
>> 128       251.731   330.924
>>
>
> Thank you for testing! Maybe the costs of a mutex_lock are higher on PowerPC than on i686? Anyway it looks like I have to take a different approach...
>
PowerPC now uses the default implementation at sysdeps/nptl/lowlevellock.h, which
basically translates to an acquire CAS followed by a futex operation in the contended
case.  So I think the gain for powerpc (especially with high SMT) is that busy-waiting,
as a spinlock does, yields better performance than possibly issuing futex
operations.
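
For comparison, the kind of busy-waiting lock alluded to here could be as small as
this sketch (plain C11 atomics, not glibc's internal lock API; the names are made up):

#include <stdatomic.h>

typedef atomic_int fastbin_spinlock_t;          /* 0 = unlocked, 1 = locked */

static inline void
fastbin_spin_lock (fastbin_spinlock_t *lock)
{
  /* Stay entirely in user space; never sleep in the kernel.  */
  while (atomic_exchange_explicit (lock, 1, memory_order_acquire) != 0)
    while (atomic_load_explicit (lock, memory_order_relaxed) != 0)
      ;                                         /* spin until it looks free */
}

static inline void
fastbin_spin_unlock (fastbin_spinlock_t *lock)
{
  atomic_store_explicit (lock, 0, memory_order_release);
}

The unlock is a plain release store and nothing ever sleeps, which is cheap while the
critical section is as short as a fastbin insert, but it burns CPU whenever the lock
holder is preempted.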
  
Leonhard Holz Feb. 11, 2015, 3:20 p.m. UTC | #8
Am 11.02.2015 um 14:47 schrieb Adhemerval Zanella:
> On 11-02-2015 11:29, Leonhard Holz wrote:
>>> I did get into the changes itself, but at least for powerpc (POWER8/16c/128T)
>>> I am not seeing improvements with the patch.  In fact it seems to increase
>>> contention:
>>>
>>>              time per iteration
>>> nths       master     patch
>>> 1           51.422    75.046
>>> 8          53.077    78.507
>>> 16         57.430    89.385
>>> 32         71.206   108.359
>>> 64        114.370   172.115
>>> 128       251.731   330.924
>>>
>>
>> Thank you for testing! Maybe the costs of a mutex_lock are higher on PowerPC than on i686? Anyway it looks like I have to take a different approach...
>>
> PowerPC uses now the default implementation at sysdeps/nptl/lowlevellock.h which
> basically translates to acquire CAS followed by a futex operation in contention
> case.  So I think the gain is for powerpc (specially with high SMT), busy-wait
> using like a spinlock yields better performance than possible issuing a futex
> operations.
>

Indeed, using a spinlock for the fastbin path would be my next try.
  
Torvald Riegel Feb. 11, 2015, 3:36 p.m. UTC | #9
On Wed, 2015-02-11 at 11:47 -0200, Adhemerval Zanella wrote:
> On 11-02-2015 11:29, Leonhard Holz wrote:
> >> I did get into the changes itself, but at least for powerpc (POWER8/16c/128T)
> >> I am not seeing improvements with the patch.  In fact it seems to increase
> >> contention:
> >>
> >>             time per iteration
> >> nths       master     patch
> >> 1           51.422    75.046
> >> 8          53.077    78.507
> >> 16         57.430    89.385
> >> 32         71.206   108.359
> >> 64        114.370   172.115
> >> 128       251.731   330.924
> >>
> >
> > Thank you for testing! Maybe the costs of a mutex_lock are higher on PowerPC than on i686? Anyway it looks like I have to take a different approach...

I don't think it's just that, but it could be part of it.  When you use a
futex-based lock such as our lowlevellock, lock release needs an atomic
RMW operation as well (to find out whether there is any waiter).  That's
something that the (broken) list removal code doesn't need.
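
Schematically, the release side looks roughly like this (a simplified sketch of the
usual futex-lock protocol, not the actual lowlevellock code; 0 = unlocked, 1 = locked,
2 = locked with waiters):

#include <stdatomic.h>
#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>

static void
lock_release_sketch (atomic_int *futex_word)
{
  /* The release must be an atomic RMW: the old value tells us whether
     anyone is sleeping on the futex and needs to be woken.  */
  if (atomic_exchange_explicit (futex_word, 0, memory_order_release) == 2)
    syscall (SYS_futex, futex_word, FUTEX_WAKE, 1, NULL, NULL, 0);
}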

> PowerPC uses now the default implementation at sysdeps/nptl/lowlevellock.h which 
> basically translates to acquire CAS followed by a futex operation in contention
> case.  So I think the gain is for powerpc (specially with high SMT), busy-wait
> using like a spinlock yields better performance than possible issuing a futex
> operations.

Using spin-waiting should help, but I would be cautious about just using
spin-waiting.
  
Torvald Riegel Feb. 11, 2015, 3:58 p.m. UTC | #10
On Wed, 2015-02-11 at 09:30 +0100, Leonhard Holz wrote:
> The fastbin path of malloc/free features atomic operations, which were probably an 
> attempt for lock-free code. But there is an ABA problem in malloc's
> 
> while ((pp = catomic_compare_and_exchange_val_acq (fb, victim->fd, victim))
>          != victim);
> 
> because even if *fd == victim, victim->fd might have changed in the meantime. 

I agree that this would be prone to ABA under memory reuse and
concurrent removals from the free-list; also, it would likely lack a
memory barrier in the load of victim (so that loading ->fd actually sees
the right data).

However, I haven't seen a case of concurrent free-list removal; all calls
to _int_malloc seem to happen while the (non-recursive, AFAICT) arena
lock is acquired.  I'm not really familiar with the malloc code, so
maybe I've missed something.  If you are aware of such concurrency, can
you point me to it?

If we don't have concurrent removals, concurrent additions to the
free-list should be fine.  Nobody can remove an element from the
free-list, so we "own" every element in the free-list.  If concurrent
_int_free adds elements to it, that doesn't matter, the memory of
everything in the free-list can't be reused, so we don't hit the ABA.
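
To make this concrete, here is a simplified model (hypothetical types and names,
mirroring the structure of the current code): pushes only write memory they own and
publish with a CAS, while pops dereference the head and therefore run with the arena
lock held.

#include <stdatomic.h>

struct chunk { struct chunk *fd; };

/* Safe without the arena lock: we only write p->fd (memory we own)
   and publish p with a release CAS.  */
static void
fastbin_push (struct chunk *_Atomic *fb, struct chunk *p)
{
  struct chunk *old = atomic_load_explicit (fb, memory_order_relaxed);
  do
    p->fd = old;
  while (!atomic_compare_exchange_weak_explicit (fb, &old, p,
                                                 memory_order_release,
                                                 memory_order_relaxed));
}

/* Runs with the arena lock held, so pops are mutually exclusive; the CAS
   is still needed because lock-free pushes may change *fb concurrently.
   With no concurrent pop, the head chunk cannot be reused between reading
   victim->fd and the CAS, so the ABA case cannot arise.  */
static struct chunk *
fastbin_pop_locked (struct chunk *_Atomic *fb)
{
  struct chunk *victim = atomic_load_explicit (fb, memory_order_acquire);
  while (victim != NULL
         && !atomic_compare_exchange_weak_explicit (fb, &victim, victim->fd,
                                                    memory_order_acquire,
                                                    memory_order_acquire))
    ;
  return victim;
}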

If you agree with this analysis, I'd still encourage you to send a patch
with documentation explaining this.

> There is no easy way to fix this, an old comment mentions the need for a CAS2 
> operation with tagged pointers. Consequently malloc is right now wrapped by a lock 
> to avoid problems.

I agree that allowing concurrent removal from the free-list would be
difficult.  It's not impossible to do something there, but it would be
complex.  For example, hazard pointers for users of the arena, or an
obstruction-free free-list would be possibilities (it doesn't have to
really be a list or a stack as right now, it just needs to be a bag of
free stuff).

However, I'm not familiar enough with malloc and its performance
properties to make a concrete suggestion.
  
Leonhard Holz Feb. 11, 2015, 4:46 p.m. UTC | #11
Am 11.02.2015 um 16:58 schrieb Torvald Riegel:
> On Wed, 2015-02-11 at 09:30 +0100, Leonhard Holz wrote:
>> The fastbin path of malloc/free features atomic operations, which were probably an
>> attempt for lock-free code. But there is an ABA problem in malloc's
>>
>> while ((pp = catomic_compare_and_exchange_val_acq (fb, victim->fd, victim))
>>           != victim);
>>
>> because even if *fd == victim, victim->fd might have changed in the meantime.
>
> I agree that this would be prone to ABA under memory reuse and
> concurrent removals from the free-list; also, it would likely lack a
> memory barrier in the load of victim (so that loading ->fd actually sees
> the right data).
>
> However, I haven't see a case of concurrent free-list removal; all calls
> to _int_malloc seem to happen while the (non-recursive, AFAICT) arena
> lock is acquired.  I'm not really familiar with the malloc code, so
> maybe I've missed something.  If you are aware of such concurrency, can
> you point me to it?
>

Right, as long as every call to _int_malloc holds the arena lock (which is the case
in the current code), everything is fine. But it seems odd to first acquire a lock
and afterwards use atomic operations, and I think that the initial aim of the code
was to work lock-free. See
https://sourceware.org/git/gitweb.cgi?p=glibc.git;a=blob;f=malloc/malloc.c;hb=425ce2edb9d11cc1ff650fac16dfbc450241896a#l3586

> If we don't have concurrent removals, concurrent additions to the
> free-list should be fine.  Nobody can remove an element from the
> free-list, so we "own" every element in the free-list.  If concurrent
> _int_free adds elements to it, that doesn't matter, the memory of
> everything in the free-list can't be reused, so we don't hit the ABA.
>

I agree.

> I agree that allowing concurrent removal from the free-list would be
> difficult.  It's not impossible to do something there, but it would be
> complex.  For example, hazard pointers for users of the arena, or an
> obstruction-free free-list would be possibilities (it doesn't have to
> really be a list or a stack as right now, it just needs to be a bag of
> free stuff).
>
> However, I'm not familiar enough with malloc and it's performance
> properties to make a concrete suggestion.
>

Yes, a different data structure could also be a way to go. Or maybe a spinlock for
the fastbin path instead of the mutex, as the free-list insert and removal operations
are very fast. A way to avoid the complicated code for non-blocking concurrency
would be nice.
  
Torvald Riegel Feb. 11, 2015, 8:23 p.m. UTC | #12
On Wed, 2015-02-11 at 17:46 +0100, Leonhard Holz wrote:
> Am 11.02.2015 um 16:58 schrieb Torvald Riegel:
> > On Wed, 2015-02-11 at 09:30 +0100, Leonhard Holz wrote:
> >> The fastbin path of malloc/free features atomic operations, which were probably an
> >> attempt for lock-free code. But there is an ABA problem in malloc's
> >>
> >> while ((pp = catomic_compare_and_exchange_val_acq (fb, victim->fd, victim))
> >>           != victim);
> >>
> >> because even if *fd == victim, victim->fd might have changed in the meantime.
> >
> > I agree that this would be prone to ABA under memory reuse and
> > concurrent removals from the free-list; also, it would likely lack a
> > memory barrier in the load of victim (so that loading ->fd actually sees
> > the right data).
> >
> > However, I haven't see a case of concurrent free-list removal; all calls
> > to _int_malloc seem to happen while the (non-recursive, AFAICT) arena
> > lock is acquired.  I'm not really familiar with the malloc code, so
> > maybe I've missed something.  If you are aware of such concurrency, can
> > you point me to it?
> >
> 
> Right, as long as every call to _int_malloc_ has the arena lock (which is the case 
> in the current code) everything is fine. But it seems odd to first aquire a lock 
> and afterwards use atomic operations

But it is necessary in the code, right?  There is still concurrency
between insertions to the free-list and removals.  The lock just makes
removals mutually exclusive.  (I'm not trying to say that the malloc
synchronization is ideal -- just that this isn't really odd in code that
uses both locks and atomics).

> and I think that the initial aim of the code 
> was to work lockless. See 
> https://sourceware.org/git/gitweb.cgi?p=glibc.git;a=blob;f=malloc/malloc.c;hb=425ce2edb9d11cc1ff650fac16dfbc450241896a#l3586

Maybe.  But there are other ways to handle it than DCAS.

> Yes, a different data structure could also be a way to go. Or maybe a spinlock for 
> the fastbin path instead of the mutex as the free-list insert & removal operations 
> are very fast. A way to avoid the complicated code for non-blocking concurrency 
> would be nice.

It would be convenient, and lock-based code can be very efficient -- but
I think custom concurrent code using atomics and such does have its place,
and can be more efficient too.

For HW with HTM, a combination of transactions and a lock-based fallback
path might also be something that could work well in cases such as this
one.
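
As a rough sketch of that idea, using Intel RTM intrinsics purely as an example
(the lock word and the lock/unlock functions are hypothetical; this needs -mrtm and
TSX-capable hardware, and aborts simply fall back to the real lock):

#include <immintrin.h>

extern volatile int fastbin_lock;        /* hypothetical: 0 = free */
extern void fastbin_lock_acquire (void);
extern void fastbin_lock_release (void);

static void
fastbin_op_elided (void (*op) (void))
{
  if (_xbegin () == _XBEGIN_STARTED)
    {
      /* Abort if the lock is really held, so we stay serialized with the
         non-transactional fallback path.  */
      if (fastbin_lock != 0)
        _xabort (0xff);
      op ();
      _xend ();
      return;
    }
  /* Transaction aborted or unsupported: take the real lock.  */
  fastbin_lock_acquire ();
  op ();
  fastbin_lock_release ();
}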
  
Siddhesh Poyarekar Feb. 18, 2015, 1 p.m. UTC | #13
On Wed, Feb 11, 2015 at 12:01:08PM +0100, Torvald Riegel wrote:
> If your machine has just two cores, then at the very least you should
> measure for just two threads too; a bigger number of threads is not
> putting more contention on any of the synchronization bits, there's just
> some more likelihood to having to wait for a thread that isn't running.
> 
> Also, to really assess performance, this has to be benchmarked on a
> machine with more cores.  Additionally, you could argue why it should
> not make a difference, and if that's a compelling argument, we could
> follow it instead of the benchmark (which, as Will mentions, is hard to
> make representative of real-world workloads).

The default malloc implementation creates 8 * n arenas on a system
with n cores, so for anything up to 8 * n threads, you're just
measuring contention between threads for the CPU since they're all
working on different arenas.

Maybe one way to guarantee such contention is a test with one thread
that allocates on an arena and another thread that frees from the same
arena.  I don't think the current benchmark does that.

Siddhesh
  
Rich Felker Feb. 19, 2015, 2:07 a.m. UTC | #14
On Wed, Feb 18, 2015 at 06:30:10PM +0530, Siddhesh Poyarekar wrote:
> On Wed, Feb 11, 2015 at 12:01:08PM +0100, Torvald Riegel wrote:
> > If your machine has just two cores, then at the very least you should
> > measure for just two threads too; a bigger number of threads is not
> > putting more contention on any of the synchronization bits, there's just
> > some more likelihood to having to wait for a thread that isn't running.
> > 
> > Also, to really assess performance, this has to be benchmarked on a
> > machine with more cores.  Additionally, you could argue why it should
> > not make a difference, and if that's a compelling argument, we could
> > follow it instead of the benchmark (which, as Will mentions, is hard to
> > make representative of real-world workloads).
> 
> The default malloc implementation creates 8 * n arenas on a system
> with n cores, so for anything up to 8 * n threads, you're just
> measuring contention between threads for the CPU since they're all
> working on different arenas.
> 
> Maybe one way to guarantee such contention is a test with one thread
> that allocates on an arena and another thread that frees from the same
> arena.  I don't think the current benchmark does that.

I would really like to see more attention to this usage case (allocate
in one thread, free in another). It's an idiomatic msg/data-passing
strategy and probably the least complex in most cases, and it's a
shame if people are avoiding it for performance reasons.

Rich
  
Leonhard Holz Feb. 19, 2015, 6:54 a.m. UTC | #15
Am 19.02.2015 um 03:07 schrieb Rich Felker:
> On Wed, Feb 18, 2015 at 06:30:10PM +0530, Siddhesh Poyarekar wrote:
>> On Wed, Feb 11, 2015 at 12:01:08PM +0100, Torvald Riegel wrote:
>>> If your machine has just two cores, then at the very least you should
>>> measure for just two threads too; a bigger number of threads is not
>>> putting more contention on any of the synchronization bits, there's just
>>> some more likelihood to having to wait for a thread that isn't running.
>>>
>>> Also, to really assess performance, this has to be benchmarked on a
>>> machine with more cores.  Additionally, you could argue why it should
>>> not make a difference, and if that's a compelling argument, we could
>>> follow it instead of the benchmark (which, as Will mentions, is hard to
>>> make representative of real-world workloads).
>>
>> The default malloc implementation creates 8 * n arenas on a system
>> with n cores, so for anything up to 8 * n threads, you're just
>> measuring contention between threads for the CPU since they're all
>> working on different arenas.
>>
>> Maybe one way to guarantee such contention is a test with one thread
>> that allocates on an arena and another thread that frees from the same
>> arena.  I don't think the current benchmark does that.
>
> I would really like to see more attention to this usage case (allocate
> in one thread, free in another). It's an idiomatic msg/data-passing
> strategy and probably the least complex in most cases, and it's a
> shame if people are avoiding it for performance reasons.
>

I agree. The benchmark can be changed to use a shared array of allocated blocks; I
could implement it next week, maybe.
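
As a rough sketch of the access pattern such a benchmark would exercise (just an
illustration, not a proposed bench-malloc-* change; sizes and iteration counts are
arbitrary):

#include <pthread.h>
#include <stdatomic.h>
#include <stdlib.h>

#define RING_SIZE  1024
#define ITERATIONS 1000000L

/* One-cell-per-slot mailbox ring shared by the two threads.  */
static void *_Atomic ring[RING_SIZE];

static void *
producer (void *arg)
{
  (void) arg;
  for (long i = 0; i < ITERATIONS; i++)
    {
      void *expected = NULL;
      void *block = malloc (16 + (i % 1024));
      /* Wait until the consumer has emptied this slot, then publish.  */
      while (!atomic_compare_exchange_weak (&ring[i % RING_SIZE],
                                            &expected, block))
        expected = NULL;
    }
  return NULL;
}

static void *
consumer (void *arg)
{
  (void) arg;
  for (long i = 0; i < ITERATIONS; i++)
    {
      void *block;
      /* Wait for the producer, then take and free a block it allocated.  */
      while ((block = atomic_exchange (&ring[i % RING_SIZE], NULL)) == NULL)
        ;
      free (block);
    }
  return NULL;
}

int
main (void)
{
  pthread_t p, c;
  pthread_create (&p, NULL, producer, NULL);
  pthread_create (&c, NULL, consumer, NULL);
  pthread_join (p, NULL);
  pthread_join (c, NULL);
  return 0;
}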
  
Siddhesh Poyarekar Feb. 19, 2015, 7:16 a.m. UTC | #16
On Thu, Feb 19, 2015 at 07:54:35AM +0100, Leonhard Holz wrote:
> I agree. The benchmark can be changed to use a shared array of allocated
> blocks, I could implement it next week maybe.

Please don't change this benchmark.  Create a new one instead.

Thanks,
Siddhesh
  

Patch

diff --git a/malloc/malloc.c b/malloc/malloc.c
index aa7edbf..f43c14c 100644
--- a/malloc/malloc.c
+++ b/malloc/malloc.c
@@ -219,7 +219,6 @@ 
  #include <malloc-machine.h>
  #include <malloc-sysdep.h>

-#include <atomic.h>
  #include <_itoa.h>
  #include <bits/wordsize.h>
  #include <sys/sysinfo.h>
@@ -3332,15 +3331,7 @@  _int_malloc (mstate av, size_t bytes)
      {
        idx = fastbin_index (nb);
        mfastbinptr *fb = &fastbin (av, idx);
-      mchunkptr pp = *fb;
-      do
-        {
-          victim = pp;
-          if (victim == NULL)
-            break;
-        }
-      while ((pp = catomic_compare_and_exchange_val_acq (fb, victim->fd, victim))
-             != victim);
+      victim = *fb;
        if (victim != 0)
          {
            if (__builtin_expect (fastbin_index (chunksize (victim)) != idx, 0))
@@ -3350,6 +3341,7 @@  _int_malloc (mstate av, size_t bytes)
                malloc_printerr (check_action, errstr, chunk2mem (victim));
                return NULL;
              }
+	  *fb = victim->fd;
            check_remalloced_chunk (av, victim, nb);
            void *p = chunk2mem (victim);
            alloc_perturb (p, bytes);
@@ -3857,29 +3849,18 @@  _int_free (mstate av, mchunkptr p, int have_lock)
  #endif
        ) {

+    if (!have_lock)
+      {
+	(void) mutex_lock (&av->mutex);
+	locked = 1;
+      }
+
      if (__builtin_expect (chunk_at_offset (p, size)->size <= 2 * SIZE_SZ, 0)
  	|| __builtin_expect (chunksize (chunk_at_offset (p, size))
  			     >= av->system_mem, 0))
        {
-	/* We might not have a lock at this point and concurrent modifications
-	   of system_mem might have let to a false positive.  Redo the test
-	   after getting the lock.  */
-	if (have_lock
-	    || ({ assert (locked == 0);
-		  mutex_lock(&av->mutex);
-		  locked = 1;
-		  chunk_at_offset (p, size)->size <= 2 * SIZE_SZ
-		    || chunksize (chunk_at_offset (p, size)) >= av->system_mem;
-	      }))
-	  {
-	    errstr = "free(): invalid next size (fast)";
-	    goto errout;
-	  }
-	if (! have_lock)
-	  {
-	    (void)mutex_unlock(&av->mutex);
-	    locked = 0;
-	  }
+	errstr = "free(): invalid next size (fast)";
+	goto errout;
        }

      free_perturb (chunk2mem(p), size - 2 * SIZE_SZ);
@@ -3887,35 +3868,34 @@  _int_free (mstate av, mchunkptr p, int have_lock)
      set_fastchunks(av);
      unsigned int idx = fastbin_index(size);
      fb = &fastbin (av, idx);
+    mchunkptr old = *fb;

-    /* Atomically link P to its fastbin: P->FD = *FB; *FB = P;  */
-    mchunkptr old = *fb, old2;
-    unsigned int old_idx = ~0u;
-    do
+    /* Check that the top of the bin is not the record we are going to add
+       (i.e., double free).  */
+    if (__builtin_expect (old == p, 0))
        {
-	/* Check that the top of the bin is not the record we are going to add
-	   (i.e., double free).  */
-	if (__builtin_expect (old == p, 0))
-	  {
-	    errstr = "double free or corruption (fasttop)";
-	    goto errout;
-	  }
-	/* Check that size of fastbin chunk at the top is the same as
-	   size of the chunk that we are adding.  We can dereference OLD
-	   only if we have the lock, otherwise it might have already been
-	   deallocated.  See use of OLD_IDX below for the actual check.  */
-	if (have_lock && old != NULL)
-	  old_idx = fastbin_index(chunksize(old));
-	p->fd = old2 = old;
+	errstr = "double free or corruption (fasttop)";
+	goto errout;
        }
-    while ((old = catomic_compare_and_exchange_val_rel (fb, p, old2)) != old2);

-    if (have_lock && old != NULL && __builtin_expect (old_idx != idx, 0))
+    /* Check that size of fastbin chunk at the top is the same as
+       size of the chunk that we are adding.  */
+    if (old != NULL && __builtin_expect (fastbin_index(chunksize(old)) != idx, 0))
        {
  	errstr = "invalid fastbin entry (free)";
  	goto errout;
        }
-  }
+
+    /* Link P to its fastbin: P->FD = *FB; *FB = P;  */
+    p->fd = old;
+    *fb = p;
+
+    if (!have_lock)
+      {
+	(void) mutex_unlock(&av->mutex);
+	locked = 0;
+      }
+   }

    /*
      Consolidate other non-mmapped chunks as they arrive.
@@ -4121,7 +4101,8 @@  static void malloc_consolidate(mstate av)
      maxfb = &fastbin (av, NFASTBINS - 1);
      fb = &fastbin (av, 0);
      do {
-      p = atomic_exchange_acq (fb, 0);
+      p = *fb;
+      *fb = NULL;
        if (p != 0) {
  	do {
  	  check_inuse_chunk(av, p);