diff mbox

Linux: Implement membarrier function

Message ID Pine.LNX.4.44L0.1812101113430.1562-100000@iolanthe.rowland.org
State New
Headers show

Commit Message

Alan Stern Dec. 10, 2018, 4:22 p.m. UTC
On Thu, 6 Dec 2018, Paul E. McKenney wrote:

> Hello, David,
> 
> I took a crack at extending LKMM to accommodate what I think would
> support what you have in your paper.  Please see the very end of this
> email for a patch against the "dev" branch of my -rcu tree.
> 
> This gives the expected result for the following three litmus tests,
> but is probably deficient or otherwise misguided in other ways.  I have
> added the LKMM maintainers on CC for their amusement.  ;-)
> 
> Thoughts?

Since sys_membarrier() provides a heavyweight barrier comparable to 
synchronize_rcu(), the memory model should treat the two in the same 
way.  That's what this patch does.

The corresponding critical section would be any region of code bounded
by compiler barriers.  Since the LKMM doesn't currently handle plain
accesses, the effect is the same as if a compiler barrier were present
between each pair of instructions.  Basically, each instruction acts as
its own critical section.  Therefore the patch below defines memb-rscsi
as the trivial identity relation.  When plain accesses and compiler 
barriers are added to the memory model, a different definition will be 
needed.

This gives the correct results for the three C-Goldblat-memb-* litmus 
tests in Paul's email.

Alan

PS: The patch below is meant to apply on top of the SRCU patches, which
are not yet in the mainline kernel.

Comments

Paul E. McKenney Dec. 10, 2018, 6:25 p.m. UTC | #1
On Mon, Dec 10, 2018 at 11:22:31AM -0500, Alan Stern wrote:
> On Thu, 6 Dec 2018, Paul E. McKenney wrote:
> 
> > Hello, David,
> > 
> > I took a crack at extending LKMM to accommodate what I think would
> > support what you have in your paper.  Please see the very end of this
> > email for a patch against the "dev" branch of my -rcu tree.
> > 
> > This gives the expected result for the following three litmus tests,
> > but is probably deficient or otherwise misguided in other ways.  I have
> > added the LKMM maintainers on CC for their amusement.  ;-)
> > 
> > Thoughts?
> 
> Since sys_membarrier() provides a heavyweight barrier comparable to 
> synchronize_rcu(), the memory model should treat the two in the same 
> way.  That's what this patch does.
> 
> The corresponding critical section would be any region of code bounded
> by compiler barriers.  Since the LKMM doesn't currently handle plain
> accesses, the effect is the same as if a compiler barrier were present
> between each pair of instructions.  Basically, each instruction acts as
> its own critical section.  Therefore the patch below defines memb-rscsi
> as the trivial identity relation.  When plain accesses and compiler 
> barriers are added to the memory model, a different definition will be 
> needed.
> 
> This gives the correct results for the three C-Goldblat-memb-* litmus 
> tests in Paul's email.

Yow!!!

My first reaction was that this cannot possibly be correct because
sys_membarrier(), which is probably what we should call it, does not
wait for anything.  But your formulation has the corresponding readers
being "id", which as you say above is just a single event.

But what makes this work for the following litmus test?

------------------------------------------------------------------------

C membrcu

{
}

P0(intptr_t *x0, intptr_t *x1)
{
	WRITE_ONCE(*x0, 2);
	smp_memb();
	intptr_t r2 = READ_ONCE(*x1);
}


P1(intptr_t *x1, intptr_t *x2)
{
	WRITE_ONCE(*x1, 2);
	smp_memb();
	intptr_t r2 = READ_ONCE(*x2);
}


P2(intptr_t *x2, intptr_t *x3)
{
	WRITE_ONCE(*x2, 2);
	smp_memb();
	intptr_t r2 = READ_ONCE(*x3);
}


P3(intptr_t *x3, intptr_t *x4)
{
	rcu_read_lock();
	WRITE_ONCE(*x3, 2);
	intptr_t r2 = READ_ONCE(*x4);
	rcu_read_unlock();
}


P4(intptr_t *x4, intptr_t *x5)
{
	rcu_read_lock();
	WRITE_ONCE(*x4, 2);
	intptr_t r2 = READ_ONCE(*x5);
	rcu_read_unlock();
}


P5(intptr_t *x0, intptr_t *x5)
{
	rcu_read_lock();
	WRITE_ONCE(*x5, 2);
	intptr_t r2 = READ_ONCE(*x0);
	rcu_read_unlock();
}

exists
(5:r2=0 /\ 0:r2=0 /\ 1:r2=0 /\ 2:r2=0 /\ 3:r2=0 /\ 4:r2=0)

------------------------------------------------------------------------

For this, herd gives "Never".  Of course, if I reverse the write and
read in any of P3(), P4(), or P5(), I get "Sometimes", which does make
sense.  But what is preserving the order between P3() and P4() and
between P4() and P5()?  I am not immediately seeing how the analogy
with RCU carries over to this case.

							Thanx, Paul

> Alan
> 
> PS: The patch below is meant to apply on top of the SRCU patches, which
> are not yet in the mainline kernel.
> 
> 
> 
> Index: usb-4.x/tools/memory-model/linux-kernel.bell
> ===================================================================
> --- usb-4.x.orig/tools/memory-model/linux-kernel.bell
> +++ usb-4.x/tools/memory-model/linux-kernel.bell
> @@ -24,6 +24,7 @@ instructions RMW[{'once,'acquire,'releas
>  enum Barriers = 'wmb (*smp_wmb*) ||
>  		'rmb (*smp_rmb*) ||
>  		'mb (*smp_mb*) ||
> +		'memb (*sys_membarrier*) ||
>  		'rcu-lock (*rcu_read_lock*)  ||
>  		'rcu-unlock (*rcu_read_unlock*) ||
>  		'sync-rcu (*synchronize_rcu*) ||
> Index: usb-4.x/tools/memory-model/linux-kernel.cat
> ===================================================================
> --- usb-4.x.orig/tools/memory-model/linux-kernel.cat
> +++ usb-4.x/tools/memory-model/linux-kernel.cat
> @@ -33,7 +33,7 @@ let mb = ([M] ; fencerel(Mb) ; [M]) |
>  	([M] ; po? ; [LKW] ; fencerel(After-spinlock) ; [M]) |
>  	([M] ; po ; [UL] ; (co | po) ; [LKW] ;
>  		fencerel(After-unlock-lock) ; [M])
> -let gp = po ; [Sync-rcu | Sync-srcu] ; po?
> +let gp = po ; [Sync-rcu | Sync-srcu | Memb] ; po?
>  
>  let strong-fence = mb | gp
>  
> @@ -102,8 +102,10 @@ acyclic pb as propagation
>   *)
>  let rcu-gp = [Sync-rcu]		(* Compare with gp *)
>  let srcu-gp = [Sync-srcu]
> +let memb-gp = [Memb]
>  let rcu-rscsi = rcu-rscs^-1
>  let srcu-rscsi = srcu-rscs^-1
> +let memb-rscsi = id
>  
>  (*
>   * The synchronize_rcu() strong fence is special in that it can order not
> @@ -119,15 +121,19 @@ let rcu-link = po? ; hb* ; pb* ; prop ;
>   * the synchronize_srcu() and srcu_read_[un]lock() calls refer to the same
>   * struct srcu_struct location.
>   *)
> -let rec rcu-fence = rcu-gp | srcu-gp |
> +let rec rcu-fence = rcu-gp | srcu-gp | memb-gp |
>  	(rcu-gp ; rcu-link ; rcu-rscsi) |
>  	((srcu-gp ; rcu-link ; srcu-rscsi) & loc) |
> +	(memb-gp ; rcu-link ; memb-rscsi) |
>  	(rcu-rscsi ; rcu-link ; rcu-gp) |
>  	((srcu-rscsi ; rcu-link ; srcu-gp) & loc) |
> +	(memb-rscsi ; rcu-link ; memb-gp) |
>  	(rcu-gp ; rcu-link ; rcu-fence ; rcu-link ; rcu-rscsi) |
>  	((srcu-gp ; rcu-link ; rcu-fence ; rcu-link ; srcu-rscsi) & loc) |
> +	(memb-gp ; rcu-link ; rcu-fence ; rcu-link ; memb-rscsi) |
>  	(rcu-rscsi ; rcu-link ; rcu-fence ; rcu-link ; rcu-gp) |
>  	((srcu-rscsi ; rcu-link ; rcu-fence ; rcu-link ; srcu-gp) & loc) |
> +	(memb-rscsi ; rcu-link ; rcu-fence ; rcu-link ; memb-gp) |
>  	(rcu-fence ; rcu-link ; rcu-fence)
>  
>  (* rb orders instructions just as pb does *)
> Index: usb-4.x/tools/memory-model/linux-kernel.def
> ===================================================================
> --- usb-4.x.orig/tools/memory-model/linux-kernel.def
> +++ usb-4.x/tools/memory-model/linux-kernel.def
> @@ -20,6 +20,7 @@ smp_store_mb(X,V) { __store{once}(X,V);
>  smp_mb() { __fence{mb}; }
>  smp_rmb() { __fence{rmb}; }
>  smp_wmb() { __fence{wmb}; }
> +smp_memb() { __fence{memb}; }
>  smp_mb__before_atomic() { __fence{before-atomic}; }
>  smp_mb__after_atomic() { __fence{after-atomic}; }
>  smp_mb__after_spinlock() { __fence{after-spinlock}; }
>
Alan Stern Dec. 11, 2018, 4:21 p.m. UTC | #2
On Mon, 10 Dec 2018, Paul E. McKenney wrote:

> On Mon, Dec 10, 2018 at 11:22:31AM -0500, Alan Stern wrote:
> > On Thu, 6 Dec 2018, Paul E. McKenney wrote:
> > 
> > > Hello, David,
> > > 
> > > I took a crack at extending LKMM to accommodate what I think would
> > > support what you have in your paper.  Please see the very end of this
> > > email for a patch against the "dev" branch of my -rcu tree.
> > > 
> > > This gives the expected result for the following three litmus tests,
> > > but is probably deficient or otherwise misguided in other ways.  I have
> > > added the LKMM maintainers on CC for their amusement.  ;-)
> > > 
> > > Thoughts?
> > 
> > Since sys_membarrier() provides a heavyweight barrier comparable to 
> > synchronize_rcu(), the memory model should treat the two in the same 
> > way.  That's what this patch does.
> > 
> > The corresponding critical section would be any region of code bounded
> > by compiler barriers.  Since the LKMM doesn't currently handle plain
> > accesses, the effect is the same as if a compiler barrier were present
> > between each pair of instructions.  Basically, each instruction acts as
> > its own critical section.  Therefore the patch below defines memb-rscsi
> > as the trivial identity relation.  When plain accesses and compiler 
> > barriers are added to the memory model, a different definition will be 
> > needed.
> > 
> > This gives the correct results for the three C-Goldblat-memb-* litmus 
> > tests in Paul's email.
> 
> Yow!!!
> 
> My first reaction was that this cannot possibly be correct because
> sys_membarrier(), which is probably what we should call it, does not
> wait for anything.  But your formulation has the corresponding readers
> being "id", which as you say above is just a single event.
> 
> But what makes this work for the following litmus test?
> 
> ------------------------------------------------------------------------
> 
> C membrcu
> 
> {
> }
> 
> P0(intptr_t *x0, intptr_t *x1)
> {
> 	WRITE_ONCE(*x0, 2);
> 	smp_memb();
> 	intptr_t r2 = READ_ONCE(*x1);
> }
> 
> 
> P1(intptr_t *x1, intptr_t *x2)
> {
> 	WRITE_ONCE(*x1, 2);
> 	smp_memb();
> 	intptr_t r2 = READ_ONCE(*x2);
> }
> 
> 
> P2(intptr_t *x2, intptr_t *x3)
> {
> 	WRITE_ONCE(*x2, 2);
> 	smp_memb();
> 	intptr_t r2 = READ_ONCE(*x3);
> }
> 
> 
> P3(intptr_t *x3, intptr_t *x4)
> {
> 	rcu_read_lock();
> 	WRITE_ONCE(*x3, 2);
> 	intptr_t r2 = READ_ONCE(*x4);
> 	rcu_read_unlock();
> }
> 
> 
> P4(intptr_t *x4, intptr_t *x5)
> {
> 	rcu_read_lock();
> 	WRITE_ONCE(*x4, 2);
> 	intptr_t r2 = READ_ONCE(*x5);
> 	rcu_read_unlock();
> }
> 
> 
> P5(intptr_t *x0, intptr_t *x5)
> {
> 	rcu_read_lock();
> 	WRITE_ONCE(*x5, 2);
> 	intptr_t r2 = READ_ONCE(*x0);
> 	rcu_read_unlock();
> }
> 
> exists
> (5:r2=0 /\ 0:r2=0 /\ 1:r2=0 /\ 2:r2=0 /\ 3:r2=0 /\ 4:r2=0)
> 
> ------------------------------------------------------------------------
> 
> For this, herd gives "Never".  Of course, if I reverse the write and
> read in any of P3(), P4(), or P5(), I get "Sometimes", which does make
> sense.  But what is preserving the order between P3() and P4() and
> between P4() and P5()?  I am not immediately seeing how the analogy
> with RCU carries over to this case.

That isn't how it works.  Nothing preserves the orders you mentioned.
It's more like: the order between P1 and P4 is preserved, as is the
order between P0 and P5.  You'll see below...

(I readily agree that this result is not simple or obvious.  It took me
quite a while to formulate the following analysis.)

To begin with, since there aren't any synchronize_rcu calls in the
test, the rcu_read_lock and rcu_read_unlock calls do nothing.  They
can be eliminated.

Also, I find the variable names "x0" - "x5" to be a little hard to
work with.  If you don't mind, I'll replace them with "a" - "f".

Now, a little digression on how sys_membarrier works.  It starts by
executing a full memory barrier.  Then it injects memory barriers into
the instruction streams of all the other CPUs and waits for them all
to complete.  Then it executes an ending memory barrier.

These barriers are ordered as described.  Therefore we have

	mb0s < mb05 < mb0e,
	mb1s < mb14 < mb1e,  and
	mb2s < mb23 < mb2e,

where mb0s is the starting barrier of the sys_memb call on P0, mb05 is
the barrier that it injects into P5, mb0e is the ending barrier of the
call, and similarly for the other sys_memb calls.  The '<' signs mean
that the thing on their left finishes before the thing on their right
does.

Rewriting the litmus test in these terms gives:

        P0      P1      P2      P3      P4      P5
        Wa=2    Wb=2    Wc=2    [mb23]  [mb14]  [mb05]
        mb0s    mb1s    mb2s    Wd=2    We=2    Wf=2
        mb0e    mb1e    mb2e    Re=0    Rf=0    Ra=0
        Rb=0    Rc=0    Rd=0

Here the brackets in "[mb23]", "[mb14]", and "[mb05]" mean that the
positions of these barriers in their respective threads' program
orderings is undetermined; they need not come at the top as shown.

(Also, in case David is unfamiliar with it, the "Wa=2" notation is
shorthand for "Write 2 to a" and "Rb=0" is short for "Read 0 from b".)

Finally, here are a few facts which may be well known and obvious, but
I'll state them anyway:

	A CPU cannot reorder instructions across a memory barrier.
	If x is po-after a barrier then x executes after the barrier
	is finished.

	If a store is po-before a barrier then the store propagates
	to every CPU before the barrier finishes.

	If a store propagates to some CPU before a load on that CPU
	reads from the same location, then the load will obtain the
	value from that store or a co-later store.  This implies that
	if a load obtains a value co-earlier than some store then the
	load must have executed before the store propagated to the
	load's CPU.

The proof consists of three main stages, each requiring three steps.
Using the facts that b - f are all read as 0, I'll show that P1
executes Rc before P3 executes Re, then that P0 executes Rb before P4
executes Rf, and lastly that P5's Ra must obtain 2, not 0.  This will
demonstrate that the litmus test is not allowed.

1.	Suppose that mb23 ends up coming po-later than Wd in P3.
	Then we would have:

		Wd propagates to P2 < mb23 < mb2e < Rd,

	and so Rd would obtain 2, not 0.  Hence mb23 must come
	po-before Wd (as shown in the listing):  mb23 < Wd.

2.	Since mb23 therefore occurs po-before Re and instructions
	cannot be reordered across barriers,  mb23 < Re.

3.	Since Rc obtains 0, we must have:

		Rc < Wc propagates to P1 < mb2s < mb23 < Re.

	Thus Rc < Re.

4.	Suppose that mb14 ends up coming po-later than We in P4.
	Then we would have:

		We propagates to P3 < mb14 < mb1e < Rc < Re,

	and so Re would obtain 2, not 0.  Hence mb14 must come
	po-before We (as shown in the listing):  mb14 < We.

5.	Since mb14 therefore occurs po-before Rf and instructions
	cannot be reordered across barriers,  mb14 < Rf.

6.	Since Rb obtains 0, we must have:

		Rb < Wb propagates to P0 < mb1s < mb14 < Rf.

	Thus Rb < Rf.

7.	Suppose that mb05 ends up coming po-later than Wf in P5.
	Then we would have:

		Wf propagates to P4 < mb05 < mb0e < Rb < Rf,

	and so Rf would obtain 2, not 0.  Hence mb05 must come
	po-before Wf (as shown in the listing):  mb05 < Wf.

8.	Since mb05 therefore occurs po-before Ra and instructions
	cannot be reordered across barriers,  mb05 < Ra.

9.	Now we have:

		Wa propagates to P5 < mb0s < mb05 < Ra,

	and so Ra must obtain 2, not 0.  QED.

Alan
Paul E. McKenney Dec. 11, 2018, 7:08 p.m. UTC | #3
On Tue, Dec 11, 2018 at 11:21:15AM -0500, Alan Stern wrote:
> On Mon, 10 Dec 2018, Paul E. McKenney wrote:
> 
> > On Mon, Dec 10, 2018 at 11:22:31AM -0500, Alan Stern wrote:
> > > On Thu, 6 Dec 2018, Paul E. McKenney wrote:
> > > 
> > > > Hello, David,
> > > > 
> > > > I took a crack at extending LKMM to accommodate what I think would
> > > > support what you have in your paper.  Please see the very end of this
> > > > email for a patch against the "dev" branch of my -rcu tree.
> > > > 
> > > > This gives the expected result for the following three litmus tests,
> > > > but is probably deficient or otherwise misguided in other ways.  I have
> > > > added the LKMM maintainers on CC for their amusement.  ;-)
> > > > 
> > > > Thoughts?
> > > 
> > > Since sys_membarrier() provides a heavyweight barrier comparable to 
> > > synchronize_rcu(), the memory model should treat the two in the same 
> > > way.  That's what this patch does.
> > > 
> > > The corresponding critical section would be any region of code bounded
> > > by compiler barriers.  Since the LKMM doesn't currently handle plain
> > > accesses, the effect is the same as if a compiler barrier were present
> > > between each pair of instructions.  Basically, each instruction acts as
> > > its own critical section.  Therefore the patch below defines memb-rscsi
> > > as the trivial identity relation.  When plain accesses and compiler 
> > > barriers are added to the memory model, a different definition will be 
> > > needed.
> > > 
> > > This gives the correct results for the three C-Goldblat-memb-* litmus 
> > > tests in Paul's email.
> > 
> > Yow!!!
> > 
> > My first reaction was that this cannot possibly be correct because
> > sys_membarrier(), which is probably what we should call it, does not
> > wait for anything.  But your formulation has the corresponding readers
> > being "id", which as you say above is just a single event.
> > 
> > But what makes this work for the following litmus test?
> > 
> > ------------------------------------------------------------------------
> > 
> > C membrcu
> > 
> > {
> > }
> > 
> > P0(intptr_t *x0, intptr_t *x1)
> > {
> > 	WRITE_ONCE(*x0, 2);
> > 	smp_memb();
> > 	intptr_t r2 = READ_ONCE(*x1);
> > }
> > 
> > 
> > P1(intptr_t *x1, intptr_t *x2)
> > {
> > 	WRITE_ONCE(*x1, 2);
> > 	smp_memb();
> > 	intptr_t r2 = READ_ONCE(*x2);
> > }
> > 
> > 
> > P2(intptr_t *x2, intptr_t *x3)
> > {
> > 	WRITE_ONCE(*x2, 2);
> > 	smp_memb();
> > 	intptr_t r2 = READ_ONCE(*x3);
> > }
> > 
> > 
> > P3(intptr_t *x3, intptr_t *x4)
> > {
> > 	rcu_read_lock();
> > 	WRITE_ONCE(*x3, 2);
> > 	intptr_t r2 = READ_ONCE(*x4);
> > 	rcu_read_unlock();
> > }
> > 
> > 
> > P4(intptr_t *x4, intptr_t *x5)
> > {
> > 	rcu_read_lock();
> > 	WRITE_ONCE(*x4, 2);
> > 	intptr_t r2 = READ_ONCE(*x5);
> > 	rcu_read_unlock();
> > }
> > 
> > 
> > P5(intptr_t *x0, intptr_t *x5)
> > {
> > 	rcu_read_lock();
> > 	WRITE_ONCE(*x5, 2);
> > 	intptr_t r2 = READ_ONCE(*x0);
> > 	rcu_read_unlock();
> > }
> > 
> > exists
> > (5:r2=0 /\ 0:r2=0 /\ 1:r2=0 /\ 2:r2=0 /\ 3:r2=0 /\ 4:r2=0)
> > 
> > ------------------------------------------------------------------------
> > 
> > For this, herd gives "Never".  Of course, if I reverse the write and
> > read in any of P3(), P4(), or P5(), I get "Sometimes", which does make
> > sense.  But what is preserving the order between P3() and P4() and
> > between P4() and P5()?  I am not immediately seeing how the analogy
> > with RCU carries over to this case.
> 
> That isn't how it works.  Nothing preserves the orders you mentioned.
> It's more like: the order between P1 and P4 is preserved, as is the
> order between P0 and P5.  You'll see below...
> 
> (I readily agree that this result is not simple or obvious.  It took me
> quite a while to formulate the following analysis.)

For whatever it is worth, David Goldblatt agrees with you to at
least some extent.  I have sent him an inquiry.  ;-)

> To begin with, since there aren't any synchronize_rcu calls in the
> test, the rcu_read_lock and rcu_read_unlock calls do nothing.  They
> can be eliminated.

Agreed.  I was just being lazy.

> Also, I find the variable names "x0" - "x5" to be a little hard to
> work with.  If you don't mind, I'll replace them with "a" - "f".

Easy enough to translate, so have at it!

> Now, a little digression on how sys_membarrier works.  It starts by
> executing a full memory barrier.  Then it injects memory barriers into
> the instruction streams of all the other CPUs and waits for them all
> to complete.  Then it executes an ending memory barrier.
> 
> These barriers are ordered as described.  Therefore we have
> 
> 	mb0s < mb05 < mb0e,
> 	mb1s < mb14 < mb1e,  and
> 	mb2s < mb23 < mb2e,
> 
> where mb0s is the starting barrier of the sys_memb call on P0, mb05 is
> the barrier that it injects into P5, mb0e is the ending barrier of the
> call, and similarly for the other sys_memb calls.  The '<' signs mean
> that the thing on their left finishes before the thing on their right
> does.
> 
> Rewriting the litmus test in these terms gives:
> 
>         P0      P1      P2      P3      P4      P5
>         Wa=2    Wb=2    Wc=2    [mb23]  [mb14]  [mb05]
>         mb0s    mb1s    mb2s    Wd=2    We=2    Wf=2
>         mb0e    mb1e    mb2e    Re=0    Rf=0    Ra=0
>         Rb=0    Rc=0    Rd=0
> 
> Here the brackets in "[mb23]", "[mb14]", and "[mb05]" mean that the
> positions of these barriers in their respective threads' program
> orderings is undetermined; they need not come at the top as shown.
> 
> (Also, in case David is unfamiliar with it, the "Wa=2" notation is
> shorthand for "Write 2 to a" and "Rb=0" is short for "Read 0 from b".)
> 
> Finally, here are a few facts which may be well known and obvious, but
> I'll state them anyway:
> 
> 	A CPU cannot reorder instructions across a memory barrier.
> 	If x is po-after a barrier then x executes after the barrier
> 	is finished.
> 
> 	If a store is po-before a barrier then the store propagates
> 	to every CPU before the barrier finishes.
> 
> 	If a store propagates to some CPU before a load on that CPU
> 	reads from the same location, then the load will obtain the
> 	value from that store or a co-later store.  This implies that
> 	if a load obtains a value co-earlier than some store then the
> 	load must have executed before the store propagated to the
> 	load's CPU.
> 
> The proof consists of three main stages, each requiring three steps.
> Using the facts that b - f are all read as 0, I'll show that P1
> executes Rc before P3 executes Re, then that P0 executes Rb before P4
> executes Rf, and lastly that P5's Ra must obtain 2, not 0.  This will
> demonstrate that the litmus test is not allowed.
> 
> 1.	Suppose that mb23 ends up coming po-later than Wd in P3.
> 	Then we would have:
> 
> 		Wd propagates to P2 < mb23 < mb2e < Rd,
> 
> 	and so Rd would obtain 2, not 0.  Hence mb23 must come
> 	po-before Wd (as shown in the listing):  mb23 < Wd.
> 
> 2.	Since mb23 therefore occurs po-before Re and instructions
> 	cannot be reordered across barriers,  mb23 < Re.
> 
> 3.	Since Rc obtains 0, we must have:
> 
> 		Rc < Wc propagates to P1 < mb2s < mb23 < Re.
> 
> 	Thus Rc < Re.
> 
> 4.	Suppose that mb14 ends up coming po-later than We in P4.
> 	Then we would have:
> 
> 		We propagates to P3 < mb14 < mb1e < Rc < Re,
> 
> 	and so Re would obtain 2, not 0.  Hence mb14 must come
> 	po-before We (as shown in the listing):  mb14 < We.
> 
> 5.	Since mb14 therefore occurs po-before Rf and instructions
> 	cannot be reordered across barriers,  mb14 < Rf.
> 
> 6.	Since Rb obtains 0, we must have:
> 
> 		Rb < Wb propagates to P0 < mb1s < mb14 < Rf.
> 
> 	Thus Rb < Rf.
> 
> 7.	Suppose that mb05 ends up coming po-later than Wf in P5.
> 	Then we would have:
> 
> 		Wf propagates to P4 < mb05 < mb0e < Rb < Rf,
> 
> 	and so Rf would obtain 2, not 0.  Hence mb05 must come
> 	po-before Wf (as shown in the listing):  mb05 < Wf.
> 
> 8.	Since mb05 therefore occurs po-before Ra and instructions
> 	cannot be reordered across barriers,  mb05 < Ra.
> 
> 9.	Now we have:
> 
> 		Wa propagates to P5 < mb0s < mb05 < Ra,
> 
> 	and so Ra must obtain 2, not 0.  QED.

Like this, then, with maximal reordering of P3-P5's reads?

         P0      P1      P2      P3      P4      P5
         Wa=2
         mb0s
                                                 [mb05]
         mb0e                                    Ra=0
         Rb=0    Wb=2
                 mb1s
                                         [mb14]
                 mb1e                    Rf=0
                 Rc=0    Wc=2                    Wf=2
                         mb2s
                                 [mb23]
                         mb2e    Re=0
                         Rd=0            We=2
                                 Wd=2

But don't the sys_membarrier() calls affect everyone, especially given
the shared-variable communication?  If so, why wouldn't this more strict
variant hold?

         P0      P1      P2      P3      P4      P5
         Wa=2
         mb0s
                                 [mb05]  [mb05]  [mb05]
         mb0e
         Rb=0    Wb=2
                 mb1s
                                 [mb14]  [mb14]  [mb14]
                 mb1e
                 Rc=0    Wc=2
                         mb2s
                                 [mb23]  [mb23]  [mb23]
                         mb2e    Re=0    Rf=0    Ra=0
                         Rd=0            We=2    Wf=2
                                 Wd=2

In which case, wouldn't this cycle be forbidden even if it had only one
sys_membarrier() call?

Ah, but the IPIs are not necessarily synchronized across the CPUs,
so that the following could happen:

         P0      P1      P2      P3      P4      P5
         Wa=2
         mb0s
                                 [mb05]  [mb05]  [mb05]
         mb0e                                    Ra=0
         Rb=0    Wb=2
                 mb1s
                                 [mb14]  [mb14]
                                         Rf=0
                                                 Wf=2
                                                 [mb14]
                 mb1e
                 Rc=0    Wc=2
                         mb2s
                                 [mb23]
                                 Re=0
                                         We=2
                                         [mb23]  [mb23]
                         mb2e
                         Rd=0
                                 Wd=2

I guess in light of this post in 2001, I really don't have an excuse,
do I?  ;-)

	https://lists.gt.net/linux/kernel/223555

Or am I still missing something here?

							Thanx, Paul
Alan Stern Dec. 11, 2018, 8:09 p.m. UTC | #4
On Tue, 11 Dec 2018, Paul E. McKenney wrote:

> > Rewriting the litmus test in these terms gives:
> > 
> >         P0      P1      P2      P3      P4      P5
> >         Wa=2    Wb=2    Wc=2    [mb23]  [mb14]  [mb05]
> >         mb0s    mb1s    mb2s    Wd=2    We=2    Wf=2
> >         mb0e    mb1e    mb2e    Re=0    Rf=0    Ra=0
> >         Rb=0    Rc=0    Rd=0
> > 
> > Here the brackets in "[mb23]", "[mb14]", and "[mb05]" mean that the
> > positions of these barriers in their respective threads' program
> > orderings is undetermined; they need not come at the top as shown.
> > 
> > (Also, in case David is unfamiliar with it, the "Wa=2" notation is
> > shorthand for "Write 2 to a" and "Rb=0" is short for "Read 0 from b".)
> > 
> > Finally, here are a few facts which may be well known and obvious, but
> > I'll state them anyway:
> > 
> > 	A CPU cannot reorder instructions across a memory barrier.
> > 	If x is po-after a barrier then x executes after the barrier
> > 	is finished.
> > 
> > 	If a store is po-before a barrier then the store propagates
> > 	to every CPU before the barrier finishes.
> > 
> > 	If a store propagates to some CPU before a load on that CPU
> > 	reads from the same location, then the load will obtain the
> > 	value from that store or a co-later store.  This implies that
> > 	if a load obtains a value co-earlier than some store then the
> > 	load must have executed before the store propagated to the
> > 	load's CPU.
> > 
> > The proof consists of three main stages, each requiring three steps.
> > Using the facts that b - f are all read as 0, I'll show that P1
> > executes Rc before P3 executes Re, then that P0 executes Rb before P4
> > executes Rf, and lastly that P5's Ra must obtain 2, not 0.  This will
> > demonstrate that the litmus test is not allowed.
> > 
> > 1.	Suppose that mb23 ends up coming po-later than Wd in P3.
> > 	Then we would have:
> > 
> > 		Wd propagates to P2 < mb23 < mb2e < Rd,
> > 
> > 	and so Rd would obtain 2, not 0.  Hence mb23 must come
> > 	po-before Wd (as shown in the listing):  mb23 < Wd.
> > 
> > 2.	Since mb23 therefore occurs po-before Re and instructions
> > 	cannot be reordered across barriers,  mb23 < Re.
> > 
> > 3.	Since Rc obtains 0, we must have:
> > 
> > 		Rc < Wc propagates to P1 < mb2s < mb23 < Re.
> > 
> > 	Thus Rc < Re.
> > 
> > 4.	Suppose that mb14 ends up coming po-later than We in P4.
> > 	Then we would have:
> > 
> > 		We propagates to P3 < mb14 < mb1e < Rc < Re,
> > 
> > 	and so Re would obtain 2, not 0.  Hence mb14 must come
> > 	po-before We (as shown in the listing):  mb14 < We.
> > 
> > 5.	Since mb14 therefore occurs po-before Rf and instructions
> > 	cannot be reordered across barriers,  mb14 < Rf.
> > 
> > 6.	Since Rb obtains 0, we must have:
> > 
> > 		Rb < Wb propagates to P0 < mb1s < mb14 < Rf.
> > 
> > 	Thus Rb < Rf.
> > 
> > 7.	Suppose that mb05 ends up coming po-later than Wf in P5.
> > 	Then we would have:
> > 
> > 		Wf propagates to P4 < mb05 < mb0e < Rb < Rf,
> > 
> > 	and so Rf would obtain 2, not 0.  Hence mb05 must come
> > 	po-before Wf (as shown in the listing):  mb05 < Wf.
> > 
> > 8.	Since mb05 therefore occurs po-before Ra and instructions
> > 	cannot be reordered across barriers,  mb05 < Ra.
> > 
> > 9.	Now we have:
> > 
> > 		Wa propagates to P5 < mb0s < mb05 < Ra,
> > 
> > 	and so Ra must obtain 2, not 0.  QED.
> 
> Like this, then, with maximal reordering of P3-P5's reads?
> 
>          P0      P1      P2      P3      P4      P5
>          Wa=2
>          mb0s
>                                                  [mb05]
>          mb0e                                    Ra=0
>          Rb=0    Wb=2
>                  mb1s
>                                          [mb14]
>                  mb1e                    Rf=0
>                  Rc=0    Wc=2                    Wf=2
>                          mb2s
>                                  [mb23]
>                          mb2e    Re=0
>                          Rd=0            We=2
>                                  Wd=2

Yes, that's right.  This shows how P5's Ra must obtain 2 instead of 0.

> But don't the sys_membarrier() calls affect everyone, especially given
> the shared-variable communication?

They do, but the other effects are irrelevant for this proof.

>  If so, why wouldn't this more strict
> variant hold?
> 
>          P0      P1      P2      P3      P4      P5
>          Wa=2
>          mb0s
>                                  [mb05]  [mb05]  [mb05]

You have misunderstood the naming scheme.  mb05 is the barrier injected 
by P0's sys_membarrier call into P5.  So the three barriers above 
should be named "mb03", "mb04", and "mb05".  And you left out mb01 and 
mb02.

>          mb0e
>          Rb=0    Wb=2
>                  mb1s
>                                  [mb14]  [mb14]  [mb14]
>                  mb1e
>                  Rc=0    Wc=2
>                          mb2s
>                                  [mb23]  [mb23]  [mb23]
>                          mb2e    Re=0    Rf=0    Ra=0
>                          Rd=0            We=2    Wf=2
>                                  Wd=2

Yes, this does hold.  But since it doesn't affect the end result, 
there's no point in mentioning all those other barriers.

> In which case, wouldn't this cycle be forbidden even if it had only one
> sys_membarrier() call?

No, it wouldn't.  I don't understand why you might think it would.  

This is just like RCU, if you imagine a tiny critical section between 
each adjacent pair of instructions.  You wouldn't expect RCU to enforce 
ordering among six CPUs with only one synchronize_rcu call.

> Ah, but the IPIs are not necessarily synchronized across the CPUs,
> so that the following could happen:
> 
>          P0      P1      P2      P3      P4      P5
>          Wa=2
>          mb0s
>                                  [mb05]  [mb05]  [mb05]
>          mb0e                                    Ra=0
>          Rb=0    Wb=2
>                  mb1s
>                                  [mb14]  [mb14]
>                                          Rf=0
>                                                  Wf=2
>                                                  [mb14]
>                  mb1e
>                  Rc=0    Wc=2
>                          mb2s
>                                  [mb23]
>                                  Re=0
>                                          We=2
>                                          [mb23]  [mb23]
>                          mb2e
>                          Rd=0
>                                  Wd=2

Yes it could.  But even in this execution you would end up with Ra=2 
instead of Ra=0.

> I guess in light of this post in 2001, I really don't have an excuse,
> do I?  ;-)
> 
> 	https://lists.gt.net/linux/kernel/223555
> 
> Or am I still missing something here?

You tell me...

Alan
Paul E. McKenney Dec. 11, 2018, 9:22 p.m. UTC | #5
On Tue, Dec 11, 2018 at 03:09:33PM -0500, Alan Stern wrote:
> On Tue, 11 Dec 2018, Paul E. McKenney wrote:
> 
> > > Rewriting the litmus test in these terms gives:
> > > 
> > >         P0      P1      P2      P3      P4      P5
> > >         Wa=2    Wb=2    Wc=2    [mb23]  [mb14]  [mb05]
> > >         mb0s    mb1s    mb2s    Wd=2    We=2    Wf=2
> > >         mb0e    mb1e    mb2e    Re=0    Rf=0    Ra=0
> > >         Rb=0    Rc=0    Rd=0
> > > 
> > > Here the brackets in "[mb23]", "[mb14]", and "[mb05]" mean that the
> > > positions of these barriers in their respective threads' program
> > > orderings is undetermined; they need not come at the top as shown.
> > > 
> > > (Also, in case David is unfamiliar with it, the "Wa=2" notation is
> > > shorthand for "Write 2 to a" and "Rb=0" is short for "Read 0 from b".)
> > > 
> > > Finally, here are a few facts which may be well known and obvious, but
> > > I'll state them anyway:
> > > 
> > > 	A CPU cannot reorder instructions across a memory barrier.
> > > 	If x is po-after a barrier then x executes after the barrier
> > > 	is finished.
> > > 
> > > 	If a store is po-before a barrier then the store propagates
> > > 	to every CPU before the barrier finishes.
> > > 
> > > 	If a store propagates to some CPU before a load on that CPU
> > > 	reads from the same location, then the load will obtain the
> > > 	value from that store or a co-later store.  This implies that
> > > 	if a load obtains a value co-earlier than some store then the
> > > 	load must have executed before the store propagated to the
> > > 	load's CPU.
> > > 
> > > The proof consists of three main stages, each requiring three steps.
> > > Using the facts that b - f are all read as 0, I'll show that P1
> > > executes Rc before P3 executes Re, then that P0 executes Rb before P4
> > > executes Rf, and lastly that P5's Ra must obtain 2, not 0.  This will
> > > demonstrate that the litmus test is not allowed.
> > > 
> > > 1.	Suppose that mb23 ends up coming po-later than Wd in P3.
> > > 	Then we would have:
> > > 
> > > 		Wd propagates to P2 < mb23 < mb2e < Rd,
> > > 
> > > 	and so Rd would obtain 2, not 0.  Hence mb23 must come
> > > 	po-before Wd (as shown in the listing):  mb23 < Wd.
> > > 
> > > 2.	Since mb23 therefore occurs po-before Re and instructions
> > > 	cannot be reordered across barriers,  mb23 < Re.
> > > 
> > > 3.	Since Rc obtains 0, we must have:
> > > 
> > > 		Rc < Wc propagates to P1 < mb2s < mb23 < Re.
> > > 
> > > 	Thus Rc < Re.
> > > 
> > > 4.	Suppose that mb14 ends up coming po-later than We in P4.
> > > 	Then we would have:
> > > 
> > > 		We propagates to P3 < mb14 < mb1e < Rc < Re,
> > > 
> > > 	and so Re would obtain 2, not 0.  Hence mb14 must come
> > > 	po-before We (as shown in the listing):  mb14 < We.
> > > 
> > > 5.	Since mb14 therefore occurs po-before Rf and instructions
> > > 	cannot be reordered across barriers,  mb14 < Rf.
> > > 
> > > 6.	Since Rb obtains 0, we must have:
> > > 
> > > 		Rb < Wb propagates to P0 < mb1s < mb14 < Rf.
> > > 
> > > 	Thus Rb < Rf.
> > > 
> > > 7.	Suppose that mb05 ends up coming po-later than Wf in P5.
> > > 	Then we would have:
> > > 
> > > 		Wf propagates to P4 < mb05 < mb0e < Rb < Rf,
> > > 
> > > 	and so Rf would obtain 2, not 0.  Hence mb05 must come
> > > 	po-before Wf (as shown in the listing):  mb05 < Wf.
> > > 
> > > 8.	Since mb05 therefore occurs po-before Ra and instructions
> > > 	cannot be reordered across barriers,  mb05 < Ra.
> > > 
> > > 9.	Now we have:
> > > 
> > > 		Wa propagates to P5 < mb0s < mb05 < Ra,
> > > 
> > > 	and so Ra must obtain 2, not 0.  QED.
> > 
> > Like this, then, with maximal reordering of P3-P5's reads?
> > 
> >          P0      P1      P2      P3      P4      P5
> >          Wa=2
> >          mb0s
> >                                                  [mb05]
> >          mb0e                                    Ra=0
> >          Rb=0    Wb=2
> >                  mb1s
> >                                          [mb14]
> >                  mb1e                    Rf=0
> >                  Rc=0    Wc=2                    Wf=2
> >                          mb2s
> >                                  [mb23]
> >                          mb2e    Re=0
> >                          Rd=0            We=2
> >                                  Wd=2
> 
> Yes, that's right.  This shows how P5's Ra must obtain 2 instead of 0.
> 
> > But don't the sys_membarrier() calls affect everyone, especially given
> > the shared-variable communication?
> 
> They do, but the other effects are irrelevant for this proof.

If I understand correctly, the shared-variable communication within
sys_membarrier() is included in your proof in the form of ordering
between memory barriers in the mainline sys_membarrier() code and
in the IPI handlers.

> >  If so, why wouldn't this more strict
> > variant hold?
> > 
> >          P0      P1      P2      P3      P4      P5
> >          Wa=2
> >          mb0s
> >                                  [mb05]  [mb05]  [mb05]
> 
> You have misunderstood the naming scheme.  mb05 is the barrier injected 
> by P0's sys_membarrier call into P5.  So the three barriers above 
> should be named "mb03", "mb04", and "mb05".  And you left out mb01 and 
> mb02.

The former is a copy-and-paste error on my part, the latter was
intentional because the IPIs among P0, P1, and P2 don't seem to
strengthen the ordering.

> >          mb0e
> >          Rb=0    Wb=2
> >                  mb1s
> >                                  [mb14]  [mb14]  [mb14]
> >                  mb1e
> >                  Rc=0    Wc=2
> >                          mb2s
> >                                  [mb23]  [mb23]  [mb23]
> >                          mb2e    Re=0    Rf=0    Ra=0
> >                          Rd=0            We=2    Wf=2
> >                                  Wd=2
> 
> Yes, this does hold.  But since it doesn't affect the end result, 
> there's no point in mentioning all those other barriers.
> 
> > In which case, wouldn't this cycle be forbidden even if it had only one
> > sys_membarrier() call?
> 
> No, it wouldn't.  I don't understand why you might think it would.  

Because I hadn't yet thought of the scenario I showed below.

> This is just like RCU, if you imagine a tiny critical section between 
> each adjacent pair of instructions.  You wouldn't expect RCU to enforce 
> ordering among six CPUs with only one synchronize_rcu call.

Yes, I do now agree in light of the scenario shown below.

> > Ah, but the IPIs are not necessarily synchronized across the CPUs,
> > so that the following could happen:
> > 
> >          P0      P1      P2      P3      P4      P5
> >          Wa=2
> >          mb0s
> >                                  [mb05]  [mb05]  [mb05]
> >          mb0e                                    Ra=0
> >          Rb=0    Wb=2
> >                  mb1s
> >                                  [mb14]  [mb14]
> >                                          Rf=0
> >                                                  Wf=2
> >                                                  [mb14]
> >                  mb1e
> >                  Rc=0    Wc=2
> >                          mb2s
> >                                  [mb23]
> >                                  Re=0
> >                                          We=2
> >                                          [mb23]  [mb23]
> >                          mb2e
> >                          Rd=0
> >                                  Wd=2
> 
> Yes it could.  But even in this execution you would end up with Ra=2 
> instead of Ra=0.

Agreed.  Or I should have said that the above execution is forbidden,
either way.

> > I guess in light of this post in 2001, I really don't have an excuse,
> > do I?  ;-)
> > 
> > 	https://lists.gt.net/linux/kernel/223555
> > 
> > Or am I still missing something here?
> 
> You tell me...

I think I am on board.  ;-)

							Thanx, Paul
Paul E. McKenney Dec. 12, 2018, 5:07 p.m. UTC | #6
On Tue, Dec 11, 2018 at 01:22:04PM -0800, Paul E. McKenney wrote:
> On Tue, Dec 11, 2018 at 03:09:33PM -0500, Alan Stern wrote:
> > On Tue, 11 Dec 2018, Paul E. McKenney wrote:
> > 
> > > > Rewriting the litmus test in these terms gives:
> > > > 
> > > >         P0      P1      P2      P3      P4      P5
> > > >         Wa=2    Wb=2    Wc=2    [mb23]  [mb14]  [mb05]
> > > >         mb0s    mb1s    mb2s    Wd=2    We=2    Wf=2
> > > >         mb0e    mb1e    mb2e    Re=0    Rf=0    Ra=0
> > > >         Rb=0    Rc=0    Rd=0
> > > > 
> > > > Here the brackets in "[mb23]", "[mb14]", and "[mb05]" mean that the
> > > > positions of these barriers in their respective threads' program
> > > > orderings is undetermined; they need not come at the top as shown.
> > > > 
> > > > (Also, in case David is unfamiliar with it, the "Wa=2" notation is
> > > > shorthand for "Write 2 to a" and "Rb=0" is short for "Read 0 from b".)
> > > > 
> > > > Finally, here are a few facts which may be well known and obvious, but
> > > > I'll state them anyway:
> > > > 
> > > > 	A CPU cannot reorder instructions across a memory barrier.
> > > > 	If x is po-after a barrier then x executes after the barrier
> > > > 	is finished.
> > > > 
> > > > 	If a store is po-before a barrier then the store propagates
> > > > 	to every CPU before the barrier finishes.
> > > > 
> > > > 	If a store propagates to some CPU before a load on that CPU
> > > > 	reads from the same location, then the load will obtain the
> > > > 	value from that store or a co-later store.  This implies that
> > > > 	if a load obtains a value co-earlier than some store then the
> > > > 	load must have executed before the store propagated to the
> > > > 	load's CPU.
> > > > 
> > > > The proof consists of three main stages, each requiring three steps.
> > > > Using the facts that b - f are all read as 0, I'll show that P1
> > > > executes Rc before P3 executes Re, then that P0 executes Rb before P4
> > > > executes Rf, and lastly that P5's Ra must obtain 2, not 0.  This will
> > > > demonstrate that the litmus test is not allowed.
> > > > 
> > > > 1.	Suppose that mb23 ends up coming po-later than Wd in P3.
> > > > 	Then we would have:
> > > > 
> > > > 		Wd propagates to P2 < mb23 < mb2e < Rd,
> > > > 
> > > > 	and so Rd would obtain 2, not 0.  Hence mb23 must come
> > > > 	po-before Wd (as shown in the listing):  mb23 < Wd.
> > > > 
> > > > 2.	Since mb23 therefore occurs po-before Re and instructions
> > > > 	cannot be reordered across barriers,  mb23 < Re.
> > > > 
> > > > 3.	Since Rc obtains 0, we must have:
> > > > 
> > > > 		Rc < Wc propagates to P1 < mb2s < mb23 < Re.
> > > > 
> > > > 	Thus Rc < Re.
> > > > 
> > > > 4.	Suppose that mb14 ends up coming po-later than We in P4.
> > > > 	Then we would have:
> > > > 
> > > > 		We propagates to P3 < mb14 < mb1e < Rc < Re,
> > > > 
> > > > 	and so Re would obtain 2, not 0.  Hence mb14 must come
> > > > 	po-before We (as shown in the listing):  mb14 < We.
> > > > 
> > > > 5.	Since mb14 therefore occurs po-before Rf and instructions
> > > > 	cannot be reordered across barriers,  mb14 < Rf.
> > > > 
> > > > 6.	Since Rb obtains 0, we must have:
> > > > 
> > > > 		Rb < Wb propagates to P0 < mb1s < mb14 < Rf.
> > > > 
> > > > 	Thus Rb < Rf.
> > > > 
> > > > 7.	Suppose that mb05 ends up coming po-later than Wf in P5.
> > > > 	Then we would have:
> > > > 
> > > > 		Wf propagates to P4 < mb05 < mb0e < Rb < Rf,
> > > > 
> > > > 	and so Rf would obtain 2, not 0.  Hence mb05 must come
> > > > 	po-before Wf (as shown in the listing):  mb05 < Wf.
> > > > 
> > > > 8.	Since mb05 therefore occurs po-before Ra and instructions
> > > > 	cannot be reordered across barriers,  mb05 < Ra.
> > > > 
> > > > 9.	Now we have:
> > > > 
> > > > 		Wa propagates to P5 < mb0s < mb05 < Ra,
> > > > 
> > > > 	and so Ra must obtain 2, not 0.  QED.
> > > 
> > > Like this, then, with maximal reordering of P3-P5's reads?
> > > 
> > >          P0      P1      P2      P3      P4      P5
> > >          Wa=2
> > >          mb0s
> > >                                                  [mb05]
> > >          mb0e                                    Ra=0
> > >          Rb=0    Wb=2
> > >                  mb1s
> > >                                          [mb14]
> > >                  mb1e                    Rf=0
> > >                  Rc=0    Wc=2                    Wf=2
> > >                          mb2s
> > >                                  [mb23]
> > >                          mb2e    Re=0
> > >                          Rd=0            We=2
> > >                                  Wd=2
> > 
> > Yes, that's right.  This shows how P5's Ra must obtain 2 instead of 0.
> > 
> > > But don't the sys_membarrier() calls affect everyone, especially given
> > > the shared-variable communication?
> > 
> > They do, but the other effects are irrelevant for this proof.
> 
> If I understand correctly, the shared-variable communication within
> sys_membarrier() is included in your proof in the form of ordering
> between memory barriers in the mainline sys_membarrier() code and
> in the IPI handlers.
> 
> > >  If so, why wouldn't this more strict
> > > variant hold?
> > > 
> > >          P0      P1      P2      P3      P4      P5
> > >          Wa=2
> > >          mb0s
> > >                                  [mb05]  [mb05]  [mb05]
> > 
> > You have misunderstood the naming scheme.  mb05 is the barrier injected 
> > by P0's sys_membarrier call into P5.  So the three barriers above 
> > should be named "mb03", "mb04", and "mb05".  And you left out mb01 and 
> > mb02.
> 
> The former is a copy-and-paste error on my part, the latter was
> intentional because the IPIs among P0, P1, and P2 don't seem to
> strengthen the ordering.
> 
> > >          mb0e
> > >          Rb=0    Wb=2
> > >                  mb1s
> > >                                  [mb14]  [mb14]  [mb14]
> > >                  mb1e
> > >                  Rc=0    Wc=2
> > >                          mb2s
> > >                                  [mb23]  [mb23]  [mb23]
> > >                          mb2e    Re=0    Rf=0    Ra=0
> > >                          Rd=0            We=2    Wf=2
> > >                                  Wd=2
> > 
> > Yes, this does hold.  But since it doesn't affect the end result, 
> > there's no point in mentioning all those other barriers.
> > 
> > > In which case, wouldn't this cycle be forbidden even if it had only one
> > > sys_membarrier() call?
> > 
> > No, it wouldn't.  I don't understand why you might think it would.  
> 
> Because I hadn't yet thought of the scenario I showed below.
> 
> > This is just like RCU, if you imagine a tiny critical section between 
> > each adjacent pair of instructions.  You wouldn't expect RCU to enforce 
> > ordering among six CPUs with only one synchronize_rcu call.
> 
> Yes, I do now agree in light of the scenario shown below.
> 
> > > Ah, but the IPIs are not necessarily synchronized across the CPUs,
> > > so that the following could happen:
> > > 
> > >          P0      P1      P2      P3      P4      P5
> > >          Wa=2
> > >          mb0s
> > >                                  [mb05]  [mb05]  [mb05]
> > >          mb0e                                    Ra=0
> > >          Rb=0    Wb=2
> > >                  mb1s
> > >                                  [mb14]  [mb14]
> > >                                          Rf=0
> > >                                                  Wf=2
> > >                                                  [mb14]
> > >                  mb1e
> > >                  Rc=0    Wc=2
> > >                          mb2s
> > >                                  [mb23]
> > >                                  Re=0
> > >                                          We=2
> > >                                          [mb23]  [mb23]
> > >                          mb2e
> > >                          Rd=0
> > >                                  Wd=2
> > 
> > Yes it could.  But even in this execution you would end up with Ra=2 
> > instead of Ra=0.
> 
> Agreed.  Or I should have said that the above execution is forbidden,
> either way.
> 
> > > I guess in light of this post in 2001, I really don't have an excuse,
> > > do I?  ;-)
> > > 
> > > 	https://lists.gt.net/linux/kernel/223555
> > > 
> > > Or am I still missing something here?
> > 
> > You tell me...
> 
> I think I am on board.  ;-)

And more to the point, here is a three-process variant showing a cycle
that is permitted:


         P0      P1      P2
         Wa=2    Wb=2    Wc=2
         mb0s
                 [mb01]  [mb02]
         mb0e
         Rb=0    Rc=0    Ra=0

As can be seen by reordering it as follows:

         P0      P1      P2
                         Ra=0
         Wa=2
         mb0s
                 [mb01]
                 Rc=0
                         Wc=2
                         [mb02]
         mb0e
         Rb=0
                 Wb=2

Make sense?

							Thanx, Paul
Alan Stern Dec. 12, 2018, 6:04 p.m. UTC | #7
On Wed, 12 Dec 2018, Paul E. McKenney wrote:

> > > > Or am I still missing something here?
> > > 
> > > You tell me...
> > 
> > I think I am on board.  ;-)
> 
> And more to the point, here is a three-process variant showing a cycle
> that is permitted:
> 
> 
>          P0      P1      P2
>          Wa=2    Wb=2    Wc=2
>          mb0s
>                  [mb01]  [mb02]
>          mb0e
>          Rb=0    Rc=0    Ra=0
> 
> As can be seen by reordering it as follows:
> 
>          P0      P1      P2
>                          Ra=0
>          Wa=2
>          mb0s
>                  [mb01]
>                  Rc=0
>                          Wc=2
>                          [mb02]
>          mb0e
>          Rb=0
>                  Wb=2
> 
> Make sense?

You got it!

Alan
Paul E. McKenney Dec. 12, 2018, 7:42 p.m. UTC | #8
On Wed, Dec 12, 2018 at 01:04:44PM -0500, Alan Stern wrote:
> On Wed, 12 Dec 2018, Paul E. McKenney wrote:
> 
> > > > > Or am I still missing something here?
> > > > 
> > > > You tell me...
> > > 
> > > I think I am on board.  ;-)
> > 
> > And more to the point, here is a three-process variant showing a cycle
> > that is permitted:
> > 
> > 
> >          P0      P1      P2
> >          Wa=2    Wb=2    Wc=2
> >          mb0s
> >                  [mb01]  [mb02]
> >          mb0e
> >          Rb=0    Rc=0    Ra=0
> > 
> > As can be seen by reordering it as follows:
> > 
> >          P0      P1      P2
> >                          Ra=0
> >          Wa=2
> >          mb0s
> >                  [mb01]
> >                  Rc=0
> >                          Wc=2
> >                          [mb02]
> >          mb0e
> >          Rb=0
> >                  Wb=2
> > 
> > Make sense?
> 
> You got it!

OK.  How about this one?

         P0      P1                 P2      P3
         Wa=2    rcu_read_lock()    Wc=2    Wd=2
         memb    Wb=2               Rd=0    synchronize_rcu();
         Rb=0    Rc=0                       Ra=0
	         rcu_read_unlock()

The model should say that it is allowed.  Taking a look...

         P0      P1                 P2      P3
				    Rd=0
					    Wd=2
					    synchronize_rcu();
	                                    Ra=0
	 Wa=2
	 membs
	         rcu_read_lock()
		 [m01]
		 Rc=0
		 		    Wc=2
				    [m02]   [m03]
	 membe
	 Rb=0
	         Wb=2
		 rcu_read_unlock()

Looks allowed to me.  If the synchronization of P1 and P2 were
interchanged, it should be forbidden:

         P0      P1      P2                 P3
         Wa=2    Wb=2    rcu_read_lock()    Wd=2
         memb    Rc=0    Wc=2               synchronize_rcu();
         Rb=0            Rd=0               Ra=0
                         rcu_read_unlock()

Taking a look...

         P0      P1      P2                 P3
                         rcu_read_lock()
                         Rd=0
         Wa=2    Wb=2                       Wd=2
         membs                              synchronize_rcu();
                 [m01]
                 Rc=0
                         Wc=2
                         rcu_read_unlock()
			 [m02]              Ra=0 [Forbidden?]
	 membe
         Rb=0

I believe that this ordering forbids the cycle:

	Wa=1 > membs -> [m01] -> Rc=0 -> Wc=2 -> rcu_read_unlock() ->
		return from synchronize_rcu() -> Ra

Does this make sense, or am I missing something?

							Thanx, Paul
Alan Stern Dec. 12, 2018, 9:32 p.m. UTC | #9
On Wed, 12 Dec 2018, Paul E. McKenney wrote:

> OK.  How about this one?
> 
>          P0      P1                 P2      P3
>          Wa=2    rcu_read_lock()    Wc=2    Wd=2
>          memb    Wb=2               Rd=0    synchronize_rcu();
>          Rb=0    Rc=0                       Ra=0
> 	         rcu_read_unlock()
> 
> The model should say that it is allowed.  Taking a look...
> 
>          P0      P1                 P2      P3
> 				    Rd=0
> 					    Wd=2
> 					    synchronize_rcu();
> 	                                    Ra=0
> 	 Wa=2
> 	 membs
> 	         rcu_read_lock()
> 		 [m01]
> 		 Rc=0
> 		 		    Wc=2
> 				    [m02]   [m03]
> 	 membe
> 	 Rb=0
> 	         Wb=2
> 		 rcu_read_unlock()
> 
> Looks allowed to me.  If the synchronization of P1 and P2 were
> interchanged, it should be forbidden:
> 
>          P0      P1      P2                 P3
>          Wa=2    Wb=2    rcu_read_lock()    Wd=2
>          memb    Rc=0    Wc=2               synchronize_rcu();
>          Rb=0            Rd=0               Ra=0
>                          rcu_read_unlock()
> 
> Taking a look...
> 
>          P0      P1      P2                 P3
>                          rcu_read_lock()
>                          Rd=0
>          Wa=2    Wb=2                       Wd=2
>          membs                              synchronize_rcu();
>                  [m01]
>                  Rc=0
>                          Wc=2
>                          rcu_read_unlock()
> 			 [m02]              Ra=0 [Forbidden?]
> 	 membe
>          Rb=0

Have you tried writing these as real litmus tests and running them 
through herd?

> I believe that this ordering forbids the cycle:
> 
> 	Wa=1 > membs -> [m01] -> Rc=0 -> Wc=2 -> rcu_read_unlock() ->
> 		return from synchronize_rcu() -> Ra
> 
> Does this make sense, or am I missing something?

It's hard to tell.  What you have written here isn't justified by the
litmus test source code, since the position of m01 in P1's program
order is undetermined.  How do you justify m01 -> Rc, for example?

Write it this way instead, using the relations defined in the 
sys_membarrier patch for linux-kernel.cat:

	memb ->memb-gp memb ->rcu-link Rc ->memb-rscsi Rc ->rcu-link
		
		rcu_read_unlock ->rcu-rscsi rcu_read_lock ->rcu-link 

		synchronize_rcu ->rcu-gp synchronize_rcu ->rcu-link memb

Recall that:

	memb-gp is the identity relation on sys_membarrier events,

	rcu-link includes (po? ; fre ; po),

	memb-rscsi is the identity relation on all events,

	rcu-rscsi links unlocks to their corresponding locks, and

	rcu-gp is the identity relation on synchronize_rcu events.

These facts justify the cycle above.

Leaving off the final rcu-link step, the sequence matches the
definition of rcu-fence (the relations are memb-gp, memb-rscsi, 
rcu-rscsi, rcu-gp with rcu-links in between).  Therefore the cycle is 
forbidden.

Alan
Paul E. McKenney Dec. 12, 2018, 9:52 p.m. UTC | #10
On Wed, Dec 12, 2018 at 04:32:50PM -0500, Alan Stern wrote:
> On Wed, 12 Dec 2018, Paul E. McKenney wrote:
> 
> > OK.  How about this one?
> > 
> >          P0      P1                 P2      P3
> >          Wa=2    rcu_read_lock()    Wc=2    Wd=2
> >          memb    Wb=2               Rd=0    synchronize_rcu();
> >          Rb=0    Rc=0                       Ra=0
> > 	         rcu_read_unlock()
> > 
> > The model should say that it is allowed.  Taking a look...
> > 
> >          P0      P1                 P2      P3
> > 				    Rd=0
> > 					    Wd=2
> > 					    synchronize_rcu();
> > 	                                    Ra=0
> > 	 Wa=2
> > 	 membs
> > 	         rcu_read_lock()
> > 		 [m01]
> > 		 Rc=0
> > 		 		    Wc=2
> > 				    [m02]   [m03]
> > 	 membe
> > 	 Rb=0
> > 	         Wb=2
> > 		 rcu_read_unlock()
> > 
> > Looks allowed to me.  If the synchronization of P1 and P2 were
> > interchanged, it should be forbidden:
> > 
> >          P0      P1      P2                 P3
> >          Wa=2    Wb=2    rcu_read_lock()    Wd=2
> >          memb    Rc=0    Wc=2               synchronize_rcu();
> >          Rb=0            Rd=0               Ra=0
> >                          rcu_read_unlock()
> > 
> > Taking a look...
> > 
> >          P0      P1      P2                 P3
> >                          rcu_read_lock()
> >                          Rd=0
> >          Wa=2    Wb=2                       Wd=2
> >          membs                              synchronize_rcu();
> >                  [m01]
> >                  Rc=0
> >                          Wc=2
> >                          rcu_read_unlock()
> > 			 [m02]              Ra=0 [Forbidden?]
> > 	 membe
> >          Rb=0

For one thing, Wb=2 needs to be down here, apologies!  Which then ...

> Have you tried writing these as real litmus tests and running them 
> through herd?

That comes later, but yes, I will do that.

> > I believe that this ordering forbids the cycle:
> > 
> > 	Wa=1 > membs -> [m01] -> Rc=0 -> Wc=2 -> rcu_read_unlock() ->
> > 		return from synchronize_rcu() -> Ra
> > 
> > Does this make sense, or am I missing something?
> 
> It's hard to tell.  What you have written here isn't justified by the
> litmus test source code, since the position of m01 in P1's program
> order is undetermined.  How do you justify m01 -> Rc, for example?

... justifies Rc=0 following [m01].

> Write it this way instead, using the relations defined in the 
> sys_membarrier patch for linux-kernel.cat:
> 
> 	memb ->memb-gp memb ->rcu-link Rc ->memb-rscsi Rc ->rcu-link
> 		
> 		rcu_read_unlock ->rcu-rscsi rcu_read_lock ->rcu-link 
> 
> 		synchronize_rcu ->rcu-gp synchronize_rcu ->rcu-link memb
> 
> Recall that:
> 
> 	memb-gp is the identity relation on sys_membarrier events,
> 
> 	rcu-link includes (po? ; fre ; po),
> 
> 	memb-rscsi is the identity relation on all events,
> 
> 	rcu-rscsi links unlocks to their corresponding locks, and
> 
> 	rcu-gp is the identity relation on synchronize_rcu events.
> 
> These facts justify the cycle above.
> 
> Leaving off the final rcu-link step, the sequence matches the
> definition of rcu-fence (the relations are memb-gp, memb-rscsi, 
> rcu-rscsi, rcu-gp with rcu-links in between).  Therefore the cycle is 
> forbidden.

Understood, but that would be using the model to check the model.  ;-)

							Thanx, Paul
Alan Stern Dec. 12, 2018, 10:12 p.m. UTC | #11
On Wed, 12 Dec 2018, Paul E. McKenney wrote:

> > > I believe that this ordering forbids the cycle:
> > > 
> > > 	Wa=1 > membs -> [m01] -> Rc=0 -> Wc=2 -> rcu_read_unlock() ->
> > > 		return from synchronize_rcu() -> Ra
> > > 
> > > Does this make sense, or am I missing something?
> > 
> > It's hard to tell.  What you have written here isn't justified by the
> > litmus test source code, since the position of m01 in P1's program
> > order is undetermined.  How do you justify m01 -> Rc, for example?
> 
> ... justifies Rc=0 following [m01].
> 
> > Write it this way instead, using the relations defined in the 
> > sys_membarrier patch for linux-kernel.cat:
> > 
> > 	memb ->memb-gp memb ->rcu-link Rc ->memb-rscsi Rc ->rcu-link
> > 		
> > 		rcu_read_unlock ->rcu-rscsi rcu_read_lock ->rcu-link 
> > 
> > 		synchronize_rcu ->rcu-gp synchronize_rcu ->rcu-link memb
> > 
> > Recall that:
> > 
> > 	memb-gp is the identity relation on sys_membarrier events,
> > 
> > 	rcu-link includes (po? ; fre ; po),
> > 
> > 	memb-rscsi is the identity relation on all events,
> > 
> > 	rcu-rscsi links unlocks to their corresponding locks, and
> > 
> > 	rcu-gp is the identity relation on synchronize_rcu events.
> > 
> > These facts justify the cycle above.
> > 
> > Leaving off the final rcu-link step, the sequence matches the
> > definition of rcu-fence (the relations are memb-gp, memb-rscsi, 
> > rcu-rscsi, rcu-gp with rcu-links in between).  Therefore the cycle is 
> > forbidden.
> 
> Understood, but that would be using the model to check the model.  ;-)

Well, what are you trying to accomplish?  Do you want to find an 
argument similar to the one I posted for the 6-CPU test to show that 
this test should be forbidden?

Alan
Paul E. McKenney Dec. 12, 2018, 10:19 p.m. UTC | #12
On Wed, Dec 12, 2018 at 01:52:45PM -0800, Paul E. McKenney wrote:
> On Wed, Dec 12, 2018 at 04:32:50PM -0500, Alan Stern wrote:
> > On Wed, 12 Dec 2018, Paul E. McKenney wrote:
> > 
> > > OK.  How about this one?
> > > 
> > >          P0      P1                 P2      P3
> > >          Wa=2    rcu_read_lock()    Wc=2    Wd=2
> > >          memb    Wb=2               Rd=0    synchronize_rcu();
> > >          Rb=0    Rc=0                       Ra=0
> > > 	         rcu_read_unlock()
> > > 
> > > The model should say that it is allowed.  Taking a look...
> > > 
> > >          P0      P1                 P2      P3
> > > 				    Rd=0
> > > 					    Wd=2
> > > 					    synchronize_rcu();
> > > 	                                    Ra=0
> > > 	 Wa=2
> > > 	 membs
> > > 	         rcu_read_lock()
> > > 		 [m01]
> > > 		 Rc=0
> > > 		 		    Wc=2
> > > 				    [m02]   [m03]
> > > 	 membe
> > > 	 Rb=0
> > > 	         Wb=2
> > > 		 rcu_read_unlock()
> > > 
> > > Looks allowed to me.  If the synchronization of P1 and P2 were
> > > interchanged, it should be forbidden:
> > > 
> > >          P0      P1      P2                 P3
> > >          Wa=2    Wb=2    rcu_read_lock()    Wd=2
> > >          memb    Rc=0    Wc=2               synchronize_rcu();
> > >          Rb=0            Rd=0               Ra=0
> > >                          rcu_read_unlock()
> > > 
> > > Taking a look...
> > > 
> > >          P0      P1      P2                 P3
> > >                          rcu_read_lock()
> > >                          Rd=0
> > >          Wa=2    Wb=2                       Wd=2
> > >          membs                              synchronize_rcu();
> > >                  [m01]
> > >                  Rc=0
> > >                          Wc=2
> > >                          rcu_read_unlock()
> > > 			 [m02]              Ra=0 [Forbidden?]
> > > 	 membe
> > >          Rb=0
> 
> For one thing, Wb=2 needs to be down here, apologies!  Which then ...
> 
> > Have you tried writing these as real litmus tests and running them 
> > through herd?
> 
> That comes later, but yes, I will do that.
> 
> > > I believe that this ordering forbids the cycle:
> > > 
> > > 	Wa=1 > membs -> [m01] -> Rc=0 -> Wc=2 -> rcu_read_unlock() ->
> > > 		return from synchronize_rcu() -> Ra
> > > 
> > > Does this make sense, or am I missing something?
> > 
> > It's hard to tell.  What you have written here isn't justified by the
> > litmus test source code, since the position of m01 in P1's program
> > order is undetermined.  How do you justify m01 -> Rc, for example?
> 
> ... justifies Rc=0 following [m01].
> 
> > Write it this way instead, using the relations defined in the 
> > sys_membarrier patch for linux-kernel.cat:
> > 
> > 	memb ->memb-gp memb ->rcu-link Rc ->memb-rscsi Rc ->rcu-link
> > 		
> > 		rcu_read_unlock ->rcu-rscsi rcu_read_lock ->rcu-link 
> > 
> > 		synchronize_rcu ->rcu-gp synchronize_rcu ->rcu-link memb
> > 
> > Recall that:
> > 
> > 	memb-gp is the identity relation on sys_membarrier events,
> > 
> > 	rcu-link includes (po? ; fre ; po),
> > 
> > 	memb-rscsi is the identity relation on all events,
> > 
> > 	rcu-rscsi links unlocks to their corresponding locks, and
> > 
> > 	rcu-gp is the identity relation on synchronize_rcu events.
> > 
> > These facts justify the cycle above.
> > 
> > Leaving off the final rcu-link step, the sequence matches the
> > definition of rcu-fence (the relations are memb-gp, memb-rscsi, 
> > rcu-rscsi, rcu-gp with rcu-links in between).  Therefore the cycle is 
> > forbidden.
> 
> Understood, but that would be using the model to check the model.  ;-)

And here are the litmus tests in the same order as above.  They do give
the results we both called out above, which is encouraging.

							Thanx, Paul

------------------------------------------------------------------------

C C-memb-RCU-1
(*
 * Result: Sometimes
 *)

{
}


P0(int *x0, int *x1)
{
	WRITE_ONCE(*x0, 1);
	smp_memb();
	r1 = READ_ONCE(*x1);
}

P1(int *x1, int *x2)
{
	rcu_read_lock();
	WRITE_ONCE(*x1, 1);
	r1 = READ_ONCE(*x2);
	rcu_read_unlock();
}

P2(int *x2, int *x3)
{
	WRITE_ONCE(*x2, 1);
	r1 = READ_ONCE(*x3);
}

P3(int *x3, int *x0)
{
	WRITE_ONCE(*x3, 1);
	synchronize_rcu();
	r1 = READ_ONCE(*x0);
}

exists (0:r1=0 /\ 1:r1=0 /\ 2:r1=0 /\ 3:r1=0)

------------------------------------------------------------------------

C C-memb-RCU-1
(*
 * Result: Never
 *)

{
}


P0(int *x0, int *x1)
{
	WRITE_ONCE(*x0, 1);
	smp_memb();
	r1 = READ_ONCE(*x1);
}

P1(int *x1, int *x2)
{
	WRITE_ONCE(*x1, 1);
	r1 = READ_ONCE(*x2);
}

P2(int *x2, int *x3)
{
	rcu_read_lock();
	WRITE_ONCE(*x2, 1);
	r1 = READ_ONCE(*x3);
	rcu_read_unlock();
}

P3(int *x3, int *x0)
{
	WRITE_ONCE(*x3, 1);
	synchronize_rcu();
	r1 = READ_ONCE(*x0);
}

exists (0:r1=0 /\ 1:r1=0 /\ 2:r1=0 /\ 3:r1=0)
Paul E. McKenney Dec. 12, 2018, 10:49 p.m. UTC | #13
On Wed, Dec 12, 2018 at 05:12:18PM -0500, Alan Stern wrote:
> On Wed, 12 Dec 2018, Paul E. McKenney wrote:
> 
> > > > I believe that this ordering forbids the cycle:
> > > > 
> > > > 	Wa=1 > membs -> [m01] -> Rc=0 -> Wc=2 -> rcu_read_unlock() ->
> > > > 		return from synchronize_rcu() -> Ra
> > > > 
> > > > Does this make sense, or am I missing something?
> > > 
> > > It's hard to tell.  What you have written here isn't justified by the
> > > litmus test source code, since the position of m01 in P1's program
> > > order is undetermined.  How do you justify m01 -> Rc, for example?
> > 
> > ... justifies Rc=0 following [m01].
> > 
> > > Write it this way instead, using the relations defined in the 
> > > sys_membarrier patch for linux-kernel.cat:
> > > 
> > > 	memb ->memb-gp memb ->rcu-link Rc ->memb-rscsi Rc ->rcu-link
> > > 		
> > > 		rcu_read_unlock ->rcu-rscsi rcu_read_lock ->rcu-link 
> > > 
> > > 		synchronize_rcu ->rcu-gp synchronize_rcu ->rcu-link memb
> > > 
> > > Recall that:
> > > 
> > > 	memb-gp is the identity relation on sys_membarrier events,
> > > 
> > > 	rcu-link includes (po? ; fre ; po),
> > > 
> > > 	memb-rscsi is the identity relation on all events,
> > > 
> > > 	rcu-rscsi links unlocks to their corresponding locks, and
> > > 
> > > 	rcu-gp is the identity relation on synchronize_rcu events.
> > > 
> > > These facts justify the cycle above.
> > > 
> > > Leaving off the final rcu-link step, the sequence matches the
> > > definition of rcu-fence (the relations are memb-gp, memb-rscsi, 
> > > rcu-rscsi, rcu-gp with rcu-links in between).  Therefore the cycle is 
> > > forbidden.
> > 
> > Understood, but that would be using the model to check the model.  ;-)
> 
> Well, what are you trying to accomplish?  Do you want to find an 
> argument similar to the one I posted for the 6-CPU test to show that 
> this test should be forbidden?

I am trying to check odd corner cases.  Your sys_membarrier() model
is quite nice and certainly fits nicely with the rest of the model,
but where I come from, that is actually reason for suspicion.  ;-)

All kidding aside, your argument for the 6-CPU test was extremely
valuable, as it showed me a way to think of that test from an
implementation viewpoint.  Then the question is whether or not that
viewpoint actually matches the model, which seems to be the case thus far.

A good next step would be to automatically generate random tests along
with an automatically generated prediction, like I did for RCU a few
years back.  I should be able to generalize my time-based cheat for RCU to
also cover SRCU, though sys_membarrier() will require a bit more thought.
(The time-based cheat was to have fixed duration RCU grace periods and
RCU read-side critical sections, with the grace period duration being
slightly longer than that of the critical sections.  The number of
processes is of course limited by the chosen durations, but that limit
can easily be made insanely large.)

I guess that I still haven't gotten over being a bit surprised that the
RCU counting rule also applies to sys_membarrier().  ;-)

							Thanx, Paul
Alan Stern Dec. 13, 2018, 3:49 p.m. UTC | #14
On Wed, 12 Dec 2018, Paul E. McKenney wrote:

> > Well, what are you trying to accomplish?  Do you want to find an 
> > argument similar to the one I posted for the 6-CPU test to show that 
> > this test should be forbidden?
> 
> I am trying to check odd corner cases.  Your sys_membarrier() model
> is quite nice and certainly fits nicely with the rest of the model,
> but where I come from, that is actually reason for suspicion.  ;-)
> 
> All kidding aside, your argument for the 6-CPU test was extremely
> valuable, as it showed me a way to think of that test from an
> implementation viewpoint.  Then the question is whether or not that
> viewpoint actually matches the model, which seems to be the case thus far.

It should, since I formulated the reasoning behind that viewpoint 
directly from the model.  The basic idea is this:

	By induction, show that whenever we have A ->rcu-fence B then
	anything po-before A executes before anything po-after B, and
	furthermore, any write which propagates to A's CPU before A
	executes will propagate to every CPU before B finishes (i.e.,
	before anything po-after B executes).

	Using this, show that whenever X ->rb Y holds then X must
	execute before Y.

That's what the 6-CPU argument did.  In that litmus test we have
mb2 ->rcu-fence mb23, Rc ->rb Re, mb1 ->rcu-fence mb14, Rb ->rb Rf,
mb0 ->rcu-fence mb05, and lastly Ra ->rb Ra.  The last one is what 
shows that the test is forbidden.

> A good next step would be to automatically generate random tests along
> with an automatically generated prediction, like I did for RCU a few
> years back.  I should be able to generalize my time-based cheat for RCU to
> also cover SRCU, though sys_membarrier() will require a bit more thought.
> (The time-based cheat was to have fixed duration RCU grace periods and
> RCU read-side critical sections, with the grace period duration being
> slightly longer than that of the critical sections.  The number of
> processes is of course limited by the chosen durations, but that limit
> can easily be made insanely large.)

Imagine that each sys_membarrier call takes a fixed duration and each 
other instruction takes slightly less (the idea being that each 
instruction is a critical section).  Instructions can be reordered 
(although not across a sys_membarrier call), but no matter how the 
reordering is done, the result is disallowed.

> I guess that I still haven't gotten over being a bit surprised that the
> RCU counting rule also applies to sys_membarrier().  ;-)

Why not?  They are both synchronization mechanisms with heavy-weight
write sides and light-weight read sides, and most importantly, they
provide the same Guarantee.

Alan
Paul E. McKenney Dec. 14, 2018, 12:20 a.m. UTC | #15
On Thu, Dec 13, 2018 at 10:49:49AM -0500, Alan Stern wrote:
> On Wed, 12 Dec 2018, Paul E. McKenney wrote:
> 
> > > Well, what are you trying to accomplish?  Do you want to find an 
> > > argument similar to the one I posted for the 6-CPU test to show that 
> > > this test should be forbidden?
> > 
> > I am trying to check odd corner cases.  Your sys_membarrier() model
> > is quite nice and certainly fits nicely with the rest of the model,
> > but where I come from, that is actually reason for suspicion.  ;-)
> > 
> > All kidding aside, your argument for the 6-CPU test was extremely
> > valuable, as it showed me a way to think of that test from an
> > implementation viewpoint.  Then the question is whether or not that
> > viewpoint actually matches the model, which seems to be the case thus far.
> 
> It should, since I formulated the reasoning behind that viewpoint 
> directly from the model.  The basic idea is this:
> 
> 	By induction, show that whenever we have A ->rcu-fence B then
> 	anything po-before A executes before anything po-after B, and
> 	furthermore, any write which propagates to A's CPU before A
> 	executes will propagate to every CPU before B finishes (i.e.,
> 	before anything po-after B executes).
> 
> 	Using this, show that whenever X ->rb Y holds then X must
> 	execute before Y.
> 
> That's what the 6-CPU argument did.  In that litmus test we have
> mb2 ->rcu-fence mb23, Rc ->rb Re, mb1 ->rcu-fence mb14, Rb ->rb Rf,
> mb0 ->rcu-fence mb05, and lastly Ra ->rb Ra.  The last one is what 
> shows that the test is forbidden.

I really am not trying to be difficult.  Well, no more difficult than
I normally am, anyway.  Which admittedly isn't saying much.  ;-)

> > A good next step would be to automatically generate random tests along
> > with an automatically generated prediction, like I did for RCU a few
> > years back.  I should be able to generalize my time-based cheat for RCU to
> > also cover SRCU, though sys_membarrier() will require a bit more thought.
> > (The time-based cheat was to have fixed duration RCU grace periods and
> > RCU read-side critical sections, with the grace period duration being
> > slightly longer than that of the critical sections.  The number of
> > processes is of course limited by the chosen durations, but that limit
> > can easily be made insanely large.)
> 
> Imagine that each sys_membarrier call takes a fixed duration and each 
> other instruction takes slightly less (the idea being that each 
> instruction is a critical section).  Instructions can be reordered 
> (although not across a sys_membarrier call), but no matter how the 
> reordering is done, the result is disallowed.

It gets a bit trickier with interleavings of different combinations
of RCU, SRCU, and sys_membarrier().  Yes, your cat code very elegantly
sorts this out, but my goal is to be able to explain a given example
to someone.

> > I guess that I still haven't gotten over being a bit surprised that the
> > RCU counting rule also applies to sys_membarrier().  ;-)
> 
> Why not?  They are both synchronization mechanisms with heavy-weight
> write sides and light-weight read sides, and most importantly, they
> provide the same Guarantee.

True, but I do feel the need to poke at it.

The zero-size sys_membarrier() read-side critical sections do make
things act a bit differently, for example, interchanging the accesses
in an RCU read-side critical section has no effect, while doing so in
a sys_membarrier() reader can cause the result to be allowed.  One key
point is that everything before the end of a read-side critical section
of any type is ordered before any later grace period of that same type,
and vice versa.

This is why reordering accesses matters for sys_membarrier() readers but
not for RCU and SRCU readers -- in the case of RCU and SRCU readers,
the accesses are inside the read-side critical section, while for
sys_membarrier() readers, the read-side critical sections don't have
an inside.  So yes, ordering also matters in the case of SRCU and
RCU readers for accesses outside of the read-side critical sections.
The reason sys_membarrier() seems surprising to me isn't because it is
any different in theoretical structure, but rather because the practice
is to put RCU and SRCU read-side accesses inside a read-side critical
sections, which is impossible for sys_membarrier().

The other thing that took some time to get used to is the possibility
of long delays during sys_membarrier() execution, allowing significant
execution and reordering between different CPUs' IPIs.  This was key
to my understanding of the six-process example, and probably needs to
be clearly called out, including in an example or two.

The interleaving restrictions are straightforward for me, but the
fixed-time approach does have some interesting cross-talk potential
between sys_membarrier() and RCU read-side critical sections whose
accesses have been reversed.  I don't believe that it is possible to
leverage this "order the other guy's read-side critical sections" effect
in the general case, but I could be missing something.

If you are claiming that I am worrying unnecessarily, you are probably
right.  But if I didn't worry unnecessarily, RCU wouldn't work at all!  ;-)

							Thanx, Paul
Alan Stern Dec. 14, 2018, 2:26 a.m. UTC | #16
On Thu, 13 Dec 2018, Paul E. McKenney wrote:

> > > A good next step would be to automatically generate random tests along
> > > with an automatically generated prediction, like I did for RCU a few
> > > years back.  I should be able to generalize my time-based cheat for RCU to
> > > also cover SRCU, though sys_membarrier() will require a bit more thought.
> > > (The time-based cheat was to have fixed duration RCU grace periods and
> > > RCU read-side critical sections, with the grace period duration being
> > > slightly longer than that of the critical sections.  The number of
> > > processes is of course limited by the chosen durations, but that limit
> > > can easily be made insanely large.)
> > 
> > Imagine that each sys_membarrier call takes a fixed duration and each 
> > other instruction takes slightly less (the idea being that each 
> > instruction is a critical section).  Instructions can be reordered 
> > (although not across a sys_membarrier call), but no matter how the 
> > reordering is done, the result is disallowed.

This turns out not to be right.  Instead, imagine that each 
sys_membarrier call takes a fixed duration, T.  Other instructions can 
take arbitrary amounts of time and can be reordered abitrarily, with 
two restrictions:

	Instructions cannot be reordered past a sys_membarrier call;

	If instructions A and B are reordered then the time duration
	from B to A must be less than T.

If you prefer, you can replace the second restriction with something a 
little more liberal:

	If A and B are reordered and A ends up executing after a 
	sys_membarrier call (on any CPU) then B cannot execute before 
	that sys_membarrier call.

Of course, this form is a consequence of the more restrictive form.

> It gets a bit trickier with interleavings of different combinations
> of RCU, SRCU, and sys_membarrier().  Yes, your cat code very elegantly
> sorts this out, but my goal is to be able to explain a given example
> to someone.

I don't think you're going to be able to fit different combinations of
RCU, SRCU, and sys_membarrier into this picture.  How would you allow
tests with incorrect interleaving, such as GP - memb - RSCS - nothing,
while forbidding similar tests with correct interleaving?

Alan
Paul E. McKenney Dec. 14, 2018, 5:20 a.m. UTC | #17
On Thu, Dec 13, 2018 at 09:26:47PM -0500, Alan Stern wrote:
> On Thu, 13 Dec 2018, Paul E. McKenney wrote:
> 
> > > > A good next step would be to automatically generate random tests along
> > > > with an automatically generated prediction, like I did for RCU a few
> > > > years back.  I should be able to generalize my time-based cheat for RCU to
> > > > also cover SRCU, though sys_membarrier() will require a bit more thought.
> > > > (The time-based cheat was to have fixed duration RCU grace periods and
> > > > RCU read-side critical sections, with the grace period duration being
> > > > slightly longer than that of the critical sections.  The number of
> > > > processes is of course limited by the chosen durations, but that limit
> > > > can easily be made insanely large.)
> > > 
> > > Imagine that each sys_membarrier call takes a fixed duration and each 
> > > other instruction takes slightly less (the idea being that each 
> > > instruction is a critical section).  Instructions can be reordered 
> > > (although not across a sys_membarrier call), but no matter how the 
> > > reordering is done, the result is disallowed.
> 
> This turns out not to be right.  Instead, imagine that each 
> sys_membarrier call takes a fixed duration, T.  Other instructions can 
> take arbitrary amounts of time and can be reordered abitrarily, with 
> two restrictions:
> 
> 	Instructions cannot be reordered past a sys_membarrier call;
> 
> 	If instructions A and B are reordered then the time duration
> 	from B to A must be less than T.
> 
> If you prefer, you can replace the second restriction with something a 
> little more liberal:
> 
> 	If A and B are reordered and A ends up executing after a 
> 	sys_membarrier call (on any CPU) then B cannot execute before 
> 	that sys_membarrier call.
> 
> Of course, this form is a consequence of the more restrictive form.

Makes sense.  And the zero-size critical sections are why sys_membarrier()
cannot be directly used for classic deferred reclamation.

> > It gets a bit trickier with interleavings of different combinations
> > of RCU, SRCU, and sys_membarrier().  Yes, your cat code very elegantly
> > sorts this out, but my goal is to be able to explain a given example
> > to someone.
> 
> I don't think you're going to be able to fit different combinations of
> RCU, SRCU, and sys_membarrier into this picture.  How would you allow
> tests with incorrect interleaving, such as GP - memb - RSCS - nothing,
> while forbidding similar tests with correct interleaving?

Well, no, I cannot do a simple linear scan tracking time, which is what
the current scripts do.  I must instead find longest sequence with all
operations of the same type (RCU, SRCU, or memb) and work out their
worst-case timing.  If the overall effect of a given sequence is to
go backwards in time, the result is allowed.  Otherwise eliminate that
sequence from the cycle and repeat.  If everything is eliminated, the
cycle is forbidden.

Which can be thought of as an iterative process similar to something
called "rcu-fence", can't it?  ;-)

							Thanx, Paul
Alan Stern Dec. 14, 2018, 3:31 p.m. UTC | #18
On Thu, 13 Dec 2018, Paul E. McKenney wrote:

> > > I guess that I still haven't gotten over being a bit surprised that the
> > > RCU counting rule also applies to sys_membarrier().  ;-)
> > 
> > Why not?  They are both synchronization mechanisms with heavy-weight
> > write sides and light-weight read sides, and most importantly, they
> > provide the same Guarantee.
> 
> True, but I do feel the need to poke at it.
> 
> The zero-size sys_membarrier() read-side critical sections do make
> things act a bit differently, for example, interchanging the accesses
> in an RCU read-side critical section has no effect, while doing so in
> a sys_membarrier() reader can cause the result to be allowed.  One key
> point is that everything before the end of a read-side critical section
> of any type is ordered before any later grace period of that same type,
> and vice versa.
> 
> This is why reordering accesses matters for sys_membarrier() readers but
> not for RCU and SRCU readers -- in the case of RCU and SRCU readers,
> the accesses are inside the read-side critical section, while for
> sys_membarrier() readers, the read-side critical sections don't have
> an inside.  So yes, ordering also matters in the case of SRCU and
> RCU readers for accesses outside of the read-side critical sections.
> The reason sys_membarrier() seems surprising to me isn't because it is
> any different in theoretical structure, but rather because the practice
> is to put RCU and SRCU read-side accesses inside a read-side critical
> sections, which is impossible for sys_membarrier().

RCU and sys_membarrier are more similar than you might think at first.  
For one thing, if there were primitives for blocking and unblocking
reception of IPIs, those primitives would delimit critical sections for
sys_membarrier.  (Maybe such things do exist; I wouldn't know.)

For another, the way we model RCU isn't fully accurate for the Linux
kernel, as you know.  Since individual instructions cannot be
preempted, each instruction is a tiny read-side critical section.
Thus, litmus tests like this one:

	P0			P1
	Wa=1			Wb=1
	synchronize_rcu()	Ra=0
	Rb=0

actually are forbidden in the kernel (provided P1 isn't part of the
idle loop!), even though the LKMM allows them.  However, it wouldn't
be forbidden if the accesses in P1 were swapped -- just like with
sys_membarrier.

Put these two observations together and you see that sys_membarrier is
almost exactly the same as RCU without explicit read-side critical
sections. Perhaps this isn't surprising, given that the initial
implementation of sys_membarrier() was pretty much the same as
synchronize_rcu().

> The other thing that took some time to get used to is the possibility
> of long delays during sys_membarrier() execution, allowing significant
> execution and reordering between different CPUs' IPIs.  This was key
> to my understanding of the six-process example, and probably needs to
> be clearly called out, including in an example or two.

In all the examples I'm aware of, no more than one of the IPIs
generated by each sys_membarrier call really matters.  (Of course,
there's no way to know in advance which one it will be, so you have to
send an IPI to every CPU.)  The execution delays and reordering
between different CPUs' IPIs don't appear to be significant.

> The interleaving restrictions are straightforward for me, but the
> fixed-time approach does have some interesting cross-talk potential
> between sys_membarrier() and RCU read-side critical sections whose
> accesses have been reversed.  I don't believe that it is possible to
> leverage this "order the other guy's read-side critical sections" effect
> in the general case, but I could be missing something.

I regard the fixed-time approach as nothing more than a heuristic
aid.  It's not an accurate explaination of what's really going on.

> If you are claiming that I am worrying unnecessarily, you are probably
> right.  But if I didn't worry unnecessarily, RCU wouldn't work at all!  ;-)

Alan
Paul E. McKenney Dec. 14, 2018, 6:43 p.m. UTC | #19
On Fri, Dec 14, 2018 at 10:31:51AM -0500, Alan Stern wrote:
> On Thu, 13 Dec 2018, Paul E. McKenney wrote:
> 
> > > > I guess that I still haven't gotten over being a bit surprised that the
> > > > RCU counting rule also applies to sys_membarrier().  ;-)
> > > 
> > > Why not?  They are both synchronization mechanisms with heavy-weight
> > > write sides and light-weight read sides, and most importantly, they
> > > provide the same Guarantee.
> > 
> > True, but I do feel the need to poke at it.
> > 
> > The zero-size sys_membarrier() read-side critical sections do make
> > things act a bit differently, for example, interchanging the accesses
> > in an RCU read-side critical section has no effect, while doing so in
> > a sys_membarrier() reader can cause the result to be allowed.  One key
> > point is that everything before the end of a read-side critical section
> > of any type is ordered before any later grace period of that same type,
> > and vice versa.
> > 
> > This is why reordering accesses matters for sys_membarrier() readers but
> > not for RCU and SRCU readers -- in the case of RCU and SRCU readers,
> > the accesses are inside the read-side critical section, while for
> > sys_membarrier() readers, the read-side critical sections don't have
> > an inside.  So yes, ordering also matters in the case of SRCU and
> > RCU readers for accesses outside of the read-side critical sections.
> > The reason sys_membarrier() seems surprising to me isn't because it is
> > any different in theoretical structure, but rather because the practice
> > is to put RCU and SRCU read-side accesses inside a read-side critical
> > sections, which is impossible for sys_membarrier().
> 
> RCU and sys_membarrier are more similar than you might think at first.  
> For one thing, if there were primitives for blocking and unblocking
> reception of IPIs, those primitives would delimit critical sections for
> sys_membarrier.  (Maybe such things do exist; I wouldn't know.)

Within the kernel of course local_irq_disable() and friends.  In
userspace, there have been proposals to make the IPI handler interact
with rseq or equivalent, which would have a roughly similar effect.

> For another, the way we model RCU isn't fully accurate for the Linux
> kernel, as you know.  Since individual instructions cannot be
> preempted, each instruction is a tiny read-side critical section.
> Thus, litmus tests like this one:
> 
> 	P0			P1
> 	Wa=1			Wb=1
> 	synchronize_rcu()	Ra=0
> 	Rb=0
> 
> actually are forbidden in the kernel (provided P1 isn't part of the
> idle loop!), even though the LKMM allows them.  However, it wouldn't
> be forbidden if the accesses in P1 were swapped -- just like with
> sys_membarrier.

And that P1 isn't executing on a CPU that RCU believes to be offline,
but yes.

But this is an implementation choice, and SRCU makes a different choice,
which would allow the litmus test shown above.  And it would be good to
keep this freedom for the implementation, in other words, this difference
is a good thing, so let's please keep it.  ;-)

> Put these two observations together and you see that sys_membarrier is
> almost exactly the same as RCU without explicit read-side critical
> sections. Perhaps this isn't surprising, given that the initial
> implementation of sys_membarrier() was pretty much the same as
> synchronize_rcu().

Heh!  The initial implementation in the Linux kernel was exactly
synchronize_sched().  ;-)

I would say that sys_membarrier() has zero-sized read-side critical
sections, either comprising a single instruction (as is the case for
synchronize_sched(), actually), preempt-disable regions of code
(which are irrelevant to userspace execution), or the spaces between
consecutive pairs of instructions (as is the case for the newer
IPI-based implementation).

The model picks the single-instruction option, and I haven't yet found
a problem with this -- which is no surprise given that, as you say,
an actual implementation makes this same choice.

> > The other thing that took some time to get used to is the possibility
> > of long delays during sys_membarrier() execution, allowing significant
> > execution and reordering between different CPUs' IPIs.  This was key
> > to my understanding of the six-process example, and probably needs to
> > be clearly called out, including in an example or two.
> 
> In all the examples I'm aware of, no more than one of the IPIs
> generated by each sys_membarrier call really matters.  (Of course,
> there's no way to know in advance which one it will be, so you have to
> send an IPI to every CPU.)  The execution delays and reordering
> between different CPUs' IPIs don't appear to be significant.

Well, there are litmus tests that are allowed in which the allowed
execution is more easily explained in terms of delays between different
CPUs' IPIs, so it seems worth keeping track of.

There might be a litmus test that can tell the difference between
simultaneous and non-simultaneous IPIs, but I cannot immediately think of
one that matters.  Might be a failure of imagination on my part, though.

> > The interleaving restrictions are straightforward for me, but the
> > fixed-time approach does have some interesting cross-talk potential
> > between sys_membarrier() and RCU read-side critical sections whose
> > accesses have been reversed.  I don't believe that it is possible to
> > leverage this "order the other guy's read-side critical sections" effect
> > in the general case, but I could be missing something.
> 
> I regard the fixed-time approach as nothing more than a heuristic
> aid.  It's not an accurate explaination of what's really going on.

Agreed, albeit a useful heuristic aid in scripts generating litmus tests.

							Thanx, Paul

> > If you are claiming that I am worrying unnecessarily, you are probably
> > right.  But if I didn't worry unnecessarily, RCU wouldn't work at all!  ;-)
> 
> Alan
>
Alan Stern Dec. 14, 2018, 9:39 p.m. UTC | #20
On Fri, 14 Dec 2018, Paul E. McKenney wrote:

> I would say that sys_membarrier() has zero-sized read-side critical
> sections, either comprising a single instruction (as is the case for
> synchronize_sched(), actually), preempt-disable regions of code
> (which are irrelevant to userspace execution), or the spaces between
> consecutive pairs of instructions (as is the case for the newer
> IPI-based implementation).
> 
> The model picks the single-instruction option, and I haven't yet found
> a problem with this -- which is no surprise given that, as you say,
> an actual implementation makes this same choice.

I believe that for RCU tests the LKMM gives the same results for
length-zero critical sections interspersed between all the instructions
and length-one critical sections surrounding all instructions (except
synchronize_rcu).  But the proof is tricky and I haven't checked it
carefully.

> > > The other thing that took some time to get used to is the possibility
> > > of long delays during sys_membarrier() execution, allowing significant
> > > execution and reordering between different CPUs' IPIs.  This was key
> > > to my understanding of the six-process example, and probably needs to
> > > be clearly called out, including in an example or two.
> > 
> > In all the examples I'm aware of, no more than one of the IPIs
> > generated by each sys_membarrier call really matters.  (Of course,
> > there's no way to know in advance which one it will be, so you have to
> > send an IPI to every CPU.)  The execution delays and reordering
> > between different CPUs' IPIs don't appear to be significant.
> 
> Well, there are litmus tests that are allowed in which the allowed
> execution is more easily explained in terms of delays between different
> CPUs' IPIs, so it seems worth keeping track of.
> 
> There might be a litmus test that can tell the difference between
> simultaneous and non-simultaneous IPIs, but I cannot immediately think of
> one that matters.  Might be a failure of imagination on my part, though.

	P0	P1	P2
	Wc=1	[mb01]	Rb=1
	memb	Wa=1	Rc=0
	Ra=0	Wb=1	[mb02]

The IPIs have to appear in the positions shown, which means they cannot
be simultaneous.  The test is allowed because P2's reads can be
reordered.

Alan
Paul E. McKenney Dec. 16, 2018, 6:51 p.m. UTC | #21
On Fri, Dec 14, 2018 at 04:39:34PM -0500, Alan Stern wrote:
> On Fri, 14 Dec 2018, Paul E. McKenney wrote:
> 
> > I would say that sys_membarrier() has zero-sized read-side critical
> > sections, either comprising a single instruction (as is the case for
> > synchronize_sched(), actually), preempt-disable regions of code
> > (which are irrelevant to userspace execution), or the spaces between
> > consecutive pairs of instructions (as is the case for the newer
> > IPI-based implementation).
> > 
> > The model picks the single-instruction option, and I haven't yet found
> > a problem with this -- which is no surprise given that, as you say,
> > an actual implementation makes this same choice.
> 
> I believe that for RCU tests the LKMM gives the same results for
> length-zero critical sections interspersed between all the instructions
> and length-one critical sections surrounding all instructions (except
> synchronize_rcu).  But the proof is tricky and I haven't checked it
> carefully.

That assertion is completely consistent with my implementation experience,
give or take the usual caveats about idle and offline execution.

> > > > The other thing that took some time to get used to is the possibility
> > > > of long delays during sys_membarrier() execution, allowing significant
> > > > execution and reordering between different CPUs' IPIs.  This was key
> > > > to my understanding of the six-process example, and probably needs to
> > > > be clearly called out, including in an example or two.
> > > 
> > > In all the examples I'm aware of, no more than one of the IPIs
> > > generated by each sys_membarrier call really matters.  (Of course,
> > > there's no way to know in advance which one it will be, so you have to
> > > send an IPI to every CPU.)  The execution delays and reordering
> > > between different CPUs' IPIs don't appear to be significant.
> > 
> > Well, there are litmus tests that are allowed in which the allowed
> > execution is more easily explained in terms of delays between different
> > CPUs' IPIs, so it seems worth keeping track of.
> > 
> > There might be a litmus test that can tell the difference between
> > simultaneous and non-simultaneous IPIs, but I cannot immediately think of
> > one that matters.  Might be a failure of imagination on my part, though.
> 
> 	P0	P1	P2
> 	Wc=1	[mb01]	Rb=1
> 	memb	Wa=1	Rc=0
> 	Ra=0	Wb=1	[mb02]
> 
> The IPIs have to appear in the positions shown, which means they cannot
> be simultaneous.  The test is allowed because P2's reads can be
> reordered.

OK, so "simultaneous" IPIs could be emulated in a real implementation by
having sys_membarrier() send each IPI (but not wait for a response), then
execute a full memory barrier and set a shared variable.  Each IPI handler
would spin waiting for the shared variable to be set, then execute a full
memory barrier and atomically increment yet another shared variable and
return from interrupt.  When that other shared variable's value reached
the number of IPIs sent, the sys_membarrier() would execute its final
(already existing) full memory barrier and return.  Horribly expensive
and definitely not recommended, but eminently doable.

The difference between current sys_membarrier() and the "simultaneous"
variant described above is similar to the difference between
non-multicopy-atomic and multicopy-atomic memory ordering.  So, after
thinking it through, my guess is that pretty much any litmus test that
can discern between multicopy-atomic and  non-multicopy-atomic should
be transformable into something that can distinguish between the current
and the "simultaneous" sys_membarrier() implementation.

Seem reasonable?

Or alternatively, may I please apply your Signed-off-by to your earlier
sys_membarrier() patch so that I can queue it?  I will probably also
change smp_memb() to membarrier() or some such.  Again, within the
Linux kernel, membarrier() can be emulated with smp_call_function()
invoking a handler that does smp_mb().

							Thanx, Paul
Alan Stern Dec. 17, 2018, 4:02 p.m. UTC | #22
On Sun, 16 Dec 2018, Paul E. McKenney wrote:

> OK, so "simultaneous" IPIs could be emulated in a real implementation by
> having sys_membarrier() send each IPI (but not wait for a response), then
> execute a full memory barrier and set a shared variable.  Each IPI handler
> would spin waiting for the shared variable to be set, then execute a full
> memory barrier and atomically increment yet another shared variable and
> return from interrupt.  When that other shared variable's value reached
> the number of IPIs sent, the sys_membarrier() would execute its final
> (already existing) full memory barrier and return.  Horribly expensive
> and definitely not recommended, but eminently doable.

I don't think that's right.  What would make the IPIs "simultaneous"  
would be if none of the handlers return until all of them have started
executing.  For example, you could have each handler increment a shared
variable and then spin, waiting for the variable to reach the number of
CPUs, before returning.

What you wrote was to have each handler wait until all the IPIs had 
been sent, which is not the same thing at all.

> The difference between current sys_membarrier() and the "simultaneous"
> variant described above is similar to the difference between
> non-multicopy-atomic and multicopy-atomic memory ordering.  So, after
> thinking it through, my guess is that pretty much any litmus test that
> can discern between multicopy-atomic and  non-multicopy-atomic should
> be transformable into something that can distinguish between the current
> and the "simultaneous" sys_membarrier() implementation.
> 
> Seem reasonable?

Yes.

> Or alternatively, may I please apply your Signed-off-by to your earlier
> sys_membarrier() patch so that I can queue it?  I will probably also
> change smp_memb() to membarrier() or some such.  Again, within the
> Linux kernel, membarrier() can be emulated with smp_call_function()
> invoking a handler that does smp_mb().

Do you really want to put sys_membarrier into the LKMM?  I'm not so 
sure it's appropriate.

Alan
Paul E. McKenney Dec. 17, 2018, 6:32 p.m. UTC | #23
On Mon, Dec 17, 2018 at 11:02:40AM -0500, Alan Stern wrote:
> On Sun, 16 Dec 2018, Paul E. McKenney wrote:
> 
> > OK, so "simultaneous" IPIs could be emulated in a real implementation by
> > having sys_membarrier() send each IPI (but not wait for a response), then
> > execute a full memory barrier and set a shared variable.  Each IPI handler
> > would spin waiting for the shared variable to be set, then execute a full
> > memory barrier and atomically increment yet another shared variable and
> > return from interrupt.  When that other shared variable's value reached
> > the number of IPIs sent, the sys_membarrier() would execute its final
> > (already existing) full memory barrier and return.  Horribly expensive
> > and definitely not recommended, but eminently doable.
> 
> I don't think that's right.  What would make the IPIs "simultaneous"  
> would be if none of the handlers return until all of them have started
> executing.  For example, you could have each handler increment a shared
> variable and then spin, waiting for the variable to reach the number of
> CPUs, before returning.
> 
> What you wrote was to have each handler wait until all the IPIs had 
> been sent, which is not the same thing at all.

You are right, the handlers need to do the atomic increment before
waiting for the shared variable to be set, and the sys_membarrier()
must wait for the incremented variable to reach its final value before
setting the shared variable.

> > The difference between current sys_membarrier() and the "simultaneous"
> > variant described above is similar to the difference between
> > non-multicopy-atomic and multicopy-atomic memory ordering.  So, after
> > thinking it through, my guess is that pretty much any litmus test that
> > can discern between multicopy-atomic and  non-multicopy-atomic should
> > be transformable into something that can distinguish between the current
> > and the "simultaneous" sys_membarrier() implementation.
> > 
> > Seem reasonable?
> 
> Yes.
> 
> > Or alternatively, may I please apply your Signed-off-by to your earlier
> > sys_membarrier() patch so that I can queue it?  I will probably also
> > change smp_memb() to membarrier() or some such.  Again, within the
> > Linux kernel, membarrier() can be emulated with smp_call_function()
> > invoking a handler that does smp_mb().
> 
> Do you really want to put sys_membarrier into the LKMM?  I'm not so 
> sure it's appropriate.

We do need it for the benefit of the C++ folks, but you are right that
it need not be accepted into the kernel to be useful to them.

So agreed, let's hold off for the time being.

							Thanx, Paul
diff mbox

Patch

Index: usb-4.x/tools/memory-model/linux-kernel.bell
===================================================================
--- usb-4.x.orig/tools/memory-model/linux-kernel.bell
+++ usb-4.x/tools/memory-model/linux-kernel.bell
@@ -24,6 +24,7 @@  instructions RMW[{'once,'acquire,'releas
 enum Barriers = 'wmb (*smp_wmb*) ||
 		'rmb (*smp_rmb*) ||
 		'mb (*smp_mb*) ||
+		'memb (*sys_membarrier*) ||
 		'rcu-lock (*rcu_read_lock*)  ||
 		'rcu-unlock (*rcu_read_unlock*) ||
 		'sync-rcu (*synchronize_rcu*) ||
Index: usb-4.x/tools/memory-model/linux-kernel.cat
===================================================================
--- usb-4.x.orig/tools/memory-model/linux-kernel.cat
+++ usb-4.x/tools/memory-model/linux-kernel.cat
@@ -33,7 +33,7 @@  let mb = ([M] ; fencerel(Mb) ; [M]) |
 	([M] ; po? ; [LKW] ; fencerel(After-spinlock) ; [M]) |
 	([M] ; po ; [UL] ; (co | po) ; [LKW] ;
 		fencerel(After-unlock-lock) ; [M])
-let gp = po ; [Sync-rcu | Sync-srcu] ; po?
+let gp = po ; [Sync-rcu | Sync-srcu | Memb] ; po?
 
 let strong-fence = mb | gp
 
@@ -102,8 +102,10 @@  acyclic pb as propagation
  *)
 let rcu-gp = [Sync-rcu]		(* Compare with gp *)
 let srcu-gp = [Sync-srcu]
+let memb-gp = [Memb]
 let rcu-rscsi = rcu-rscs^-1
 let srcu-rscsi = srcu-rscs^-1
+let memb-rscsi = id
 
 (*
  * The synchronize_rcu() strong fence is special in that it can order not
@@ -119,15 +121,19 @@  let rcu-link = po? ; hb* ; pb* ; prop ;
  * the synchronize_srcu() and srcu_read_[un]lock() calls refer to the same
  * struct srcu_struct location.
  *)
-let rec rcu-fence = rcu-gp | srcu-gp |
+let rec rcu-fence = rcu-gp | srcu-gp | memb-gp |
 	(rcu-gp ; rcu-link ; rcu-rscsi) |
 	((srcu-gp ; rcu-link ; srcu-rscsi) & loc) |
+	(memb-gp ; rcu-link ; memb-rscsi) |
 	(rcu-rscsi ; rcu-link ; rcu-gp) |
 	((srcu-rscsi ; rcu-link ; srcu-gp) & loc) |
+	(memb-rscsi ; rcu-link ; memb-gp) |
 	(rcu-gp ; rcu-link ; rcu-fence ; rcu-link ; rcu-rscsi) |
 	((srcu-gp ; rcu-link ; rcu-fence ; rcu-link ; srcu-rscsi) & loc) |
+	(memb-gp ; rcu-link ; rcu-fence ; rcu-link ; memb-rscsi) |
 	(rcu-rscsi ; rcu-link ; rcu-fence ; rcu-link ; rcu-gp) |
 	((srcu-rscsi ; rcu-link ; rcu-fence ; rcu-link ; srcu-gp) & loc) |
+	(memb-rscsi ; rcu-link ; rcu-fence ; rcu-link ; memb-gp) |
 	(rcu-fence ; rcu-link ; rcu-fence)
 
 (* rb orders instructions just as pb does *)
Index: usb-4.x/tools/memory-model/linux-kernel.def
===================================================================
--- usb-4.x.orig/tools/memory-model/linux-kernel.def
+++ usb-4.x/tools/memory-model/linux-kernel.def
@@ -20,6 +20,7 @@  smp_store_mb(X,V) { __store{once}(X,V);
 smp_mb() { __fence{mb}; }
 smp_rmb() { __fence{rmb}; }
 smp_wmb() { __fence{wmb}; }
+smp_memb() { __fence{memb}; }
 smp_mb__before_atomic() { __fence{before-atomic}; }
 smp_mb__after_atomic() { __fence{after-atomic}; }
 smp_mb__after_spinlock() { __fence{after-spinlock}; }