memcmp-sse4.S EqualHappy bug

Message ID 20150617172903.GC4317@redhat.com
State Dropped

Commit Message

Andrea Arcangeli June 17, 2015, 5:29 p.m. UTC
  Hello everyone,

Last week I ran into a problem caused by erratic behavior of
memcmp/bcmp: it returns 0 even though the two regions are never
equal at any given time, when the memcmp-sse4.S version is used
(the default on most recent Linux on x86-64 hardware with sse4).

My bug was that the zeropage sometimes got erratically mapped (a bug
in the userland testsuite that I hadn't fixed yet), so I added a
memcmp(page, zeropage, page_size) to the code to detect when it would
happen. Unfortunately this memcmp started to report all zero even
when the "page" was not a zeropage, and it kept reporting that even
after I actually fixed the bug in the testsuite.

The original testcase was here (not a guaranteed permalink, but it
should work for a long while):

https://git.kernel.org/cgit/linux/kernel/git/andrea/aa.git/tree/tools/testing/selftests/vm/userfaultfd.c?h=userfault21

I had a contended pthread_mutex_t at the start of the page given as a
parameter to memcmp (so that part was changing all the time),
immediately followed by an unsigned long long counter which was never
zero at any given time.

Now I have reduced the testcase to the minimum and appended it at the
end of this email.

By the C standard I assume this is not a bug, because the C language
assumes everything is single-threaded (and if I put a mutex lock
around the memcmp to prevent the memory from changing under it, it of
course works correctly). However, having memcmp return 0 when the two
areas can never be equal at any given time (no matter that part of
the compared memory is changing) looks risky. In my case I was
testing userfaultfd, so I first had to think of all sorts of TLB
flushes and race conditions in the kernel before I considered the
possibility of memcmp being "buggy" (not buggy by C standard terms,
but still...). I only started to suspect memcmp was failing after I
checked the counter by hand and found it non-zero right before
starting the memcmp.

The problem is that the unrolled loops using sse4 only do ptest, so
they can't return positive/negative values directly. When an unrolled
loop breaks out, it jumps to an offset that repeats the test by
re-reading the memory (but this time it may read an equal copy), and
it then returns 0 without comparing the rest of the data.
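
For illustration, here is a minimal C model of that two-phase
structure (a sketch only: chunk_equal() is a hypothetical stand-in
for the ptest step, and the real code works on larger chunks with
different tail handling):

#include <stddef.h>

/* hypothetical stand-in for ptest: only answers "equal or not" */
static int chunk_equal(const unsigned char *a, const unsigned char *b)
{
	size_t i;
	for (i = 0; i < 8; i++)
		if (a[i] != b[i])
			return 0;
	return 1;
}

/* phase 1 only learns *that* a chunk differed; phase 2 re-reads the
 * same chunk to compute the sign; if another thread rewrites the
 * chunk between the two reads, phase 2 can find it equal and return
 * 0 without ever looking at the rest of the buffer */
static int twophase_memcmp(const unsigned char *a, const unsigned char *b,
			   size_t n)
{
	size_t i, j;
	for (i = 0; i + 8 <= n; i += 8) {
		if (chunk_equal(a + i, b + i))
			continue;
		for (j = i; j < i + 8; j++)	/* re-read */
			if (a[j] != b[j])
				return a[j] - b[j];
		return 0;	/* the "EqualHappy" early return */
	}
	return 0;	/* tail handling omitted */
}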

I think it's possible to fix this without having to alter the fast
path of the sse4 unrolled loops. The rsi/rdi pointers of the two areas
being compared are already adjusted perfectly when jumping to the
right offset, to prepare for the non-sse4 final comparison.

In order to fix this problem it should be enough to adjust the rdx
length argument as well, in addition to the rsi/rdi pointers, and to
check whether rdx has gone down to zero when that last final
comparison came out equal. If rdx isn't at the end, it should be
enough to start memcmp again from scratch, so it will continue.
There's no need to extract the "different" value read into the two
xmm registers with pextrq; just letting it continue comparing, when
the final non-sse4 comparison found equal data, is enough. If the
data has changed under it, it doesn't matter what garbage was read in
the sse4 unrolled loop.

L(8bytes):
	mov	-8(%rdi), %rax
	mov	-8(%rsi), %rcx
	cmp	%rax, %rcx
	jne	L(diffin8bytes)
	^^^^^^^^ here check %rdx: if not at the end yet, restart
	instead of returning 0
	xor	%eax, %eax
	ret

There are other places like "8bytes" above to adjust.

The propagation of rdx alongside rdi/rsi should be something along
the lines of the (broken/incomplete) patch appended at the bottom of
this page.

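In terms of the C model sketched above (again just a sketch, not the
assembly), the idea is:

static int twophase_memcmp_fixed(const unsigned char *a,
				 const unsigned char *b, size_t n)
{
	size_t i, j;
	for (i = 0; i + 8 <= n; i += 8) {
		if (chunk_equal(a + i, b + i))
			continue;
		for (j = i; j < i + 8; j++)
			if (a[j] != b[j])
				return a[j] - b[j];
		/* equal on re-read, but length remains: the data
		 * changed under us, so restart on the remainder
		 * instead of returning 0 */
		if (i + 8 < n)
			return twophase_memcmp_fixed(a + i + 8, b + i + 8,
						     n - (i + 8));
		return 0;
	}
	return 0;	/* tail handling omitted */
}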
Before doing any more work on this I'd first like some confirmation
that it is a good idea to fix this (i.e. that the current unexpected
behavior is not intentional). And if yes, help would be appreciated.

Thanks,
Andrea

==
/*
 * EqualHappy memcmp/bcmp bug reproducer
 *
 *  Copyright (C) 2015  Red Hat, Inc.
 *
 *  This work is licensed under the terms of the GNU GPL, version 2. See
 *  the COPYING file in the top-level directory.
 *
 *  gcc -Wall -O2 -o equalhappybug equalhappybug.c -lpthread
 *  gcc -DWORKAROUND -Wall -O2 -o equalhappybug equalhappybug.c -lpthread
 */

#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>

static long nr_cpus, page_size;
#define NON_ZERO_OFFSET 16U
static char *page, *zeropage;
volatile int finished;

static int my_bcmp(char *str1, char *str2, unsigned long n)
{
	unsigned long i;
	for (i = 0; i < n; i++)
		if (str1[i] != str2[i])
			return 1;
	return 0;
}

static void *locking_thread(void *arg)
{
	char val;

	while (!finished) {
		val = page[NON_ZERO_OFFSET];
		if (val != 1)
			fprintf(stderr,
				"wrong value %d\n", val), exit(1);

#ifdef WORKAROUND
		if (!my_bcmp(page, zeropage, page_size))
			fprintf(stderr,
				"page all zero thread %p\n", page), exit(1);
#else
		unsigned long loops = 0;
		//while (!bcmp(page, zeropage, page_size))
		while (!memcmp(page, zeropage, page_size)) {
			loops += 1;
			asm volatile("" : : : "memory");
		}
		if (loops)
			fprintf(stderr,
				"page all zero thread %p %lu\n"
				"EqualHappy bug found\n",
				page, loops), exit(1);
#endif

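		/* flip the first byte between 0 and 1: while it is 0
		 * the leading NON_ZERO_OFFSET bytes match the
		 * zeropage, so a memcmp that re-reads them after its
		 * vector test can return 0 even though
		 * page[NON_ZERO_OFFSET] never matches */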
		*page = 0;
		asm volatile("" : : : "memory");
		*page = 1;
	}

	return NULL;
}

static int stress(void)
{
	unsigned long cpu;
	pthread_t locking_threads[nr_cpus];

	if (!my_bcmp(page, zeropage, page_size))
		fprintf(stderr,
			"page all zero thread %p\n", page), exit(1);

	printf("Test started\n");

	finished = 0;
	for (cpu = 0; cpu < nr_cpus; cpu++)
		if (pthread_create(&locking_threads[cpu], NULL,
				   locking_thread, NULL))
			return 1;

	sleep(10);

	finished = 1;
	for (cpu = 0; cpu < nr_cpus; cpu++)
		if (pthread_join(locking_threads[cpu], NULL))
			return 1;

	printf("ZeroHappy bug not found\n");
	return 0;
}

static int userfaultfd_stress(void)
{
	void *tmp_area;

	if (posix_memalign(&tmp_area, page_size, page_size)) {
		fprintf(stderr, "out of memory\n");
		return 1;
	}
	page = tmp_area;
	memset(page, 0xff, page_size);
	bzero(page, NON_ZERO_OFFSET);
	page[NON_ZERO_OFFSET] = 1;

	if (posix_memalign(&tmp_area, page_size, page_size)) {
		fprintf(stderr, "out of memory\n");
		return 1;
	}
	zeropage = tmp_area;
	bzero(zeropage, page_size);

	return stress();
}

int main(int argc, char **argv)
{
	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
	if (nr_cpus < 0)
		perror("_SC_NPROCESSORS_ONLN"), exit(1);
	page_size = sysconf(_SC_PAGE_SIZE);
	if (page_size < 0)
		perror("_SC_NPROCESSORS_ONLN"), exit(1);
	if (NON_ZERO_OFFSET >= page_size)
		fprintf(stderr, "Impossible to run this test\n"), exit(1);
	return userfaultfd_stress();
}
  

Comments

Szabolcs Nagy June 17, 2015, 6:59 p.m. UTC | #1
* Andrea Arcangeli <aarcange@redhat.com> [2015-06-17 19:29:03 +0200]:
> Last week I ran into a problem caused by erratic behavior of
> memcmp/bcmp: it returns 0 even though the two regions are never
> equal at any given time, when the memcmp-sse4.S version is used
> (the default on most recent Linux on x86-64 hardware with sse4).
> 
> [...]
> 
> By the C standard I assume this is not a bug, because the C language
> assumes everything is single-threaded (and if I put a mutex lock
> around the memcmp to prevent the memory from changing under it, it of
> course works correctly). However, having memcmp return 0 when the two
> areas can never be equal at any given time (no matter that part of
> the compared memory is changing) looks risky. [...]

c11 has threads and a memory model that makes concurrency issues
observable in standard c.

however you have a data race that is undefined behaviour:

objects passed to memcmp are not supposed to be modified concurrently
without synchronization.

> The problem is that the unrolled loops using sse4 only do ptest, so
> they can't return positive/negative values directly. When an unrolled
> loop breaks out, it jumps to an offset that repeats the test by
> re-reading the memory (but this time it may read an equal copy), and
> it then returns 0 without comparing the rest of the data.
> 

that is unfortunate but i think your test code should be fixed.

(to avoid the observed behaviour the libc would have to guarantee
atomic memcmp which is nontrivial to do)
  
Ondrej Bilka June 17, 2015, 8:19 p.m. UTC | #2
On Wed, Jun 17, 2015 at 08:59:52PM +0200, Szabolcs Nagy wrote:
> * Andrea Arcangeli <aarcange@redhat.com> [2015-06-17 19:29:03 +0200]:
> > [...]
> 
> c11 has threads and a memory model that makes concurrency issues
> observable in standard c.
> 
> however you have a data race that is undefined behaviour:
> 
> objects passed to memcmp are not supposed to be modified concurrently
> without synchronization.
> 
> > The problem is that the unrolled loops using sse4 only do ptest, so
> > they can't return positive/negative values directly. [...]
> 
> that is unfortunate but i think your test code should be fixed.
> 
> (to avoid the observed behaviour the libc would have to guarantee
> atomic memcmp which is nontrivial to do)

More precisely, it is impossible without a very big hammer. My most
effective solution would be a syscall that marks the affected pages
read-only, to suspend threads that try to write into them before
doing the comparison. You would need a simultaneous load of both
memory areas, which hardware doesn't support.

Without that there would still be a race; the simplest example would be

char x[4];
memcmp(x, x+1, 1)

where another thread just does the following:

*((int *)x) = 1;
	memcmp reads x[0] == 1
*((int *)x) = 256;
	memcmp reads x[1] == 1
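
A self-contained sketch of that interleaving (assuming a little-endian
target; the race is of course UB, this only illustrates the timing):

#include <pthread.h>
#include <stdio.h>
#include <string.h>

static union { int i; char c[4]; } u = { 1 };
static volatile int stop;

static void *writer(void *arg)
{
	(void)arg;
	while (!stop) {
		*(volatile int *)&u.i = 1;	/* c[0] == 1, c[1] == 0 */
		*(volatile int *)&u.i = 256;	/* c[0] == 0, c[1] == 1 */
	}
	return NULL;
}

int main(void)
{
	pthread_t t;
	unsigned long n;

	pthread_create(&t, NULL, writer, NULL);
	/* c[0] and c[1] are never equal at the same instant, yet the
	 * two byte reads of memcmp can straddle a store and see 1/1 */
	for (n = 0; n < 100000000UL; n++)
		if (!memcmp(u.c, u.c + 1, 1)) {
			printf("equal after %lu iterations\n", n);
			break;
		}
	stop = 1;
	pthread_join(t, NULL);
	return 0;
}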
  
Andrea Arcangeli June 17, 2015, 9:06 p.m. UTC | #3
On Wed, Jun 17, 2015 at 08:59:52PM +0200, Szabolcs Nagy wrote:
> c11 has threads and a memory model that makes concurrency issues
> observable in standard c.
> 
> however you have a data race that is undefined behaviour:
> 
> objects passed to memcmp are not supposed to be modified concurrently
> without synchronization.

All right, I expected this was not a bug by the standard.

In my initial test code I had a contended pthread mutex at the start
of the page that was passed to memcmp; I wasn't flipping bits myself,
but I guess it's still undefined behavior if it's
pthread_mutex_lock/unlock that changes the memory passed to memcmp.

> that is unfortunate but i think your test code should be fixed.

That's ok, I already fixed my test code after realizing the problem;
otherwise it wouldn't run correctly on most x86-64 out there.

> (to avoid the observed behaviour the libc would have to guarantee
> atomic memcmp which is nontrivial to do)

The end of the memory passed to memcmp never changes and is always
different. I didn't expect atomic behavior within the part of the
page that is being changed; full atomicity is not to be expected and
was never provided by any version.

If memcmp sometimes returns != 0 inside the non-atomic part of the
memory, that's fine. What was unexpected is that it sometimes returns
0 without checking the end of the memory, which is always different
and never changes.

I made a proposal that could fix the unexpected behavior without
requiring a fully atomic memcmp, but then it's up to you. My proposed
change wouldn't affect the sse4 fast paths, but it clearly would have
the overhead of propagating the "length" in rdx and checking it
before returning 0, perhaps a dozen more asm instructions per memcmp
on average (independent of the length).

If no change is made to memcmp/bcmp to avoid the unexpected behavior
in the undefined case, I would suggest documenting this detail in the
memcmp and bcmp man pages, as it could save some time, and it's
better if this new behavior gets advertised.

Thanks,
Andrea
  
Rich Felker June 18, 2015, 4:46 a.m. UTC | #4
On Wed, Jun 17, 2015 at 11:06:12PM +0200, Andrea Arcangeli wrote:
> [...]
> 
> The end of the memory passed to memcmp never changes and is always
> different. I didn't expect atomic behavior within the part of the
> page that is being changed; full atomicity is not to be expected and
> was never provided by any version.

This is why undefined behavior is program-global (and in both space
and time). As soon as you invoke undefined behavior anywhere (e.g.
via a data race), anything can happen, or can already have happened
in the past relative to the point of UB.

Rich
  
Torvald Riegel June 18, 2015, 9:50 a.m. UTC | #5
On Wed, 2015-06-17 at 23:06 +0200, Andrea Arcangeli wrote:
> > [...]
> 
> All right, I expected this was not a bug by the standard.
> 
> In my initial test code I had a contended pthread mutex at the start
> of the page that was passed to memcmp; I wasn't flipping bits myself,
> but I guess it's still undefined behavior if it's
> pthread_mutex_lock/unlock that changes the memory passed to memcmp.

Yes, it is.

You also need to consider what you are telling the compiler by using
non-atomic (or non-synchronizing) accesses on a piece of memory.  If
you do that, you promise that there's no concurrent modification,
which can allow the compiler to optimize based on prior knowledge
about the memory's content.  Allowing such optimizations is important
for generic code, for example (i.e., so that one can write code for
the general case and still expect the compiler to optimize for the
specific case).  (That may not matter in your specific case, but it's
often overlooked, and it is one reason why UB is "global", as Rich
points out.)

Nonetheless, at least the C++ committee is working on specifying a means
to access non-atomic-typed data atomically (but requiring all concurrent
accesses to also be atomic then (e.g., by using the same means)).
  
Andrea Arcangeli June 18, 2015, 12:49 p.m. UTC | #6
On Thu, Jun 18, 2015 at 11:50:40AM +0200, Torvald Riegel wrote:
> Nonetheless, at least the C++ committee is working on specifying a means
> to access non-atomic-typed data atomically (but requiring all concurrent
> accesses to also be atomic then (e.g., by using the same means)).

On a side note, in the kernel we already do this to access non-atomic
regions "safely" (or as safely as is feasible today) in C:

static __always_inline void __read_once_size(const volatile void *p, void *res, int size)
{
	switch (size) {
	case 1: *(__u8 *)res = *(volatile __u8 *)p; break;
	case 2: *(__u16 *)res = *(volatile __u16 *)p; break;
	case 4: *(__u32 *)res = *(volatile __u32 *)p; break;
	case 8: *(__u64 *)res = *(volatile __u64 *)p; break;
	default:
		barrier();
		__builtin_memcpy((void *)res, (const void *)p, size);
		barrier();
	}
}

static __always_inline void __write_once_size(volatile void *p, void *res, int size)
{
	switch (size) {
	case 1: *(volatile __u8 *)p = *(__u8 *)res; break;
	case 2: *(volatile __u16 *)p = *(__u16 *)res; break;
	case 4: *(volatile __u32 *)p = *(__u32 *)res; break;
	case 8: *(volatile __u64 *)p = *(__u64 *)res; break;
	default:
		barrier();
		__builtin_memcpy((void *)p, (const void *)res, size);
		barrier();
	}
}

#define READ_ONCE(x) \
	({ union { typeof(x) __val; char __c[1]; } __u; __read_once_size(&(x), __u.__c, sizeof(x)); __u.__val; })

#define WRITE_ONCE(x, val) \
	({ typeof(x) __val = (val); __write_once_size(&(x), &__val, sizeof(__val)); __val; })

As you can see, if __builtin_memcpy started to leverage the pure C
semantics (like not copying everything if the data changes at the
start of the memory buffer, on the grounds that this triggers
undefined behavior), things would get "interesting" as far as the
kernel is concerned. I mean, we depend on memcpy explicitly not
behaving the way memcmp is behaving now.

For non-atomic regions, we can read and write fine with
READ_ONCE/WRITE_ONCE; we just lost a way to do memcmp in userland on
non-atomic regions.
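
A minimal userland sketch of how these get used (simplified: plain
types instead of __u*, and assuming a 64-bit target where an aligned
8-byte access is performed as a single load/store):

#define barrier() asm volatile("" : : : "memory")

/* simplified userland versions of the kernel macros above */
#define READ_ONCE(x)		(*(volatile typeof(x) *)&(x))
#define WRITE_ONCE(x, v)	(*(volatile typeof(x) *)&(x) = (v))

static unsigned long long counter;

void producer(void)
{
	/* single writer: the compiler must emit exactly one store,
	 * it may not tear or elide it */
	WRITE_ONCE(counter, READ_ONCE(counter) + 1);
}

void consumer(unsigned long long v)
{
	/* each iteration performs a fresh load */
	while (READ_ONCE(counter) < v)
		barrier();
}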

Thanks,
Andrea
  
Ondrej Bilka June 18, 2015, 1:46 p.m. UTC | #7
On Thu, Jun 18, 2015 at 02:49:00PM +0200, Andrea Arcangeli wrote:
> [...]
> 
> For non-atomic regions, we can read and write fine with
> READ_ONCE/WRITE_ONCE; we just lost a way to do memcmp in userland on
> non-atomic regions.
> 
As I wrote before, that wouldn't work. You would need to create
something like READ_ONCE2(x, y) to read both arguments
simultaneously; otherwise I could create a race with memcmp(x, x+1, 1)
and another thread atomically changing both characters by first
writing 1, then 256.
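
For that particular adjacent-bytes case, getting both simultaneously
would mean one wider single-copy-atomic load, something like this
sketch (assuming x is int-aligned and little-endian, and that an
aligned int load is a single access on the target -- exactly the kind
of arch knowledge a generic memcmp doesn't have):

static int cmp_adjacent_bytes(const char *x)
{
	/* one aligned load observes x[0] and x[1] at the same instant */
	int v = *(const volatile int *)x;
	unsigned char b0 = v & 0xff;		/* x[0] */
	unsigned char b1 = (v >> 8) & 0xff;	/* x[1] */
	/* with the writer above, b0 and b1 can never both be 1 */
	return (b0 > b1) - (b0 < b1);
}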
  
Torvald Riegel June 18, 2015, 2:23 p.m. UTC | #8
On Thu, 2015-06-18 at 14:49 +0200, Andrea Arcangeli wrote:
> [...]
> 
> As you can see, if __builtin_memcpy started to leverage the pure C
> semantics (like not copying everything if the data changes at the
> start of the memory buffer, on the grounds that this triggers
> undefined behavior), things would get "interesting" as far as the
> kernel is concerned. I mean, we depend on memcpy explicitly not
> behaving the way memcmp is behaving now.

There are a few differences from your memcmp test case, though.  If
barrier() is a compiler barrier, it should "remove" any prior
knowledge the compiler has about the memory.  Second, you access
volatile-qualified data, which at least requires the accesses to be
performed as written, byte-wise or similar.  Does the memcmp test
case also fail when you access volatiles?

Note that I'm not saying that this is guaranteed to work.  It's still
UB if there is a data race.  However, in practice, this might just
continue to work.
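
In the reproducer's terms, that suggestion would be something like
this volatile variant of my_bcmp (a sketch; still UB under a data
race, as said):

static int my_volatile_bcmp(const volatile char *str1,
			    const volatile char *str2, unsigned long n)
{
	unsigned long i;
	/* volatile forces one fresh load per element; the compiler
	 * may not cache, widen, or repeat the loads */
	for (i = 0; i < n; i++)
		if (str1[i] != str2[i])
			return 1;
	return 0;
}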

> For non-atomic regions, we can read and write fine with
> READ_ONCE/WRITE_ONCE; we just lost a way to do memcmp in userland on
> non-atomic regions.

Userland programs should really try to transition to atomics and the
C11/C++11 memory model or just the __atomic builtins, if possible.
  
Andrea Arcangeli June 18, 2015, 3:54 p.m. UTC | #9
On Wed, Jun 17, 2015 at 10:19:58PM +0200, Ondřej Bílka wrote:
> More precisely, it is impossible without a very big hammer. My most
> effective solution would be a syscall that marks the affected pages
> read-only, to suspend threads that try to write into them before
> doing the comparison. You would need a simultaneous load of both
> memory areas, which hardware doesn't support.
> 
> Without that there would still be a race; the simplest example would be
> 
> char x[4];
> memcmp(x, x+1, 1)
> 
> where another thread just does the following:
> 
> *((int *)x) = 1;
> 	memcmp reads x[0] == 1
> *((int *)x) = 256;
> 	memcmp reads x[1] == 1

This actually shows a case where the result is always undefined.

Now suppose the above case, but len is 4096 (not 1), and the 4 bytes
are followed by 4092 bytes that are all different and never change (I
write the program, so I can force them never to change). How does
memcmp return 0 in that case?

> memcmp reads x[0] == 1

ok

> memcmp reads x[1] == 1

ok

4094 bytes are left to compare; do you stop here and return 0 because
the first two bytes are equal?

Because the current memcmp does stop (for other reasons, but it stops).

If the first part of the memory changes randomly, returning an
undefined value above or below zero is expected too; -1 or 1 would
both be expected, undefined results. What feels weird and unexpected
is only that it returns 0 despite the end being different and never
changing. That's the only issue I'm raising here about memcmp/bcmp.

In physical hardware terms, no matter how you change the memory at
the start of the compared region, it is defined that memcmp cannot
return 0, and the sse4 version does return 0.

I fully understand your arguments about the standard and I expected
this behavior was permitted.

I'm also pointing out that in the kernel we go a bit beyond what we
can pretend from C, with READ_ONCE/WRITE_ONCE/volatile/asm("memory"),
to provide RCU (and to implement the spinlock/mutex). I just wanted
to express my views on the practical aspects, and on how we could
enforce that memcmp is well defined to the extent that it can't
return 0 here: i.e. if it returns 0, it actually read all bytes of
the "length" parameter and each byte individually was equal at some
point in time, which cannot happen when the last part of the page
never changes and is never equal (granted, the parts have to be
separated at the atomic granularity of the arch, a granularity you
need to know, and which isn't 1 byte minimum on alpha for example).

I'm fine if no change is done, and it'd be great if at least the man
page of memcmp/bcmp were updated. If it were up to me, though, I'd
prefer to fix this case so that 0 isn't happily returned too soon, as
the unrolled-loop fast path won't require change.
  
Ondrej Bilka June 18, 2015, 6:05 p.m. UTC | #10
On Thu, Jun 18, 2015 at 05:54:42PM +0200, Andrea Arcangeli wrote:
> [...]
> 
> I'm fine if no change is done, and it'd be great if at least the man
> page of memcmp/bcmp were updated. If it were up to me, though, I'd
> prefer to fix this case so that 0 isn't happily returned too soon, as
> the unrolled-loop fast path won't require change.

I see now. As I am writing a new memcmp, I don't see that as likely,
as it adds extra overhead that's hard to justify.

Rereading is needed for good performance: a loop checks 64 bytes at
once, and sse2 uses destructive operations, so the original data
won't be there.

The best workaround would be to add, after the final subtraction, a
check whether it is zero, and then call

memcmp(x+found+1, y+found+1, remaining)

That could be almost free, as you just need to add a je branch after
the subtraction.

However, now I need to test a new way to check the first 16 bytes
that would avoid rereading. The problem is that it would scale worse
when you need to combine the results into two 64-byte masks instead
of one.

mov %rdx, %rcx
neg %rcx
movdqu (%rsi), %xmm0
movdqu (%rdi), %xmm1
movdqu %xmm0, %xmm2
pcmpgtb %xmm1, %xmm0
pcmpgtb %xmm2, %xmm1
pmovmskb %xmm0, %eax
pmovmskb %xmm1, %edx
bswap %eax
bswap %edx
shr %cl, %eax
shr %cl, %edx
cmp $-16, %rcx
jae L(next_48_bytes)
sub %edx, %eax
L(ret):
ret
L(next_48_bytes):
sub %edx, %eax
jne L(ret)
  
Rich Felker June 18, 2015, 9:43 p.m. UTC | #11
On Thu, Jun 18, 2015 at 08:05:17PM +0200, Ondřej Bílka wrote:
> [...]
> 
> The best workaround would be to add, after the final subtraction, a
> check whether it is zero, and then call
> 
> memcmp(x+found+1, y+found+1, remaining)
> 
> That could be almost free, as you just need to add a je branch after
> the subtraction.

I would much rather see that je branch go to an HCF ("halt and catch
fire") instruction. If you've detected UB then you should not hide it
but make it crash immediately.

Rich
  
Andrea Arcangeli June 19, 2015, 1:29 p.m. UTC | #12
On Thu, Jun 18, 2015 at 08:05:17PM +0200, Ondřej Bílka wrote:
> I see now. As I am writing a new memcmp, I don't see that as likely,
> as it adds extra overhead that's hard to justify.
> 
> Rereading is needed for good performance: a loop checks 64 bytes at
> once, and sse2 uses destructive operations, so the original data
> won't be there.
> 
> The best workaround would be to add, after the final subtraction, a
> check whether it is zero, and then call
> 
> memcmp(x+found+1, y+found+1, remaining)
> 
> That could be almost free, as you just need to add a je branch after
> the subtraction.

Yes, it's free for the unrolled loop; just the breakout of the
unrolled loop needs to adjust rdx in addition to rsi/rdi, to be able
to check whether it's at the end before returning zero.

> However, now I need to test a new way to check the first 16 bytes
> that would avoid rereading. The problem is that it would scale worse
> when you need to combine the results into two 64-byte masks instead
> of one.
> 
> mov %rdx, %rcx
> neg %rcx
> movdqu (%rsi), %xmm0
> movdqu (%rdi), %xmm1
> movdqu %xmm0, %xmm2
> pcmpgtb %xmm1, %xmm0
> pcmpgtb %xmm2, %xmm1
> pmovmskb %xmm0, %eax
> pmovmskb %xmm1, %edx
> bswap %eax
> bswap %edx

I think the unrolled loop is faster if it does ptest only; the above
is plenty more vector instructions than the current code does in its
sse4 part. I'd guess it's likely measurably slower, but that's just
my guess and I haven't benchmarked.

Just saying, the problem is not re-reading; re-reading is fine.
Above-zero or below-zero values would be undefined anyway, no matter
whether we re-read or not. The only defined thing is that the
function cannot return 0, and it currently does.

> shr %cl, %eax
> shr %cl, %edx
> cmp $-16, %rcx
> jae L(next_48_bytes)
> sub %edx, %eax
> L(ret):
> ret
> L(next_48_bytes):
> sub %edx, %eax
> jne L(ret)
>
  

Patch

diff --git a/equalhappybug/memcmp-sse4.S b/equalhappybug/memcmp-sse4.S
index 0de80df..5812792 100644
--- a/equalhappybug/memcmp-sse4.S
+++ b/equalhappybug/memcmp-sse4.S
@@ -205,7 +205,6 @@  L(less32bytesin128):
 	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
 
 L(less512bytes):
-	sub	$256, %rdx
 	movdqu	(%rdi), %xmm2
 	pxor	(%rsi), %xmm2
 	ptest	%xmm2, %xmm0
@@ -712,34 +706,41 @@  L(L2_L3_aligned_128bytes_loop):
 
 	.p2align 4
 L(64bytesormore_loop_end):
+	sub	$16, %rdx
 	add	$16, %rdi
 	add	$16, %rsi
 	ptest	%xmm2, %xmm0
 	jnc	L(16bytes)
 
+	sub	$16, %rdx
 	add	$16, %rdi
 	add	$16, %rsi
 	ptest	%xmm3, %xmm0
 	jnc	L(16bytes)
 
+	sub	$16, %rdx
 	add	$16, %rdi
 	add	$16, %rsi
 	ptest	%xmm4, %xmm0
 	jnc	L(16bytes)
 
+	sub	$16, %rdx
 	add	$16, %rdi
 	add	$16, %rsi
 	jmp	L(16bytes)
 
 L(256bytesin256):
+	sub	$256, %rdx
 	add	$256, %rdi
 	add	$256, %rsi
 	jmp	L(16bytes)
 L(240bytesin256):
+	sub	$240, %rdx
 	add	$240, %rdi
 	add	$240, %rsi
 	jmp	L(16bytes)
 L(224bytesin256):
+	sub	$224, %rdx
 	add	$224, %rdi
 	add	$224, %rsi
 	jmp	L(16bytes)
@@ -748,45 +749,56 @@  L(208bytesin256):
 	add	$208, %rsi
 	jmp	L(16bytes)
 L(192bytesin256):
+	sub	$192, %rdx
 	add	$192, %rdi
 	add	$192, %rsi
 	jmp	L(16bytes)
 L(176bytesin256):
+	sub	$176, %rdx
 	add	$176, %rdi
 	add	$176, %rsi
 	jmp	L(16bytes)
 L(160bytesin256):
+	sub	$160, %rdx
 	add	$160, %rdi
 	add	$160, %rsi
 	jmp	L(16bytes)
 L(144bytesin256):
+	sub	$144, %rdx
 	add	$144, %rdi
 	add	$144, %rsi
 	jmp	L(16bytes)
 L(128bytesin256):
+	sub	$128, %rdx
 	add	$128, %rdi
 	add	$128, %rsi
 	jmp	L(16bytes)
 L(112bytesin256):
+	sub	$112, %rdx
 	add	$112, %rdi
 	add	$112, %rsi
 	jmp	L(16bytes)
 L(96bytesin256):
+	sub	$96, %rdx
 	add	$96, %rdi
 	add	$96, %rsi
 	jmp	L(16bytes)
 L(80bytesin256):
+	sub	$80, %rdx
 	add	$80, %rdi
 	add	$80, %rsi
 	jmp	L(16bytes)
 L(64bytesin256):
+	sub	$64, %rdx
 	add	$64, %rdi
 	add	$64, %rsi
 	jmp	L(16bytes)
 L(48bytesin256):
+	sub	$16, %rdx
 	add	$16, %rdi
 	add	$16, %rsi
 L(32bytesin256):
+	sub	$16, %rdx
 	add	$16, %rdi
 	add	$16, %rsi
 L(16bytesin256):