@@ -139,6 +139,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
inode->i_blkbits = sb->s_blocksize_bits;
inode->i_flags = 0;
atomic64_set(&inode->i_sequence, 0);
+ atomic64_set(&inode->i_sequence2, 0);
atomic_set(&inode->i_count, 1);
inode->i_op = &empty_iops;
inode->i_fop = &no_open_fops;
@@ -683,6 +683,7 @@ struct inode {
};
atomic64_t i_version;
atomic64_t i_sequence; /* see futex */
+ atomic64_t i_sequence2; /* see futex2 */
atomic_t i_count;
atomic_t i_dio_count;
atomic_t i_writecount;
@@ -46,6 +46,8 @@
#define FUTEX_SIZE_MASK 0x3
+#define FUTEX_SHARED_FLAG 8
+
/*
* Support for robust futexes: the kernel cleans up held futexes at
* thread exit time.
@@ -14,8 +14,10 @@
*/
#include <linux/freezer.h>
+#include <linux/hugetlb.h>
#include <linux/jhash.h>
#include <linux/memblock.h>
+#include <linux/pagemap.h>
#include <linux/sched/wake_q.h>
#include <linux/spinlock.h>
#include <linux/syscalls.h>
@@ -23,8 +25,8 @@
/**
* struct futex_key - Components to build unique key for a futex
- * @pointer: Pointer to current->mm
- * @index: Start address of the page containing futex
+ * @pointer: Pointer to current->mm or the inode's UUID for file-backed futexes
+ * @index: Start address of the page containing the futex, or the page's index
* @offset: Address offset of uaddr in a page
*/
struct futex_key {
@@ -78,7 +80,12 @@ struct futex_bucket {
};
/* Mask for futex2 flag operations */
-#define FUTEX2_MASK (FUTEX_SIZE_MASK | FUTEX_CLOCK_REALTIME)
+#define FUTEX2_MASK (FUTEX_SIZE_MASK | FUTEX_CLOCK_REALTIME | FUTEX_SHARED_FLAG)
+
+/*
+ * NOTE(review): this macro expands to an expression that reads the local
+ * variables 'futexv' and 'i' at the use site — fragile and invisible to the
+ * reader; a static inline helper taking (futexv, i) would be type-checked
+ * and self-documenting.
+ */
+#define is_object_shared ((futexv->objects[i].flags & FUTEX_SHARED_FLAG) ? true : false)
+
+#define FUT_OFF_INODE 1 /* We set bit 0 if key has a reference on inode */
+#define FUT_OFF_MMSHARED 2 /* We set bit 1 if key has a reference on mm */
static struct futex_bucket *futex_table;
static unsigned int futex2_hashsize;
@@ -126,16 +133,198 @@ static inline int bucket_get_waiters(struct futex_bucket *bucket)
#endif
}
+/**
+ * futex_get_inode_uuid - Gets a UUID for an inode
+ * @inode: inode to get the UUID for
+ *
+ * Generate a machine wide unique identifier for this inode.
+ *
+ * This relies on u64 not wrapping in the life-time of the machine; which with
+ * 1ns resolution means almost 585 years.
+ *
+ * This further relies on the fact that a well formed program will not unmap
+ * the file while it has a (shared) futex waiting on it. This mapping will have
+ * a file reference which pins the mount and inode.
+ *
+ * If for some reason an inode gets evicted and read back in again, it will get
+ * a new sequence number and will _NOT_ match, even though it is the exact same
+ * file.
+ *
+ * It is important that match_futex() will never have a false-positive, esp.
+ * for PI futexes that can mess up the state. The above argues that false-negatives
+ * are only possible for malformed programs.
+ *
+ * Returns: UUID for the given inode
+ */
+static u64 futex_get_inode_uuid(struct inode *inode)
+{
+ /* Machine-wide counter shared by all inodes; value 0 means "unset" */
+ static atomic64_t i_seq;
+ u64 old;
+
+ /* Does the inode already have a sequence number? */
+ old = atomic64_read(&inode->i_sequence2);
+
+ if (likely(old))
+ return old;
+
+ for (;;) {
+ u64 new = atomic64_add_return(1, &i_seq);
+
+ /* 0 is reserved for "no UUID yet"; skip it if the counter ever wraps */
+ if (WARN_ON_ONCE(!new))
+ continue;
+
+ /* Publish our number unless another task already installed one */
+ old = atomic64_cmpxchg_relaxed(&inode->i_sequence2, 0, new);
+ if (old)
+ return old;
+ return new;
+ }
+}
+
+/**
+ * futex_get_shared_key - Get a key for a shared futex
+ * @address: Futex memory address
+ * @mm: Current process mm_struct pointer (note: currently unused by this
+ *      function — TODO confirm it is needed for a later change or drop it)
+ * @key: Key struct to be filled
+ *
+ * Returns: 0 on success, error code otherwise
+ */
+static int futex_get_shared_key(uintptr_t address, struct mm_struct *mm,
+ struct futex_key *key)
+{
+ int ret;
+ struct page *page, *tail;
+ struct address_space *mapping;
+
+again:
+ ret = get_user_pages_fast(address, 1, 0, &page);
+ if (ret < 0)
+ return ret;
+
+ /*
+ * The treatment of mapping from this point on is critical. The page
+ * lock protects many things but in this context the page lock
+ * stabilizes mapping, prevents inode freeing in the shared
+ * file-backed region case and guards against movement to swap cache.
+ *
+ * Strictly speaking the page lock is not needed in all cases being
+ * considered here and the page lock forces unnecessary serialization.
+ * From this point on, mapping will be re-verified if necessary and
+ * the page lock will be acquired only if it is unavoidable.
+ *
+ * Mapping checks require the head page for any compound page so the
+ * head page and mapping is looked up now. For anonymous pages, it
+ * does not matter if the page splits in the future as the key is
+ * based on the address. For filesystem-backed pages, the tail is
+ * required as the index of the page determines the key. For
+ * base pages, there is no tail page and tail == page.
+ */
+ tail = page;
+ page = compound_head(page);
+ mapping = READ_ONCE(page->mapping);
+
+ /*
+ * If page->mapping is NULL, then it cannot be a PageAnon
+ * page; but it might be the ZERO_PAGE or in the gate area or
+ * in a special mapping (all cases which we are happy to fail);
+ * or it may have been a good file page when get_user_pages_fast
+ * found it, but truncated or holepunched or subjected to
+ * invalidate_complete_page2 before we got the page lock (also
+ * cases which we are happy to fail). And we hold a reference,
+ * so refcount care in invalidate_complete_page's remove_mapping
+ * prevents drop_caches from setting mapping to NULL beneath us.
+ *
+ * The case we do have to guard against is when memory pressure made
+ * shmem_writepage move it from filecache to swapcache beneath us:
+ * an unlikely race, but we do need to retry for page->mapping.
+ */
+ if (unlikely(!mapping)) {
+ int shmem_swizzled;
+
+ /*
+ * Page lock is required to identify which special case above
+ * applies. If this is really a shmem page then the page lock
+ * will prevent unexpected transitions.
+ */
+ lock_page(page);
+ shmem_swizzled = PageSwapCache(page) || page->mapping;
+ unlock_page(page);
+ put_page(page);
+
+ if (shmem_swizzled)
+ goto again;
+
+ return -EFAULT;
+ }
+
+ /*
+ * If the futex key is stored on an anonymous page, then the associated
+ * object is the mm which is implicitly pinned by the calling process.
+ *
+ * NOTE: When userspace waits on a MAP_SHARED mapping, even if
+ * it's a read-only handle, it's expected that futexes attach to
+ * the object not the particular process.
+ */
+ if (PageAnon(page)) {
+ key->offset |= FUT_OFF_MMSHARED;
+ } else {
+ struct inode *inode;
+
+ /*
+ * The associated futex object in this case is the inode and
+ * the page->mapping must be traversed. Ordinarily this should
+ * be stabilised under page lock but it's not strictly
+ * necessary in this case as we just want to pin the inode, not
+ * update the radix tree or anything like that.
+ *
+ * The RCU read lock is taken as the inode is finally freed
+ * under RCU. If the mapping still matches expectations then the
+ * mapping->host can be safely accessed as being a valid inode.
+ */
+ rcu_read_lock();
+
+ if (READ_ONCE(page->mapping) != mapping) {
+ rcu_read_unlock();
+ put_page(page);
+
+ goto again;
+ }
+
+ inode = READ_ONCE(mapping->host);
+ if (!inode) {
+ rcu_read_unlock();
+ put_page(page);
+
+ goto again;
+ }
+
+ key->pointer = futex_get_inode_uuid(inode);
+ key->index = (unsigned long)basepage_index(tail);
+ key->offset |= FUT_OFF_INODE;
+
+ rcu_read_unlock();
+ }
+
+ put_page(page);
+
+ return 0;
+}
+
/**
* futex_get_bucket - Check if the user address is valid, prepare internal
* data and calculate the hash
* @uaddr: futex user address
* @key: data that uniquely identifies a futex
+ * @shared: is this a shared futex?
+ *
+ * For private futexes, each uaddr will be unique for a given mm_struct, and it
+ * won't be freed for the lifetime of the process. For shared futexes, check
+ * futex_get_shared_key().
*
* Return: address of bucket on success, error code otherwise
*/
static struct futex_bucket *futex_get_bucket(void __user *uaddr,
- struct futex_key *key)
+ struct futex_key *key,
+ bool shared)
{
uintptr_t address = (uintptr_t)uaddr;
u32 hash_key;
@@ -151,6 +340,9 @@ static struct futex_bucket *futex_get_bucket(void __user *uaddr,
key->pointer = (u64)address;
key->index = (unsigned long)current->mm;
+ /*
+  * NOTE(review): futex_get_shared_key() can fail (e.g. -EFAULT from
+  * get_user_pages_fast); its return value is ignored here, so a fault
+  * silently falls back to a private-style key — confirm whether the
+  * error should propagate to the caller instead.
+  */
+ if (shared)
+ futex_get_shared_key(address, current->mm, key);
+ 
/* Generate hash key for this futex using uaddr and current->mm */
hash_key = jhash2((u32 *)key, sizeof(*key) / sizeof(u32), 0);
@@ -288,6 +480,7 @@ static int futex_enqueue(struct futex_waiter_head *futexv, unsigned int nr_futex
int i, ret;
u32 uval, val;
u32 __user *uaddr;
+ bool retry = false;
struct futex_bucket *bucket;
retry:
@@ -297,6 +490,18 @@ static int futex_enqueue(struct futex_waiter_head *futexv, unsigned int nr_futex
uaddr = (u32 __user *)futexv->objects[i].uaddr;
val = (u32)futexv->objects[i].val;
+ if (is_object_shared && retry) {
+ struct futex_bucket *tmp =
+ futex_get_bucket((void __user *)uaddr,
+ &futexv->objects[i].key, true);
+ if (IS_ERR(tmp)) {
+ __set_current_state(TASK_RUNNING);
+ futex_dequeue_multiple(futexv, i);
+ return PTR_ERR(tmp);
+ }
+ futexv->objects[i].bucket = tmp;
+ }
+
bucket = futexv->objects[i].bucket;
bucket_inc_waiters(bucket);
@@ -317,6 +522,7 @@ static int futex_enqueue(struct futex_waiter_head *futexv, unsigned int nr_futex
if (__get_user(uval, uaddr))
return -EFAULT;
+ retry = true;
goto retry;
}
@@ -430,6 +636,7 @@ static int __futex_waitv(struct futex_waiter_head *futexv, unsigned int nr_futex
static long ksys_futex_wait(void __user *uaddr, u64 val, unsigned int flags,
struct __kernel_timespec __user *timo)
{
+ bool shared = (flags & FUTEX_SHARED_FLAG) ? true : false;
unsigned int size = flags & FUTEX_SIZE_MASK;
struct futex_waiter *waiter;
struct futex_waiter_head *futexv;
@@ -459,7 +666,7 @@ static long ksys_futex_wait(void __user *uaddr, u64 val, unsigned int flags,
INIT_LIST_HEAD(&waiter->list);
/* Get an unlocked hash bucket */
- waiter->bucket = futex_get_bucket(uaddr, &waiter->key);
+ waiter->bucket = futex_get_bucket(uaddr, &waiter->key, shared);
if (IS_ERR(waiter->bucket))
return PTR_ERR(waiter->bucket);
@@ -491,7 +698,6 @@ COMPAT_SYSCALL_DEFINE4(compat_futex_wait, void __user *, uaddr, compat_u64, val,
SYSCALL_DEFINE4(futex_wait, void __user *, uaddr, u64, val, unsigned int, flags,
struct __kernel_timespec __user *, timo)
{
-
return ksys_futex_wait(uaddr, val, flags, timo);
}
@@ -554,6 +760,7 @@ static inline bool futex_match(struct futex_key key1, struct futex_key key2)
SYSCALL_DEFINE3(futex_wake, void __user *, uaddr, unsigned int, nr_wake,
unsigned int, flags)
{
+ bool shared = (flags & FUTEX_SHARED_FLAG) ? true : false;
unsigned int size = flags & FUTEX_SIZE_MASK;
struct futex_waiter waiter, *aux, *tmp;
struct futex_bucket *bucket;
@@ -566,7 +773,7 @@ SYSCALL_DEFINE3(futex_wake, void __user *, uaddr, unsigned int, nr_wake,
if (size != FUTEX_32)
return -EINVAL;
- bucket = futex_get_bucket(uaddr, &waiter.key);
+ bucket = futex_get_bucket(uaddr, &waiter.key, shared);
if (IS_ERR(bucket))
return PTR_ERR(bucket);