RHEL4/kernel/futex.c
<<
>>
Prefs
   1/*
   2 *  Fast Userspace Mutexes (which I call "Futexes!").
   3 *  (C) Rusty Russell, IBM 2002
   4 *
   5 *  Generalized futexes, futex requeueing, misc fixes by Ingo Molnar
   6 *  (C) Copyright 2003 Red Hat Inc, All Rights Reserved
   7 *
   8 *  Removed page pinning, fix privately mapped COW pages and other cleanups
   9 *  (C) Copyright 2003 Jamie Lokier
  10 *
  11 *  Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
  12 *  enough at me, Linus for the original (flawed) idea, Matthew
  13 *  Kirkwood for proof-of-concept implementation.
  14 *
  15 *  "The futexes are also cursed."
  16 *  "But they come in a choice of three flavours!"
  17 *
  18 *  This program is free software; you can redistribute it and/or modify
  19 *  it under the terms of the GNU General Public License as published by
  20 *  the Free Software Foundation; either version 2 of the License, or
  21 *  (at your option) any later version.
  22 *
  23 *  This program is distributed in the hope that it will be useful,
  24 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  25 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  26 *  GNU General Public License for more details.
  27 *
  28 *  You should have received a copy of the GNU General Public License
  29 *  along with this program; if not, write to the Free Software
  30 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  31 */
#include <linux/slab.h>
#include <linux/poll.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/err.h>
#include <linux/futex.h>
#include <linux/mount.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
  42
/* log2 of the number of hash buckets in futex_queues[]. */
#define FUTEX_HASHBITS 8
  44
/*
 * Futexes are matched on equal values of this key.
 * The key type depends on whether it's a shared or private mapping.
 * Don't rearrange members without looking at hash_futex(), which hashes
 * the first two members as raw u32 words and uses 'offset' as the seed.
 *
 * The three views overlay the same storage: 'shared' and 'private' are
 * filled in by get_futex_key(), while 'both' lets the hashing, matching
 * and reference-counting code handle a key without caring which kind
 * it is.
 *
 * offset is aligned to a multiple of sizeof(u32) (== 4) by definition.
 * We set bit 0 to indicate if it's an inode-based key.
 */
union futex_key {
        /* Futex inside a VM_MAYSHARE mapping (bit 0 of offset is set). */
        struct {
                unsigned long pgoff;    /* page index of the futex's page */
                struct inode *inode;    /* inode of the mapped file */
                int offset;             /* offset within the page, plus 1 */
        } shared;
        /* Futex inside a private mapping (bit 0 of offset is clear). */
        struct {
                unsigned long uaddr;    /* page-aligned user address */
                struct mm_struct *mm;   /* address space owning uaddr */
                int offset;             /* offset within the page */
        } private;
        /* Type-agnostic view used for hashing and comparison. */
        struct {
                unsigned long word;
                void *ptr;
                int offset;
        } both;
};
  70
/*
 * We use this hashed waitqueue instead of a normal wait_queue_t, so
 * we can wake only the relevant ones (hashed queues may be shared).
 *
 * A futex_q has a woken state, just like tasks have TASK_RUNNING.
 * It is considered woken when list_empty(&q->list) || q->lock_ptr == 0.
 * The order of wakeup is always to make the first condition true, then
 * wake up q->waiters, then make the second condition true.
 */
struct futex_q {
        /* Link in the hash bucket's chain; empty once woken. */
        struct list_head list;
        /* Tasks (or pollers) sleeping on this entry. */
        wait_queue_head_t waiters;

        /* Which hash list lock to use. */
        spinlock_t *lock_ptr;

        /* Key which the futex is hashed on. */
        union futex_key key;

        /* For fd, sigio sent using these. */
        int fd;
        struct file *filp;
};
  94
/*
 * Split the global futex_lock into every hash list lock.
 *
 * One bucket of the futex hash table: 'lock' protects 'chain', the list
 * of futex_q entries whose keys hash to this bucket.
 */
struct futex_hash_bucket {
       spinlock_t              lock;
       struct list_head       chain;
};
 102
/* The futex hash table, indexed by hash_futex(); set up in init(). */
static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS];
 104
/* Futex-fs vfsmount entry: assigned in init(), used by futex_fd(). */
static struct vfsmount *futex_mnt;
 107
 108/*
 109 * We hash on the keys returned from get_futex_key (see below).
 110 */
 111static struct futex_hash_bucket *hash_futex(union futex_key *key)
 112{
 113        u32 hash = jhash2((u32*)&key->both.word,
 114                          (sizeof(key->both.word)+sizeof(key->both.ptr))/4,
 115                          key->both.offset);
 116        return &futex_queues[hash & ((1 << FUTEX_HASHBITS)-1)];
 117}
 118
 119/*
 120 * Return 1 if two futex_keys are equal, 0 otherwise.
 121 */
 122static inline int match_futex(union futex_key *key1, union futex_key *key2)
 123{
 124        return (key1->both.word == key2->both.word
 125                && key1->both.ptr == key2->both.ptr
 126                && key1->both.offset == key2->both.offset);
 127}
 128
 129/*
 130 * Get parameters which are the keys for a futex.
 131 *
 132 * For shared mappings, it's (page->index, vma->vm_file->f_dentry->d_inode,
 133 * offset_within_page).  For private mappings, it's (uaddr, current->mm).
 134 * We can usually work out the index without swapping in the page.
 135 *
 136 * Returns: 0, or negative error code.
 137 * The key words are stored in *key on success.
 138 *
 139 * Should be called with &current->mm->mmap_sem but NOT any spinlocks.
 140 */
 141static int get_futex_key(unsigned long uaddr, union futex_key *key)
 142{
 143        struct mm_struct *mm = current->mm;
 144        struct vm_area_struct *vma;
 145        struct page *page;
 146        int err;
 147
 148        /*
 149         * The futex address must be "naturally" aligned.
 150         */
 151        key->both.offset = uaddr % PAGE_SIZE;
 152        if (unlikely((key->both.offset % sizeof(u32)) != 0))
 153                return -EINVAL;
 154        uaddr -= key->both.offset;
 155
 156        /*
 157         * The futex is hashed differently depending on whether
 158         * it's in a shared or private mapping.  So check vma first.
 159         */
 160        vma = find_extend_vma(mm, uaddr);
 161        if (unlikely(!vma))
 162                return -EFAULT;
 163
 164        /*
 165         * Permissions.
 166         */
 167        if (unlikely((vma->vm_flags & (VM_IO|VM_READ)) != VM_READ))
 168                return (vma->vm_flags & VM_IO) ? -EPERM : -EACCES;
 169
 170        /*
 171         * Private mappings are handled in a simple way.
 172         *
 173         * NOTE: When userspace waits on a MAP_SHARED mapping, even if
 174         * it's a read-only handle, it's expected that futexes attach to
 175         * the object not the particular process.  Therefore we use
 176         * VM_MAYSHARE here, not VM_SHARED which is restricted to shared
 177         * mappings of _writable_ handles.
 178         */
 179        if (likely(!(vma->vm_flags & VM_MAYSHARE))) {
 180                key->private.mm = mm;
 181                key->private.uaddr = uaddr;
 182                return 0;
 183        }
 184
 185        /*
 186         * Linear file mappings are also simple.
 187         */
 188        key->shared.inode = vma->vm_file->f_dentry->d_inode;
 189        key->both.offset++; /* Bit 0 of offset indicates inode-based key. */
 190        if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
 191                key->shared.pgoff = (((uaddr - vma->vm_start) >> PAGE_SHIFT)
 192                                     + vma->vm_pgoff);
 193                return 0;
 194        }
 195
 196        /*
 197         * We could walk the page table to read the non-linear
 198         * pte, and get the page index without fetching the page
 199         * from swap.  But that's a lot of code to duplicate here
 200         * for a rare case, so we simply fetch the page.
 201         */
 202
 203        /*
 204         * Do a quick atomic lookup first - this is the fastpath.
 205         */
 206        spin_lock(&current->mm->page_table_lock);
 207        page = follow_page(mm, uaddr, 0);
 208        if (likely(page != NULL)) {
 209                key->shared.pgoff =
 210                        page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 211                spin_unlock(&current->mm->page_table_lock);
 212                return 0;
 213        }
 214        spin_unlock(&current->mm->page_table_lock);
 215
 216        /*
 217         * Do it the general way.
 218         */
 219        err = get_user_pages(current, mm, uaddr, 1, 0, 0, &page, NULL);
 220        if (err >= 0) {
 221                key->shared.pgoff =
 222                        page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 223                put_page(page);
 224                return 0;
 225        }
 226        return err;
 227}
 228
 229/*
 230 * Take a reference to the resource addressed by a key.
 231 * Can be called while holding spinlocks.
 232 *
 233 * NOTE: mmap_sem MUST be held between get_futex_key() and calling this
 234 * function, if it is called at all.  mmap_sem keeps key->shared.inode valid.
 235 */
 236static inline void get_key_refs(union futex_key *key)
 237{
 238        if (key->both.ptr != 0) {
 239                if (key->both.offset & 1)
 240                        atomic_inc(&key->shared.inode->i_count);
 241                else
 242                        atomic_inc(&key->private.mm->mm_count);
 243        }
 244}
 245
 246/*
 247 * Drop a reference to the resource addressed by a key.
 248 * The hash bucket spinlock must not be held.
 249 */
 250static void drop_key_refs(union futex_key *key)
 251{
 252        if (key->both.ptr != 0) {
 253                if (key->both.offset & 1)
 254                        iput(key->shared.inode);
 255                else
 256                        mmdrop(key->private.mm);
 257        }
 258}
 259
 260static inline int get_futex_value_locked(int *dest, int __user *from)
 261{
 262        int ret;
 263
 264        inc_preempt_count();
 265        ret = __copy_from_user_inatomic(dest, from, sizeof(int));
 266        dec_preempt_count();
 267
 268        return ret ? -EFAULT : 0;
 269}
 270
/*
 * Wake one queued waiter and put its futex_q into the woken state.
 *
 * In order: unlink q from its hash chain, call send_sigio() on the
 * owning file if q->filp is set (entries created by futex_fd()), wake
 * everything sleeping on q->waiters, and finally clear q->lock_ptr.
 *
 * The hash bucket lock must be held when this is called.
 * Afterwards, the futex_q must not be accessed.
 */
static void wake_futex(struct futex_q *q)
{
        /* Makes list_empty(&q->list) true: first half of "woken". */
        list_del_init(&q->list);
        if (q->filp)
                send_sigio(&q->filp->f_owner, q->fd, POLL_IN);
        /*
         * The lock in wake_up_all() is a crucial memory barrier after the
         * list_del_init() and also before assigning to q->lock_ptr.
         */
        wake_up_all(&q->waiters);
        /*
         * The waiting task can free the futex_q as soon as this is written,
         * without taking any locks.  This must come last.
         *
         * A memory barrier is required here to prevent the following store
         * to lock_ptr from getting ahead of the wakeup. Clearing the lock
         * at the end of wake_up_all() does not prevent this store from
         * moving.
         */
        wmb();
        q->lock_ptr = NULL;
}
 297
 298/*
 299 * Wake up all waiters hashed on the physical page that is mapped
 300 * to this virtual address:
 301 */
 302static int futex_wake(unsigned long uaddr, int nr_wake)
 303{
 304        union futex_key key;
 305        struct futex_hash_bucket *bh;
 306        struct list_head *head;
 307        struct futex_q *this, *next;
 308        int ret;
 309
 310        down_read(&current->mm->mmap_sem);
 311
 312        ret = get_futex_key(uaddr, &key);
 313        if (unlikely(ret != 0))
 314                goto out;
 315
 316        bh = hash_futex(&key);
 317        spin_lock(&bh->lock);
 318        head = &bh->chain;
 319
 320        list_for_each_entry_safe(this, next, head, list) {
 321                if (match_futex (&this->key, &key)) {
 322                        wake_futex(this);
 323                        if (++ret >= nr_wake)
 324                                break;
 325                }
 326        }
 327
 328        spin_unlock(&bh->lock);
 329out:
 330        up_read(&current->mm->mmap_sem);
 331        return ret;
 332}
 333
/*
 * Wake some of the waiters queued on the futex at uaddr1 and move
 * others over to the futex at uaddr2.
 *
 * uaddr1:     futex whose waiters are processed.
 * uaddr2:     futex the remaining waiters are requeued onto.
 * nr_wake:    the first nr_wake matching waiters are woken.
 * nr_requeue: after that, waiters are requeued until nr_requeue of them
 *             have been moved.
 * valp:       if non-NULL, *uaddr1 must equal *valp, otherwise nothing
 *             is done and -EAGAIN is returned (FUTEX_CMP_REQUEUE).
 *
 * Returns the number of waiters woken plus the number requeued, or a
 * negative error code.
 */
static int futex_requeue(unsigned long uaddr1, unsigned long uaddr2,
                         int nr_wake, int nr_requeue, int *valp)
{
        union futex_key key1, key2;
        struct futex_hash_bucket *bh1, *bh2;
        struct list_head *head1;
        struct futex_q *this, *next;
        int ret, drop_count = 0;

 retry:
        down_read(&current->mm->mmap_sem);

        ret = get_futex_key(uaddr1, &key1);
        if (unlikely(ret != 0))
                goto out;
        ret = get_futex_key(uaddr2, &key2);
        if (unlikely(ret != 0))
                goto out;

        bh1 = hash_futex(&key1);
        bh2 = hash_futex(&key2);

        /*
         * Take both bucket locks in address order.  When both keys hash
         * to the same bucket neither comparison is true and the lock is
         * taken exactly once, through bh2.
         */
        if (bh1 < bh2)
                spin_lock(&bh1->lock);
        spin_lock(&bh2->lock);
        if (bh1 > bh2)
                spin_lock(&bh1->lock);

        if (likely(valp != NULL)) {
                int curval;

                ret = get_futex_value_locked(&curval, (int __user *)uaddr1);

                if (unlikely(ret)) {
                        spin_unlock(&bh1->lock);
                        if (bh1 != bh2)
                                spin_unlock(&bh2->lock);

                        /* If we would have faulted, release mmap_sem, fault
                         * it in and start all over again.
                         */
                        up_read(&current->mm->mmap_sem);

                        ret = get_user(curval, (int __user *)uaddr1);

                        if (!ret)
                                goto retry;

                        return ret;
                }
                if (curval != *valp) {
                        ret = -EAGAIN;
                        goto out_unlock;
                }
        }

        /* ret is 0 here and from now on counts the matching waiters. */
        head1 = &bh1->chain;
        list_for_each_entry_safe(this, next, head1, list) {
                if (!match_futex (&this->key, &key1))
                        continue;
                if (++ret <= nr_wake) {
                        wake_futex(this);
                } else {
                        /*
                         * Move the waiter to key2's bucket and rekey it.
                         * It gains a reference on key2 here; the one it
                         * held on key1 is dropped after the locks are
                         * released (see drop_count).
                         */
                        list_move_tail(&this->list, &bh2->chain);
                        this->lock_ptr = &bh2->lock;
                        this->key = key2;
                        get_key_refs(&key2);
                        drop_count++;

                        if (ret - nr_wake >= nr_requeue)
                                break;
                        /* Make sure to stop if key1 == key2 */
                        if (head1 == &bh2->chain && head1 != &next->list)
                                head1 = &this->list;
                }
        }

out_unlock:
        spin_unlock(&bh1->lock);
        if (bh1 != bh2)
                spin_unlock(&bh2->lock);

        /* drop_key_refs() must be called outside the spinlocks. */
        while (--drop_count >= 0)
                drop_key_refs(&key1);

out:
        up_read(&current->mm->mmap_sem);
        return ret;
}
 428
 429/* The key must be already stored in q->key. */
 430static inline struct futex_hash_bucket *
 431queue_lock(struct futex_q *q, int fd, struct file *filp)
 432{
 433        struct futex_hash_bucket *hb;
 434
 435        q->fd = fd;
 436        q->filp = filp;
 437
 438        init_waitqueue_head(&q->waiters);
 439
 440        get_key_refs(&q->key);
 441        hb = hash_futex(&q->key);
 442        q->lock_ptr = &hb->lock;
 443
 444        spin_lock(&hb->lock);
 445        return hb;
 446}
 447
/*
 * Second half of queueing: append q to the chain of hb and release the
 * bucket lock that queue_lock() acquired.
 */
static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
{
        list_add_tail(&q->list, &hb->chain);
        spin_unlock(&hb->lock);
}
 453
/*
 * Back out of queue_lock() without queueing: release the bucket lock,
 * then drop the key reference queue_lock() took.  The unlock comes first
 * because drop_key_refs() must not run with the bucket lock held.
 */
static inline void
queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
{
        spin_unlock(&hb->lock);
        drop_key_refs(&q->key);
}
 460
/*
 * queue_me and unqueue_me must be called as a pair, each
 * exactly once.  Both take the hash bucket spinlock themselves, so
 * they must be called without it held.
 */
 465
 466/* The key must be already stored in q->key. */
 467static void queue_me(struct futex_q *q, int fd, struct file *filp)
 468{
 469        struct futex_hash_bucket *hb;
 470
 471        hb = queue_lock(q, fd, filp);
 472        __queue_me(q, hb);
 473}
 474
/*
 * Take q off its hash chain if it is still queued, then drop the
 * reference held on q->key.
 *
 * Return 1 if we were still queued (ie. 0 means we were woken)
 */
static int unqueue_me(struct futex_q *q)
{
        spinlock_t *lock_ptr;
        int ret = 0;

        /* In the common case we don't take the spinlock, which is nice. */
 retry:
        lock_ptr = q->lock_ptr;
        /*
         * Compiler barrier: intended to make the test and the
         * spin_lock() below use this one snapshot of q->lock_ptr.
         */
        barrier();
        if (lock_ptr != 0) {
                spin_lock(lock_ptr);
                /*
                 * q->lock_ptr can change between reading it and
                 * spin_lock(), causing us to take the wrong lock.  This
                 * corrects the race condition.
                 *
                 * Reasoning goes like this: if we have the wrong lock,
                 * q->lock_ptr must have changed (maybe several times)
                 * between reading it and the spin_lock().  It can
                 * change again after the spin_lock() but only if it was
                 * already changed before the spin_lock().  It cannot,
                 * however, change back to the original value.  Therefore
                 * we can detect whether we acquired the correct lock.
                 */
                if (unlikely(lock_ptr != q->lock_ptr)) {
                        spin_unlock(lock_ptr);
                        goto retry;
                }
                /*
                 * We hold the right bucket lock and lock_ptr is still
                 * set, so wake_futex() has not run on q and q should
                 * still be linked into the chain.
                 */
                WARN_ON(list_empty(&q->list));
                list_del(&q->list);
                spin_unlock(lock_ptr);
                ret = 1;
        }

        /* Done outside the bucket lock, as drop_key_refs() requires. */
        drop_key_refs(&q->key);
        return ret;
}
 513
/*
 * Block the calling task on the futex at uaddr, provided the user word
 * still holds val.
 *
 * uaddr: user address of the futex word.
 * val:   value *uaddr is expected to hold.
 * time:  timeout, handed to schedule_timeout().
 *
 * Returns 0 if we were woken through the futex, -EWOULDBLOCK if *uaddr
 * did not contain val, -ETIMEDOUT if the timeout ran out, -EINTR if we
 * stopped sleeping for another reason, or the error code from
 * get_futex_key() / get_user().
 */
static int futex_wait(unsigned long uaddr, int val, unsigned long time)
{
        struct task_struct *curr = current;
        DECLARE_WAITQUEUE(wait, curr);
        struct futex_hash_bucket *hb;
        int ret, curval;
        struct futex_q q;

 retry:
        down_read(&curr->mm->mmap_sem);

        ret = get_futex_key(uaddr, &q.key);
        if (unlikely(ret != 0))
                goto out_release_sem;

        /* Returns with the bucket lock held; q is not on the chain yet. */
        hb = queue_lock(&q, -1, NULL);

        /*
         * Access the page AFTER the futex is queued.
         * Order is important:
         *
         *   Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val);
         *   Userspace waker:  if (cond(var)) { var = new; futex_wake(&var); }
         *
         * The basic logical guarantee of a futex is that it blocks ONLY
         * if cond(var) is known to be true at the time of blocking, for
         * any cond.  If we queued after testing *uaddr, that would open
         * a race condition where we could block indefinitely with
         * cond(var) false, which would violate the guarantee.
         *
         * A consequence is that futex_wait() can return zero and absorb
         * a wakeup when *uaddr != val on entry to the syscall.  This is
         * rare, but normal.
         *
         * We hold the mmap semaphore, so the mapping cannot have changed
         * since we looked it up in get_futex_key.
         */
        ret = get_futex_value_locked(&curval, (int __user *)uaddr);

        if (unlikely(ret)) {
                queue_unlock(&q, hb);

                /*
                 * If we would have faulted, release mmap_sem, fault it in and
                 * start all over again.
                 */
                up_read(&curr->mm->mmap_sem);

                ret = get_user(curval, (int __user *)uaddr);

                if (!ret)
                        goto retry;
                return ret;
        }
        if (curval != val) {
                ret = -EWOULDBLOCK;
                queue_unlock(&q, hb);
                goto out_release_sem;
        }

        /* Only actually queue if *uaddr contained val.  */
        __queue_me(&q, hb);

        /*
         * Now the futex is queued and we have checked the data, we
         * don't want to hold mmap_sem while we sleep.
         */
        up_read(&curr->mm->mmap_sem);

        /*
         * There might have been scheduling since the queue_me(), as we
         * cannot hold a spinlock across the get_user() in case it
         * faults, and we cannot just set TASK_INTERRUPTIBLE state when
         * queueing ourselves into the futex hash.  This code thus has to
         * rely on the futex_wake() code removing us from hash when it
         * wakes us up.
         */

        /* add_wait_queue is the barrier after __set_current_state. */
        __set_current_state(TASK_INTERRUPTIBLE);
        add_wait_queue(&q.waiters, &wait);
        /*
         * !list_empty() is safe here without any lock.
         * q.lock_ptr != 0 is not safe, because of ordering against wakeup.
         */
        if (likely(!list_empty(&q.list)))
                time = schedule_timeout(time);
        __set_current_state(TASK_RUNNING);

        /*
         * NOTE: we don't remove ourselves from the waitqueue because
         * we are the only user of it.
         */

        /* If we were woken (and unqueued), we succeeded, whatever. */
        if (!unqueue_me(&q))
                return 0;
        /* Still queued: schedule_timeout() returning 0 means it expired. */
        if (time == 0)
                return -ETIMEDOUT;
        /*
         * We expect signal_pending(current), but another thread may
         * have handled it for us already.
         */
        return -EINTR;

 out_release_sem:
        up_read(&curr->mm->mmap_sem);
        return ret;
}
 623
 624static int futex_close(struct inode *inode, struct file *filp)
 625{
 626        struct futex_q *q = filp->private_data;
 627
 628        unqueue_me(q);
 629        kfree(q);
 630        return 0;
 631}
 632
 633/* This is one-shot: once it's gone off you need a new fd */
 634static unsigned int futex_poll(struct file *filp,
 635                               struct poll_table_struct *wait)
 636{
 637        struct futex_q *q = filp->private_data;
 638        int ret = 0;
 639
 640        poll_wait(filp, &q->waiters, wait);
 641
 642        /*
 643         * list_empty() is safe here without any lock.
 644         * q->lock_ptr != 0 is not safe, because of ordering against wakeup.
 645         */
 646        if (list_empty(&q->list))
 647                ret = POLLIN | POLLRDNORM;
 648
 649        return ret;
 650}
 651
/* File operations installed on the files created by futex_fd(). */
static struct file_operations futex_fops = {
        .release        = futex_close,
        .poll           = futex_poll,
};
 656
 657/*
 658 * Signal allows caller to avoid the race which would occur if they
 659 * set the sigio stuff up afterwards.
 660 */
 661static int futex_fd(unsigned long uaddr, int signal)
 662{
 663        struct futex_q *q;
 664        struct file *filp;
 665        int ret, err;
 666
 667        ret = -EINVAL;
 668        if (signal < 0 || signal > _NSIG)
 669                goto out;
 670
 671        ret = get_unused_fd();
 672        if (ret < 0)
 673                goto out;
 674        filp = get_empty_filp();
 675        if (!filp) {
 676                put_unused_fd(ret);
 677                ret = -ENFILE;
 678                goto out;
 679        }
 680        filp->f_op = &futex_fops;
 681        filp->f_vfsmnt = mntget(futex_mnt);
 682        filp->f_dentry = dget(futex_mnt->mnt_root);
 683        filp->f_mapping = filp->f_dentry->d_inode->i_mapping;
 684
 685        if (signal) {
 686                int err;
 687                err = f_setown(filp, current->pid, 1);
 688                if (err < 0) {
 689                        put_unused_fd(ret);
 690                        put_filp(filp);
 691                        ret = err;
 692                        goto out;
 693                }
 694                filp->f_owner.signum = signal;
 695        }
 696
 697        q = kmalloc(sizeof(*q), GFP_KERNEL);
 698        if (!q) {
 699                put_unused_fd(ret);
 700                put_filp(filp);
 701                ret = -ENOMEM;
 702                goto out;
 703        }
 704
 705        down_read(&current->mm->mmap_sem);
 706        err = get_futex_key(uaddr, &q->key);
 707
 708        if (unlikely(err != 0)) {
 709                up_read(&current->mm->mmap_sem);
 710                put_unused_fd(ret);
 711                put_filp(filp);
 712                kfree(q);
 713                return err;
 714        }
 715
 716        /*
 717         * queue_me() must be called before releasing mmap_sem, because
 718         * key->shared.inode needs to be referenced while holding it.
 719         */
 720        filp->private_data = q;
 721
 722        queue_me(q, ret, filp);
 723        up_read(&current->mm->mmap_sem);
 724
 725        /* Now we map fd to filp, so userspace can access it */
 726        fd_install(ret, filp);
 727out:
 728        return ret;
 729}
 730
 731long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
 732                unsigned long uaddr2, int val2, int val3)
 733{
 734        int ret;
 735
 736        switch (op) {
 737        case FUTEX_WAIT:
 738                ret = futex_wait(uaddr, val, timeout);
 739                break;
 740        case FUTEX_WAKE:
 741                ret = futex_wake(uaddr, val);
 742                break;
 743        case FUTEX_FD:
 744                /* non-zero val means F_SETOWN(getpid()) & F_SETSIG(val) */
 745                ret = futex_fd(uaddr, val);
 746                break;
 747        case FUTEX_REQUEUE:
 748                ret = futex_requeue(uaddr, uaddr2, val, val2, NULL);
 749                break;
 750        case FUTEX_CMP_REQUEUE:
 751                ret = futex_requeue(uaddr, uaddr2, val, val2, &val3);
 752                break;
 753        default:
 754                ret = -ENOSYS;
 755        }
 756        return ret;
 757}
 758
 759
 760asmlinkage long sys_futex(u32 __user *uaddr, int op, int val,
 761                          struct timespec __user *utime, u32 __user *uaddr2,
 762                          int val3)
 763{
 764        struct timespec t;
 765        unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
 766        int val2 = 0;
 767
 768        if (utime && (op == FUTEX_WAIT)) {
 769                if (copy_from_user(&t, utime, sizeof(t)) != 0)
 770                        return -EFAULT;
 771                if ((t.tv_sec < 0) || (((unsigned) t.tv_nsec) >= NSEC_PER_SEC))
 772                        return -EINVAL;
 773                timeout = timespec_to_jiffies(&t) + 1;
 774        }
 775        /*
 776         * requeue parameter in 'utime' if op == FUTEX_REQUEUE.
 777         */
 778        if (op >= FUTEX_REQUEUE)
 779                val2 = (int) (unsigned long) utime;
 780
 781        return do_futex((unsigned long)uaddr, op, val, timeout,
 782                        (unsigned long)uaddr2, val2, val3);
 783}
 784
/*
 * ->get_sb callback of futexfs: hands everything to get_sb_pseudo()
 * under the name "futex".  flags, dev_name and data are unused.
 * NOTE(review): the NULL and 0xBAD1DEA arguments are presumably the
 * super_operations and the filesystem magic -- confirm against
 * get_sb_pseudo().
 */
static struct super_block *
futexfs_get_sb(struct file_system_type *fs_type,
               int flags, const char *dev_name, void *data)
{
        return get_sb_pseudo(fs_type, "futex", NULL, 0xBAD1DEA);
}
 791
/*
 * The "futexfs" filesystem type; registered and mounted by init() to
 * provide the vfsmount (futex_mnt) that futex_fd() files live on.
 */
static struct file_system_type futex_fs_type = {
        .name           = "futexfs",
        .get_sb         = futexfs_get_sb,
        .kill_sb        = kill_anon_super,
};
 797
 798static int __init init(void)
 799{
 800        unsigned int i;
 801
 802        register_filesystem(&futex_fs_type);
 803        futex_mnt = kern_mount(&futex_fs_type);
 804
 805        for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
 806                INIT_LIST_HEAD(&futex_queues[i].chain);
 807                futex_queues[i].lock = SPIN_LOCK_UNLOCKED;
 808        }
 809        return 0;
 810}
 811__initcall(init);
 812