RHEL4/ipc/mqueue.c
<<
>>
Prefs
   1/*
   2 * POSIX message queues filesystem for Linux.
   3 *
   4 * Copyright (C) 2003,2004  Krzysztof Benedyczak    (golbi@mat.uni.torun.pl)
   5 *                          Michal Wronski          (wrona@mat.uni.torun.pl)
   6 *
   7 * Spinlocks:               Mohamed Abbas           (abbas.mohamed@intel.com)
   8 * Lockless receive & send, fd based notify:
   9 *                          Manfred Spraul          (manfred@colorfullife.com)
  10 *
  11 * This file is released under the GPL.
  12 */
  13
  14#include <linux/init.h>
  15#include <linux/pagemap.h>
  16#include <linux/file.h>
  17#include <linux/mount.h>
  18#include <linux/namei.h>
  19#include <linux/sysctl.h>
  20#include <linux/poll.h>
  21#include <linux/mqueue.h>
  22#include <linux/msg.h>
  23#include <linux/skbuff.h>
  24#include <linux/netlink.h>
  25#include <net/sock.h>
  26#include "util.h"
  27
/* Magic number stored in the superblock's s_magic by mqueue_fill_super(). */
  28#define MQUEUE_MAGIC    0x19800202
/* Amount added to / removed from the directory's i_size per queue entry. */
  29#define DIRENT_SIZE     20
/* i_size of a queue file; also the size of the status buffer used by read(). */
  30#define FILENT_SIZE     80
  31
/* Indices into mqueue_inode_info.e_wait_q[]: blocked senders / receivers. */
  32#define SEND            0
  33#define RECV            1
  34
/*
 * Values of ext_wait_queue.state.  A waiter starts in STATE_NONE; the task
 * that services it sets STATE_PENDING, wakes it up, and then sets
 * STATE_READY (see pipelined_send() / pipelined_receive()).
 */
  35#define STATE_NONE      0
  36#define STATE_PENDING   1
  37#define STATE_READY     2
  38
  39/* used by sysctl */
  40#define FS_MQUEUE       1
  41#define CTL_QUEUESMAX   2
  42#define CTL_MSGMAX      3
  43#define CTL_MSGSIZEMAX  4
  44
  45/* default values */
  46#define DFLT_QUEUESMAX  256     /* max number of message queues */
  47#define DFLT_MSGMAX     10      /* max number of messages in each queue */
/* Upper bound on mq_maxmsg for CAP_SYS_RESOURCE callers (see mq_attr_ok()). */
  48#define HARD_MSGMAX     (131072/sizeof(void*))
  49#define DFLT_MSGSIZEMAX 8192    /* max message size */
  50
/* set_cookie() writes its status code into byte NOTIFY_COOKIE_LEN-1. */
  51#define NOTIFY_COOKIE_LEN       32
  52
/*
 * One entry per task blocked on a queue.  Entries are linked into
 * mqueue_inode_info.e_wait_q[SEND] or e_wait_q[RECV] by wq_add().
 */
  53struct ext_wait_queue {         /* queue of sleeping tasks */
  54        struct task_struct *task;       /* the sleeping task (set to current) */
  55        struct list_head list;          /* link in e_wait_q[sr].list */
  56        struct msg_msg *msg;    /* ptr of loaded message */
  57        int state;              /* one of STATE_* values */
  58};
  59
/*
 * Per-queue state.  The VFS inode is embedded so that MQUEUE_I() can map
 * an inode back to this structure with container_of().
 */
  60struct mqueue_inode_info {
  61        spinlock_t lock;                /* taken around queue and notification updates */
  62        struct inode vfs_inode;
  63        wait_queue_head_t wait_q;       /* used by poll (see mqueue_poll_file()) */
  64
  65        struct msg_msg **messages;      /* mq_maxmsg slots, kept ordered by m_type */
  66        struct mq_attr attr;
  67
  68        struct sigevent notify;
  69        pid_t notify_owner;             /* tgid registered for notification, 0 if none */
  70        struct user_struct *user;       /* user who created, for accounting */
  71        struct sock *notify_sock;
  72        struct sk_buff *notify_cookie;
  73
  74        /* for tasks waiting for free space and messages, respectively */
  75        struct ext_wait_queue e_wait_q[2];
  76
  77        unsigned long qsize; /* size of queue in memory (sum of all msgs) */
  78};
  79
/* Operation tables and helpers referenced before their definitions. */
  80static struct inode_operations mqueue_dir_inode_operations;
  81static struct file_operations mqueue_file_operations;
  82static struct super_operations mqueue_super_ops;
  83static void remove_notification(struct mqueue_inode_info *info);
  84
/* Taken around updates of queues_count and of the per-user mq_bytes charge. */
  85static spinlock_t mq_lock;
/* Slab cache from which mqueue_inode_info objects are allocated. */
  86static kmem_cache_t *mqueue_inode_cachep;
/* Mount of the mqueue filesystem; its root holds every queue. */
  87static struct vfsmount *mqueue_mnt;
  88
/* Number of queues currently accounted, and the limits checked against it. */
  89static unsigned int queues_count;
  90static unsigned int queues_max  = DFLT_QUEUESMAX;
  91static unsigned int msg_max     = DFLT_MSGMAX;
  92static unsigned int msgsize_max = DFLT_MSGSIZEMAX;
  93
  94static struct ctl_table_header * mq_sysctl_table;
  95
  96static inline struct mqueue_inode_info *MQUEUE_I(struct inode *inode)
  97{
  98        return container_of(inode, struct mqueue_inode_info, vfs_inode);
  99}
 100
 101static struct inode *mqueue_get_inode(struct super_block *sb, int mode,
 102                                                        struct mq_attr *attr)
 103{
 104        struct inode *inode;
 105
 106        inode = new_inode(sb);
 107        if (inode) {
 108                inode->i_mode = mode;
 109                inode->i_uid = current->fsuid;
 110                inode->i_gid = current->fsgid;
 111                inode->i_blksize = PAGE_CACHE_SIZE;
 112                inode->i_blocks = 0;
 113                inode->i_mtime = inode->i_ctime = inode->i_atime =
 114                                CURRENT_TIME;
 115
 116                if (S_ISREG(mode)) {
 117                        struct mqueue_inode_info *info;
 118                        struct task_struct *p = current;
 119                        struct user_struct *u = p->user;
 120                        unsigned long mq_bytes, mq_msg_tblsz;
 121
 122                        inode->i_fop = &mqueue_file_operations;
 123                        inode->i_size = FILENT_SIZE;
 124                        /* mqueue specific info */
 125                        info = MQUEUE_I(inode);
 126                        spin_lock_init(&info->lock);
 127                        init_waitqueue_head(&info->wait_q);
 128                        INIT_LIST_HEAD(&info->e_wait_q[0].list);
 129                        INIT_LIST_HEAD(&info->e_wait_q[1].list);
 130                        info->messages = NULL;
 131                        info->notify_owner = 0;
 132                        info->qsize = 0;
 133                        info->user = NULL;      /* set when all is ok */
 134                        memset(&info->attr, 0, sizeof(info->attr));
 135                        info->attr.mq_maxmsg = DFLT_MSGMAX;
 136                        info->attr.mq_msgsize = DFLT_MSGSIZEMAX;
 137                        if (attr) {
 138                                info->attr.mq_maxmsg = attr->mq_maxmsg;
 139                                info->attr.mq_msgsize = attr->mq_msgsize;
 140                        }
 141                        mq_msg_tblsz = info->attr.mq_maxmsg * sizeof(struct msg_msg *);
 142                        mq_bytes = (mq_msg_tblsz +
 143                                (info->attr.mq_maxmsg * info->attr.mq_msgsize));
 144
 145                        spin_lock(&mq_lock);
 146                        if (u->mq_bytes + mq_bytes < u->mq_bytes ||
 147                            u->mq_bytes + mq_bytes >
 148                            p->rlim[RLIMIT_MSGQUEUE].rlim_cur) {
 149                                spin_unlock(&mq_lock);
 150                                goto out_inode;
 151                        }
 152                        u->mq_bytes += mq_bytes;
 153                        spin_unlock(&mq_lock);
 154
 155                        info->messages = kmalloc(mq_msg_tblsz, GFP_KERNEL);
 156                        if (!info->messages) {
 157                                spin_lock(&mq_lock);
 158                                u->mq_bytes -= mq_bytes;
 159                                spin_unlock(&mq_lock);
 160                                goto out_inode;
 161                        }
 162                        /* all is ok */
 163                        info->user = get_uid(u);
 164                } else if (S_ISDIR(mode)) {
 165                        inode->i_nlink++;
 166                        /* Some things misbehave if size == 0 on a directory */
 167                        inode->i_size = 2 * DIRENT_SIZE;
 168                        inode->i_op = &mqueue_dir_inode_operations;
 169                        inode->i_fop = &simple_dir_operations;
 170                }
 171        }
 172        return inode;
 173out_inode:
 174        make_bad_inode(inode);
 175        iput(inode);
 176        return NULL;
 177}
 178
 179static int mqueue_fill_super(struct super_block *sb, void *data, int silent)
 180{
 181        struct inode *inode;
 182
 183        sb->s_blocksize = PAGE_CACHE_SIZE;
 184        sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
 185        sb->s_magic = MQUEUE_MAGIC;
 186        sb->s_op = &mqueue_super_ops;
 187
 188        inode = mqueue_get_inode(sb, S_IFDIR | S_ISVTX | S_IRWXUGO, NULL);
 189        if (!inode)
 190                return -ENOMEM;
 191
 192        sb->s_root = d_alloc_root(inode);
 193        if (!sb->s_root) {
 194                iput(inode);
 195                return -ENOMEM;
 196        }
 197
 198        return 0;
 199}
 200
 201static struct super_block *mqueue_get_sb(struct file_system_type *fs_type,
 202                                         int flags, const char *dev_name,
 203                                         void *data)
 204{
 205        return get_sb_single(fs_type, flags, data, mqueue_fill_super);
 206}
 207
 208static void init_once(void *foo, kmem_cache_t * cachep, unsigned long flags)
 209{
 210        struct mqueue_inode_info *p = (struct mqueue_inode_info *) foo;
 211
 212        if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) ==
 213                SLAB_CTOR_CONSTRUCTOR)
 214                inode_init_once(&p->vfs_inode);
 215}
 216
 217static struct inode *mqueue_alloc_inode(struct super_block *sb)
 218{
 219        struct mqueue_inode_info *ei;
 220
 221        ei = kmem_cache_alloc(mqueue_inode_cachep, SLAB_KERNEL);
 222        if (!ei)
 223                return NULL;
 224        return &ei->vfs_inode;
 225}
 226
 227static void mqueue_destroy_inode(struct inode *inode)
 228{
 229        kmem_cache_free(mqueue_inode_cachep, MQUEUE_I(inode));
 230}
 231
 232static void mqueue_delete_inode(struct inode *inode)
 233{
 234        struct mqueue_inode_info *info;
 235        struct user_struct *user;
 236        unsigned long mq_bytes;
 237        int i;
 238
 239        if (S_ISDIR(inode->i_mode)) {
 240                clear_inode(inode);
 241                return;
 242        }
 243        info = MQUEUE_I(inode);
 244        spin_lock(&info->lock);
 245        for (i = 0; i < info->attr.mq_curmsgs; i++)
 246                free_msg(info->messages[i]);
 247        kfree(info->messages);
 248        spin_unlock(&info->lock);
 249
 250        clear_inode(inode);
 251
 252        mq_bytes = (info->attr.mq_maxmsg * sizeof(struct msg_msg *) +
 253                   (info->attr.mq_maxmsg * info->attr.mq_msgsize));
 254        user = info->user;
 255        if (user) {
 256                spin_lock(&mq_lock);
 257                user->mq_bytes -= mq_bytes;
 258                queues_count--;
 259                spin_unlock(&mq_lock);
 260                free_uid(user);
 261        }
 262}
 263
 264static int mqueue_create(struct inode *dir, struct dentry *dentry,
 265                                int mode, struct nameidata *nd)
 266{
 267        struct inode *inode;
 268        struct mq_attr *attr = dentry->d_fsdata;
 269        int error;
 270
 271        spin_lock(&mq_lock);
 272        if (queues_count >= queues_max && !capable(CAP_SYS_RESOURCE)) {
 273                error = -ENOSPC;
 274                goto out_lock;
 275        }
 276        queues_count++;
 277        spin_unlock(&mq_lock);
 278
 279        inode = mqueue_get_inode(dir->i_sb, mode, attr);
 280        if (!inode) {
 281                error = -ENOMEM;
 282                spin_lock(&mq_lock);
 283                queues_count--;
 284                goto out_lock;
 285        }
 286
 287        dir->i_size += DIRENT_SIZE;
 288        dir->i_ctime = dir->i_mtime = dir->i_atime = CURRENT_TIME;
 289
 290        d_instantiate(dentry, inode);
 291        dget(dentry);
 292        return 0;
 293out_lock:
 294        spin_unlock(&mq_lock);
 295        return error;
 296}
 297
 298static int mqueue_unlink(struct inode *dir, struct dentry *dentry)
 299{
 300        struct inode *inode = dentry->d_inode;
 301
 302        dir->i_ctime = dir->i_mtime = dir->i_atime = CURRENT_TIME;
 303        dir->i_size -= DIRENT_SIZE;
 304        inode->i_nlink--;
 305        dput(dentry);
 306        return 0;
 307}
 308
 309/*
 310*       This is routine for system read from queue file.
 311*       To avoid mess with doing here some sort of mq_receive we allow
 312*       to read only queue size & notification info (the only values
 313*       that are interesting from user point of view and aren't accessible
 314*       through std routines)
 315*/
 316static ssize_t mqueue_read_file(struct file *filp, char __user *u_data,
 317                                size_t count, loff_t * off)
 318{
 319        struct mqueue_inode_info *info = MQUEUE_I(filp->f_dentry->d_inode);
 320        char buffer[FILENT_SIZE];
 321        size_t slen;
 322        loff_t o;
 323
 324        if (!count)
 325                return 0;
 326
 327        spin_lock(&info->lock);
 328        snprintf(buffer, sizeof(buffer),
 329                        "QSIZE:%-10lu NOTIFY:%-5d SIGNO:%-5d NOTIFY_PID:%-6d\n",
 330                        info->qsize,
 331                        info->notify_owner ? info->notify.sigev_notify : 0,
 332                        (info->notify_owner &&
 333                         info->notify.sigev_notify == SIGEV_SIGNAL) ?
 334                                info->notify.sigev_signo : 0,
 335                        info->notify_owner);
 336        spin_unlock(&info->lock);
 337        buffer[sizeof(buffer)-1] = '\0';
 338        slen = strlen(buffer)+1;
 339
 340        o = *off;
 341        if (o > slen)
 342                return 0;
 343
 344        if (o + count > slen)
 345                count = slen - o;
 346
 347        if (copy_to_user(u_data, buffer + o, count))
 348                return -EFAULT;
 349
 350        *off = o + count;
 351        filp->f_dentry->d_inode->i_atime = filp->f_dentry->d_inode->i_ctime = CURRENT_TIME;
 352        return count;
 353}
 354
 355static int mqueue_flush_file(struct file *filp)
 356{
 357        struct mqueue_inode_info *info = MQUEUE_I(filp->f_dentry->d_inode);
 358
 359        spin_lock(&info->lock);
 360        if (current->tgid == info->notify_owner)
 361                remove_notification(info);
 362
 363        spin_unlock(&info->lock);
 364        return 0;
 365}
 366
 367static unsigned int mqueue_poll_file(struct file *filp, struct poll_table_struct *poll_tab)
 368{
 369        struct mqueue_inode_info *info = MQUEUE_I(filp->f_dentry->d_inode);
 370        int retval = 0;
 371
 372        poll_wait(filp, &info->wait_q, poll_tab);
 373
 374        spin_lock(&info->lock);
 375        if (info->attr.mq_curmsgs)
 376                retval = POLLIN | POLLRDNORM;
 377
 378        if (info->attr.mq_curmsgs < info->attr.mq_maxmsg)
 379                retval |= POLLOUT | POLLWRNORM;
 380        spin_unlock(&info->lock);
 381
 382        return retval;
 383}
 384
 385/* Adds current to info->e_wait_q[sr] before element with smaller prio */
 386static void wq_add(struct mqueue_inode_info *info, int sr,
 387                        struct ext_wait_queue *ewp)
 388{
 389        struct ext_wait_queue *walk;
 390
 391        ewp->task = current;
 392
 393        list_for_each_entry(walk, &info->e_wait_q[sr].list, list) {
 394                if (walk->task->static_prio <= current->static_prio) {
 395                        list_add_tail(&ewp->list, &walk->list);
 396                        return;
 397                }
 398        }
 399        list_add_tail(&ewp->list, &info->e_wait_q[sr].list);
 400}
 401
 402/*
 403 * Puts current task to sleep. Caller must hold queue lock. After return
 404 * lock isn't held.
 405 * sr: SEND or RECV
 * timeout: value handed to schedule_timeout()
 * ewp: the caller's wait entry; it is linked into e_wait_q[sr] here
 *
 * Returns 0 once another task has moved ewp->state to STATE_READY,
 * -ERESTARTSYS if a signal is pending, or -ETIMEDOUT when
 * schedule_timeout() returns 0.
 406 */
 407static int wq_sleep(struct mqueue_inode_info *info, int sr,
 408                        long timeout, struct ext_wait_queue *ewp)
 409{
 410        int retval;
 411        signed long time;
 412
 413        wq_add(info, sr, ewp);
 414
 415        for (;;) {
                /* The task state is set before the queue lock is dropped. */
 416                set_current_state(TASK_INTERRUPTIBLE);
 417
 418                spin_unlock(&info->lock);
 419                time = schedule_timeout(timeout);
 420
                /*
                 * The waker sets STATE_PENDING, wakes us and only then sets
                 * STATE_READY; spin until it has finished.
                 */
 421                while (ewp->state == STATE_PENDING)
 422                        cpu_relax();
 423
                /* Serviced: the waker already unlinked ewp, return without the lock. */
 424                if (ewp->state == STATE_READY) {
 425                        retval = 0;
 426                        goto out;
 427                }
                /* Not serviced yet: re-check under the lock before giving up. */
 428                spin_lock(&info->lock);
 429                if (ewp->state == STATE_READY) {
 430                        retval = 0;
 431                        goto out_unlock;
 432                }
 433                if (signal_pending(current)) {
 434                        retval = -ERESTARTSYS;
 435                        break;
 436                }
 437                if (time == 0) {
 438                        retval = -ETIMEDOUT;
 439                        break;
 440                }
 441        }
        /* Giving up (signal or timeout): take ourselves off the wait list. */
 442        list_del(&ewp->list);
 443out_unlock:
 444        spin_unlock(&info->lock);
 445out:
 446        return retval;
 447}
 448
 449/*
 450 * Returns waiting task that should be serviced first or NULL if none exists
 451 */
 452static struct ext_wait_queue *wq_get_first_waiter(
 453                struct mqueue_inode_info *info, int sr)
 454{
 455        struct list_head *ptr;
 456
 457        ptr = info->e_wait_q[sr].list.prev;
 458        if (ptr == &info->e_wait_q[sr].list)
 459                return NULL;
 460        return list_entry(ptr, struct ext_wait_queue, list);
 461}
 462
 463/* Auxiliary functions to manipulate messages' list */
 464static void msg_insert(struct msg_msg *ptr, struct mqueue_inode_info *info)
 465{
 466        int k;
 467
 468        k = info->attr.mq_curmsgs - 1;
 469        while (k >= 0 && info->messages[k]->m_type >= ptr->m_type) {
 470                info->messages[k + 1] = info->messages[k];
 471                k--;
 472        }
 473        info->attr.mq_curmsgs++;
 474        info->qsize += ptr->m_ts;
 475        info->messages[k + 1] = ptr;
 476}
 477
 478static inline struct msg_msg *msg_get(struct mqueue_inode_info *info)
 479{
 480        info->qsize -= info->messages[--info->attr.mq_curmsgs]->m_ts;
 481        return info->messages[info->attr.mq_curmsgs];
 482}
 483
 484static inline void set_cookie(struct sk_buff *skb, char code)
 485{
 486        ((char*)skb->data)[NOTIFY_COOKIE_LEN-1] = code;
 487}
 488
 489/*
 490 * The next function is only to split too long sys_mq_timedsend
 491 */
 492static void __do_notify(struct mqueue_inode_info *info)
 493{
 494        /* notification
 495         * invoked when there is registered process and there isn't process
 496         * waiting synchronously for message AND state of queue changed from
 497         * empty to not empty. Here we are sure that no one is waiting
 498         * synchronously. */
 499        if (info->notify_owner &&
 500            info->attr.mq_curmsgs == 1) {
 501                struct siginfo sig_i;
 502                switch (info->notify.sigev_notify) {
 503                case SIGEV_NONE:
 504                        break;
 505                case SIGEV_SIGNAL:
 506                        /* sends signal */
 507
 508                        sig_i.si_signo = info->notify.sigev_signo;
 509                        sig_i.si_errno = 0;
 510                        sig_i.si_code = SI_MESGQ;
 511                        sig_i.si_value = info->notify.sigev_value;
 512                        sig_i.si_pid = current->tgid;
 513                        sig_i.si_uid = current->uid;
 514
 515                        kill_proc_info(info->notify.sigev_signo,
 516                                       &sig_i, info->notify_owner);
 517                        break;
 518                case SIGEV_THREAD:
 519                        set_cookie(info->notify_cookie, NOTIFY_WOKENUP);
 520                        netlink_sendskb(info->notify_sock,
 521                                        info->notify_cookie, 0);
 522                        break;
 523                }
 524                /* after notification unregisters process */
 525                info->notify_owner = 0;
 526        }
 527        wake_up(&info->wait_q);
 528}
 529
 530static long prepare_timeout(const struct timespec __user *u_arg)
 531{
 532        struct timespec ts, nowts;
 533        long timeout;
 534
 535        if (u_arg) {
 536                if (unlikely(copy_from_user(&ts, u_arg,
 537                                        sizeof(struct timespec))))
 538                        return -EFAULT;
 539
 540                if (unlikely(ts.tv_nsec < 0 || ts.tv_sec < 0
 541                        || ts.tv_nsec >= NSEC_PER_SEC))
 542                        return -EINVAL;
 543                nowts = CURRENT_TIME;
 544                /* first subtract as jiffies can't be too big */
 545                ts.tv_sec -= nowts.tv_sec;
 546                if (ts.tv_nsec < nowts.tv_nsec) {
 547                        ts.tv_nsec += NSEC_PER_SEC;
 548                        ts.tv_sec--;
 549                }
 550                ts.tv_nsec -= nowts.tv_nsec;
 551                if (ts.tv_sec < 0)
 552                        return 0;
 553
 554                timeout = timespec_to_jiffies(&ts) + 1;
 555        } else
 556                return MAX_SCHEDULE_TIMEOUT;
 557
 558        return timeout;
 559}
 560
 561static void remove_notification(struct mqueue_inode_info *info)
 562{
 563        if (info->notify_owner != 0 &&
 564            info->notify.sigev_notify == SIGEV_THREAD) {
 565                set_cookie(info->notify_cookie, NOTIFY_REMOVED);
 566                netlink_sendskb(info->notify_sock, info->notify_cookie, 0);
 567        }
 568        info->notify_owner = 0;
 569}
 570
 571static int mq_attr_ok(struct mq_attr *attr)
 572{
 573        if (attr->mq_maxmsg <= 0 || attr->mq_msgsize <= 0)
 574                return 0;
 575        if (capable(CAP_SYS_RESOURCE)) {
 576                if (attr->mq_maxmsg > HARD_MSGMAX)
 577                        return 0;
 578        } else {
 579                if (attr->mq_maxmsg > msg_max ||
 580                                attr->mq_msgsize > msgsize_max)
 581                        return 0;
 582        }
 583        /* check for overflow */
 584        if (attr->mq_msgsize > ULONG_MAX/attr->mq_maxmsg)
 585                return 0;
 586        if ((unsigned long)(attr->mq_maxmsg * attr->mq_msgsize) +
 587            (attr->mq_maxmsg * sizeof (struct msg_msg *)) <
 588            (unsigned long)(attr->mq_maxmsg * attr->mq_msgsize))
 589                return 0;
 590        return 1;
 591}
 592
 593/*
 594 * Invoked when creating a new queue via sys_mq_open
 595 */
 596static struct file *do_create(struct dentry *dir, struct dentry *dentry,
 597                        int oflag, mode_t mode, struct mq_attr __user *u_attr)
 598{
 599        struct mq_attr attr;
 600        int ret;
 601
 602        if (u_attr) {
 603                ret = -EFAULT;
 604                if (copy_from_user(&attr, u_attr, sizeof(attr)))
 605                        goto out;
 606                ret = -EINVAL;
 607                if (!mq_attr_ok(&attr))
 608                        goto out;
 609                /* store for use during create */
 610                dentry->d_fsdata = &attr;
 611        }
 612
 613        ret = vfs_create(dir->d_inode, dentry, mode, NULL);
 614        dentry->d_fsdata = NULL;
 615        if (ret)
 616                goto out;
 617
 618        return dentry_open(dentry, mqueue_mnt, oflag);
 619
 620out:
 621        dput(dentry);
 622        mntput(mqueue_mnt);
 623        return ERR_PTR(ret);
 624}
 625
 626/* Opens existing queue */
 627static struct file *do_open(struct dentry *dentry, int oflag)
 628{
 629static int oflag2acc[O_ACCMODE] = { MAY_READ, MAY_WRITE,
 630                                        MAY_READ | MAY_WRITE };
 631
 632        if ((oflag & O_ACCMODE) == (O_RDWR | O_WRONLY)) {
 633                dput(dentry);
 634                mntput(mqueue_mnt);
 635                return ERR_PTR(-EINVAL);
 636        }
 637
 638        if (permission(dentry->d_inode, oflag2acc[oflag & O_ACCMODE], NULL)) {
 639                dput(dentry);
 640                mntput(mqueue_mnt);
 641                return ERR_PTR(-EACCES);
 642        }
 643
 644        return dentry_open(dentry, mqueue_mnt, oflag);
 645}
 646
 647asmlinkage long sys_mq_open(const char __user *u_name, int oflag, mode_t mode,
 648                                struct mq_attr __user *u_attr)
 649{
 650        struct dentry *dentry;
 651        struct file *filp;
 652        char *name;
 653        int fd, error;
 654
 655        if (IS_ERR(name = getname(u_name)))
 656                return PTR_ERR(name);
 657
 658        fd = get_unused_fd();
 659        if (fd < 0)
 660                goto out_putname;
 661
 662        down(&mqueue_mnt->mnt_root->d_inode->i_sem);
 663        dentry = lookup_one_len(name, mqueue_mnt->mnt_root, strlen(name));
 664        if (IS_ERR(dentry)) {
 665                error = PTR_ERR(dentry);
 666                goto out_err;
 667        }
 668        mntget(mqueue_mnt);
 669
 670        if (oflag & O_CREAT) {
 671                if (dentry->d_inode) {  /* entry already exists */
 672                        error = -EEXIST;
 673                        if (oflag & O_EXCL)
 674                                goto out;
 675                        filp = do_open(dentry, oflag);
 676                } else {
 677                        filp = do_create(mqueue_mnt->mnt_root, dentry,
 678                                                oflag, mode, u_attr);
 679                }
 680        } else {
 681                error = -ENOENT;
 682                if (!dentry->d_inode)
 683                        goto out;
 684                filp = do_open(dentry, oflag);
 685        }
 686
 687        if (IS_ERR(filp)) {
 688                error = PTR_ERR(filp);
 689                goto out_putfd;
 690        }
 691
 692        set_close_on_exec(fd, 1);
 693        fd_install(fd, filp);
 694        goto out_upsem;
 695
 696out:
 697        dput(dentry);
 698        mntput(mqueue_mnt);
 699out_putfd:
 700        put_unused_fd(fd);
 701out_err:
 702        fd = error;
 703out_upsem:
 704        up(&mqueue_mnt->mnt_root->d_inode->i_sem);
 705out_putname:
 706        putname(name);
 707        return fd;
 708}
 709
 710asmlinkage long sys_mq_unlink(const char __user *u_name)
 711{
 712        int err;
 713        char *name;
 714        struct dentry *dentry;
 715        struct inode *inode = NULL;
 716
 717        name = getname(u_name);
 718        if (IS_ERR(name))
 719                return PTR_ERR(name);
 720
 721        down(&mqueue_mnt->mnt_root->d_inode->i_sem);
 722        dentry = lookup_one_len(name, mqueue_mnt->mnt_root, strlen(name));
 723        if (IS_ERR(dentry)) {
 724                err = PTR_ERR(dentry);
 725                goto out_unlock;
 726        }
 727
 728        if (!dentry->d_inode) {
 729                err = -ENOENT;
 730                goto out_err;
 731        }
 732
 733        inode = dentry->d_inode;
 734        if (inode)
 735                atomic_inc(&inode->i_count);
 736
 737        err = vfs_unlink(dentry->d_parent->d_inode, dentry);
 738out_err:
 739        dput(dentry);
 740
 741out_unlock:
 742        up(&mqueue_mnt->mnt_root->d_inode->i_sem);
 743        putname(name);
 744        if (inode)
 745                iput(inode);
 746
 747        return err;
 748}
 749
 750/* Pipelined send and receive functions.
 751 *
 752 * If a receiver finds no waiting message, then it registers itself in the
 753 * list of waiting receivers. A sender checks that list before adding the new
 754 * message into the message array. If there is a waiting receiver, then it
 755 * bypasses the message array and directly hands the message over to the
 756 * receiver.
 757 * The receiver accepts the message and returns without grabbing the queue
 758 * spinlock. Therefore an intermediate STATE_PENDING state and memory barriers
 759 * are necessary. The same algorithm is used for sysv semaphores, see
 760 * ipc/sem.c for more details.
 761 *
 762 * The same algorithm is used for senders.
 763 */
 764
 765/* pipelined_send() - send a message directly to the task waiting in
 766 * sys_mq_timedreceive() (without inserting message into a queue).
 *
 * The message is stored in the receiver's wait entry and the entry is
 * unlinked from the wait list.  The state then goes STATE_PENDING ->
 * (wake-up) -> STATE_READY, with a write barrier before the final store.
 767 */
 768static inline void pipelined_send(struct mqueue_inode_info *info,
 769                                  struct msg_msg *message,
 770                                  struct ext_wait_queue *receiver)
 771{
 772        receiver->msg = message;
 773        list_del(&receiver->list);
        /* wq_sleep() spins while it sees STATE_PENDING. */
 774        receiver->state = STATE_PENDING;
 775        wake_up_process(receiver->task);
 776        wmb();
        /* Last store: once the receiver sees STATE_READY it returns without the lock. */
 777        receiver->state = STATE_READY;
 778}
 779
 780/* pipelined_receive() - if there is task waiting in sys_mq_timedsend()
 781 * gets its message and put to the queue (we have one free place for sure). */
/*
 * With no waiting sender, only tasks sleeping on info->wait_q are woken.
 * Otherwise the sender's message is inserted, its wait entry is unlinked
 * and its state goes STATE_PENDING -> (wake-up) -> STATE_READY.
 */
 782static inline void pipelined_receive(struct mqueue_inode_info *info)
 783{
 784        struct ext_wait_queue *sender = wq_get_first_waiter(info, SEND);
 785
 786        if (!sender) {
 787                /* for poll */
 788                wake_up_interruptible(&info->wait_q);
 789                return;
 790        }
 791        msg_insert(sender->msg, info);
 792        list_del(&sender->list);
        /* wq_sleep() spins while it sees STATE_PENDING. */
 793        sender->state = STATE_PENDING;
 794        wake_up_process(sender->task);
 795        wmb();
        /* Last store: once the sender sees STATE_READY it returns without the lock. */
 796        sender->state = STATE_READY;
 797}
 798
/*
 * mq_timedsend system call: queue a message of @msg_len bytes with
 * priority @msg_prio on the queue referred to by @mqdes.
 *
 * Returns 0 on success, or:
 *   -EINVAL    msg_prio >= MQ_PRIO_MAX
 *   -EBADF     mqdes is not an open mqueue descriptor opened for writing
 *   -EMSGSIZE  msg_len exceeds the queue's mq_msgsize
 *   -EAGAIN    the queue is full and O_NONBLOCK is set
 *   the error from load_msg(), from prepare_timeout() (reported only when
 *   the queue is full and the call would have to block), or from wq_sleep().
 */
 799asmlinkage long sys_mq_timedsend(mqd_t mqdes, const char __user *u_msg_ptr,
 800        size_t msg_len, unsigned int msg_prio,
 801        const struct timespec __user *u_abs_timeout)
 802{
 803        struct file *filp;
 804        struct inode *inode;
 805        struct ext_wait_queue wait;
 806        struct ext_wait_queue *receiver;
 807        struct msg_msg *msg_ptr;
 808        struct mqueue_inode_info *info;
 809        long timeout;
 810        int ret;
 811
 812        if (unlikely(msg_prio >= (unsigned long) MQ_PRIO_MAX))
 813                return -EINVAL;
 814
        /* May be a negative errno; that is only looked at if we must block. */
 815        timeout = prepare_timeout(u_abs_timeout);
 816
        /* ret stays -EBADF for all three descriptor checks below. */
 817        ret = -EBADF;
 818        filp = fget(mqdes);
 819        if (unlikely(!filp))
 820                goto out;
 821
 822        inode = filp->f_dentry->d_inode;
 823        if (unlikely(filp->f_op != &mqueue_file_operations))
 824                goto out_fput;
 825        info = MQUEUE_I(inode);
 826
 827        if (unlikely(!(filp->f_mode & FMODE_WRITE)))
 828                goto out_fput;
 829
 830        if (unlikely(msg_len > info->attr.mq_msgsize)) {
 831                ret = -EMSGSIZE;
 832                goto out_fput;
 833        }
 834
 835        /* First try to allocate memory, before doing anything with
 836         * existing queues. */
        /* NOTE(review): load_msg() presumably copies the user buffer into
         * a new msg_msg; it is defined outside this file - confirm. */
 837        msg_ptr = load_msg(u_msg_ptr, msg_len);
 838        if (IS_ERR(msg_ptr)) {
 839                ret = PTR_ERR(msg_ptr);
 840                goto out_fput;
 841        }
 842        msg_ptr->m_ts = msg_len;
 843        msg_ptr->m_type = msg_prio;
 844
 845        spin_lock(&info->lock);
 846
        /* Every branch below releases info->lock exactly once. */
 847        if (info->attr.mq_curmsgs == info->attr.mq_maxmsg) {
 848                if (filp->f_flags & O_NONBLOCK) {
 849                        spin_unlock(&info->lock);
 850                        ret = -EAGAIN;
 851                } else if (unlikely(timeout < 0)) {
 852                        spin_unlock(&info->lock);
 853                        ret = timeout;
 854                } else {
                        /* Queue full: park the message in our wait entry and
                         * sleep; wq_sleep() drops info->lock for us. */
 855                        wait.task = current;
 856                        wait.msg = (void *) msg_ptr;
 857                        wait.state = STATE_NONE;
 858                        ret = wq_sleep(info, SEND, timeout, &wait);
 859                }
                /* On failure nobody took the message, so free it here. */
 860                if (ret < 0)
 861                        free_msg(msg_ptr);
 862        } else {
 863                receiver = wq_get_first_waiter(info, RECV);
 864                if (receiver) {
                        /* Hand the message straight to a blocked receiver. */
 865                        pipelined_send(info, msg_ptr, receiver);
 866                } else {
 867                        /* adds message to the queue */
 868                        msg_insert(msg_ptr, info);
 869                        __do_notify(info);
 870                }
 871                inode->i_atime = inode->i_mtime = inode->i_ctime =
 872                                CURRENT_TIME;
 873                spin_unlock(&info->lock);
 874                ret = 0;
 875        }
 876out_fput:
 877        fput(filp);
 878out:
 879        return ret;
 880}
 881
 882asmlinkage ssize_t sys_mq_timedreceive(mqd_t mqdes, char __user *u_msg_ptr,
 883        size_t msg_len, unsigned int __user *u_msg_prio,
 884        const struct timespec __user *u_abs_timeout)
 885{
 886        long timeout;
 887        ssize_t ret;
 888        struct msg_msg *msg_ptr;
 889        struct file *filp;
 890        struct inode *inode;
 891        struct mqueue_inode_info *info;
 892        struct ext_wait_queue wait;
 893
 894        timeout = prepare_timeout(u_abs_timeout);
 895
 896        ret = -EBADF;
 897        filp = fget(mqdes);
 898        if (unlikely(!filp))
 899                goto out;
 900
 901        inode = filp->f_dentry->d_inode;
 902        if (unlikely(filp->f_op != &mqueue_file_operations))
 903                goto out_fput;
 904        info = MQUEUE_I(inode);
 905
 906        if (unlikely(!(filp->f_mode & FMODE_READ)))
 907                goto out_fput;
 908
 909        /* checks if buffer is big enough */
 910        if (unlikely(msg_len < info->attr.mq_msgsize)) {
 911                ret = -EMSGSIZE;
 912                goto out_fput;
 913        }
 914
 915        spin_lock(&info->lock);
 916        if (info->attr.mq_curmsgs == 0) {
 917                if (filp->f_flags & O_NONBLOCK) {
 918                        spin_unlock(&info->lock);
 919                        ret = -EAGAIN;
 920                        msg_ptr = NULL;
 921                } else if (unlikely(timeout < 0)) {
 922                        spin_unlock(&info->lock);
 923                        ret = timeout;
 924                        msg_ptr = NULL;
 925                } else {
 926                        wait.task = current;
 927                        wait.state = STATE_NONE;
 928                        ret = wq_sleep(info, RECV, timeout, &wait);
 929                        msg_ptr = wait.msg;
 930                }
 931        } else {
 932                msg_ptr = msg_get(info);
 933
 934                inode->i_atime = inode->i_mtime = inode->i_ctime =
 935                                CURRENT_TIME;
 936
 937                /* There is now free space in queue. */
 938                pipelined_receive(info);
 939                spin_unlock(&info->lock);
 940                ret = 0;
 941        }
 942        if (ret == 0) {
 943                ret = msg_ptr->m_ts;
 944
 945                if ((u_msg_prio && put_user(msg_ptr->m_type, u_msg_prio)) ||
 946                        store_msg(u_msg_ptr, msg_ptr, msg_ptr->m_ts)) {
 947                        ret = -EFAULT;
 948                }
 949                free_msg(msg_ptr);
 950        }
 951out_fput:
 952        fput(filp);
 953out:
 954        return ret;
 955}
 956
 957/*
 958 * Notes: the case when user wants us to deregister (with NULL as pointer)
 959 * and he isn't currently owner of notification, will be silently discarded.
 960 * It isn't explicitly defined in the POSIX.
 961 */
 962asmlinkage long sys_mq_notify(mqd_t mqdes,
 963                                const struct sigevent __user *u_notification)
 964{
 965        int ret;
 966        struct file *filp;
 967        struct sock *sock;
 968        struct inode *inode;
 969        struct sigevent notification;
 970        struct mqueue_inode_info *info;
 971        struct sk_buff *nc;
 972
 973        nc = NULL;
 974        sock = NULL;
 975        if (u_notification != NULL) {
 976                if (copy_from_user(&notification, u_notification,
 977                                        sizeof(struct sigevent)))
 978                        return -EFAULT;
 979
 980                if (unlikely(notification.sigev_notify != SIGEV_NONE &&
 981                             notification.sigev_notify != SIGEV_SIGNAL &&
 982                             notification.sigev_notify != SIGEV_THREAD))
 983                        return -EINVAL;
 984                if (notification.sigev_notify == SIGEV_SIGNAL &&
 985                        (notification.sigev_signo < 0 ||
 986                         notification.sigev_signo > _NSIG)) {
 987                        return -EINVAL;
 988                }
 989                if (notification.sigev_notify == SIGEV_THREAD) {
 990                        /* create the notify skb */
 991                        nc = alloc_skb(NOTIFY_COOKIE_LEN, GFP_KERNEL);
 992                        ret = -ENOMEM;
 993                        if (!nc)
 994                                goto out;
 995                        ret = -EFAULT;
 996                        if (copy_from_user(nc->data,
 997                                        notification.sigev_value.sival_ptr,
 998                                        NOTIFY_COOKIE_LEN)) {
 999                                goto out;
1000                        }
1001
1002                        /* TODO: add a header? */
1003                        skb_put(nc, NOTIFY_COOKIE_LEN);
1004                        /* and attach it to the socket */
1005retry:
1006                        filp = fget(notification.sigev_signo);
1007                        ret = -EBADF;
1008                        if (!filp)
1009                                goto out;
1010                        sock = netlink_getsockbyfilp(filp);
1011                        fput(filp);
1012                        if (IS_ERR(sock)) {
1013                                ret = PTR_ERR(sock);
1014                                sock = NULL;
1015                                goto out;
1016                        }
1017
1018                        ret = netlink_attachskb(sock, nc, 0, MAX_SCHEDULE_TIMEOUT);
1019                        if (ret == 1)
1020                                goto retry;
1021                        if (ret) {
1022                                sock = NULL;
1023                                nc = NULL;
1024                                goto out;
1025                        }
1026                }
1027        }
1028
1029        ret = -EBADF;
1030        filp = fget(mqdes);
1031        if (!filp)
1032                goto out;
1033
1034        inode = filp->f_dentry->d_inode;
1035        if (unlikely(filp->f_op != &mqueue_file_operations))
1036                goto out_fput;
1037        info = MQUEUE_I(inode);
1038
1039        ret = 0;
1040        spin_lock(&info->lock);
1041        if (u_notification == NULL) {
1042                if (info->notify_owner == current->tgid) {
1043                        remove_notification(info);
1044                        inode->i_atime = inode->i_ctime = CURRENT_TIME;
1045                }
1046        } else if (info->notify_owner != 0) {
1047                ret = -EBUSY;
1048        } else {
1049                switch (notification.sigev_notify) {
1050                case SIGEV_NONE:
1051                        info->notify.sigev_notify = SIGEV_NONE;
1052                        break;
1053                case SIGEV_THREAD:
1054                        info->notify_sock = sock;
1055                        info->notify_cookie = nc;
1056                        sock = NULL;
1057                        nc = NULL;
1058                        info->notify.sigev_notify = SIGEV_THREAD;
1059                        break;
1060                case SIGEV_SIGNAL:
1061                        info->notify.sigev_signo = notification.sigev_signo;
1062                        info->notify.sigev_value = notification.sigev_value;
1063                        info->notify.sigev_notify = SIGEV_SIGNAL;
1064                        break;
1065                }
1066                info->notify_owner = current->tgid;
1067                inode->i_atime = inode->i_ctime = CURRENT_TIME;
1068        }
1069        spin_unlock(&info->lock);
1070out_fput:
1071        fput(filp);
1072out:
1073        if (sock) {
1074                netlink_detachskb(sock, nc);
1075        } else if (nc) {
1076                dev_kfree_skb(nc);
1077        }
1078        return ret;
1079}
1080
1081asmlinkage long sys_mq_getsetattr(mqd_t mqdes,
1082                        const struct mq_attr __user *u_mqstat,
1083                        struct mq_attr __user *u_omqstat)
1084{
1085        int ret;
1086        struct mq_attr mqstat, omqstat;
1087        struct file *filp;
1088        struct inode *inode;
1089        struct mqueue_inode_info *info;
1090
1091        if (u_mqstat != NULL) {
1092                if (copy_from_user(&mqstat, u_mqstat, sizeof(struct mq_attr)))
1093                        return -EFAULT;
1094                if (mqstat.mq_flags & (~O_NONBLOCK))
1095                        return -EINVAL;
1096        }
1097
1098        ret = -EBADF;
1099        filp = fget(mqdes);
1100        if (!filp)
1101                goto out;
1102
1103        inode = filp->f_dentry->d_inode;
1104        if (unlikely(filp->f_op != &mqueue_file_operations))
1105                goto out_fput;
1106        info = MQUEUE_I(inode);
1107
1108        spin_lock(&info->lock);
1109
1110        omqstat = info->attr;
1111        omqstat.mq_flags = filp->f_flags & O_NONBLOCK;
1112        if (u_mqstat) {
1113                if (mqstat.mq_flags & O_NONBLOCK)
1114                        filp->f_flags |= O_NONBLOCK;
1115                else
1116                        filp->f_flags &= ~O_NONBLOCK;
1117
1118                inode->i_atime = inode->i_ctime = CURRENT_TIME;
1119        }
1120
1121        spin_unlock(&info->lock);
1122
1123        ret = 0;
1124        if (u_omqstat != NULL && copy_to_user(u_omqstat, &omqstat,
1125                                                sizeof(struct mq_attr)))
1126                ret = -EFAULT;
1127
1128out_fput:
1129        fput(filp);
1130out:
1131        return ret;
1132}
1133
1134static struct inode_operations mqueue_dir_inode_operations = {
1135        .lookup = simple_lookup,
1136        .create = mqueue_create,
1137        .unlink = mqueue_unlink,
1138};
1139
1140static struct file_operations mqueue_file_operations = {
1141        .flush = mqueue_flush_file,
1142        .poll = mqueue_poll_file,
1143        .read = mqueue_read_file,
1144};
1145
1146static struct super_operations mqueue_super_ops = {
1147        .alloc_inode = mqueue_alloc_inode,
1148        .destroy_inode = mqueue_destroy_inode,
1149        .statfs = simple_statfs,
1150        .delete_inode = mqueue_delete_inode,
1151        .drop_inode = generic_delete_inode,
1152};
1153
1154static struct file_system_type mqueue_fs_type = {
1155        .name = "mqueue",
1156        .get_sb = mqueue_get_sb,
1157        .kill_sb = kill_litter_super,
1158};
1159
1160static int msg_max_limit_min = DFLT_MSGMAX;
1161static int msg_max_limit_max = HARD_MSGMAX;
1162
1163static int msg_maxsize_limit_min = DFLT_MSGSIZEMAX;
1164static int msg_maxsize_limit_max = INT_MAX;
1165
1166static ctl_table mq_sysctls[] = {
1167        {
1168                .ctl_name       = CTL_QUEUESMAX,
1169                .procname       = "queues_max",
1170                .data           = &queues_max,
1171                .maxlen         = sizeof(int),
1172                .mode           = 0644,
1173                .proc_handler   = &proc_dointvec,
1174        },
1175        {
1176                .ctl_name       = CTL_MSGMAX,
1177                .procname       = "msg_max",
1178                .data           = &msg_max,
1179                .maxlen         = sizeof(int),
1180                .mode           = 0644,
1181                .proc_handler   = &proc_dointvec_minmax,
1182                .extra1         = &msg_max_limit_min,
1183                .extra2         = &msg_max_limit_max,
1184        },
1185        {
1186                .ctl_name       = CTL_MSGSIZEMAX,
1187                .procname       = "msgsize_max",
1188                .data           = &msgsize_max,
1189                .maxlen         = sizeof(int),
1190                .mode           = 0644,
1191                .proc_handler   = &proc_dointvec_minmax,
1192                .extra1         = &msg_maxsize_limit_min,
1193                .extra2         = &msg_maxsize_limit_max,
1194        },
1195        { .ctl_name = 0 }
1196};
1197
1198static ctl_table mq_sysctl_dir[] = {
1199        {
1200                .ctl_name       = FS_MQUEUE,
1201                .procname       = "mqueue",
1202                .mode           = 0555,
1203                .child          = mq_sysctls,
1204        },
1205        { .ctl_name = 0 }
1206};
1207
1208static ctl_table mq_sysctl_root[] = {
1209        {
1210                .ctl_name       = CTL_FS,
1211                .procname       = "fs",
1212                .mode           = 0555,
1213                .child          = mq_sysctl_dir,
1214        },
1215        { .ctl_name = 0 }
1216};
1217
1218static int __init init_mqueue_fs(void)
1219{
1220        int error;
1221
1222        mqueue_inode_cachep = kmem_cache_create("mqueue_inode_cache",
1223                                sizeof(struct mqueue_inode_info), 0,
1224                                SLAB_HWCACHE_ALIGN, init_once, NULL);
1225        if (mqueue_inode_cachep == NULL)
1226                return -ENOMEM;
1227
1228        mq_sysctl_table = register_sysctl_table(mq_sysctl_root, 0);
1229        if (!mq_sysctl_table) {
1230                error = -ENOMEM;
1231                goto out_cache;
1232        }
1233
1234        error = register_filesystem(&mqueue_fs_type);
1235        if (error)
1236                goto out_sysctl;
1237
1238        if (IS_ERR(mqueue_mnt = kern_mount(&mqueue_fs_type))) {
1239                error = PTR_ERR(mqueue_mnt);
1240                goto out_filesystem;
1241        }
1242
1243        /* internal initialization - not common for vfs */
1244        queues_count = 0;
1245        spin_lock_init(&mq_lock);
1246
1247        return 0;
1248
1249out_filesystem:
1250        unregister_filesystem(&mqueue_fs_type);
1251out_sysctl:
1252        unregister_sysctl_table(mq_sysctl_table);
1253out_cache:
1254        if (kmem_cache_destroy(mqueue_inode_cachep)) {
1255                printk(KERN_INFO
1256                        "mqueue_inode_cache: not all structures were freed\n");
1257        }
1258        return error;
1259}
1260
1261__initcall(init_mqueue_fs);
1262