RHEL4/kernel/fork.c
<<
>>
Prefs
   1/*
   2 *  linux/kernel/fork.c
   3 *
   4 *  Copyright (C) 1991, 1992  Linus Torvalds
   5 */
   6
   7/*
   8 *  'fork.c' contains the help-routines for the 'fork' system call
   9 * (see also entry.S and others).
  10 * Fork is rather simple, once you get the hang of it, but the memory
  11 * management can be a bitch. See 'mm/memory.c': 'copy_page_range()'
  12 */
  13
  14#include <linux/config.h>
  15#include <linux/slab.h>
  16#include <linux/init.h>
  17#include <linux/unistd.h>
  18#include <linux/smp_lock.h>
  19#include <linux/module.h>
  20#include <linux/vmalloc.h>
  21#include <linux/completion.h>
  22#include <linux/namespace.h>
  23#include <linux/personality.h>
  24#include <linux/mempolicy.h>
  25#include <linux/sem.h>
  26#include <linux/file.h>
  27#include <linux/key.h>
  28#include <linux/binfmts.h>
  29#include <linux/mman.h>
  30#include <linux/fs.h>
  31#include <linux/cpu.h>
  32#include <linux/security.h>
  33#include <linux/swap.h>
  34#include <linux/syscalls.h>
  35#include <linux/jiffies.h>
  36#include <linux/futex.h>
  37#include <linux/task_io_accounting_ops.h>
  38#include <linux/ptrace.h>
  39#include <linux/mount.h>
  40#include <linux/audit.h>
  41#include <linux/profile.h>
  42#include <linux/rmap.h>
  43#include <linux/hash.h>
  44
  45#include <asm/pgtable.h>
  46#include <asm/pgalloc.h>
  47#include <asm/uaccess.h>
  48#include <asm/mmu_context.h>
  49#include <asm/cacheflush.h>
  50#include <asm/tlbflush.h>
  51
  52/* The idle threads do not count..
  53 * Protected by write_lock_irq(&tasklist_lock)
  54 */
  55int nr_threads;
  56
  /* Thread-count ceiling; fork_init() derives it from the number of memory pages. */
  57int max_threads;
  58unsigned long total_forks;      /* Handle normal Linux uptimes. */
  59
  /* Per-CPU process counters; nr_processes() sums them over the online CPUs. */
  60DEFINE_PER_CPU(unsigned long, process_counts) = 0;
  61
  62rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED;  /* outer */
  63
  64EXPORT_SYMBOL(tasklist_lock);
  65
  /*
   * Hash table of struct mm_flags records, looked up by mm_struct address
   * (see __find_mm_flags()).  The table has 2^MM_FLAGS_HASH_BITS buckets and
   * every lookup or update in this file is done under mm_flags_lock.
   */
  66#define MM_FLAGS_HASH_BITS 10
  67#define MM_FLAGS_HASH_SIZE (1 << MM_FLAGS_HASH_BITS)
  68struct hlist_head mm_flags_hash[MM_FLAGS_HASH_SIZE] =
  69        { [ 0 ... MM_FLAGS_HASH_SIZE - 1 ] = HLIST_HEAD_INIT };
  70DEFINE_SPINLOCK(mm_flags_lock);
  /*
   * The low bits of the mm_struct address are shifted away before hashing;
   * the shift is 10, 9 or 8 depending on sizeof(struct mm_struct).
   */
  71#define MM_HASH_SHIFT ((sizeof(struct mm_struct) >= 1024) ? 10  \
  72                       : (sizeof(struct mm_struct) >= 512) ? 9  \
  73                       : 8)
  /* Bucket index for an mm_struct pointer. */
  74#define mm_flags_hash_fn(mm) \
  75        hash_long((unsigned long)(mm) >> MM_HASH_SHIFT, MM_FLAGS_HASH_BITS)
  76
  77int nr_processes(void)
  78{
  79        int cpu;
  80        int total = 0;
  81
  82        for_each_online_cpu(cpu)
  83                total += per_cpu(process_counts, cpu);
  84
  85        return total;
  86}
  87
  88#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
  /*
   * Default task_struct allocator, used unless the architecture defines
   * __HAVE_ARCH_TASK_STRUCT_ALLOCATOR: task_structs come from the
   * task_struct_cachep slab cache, which fork_init() creates.
   */
  89# define alloc_task_struct()    kmem_cache_alloc(task_struct_cachep, GFP_KERNEL)
  90# define free_task_struct(tsk)  kmem_cache_free(task_struct_cachep, (tsk))
  91static kmem_cache_t *task_struct_cachep;
  92#endif
  93
  94void free_task(struct task_struct *tsk)
  95{
  96        kfree(task_aux(tsk));
  97        free_thread_info(tsk->thread_info);
  98        free_task_struct(tsk);
  99}
 100EXPORT_SYMBOL(free_task);
 101
 102void __put_task_struct(struct task_struct *tsk)
 103{
 104        WARN_ON(!(tsk->exit_state & (EXIT_DEAD | EXIT_ZOMBIE)));
 105        WARN_ON(atomic_read(&tsk->usage));
 106        WARN_ON(tsk == current);
 107
 108        security_task_free(tsk);
 109        free_uid(tsk->user);
 110        put_group_info(tsk->group_info);
 111
 112        if (!profile_handoff_task(tsk))
 113                free_task(tsk);
 114}
 115
 116void fastcall add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)
 117{
 118        unsigned long flags;
 119
 120        wait->flags &= ~WQ_FLAG_EXCLUSIVE;
 121        spin_lock_irqsave(&q->lock, flags);
 122        __add_wait_queue(q, wait);
 123        spin_unlock_irqrestore(&q->lock, flags);
 124}
 125
 126EXPORT_SYMBOL(add_wait_queue);
 127
 128void fastcall add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait)
 129{
 130        unsigned long flags;
 131
 132        wait->flags |= WQ_FLAG_EXCLUSIVE;
 133        spin_lock_irqsave(&q->lock, flags);
 134        __add_wait_queue_tail(q, wait);
 135        spin_unlock_irqrestore(&q->lock, flags);
 136}
 137
 138EXPORT_SYMBOL(add_wait_queue_exclusive);
 139
 140void fastcall remove_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)
 141{
 142        unsigned long flags;
 143
 144        spin_lock_irqsave(&q->lock, flags);
 145        __remove_wait_queue(q, wait);
 146        spin_unlock_irqrestore(&q->lock, flags);
 147}
 148
 149EXPORT_SYMBOL(remove_wait_queue);
 150
 151
 152/*
 153 * Note: we use "set_current_state()" _after_ the wait-queue add,
 154 * because we need a memory barrier there on SMP, so that any
 155 * wake-function that tests for the wait-queue being active
 156 * will be guaranteed to see waitqueue addition _or_ subsequent
 157 * tests in this thread will see the wakeup having taken place.
 158 *
 159 * The spin_unlock() itself is semi-permeable and only protects
 160 * one way (it only protects stuff inside the critical region and
 161 * stops them from bleeding out - it would still allow subsequent
 162 * loads to move into the the critical region).
 163 */
 164void fastcall prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state)
 165{
 166        unsigned long flags;
 167
 168        wait->flags &= ~WQ_FLAG_EXCLUSIVE;
 169        spin_lock_irqsave(&q->lock, flags);
 170        if (list_empty(&wait->task_list))
 171                __add_wait_queue(q, wait);
 172        /*
 173         * don't alter the task state if this is just going to
 174         * queue an async wait queue callback
 175         */
 176        if (is_sync_wait(wait))
 177                set_current_state(state);
 178        spin_unlock_irqrestore(&q->lock, flags);
 179}
 180
 181EXPORT_SYMBOL(prepare_to_wait);
 182
 183void fastcall
 184prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
 185{
 186        unsigned long flags;
 187
 188        wait->flags |= WQ_FLAG_EXCLUSIVE;
 189        spin_lock_irqsave(&q->lock, flags);
 190        if (list_empty(&wait->task_list))
 191                __add_wait_queue_tail(q, wait);
 192        /*
 193         * don't alter the task state if this is just going to
 194         * queue an async wait queue callback
 195         */
 196        if (is_sync_wait(wait))
 197                set_current_state(state);
 198        spin_unlock_irqrestore(&q->lock, flags);
 199}
 200
 201EXPORT_SYMBOL(prepare_to_wait_exclusive);
 202
 203void fastcall finish_wait(wait_queue_head_t *q, wait_queue_t *wait)
 204{
 205        unsigned long flags;
 206
 207        __set_current_state(TASK_RUNNING);
 208        /*
 209         * We can check for list emptiness outside the lock
 210         * IFF:
 211         *  - we use the "careful" check that verifies both
 212         *    the next and prev pointers, so that there cannot
 213         *    be any half-pending updates in progress on other
 214         *    CPU's that we haven't seen yet (and that might
 215         *    still change the stack area.
 216         * and
 217         *  - all other users take the lock (ie we can only
 218         *    have _one_ other CPU that looks at or modifies
 219         *    the list).
 220         */
 221        if (!list_empty_careful(&wait->task_list)) {
 222                spin_lock_irqsave(&q->lock, flags);
 223                list_del_init(&wait->task_list);
 224                spin_unlock_irqrestore(&q->lock, flags);
 225        }
 226}
 227
 228EXPORT_SYMBOL(finish_wait);
 229
 230int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
 231{
 232        int ret = default_wake_function(wait, mode, sync, key);
 233
 234        if (ret)
 235                list_del_init(&wait->task_list);
 236        return ret;
 237}
 238
 239EXPORT_SYMBOL(autoremove_wake_function);
 240
 241static struct task_struct_aux init_task_aux;
 242
 243void __init fork_init(unsigned long mempages)
 244{
 245        task_aux(current) = &init_task_aux;
 246#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
 247#ifndef ARCH_MIN_TASKALIGN
 248#define ARCH_MIN_TASKALIGN      L1_CACHE_BYTES
 249#endif
 250        /* create a slab on which task_structs can be allocated */
 251        task_struct_cachep =
 252                kmem_cache_create("task_struct", sizeof(struct task_struct),
 253                        ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL, NULL);
 254#endif
 255
 256        /*
 257         * The default maximum number of threads is set to a safe
 258         * value: the thread structures can take up at most half
 259         * of memory.
 260         */
 261        max_threads = mempages / (THREAD_SIZE/PAGE_SIZE) / 8;
 262        /*
 263         * we need to allow at least 20 threads to boot a system
 264         */
 265        if(max_threads < 20)
 266                max_threads = 20;
 267
 268        init_task.rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
 269        init_task.rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
 270}
 271
 272static struct task_struct *dup_task_struct(struct task_struct *orig)
 273{
 274        struct task_struct_aux *aux;
 275        struct task_struct *tsk;
 276        struct thread_info *ti;
 277
 278        prepare_to_copy(orig);
 279
 280        tsk = alloc_task_struct();
 281        if (!tsk)
 282                return NULL;
 283
 284        ti = alloc_thread_info(tsk);
 285        if (!ti) {
 286                free_task_struct(tsk);
 287                return NULL;
 288        }
 289
 290        aux = kmalloc(sizeof(*aux), GFP_KERNEL);
 291        if (!aux) {
 292                free_thread_info(ti);
 293                free_task_struct(tsk);
 294                return NULL;
 295        }
 296
 297        *ti = *orig->thread_info;
 298        *aux = *task_aux(orig);
 299        *tsk = *orig;
 300        tsk->thread_info = ti;
 301        ti->task = tsk;
 302        task_aux(tsk) = aux;
 303
 304        /* One for us, one for whoever does the "release_task()" (usually parent) */
 305        atomic_set(&tsk->usage,2);
 306        return tsk;
 307}
 308
 309/* Must be called with the mm_flags_lock held.  */
 310static struct mm_flags *__find_mm_flags(struct mm_struct *addr)
 311{
 312        struct hlist_head *head;
 313        struct hlist_node *node;
 314        struct mm_flags *p;
 315
 316        head = &mm_flags_hash[mm_flags_hash_fn(addr)];
 317        hlist_for_each_entry(p, node, head, hlist) {
 318                if (p->addr == addr)
 319                        return p;
 320        }
 321        return NULL;
 322}
 323
 324unsigned long get_mm_flags(struct mm_struct *mm)
 325{
 326        struct mm_flags *p;
 327        unsigned long flags = MMF_DUMP_FILTER_DEFAULT;
 328
 329        spin_lock(&mm_flags_lock);
 330        p = __find_mm_flags(mm);
 331        if (p)
 332                flags = p->flags;
 333        spin_unlock(&mm_flags_lock);
 334
 335        return flags;
 336}
 337
 338int set_mm_flags(struct mm_struct *mm, unsigned long flags, int check_dup)
 339{
 340        struct mm_flags *p, *new_p;
 341
 342        flags &= MMF_DUMP_FILTER_MASK;
 343
 344        if (check_dup) {
 345                /* Check if the entry has already existed.  */
 346                spin_lock(&mm_flags_lock);
 347                p = __find_mm_flags(mm);
 348                if (p) {
 349                        p->flags = flags;
 350                        spin_unlock(&mm_flags_lock);
 351                        return 0;
 352                }
 353                spin_unlock(&mm_flags_lock);
 354
 355                /* Do nothing if the `flags' is equal to the default.  */
 356                if (flags == MMF_DUMP_FILTER_DEFAULT)
 357                        return 0;
 358        }
 359
 360        /* Try to add a new entry.  */
 361        new_p = kmalloc(sizeof(*new_p), GFP_KERNEL);
 362        if (!new_p)
 363                return -ENOMEM;
 364
 365        spin_lock(&mm_flags_lock);
 366        if (!check_dup || !(p = __find_mm_flags(mm))) {
 367                struct hlist_head *head;
 368                head = &mm_flags_hash[mm_flags_hash_fn(mm)];
 369                p = new_p;
 370                p->addr = mm;
 371                hlist_add_head(&p->hlist, head);
 372        } else
 373                kfree(new_p);
 374        p->flags = flags;
 375        spin_unlock(&mm_flags_lock);
 376
 377        return 0;
 378}
 379
 380static void free_mm_flags(struct mm_struct *mm) {
 381        struct mm_flags *p;
 382
 383        spin_lock(&mm_flags_lock);
 384        p = __find_mm_flags(mm);
 385        if (p) {
 386                hlist_del(&p->hlist);
 387                kfree(p);
 388        }
 389        spin_unlock(&mm_flags_lock);
 390}
 391
 392#ifdef CONFIG_MMU
 /*
  * dup_mmap - populate a new mm with copies of the parent's VMAs.
  * @mm:    mm being filled in
  * @oldmm: mm whose mmap_sem is held for write for the duration
  *
  * Resets the bookkeeping fields of @mm, puts it on the mmlist, then walks
  * the VMA list of current->mm: each VMA without VM_DONTCOPY is duplicated,
  * linked into @mm's list and rb-tree, and its page table entries are
  * copied with copy_page_range().
  *
  * NOTE(review): the walk and the cache/TLB flushes use current->mm while
  * the semaphore is taken on @oldmm; copy_mm() in this file passes
  * oldmm == current->mm - confirm no other caller differs.
  *
  * Returns 0 on success, -ENOMEM on an accounting/allocation/policy
  * failure, or the non-zero value returned by copy_page_range().
  */
 393static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm)
 394{
 395        struct vm_area_struct * mpnt, *tmp, **pprev;
 396        struct rb_node **rb_link, *rb_parent;
 397        int retval;
 398        unsigned long charge;
 399        struct mempolicy *pol;
 400
 401        down_write(&oldmm->mmap_sem);
 402        flush_cache_mm(current->mm);
            /* Start the new mm with an empty VMA list and tree and zeroed counters. */
 403        mm->locked_vm = 0;
 404        mm->mmap = NULL;
 405        mm->mmap_cache = NULL;
 406        mm->free_area_cache = oldmm->mmap_base;
 407        mm->map_count = 0;
 408        mm->rss = 0;
 409        mm->anon_rss = 0;
 410        cpus_clear(mm->cpu_vm_mask);
 411        mm->mm_rb = RB_ROOT;
            /* Insertion cursors: next rb-tree slot and next list link to fill. */
 412        rb_link = &mm->mm_rb.rb_node;
 413        rb_parent = NULL;
 414        pprev = &mm->mmap;
 415
 416        /*
 417         * Add it to the mmlist after the parent.
 418         * Doing it this way means that we can order the list,
 419         * and fork() won't mess up the ordering significantly.
 420         * Add it first so that swapoff can see any swap entries.
 421         */
 422        spin_lock(&mmlist_lock);
 423        list_add(&mm->mmlist, &current->mm->mmlist);
 424        mmlist_nr++;
 425        spin_unlock(&mmlist_lock);
 426
 427        for (mpnt = current->mm->mmap ; mpnt ; mpnt = mpnt->vm_next) {
 428                struct file *file;
 429
                    /*
                     * VM_DONTCOPY areas are not duplicated; their page count
                     * is subtracted from the new mm's statistics instead.
                     */
 430                if (mpnt->vm_flags & VM_DONTCOPY) {
 431                        __vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
 432                                                        -vma_pages(mpnt));
 433                        continue;
 434                }
                    /*
                     * charge holds the pages reserved for this VMA so the
                     * failure path can hand them back via vm_unacct_memory().
                     */
 435                charge = 0;
 436                if (mpnt->vm_flags & VM_ACCOUNT) {
 437                        unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
 438                        if (security_vm_enough_memory(len))
 439                                goto fail_nomem;
 440                        charge = len;
 441                }
 442                tmp = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
 443                if (!tmp)
 444                        goto fail_nomem;
                    /* Start from a structure copy of the source VMA. */
 445                *tmp = *mpnt;
 446                pol = mpol_copy(vma_policy(mpnt));
                    /* On error this value is replaced by -ENOMEM at fail_nomem. */
 447                retval = PTR_ERR(pol);
 448                if (IS_ERR(pol))
 449                        goto fail_nomem_policy;
 450                vma_set_policy(tmp, pol);
                    /* The copy drops VM_LOCKED and belongs to the new mm. */
 451                tmp->vm_flags &= ~VM_LOCKED;
 452                tmp->vm_mm = mm;
 453                tmp->vm_next = NULL;
 454                anon_vma_link(tmp);
 455                file = tmp->vm_file;
 456                if (file) {
 457                        struct inode *inode = file->f_dentry->d_inode;
                            /* The new VMA holds its own reference on the file. */
 458                        get_file(file);
 459                        if (tmp->vm_flags & VM_DENYWRITE)
 460                                atomic_dec(&inode->i_writecount);
 461      
 462                        /* insert tmp into the share list, just after mpnt */
 463                        spin_lock(&file->f_mapping->i_mmap_lock);
 464                        flush_dcache_mmap_lock(file->f_mapping);
 465                        vma_prio_tree_add(tmp, mpnt);
 466                        flush_dcache_mmap_unlock(file->f_mapping);
 467                        spin_unlock(&file->f_mapping->i_mmap_lock);
 468                }
 469
 470                /*
 471                 * Link in the new vma and copy the page table entries:
 472                 * link in first so that swapoff can see swap entries,
 473                 * and try_to_unmap_one's find_vma find the new vma.
 474                 */
 475                spin_lock(&mm->page_table_lock);
 476                *pprev = tmp;
 477                pprev = &tmp->vm_next;
 478
                    /*
                     * Each new vma becomes the right child of the previous one
                     * (presumably relying on the source list being in ascending
                     * address order - confirm).
                     */
 479                __vma_link_rb(mm, tmp, rb_link, rb_parent);
 480                rb_link = &tmp->vm_rb.rb_right;
 481                rb_parent = &tmp->vm_rb;
 482
 483                mm->map_count++;
 484                retval = copy_page_range(mm, current->mm, tmp);
 485                spin_unlock(&mm->page_table_lock);
 486
                    /* ->open() is called even if copy_page_range() failed; the
                     * failure is acted on just below. */
 487                if (tmp->vm_ops && tmp->vm_ops->open)
 488                        tmp->vm_ops->open(tmp);
 489
 490                if (retval)
 491                        goto out;
 492        }
 493#ifdef arch_dup_mmap
 494        arch_dup_mmap(mm, oldmm);
 495#endif
 496        retval = 0;
 497
            /* Common exit for success and failure: flush and release the semaphore. */
 498out:
 499        flush_tlb_mm(current->mm);
 500        up_write(&oldmm->mmap_sem);
 501        return retval;
            /* Policy copy failed: the half-built vma was never linked, free it. */
 502fail_nomem_policy:
 503        kmem_cache_free(vm_area_cachep, tmp);
 504fail_nomem:
 505        retval = -ENOMEM;
 506        vm_unacct_memory(charge);
 507        goto out;
 508}
 509
 510static inline int mm_alloc_pgd(struct mm_struct * mm)
 511{
 512        mm->pgd = pgd_alloc(mm);
 513        if (unlikely(!mm->pgd))
 514                return -ENOMEM;
 515        return 0;
 516}
 517
 518static inline void mm_free_pgd(struct mm_struct * mm)
 519{
 520        pgd_free(mm->pgd);
 521}
 522#else
 523#define dup_mmap(mm, oldmm)     (0)
 524#define mm_alloc_pgd(mm)        (0)
 525#define mm_free_pgd(mm)
 526#endif /* CONFIG_MMU */
 527
 /*
  * mmlist_lock is held in this file around every mmlist insertion or
  * removal and the matching mmlist_nr update (dup_mmap(), mmput()), and
  * by get_task_mm() while it inspects and bumps mm_users.
  */
 528spinlock_t mmlist_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
 /* Incremented when dup_mmap() adds an mm to the mmlist, decremented in mmput(). */
 529int mmlist_nr;
 530
 /* mm_structs are allocated from and returned to the mm_cachep slab cache. */
 531#define allocate_mm()   (kmem_cache_alloc(mm_cachep, SLAB_KERNEL))
 532#define free_mm(mm)     (kmem_cache_free(mm_cachep, (mm)))
 533
 534#include <linux/init_task.h>
 535
 536static struct mm_struct * mm_init(struct mm_struct * mm)
 537{
 538        unsigned long mm_flags;
 539
 540        atomic_set(&mm->mm_users, 1);
 541        atomic_set(&mm->mm_count, 1);
 542        init_rwsem(&mm->mmap_sem);
 543        mm->core_waiters = 0;
 544        mm->page_table_lock = SPIN_LOCK_UNLOCKED;
 545        mm->ioctx_list_lock = RW_LOCK_UNLOCKED;
 546        mm->ioctx_list = NULL;
 547        mm->default_kioctx = (struct kioctx)INIT_KIOCTX(mm->default_kioctx, *mm);
 548        mm->free_area_cache = TASK_UNMAPPED_BASE;
 549
 550        mm_flags = get_mm_flags(current->mm);
 551        if (mm_flags != MMF_DUMP_FILTER_DEFAULT) {
 552                if (unlikely(set_mm_flags(mm, mm_flags, 0) < 0))
 553                        goto fail_nomem;
 554        }
 555
 556        if (likely(!mm_alloc_pgd(mm))) {
 557                mm->def_flags = 0;
 558                return mm;
 559        }
 560
 561        if (mm_flags != MMF_DUMP_FILTER_DEFAULT)
 562                free_mm_flags(mm);
 563fail_nomem:
 564        free_mm(mm);
 565        return NULL;
 566}
 567
 568/*
 569 * Allocate and initialize an mm_struct.
 570 */
 571struct mm_struct * mm_alloc(void)
 572{
 573        struct mm_struct * mm;
 574
 575        mm = allocate_mm();
 576        if (mm) {
 577                memset(mm, 0, sizeof(*mm));
 578                mm = mm_init(mm);
 579        }
 580        return mm;
 581}
 582
 583/*
 584 * Called when the last reference to the mm
 585 * is dropped: either by a lazy thread or by
 586 * mmput. Free the page directory and the mm.
 587 */
 588void fastcall __mmdrop(struct mm_struct *mm)
 589{
 590        BUG_ON(mm == &init_mm);
 591        free_mm_flags(mm);
 592        mm_free_pgd(mm);
 593        destroy_context(mm);
 594        free_mm(mm);
 595}
 596
 597/*
 598 * Decrement the use count and release all resources for an mm.
 599 */
 600void mmput(struct mm_struct *mm)
 601{
 602        if (atomic_dec_and_lock(&mm->mm_users, &mmlist_lock)) {
 603                list_del(&mm->mmlist);
 604                mmlist_nr--;
 605                spin_unlock(&mmlist_lock);
 606                exit_aio(mm);
 607                exit_mmap(mm);
 608                put_swap_token(mm);
 609                mmdrop(mm);
 610        }
 611}
 612EXPORT_SYMBOL_GPL(mmput);
 613
 614/**
 615 * get_task_mm - acquire a reference to the task's mm
 616 *
 617 * Returns %NULL if the task has no mm.  Checks if the use count
 618 * of the mm is non-zero and if so returns a reference to it, after
 619 * bumping up the use count.  User must release the mm via mmput()
 620 * after use.  Typically used by /proc and ptrace.
 621 *
 622 * If the use count is zero, it means that this mm is going away,
 623 * so return %NULL.  This only happens in the case of an AIO daemon
 624 * which has temporarily adopted an mm (see use_mm), in the course
 625 * of its final mmput, before exit_aio has completed.
 626 */
 627struct mm_struct *get_task_mm(struct task_struct *task)
 628{
 629        struct mm_struct *mm;
 630
 631        task_lock(task);
 632        mm = task->mm;
 633        if (mm) {
 634                spin_lock(&mmlist_lock);
 635                if (!atomic_read(&mm->mm_users))
 636                        mm = NULL;
 637                else
 638                        atomic_inc(&mm->mm_users);
 639                spin_unlock(&mmlist_lock);
 640        }
 641        task_unlock(task);
 642        return mm;
 643}
 644EXPORT_SYMBOL_GPL(get_task_mm);
 645
 646/* Please note the differences between mmput and mm_release.
 647 * mmput is called whenever we stop holding onto a mm_struct,
 648 * error success whatever.
 649 *
 650 * mm_release is called after a mm_struct has been removed
 651 * from the current process.
 652 *
 653 * This difference is important for error handling, when we
 654 * only half set up a mm_struct for a new process and need to restore
 655 * the old one.  Because we mmput the new mm_struct before
 656 * restoring the old one. . .
 657 * Eric Biederman 10 January 1998
 658 */
 659void mm_release(struct task_struct *tsk, struct mm_struct *mm)
 660{
 661        struct completion *vfork_done = task_aux(tsk)->vfork_done;
 662
 663        /* Get rid of any cached register state */
 664        deactivate_mm(tsk, mm);
 665
 666        /* notify parent sleeping on vfork() */
 667        if (vfork_done) {
 668                task_aux(tsk)->vfork_done = NULL;
 669                complete(vfork_done);
 670        }
 671
 672        /*
 673         * If we're exiting normally, clear a user-space tid field if
 674         * requested.  We leave this alone when dying by signal, to leave
 675         * the value intact in a core dump, and to save the unnecessary
 676         * trouble otherwise.  Userland only wants this done for a sys_exit.
 677         */
 678        if (tsk->clear_child_tid
 679            && !(tsk->flags & PF_SIGNALED)
 680            && atomic_read(&mm->mm_users) > 1) {
 681                u32 __user * tidptr = tsk->clear_child_tid;
 682                tsk->clear_child_tid = NULL;
 683
 684                /*
 685                 * We don't check the error code - if userspace has
 686                 * not set up a proper pointer then tough luck.
 687                 */
 688                put_user(0, tidptr);
 689                sys_futex(tidptr, FUTEX_WAKE, 1, NULL, NULL, 0);
 690        }
 691}
 692
 693static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
 694{
 695        struct mm_struct * mm, *oldmm;
 696        int retval;
 697
 698        tsk->min_flt = tsk->maj_flt = 0;
 699        tsk->nvcsw = tsk->nivcsw = 0;
 700
 701        tsk->mm = NULL;
 702        tsk->active_mm = NULL;
 703
 704        /*
 705         * Are we cloning a kernel thread?
 706         *
 707         * We need to steal a active VM for that..
 708         */
 709        oldmm = current->mm;
 710        if (!oldmm)
 711                return 0;
 712
 713        if (clone_flags & CLONE_VM) {
 714                atomic_inc(&oldmm->mm_users);
 715                mm = oldmm;
 716                /*
 717                 * There are cases where the PTL is held to ensure no
 718                 * new threads start up in user mode using an mm, which
 719                 * allows optimizing out ipis; the tlb_gather_mmu code
 720                 * is an example.
 721                 */
 722                spin_unlock_wait(&oldmm->page_table_lock);
 723                goto good_mm;
 724        }
 725
 726        retval = -ENOMEM;
 727        mm = allocate_mm();
 728        if (!mm)
 729                goto fail_nomem;
 730
 731        /* Copy the current MM stuff.. */
 732        memcpy(mm, oldmm, sizeof(*mm));
 733        if (!mm_init(mm))
 734                goto fail_nomem;
 735
 736        if (init_new_context(tsk,mm))
 737                goto fail_nocontext;
 738
 739        retval = dup_mmap(mm, oldmm);
 740        if (retval)
 741                goto free_pt;
 742
 743good_mm:
 744        tsk->mm = mm;
 745        tsk->active_mm = mm;
 746        return 0;
 747
 748free_pt:
 749        mmput(mm);
 750fail_nomem:
 751        return retval;
 752
 753fail_nocontext:
 754        /*
 755         * If init_new_context() failed, we cannot use mmput() to free the mm
 756         * because it calls destroy_context()
 757         */
 758        free_mm_flags(mm);
 759        mm_free_pgd(mm);
 760        free_mm(mm);
 761        return retval;
 762}
 763
 764static inline struct fs_struct *__copy_fs_struct(struct fs_struct *old)
 765{
 766        struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL);
 767        /* We don't need to lock fs - think why ;-) */
 768        if (fs) {
 769                atomic_set(&fs->count, 1);
 770                fs->lock = RW_LOCK_UNLOCKED;
 771                fs->umask = old->umask;
 772                read_lock(&old->lock);
 773                fs->rootmnt = mntget(old->rootmnt);
 774                fs->root = dget(old->root);
 775                fs->pwdmnt = mntget(old->pwdmnt);
 776                fs->pwd = dget(old->pwd);
 777                if (old->altroot) {
 778                        fs->altrootmnt = mntget(old->altrootmnt);
 779                        fs->altroot = dget(old->altroot);
 780                } else {
 781                        fs->altrootmnt = NULL;
 782                        fs->altroot = NULL;
 783                }
 784                read_unlock(&old->lock);
 785        }
 786        return fs;
 787}
 788
 /*
  * copy_fs_struct - exported wrapper around __copy_fs_struct().
  * @old: fs_struct to copy
  *
  * Returns the new fs_struct, or NULL if the allocation failed.
  */
 struct fs_struct *copy_fs_struct(struct fs_struct *old)
 {
         struct fs_struct *copy = __copy_fs_struct(old);

         return copy;
 }

 EXPORT_SYMBOL_GPL(copy_fs_struct);
 795
 796static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk)
 797{
 798        if (clone_flags & CLONE_FS) {
 799                atomic_inc(&current->fs->count);
 800                return 0;
 801        }
 802        tsk->fs = __copy_fs_struct(current->fs);
 803        if (!tsk->fs)
 804                return -ENOMEM;
 805        return 0;
 806}
 807
 808static int count_open_files(struct files_struct *files, int size)
 809{
 810        int i;
 811
 812        /* Find the last open fd */
 813        for (i = size/(8*sizeof(long)); i > 0; ) {
 814                if (files->open_fds->fds_bits[--i])
 815                        break;
 816        }
 817        i = (i+1) * 8 * sizeof(long);
 818        return i;
 819}
 820
 /*
  * copy_files - give a task its open-file table.
  * @clone_flags: clone flags; CLONE_FILES selects sharing
  * @tsk:         task that receives the table
  *
  * If the current task has no files_struct nothing is done.  With
  * CLONE_FILES the current table's count is bumped.  Otherwise a new
  * files_struct is allocated, grown if the old one needs more room than
  * the embedded arrays provide, and filled with the old table's bitmaps
  * and file pointers (taking a reference on every copied file).
  *
  * Returns 0 on success, -ENOMEM if the files_struct cannot be allocated,
  * or the error returned by expand_fdset()/expand_fd_array().
  */
 821static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
 822{
 823        struct files_struct *oldf, *newf;
 824        struct file **old_fds, **new_fds;
 825        int open_files, nfds, size, i, error = 0;
 826
 827        /*
 828         * A background process may not have any files ...
 829         */
 830        oldf = current->files;
 831        if (!oldf)
 832                goto out;
 833
            /* Sharing: just take another reference on the existing table. */
 834        if (clone_flags & CLONE_FILES) {
 835                atomic_inc(&oldf->count);
 836                goto out;
 837        }
 838
 839        /*
 840         * Note: we may be using current for both targets (See exec.c)
 841         * This works because we cache current->files (old) as oldf. Don't
 842         * break this.
 843         */
 844        tsk->files = NULL;
 845        error = -ENOMEM;
 846        newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL);
 847        if (!newf) 
 848                goto out;
 849
 850        atomic_set(&newf->count, 1);
 851
            /* The new table starts out on its embedded bitmaps and fd array. */
 852        newf->file_lock     = SPIN_LOCK_UNLOCKED;
 853        newf->next_fd       = 0;
 854        newf->max_fds       = NR_OPEN_DEFAULT;
 855        newf->max_fdset     = __FD_SETSIZE;
 856        newf->close_on_exec = &newf->close_on_exec_init;
 857        newf->open_fds      = &newf->open_fds_init;
 858        newf->fd            = &newf->fd_array[0];
 859
 860        /* We don't yet have the oldf readlock, but even if the old
 861           fdset gets grown now, we'll only copy up to "size" fds */
 862        size = oldf->max_fdset;
            /* The old bitmaps are larger than the embedded ones: grow the new ones. */
 863        if (size > __FD_SETSIZE) {
 864                newf->max_fdset = 0;
 865                spin_lock(&newf->file_lock);
 866                error = expand_fdset(newf, size-1);
 867                spin_unlock(&newf->file_lock);
 868                if (error)
 869                        goto out_release;
 870        }
 871        spin_lock(&oldf->file_lock);
 872
 873        open_files = count_open_files(oldf, size);
 874
 875        /*
 876         * Check whether we need to allocate a larger fd array.
 877         * Note: we're not a clone task, so the open count won't
 878         * change.
 879         */
 880        nfds = NR_OPEN_DEFAULT;
 881        if (open_files > nfds) {
                    /* oldf's lock is dropped while the new array is expanded
                     * and retaken afterwards. */
 882                spin_unlock(&oldf->file_lock);
 883                newf->max_fds = 0;
 884                spin_lock(&newf->file_lock);
 885                error = expand_fd_array(newf, open_files-1);
 886                spin_unlock(&newf->file_lock);
 887                if (error) 
 888                        goto out_release;
 889                nfds = newf->max_fds;
 890                spin_lock(&oldf->file_lock);
 891        }
 892
 893        old_fds = oldf->fd;
 894        new_fds = newf->fd;
 895
            /* open_files is a bit count, so open_files/8 bytes of each bitmap are copied. */
 896        memcpy(newf->open_fds->fds_bits, oldf->open_fds->fds_bits, open_files/8);
 897        memcpy(newf->close_on_exec->fds_bits, oldf->close_on_exec->fds_bits, open_files/8);
 898
            /* Copy the file pointers; the fd being handled is open_files - i. */
 899        for (i = open_files; i != 0; i--) {
 900                struct file *f = *old_fds++;
 901                if (f) {
 902                        get_file(f);
 903                } else {
 904                        /*
 905                         * The fd may be claimed in the fd bitmap but not yet
 906                         * instantiated in the files array if a sibling thread
 907                         * is partway through open().  So make sure that this
 908                         * fd is available to the new process.
 909                         */
 910                        FD_CLR(open_files - i, newf->open_fds);
 911                }
 912                *new_fds++ = f;
 913        }
 914        spin_unlock(&oldf->file_lock);
 915
 916        /* compute the remainder to be cleared */
 917        size = (newf->max_fds - open_files) * sizeof(struct file *);
 918
 919        /* This is long word aligned thus could use a optimized version */ 
 920        memset(new_fds, 0, size); 
 921
            /* Zero the tail of both bitmaps: "left" is in bytes, "start" in words. */
 922        if (newf->max_fdset > open_files) {
 923                int left = (newf->max_fdset-open_files)/8;
 924                int start = open_files / (8 * sizeof(unsigned long));
 925
 926                memset(&newf->open_fds->fds_bits[start], 0, left);
 927                memset(&newf->close_on_exec->fds_bits[start], 0, left);
 928        }
 929
 930        tsk->files = newf;
 931        error = 0;
 932out:
 933        return error;
 934
            /* Expansion failed: release the bitmaps and the new files_struct. */
 935out_release:
 936        free_fdset (newf->close_on_exec, newf->max_fdset);
 937        free_fdset (newf->open_fds, newf->max_fdset);
 938        kmem_cache_free(files_cachep, newf);
 939        goto out;
 940}
 941
 942/*
 943 *      Helper to unshare the files of the current task.
 944 *      We don't want to expose copy_files internals to
 945 *      the exec layer of the kernel.
 946 */
 947
 948int unshare_files(void)
 949{
 950        struct files_struct *files  = current->files;
 951        int rc;
 952
 953        if(!files)
 954                BUG();
 955
 956        /* This can race but the race causes us to copy when we don't
 957           need to and drop the copy */
 958        if(atomic_read(&files->count) == 1)
 959        {
 960                atomic_inc(&files->count);
 961                return 0;
 962        }
 963        rc = copy_files(0, current);
 964        if(rc)
 965                current->files = files;
 966        return rc;
 967}
 968
 969EXPORT_SYMBOL(unshare_files);
 970
 971static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk)
 972{
 973        struct sighand_struct *sig;
 974
 975        if (clone_flags & (CLONE_SIGHAND | CLONE_THREAD)) {
 976                atomic_inc(&current->sighand->count);
 977                return 0;
 978        }
 979        sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
 980        tsk->sighand = sig;
 981        if (!sig)
 982                return -ENOMEM;
 983        spin_lock_init(&sig->siglock);
 984        atomic_set(&sig->count, 1);
 985        memcpy(sig->action, current->sighand->action, sizeof(sig->action));
 986        return 0;
 987}
 988
 989static inline int copy_signal(unsigned long clone_flags, struct task_struct * tsk)
 990{
 991        struct signal_struct *sig;
 992        int ret;
 993
 994        if (clone_flags & CLONE_THREAD) {
 995                atomic_inc(&current->signal->count);
 996                atomic_inc(&current->signal->live);
 997                return 0;
 998        }
 999        sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
1000        tsk->signal = sig;
1001        if (!sig)
1002                return -ENOMEM;
1003
1004        ret = copy_thread_group_keys(tsk);
1005        if (ret < 0) {
1006                kmem_cache_free(signal_cachep, sig);
1007                return ret;
1008        }
1009
1010        atomic_set(&sig->count, 1);
1011        atomic_set(&sig->live, 1);
1012        sig->group_exit = 0;
1013        sig->group_exit_code = 0;
1014        sig->group_exit_task = NULL;
1015        sig->group_stop_count = 0;
1016        sig->stop_state = 0;
1017        sig->curr_target = NULL;
1018        init_sigpending(&sig->shared_pending);
1019        INIT_LIST_HEAD(&sig->posix_timers);
1020
1021        sig->tty = current->signal->tty;
1022        sig->pgrp = process_group(current);
1023        sig->session = current->signal->session;
1024        sig->leader = 0;        /* session leadership doesn't inherit */
1025        sig->tty_old_pgrp = 0;
1026
1027        sig->utime = sig->stime = sig->cutime = sig->cstime = 0;
1028        sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
1029        sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
1030        sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
1031
1032        return 0;
1033}
1034
1035static inline void copy_flags(unsigned long clone_flags, struct task_struct *p)
1036{
1037        unsigned long new_flags = p->flags;
1038
1039        new_flags &= ~PF_SUPERPRIV;
1040        new_flags |= PF_FORKNOEXEC;
1041        if (!(clone_flags & CLONE_PTRACE))
1042                p->ptrace = 0;
1043        p->flags = new_flags;
1044}
1045
1046asmlinkage long sys_set_tid_address(int __user *tidptr)
1047{
1048        current->clear_child_tid = tidptr;
1049
1050        return current->pid;
1051}
1052
1053/*
1054 * This creates a new process as a copy of the old one,
1055 * but does not actually start it yet.
1056 *
1057 * It copies the registers, and all the appropriate
1058 * parts of the process environment (as per the clone
1059 * flags). The actual kick-off is left to the caller.
     *
     * The pid argument is stored into the new task's ->pid; this function
     * neither allocates nor releases it.
     *
     * Returns the new task on success, or an ERR_PTR()-encoded negative
     * errno on failure. It never returns NULL. On failure everything set
     * up so far is released through the bad_fork_* labels at the bottom.
1060 */
1061static task_t *copy_process(unsigned long clone_flags,
1062                                 unsigned long stack_start,
1063                                 struct pt_regs *regs,
1064                                 unsigned long stack_size,
1065                                 int __user *parent_tidptr,
1066                                 int __user *child_tidptr,
1067                                 int pid)
1068{
1069        int retval;
1070        struct task_struct *p = NULL;
1071
            /* CLONE_NEWNS and CLONE_FS may not be requested together. */
1072        if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
1073                return ERR_PTR(-EINVAL);
1074
1075        /*
1076         * Thread groups must share signals as well, and detached threads
1077         * can only be started up within the thread group.
1078         */
1079        if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
1080                return ERR_PTR(-EINVAL);
1081
1082        /*
1083         * Shared signal handlers imply shared VM. By way of the above,
1084         * thread groups also imply shared VM. Blocking this case allows
1085         * for various simplifications in other code.
1086         */
1087        if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
1088                return ERR_PTR(-EINVAL);
1089
1090        retval = security_task_create(clone_flags);
1091        if (retval)
1092                goto fork_out;
1093
1094        retval = -ENOMEM;
1095        p = dup_task_struct(current);
1096        if (!p)
1097                goto fork_out;
1098        p->tux_info = NULL;
1099
            /*
             * Fail with -EAGAIN when the user is at or over RLIMIT_NPROC,
             * unless the caller has CAP_SYS_ADMIN or CAP_SYS_RESOURCE, or
             * the task belongs to root_user.
             */
1100        retval = -EAGAIN;
1101        if (atomic_read(&p->user->processes) >=
1102                        p->rlim[RLIMIT_NPROC].rlim_cur) {
1103                if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
1104                                p->user != &root_user)
1105                        goto bad_fork_free;
1106        }
1107
            /* These three are undone at bad_fork_cleanup_count on failure. */
1108        atomic_inc(&p->user->__count);
1109        atomic_inc(&p->user->processes);
1110        get_group_info(p->group_info);
1111
1112        /*
1113         * If multiple threads are within copy_process(), then this check
1114         * triggers too late. This doesn't hurt, the check is only there
1115         * to stop root fork bombs.
1116         */
1117        if (nr_threads >= max_threads)
1118                goto bad_fork_cleanup_count;
1119
            /*
             * Take module references on the exec domain and (if any) the
             * binfmt; released at bad_fork_cleanup_put_domain and
             * bad_fork_cleanup respectively on failure.
             */
1120        if (!try_module_get(p->thread_info->exec_domain->module))
1121                goto bad_fork_cleanup_count;
1122
1123        if (p->binfmt && !try_module_get(p->binfmt->module))
1124                goto bad_fork_cleanup_put_domain;
1125
1126        p->did_exec = 0;
1127        copy_flags(clone_flags, p);
1128        p->pid = pid;
            /* -EFAULT is the error for a failing put_user() just below */
1129        retval = -EFAULT;
1130        if (clone_flags & CLONE_PARENT_SETTID)
1131                if (put_user(p->pid, parent_tidptr))
1132                        goto bad_fork_cleanup;
1133
1134        p->proc_dentry = NULL;
1135
1136        INIT_LIST_HEAD(&p->children);
1137        INIT_LIST_HEAD(&p->sibling);
1138        init_waitqueue_head(&p->wait_chldexit);
1139        task_aux(p)->vfork_done = NULL;
1140        spin_lock_init(&p->alloc_lock);
1141        spin_lock_init(&p->proc_lock);
1142
            /* the new task starts with no pending signals of its own */
1143        clear_tsk_thread_flag(p, TIF_SIGPENDING);
1144        init_sigpending(&p->pending);
1145
1146        task_io_accounting_init(p);
1147
            /* interval timers and CPU time accounting start from zero */
1148        p->it_real_value = p->it_virt_value = p->it_prof_value = 0;
1149        p->it_real_incr = p->it_virt_incr = p->it_prof_incr = 0;
1150        init_timer(&p->real_timer);
1151        p->real_timer.data = (unsigned long) p;
1152
1153        p->utime = p->stime = 0;
1154        p->lock_depth = -1;             /* -1 = no lock */
1155        do_posix_clock_monotonic_gettime(&p->start_time);
1156        p->security = NULL;
1157        p->io_context = NULL;
1158        p->io_wait = NULL;
1159        p->audit_context = NULL;
1160#ifdef CONFIG_NUMA
1161        p->mempolicy = mpol_copy(p->mempolicy);
1162        if (IS_ERR(p->mempolicy)) {
1163                retval = PTR_ERR(p->mempolicy);
                    /* nothing to free: skip bad_fork_cleanup_policy */
1164                p->mempolicy = NULL;
1165                goto bad_fork_cleanup;
1166        }
1167#endif
1168
            /*
             * Each step below jumps, on failure, to the label that undoes
             * the steps which already succeeded (see the label ladder at
             * the end of the function).
             */
1169        if ((retval = security_task_alloc(p)))
1170                goto bad_fork_cleanup_policy;
1171        if ((retval = audit_alloc(p)))
1172                goto bad_fork_cleanup_security;
1173        /* copy all the process information */
1174        if ((retval = copy_semundo(clone_flags, p)))
1175                goto bad_fork_cleanup_audit;
1176        if ((retval = copy_files(clone_flags, p)))
1177                goto bad_fork_cleanup_semundo;
1178        if ((retval = copy_fs(clone_flags, p)))
1179                goto bad_fork_cleanup_files;
1180        if ((retval = copy_sighand(clone_flags, p)))
1181                goto bad_fork_cleanup_fs;
1182        if ((retval = copy_signal(clone_flags, p)))
1183                goto bad_fork_cleanup_sighand;
1184        if ((retval = copy_mm(clone_flags, p)))
1185                goto bad_fork_cleanup_signal;
1186        if ((retval = copy_keys(clone_flags, p)))
1187                goto bad_fork_cleanup_mm;
1188        if ((retval = copy_namespace(clone_flags, p)))
1189                goto bad_fork_cleanup_keys;
1190        retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
1191        if (retval)
1192                goto bad_fork_cleanup_namespace;
1193
1194        p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
1195        /*
1196         * Clear TID on mm_release()?
1197         */
1198        p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL;
1199
1200        /*
1201         * Syscall tracing should be turned off in the child regardless
1202         * of CLONE_PTRACE.
1203         */
1204        clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);
1205
1206        /* ok, now we should be set up.. */
1207        p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
1208        p->pdeath_signal = 0;
1209        p->exit_state = 0;
1210
1211        /* Perform scheduler related setup */
1212        sched_fork(p);
1213
1214        /*
1215         * Ok, make it visible to the rest of the system.
1216         * We don't wake it up yet.
1217         */
            /* defaults for a non-thread; overridden below for CLONE_THREAD */
1218        p->tgid = p->pid;
1219        p->group_leader = p;
1220        INIT_LIST_HEAD(&p->ptrace_children);
1221        INIT_LIST_HEAD(&p->ptrace_list);
1222
1223        /* Need tasklist lock for parent etc handling! */
            /* Every exit path from here to the final unlock must drop it. */
1224        write_lock_irq(&tasklist_lock);
1225
1226        /*
1227         * The task hasn't been attached yet, so cpus_allowed mask cannot
1228         * have changed. The cpus_allowed mask of the parent may have
1229         * changed after it was copied first time, and it may then move to
1230         * another CPU - so we re-copy it here and set the child's CPU to
1231         * the parent's CPU. This avoids a lot of nasty races.
1232         */
1233        p->cpus_allowed = current->cpus_allowed;
1234        set_task_cpu(p, smp_processor_id());
1235
1236        /*
1237         * Check for pending SIGKILL! The new thread should not be allowed
1238         * to slip out of an OOM kill. (or normal SIGKILL.)
1239         */
1240        if (sigismember(&current->pending.signal, SIGKILL)) {
1241                write_unlock_irq(&tasklist_lock);
1242                retval = -EINTR;
1243                goto bad_fork_cleanup_namespace;
1244        }
1245
1246        /* CLONE_PARENT re-uses the old parent */
1247        if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
1248                p->real_parent = current->real_parent;
1249                p->parent_exec_id = current->parent_exec_id;
1250        } else {
1251                p->real_parent = current;
1252                p->parent_exec_id = current->self_exec_id;
1253        }
1254        p->parent = p->real_parent;
1255
            /* siglock is taken inside tasklist_lock and held until the end */
1256        spin_lock(&current->sighand->siglock);
1257        if (clone_flags & CLONE_THREAD) {
1258                /*
1259                 * Important: if an exit-all has been started then
1260                 * do not create this new thread - the whole thread
1261                 * group is supposed to exit anyway.
1262                 */
1263                if (current->signal->group_exit) {
                            /* drop both locks in reverse order of acquisition */
1264                        spin_unlock(&current->sighand->siglock);
1265                        write_unlock_irq(&tasklist_lock);
1266                        retval = -EAGAIN;
1267                        goto bad_fork_cleanup_namespace;
1268                }
1269                p->tgid = current->tgid;
1270                p->group_leader = current->group_leader;
1271
1272                if (current->signal->group_stop_count > 0) {
1273                        /*
1274                         * There is an all-stop in progress for the group.
1275                         * We ourselves will stop as soon as we check signals.
1276                         * Make the new thread part of that group stop too.
1277                         */
1278                        current->signal->group_stop_count++;
1279                        set_tsk_thread_flag(p, TIF_SIGPENDING);
1280                }
1281        }
1282
1283        SET_LINKS(p);
1284        if (unlikely(p->ptrace & PT_PTRACED))
1285                __ptrace_link(p, current->parent);
1286
            /*
             * Only a thread group leader is attached under PGID and SID,
             * and only leaders with a non-zero pid are counted in
             * process_counts.
             */
1287        if (thread_group_leader(p)) {
1288                attach_pid(p, PIDTYPE_PGID, process_group(p));
1289                attach_pid(p, PIDTYPE_SID, p->signal->session);
1290                if (p->pid)
1291                        __get_cpu_var(process_counts)++;
1292        }
1293        attach_pid(p, PIDTYPE_TGID, p->tgid);
1294        attach_pid(p, PIDTYPE_PID, p->pid);
1295
            /*
             * NOTE(review): presumably covers the parent losing its tty
             * after copy_signal() copied it - confirm.
             */
1296        if (!current->signal->tty && p->signal->tty)
1297                p->signal->tty = NULL;
1298
1299        nr_threads++;
1300        spin_unlock(&current->sighand->siglock);
1301        write_unlock_irq(&tasklist_lock);
1302        retval = 0;
1303
            /* Common exit: a non-zero retval is returned as an ERR_PTR. */
1304fork_out:
1305        if (retval)
1306                return ERR_PTR(retval);
1307        return p;
1308
            /*
             * Error unwinding. The labels are in reverse order of setup;
             * entering at a label runs that cleanup and falls through all
             * the ones below it, ending in free_task() and fork_out.
             */
1309bad_fork_cleanup_namespace:
1310        exit_namespace(p);
1311bad_fork_cleanup_keys:
1312        exit_keys(p);
1313bad_fork_cleanup_mm:
1314        if (p->mm)
1315                mmput(p->mm);
1316bad_fork_cleanup_signal:
1317        exit_signal(p);
1318bad_fork_cleanup_sighand:
1319        exit_sighand(p);
1320bad_fork_cleanup_fs:
1321        exit_fs(p); /* blocking */
1322bad_fork_cleanup_files:
1323        exit_files(p); /* blocking */
1324bad_fork_cleanup_semundo:
1325        exit_sem(p);
1326bad_fork_cleanup_audit:
1327        audit_free(p);
1328bad_fork_cleanup_security:
1329        security_task_free(p);
1330bad_fork_cleanup_policy:
1331#ifdef CONFIG_NUMA
1332        mpol_free(p->mempolicy);
1333#endif
1334bad_fork_cleanup:
1335        if (p->binfmt)
1336                module_put(p->binfmt->module);
1337bad_fork_cleanup_put_domain:
1338        module_put(p->thread_info->exec_domain->module);
1339bad_fork_cleanup_count:
1340        put_group_info(p->group_info);
1341        atomic_dec(&p->user->processes);
1342        free_uid(p->user);
1343bad_fork_free:
1344        free_task(p);
1345        goto fork_out;
1346}
1347
1348struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs)
1349{
1350        memset(regs, 0, sizeof(struct pt_regs));
1351        return regs;
1352}
1353
1354task_t * __devinit fork_idle(int cpu)
1355{
1356        task_t *task;
1357        struct pt_regs regs;
1358
1359        task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL, NULL, 0);
1360        if (!task)
1361                return ERR_PTR(-ENOMEM);
1362        init_idle(task, cpu);
1363        unhash_process(task);
1364        return task;
1365}
1366
1367static inline int fork_traceflag (unsigned clone_flags)
1368{
1369        if (clone_flags & CLONE_UNTRACED)
1370                return 0;
1371        else if (clone_flags & CLONE_VFORK) {
1372                if (current->ptrace & PT_TRACE_VFORK)
1373                        return PTRACE_EVENT_VFORK;
1374        } else if ((clone_flags & CSIGNAL) != SIGCHLD) {
1375                if (current->ptrace & PT_TRACE_CLONE)
1376                        return PTRACE_EVENT_CLONE;
1377        } else if (current->ptrace & PT_TRACE_FORK)
1378                return PTRACE_EVENT_FORK;
1379
1380        return 0;
1381}
1382
1383/*
1384 *  Ok, this is the main fork-routine.
1385 *
1386 * It copies the process, and if successful kick-starts
1387 * it and waits for it to finish using the VM if required.
1388 */
1389long do_fork(unsigned long clone_flags,
1390              unsigned long stack_start,
1391              struct pt_regs *regs,
1392              unsigned long stack_size,
1393              int __user *parent_tidptr,
1394              int __user *child_tidptr)
1395{
1396        struct task_struct *p;
1397        int trace = 0;
1398        long pid = alloc_pidmap();
1399
1400        if (pid < 0)
1401                return -EAGAIN;
1402        if (unlikely(current->ptrace)) {
1403                trace = fork_traceflag (clone_flags);
1404                if (trace)
1405                        clone_flags |= CLONE_PTRACE;
1406        }
1407
1408        p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, pid);
1409        /*
1410         * Do this prior waking up the new thread - the thread pointer
1411         * might get invalid after that point, if the thread exits quickly.
1412         */
1413        if (!IS_ERR(p)) {
1414                struct completion vfork;
1415
1416                if (clone_flags & CLONE_VFORK) {
1417                        task_aux(p)->vfork_done = &vfork;
1418                        init_completion(&vfork);
1419                }
1420
1421                if ((p->ptrace & PT_PTRACED) || (clone_flags & CLONE_STOPPED)) {
1422                        /*
1423                         * We'll start up with an immediate SIGSTOP.
1424                         */
1425                        sigaddset(&p->pending.signal, SIGSTOP);
1426                        set_tsk_thread_flag(p, TIF_SIGPENDING);
1427                }
1428
1429                if (!(clone_flags & CLONE_STOPPED))
1430                        wake_up_new_task(p, clone_flags);
1431                else
1432                        p->state = TASK_STOPPED;
1433                ++total_forks;
1434
1435                if (unlikely (trace)) {
1436                        current->ptrace_message = pid;
1437                        ptrace_notify ((trace << 8) | SIGTRAP);
1438                }
1439
1440                if (clone_flags & CLONE_VFORK) {
1441                        wait_for_completion(&vfork);
1442                        if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE))
1443                                ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP);
1444                }
1445        } else {
1446                free_pidmap(pid);
1447                pid = PTR_ERR(p);
1448        }
1449        return pid;
1450}
1451
1452/* SLAB cache for signal_struct structures (tsk->signal); created in proc_caches_init() */
1453kmem_cache_t *signal_cachep;
1454
1455/* SLAB cache for sighand_struct structures (tsk->sighand); created in proc_caches_init() */
1456kmem_cache_t *sighand_cachep;
1457
1458/* SLAB cache for files_struct structures (tsk->files); created in proc_caches_init() */
1459kmem_cache_t *files_cachep;
1460
1461/* SLAB cache for fs_struct structures (tsk->fs); created in proc_caches_init() */
1462kmem_cache_t *fs_cachep;
1463
1464/* SLAB cache for vm_area_struct structures; created in proc_caches_init() */
1465kmem_cache_t *vm_area_cachep;
1466
1467/* SLAB cache for mm_struct structures (tsk->mm); created in proc_caches_init() */
1468kmem_cache_t *mm_cachep;
1469
1470void __init proc_caches_init(void)
1471{
1472        sighand_cachep = kmem_cache_create("sighand_cache",
1473                        sizeof(struct sighand_struct), 0,
1474                        SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
1475        signal_cachep = kmem_cache_create("signal_cache",
1476                        sizeof(struct signal_struct), 0,
1477                        SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
1478        files_cachep = kmem_cache_create("files_cache", 
1479                        sizeof(struct files_struct), 0,
1480                        SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
1481        fs_cachep = kmem_cache_create("fs_cache", 
1482                        sizeof(struct fs_struct), 0,
1483                        SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
1484        vm_area_cachep = kmem_cache_create("vm_area_struct",
1485                        sizeof(struct vm_area_struct), 0,
1486                        SLAB_PANIC, NULL, NULL);
1487        mm_cachep = kmem_cache_create("mm_struct",
1488                        sizeof(struct mm_struct), 0,
1489                        SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
1490}
1491