RHEL4/kernel/sched.c
<<
>>
Prefs
   1/*
   2 *  kernel/sched.c
   3 *
   4 *  Kernel scheduler and related syscalls
   5 *
   6 *  Copyright (C) 1991-2002  Linus Torvalds
   7 *
   8 *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
   9 *              make semaphores SMP safe
  10 *  1998-11-19  Implemented schedule_timeout() and related stuff
  11 *              by Andrea Arcangeli
  12 *  2002-01-04  New ultra-scalable O(1) scheduler by Ingo Molnar:
  13 *              hybrid priority-list and round-robin design with
  14 *              an array-switch method of distributing timeslices
  15 *              and per-CPU runqueues.  Cleanups and useful suggestions
  16 *              by Davide Libenzi, preemptible kernel bits by Robert Love.
  17 *  2003-09-03  Interactivity tuning by Con Kolivas.
  18 *  2004-04-02  Scheduler domains code by Nick Piggin
  19 */
  20
  21#include <linux/mm.h>
  22#include <linux/module.h>
  23#include <linux/nmi.h>
  24#include <linux/init.h>
  25#include <asm/uaccess.h>
  26#include <linux/highmem.h>
  27#include <linux/smp_lock.h>
  28#include <linux/pagemap.h>
  29#include <asm/mmu_context.h>
  30#include <linux/interrupt.h>
  31#include <linux/completion.h>
  32#include <linux/kernel_stat.h>
  33#include <linux/security.h>
  34#include <linux/notifier.h>
  35#include <linux/profile.h>
  36#include <linux/suspend.h>
  37#include <linux/blkdev.h>
  38#include <linux/delay.h>
  39#include <linux/smp.h>
  40#include <linux/timer.h>
  41#include <linux/rcupdate.h>
  42#include <linux/cpu.h>
  43#include <linux/percpu.h>
  44#include <linux/kthread.h>
  45#include <linux/seq_file.h>
  46#include <linux/times.h>
  47#include <linux/kprobes.h>
  48#include <asm/tlb.h>
  49
  50#include <asm/unistd.h>
  51
  52#ifdef CONFIG_NUMA
  53#define cpu_to_node_mask(cpu) node_to_cpumask(cpu_to_node(cpu))
  54#else
  55#define cpu_to_node_mask(cpu) (cpu_online_map)
  56#endif
  57
  58/*
  59 * Convert user-nice values [ -20 ... 0 ... 19 ]
  60 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
  61 * and back.
  62 */
  63#define NICE_TO_PRIO(nice)      (MAX_RT_PRIO + (nice) + 20)
  64#define PRIO_TO_NICE(prio)      ((prio) - MAX_RT_PRIO - 20)
  65#define TASK_NICE(p)            PRIO_TO_NICE((p)->static_prio)
  66
  67/*
  68 * 'User priority' is the nice value converted to something we
  69 * can work with better when scaling various scheduler parameters,
  70 * it's a [ 0 ... 39 ] range.
  71 */
  72#define USER_PRIO(p)            ((p)-MAX_RT_PRIO)
  73#define TASK_USER_PRIO(p)       USER_PRIO((p)->static_prio)
  74#define MAX_USER_PRIO           (USER_PRIO(MAX_PRIO))
  75
  76/*
  77 * Some helpers for converting nanosecond timing to jiffy resolution
  78 */
  79#define NS_TO_JIFFIES(TIME)     ((TIME) / (1000000000 / HZ))
  80#define JIFFIES_TO_NS(TIME)     ((TIME) * (1000000000 / HZ))
  81
  82/** Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
  83* so long as it is large enough.
  84*/
  85#define MAX_PINNED_INTERVAL     512
  86
  87/*
  88 * These are the 'tuning knobs' of the scheduler:
  89 *
  90 * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger),
  91 * default timeslice is 100 msecs, maximum timeslice is 800 msecs.
  92 * Timeslices get refilled after they expire.
  93 */
  94#define MIN_TIMESLICE           max(5 * HZ / 1000, 1)
  95#define DEF_TIMESLICE           (100 * HZ / 1000)
  96#define ON_RUNQUEUE_WEIGHT       30
  97#define CHILD_PENALTY            95
  98#define PARENT_PENALTY          100
  99#define EXIT_WEIGHT               3
 100#define PRIO_BONUS_RATIO         25
 101#define MAX_BONUS               (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100)
 102#define INTERACTIVE_DELTA         2
 103#define MAX_SLEEP_AVG           (DEF_TIMESLICE * MAX_BONUS)
 104#define STARVATION_LIMIT        (MAX_SLEEP_AVG)
 105#define NS_MAX_SLEEP_AVG        (JIFFIES_TO_NS(MAX_SLEEP_AVG))
 106#define CREDIT_LIMIT            100
 107
 108/*
 109 * If a task is 'interactive' then we reinsert it in the active
 110 * array after it has expired its current timeslice. (it will not
 111 * continue to run immediately, it will still roundrobin with
 112 * other interactive tasks.)
 113 *
 114 * This part scales the interactivity limit depending on niceness.
 115 *
 116 * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
 117 * Here are a few examples of different nice levels:
 118 *
 119 *  TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
 120 *  TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
 121 *  TASK_INTERACTIVE(  0): [1,1,1,1,0,0,0,0,0,0,0]
 122 *  TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
 123 *  TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
 124 *
 125 * (the X axis represents the possible -5 ... 0 ... +5 dynamic
 126 *  priority range a task can explore, a value of '1' means the
 127 *  task is rated interactive.)
 128 *
 129 * Ie. nice +19 tasks can never get 'interactive' enough to be
 130 * reinserted into the active array. And only heavily CPU-hog nice -20
 131 * tasks will be expired. Default nice 0 tasks are somewhere between,
 132 * it takes some effort for them to get interactive, but it's not
 133 * too hard.
 134 */
 135
 136#define CURRENT_BONUS(p) \
 137        (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \
 138                MAX_SLEEP_AVG)
 139
 140#ifdef CONFIG_SMP
 141#define TIMESLICE_GRANULARITY(p)        (MIN_TIMESLICE * \
 142                (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \
 143                        num_online_cpus())
 144#else
 145#define TIMESLICE_GRANULARITY(p)        (MIN_TIMESLICE * \
 146                (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)))
 147#endif
 148
 149#define SCALE(v1,v1_max,v2_max) \
 150        (v1) * (v2_max) / (v1_max)
 151
 152#define DELTA(p) \
 153        (SCALE(TASK_NICE(p), 40, MAX_BONUS) + INTERACTIVE_DELTA)
 154
 155#define TASK_INTERACTIVE(p) \
 156        ((p)->prio <= (p)->static_prio - DELTA(p))
 157
 158#define INTERACTIVE_SLEEP(p) \
 159        (JIFFIES_TO_NS(MAX_SLEEP_AVG * \
 160                (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
 161
 162#define HIGH_CREDIT(p) \
 163        ((p)->interactive_credit > CREDIT_LIMIT)
 164
 165#define LOW_CREDIT(p) \
 166        ((p)->interactive_credit < -CREDIT_LIMIT)
 167
 168#define TASK_PREEMPTS_CURR(p, rq) \
 169        ((p)->prio < (rq)->curr->prio)
 170
 171/*
 172 * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
 173 * to time slice values: [800ms ... 100ms ... 5ms]
 174 *
 175 * The higher a thread's priority, the bigger timeslices
 176 * it gets during one round of execution. But even the lowest
 177 * priority thread gets MIN_TIMESLICE worth of execution time.
 178 */
 179
 180#define SCALE_PRIO(x, prio) \
 181        max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE)
 182
 183static unsigned int task_timeslice(task_t *p)
 184{
 185        if (p->static_prio < NICE_TO_PRIO(0))
 186                return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio);
 187        else
 188                return SCALE_PRIO(DEF_TIMESLICE, p->static_prio);
 189}
 190#define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran)       \
 191                                < (long long) (sd)->cache_hot_time)
 192
 193enum idle_type
 194{
 195        IDLE,
 196        NOT_IDLE,
 197        NEWLY_IDLE,
 198        MAX_IDLE_TYPES
 199};
 200
 201struct sched_domain;
 202
 203/*
 204 * These are the runqueue data structures:
 205 */
 206
 207#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))
 208
 209typedef struct runqueue runqueue_t;
 210
 211struct prio_array {
 212        unsigned int nr_active;
 213        unsigned long bitmap[BITMAP_SIZE];
 214        struct list_head queue[MAX_PRIO];
 215};
 216
 217/*
 218 * This is the main, per-CPU runqueue data structure.
 219 *
 220 * Locking rule: those places that want to lock multiple runqueues
 221 * (such as the load balancing or the thread migration code), lock
 222 * acquire operations must be ordered by ascending &runqueue.
 223 */
 224struct runqueue {
 225        spinlock_t lock;
 226
 227        /*
 228         * nr_running and cpu_load should be in the same cacheline because
 229         * remote CPUs use both these fields when doing load calculation.
 230         */
 231        unsigned long nr_running;
 232#ifdef CONFIG_SMP
 233        unsigned long cpu_load;
 234        unsigned int irq_pct;
 235        int irq_quantum;
 236        u64 prev_irq_ticks;
 237#endif
 238        unsigned long long nr_switches;
 239
 240        /*
 241         * This is part of a global counter where only the total sum
 242         * over all CPUs matters. A task can increase this counter on
 243         * one CPU and if it got migrated afterwards it may decrease
 244         * it on another CPU. Always updated under the runqueue lock:
 245         */
 246        unsigned long nr_uninterruptible;
 247
 248        unsigned long switch_timestamp;
 249        unsigned long long timestamp_last_tick;
 250        task_t *curr, *idle;
 251        struct mm_struct *prev_mm;
 252        prio_array_t *active, *expired, arrays[2];
 253        int best_expired_prio;
 254        atomic_t nr_iowait;
 255
 256#ifdef CONFIG_SMP
 257        struct sched_domain *sd;
 258
 259        /* For active balancing */
 260        int active_balance;
 261        int push_cpu;
 262
 263        task_t *migration_thread;
 264        struct list_head migration_queue;
 265        int cpu;
 266#endif
 267
 268#ifdef CONFIG_SCHEDSTATS
 269        /* latency stats */
 270        struct sched_info rq_sched_info;
 271
 272        /* sys_sched_yield() stats */
 273        unsigned long yld_exp_empty;
 274        unsigned long yld_act_empty;
 275        unsigned long yld_both_empty;
 276        unsigned long yld_cnt;
 277
 278        /* schedule() stats */
 279        unsigned long sched_noswitch;
 280        unsigned long sched_switch;
 281        unsigned long sched_cnt;
 282        unsigned long sched_goidle;
 283
 284        /* pull_task() stats */
 285        unsigned long pt_gained[MAX_IDLE_TYPES];
 286        unsigned long pt_lost[MAX_IDLE_TYPES];
 287
 288        /* active_load_balance() stats */
 289        unsigned long alb_cnt;
 290        unsigned long alb_lost;
 291        unsigned long alb_gained;
 292        unsigned long alb_failed;
 293
 294        /* try_to_wake_up() stats */
 295        unsigned long ttwu_cnt;
 296        unsigned long ttwu_attempts;
 297        unsigned long ttwu_moved;
 298
 299        /* wake_up_new_task() stats */
 300        unsigned long wunt_cnt;
 301        unsigned long wunt_moved;
 302
 303        /* sched_migrate_task() stats */
 304        unsigned long smt_cnt;
 305
 306        /* sched_balance_exec() stats */
 307        unsigned long sbe_cnt;
 308#endif
 309};
 310
 311static DEFINE_PER_CPU(struct runqueue, runqueues);
 312
 313/*
 314 * sched-domains (multiprocessor balancing) declarations:
 315 */
 316#ifdef CONFIG_SMP
 317#define SCHED_LOAD_SCALE        128UL   /* increase resolution of load */
 318
 319#define SD_BALANCE_NEWIDLE      1       /* Balance when about to become idle */
 320#define SD_BALANCE_EXEC         2       /* Balance on exec */
 321#define SD_WAKE_IDLE            4       /* Wake to idle CPU on task wakeup */
 322#define SD_WAKE_AFFINE          8       /* Wake task to waking CPU */
 323#define SD_WAKE_BALANCE         16      /* Perform balancing at task wakeup */
 324#define SD_SHARE_CPUPOWER       32      /* Domain members share cpu power */
 325
 326struct sched_group {
 327        struct sched_group *next;       /* Must be a circular list */
 328        cpumask_t cpumask;
 329
 330        /*
 331         * CPU power of this group, SCHED_LOAD_SCALE being max power for a
 332         * single CPU. This is read only (except for setup, hotplug CPU).
 333         */
 334        unsigned long cpu_power;
 335};
 336
 337struct sched_domain {
 338        /* These fields must be setup */
 339        struct sched_domain *parent;    /* top domain must be null terminated */
 340        struct sched_group *groups;     /* the balancing groups of the domain */
 341        cpumask_t span;                 /* span of all CPUs in this domain */
 342        unsigned long min_interval;     /* Minimum balance interval ms */
 343        unsigned long max_interval;     /* Maximum balance interval ms */
 344        unsigned int busy_factor;       /* less balancing by factor if busy */
 345        unsigned int imbalance_pct;     /* No balance until over watermark */
 346        unsigned long long cache_hot_time; /* Task considered cache hot (ns) */
 347        unsigned int cache_nice_tries;  /* Leave cache hot tasks for # tries */
 348        unsigned int per_cpu_gain;      /* CPU % gained by adding domain cpus */
 349        int flags;                      /* See SD_* */
 350
 351        /* Runtime fields. */
 352        unsigned long last_balance;     /* init to jiffies. units in jiffies */
 353        unsigned int balance_interval;  /* initialise to 1. units in ms. */
 354        unsigned int nr_balance_failed; /* initialise to 0 */
 355
 356#ifdef CONFIG_SCHEDSTATS
 357        /* load_balance() stats */
 358        unsigned long lb_cnt[MAX_IDLE_TYPES];
 359        unsigned long lb_failed[MAX_IDLE_TYPES];
 360        unsigned long lb_imbalance[MAX_IDLE_TYPES];
 361        unsigned long lb_nobusyg[MAX_IDLE_TYPES];
 362        unsigned long lb_nobusyq[MAX_IDLE_TYPES];
 363
 364        /* sched_balance_exec() stats */
 365        unsigned long sbe_attempts;
 366        unsigned long sbe_pushed;
 367
 368        /* try_to_wake_up() stats */
 369        unsigned long ttwu_wake_affine;
 370        unsigned long ttwu_wake_balance;
 371#endif
 372};
 373
 374#ifndef ARCH_HAS_SCHED_TUNE
 375#ifdef CONFIG_SCHED_SMT
 376#define ARCH_HAS_SCHED_WAKE_IDLE
 377/* Common values for SMT siblings */
 378#define SD_SIBLING_INIT (struct sched_domain) {         \
 379        .span                   = CPU_MASK_NONE,        \
 380        .parent                 = NULL,                 \
 381        .groups                 = NULL,                 \
 382        .min_interval           = 1,                    \
 383        .max_interval           = 2,                    \
 384        .busy_factor            = 8,                    \
 385        .imbalance_pct          = 110,                  \
 386        .cache_hot_time         = 0,                    \
 387        .cache_nice_tries       = 0,                    \
 388        .per_cpu_gain           = 25,                   \
 389        .flags                  = SD_BALANCE_NEWIDLE    \
 390                                | SD_BALANCE_EXEC       \
 391                                | SD_WAKE_AFFINE        \
 392                                | SD_WAKE_IDLE          \
 393                                | SD_SHARE_CPUPOWER,    \
 394        .last_balance           = jiffies,              \
 395        .balance_interval       = 1,                    \
 396        .nr_balance_failed      = 0,                    \
 397}
 398#endif
 399
 400/* Common values for CPUs */
 401#define SD_CPU_INIT (struct sched_domain) {             \
 402        .span                   = CPU_MASK_NONE,        \
 403        .parent                 = NULL,                 \
 404        .groups                 = NULL,                 \
 405        .min_interval           = 1,                    \
 406        .max_interval           = 4,                    \
 407        .busy_factor            = 64,                   \
 408        .imbalance_pct          = 125,                  \
 409        .cache_hot_time         = cache_decay_ticks*1000000 ? : (5*1000000/2),\
 410        .cache_nice_tries       = 1,                    \
 411        .per_cpu_gain           = 100,                  \
 412        .flags                  = SD_BALANCE_NEWIDLE    \
 413                                | SD_BALANCE_EXEC       \
 414                                | SD_WAKE_AFFINE        \
 415                                | SD_WAKE_BALANCE,      \
 416        .last_balance           = jiffies,              \
 417        .balance_interval       = 1,                    \
 418        .nr_balance_failed      = 0,                    \
 419}
 420
 421#ifdef CONFIG_SCHED_MC
 422#ifndef SD_MC_INIT
 423/* for now its same as SD_CPU_INIT.
 424 * TBD: Tune Domain parameters!
 425 */
 426#define SD_MC_INIT   SD_CPU_INIT
 427#endif
 428#endif
 429
 430/* Arch can override this macro in processor.h */
 431#if defined(CONFIG_NUMA) && !defined(SD_NODE_INIT)
 432#define SD_NODE_INIT (struct sched_domain) {            \
 433        .span                   = CPU_MASK_NONE,        \
 434        .parent                 = NULL,                 \
 435        .groups                 = NULL,                 \
 436        .min_interval           = 8,                    \
 437        .max_interval           = 32,                   \
 438        .busy_factor            = 32,                   \
 439        .imbalance_pct          = 125,                  \
 440        .cache_hot_time         = (10*1000000),         \
 441        .cache_nice_tries       = 1,                    \
 442        .per_cpu_gain           = 100,                  \
 443        .flags                  = SD_BALANCE_EXEC       \
 444                                | SD_WAKE_BALANCE,      \
 445        .last_balance           = jiffies,              \
 446        .balance_interval       = 1,                    \
 447        .nr_balance_failed      = 0,                    \
 448}
 449#endif
 450#endif /* ARCH_HAS_SCHED_TUNE */
 451#endif
 452
 453
 454#define for_each_domain(cpu, domain) \
 455        for (domain = cpu_rq(cpu)->sd; domain; domain = domain->parent)
 456
 457#define cpu_rq(cpu)             (&per_cpu(runqueues, (cpu)))
 458#define this_rq()               (&__get_cpu_var(runqueues))
 459#define task_rq(p)              cpu_rq(task_cpu(p))
 460#define cpu_curr(cpu)           (cpu_rq(cpu)->curr)
 461
 462/*
 463 * Default context-switch locking:
 464 */
 465#ifndef prepare_arch_switch
 466# define prepare_arch_switch(rq, next)  do { } while (0)
 467# define finish_arch_switch(rq, next)   spin_unlock_irq(&(rq)->lock)
 468# define task_running(rq, p)            ((rq)->curr == (p))
 469#endif
 470
 471/*
 472 * task_rq_lock - lock the runqueue a given task resides on and disable
 473 * interrupts.  Note the ordering: we can safely lookup the task_rq without
 474 * explicitly disabling preemption.
 475 */
 476static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
 477{
 478        struct runqueue *rq;
 479
 480repeat_lock_task:
 481        local_irq_save(*flags);
 482        rq = task_rq(p);
 483        spin_lock(&rq->lock);
 484        if (unlikely(rq != task_rq(p))) {
 485                spin_unlock_irqrestore(&rq->lock, *flags);
 486                goto repeat_lock_task;
 487        }
 488        return rq;
 489}
 490
 491static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
 492{
 493        spin_unlock_irqrestore(&rq->lock, *flags);
 494}
 495
 496#ifdef CONFIG_SCHEDSTATS
 497/*
 498 * bump this up when changing the output format or the meaning of an existing
 499 * format, so that tools can adapt (or abort)
 500 */
 501#define SCHEDSTAT_VERSION 10
 502
 503static int show_schedstat(struct seq_file *seq, void *v)
 504{
 505        int cpu;
 506        enum idle_type itype;
 507
 508        seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
 509        seq_printf(seq, "timestamp %lu\n", jiffies);
 510        for_each_online_cpu(cpu) {
 511                runqueue_t *rq = cpu_rq(cpu);
 512#ifdef CONFIG_SMP
 513                struct sched_domain *sd;
 514                int dcnt = 0;
 515#endif
 516
 517                /* runqueue-specific stats */
 518                seq_printf(seq,
 519                    "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu "
 520                    "%lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
 521                    cpu, rq->yld_both_empty,
 522                    rq->yld_act_empty, rq->yld_exp_empty,
 523                    rq->yld_cnt, rq->sched_noswitch,
 524                    rq->sched_switch, rq->sched_cnt, rq->sched_goidle,
 525                    rq->alb_cnt, rq->alb_gained, rq->alb_lost,
 526                    rq->alb_failed,
 527                    rq->ttwu_cnt, rq->ttwu_moved, rq->ttwu_attempts,
 528                    rq->wunt_cnt, rq->wunt_moved,
 529                    rq->smt_cnt, rq->sbe_cnt, rq->rq_sched_info.cpu_time,
 530                    rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt);
 531
 532                for (itype = IDLE; itype < MAX_IDLE_TYPES; itype++)
 533                        seq_printf(seq, " %lu %lu", rq->pt_gained[itype],
 534                                                    rq->pt_lost[itype]);
 535                seq_printf(seq, "\n");
 536
 537#ifdef CONFIG_SMP
 538                /* domain-specific stats */
 539                for_each_domain(cpu, sd) {
 540                        char mask_str[NR_CPUS];
 541
 542                        cpumask_scnprintf(mask_str, NR_CPUS, sd->span);
 543                        seq_printf(seq, "domain%d %s", dcnt++, mask_str);
 544                        for (itype = IDLE; itype < MAX_IDLE_TYPES; itype++) {
 545                                seq_printf(seq, " %lu %lu %lu %lu %lu",
 546                                    sd->lb_cnt[itype],
 547                                    sd->lb_failed[itype],
 548                                    sd->lb_imbalance[itype],
 549                                    sd->lb_nobusyq[itype],
 550                                    sd->lb_nobusyg[itype]);
 551                        }
 552                        seq_printf(seq, " %lu %lu %lu %lu\n",
 553                            sd->sbe_pushed, sd->sbe_attempts,
 554                            sd->ttwu_wake_affine, sd->ttwu_wake_balance);
 555                }
 556#endif
 557        }
 558        return 0;
 559}
 560
 561static int schedstat_open(struct inode *inode, struct file *file)
 562{
 563        unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
 564        char *buf = kmalloc(size, GFP_KERNEL);
 565        struct seq_file *m;
 566        int res;
 567
 568        if (!buf)
 569                return -ENOMEM;
 570        res = single_open(file, show_schedstat, NULL);
 571        if (!res) {
 572                m = file->private_data;
 573                m->buf = buf;
 574                m->size = size;
 575        } else
 576                kfree(buf);
 577        return res;
 578}
 579
 580struct file_operations proc_schedstat_operations = {
 581        .open    = schedstat_open,
 582        .read    = seq_read,
 583        .llseek  = seq_lseek,
 584        .release = single_release,
 585};
 586
 587# define schedstat_inc(rq, field)       rq->field++;
 588# define schedstat_add(rq, field, amt)  rq->field += amt;
 589#else /* !CONFIG_SCHEDSTATS */
 590# define schedstat_inc(rq, field)       do { } while (0);
 591# define schedstat_add(rq, field, amt)  do { } while (0);
 592#endif
 593
 594/*
 595 * rq_lock - lock a given runqueue and disable interrupts.
 596 */
 597static inline runqueue_t *this_rq_lock(void)
 598{
 599        runqueue_t *rq;
 600
 601        local_irq_disable();
 602        rq = this_rq();
 603        spin_lock(&rq->lock);
 604
 605        return rq;
 606}
 607
 608static inline void rq_unlock(runqueue_t *rq)
 609{
 610        spin_unlock_irq(&rq->lock);
 611}
 612
 613#ifdef CONFIG_SCHEDSTATS
 614/*
 615 * Called when a process is dequeued from the active array and given
 616 * the cpu.  We should note that with the exception of interactive
 617 * tasks, the expired queue will become the active queue after the active
 618 * queue is empty, without explicitly dequeuing and requeuing tasks in the
 619 * expired queue.  (Interactive tasks may be requeued directly to the
 620 * active queue, thus delaying tasks in the expired queue from running;
 621 * see scheduler_tick()).
 622 *
 623 * This function is only called from sched_info_arrive(), rather than
 624 * dequeue_task(). Even though a task may be queued and dequeued multiple
 625 * times as it is shuffled about, we're really interested in knowing how
 626 * long it was from the *first* time it was queued to the time that it
 627 * finally hit a cpu.
 628 */
 629static inline void sched_info_dequeued(task_t *t)
 630{
 631        t->sched_info.last_queued = 0;
 632}
 633
 634/*
 635 * Called when a task finally hits the cpu.  We can now calculate how
 636 * long it was waiting to run.  We also note when it began so that we
 637 * can keep stats on how long its timeslice is.
 638 */
 639static inline void sched_info_arrive(task_t *t)
 640{
 641        unsigned long now = jiffies, diff = 0;
 642        struct runqueue *rq = task_rq(t);
 643
 644        if (t->sched_info.last_queued)
 645                diff = now - t->sched_info.last_queued;
 646        sched_info_dequeued(t);
 647        t->sched_info.run_delay += diff;
 648        t->sched_info.last_arrival = now;
 649        t->sched_info.pcnt++;
 650
 651        if (!rq)
 652                return;
 653
 654        rq->rq_sched_info.run_delay += diff;
 655        rq->rq_sched_info.pcnt++;
 656}
 657
 658/*
 659 * Called when a process is queued into either the active or expired
 660 * array.  The time is noted and later used to determine how long we
 661 * had to wait for us to reach the cpu.  Since the expired queue will
 662 * become the active queue after active queue is empty, without dequeuing
 663 * and requeuing any tasks, we are interested in queuing to either. It
 664 * is unusual but not impossible for tasks to be dequeued and immediately
 665 * requeued in the same or another array: this can happen in sched_yield(),
 666 * set_user_nice(), and even load_balance() as it moves tasks from runqueue
 667 * to runqueue.
 668 *
 669 * This function is only called from enqueue_task(), but also only updates
 670 * the timestamp if it is already not set.  It's assumed that
 671 * sched_info_dequeued() will clear that stamp when appropriate.
 672 */
 673static inline void sched_info_queued(task_t *t)
 674{
 675        if (!t->sched_info.last_queued)
 676                t->sched_info.last_queued = jiffies;
 677}
 678
 679/*
 680 * Called when a process ceases being the active-running process, either
 681 * voluntarily or involuntarily.  Now we can calculate how long we ran.
 682 */
 683static inline void sched_info_depart(task_t *t)
 684{
 685        struct runqueue *rq = task_rq(t);
 686        unsigned long diff = jiffies - t->sched_info.last_arrival;
 687
 688        t->sched_info.cpu_time += diff;
 689
 690        if (rq)
 691                rq->rq_sched_info.cpu_time += diff;
 692}
 693
 694/*
 695 * Called when tasks are switched involuntarily due, typically, to expiring
 696 * their time slice.  (This may also be called when switching to or from
 697 * the idle task.)  We are only called when prev != next.
 698 */
 699static inline void sched_info_switch(task_t *prev, task_t *next)
 700{
 701        struct runqueue *rq = task_rq(prev);
 702
 703        /*
 704         * prev now departs the cpu.  It's not interesting to record
 705         * stats about how efficient we were at scheduling the idle
 706         * process, however.
 707         */
 708        if (prev != rq->idle)
 709                sched_info_depart(prev);
 710
 711        if (next != rq->idle)
 712                sched_info_arrive(next);
 713}
 714#else
 715#define sched_info_queued(t)            do { } while (0)
 716#define sched_info_switch(t, next)      do { } while (0)
 717#endif /* CONFIG_SCHEDSTATS */
 718
 719/*
 720 * Adding/removing a task to/from a priority array:
 721 */
 722static void dequeue_task(struct task_struct *p, prio_array_t *array)
 723{
 724        array->nr_active--;
 725        list_del(&p->run_list);
 726        if (list_empty(array->queue + p->prio))
 727                __clear_bit(p->prio, array->bitmap);
 728}
 729
 730static void enqueue_task(struct task_struct *p, prio_array_t *array)
 731{
 732        sched_info_queued(p);
 733        list_add_tail(&p->run_list, array->queue + p->prio);
 734        __set_bit(p->prio, array->bitmap);
 735        array->nr_active++;
 736        p->array = array;
 737}
 738
 739/*
 740 * Used by the migration code - we pull tasks from the head of the
 741 * remote queue so we want these tasks to show up at the head of the
 742 * local queue:
 743 */
 744static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
 745{
 746        list_add(&p->run_list, array->queue + p->prio);
 747        __set_bit(p->prio, array->bitmap);
 748        array->nr_active++;
 749        p->array = array;
 750}
 751
 752/*
 753 * effective_prio - return the priority that is based on the static
 754 * priority but is modified by bonuses/penalties.
 755 *
 756 * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
 757 * into the -5 ... 0 ... +5 bonus/penalty range.
 758 *
 759 * We use 25% of the full 0...39 priority range so that:
 760 *
 761 * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs.
 762 * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks.
 763 *
 764 * Both properties are important to certain workloads.
 765 */
 766static int effective_prio(task_t *p)
 767{
 768        int bonus, prio;
 769
 770        if (rt_task(p))
 771                return p->prio;
 772
 773        bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
 774
 775        prio = p->static_prio - bonus;
 776        if (prio < MAX_RT_PRIO)
 777                prio = MAX_RT_PRIO;
 778        if (prio > MAX_PRIO-1)
 779                prio = MAX_PRIO-1;
 780        return prio;
 781}
 782
 783/*
 784 * __activate_task - move a task to the runqueue.
 785 */
 786static inline void __activate_task(task_t *p, runqueue_t *rq)
 787{
 788        enqueue_task(p, rq->active);
 789        rq->nr_running++;
 790}
 791
 792/*
 793 * __activate_idle_task - move idle task to the _front_ of runqueue.
 794 */
 795static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
 796{
 797        enqueue_task_head(p, rq->active);
 798        rq->nr_running++;
 799}
 800
 801static void recalc_task_prio(task_t *p, unsigned long long now)
 802{
 803        unsigned long long __sleep_time = now - p->timestamp;
 804        unsigned long sleep_time;
 805
 806        if (__sleep_time > NS_MAX_SLEEP_AVG)
 807                sleep_time = NS_MAX_SLEEP_AVG;
 808        else
 809                sleep_time = (unsigned long)__sleep_time;
 810
 811        if (likely(sleep_time > 0)) {
 812                /*
 813                 * User tasks that sleep a long time are categorised as
 814                 * idle and will get just interactive status to stay active &
 815                 * prevent them suddenly becoming cpu hogs and starving
 816                 * other processes.
 817                 */
 818                if (p->mm && p->activated != -1 &&
 819                        sleep_time > INTERACTIVE_SLEEP(p)) {
 820                                p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG -
 821                                                DEF_TIMESLICE);
 822                                if (!HIGH_CREDIT(p))
 823                                        p->interactive_credit++;
 824                } else {
 825                        /*
 826                         * The lower the sleep avg a task has the more
 827                         * rapidly it will rise with sleep time.
 828                         */
 829                        sleep_time *= (MAX_BONUS - CURRENT_BONUS(p)) ? : 1;
 830
 831                        /*
 832                         * Tasks with low interactive_credit are limited to
 833                         * one timeslice worth of sleep avg bonus.
 834                         */
 835                        if (LOW_CREDIT(p) &&
 836                            sleep_time > JIFFIES_TO_NS(task_timeslice(p)))
 837                                sleep_time = JIFFIES_TO_NS(task_timeslice(p));
 838
 839                        /*
 840                         * Non high_credit tasks waking from uninterruptible
 841                         * sleep are limited in their sleep_avg rise as they
 842                         * are likely to be cpu hogs waiting on I/O
 843                         */
 844                        if (p->activated == -1 && !HIGH_CREDIT(p) && p->mm) {
 845                                if (p->sleep_avg >= INTERACTIVE_SLEEP(p))
 846                                        sleep_time = 0;
 847                                else if (p->sleep_avg + sleep_time >=
 848                                                INTERACTIVE_SLEEP(p)) {
 849                                        p->sleep_avg = INTERACTIVE_SLEEP(p);
 850                                        sleep_time = 0;
 851                                }
 852                        }
 853
 854                        /*
 855                         * This code gives a bonus to interactive tasks.
 856                         *
 857                         * The boost works by updating the 'average sleep time'
 858                         * value here, based on ->timestamp. The more time a
 859                         * task spends sleeping, the higher the average gets -
 860                         * and the higher the priority boost gets as well.
 861                         */
 862                        p->sleep_avg += sleep_time;
 863
 864                        if (p->sleep_avg > NS_MAX_SLEEP_AVG) {
 865                                p->sleep_avg = NS_MAX_SLEEP_AVG;
 866                                if (!HIGH_CREDIT(p))
 867                                        p->interactive_credit++;
 868                        }
 869                }
 870        }
 871
 872        p->prio = effective_prio(p);
 873}
 874
 875/*
 876 * activate_task - move a task to the runqueue and do priority recalculation
 877 *
 878 * Update all the scheduling statistics stuff. (sleep average
 879 * calculation, priority modifiers, etc.)
 880 */
 881static void activate_task(task_t *p, runqueue_t *rq, int local)
 882{
 883        unsigned long long now;
 884
 885        now = sched_clock();
 886#ifdef CONFIG_SMP
 887        if (!local) {
 888                /* Compensate for drifting sched_clock */
 889                runqueue_t *this_rq = this_rq();
 890                now = (now - this_rq->timestamp_last_tick)
 891                        + rq->timestamp_last_tick;
 892        }
 893#endif
 894
 895        recalc_task_prio(p, now);
 896
 897        /*
 898         * This checks to make sure it's not an uninterruptible task
 899         * that is now waking up.
 900         */
 901        if (!p->activated) {
 902                /*
 903                 * Tasks which were woken up by interrupts (ie. hw events)
 904                 * are most likely of interactive nature. So we give them
 905                 * the credit of extending their sleep time to the period
 906                 * of time they spend on the runqueue, waiting for execution
 907                 * on a CPU, first time around:
 908                 */
 909                if (in_interrupt())
 910                        p->activated = 2;
 911                else {
 912                        /*
 913                         * Normal first-time wakeups get a credit too for
 914                         * on-runqueue time, but it will be weighted down:
 915                         */
 916                        p->activated = 1;
 917                }
 918        }
 919        p->timestamp = now;
 920
 921        __activate_task(p, rq);
 922}
 923
 924/*
 925 * deactivate_task - remove a task from the runqueue.
 926 */
 927static void deactivate_task(struct task_struct *p, runqueue_t *rq)
 928{
 929        rq->nr_running--;
 930        dequeue_task(p, p->array);
 931        p->array = NULL;
 932}
 933
 934/*
 935 * resched_task - mark a task 'to be rescheduled now'.
 936 *
 937 * On UP this means the setting of the need_resched flag, on SMP it
 938 * might also involve a cross-CPU call to trigger the scheduler on
 939 * the target CPU.
 940 */
 941#ifdef CONFIG_SMP
 942static void resched_task(task_t *p)
 943{
 944        int need_resched, nrpolling;
 945
 946        BUG_ON(!spin_is_locked(&task_rq(p)->lock));
 947
 948        /* minimise the chance of sending an interrupt to poll_idle() */
 949        nrpolling = test_tsk_thread_flag(p,TIF_POLLING_NRFLAG);
 950        need_resched = test_and_set_tsk_thread_flag(p,TIF_NEED_RESCHED);
 951        nrpolling |= test_tsk_thread_flag(p,TIF_POLLING_NRFLAG);
 952
 953        if (!need_resched && !nrpolling && (task_cpu(p) != smp_processor_id()))
 954                smp_send_reschedule(task_cpu(p));
 955}
 956#else
 957static inline void resched_task(task_t *p)
 958{
 959        set_tsk_need_resched(p);
 960}
 961#endif
 962
 963/**
 964 * task_curr - is this task currently executing on a CPU?
 965 * @p: the task in question.
 966 */
 967inline int task_curr(const task_t *p)
 968{
 969        return cpu_curr(task_cpu(p)) == p;
 970}
 971
 972int wake_balance=1;
 973
 974#ifdef CONFIG_SMP
 975
 976#define LOWLAT_ENABLED()                (wake_balance == 2)
 977
 978enum request_type {
 979        REQ_MOVE_TASK,
 980        REQ_SET_DOMAIN,
 981};
 982
 983typedef struct {
 984        struct list_head list;
 985        enum request_type type;
 986
 987        /* For REQ_MOVE_TASK */
 988        task_t *task;
 989        int dest_cpu;
 990
 991        /* For REQ_SET_DOMAIN */
 992        struct sched_domain *sd;
 993
 994        struct completion done;
 995} migration_req_t;
 996
 997/*
 998 * The task's runqueue lock must be held.
 999 * Returns true if you have to wait for migration thread.
1000 */
1001static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req)
1002{
1003        runqueue_t *rq = task_rq(p);
1004
1005        /*
1006         * If the task is not on a runqueue (and not running), then
1007         * it is sufficient to simply update the task's cpu field.
1008         */
1009        if (!p->array && !task_running(rq, p)) {
1010                set_task_cpu(p, dest_cpu);
1011                return 0;
1012        }
1013
1014        init_completion(&req->done);
1015        req->type = REQ_MOVE_TASK;
1016        req->task = p;
1017        req->dest_cpu = dest_cpu;
1018        list_add(&req->list, &rq->migration_queue);
1019        return 1;
1020}
1021
1022/*
1023 * wait_task_inactive - wait for a thread to unschedule.
1024 *
1025 * The caller must ensure that the task *will* unschedule sometime soon,
1026 * else this function might spin for a *long* time. This function can't
1027 * be called with interrupts off, or it may introduce deadlock with
1028 * smp_call_function() if an IPI is sent by the same process we are
1029 * waiting to become inactive.
1030 */
1031void wait_task_inactive(task_t * p)
1032{
1033        unsigned long flags;
1034        runqueue_t *rq;
1035        int preempted;
1036
1037repeat:
1038        rq = task_rq_lock(p, &flags);
1039        /* Must be off runqueue entirely, not preempted. */
1040        if (unlikely(p->array || task_running(rq, p))) {
1041                /* If it's preempted, we yield.  It could be a while. */
1042                preempted = !task_running(rq, p);
1043                task_rq_unlock(rq, &flags);
1044                cpu_relax();
1045                if (preempted)
1046                        yield();
1047                goto repeat;
1048        }
1049        task_rq_unlock(rq, &flags);
1050}
1051
1052/***
1053 * kick_process - kick a running thread to enter/exit the kernel
1054 * @p: the to-be-kicked thread
1055 *
1056 * Cause a process which is running on another CPU to enter
1057 * kernel-mode, without any delay. (to get signals handled.)
1058 */
1059void kick_process(task_t *p)
1060{
1061        int cpu;
1062
1063        preempt_disable();
1064        cpu = task_cpu(p);
1065        if ((cpu != smp_processor_id()) && task_curr(p))
1066                smp_send_reschedule(cpu);
1067        preempt_enable();
1068}
1069
1070static unsigned long current_load(runqueue_t *rq)
1071{
1072        unsigned long load = rq->nr_running * SCHED_LOAD_SCALE;
1073
1074        if (LOWLAT_ENABLED())
1075                load += (SCHED_LOAD_SCALE * rq->irq_pct) / 100;
1076        return load;
1077}
1078
1079/*
1080 * Return a low guess at the load of a migration-source cpu.
1081 *
1082 * We want to under-estimate the load of migration sources, to
1083 * balance conservatively.
1084 */
1085static inline unsigned long source_load(int cpu)
1086{
1087        runqueue_t *rq = cpu_rq(cpu);
1088
1089        return min(rq->cpu_load, current_load(rq));
1090}
1091
1092/*
1093 * Return a high guess at the load of a migration-target cpu
1094 */
1095static inline unsigned long target_load(int cpu)
1096{
1097        runqueue_t *rq = cpu_rq(cpu);
1098
1099        return max(rq->cpu_load, current_load(rq));
1100}
1101
1102#endif
1103
1104/*
1105 * wake_idle() is useful especially on SMT architectures to wake a
1106 * task onto an idle sibling if we would otherwise wake it onto a
1107 * busy sibling.
1108 *
1109 * Returns the CPU we should wake onto.
1110 */
1111#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
1112static int wake_idle(int cpu, task_t *p)
1113{
1114        cpumask_t tmp;
1115        runqueue_t *rq = cpu_rq(cpu);
1116        struct sched_domain *sd;
1117        int i;
1118        unsigned long load = current_load(rq) + SCHED_LOAD_SCALE;
1119
1120        if (LOWLAT_ENABLED()) {
1121                if (idle_cpu(cpu))
1122                        load -= SCHED_LOAD_SCALE;
1123                if (load <= SCHED_LOAD_SCALE)
1124                        return cpu;
1125        } else if (idle_cpu(cpu)) {
1126                return cpu;
1127        }
1128
1129        sd = rq->sd;
1130        if (!(sd->flags & SD_WAKE_IDLE) && !LOWLAT_ENABLED())
1131                return cpu;
1132
1133        cpus_and(tmp, sd->span, p->cpus_allowed);
1134
1135        for_each_cpu_mask(i, tmp) {
1136                if (LOWLAT_ENABLED()) {
1137                        unsigned long l = current_load(cpu_rq(i))
1138                                                        + SCHED_LOAD_SCALE;
1139
1140                        if (idle_cpu(i))
1141                                l -= SCHED_LOAD_SCALE;
1142                        if (l <= SCHED_LOAD_SCALE)
1143                                return i;
1144                        if (l < load) {
1145                                cpu = i;
1146                                load = l;
1147                        }
1148                } else if (idle_cpu(i)) {
1149                        return i;
1150                }
1151        }
1152
1153        return cpu;
1154}
1155#else
1156static inline int wake_idle(int cpu, task_t *p)
1157{
1158        return cpu;
1159}
1160#endif
1161
1162/***
1163 * try_to_wake_up - wake up a thread
1164 * @p: the to-be-woken-up thread
1165 * @state: the mask of task states that can be woken
1166 * @sync: do a synchronous wakeup?
1167 *
1168 * Put it on the run-queue if it's not already there. The "current"
1169 * thread is always on the run-queue (except when the actual
1170 * re-schedule is in progress), and as such you're allowed to do
1171 * the simpler "current->state = TASK_RUNNING" to mark yourself
1172 * runnable without the overhead of this.
1173 *
1174 * returns failure only if the task is already active.
1175 */
1176static int try_to_wake_up(task_t * p, unsigned int state, int sync)
1177{
1178        int cpu, this_cpu, success = 0;
1179        unsigned long flags;
1180        long old_state;
1181        runqueue_t *rq;
1182#ifdef CONFIG_SMP
1183        unsigned long load, this_load;
1184        struct sched_domain *sd;
1185        int new_cpu;
1186#endif
1187
1188        rq = task_rq_lock(p, &flags);
1189        schedstat_inc(rq, ttwu_cnt);
1190        old_state = p->state;
1191        if (!(old_state & state))
1192                goto out;
1193
1194        if (p->array)
1195                goto out_running;
1196
1197        cpu = task_cpu(p);
1198        this_cpu = smp_processor_id();
1199
1200#ifdef CONFIG_SMP
1201        if (unlikely(task_running(rq, p)))
1202                goto out_activate;
1203
1204        new_cpu = cpu;
1205
1206        if (!wake_balance)
1207                goto out_set_cpu;
1208
1209        if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
1210                goto out_set_cpu;
1211
1212        load = source_load(cpu);
1213        this_load = target_load(this_cpu);
1214
1215        /*
1216         * If sync wakeup then subtract the (maximum possible) effect of
1217         * the currently running task from the load of the current CPU:
1218         */
1219        if (sync)
1220                this_load -= SCHED_LOAD_SCALE;
1221
1222        /* Don't pull the task off an idle CPU to a busy one */
1223        if (load < SCHED_LOAD_SCALE/2 && this_load > SCHED_LOAD_SCALE/2)
1224                goto out_set_cpu;
1225
1226        new_cpu = this_cpu; /* Wake to this CPU if we can */
1227
1228        /*
1229         * Scan domains for affine wakeup and passive balancing
1230         * possibilities.
1231         */
1232        for_each_domain(this_cpu, sd) {
1233                unsigned int imbalance;
1234                /*
1235                 * Start passive balancing when half the imbalance_pct
1236                 * limit is reached.
1237                 */
1238                imbalance = sd->imbalance_pct + (sd->imbalance_pct - 100) / 2;
1239
1240                if ((sd->flags & SD_WAKE_AFFINE) &&
1241                                !task_hot(p, rq->timestamp_last_tick, sd)) {
1242                        /*
1243                         * This domain has SD_WAKE_AFFINE and p is cache cold
1244                         * in this domain.
1245                         */
1246                        if (cpu_isset(cpu, sd->span)) {
1247                                schedstat_inc(sd, ttwu_wake_affine);
1248                                goto out_set_cpu;
1249                        }
1250                } else if ((sd->flags & SD_WAKE_BALANCE) &&
1251                                imbalance*this_load <= 100*load) {
1252                        /*
1253                         * This domain has SD_WAKE_BALANCE and there is
1254                         * an imbalance.
1255                         */
1256                        if (cpu_isset(cpu, sd->span)) {
1257                                schedstat_inc(sd, ttwu_wake_balance);
1258                                goto out_set_cpu;
1259                        }
1260                }
1261        }
1262
1263        new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
1264out_set_cpu:
1265        schedstat_inc(rq, ttwu_attempts);
1266        new_cpu = wake_idle(new_cpu, p);
1267        if (new_cpu != cpu && cpu_isset(new_cpu, p->cpus_allowed)) {
1268                schedstat_inc(rq, ttwu_moved);
1269                set_task_cpu(p, new_cpu);
1270                task_rq_unlock(rq, &flags);
1271                /* might preempt at this point */
1272                rq = task_rq_lock(p, &flags);
1273                old_state = p->state;
1274                if (!(old_state & state))
1275                        goto out;
1276                if (p->array)
1277                        goto out_running;
1278
1279                this_cpu = smp_processor_id();
1280                cpu = task_cpu(p);
1281        }
1282
1283out_activate:
1284#endif /* CONFIG_SMP */
1285        if (old_state == TASK_UNINTERRUPTIBLE) {
1286                rq->nr_uninterruptible--;
1287                /*
1288                 * Tasks on involuntary sleep don't earn
1289                 * sleep_avg beyond just interactive state.
1290                 */
1291                p->activated = -1;
1292        }
1293
1294        /*
1295         * Sync wakeups (i.e. those types of wakeups where the waker
1296         * has indicated that it will leave the CPU in short order)
1297         * don't trigger a preemption, if the woken up task will run on
1298         * this cpu. (in this case the 'I will reschedule' promise of
1299         * the waker guarantees that the freshly woken up task is going
1300         * to be considered on this CPU.)
1301         */
1302        activate_task(p, rq, cpu == this_cpu);
1303        if (!sync || cpu != this_cpu) {
1304                if (TASK_PREEMPTS_CURR(p, rq))
1305                        resched_task(rq->curr);
1306        }
1307        success = 1;
1308
1309out_running:
1310        p->state = TASK_RUNNING;
1311out:
1312        task_rq_unlock(rq, &flags);
1313
1314        return success;
1315}
1316
1317int fastcall wake_up_process(task_t * p)
1318{
1319        return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
1320                                 TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
1321}
1322
1323EXPORT_SYMBOL(wake_up_process);
1324
1325int fastcall wake_up_state(task_t *p, unsigned int state)
1326{
1327        return try_to_wake_up(p, state, 0);
1328}
1329
1330#ifdef CONFIG_SMP
1331static int find_idlest_cpu(struct task_struct *p, int this_cpu,
1332                           struct sched_domain *sd);
1333#endif
1334
1335/*
1336 * Perform scheduler related setup for a newly forked process p.
1337 * p is forked by current.
1338 */
1339void fastcall sched_fork(task_t *p)
1340{
1341        /*
1342         * We mark the process as running here, but have not actually
1343         * inserted it onto the runqueue yet. This guarantees that
1344         * nobody will actually run it, and a signal or other external
1345         * event cannot wake it up and insert it on the runqueue either.
1346         */
1347        p->state = TASK_RUNNING;
1348        INIT_LIST_HEAD(&p->run_list);
1349        p->array = NULL;
1350        spin_lock_init(&p->switch_lock);
1351#ifdef CONFIG_SCHEDSTATS
1352        memset(&p->sched_info, 0, sizeof(p->sched_info));
1353#endif
1354#ifdef CONFIG_PREEMPT
1355        /*
1356         * During context-switch we hold precisely one spinlock, which
1357         * schedule_tail drops. (in the common case it's this_rq()->lock,
1358         * but it also can be p->switch_lock.) So we compensate with a count
1359         * of 1. Also, we want to start with kernel preemption disabled.
1360         */
1361        p->thread_info->preempt_count = 1;
1362#endif
1363        /*
1364         * Share the timeslice between parent and child, thus the
1365         * total amount of pending timeslices in the system doesn't change,
1366         * resulting in more scheduling fairness.
1367         */
1368        local_irq_disable();
1369        p->time_slice = (current->time_slice + 1) >> 1;
1370        /*
1371         * The remainder of the first timeslice might be recovered by
1372         * the creator if the child exits early enough.
1373         */
1374        p->first_time_slice = current->pid;
1375        current->time_slice >>= 1;
1376        p->timestamp = sched_clock();
1377        if (unlikely(!current->time_slice)) {
1378                /*
1379                 * This case is rare, it happens when the parent has only
1380                 * a single jiffy left from its timeslice. Taking the
1381                 * runqueue lock is not a problem.
1382                 */
1383                current->time_slice = 1;
1384                preempt_disable();
1385                scheduler_tick(0, 0);
1386                local_irq_enable();
1387                preempt_enable();
1388        } else
1389                local_irq_enable();
1390}
1391
1392/*
1393 * wake_up_new_task - wake up a newly created task for the first time.
1394 *
1395 * This function will do some initial scheduler statistics housekeeping
1396 * that must be done for every newly created context, then puts the task
1397 * on the runqueue and wakes it.
1398 */
1399void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags)
1400{
1401        unsigned long flags;
1402        int this_cpu, cpu;
1403        runqueue_t *rq, *this_rq;
1404
1405        rq = task_rq_lock(p, &flags);
1406        cpu = task_cpu(p);
1407        this_cpu = smp_processor_id();
1408
1409        BUG_ON(p->state != TASK_RUNNING);
1410
1411        schedstat_inc(rq, wunt_cnt);
1412        /*
1413         * We decrease the sleep average of forking parents
1414         * and children as well, to keep max-interactive tasks
1415         * from forking tasks that are max-interactive. The parent
1416         * (current) is done further down, under its lock.
1417         */
1418        p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
1419                CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
1420
1421        p->interactive_credit = 0;
1422
1423        p->prio = effective_prio(p);
1424
1425        if (likely(cpu == this_cpu)) {
1426                if (!(clone_flags & CLONE_VM)) {
1427                        /*
1428                         * The VM isn't cloned, so we're in a good position to
1429                         * do child-runs-first in anticipation of an exec. This
1430                         * usually avoids a lot of COW overhead.
1431                         */
1432                        if (unlikely(!current->array))
1433                                __activate_task(p, rq);
1434                        else {
1435                                p->prio = current->prio;
1436                                list_add_tail(&p->run_list, &current->run_list);
1437                                p->array = current->array;
1438                                p->array->nr_active++;
1439                                rq->nr_running++;
1440                        }
1441                        set_need_resched();
1442                } else
1443                        /* Run child last */
1444                        __activate_task(p, rq);
1445                /*
1446                 * We skip the following code due to cpu == this_cpu
1447                 *
1448                 *   task_rq_unlock(rq, &flags);
1449                 *   this_rq = task_rq_lock(current, &flags);
1450                 */
1451                this_rq = rq;
1452        } else {
1453                this_rq = cpu_rq(this_cpu);
1454
1455                /*
1456                 * Not the local CPU - must adjust timestamp. This should
1457                 * get optimised away in the !CONFIG_SMP case.
1458                 */
1459                p->timestamp = (p->timestamp - this_rq->timestamp_last_tick)
1460                                        + rq->timestamp_last_tick;
1461                __activate_task(p, rq);
1462                if (TASK_PREEMPTS_CURR(p, rq))
1463                        resched_task(rq->curr);
1464
1465                schedstat_inc(rq, wunt_moved);
1466                /*
1467                 * Parent and child are on different CPUs, now get the
1468                 * parent runqueue to update the parent's ->sleep_avg:
1469                 */
1470                task_rq_unlock(rq, &flags);
1471                this_rq = task_rq_lock(current, &flags);
1472        }
1473        current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
1474                PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
1475        task_rq_unlock(this_rq, &flags);
1476}
1477
1478/*
1479 * Potentially available exiting-child timeslices are
1480 * retrieved here - this way the creator does not get
1481 * penalized for creating too many threads.
1482 *
1483 * (this cannot be used to 'generate' timeslices
1484 * artificially, because any timeslice recovered here
1485 * was given away by the parent in the first place.)
1486 */
1487void fastcall sched_exit(task_t * p)
1488{
1489        unsigned long flags;
1490        runqueue_t *rq;
1491        struct task_struct* creator = NULL;
1492
1493        /*
1494         * If the child was a (relative-) CPU hog then decrease
1495         * the sleep_avg of the creator as well.
1496         */
1497        if (p->first_time_slice) {
1498                creator = find_task_by_pid((pid_t)p->first_time_slice);
1499                if (creator && task_cpu(p) == task_cpu(creator)) {
1500                        rq = task_rq_lock(creator, &flags);
1501                        creator->time_slice += p->time_slice;
1502                        if (unlikely(creator->time_slice > task_timeslice(p)))
1503                                creator->time_slice = task_timeslice(p);
1504
1505                        if (p->sleep_avg < creator->sleep_avg)
1506                                creator->sleep_avg = creator->sleep_avg /
1507                (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg /
1508                (EXIT_WEIGHT + 1);
1509        task_rq_unlock(rq, &flags);
1510                }
1511        }
1512}
1513
1514/**
1515 * finish_task_switch - clean up after a task-switch
1516 * @prev: the thread we just switched away from.
1517 *
1518 * We enter this with the runqueue still locked, and finish_arch_switch()
1519 * will unlock it along with doing any other architecture-specific cleanup
1520 * actions.
1521 *
1522 * Note that we may have delayed dropping an mm in context_switch(). If
1523 * so, we finish that here outside of the runqueue lock.  (Doing it
1524 * with the lock held can cause deadlocks; see schedule() for
1525 * details.)
1526 */
1527static inline void finish_task_switch(task_t *prev)
1528{
1529        runqueue_t *rq = this_rq();
1530        struct mm_struct *mm = rq->prev_mm;
1531        unsigned long prev_task_flags;
1532
1533        rq->prev_mm = NULL;
1534
1535        /*
1536         * A task struct has one reference for the use as "current".
1537         * If a task dies, then it sets EXIT_ZOMBIE in tsk->exit_state and
1538         * calls schedule one last time. The schedule call will never return,
1539         * and the scheduled task must drop that reference.
1540         * The test for EXIT_ZOMBIE must occur while the runqueue locks are
1541         * still held, otherwise prev could be scheduled on another cpu, die
1542         * there before we look at prev->state, and then the reference would
1543         * be dropped twice.
1544         *              Manfred Spraul <manfred@colorfullife.com>
1545         */
1546        prev_task_flags = prev->flags;
1547        finish_arch_switch(rq, prev);
1548        if (mm)
1549                mmdrop(mm);
1550        if (unlikely(prev_task_flags & PF_DEAD)) {
1551                /*
1552                 * Remove function-return probe instances associated with this task
1553                 * and put them back on the free list.
1554                 */
1555                kprobe_flush_task(prev);
1556                put_task_struct(prev);
1557        }
1558}
1559
1560/**
1561 * schedule_tail - first thing a freshly forked thread must call.
1562 * @prev: the thread we just switched away from.
1563 */
1564asmlinkage void schedule_tail(task_t *prev)
1565{
1566        finish_task_switch(prev);
1567
1568        if (current->set_child_tid)
1569                put_user(current->pid, current->set_child_tid);
1570}
1571
1572/*
1573 * context_switch - switch to the new MM and the new
1574 * thread's register state.
1575 */
1576static inline
1577task_t * context_switch(runqueue_t *rq, task_t *prev, task_t *next)
1578{
1579        struct mm_struct *mm = next->mm;
1580        struct mm_struct *oldmm = prev->active_mm;
1581
1582        if (unlikely(!mm)) {
1583                next->active_mm = oldmm;
1584                atomic_inc(&oldmm->mm_count);
1585                enter_lazy_tlb(oldmm, next);
1586        } else
1587                switch_mm(oldmm, mm, next);
1588
1589        if (unlikely(!prev->mm)) {
1590                prev->active_mm = NULL;
1591                WARN_ON(rq->prev_mm);
1592                rq->prev_mm = oldmm;
1593        }
1594
1595        /* Here we just switch the register state and the stack. */
1596        switch_to(prev, next, prev);
1597
1598        return prev;
1599}
1600
1601/*
1602 * nr_running, nr_uninterruptible and nr_context_switches:
1603 *
1604 * externally visible scheduler statistics: current number of runnable
1605 * threads, current number of uninterruptible-sleeping threads, total
1606 * number of context switches performed since bootup.
1607 */
1608unsigned long nr_running(void)
1609{
1610        unsigned long i, sum = 0;
1611
1612        for_each_online_cpu(i)
1613                sum += cpu_rq(i)->nr_running;
1614
1615        return sum;
1616}
1617
1618unsigned long nr_uninterruptible(void)
1619{
1620        unsigned long i, sum = 0;
1621
1622        for_each_cpu(i)
1623                sum += cpu_rq(i)->nr_uninterruptible;
1624
1625        /*
1626         * Since we read the counters lockless, it might be slightly
1627         * inaccurate. Do not allow it to go below zero though:
1628         */
1629        if (unlikely((long)sum < 0))
1630                sum = 0;
1631
1632        return sum;
1633}
1634
1635unsigned long long nr_context_switches(void)
1636{
1637        unsigned long long i, sum = 0;
1638
1639        for_each_cpu(i)
1640                sum += cpu_rq(i)->nr_switches;
1641
1642        return sum;
1643}
1644
1645unsigned long nr_iowait(void)
1646{
1647        unsigned long i, sum = 0;
1648
1649        for_each_cpu(i)
1650                sum += atomic_read(&cpu_rq(i)->nr_iowait);
1651
1652        return sum;
1653}
1654
1655/* cpus with isolated domains */
1656cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE;
1657
1658#ifdef CONFIG_SMP
1659
1660/*
1661 * double_rq_lock - safely lock two runqueues
1662 *
1663 * We must take them in cpu order to match code in
1664 * dependent_sleeper and wake_dependent_sleeper.
1665 *
1666 * Note this does not disable interrupts like task_rq_lock,
1667 * you need to do so manually before calling.
1668 */
1669static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
1670{
1671        if (rq1 == rq2)
1672                spin_lock(&rq1->lock);
1673        else {
1674                if (rq1->cpu < rq2->cpu) {
1675                        spin_lock(&rq1->lock);
1676                        spin_lock(&rq2->lock);
1677                } else {
1678                        spin_lock(&rq2->lock);
1679                        spin_lock(&rq1->lock);
1680                }
1681        }
1682}
1683
1684/*
1685 * double_rq_unlock - safely unlock two runqueues
1686 *
1687 * Note this does not restore interrupts like task_rq_unlock,
1688 * you need to do so manually after calling.
1689 */
1690static void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2)
1691{
1692        spin_unlock(&rq1->lock);
1693        if (rq1 != rq2)
1694                spin_unlock(&rq2->lock);
1695}
1696
1697/*
1698 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1699 */
1700static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest)
1701{
1702        if (unlikely(!spin_trylock(&busiest->lock))) {
1703                if (busiest->cpu < this_rq->cpu) {
1704                        spin_unlock(&this_rq->lock);
1705                        spin_lock(&busiest->lock);
1706                        spin_lock(&this_rq->lock);
1707                } else
1708                        spin_lock(&busiest->lock);
1709        }
1710}
1711
1712/*
1713 * find_idlest_cpu - find the least busy runqueue.
1714 */
1715static int find_idlest_cpu(struct task_struct *p, int this_cpu,
1716                           struct sched_domain *sd)
1717{
1718        unsigned long load, min_load, this_load;
1719        int i, min_cpu;
1720        cpumask_t mask;
1721
1722        min_cpu = UINT_MAX;
1723        min_load = ULONG_MAX;
1724
1725        cpus_and(mask, sd->span, p->cpus_allowed);
1726
1727        for_each_cpu_mask(i, mask) {
1728                load = target_load(i);
1729
1730                if (load < min_load) {
1731                        min_cpu = i;
1732                        min_load = load;
1733
1734                        /* break out early on an idle CPU: */
1735                        if (!min_load)
1736                                break;
1737                }
1738        }
1739
1740        /* add +1 to account for the new task */
1741        this_load = source_load(this_cpu) + SCHED_LOAD_SCALE;
1742
1743        /*
1744         * Would with the addition of the new task to the
1745         * current CPU there be an imbalance between this
1746         * CPU and the idlest CPU?
1747         *
1748         * Use half of the balancing threshold - new-context is
1749         * a good opportunity to balance.
1750         */
1751        if (min_load*(100 + (sd->imbalance_pct-100)/2) < this_load*100)
1752                return min_cpu;
1753
1754        return this_cpu;
1755}
1756
1757/*
1758 * If dest_cpu is allowed for this process, migrate the task to it.
1759 * This is accomplished by forcing the cpu_allowed mask to only
1760 * allow dest_cpu, which will force the cpu onto dest_cpu.  Then
1761 * the cpu_allowed mask is restored.
1762 */
1763static void sched_migrate_task(task_t *p, int dest_cpu)
1764{
1765        migration_req_t req;
1766        runqueue_t *rq;
1767        unsigned long flags;
1768
1769        rq = task_rq_lock(p, &flags);
1770        if (!cpu_isset(dest_cpu, p->cpus_allowed)
1771            || unlikely(cpu_is_offline(dest_cpu)))
1772                goto out;
1773
1774        schedstat_inc(rq, smt_cnt);
1775        /* force the process onto the specified CPU */
1776        if (migrate_task(p, dest_cpu, &req)) {
1777                /* Need to wait for migration thread (might exit: take ref). */
1778                struct task_struct *mt = rq->migration_thread;
1779                get_task_struct(mt);
1780                task_rq_unlock(rq, &flags);
1781                wake_up_process(mt);
1782                put_task_struct(mt);
1783                wait_for_completion(&req.done);
1784                return;
1785        }
1786out:
1787        task_rq_unlock(rq, &flags);
1788}
1789
1790/*
1791 * sched_exec(): find the highest-level, exec-balance-capable
1792 * domain and try to migrate the task to the least loaded CPU.
1793 *
1794 * execve() is a valuable balancing opportunity, because at this point
1795 * the task has the smallest effective memory and cache footprint.
1796 */
1797void sched_exec(void)
1798{
1799        struct sched_domain *tmp, *sd = NULL;
1800        int new_cpu, this_cpu = get_cpu();
1801
1802        schedstat_inc(this_rq(), sbe_cnt);
1803        /* Prefer the current CPU if there's only this task running */
1804        if (this_rq()->nr_running <= 1)
1805                goto out;
1806
1807        for_each_domain(this_cpu, tmp)
1808                if (tmp->flags & SD_BALANCE_EXEC)
1809                        sd = tmp;
1810
1811        if (sd) {
1812                schedstat_inc(sd, sbe_attempts);
1813                new_cpu = find_idlest_cpu(current, this_cpu, sd);
1814                if (new_cpu != this_cpu) {
1815                        schedstat_inc(sd, sbe_pushed);
1816                        put_cpu();
1817                        sched_migrate_task(current, new_cpu);
1818                        return;
1819                }
1820        }
1821out:
1822        put_cpu();
1823}
1824
1825/*
1826 * pull_task - move a task from a remote runqueue to the local runqueue.
1827 * Both runqueues must be locked.
1828 */
1829static inline
1830void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
1831               runqueue_t *this_rq, prio_array_t *this_array, int this_cpu)
1832{
1833        dequeue_task(p, src_array);
1834        src_rq->nr_running--;
1835        set_task_cpu(p, this_cpu);
1836        this_rq->nr_running++;
1837        enqueue_task(p, this_array);
1838        p->timestamp = (p->timestamp - src_rq->timestamp_last_tick)
1839                                + this_rq->timestamp_last_tick;
1840        /*
1841         * Note that idle threads have a prio of MAX_PRIO, for this test
1842         * to be always true for them.
1843         */
1844        if (TASK_PREEMPTS_CURR(p, this_rq))
1845                resched_task(this_rq->curr);
1846}
1847
1848/*
1849 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
1850 */
1851static inline
1852int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
1853                     struct sched_domain *sd, enum idle_type idle, int *all_pinned)
1854{
1855        /*
1856         * We do not migrate tasks that are:
1857         * 1) running (obviously), or
1858         * 2) cannot be migrated to this CPU due to cpus_allowed, or
1859         * 3) are cache-hot on their current CPU.
1860         */
1861        if (task_running(rq, p))
1862                return 0;
1863        if (!cpu_isset(this_cpu, p->cpus_allowed))
1864                return 0;
1865
1866        *all_pinned = 0;
1867
1868        /* Aggressive migration if we've failed balancing */
1869        if (idle == NEWLY_IDLE ||
1870                        sd->nr_balance_failed < sd->cache_nice_tries) {
1871                if (task_hot(p, rq->timestamp_last_tick, sd))
1872                        return 0;
1873        }
1874
1875        return 1;
1876}
1877
1878/*
1879 * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq,
1880 * as part of a balancing operation within "domain". Returns the number of
1881 * tasks moved.
1882 *
1883 * Called with both runqueues locked.
1884 */
1885static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
1886                      unsigned long max_nr_move, struct sched_domain *sd,
1887                      enum idle_type idle, int *all_pinned)
1888{
1889        prio_array_t *array, *dst_array;
1890        struct list_head *head, *curr;
1891        int idx, pulled = 0, pinned = 0;
1892        task_t *tmp;
1893
1894        if (max_nr_move <= 0 || busiest->nr_running <= 1)
1895                goto out;
1896
1897        pinned=1;
1898
1899        /*
1900         * We first consider expired tasks. Those will likely not be
1901         * executed in the near future, and they are most likely to
1902         * be cache-cold, thus switching CPUs has the least effect
1903         * on them.
1904         */
1905        if (busiest->expired->nr_active) {
1906                array = busiest->expired;
1907                dst_array = this_rq->expired;
1908        } else {
1909                array = busiest->active;
1910                dst_array = this_rq->active;
1911        }
1912
1913new_array:
1914        /* Start searching at priority 0: */
1915        idx = 0;
1916skip_bitmap:
1917        if (!idx)
1918                idx = sched_find_first_bit(array->bitmap);
1919        else
1920                idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
1921        if (idx >= MAX_PRIO) {
1922                if (array == busiest->expired && busiest->active->nr_active) {
1923                        array = busiest->active;
1924                        dst_array = this_rq->active;
1925                        goto new_array;
1926                }
1927                goto out;
1928        }
1929
1930        head = array->queue + idx;
1931        curr = head->prev;
1932skip_queue:
1933        tmp = list_entry(curr, task_t, run_list);
1934
1935        curr = curr->prev;
1936
1937        if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) {
1938                if (curr != head)
1939                        goto skip_queue;
1940                idx++;
1941                goto skip_bitmap;
1942        }
1943
1944        /*
1945         * Right now, this is the only place pull_task() is called,
1946         * so we can safely collect pull_task() stats here rather than
1947         * inside pull_task().
1948         */
1949        schedstat_inc(this_rq, pt_gained[idle]);
1950        schedstat_inc(busiest, pt_lost[idle]);
1951
1952        pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
1953        pulled++;
1954
1955        /* We only want to steal up to the prescribed number of tasks. */
1956        if (pulled < max_nr_move) {
1957                if (curr != head)
1958                        goto skip_queue;
1959                idx++;
1960                goto skip_bitmap;
1961        }
1962out:
1963        if (all_pinned)
1964                *all_pinned = pinned;
1965
1966        return pulled;
1967}
1968
1969/*
1970 * find_busiest_group finds and returns the busiest CPU group within the
1971 * domain. It calculates and returns the number of tasks which should be
1972 * moved to restore balance via the imbalance parameter.
1973 */
1974static struct sched_group *
1975find_busiest_group(struct sched_domain *sd, int this_cpu,
1976                   unsigned long *imbalance, enum idle_type idle)
1977{
1978        struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
1979        unsigned long max_load, avg_load, total_load, this_load, total_pwr;
1980
1981        max_load = this_load = total_load = total_pwr = 0;
1982
1983        do {
1984                unsigned long load;
1985                int local_group;
1986                int i, nr_cpus = 0;
1987
1988                local_group = cpu_isset(this_cpu, group->cpumask);
1989
1990                /* Tally up the load of all CPUs in the group */
1991                avg_load = 0;
1992
1993                for_each_cpu_mask(i, group->cpumask) {
1994                        /* Bias balancing toward cpus of our domain */
1995                        if (local_group)
1996                                load = target_load(i);
1997                        else
1998                                load = source_load(i);
1999
2000                        nr_cpus++;
2001                        avg_load += load;
2002                }
2003
2004                if (!nr_cpus)
2005                        goto nextgroup;
2006
2007                total_load += avg_load;
2008                total_pwr += group->cpu_power;
2009
2010                /* Adjust by relative CPU power of the group */
2011                avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
2012
2013                if (local_group) {
2014                        this_load = avg_load;
2015                        this = group;
2016                        goto nextgroup;
2017                } else if (avg_load > max_load) {
2018                        max_load = avg_load;
2019                        busiest = group;
2020                }
2021nextgroup:
2022                group = group->next;
2023        } while (group != sd->groups);
2024
2025        if (!busiest || this_load >= max_load)
2026                goto out_balanced;
2027
2028        avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
2029
2030        if (this_load >= avg_load ||
2031                        100*max_load <= sd->imbalance_pct*this_load)
2032                goto out_balanced;
2033
2034        /*
2035         * We're trying to get all the cpus to the average_load, so we don't
2036         * want to push ourselves above the average load, nor do we wish to
2037         * reduce the max loaded cpu below the average load, as either of these
2038         * actions would just result in more rebalancing later, and ping-pong
2039         * tasks around. Thus we look for the minimum possible imbalance.
2040         * Negative imbalances (*we* are more loaded than anyone else) will
2041         * be counted as no imbalance for these purposes -- we can't fix that
2042         * by pulling tasks to us.  Be careful of negative numbers as they'll
2043         * appear as very large values with unsigned longs.
2044         */
2045        *imbalance = min(max_load - avg_load, avg_load - this_load);
2046
2047        /* How much load to actually move to equalise the imbalance */
2048        *imbalance = (*imbalance * min(busiest->cpu_power, this->cpu_power))
2049                                / SCHED_LOAD_SCALE;
2050
2051        if (*imbalance < SCHED_LOAD_SCALE - 1) {
2052                unsigned long pwr_now = 0, pwr_move = 0;
2053                unsigned long tmp;
2054
2055                if (max_load - this_load >= SCHED_LOAD_SCALE*2) {
2056                        *imbalance = 1;
2057                        return busiest;
2058                }
2059
2060                /*
2061                 * OK, we don't have enough imbalance to justify moving tasks,
2062                 * however we may be able to increase total CPU power used by
2063                 * moving them.
2064                 */
2065
2066                pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load);
2067                pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load);
2068                pwr_now /= SCHED_LOAD_SCALE;
2069
2070                /* Amount of load we'd subtract */
2071                tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power;
2072                if (max_load > tmp)
2073                        pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE,
2074                                                        max_load - tmp);
2075
2076                /* Amount of load we'd add */
2077                tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power;
2078                if (max_load < tmp)
2079                        tmp = max_load;
2080                pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp);
2081                pwr_move /= SCHED_LOAD_SCALE;
2082
2083                /* Move if we gain another 8th of a CPU worth of throughput */
2084                if (pwr_move < pwr_now + SCHED_LOAD_SCALE / 8)
2085                        goto out_balanced;
2086
2087                *imbalance = 1;
2088                return busiest;
2089        }
2090
2091        /* Get rid of the scaling factor, rounding down as we divide */
2092        *imbalance = (*imbalance + 1) / SCHED_LOAD_SCALE;
2093
2094        return busiest;
2095
2096out_balanced:
2097        *imbalance = 0;
2098        return NULL;
2099}
2100
2101/*
2102 * find_busiest_queue - find the busiest runqueue among the cpus in group.
2103 */
2104static runqueue_t *find_busiest_queue(struct sched_group *group)
2105{
2106        unsigned long load, max_load = 0;
2107        runqueue_t *busiest = NULL;
2108        int i;
2109
2110        for_each_cpu_mask(i, group->cpumask) {
2111                load = source_load(i);
2112
2113                if (load > max_load) {
2114                        max_load = load;
2115                        busiest = cpu_rq(i);
2116                }
2117        }
2118
2119        return busiest;
2120}
2121
2122/*
2123 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2124 * tasks if there is an imbalance.
2125 *
2126 * Called with this_rq unlocked.
2127 */
2128static int load_balance(int this_cpu, runqueue_t *this_rq,
2129                        struct sched_domain *sd, enum idle_type idle)
2130{
2131        struct sched_group *group;
2132        runqueue_t *busiest;
2133        unsigned long imbalance;
2134        int nr_moved, all_pinned=0;
2135
2136        spin_lock(&this_rq->lock);
2137        schedstat_inc(sd, lb_cnt[idle]);
2138
2139        group = find_busiest_group(sd, this_cpu, &imbalance, idle);
2140        if (!group) {
2141                schedstat_inc(sd, lb_nobusyg[idle]);
2142                goto out_balanced;
2143        }
2144
2145        busiest = find_busiest_queue(group);
2146        if (!busiest) {
2147                schedstat_inc(sd, lb_nobusyq[idle]);
2148                goto out_balanced;
2149        }
2150
2151        /*
2152         * This should be "impossible", but since load
2153         * balancing is inherently racy and statistical,
2154         * it could happen in theory.
2155         */
2156        if (unlikely(busiest == this_rq)) {
2157                WARN_ON(1);
2158                goto out_balanced;
2159        }
2160
2161        schedstat_add(sd, lb_imbalance[idle], imbalance);
2162
2163        nr_moved = 0;
2164        if (busiest->nr_running > 1) {
2165                /*
2166                 * Attempt to move tasks. If find_busiest_group has found
2167                 * an imbalance but busiest->nr_running <= 1, the group is
2168                 * still unbalanced. nr_moved simply stays zero, so it is
2169                 * correctly treated as an imbalance.
2170                 */
2171                double_lock_balance(this_rq, busiest);
2172                nr_moved = move_tasks(this_rq, this_cpu, busiest,
2173                                                imbalance, sd, idle,
2174                                                &all_pinned);
2175                spin_unlock(&busiest->lock);
2176
2177                /* All tasks on this runqueue were pinned by CPU affinity */
2178                if (unlikely(all_pinned))
2179                        goto out_balanced;
2180        }
2181
2182        spin_unlock(&this_rq->lock);
2183
2184        if (!nr_moved) {
2185                schedstat_inc(sd, lb_failed[idle]);
2186                sd->nr_balance_failed++;
2187
2188                if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
2189                        int wake = 0;
2190
2191                        spin_lock(&busiest->lock);
2192                        if (!busiest->active_balance) {
2193                                busiest->active_balance = 1;
2194                                busiest->push_cpu = this_cpu;
2195                                wake = 1;
2196                        }
2197                        spin_unlock(&busiest->lock);
2198                        if (wake)
2199                                wake_up_process(busiest->migration_thread);
2200
2201                        /*
2202                         * We've kicked active balancing, reset the failure
2203                         * counter.
2204                         */
2205                        sd->nr_balance_failed = sd->cache_nice_tries;
2206                }
2207        } else
2208                sd->nr_balance_failed = 0;
2209
2210        /* We were unbalanced, so reset the balancing interval */
2211        sd->balance_interval = sd->min_interval;
2212
2213        return nr_moved;
2214
2215out_balanced:
2216        spin_unlock(&this_rq->lock);
2217
2218        sd->nr_balance_failed = 0;
2219        /* tune up the balancing interval */
2220        if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
2221                        (sd->balance_interval < sd->max_interval))
2222                sd->balance_interval *= 2;
2223        return 0;
2224}
2225
2226/*
2227 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2228 * tasks if there is an imbalance.
2229 *
2230 * Called from schedule when this_rq is about to become idle (NEWLY_IDLE).
2231 * this_rq is locked.
2232 */
2233static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
2234                                struct sched_domain *sd)
2235{
2236        struct sched_group *group;
2237        runqueue_t *busiest = NULL;
2238        unsigned long imbalance;
2239        int nr_moved = 0;
2240
2241        schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
2242        group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE);
2243        if (!group) {
2244                schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
2245                goto out_balanced;
2246        }
2247
2248        busiest = find_busiest_queue(group);
2249        if (!busiest || busiest == this_rq) {
2250                schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
2251                goto out_balanced;
2252        }
2253
2254        /* Attempt to move tasks */
2255        double_lock_balance(this_rq, busiest);
2256
2257        schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance);
2258        nr_moved = move_tasks(this_rq, this_cpu, busiest,
2259                        imbalance, sd, NEWLY_IDLE, NULL);
2260        if (!nr_moved) {
2261                schedstat_inc(sd, lb_failed[NEWLY_IDLE]);
2262        } else
2263                sd->nr_balance_failed = 0;
2264
2265        spin_unlock(&busiest->lock);
2266        return nr_moved;
2267
2268out_balanced:
2269        schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
2270        sd->nr_balance_failed = 0;
2271        return 0;
2272}
2273
2274/*
2275 * idle_balance is called by schedule() if this_cpu is about to become
2276 * idle. Attempts to pull tasks from other CPUs.
2277 */
2278static inline void idle_balance(int this_cpu, runqueue_t *this_rq)
2279{
2280        struct sched_domain *sd;
2281
2282        for_each_domain(this_cpu, sd) {
2283                if (sd->flags & SD_BALANCE_NEWIDLE) {
2284                        if (load_balance_newidle(this_cpu, this_rq, sd)) {
2285                                /* We've pulled tasks over so stop searching */
2286                                break;
2287                        }
2288                }
2289        }
2290}
2291
2292/*
2293 * active_load_balance is run by migration threads. It pushes a running
2294 * task off the cpu. It can be required to correctly have at least 1 task
2295 * running on each physical CPU where possible, and not have a physical /
2296 * logical imbalance.
2297 *
2298 * Called with busiest locked.
2299 */
2300static void active_load_balance(runqueue_t *busiest, int busiest_cpu)
2301{
2302        struct sched_domain *sd;
2303        runqueue_t *target_rq;
2304        int target_cpu = busiest->push_cpu;
2305
2306        schedstat_inc(busiest, alb_cnt);
2307        if (busiest->nr_running <= 1)
2308                return;
2309
2310        target_rq = cpu_rq(target_cpu);
2311
2312        /*
2313         * This condition is "impossible", but since load
2314         * balancing is inherently a bit racy and statistical,
2315         * it can trigger.. Reported by Bjorn Helgaas on a
2316         * 128-cpu setup.
2317         */
2318        BUG_ON(busiest == target_rq);
2319
2320        double_lock_balance(busiest, target_rq);
2321
2322        for_each_domain(target_cpu, sd)
2323                if (cpu_isset(busiest_cpu, sd->span))
2324                                break;
2325
2326        if (unlikely(sd == NULL))
2327                goto out;
2328
2329        if (move_tasks(target_rq, target_cpu, busiest, 1, sd, IDLE, NULL)) {
2330                schedstat_inc(busiest, alb_lost);
2331                schedstat_inc(target_rq, alb_gained);
2332        } else {
2333                schedstat_inc(busiest, alb_failed);
2334        }
2335out:
2336        spin_unlock(&target_rq->lock);
2337}
2338
2339/*
2340 * rebalance_tick will get called every timer tick, on every CPU.
2341 *
2342 * It checks each scheduling domain to see if it is due to be balanced,
2343 * and initiates a balancing operation if so.
2344 *
2345 * Balancing parameters are set up in arch_init_sched_domains.
2346 */
2347
2348/* Don't have all balancing operations going off at once */
2349#define CPU_OFFSET(cpu) (HZ * cpu / NR_CPUS)
2350
2351#define IRQ_QUANTUM     (HZ / 10)
2352
2353static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
2354                           enum idle_type idle)
2355{
2356        unsigned long old_load, this_load;
2357        unsigned long j = jiffies + CPU_OFFSET(this_cpu);
2358        struct sched_domain *sd;
2359
2360        if (unlikely(--this_rq->irq_quantum <= 0)) {
2361                struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
2362                u64 irq_ticks = cpustat->irq + cpustat->softirq;
2363                unsigned int ticks = irq_ticks - this_rq->prev_irq_ticks;
2364                unsigned int pct = (ticks * 100) / IRQ_QUANTUM;
2365
2366                this_rq->irq_pct = (this_rq->irq_pct + pct) / 2;
2367                this_rq->prev_irq_ticks = irq_ticks;
2368                this_rq->irq_quantum = IRQ_QUANTUM;
2369        }
2370
2371        /* Update our load */
2372        old_load = this_rq->cpu_load;
2373        this_load = current_load(this_rq);
2374        /*
2375         * Round up the averaging division if load is increasing. This
2376         * prevents us from getting stuck on 9 if the load is 10, for
2377         * example.
2378         */
2379        if (this_load > old_load)
2380                old_load++;
2381        this_rq->cpu_load = (old_load + this_load) / 2;
2382
2383        /*
2384         * Isolated cpus don't get load-balanced.
2385         */
2386        if (cpu_isset(this_cpu, cpu_isolated_map))
2387                return;
2388
2389        for_each_domain(this_cpu, sd) {
2390                unsigned long interval = sd->balance_interval;
2391
2392                if (idle != IDLE)
2393                        interval *= sd->busy_factor;
2394
2395                /* scale ms to jiffies */
2396                interval = msecs_to_jiffies(interval);
2397                if (unlikely(!interval))
2398                        interval = 1;
2399
2400                if (j - sd->last_balance >= interval) {
2401                        if (load_balance(this_cpu, this_rq, sd, idle)) {
2402                                /* We've pulled tasks over so no longer idle */
2403                                idle = NOT_IDLE;
2404                        }
2405                        sd->last_balance += interval;
2406                }
2407        }
2408}
2409#else
2410/*
2411 * on UP we do not need to balance between CPUs:
2412 */
2413static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle)
2414{
2415}
2416static inline void idle_balance(int cpu, runqueue_t *rq)
2417{
2418}
2419#endif
2420
2421static inline int wake_priority_sleeper(runqueue_t *rq)
2422{
2423        int ret = 0;
2424#ifdef CONFIG_SCHED_SMT
2425        spin_lock(&rq->lock);
2426        /*
2427         * If an SMT sibling task has been put to sleep for priority
2428         * reasons reschedule the idle task to see if it can now run.
2429         */
2430        if (rq->nr_running) {
2431                resched_task(rq->idle);
2432                ret = 1;
2433        }
2434        spin_unlock(&rq->lock);
2435#endif
2436        return ret;
2437}
2438
2439DEFINE_PER_CPU(struct kernel_stat, kstat);
2440
2441EXPORT_PER_CPU_SYMBOL(kstat);
2442
2443/*
2444 * We place interactive tasks back into the active array, if possible.
2445 *
2446 * To guarantee that this does not starve expired tasks we ignore the
2447 * interactivity of a task if the first expired task had to wait more
2448 * than a 'reasonable' amount of time. This deadline timeout is
2449 * load-dependent, as the frequency of array switched decreases with
2450 * increasing number of running tasks. We also ignore the interactivity
2451 * if a better static_prio task has expired:
2452 */
2453#define EXPIRED_STARVING(rq) \
2454        ((STARVATION_LIMIT && \
2455          (jiffies - (rq)->switch_timestamp > STARVATION_LIMIT)) || \
2456         ((rq)->curr->static_prio > (rq)->best_expired_prio))
2457
2458/*
2459 * This function gets called by the timer code, with HZ frequency.
2460 * We call it with interrupts disabled.
2461 *
2462 * It also gets called by the fork code, when changing the parent's
2463 * timeslices.
2464 */
2465void scheduler_tick(int user_ticks, int sys_ticks)
2466{
2467        int cpu = smp_processor_id();
2468        struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
2469        runqueue_t *rq = this_rq();
2470        task_t *p = current;
2471
2472        rq->timestamp_last_tick = sched_clock();
2473
2474        if (rcu_pending(cpu))
2475                rcu_check_callbacks(cpu, user_ticks);
2476
2477        /* note: this timer irq context must be accounted for as well */
2478        if (hardirq_count() - HARDIRQ_OFFSET) {
2479                cpustat->irq += sys_ticks;
2480                sys_ticks = 0;
2481        } else if (softirq_count()) {
2482                cpustat->softirq += sys_ticks;
2483                sys_ticks = 0;
2484        }
2485
2486        if (p == rq->idle) {
2487                if (atomic_read(&rq->nr_iowait) > 0)
2488                        cpustat->iowait += sys_ticks;
2489                else
2490                        cpustat->idle += sys_ticks;
2491                if (wake_priority_sleeper(rq))
2492                        goto out;
2493                rebalance_tick(cpu, rq, IDLE);
2494                return;
2495        }
2496        if (TASK_NICE(p) > 0)
2497                cpustat->nice += user_ticks;
2498        else
2499                cpustat->user += user_ticks;
2500        cpustat->system += sys_ticks;
2501
2502        /* Task might have expired already, but not scheduled off yet */
2503        if (p->array != rq->active) {
2504                set_tsk_need_resched(p);
2505                goto out;
2506        }
2507        spin_lock(&rq->lock);
2508        /*
2509         * The task was running during this tick - update the
2510         * time slice counter. Note: we do not update a thread's
2511         * priority until it either goes to sleep or uses up its
2512         * timeslice. This makes it possible for interactive tasks
2513         * to use up their timeslices at their highest priority levels.
2514         */
2515        if (rt_task(p)) {
2516                /*
2517                 * RR tasks need a special form of timeslice management.
2518                 * FIFO tasks have no timeslices.
2519                 */
2520                if ((p->policy == SCHED_RR) && !--p->time_slice) {
2521                        p->time_slice = task_timeslice(p);
2522                        p->first_time_slice = 0;
2523                        set_tsk_need_resched(p);
2524
2525                        /* put it at the end of the queue: */
2526                        dequeue_task(p, rq->active);
2527                        enqueue_task(p, rq->active);
2528                }
2529                goto out_unlock;
2530        }
2531        if (!--p->time_slice) {
2532                dequeue_task(p, rq->active);
2533                set_tsk_need_resched(p);
2534                p->prio = effective_prio(p);
2535                p->time_slice = task_timeslice(p);
2536                p->first_time_slice = 0;
2537
2538                if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
2539                        enqueue_task(p, rq->expired);
2540                        if (p->static_prio < rq->best_expired_prio)
2541                                rq->best_expired_prio = p->static_prio;
2542                } else
2543                        enqueue_task(p, rq->active);
2544        } else {
2545                /*
2546                 * Prevent a too long timeslice allowing a task to monopolize
2547                 * the CPU. We do this by splitting up the timeslice into
2548                 * smaller pieces.
2549                 *
2550                 * Note: this does not mean the task's timeslices expire or
2551                 * get lost in any way, they just might be preempted by
2552                 * another task of equal priority. (one with higher
2553                 * priority would have preempted this task already.) We
2554                 * requeue this task to the end of the list on this priority
2555                 * level, which is in essence a round-robin of tasks with
2556                 * equal priority.
2557                 *
2558                 * This only applies to tasks in the interactive
2559                 * delta range with at least TIMESLICE_GRANULARITY to requeue.
2560                 */
2561                if (TASK_INTERACTIVE(p) && !((task_timeslice(p) -
2562                        p->time_slice) % TIMESLICE_GRANULARITY(p)) &&
2563                        (p->time_slice >= TIMESLICE_GRANULARITY(p)) &&
2564                        (p->array == rq->active)) {
2565
2566                        dequeue_task(p, rq->active);
2567                        set_tsk_need_resched(p);
2568                        p->prio = effective_prio(p);
2569                        enqueue_task(p, rq->active);
2570                }
2571        }
2572out_unlock:
2573        spin_unlock(&rq->lock);
2574out:
2575        rebalance_tick(cpu, rq, NOT_IDLE);
2576}
2577
2578#ifdef CONFIG_SCHED_SMT
2579static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
2580{
2581        struct sched_domain *sd = this_rq->sd;
2582        cpumask_t sibling_map;
2583        int i;
2584
2585        if (!(sd->flags & SD_SHARE_CPUPOWER))
2586                return;
2587
2588        /*
2589         * Unlock the current runqueue because we have to lock in
2590         * CPU order to avoid deadlocks. Caller knows that we might
2591         * unlock. We keep IRQs disabled.
2592         */
2593        spin_unlock(&this_rq->lock);
2594
2595        sibling_map = sd->span;
2596
2597        for_each_cpu_mask(i, sibling_map)
2598                spin_lock(&cpu_rq(i)->lock);
2599        /*
2600         * We clear this CPU from the mask. This both simplifies the
2601         * inner loop and keps this_rq locked when we exit:
2602         */
2603        cpu_clear(this_cpu, sibling_map);
2604
2605        for_each_cpu_mask(i, sibling_map) {
2606                runqueue_t *smt_rq = cpu_rq(i);
2607
2608                /*
2609                 * If an SMT sibling task is sleeping due to priority
2610                 * reasons wake it up now.
2611                 */
2612                if (smt_rq->curr == smt_rq->idle && smt_rq->nr_running)
2613                        resched_task(smt_rq->idle);
2614        }
2615
2616        for_each_cpu_mask(i, sibling_map)
2617                spin_unlock(&cpu_rq(i)->lock);
2618        /*
2619         * We exit with this_cpu's rq still held and IRQs
2620         * still disabled:
2621         */
2622}
2623
2624static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
2625{
2626        struct sched_domain *sd = this_rq->sd;
2627        cpumask_t sibling_map;
2628        prio_array_t *array;
2629        int ret = 0, i;
2630        task_t *p;
2631
2632        if (!(sd->flags & SD_SHARE_CPUPOWER))
2633                return 0;
2634
2635        /*
2636         * The same locking rules and details apply as for
2637         * wake_sleeping_dependent():
2638         */
2639        spin_unlock(&this_rq->lock);
2640        sibling_map = sd->span;
2641        for_each_cpu_mask(i, sibling_map)
2642                spin_lock(&cpu_rq(i)->lock);
2643        cpu_clear(this_cpu, sibling_map);
2644
2645        /*
2646         * Establish next task to be run - it might have gone away because
2647         * we released the runqueue lock above:
2648         */
2649        if (!this_rq->nr_running)
2650                goto out_unlock;
2651        array = this_rq->active;
2652        if (!array->nr_active)
2653                array = this_rq->expired;
2654        BUG_ON(!array->nr_active);
2655
2656        p = list_entry(array->queue[sched_find_first_bit(array->bitmap)].next,
2657                task_t, run_list);
2658
2659        for_each_cpu_mask(i, sibling_map) {
2660                runqueue_t *smt_rq = cpu_rq(i);
2661                task_t *smt_curr = smt_rq->curr;
2662
2663                /*
2664                 * If a user task with lower static priority than the
2665                 * running task on the SMT sibling is trying to schedule,
2666                 * delay it till there is proportionately less timeslice
2667                 * left of the sibling task to prevent a lower priority
2668                 * task from using an unfair proportion of the
2669                 * physical cpu's resources. -ck
2670                 */
2671                if (((smt_curr->time_slice * (100 - sd->per_cpu_gain) / 100) >
2672                        task_timeslice(p) || rt_task(smt_curr)) &&
2673                        p->mm && smt_curr->mm && !rt_task(p))
2674                                ret = 1;
2675
2676                /*
2677                 * Reschedule a lower priority task on the SMT sibling,
2678                 * or wake it up if it has been put to sleep for priority
2679                 * reasons.
2680                 */
2681                if ((((p->time_slice * (100 - sd->per_cpu_gain) / 100) >
2682                        task_timeslice(smt_curr) || rt_task(p)) &&
2683                        smt_curr->mm && p->mm && !rt_task(smt_curr)) ||
2684                        (smt_curr == smt_rq->idle && smt_rq->nr_running))
2685                                resched_task(smt_curr);
2686        }
2687out_unlock:
2688        for_each_cpu_mask(i, sibling_map)
2689                spin_unlock(&cpu_rq(i)->lock);
2690        return ret;
2691}
2692#else
2693static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
2694{
2695}
2696
2697static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
2698{
2699        return 0;
2700}
2701#endif
2702
2703/*
2704 * schedule() is the main scheduler function.
2705 */
2706asmlinkage void __sched schedule(void)
2707{
2708        long *switch_count;
2709        task_t *prev, *next;
2710        runqueue_t *rq;
2711        prio_array_t *array;
2712        struct list_head *queue;
2713        unsigned long long now;
2714        unsigned long run_time;
2715        int cpu, idx;
2716
2717        //WARN_ON(system_state == SYSTEM_BOOTING);
2718        /*
2719         * Test if we are atomic.  Since do_exit() needs to call into
2720         * schedule() atomically, we ignore that path for now.
2721         * Otherwise, whine if we are scheduling when we should not be.
2722         */
2723        if (likely(!(current->exit_state & (EXIT_DEAD | EXIT_ZOMBIE)))) {
2724                if (unlikely(in_atomic())) {
2725                        printk(KERN_ERR "bad: scheduling while atomic!\n");
2726                        dump_stack();
2727                }
2728        }
2729
2730need_resched:
2731        preempt_disable();
2732        prev = current;
2733        rq = this_rq();
2734
2735        /*
2736         * The idle thread is not allowed to schedule!
2737         * Remove this check after it has been exercised a bit.
2738         */
2739        if (unlikely(current == rq->idle) && current->state != TASK_RUNNING) {
2740                printk(KERN_ERR "bad: scheduling from the idle thread!\n");
2741                dump_stack();
2742        }
2743
2744        release_kernel_lock(prev);
2745        schedstat_inc(rq, sched_cnt);
2746        now = sched_clock();
2747        if (likely(now - prev->timestamp < NS_MAX_SLEEP_AVG))
2748                run_time = now - prev->timestamp;
2749        else
2750                run_time = NS_MAX_SLEEP_AVG;
2751
2752        /*
2753         * Tasks with interactive credits get charged less run_time
2754         * at high sleep_avg to delay them losing their interactive
2755         * status
2756         */
2757        if (HIGH_CREDIT(prev))
2758                run_time /= (CURRENT_BONUS(prev) ? : 1);
2759
2760        spin_lock_irq(&rq->lock);
2761
2762        if (unlikely(current->flags & PF_DEAD))
2763                current->