1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21#include <linux/mm.h>
22#include <linux/module.h>
23#include <linux/nmi.h>
24#include <linux/init.h>
25#include <asm/uaccess.h>
26#include <linux/highmem.h>
27#include <linux/smp_lock.h>
28#include <linux/pagemap.h>
29#include <asm/mmu_context.h>
30#include <linux/interrupt.h>
31#include <linux/completion.h>
32#include <linux/kernel_stat.h>
33#include <linux/security.h>
34#include <linux/notifier.h>
35#include <linux/profile.h>
36#include <linux/suspend.h>
37#include <linux/blkdev.h>
38#include <linux/delay.h>
39#include <linux/smp.h>
40#include <linux/timer.h>
41#include <linux/rcupdate.h>
42#include <linux/cpu.h>
43#include <linux/percpu.h>
44#include <linux/kthread.h>
45#include <linux/seq_file.h>
46#include <linux/times.h>
47#include <linux/kprobes.h>
48#include <asm/tlb.h>
49
50#include <asm/unistd.h>
51
52#ifdef CONFIG_NUMA
53#define cpu_to_node_mask(cpu) node_to_cpumask(cpu_to_node(cpu))
54#else
55#define cpu_to_node_mask(cpu) (cpu_online_map)
56#endif
57
58
59
60
61
62
63#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
64#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
65#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
66
67
68
69
70
71
72#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
73#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
74#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
75
76
77
78
79#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ))
80#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ))
81
82
83
84
85#define MAX_PINNED_INTERVAL 512
86
87
88
89
90
91
92
93
94#define MIN_TIMESLICE max(5 * HZ / 1000, 1)
95#define DEF_TIMESLICE (100 * HZ / 1000)
96#define ON_RUNQUEUE_WEIGHT 30
97#define CHILD_PENALTY 95
98#define PARENT_PENALTY 100
99#define EXIT_WEIGHT 3
100#define PRIO_BONUS_RATIO 25
101#define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100)
102#define INTERACTIVE_DELTA 2
103#define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS)
104#define STARVATION_LIMIT (MAX_SLEEP_AVG)
105#define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG))
106#define CREDIT_LIMIT 100
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136#define CURRENT_BONUS(p) \
137 (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \
138 MAX_SLEEP_AVG)
139
140#ifdef CONFIG_SMP
141#define TIMESLICE_GRANULARITY(p) (MIN_TIMESLICE * \
142 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \
143 num_online_cpus())
144#else
145#define TIMESLICE_GRANULARITY(p) (MIN_TIMESLICE * \
146 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)))
147#endif
148
149#define SCALE(v1,v1_max,v2_max) \
150 (v1) * (v2_max) / (v1_max)
151
152#define DELTA(p) \
153 (SCALE(TASK_NICE(p), 40, MAX_BONUS) + INTERACTIVE_DELTA)
154
155#define TASK_INTERACTIVE(p) \
156 ((p)->prio <= (p)->static_prio - DELTA(p))
157
158#define INTERACTIVE_SLEEP(p) \
159 (JIFFIES_TO_NS(MAX_SLEEP_AVG * \
160 (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
161
162#define HIGH_CREDIT(p) \
163 ((p)->interactive_credit > CREDIT_LIMIT)
164
165#define LOW_CREDIT(p) \
166 ((p)->interactive_credit < -CREDIT_LIMIT)
167
168#define TASK_PREEMPTS_CURR(p, rq) \
169 ((p)->prio < (rq)->curr->prio)
170
171
172
173
174
175
176
177
178
179
180#define SCALE_PRIO(x, prio) \
181 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE)
182
183static unsigned int task_timeslice(task_t *p)
184{
185 if (p->static_prio < NICE_TO_PRIO(0))
186 return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio);
187 else
188 return SCALE_PRIO(DEF_TIMESLICE, p->static_prio);
189}
190#define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \
191 < (long long) (sd)->cache_hot_time)
192
193enum idle_type
194{
195 IDLE,
196 NOT_IDLE,
197 NEWLY_IDLE,
198 MAX_IDLE_TYPES
199};
200
201struct sched_domain;
202
203
204
205
206
207#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))
208
209typedef struct runqueue runqueue_t;
210
211struct prio_array {
212 unsigned int nr_active;
213 unsigned long bitmap[BITMAP_SIZE];
214 struct list_head queue[MAX_PRIO];
215};
216
217
218
219
220
221
222
223
224struct runqueue {
225 spinlock_t lock;
226
227
228
229
230
231 unsigned long nr_running;
232#ifdef CONFIG_SMP
233 unsigned long cpu_load;
234 unsigned int irq_pct;
235 int irq_quantum;
236 u64 prev_irq_ticks;
237#endif
238 unsigned long long nr_switches;
239
240
241
242
243
244
245
246 unsigned long nr_uninterruptible;
247
248 unsigned long switch_timestamp;
249 unsigned long long timestamp_last_tick;
250 task_t *curr, *idle;
251 struct mm_struct *prev_mm;
252 prio_array_t *active, *expired, arrays[2];
253 int best_expired_prio;
254 atomic_t nr_iowait;
255
256#ifdef CONFIG_SMP
257 struct sched_domain *sd;
258
259
260 int active_balance;
261 int push_cpu;
262
263 task_t *migration_thread;
264 struct list_head migration_queue;
265 int cpu;
266#endif
267
268#ifdef CONFIG_SCHEDSTATS
269
270 struct sched_info rq_sched_info;
271
272
273 unsigned long yld_exp_empty;
274 unsigned long yld_act_empty;
275 unsigned long yld_both_empty;
276 unsigned long yld_cnt;
277
278
279 unsigned long sched_noswitch;
280 unsigned long sched_switch;
281 unsigned long sched_cnt;
282 unsigned long sched_goidle;
283
284
285 unsigned long pt_gained[MAX_IDLE_TYPES];
286 unsigned long pt_lost[MAX_IDLE_TYPES];
287
288
289 unsigned long alb_cnt;
290 unsigned long alb_lost;
291 unsigned long alb_gained;
292 unsigned long alb_failed;
293
294
295 unsigned long ttwu_cnt;
296 unsigned long ttwu_attempts;
297 unsigned long ttwu_moved;
298
299
300 unsigned long wunt_cnt;
301 unsigned long wunt_moved;
302
303
304 unsigned long smt_cnt;
305
306
307 unsigned long sbe_cnt;
308#endif
309};
310
311static DEFINE_PER_CPU(struct runqueue, runqueues);
312
313
314
315
316#ifdef CONFIG_SMP
317#define SCHED_LOAD_SCALE 128UL
318
319#define SD_BALANCE_NEWIDLE 1
320#define SD_BALANCE_EXEC 2
321#define SD_WAKE_IDLE 4
322#define SD_WAKE_AFFINE 8
323#define SD_WAKE_BALANCE 16
324#define SD_SHARE_CPUPOWER 32
325
326struct sched_group {
327 struct sched_group *next;
328 cpumask_t cpumask;
329
330
331
332
333
334 unsigned long cpu_power;
335};
336
337struct sched_domain {
338
339 struct sched_domain *parent;
340 struct sched_group *groups;
341 cpumask_t span;
342 unsigned long min_interval;
343 unsigned long max_interval;
344 unsigned int busy_factor;
345 unsigned int imbalance_pct;
346 unsigned long long cache_hot_time;
347 unsigned int cache_nice_tries;
348 unsigned int per_cpu_gain;
349 int flags;
350
351
352 unsigned long last_balance;
353 unsigned int balance_interval;
354 unsigned int nr_balance_failed;
355
356#ifdef CONFIG_SCHEDSTATS
357
358 unsigned long lb_cnt[MAX_IDLE_TYPES];
359 unsigned long lb_failed[MAX_IDLE_TYPES];
360 unsigned long lb_imbalance[MAX_IDLE_TYPES];
361 unsigned long lb_nobusyg[MAX_IDLE_TYPES];
362 unsigned long lb_nobusyq[MAX_IDLE_TYPES];
363
364
365 unsigned long sbe_attempts;
366 unsigned long sbe_pushed;
367
368
369 unsigned long ttwu_wake_affine;
370 unsigned long ttwu_wake_balance;
371#endif
372};
373
374#ifndef ARCH_HAS_SCHED_TUNE
375#ifdef CONFIG_SCHED_SMT
376#define ARCH_HAS_SCHED_WAKE_IDLE
377
378#define SD_SIBLING_INIT (struct sched_domain) { \
379 .span = CPU_MASK_NONE, \
380 .parent = NULL, \
381 .groups = NULL, \
382 .min_interval = 1, \
383 .max_interval = 2, \
384 .busy_factor = 8, \
385 .imbalance_pct = 110, \
386 .cache_hot_time = 0, \
387 .cache_nice_tries = 0, \
388 .per_cpu_gain = 25, \
389 .flags = SD_BALANCE_NEWIDLE \
390 | SD_BALANCE_EXEC \
391 | SD_WAKE_AFFINE \
392 | SD_WAKE_IDLE \
393 | SD_SHARE_CPUPOWER, \
394 .last_balance = jiffies, \
395 .balance_interval = 1, \
396 .nr_balance_failed = 0, \
397}
398#endif
399
400
401#define SD_CPU_INIT (struct sched_domain) { \
402 .span = CPU_MASK_NONE, \
403 .parent = NULL, \
404 .groups = NULL, \
405 .min_interval = 1, \
406 .max_interval = 4, \
407 .busy_factor = 64, \
408 .imbalance_pct = 125, \
409 .cache_hot_time = cache_decay_ticks*1000000 ? : (5*1000000/2),\
410 .cache_nice_tries = 1, \
411 .per_cpu_gain = 100, \
412 .flags = SD_BALANCE_NEWIDLE \
413 | SD_BALANCE_EXEC \
414 | SD_WAKE_AFFINE \
415 | SD_WAKE_BALANCE, \
416 .last_balance = jiffies, \
417 .balance_interval = 1, \
418 .nr_balance_failed = 0, \
419}
420
421#ifdef CONFIG_SCHED_MC
422#ifndef SD_MC_INIT
423
424
425
426#define SD_MC_INIT SD_CPU_INIT
427#endif
428#endif
429
430
431#if defined(CONFIG_NUMA) && !defined(SD_NODE_INIT)
432#define SD_NODE_INIT (struct sched_domain) { \
433 .span = CPU_MASK_NONE, \
434 .parent = NULL, \
435 .groups = NULL, \
436 .min_interval = 8, \
437 .max_interval = 32, \
438 .busy_factor = 32, \
439 .imbalance_pct = 125, \
440 .cache_hot_time = (10*1000000), \
441 .cache_nice_tries = 1, \
442 .per_cpu_gain = 100, \
443 .flags = SD_BALANCE_EXEC \
444 | SD_WAKE_BALANCE, \
445 .last_balance = jiffies, \
446 .balance_interval = 1, \
447 .nr_balance_failed = 0, \
448}
449#endif
450#endif
451#endif
452
453
454#define for_each_domain(cpu, domain) \
455 for (domain = cpu_rq(cpu)->sd; domain; domain = domain->parent)
456
457#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
458#define this_rq() (&__get_cpu_var(runqueues))
459#define task_rq(p) cpu_rq(task_cpu(p))
460#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
461
462
463
464
465#ifndef prepare_arch_switch
466# define prepare_arch_switch(rq, next) do { } while (0)
467# define finish_arch_switch(rq, next) spin_unlock_irq(&(rq)->lock)
468# define task_running(rq, p) ((rq)->curr == (p))
469#endif
470
471
472
473
474
475
476static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
477{
478 struct runqueue *rq;
479
480repeat_lock_task:
481 local_irq_save(*flags);
482 rq = task_rq(p);
483 spin_lock(&rq->lock);
484 if (unlikely(rq != task_rq(p))) {
485 spin_unlock_irqrestore(&rq->lock, *flags);
486 goto repeat_lock_task;
487 }
488 return rq;
489}
490
491static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
492{
493 spin_unlock_irqrestore(&rq->lock, *flags);
494}
495
496#ifdef CONFIG_SCHEDSTATS
497
498
499
500
501#define SCHEDSTAT_VERSION 10
502
503static int show_schedstat(struct seq_file *seq, void *v)
504{
505 int cpu;
506 enum idle_type itype;
507
508 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
509 seq_printf(seq, "timestamp %lu\n", jiffies);
510 for_each_online_cpu(cpu) {
511 runqueue_t *rq = cpu_rq(cpu);
512#ifdef CONFIG_SMP
513 struct sched_domain *sd;
514 int dcnt = 0;
515#endif
516
517
518 seq_printf(seq,
519 "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu "
520 "%lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
521 cpu, rq->yld_both_empty,
522 rq->yld_act_empty, rq->yld_exp_empty,
523 rq->yld_cnt, rq->sched_noswitch,
524 rq->sched_switch, rq->sched_cnt, rq->sched_goidle,
525 rq->alb_cnt, rq->alb_gained, rq->alb_lost,
526 rq->alb_failed,
527 rq->ttwu_cnt, rq->ttwu_moved, rq->ttwu_attempts,
528 rq->wunt_cnt, rq->wunt_moved,
529 rq->smt_cnt, rq->sbe_cnt, rq->rq_sched_info.cpu_time,
530 rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt);
531
532 for (itype = IDLE; itype < MAX_IDLE_TYPES; itype++)
533 seq_printf(seq, " %lu %lu", rq->pt_gained[itype],
534 rq->pt_lost[itype]);
535 seq_printf(seq, "\n");
536
537#ifdef CONFIG_SMP
538
539 for_each_domain(cpu, sd) {
540 char mask_str[NR_CPUS];
541
542 cpumask_scnprintf(mask_str, NR_CPUS, sd->span);
543 seq_printf(seq, "domain%d %s", dcnt++, mask_str);
544 for (itype = IDLE; itype < MAX_IDLE_TYPES; itype++) {
545 seq_printf(seq, " %lu %lu %lu %lu %lu",
546 sd->lb_cnt[itype],
547 sd->lb_failed[itype],
548 sd->lb_imbalance[itype],
549 sd->lb_nobusyq[itype],
550 sd->lb_nobusyg[itype]);
551 }
552 seq_printf(seq, " %lu %lu %lu %lu\n",
553 sd->sbe_pushed, sd->sbe_attempts,
554 sd->ttwu_wake_affine, sd->ttwu_wake_balance);
555 }
556#endif
557 }
558 return 0;
559}
560
561static int schedstat_open(struct inode *inode, struct file *file)
562{
563 unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
564 char *buf = kmalloc(size, GFP_KERNEL);
565 struct seq_file *m;
566 int res;
567
568 if (!buf)
569 return -ENOMEM;
570 res = single_open(file, show_schedstat, NULL);
571 if (!res) {
572 m = file->private_data;
573 m->buf = buf;
574 m->size = size;
575 } else
576 kfree(buf);
577 return res;
578}
579
580struct file_operations proc_schedstat_operations = {
581 .open = schedstat_open,
582 .read = seq_read,
583 .llseek = seq_lseek,
584 .release = single_release,
585};
586
587# define schedstat_inc(rq, field) rq->field++;
588# define schedstat_add(rq, field, amt) rq->field += amt;
589#else
590# define schedstat_inc(rq, field) do { } while (0);
591# define schedstat_add(rq, field, amt) do { } while (0);
592#endif
593
594
595
596
597static inline runqueue_t *this_rq_lock(void)
598{
599 runqueue_t *rq;
600
601 local_irq_disable();
602 rq = this_rq();
603 spin_lock(&rq->lock);
604
605 return rq;
606}
607
608static inline void rq_unlock(runqueue_t *rq)
609{
610 spin_unlock_irq(&rq->lock);
611}
612
613#ifdef CONFIG_SCHEDSTATS
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629static inline void sched_info_dequeued(task_t *t)
630{
631 t->sched_info.last_queued = 0;
632}
633
634
635
636
637
638
639static inline void sched_info_arrive(task_t *t)
640{
641 unsigned long now = jiffies, diff = 0;
642 struct runqueue *rq = task_rq(t);
643
644 if (t->sched_info.last_queued)
645 diff = now - t->sched_info.last_queued;
646 sched_info_dequeued(t);
647 t->sched_info.run_delay += diff;
648 t->sched_info.last_arrival = now;
649 t->sched_info.pcnt++;
650
651 if (!rq)
652 return;
653
654 rq->rq_sched_info.run_delay += diff;
655 rq->rq_sched_info.pcnt++;
656}
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673static inline void sched_info_queued(task_t *t)
674{
675 if (!t->sched_info.last_queued)
676 t->sched_info.last_queued = jiffies;
677}
678
679
680
681
682
683static inline void sched_info_depart(task_t *t)
684{
685 struct runqueue *rq = task_rq(t);
686 unsigned long diff = jiffies - t->sched_info.last_arrival;
687
688 t->sched_info.cpu_time += diff;
689
690 if (rq)
691 rq->rq_sched_info.cpu_time += diff;
692}
693
694
695
696
697
698
699static inline void sched_info_switch(task_t *prev, task_t *next)
700{
701 struct runqueue *rq = task_rq(prev);
702
703
704
705
706
707
708 if (prev != rq->idle)
709 sched_info_depart(prev);
710
711 if (next != rq->idle)
712 sched_info_arrive(next);
713}
714#else
715#define sched_info_queued(t) do { } while (0)
716#define sched_info_switch(t, next) do { } while (0)
717#endif
718
719
720
721
722static void dequeue_task(struct task_struct *p, prio_array_t *array)
723{
724 array->nr_active--;
725 list_del(&p->run_list);
726 if (list_empty(array->queue + p->prio))
727 __clear_bit(p->prio, array->bitmap);
728}
729
730static void enqueue_task(struct task_struct *p, prio_array_t *array)
731{
732 sched_info_queued(p);
733 list_add_tail(&p->run_list, array->queue + p->prio);
734 __set_bit(p->prio, array->bitmap);
735 array->nr_active++;
736 p->array = array;
737}
738
739
740
741
742
743
744static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
745{
746 list_add(&p->run_list, array->queue + p->prio);
747 __set_bit(p->prio, array->bitmap);
748 array->nr_active++;
749 p->array = array;
750}
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766static int effective_prio(task_t *p)
767{
768 int bonus, prio;
769
770 if (rt_task(p))
771 return p->prio;
772
773 bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
774
775 prio = p->static_prio - bonus;
776 if (prio < MAX_RT_PRIO)
777 prio = MAX_RT_PRIO;
778 if (prio > MAX_PRIO-1)
779 prio = MAX_PRIO-1;
780 return prio;
781}
782
783
784
785
786static inline void __activate_task(task_t *p, runqueue_t *rq)
787{
788 enqueue_task(p, rq->active);
789 rq->nr_running++;
790}
791
792
793
794
795static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
796{
797 enqueue_task_head(p, rq->active);
798 rq->nr_running++;
799}
800
801static void recalc_task_prio(task_t *p, unsigned long long now)
802{
803 unsigned long long __sleep_time = now - p->timestamp;
804 unsigned long sleep_time;
805
806 if (__sleep_time > NS_MAX_SLEEP_AVG)
807 sleep_time = NS_MAX_SLEEP_AVG;
808 else
809 sleep_time = (unsigned long)__sleep_time;
810
811 if (likely(sleep_time > 0)) {
812
813
814
815
816
817
818 if (p->mm && p->activated != -1 &&
819 sleep_time > INTERACTIVE_SLEEP(p)) {
820 p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG -
821 DEF_TIMESLICE);
822 if (!HIGH_CREDIT(p))
823 p->interactive_credit++;
824 } else {
825
826
827
828
829 sleep_time *= (MAX_BONUS - CURRENT_BONUS(p)) ? : 1;
830
831
832
833
834
835 if (LOW_CREDIT(p) &&
836 sleep_time > JIFFIES_TO_NS(task_timeslice(p)))
837 sleep_time = JIFFIES_TO_NS(task_timeslice(p));
838
839
840
841
842
843
844 if (p->activated == -1 && !HIGH_CREDIT(p) && p->mm) {
845 if (p->sleep_avg >= INTERACTIVE_SLEEP(p))
846 sleep_time = 0;
847 else if (p->sleep_avg + sleep_time >=
848 INTERACTIVE_SLEEP(p)) {
849 p->sleep_avg = INTERACTIVE_SLEEP(p);
850 sleep_time = 0;
851 }
852 }
853
854
855
856
857
858
859
860
861
862 p->sleep_avg += sleep_time;
863
864 if (p->sleep_avg > NS_MAX_SLEEP_AVG) {
865 p->sleep_avg = NS_MAX_SLEEP_AVG;
866 if (!HIGH_CREDIT(p))
867 p->interactive_credit++;
868 }
869 }
870 }
871
872 p->prio = effective_prio(p);
873}
874
875
876
877
878
879
880
881static void activate_task(task_t *p, runqueue_t *rq, int local)
882{
883 unsigned long long now;
884
885 now = sched_clock();
886#ifdef CONFIG_SMP
887 if (!local) {
888
889 runqueue_t *this_rq = this_rq();
890 now = (now - this_rq->timestamp_last_tick)
891 + rq->timestamp_last_tick;
892 }
893#endif
894
895 recalc_task_prio(p, now);
896
897
898
899
900
901 if (!p->activated) {
902
903
904
905
906
907
908
909 if (in_interrupt())
910 p->activated = 2;
911 else {
912
913
914
915
916 p->activated = 1;
917 }
918 }
919 p->timestamp = now;
920
921 __activate_task(p, rq);
922}
923
924
925
926
927static void deactivate_task(struct task_struct *p, runqueue_t *rq)
928{
929 rq->nr_running--;
930 dequeue_task(p, p->array);
931 p->array = NULL;
932}
933
934
935
936
937
938
939
940
941#ifdef CONFIG_SMP
942static void resched_task(task_t *p)
943{
944 int need_resched, nrpolling;
945
946 BUG_ON(!spin_is_locked(&task_rq(p)->lock));
947
948
949 nrpolling = test_tsk_thread_flag(p,TIF_POLLING_NRFLAG);
950 need_resched = test_and_set_tsk_thread_flag(p,TIF_NEED_RESCHED);
951 nrpolling |= test_tsk_thread_flag(p,TIF_POLLING_NRFLAG);
952
953 if (!need_resched && !nrpolling && (task_cpu(p) != smp_processor_id()))
954 smp_send_reschedule(task_cpu(p));
955}
956#else
957static inline void resched_task(task_t *p)
958{
959 set_tsk_need_resched(p);
960}
961#endif
962
963
964
965
966
967inline int task_curr(const task_t *p)
968{
969 return cpu_curr(task_cpu(p)) == p;
970}
971
972int wake_balance=1;
973
974#ifdef CONFIG_SMP
975
976#define LOWLAT_ENABLED() (wake_balance == 2)
977
978enum request_type {
979 REQ_MOVE_TASK,
980 REQ_SET_DOMAIN,
981};
982
983typedef struct {
984 struct list_head list;
985 enum request_type type;
986
987
988 task_t *task;
989 int dest_cpu;
990
991
992 struct sched_domain *sd;
993
994 struct completion done;
995} migration_req_t;
996
997
998
999
1000
1001static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req)
1002{
1003 runqueue_t *rq = task_rq(p);
1004
1005
1006
1007
1008
1009 if (!p->array && !task_running(rq, p)) {
1010 set_task_cpu(p, dest_cpu);
1011 return 0;
1012 }
1013
1014 init_completion(&req->done);
1015 req->type = REQ_MOVE_TASK;
1016 req->task = p;
1017 req->dest_cpu = dest_cpu;
1018 list_add(&req->list, &rq->migration_queue);
1019 return 1;
1020}
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031void wait_task_inactive(task_t * p)
1032{
1033 unsigned long flags;
1034 runqueue_t *rq;
1035 int preempted;
1036
1037repeat:
1038 rq = task_rq_lock(p, &flags);
1039
1040 if (unlikely(p->array || task_running(rq, p))) {
1041
1042 preempted = !task_running(rq, p);
1043 task_rq_unlock(rq, &flags);
1044 cpu_relax();
1045 if (preempted)
1046 yield();
1047 goto repeat;
1048 }
1049 task_rq_unlock(rq, &flags);
1050}
1051
1052
1053
1054
1055
1056
1057
1058
1059void kick_process(task_t *p)
1060{
1061 int cpu;
1062
1063 preempt_disable();
1064 cpu = task_cpu(p);
1065 if ((cpu != smp_processor_id()) && task_curr(p))
1066 smp_send_reschedule(cpu);
1067 preempt_enable();
1068}
1069
1070static unsigned long current_load(runqueue_t *rq)
1071{
1072 unsigned long load = rq->nr_running * SCHED_LOAD_SCALE;
1073
1074 if (LOWLAT_ENABLED())
1075 load += (SCHED_LOAD_SCALE * rq->irq_pct) / 100;
1076 return load;
1077}
1078
1079
1080
1081
1082
1083
1084
1085static inline unsigned long source_load(int cpu)
1086{
1087 runqueue_t *rq = cpu_rq(cpu);
1088
1089 return min(rq->cpu_load, current_load(rq));
1090}
1091
1092
1093
1094
1095static inline unsigned long target_load(int cpu)
1096{
1097 runqueue_t *rq = cpu_rq(cpu);
1098
1099 return max(rq->cpu_load, current_load(rq));
1100}
1101
1102#endif
1103
1104
1105
1106
1107
1108
1109
1110
1111#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
1112static int wake_idle(int cpu, task_t *p)
1113{
1114 cpumask_t tmp;
1115 runqueue_t *rq = cpu_rq(cpu);
1116 struct sched_domain *sd;
1117 int i;
1118 unsigned long load = current_load(rq) + SCHED_LOAD_SCALE;
1119
1120 if (LOWLAT_ENABLED()) {
1121 if (idle_cpu(cpu))
1122 load -= SCHED_LOAD_SCALE;
1123 if (load <= SCHED_LOAD_SCALE)
1124 return cpu;
1125 } else if (idle_cpu(cpu)) {
1126 return cpu;
1127 }
1128
1129 sd = rq->sd;
1130 if (!(sd->flags & SD_WAKE_IDLE) && !LOWLAT_ENABLED())
1131 return cpu;
1132
1133 cpus_and(tmp, sd->span, p->cpus_allowed);
1134
1135 for_each_cpu_mask(i, tmp) {
1136 if (LOWLAT_ENABLED()) {
1137 unsigned long l = current_load(cpu_rq(i))
1138 + SCHED_LOAD_SCALE;
1139
1140 if (idle_cpu(i))
1141 l -= SCHED_LOAD_SCALE;
1142 if (l <= SCHED_LOAD_SCALE)
1143 return i;
1144 if (l < load) {
1145 cpu = i;
1146 load = l;
1147 }
1148 } else if (idle_cpu(i)) {
1149 return i;
1150 }
1151 }
1152
1153 return cpu;
1154}
1155#else
1156static inline int wake_idle(int cpu, task_t *p)
1157{
1158 return cpu;
1159}
1160#endif
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176static int try_to_wake_up(task_t * p, unsigned int state, int sync)
1177{
1178 int cpu, this_cpu, success = 0;
1179 unsigned long flags;
1180 long old_state;
1181 runqueue_t *rq;
1182#ifdef CONFIG_SMP
1183 unsigned long load, this_load;
1184 struct sched_domain *sd;
1185 int new_cpu;
1186#endif
1187
1188 rq = task_rq_lock(p, &flags);
1189 schedstat_inc(rq, ttwu_cnt);
1190 old_state = p->state;
1191 if (!(old_state & state))
1192 goto out;
1193
1194 if (p->array)
1195 goto out_running;
1196
1197 cpu = task_cpu(p);
1198 this_cpu = smp_processor_id();
1199
1200#ifdef CONFIG_SMP
1201 if (unlikely(task_running(rq, p)))
1202 goto out_activate;
1203
1204 new_cpu = cpu;
1205
1206 if (!wake_balance)
1207 goto out_set_cpu;
1208
1209 if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
1210 goto out_set_cpu;
1211
1212 load = source_load(cpu);
1213 this_load = target_load(this_cpu);
1214
1215
1216
1217
1218
1219 if (sync)
1220 this_load -= SCHED_LOAD_SCALE;
1221
1222
1223 if (load < SCHED_LOAD_SCALE/2 && this_load > SCHED_LOAD_SCALE/2)
1224 goto out_set_cpu;
1225
1226 new_cpu = this_cpu;
1227
1228
1229
1230
1231
1232 for_each_domain(this_cpu, sd) {
1233 unsigned int imbalance;
1234
1235
1236
1237
1238 imbalance = sd->imbalance_pct + (sd->imbalance_pct - 100) / 2;
1239
1240 if ((sd->flags & SD_WAKE_AFFINE) &&
1241 !task_hot(p, rq->timestamp_last_tick, sd)) {
1242
1243
1244
1245
1246 if (cpu_isset(cpu, sd->span)) {
1247 schedstat_inc(sd, ttwu_wake_affine);
1248 goto out_set_cpu;
1249 }
1250 } else if ((sd->flags & SD_WAKE_BALANCE) &&
1251 imbalance*this_load <= 100*load) {
1252
1253
1254
1255
1256 if (cpu_isset(cpu, sd->span)) {
1257 schedstat_inc(sd, ttwu_wake_balance);
1258 goto out_set_cpu;
1259 }
1260 }
1261 }
1262
1263 new_cpu = cpu;
1264out_set_cpu:
1265 schedstat_inc(rq, ttwu_attempts);
1266 new_cpu = wake_idle(new_cpu, p);
1267 if (new_cpu != cpu && cpu_isset(new_cpu, p->cpus_allowed)) {
1268 schedstat_inc(rq, ttwu_moved);
1269 set_task_cpu(p, new_cpu);
1270 task_rq_unlock(rq, &flags);
1271
1272 rq = task_rq_lock(p, &flags);
1273 old_state = p->state;
1274 if (!(old_state & state))
1275 goto out;
1276 if (p->array)
1277 goto out_running;
1278
1279 this_cpu = smp_processor_id();
1280 cpu = task_cpu(p);
1281 }
1282
1283out_activate:
1284#endif
1285 if (old_state == TASK_UNINTERRUPTIBLE) {
1286 rq->nr_uninterruptible--;
1287
1288
1289
1290
1291 p->activated = -1;
1292 }
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302 activate_task(p, rq, cpu == this_cpu);
1303 if (!sync || cpu != this_cpu) {
1304 if (TASK_PREEMPTS_CURR(p, rq))
1305 resched_task(rq->curr);
1306 }
1307 success = 1;
1308
1309out_running:
1310 p->state = TASK_RUNNING;
1311out:
1312 task_rq_unlock(rq, &flags);
1313
1314 return success;
1315}
1316
1317int fastcall wake_up_process(task_t * p)
1318{
1319 return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
1320 TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
1321}
1322
1323EXPORT_SYMBOL(wake_up_process);
1324
1325int fastcall wake_up_state(task_t *p, unsigned int state)
1326{
1327 return try_to_wake_up(p, state, 0);
1328}
1329
1330#ifdef CONFIG_SMP
1331static int find_idlest_cpu(struct task_struct *p, int this_cpu,
1332 struct sched_domain *sd);
1333#endif
1334
1335
1336
1337
1338
1339void fastcall sched_fork(task_t *p)
1340{
1341
1342
1343
1344
1345
1346
1347 p->state = TASK_RUNNING;
1348 INIT_LIST_HEAD(&p->run_list);
1349 p->array = NULL;
1350 spin_lock_init(&p->switch_lock);
1351#ifdef CONFIG_SCHEDSTATS
1352 memset(&p->sched_info, 0, sizeof(p->sched_info));
1353#endif
1354#ifdef CONFIG_PREEMPT
1355
1356
1357
1358
1359
1360
1361 p->thread_info->preempt_count = 1;
1362#endif
1363
1364
1365
1366
1367
1368 local_irq_disable();
1369 p->time_slice = (current->time_slice + 1) >> 1;
1370
1371
1372
1373
1374 p->first_time_slice = current->pid;
1375 current->time_slice >>= 1;
1376 p->timestamp = sched_clock();
1377 if (unlikely(!current->time_slice)) {
1378
1379
1380
1381
1382
1383 current->time_slice = 1;
1384 preempt_disable();
1385 scheduler_tick(0, 0);
1386 local_irq_enable();
1387 preempt_enable();
1388 } else
1389 local_irq_enable();
1390}
1391
1392
1393
1394
1395
1396
1397
1398
1399void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags)
1400{
1401 unsigned long flags;
1402 int this_cpu, cpu;
1403 runqueue_t *rq, *this_rq;
1404
1405 rq = task_rq_lock(p, &flags);
1406 cpu = task_cpu(p);
1407 this_cpu = smp_processor_id();
1408
1409 BUG_ON(p->state != TASK_RUNNING);
1410
1411 schedstat_inc(rq, wunt_cnt);
1412
1413
1414
1415
1416
1417
1418 p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
1419 CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
1420
1421 p->interactive_credit = 0;
1422
1423 p->prio = effective_prio(p);
1424
1425 if (likely(cpu == this_cpu)) {
1426 if (!(clone_flags & CLONE_VM)) {
1427
1428
1429
1430
1431
1432 if (unlikely(!current->array))
1433 __activate_task(p, rq);
1434 else {
1435 p->prio = current->prio;
1436 list_add_tail(&p->run_list, ¤t->run_list);
1437 p->array = current->array;
1438 p->array->nr_active++;
1439 rq->nr_running++;
1440 }
1441 set_need_resched();
1442 } else
1443
1444 __activate_task(p, rq);
1445
1446
1447
1448
1449
1450
1451 this_rq = rq;
1452 } else {
1453 this_rq = cpu_rq(this_cpu);
1454
1455
1456
1457
1458
1459 p->timestamp = (p->timestamp - this_rq->timestamp_last_tick)
1460 + rq->timestamp_last_tick;
1461 __activate_task(p, rq);
1462 if (TASK_PREEMPTS_CURR(p, rq))
1463 resched_task(rq->curr);
1464
1465 schedstat_inc(rq, wunt_moved);
1466
1467
1468
1469
1470 task_rq_unlock(rq, &flags);
1471 this_rq = task_rq_lock(current, &flags);
1472 }
1473 current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
1474 PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
1475 task_rq_unlock(this_rq, &flags);
1476}
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487void fastcall sched_exit(task_t * p)
1488{
1489 unsigned long flags;
1490 runqueue_t *rq;
1491 struct task_struct* creator = NULL;
1492
1493
1494
1495
1496
1497 if (p->first_time_slice) {
1498 creator = find_task_by_pid((pid_t)p->first_time_slice);
1499 if (creator && task_cpu(p) == task_cpu(creator)) {
1500 rq = task_rq_lock(creator, &flags);
1501 creator->time_slice += p->time_slice;
1502 if (unlikely(creator->time_slice > task_timeslice(p)))
1503 creator->time_slice = task_timeslice(p);
1504
1505 if (p->sleep_avg < creator->sleep_avg)
1506 creator->sleep_avg = creator->sleep_avg /
1507 (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg /
1508 (EXIT_WEIGHT + 1);
1509 task_rq_unlock(rq, &flags);
1510 }
1511 }
1512}
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527static inline void finish_task_switch(task_t *prev)
1528{
1529 runqueue_t *rq = this_rq();
1530 struct mm_struct *mm = rq->prev_mm;
1531 unsigned long prev_task_flags;
1532
1533 rq->prev_mm = NULL;
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546 prev_task_flags = prev->flags;
1547 finish_arch_switch(rq, prev);
1548 if (mm)
1549 mmdrop(mm);
1550 if (unlikely(prev_task_flags & PF_DEAD)) {
1551
1552
1553
1554
1555 kprobe_flush_task(prev);
1556 put_task_struct(prev);
1557 }
1558}
1559
1560
1561
1562
1563
1564asmlinkage void schedule_tail(task_t *prev)
1565{
1566 finish_task_switch(prev);
1567
1568 if (current->set_child_tid)
1569 put_user(current->pid, current->set_child_tid);
1570}
1571
1572
1573
1574
1575
1576static inline
1577task_t * context_switch(runqueue_t *rq, task_t *prev, task_t *next)
1578{
1579 struct mm_struct *mm = next->mm;
1580 struct mm_struct *oldmm = prev->active_mm;
1581
1582 if (unlikely(!mm)) {
1583 next->active_mm = oldmm;
1584 atomic_inc(&oldmm->mm_count);
1585 enter_lazy_tlb(oldmm, next);
1586 } else
1587 switch_mm(oldmm, mm, next);
1588
1589 if (unlikely(!prev->mm)) {
1590 prev->active_mm = NULL;
1591 WARN_ON(rq->prev_mm);
1592 rq->prev_mm = oldmm;
1593 }
1594
1595
1596 switch_to(prev, next, prev);
1597
1598 return prev;
1599}
1600
1601
1602
1603
1604
1605
1606
1607
1608unsigned long nr_running(void)
1609{
1610 unsigned long i, sum = 0;
1611
1612 for_each_online_cpu(i)
1613 sum += cpu_rq(i)->nr_running;
1614
1615 return sum;
1616}
1617
1618unsigned long nr_uninterruptible(void)
1619{
1620 unsigned long i, sum = 0;
1621
1622 for_each_cpu(i)
1623 sum += cpu_rq(i)->nr_uninterruptible;
1624
1625
1626
1627
1628
1629 if (unlikely((long)sum < 0))
1630 sum = 0;
1631
1632 return sum;
1633}
1634
1635unsigned long long nr_context_switches(void)
1636{
1637 unsigned long long i, sum = 0;
1638
1639 for_each_cpu(i)
1640 sum += cpu_rq(i)->nr_switches;
1641
1642 return sum;
1643}
1644
1645unsigned long nr_iowait(void)
1646{
1647 unsigned long i, sum = 0;
1648
1649 for_each_cpu(i)
1650 sum += atomic_read(&cpu_rq(i)->nr_iowait);
1651
1652 return sum;
1653}
1654
1655
1656cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE;
1657
1658#ifdef CONFIG_SMP
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
1670{
1671 if (rq1 == rq2)
1672 spin_lock(&rq1->lock);
1673 else {
1674 if (rq1->cpu < rq2->cpu) {
1675 spin_lock(&rq1->lock);
1676 spin_lock(&rq2->lock);
1677 } else {
1678 spin_lock(&rq2->lock);
1679 spin_lock(&rq1->lock);
1680 }
1681 }
1682}
1683
1684
1685
1686
1687
1688
1689
1690static void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2)
1691{
1692 spin_unlock(&rq1->lock);
1693 if (rq1 != rq2)
1694 spin_unlock(&rq2->lock);
1695}
1696
1697
1698
1699
1700static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest)
1701{
1702 if (unlikely(!spin_trylock(&busiest->lock))) {
1703 if (busiest->cpu < this_rq->cpu) {
1704 spin_unlock(&this_rq->lock);
1705 spin_lock(&busiest->lock);
1706 spin_lock(&this_rq->lock);
1707 } else
1708 spin_lock(&busiest->lock);
1709 }
1710}
1711
1712
1713
1714
1715static int find_idlest_cpu(struct task_struct *p, int this_cpu,
1716 struct sched_domain *sd)
1717{
1718 unsigned long load, min_load, this_load;
1719 int i, min_cpu;
1720 cpumask_t mask;
1721
1722 min_cpu = UINT_MAX;
1723 min_load = ULONG_MAX;
1724
1725 cpus_and(mask, sd->span, p->cpus_allowed);
1726
1727 for_each_cpu_mask(i, mask) {
1728 load = target_load(i);
1729
1730 if (load < min_load) {
1731 min_cpu = i;
1732 min_load = load;
1733
1734
1735 if (!min_load)
1736 break;
1737 }
1738 }
1739
1740
1741 this_load = source_load(this_cpu) + SCHED_LOAD_SCALE;
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751 if (min_load*(100 + (sd->imbalance_pct-100)/2) < this_load*100)
1752 return min_cpu;
1753
1754 return this_cpu;
1755}
1756
1757
1758
1759
1760
1761
1762
1763static void sched_migrate_task(task_t *p, int dest_cpu)
1764{
1765 migration_req_t req;
1766 runqueue_t *rq;
1767 unsigned long flags;
1768
1769 rq = task_rq_lock(p, &flags);
1770 if (!cpu_isset(dest_cpu, p->cpus_allowed)
1771 || unlikely(cpu_is_offline(dest_cpu)))
1772 goto out;
1773
1774 schedstat_inc(rq, smt_cnt);
1775
1776 if (migrate_task(p, dest_cpu, &req)) {
1777
1778 struct task_struct *mt = rq->migration_thread;
1779 get_task_struct(mt);
1780 task_rq_unlock(rq, &flags);
1781 wake_up_process(mt);
1782 put_task_struct(mt);
1783 wait_for_completion(&req.done);
1784 return;
1785 }
1786out:
1787 task_rq_unlock(rq, &flags);
1788}
1789
1790
1791
1792
1793
1794
1795
1796
1797void sched_exec(void)
1798{
1799 struct sched_domain *tmp, *sd = NULL;
1800 int new_cpu, this_cpu = get_cpu();
1801
1802 schedstat_inc(this_rq(), sbe_cnt);
1803
1804 if (this_rq()->nr_running <= 1)
1805 goto out;
1806
1807 for_each_domain(this_cpu, tmp)
1808 if (tmp->flags & SD_BALANCE_EXEC)
1809 sd = tmp;
1810
1811 if (sd) {
1812 schedstat_inc(sd, sbe_attempts);
1813 new_cpu = find_idlest_cpu(current, this_cpu, sd);
1814 if (new_cpu != this_cpu) {
1815 schedstat_inc(sd, sbe_pushed);
1816 put_cpu();
1817 sched_migrate_task(current, new_cpu);
1818 return;
1819 }
1820 }
1821out:
1822 put_cpu();
1823}
1824
1825
1826
1827
1828
1829static inline
1830void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
1831 runqueue_t *this_rq, prio_array_t *this_array, int this_cpu)
1832{
1833 dequeue_task(p, src_array);
1834 src_rq->nr_running--;
1835 set_task_cpu(p, this_cpu);
1836 this_rq->nr_running++;
1837 enqueue_task(p, this_array);
1838 p->timestamp = (p->timestamp - src_rq->timestamp_last_tick)
1839 + this_rq->timestamp_last_tick;
1840
1841
1842
1843
1844 if (TASK_PREEMPTS_CURR(p, this_rq))
1845 resched_task(this_rq->curr);
1846}
1847
1848
1849
1850
1851static inline
1852int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
1853 struct sched_domain *sd, enum idle_type idle, int *all_pinned)
1854{
1855
1856
1857
1858
1859
1860
1861 if (task_running(rq, p))
1862 return 0;
1863 if (!cpu_isset(this_cpu, p->cpus_allowed))
1864 return 0;
1865
1866 *all_pinned = 0;
1867
1868
1869 if (idle == NEWLY_IDLE ||
1870 sd->nr_balance_failed < sd->cache_nice_tries) {
1871 if (task_hot(p, rq->timestamp_last_tick, sd))
1872 return 0;
1873 }
1874
1875 return 1;
1876}
1877
1878
1879
1880
1881
1882
1883
1884
1885static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
1886 unsigned long max_nr_move, struct sched_domain *sd,
1887 enum idle_type idle, int *all_pinned)
1888{
1889 prio_array_t *array, *dst_array;
1890 struct list_head *head, *curr;
1891 int idx, pulled = 0, pinned = 0;
1892 task_t *tmp;
1893
1894 if (max_nr_move <= 0 || busiest->nr_running <= 1)
1895 goto out;
1896
1897 pinned=1;
1898
1899
1900
1901
1902
1903
1904
1905 if (busiest->expired->nr_active) {
1906 array = busiest->expired;
1907 dst_array = this_rq->expired;
1908 } else {
1909 array = busiest->active;
1910 dst_array = this_rq->active;
1911 }
1912
1913new_array:
1914
1915 idx = 0;
1916skip_bitmap:
1917 if (!idx)
1918 idx = sched_find_first_bit(array->bitmap);
1919 else
1920 idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
1921 if (idx >= MAX_PRIO) {
1922 if (array == busiest->expired && busiest->active->nr_active) {
1923 array = busiest->active;
1924 dst_array = this_rq->active;
1925 goto new_array;
1926 }
1927 goto out;
1928 }
1929
1930 head = array->queue + idx;
1931 curr = head->prev;
1932skip_queue:
1933 tmp = list_entry(curr, task_t, run_list);
1934
1935 curr = curr->prev;
1936
1937 if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) {
1938 if (curr != head)
1939 goto skip_queue;
1940 idx++;
1941 goto skip_bitmap;
1942 }
1943
1944
1945
1946
1947
1948
1949 schedstat_inc(this_rq, pt_gained[idle]);
1950 schedstat_inc(busiest, pt_lost[idle]);
1951
1952 pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
1953 pulled++;
1954
1955
1956 if (pulled < max_nr_move) {
1957 if (curr != head)
1958 goto skip_queue;
1959 idx++;
1960 goto skip_bitmap;
1961 }
1962out:
1963 if (all_pinned)
1964 *all_pinned = pinned;
1965
1966 return pulled;
1967}
1968
1969
1970
1971
1972
1973
1974static struct sched_group *
1975find_busiest_group(struct sched_domain *sd, int this_cpu,
1976 unsigned long *imbalance, enum idle_type idle)
1977{
1978 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
1979 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
1980
1981 max_load = this_load = total_load = total_pwr = 0;
1982
1983 do {
1984 unsigned long load;
1985 int local_group;
1986 int i, nr_cpus = 0;
1987
1988 local_group = cpu_isset(this_cpu, group->cpumask);
1989
1990
1991 avg_load = 0;
1992
1993 for_each_cpu_mask(i, group->cpumask) {
1994
1995 if (local_group)
1996 load = target_load(i);
1997 else
1998 load = source_load(i);
1999
2000 nr_cpus++;
2001 avg_load += load;
2002 }
2003
2004 if (!nr_cpus)
2005 goto nextgroup;
2006
2007 total_load += avg_load;
2008 total_pwr += group->cpu_power;
2009
2010
2011 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
2012
2013 if (local_group) {
2014 this_load = avg_load;
2015 this = group;
2016 goto nextgroup;
2017 } else if (avg_load > max_load) {
2018 max_load = avg_load;
2019 busiest = group;
2020 }
2021nextgroup:
2022 group = group->next;
2023 } while (group != sd->groups);
2024
2025 if (!busiest || this_load >= max_load)
2026 goto out_balanced;
2027
2028 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
2029
2030 if (this_load >= avg_load ||
2031 100*max_load <= sd->imbalance_pct*this_load)
2032 goto out_balanced;
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045 *imbalance = min(max_load - avg_load, avg_load - this_load);
2046
2047
2048 *imbalance = (*imbalance * min(busiest->cpu_power, this->cpu_power))
2049 / SCHED_LOAD_SCALE;
2050
2051 if (*imbalance < SCHED_LOAD_SCALE - 1) {
2052 unsigned long pwr_now = 0, pwr_move = 0;
2053 unsigned long tmp;
2054
2055 if (max_load - this_load >= SCHED_LOAD_SCALE*2) {
2056 *imbalance = 1;
2057 return busiest;
2058 }
2059
2060
2061
2062
2063
2064
2065
2066 pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load);
2067 pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load);
2068 pwr_now /= SCHED_LOAD_SCALE;
2069
2070
2071 tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power;
2072 if (max_load > tmp)
2073 pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE,
2074 max_load - tmp);
2075
2076
2077 tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power;
2078 if (max_load < tmp)
2079 tmp = max_load;
2080 pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp);
2081 pwr_move /= SCHED_LOAD_SCALE;
2082
2083
2084 if (pwr_move < pwr_now + SCHED_LOAD_SCALE / 8)
2085 goto out_balanced;
2086
2087 *imbalance = 1;
2088 return busiest;
2089 }
2090
2091
2092 *imbalance = (*imbalance + 1) / SCHED_LOAD_SCALE;
2093
2094 return busiest;
2095
2096out_balanced:
2097 *imbalance = 0;
2098 return NULL;
2099}
2100
2101
2102
2103
2104static runqueue_t *find_busiest_queue(struct sched_group *group)
2105{
2106 unsigned long load, max_load = 0;
2107 runqueue_t *busiest = NULL;
2108 int i;
2109
2110 for_each_cpu_mask(i, group->cpumask) {
2111 load = source_load(i);
2112
2113 if (load > max_load) {
2114 max_load = load;
2115 busiest = cpu_rq(i);
2116 }
2117 }
2118
2119 return busiest;
2120}
2121
2122
2123
2124
2125
2126
2127
2128static int load_balance(int this_cpu, runqueue_t *this_rq,
2129 struct sched_domain *sd, enum idle_type idle)
2130{
2131 struct sched_group *group;
2132 runqueue_t *busiest;
2133 unsigned long imbalance;
2134 int nr_moved, all_pinned=0;
2135
2136 spin_lock(&this_rq->lock);
2137 schedstat_inc(sd, lb_cnt[idle]);
2138
2139 group = find_busiest_group(sd, this_cpu, &imbalance, idle);
2140 if (!group) {
2141 schedstat_inc(sd, lb_nobusyg[idle]);
2142 goto out_balanced;
2143 }
2144
2145 busiest = find_busiest_queue(group);
2146 if (!busiest) {
2147 schedstat_inc(sd, lb_nobusyq[idle]);
2148 goto out_balanced;
2149 }
2150
2151
2152
2153
2154
2155
2156 if (unlikely(busiest == this_rq)) {
2157 WARN_ON(1);
2158 goto out_balanced;
2159 }
2160
2161 schedstat_add(sd, lb_imbalance[idle], imbalance);
2162
2163 nr_moved = 0;
2164 if (busiest->nr_running > 1) {
2165
2166
2167
2168
2169
2170
2171 double_lock_balance(this_rq, busiest);
2172 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2173 imbalance, sd, idle,
2174 &all_pinned);
2175 spin_unlock(&busiest->lock);
2176
2177
2178 if (unlikely(all_pinned))
2179 goto out_balanced;
2180 }
2181
2182 spin_unlock(&this_rq->lock);
2183
2184 if (!nr_moved) {
2185 schedstat_inc(sd, lb_failed[idle]);
2186 sd->nr_balance_failed++;
2187
2188 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
2189 int wake = 0;
2190
2191 spin_lock(&busiest->lock);
2192 if (!busiest->active_balance) {
2193 busiest->active_balance = 1;
2194 busiest->push_cpu = this_cpu;
2195 wake = 1;
2196 }
2197 spin_unlock(&busiest->lock);
2198 if (wake)
2199 wake_up_process(busiest->migration_thread);
2200
2201
2202
2203
2204
2205 sd->nr_balance_failed = sd->cache_nice_tries;
2206 }
2207 } else
2208 sd->nr_balance_failed = 0;
2209
2210
2211 sd->balance_interval = sd->min_interval;
2212
2213 return nr_moved;
2214
2215out_balanced:
2216 spin_unlock(&this_rq->lock);
2217
2218 sd->nr_balance_failed = 0;
2219
2220 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
2221 (sd->balance_interval < sd->max_interval))
2222 sd->balance_interval *= 2;
2223 return 0;
2224}
2225
2226
2227
2228
2229
2230
2231
2232
2233static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
2234 struct sched_domain *sd)
2235{
2236 struct sched_group *group;
2237 runqueue_t *busiest = NULL;
2238 unsigned long imbalance;
2239 int nr_moved = 0;
2240
2241 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
2242 group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE);
2243 if (!group) {
2244 schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
2245 goto out_balanced;
2246 }
2247
2248 busiest = find_busiest_queue(group);
2249 if (!busiest || busiest == this_rq) {
2250 schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
2251 goto out_balanced;
2252 }
2253
2254
2255 double_lock_balance(this_rq, busiest);
2256
2257 schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance);
2258 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2259 imbalance, sd, NEWLY_IDLE, NULL);
2260 if (!nr_moved) {
2261 schedstat_inc(sd, lb_failed[NEWLY_IDLE]);
2262 } else
2263 sd->nr_balance_failed = 0;
2264
2265 spin_unlock(&busiest->lock);
2266 return nr_moved;
2267
2268out_balanced:
2269 schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
2270 sd->nr_balance_failed = 0;
2271 return 0;
2272}
2273
2274
2275
2276
2277
2278static inline void idle_balance(int this_cpu, runqueue_t *this_rq)
2279{
2280 struct sched_domain *sd;
2281
2282 for_each_domain(this_cpu, sd) {
2283 if (sd->flags & SD_BALANCE_NEWIDLE) {
2284 if (load_balance_newidle(this_cpu, this_rq, sd)) {
2285
2286 break;
2287 }
2288 }
2289 }
2290}
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300static void active_load_balance(runqueue_t *busiest, int busiest_cpu)
2301{
2302 struct sched_domain *sd;
2303 runqueue_t *target_rq;
2304 int target_cpu = busiest->push_cpu;
2305
2306 schedstat_inc(busiest, alb_cnt);
2307 if (busiest->nr_running <= 1)
2308 return;
2309
2310 target_rq = cpu_rq(target_cpu);
2311
2312
2313
2314
2315
2316
2317
2318 BUG_ON(busiest == target_rq);
2319
2320 double_lock_balance(busiest, target_rq);
2321
2322 for_each_domain(target_cpu, sd)
2323 if (cpu_isset(busiest_cpu, sd->span))
2324 break;
2325
2326 if (unlikely(sd == NULL))
2327 goto out;
2328
2329 if (move_tasks(target_rq, target_cpu, busiest, 1, sd, IDLE, NULL)) {
2330 schedstat_inc(busiest, alb_lost);
2331 schedstat_inc(target_rq, alb_gained);
2332 } else {
2333 schedstat_inc(busiest, alb_failed);
2334 }
2335out:
2336 spin_unlock(&target_rq->lock);
2337}
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349#define CPU_OFFSET(cpu) (HZ * cpu / NR_CPUS)
2350
2351#define IRQ_QUANTUM (HZ / 10)
2352
2353static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
2354 enum idle_type idle)
2355{
2356 unsigned long old_load, this_load;
2357 unsigned long j = jiffies + CPU_OFFSET(this_cpu);
2358 struct sched_domain *sd;
2359
2360 if (unlikely(--this_rq->irq_quantum <= 0)) {
2361 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
2362 u64 irq_ticks = cpustat->irq + cpustat->softirq;
2363 unsigned int ticks = irq_ticks - this_rq->prev_irq_ticks;
2364 unsigned int pct = (ticks * 100) / IRQ_QUANTUM;
2365
2366 this_rq->irq_pct = (this_rq->irq_pct + pct) / 2;
2367 this_rq->prev_irq_ticks = irq_ticks;
2368 this_rq->irq_quantum = IRQ_QUANTUM;
2369 }
2370
2371
2372 old_load = this_rq->cpu_load;
2373 this_load = current_load(this_rq);
2374
2375
2376
2377
2378
2379 if (this_load > old_load)
2380 old_load++;
2381 this_rq->cpu_load = (old_load + this_load) / 2;
2382
2383
2384
2385
2386 if (cpu_isset(this_cpu, cpu_isolated_map))
2387 return;
2388
2389 for_each_domain(this_cpu, sd) {
2390 unsigned long interval = sd->balance_interval;
2391
2392 if (idle != IDLE)
2393 interval *= sd->busy_factor;
2394
2395
2396 interval = msecs_to_jiffies(interval);
2397 if (unlikely(!interval))
2398 interval = 1;
2399
2400 if (j - sd->last_balance >= interval) {
2401 if (load_balance(this_cpu, this_rq, sd, idle)) {
2402
2403 idle = NOT_IDLE;
2404 }
2405 sd->last_balance += interval;
2406 }
2407 }
2408}
2409#else
2410
2411
2412
2413static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle)
2414{
2415}
2416static inline void idle_balance(int cpu, runqueue_t *rq)
2417{
2418}
2419#endif
2420
2421static inline int wake_priority_sleeper(runqueue_t *rq)
2422{
2423 int ret = 0;
2424#ifdef CONFIG_SCHED_SMT
2425 spin_lock(&rq->lock);
2426
2427
2428
2429
2430 if (rq->nr_running) {
2431 resched_task(rq->idle);
2432 ret = 1;
2433 }
2434 spin_unlock(&rq->lock);
2435#endif
2436 return ret;
2437}
2438
2439DEFINE_PER_CPU(struct kernel_stat, kstat);
2440
2441EXPORT_PER_CPU_SYMBOL(kstat);
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453#define EXPIRED_STARVING(rq) \
2454 ((STARVATION_LIMIT && \
2455 (jiffies - (rq)->switch_timestamp > STARVATION_LIMIT)) || \
2456 ((rq)->curr->static_prio > (rq)->best_expired_prio))
2457
2458
2459
2460
2461
2462
2463
2464
2465void scheduler_tick(int user_ticks, int sys_ticks)
2466{
2467 int cpu = smp_processor_id();
2468 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
2469 runqueue_t *rq = this_rq();
2470 task_t *p = current;
2471
2472 rq->timestamp_last_tick = sched_clock();
2473
2474 if (rcu_pending(cpu))
2475 rcu_check_callbacks(cpu, user_ticks);
2476
2477
2478 if (hardirq_count() - HARDIRQ_OFFSET) {
2479 cpustat->irq += sys_ticks;
2480 sys_ticks = 0;
2481 } else if (softirq_count()) {
2482 cpustat->softirq += sys_ticks;
2483 sys_ticks = 0;
2484 }
2485
2486 if (p == rq->idle) {
2487 if (atomic_read(&rq->nr_iowait) > 0)
2488 cpustat->iowait += sys_ticks;
2489 else
2490 cpustat->idle += sys_ticks;
2491 if (wake_priority_sleeper(rq))
2492 goto out;
2493 rebalance_tick(cpu, rq, IDLE);
2494 return;
2495 }
2496 if (TASK_NICE(p) > 0)
2497 cpustat->nice += user_ticks;
2498 else
2499 cpustat->user += user_ticks;
2500 cpustat->system += sys_ticks;
2501
2502
2503 if (p->array != rq->active) {
2504 set_tsk_need_resched(p);
2505 goto out;
2506 }
2507 spin_lock(&rq->lock);
2508
2509
2510
2511
2512
2513
2514
2515 if (rt_task(p)) {
2516
2517
2518
2519
2520 if ((p->policy == SCHED_RR) && !--p->time_slice) {
2521 p->time_slice = task_timeslice(p);
2522 p->first_time_slice = 0;
2523 set_tsk_need_resched(p);
2524
2525
2526 dequeue_task(p, rq->active);
2527 enqueue_task(p, rq->active);
2528 }
2529 goto out_unlock;
2530 }
2531 if (!--p->time_slice) {
2532 dequeue_task(p, rq->active);
2533 set_tsk_need_resched(p);
2534 p->prio = effective_prio(p);
2535 p->time_slice = task_timeslice(p);
2536 p->first_time_slice = 0;
2537
2538 if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
2539 enqueue_task(p, rq->expired);
2540 if (p->static_prio < rq->best_expired_prio)
2541 rq->best_expired_prio = p->static_prio;
2542 } else
2543 enqueue_task(p, rq->active);
2544 } else {
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561 if (TASK_INTERACTIVE(p) && !((task_timeslice(p) -
2562 p->time_slice) % TIMESLICE_GRANULARITY(p)) &&
2563 (p->time_slice >= TIMESLICE_GRANULARITY(p)) &&
2564 (p->array == rq->active)) {
2565
2566 dequeue_task(p, rq->active);
2567 set_tsk_need_resched(p);
2568 p->prio = effective_prio(p);
2569 enqueue_task(p, rq->active);
2570 }
2571 }
2572out_unlock:
2573 spin_unlock(&rq->lock);
2574out:
2575 rebalance_tick(cpu, rq, NOT_IDLE);
2576}
2577
2578#ifdef CONFIG_SCHED_SMT
2579static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
2580{
2581 struct sched_domain *sd = this_rq->sd;
2582 cpumask_t sibling_map;
2583 int i;
2584
2585 if (!(sd->flags & SD_SHARE_CPUPOWER))
2586 return;
2587
2588
2589
2590
2591
2592
2593 spin_unlock(&this_rq->lock);
2594
2595 sibling_map = sd->span;
2596
2597 for_each_cpu_mask(i, sibling_map)
2598 spin_lock(&cpu_rq(i)->lock);
2599
2600
2601
2602
2603 cpu_clear(this_cpu, sibling_map);
2604
2605 for_each_cpu_mask(i, sibling_map) {
2606 runqueue_t *smt_rq = cpu_rq(i);
2607
2608
2609
2610
2611
2612 if (smt_rq->curr == smt_rq->idle && smt_rq->nr_running)
2613 resched_task(smt_rq->idle);
2614 }
2615
2616 for_each_cpu_mask(i, sibling_map)
2617 spin_unlock(&cpu_rq(i)->lock);
2618
2619
2620
2621
2622}
2623
2624static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
2625{
2626 struct sched_domain *sd = this_rq->sd;
2627 cpumask_t sibling_map;
2628 prio_array_t *array;
2629 int ret = 0, i;
2630 task_t *p;
2631
2632 if (!(sd->flags & SD_SHARE_CPUPOWER))
2633 return 0;
2634
2635
2636
2637
2638
2639 spin_unlock(&this_rq->lock);
2640 sibling_map = sd->span;
2641 for_each_cpu_mask(i, sibling_map)
2642 spin_lock(&cpu_rq(i)->lock);
2643 cpu_clear(this_cpu, sibling_map);
2644
2645
2646
2647
2648
2649 if (!this_rq->nr_running)
2650 goto out_unlock;
2651 array = this_rq->active;
2652 if (!array->nr_active)
2653 array = this_rq->expired;
2654 BUG_ON(!array->nr_active);
2655
2656 p = list_entry(array->queue[sched_find_first_bit(array->bitmap)].next,
2657 task_t, run_list);
2658
2659 for_each_cpu_mask(i, sibling_map) {
2660 runqueue_t *smt_rq = cpu_rq(i);
2661 task_t *smt_curr = smt_rq->curr;
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671 if (((smt_curr->time_slice * (100 - sd->per_cpu_gain) / 100) >
2672 task_timeslice(p) || rt_task(smt_curr)) &&
2673 p->mm && smt_curr->mm && !rt_task(p))
2674 ret = 1;
2675
2676
2677
2678
2679
2680
2681 if ((((p->time_slice * (100 - sd->per_cpu_gain) / 100) >
2682 task_timeslice(smt_curr) || rt_task(p)) &&
2683 smt_curr->mm && p->mm && !rt_task(smt_curr)) ||
2684 (smt_curr == smt_rq->idle && smt_rq->nr_running))
2685 resched_task(smt_curr);
2686 }
2687out_unlock:
2688 for_each_cpu_mask(i, sibling_map)
2689 spin_unlock(&cpu_rq(i)->lock);
2690 return ret;
2691}
2692#else
2693static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
2694{
2695}
2696
2697static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
2698{
2699 return 0;
2700}
2701#endif
2702
2703
2704
2705
2706asmlinkage void __sched schedule(void)
2707{
2708 long *switch_count;
2709 task_t *prev, *next;
2710 runqueue_t *rq;
2711 prio_array_t *array;
2712 struct list_head *queue;
2713 unsigned long long now;
2714 unsigned long run_time;
2715 int cpu, idx;
2716
2717
2718
2719
2720
2721
2722
2723 if (likely(!(current->exit_state & (EXIT_DEAD | EXIT_ZOMBIE)))) {
2724 if (unlikely(in_atomic())) {
2725 printk(KERN_ERR "bad: scheduling while atomic!\n");
2726 dump_stack();
2727 }
2728 }
2729
2730need_resched:
2731 preempt_disable();
2732 prev = current;
2733 rq = this_rq();
2734
2735
2736
2737
2738
2739 if (unlikely(current == rq->idle) && current->state != TASK_RUNNING) {
2740 printk(KERN_ERR "bad: scheduling from the idle thread!\n");
2741 dump_stack();
2742 }
2743
2744 release_kernel_lock(prev);
2745 schedstat_inc(rq, sched_cnt);
2746 now = sched_clock();
2747 if (likely(now - prev->timestamp < NS_MAX_SLEEP_AVG))
2748 run_time = now - prev->timestamp;
2749 else
2750 run_time = NS_MAX_SLEEP_AVG;
2751
2752
2753
2754
2755
2756
2757 if (HIGH_CREDIT(prev))
2758 run_time /= (CURRENT_BONUS(prev) ? : 1);
2759
2760 spin_lock_irq(&rq->lock);
2761
2762 if (unlikely(current->flags & PF_DEAD))
2763 current->