1
2
3
4
5
6
7
8
9
10
11
12
13
14#include <linux/mm.h>
15#include <linux/module.h>
16#include <linux/slab.h>
17#include <linux/kernel_stat.h>
18#include <linux/swap.h>
19#include <linux/pagemap.h>
20#include <linux/init.h>
21#include <linux/highmem.h>
22#include <linux/file.h>
23#include <linux/writeback.h>
24#include <linux/suspend.h>
25#include <linux/blkdev.h>
26#include <linux/buffer_head.h>
27
28#include <linux/mm_inline.h>
29#include <linux/pagevec.h>
30#include <linux/backing-dev.h>
31#include <linux/rmap.h>
32#include <linux/topology.h>
33#include <linux/cpu.h>
34#include <linux/notifier.h>
35#include <linux/rwsem.h>
36
37#include <asm/tlbflush.h>
38#include <asm/div64.h>
39
40#include <linux/swapops.h>
41
42
/* Result of pageout(); consumed by the switch statement in shrink_list(). */
typedef enum {
	/* Page is not freeable or has no mapping: shrink_list() leaves it alone. */
	PAGE_KEEP,
	/* may_write_to_queue() refused the write; counted in sc->nr_congested. */
	PAGE_CONGESTED,
	/* No ->writepage, or ->writepage returned WRITEPAGE_ACTIVATE: activate the page. */
	PAGE_ACTIVATE,
	/* ->writepage was invoked on the page. */
	PAGE_SUCCESS,
	/* clear_page_dirty_for_io() returned 0, so nothing was written. */
	PAGE_CLEAN,
} pageout_t;
55
/*
 * Parameters and running counters shared by one reclaim pass.  Filled in by
 * try_to_free_pages() / balance_pgdat() and passed down to shrink_zone(),
 * refill_inactive_zone(), shrink_cache() and shrink_list().
 */
struct scan_control {
	/* Scan budget for the next refill_inactive_zone()/shrink_cache() call; set by shrink_zone(). */
	unsigned long nr_to_scan;

	/* Pages examined by shrink_list(); mapped or swap-cache pages are counted twice. */
	unsigned long nr_scanned;

	/* Pages freed so far; shrink_list() adds to it, callers add slab reclaim. */
	unsigned long nr_reclaimed;

	/* Times pageout() returned PAGE_CONGESTED in shrink_list(). */
	unsigned long nr_congested;

	/* Snapshot of read_page_state(nr_mapped); feeds mapped_ratio in refill_inactive_zone(). */
	unsigned long nr_mapped;

	/* Remaining reclaim target; reset in shrink_zone(), decremented in shrink_cache(). */
	int nr_to_reclaim;

	/* Scan priority: list sizes are shifted right by this in shrink_zone(). */
	unsigned int priority;

	/* Allocation flags; shrink_list() checks __GFP_FS and __GFP_IO. */
	unsigned int gfp_mask;

	/* When zero and laptop_mode is set, shrink_list() does not write dirty pages. */
	int may_writepage;

	/* Pages found dirty or under writeback after pageout(); gates blk_congestion_wait(). */
	unsigned int nr_ios;

	/* When zero, shrink_zone() stops early once free_pages > 2 * pages_high. */
	int order;
};
87
88
89
90
91
/* One registered cache shrinker; allocated by set_shrinker() and kept on shrinker_list. */
struct shrinker {
	/* Callback; shrink_slab() calls it with 0 (result scales the pressure) or SHRINK_BATCH. */
	shrinker_t shrinker;
	/* Linkage on shrinker_list. */
	struct list_head list;
	/* Divisor applied to the scan pressure in shrink_slab(). */
	int seeks;
	/* Scan work accumulated, and carried over, between shrink_slab() calls. */
	long nr;
};
98
/*
 * The struct page whose ->lru node is _head->prev.  Given a list head this
 * yields the page at the tail of that list.
 */
#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
100
/*
 * prefetch_prev_lru_page(_page, _base, _field)
 *
 * If the LRU node before _page is not the list head _base, call prefetch()
 * on the _field member of the page that owns that node.  Expands to an
 * empty statement when ARCH_HAS_PREFETCH is not defined.
 *
 * _page and _base are parenthesized wherever they are expanded so that
 * arbitrary pointer expressions may be passed; _field is a member name and
 * cannot be parenthesized.
 */
#ifdef ARCH_HAS_PREFETCH
#define prefetch_prev_lru_page(_page, _base, _field)			\
	do {								\
		if ((_page)->lru.prev != (_base)) {			\
			struct page *prev;				\
									\
			prev = lru_to_page(&((_page)->lru));		\
			prefetch(&prev->_field);			\
		}							\
	} while (0)
#else
#define prefetch_prev_lru_page(_page, _base, _field) do { } while (0)
#endif
114
/*
 * prefetchw_prev_lru_page(_page, _base, _field)
 *
 * If the LRU node before _page is not the list head _base, call prefetchw()
 * on the _field member of the page that owns that node.  Expands to an
 * empty statement when ARCH_HAS_PREFETCHW is not defined.
 *
 * _page and _base are parenthesized wherever they are expanded so that
 * arbitrary pointer expressions may be passed; _field is a member name and
 * cannot be parenthesized.
 */
#ifdef ARCH_HAS_PREFETCHW
#define prefetchw_prev_lru_page(_page, _base, _field)			\
	do {								\
		if ((_page)->lru.prev != (_base)) {			\
			struct page *prev;				\
									\
			prev = lru_to_page(&((_page)->lru));		\
			prefetchw(&prev->_field);			\
		}							\
	} while (0)
#else
#define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
#endif
128
129
130
131
/*
 * Added to the swap_tendency sum in refill_inactive_zone(); mapped pages
 * become eligible for deactivation once that sum reaches 100.
 */
int vm_swappiness = 60;

/*
 * Percentage of zone->nr_active that refill_inactive_zone() and
 * shrink_zone() compare against zone->nr_inactive.  With the default of 0
 * those comparisons are never true.
 */
int vm_inactive_percent = 0;

/* Set in kswapd_init() from nr_free_pagecache_pages(); divisor for mapped_ratio. */
static long total_memory;

/* All registered shrinkers, and the rwsem that guards the list. */
static LIST_HEAD(shrinker_list);
static DECLARE_RWSEM(shrinker_rwsem);
143
144
145
146
147struct shrinker *set_shrinker(int seeks, shrinker_t theshrinker)
148{
149 struct shrinker *shrinker;
150
151 shrinker = kmalloc(sizeof(*shrinker), GFP_KERNEL);
152 if (shrinker) {
153 shrinker->shrinker = theshrinker;
154 shrinker->seeks = seeks;
155 shrinker->nr = 0;
156 down_write(&shrinker_rwsem);
157 list_add(&shrinker->list, &shrinker_list);
158 up_write(&shrinker_rwsem);
159 }
160 return shrinker;
161}
162EXPORT_SYMBOL(set_shrinker);
163
164
165
166
167void remove_shrinker(struct shrinker *shrinker)
168{
169 down_write(&shrinker_rwsem);
170 list_del(&shrinker->list);
171 up_write(&shrinker_rwsem);
172 kfree(shrinker);
173}
174EXPORT_SYMBOL(remove_shrinker);
175
/* Number of objects shrink_slab() asks a shrinker callback to scan per call. */
#define SHRINK_BATCH 128
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194int shrink_slab(unsigned long scanned, unsigned int gfp_mask,
195 unsigned long lru_pages)
196{
197 struct shrinker *shrinker;
198
199 if (scanned == 0)
200 scanned = SWAP_CLUSTER_MAX;
201
202 if (!down_read_trylock(&shrinker_rwsem))
203 return 0;
204
205 list_for_each_entry(shrinker, &shrinker_list, list) {
206 unsigned long long delta;
207 unsigned long total_scan;
208
209 delta = (4 * scanned) / shrinker->seeks;
210 delta *= (*shrinker->shrinker)(0, gfp_mask);
211 do_div(delta, lru_pages + 1);
212 shrinker->nr += delta;
213 if (shrinker->nr < 0)
214 shrinker->nr = LONG_MAX;
215
216 total_scan = shrinker->nr;
217 shrinker->nr = 0;
218
219 while (total_scan >= SHRINK_BATCH) {
220 long this_scan = SHRINK_BATCH;
221 int shrink_ret;
222
223 shrink_ret = (*shrinker->shrinker)(this_scan, gfp_mask);
224 if (shrink_ret == -1)
225 break;
226 mod_page_state(slabs_scanned, this_scan);
227 total_scan -= this_scan;
228
229 cond_resched();
230 }
231
232 shrinker->nr += total_scan;
233 }
234 up_read(&shrinker_rwsem);
235 return 0;
236}
237
238
239static inline int page_mapping_inuse(struct page *page)
240{
241 struct address_space *mapping;
242
243
244 if (page_mapped(page))
245 return 1;
246
247
248 if (PageSwapCache(page))
249 return 1;
250
251 mapping = page_mapping(page);
252 if (!mapping)
253 return 0;
254
255
256 return mapping_mapped(mapping) && !pagecache_over_max();
257}
258
/*
 * Returns non-zero when the page's reference count is exactly 2, or exactly
 * 3 when PagePrivate is set (i.e. one extra reference is tolerated for the
 * private data).
 */
static inline int is_page_cache_freeable(struct page *page)
{
	int expected = 2;

	if (PagePrivate(page))
		expected++;

	return page_count(page) == expected;
}
263
264static int may_write_to_queue(struct backing_dev_info *bdi)
265{
266 if (current_is_kswapd())
267 return 1;
268 if (current_is_pdflush())
269 return 1;
270 if (!bdi_write_congested(bdi))
271 return 1;
272 if (bdi == current->backing_dev_info)
273 return 1;
274 return 0;
275}
276
277
278
279
280
281
282
283
284
285
286
287
288
289static void handle_write_error(struct address_space *mapping,
290 struct page *page, int error)
291{
292 lock_page(page);
293 if (page_mapping(page) == mapping) {
294 if (error == -ENOSPC)
295 set_bit(AS_ENOSPC, &mapping->flags);
296 else
297 set_bit(AS_EIO, &mapping->flags);
298 }
299 unlock_page(page);
300}
301
302
303
304
305static pageout_t pageout(struct page *page, struct address_space *mapping)
306{
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324 if (!is_page_cache_freeable(page))
325 return PAGE_KEEP;
326 if (!mapping)
327 return PAGE_KEEP;
328 if (mapping->a_ops->writepage == NULL)
329 return PAGE_ACTIVATE;
330 if (!may_write_to_queue(mapping->backing_dev_info))
331 return PAGE_CONGESTED;
332
333 if (clear_page_dirty_for_io(page)) {
334 int res;
335 struct writeback_control wbc = {
336 .sync_mode = WB_SYNC_NONE,
337 .nr_to_write = SWAP_CLUSTER_MAX,
338 .nonblocking = 1,
339 .for_reclaim = 1,
340 };
341
342 SetPageReclaim(page);
343 res = mapping->a_ops->writepage(page, &wbc);
344 if (res < 0)
345 handle_write_error(mapping, page, res);
346 if (res == WRITEPAGE_ACTIVATE) {
347 ClearPageReclaim(page);
348 return PAGE_ACTIVATE;
349 }
350 if (!PageWriteback(page)) {
351
352 ClearPageReclaim(page);
353 }
354
355 return PAGE_SUCCESS;
356 }
357
358 return PAGE_CLEAN;
359}
360
361
362
363
/*
 * Try to free every page on @page_list.
 *
 * Pages that are freed are released through a pagevec; pages that survive
 * are collected on a local list and spliced back onto @page_list before
 * returning, with PageActive set on those that should be activated.
 * Updates sc->nr_scanned, sc->nr_congested, sc->nr_ios and sc->nr_reclaimed.
 *
 * Returns the number of pages freed.
 *
 * The exit labels differ by lock state: free_it, activate_locked and
 * keep_locked all unlock the page, keep is entered with the page not locked
 * by us.
 */
static int shrink_list(struct list_head *page_list, struct scan_control *sc)
{
	LIST_HEAD(ret_pages);
	struct pagevec freed_pvec;
	int pgactivate = 0;
	int reclaimed = 0;

	cond_resched();

	pagevec_init(&freed_pvec, 1);
	while (!list_empty(page_list)) {
		struct address_space *mapping;
		struct page *page;
		int may_enter_fs;
		int referenced;

		cond_resched();

		page = lru_to_page(page_list);
		list_del(&page->lru);

		/* Could not take the page lock: skip without unlocking. */
		if (TestSetPageLocked(page))
			goto keep;

		BUG_ON(PageActive(page));

		if (PageWriteback(page))
			goto keep_locked;

		sc->nr_scanned++;
		/* Mapped and swap-cache pages are counted twice. */
		if (page_mapped(page) || PageSwapCache(page))
			sc->nr_scanned++;

		referenced = page_referenced(page, 1, sc->priority <= 0);
		/* Referenced and considered in use: move to the active list. */
		if (referenced && page_mapping_inuse(page))
			goto activate_locked;

#ifdef CONFIG_SWAP
		/*
		 * Anonymous page not yet in the swap cache: add it; activate
		 * the page if add_to_swap() fails.
		 */
		if (PageAnon(page) && !PageSwapCache(page)) {
			if (!add_to_swap(page))
				goto activate_locked;
		}
#endif /* CONFIG_SWAP */

		mapping = page_mapping(page);
		may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
			(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));

		/* Page is mapped and has a mapping: try to unmap it. */
		if (page_mapped(page) && mapping) {
			switch (try_to_unmap(page)) {
			case SWAP_FAIL:
				goto activate_locked;
			case SWAP_AGAIN:
				goto keep_locked;
			case SWAP_SUCCESS:
				; /* try to free the page below */
			}
		}

		if (PageDirty(page)) {
			if (referenced)
				goto keep_locked;
			if (!may_enter_fs)
				goto keep_locked;
			if (laptop_mode && !sc->may_writepage)
				goto keep_locked;

			/* Page is dirty, try to write it out here */
			switch(pageout(page, mapping)) {
			case PAGE_CONGESTED:
				sc->nr_congested++;
				/* fall through */
			case PAGE_KEEP:
				goto keep_locked;
			case PAGE_ACTIVATE:
				goto activate_locked;
			case PAGE_SUCCESS:
				/*
				 * NOTE(review): the "goto keep" exits in this
				 * case skip unlock_page(), so the page lock is
				 * presumably released by ->writepage - confirm.
				 */
				if (PageWriteback(page) || PageDirty(page)) {
					sc->nr_ios++;
					goto keep;
				}
				/*
				 * Neither dirty nor under writeback any more:
				 * retake the lock and re-check before freeing.
				 */
				if (TestSetPageLocked(page))
					goto keep;
				if (PageDirty(page) || PageWriteback(page)) {
					sc->nr_ios++;
					goto keep_locked;
				}
				mapping = page_mapping(page);
				/* fall through */
			case PAGE_CLEAN:
				; /* try to free the page below */
			}
		}

		/*
		 * Page has private data: it must be released first.  If that
		 * fails the page is activated.  A page with no mapping whose
		 * only remaining reference is ours can be freed directly.
		 */
		if (PagePrivate(page)) {
			if (!try_to_release_page(page, sc->gfp_mask))
				goto activate_locked;
			if (!mapping && page_count(page) == 1)
				goto free_it;
		}

		if (!mapping)
			goto keep_locked;	/* truncate got there first */

		spin_lock_irq(&mapping->tree_lock);

		/*
		 * Under tree_lock: give up if anything beyond the two expected
		 * references exists or if the page became dirty again.
		 */
		if (page_count(page) != 2 || PageDirty(page)) {
			spin_unlock_irq(&mapping->tree_lock);
			goto keep_locked;
		}

#ifdef CONFIG_SWAP
		if (PageSwapCache(page)) {
			/* Save the swap entry before the page leaves the swap cache. */
			swp_entry_t swap = { .val = page->private };
			__delete_from_swap_cache(page);
			spin_unlock_irq(&mapping->tree_lock);
			swap_free(swap);
			__put_page(page);	/* The pagecache ref */
			goto free_it;
		}
#endif /* CONFIG_SWAP */

		__remove_from_page_cache(page);
		spin_unlock_irq(&mapping->tree_lock);
		__put_page(page);

free_it:
		unlock_page(page);
		reclaimed++;
		/* pagevec_add() returns 0 when the vector is full: flush it. */
		if (!pagevec_add(&freed_pvec, page))
			__pagevec_release_nonlru(&freed_pvec);
		continue;

activate_locked:
		SetPageActive(page);
		pgactivate++;
keep_locked:
		unlock_page(page);
keep:
		list_add(&page->lru, &ret_pages);
		BUG_ON(PageLRU(page));
	}
	/* Hand the surviving pages back to the caller. */
	list_splice(&ret_pages, page_list);
	if (pagevec_count(&freed_pvec))
		__pagevec_release_nonlru(&freed_pvec);
	mod_page_state(pgactivate, pgactivate);
	sc->nr_reclaimed += reclaimed;
	return reclaimed;
}
552
553
554
555
556
557
558
559
560
561
562
/*
 * Scan the tail of zone->inactive_list and try to free what is found.
 *
 * Works in batches of at most SWAP_CLUSTER_MAX pages until the budget in
 * sc->nr_to_scan is used up or a batch isolates nothing.  Each batch is
 * taken off the list under zone->lru_lock, handed to shrink_list() with the
 * lock dropped, and the pages shrink_list() returns are put back on the
 * zone's active or inactive list according to PageActive.
 *
 * Side effects: updates zone->nr_inactive, zone->pages_scanned, the
 * pgscan/pgsteal statistics and sc->nr_to_reclaim.
 */
static void shrink_cache(struct zone *zone, struct scan_control *sc)
{
	LIST_HEAD(page_list);
	struct pagevec pvec;
	int max_scan = sc->nr_to_scan;

	pagevec_init(&pvec, 1);

	lru_add_drain();
	spin_lock_irq(&zone->lru_lock);
	while (max_scan > 0) {
		struct page *page;
		int nr_taken = 0;
		int nr_scan = 0;
		int nr_freed;

		/*
		 * NOTE(review): nr_scan is post-incremented inside the loop
		 * test, so it also counts the final failing test and ends up
		 * one higher than the number of pages actually examined.
		 */
		while (nr_scan++ < SWAP_CLUSTER_MAX &&
				!list_empty(&zone->inactive_list)) {
			page = lru_to_page(&zone->inactive_list);

			prefetchw_prev_lru_page(page,
						&zone->inactive_list, flags);

			if (!TestClearPageLRU(page))
				BUG();
			list_del(&page->lru);
			if (get_page_testone(page)) {
				/*
				 * Could not take a usable reference (presumably
				 * the page is already being freed - confirm):
				 * undo and put it back at the list head.
				 */
				__put_page(page);
				SetPageLRU(page);
				list_add(&page->lru, &zone->inactive_list);
				continue;
			}
			list_add(&page->lru, &page_list);
			nr_taken++;
		}
		zone->nr_inactive -= nr_taken;
		zone->pages_scanned += nr_scan;
		spin_unlock_irq(&zone->lru_lock);

		/* Nothing isolated: leave with lru_lock already released. */
		if (nr_taken == 0)
			goto done;

		max_scan -= nr_scan;
		if (current_is_kswapd())
			mod_page_state_zone(zone, pgscan_kswapd, nr_scan);
		else
			mod_page_state_zone(zone, pgscan_direct, nr_scan);
		nr_freed = shrink_list(&page_list, sc);
		if (current_is_kswapd())
			mod_page_state(kswapd_steal, nr_freed);
		mod_page_state_zone(zone, pgsteal, nr_freed);
		sc->nr_to_reclaim -= nr_freed;

		spin_lock_irq(&zone->lru_lock);
		/*
		 * Put back any unfreeable pages.
		 */
		while (!list_empty(&page_list)) {
			page = lru_to_page(&page_list);
			if (TestSetPageLRU(page))
				BUG();
			list_del(&page->lru);
			if (PageActive(page))
				add_page_to_active_list(zone, page);
			else
				add_page_to_inactive_list(zone, page);
			/* Pagevec full: drop the lock while releasing the references. */
			if (!pagevec_add(&pvec, page)) {
				spin_unlock_irq(&zone->lru_lock);
				__pagevec_release(&pvec);
				spin_lock_irq(&zone->lru_lock);
			}
		}
	}
	spin_unlock_irq(&zone->lru_lock);
done:
	pagevec_release(&pvec);
}
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661static void
662refill_inactive_zone(struct zone *zone, struct scan_control *sc)
663{
664 int pgmoved;
665 int pgdeactivate = 0;
666 int pgscanned = 0;
667 int nr_pages = sc->nr_to_scan;
668 LIST_HEAD(l_hold);
669 LIST_HEAD(l_inactive);
670 LIST_HEAD(l_active);
671 struct page *page;
672 struct pagevec pvec;
673 int reclaim_mapped = 0;
674 long mapped_ratio;
675 long distress;
676 long swap_tendency;
677
678 lru_add_drain();
679 pgmoved = 0;
680 spin_lock_irq(&zone->lru_lock);
681 while (pgscanned < nr_pages && !list_empty(&zone->active_list)) {
682 page = lru_to_page(&zone->active_list);
683 prefetchw_prev_lru_page(page, &zone->active_list, flags);
684 if (!TestClearPageLRU(page))
685 BUG();
686 list_del(&page->lru);
687 if (get_page_testone(page)) {
688
689
690
691
692
693
694 __put_page(page);
695 SetPageLRU(page);
696 list_add(&page->lru, &zone->active_list);
697 } else {
698 list_add(&page->lru, &l_hold);
699 pgmoved++;
700 }
701 pgscanned++;
702 }
703 zone->nr_active -= pgmoved;
704 spin_unlock_irq(&zone->lru_lock);
705
706
707
708
709
710 distress = 100 >> zone->prev_priority;
711
712
713
714
715
716
717 mapped_ratio = (sc->nr_mapped * 100) / total_memory;
718
719
720
721
722
723
724
725
726
727
728 swap_tendency = mapped_ratio / 2 + distress + vm_swappiness;
729
730
731
732
733
734 if (swap_tendency >= 100)
735 reclaim_mapped = 1;
736
737 while (!list_empty(&l_hold)) {
738 cond_resched();
739 page = lru_to_page(&l_hold);
740 list_del(&page->lru);
741
742
743
744 if ((zone->nr_active*vm_inactive_percent/100 > zone->nr_inactive) &&
745 (total_swap_pages && PageAnon(page))) {
746 int referenced;
747
748 referenced = page_referenced(page, 0, sc->priority <= 0);
749 list_add(&page->lru, &l_inactive);
750 continue;
751 }
752
753 if (page_mapped(page)) {
754 if (!reclaim_mapped ||
755 (total_swap_pages == 0 && PageAnon(page)) ||
756 page_referenced(page, 0, sc->priority <= 0)) {
757 list_add(&page->lru, &l_active);
758 continue;
759 }
760 }
761 list_add(&page->lru, &l_inactive);
762 }
763
764 pagevec_init(&pvec, 1);
765 pgmoved = 0;
766 spin_lock_irq(&zone->lru_lock);
767 while (!list_empty(&l_inactive)) {
768 page = lru_to_page(&l_inactive);
769 prefetchw_prev_lru_page(page, &l_inactive, flags);
770 if (TestSetPageLRU(page))
771 BUG();
772 if (!TestClearPageActive(page))
773 BUG();
774 list_move(&page->lru, &zone->inactive_list);
775 pgmoved++;
776 if (!pagevec_add(&pvec, page)) {
777 zone->nr_inactive += pgmoved;
778 spin_unlock_irq(&zone->lru_lock);
779 pgdeactivate += pgmoved;
780 pgmoved = 0;
781 if (buffer_heads_over_limit)
782 pagevec_strip(&pvec);
783 __pagevec_release(&pvec);
784 spin_lock_irq(&zone->lru_lock);
785 }
786 }
787 zone->nr_inactive += pgmoved;
788 pgdeactivate += pgmoved;
789 if (buffer_heads_over_limit) {
790 spin_unlock_irq(&zone->lru_lock);
791 pagevec_strip(&pvec);
792 spin_lock_irq(&zone->lru_lock);
793 }
794
795 pgmoved = 0;
796 while (!list_empty(&l_active)) {
797 page = lru_to_page(&l_active);
798 prefetchw_prev_lru_page(page, &l_active, flags);
799 if (TestSetPageLRU(page))
800 BUG();
801 BUG_ON(!PageActive(page));
802 list_move(&page->lru, &zone->active_list);
803 pgmoved++;
804 if (!pagevec_add(&pvec, page)) {
805 zone->nr_active += pgmoved;
806 pgmoved = 0;
807 spin_unlock_irq(&zone->lru_lock);
808 __pagevec_release(&pvec);
809 spin_lock_irq(&zone->lru_lock);
810 }
811 }
812 zone->nr_active += pgmoved;
813 spin_unlock_irq(&zone->lru_lock);
814 pagevec_release(&pvec);
815
816 mod_page_state_zone(zone, pgrefill, pgscanned);
817 mod_page_state(pgdeactivate, pgdeactivate);
818}
819
820
821
822
823static void
824shrink_zone(struct zone *zone, struct scan_control *sc)
825{
826 unsigned long nr_active;
827 unsigned long nr_inactive;
828
829
830
831
832
833 zone->nr_scan_active += (zone->nr_active >> sc->priority) + 1;
834 nr_active = zone->nr_scan_active;
835 if (nr_active >= SWAP_CLUSTER_MAX)
836 zone->nr_scan_active = 0;
837 else
838 nr_active = 0;
839
840 zone->nr_scan_inactive += (zone->nr_inactive >> sc->priority) + 1;
841 nr_inactive = zone->nr_scan_inactive;
842 if (nr_inactive >= SWAP_CLUSTER_MAX)
843 zone->nr_scan_inactive = 0;
844 else
845 nr_inactive = 0;
846
847 sc->nr_to_reclaim = SWAP_CLUSTER_MAX;
848
849 while (nr_active || nr_inactive) {
850 if (current->flags & PF_MEMDIE)
851 break;
852
853 if ((zone->free_pages > zone->pages_high*2) && !sc->order)
854 break;
855
856 if (nr_active) {
857 sc->nr_to_scan = min(nr_active,
858 (unsigned long)SWAP_CLUSTER_MAX);
859 nr_active -= sc->nr_to_scan;
860 refill_inactive_zone(zone, sc);
861 }
862
863 if (nr_inactive) {
864 sc->nr_to_scan = min(nr_inactive,
865 (unsigned long)SWAP_CLUSTER_MAX);
866 nr_inactive -= sc->nr_to_scan;
867 shrink_cache(zone, sc);
868 if (sc->nr_to_reclaim <= 0)
869 break;
870
871 } else if (zone->nr_active*vm_inactive_percent/100 > zone->nr_inactive) {
872 sc->nr_to_scan = min(zone->nr_active,
873 (unsigned long)SWAP_CLUSTER_MAX);
874 shrink_cache(zone, sc);
875 if (sc->nr_to_reclaim <= 0)
876 break;
877 }
878 }
879
880 throttle_vm_writeout();
881}
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899static void
900shrink_caches(struct zone **zones, struct scan_control *sc)
901{
902 int i;
903
904 for (i = 0; zones[i] != NULL; i++) {
905 struct zone *zone = zones[i];
906
907 if (current->flags & PF_MEMDIE)
908 return NULL;
909
910 if (zone->present_pages == 0)
911 continue;
912
913 zone->temp_priority = sc->priority;
914 if (zone->prev_priority > sc->priority)
915 zone->prev_priority = sc->priority;
916
917 if (zone->all_unreclaimable && sc->priority != DEF_PRIORITY)
918 continue;
919
920 shrink_zone(zone, sc);
921 }
922}
923
924static int free_below_min(struct zone **zones, unsigned int gfp_mask,
925 int can_try_harder, int alloc_type,
926 unsigned int order)
927{
928 unsigned long free = 0;
929 unsigned long min = 0;
930 int i;
931
932 for (i = 0; zones[i] != NULL; i++) {
933 struct zone *zone = zones[i];
934
935 free += zone->free_pages;
936 min += zone_min(zone,gfp_mask,can_try_harder,alloc_type,order);
937 }
938
939 return (min >= free);
940}
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955int try_to_free_pages(struct zone **zones,
956 unsigned int gfp_mask, unsigned int order, int can_try_harder, int alloc_type)
957{
958 int priority;
959 int ret = 0;
960 int total_scanned = 0, total_reclaimed = 0;
961 struct reclaim_state *reclaim_state = current->reclaim_state;
962 struct scan_control sc;
963 unsigned long lru_pages = 0;
964 int i;
965
966 sc.gfp_mask = gfp_mask;
967 sc.may_writepage = 0;
968 sc.order = order;
969
970 inc_page_state(allocstall);
971
972 for (i = 0; zones[i] != NULL; i++) {
973 struct zone *zone = zones[i];
974
975 zone->temp_priority = DEF_PRIORITY;
976 lru_pages += zone->nr_active + zone->nr_inactive;
977 }
978
979 for (priority = DEF_PRIORITY; priority >= 0; priority--) {
980 sc.nr_mapped = read_page_state(nr_mapped);
981 sc.nr_scanned = 0;
982 sc.nr_reclaimed = 0;
983 sc.nr_congested = 0;
984 sc.priority = priority;
985 sc.nr_ios = 0;
986
987 if (current->flags & PF_MEMDIE)
988 goto out;
989
990 shrink_caches(zones, &sc);
991 shrink_slab(sc.nr_scanned, gfp_mask, lru_pages);
992 if (reclaim_state) {
993 sc.nr_reclaimed += reclaim_state->reclaimed_slab;
994 reclaim_state->reclaimed_slab = 0;
995 }
996 if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX) {
997 ret = 1;
998 goto out;
999 }
1000 total_scanned += sc.nr_scanned;
1001 total_reclaimed += sc.nr_reclaimed;
1002
1003
1004
1005
1006
1007
1008
1009
1010 if (total_scanned > SWAP_CLUSTER_MAX + SWAP_CLUSTER_MAX/2) {
1011 wakeup_bdflush(laptop_mode ? 0 : total_scanned);
1012 sc.may_writepage = 1;
1013 }
1014
1015
1016 if (sc.nr_scanned && sc.nr_ios && priority < DEF_PRIORITY - 2)
1017 blk_congestion_wait(WRITE, HZ/10);
1018 }
1019 if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY) &&
1020 !total_reclaimed && sc.nr_congested < SWAP_CLUSTER_MAX &&
1021 free_below_min(zones, gfp_mask, can_try_harder, alloc_type, order))
1022 out_of_memory(gfp_mask);
1023out:
1024 for (i = 0; zones[i] != 0; i++)
1025 zones[i]->prev_priority = zones[i]->temp_priority;
1026 return ret;
1027}
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
/*
 * Reclaim pages from the zones of one node.
 *
 * @pgdat:    node to work on
 * @nr_pages: 0 means "keep going until every scanned zone has
 *            free_pages > pages_high + protection"; non-zero means "try to
 *            free this many pages" (all zones of the node are scanned).
 *
 * Scans at priorities DEF_PRIORITY down to 0.  With nr_pages == 0 the
 * highest zone that is short of free pages is located first and only zones
 * up to and including it are scanned; if no zone is short the function
 * goes straight to "out".
 *
 * Returns the number of pages reclaimed in the last run of the priority
 * loop (the totals are reset at loop_again).
 *
 * current->reclaim_state is dereferenced without a NULL check; both callers
 * in this file (kswapd() and shrink_all_memory()) set it before calling.
 */
static int balance_pgdat(pg_data_t *pgdat, int nr_pages)
{
	int to_free = nr_pages;
	int all_zones_ok;
	int priority;
	int i;
	int total_scanned, total_reclaimed;
	struct reclaim_state *reclaim_state = current->reclaim_state;
	struct scan_control sc;

loop_again:
	total_scanned = 0;
	total_reclaimed = 0;
	sc.gfp_mask = GFP_KERNEL;
	sc.may_writepage = 0;
	sc.nr_mapped = read_page_state(nr_mapped);
	/* Non-zero order disables the "plenty free" early exit in shrink_zone(). */
	sc.order = nr_pages?1:0;

	inc_page_state(pageoutrun);

	for (i = 0; i < pgdat->nr_zones; i++) {
		struct zone *zone = pgdat->node_zones + i;

		zone->temp_priority = DEF_PRIORITY;
	}

	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
		int end_zone = 0;	/* Inclusive.  0 = ZONE_DMA */
		unsigned long lru_pages = 0;

		sc.nr_ios = 0;
		all_zones_ok = 1;

		if (nr_pages == 0) {
			/*
			 * Scan in the highmem->dma direction for the highest
			 * zone which needs scanning
			 */
			for (i = pgdat->nr_zones - 1; i >= 0; i--) {
				struct zone *zone = pgdat->node_zones + i;

				if (zone->present_pages == 0)
					continue;

				if (zone->all_unreclaimable &&
						priority != DEF_PRIORITY)
					continue;

				if (zone->free_pages <=
					(zone->pages_high + zone->protection[i])) {
					end_zone = i;
					goto scan;
				}
			}
			/* No zone is short of pages: nothing to do. */
			goto out;
		} else {
			end_zone = pgdat->nr_zones - 1;
		}
scan:
		/* Total LRU size of the zones to be scanned; scales slab pressure. */
		for (i = 0; i <= end_zone; i++) {
			struct zone *zone = pgdat->node_zones + i;

			lru_pages += zone->nr_active + zone->nr_inactive;
		}

		/*
		 * Now scan the zones in the dma->highmem direction, stopping
		 * at end_zone.
		 */
		for (i = 0; i <= end_zone; i++) {
			struct zone *zone = pgdat->node_zones + i;

			if (zone->present_pages == 0)
				continue;

			if (zone->all_unreclaimable && priority != DEF_PRIORITY)
				continue;

			if (nr_pages == 0) {	/* Not software suspend */
				if (zone->free_pages <=
					(zone->pages_high + zone->protection[end_zone]))
					all_zones_ok = 0;
			}
			zone->temp_priority = priority;
			if (zone->prev_priority > priority)
				zone->prev_priority = priority;
			sc.nr_scanned = 0;
			sc.nr_reclaimed = 0;
			sc.nr_congested = 0;
			sc.priority = priority;
			shrink_zone(zone, &sc);
			/* Count only the slab pages freed by the shrink_slab() call below. */
			reclaim_state->reclaimed_slab = 0;
			shrink_slab(sc.nr_scanned, GFP_KERNEL, lru_pages);
			sc.nr_reclaimed += reclaim_state->reclaimed_slab;
			total_reclaimed += sc.nr_reclaimed;
			total_scanned += sc.nr_scanned;
			if (zone->all_unreclaimable)
				continue;
			/* Scanned four times the LRU size: mark the zone unreclaimable. */
			if (zone->pages_scanned >= (zone->nr_active +
							zone->nr_inactive) * 4)
				zone->all_unreclaimable = 1;
			/*
			 * If we've done a decent amount of scanning and
			 * the reclaim ratio is low, start doing writepage
			 * even in laptop mode
			 */
			if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
			    total_scanned > total_reclaimed+total_reclaimed/2)
				sc.may_writepage = 1;
		}
		if (nr_pages && to_free > total_reclaimed)
			continue;	/* swsusp: need to do more work */
		if (all_zones_ok)
			break;		/* kswapd: all done */
		/*
		 * OK, kswapd is getting into trouble.  Take a nap, then take
		 * another pass across the zones.
		 */
		if (total_scanned && sc.nr_ios && priority < DEF_PRIORITY - 2)
			blk_congestion_wait(WRITE, HZ/10);

		/*
		 * Enough was reclaimed in this run: leave the priority loop.
		 * If zones are still not ok, the check after "out" restarts
		 * from loop_again.
		 */
		if (total_reclaimed >= SWAP_CLUSTER_MAX)
			break;
	}
out:
	for (i = 0; i < pgdat->nr_zones; i++) {
		struct zone *zone = pgdat->node_zones + i;

		zone->prev_priority = zone->temp_priority;
	}
	if (!all_zones_ok) {
		cond_resched();
		goto loop_again;
	}

	return total_reclaimed;
}
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216static int kswapd(void *p)
1217{
1218 pg_data_t *pgdat = (pg_data_t*)p;
1219 struct task_struct *tsk = current;
1220 DEFINE_WAIT(wait);
1221 struct reclaim_state reclaim_state = {
1222 .reclaimed_slab = 0,
1223 };
1224 cpumask_t cpumask;
1225
1226 daemonize("kswapd%d", pgdat->node_id);
1227 cpumask = node_to_cpumask(pgdat->node_id);
1228 if (!cpus_empty(cpumask))
1229 set_cpus_allowed(tsk, cpumask);
1230 current->reclaim_state = &reclaim_state;
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244 tsk->flags |= PF_MEMALLOC|PF_KSWAPD;
1245
1246 for ( ; ; ) {
1247 if (current->flags & PF_FREEZE)
1248 refrigerator(PF_FREEZE);
1249 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
1250 schedule();
1251 finish_wait(&pgdat->kswapd_wait, &wait);
1252
1253 balance_pgdat(pgdat, 0);
1254 }
1255 return 0;
1256}
1257
1258
1259
1260
1261void wakeup_kswapd(struct zone *zone)
1262{
1263 if (zone->present_pages == 0)
1264 return;
1265 if (zone->free_pages > zone->pages_low)
1266 return;
1267 if (!waitqueue_active(&zone->zone_pgdat->kswapd_wait))
1268 return;
1269 wake_up_interruptible(&zone->zone_pgdat->kswapd_wait);
1270}
1271
1272#ifdef CONFIG_PM
1273
1274
1275
1276
1277int shrink_all_memory(int nr_pages)
1278{
1279 pg_data_t *pgdat;
1280 int nr_to_free = nr_pages;
1281 int ret = 0;
1282 struct reclaim_state reclaim_state = {
1283 .reclaimed_slab = 0,
1284 };
1285
1286 current->reclaim_state = &reclaim_state;
1287 for_each_pgdat(pgdat) {
1288 int freed;
1289 freed = balance_pgdat(pgdat, nr_to_free);
1290 ret += freed;
1291 nr_to_free -= freed;
1292 if (nr_to_free <= 0)
1293 break;
1294 }
1295 current->reclaim_state = NULL;
1296 return ret;
1297}
1298#endif
1299
1300#ifdef CONFIG_HOTPLUG_CPU
1301
1302
1303
1304
1305static int __devinit cpu_callback(struct notifier_block *nfb,
1306 unsigned long action,
1307 void *hcpu)
1308{
1309 pg_data_t *pgdat;
1310 cpumask_t mask;
1311
1312 if (action == CPU_ONLINE) {
1313 for_each_pgdat(pgdat) {
1314 mask = node_to_cpumask(pgdat->node_id);
1315 if (any_online_cpu(mask) != NR_CPUS)
1316
1317 set_cpus_allowed(pgdat->kswapd, mask);
1318 }
1319 }
1320 return NOTIFY_OK;
1321}
1322#endif
1323
1324static int __init kswapd_init(void)
1325{
1326 pg_data_t *pgdat;
1327 swap_setup();
1328 for_each_pgdat(pgdat)
1329 pgdat->kswapd
1330 = find_task_by_pid(kernel_thread(kswapd, pgdat, CLONE_KERNEL));
1331 total_memory = nr_free_pagecache_pages();
1332 hotcpu_notifier(cpu_callback, 0);
1333 return 0;
1334}
1335
1336module_init(kswapd_init)
1337