RHEL4/mm/swap.c
<<
>>
Prefs
   1/*
   2 *  linux/mm/swap.c
   3 *
   4 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   5 */
   6
   7/*
    8 * This file contains the default values for the operation of the
   9 * Linux VM subsystem. Fine-tuning documentation can be found in
  10 * Documentation/sysctl/vm.txt.
  11 * Started 18.12.91
  12 * Swap aging added 23.2.95, Stephen Tweedie.
  13 * Buffermem limits added 12.3.98, Rik van Riel.
  14 */
  15
  16#include <linux/mm.h>
  17#include <linux/sched.h>
  18#include <linux/kernel_stat.h>
  19#include <linux/swap.h>
  20#include <linux/mman.h>
  21#include <linux/pagemap.h>
  22#include <linux/pagevec.h>
  23#include <linux/init.h>
  24#include <linux/module.h>
  25#include <linux/mm_inline.h>
  26#include <linux/buffer_head.h>  /* for try_to_release_page() */
  27#include <linux/module.h>
  28#include <linux/percpu_counter.h>
  29#include <linux/percpu.h>
  30#include <linux/cpu.h>
  31#include <linux/notifier.h>
  32#include <linux/init.h>
  33
   34/*
 * How many pages do we try to swap or page in/out together?
 *
 * swap_setup() sets this to 2 when fewer than 16 megabytes of physical
 * memory are present and to 3 otherwise.
 *
 * NOTE(review): presumably used as a log2 exponent (1 << page_cluster pages
 * per cluster) - confirm against the readers of this variable, none of
 * which are in this file.
 */
   35int page_cluster;
  36
   37/*
 * When the pagecache is over the limit configured through
 * /proc/sys/vm/pagecache, the VM does the following:
 * - mark_page_accessed() keeps unmapped pages on the inactive_list.
 * - munmap()'d pages are moved to the inactive_list.
 * - shrink_list() won't activate unmapped and referenced pages from a
 *   mapped object.
 *
 * NOTE(review): judging by its name this variable is that limit expressed
 * as a percentage, defaulting to 100 - confirm against pagecache_over_max()
 * and the sysctl table, neither of which is visible here.
 */
   44int pagecache_maxpercent = 100;
  45
  46void put_page(struct page *page)
  47{
  48        if (unlikely(PageCompound(page))) {
  49                page = (struct page *)page->private;
  50                if (put_page_testzero(page)) {
  51                        void (*dtor)(struct page *page);
  52
  53                        dtor = (void (*)(struct page *))page[1].mapping;
  54                        (*dtor)(page);
  55                }
  56                return;
  57        }
  58        if (!PageReserved(page) && put_page_testzero(page))
  59                __page_cache_release(page);
  60}
  61EXPORT_SYMBOL(put_page);
  62
  63/*
  64 * Writeback is about to end against a page which has been marked for immediate
  65 * reclaim.  If it still appears to be reclaimable, move it to the tail of the
  66 * inactive list.  The page still has PageWriteback set, which will pin it.
  67 *
  68 * We don't expect many pages to come through here, so don't bother batching
  69 * things up.
  70 *
  71 * To avoid placing the page at the tail of the LRU while PG_writeback is still
  72 * set, this function will clear PG_writeback before performing the page
  73 * motion.  Do that inside the lru lock because once PG_writeback is cleared
  74 * we may not touch the page.
  75 *
  76 * Returns zero if it cleared PG_writeback.
  77 */
  78int rotate_reclaimable_page(struct page *page)
  79{
  80        struct zone *zone;
  81        unsigned long flags;
  82
  83        if (PageLocked(page))
  84                return 1;
  85        if (PageDirty(page))
  86                return 1;
  87        if (PageActive(page))
  88                return 1;
  89        if (!PageLRU(page))
  90                return 1;
  91
  92        zone = page_zone(page);
  93        spin_lock_irqsave(&zone->lru_lock, flags);
  94        if (PageLRU(page) && !PageActive(page)) {
  95                list_del(&page->lru);
  96                list_add_tail(&page->lru, &zone->inactive_list);
  97                inc_page_state(pgrotated);
  98        }
  99        if (!test_clear_page_writeback(page))
 100                BUG();
 101        spin_unlock_irqrestore(&zone->lru_lock, flags);
 102        return 0;
 103}
 104
 105/*
 106 * FIXME: speed this up?
 107 */
 108void fastcall activate_page(struct page *page)
 109{
 110        struct zone *zone = page_zone(page);
 111
 112        spin_lock_irq(&zone->lru_lock);
 113        if (PageLRU(page) && !PageActive(page)) {
 114                del_page_from_inactive_list(zone, page);
 115                SetPageActive(page);
 116                add_page_to_active_list(zone, page);
 117                inc_page_state(pgactivate);
 118        }
 119        spin_unlock_irq(&zone->lru_lock);
 120}
 121
 122static DEFINE_PER_CPU(struct pagevec, deactivate_pvecs) = { 0, };
 123
 124static void __pagevec_deactivate(struct pagevec *pvec)
 125{
 126        int i;
 127        struct zone *zone = NULL;
 128
 129        for (i = 0; i < pagevec_count(pvec); i++) {
 130                struct page *page = pvec->pages[i];
 131                struct zone *pagezone = page_zone(page);
 132
 133                if (pagezone != zone) {
 134                        if (zone)
 135                                spin_unlock_irq(&zone->lru_lock);
 136                        zone = pagezone;
 137                        spin_lock_irq(&zone->lru_lock);
 138                }
 139
 140                /*
 141                 * Deactivate the page if it is unmapped.
 142                 */
 143                if (PageLRU(page) && PageActive(page) && !page_mapped(page)) {
 144                        ClearPageActive(page);
 145                        del_page_from_active_list(zone, page);
 146                        add_page_to_inactive_list(zone, page);
 147                        inc_page_state(pgdeactivate);
 148                }
 149        }
 150        if (zone)
 151                spin_unlock_irq(&zone->lru_lock);
 152        release_pages(pvec->pages, pvec->nr, pvec->cold);
 153        pagevec_reinit(pvec);
 154}
 155
 156void fastcall deactivate_unmapped_page(struct page *page)
 157{
 158        struct pagevec *pvec;
 159
 160        if (PageActive(page) && PageLRU(page)) {
 161                pvec = &get_cpu_var(deactivate_pvecs);
 162                page_cache_get(page);
 163                if (!pagevec_add(pvec, page))
 164                        __pagevec_deactivate(pvec);
 165                put_cpu_var(deactivate_pvecs);
 166        }
 167}
 168
 169static DEFINE_PER_CPU(struct pagevec, mark_accessed_pvecs) = { 0, };
 170
 171static void __pagevec_mark_accessed(struct pagevec *pvec)
 172{
 173        int i;
 174        struct zone *zone = NULL;
 175
 176        for (i = 0; i < pagevec_count(pvec); i++) {
 177                struct page *page = pvec->pages[i];
 178                struct zone *pagezone = page_zone(page);
 179
 180                if (pagezone != zone) {
 181                        if (zone)
 182                                spin_unlock_irq(&zone->lru_lock);
 183                        zone = pagezone;
 184                        spin_lock_irq(&zone->lru_lock);
 185                }
 186                if (PageLRU(page) && !PageActive(page)) {
 187                        /*
 188                         * Move unmapped pages to the head of the
 189                         * inactive list.  Move mapped pages to the
 190                         * head of the active list.
 191                         */
 192                        if (!page_mapped(page) && pagecache_over_max()) {
 193                                list_move(&page->lru, &zone->inactive_list);
 194                        } else {
 195                                del_page_from_inactive_list(zone, page);
 196                                SetPageActive(page);
 197                                add_page_to_active_list(zone, page);
 198                                inc_page_state(pgactivate);
 199                                ClearPageReferenced(page);
 200                        }
 201                }
 202        }
 203        if (zone)
 204                spin_unlock_irq(&zone->lru_lock);
 205        release_pages(pvec->pages, pvec->nr, pvec->cold);
 206        pagevec_reinit(pvec);
 207}
 208
 209/*
 210 * Mark a page as having seen activity.
 211 *
 212 * inactive,unreferenced        ->      inactive,referenced
 213 * inactive,referenced          ->      active,unreferenced
 214 * active,unreferenced          ->      active,referenced
 215 *      When pagecache_over_max() is true:
 216 * inactive,referenced,unmapped ->      head of inactive,referenced
 217 */
 218void fastcall mark_page_accessed(struct page *page)
 219{
 220        if (!PageActive(page) && PageReferenced(page) && PageLRU(page)) {
 221                struct pagevec *pvec;
 222
 223                pvec = &get_cpu_var(mark_accessed_pvecs);
 224                page_cache_get(page);
 225                if (!pagevec_add(pvec, page))
 226                        __pagevec_mark_accessed(pvec);
 227                put_cpu_var(mark_accessed_pvecs);
 228        } else if (!PageReferenced(page)) {
 229                SetPageReferenced(page);
 230        }
 231}
 232
 233EXPORT_SYMBOL(mark_page_accessed);
 234
 235/**
 236 * lru_cache_add: add a page to the page lists
 237 * @page: the page to add
 238 */
 239static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, };
 240static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, };
 241
 242void fastcall lru_cache_add(struct page *page)
 243{
 244        struct pagevec *pvec = &get_cpu_var(lru_add_pvecs);
 245
 246        page_cache_get(page);
 247        if (!pagevec_add(pvec, page))
 248                __pagevec_lru_add(pvec);
 249        put_cpu_var(lru_add_pvecs);
 250}
 251
 252void fastcall lru_cache_add_active(struct page *page)
 253{
 254        struct pagevec *pvec = &get_cpu_var(lru_add_active_pvecs);
 255
 256        page_cache_get(page);
 257        if (!pagevec_add(pvec, page))
 258                __pagevec_lru_add_active(pvec);
 259        put_cpu_var(lru_add_active_pvecs);
 260}
 261
 262void lru_add_drain(void)
 263{
 264        struct pagevec *pvec = &get_cpu_var(lru_add_pvecs);
 265
 266        if (pagevec_count(pvec))
 267                __pagevec_lru_add(pvec);
 268        pvec = &__get_cpu_var(lru_add_active_pvecs);
 269        if (pagevec_count(pvec))
 270                __pagevec_lru_add_active(pvec);
 271        pvec = &__get_cpu_var(mark_accessed_pvecs);
 272        if (pagevec_count(pvec))
 273                __pagevec_mark_accessed(pvec);
 274        pvec = &__get_cpu_var(deactivate_pvecs);
 275        if (pagevec_count(pvec))
 276                __pagevec_deactivate(pvec);
 277        put_cpu_var(lru_add_pvecs);
 278}
 279
 280/*
 281 * This path almost never happens for VM activity - pages are normally
 282 * freed via pagevecs.  But it gets used by networking.
 283 */
 284void fastcall __page_cache_release(struct page *page)
 285{
 286        unsigned long flags;
 287        struct zone *zone = page_zone(page);
 288
 289        spin_lock_irqsave(&zone->lru_lock, flags);
 290        if (TestClearPageLRU(page))
 291                del_page_from_lru(zone, page);
 292        if (page_count(page) != 0)
 293                page = NULL;
 294        spin_unlock_irqrestore(&zone->lru_lock, flags);
 295        if (page)
 296                free_hot_page(page);
 297}
 298
 299EXPORT_SYMBOL(__page_cache_release);
 300
 301/*
 302 * Batched page_cache_release().  Decrement the reference count on all the
 303 * passed pages.  If it fell to zero then remove the page from the LRU and
 304 * free it.
 305 *
 306 * Avoid taking zone->lru_lock if possible, but if it is taken, retain it
 307 * for the remainder of the operation.
 308 *
 309 * The locking in this function is against shrink_cache(): we recheck the
 310 * page count inside the lock to see whether shrink_cache grabbed the page
 311 * via the LRU.  If it did, give up: shrink_cache will free it.
 312 */
 313void release_pages(struct page **pages, int nr, int cold)
 314{
 315        int i;
 316        struct pagevec pages_to_free;
 317        struct zone *zone = NULL;
 318
 319        pagevec_init(&pages_to_free, cold);
 320        for (i = 0; i < nr; i++) {
 321                struct page *page = pages[i];
 322                struct zone *pagezone;
 323
 324                if (PageReserved(page) || !put_page_testzero(page))
 325                        continue;
 326
 327                pagezone = page_zone(page);
 328                if (pagezone != zone) {
 329                        if (zone)
 330                                spin_unlock_irq(&zone->lru_lock);
 331                        zone = pagezone;
 332                        spin_lock_irq(&zone->lru_lock);
 333                }
 334                if (TestClearPageLRU(page))
 335                        del_page_from_lru(zone, page);
 336                if (page_count(page) == 0) {
 337                        if (!pagevec_add(&pages_to_free, page)) {
 338                                spin_unlock_irq(&zone->lru_lock);
 339                                __pagevec_free(&pages_to_free);
 340                                pagevec_reinit(&pages_to_free);
 341                                zone = NULL;    /* No lock is held */
 342                        }
 343                }
 344        }
 345        if (zone)
 346                spin_unlock_irq(&zone->lru_lock);
 347
 348        pagevec_free(&pages_to_free);
 349}
 350
 351/*
 352 * The pages which we're about to release may be in the deferred lru-addition
 353 * queues.  That would prevent them from really being freed right now.  That's
 354 * OK from a correctness point of view but is inefficient - those pages may be
 355 * cache-warm and we want to give them back to the page allocator ASAP.
 356 *
 357 * So __pagevec_release() will drain those queues here.  __pagevec_lru_add()
 358 * and __pagevec_lru_add_active() call release_pages() directly to avoid
 359 * mutual recursion.
 360 */
 361void __pagevec_release(struct pagevec *pvec)
 362{
 363        lru_add_drain();
 364        release_pages(pvec->pages, pagevec_count(pvec), pvec->cold);
 365        pagevec_reinit(pvec);
 366}
 367
 368/*
 369 * pagevec_release() for pages which are known to not be on the LRU
 370 *
 371 * This function reinitialises the caller's pagevec.
 372 */
 373void __pagevec_release_nonlru(struct pagevec *pvec)
 374{
 375        int i;
 376        struct pagevec pages_to_free;
 377
 378        pagevec_init(&pages_to_free, pvec->cold);
 379        pages_to_free.cold = pvec->cold;
 380        for (i = 0; i < pagevec_count(pvec); i++) {
 381                struct page *page = pvec->pages[i];
 382
 383                BUG_ON(PageLRU(page));
 384                if (put_page_testzero(page))
 385                        pagevec_add(&pages_to_free, page);
 386        }
 387        pagevec_free(&pages_to_free);
 388        pagevec_reinit(pvec);
 389}
 390
 391/*
 392 * Add the passed pages to the LRU, then drop the caller's refcount
 393 * on them.  Reinitialises the caller's pagevec.
 394 */
 395void __pagevec_lru_add(struct pagevec *pvec)
 396{
 397        int i;
 398        struct zone *zone = NULL;
 399
 400        for (i = 0; i < pagevec_count(pvec); i++) {
 401                struct page *page = pvec->pages[i];
 402                struct zone *pagezone = page_zone(page);
 403
 404                if (pagezone != zone) {
 405                        if (zone)
 406                                spin_unlock_irq(&zone->lru_lock);
 407                        zone = pagezone;
 408                        spin_lock_irq(&zone->lru_lock);
 409                }
 410                if (TestSetPageLRU(page))
 411                        BUG();
 412                add_page_to_inactive_list(zone, page);
 413        }
 414        if (zone)
 415                spin_unlock_irq(&zone->lru_lock);
 416        release_pages(pvec->pages, pvec->nr, pvec->cold);
 417        pagevec_reinit(pvec);
 418}
 419
 420EXPORT_SYMBOL(__pagevec_lru_add);
 421
 422void __pagevec_lru_add_active(struct pagevec *pvec)
 423{
 424        int i;
 425        struct zone *zone = NULL;
 426
 427        for (i = 0; i < pagevec_count(pvec); i++) {
 428                struct page *page = pvec->pages[i];
 429                struct zone *pagezone = page_zone(page);
 430
 431                if (pagezone != zone) {
 432                        if (zone)
 433                                spin_unlock_irq(&zone->lru_lock);
 434                        zone = pagezone;
 435                        spin_lock_irq(&zone->lru_lock);
 436                }
 437                if (TestSetPageLRU(page))
 438                        BUG();
 439                if (TestSetPageActive(page))
 440                        BUG();
 441                add_page_to_active_list(zone, page);
 442        }
 443        if (zone)
 444                spin_unlock_irq(&zone->lru_lock);
 445        release_pages(pvec->pages, pvec->nr, pvec->cold);
 446        pagevec_reinit(pvec);
 447}
 448
 449/*
 450 * Try to drop buffers from the pages in a pagevec
 451 */
 452void pagevec_strip(struct pagevec *pvec)
 453{
 454        int i;
 455
 456        for (i = 0; i < pagevec_count(pvec); i++) {
 457                struct page *page = pvec->pages[i];
 458
 459                if (PagePrivate(page) && !TestSetPageLocked(page)) {
 460                        if (PagePrivate(page))
 461                                try_to_release_page(page, 0);
 462                        unlock_page(page);
 463                }
 464        }
 465}
 466
 467/**
 468 * pagevec_lookup - gang pagecache lookup
 469 * @pvec:       Where the resulting pages are placed
 470 * @mapping:    The address_space to search
 471 * @start:      The starting page index
 472 * @nr_pages:   The maximum number of pages
 473 *
 474 * pagevec_lookup() will search for and return a group of up to @nr_pages pages
 475 * in the mapping.  The pages are placed in @pvec.  pagevec_lookup() takes a
 476 * reference against the pages in @pvec.
 477 *
 478 * The search returns a group of mapping-contiguous pages with ascending
 479 * indexes.  There may be holes in the indices due to not-present pages.
 480 *
 481 * pagevec_lookup() returns the number of pages which were found.
 482 */
 483unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
 484                pgoff_t start, unsigned nr_pages)
 485{
 486        pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages);
 487        return pagevec_count(pvec);
 488}
 489
 490unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
 491                pgoff_t *index, int tag, unsigned nr_pages)
 492{
 493        pvec->nr = find_get_pages_tag(mapping, index, tag,
 494                                        nr_pages, pvec->pages);
 495        return pagevec_count(pvec);
 496}
 497
 498
 499#ifdef CONFIG_SMP
 500/*
 501 * We tolerate a little inaccuracy to avoid ping-ponging the counter between
 502 * CPUs
 503 */
 504#define ACCT_THRESHOLD  max(16, NR_CPUS * 2)
 505
 506static DEFINE_PER_CPU(long, committed_space) = 0;
 507
 508void vm_acct_memory(long pages)
 509{
 510        long *local;
 511
 512        preempt_disable();
 513        local = &__get_cpu_var(committed_space);
 514        *local += pages;
 515        if (*local > ACCT_THRESHOLD || *local < -ACCT_THRESHOLD) {
 516                atomic_add(*local, &vm_committed_space);
 517                *local = 0;
 518        }
 519        preempt_enable();
 520}
 521EXPORT_SYMBOL(vm_acct_memory);
 522
 523#ifdef CONFIG_HOTPLUG_CPU
 524static void lru_drain_cache(unsigned int cpu)
 525{
 526        struct pagevec *pvec = &per_cpu(lru_add_pvecs, cpu);
 527
 528        /* CPU is dead, so no locking needed. */
 529        if (pagevec_count(pvec))
 530                __pagevec_lru_add(pvec);
 531        pvec = &per_cpu(lru_add_active_pvecs, cpu);
 532        if (pagevec_count(pvec))
 533                __pagevec_lru_add_active(pvec);
 534}
 535
 536/* Drop the CPU's cached committed space back into the central pool. */
 537static int cpu_swap_callback(struct notifier_block *nfb,
 538                             unsigned long action,
 539                             void *hcpu)
 540{
 541        long *committed;
 542
 543        committed = &per_cpu(committed_space, (long)hcpu);
 544        if (action == CPU_DEAD) {
 545                atomic_add(*committed, &vm_committed_space);
 546                *committed = 0;
 547                lru_drain_cache((long)hcpu);
 548        }
 549        return NOTIFY_OK;
 550}
 551#endif /* CONFIG_HOTPLUG_CPU */
 552#endif /* CONFIG_SMP */
 553
 554#ifdef CONFIG_SMP
 555void percpu_counter_mod(struct percpu_counter *fbc, long amount)
 556{
 557        long count;
 558        long *pcount;
 559        int cpu = get_cpu();
 560
 561        pcount = per_cpu_ptr(fbc->counters, cpu);
 562        count = *pcount + amount;
 563        if (count >= FBC_BATCH || count <= -FBC_BATCH) {
 564                spin_lock(&fbc->lock);
 565                fbc->count += count;
 566                spin_unlock(&fbc->lock);
 567                count = 0;
 568        }
 569        *pcount = count;
 570        put_cpu();
 571}
 572EXPORT_SYMBOL(percpu_counter_mod);
 573#endif
 574
 575/*
 576 * Perform any setup for the swap system
 577 */
 578void __init swap_setup(void)
 579{
 580        unsigned long megs = num_physpages >> (20 - PAGE_SHIFT);
 581
 582        /* Use a smaller cluster for small-memory machines */
 583        if (megs < 16)
 584                page_cluster = 2;
 585        else
 586                page_cluster = 3;
 587        /*
 588         * Right now other parts of the system means that we
 589         * _really_ don't want to cluster much more
 590         */
 591        hotcpu_notifier(cpu_swap_callback, 0);
 592}
 593