RHEL4/mm/rmap.c
<<
>>
Prefs
   1/*
   2 * mm/rmap.c - physical to virtual reverse mappings
   3 *
   4 * Copyright 2001, Rik van Riel <riel@conectiva.com.br>
   5 * Released under the General Public License (GPL).
   6 *
   7 * Simple, low overhead reverse mapping scheme.
   8 * Please try to keep this thing as modular as possible.
   9 *
  10 * Provides methods for unmapping each kind of mapped page:
  11 * the anon methods track anonymous pages, and
  12 * the file methods track pages belonging to an inode.
  13 *
  14 * Original design by Rik van Riel <riel@conectiva.com.br> 2001
  15 * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004
  16 * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004
  17 * Contributions by Hugh Dickins <hugh@veritas.com> 2003, 2004
  18 */
  19
  20/*
  21 * Lock ordering in mm:
  22 *
  23 * inode->i_sem (while writing or truncating, not reading or faulting)
  24 *   inode->i_alloc_sem
  25 *
  26 * When a page fault occurs in writing from user to file, down_read
  27 * of mmap_sem nests within i_sem; in sys_msync, i_sem nests within
  28 * down_read of mmap_sem; i_sem and down_write of mmap_sem are never
  29 * taken together; in truncation, i_sem is taken outermost.
  30 *
  31 * mm->mmap_sem
  32 *   page->flags PG_locked (lock_page)
  33 *     mapping->i_mmap_lock
  34 *       anon_vma->lock
  35 *         mm->page_table_lock
  36 *           zone->lru_lock (in mark_page_accessed)
  37 *           swap_list_lock (in swap_free etc's swap_info_get)
  38 *             swap_device_lock (in swap_duplicate, swap_info_get)
  39 *             mapping->private_lock (in __set_page_dirty_buffers)
  40 *             inode_lock (in set_page_dirty's __mark_inode_dirty)
  41 *               sb_lock (within inode_lock in fs/fs-writeback.c)
  42 *               mapping->tree_lock (widely used, in set_page_dirty,
  43 *                         in arch-dependent flush_dcache_mmap_lock,
  44 *                         within inode_lock in __sync_single_inode)
  45 */
  46
  47#include <linux/mm.h>
  48#include <linux/pagemap.h>
  49#include <linux/swap.h>
  50#include <linux/swapops.h>
  51#include <linux/slab.h>
  52#include <linux/init.h>
  53#include <linux/rmap.h>
  54#include <linux/rcupdate.h>
  55
  56#include <asm/tlbflush.h>
  57
  58#ifdef CONFIG_HIGHMEM
  59extern atomic_t nr_mapped_high;
  60static inline void inc_mapped_high(struct page *page)
  61{
  62        if (is_highmem(page_zone(page)))
  63                atomic_inc(&nr_mapped_high);
  64}
  65
  66static inline void dec_mapped_high(struct page *page)
  67{
  68        if (is_highmem(page_zone(page)))
  69                atomic_dec(&nr_mapped_high);
  70}
  71#else
  72#define inc_mapped_high(page)
  73#define dec_mapped_high(page)
  74#endif
  75
  76//#define RMAP_DEBUG /* can be enabled only for debugging */
  77
  78kmem_cache_t *anon_vma_cachep;
  79
  80static inline void validate_anon_vma(struct vm_area_struct *find_vma)
  81{
  82#ifdef RMAP_DEBUG
  83        struct anon_vma *anon_vma = find_vma->anon_vma;
  84        struct vm_area_struct *vma;
  85        unsigned int mapcount = 0;
  86        int found = 0;
  87
  88        list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
  89                mapcount++;
  90                BUG_ON(mapcount > 100000);
  91                if (vma == find_vma)
  92                        found = 1;
  93        }
  94        BUG_ON(!found);
  95#endif
  96}
  97
  98/* This must be called under the mmap_sem. */
  99int anon_vma_prepare(struct vm_area_struct *vma)
 100{
 101        struct anon_vma *anon_vma = vma->anon_vma;
 102
 103        might_sleep();
 104        if (unlikely(!anon_vma)) {
 105                struct mm_struct *mm = vma->vm_mm;
 106                struct anon_vma *allocated, *locked;
 107
 108                anon_vma = find_mergeable_anon_vma(vma);
 109                if (anon_vma) {
 110                        allocated = NULL;
 111                        locked = anon_vma;
 112                        spin_lock(&locked->lock);
 113                } else {
 114                        anon_vma = anon_vma_alloc();
 115                        if (unlikely(!anon_vma))
 116                                return -ENOMEM;
 117                        allocated = anon_vma;
 118                        locked = NULL;
 119                }
 120
 121                /* page_table_lock to protect against threads */
 122                spin_lock(&mm->page_table_lock);
 123                if (likely(!vma->anon_vma)) {
 124                        vma->anon_vma = anon_vma;
 125                        list_add(&vma->anon_vma_node, &anon_vma->head);
 126                        allocated = NULL;
 127                }
 128                spin_unlock(&mm->page_table_lock);
 129
 130                if (locked)
 131                        spin_unlock(&locked->lock);
 132                if (unlikely(allocated))
 133                        anon_vma_free(allocated);
 134        }
 135        return 0;
 136}
 137
 138void __anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next)
 139{
 140        if (!vma->anon_vma) {
 141                BUG_ON(!next->anon_vma);
 142                vma->anon_vma = next->anon_vma;
 143                list_add(&vma->anon_vma_node, &next->anon_vma_node);
 144        } else {
 145                /* if they're both non-null they must be the same */
 146                BUG_ON(vma->anon_vma != next->anon_vma);
 147        }
 148        list_del(&next->anon_vma_node);
 149}
 150
 151void __anon_vma_link(struct vm_area_struct *vma)
 152{
 153        struct anon_vma *anon_vma = vma->anon_vma;
 154
 155        if (anon_vma) {
 156                list_add(&vma->anon_vma_node, &anon_vma->head);
 157                validate_anon_vma(vma);
 158        }
 159}
 160
 161void anon_vma_link(struct vm_area_struct *vma)
 162{
 163        struct anon_vma *anon_vma = vma->anon_vma;
 164
 165        if (anon_vma) {
 166                spin_lock(&anon_vma->lock);
 167                list_add(&vma->anon_vma_node, &anon_vma->head);
 168                validate_anon_vma(vma);
 169                spin_unlock(&anon_vma->lock);
 170        }
 171}
 172
 173void anon_vma_unlink(struct vm_area_struct *vma)
 174{
 175        struct anon_vma *anon_vma = vma->anon_vma;
 176        int empty;
 177
 178        if (!anon_vma)
 179                return;
 180
 181        spin_lock(&anon_vma->lock);
 182        validate_anon_vma(vma);
 183        list_del(&vma->anon_vma_node);
 184
 185        /* We must garbage collect the anon_vma if it's empty */
 186        empty = list_empty(&anon_vma->head);
 187        spin_unlock(&anon_vma->lock);
 188
 189        if (empty)
 190                anon_vma_free(anon_vma);
 191}
 192
 193static void anon_vma_ctor(void *data, kmem_cache_t *cachep, unsigned long flags)
 194{
 195        if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
 196                                                SLAB_CTOR_CONSTRUCTOR) {
 197                struct anon_vma *anon_vma = data;
 198
 199                spin_lock_init(&anon_vma->lock);
 200                INIT_LIST_HEAD(&anon_vma->head);
 201        }
 202}
 203
 204void __init anon_vma_init(void)
 205{
 206        anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
 207                        0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor, NULL);
 208}
 209
 210/*
 211 * Getting a lock on a stable anon_vma from a page off the LRU is
 212 * tricky: page_lock_anon_vma rely on RCU to guard against the races.
 213 */
 214static struct anon_vma *page_lock_anon_vma(struct page *page)
 215{
 216        struct anon_vma *anon_vma = NULL;
 217        unsigned long anon_mapping;
 218
 219        rcu_read_lock();
 220        anon_mapping = (unsigned long) page->mapping;
 221        if (!(anon_mapping & PAGE_MAPPING_ANON))
 222                goto out;
 223        if (!page_mapped(page))
 224                goto out;
 225
 226        anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
 227        spin_lock(&anon_vma->lock);
 228out:
 229        rcu_read_unlock();
 230        return anon_vma;
 231}
 232
 233/*
 234 * At what user virtual address is page expected in vma?
 235 */
 236static inline unsigned long
 237vma_address(struct page *page, struct vm_area_struct *vma)
 238{
 239        pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 240        unsigned long address;
 241
 242        address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
 243        if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
 244                /* page should be within any vma from prio_tree_next */
 245                BUG_ON(!PageAnon(page));
 246                return -EFAULT;
 247        }
 248        return address;
 249}
 250
 251/*
 252 * At what user virtual address is page expected in vma? checking that the
 253 * page matches the vma: currently only used by unuse_process, on anon pages.
 254 */
 255unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
 256{
 257        if (PageAnon(page)) {
 258                if ((void *)vma->anon_vma !=
 259                    (void *)page->mapping - PAGE_MAPPING_ANON)
 260                        return -EFAULT;
 261        } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) {
 262                if (vma->vm_file->f_mapping != page->mapping)
 263                        return -EFAULT;
 264        } else
 265                return -EFAULT;
 266        return vma_address(page, vma);
 267}
 268
 269/*
 270 * Subfunctions of page_referenced: page_referenced_one called
 271 * repeatedly from either page_referenced_anon or page_referenced_file.
 272 */
 273static int page_referenced_one(struct page *page,
 274        struct vm_area_struct *vma, unsigned int *mapcount, int ignore_token)
 275{
 276        struct mm_struct *mm = vma->vm_mm;
 277        unsigned long address;
 278        pgd_t *pgd;
 279        pmd_t *pmd;
 280        pte_t *pte;
 281        int referenced = 0;
 282
 283        if (!mm->rss)
 284                goto out;
 285        address = vma_address(page, vma);
 286        if (address == -EFAULT)
 287                goto out;
 288
 289        spin_lock(&mm->page_table_lock);
 290
 291        pgd = pgd_offset(mm, address);
 292        if (!pgd_present(*pgd))
 293                goto out_unlock;
 294
 295        pmd = pmd_offset(pgd, address);
 296        if (!pmd_present(*pmd))
 297                goto out_unlock;
 298
 299        pte = pte_offset_map(pmd, address);
 300        if (!pte_present(*pte))
 301                goto out_unmap;
 302
 303        if (page_to_pfn(page) != pte_pfn(*pte))
 304                goto out_unmap;
 305
 306        if (ptep_clear_flush_young(vma, address, pte))
 307                referenced++;
 308
 309        if (mm != current->mm && !ignore_token && has_swap_token(mm))
 310                referenced++;
 311
 312        (*mapcount)--;
 313
 314out_unmap:
 315        pte_unmap(pte);
 316out_unlock:
 317        spin_unlock(&mm->page_table_lock);
 318out:
 319        return referenced;
 320}
 321
 322static int page_referenced_anon(struct page *page, int ignore_token)
 323{
 324        unsigned int mapcount;
 325        struct anon_vma *anon_vma;
 326        struct vm_area_struct *vma;
 327        int referenced = 0;
 328
 329        anon_vma = page_lock_anon_vma(page);
 330        if (!anon_vma)
 331                return referenced;
 332
 333        mapcount = page_mapcount(page);
 334        list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
 335                referenced += page_referenced_one(page, vma, &mapcount,
 336                                                        ignore_token);
 337                if (!mapcount)
 338                        break;
 339        }
 340        spin_unlock(&anon_vma->lock);
 341        return referenced;
 342}
 343
 344/**
 345 * page_referenced_file - referenced check for object-based rmap
 346 * @page: the page we're checking references on.
 347 *
 348 * For an object-based mapped page, find all the places it is mapped and
 349 * check/clear the referenced flag.  This is done by following the page->mapping
 350 * pointer, then walking the chain of vmas it holds.  It returns the number
 351 * of references it found.
 352 *
 353 * This function is only called from page_referenced for object-based pages.
 354 */
 355static int page_referenced_file(struct page *page, int ignore_token)
 356{
 357        unsigned int mapcount;
 358        struct address_space *mapping = page->mapping;
 359        pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 360        struct vm_area_struct *vma;
 361        struct prio_tree_iter iter;
 362        int referenced = 0;
 363
 364        /*
 365         * The caller's checks on page->mapping and !PageAnon have made
 366         * sure that this is a file page: the check for page->mapping
 367         * excludes the case just before it gets set on an anon page.
 368         */
 369        BUG_ON(PageAnon(page));
 370
 371        /*
 372         * The page lock not only makes sure that page->mapping cannot
 373         * suddenly be NULLified by truncation, it makes sure that the
 374         * structure at mapping cannot be freed and reused yet,
 375         * so we can safely take mapping->i_mmap_lock.
 376         */
 377        BUG_ON(!PageLocked(page));
 378
 379        spin_lock(&mapping->i_mmap_lock);
 380
 381        /*
 382         * i_mmap_lock does not stabilize mapcount at all, but mapcount
 383         * is more likely to be accurate if we note it after spinning.
 384         */
 385        mapcount = page_mapcount(page);
 386
 387        vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
 388                if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE))
 389                                  == (VM_LOCKED|VM_MAYSHARE)) {
 390                        referenced++;
 391                        break;
 392                }
 393                referenced += page_referenced_one(page, vma, &mapcount,
 394                                                        ignore_token);
 395                if (!mapcount)
 396                        break;
 397        }
 398
 399        spin_unlock(&mapping->i_mmap_lock);
 400        return referenced;
 401}
 402
 403/**
 404 * page_referenced - test if the page was referenced
 405 * @page: the page to test
 406 * @is_locked: caller holds lock on the page
 407 *
 408 * Quick test_and_clear_referenced for all mappings to a page,
 409 * returns the number of ptes which referenced the page.
 410 */
 411int page_referenced(struct page *page, int is_locked, int ignore_token)
 412{
 413        int referenced = 0;
 414
 415        if (!swap_token_default_timeout)
 416                ignore_token = 1;
 417
 418        if (page_test_and_clear_young(page))
 419                referenced++;
 420
 421        if (TestClearPageReferenced(page))
 422                referenced++;
 423
 424        if (page_mapped(page) && page->mapping) {
 425                if (PageAnon(page))
 426                        referenced += page_referenced_anon(page, ignore_token);
 427                else if (is_locked)
 428                        referenced += page_referenced_file(page, ignore_token);
 429                else if (TestSetPageLocked(page))
 430                        referenced++;
 431                else {
 432                        if (page->mapping)
 433                                referenced += page_referenced_file(page,
 434                                                                ignore_token);
 435                        unlock_page(page);
 436                }
 437        }
 438        return referenced;
 439}
 440
 441/**
 442 * page_add_anon_rmap - add pte mapping to an anonymous page
 443 * @page:       the page to add the mapping to
 444 * @vma:        the vm area in which the mapping is added
 445 * @address:    the user virtual address mapped
 446 *
 447 * The caller needs to hold the mm->page_table_lock.
 448 */
 449void page_add_anon_rmap(struct page *page,
 450        struct vm_area_struct *vma, unsigned long address)
 451{
 452        struct anon_vma *anon_vma = vma->anon_vma;
 453        pgoff_t index;
 454
 455        BUG_ON(PageReserved(page));
 456        BUG_ON(!anon_vma);
 457
 458        vma->vm_mm->anon_rss++;
 459
 460        anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
 461        index = (address - vma->vm_start) >> PAGE_SHIFT;
 462        index += vma->vm_pgoff;
 463        index >>= PAGE_CACHE_SHIFT - PAGE_SHIFT;
 464
 465        if (atomic_inc_and_test(&page->_mapcount)) {
 466                page->index = index;
 467                page->mapping = (struct address_space *) anon_vma;
 468                inc_page_state(nr_mapped);
 469                inc_mapped_high(page);
 470        }
 471        /* else checking page index and mapping is racy */
 472}
 473
 474/**
 475 * page_add_file_rmap - add pte mapping to a file page
 476 * @page: the page to add the mapping to
 477 *
 478 * The caller needs to hold the mm->page_table_lock.
 479 */
 480void page_add_file_rmap(struct page *page)
 481{
 482        BUG_ON(PageAnon(page));
 483        if (!pfn_valid(page_to_pfn(page)) || PageReserved(page))
 484                return;
 485
 486        if (atomic_inc_and_test(&page->_mapcount)) {
 487                inc_page_state(nr_mapped);
 488                inc_mapped_high(page);
 489        }
 490}
 491
 492/**
 493 * page_remove_rmap - take down pte mapping from a page
 494 * @page: page to remove mapping from
 495 *
 496 * Caller needs to hold the mm->page_table_lock.
 497 */
 498void page_remove_rmap(struct page *page)
 499{
 500        BUG_ON(PageReserved(page));
 501
 502        if (atomic_add_negative(-1, &page->_mapcount)) {
 503                if (unlikely(page_mapcount(page) < 0)) {
 504                        printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page));
 505                        printk (KERN_EMERG "  page->flags = %lx\n", page->flags);
 506                        printk (KERN_EMERG "  page->count = %x\n", page_count(page));
 507                        printk (KERN_EMERG "  page->mapping = %p\n", page->mapping);
 508                        BUG();
 509                }
 510                /*
 511                 * It would be tidy to reset the PageAnon mapping here,
 512                 * but that might overwrite a racing page_add_anon_rmap
 513                 * which increments mapcount after us but sets mapping
 514                 * before us: so leave the reset to free_hot_cold_page,
 515                 * and remember that it's only reliable while mapped.
 516                 * Leaving it set also helps swapoff to reinstate ptes
 517                 * faster for those pages still in swapcache.
 518                 */
 519                if (page_test_and_clear_dirty(page))
 520                        set_page_dirty(page);
 521                dec_page_state(nr_mapped);
 522                dec_mapped_high(page);
 523
 524                /*
 525                 * Deactivate the page when the last munmap() occurs.  
 526                 */
 527                if (pagecache_over_max() && !PageAnon(page))
 528                        deactivate_unmapped_page(page);
 529        }
 530}
 531
 532/*
 533 * Subfunctions of try_to_unmap: try_to_unmap_one called
 534 * repeatedly from either try_to_unmap_anon or try_to_unmap_file.
 535 */
 536static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma)
 537{
 538        struct mm_struct *mm = vma->vm_mm;
 539        unsigned long address;
 540        pgd_t *pgd;
 541        pmd_t *pmd;
 542        pte_t *pte;
 543        pte_t pteval;
 544        int ret = SWAP_AGAIN;
 545
 546        if (!mm->rss)
 547                goto out;
 548        address = vma_address(page, vma);
 549        if (address == -EFAULT)
 550                goto out;
 551
 552        /*
 553         * We need the page_table_lock to protect us from page faults,
 554         * munmap, fork, etc...
 555         */
 556        spin_lock(&mm->page_table_lock);
 557
 558        pgd = pgd_offset(mm, address);
 559        if (!pgd_present(*pgd))
 560                goto out_unlock;
 561
 562        pmd = pmd_offset(pgd, address);
 563        if (!pmd_present(*pmd))
 564                goto out_unlock;
 565
 566        pte = pte_offset_map(pmd, address);
 567        if (!pte_present(*pte))
 568                goto out_unmap;
 569
 570        if (page_to_pfn(page) != pte_pfn(*pte))
 571                goto out_unmap;
 572
 573        /*
 574         * If the page is mlock()d, we cannot swap it out.
 575         * If it's recently referenced (perhaps page_referenced
 576         * skipped over this mm) then we should reactivate it.
 577         */
 578        if ((vma->vm_flags & (VM_LOCKED|VM_RESERVED)) ||
 579                        ptep_clear_flush_young(vma, address, pte)) {
 580                ret = SWAP_FAIL;
 581                goto out_unmap;
 582        }
 583
 584        /*
 585         * Don't pull an anonymous page out from under get_user_pages.
 586         * GUP carefully breaks COW and raises page count (while holding
 587         * page_table_lock, as we have here) to make sure that the page
 588         * cannot be freed.  If we unmap that page here, a user write
 589         * access to the virtual address will bring back the page, but
 590         * its raised count will (ironically) be taken to mean it's not
 591         * an exclusive swap page, do_wp_page will replace it by a copy
 592         * page, and the user never get to see the data GUP was holding
 593         * the original page for.
 594         *
 595         * This test is also useful for when swapoff (unuse_process) has
 596         * to drop page lock: its reference to the page stops existing
 597         * ptes from being unmapped, so swapoff can make progress.
 598         */
 599        if (PageSwapCache(page) &&
 600            page_count(page) != page_mapcount(page) + 2) {
 601                ret = SWAP_FAIL;
 602                goto out_unmap;
 603        }
 604
 605        /* Nuke the page table entry. */
 606        flush_cache_page(vma, address);
 607        pteval = ptep_clear_flush(vma, address, pte);
 608
 609        /* Move the dirty bit to the physical page now the pte is gone. */
 610        if (pte_dirty(pteval))
 611                set_page_dirty(page);
 612
 613        if (PageAnon(page)) {
 614                swp_entry_t entry = { .val = page->private };
 615                /*
 616                 * Store the swap location in the pte.
 617                 * See handle_pte_fault() ...
 618                 */
 619                BUG_ON(!PageSwapCache(page));
 620                swap_duplicate(entry);
 621                set_pte(pte, swp_entry_to_pte(entry));
 622                BUG_ON(pte_file(*pte));
 623                mm->anon_rss--;
 624        }
 625
 626        mm->rss--;
 627        page_remove_rmap(page);
 628        page_cache_release(page);
 629
 630out_unmap:
 631        pte_unmap(pte);
 632out_unlock:
 633        spin_unlock(&mm->page_table_lock);
 634out:
 635        return ret;
 636}
 637
 638/*
 639 * objrmap doesn't work for nonlinear VMAs because the assumption that
 640 * offset-into-file correlates with offset-into-virtual-addresses does not hold.
 641 * Consequently, given a particular page and its ->index, we cannot locate the
 642 * ptes which are mapping that page without an exhaustive linear search.
 643 *
 644 * So what this code does is a mini "virtual scan" of each nonlinear VMA which
 645 * maps the file to which the target page belongs.  The ->vm_private_data field
 646 * holds the current cursor into that scan.  Successive searches will circulate
 647 * around the vma's virtual address space.
 648 *
 649 * So as more replacement pressure is applied to the pages in a nonlinear VMA,
 650 * more scanning pressure is placed against them as well.   Eventually pages
 651 * will become fully unmapped and are eligible for eviction.
 652 *
 653 * For very sparsely populated VMAs this is a little inefficient - chances are
 654 * there there won't be many ptes located within the scan cluster.  In this case
 655 * maybe we could scan further - to the end of the pte page, perhaps.
 656 */
 657#define CLUSTER_SIZE    min(32*PAGE_SIZE, PMD_SIZE)
 658#define CLUSTER_MASK    (~(CLUSTER_SIZE - 1))
 659
 660static void try_to_unmap_cluster(unsigned long cursor,
 661        unsigned int *mapcount, struct vm_area_struct *vma)
 662{
 663        struct mm_struct *mm = vma->vm_mm;
 664        pgd_t *pgd;
 665        pmd_t *pmd;
 666        pte_t *pte, *original_pte;
 667        pte_t pteval;
 668        struct page *page;
 669        unsigned long address;
 670        unsigned long end;
 671        unsigned long pfn;
 672
 673        /*
 674         * We need the page_table_lock to protect us from page faults,
 675         * munmap, fork, etc...
 676         */
 677        spin_lock(&mm->page_table_lock);
 678
 679        address = (vma->vm_start + cursor) & CLUSTER_MASK;
 680        end = address + CLUSTER_SIZE;
 681        if (address < vma->vm_start)
 682                address = vma->vm_start;
 683        if (end > vma->vm_end)
 684                end = vma->vm_end;
 685
 686        pgd = pgd_offset(mm, address);
 687        if (!pgd_present(*pgd))
 688                goto out_unlock;
 689
 690        pmd = pmd_offset(pgd, address);
 691        if (!pmd_present(*pmd))
 692                goto out_unlock;
 693
 694        for (original_pte = pte = pte_offset_map(pmd, address);
 695                        address < end; pte++, address += PAGE_SIZE) {
 696
 697                if (!pte_present(*pte))
 698                        continue;
 699
 700                pfn = pte_pfn(*pte);
 701                if (!pfn_valid(pfn))
 702                        continue;
 703
 704                page = pfn_to_page(pfn);
 705                BUG_ON(PageAnon(page));
 706                if (PageReserved(page))
 707                        continue;
 708
 709                if (ptep_clear_flush_young(vma, address, pte))
 710                        continue;
 711
 712                /* Nuke the page table entry. */
 713                flush_cache_page(vma, address);
 714                pteval = ptep_clear_flush(vma, address, pte);
 715
 716                /* If nonlinear, store the file page offset in the pte. */
 717                if (page->index != linear_page_index(vma, address))
 718                        set_pte(pte, pgoff_to_pte(page->index));
 719
 720                /* Move the dirty bit to the physical page now the pte is gone. */
 721                if (pte_dirty(pteval))
 722                        set_page_dirty(page);
 723
 724                page_remove_rmap(page);
 725                page_cache_release(page);
 726                mm->rss--;
 727                (*mapcount)--;
 728        }
 729
 730        pte_unmap(original_pte);
 731
 732out_unlock:
 733        spin_unlock(&mm->page_table_lock);
 734}
 735
 736static int try_to_unmap_anon(struct page *page)
 737{
 738        struct anon_vma *anon_vma;
 739        struct vm_area_struct *vma;
 740        int ret = SWAP_AGAIN;
 741
 742        anon_vma = page_lock_anon_vma(page);
 743        if (!anon_vma)
 744                return ret;
 745
 746        list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
 747                ret = try_to_unmap_one(page, vma);
 748                if (ret == SWAP_FAIL || !page_mapped(page))
 749                        break;
 750        }
 751        spin_unlock(&anon_vma->lock);
 752        return ret;
 753}
 754
 755/**
 756 * try_to_unmap_file - unmap file page using the object-based rmap method
 757 * @page: the page to unmap
 758 *
 759 * Find all the mappings of a page using the mapping pointer and the vma chains
 760 * contained in the address_space struct it points to.
 761 *
 762 * This function is only called from try_to_unmap for object-based pages.
 763 */
 764static int try_to_unmap_file(struct page *page)
 765{
 766        struct address_space *mapping = page->mapping;
 767        pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 768        struct vm_area_struct *vma;
 769        struct prio_tree_iter iter;
 770        int ret = SWAP_AGAIN;
 771        unsigned long cursor;
 772        unsigned long max_nl_cursor = 0;
 773        unsigned long max_nl_size = 0;
 774        unsigned int mapcount;
 775
 776        spin_lock(&mapping->i_mmap_lock);
 777        vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
 778                ret = try_to_unmap_one(page, vma);
 779                if (ret == SWAP_FAIL || !page_mapped(page))
 780                        goto out;
 781        }
 782
 783        if (list_empty(&mapping->i_mmap_nonlinear))
 784                goto out;
 785
 786        list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
 787                                                shared.vm_set.list) {
 788                if (vma->vm_flags & (VM_LOCKED|VM_RESERVED))
 789                        continue;
 790                cursor = (unsigned long) vma->vm_private_data;
 791                if (cursor > max_nl_cursor)
 792                        max_nl_cursor = cursor;
 793                cursor = vma->vm_end - vma->vm_start;
 794                if (cursor > max_nl_size)
 795                        max_nl_size = cursor;
 796        }
 797
 798        if (max_nl_size == 0) { /* any nonlinears locked or reserved */
 799                ret = SWAP_FAIL;
 800                goto out;
 801        }
 802
 803        /*
 804         * We don't try to search for this page in the nonlinear vmas,
 805         * and page_referenced wouldn't have found it anyway.  Instead
 806         * just walk the nonlinear vmas trying to age and unmap some.
 807         * The mapcount of the page we came in with is irrelevant,
 808         * but even so use it as a guide to how hard we should try?
 809         */
 810        mapcount = page_mapcount(page);
 811        if (!mapcount)
 812                goto out;
 813        cond_resched_lock(&mapping->i_mmap_lock);
 814
 815        max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK;
 816        if (max_nl_cursor == 0)
 817                max_nl_cursor = CLUSTER_SIZE;
 818
 819        do {
 820                list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
 821                                                shared.vm_set.list) {
 822                        if (vma->vm_flags & (VM_LOCKED|VM_RESERVED))
 823                                continue;
 824                        cursor = (unsigned long) vma->vm_private_data;
 825                        while (vma->vm_mm->rss &&
 826                                cursor < max_nl_cursor &&
 827                                cursor < vma->vm_end - vma->vm_start) {
 828                                try_to_unmap_cluster(cursor, &mapcount, vma);
 829                                cursor += CLUSTER_SIZE;
 830                                vma->vm_private_data = (void *) cursor;
 831                                if ((int)mapcount <= 0)
 832                                        goto out;
 833                        }
 834                        vma->vm_private_data = (void *) max_nl_cursor;
 835                }
 836                cond_resched_lock(&mapping->i_mmap_lock);
 837                max_nl_cursor += CLUSTER_SIZE;
 838        } while (max_nl_cursor <= max_nl_size);
 839
 840        /*
 841         * Don't loop forever (perhaps all the remaining pages are
 842         * in locked vmas).  Reset cursor on all unreserved nonlinear
 843         * vmas, now forgetting on which ones it had fallen behind.
 844         */
 845        list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
 846                                                shared.vm_set.list) {
 847                if (!(vma->vm_flags & VM_RESERVED))
 848                        vma->vm_private_data = NULL;
 849        }
 850out:
 851        spin_unlock(&mapping->i_mmap_lock);
 852        return ret;
 853}
 854
 855/**
 856 * try_to_unmap - try to remove all page table mappings to a page
 857 * @page: the page to get unmapped
 858 *
 859 * Tries to remove all the page table entries which are mapping this
 860 * page, used in the pageout path.  Caller must hold the page lock.
 861 * Return values are:
 862 *
 863 * SWAP_SUCCESS - we succeeded in removing all mappings
 864 * SWAP_AGAIN   - we missed a mapping, try again later
 865 * SWAP_FAIL    - the page is unswappable
 866 */
 867int try_to_unmap(struct page *page)
 868{
 869        int ret;
 870
 871        BUG_ON(PageReserved(page));
 872        BUG_ON(!PageLocked(page));
 873
 874        if (PageAnon(page))
 875                ret = try_to_unmap_anon(page);
 876        else
 877                ret = try_to_unmap_file(page);
 878
 879        if (!page_mapped(page))
 880                ret = SWAP_SUCCESS;
 881        return ret;
 882}
 883