RHEL4/mm/filemap.c
<<
>>
Prefs
   1/*
   2 *      linux/mm/filemap.c
   3 *
   4 * Copyright (C) 1994-1999  Linus Torvalds
   5 */
   6
   7/*
   8 * This file handles the generic file mmap semantics used by
   9 * most "normal" filesystems (but you don't /have/ to use this:
  10 * the NFS filesystem used to do this differently, for example)
  11 */
  12#include <linux/config.h>
  13#include <linux/module.h>
  14#include <linux/slab.h>
  15#include <linux/compiler.h>
  16#include <linux/fs.h>
  17#include <linux/aio.h>
  18#include <linux/kernel_stat.h>
  19#include <linux/mm.h>
  20#include <linux/swap.h>
  21#include <linux/mman.h>
  22#include <linux/pagemap.h>
  23#include <linux/file.h>
  24#include <linux/uio.h>
  25#include <linux/hash.h>
  26#include <linux/writeback.h>
  27#include <linux/pagevec.h>
  28#include <linux/blkdev.h>
  29#include <linux/security.h>
  30/*
  31 * This is needed for the following functions:
  32 *  - try_to_release_page
  33 *  - block_invalidatepage
  34 *  - generic_osync_inode
  35 *
  36 * FIXME: remove all knowledge of the buffer layer from the core VM
  37 */
  38#include <linux/buffer_head.h> /* for generic_osync_inode */
  39
  40#include <asm/uaccess.h>
  41#include <asm/mman.h>
  42
  43/*
  44 * Shared mappings implemented 30.11.1994. It's not fully working yet,
  45 * though.
  46 *
  47 * Shared mappings now work. 15.8.1995  Bruno.
  48 *
  49 * finished 'unifying' the page and buffer cache and SMP-threaded the
  50 * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
  51 *
  52 * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
  53 */
  54
  55/*
  56 * Lock ordering:
  57 *
  58 *  ->i_mmap_lock               (vmtruncate)
  59 *    ->private_lock            (__free_pte->__set_page_dirty_buffers)
  60 *      ->swap_list_lock
  61 *        ->swap_device_lock    (exclusive_swap_page, others)
  62 *          ->mapping->tree_lock
  63 *
  64 *  ->i_sem
  65 *    ->i_mmap_lock             (truncate->unmap_mapping_range)
  66 *
  67 *  ->mmap_sem
  68 *    ->i_mmap_lock
  69 *      ->page_table_lock       (various places, mainly in mmap.c)
  70 *        ->mapping->tree_lock  (arch-dependent flush_dcache_mmap_lock)
  71 *
  72 *  ->mmap_sem
  73 *    ->lock_page               (access_process_vm)
  74 *
  75 *  ->mmap_sem
  76 *    ->i_sem                   (msync)
  77 *
  78 *  ->i_sem
  79 *    ->i_alloc_sem             (various)
  80 *
  81 *  ->inode_lock
  82 *    ->sb_lock                 (fs/fs-writeback.c)
  83 *    ->mapping->tree_lock      (__sync_single_inode)
  84 *
  85 *  ->i_mmap_lock
  86 *    ->anon_vma.lock           (vma_adjust)
  87 *
  88 *  ->anon_vma.lock
  89 *    ->page_table_lock         (anon_vma_prepare and various)
  90 *
  91 *  ->page_table_lock
  92 *    ->swap_device_lock        (try_to_unmap_one)
  93 *    ->private_lock            (try_to_unmap_one)
  94 *    ->tree_lock               (try_to_unmap_one)
  95 *    ->zone.lru_lock           (follow_page->mark_page_accessed)
  96 *    ->private_lock            (page_remove_rmap->set_page_dirty)
  97 *    ->tree_lock               (page_remove_rmap->set_page_dirty)
  98 *    ->inode_lock              (page_remove_rmap->set_page_dirty)
  99 *    ->inode_lock              (zap_pte_range->set_page_dirty)
 100 *    ->private_lock            (zap_pte_range->__set_page_dirty_buffers)
 101 *
 102 *  ->task->proc_lock
 103 *    ->dcache_lock             (proc_pid_lookup)
 104 */
 105
 106/*
 107 * Remove a page from the page cache and free it. Caller has to make
 108 * sure the page is locked and that nobody else uses it - or that usage
 109 * is safe.  The caller must hold a write_lock on the mapping's tree_lock.
 110 */
 111void __remove_from_page_cache(struct page *page)
 112{
 113        struct address_space *mapping = page->mapping;
 114
 115        radix_tree_delete(&mapping->page_tree, page->index);
 116        page->mapping = NULL;
 117        mapping->nrpages--;
 118        pagecache_acct(-1);
 119}
 120
 121void remove_from_page_cache(struct page *page)
 122{
 123        struct address_space *mapping = page->mapping;
 124
 125        if (unlikely(!PageLocked(page)))
 126                PAGE_BUG(page);
 127
 128        spin_lock_irq(&mapping->tree_lock);
 129        __remove_from_page_cache(page);
 130        spin_unlock_irq(&mapping->tree_lock);
 131}
 132
 133static inline int sync_page(struct page *page)
 134{
 135        struct address_space *mapping;
 136
 137        /*
 138         * FIXME, fercrissake.  What is this barrier here for?
 139         */
 140        smp_mb();
 141        mapping = page_mapping(page);
 142        if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
 143                return mapping->a_ops->sync_page(page);
 144        return 0;
 145}
 146
 147/**
 148 * filemap_fdatawrite_range - start writeback against all of a mapping's
 149 * dirty pages that lie within the byte offsets <start, end>
 150 * @mapping: address space structure to write
 151 * @start: offset in bytes where the range starts
 152 * @end : offset in bytes where the range ends
 153 *
 154 * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
 155 * opposed to a regular memory * cleansing writeback.  The difference between
 156 * these two operations is that if a dirty page/buffer is encountered, it must
 157 * be waited upon, and not just skipped over.
 158 */
 159static int __filemap_fdatawrite_range(struct address_space *mapping,
 160        loff_t start, loff_t end, int sync_mode)
 161{
 162        int ret;
 163        struct writeback_control wbc = {
 164                .sync_mode = sync_mode,
 165                .nr_to_write = mapping->nrpages * 2,
 166                .start = start,
 167                .end = end,
 168        };
 169
 170        if (mapping->backing_dev_info->memory_backed)
 171                return 0;
 172
 173        ret = do_writepages(mapping, &wbc);
 174        return ret;
 175}
 176
 177static inline int __filemap_fdatawrite(struct address_space *mapping,
 178        int sync_mode)
 179{
 180        return __filemap_fdatawrite_range(mapping, 0, 0, sync_mode);
 181}
 182
 183int filemap_fdatawrite(struct address_space *mapping)
 184{
 185        return __filemap_fdatawrite(mapping, WB_SYNC_ALL);
 186}
 187EXPORT_SYMBOL(filemap_fdatawrite);
 188
 189static int filemap_fdatawrite_range(struct address_space *mapping,
 190        loff_t start, loff_t end)
 191{
 192        return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
 193}
 194
 195/*
 196 * This is a mostly non-blocking flush.  Not suitable for data-integrity
 197 * purposes - I/O may not be started against all dirty pages.
 198 */
 199int filemap_flush(struct address_space *mapping)
 200{
 201        return __filemap_fdatawrite(mapping, WB_SYNC_NONE);
 202}
 203EXPORT_SYMBOL(filemap_flush);
 204
 205/*
 206 * Wait for writeback to complete against pages indexed by start->end
 207 * inclusive
 208 */
 209static int wait_on_page_writeback_range(struct address_space *mapping,
 210                                pgoff_t start, pgoff_t end)
 211{
 212        struct pagevec pvec;
 213        int nr_pages;
 214        int ret = 0;
 215        pgoff_t index;
 216
 217        if (end < start)
 218                return 0;
 219
 220        pagevec_init(&pvec, 0);
 221        index = start;
 222        while ((index <= end) &&
 223                        (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
 224                        PAGECACHE_TAG_WRITEBACK,
 225                        min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) {
 226                unsigned i;
 227
 228                for (i = 0; i < nr_pages; i++) {
 229                        struct page *page = pvec.pages[i];
 230
 231                        /* until radix tree lookup accepts end_index */
 232                        if (page->index > end)
 233                                continue;
 234
 235                        wait_on_page_writeback(page);
 236                        if (PageError(page))
 237                                ret = -EIO;
 238                }
 239                pagevec_release(&pvec);
 240                cond_resched();
 241        }
 242
 243        /* Check for outstanding write errors */
 244        if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
 245                ret = -ENOSPC;
 246        if (test_and_clear_bit(AS_EIO, &mapping->flags))
 247                ret = -EIO;
 248
 249        return ret;
 250}
 251
 252/*
 253 * Write and wait upon all the pages in the passed range.  This is a "data
 254 * integrity" operation.  It waits upon in-flight writeout before starting and
 255 * waiting upon new writeout.  If there was an IO error, return it.
 256 *
 257 * We need to re-take i_sem during the generic_osync_inode list walk because
 258 * it is otherwise livelockable.
 259 */
 260int sync_page_range(struct inode *inode, struct address_space *mapping,
 261                        loff_t pos, size_t count)
 262{
 263        pgoff_t start = pos >> PAGE_CACHE_SHIFT;
 264        pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
 265        int ret;
 266
 267        if (mapping->backing_dev_info->memory_backed || !count)
 268                return 0;
 269        ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
 270        if (ret == 0) {
 271                down(&inode->i_sem);
 272                ret = generic_osync_inode(inode, mapping, OSYNC_METADATA);
 273                up(&inode->i_sem);
 274        }
 275        if (ret == 0)
 276                ret = wait_on_page_writeback_range(mapping, start, end);
 277        return ret;
 278}
 279EXPORT_SYMBOL(sync_page_range);
 280
 281/*
 282 * Note: Holding i_sem across sync_page_range_nolock is not a good idea
 283 * as it forces O_SYNC writers to different parts of the same file
 284 * to be serialised right until io completion.
 285 */
 286int sync_page_range_nolock(struct inode *inode, struct address_space *mapping,
 287                        loff_t pos, size_t count)
 288{
 289        pgoff_t start = pos >> PAGE_CACHE_SHIFT;
 290        pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
 291        int ret;
 292
 293        if (mapping->backing_dev_info->memory_backed || !count)
 294                return 0;
 295        ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
 296        if (ret == 0)
 297                ret = generic_osync_inode(inode, mapping, OSYNC_METADATA);
 298        if (ret == 0)
 299                ret = wait_on_page_writeback_range(mapping, start, end);
 300        return ret;
 301}
 302EXPORT_SYMBOL(sync_page_range_nolock);
 303
 304/**
 305 * filemap_fdatawait - walk the list of under-writeback pages of the given
 306 *     address space and wait for all of them.
 307 *
 308 * @mapping: address space structure to wait for
 309 */
 310int filemap_fdatawait(struct address_space *mapping)
 311{
 312        loff_t i_size = i_size_read(mapping->host);
 313
 314        if (i_size == 0)
 315                return 0;
 316
 317        return wait_on_page_writeback_range(mapping, 0,
 318                                (i_size - 1) >> PAGE_CACHE_SHIFT);
 319}
 320EXPORT_SYMBOL(filemap_fdatawait);
 321
 322int filemap_write_and_wait(struct address_space *mapping)
 323{
 324        int retval = 0;
 325
 326        if (mapping->nrpages) {
 327                retval = filemap_fdatawrite(mapping);
 328                if (retval == 0)
 329                        retval = filemap_fdatawait(mapping);
 330        }
 331        return retval;
 332}
 333EXPORT_SYMBOL(filemap_write_and_wait);
 334
 335/*
 336 * This function is used to add newly allocated pagecache pages:
 337 * the page is new, so we can just run SetPageLocked() against it.
 338 * The other page state flags were set by rmqueue().
 339 *
 340 * This function does not add the page to the LRU.  The caller must do that.
 341 */
 342int add_to_page_cache(struct page *page, struct address_space *mapping,
 343                pgoff_t offset, int gfp_mask)
 344{
 345        int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
 346
 347        if (error == 0) {
 348                spin_lock_irq(&mapping->tree_lock);
 349                error = radix_tree_insert(&mapping->page_tree, offset, page);
 350                if (!error) {
 351                        page_cache_get(page);
 352                        SetPageLocked(page);
 353                        page->mapping = mapping;
 354                        page->index = offset;
 355                        mapping->nrpages++;
 356                        pagecache_acct(1);
 357                }
 358                spin_unlock_irq(&mapping->tree_lock);
 359                radix_tree_preload_end();
 360        }
 361        return error;
 362}
 363
 364EXPORT_SYMBOL(add_to_page_cache);
 365
 366int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
 367                                pgoff_t offset, int gfp_mask)
 368{
 369        int ret = add_to_page_cache(page, mapping, offset, gfp_mask);
 370        if (ret == 0)
 371                lru_cache_add(page);
 372        return ret;
 373}
 374
 375/*
 376 * In order to wait for pages to become available there must be
 377 * waitqueues associated with pages. By using a hash table of
 378 * waitqueues where the bucket discipline is to maintain all
 379 * waiters on the same queue and wake all when any of the pages
 380 * become available, and for the woken contexts to check to be
 381 * sure the appropriate page became available, this saves space
 382 * at a cost of "thundering herd" phenomena during rare hash
 383 * collisions.
 384 */
 385struct page_wait_queue {
 386        struct page *page;
 387        int bit;
 388        wait_queue_t wait;
 389};
 390
 391static int page_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
 392{
 393        struct page *page = key;
 394        struct page_wait_queue *wq;
 395
 396        wq = container_of(wait, struct page_wait_queue, wait);
 397        if (wq->page != page || test_bit(wq->bit, &page->flags))
 398                return 0;
 399        else
 400                return autoremove_wake_function(wait, mode, sync, NULL);
 401}
 402
 403#define __DEFINE_PAGE_WAIT(name, p, b, f)                               \
 404        struct page_wait_queue name = {                                 \
 405                .page   = p,                                            \
 406                .bit    = b,                                            \
 407                .wait   = {                                             \
 408                        .task   = current,                              \
 409                        .func   = page_wake_function,                   \
 410                        .flags  = f,                                    \
 411                        .task_list = LIST_HEAD_INIT(name.wait.task_list),\
 412                },                                                      \
 413        }
 414
 415#define DEFINE_PAGE_WAIT(name, p, b)    __DEFINE_PAGE_WAIT(name, p, b, 0)
 416#define DEFINE_PAGE_WAIT_EXCLUSIVE(name, p, b)                          \
 417                __DEFINE_PAGE_WAIT(name, p, b, WQ_FLAG_EXCLUSIVE)
 418
 419static wait_queue_head_t *page_waitqueue(struct page *page)
 420{
 421        const struct zone *zone = page_zone(page);
 422
 423        return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)];
 424}
 425
 426static void wake_up_page(struct page *page)
 427{
 428        const unsigned int mode = TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE;
 429        wait_queue_head_t *waitqueue = page_waitqueue(page);
 430
 431        if (waitqueue_active(waitqueue))
 432                __wake_up(waitqueue, mode, 1, page);
 433}
 434
 435void fastcall wait_on_page_bit(struct page *page, int bit_nr)
 436{
 437        wait_queue_head_t *waitqueue = page_waitqueue(page);
 438        DEFINE_PAGE_WAIT(wait, page, bit_nr);
 439
 440        do {
 441                prepare_to_wait(waitqueue, &wait.wait, TASK_UNINTERRUPTIBLE);
 442                if (test_bit(bit_nr, &page->flags)) {
 443                        sync_page(page);
 444                        io_schedule();
 445                }
 446        } while (test_bit(bit_nr, &page->flags));
 447        finish_wait(waitqueue, &wait.wait);
 448}
 449
 450EXPORT_SYMBOL(wait_on_page_bit);
 451
 452/**
 453 * unlock_page() - unlock a locked page
 454 *
 455 * @page: the page
 456 *
 457 * Unlocks the page and wakes up sleepers in ___wait_on_page_locked().
 458 * Also wakes sleepers in wait_on_page_writeback() because the wakeup
 459 * mechananism between PageLocked pages and PageWriteback pages is shared.
 460 * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
 461 *
 462 * The first mb is necessary to safely close the critical section opened by the
 463 * TestSetPageLocked(), the second mb is necessary to enforce ordering between
 464 * the clear_bit and the read of the waitqueue (to avoid SMP races with a
 465 * parallel wait_on_page_locked()).
 466 */
 467void fastcall unlock_page(struct page *page)
 468{
 469        smp_mb__before_clear_bit();
 470        if (!TestClearPageLocked(page))
 471                BUG();
 472        smp_mb__after_clear_bit(); 
 473        wake_up_page(page);
 474}
 475
 476EXPORT_SYMBOL(unlock_page);
 477EXPORT_SYMBOL(lock_page);
 478
 479/*
 480 * End writeback against a page.
 481 */
 482void end_page_writeback(struct page *page)
 483{
 484        if (!TestClearPageReclaim(page) || rotate_reclaimable_page(page)) {
 485                if (!test_clear_page_writeback(page))
 486                        BUG();
 487                smp_mb__after_clear_bit();
 488        }
 489        wake_up_page(page);
 490}
 491
 492EXPORT_SYMBOL(end_page_writeback);
 493
 494/*
 495 * Get a lock on the page, assuming we need to sleep to get it.
 496 *
 497 * Ugly: running sync_page() in state TASK_UNINTERRUPTIBLE is scary.  If some
 498 * random driver's requestfn sets TASK_RUNNING, we could busywait.  However
 499 * chances are that on the second loop, the block layer's plug list is empty,
 500 * so sync_page() will then return in state TASK_UNINTERRUPTIBLE.
 501 */
 502void fastcall __lock_page(struct page *page)
 503{
 504        wait_queue_head_t *wqh = page_waitqueue(page);
 505        DEFINE_PAGE_WAIT_EXCLUSIVE(wait, page, PG_locked);
 506
 507        while (TestSetPageLocked(page)) {
 508                prepare_to_wait_exclusive(wqh, &wait.wait, TASK_UNINTERRUPTIBLE);
 509                if (PageLocked(page)) {
 510                        sync_page(page);
 511                        io_schedule();
 512                }
 513        }
 514        finish_wait(wqh, &wait.wait);
 515}
 516
 517EXPORT_SYMBOL(__lock_page);
 518
 519/*
 520 * a rather lightweight function, finding and getting a reference to a
 521 * hashed page atomically.
 522 */
 523struct page * find_get_page(struct address_space *mapping, unsigned long offset)
 524{
 525        struct page *page;
 526
 527        spin_lock_irq(&mapping->tree_lock);
 528        page = radix_tree_lookup(&mapping->page_tree, offset);
 529        if (page)
 530                page_cache_get(page);
 531        spin_unlock_irq(&mapping->tree_lock);
 532        return page;
 533}
 534
 535EXPORT_SYMBOL(find_get_page);
 536
 537/*
 538 * Same as above, but trylock it instead of incrementing the count.
 539 */
 540struct page *find_trylock_page(struct address_space *mapping, unsigned long offset)
 541{
 542        struct page *page;
 543
 544        spin_lock_irq(&mapping->tree_lock);
 545        page = radix_tree_lookup(&mapping->page_tree, offset);
 546        if (page && TestSetPageLocked(page))
 547                page = NULL;
 548        spin_unlock_irq(&mapping->tree_lock);
 549        return page;
 550}
 551
 552EXPORT_SYMBOL(find_trylock_page);
 553
 554/**
 555 * find_lock_page - locate, pin and lock a pagecache page
 556 *
 557 * @mapping - the address_space to search
 558 * @offset - the page index
 559 *
 560 * Locates the desired pagecache page, locks it, increments its reference
 561 * count and returns its address.
 562 *
 563 * Returns zero if the page was not present. find_lock_page() may sleep.
 564 */
 565struct page *find_lock_page(struct address_space *mapping,
 566                                unsigned long offset)
 567{
 568        struct page *page;
 569
 570        spin_lock_irq(&mapping->tree_lock);
 571repeat:
 572        page = radix_tree_lookup(&mapping->page_tree, offset);
 573        if (page) {
 574                page_cache_get(page);
 575                if (TestSetPageLocked(page)) {
 576                        spin_unlock_irq(&mapping->tree_lock);
 577                        lock_page(page);
 578                        spin_lock_irq(&mapping->tree_lock);
 579
 580                        /* Has the page been truncated while we slept? */
 581                        if (page->mapping != mapping || page->index != offset) {
 582                                unlock_page(page);
 583                                page_cache_release(page);
 584                                goto repeat;
 585                        }
 586                }
 587        }
 588        spin_unlock_irq(&mapping->tree_lock);
 589        return page;
 590}
 591
 592EXPORT_SYMBOL(find_lock_page);
 593
 594/**
 595 * find_or_create_page - locate or add a pagecache page
 596 *
 597 * @mapping - the page's address_space
 598 * @index - the page's index into the mapping
 599 * @gfp_mask - page allocation mode
 600 *
 601 * Locates a page in the pagecache.  If the page is not present, a new page
 602 * is allocated using @gfp_mask and is added to the pagecache and to the VM's
 603 * LRU list.  The returned page is locked and has its reference count
 604 * incremented.
 605 *
 606 * find_or_create_page() may sleep, even if @gfp_flags specifies an atomic
 607 * allocation!
 608 *
 609 * find_or_create_page() returns the desired page's address, or zero on
 610 * memory exhaustion.
 611 */
 612struct page *find_or_create_page(struct address_space *mapping,
 613                unsigned long index, unsigned int gfp_mask)
 614{
 615        struct page *page, *cached_page = NULL;
 616        int err;
 617repeat:
 618        page = find_lock_page(mapping, index);
 619        if (!page) {
 620                if (!cached_page) {
 621                        cached_page = alloc_page(gfp_mask);
 622                        if (!cached_page)
 623                                return NULL;
 624                }
 625                err = add_to_page_cache_lru(cached_page, mapping,
 626                                        index, gfp_mask);
 627                if (!err) {
 628                        page = cached_page;
 629                        cached_page = NULL;
 630                } else if (err == -EEXIST)
 631                        goto repeat;
 632        }
 633        if (cached_page)
 634                page_cache_release(cached_page);
 635        return page;
 636}
 637
 638EXPORT_SYMBOL(find_or_create_page);
 639
 640/**
 641 * find_get_pages - gang pagecache lookup
 642 * @mapping:    The address_space to search
 643 * @start:      The starting page index
 644 * @nr_pages:   The maximum number of pages
 645 * @pages:      Where the resulting pages are placed
 646 *
 647 * find_get_pages() will search for and return a group of up to
 648 * @nr_pages pages in the mapping.  The pages are placed at @pages.
 649 * find_get_pages() takes a reference against the returned pages.
 650 *
 651 * The search returns a group of mapping-contiguous pages with ascending
 652 * indexes.  There may be holes in the indices due to not-present pages.
 653 *
 654 * find_get_pages() returns the number of pages which were found.
 655 */
 656unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
 657                            unsigned int nr_pages, struct page **pages)
 658{
 659        unsigned int i;
 660        unsigned int ret;
 661
 662        spin_lock_irq(&mapping->tree_lock);
 663        ret = radix_tree_gang_lookup(&mapping->page_tree,
 664                                (void **)pages, start, nr_pages);
 665        for (i = 0; i < ret; i++)
 666                page_cache_get(pages[i]);
 667        spin_unlock_irq(&mapping->tree_lock);
 668        return ret;
 669}
 670
 671/*
 672 * Like find_get_pages, except we only return pages which are tagged with
 673 * `tag'.   We update *index to index the next page for the traversal.
 674 */
 675unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
 676                        int tag, unsigned int nr_pages, struct page **pages)
 677{
 678        unsigned int i;
 679        unsigned int ret;
 680
 681        spin_lock_irq(&mapping->tree_lock);
 682        ret = radix_tree_gang_lookup_tag(&mapping->page_tree,
 683                                (void **)pages, *index, nr_pages, tag);
 684        for (i = 0; i < ret; i++)
 685                page_cache_get(pages[i]);
 686        if (ret)
 687                *index = pages[ret - 1]->index + 1;
 688        spin_unlock_irq(&mapping->tree_lock);
 689        return ret;
 690}
 691
 692/*
 693 * Same as grab_cache_page, but do not wait if the page is unavailable.
 694 * This is intended for speculative data generators, where the data can
 695 * be regenerated if the page couldn't be grabbed.  This routine should
 696 * be safe to call while holding the lock for another page.
 697 *
 698 * Clear __GFP_FS when allocating the page to avoid recursion into the fs
 699 * and deadlock against the caller's locked page.
 700 */
 701struct page *
 702grab_cache_page_nowait(struct address_space *mapping, unsigned long index)
 703{
 704        struct page *page = find_get_page(mapping, index);
 705        int gfp_mask;
 706
 707        if (page) {
 708                if (!TestSetPageLocked(page))
 709                        return page;
 710                page_cache_release(page);
 711                return NULL;
 712        }
 713        gfp_mask = mapping_gfp_mask(mapping) & ~__GFP_FS;
 714        page = alloc_pages(gfp_mask, 0);
 715        if (page && add_to_page_cache_lru(page, mapping, index, gfp_mask)) {
 716                page_cache_release(page);
 717                page = NULL;
 718        }
 719        return page;
 720}
 721
 722EXPORT_SYMBOL(grab_cache_page_nowait);
 723
 724/*
 725 * This is a generic file read routine, and uses the
 726 * mapping->a_ops->readpage() function for the actual low-level
 727 * stuff.
 728 *
 729 * This is really ugly. But the goto's actually try to clarify some
 730 * of the logic when it comes to error handling etc.
 731 *
 732 * Note the struct file* is only passed for the use of readpage.  It may be
 733 * NULL.
 734 */
 735void do_generic_mapping_read(struct address_space *mapping,
 736                             struct file_ra_state *_ra,
 737                             struct file *filp,
 738                             loff_t *ppos,
 739                             read_descriptor_t *desc,
 740                             read_actor_t actor,
 741                             int nonblock)
 742{
 743        struct inode *inode = mapping->host;
 744        unsigned long index, end_index, offset;
 745        unsigned long last_index;
 746        unsigned long next_index;
 747        loff_t isize;
 748        struct page *cached_page;
 749        int error;
 750        struct file_ra_state ra = *_ra;
 751
 752        cached_page = NULL;
 753        index = *ppos >> PAGE_CACHE_SHIFT;
 754        next_index = index;
 755        last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
 756        offset = *ppos & ~PAGE_CACHE_MASK;
 757
 758        isize = i_size_read(inode);
 759        if (!isize)
 760                goto out;
 761
 762        end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
 763        for (;;) {
 764                struct page *page;
 765                unsigned long nr, ret;
 766
 767                /* nr is the maximum number of bytes to copy from this page */
 768                nr = PAGE_CACHE_SIZE;
 769                if (index >= end_index) {
 770                        if (index > end_index)
 771                                goto out;
 772                        nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
 773                        if (nr <= offset) {
 774                                goto out;
 775                        }
 776                }
 777                nr = nr - offset;
 778
 779                cond_resched();
 780                if (!nonblock && index == next_index)
 781                        next_index = page_cache_readahead(mapping, &ra, filp, 
 782                                                          index, 
 783                                                          last_index - index);
 784
 785find_page:
 786                page = find_get_page(mapping, index);
 787                if (unlikely(page == NULL)) {
 788                        if (nonblock) {
 789                                desc->error = -EWOULDBLOCKIO;
 790                                break;
 791                        }
 792                        handle_ra_miss(mapping, &ra, index);
 793                        goto no_cached_page;
 794                }
 795                if (!PageUptodate(page)) {
 796                        if (nonblock) {
 797                                page_cache_release(page);
 798                                desc->error = -EWOULDBLOCKIO;
 799                                break;
 800                        }
 801                        goto page_not_up_to_date;
 802                }
 803page_ok:
 804
 805                /* If users can be writing to this page using arbitrary
 806                 * virtual addresses, take care about potential aliasing
 807                 * before reading the page on the kernel side.
 808                 */
 809                if (mapping_writably_mapped(mapping))
 810                        flush_dcache_page(page);
 811
 812                /*
 813                 * Mark the page accessed if we read the beginning.
 814                 */
 815                if (!offset)
 816                        mark_page_accessed(page);
 817
 818                /*
 819                 * Ok, we have the page, and it's up-to-date, so
 820                 * now we can copy it to user space...
 821                 *
 822                 * The actor routine returns how many bytes were actually used..
 823                 * NOTE! This may not be the same as how much of a user buffer
 824                 * we filled up (we may be padding etc), so we can only update
 825                 * "pos" here (the actor routine has to update the user buffer
 826                 * pointers and the remaining count).
 827                 */
 828                ret = actor(desc, page, offset, nr);
 829                offset += ret;
 830                index += offset >> PAGE_CACHE_SHIFT;
 831                offset &= ~PAGE_CACHE_MASK;
 832
 833                page_cache_release(page);
 834                if (ret == nr && desc->count)
 835                        continue;
 836                goto out;
 837
 838page_not_up_to_date:
 839                /* Get exclusive access to the page ... */
 840                lock_page(page);
 841
 842                /* Did it get unhashed before we got the lock? */
 843                if (!page->mapping) {
 844                        unlock_page(page);
 845                        page_cache_release(page);
 846                        continue;
 847                }
 848
 849                /* Did somebody else fill it already? */
 850                if (PageUptodate(page)) {
 851                        unlock_page(page);
 852                        goto page_ok;
 853                }
 854
 855readpage:
 856                /* Start the actual read. The read will unlock the page. */
 857                error = mapping->a_ops->readpage(filp, page);
 858
 859                if (unlikely(error))
 860                        goto readpage_error;
 861
 862                if (!PageUptodate(page)) {
 863                        lock_page(page);
 864                        if (!PageUptodate(page)) {
 865                                if (page->mapping == NULL) {
 866                                        /*
 867                                         * invalidate_inode_pages got it
 868                                         */
 869                                        unlock_page(page);
 870                                        page_cache_release(page);
 871                                        goto find_page;
 872                                }
 873                                unlock_page(page);
 874                                error = -EIO;
 875                                goto readpage_error;
 876                        }
 877                        unlock_page(page);
 878                }
 879
 880                /*
 881                 * i_size must be checked after we have done ->readpage.
 882                 *
 883                 * Checking i_size after the readpage allows us to calculate
 884                 * the correct value for "nr", which means the zero-filled
 885                 * part of the page is not copied back to userspace (unless
 886                 * another truncate extends the file - this is desired though).
 887                 */
 888                isize = i_size_read(inode);
 889                end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
 890                if (unlikely(!isize || index > end_index)) {
 891                        page_cache_release(page);
 892                        goto out;
 893                }
 894
 895                /* nr is the maximum number of bytes to copy from this page */
 896                nr = PAGE_CACHE_SIZE;
 897                if (index == end_index) {
 898                        nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
 899                        if (nr <= offset) {
 900                                page_cache_release(page);
 901                                goto out;
 902                        }
 903                }
 904                nr = nr - offset;
 905                goto page_ok;
 906
 907readpage_error:
 908                /* UHHUH! A synchronous read error occurred. Report it */
 909                desc->error = error;
 910                page_cache_release(page);
 911                goto out;
 912
 913no_cached_page:
 914                /*
 915                 * Ok, it wasn't cached, so we need to create a new
 916                 * page..
 917                 */
 918                if (!cached_page) {
 919                        cached_page = page_cache_alloc_cold(mapping);
 920                        if (!cached_page) {
 921                                desc->error = -ENOMEM;
 922                                goto out;
 923                        }
 924                }
 925                error = add_to_page_cache_lru(cached_page, mapping,
 926                                                index, GFP_KERNEL);
 927                if (error) {
 928                        if (error == -EEXIST)
 929                                goto find_page;
 930                        desc->error = error;
 931                        goto out;
 932                }
 933                page = cached_page;
 934                cached_page = NULL;
 935                goto readpage;
 936        }
 937
 938out:
 939        *_ra = ra;
 940
 941        *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
 942        if (cached_page)
 943                page_cache_release(cached_page);
 944        if (filp)
 945                file_accessed(filp);
 946}
 947
 948EXPORT_SYMBOL(do_generic_mapping_read);
 949
 950int file_read_actor(read_descriptor_t *desc, struct page *page,
 951                        unsigned long offset, unsigned long size)
 952{
 953        char *kaddr;
 954        unsigned long left, count = desc->count;
 955
 956        if (size > count)
 957                size = count;
 958
 959        /*
 960         * Faults on the destination of a read are common, so do it before
 961         * taking the kmap.
 962         */
 963        if (!fault_in_pages_writeable(desc->arg.buf, size)) {
 964                kaddr = kmap_atomic(page, KM_USER0);
 965                left = __copy_to_user_inatomic(desc->arg.buf,
 966                                                kaddr + offset, size);
 967                kunmap_atomic(kaddr, KM_USER0);
 968                if (left == 0)
 969                        goto success;
 970        }
 971
 972        /* Do it the slow way */
 973        kaddr = kmap(page);
 974        left = __copy_to_user(desc->arg.buf, kaddr + offset, size);
 975        kunmap(page);
 976
 977        if (left) {
 978                size -= left;
 979                desc->error = -EFAULT;
 980        }
 981success:
 982        desc->count = count - size;
 983        desc->written += size;
 984        desc->arg.buf += size;
 985        return size;
 986}
 987
 988/*
 989 * This is the "read()" routine for all filesystems
 990 * that can use the page cache directly.
 991 */
 992ssize_t
 993__generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
 994                unsigned long nr_segs, loff_t *ppos)
 995{
 996        struct file *filp = iocb->ki_filp;
 997        ssize_t retval;
 998        unsigned long seg;
 999        size_t count;
1000
1001        count = 0;
1002        for (seg = 0; seg < nr_segs; seg++) {
1003                const struct iovec *iv = &iov[seg];
1004
1005                /*
1006                 * If any segment has a negative length, or the cumulative
1007                 * length ever wraps negative then return -EINVAL.
1008                 */
1009                count += iv->iov_len;
1010                if (unlikely((ssize_t)(count|iv->iov_len) < 0))
1011                        return -EINVAL;
1012                if (access_ok(VERIFY_WRITE, iv->iov_base, iv->iov_len))
1013                        continue;
1014                if (seg == 0)
1015                        return -EFAULT;
1016                nr_segs = seg;
1017                count -= iv->iov_len;   /* This segment is no good */
1018                break;
1019        }
1020
1021        /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
1022        if (filp->f_flags & O_DIRECT) {
1023                loff_t pos = *ppos, size;
1024                struct address_space *mapping;
1025                struct inode *inode;
1026
1027                mapping = filp->f_mapping;
1028                inode = mapping->host;
1029                retval = 0;
1030                if (!count)
1031                        goto out; /* skip atime */
1032                size = i_size_read(inode);
1033                if (pos < size) {
1034                        retval = generic_file_direct_IO(READ, iocb,
1035                                                iov, pos, nr_segs);
1036                        if (retval > 0)
1037                                *ppos = pos + retval;
1038                }
1039                file_accessed(filp);
1040                goto out;
1041        }
1042
1043        retval = 0;
1044        if (count) {
1045                for (seg = 0; seg < nr_segs; seg++) {
1046                        read_descriptor_t desc;
1047
1048                        desc.written = 0;
1049                        desc.arg.buf = iov[seg].iov_base;
1050                        desc.count = iov[seg].iov_len;
1051                        if (desc.count == 0)
1052                                continue;
1053                        desc.error = 0;
1054                        do_generic_file_read(filp,ppos,&desc,file_read_actor,0);
1055                        retval += desc.written;
1056                        if (desc.error) {
1057                                retval = retval ?: desc.error;  
1058                                break;
1059                        }
1060                }
1061        }
1062out:
1063        return retval;
1064}
1065
1066EXPORT_SYMBOL(__generic_file_aio_read);
1067
1068ssize_t
1069generic_file_aio_read(struct kiocb *iocb, char __user *buf, size_t count, loff_t pos)
1070{
1071        struct iovec local_iov = { .iov_base = buf, .iov_len = count };
1072
1073        BUG_ON(iocb->ki_pos != pos);
1074        return __generic_file_aio_read(iocb, &local_iov, 1, &iocb->ki_pos);
1075}
1076
1077EXPORT_SYMBOL(generic_file_aio_read);
1078
1079ssize_t
1080generic_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos)
1081{
1082        struct iovec local_iov = { .iov_base = buf, .iov_len = count };
1083        struct kiocb kiocb;
1084        ssize_t ret;
1085
1086        init_sync_kiocb(&kiocb, filp);
1087        ret = __generic_file_aio_read(&kiocb, &local_iov, 1, ppos);
1088        if (-EIOCBQUEUED == ret)
1089                ret = wait_on_sync_kiocb(&kiocb);
1090        return ret;
1091}
1092
1093EXPORT_SYMBOL(generic_file_read);
1094
1095int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
1096{
1097        ssize_t written;
1098        unsigned long count = desc->count;
1099        struct file *file = desc->arg.data;
1100
1101        if (size > count)
1102                size = count;
1103
1104        written = file->f_op->sendpage(file, page, offset,
1105                                       size, &file->f_pos, size<count);
1106        if (written < 0) {
1107                desc->error = written;
1108                written = 0;
1109        }
1110        desc->count = count - written;
1111        desc->written += written;
1112        return written;
1113}
1114
1115ssize_t generic_file_sendfile(struct file *in_file, loff_t *ppos,
1116                         size_t count, read_actor_t actor, void *target)
1117{
1118        read_descriptor_t desc;
1119
1120        if (!count)
1121                return 0;
1122
1123        desc.written = 0;
1124        desc.count = count;
1125        desc.arg.data = target;
1126        desc.error = 0;
1127
1128        do_generic_file_read(in_file, ppos, &desc, actor, 0);
1129        if (desc.written)
1130                return desc.written;
1131        return desc.error;
1132}
1133
1134EXPORT_SYMBOL(generic_file_sendfile);
1135
1136static ssize_t
1137do_readahead(struct address_space *mapping, struct file *filp,
1138             unsigned long index, unsigned long nr)
1139{
1140        if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage)
1141                return -EINVAL;
1142
1143        force_page_cache_readahead(mapping, filp, index,
1144                                        max_sane_readahead(nr));
1145        return 0;
1146}
1147
1148asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count)
1149{
1150        ssize_t ret;
1151        struct file *file;
1152
1153        ret = -EBADF;
1154        file = fget(fd);
1155        if (file) {
1156                if (file->f_mode & FMODE_READ) {
1157                        struct address_space *mapping = file->f_mapping;
1158                        unsigned long start = offset >> PAGE_CACHE_SHIFT;
1159                        unsigned long end = (offset + count - 1) >> PAGE_CACHE_SHIFT;
1160                        unsigned long len = end - start + 1;
1161                        ret = do_readahead(mapping, file, start, len);
1162                }
1163                fput(file);
1164        }
1165        return ret;
1166}
1167
1168#ifdef CONFIG_MMU
1169/*
1170 * This adds the requested page to the page cache if it isn't already there,
1171 * and schedules an I/O to read in its contents from disk.
1172 */
1173static int FASTCALL(page_cache_read(struct file * file, unsigned long offset));
1174static int fastcall page_cache_read(struct file * file, unsigned long offset)
1175{
1176        struct address_space *mapping = file->f_mapping;
1177        struct page *page; 
1178        int error;
1179
1180        page = page_cache_alloc_cold(mapping);
1181        if (!page)
1182                return -ENOMEM;
1183
1184        error = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL);
1185        if (!error) {
1186                error = mapping->a_ops->readpage(file, page);
1187                page_cache_release(page);
1188                return error;
1189        }
1190
1191        /*
1192         * We arrive here in the unlikely event that someone 
1193         * raced with us and added our page to the cache first
1194         * or we are out of memory for radix-tree nodes.
1195         */
1196        page_cache_release(page);
1197        return error == -EEXIST ? 0 : error;
1198}
1199
1200#define MMAP_LOTSAMISS  (100)
1201
1202/*
1203 * filemap_nopage() is invoked via the vma operations vector for a
1204 * mapped memory region to read in file data during a page fault.
1205 *
1206 * The goto's are kind of ugly, but this streamlines the normal case of having
1207 * it in the page cache, and handles the special cases reasonably without
1208 * having a lot of duplicated code.
1209 */
1210struct page * filemap_nopage(struct vm_area_struct * area, unsigned long address, int *type)
1211{
1212        int error;
1213        struct file *file = area->vm_file;
1214        struct address_space *mapping = file->f_mapping;
1215        struct file_ra_state *ra = &file->f_ra;
1216        struct inode *inode = mapping->host;
1217        struct page *page;
1218        unsigned long size, pgoff, endoff;
1219        int did_readaround = 0, majmin = VM_FAULT_MINOR;
1220
1221        pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
1222        endoff = ((area->vm_end - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
1223
1224retry_all:
1225        size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1226        if (pgoff >= size)
1227                goto outside_data_content;
1228
1229        /* If we don't want any read-ahead, don't bother */
1230        if (VM_RandomReadHint(area))
1231                goto no_cached_page;
1232
1233        /*
1234         * The "size" of the file, as far as mmap is concerned, isn't bigger
1235         * than the mapping
1236         */
1237        if (size > endoff)
1238                size = endoff;
1239
1240        /*
1241         * The readahead code wants to be told about each and every page
1242         * so it can build and shrink its windows appropriately
1243         *
1244         * For sequential accesses, we use the generic readahead logic.
1245         */
1246        if (VM_SequentialReadHint(area))
1247                page_cache_readahead(mapping, ra, file, pgoff, 1);
1248
1249        /*
1250         * Do we have something in the page cache already?
1251         */
1252retry_find:
1253        page = find_get_page(mapping, pgoff);
1254        if (!page) {
1255                unsigned long ra_pages;
1256
1257                if (VM_SequentialReadHint(area)) {
1258                        handle_ra_miss(mapping, ra, pgoff);
1259                        goto no_cached_page;
1260                }
1261                ra->mmap_miss++;
1262
1263                /*
1264                 * Do we miss much more than hit in this file? If so,
1265                 * stop bothering with read-ahead. It will only hurt.
1266                 */
1267                if (ra->mmap_miss > ra->mmap_hit + MMAP_LOTSAMISS)
1268                        goto no_cached_page;
1269
1270                /*
1271                 * To keep the pgmajfault counter straight, we need to
1272                 * check did_readaround, as this is an inner loop.
1273                 */
1274                if (!did_readaround) {
1275                        majmin = VM_FAULT_MAJOR;
1276                        inc_page_state(pgmajfault);
1277                }
1278                did_readaround = 1;
1279                ra_pages = max_sane_readahead(file->f_ra.ra_pages);
1280                if (ra_pages) {
1281                        pgoff_t start = 0;
1282
1283                        if (pgoff > ra_pages / 2)
1284                                start = pgoff - ra_pages / 2;
1285                        do_page_cache_readahead(mapping, file, start, ra_pages);
1286                }
1287                page = find_get_page(mapping, pgoff);
1288                if (!page)
1289                        goto no_cached_page;
1290        }
1291
1292        if (!did_readaround)
1293                ra->mmap_hit++;
1294
1295        /*
1296         * Ok, found a page in the page cache, now we need to check
1297         * that it's up-to-date.
1298         */
1299        if (!PageUptodate(page))
1300                goto page_not_uptodate;
1301
1302success:
1303        /*
1304         * Found the page and have a reference on it.
1305         */
1306        mark_page_accessed(page);
1307        if (type)
1308                *type = majmin;
1309        return page;
1310
1311outside_data_content:
1312        /*
1313         * An external ptracer can access pages that normally aren't
1314         * accessible..
1315         */
1316        if (area->vm_mm == current->mm)
1317                return NULL;
1318        /* Fall through to the non-read-ahead case */
1319no_cached_page:
1320        /*
1321         * We're only likely to ever get here if MADV_RANDOM is in
1322         * effect.
1323         */
1324        error = page_cache_read(file, pgoff);
1325        grab_swap_token();
1326
1327        /*
1328         * The page we want has now been added to the page cache.
1329         * In the unlikely event that someone removed it in the
1330         * meantime, we'll just come back here and read it again.
1331         */
1332        if (error >= 0)
1333                goto retry_find;
1334
1335        /*
1336         * An error return from page_cache_read can result if the
1337         * system is low on memory, or a problem occurs while trying
1338         * to schedule I/O.
1339         */
1340        if (error == -ENOMEM)
1341                return NOPAGE_OOM;
1342        return NULL;
1343
1344page_not_uptodate:
1345        if (!did_readaround) {
1346                majmin = VM_FAULT_MAJOR;
1347                inc_page_state(pgmajfault);
1348        }
1349        lock_page(page);
1350
1351        /* Did it get unhashed while we waited for it? */
1352        if (!page->mapping) {
1353                unlock_page(page);
1354                page_cache_release(page);
1355                goto retry_all;
1356        }
1357
1358        /* Did somebody else get it up-to-date? */
1359        if (PageUptodate(page)) {
1360                unlock_page(page);
1361                goto success;
1362        }
1363
1364        if (!mapping->a_ops->readpage(file, page)) {
1365                wait_on_page_locked(page);
1366                if (PageUptodate(page))
1367                        goto success;
1368        }
1369
1370        /*
1371         * Umm, take care of errors if the page isn't up-to-date.
1372         * Try to re-read it _once_. We do this synchronously,
1373         * because there really aren't any performance issues here
1374         * and we need to check for errors.
1375         */
1376        lock_page(page);
1377
1378        /* Somebody truncated the page on us? */
1379        if (!page->mapping) {
1380                unlock_page(page);
1381                page_cache_release(page);
1382                goto retry_all;
1383        }
1384
1385        /* Somebody else successfully read it in? */
1386        if (PageUptodate(page)) {
1387                unlock_page(page);
1388                goto success;
1389        }
1390        ClearPageError(page);
1391        if (!mapping->a_ops->readpage(file, page)) {
1392                wait_on_page_locked(page);
1393                if (PageUptodate(page))
1394                        goto success;
1395        }
1396
1397        /*
1398         * Things didn't work out. Return zero to tell the
1399         * mm layer so, possibly freeing the page cache page first.
1400         */
1401        page_cache_release(page);
1402        return NULL;
1403}
1404
1405EXPORT_SYMBOL(filemap_nopage);
1406
1407static struct page * filemap_getpage(struct file *file, unsigned long pgoff,
1408                                        int nonblock)
1409{
1410        struct address_space *mapping = file->f_mapping;
1411        struct page *page;
1412        int error;
1413
1414        /*
1415         * Do we have something in the page cache already?
1416         */
1417retry_find:
1418        page = find_get_page(mapping, pgoff);
1419        if (!page) {
1420                if (nonblock)
1421                        return NULL;
1422                goto no_cached_page;
1423        }
1424
1425        /*
1426         * Ok, found a page in the page cache, now we need to check
1427         * that it's up-to-date.
1428         */
1429        if (!PageUptodate(page))
1430                goto page_not_uptodate;
1431
1432success:
1433        /*
1434         * Found the page and have a reference on it.
1435         */
1436        mark_page_accessed(page);
1437        return page;
1438
1439no_cached_page:
1440        error = page_cache_read(file, pgoff);
1441
1442        /*
1443         * The page we want has now been added to the page cache.
1444         * In the unlikely event that someone removed it in the
1445         * meantime, we'll just come back here and read it again.
1446         */
1447        if (error >= 0)
1448                goto retry_find;
1449
1450        /*
1451         * An error return from page_cache_read can result if the
1452         * system is low on memory, or a problem occurs while trying
1453         * to schedule I/O.
1454         */
1455        return NULL;
1456
1457page_not_uptodate:
1458        lock_page(page);
1459
1460        /* Did it get unhashed while we waited for it? */
1461        if (!page->mapping) {
1462                unlock_page(page);
1463                goto err;
1464        }
1465
1466        /* Did somebody else get it up-to-date? */
1467        if (PageUptodate(page)) {
1468                unlock_page(page);
1469                goto success;
1470        }
1471
1472        if (!mapping->a_ops->readpage(file, page)) {
1473                wait_on_page_locked(page);
1474                if (PageUptodate(page))
1475                        goto success;
1476        }
1477
1478        /*
1479         * Umm, take care of errors if the page isn't up-to-date.
1480         * Try to re-read it _once_. We do this synchronously,
1481         * because there really aren't any performance issues here
1482         * and we need to check for errors.
1483         */
1484        lock_page(page);
1485
1486        /* Somebody truncated the page on us? */
1487        if (!page->mapping) {
1488                unlock_page(page);
1489                goto err;
1490        }
1491        /* Somebody else successfully read it in? */
1492        if (PageUptodate(page)) {
1493                unlock_page(page);
1494                goto success;
1495        }
1496
1497        ClearPageError(page);
1498        if (!mapping->a_ops->readpage(file, page)) {
1499                wait_on_page_locked(page);
1500                if (PageUptodate(page))
1501                        goto success;
1502        }
1503
1504        /*
1505         * Things didn't work out. Return zero to tell the
1506         * mm layer so, possibly freeing the page cache page first.
1507         */
1508err:
1509        page_cache_release(page);
1510
1511        return NULL;
1512}
1513
1514static int filemap_populate(struct vm_area_struct *vma,
1515                        unsigned long addr,
1516                        unsigned long len,
1517                        pgprot_t prot,
1518                        unsigned long pgoff,
1519                        int nonblock)
1520{
1521        struct file *file = vma->vm_file;
1522        struct address_space *mapping = file->f_mapping;
1523        struct inode *inode = mapping->host;
1524        unsigned long size;
1525        struct mm_struct *mm = vma->vm_mm;
1526        struct page *page;
1527        int err;
1528
1529        if (!nonblock)
1530                force_page_cache_readahead(mapping, vma->vm_file,
1531                                        pgoff, len >> PAGE_CACHE_SHIFT);
1532
1533repeat:
1534        size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1535        if (pgoff + (len >> PAGE_CACHE_SHIFT) > size)
1536                return -EINVAL;
1537
1538        page = filemap_getpage(file, pgoff, nonblock);
1539        if (!page && !nonblock)
1540                return -ENOMEM;
1541        if (page) {
1542                err = install_page(mm, vma, addr, page, prot);
1543                if (err) {
1544                        page_cache_release(page);
1545                        return err;
1546                }
1547        } else {
1548                err = install_file_pte(mm, vma, addr, pgoff, prot);
1549                if (err)
1550                        return err;
1551        }
1552
1553        len -= PAGE_SIZE;
1554        addr += PAGE_SIZE;
1555        pgoff++;
1556        if (len)
1557                goto repeat;
1558
1559        return 0;
1560}
1561
1562struct vm_operations_struct generic_file_vm_ops = {
1563        .nopage         = filemap_nopage,
1564        .populate       = filemap_populate,
1565};
1566
1567/* This is used for a general mmap of a disk file */
1568
1569int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
1570{
1571        struct address_space *mapping = file->f_mapping;
1572
1573        if (!mapping->a_ops->readpage)
1574                return -ENOEXEC;
1575        file_accessed(file);
1576        vma->vm_ops = &generic_file_vm_ops;
1577        return 0;
1578}
1579
1580/*
1581 * This is for filesystems which do not implement ->writepage.
1582 */
1583int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
1584{
1585        if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
1586                return -EINVAL;
1587        return generic_file_mmap(file, vma);
1588}
1589
1590/* This is used for a general mmap of a disk file without modify atime */
1591
1592int generic_file_noatime_mmap(struct file * file, struct vm_area_struct * vma)
1593{
1594        struct address_space *mapping = file->f_mapping;
1595
1596        if (!mapping->a_ops->readpage)
1597                return -ENOEXEC;
1598        vma->vm_ops = &generic_file_vm_ops;
1599        return 0;
1600}
1601#else
1602int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
1603{
1604        return -ENOSYS;
1605}
1606int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma)
1607{
1608        return -ENOSYS;
1609}
1610int generic_file_noatime_mmap(struct file * file, struct vm_area_struct * vma)
1611{
1612        return -ENOSYS;
1613}
1614#endif /* CONFIG_MMU */
1615
1616EXPORT_SYMBOL(generic_file_mmap);
1617EXPORT_SYMBOL(generic_file_readonly_mmap);
1618EXPORT_SYMBOL(generic_file_noatime_mmap);
1619
1620static inline struct page *__read_cache_page(struct address_space *mapping,
1621                                unsigned long index,
1622                                int (*filler)(void *,struct page*),
1623                                void *data)
1624{
1625        struct page *page, *cached_page = NULL;
1626        int err;
1627repeat:
1628        page = find_get_page(mapping, index);
1629        if (!page) {
1630                if (!cached_page) {
1631                        cached_page = page_cache_alloc_cold(mapping);
1632                        if (!cached_page)
1633                                return ERR_PTR(-ENOMEM);
1634                }
1635                err = add_to_page_cache_lru(cached_page, mapping,
1636                                        index, GFP_KERNEL);
1637                if (err == -EEXIST)
1638                        goto repeat;
1639                if (err < 0) {
1640                        /* Presumably ENOMEM for radix tree node */
1641                        page_cache_release(cached_page);
1642                        return ERR_PTR(err);
1643                }
1644                page = cached_page;
1645                cached_page = NULL;
1646                err = filler(data, page);
1647                if (err < 0) {
1648                        page_cache_release(page);
1649                        page = ERR_PTR(err);
1650                }
1651        }
1652        if (cached_page)
1653                page_cache_release(cached_page);
1654        return page;
1655}
1656
1657/*
1658 * Read into the page cache. If a page already exists,
1659 * and PageUptodate() is not set, try to fill the page.
1660 */
1661struct page *read_cache_page(struct address_space *mapping,
1662                                unsigned long index,
1663                                int (*filler)(void *,struct page*),
1664                                void *data)
1665{
1666        struct page *page;
1667        int err;
1668
1669retry:
1670        page = __read_cache_page(mapping, index, filler, data);
1671        if (IS_ERR(page))
1672                goto out;
1673        mark_page_accessed(page);
1674        if (PageUptodate(page))
1675                goto out;
1676
1677        lock_page(page);
1678        if (!page->mapping) {
1679                unlock_page(page);
1680                page_cache_release(page);
1681                goto retry;
1682        }
1683        if (PageUptodate(page)) {
1684                unlock_page(page);
1685                goto out;
1686        }
1687        err = filler(data, page);
1688        if (err < 0) {
1689                page_cache_release(page);
1690                page = ERR_PTR(err);
1691        }
1692 out:
1693        return page;
1694}
1695
1696EXPORT_SYMBOL(read_cache_page);
1697
1698/*
1699 * If the page was newly created, increment its refcount and add it to the
1700 * caller's lru-buffering pagevec.  This function is specifically for
1701 * generic_file_write().
1702 */
1703static inline struct page *
1704__grab_cache_page(struct address_space *mapping, unsigned long index,
1705                        struct page **cached_page, struct pagevec *lru_pvec)
1706{
1707        int err;
1708        struct page *page;
1709repeat:
1710        page = find_lock_page(mapping, index);
1711        if (!page) {
1712                if (!*cached_page) {
1713                        *cached_page = page_cache_alloc(mapping);
1714                        if (!*cached_page)
1715                                return NULL;
1716                }
1717                err = add_to_page_cache(*cached_page, mapping,
1718                                        index, GFP_KERNEL);
1719                if (err == -EEXIST)
1720                        goto repeat;
1721                if (err == 0) {
1722                        page = *cached_page;
1723                        page_cache_get(page);
1724                        if (!pagevec_add(lru_pvec, page))
1725                                __pagevec_lru_add(lru_pvec);
1726                        *cached_page = NULL;
1727                }
1728        }
1729        return page;
1730}
1731
1732/*
1733 * The logic we want is
1734 *
1735 *      if suid or (sgid and xgrp)
1736 *              remove privs
1737 */
1738int should_remove_suid(struct dentry *dentry)
1739{
1740        mode_t mode = dentry->d_inode->i_mode;
1741        int kill = 0;
1742
1743        /* suid always must be killed */
1744        if (unlikely(mode & S_ISUID))
1745                kill = ATTR_KILL_SUID;
1746
1747        /*
1748         * sgid without any exec bits is just a mandatory locking mark; leave
1749         * it alone.  If some exec bits are set, it's a real sgid; kill it.
1750         */
1751        if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
1752                kill |= ATTR_KILL_SGID;
1753
1754        if (unlikely(kill && !capable(CAP_FSETID)))
1755                return kill;
1756
1757        return 0;
1758}
1759
1760int __remove_suid(struct dentry *dentry, int kill)
1761{
1762        struct iattr newattrs;
1763
1764        newattrs.ia_valid = ATTR_FORCE | kill;
1765        return notify_change(dentry, &newattrs);
1766}
1767
/*
 * Strip the suid/sgid bits from @dentry if should_remove_suid() says so.
 * Returns 0 when nothing had to be done, otherwise the result of
 * __remove_suid().
 */
int remove_suid(struct dentry *dentry)
{
	int kill = should_remove_suid(dentry);

	if (likely(!kill))
		return 0;

	return __remove_suid(dentry, kill);
}
EXPORT_SYMBOL(remove_suid);
1778
1779/*
1780 * Copy as much as we can into the page and return the number of bytes which
1781 * were sucessfully copied.  If a fault is encountered then clear the page
1782 * out to (offset+bytes) and return the number of bytes which were copied.
1783 */
1784static inline size_t
1785filemap_copy_from_user(struct page *page, unsigned long offset,
1786                        const char __user *buf, unsigned bytes)
1787{
1788        char *kaddr;
1789        int left;
1790
1791        kaddr = kmap_atomic(page, KM_USER0);
1792        left = __copy_from_user_inatomic(kaddr + offset, buf, bytes);
1793        kunmap_atomic(kaddr, KM_USER0);
1794
1795        if (left != 0) {
1796                /* Do it the slow way */
1797                kaddr = kmap(page);
1798                left = __copy_from_user(kaddr + offset, buf, bytes);
1799                kunmap(page);
1800        }
1801        return bytes - left;
1802}
1803
1804static size_t
1805__filemap_copy_from_user_iovec(char *vaddr, 
1806                        const struct iovec *iov, size_t base, size_t bytes)
1807{
1808        size_t copied = 0, left = 0;
1809
1810        while (bytes) {
1811                char __user *buf = iov->iov_base + base;
1812                int copy = min(bytes, iov->iov_len - base);
1813
1814                base = 0;
1815                left = __copy_from_user_inatomic(vaddr, buf, copy);
1816                copied += copy;
1817                bytes -= copy;
1818                vaddr += copy;
1819                iov++;
1820
1821                if (unlikely(left)) {
1822                        /* zero the rest of the target like __copy_from_user */
1823                        if (bytes)
1824                                memset(vaddr, 0, bytes);
1825                        break;
1826                }
1827        }
1828        return copied - left;
1829}
1830
1831/*
1832 * This has the same sideeffects and return value as filemap_copy_from_user().
1833 * The difference is that on a fault we need to memset the remainder of the
1834 * page (out to offset+bytes), to emulate filemap_copy_from_user()'s
1835 * single-segment behaviour.
1836 */
1837static inline size_t
1838filemap_copy_from_user_iovec(struct page *page, unsigned long offset,
1839                        const struct iovec *iov, size_t base, size_t bytes)
1840{
1841        char *kaddr;
1842        size_t copied;
1843
1844        kaddr = kmap_atomic(page, KM_USER0);
1845        copied = __filemap_copy_from_user_iovec(kaddr + offset, iov,
1846                                                base, bytes);
1847        kunmap_atomic(kaddr, KM_USER0);
1848        if (copied != bytes) {
1849                kaddr = kmap(page);
1850                copied = __filemap_copy_from_user_iovec(kaddr + offset, iov,
1851                                                        base, bytes);
1852                kunmap(page);
1853        }
1854        return copied;
1855}
1856
/*
 * Advance an iovec cursor by @bytes.
 *
 * @iovp:  in/out pointer to the current segment
 * @basep: in/out offset already consumed within that segment
 * @bytes: number of bytes to step over
 *
 * When a segment is consumed exactly, the cursor moves to the start of the
 * next one (zero-length segments are stepped over the same way while bytes
 * remain).  The caller must make sure the array covers @bytes.
 *
 * The per-step amount is kept in a size_t: storing it in an int truncated
 * segment remainders of 2^31 bytes or more, which corrupted the cursor and
 * could keep the loop from terminating.
 */
static inline void
filemap_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes)
{
	const struct iovec *iov = *iovp;
	size_t base = *basep;

	while (bytes) {
		size_t room = iov->iov_len - base;
		size_t step = bytes < room ? bytes : room;

		bytes -= step;
		base += step;
		if (iov->iov_len == base) {
			iov++;
			base = 0;
		}
	}
	*iovp = iov;
	*basep = base;
}
1876
1877/*
1878 * Performs necessary checks before doing a write
1879 *
1880 * Can adjust writing position aor amount of bytes to write.
1881 * Returns appropriate error code that caller should return or
1882 * zero in case that write should be allowed.
1883 */
1884inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk)
1885{
1886        struct inode *inode = file->f_mapping->host;
1887        unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
1888
1889        if (unlikely(*pos < 0))
1890                return -EINVAL;
1891
1892        if (unlikely(file->f_error)) {
1893                int err = file->f_error;
1894                file->f_error = 0;
1895                return err;
1896        }
1897
1898        if (!isblk) {
1899                /* FIXME: this is for backwards compatibility with 2.4 */
1900                if (file->f_flags & O_APPEND)
1901                        *pos = i_size_read(inode);
1902
1903                if (limit != RLIM_INFINITY) {
1904                        if (*pos >= limit) {
1905                                send_sig(SIGXFSZ, current, 0);
1906                                return -EFBIG;
1907                        }
1908                        if (*count > limit - (typeof(limit))*pos) {
1909                                *count = limit - (typeof(limit))*pos;
1910                        }
1911                }
1912        }
1913
1914        /*
1915         * LFS rule
1916         */
1917        if (unlikely(*pos + *count > MAX_NON_LFS &&
1918                                !(file->f_flags & O_LARGEFILE))) {
1919                if (*pos >= MAX_NON_LFS) {
1920                        send_sig(SIGXFSZ, current, 0);
1921                        return -EFBIG;
1922                }
1923                if (*count > MAX_NON_LFS - (unsigned long)*pos) {
1924                        *count = MAX_NON_LFS - (unsigned long)*pos;
1925                }
1926        }
1927
1928        /*
1929         * Are we about to exceed the fs block limit ?
1930         *
1931         * If we have written data it becomes a short write.  If we have
1932         * exceeded without writing data we send a signal and return EFBIG.
1933         * Linus frestrict idea will clean these up nicely..
1934         */
1935        if (likely(!isblk)) {
1936                if (unlikely(*pos >= inode->i_sb->s_maxbytes)) {
1937                        if (*count || *pos > inode->i_sb->s_maxbytes) {
1938                                send_sig(SIGXFSZ, current, 0);
1939                                return -EFBIG;
1940                        }
1941                        /* zero-length writes at ->s_maxbytes are OK */
1942                }
1943
1944                if (unlikely(*pos + *count > inode->i_sb->s_maxbytes))
1945                        *count = inode->i_sb->s_maxbytes - *pos;
1946        } else {
1947                loff_t isize;
1948                if (bdev_read_only(I_BDEV(inode)))
1949                        return -EPERM;
1950                isize = i_size_read(inode);
1951                if (*pos >= isize) {
1952                        if (*count || *pos > isize)
1953                                return -ENOSPC;
1954                }
1955
1956                if (*pos + *count > isize)
1957                        *count = isize - *pos;
1958        }
1959        return 0;
1960}
1961EXPORT_SYMBOL(generic_write_checks);
1962
1963ssize_t
1964generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
1965                unsigned long *nr_segs, loff_t pos, loff_t *ppos,
1966                size_t count, size_t ocount)
1967{
1968        struct file     *file = iocb->ki_filp;
1969        struct address_space *mapping = file->f_mapping;
1970        struct inode    *inode = mapping->host;
1971        ssize_t         written;
1972
1973        if (count != ocount)
1974                *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count);
1975
1976        written = generic_file_direct_IO(WRITE, iocb, iov, pos, *nr_segs);
1977        if (written > 0) {
1978                loff_t end = pos + written;
1979                if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
1980                        i_size_write(inode,  end);
1981                        mark_inode_dirty(inode);
1982                }
1983                *ppos = end;
1984        }
1985
1986        /*
1987         * Sync the fs metadata but not the minor inode changes and
1988         * of course not the data as we did direct DMA for the IO.
1989         * i_sem is held, which protects generic_osync_inode() from
1990         * livelocking.  AIO O_DIRECT ops attempt to sync metadata here.
1991         */
1992        if ((written >= 0 || written == -EIOCBQUEUED) &&
1993            ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
1994                int err = generic_osync_inode(inode, mapping, OSYNC_METADATA);
1995                if (err < 0)
1996                        written = err;
1997        }
1998        return written;
1999}
2000EXPORT_SYMBOL(generic_file_direct_write);
2001
2002ssize_t
2003generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
2004                unsigned long nr_segs, loff_t pos, loff_t *ppos,
2005                size_t count, ssize_t written)
2006{
2007        struct file *file = iocb->ki_filp;
2008        struct address_space * mapping = file->f_mapping;
2009        struct address_space_operations *a_ops = mapping->a_ops;
2010        struct inode    *inode = mapping->host;
2011        long            status = 0;
2012        struct page     *page;
2013        struct page     *cached_page = NULL;
2014        size_t          bytes;
2015        struct pagevec  lru_pvec;
2016        const struct iovec *cur_iov = iov; /* current iovec */
2017        size_t          iov_base = 0;      /* offset in the current iovec */
2018        char __user     *buf;
2019
2020        pagevec_init(&lru_pvec, 0);
2021
2022        /*
2023         * handle partial DIO write.  Adjust cur_iov if needed.
2024         */
2025        if (likely(nr_segs == 1))
2026                buf = iov->iov_base + written;
2027        else {
2028                filemap_set_next_iovec(&cur_iov, &iov_base, written);
2029                buf = cur_iov->iov_base + iov_base;
2030        }
2031
2032        do {
2033                unsigned long index;
2034                unsigned long offset;
2035                unsigned long maxlen;
2036                size_t copied;
2037
2038                offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
2039                index = pos >> PAGE_CACHE_SHIFT;
2040                bytes = PAGE_CACHE_SIZE - offset;
2041                if (bytes > count)
2042                        bytes = count;
2043
2044                /*
2045                 * Bring in the user page that we will copy from _first_.
2046                 * Otherwise there's a nasty deadlock on copying from the
2047                 * same page as we're writing to, without it being marked
2048                 * up-to-date.
2049                 */
2050                maxlen = cur_iov->iov_len - iov_base;
2051                if (maxlen > bytes)
2052                        maxlen = bytes;
2053                fault_in_pages_readable(buf, maxlen);
2054                page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec);
2055                if (!page) {
2056                        status = -ENOMEM;
2057                        break;
2058                }
2059
2060                status = a_ops->prepare_write(file, page, offset, offset+bytes);
2061                if (unlikely(status)) {
2062                        loff_t isize = i_size_read(inode);
2063                        /*
2064                         * prepare_write() may have instantiated a few blocks
2065                         * outside i_size.  Trim these off again.
2066                         */
2067                        unlock_page(page);
2068                        page_cache_release(page);
2069                        if (pos + bytes > isize)
2070                                vmtruncate(inode, isize);
2071                        break;
2072                }
2073                if (likely(nr_segs == 1))
2074                        copied = filemap_copy_from_user(page, offset,
2075                                                        buf, bytes);
2076                else
2077                        copied = filemap_copy_from_user_iovec(page, offset,
2078                                                cur_iov, iov_base, bytes);
2079                flush_dcache_page(page);
2080                status = a_ops->commit_write(file, page, offset, offset+bytes);
2081                if (likely(copied > 0)) {
2082                        if (!status)
2083                                status = copied;
2084
2085                        if (status >= 0) {
2086                                written += status;
2087                                count -= status;
2088                                pos += status;
2089                                buf += status;
2090                                if (unlikely(nr_segs > 1)) {
2091                                        filemap_set_next_iovec(&cur_iov,
2092                                                        &iov_base, status);
2093                                        if (count)
2094                                                buf = cur_iov->iov_base + iov_base;
2095                                } else {
2096                                        iov_base += status;
2097                                }
2098                        }
2099                }
2100                if (unlikely(copied != bytes))
2101                        if (status >= 0)
2102                                status = -EFAULT;
2103                unlock_page(page);
2104                mark_page_accessed(page);
2105                page_cache_release(page);
2106                if (status < 0)
2107                        break;
2108                balance_dirty_pages_ratelimited(mapping);
2109                cond_resched();
2110        } while (count);
2111        *ppos = pos;
2112
2113        if (cached_page)
2114                page_cache_release(cached_page);
2115
2116        /*
2117         * For now, when the user asks for O_SYNC, we'll actually give O_DSYNC
2118         */
2119        if (likely(status >= 0)) {
2120                if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2121                        if (!a_ops->writepage || !is_sync_kiocb(iocb))
2122                                status = generic_osync_inode(inode, mapping,
2123                                                OSYNC_METADATA|OSYNC_DATA);
2124                }
2125        }
2126        
2127        /*
2128         * If we get here for O_DIRECT writes then we must have fallen through
2129         * to buffered writes (block instantiation inside i_size).  So we sync
2130         * the file data here, to try to honour O_DIRECT expectations.
2131         */
2132        if (unlikely(file->f_flags & O_DIRECT) && written)
2133                status = filemap_write_and_wait(mapping);
2134
2135        pagevec_lru_add(&lru_pvec);
2136        return written ? written : status;
2137}
2138EXPORT_SYMBOL(generic_file_buffered_write);
2139
2140ssize_t
2141__generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
2142                                unsigned long nr_segs, loff_t *ppos)
2143{
2144        struct file *file = iocb->ki_filp;
2145        struct address_space * mapping = file->f_mapping;
2146        size_t ocount;          /* original count */
2147        size_t count;           /* after file limit checks */
2148        struct inode    *inode = mapping->host;
2149        unsigned long   seg;
2150        loff_t          pos;
2151        ssize_t         written;
2152        ssize_t         err;
2153
2154        ocount = 0;
2155        for (seg = 0; seg < nr_segs; seg++) {
2156                const struct iovec *iv = &iov[seg];
2157
2158                /*
2159                 * If any segment has a negative length, or the cumulative
2160                 * length ever wraps negative then return -EINVAL.
2161                 */
2162                ocount += iv->iov_len;
2163                if (unlikely((ssize_t)(ocount|iv->iov_len) < 0))
2164                        return -EINVAL;
2165                if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
2166                        continue;
2167                if (seg == 0)
2168                        return -EFAULT;
2169                nr_segs = seg;
2170                ocount -= iv->iov_len;  /* This segment is no good */
2171                break;
2172        }
2173
2174        count = ocount;
2175        pos = *ppos;
2176
2177        /* We can write back this queue in page reclaim */
2178        current->backing_dev_info = mapping->backing_dev_info;
2179        written = 0;
2180
2181        err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
2182        if (err)
2183                goto out;
2184
2185        if (count == 0)
2186                goto out;
2187
2188        err = remove_suid(file->f_dentry);
2189        if (err)
2190                goto out;
2191
2192        inode_update_time(inode, 1);
2193
2194        /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
2195        if (unlikely(file->f_flags & O_DIRECT)) {
2196                written = generic_file_direct_write(iocb, iov,
2197                                &nr_segs, pos, ppos, count, ocount);
2198                if (written < 0 || written == count)
2199                        goto out;
2200                /*
2201                 * direct-io write to a hole: fall through to buffered I/O
2202                 * for completing the rest of the request.
2203                 */
2204                pos += written;
2205                count -= written;
2206        }
2207
2208        written = generic_file_buffered_write(iocb, iov, nr_segs,
2209                        pos, ppos, count, written);
2210out:
2211        current->backing_dev_info = NULL;
2212        return written ? written : err;
2213}
2214EXPORT_SYMBOL(generic_file_aio_write_nolock);
2215
2216ssize_t
2217generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
2218                                unsigned long nr_segs, loff_t *ppos)
2219{
2220        struct file *file = iocb->ki_filp;
2221        struct address_space *mapping = file->f_mapping;
2222        struct inode *inode = mapping->host;
2223        ssize_t ret;
2224        loff_t pos = *ppos;
2225
2226        ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs, ppos);
2227
2228        if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2229                int err;
2230
2231                err = sync_page_range_nolock(inode, mapping, pos, ret);
2232                if (err < 0)
2233                        ret = err;
2234        }
2235        return ret;
2236}
2237
2238ssize_t
2239__generic_file_write_nolock(struct file *file, const struct iovec *iov,
2240                                unsigned long nr_segs, loff_t *ppos)
2241{
2242        struct kiocb kiocb;
2243        ssize_t ret;
2244
2245        init_sync_kiocb(&kiocb, file);
2246        ret = __generic_file_aio_write_nolock(&kiocb, iov, nr_segs, ppos);
2247        if (ret == -EIOCBQUEUED)
2248                ret = wait_on_sync_kiocb(&kiocb);
2249        return ret;
2250}
2251
2252ssize_t
2253generic_file_write_nolock(struct file *file, const struct iovec *iov,
2254                                unsigned long nr_segs, loff_t *ppos)
2255{
2256        struct kiocb kiocb;
2257        ssize_t ret;
2258
2259        init_sync_kiocb(&kiocb, file);
2260        ret = generic_file_aio_write_nolock(&kiocb, iov, nr_segs, ppos);
2261        if (-EIOCBQUEUED == ret)
2262                ret = wait_on_sync_kiocb(&kiocb);
2263        return ret;
2264}
2265EXPORT_SYMBOL(generic_file_write_nolock);
2266
2267ssize_t generic_file_aio_write(struct kiocb *iocb, const char __user *buf,
2268                               size_t count, loff_t pos)
2269{
2270        struct file *file = iocb->ki_filp;
2271        struct address_space *mapping = file->f_mapping;
2272        struct inode *inode = mapping->host;
2273        ssize_t ret;
2274        struct iovec local_iov = { .iov_base = (void __user *)buf,
2275                                        .iov_len = count };
2276
2277        BUG_ON(iocb->ki_pos != pos);
2278
2279        down(&inode->i_sem);
2280        ret = generic_file_aio_write_nolock(iocb, &local_iov, 1,
2281                                                &iocb->ki_pos);
2282        up(&inode->i_sem);
2283
2284        if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2285                ssize_t err;
2286
2287                err = sync_page_range(inode, mapping, pos, ret);
2288                if (err < 0)
2289                        ret = err;
2290        }
2291        return ret;
2292}
2293EXPORT_SYMBOL(generic_file_aio_write);
2294
2295ssize_t generic_file_write(struct file *file, const char __user *buf,
2296                           size_t count, loff_t *ppos)
2297{
2298        struct address_space *mapping = file->f_mapping;
2299        struct inode *inode = mapping->host;
2300        ssize_t ret;
2301        struct iovec local_iov = { .iov_base = (void __user *)buf,
2302                                        .iov_len = count };
2303
2304        down(&inode->i_sem);
2305        ret = __generic_file_write_nolock(file, &local_iov, 1, ppos);
2306        up(&inode->i_sem);
2307
2308        if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2309                ssize_t err;
2310
2311                err = sync_page_range(inode, mapping, *ppos - ret, ret);
2312                if (err < 0)
2313                        ret = err;
2314        }
2315        return ret;
2316}
2317EXPORT_SYMBOL(generic_file_write);
2318
2319ssize_t generic_file_readv(struct file *filp, const struct iovec *iov,
2320                        unsigned long nr_segs, loff_t *ppos)
2321{
2322        struct kiocb kiocb;
2323        ssize_t ret;
2324
2325        init_sync_kiocb(&kiocb, filp);
2326        ret = __generic_file_aio_read(&kiocb, iov, nr_segs, ppos);
2327        if (-EIOCBQUEUED == ret)
2328                ret = wait_on_sync_kiocb(&kiocb);
2329        return ret;
2330}
2331EXPORT_SYMBOL(generic_file_readv);
2332
2333ssize_t generic_file_writev(struct file *file, const struct iovec *iov,
2334                        unsigned long nr_segs, loff_t *ppos)
2335{
2336        struct address_space *mapping = file->f_mapping;
2337        struct inode *inode = mapping->host;
2338        ssize_t ret;
2339
2340        down(&inode->i_sem);
2341        ret = __generic_file_write_nolock(file, iov, nr_segs, ppos);
2342        up(&inode->i_sem);
2343
2344        if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2345                int err;
2346
2347                err = sync_page_range(inode, mapping, *ppos - ret, ret);
2348                if (err < 0)
2349                        ret = err;
2350        }
2351        return ret;
2352}
2353EXPORT_SYMBOL(generic_file_writev);
2354
2355/*
2356 * Called under i_sem for writes to S_ISREG files
2357 */
2358ssize_t
2359generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
2360        loff_t offset, unsigned long nr_segs)
2361{
2362        struct file *file = iocb->ki_filp;
2363        struct address_space *mapping = file->f_mapping;
2364        ssize_t retval;
2365
2366        retval = filemap_write_and_wait(mapping);
2367        if (retval == 0) {
2368                retval = mapping->a_ops->direct_IO(rw, iocb, iov,
2369                                                offset, nr_segs);
2370                if (rw == WRITE && mapping->nrpages)
2371                        invalidate_inode_pages2(mapping);
2372        }
2373        return retval;
2374}
2375EXPORT_SYMBOL_GPL(generic_file_direct_IO);
2376