RHEL4/fs/buffer.c
<<
>>
Prefs
   1/*
   2 *  linux/fs/buffer.c
   3 *
   4 *  Copyright (C) 1991, 1992, 2002  Linus Torvalds
   5 */
   6
   7/*
   8 * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
   9 *
  10 * Removed a lot of unnecessary code and simplified things now that
  11 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
  12 *
  13 * Speed up hash, lru, and free list operations.  Use gfp() for allocating
  14 * hash table, use SLAB cache for buffer heads. SMP threading.  -DaveM
  15 *
   16 * Added 32k buffer block sizes - these are required by older ARM systems. - RMK
  17 *
  18 * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
  19 */
  20
  21#include <linux/config.h>
  22#include <linux/kernel.h>
  23#include <linux/fs.h>
  24#include <linux/mm.h>
  25#include <linux/percpu.h>
  26#include <linux/slab.h>
  27#include <linux/smp_lock.h>
  28#include <linux/blkdev.h>
  29#include <linux/file.h>
  30#include <linux/quotaops.h>
  31#include <linux/highmem.h>
  32#include <linux/module.h>
  33#include <linux/writeback.h>
  34#include <linux/hash.h>
  35#include <linux/suspend.h>
  36#include <linux/buffer_head.h>
  37#include <linux/task_io_accounting_ops.h>
  38#include <linux/bio.h>
  39#include <linux/notifier.h>
  40#include <linux/cpu.h>
  41#include <asm/bitops.h>
  42#include <linux/sysctl.h>
  43#include <linux/gfp.h>
  44
   45/* A global variable is a bit ugly, but it keeps the code simple */
   /*
    * NOTE(review): nothing in this part of the file reads or writes
    * sysctl_drop_caches; presumably it is set through a sysctl handler
    * defined elsewhere -- confirm.
    */
   46int sysctl_drop_caches;
   47
   /*
    * Forward declarations: sync_mapping_buffers() and invalidate_bdev() below
    * call these before their definitions appear.
    */
   48static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
   49static void invalidate_bh_lrus(void);
   50
   /* Convert a b_assoc_buffers list node into its enclosing buffer_head. */
   51#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
  52
   /*
    * Wait-queue entry that remembers which buffer_head its sleeper is
    * waiting for.  bh_wake_function() recovers this structure from the
    * embedded wait_queue_t and compares ->bh against the wake-up key.
    */
   53struct bh_wait_queue {
   54        struct buffer_head *bh;    /* buffer this waiter cares about */
   55        wait_queue_t wait;         /* entry linked on the wait queue head */
   56};
  57
  58#define __DEFINE_BH_WAIT(name, b, f)                                    \
  59        struct bh_wait_queue name = {                                   \
  60                .bh     = b,                                            \
  61                .wait   = {                                             \
  62                                .task   = current,                      \
  63                                .flags  = f,                            \
  64                                .func   = bh_wake_function,             \
  65                                .task_list =                            \
  66                                        LIST_HEAD_INIT(name.wait.task_list),\
  67                        },                                              \
  68        }
  69#define DEFINE_BH_WAIT(name, bh)        __DEFINE_BH_WAIT(name, bh, 0)
  70#define DEFINE_BH_WAIT_EXCLUSIVE(name, bh) \
  71                __DEFINE_BH_WAIT(name, bh, WQ_FLAG_EXCLUSIVE)
  72
   73/*
   74 * Hashed waitqueue_head's for wait_on_buffer()
    *
    * The table holds 1 << BH_WAIT_TABLE_ORDER wait queue heads and
    * bh_waitq_head() selects one with hash_ptr() on the buffer_head address.
    * Different buffers can therefore share a head, which is why
    * bh_wake_function() filters wake-ups by buffer.  Each entry carries the
    * ____cacheline_aligned_in_smp attribute.
   75 */
   76#define BH_WAIT_TABLE_ORDER     7
   77static struct bh_wait_queue_head {
   78        wait_queue_head_t wqh;
   79} ____cacheline_aligned_in_smp bh_wait_queue_heads[1<<BH_WAIT_TABLE_ORDER];
  80
  81inline void
  82init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
  83{
  84        bh->b_end_io = handler;
  85        bh->b_private = private;
  86}
  87
  88/*
  89 * Return the address of the waitqueue_head to be used for this
  90 * buffer_head
  91 */
  92wait_queue_head_t *bh_waitq_head(struct buffer_head *bh)
  93{
  94        return &bh_wait_queue_heads[hash_ptr(bh, BH_WAIT_TABLE_ORDER)].wqh;
  95}
  96EXPORT_SYMBOL(bh_waitq_head);
  97
  98void wake_up_buffer(struct buffer_head *bh)
  99{
 100        wait_queue_head_t *wq = bh_waitq_head(bh);
 101
 102        smp_mb();
 103        if (waitqueue_active(wq))
 104                __wake_up(wq, TASK_INTERRUPTIBLE|TASK_UNINTERRUPTIBLE, 1, bh);
 105}
 106EXPORT_SYMBOL(wake_up_buffer);
 107
 108static int bh_wake_function(wait_queue_t *wait, unsigned mode,
 109                                int sync, void *key)
 110{
 111        struct buffer_head *bh = key;
 112        struct bh_wait_queue *wq;
 113
 114        wq = container_of(wait, struct bh_wait_queue, wait);
 115        if (wq->bh != bh || buffer_locked(bh))
 116                return 0;
 117        else
 118                return autoremove_wake_function(wait, mode, sync, key);
 119}
 120
 121static void sync_buffer(struct buffer_head *bh)
 122{
 123        struct block_device *bd;
 124
 125        smp_mb();
 126        bd = bh->b_bdev;
 127        if (bd)
 128                blk_run_address_space(bd->bd_inode->i_mapping);
 129}
 130
  /*
   * __lock_buffer - sleep until the caller has set the lock bit of @bh.
   * @bh: buffer to lock
   *
   * Queues the caller as an exclusive, uninterruptible waiter on the hashed
   * wait queue for @bh and repeats until test_set_buffer_locked() returns
   * zero.  NOTE(review): presumably this is the slow path behind an inline
   * lock_buffer() in a header -- confirm.
   */
  131void fastcall __lock_buffer(struct buffer_head *bh)
  132{
  133        wait_queue_head_t *wqh = bh_waitq_head(bh);
  134        DEFINE_BH_WAIT_EXCLUSIVE(wait, bh);
  135
  136        do {
             /* queue ourselves before testing, so a wake-up cannot be missed */
  137                prepare_to_wait_exclusive(wqh, &wait.wait,
  138                                        TASK_UNINTERRUPTIBLE);
  139                if (buffer_locked(bh)) {
                     /* run the device queue, then sleep accounted as I/O wait */
  140                        sync_buffer(bh);
  141                        io_schedule();
  142                }
  143        } while (test_set_buffer_locked(bh));
  144        finish_wait(wqh, &wait.wait);
  145}
  146EXPORT_SYMBOL(__lock_buffer);
 147
  /*
   * unlock_buffer - clear the lock bit of @bh and wake a waiter.
   * @bh: buffer to unlock
   *
   * The clear is bracketed by the before/after clear-bit barriers, and only
   * then is wake_up_buffer() called, so the order of these four statements
   * must not be changed.
   */
  148void fastcall unlock_buffer(struct buffer_head *bh)
  149{
  150        smp_mb__before_clear_bit();
  151        clear_buffer_locked(bh);
  152        smp_mb__after_clear_bit();
  153        wake_up_buffer(bh);
  154}
 155
  156/*
  157 * Block until a buffer comes unlocked.  This doesn't stop it
  158 * from becoming locked again - you have to lock it yourself
  159 * if you want to preserve its state.
   *
   * The caller is queued as a non-exclusive, uninterruptible waiter on the
   * hashed wait queue for @bh.
  160 */
  161void __wait_on_buffer(struct buffer_head * bh)
  162{
  163        wait_queue_head_t *wqh = bh_waitq_head(bh);
  164        DEFINE_BH_WAIT(wait, bh);
  165
  166        do {
             /* queue ourselves before testing, so a wake-up cannot be missed */
  167                prepare_to_wait(wqh, &wait.wait, TASK_UNINTERRUPTIBLE);
  168                if (buffer_locked(bh)) {
                     /* run the device queue, then sleep accounted as I/O wait */
  169                        sync_buffer(bh);
  170                        io_schedule();
  171                }
  172        } while (buffer_locked(bh));
  173        finish_wait(wqh, &wait.wait);
  174}
 175
 176static void
 177__set_page_buffers(struct page *page, struct buffer_head *head)
 178{
 179        page_cache_get(page);
 180        SetPagePrivate(page);
 181        page->private = (unsigned long)head;
 182}
 183
 184static void
 185__clear_page_buffers(struct page *page)
 186{
 187        ClearPagePrivate(page);
 188        page->private = 0;
 189        page_cache_release(page);
 190}
 191
 192static void buffer_io_error(struct buffer_head *bh)
 193{
 194        char b[BDEVNAME_SIZE];
 195
 196        printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
 197                        bdevname(bh->b_bdev, b),
 198                        (unsigned long long)bh->b_blocknr);
 199}
 200
 201/*
 202 * Default synchronous end-of-IO handler..  Just mark it up-to-date and
 203 * unlock the buffer. This is what ll_rw_block uses too.
 204 */
 205void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
 206{
 207        if (uptodate) {
 208                set_buffer_uptodate(bh);
 209        } else {
 210                /* This happens, due to failed READA attempts. */
 211                clear_buffer_uptodate(bh);
 212        }
 213        unlock_buffer(bh);
 214        put_bh(bh);
 215}
 216
 217void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
 218{
 219        char b[BDEVNAME_SIZE];
 220
 221        if (uptodate) {
 222                set_buffer_uptodate(bh);
 223        } else {
 224                if (!buffer_eopnotsupp(bh) && printk_ratelimit()) {
 225                        buffer_io_error(bh);
 226                        printk(KERN_WARNING "lost page write due to "
 227                                        "I/O error on %s\n",
 228                                       bdevname(bh->b_bdev, b));
 229                }
 230                set_buffer_write_io_error(bh);
 231                clear_buffer_uptodate(bh);
 232        }
 233        unlock_buffer(bh);
 234        put_bh(bh);
 235}
 236
 237/*
 238 * Write out and wait upon all the dirty data associated with a block
 239 * device via its mapping.  Does not take the superblock lock.
 240 */
 241int sync_blockdev(struct block_device *bdev)
 242{
 243        int ret = 0;
 244
 245        if (bdev) {
 246                int err;
 247
 248                ret = filemap_fdatawrite(bdev->bd_inode->i_mapping);
 249                err = filemap_fdatawait(bdev->bd_inode->i_mapping);
 250                if (!ret)
 251                        ret = err;
 252        }
 253        return ret;
 254}
 255EXPORT_SYMBOL(sync_blockdev);
 256
 257/*
 258 * Write out and wait upon all dirty data associated with this
 259 * superblock.  Filesystem data as well as the underlying block
 260 * device.  Takes the superblock lock.
 261 */
 262int fsync_super(struct super_block *sb)
 263{
 264        sync_inodes_sb(sb, 0);
 265        DQUOT_SYNC(sb);
 266        lock_super(sb);
 267        if (sb->s_dirt && sb->s_op->write_super)
 268                sb->s_op->write_super(sb);
 269        unlock_super(sb);
 270        if (sb->s_op->sync_fs)
 271                sb->s_op->sync_fs(sb, 1);
 272        sync_blockdev(sb->s_bdev);
 273        sync_inodes_sb(sb, 1);
 274
 275        return sync_blockdev(sb->s_bdev);
 276}
 277
 278/*
 279 * Write out and wait upon all dirty data associated with this
 280 * device.   Filesystem data as well as the underlying block
 281 * device.  Takes the superblock lock.
 282 */
 283int fsync_bdev(struct block_device *bdev)
 284{
 285        struct super_block *sb = get_super(bdev);
 286        if (sb) {
 287                int res = fsync_super(sb);
 288                drop_super(sb);
 289                return res;
 290        }
 291        return sync_blockdev(bdev);
 292}
 293
  294/**
  295 * freeze_bdev  --  lock a filesystem and force it into a consistent state
  296 * @bdev:       blockdevice to lock
  297 *
  298 * This takes the block device bd_mount_sem to make sure no new mounts
  299 * happen on bdev until thaw_bdev() is called.
  300 * If a superblock is found on this device, we take the s_umount semaphore
  301 * on it to make sure nobody unmounts until the snapshot creation is done.
   *
   * Returns the superblock obtained from get_super() (NULL when there is
   * none); the caller hands it back to thaw_bdev().  A superblock mounted
   * MS_RDONLY is returned without being flushed or frozen.
  302 */
  303struct super_block *freeze_bdev(struct block_device *bdev)
  304{
  305        struct super_block *sb;
  306
  307        down(&bdev->bd_mount_sem);
  308        sb = get_super(bdev);
  309        if (sb && !(sb->s_flags & MS_RDONLY)) {
             /* stage 1: publish SB_FREEZE_WRITE before starting to flush */
  310                sb->s_frozen = SB_FREEZE_WRITE;
  311                wmb();
  312
  313                sync_inodes_sb(sb, 0);
  314                DQUOT_SYNC(sb);
  315
  316                lock_super(sb);
  317                if (sb->s_dirt && sb->s_op->write_super)
  318                        sb->s_op->write_super(sb);
  319                unlock_super(sb);
  320
  321                if (sb->s_op->sync_fs)
  322                        sb->s_op->sync_fs(sb, 1);
  323
  324                sync_blockdev(sb->s_bdev);
  325                sync_inodes_sb(sb, 1);
  326
             /* stage 2: publish SB_FREEZE_TRANS, then flush the device again */
  327                sb->s_frozen = SB_FREEZE_TRANS;
  328                wmb();
  329
  330                sync_blockdev(sb->s_bdev);
  331
             /* optional filesystem hook, called last */
  332                if (sb->s_op->write_super_lockfs)
  333                        sb->s_op->write_super_lockfs(sb);
  334        }
  335
         /* runs whether or not a superblock was found */
  336        sync_blockdev(bdev);
  337        return sb;      /* thaw_bdev releases s->s_umount and bd_mount_sem */
  338}
  339EXPORT_SYMBOL(freeze_bdev);
 340
 341/**
 342 * thaw_bdev  -- unlock filesystem
 343 * @bdev:       blockdevice to unlock
 344 * @sb:         associated superblock
 345 *
 346 * Unlocks the filesystem and marks it writeable again after freeze_bdev().
 347 */
 348void thaw_bdev(struct block_device *bdev, struct super_block *sb)
 349{
 350        if (sb) {
 351                BUG_ON(sb->s_bdev != bdev);
 352
 353                if (sb->s_op->unlockfs)
 354                        sb->s_op->unlockfs(sb);
 355                sb->s_frozen = SB_UNFROZEN;
 356                wmb();
 357                wake_up(&sb->s_wait_unfrozen);
 358                drop_super(sb);
 359        }
 360
 361        up(&bdev->bd_mount_sem);
 362}
 363EXPORT_SYMBOL(thaw_bdev);
 364
 365/*
 366 * sync everything.  Start out by waking pdflush, because that writes back
 367 * all queues in parallel.
 368 */
 369static void do_sync(unsigned long wait)
 370{
 371        wakeup_bdflush(0);
 372        sync_inodes(0);         /* All mappings, inodes and their blockdevs */
 373        DQUOT_SYNC(NULL);
 374        sync_supers();          /* Write the superblocks */
 375        sync_filesystems(0);    /* Start syncing the filesystems */
 376        sync_filesystems(wait); /* Waitingly sync the filesystems */
 377        sync_inodes(wait);      /* Mappings, inodes and blockdevs, again. */
 378        if (!wait)
 379                printk("Emergency Sync complete\n");
 380        if (unlikely(laptop_mode))
 381                laptop_sync_completion();
 382}
 383
  /*
   * sync system call entry point: runs do_sync() with wait == 1 and always
   * returns 0.
   */
  384asmlinkage long sys_sync(void)
  385{
  386        do_sync(1);
  387        return 0;
  388}
 389
  /*
   * Hand do_sync() to pdflush_operation() with argument 0 (the non-waiting
   * variant, which logs "Emergency Sync complete").  The return value of
   * pdflush_operation() is not checked.
   */
  390void emergency_sync(void)
  391{
  392        pdflush_operation(do_sync, 0);
  393}
 394
 395/*
 396 * Generic function to fsync a file.
 397 *
 398 * filp may be NULL if called via the msync of a vma.
 399 */
 400 
 401int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
 402{
 403        struct inode * inode = dentry->d_inode;
 404        struct super_block * sb;
 405        int ret, err;
 406
 407        /* sync the inode to buffers */
 408        ret = write_inode_now_err(inode, 0);
 409
 410        /* sync the superblock to buffers */
 411        sb = inode->i_sb;
 412        lock_super(sb);
 413        if (sb->s_op->write_super)
 414                sb->s_op->write_super(sb);
 415        unlock_super(sb);
 416
 417        /* .. finally sync the buffers to disk */
 418        err = sync_blockdev(sb->s_bdev);
 419        if (!ret)
 420                ret = err;
 421        return ret;
 422}
 423
 424asmlinkage long sys_fsync(unsigned int fd)
 425{
 426        struct file * file;
 427        struct address_space *mapping;
 428        int ret, err;
 429
 430        ret = -EBADF;
 431        file = fget(fd);
 432        if (!file)
 433                goto out;
 434
 435        mapping = file->f_mapping;
 436
 437        ret = -EINVAL;
 438        if (!file->f_op || !file->f_op->fsync) {
 439                /* Why?  We can still call filemap_fdatawrite */
 440                goto out_putf;
 441        }
 442
 443        /* We need to protect against concurrent writers.. */
 444        down(&mapping->host->i_sem);
 445        current->flags |= PF_SYNCWRITE;
 446        ret = filemap_fdatawrite(mapping);
 447        err = file->f_op->fsync(file, file->f_dentry, 0);
 448        if (!ret)
 449                ret = err;
 450        err = filemap_fdatawait(mapping);
 451        if (!ret)
 452                ret = err;
 453        current->flags &= ~PF_SYNCWRITE;
 454        up(&mapping->host->i_sem);
 455
 456out_putf:
 457        fput(file);
 458out:
 459        return ret;
 460}
 461
 462asmlinkage long sys_fdatasync(unsigned int fd)
 463{
 464        struct file * file;
 465        struct address_space *mapping;
 466        int ret, err;
 467
 468        ret = -EBADF;
 469        file = fget(fd);
 470        if (!file)
 471                goto out;
 472
 473        ret = -EINVAL;
 474        if (!file->f_op || !file->f_op->fsync)
 475                goto out_putf;
 476
 477        mapping = file->f_mapping;
 478
 479        down(&mapping->host->i_sem);
 480        current->flags |= PF_SYNCWRITE;
 481        ret = filemap_fdatawrite(mapping);
 482        err = file->f_op->fsync(file, file->f_dentry, 1);
 483        if (!ret)
 484                ret = err;
 485        err = filemap_fdatawait(mapping);
 486        if (!ret)
 487                ret = err;
 488        current->flags &= ~PF_SYNCWRITE;
 489        up(&mapping->host->i_sem);
 490
 491out_putf:
 492        fput(file);
 493out:
 494        return ret;
 495}
 496
 497/*
 498 * Various filesystems appear to want __find_get_block to be non-blocking.
 499 * But it's the page lock which protects the buffers.  To get around this,
 500 * we get exclusion from try_to_free_buffers with the blockdev mapping's
 501 * private_lock.
 502 *
 503 * Hack idea: for the blockdev mapping, i_bufferlist_lock contention
 504 * may be quite high.  This code could TryLock the page, and if that
 505 * succeeds, there is no need to take private_lock. (But if
 506 * private_lock is contended then so is mapping->tree_lock).
 507 */
 508static struct buffer_head *
 509__find_get_block_slow(struct block_device *bdev, sector_t block, int unused)
 510{
 511        struct inode *bd_inode = bdev->bd_inode;
 512        struct address_space *bd_mapping = bd_inode->i_mapping;
 513        struct buffer_head *ret = NULL;
 514        pgoff_t index;
 515        struct buffer_head *bh;
 516        struct buffer_head *head;
 517        struct page *page;
 518        int all_mapped = 1;
 519
 520        index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
 521        page = find_get_page(bd_mapping, index);
 522        if (!page)
 523                goto out;
 524
 525        spin_lock(&bd_mapping->private_lock);
 526        if (!page_has_buffers(page))
 527                goto out_unlock;
 528        head = page_buffers(page);
 529        bh = head;
 530        do {
 531                if (bh->b_blocknr == block) {
 532                        ret = bh;
 533                        get_bh(bh);
 534                        goto out_unlock;
 535                }
 536                if (!buffer_mapped(bh))
 537                        all_mapped = 0;
 538                bh = bh->b_this_page;
 539        } while (bh != head);
 540
 541        /* we might be here because some of the buffers on this page are 
 542         * not mapped.  This is due to various races between
 543         * file io on the block device and getblk.  It gets dealt with
 544         * elsewhere, don't buffer_error if we had some unmapped buffers
 545         */
 546        if (all_mapped) {
 547                printk("__find_get_block_slow() failed. "
 548                        "block=%llu, b_blocknr=%llu\n",
 549                        (unsigned long long)block, (unsigned long long)bh->b_blocknr);
 550                printk("b_state=0x%08lx, b_size=%u\n", bh->b_state, bh->b_size);
 551                printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits);
 552        }
 553out_unlock:
 554        spin_unlock(&bd_mapping->private_lock);
 555        page_cache_release(page);
 556out:
 557        return ret;
 558}
 559
 560/* If invalidate_buffers() will trash dirty buffers, it means some kind
 561   of fs corruption is going on. Trashing dirty data always imply losing
 562   information that was supposed to be just stored on the physical layer
 563   by the user.
 564
 565   Thus invalidate_buffers in general usage is not allwowed to trash
 566   dirty buffers. For example ioctl(FLSBLKBUF) expects dirty data to
 567   be preserved.  These buffers are simply skipped.
 568  
 569   We also skip buffers which are still in use.  For example this can
 570   happen if a userspace program is reading the block device.
 571
 572   NOTE: In the case where the user removed a removable-media-disk even if
 573   there's still dirty data not synced on disk (due a bug in the device driver
 574   or due an error of the user), by not destroying the dirty buffers we could
 575   generate corruption also on the next media inserted, thus a parameter is
 576   necessary to handle this case in the most safe way possible (trying
 577   to not corrupt also the new disk inserted with the data belonging to
 578   the old now corrupted disk). Also for the ramdisk the natural thing
 579   to do in order to release the ramdisk memory is to destroy dirty buffers.
 580
 581   These are two special cases. Normal usage imply the device driver
 582   to issue a sync on the device (without waiting I/O completion) and
 583   then an invalidate_buffers call that doesn't trash dirty buffers.
 584
 585   For handling cache coherency with the blkdev pagecache the 'update' case
 586   is been introduced. It is needed to re-read from disk any pinned
 587   buffer. NOTE: re-reading from disk is destructive so we can do it only
 588   when we assume nobody is changing the buffercache under our I/O and when
 589   we think the disk contains more recent information than the buffercache.
 590   The update == 1 pass marks the buffers we need to update, the update == 2
 591   pass does the actual I/O. */
 592void invalidate_bdev(struct block_device *bdev, int destroy_dirty_buffers)
 593{
 594        struct address_space *mapping = bdev->bd_inode->i_mapping;
 595
 596        if (mapping->nrpages == 0)
 597                return;
 598
 599        invalidate_bh_lrus();
 600        /*
 601         * FIXME: what about destroy_dirty_buffers?
 602         * We really want to use invalidate_inode_pages2() for
 603         * that, but not until that's cleaned up.
 604         */
 605        invalidate_inode_pages(mapping);
 606}
 607
 608/*
 609 * Kick pdflush then try to free up some ZONE_NORMAL memory.
 610 */
 611static void free_more_memory(void)
 612{
 613        struct zone **zones;
 614        pg_data_t *pgdat;
 615
 616        wakeup_bdflush(1024);
 617        yield();
 618
 619        for_each_pgdat(pgdat) {
 620                zones = pgdat->node_zonelists[GFP_NOFS&GFP_ZONEMASK].zones;
 621                if (*zones)
 622                        try_to_free_pages(zones, GFP_NOFS, 0, 1, ZONE_NORMAL);
 623        }
 624}
 625
  626/*
  627 * I/O completion handler for block_read_full_page() - pages
  628 * which come unlocked at the end of I/O.
   *
   * @bh:       buffer whose read just finished; BUGs unless it is flagged
   *            async_read
   * @uptodate: nonzero when the read succeeded
   *
   * The completion that finds no other async_read buffer left on the page
   * marks the page uptodate (if every buffer is uptodate and PageError is
   * clear) and unlocks the page.
  629 */
  630static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
  631{
         /* a single function-local lock, shared by all pages */
  632        static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
  633        unsigned long flags;
  634        struct buffer_head *tmp;
  635        struct page *page;
  636        int page_uptodate = 1;
  637
  638        BUG_ON(!buffer_async_read(bh));
  639
  640        page = bh->b_page;
  641        if (uptodate) {
  642                set_buffer_uptodate(bh);
  643        } else {
  644                clear_buffer_uptodate(bh);
  645                buffer_io_error(bh);
  646                SetPageError(page);
  647        }
  648
  649        /*
  650         * Be _very_ careful from here on. Bad things can happen if
  651         * two buffer heads end IO at almost the same time and both
  652         * decide that the page is now completely done.
  653         */
  654        spin_lock_irqsave(&page_uptodate_lock, flags);
  655        clear_buffer_async_read(bh);
  656        unlock_buffer(bh);
         /* walk the whole b_this_page ring, starting with bh itself */
  657        tmp = bh;
  658        do {
  659                if (!buffer_uptodate(tmp))
  660                        page_uptodate = 0;
  661                if (buffer_async_read(tmp)) {
                     /* another buffer of this page still has a read pending */
  662                        BUG_ON(!buffer_locked(tmp));
  663                        goto still_busy;
  664                }
  665                tmp = tmp->b_this_page;
  666        } while (tmp != bh);
  667        spin_unlock_irqrestore(&page_uptodate_lock, flags);
  668
  669        /*
  670         * If none of the buffers had errors and they are all
  671         * uptodate then we can set the page uptodate.
  672         */
  673        if (page_uptodate && !PageError(page))
  674                SetPageUptodate(page);
  675        unlock_page(page);
  676        return;
  677
  678still_busy:
  679        spin_unlock_irqrestore(&page_uptodate_lock, flags);
  680        return;
  681}
 682
  683/*
  684 * Completion handler for block_write_full_page() - pages which are unlocked
  685 * during I/O, and which have PageWriteback cleared upon I/O completion.
   *
   * @bh:       buffer whose write just finished; BUGs unless it is flagged
   *            async_write
   * @uptodate: nonzero when the write succeeded
   *
   * On failure AS_EIO is set on the page's mapping, the buffer loses its
   * uptodate bit and the page gets PageError; the log message is
   * rate-limited.  The completion that finds no other async_write buffer
   * left on the page calls end_page_writeback().
  686 */
  687void end_buffer_async_write(struct buffer_head *bh, int uptodate)
  688{
  689        char b[BDEVNAME_SIZE];
         /* function-local lock; distinct from the one in the read handler */
  690        static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
  691        unsigned long flags;
  692        struct buffer_head *tmp;
  693        struct page *page;
  694
  695        BUG_ON(!buffer_async_write(bh));
  696
  697        page = bh->b_page;
  698        if (uptodate) {
  699                set_buffer_uptodate(bh);
  700        } else {
  701                if (printk_ratelimit()) {
  702                        buffer_io_error(bh);
  703                        printk(KERN_WARNING "lost page write due to "
  704                                        "I/O error on %s\n",
  705                               bdevname(bh->b_bdev, b));
  706                }
  707                set_bit(AS_EIO, &page->mapping->flags);
  708                clear_buffer_uptodate(bh);
  709                SetPageError(page);
  710        }
  711
  712        spin_lock_irqsave(&page_uptodate_lock, flags);
  713        clear_buffer_async_write(bh);
  714        unlock_buffer(bh);
         /* check the other buffers on the ring; bh itself was just cleared */
  715        tmp = bh->b_this_page;
  716        while (tmp != bh) {
  717                if (buffer_async_write(tmp)) {
                     /* another buffer of this page is still being written */
  718                        BUG_ON(!buffer_locked(tmp));
  719                        goto still_busy;
  720                }
  721                tmp = tmp->b_this_page;
  722        }
  723        spin_unlock_irqrestore(&page_uptodate_lock, flags);
  724        end_page_writeback(page);
  725        return;
  726
  727still_busy:
  728        spin_unlock_irqrestore(&page_uptodate_lock, flags);
  729        return;
  730}
 731
  732/*
  733 * If a page's buffers are under async readin (end_buffer_async_read
  734 * completion) then there is a possibility that another thread of
  735 * control could lock one of the buffers after it has completed
  736 * but while some of the other buffers have not completed.  This
  737 * locked buffer would confuse end_buffer_async_read() into not unlocking
  738 * the page.  So the absence of BH_Async_Read tells end_buffer_async_read()
  739 * that this buffer is not under async I/O.
  740 *
  741 * The page comes unlocked when it has no locked buffer_async buffers
  742 * left.
  743 *
  744 * PageLocked prevents anyone starting new async I/O reads against any of
  745 * the buffers.
  746 *
  747 * PageWriteback is used to prevent simultaneous writeout of the same
  748 * page.
  749 *
  750 * PageLocked prevents anyone from starting writeback of a page which is
  751 * under read I/O (PageWriteback is only ever set against a locked page).
   *
   * mark_buffer_async_read() installs end_buffer_async_read() as the
   * buffer's completion handler and sets the async_read flag on it.
  752 */
  753static void mark_buffer_async_read(struct buffer_head *bh)
  754{
  755        bh->b_end_io = end_buffer_async_read;
  756        set_buffer_async_read(bh);
  757}
 758
  /*
   * Install end_buffer_async_write() as the completion handler of @bh and
   * set its async_write flag, which that handler requires (it BUGs
   * otherwise).
   */
  759void mark_buffer_async_write(struct buffer_head *bh)
  760{
  761        bh->b_end_io = end_buffer_async_write;
  762        set_buffer_async_write(bh);
  763}
  764EXPORT_SYMBOL(mark_buffer_async_write);
 765
 766
 767/*
 768 * fs/buffer.c contains helper functions for buffer-backed address space's
 769 * fsync functions.  A common requirement for buffer-based filesystems is
 770 * that certain data from the backing blockdev needs to be written out for
 771 * a successful fsync().  For example, ext2 indirect blocks need to be
 772 * written back and waited upon before fsync() returns.
 773 *
  774 * The functions mark_buffer_dirty_inode(), sync_mapping_buffers(),
  775 * inode_has_buffers() and invalidate_inode_buffers() are provided for the
  776 * management of a list of dependent buffers at ->i_mapping->private_list.
 777 *
 778 * Locking is a little subtle: try_to_free_buffers() will remove buffers
 779 * from their controlling inode's queue when they are being freed.  But
 780 * try_to_free_buffers() will be operating against the *blockdev* mapping
 781 * at the time, not against the S_ISREG file which depends on those buffers.
 782 * So the locking for private_list is via the private_lock in the address_space
 783 * which backs the buffers.  Which is different from the address_space 
 784 * against which the buffers are listed.  So for a particular address_space,
 785 * mapping->private_lock does *not* protect mapping->private_list!  In fact,
 786 * mapping->private_list will always be protected by the backing blockdev's
 787 * ->private_lock.
 788 *
 789 * Which introduces a requirement: all buffers on an address_space's
 790 * ->private_list must be from the same address_space: the blockdev's.
 791 *
 792 * address_spaces which do not place buffers at ->private_list via these
 793 * utility functions are free to use private_lock and private_list for
 794 * whatever they want.  The only requirement is that list_empty(private_list)
 795 * be true at clear_inode() time.
 796 *
 797 * FIXME: clear_inode should not call invalidate_inode_buffers().  The
 798 * filesystems should do that.  invalidate_inode_buffers() should just go
 799 * BUG_ON(!list_empty).
 800 *
 801 * FIXME: mark_buffer_dirty_inode() is a data-plane operation.  It should
 802 * take an address_space, not an inode.  And it should be called
 803 * mark_buffer_dirty_fsync() to clearly define why those buffers are being
 804 * queued up.
 805 *
 806 * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
 807 * list if it is already on a list.  Because if the buffer is on a list,
 808 * it *must* already be on the right one.  If not, the filesystem is being
 809 * silly.  This will save a ton of locking.  But first we have to ensure
 810 * that buffers are taken *off* the old inode's list when they are freed
 811 * (presumably in truncate).  That requires careful auditing of all
 812 * filesystems (do it inside bforget()).  It could also be done by bringing
 813 * b_inode back.
 814 */
 815
  816/*
  817 * The buffer's backing address_space's private_lock must be held
   *
   * Unlinks @bh from whatever b_assoc_buffers list it is on and
   * re-initialises the node (list_del_init), so a later list_empty() test
   * on the node reports it as unqueued.
  818 */
  819static inline void __remove_assoc_queue(struct buffer_head *bh)
  820{
  821        list_del_init(&bh->b_assoc_buffers);
  822}
 823
 824int inode_has_buffers(struct inode *inode)
 825{
 826        return !list_empty(&inode->i_data.private_list);
 827}
 828
  829/*
  830 * osync is designed to support O_SYNC io.  It waits synchronously for
  831 * all already-submitted IO to complete, but does not queue any new
  832 * writes to the disk.
  833 *
  834 * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
  835 * you dirty the buffers, and then use osync_inode_buffers to wait for
  836 * completion.  Any other dirty buffers which are not yet queued for
  837 * write will not be flushed to disk by the osync.
   *
   * @lock: spinlock protecting @list
   * @list: list of buffers linked through b_assoc_buffers
   *
   * Returns 0, or -EIO if any buffer that was waited on is not uptodate
   * afterwards.
  838 */
  839static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
  840{
  841        struct buffer_head *bh;
  842        struct list_head *p;
  843        int err = 0;
  844
  845        spin_lock(lock);
  846repeat:
         /* scan from the tail of the list towards the head */
  847        list_for_each_prev(p, list) {
  848                bh = BH_ENTRY(p);
  849                if (buffer_locked(bh)) {
                     /* hold a reference so bh stays valid once the lock is dropped */
  850                        get_bh(bh);
  851                        spin_unlock(lock);
  852                        wait_on_buffer(bh);
  853                        if (!buffer_uptodate(bh))
  854                                err = -EIO;
  855                        brelse(bh);
  856                        spin_lock(lock);
                     /* the list may have changed while unlocked: rescan */
  857                        goto repeat;
  858                }
  859        }
  860        spin_unlock(lock);
  861        return err;
  862}
 863
 864/**
 865 * sync_mapping_buffers - write out and wait upon a mapping's "associated"
 866 *                        buffers
 867 * @buffer_mapping - the mapping which backs the buffers' data
 868 * @mapping - the mapping which wants those buffers written
 869 *
 870 * Starts I/O against the buffers at mapping->private_list, and waits upon
 871 * that I/O.
 872 *
 873 * Basically, this is a convenience function for fsync().  @buffer_mapping is
 874 * the blockdev which "owns" the buffers and @mapping is a file or directory
 875 * which needs those buffers to be written for a successful fsync().
 876 */
 877int sync_mapping_buffers(struct address_space *mapping)
 878{
 879        struct address_space *buffer_mapping = mapping->assoc_mapping;
 880
 881        if (buffer_mapping == NULL || list_empty(&mapping->private_list))
 882                return 0;
 883
 884        return fsync_buffers_list(&buffer_mapping->private_lock,
 885                                        &mapping->private_list);
 886}
 887EXPORT_SYMBOL(sync_mapping_buffers);
 888
 889/*
 890 * Called when we've recently written block `bblock', and it is known that
 891 * `bblock' was for a buffer_boundary() buffer.  This means that the block at
 892 * `bblock + 1' is probably a dirty indirect block.  Hunt it down and, if it's
 893 * dirty, schedule it for IO.  So that indirects merge nicely with their data.
 894 */
 895void write_boundary_block(struct block_device *bdev,
 896                        sector_t bblock, unsigned blocksize)
 897{
 898        struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
 899        if (bh) {
 900                if (buffer_dirty(bh))
 901                        ll_rw_block(WRITE, 1, &bh);
 902                put_bh(bh);
 903        }
 904}
 905
 906void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
 907{
 908        struct address_space *mapping = inode->i_mapping;
 909        struct address_space *buffer_mapping = bh->b_page->mapping;
 910
 911        mark_buffer_dirty(bh);
 912        if (!mapping->assoc_mapping) {
 913                mapping->assoc_mapping = buffer_mapping;
 914        } else {
 915                if (mapping->assoc_mapping != buffer_mapping)
 916                        BUG();
 917        }
 918        if (list_empty(&bh->b_assoc_buffers)) {
 919                spin_lock(&buffer_mapping->private_lock);
 920                list_move_tail(&bh->b_assoc_buffers,
 921                                &mapping->private_list);
 922                spin_unlock(&buffer_mapping->private_lock);
 923        }
 924}
 925EXPORT_SYMBOL(mark_buffer_dirty_inode);
 926
 927/*
 928 * Add a page to the dirty page list.
 929 *
 930 * It is a sad fact of life that this function is called from several places
 931 * deeply under spinlocking.  It may not sleep.
 932 *
 933 * If the page has buffers, the uptodate buffers are set dirty, to preserve
 934 * dirty-state coherency between the page and the buffers.  It the page does
 935 * not have buffers then when they are later attached they will all be set
 936 * dirty.
 937 *
 938 * The buffers are dirtied before the page is dirtied.  There's a small race
 939 * window in which a writepage caller may see the page cleanness but not the
 940 * buffer dirtiness.  That's fine.  If this code were to set the page dirty
 941 * before the buffers, a concurrent writepage caller could clear the page dirty
 942 * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
 943 * page on the dirty page list.
 944 *
 945 * We use private_lock to lock against try_to_free_buffers while using the
 946 * page's buffer list.  Also use this to protect against clean buffers being
 947 * added to the page after it was set dirty.
 948 *
 949 * FIXME: may need to call ->reservepage here as well.  That's rather up to the
 950 * address_space though.
 951 */
 952int __set_page_dirty_buffers(struct page *page)
 953{
 954        struct address_space * const mapping = page->mapping;
 955
 956        spin_lock(&mapping->private_lock);
 957        if (page_has_buffers(page)) {
 958                struct buffer_head *head = page_buffers(page);
 959                struct buffer_head *bh = head;
 960
 961                do {
 962                        set_buffer_dirty(bh);
 963                        bh = bh->b_this_page;
 964                } while (bh != head);
 965        }
 966        spin_unlock(&mapping->private_lock);
 967
 968        if (!TestSetPageDirty(page)) {
 969                spin_lock_irq(&mapping->tree_lock);
 970                if (page->mapping) {    /* Race with truncate? */
 971                        if (!mapping->backing_dev_info->memory_backed) {
 972                                inc_page_state(nr_dirty);
 973                                task_io_account_write(PAGE_CACHE_SIZE);
 974                        }
 975                        radix_tree_tag_set(&mapping->page_tree,
 976                                                page_index(page),
 977                                                PAGECACHE_TAG_DIRTY);
 978                }
 979                spin_unlock_irq(&mapping->tree_lock);
 980                __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 981        }
 982        
 983        return 0;
 984}
 985EXPORT_SYMBOL(__set_page_dirty_buffers);
 986
 987/*
 988 * Write out and wait upon a list of buffers.
 989 *
 990 * We have conflicting pressures: we want to make sure that all
 991 * initially dirty buffers get waited on, but that any subsequently
 992 * dirtied buffers don't.  After all, we don't want fsync to last
 993 * forever if somebody is actively writing to the file.
 994 *
 995 * Do this in two main stages: first we copy dirty buffers to a
 996 * temporary inode list, queueing the writes as we go.  Then we clean
 997 * up, waiting for those writes to complete.
 998 * 
 999 * During this second stage, any subsequent updates to the file may end
1000 * up refiling the buffer on the original inode's dirty list again, so
1001 * there is a chance we will end up with a buffer queued for write but
1002 * not yet completed on that list.  So, as a final cleanup we go through
1003 * the osync code to catch these locked, dirty buffers without requeuing
1004 * any newly dirty buffers for write.
1005 */
1006static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
1007{
1008        struct buffer_head *bh;
1009        struct list_head tmp;
1010        int err = 0, err2;
1011
1012        INIT_LIST_HEAD(&tmp);
1013
1014        spin_lock(lock);
1015        while (!list_empty(list)) {
1016                bh = BH_ENTRY(list->next);
1017                list_del_init(&bh->b_assoc_buffers);
1018                if (buffer_dirty(bh) || buffer_locked(bh)) {
1019                        list_add(&bh->b_assoc_buffers, &tmp);
1020                        if (buffer_dirty(bh)) {
1021                                get_bh(bh);
1022                                spin_unlock(lock);
1023                                /*
1024                                 * Ensure any pending I/O completes so that
1025                                 * ll_rw_block() actually writes the current
1026                                 * contents - it is a noop if I/O is still in
1027                                 * flight on potentially older contents.
1028                                 */
1029                                wait_on_buffer(bh);
1030                                ll_rw_block(WRITE, 1, &bh);
1031                                brelse(bh);
1032                                spin_lock(lock);
1033                        }
1034                }
1035        }
1036
1037        while (!list_empty(&tmp)) {
1038                bh = BH_ENTRY(tmp.prev);
1039                __remove_assoc_queue(bh);
1040                get_bh(bh);
1041                spin_unlock(lock);
1042                wait_on_buffer(bh);
1043                if (!buffer_uptodate(bh))
1044                        err = -EIO;
1045                brelse(bh);
1046                spin_lock(lock);
1047        }
1048        
1049        spin_unlock(lock);
1050        err2 = osync_buffers_list(lock, list);
1051        if (err)
1052                return err;
1053        else
1054                return err2;
1055}
1056
1057/*
1058 * Invalidate any and all dirty buffers on a given inode.  We are
1059 * probably unmounting the fs, but that doesn't mean we have already
1060 * done a sync().  Just drop the buffers from the inode list.
1061 *
1062 * NOTE: we take the inode's blockdev's mapping's private_lock.  Which
1063 * assumes that all the buffers are against the blockdev.  Not true
1064 * for reiserfs.
1065 */
1066void invalidate_inode_buffers(struct inode *inode)
1067{
1068        if (inode_has_buffers(inode)) {
1069                struct address_space *mapping = &inode->i_data;
1070                struct list_head *list = &mapping->private_list;
1071                struct address_space *buffer_mapping = mapping->assoc_mapping;
1072
1073                spin_lock(&buffer_mapping->private_lock);
1074                while (!list_empty(list))
1075                        __remove_assoc_queue(BH_ENTRY(list->next));
1076                spin_unlock(&buffer_mapping->private_lock);
1077        }
1078}
1079
1080/*
1081 * Remove any clean buffers from the inode's buffer list.  This is called
1082 * when we're trying to free the inode itself.  Those buffers can pin it.
1083 *
1084 * Returns true if all buffers were removed.
1085 */
1086int remove_inode_buffers(struct inode *inode)
1087{
1088        int ret = 1;
1089
1090        if (inode_has_buffers(inode)) {
1091                struct address_space *mapping = &inode->i_data;
1092                struct list_head *list = &mapping->private_list;
1093                struct address_space *buffer_mapping = mapping->assoc_mapping;
1094
1095                spin_lock(&buffer_mapping->private_lock);
1096                while (!list_empty(list)) {
1097                        struct buffer_head *bh = BH_ENTRY(list->next);
1098                        if (buffer_dirty(bh)) {
1099                                ret = 0;
1100                                break;
1101                        }
1102                        __remove_assoc_queue(bh);
1103                }
1104                spin_unlock(&buffer_mapping->private_lock);
1105        }
1106        return ret;
1107}
1108
1109/*
1110 * Create the appropriate buffers when given a page for data area and
1111 * the size of each buffer.. Use the bh->b_this_page linked list to
1112 * follow the buffers created.  Return NULL if unable to create more
1113 * buffers.
1114 *
1115 * The retry flag is used to differentiate async IO (paging, swapping)
1116 * which may not fail from ordinary buffer allocations.
1117 */
1118static struct buffer_head *
1119create_buffers(struct page * page, unsigned long size, int retry)
1120{
1121        struct buffer_head *bh, *head;
1122        long offset;
1123
1124try_again:
1125        head = NULL;
1126        offset = PAGE_SIZE;
1127        while ((offset -= size) >= 0) {
1128                bh = alloc_buffer_head(GFP_NOFS);
1129                if (!bh)
1130                        goto no_grow;
1131
1132                bh->b_bdev = NULL;
1133                bh->b_this_page = head;
1134                bh->b_blocknr = -1;
1135                head = bh;
1136
1137                bh->b_state = 0;
1138                atomic_set(&bh->b_count, 0);
1139                bh->b_size = size;
1140
1141                /* Link the buffer to its page */
1142                set_bh_page(bh, page, offset);
1143
1144                bh->b_end_io = NULL;
1145        }
1146        return head;
1147/*
1148 * In case anything failed, we just free everything we got.
1149 */
1150no_grow:
1151        if (head) {
1152                do {
1153                        bh = head;
1154                        head = head->b_this_page;
1155                        free_buffer_head(bh);
1156                } while (head);
1157        }
1158
1159        /*
1160         * Return failure for non-async IO requests.  Async IO requests
1161         * are not allowed to fail, so we have to wait until buffer heads
1162         * become available.  But we don't want tasks sleeping with 
1163         * partially complete buffers, so all were released above.
1164         */
1165        if (!retry)
1166                return NULL;
1167
1168        /* We're _really_ low on memory. Now we just
1169         * wait for old buffer heads to become free due to
1170         * finishing IO.  Since this is an async request and
1171         * the reserve list is empty, we're sure there are 
1172         * async buffer heads in use.
1173         */
1174        free_more_memory();
1175        goto try_again;
1176}
1177
1178static inline void
1179link_dev_buffers(struct page *page, struct buffer_head *head)
1180{
1181        struct buffer_head *bh, *tail;
1182
1183        bh = head;
1184        do {
1185                tail = bh;
1186                bh = bh->b_this_page;
1187        } while (bh);
1188        tail->b_this_page = head;
1189        __set_page_buffers(page, head);
1190}
1191
1192/*
1193 * Initialise the state of a blockdev page's buffers.
1194 */ 
1195static void
1196init_page_buffers(struct page *page, struct block_device *bdev,
1197                        sector_t block, int size)
1198{
1199        struct buffer_head *head = page_buffers(page);
1200        struct buffer_head *bh = head;
1201        int uptodate = PageUptodate(page);
1202
1203        do {
1204                if (!buffer_mapped(bh)) {
1205                        init_buffer(bh, NULL, NULL);
1206                        bh->b_bdev = bdev;
1207                        bh->b_blocknr = block;
1208                        if (uptodate)
1209                                set_buffer_uptodate(bh);
1210                        set_buffer_mapped(bh);
1211                }
1212                block++;
1213                bh = bh->b_this_page;
1214        } while (bh != head);
1215}
1216
1217/*
1218 * Create the page-cache page that contains the requested block.
1219 *
1220 * This is user purely for blockdev mappings.
1221 */
1222static struct page *
1223grow_dev_page(struct block_device *bdev, sector_t block,
1224                pgoff_t index, int size)
1225{
1226        struct inode *inode = bdev->bd_inode;
1227        struct page *page;
1228        struct buffer_head *bh;
1229
1230        page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
1231        if (!page)
1232                return NULL;
1233
1234        if (!PageLocked(page))
1235                BUG();
1236
1237        if (page_has_buffers(page)) {
1238                bh = page_buffers(page);
1239                if (bh->b_size == size) {
1240                        init_page_buffers(page, bdev, block, size);
1241                        return page;
1242                }
1243                if (!try_to_free_buffers(page))
1244                        goto failed;
1245        }
1246
1247        /*
1248         * Allocate some buffers for this page
1249         */
1250        bh = create_buffers(page, size, 0);
1251        if (!bh)
1252                goto failed;
1253
1254        /*
1255         * Link the page to the buffers and initialise them.  Take the
1256         * lock to be atomic wrt __find_get_block(), which does not
1257         * run under the page lock.
1258         */
1259        spin_lock(&inode->i_mapping->private_lock);
1260        link_dev_buffers(page, bh);
1261        init_page_buffers(page, bdev, block, size);
1262        spin_unlock(&inode->i_mapping->private_lock);
1263        return page;
1264
1265failed:
1266        BUG();
1267        unlock_page(page);
1268        page_cache_release(page);
1269        return NULL;
1270}
1271
1272/*
1273 * Create buffers for the specified block device block's page.  If
1274 * that page was dirty, the buffers are set dirty also.
1275 *
1276 * Except that's a bug.  Attaching dirty buffers to a dirty
1277 * blockdev's page can result in filesystem corruption, because
1278 * some of those buffers may be aliases of filesystem data.
1279 * grow_dev_page() will go BUG() if this happens.
1280 */
1281static inline int
1282grow_buffers(struct block_device *bdev, sector_t block, int size)
1283{
1284        struct page *page;
1285        pgoff_t index;
1286        int sizebits;
1287
1288        sizebits = -1;
1289        do {
1290                sizebits++;
1291        } while ((size << sizebits) < PAGE_SIZE);
1292
1293        index = block >> sizebits;
1294
1295        /*
1296         * Check for a block which wants to lie outside our maximum possible
1297         * pagecache index.  (this comparison is done using sector_t types).
1298         */
1299        if (unlikely(index != block >> sizebits)) {
1300                char b[BDEVNAME_SIZE];
1301
1302                printk(KERN_ERR "%s: requested out-of-range block %llu for "
1303                        "device %s\n",
1304                        __FUNCTION__, (unsigned long long)block,
1305                        bdevname(bdev, b));
1306                return -EIO;
1307        }
1308        block = index << sizebits;
1309        /* Create a page with the proper size buffers.. */
1310        page = grow_dev_page(bdev, block, index, size);
1311        if (!page)
1312                return 0;
1313        unlock_page(page);
1314        page_cache_release(page);
1315        return 1;
1316}
1317
1318struct buffer_head *
1319__getblk_slow(struct block_device *bdev, sector_t block, int size)
1320{
1321        /* Size must be multiple of hard sectorsize */
1322        if (unlikely(size & (bdev_hardsect_size(bdev)-1) ||
1323                        (size < 512 || size > PAGE_SIZE))) {
1324                printk(KERN_ERR "getblk(): invalid block size %d requested\n",
1325                                        size);
1326                printk(KERN_ERR "hardsect size: %d\n",
1327                                        bdev_hardsect_size(bdev));
1328
1329                dump_stack();
1330                return NULL;
1331        }
1332
1333        for (;;) {
1334                struct buffer_head * bh;
1335                int ret;
1336
1337                bh = __find_get_block(bdev, block, size);
1338                if (bh)
1339                        return bh;
1340
1341                ret = grow_buffers(bdev, block, size);
1342                if (ret < 0)
1343                        return NULL;
1344                if (ret == 0)
1345                        free_more_memory();
1346        }
1347}
1348
1349/*
1350 * The relationship between dirty buffers and dirty pages:
1351 *
1352 * Whenever a page has any dirty buffers, the page's dirty bit is set, and
1353 * the page is tagged dirty in its radix tree.
1354 *
1355 * At all times, the dirtiness of the buffers represents the dirtiness of
1356 * subsections of the page.  If the page has buffers, the page dirty bit is
1357 * merely a hint about the true dirty state.
1358 *
1359 * When a page is set dirty in its entirety, all its buffers are marked dirty
1360 * (if the page has buffers).
1361 *
1362 * When a buffer is marked dirty, its page is dirtied, but the page's other
1363 * buffers are not.
1364 *
1365 * Also.  When blockdev buffers are explicitly read with bread(), they
1366 * individually become uptodate.  But their backing page remains not
1367 * uptodate - even if all of its buffers are uptodate.  A subsequent
1368 * block_read_full_page() against that page will discover all the uptodate
1369 * buffers, will set the page uptodate and will perform no I/O.
1370 */
1371
1372/**
1373 * mark_buffer_dirty - mark a buffer_head as needing writeout
1374 *
1375 * mark_buffer_dirty() will set the dirty bit against the buffer, then set its
1376 * backing page dirty, then tag the page as dirty in its address_space's radix
1377 * tree and then attach the address_space's inode to its superblock's dirty
1378 * inode list.
1379 *
1380 * mark_buffer_dirty() is atomic.  It takes bh->b_page->mapping->private_lock,
1381 * mapping->tree_lock and the global inode_lock.
1382 */
1383void fastcall mark_buffer_dirty(struct buffer_head *bh)
1384{
1385        /*
1386         * Very *carefully* optimize the it-is-already-dirty case.
1387         *
1388         * Don't let the final "is it dirty" escape to before we
1389         * perhaps modified the buffer.
1390         */
1391        if (buffer_dirty(bh)) {
1392                smp_mb();
1393                if (buffer_dirty(bh))
1394                        return;
1395        }
1396
1397        if (!test_set_buffer_dirty(bh))
1398                __set_page_dirty_nobuffers(bh->b_page);
1399}
1400
1401/*
1402 * Decrement a buffer_head's reference count.  If all buffers against a page
1403 * have zero reference count, are clean and unlocked, and if the page is clean
1404 * and unlocked then try_to_free_buffers() may strip the buffers from the page
1405 * in preparation for freeing it (sometimes, rarely, buffers are removed from
1406 * a page but it ends up not being freed, and buffers may later be reattached).
1407 */
1408void __brelse(struct buffer_head * buf)
1409{
1410        if (atomic_read(&buf->b_count)) {
1411                put_bh(buf);
1412                return;
1413        }
1414        printk(KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1415        WARN_ON(1);
1416}
1417
1418/*
1419 * bforget() is like brelse(), except it discards any
1420 * potentially dirty data.
1421 */
1422void __bforget(struct buffer_head *bh)
1423{
1424        clear_buffer_dirty(bh);
1425        if (!list_empty(&bh->b_assoc_buffers)) {
1426                struct address_space *buffer_mapping = bh->b_page->mapping;
1427
1428                spin_lock(&buffer_mapping->private_lock);
1429                list_del_init(&bh->b_assoc_buffers);
1430                spin_unlock(&buffer_mapping->private_lock);
1431        }
1432        __brelse(bh);
1433}
1434
1435static struct buffer_head *__bread_slow(struct buffer_head *bh)
1436{
1437        lock_buffer(bh);
1438        if (buffer_uptodate(bh)) {
1439                unlock_buffer(bh);
1440                return bh;
1441        } else {
1442                get_bh(bh);
1443                bh->b_end_io = end_buffer_read_sync;
1444                submit_bh(READ, bh);
1445                wait_on_buffer(bh);
1446                if (buffer_uptodate(bh))
1447                        return bh;
1448        }
1449        brelse(bh);
1450        return NULL;
1451}
1452
1453/*
1454 * Per-cpu buffer LRU implementation.  To reduce the cost of __find_get_block().
1455 * The bhs[] array is sorted - newest buffer is at bhs[0].  Buffers have their
1456 * refcount elevated by one when they're in an LRU.  A buffer can only appear
1457 * once in a particular CPU's LRU.  A single buffer can be present in multiple
1458 * CPU's LRUs at the same time.
1459 *
1460 * This is a transparent caching front-end to sb_bread(), sb_getblk() and
1461 * sb_find_get_block().
1462 *
1463 * The LRUs themselves only need locking against invalidate_bh_lrus.  We use
1464 * a local interrupt disable for that.
1465 */
1466
1467#define BH_LRU_SIZE     8
1468
1469struct bh_lru {
1470        struct buffer_head *bhs[BH_LRU_SIZE];
1471};
1472
1473static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
1474
1475#ifdef CONFIG_SMP
1476#define bh_lru_lock()   local_irq_disable()
1477#define bh_lru_unlock() local_irq_enable()
1478#else
1479#define bh_lru_lock()   preempt_disable()
1480#define bh_lru_unlock() preempt_enable()
1481#endif
1482
/*
 * Sanity check used before touching the per-cpu LRU: BUG if interrupts
 * are currently disabled.  Compiles to nothing when irqs_disabled is not
 * available as a macro.
 */
static inline void check_irqs_on(void)
{
#ifdef irqs_disabled
	BUG_ON(irqs_disabled());
#endif
}
1489
1490/*
1491 * The LRU management algorithm is dopey-but-simple.  Sorry.
1492 */
1493static void bh_lru_install(struct buffer_head *bh)
1494{
1495        struct buffer_head *evictee = NULL;
1496        struct bh_lru *lru;
1497
1498        check_irqs_on();
1499        bh_lru_lock();
1500        lru = &__get_cpu_var(bh_lrus);
1501        if (lru->bhs[0] != bh) {
1502                struct buffer_head *bhs[BH_LRU_SIZE];
1503                int in;
1504                int out = 0;
1505
1506                get_bh(bh);
1507                bhs[out++] = bh;
1508                for (in = 0; in < BH_LRU_SIZE; in++) {
1509                        struct buffer_head *bh2 = lru->bhs[in];
1510
1511                        if (bh2 == bh) {
1512                                __brelse(bh2);
1513                        } else {
1514                                if (out >= BH_LRU_SIZE) {
1515                                        BUG_ON(evictee != NULL);
1516                                        evictee = bh2;
1517                                } else {
1518                                        bhs[out++] = bh2;
1519                                }
1520                        }
1521                }
1522                while (out < BH_LRU_SIZE)
1523                        bhs[out++] = NULL;
1524                memcpy(lru->bhs, bhs, sizeof(bhs));
1525        }
1526        bh_lru_unlock();
1527
1528        if (evictee)
1529                __brelse(evictee);
1530}
1531
1532/*
1533 * Look up the bh in this cpu's LRU.  If it's there, move it to the head.
1534 */
1535static inline struct buffer_head *
1536lookup_bh_lru(struct block_device *bdev, sector_t block, int size)
1537{
1538        struct buffer_head *ret = NULL;
1539        struct bh_lru *lru;
1540        int i;
1541
1542        check_irqs_on();
1543        bh_lru_lock();
1544        lru = &__get_cpu_var(bh_lrus);
1545        for (i = 0; i < BH_LRU_SIZE; i++) {
1546                struct buffer_head *bh = lru->bhs[i];
1547
1548                if (bh && bh->b_bdev == bdev &&
1549                                bh->b_blocknr == block && bh->b_size == size) {
1550                        if (i) {
1551                                while (i) {
1552                                        lru->bhs[i] = lru->bhs[i - 1];
1553                                        i--;
1554                                }
1555                                lru->bhs[0] = bh;
1556                        }
1557                        get_bh(bh);
1558                        ret = bh;
1559                        break;
1560                }
1561        }
1562        bh_lru_unlock();
1563        return ret;
1564}
1565
1566/*
1567 * Perform a pagecache lookup for the matching buffer.  If it's there, refresh
1568 * it in the LRU and mark it as accessed.  If it is not present then return
1569 * NULL
1570 */
1571struct buffer_head *
1572__find_get_block(struct block_device *bdev, sector_t block, int size)
1573{
1574        struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
1575
1576        if (bh == NULL) {
1577                bh = __find_get_block_slow(bdev, block, size);
1578                if (bh)
1579                        bh_lru_install(bh);
1580        }
1581        if (bh)
1582                touch_buffer(bh);
1583        return bh;
1584}
1585EXPORT_SYMBOL(__find_get_block);
1586
1587/*
1588 * __getblk will locate (and, if necessary, create) the buffer_head
1589 * which corresponds to the passed block_device, block and size. The
1590 * returned buffer has its reference count incremented.
1591 *
1592 * __getblk() cannot fail - it just keeps trying.  If you pass it an
1593 * illegal block number, __getblk() will happily return a buffer_head
1594 * which represents the non-existent block.  Very weird.
1595 *
1596 * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
1597 * attempt is failing.  FIXME, perhaps?
1598 */
1599struct buffer_head *
1600__getblk(struct block_device *bdev, sector_t block, int size)
1601{
1602        struct buffer_head *bh = __find_get_block(bdev, block, size);
1603
1604        might_sleep();
1605        if (bh == NULL)
1606                bh = __getblk_slow(bdev, block, size);
1607        return bh;
1608}
1609EXPORT_SYMBOL(__getblk);
1610
1611/*
1612 * Do async read-ahead on a buffer..
1613 */
1614void __breadahead(struct block_device *bdev, sector_t block, int size)
1615{
1616        struct buffer_head *bh = __getblk(bdev, block, size);
1617        if (likely(bh)) {
1618                ll_rw_block(READA, 1, &bh);
1619                brelse(bh);
1620        }
1621}
1622EXPORT_SYMBOL(__breadahead);
1623
1624/**
1625 *  __bread() - reads a specified block and returns the bh
1626 *  @block: number of block
1627 *  @size: size (in bytes) to read
1628 * 
1629 *  Reads a specified block, and returns buffer head that contains it.
1630 *  It returns NULL if the block was unreadable.
1631 */
1632struct buffer_head *
1633__bread(struct block_device *bdev, sector_t block, int size)
1634{
1635        struct buffer_head *bh = __getblk(bdev, block, size);
1636
1637        if (likely(bh) && !buffer_uptodate(bh))
1638                bh = __bread_slow(bh);
1639        return bh;
1640}
1641EXPORT_SYMBOL(__bread);
1642
1643/*
1644 * invalidate_bh_lrus() is called rarely - but not only at unmount.
1645 * This doesn't race because it runs in each cpu either in irq
1646 * or with preempt disabled.
1647 */
1648static void invalidate_bh_lru(void *arg)
1649{
1650        struct bh_lru *b = &get_cpu_var(bh_lrus);
1651        int i;
1652
1653        for (i = 0; i < BH_LRU_SIZE; i++) {
1654                brelse(b->bhs[i]);
1655                b->bhs[i] = NULL;
1656        }
1657        put_cpu_var(bh_lrus);
1658}
1659        
1660static void invalidate_bh_lrus(void)
1661{
1662        on_each_cpu(invalidate_bh_lru, NULL, 1, 1);
1663}
1664
1665void set_bh_page(struct buffer_head *bh,
1666                struct page *page, unsigned long offset)
1667{
1668        bh->b_page = page;
1669        if (offset >= PAGE_SIZE)
1670                BUG();
1671        if (PageHighMem(page))
1672                /*
1673                 * This catches illegal uses and preserves the offset:
1674                 */
1675                bh->b_data = (char *)(0 + offset);
1676        else
1677                bh->b_data = page_address(page) + offset;
1678}
1679EXPORT_SYMBOL(set_bh_page);
1680
1681/*
1682 * Called when truncating a buffer on a page completely.
1683 */
1684static inline void discard_buffer(struct buffer_head * bh)
1685{
1686        lock_buffer(bh);
1687        clear_buffer_dirty(bh);
1688        bh->b_bdev = NULL;
1689        clear_buffer_mapped(bh);
1690        clear_buffer_req(bh);
1691        clear_buffer_new(bh);
1692        clear_buffer_delay(bh);
1693        unlock_buffer(bh);
1694}
1695
1696/**
1697 * try_to_release_page() - release old fs-specific metadata on a page
1698 *
1699 * @page: the page which the kernel is trying to free
1700 * @gfp_mask: memory allocation flags (and I/O mode)
1701 *
1702 * The address_space is to try to release any data against the page
1703 * (presumably at page->private).  If the release was successful, return `1'.
1704 * Otherwise return zero.
1705 *
1706 * The @gfp_mask argument specifies whether I/O may be performed to release
1707 * this page (__GFP_IO), and whether the call may block (__GFP_WAIT).
1708 *
1709 * NOTE: @gfp_mask may go away, and this function may become non-blocking.
1710 */
1711int try_to_release_page(struct page *page, int gfp_mask)
1712{
1713        struct address_space * const mapping = page->mapping;
1714
1715        BUG_ON(!PageLocked(page));
1716        if (PageWriteback(page))
1717                return 0;
1718        
1719        if (mapping && mapping->a_ops->releasepage)
1720                return mapping->a_ops->releasepage(page, gfp_mask);
1721        return try_to_free_buffers(page);
1722}
1723EXPORT_SYMBOL(try_to_release_page);
1724
1725/**
1726 * block_invalidatepage - invalidate part of all of a buffer-backed page
1727 *
1728 * @page: the page which is affected
1729 * @offset: the index of the truncation point
1730 *
1731 * block_invalidatepage() is called when all or part of the page has become
1732 * invalidatedby a truncate operation.
1733 *
1734 * block_invalidatepage() does not have to release all buffers, but it must
1735 * ensure that no dirty buffer is left outside @offset and that no I/O
1736 * is underway against any of the blocks which are outside the truncation
1737 * point.  Because the caller is about to free (and possibly reuse) those
1738 * blocks on-disk.
1739 */
1740int block_invalidatepage(struct page *page, unsigned long offset)
1741{
1742        struct buffer_head *head, *bh, *next;
1743        unsigned int curr_off = 0;
1744        int ret = 1;
1745
1746        BUG_ON(!PageLocked(page));
1747        if (!page_has_buffers(page))
1748                goto out;
1749
1750        head = page_buffers(page);
1751        bh = head;
1752        do {
1753                unsigned int next_off = curr_off + bh->b_size;
1754                next = bh->b_this_page;
1755
1756                /*
1757                 * is this block fully invalidated?
1758                 */
1759                if (offset <= curr_off)
1760                        discard_buffer(bh);
1761                curr_off = next_off;
1762                bh = next;
1763        } while (bh != head);
1764
1765        /*
1766         * We release buffers only if the entire page is being invalidated.
1767         * The get_block cached value has been unconditionally invalidated,
1768         * so real IO is not possible anymore.
1769         */
1770        if (offset == 0)
1771                ret = try_to_release_page(page, 0);
1772out:
1773        return ret;
1774}
1775EXPORT_SYMBOL(block_invalidatepage);
1776
1777/*
1778 * We attach and possibly dirty the buffers atomically wrt
1779 * __set_page_dirty_buffers() via private_lock.  try_to_free_buffers
1780 * is already excluded via the page lock.
1781 */
1782void create_empty_buffers(struct page *page,
1783                        unsigned long blocksize, unsigned long b_state)
1784{
1785        struct buffer_head *bh, *head, *tail;
1786
1787        head = create_buffers(page, blocksize, 1);
1788        bh = head;
1789        do {
1790                bh->b_state |= b_state;
1791                tail = bh;
1792                bh = bh->b_this_page;
1793        } while (bh);
1794        tail->b_this_page = head;
1795
1796        spin_lock(&page->mapping->private_lock);
1797        if (PageUptodate(page) || PageDirty(page)) {
1798                bh = head;
1799                do {
1800                        if (PageDirty(page))
1801                                set_buffer_dirty(bh);
1802                        if (PageUptodate(page))
1803                                set_buffer_uptodate(bh);
1804                        bh = bh->b_this_page;
1805                } while (bh != head);
1806        }
1807        __set_page_buffers(page, head);
1808        spin_unlock(&page->mapping->private_lock);
1809}
1810EXPORT_SYMBOL(create_empty_buffers);
1811
1812/*
1813 * We are taking a block for data and we don't want any output from any
1814 * buffer-cache aliases starting from return from that function and
1815 * until the moment when something will explicitly mark the buffer
1816 * dirty (hopefully that will not happen until we will free that block ;-)
1817 * We don't even need to mark it not-uptodate - nobody can expect
1818 * anything from a newly allocated buffer anyway. We used to used
1819 * unmap_buffer() for such invalidation, but that was wrong. We definitely
1820 * don't want to mark the alias unmapped, for example - it would confuse
1821 * anyone who might pick it with bread() afterwards...
1822 *
1823 * Also..  Note that bforget() doesn't lock the buffer.  So there can
1824 * be writeout I/O going on against recently-freed buffers.  We don't
1825 * wait on that I/O in bforget() - it's more efficient to wait on the I/O
1826 * only if we really need to.  That happens here.
1827 */
1828void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
1829{
1830        struct buffer_head *old_bh;
1831
1832        might_sleep();
1833
1834        old_bh = __find_get_block_slow(bdev, block, 0);
1835        if (old_bh) {
1836                clear_buffer_dirty(old_bh);
1837                wait_on_buffer(old_bh);
1838                clear_buffer_req(old_bh);
1839                __brelse(old_bh);
1840        }
1841}
1842EXPORT_SYMBOL(unmap_underlying_metadata);
1843
1844/*
1845 * NOTE! All mapped/uptodate combinations are valid:
1846 *
1847 *      Mapped  Uptodate        Meaning
1848 *
1849 *      No      No              "unknown" - must do get_block()
1850 *      No      Yes             "hole" - zero-filled
1851 *      Yes     No              "allocated" - allocated on disk, not read in
1852 *      Yes     Yes             "valid" - allocated and up-to-date in memory.
1853 *
1854 * "Dirty" is valid only with the last case (mapped+uptodate).
1855 */
1856
1857/*
1858 * While block_write_full_page is writing back the dirty buffers under
1859 * the page lock, whoever dirtied the buffers may decide to clean them
1860 * again at any time.  We handle that by only looking at the buffer
1861 * state inside lock_buffer().
1862 *
1863 * If block_write_full_page() is called for regular writeback
1864 * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
1865 * locked buffer.   This only can happen if someone has written the buffer
1866 * directly, with submit_bh().  At the address_space level PageWriteback
1867 * prevents this contention from occurring.
1868 */
1869static int __block_write_full_page(struct inode *inode, struct page *page,
1870                        get_block_t *get_block, struct writeback_control *wbc)
1871{
1872        int err;
1873        sector_t block;
1874        sector_t last_block;
1875        struct buffer_head *bh, *head;
1876        int nr_underway = 0;
1877
1878        BUG_ON(!PageLocked(page));
1879
1880        last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
1881
1882        if (!page_has_buffers(page)) {
1883                create_empty_buffers(page, 1 << inode->i_blkbits,
1884                                        (1 << BH_Dirty)|(1 << BH_Uptodate));
1885        }
1886
1887        /*
1888         * Be very careful.  We have no exclusion from __set_page_dirty_buffers
1889         * here, and the (potentially unmapped) buffers may become dirty at
1890         * any time.  If a buffer becomes dirty here after we've inspected it
1891         * then we just miss that fact, and the page stays dirty.
1892         *
1893         * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
1894         * handle that here by just cleaning them.
1895         */
1896
1897        block = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1898        head = page_buffers(page);
1899        bh = head;
1900
1901        /*
1902         * Get all the dirty buffers mapped to disk addresses and
1903         * handle any aliases from the underlying blockdev's mapping.
1904         */
1905        do {
1906                if (block > last_block) {
1907                        /*
1908                         * mapped buffers outside i_size will occur, because
1909                         * this page can be outside i_size when there is a
1910                         * truncate in progress.
1911                         */
1912                        /*
1913                         * The buffer was zeroed by block_write_full_page()
1914                         */
1915                        clear_buffer_dirty(bh);
1916                        set_buffer_uptodate(bh);
1917                } else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
1918                        err = get_block(inode, block, bh, 1);
1919                        if (err)
1920                                goto recover;
1921                        if (buffer_new(bh)) {
1922                                /* blockdev mappings never come here */
1923                                clear_buffer_new(bh);
1924                                unmap_underlying_metadata(bh->b_bdev,
1925                                                        bh->b_blocknr);
1926                        }
1927                }
1928                bh = bh->b_this_page;
1929                block++;
1930        } while (bh != head);
1931
1932        do {
1933                get_bh(bh);
1934                if (!buffer_mapped(bh))
1935                        continue;
1936                /*
1937                 * If it's a fully non-blocking write attempt and we cannot
1938                 * lock the buffer then redirty the page.  Note that this can
1939                 * potentially cause a busy-wait loop from pdflush and kswapd
1940                 * activity, but those code paths have their own higher-level
1941                 * throttling.
1942                 */
1943                if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
1944                        lock_buffer(bh);
1945                } else if (test_set_buffer_locked(bh)) {
1946                        redirty_page_for_writepage(wbc, page);
1947                        continue;
1948                }
1949                if (test_clear_buffer_dirty(bh)) {
1950                        mark_buffer_async_write(bh);
1951                } else {
1952                        unlock_buffer(bh);
1953                }
1954        } while ((bh = bh->b_this_page) != head);
1955
1956        /*
1957         * The page and its buffers are protected by PageWriteback(), so we can
1958         * drop the bh refcounts early.
1959         */
1960        BUG_ON(PageWriteback(page));
1961        set_page_writeback(page);
1962        unlock_page(page);
1963
1964        do {
1965                struct buffer_head *next = bh->b_this_page;
1966                if (buffer_async_write(bh)) {
1967                        submit_bh(WRITE, bh);
1968                        nr_underway++;
1969                }
1970                put_bh(bh);
1971                bh = next;
1972        } while (bh != head);
1973
1974        err = 0;
1975done:
1976        if (nr_underway == 0) {
1977                /*
1978                 * The page was marked dirty, but the buffers were
1979                 * clean.  Someone wrote them back by hand with
1980                 * ll_rw_block/submit_bh.  A rare case.
1981                 */
1982                int uptodate = 1;
1983                do {
1984                        if (!buffer_uptodate(bh)) {
1985                                uptodate = 0;
1986                                break;
1987                        }
1988                        bh = bh->b_this_page;
1989                } while (bh != head);
1990                if (uptodate)
1991                        SetPageUptodate(page);
1992                end_page_writeback(page);
1993                /*
1994                 * The page and buffer_heads can be released at any time from
1995                 * here on.
1996                 */
1997                wbc->pages_skipped++;   /* We didn't write this page */
1998        }
1999        return err;
2000
2001recover:
2002        /*
2003         * ENOSPC, or some other error.  We may already have added some
2004         * blocks to the file, so we need to write these out to avoid
2005         * exposing stale data.
2006         * The page is currently locked and not marked for writeback
2007         */
2008        bh = head;
2009        /* Recovery: lock and submit the mapped buffers */
2010        do {
2011                get_bh(bh);
2012                if (buffer_mapped(bh) && buffer_dirty(bh)) {
2013                        lock_buffer(bh);
2014                        mark_buffer_async_write(bh);
2015                } else {
2016                        /*
2017                         * The buffer may have been set dirty during
2018                         * attachment to a dirty page.
2019                         */
2020                        clear_buffer_dirty(bh);
2021                }
2022        } while ((bh = bh->b_this_page) != head);
2023        SetPageError(page);
2024        BUG_ON(PageWriteback(page));
2025        set_page_writeback(page);
2026        unlock_page(page);
2027        do {
2028                struct buffer_head *next = bh->b_this_page;
2029                if (buffer_async_write(bh)) {
2030                        clear_buffer_dirty(bh);
2031                        submit_bh(WRITE, bh);
2032                        nr_underway++;
2033                }
2034                put_bh(bh);
2035                bh = next;
2036        } while (bh != head);
2037        goto done;
2038}
2039
2040static int __block_prepare_write(struct inode *inode, struct page *page,
2041                unsigned from, unsigned to, get_block_t *get_block)
2042{
2043        unsigned block_start, block_end;
2044        sector_t block;
2045        int err = 0;
2046        unsigned blocksize, bbits;
2047        struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
2048
2049        BUG_ON(!PageLocked(page));
2050        BUG_ON(from > PAGE_CACHE_SIZE);
2051        BUG_ON(to > PAGE_CACHE_SIZE);
2052        BUG_ON(from > to);
2053
2054        blocksize = 1 << inode->i_blkbits;
2055        if (!page_has_buffers(page))
2056                create_empty_buffers(page, blocksize, 0);
2057        head = page_buffers(page);
2058
2059        bbits = inode->i_blkbits;
2060        block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
2061
2062        for(bh = head, block_start = 0; bh != head || !block_start;
2063            block++, block_start=block_end, bh = bh->b_this_page) {
2064                block_end = block_start + blocksize;
2065                if (block_end <= from || block_start >= to) {
2066                        if (PageUptodate(page)) {
2067                                if (!buffer_uptodate(bh))
2068                                        set_buffer_uptodate(bh);
2069                        }
2070                        continue;
2071                }
2072                if (buffer_new(bh))
2073                        clear_buffer_new(bh);
2074                if (!buffer_mapped(bh)) {
2075                        err = get_block(inode, block, bh, 1);
2076                        if (err)
2077                                goto out;
2078                        if (buffer_new(bh)) {
2079                                unmap_underlying_metadata(bh->b_bdev,
2080                                                        bh->b_blocknr);
2081                                if (PageUptodate(page)) {
2082                                        set_buffer_uptodate(bh);
2083                                        continue;
2084                                }
2085                                if (block_end > to || block_start < from) {
2086                                        void *kaddr;
2087
2088                                        kaddr = kmap_atomic(page, KM_USER0);
2089                                        if (block_end > to)
2090                                                memset(kaddr+to, 0,
2091                                                        block_end-to);
2092                                        if (block_start < from)
2093                                                memset(kaddr+block_start,
2094                                                        0, from-block_start);
2095                                        flush_dcache_page(page);
2096                                        kunmap_atomic(kaddr, KM_USER0);
2097                                }
2098                                continue;
2099                        }
2100                }
2101                if (PageUptodate(page)) {
2102                        if (!buffer_uptodate(bh))
2103                                set_buffer_uptodate(bh);
2104                        continue; 
2105                }
2106                if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
2107                     (block_start < from || block_end > to)) {
2108                        ll_rw_block(READ, 1, &bh);
2109                        *wait_bh++=bh;
2110                }
2111        }
2112        /*
2113         * If we issued read requests - let them complete.
2114         */
2115        while(wait_bh > wait) {
2116                wait_on_buffer(*--wait_bh);
2117                if (!buffer_uptodate(*wait_bh))
2118                        return -EIO;
2119        }
2120
2121        bh = head;
2122        do {
2123                if (buffer_new(bh))
2124                        clear_buffer_new(bh);
2125        } while ((bh = bh->b_this_page) != head);
2126
2127        return 0;
2128out:
2129        /*
2130         * Zero out any newly allocated blocks to avoid exposing stale
2131         * data.  If BH_New is set, we know that the block was newly
2132         * allocated in the above loop.
2133         */
2134        bh = head;
2135        block_start = 0;
2136        do {
2137                block_end = block_start+blocksize;
2138                if (block_end <= from)
2139                        goto next_bh;
2140                if (block_start >= to)
2141                        break;
2142                if (buffer_new(bh)) {
2143                        void *kaddr;
2144
2145                        clear_buffer_new(bh);
2146                        kaddr = kmap_atomic(page, KM_USER0);
2147                        memset(kaddr+block_start, 0, bh->b_size);
2148                        kunmap_atomic(kaddr, KM_USER0);
2149                        set_buffer_uptodate(bh);
2150                        mark_buffer_dirty(bh);
2151                }
2152next_bh:
2153                block_start = block_end;
2154                bh = bh->b_this_page;
2155        } while (bh != head);
2156        return err;
2157}
2158
2159static int __block_commit_write(struct inode *inode, struct page *page,
2160                unsigned from, unsigned to)
2161{
2162        unsigned block_start, block_end;
2163        int partial = 0;
2164        unsigned blocksize;
2165        struct buffer_head *bh, *head;
2166
2167        blocksize = 1 << inode->i_blkbits;
2168
2169        for(bh = head = page_buffers(page), block_start = 0;
2170            bh != head || !block_start;
2171            block_start=block_end, bh = bh->b_this_page) {
2172                block_end = block_start + blocksize;
2173                if (block_end <= from || block_start >= to) {
2174                        if (!buffer_uptodate(bh))
2175                                partial = 1;
2176                } else {
2177                        set_buffer_uptodate(bh);
2178                        mark_buffer_dirty(bh);
2179                }
2180        }
2181
2182        /*
2183         * If this is a partial write which happened to make all buffers
2184         * uptodate then we can optimize away a bogus readpage() for
2185         * the next read(). Here we 'discover' whether the page went
2186         * uptodate as a result of this (potentially partial) write.
2187         */
2188        if (!partial)
2189                SetPageUptodate(page);
2190        return 0;
2191}
2192
2193/*
2194 * Generic "read page" function for block devices that have the normal
2195 * get_block functionality. This is most of the block device filesystems.
2196 * Reads the page asynchronously --- the unlock_buffer() and
2197 * set/clear_buffer_uptodate() functions propagate buffer state into the
2198 * page struct once IO has completed.
2199 */
2200int block_read_full_page(struct page *page, get_block_t *get_block)
2201{
2202        struct inode *inode = page->mapping->host;
2203        sector_t iblock, lblock;
2204        struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
2205        unsigned int blocksize;
2206        int nr, i;
2207        int fully_mapped = 1;
2208
2209        if (!PageLocked(page))
2210                PAGE_BUG(page);
2211        blocksize = 1 << inode->i_blkbits;
2212        if (!page_has_buffers(page))
2213                create_empty_buffers(page, blocksize, 0);
2214        head = page_buffers(page);
2215
2216        iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2217        lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
2218        bh = head;
2219        nr = 0;
2220        i = 0;
2221
2222        do {
2223                if (buffer_uptodate(bh))
2224                        continue;
2225
2226                if (!buffer_mapped(bh)) {
2227                        int err = 0;
2228
2229                        fully_mapped = 0;
2230                        if (iblock < lblock) {
2231                                err = get_block(inode, iblock, bh, 0);
2232                                if (err)
2233                                        SetPageError(page);
2234                        }
2235                        if (!buffer_mapped(bh)) {
2236                                void *kaddr = kmap_atomic(page, KM_USER0);
2237                                memset(kaddr + i * blocksize, 0, blocksize);
2238                                flush_dcache_page(page);
2239                                kunmap_atomic(kaddr, KM_USER0);
2240                                if (!err)
2241                                        set_buffer_uptodate(bh);
2242                                continue;
2243                        }
2244                        /*
2245                         * get_block() might have updated the buffer
2246                         * synchronously
2247                         */
2248                        if (buffer_uptodate(bh))
2249                                continue;
2250                }
2251                arr[nr++] = bh;
2252        } while (i++, iblock++, (bh = bh->b_this_page) != head);
2253
2254        if (fully_mapped)
2255                SetPageMappedToDisk(page);
2256
2257        if (!nr) {
2258                /*
2259                 * All buffers are uptodate - we can set the page uptodate
2260                 * as well. But not if get_block() returned an error.
2261                 */
2262                if (!PageError(page))
2263                        SetPageUptodate(page);
2264                unlock_page(page);
2265                return 0;
2266        }
2267
2268        /* Stage two: lock the buffers */
2269        for (i = 0; i < nr; i++) {
2270                bh = arr[i];
2271                lock_buffer(bh);
2272                mark_buffer_async_read(bh);
2273        }
2274
2275        /*
2276         * Stage 3: start the IO.  Check for uptodateness
2277         * inside the buffer lock in case another process reading
2278         * the underlying blockdev brought it uptodate (the sct fix).
2279         */
2280        for (i = 0; i < nr; i++) {
2281                bh = arr[i];
2282                if (buffer_uptodate(bh))
2283                        end_buffer_async_read(bh, 1);
2284                else
2285                        submit_bh(READ, bh);
2286        }
2287        return 0;
2288}
2289
2290/* utility function for filesystems that need to do work on expanding
2291 * truncates.  Uses prepare/commit_write to allow the filesystem to
2292 * deal with the hole.  
2293 */
2294int generic_cont_expand(struct inode *inode, loff_t size)
2295{
2296        struct address_space *mapping = inode->i_mapping;
2297        struct page *page;
2298        unsigned long index, offset, limit;
2299        int err;
2300
2301        err = -EFBIG;
2302        limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
2303        if (limit != RLIM_INFINITY && size > (loff_t)limit) {
2304                send_sig(SIGXFSZ, current, 0);
2305                goto out;
2306        }
2307        if (size > inode->i_sb->s_maxbytes)
2308                goto out;
2309
2310        offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */
2311
2312        /* ugh.  in prepare/commit_write, if from==to==start of block, we 
2313        ** skip the prepare.  make sure we never send an offset for the start
2314        ** of a block
2315        */
2316        if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
2317                offset++;
2318        }
2319        index = size >> PAGE_CACHE_SHIFT;
2320        err = -ENOMEM;
2321        page = grab_cache_page(mapping, index);
2322        if (!page)
2323                goto out;
2324        err = mapping->a_ops->prepare_write(NULL, page, offset, offset);
2325        if (!err) {
2326                err = mapping->a_ops->commit_write(NULL, page, offset, offset);
2327        }
2328        unlock_page(page);
2329        page_cache_release(page);
2330        if (err > 0)
2331                err = 0;
2332out:
2333        return err;
2334}
2335
2336/*
2337 * For moronic filesystems that do not allow holes in file.
2338 * We may have to extend the file.
2339 */
2340
2341int cont_prepare_write(struct page *page, unsigned offset,
2342                unsigned to, get_block_t *get_block, loff_t *bytes)
2343{
2344        struct address_space *mapping = page->mapping;
2345        struct inode *inode = mapping->host;
2346        struct page *new_page;
2347        pgoff_t pgpos;
2348        long status;
2349        unsigned zerofrom;
2350        unsigned blocksize = 1 << inode->i_blkbits;
2351        void *kaddr;
2352
2353        while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
2354                status = -ENOMEM;
2355                new_page = grab_cache_page(mapping, pgpos);
2356                if (!new_page)
2357                        goto out;
2358                /* we might sleep */
2359                if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
2360                        unlock_page(new_page);
2361                        page_cache_release(new_page);
2362                        continue;
2363                }
2364                zerofrom = *bytes & ~PAGE_CACHE_MASK;
2365                if (zerofrom & (blocksize-1)) {
2366                        *bytes |= (blocksize-1);
2367                        (*bytes)++;
2368                }
2369                status = __block_prepare_write(inode, new_page, zerofrom,
2370                                                PAGE_CACHE_SIZE, get_block);
2371                if (status)
2372                        goto out_unmap;
2373                kaddr = kmap_atomic(new_page, KM_USER0);
2374                memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
2375                flush_dcache_page(new_page);
2376                kunmap_atomic(kaddr, KM_USER0);
2377                __block_commit_write(inode, new_page,
2378                                zerofrom, PAGE_CACHE_SIZE);
2379                unlock_page(new_page);
2380                page_cache_release(new_page);
2381        }
2382
2383        if (page->index < pgpos) {
2384                /* completely inside the area */
2385                zerofrom = offset;
2386        } else {
2387                /* page covers the boundary, find the boundary offset */
2388                zerofrom = *bytes & ~PAGE_CACHE_MASK;
2389
2390                /* if we will expand the thing last block will be filled */
2391                if (to > zerofrom && (zerofrom & (blocksize-1))) {
2392                        *bytes |= (blocksize-1);
2393                        (*bytes)++;
2394                }
2395
2396                /* starting below the boundary? Nothing to zero out */
2397                if (offset <= zerofrom)
2398                        zerofrom = offset;
2399        }
2400        status = __block_prepare_write(inode, page, zerofrom, to, get_block);
2401        if (status)
2402                goto out1;
2403        if (zerofrom < offset) {
2404                kaddr = kmap_atomic(page, KM_USER0);
2405                memset(kaddr+zerofrom, 0, offset-zerofrom);
2406                flush_dcache_page(page);
2407                kunmap_atomic(kaddr, KM_USER0);
2408                __block_commit_write(inode, page, zerofrom, offset);
2409        }
2410        return 0;
2411out1:
2412        ClearPageUptodate(page);
2413        return status;
2414
2415out_unmap:
2416        ClearPageUptodate(new_page);
2417        unlock_page(new_page);
2418        page_cache_release(new_page);
2419out:
2420        return status;
2421}
2422
2423int block_prepare_write(struct page *page, unsigned from, unsigned to,
2424                        get_block_t *get_block)
2425{
2426        struct inode *inode = page->mapping->host;
2427        int err = __block_prepare_write(inode, page, from, to, get_block);
2428        if (err)
2429                ClearPageUptodate(page);
2430        return err;
2431}
2432
2433int block_commit_write(struct page *page, unsigned from, unsigned to)
2434{
2435        struct inode *inode = page->mapping->host;
2436        __block_commit_write(inode,page,from,to);
2437        return 0;
2438}
2439
2440int generic_commit_write(struct file *file, struct page *page,
2441                unsigned from, unsigned to)
2442{
2443        struct inode *inode = page->mapping->host;
2444        loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2445        __block_commit_write(inode,page,from,to);
2446        /*
2447         * No need to use i_size_read() here, the i_size
2448         * cannot change under us because we hold i_sem.
2449         */
2450        if (pos > inode->i_size) {
2451                i_size_write(inode, pos);
2452                mark_inode_dirty(inode);
2453        }
2454        return 0;
2455}
2456
2457
/*
 * nobh_prepare_write()'s prereads are special: the buffer_heads are freed
 * immediately, while under the page lock.  So a special end_io handler is
 * needed which does not touch the bh after unlocking it.
 *
 * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
 * a race there is benign: unlock_buffer() only uses the bh's address for
 * hashing after unlocking the buffer, so it doesn't actually touch the bh
 * itself.
 */
static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
{
        if (!uptodate) {
                /* This happens, due to failed READA attempts. */
                clear_buffer_uptodate(bh);
        } else {
                set_buffer_uptodate(bh);
        }

        /* Must be the last access to bh. */
        unlock_buffer(bh);
}
2478
2479/*
2480 * On entry, the page is fully not uptodate.
2481 * On exit the page is fully uptodate in the areas outside (from,to)
2482 */
2483int nobh_prepare_write(struct page *page, unsigned from, unsigned to,
2484                        get_block_t *get_block)
2485{
2486        struct inode *inode = page->mapping->host;
2487        const unsigned blkbits = inode->i_blkbits;
2488        const unsigned blocksize = 1 << blkbits;
2489        struct buffer_head map_bh;
2490        struct buffer_head *read_bh[MAX_BUF_PER_PAGE];
2491        unsigned block_in_page;
2492        unsigned block_start;
2493        sector_t block_in_file;
2494        char *kaddr;
2495        int nr_reads = 0;
2496        int i;
2497        int ret = 0;
2498        int is_mapped_to_disk = 1;
2499        int dirtied_it = 0;
2500
2501        if (PageMappedToDisk(page))
2502                return 0;
2503
2504        block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
2505        map_bh.b_page = page;
2506
2507        /*
2508         * We loop across all blocks in the page, whether or not they are
2509         * part of the affected region.  This is so we can discover if the
2510         * page is fully mapped-to-disk.
2511         */
2512        for (block_start = 0, block_in_page = 0;
2513                  block_start < PAGE_CACHE_SIZE;
2514                  block_in_page++, block_start += blocksize) {
2515                unsigned block_end = block_start + blocksize;
2516                int create;
2517
2518                map_bh.b_state = 0;
2519                create = 1;
2520                if (block_start >= to)
2521                        create = 0;
2522                ret = get_block(inode, block_in_file + block_in_page,
2523                                        &map_bh, create);
2524                if (ret)
2525                        goto failed;
2526                if (!buffer_mapped(&map_bh))
2527                        is_mapped_to_disk = 0;
2528                if (buffer_new(&map_bh))
2529                        unmap_underlying_metadata(map_bh.b_bdev,
2530                                                        map_bh.b_blocknr);
2531                if (PageUptodate(page))
2532                        continue;
2533                if (buffer_new(&map_bh) || !buffer_mapped(&map_bh)) {
2534                        kaddr = kmap_atomic(page, KM_USER0);
2535                        if (block_start < from) {
2536                                memset(kaddr+block_start, 0, from-block_start);
2537                                dirtied_it = 1;
2538                        }
2539                        if (block_end > to) {
2540                                memset(kaddr + to, 0, block_end - to);
2541                                dirtied_it = 1;
2542                        }
2543                        flush_dcache_page(page);
2544                        kunmap_atomic(kaddr, KM_USER0);
2545                        continue;
2546                }
2547                if (buffer_uptodate(&map_bh))
2548                        continue;       /* reiserfs does this */
2549                if (block_start < from || block_end > to) {
2550                        struct buffer_head *bh = alloc_buffer_head(GFP_NOFS);
2551
2552                        if (!bh) {
2553                                ret = -ENOMEM;
2554                                goto failed;
2555                        }
2556                        bh->b_state = map_bh.b_state;
2557                        atomic_set(&bh->b_count, 0);
2558                        bh->b_this_page = NULL;
2559                        bh->b_page = page;
2560                        bh->b_blocknr = map_bh.b_blocknr;
2561                        bh->b_size = blocksize;
2562                        bh->b_data = (char *)(long)block_start;
2563                        bh->b_bdev = map_bh.b_bdev;
2564                        bh->b_private = NULL;
2565                        read_bh[nr_reads++] = bh;
2566                }
2567        }
2568
2569        if (nr_reads) {
2570                struct buffer_head *bh;
2571
2572                /*
2573                 * The page is locked, so these buffers are protected from
2574                 * any VM or truncate activity.  Hence we don't need to care
2575                 * for the buffer_head refcounts.
2576                 */
2577                for (i = 0; i < nr_reads; i++) {
2578                        bh = read_bh[i];
2579                        lock_buffer(bh);
2580                        bh->b_end_io = end_buffer_read_nobh;
2581                        submit_bh(READ, bh);
2582                }
2583                for (i = 0; i < nr_reads; i++) {
2584                        bh = read_bh[i];
2585                        wait_on_buffer(bh);
2586                        if (!buffer_uptodate(bh))
2587                                ret = -EIO;
2588                        free_buffer_head(bh);
2589                        read_bh[i] = NULL;
2590                }
2591                if (ret)
2592                        goto failed;
2593        }
2594
2595        if (is_mapped_to_disk)
2596                SetPageMappedToDisk(page);
2597        SetPageUptodate(page);
2598
2599        /*
2600         * Setting the page dirty here isn't necessary for the prepare_write
2601         * function - commit_write will do that.  But if/when this function is
2602         * used within the pagefault handler to ensure that all mmapped pages
2603         * have backing space in the filesystem, we will need to dirty the page
2604         * if its contents were altered.
2605         */
2606        if (dirtied_it)
2607                set_page_dirty(page);
2608
2609        return 0;
2610
2611failed:
2612        for (i = 0; i < nr_reads; i++) {
2613                if (read_bh[i])
2614                        free_buffer_head(read_bh[i]);
2615        }
2616
2617        /*
2618         * Error recovery is pretty slack.  Clear the page and mark it dirty
2619         * so we'll later zero out any blocks which _were_ allocated.
2620         */
2621        kaddr = kmap_atomic(page, KM_USER0);
2622        memset(kaddr, 0, PAGE_CACHE_SIZE);
2623        kunmap_atomic(kaddr, KM_USER0);
2624        SetPageUptodate(page);
2625        set_page_dirty(page);
2626        return ret;
2627}
2628EXPORT_SYMBOL(nobh_prepare_write);
2629
2630int nobh_commit_write(struct file *file, struct page *page,
2631                unsigned from, unsigned to)
2632{
2633        struct inode *inode = page->mapping->host;
2634        loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2635
2636        set_page_dirty(page);
2637        if (pos > inode->i_size) {
2638                i_size_write(inode, pos);
2639                mark_inode_dirty(inode);
2640        }
2641        return 0;
2642}
2643EXPORT_SYMBOL(nobh_commit_write);
2644
2645/*
2646 * This function assumes that ->prepare_write() uses nobh_prepare_write().
2647 */
2648int nobh_truncate_page(struct address_space *mapping, loff_t from)
2649{
2650        struct inode *inode = mapping->host;
2651        unsigned blocksize = 1 << inode->i_blkbits;
2652        pgoff_t index = from >> PAGE_CACHE_SHIFT;
2653        unsigned offset = from & (PAGE_CACHE_SIZE-1);
2654        unsigned to;
2655        struct page *page;
2656        struct address_space_operations *a_ops = mapping->a_ops;
2657        char *kaddr;
2658        int ret = 0;
2659
2660        if ((offset & (blocksize - 1)) == 0)
2661                goto out;
2662
2663        ret = -ENOMEM;
2664        page = grab_cache_page(mapping, index);
2665        if (!page)
2666                goto out;
2667
2668        to = (offset + blocksize) & ~(blocksize - 1);
2669        ret = a_ops->prepare_write(NULL, page, offset, to);
2670        if (ret == 0) {
2671                kaddr = kmap_atomic(page, KM_USER0);
2672                memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
2673                flush_dcache_page(page);
2674                kunmap_atomic(kaddr, KM_USER0);
2675                set_page_dirty(page);
2676        }
2677        unlock_page(page);
2678        page_cache_release(page);
2679out:
2680        return ret;
2681}
2682EXPORT_SYMBOL(nobh_truncate_page);
2683
2684int block_truncate_page(struct address_space *mapping,
2685                        loff_t from, get_block_t *get_block)
2686{
2687        pgoff_t index = from >> PAGE_CACHE_SHIFT;
2688        unsigned offset = from & (PAGE_CACHE_SIZE-1);
2689        unsigned blocksize;
2690        pgoff_t iblock;
2691        unsigned length, pos;
2692        struct inode *inode = mapping->host;
2693        struct page *page;
2694        struct buffer_head *bh;
2695        void *kaddr;
2696        int err;
2697
2698        blocksize = 1 << inode->i_blkbits;
2699        length = offset & (blocksize - 1);
2700
2701        /* Block boundary? Nothing to do */
2702        if (!length)
2703                return 0;
2704
2705        length = blocksize - length;
2706        iblock = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2707        
2708        page = grab_cache_page(mapping, index);
2709        err = -ENOMEM;
2710        if (!page)
2711                goto out;
2712
2713        if (!page_has_buffers(page))
2714                create_empty_buffers(page, blocksize, 0);
2715
2716        /* Find the buffer that contains "offset" */
2717        bh = page_buffers(page);
2718        pos = blocksize;
2719        while (offset >= pos) {
2720                bh = bh->b_this_page;
2721                iblock++;
2722                pos += blocksize;
2723        }
2724
2725        err = 0;
2726        if (!buffer_mapped(bh)) {
2727                err = get_block(inode, iblock, bh, 0);
2728                if (err)
2729                        goto unlock;
2730                /* unmapped? It's a hole - nothing to do */
2731                if (!buffer_mapped(bh))
2732                        goto unlock;
2733        }
2734
2735        /* Ok, it's mapped. Make sure it's up-to-date */
2736        if (PageUptodate(page))
2737                set_buffer_uptodate(bh);
2738
2739        if (!buffer_uptodate(bh) && !buffer_delay(bh)) {
2740                err = -EIO;
2741                ll_rw_block(READ, 1, &bh);
2742                wait_on_buffer(bh);
2743                /* Uhhuh. Read error. Complain and punt. */
2744                if (!buffer_uptodate(bh))
2745                        goto unlock;
2746        }
2747
2748        kaddr = kmap_atomic(page, KM_USER0);
2749        memset(kaddr + offset, 0, length);
2750        flush_dcache_page(page);
2751        kunmap_atomic(kaddr, KM_USER0);
2752
2753        mark_buffer_dirty(bh);
2754        err = 0;
2755
2756unlock:
2757        unlock_page(page);
2758        page_cache_release(page);
2759out:
2760        return err;
2761}
2762
2763/*
2764 * The generic ->writepage function for buffer-backed address_spaces
2765 */
2766int block_write_full_page(struct page *page, get_block_t *get_block,
2767                        struct writeback_control *wbc)
2768{
2769        struct inode * const inode = page->mapping->host;
2770        loff_t i_size = i_size_read(inode);
2771        const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2772        unsigned offset;
2773        void *kaddr;
2774
2775        /* Is the page fully inside i_size? */
2776        if (page->index < end_index)
2777                return __block_write_full_page(inode, page, get_block, wbc);
2778
2779        /* Is the page fully outside i_size? (truncate in progress) */
2780        offset = i_size & (PAGE_CACHE_SIZE-1);
2781        if (page->index >= end_index+1 || !offset) {
2782                /*
2783                 * The page may have dirty, unmapped buffers.  For example,
2784                 * they may have been added in ext3_writepage().  Make them
2785                 * freeable here, so the page does not leak.
2786                 */
2787                block_invalidatepage(page, 0);
2788                unlock_page(page);
2789                return 0; /* don't care */
2790        }
2791
2792        /*
2793         * The page straddles i_size.  It must be zeroed out on each and every
2794         * writepage invocation because it may be mmapped.  "A file is mapped
2795         * in multiples of the page size.  For a file that is not a multiple of
2796         * the  page size, the remaining memory is zeroed when mapped, and
2797         * writes to that region are not written out to the file."
2798         */
2799        kaddr = kmap_atomic(page, KM_USER0);
2800        memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
2801        flush_dcache_page(page);
2802        kunmap_atomic(kaddr, KM_USER0);
2803        return __block_write_full_page(inode, page, get_block, wbc);