RHEL4/mm/readahead.c
<<
>>
Prefs
   1/*
   2 * mm/readahead.c - address_space-level file readahead.
   3 *
   4 * Copyright (C) 2002, Linus Torvalds
   5 *
   6 * 09Apr2002    akpm@zip.com.au
   7 *              Initial version.
   8 */
   9
  10#include <linux/kernel.h>
  11#include <linux/fs.h>
  12#include <linux/mm.h>
  13#include <linux/module.h>
  14#include <linux/blkdev.h>
  15#include <linux/backing-dev.h>
  16#include <linux/task_io_accounting_ops.h>
  17#include <linux/pagevec.h>
  18
  19void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
  20{
  21}
  22EXPORT_SYMBOL(default_unplug_io_fn);
  23
  24struct backing_dev_info default_backing_dev_info = {
  25        .ra_pages       = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE,
  26        .state          = 0,
  27        .unplug_io_fn   = default_unplug_io_fn,
  28};
  29EXPORT_SYMBOL_GPL(default_backing_dev_info);
  30
  31/*
  32 * Initialise a struct file's readahead state.  Assumes that the caller has
  33 * memset *ra to zero.
  34 */
  35void
  36file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
  37{
  38        ra->ra_pages = mapping->backing_dev_info->ra_pages;
  39        ra->average = ra->ra_pages / 2;
  40}
  41
  42/*
  43 * Return max readahead size for this inode in number-of-pages.
  44 */
  45static inline unsigned long get_max_readahead(struct file_ra_state *ra)
  46{
  47        return ra->ra_pages;
  48}
  49
  50static inline unsigned long get_min_readahead(struct file_ra_state *ra)
  51{
  52        return (VM_MIN_READAHEAD * 1024) / PAGE_CACHE_SIZE;
  53}
  54
  55#define list_to_page(head) (list_entry((head)->prev, struct page, lru))
  56
  57/**
  58 * read_cache_pages - populate an address space with some pages, and
  59 *                      start reads against them.
  60 * @mapping: the address_space
  61 * @pages: The address of a list_head which contains the target pages.  These
  62 *   pages have their ->index populated and are otherwise uninitialised.
  63 * @filler: callback routine for filling a single page.
  64 * @data: private data for the callback routine.
  65 *
  66 * Hides the details of the LRU cache etc from the filesystems.
  67 */
  68int read_cache_pages(struct address_space *mapping, struct list_head *pages,
  69                 int (*filler)(void *, struct page *), void *data)
  70{
  71        struct page *page;
  72        struct pagevec lru_pvec;
  73        int ret = 0;
  74
  75        pagevec_init(&lru_pvec, 0);
  76
  77        while (!list_empty(pages)) {
  78                page = list_to_page(pages);
  79                list_del(&page->lru);
  80                if (add_to_page_cache(page, mapping, page->index, GFP_KERNEL)) {
  81                        page_cache_release(page);
  82                        continue;
  83                }
  84                ret = filler(data, page);
  85                if (!pagevec_add(&lru_pvec, page))
  86                        __pagevec_lru_add(&lru_pvec);
  87                if (ret) {
  88                        while (!list_empty(pages)) {
  89                                struct page *victim;
  90
  91                                victim = list_to_page(pages);
  92                                list_del(&victim->lru);
  93                                page_cache_release(victim);
  94                        }
  95                        break;
  96                }
  97                task_io_account_read(PAGE_CACHE_SIZE);
  98        }
  99        pagevec_lru_add(&lru_pvec);
 100        return ret;
 101}
 102
 103EXPORT_SYMBOL(read_cache_pages);
 104
 105static int read_pages(struct address_space *mapping, struct file *filp,
 106                struct list_head *pages, unsigned nr_pages)
 107{
 108        unsigned page_idx;
 109        struct pagevec lru_pvec;
 110        int ret = 0;
 111
 112        if (mapping->a_ops->readpages) {
 113                ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
 114                goto out;
 115        }
 116
 117        pagevec_init(&lru_pvec, 0);
 118        for (page_idx = 0; page_idx < nr_pages; page_idx++) {
 119                struct page *page = list_to_page(pages);
 120                list_del(&page->lru);
 121                if (!add_to_page_cache(page, mapping,
 122                                        page->index, GFP_KERNEL)) {
 123                        mapping->a_ops->readpage(filp, page);
 124                        if (!pagevec_add(&lru_pvec, page))
 125                                __pagevec_lru_add(&lru_pvec);
 126                } else {
 127                        page_cache_release(page);
 128                }
 129        }
 130        pagevec_lru_add(&lru_pvec);
 131out:
 132        return ret;
 133}
 134
 135/*
 136 * Readahead design.
 137 *
 138 * The fields in struct file_ra_state represent the most-recently-executed
 139 * readahead attempt:
 140 *
 141 * start:       Page index at which we started the readahead
 142 * size:        Number of pages in that read
 143 *              Together, these form the "current window".
 144 *              Together, start and size represent the `readahead window'.
 145 * next_size:   The number of pages to read on the next readahead miss.
 146 *              Has the magical value -1UL if readahead has been disabled.
 147 * prev_page:   The page which the readahead algorithm most-recently inspected.
 148 *              prev_page is mainly an optimisation: if page_cache_readahead
 149 *              sees that it is again being called for a page which it just
 150 *              looked at, it can return immediately without making any state
 151 *              changes.
 152 * ahead_start,
 153 * ahead_size:  Together, these form the "ahead window".
 154 * ra_pages:    The externally controlled max readahead for this fd.
 155 *
 156 * When readahead is in the "maximally shrunk" state (next_size == -1UL),
 157 * readahead is disabled.  In this state, prev_page and size are used, inside
 158 * handle_ra_miss(), to detect the resumption of sequential I/O.  Once there
 159 * has been a decent run of sequential I/O (defined by get_min_readahead),
 160 * readahead is reenabled.
 161 *
 162 * The readahead code manages two windows - the "current" and the "ahead"
 163 * windows.  The intent is that while the application is walking the pages
 164 * in the current window, I/O is underway on the ahead window.  When the
 165 * current window is fully traversed, it is replaced by the ahead window
 166 * and the ahead window is invalidated.  When this copying happens, the
 167 * new current window's pages are probably still locked.  When I/O has
 168 * completed, we submit a new batch of I/O, creating a new ahead window.
 169 *
 170 * So:
 171 *
 172 *   ----|----------------|----------------|-----
 173 *       ^start           ^start+size
 174 *                        ^ahead_start     ^ahead_start+ahead_size
 175 *
 176 *         ^ When this page is read, we submit I/O for the
 177 *           ahead window.
 178 *
 179 * A `readahead hit' occurs when a read request is made against a page which is
 180 * inside the current window.  Hits are good, and the window size (next_size)
 181 * is grown aggressively when hits occur.  Two pages are added to the next
 182 * window size on each hit, which will end up doubling the next window size by
 183 * the time I/O is submitted for it.
 184 *
 185 * If readahead hits are more sparse (say, the application is only reading
 186 * every second page) then the window will build more slowly.
 187 *
 188 * On a readahead miss (the application seeked away) the readahead window is
 189 * shrunk by 25%.  We don't want to drop it too aggressively, because it is a
 190 * good assumption that an application which has built a good readahead window
 191 * will continue to perform linear reads.  Either at the new file position, or
 192 * at the old one after another seek.
 193 *
 194 * After enough misses, readahead is fully disabled. (next_size = -1UL).
 195 *
 196 * There is a special-case: if the first page which the application tries to
 197 * read happens to be the first page of the file, it is assumed that a linear
 198 * read is about to happen and the window is immediately set to half of the
 199 * device maximum.
 200 * 
 201 * A page request at (start + size) is not a miss at all - it's just a part of
 202 * sequential file reading.
 203 *
 204 * This function is to be called for every page which is read, rather than when
 205 * it is time to perform readahead.  This is so the readahead algorithm can
 206 * centrally work out the access patterns.  This could be costly with many tiny
 207 * read()s, so we specifically optimise for that case with prev_page.
 208 */
 209
 210/*
 211 * do_page_cache_readahead actually reads a chunk of disk.  It allocates all
 212 * the pages first, then submits them all for I/O. This avoids the very bad
 213 * behaviour which would occur if page allocations are causing VM writeback.
 214 * We really don't want to intermingle reads and writes like that.
 215 *
 216 * Returns the number of pages which actually had IO started against them.
 217 */
 218static inline int
 219__do_page_cache_readahead(struct address_space *mapping, struct file *filp,
 220                        unsigned long offset, unsigned long nr_to_read)
 221{
 222        struct inode *inode = mapping->host;
 223        struct page *page;
 224        unsigned long end_index;        /* The last page we want to read */
 225        LIST_HEAD(page_pool);
 226        int page_idx;
 227        int ret = 0;
 228        loff_t isize = i_size_read(inode);
 229
 230        if (isize == 0)
 231                goto out;
 232
 233        end_index = ((isize - 1) >> PAGE_CACHE_SHIFT);
 234
 235        /*
 236         * Preallocate as many pages as we will need.
 237         */
 238        spin_lock_irq(&mapping->tree_lock);
 239        for (page_idx = 0; page_idx < nr_to_read; page_idx++) {
 240                unsigned long page_offset = offset + page_idx;
 241                
 242                if (page_offset > end_index)
 243                        break;
 244
 245                page = radix_tree_lookup(&mapping->page_tree, page_offset);
 246                if (page)
 247                        continue;
 248
 249                spin_unlock_irq(&mapping->tree_lock);
 250                page = page_cache_alloc_cold(mapping);
 251                spin_lock_irq(&mapping->tree_lock);
 252                if (!page)
 253                        break;
 254                page->index = page_offset;
 255                list_add(&page->lru, &page_pool);
 256                ret++;
 257        }
 258        spin_unlock_irq(&mapping->tree_lock);
 259
 260        /*
 261         * Now start the IO.  We ignore I/O errors - if the page is not
 262         * uptodate then the caller will launch readpage again, and
 263         * will then handle the error.
 264         */
 265        if (ret)
 266                read_pages(mapping, filp, &page_pool, ret);
 267        BUG_ON(!list_empty(&page_pool));
 268out:
 269        return ret;
 270}
 271
 272/*
 273 * Chunk the readahead into 2 megabyte units, so that we don't pin too much
 274 * memory at once.
 275 */
 276int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
 277                unsigned long offset, unsigned long nr_to_read)
 278{
 279        int ret = 0;
 280
 281        if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages))
 282                return -EINVAL;
 283
 284        while (nr_to_read) {
 285                int err;
 286
 287                unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_CACHE_SIZE;
 288
 289                if (this_chunk > nr_to_read)
 290                        this_chunk = nr_to_read;
 291                err = __do_page_cache_readahead(mapping, filp,
 292                                                offset, this_chunk);
 293                if (err < 0) {
 294                        ret = err;
 295                        break;
 296                }
 297                ret += err;
 298                offset += this_chunk;
 299                nr_to_read -= this_chunk;
 300        }
 301        return ret;
 302}
 303
 304/*
 305 * This version skips the IO if the queue is read-congested, and will tell the
 306 * block layer to abandon the readahead if request allocation would block.
 307 *
 308 * force_page_cache_readahead() will ignore queue congestion and will block on
 309 * request queues.
 310 */
 311int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
 312                        unsigned long offset, unsigned long nr_to_read)
 313{
 314        if (!bdi_read_congested(mapping->backing_dev_info))
 315                return __do_page_cache_readahead(mapping, filp,
 316                                                offset, nr_to_read);
 317        return 0;
 318}
 319
 320/*
 321 * Check how effective readahead is being.  If the amount of started IO is
 322 * less than expected then the file is partly or fully in pagecache and
 323 * readahead isn't helping.  Shrink the window.
 324 *
 325 * But don't shrink it too much - the application may read the same page
 326 * occasionally.
 327 */
 328static inline void
 329check_ra_success(struct file_ra_state *ra, pgoff_t attempt,
 330                        pgoff_t actual, pgoff_t orig_next_size)
 331{
 332        if (actual == 0) {
 333                if (orig_next_size > 1) {
 334                        ra->next_size = orig_next_size - 1;
 335                        if (ra->ahead_size)
 336                                ra->ahead_size = ra->next_size;
 337                } else {
 338                        ra->next_size = -1UL;
 339                        ra->size = 0;
 340                }
 341        }
 342}
 343
 344/*
 345 * Move towards, or away from, re-enabling readahead.
 346 *
 347 * For sequential IOs, move ra->size towards re-enabling
 348 * readahead by the number of sequential pages read.
 349 * Non-sequential IOs will move away from re-enabling
 350 * readahead, by decreasing ra->size.
 351 */
 352
 353static inline void
 354check_ra_reenable(struct file_ra_state *ra, unsigned long offset,
 355                  const unsigned long max, unsigned long nr_read)
 356{
 357        if (offset != ra->prev_page + 1) {      /* Not sequential */
 358                ra->size = ra->size?ra->size-1:0;
 359        } else {                                /* A sequential read */
 360                ra->size += nr_read;
 361                if (ra->size >= max) {          /* Resume readahead */
 362                        ra->start = offset - max;
 363                        ra->next_size = max;
 364                        ra->size = max;
 365                        ra->ahead_start = 0;
 366                        ra->ahead_size = 0;
 367                        ra->average = max / 2;
 368                }
 369        }
 370}
 371
 372/*
 373 * page_cache_readahead is the main function.  If performs the adaptive
 374 * readahead window size management and submits the readahead I/O.
 375 */
 376unsigned long
 377page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra,
 378                     struct file *filp, unsigned long offset,
 379                     unsigned long nr_to_read)
 380{
 381        unsigned max;
 382        unsigned orig_next_size;
 383        unsigned actual;
 384        int first_access=0;
 385        unsigned long average;
 386
 387        /*
 388         * Here we detect the case where the application is performing
 389         * sub-page sized reads.  We avoid doing extra work and bogusly
 390         * perturbing the readahead window expansion logic.
 391         * If next_size is zero, this is the very first read for this
 392         * file handle, or the window is maximally shrunk.
 393         */
 394        if (offset == ra->prev_page) {
 395                if (ra->next_size != 0)
 396                        goto out;
 397        }
 398
 399        max = get_max_readahead(ra);
 400        if (max == 0)
 401                goto out;       /* No readahead */
 402
 403        if (ra->next_size == -1UL)
 404                goto ra_off;    /* Maximally shrunk */
 405
 406        orig_next_size = ra->next_size;
 407
 408        if (ra->next_size == 0) {
 409                /*
 410                 * Special case - first read.
 411                 * We'll assume it's a whole-file read, and
 412                 * grow the window fast.
 413                 */
 414                first_access=1;
 415                ra->next_size = max / 2;
 416                ra->prev_page = offset;
 417                ra->currnt_wnd_hit++;
 418                goto do_io;
 419        }
 420
 421        ra->prev_page = offset;
 422
 423        if (offset >= ra->start && offset <= (ra->start + ra->size)) {
 424                /*
 425                 * A readahead hit.  Either inside the window, or one
 426                 * page beyond the end.  Expand the next readahead size.
 427                 */
 428                ra->next_size += 2;
 429
 430                if (ra->currnt_wnd_hit <= (max * 2))
 431                        ra->currnt_wnd_hit++;
 432        } else {
 433                /*
 434                 * A miss - lseek, pagefault, pread, etc.  Shrink the readahead
 435                 * window.
 436                 */
 437                ra->next_size -= 2;
 438
 439                average = ra->average;
 440                if (average < ra->currnt_wnd_hit) {
 441                        average++;
 442                }
 443                ra->average = (average + ra->currnt_wnd_hit) / 2;
 444                ra->currnt_wnd_hit = 1;
 445        }
 446
 447        if ((long)ra->next_size > (long)max)
 448                ra->next_size = max;
 449        if ((long)ra->next_size <= 0L) {
 450                ra->next_size = -1UL;
 451                ra->size = 0;
 452                goto ra_off;            /* Readahead is off */
 453        }
 454
 455        /*
 456         * Is this request outside the current window?
 457         */
 458        if (offset < ra->start || offset >= (ra->start + ra->size)) {
 459                /*
 460                 * A miss against the current window.  Have we merely
 461                 * advanced into the ahead window?
 462                 */
 463                if (offset == ra->ahead_start) {
 464                        /*
 465                         * Yes, we have.  The ahead window now becomes
 466                         * the current window.
 467                         */
 468                        ra->start = ra->ahead_start;
 469                        ra->size = ra->ahead_size;
 470                        ra->prev_page = ra->start;
 471                        ra->ahead_start = 0;
 472                        ra->ahead_size = 0;
 473
 474                        /*
 475                         * Control now returns, probably to sleep until I/O
 476                         * completes against the first ahead page.
 477                         * When the second page in the old ahead window is
 478                         * requested, control will return here and more I/O
 479                         * will be submitted to build the new ahead window.
 480                         */
 481                        goto out;
 482                }
 483do_io:
 484                /*
 485                 * This is the "unusual" path.  We come here during
 486                 * startup or after an lseek.  We invalidate the
 487                 * ahead window and get some I/O underway for the new
 488                 * current window.
 489                 */
 490                if (!first_access) {
 491                         /* Heuristic: there is a high probability
 492                          * that around  ra->average number of
 493                          * pages shall be accessed in the next
 494                          * current window.
 495                          */
 496                        average = ra->average;
 497                        if (ra->currnt_wnd_hit > average)
 498                                average = (ra->currnt_wnd_hit + ra->average + 1) / 2;
 499
 500                        ra->next_size = min(average , (unsigned long)max);
 501                }
 502                ra->start = offset;
 503                ra->size = ra->next_size;
 504                if (ra->size < nr_to_read)
 505                        ra->size = min(nr_to_read, (unsigned long)max);
 506                ra->prev_page = offset + ra->size - 1;
 507                ra->ahead_start = 0;            /* Invalidate these */
 508                ra->ahead_size = 0;
 509                actual = do_page_cache_readahead(mapping, filp, offset,
 510                                                 ra->size);
 511                if(!first_access) {
 512                        /*
 513                         * do not adjust the readahead window size the first
 514                         * time, the ahead window might get closed if all
 515                         * the pages are already in the cache.
 516                         */
 517                        check_ra_success(ra, ra->size, actual, orig_next_size);
 518                }
 519        } else {
 520                /*
 521                 * This read request is within the current window.  It may be
 522                 * time to submit I/O for the ahead window while the
 523                 * application is about to step into the ahead window.
 524                 */
 525                if (ra->ahead_start == 0) {
 526                        /*
 527                         * If the average io-size is more than maximum
 528                         * readahead size of the file the io pattern is
 529                         * sequential. Hence  bring in the readahead window
 530                         * immediately.
 531                         * If the average io-size is less than maximum
 532                         * readahead size of the file the io pattern is
 533                         * random. Hence don't bother to readahead.
 534                         */
 535                        average = ra->average;
 536                        if (ra->currnt_wnd_hit > average)
 537                                average = (ra->currnt_wnd_hit + ra->average + 1) / 2;
 538
 539                        if (average > max) {
 540                                ra->ahead_start = ra->start + ra->size;
 541                                ra->ahead_size = ra->next_size;
 542                                actual = do_page_cache_readahead(mapping, filp,
 543                                        ra->ahead_start, ra->ahead_size);
 544                                check_ra_success(ra, ra->ahead_size,
 545                                                actual, orig_next_size);
 546                        }
 547                }
 548        }
 549out:
 550        return ra->prev_page + 1;
 551
 552ra_off:
 553        /* Even if we have decided to terminate readahead for this file,
 554         * a single large read request still wants to be able to
 555         * populate the page cache with a single large bio, so if
 556         * necessary, kick off one limited readahead IO burst to
 557         * complete as much of the read as we can.
 558         */
 559        if (nr_to_read > 1) {
 560                nr_to_read = min(nr_to_read, (unsigned long) max);
 561                check_ra_reenable(ra, offset, max, nr_to_read);
 562                ra->prev_page = offset + nr_to_read - 1;
 563                do_page_cache_readahead(mapping, filp, offset, nr_to_read);
 564        }
 565        goto out;
 566}
 567
 568
 569/*
 570 * handle_ra_miss() is called when it is known that a page which should have
 571 * been present in the pagecache (we just did some readahead there) was in fact
 572 * not found.  This will happen if it was evicted by the VM (readahead
 573 * thrashing) or if the readahead window is maximally shrunk.
 574 *
 575 * If the window has been maximally shrunk (next_size == -1UL) then look to see
 576 * if we are getting misses against sequential file offsets.  If so, and this
 577 * persists then resume readahead.
 578 *
 579 * Otherwise we're thrashing, so shrink the readahead window by three pages.
 580 * This is because it is grown by two pages on a readahead hit.  Theory being
 581 * that the readahead window size will stabilise around the maximum level at
 582 * which there is no thrashing.
 583 */
 584void handle_ra_miss(struct address_space *mapping,
 585                struct file_ra_state *ra, pgoff_t offset)
 586{
 587        if (ra->next_size == -1UL) {
 588                const unsigned long max = get_max_readahead(ra);
 589
 590                check_ra_reenable(ra, offset, max, 1);
 591                ra->prev_page = offset;
 592        } else {
 593                const unsigned long min = get_min_readahead(ra);
 594
 595                ra->next_size -= 3;
 596                if (ra->next_size < min)
 597                        ra->next_size = min;
 598        }
 599}
 600
 601/*
 602 * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a
 603 * sensible upper limit.
 604 */
 605unsigned long max_sane_readahead(unsigned long nr)
 606{
 607        unsigned long active;
 608        unsigned long inactive;
 609        unsigned long free;
 610
 611        __get_zone_counts(&active, &inactive, &free, NODE_DATA(numa_node_id()));
 612        return min(nr, (inactive + free) / 2);
 613}
 614