RHEL4/mm/madvise.c
<<
>>
Prefs
   1/*
   2 *      linux/mm/madvise.c
   3 *
   4 * Copyright (C) 1999  Linus Torvalds
   5 * Copyright (C) 2002  Christoph Hellwig
   6 */
   7
   8#include <linux/mman.h>
   9#include <linux/pagemap.h>
  10#include <linux/hugetlb.h>
  11#include <linux/mempolicy.h>
  12
  13/*
  14 * Any behaviour which results in changes to the vma->vm_flags needs to
  15 * take mmap_sem for writing. Others, which simply traverse vmas, need
  16 * to only take it for reading.
  17 */
  18static int madvise_need_mmap_write(int behavior)
  19{
  20        switch (behavior) {
  21        case MADV_WILLNEED:
  22        case MADV_DONTNEED:
  23                return 0;
  24        default:
  25                /* be safe, default to 1. list exceptions explicitly */
  26                return 1;
  27        }
  28}
  29
  30/*
  31 * We can potentially split a vm area into separate
  32 * areas, each area with its own behavior.
  33 */
  34static long madvise_behavior(struct vm_area_struct * vma,
  35        struct vm_area_struct **prev,
  36        unsigned long start, unsigned long end, int behavior)
  37
  38{
  39        struct mm_struct * mm = vma->vm_mm;
  40        int error = 0;
  41        pgoff_t pgoff;
  42        int new_flags = vma->vm_flags;
  43
  44        switch (behavior) {
  45        case MADV_NORMAL:
  46                new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
  47                break;
  48        case MADV_SEQUENTIAL:
  49                new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
  50                break;
  51        case MADV_RANDOM:
  52                new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
  53                break;
  54        case MADV_DONTFORK:
  55                new_flags |= VM_DONTCOPY;
  56                break;
  57        case MADV_DOFORK:
  58                new_flags &= ~VM_DONTCOPY;
  59        default:
  60                break;
  61        }
  62
  63        if (new_flags == vma->vm_flags) {
  64                *prev = vma;
  65                goto out;
  66        }
  67
  68        pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
  69        *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
  70                        vma->vm_file, pgoff, vma_policy(vma));
  71
  72        if (*prev) {
  73                vma = *prev;
  74                goto success;
  75        }
  76
  77        *prev = vma;
  78
  79        if (start != vma->vm_start) {
  80                error = split_vma(mm, vma, start, 1);
  81                if (error)
  82                        goto out;
  83        }
  84
  85        if (end != vma->vm_end) {
  86                error = split_vma(mm, vma, end, 0);
  87                if (error)
  88                        goto out;
  89        }
  90
  91success:
  92        /*
  93         * vm_flags is protected by the mmap_sem held in write mode.
  94         */
  95        VM_ClearReadHint(vma);
  96        vma->vm_flags = new_flags;
  97
  98out:
  99        if (error == -ENOMEM)
 100                error = -EAGAIN;
 101        return error;
 102}
 103
 104/*
 105 * Schedule all required I/O operations.  Do not wait for completion.
 106 */
 107static long madvise_willneed(struct vm_area_struct * vma,
 108                struct vm_area_struct **prev,
 109                unsigned long start, unsigned long end)
 110{
 111        struct file *file = vma->vm_file;
 112
 113        if (!file)
 114                return -EBADF;
 115
 116        *prev = vma;
 117
 118        start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
 119        if (end > vma->vm_end)
 120                end = vma->vm_end;
 121        end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
 122
 123        force_page_cache_readahead(file->f_mapping,
 124                        file, start, max_sane_readahead(end - start));
 125        return 0;
 126}
 127
 128/*
 129 * Application no longer needs these pages.  If the pages are dirty,
 130 * it's OK to just throw them away.  The app will be more careful about
 131 * data it wants to keep.  Be sure to free swap resources too.  The
 132 * zap_page_range call sets things up for refill_inactive to actually free
 133 * these pages later if no one else has touched them in the meantime,
 134 * although we could add these pages to a global reuse list for
 135 * refill_inactive to pick up before reclaiming other pages.
 136 *
 137 * NB: This interface discards data rather than pushes it out to swap,
 138 * as some implementations do.  This has performance implications for
 139 * applications like large transactional databases which want to discard
 140 * pages in anonymous maps after committing to backing store the data
 141 * that was kept in them.  There is no reason to write this data out to
 142 * the swap area if the application is discarding it.
 143 *
 144 * An interface that causes the system to free clean pages and flush
 145 * dirty pages is already available as msync(MS_INVALIDATE).
 146 */
 147static long madvise_dontneed(struct vm_area_struct * vma,
 148                struct vm_area_struct **prev,
 149                unsigned long start, unsigned long end)
 150{
 151        *prev = vma;
 152        if ((vma->vm_flags & VM_LOCKED) || is_vm_hugetlb_page(vma))
 153                return -EINVAL;
 154
 155        if (unlikely(vma->vm_flags & VM_NONLINEAR)) {
 156                struct zap_details details = {
 157                        .nonlinear_vma = vma,
 158                        .last_index = ULONG_MAX,
 159                };
 160                zap_page_range(vma, start, end - start, &details);
 161        } else
 162                zap_page_range(vma, start, end - start, NULL);
 163        return 0;
 164}
 165
 166static long madvise_vma(struct vm_area_struct * vma, 
 167                struct vm_area_struct **prev, 
 168                unsigned long start, unsigned long end, int behavior)
 169{
 170        long error = -EBADF;
 171
 172        switch (behavior) {
 173        case MADV_DOFORK:
 174                if (vma->vm_flags & VM_IO) {
 175                        error = -EINVAL;
 176                        break;
 177                }
 178        case MADV_DONTFORK:
 179        case MADV_NORMAL:
 180        case MADV_SEQUENTIAL:
 181        case MADV_RANDOM:
 182                error = madvise_behavior(vma, prev, start, end, behavior);
 183                break;
 184
 185        case MADV_WILLNEED:
 186                error = madvise_willneed(vma, prev, start, end);
 187                break;
 188
 189        case MADV_DONTNEED:
 190                error = madvise_dontneed(vma, prev, start, end);
 191                break;
 192
 193        default:
 194                error = -EINVAL;
 195                break;
 196        }
 197                
 198        return error;
 199}
 200
 201/*
 202 * The madvise(2) system call.
 203 *
 204 * Applications can use madvise() to advise the kernel how it should
 205 * handle paging I/O in this VM area.  The idea is to help the kernel
 206 * use appropriate read-ahead and caching techniques.  The information
 207 * provided is advisory only, and can be safely disregarded by the
 208 * kernel without affecting the correct operation of the application.
 209 *
 210 * behavior values:
 211 *  MADV_NORMAL - the default behavior is to read clusters.  This
 212 *              results in some read-ahead and read-behind.
 213 *  MADV_RANDOM - the system should read the minimum amount of data
 214 *              on any access, since it is unlikely that the appli-
 215 *              cation will need more than what it asks for.
 216 *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
 217 *              once, so they can be aggressively read ahead, and
 218 *              can be freed soon after they are accessed.
 219 *  MADV_WILLNEED - the application is notifying the system to read
 220 *              some pages ahead.
 221 *  MADV_DONTNEED - the application is finished with the given range,
 222 *              so the kernel can free resources associated with it.
 223 *
 224 * return values:
 225 *  zero    - success
 226 *  -EINVAL - start + len < 0, start is not page-aligned,
 227 *              "behavior" is not a valid value, or application
 228 *              is attempting to release locked or shared pages.
 229 *  -ENOMEM - addresses in the specified range are not currently
 230 *              mapped, or are outside the AS of the process.
 231 *  -EIO    - an I/O error occurred while paging in data.
 232 *  -EBADF  - map exists, but area maps something that isn't a file.
 233 *  -EAGAIN - a kernel resource was temporarily unavailable.
 234 */
 235asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior)
 236{
 237        unsigned long end, tmp;
 238        struct vm_area_struct * vma, *prev;
 239        int unmapped_error = 0;
 240        int error = -EINVAL;
 241        int write;
 242        size_t len;
 243
 244        write = madvise_need_mmap_write(behavior);
 245        if (write)
 246                down_write(&current->mm->mmap_sem);
 247        else
 248                down_read(&current->mm->mmap_sem);
 249
 250        if (start & ~PAGE_MASK)
 251                goto out;
 252        len = (len_in + ~PAGE_MASK) & PAGE_MASK;
 253
 254        /* Check to see whether len was rounded up from small -ve to zero */
 255        if (len_in && !len)
 256                goto out;
 257
 258        end = start + len;
 259        if (end < start)
 260                goto out;
 261
 262        error = 0;
 263        if (end == start)
 264                goto out;
 265
 266        /*
 267         * If the interval [start,end) covers some unmapped address
 268         * ranges, just ignore them, but return -ENOMEM at the end.
 269         */
 270        vma = find_vma_prev(current->mm, start, &prev);
 271        if (vma && start > vma->vm_start)
 272                prev = vma;
 273
 274        for (;;) {
 275                /* Still start < end. */
 276                error = -ENOMEM;
 277                if (!vma)
 278                        goto out;
 279
 280                /* Here start < vma->vm_end. */
 281                if (start < vma->vm_start) {
 282                        unmapped_error = -ENOMEM;
 283                        start = vma->vm_start;
 284                        if (start >= end)
 285                                goto out;
 286                }
 287
 288                /* Here vma->vm_start <= start < vma->vm_end. */
 289                tmp = vma->vm_end;
 290                if (end < tmp)
 291                        tmp = end;
 292
 293                /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
 294                error = madvise_vma(vma, &prev, start, tmp, behavior);
 295                if (error)
 296                        goto out;
 297
 298                start = tmp;
 299                if (start < prev->vm_end)
 300                        start = prev->vm_end;
 301                error = unmapped_error;
 302                if (start >= end)
 303                        goto out;
 304                vma = prev->vm_next;
 305        }
 306
 307out:
 308        if (write)
 309                up_write(&current->mm->mmap_sem);
 310        else
 311                up_read(&current->mm->mmap_sem);
 312
 313        return error;
 314}
 315