RHEL4/mm/mempolicy.c
<<
>>
Prefs
   1/*
   2 * Simple NUMA memory policy for the Linux kernel.
   3 *
   4 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   5 * Subject to the GNU Public License, version 2.
   6 *
   7 * NUMA policy allows the user to give hints in which node(s) memory should
   8 * be allocated.
   9 *
  10 * Support four policies per VMA and per process:
  11 *
  12 * The VMA policy has priority over the process policy for a page fault.
  13 *
  14 * interleave     Allocate memory interleaved over a set of nodes,
  15 *                with normal fallback if it fails.
  16 *                For VMA based allocations this interleaves based on the
  17 *                offset into the backing object or offset into the mapping
  18 *                for anonymous memory. For process policy a process counter
  19 *                is used.
  20 * bind           Only allocate memory on a specific set of nodes,
  21 *                no fallback.
  22 * preferred       Try a specific node first before normal fallback.
  23 *                As a special case node -1 here means do the allocation
  24 *                on the local CPU. This is normally identical to default,
  25 *                but useful to set in a VMA when you have a non default
  26 *                process policy.
  27 * default        Allocate on the local node first, or when on a VMA
  28 *                use the process policy. This is what Linux always did
  29 *                in a NUMA aware kernel and still does by, ahem, default.
  30 *
  31 * The process policy is applied for most non interrupt memory allocations
  32 * in that process' context. Interrupts ignore the policies and always
  33 * try to allocate on the local CPU. The VMA policy is only applied for memory
  34 * allocations for a VMA in the VM.
  35 *
  36 * Currently there are a few corner cases in swapping where the policy
  37 * is not applied, but the majority should be handled. When process policy
  38 * is used it is not remembered over swap outs/swap ins.
  39 *
  40 * Only the highest zone in the zone hierarchy gets policied. Allocations
  41 * requesting a lower zone just use default policy. This implies that
  42 * on systems with highmem, kernel lowmem allocations don't get policied.
  43 * Same with GFP_DMA allocations.
  44 *
  45 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  46 * all users and remembered even when nobody has memory mapped.
  47 */
  48
  49/* Notebook:
  50   fix mmap readahead to honour policy and enable policy for any page cache
  51   object
  52   statistics for bigpages
  53   global policy for page cache? currently it uses process policy. Requires
  54   first item above.
  55   handle mremap for shared memory (currently ignored for the policy)
  56   grows down?
  57   make bind policy root only? It can trigger oom much faster and the
  58   kernel is not always grateful with that.
  59   could replace all the switch()es with a mempolicy_ops structure.
  60*/
  61
  62#include <linux/mempolicy.h>
  63#include <linux/mm.h>
  64#include <linux/highmem.h>
  65#include <linux/hugetlb.h>
  66#include <linux/kernel.h>
  67#include <linux/sched.h>
  68#include <linux/mm.h>
  69#include <linux/gfp.h>
  70#include <linux/slab.h>
  71#include <linux/string.h>
  72#include <linux/module.h>
  73#include <linux/interrupt.h>
  74#include <linux/init.h>
  75#include <linux/compat.h>
  76#include <linux/mempolicy.h>
  77#include <asm/uaccess.h>
  78
  79static kmem_cache_t *policy_cache; /* slab cache that struct mempolicy objects are allocated from (mpol_new, __mpol_copy) */
  80static kmem_cache_t *sn_cache; /* NOTE(review): not referenced in this part of the file; presumably used further down - confirm */
  81
  82#define PDprintk(fmt...) /* debug print hook; expands to nothing, so PDprintk() calls compile away */
  83
  84/* Highest zone. A specific allocation for a zone below that is not
  85   policied. Raised by bind_zonelist() and consulted by zonelist_policy(). */
  86static int policy_zone;
  87
  88static struct mempolicy default_policy = {
  89        .refcnt = ATOMIC_INIT(1), /* statically allocated fallback policy: starts with one reference so it is never freed */
  90        .policy = MPOL_DEFAULT,
  91};
  92
  93/* Check if all specified nodes are online */
  94static int nodes_online(unsigned long *nodes)
  95{
  96        DECLARE_BITMAP(online2, MAX_NUMNODES);
  97
  98        bitmap_copy(online2, node_online_map, MAX_NUMNODES);
  99        if (bitmap_empty(online2, MAX_NUMNODES))
 100                set_bit(0, online2);
 101        if (!bitmap_subset(nodes, online2, MAX_NUMNODES))
 102                return -EINVAL;
 103        return 0;
 104}
 105
 106/* Do sanity checking on a policy */
 107static int mpol_check_policy(int mode, unsigned long *nodes)
 108{
 109        int empty = bitmap_empty(nodes, MAX_NUMNODES);
 110
 111        switch (mode) {
 112        case MPOL_DEFAULT:
 113                if (!empty)
 114                        return -EINVAL;
 115                break;
 116        case MPOL_BIND:
 117        case MPOL_INTERLEAVE:
 118                /* Preferred will only use the first bit, but allow
 119                   more for now. */
 120                if (empty)
 121                        return -EINVAL;
 122                break;
 123        }
 124        return nodes_online(nodes);
 125}
 126
 127/* Copy a node mask from user space. */
 128static int get_nodes(unsigned long *nodes, unsigned long __user *nmask,
 129                     unsigned long maxnode, int mode)
 130{
 131        unsigned long k;
 132        unsigned long nlongs;
 133        unsigned long endmask;
 134
 135        --maxnode;
 136        bitmap_zero(nodes, MAX_NUMNODES);
 137        if (maxnode == 0 || !nmask)
 138                return 0;
 139
 140        nlongs = BITS_TO_LONGS(maxnode);
 141        if (nlongs == 0)
 142                return -EINVAL;
 143
 144        if ((maxnode % BITS_PER_LONG) == 0)
 145                endmask = ~0UL;
 146        else
 147                endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
 148
 149        /* When the user specified more nodes than supported just check
 150           if the non supported part is all zero. */
 151        if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
 152                if (nlongs > PAGE_SIZE/sizeof(long))
 153                        return -EINVAL;
 154                for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
 155                        unsigned long t;
 156                        if (get_user(t,  nmask + k))
 157                                return -EFAULT;
 158                        if (k == nlongs - 1) {
 159                                if (t & endmask)
 160                                        return -EINVAL;
 161                        } else if (t)
 162                                return -EINVAL;
 163                }
 164                nlongs = BITS_TO_LONGS(MAX_NUMNODES);
 165                endmask = ~0UL;
 166        }
 167
 168        if (copy_from_user(nodes, nmask, nlongs*sizeof(unsigned long)))
 169                return -EFAULT;
 170        nodes[nlongs-1] &= endmask;
 171        return 0;
 172}
 173
 174/* Generate a custom zonelist for the BIND policy. */
 175static struct zonelist *bind_zonelist(unsigned long *nodes)
 176{
 177        struct zonelist *zl;
 178        int num, max, nd;
 179
 180        max = 1 + MAX_NR_ZONES * bitmap_weight(nodes, MAX_NUMNODES);
 181        zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
 182        if (!zl)
 183                return NULL;
 184        num = 0;
 185        for (nd = find_first_bit(nodes, MAX_NUMNODES);
 186             nd < MAX_NUMNODES;
 187             nd = find_next_bit(nodes, MAX_NUMNODES, 1+nd)) {
 188                int k;
 189                for (k = MAX_NR_ZONES-1; k >= 0; k--) {
 190                        struct zone *z = &NODE_DATA(nd)->node_zones[k];
 191                        if (!z->present_pages)
 192                                continue;
 193                        zl->zones[num++] = z;
 194                        if (k > policy_zone)
 195                                policy_zone = k;
 196                }
 197        }
 198        BUG_ON(num >= max);
 199        zl->zones[num] = NULL;
 200        return zl;
 201}
 202
 203/* Create a new policy */
 204static struct mempolicy *mpol_new(int mode, unsigned long *nodes)
 205{
 206        struct mempolicy *policy;
 207
 208        PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes[0]);
 209        if (mode == MPOL_DEFAULT)
 210                return NULL;
 211        policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 212        if (!policy)
 213                return ERR_PTR(-ENOMEM);
 214        atomic_set(&policy->refcnt, 1);
 215        switch (mode) {
 216        case MPOL_INTERLEAVE:
 217                bitmap_copy(policy->v.nodes, nodes, MAX_NUMNODES);
 218                if (bitmap_weight(nodes, MAX_NUMNODES) == 0) {
 219                        kmem_cache_free(policy_cache, policy);
 220                        return ERR_PTR(-EINVAL);
 221                }
 222                break;
 223        case MPOL_PREFERRED:
 224                policy->v.preferred_node = find_first_bit(nodes, MAX_NUMNODES);
 225                if (policy->v.preferred_node >= MAX_NUMNODES)
 226                        policy->v.preferred_node = -1;
 227                break;
 228        case MPOL_BIND:
 229                policy->v.zonelist = bind_zonelist(nodes);
 230                if (policy->v.zonelist == NULL) {
 231                        kmem_cache_free(policy_cache, policy);
 232                        return ERR_PTR(-ENOMEM);
 233                }
 234                break;
 235        }
 236        policy->policy = mode;
 237        return policy;
 238}
 239
 240/* Ensure all existing pages follow the policy. */
 241static int
 242verify_pages(unsigned long addr, unsigned long end, unsigned long *nodes)
 243{
 244        while (addr < end) {
 245                struct page *p;
 246                pte_t *pte;
 247                pmd_t *pmd;
 248                pgd_t *pgd = pgd_offset_k(addr);
 249                if (pgd_none(*pgd)) {
 250                        addr = (addr + PGDIR_SIZE) & PGDIR_MASK;
 251                        continue;
 252                }
 253                pmd = pmd_offset(pgd, addr);
 254                if (pmd_none(*pmd)) {
 255                        addr = (addr + PMD_SIZE) & PMD_MASK;
 256                        continue;
 257                }
 258                p = NULL;
 259                pte = pte_offset_map(pmd, addr);
 260                if (pte_present(*pte))
 261                        p = pte_page(*pte);
 262                pte_unmap(pte);
 263                if (p) {
 264                        unsigned nid = page_to_nid(p);
 265                        if (!test_bit(nid, nodes))
 266                                return -EIO;
 267                }
 268                addr += PAGE_SIZE;
 269        }
 270        return 0;
 271}
 272
 273/* Step 1: check the range */
 274static struct vm_area_struct *
 275check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 276            unsigned long *nodes, unsigned long flags)
 277{
 278        int err;
 279        struct vm_area_struct *first, *vma, *prev;
 280
 281        first = find_vma(mm, start);
 282        if (!first)
 283                return ERR_PTR(-EFAULT);
 284        prev = NULL;
 285        for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
 286                if (!vma->vm_next && vma->vm_end < end)
 287                        return ERR_PTR(-EFAULT);
 288                if (prev && prev->vm_end < vma->vm_start)
 289                        return ERR_PTR(-EFAULT);
 290                if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) {
 291                        err = verify_pages(vma->vm_start, vma->vm_end, nodes);
 292                        if (err) {
 293                                first = ERR_PTR(err);
 294                                break;
 295                        }
 296                }
 297                prev = vma;
 298        }
 299        return first;
 300}
 301
 302/* Apply policy to a single VMA */
 303static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
 304{
 305        int err = 0;
 306        struct mempolicy *old = vma->vm_policy;
 307
 308        PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 309                 vma->vm_start, vma->vm_end, vma->vm_pgoff,
 310                 vma->vm_ops, vma->vm_file,
 311                 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 312
 313        if (vma->vm_ops && vma->vm_ops->set_policy)
 314                err = vma->vm_ops->set_policy(vma, new);
 315        if (!err) {
 316                mpol_get(new);
 317                vma->vm_policy = new;
 318                mpol_free(old);
 319        }
 320        return err;
 321}
 322
 323/* Step 2: apply policy to a range and do splits. */
 324static int mbind_range(struct vm_area_struct *vma, unsigned long start,
 325                       unsigned long end, struct mempolicy *new)
 326{
 327        struct vm_area_struct *next;
 328        int err;
 329
 330        err = 0;
 331        for (; vma && vma->vm_start < end; vma = next) {
 332                next = vma->vm_next;
 333                if (vma->vm_start < start)
 334                        err = split_vma(vma->vm_mm, vma, start, 1);
 335                if (!err && vma->vm_end > end)
 336                        err = split_vma(vma->vm_mm, vma, end, 0);
 337                if (!err)
 338                        err = policy_vma(vma, new);
 339                if (err)
 340                        break;
 341        }
 342        return err;
 343}
 344
 345/* Change policy for a memory range */
 346asmlinkage long sys_mbind(unsigned long start, unsigned long len,
 347                          unsigned long mode,
 348                          unsigned long __user *nmask, unsigned long maxnode,
 349                          unsigned flags)
 350{
 351        struct vm_area_struct *vma;
 352        struct mm_struct *mm = current->mm;
 353        struct mempolicy *new;
 354        unsigned long end;
 355        DECLARE_BITMAP(nodes, MAX_NUMNODES);
 356        int err;
 357
 358        if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
 359                return -EINVAL;
 360        if (start & ~PAGE_MASK)
 361                return -EINVAL;
 362        if (mode == MPOL_DEFAULT)
 363                flags &= ~MPOL_MF_STRICT;
 364        len = (len + PAGE_SIZE - 1) & PAGE_MASK;
 365        end = start + len;
 366        if (end < start)
 367                return -EINVAL;
 368        if (end == start)
 369                return 0;
 370
 371        err = get_nodes(nodes, nmask, maxnode, mode);
 372        if (err)
 373                return err;
 374
 375        if (mpol_check_policy(mode, nodes))
 376                return -EINVAL;
 377
 378        new = mpol_new(mode, nodes);
 379        if (IS_ERR(new))
 380                return PTR_ERR(new);
 381
 382        PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
 383                        mode,nodes[0]);
 384
 385        down_write(&mm->mmap_sem);
 386        vma = check_range(mm, start, end, nodes, flags);
 387        err = PTR_ERR(vma);
 388        if (!IS_ERR(vma))
 389                err = mbind_range(vma, start, end, new);
 390        up_write(&mm->mmap_sem);
 391        mpol_free(new);
 392        return err;
 393}
 394
 395/* Set the process memory policy */
 396asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
 397                                   unsigned long maxnode)
 398{
 399        int err;
 400        struct mempolicy *new;
 401        DECLARE_BITMAP(nodes, MAX_NUMNODES);
 402
 403        if (mode < 0 || mode > MPOL_MAX)
 404                return -EINVAL;
 405        err = get_nodes(nodes, nmask, maxnode, mode);
 406        if (err)
 407                return err;
 408        if (mpol_check_policy(mode, nodes))
 409                return -EINVAL;
 410        new = mpol_new(mode, nodes);
 411        if (IS_ERR(new))
 412                return PTR_ERR(new);
 413        mpol_free(current->mempolicy);
 414        current->mempolicy = new;
 415        if (new && new->policy == MPOL_INTERLEAVE)
 416                current->il_next = find_first_bit(new->v.nodes, MAX_NUMNODES);
 417        return 0;
 418}
 419
 420/* Fill a zone bitmap for a policy */
 421static void get_zonemask(struct mempolicy *p, unsigned long *nodes)
 422{
 423        int i;
 424
 425        bitmap_zero(nodes, MAX_NUMNODES);
 426        switch (p->policy) {
 427        case MPOL_BIND:
 428                for (i = 0; p->v.zonelist->zones[i]; i++)
 429                        __set_bit(p->v.zonelist->zones[i]->zone_pgdat->node_id, nodes);
 430                break;
 431        case MPOL_DEFAULT:
 432                break;
 433        case MPOL_INTERLEAVE:
 434                bitmap_copy(nodes, p->v.nodes, MAX_NUMNODES);
 435                break;
 436        case MPOL_PREFERRED:
 437                /* or use current node instead of online map? */
 438                if (p->v.preferred_node < 0)
 439                        bitmap_copy(nodes, node_online_map, MAX_NUMNODES);
 440                else
 441                        __set_bit(p->v.preferred_node, nodes);
 442                break;
 443        default:
 444                BUG();
 445        }
 446}
 447
 448static int lookup_node(struct mm_struct *mm, unsigned long addr)
 449{
 450        struct page *p;
 451        int err;
 452
 453        err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
 454        if (err >= 0) {
 455                err = page_to_nid(p);
 456                put_page(p);
 457        }
 458        return err;
 459}
 460
 461/* Copy a kernel node mask to user space */
 462static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
 463                              void *nodes, unsigned nbytes)
 464{
 465        unsigned long copy = ALIGN(maxnode-1, 64) / 8;
 466
 467        if (copy > nbytes) {
 468                if (copy > PAGE_SIZE)
 469                        return -EINVAL;
 470                if (clear_user((char __user *)mask + nbytes, copy - nbytes))
 471                        return -EFAULT;
 472                copy = nbytes;
 473        }
 474        return copy_to_user(mask, nodes, copy) ? -EFAULT : 0;
 475}
 476
 477/* Retrieve NUMA policy */
 478asmlinkage long sys_get_mempolicy(int __user *policy,
 479                                  unsigned long __user *nmask,
 480                                  unsigned long maxnode,
 481                                  unsigned long addr, unsigned long flags)
 482{
 483        int err, pval;
 484        struct mm_struct *mm = current->mm;
 485        struct vm_area_struct *vma = NULL;
 486        struct mempolicy *pol = current->mempolicy;
 487
 488        if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
 489                return -EINVAL;
 490        if (nmask != NULL && maxnode < numnodes)
 491                return -EINVAL;
 492        if (flags & MPOL_F_ADDR) {
 493                down_read(&mm->mmap_sem);
 494                vma = find_vma_intersection(mm, addr, addr+1);
 495                if (!vma) {
 496                        up_read(&mm->mmap_sem);
 497                        return -EFAULT;
 498                }
 499                if (vma->vm_ops && vma->vm_ops->get_policy)
 500                        pol = vma->vm_ops->get_policy(vma, addr);
 501                else
 502                        pol = vma->vm_policy;
 503        } else if (addr)
 504                return -EINVAL;
 505
 506        if (!pol)
 507                pol = &default_policy;
 508
 509        if (flags & MPOL_F_NODE) {
 510                if (flags & MPOL_F_ADDR) {
 511                        err = lookup_node(mm, addr);
 512                        if (err < 0)
 513                                goto out;
 514                        pval = err;
 515                } else if (pol == current->mempolicy &&
 516                                pol->policy == MPOL_INTERLEAVE) {
 517                        pval = current->il_next;
 518                } else {
 519                        err = -EINVAL;
 520                        goto out;
 521                }
 522        } else
 523                pval = pol->policy;
 524
 525        if (vma) {
 526                up_read(&current->mm->mmap_sem);
 527                vma = NULL;
 528        }
 529
 530        if (policy && put_user(pval, policy))
 531                return -EFAULT;
 532
 533        err = 0;
 534        if (nmask) {
 535                DECLARE_BITMAP(nodes, MAX_NUMNODES);
 536                get_zonemask(pol, nodes);
 537                err = copy_nodes_to_user(nmask, maxnode, nodes, sizeof(nodes));
 538        }
 539
 540 out:
 541        if (vma)
 542                up_read(&current->mm->mmap_sem);
 543        return err;
 544}
 545
 546#ifdef CONFIG_COMPAT
 547
 548asmlinkage long compat_get_mempolicy(int __user *policy,
 549                                     compat_ulong_t __user *nmask,
 550                                     compat_ulong_t maxnode,
 551                                     compat_ulong_t addr, compat_ulong_t flags)
 552{
 553        long err;
 554        unsigned long __user *nm = NULL;
 555        unsigned long nr_bits, alloc_size;
 556        DECLARE_BITMAP(bm, MAX_NUMNODES);
 557
 558        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
 559        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
 560
 561        if (nmask)
 562                nm = compat_alloc_user_space(alloc_size);
 563
 564        err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
 565
 566        if (!err && nmask) {
 567                err = copy_from_user(bm, nm, alloc_size);
 568                /* ensure entire bitmap is zeroed */
 569                err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
 570                err |= compat_put_bitmap(nmask, bm, nr_bits);
 571        }
 572
 573        return err;
 574}
 575
 576asmlinkage long compat_set_mempolicy(int mode, compat_ulong_t __user *nmask,
 577                                     compat_ulong_t maxnode)
 578{
 579        long err = 0;
 580        unsigned long __user *nm = NULL;
 581        unsigned long nr_bits, alloc_size;
 582        DECLARE_BITMAP(bm, MAX_NUMNODES);
 583
 584        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
 585        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
 586
 587        if (nmask) {
 588                err = compat_get_bitmap(bm, nmask, nr_bits);
 589                nm = compat_alloc_user_space(alloc_size);
 590                err |= copy_to_user(nm, bm, alloc_size);
 591        }
 592
 593        if (err)
 594                return -EFAULT;
 595
 596        return sys_set_mempolicy(mode, nm, nr_bits+1);
 597}
 598
 599asmlinkage long compat_mbind(compat_ulong_t start, compat_ulong_t len,
 600                             compat_ulong_t mode, compat_ulong_t __user *nmask,
 601                             compat_ulong_t maxnode, compat_ulong_t flags)
 602{
 603        long err = 0;
 604        unsigned long __user *nm = NULL;
 605        unsigned long nr_bits, alloc_size;
 606        DECLARE_BITMAP(bm, MAX_NUMNODES);
 607
 608        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
 609        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
 610
 611        if (nmask) {
 612                err = compat_get_bitmap(bm, nmask, nr_bits);
 613                nm = compat_alloc_user_space(alloc_size);
 614                err |= copy_to_user(nm, bm, alloc_size);
 615        }
 616
 617        if (err)
 618                return -EFAULT;
 619
 620        return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
 621}
 622
 623#endif
 624
 625/* Return effective policy for a VMA */
 626struct mempolicy *
 627get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr)
 628{
 629        struct mempolicy *pol = task->mempolicy;
 630
 631        if (vma) {
 632                if (vma->vm_ops && vma->vm_ops->get_policy)
 633                        pol = vma->vm_ops->get_policy(vma, addr);
 634                else if (vma->vm_policy &&
 635                                vma->vm_policy->policy != MPOL_DEFAULT)
 636                        pol = vma->vm_policy;
 637        }
 638        if (!pol)
 639                pol = &default_policy;
 640        return pol;
 641}
 642
 643/* Return a zonelist representing a mempolicy */
 644static struct zonelist *zonelist_policy(unsigned gfp, struct mempolicy *policy)
 645{
 646        int nd;
 647
 648        switch (policy->policy) {
 649        case MPOL_PREFERRED:
 650                nd = policy->v.preferred_node;
 651                if (nd < 0)
 652                        nd = numa_node_id();
 653                break;
 654        case MPOL_BIND:
 655                /* Lower zones don't get a policy applied */
 656                if (gfp >= policy_zone)
 657                        return policy->v.zonelist;
 658                /*FALL THROUGH*/
 659        case MPOL_INTERLEAVE: /* should not happen */
 660        case MPOL_DEFAULT:
 661                nd = numa_node_id();
 662                break;
 663        default:
 664                nd = 0;
 665                BUG();
 666        }
 667        return NODE_DATA(nd)->node_zonelists + (gfp & GFP_ZONEMASK);
 668}
 669
 670/* Do dynamic interleaving for a process */
 671static unsigned interleave_nodes(struct mempolicy *policy)
 672{
 673        unsigned nid, next;
 674        struct task_struct *me = current;
 675
 676        nid = me->il_next;
 677        BUG_ON(nid >= MAX_NUMNODES);
 678        next = find_next_bit(policy->v.nodes, MAX_NUMNODES, 1+nid);
 679        if (next >= MAX_NUMNODES)
 680                next = find_first_bit(policy->v.nodes, MAX_NUMNODES);
 681        me->il_next = next;
 682        return nid;
 683}
 684
 685/* Do static interleaving for a VMA with known offset. */
 686static unsigned offset_il_node(struct mempolicy *pol,
 687                struct vm_area_struct *vma, unsigned long off)
 688{
 689        unsigned nnodes = bitmap_weight(pol->v.nodes, MAX_NUMNODES);
 690        unsigned target = (unsigned)off % nnodes;
 691        int c;
 692        int nid = -1;
 693
 694        c = 0;
 695        do {
 696                nid = find_next_bit(pol->v.nodes, MAX_NUMNODES, nid+1);
 697                c++;
 698        } while (c <= target);
 699        BUG_ON(nid >= MAX_NUMNODES);
 700        BUG_ON(!test_bit(nid, pol->v.nodes));
 701        return nid;
 702}
 703
 704/* Allocate a page in interleaved policy.
 705   Own path because it needs to do special accounting. */
 706static struct page *alloc_page_interleave(unsigned gfp, unsigned order, unsigned nid)
 707{
 708        struct zonelist *zl;
 709        struct page *page;
 710
 711        BUG_ON(!test_bit(nid, node_online_map));
 712        zl = NODE_DATA(nid)->node_zonelists + (gfp & GFP_ZONEMASK);
 713        page = __alloc_pages(gfp, order, zl);
 714        if (page && page_zone(page) == zl->zones[0]) {
 715                zl->zones[0]->pageset[get_cpu()].interleave_hit++;
 716                put_cpu();
 717        }
 718        return page;
 719}
 720
 721/**
 722 *      alloc_page_vma  - Allocate a page for a VMA.
 723 *
 724 *      @gfp:
 725 *      %GFP_USER    user allocation.
 726 *      %GFP_KERNEL  kernel allocations,
 727 *      %GFP_HIGHMEM highmem/user allocations,
 728 *      %GFP_FS      allocation should not call back into a file system.
 729 *      %GFP_ATOMIC  don't sleep.
 730 *
 731 *      @vma:  Pointer to VMA or NULL if not available.
 732 *      @addr: Virtual Address of the allocation. Must be inside the VMA.
 733 *
 734 *      This function allocates a page from the kernel page pool and applies
 735 *      a NUMA policy associated with the VMA or the current process.
 736 *      When VMA is not NULL caller must hold down_read on the mmap_sem of the
 737 *      mm_struct of the VMA to prevent it from going away. Should be used for
 738 *      all allocations for pages that will be mapped into
 739 *      user space. Returns NULL when no page can be allocated.
 740 *
 741 *      Should be called with the mm_sem of the vma hold.
 742 */
 743struct page *
 744alloc_page_vma(unsigned gfp, struct vm_area_struct *vma, unsigned long addr)
 745{
 746        struct mempolicy *pol = get_vma_policy(current, vma, addr);
 747
 748        if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
 749                unsigned nid;
 750                if (vma) {
 751                        unsigned long off;
 752                        BUG_ON(addr >= vma->vm_end);
 753                        BUG_ON(addr < vma->vm_start);
 754                        off = vma->vm_pgoff;
 755                        off += (addr - vma->vm_start) >> PAGE_SHIFT;
 756                        nid = offset_il_node(pol, vma, off);
 757                } else {
 758                        /* fall back to process interleaving */
 759                        nid = interleave_nodes(pol);
 760                }
 761                return alloc_page_interleave(gfp, 0, nid);
 762        }
 763        return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
 764}
 765
 766/**
 767 *      alloc_pages_current - Allocate pages.
 768 *
 769 *      @gfp:
 770 *              %GFP_USER   user allocation,
 771 *              %GFP_KERNEL kernel allocation,
 772 *              %GFP_HIGHMEM highmem allocation,
 773 *              %GFP_FS     don't call back into a file system.
 774 *              %GFP_ATOMIC don't sleep.
 775 *      @order: Power of two of allocation size in pages. 0 is a single page.
 776 *
 777 *      Allocate a page from the kernel page pool.  When not in
 778 *      interrupt context and apply the current process NUMA policy.
 779 *      Returns NULL when no page can be allocated.
 780 */
 781struct page *alloc_pages_current(unsigned gfp, unsigned order)
 782{
 783        struct mempolicy *pol = current->mempolicy;
 784
 785        if (!pol || in_interrupt())
 786                pol = &default_policy;
 787        if (pol->policy == MPOL_INTERLEAVE)
 788                return alloc_page_interleave(gfp, order, interleave_nodes(pol));
 789        return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
 790}
 791EXPORT_SYMBOL(alloc_pages_current);
 792
 793/* Slow path of a mempolicy copy */
 794struct mempolicy *__mpol_copy(struct mempolicy *old)
 795{
 796        struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 797
 798        if (!new)
 799                return ERR_PTR(-ENOMEM);
 800        *new = *old;
 801        atomic_set(&new->refcnt, 1);
 802        if (new->policy == MPOL_BIND) {
 803                int sz = ksize(old->v.zonelist);
 804                new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
 805                if (!new->v.zonelist) {
 806                        kmem_cache_free(policy_cache, new);
 807                        return ERR_PTR(-ENOMEM);
 808                }
 809                memcpy(new->v.zonelist, old->v.zonelist, sz);
 810        }
 811        return new;
 812}
 813
 814/* Slow path of a mempolicy comparison */
 815int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
 816{
 817        if (!a || !b)
 818                return 0;
 819        if (a->policy != b->policy)
 820                return 0;
 821        switch (a->policy) {
 822        case MPOL_DEFAULT:
 823                return 1;
 824        case MPOL_INTERLEAVE:
 825                return bitmap_equal(a->v.nodes, b->v.nodes, MAX_NUMNODES);
 826        case MPOL_PREFERRED:
 827                return a->v.preferred_node == b->v.preferred_node;
 828        case MPOL_BIND: {
 829                int i;
 830                for (i = 0; a->v.zonelist->zones[i]; i++)
 831                        if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
 832                                return 0;
 833                return b->v.zonelist->zones[i] == NULL;
 834        }
 835        default:
 836                BUG();
 837                return 0;
 838        }
 839}
 840
 841/* Slow path of a mpol destructor. */
 842void __mpol_free(struct mempolicy *p)
 843{
 844        if (!atomic_dec_and_test(&p->refcnt))
 845                return;
 846        if (p->policy == MPOL_BIND)
 847                kfree(p->v.zonelist);
 848        p->policy = MPOL_DEFAULT;
 849        kmem_cache_free(policy_cache, p);
 850}
 851
 852/*
 853 * Hugetlb policy. Same as above, just works with node numbers instead of
 854 * zonelists.
 855 */
 856
 857/* Find first node suitable for an allocation */
 858int mpol_first_node(struct vm_area_struct *vma, unsigned long addr)
 859{
 860        struct mempolicy *pol = get_vma_policy(current, vma, addr);
 861
 862        switch (pol->policy) {
 863        case MPOL_DEFAULT:
 864                return numa_node_id();
 865        case MPOL_BIND:
 866                return pol->v.zonelist->zones[0]->zone_pgdat->node_id;
 867        case MPOL_INTERLEAVE:
 868                return interleave_nodes(pol);
 869        case MPOL_PREFERRED:
 870                return pol->v.preferred_node >= 0 ?
 871                                pol->v.preferred_node : numa_node_id();
 872        }
 873        BUG();
 874        return 0;
 875}
 876
 877/* Find secondary valid nodes for an allocation */
 878int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr)
 879{
 880        struct mempolicy *pol = get_vma_policy(current, vma, addr);
 881
 882        switch (pol->policy) {
 883        case MPOL_PREFERRED:
 884        case MPOL_DEFAULT:
 885        case MPOL_INTERLEAVE:
 886                return 1;
 887        case MPOL_BIND: {
 888                struct zone **z;
 889                for (z = pol->v.zonelist->zones; *z; z++)
 890                        if ((*z)->zone_pgdat->node_id == nid)
 891                                return 1;
 892                return 0;
 893        }
 894        default:
 895                BUG();
 896                return 0;
 897        }
 898}
 899
 900/*
 901 * Shared memory backing store policy support.
 902 *
 903 * Remember policies even when nobody has shared memory mapped.
 904 * The policies are kept in Red-Black tree linked from the inode.
 905 * They are protected by the sp->sem semaphore, which should be held
 906 * for any accesses to the tree.
 907 */
 908
 909/* lookup first element intersecting start-end */
 910/* Caller holds sp->sem */
 911static struct sp_node *
 912sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
 913{
 914        struct rb_node *n = sp->root.rb_node;
 915
 916        while (n) {
 917                struct sp_node *p = rb_entry(n, struct sp_node, nd);
 918                if (start >= p->end) {
 919                        n = n->rb_right;
 920                } else if (end < p->start) {
 921                        n = n->rb_left;
 922                } else {
 923                        break;
 924                }
 925        }
 926        if (!n)
 927                return NULL;
 928        for (;;) {
 929                struct sp_node *w = NULL;
 930                struct rb_node *prev = rb_prev(n);
 931                if (!prev)
 932                        break;
 933                w = rb_entry(prev, struct sp_node, nd);
 934                if (w->end <= start)
 935                        break;
 936                n = prev;
 937        }
 938        return rb_entry(n, struct sp_node, nd);
 939}
 940
 941/* Insert a new shared policy into the list. */
 942/* Caller holds sp->sem */
 943static void sp_insert(struct shared_policy *sp, struct sp_node *new)
 944{
 945        struct rb_node **p = &sp->root.rb_node;
 946        struct rb_node *parent = NULL;
 947        struct sp_node *nd;
 948
 949        while (*p) {
 950                parent = *p;
 951                nd = rb_entry(parent, struct sp_node, nd);
 952                if (new->start < nd->start)
 953                        p = &(*p)->rb_left;
 954                else if (new->end > nd->end)
 955                        p = &(*p)->rb_right;
 956                else
 957                        BUG();
 958        }
 959        rb_link_node(&new->nd, parent, p);
 960        rb_insert_color(&new->nd, &sp->root);
 961        PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
 962                 new->policy ? new->policy->policy : 0);
 963}
 964
 965/* Find shared policy intersecting idx */
 966struct mempolicy *
 967mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
 968{
 969        struct mempolicy *pol = NULL;
 970        struct sp_node *sn;
 971
 972        down(&sp->sem);
 973        sn = sp_lookup(sp, idx, idx+1);
 974        if (sn) {
 975                mpol_get(sn->policy);
 976                pol = sn->policy;
 977        }
 978        up(&sp->sem);
 979        return pol;
 980}
 981
 982static void sp_delete(struct shared_policy *sp, struct sp_node *n)
 983{
 984        PDprintk("deleting %lx-l%x\n", n->start, n->end);
 985        rb_erase(&n->nd, &sp->root);
 986        mpol_free(n->policy);
 987        kmem_cache_free(sn_cache, n);
 988}
 989
 990struct sp_node *
 991sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
 992{
 993        struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
 994
 995        if (!n)
 996                return NULL;
 997        n->start = start;
 998        n->end = end;
 999        mpol_get(pol);
1000        n->policy = pol;
1001        return n;
1002}
1003
1004/* Replace a policy range. */
1005static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1006                                 unsigned long end, struct sp_node *new)
1007{
1008        struct sp_node *n, *new2;
1009
1010        down(&sp->sem);
1011        n = sp_lookup(sp, start, end);
1012        /* Take care of old policies in the same range. */
1013        while (n && n->start < end) {
1014                struct rb_node *next = rb_next(&n->nd);
1015                if (n->start >= start) {
1016                        if (n->end <= end)
1017                                sp_delete(sp, n);
1018                        else
1019                                n->start = end;
1020                } else {
1021                        /* Old policy spanning whole new range. */
1022                        if (n->end > end) {
1023                                new2 = sp_alloc(end, n->end, n->policy);
1024                                if (!new2) {
1025                                        up(&sp->sem);
1026                                        return -ENOMEM;
1027                                }
1028                                n->end = end;
1029                                sp_insert(sp, new2);
1030                        }
1031                        /* Old crossing beginning, but not end (easy) */
1032                        if (n->start < start && n->end > start)
1033                                n->end = start;
1034                }
1035                if (!next)
1036                        break;
1037                n = rb_entry(next, struct sp_node, nd);
1038        }
1039        if (new)
1040                sp_insert(sp, new);
1041        up(&sp->sem);
1042        return 0;
1043}
1044
1045int mpol_set_shared_policy(struct shared_policy *info,
1046                        struct vm_area_struct *vma, struct mempolicy *npol)
1047{
1048        int err;
1049        struct sp_node *new = NULL;
1050        unsigned long sz = vma_pages(vma);
1051
1052        PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
1053                 vma->vm_pgoff,
1054                 sz, npol? npol->policy : -1,
1055                npol ? npol->v.nodes[0] : -1);
1056
1057        if (npol) {
1058                new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1059                if (!new)
1060                        return -ENOMEM;
1061        }
1062        err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1063        if (err && new)
1064                kmem_cache_free(sn_cache, new);
1065        return err;
1066}
1067
1068/* Free a backing policy store on inode delete. */
1069void mpol_free_shared_policy(struct shared_policy *p)
1070{
1071        struct sp_node *n;
1072        struct rb_node *next;
1073
1074        down(&p->sem);
1075        next = rb_first(&p->root);
1076        while (next) {
1077                n = rb_entry(next, struct sp_node, nd);
1078                next = rb_next(&n->nd);
1079                rb_erase(&n->nd, &p->root);
1080                mpol_free(n->policy);
1081                kmem_cache_free(sn_cache, n);
1082        }
1083        up(&p->sem);
1084}
1085
1086/* assumes fs == KERNEL_DS */
1087void __init numa_policy_init(void)
1088{
1089        policy_cache = kmem_cache_create("numa_policy",
1090                                         sizeof(struct mempolicy),
1091                                         0, SLAB_PANIC, NULL, NULL);
1092
1093        sn_cache = kmem_cache_create("shared_policy_node",
1094                                     sizeof(struct sp_node),
1095                                     0, SLAB_PANIC, NULL, NULL);
1096
1097        /* Set interleaving policy for system init. This way not all
1098           the data structures allocated at system boot end up in node zero. */
1099
1100        if (sys_set_mempolicy(MPOL_INTERLEAVE, node_online_map, MAX_NUMNODES) < 0)
1101                printk("numa_policy_init: interleaving failed\n");
1102}
1103
1104/* Reset policy of current process to default.
1105 * Assumes fs == KERNEL_DS */
1106void numa_default_policy(void)
1107{
1108        sys_set_mempolicy(MPOL_DEFAULT, NULL, 0);
1109}
1110