RHEL4/mm/prio_tree.c
<<
>>
Prefs
   1/*
   2 * mm/prio_tree.c - priority search tree for mapping->i_mmap
   3 *
   4 * Copyright (C) 2004, Rajesh Venkatasubramanian <vrajesh@umich.edu>
   5 *
   6 * This file is released under the GPL v2.
   7 *
   8 * Based on the radix priority search tree proposed by Edward M. McCreight
   9 * SIAM Journal of Computing, vol. 14, no.2, pages 257-276, May 1985
  10 *
  11 * 02Feb2004    Initial version
  12 */
  13
  14#include <linux/init.h>
  15#include <linux/module.h>
  16#include <linux/mm.h>
  17#include <linux/prio_tree.h>
  18
  19/*
  20 * A clever mix of heap and radix trees forms a radix priority search tree (PST)
  21 * which is useful for storing intervals, e.g, we can consider a vma as a closed
  22 * interval of file pages [offset_begin, offset_end], and store all vmas that
  23 * map a file in a PST. Then, using the PST, we can answer a stabbing query,
  24 * i.e., selecting a set of stored intervals (vmas) that overlap with (map) a
  25 * given input interval X (a set of consecutive file pages), in "O(log n + m)"
  26 * time where 'log n' is the height of the PST, and 'm' is the number of stored
  27 * intervals (vmas) that overlap (map) with the input interval X (the set of
  28 * consecutive file pages).
  29 *
  30 * In our implementation, we store closed intervals of the form [radix_index,
  31 * heap_index]. We assume that always radix_index <= heap_index. McCreight's PST
  32 * is designed for storing intervals with unique radix indices, i.e., each
  33 * interval have different radix_index. However, this limitation can be easily
  34 * overcome by using the size, i.e., heap_index - radix_index, as part of the
  35 * index, so we index the tree using [(radix_index,size), heap_index].
  36 *
  37 * When the above-mentioned indexing scheme is used, theoretically, in a 32 bit
  38 * machine, the maximum height of a PST can be 64. We can use a balanced version
  39 * of the priority search tree to optimize the tree height, but the balanced
  40 * tree proposed by McCreight is too complex and memory-hungry for our purpose.
  41 */
  42
  43/*
  44 * The following macros are used for implementing prio_tree for i_mmap
  45 */
  46
  47#define RADIX_INDEX(vma)  ((vma)->vm_pgoff)
  48#define VMA_SIZE(vma)     (((vma)->vm_end - (vma)->vm_start) >> PAGE_SHIFT)
  49/* avoid overflow */
  50#define HEAP_INDEX(vma)   ((vma)->vm_pgoff + (VMA_SIZE(vma) - 1))
  51
  52#define GET_INDEX_VMA(vma, radix, heap)         \
  53do {                                            \
  54        radix = RADIX_INDEX(vma);               \
  55        heap = HEAP_INDEX(vma);                 \
  56} while (0)
  57
  58#define GET_INDEX(node, radix, heap)            \
  59do {                                            \
  60        struct vm_area_struct *__tmp =          \
  61          prio_tree_entry(node, struct vm_area_struct, shared.prio_tree_node);\
  62        GET_INDEX_VMA(__tmp, radix, heap);      \
  63} while (0)
  64
  65static unsigned long index_bits_to_maxindex[BITS_PER_LONG];
  66
  67void __init prio_tree_init(void)
  68{
  69        unsigned int i;
  70
  71        for (i = 0; i < ARRAY_SIZE(index_bits_to_maxindex) - 1; i++)
  72                index_bits_to_maxindex[i] = (1UL << (i + 1)) - 1;
  73        index_bits_to_maxindex[ARRAY_SIZE(index_bits_to_maxindex) - 1] = ~0UL;
  74}
  75
  76/*
  77 * Maximum heap_index that can be stored in a PST with index_bits bits
  78 */
  79static inline unsigned long prio_tree_maxindex(unsigned int bits)
  80{
  81        return index_bits_to_maxindex[bits - 1];
  82}
  83
  84static void prio_tree_remove(struct prio_tree_root *, struct prio_tree_node *);
  85
  86/*
  87 * Extend a priority search tree so that it can store a node with heap_index
  88 * max_heap_index. In the worst case, this algorithm takes O((log n)^2).
  89 * However, this function is used rarely and the common case performance is
  90 * not bad.
  91 */
  92static struct prio_tree_node *prio_tree_expand(struct prio_tree_root *root,
  93                struct prio_tree_node *node, unsigned long max_heap_index)
  94{
  95        struct prio_tree_node *first = NULL, *prev, *last = NULL;
  96
  97        if (max_heap_index > prio_tree_maxindex(root->index_bits))
  98                root->index_bits++;
  99
 100        while (max_heap_index > prio_tree_maxindex(root->index_bits)) {
 101                root->index_bits++;
 102
 103                if (prio_tree_empty(root))
 104                        continue;
 105
 106                if (first == NULL) {
 107                        first = root->prio_tree_node;
 108                        prio_tree_remove(root, root->prio_tree_node);
 109                        INIT_PRIO_TREE_NODE(first);
 110                        last = first;
 111                } else {
 112                        prev = last;
 113                        last = root->prio_tree_node;
 114                        prio_tree_remove(root, root->prio_tree_node);
 115                        INIT_PRIO_TREE_NODE(last);
 116                        prev->left = last;
 117                        last->parent = prev;
 118                }
 119        }
 120
 121        INIT_PRIO_TREE_NODE(node);
 122
 123        if (first) {
 124                node->left = first;
 125                first->parent = node;
 126        } else
 127                last = node;
 128
 129        if (!prio_tree_empty(root)) {
 130                last->left = root->prio_tree_node;
 131                last->left->parent = last;
 132        }
 133
 134        root->prio_tree_node = node;
 135        return node;
 136}
 137
 138/*
 139 * Replace a prio_tree_node with a new node and return the old node
 140 */
 141static struct prio_tree_node *prio_tree_replace(struct prio_tree_root *root,
 142                struct prio_tree_node *old, struct prio_tree_node *node)
 143{
 144        INIT_PRIO_TREE_NODE(node);
 145
 146        if (prio_tree_root(old)) {
 147                BUG_ON(root->prio_tree_node != old);
 148                /*
 149                 * We can reduce root->index_bits here. However, it is complex
 150                 * and does not help much to improve performance (IMO).
 151                 */
 152                node->parent = node;
 153                root->prio_tree_node = node;
 154        } else {
 155                node->parent = old->parent;
 156                if (old->parent->left == old)
 157                        old->parent->left = node;
 158                else
 159                        old->parent->right = node;
 160        }
 161
 162        if (!prio_tree_left_empty(old)) {
 163                node->left = old->left;
 164                old->left->parent = node;
 165        }
 166
 167        if (!prio_tree_right_empty(old)) {
 168                node->right = old->right;
 169                old->right->parent = node;
 170        }
 171
 172        return old;
 173}
 174
 175/*
 176 * Insert a prio_tree_node @node into a radix priority search tree @root. The
 177 * algorithm typically takes O(log n) time where 'log n' is the number of bits
 178 * required to represent the maximum heap_index. In the worst case, the algo
 179 * can take O((log n)^2) - check prio_tree_expand.
 180 *
 181 * If a prior node with same radix_index and heap_index is already found in
 182 * the tree, then returns the address of the prior node. Otherwise, inserts
 183 * @node into the tree and returns @node.
 184 */
 185static struct prio_tree_node *prio_tree_insert(struct prio_tree_root *root,
 186                struct prio_tree_node *node)
 187{
 188        struct prio_tree_node *cur, *res = node;
 189        unsigned long radix_index, heap_index;
 190        unsigned long r_index, h_index, index, mask;
 191        int size_flag = 0;
 192
 193        GET_INDEX(node, radix_index, heap_index);
 194
 195        if (prio_tree_empty(root) ||
 196                        heap_index > prio_tree_maxindex(root->index_bits))
 197                return prio_tree_expand(root, node, heap_index);
 198
 199        cur = root->prio_tree_node;
 200        mask = 1UL << (root->index_bits - 1);
 201
 202        while (mask) {
 203                GET_INDEX(cur, r_index, h_index);
 204
 205                if (r_index == radix_index && h_index == heap_index)
 206                        return cur;
 207
 208                if (h_index < heap_index ||
 209                    (h_index == heap_index && r_index > radix_index)) {
 210                        struct prio_tree_node *tmp = node;
 211                        node = prio_tree_replace(root, cur, node);
 212                        cur = tmp;
 213                        /* swap indices */
 214                        index = r_index;
 215                        r_index = radix_index;
 216                        radix_index = index;
 217                        index = h_index;
 218                        h_index = heap_index;
 219                        heap_index = index;
 220                }
 221
 222                if (size_flag)
 223                        index = heap_index - radix_index;
 224                else
 225                        index = radix_index;
 226
 227                if (index & mask) {
 228                        if (prio_tree_right_empty(cur)) {
 229                                INIT_PRIO_TREE_NODE(node);
 230                                cur->right = node;
 231                                node->parent = cur;
 232                                return res;
 233                        } else
 234                                cur = cur->right;
 235                } else {
 236                        if (prio_tree_left_empty(cur)) {
 237                                INIT_PRIO_TREE_NODE(node);
 238                                cur->left = node;
 239                                node->parent = cur;
 240                                return res;
 241                        } else
 242                                cur = cur->left;
 243                }
 244
 245                mask >>= 1;
 246
 247                if (!mask) {
 248                        mask = 1UL << (BITS_PER_LONG - 1);
 249                        size_flag = 1;
 250                }
 251        }
 252        /* Should not reach here */
 253        BUG();
 254        return NULL;
 255}
 256
 257/*
 258 * Remove a prio_tree_node @node from a radix priority search tree @root. The
 259 * algorithm takes O(log n) time where 'log n' is the number of bits required
 260 * to represent the maximum heap_index.
 261 */
 262static void prio_tree_remove(struct prio_tree_root *root,
 263                struct prio_tree_node *node)
 264{
 265        struct prio_tree_node *cur;
 266        unsigned long r_index, h_index_right, h_index_left;
 267
 268        cur = node;
 269
 270        while (!prio_tree_left_empty(cur) || !prio_tree_right_empty(cur)) {
 271                if (!prio_tree_left_empty(cur))
 272                        GET_INDEX(cur->left, r_index, h_index_left);
 273                else {
 274                        cur = cur->right;
 275                        continue;
 276                }
 277
 278                if (!prio_tree_right_empty(cur))
 279                        GET_INDEX(cur->right, r_index, h_index_right);
 280                else {
 281                        cur = cur->left;
 282                        continue;
 283                }
 284
 285                /* both h_index_left and h_index_right cannot be 0 */
 286                if (h_index_left >= h_index_right)
 287                        cur = cur->left;
 288                else
 289                        cur = cur->right;
 290        }
 291
 292        if (prio_tree_root(cur)) {
 293                BUG_ON(root->prio_tree_node != cur);
 294                INIT_PRIO_TREE_ROOT(root);
 295                return;
 296        }
 297
 298        if (cur->parent->right == cur)
 299                cur->parent->right = cur->parent;
 300        else
 301                cur->parent->left = cur->parent;
 302
 303        while (cur != node)
 304                cur = prio_tree_replace(root, cur->parent, cur);
 305}
 306
 307/*
 308 * Following functions help to enumerate all prio_tree_nodes in the tree that
 309 * overlap with the input interval X [radix_index, heap_index]. The enumeration
 310 * takes O(log n + m) time where 'log n' is the height of the tree (which is
 311 * proportional to # of bits required to represent the maximum heap_index) and
 312 * 'm' is the number of prio_tree_nodes that overlap the interval X.
 313 */
 314
 315static struct prio_tree_node *prio_tree_left(struct prio_tree_iter *iter,
 316                unsigned long *r_index, unsigned long *h_index)
 317{
 318        if (prio_tree_left_empty(iter->cur))
 319                return NULL;
 320
 321        GET_INDEX(iter->cur->left, *r_index, *h_index);
 322
 323        if (iter->r_index <= *h_index) {
 324                iter->cur = iter->cur->left;
 325                iter->mask >>= 1;
 326                if (iter->mask) {
 327                        if (iter->size_level)
 328                                iter->size_level++;
 329                } else {
 330                        if (iter->size_level) {
 331                                BUG_ON(!prio_tree_left_empty(iter->cur));
 332                                BUG_ON(!prio_tree_right_empty(iter->cur));
 333                                iter->size_level++;
 334                                iter->mask = ULONG_MAX;
 335                        } else {
 336                                iter->size_level = 1;
 337                                iter->mask = 1UL << (BITS_PER_LONG - 1);
 338                        }
 339                }
 340                return iter->cur;
 341        }
 342
 343        return NULL;
 344}
 345
 346static struct prio_tree_node *prio_tree_right(struct prio_tree_iter *iter,
 347                unsigned long *r_index, unsigned long *h_index)
 348{
 349        unsigned long value;
 350
 351        if (prio_tree_right_empty(iter->cur))
 352                return NULL;
 353
 354        if (iter->size_level)
 355                value = iter->value;
 356        else
 357                value = iter->value | iter->mask;
 358
 359        if (iter->h_index < value)
 360                return NULL;
 361
 362        GET_INDEX(iter->cur->right, *r_index, *h_index);
 363
 364        if (iter->r_index <= *h_index) {
 365                iter->cur = iter->cur->right;
 366                iter->mask >>= 1;
 367                iter->value = value;
 368                if (iter->mask) {
 369                        if (iter->size_level)
 370                                iter->size_level++;
 371                } else {
 372                        if (iter->size_level) {
 373                                BUG_ON(!prio_tree_left_empty(iter->cur));
 374                                BUG_ON(!prio_tree_right_empty(iter->cur));
 375                                iter->size_level++;
 376                                iter->mask = ULONG_MAX;
 377                        } else {
 378                                iter->size_level = 1;
 379                                iter->mask = 1UL << (BITS_PER_LONG - 1);
 380                        }
 381                }
 382                return iter->cur;
 383        }
 384
 385        return NULL;
 386}
 387
 388static struct prio_tree_node *prio_tree_parent(struct prio_tree_iter *iter)
 389{
 390        iter->cur = iter->cur->parent;
 391        if (iter->mask == ULONG_MAX)
 392                iter->mask = 1UL;
 393        else if (iter->size_level == 1)
 394                iter->mask = 1UL;
 395        else
 396                iter->mask <<= 1;
 397        if (iter->size_level)
 398                iter->size_level--;
 399        if (!iter->size_level && (iter->value & iter->mask))
 400                iter->value ^= iter->mask;
 401        return iter->cur;
 402}
 403
 404static inline int overlap(struct prio_tree_iter *iter,
 405                unsigned long r_index, unsigned long h_index)
 406{
 407        return iter->h_index >= r_index && iter->r_index <= h_index;
 408}
 409
 410/*
 411 * prio_tree_first:
 412 *
 413 * Get the first prio_tree_node that overlaps with the interval [radix_index,
 414 * heap_index]. Note that always radix_index <= heap_index. We do a pre-order
 415 * traversal of the tree.
 416 */
 417static struct prio_tree_node *prio_tree_first(struct prio_tree_iter *iter)
 418{
 419        struct prio_tree_root *root;
 420        unsigned long r_index, h_index;
 421
 422        INIT_PRIO_TREE_ITER(iter);
 423
 424        root = iter->root;
 425        if (prio_tree_empty(root))
 426                return NULL;
 427
 428        GET_INDEX(root->prio_tree_node, r_index, h_index);
 429
 430        if (iter->r_index > h_index)
 431                return NULL;
 432
 433        iter->mask = 1UL << (root->index_bits - 1);
 434        iter->cur = root->prio_tree_node;
 435
 436        while (1) {
 437                if (overlap(iter, r_index, h_index))
 438                        return iter->cur;
 439
 440                if (prio_tree_left(iter, &r_index, &h_index))
 441                        continue;
 442
 443                if (prio_tree_right(iter, &r_index, &h_index))
 444                        continue;
 445
 446                break;
 447        }
 448        return NULL;
 449}
 450
 451/*
 452 * prio_tree_next:
 453 *
 454 * Get the next prio_tree_node that overlaps with the input interval in iter
 455 */
 456static struct prio_tree_node *prio_tree_next(struct prio_tree_iter *iter)
 457{
 458        unsigned long r_index, h_index;
 459
 460repeat:
 461        while (prio_tree_left(iter, &r_index, &h_index))
 462                if (overlap(iter, r_index, h_index))
 463                        return iter->cur;
 464
 465        while (!prio_tree_right(iter, &r_index, &h_index)) {
 466                while (!prio_tree_root(iter->cur) &&
 467                                iter->cur->parent->right == iter->cur)
 468                        prio_tree_parent(iter);
 469
 470                if (prio_tree_root(iter->cur))
 471                        return NULL;
 472
 473                prio_tree_parent(iter);
 474        }
 475
 476        if (overlap(iter, r_index, h_index))
 477                return iter->cur;
 478
 479        goto repeat;
 480}
 481
 482/*
 483 * Radix priority search tree for address_space->i_mmap
 484 *
 485 * For each vma that map a unique set of file pages i.e., unique [radix_index,
 486 * heap_index] value, we have a corresponing priority search tree node. If
 487 * multiple vmas have identical [radix_index, heap_index] value, then one of
 488 * them is used as a tree node and others are stored in a vm_set list. The tree
 489 * node points to the first vma (head) of the list using vm_set.head.
 490 *
 491 * prio_tree_root
 492 *      |
 493 *      A       vm_set.head
 494 *     / \      /
 495 *    L   R -> H-I-J-K-M-N-O-P-Q-S
 496 *    ^   ^    <-- vm_set.list -->
 497 *  tree nodes
 498 *
 499 * We need some way to identify whether a vma is a tree node, head of a vm_set
 500 * list, or just a member of a vm_set list. We cannot use vm_flags to store
 501 * such information. The reason is, in the above figure, it is possible that
 502 * vm_flags' of R and H are covered by the different mmap_sems. When R is
 503 * removed under R->mmap_sem, H replaces R as a tree node. Since we do not hold
 504 * H->mmap_sem, we cannot use H->vm_flags for marking that H is a tree node now.
 505 * That's why some trick involving shared.vm_set.parent is used for identifying
 506 * tree nodes and list head nodes.
 507 *
 508 * vma radix priority search tree node rules:
 509 *
 510 * vma->shared.vm_set.parent != NULL    ==> a tree node
 511 *      vma->shared.vm_set.head != NULL ==> list of others mapping same range
 512 *      vma->shared.vm_set.head == NULL ==> no others map the same range
 513 *
 514 * vma->shared.vm_set.parent == NULL
 515 *      vma->shared.vm_set.head != NULL ==> list head of vmas mapping same range
 516 *      vma->shared.vm_set.head == NULL ==> a list node
 517 */
 518
 519/*
 520 * Add a new vma known to map the same set of pages as the old vma:
 521 * useful for fork's dup_mmap as well as vma_prio_tree_insert below.
 522 * Note that it just happens to work correctly on i_mmap_nonlinear too.
 523 */
 524void vma_prio_tree_add(struct vm_area_struct *vma, struct vm_area_struct *old)
 525{
 526        /* Leave these BUG_ONs till prio_tree patch stabilizes */
 527        BUG_ON(RADIX_INDEX(vma) != RADIX_INDEX(old));
 528        BUG_ON(HEAP_INDEX(vma) != HEAP_INDEX(old));
 529
 530        vma->shared.vm_set.head = NULL;
 531        vma->shared.vm_set.parent = NULL;
 532
 533        if (!old->shared.vm_set.parent)
 534                list_add(&vma->shared.vm_set.list,
 535                                &old->shared.vm_set.list);
 536        else if (old->shared.vm_set.head)
 537                list_add_tail(&vma->shared.vm_set.list,
 538                                &old->shared.vm_set.head->shared.vm_set.list);
 539        else {
 540                INIT_LIST_HEAD(&vma->shared.vm_set.list);
 541                vma->shared.vm_set.head = old;
 542                old->shared.vm_set.head = vma;
 543        }
 544}
 545
 546void vma_prio_tree_insert(struct vm_area_struct *vma,
 547                          struct prio_tree_root *root)
 548{
 549        struct prio_tree_node *ptr;
 550        struct vm_area_struct *old;
 551
 552        vma->shared.vm_set.head = NULL;
 553
 554        ptr = prio_tree_insert(root, &vma->shared.prio_tree_node);
 555        if (ptr != &vma->shared.prio_tree_node) {
 556                old = prio_tree_entry(ptr, struct vm_area_struct,
 557                                        shared.prio_tree_node);
 558                vma_prio_tree_add(vma, old);
 559        }
 560}
 561
 562void vma_prio_tree_remove(struct vm_area_struct *vma,
 563                          struct prio_tree_root *root)
 564{
 565        struct vm_area_struct *node, *head, *new_head;
 566
 567        if (!vma->shared.vm_set.head) {
 568                if (!vma->shared.vm_set.parent)
 569                        list_del_init(&vma->shared.vm_set.list);
 570                else
 571                        prio_tree_remove(root, &vma->shared.prio_tree_node);
 572        } else {
 573                /* Leave this BUG_ON till prio_tree patch stabilizes */
 574                BUG_ON(vma->shared.vm_set.head->shared.vm_set.head != vma);
 575                if (vma->shared.vm_set.parent) {
 576                        head = vma->shared.vm_set.head;
 577                        if (!list_empty(&head->shared.vm_set.list)) {
 578                                new_head = list_entry(
 579                                        head->shared.vm_set.list.next,
 580                                        struct vm_area_struct,
 581                                        shared.vm_set.list);
 582                                list_del_init(&head->shared.vm_set.list);
 583                        } else
 584                                new_head = NULL;
 585
 586                        prio_tree_replace(root, &vma->shared.prio_tree_node,
 587                                        &head->shared.prio_tree_node);
 588                        head->shared.vm_set.head = new_head;
 589                        if (new_head)
 590                                new_head->shared.vm_set.head = head;
 591
 592                } else {
 593                        node = vma->shared.vm_set.head;
 594                        if (!list_empty(&vma->shared.vm_set.list)) {
 595                                new_head = list_entry(
 596                                        vma->shared.vm_set.list.next,
 597                                        struct vm_area_struct,
 598                                        shared.vm_set.list);
 599                                list_del_init(&vma->shared.vm_set.list);
 600                                node->shared.vm_set.head = new_head;
 601                                new_head->shared.vm_set.head = node;
 602                        } else
 603                                node->shared.vm_set.head = NULL;
 604                }
 605        }
 606}
 607
 608/*
 609 * Helper function to enumerate vmas that map a given file page or a set of
 610 * contiguous file pages. The function returns vmas that at least map a single
 611 * page in the given range of contiguous file pages.
 612 */
 613struct vm_area_struct *vma_prio_tree_next(struct vm_area_struct *vma,
 614                                        struct prio_tree_iter *iter)
 615{
 616        struct prio_tree_node *ptr;
 617        struct vm_area_struct *next;
 618
 619        if (!vma) {
 620                /*
 621                 * First call is with NULL vma
 622                 */
 623                ptr = prio_tree_first(iter);
 624                if (ptr) {
 625                        next = prio_tree_entry(ptr, struct vm_area_struct,
 626                                                shared.prio_tree_node);
 627                        prefetch(next->shared.vm_set.head);
 628                        return next;
 629                } else
 630                        return NULL;
 631        }
 632
 633        if (vma->shared.vm_set.parent) {
 634                if (vma->shared.vm_set.head) {
 635                        next = vma->shared.vm_set.head;
 636                        prefetch(next->shared.vm_set.list.next);
 637                        return next;
 638                }
 639        } else {
 640                next = list_entry(vma->shared.vm_set.list.next,
 641                                struct vm_area_struct, shared.vm_set.list);
 642                if (!next->shared.vm_set.head) {
 643                        prefetch(next->shared.vm_set.list.next);
 644                        return next;
 645                }
 646        }
 647
 648        ptr = prio_tree_next(iter);
 649        if (ptr) {
 650                next = prio_tree_entry(ptr, struct vm_area_struct,
 651                                        shared.prio_tree_node);
 652                prefetch(next->shared.vm_set.head);
 653                return next;
 654        } else
 655                return NULL;
 656}
 657